diff --git a/lib/cuda/Makefile b/lib/cuda/Makefile new file mode 100644 index 0000000000..844906ba89 --- /dev/null +++ b/lib/cuda/Makefile @@ -0,0 +1,4 @@ +#Makefile for liblammpscuda.a +#No need to modify anything here! The CUDA path is inserted into Makefile.common + +include Makefile.cudalib \ No newline at end of file diff --git a/lib/cuda/Makefile.common b/lib/cuda/Makefile.common new file mode 100644 index 0000000000..b4018cc5ed --- /dev/null +++ b/lib/cuda/Makefile.common @@ -0,0 +1,108 @@ +#Common commandline argument interpreter for compilation with lammpscuda (USER-CUDA) installed + +# make options: +# emu=1 switch to cuda emulation mode (otherwise: use gpu) +# dbg=1 print a lot of debugging output during runtime +# verbose=1 output nvcc command line during compilation +# keep=1 do not delete temporary compilation files (.ii, .cubin, ...) +# cufft=1 use cuda's fast fourier transformation lib "cufft" where possible (otherwise: use cpu fftw) +# binning=1 create virtual particle grid (neighbor-lists otherwise); currently this is not supported +# precision=1 single precision (global setting) +# precision=2 double precision (global setting) + +SHELL = /bin/sh + +# System-specific settings + +CUDA_INSTALL_PATH = /usr/local/cuda +# e.g. in Gentoo +# CUDA_INSTALL_PATH = /opt/cuda + + +#////////////////////////////////////////////////////////////////////////////////////////////// +# no need to change anything below this line +#////////////////////////////////////////////////////////////////////////////////////////////// + +#use CPU FFT if cufft=0 is requested. 
+FALLBACK_FFT = 1 + +#default settings for compiler switches +#ifdef COMPILELIB +#include Makefile.defaults +#else +include ../../lib/cuda/Makefile.defaults +#endif + +#shell echo "Compiling with precision = " ${precision} ", arch = " ${arch} ", cufft = " ${cufft} ", dbg = " ${dbg} ", prec_timer = " ${prec_timer} + +CUDA_FLAGS := -DUNIX +CUDA_USRLIB_CONDITIONAL := + +# debug setting +ifeq ($(dbg), 1) + CUDA_FLAGS += -D_DEBUG -g + NVCC_FLAGS += -g -G +else + NVCC_FLAGS += --compiler-options -fno-strict-aliasing -O2 +endif + +# skip timing on Mac and Windows manually +ifeq ($(prec_timer), 0) + CUDA_FLAGS += -DNO_PREC_TIMING +endif + +# set fft routine +ifeq ($(cufft), 0) + ifneq ($(FALLBACK_FFT), 1) + FFT_INC = -DFFT_NONE + FFT_PATH = + FFT_LIB = + CUDA_FLAGS += -DFFT_NONE + endif +else + CUDA_FLAGS += -DFFT_CUFFT + CUDA_USRLIB_CONDITIONAL += -lcufft +endif + +# make global precision setting +ifeq ($(precision), 1) + CUDA_FLAGS += -DCUDA_PRECISION=1 +else + ifeq ($(precision), 3) + CUDA_FLAGS += -DCUDA_PRECISION=1 -DX_PRECISION=2 + else + ifeq ($(precision), 4) + CUDA_FLAGS += -DCUDA_PRECISION=1 -DX_PRECISION=2 -DV_PRECISION=2 + else + CUDA_FLAGS += -DCUDA_PRECISION=2 + endif + endif +endif + +# make architecture settings +ifeq ($(arch), 13) + CUDA_FLAGS += -DCUDA_ARCH=13 + SMVERSIONFLAGS := -arch sm_13 +else + ifeq ($(arch), 20) + CUDA_FLAGS += -DCUDA_ARCH=20 + #NVCC_FLAGS += -ftz=false -prec-div=true -prec-sqrt=true + NVCC_FLAGS += -ftz=true -prec-div=false -prec-sqrt=false + SMVERSIONFLAGS := -arch sm_20 + else + ifeq ($(arch), 21) + CUDA_FLAGS += -DCUDA_ARCH=20 + #NVCC_FLAGS += -ftz=false -prec-div=true -prec-sqrt=true + NVCC_FLAGS += -ftz=true -prec-div=false -prec-sqrt=false + SMVERSIONFLAGS := -arch sm_21 + else + CUDA_FLAGS += -DCUDA_ARCH=99 + SMVERSIONFLAGS := -arch sm_13 + endif + endif +endif + + + +CCFLAGS := $(CCFLAGS) $(CUDA_FLAGS) \ + -I$(CUDA_INSTALL_PATH)/include diff --git a/lib/cuda/Makefile.cudalib b/lib/cuda/Makefile.cudalib new file mode 100644 
index 0000000000..e60ac38f18 --- /dev/null +++ b/lib/cuda/Makefile.cudalib @@ -0,0 +1,82 @@ +#Makefile for liblammpscuda.a +#No need to modify anything here! The CUDA path is inserted into Makefile.common + +.DEFAULT: lib + +COMPILELIB := 1 + +SHELL = /bin/sh + +CUDA_SRC_DIR = ../cuda +CUDA_TEMP = $(CUDA_SRC_DIR)/.lastmake +CUDA_TEMP_DUMMY := $(shell touch $(CUDA_TEMP) ) +include $(CUDA_TEMP) +CUDA_CU = $(wildcard $(CUDA_SRC_DIR)/*_kernel.cu) +CUDA_CUO = $(CUDA_CU:_kernel.cu=_cu.o) +CUDA_OBJ = $(subst $(CUDA_SRC_DIR)/,,$(CUDA_CUO)) +CUDA_DEP = $(CUDA_OBJ:.o=.d) + +NVCC_FLAGS := + +VPATH = $(CUDA_SRC_DIR) + +#rewriting default settings if new ones are specified + + +ifdef precision +tmp := $(shell sed -i 's|precision ?= [0-9]|precision ?= '${precision}'|g' Makefile.defaults) +endif + +ifdef arch +tmp := $(shell sed -i 's|arch ?= [0-9][0-9]|arch ?= '${arch}'|g' Makefile.defaults) +endif + +ifdef cufft +tmp := $(shell sed -i 's|cufft ?= [0-9]|cufft ?= '${cufft}'|g' Makefile.defaults) +endif + +ifdef dbg +tmp := $(shell sed -i 's|dbg ?= [0-9]|dbg ?= '${dbg}'|g' Makefile.defaults) +endif + +ifdef prec_timer +tmp := $(shell sed -i 's|prec_timer ?= [0-9]|prec_timer ?= '${prec_timer}'|g' Makefile.defaults) +endif + +include Makefile.common + +# verbose nvcc output during compilation +ifeq ($(verbose), 1) + VERBOSE := + NVCC_FLAGS += --ptxas-options=-v +else + VERBOSE := @ +endif + +# keep temporary compilation files of nvcc +ifeq ($(keep), 1) + NVCC_FLAGS += -keep -Xptxas="--verbose" +endif + + +NVCC := $(CUDA_INSTALL_PATH)/bin/nvcc +CUDA_INCLUDES = -I$(CUDA_INSTALL_PATH)/include -I../../src/USER-CUDA +CUDA_USRLIB = + +# Link target + +lib: $(CUDA_OBJ) + $(NVCC) -lib $(CUDA_OBJ) $(CUDA_FLAGS) $(CUDA_USRLIB_CONDITIONAL) -o liblammpscuda.a + +clean: + rm $(CUDA_SRC_DIR)/*.o + rm liblammpscuda.a + +# Library target + + +# Cuda compilation rules + +%_cu.o: %.cu %_kernel.cu %_cu.h cuda_shared.h + $(VERBOSE)$(NVCC) $(NVCC_FLAGS) $(CUDA_FLAGS) $(CUDA_INCLUDES) $(CUDA_USRLIB) 
$(SMVERSIONFLAGS) -o $@ -c $< + diff --git a/lib/cuda/Makefile.defaults b/lib/cuda/Makefile.defaults new file mode 100644 index 0000000000..d006a02d81 --- /dev/null +++ b/lib/cuda/Makefile.defaults @@ -0,0 +1,16 @@ + +#precision setting: 1 single, 2 double, 4 mixed +precision ?= 2 + +#GPU architecture (compute capability): 13, 20, 21 +arch ?= 20 + +#Using cufft (should not be changed) +cufft ?= 1 + +#Using dbg mode +dbg ?= 0 + +#On mac machines set this to 0 in order to avoid usage of linux specific precision timer +prec_timer ?= 1 + diff --git a/lib/cuda/atom_vec_angle_cuda.cu b/lib/cuda/atom_vec_angle_cuda.cu new file mode 100644 index 0000000000..a11d9adbe4 --- /dev/null +++ b/lib/cuda/atom_vec_angle_cuda.cu @@ -0,0 +1,85 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ + +const unsigned int ANGLE_DATA_MASK=X_MASK|V_MASK|F_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|IMAGE_MASK|MOLECULE_MASK; + +#include "atom_vec_angle_cuda_cu.h" + +void Cuda_AtomVecAngleCuda_Init(cuda_shared_data* sdata) +{ + return Cuda_AtomVecCuda_Init(sdata); +} + +int Cuda_AtomVecAngleCuda_PackExchangeList(cuda_shared_data* sdata,int n,int dim,void* buf_send) +{ + const unsigned int data_mask=X_MASK|V_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|IMAGE_MASK|MOLECULE_MASK; + return Cuda_AtomVecCuda_PackExchangeList(sdata,n,dim,buf_send); +} + +int Cuda_AtomVecAngleCuda_PackExchange(cuda_shared_data* sdata,int nsend,void* buf_send,void* copylist) +{ + const unsigned int data_mask=X_MASK|V_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|IMAGE_MASK|MOLECULE_MASK; + return Cuda_AtomVecCuda_PackExchange(sdata,nsend,buf_send,copylist); +} + +int Cuda_AtomVecAngleCuda_UnpackExchange(cuda_shared_data* sdata,int nsend,void* buf_send,void* copylist) +{ + const unsigned int data_mask=X_MASK|V_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|IMAGE_MASK|MOLECULE_MASK; + return Cuda_AtomVecCuda_UnpackExchange(sdata,nsend,buf_send,copylist); +} + +int Cuda_AtomVecAngleCuda_PackBorder(cuda_shared_data* sdata,int nsend,int iswap,void* buf_send,int* pbc,int pbc_flag) +{ + const unsigned int data_mask=X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|MOLECULE_MASK; + return Cuda_AtomVecCuda_PackBorder(sdata,nsend,iswap,buf_send,pbc,pbc_flag); +} + +int Cuda_AtomVecAngleCuda_PackBorderVel(cuda_shared_data* sdata,int nsend,int iswap,void* buf_send,int* pbc,int pbc_flag) +{ + const unsigned int data_mask=X_MASK|V_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|MOLECULE_MASK; + return Cuda_AtomVecCuda_PackBorder(sdata,nsend,iswap,buf_send,pbc,pbc_flag); +} + +int Cuda_AtomVecAngleCuda_PackBorder_Self(cuda_shared_data* sdata,int n,int iswap,int first,int* pbc,int pbc_flag) +{ + const unsigned int data_mask=X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|MOLECULE_MASK; + return 
Cuda_AtomVecCuda_PackBorder_Self(sdata,n,iswap,first,pbc,pbc_flag); +} + +int Cuda_AtomVecAngleCuda_PackBorderVel_Self(cuda_shared_data* sdata,int n,int iswap,int first,int* pbc,int pbc_flag) +{ + const unsigned int data_mask=X_MASK|V_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|MOLECULE_MASK; + return Cuda_AtomVecCuda_PackBorder_Self(sdata,n,iswap,first,pbc,pbc_flag); +} + +int Cuda_AtomVecAngleCuda_UnpackBorder(cuda_shared_data* sdata,int n,int first,void* buf_recv) +{ + const unsigned int data_mask=X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|MOLECULE_MASK; + return Cuda_AtomVecCuda_UnpackBorder(sdata,n,first,buf_recv); +} + +int Cuda_AtomVecAngleCuda_UnpackBorderVel(cuda_shared_data* sdata,int n,int first,void* buf_recv) +{ + const unsigned int data_mask=X_MASK|V_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|MOLECULE_MASK; + return Cuda_AtomVecCuda_UnpackBorder(sdata,n,first,buf_recv); +} diff --git a/lib/cuda/atom_vec_angle_cuda_cu.h b/lib/cuda/atom_vec_angle_cuda_cu.h new file mode 100644 index 0000000000..d8f5a2b9a4 --- /dev/null +++ b/lib/cuda/atom_vec_angle_cuda_cu.h @@ -0,0 +1,15 @@ +#ifndef ATOM_VEC_ANGLE_CUDA_CU_H_ +#define ATOM_VEC_ANGLE_CUDA_CU_H_ + +extern "C" void Cuda_AtomVecAngleCuda_Init(cuda_shared_data* sdata); +extern "C" int Cuda_AtomVecAngleCuda_PackExchangeList(cuda_shared_data* sdata,int n,int dim,void* buf_send); +extern "C" int Cuda_AtomVecAngleCuda_PackExchange(cuda_shared_data* sdata,int nsend,void* buf_send,void* copylist); +extern "C" int Cuda_AtomVecAngleCuda_UnpackExchange(cuda_shared_data* sdata,int nsend,void* buf_send,void* copylist); +extern "C" int Cuda_AtomVecAngleCuda_PackBorder(cuda_shared_data* sdata,int nsend,int iswap,void* buf_send,int* pbc,int pbc_flag); +extern "C" int Cuda_AtomVecAngleCuda_PackBorderVel(cuda_shared_data* sdata,int n,int iswap,void* buf_send,int* pbc,int pbc_flag); +extern "C" int Cuda_AtomVecAngleCuda_PackBorder_Self(cuda_shared_data* sdata,int n,int iswap,int first,int* pbc,int pbc_flag); +extern "C" int 
Cuda_AtomVecAngleCuda_PackBorderVel_Self(cuda_shared_data* sdata,int n,int iswap,int first,int* pbc,int pbc_flag); +extern "C" int Cuda_AtomVecAngleCuda_UnpackBorder(cuda_shared_data* sdata,int n,int first,void* buf_recv); +extern "C" int Cuda_AtomVecAngleCuda_UnpackBorderVel(cuda_shared_data* sdata,int n,int first,void* buf_recv); + +#endif /*ATOM_VEC_ANGLE2_CUDA_CU_H_*/ diff --git a/lib/cuda/atom_vec_atomic_cuda.cu b/lib/cuda/atom_vec_atomic_cuda.cu new file mode 100644 index 0000000000..0a75de2754 --- /dev/null +++ b/lib/cuda/atom_vec_atomic_cuda.cu @@ -0,0 +1,85 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ + +const unsigned int ATOMIC_DATA_MASK=X_MASK|V_MASK|F_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|IMAGE_MASK; + +#include "atom_vec_atomic_cuda_cu.h" + +void Cuda_AtomVecAtomicCuda_Init(cuda_shared_data* sdata) +{ + return Cuda_AtomVecCuda_Init(sdata); +} + +int Cuda_AtomVecAtomicCuda_PackExchangeList(cuda_shared_data* sdata,int n,int dim,void* buf_send) +{ + const unsigned int data_mask=X_MASK|V_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|IMAGE_MASK; + return Cuda_AtomVecCuda_PackExchangeList(sdata,n,dim,buf_send); +} + +int Cuda_AtomVecAtomicCuda_PackExchange(cuda_shared_data* sdata,int nsend,void* buf_send,void* copylist) +{ + const unsigned int data_mask=X_MASK|V_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|IMAGE_MASK; + return Cuda_AtomVecCuda_PackExchange(sdata,nsend,buf_send,copylist); +} + +int Cuda_AtomVecAtomicCuda_UnpackExchange(cuda_shared_data* sdata,int nsend,void* buf_send,void* copylist) +{ + const unsigned int data_mask=X_MASK|V_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|IMAGE_MASK; + return Cuda_AtomVecCuda_UnpackExchange(sdata,nsend,buf_send,copylist); +} + +int Cuda_AtomVecAtomicCuda_PackBorder(cuda_shared_data* sdata,int nsend,int iswap,void* buf_send,int* pbc,int pbc_flag) +{ + const unsigned int data_mask=X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK; + return Cuda_AtomVecCuda_PackBorder(sdata,nsend,iswap,buf_send,pbc,pbc_flag); +} + +int Cuda_AtomVecAtomicCuda_PackBorderVel(cuda_shared_data* sdata,int nsend,int iswap,void* buf_send,int* pbc,int pbc_flag) +{ + const unsigned int data_mask=X_MASK|V_MASK|TAG_MASK|TYPE_MASK|MASK_MASK; + return Cuda_AtomVecCuda_PackBorder(sdata,nsend,iswap,buf_send,pbc,pbc_flag); +} + +int Cuda_AtomVecAtomicCuda_PackBorder_Self(cuda_shared_data* sdata,int n,int iswap,int first,int* pbc,int pbc_flag) +{ + const unsigned int data_mask=X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK; + return Cuda_AtomVecCuda_PackBorder_Self(sdata,n,iswap,first,pbc,pbc_flag); +} + +int 
Cuda_AtomVecAtomicCuda_PackBorderVel_Self(cuda_shared_data* sdata,int n,int iswap,int first,int* pbc,int pbc_flag) +{ + const unsigned int data_mask=X_MASK|V_MASK|TAG_MASK|TYPE_MASK|MASK_MASK; + return Cuda_AtomVecCuda_PackBorder_Self(sdata,n,iswap,first,pbc,pbc_flag); +} + +int Cuda_AtomVecAtomicCuda_UnpackBorder(cuda_shared_data* sdata,int n,int first,void* buf_recv) +{ + const unsigned int data_mask=X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK; + return Cuda_AtomVecCuda_UnpackBorder(sdata,n,first,buf_recv); +} + +int Cuda_AtomVecAtomicCuda_UnpackBorderVel(cuda_shared_data* sdata,int n,int first,void* buf_recv) +{ + const unsigned int data_mask=X_MASK|V_MASK|TAG_MASK|TYPE_MASK|MASK_MASK; + return Cuda_AtomVecCuda_UnpackBorder(sdata,n,first,buf_recv); +} diff --git a/lib/cuda/atom_vec_atomic_cuda_cu.h b/lib/cuda/atom_vec_atomic_cuda_cu.h new file mode 100644 index 0000000000..8e776308e0 --- /dev/null +++ b/lib/cuda/atom_vec_atomic_cuda_cu.h @@ -0,0 +1,15 @@ +#ifndef ATOM_VEC_ATOMIC_CUDA_CU_H_ +#define ATOM_VEC_ATOMIC_CUDA_CU_H_ + +extern "C" void Cuda_AtomVecAtomicCuda_Init(cuda_shared_data* sdata); +extern "C" int Cuda_AtomVecAtomicCuda_PackExchangeList(cuda_shared_data* sdata,int n,int dim,void* buf_send); +extern "C" int Cuda_AtomVecAtomicCuda_PackExchange(cuda_shared_data* sdata,int nsend,void* buf_send,void* copylist); +extern "C" int Cuda_AtomVecAtomicCuda_UnpackExchange(cuda_shared_data* sdata,int nsend,void* buf_send,void* copylist); +extern "C" int Cuda_AtomVecAtomicCuda_PackBorder(cuda_shared_data* sdata,int nsend,int iswap,void* buf_send,int* pbc,int pbc_flag); +extern "C" int Cuda_AtomVecAtomicCuda_PackBorderVel(cuda_shared_data* sdata,int n,int iswap,void* buf_send,int* pbc,int pbc_flag); +extern "C" int Cuda_AtomVecAtomicCuda_PackBorder_Self(cuda_shared_data* sdata,int n,int iswap,int first,int* pbc,int pbc_flag); +extern "C" int Cuda_AtomVecAtomicCuda_PackBorderVel_Self(cuda_shared_data* sdata,int n,int iswap,int first,int* pbc,int pbc_flag); +extern "C" int 
Cuda_AtomVecAtomicCuda_UnpackBorder(cuda_shared_data* sdata,int n,int first,void* buf_recv); +extern "C" int Cuda_AtomVecAtomicCuda_UnpackBorderVel(cuda_shared_data* sdata,int n,int first,void* buf_recv); + +#endif /*ATOM_VEC_ATOMIC2_CUDA_CU_H_*/ diff --git a/lib/cuda/atom_vec_charge_cuda.cu b/lib/cuda/atom_vec_charge_cuda.cu new file mode 100644 index 0000000000..a78ffb9de0 --- /dev/null +++ b/lib/cuda/atom_vec_charge_cuda.cu @@ -0,0 +1,85 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ + +const unsigned int CHARGE_DATA_MASK=X_MASK|V_MASK|F_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|IMAGE_MASK|Q_MASK; + +#include "atom_vec_charge_cuda_cu.h" + +void Cuda_AtomVecChargeCuda_Init(cuda_shared_data* sdata) +{ + return Cuda_AtomVecCuda_Init(sdata); +} + +int Cuda_AtomVecChargeCuda_PackExchangeList(cuda_shared_data* sdata,int n,int dim,void* buf_send) +{ + const unsigned int data_mask=X_MASK|V_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|IMAGE_MASK|Q_MASK; + return Cuda_AtomVecCuda_PackExchangeList(sdata,n,dim,buf_send); +} + +int Cuda_AtomVecChargeCuda_PackExchange(cuda_shared_data* sdata,int nsend,void* buf_send,void* copylist) +{ + const unsigned int data_mask=X_MASK|V_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|IMAGE_MASK|Q_MASK; + return Cuda_AtomVecCuda_PackExchange(sdata,nsend,buf_send,copylist); +} + +int Cuda_AtomVecChargeCuda_UnpackExchange(cuda_shared_data* sdata,int nsend,void* buf_send,void* copylist) +{ + const unsigned int data_mask=X_MASK|V_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|IMAGE_MASK|Q_MASK; + return Cuda_AtomVecCuda_UnpackExchange(sdata,nsend,buf_send,copylist); +} + +int Cuda_AtomVecChargeCuda_PackBorder(cuda_shared_data* sdata,int nsend,int iswap,void* buf_send,int* pbc,int pbc_flag) +{ + const unsigned int data_mask=X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|Q_MASK; + return Cuda_AtomVecCuda_PackBorder(sdata,nsend,iswap,buf_send,pbc,pbc_flag); +} + +int Cuda_AtomVecChargeCuda_PackBorderVel(cuda_shared_data* sdata,int nsend,int iswap,void* buf_send,int* pbc,int pbc_flag) +{ + const unsigned int data_mask=X_MASK|V_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|Q_MASK; + return Cuda_AtomVecCuda_PackBorder(sdata,nsend,iswap,buf_send,pbc,pbc_flag); +} + +int Cuda_AtomVecChargeCuda_PackBorder_Self(cuda_shared_data* sdata,int n,int iswap,int first,int* pbc,int pbc_flag) +{ + const unsigned int data_mask=X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|Q_MASK; + return 
Cuda_AtomVecCuda_PackBorder_Self(sdata,n,iswap,first,pbc,pbc_flag); +} + +int Cuda_AtomVecChargeCuda_PackBorderVel_Self(cuda_shared_data* sdata,int n,int iswap,int first,int* pbc,int pbc_flag) +{ + const unsigned int data_mask=X_MASK|V_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|Q_MASK; + return Cuda_AtomVecCuda_PackBorder_Self(sdata,n,iswap,first,pbc,pbc_flag); +} + +int Cuda_AtomVecChargeCuda_UnpackBorder(cuda_shared_data* sdata,int n,int first,void* buf_recv) +{ + const unsigned int data_mask=X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|Q_MASK; + return Cuda_AtomVecCuda_UnpackBorder(sdata,n,first,buf_recv); +} + +int Cuda_AtomVecChargeCuda_UnpackBorderVel(cuda_shared_data* sdata,int n,int first,void* buf_recv) +{ + const unsigned int data_mask=X_MASK|V_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|Q_MASK; + return Cuda_AtomVecCuda_UnpackBorder(sdata,n,first,buf_recv); +} diff --git a/lib/cuda/atom_vec_charge_cuda_cu.h b/lib/cuda/atom_vec_charge_cuda_cu.h new file mode 100644 index 0000000000..137b001847 --- /dev/null +++ b/lib/cuda/atom_vec_charge_cuda_cu.h @@ -0,0 +1,15 @@ +#ifndef ATOM_VEC_CHARGE_CUDA_CU_H_ +#define ATOM_VEC_CHARGE_CUDA_CU_H_ + +extern "C" void Cuda_AtomVecChargeCuda_Init(cuda_shared_data* sdata); +extern "C" int Cuda_AtomVecChargeCuda_PackExchangeList(cuda_shared_data* sdata,int n,int dim,void* buf_send); +extern "C" int Cuda_AtomVecChargeCuda_PackExchange(cuda_shared_data* sdata,int nsend,void* buf_send,void* copylist); +extern "C" int Cuda_AtomVecChargeCuda_UnpackExchange(cuda_shared_data* sdata,int nsend,void* buf_send,void* copylist); +extern "C" int Cuda_AtomVecChargeCuda_PackBorder(cuda_shared_data* sdata,int nsend,int iswap,void* buf_send,int* pbc,int pbc_flag); +extern "C" int Cuda_AtomVecChargeCuda_PackBorderVel(cuda_shared_data* sdata,int n,int iswap,void* buf_send,int* pbc,int pbc_flag); +extern "C" int Cuda_AtomVecChargeCuda_PackBorder_Self(cuda_shared_data* sdata,int n,int iswap,int first,int* pbc,int pbc_flag); +extern "C" int 
Cuda_AtomVecChargeCuda_PackBorderVel_Self(cuda_shared_data* sdata,int n,int iswap,int first,int* pbc,int pbc_flag); +extern "C" int Cuda_AtomVecChargeCuda_UnpackBorder(cuda_shared_data* sdata,int n,int first,void* buf_recv); +extern "C" int Cuda_AtomVecChargeCuda_UnpackBorderVel(cuda_shared_data* sdata,int n,int first,void* buf_recv); + +#endif /*ATOM_VEC_CHARGE2_CUDA_CU_H_*/ diff --git a/lib/cuda/atom_vec_cuda.cu b/lib/cuda/atom_vec_cuda.cu new file mode 100644 index 0000000000..187718dc36 --- /dev/null +++ b/lib/cuda/atom_vec_cuda.cu @@ -0,0 +1,553 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ + +#include +#define MY_PREFIX atom_vec_cuda +#include "cuda_shared.h" +#include "cuda_common.h" +#include "cuda_wrapper_cu.h" +#include "crm_cuda_utils.cu" + +#include "atom_vec_cuda_kernel.cu" + +int AtomVecCuda_CountDataItems(unsigned int data_mask) +{ + int n=0; + if(data_mask & X_MASK) n+=3; + if(data_mask & V_MASK) n+=3; + if(data_mask & F_MASK) n+=3; + if(data_mask & TAG_MASK) n++; + if(data_mask & TYPE_MASK) n++; + if(data_mask & MASK_MASK) n++; + if(data_mask & IMAGE_MASK) n++; + if(data_mask & Q_MASK) n++; + if(data_mask & MOLECULE_MASK) n++; + if(data_mask & RMASS_MASK) n++; + if(data_mask & RADIUS_MASK) n++; + if(data_mask & DENSITY_MASK) n++; + if(data_mask & OMEGA_MASK) n+=3; + if(data_mask & TORQUE_MASK) n++; + + //if(data_mask & NSPECIAL_MASK) n+=3; + return n; +} + +void Cuda_AtomVecCuda_UpdateBuffer(cuda_shared_data* sdata,int size) +{ + if(sdata->buffersizebuffer,sdata->buffersize);) + CudaWrapper_FreeCudaData(sdata->buffer,sdata->buffersize); + sdata->buffer = CudaWrapper_AllocCudaData(size); + sdata->buffersize=size; + sdata->buffer_new++; + MYDBG(printf("New buffer at %p with %i kB\n",sdata->buffer,sdata->buffersize);) + } + cudaMemcpyToSymbol(MY_CONST(buffer), & sdata->buffer, sizeof(int*) ); +} + +template +void Cuda_AtomVecCuda_UpdateNmax(cuda_shared_data* sdata) +{ + cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); + cudaMemcpyToSymbol(MY_CONST(nmax) , & sdata->atom.nmax , sizeof(int) ); + cudaMemcpyToSymbol(MY_CONST(x) , & sdata->atom.x .dev_data, sizeof(X_FLOAT*) ); + cudaMemcpyToSymbol(MY_CONST(v) , & sdata->atom.v .dev_data, sizeof(V_FLOAT*) ); + cudaMemcpyToSymbol(MY_CONST(f) , & sdata->atom.f .dev_data, sizeof(F_FLOAT*) ); + cudaMemcpyToSymbol(MY_CONST(tag) , & sdata->atom.tag .dev_data, sizeof(int*) ); + cudaMemcpyToSymbol(MY_CONST(type) , & sdata->atom.type .dev_data, sizeof(int*) ); + cudaMemcpyToSymbol(MY_CONST(mask) , & 
sdata->atom.mask .dev_data, sizeof(int*) ); + cudaMemcpyToSymbol(MY_CONST(image) , & sdata->atom.image.dev_data, sizeof(int*) ); + if(data_mask & Q_MASK) cudaMemcpyToSymbol(MY_CONST(q) , & sdata->atom.q .dev_data, sizeof(F_FLOAT*) ); + if(data_mask & MOLECULE_MASK) cudaMemcpyToSymbol(MY_CONST(molecule) , & sdata->atom.molecule.dev_data, sizeof(int*) ); + if(data_mask & RADIUS_MASK) cudaMemcpyToSymbol(MY_CONST(radius) , & sdata->atom.radius.dev_data, sizeof(int*) ); + if(data_mask & DENSITY_MASK) cudaMemcpyToSymbol(MY_CONST(density) , & sdata->atom.density.dev_data, sizeof(int*) ); + if(data_mask & RMASS_MASK) cudaMemcpyToSymbol(MY_CONST(rmass) , & sdata->atom.rmass.dev_data, sizeof(int*) ); + if(data_mask & OMEGA_MASK) cudaMemcpyToSymbol(MY_CONST(omega) , & sdata->atom.omega.dev_data, sizeof(int*) ); + //if(data_mask & NSPECIAL_MASK) cudaMemcpyToSymbol(MY_CONST(nspecial) , & sdata->atom.nspecial.dev_data, sizeof(int*) ); + cudaMemcpyToSymbol(MY_CONST(flag) , & sdata->flag, sizeof(int*) ); +} + +template +void Cuda_AtomVecCuda_Init(cuda_shared_data* sdata) +{ + MYDBG( printf("# CUDA: Cuda_AtomVecCuda_Init ... start\n"); ) + Cuda_AtomVecCuda_UpdateNmax(sdata); + MYDBG( printf("# CUDA: Cuda_AtomVecCuda_Init ... post Nmax\n"); ) + cudaMemcpyToSymbol(MY_CONST(prd) , sdata->domain.prd, 3*sizeof(X_FLOAT)); + cudaMemcpyToSymbol(MY_CONST(sublo) , & sdata->domain.sublo, 3*sizeof(X_FLOAT) ); + cudaMemcpyToSymbol(MY_CONST(subhi) , & sdata->domain.subhi, 3*sizeof(X_FLOAT) ); + cudaMemcpyToSymbol(MY_CONST(flag) , & sdata->flag, sizeof(int*) ); + MYDBG( printf("# CUDA: Cuda_AtomVecCuda_Init ... 
end\n"); ) +} + + +template +int Cuda_AtomVecCuda_PackComm(cuda_shared_data* sdata,int n,int iswap,void* buf_send,int* pbc,int pbc_flag) +{ + + timespec time1,time2; + if(sdata->atom.update_nmax) + Cuda_AtomVecCuda_UpdateNmax(sdata); + if(sdata->atom.update_nlocal) + cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); + int n_data_items=AtomVecCuda_CountDataItems(data_mask); + int size=(n*n_data_items)*sizeof(X_FLOAT); + if(sdata->buffer_new or (size>sdata->buffersize)) + Cuda_AtomVecCuda_UpdateBuffer(sdata,size); + + X_FLOAT dx=0.0; + X_FLOAT dy=0.0; + X_FLOAT dz=0.0; + if (pbc_flag != 0) { + if (sdata->domain.triclinic == 0) { + dx = pbc[0]*sdata->domain.prd[0]; + dy = pbc[1]*sdata->domain.prd[1]; + dz = pbc[2]*sdata->domain.prd[2]; + } else { + dx = pbc[0]*sdata->domain.prd[0] + pbc[5]*sdata->domain.xy + pbc[4]*sdata->domain.xz; + dy = pbc[1]*sdata->domain.prd[1] + pbc[3]*sdata->domain.yz; + dz = pbc[2]*sdata->domain.prd[2]; + }} + + int3 layout=getgrid(n); + dim3 threads(layout.z, 1, 1); + dim3 grid(layout.x, layout.y, 1); + + if(sdata->atom.nlocal>0) + { + cudaMemset( sdata->flag,0,sizeof(int)); + +clock_gettime(CLOCK_REALTIME,&time1); + + void* buf=sdata->overlap_comm?sdata->comm.buf_send_dev[iswap]:sdata->buffer; + Cuda_AtomVecCuda_PackComm_Kernel<<>>((int*) sdata->comm.sendlist.dev_data,n + ,sdata->comm.maxlistlength,iswap,dx,dy,dz,buf); + cudaThreadSynchronize(); + +clock_gettime(CLOCK_REALTIME,&time2); +sdata->cuda_timings.comm_forward_kernel_pack+= + time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000; + + CUT_CHECK_ERROR("Cuda_AtomVecCuda_PackComm: Kernel execution failed"); + if(not sdata->overlap_comm) + cudaMemcpy(buf_send, sdata->buffer, n*n_data_items*sizeof(X_FLOAT), cudaMemcpyDeviceToHost); + //cudaMemcpy(buf_send, sdata->comm.buf_send_dev[iswap], n*3*sizeof(X_FLOAT), cudaMemcpyDeviceToHost); + +clock_gettime(CLOCK_REALTIME,&time1); +sdata->cuda_timings.comm_forward_download+= + 
time1.tv_sec-time2.tv_sec+1.0*(time1.tv_nsec-time2.tv_nsec)/1000000000; + + int aflag; + cudaMemcpy(&aflag, sdata->flag, sizeof(int), cudaMemcpyDeviceToHost); + if(aflag!=0) printf("aflag PackComm: %i\n",aflag); + CUT_CHECK_ERROR("Cuda_AtomVecCuda_PackComm: Kernel execution failed"); + + } + return n_data_items*n; +} + + +template +int Cuda_AtomVecCuda_PackComm_Self(cuda_shared_data* sdata,int n,int iswap,int first,int* pbc,int pbc_flag) +{ + MYDBG(printf(" # CUDA: AtomVecCuda_PackComm_Self\n");) + timespec time1,time2; + if(sdata->atom.update_nmax) + Cuda_AtomVecCuda_UpdateNmax(sdata); + if(sdata->atom.update_nlocal) + cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); + int n_data_items=AtomVecCuda_CountDataItems(data_mask); + int size=(n*n_data_items)*sizeof(X_FLOAT); + if(sdata->buffer_new or (size>sdata->buffersize)) + Cuda_AtomVecCuda_UpdateBuffer(sdata,size); + static int count=-1; + count++; + X_FLOAT dx=0.0; + X_FLOAT dy=0.0; + X_FLOAT dz=0.0; + if (pbc_flag != 0) { + if (sdata->domain.triclinic == 0) { + dx = pbc[0]*sdata->domain.prd[0]; + dy = pbc[1]*sdata->domain.prd[1]; + dz = pbc[2]*sdata->domain.prd[2]; + } else { + dx = pbc[0]*sdata->domain.prd[0] + pbc[5]*sdata->domain.xy + pbc[4]*sdata->domain.xz; + dy = pbc[1]*sdata->domain.prd[1] + pbc[3]*sdata->domain.yz; + dz = pbc[2]*sdata->domain.prd[2]; + }} + + + + int3 layout=getgrid(n); + dim3 threads(layout.z, 1, 1); + dim3 grid(layout.x, layout.y, 1); + if(sdata->atom.nlocal>0) + { + +clock_gettime(CLOCK_REALTIME,&time1); + CUT_CHECK_ERROR("Cuda_AtomVecCuda_PackComm_Self:Pre Kernel execution failed"); + + Cuda_AtomVecCuda_PackComm_Self_Kernel<<>>((int*) sdata->comm.sendlist.dev_data,n,sdata->comm.maxlistlength,iswap,dx,dy,dz,first); + cudaThreadSynchronize(); + +clock_gettime(CLOCK_REALTIME,&time2); +sdata->cuda_timings.comm_forward_kernel_self+= + time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000; + + CUT_CHECK_ERROR("Cuda_AtomVecCuda_PackComm_Self: Kernel 
execution failed"); + } + + return n_data_items*n; +} + + +template +void Cuda_AtomVecCuda_UnpackComm(cuda_shared_data* sdata,int n,int first,void* buf_recv,int iswap) +{ + timespec time1,time2; + + if(sdata->atom.update_nmax) + Cuda_AtomVecCuda_UpdateNmax(sdata); + if(sdata->atom.update_nlocal) + cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); + int n_data_items=AtomVecCuda_CountDataItems(data_mask); + int size=(n*n_data_items)*sizeof(X_FLOAT); + if(sdata->buffer_new or (size>sdata->buffersize)) + Cuda_AtomVecCuda_UpdateBuffer(sdata,size); + + int3 layout=getgrid(n); + dim3 threads(layout.z, 1, 1); + dim3 grid(layout.x, layout.y, 1); + if(sdata->atom.nlocal>0) + { +clock_gettime(CLOCK_REALTIME,&time1); + if(not sdata->overlap_comm||iswap<0) + cudaMemcpy(sdata->buffer,(void*)buf_recv, n_data_items*n*sizeof(X_FLOAT), cudaMemcpyHostToDevice); + +clock_gettime(CLOCK_REALTIME,&time2); +sdata->cuda_timings.comm_forward_upload+= + time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000; + void* buf=(sdata->overlap_comm&&iswap>=0)?sdata->comm.buf_recv_dev[iswap]:sdata->buffer; + Cuda_AtomVecCuda_UnpackComm_Kernel<<>>(n,first,buf); + cudaThreadSynchronize(); + +clock_gettime(CLOCK_REALTIME,&time1); +sdata->cuda_timings.comm_forward_kernel_unpack+= + time1.tv_sec-time2.tv_sec+1.0*(time1.tv_nsec-time2.tv_nsec)/1000000000; + + CUT_CHECK_ERROR("Cuda_AtomVecCuda_UnpackComm: Kernel execution failed"); + + } +} + +template +int Cuda_AtomVecCuda_PackExchangeList(cuda_shared_data* sdata,int n,int dim,void* buf_send) +{ + MYDBG( printf("# CUDA: Cuda_AtomVecCuda_PackExchangeList ... 
start dim %i \n",dim); ) + cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); + Cuda_AtomVecCuda_Init(sdata); + int size=n*sizeof(double); + if(sdata->buffer_new or (size>sdata->buffersize)) + Cuda_AtomVecCuda_UpdateBuffer(sdata,size); + + cudaMemset((int*) (sdata->buffer),0,sizeof(int)); + + int3 layout=getgrid(sdata->atom.nlocal,sizeof(int),256,true); + dim3 threads(layout.z, 1, 1); + dim3 grid(layout.x, layout.y, 1); + + timespec time1,time2; + clock_gettime(CLOCK_REALTIME,&time1); + + Cuda_AtomVecCuda_PackExchangeList_Kernel<<>>(n-1,dim); + cudaThreadSynchronize(); + CUT_CHECK_ERROR("Cuda_AtomVecCuda_PackExchangeList: Kernel execution failed"); + + clock_gettime(CLOCK_REALTIME,&time2); + sdata->cuda_timings.comm_exchange_kernel_pack+= + time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000; + + cudaMemcpy(buf_send, sdata->buffer, sizeof(double), cudaMemcpyDeviceToHost); + int return_value = ((int*) buf_send)[0]; + cudaMemcpy(buf_send, sdata->buffer, (1+return_value)*sizeof(double), cudaMemcpyDeviceToHost); + +clock_gettime(CLOCK_REALTIME,&time1); +sdata->cuda_timings.comm_exchange_download+= + time1.tv_sec-time2.tv_sec+1.0*(time1.tv_nsec-time2.tv_nsec)/1000000000; + + MYDBG( printf("# CUDA: Cuda_AtomVecCuda_PackExchangeList ... done\n"); ) + return return_value; +} + +template +int Cuda_AtomVecCuda_PackExchange(cuda_shared_data* sdata,int nsend,void* buf_send,void* copylist) +{ + MYDBG( printf("# CUDA: Cuda_AtomVecCuda_PackExchange ... 
start \n"); ) + Cuda_AtomVecCuda_UpdateNmax(sdata); + cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); + + int n_data_items=AtomVecCuda_CountDataItems(data_mask)+1; + int size=(nsend*n_data_items+1)*sizeof(double); + if(sdata->buffer_new or (size>sdata->buffersize)) + Cuda_AtomVecCuda_UpdateBuffer(sdata,size); + + cudaMemset((int*) (sdata->buffer),0,sizeof(int)); + + int3 layout=getgrid(nsend,0); + dim3 threads(layout.z, 1, 1); + dim3 grid(layout.x, layout.y, 1); + + timespec time1,time2; + clock_gettime(CLOCK_REALTIME,&time1); + + Cuda_AtomVecCuda_PackExchange_Kernel<<>>(nsend,(int*) copylist); + cudaThreadSynchronize(); + CUT_CHECK_ERROR("Cuda_AtomVecCuda_PackExchangeList: Kernel execution failed"); + + clock_gettime(CLOCK_REALTIME,&time2); + sdata->cuda_timings.comm_exchange_kernel_pack+= + time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000; + + cudaMemcpy(buf_send, sdata->buffer, size, cudaMemcpyDeviceToHost); + + clock_gettime(CLOCK_REALTIME,&time1); + sdata->cuda_timings.comm_exchange_download+= + time1.tv_sec-time2.tv_sec+1.0*(time1.tv_nsec-time2.tv_nsec)/1000000000; + + MYDBG( printf("# CUDA: Cuda_AtomVecCuda_PackExchangeList ... 
done\n"); ) + return nsend*n_data_items+1; +} + + +template +int Cuda_AtomVecCuda_UnpackExchange(cuda_shared_data* sdata,int nsend,void* buf_send,void* copylist) +{ + Cuda_AtomVecCuda_UpdateNmax(sdata); + cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); + int n_data_items=AtomVecCuda_CountDataItems(data_mask)+1; + + int size=(nsend*n_data_items+1)*sizeof(double); + if(sdata->buffer_new or (size>sdata->buffersize)) + Cuda_AtomVecCuda_UpdateBuffer(sdata,size); + cudaMemcpyToSymbol(MY_CONST(flag) , & sdata->flag, sizeof(int*) ); + + cudaMemset((int*) (sdata->flag),0,sizeof(int)); + if(nsend) + { + int3 layout=getgrid(nsend,0); + dim3 threads(layout.z, 1, 1); + dim3 grid(layout.x, layout.y, 1); + if(sdata->atom.nlocal>0) + { + timespec time1,time2; + clock_gettime(CLOCK_REALTIME,&time1); + + cudaMemcpy(sdata->buffer,buf_send , size, cudaMemcpyHostToDevice); + + clock_gettime(CLOCK_REALTIME,&time2); + sdata->cuda_timings.comm_exchange_upload+= + time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000; + + Cuda_AtomVecCuda_UnpackExchange_Kernel<<>>(sdata->exchange_dim,nsend,(int*) copylist); + cudaThreadSynchronize(); + + clock_gettime(CLOCK_REALTIME,&time1); + sdata->cuda_timings.comm_exchange_kernel_unpack+= + time1.tv_sec-time2.tv_sec+1.0*(time1.tv_nsec-time2.tv_nsec)/1000000000; + + CUT_CHECK_ERROR("Cuda_AtomVecCuda_UnpackExchange: Kernel execution failed"); + } + } + int naccept; + cudaMemcpy((void*)&naccept, sdata->flag, sizeof(int), cudaMemcpyDeviceToHost); + + return naccept; +} + +template +int Cuda_AtomVecCuda_PackBorder(cuda_shared_data* sdata,int nsend,int iswap,void* buf_send,int* pbc,int pbc_flag) +{ + timespec atime1,atime2; + clock_gettime(CLOCK_REALTIME,&atime1); + if(sdata->atom.update_nmax) + Cuda_AtomVecCuda_UpdateNmax(sdata); + + cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); + clock_gettime(CLOCK_REALTIME,&atime2); + sdata->cuda_timings.test1+= + 
atime2.tv_sec-atime1.tv_sec+1.0*(atime2.tv_nsec-atime1.tv_nsec)/1000000000; + + int n_data_items=AtomVecCuda_CountDataItems(data_mask); + + int size=nsend*n_data_items*sizeof(X_FLOAT); + if(sdata->buffer_new or (size>sdata->buffersize)) + Cuda_AtomVecCuda_UpdateBuffer(sdata,size); + + X_FLOAT dx=0.0; + X_FLOAT dy=0.0; + X_FLOAT dz=0.0; + if (pbc_flag != 0) { + if (sdata->domain.triclinic == 0) { + dx = pbc[0]*sdata->domain.prd[0]; + dy = pbc[1]*sdata->domain.prd[1]; + dz = pbc[2]*sdata->domain.prd[2]; + } else { + dx = pbc[0]; + dy = pbc[1]; + dz = pbc[2]; + }} + + int3 layout=getgrid(nsend); + dim3 threads(layout.z, 1, 1); + dim3 grid(layout.x, layout.y, 1); + + if(sdata->atom.nlocal>0) + { + timespec time1,time2; + clock_gettime(CLOCK_REALTIME,&time1); + + Cuda_AtomVecCuda_PackBorder_Kernel<<>>((int*) sdata->comm.sendlist.dev_data,nsend,sdata->comm.maxlistlength,iswap,dx,dy,dz); + cudaThreadSynchronize(); + + clock_gettime(CLOCK_REALTIME,&time2); + sdata->cuda_timings.comm_border_kernel_pack+= + time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000; + + cudaMemcpy(buf_send, sdata->buffer, size, cudaMemcpyDeviceToHost); + CUT_CHECK_ERROR("Cuda_AtomVecCuda_PackBorder: Kernel execution failed"); + + clock_gettime(CLOCK_REALTIME,&time1); + sdata->cuda_timings.comm_border_download+= + time1.tv_sec-time2.tv_sec+1.0*(time1.tv_nsec-time2.tv_nsec)/1000000000; + + } + return nsend*n_data_items; +} + +template +int Cuda_AtomVecCuda_PackBorder_Self(cuda_shared_data* sdata,int n,int iswap,int first,int* pbc,int pbc_flag) +{ + if(sdata->atom.update_nmax) + Cuda_AtomVecCuda_UpdateNmax(sdata); + + cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); + + int n_data_items=AtomVecCuda_CountDataItems(data_mask); + + int size=n*n_data_items*sizeof(X_FLOAT); + if(sdata->buffer_new or (size>sdata->buffersize)) + Cuda_AtomVecCuda_UpdateBuffer(sdata,size); + + X_FLOAT dx=0.0; + X_FLOAT dy=0.0; + X_FLOAT dz=0.0; + if (pbc_flag != 0) { + if 
(sdata->domain.triclinic == 0) { + dx = pbc[0]*sdata->domain.prd[0]; + dy = pbc[1]*sdata->domain.prd[1]; + dz = pbc[2]*sdata->domain.prd[2]; + } else { + dx = pbc[0]; + dy = pbc[1]; + dz = pbc[2]; + }} + + int3 layout=getgrid(n); + dim3 threads(layout.z, 1, 1); + dim3 grid(layout.x, layout.y, 1); + if(sdata->atom.nlocal>0) + { + timespec time1,time2; + clock_gettime(CLOCK_REALTIME,&time1); + + Cuda_AtomVecCuda_PackBorder_Self_Kernel<<>>((int*) sdata->comm.sendlist.dev_data,n,sdata->comm.maxlistlength,iswap,dx,dy,dz,first); + cudaThreadSynchronize(); + + clock_gettime(CLOCK_REALTIME,&time2); + sdata->cuda_timings.comm_border_kernel_self+= + time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000; + + CUT_CHECK_ERROR("Cuda_AtomVecCuda_PackBorder_Self: Kernel execution failed"); + + } + return n*n_data_items; +} + + +template +int Cuda_AtomVecCuda_UnpackBorder(cuda_shared_data* sdata,int n,int first,void* buf_recv) +{ + timespec atime1,atime2; + clock_gettime(CLOCK_REALTIME,&atime1); + if(sdata->atom.update_nmax) + Cuda_AtomVecCuda_UpdateNmax(sdata); + + //if(sdata->atom.update_nlocal) + cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); + clock_gettime(CLOCK_REALTIME,&atime2); + sdata->cuda_timings.test1+= + atime2.tv_sec-atime1.tv_sec+1.0*(atime2.tv_nsec-atime1.tv_nsec)/1000000000; + + int n_data_items=AtomVecCuda_CountDataItems(data_mask); + + int size=n*n_data_items*sizeof(X_FLOAT); + if(sdata->buffer_new or (size>sdata->buffersize)) + Cuda_AtomVecCuda_UpdateBuffer(sdata,size); + + int3 layout=getgrid(n); + dim3 threads(layout.z, 1, 1); + dim3 grid(layout.x, layout.y, 1); + if(sdata->atom.nlocal>0) + { + timespec time1,time2; + clock_gettime(CLOCK_REALTIME,&time1); + + cudaMemset((int*) (sdata->flag),0,sizeof(int)); + cudaMemcpy(sdata->buffer,(void*)buf_recv, size, cudaMemcpyHostToDevice); + + clock_gettime(CLOCK_REALTIME,&time2); + sdata->cuda_timings.comm_border_upload+= + 
time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000; + + Cuda_AtomVecCuda_UnpackBorder_Kernel<<>>(n,first); + cudaThreadSynchronize(); + + clock_gettime(CLOCK_REALTIME,&time1); + sdata->cuda_timings.comm_border_kernel_unpack+= + time1.tv_sec-time2.tv_sec+1.0*(time1.tv_nsec-time2.tv_nsec)/1000000000; + + cudaMemcpy(&sdata->comm.grow_flag,sdata->flag, sizeof(int), cudaMemcpyDeviceToHost); + + CUT_CHECK_ERROR("Cuda_AtomVecCuda_UnpackBorder: Kernel execution failed"); + + } + return sdata->comm.grow_flag; +} + + +#include "atom_vec_angle_cuda.cu" +#include "atom_vec_atomic_cuda.cu" +#include "atom_vec_charge_cuda.cu" +#include "atom_vec_full_cuda.cu" +//#include "atom_vec_granular_cuda.cu" diff --git a/lib/cuda/atom_vec_cuda_cu.h b/lib/cuda/atom_vec_cuda_cu.h new file mode 100644 index 0000000000..e69de29bb2 diff --git a/lib/cuda/atom_vec_cuda_kernel.cu b/lib/cuda/atom_vec_cuda_kernel.cu new file mode 100644 index 0000000000..0ec079d45b --- /dev/null +++ b/lib/cuda/atom_vec_cuda_kernel.cu @@ -0,0 +1,371 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ +#define RIMLARGER 1.000001 +#define RIMSMALLER 0.999999 +#define SMALL 1e-5 + +extern __shared__ int shared[]; + +template +__global__ void Cuda_AtomVecCuda_PackComm_Kernel(int* sendlist,int n,int maxlistlength,int iswap,X_FLOAT dx,X_FLOAT dy,X_FLOAT dz,void* buffer) +{ + int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x; + int* list=sendlist+iswap*maxlistlength; + if(i_nmax) _flag[0]=1; + int k=0; + if(data_mask & X_MASK){ + ((X_FLOAT*) buffer)[i+k*n]=_x[j] + dx; k++; + ((X_FLOAT*) buffer)[i+k*n] = _x[j+_nmax] + dy; k++; + ((X_FLOAT*) buffer)[i+k*n] = _x[j+2*_nmax] + dz; k++;} + if(data_mask & V_MASK){ + ((X_FLOAT*) buffer)[i+k*n]=_v[j]; k++; + ((X_FLOAT*) buffer)[i+k*n] = _v[j+_nmax]; k++; + ((X_FLOAT*) buffer)[i+k*n] = _v[j+2*_nmax]; k++;} + if(data_mask & OMEGA_MASK){ + ((X_FLOAT*) buffer)[i+k*n]=_omega[j]; k++; + ((X_FLOAT*) buffer)[i+k*n] = _omega[j+_nmax]; k++; + ((X_FLOAT*) buffer)[i+k*n] = _omega[j+2*_nmax]; k++;} + if(data_mask & RADIUS_MASK) ((X_FLOAT*) buffer)[i+k*n]=_radius[j]; k++; + if(data_mask & RMASS_MASK) ((X_FLOAT*) buffer)[i+k*n]=_rmass[j]; k++; + } +} + +template +__global__ void Cuda_AtomVecCuda_PackComm_Self_Kernel(int* sendlist,int n,int maxlistlength,int iswap,X_FLOAT dx,X_FLOAT dy,X_FLOAT dz,int first) +{ + int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x; + + int* list=sendlist+iswap*maxlistlength; + if(i +__global__ void Cuda_AtomVecCuda_UnpackComm_Kernel(int n,int first,void* buffer) +{ + int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x; + if(i (_x[i+dim*_nmax]); + if (xdim_tmp < _sublo[dim] || xdim_tmp >= _subhi[dim]) + { + add=true; + } + } + shared[threadIdx.x]=add?1:0; + __syncthreads(); + int nsend=0; + if(threadIdx.x==0) + { + for(int k=0;k +__global__ void Cuda_AtomVecCuda_PackExchange_Kernel(int nsend, int* copylist) +{ + double* buf=(double*) _buffer; + int 
k=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x; + if(k>=nsend) return; + buf=&buf[1+k]; + + int i=static_cast (buf[0]); + int j=copylist[k]; + + int m=1; + if(data_mask & X_MASK){ + buf[(m++)*nsend] = static_cast (_x[i]); + buf[(m++)*nsend] = static_cast (_x[i+_nmax]); + buf[(m++)*nsend] = static_cast (_x[i+2*_nmax]);} + if(data_mask & V_MASK){ + buf[(m++)*nsend] = _v[i]; + buf[(m++)*nsend] = _v[i+_nmax]; + buf[(m++)*nsend] = _v[i+2*_nmax];} + if(data_mask & TAG_MASK) buf[(m++)*nsend] = _tag[i]; + if(data_mask & TYPE_MASK) buf[(m++)*nsend] = _type[i]; + if(data_mask & MASK_MASK) buf[(m++)*nsend] = _mask[i]; + if(data_mask & IMAGE_MASK) buf[(m++)*nsend] = _image[i]; + if(data_mask & Q_MASK) buf[(m++)*nsend] = _q[i]; + if(data_mask & MOLECULE_MASK) buf[(m++)*nsend] = _molecule[i]; + if(data_mask & RADIUS_MASK) buf[(m++)*nsend] = _radius[i]; + if(data_mask & DENSITY_MASK) buf[(m++)*nsend] = _density[i]; + if(data_mask & RMASS_MASK) buf[(m++)*nsend] = _rmass[i]; + if(data_mask & OMEGA_MASK) { + buf[(m++)*nsend] = _omega[i]; + buf[(m++)*nsend] = _omega[i+_nmax]; + buf[(m++)*nsend] = _omega[i+2*_nmax];} + +/* if(data_mask & NSPECIAL_MASK) + { + buf[(m++)*nsend] = _nspecial[i]; + buf[(m++)*nsend] = _nspecial[i+_nmax]; + buf[(m++)*nsend] = _nspecial[i+2* _nmax]; + }*/ + + if(i>=_nlocal) return; + if(data_mask & X_MASK){ + _x[i] = _x[j]; + _x[i+_nmax] = _x[j+_nmax]; + _x[i+2*_nmax] = _x[j+2*_nmax];} + if(data_mask & V_MASK){ + _v[i] = _v[j]; + _v[i+_nmax] = _v[j+_nmax]; + _v[i+2*_nmax] = _v[j+2*_nmax];} + if(data_mask & TAG_MASK) _tag[i] = _tag[j]; + if(data_mask & TYPE_MASK) _type[i] = _type[j]; + if(data_mask & MASK_MASK) _mask[i] = _mask[j]; + if(data_mask & IMAGE_MASK) _image[i] = _image[j]; + + if(data_mask & Q_MASK) _q[i] = _q[j]; + if(data_mask & MOLECULE_MASK) _molecule[i]= _molecule[j]; + if(data_mask & RADIUS_MASK) _radius[i] = _radius[j]; + if(data_mask & DENSITY_MASK) _density[i] = _density[j]; + if(data_mask & RMASS_MASK) _rmass[i] = _rmass[j]; + 
if(data_mask & OMEGA_MASK) + { + _omega[i] = _omega[j]; + _omega[i+_nmax] = _omega[j+_nmax]; + _omega[i+2*_nmax] = _omega[j+2*_nmax]; + } + /* if(data_mask & NSPECIAL_MASK) + { + _nspecial[i] = _nspecial[j]; + _nspecial[i+_nmax] = _nspecial[j+_nmax]; + _nspecial[i+2* _nmax] = _nspecial[j+2* _nmax]; + }*/ +} + +template +__global__ void Cuda_AtomVecCuda_UnpackExchange_Kernel(int dim,int nsend,int* copylist) +{ + double* buf=(double*) _buffer; + int k=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x; + if(k>=nsend) return; + buf=&buf[1+k]; + int i=-1; + double xdim_tmp = buf[(1+dim)*nsend]; + if(xdim_tmp>=_sublo[dim]-SMALL && xdim_tmp<_subhi[dim]+SMALL) + { + i=atomicAdd(_flag,1)+_nlocal; + + int m=1; + if(data_mask & X_MASK){ + _x[i] = buf[(m++)*nsend]; + _x[i+_nmax] = buf[(m++)*nsend]; + _x[i+2*_nmax] = buf[(m++)*nsend];} + if(data_mask & V_MASK){ + _v[i] = buf[(m++)*nsend]; + _v[i+_nmax] = buf[(m++)*nsend]; + _v[i+2*_nmax] = buf[(m++)*nsend];} + if(data_mask & TAG_MASK) _tag[i] = buf[(m++)*nsend]; + if(data_mask & TYPE_MASK) _type[i] = buf[(m++)*nsend]; + if(data_mask & MASK_MASK) _mask[i] = buf[(m++)*nsend]; + if(data_mask & IMAGE_MASK) _image[i] = buf[(m++)*nsend]; + + if(data_mask & Q_MASK) _q[i] = buf[(m++)*nsend]; + if(data_mask & MOLECULE_MASK) _molecule[i] = buf[(m++)*nsend]; + if(data_mask & RADIUS_MASK) _radius[i] = buf[(m++)*nsend]; + if(data_mask & DENSITY_MASK) _density[i] = buf[(m++)*nsend]; + if(data_mask & RMASS_MASK) _rmass[i] = buf[(m++)*nsend]; + if(data_mask & OMEGA_MASK) + { + _omega[i] = buf[(m++)*nsend]; + _omega[i+_nmax] = buf[(m++)*nsend]; + _omega[i+2*_nmax] = buf[(m++)*nsend]; + } + /* if(data_mask & NSPECIAL_MASK) + { + _nspecial[i] = buf[(m++)*nsend]; + _nspecial[i+_nmax] = buf[(m++)*nsend]; + _nspecial[i+2*_nmax] = buf[(m++)*nsend]; + }*/ + } + copylist[k]=i; +} + +template +__global__ void Cuda_AtomVecCuda_PackBorder_Kernel(int* sendlist,int n,int maxlistlength,int iswap,X_FLOAT dx,X_FLOAT dy,X_FLOAT dz) +{ + int 
i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x; + int* list=sendlist+iswap*maxlistlength; + if(i +__global__ void Cuda_AtomVecCuda_PackBorder_Self_Kernel(int* sendlist,int n,int maxlistlength,int iswap,X_FLOAT dx,X_FLOAT dy,X_FLOAT dz,int first) +{ + int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x; + int* list=sendlist+iswap*maxlistlength; + if(i +__global__ void Cuda_AtomVecCuda_UnpackBorder_Kernel(int n,int first) +{ + int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x; + if(i (((X_FLOAT*) _buffer)[i+(m++)*n]); + if(data_mask & TYPE_MASK) _type[i+first] = static_cast (((X_FLOAT*) _buffer)[i+(m++)*n]); + if(data_mask & MASK_MASK) _mask[i+first] = static_cast (((X_FLOAT*) _buffer)[i+(m++)*n]); + if(data_mask & Q_MASK) _q[i+first]=((X_FLOAT*) _buffer)[i+(m++)*n]; + if(data_mask & MOLECULE_MASK) _molecule[i+first] = static_cast (((X_FLOAT*) _buffer)[i+(m++)*n]); + if(data_mask & RADIUS_MASK) _radius[i+first]=((X_FLOAT*) _buffer)[i+(m++)*n]; + if(data_mask & DENSITY_MASK) _density[i+first]=((X_FLOAT*) _buffer)[i+(m++)*n]; + if(data_mask & RMASS_MASK) _rmass[i+first]=((X_FLOAT*) _buffer)[i+(m++)*n]; + if(data_mask & OMEGA_MASK) { + _omega[i+first]=((X_FLOAT*) _buffer)[i+(m++)*n]; + _omega[i+first+_nmax]=((X_FLOAT*) _buffer)[i+(m++)*n]; + _omega[i+first+2*_nmax]=((X_FLOAT*) _buffer)[i+(m++)*n];} + } + else + { + _flag[0]=1; + } + } +} + + diff --git a/lib/cuda/atom_vec_full_cuda.cu b/lib/cuda/atom_vec_full_cuda.cu new file mode 100644 index 0000000000..a5aae11824 --- /dev/null +++ b/lib/cuda/atom_vec_full_cuda.cu @@ -0,0 +1,85 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. 
+ + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +const unsigned int FULL_DATA_MASK=X_MASK|V_MASK|F_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|IMAGE_MASK|Q_MASK|MOLECULE_MASK; + +#include "atom_vec_full_cuda_cu.h" + +void Cuda_AtomVecFullCuda_Init(cuda_shared_data* sdata) +{ + return Cuda_AtomVecCuda_Init(sdata); +} + +int Cuda_AtomVecFullCuda_PackExchangeList(cuda_shared_data* sdata,int n,int dim,void* buf_send) +{ + const unsigned int data_mask=X_MASK|V_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|IMAGE_MASK|Q_MASK|MOLECULE_MASK; + return Cuda_AtomVecCuda_PackExchangeList(sdata,n,dim,buf_send); +} + +int Cuda_AtomVecFullCuda_PackExchange(cuda_shared_data* sdata,int nsend,void* buf_send,void* copylist) +{ + const unsigned int data_mask=X_MASK|V_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|IMAGE_MASK|Q_MASK|MOLECULE_MASK; + return Cuda_AtomVecCuda_PackExchange(sdata,nsend,buf_send,copylist); +} + +int Cuda_AtomVecFullCuda_UnpackExchange(cuda_shared_data* sdata,int nsend,void* buf_send,void* copylist) +{ + const unsigned int data_mask=X_MASK|V_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|IMAGE_MASK|Q_MASK|MOLECULE_MASK; + return Cuda_AtomVecCuda_UnpackExchange(sdata,nsend,buf_send,copylist); +} + +int Cuda_AtomVecFullCuda_PackBorder(cuda_shared_data* sdata,int nsend,int iswap,void* buf_send,int* pbc,int pbc_flag) +{ + const unsigned int data_mask=X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|Q_MASK|MOLECULE_MASK; + return Cuda_AtomVecCuda_PackBorder(sdata,nsend,iswap,buf_send,pbc,pbc_flag); +} + +int 
Cuda_AtomVecFullCuda_PackBorderVel(cuda_shared_data* sdata,int nsend,int iswap,void* buf_send,int* pbc,int pbc_flag) +{ + const unsigned int data_mask=X_MASK|V_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|Q_MASK|MOLECULE_MASK; + return Cuda_AtomVecCuda_PackBorder(sdata,nsend,iswap,buf_send,pbc,pbc_flag); +} + +int Cuda_AtomVecFullCuda_PackBorder_Self(cuda_shared_data* sdata,int n,int iswap,int first,int* pbc,int pbc_flag) +{ + const unsigned int data_mask=X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|Q_MASK|MOLECULE_MASK; + return Cuda_AtomVecCuda_PackBorder_Self(sdata,n,iswap,first,pbc,pbc_flag); +} + +int Cuda_AtomVecFullCuda_PackBorderVel_Self(cuda_shared_data* sdata,int n,int iswap,int first,int* pbc,int pbc_flag) +{ + const unsigned int data_mask=X_MASK|V_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|Q_MASK|MOLECULE_MASK; + return Cuda_AtomVecCuda_PackBorder_Self(sdata,n,iswap,first,pbc,pbc_flag); +} + +int Cuda_AtomVecFullCuda_UnpackBorder(cuda_shared_data* sdata,int n,int first,void* buf_recv) +{ + const unsigned int data_mask=X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|Q_MASK|MOLECULE_MASK; + return Cuda_AtomVecCuda_UnpackBorder(sdata,n,first,buf_recv); +} + +int Cuda_AtomVecFullCuda_UnpackBorderVel(cuda_shared_data* sdata,int n,int first,void* buf_recv) +{ + const unsigned int data_mask=X_MASK|V_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|Q_MASK|MOLECULE_MASK; + return Cuda_AtomVecCuda_UnpackBorder(sdata,n,first,buf_recv); +} diff --git a/lib/cuda/atom_vec_full_cuda_cu.h b/lib/cuda/atom_vec_full_cuda_cu.h new file mode 100644 index 0000000000..6cf163ab71 --- /dev/null +++ b/lib/cuda/atom_vec_full_cuda_cu.h @@ -0,0 +1,15 @@ +#ifndef ATOM_VEC_FULL_CUDA_CU_H_ +#define ATOM_VEC_FULL_CUDA_CU_H_ + +extern "C" void Cuda_AtomVecFullCuda_Init(cuda_shared_data* sdata); +extern "C" int Cuda_AtomVecFullCuda_PackExchangeList(cuda_shared_data* sdata,int n,int dim,void* buf_send); +extern "C" int Cuda_AtomVecFullCuda_PackExchange(cuda_shared_data* sdata,int nsend,void* buf_send,void* copylist); +extern "C" int 
Cuda_AtomVecFullCuda_UnpackExchange(cuda_shared_data* sdata,int nsend,void* buf_send,void* copylist); +extern "C" int Cuda_AtomVecFullCuda_PackBorder(cuda_shared_data* sdata,int nsend,int iswap,void* buf_send,int* pbc,int pbc_flag); +extern "C" int Cuda_AtomVecFullCuda_PackBorderVel(cuda_shared_data* sdata,int n,int iswap,void* buf_send,int* pbc,int pbc_flag); +extern "C" int Cuda_AtomVecFullCuda_PackBorder_Self(cuda_shared_data* sdata,int n,int iswap,int first,int* pbc,int pbc_flag); +extern "C" int Cuda_AtomVecFullCuda_PackBorderVel_Self(cuda_shared_data* sdata,int n,int iswap,int first,int* pbc,int pbc_flag); +extern "C" int Cuda_AtomVecFullCuda_UnpackBorder(cuda_shared_data* sdata,int n,int first,void* buf_recv); +extern "C" int Cuda_AtomVecFullCuda_UnpackBorderVel(cuda_shared_data* sdata,int n,int first,void* buf_recv); + +#endif /*ATOM_VEC_FULL2_CUDA_CU_H_*/ diff --git a/lib/cuda/binning.cu b/lib/cuda/binning.cu new file mode 100644 index 0000000000..823015ff55 --- /dev/null +++ b/lib/cuda/binning.cu @@ -0,0 +1,196 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ + +#ifdef CUDA_USE_BINNING +#include +#define MY_PREFIX binning +#include "cuda_shared.h" +#include "cuda_common.h" +#include "crm_cuda_utils.cu" +#include "binning_cu.h" +#include "binning_kernel.cu" + +void Cuda_PreBinning(cuda_shared_data* sdata) +{ + // initialize only on first call + short init = 0; + if(! init) + { + init = 1; + int cuda_dummy_type = sdata->atom.ntypes + 1; + X_FLOAT outside[3] = + { + (sdata->domain.subhi[0] - sdata->domain.sublo[0])/1000.0, + (sdata->domain.subhi[1] - sdata->domain.sublo[1])/1000.0, + (sdata->domain.subhi[2] - sdata->domain.sublo[2])/1000.0 + }; + cudaMemcpyToSymbol("binned_size_all" , & sdata->atom.binned_type.dim[0] , sizeof(unsigned) ); + cudaMemcpyToSymbol("cuda_dummy_type" , & cuda_dummy_type , sizeof(int) ); + cudaMemcpyToSymbol("outside" , & outside , sizeof(X_FLOAT)*3); + cudaMemcpyToSymbol(MY_CONST(binned_type), & sdata->atom.binned_type.dev_data, sizeof(int*) ); + cudaMemcpyToSymbol(MY_CONST(binned_x) , & sdata->atom.binned_x .dev_data, sizeof(X_FLOAT*) ); + cudaMemcpyToSymbol(MY_CONST(binned_tag) , & sdata->atom.binned_tag .dev_data, sizeof(int*) ); + cudaMemcpyToSymbol(MY_CONST(subhi) , sdata->domain.subhi , sizeof(X_FLOAT)*3); + // bin_nmax == blockDim.x + + // printf("# CUDA: MY_CONST(binned_type) = %s\n", MY_CONST(binned_type)); + // int* p = pre_binning_binned_type; // pre_binning_binned_type is defined here!! 
+ } + + dim3 grid(sdata->domain.bin_dim[0], sdata->domain.bin_dim[1] * sdata->domain.bin_dim[2], 1); + dim3 threads(sdata->domain.bin_nmax, 1, 1); + + MYDBG(printf("# CUDA: Cuda_PreBinning: pre binning grid = (%u, %u, %u)\n", grid.x, grid.y, grid.z);) + MYDBG(printf("# CUDA: Cuda_PreBinning: pre binning threads = (%u, %u, %u)\n", threads.x, threads.y, threads.z); ) + PreBinning_Kernel<<>> (); + cudaThreadSynchronize(); + MYDBG(printf("ERROR-CUDA pre_binning: %s\n",cudaGetErrorString(cudaGetLastError()))); + CUT_CHECK_ERROR("Cuda_PreBinning: binning Kernel execution failed"); +} + +void Cuda_Binning(cuda_shared_data* sdata) +{ + MYDBG( // check assumption in debug mode + if(sdata->atom.x.dim[1] != 3) + { + printf("# CUDA: Cuda_Binning: binning error: atom array dimensions not Nx3\n"); + return; + } + ) + + // initialize only on first call + short init = 0; + if(! init) + { + init = 0; + X_FLOAT const_rez_bin_size[3] = + { + (1.0 * sdata->domain.bin_dim[0]-4.0) / (sdata->domain.subhi[0] - sdata->domain.sublo[0]), + (1.0 * sdata->domain.bin_dim[1]-4.0) / (sdata->domain.subhi[1] - sdata->domain.sublo[1]), + (1.0 * sdata->domain.bin_dim[2]-4.0) / (sdata->domain.subhi[2] - sdata->domain.sublo[2]) + }; + cudaMemcpyToSymbol("bin_error_count" , & sdata->atom.bin_error_count.dev_data, sizeof(X_FLOAT)*1); + cudaMemcpyToSymbol("rez_bin_size" , & const_rez_bin_size , sizeof(X_FLOAT)*3); + cudaMemcpyToSymbol(MY_CONST(bin_count_all) , & sdata->atom.bin_count_all .dev_data, sizeof(unsigned*)); + cudaMemcpyToSymbol(MY_CONST(bin_count_local), & sdata->atom.bin_count_local.dev_data, sizeof(unsigned*)); + cudaMemcpyToSymbol(MY_CONST(bin_dim) , sdata->domain.bin_dim , sizeof(int3) ); + cudaMemcpyToSymbol(MY_CONST(bin_nmax) , & sdata->domain.bin_nmax , sizeof(unsigned) ); + cudaMemcpyToSymbol(MY_CONST(binned_f) , & sdata->atom.binned_f .dev_data, sizeof(F_FLOAT*) ); + cudaMemcpyToSymbol(MY_CONST(binned_q) , & sdata->atom.binned_q .dev_data, sizeof(F_FLOAT*) ); + 
cudaMemcpyToSymbol(MY_CONST(binned_rmass) , & sdata->atom.binned_rmass .dev_data, sizeof(V_FLOAT*) ); + cudaMemcpyToSymbol(MY_CONST(binned_tag) , & sdata->atom.binned_tag .dev_data, sizeof(int*) ); + cudaMemcpyToSymbol(MY_CONST(binned_type) , & sdata->atom.binned_type .dev_data, sizeof(int*) ); + cudaMemcpyToSymbol(MY_CONST(binned_v) , & sdata->atom.binned_v .dev_data, sizeof(V_FLOAT*) ); + cudaMemcpyToSymbol(MY_CONST(binpos) , & sdata->atom.binpos .dev_data, sizeof(int*)); + cudaMemcpyToSymbol(MY_CONST(f) , & sdata->atom.f .dev_data, sizeof(F_FLOAT*) ); + cudaMemcpyToSymbol(MY_CONST(natoms) , & sdata->atom.nall , sizeof(unsigned) ); + cudaMemcpyToSymbol(MY_CONST(nghost) , & sdata->atom.nghost , sizeof(unsigned) ); + cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(unsigned) ); + cudaMemcpyToSymbol(MY_CONST(nmax) , & sdata->atom.nmax , sizeof(unsigned) ); + cudaMemcpyToSymbol(MY_CONST(q) , & sdata->atom.q .dev_data, sizeof(F_FLOAT*) ); + cudaMemcpyToSymbol(MY_CONST(rmass) , & sdata->atom.rmass .dev_data, sizeof(V_FLOAT*) ); + cudaMemcpyToSymbol(MY_CONST(sublo) , sdata->domain.sublo , sizeof(X_FLOAT)*3); + cudaMemcpyToSymbol(MY_CONST(tag) , & sdata->atom.tag .dev_data, sizeof(int*) ); + cudaMemcpyToSymbol(MY_CONST(type) , & sdata->atom.type .dev_data, sizeof(int*) ); + cudaMemcpyToSymbol(MY_CONST(v) , & sdata->atom.v .dev_data, sizeof(V_FLOAT*) ); + } + + dim3 grid((unsigned)(1 + sdata->atom.nlocal/64.0), 1, 1); + MYDBG( printf("# CUDA: Cuda_Binning: grid dim.x = %u (nlocal: %i)\n", grid.x,sdata->atom.nlocal); ) + dim3 threads(64, 1, 1); + + cudaMemset((int*) (sdata->atom.bin_count_all.dev_data),0,sizeof(int)*(sdata->domain.bin_dim[0])*(sdata->domain.bin_dim[1])*(sdata->domain.bin_dim[2])); + cudaMemset((int*) (sdata->atom.bin_count_local.dev_data),0,sizeof(int)*(sdata->domain.bin_dim[0])*(sdata->domain.bin_dim[1])*(sdata->domain.bin_dim[2])); + cudaMemset(sdata->atom.bin_error_count.dev_data,0,sizeof(int)*1); + int binning_error_l[1]; + + + 
Binning_Kernel<<>> ( + (X_FLOAT*) (sdata->atom. x.dev_data), + (X_FLOAT*) (sdata->atom.binned_x.dev_data), + sdata->atom.q_flag, + 0, + sdata->atom.rmass_flag + ); + cudaThreadSynchronize(); + cudaMemcpy((void*) binning_error_l,(void*) sdata->atom.bin_error_count.dev_data,1*sizeof(int),cudaMemcpyDeviceToHost); + if(binning_error_l[0]!=0) + { + printf("CUDA-ERROR: binning local: could not bin %i atoms\n",binning_error_l[0]); + } + CUT_CHECK_ERROR("Cuda_Binning: binning Kernel execution failed"); + + grid.x=(unsigned)(1 + (sdata->atom.nall-sdata->atom.nlocal)/32.0); + MYDBG( printf("# CUDA: Cuda_Binning Ghost: grid dim.x = %u\n", grid.x); ) + + + Binning_Kernel<<>> ( + (X_FLOAT*) (sdata->atom. x.dev_data), + (X_FLOAT*) (sdata->atom.binned_x.dev_data), + sdata->atom.q_flag, + sdata->atom.nlocal, + sdata->atom.rmass_flag + ); + cudaThreadSynchronize(); + cudaMemcpy((void*) binning_error_l,(void*) sdata->atom.bin_error_count.dev_data,1*sizeof(int),cudaMemcpyDeviceToHost); + if(binning_error_l[0]!=0) printf("CUDA-ERROR: binning ghost: could not bin %i atoms\n",binning_error_l[0]); +} + +void Cuda_ReverseBinning(cuda_shared_data* sdata) +{ + // initialize only on first call + short init = 0; + if(! 
init) + { + init = 0; + cudaMemcpyToSymbol(MY_CONST(bin_count_all) , & sdata->atom.bin_count_all .dev_data, sizeof(unsigned*)); + cudaMemcpyToSymbol(MY_CONST(bin_count_local), & sdata->atom.bin_count_local.dev_data, sizeof(unsigned*)); + cudaMemcpyToSymbol(MY_CONST(bin_dim) , sdata->domain.bin_dim , sizeof(int3) ); + cudaMemcpyToSymbol(MY_CONST(binned_f) , & sdata->atom.binned_f .dev_data, sizeof(F_FLOAT*) ); + cudaMemcpyToSymbol(MY_CONST(binned_q) , & sdata->atom.binned_q .dev_data, sizeof(F_FLOAT*) ); + cudaMemcpyToSymbol(MY_CONST(binned_tag) , & sdata->atom.binned_tag .dev_data, sizeof(int*) ); + cudaMemcpyToSymbol(MY_CONST(binned_type) , & sdata->atom.binned_type .dev_data, sizeof(int*) ); + cudaMemcpyToSymbol(MY_CONST(binned_v) , & sdata->atom.binned_v .dev_data, sizeof(V_FLOAT*) ); + cudaMemcpyToSymbol(MY_CONST(f) , & sdata->atom.f .dev_data, sizeof(F_FLOAT*) ); + cudaMemcpyToSymbol(MY_CONST(natoms) , & sdata->atom.nall , sizeof(unsigned) ); + cudaMemcpyToSymbol(MY_CONST(nmax) , & sdata->atom.nmax , sizeof(unsigned) ); + cudaMemcpyToSymbol(MY_CONST(q) , & sdata->atom.q .dev_data, sizeof(F_FLOAT*) ); + cudaMemcpyToSymbol(MY_CONST(tag) , & sdata->atom.tag .dev_data, sizeof(int*) ); + cudaMemcpyToSymbol(MY_CONST(type) , & sdata->atom.type .dev_data, sizeof(int*) ); + cudaMemcpyToSymbol(MY_CONST(v) , & sdata->atom.v .dev_data, sizeof(V_FLOAT*) ); + } + + dim3 grid((unsigned)(1 + sdata->atom.nlocal/32.0), 1, 1); + MYDBG( printf("# CUDA: Cuda_ReverseBinning: grid dim.x = %u (nlocal: %i)\n", grid.x,sdata->atom.nlocal); ) + dim3 threads(32, 1, 1); + + ReverseBinning_Kernel<<>> ( + (X_FLOAT*) (sdata->atom. 
x.dev_data), + (X_FLOAT*) (sdata->atom.binned_x.dev_data), + sdata->atom.q_flag + ); + cudaThreadSynchronize(); + CUT_CHECK_ERROR("Cuda_Binning: reverse binning Kernel execution failed"); +} + +#endif diff --git a/lib/cuda/binning_cu.h b/lib/cuda/binning_cu.h new file mode 100644 index 0000000000..4f932c392f --- /dev/null +++ b/lib/cuda/binning_cu.h @@ -0,0 +1,28 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#include "cuda_shared.h" + +extern "C" void Cuda_PreBinning(cuda_shared_data* sdata); +extern "C" void Cuda_Binning(cuda_shared_data* sdata); +extern "C" void Cuda_ReverseBinning(cuda_shared_data* sdata); diff --git a/lib/cuda/binning_kernel.cu b/lib/cuda/binning_kernel.cu new file mode 100644 index 0000000000..f5677d475f --- /dev/null +++ b/lib/cuda/binning_kernel.cu @@ -0,0 +1,149 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. 
+ + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +// load some variables from shared cuda data into device's constant memory: +__device__ __constant__ X_FLOAT rez_bin_size[3]; +__device__ __constant__ unsigned* bin_error_count; + +__device__ __constant__ int cuda_dummy_type; +__device__ __constant__ unsigned binned_size_all; +__device__ __constant__ X_FLOAT outside[3]; + +__global__ void PreBinning_Kernel() +{ + const unsigned bin = gridDim.y * blockIdx.x + blockIdx.y; + + if(bin < gridDim.x * gridDim.y) // TODO: suspected always to be true + { + _binned_type[blockDim.x * bin + threadIdx.x] = cuda_dummy_type; + + const int i = 3*blockDim.x * bin + threadIdx.x; + X_FLOAT* binned_x = _binned_x + i; *binned_x = _subhi[0] + outside[0] * (1+i); + binned_x += blockDim.x; *binned_x = _subhi[1] + outside[1] * (1+i); + binned_x += blockDim.x; *binned_x = _subhi[2] + outside[2] * (1+i); + _binned_tag[i]=-1; + } +} + +__global__ void Binning_Kernel(X_FLOAT* x, X_FLOAT* binned_x, int q_flag, int offset, int rmass_flag) +{ + const unsigned i = blockDim.x * blockIdx.x + threadIdx.x+offset; + + int binatoms=_natoms; + if(offset==0) binatoms=_nlocal ; + + if(i < binatoms) + { + // copy atom position from global device memory to local register + // in this 3 steps to get as much coalesced access as possible + X_FLOAT my_xX, my_xY, my_xZ; + x += i; my_xX = *x; + x += _nmax; my_xY = *x; + x += _nmax; my_xZ = *x; + //my_xX=x[i]; + //my_xY=x[i+_nmax]; + //my_xZ=x[i+2*_nmax]; + + + // calculate flat 
bin index + int bx=__float2int_rd(rez_bin_size[0] * (my_xX - _sublo[0]))+2; + int by=__float2int_rd(rez_bin_size[1] * (my_xY - _sublo[1]))+2; + int bz=__float2int_rd(rez_bin_size[2] * (my_xZ - _sublo[2]))+2; + + bx-=bx*negativCUDA(1.0f*bx); + bx-=(bx-_bin_dim.x+1)*negativCUDA(1.0f*_bin_dim.x-1.0f-1.0f*bx); + by-=by*negativCUDA(1.0f*by); + by-=(by-_bin_dim.y+1)*negativCUDA(1.0f*_bin_dim.y-1.0f-1.0f*by); + bz-=bz*negativCUDA(1.0f*bz); + bz-=(bz-_bin_dim.z+1)*negativCUDA(1.0f*_bin_dim.z-1.0f-1.0f*bz); + + + const unsigned j = _bin_dim.z * ( _bin_dim.y *bx+by)+bz; + + // add new atom to bin, get bin-array position + const unsigned k = atomicAdd(& _bin_count_all[j], 1); + if(offset==0) atomicAdd(& _bin_count_local[j], 1); + if(k < _bin_nmax) + { + // copy register values back to global device memory + unsigned pos = 3*_bin_nmax * j + k; + _binpos[i]=pos; + binned_x += pos; *binned_x = my_xX; + binned_x += _bin_nmax; *binned_x = my_xY; + binned_x += _bin_nmax; *binned_x = my_xZ; + + // also copy velocity and force accordingly + + binned_x = _binned_v + pos; x = _v + i; *binned_x = *x; + binned_x += _bin_nmax; x += _nmax; *binned_x = *x; + binned_x += _bin_nmax; x += _nmax; *binned_x = *x; + + binned_x = _binned_f + pos; x = _f + i; *binned_x = *x; + binned_x += _bin_nmax; x += _nmax; *binned_x = *x; + binned_x += _bin_nmax; x += _nmax; *binned_x = *x; + + pos = _bin_nmax * j + k; + _binned_type [pos] = _type[i]; + _binned_tag [pos] = _tag[i]; + if(rmass_flag) + _binned_rmass[pos] = _rmass[i]; + if(q_flag) + _binned_q [pos] = _q[i]; + } + else + { // normally, this should not happen: + int errorn=atomicAdd(bin_error_count, 1); + MYEMUDBG( printf("# CUDA: Binning_Kernel: WARNING: atom %i ignored, no place left in bin %u\n", i, j); ) + } + } +} + +__global__ void ReverseBinning_Kernel(X_FLOAT* x, X_FLOAT* binned_x,int q_flag) +{ + const unsigned i = blockDim.x * blockIdx.x + threadIdx.x; + + if(i < _nlocal) + { + unsigned bin_pos3 = _binpos[i]; + unsigned 
bin_pos=bin_pos3/(3*_bin_nmax); + bin_pos*=_bin_nmax; + bin_pos+=bin_pos3-bin_pos*3; + + binned_x = _binned_x + bin_pos3; x = x + i; *x = *binned_x; + binned_x += _bin_nmax; x += _nmax; *x = *binned_x; + binned_x += _bin_nmax; x += _nmax; *x = *binned_x; + + binned_x = _binned_v + bin_pos3; x = _v + i; *x = *binned_x; + binned_x += _bin_nmax; x += _nmax; *x = *binned_x; + binned_x += _bin_nmax; x += _nmax; *x = *binned_x; + + binned_x = _binned_f + bin_pos3; x = _f + i; *x = *binned_x; + binned_x += _bin_nmax; x += _nmax; *x = *binned_x; + binned_x += _bin_nmax; x += _nmax; *x = *binned_x; + + + _type[i] = _binned_type[bin_pos]; + _tag[i] = _binned_tag[bin_pos]; + if(q_flag) _q[i] = _binned_q[bin_pos]; + } +} diff --git a/lib/cuda/comm_cuda.cu b/lib/cuda/comm_cuda.cu new file mode 100644 index 0000000000..0233f3ee13 --- /dev/null +++ b/lib/cuda/comm_cuda.cu @@ -0,0 +1,483 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ + +#include +#define MY_PREFIX comm_cuda +#include "cuda_shared.h" +#include "cuda_common.h" + +#include "crm_cuda_utils.cu" + +#include "comm_cuda_cu.h" +#include "comm_cuda_kernel.cu" +#include + +void Cuda_CommCuda_UpdateBuffer(cuda_shared_data* sdata,int n) +{ + int size=n*3*sizeof(X_FLOAT); + if(sdata->buffersizebuffer,sdata->buffersize);) + CudaWrapper_FreeCudaData(sdata->buffer,sdata->buffersize); + sdata->buffer = CudaWrapper_AllocCudaData(size); + sdata->buffersize=size; + sdata->buffer_new++; + MYDBG(printf("New buffer at %p with %i kB\n",sdata->buffer,sdata->buffersize);) + } + cudaMemcpyToSymbol(MY_CONST(buffer), & sdata->buffer, sizeof(int*) ); +} + + +void Cuda_CommCuda_UpdateNmax(cuda_shared_data* sdata) +{ + cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); + cudaMemcpyToSymbol(MY_CONST(nmax) , & sdata->atom.nmax , sizeof(int) ); + cudaMemcpyToSymbol(MY_CONST(x) , & sdata->atom.x .dev_data, sizeof(X_FLOAT*) ); + cudaMemcpyToSymbol(MY_CONST(v) , & sdata->atom.v .dev_data, sizeof(X_FLOAT*) ); + cudaMemcpyToSymbol(MY_CONST(f) , & sdata->atom.f .dev_data, sizeof(F_FLOAT*) ); + cudaMemcpyToSymbol(MY_CONST(type) , & sdata->atom.type .dev_data, sizeof(int*) ); +} + + +void Cuda_CommCuda_Init(cuda_shared_data* sdata) +{ + Cuda_CommCuda_UpdateNmax(sdata); + int ntypesp=sdata->atom.ntypes+1; + cudaMemcpyToSymbol(MY_CONST(cuda_ntypes) , &ntypesp, sizeof(int)); + cudaMemcpyToSymbol(MY_CONST(prd) , sdata->domain.prd, 3*sizeof(X_FLOAT)); + cudaMemcpyToSymbol(MY_CONST(flag) , &sdata->flag, sizeof(int*)); + cudaMemcpyToSymbol(MY_CONST(debugdata) , &sdata->debugdata, sizeof(int*)); +} + +int Cuda_CommCuda_PackComm(cuda_shared_data* sdata,int n,int iswap,void* buf_send,int* pbc,int pbc_flag) +{ + + timespec time1,time2; + if(sdata->atom.update_nmax) + Cuda_CommCuda_UpdateNmax(sdata); + if(sdata->atom.update_nlocal) + cudaMemcpyToSymbol(MY_CONST(nlocal) , & 
sdata->atom.nlocal , sizeof(int) ); + int size=n*3*sizeof(X_FLOAT); + if(sdata->buffer_new or (size>sdata->buffersize)) + Cuda_CommCuda_UpdateBuffer(sdata,n); + + X_FLOAT dx=0.0; + X_FLOAT dy=0.0; + X_FLOAT dz=0.0; + if (pbc_flag != 0) { + if (sdata->domain.triclinic == 0) { + dx = pbc[0]*sdata->domain.prd[0]; + dy = pbc[1]*sdata->domain.prd[1]; + dz = pbc[2]*sdata->domain.prd[2]; + } else { + dx = pbc[0]*sdata->domain.prd[0] + pbc[5]*sdata->domain.xy + pbc[4]*sdata->domain.xz; + dy = pbc[1]*sdata->domain.prd[1] + pbc[3]*sdata->domain.yz; + dz = pbc[2]*sdata->domain.prd[2]; + }} + + int3 layout=getgrid(n); + dim3 threads(layout.z, 1, 1); + dim3 grid(layout.x, layout.y, 1); + + if(sdata->atom.nlocal>0) + { + cudaMemset( sdata->flag,0,sizeof(int)); + +clock_gettime(CLOCK_REALTIME,&time1); + + void* buf=sdata->overlap_comm?sdata->comm.buf_send_dev[iswap]:sdata->buffer; + Cuda_CommCuda_PackComm_Kernel<<>>((int*) sdata->comm.sendlist.dev_data,n + ,sdata->comm.maxlistlength,iswap,dx,dy,dz,buf); + cudaThreadSynchronize(); + +clock_gettime(CLOCK_REALTIME,&time2); +sdata->cuda_timings.comm_forward_kernel_pack+= + time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000; + + CUT_CHECK_ERROR("Cuda_CommCuda_PackComm: Kernel execution failed"); + if(not sdata->overlap_comm) + cudaMemcpy(buf_send, sdata->buffer, n*3*sizeof(X_FLOAT), cudaMemcpyDeviceToHost); + //cudaMemcpy(buf_send, sdata->comm.buf_send_dev[iswap], n*3*sizeof(X_FLOAT), cudaMemcpyDeviceToHost); + +clock_gettime(CLOCK_REALTIME,&time1); +sdata->cuda_timings.comm_forward_download+= + time1.tv_sec-time2.tv_sec+1.0*(time1.tv_nsec-time2.tv_nsec)/1000000000; + + int aflag; + cudaMemcpy(&aflag, sdata->flag, sizeof(int), cudaMemcpyDeviceToHost); + if(aflag!=0) printf("aflag PackComm: %i\n",aflag); + CUT_CHECK_ERROR("Cuda_CommCuda_PackComm: Kernel execution failed"); + + } + return 3*n; +} + +int Cuda_CommCuda_PackCommVel(cuda_shared_data* sdata,int n,int iswap,void* buf_send,int* pbc,int pbc_flag) +{ + + 
timespec time1,time2; + if(sdata->atom.update_nmax) + Cuda_CommCuda_UpdateNmax(sdata); + if(sdata->atom.update_nlocal) + cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); + int size=n*6*sizeof(X_FLOAT); + if(sdata->buffer_new or (size>sdata->buffersize)) + Cuda_CommCuda_UpdateBuffer(sdata,n); + + X_FLOAT dx=0.0; + X_FLOAT dy=0.0; + X_FLOAT dz=0.0; + if (pbc_flag != 0) { + if (sdata->domain.triclinic == 0) { + dx = pbc[0]*sdata->domain.prd[0]; + dy = pbc[1]*sdata->domain.prd[1]; + dz = pbc[2]*sdata->domain.prd[2]; + } else { + dx = pbc[0]*sdata->domain.prd[0] + pbc[5]*sdata->domain.xy + pbc[4]*sdata->domain.xz; + dy = pbc[1]*sdata->domain.prd[1] + pbc[3]*sdata->domain.yz; + dz = pbc[2]*sdata->domain.prd[2]; + }} + + int3 layout=getgrid(n); + dim3 threads(layout.z, 1, 1); + dim3 grid(layout.x, layout.y, 1); + + if(sdata->atom.nlocal>0) + { + cudaMemset( sdata->flag,0,sizeof(int)); + +clock_gettime(CLOCK_REALTIME,&time1); + + void* buf=sdata->overlap_comm?sdata->comm.buf_send_dev[iswap]:sdata->buffer; + Cuda_CommCuda_PackComm_Kernel<<>>((int*) sdata->comm.sendlist.dev_data,n + ,sdata->comm.maxlistlength,iswap,dx,dy,dz,buf); + cudaThreadSynchronize(); + +clock_gettime(CLOCK_REALTIME,&time2); +sdata->cuda_timings.comm_forward_kernel_pack+= + time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000; + + CUT_CHECK_ERROR("Cuda_CommCuda_PackComm: Kernel execution failed"); + if(not sdata->overlap_comm) + cudaMemcpy(buf_send, sdata->buffer, n*6*sizeof(X_FLOAT), cudaMemcpyDeviceToHost); + //cudaMemcpy(buf_send, sdata->comm.buf_send_dev[iswap], n*3*sizeof(X_FLOAT), cudaMemcpyDeviceToHost); + +clock_gettime(CLOCK_REALTIME,&time1); +sdata->cuda_timings.comm_forward_download+= + time1.tv_sec-time2.tv_sec+1.0*(time1.tv_nsec-time2.tv_nsec)/1000000000; + + int aflag; + cudaMemcpy(&aflag, sdata->flag, sizeof(int), cudaMemcpyDeviceToHost); + if(aflag!=0) printf("aflag PackComm: %i\n",aflag); + CUT_CHECK_ERROR("Cuda_CommCuda_PackComm: Kernel 
execution failed"); + + } + return 6*n; +} + +int Cuda_CommCuda_PackComm_Self(cuda_shared_data* sdata,int n,int iswap,int first,int* pbc,int pbc_flag) +{ + MYDBG(printf(" # CUDA: CommCuda_PackComm_Self\n");) + timespec time1,time2; + if(sdata->atom.update_nmax) + Cuda_CommCuda_UpdateNmax(sdata); + if(sdata->atom.update_nlocal) + cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); + int size=n*3*sizeof(X_FLOAT); + if(sdata->buffer_new or (size>sdata->buffersize)) + Cuda_CommCuda_UpdateBuffer(sdata,n); + static int count=-1; + count++; + X_FLOAT dx=0.0; + X_FLOAT dy=0.0; + X_FLOAT dz=0.0; + if (pbc_flag != 0) { + if (sdata->domain.triclinic == 0) { + dx = pbc[0]*sdata->domain.prd[0]; + dy = pbc[1]*sdata->domain.prd[1]; + dz = pbc[2]*sdata->domain.prd[2]; + } else { + dx = pbc[0]*sdata->domain.prd[0] + pbc[5]*sdata->domain.xy + pbc[4]*sdata->domain.xz; + dy = pbc[1]*sdata->domain.prd[1] + pbc[3]*sdata->domain.yz; + dz = pbc[2]*sdata->domain.prd[2]; + }} + + + + int3 layout=getgrid(n); + dim3 threads(layout.z, 1, 1); + dim3 grid(layout.x, layout.y, 1); + if(sdata->atom.nlocal>0) + { + +clock_gettime(CLOCK_REALTIME,&time1); + + Cuda_CommCuda_PackComm_Self_Kernel<<>>((int*) sdata->comm.sendlist.dev_data,n,sdata->comm.maxlistlength,iswap,dx,dy,dz,first); + cudaThreadSynchronize(); + +clock_gettime(CLOCK_REALTIME,&time2); +sdata->cuda_timings.comm_forward_kernel_self+= + time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000; + + CUT_CHECK_ERROR("Cuda_CommCuda_PackComm_Self: Kernel execution failed"); + } + + return 3*n; +} + +int Cuda_CommCuda_PackCommVel_Self(cuda_shared_data* sdata,int n,int iswap,int first,int* pbc,int pbc_flag) +{ + MYDBG(printf(" # CUDA: CommCuda_PackComm_Self\n");) + timespec time1,time2; + if(sdata->atom.update_nmax) + Cuda_CommCuda_UpdateNmax(sdata); + if(sdata->atom.update_nlocal) + cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); + int size=n*6*sizeof(X_FLOAT); + 
if(sdata->buffer_new or (size>sdata->buffersize)) + Cuda_CommCuda_UpdateBuffer(sdata,n); + static int count=-1; + count++; + X_FLOAT dx=0.0; + X_FLOAT dy=0.0; + X_FLOAT dz=0.0; + if (pbc_flag != 0) { + if (sdata->domain.triclinic == 0) { + dx = pbc[0]*sdata->domain.prd[0]; + dy = pbc[1]*sdata->domain.prd[1]; + dz = pbc[2]*sdata->domain.prd[2]; + } else { + dx = pbc[0]*sdata->domain.prd[0] + pbc[5]*sdata->domain.xy + pbc[4]*sdata->domain.xz; + dy = pbc[1]*sdata->domain.prd[1] + pbc[3]*sdata->domain.yz; + dz = pbc[2]*sdata->domain.prd[2]; + }} + + + + int3 layout=getgrid(n); + dim3 threads(layout.z, 1, 1); + dim3 grid(layout.x, layout.y, 1); + if(sdata->atom.nlocal>0) + { + +clock_gettime(CLOCK_REALTIME,&time1); + + Cuda_CommCuda_PackComm_Self_Kernel<<>>((int*) sdata->comm.sendlist.dev_data,n,sdata->comm.maxlistlength,iswap,dx,dy,dz,first); + cudaThreadSynchronize(); + +clock_gettime(CLOCK_REALTIME,&time2); +sdata->cuda_timings.comm_forward_kernel_self+= + time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000; + + CUT_CHECK_ERROR("Cuda_CommCuda_PackComm_Self: Kernel execution failed"); + } + + return 6*n; +} + +void Cuda_CommCuda_UnpackComm(cuda_shared_data* sdata,int n,int first,void* buf_recv,int iswap) +{ + timespec time1,time2; + + if(sdata->atom.update_nmax) + Cuda_CommCuda_UpdateNmax(sdata); + if(sdata->atom.update_nlocal) + cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); + int size=n*3*sizeof(X_FLOAT); + if(sdata->buffer_new or (size>sdata->buffersize)) + Cuda_CommCuda_UpdateBuffer(sdata,n); + + int3 layout=getgrid(n); + dim3 threads(layout.z, 1, 1); + dim3 grid(layout.x, layout.y, 1); + if(sdata->atom.nlocal>0) + { +clock_gettime(CLOCK_REALTIME,&time1); + if(not sdata->overlap_comm||iswap<0) + cudaMemcpy(sdata->buffer,(void*)buf_recv, n*3*sizeof(X_FLOAT), cudaMemcpyHostToDevice); + +clock_gettime(CLOCK_REALTIME,&time2); +sdata->cuda_timings.comm_forward_upload+= + 
time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000; + void* buf=(sdata->overlap_comm&&iswap>=0)?sdata->comm.buf_recv_dev[iswap]:sdata->buffer; + Cuda_CommCuda_UnpackComm_Kernel<<>>(n,first,buf); + cudaThreadSynchronize(); + +clock_gettime(CLOCK_REALTIME,&time1); +sdata->cuda_timings.comm_forward_kernel_unpack+= + time1.tv_sec-time2.tv_sec+1.0*(time1.tv_nsec-time2.tv_nsec)/1000000000; + + CUT_CHECK_ERROR("Cuda_CommCuda_UnpackComm: Kernel execution failed"); + + } +} + +void Cuda_CommCuda_UnpackCommVel(cuda_shared_data* sdata,int n,int first,void* buf_recv,int iswap) +{ + timespec time1,time2; + + if(sdata->atom.update_nmax) + Cuda_CommCuda_UpdateNmax(sdata); + if(sdata->atom.update_nlocal) + cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); + int size=n*6*sizeof(X_FLOAT); + if(sdata->buffer_new or (size>sdata->buffersize)) + Cuda_CommCuda_UpdateBuffer(sdata,n); + + int3 layout=getgrid(n); + dim3 threads(layout.z, 1, 1); + dim3 grid(layout.x, layout.y, 1); + if(sdata->atom.nlocal>0) + { +clock_gettime(CLOCK_REALTIME,&time1); + + if(not sdata->overlap_comm||iswap<0) + cudaMemcpy(sdata->buffer,(void*)buf_recv, n*6*sizeof(X_FLOAT), cudaMemcpyHostToDevice); + +clock_gettime(CLOCK_REALTIME,&time2); +sdata->cuda_timings.comm_forward_upload+= + time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000; + void* buf=(sdata->overlap_comm&&iswap>=0)?sdata->comm.buf_recv_dev[iswap]:sdata->buffer; + Cuda_CommCuda_UnpackComm_Kernel<<>>(n,first,buf); + cudaThreadSynchronize(); + +clock_gettime(CLOCK_REALTIME,&time1); +sdata->cuda_timings.comm_forward_kernel_unpack+= + time1.tv_sec-time2.tv_sec+1.0*(time1.tv_nsec-time2.tv_nsec)/1000000000; + + CUT_CHECK_ERROR("Cuda_CommCuda_UnpackComm: Kernel execution failed"); + + } +} + +int Cuda_CommCuda_PackReverse(cuda_shared_data* sdata,int n,int first,void* buf_send) +{ + if(sdata->atom.update_nmax) + Cuda_CommCuda_UpdateNmax(sdata); + if(sdata->atom.update_nlocal) + 
cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); + int size=n*3*sizeof(F_FLOAT); + if(sdata->buffer_new or (size>sdata->buffersize)) + Cuda_CommCuda_UpdateBuffer(sdata,n); + + + F_FLOAT* buf=(F_FLOAT*)buf_send; + F_FLOAT* f_dev=(F_FLOAT*)sdata->atom.f.dev_data; + f_dev+=first; + cudaMemcpy(buf, f_dev, n*sizeof(F_FLOAT), cudaMemcpyDeviceToHost); + buf+=n; f_dev+=sdata->atom.nmax; + cudaMemcpy(buf, f_dev, n*sizeof(F_FLOAT), cudaMemcpyDeviceToHost); + buf+=n; f_dev+=sdata->atom.nmax; + cudaMemcpy(buf, f_dev, n*sizeof(F_FLOAT), cudaMemcpyDeviceToHost); + return n*3; +} + + +void Cuda_CommCuda_UnpackReverse(cuda_shared_data* sdata,int n,int iswap,void* buf_recv) +{ + if(sdata->atom.update_nmax) + Cuda_CommCuda_UpdateNmax(sdata); + if(sdata->atom.update_nlocal) + cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); + int size=n*3*sizeof(F_FLOAT); + if(sdata->buffer_new or (size>sdata->buffersize)) + Cuda_CommCuda_UpdateBuffer(sdata,n); + + + int3 layout=getgrid(n); + dim3 threads(layout.z, 1, 1); + dim3 grid(layout.x, layout.y, 1); + if(sdata->atom.nlocal>0) + { + cudaMemcpy(sdata->buffer,buf_recv, size, cudaMemcpyHostToDevice); + Cuda_CommCuda_UnpackReverse_Kernel<<>>((int*) sdata->comm.sendlist.dev_data,n,sdata->comm.maxlistlength,iswap); + cudaThreadSynchronize(); + CUT_CHECK_ERROR("Cuda_CommCuda_UnpackReverse: Kernel execution failed"); + } +} + +void Cuda_CommCuda_UnpackReverse_Self(cuda_shared_data* sdata,int n,int iswap,int first) +{ + if(sdata->atom.update_nmax) + Cuda_CommCuda_UpdateNmax(sdata); + if(sdata->atom.update_nlocal) + cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); + int size=n*3*sizeof(X_FLOAT); + if(sdata->buffer_new or (size>sdata->buffersize)) + Cuda_CommCuda_UpdateBuffer(sdata,n); + + int3 layout=getgrid(n); + dim3 threads(layout.z, 1, 1); + dim3 grid(layout.x, layout.y, 1); + + if(sdata->atom.nlocal>0) + { + Cuda_CommCuda_UnpackReverse_Self_Kernel<<>>((int*) 
sdata->comm.sendlist.dev_data,n,sdata->comm.maxlistlength,iswap,first); + cudaThreadSynchronize(); + CUT_CHECK_ERROR("Cuda_CommCuda_PackReverse_Self: Kernel execution failed"); + + } +} + + +int Cuda_CommCuda_BuildSendlist(cuda_shared_data* sdata,int bordergroup,int ineed,int style,int atom_nfirst,int nfirst,int nlast,int dim,int iswap) +{ + MYDBG(printf(" # CUDA: CommCuda_BuildSendlist\n");) + timespec time1,time2; + Cuda_CommCuda_UpdateNmax(sdata); + cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); + if(sdata->buffer_new or (80>sdata->buffersize)) + Cuda_CommCuda_UpdateBuffer(sdata,10); + int n; + if (!bordergroup || ineed >= 2) + n=nlast-nfirst+1; + else + { + n=atom_nfirst; + if(nlast-sdata->atom.nlocal+1>n) n=nlast-sdata->atom.nlocal+1; + } + int3 layout=getgrid(n,0,512,true); + dim3 threads(layout.z, 1, 1); + dim3 grid(layout.x+1, layout.y, 1); + + + cudaMemset((int*) (sdata->buffer),0,sizeof(int)); + +clock_gettime(CLOCK_REALTIME,&time1); + if(style==1) + Cuda_CommCuda_BuildSendlist_Single<<>>(bordergroup, ineed, atom_nfirst, nfirst, nlast, dim, iswap,(X_FLOAT*) sdata->comm.slablo.dev_data,(X_FLOAT*) sdata->comm.slabhi.dev_data,(int*) sdata->comm.sendlist.dev_data,sdata->comm.maxlistlength); + else + Cuda_CommCuda_BuildSendlist_Multi<<>>(bordergroup, ineed, atom_nfirst, nfirst, nlast, dim, iswap,(X_FLOAT*) sdata->comm.multilo.dev_data,(X_FLOAT*) sdata->comm.multihi.dev_data,(int*) sdata->comm.sendlist.dev_data,sdata->comm.maxlistlength); + cudaThreadSynchronize(); +clock_gettime(CLOCK_REALTIME,&time2); +sdata->cuda_timings.comm_border_kernel_buildlist+= + time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000; + + CUT_CHECK_ERROR("Cuda_CommCuda_BuildSendlist: Kernel execution failed"); + int nsend; + cudaMemcpy(&nsend, sdata->buffer, sizeof(int), cudaMemcpyDeviceToHost); + return nsend; + + +} + diff --git a/lib/cuda/comm_cuda_cu.h b/lib/cuda/comm_cuda_cu.h new file mode 100644 index 0000000000..b5b2d192ba --- 
/dev/null +++ b/lib/cuda/comm_cuda_cu.h @@ -0,0 +1,35 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#include "cuda_shared.h" + +extern "C" int Cuda_CommCuda_PackComm(cuda_shared_data* sdata,int n,int iswap,void* buf_send,int* pbc,int pbcflag); +extern "C" int Cuda_CommCuda_PackCommVel(cuda_shared_data* sdata,int n,int iswap,void* buf_send,int* pbc,int pbcflag); +extern "C" int Cuda_CommCuda_PackComm_Self(cuda_shared_data* sdata,int n,int iswap,int first,int* pbc,int pbcflag); +extern "C" int Cuda_CommCuda_PackCommVel_Self(cuda_shared_data* sdata,int n,int iswap,int first,int* pbc,int pbcflag); +extern "C" void Cuda_CommCuda_UnpackComm(cuda_shared_data* sdata,int n,int first,void* buf_recv,int iswap=-1); +extern "C" void Cuda_CommCuda_UnpackCommVel(cuda_shared_data* sdata,int n,int first,void* buf_recv,int iswap=-1); +extern "C" int Cuda_CommCuda_PackReverse(cuda_shared_data* sdata,int n,int first,void* buf_send); +extern "C" void Cuda_CommCuda_UnpackReverse(cuda_shared_data* sdata,int n,int iswap,void* buf_recv); +extern "C" void Cuda_CommCuda_UnpackReverse_Self(cuda_shared_data* sdata,int n,int iswap,int first); +extern "C" int 
Cuda_CommCuda_BuildSendlist(cuda_shared_data* sdata,int bordergroup,int ineed,int style,int atom_nfirst,int nfirst,int nlast,int dim,int iswap); diff --git a/lib/cuda/comm_cuda_kernel.cu b/lib/cuda/comm_cuda_kernel.cu new file mode 100644 index 0000000000..c171a721a4 --- /dev/null +++ b/lib/cuda/comm_cuda_kernel.cu @@ -0,0 +1,353 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ + +__global__ void Cuda_CommCuda_PackComm_Kernel(int* sendlist,int n,int maxlistlength,int iswap,X_FLOAT dx,X_FLOAT dy,X_FLOAT dz,void* buffer) +{ + int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x; + int* list=sendlist+iswap*maxlistlength; + if(i_nmax) _flag[0]=1; + ((X_FLOAT*) buffer)[i]=_x[j] + dx; + ((X_FLOAT*) buffer)[i+1*n] = _x[j+_nmax] + dy; + ((X_FLOAT*) buffer)[i+2*n] = _x[j+2*_nmax] + dz; + } +} + +__global__ void Cuda_CommCuda_PackCommVel_Kernel(int* sendlist,int n,int maxlistlength,int iswap,X_FLOAT dx,X_FLOAT dy,X_FLOAT dz,void* buffer) +{ + int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x; + int* list=sendlist+iswap*maxlistlength; + if(i_nmax) _flag[0]=1; + ((X_FLOAT*) buffer)[i]=_x[j] + dx; + ((X_FLOAT*) buffer)[i+1*n] = _x[j+_nmax] + dy; + ((X_FLOAT*) buffer)[i+2*n] = _x[j+2*_nmax] + dz; + ((X_FLOAT*) buffer)[i+3*n]=_v[j]; + ((X_FLOAT*) buffer)[i+4*n] = _v[j+_nmax]; + ((X_FLOAT*) buffer)[i+5*n] = _v[j+2*_nmax]; + } +} + +__global__ void Cuda_CommCuda_PackComm_Self_Kernel(int* sendlist,int n,int maxlistlength,int iswap,X_FLOAT dx,X_FLOAT dy,X_FLOAT dz,int first) +{ + int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x; + + int* list=sendlist+iswap*maxlistlength; + if(i= 2) { + int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x+nfirst; + if(i= lo && _x[i+dim*_nmax] <= hi) { + add=true; + } + shared[threadIdx.x]=add?1:0; + + __syncthreads(); + + int nsend=0; + if(threadIdx.x==0) + { + for(int k=0;k= lo && _x[i+dim*_nmax] <= hi) { + add=true; + } + + shared[threadIdx.x]=add?1:0; + + __syncthreads(); + + int nsend=0; + if(threadIdx.x==0) + { + for(int k=0;k= lo && _x[i+dim*_nmax] <= hi) { + add=true; + } + shared[threadIdx.x]=add?1:0; + + __syncthreads(); + + nsend=0; + if(threadIdx.x==0) + { + for(int k=0;k= 2) { + int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x+nfirst; + if(i= mlo[itype] && _x[i+dim*_nmax] <= 
mhi[itype]) { + add=true; + } + } + shared[threadIdx.x]=add?1:0; + + __syncthreads(); + + int nsend=0; + if(threadIdx.x==0) + { + for(int k=0;k= mlo[itype] && _x[i+dim*_nmax] <= mhi[itype]) { + add=true; + } + } + shared[threadIdx.x]=add?1:0; + + __syncthreads(); + + int nsend=0; + if(threadIdx.x==0) + { + for(int k=0;k= mlo[itype] && _x[i+dim*_nmax] <= mhi[itype]) { + add=true; + } + } + shared[threadIdx.x]=add?1:0; + + __syncthreads(); + + nsend=0; + if(threadIdx.x==0) + { + for(int k=0;k +#define MY_PREFIX compute_temp_cuda +#include "cuda_shared.h" +#include "cuda_common.h" + +#include "crm_cuda_utils.cu" + +#include "compute_temp_cuda_cu.h" +#include "compute_temp_cuda_kernel.cu" + +void Cuda_ComputeTempCuda_UpdateBuffer(cuda_shared_data* sdata) +{ + int size=(unsigned)((sdata->atom.nlocal+63)/64.0)*6*sizeof(ENERGY_FLOAT); + if(sdata->buffersizebuffer,sdata->buffersize);) + CudaWrapper_FreeCudaData(sdata->buffer,sdata->buffersize); + sdata->buffer = CudaWrapper_AllocCudaData(size); + sdata->buffersize=size; + sdata->buffer_new++; + MYDBG(printf("New buffer at %p with %i kB\n",sdata->buffer,sdata->buffersize);) + } + cudaMemcpyToSymbol(MY_CONST(buffer), & sdata->buffer, sizeof(int*) ); +} + +void Cuda_ComputeTempCuda_UpdateNmax(cuda_shared_data* sdata) +{ + cudaMemcpyToSymbol(MY_CONST(mask) , & sdata->atom.mask .dev_data, sizeof(int*) ); + cudaMemcpyToSymbol(MY_CONST(mass) , & sdata->atom.mass .dev_data, sizeof(V_FLOAT*) ); + if(sdata->atom.rmass_flag) + cudaMemcpyToSymbol(MY_CONST(rmass) , & sdata->atom.rmass.dev_data, sizeof(V_FLOAT*) ); + cudaMemcpyToSymbol(MY_CONST(rmass_flag) , & sdata->atom.rmass_flag, sizeof(int) ); + cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); + cudaMemcpyToSymbol(MY_CONST(nmax) , & sdata->atom.nmax , sizeof(int) ); + cudaMemcpyToSymbol(MY_CONST(v) , & sdata->atom.v .dev_data, sizeof(V_FLOAT*) ); + cudaMemcpyToSymbol(MY_CONST(type) , & sdata->atom.type .dev_data, sizeof(int*) ); +} + +void 
Cuda_ComputeTempCuda_Init(cuda_shared_data* sdata) +{ + Cuda_ComputeTempCuda_UpdateNmax(sdata); +} + + +void Cuda_ComputeTempCuda_Vector(cuda_shared_data* sdata, int groupbit,ENERGY_FLOAT* t) +{ + //if(sdata->atom.update_nmax) //is most likely not called every timestep, therefore update of constants is necessary + Cuda_ComputeTempCuda_UpdateNmax(sdata); + //if(sdata->atom.update_nlocal) + cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); + //if(sdata->buffer_new) + Cuda_ComputeTempCuda_UpdateBuffer(sdata); + + int3 layout=getgrid(sdata->atom.nlocal); + dim3 threads(layout.z, 1, 1); + dim3 grid(layout.x, layout.y, 1); + if(sdata->atom.nlocal>0) + { + Cuda_ComputeTempCuda_Vector_Kernel<<>> (groupbit); + cudaThreadSynchronize(); + CUT_CHECK_ERROR("Cuda_ComputeTempCuda_Vector: compute_vector Kernel execution failed"); + + int oldgrid=grid.x; + grid.x=6; + threads.x=512; + Cuda_ComputeTempCuda_Reduce_Kernel<<>> (oldgrid,t); + cudaThreadSynchronize(); + CUT_CHECK_ERROR("Cuda_ComputeTempCuda_Vector: reduce_vector Kernel execution failed"); + } +} + +void Cuda_ComputeTempCuda_Scalar(cuda_shared_data* sdata, int groupbit,ENERGY_FLOAT* t) +{ + //if(sdata->atom.update_nmax) //is most likely not called every timestep, therefore update of constants is necessary + Cuda_ComputeTempCuda_UpdateNmax(sdata); + //if(sdata->atom.update_nlocal) + cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); + //if(sdata->buffer_new) + Cuda_ComputeTempCuda_UpdateBuffer(sdata); + MYDBG(printf("#CUDA ComputeTempCuda_Scalar: %i\n",sdata->atom.nlocal);) + int3 layout=getgrid(sdata->atom.nlocal); + dim3 threads(layout.z, 1, 1); + dim3 grid(layout.x, layout.y, 1); + if(sdata->atom.nlocal>0) + { + CUT_CHECK_ERROR("Cuda_ComputeTempCuda_Scalar: pre compute_scalar Kernel"); + Cuda_ComputeTempCuda_Scalar_Kernel<<>> (groupbit); + cudaThreadSynchronize(); + CUT_CHECK_ERROR("Cuda_ComputeTempCuda_Scalar: compute_scalar Kernel execution failed"); + + int 
oldgrid=grid.x; + grid.x=1; + threads.x=512; + Cuda_ComputeTempCuda_Reduce_Kernel<<>> (oldgrid,t); + cudaThreadSynchronize(); + CUT_CHECK_ERROR("Cuda_ComputeTempCuda_Scalar: reduce_scalar Kernel execution failed"); + } +} diff --git a/lib/cuda/compute_temp_cuda_cu.h b/lib/cuda/compute_temp_cuda_cu.h new file mode 100644 index 0000000000..0793be77cb --- /dev/null +++ b/lib/cuda/compute_temp_cuda_cu.h @@ -0,0 +1,28 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ + +#include "cuda_shared.h" + +extern "C" void Cuda_ComputeTempCuda_Init(cuda_shared_data* sdata); +extern "C" void Cuda_ComputeTempCuda_Vector(cuda_shared_data* sdata, int groupbit,ENERGY_FLOAT* t); +extern "C" void Cuda_ComputeTempCuda_Scalar(cuda_shared_data* sdata, int groupbit,ENERGY_FLOAT* t); diff --git a/lib/cuda/compute_temp_cuda_kernel.cu b/lib/cuda/compute_temp_cuda_kernel.cu new file mode 100644 index 0000000000..3e97148f6b --- /dev/null +++ b/lib/cuda/compute_temp_cuda_kernel.cu @@ -0,0 +1,109 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ + +extern __shared__ ENERGY_FLOAT sharedmem[]; + + +__global__ void Cuda_ComputeTempCuda_Scalar_Kernel(int groupbit) +{ + int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x; + sharedmem[threadIdx.x]=0; + if(i < _nlocal) + { + if (_rmass_flag) { + if (_mask[i] & groupbit) + sharedmem[threadIdx.x] = (_v[i]*_v[i] + _v[i+_nmax]*_v[i+_nmax] + _v[i+2*_nmax]*_v[i+2*_nmax]) * _rmass[i]; + } else { + if (_mask[i] & groupbit) + sharedmem[threadIdx.x] = (_v[i]*_v[i] + _v[i+_nmax]*_v[i+_nmax] + _v[i+2*_nmax]*_v[i+2*_nmax]) * (_mass[_type[i]]); + } + } + reduceBlock(sharedmem); + ENERGY_FLOAT* buffer=(ENERGY_FLOAT*) _buffer; + if(threadIdx.x==0) + { + buffer[blockIdx.x]=sharedmem[0]; + } +} + +__global__ void Cuda_ComputeTempCuda_Vector_Kernel(int groupbit) +{ + int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x; + sharedmem[threadIdx.x]=0; + sharedmem[threadIdx.x+blockDim.x]=0; + sharedmem[threadIdx.x+2*blockDim.x]=0; + sharedmem[threadIdx.x+3*blockDim.x]=0; + sharedmem[threadIdx.x+4*blockDim.x]=0; + sharedmem[threadIdx.x+5*blockDim.x]=0; + if(i < _nlocal) + if (_mask[i] & groupbit) { + V_FLOAT massone; + if (_rmass_flag) massone = _rmass[i]; + else massone = _mass[_type[i]]; + sharedmem[threadIdx.x] = massone * _v[i]*_v[i]; + sharedmem[threadIdx.x+blockDim.x] = massone * _v[i+_nmax]*_v[i+_nmax]; + sharedmem[threadIdx.x+2*blockDim.x] = massone * _v[i+2*_nmax]*_v[i+2*_nmax]; + sharedmem[threadIdx.x+3*blockDim.x] = massone * _v[i]*_v[i+_nmax]; + sharedmem[threadIdx.x+4*blockDim.x] = massone * _v[i]*_v[i+2*_nmax]; + sharedmem[threadIdx.x+5*blockDim.x] = massone * _v[i+_nmax]*_v[i+2*_nmax]; + } + reduceBlock(sharedmem); + reduceBlock(&sharedmem[blockDim.x]); + reduceBlock(&sharedmem[2*blockDim.x]); + reduceBlock(&sharedmem[3*blockDim.x]); + reduceBlock(&sharedmem[4*blockDim.x]); + reduceBlock(&sharedmem[5*blockDim.x]); + ENERGY_FLOAT* buffer=(ENERGY_FLOAT*) _buffer; + 
if(threadIdx.x==0) + { + buffer[blockIdx.x]=sharedmem[0]; + buffer[blockIdx.x+gridDim.x]=sharedmem[blockDim.x]; + buffer[blockIdx.x+2*gridDim.x]=sharedmem[2*blockDim.x]; + buffer[blockIdx.x+3*gridDim.x]=sharedmem[3*blockDim.x]; + buffer[blockIdx.x+4*gridDim.x]=sharedmem[4*blockDim.x]; + buffer[blockIdx.x+5*gridDim.x]=sharedmem[5*blockDim.x]; + } +} + + +__global__ void Cuda_ComputeTempCuda_Reduce_Kernel(int n,ENERGY_FLOAT* t) +{ + int i=0; + sharedmem[threadIdx.x]=0; + ENERGY_FLOAT myforig=0.0; + ENERGY_FLOAT* buf=(ENERGY_FLOAT*) _buffer; + buf=&buf[blockIdx.x*n]; + while(i +#define MY_PREFIX compute_temp_partial_cuda +#include "cuda_shared.h" +#include "cuda_common.h" + +#include "crm_cuda_utils.cu" + +#include "compute_temp_partial_cuda_cu.h" +#include "compute_temp_partial_cuda_kernel.cu" + +void Cuda_ComputeTempPartialCuda_UpdateBuffer(cuda_shared_data* sdata) +{ + int size=(unsigned)((sdata->atom.nlocal+63)/64.0)*6*sizeof(ENERGY_FLOAT); + if(sdata->buffersizebuffer,sdata->buffersize);) + CudaWrapper_FreeCudaData(sdata->buffer,sdata->buffersize); + sdata->buffer = CudaWrapper_AllocCudaData(size); + sdata->buffersize=size; + sdata->buffer_new++; + MYDBG(printf("New buffer at %p with %i kB\n",sdata->buffer,sdata->buffersize);) + } + cudaMemcpyToSymbol(MY_CONST(buffer), & sdata->buffer, sizeof(int*) ); +} + +void Cuda_ComputeTempPartialCuda_UpdateNmax(cuda_shared_data* sdata) +{ + cudaMemcpyToSymbol(MY_CONST(mask) , & sdata->atom.mask .dev_data, sizeof(int*) ); + cudaMemcpyToSymbol(MY_CONST(mass) , & sdata->atom.mass .dev_data, sizeof(V_FLOAT*) ); + if(sdata->atom.rmass_flag) + cudaMemcpyToSymbol(MY_CONST(rmass) , & sdata->atom.rmass.dev_data, sizeof(V_FLOAT*) ); + cudaMemcpyToSymbol(MY_CONST(rmass_flag) , & sdata->atom.rmass_flag, sizeof(int) ); + cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); + cudaMemcpyToSymbol(MY_CONST(nmax) , & sdata->atom.nmax , sizeof(int) ); + cudaMemcpyToSymbol(MY_CONST(v) , & sdata->atom.v .dev_data, 
sizeof(V_FLOAT*) ); + cudaMemcpyToSymbol(MY_CONST(type) , & sdata->atom.type .dev_data, sizeof(int*) ); +} + +void Cuda_ComputeTempPartialCuda_Init(cuda_shared_data* sdata) +{ + Cuda_ComputeTempPartialCuda_UpdateNmax(sdata); +} + + +void Cuda_ComputeTempPartialCuda_Vector(cuda_shared_data* sdata, int groupbit,ENERGY_FLOAT* t,int xflag,int yflag,int zflag) +{ + //if(sdata->atom.update_nmax) //is most likely not called every timestep, therefore update of constants is necessary + Cuda_ComputeTempPartialCuda_UpdateNmax(sdata); + //if(sdata->atom.update_nlocal) + cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); + //if(sdata->buffer_new) + Cuda_ComputeTempPartialCuda_UpdateBuffer(sdata); + + int3 layout=getgrid(sdata->atom.nlocal); + dim3 threads(layout.z, 1, 1); + dim3 grid(layout.x, layout.y, 1); + if(sdata->atom.nlocal>0) + { + Cuda_ComputeTempPartialCuda_Vector_Kernel<<>> (groupbit,xflag,yflag,zflag); + cudaThreadSynchronize(); + CUT_CHECK_ERROR("Cuda_ComputeTempPartialCuda_Vector: compute_vector Kernel execution failed"); + + int oldgrid=grid.x; + grid.x=6; + threads.x=512; + Cuda_ComputeTempPartialCuda_Reduce_Kernel<<>> (oldgrid,t); + cudaThreadSynchronize(); + CUT_CHECK_ERROR("Cuda_ComputeTempPartialCuda_Vector: reduce_vector Kernel execution failed"); + } +} + +void Cuda_ComputeTempPartialCuda_Scalar(cuda_shared_data* sdata, int groupbit,ENERGY_FLOAT* t,int xflag,int yflag,int zflag) +{ + //if(sdata->atom.update_nmax) //is most likely not called every timestep, therefore update of constants is necessary + Cuda_ComputeTempPartialCuda_UpdateNmax(sdata); + //if(sdata->atom.update_nlocal) + cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); + //if(sdata->buffer_new) + Cuda_ComputeTempPartialCuda_UpdateBuffer(sdata); + MYDBG(printf("#CUDA ComputeTempPartialCuda_Scalar: %i\n",sdata->atom.nlocal);) + int3 layout=getgrid(sdata->atom.nlocal); + dim3 threads(layout.z, 1, 1); + dim3 grid(layout.x, layout.y, 1); + 
if(sdata->atom.nlocal>0) + { + CUT_CHECK_ERROR("Cuda_ComputeTempPartialCuda_Scalar: pre compute_scalar Kernel"); + Cuda_ComputeTempPartialCuda_Scalar_Kernel<<>> (groupbit,xflag,yflag,zflag); + cudaThreadSynchronize(); + CUT_CHECK_ERROR("Cuda_ComputeTempPartialCuda_Scalar: compute_scalar Kernel execution failed"); + + int oldgrid=grid.x; + grid.x=1; + threads.x=512; + Cuda_ComputeTempPartialCuda_Reduce_Kernel<<>> (oldgrid,t); + cudaThreadSynchronize(); + CUT_CHECK_ERROR("Cuda_ComputeTempPartialCuda_Scalar: reduce_scalar Kernel execution failed"); + } +} + +void Cuda_ComputeTempPartialCuda_RemoveBiasAll(cuda_shared_data* sdata, int groupbit,int xflag,int yflag,int zflag,void* vbiasall) +{ + //if(sdata->atom.update_nmax) //is most likely not called every timestep, therefore update of constants is necessary + Cuda_ComputeTempPartialCuda_UpdateNmax(sdata); + //if(sdata->atom.update_nlocal) + cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); + //if(sdata->buffer_new) + Cuda_ComputeTempPartialCuda_UpdateBuffer(sdata); + + int3 layout=getgrid(sdata->atom.nlocal); + dim3 threads(layout.z, 1, 1); + dim3 grid(layout.x, layout.y, 1); + if(sdata->atom.nlocal>0) + { + Cuda_ComputeTempPartialCuda_RemoveBiasAll_Kernel<<>> (groupbit,xflag,yflag,zflag,(V_FLOAT*) vbiasall); + cudaThreadSynchronize(); + CUT_CHECK_ERROR("Cuda_ComputeTempPartialCuda_RemoveBiasAll: compute_vector Kernel execution failed"); + } +} + +void Cuda_ComputeTempPartialCuda_RestoreBiasAll(cuda_shared_data* sdata, int groupbit,int xflag,int yflag,int zflag,void* vbiasall) +{ + //if(sdata->atom.update_nmax) //is most likely not called every timestep, therefore update of constants is necessary + Cuda_ComputeTempPartialCuda_UpdateNmax(sdata); + //if(sdata->atom.update_nlocal) + cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); + //if(sdata->buffer_new) + Cuda_ComputeTempPartialCuda_UpdateBuffer(sdata); + + int3 layout=getgrid(sdata->atom.nlocal); + dim3 
threads(layout.z, 1, 1); + dim3 grid(layout.x, layout.y, 1); + if(sdata->atom.nlocal>0) + { + Cuda_ComputeTempPartialCuda_RestoreBiasAll_Kernel<<>> (groupbit,xflag,yflag,zflag,(V_FLOAT*) vbiasall); + cudaThreadSynchronize(); + CUT_CHECK_ERROR("Cuda_ComputeTempPartialCuda_RemoveBiasAll: compute_vector Kernel execution failed"); + } +} diff --git a/lib/cuda/compute_temp_partial_cuda_cu.h b/lib/cuda/compute_temp_partial_cuda_cu.h new file mode 100644 index 0000000000..82fe86fa71 --- /dev/null +++ b/lib/cuda/compute_temp_partial_cuda_cu.h @@ -0,0 +1,30 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ + +#include "cuda_shared.h" + +extern "C" void Cuda_ComputeTempPartialCuda_Init(cuda_shared_data* sdata); +extern "C" void Cuda_ComputeTempPartialCuda_Vector(cuda_shared_data* sdata, int groupbit,ENERGY_FLOAT* t,int xflag,int yflag,int zflag); +extern "C" void Cuda_ComputeTempPartialCuda_Scalar(cuda_shared_data* sdata, int groupbit,ENERGY_FLOAT* t,int xflag,int yflag,int zflag); +extern "C" void Cuda_ComputeTempPartialCuda_RemoveBiasAll(cuda_shared_data* sdata, int groupbit,int xflag,int yflag,int zflag,void* vbiasall); +extern "C" void Cuda_ComputeTempPartialCuda_RestoreBiasAll(cuda_shared_data* sdata, int groupbit,int xflag,int yflag,int zflag,void* vbiasall); diff --git a/lib/cuda/compute_temp_partial_cuda_kernel.cu b/lib/cuda/compute_temp_partial_cuda_kernel.cu new file mode 100644 index 0000000000..c14c3a06a2 --- /dev/null +++ b/lib/cuda/compute_temp_partial_cuda_kernel.cu @@ -0,0 +1,152 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ + +extern __shared__ ENERGY_FLOAT sharedmem[]; + + +__global__ void Cuda_ComputeTempPartialCuda_Scalar_Kernel(int groupbit,int xflag,int yflag,int zflag) +{ + int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x; + sharedmem[threadIdx.x]=0; + if(i < _nlocal) + { + if (_rmass_flag) { + if (_mask[i] & groupbit) + sharedmem[threadIdx.x] = (_v[i]*_v[i]*xflag + _v[i+_nmax]*_v[i+_nmax]*yflag + _v[i+2*_nmax]*_v[i+2*_nmax]*zflag) * _rmass[i]; + } else { + if (_mask[i] & groupbit) + sharedmem[threadIdx.x] = (_v[i]*_v[i]*xflag + _v[i+_nmax]*_v[i+_nmax]*yflag + _v[i+2*_nmax]*_v[i+2*_nmax]*zflag) * (_mass[_type[i]]); + } + } + reduceBlock(sharedmem); + ENERGY_FLOAT* buffer=(ENERGY_FLOAT*) _buffer; + if(threadIdx.x==0) + { + buffer[blockIdx.x]=sharedmem[0]; + } +} + +__global__ void Cuda_ComputeTempPartialCuda_Vector_Kernel(int groupbit,int xflag,int yflag,int zflag) +{ + int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x; + sharedmem[threadIdx.x]=0; + sharedmem[threadIdx.x+blockDim.x]=0; + sharedmem[threadIdx.x+2*blockDim.x]=0; + sharedmem[threadIdx.x+3*blockDim.x]=0; + sharedmem[threadIdx.x+4*blockDim.x]=0; + sharedmem[threadIdx.x+5*blockDim.x]=0; + if(i < _nlocal) + if (_mask[i] & groupbit) { + V_FLOAT massone; + if (_rmass_flag) massone = _rmass[i]; + else massone = _mass[_type[i]]; + sharedmem[threadIdx.x] = massone * _v[i]*_v[i]*xflag; + sharedmem[threadIdx.x+blockDim.x] = massone * _v[i+_nmax]*_v[i+_nmax]*yflag; + sharedmem[threadIdx.x+2*blockDim.x] = massone * _v[i+2*_nmax]*_v[i+2*_nmax]*zflag; + sharedmem[threadIdx.x+3*blockDim.x] = massone * _v[i]*_v[i+_nmax]*xflag*yflag; + sharedmem[threadIdx.x+4*blockDim.x] = massone * _v[i]*_v[i+2*_nmax]*xflag*zflag; + sharedmem[threadIdx.x+5*blockDim.x] = massone * _v[i+_nmax]*_v[i+2*_nmax]*yflag*zflag; + } + reduceBlock(sharedmem); + reduceBlock(&sharedmem[blockDim.x]); + reduceBlock(&sharedmem[2*blockDim.x]); + 
reduceBlock(&sharedmem[3*blockDim.x]); + reduceBlock(&sharedmem[4*blockDim.x]); + reduceBlock(&sharedmem[5*blockDim.x]); + ENERGY_FLOAT* buffer=(ENERGY_FLOAT*) _buffer; + if(threadIdx.x==0) + { + buffer[blockIdx.x]=sharedmem[0]; + buffer[blockIdx.x+gridDim.x]=sharedmem[blockDim.x]; + buffer[blockIdx.x+2*gridDim.x]=sharedmem[2*blockDim.x]; + buffer[blockIdx.x+3*gridDim.x]=sharedmem[3*blockDim.x]; + buffer[blockIdx.x+4*gridDim.x]=sharedmem[4*blockDim.x]; + buffer[blockIdx.x+5*gridDim.x]=sharedmem[5*blockDim.x]; + } +} + + +__global__ void Cuda_ComputeTempPartialCuda_Reduce_Kernel(int n,ENERGY_FLOAT* t) +{ + int i=0; + sharedmem[threadIdx.x]=0; + ENERGY_FLOAT myforig=0.0; + ENERGY_FLOAT* buf=(ENERGY_FLOAT*) _buffer; + buf=&buf[blockIdx.x*n]; + while(i (b) ? (a) : (b)) + +inline int3 getgrid(int n,int shared_per_thread=0,int threadsmax=256, bool p2=false) +{ + int3 gridparams; + int sharedsize=16000; + if(shared_per_thread>0) threadsmax= sharedsize/shared_per_thread10000) + gridparams.x=gridparams.y=int(sqrt(blocks)); + else + {gridparams.x=blocks; gridparams.y=1;} + while(gridparams.x*gridparams.y*gridparams.z>31; +} + +//return value: -1 if f<0; else +1 +static inline __device__ float fsignCUDA(float f) +{ + return f<0.0f?-1.0f:1.0f; +} + +//functions to copy data between global and shared memory (indeed you can copy data between two arbitrary memory regims on device - as long as you have read respectively write rights) +//blockDim.y and blockDim.z are assumed to be 1 +static inline __device__ void copySharedToGlob(int* shared, int* glob,const int& n) +{ + int i,k; + k=n-blockDim.x; + for(i=0;i t, int i) +{ + int2 v = tex1Dfetch(t,i); + return __hiloint2double(v.y, v.x); +} + +static __device__ inline X_FLOAT4 tex1Dfetch_double(texture t, int i) +{ + int4 v = tex1Dfetch(t,2*i); + int4 u = tex1Dfetch(t,2*i+1); + X_FLOAT4 w; + + w.x= __hiloint2double(v.y, v.x); + w.y= __hiloint2double(v.w, v.z); + w.z= __hiloint2double(u.y, u.x); + w.w= __hiloint2double(u.w, u.z); + 
return w; +} +#endif + +inline void BindXTypeTexture(cuda_shared_data* sdata) +{ + #ifdef CUDA_USE_TEXTURE + _x_type_tex.normalized = false; // access with normalized texture coordinates + _x_type_tex.filterMode = cudaFilterModePoint; // Point mode, so no + _x_type_tex.addressMode[0] = cudaAddressModeWrap; // wrap texture coordinates + const textureReference* x_type_texture_ptr; + cudaGetTextureReference(&x_type_texture_ptr, MY_CONST(x_type_tex)); + + #if X_PRECISION == 1 + cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc(); + cudaBindTexture(0,x_type_texture_ptr, sdata->atom.x_type.dev_data, &channelDescXType, sdata->atom.nmax*sizeof(X_FLOAT4)); + #else + cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc(); + cudaBindTexture(0,x_type_texture_ptr, sdata->atom.x_type.dev_data, &channelDescXType, sdata->atom.nmax*2*sizeof(int4)); + #endif + #endif +} + +static __device__ inline X_FLOAT4 fetchXType(int i) +{ + #ifdef CUDA_USE_TEXTURE + #if X_PRECISION == 1 + return tex1Dfetch(_x_type_tex,i); + #else + return tex1Dfetch_double(_x_type_tex,i); + #endif + #else + return _x_type[i]; + #endif +} + +#if V_PRECISION == 2 +static __device__ inline double tex1Dfetch_double_v(texture t, int i) +{ + int2 v = tex1Dfetch(t,i); + return __hiloint2double(v.y, v.x); +} + +static __device__ inline V_FLOAT4 tex1Dfetch_double_v(texture t, int i) +{ + int4 v = tex1Dfetch(t,2*i); + int4 u = tex1Dfetch(t,2*i+1); + V_FLOAT4 w; + + w.x= __hiloint2double(v.y, v.x); + w.y= __hiloint2double(v.w, v.z); + w.z= __hiloint2double(u.y, u.x); + w.w= __hiloint2double(u.w, u.z); + return w; +} +#endif + +inline void BindVRadiusTexture(cuda_shared_data* sdata) +{ + #ifdef CUDA_USE_TEXTURE + _v_radius_tex.normalized = false; // access with normalized texture coordinates + _v_radius_tex.filterMode = cudaFilterModePoint; // Point mode, so no + _v_radius_tex.addressMode[0] = cudaAddressModeWrap; // wrap texture coordinates + const textureReference* v_radius_texture_ptr; + 
cudaGetTextureReference(&v_radius_texture_ptr, MY_CONST(v_radius_tex)); + + #if V_PRECISION == 1 + cudaChannelFormatDesc channelDescVRadius = cudaCreateChannelDesc(); + cudaBindTexture(0,v_radius_texture_ptr, sdata->atom.v_radius.dev_data, &channelDescVRadius, sdata->atom.nmax*sizeof(X_FLOAT4)); + #else + cudaChannelFormatDesc channelDescVRadius = cudaCreateChannelDesc(); + cudaBindTexture(0,v_radius_texture_ptr, sdata->atom.v_radius.dev_data, &channelDescVRadius, sdata->atom.nmax*2*sizeof(int4)); + #endif + #endif +} + +static __device__ inline V_FLOAT4 fetchVRadius(int i) +{ + #ifdef CUDA_USE_TEXTURE + #if V_PRECISION == 1 + return tex1Dfetch(_v_radius_tex,i); + #else + return tex1Dfetch_double_v(_v_radius_tex,i); + #endif + #else + return _v_radius[i]; + #endif +} + +inline void BindOmegaRmassTexture(cuda_shared_data* sdata) +{ + #ifdef CUDA_USE_TEXTURE + _omega_rmass_tex.normalized = false; // access with normalized texture coordinates + _omega_rmass_tex.filterMode = cudaFilterModePoint; // Point mode, so no + _omega_rmass_tex.addressMode[0] = cudaAddressModeWrap; // wrap texture coordinates + const textureReference* omega_rmass_texture_ptr; + cudaGetTextureReference(&omega_rmass_texture_ptr, MY_CONST(omega_rmass_tex)); + + #if V_PRECISION == 1 + cudaChannelFormatDesc channelDescOmegaRmass = cudaCreateChannelDesc(); + cudaBindTexture(0,omega_rmass_texture_ptr, sdata->atom.omega_rmass.dev_data, &channelDescOmegaRmass, sdata->atom.nmax*sizeof(X_FLOAT4)); + #else + cudaChannelFormatDesc channelDescOmegaRmass = cudaCreateChannelDesc(); + cudaBindTexture(0,omega_rmass_texture_ptr, sdata->atom.omega_rmass.dev_data, &channelDescOmegaRmass, sdata->atom.nmax*2*sizeof(int4)); + #endif + #endif +} + +static __device__ inline V_FLOAT4 fetchOmegaRmass(int i) +{ + #ifdef CUDA_USE_TEXTURE + #if V_PRECISION == 1 + return tex1Dfetch(_omega_rmass_tex,i); + #else + return tex1Dfetch_double_v(_omega_rmass_tex,i); + #endif + #else + return _omega_rmass[i]; + #endif +} + +#if 
F_PRECISION == 2 +static __device__ inline double tex1Dfetch_double_f(texture t, int i) +{ + int2 v = tex1Dfetch(t,i); + return __hiloint2double(v.y, v.x); +} + +static __device__ inline F_FLOAT4 tex1Dfetch_double_f(texture t, int i) +{ + int4 v = tex1Dfetch(t,2*i); + int4 u = tex1Dfetch(t,2*i+1); + F_FLOAT4 w; + + w.x= __hiloint2double(v.y, v.x); + w.y= __hiloint2double(v.w, v.z); + w.z= __hiloint2double(u.y, u.x); + w.w= __hiloint2double(u.w, u.z); + return w; +} +#endif + +inline void BindQTexture(cuda_shared_data* sdata) +{ + #ifdef CUDA_USE_TEXTURE + _q_tex.normalized = false; // access with normalized texture coordinates + _q_tex.filterMode = cudaFilterModePoint; // Point mode, so no + _q_tex.addressMode[0] = cudaAddressModeWrap; // wrap texture coordinates + const textureReference* q_texture_ptr; + cudaGetTextureReference(&q_texture_ptr, MY_CONST(q_tex)); + + #if F_PRECISION == 1 + cudaChannelFormatDesc channelDescQ = cudaCreateChannelDesc(); + cudaBindTexture(0,q_texture_ptr, sdata->atom.q.dev_data, &channelDescQ, sdata->atom.nmax*sizeof(F_FLOAT)); + #else + cudaChannelFormatDesc channelDescQ = cudaCreateChannelDesc(); + cudaBindTexture(0,q_texture_ptr, sdata->atom.q.dev_data, &channelDescQ, sdata->atom.nmax*sizeof(int2)); + #endif + #endif +} + +static __device__ inline F_FLOAT fetchQ(int i) +{ + #ifdef CUDA_USE_TEXTURE + #if F_PRECISION == 1 + return tex1Dfetch(_q_tex,i); + #else + return tex1Dfetch_double_f(_q_tex,i); + #endif + #else + return _q[i]; + #endif +} + +#endif + +/* + +inline void BindPairCoeffTypeTexture(cuda_shared_data* sdata,coeff_tex) +{ + #ifdef CUDA_USE_TEXTURE + _coeff_tex.normalized = false; // access with normalized texture coordinates + _coeff_tex.filterMode = cudaFilterModePoint; // Point mode, so no + _coeff_tex.addressMode[0] = cudaAddressModeWrap; // wrap texture coordinates + const textureReference* coeff_texture_ptr; + cudaGetTextureReference(&coeff_texture_ptr, MY_CONST(coeff_tex)); + + #if F_PRECISION == 1 + 
cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc(); + cudaBindTexture(0,x_type_texture_ptr, sdata->atom.x_type.dev_data, &channelDescXType, sdata->atom.nmax*sizeof(X_FLOAT4)); + #else + cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc(); + cudaBindTexture(0,x_type_texture_ptr, sdata->atom.x_type.dev_data, &channelDescXType, sdata->atom.nmax*2*sizeof(int4)); + #endif + #endif +} + +static __device__ inline X_FLOAT4 fetchXType(int i) +{ + #ifdef CUDA_USE_TEXTURE + #if X_PRECISION == 1 + return tex1Dfetch(_x_type_tex,i); + #else + return tex1Dfetch_double(_x_type_tex,i); + #endif + #else + return _x_type[i]; + #endif +} +*/ diff --git a/lib/cuda/cuda.cu b/lib/cuda/cuda.cu new file mode 100644 index 0000000000..1fc4dc4a41 --- /dev/null +++ b/lib/cuda/cuda.cu @@ -0,0 +1,22 @@ +#include "cuda_precision.h" +#include "cuda_shared.h" +#include "cuda_cu.h" + +void Cuda_Cuda_GetCompileSettings(cuda_shared_data* sdata) +{ + sdata->compile_settings.prec_glob=sizeof(CUDA_FLOAT)/4; + sdata->compile_settings.prec_x=sizeof(X_FLOAT)/4; + sdata->compile_settings.prec_v=sizeof(V_FLOAT)/4; + sdata->compile_settings.prec_f=sizeof(F_FLOAT)/4; + sdata->compile_settings.prec_pppm=sizeof(PPPM_FLOAT)/4; + sdata->compile_settings.prec_fft=sizeof(FFT_FLOAT)/4; + + #ifdef FFT_CUFFT + sdata->compile_settings.cufft=1; + #else + sdata->compile_settings.cufft=0; + #endif + + sdata->compile_settings.arch=CUDA_ARCH; + +} diff --git a/lib/cuda/cuda_cu.h b/lib/cuda/cuda_cu.h new file mode 100644 index 0000000000..48498b8d0f --- /dev/null +++ b/lib/cuda/cuda_cu.h @@ -0,0 +1 @@ +extern "C" void Cuda_Cuda_GetCompileSettings(cuda_shared_data* sdata); diff --git a/lib/cuda/cuda_data.cu b/lib/cuda/cuda_data.cu new file mode 100644 index 0000000000..327cbd9014 --- /dev/null +++ b/lib/cuda/cuda_data.cu @@ -0,0 +1,168 @@ +enum copy_mode {x, xx, xy, yx, xyz, xzy}; // yxz, yzx, zxy, zyx not yet implemented since they were not needed yet + +#include "cuda_data_cu.h" +#include 
"cuda_wrapper_cu.h" +#include "cuda_data_kernel.cu" +#include + +void CudaData_Upload_DoubleFloat(void* host_data,void* dev_data, unsigned* n,copy_mode mode,void* buffer) +{ + int size=n[0]; + if(n[1]>0) size*=n[1]; + if(n[2]>0) size*=n[2]; + + dim3 threads; threads.x=1; threads.y=1; threads.z=1; + dim3 grid; grid.x=1; grid.y=1; grid.z=1; + + if(size<=128*30) + threads.x=32; + else if(size<=256*30) + threads.x=64; + else if(size<=512*30) + threads.x=128; + else + threads.x=256; + + grid.x=((size-1)+threads.x)/threads.x; + if(grid.x>32000) + grid.x=32000; + while(grid.x*grid.y*threads.x>>((double*)buffer,(float*)dev_data,n[0],n[1],n[2],mode); + cudaThreadSynchronize(); + CudaWrapper_DownloadCudaData(debugdata, dev_data, size/2); + double sum=0; + printf("debugdata: "); + for(int i=0;i0) size*=n[1]; + if(n[2]>0) size*=n[2]; + + dim3 threads; threads.x=1; threads.y=1; threads.z=1; + dim3 grid; grid.x=1; grid.y=1; grid.z=1; + + if(size<=128*30) + threads.x=32; + else if(size<=256*30) + threads.x=64; + else if(size<=512*30) + threads.x=128; + else + threads.x=256; + + grid.x=((size-1)+threads.x)/threads.x; + if(grid.x>32000) + grid.x=32000; + while(grid.x*grid.y*threads.x>>((double*)buffer,(double*)dev_data,n[0],n[1],n[2],mode); + cudaThreadSynchronize(); +} + +void CudaData_Upload_FloatDouble(void* host_data,void* dev_data, unsigned* n,copy_mode mode,void* buffer) +{ + int size=n[0]; + if(n[1]>0) size*=n[1]; + if(n[2]>0) size*=n[2]; + + dim3 threads; threads.x=1; threads.y=1; threads.z=1; + dim3 grid; grid.x=1; grid.y=1; grid.z=1; + + if(size<=128*30) + threads.x=32; + else if(size<=256*30) + threads.x=64; + else if(size<=512*30) + threads.x=128; + else + threads.x=256; + + grid.x=((size-1)+threads.x)/threads.x; + if(grid.x>32000) + grid.x=32000; + while(grid.x*grid.y*threads.x>>((float*)buffer,(double*)dev_data,n[0],n[1],n[2],mode); + cudaThreadSynchronize(); +} + +void CudaData_Upload_FloatFloat(void* host_data,void* dev_data, unsigned* n,copy_mode mode,void* buffer) 
+{ + int size=n[0]; + if(n[1]>0) size*=n[1]; + if(n[2]>0) size*=n[2]; + + dim3 threads; threads.x=1; threads.y=1; threads.z=1; + dim3 grid; grid.x=1; grid.y=1; grid.z=1; + + if(size<=128*30) + threads.x=32; + else if(size<=256*30) + threads.x=64; + else if(size<=512*30) + threads.x=128; + else + threads.x=256; + + grid.x=((size-1)+threads.x)/threads.x; + if(grid.x>32000) + grid.x=32000; + while(grid.x*grid.y*threads.x>>((float*)buffer,(float*)dev_data,n[0],n[1],n[2],mode); + cudaThreadSynchronize(); +} + +void CudaData_Upload_IntInt(void* host_data,void* dev_data, unsigned* n,copy_mode mode,void* buffer) +{ + int size=n[0]; + if(n[1]>0) size*=n[1]; + if(n[2]>0) size*=n[2]; + + dim3 threads; threads.x=1; threads.y=1; threads.z=1; + dim3 grid; grid.x=1; grid.y=1; grid.z=1; + + if(size<=128*30) + threads.x=32; + else if(size<=256*30) + threads.x=64; + else if(size<=512*30) + threads.x=128; + else + threads.x=256; + + grid.x=((size-1)+threads.x)/threads.x; + if(grid.x>32000) + grid.x=32000; + while(grid.x*grid.y*threads.x>>((int*)buffer,(int*)dev_data,n[0],n[1],n[2],mode); + cudaThreadSynchronize(); +} + +void CudaData_Download(void* host_data,void* dev_data,int host_size, int dev_size, unsigned* n,copy_mode mode,void* buffer) +{ +} diff --git a/lib/cuda/cuda_data_cu.h b/lib/cuda/cuda_data_cu.h new file mode 100644 index 0000000000..e323b30429 --- /dev/null +++ b/lib/cuda/cuda_data_cu.h @@ -0,0 +1,13 @@ +#ifndef CUDA_DATA_CU_H_ +#define CUDA_DATA_CU_H_ + +extern "C" void CudaData_Upload_DoubleFloat(void* host_data,void* dev_data, unsigned* n,copy_mode mode,void* buffer); +extern "C" void CudaData_Upload_DoubleDouble(void* host_data,void* dev_data, unsigned* n,copy_mode mode,void* buffer); +extern "C" void CudaData_Upload_FloatDouble(void* host_data,void* dev_data, unsigned* n,copy_mode mode,void* buffer); +extern "C" void CudaData_Upload_FloatFloat(void* host_data,void* dev_data, unsigned* n,copy_mode mode,void* buffer); +extern "C" void CudaData_Upload_IntInt(void* 
host_data,void* dev_data, unsigned* n,copy_mode mode,void* buffer);

extern "C" void CudaData_Download(void* host_data,void* dev_data,int host_size, int dev_size, unsigned* n,copy_mode mode,void* buffer);


#endif /*CUDA_DATA_CU_H_*/
diff --git a/lib/cuda/cuda_data_kernel.cu b/lib/cuda/cuda_data_kernel.cu
new file mode 100644
index 0000000000..831b7b08bb
--- /dev/null
+++ b/lib/cuda/cuda_data_kernel.cu
@@ -0,0 +1,156 @@
// Reorder/convert a staged host array (double) into the device array (float).
// One thread per element. 'mode' selects the layout transform:
//   xx/xy/xyz : plain element-wise copy (i -> i)
//   yx        : transpose of a 2D nx*ny array
//   xzy       : swap of the two minor dimensions of a 3D nx*ny*nz array
// Launch layout: (grid.x*grid.y) blocks of blockDim.x threads (see cuda_data.cu).
// FIX(review): every case below was missing 'break;', so e.g. mode==xx fell
// through into the yx transpose (wrong writes, and a device division by ny
// which may be 0). The cases are mutually exclusive; breaks added to all
// five kernels in this file.
__global__ void CudaData_Upload_Kernel_DoubleFloat(double* buffer,float* dev_data,
                                                   unsigned nx,unsigned ny,unsigned nz,copy_mode mode)
{
  if(mode==x) mode=xx;
  unsigned length=nx;
  if(ny>0) length*=ny;
  if(nz>0) length*=nz;
  unsigned i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x,j,k,l;

  if(i>=length) return;
  switch(mode)
  {
    case xx:
      dev_data[i]=buffer[i];
      break;
    case xy:
      dev_data[i]=buffer[i];
      break;
    case yx:
      j=i/ny;
      k=i%ny;
      dev_data[k*nx+j]=buffer[j*ny+k];
      break;
    case xyz:
      dev_data[i]=buffer[i];
      break;
    case xzy:
      j=i/(ny*nz);
      k=(i%(ny*nz))/nz;
      l=i%nz;
      dev_data[j*ny*nz+l*ny+k]=buffer[j*ny*nz+k*nz+l];
      break;
  }
}

// Same layout transform as the DoubleFloat kernel, double -> double.
__global__ void CudaData_Upload_Kernel_DoubleDouble(double* buffer,double* dev_data,
                                                    unsigned nx,unsigned ny,unsigned nz,copy_mode mode)
{
  if(mode==x) mode=xx;
  unsigned length=nx;
  if(ny>0) length*=ny;
  if(nz>0) length*=nz;
  unsigned i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x,j,k,l;
  if(i>=length) return;
  switch(mode)
  {
    case xx:
      dev_data[i]=buffer[i];
      break;
    case xy:
      dev_data[i]=buffer[i];
      break;
    case yx:
      j=i/ny;
      k=i%ny;
      dev_data[k*nx+j]=buffer[j*ny+k];
      break;
    case xyz:
      dev_data[i]=buffer[i];
      break;
    case xzy:
      j=i/(ny*nz);
      k=(i%(ny*nz))/nz;
      l=i%nz;
      dev_data[j*ny*nz+l*ny+k]=buffer[j*ny*nz+k*nz+l];
      break;
  }
}

// Same layout transform as the DoubleFloat kernel, float -> double.
__global__ void CudaData_Upload_Kernel_FloatDouble(float* buffer,double* dev_data,
                                                   unsigned nx,unsigned ny,unsigned nz,copy_mode mode)
{
  if(mode==x) mode=xx;
  unsigned length=nx;
  if(ny>0) length*=ny;
  if(nz>0) length*=nz;
  unsigned i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x,j,k,l;
  if(i>=length) return;
  switch(mode)
  {
    case xx:
      dev_data[i]=buffer[i];
      break;
    case xy:
      dev_data[i]=buffer[i];
      break;
    case yx:
      j=i/ny;
      k=i%ny;
      dev_data[k*nx+j]=buffer[j*ny+k];
      break;
    case xyz:
      dev_data[i]=buffer[i];
      break;
    case xzy:
      j=i/(ny*nz);
      k=(i%(ny*nz))/nz;
      l=i%nz;
      dev_data[j*ny*nz+l*ny+k]=buffer[j*ny*nz+k*nz+l];
      break;
  }
}

// Same layout transform as the DoubleFloat kernel, float -> float.
__global__ void CudaData_Upload_Kernel_FloatFloat(float* buffer,float* dev_data,
                                                  unsigned nx,unsigned ny,unsigned nz,copy_mode mode)
{
  if(mode==x) mode=xx;
  unsigned length=nx;
  if(ny>0) length*=ny;
  if(nz>0) length*=nz;
  unsigned i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x,j,k,l;
  if(i>=length) return;
  switch(mode)
  {
    case xx:
      dev_data[i]=buffer[i];
      break;
    case xy:
      dev_data[i]=buffer[i];
      break;
    case yx:
      j=i/ny;
      k=i%ny;
      dev_data[k*nx+j]=buffer[j*ny+k];
      break;
    case xyz:
      dev_data[i]=buffer[i];
      break;
    case xzy:
      j=i/(ny*nz);
      k=(i%(ny*nz))/nz;
      l=i%nz;
      dev_data[j*ny*nz+l*ny+k]=buffer[j*ny*nz+k*nz+l];
      break;
  }
}

// Same layout transform as the DoubleFloat kernel, int -> int.
__global__ void CudaData_Upload_Kernel_IntInt(int* buffer,int* dev_data,
                                              unsigned nx,unsigned ny,unsigned nz,copy_mode mode)
{
  if(mode==x) mode=xx;
  unsigned length=nx;
  if(ny>0) length*=ny;
  if(nz>0) length*=nz;
  unsigned i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x,j,k,l;
  if(i>=length) return;
  switch(mode)
  {
    case xx:
      dev_data[i]=buffer[i];
      break;
    case xy:
      dev_data[i]=buffer[i];
      break;
    case yx:
      j=i/ny;
      k=i%ny;
      dev_data[k*nx+j]=buffer[j*ny+k];
      break;
    case xyz:
      dev_data[i]=buffer[i];
      break;
    case xzy:
      j=i/(ny*nz);
      k=(i%(ny*nz))/nz;
      l=i%nz;
      dev_data[j*ny*nz+l*ny+k]=buffer[j*ny*nz+k*nz+l];
      break;
  }
}
diff --git a/lib/cuda/cuda_kernel.cu b/lib/cuda/cuda_kernel.cu
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/lib/cuda/cuda_pair.cu b/lib/cuda/cuda_pair.cu
new file mode 100644
index 0000000000..531db7e2b3
--- /dev/null
+++ b/lib/cuda/cuda_pair.cu
@@ -0,0 +1,1000 @@
/*
---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +enum PAIR_FORCES {PAIR_NONE,PAIR_BORN,PAIR_BUCK,PAIR_CG_CMM,PAIR_LJ_CHARMM,PAIR_LJ_CLASS2,PAIR_LJ_CUT, PAIR_LJ_EXPAND, PAIR_LJ_GROMACS, PAIR_LJ_SMOOTH, PAIR_LJ96_CUT, PAIR_MORSE, PAIR_MORSE_R6}; +enum COUL_FORCES {COUL_NONE,COUL_CHARMM,COUL_CHARMM_IMPLICIT,COUL_CUT,COUL_LONG, COUL_DEBYE, COUL_GROMACS,COUL_SPECIAL}; +#define DATA_NONE 0 +#define DATA_V 1 +#define DATA_TAG 2 +#define DATA_RMASS 4 +#define DATA_MASS 8 +#define DATA_TORQUE 16 +#define DATA_OMEGA 32 +#define DATA_RADIUS 64 +#define DATA_DENSITY 128 +#define DATA_MASK 256 +#define DATA_V_RADIUS 512 +#define DATA_OMEGA_RMASS 1024 + +#define MY_PREFIX cuda_pair +#define IncludeCommonNeigh +#include "cuda_shared.h" +#include "cuda_common.h" +#include "cuda_wrapper_cu.h" +#include "crm_cuda_utils.cu" + +//constants used by multiple forces + +//general +#define _cutsq MY_AP(cutsq) +#define _offset MY_AP(offset) +#define _special_lj MY_AP(special_lj) +#define _special_coul MY_AP(special_coul) +#define _cutsq_global MY_AP(cutsq_global) +#define _collect_forces_later MY_AP(collect_forces_later) + +__device__ __constant__ X_FLOAT _cutsq[CUDA_MAX_TYPES2]; +__device__ 
__constant__ ENERGY_FLOAT _offset[CUDA_MAX_TYPES2]; +__device__ __constant__ F_FLOAT _special_lj[4]; +__device__ __constant__ F_FLOAT _special_coul[4]; +__device__ __constant__ X_FLOAT _cutsq_global; +__device__ __constant__ int _collect_forces_later; + +__device__ __constant__ F_FLOAT MY_AP(coeff1)[CUDA_MAX_TYPES2]; //pair force coefficients in case ntypes < CUDA_MAX_TYPES (coeffs fit into constant space) +__device__ __constant__ F_FLOAT MY_AP(coeff2)[CUDA_MAX_TYPES2]; +__device__ __constant__ F_FLOAT MY_AP(coeff3)[CUDA_MAX_TYPES2]; +__device__ __constant__ F_FLOAT MY_AP(coeff4)[CUDA_MAX_TYPES2]; +__device__ __constant__ F_FLOAT MY_AP(coeff5)[CUDA_MAX_TYPES2]; + + +__device__ __constant__ F_FLOAT* MY_AP(coeff1_gm); //pair force coefficients in case ntypes > CUDA_MAX_TYPES (coeffs do not fit into constant space) +__device__ __constant__ F_FLOAT* MY_AP(coeff2_gm); +__device__ __constant__ F_FLOAT* MY_AP(coeff3_gm); +__device__ __constant__ F_FLOAT* MY_AP(coeff4_gm); +__device__ __constant__ F_FLOAT* MY_AP(coeff5_gm); +__device__ __constant__ F_FLOAT* MY_AP(coeff6_gm); +__device__ __constant__ F_FLOAT* MY_AP(coeff7_gm); +__device__ __constant__ F_FLOAT* MY_AP(coeff8_gm); +__device__ __constant__ F_FLOAT* MY_AP(coeff9_gm); +__device__ __constant__ F_FLOAT* MY_AP(coeff10_gm); + + #define _coeff1_gm_tex MY_AP(coeff1_gm_tex) + #if F_PRECISION == 1 + texture _coeff1_gm_tex; + #else + texture _coeff1_gm_tex; + #endif + + #define _coeff2_gm_tex MY_AP(coeff2_gm_tex) + #if F_PRECISION == 1 + texture _coeff2_gm_tex; + #else + texture _coeff2_gm_tex; + #endif + + #define _coeff3_gm_tex MY_AP(coeff3_gm_tex) + #if F_PRECISION == 1 + texture _coeff3_gm_tex; + #else + texture _coeff3_gm_tex; + #endif + + #define _coeff4_gm_tex MY_AP(coeff4_gm_tex) + #if F_PRECISION == 1 + texture _coeff4_gm_tex; + #else + texture _coeff4_gm_tex; + #endif + + #define _coeff5_gm_tex MY_AP(coeff5_gm_tex) + #if F_PRECISION == 1 + texture _coeff5_gm_tex; + #else + texture _coeff5_gm_tex; + #endif + + 
#define _coeff6_gm_tex MY_AP(coeff6_gm_tex) + #if F_PRECISION == 1 + texture _coeff6_gm_tex; + #else + texture _coeff6_gm_tex; + #endif + + #define _coeff7_gm_tex MY_AP(coeff7_gm_tex) + #if F_PRECISION == 1 + texture _coeff7_gm_tex; + #else + texture _coeff7_gm_tex; + #endif + + #define _coeff8_gm_tex MY_AP(coeff8_gm_tex) + #if F_PRECISION == 1 + texture _coeff8_gm_tex; + #else + texture _coeff8_gm_tex; + #endif + + #define _coeff9_gm_tex MY_AP(coeff9_gm_tex) + #if F_PRECISION == 1 + texture _coeff9_gm_tex; + #else + texture _coeff9_gm_tex; + #endif + + #define _coeff10_gm_tex MY_AP(coeff10_gm_tex) + #if F_PRECISION == 1 + texture _coeff10_gm_tex; + #else + texture _coeff10_gm_tex; + #endif + +//if more than 5 coefficients are needed for a pair potential add them here + + +//coulomb +#define _cut_coulsq MY_AP(cut_coulsq) +#define _cut_coulsq_global MY_AP(cut_coulsq_global) +#define _g_ewald MY_AP(g_ewald) +#define _qqrd2e MY_AP(qqrd2e) +#define _kappa MY_AP(kappa) +__device__ __constant__ X_FLOAT _cut_coulsq[CUDA_MAX_TYPES2]; +__device__ __constant__ X_FLOAT _cut_coulsq_global; +__device__ __constant__ F_FLOAT _g_ewald; +__device__ __constant__ F_FLOAT _qqrd2e; +__device__ __constant__ F_FLOAT _kappa; + +//inner cutoff +#define _cut_innersq MY_AP(cut_innersq) +#define _cut_innersq_global MY_AP(cut_innersq_global) +__device__ __constant__ X_FLOAT _cut_innersq[CUDA_MAX_TYPES2]; +__device__ __constant__ X_FLOAT _cut_innersq_global; + + +template +__global__ void Pair_Kernel_TpA(int eflag, int vflag,int eflag_atom,int vflag_atom); + +template +__global__ void Pair_Kernel_BpA(int eflag, int vflag,int eflag_atom,int vflag_atom); + +template +__global__ void Pair_Kernel_TpA_opt(int eflag, int vflag,int eflag_atom,int vflag_atom, int comm_phase); + +template +__global__ void Pair_Kernel_BpA_opt(int eflag, int vflag,int eflag_atom,int vflag_atom, int comm_phase); + +#include +#include "cuda_pair_cu.h" +#include "cuda_pair_virial_kernel_nc.cu" + +//Functions which are shared 
// by pair styles

//Update Buffersize
// Grow the shared device scratch buffer (used to collect per-block energy/virial
// partials) to at least 'size' bytes; only reallocates when too small, and then
// publishes the new pointer to the device constant MY_CONST(buffer).
void Cuda_UpdateBuffer(cuda_shared_data* sdata,int size)
{
  CUT_CHECK_ERROR("Cuda_Pair_UpdateBuffer_AllStyles: before updateBuffer failed");
  // NOTE(review): the comparison and the first MYDBG printf were reconstructed --
  // the patch extraction dropped the '<...>' span of this line. Confirm against
  // the original lib/cuda/cuda_pair.cu.
  if(sdata->buffersize<size)
  {
    MYDBG(printf("Resize buffer at %p with %i kB\n",sdata->buffer,sdata->buffersize);)
    CudaWrapper_FreeCudaData(sdata->buffer,sdata->buffersize);
    sdata->buffer = CudaWrapper_AllocCudaData(size);
    sdata->buffersize=size;
    sdata->buffer_new++;
    MYDBG(printf("New buffer at %p with %i kB\n",sdata->buffer,sdata->buffersize);)
  }
  cudaMemcpyToSymbol(MY_CONST(buffer), & sdata->buffer, sizeof(int*) );
  CUT_CHECK_ERROR("Cuda_Pair_UpdateBuffer_AllStyles failed");
}

//Update constants after nmax change which are generally needed by all pair styles
// Pushes the current neighbor-list and per-atom device pointers into the
// module's __constant__ symbols (they become stale whenever nmax changes).
void Cuda_Pair_UpdateNmax_AllStyles(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist)
{
  CUT_CHECK_ERROR("Cuda_Pair_UpdateNmax_AllStyles: Begin");
  //Neighbor
  cudaMemcpyToSymbol(MY_CONST(neighbor_maxlocal) , & sneighlist->firstneigh.dim[0] , sizeof(unsigned) );
  cudaMemcpyToSymbol(MY_CONST(firstneigh)        , & sneighlist->firstneigh.dev_data, sizeof(int*)    );
  cudaMemcpyToSymbol(MY_CONST(ilist)             , & sneighlist->ilist     .dev_data, sizeof(int*)    );
  cudaMemcpyToSymbol(MY_CONST(inum)              , & sneighlist->inum               , sizeof(int)     );
  cudaMemcpyToSymbol(MY_CONST(numneigh)          , & sneighlist->numneigh  .dev_data, sizeof(int*)    );
  cudaMemcpyToSymbol(MY_CONST(neighbors)         , & sneighlist->neighbors .dev_data, sizeof(int*)    );
  cudaMemcpyToSymbol(MY_CONST(maxneighbors)      , & sneighlist->maxneighbors       , sizeof(int)     );
  cudaMemcpyToSymbol(MY_CONST(overlap_comm)      , & sdata->overlap_comm            , sizeof(int)     );

  // extra border/inner lists only exist when communication is overlapped with compute
  if(sdata->overlap_comm)
  {
    cudaMemcpyToSymbol(MY_CONST(numneigh_border)  , & sneighlist->numneigh_border .dev_data, sizeof(int*));
    cudaMemcpyToSymbol(MY_CONST(numneigh_inner)   , & sneighlist->numneigh_inner  .dev_data, sizeof(int*));
    cudaMemcpyToSymbol(MY_CONST(neighbors_border) , & sneighlist->neighbors_border.dev_data, sizeof(int*));
    cudaMemcpyToSymbol(MY_CONST(neighbors_inner)  , &
sneighlist->neighbors_inner .dev_data, sizeof(int*)); + cudaMemcpyToSymbol(MY_CONST(ilist_border) , & sneighlist->ilist_border .dev_data, sizeof(int*)); + cudaMemcpyToSymbol(MY_CONST(inum_border) , & sneighlist->inum_border .dev_data, sizeof(int*) ); + } + + //System + cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); + cudaMemcpyToSymbol(MY_CONST(nall) , & sdata->atom.nall , sizeof(int) ); + cudaMemcpyToSymbol(MY_CONST(nmax) , & sdata->atom.nmax , sizeof(int) ); + + //Atom + cudaMemcpyToSymbol(MY_CONST(x) , & sdata->atom.x .dev_data, sizeof(X_FLOAT*) ); + cudaMemcpyToSymbol(MY_CONST(x_type) , & sdata->atom.x_type .dev_data, sizeof(X_FLOAT4*) ); + cudaMemcpyToSymbol(MY_CONST(f) , & sdata->atom.f .dev_data, sizeof(F_FLOAT*) ); + cudaMemcpyToSymbol(MY_CONST(type) , & sdata->atom.type .dev_data, sizeof(int*) ); + cudaMemcpyToSymbol(MY_CONST(q) , & sdata->atom.q .dev_data, sizeof(F_FLOAT*) ); + cudaMemcpyToSymbol(MY_CONST(tag) , & sdata->atom.tag .dev_data, sizeof(int*) ); + cudaMemcpyToSymbol(MY_CONST(eatom) , & sdata->atom.eatom .dev_data, sizeof(ENERGY_FLOAT*) ); + cudaMemcpyToSymbol(MY_CONST(vatom) , & sdata->atom.vatom .dev_data, sizeof(ENERGY_FLOAT*) ); + + + //Other + cudaMemcpyToSymbol(MY_CONST(debugdata) , & sdata->debugdata , sizeof(int*) ); + CUT_CHECK_ERROR("Cuda_Pair_UpdateNmax_AllStyles: End"); +} + +//Initialisation of GPU Constants which rarely change +void Cuda_Pair_Init_AllStyles(cuda_shared_data* sdata, int ncoeff, bool need_q=false, bool use_global_params=false, bool need_innercut=false, bool need_cut=true ) +{ + unsigned cuda_ntypes = sdata->atom.ntypes + 1; + unsigned cuda_ntypes2 = cuda_ntypes * cuda_ntypes; + unsigned n = sizeof(F_FLOAT) * cuda_ntypes2; + unsigned nx = sizeof(X_FLOAT) * cuda_ntypes2; + + //check if enough constant memory is available + if((cuda_ntypes2 > CUDA_MAX_TYPES2 )&& !use_global_params) + printf("# CUDA: Cuda_Pair_Init: you need %u types. this is more than %u " + "(assumed at compile time). 
re-compile with -DCUDA_MAX_TYPES_PLUS_ONE=32 " + "or ajust this in cuda_common.h\n", cuda_ntypes, CUDA_MAX_TYPES_PLUS_ONE-1); + if((cuda_ntypes2 > CUDA_MAX_TYPES2 )&& !use_global_params) + exit(0); + //type conversion of cutoffs and parameters + if(need_cut) + { + X_FLOAT cutsq[cuda_ntypes2]; + for(int i=1; i<=sdata->atom.ntypes; ++i) + { + for(int j=1; j<=sdata->atom.ntypes; ++j) + { + cutsq[i * cuda_ntypes + j] = (X_FLOAT) (sdata->pair.cut_global * sdata->pair.cut_global); + } + } + + int cutsqdiffer=0; + X_FLOAT cutsq_global; + cutsq_global = (X_FLOAT) (sdata->pair.cut_global * sdata->pair.cut_global); + if(sdata->pair.cut) + { + for(int i=1; i<=sdata->atom.ntypes; ++i) + { + for(int j=1; j<=sdata->atom.ntypes; ++j) + { + if(sdata->pair.cut[i][j]>1e-6) + cutsq[i * cuda_ntypes + j] = (X_FLOAT) (sdata->pair.cut[i][j] * sdata->pair.cut[i][j]); + else + if(sdata->pair.cut[j][i]>1e-6) + cutsq[i * cuda_ntypes + j] = (X_FLOAT) (sdata->pair.cut[j][i] * sdata->pair.cut[j][i]); + if(i==1&&j==1) cutsq_global = cutsq[i * cuda_ntypes + j]; + if((cutsq_global - cutsq[i * cuda_ntypes + j])*(cutsq_global - cutsq[i * cuda_ntypes + j]) > 1e-6) + cutsqdiffer++; + } + } + } + + if(sdata->pair.cutsq) + { + for(int i=1; i<=sdata->atom.ntypes; ++i) + { + for(int j=1; j<=sdata->atom.ntypes; ++j) + { + if(sdata->pair.cut[i][j]>1e-6) + cutsq[i * cuda_ntypes + j] = (X_FLOAT) (sdata->pair.cutsq[i][j]); + else + if(sdata->pair.cut[j][i]>1e-6) + cutsq[i * cuda_ntypes + j] = (X_FLOAT) (sdata->pair.cutsq[j][i]); + if(i==1&&j==1) cutsq_global = cutsq[i * cuda_ntypes + j]; + if((cutsq_global - cutsq[i * cuda_ntypes + j])*(cutsq_global - cutsq[i * cuda_ntypes + j]) > 1e-6) + cutsqdiffer++; + } + } + } + + if(cutsqdiffer) + { + cutsq_global = -1.0; + cudaMemcpyToSymbol(MY_CONST(cutsq) , cutsq , nx ); + } + cudaMemcpyToSymbol(MY_CONST(cutsq_global) ,&cutsq_global , sizeof(X_FLOAT) ); + } + + if(need_innercut) + { + X_FLOAT cut_innersq[cuda_ntypes2]; + for(int i=1; i<=sdata->atom.ntypes; ++i) + { + 
for(int j=1; j<=sdata->atom.ntypes; ++j) + { + cut_innersq[i * cuda_ntypes + j] = (X_FLOAT) (sdata->pair.cut_inner_global * sdata->pair.cut_inner_global); + } + } + + int cutsqdiffer=0; + X_FLOAT cut_innersq_global; + cut_innersq_global = (X_FLOAT) (sdata->pair.cut_inner_global * sdata->pair.cut_inner_global); + if(sdata->pair.cut_inner) + { + for(int i=1; i<=sdata->atom.ntypes; ++i) + { + for(int j=1; j<=sdata->atom.ntypes; ++j) + { + if(sdata->pair.cut_inner[i][j]>1e-6) + cut_innersq[i * cuda_ntypes + j] = (X_FLOAT) (sdata->pair.cut_inner[i][j] * sdata->pair.cut_inner[i][j]); + else + if(sdata->pair.cut_inner[j][i]>1e-6) + cut_innersq[i * cuda_ntypes + j] = (X_FLOAT) (sdata->pair.cut_inner[j][i] * sdata->pair.cut_inner[j][i]); + if(i==1&&j==1) cut_innersq_global = cut_innersq[i * cuda_ntypes + j]; + if((cut_innersq_global - cut_innersq[i * cuda_ntypes + j])*(cut_innersq_global - cut_innersq[i * cuda_ntypes + j]) > 1e-6) + cutsqdiffer++; + } + } + } + if(cutsqdiffer) + { + cut_innersq_global = -1.0; + cudaMemcpyToSymbol(MY_CONST(cut_innersq) , cut_innersq , nx ); + } + cudaMemcpyToSymbol(MY_CONST(cut_innersq_global) ,&cut_innersq_global , sizeof(X_FLOAT) ); + } + + if(need_q) + { + X_FLOAT cut_coulsq[cuda_ntypes2]; + for(int i=1; i<=sdata->atom.ntypes; ++i) + { + for(int j=1; j<=sdata->atom.ntypes; ++j) + { + cut_coulsq[i * cuda_ntypes + j] = (X_FLOAT) (sdata->pair.cut_coul_global * sdata->pair.cut_coul_global); + } + } + + int cutsqdiffer=0; + X_FLOAT cut_coulsq_global; + cut_coulsq_global = (X_FLOAT) (sdata->pair.cut_coul_global * sdata->pair.cut_coul_global); + if(sdata->pair.cut_coulsq_global> cut_coulsq_global) cut_coulsq_global = (X_FLOAT) sdata->pair.cut_coulsq_global; + if(sdata->pair.cut_coul) + { + for(int i=1; i<=sdata->atom.ntypes; ++i) + { + for(int j=1; j<=sdata->atom.ntypes; ++j) + { + if(sdata->pair.cut_coul[i][j]>1e-6) + cut_coulsq[i * cuda_ntypes + j] = (X_FLOAT) (sdata->pair.cut_coul[i][j] * sdata->pair.cut_coul[i][j]); + else + 
if(sdata->pair.cut_coul[j][i]>1e-6) + cut_coulsq[i * cuda_ntypes + j] = (X_FLOAT) (sdata->pair.cut_coul[j][i] * sdata->pair.cut_coul[j][i]); + if(i==1&&j==1) cut_coulsq_global = cut_coulsq[i * cuda_ntypes + j]; + if((cut_coulsq_global - cut_coulsq[i * cuda_ntypes + j])*(cut_coulsq_global - cut_coulsq[i * cuda_ntypes + j]) > 1e-6) + cutsqdiffer++; + } + } + } + if(cutsqdiffer) + { + cut_coulsq_global = -1.0; + cudaMemcpyToSymbol(MY_CONST(cut_coulsq) , cut_coulsq , nx ); + } + cudaMemcpyToSymbol(MY_CONST(cut_coulsq_global),&cut_coulsq_global , sizeof(X_FLOAT) ); + } + CUT_CHECK_ERROR("Cuda_Pair: init pre Coeff failed"); + + if(ncoeff>0) + { + F_FLOAT coeff1[cuda_ntypes2]; + for(int i=1; i<=sdata->atom.ntypes; ++i) + { + for(int j=1; j<=sdata->atom.ntypes; ++j) + { + coeff1[i * cuda_ntypes + j] = (F_FLOAT) sdata->pair.coeff1[i][j]; + } + } + + if(use_global_params) + { + cudaMemcpyToSymbol(MY_CONST(coeff1_gm) , &sdata->pair.coeff1_gm.dev_data , sizeof(F_FLOAT*) ); + cudaMemcpy((sdata->pair.coeff1_gm.dev_data),coeff1, n,cudaMemcpyHostToDevice); + + _coeff1_gm_tex.normalized = false; // access with normalized texture coordinates + _coeff1_gm_tex.filterMode = cudaFilterModePoint; // Point mode, so no + _coeff1_gm_tex.addressMode[0] = cudaAddressModeWrap; // wrap texture coordinates + const textureReference* coeff1_gm_texture_ptr; + cudaGetTextureReference(&coeff1_gm_texture_ptr, MY_CONST(coeff1_gm_tex)); + CUT_CHECK_ERROR("Cuda_Pair: init Coeff0 a failed"); + + #if F_PRECISION == 1 + cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc(); + CUT_CHECK_ERROR("Cuda_Pair: init Coeff0 b failed"); + cudaBindTexture(0,coeff1_gm_texture_ptr, sdata->pair.coeff1_gm.dev_data, &channelDescXType, sdata->atom.nmax*sizeof(F_FLOAT)); + CUT_CHECK_ERROR("Cuda_Pair: init Coeff0 c failed"); + #else + cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc(); + CUT_CHECK_ERROR("Cuda_Pair: init Coeff0 b-d failed"); + cudaBindTexture(0,coeff1_gm_texture_ptr, 
sdata->pair.coeff1_gm.dev_data, &channelDescXType, sdata->atom.nmax*2*sizeof(int2)); + CUT_CHECK_ERROR("Cuda_Pair: init Coeff0 c-d failed"); + #endif + + } + else + cudaMemcpyToSymbol(MY_AP(coeff1), coeff1 , n); + } + CUT_CHECK_ERROR("Cuda_Pair: init Coeff0 failed"); + + if(ncoeff>1) + { + F_FLOAT coeff2[cuda_ntypes2]; + for(int i=1; i<=sdata->atom.ntypes; ++i) + { + for(int j=1; j<=sdata->atom.ntypes; ++j) + { + coeff2[i * cuda_ntypes + j] = (F_FLOAT) sdata->pair.coeff2[i][j]; + } + } + + if(use_global_params) + { + cudaMemcpyToSymbol(MY_CONST(coeff2_gm) , &sdata->pair.coeff2_gm.dev_data , sizeof(F_FLOAT*) ); + cudaMemcpy(sdata->pair.coeff2_gm.dev_data, coeff2, n,cudaMemcpyHostToDevice); + + _coeff2_gm_tex.normalized = false; // access with normalized texture coordinates + _coeff2_gm_tex.filterMode = cudaFilterModePoint; // Point mode, so no + _coeff2_gm_tex.addressMode[0] = cudaAddressModeWrap; // wrap texture coordinates + const textureReference* coeff2_gm_texture_ptr; + cudaGetTextureReference(&coeff2_gm_texture_ptr, MY_CONST(coeff2_gm_tex)); + + #if F_PRECISION == 1 + cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc(); + cudaBindTexture(0,coeff2_gm_texture_ptr, sdata->pair.coeff2_gm.dev_data, &channelDescXType, sdata->atom.nmax*sizeof(F_FLOAT)); + #else + cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc(); + cudaBindTexture(0,coeff2_gm_texture_ptr, sdata->pair.coeff2_gm.dev_data, &channelDescXType, sdata->atom.nmax*2*sizeof(int2)); + #endif + + } + else + cudaMemcpyToSymbol(MY_AP(coeff2), coeff2 , n); + } + CUT_CHECK_ERROR("Cuda_Pair: init Coeff1 failed"); + + if(ncoeff>2) + { + F_FLOAT coeff3[cuda_ntypes2]; + for(int i=1; i<=sdata->atom.ntypes; ++i) + { + for(int j=1; j<=sdata->atom.ntypes; ++j) + { + coeff3[i * cuda_ntypes + j] = (F_FLOAT) sdata->pair.coeff3[i][j]; + } + } + + if(use_global_params) + { + cudaMemcpyToSymbol(MY_CONST(coeff3_gm) , &sdata->pair.coeff3_gm.dev_data , sizeof(F_FLOAT*) ); + 
cudaMemcpy(sdata->pair.coeff3_gm.dev_data, coeff3, n,cudaMemcpyHostToDevice); + _coeff3_gm_tex.normalized = false; // access with normalized texture coordinates + _coeff3_gm_tex.filterMode = cudaFilterModePoint; // Point mode, so no + _coeff3_gm_tex.addressMode[0] = cudaAddressModeWrap; // wrap texture coordinates + const textureReference* coeff3_gm_texture_ptr; + cudaGetTextureReference(&coeff3_gm_texture_ptr, MY_CONST(coeff3_gm_tex)); + + #if F_PRECISION == 1 + cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc(); + cudaBindTexture(0,coeff3_gm_texture_ptr, sdata->pair.coeff3_gm.dev_data, &channelDescXType, sdata->atom.nmax*sizeof(F_FLOAT)); + #else + cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc(); + cudaBindTexture(0,coeff3_gm_texture_ptr, sdata->pair.coeff3_gm.dev_data, &channelDescXType, sdata->atom.nmax*2*sizeof(int2)); + #endif + } + else + cudaMemcpyToSymbol(MY_AP(coeff3), coeff3 , n); + } + CUT_CHECK_ERROR("Cuda_Pair: init Coeff3 failed"); + + if(ncoeff>3) + { + F_FLOAT coeff4[cuda_ntypes2]; + for(int i=1; i<=sdata->atom.ntypes; ++i) + { + for(int j=1; j<=sdata->atom.ntypes; ++j) + { + coeff4[i * cuda_ntypes + j] = (F_FLOAT) sdata->pair.coeff4[i][j]; + } + } + + if(use_global_params) + { + cudaMemcpyToSymbol(MY_CONST(coeff4_gm) , &sdata->pair.coeff4_gm.dev_data , sizeof(F_FLOAT*) ); + cudaMemcpy(sdata->pair.coeff4_gm.dev_data, coeff4, n,cudaMemcpyHostToDevice); + _coeff4_gm_tex.normalized = false; // access with normalized texture coordinates + _coeff4_gm_tex.filterMode = cudaFilterModePoint; // Point mode, so no + _coeff4_gm_tex.addressMode[0] = cudaAddressModeWrap; // wrap texture coordinates + const textureReference* coeff4_gm_texture_ptr; + cudaGetTextureReference(&coeff4_gm_texture_ptr, MY_CONST(coeff4_gm_tex)); + + #if F_PRECISION == 1 + cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc(); + cudaBindTexture(0,coeff4_gm_texture_ptr, sdata->pair.coeff4_gm.dev_data, &channelDescXType, 
sdata->atom.nmax*sizeof(F_FLOAT)); + #else + cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc(); + cudaBindTexture(0,coeff4_gm_texture_ptr, sdata->pair.coeff4_gm.dev_data, &channelDescXType, sdata->atom.nmax*2*sizeof(int2)); + #endif + } + else + cudaMemcpyToSymbol(MY_AP(coeff4), coeff4 , n); + } + CUT_CHECK_ERROR("Cuda_Pair: init Coeff4 failed"); + + if(ncoeff>4) + { + F_FLOAT coeff5[cuda_ntypes2]; + for(int i=1; i<=sdata->atom.ntypes; ++i) + { + for(int j=1; j<=sdata->atom.ntypes; ++j) + { + coeff5[i * cuda_ntypes + j] = (F_FLOAT) sdata->pair.coeff5[i][j]; + } + } + + if(use_global_params) + { + cudaMemcpyToSymbol(MY_CONST(coeff5_gm) , &sdata->pair.coeff5_gm.dev_data , sizeof(F_FLOAT*) ); + cudaMemcpy(sdata->pair.coeff5_gm.dev_data, coeff5, n,cudaMemcpyHostToDevice); + _coeff5_gm_tex.normalized = false; // access with normalized texture coordinates + _coeff5_gm_tex.filterMode = cudaFilterModePoint; // Point mode, so no + _coeff5_gm_tex.addressMode[0] = cudaAddressModeWrap; // wrap texture coordinates + const textureReference* coeff5_gm_texture_ptr; + cudaGetTextureReference(&coeff5_gm_texture_ptr, MY_CONST(coeff5_gm_tex)); + + #if F_PRECISION == 1 + cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc(); + cudaBindTexture(0,coeff5_gm_texture_ptr, sdata->pair.coeff5_gm.dev_data, &channelDescXType, sdata->atom.nmax*sizeof(F_FLOAT)); + #else + cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc(); + cudaBindTexture(0,coeff5_gm_texture_ptr, sdata->pair.coeff5_gm.dev_data, &channelDescXType, sdata->atom.nmax*2*sizeof(int2)); + #endif + } + else + cudaMemcpyToSymbol(MY_AP(coeff5), coeff5 , n); + } + + CUT_CHECK_ERROR("Cuda_Pair: init Coeff5 failed"); + if(ncoeff>5) + { + F_FLOAT coeff6[cuda_ntypes2]; + for(int i=1; i<=sdata->atom.ntypes; ++i) + { + for(int j=1; j<=sdata->atom.ntypes; ++j) + { + coeff6[i * cuda_ntypes + j] = (F_FLOAT) sdata->pair.coeff6[i][j]; + } + } + + if(use_global_params) + { + 
cudaMemcpyToSymbol(MY_CONST(coeff6_gm) , &sdata->pair.coeff6_gm.dev_data , sizeof(F_FLOAT*) ); + cudaMemcpy(sdata->pair.coeff6_gm.dev_data, coeff6, n,cudaMemcpyHostToDevice); + _coeff6_gm_tex.normalized = false; // access with normalized texture coordinates + _coeff6_gm_tex.filterMode = cudaFilterModePoint; // Point mode, so no + _coeff6_gm_tex.addressMode[0] = cudaAddressModeWrap; // wrap texture coordinates + const textureReference* coeff6_gm_texture_ptr; + cudaGetTextureReference(&coeff6_gm_texture_ptr, MY_CONST(coeff6_gm_tex)); + + #if F_PRECISION == 1 + cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc(); + cudaBindTexture(0,coeff6_gm_texture_ptr, sdata->pair.coeff6_gm.dev_data, &channelDescXType, sdata->atom.nmax*sizeof(F_FLOAT)); + #else + cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc(); + cudaBindTexture(0,coeff6_gm_texture_ptr, sdata->pair.coeff6_gm.dev_data, &channelDescXType, sdata->atom.nmax*2*sizeof(int2)); + #endif + } + } + CUT_CHECK_ERROR("Cuda_Pair: init Coeff6 failed"); + + if(ncoeff>6) + { + F_FLOAT coeff7[cuda_ntypes2]; + for(int i=1; i<=sdata->atom.ntypes; ++i) + { + for(int j=1; j<=sdata->atom.ntypes; ++j) + { + coeff7[i * cuda_ntypes + j] = (F_FLOAT) sdata->pair.coeff7[i][j]; + } + } + + if(use_global_params) + { + cudaMemcpyToSymbol(MY_CONST(coeff7_gm) , &sdata->pair.coeff7_gm.dev_data , sizeof(F_FLOAT*) ); + cudaMemcpy(sdata->pair.coeff7_gm.dev_data, coeff7, n,cudaMemcpyHostToDevice); + _coeff7_gm_tex.normalized = false; // access with normalized texture coordinates + _coeff7_gm_tex.filterMode = cudaFilterModePoint; // Point mode, so no + _coeff7_gm_tex.addressMode[0] = cudaAddressModeWrap; // wrap texture coordinates + const textureReference* coeff7_gm_texture_ptr; + cudaGetTextureReference(&coeff7_gm_texture_ptr, MY_CONST(coeff7_gm_tex)); + + #if F_PRECISION == 1 + cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc(); + cudaBindTexture(0,coeff7_gm_texture_ptr, sdata->pair.coeff7_gm.dev_data, 
&channelDescXType, sdata->atom.nmax*sizeof(F_FLOAT)); + #else + cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc(); + cudaBindTexture(0,coeff7_gm_texture_ptr, sdata->pair.coeff7_gm.dev_data, &channelDescXType, sdata->atom.nmax*2*sizeof(int2)); + #endif + } + } + CUT_CHECK_ERROR("Cuda_Pair: init Coeff7 failed"); + + if(ncoeff>7) + { + F_FLOAT coeff8[cuda_ntypes2]; + for(int i=1; i<=sdata->atom.ntypes; ++i) + { + for(int j=1; j<=sdata->atom.ntypes; ++j) + { + coeff8[i * cuda_ntypes + j] = (F_FLOAT) sdata->pair.coeff8[i][j]; + } + } + + if(use_global_params) + { + cudaMemcpyToSymbol(MY_CONST(coeff8_gm) , &sdata->pair.coeff8_gm.dev_data , sizeof(F_FLOAT*) ); + cudaMemcpy(sdata->pair.coeff8_gm.dev_data, coeff8, n,cudaMemcpyHostToDevice); + _coeff8_gm_tex.normalized = false; // access with normalized texture coordinates + _coeff8_gm_tex.filterMode = cudaFilterModePoint; // Point mode, so no + _coeff8_gm_tex.addressMode[0] = cudaAddressModeWrap; // wrap texture coordinates + const textureReference* coeff8_gm_texture_ptr; + cudaGetTextureReference(&coeff8_gm_texture_ptr, MY_CONST(coeff8_gm_tex)); + + #if F_PRECISION == 1 + cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc(); + cudaBindTexture(0,coeff8_gm_texture_ptr, sdata->pair.coeff8_gm.dev_data, &channelDescXType, sdata->atom.nmax*sizeof(F_FLOAT)); + #else + cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc(); + cudaBindTexture(0,coeff8_gm_texture_ptr, sdata->pair.coeff8_gm.dev_data, &channelDescXType, sdata->atom.nmax*2*sizeof(int2)); + #endif + } + } + CUT_CHECK_ERROR("Cuda_Pair: init Coeff8 failed"); + + if(ncoeff>8) + { + F_FLOAT coeff9[cuda_ntypes2]; + for(int i=1; i<=sdata->atom.ntypes; ++i) + { + for(int j=1; j<=sdata->atom.ntypes; ++j) + { + coeff9[i * cuda_ntypes + j] = (F_FLOAT) sdata->pair.coeff9[i][j]; + } + } + + if(use_global_params) + { + cudaMemcpyToSymbol(MY_CONST(coeff9_gm) , &sdata->pair.coeff9_gm.dev_data , sizeof(F_FLOAT*) ); + 
cudaMemcpy(sdata->pair.coeff9_gm.dev_data, coeff9, n,cudaMemcpyHostToDevice); + _coeff9_gm_tex.normalized = false; // access with normalized texture coordinates + _coeff9_gm_tex.filterMode = cudaFilterModePoint; // Point mode, so no + _coeff9_gm_tex.addressMode[0] = cudaAddressModeWrap; // wrap texture coordinates + const textureReference* coeff9_gm_texture_ptr; + cudaGetTextureReference(&coeff9_gm_texture_ptr, MY_CONST(coeff9_gm_tex)); + + #if F_PRECISION == 1 + cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc(); + cudaBindTexture(0,coeff9_gm_texture_ptr, sdata->pair.coeff9_gm.dev_data, &channelDescXType, sdata->atom.nmax*sizeof(F_FLOAT)); + #else + cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc(); + cudaBindTexture(0,coeff9_gm_texture_ptr, sdata->pair.coeff9_gm.dev_data, &channelDescXType, sdata->atom.nmax*2*sizeof(int2)); + #endif + } + } + CUT_CHECK_ERROR("Cuda_Pair: init Coeff9 failed"); + + F_FLOAT special_lj[4]; + special_lj[0]=sdata->pair.special_lj[0]; + special_lj[1]=sdata->pair.special_lj[1]; + special_lj[2]=sdata->pair.special_lj[2]; + special_lj[3]=sdata->pair.special_lj[3]; + + + X_FLOAT box_size[3] = + { + sdata->domain.subhi[0] - sdata->domain.sublo[0], + sdata->domain.subhi[1] - sdata->domain.sublo[1], + sdata->domain.subhi[2] - sdata->domain.sublo[2] + }; + + cudaMemcpyToSymbol(MY_CONST(box_size) , box_size , sizeof(X_FLOAT)*3); + cudaMemcpyToSymbol(MY_CONST(cuda_ntypes) ,&cuda_ntypes , sizeof(unsigned) ); + cudaMemcpyToSymbol(MY_CONST(special_lj) , special_lj , sizeof(F_FLOAT)*4); + cudaMemcpyToSymbol(MY_CONST(virial) ,&sdata->pair.virial.dev_data , sizeof(ENERGY_FLOAT*) ); + cudaMemcpyToSymbol(MY_CONST(eng_vdwl) ,&sdata->pair.eng_vdwl.dev_data , sizeof(ENERGY_FLOAT*) ); + cudaMemcpyToSymbol(MY_CONST(periodicity) , sdata->domain.periodicity , sizeof(int)*3 ); + cudaMemcpyToSymbol(MY_CONST(collect_forces_later), &sdata->pair.collect_forces_later , sizeof(int) ); + + if(need_q) + { + F_FLOAT 
qqrd2e_tmp=sdata->pppm.qqrd2e; + F_FLOAT special_coul[4]; + special_coul[0]=sdata->pair.special_coul[0]; + special_coul[1]=sdata->pair.special_coul[1]; + special_coul[2]=sdata->pair.special_coul[2]; + special_coul[3]=sdata->pair.special_coul[3]; + + cudaMemcpyToSymbol(MY_CONST(special_coul) , special_coul , sizeof(F_FLOAT)*4); + cudaMemcpyToSymbol(MY_CONST(g_ewald) ,&sdata->pair.g_ewald , sizeof(F_FLOAT) ); + cudaMemcpyToSymbol(MY_CONST(qqrd2e) ,&qqrd2e_tmp , sizeof(F_FLOAT) ); + cudaMemcpyToSymbol(MY_CONST(kappa) ,&sdata->pair.kappa , sizeof(F_FLOAT) ); + cudaMemcpyToSymbol(MY_CONST(eng_coul) ,&sdata->pair.eng_coul.dev_data , sizeof(ENERGY_FLOAT*) ); + } + CUT_CHECK_ERROR("Cuda_Pair: init failed"); +} +timespec startpairtime, endpairtime; +//Function which is called prior to kernel invocation, determins grid, Binds Textures, updates constant memory if necessary +void Cuda_Pair_PreKernel_AllStyles(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist,int eflag, int vflag, dim3& grid, dim3& threads, int& sharedperproc,bool need_q=false,int maxthreads=256) +{ + if(sdata->atom.update_nmax) + Cuda_Pair_UpdateNmax_AllStyles(sdata,sneighlist); + if(sdata->atom.update_nlocal) + { + cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); + cudaMemcpyToSymbol(MY_CONST(nall) , & sdata->atom.nall , sizeof(int) ); + } + + + + BindXTypeTexture(sdata); + if(need_q) BindQTexture(sdata); + + + sharedperproc=0; + if(sdata->pair.use_block_per_atom) sharedperproc+=3; + if(eflag) sharedperproc+=1; + if(need_q && eflag) sharedperproc+=1; + if(vflag) sharedperproc+=6; + + int threadnum = sneighlist->inum; + if (sdata->comm.comm_phase==2)threadnum=sneighlist->inum_border2; + if(sdata->pair.use_block_per_atom) {threadnum*=64; maxthreads=64;} + int3 layout=getgrid(threadnum,sharedperproc*sizeof(ENERGY_FLOAT),maxthreads,true); //need to limit to 192 threads due to register limit + threads.x = layout.z; threads.y = 1; threads.z = 1; + grid.x = layout.x; grid.y = 
layout.y; grid.z = 1;

  int size=(unsigned)(layout.y*layout.x)*sharedperproc*sizeof(ENERGY_FLOAT);
  Cuda_UpdateBuffer(sdata,size);

  // block-per-atom mode accumulates into the buffer, so it must start zeroed
  if(sdata->pair.use_block_per_atom)
    cudaMemset(sdata->buffer, 0, size);

  sdata->pair.lastgridsize=grid.x*grid.y;
  sdata->pair.n_energy_virial=sharedperproc;
  if(sdata->pair.use_block_per_atom) sdata->pair.n_energy_virial-=3;

  clock_gettime(CLOCK_REALTIME,&startpairtime);

  MYDBG( printf("# CUDA: Cuda_Pair: kernel start eflag: %i vflag: %i config: %i %i %i %i\n",eflag,vflag,grid.x,grid.y, threads.x,sharedperproc*sizeof(ENERGY_FLOAT)*threads.x); )
}

//Function which is called after the kernel invocation, collects energy and virial
// Unless force collection is deferred, waits for the pair kernel, accounts its
// runtime, and reduces the per-block energy/virial partials left in the buffer.
void Cuda_Pair_PostKernel_AllStyles(cuda_shared_data* sdata, dim3& grid, int& sharedperproc,int eflag, int vflag)
{
  if((not sdata->pair.collect_forces_later) && (eflag||vflag))//not sdata->comm.comm_phase==2))
  {
    cudaThreadSynchronize();
    clock_gettime(CLOCK_REALTIME,&endpairtime);
    sdata->cuda_timings.pair_kernel+=
      endpairtime.tv_sec-startpairtime.tv_sec+1.0*(endpairtime.tv_nsec-startpairtime.tv_nsec)/1000000000;
    CUT_CHECK_ERROR("Cuda_Pair: Kernel execution failed");

    if(eflag||vflag)
    {
      int n=grid.x*grid.y;
      if(sdata->pair.use_block_per_atom)
        grid.x=sharedperproc-3;
      else
        grid.x=sharedperproc;
      grid.y=1;
      dim3 threads(128,1,1);
      MYDBG( printf("# CUDA: Cuda_Pair: virial compute kernel start eflag: %i vflag: %i config: %i %i %i %i\n",eflag,vflag,grid.x,grid.y, threads.x,sharedperproc*sizeof(ENERGY_FLOAT)*threads.x); )
      // NOTE(review): launch configuration reconstructed -- the patch extraction
      // destroyed the <<<...>>> syntax here; the shared-memory size follows the
      // reduction kernel's one-ENERGY_FLOAT-per-thread convention. Confirm against
      // the original lib/cuda/cuda_pair.cu.
      MY_AP(PairVirialCompute_reduce)<<<grid,threads,threads.x*sizeof(ENERGY_FLOAT)>>>(n);
      cudaThreadSynchronize();
      CUT_CHECK_ERROR("Cuda_Pair: virial compute Kernel execution failed");
    }

    MYDBG( printf("# CUDA: Cuda_Pair: kernel done\n"); )
  }
}


#include "pair_born_coul_long_cuda.cu"
#include "pair_buck_coul_cut_cuda.cu"
#include "pair_buck_coul_long_cuda.cu"
#include "pair_buck_cuda.cu"
#include "pair_cg_cmm_cuda.cu"
#include "pair_cg_cmm_coul_cut_cuda.cu"
#include
"pair_cg_cmm_coul_debye_cuda.cu"
#include "pair_cg_cmm_coul_long_cuda.cu"
#include "pair_gran_hooke_cuda.cu"
#include "pair_lj_charmm_coul_charmm_implicit_cuda.cu"
#include "pair_lj_charmm_coul_charmm_cuda.cu"
#include "pair_lj_charmm_coul_long_cuda.cu"
#include "pair_lj_class2_coul_cut_cuda.cu"
#include "pair_lj_class2_coul_long_cuda.cu"
#include "pair_lj_class2_cuda.cu"
#include "pair_lj_cut_coul_cut_cuda.cu"
#include "pair_lj_cut_coul_debye_cuda.cu"
#include "pair_lj_cut_coul_long_cuda.cu"
#include "pair_lj_cut_cuda.cu"
#include "pair_lj_cut_experimental_cuda.cu"
#include "pair_lj_expand_cuda.cu"
#include "pair_lj_gromacs_cuda.cu"
#include "pair_lj_gromacs_coul_gromacs_cuda.cu"
#include "pair_lj_smooth_cuda.cu"
#include "pair_lj96_cut_cuda.cu"
#include "pair_morse_coul_long_cuda.cu"
#include "pair_morse_cuda.cu"
#include "pair_eam_cuda.cu"

#include "cuda_pair_kernel.cu"


// Refreshes the constant-memory copies of the atom counts and of every
// per-atom device array pointer. Must be called whenever nmax changed,
// because the device arrays may have been reallocated (stale pointers).
void Cuda_Pair_UpdateNmax(cuda_shared_data* sdata)
{
  CUT_CHECK_ERROR("Cuda_Pair: before updateNmax failed");
  cudaMemcpyToSymbol(MY_CONST(nlocal), &sdata->atom.nlocal, sizeof(int));
  cudaMemcpyToSymbol(MY_CONST(nall), &sdata->atom.nall, sizeof(int));
  cudaMemcpyToSymbol(MY_CONST(nmax), &sdata->atom.nmax, sizeof(int));
  cudaMemcpyToSymbol(MY_CONST(type), &sdata->atom.type.dev_data, sizeof(int*));
  cudaMemcpyToSymbol(MY_CONST(x), &sdata->atom.x.dev_data, sizeof(X_FLOAT*));
  cudaMemcpyToSymbol(MY_CONST(x_type), &sdata->atom.x_type.dev_data, sizeof(X_FLOAT4*));
  cudaMemcpyToSymbol(MY_CONST(xhold), &sdata->atom.xhold.dev_data, sizeof(X_FLOAT*));
  cudaMemcpyToSymbol(MY_CONST(v), &sdata->atom.v.dev_data, sizeof(V_FLOAT*));
  cudaMemcpyToSymbol(MY_CONST(radius), &sdata->atom.radius.dev_data, sizeof(X_FLOAT*));
  cudaMemcpyToSymbol(MY_CONST(v_radius), &sdata->atom.v_radius.dev_data, sizeof(V_FLOAT4*));
  cudaMemcpyToSymbol(MY_CONST(omega), &sdata->atom.omega.dev_data, sizeof(V_FLOAT*));
  cudaMemcpyToSymbol(MY_CONST(rmass), &sdata->atom.rmass.dev_data, sizeof(V_FLOAT*));
  cudaMemcpyToSymbol(MY_CONST(omega_rmass), &sdata->atom.omega_rmass.dev_data, sizeof(V_FLOAT4*));
  CUT_CHECK_ERROR("Cuda_Pair: updateNmax failed");
}


// Packs x[] and type[] into the interleaved x_type[] array on the device
// (one X_FLOAT4 per atom) so the pair kernels can fetch both in one load.
void Cuda_Pair_GenerateXType(cuda_shared_data* sdata)
{
  MYDBG(printf(" # CUDA: GenerateXType ... start %i %i %i %p %p %p %p\n", sdata->atom.nlocal, sdata->atom.nall, sdata->atom.nmax, sdata->atom.x.dev_data, sdata->atom.x_type.dev_data, sdata->atom.xhold.dev_data, sdata->atom.type.dev_data);)
  Cuda_Pair_UpdateNmax(sdata);
  cudaMemcpyToSymbol(MY_CONST(nlocal), &sdata->atom.nlocal, sizeof(int));
  cudaMemcpyToSymbol(MY_CONST(nall), &sdata->atom.nall, sizeof(int));
  MYDBG(printf(" # CUDA: GenerateXType ... getgrid\n"); fflush(stdout);)

  int3 layout = getgrid(sdata->atom.nall);
  dim3 threads(layout.z, 1, 1);
  dim3 grid(layout.x, layout.y, 1);

  MYDBG(printf(" # CUDA: GenerateXType ... kernel start test\n"); fflush(stdout);)
  // NOTE(review): launch configuration restored (<<<...>>> lost in extraction)
  Pair_GenerateXType_Kernel <<< grid, threads, 0>>>();
  cudaThreadSynchronize();
  CUT_CHECK_ERROR("Cuda_Pair GenerateXType: Kernel failed");
  MYDBG(printf(" # CUDA: GenerateXType ... end\n"); fflush(stdout);)
}

// Inverse of Cuda_Pair_GenerateXType: copies positions back from x_type[]
// into the plain x[] array on the device.
void Cuda_Pair_RevertXType(cuda_shared_data* sdata)
{
  MYDBG(printf(" # CUDA: RevertXType ... start\n");)
  Cuda_Pair_UpdateNmax(sdata);
  cudaMemcpyToSymbol(MY_CONST(nlocal), &sdata->atom.nlocal, sizeof(int));
  cudaMemcpyToSymbol(MY_CONST(nall), &sdata->atom.nall, sizeof(int));

  int3 layout = getgrid(sdata->atom.nall);
  dim3 threads(layout.z, 1, 1);
  dim3 grid(layout.x, layout.y, 1);

  Pair_RevertXType_Kernel <<< grid, threads, 0>>>();
  cudaThreadSynchronize();
  // bugfix: error message previously said "GenerateXType" (copy/paste)
  CUT_CHECK_ERROR("Cuda_Pair RevertXType: Kernel failed");
  MYDBG(printf(" # CUDA: RevertXType ... end\n");)
}

// Packs v[] and radius[] into the interleaved v_radius[] array on the device
// (one V_FLOAT4 per atom), used by the granular pair styles.
void Cuda_Pair_GenerateVRadius(cuda_shared_data* sdata)
{
  MYDBG(printf(" # CUDA: GenerateVRadius ... start %i %i %i %p %p %p %p\n", sdata->atom.nlocal, sdata->atom.nall, sdata->atom.nmax, sdata->atom.x.dev_data, sdata->atom.x_type.dev_data, sdata->atom.xhold.dev_data, sdata->atom.type.dev_data);)
  Cuda_Pair_UpdateNmax(sdata);
  cudaMemcpyToSymbol(MY_CONST(nlocal), &sdata->atom.nlocal, sizeof(int));
  cudaMemcpyToSymbol(MY_CONST(nall), &sdata->atom.nall, sizeof(int));
  MYDBG(printf(" # CUDA: GenerateVRadius ... getgrid\n"); fflush(stdout);)

  int3 layout = getgrid(sdata->atom.nall);
  dim3 threads(layout.z, 1, 1);
  dim3 grid(layout.x, layout.y, 1);

  MYDBG(printf(" # CUDA: GenerateVRadius ... kernel start test\n"); fflush(stdout);)
  Pair_GenerateVRadius_Kernel <<< grid, threads, 0>>>();
  cudaThreadSynchronize();
  CUT_CHECK_ERROR("Cuda_Pair GenerateVRadius: Kernel failed");
  MYDBG(printf(" # CUDA: GenerateVRadius ... end\n"); fflush(stdout);)
}

// Packs omega[] and rmass[] into the interleaved omega_rmass[] array on the
// device (one V_FLOAT4 per atom), used by the granular pair styles.
void Cuda_Pair_GenerateOmegaRmass(cuda_shared_data* sdata)
{
  MYDBG(printf(" # CUDA: GenerateOmegaRmass ... start %i %i %i %p %p %p %p\n", sdata->atom.nlocal, sdata->atom.nall, sdata->atom.nmax, sdata->atom.x.dev_data, sdata->atom.x_type.dev_data, sdata->atom.xhold.dev_data, sdata->atom.type.dev_data);)
  Cuda_Pair_UpdateNmax(sdata);
  cudaMemcpyToSymbol(MY_CONST(nlocal), &sdata->atom.nlocal, sizeof(int));
  cudaMemcpyToSymbol(MY_CONST(nall), &sdata->atom.nall, sizeof(int));
  MYDBG(printf(" # CUDA: GenerateOmegaRmass ... getgrid\n"); fflush(stdout);)

  int3 layout = getgrid(sdata->atom.nall);
  dim3 threads(layout.z, 1, 1);
  dim3 grid(layout.x, layout.y, 1);

  MYDBG(printf(" # CUDA: GenerateOmegaRmass ... kernel start test\n"); fflush(stdout);)
  Pair_GenerateOmegaRmass_Kernel <<< grid, threads, 0>>>();
  cudaThreadSynchronize();
  CUT_CHECK_ERROR("Cuda_Pair GenerateOmegaRmass: Kernel failed");
  MYDBG(printf(" # CUDA: GenerateOmegaRmass ... end\n"); fflush(stdout);)
}

// Snapshots the current positions into xhold[] on the device (used to decide
// later whether atoms moved far enough to trigger a neighbor-list rebuild).
void Cuda_Pair_BuildXHold(cuda_shared_data* sdata)
{
  Cuda_Pair_UpdateNmax(sdata);
  cudaMemcpyToSymbol(MY_CONST(nlocal), &sdata->atom.nlocal, sizeof(int));
  cudaMemcpyToSymbol(MY_CONST(nall), &sdata->atom.nall, sizeof(int));

  int3 layout = getgrid(sdata->atom.nall);
  dim3 threads(layout.z, 1, 1);
  dim3 grid(layout.x, layout.y, 1);

  Pair_BuildXHold_Kernel <<< grid, threads, 0>>>();
  cudaThreadSynchronize();
  // bugfix: error message previously said "GenerateXType" (copy/paste)
  CUT_CHECK_ERROR("Cuda_Pair BuildXHold: Kernel failed");
}

// Deferred post-processing for the collect_forces_later path: accumulates
// pair-kernel timing, reduces the buffered per-block energy/virial partial
// sums, and sums the buffered per-atom forces into f[].
void Cuda_Pair_CollectForces(cuda_shared_data* sdata, int eflag, int vflag)
{
  cudaThreadSynchronize();
  clock_gettime(CLOCK_REALTIME, &endpairtime);
  sdata->cuda_timings.pair_kernel +=
    endpairtime.tv_sec - startpairtime.tv_sec + 1.0 * (endpairtime.tv_nsec - startpairtime.tv_nsec) / 1000000000;
  CUT_CHECK_ERROR("Cuda_Pair: Kernel execution failed");
  dim3 threads;
  dim3 grid;

  if(eflag || vflag) {
    int n = sdata->pair.lastgridsize;   // partial sums per quantity = blocks of the last pair launch
    grid.x = sdata->pair.n_energy_virial;
    grid.y = 1;
    threads.x = 128;
    // NOTE(review): launch configuration restored (<<<...>>> lost in
    // extraction); the reduction uses one ENERGY_FLOAT of shared memory
    // per thread -- verify against the upstream source.
    MY_AP(PairVirialCompute_reduce) <<< grid, threads, threads.x* sizeof(ENERGY_FLOAT)>>>(n);
    cudaThreadSynchronize();
    CUT_CHECK_ERROR("Cuda_Pair_CollectForces: virial compute Kernel execution failed");
  }

  int3 layout = getgrid(sdata->atom.nlocal);
  threads.x = layout.z;
  grid.x = layout.x;
  grid.y = layout.y;
  Pair_CollectForces_Kernel <<< grid, threads, 0>>>(sdata->pair.n_energy_virial, sdata->pair.lastgridsize);
  cudaThreadSynchronize();
  CUT_CHECK_ERROR("Cuda_Pair_CollectForces: Force Summation Kernel execution failed");

}
diff --git a/lib/cuda/cuda_pair_cu.h b/lib/cuda/cuda_pair_cu.h
new file mode 100644
index 0000000000..1844735a16
--- /dev/null
+++ b/lib/cuda/cuda_pair_cu.h
@@ -0,0 +1,30 @@
/* ----------------------------------------------------------------------
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator

   Original Version:
   http://lammps.sandia.gov, Sandia National Laboratories
   Steve Plimpton, sjplimp@sandia.gov

   See the README file in the top-level
LAMMPS directory.

   -----------------------------------------------------------------------

   USER-CUDA Package and associated modifications:
   https://sourceforge.net/projects/lammpscuda/

   Christian Trott, christian.trott@tu-ilmenau.de
   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
   Theoretical Physics II, University of Technology Ilmenau, Germany

   See the README file in the USER-CUDA directory.

   This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include "cuda_shared.h"

// C-linkage entry points of the pair-force CUDA driver layer, callable from
// the host-side C++ LAMMPS classes.
extern "C" void Cuda_Pair_GenerateXType(cuda_shared_data* sdata);
extern "C" void Cuda_Pair_RevertXType(cuda_shared_data* sdata);
extern "C" void Cuda_Pair_GenerateVRadius(cuda_shared_data* sdata);
extern "C" void Cuda_Pair_GenerateOmegaRmass(cuda_shared_data* sdata);
extern "C" void Cuda_Pair_BuildXHold(cuda_shared_data* sdata);
extern "C" void Cuda_Pair_CollectForces(cuda_shared_data* sdata, int eflag, int vflag);
diff --git a/lib/cuda/cuda_pair_kernel.cu b/lib/cuda/cuda_pair_kernel.cu
new file mode 100644
index 0000000000..fe7a38a782
--- /dev/null
+++ b/lib/cuda/cuda_pair_kernel.cu
@@ -0,0 +1,1350 @@
/* ----------------------------------------------------------------------
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator

   Original Version:
   http://lammps.sandia.gov, Sandia National Laboratories
   Steve Plimpton, sjplimp@sandia.gov

   See the README file in the top-level LAMMPS directory.

   -----------------------------------------------------------------------

   USER-CUDA Package and associated modifications:
   https://sourceforge.net/projects/lammpscuda/

   Christian Trott, christian.trott@tu-ilmenau.de
   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
   Theoretical Physics II, University of Technology Ilmenau, Germany

   See the README file in the USER-CUDA directory.

   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */ + +#define EWALD_F 1.12837917 +#define EWALD_P 0.3275911 +#define A1 0.254829592 +#define A2 -0.284496736 +#define A3 1.421413741 +#define A4 -1.453152027 +#define A5 1.061405429 + +template +__global__ void Pair_Kernel_TpA(int eflag, int vflag,int eflag_atom,int vflag_atom) +{ + ENERGY_FLOAT evdwl = ENERGY_F(0.0); + ENERGY_FLOAT ecoul = ENERGY_F(0.0); + + ENERGY_FLOAT* sharedE; + ENERGY_FLOAT* sharedECoul; + ENERGY_FLOAT* sharedV = &sharedmem[threadIdx.x]; + + if(eflag||eflag_atom) + { + sharedE = &sharedmem[threadIdx.x]; + sharedE[0] = ENERGY_F(0.0); + sharedV += blockDim.x; + if(coul_type!=COUL_NONE) + { + sharedECoul = sharedE + blockDim.x; + sharedECoul[0] = ENERGY_F(0.0); + sharedV += blockDim.x; + } + } + if(vflag||vflag_atom) + { + sharedV[0*blockDim.x] = ENERGY_F(0.0); + sharedV[1*blockDim.x] = ENERGY_F(0.0); + sharedV[2*blockDim.x] = ENERGY_F(0.0); + sharedV[3*blockDim.x] = ENERGY_F(0.0); + sharedV[4*blockDim.x] = ENERGY_F(0.0); + sharedV[5*blockDim.x] = ENERGY_F(0.0); + } + + int ii = (blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x; + + X_FLOAT xtmp,ytmp,ztmp; + X_FLOAT4 myxtype; + F_FLOAT fxtmp,fytmp,fztmp,fpair; + F_FLOAT delx,dely,delz; + F_FLOAT factor_lj,factor_coul; + F_FLOAT qtmp; + int itype,i,j; + int jnum=0; + int* jlist; + + if(ii < _inum) + { + i = _ilist[ii]; + + myxtype=fetchXType(i); + xtmp=myxtype.x; + ytmp=myxtype.y; + ztmp=myxtype.z; + itype=static_cast (myxtype.w); + + + fxtmp = F_F(0.0); + fytmp = F_F(0.0); + fztmp = F_F(0.0); + + if(coul_type!=COUL_NONE) + qtmp = fetchQ(i); + + jnum = _numneigh[i]; + jlist = &_neighbors[i]; + } + __syncthreads(); + + for (int jj = 0; jj < jnum; jj++) + { + if(ii < _inum) + if(jj (myxtype.w); + + + const F_FLOAT rsq = delx*delx + dely*dely + delz*delz; + + bool in_cutoff = rsq < (_cutsq_global > X_F(0.0)? 
_cutsq_global : _cutsq[itype * _cuda_ntypes + jtype]); + if (in_cutoff) + { + switch(pair_type) + { + case PAIR_BORN: + fpair += PairBornCuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl); + break; + case PAIR_BUCK: + fpair += PairBuckCuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl); + break; + case PAIR_CG_CMM: + fpair += PairCGCMMCuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl); + break; + case PAIR_LJ_CHARMM: + fpair += PairLJCharmmCuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl); + break; + case PAIR_LJ_CLASS2: + fpair += PairLJClass2Cuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl); + break; + case PAIR_LJ_CUT: + fpair += PairLJCutCuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl); + break; + case PAIR_LJ_EXPAND: + fpair += PairLJExpandCuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl); + break; + case PAIR_LJ_GROMACS: + fpair += PairLJGromacsCuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl); + break; + case PAIR_LJ_SMOOTH: + fpair += PairLJSmoothCuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl); + break; + case PAIR_LJ96_CUT: + fpair += PairLJ96CutCuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl); + break; + case PAIR_MORSE_R6: + fpair += PairMorseR6Cuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl); + break; + case PAIR_MORSE: + fpair += PairMorseCuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl); + break; + } + } + + if(coul_type!=COUL_NONE) + { + const F_FLOAT qiqj=qtmp*fetchQ(j); + if(qiqj*qiqj>1e-8) + { + const bool in_coul_cutoff = + rsq < (_cut_coulsq_global > X_F(0.0)? 
_cut_coulsq_global : _cut_coulsq[itype * _cuda_ntypes + jtype]); + if (in_coul_cutoff) + { + switch(coul_type) + { + case COUL_CHARMM: + fpair += CoulCharmmCuda_Eval(rsq,factor_coul,eflag,ecoul,qiqj); + break; + + case COUL_CHARMM_IMPLICIT: + fpair += CoulCharmmImplicitCuda_Eval(rsq,factor_coul,eflag,ecoul,qiqj); + break; + + case COUL_CUT: + { + const F_FLOAT forcecoul = factor_coul*_qqrd2e* qiqj*_RSQRT_(rsq); + if(eflag) + { + ecoul += forcecoul; + } + fpair += forcecoul*(F_F(1.0)/rsq); + } + break; + + case COUL_DEBYE: + { + const F_FLOAT r2inv = F_F(1.0)/rsq; + const X_FLOAT r = _RSQRT_(r2inv); + const X_FLOAT rinv = F_F(1.0)/r; + const F_FLOAT screening = _EXP_(-_kappa*r); + F_FLOAT forcecoul = factor_coul * _qqrd2e * qiqj * screening ; + if(eflag) + { + ecoul += forcecoul*rinv; + } + forcecoul *= (_kappa + rinv); + fpair += forcecoul*r2inv; + } + break; + + case COUL_GROMACS: + fpair += CoulGromacsCuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_coul,eflag,ecoul,qiqj); + break; + + case COUL_LONG: + { + const F_FLOAT r2inv = F_F(1.0)/rsq; + const F_FLOAT r = _RSQRT_(r2inv); + const F_FLOAT grij = _g_ewald * r; + const F_FLOAT expm2 = _EXP_(-grij*grij); + const F_FLOAT t = F_F(1.0) / (F_F(1.0) + EWALD_P*grij); + const F_FLOAT erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2; + const F_FLOAT prefactor = _qqrd2e* qiqj*(F_F(1.0)/r); + F_FLOAT forcecoul = prefactor * (erfc + EWALD_F*grij*expm2); + if (factor_coul < 1.0) forcecoul -= (1.0-factor_coul)*prefactor; + if(eflag) + { + ecoul += prefactor*erfc; + if (factor_coul < 1.0) ecoul -= (1.0-factor_coul)*prefactor; + } + fpair += forcecoul*r2inv; + } + break; + + } + } + in_cutoff=in_cutoff || in_coul_cutoff; + } + } + + + if (in_cutoff) + { + F_FLOAT dxfp,dyfp,dzfp; + fxtmp += dxfp = delx*fpair; + fytmp += dyfp = dely*fpair; + fztmp += dzfp = delz*fpair; + if(vflag) + { + sharedV[0 * blockDim.x]+= delx*dxfp; + sharedV[1 * blockDim.x]+= dely*dyfp; + sharedV[2 * blockDim.x]+= delz*dzfp; + sharedV[3 * 
blockDim.x]+= delx*dyfp; + sharedV[4 * blockDim.x]+= delx*dzfp; + sharedV[5 * blockDim.x]+= dely*dzfp; + } + } + } + } + __syncthreads(); + if(ii < _inum) + { + F_FLOAT* my_f; + if(_collect_forces_later) + { + ENERGY_FLOAT* buffer = (ENERGY_FLOAT*) _buffer; + if(eflag) + { + buffer=&buffer[1 * gridDim.x * gridDim.y]; + if(coul_type!=COUL_NONE) + buffer=&buffer[1 * gridDim.x * gridDim.y]; + } + if(vflag) + { + buffer=&buffer[6 * gridDim.x * gridDim.y]; + } + my_f = (F_FLOAT*) buffer; + my_f += i; + *my_f = fxtmp; my_f += _nmax; + *my_f = fytmp; my_f += _nmax; + *my_f = fztmp; + } + else + { + my_f = _f + i; + *my_f += fxtmp; my_f += _nmax; + *my_f += fytmp; my_f += _nmax; + *my_f += fztmp; + } + } + __syncthreads(); + + if(eflag) + { + sharedE[0] = evdwl; + if(coul_type!=COUL_NONE) + sharedECoul[0] = ecoul; + } + if(eflag_atom && i<_nlocal) + { + if(coul_type!=COUL_NONE) + _eatom[i] += evdwl + ecoul; + else + _eatom[i] += evdwl; + } + + if(vflag_atom && i<_nlocal) + { + _vatom[i] += ENERGY_F(0.5) * sharedV[0 * blockDim.x]; + _vatom[i+_nmax] += ENERGY_F(0.5) * sharedV[1 * blockDim.x]; + _vatom[i+2*_nmax] += ENERGY_F(0.5) * sharedV[2 * blockDim.x]; + _vatom[i+3*_nmax] += ENERGY_F(0.5) * sharedV[3 * blockDim.x]; + _vatom[i+4*_nmax] += ENERGY_F(0.5) * sharedV[4 * blockDim.x]; + _vatom[i+5*_nmax] += ENERGY_F(0.5) * sharedV[5 * blockDim.x]; + } + if(vflag||eflag) PairVirialCompute_A_Kernel(eflag,vflag,coul_type!=COUL_NONE?1:0); + } + +template + __global__ void Pair_Kernel_BpA(int eflag, int vflag,int eflag_atom,int vflag_atom) +{ + int ii = (blockIdx.x*gridDim.y+blockIdx.y); + if( ii >= _inum ) + return; + + ENERGY_FLOAT evdwl = ENERGY_F(0.0); + ENERGY_FLOAT ecoul = ENERGY_F(0.0); + F_FLOAT3* sharedVirial1; + F_FLOAT3* sharedVirial2; + F_FLOAT* sharedEnergy; + F_FLOAT* sharedEnergyCoul; + + F_FLOAT3* sharedForce = (F_FLOAT3*) &sharedmem[0]; + if(vflag) + { + sharedVirial1 = &sharedForce[64]; + sharedVirial2 = &sharedVirial1[64]; + } + else + { + sharedVirial1 = 
&sharedForce[0]; + sharedVirial2 = &sharedVirial1[0]; + } + + if(eflag) + { + if(vflag||vflag_atom) + sharedEnergy = (F_FLOAT*) &sharedVirial2[64]; + else + sharedEnergy = (F_FLOAT*) &sharedForce[64]; + + if(coul_type!=COUL_NONE) + sharedEnergyCoul = (F_FLOAT*) &sharedEnergy[64]; + + } + + F_FLOAT3 partialForce = { F_F(0.0), F_F(0.0), F_F(0.0) }; + F_FLOAT3 partialVirial1 = { F_F(0.0), F_F(0.0), F_F(0.0) }; + F_FLOAT3 partialVirial2 = { F_F(0.0), F_F(0.0), F_F(0.0) }; + + X_FLOAT xtmp,ytmp,ztmp; + X_FLOAT4 myxtype; + F_FLOAT delx,dely,delz; + F_FLOAT factor_lj,factor_coul; + F_FLOAT fpair; + F_FLOAT qtmp; + int itype,jnum,i,j; + int* jlist; + + i = _ilist[ii]; + + myxtype = fetchXType(i); + + xtmp=myxtype.x; + ytmp=myxtype.y; + ztmp=myxtype.z; + itype=static_cast (myxtype.w); + + if(coul_type!=COUL_NONE) + qtmp = fetchQ(i); + + jnum = _numneigh[i]; + + jlist = &_neighbors[i*_maxneighbors]; + __syncthreads(); + for (int jj = threadIdx.x; jj < jnum+blockDim.x; jj+=blockDim.x) + { + if(jj (myxtype.w); + + const F_FLOAT rsq = delx*delx + dely*dely + delz*delz; + + bool in_cutoff = rsq < (_cutsq_global > X_F(0.0)? 
_cutsq_global : _cutsq[itype * _cuda_ntypes + jtype]); + bool in_coul_cutoff; + if (in_cutoff) + { + switch(pair_type) + { + case PAIR_BORN: + fpair += PairBornCuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl); + break; + case PAIR_BUCK: + fpair += PairBuckCuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl); + break; + case PAIR_CG_CMM: + fpair += PairCGCMMCuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl); + break; + case PAIR_LJ_CHARMM: + fpair += PairLJCharmmCuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl); + break; + case PAIR_LJ_CLASS2: + fpair += PairLJClass2Cuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl); + break; + case PAIR_LJ_CUT: + fpair += PairLJCutCuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl); + break; + case PAIR_LJ_EXPAND: + fpair += PairLJExpandCuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl); + break; + case PAIR_LJ_GROMACS: + fpair += PairLJGromacsCuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl); + break; + case PAIR_LJ_SMOOTH: + fpair += PairLJSmoothCuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl); + break; + case PAIR_LJ96_CUT: + fpair += PairLJ96CutCuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl); + break; + case PAIR_MORSE_R6: + fpair += PairMorseR6Cuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl); + break; + case PAIR_MORSE: + fpair += PairMorseCuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl); + break; + } + } + + if(coul_type!=COUL_NONE) + { + const F_FLOAT qiqj=qtmp*fetchQ(j); + if(qiqj*qiqj>(1e-8f)) + { + in_coul_cutoff = + rsq < (_cut_coulsq_global > X_F(0.0)? 
_cut_coulsq_global : _cut_coulsq[itype * _cuda_ntypes + jtype]); + if (in_coul_cutoff) + { + switch(coul_type) + { + case COUL_CHARMM: + fpair += CoulCharmmCuda_Eval(rsq,factor_coul,eflag,ecoul,qiqj); + break; + + case COUL_CHARMM_IMPLICIT: + fpair += CoulCharmmImplicitCuda_Eval(rsq,factor_coul,eflag,ecoul,qiqj); + break; + + case COUL_GROMACS: + fpair += CoulGromacsCuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_coul,eflag,ecoul,qiqj); + break; + + case COUL_LONG: + { + const F_FLOAT r2inv = F_F(1.0)/rsq; + const F_FLOAT r = _RSQRT_(r2inv); + const F_FLOAT grij = _g_ewald * r; + const F_FLOAT expm2 = _EXP_(-grij*grij); + const F_FLOAT t = F_F(1.0) / (F_F(1.0) + EWALD_P*grij); + const F_FLOAT erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2; + const F_FLOAT prefactor = _qqrd2e* qiqj*(F_F(1.0)/r); + F_FLOAT forcecoul = prefactor * (erfc + EWALD_F*grij*expm2); + if (factor_coul < 1.0) forcecoul -= (1.0-factor_coul)*prefactor; + if(eflag) + { + ecoul += prefactor*erfc; + if (factor_coul < 1.0) ecoul -= (1.0-factor_coul)*prefactor; + } + fpair += forcecoul*r2inv; + } + break; + + case COUL_DEBYE: + { + const F_FLOAT r2inv = F_F(1.0)/rsq; + const X_FLOAT r = _RSQRT_(r2inv); + const X_FLOAT rinv = F_F(1.0)/r; + const F_FLOAT screening = _EXP_(-_kappa*r); + F_FLOAT forcecoul = factor_coul * _qqrd2e * qiqj * screening ; + if(eflag) + { + ecoul += forcecoul*rinv; + } + forcecoul *= (_kappa + rinv); + fpair += forcecoul*r2inv; + } + break; + + case COUL_CUT: + { + const F_FLOAT forcecoul = factor_coul*_qqrd2e* qiqj*_RSQRT_(rsq); + if(eflag) + { + ecoul += forcecoul; + } + fpair += forcecoul*(F_F(1.0)/rsq); + } + break; + + + } + } + } + } + + + + if (in_cutoff||in_coul_cutoff) + { + F_FLOAT dxfp,dyfp,dzfp; + partialForce.x += dxfp = delx*fpair; + partialForce.y += dyfp = dely*fpair; + partialForce.z += dzfp = delz*fpair; + if(vflag) + { + partialVirial1.x+= delx*dxfp; + partialVirial1.y+= dely*dyfp; + partialVirial1.z+= delz*dzfp; + partialVirial2.x+= delx*dyfp; + 
partialVirial2.y+= delx*dzfp; + partialVirial2.z+= dely*dzfp; + } + } + } + } + + if(eflag) + { + sharedEnergy[threadIdx.x]= evdwl; + if(coul_type!=COUL_NONE) + sharedEnergyCoul[threadIdx.x]= ecoul; + } + sharedForce[threadIdx.x]=partialForce; + if(vflag) + { + sharedVirial1[threadIdx.x]=partialVirial1; + sharedVirial2[threadIdx.x]=partialVirial2; + } + + __syncthreads(); + + + for( unsigned int s = blockDim.x >> 1; s > 0; s >>= 1 ) + { + + if( threadIdx.x < s ) + { + sharedForce[ threadIdx.x ].x += sharedForce[ threadIdx.x + s ].x; + sharedForce[ threadIdx.x ].y += sharedForce[ threadIdx.x + s ].y; + sharedForce[ threadIdx.x ].z += sharedForce[ threadIdx.x + s ].z; + + if(vflag) + { + sharedVirial1[ threadIdx.x ].x += sharedVirial1[ threadIdx.x + s ].x; + sharedVirial1[ threadIdx.x ].y += sharedVirial1[ threadIdx.x + s ].y; + sharedVirial1[ threadIdx.x ].z += sharedVirial1[ threadIdx.x + s ].z; + + sharedVirial2[ threadIdx.x ].x += sharedVirial2[ threadIdx.x + s ].x; + sharedVirial2[ threadIdx.x ].y += sharedVirial2[ threadIdx.x + s ].y; + sharedVirial2[ threadIdx.x ].z += sharedVirial2[ threadIdx.x + s ].z; + } + + if(eflag) + { + sharedEnergy[ threadIdx.x ] += sharedEnergy[ threadIdx.x + s ]; + if(coul_type!=COUL_NONE) + sharedEnergyCoul[ threadIdx.x ] += sharedEnergyCoul[ threadIdx.x + s ]; + } + } + __syncthreads(); + } + + if(threadIdx.x == 0) + { + + ENERGY_FLOAT* buffer = (ENERGY_FLOAT*) _buffer; + if(eflag) + { + ENERGY_FLOAT tmp_evdwl; + buffer[blockIdx.x * gridDim.y + blockIdx.y + 0 * gridDim.x * gridDim.y]=tmp_evdwl=ENERGY_F(0.5) * sharedEnergy[0]; + if(eflag_atom) + _eatom[i] = tmp_evdwl; + buffer=&buffer[gridDim.x * gridDim.y]; + if(coul_type!=COUL_NONE) + { + buffer[blockIdx.x * gridDim.y + blockIdx.y + 0 * gridDim.x * gridDim.y]=tmp_evdwl=ENERGY_F(0.5) * sharedEnergyCoul[0]; + if(eflag_atom) + _eatom[i] += tmp_evdwl; + buffer=&buffer[gridDim.x * gridDim.y]; + } + } + if(vflag) + { + ENERGY_FLOAT tmp; + buffer[blockIdx.x * gridDim.y + blockIdx.y + 0 
* gridDim.x * gridDim.y]= tmp = ENERGY_F(0.5) * sharedVirial1[0].x; + if(vflag_atom) _vatom[i+0*_nmax] = tmp; + buffer[blockIdx.x * gridDim.y + blockIdx.y + 1 * gridDim.x * gridDim.y]= tmp = ENERGY_F(0.5) * sharedVirial1[0].y; + if(vflag_atom) _vatom[i+1*_nmax] = tmp; + buffer[blockIdx.x * gridDim.y + blockIdx.y + 2 * gridDim.x * gridDim.y]= tmp = ENERGY_F(0.5) * sharedVirial1[0].z; + if(vflag_atom) _vatom[i+2*_nmax] = tmp; + buffer[blockIdx.x * gridDim.y + blockIdx.y + 3 * gridDim.x * gridDim.y]= tmp = ENERGY_F(0.5) * sharedVirial2[0].x; + if(vflag_atom) _vatom[i+3*_nmax] = tmp; + buffer[blockIdx.x * gridDim.y + blockIdx.y + 4 * gridDim.x * gridDim.y]= tmp = ENERGY_F(0.5) * sharedVirial2[0].y; + if(vflag_atom) _vatom[i+4*_nmax] = tmp; + buffer[blockIdx.x * gridDim.y + blockIdx.y + 5 * gridDim.x * gridDim.y]= tmp = ENERGY_F(0.5) * sharedVirial2[0].z; + if(vflag_atom) _vatom[i+5*_nmax] = tmp; + buffer=&buffer[6 * gridDim.x * gridDim.y]; + } + F_FLOAT* my_f; + if(_collect_forces_later) + { + my_f = (F_FLOAT*) buffer; + my_f += i; + *my_f = sharedForce[0].x; my_f += _nmax; + *my_f = sharedForce[0].y; my_f += _nmax; + *my_f = sharedForce[0].z; + } + else + { + my_f = _f + i; + *my_f += sharedForce[0].x; my_f += _nmax; + *my_f += sharedForce[0].y; my_f += _nmax; + *my_f += sharedForce[0].z; + } + } +} + + +template +__global__ void Pair_Kernel_TpA_opt(int eflag, int vflag,int eflag_atom,int vflag_atom, int comm_phase) +{ + ENERGY_FLOAT evdwl = ENERGY_F(0.0); + ENERGY_FLOAT ecoul = ENERGY_F(0.0); + + ENERGY_FLOAT* sharedE; + ENERGY_FLOAT* sharedECoul; + ENERGY_FLOAT* sharedV = &sharedmem[threadIdx.x]; + + if(eflag||eflag_atom) + { + sharedE = &sharedmem[threadIdx.x]; + sharedE[0] = ENERGY_F(0.0); + sharedV += blockDim.x; + if(coul_type!=COUL_NONE) + { + sharedECoul = sharedE + blockDim.x; + sharedECoul[0] = ENERGY_F(0.0); + sharedV += blockDim.x; + } + } + if(vflag||vflag_atom) + { + sharedV[0*blockDim.x] = ENERGY_F(0.0); + sharedV[1*blockDim.x] = ENERGY_F(0.0); + 
sharedV[2*blockDim.x] = ENERGY_F(0.0); + sharedV[3*blockDim.x] = ENERGY_F(0.0); + sharedV[4*blockDim.x] = ENERGY_F(0.0); + sharedV[5*blockDim.x] = ENERGY_F(0.0); + } + + int ii = (blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x; + + X_FLOAT xtmp,ytmp,ztmp; + X_FLOAT4 myxtype; + F_FLOAT fxtmp,fytmp,fztmp,fpair; + F_FLOAT delx,dely,delz; + F_FLOAT factor_lj,factor_coul; + F_FLOAT qtmp; + int itype,i,j; + int jnum=0; + int* jlist; + + if(ii < (comm_phase<2?_inum:_inum_border[0])) + { + i = comm_phase<2? _ilist[ii] : _ilist_border[ii] ; + + myxtype=fetchXType(i); + myxtype=_x_type[i]; + xtmp=myxtype.x; + ytmp=myxtype.y; + ztmp=myxtype.z; + itype=static_cast (myxtype.w); + + + fxtmp = F_F(0.0); + fytmp = F_F(0.0); + fztmp = F_F(0.0); + + if(coul_type!=COUL_NONE) + qtmp = fetchQ(i); + jnum = comm_phase==0? _numneigh[i]: (comm_phase==1?_numneigh_inner[i]:_numneigh_border[ii]); + + + jlist = comm_phase==0? &_neighbors[i]: (comm_phase==1?&_neighbors_inner[i]:&_neighbors_border[ii]); + } + __syncthreads(); + + for (int jj = 0; jj < jnum; jj++) + { + if(ii < (comm_phase<2?_inum:_inum_border[0])) + if(jj (myxtype.w); + + + const F_FLOAT rsq = delx*delx + dely*dely + delz*delz; + + bool in_cutoff = rsq < (_cutsq_global > X_F(0.0)? 
_cutsq_global : _cutsq[itype * _cuda_ntypes + jtype]); + if (in_cutoff) + { + switch(pair_type) + { + case PAIR_BORN: + fpair += PairBornCuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl); + break; + case PAIR_BUCK: + fpair += PairBuckCuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl); + break; + case PAIR_CG_CMM: + fpair += PairCGCMMCuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl); + break; + case PAIR_LJ_CHARMM: + fpair += PairLJCharmmCuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl); + break; + case PAIR_LJ_CLASS2: + fpair += PairLJClass2Cuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl); + break; + case PAIR_LJ_CUT: + fpair += PairLJCutCuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl); + break; + case PAIR_LJ_EXPAND: + fpair += PairLJExpandCuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl); + break; + case PAIR_LJ_GROMACS: + fpair += PairLJGromacsCuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl); + break; + case PAIR_LJ_SMOOTH: + fpair += PairLJSmoothCuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl); + break; + case PAIR_LJ96_CUT: + fpair += PairLJ96CutCuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl); + break; + case PAIR_MORSE_R6: + fpair += PairMorseR6Cuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl); + break; + case PAIR_MORSE: + fpair += PairMorseCuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl); + break; + } + } + + if(coul_type!=COUL_NONE) + { + const F_FLOAT qiqj=qtmp*fetchQ(j); + if(qiqj*qiqj>1e-8) + { + const bool in_coul_cutoff = + rsq < (_cut_coulsq_global > X_F(0.0)? 
_cut_coulsq_global : _cut_coulsq[itype * _cuda_ntypes + jtype]); + if (in_coul_cutoff) + { + switch(coul_type) + { + case COUL_CHARMM: + fpair += CoulCharmmCuda_Eval(rsq,factor_coul,eflag,ecoul,qiqj); + break; + + case COUL_CHARMM_IMPLICIT: + fpair += CoulCharmmImplicitCuda_Eval(rsq,factor_coul,eflag,ecoul,qiqj); + break; + + case COUL_CUT: + { + const F_FLOAT forcecoul = factor_coul*_qqrd2e* qiqj*_RSQRT_(rsq); + if(eflag) + { + ecoul += forcecoul; + } + fpair += forcecoul*(F_F(1.0)/rsq); + } + break; + + case COUL_DEBYE: + { + const F_FLOAT r2inv = F_F(1.0)/rsq; + const X_FLOAT r = _RSQRT_(r2inv); + const X_FLOAT rinv = F_F(1.0)/r; + const F_FLOAT screening = _EXP_(-_kappa*r); + F_FLOAT forcecoul = factor_coul * _qqrd2e * qiqj * screening ; + if(eflag) + { + ecoul += forcecoul*rinv; + } + forcecoul *= (_kappa + rinv); + fpair += forcecoul*r2inv; + } + break; + + case COUL_GROMACS: + fpair += CoulGromacsCuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_coul,eflag,ecoul,qiqj); + break; + + case COUL_LONG: + { + const F_FLOAT r2inv = F_F(1.0)/rsq; + const F_FLOAT r = _RSQRT_(r2inv); + const F_FLOAT grij = _g_ewald * r; + const F_FLOAT expm2 = _EXP_(-grij*grij); + const F_FLOAT t = F_F(1.0) / (F_F(1.0) + EWALD_P*grij); + const F_FLOAT erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2; + const F_FLOAT prefactor = _qqrd2e* qiqj*(F_F(1.0)/r); + F_FLOAT forcecoul = prefactor * (erfc + EWALD_F*grij*expm2); + if (factor_coul < 1.0) forcecoul -= (1.0-factor_coul)*prefactor; + if(eflag) + { + ecoul += prefactor*erfc; + if (factor_coul < 1.0) ecoul -= (1.0-factor_coul)*prefactor; + } + fpair += forcecoul*r2inv; + } + break; + + } + } + in_cutoff=in_cutoff || in_coul_cutoff; + } + } + + + if (in_cutoff) + { + F_FLOAT dxfp,dyfp,dzfp; + fxtmp += dxfp = delx*fpair; + fytmp += dyfp = dely*fpair; + fztmp += dzfp = delz*fpair; + if(vflag) + { + sharedV[0 * blockDim.x]+= delx*dxfp; + sharedV[1 * blockDim.x]+= dely*dyfp; + sharedV[2 * blockDim.x]+= delz*dzfp; + sharedV[3 * 
blockDim.x]+= delx*dyfp; + sharedV[4 * blockDim.x]+= delx*dzfp; + sharedV[5 * blockDim.x]+= dely*dzfp; + } + } + } + } + __syncthreads(); + if(ii < (comm_phase<2?_inum:_inum_border[0])) + { + F_FLOAT* my_f; + if(_collect_forces_later) + { + ENERGY_FLOAT* buffer = (ENERGY_FLOAT*) _buffer; + if(eflag) + { + buffer=&buffer[1 * gridDim.x * gridDim.y]; + if(coul_type!=COUL_NONE) + buffer=&buffer[1 * gridDim.x * gridDim.y]; + } + if(vflag) + { + buffer=&buffer[6 * gridDim.x * gridDim.y]; + } + my_f = (F_FLOAT*) buffer; + my_f += i; + *my_f = fxtmp; my_f += _nmax; + *my_f = fytmp; my_f += _nmax; + *my_f = fztmp; + } + else + { + my_f = _f + i; + *my_f += fxtmp; my_f += _nmax; + *my_f += fytmp; my_f += _nmax; + *my_f += fztmp; + } + } + __syncthreads(); + + if(eflag) + { + sharedE[0] = evdwl; + if(coul_type!=COUL_NONE) + sharedECoul[0] = ecoul; + } + if(eflag_atom && i<_nlocal) + { + if(coul_type!=COUL_NONE) + _eatom[i] += evdwl + ecoul; + else + _eatom[i] += evdwl; + } + + if(vflag_atom && i<_nlocal) + { + _vatom[i] += ENERGY_F(0.5) * sharedV[0 * blockDim.x]; + _vatom[i+_nmax] += ENERGY_F(0.5) * sharedV[1 * blockDim.x]; + _vatom[i+2*_nmax] += ENERGY_F(0.5) * sharedV[2 * blockDim.x]; + _vatom[i+3*_nmax] += ENERGY_F(0.5) * sharedV[3 * blockDim.x]; + _vatom[i+4*_nmax] += ENERGY_F(0.5) * sharedV[4 * blockDim.x]; + _vatom[i+5*_nmax] += ENERGY_F(0.5) * sharedV[5 * blockDim.x]; + } + if(vflag||eflag) PairVirialCompute_A_Kernel(eflag,vflag,coul_type!=COUL_NONE?1:0); + } + +template + __global__ void Pair_Kernel_BpA_opt(int eflag, int vflag,int eflag_atom,int vflag_atom, int comm_phase) +{ + int ii = (blockIdx.x*gridDim.y+blockIdx.y); + if( ii >= (comm_phase<2?_inum:_inum_border[0])) + return; + + ENERGY_FLOAT evdwl = ENERGY_F(0.0); + ENERGY_FLOAT ecoul = ENERGY_F(0.0); + F_FLOAT3* sharedVirial1; + F_FLOAT3* sharedVirial2; + F_FLOAT* sharedEnergy; + F_FLOAT* sharedEnergyCoul; + + F_FLOAT3* sharedForce = (F_FLOAT3*) &sharedmem[0]; + if(vflag) + { + sharedVirial1 = &sharedForce[64]; 
+ sharedVirial2 = &sharedVirial1[64]; + } + else + { + sharedVirial1 = &sharedForce[0]; + sharedVirial2 = &sharedVirial1[0]; + } + + if(eflag) + { + if(vflag||vflag_atom) + sharedEnergy = (F_FLOAT*) &sharedVirial2[64]; + else + sharedEnergy = (F_FLOAT*) &sharedForce[64]; + + if(coul_type!=COUL_NONE) + sharedEnergyCoul = (F_FLOAT*) &sharedEnergy[64]; + + } + + F_FLOAT3 partialForce = { F_F(0.0), F_F(0.0), F_F(0.0) }; + F_FLOAT3 partialVirial1 = { F_F(0.0), F_F(0.0), F_F(0.0) }; + F_FLOAT3 partialVirial2 = { F_F(0.0), F_F(0.0), F_F(0.0) }; + + X_FLOAT xtmp,ytmp,ztmp; + X_FLOAT4 myxtype; + F_FLOAT delx,dely,delz; + F_FLOAT factor_lj,factor_coul; + F_FLOAT fpair; + F_FLOAT qtmp; + int itype,jnum,i,j; + int* jlist; + + i = comm_phase<2? _ilist[ii] : _ilist_border[ii]; + + myxtype = fetchXType(i); + + xtmp=myxtype.x; + ytmp=myxtype.y; + ztmp=myxtype.z; + itype=static_cast (myxtype.w); + + if(coul_type!=COUL_NONE) + qtmp = fetchQ(i); + + jnum = comm_phase==0? _numneigh[i]: (comm_phase==1?_numneigh_inner[i]:_numneigh_border[ii]); + + jlist = comm_phase==0? &_neighbors[i*_maxneighbors]: (comm_phase==1?&_neighbors_inner[i*_maxneighbors]:&_neighbors_border[ii*_maxneighbors]); + __syncthreads(); + for (int jj = threadIdx.x; jj < jnum+blockDim.x; jj+=blockDim.x) + { + if(jj (myxtype.w); + + const F_FLOAT rsq = delx*delx + dely*dely + delz*delz; + + bool in_cutoff = rsq < (_cutsq_global > X_F(0.0)? 
_cutsq_global : _cutsq[itype * _cuda_ntypes + jtype]); + bool in_coul_cutoff; + if (in_cutoff) + { + switch(pair_type) + { + case PAIR_BORN: + fpair += PairBornCuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl); + break; + case PAIR_BUCK: + fpair += PairBuckCuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl); + break; + case PAIR_CG_CMM: + fpair += PairCGCMMCuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl); + break; + case PAIR_LJ_CHARMM: + fpair += PairLJCharmmCuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl); + break; + case PAIR_LJ_CLASS2: + fpair += PairLJClass2Cuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl); + break; + case PAIR_LJ_CUT: + fpair += PairLJCutCuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl); + break; + case PAIR_LJ_EXPAND: + fpair += PairLJExpandCuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl); + break; + case PAIR_LJ_GROMACS: + fpair += PairLJGromacsCuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl); + break; + case PAIR_LJ_SMOOTH: + fpair += PairLJSmoothCuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl); + break; + case PAIR_LJ96_CUT: + fpair += PairLJ96CutCuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl); + break; + case PAIR_MORSE_R6: + fpair += PairMorseR6Cuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl); + break; + case PAIR_MORSE: + fpair += PairMorseCuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl); + break; + } + } + + if(coul_type!=COUL_NONE) + { + const F_FLOAT qiqj=qtmp*fetchQ(j); + if(qiqj*qiqj>(1e-8f)) + { + in_coul_cutoff = + rsq < (_cut_coulsq_global > X_F(0.0)? 
_cut_coulsq_global : _cut_coulsq[itype * _cuda_ntypes + jtype]); + if (in_coul_cutoff) + { + switch(coul_type) + { + case COUL_CHARMM: + fpair += CoulCharmmCuda_Eval(rsq,factor_coul,eflag,ecoul,qiqj); + break; + + case COUL_CHARMM_IMPLICIT: + fpair += CoulCharmmImplicitCuda_Eval(rsq,factor_coul,eflag,ecoul,qiqj); + break; + + case COUL_GROMACS: + fpair += CoulGromacsCuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_coul,eflag,ecoul,qiqj); + break; + + case COUL_LONG: + { + const F_FLOAT r2inv = F_F(1.0)/rsq; + const F_FLOAT r = _RSQRT_(r2inv); + const F_FLOAT grij = _g_ewald * r; + const F_FLOAT expm2 = _EXP_(-grij*grij); + const F_FLOAT t = F_F(1.0) / (F_F(1.0) + EWALD_P*grij); + const F_FLOAT erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2; + const F_FLOAT prefactor = _qqrd2e* qiqj*(F_F(1.0)/r); + F_FLOAT forcecoul = prefactor * (erfc + EWALD_F*grij*expm2); + if (factor_coul < 1.0) forcecoul -= (1.0-factor_coul)*prefactor; + if(eflag) + { + ecoul += prefactor*erfc; + if (factor_coul < 1.0) ecoul -= (1.0-factor_coul)*prefactor; + } + fpair += forcecoul*r2inv; + } + break; + + case COUL_DEBYE: + { + const F_FLOAT r2inv = F_F(1.0)/rsq; + const X_FLOAT r = _RSQRT_(r2inv); + const X_FLOAT rinv = F_F(1.0)/r; + const F_FLOAT screening = _EXP_(-_kappa*r); + F_FLOAT forcecoul = factor_coul * _qqrd2e * qiqj * screening ; + if(eflag) + { + ecoul += forcecoul*rinv; + } + forcecoul *= (_kappa + rinv); + fpair += forcecoul*r2inv; + } + break; + + case COUL_CUT: + { + const F_FLOAT forcecoul = factor_coul*_qqrd2e* qiqj*_RSQRT_(rsq); + if(eflag) + { + ecoul += forcecoul; + } + fpair += forcecoul*(F_F(1.0)/rsq); + } + break; + + + } + } + } + } + + + + if (in_cutoff||in_coul_cutoff) + { + F_FLOAT dxfp,dyfp,dzfp; + partialForce.x += dxfp = delx*fpair; + partialForce.y += dyfp = dely*fpair; + partialForce.z += dzfp = delz*fpair; + if(vflag) + { + partialVirial1.x+= delx*dxfp; + partialVirial1.y+= dely*dyfp; + partialVirial1.z+= delz*dzfp; + partialVirial2.x+= delx*dyfp; + 
partialVirial2.y+= delx*dzfp; + partialVirial2.z+= dely*dzfp; + } + } + } + } + + if(eflag) + { + sharedEnergy[threadIdx.x]= evdwl; + if(coul_type!=COUL_NONE) + sharedEnergyCoul[threadIdx.x]= ecoul; + } + sharedForce[threadIdx.x]=partialForce; + if(vflag) + { + sharedVirial1[threadIdx.x]=partialVirial1; + sharedVirial2[threadIdx.x]=partialVirial2; + } + + __syncthreads(); + + + for( unsigned int s = blockDim.x >> 1; s > 0; s >>= 1 ) + { + + if( threadIdx.x < s ) + { + sharedForce[ threadIdx.x ].x += sharedForce[ threadIdx.x + s ].x; + sharedForce[ threadIdx.x ].y += sharedForce[ threadIdx.x + s ].y; + sharedForce[ threadIdx.x ].z += sharedForce[ threadIdx.x + s ].z; + + if(vflag) + { + sharedVirial1[ threadIdx.x ].x += sharedVirial1[ threadIdx.x + s ].x; + sharedVirial1[ threadIdx.x ].y += sharedVirial1[ threadIdx.x + s ].y; + sharedVirial1[ threadIdx.x ].z += sharedVirial1[ threadIdx.x + s ].z; + + sharedVirial2[ threadIdx.x ].x += sharedVirial2[ threadIdx.x + s ].x; + sharedVirial2[ threadIdx.x ].y += sharedVirial2[ threadIdx.x + s ].y; + sharedVirial2[ threadIdx.x ].z += sharedVirial2[ threadIdx.x + s ].z; + } + + if(eflag) + { + sharedEnergy[ threadIdx.x ] += sharedEnergy[ threadIdx.x + s ]; + if(coul_type!=COUL_NONE) + sharedEnergyCoul[ threadIdx.x ] += sharedEnergyCoul[ threadIdx.x + s ]; + } + } + __syncthreads(); + } + + if(threadIdx.x == 0) + { + + ENERGY_FLOAT* buffer = (ENERGY_FLOAT*) _buffer; + if(eflag) + { + ENERGY_FLOAT tmp_evdwl; + buffer[blockIdx.x * gridDim.y + blockIdx.y + 0 * gridDim.x * gridDim.y]=tmp_evdwl=ENERGY_F(0.5) * sharedEnergy[0]; + if(eflag_atom) + _eatom[i] = tmp_evdwl; + buffer=&buffer[gridDim.x * gridDim.y]; + if(coul_type!=COUL_NONE) + { + buffer[blockIdx.x * gridDim.y + blockIdx.y + 0 * gridDim.x * gridDim.y]=tmp_evdwl=ENERGY_F(0.5) * sharedEnergyCoul[0]; + if(eflag_atom) + _eatom[i] += tmp_evdwl; + buffer=&buffer[gridDim.x * gridDim.y]; + } + } + if(vflag) + { + ENERGY_FLOAT tmp; + buffer[blockIdx.x * gridDim.y + blockIdx.y + 0 
* gridDim.x * gridDim.y]= tmp = ENERGY_F(0.5) * sharedVirial1[0].x; + if(vflag_atom) _vatom[i+0*_nmax] = tmp; + buffer[blockIdx.x * gridDim.y + blockIdx.y + 1 * gridDim.x * gridDim.y]= tmp = ENERGY_F(0.5) * sharedVirial1[0].y; + if(vflag_atom) _vatom[i+1*_nmax] = tmp; + buffer[blockIdx.x * gridDim.y + blockIdx.y + 2 * gridDim.x * gridDim.y]= tmp = ENERGY_F(0.5) * sharedVirial1[0].z; + if(vflag_atom) _vatom[i+2*_nmax] = tmp; + buffer[blockIdx.x * gridDim.y + blockIdx.y + 3 * gridDim.x * gridDim.y]= tmp = ENERGY_F(0.5) * sharedVirial2[0].x; + if(vflag_atom) _vatom[i+3*_nmax] = tmp; + buffer[blockIdx.x * gridDim.y + blockIdx.y + 4 * gridDim.x * gridDim.y]= tmp = ENERGY_F(0.5) * sharedVirial2[0].y; + if(vflag_atom) _vatom[i+4*_nmax] = tmp; + buffer[blockIdx.x * gridDim.y + blockIdx.y + 5 * gridDim.x * gridDim.y]= tmp = ENERGY_F(0.5) * sharedVirial2[0].z; + if(vflag_atom) _vatom[i+5*_nmax] = tmp; + buffer=&buffer[6 * gridDim.x * gridDim.y]; + } + F_FLOAT* my_f; + if(_collect_forces_later) + { + my_f = (F_FLOAT*) buffer; + my_f += i; + *my_f = sharedForce[0].x; my_f += _nmax; + *my_f = sharedForce[0].y; my_f += _nmax; + *my_f = sharedForce[0].z; + } + else + { + my_f = _f + i; + *my_f += sharedForce[0].x; my_f += _nmax; + *my_f += sharedForce[0].y; my_f += _nmax; + *my_f += sharedForce[0].z; + } + } +} + +__global__ void Pair_GenerateXType_Kernel() +{ + int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x; + if(i < _nall) + { + X_FLOAT4 xtype; + xtype.x=_x[i]; + xtype.y=_x[i+_nmax]; + xtype.z=_x[i+2*_nmax]; + xtype.w=_type[i]; + _x_type[i]=xtype; + } + +} + +__global__ void Pair_GenerateVRadius_Kernel() +{ + int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x; + if(i < _nall) + { + V_FLOAT4 vradius; + vradius.x=_v[i]; + vradius.y=_v[i+_nmax]; + vradius.z=_v[i+2*_nmax]; + vradius.w=_radius[i]; + _v_radius[i]=vradius; + } +} + +__global__ void Pair_GenerateOmegaRmass_Kernel() +{ + int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x; + if(i < 
_nall) + { + V_FLOAT4 omegarmass; + omegarmass.x=_omega[i]; + omegarmass.y=_omega[i+_nmax]; + omegarmass.z=_omega[i+2*_nmax]; + omegarmass.w=_rmass[i]; + _omega_rmass[i]=omegarmass; + } +} + +__global__ void Pair_RevertXType_Kernel() +{ + int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x; + if(i < _nall) + { + X_FLOAT4 xtype=_x_type[i]; + _x[i]=xtype.x; + _x[i+_nmax]=xtype.y; + _x[i+2*_nmax]=xtype.z; + _type[i]=static_cast (xtype.w); + } + +} + +__global__ void Pair_BuildXHold_Kernel() +{ + int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x; + if(i < _nall) + { + X_FLOAT4 xtype=_x_type[i]; + _xhold[i]=xtype.x; + _xhold[i+_nmax]=xtype.y; + _xhold[i+2*_nmax]=xtype.z; + } + +} + +__global__ void Pair_CollectForces_Kernel(int nperblock,int n) +{ + int i = (blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x; + if(i>=_nlocal) return; + ENERGY_FLOAT* buf = (ENERGY_FLOAT*) _buffer; + + F_FLOAT* buf_f = (F_FLOAT*) &buf[nperblock * n]; + F_FLOAT* my_f = _f + i; + buf_f += i; + *my_f += * buf_f; my_f+=_nmax; buf_f+=_nmax; + *my_f += * buf_f; my_f+=_nmax; buf_f+=_nmax; + *my_f += * buf_f; my_f+=_nmax; +} diff --git a/lib/cuda/cuda_pair_virial_kernel_nc.cu b/lib/cuda/cuda_pair_virial_kernel_nc.cu new file mode 100644 index 0000000000..8ea06604c9 --- /dev/null +++ b/lib/cuda/cuda_pair_virial_kernel_nc.cu @@ -0,0 +1,126 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. 
+ + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +extern __shared__ ENERGY_FLOAT sharedmem[]; + +static inline __device__ void PairVirialCompute_A_Kernel(int &eflag,int &vflag,int coulflag=0) +{ + __syncthreads(); + ENERGY_FLOAT* shared=sharedmem; + + if(eflag) + { + reduceBlock(shared); + shared+=blockDim.x; + if(coulflag) + { + reduceBlock(shared); + shared+=blockDim.x; + } + } + if(vflag) + { + reduceBlock(shared + 0 * blockDim.x); + reduceBlock(shared + 1 * blockDim.x); + reduceBlock(shared + 2 * blockDim.x); + reduceBlock(shared + 3 * blockDim.x); + reduceBlock(shared + 4 * blockDim.x); + reduceBlock(shared + 5 * blockDim.x); + } + if(threadIdx.x == 0) + { + shared=sharedmem; + ENERGY_FLOAT* buffer = (ENERGY_FLOAT*) _buffer; + if(eflag) + { + buffer[blockIdx.x * gridDim.y + blockIdx.y] = ENERGY_F(0.5)*shared[0]; + shared+=blockDim.x; buffer+=gridDim.x * gridDim.y; + if(coulflag) + { + buffer[blockIdx.x * gridDim.y + blockIdx.y] = ENERGY_F(0.5)*shared[0]; + shared+=blockDim.x; buffer+=gridDim.x * gridDim.y; + } + } + if(vflag) + { + buffer[blockIdx.x * gridDim.y + blockIdx.y + 0 * gridDim.x * gridDim.y] = ENERGY_F(0.5)*shared[0 * blockDim.x]; + buffer[blockIdx.x * gridDim.y + blockIdx.y + 1 * gridDim.x * gridDim.y] = ENERGY_F(0.5)*shared[1 * blockDim.x]; + buffer[blockIdx.x * gridDim.y + blockIdx.y + 2 * gridDim.x * gridDim.y] = ENERGY_F(0.5)*shared[2 * blockDim.x]; + buffer[blockIdx.x * gridDim.y + blockIdx.y + 3 * gridDim.x * gridDim.y] = ENERGY_F(0.5)*shared[3 * 
blockDim.x]; + buffer[blockIdx.x * gridDim.y + blockIdx.y + 4 * gridDim.x * gridDim.y] = ENERGY_F(0.5)*shared[4 * blockDim.x]; + buffer[blockIdx.x * gridDim.y + blockIdx.y + 5 * gridDim.x * gridDim.y] = ENERGY_F(0.5)*shared[5 * blockDim.x]; + } + } + __syncthreads(); +} + +__global__ void MY_AP(PairVirialCompute_reduce)(int n) +{ + sharedmem[threadIdx.x] = ENERGY_F(0.0); + ENERGY_FLOAT sum = ENERGY_F(0.0); + ENERGY_FLOAT* buf = (ENERGY_FLOAT*) _buffer; + buf = &buf[blockIdx.x * n]; + //if(blockIdx.x==2) buf=&buf[n]; + + for(int i = 0; i < n; i += blockDim.x) + { + sharedmem[threadIdx.x] = (i + threadIdx.x < n) ? buf[i + threadIdx.x] : ENERGY_F(0.0); + __syncthreads(); + reduceBlock(sharedmem); + if(threadIdx.x == 0) sum += sharedmem[0]; + } + if(threadIdx.x==0) + { + if(gridDim.x == 1) //evdwl + { + _eng_vdwl[0]+=sum; + } + if(gridDim.x == 2) //evdwl + ecoul only + { + if(blockIdx.x==0) + _eng_vdwl[0]+=sum; + else + _eng_coul[0]+=sum; + } + if(gridDim.x == 6) //virial + { + _virial[blockIdx.x] += sum; + } + if(gridDim.x == 7) //evdwl+virial + { + if(blockIdx.x==0) + _eng_vdwl[0]+=sum; + else _virial[blockIdx.x-1] += sum; + } + if(gridDim.x == 8) //evdwl+ecoul+virial + { + if(blockIdx.x==0) + _eng_vdwl[0]+=sum; + else + if(blockIdx.x==1) + _eng_coul[0]+=sum; + else + _virial[blockIdx.x-2] += sum; + } + } +} diff --git a/lib/cuda/cuda_precision.h b/lib/cuda/cuda_precision.h new file mode 100644 index 0000000000..5b7d6a6843 --- /dev/null +++ b/lib/cuda/cuda_precision.h @@ -0,0 +1,269 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. 
+ + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#ifndef CUDA_PRECISION_H_ +#define CUDA_PRECISION_H_ +/* This File gives Type definitions for mixed precision calculation in the cuda part of LAMMPS-CUDA. + * Predefined behaviour is given by global CUDA_PRECISION (can be overwritten during compilation). + * ***_FLOAT: type definition of given property + * ***_F: constant extension in code (1.0 is interpreted as double while 1.0f is interpreted as float, now use: 1.0CUDA_F) + */ + +#ifdef CUDA_USE_BINNING +#define CUDA_IF_BINNING(a) a +#else +#define CUDA_IF_BINNING(a) +#endif + +//GLOBAL + +#ifdef CUDA_PRECISION + #if CUDA_PRECISION == 1 + #define CUDA_FLOAT float + #define CUDA_F(x) x##f + #endif + #if CUDA_PRECISION == 2 + #define CUDA_FLOAT double + #define CUDA_F(x) x + #endif +#endif + +#ifndef CUDA_PRECISION + #define CUDA_FLOAT double + #define CUDA_F(x) x + #define CUDA_PRECISION 2 +#endif +//-------------------------------- +//-----------FFT----------------- +//-------------------------------- + +#ifdef FFT_PRECISION_CU + #if FFT_PRECISION_CU == 1 + #define FFT_FLOAT float + #define FFT_F(x) x##f + #endif + #if FFT_PRECISION_CU == 2 + #define FFT_FLOAT double + #define FFT_F(x) x + #endif +#endif + +#ifndef FFT_PRECISION_CU + #define FFT_FLOAT CUDA_FLOAT + #define FFT_F(x) CUDA_F(x) + #define FFT_PRECISION_CU CUDA_PRECISION +#endif + +//-------------------------------- +//-----------PPPM----------------- +//-------------------------------- + +#ifdef PPPM_PRECISION + 
#if PPPM_PRECISION == 1 + #define PPPM_FLOAT float + #define PPPM_F(x) x##f + #endif + #if PPPM_PRECISION == 2 + #define PPPM_FLOAT double + #define PPPM_F(x) x + #endif +#endif + +#ifndef PPPM_PRECISION + #define PPPM_FLOAT CUDA_FLOAT + #define PPPM_F(x) CUDA_F(x) + #define PPPM_PRECISION CUDA_PRECISION +#endif + +//-------------------------------- +//-----------FORCE----------------- +//-------------------------------- + + +#ifdef F_PRECISION + #if F_PRECISION == 1 + #define F_FLOAT float + #define F_F(x) x##f + #endif + #if F_PRECISION == 2 + #define F_FLOAT double + #define F_F(x) x + #endif +#endif + +#ifndef F_PRECISION + #define F_FLOAT CUDA_FLOAT + #define F_F(x) CUDA_F(x) + #define F_PRECISION CUDA_PRECISION +#endif + +#if F_PRECISION == 1 +#define _SQRT_ sqrtf +#define _RSQRT_ rsqrtf +#define _EXP_ expf +#else +#define _SQRT_ sqrt +#define _RSQRT_ rsqrt +#define _EXP_ exp +#endif + +#if F_PRECISION == 2 +struct F_FLOAT2 +{ + F_FLOAT x; + F_FLOAT y; +}; +struct F_FLOAT3 +{ + F_FLOAT x; + F_FLOAT y; + F_FLOAT z; +}; +struct F_FLOAT4 +{ + F_FLOAT x; + F_FLOAT y; + F_FLOAT z; + F_FLOAT w; +}; +#else +#define F_FLOAT2 float2 +#define F_FLOAT3 float3 +#define F_FLOAT4 float4 +#endif +//-------------------------------- +//-----------ENERGY----------------- +//-------------------------------- + +#ifndef ENERGY_PRECISION + #define ENERGY_FLOAT CUDA_FLOAT + #define ENERGY_F(x) CUDA_F(x) +#endif + +#ifdef ENERGY_PRECISION + #if ENERGY_PRECISION == 1 + #define ENERGY_FLOAT float + #define ENERGY_F(x) x##f + #endif + #if ENERGY_PRECISION == 2 + #define ENERGY_FLOAT double + #define ENERGY_F(x) x + #endif +#endif + +#ifndef ENERGY_PRECISION + #define ENERGY_FLOAT CUDA_FLOAT + #define ENERGY_F(x) CUDA_F(x) + #define ENERGY_PRECISION CUDA_PRECISION +#endif + +//-------------------------------- +//-----------POSITIONS------------ +//-------------------------------- + +#ifdef X_PRECISION + #if X_PRECISION == 1 + #define X_FLOAT float + #define X_F(x) x##f + #endif + #if 
X_PRECISION == 2 + #define X_FLOAT double + #define X_F(x) x + #endif +#endif + +#ifndef X_PRECISION + #define X_FLOAT CUDA_FLOAT + #define X_F(x) CUDA_F(x) + #define X_PRECISION CUDA_PRECISION +#endif + +#if X_PRECISION == 2 +struct X_FLOAT2 +{ + X_FLOAT x; + X_FLOAT y; +}; +struct X_FLOAT3 +{ + X_FLOAT x; + X_FLOAT y; + X_FLOAT z; +}; +struct X_FLOAT4 +{ + X_FLOAT x; + X_FLOAT y; + X_FLOAT z; + X_FLOAT w; +}; +#else +#define X_FLOAT2 float2 +#define X_FLOAT3 float3 +#define X_FLOAT4 float4 +#endif + +//-------------------------------- +//-----------velocities----------- +//-------------------------------- + +#ifdef V_PRECISION + #if V_PRECISION == 1 + #define V_FLOAT float + #define V_F(x) x##f + #endif + #if V_PRECISION == 2 + #define V_FLOAT double + #define V_F(x) x + #endif +#endif + +#ifndef V_PRECISION + #define V_FLOAT CUDA_FLOAT + #define V_F(x) CUDA_F(x) + #define V_PRECISION CUDA_PRECISION +#endif + +#if V_PRECISION == 2 +struct V_FLOAT4 +{ + V_FLOAT x; + V_FLOAT y; + V_FLOAT z; + V_FLOAT w; +}; +#else +#define V_FLOAT4 float4 +#endif + +#ifdef NO_PREC_TIMING +struct timespec_2 +{ + unsigned int tv_sec; + unsigned int tv_nsec; +}; + +#define timespec timespec_2 +#define clock_gettime(a,b) +#endif +#endif /*CUDA_PRECISION_H_*/ diff --git a/lib/cuda/cuda_shared.h b/lib/cuda/cuda_shared.h new file mode 100644 index 0000000000..f7983fff05 --- /dev/null +++ b/lib/cuda/cuda_shared.h @@ -0,0 +1,378 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. 
+ + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#ifndef _CUDA_SHARED_H_ +#define _CUDA_SHARED_H_ +#include "cuda_precision.h" + +#define CUDA_MAX_DEBUG_SIZE 1000 //size of debugdata array (allows for so many doubles or twice as many int) + +struct dev_array +{ + void* dev_data; // pointer to memory address on cuda device + unsigned dim[3]; // array dimensions +}; + +struct cuda_shared_atom // relevant data from atom class +{ + dev_array dx; // cumulated distance for binning settings + dev_array x; // position + dev_array v; // velocity + dev_array f; // force + dev_array tag; + dev_array type; // global ID number, there are ghosttype = ntypes (ntypescuda=ntypes+1) + dev_array mask; + dev_array image; + dev_array q; // charges + dev_array mass; // per-type masses + dev_array rmass; // per-atom masses + dev_array radius; // per-atom radius + dev_array density; + dev_array omega; + dev_array torque; + dev_array molecule; + + dev_array special; + int maxspecial; + dev_array nspecial; + int* special_flag; + int molecular; + + dev_array eatom; // per-atom energy + dev_array vatom; // per-atom virial + int need_eatom; + int need_vatom; + + dev_array x_type; // position + type in X_FLOAT4 struct + dev_array v_radius; // velocity + radius in V_FLOAT4 struct currently only used for granular atom_style + dev_array omega_rmass; // omega + rmass in V_FLOAT4 struct currently only used for granular atom_style + + double* mass_host; // remember per-type host pointer to masses + //int 
natoms; // total # of atoms in system, could be 0 + int nghost; // and ghost atoms on this proc + int nlocal; // # of owned + int nall; // total # of atoms in this proc + int nmax; // max # of owned+ghost in arrays on this proc + int ntypes; + int q_flag; // do we have charges? + int rmass_flag; // do we have per-atom masses? + int firstgroup; + int nfirst; + + int update_nlocal; + int update_nmax; + + dev_array xhold; // position at last neighboring + X_FLOAT triggerneighsq; // maximum square movement before reneighboring + int reneigh_flag; // is reneighboring necessary + int maxhold; // size of xhold + int dist_check; //perform distance check for reneighboring + dev_array binned_id; //id of each binned atom (not tag!!) + dev_array binned_idnew; //new id of each binned atom for sorting basically setting atom[binned_id[k]] at atom[binned_newid[k]] + float bin_extraspace; + int bin_dim[3]; + int bin_nmax; + dev_array map_array; +}; + +struct cuda_shared_pair // relevant data from pair class +{ + char cudable_force; // check for (cudable_force!=0) + X_FLOAT cut_global; + X_FLOAT cut_inner_global; + X_FLOAT cut_coul_global; + double** cut; // type-type cutoff + double** cutsq; // type-type cutoff + double** cut_inner; // type-type cutoff for coul + double** cut_coul; // type-type cutoff for coul + double** coeff1; // type-type pair parameters + double** coeff2; + double** coeff3; + double** coeff4; + double** coeff5; + double** coeff6; + double** coeff7; + double** coeff8; + double** coeff9; + double** coeff10; + double** offset; + double* special_lj; + double* special_coul; + dev_array virial; // ENERGY_FLOAT + dev_array eng_vdwl; // ENERGY_FLOAT + dev_array eng_coul; // ENERGY_FLOAT + X_FLOAT cut_coulsq_global; + F_FLOAT g_ewald,kappa; + int freeze_group_bit; + + dev_array coeff1_gm; + dev_array coeff2_gm; + dev_array coeff3_gm; + dev_array coeff4_gm; + dev_array coeff5_gm; + dev_array coeff6_gm; + dev_array coeff7_gm; + dev_array coeff8_gm; + dev_array coeff9_gm; 
+ dev_array coeff10_gm; + + int lastgridsize; + int n_energy_virial; + int collect_forces_later; + int use_block_per_atom; + int override_block_per_atom; + +}; + +struct cuda_shared_domain // relevent data from domain class +{ + X_FLOAT sublo[3]; // orthogonal box -> sub-box bounds on this proc + X_FLOAT subhi[3]; + X_FLOAT boxlo[3]; + X_FLOAT boxhi[3]; + X_FLOAT prd[3]; + int periodicity[3]; // xyz periodicity as array + + int triclinic; + X_FLOAT xy; + X_FLOAT xz; + X_FLOAT yz; + X_FLOAT boxlo_lamda[3]; + X_FLOAT boxhi_lamda[3]; + X_FLOAT prd_lamda[3]; + X_FLOAT h[6]; + X_FLOAT h_inv[6]; + V_FLOAT h_rate[6]; + int update; +}; + +struct cuda_shared_pppm +{ + char cudable_force; +#ifdef FFT_CUFFT + FFT_FLOAT* work1; + FFT_FLOAT* work2; + FFT_FLOAT* work3; + PPPM_FLOAT* greensfn; + PPPM_FLOAT* fkx; + PPPM_FLOAT* fky; + PPPM_FLOAT* fkz; + PPPM_FLOAT* vg; +#endif + int* part2grid; + PPPM_FLOAT* density_brick; + int* density_brick_int; + PPPM_FLOAT density_intScale; + PPPM_FLOAT* vdx_brick; + PPPM_FLOAT* vdy_brick; + PPPM_FLOAT* vdz_brick; + PPPM_FLOAT* density_fft; + ENERGY_FLOAT* energy; + ENERGY_FLOAT* virial; + int nxlo_in; + int nxhi_in; + int nxlo_out; + int nxhi_out; + int nylo_in; + int nyhi_in; + int nylo_out; + int nyhi_out; + int nzlo_in; + int nzhi_in; + int nzlo_out; + int nzhi_out; + int nx_pppm; + int ny_pppm; + int nz_pppm; + PPPM_FLOAT qqrd2e; + int order; + // float3 sublo; + PPPM_FLOAT* rho_coeff; + int nmax; + int nlocal; + PPPM_FLOAT* debugdata; + PPPM_FLOAT delxinv; + PPPM_FLOAT delyinv; + PPPM_FLOAT delzinv; + int nlower; + int nupper; + PPPM_FLOAT shiftone; + +}; + +struct cuda_shared_comm +{ + int maxswap; + int maxlistlength; + dev_array pbc; + dev_array slablo; + dev_array slabhi; + dev_array multilo; + dev_array multihi; + dev_array sendlist; + int grow_flag; + int comm_phase; + + int nsend; + int* nsend_swap; + int* send_size; + int* recv_size; + double** buf_send; + void** buf_send_dev; + double** buf_recv; + void** buf_recv_dev; + void* 
buffer; + int buffer_size; + double overlap_split_ratio; +}; + +struct cuda_shared_neighlist // member of CudaNeighList, has no instance in cuda_shared_data +{ + int maxlocal; + int inum; // # of I atoms neighbors are stored for local indices of I atoms + int inum_border2; + dev_array inum_border; // # of atoms which interact with border atoms + dev_array ilist; + dev_array ilist_border; + dev_array numneigh; + dev_array numneigh_inner; + dev_array numneigh_border; + dev_array firstneigh; + dev_array neighbors; + dev_array neighbors_border; + dev_array neighbors_inner; + int maxpage; + dev_array page_pointers; + dev_array* pages; + int maxneighbors; + int neigh_lists_per_page; + double** cutneighsq; + CUDA_FLOAT* cu_cutneighsq; + int* binned_id; + int* bin_dim; + int bin_nmax; + float bin_extraspace; + double maxcut; + dev_array ex_type; + int nex_type; + dev_array ex1_bit; + dev_array ex2_bit; + int nex_group; + dev_array ex_mol_bit; + int nex_mol; + +}; + +struct cuda_compile_settings // this is used to compare compile settings (i.e. 
precision) of the cu files, and the cpp files +{ + int prec_glob; + int prec_x; + int prec_v; + int prec_f; + int prec_pppm; + int prec_fft; + int cufft; + int arch; +}; + +struct cuda_timings_struct +{ + //Debug: + double test1; + double test2; + //transfers + double transfer_upload_tmp_constr; + double transfer_download_tmp_deconstr; + + //communication + double comm_forward_total; + double comm_forward_mpi_upper; + double comm_forward_mpi_lower; + double comm_forward_kernel_pack; + double comm_forward_kernel_unpack; + double comm_forward_kernel_self; + double comm_forward_upload; + double comm_forward_download; + + double comm_exchange_total; + double comm_exchange_mpi; + double comm_exchange_kernel_pack; + double comm_exchange_kernel_unpack; + double comm_exchange_kernel_fill; + double comm_exchange_cpu_pack; + double comm_exchange_upload; + double comm_exchange_download; + + double comm_border_total; + double comm_border_mpi; + double comm_border_kernel_pack; + double comm_border_kernel_unpack; + double comm_border_kernel_self; + double comm_border_kernel_buildlist; + double comm_border_upload; + double comm_border_download; + + //pair forces + double pair_xtype_conversion; + double pair_kernel; + double pair_virial; + double pair_force_collection; + + //neighbor + double neigh_bin; + double neigh_build; + double neigh_special; + + //PPPM + double pppm_particle_map; + double pppm_make_rho; + double pppm_brick2fft; + double pppm_poisson; + double pppm_fillbrick; + double pppm_fieldforce; + double pppm_compute; + +}; + +struct cuda_shared_data // holds space for all relevant data from the different classes +{ + void* buffer; //holds temporary GPU data [data used in subroutines, which does not have to be consistent outside of that routine] + int buffersize; //maxsize of buffer + int buffer_new; //should be 1 if the pointer to buffer has changed + void* flag; + void* debugdata; //array for easily collecting debugdata from device class cuda contains the corresponding 
cu_debugdata and host array + cuda_shared_atom atom; + cuda_shared_pair pair; + cuda_shared_domain domain; + cuda_shared_pppm pppm; + cuda_shared_comm comm; + cuda_compile_settings compile_settings; + cuda_timings_struct cuda_timings; + int exchange_dim; + int me; //mpi rank + unsigned int datamask; + int overlap_comm; +}; + + +#endif // #ifndef _CUDA_SHARED_H_ diff --git a/lib/cuda/cuda_wrapper.cu b/lib/cuda/cuda_wrapper.cu new file mode 100644 index 0000000000..d74f731da0 --- /dev/null +++ b/lib/cuda/cuda_wrapper.cu @@ -0,0 +1,315 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ + +#include +#include "cuda_shared.h" +#include "cuda_common.h" +#include "cuda_wrapper_cu.h" +#include "cuda_wrapper_kernel.cu" + +static int CudaWrapper_total_gpu_mem=0; +static double CudaWrapper_total_upload_time=0; +static double CudaWrapper_total_download_time=0; +static double CudaWrapper_cpubuffer_upload_time=0; +static double CudaWrapper_cpubuffer_download_time=0; +static cudaStream_t* streams; +static int nstreams=0; + +void CudaWrapper_Init(int argc, char** argv,int me,int ppn,int* devicelist) +{ + MYDBG( printf("# CUDA: debug mode on\n"); ) + + #if __DEVICE_EMULATION__ + + printf("# CUDA: emulation mode on\n"); + + #else + + // modified from cutil.h + static int deviceCount=0; + static bool sharedmode=false; + if(deviceCount && !sharedmode) return; + if(deviceCount && sharedmode) cudaThreadExit(); + + CUDA_SAFE_CALL_NO_SYNC( cudaGetDeviceCount(&deviceCount) ); + if (deviceCount == 0) + { + fprintf(stderr, "cutil error: no devices supporting CUDA.\n"); + exit(EXIT_FAILURE); + } + MYDBG( printf("# CUDA There are %i devices supporting CUDA in this system.\n",deviceCount);) + + cudaDeviceProp deviceProp[deviceCount]; + for(int i=0;ideviceCount) {printf("Asking for more GPUs per node when there are. Reduce gpu/node setting.\n"); exit(0);} + int devicea=me%ppn; + if(devicelist) devicea=devicelist[devicea]; + else + devicea=dev_list[devicea]; + if(devicea>=deviceCount) {printf("Asking for non existent GPU %i. 
Found only %i GPUs.\n",devicea,deviceCount); exit(0);} + MYDBG( + printf(" # CUDA myid: %i take device: %i\n",me,devicea); + ) + CUDA_SAFE_CALL( cudaSetDevice(devicea) ); + } + else + { + CUDA_SAFE_CALL( cudaSetValidDevices(dev_list,deviceCount) ); + } + cudaSetDeviceFlags(cudaDeviceMapHost); + cudaThreadSynchronize(); + + int dev; + CUDA_SAFE_CALL( cudaGetDevice(&dev)); + + if (deviceProp[dev].major < 1) + { + fprintf(stderr, "CUDA error: device does not support CUDA.\n"); + exit(EXIT_FAILURE); + } + else + if ((deviceProp[dev].major == 1)&&(deviceProp[dev].minor != 3)) + { + fprintf(stderr, "CUDA error: You need a device with compute capability 1.3 or higher (Device %i is a %s with CC %i.%i)\n",dev,deviceProp[dev].name,deviceProp[dev].major,deviceProp[dev].minor); + exit(EXIT_FAILURE); + } + if ((deviceProp[dev].major == 2)&&(CUDA_ARCH<20)) + { + fprintf(stderr, "CUDA warning: You are using a compute %i.%i or higher GPU while LAMMPScuda has been compiled for architecture 1.3\n",deviceProp[dev].major,deviceProp[dev].minor); + } + if ((deviceProp[dev].major == 1)&&(CUDA_ARCH>=20)) + { + fprintf(stderr, "CUDA error: You are using a compute 1.3 GPU while LAMMPScuda has been compiled for architecture %i\n",CUDA_ARCH); + exit(EXIT_FAILURE); + } + + +fprintf(stderr, "# Using device %d: %s\n", dev, deviceProp[dev].name); + MYDBG( fprintf(stderr, "# Using device %d: %s\n", dev, deviceProp[dev].name);) + + MYDBG + ( + printf("name = %s\n", deviceProp[dev].name); + printf("totalGlobalMem = %u\n", deviceProp[dev].totalGlobalMem); + printf("sharedMemPerBlock = %i\n", deviceProp[dev].sharedMemPerBlock); + printf("regsPerBlock = %i\n", deviceProp[dev].regsPerBlock); + printf("warpSize = %i\n", deviceProp[dev].warpSize); + printf("memPitch = %i\n", deviceProp[dev].memPitch); + printf("maxThreadsPerBlock = %i\n", deviceProp[dev].maxThreadsPerBlock); + printf("maxThreadsDim = [%i, %i, %i]\n", deviceProp[dev].maxThreadsDim[0], deviceProp[dev].maxThreadsDim[1], 
deviceProp[dev].maxThreadsDim[2]); + printf("maxGridSize = [%i, %i, %i]\n", deviceProp[dev].maxGridSize[0], deviceProp[dev].maxGridSize[1], deviceProp[dev].maxGridSize[2]); + printf("totalConstMem = %i\n", deviceProp[dev].totalConstMem); + printf("major . minor = %i . %i\n", deviceProp[dev].major, deviceProp[dev].minor); + printf("clockRate = %i\n", deviceProp[dev].clockRate); + printf("textureAlignment = %i\n", deviceProp[dev].textureAlignment); + printf("deviceOverlap = %i\n", deviceProp[dev].deviceOverlap); + printf("multiProcessorCount = %i\n", deviceProp[dev].multiProcessorCount); + printf("computeMode = %i\n", deviceProp[dev].computeMode); + ) + + #endif + } + +void* CudaWrapper_AllocCudaData(unsigned nbytes) +{ + void* dev_data; + CUDA_SAFE_CALL( cudaMalloc((void**)&dev_data, nbytes) ); + MYDBG( printf("# CUDA: allocated %u bytes on device at dev%p\n", nbytes, dev_data); ) + CudaWrapper_total_gpu_mem+=nbytes; + return dev_data; +} + +void CudaWrapper_UploadCudaData(void* host_data, void* dev_data, unsigned nbytes) +{ + MYDBG( printf("# CUDA: uploading %u bytes to device at dev%p from %p\n", nbytes, dev_data,host_data); ) + cudaThreadSynchronize(); + timespec time1,time2; + clock_gettime(CLOCK_REALTIME,&time1); + CUDA_SAFE_CALL( cudaMemcpy(dev_data, host_data, nbytes, cudaMemcpyHostToDevice) ); + clock_gettime(CLOCK_REALTIME,&time2); + CudaWrapper_total_upload_time+= + time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000; +} + +void CudaWrapper_UploadCudaDataAsync(void* host_data, void* dev_data, unsigned nbytes,int stream) +{ + MYDBG( printf("# CUDA: downloading %u bytes from device at dev%p\n", nbytes, dev_data); ) + cudaMemcpyAsync(dev_data, host_data, nbytes, cudaMemcpyHostToDevice,streams[stream]); +} + +void CudaWrapper_DownloadCudaData(void* host_data, void* dev_data, unsigned nbytes) +{ + MYDBG( printf("# CUDA: downloading %u bytes from device at dev%p\n", nbytes, dev_data); ) + cudaThreadSynchronize(); + timespec time1,time2; + 
clock_gettime(CLOCK_REALTIME,&time1); + CUDA_SAFE_CALL( cudaMemcpy(host_data, dev_data, nbytes, cudaMemcpyDeviceToHost) ); + clock_gettime(CLOCK_REALTIME,&time2); + CudaWrapper_total_download_time+= + time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000; +} + +void CudaWrapper_DownloadCudaDataAsync(void* host_data, void* dev_data, unsigned nbytes,int stream) +{ + MYDBG( printf("# CUDA: downloading %u bytes from device at dev%p\n", nbytes, dev_data); ) + cudaMemcpyAsync(host_data, dev_data, nbytes, cudaMemcpyDeviceToHost,streams[stream]); +} + +void CudaWrapper_FreeCudaData(void* dev_data,unsigned nbytes) +{ + MYDBG( printf("# CUDA: freeing memory at dev%p with %i bytes (last adress: %p)\n", dev_data,nbytes,(char*)dev_data+nbytes); ) + CUDA_SAFE_CALL( cudaFree(dev_data) ); + CudaWrapper_total_gpu_mem-=nbytes; +} + +void CudaWrapper_Memset(void* dev_data, int value, unsigned nbytes) +{ + MYDBG( printf("# CUDA: setting %u bytes to %i at dev%p\n", nbytes, value, dev_data); ) + CUDA_SAFE_CALL( cudaMemset(dev_data, value, nbytes) ); +} + +void CudaWrapper_CopyData(void* dev_dest, void* dev_source, unsigned nbytes) +{ + MYDBG( printf("# CUDA: copy %u bytes from dev%p to dev%p\n", nbytes, dev_source,dev_dest); ) + CUDA_SAFE_CALL( cudaMemcpy(dev_dest, dev_source, nbytes, cudaMemcpyDeviceToDevice) ); +} + +void* CudaWrapper_AllocPinnedHostData(unsigned nbytes,bool mapped,bool writeCombined) +{ + void* host_data; + int flags=0; + if(mapped) flags=flags | cudaHostAllocMapped; + if(writeCombined) flags=flags | cudaHostAllocWriteCombined; + + CUDA_SAFE_CALL( cudaHostAlloc((void**)&host_data, nbytes,flags) ); +// CUDA_SAFE_CALL( cudaMallocHost((void**)&host_data, nbytes) ); + MYDBG( printf("# CUDA: allocated %u bytes pinned memory on host at %p\n", nbytes, host_data); ) + return host_data; +} + +void CudaWrapper_FreePinnedHostData(void* host_data) +{ + MYDBG( printf("# CUDA: freeing pinned host memory at %p \n",host_data); ) + if(host_data) + CUDA_SAFE_CALL( 
cudaFreeHost(host_data) ); +} + +void cuda_check_error(char* comment) +{ + printf("ERROR-CUDA %s %s\n",comment,cudaGetErrorString(cudaGetLastError())); +} + +int CudaWrapper_CheckMemUseage() +{ + size_t free,total; + cudaMemGetInfo(&free,&total); + return total-free; //possible with cuda 3.0 ??? + //return CudaWrapper_total_gpu_mem; +} + +double CudaWrapper_CheckUploadTime(bool reset) +{ + if(reset) CudaWrapper_total_upload_time=0.0; + return CudaWrapper_total_upload_time; +} + +double CudaWrapper_CheckDownloadTime(bool reset) +{ + if(reset) CudaWrapper_total_download_time=0.0; + return CudaWrapper_total_download_time; +} + +double CudaWrapper_CheckCPUBufUploadTime(bool reset) +{ + if(reset) CudaWrapper_cpubuffer_upload_time=0.0; + return CudaWrapper_cpubuffer_upload_time; +} + +double CudaWrapper_CheckCPUBufDownloadTime(bool reset) +{ + if(reset) CudaWrapper_cpubuffer_download_time=0.0; + return CudaWrapper_cpubuffer_download_time; +} + +void CudaWrapper_AddCPUBufUploadTime(double dt) +{ + CudaWrapper_cpubuffer_upload_time+=dt; +} + +void CudaWrapper_AddCPUBufDownloadTime(double dt) +{ + CudaWrapper_cpubuffer_download_time+=dt; +} + +void CudaWrapper_Sync() +{ + cudaThreadSynchronize(); +} + +void CudaWrapper_SyncStream(int stream) +{ + cudaStreamSynchronize(streams[stream]); +} + +void CudaWrapper_AddStreams(int n) +{ + cudaStream_t* new_streams=new cudaStream_t[nstreams+n]; + for(int i=0;i0) + delete [] streams; + streams=new_streams; + nstreams+=n; +} + +void* CudaWrapper_returnStreams() +{ + return (void*) streams; +} + +int CudaWrapper_returnNStreams() +{ + return nstreams; +} + diff --git a/lib/cuda/cuda_wrapper_cu.h b/lib/cuda/cuda_wrapper_cu.h new file mode 100644 index 0000000000..85d51a8586 --- /dev/null +++ b/lib/cuda/cuda_wrapper_cu.h @@ -0,0 +1,52 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia 
National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#ifndef _CUDA_DATA_WRAPPER_H_ +#define _CUDA_DATA_WRAPPER_H_ + +extern "C" void CudaWrapper_Init(int argc, char** argv,int me=0,int ppn=2,int* devicelist=NULL); +extern "C" void* CudaWrapper_AllocCudaData(unsigned nbytes); +extern "C" void CudaWrapper_UploadCudaData(void* host_data, void* dev_data, unsigned nbytes); +extern "C" void CudaWrapper_UploadCudaDataAsync(void* host_data, void* dev_data, unsigned nbytes, int stream_id); +extern "C" void CudaWrapper_DownloadCudaData(void* host_data, void* dev_data, unsigned nbytes); +extern "C" void CudaWrapper_DownloadCudaDataAsync(void* host_data, void* dev_data, unsigned nbytes, int stream_id); +extern "C" void CudaWrapper_FreeCudaData(void* dev_data, unsigned nbytes=0); +extern "C" void CudaWrapper_Memset(void* dev_data, int value, unsigned nbytes); +extern "C" void CudaWrapper_CopyData(void* dev_dest, void* dev_source, unsigned nbytes); +extern "C" void* CudaWrapper_AllocPinnedHostData(unsigned nbytes, bool mapped=false, bool writeCombind=false); +extern "C" void CudaWrapper_FreePinnedHostData(void* dev_data); +extern "C" void cuda_check_error(char* comment); +extern "C" int CudaWrapper_CheckMemUseage(); +extern "C" double CudaWrapper_CheckUploadTime(bool reset=false); +extern "C" double CudaWrapper_CheckDownloadTime(bool reset=false); +extern "C" double 
CudaWrapper_CheckCPUBufUploadTime(bool reset=false); +extern "C" double CudaWrapper_CheckCPUBufDownloadTime(bool reset=false); +extern "C" void CudaWrapper_AddCPUBufUploadTime(double dt); +extern "C" void CudaWrapper_AddCPUBufDownloadTime(double dt); +extern "C" void CudaWrapper_Sync(); +extern "C" void CudaWrapper_SyncStream(int n); +extern "C" void CudaWrapper_AddStreams(int n); +extern "C" void* CudaWrapper_returnStreams(); +extern "C" int CudaWrapper_returnNStreams(); + +#endif // _CUDA_DATA_WRAPPER_H_ diff --git a/lib/cuda/cuda_wrapper_kernel.cu b/lib/cuda/cuda_wrapper_kernel.cu new file mode 100644 index 0000000000..951563b67b --- /dev/null +++ b/lib/cuda/cuda_wrapper_kernel.cu @@ -0,0 +1,24 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ + +// empty file to obey common make rule diff --git a/lib/cuda/domain.cu b/lib/cuda/domain.cu new file mode 100644 index 0000000000..0f1583dda1 --- /dev/null +++ b/lib/cuda/domain.cu @@ -0,0 +1,194 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ + +#include +#define MY_PREFIX domain +#include "cuda_shared.h" +#include "cuda_common.h" + +#include "crm_cuda_utils.cu" + +#include "domain_cu.h" +#include "domain_kernel.cu" + +void Cuda_Domain_UpdateBuffer(cuda_shared_data* sdata,int size) +{ + if(sdata->buffersizebuffer,sdata->buffersize);) + CudaWrapper_FreeCudaData(sdata->buffer,sdata->buffersize); + sdata->buffer = CudaWrapper_AllocCudaData(size); + sdata->buffersize=size; + sdata->buffer_new++; + MYDBG(printf("New buffer at %p with %i kB\n",sdata->buffer,sdata->buffersize);) + } + cudaMemcpyToSymbol(MY_CONST(buffer), & sdata->buffer, sizeof(int*) ); +} + +void Cuda_Domain_UpdateNmax(cuda_shared_data* sdata) +{ + cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); + cudaMemcpyToSymbol(MY_CONST(nmax) , & sdata->atom.nmax , sizeof(int) ); + cudaMemcpyToSymbol(MY_CONST(x) , & sdata->atom.x .dev_data, sizeof(X_FLOAT*) ); + cudaMemcpyToSymbol(MY_CONST(v) , & sdata->atom.v .dev_data, sizeof(V_FLOAT*) ); + cudaMemcpyToSymbol(MY_CONST(mask) , & sdata->atom.mask .dev_data, sizeof(int*) ); + cudaMemcpyToSymbol(MY_CONST(tag) , & sdata->atom.tag .dev_data, sizeof(int*) ); + cudaMemcpyToSymbol(MY_CONST(image) , & sdata->atom.image.dev_data, sizeof(int*) ); +} + +void Cuda_Domain_UpdateDomain(cuda_shared_data* sdata) +{ + cudaMemcpyToSymbol(MY_CONST(boxlo) , sdata->domain.boxlo , 3*sizeof(X_FLOAT)); + cudaMemcpyToSymbol(MY_CONST(boxhi) , sdata->domain.boxhi , 3*sizeof(X_FLOAT)); + cudaMemcpyToSymbol(MY_CONST(sublo) , sdata->domain.sublo , 3*sizeof(X_FLOAT)); + cudaMemcpyToSymbol(MY_CONST(subhi) , sdata->domain.subhi , 3*sizeof(X_FLOAT)); + cudaMemcpyToSymbol(MY_CONST(prd) , sdata->domain.prd , 3*sizeof(X_FLOAT)); + cudaMemcpyToSymbol(MY_CONST(periodicity) , sdata->domain.periodicity , 3*sizeof(int)); + cudaMemcpyToSymbol(MY_CONST(triclinic) , & sdata->domain.triclinic , sizeof(int) ); + 
cudaMemcpyToSymbol(MY_CONST(boxlo_lamda) , sdata->domain.boxlo_lamda , 3*sizeof(X_FLOAT)); + cudaMemcpyToSymbol(MY_CONST(boxhi_lamda) , sdata->domain.boxhi_lamda , 3*sizeof(X_FLOAT)); + cudaMemcpyToSymbol(MY_CONST(prd_lamda) , sdata->domain.prd_lamda , 3*sizeof(X_FLOAT)); + cudaMemcpyToSymbol(MY_CONST(h) , sdata->domain.h , 6*sizeof(X_FLOAT)); + cudaMemcpyToSymbol(MY_CONST(h_inv) , sdata->domain.h_inv , 6*sizeof(X_FLOAT)); + cudaMemcpyToSymbol(MY_CONST(h_rate) , sdata->domain.h_rate , 6*sizeof(V_FLOAT)); + cudaMemcpyToSymbol(MY_CONST(flag) , &sdata->flag , sizeof(int*)); + cudaMemcpyToSymbol(MY_CONST(debugdata) , &sdata->debugdata , sizeof(int*)); +} + +void Cuda_Domain_Init(cuda_shared_data* sdata) +{ + Cuda_Domain_UpdateNmax(sdata); + Cuda_Domain_UpdateDomain(sdata); +} + +void Cuda_Domain_PBC(cuda_shared_data* sdata,int deform_remap,int deform_groupbit,double* extent) +{ + Cuda_Domain_UpdateNmax(sdata); + //if(sdata->domain.update) + Cuda_Domain_UpdateDomain(sdata); + cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); + + int box_change=0; + if(extent) box_change=1; + + int sharedmem=0; + if(box_change) sharedmem=6*sizeof(X_FLOAT); + + int3 layout=getgrid(sdata->atom.nlocal,sharedmem); + dim3 threads(layout.z, 1, 1); + dim3 grid(layout.x, layout.y, 1); + sharedmem*=threads.x; + + if((box_change)&&(sdata->buffer_new or (6*sizeof(X_FLOAT)*grid.x*grid.y>sdata->buffersize))) + Cuda_Domain_UpdateBuffer(sdata,layout.x*layout.y*6*sizeof(X_FLOAT)); + + + Domain_PBC_Kernel<<>>(deform_remap,deform_groupbit,box_change); + cudaThreadSynchronize(); + + CUT_CHECK_ERROR("Cuda_Domain_PBC: Kernel execution failed"); + if(box_change) + { + X_FLOAT buf2[6*layout.x*layout.y]; + X_FLOAT* buf=buf2; + int flag; + cudaMemcpy(buf, sdata->buffer, 6*layout.x*layout.y*sizeof(X_FLOAT), cudaMemcpyDeviceToHost); + cudaMemcpy(&flag, sdata->flag, sizeof(int), cudaMemcpyDeviceToHost); + //printf("Flag: %i\n",flag); + X_FLOAT min,max; + min=1.0*BIG; + max=-1.0*BIG; + 
for(int i=0;imax) max=buf[i+layout.x*layout.y]; + } + extent[0]=min; + extent[1]=max; + + buf+=2*layout.x*layout.y; + min=1.0*BIG; + max=-1.0*BIG; + for(int i=0;imax) max=buf[i+layout.x*layout.y]; + } + extent[2]=min; + extent[3]=max; + + buf+=2*layout.x*layout.y; + min=1.0*BIG; + max=-1.0*BIG; + for(int i=0;imax) max=buf[i+layout.x*layout.y]; + } + extent[4]=min; + extent[5]=max; + //printf("Extent: %lf %lf %lf %lf %lf %lf\n",extent[0],extent[1],extent[2],extent[3],extent[4],extent[5]); +/* int n=grid.x*grid.y; + if(n<128) threads.x=32; + else if(n<256) threads.x=64; + else threads.x=128; + sharedmem=n*sizeof(X_FLOAT); + grid.x=6; + grid.y=1; + Domain_reduceBoxExtent<<>>(extent,n); + cudaThreadSynchronize(); + CUT_CHECK_ERROR("Cuda_Domain_reduceBoxExtent: Kernel execution failed");*/ + } +} + +void Cuda_Domain_lamda2x(cuda_shared_data* sdata,int n) +{ + Cuda_Domain_UpdateNmax(sdata); + //if(sdata->domain.update) + Cuda_Domain_UpdateDomain(sdata); + cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); + + int3 layout=getgrid(n); + dim3 threads(layout.z, 1, 1); + dim3 grid(layout.x, layout.y, 1); + + Domain_lamda2x_Kernel<<>>(n); + cudaThreadSynchronize(); + CUT_CHECK_ERROR("Cuda_Domain_lamda2x: Kernel execution failed"); +} + +void Cuda_Domain_x2lamda(cuda_shared_data* sdata,int n) +{ + Cuda_Domain_UpdateNmax(sdata); + //if(sdata->domain.update) + Cuda_Domain_UpdateDomain(sdata); + cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); + + int3 layout=getgrid(n); + dim3 threads(layout.z, 1, 1); + dim3 grid(layout.x, layout.y, 1); + + Domain_x2lamda_Kernel<<>>(n); + cudaThreadSynchronize(); + CUT_CHECK_ERROR("Cuda_Domain_x2lamda: Kernel execution failed"); +} diff --git a/lib/cuda/domain_cu.h b/lib/cuda/domain_cu.h new file mode 100644 index 0000000000..f04e5610c2 --- /dev/null +++ b/lib/cuda/domain_cu.h @@ -0,0 +1,29 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale 
Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#include "cuda_shared.h" + +extern "C" void Cuda_Domain_Init(cuda_shared_data* sdata); +extern "C" void Cuda_Domain_PBC(cuda_shared_data* sdata,int deform_remap,int deform_groupbit,double* extent=NULL); +extern "C" void Cuda_Domain_lamda2x(cuda_shared_data* sdata,int n); +extern "C" void Cuda_Domain_x2lamda(cuda_shared_data* sdata,int n); diff --git a/lib/cuda/domain_kernel.cu b/lib/cuda/domain_kernel.cu new file mode 100644 index 0000000000..ec5ef897c1 --- /dev/null +++ b/lib/cuda/domain_kernel.cu @@ -0,0 +1,269 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. 
+ + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +extern __shared__ X_FLOAT sharedmem[]; + +#define BIG 1e10 +__global__ void Domain_PBC_Kernel(int deform_remap,int deform_groupbit,int box_change) +{ + int idim,otherdims; + int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x; + + X_FLOAT lo[3]; + X_FLOAT hi[3]; + X_FLOAT* period; + + if (_triclinic == 0) { + lo[0] = _boxlo[0]; + lo[1] = _boxlo[1]; + lo[2] = _boxlo[2]; + + hi[0] = _boxhi[0]; + hi[1] = _boxhi[1]; + hi[2] = _boxhi[2]; + period = _prd; + } else { + lo[0] = _boxlo_lamda[0]; + lo[1] = _boxlo_lamda[1]; + lo[2] = _boxlo_lamda[2]; + + hi[0] = _boxhi_lamda[0]; + hi[1] = _boxhi_lamda[1]; + hi[2] = _boxhi_lamda[2]; + period = _prd_lamda; + } + + + X_FLOAT tmpx=X_F(0.5)*(hi[0]+lo[0]); + X_FLOAT tmpy=X_F(0.5)*(hi[1]+lo[1]); + X_FLOAT tmpz=X_F(0.5)*(hi[2]+lo[2]); + + X_FLOAT* buf=(X_FLOAT*) _buffer; + buf+=blockIdx.x*gridDim.y+blockIdx.y; + buf[0]=tmpx; + buf+=gridDim.x*gridDim.y; + buf[0]=tmpx; + buf+=gridDim.x*gridDim.y; + buf[0]=tmpy; + buf+=gridDim.x*gridDim.y; + buf[0]=tmpy; + buf+=gridDim.x*gridDim.y; + buf[0]=tmpz; + buf+=gridDim.x*gridDim.y; + buf[0]=tmpz; + + if(i<_nlocal) + { + + if (_periodicity[0]) { + if (_x[i] < lo[0]) { + _x[i] += period[0]; + if (deform_remap && _mask[i] & deform_groupbit) _v[i] += _h_rate[0]; + idim = _image[i] & 1023; + otherdims = _image[i] ^ idim; + idim--; + idim &= 1023; + _image[i] = otherdims | idim; + } + if (_x[i] >= hi[0]) { + _x[i] -= period[0]; + _x[i] = 
MAX(_x[i],lo[0]); + if (deform_remap && _mask[i] & deform_groupbit) _v[i] -= _h_rate[0]; + idim = _image[i] & 1023; + otherdims = _image[i] ^ idim; + idim++; + idim &= 1023; + _image[i] = otherdims | idim; + } + } + + if (_periodicity[1]) { + if (_x[i+_nmax] < lo[1]) { + _x[i+_nmax] += period[1]; + if (deform_remap && _mask[i] & deform_groupbit) { + _v[i] += _h_rate[5]; + _v[i+_nmax] += _h_rate[1]; + } + idim = (_image[i] >> 10) & 1023; + otherdims = _image[i] ^ (idim << 10); + idim--; + idim &= 1023; + _image[i] = otherdims | (idim << 10); + } + if (_x[i+_nmax] >= hi[1]) { + _x[i+_nmax] -= period[1]; + _x[i+_nmax] = MAX(_x[i+_nmax],lo[1]); + if (deform_remap && _mask[i] & deform_groupbit) { + _v[i] -= _h_rate[5]; + _v[i+_nmax] -= _h_rate[1]; + } + idim = (_image[i] >> 10) & 1023; + otherdims = _image[i] ^ (idim << 10); + idim++; + idim &= 1023; + _image[i] = otherdims | (idim << 10); + } + } + + if (_periodicity[2]) { + if (_x[i+2*_nmax] < lo[2]) { + _x[i+2*_nmax] += period[2]; + if (deform_remap && _mask[i] & deform_groupbit) { + _v[i] += _h_rate[4]; + _v[i+_nmax] += _h_rate[3]; + _v[i+2*_nmax] += _h_rate[2]; + } + idim = _image[i] >> 20; + otherdims = _image[i] ^ (idim << 20); + idim--; + idim &= 1023; + _image[i] = otherdims | (idim << 20); + } + if (_x[i+2*_nmax] >= hi[2]) { + _x[i+2*_nmax] -= period[2]; + _x[i+2*_nmax] = MAX(_x[i+2*_nmax],lo[2]); + if (deform_remap && _mask[i] & deform_groupbit) { + _v[i] -= _h_rate[4]; + _v[i+_nmax] -= _h_rate[3]; + _v[i+2*_nmax] -= _h_rate[2]; + } + idim = _image[i] >> 20; + otherdims = _image[i] ^ (idim << 20); + idim++; + idim &= 1023; + _image[i] = otherdims | (idim << 20); + } + } + if(box_change) + { + tmpx=_x[i]; + tmpy=_x[i+_nmax]; + tmpz=_x[i+2*_nmax]; + + + } + } + __syncthreads(); + if(box_change) + { + X_FLOAT minx=BIG; + X_FLOAT maxx=-BIG; + X_FLOAT miny=BIG; + X_FLOAT maxy=-BIG; + X_FLOAT minz=BIG; + X_FLOAT maxz=-BIG; + + if (not _periodicity[0]) { + sharedmem[threadIdx.x]=tmpx; + minOfBlock(sharedmem); + 
minx=sharedmem[0]; + __syncthreads(); + sharedmem[threadIdx.x]=tmpx; + maxOfBlock(sharedmem); + maxx=sharedmem[0]; + __syncthreads(); + } + else {minx=lo[0];maxx=hi[0];} + if (not _periodicity[1]) { + sharedmem[threadIdx.x]=tmpy; + minOfBlock(sharedmem); + miny=sharedmem[0]; + __syncthreads(); + sharedmem[threadIdx.x]=tmpy; + maxOfBlock(sharedmem); + maxy=sharedmem[0]; + __syncthreads(); + } + else {miny=lo[1];maxy=hi[1];} + if (not _periodicity[2]) { + sharedmem[threadIdx.x]=tmpz; + minOfBlock(sharedmem); + minz=sharedmem[0]; + __syncthreads(); + sharedmem[threadIdx.x]=tmpz; + maxOfBlock(sharedmem); + maxz=sharedmem[0]; + __syncthreads(); + } + else {minz=lo[2];maxz=hi[2];} + if(threadIdx.x==0) + { + buf=(X_FLOAT*) _buffer; + buf+=blockIdx.x*gridDim.y+blockIdx.y; + buf[0]=minx; + buf+=gridDim.x*gridDim.y; + buf[0]=maxx; + buf+=gridDim.x*gridDim.y; + buf[0]=miny; + buf+=gridDim.x*gridDim.y; + buf[0]=maxy; + buf+=gridDim.x*gridDim.y; + buf[0]=minz; + buf+=gridDim.x*gridDim.y; + buf[0]=maxz; + } + } +} + +__global__ void Domain_reduceBoxExtent(double* extent,int n) +{ + X_FLOAT* buf=(X_FLOAT*) _buffer; + buf+=blockIdx.x*n; + copyGlobToShared(buf,sharedmem,n); + if(blockIdx.x%2==0) + minOfData(sharedmem,n); + else + maxOfData(sharedmem,n); + extent[blockIdx.x]=sharedmem[0]; +} + +__global__ void Domain_lamda2x_Kernel(int n) +{ + int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x; + if(i + +void initfftdata(double* in,FFT_FLOAT* out,int nfast,int nmid,int nslow) +{ + + dim3 grid; + grid.x=nslow; + grid.y=nmid; + grid.z=1; + dim3 threads; + threads.x=nfast; + threads.y=1; + threads.z=1; + cudaThreadSynchronize(); + initfftdata_kernel<<>>(in,out); + cudaThreadSynchronize(); + MYDBG(printf("ERROR-CUDA initfftdata_kernel: %s\n",cudaGetErrorString(cudaGetLastError()))); +} + + +void permute(FFT_DATA* in,FFT_DATA* out,int nfast,int nmid,int nslow) +{ + + dim3 grid; + grid.x=nslow; + grid.y=nmid; + grid.z=1; + dim3 threads; + threads.x=nfast*2; + threads.y=1; + 
threads.z=1; + permute_kernel<<>>((FFT_FLOAT*)in,(FFT_FLOAT*)out); + cudaThreadSynchronize(); + MYDBG(printf("ERROR-CUDA permute_kernel: %s\n",cudaGetErrorString(cudaGetLastError()))); +} + +void permute_scale(FFT_DATA* in,FFT_DATA* out,int nfast,int nmid,int nslow) +{ + + dim3 grid; + grid.x=nslow; + grid.y=nmid; + grid.z=1; + dim3 threads; + threads.x=nfast*2; + threads.y=1; + threads.z=1; + permute_kernel<<>>((FFT_FLOAT*)in,(FFT_FLOAT*)out); + cudaThreadSynchronize(); +} +void permute_part(FFT_DATA* in,FFT_DATA* out,int nfast,int nmid,int nslow,int ihi,int ilo,int jhi,int jlo,int khi,int klo) +{ + + dim3 grid; + grid.x=(ihi-ilo+1); + grid.y=(jhi-jlo+1); + grid.z=1; + dim3 threads; + threads.x=(khi-klo+1)*2; + threads.y=1; + threads.z=1; + permute_part_kernel<<>>((FFT_FLOAT*)in,(FFT_FLOAT*)out,nfast,nmid,nslow,ihi,ilo,jhi,jlo,khi,klo); + cudaThreadSynchronize(); + } + + void FFTsyncthreads() + { + cudaThreadSynchronize(); + } + diff --git a/lib/cuda/fft3d_cuda_cu.h b/lib/cuda/fft3d_cuda_cu.h new file mode 100644 index 0000000000..426b61d40c --- /dev/null +++ b/lib/cuda/fft3d_cuda_cu.h @@ -0,0 +1,30 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ + +#include "cuda_shared.h" + +extern "C" void initfftdata(double* in,FFT_FLOAT* out,int nfast,int nmid,int nslow); +extern "C" void permute(FFT_DATA* in,FFT_DATA* out,int nfast,int nmid,int nslow); +extern "C" void permute_scale(FFT_DATA* in,FFT_DATA* out,int nfast,int nmid,int nslow); +extern "C" void permute_part(FFT_DATA* in,FFT_DATA* out,int nfast,int nmid,int nslow,int ihi,int ilo,int jhi,int jlo,int khi,int klo); +extern "C" void FFTsyncthreads(); diff --git a/lib/cuda/fft3d_cuda_kernel.cu b/lib/cuda/fft3d_cuda_kernel.cu new file mode 100644 index 0000000000..0ee414998f --- /dev/null +++ b/lib/cuda/fft3d_cuda_kernel.cu @@ -0,0 +1,44 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ + +__global__ void initfftdata_kernel(double* in,FFT_FLOAT* out) +{ + out[2*(((blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x)+threadIdx.x)]=in[((blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x)+threadIdx.x]; + out[2*(((blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x)+threadIdx.x)+1]=0; +} + + +__global__ void permute_kernel(FFT_FLOAT* in,FFT_FLOAT* out) +{ + out[2*(((threadIdx.x/2)*gridDim.x+blockIdx.x)*gridDim.y+blockIdx.y)+threadIdx.x-2*(threadIdx.x/2)]=in[((blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x)+threadIdx.x]; +} + +__global__ void permute_scale_kernel(FFT_FLOAT* in,FFT_FLOAT* out) +{ + out[2*(((threadIdx.x/2)*gridDim.x+blockIdx.x)*gridDim.y+blockIdx.y)+threadIdx.x-2*(threadIdx.x/2)]=in[((blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x)+threadIdx.x]*gridDim.x*gridDim.y*blockDim.x*0.5; +} + +__global__ void permute_part_kernel(FFT_FLOAT* in,FFT_FLOAT* out,int nfast,int nmid,int nslow,int ihi,int ilo,int jhi,int jlo,int khi,int klo) +{ + {out[2*((threadIdx.x/2)*(ihi-ilo+1)*(jhi-jlo+1)+(blockIdx.x)*(jhi-jlo+1)+blockIdx.y-jlo)+threadIdx.x-2*(threadIdx.x/2)]=in[2*(blockIdx.x+ilo)*nmid*nslow+2*(blockIdx.y+jlo)*nmid+threadIdx.x+2*klo]; } +} diff --git a/lib/cuda/fix_addforce_cuda.cu b/lib/cuda/fix_addforce_cuda.cu new file mode 100644 index 0000000000..33700b44b6 --- /dev/null +++ b/lib/cuda/fix_addforce_cuda.cu @@ -0,0 +1,89 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. 
+ + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#include +#define MY_PREFIX fix_add_force_cuda +#include "cuda_shared.h" +#include "cuda_common.h" + +#include "crm_cuda_utils.cu" + +#include "fix_addforce_cuda_cu.h" +#include "fix_addforce_cuda_kernel.cu" + +void Cuda_FixAddForceCuda_UpdateBuffer(cuda_shared_data* sdata) +{ + int3 layout=getgrid(sdata->atom.nlocal,4*sizeof(F_FLOAT)); + dim3 threads(layout.z, 1, 1); + dim3 grid(layout.x, layout.y, 1); + int size=(unsigned)(layout.z*layout.y*layout.x)*4*sizeof(F_FLOAT); + if(sdata->buffersizebuffer,sdata->buffersize);) + CudaWrapper_FreeCudaData(sdata->buffer,sdata->buffersize); + sdata->buffer = CudaWrapper_AllocCudaData(size); + sdata->buffersize=size; + sdata->buffer_new++; + MYDBG(printf("New buffer at %p with %i kB\n",sdata->buffer,sdata->buffersize);) + } + cudaMemcpyToSymbol(MY_CONST(buffer), & sdata->buffer, sizeof(int*) ); +} + +void Cuda_FixAddForceCuda_UpdateNmax(cuda_shared_data* sdata) +{ + cudaMemcpyToSymbol(MY_CONST(mask) , & sdata->atom.mask .dev_data, sizeof(int*) ); + cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); + cudaMemcpyToSymbol(MY_CONST(nmax) , & sdata->atom.nmax , sizeof(int) ); + cudaMemcpyToSymbol(MY_CONST(x) , & sdata->atom.x .dev_data, sizeof(X_FLOAT*) ); + cudaMemcpyToSymbol(MY_CONST(f) , & sdata->atom.f .dev_data, sizeof(F_FLOAT*) ); +} + +void Cuda_FixAddForceCuda_Init(cuda_shared_data* sdata) +{ + Cuda_FixAddForceCuda_UpdateNmax(sdata); +} + +void 
Cuda_FixAddForceCuda_PostForce(cuda_shared_data* sdata, int groupbit,F_FLOAT axvalue,F_FLOAT ayvalue,F_FLOAT azvalue,F_FLOAT* aforiginal) +{ + if(sdata->atom.update_nmax) + Cuda_FixAddForceCuda_UpdateNmax(sdata); + if(sdata->atom.update_nlocal) + cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); + if(sdata->buffer_new) + Cuda_FixAddForceCuda_UpdateBuffer(sdata); + int3 layout=getgrid(sdata->atom.nlocal,4*sizeof(F_FLOAT)); + dim3 threads(layout.z, 1, 1); + dim3 grid(layout.x, layout.y, 1); + + Cuda_FixAddForceCuda_PostForce_Kernel<<>> (groupbit,axvalue,ayvalue,azvalue); + cudaThreadSynchronize(); + CUT_CHECK_ERROR("Cuda_FixAddForceCuda_PostForce: fix add_force post_force Compute Kernel execution failed"); + + int oldgrid=grid.x; + grid.x=4; + threads.x=512; + reduce_foriginal<<>> (oldgrid,aforiginal); + cudaThreadSynchronize(); + CUT_CHECK_ERROR("Cuda_FixAddForceCuda_PostForce: fix add_force post_force Reduce Kernel execution failed"); + +} diff --git a/lib/cuda/fix_addforce_cuda_cu.h b/lib/cuda/fix_addforce_cuda_cu.h new file mode 100644 index 0000000000..8aff462666 --- /dev/null +++ b/lib/cuda/fix_addforce_cuda_cu.h @@ -0,0 +1,27 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ + +#include "cuda_shared.h" + +extern "C" void Cuda_FixAddForceCuda_Init(cuda_shared_data* sdata); +extern "C" void Cuda_FixAddForceCuda_PostForce(cuda_shared_data* sdata, int groupbit,F_FLOAT axvalue,F_FLOAT ayvalue,F_FLOAT azvalue,F_FLOAT* aforiginal); diff --git a/lib/cuda/fix_addforce_cuda_kernel.cu b/lib/cuda/fix_addforce_cuda_kernel.cu new file mode 100644 index 0000000000..bbfbdbe35a --- /dev/null +++ b/lib/cuda/fix_addforce_cuda_kernel.cu @@ -0,0 +1,86 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */
+
+// Dynamic shared memory staging area for the per-block reductions below.
+extern __shared__ F_FLOAT sharedmem[];
+
+
+// Post-force kernel for fix addforce: adds the constant force
+// (xvalue,yvalue,zvalue) to every owned atom whose mask matches groupbit.
+// Atom arrays are SoA with stride _nmax (components at i, i+_nmax, i+2*_nmax).
+// Per-block partials are staged in four shared-memory slots of blockDim.x
+// values each, so the launch must supply >= 4*blockDim.x*sizeof(F_FLOAT)
+// of dynamic shared memory:
+//   slot 0:   -(xvalue*x + yvalue*y + zvalue*z)  (energy-style accumulator)
+//   slots 1-3: the force components *before* the addition ("foriginal")
+// Thread 0 writes the block's four sums into _buffer as four planes of
+// gridDim.x*gridDim.y values, summed later by reduce_foriginal.
+// NOTE(review): no __syncthreads() between the shared-memory writes and
+// reduceBlock() -- presumably reduceBlock() synchronizes internally; confirm.
+__global__ void Cuda_FixAddForceCuda_PostForce_Kernel(int groupbit,F_FLOAT xvalue,F_FLOAT yvalue,F_FLOAT zvalue)
+{
+ int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x;
+
+ sharedmem[threadIdx.x]=0;
+ sharedmem[threadIdx.x+blockDim.x]=0;
+ sharedmem[threadIdx.x+2*blockDim.x]=0;
+ sharedmem[threadIdx.x+3*blockDim.x]=0;
+
+ if(i < _nlocal)
+ if (_mask[i] & groupbit)
+ //if (iregion >= 0 &&
+ //match(x[i][0],x[i][1],x[i][2],iregion)) //currently not supported
+ {
+ sharedmem[threadIdx.x]=-xvalue*_x[i] - yvalue*_x[i+1*_nmax] - zvalue*_x[i+2*_nmax];
+ sharedmem[threadIdx.x+blockDim.x]=_f[i];
+ sharedmem[threadIdx.x+2*blockDim.x]=_f[i+1*_nmax];
+ sharedmem[threadIdx.x+3*blockDim.x]=_f[i+2*_nmax];
+ _f[i] += xvalue;
+ _f[i+1*_nmax] += yvalue;
+ _f[i+2*_nmax] += zvalue;
+ }
+
+ reduceBlock(sharedmem);
+ reduceBlock(&sharedmem[blockDim.x]);
+ reduceBlock(&sharedmem[2*blockDim.x]);
+ reduceBlock(&sharedmem[3*blockDim.x]);
+ F_FLOAT* buffer=(F_FLOAT*) _buffer;
+ if(threadIdx.x==0)
+ {
+ buffer[blockIdx.x*gridDim.y+blockIdx.y]=sharedmem[0];
+ buffer[blockIdx.x*gridDim.y+blockIdx.y+gridDim.x*gridDim.y]=sharedmem[blockDim.x];
+ buffer[blockIdx.x*gridDim.y+blockIdx.y+2*gridDim.x*gridDim.y]=sharedmem[2*blockDim.x];
+ buffer[blockIdx.x*gridDim.y+blockIdx.y+3*gridDim.x*gridDim.y]=sharedmem[3*blockDim.x];
+ }
+
+}
+
+
+// Second-stage reduction: sums the per-block partials left in _buffer
+// (blockIdx.x selects one of the four planes of n values).
+// NOTE(review): this definition is TRUNCATED in this chunk -- everything from
+// "while(i" through the next file's diff header appears to have been destroyed
+// by angle-bracket stripping during extraction (the same corruption that turned
+// "<<<grid,threads,...>>>" launches into "<<>>" elsewhere).  Restore the loop
+// body and the fix_aveforce_cuda.cu header from the original patch.
+__global__ void reduce_foriginal(int n,F_FLOAT* foriginal)
+{
+ int i=0;
+ sharedmem[threadIdx.x]=0;
+ F_FLOAT myforig=0.0;
+ F_FLOAT* buf=(F_FLOAT*) _buffer;
+ buf=&buf[blockIdx.x*n];
+ while(i
+#define MY_PREFIX fix_ave_force_cuda
+#include "cuda_shared.h"
+#include "cuda_common.h"
+
+#include "crm_cuda_utils.cu"
+
+#include "fix_aveforce_cuda_cu.h"
+#include "fix_aveforce_cuda_kernel.cu"
+
+void Cuda_FixAveForceCuda_UpdateBuffer(cuda_shared_data* sdata)
+{
+ int3 layout=getgrid(sdata->atom.nlocal,4*sizeof(F_FLOAT));
+ dim3
threads(layout.z, 1, 1); + dim3 grid(layout.x, layout.y, 1); + int size=(unsigned)(layout.z*layout.y*layout.x)*4*sizeof(F_FLOAT); + if(sdata->buffersizebuffer,sdata->buffersize);) + CudaWrapper_FreeCudaData(sdata->buffer,sdata->buffersize); + sdata->buffer = CudaWrapper_AllocCudaData(size); + sdata->buffersize=size; + sdata->buffer_new++; + MYDBG(printf("New buffer at %p with %i kB\n",sdata->buffer,sdata->buffersize);) + } + cudaMemcpyToSymbol(MY_CONST(buffer), & sdata->buffer, sizeof(int*) ); +} + +void Cuda_FixAveForceCuda_UpdateNmax(cuda_shared_data* sdata) +{ + cudaMemcpyToSymbol(MY_CONST(mask) , & sdata->atom.mask .dev_data, sizeof(int*) ); + cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); + cudaMemcpyToSymbol(MY_CONST(nmax) , & sdata->atom.nmax , sizeof(int) ); + cudaMemcpyToSymbol(MY_CONST(x) , & sdata->atom.x .dev_data, sizeof(X_FLOAT*) ); + cudaMemcpyToSymbol(MY_CONST(f) , & sdata->atom.f .dev_data, sizeof(F_FLOAT*) ); +} + +void Cuda_FixAveForceCuda_Init(cuda_shared_data* sdata) +{ + Cuda_FixAveForceCuda_UpdateNmax(sdata); +} + +void Cuda_FixAveForceCuda_PostForce_FOrg(cuda_shared_data* sdata, int groupbit,F_FLOAT* aforiginal) +{ + if(sdata->atom.update_nmax) + Cuda_FixAveForceCuda_UpdateNmax(sdata); + if(sdata->atom.update_nlocal) + cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); + if(sdata->buffer_new) + Cuda_FixAveForceCuda_UpdateBuffer(sdata); + + int3 layout=getgrid(sdata->atom.nlocal,4*sizeof(F_FLOAT)); + dim3 threads(layout.z, 1, 1); + dim3 grid(layout.x, layout.y, 1); + + + Cuda_FixAveForceCuda_PostForce_FOrg_Kernel<<>> (groupbit); + cudaThreadSynchronize(); + CUT_CHECK_ERROR("Cuda_FixAveForceCuda_PostForce: fix ave_force post_force Compute Kernel execution failed"); + + int oldgrid=grid.x; + grid.x=4; + threads.x=512; + Cuda_FixAveForceCuda_reduce_foriginal<<>> (oldgrid,aforiginal); + cudaThreadSynchronize(); + CUT_CHECK_ERROR("Cuda_FixAveForceCuda_PostForce: fix ave_force post_force Reduce 
Kernel execution failed"); + +} + +void Cuda_FixAveForceCuda_PostForce_Set(cuda_shared_data* sdata, int groupbit,int xflag,int yflag,int zflag,F_FLOAT axvalue,F_FLOAT ayvalue,F_FLOAT azvalue) +{ + int3 layout=getgrid(sdata->atom.nlocal); + dim3 threads(layout.z, 1, 1); + dim3 grid(layout.x, layout.y, 1); + + + Cuda_FixAveForceCuda_PostForce_Set_Kernel<<>> (groupbit,xflag,yflag,zflag,axvalue,ayvalue,azvalue); + cudaThreadSynchronize(); + CUT_CHECK_ERROR("Cuda_FixAveForceCuda_PostForce_Set: fix ave_force post_force Compute Kernel execution failed"); + +} diff --git a/lib/cuda/fix_aveforce_cuda_cu.h b/lib/cuda/fix_aveforce_cuda_cu.h new file mode 100644 index 0000000000..dd9992d866 --- /dev/null +++ b/lib/cuda/fix_aveforce_cuda_cu.h @@ -0,0 +1,28 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ + +#include "cuda_shared.h" + +extern "C" void Cuda_FixAveForceCuda_Init(cuda_shared_data* sdata); +extern "C" void Cuda_FixAveForceCuda_PostForce_FOrg(cuda_shared_data* sdata, int groupbit,F_FLOAT* aforiginal); +extern "C" void Cuda_FixAveForceCuda_PostForce_Set(cuda_shared_data* sdata, int groupbit,int xflag,int yflag,int zflag,F_FLOAT axvalue,F_FLOAT ayvalue,F_FLOAT azvalue); diff --git a/lib/cuda/fix_aveforce_cuda_kernel.cu b/lib/cuda/fix_aveforce_cuda_kernel.cu new file mode 100644 index 0000000000..edccee8c4d --- /dev/null +++ b/lib/cuda/fix_aveforce_cuda_kernel.cu @@ -0,0 +1,87 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */
+
+// Dynamic shared memory staging area for the per-block reductions below.
+extern __shared__ F_FLOAT sharedmem[];
+
+
+// Post-force kernel for fix aveforce, stage 1: per-block sums of the current
+// force components (shared-memory slots 0-2) and a count of participating
+// atoms (slot 3 -- each group atom contributes 1).  SoA force layout with
+// stride _nmax.  Needs >= 4*blockDim.x*sizeof(F_FLOAT) dynamic shared memory;
+// thread 0 writes the four block sums to _buffer as four planes of
+// gridDim.x*gridDim.y values for the second-stage reduction.
+// NOTE(review): relies on reduceBlock() synchronizing internally; confirm.
+__global__ void Cuda_FixAveForceCuda_PostForce_FOrg_Kernel(int groupbit)
+{
+ int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x;
+ sharedmem[threadIdx.x]=0;
+ sharedmem[threadIdx.x+blockDim.x]=0;
+ sharedmem[threadIdx.x+2*blockDim.x]=0;
+ sharedmem[threadIdx.x+3*blockDim.x]=0;
+ if(i < _nlocal)
+ if (_mask[i] & groupbit) {
+ sharedmem[threadIdx.x]=_f[i];
+ sharedmem[threadIdx.x+blockDim.x]=_f[i+1*_nmax];
+ sharedmem[threadIdx.x+2*blockDim.x]=_f[i+2*_nmax];
+ sharedmem[threadIdx.x+3*blockDim.x]=1;
+ }
+ reduceBlock(sharedmem);
+ reduceBlock(&sharedmem[blockDim.x]);
+ reduceBlock(&sharedmem[2*blockDim.x]);
+ reduceBlock(&sharedmem[3*blockDim.x]);
+ F_FLOAT* buffer=(F_FLOAT*) _buffer;
+ if(threadIdx.x==0)
+ {
+ buffer[blockIdx.x*gridDim.y+blockIdx.y]=sharedmem[0];
+ buffer[blockIdx.x*gridDim.y+blockIdx.y+gridDim.x*gridDim.y]=sharedmem[blockDim.x];
+ buffer[blockIdx.x*gridDim.y+blockIdx.y+2*gridDim.x*gridDim.y]=sharedmem[2*blockDim.x];
+ buffer[blockIdx.x*gridDim.y+blockIdx.y+3*gridDim.x*gridDim.y]=sharedmem[3*blockDim.x];
+ }
+}
+
+
+// Second-stage reduction over the per-block partials in _buffer.
+// NOTE(review): TRUNCATED in this chunk from "while(i" onward (angle-bracket
+// stripping ate the loop body plus the fix_enforce2d_cuda.cu diff header and
+// license).  Restore from the original patch before use.
+__global__ void Cuda_FixAveForceCuda_reduce_foriginal(int n,F_FLOAT* foriginal)
+{
+ int i=0;
+ sharedmem[threadIdx.x]=0;
+ F_FLOAT myforig=0.0;
+ F_FLOAT* buf=(F_FLOAT*) _buffer;
+ buf=&buf[blockIdx.x*n];
+ while(i
+#define MY_PREFIX fix_enforce2d_cuda
+#include "cuda_shared.h"
+#include "cuda_common.h"
+#include "crm_cuda_utils.cu"
+#include "fix_enforce2d_cuda_cu.h"
+#include "fix_enforce2d_cuda_kernel.cu"
+
+void Cuda_FixEnforce2dCuda_Init(cuda_shared_data* sdata)
+{
+ cudaMemcpyToSymbol(MY_CONST(mask) , & sdata->atom.mask .dev_data, sizeof(int*) );
+ cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) );
+ cudaMemcpyToSymbol(MY_CONST(nmax) , & sdata->atom.nmax , sizeof(int) );
+ cudaMemcpyToSymbol(MY_CONST(v) , & sdata->atom.v .dev_data, sizeof(V_FLOAT*) );
+
cudaMemcpyToSymbol(MY_CONST(f) , & sdata->atom.f .dev_data, sizeof(F_FLOAT*) ); +} + +void Cuda_FixEnforce2dCuda_PostForce(cuda_shared_data* sdata, int groupbit) +{ + if(sdata->atom.update_nmax) + Cuda_FixEnforce2dCuda_Init(sdata); + if(sdata->atom.update_nlocal) + cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); + + int3 layout=getgrid(sdata->atom.nlocal); + dim3 threads(layout.z, 1, 1); + dim3 grid(layout.x, layout.y, 1); + Cuda_FixEnforce2dCuda_PostForce_Kernel<<>> (groupbit); + cudaThreadSynchronize(); + CUT_CHECK_ERROR("Cuda_Cuda_FixEnforce2dCuda_PostForce: fix enforce2d post_force Kernel execution failed"); +} diff --git a/lib/cuda/fix_enforce2d_cuda_cu.h b/lib/cuda/fix_enforce2d_cuda_cu.h new file mode 100644 index 0000000000..a35fadf806 --- /dev/null +++ b/lib/cuda/fix_enforce2d_cuda_cu.h @@ -0,0 +1,27 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ + +#include "cuda_shared.h" + +extern "C" void Cuda_FixEnforce2dCuda_Init(cuda_shared_data* sdata); +extern "C" void Cuda_FixEnforce2dCuda_PostForce(cuda_shared_data* sdata, int groupbit); diff --git a/lib/cuda/fix_enforce2d_cuda_kernel.cu b/lib/cuda/fix_enforce2d_cuda_kernel.cu new file mode 100644 index 0000000000..c07f944901 --- /dev/null +++ b/lib/cuda/fix_enforce2d_cuda_kernel.cu @@ -0,0 +1,33 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */
+
+
+// Post-force kernel for fix enforce2d: zeroes the z components of velocity
+// and force for every owned atom in the group, confining the dynamics to the
+// xy plane.  Atom arrays are SoA with stride _nmax, so the z component of
+// atom i lives at index i+2*_nmax.  One thread per atom on a 2D grid;
+// guarded by i < _nlocal so partial tail blocks are safe.
+__global__ void Cuda_FixEnforce2dCuda_PostForce_Kernel(int groupbit)
+{
+ int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x;
+ if(i < _nlocal)
+ if (_mask[i] & groupbit) {
+ _v[i+2*_nmax] = V_F(0.0);
+ _f[i+2*_nmax] = F_F(0.0);
+ }
+}
diff --git a/lib/cuda/fix_freeze_cuda.cu b/lib/cuda/fix_freeze_cuda.cu
new file mode 100644
index 0000000000..ba6fe117ce
--- /dev/null
+++ b/lib/cuda/fix_freeze_cuda.cu
@@ -0,0 +1,95 @@
+/* ----------------------------------------------------------------------
+ LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+ Original Version:
+ http://lammps.sandia.gov, Sandia National Laboratories
+ Steve Plimpton, sjplimp@sandia.gov
+
+ See the README file in the top-level LAMMPS directory.
+
+ -----------------------------------------------------------------------
+
+ USER-CUDA Package and associated modifications:
+ https://sourceforge.net/projects/lammpscuda/
+
+ Christian Trott, christian.trott@tu-ilmenau.de
+ Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+ Theoretical Physics II, University of Technology Ilmenau, Germany
+
+ See the README file in the USER-CUDA directory.
+
+ This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */ + +#include +#define MY_PREFIX fix_freeze_cuda +#include "cuda_shared.h" +#include "cuda_common.h" +#include "crm_cuda_utils.cu" + +#include "fix_freeze_cuda_cu.h" +#include "fix_freeze_cuda_kernel.cu" + +void Cuda_FixFreezeCuda_UpdateBuffer(cuda_shared_data* sdata) +{ + int3 layout=getgrid(sdata->atom.nlocal,3*sizeof(F_FLOAT)); + dim3 threads(layout.z, 1, 1); + dim3 grid(layout.x, layout.y, 1); + int size=(unsigned)(layout.z*layout.y*layout.x)*3*sizeof(F_FLOAT); + if(sdata->buffersizebuffer,sdata->buffersize);) + CudaWrapper_FreeCudaData(sdata->buffer,sdata->buffersize); + sdata->buffer = CudaWrapper_AllocCudaData(size); + sdata->buffersize=size; + sdata->buffer_new++; + MYDBG(printf("New buffer at %p with %i kB\n",sdata->buffer,sdata->buffersize);) + + } + cudaMemcpyToSymbol(MY_CONST(buffer) , & sdata->buffer, sizeof(int*) ); +} + +void Cuda_FixFreezeCuda_UpdateNmax(cuda_shared_data* sdata) +{ + cudaMemcpyToSymbol(MY_CONST(mask) , & sdata->atom.mask .dev_data, sizeof(int*) ); + cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); + cudaMemcpyToSymbol(MY_CONST(nmax) , & sdata->atom.nmax , sizeof(int) ); + cudaMemcpyToSymbol(MY_CONST(x) , & sdata->atom.x .dev_data, sizeof(X_FLOAT*) ); + cudaMemcpyToSymbol(MY_CONST(f) , & sdata->atom.f .dev_data, sizeof(F_FLOAT*) ); + cudaMemcpyToSymbol(MY_CONST(torque) , & sdata->atom.torque .dev_data, sizeof(F_FLOAT*) ); +} + + +void Cuda_FixFreezeCuda_Init(cuda_shared_data* sdata) +{ + Cuda_FixFreezeCuda_UpdateNmax(sdata); + +} + + +void Cuda_FixFreezeCuda_PostForce(cuda_shared_data* sdata, int groupbit,F_FLOAT* foriginal) +{ + if(sdata->atom.update_nmax) + Cuda_FixFreezeCuda_UpdateNmax(sdata); + if(sdata->atom.update_nlocal) + cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); + if(sdata->buffer_new) + Cuda_FixFreezeCuda_UpdateBuffer(sdata); + + + int3 layout=getgrid(sdata->atom.nlocal,3*sizeof(F_FLOAT)); 
+ dim3 threads(layout.z, 1, 1); + dim3 grid(layout.x, layout.y, 1); + + Cuda_FixFreezeCuda_PostForce_Kernel<<>> (groupbit); + cudaThreadSynchronize(); + CUT_CHECK_ERROR("Cuda_Cuda_FixFreezeCuda_PostForce: fix add_force post_force compute Kernel execution failed"); + + int oldgrid=grid.x; + grid.x=3; + threads.x=512; + Cuda_FixFreezeCuda_Reduce_FOriginal<<>> (oldgrid,foriginal); + cudaThreadSynchronize(); + CUT_CHECK_ERROR("Cuda_Cuda_FixFreezeCuda_PostForce: fix add_force post_force reduce Kernel execution failed"); + +} diff --git a/lib/cuda/fix_freeze_cuda_cu.h b/lib/cuda/fix_freeze_cuda_cu.h new file mode 100644 index 0000000000..2df8743a6a --- /dev/null +++ b/lib/cuda/fix_freeze_cuda_cu.h @@ -0,0 +1,27 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ + +#include "cuda_shared.h" + +extern "C" void Cuda_FixFreezeCuda_Init(cuda_shared_data* sdata); +extern "C" void Cuda_FixFreezeCuda_PostForce(cuda_shared_data* sdata, int groupbit,F_FLOAT* foriginal); diff --git a/lib/cuda/fix_freeze_cuda_kernel.cu b/lib/cuda/fix_freeze_cuda_kernel.cu new file mode 100644 index 0000000000..d6721311b6 --- /dev/null +++ b/lib/cuda/fix_freeze_cuda_kernel.cu @@ -0,0 +1,82 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */
+
+// Dynamic shared memory staging area for the per-block reductions below.
+extern __shared__ F_FLOAT sharedmem[];
+
+
+// Post-force kernel for fix freeze: records the current force of each owned
+// group atom into three shared-memory slots (for the "foriginal" total), then
+// zeroes both force and torque for that atom.  SoA layout with stride _nmax.
+// Needs >= 3*blockDim.x*sizeof(F_FLOAT) dynamic shared memory; thread 0
+// writes the block sums to _buffer as three planes of gridDim.x*gridDim.y
+// values for Cuda_FixFreezeCuda_Reduce_FOriginal.
+// NOTE(review): _torque is written unconditionally for group atoms -- assumes
+// the torque array is allocated whenever this fix is active; confirm.
+// NOTE(review): relies on reduceBlock() synchronizing internally; confirm.
+__global__ void Cuda_FixFreezeCuda_PostForce_Kernel(int groupbit)
+{
+ int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x;
+ sharedmem[threadIdx.x]=0;
+ sharedmem[threadIdx.x+blockDim.x]=0;
+ sharedmem[threadIdx.x+2*blockDim.x]=0;
+ if(i < _nlocal)
+ if (_mask[i] & groupbit) {
+ sharedmem[threadIdx.x]=_f[i];
+ sharedmem[threadIdx.x+blockDim.x]=_f[i+1*_nmax];
+ sharedmem[threadIdx.x+2*blockDim.x]=_f[i+2*_nmax];
+
+ _f[i] = F_F(0.0);
+ _f[i+1*_nmax] = F_F(0.0);
+ _f[i+2*_nmax] = F_F(0.0);
+ _torque[i] = F_F(0.0);
+ _torque[i+1*_nmax] = F_F(0.0);
+ _torque[i+2*_nmax] = F_F(0.0);
+ }
+
+
+ reduceBlock(sharedmem);
+ reduceBlock(&sharedmem[blockDim.x]);
+ reduceBlock(&sharedmem[2*blockDim.x]);
+ F_FLOAT* buffer=(F_FLOAT*)_buffer;
+ if(threadIdx.x==0)
+ {
+ buffer[blockIdx.x*gridDim.y+blockIdx.y]=sharedmem[0];
+ buffer[blockIdx.x*gridDim.y+blockIdx.y+gridDim.x*gridDim.y]=sharedmem[blockDim.x];
+ buffer[blockIdx.x*gridDim.y+blockIdx.y+2*gridDim.x*gridDim.y]=sharedmem[2*blockDim.x];
+ }
+}
+
+
+// Second-stage reduction over the per-block partials in _buffer.
+// NOTE(review): TRUNCATED in this chunk from "while(i" onward (angle-bracket
+// stripping also ate the fix_gravity_cuda.cu diff header/license and, below,
+// the "buffersize<size" comparison in Cuda_FixGravityCuda_UpdateBuffer --
+// "if(sdata->buffersizebuffer,...)" is corrupted text, not real code).
+// Restore from the original patch before use.
+__global__ void Cuda_FixFreezeCuda_Reduce_FOriginal(int n,F_FLOAT* foriginal)
+{
+ int i=0;
+ sharedmem[threadIdx.x]=0;
+ F_FLOAT myforig=0.0;
+ F_FLOAT* buf=(F_FLOAT*)_buffer;
+ buf=&buf[blockIdx.x*n];
+ while(i
+#define MY_PREFIX fix_gravity_cuda
+#include "cuda_shared.h"
+#include "cuda_common.h"
+#include "crm_cuda_utils.cu"
+
+#include "fix_gravity_cuda_cu.h"
+#include "fix_gravity_cuda_kernel.cu"
+
+void Cuda_FixGravityCuda_UpdateBuffer(cuda_shared_data* sdata)
+{
+ int3 layout=getgrid(sdata->atom.nlocal,3*sizeof(F_FLOAT));
+ dim3 threads(layout.z, 1, 1);
+ dim3 grid(layout.x, layout.y, 1);
+ int size=(unsigned)(layout.z*layout.y*layout.x)*3*sizeof(F_FLOAT);
+ if(sdata->buffersizebuffer,sdata->buffersize);)
+ CudaWrapper_FreeCudaData(sdata->buffer,sdata->buffersize);
+ sdata->buffer = CudaWrapper_AllocCudaData(size);
+ sdata->buffersize=size;
+ sdata->buffer_new++; + MYDBG(printf("New buffer at %p with %i kB\n",sdata->buffer,sdata->buffersize);) + + } + cudaMemcpyToSymbol(MY_CONST(buffer) , & sdata->buffer, sizeof(int*) ); +} + +void Cuda_FixGravityCuda_UpdateNmax(cuda_shared_data* sdata) +{ + cudaMemcpyToSymbol(MY_CONST(mask) , & sdata->atom.mask .dev_data, sizeof(int*) ); + cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); + cudaMemcpyToSymbol(MY_CONST(nmax) , & sdata->atom.nmax , sizeof(int) ); + cudaMemcpyToSymbol(MY_CONST(x) , & sdata->atom.x .dev_data, sizeof(X_FLOAT*) ); + cudaMemcpyToSymbol(MY_CONST(f) , & sdata->atom.f .dev_data, sizeof(F_FLOAT*) ); + cudaMemcpyToSymbol(MY_CONST(type) , & sdata->atom.type .dev_data, sizeof(int*) ); + cudaMemcpyToSymbol(MY_CONST(rmass_flag) , & sdata->atom.rmass_flag, sizeof(int) ); + cudaMemcpyToSymbol(MY_CONST(rmass) , & sdata->atom.rmass .dev_data, sizeof(V_FLOAT*) ); + cudaMemcpyToSymbol(MY_CONST(mass) , & sdata->atom.mass .dev_data, sizeof(V_FLOAT*) ); +} + +void Cuda_FixGravityCuda_Init(cuda_shared_data* sdata) +{ + Cuda_FixGravityCuda_UpdateNmax(sdata); + +} + + +void Cuda_FixGravityCuda_PostForce(cuda_shared_data* sdata, int groupbit,F_FLOAT xacc,F_FLOAT yacc,F_FLOAT zacc) +{ + if(sdata->atom.update_nmax) + Cuda_FixGravityCuda_UpdateNmax(sdata); + if(sdata->atom.update_nlocal) + cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); + if(sdata->buffer_new) + Cuda_FixGravityCuda_UpdateBuffer(sdata); + + + int3 layout=getgrid(sdata->atom.nlocal); + dim3 threads(layout.z, 1, 1); + dim3 grid(layout.x, layout.y, 1); + + Cuda_FixGravityCuda_PostForce_Kernel<<>> (groupbit,xacc,yacc,zacc); + cudaThreadSynchronize(); + CUT_CHECK_ERROR("Cuda_Cuda_FixGravityCuda_PostForce: fix add_force post_force compute Kernel execution failed"); +} diff --git a/lib/cuda/fix_gravity_cuda_cu.h b/lib/cuda/fix_gravity_cuda_cu.h new file mode 100644 index 0000000000..d69816bb67 --- /dev/null +++ b/lib/cuda/fix_gravity_cuda_cu.h @@ -0,0 
+1,27 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#include "cuda_shared.h" + +extern "C" void Cuda_FixGravityCuda_Init(cuda_shared_data* sdata); +extern "C" void Cuda_FixGravityCuda_PostForce(cuda_shared_data* sdata, int groupbit,F_FLOAT xacc,F_FLOAT yacc,F_FLOAT zacc); diff --git a/lib/cuda/fix_gravity_cuda_kernel.cu b/lib/cuda/fix_gravity_cuda_kernel.cu new file mode 100644 index 0000000000..6a77933acb --- /dev/null +++ b/lib/cuda/fix_gravity_cuda_kernel.cu @@ -0,0 +1,36 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. 
+
+ -----------------------------------------------------------------------
+
+ USER-CUDA Package and associated modifications:
+ https://sourceforge.net/projects/lammpscuda/
+
+ Christian Trott, christian.trott@tu-ilmenau.de
+ Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+ Theoretical Physics II, University of Technology Ilmenau, Germany
+
+ See the README file in the USER-CUDA directory.
+
+ This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+// Post-force kernel for fix gravity: f += m * (xacc,yacc,zacc) for every
+// owned atom whose mask matches groupbit.  The mass is the per-atom value
+// _rmass[i] when _rmass_flag is set, otherwise the per-type table entry
+// _mass[_type[i]].  Force array is SoA with stride _nmax; one thread per
+// atom on a 2D grid, guarded by i < _nlocal.
+__global__ void Cuda_FixGravityCuda_PostForce_Kernel(int groupbit,F_FLOAT xacc,F_FLOAT yacc,F_FLOAT zacc)
+{
+ int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x;
+
+ if(i < _nlocal)
+ if (_mask[i] & groupbit) {
+ F_FLOAT mass = _rmass_flag?_rmass[i]:_mass[_type[i]];
+ _f[i] += mass*xacc;
+ _f[i+1*_nmax] += mass*yacc;
+ _f[i+2*_nmax] += mass*zacc;
+ }
+}
+
diff --git a/lib/cuda/fix_nh_cuda.cu b/lib/cuda/fix_nh_cuda.cu
new file mode 100644
index 0000000000..ee91e473e2
--- /dev/null
+++ b/lib/cuda/fix_nh_cuda.cu
@@ -0,0 +1,219 @@
+/* ----------------------------------------------------------------------
+ LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+ Original Version:
+ http://lammps.sandia.gov, Sandia National Laboratories
+ Steve Plimpton, sjplimp@sandia.gov
+
+ See the README file in the top-level LAMMPS directory.
+
+ -----------------------------------------------------------------------
+
+ USER-CUDA Package and associated modifications:
+ https://sourceforge.net/projects/lammpscuda/
+
+ Christian Trott, christian.trott@tu-ilmenau.de
+ Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+ Theoretical Physics II, University of Technology Ilmenau, Germany
+
+ See the README file in the USER-CUDA directory.
+
+ This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */

#include <stdio.h>
#define MY_PREFIX fix_nh_cuda
#define IncludeCommonNeigh
#include "cuda_shared.h"
#include "cuda_common.h"
#include "crm_cuda_utils.cu"
#include "fix_nh_cuda_cu.h"
#include "fix_nh_cuda_kernel.cu"

// Refresh the per-kernel __constant__ copies of the per-atom device-array
// pointers and sizes. Must be called whenever LAMMPS reallocates its per-atom
// arrays (sdata->atom.update_nmax is set).
void Cuda_FixNHCuda_UpdateNmax(cuda_shared_data* sdata)
{
  cudaMemcpyToSymbol(MY_CONST(f)         , & sdata->atom.f    .dev_data, sizeof(F_FLOAT*) );
  cudaMemcpyToSymbol(MY_CONST(mask)      , & sdata->atom.mask .dev_data, sizeof(int*)     );
  cudaMemcpyToSymbol(MY_CONST(tag)       , & sdata->atom.tag  .dev_data, sizeof(int*)     );
  cudaMemcpyToSymbol(MY_CONST(debugdata) , & sdata->debugdata           , sizeof(int*)    );
  cudaMemcpyToSymbol(MY_CONST(nlocal)    , & sdata->atom.nlocal         , sizeof(int)     );
  cudaMemcpyToSymbol(MY_CONST(nmax)      , & sdata->atom.nmax           , sizeof(int)     );
  cudaMemcpyToSymbol(MY_CONST(rmass)     , & sdata->atom.rmass.dev_data, sizeof(V_FLOAT*) );
  cudaMemcpyToSymbol(MY_CONST(type)      , & sdata->atom.type .dev_data, sizeof(int*)     );
  cudaMemcpyToSymbol(MY_CONST(v)         , & sdata->atom.v    .dev_data, sizeof(V_FLOAT*) );
  cudaMemcpyToSymbol(MY_CONST(x)         , & sdata->atom.x    .dev_data, sizeof(X_FLOAT*) );
  cudaMemcpyToSymbol(MY_CONST(xhold)     , & sdata->atom.xhold.dev_data, sizeof(X_FLOAT*) ); //might be moved to a neighbor record in sdata
  cudaMemcpyToSymbol(MY_CONST(maxhold)   , & sdata->atom.maxhold        , sizeof(int)     ); //might be moved to a neighbor record in sdata
  cudaMemcpyToSymbol(MY_CONST(reneigh_flag), & sdata->buffer, sizeof(int*) );                //might be moved to a neighbor record in sdata
  cudaMemcpyToSymbol(MY_CONST(triggerneighsq), & sdata->atom.triggerneighsq, sizeof(X_FLOAT)); //might be moved to a neighbor record in sdata
}

// (Re)allocate the small device communication buffer (holds the
// reneighboring flag returned by the nve_x kernel) and publish its address.
// NOTE(review): the size comparison and the first MYDBG message were
// destroyed in the patch (HTML-stripped); reconstructed from the identical
// pattern in fix_nve_cuda.cu / fix_set_force_cuda.cu — confirm upstream.
void Cuda_FixNHCuda_UpdateBuffer(cuda_shared_data* sdata)
{
  int size = (unsigned)10 * sizeof(int);
  if(sdata->buffersize < size)
  {
    MYDBG(printf("Resize buffer at %p with %i kB\n", sdata->buffer, sdata->buffersize);)
    CudaWrapper_FreeCudaData(sdata->buffer, sdata->buffersize);
    sdata->buffer = CudaWrapper_AllocCudaData(size);
    sdata->buffersize = size;
    sdata->buffer_new++;
    MYDBG(printf("New buffer at %p with %i kB\n", sdata->buffer, sdata->buffersize);)
  }
  cudaMemcpyToSymbol(MY_CONST(buffer)      , & sdata->buffer, sizeof(int*) );
  cudaMemcpyToSymbol(MY_CONST(reneigh_flag), & sdata->buffer, sizeof(int*) ); //might be moved to a neighbor record in sdata
}

// One-time initialization: upload timestep constants (dtv, dtf), the
// reneighboring trigger distance and the mass-mode flags to device memory.
void Cuda_FixNHCuda_Init(cuda_shared_data* sdata, X_FLOAT dtv, V_FLOAT dtf)
{
  if(sdata->atom.mass_host)
    cudaMemcpyToSymbol(MY_CONST(mass)    , & sdata->atom.mass.dev_data  , sizeof(V_FLOAT*) );
  cudaMemcpyToSymbol(MY_CONST(dtf)       , & dtf                        , sizeof(V_FLOAT)  );
  cudaMemcpyToSymbol(MY_CONST(dtv)       , & dtv                        , sizeof(X_FLOAT)  );
  cudaMemcpyToSymbol(MY_CONST(triggerneighsq), & sdata->atom.triggerneighsq, sizeof(X_FLOAT));
  cudaMemcpyToSymbol(MY_CONST(dist_check), & sdata->atom.dist_check     , sizeof(int)      );
  cudaMemcpyToSymbol(MY_CONST(rmass_flag), & sdata->atom.rmass_flag     , sizeof(int)      );
  Cuda_FixNHCuda_UpdateNmax(sdata);
}

// Barostat half-step velocity scaling (FixNH::nh_v_press on the GPU).
// mynlocal can be nfirst if firstgroup==igroup, see the cpp fix.
// NOTE(review): launch configurations below were stripped to "<<>>" in the
// patch; restored to the <<<grid, threads>>> form used throughout this library.
void Cuda_FixNHCuda_nh_v_press(cuda_shared_data* sdata, int groupbit, double* factor_h, int mynlocal, int p_triclinic)
{
  timespec atime1, atime2;
  clock_gettime(CLOCK_REALTIME, &atime1);
  if(sdata->atom.update_nmax)
    Cuda_FixNHCuda_UpdateNmax(sdata);
  if(sdata->atom.update_nlocal)
    cudaMemcpyToSymbol(MY_CONST(nlocal), & sdata->atom.nlocal, sizeof(int));
  clock_gettime(CLOCK_REALTIME, &atime2);
  sdata->cuda_timings.test1 +=
    atime2.tv_sec - atime1.tv_sec + 1.0 * (atime2.tv_nsec - atime1.tv_nsec) / 1000000000;
  if(sdata->buffer_new)
    Cuda_FixNHCuda_UpdateBuffer(sdata);

  F_FLOAT3 factor = {factor_h[0], factor_h[1], factor_h[2]};
  F_FLOAT3 factor2;  // off-diagonal factors, only used for triclinic boxes
  if(p_triclinic) { factor2.x = factor_h[3]; factor2.y = factor_h[4]; factor2.z = factor_h[5]; }

  int3 layout = getgrid(mynlocal);
  dim3 threads(layout.z, 1, 1);
  dim3 grid(layout.x, layout.y, 1);
  FixNHCuda_nh_v_press_Kernel<<<grid, threads>>>(groupbit, factor, p_triclinic, factor2);
  cudaThreadSynchronize();
  CUT_CHECK_ERROR("FixNHCuda: fix nh v_press Kernel execution failed");
}

// Fused barostat scaling + NVE velocity half-step (no temperature bias).
// mynlocal can be nfirst if firstgroup==igroup, see the cpp fix.
void Cuda_FixNHCuda_nh_v_press_and_nve_v_NoBias(cuda_shared_data* sdata, int groupbit, double* factor_h, int mynlocal, int p_triclinic)
{
  if(sdata->atom.update_nmax)
    Cuda_FixNHCuda_UpdateNmax(sdata);
  if(sdata->atom.update_nlocal)
    cudaMemcpyToSymbol(MY_CONST(nlocal), & sdata->atom.nlocal, sizeof(int));
  if(sdata->buffer_new)
    Cuda_FixNHCuda_UpdateBuffer(sdata);

  F_FLOAT3 factor = {factor_h[0], factor_h[1], factor_h[2]};
  F_FLOAT3 factor2;
  if(p_triclinic) { factor2.x = factor_h[3]; factor2.y = factor_h[4]; factor2.z = factor_h[5]; }

  int3 layout = getgrid(mynlocal);
  dim3 threads(layout.z, 1, 1);
  dim3 grid(layout.x, layout.y, 1);
  FixNHCuda_nh_v_press_and_nve_v_NoBias_Kernel<<<grid, threads>>>(groupbit, factor, p_triclinic, factor2);
  cudaThreadSynchronize();
  CUT_CHECK_ERROR("FixNHCuda: fix nh v_press Kernel execution failed");
}

// Thermostat velocity rescaling by factor_eta (FixNH::nh_v_temp on the GPU).
// mynlocal can be nfirst if firstgroup==igroup, see the cpp fix.
void Cuda_FixNHCuda_nh_v_temp(cuda_shared_data* sdata, int groupbit, F_FLOAT factor_eta, int mynlocal)
{
  timespec atime1, atime2;
  clock_gettime(CLOCK_REALTIME, &atime1);
  if(sdata->atom.update_nmax)
    Cuda_FixNHCuda_UpdateNmax(sdata);
  if(sdata->atom.update_nlocal)
    cudaMemcpyToSymbol(MY_CONST(nlocal), & sdata->atom.nlocal, sizeof(int));
  clock_gettime(CLOCK_REALTIME, &atime2);
  sdata->cuda_timings.test1 +=
    atime2.tv_sec - atime1.tv_sec + 1.0 * (atime2.tv_nsec - atime1.tv_nsec) / 1000000000;
  if(sdata->buffer_new)
    Cuda_FixNHCuda_UpdateBuffer(sdata);

  int3 layout = getgrid(mynlocal);
  dim3 threads(layout.z, 1, 1);
  dim3 grid(layout.x, layout.y, 1);
  FixNHCuda_nh_v_temp_Kernel<<<grid, threads>>>(groupbit, factor_eta);
  cudaThreadSynchronize();
  CUT_CHECK_ERROR("FixNHCuda: fix nh v_temp Kernel execution failed");
}

// NVE velocity half-step: v += dtf/m * f.
// mynlocal can be nfirst if firstgroup==igroup, see the cpp fix.
void Cuda_FixNHCuda_nve_v(cuda_shared_data* sdata, int groupbit, int mynlocal)
{
  timespec atime1, atime2;
  clock_gettime(CLOCK_REALTIME, &atime1);
  if(sdata->atom.update_nmax)
    Cuda_FixNHCuda_UpdateNmax(sdata);
  if(sdata->atom.update_nlocal)
    cudaMemcpyToSymbol(MY_CONST(nlocal), & sdata->atom.nlocal, sizeof(int));
  clock_gettime(CLOCK_REALTIME, &atime2);
  sdata->cuda_timings.test1 +=
    atime2.tv_sec - atime1.tv_sec + 1.0 * (atime2.tv_nsec - atime1.tv_nsec) / 1000000000;
  if(sdata->buffer_new)
    Cuda_FixNHCuda_UpdateBuffer(sdata);

  int3 layout = getgrid(mynlocal);
  dim3 threads(layout.z, 1, 1);
  dim3 grid(layout.x, layout.y, 1);
  FixNHCuda_nve_v_Kernel<<<grid, threads>>>(groupbit);
  cudaThreadSynchronize();
  CUT_CHECK_ERROR("FixNHCuda: nve_v Kernel execution failed");
}

// NVE position full-step: x += dtv * v. Also runs the displacement check on
// the device; the kernel writes a flag into sdata->buffer which is copied
// back and accumulated into atom.reneigh_flag.
// mynlocal can be nfirst if firstgroup==igroup, see the cpp fix.
void Cuda_FixNHCuda_nve_x(cuda_shared_data* sdata, int groupbit, int mynlocal)
{
  timespec atime1, atime2;
  clock_gettime(CLOCK_REALTIME, &atime1);
  if(sdata->atom.update_nmax)
    Cuda_FixNHCuda_UpdateNmax(sdata);
  if(sdata->atom.update_nlocal)
    cudaMemcpyToSymbol(MY_CONST(nlocal), & sdata->atom.nlocal, sizeof(int));
  clock_gettime(CLOCK_REALTIME, &atime2);
  sdata->cuda_timings.test1 +=
    atime2.tv_sec - atime1.tv_sec + 1.0 * (atime2.tv_nsec - atime1.tv_nsec) / 1000000000;
  if(sdata->buffer_new)
    Cuda_FixNHCuda_UpdateBuffer(sdata);

  int3 layout = getgrid(mynlocal);
  dim3 threads(layout.z, 1, 1);
  dim3 grid(layout.x, layout.y, 1);
  cudaMemset(sdata->buffer, 0, sizeof(int));  // clear the reneighboring flag
  FixNHCuda_nve_x_Kernel<<<grid, threads>>>(groupbit);
  cudaThreadSynchronize();
  int reneigh_flag;
  cudaMemcpy((void*)(&reneigh_flag), sdata->buffer, sizeof(int), cudaMemcpyDeviceToHost);
  sdata->atom.reneigh_flag += reneigh_flag;
  CUT_CHECK_ERROR("FixNHCuda: nve_x Kernel execution failed");
}

// Fused NVE velocity half-step + barostat scaling (no temperature bias).
// mynlocal can be nfirst if firstgroup==igroup, see the cpp fix.
void Cuda_FixNHCuda_nve_v_and_nh_v_press_NoBias(cuda_shared_data* sdata, int groupbit, double* factor_h, int mynlocal, int p_triclinic)
{
  if(sdata->atom.update_nmax)
    Cuda_FixNHCuda_UpdateNmax(sdata);
  if(sdata->atom.update_nlocal)
    cudaMemcpyToSymbol(MY_CONST(nlocal), & sdata->atom.nlocal, sizeof(int));
  if(sdata->buffer_new)
    Cuda_FixNHCuda_UpdateBuffer(sdata);

  F_FLOAT3 factor = {factor_h[0], factor_h[1], factor_h[2]};
  F_FLOAT3 factor2;
  if(p_triclinic) { factor2.x = factor_h[3]; factor2.y = factor_h[4]; factor2.z = factor_h[5]; }

  int3 layout = getgrid(mynlocal);
  dim3 threads(layout.z, 1, 1);
  dim3 grid(layout.x, layout.y, 1);
  FixNHCuda_nve_v_and_nh_v_press_NoBias_Kernel<<<grid, threads>>>(groupbit, factor, p_triclinic, factor2);
  cudaThreadSynchronize();
  CUT_CHECK_ERROR("FixNHCuda__nve_v_and_nh_v_press_NoBias: Kernel execution failed");
}

diff --git a/lib/cuda/fix_nh_cuda_cu.h b/lib/cuda/fix_nh_cuda_cu.h
new file mode 100644
index 0000000000..e6ba4e08bd
--- /dev/null
+++ b/lib/cuda/fix_nh_cuda_cu.h
@@ -0,0 +1,32 @@
/* ----------------------------------------------------------------------
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator

   Original Version:
   http://lammps.sandia.gov, Sandia National Laboratories
   Steve Plimpton, sjplimp@sandia.gov

   See the README file in the top-level LAMMPS directory.

   -----------------------------------------------------------------------

   USER-CUDA Package and associated modifications:
   https://sourceforge.net/projects/lammpscuda/

   Christian Trott, christian.trott@tu-ilmenau.de
   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
   Theoretical Physics II, University of Technology Ilmenau, Germany

   See the README file in the USER-CUDA directory.

   This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */

#include "cuda_shared.h"

// Host-side entry points of the fix nh (Nose-Hoover) CUDA wrappers.
// In all of these, mynlocal can be nfirst if firstgroup==igroup, see the cpp fix.
extern "C" void Cuda_FixNHCuda_Init(cuda_shared_data* sdata, X_FLOAT dtv, V_FLOAT dtf);
extern "C" void Cuda_FixNHCuda_nh_v_press(cuda_shared_data* sdata, int groupbit, double* factor_h, int mynlocal, int p_triclinic);
extern "C" void Cuda_FixNHCuda_nh_v_temp(cuda_shared_data* sdata, int groupbit, F_FLOAT factor_eta, int mynlocal);
extern "C" void Cuda_FixNHCuda_nh_v_press_and_nve_v_NoBias(cuda_shared_data* sdata, int groupbit, double* factor_h, int mynlocal, int p_triclinic);
extern "C" void Cuda_FixNHCuda_nve_v(cuda_shared_data* sdata, int groupbit, int mynlocal);
extern "C" void Cuda_FixNHCuda_nve_x(cuda_shared_data* sdata, int groupbit, int mynlocal);
extern "C" void Cuda_FixNHCuda_nve_v_and_nh_v_press_NoBias(cuda_shared_data* sdata, int groupbit, double* factor_h, int mynlocal, int p_triclinic);
diff --git a/lib/cuda/fix_nh_cuda_kernel.cu b/lib/cuda/fix_nh_cuda_kernel.cu
new file mode 100644
index 0000000000..a6a3a52a87
--- /dev/null
+++ b/lib/cuda/fix_nh_cuda_kernel.cu
@@ -0,0 +1,187 @@
/* ----------------------------------------------------------------------
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator

   Original Version:
   http://lammps.sandia.gov, Sandia National Laboratories
   Steve Plimpton, sjplimp@sandia.gov

   See the README file in the top-level LAMMPS directory.
+ + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +static inline __device__ void check_distance(X_FLOAT &xtmp,X_FLOAT &ytmp,X_FLOAT &ztmp,int &i,int groupbit) +{ + if(_dist_check) + { + + X_FLOAT d=X_F(0.0); + if(i<_nlocal) + { + X_FLOAT tmp=xtmp-_xhold[i]; + d=tmp*tmp; + tmp=ytmp-_xhold[i+_maxhold]; + d+=tmp*tmp; + tmp=ztmp-_xhold[i+2*_maxhold]; + d+=tmp*tmp; + + d=((_mask[i] & groupbit))?d:X_F(0.0); + } + if(not __all(d<=_triggerneighsq)) + _reneigh_flag[0]=1; + } +} + +__global__ void FixNHCuda_nh_v_press_Kernel(int groupbit, F_FLOAT3 factor,int p_triclinic,F_FLOAT3 factor2) +{ + int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x; + if(i < _nlocal && _mask[i] & groupbit) + { + V_FLOAT* my_v = _v + i; + V_FLOAT vx=my_v[0]; + V_FLOAT vy=my_v[_nmax]; + V_FLOAT vz=my_v[2*_nmax]; + vx*=factor.x; + vy*=factor.y; + vz*=factor.z; + if(p_triclinic) { + vx += vy*factor2.z + vz*factor2.y; + vy += vz*factor2.x; + } + vx*=factor.x; + vy*=factor.y; + vz*=factor.z; + my_v[0] = vx; + my_v[_nmax] = vy; + my_v[2*_nmax] = vz; + } + +} + +__global__ void FixNHCuda_nh_v_temp_Kernel(int groupbit, F_FLOAT factor_eta) +{ + int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x; + if(i < _nlocal && _mask[i] & groupbit) + { + V_FLOAT* my_v = _v + i; + my_v[0]*=factor_eta; + my_v[_nmax]*=factor_eta; + my_v[2*_nmax]*=factor_eta; + } + +} + +__global__ void FixNHCuda_nh_v_press_and_nve_v_NoBias_Kernel(int groupbit, F_FLOAT3 factor,int p_triclinic,F_FLOAT3 factor2) +{ + int 
i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x; + if(i < _nlocal && _mask[i] & groupbit) + { + F_FLOAT* my_f = _f + i; + V_FLOAT* my_v = _v + i; + + V_FLOAT dtfm = _dtf; + if(_rmass_flag) dtfm*= V_F(1.0) / _rmass[i]; + else dtfm*= V_F(1.0) / _mass[_type[i]]; + + V_FLOAT vx=my_v[0]; + V_FLOAT vy=my_v[_nmax]; + V_FLOAT vz=my_v[2*_nmax]; + vx*=factor.x; + vy*=factor.y; + vz*=factor.z; + if(p_triclinic) { + vx += vy*factor2.z + vz*factor2.y; + vy += vz*factor2.x; + } + vx*=factor.x; + vy*=factor.y; + vz*=factor.z; + my_v[0] = vx + dtfm * my_f[0]; + my_v[_nmax] = vy + dtfm * my_f[_nmax]; + my_v[2*_nmax] = vz + dtfm * my_f[_nmax*2]; + } + +} + +__global__ void FixNHCuda_nve_v_Kernel(int groupbit) +{ + + int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x; + if(i < _nlocal && _mask[i] & groupbit) + { + F_FLOAT* my_f = _f + i; + V_FLOAT* my_v = _v + i; + + V_FLOAT dtfm = _dtf; + if(_rmass_flag) dtfm*=V_F(1.0) / _rmass[i]; + else dtfm*=V_F(1.0) / _mass[_type[i]]; + + *my_v = (*my_v + dtfm*(*my_f)); my_f += _nmax; my_v += _nmax; + *my_v = (*my_v + dtfm*(*my_f)); my_f += _nmax; my_v += _nmax; + *my_v = (*my_v + dtfm*(*my_f)); + } +} + +__global__ void FixNHCuda_nve_x_Kernel(int groupbit) +{ + X_FLOAT xtmp,ytmp,ztmp; + + int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x; + if(i < _nlocal && _mask[i] & groupbit) + { + V_FLOAT* my_v = _v + i; + X_FLOAT* my_x = _x + i; + + xtmp = *my_x += _dtv * *my_v; my_v += _nmax; my_x += _nmax; + ytmp = *my_x += _dtv * *my_v; my_v += _nmax; my_x += _nmax; + ztmp = *my_x += _dtv * *my_v; + } + check_distance(xtmp,ytmp,ztmp,i,groupbit); +} + + +__global__ void FixNHCuda_nve_v_and_nh_v_press_NoBias_Kernel(int groupbit, F_FLOAT3 factor,int p_triclinic,F_FLOAT3 factor2) +{ + + int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x; + if(i < _nlocal && _mask[i] & groupbit) + { + F_FLOAT* my_f = _f + i; + V_FLOAT* my_v = _v + i; + + V_FLOAT dtfm = _dtf; + if(_rmass_flag) dtfm*=V_F(1.0) / _rmass[i]; + else 
dtfm*=V_F(1.0) / _mass[_type[i]]; + + V_FLOAT vx = my_v[0] + dtfm*my_f[0]; + V_FLOAT vy = my_v[_nmax] + dtfm*my_f[_nmax]; + V_FLOAT vz = my_v[2*_nmax] + dtfm*my_f[2*_nmax]; + + vx*=factor.x; + vy*=factor.y; + vz*=factor.z; + if(p_triclinic) { + vx += vy*factor2.z + vz*factor2.y; + vy += vz*factor2.x; + } + vx*=factor.x; + vy*=factor.y; + vz*=factor.z; + my_v[0] = vx; + my_v[_nmax] = vy; + my_v[2*_nmax] = vz; + + } +} + diff --git a/lib/cuda/fix_nve_cuda.cu b/lib/cuda/fix_nve_cuda.cu new file mode 100644 index 0000000000..624292431d --- /dev/null +++ b/lib/cuda/fix_nve_cuda.cu @@ -0,0 +1,162 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
------------------------------------------------------------------------- */

#include <stdio.h>
#define MY_PREFIX fix_nve_cuda
#define IncludeCommonNeigh
#include "cuda_shared.h"
#include "cuda_common.h"
#include "crm_cuda_utils.cu"
#include "fix_nve_cuda_cu.h"
#include "fix_nve_cuda_kernel.cu"

// Refresh the __constant__ copies of the per-atom device-array pointers and
// sizes after LAMMPS reallocated its per-atom data (atom.nmax changed).
void Cuda_FixNVECuda_UpdateNmax(cuda_shared_data* sdata)
{
  #ifdef CUDA_USE_BINNING

  // NOTE(review): this branch arrived with an unbalanced '}' (the patch lost
  // text here); the stray brace was removed to rebalance the function —
  // confirm against upstream. Branch is only compiled with CUDA_USE_BINNING.
  cudaMemcpyToSymbol(MY_CONST(bin_count_all)  , & sdata->atom.bin_count_all  .dev_data, sizeof(unsigned*));
  cudaMemcpyToSymbol(MY_CONST(bin_count_local), & sdata->atom.bin_count_local.dev_data, sizeof(unsigned*));
  cudaMemcpyToSymbol(MY_CONST(bin_dim)        ,   sdata->domain.bin_dim                , sizeof(int) * 3  );
  cudaMemcpyToSymbol(MY_CONST(binned_f)       , & sdata->atom.binned_f    .dev_data, sizeof(F_FLOAT*));
  cudaMemcpyToSymbol(MY_CONST(binned_type)    , & sdata->atom.binned_type .dev_data, sizeof(int*)    );
  cudaMemcpyToSymbol(MY_CONST(binned_v)       , & sdata->atom.binned_v    .dev_data, sizeof(V_FLOAT*));
  cudaMemcpyToSymbol(MY_CONST(binned_x)       , & sdata->atom.binned_x    .dev_data, sizeof(X_FLOAT*));
  cudaMemcpyToSymbol(MY_CONST(binned_rmass)   , & sdata->atom.binned_rmass.dev_data, sizeof(V_FLOAT*));
  cudaMemcpyToSymbol(MY_CONST(mask)           , & sdata->atom.mask .dev_data, sizeof(int*)    );
  cudaMemcpyToSymbol(MY_CONST(rmass)          , & sdata->atom.rmass.dev_data, sizeof(V_FLOAT*));

  #else

  cudaMemcpyToSymbol(MY_CONST(f)      , & sdata->atom.f    .dev_data, sizeof(F_FLOAT*));
  cudaMemcpyToSymbol(MY_CONST(mask)   , & sdata->atom.mask .dev_data, sizeof(int*)    );
  cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal         , sizeof(int)    );
  cudaMemcpyToSymbol(MY_CONST(nmax)   , & sdata->atom.nmax           , sizeof(int)    );
  cudaMemcpyToSymbol(MY_CONST(rmass)  , & sdata->atom.rmass.dev_data, sizeof(V_FLOAT*));
  cudaMemcpyToSymbol(MY_CONST(type)   , & sdata->atom.type .dev_data, sizeof(int*)    );
  cudaMemcpyToSymbol(MY_CONST(v)      , & sdata->atom.v    .dev_data, sizeof(V_FLOAT*));
  cudaMemcpyToSymbol(MY_CONST(x)      , & sdata->atom.x    .dev_data, sizeof(X_FLOAT*));
  cudaMemcpyToSymbol(MY_CONST(xhold)  , & sdata->atom.xhold.dev_data, sizeof(X_FLOAT*)); //might be moved to a neighbor record in sdata
  cudaMemcpyToSymbol(MY_CONST(maxhold), & sdata->atom.maxhold        , sizeof(int)    ); //might be moved to a neighbor record in sdata
  cudaMemcpyToSymbol(MY_CONST(reneigh_flag), & sdata->buffer, sizeof(int*));             //might be moved to a neighbor record in sdata
  cudaMemcpyToSymbol(MY_CONST(triggerneighsq), & sdata->atom.triggerneighsq, sizeof(X_FLOAT)); //might be moved to a neighbor record in sdata

  #endif
}

// (Re)allocate the small device communication buffer (reneighboring flag)
// and publish its address to the kernels.
// NOTE(review): size comparison and first MYDBG message reconstructed after
// patch corruption — confirm against upstream.
void Cuda_FixNVECuda_UpdateBuffer(cuda_shared_data* sdata)
{
  int size = (unsigned)10 * sizeof(int);
  if(sdata->buffersize < size)
  {
    MYDBG(printf("Resize buffer at %p with %i kB\n", sdata->buffer, sdata->buffersize);)
    CudaWrapper_FreeCudaData(sdata->buffer, sdata->buffersize);
    sdata->buffer = CudaWrapper_AllocCudaData(size);
    sdata->buffersize = size;
    sdata->buffer_new++;
    MYDBG(printf("New buffer at %p with %i kB\n", sdata->buffer, sdata->buffersize);)
  }
  cudaMemcpyToSymbol(MY_CONST(buffer)      , & sdata->buffer, sizeof(int*));
  cudaMemcpyToSymbol(MY_CONST(reneigh_flag), & sdata->buffer, sizeof(int*)); //might be moved to a neighbor record in sdata
}

// One-time initialization: upload timestep constants and flags.
void Cuda_FixNVECuda_Init(cuda_shared_data* sdata, X_FLOAT dtv, V_FLOAT dtf)
{
  if(sdata->atom.mass_host)
    cudaMemcpyToSymbol(MY_CONST(mass)    , & sdata->atom.mass.dev_data  , sizeof(V_FLOAT*));
  cudaMemcpyToSymbol(MY_CONST(dtf)       , & dtf                        , sizeof(V_FLOAT) );
  cudaMemcpyToSymbol(MY_CONST(dtv)       , & dtv                        , sizeof(X_FLOAT) );
  cudaMemcpyToSymbol(MY_CONST(triggerneighsq), & sdata->atom.triggerneighsq, sizeof(X_FLOAT));
  cudaMemcpyToSymbol(MY_CONST(dist_check), & sdata->atom.dist_check     , sizeof(int)     );
  cudaMemcpyToSymbol(MY_CONST(rmass_flag), & sdata->atom.rmass_flag     , sizeof(int)     );
  Cuda_FixNVECuda_UpdateNmax(sdata);
}

// First half of velocity-Verlet: v += dtf/m * f, then x += dtv * v.
// Also retrieves the device-side reneighboring flag.
// mynlocal can be nfirst if firstgroup==igroup, see the cpp fix.
void Cuda_FixNVECuda_InitialIntegrate(cuda_shared_data* sdata, int groupbit, int mynlocal)
{
  if(sdata->atom.update_nmax)
    Cuda_FixNVECuda_UpdateNmax(sdata);
  if(sdata->atom.update_nlocal)
    cudaMemcpyToSymbol(MY_CONST(nlocal), & sdata->atom.nlocal, sizeof(int));
  if(sdata->buffer_new)
    Cuda_FixNVECuda_UpdateBuffer(sdata);

  #ifdef CUDA_USE_BINNING

  dim3 grid(sdata->domain.bin_dim[0], sdata->domain.bin_dim[1] * sdata->domain.bin_neighbors[2], 1);
  dim3 threads(sdata->domain.bin_nmax, 1, 1);
  // NOTE(review): the original launched "FixNVECuda_InitialIntegrate_N_Kernel",
  // which is not defined in fix_nve_cuda_kernel.cu; renamed to the defined
  // kernel (branch is dead unless CUDA_USE_BINNING is set) — confirm upstream.
  FixNVECuda_InitialIntegrate_Kernel<<<grid, threads>>>(groupbit);
  cudaThreadSynchronize();
  CUT_CHECK_ERROR("Cuda_FixNVECuda_InitialIntegrate_N: fix nve initial integrate (binning) Kernel execution failed");

  #else

  int3 layout = getgrid(mynlocal);
  dim3 threads(layout.z, 1, 1);
  dim3 grid(layout.x, layout.y, 1);
  cudaMemset(sdata->buffer, 0, sizeof(int));  // clear the reneighboring flag
  FixNVECuda_InitialIntegrate_Kernel<<<grid, threads>>>(groupbit);
  cudaThreadSynchronize();
  int reneigh_flag;
  cudaMemcpy((void*)(&reneigh_flag), sdata->buffer, sizeof(int), cudaMemcpyDeviceToHost);
  sdata->atom.reneigh_flag += reneigh_flag;
  CUT_CHECK_ERROR("Cuda_FixNVECuda_InitialIntegrate_N: fix nve initial integrate Kernel execution failed");

  #endif
}

// Second half of velocity-Verlet: v += dtf/m * f.
// mynlocal can be nfirst if firstgroup==igroup, see the cpp fix.
void Cuda_FixNVECuda_FinalIntegrate(cuda_shared_data* sdata, int groupbit, int mynlocal)
{
  if(sdata->atom.update_nmax)
    Cuda_FixNVECuda_UpdateNmax(sdata);
  if(sdata->atom.update_nlocal)
    cudaMemcpyToSymbol(MY_CONST(nlocal), & sdata->atom.nlocal, sizeof(int));
  if(sdata->buffer_new)
    Cuda_FixNVECuda_UpdateBuffer(sdata);

  #ifdef CUDA_USE_BINNING

  dim3 grid(sdata->domain.bin_dim[0], sdata->domain.bin_dim[1] * sdata->domain.bin_neighbors[2], 1);
  dim3 threads(sdata->domain.bin_nmax, 1, 1);
  FixNVECuda_FinalIntegrate_Kernel<<<grid, threads>>>(groupbit);
  cudaThreadSynchronize();
  CUT_CHECK_ERROR("Cuda_FixNVECuda_FinalIntegrate: fix nve final integrate (binning) Kernel execution failed");

  #else

  int3 layout = getgrid(mynlocal);
  dim3 threads(layout.z, 1, 1);
  dim3 grid(layout.x, layout.y, 1);
  FixNVECuda_FinalIntegrate_Kernel<<<grid, threads>>>(groupbit);
  cudaThreadSynchronize();
  CUT_CHECK_ERROR("Cuda_FixNVECuda_FinalIntegrate: fix nve final integrate Kernel execution failed");

  #endif
}

diff --git a/lib/cuda/fix_nve_cuda_cu.h b/lib/cuda/fix_nve_cuda_cu.h
new file mode 100644
index 0000000000..93cabe8d8b
--- /dev/null
+++ b/lib/cuda/fix_nve_cuda_cu.h
@@ -0,0 +1,28 @@
/* ----------------------------------------------------------------------
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator

   Original Version:
   http://lammps.sandia.gov, Sandia National Laboratories
   Steve Plimpton, sjplimp@sandia.gov

   See the README file in the top-level LAMMPS directory.

   -----------------------------------------------------------------------

   USER-CUDA Package and associated modifications:
   https://sourceforge.net/projects/lammpscuda/

   Christian Trott, christian.trott@tu-ilmenau.de
   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
   Theoretical Physics II, University of Technology Ilmenau, Germany

   See the README file in the USER-CUDA directory.

   This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */

#include "cuda_shared.h"

// Host-side entry points of the fix nve CUDA wrappers.
// mynlocal can be nfirst if firstgroup==igroup, see the cpp fix.
extern "C" void Cuda_FixNVECuda_Init(cuda_shared_data* sdata, X_FLOAT dtv, V_FLOAT dtf);
extern "C" void Cuda_FixNVECuda_InitialIntegrate(cuda_shared_data* sdata, int groupbit, int mynlocal);
extern "C" void Cuda_FixNVECuda_FinalIntegrate(cuda_shared_data* sdata, int groupbit, int mynlocal);
diff --git a/lib/cuda/fix_nve_cuda_kernel.cu b/lib/cuda/fix_nve_cuda_kernel.cu
new file mode 100644
index 0000000000..84f59fb307
--- /dev/null
+++ b/lib/cuda/fix_nve_cuda_kernel.cu
@@ -0,0 +1,137 @@
/* ----------------------------------------------------------------------
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator

   Original Version:
   http://lammps.sandia.gov, Sandia National Laboratories
   Steve Plimpton, sjplimp@sandia.gov

   See the README file in the top-level LAMMPS directory.

   -----------------------------------------------------------------------

   USER-CUDA Package and associated modifications:
   https://sourceforge.net/projects/lammpscuda/

   Christian Trott, christian.trott@tu-ilmenau.de
   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
   Theoretical Physics II, University of Technology Ilmenau, Germany

   See the README file in the USER-CUDA directory.

   This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */

// Per-thread displacement check since the last neighbor-list build.
// FIX: the original read _xhold[i] unconditionally, i.e. also for threads
// with i >= _nlocal (the padding threads of the last block), which is an
// out-of-range read; the reads are now guarded exactly as in the fix_nh
// variant of this helper — masked-out threads contribute d = 0 as before.
// NOTE(review): legacy mask-less __all() warp vote, valid only on the
// pre-Volta targets of this package (sm_13/sm_20/sm_21).
static inline __device__ void check_distance(X_FLOAT &xtmp, X_FLOAT &ytmp, X_FLOAT &ztmp, int &i, int groupbit)
{
  if(_dist_check)
  {
    X_FLOAT d = X_F(0.0);
    if((i < _nlocal) && (_mask[i] & groupbit))
    {
      X_FLOAT tmp = xtmp - _xhold[i];
      d = tmp * tmp;
      tmp = ytmp - _xhold[i + _maxhold];
      d += tmp * tmp;
      tmp = ztmp - _xhold[i + 2 * _maxhold];
      d += tmp * tmp;
    }

    if(not __all(d <= _triggerneighsq))
      _reneigh_flag[0] = 1;
  }
}

// First half of velocity-Verlet: v += dtf/m * f, then x += dtv * v,
// followed by the warp-wide displacement vote.
__global__ void FixNVECuda_InitialIntegrate_Kernel(int groupbit)
{
  X_FLOAT xtmp, ytmp, ztmp;
  #ifdef CUDA_USE_BINNING

  const unsigned bin = gridDim.y * blockIdx.x + blockIdx.y;
  if(threadIdx.x < _bin_count_local[bin])
  {
    const int i = 3 * blockDim.x * bin + threadIdx.x;
    if(_mask[i] & groupbit)
    {
      F_FLOAT* my_f = _binned_f + i;
      V_FLOAT* my_v = _binned_v + i;
      X_FLOAT* my_x = _binned_x + i;

      V_FLOAT dtfm = _dtf;  // FIX: missing ';' in original (syntax error in this branch)
      if(_rmass_flag) dtfm *= V_F(1.0) / _binned_rmass[i];
      else            dtfm *= V_F(1.0) / _mass[_binned_type[blockDim.x * bin + threadIdx.x]];

      V_FLOAT v_mem;
      v_mem = *my_v += dtfm * (*my_f); xtmp = *my_x += _dtv * v_mem; my_f += blockDim.x; my_v += blockDim.x; my_x += blockDim.x;
      v_mem = *my_v += dtfm * (*my_f); ytmp = *my_x += _dtv * v_mem; my_f += blockDim.x; my_v += blockDim.x; my_x += blockDim.x;
      v_mem = *my_v += dtfm * (*my_f); ztmp = *my_x += _dtv * v_mem;
    }
  }

  #else

  int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
  if(i < _nlocal && _mask[i] & groupbit)
  {
    F_FLOAT* my_f = _f + i;  // SoA: components at offsets 0, _nmax, 2*_nmax
    V_FLOAT* my_v = _v + i;
    X_FLOAT* my_x = _x + i;

    V_FLOAT dtfm = _dtf;
    if(_rmass_flag) dtfm *= V_F(1.0) / _rmass[i];
    else            dtfm *= V_F(1.0) / _mass[_type[i]];

    V_FLOAT v_mem;
    v_mem = *my_v += dtfm * (*my_f); xtmp = *my_x += _dtv * v_mem; my_f += _nmax; my_v += _nmax; my_x += _nmax;
    v_mem = *my_v += dtfm * (*my_f); ytmp = *my_x += _dtv * v_mem; my_f += _nmax; my_v += _nmax; my_x += _nmax;
    v_mem = *my_v += dtfm * (*my_f); ztmp = *my_x += _dtv * v_mem;
  }

  #endif

  // xtmp/ytmp/ztmp stay uninitialized for masked-out threads; check_distance
  // ignores them for those threads (their d is forced to 0).
  check_distance(xtmp, ytmp, ztmp, i, groupbit);
}

// Second half of velocity-Verlet: v += dtf/m * f.
__global__ void FixNVECuda_FinalIntegrate_Kernel(int groupbit)
{
  #ifdef CUDA_USE_BINNING

  const unsigned bin = gridDim.y * blockIdx.x + blockIdx.y;
  if(threadIdx.x < _bin_count_local[bin])
  {
    const int i = 3 * blockDim.x * bin + threadIdx.x;
    if(_mask[i] & groupbit)
    {
      F_FLOAT* my_f = _binned_f + i;
      V_FLOAT* my_v = _binned_v + i;

      V_FLOAT dtfm = _dtf;  // FIX: missing ';' in original (syntax error in this branch)
      if(_rmass_flag) dtfm *= V_F(1.0) / _binned_rmass[i];
      else            dtfm *= V_F(1.0) / _mass[_binned_type[blockDim.x * bin + threadIdx.x]];

      *my_v += dtfm * (*my_f); my_f += blockDim.x; my_v += blockDim.x;
      *my_v += dtfm * (*my_f); my_f += blockDim.x; my_v += blockDim.x;
      *my_v += dtfm * (*my_f);
    }
  }

  #else

  int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
  if(i < _nlocal && _mask[i] & groupbit)
  {
    F_FLOAT* my_f = _f + i;
    V_FLOAT* my_v = _v + i;

    V_FLOAT dtfm = _dtf;
    if(_rmass_flag) dtfm *= V_F(1.0) / _rmass[i];
    else            dtfm *= V_F(1.0) / _mass[_type[i]];

    *my_v += dtfm * (*my_f); my_f += _nmax; my_v += _nmax;
    *my_v += dtfm * (*my_f); my_f += _nmax; my_v += _nmax;
    *my_v += dtfm * (*my_f);
  }

  #endif
}

diff --git a/lib/cuda/fix_set_force_cuda.cu b/lib/cuda/fix_set_force_cuda.cu
new file mode 100644
index 0000000000..6d0f2fde66
--- /dev/null
+++ b/lib/cuda/fix_set_force_cuda.cu
@@ -0,0 +1,93 @@
/* ----------------------------------------------------------------------
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator

   Original Version:
   http://lammps.sandia.gov, Sandia National Laboratories
   Steve Plimpton, sjplimp@sandia.gov

   See the README file in the top-level LAMMPS directory.
+ + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#include +#define MY_PREFIX fix_set_force_cuda +#include "cuda_shared.h" +#include "cuda_common.h" +#include "crm_cuda_utils.cu" + +#include "fix_set_force_cuda_cu.h" +#include "fix_set_force_cuda_kernel.cu" + +void Cuda_FixSetForceCuda_UpdateBuffer(cuda_shared_data* sdata) +{ + int3 layout=getgrid(sdata->atom.nlocal,3*sizeof(F_FLOAT)); + dim3 threads(layout.z, 1, 1); + dim3 grid(layout.x, layout.y, 1); + int size=(unsigned)(layout.z*layout.y*layout.x)*3*sizeof(F_FLOAT); + if(sdata->buffersizebuffer,sdata->buffersize);) + CudaWrapper_FreeCudaData(sdata->buffer,sdata->buffersize); + sdata->buffer = CudaWrapper_AllocCudaData(size); + sdata->buffersize=size; + sdata->buffer_new++; + MYDBG(printf("New buffer at %p with %i kB\n",sdata->buffer,sdata->buffersize);) + + } + cudaMemcpyToSymbol(MY_CONST(buffer) , & sdata->buffer, sizeof(int*) ); +} + +void Cuda_FixSetForceCuda_UpdateNmax(cuda_shared_data* sdata) +{ + cudaMemcpyToSymbol(MY_CONST(mask) , & sdata->atom.mask .dev_data, sizeof(int*) ); + cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); + cudaMemcpyToSymbol(MY_CONST(nmax) , & sdata->atom.nmax , sizeof(int) ); + cudaMemcpyToSymbol(MY_CONST(x) , & sdata->atom.x .dev_data, sizeof(X_FLOAT*) ); + cudaMemcpyToSymbol(MY_CONST(f) , & sdata->atom.f .dev_data, sizeof(F_FLOAT*) ); +} + +void Cuda_FixSetForceCuda_Init(cuda_shared_data* sdata) +{ + Cuda_FixSetForceCuda_UpdateNmax(sdata); + +} + + +void 
Cuda_FixSetForceCuda_PostForce(cuda_shared_data* sdata, int groupbit,F_FLOAT xvalue,F_FLOAT yvalue,F_FLOAT zvalue,F_FLOAT* foriginal,int flagx,int flagy,int flagz) +{ + if(sdata->atom.update_nmax) + Cuda_FixSetForceCuda_UpdateNmax(sdata); + if(sdata->atom.update_nlocal) + cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); + if(sdata->buffer_new) + Cuda_FixSetForceCuda_UpdateBuffer(sdata); + + + int3 layout=getgrid(sdata->atom.nlocal,3*sizeof(F_FLOAT)); + dim3 threads(layout.z, 1, 1); + dim3 grid(layout.x, layout.y, 1); + + Cuda_FixSetForceCuda_PostForce_Kernel<<>> (groupbit,xvalue,yvalue,zvalue,flagx,flagy,flagz); + cudaThreadSynchronize(); + CUT_CHECK_ERROR("Cuda_Cuda_FixSetForceCuda_PostForce: fix add_force post_force compute Kernel execution failed"); + + int oldgrid=grid.x; + grid.x=3; + threads.x=512; + Cuda_FixSetForceCuda_Reduce_FOriginal<<>> (oldgrid,foriginal); + cudaThreadSynchronize(); + CUT_CHECK_ERROR("Cuda_Cuda_FixSetForceCuda_PostForce: fix add_force post_force reduce Kernel execution failed"); + +} diff --git a/lib/cuda/fix_set_force_cuda_cu.h b/lib/cuda/fix_set_force_cuda_cu.h new file mode 100644 index 0000000000..3121a684ad --- /dev/null +++ b/lib/cuda/fix_set_force_cuda_cu.h @@ -0,0 +1,27 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. 
+ + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#include "cuda_shared.h" + +extern "C" void Cuda_FixSetForceCuda_Init(cuda_shared_data* sdata); +extern "C" void Cuda_FixSetForceCuda_PostForce(cuda_shared_data* sdata, int groupbit,F_FLOAT xvalue,F_FLOAT yvalue,F_FLOAT zvalue,F_FLOAT* foriginal,int flagx,int flagy,int flagz); diff --git a/lib/cuda/fix_set_force_cuda_kernel.cu b/lib/cuda/fix_set_force_cuda_kernel.cu new file mode 100644 index 0000000000..f5836dee5f --- /dev/null +++ b/lib/cuda/fix_set_force_cuda_kernel.cu @@ -0,0 +1,79 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ + +extern __shared__ F_FLOAT sharedmem[]; + + +__global__ void Cuda_FixSetForceCuda_PostForce_Kernel(int groupbit,F_FLOAT xvalue,F_FLOAT yvalue,F_FLOAT zvalue,int flagx,int flagy,int flagz) +{ + int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x; + sharedmem[threadIdx.x]=0; + sharedmem[threadIdx.x+blockDim.x]=0; + sharedmem[threadIdx.x+2*blockDim.x]=0; + if(i < _nlocal) + if (_mask[i] & groupbit) { + sharedmem[threadIdx.x]=_f[i]; + sharedmem[threadIdx.x+blockDim.x]=_f[i+1*_nmax]; + sharedmem[threadIdx.x+2*blockDim.x]=_f[i+2*_nmax]; + + if(flagx) _f[i] = xvalue; + if(flagy) _f[i+1*_nmax] = yvalue; + if(flagz) _f[i+2*_nmax] = zvalue; + } + + + reduceBlock(sharedmem); + reduceBlock(&sharedmem[blockDim.x]); + reduceBlock(&sharedmem[2*blockDim.x]); + F_FLOAT* buffer=(F_FLOAT*)_buffer; + if(threadIdx.x==0) + { + buffer[blockIdx.x*gridDim.y+blockIdx.y]=sharedmem[0]; + buffer[blockIdx.x*gridDim.y+blockIdx.y+gridDim.x*gridDim.y]=sharedmem[blockDim.x]; + buffer[blockIdx.x*gridDim.y+blockIdx.y+2*gridDim.x*gridDim.y]=sharedmem[2*blockDim.x]; + } +} + + +__global__ void Cuda_FixSetForceCuda_Reduce_FOriginal(int n,F_FLOAT* foriginal) +{ + int i=0; + sharedmem[threadIdx.x]=0; + F_FLOAT myforig=0.0; + F_FLOAT* buf=(F_FLOAT*)_buffer; + buf=&buf[blockIdx.x*n]; + while(i +#define MY_PREFIX fix_shake_cuda +#include "cuda_shared.h" +#include "cuda_common.h" +#include "crm_cuda_utils.cu" +#include "fix_shake_cuda_cu.h" +#include "cuda_pair_virial_kernel_nc.cu" + +#define _shake_atom MY_AP(shake_atom) +#define _shake_type MY_AP(shake_type) +#define _shake_flag MY_AP(shake_flag) +#define _xshake MY_AP(xshake) +#define _dtfsq MY_AP(dtfsq) +#define _bond_distance MY_AP(bond_distance) +#define _angle_distance MY_AP(angle_distance) +#define _max_iter MY_AP(max_iter) +#define _tolerance MY_AP(tolerance) +__device__ __constant__ int* _shake_atom; +__device__ __constant__ int* _shake_type; +__device__ 
// Remaining constant-memory declarations for the SHAKE kernels, then the host
// wrappers of lib/cuda/fix_shake_cuda.cu. The Update* helpers re-push device
// pointers / scalars into constant memory after host-side reallocation.
__constant__ int* _shake_flag; +__device__ __constant__ X_FLOAT3* _xshake; +__device__ __constant__ F_FLOAT _dtfsq; +__device__ __constant__ X_FLOAT* _bond_distance; +__device__ __constant__ X_FLOAT* _angle_distance; +__device__ __constant__ int _max_iter; +__device__ __constant__ X_FLOAT _tolerance; + +#include "fix_shake_cuda_kernel.cu" + +void Cuda_FixShakeCuda_UpdateNmax(cuda_shared_data* sdata) +{ + cudaMemcpyToSymbol(MY_CONST(x) , & sdata->atom.x .dev_data, sizeof(X_FLOAT*) ); + cudaMemcpyToSymbol(MY_CONST(v) , & sdata->atom.v .dev_data, sizeof(V_FLOAT*) ); + cudaMemcpyToSymbol(MY_CONST(f) , & sdata->atom.f .dev_data, sizeof(F_FLOAT*) ); + cudaMemcpyToSymbol(MY_CONST(mask) , & sdata->atom.mask .dev_data, sizeof(int*) ); + cudaMemcpyToSymbol(MY_CONST(tag) , & sdata->atom.tag .dev_data, sizeof(int*) ); + cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); + cudaMemcpyToSymbol(MY_CONST(nmax) , & sdata->atom.nmax , sizeof(int) ); + cudaMemcpyToSymbol(MY_CONST(rmass) , & sdata->atom.rmass.dev_data, sizeof(V_FLOAT*) ); + cudaMemcpyToSymbol(MY_CONST(type) , & sdata->atom.type .dev_data, sizeof(int*) ); + cudaMemcpyToSymbol(MY_CONST(map_array), & sdata->atom.map_array .dev_data, sizeof(int*) ); + cudaMemcpyToSymbol(MY_CONST(vatom) , & sdata->atom.vatom.dev_data, sizeof(ENERGY_FLOAT*) ); + cudaMemcpyToSymbol(MY_CONST(debugdata), & sdata->debugdata , sizeof(int*) ); +} + +void Cuda_FixShakeCuda_UpdateDomain(cuda_shared_data* sdata) +{ + cudaMemcpyToSymbol(MY_CONST(periodicity), sdata->domain.periodicity , sizeof(int)*3 ); + cudaMemcpyToSymbol(MY_CONST(prd) , sdata->domain.prd , sizeof(X_FLOAT)*3 ); + cudaMemcpyToSymbol(MY_CONST(triclinic) , &sdata->domain.triclinic , sizeof(int) ); + cudaMemcpyToSymbol(MY_CONST(h) , sdata->domain.h , sizeof(X_FLOAT)*6 ); +} + +void Cuda_FixShakeCuda_UpdateBuffer(cuda_shared_data* sdata,int size) +{ + if(sdata->buffersizebuffer,sdata->buffersize);) + CudaWrapper_FreeCudaData(sdata->buffer,sdata->buffersize); + 
// NOTE(review): the UpdateBuffer condition above is garbled by extraction --
// "if(sdata->buffersizebuffer,sdata->buffersize);)" was presumably
// "if(sdata->buffersize<size) { MYDBG(printf(...));" with the text after '<'
// eaten. Grow-only realloc of the shared host<->device staging buffer follows.
// Cuda_FixShakeCuda_Init copies all SHAKE parameters (cluster tables, timestep
// factors, constraint distances, iteration limit/tolerance) into constant memory.
// NOTE(review): MY_CONST(flag) is copied twice in Init (second copy redundant).
// CAUTION: this original text embeds "//" markers mid-line ("; // + +" below);
// left byte-identical.
sdata->buffer = CudaWrapper_AllocCudaData(size); + sdata->buffersize=size; + sdata->buffer_new++; + MYDBG(printf("New buffer at %p with %i kB\n",sdata->buffer,sdata->buffersize);) + + } + cudaMemcpyToSymbol(MY_CONST(buffer) , & sdata->buffer, sizeof(int*) ); +} + +void Cuda_FixShakeCuda_Init(cuda_shared_data* sdata,X_FLOAT dtv, F_FLOAT dtfsq, + void* shake_flag,void* shake_atom,void* shake_type, void* xshake, + void* bond_distance,void* angle_distance,void* virial, + int max_iter,X_FLOAT tolerance) +{ + Cuda_FixShakeCuda_UpdateNmax(sdata); + Cuda_FixShakeCuda_UpdateDomain(sdata); + cudaMemcpyToSymbol(MY_CONST(shake_atom) , & shake_atom , sizeof(void*) ); + cudaMemcpyToSymbol(MY_CONST(shake_type) , & shake_type , sizeof(void*) ); + cudaMemcpyToSymbol(MY_CONST(shake_flag) , & shake_flag , sizeof(void*) ); + cudaMemcpyToSymbol(MY_CONST(xshake) , & xshake , sizeof(void*) ); + cudaMemcpyToSymbol(MY_CONST(dtv) , & dtv , sizeof(X_FLOAT)); + cudaMemcpyToSymbol(MY_CONST(dtfsq) , & dtfsq , sizeof(F_FLOAT)); + cudaMemcpyToSymbol(MY_CONST(bond_distance) , & bond_distance , sizeof(void*) ); + cudaMemcpyToSymbol(MY_CONST(angle_distance) , & angle_distance , sizeof(void*) ); + cudaMemcpyToSymbol(MY_CONST(virial) , & virial , sizeof(void*) ); + cudaMemcpyToSymbol(MY_CONST(flag) , &sdata->flag , sizeof(int*) ); + cudaMemcpyToSymbol(MY_CONST(max_iter) , &max_iter , sizeof(int) ); + cudaMemcpyToSymbol(MY_CONST(tolerance) , &tolerance , sizeof(X_FLOAT)); + + if(sdata->atom.mass_host) + cudaMemcpyToSymbol(MY_CONST(mass),& sdata->atom.mass.dev_data , sizeof(V_FLOAT*) ); + cudaMemcpyToSymbol(MY_CONST(rmass_flag), & sdata->atom.rmass_flag , sizeof(int) ); // + + cudaMemcpyToSymbol(MY_CONST(flag) , &sdata->flag, sizeof(int*)); + +} + +void Cuda_FixShakeCuda_UnconstrainedUpdate(cuda_shared_data* sdata) +{ + if(sdata->atom.update_nmax) + Cuda_FixShakeCuda_UpdateNmax(sdata); + if(sdata->atom.update_nlocal) + cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int)); + 
// NOTE(review): every kernel launch in this patch appears as "<<>>" -- the
// launch configuration (e.g. "<<<grid, threads>>>", plus a dynamic shared-memory
// size for the Shake kernel) was eaten by the '<'-swallowing extraction bug.
// Also: cudaThreadSynchronize() is deprecated in modern CUDA in favor of
// cudaDeviceSynchronize() -- kept as-is since the patch targets old toolkits.
// Cuda_FixShakeCuda_Shake launches the constraint kernel and, when vflag is set,
// reduces the per-block virial partials with PairVirialCompute_reduce.
if(sdata->buffer_new) + Cuda_FixShakeCuda_UpdateBuffer(sdata,10*sizeof(double)); + int3 layout=getgrid(sdata->atom.nlocal); + dim3 threads(layout.z, 1, 1); + dim3 grid(layout.x, layout.y, 1); + + FixShakeCuda_UnconstrainedUpdate_Kernel<<>> (); + cudaThreadSynchronize(); + CUT_CHECK_ERROR("FixShakeCuda_UnconstrainedUpdate: Kernel execution failed"); +} + +void Cuda_FixShakeCuda_Shake(cuda_shared_data* sdata,int vflag,int vflag_atom,int* list,int nlist) +{ + if(sdata->atom.update_nmax) + Cuda_FixShakeCuda_UpdateNmax(sdata); + if(sdata->domain.update) + Cuda_FixShakeCuda_UpdateDomain(sdata); + if(sdata->atom.update_nlocal) + cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int)); + int3 layout=getgrid(sdata->atom.nlocal,6*sizeof(ENERGY_FLOAT),64); + dim3 threads(layout.z, 1, 1); + dim3 grid(layout.x, layout.y, 1); + if(sdata->buffer_new) + Cuda_FixShakeCuda_UpdateBuffer(sdata,grid.x*grid.y*6*sizeof(ENERGY_FLOAT)); + + BindXTypeTexture(sdata); + + FixShakeCuda_Shake_Kernel<<>> (vflag,vflag_atom,list,nlist); + cudaThreadSynchronize(); + + CUT_CHECK_ERROR("FixShakeCuda_Shake: Kernel execution failed"); + + if(vflag) + { + int n=grid.x*grid.y; + grid.x=6; + grid.y=1; + threads.x=256; + MY_AP(PairVirialCompute_reduce)<<>>(n); + cudaThreadSynchronize(); + CUT_CHECK_ERROR("Cuda_FixShakeCuda: (no binning) virial compute Kernel execution failed"); + } + +} + +int Cuda_FixShakeCuda_PackComm(cuda_shared_data* sdata,int n,int iswap,void* buf_send,int* pbc,int pbc_flag) +{ + if(sdata->atom.update_nmax) + Cuda_FixShakeCuda_UpdateNmax(sdata); + if(sdata->atom.update_nlocal) + cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); + int size=n*3*sizeof(X_FLOAT); + if(sdata->buffer_new or (size>sdata->buffersize)) + Cuda_FixShakeCuda_UpdateBuffer(sdata,size); + + X_FLOAT dx=0.0; + X_FLOAT dy=0.0; + X_FLOAT dz=0.0; + if (pbc_flag != 0) { + if (sdata->domain.triclinic == 0) { + dx = pbc[0]*sdata->domain.prd[0]; + dy = pbc[1]*sdata->domain.prd[1]; 
// PackComm: packs the predicted coordinates (xshake) of the atoms in the swap's
// send list into the staging buffer, applying the periodic-image shift
// (dx,dy,dz), then copies them to the host send buffer. Returns the number of
// X_FLOAT values packed (3 per atom). The aflag readback reports kernel-side
// error conditions written into sdata->flag.
+ dz = pbc[2]*sdata->domain.prd[2]; + } else { + dx = pbc[0]*sdata->domain.prd[0] + pbc[5]*sdata->domain.xy + pbc[4]*sdata->domain.xz; + dy = pbc[1]*sdata->domain.prd[1] + pbc[3]*sdata->domain.yz; + dz = pbc[2]*sdata->domain.prd[2]; + }} + + int3 layout=getgrid(n); + dim3 threads(layout.z, 1, 1); + dim3 grid(layout.x, layout.y, 1); + + if(sdata->atom.nlocal>0) + { + cudaMemset( sdata->flag,0,sizeof(int)); + FixShakeCuda_PackComm_Kernel<<>>((int*) sdata->comm.sendlist.dev_data,n,sdata->comm.maxlistlength,iswap,dx,dy,dz); + cudaThreadSynchronize(); + cudaMemcpy(buf_send, sdata->buffer, n*3*sizeof(X_FLOAT), cudaMemcpyDeviceToHost); + int aflag; + cudaMemcpy(&aflag, sdata->flag, sizeof(int), cudaMemcpyDeviceToHost); + if(aflag!=0) printf("aflag PackComm: %i\n",aflag); + CUT_CHECK_ERROR("Cuda_FixShakeCuda_PackComm: Kernel execution failed"); + + } + return 3*n; +} + +int Cuda_FixShakeCuda_PackComm_Self(cuda_shared_data* sdata,int n,int iswap,int first,int* pbc,int pbc_flag) +{ + if(sdata->atom.update_nmax) + Cuda_FixShakeCuda_UpdateNmax(sdata); + if(sdata->atom.update_nlocal) + cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); + int size=n*3*sizeof(X_FLOAT); + if(sdata->buffer_new or (size>sdata->buffersize)) + Cuda_FixShakeCuda_UpdateBuffer(sdata,size); + static int count=-1; + count++; + X_FLOAT dx=0.0; + X_FLOAT dy=0.0; + X_FLOAT dz=0.0; + if (pbc_flag != 0) { + if (sdata->domain.triclinic == 0) { + dx = pbc[0]*sdata->domain.prd[0]; + dy = pbc[1]*sdata->domain.prd[1]; + dz = pbc[2]*sdata->domain.prd[2]; + } else { + dx = pbc[0]*sdata->domain.prd[0] + pbc[5]*sdata->domain.xy + pbc[4]*sdata->domain.xz; + dy = pbc[1]*sdata->domain.prd[1] + pbc[3]*sdata->domain.yz; + dz = pbc[2]*sdata->domain.prd[2]; + }} + + + + int3 layout=getgrid(n); + dim3 threads(layout.z, 1, 1); + dim3 grid(layout.x, layout.y, 1); + if(sdata->atom.nlocal>0) + { + FixShakeCuda_PackComm_Self_Kernel<<>>((int*) 
// PackComm_Self: same packing as PackComm but for a processor exchanging with
// itself -- the kernel writes directly into the ghost slots starting at index
// 'first', so no host round-trip is needed.
// NOTE(review): the "static int count" above is incremented but never read --
// dead debug code. Note also the error-message string below says
// "Cuda_CommCuda_PackComm_Self" (copy/paste from commcuda), not FixShake.
// UnpackComm: copies received ghost coordinates host->device and scatters them
// into xshake starting at 'first'.
sdata->comm.sendlist.dev_data,n,sdata->comm.maxlistlength,iswap,dx,dy,dz,first); + cudaThreadSynchronize(); + CUT_CHECK_ERROR("Cuda_CommCuda_PackComm_Self: Kernel execution failed"); + } + + return 3*n; +} + +void Cuda_FixShakeCuda_UnpackComm(cuda_shared_data* sdata,int n,int first,void* buf_recv) +{ + if(sdata->atom.update_nmax) + Cuda_FixShakeCuda_UpdateNmax(sdata); + if(sdata->atom.update_nlocal) + cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); + int size=n*3*sizeof(X_FLOAT); + if(sdata->buffer_new or (size>sdata->buffersize)) + Cuda_FixShakeCuda_UpdateBuffer(sdata,size); + + int3 layout=getgrid(n); + dim3 threads(layout.z, 1, 1); + dim3 grid(layout.x, layout.y, 1); + if(sdata->atom.nlocal>0) + { + cudaMemcpy(sdata->buffer,(void*)buf_recv, n*3*sizeof(X_FLOAT), cudaMemcpyHostToDevice); + FixShakeCuda_UnpackComm_Kernel<<>>(n,first); + cudaThreadSynchronize(); + CUT_CHECK_ERROR("Cuda_FixShakeCuda_UnpackComm: Kernel execution failed"); + + } +} diff --git a/lib/cuda/fix_shake_cuda_cu.h b/lib/cuda/fix_shake_cuda_cu.h new file mode 100644 index 0000000000..b4276b741a --- /dev/null +++ b/lib/cuda/fix_shake_cuda_cu.h @@ -0,0 +1,34 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
// lib/cuda/fix_shake_cuda_cu.h: extern "C" declarations mirroring the host
// wrappers above, followed by the diff header and license banner of
// lib/cuda/fix_shake_cuda_kernel.cu (the device-side SHAKE kernels).
+------------------------------------------------------------------------- */ +#include "cuda_shared.h" + +extern "C" void Cuda_FixShakeCuda_Init(cuda_shared_data* sdata,X_FLOAT dtv, F_FLOAT dtfsq, + void* shake_flag,void* shake_atom,void* shake_type, void* xshake, + void* bond_distance,void* angle_distance,void* virial, + int max_iter,X_FLOAT tolerance); +extern "C" void Cuda_FixShakeCuda_UnconstrainedUpdate(cuda_shared_data* sdata); +extern "C" void Cuda_FixShakeCuda_Shake(cuda_shared_data* sdata,int vflag,int vflag_atom,int* list,int nlist); +extern "C" int Cuda_FixShakeCuda_PackComm(cuda_shared_data* sdata,int n,int iswap,void* buf_send,int* pbc,int pbc_flag); +extern "C" int Cuda_FixShakeCuda_PackComm_Self(cuda_shared_data* sdata,int n,int iswap,int first,int* pbc,int pbc_flag); +extern "C" void Cuda_FixShakeCuda_UnpackComm(cuda_shared_data* sdata,int n,int first,void* buf_recv); + diff --git a/lib/cuda/fix_shake_cuda_kernel.cu b/lib/cuda/fix_shake_cuda_kernel.cu new file mode 100644 index 0000000000..e4ca822a77 --- /dev/null +++ b/lib/cuda/fix_shake_cuda_kernel.cu @@ -0,0 +1,971 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
// v_tally: distributes a constraint cluster's virial v[0..5] equally over the
// cluster's locally-owned atoms (list[0..n-1]) into the per-atom virial array
// _vatom (stride _nmax per component). The global-virial branch is commented
// out in the original. 'total' is the cluster size used as the divisor.
// minimum_image: wraps a displacement vector into the primary periodic image,
// branching on orthogonal vs. triclinic boxes (_h holds the tilt factors).
// BUG(review, suspected): in the triclinic branch below, the FIRST sub-block
// performs the z-wrap (uses _prd[2], _h[3], _h[4]) but is guarded by
// "_periodicity[1]" -- presumably it should test _periodicity[2] (cf. the CPU
// Domain::minimum_image, which checks zperiodic/yperiodic/xperiodic in that
// order). As written, z is wrapped only when y is periodic. Cannot be fixed in
// isolation here because the surrounding patch text is truncated -- confirm
// against upstream and fix _periodicity[1] -> _periodicity[2] in the first
// triclinic sub-block.
// FixShakeCuda_UnconstrainedUpdate_Kernel: one thread per local atom; for atoms
// in a SHAKE cluster it predicts the post-integration position
// x + dtv*v + dtfmsq*f (per component, SoA stride _nmax) into _xshake.
// FixShakeCuda_Shake2: solves the single-bond (2-atom) constraint analytically
// via the quadratic in lamda; clamps a negative discriminant to 0 and counts
// the event in _flag[0].
+------------------------------------------------------------------------- */ + +__device__ void v_tally(int& vflag_global,int& vflag_atom,int& n, int *list, ENERGY_FLOAT total, ENERGY_FLOAT *v) +{ + /*if(vflag_global) + { + ENERGY_FLOAT fraction = n/total; + ENERGY_FLOAT* shared = &sharedmem[threadIdx.x]; + *shared += fraction*v[0]; shared+=blockDim.x; + *shared += fraction*v[1]; shared+=blockDim.x; + *shared += fraction*v[2]; shared+=blockDim.x; + *shared += fraction*v[3]; shared+=blockDim.x; + *shared += fraction*v[4]; shared+=blockDim.x; + *shared += fraction*v[5]; + }*/ + if (vflag_atom) { + ENERGY_FLOAT fraction = ENERGY_F(1.0)/total; + for (int i = 0; i < n; i++) { + int m = list[i]; + ENERGY_FLOAT* myvatom=&_vatom[m]; + + *myvatom += fraction*v[0]; myvatom+=_nmax; + *myvatom += fraction*v[1]; myvatom+=_nmax; + *myvatom += fraction*v[2]; myvatom+=_nmax; + *myvatom += fraction*v[3]; myvatom+=_nmax; + *myvatom += fraction*v[4]; myvatom+=_nmax; + *myvatom += fraction*v[5]; + } + } +} + +inline __device__ void minimum_image(X_FLOAT3& delta) +{ + if (_triclinic == 0) { + if (_periodicity[0]) { + delta.x += delta.x < -X_F(0.5)*_prd[0] ? _prd[0] : + (delta.x > X_F(0.5)*_prd[0] ?-_prd[0] : X_F(0.0)); + } + if (_periodicity[1]) { + delta.y += delta.y < -X_F(0.5)*_prd[1] ? _prd[1] : + (delta.y > X_F(0.5)*_prd[1] ?-_prd[1] : X_F(0.0)); + } + if (_periodicity[2]) { + delta.z += delta.z < -X_F(0.5)*_prd[2] ? _prd[2] : + (delta.z > X_F(0.5)*_prd[2] ?-_prd[2] : X_F(0.0)); + } + + } else { + if (_periodicity[1]) { + delta.z += delta.z < -X_F(0.5)*_prd[2] ? _prd[2] : + (delta.z > X_F(0.5)*_prd[2] ?-_prd[2] : X_F(0.0)); + delta.y += delta.z < -X_F(0.5)*_prd[2] ? _h[3] : + (delta.z > X_F(0.5)*_prd[2] ?-_h[3] : X_F(0.0)); + delta.x += delta.z < -X_F(0.5)*_prd[2] ? _h[4] : + (delta.z > X_F(0.5)*_prd[2] ?-_h[4] : X_F(0.0)); + + } + if (_periodicity[1]) { + delta.y += delta.y < -X_F(0.5)*_prd[1] ? 
_prd[1] : + (delta.y > X_F(0.5)*_prd[1] ?-_prd[1] : X_F(0.0)); + delta.x += delta.y < -X_F(0.5)*_prd[1] ? _h[5] : + (delta.y > X_F(0.5)*_prd[1] ?-_h[5] : X_F(0.0)); + + } + if (_periodicity[0]) { + delta.x += delta.x < -X_F(0.5)*_prd[0] ? _prd[0] : + (delta.x > X_F(0.5)*_prd[0] ?-_prd[0] : X_F(0.0)); + } + } +} + +__global__ void FixShakeCuda_UnconstrainedUpdate_Kernel() +{ + int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x; + if(i>=_nlocal) return; + + X_FLOAT3 my_xshake = {X_F(0.0),X_F(0.0),X_F(0.0)}; + if(_shake_flag[i]) + { + F_FLOAT* my_f = _f + i; + V_FLOAT* my_v = _v + i; + X_FLOAT* my_x = _x + i; + + V_FLOAT dtfmsq = _dtfsq; + if(_rmass_flag) dtfmsq*= V_F(1.0) / _rmass[i]; + else dtfmsq*= V_F(1.0) / _mass[_type[i]]; + + my_xshake.x = *my_x + _dtv* *my_v + dtfmsq* *my_f; my_f += _nmax; my_v += _nmax; my_x += _nmax; + my_xshake.y = *my_x + _dtv* *my_v + dtfmsq* *my_f; my_f += _nmax; my_v += _nmax; my_x += _nmax; + my_xshake.z = *my_x + _dtv* *my_v + dtfmsq* *my_f; + } + _xshake[i]=my_xshake; +} + + + + +__device__ void FixShakeCuda_Shake2(int& vflag,int& vflag_atom,int& m) +{ + int nlist,list[2]; + ENERGY_FLOAT v[6]; + X_FLOAT invmass0,invmass1; + + // local atom IDs and constraint distances + + int i0 = _map_array[_shake_atom[m]]; + int i1 = _map_array[_shake_atom[m+_nmax]]; + X_FLOAT bond1 = _bond_distance[_shake_type[m]]; + + // r01 = distance vec between atoms, with PBC + + X_FLOAT3 r01; + + X_FLOAT4 x_i0,x_i1; + x_i0=fetchXType(i0); + x_i1=fetchXType(i1); + + r01.x = x_i0.x - x_i1.x; + r01.y = x_i0.y - x_i1.y; + r01.z = x_i0.z - x_i1.z; + minimum_image(r01); + + // s01 = distance vec after unconstrained update, with PBC + + X_FLOAT3 s01; + X_FLOAT3 xs_i0=_xshake[i0]; + X_FLOAT3 xs_i1=_xshake[i1]; + + s01.x = xs_i0.x - xs_i1.x; + s01.y = xs_i0.y - xs_i1.y; + s01.z = xs_i0.z - xs_i1.z; + minimum_image(s01); + + // scalar distances between atoms + + X_FLOAT r01sq = r01.x*r01.x + r01.y*r01.y + r01.z*r01.z; + X_FLOAT s01sq = s01.x*s01.x + 
s01.y*s01.y + s01.z*s01.z; + + // a,b,c = coeffs in quadratic equation for lamda + + if (_rmass_flag) { + invmass0 = X_F(1.0)/_rmass[i0]; + invmass1 = X_F(1.0)/_rmass[i1]; + } else { + invmass0 = X_F(1.0)/_mass[static_cast (x_i0.w)]; + invmass1 = X_F(1.0)/_mass[static_cast (x_i1.w)]; + } + + X_FLOAT a = (invmass0+invmass1)*(invmass0+invmass1) * r01sq; + X_FLOAT b = X_F(2.0) * (invmass0+invmass1) * + (s01.x*r01.x + s01.y*r01.y + s01.z*r01.z); + X_FLOAT c = s01sq - bond1*bond1; + + // error check + + X_FLOAT determ = b*b - X_F(4.0)*a*c; + if (determ < X_F(0.0)) { + _flag[0]++; + determ = X_F(0.0); + } + + // exact quadratic solution for lamda + + X_FLOAT lamda,lamda1,lamda2; + lamda1 = -b+_SQRT_(determ); + lamda2 = -lamda1 - X_F(2.0)*b; + lamda1 *= X_F(1.0) / (X_F(2.0)*a); + lamda2 *= X_F(1.0) / (X_F(2.0)*a); + + lamda = (fabs(lamda1) <= fabs(lamda2))? lamda1 : lamda2; + + // update forces if atom is owned by this processor + + lamda*= X_F(1.0)/_dtfsq; + + + //attenion: are shake clusters <-> atom unique? 
// Shake2 tail: apply the constraint force +/- lamda*r01 to whichever of the two
// atoms are locally owned, then (if requested) stage the cluster virial in
// shared memory for the block reduction and tally per-atom contributions.
// The factor compensates for the 0.5 applied later by the shared reduction
// (which is reused from the pair-force path).
+ nlist = 0; + if (i0 < _nlocal) { + _f[i0] += lamda*r01.x; + _f[i0+_nmax] += lamda*r01.y; + _f[i0+2*_nmax] += lamda*r01.z; + list[nlist++] = i0; + } + + if (i1 < _nlocal) { + _f[i1] -= lamda*r01.x; + _f[i1+_nmax] -= lamda*r01.y; + _f[i1+2*_nmax] -= lamda*r01.z; + list[nlist++] = i1; + } + + if (vflag||vflag_atom) { + ENERGY_FLOAT* shared = &sharedmem[threadIdx.x]; + X_FLOAT factor=nlist; + v[0] = lamda*r01.x*r01.x; *shared = factor*v[0]; shared+=blockDim.x; //times 2.0 since the reducing function is the same as in force calculations, which adds a factor 0.5 + v[1] = lamda*r01.y*r01.y; *shared = factor*v[1]; shared+=blockDim.x; + v[2] = lamda*r01.z*r01.z; *shared = factor*v[2]; shared+=blockDim.x; + v[3] = lamda*r01.x*r01.y; *shared = factor*v[3]; shared+=blockDim.x; + v[4] = lamda*r01.x*r01.z; *shared = factor*v[4]; shared+=blockDim.x; + v[5] = lamda*r01.y*r01.z; *shared = factor*v[5]; shared+=blockDim.x; + + v_tally(vflag,vflag_atom,nlist,list,2.0,v); + } +} + + +__device__ void FixShakeCuda_Shake3(int& vflag,int& vflag_atom,int& m) +{ + int nlist,list[3]; + ENERGY_FLOAT v[6]; + X_FLOAT invmass0,invmass1,invmass2; + + // local atom IDs and constraint distances + + int i0 = _map_array[_shake_atom[m]]; + int i1 = _map_array[_shake_atom[m+_nmax]]; + int i2 = _map_array[_shake_atom[m+2*_nmax]]; + X_FLOAT bond1 = _bond_distance[_shake_type[m]]; + X_FLOAT bond2 = _bond_distance[_shake_type[m+_nmax]]; + + // r01 = distance vec between atoms, with PBC + + X_FLOAT3 r01,r02; + + X_FLOAT4 x_i0,x_i1,x_i2; + x_i0=fetchXType(i0); + x_i1=fetchXType(i1); + x_i2=fetchXType(i2); + + r01.x = x_i0.x - x_i1.x; + r01.y = x_i0.y - x_i1.y; + r01.z = x_i0.z - x_i1.z; + minimum_image(r01); + + r02.x = x_i0.x - x_i2.x; + r02.y = x_i0.y - x_i2.y; + r02.z = x_i0.z - x_i2.z; + minimum_image(r02); + + // s01 = distance vec after unconstrained update, with PBC + + X_FLOAT3 s01,s02; + X_FLOAT3 xs_i0=_xshake[i0]; + X_FLOAT3 xs_i1=_xshake[i1]; + X_FLOAT3 xs_i2=_xshake[i2]; + + s01.x = xs_i0.x - 
// Shake3: central atom i0 bonded to i1 and i2 (two bond constraints). Builds
// the 2x2 linear system in (lamda01, lamda02), inverts it analytically, then
// iterates the quadratic correction to convergence (tolerance) or _max_iter.
xs_i1.x; + s01.y = xs_i0.y - xs_i1.y; + s01.z = xs_i0.z - xs_i1.z; + minimum_image(s01); + + s02.x = xs_i0.x - xs_i2.x; + s02.y = xs_i0.y - xs_i2.y; + s02.z = xs_i0.z - xs_i2.z; + minimum_image(s02); + + // scalar distances between atoms + + X_FLOAT r01sq = r01.x*r01.x + r01.y*r01.y + r01.z*r01.z; + X_FLOAT r02sq = r02.x*r02.x + r02.y*r02.y + r02.z*r02.z; + X_FLOAT s01sq = s01.x*s01.x + s01.y*s01.y + s01.z*s01.z; + X_FLOAT s02sq = s02.x*s02.x + s02.y*s02.y + s02.z*s02.z; + + // a,b,c = coeffs in quadratic equation for lamda + + if (_rmass_flag) { + invmass0 = X_F(1.0)/_rmass[i0]; + invmass1 = X_F(1.0)/_rmass[i1]; + invmass2 = X_F(1.0)/_rmass[i2]; + } else { + invmass0 = X_F(1.0)/_mass[static_cast (x_i0.w)]; + invmass1 = X_F(1.0)/_mass[static_cast (x_i1.w)]; + invmass2 = X_F(1.0)/_mass[static_cast (x_i2.w)]; + } + + X_FLOAT a11 = X_F(2.0) * (invmass0+invmass1) * + (s01.x*r01.x + s01.y*r01.y + s01.z*r01.z); + X_FLOAT a12 = X_F(2.0) * invmass0 * + (s01.x*r02.x + s01.y*r02.y + s01.z*r02.z); + X_FLOAT a21 = X_F(2.0) * invmass0 * + (s02.x*r01.x + s02.y*r01.y + s02.z*r01.z); + X_FLOAT a22 = X_F(2.0) * (invmass0+invmass2) * + (s02.x*r02.x + s02.y*r02.y + s02.z*r02.z); + + // error check + + X_FLOAT determ = a11*a22 - a12*a21; + if (determ == X_F(0.0)) _flag[0]++; + X_FLOAT determinv = X_F(1.0)/determ; + + X_FLOAT a11inv = a22*determinv; + X_FLOAT a12inv = -a12*determinv; + X_FLOAT a21inv = -a21*determinv; + X_FLOAT a22inv = a11*determinv; + + // quadratic correction coeffs + + X_FLOAT r0102 = (r01.x*r02.x + r01.y*r02.y + r01.z*r02.z); + + X_FLOAT quad1_0101 = (invmass0+invmass1)*(invmass0+invmass1) * r01sq; + X_FLOAT quad1_0202 = invmass0*invmass0 * r02sq; + X_FLOAT quad1_0102 = X_F(2.0) * (invmass0+invmass1)*invmass0 * r0102; + + X_FLOAT quad2_0202 = (invmass0+invmass2)*(invmass0+invmass2) * r02sq; + X_FLOAT quad2_0101 = invmass0*invmass0 * r01sq; + X_FLOAT quad2_0102 = X_F(2.0) * (invmass0+invmass2)*invmass0 * r0102; + + // iterate until converged + + X_FLOAT lamda01 = 
// NOTE(review): "__any(!done)" below is the legacy mask-less warp vote,
// removed for compute capability 7.0+ (Volta); modern CUDA requires
// __any_sync(mask, ...). Kept as-is -- this patch targets sm_13/sm_2x (see
// Makefile.common in this diff). Also note 'done' is reset to 0 on any
// unconverged lamda and is otherwise 1, so "done<2" is always true and the
// lamda updates below are effectively unconditional -- presumably intentional
// (all lanes run the full loop together), but worth confirming upstream.
X_F(0.0); + X_FLOAT lamda02 = X_F(0.0); + int niter = 0; + int done = 0; + + X_FLOAT quad1,quad2,b1,b2,lamda01_new,lamda02_new; + +//maybe all running full loop? + while (__any(!done) && niter < _max_iter) { + quad1 = quad1_0101 * lamda01*lamda01 + quad1_0202 * lamda02*lamda02 + + quad1_0102 * lamda01*lamda02; + quad2 = quad2_0101 * lamda01*lamda01 + quad2_0202 * lamda02*lamda02 + + quad2_0102 * lamda01*lamda02; + + b1 = bond1*bond1 - s01sq - quad1; + b2 = bond2*bond2 - s02sq - quad2; + + lamda01_new = a11inv*b1 + a12inv*b2; + lamda02_new = a21inv*b1 + a22inv*b2; + + done++; + done = (fabs(lamda01_new-lamda01) > _tolerance)?0: done; + done = (fabs(lamda02_new-lamda02) > _tolerance)?0: done; + + + lamda01 = done<2?lamda01_new:lamda01; + lamda02 = done<2?lamda02_new:lamda02; + niter++; + } + // update forces if atom is owned by this processor + + lamda01 *= X_F(1.0)/_dtfsq; + lamda02 *= X_F(1.0)/_dtfsq; + + + //attenion: are shake clusters <-> atom unique? + nlist = 0; + if (i0 < _nlocal) { + _f[i0] += lamda01*r01.x + lamda02*r02.x; + _f[i0+_nmax] += lamda01*r01.y + lamda02*r02.y; + _f[i0+2*_nmax] += lamda01*r01.z + lamda02*r02.z; + list[nlist++] = i0; + } + + if (i1 < _nlocal) { + _f[i1] -= lamda01*r01.x; + _f[i1+_nmax] -= lamda01*r01.y; + _f[i1+2*_nmax] -= lamda01*r01.z; + list[nlist++] = i1; + } + + if (i2 < _nlocal) { + _f[i2] -= lamda02*r02.x; + _f[i2+_nmax] -= lamda02*r02.y; + _f[i2+2*_nmax] -= lamda02*r02.z; + list[nlist++] = i2; + } + + if (vflag||vflag_atom) { + ENERGY_FLOAT* shared = &sharedmem[threadIdx.x]; + X_FLOAT factor=X_F(2.0)/X_F(3.0)*nlist; + v[0] = lamda01*r01.x*r01.x + lamda02*r02.x*r02.x; *shared = factor*v[0]; shared+=blockDim.x; //times 2.0 since the reducing function is the same as in force calculations, which adds a factor 0.5 + v[1] = lamda01*r01.y*r01.y + lamda02*r02.y*r02.y; *shared = factor*v[1]; shared+=blockDim.x; + v[2] = lamda01*r01.z*r01.z + lamda02*r02.z*r02.z; *shared = factor*v[2]; shared+=blockDim.x; + v[3] = lamda01*r01.x*r01.y 
r01.y*r01.y + r01.z*r01.z; + X_FLOAT r02sq = r02.x*r02.x + r02.y*r02.y + r02.z*r02.z; + X_FLOAT r03sq = r03.x*r03.x + r03.y*r03.y + r03.z*r03.z; + X_FLOAT s01sq = s01.x*s01.x + s01.y*s01.y + s01.z*s01.z; + X_FLOAT s02sq = s02.x*s02.x + s02.y*s02.y + s02.z*s02.z; + X_FLOAT s03sq = s03.x*s03.x + s03.y*s03.y + s03.z*s03.z; + + // a,b,c = coeffs in quadratic equation for lamda + + if (_rmass_flag) { + invmass0 = X_F(1.0)/_rmass[i0]; + invmass1 = X_F(1.0)/_rmass[i1]; + invmass2 = X_F(1.0)/_rmass[i2]; + invmass3 = X_F(1.0)/_rmass[i3]; + } else { + invmass0 = X_F(1.0)/_mass[static_cast (x_i0.w)]; + invmass1 = X_F(1.0)/_mass[static_cast (x_i1.w)]; + invmass2 = X_F(1.0)/_mass[static_cast (x_i2.w)]; + invmass3 = X_F(1.0)/_mass[static_cast (x_i3.w)]; + } + + X_FLOAT a11 = X_F(2.0) * (invmass0+invmass1) * + (s01.x*r01.x + s01.y*r01.y + s01.z*r01.z); + X_FLOAT a12 = X_F(2.0) * invmass0 * + (s01.x*r02.x + s01.y*r02.y + s01.z*r02.z); + X_FLOAT a13 = X_F(2.0) * invmass0 * + (s01.x*r03.x + s01.y*r03.y + s01.z*r03.z); + X_FLOAT a21 = X_F(2.0) * invmass0 * + (s02.x*r01.x + s02.y*r01.y + s02.z*r01.z); + X_FLOAT a22 = X_F(2.0) * (invmass0+invmass2) * + (s02.x*r02.x + s02.y*r02.y + s02.z*r02.z); + X_FLOAT a23 = X_F(2.0) * (invmass0) * + (s02.x*r03.x + s02.y*r03.y + s02.z*r03.z); + X_FLOAT a31 = X_F(2.0) * (invmass0) * + (s03.x*r01.x + s03.y*r01.y + s03.z*r01.z); + X_FLOAT a32 = X_F(2.0) * (invmass0) * + (s03.x*r02.x + s03.y*r02.y + s03.z*r02.z); + X_FLOAT a33 = X_F(2.0) * (invmass0+invmass3) * + (s03.x*r03.x + s03.y*r03.y + s03.z*r03.z); + + // error check + + X_FLOAT determ = a11*a22*a33 + a12*a23*a31 + a13*a21*a32 - + a11*a23*a32 - a12*a21*a33 - a13*a22*a31; + if (determ == X_F(0.0)) _flag[0]++; + X_FLOAT determinv = X_F(1.0)/determ; + + X_FLOAT a11inv = determinv * (a22*a33 - a23*a32); + X_FLOAT a12inv = -determinv * (a12*a33 - a13*a32); + X_FLOAT a13inv = determinv * (a12*a23 - a13*a22); + X_FLOAT a21inv = -determinv * (a21*a33 - a23*a31); + X_FLOAT a22inv = determinv * (a11*a33 - 
a13*a31); + X_FLOAT a23inv = -determinv * (a11*a23 - a13*a21); + X_FLOAT a31inv = determinv * (a21*a32 - a22*a31); + X_FLOAT a32inv = -determinv * (a11*a32 - a12*a31); + X_FLOAT a33inv = determinv * (a11*a22 - a12*a21); + + // quadratic correction coeffs + + X_FLOAT r0102 = (r01.x*r02.x + r01.y*r02.y + r01.z*r02.z); + X_FLOAT r0103 = (r01.x*r03.x + r01.y*r03.y + r01.z*r03.z); + X_FLOAT r0203 = (r02.x*r03.x + r02.y*r03.y + r02.z*r03.z); + + X_FLOAT quad1_0101 = (invmass0+invmass1)*(invmass0+invmass1) * r01sq; + X_FLOAT quad1_0202 = invmass0*invmass0 * r02sq; + X_FLOAT quad1_0303 = invmass0*invmass0 * r03sq; + X_FLOAT quad1_0102 = X_F(2.0) * (invmass0+invmass1)*invmass0 * r0102; + X_FLOAT quad1_0103 = X_F(2.0) * (invmass0+invmass1)*invmass0 * r0103; + X_FLOAT quad1_0203 = X_F(2.0) * invmass0*invmass0 * r0203; + + X_FLOAT quad2_0101 = invmass0*invmass0 * r01sq; + X_FLOAT quad2_0202 = (invmass0+invmass2)*(invmass0+invmass2) * r02sq; + X_FLOAT quad2_0303 = invmass0*invmass0 * r03sq; + X_FLOAT quad2_0102 = X_F(2.0) * (invmass0+invmass2)*invmass0 * r0102; + X_FLOAT quad2_0103 = X_F(2.0) * invmass0*invmass0 * r0103; + X_FLOAT quad2_0203 = X_F(2.0) * (invmass0+invmass2)*invmass0 * r0203; + + X_FLOAT quad3_0101 = invmass0*invmass0 * r01sq; + X_FLOAT quad3_0202 = invmass0*invmass0 * r02sq; + X_FLOAT quad3_0303 = (invmass0+invmass3)*(invmass0+invmass3) * r03sq; + X_FLOAT quad3_0102 = X_F(2.0) * invmass0*invmass0 * r0102; + X_FLOAT quad3_0103 = X_F(2.0) * (invmass0+invmass3)*invmass0 * r0103; + X_FLOAT quad3_0203 = X_F(2.0) * (invmass0+invmass3)*invmass0 * r0203; + // iterate until converged + + X_FLOAT lamda01 = X_F(0.0); + X_FLOAT lamda02 = X_F(0.0); + X_FLOAT lamda03 = X_F(0.0); + int niter = 0; + int done = 0; + + X_FLOAT quad1,quad2,quad3,b1,b2,b3,lamda01_new,lamda02_new,lamda03_new; + +//maybe all running full loop? 
+ while (__any(!done) && niter < _max_iter) { + quad1 = quad1_0101 * lamda01*lamda01 + + quad1_0202 * lamda02*lamda02 + + quad1_0303 * lamda03*lamda03 + + quad1_0102 * lamda01*lamda02 + + quad1_0103 * lamda01*lamda03 + + quad1_0203 * lamda02*lamda03; + + quad2 = quad2_0101 * lamda01*lamda01 + + quad2_0202 * lamda02*lamda02 + + quad2_0303 * lamda03*lamda03 + + quad2_0102 * lamda01*lamda02 + + quad2_0103 * lamda01*lamda03 + + quad2_0203 * lamda02*lamda03; + + quad3 = quad3_0101 * lamda01*lamda01 + + quad3_0202 * lamda02*lamda02 + + quad3_0303 * lamda03*lamda03 + + quad3_0102 * lamda01*lamda02 + + quad3_0103 * lamda01*lamda03 + + quad3_0203 * lamda02*lamda03; + + b1 = bond1*bond1 - s01sq - quad1; + b2 = bond2*bond2 - s02sq - quad2; + b3 = bond3*bond3 - s03sq - quad3; + + lamda01_new = a11inv*b1 + a12inv*b2 + a13inv*b3; + lamda02_new = a21inv*b1 + a22inv*b2 + a23inv*b3; + lamda03_new = a31inv*b1 + a32inv*b2 + a33inv*b3; + + done++; + done = (fabs(lamda01_new-lamda01) > _tolerance)? 0:done; + done = (fabs(lamda02_new-lamda02) > _tolerance)? 0:done; + done = (fabs(lamda03_new-lamda03) > _tolerance)? 0:done; + + lamda01 = done<2?lamda01_new:lamda01; + lamda02 = done<2?lamda02_new:lamda02; + lamda03 = done<2?lamda03_new:lamda03; + niter++; + } + // update forces if atom is owned by this processor + + lamda01 *= X_F(1.0)/_dtfsq; + lamda02 *= X_F(1.0)/_dtfsq; + lamda03 *= X_F(1.0)/_dtfsq; + + + //attenion: are shake clusters <-> atom unique? 
+ nlist = 0; + if (i0 < _nlocal) { + _f[i0] += lamda01*r01.x + lamda02*r02.x + lamda03*r03.x; + _f[i0+_nmax] += lamda01*r01.y + lamda02*r02.y + lamda03*r03.y; + _f[i0+2*_nmax] += lamda01*r01.z + lamda02*r02.z + lamda03*r03.z; + list[nlist++] = i0; + } + + if (i1 < _nlocal) { + _f[i1] -= lamda01*r01.x; + _f[i1+_nmax] -= lamda01*r01.y; + _f[i1+2*_nmax] -= lamda01*r01.z; + list[nlist++] = i1; + } + + if (i2 < _nlocal) { + _f[i2] -= lamda02*r02.x; + _f[i2+_nmax] -= lamda02*r02.y; + _f[i2+2*_nmax] -= lamda02*r02.z; + list[nlist++] = i2; + } + + if (i3 < _nlocal) { + _f[i3] -= lamda03*r03.x; + _f[i3+_nmax] -= lamda03*r03.y; + _f[i3+2*_nmax] -= lamda03*r03.z; + list[nlist++] = i3; + } + + if (vflag||vflag_atom) { + ENERGY_FLOAT* shared = &sharedmem[threadIdx.x]; + X_FLOAT factor=X_F(2.0)/X_F(4.0)*nlist; + v[0] = lamda01*r01.x*r01.x + lamda02*r02.x*r02.x + lamda03*r03.x*r03.x; *shared = factor*v[0]; shared+=blockDim.x; //times 2.0 since the reducing function is the same as in force calculations, which adds a factor 0.5 + v[1] = lamda01*r01.y*r01.y + lamda02*r02.y*r02.y + lamda03*r03.y*r03.y; *shared = factor*v[1]; shared+=blockDim.x; + v[2] = lamda01*r01.z*r01.z + lamda02*r02.z*r02.z + lamda03*r03.z*r03.z; *shared = factor*v[2]; shared+=blockDim.x; + v[3] = lamda01*r01.x*r01.y + lamda02*r02.x*r02.y + lamda03*r03.x*r03.y; *shared = factor*v[3]; shared+=blockDim.x; + v[4] = lamda01*r01.x*r01.z + lamda02*r02.x*r02.z + lamda03*r03.x*r03.z; *shared = factor*v[4]; shared+=blockDim.x; + v[5] = lamda01*r01.y*r01.z + lamda02*r02.y*r02.z + lamda03*r03.y*r03.z; *shared = factor*v[5]; shared+=blockDim.x; + + v_tally(vflag,vflag_atom,nlist,list,4.0,v); + } +} + +__device__ void FixShakeCuda_Shake3Angle(int& vflag,int& vflag_atom,int& m) +{ + int nlist,list[3]; + ENERGY_FLOAT v[6]; + X_FLOAT invmass0,invmass1,invmass2; + + // local atom IDs and constraint distances + + int i0 = _map_array[_shake_atom[m]]; + int i1 = _map_array[_shake_atom[m+_nmax]]; + int i2 = 
// Shake3Angle: a 3-atom cluster with two bond constraints (0-1, 0-2) plus an
// angle constraint expressed as the fixed 1-2 distance bond12 taken from
// _angle_distance. Builds the full 3x3 system in (lamda01, lamda02, lamda12)
// with its analytic inverse and quadratic correction coefficients.
// NOTE(review): this definition continues past the end of this chunk -- the
// iteration loop, force update, and virial tally are not visible here.
_map_array[_shake_atom[m+2*_nmax]]; + X_FLOAT bond1 = _bond_distance[_shake_type[m]]; + X_FLOAT bond2 = _bond_distance[_shake_type[m+_nmax]]; + X_FLOAT bond12 = _angle_distance[_shake_type[m+2*_nmax]]; + + // r01 = distance vec between atoms, with PBC + + X_FLOAT3 r01,r02,r12; + + X_FLOAT4 x_i0,x_i1,x_i2; + x_i0=fetchXType(i0); + x_i1=fetchXType(i1); + x_i2=fetchXType(i2); + + r01.x = x_i0.x - x_i1.x; + r01.y = x_i0.y - x_i1.y; + r01.z = x_i0.z - x_i1.z; + minimum_image(r01); + + r02.x = x_i0.x - x_i2.x; + r02.y = x_i0.y - x_i2.y; + r02.z = x_i0.z - x_i2.z; + minimum_image(r02); + + r12.x = x_i1.x - x_i2.x; + r12.y = x_i1.y - x_i2.y; + r12.z = x_i1.z - x_i2.z; + minimum_image(r12); + + // s01 = distance vec after unconstrained update, with PBC + + X_FLOAT3 s01,s02,s12; + X_FLOAT3 xs_i0=_xshake[i0]; + X_FLOAT3 xs_i1=_xshake[i1]; + X_FLOAT3 xs_i2=_xshake[i2]; + + s01.x = xs_i0.x - xs_i1.x; + s01.y = xs_i0.y - xs_i1.y; + s01.z = xs_i0.z - xs_i1.z; + minimum_image(s01); + + s02.x = xs_i0.x - xs_i2.x; + s02.y = xs_i0.y - xs_i2.y; + s02.z = xs_i0.z - xs_i2.z; + minimum_image(s02); + + s12.x = xs_i1.x - xs_i2.x; + s12.y = xs_i1.y - xs_i2.y; + s12.z = xs_i1.z - xs_i2.z; + minimum_image(s12); + + // scalar distances between atoms + + X_FLOAT r01sq = r01.x*r01.x + r01.y*r01.y + r01.z*r01.z; + X_FLOAT r02sq = r02.x*r02.x + r02.y*r02.y + r02.z*r02.z; + X_FLOAT r12sq = r12.x*r12.x + r12.y*r12.y + r12.z*r12.z; + X_FLOAT s01sq = s01.x*s01.x + s01.y*s01.y + s01.z*s01.z; + X_FLOAT s02sq = s02.x*s02.x + s02.y*s02.y + s02.z*s02.z; + X_FLOAT s12sq = s12.x*s12.x + s12.y*s12.y + s12.z*s12.z; + + // a,b,c = coeffs in quadratic equation for lamda + + if (_rmass_flag) { + invmass0 = X_F(1.0)/_rmass[i0]; + invmass1 = X_F(1.0)/_rmass[i1]; + invmass2 = X_F(1.0)/_rmass[i2]; + } else { + invmass0 = X_F(1.0)/_mass[static_cast (x_i0.w)]; + invmass1 = X_F(1.0)/_mass[static_cast (x_i1.w)]; + invmass2 = X_F(1.0)/_mass[static_cast (x_i2.w)]; + } + + X_FLOAT a11 = X_F(2.0) * (invmass0+invmass1) * + 
(s01.x*r01.x + s01.y*r01.y + s01.z*r01.z); + X_FLOAT a12 = X_F(2.0) * invmass0 * + (s01.x*r02.x + s01.y*r02.y + s01.z*r02.z); + X_FLOAT a13 = - X_F(2.0) * invmass1 * + (s01.x*r12.x + s01.y*r12.y + s01.z*r12.z); + X_FLOAT a21 = X_F(2.0) * invmass0 * + (s02.x*r01.x + s02.y*r01.y + s02.z*r01.z); + X_FLOAT a22 = X_F(2.0) * (invmass0+invmass2) * + (s02.x*r02.x + s02.y*r02.y + s02.z*r02.z); + X_FLOAT a23 = X_F(2.0) * invmass2 * + (s02.x*r12.x + s02.y*r12.y + s02.z*r12.z); + X_FLOAT a31 = - X_F(2.0) * invmass1 * + (s12.x*r01.x + s12.y*r01.y + s12.z*r01.z); + X_FLOAT a32 = X_F(2.0) * invmass2 * + (s12.x*r02.x + s12.y*r02.y + s12.z*r02.z); + X_FLOAT a33 = X_F(2.0) * (invmass1+invmass2) * + (s12.x*r12.x + s12.y*r12.y + s12.z*r12.z); + + // inverse of matrix + + X_FLOAT determ = a11*a22*a33 + a12*a23*a31 + a13*a21*a32 - + a11*a23*a32 - a12*a21*a33 - a13*a22*a31; + if (determ == X_F(0.0)) _flag[0]++; + X_FLOAT determinv = X_F(1.0)/determ; + + X_FLOAT a11inv = determinv * (a22*a33 - a23*a32); + X_FLOAT a12inv = -determinv * (a12*a33 - a13*a32); + X_FLOAT a13inv = determinv * (a12*a23 - a13*a22); + X_FLOAT a21inv = -determinv * (a21*a33 - a23*a31); + X_FLOAT a22inv = determinv * (a11*a33 - a13*a31); + X_FLOAT a23inv = -determinv * (a11*a23 - a13*a21); + X_FLOAT a31inv = determinv * (a21*a32 - a22*a31); + X_FLOAT a32inv = -determinv * (a11*a32 - a12*a31); + X_FLOAT a33inv = determinv * (a11*a22 - a12*a21); + + // quadratic correction coeffs + + X_FLOAT r0102 = (r01.x*r02.x + r01.y*r02.y + r01.z*r02.z); + X_FLOAT r0112 = (r01.x*r12.x + r01.y*r12.y + r01.z*r12.z); + X_FLOAT r0212 = (r02.x*r12.x + r02.y*r12.y + r02.z*r12.z); + + X_FLOAT quad1_0101 = (invmass0+invmass1)*(invmass0+invmass1) * r01sq; + X_FLOAT quad1_0202 = invmass0*invmass0 * r02sq; + X_FLOAT quad1_1212 = invmass1*invmass1 * r12sq; + X_FLOAT quad1_0102 = X_F(2.0) * (invmass0+invmass1)*invmass0 * r0102; + X_FLOAT quad1_0112 = - X_F(2.0) * (invmass0+invmass1)*invmass1 * r0112; + X_FLOAT quad1_0212 = - X_F(2.0) * 
invmass0*invmass1 * r0212; + + X_FLOAT quad2_0101 = invmass0*invmass0 * r01sq; + X_FLOAT quad2_0202 = (invmass0+invmass2)*(invmass0+invmass2) * r02sq; + X_FLOAT quad2_1212 = invmass2*invmass2 * r12sq; + X_FLOAT quad2_0102 = X_F(2.0) * (invmass0+invmass2)*invmass0 * r0102; + X_FLOAT quad2_0112 = X_F(2.0) * invmass0*invmass2 * r0112; + X_FLOAT quad2_0212 = X_F(2.0) * (invmass0+invmass2)*invmass2 * r0212; + + X_FLOAT quad3_0101 = invmass1*invmass1 * r01sq; + X_FLOAT quad3_0202 = invmass2*invmass2 * r02sq; + X_FLOAT quad3_1212 = (invmass1+invmass2)*(invmass1+invmass2) * r12sq; + X_FLOAT quad3_0102 = - X_F(2.0) * invmass1*invmass2 * r0102; + X_FLOAT quad3_0112 = - X_F(2.0) * (invmass1+invmass2)*invmass1 * r0112; + X_FLOAT quad3_0212 = X_F(2.0) * (invmass1+invmass2)*invmass2 * r0212; + // iterate until converged + + X_FLOAT lamda01 = X_F(0.0); + X_FLOAT lamda02 = X_F(0.0); + X_FLOAT lamda12 = X_F(0.0); + int niter = 0; + int done = 0; + + X_FLOAT quad1,quad2,quad3,b1,b2,b3,lamda01_new,lamda02_new,lamda12_new; + +//maybe all running full loop? 
+ while (__any(!done) && niter < _max_iter) { + quad1 = quad1_0101 * lamda01*lamda01 + + quad1_0202 * lamda02*lamda02 + + quad1_1212 * lamda12*lamda12 + + quad1_0102 * lamda01*lamda02 + + quad1_0112 * lamda01*lamda12 + + quad1_0212 * lamda02*lamda12; + + quad2 = quad2_0101 * lamda01*lamda01 + + quad2_0202 * lamda02*lamda02 + + quad2_1212 * lamda12*lamda12 + + quad2_0102 * lamda01*lamda02 + + quad2_0112 * lamda01*lamda12 + + quad2_0212 * lamda02*lamda12; + + quad3 = quad3_0101 * lamda01*lamda01 + + quad3_0202 * lamda02*lamda02 + + quad3_1212 * lamda12*lamda12 + + quad3_0102 * lamda01*lamda02 + + quad3_0112 * lamda01*lamda12 + + quad3_0212 * lamda02*lamda12; + + b1 = bond1*bond1 - s01sq - quad1; + b2 = bond2*bond2 - s02sq - quad2; + b3 = bond12*bond12 - s12sq - quad3; + + lamda01_new = a11inv*b1 + a12inv*b2 + a13inv*b3; + lamda02_new = a21inv*b1 + a22inv*b2 + a23inv*b3; + lamda12_new = a31inv*b1 + a32inv*b2 + a33inv*b3; + + done++; + done = (fabs(lamda01_new-lamda01) > _tolerance)?0: done; + done = (fabs(lamda02_new-lamda02) > _tolerance)?0: done; + done = (fabs(lamda12_new-lamda12) > _tolerance)?0: done; + + lamda01 = done<2?lamda01_new:lamda01; + lamda02 = done<2?lamda02_new:lamda02; + lamda12 = done<2?lamda12_new:lamda12; + niter++; + } + // update forces if atom is owned by this processor + + lamda01 *= X_F(1.0)/_dtfsq; + lamda02 *= X_F(1.0)/_dtfsq; + lamda12 *= X_F(1.0)/_dtfsq; + + + //attenion: are shake clusters <-> atom unique? 
+ nlist = 0; + if (i0 < _nlocal) { + _f[i0] += lamda01*r01.x + lamda02*r02.x; + _f[i0+_nmax] += lamda01*r01.y + lamda02*r02.y; + _f[i0+2*_nmax] += lamda01*r01.z + lamda02*r02.z; + list[nlist++] = i0; + } + + if (i1 < _nlocal) { + _f[i1] -= lamda01*r01.x - lamda12*r12.x; + _f[i1+_nmax] -= lamda01*r01.y - lamda12*r12.y; + _f[i1+2*_nmax] -= lamda01*r01.z - lamda12*r12.z; + list[nlist++] = i1; + } + + if (i2 < _nlocal) { + _f[i2] -= lamda02*r02.x + lamda12*r12.x; + _f[i2+_nmax] -= lamda02*r02.y + lamda12*r12.y; + _f[i2+2*_nmax] -= lamda02*r02.z + lamda12*r12.z; + list[nlist++] = i2; + } + + if (vflag||vflag_atom) { + ENERGY_FLOAT* shared = &sharedmem[threadIdx.x]; + X_FLOAT factor=X_F(2.0)/X_F(3.0)*nlist; + v[0] = lamda01*r01.x*r01.x + lamda02*r02.x*r02.x + lamda12*r12.x*r12.x; *shared = factor*v[0]; shared+=blockDim.x; //times 2.0 since the reducing function is the same as in force calculations, which adds a factor 0.5 + v[1] = lamda01*r01.y*r01.y + lamda02*r02.y*r02.y + lamda12*r12.y*r12.y; *shared = factor*v[1]; shared+=blockDim.x; + v[2] = lamda01*r01.z*r01.z + lamda02*r02.z*r02.z + lamda12*r12.z*r12.z; *shared = factor*v[2]; shared+=blockDim.x; + v[3] = lamda01*r01.x*r01.y + lamda02*r02.x*r02.y + lamda12*r12.x*r12.y; *shared = factor*v[3]; shared+=blockDim.x; + v[4] = lamda01*r01.x*r01.z + lamda02*r02.x*r02.z + lamda12*r12.x*r12.z; *shared = factor*v[4]; shared+=blockDim.x; + v[5] = lamda01*r01.y*r01.z + lamda02*r02.y*r02.z + lamda12*r12.y*r12.z; *shared = factor*v[5]; shared+=blockDim.x; + + v_tally(vflag,vflag_atom,nlist,list,3.0,v); + } +} + +__global__ void FixShakeCuda_Shake_Kernel(int vflag,int vflag_atom,int* list,int nlist) +{ + int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x; + if(i_nmax) _flag[0]=1; + X_FLOAT3 xs=_xshake[j]; + ((X_FLOAT*) _buffer)[i]=xs.x + dx; + ((X_FLOAT*) _buffer)[i+1*n] = xs.y + dy; + ((X_FLOAT*) _buffer)[i+2*n] = xs.z + dz; + } + +} + +__global__ void FixShakeCuda_PackComm_Self_Kernel(int* sendlist,int n,int 
maxlistlength,int iswap,X_FLOAT dx,X_FLOAT dy,X_FLOAT dz,int first) +{ + int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x; + int* list=sendlist+iswap*maxlistlength; + if(i_nmax) _flag[0]=1; + X_FLOAT3 xs=_xshake[j]; + xs.x += dx; + xs.y += dy; + xs.z += dz; + _xshake[i+first]=xs; + } + +} + +__global__ void FixShakeCuda_UnpackComm_Kernel(int n,int first) +{ + int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x; + if(i +#define MY_PREFIX fix_temp_berendsen_cuda +#include "cuda_shared.h" +#include "cuda_common.h" +#include "crm_cuda_utils.cu" + +#include "fix_temp_berendsen_cuda_cu.h" +#include "fix_temp_berendsen_cuda_kernel.cu" + + +void Cuda_FixTempBerendsenCuda_UpdateNmax(cuda_shared_data* sdata) +{ + cudaMemcpyToSymbol(MY_CONST(mask) , & sdata->atom.mask .dev_data, sizeof(int*) ); + cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); + cudaMemcpyToSymbol(MY_CONST(nmax) , & sdata->atom.nmax , sizeof(int) ); + cudaMemcpyToSymbol(MY_CONST(v) , & sdata->atom.v .dev_data, sizeof(X_FLOAT*) ); +} + +void Cuda_FixTempBerendsenCuda_Init(cuda_shared_data* sdata) +{ + Cuda_FixTempBerendsenCuda_UpdateNmax(sdata); + +} + + +void Cuda_FixTempBerendsenCuda_EndOfStep(cuda_shared_data* sdata, int groupbit,double afactor) +{ + V_FLOAT factor=afactor; + if(sdata->atom.update_nmax) + Cuda_FixTempBerendsenCuda_UpdateNmax(sdata); + if(sdata->atom.update_nlocal) + cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); + + int3 layout=getgrid(sdata->atom.nlocal); + dim3 threads(layout.z, 1, 1); + dim3 grid(layout.x, layout.y, 1); + + Cuda_FixTempBerendsenCuda_EndOfStep_Kernel<<>> (groupbit,factor); + cudaThreadSynchronize(); + CUT_CHECK_ERROR("Cuda_Cuda_FixTempBerendsenCuda_PostForce: fix add_force post_force compute Kernel execution failed"); +} diff --git a/lib/cuda/fix_temp_berendsen_cuda_cu.h b/lib/cuda/fix_temp_berendsen_cuda_cu.h new file mode 100644 index 0000000000..fd64f98e42 --- /dev/null +++ 
b/lib/cuda/fix_temp_berendsen_cuda_cu.h @@ -0,0 +1,27 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#include "cuda_shared.h" + +extern "C" void Cuda_FixTempBerendsenCuda_Init(cuda_shared_data* sdata); +extern "C" void Cuda_FixTempBerendsenCuda_EndOfStep(cuda_shared_data* sdata, int groupbit,double afactor); diff --git a/lib/cuda/fix_temp_berendsen_cuda_kernel.cu b/lib/cuda/fix_temp_berendsen_cuda_kernel.cu new file mode 100644 index 0000000000..716cbeac1e --- /dev/null +++ b/lib/cuda/fix_temp_berendsen_cuda_kernel.cu @@ -0,0 +1,36 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. 
+ + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + + + +__global__ void Cuda_FixTempBerendsenCuda_EndOfStep_Kernel(int groupbit,V_FLOAT factor) +{ + int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x; + if(i < _nlocal) + if (_mask[i] & groupbit) { + _v[i]*=factor; + _v[i+_nmax]*=factor; + _v[i+2*_nmax]*=factor; + } +} + diff --git a/lib/cuda/fix_temp_rescale_cuda.cu b/lib/cuda/fix_temp_rescale_cuda.cu new file mode 100644 index 0000000000..6ca0942970 --- /dev/null +++ b/lib/cuda/fix_temp_rescale_cuda.cu @@ -0,0 +1,64 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ + +#include +#define MY_PREFIX fix_temp_rescale_cuda +#include "cuda_shared.h" +#include "cuda_common.h" +#include "crm_cuda_utils.cu" + +#include "fix_temp_rescale_cuda_cu.h" +#include "fix_temp_rescale_cuda_kernel.cu" + + +void Cuda_FixTempRescaleCuda_UpdateNmax(cuda_shared_data* sdata) +{ + cudaMemcpyToSymbol(MY_CONST(mask) , & sdata->atom.mask .dev_data, sizeof(int*) ); + cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); + cudaMemcpyToSymbol(MY_CONST(nmax) , & sdata->atom.nmax , sizeof(int) ); + cudaMemcpyToSymbol(MY_CONST(v) , & sdata->atom.v .dev_data, sizeof(X_FLOAT*) ); +} + +void Cuda_FixTempRescaleCuda_Init(cuda_shared_data* sdata) +{ + Cuda_FixTempRescaleCuda_UpdateNmax(sdata); + +} + + +void Cuda_FixTempRescaleCuda_EndOfStep(cuda_shared_data* sdata, int groupbit,double afactor) +{ + V_FLOAT factor=afactor; + //if(sdata->atom.update_nmax) //fix temp rescale is usually not called every timestep so it might miss an update step + Cuda_FixTempRescaleCuda_UpdateNmax(sdata); + //if(sdata->atom.update_nlocal) + //cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); + + int3 layout=getgrid(sdata->atom.nlocal); + dim3 threads(layout.z, 1, 1); + dim3 grid(layout.x, layout.y, 1); + + Cuda_FixTempRescaleCuda_EndOfStep_Kernel<<>> (groupbit,factor); + cudaThreadSynchronize(); + CUT_CHECK_ERROR("Cuda_Cuda_FixTempRescaleCuda_PostForce: fix add_force post_force compute Kernel execution failed"); +} diff --git a/lib/cuda/fix_temp_rescale_cuda_cu.h b/lib/cuda/fix_temp_rescale_cuda_cu.h new file mode 100644 index 0000000000..689b51a603 --- /dev/null +++ b/lib/cuda/fix_temp_rescale_cuda_cu.h @@ -0,0 +1,27 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, 
sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#include "cuda_shared.h" + +extern "C" void Cuda_FixTempRescaleCuda_Init(cuda_shared_data* sdata); +extern "C" void Cuda_FixTempRescaleCuda_EndOfStep(cuda_shared_data* sdata, int groupbit,double afactor); diff --git a/lib/cuda/fix_temp_rescale_cuda_kernel.cu b/lib/cuda/fix_temp_rescale_cuda_kernel.cu new file mode 100644 index 0000000000..19d04a5156 --- /dev/null +++ b/lib/cuda/fix_temp_rescale_cuda_kernel.cu @@ -0,0 +1,36 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ + + + +__global__ void Cuda_FixTempRescaleCuda_EndOfStep_Kernel(int groupbit,V_FLOAT factor) +{ + int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x; + if(i < _nlocal) + if (_mask[i] & groupbit) { + _v[i]*=factor; + _v[i+_nmax]*=factor; + _v[i+2*_nmax]*=factor; + } +} + diff --git a/lib/cuda/fix_temp_rescale_limit_cuda.cu b/lib/cuda/fix_temp_rescale_limit_cuda.cu new file mode 100644 index 0000000000..5e2c43e932 --- /dev/null +++ b/lib/cuda/fix_temp_rescale_limit_cuda.cu @@ -0,0 +1,64 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ + +#include +#define MY_PREFIX fix_temp_rescale_limit_cuda +#include "cuda_shared.h" +#include "cuda_common.h" +#include "crm_cuda_utils.cu" + +#include "fix_temp_rescale_limit_cuda_cu.h" +#include "fix_temp_rescale_limit_cuda_kernel.cu" + + +void Cuda_FixTempRescaleLimitCuda_UpdateNmax(cuda_shared_data* sdata) +{ + cudaMemcpyToSymbol(MY_CONST(mask) , & sdata->atom.mask .dev_data, sizeof(int*) ); + cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); + cudaMemcpyToSymbol(MY_CONST(nmax) , & sdata->atom.nmax , sizeof(int) ); + cudaMemcpyToSymbol(MY_CONST(v) , & sdata->atom.v .dev_data, sizeof(X_FLOAT*) ); +} + +void Cuda_FixTempRescaleLimitCuda_Init(cuda_shared_data* sdata) +{ + Cuda_FixTempRescaleLimitCuda_UpdateNmax(sdata); + +} + + +void Cuda_FixTempRescaleLimitCuda_EndOfStep(cuda_shared_data* sdata, int groupbit,double afactor,double limit) +{ + V_FLOAT factor=afactor; + //if(sdata->atom.update_nmax) //fix temp rescale is usually not called every timestep so it might miss an update step + Cuda_FixTempRescaleLimitCuda_UpdateNmax(sdata); + //if(sdata->atom.update_nlocal) + //cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); + + int3 layout=getgrid(sdata->atom.nlocal); + dim3 threads(layout.z, 1, 1); + dim3 grid(layout.x, layout.y, 1); + + Cuda_FixTempRescaleLimitCuda_EndOfStep_Kernel<<>> (groupbit,factor,limit); + cudaThreadSynchronize(); + CUT_CHECK_ERROR("Cuda_Cuda_FixTempRescaleLimitCuda_PostForce: fix add_force post_force compute Kernel execution failed"); +} diff --git a/lib/cuda/fix_temp_rescale_limit_cuda_cu.h b/lib/cuda/fix_temp_rescale_limit_cuda_cu.h new file mode 100644 index 0000000000..117bca28d8 --- /dev/null +++ b/lib/cuda/fix_temp_rescale_limit_cuda_cu.h @@ -0,0 +1,27 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original 
Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#include "cuda_shared.h" + +extern "C" void Cuda_FixTempRescaleLimitCuda_Init(cuda_shared_data* sdata); +extern "C" void Cuda_FixTempRescaleLimitCuda_EndOfStep(cuda_shared_data* sdata, int groupbit,double afactor,double limit); diff --git a/lib/cuda/fix_temp_rescale_limit_cuda_kernel.cu b/lib/cuda/fix_temp_rescale_limit_cuda_kernel.cu new file mode 100644 index 0000000000..a6cf446677 --- /dev/null +++ b/lib/cuda/fix_temp_rescale_limit_cuda_kernel.cu @@ -0,0 +1,43 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ + + + +__global__ void Cuda_FixTempRescaleLimitCuda_EndOfStep_Kernel(int groupbit,V_FLOAT factor,V_FLOAT limit) +{ + int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x; + if(i < _nlocal) + if (_mask[i] & groupbit) { + V_FLOAT vx = _v[i]; + V_FLOAT vy = _v[i+_nmax]; + V_FLOAT vz = _v[i+2*_nmax]; + vx*=factor; + vy*=factor; + vz*=factor; + + _v[i]=vx>0?min(vx,limit):max(vx,-limit); + _v[i+_nmax]=vy>0?min(vy,limit):max(vy,-limit); + _v[i+2*_nmax]=vz>0?min(vz,limit):max(vz,-limit); + } +} + diff --git a/lib/cuda/fix_viscous_cuda.cu b/lib/cuda/fix_viscous_cuda.cu new file mode 100644 index 0000000000..3406115e58 --- /dev/null +++ b/lib/cuda/fix_viscous_cuda.cu @@ -0,0 +1,66 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ + +#include +#define MY_PREFIX fix_viscous_cuda +#include "cuda_shared.h" +#include "cuda_common.h" +#include "crm_cuda_utils.cu" + +#include "fix_viscous_cuda_cu.h" +#include "fix_viscous_cuda_kernel.cu" + +void Cuda_FixViscousCuda_UpdateNmax(cuda_shared_data* sdata) +{ + cudaMemcpyToSymbol(MY_CONST(mask) , & sdata->atom.mask .dev_data, sizeof(int*) ); + cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); + cudaMemcpyToSymbol(MY_CONST(nmax) , & sdata->atom.nmax , sizeof(int) ); + cudaMemcpyToSymbol(MY_CONST(v) , & sdata->atom.x .dev_data, sizeof(X_FLOAT*) ); + cudaMemcpyToSymbol(MY_CONST(f) , & sdata->atom.f .dev_data, sizeof(F_FLOAT*) ); + cudaMemcpyToSymbol(MY_CONST(type) , & sdata->atom.type .dev_data, sizeof(int*) ); +} + +void Cuda_FixViscousCuda_Init(cuda_shared_data* sdata) +{ + Cuda_FixViscousCuda_UpdateNmax(sdata); + +} + + +void Cuda_FixViscousCuda_PostForce(cuda_shared_data* sdata, int groupbit,void* gamma) +{ + if(sdata->atom.update_nmax) + Cuda_FixViscousCuda_UpdateNmax(sdata); + if(sdata->atom.update_nlocal) + cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); + + + int3 layout=getgrid(sdata->atom.nlocal,0); + dim3 threads(layout.z, 1, 1); + dim3 grid(layout.x, layout.y, 1); + + Cuda_FixViscousCuda_PostForce_Kernel<<>> (groupbit,(F_FLOAT*) gamma); + cudaThreadSynchronize(); + CUT_CHECK_ERROR("Cuda_Cuda_FixViscousCuda_PostForce: Kernel execution failed"); + +} diff --git a/lib/cuda/fix_viscous_cuda_cu.h b/lib/cuda/fix_viscous_cuda_cu.h new file mode 100644 index 0000000000..b785a598a8 --- /dev/null +++ b/lib/cuda/fix_viscous_cuda_cu.h @@ -0,0 +1,27 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README 
file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#include "cuda_shared.h" + +extern "C" void Cuda_FixViscousCuda_Init(cuda_shared_data* sdata); +extern "C" void Cuda_FixViscousCuda_PostForce(cuda_shared_data* sdata, int groupbit,void* gamma); diff --git a/lib/cuda/fix_viscous_cuda_kernel.cu b/lib/cuda/fix_viscous_cuda_kernel.cu new file mode 100644 index 0000000000..2cd225bbd1 --- /dev/null +++ b/lib/cuda/fix_viscous_cuda_kernel.cu @@ -0,0 +1,35 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ + +__global__ void Cuda_FixViscousCuda_PostForce_Kernel(int groupbit,F_FLOAT* gamma) +{ + int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x; + + if(i < _nlocal) + if (_mask[i] & groupbit) { + F_FLOAT drag = gamma[_type[i]]; + _f[i] -= drag*_v[i]; + _f[i+1*_nmax] -= drag*_v[i+1*_nmax]; + _f[i+2*_nmax] -= drag*_v[i+2*_nmax]; + } +} diff --git a/lib/cuda/neighbor.cu b/lib/cuda/neighbor.cu new file mode 100644 index 0000000000..a01d5b6ba9 --- /dev/null +++ b/lib/cuda/neighbor.cu @@ -0,0 +1,367 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ + +#include +#include +#define MY_PREFIX neighbor +#define IncludeCommonNeigh +#include "cuda_shared.h" +#include "cuda_common.h" +#include "crm_cuda_utils.cu" +#include "cuda_wrapper_cu.h" + +#define _cutneighsq MY_AP(cutneighsq) +#define _ex_type MY_AP(ex_type) +#define _nex_type MY_AP(nex_type) +#define _ex1_bit MY_AP(ex1_bit) +#define _ex2_bit MY_AP(ex2_bit) +#define _nex_group MY_AP(nex_group) +#define _ex_mol_bit MY_AP(ex_mol_bit) +#define _nex_mol MY_AP(nex_mol) +__device__ __constant__ CUDA_FLOAT* _cutneighsq; +__device__ __constant__ int* _ex_type; +__device__ __constant__ int _nex_type; +__device__ __constant__ int* _ex1_bit; +__device__ __constant__ int* _ex2_bit; +__device__ __constant__ int _nex_group; +__device__ __constant__ int* _ex_mol_bit; +__device__ __constant__ int _nex_mol; + +#include "neighbor_cu.h" +#include "neighbor_kernel.cu" + +void Cuda_Neighbor_UpdateBuffer(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist) +{ + CUT_CHECK_ERROR("Cuda_PairLJCutCuda: before updateBuffer failed"); + + int size=(unsigned)(sizeof(int)*20+sneighlist->bin_dim[0]*sneighlist->bin_dim[1]*sneighlist->bin_dim[2]*(sizeof(int)+sneighlist->bin_nmax*3*sizeof(CUDA_FLOAT))); + if(sdata->buffersizebuffer,sdata->buffersize);) + if(sdata->buffer!=NULL) CudaWrapper_FreeCudaData(sdata->buffer,sdata->buffersize); + sdata->buffer=CudaWrapper_AllocCudaData(size); + sdata->buffersize=size; + sdata->buffer_new++; + MYDBG(printf("New buffer at %p with %i kB\n",sdata->buffer,sdata->buffersize);) + } + cudaMemcpyToSymbol(MY_CONST(buffer), & sdata->buffer, sizeof(int*) ); + CUT_CHECK_ERROR("Cuda_PairLJCutCuda: updateBuffer failed"); +} + +int Cuda_BinAtoms(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist) +{ + if(sdata->buffer_new) + Cuda_Neighbor_UpdateBuffer(sdata,sneighlist); + + // initialize only on first call + CUDA_FLOAT rez_bin_size[3] = + { + (1.0 * sneighlist->bin_dim[0]-4.0) / 
(sdata->domain.subhi[0] - sdata->domain.sublo[0]), + (1.0 * sneighlist->bin_dim[1]-4.0) / (sdata->domain.subhi[1] - sdata->domain.sublo[1]), + (1.0 * sneighlist->bin_dim[2]-4.0) / (sdata->domain.subhi[2] - sdata->domain.sublo[2]) + }; + + short init = 0; + if(! init) + { + init = 0; + cudaMemcpyToSymbol(MY_CONST(x) , & sdata->atom.x .dev_data, sizeof(X_FLOAT*) ); + cudaMemcpyToSymbol(MY_CONST(nall) , & sdata->atom.nall , sizeof(unsigned) ); + cudaMemcpyToSymbol(MY_CONST(nmax) , & sdata->atom.nmax , sizeof(unsigned) ); + cudaMemcpyToSymbol(MY_CONST(sublo) , sdata->domain.sublo , sizeof(X_FLOAT)*3); + } + + + int3 layout = getgrid(sdata->atom.nall); // sneighlist->inum + dim3 threads(layout.z, 1, 1); + dim3 grid(layout.x, layout.y, 1); + + timespec starttime,endtime; + clock_gettime(CLOCK_REALTIME,&starttime); + + cudaMemset((int*) (sdata->buffer),0,sizeof(int)*(20+(sneighlist->bin_dim[0])*(sneighlist->bin_dim[1])*(sneighlist->bin_dim[2]))+3*sizeof(CUDA_FLOAT)*(sneighlist->bin_dim[0])*(sneighlist->bin_dim[1])*(sneighlist->bin_dim[2])*(sneighlist->bin_nmax)); + + Binning_Kernel<<>> (sneighlist->binned_id,sneighlist->bin_nmax,sneighlist->bin_dim[0],sneighlist->bin_dim[1],sneighlist->bin_dim[2],rez_bin_size[0],rez_bin_size[1],rez_bin_size[2]); + cudaThreadSynchronize(); + + clock_gettime(CLOCK_REALTIME,&endtime); + sdata->cuda_timings.neigh_bin+= + endtime.tv_sec-starttime.tv_sec+1.0*(endtime.tv_nsec-starttime.tv_nsec)/1000000000; + + + int binning_error; + cudaMemcpy((void*) &binning_error,(void*) sdata->buffer,1*sizeof(int),cudaMemcpyDeviceToHost); + if(binning_error) + { + sneighlist->bin_extraspace+=0.05; + } + else + { + MYDBG(printf("CUDA: binning successful\n");) + } + CUT_CHECK_ERROR("Cuda_Binning: binning Kernel execution failed"); + return binning_error; +} + +int Cuda_NeighborBuildFullBin(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist) +{ + //Cuda_Neighbor_UpdateBuffer(sdata,sneighlist); + CUDA_FLOAT globcutoff=-1.0; + + short init=0; + if(! 
init) + { + init = 1; + + // !! LAMMPS indexes atom types starting with 1 !! + + unsigned cuda_ntypes = sdata->atom.ntypes + 1; + + unsigned nx = sizeof(CUDA_FLOAT) * cuda_ntypes * cuda_ntypes; + + CUDA_FLOAT* acutneighsq = (CUDA_FLOAT*) malloc(nx); + //printf("Allocate: %i\n",nx); + sneighlist->cu_cutneighsq = (CUDA_FLOAT*) CudaWrapper_AllocCudaData(nx); + + if(sneighlist->cutneighsq) + { + int cutoffsdiffer=0; + double cutoff0 = sneighlist->cutneighsq[1][1]; + for(int i=1; i<=sdata->atom.ntypes; ++i) + { + for(int j=1; j<=sdata->atom.ntypes; ++j) + { + acutneighsq[i * cuda_ntypes + j] = (CUDA_FLOAT) (sneighlist->cutneighsq[i][j]); + if((sneighlist->cutneighsq[i][j]-cutoff0)*(sneighlist->cutneighsq[i][j]-cutoff0)>1e-6) cutoffsdiffer++; + } + } + if(not cutoffsdiffer) globcutoff=(CUDA_FLOAT) cutoff0; + } + else + { + MYEMUDBG( printf("# CUDA: Cuda_NeighborBuild: cutneighsq == NULL\n"); ) + return 0; + } + + int size = 100; + if(sdata->buffersize < size) + { + MYDBG( printf("Cuda_NeighborBuild Resizing Buffer at %p with %i kB to\n", sdata->buffer, sdata->buffersize); ) + CudaWrapper_FreeCudaData(sdata->buffer,sdata->buffersize); + sdata->buffer = CudaWrapper_AllocCudaData(size); + sdata->buffersize = size; + sdata->buffer_new++; + MYDBG( printf("New buffer at %p with %i kB\n", sdata->buffer, sdata->buffersize); ) + } + + CudaWrapper_UploadCudaData(acutneighsq,sneighlist->cu_cutneighsq,nx); + cudaMemcpyToSymbol(MY_CONST(cutneighsq) , &sneighlist->cu_cutneighsq , sizeof(CUDA_FLOAT*) ); + + cudaMemcpyToSymbol(MY_CONST(cuda_ntypes) , & cuda_ntypes , sizeof(unsigned) ); + cudaMemcpyToSymbol(MY_CONST(special_flag) , sdata->atom.special_flag , 4*sizeof(int) ); + cudaMemcpyToSymbol(MY_CONST(molecular) , & sdata->atom.molecular , sizeof(int) ); + } + + cudaMemcpyToSymbol(MY_CONST(neighbor_maxlocal), & sneighlist->firstneigh.dim[0] , sizeof(unsigned) ); + //cudaMemcpyToSymbol(MY_CONST(firstneigh) , & sneighlist->firstneigh.dev_data, sizeof(int*) ); + 
cudaMemcpyToSymbol(MY_CONST(ilist) , & sneighlist->ilist .dev_data, sizeof(int*) ); + cudaMemcpyToSymbol(MY_CONST(inum) , & sneighlist->inum , sizeof(int) ); + cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); + cudaMemcpyToSymbol(MY_CONST(nall) , & sdata->atom.nall , sizeof(int) ); + cudaMemcpyToSymbol(MY_CONST(numneigh) , & sneighlist->numneigh .dev_data, sizeof(int*) ); + cudaMemcpyToSymbol(MY_CONST(type) , & sdata->atom.type .dev_data, sizeof(int*) ); + cudaMemcpyToSymbol(MY_CONST(mask) , & sdata->atom.mask .dev_data, sizeof(int*) ); + cudaMemcpyToSymbol(MY_CONST(tag) , & sdata->atom.tag .dev_data, sizeof(int*) ); + cudaMemcpyToSymbol(MY_CONST(special) , & sdata->atom.special .dev_data, sizeof(int*) ); + cudaMemcpyToSymbol(MY_CONST(maxspecial) , & sdata->atom.maxspecial , sizeof(int) ); + cudaMemcpyToSymbol(MY_CONST(nspecial) , & sdata->atom.nspecial .dev_data, sizeof(int*) ); + cudaMemcpyToSymbol(MY_CONST(maxneighbors) , & sneighlist->maxneighbors , sizeof(int) ); + cudaMemcpyToSymbol(MY_CONST(debugdata) , & sdata->debugdata , sizeof(int*) ); + cudaMemcpyToSymbol(MY_CONST(overlap_comm) , & sdata->overlap_comm, sizeof(int) ); + cudaMemcpyToSymbol(MY_CONST(neighbors) , & sneighlist->neighbors.dev_data, sizeof(int*)); + cudaMemcpyToSymbol(MY_CONST(ex_type) , & sneighlist->ex_type.dev_data, sizeof(int*)); + cudaMemcpyToSymbol(MY_CONST(ex1_bit) , & sneighlist->ex1_bit.dev_data, sizeof(int*)); + cudaMemcpyToSymbol(MY_CONST(ex2_bit) , & sneighlist->ex2_bit.dev_data, sizeof(int*)); + cudaMemcpyToSymbol(MY_CONST(ex_mol_bit) , & sneighlist->ex_mol_bit.dev_data, sizeof(int*)); + cudaMemcpyToSymbol(MY_CONST(nex_type) , & sneighlist->nex_type, sizeof(int) ); + cudaMemcpyToSymbol(MY_CONST(nex_group) , & sneighlist->nex_group, sizeof(int) ); + cudaMemcpyToSymbol(MY_CONST(nex_mol) , & sneighlist->nex_mol, sizeof(int) ); + + if(sdata->overlap_comm) + { + cudaMemcpyToSymbol(MY_CONST(numneigh_border) , & sneighlist->numneigh_border .dev_data, 
sizeof(int*)); + cudaMemcpyToSymbol(MY_CONST(numneigh_inner) , & sneighlist->numneigh_inner .dev_data, sizeof(int*)); + cudaMemcpyToSymbol(MY_CONST(neighbors_border) , & sneighlist->neighbors_border.dev_data, sizeof(int*)); + cudaMemcpyToSymbol(MY_CONST(neighbors_inner) , & sneighlist->neighbors_inner .dev_data, sizeof(int*)); + cudaMemcpyToSymbol(MY_CONST(ilist_border) , & sneighlist->ilist_border .dev_data, sizeof(int*)); + cudaMemcpyToSymbol(MY_CONST(inum_border) , & sneighlist->inum_border .dev_data, sizeof(int*) ); + } + + //dim3 threads(sneighlist->bin_nmax,1,1); + dim3 threads(MIN(128,sneighlist->bin_nmax),1,1); + dim3 grid(sneighlist->bin_dim[0]*sneighlist->bin_dim[1],sneighlist->bin_dim[2],1); + + //printf("Configuration: %i %i %i %i %i\n",grid.x,grid.y,threads.x,(sizeof(int)+3*sizeof(X_FLOAT))*threads.x,sneighlist->bin_nmax); + int buffer[20]; + buffer[0]=1; + buffer[1]=0; + CudaWrapper_UploadCudaData( buffer, sdata->buffer, 2*sizeof(int)); + CUT_CHECK_ERROR("Cuda_NeighborBuild: pre neighbor build kernel error"); + //cudaMemset(sdata->debugdata,0,100*sizeof(int)); + unsigned int shared_size=(sizeof(int)+3*sizeof(CUDA_FLOAT))*threads.x; + MYDBG(printf("Configuration: %i %i %i %u %i\n",grid.x,grid.y,threads.x,shared_size,sneighlist->bin_nmax);) + //shared_size=2056; + timespec starttime,endtime; + clock_gettime(CLOCK_REALTIME,&starttime); + //for(int i=0;i<100;i++) + { + if(sdata->overlap_comm) + NeighborBuildFullBin_OverlapComm_Kernel<<>> + (sneighlist->binned_id,sneighlist->bin_nmax,sneighlist->bin_dim[0],sneighlist->bin_dim[1],globcutoff,sdata->pair.use_block_per_atom); + else + { + int exclude=sneighlist->nex_mol|sneighlist->nex_group|sneighlist->nex_type; + if(exclude) + NeighborBuildFullBin_Kernel<1><<>> + (sneighlist->binned_id,sneighlist->bin_nmax,sneighlist->bin_dim[0],sneighlist->bin_dim[1],globcutoff,sdata->pair.use_block_per_atom); + else + NeighborBuildFullBin_Kernel<0><<>> + 
(sneighlist->binned_id,sneighlist->bin_nmax,sneighlist->bin_dim[0],sneighlist->bin_dim[1],globcutoff,sdata->pair.use_block_per_atom); + } + //NeighborBuildFullBin_Kernel_Restrict<<>> + // (sneighlist->binned_id,sneighlist->bin_nmax,sneighlist->bin_dim[0],sneighlist->bin_dim[1],globcutoff); + + cudaThreadSynchronize(); + CUT_CHECK_ERROR("Cuda_NeighborBuild: neighbor build kernel execution failed"); + clock_gettime(CLOCK_REALTIME,&endtime); + sdata->cuda_timings.neigh_build+= + endtime.tv_sec-starttime.tv_sec+1.0*(endtime.tv_nsec-starttime.tv_nsec)/1000000000; + //dim3 threads,grid; + CudaWrapper_DownloadCudaData(buffer, sdata->buffer, sizeof(int)); + if(buffer[0]>=0&&true&&sdata->atom.molecular) + { + //printf("Find Special: %i %i\n",sneighlist->inum,sdata->atom.nall); + clock_gettime(CLOCK_REALTIME,&starttime); + int3 layout=getgrid(sdata->atom.nlocal,0,512); + threads.x = layout.z; threads.y = 1; threads.z = 1; + grid.x = layout.x; grid.y = layout.y; grid.z = 1; + FindSpecial<<>>(sdata->pair.use_block_per_atom); + cudaThreadSynchronize(); + CUT_CHECK_ERROR("Cuda_NeighborBuild: FindSpecial kernel execution failed"); + clock_gettime(CLOCK_REALTIME,&endtime); + sdata->cuda_timings.neigh_special+= + endtime.tv_sec-starttime.tv_sec+1.0*(endtime.tv_nsec-starttime.tv_nsec)/1000000000; + } + } + //printf("Neightime: %lf\n",sdata->cuda_timings.test1); + CUT_CHECK_ERROR("Cuda_NeighborBuild: neighbor build kernel execution failed"); + + //CudaWrapper_DownloadCudaData(buffer, sneighlist->numneigh_border .dev_data, sizeof(int)); + + MYDBG(printf("Cuda_NeighborBuildFullBin build neighbor list ... end\n");) + return buffer[0]; +} + +int Cuda_NeighborBuildFullNsq(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist) +{ + MYDBG(printf("Cuda_NeighborBuildFullNsq build neighbor list ... start\n");) + // initialize only on first call + /*static*/ short init=0; + if(! init) + { + init = 1; + + // !! LAMMPS indexes atom types starting with 1 !! 
+ + unsigned cuda_ntypes = sdata->atom.ntypes + 1; + if(cuda_ntypes*cuda_ntypes > CUDA_MAX_TYPES2) + printf("# CUDA: Cuda_PairLJCutCuda_Init: you need %u types. this is more than %u " + "(assumed at compile time). re-compile with -DCUDA_MAX_TYPES_PLUS_ONE=32 " + "or ajust this in cuda_common.h\n", cuda_ntypes, CUDA_MAX_TYPES2); + + unsigned nx = sizeof(CUDA_FLOAT) * cuda_ntypes * cuda_ntypes; + CUDA_FLOAT* acutneighsq = (CUDA_FLOAT*) malloc(nx); + + if(sneighlist->cutneighsq) + { + for(int i=1; i<=sdata->atom.ntypes; ++i) + { + for(int j=1; j<=sdata->atom.ntypes; ++j) + { + acutneighsq[i * cuda_ntypes + j] = (CUDA_FLOAT) (sneighlist->cutneighsq[i][j]); + //printf("CUTOFFS: %i %i %i %e\n",i,j,cuda_ntypes,acutneighsq[i * cuda_ntypes + j]); + } + } + } + else + { + MYEMUDBG( printf("# CUDA: Cuda_NeighborBuild: cutneighsq == NULL\n"); ) + return 0; + } + + int size = 100; + if(sdata->buffersize < size) + { + MYDBG( printf("Cuda_NeighborBuild Resizing Buffer at %p with %i kB to\n", sdata->buffer, sdata->buffersize); ) + CudaWrapper_FreeCudaData(sdata->buffer,sdata->buffersize); + sdata->buffer = CudaWrapper_AllocCudaData(size); + sdata->buffersize = size; + sdata->buffer_new++; + MYDBG( printf("New buffer at %p with %i kB\n", sdata->buffer, sdata->buffersize); ) + } + + cudaMemcpyToSymbol(MY_CONST(buffer) , & sdata->buffer , sizeof(int*) ); + cudaMemcpyToSymbol(MY_CONST(cuda_ntypes) , & cuda_ntypes , sizeof(unsigned) ); + cudaMemcpyToSymbol(MY_CONST(cutneighsq) , acutneighsq , nx ); + cudaMemcpyToSymbol(MY_CONST(neighbor_maxlocal), & sneighlist->firstneigh.dim[0] , sizeof(unsigned) ); + cudaMemcpyToSymbol(MY_CONST(firstneigh) , & sneighlist->firstneigh.dev_data, sizeof(int*) ); + cudaMemcpyToSymbol(MY_CONST(ilist) , & sneighlist->ilist .dev_data, sizeof(int*) ); + cudaMemcpyToSymbol(MY_CONST(inum) , & sneighlist->inum , sizeof(int) ); + cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); + cudaMemcpyToSymbol(MY_CONST(nall) , & sdata->atom.nall , 
sizeof(int) ); + cudaMemcpyToSymbol(MY_CONST(nmax) , & sdata->atom.nmax , sizeof(int) ); + cudaMemcpyToSymbol(MY_CONST(numneigh) , & sneighlist->numneigh .dev_data, sizeof(int*) ); + cudaMemcpyToSymbol(MY_CONST(type) , & sdata->atom.type .dev_data, sizeof(int*) ); + cudaMemcpyToSymbol(MY_CONST(x) , & sdata->atom.x .dev_data, sizeof(X_FLOAT*) ); + cudaMemcpyToSymbol(MY_CONST(maxneighbors) , & sneighlist->maxneighbors , sizeof(int) ); + + free(acutneighsq); + } + + int3 layout = getgrid(sdata->atom.nlocal); // sneighlist->inum + dim3 threads(layout.z, 1, 1); + dim3 grid(layout.x, layout.y, 1); + + int return_value = 1; + CudaWrapper_UploadCudaData(& return_value, sdata->buffer, sizeof(int)); + + CUT_CHECK_ERROR("Cuda_NeighborBuild: pre neighbor build kernel execution failed"); + NeighborBuildFullNsq_Kernel<<>> (); + cudaThreadSynchronize(); + CUT_CHECK_ERROR("Cuda_NeighborBuild: neighbor build kernel execution failed"); + + int buffer[20]; + CudaWrapper_DownloadCudaData(buffer, sdata->buffer, sizeof(int)*20); + MYDBG(printf("Cuda_NeighborBuildFullNSQ build neighbor list ... end\n");) + return return_value=buffer[0]; +} diff --git a/lib/cuda/neighbor_cu.h b/lib/cuda/neighbor_cu.h new file mode 100644 index 0000000000..6ca1440de0 --- /dev/null +++ b/lib/cuda/neighbor_cu.h @@ -0,0 +1,32 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. 
+ + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#ifndef NEIGHBOR_CU_H_ +#define NEIGHBOR_CU_H_ +#include "cuda_shared.h" + +extern "C" int Cuda_BinAtoms(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist); +extern "C" int Cuda_NeighborBuildFullBin(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist); +extern "C" int Cuda_NeighborBuildFullNsq(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist); + +#endif /*NEIGHBOR_CU_H_*/ diff --git a/lib/cuda/neighbor_kernel.cu b/lib/cuda/neighbor_kernel.cu new file mode 100644 index 0000000000..ad1a6a8fe7 --- /dev/null +++ b/lib/cuda/neighbor_kernel.cu @@ -0,0 +1,623 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ + +__global__ void Binning_Kernel(int* binned_id,int bin_nmax,int bin_dim_x,int bin_dim_y,int bin_dim_z, + CUDA_FLOAT rez_bin_size_x,CUDA_FLOAT rez_bin_size_y,CUDA_FLOAT rez_bin_size_z) +{ + int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; + + /*int* bin_count=(int*) _buffer; + bin_count=bin_count+20; + CUDA_FLOAT* binned_x=(CUDA_FLOAT*)(bin_count+bin_dim_x*bin_dim_y*bin_dim_z);*/ + CUDA_FLOAT* binned_x=(CUDA_FLOAT*) _buffer; + binned_x = &binned_x[2]; + int* bin_count=(int*) &binned_x[3*bin_dim_x*bin_dim_y*bin_dim_z*bin_nmax]; + if(i < _nall) + { + // copy atom position from global device memory to local register + // in this 3 steps to get as much coalesced access as possible + X_FLOAT* my_x = _x + i; + CUDA_FLOAT x_i = *my_x; my_x += _nmax; + CUDA_FLOAT y_i = *my_x; my_x += _nmax; + CUDA_FLOAT z_i = *my_x; + + + // calculate flat bin index + int bx=__float2int_rd(rez_bin_size_x * (x_i - _sublo[0]))+2; + int by=__float2int_rd(rez_bin_size_y * (y_i - _sublo[1]))+2; + int bz=__float2int_rd(rez_bin_size_z * (z_i - _sublo[2]))+2; + + bx-=bx*negativCUDA(1.0f*bx); + bx-=(bx-bin_dim_x+1)*negativCUDA(1.0f*bin_dim_x-1.0f-1.0f*bx); + by-=by*negativCUDA(1.0f*by); + by-=(by-bin_dim_y+1)*negativCUDA(1.0f*bin_dim_y-1.0f-1.0f*by); + bz-=bz*negativCUDA(1.0f*bz); + bz-=(bz-bin_dim_z+1)*negativCUDA(1.0f*bin_dim_z-1.0f-1.0f*bz); + + + const unsigned j = bin_dim_z * ( bin_dim_y *bx+by)+bz; + + // add new atom to bin, get bin-array position + const unsigned k = atomicAdd(& bin_count[j], 1); + if(k < bin_nmax) + { + binned_id [bin_nmax * j + k] = i; + binned_x [3 * bin_nmax * j + k] = x_i; + binned_x [3 * bin_nmax * j + k + bin_nmax] = y_i; + binned_x [3 * bin_nmax * j + k + 2*bin_nmax] = z_i; + } + else + { // normally, this should not happen: + int errorn=atomicAdd((int*) _buffer, 1); + MYEMUDBG( printf("# CUDA: Binning_Kernel: WARNING: atom %i ignored, no place left in bin %u\n", 
i, j); ) + } + } +} + + +__device__ inline int exclusion(int &i, int &j, int &itype, int &jtype) +{ + int m; + + if (_nex_type) + if( _ex_type[itype * _cuda_ntypes + jtype]) return 1; + + if (_nex_group) { + for (m = 0; m < _nex_group; m++) { + if (_mask[i] & _ex1_bit[m] && _mask[j] & _ex2_bit[m]) return 1; + if (_mask[i] & _ex2_bit[m] && _mask[j] & _ex1_bit[m]) return 1; + } + } + + if (_nex_mol) { + if(_molecule[i] == _molecule[j]) + for (m = 0; m < _nex_mol; m++) + if (_mask[i] & _ex_mol_bit[m] && _mask[j] & _ex_mol_bit[m] ) return 1; + } + + return 0; +} + +extern __shared__ CUDA_FLOAT shared[]; + +__device__ inline int find_special(int3 &n, int* list,int & tag,int3 flag) +{ + int k=n.z; + for (int l = 0; l < n.z; l++) k = ((list[l] == tag)?l:k); + + return k +__global__ void NeighborBuildFullBin_Kernel(int* binned_id,int bin_nmax,int bin_dim_x,int bin_dim_y,CUDA_FLOAT globcutoff,int block_style) +{ + //const bool domol=false; + int bin_dim_z=gridDim.y; + CUDA_FLOAT* binned_x=(CUDA_FLOAT*) _buffer; + binned_x = &binned_x[2]; + int* bin_count=(int*) &binned_x[3*bin_dim_x*bin_dim_y*bin_dim_z*bin_nmax]; + int bin = __mul24(gridDim.y,blockIdx.x)+blockIdx.y; + int bin_x = blockIdx.x/bin_dim_y; + int bin_y = blockIdx.x-bin_x*bin_dim_y; + int bin_z = blockIdx.y; + int bin_c = bin_count[bin]; + + + CUDA_FLOAT cut; + if(globcutoff>0) + cut = globcutoff; + + int i=_nall; + CUDA_FLOAT* my_x; + CUDA_FLOAT x_i,y_i,z_i; + + for(int actOffset=0; actOffset=bin_dim_x||obin_y>=bin_dim_y||obin_z>=bin_dim_z) continue; + int other_bin=bin_dim_z * ( bin_dim_y * obin_x + obin_y) + obin_z; + if(other_bin==bin) continue; + + int obin_c=bin_count[other_bin]; + + for(int otherActOffset=0; otherActOffset _maxneighbors) ((int*)_buffer)[0] = -jnum; + + if(i<_nlocal) + _numneigh[i] = jnum; + } +} + + +__global__ void FindSpecial(int block_style) +{ + int ii = (blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x; + int which; + int tag_mask=0; + int3 spec_flag; + + int3 mynspecial = 
{0,0,1}; + if(ii>=_nlocal) return; + int special_id[CUDA_MAX_NSPECIAL]; + + int i = _ilist[ii]; + if(i>=_nlocal) return; + int jnum = _numneigh[i]; + if (_special_flag[1] == 0) spec_flag.x = -1; + else if (_special_flag[1] == 1) spec_flag.x = 0; + else spec_flag.x = 1; + + if (_special_flag[2] == 0) spec_flag.y = -1; + else if (_special_flag[2] == 1) spec_flag.y = 0; + else spec_flag.y = 2; + + if (_special_flag[3] == 0) spec_flag.z = -1; + else if (_special_flag[3] == 1) spec_flag.z = 0; + else spec_flag.z = 3; + + mynspecial.x=_nspecial[i]; + mynspecial.y=_nspecial[i+_nmax]; + mynspecial.z=_nspecial[i+2*_nmax]; + + if(i<_nlocal) + { + int* list = &_special[i]; + for(int k=0;k0) + { + if(block_style) + _neighbors[i*_maxneighbors+k]=j+which*_nall; + else + _neighbors[i+k*_nlocal]=j+which*_nall; + } + else if(which<0) + { + if(block_style) + _neighbors[i*_maxneighbors+k]=_neighbors[i*_maxneighbors+jnum-1]; + else + _neighbors[i+k*_nlocal]=_neighbors[i+(jnum-1)*_nlocal]; + jnum--; + k--; + } + } + } + _numneigh[i]=jnum; +} + +__global__ void NeighborBuildFullBin_OverlapComm_Kernel(int* binned_id,int bin_nmax,int bin_dim_x,int bin_dim_y,CUDA_FLOAT globcutoff,int block_style) +{ + int bin_dim_z=gridDim.y; + CUDA_FLOAT* binned_x=(CUDA_FLOAT*) _buffer; + binned_x = &binned_x[2]; + int* bin_count=(int*) &binned_x[3*bin_dim_x*bin_dim_y*bin_dim_z*bin_nmax]; + int bin = __mul24(gridDim.y,blockIdx.x)+blockIdx.y; + int bin_x = blockIdx.x/bin_dim_y; + int bin_y = blockIdx.x-bin_x*bin_dim_y; + int bin_z = blockIdx.y; + int bin_c = bin_count[bin]; + + + CUDA_FLOAT cut; + if(globcutoff>0) + cut = globcutoff; + + int i=_nall; + CUDA_FLOAT* my_x; + CUDA_FLOAT x_i,y_i,z_i; + + for(int actOffset=0; actOffset=_nlocal)&&(i_border<0)) + i_border=atomicAdd(_inum_border,1); + + if(jnum<_maxneighbors) + { + if(block_style) + { + _neighbors[i*_maxneighbors+jnum]= j; + if(j>=_nlocal) + {_neighbors_border[i_border*_maxneighbors+jnum_border]=j;} + else + 
{_neighbors_inner[i*_maxneighbors+jnum_inner]=j;} + } + else + { + _neighbors[i+jnum*_nlocal]=j; + if(j>=_nlocal) + {_neighbors_border[i_border+jnum_border*_nlocal]=j;} + else + {_neighbors_inner[i+jnum_inner*_nlocal]=j;} + } + } + ++jnum; + if(j>=_nlocal) + jnum_border++; + else + jnum_inner++; + } + } + } + __syncthreads(); + } + for(int obin_x=bin_x-1;obin_x=bin_dim_x||obin_y>=bin_dim_y||obin_z>=bin_dim_z) continue; + int other_bin=bin_dim_z * ( bin_dim_y * obin_x + obin_y) + obin_z; + if(other_bin==bin) continue; + + int obin_c=bin_count[other_bin]; + + for(int otherActOffset=0; otherActOffset=_nlocal)&&(i_border<0)) + i_border=atomicAdd(_inum_border,1); + if(jnum<_maxneighbors) + { + if(block_style) + { + _neighbors[i*_maxneighbors+jnum]= j; + if(j>=_nlocal) + {_neighbors_border[i_border*_maxneighbors+jnum_border]=j;} + else + {_neighbors_inner[i*_maxneighbors+jnum_inner]=j;} + } + else + { + _neighbors[i+jnum*_nlocal]=j; + if(j>=_nlocal) + {_neighbors_border[i_border+jnum_border*_nlocal]=j;} + else + {_neighbors_inner[i+jnum_inner*_nlocal]=j;} + } + } + ++jnum; + if(j>=_nlocal) + jnum_border++; + else + jnum_inner++; + } + } + } + __syncthreads(); + } + } + + if(jnum > _maxneighbors) ((int*)_buffer)[0] = -jnum; + + if(i<_nlocal) + { + _numneigh[i] = jnum; + _numneigh_inner[i] = jnum_inner; + if(i_border>=0) _numneigh_border[i_border] = jnum_border; + if(i_border>=0) _ilist_border[i_border] = i; + + } + } +} + +__global__ void NeighborBuildFullNsq_Kernel() +{ + int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; + int* buffer = (int*) _buffer; + + if(i < _nlocal) + { + X_FLOAT* my_x = _x + i; + CUDA_FLOAT x_i = *my_x; my_x += _nmax; + CUDA_FLOAT y_i = *my_x; my_x += _nmax; + CUDA_FLOAT z_i = *my_x; + int jnum = 0; + int* jlist = _firstneigh[i]; + _ilist[i]=i; + + int itype = _type[i]; + __syncthreads(); + for(int j = 0; j < _nall; ++j) + { + my_x = _x + j; + CUDA_FLOAT x_j = *my_x; my_x += _nmax; + CUDA_FLOAT y_j = *my_x; my_x += _nmax; + 
CUDA_FLOAT z_j = *my_x; + CUDA_FLOAT delx = x_i - x_j; + CUDA_FLOAT dely = y_i - y_j; + CUDA_FLOAT delz = z_i - z_j; + CUDA_FLOAT rsq = delx*delx + dely*dely + delz*delz; + int jtype = _type[j]; + if(rsq <= _cutneighsq[itype * _cuda_ntypes + jtype] && i != j) + { + if(jnum<_maxneighbors) + jlist[jnum] = j; + if(i==151) ((int*)_buffer)[jnum+2]=j; + ++jnum; + } + __syncthreads(); + } + if(jnum > _maxneighbors) buffer[0] = 0; + _numneigh[i] = jnum; + if(i==151) ((int*)_buffer)[1]=jnum; + } +} + diff --git a/lib/cuda/pair_born_coul_long_cuda.cu b/lib/cuda/pair_born_coul_long_cuda.cu new file mode 100644 index 0000000000..913d5eb2c5 --- /dev/null +++ b/lib/cuda/pair_born_coul_long_cuda.cu @@ -0,0 +1,78 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ + +#include + +#define _rhoinv MY_AP(coeff1) +#define _sigma MY_AP(coeff2) +#define _a MY_AP(coeff3) +#define _c MY_AP(coeff4) +#define _d MY_AP(coeff5) + +#include "pair_born_coul_long_cuda_cu.h" +#include "pair_born_coul_long_cuda_kernel_nc.cu" + +#include + +void Cuda_PairBornCoulLongCuda_Init(cuda_shared_data* sdata) +{ + Cuda_Pair_Init_AllStyles(sdata, 5,true); +} + +void Cuda_PairBornCoulLongCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,int eflag_atom,int vflag_atom) +{ + + + static short init=0; + if(! init) + { + init = 1; + Cuda_PairBornCoulLongCuda_Init(sdata); + } + + dim3 grid,threads; + int sharedperproc; + + + Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc,true,192); + + cudaStream_t* streams = (cudaStream_t*) CudaWrapper_returnStreams(); + + if(sdata->pair.use_block_per_atom) + Pair_Kernel_BpA + <<>> (eflag, vflag,eflag_atom,vflag_atom); + else + Pair_Kernel_TpA + <<>> (eflag, vflag,eflag_atom,vflag_atom); + + Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag); +} + + + +#undef _rhoinv +#undef _sigma +#undef _a +#undef _c +#undef _d + diff --git a/lib/cuda/pair_born_coul_long_cuda_cu.h b/lib/cuda/pair_born_coul_long_cuda_cu.h new file mode 100644 index 0000000000..e47968d0f9 --- /dev/null +++ b/lib/cuda/pair_born_coul_long_cuda_cu.h @@ -0,0 +1,30 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. 
+ + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#include "cuda_shared.h" + +#ifdef CUDA_USE_BINNING +extern "C" void Cuda_PairBornCoulLongCuda(cuda_shared_data* sdata, int eflag, int vflag); +#else +extern "C" void Cuda_PairBornCoulLongCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,int eflag_atom,int vflag_atom); +#endif diff --git a/lib/cuda/pair_born_coul_long_cuda_kernel_nc.cu b/lib/cuda/pair_born_coul_long_cuda_kernel_nc.cu new file mode 100644 index 0000000000..651326cb60 --- /dev/null +++ b/lib/cuda/pair_born_coul_long_cuda_kernel_nc.cu @@ -0,0 +1,34 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ +__device__ inline F_FLOAT PairBornCuda_Eval(const F_FLOAT& rsq,const int ij_type,F_FLOAT& factor_lj,int& eflag, ENERGY_FLOAT& evdwl) +{ + const F_FLOAT r2inv = F_F(1.0)/rsq; + const F_FLOAT r = _RSQRT_(r2inv); + const F_FLOAT r6inv = r2inv*r2inv*r2inv; + const F_FLOAT rexp = _EXP_((_sigma[ij_type]-r)*_rhoinv[ij_type]); + const F_FLOAT forceborn = _a[ij_type]*_rhoinv[ij_type]*r*rexp - + F_F(6.0)*_c[ij_type]*r6inv + F_F(8.0)*_d[ij_type]*r2inv*r6inv; + if(eflag) evdwl += factor_lj*(_a[ij_type]*rexp - _c[ij_type]*r6inv + +_d[ij_type]*r2inv*r6inv-_offset[ij_type]); + return factor_lj*forceborn*r2inv; +} diff --git a/lib/cuda/pair_buck_coul_cut_cuda.cu b/lib/cuda/pair_buck_coul_cut_cuda.cu new file mode 100644 index 0000000000..b20de75efb --- /dev/null +++ b/lib/cuda/pair_buck_coul_cut_cuda.cu @@ -0,0 +1,74 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ + +#include + +#define _rhoinv MY_AP(coeff1) +#define _buck1 MY_AP(coeff2) +#define _buck2 MY_AP(coeff3) +#define _a MY_AP(coeff4) +#define _c MY_AP(coeff5) + +#include "pair_buck_coul_cut_cuda_cu.h" + +#include +void Cuda_PairBuckCoulCutCuda_Init(cuda_shared_data* sdata) +{ + Cuda_Pair_Init_AllStyles(sdata, 5,true); +} + +void Cuda_PairBuckCoulCutCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,int eflag_atom,int vflag_atom) +{ + + + static short init=0; + if(! init) + { + init = 1; + Cuda_PairBuckCoulCutCuda_Init(sdata); + } + + dim3 grid,threads; + int sharedperproc; + + + Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc,true,192); + + cudaStream_t* streams = (cudaStream_t*) CudaWrapper_returnStreams(); + if(sdata->pair.use_block_per_atom) + Pair_Kernel_BpA + <<>> (eflag, vflag,eflag_atom,vflag_atom); + else + Pair_Kernel_TpA + <<>> (eflag, vflag,eflag_atom,vflag_atom); + + Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag); +} + + +#undef _rhoinv +#undef _buck1 +#undef _buck2 +#undef _a +#undef _c + diff --git a/lib/cuda/pair_buck_coul_cut_cuda_cu.h b/lib/cuda/pair_buck_coul_cut_cuda_cu.h new file mode 100644 index 0000000000..1a2576ccae --- /dev/null +++ b/lib/cuda/pair_buck_coul_cut_cuda_cu.h @@ -0,0 +1,30 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. 
+ + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#include "cuda_shared.h" + +#ifdef CUDA_USE_BINNING +extern "C" void Cuda_PairBuckCoulCutCuda(cuda_shared_data* sdata, int eflag, int vflag); +#else +extern "C" void Cuda_PairBuckCoulCutCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,int eflag_atom,int vflag_atom); +#endif diff --git a/lib/cuda/pair_buck_coul_long_cuda.cu b/lib/cuda/pair_buck_coul_long_cuda.cu new file mode 100644 index 0000000000..70e53edf08 --- /dev/null +++ b/lib/cuda/pair_buck_coul_long_cuda.cu @@ -0,0 +1,77 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ + +#include + +#define _rhoinv MY_AP(coeff1) +#define _buck1 MY_AP(coeff2) +#define _buck2 MY_AP(coeff3) +#define _a MY_AP(coeff4) +#define _c MY_AP(coeff5) + +#include "pair_buck_coul_long_cuda_cu.h" + +#include + +void Cuda_PairBuckCoulLongCuda_Init(cuda_shared_data* sdata) +{ + Cuda_Pair_Init_AllStyles(sdata, 5,true); +} + +void Cuda_PairBuckCoulLongCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,int eflag_atom,int vflag_atom) +{ + + + static short init=0; + if(! init) + { + init = 1; + Cuda_PairBuckCoulLongCuda_Init(sdata); + } + + dim3 grid,threads; + int sharedperproc; + + + Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc,true,192); + + cudaStream_t* streams = (cudaStream_t*) CudaWrapper_returnStreams(); + if(sdata->pair.use_block_per_atom) + Pair_Kernel_BpA + <<>> (eflag, vflag,eflag_atom,vflag_atom); + else + Pair_Kernel_TpA + <<>> (eflag, vflag,eflag_atom,vflag_atom); + + Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag); +} + + + + +#undef _rhoinv +#undef _buck1 +#undef _buck2 +#undef _a +#undef _c + diff --git a/lib/cuda/pair_buck_coul_long_cuda_cu.h b/lib/cuda/pair_buck_coul_long_cuda_cu.h new file mode 100644 index 0000000000..77cbb4c07f --- /dev/null +++ b/lib/cuda/pair_buck_coul_long_cuda_cu.h @@ -0,0 +1,30 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. 
+ + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#include "cuda_shared.h" + +#ifdef CUDA_USE_BINNING +extern "C" void Cuda_PairBuckCoulLongCuda(cuda_shared_data* sdata, int eflag, int vflag); +#else +extern "C" void Cuda_PairBuckCoulLongCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,int eflag_atom,int vflag_atom); +#endif diff --git a/lib/cuda/pair_buck_cuda.cu b/lib/cuda/pair_buck_cuda.cu new file mode 100644 index 0000000000..c14abc0067 --- /dev/null +++ b/lib/cuda/pair_buck_cuda.cu @@ -0,0 +1,76 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ + +#include + +#define _rhoinv MY_AP(coeff1) +#define _buck1 MY_AP(coeff2) +#define _buck2 MY_AP(coeff3) +#define _a MY_AP(coeff4) +#define _c MY_AP(coeff5) + +#include "pair_buck_cuda_cu.h" +#include "pair_buck_cuda_kernel_nc.cu" + +#include + +void Cuda_PairBuckCuda_Init(cuda_shared_data* sdata) +{ + Cuda_Pair_Init_AllStyles(sdata, 5); +} + +void Cuda_PairBuckCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,int eflag_atom,int vflag_atom) +{ + + + static short init=0; + if(! init) + { + init = 1; + Cuda_PairBuckCuda_Init(sdata); + } + + dim3 grid,threads; + int sharedperproc; + + + Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc,false,192); + + cudaStream_t* streams = (cudaStream_t*) CudaWrapper_returnStreams(); + if(sdata->pair.use_block_per_atom) + Pair_Kernel_BpA + <<>> (eflag, vflag,eflag_atom,vflag_atom); + else + Pair_Kernel_TpA + <<>> (eflag, vflag,eflag_atom,vflag_atom); + + Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag); +} + + +#undef _rhoinv +#undef _buck1 +#undef _buck2 +#undef _a +#undef _c + diff --git a/lib/cuda/pair_buck_cuda_cu.h b/lib/cuda/pair_buck_cuda_cu.h new file mode 100644 index 0000000000..92b6350d9f --- /dev/null +++ b/lib/cuda/pair_buck_cuda_cu.h @@ -0,0 +1,30 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. 
+ + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#include "cuda_shared.h" + +#ifdef CUDA_USE_BINNING +extern "C" void Cuda_PairBuckCuda(cuda_shared_data* sdata, int eflag, int vflag); +#else +extern "C" void Cuda_PairBuckCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,int eflag_atom,int vflag_atom); +#endif diff --git a/lib/cuda/pair_buck_cuda_kernel_nc.cu b/lib/cuda/pair_buck_cuda_kernel_nc.cu new file mode 100644 index 0000000000..3ec40a26f8 --- /dev/null +++ b/lib/cuda/pair_buck_cuda_kernel_nc.cu @@ -0,0 +1,33 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ +__device__ inline F_FLOAT PairBuckCuda_Eval(const F_FLOAT& rsq,const int ij_type,F_FLOAT& factor_lj,int& eflag, ENERGY_FLOAT& evdwl) +{ + const F_FLOAT r2inv = F_F(1.0)/rsq; + const F_FLOAT r6inv = r2inv*r2inv*r2inv; + const F_FLOAT r = _RSQRT_(r2inv); + const F_FLOAT rexp = _EXP_(-r*_rhoinv[ij_type]); + const F_FLOAT forcebuck = _buck1[ij_type]*r*rexp - _buck2[ij_type]*r6inv; + if(eflag) evdwl += factor_lj*(_a[ij_type]*rexp - _c[ij_type]*r6inv - + _offset[ij_type]); + return (factor_lj*forcebuck) * r2inv; +} diff --git a/lib/cuda/pair_cg_cmm_coul_cut_cuda.cu b/lib/cuda/pair_cg_cmm_coul_cut_cuda.cu new file mode 100644 index 0000000000..1f780674c1 --- /dev/null +++ b/lib/cuda/pair_cg_cmm_coul_cut_cuda.cu @@ -0,0 +1,80 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ + +#include + +#define _lj1 MY_AP(coeff1) +#define _lj2 MY_AP(coeff2) +#define _lj3 MY_AP(coeff3) +#define _lj4 MY_AP(coeff4) +#define _cg_type MY_AP(coeff5) + + +#include "pair_cg_cmm_coul_cut_cuda_cu.h" +#include + + + + +void Cuda_PairCGCMMCoulCutCuda_Init(cuda_shared_data* sdata) +{ + Cuda_Pair_Init_AllStyles(sdata, 5, true, false ); + +} + + + + +void Cuda_PairCGCMMCoulCutCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,int eflag_atom,int vflag_atom) +{ + + // initialize only on first call + static short init=0; + if(! init) + { + init = 1; + Cuda_PairCGCMMCoulCutCuda_Init(sdata); + } + + dim3 grid,threads; + int sharedperproc; + + Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc,true,128); + + cudaStream_t* streams = (cudaStream_t*) CudaWrapper_returnStreams(); + if(sdata->pair.use_block_per_atom) + Pair_Kernel_BpA + <<>> (eflag, vflag,eflag_atom,vflag_atom); + else + Pair_Kernel_TpA + <<>> (eflag, vflag,eflag_atom,vflag_atom); + + Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag); +} + +#undef _lj1 +#undef _lj2 +#undef _lj3 +#undef _lj4 +#undef _cg_type + diff --git a/lib/cuda/pair_cg_cmm_coul_cut_cuda_cu.h b/lib/cuda/pair_cg_cmm_coul_cut_cuda_cu.h new file mode 100644 index 0000000000..00eb4c983c --- /dev/null +++ b/lib/cuda/pair_cg_cmm_coul_cut_cuda_cu.h @@ -0,0 +1,26 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. 
+ + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#include "cuda_shared.h" + +extern "C" void Cuda_PairCGCMMCoulCutCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,int eflag_atom,int vflag_atom); diff --git a/lib/cuda/pair_cg_cmm_coul_debye_cuda.cu b/lib/cuda/pair_cg_cmm_coul_debye_cuda.cu new file mode 100644 index 0000000000..ead0fc9832 --- /dev/null +++ b/lib/cuda/pair_cg_cmm_coul_debye_cuda.cu @@ -0,0 +1,80 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ + +#include + +#define _lj1 MY_AP(coeff1) +#define _lj2 MY_AP(coeff2) +#define _lj3 MY_AP(coeff3) +#define _lj4 MY_AP(coeff4) +#define _cg_type MY_AP(coeff5) + + +#include "pair_cg_cmm_coul_debye_cuda_cu.h" +#include + + + + +void Cuda_PairCGCMMCoulDebyeCuda_Init(cuda_shared_data* sdata) +{ + Cuda_Pair_Init_AllStyles(sdata, 5, true, false ); + +} + + + + +void Cuda_PairCGCMMCoulDebyeCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,int eflag_atom,int vflag_atom) +{ + + // initialize only on first call + static short init=0; + if(! init) + { + init = 1; + Cuda_PairCGCMMCoulDebyeCuda_Init(sdata); + } + + dim3 grid,threads; + int sharedperproc; + + Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc,true,128); + + cudaStream_t* streams = (cudaStream_t*) CudaWrapper_returnStreams(); + if(sdata->pair.use_block_per_atom) + Pair_Kernel_BpA + <<>> (eflag, vflag,eflag_atom,vflag_atom); + else + Pair_Kernel_TpA + <<>> (eflag, vflag,eflag_atom,vflag_atom); + + Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag); +} + +#undef _lj1 +#undef _lj2 +#undef _lj3 +#undef _lj4 +#undef _cg_type + diff --git a/lib/cuda/pair_cg_cmm_coul_debye_cuda_cu.h b/lib/cuda/pair_cg_cmm_coul_debye_cuda_cu.h new file mode 100644 index 0000000000..5b8bab44c5 --- /dev/null +++ b/lib/cuda/pair_cg_cmm_coul_debye_cuda_cu.h @@ -0,0 +1,26 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. 
+ + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#include "cuda_shared.h" + +extern "C" void Cuda_PairCGCMMCoulDebyeCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,int eflag_atom,int vflag_atom); diff --git a/lib/cuda/pair_cg_cmm_coul_long_cuda.cu b/lib/cuda/pair_cg_cmm_coul_long_cuda.cu new file mode 100644 index 0000000000..dbdc2d2a12 --- /dev/null +++ b/lib/cuda/pair_cg_cmm_coul_long_cuda.cu @@ -0,0 +1,80 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ + +#include + +#define _lj1 MY_AP(coeff1) +#define _lj2 MY_AP(coeff2) +#define _lj3 MY_AP(coeff3) +#define _lj4 MY_AP(coeff4) +#define _cg_type MY_AP(coeff5) + + +#include "pair_cg_cmm_coul_long_cuda_cu.h" +#include + + + + +void Cuda_PairCGCMMCoulLongCuda_Init(cuda_shared_data* sdata) +{ + Cuda_Pair_Init_AllStyles(sdata, 5, true, false ); + +} + + + + +void Cuda_PairCGCMMCoulLongCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,int eflag_atom,int vflag_atom) +{ + + // initialize only on first call + static short init=0; + if(! init) + { + init = 1; + Cuda_PairCGCMMCoulLongCuda_Init(sdata); + } + + dim3 grid,threads; + int sharedperproc; + + Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc,true,128); + + cudaStream_t* streams = (cudaStream_t*) CudaWrapper_returnStreams(); + if(sdata->pair.use_block_per_atom) + Pair_Kernel_BpA + <<>> (eflag, vflag,eflag_atom,vflag_atom); + else + Pair_Kernel_TpA + <<>> (eflag, vflag,eflag_atom,vflag_atom); + + Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag); +} + +#undef _lj1 +#undef _lj2 +#undef _lj3 +#undef _lj4 +#undef _cg_type + diff --git a/lib/cuda/pair_cg_cmm_coul_long_cuda_cu.h b/lib/cuda/pair_cg_cmm_coul_long_cuda_cu.h new file mode 100644 index 0000000000..bed897d5d3 --- /dev/null +++ b/lib/cuda/pair_cg_cmm_coul_long_cuda_cu.h @@ -0,0 +1,26 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. 
+ + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#include "cuda_shared.h" + +extern "C" void Cuda_PairCGCMMCoulLongCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,int eflag_atom,int vflag_atom); diff --git a/lib/cuda/pair_cg_cmm_cuda.cu b/lib/cuda/pair_cg_cmm_cuda.cu new file mode 100644 index 0000000000..b4bb31e094 --- /dev/null +++ b/lib/cuda/pair_cg_cmm_cuda.cu @@ -0,0 +1,85 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ + +#include + +#define _lj1 MY_AP(coeff1) +#define _lj2 MY_AP(coeff2) +#define _lj3 MY_AP(coeff3) +#define _lj4 MY_AP(coeff4) +#define _cg_type MY_AP(coeff5) + + enum {CG_NOT_SET=0, CG_LJ9_6, CG_LJ12_4, CG_LJ12_6, NUM_CG_TYPES, + CG_COUL_NONE, CG_COUL_CUT, CG_COUL_DEBYE, CG_COUL_LONG}; + +#include "pair_cg_cmm_cuda_cu.h" +#include "pair_cg_cmm_cuda_kernel_nc.cu" +#include + + + + +void Cuda_PairCGCMMCuda_Init(cuda_shared_data* sdata) +{ + Cuda_Pair_Init_AllStyles(sdata, 5, false, false ); + +} + + + + +void Cuda_PairCGCMMCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,int eflag_atom,int vflag_atom) +{ + + // initialize only on first call + static short init=0; + if(! init) + { + init = 1; + Cuda_PairCGCMMCuda_Init(sdata); + } + + dim3 grid,threads; + int sharedperproc; + + int maxthreads=128; + + Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc,false,maxthreads); + + cudaStream_t* streams = (cudaStream_t*) CudaWrapper_returnStreams(); + if(sdata->pair.use_block_per_atom) + Pair_Kernel_BpA + <<>> (eflag, vflag,eflag_atom,vflag_atom); + else + Pair_Kernel_TpA + <<>> (eflag, vflag,eflag_atom,vflag_atom); + + Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag); +} + +#undef _lj1 +#undef _lj2 +#undef _lj3 +#undef _lj4 +#undef _cg_type + diff --git a/lib/cuda/pair_cg_cmm_cuda_cu.h b/lib/cuda/pair_cg_cmm_cuda_cu.h new file mode 100644 index 0000000000..da6d6075f0 --- /dev/null +++ b/lib/cuda/pair_cg_cmm_cuda_cu.h @@ -0,0 +1,26 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. 
+ + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#include "cuda_shared.h" + +extern "C" void Cuda_PairCGCMMCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,int eflag_atom,int vflag_atom); diff --git a/lib/cuda/pair_cg_cmm_cuda_kernel_nc.cu b/lib/cuda/pair_cg_cmm_cuda_kernel_nc.cu new file mode 100644 index 0000000000..dcaaab7955 --- /dev/null +++ b/lib/cuda/pair_cg_cmm_cuda_kernel_nc.cu @@ -0,0 +1,48 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ + +__device__ inline F_FLOAT PairCGCMMCuda_Eval(const F_FLOAT& rsq,const int ij_type,F_FLOAT& factor_lj,int& eflag, ENERGY_FLOAT& evdwl) //0.11 of 0.4 +{ + const F_FLOAT r2inv = F_F(1.0)/rsq; + const int cg_type = _cg_type[ij_type]; + const F_FLOAT r4inv = r2inv*r2inv; + const F_FLOAT rNinv_first = cg_type!=CG_LJ9_6?r4inv:_RSQRT_(rsq); + const F_FLOAT rNinv_second = cg_type!=CG_LJ12_4?-r2inv:-F_F(1.0); + const F_FLOAT forcelj = r4inv * (_lj1[ij_type]*r4inv*rNinv_first + _lj2[ij_type]*rNinv_second); + + if(eflag) evdwl += factor_lj*(r4inv*(_lj3[ij_type]*r4inv*rNinv_first+_lj4[ij_type]*rNinv_second) - _offset[ij_type]); + return factor_lj*forcelj*r2inv; +} + +/*__device__ inline F_FLOAT PairCGCMMCuda_Eval(const F_FLOAT& rsq,const int ij_type,F_FLOAT& factor_lj,int& eflag, ENERGY_FLOAT& evdwl) +{ + const int cg_type = tex1Dfetch(_coeff5_gm_tex,ij_type); + const F_FLOAT r2inv = F_F(1.0)/rsq; + const F_FLOAT r4inv = r2inv*r2inv; + const F_FLOAT rNinv_first = cg_type!=CG_LJ9_6?r4inv:_RSQRT_(rsq); + const F_FLOAT rNinv_second = cg_type!=CG_LJ12_4?r2inv:F_F(1.0); + const F_FLOAT forcelj = r4inv * (tex1Dfetch(_coeff1_gm_tex,ij_type)*r4inv*rNinv_first - tex1Dfetch(_coeff2_gm_tex,ij_type)*rNinv_second); + + if(eflag) evdwl += factor_lj*(r4inv*(tex1Dfetch(_coeff3_gm_tex,ij_type)*r4inv*rNinv_first-tex1Dfetch(_coeff4_gm_tex,ij_type)*rNinv_second)); + return factor_lj*forcelj*r2inv; +}*/ diff --git a/lib/cuda/pair_eam_cuda.cu b/lib/cuda/pair_eam_cuda.cu new file mode 100644 index 0000000000..29ad4af271 --- /dev/null +++ b/lib/cuda/pair_eam_cuda.cu @@ -0,0 +1,330 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. 
+ + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#include + +#define _type2frho MY_AP(coeff1) +#define _type2rhor MY_AP(coeff2) +#define _type2z2r MY_AP(coeff3) +#define _rdr MY_AP(rdr) +#define _rdrho MY_AP(rdrho) +#define _nr MY_AP(nr) +#define _nrho MY_AP(nrho) +#define _nfrho MY_AP(nfrho) +#define _nrhor MY_AP(nrhor) +#define _nz2r MY_AP(nz2r) +#define _frho_spline MY_AP(frho_spline) +#define _rhor_spline MY_AP(rhor_spline) +#define _z2r_spline MY_AP(z2r_spline) +#define _rho MY_AP(rho) +#define _fp MY_AP(fp) + +__device__ __constant__ F_FLOAT MY_AP(rdr); +__device__ __constant__ F_FLOAT MY_AP(rdrho); +__device__ __constant__ int MY_AP(nr); +__device__ __constant__ int MY_AP(nrho); +__device__ __constant__ int MY_AP(nfrho); +__device__ __constant__ int MY_AP(nrhor); +__device__ __constant__ int MY_AP(nz2r); +__device__ __constant__ F_FLOAT* MY_AP(frho_spline); +__device__ __constant__ F_FLOAT* MY_AP(rhor_spline); +__device__ __constant__ F_FLOAT* MY_AP(z2r_spline); +__device__ __constant__ F_FLOAT* MY_AP(rho); +__device__ __constant__ F_FLOAT* MY_AP(fp); + +#define _rhor_spline_tex MY_AP(rhor_spline_tex) +#if F_PRECISION == 1 +texture _rhor_spline_tex; +#else +texture _rhor_spline_tex; +#endif + + +#define _z2r_spline_tex MY_AP(z2r_spline_tex) +#if F_PRECISION == 1 +texture _z2r_spline_tex; +#else +texture _z2r_spline_tex; +#endif + + + +#include "pair_eam_cuda_cu.h" +#include "pair_eam_cuda_kernel_nc.cu" +#include + +int eam_buff_offset; +int rhor_spline_size; +void* 
rhor_spline_pointer; +int z2r_spline_size; +void* z2r_spline_pointer; + + +inline void BindEAMTextures(cuda_shared_data* sdata) +{ + _rhor_spline_tex.normalized = false; // access with normalized texture coordinates + _rhor_spline_tex.filterMode = cudaFilterModePoint; // Point mode, so no + _rhor_spline_tex.addressMode[0] = cudaAddressModeWrap; // wrap texture coordinates + + const textureReference* rhor_spline_texture_ptr; + cudaGetTextureReference(&rhor_spline_texture_ptr, MY_CONST(rhor_spline_tex)); + + #if F_PRECISION == 1 + cudaChannelFormatDesc channelDescRhor = cudaCreateChannelDesc(); + cudaBindTexture(0,rhor_spline_texture_ptr, rhor_spline_pointer, &channelDescRhor, rhor_spline_size); + #else + cudaChannelFormatDesc channelDescRhor = cudaCreateChannelDesc(); + cudaBindTexture(0,rhor_spline_texture_ptr, rhor_spline_pointer, &channelDescRhor, rhor_spline_size); + #endif + + _z2r_spline_tex.normalized = false; // access with normalized texture coordinates + _z2r_spline_tex.filterMode = cudaFilterModePoint; // Point mode, so no + _z2r_spline_tex.addressMode[0] = cudaAddressModeWrap; // wrap texture coordinates + + const textureReference* z2r_spline_texture_ptr; + cudaGetTextureReference(&z2r_spline_texture_ptr, MY_CONST(z2r_spline_tex)); + + #if F_PRECISION == 1 + cudaChannelFormatDesc channelDescZ2r = cudaCreateChannelDesc(); + cudaBindTexture(0,z2r_spline_texture_ptr, z2r_spline_pointer, &channelDescZ2r, z2r_spline_size); + #else + cudaChannelFormatDesc channelDescZ2r = cudaCreateChannelDesc(); + cudaBindTexture(0,z2r_spline_texture_ptr, z2r_spline_pointer, &channelDescZ2r, z2r_spline_size); + #endif + +} + +void Cuda_PairEAMCuda_UpdateBuffer(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist) +{ + CUT_CHECK_ERROR("Cuda_PairEAMCuda: before updateBuffer failed"); + int3 layout=getgrid(sneighlist->inum,7*sizeof(F_FLOAT)); + dim3 threads(layout.z, 1, 1); + dim3 grid(layout.x, layout.y, 1); + int size=(unsigned)(layout.y*layout.x)*7*sizeof(F_FLOAT); + 
if(sdata->buffersizebuffer,sdata->buffersize);) + if(sdata->buffer!=NULL) cudaFree(sdata->buffer); + cudaMalloc((void**)&sdata->buffer,size); + sdata->buffersize=size; + sdata->buffer_new++; + MYDBG(printf("New buffer at %p with %i kB\n",sdata->buffer,sdata->buffersize);) + } + cudaMemcpyToSymbol(MY_CONST(buffer), & sdata->buffer, sizeof(int*) ); + CUT_CHECK_ERROR("Cuda_PairEAMCuda: updateBuffer failed"); +} + +void Cuda_PairEAMCuda_UpdateNmax(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist) +{ + CUT_CHECK_ERROR("Cuda_PairEAMCuda: before updateNmax failed"); + cudaMemcpyToSymbol(MY_CONST(neighbor_maxlocal) , & sneighlist->firstneigh.dim[0] , sizeof(unsigned) ); + cudaMemcpyToSymbol(MY_CONST(firstneigh), & sneighlist->firstneigh.dev_data, sizeof(int*) ); + cudaMemcpyToSymbol(MY_CONST(ilist) , & sneighlist->ilist .dev_data, sizeof(int*) ); + cudaMemcpyToSymbol(MY_CONST(inum) , & sneighlist->inum , sizeof(int) ); + cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); + cudaMemcpyToSymbol(MY_CONST(nmax) , & sdata->atom.nmax , sizeof(int) ); + cudaMemcpyToSymbol(MY_CONST(numneigh) , & sneighlist->numneigh .dev_data, sizeof(int*) ); + cudaMemcpyToSymbol(MY_CONST(neighbors) , & sneighlist->neighbors .dev_data, sizeof(int*) ); + cudaMemcpyToSymbol(MY_CONST(maxneighbors) , & sneighlist->maxneighbors , sizeof(int) ); + cudaMemcpyToSymbol(MY_CONST(x) , & sdata->atom.x .dev_data, sizeof(X_FLOAT*) ); + cudaMemcpyToSymbol(MY_CONST(x_type) , & sdata->atom.x_type .dev_data, sizeof(X_FLOAT4*) ); + cudaMemcpyToSymbol(MY_CONST(f) , & sdata->atom.f .dev_data, sizeof(F_FLOAT*) ); + cudaMemcpyToSymbol(MY_CONST(type) , & sdata->atom.type .dev_data, sizeof(int*) ); + cudaMemcpyToSymbol(MY_CONST(tag) , & sdata->atom.tag .dev_data, sizeof(int*) ); + cudaMemcpyToSymbol(MY_CONST(eatom) , & sdata->atom.eatom .dev_data, sizeof(ENERGY_FLOAT*) ); + cudaMemcpyToSymbol(MY_CONST(vatom) , & sdata->atom.vatom .dev_data, sizeof(ENERGY_FLOAT*) ); + 
CUT_CHECK_ERROR("Cuda_PairEAMCuda: updateNmax failed"); +} + + +void Cuda_PairEAMCuda_Init(cuda_shared_data* sdata,double rdr,double rdrho,int nfrho, int nrhor,int nr, int nrho,int nz2r, +void* frho_spline,void* rhor_spline,void* z2r_spline,void* rho,void* fp, +int* type2frho,int** type2z2r,int** type2rhor) +{ + // !! LAMMPS indexes atom types starting with 1 !! + + unsigned cuda_ntypes = sdata->atom.ntypes + 1; + if(cuda_ntypes*cuda_ntypes > CUDA_MAX_TYPES2) + printf("# CUDA: Cuda_PairEAMCuda_Init: you need %u types. this is more than %u " + "(assumed at compile time). re-compile with -DCUDA_MAX_TYPES_PLUS_ONE=99 " + "or ajust this in cuda_common.h\n", cuda_ntypes, CUDA_MAX_TYPES2); + unsigned nI = sizeof(F_FLOAT) * cuda_ntypes * cuda_ntypes; + + X_FLOAT cutsq_global; + cutsq_global = (X_FLOAT) (sdata->pair.cut_global); + cudaMemcpyToSymbol(MY_CONST(cutsq_global) ,&cutsq_global , sizeof(X_FLOAT) ); + + + F_FLOAT* coeff_buf=new F_FLOAT[cuda_ntypes*cuda_ntypes]; + for(int i=0;idomain.subhi[0] - sdata->domain.sublo[0], + sdata->domain.subhi[1] - sdata->domain.sublo[1], + sdata->domain.subhi[2] - sdata->domain.sublo[2] + }; + F_FLOAT rdr_F=rdr; + F_FLOAT rdrho_F=rdrho; + cudaMemcpyToSymbol(MY_CONST(box_size) , box_size , sizeof(X_FLOAT)*3); + cudaMemcpyToSymbol(MY_CONST(cuda_ntypes), & cuda_ntypes , sizeof(unsigned) ); + cudaMemcpyToSymbol(MY_CONST(virial) , &sdata->pair.virial.dev_data , sizeof(ENERGY_FLOAT*) ); + cudaMemcpyToSymbol(MY_CONST(eng_vdwl) , &sdata->pair.eng_vdwl.dev_data , sizeof(ENERGY_FLOAT*) ); + cudaMemcpyToSymbol(MY_CONST(periodicity), sdata->domain.periodicity, sizeof(int)*3 ); + cudaMemcpyToSymbol(MY_CONST(collect_forces_later), &sdata->pair.collect_forces_later , sizeof(int) ); + cudaMemcpyToSymbol(MY_CONST(rdr), &rdr_F, sizeof(F_FLOAT) ); + cudaMemcpyToSymbol(MY_CONST(rdrho), &rdrho_F, sizeof(F_FLOAT) ); + cudaMemcpyToSymbol(MY_CONST(nr), &nr, sizeof(int) ); + cudaMemcpyToSymbol(MY_CONST(nrho), &nrho, sizeof(int) ); + 
cudaMemcpyToSymbol(MY_CONST(nfrho), &nfrho, sizeof(int) ); + cudaMemcpyToSymbol(MY_CONST(nrhor), &nrhor, sizeof(int) ); + cudaMemcpyToSymbol(MY_CONST(rho), &rho, sizeof(F_FLOAT*) ); + cudaMemcpyToSymbol(MY_CONST(fp), &fp, sizeof(F_FLOAT*) ); + cudaMemcpyToSymbol(MY_CONST(frho_spline), &frho_spline, sizeof(F_FLOAT*) ); + cudaMemcpyToSymbol(MY_CONST(rhor_spline), &rhor_spline, sizeof(F_FLOAT*) ); + cudaMemcpyToSymbol(MY_CONST(z2r_spline), &z2r_spline, sizeof(F_FLOAT*) ); + cudaMemcpyToSymbol(MY_CONST(nrhor), &nrhor, sizeof(int) ); + + rhor_spline_size = nrhor*(nr+1)*EAM_COEFF_LENGTH*sizeof(F_FLOAT); + z2r_spline_size = nz2r*(nr+1)*EAM_COEFF_LENGTH*sizeof(F_FLOAT); + rhor_spline_pointer = rhor_spline; + z2r_spline_pointer = z2r_spline; + + CUT_CHECK_ERROR("Cuda_PairEAMCuda: init failed"); + +} + + + +void Cuda_PairEAM1Cuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom) +{ + + if(sdata->atom.update_nmax) + Cuda_PairEAMCuda_UpdateNmax(sdata,sneighlist); + if(sdata->atom.update_nlocal) + cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); + if(sdata->buffer_new) + Cuda_PairEAMCuda_UpdateBuffer(sdata,sneighlist); + cudaMemcpyToSymbol(MY_CONST(eatom) , & sdata->atom.eatom .dev_data, sizeof(ENERGY_FLOAT*) ); + cudaMemcpyToSymbol(MY_CONST(vatom) , & sdata->atom.vatom .dev_data, sizeof(ENERGY_FLOAT*) ); + + int sharedperproc=0; + if(eflag||eflag_atom) sharedperproc=1; + if(vflag||vflag_atom) sharedperproc=7; + + int3 layout=getgrid(sneighlist->inum,sharedperproc*sizeof(ENERGY_FLOAT)); + dim3 threads(layout.z, 1, 1); + dim3 grid(layout.x, layout.y, 1); + + eam_buff_offset=grid.x*grid.y; + + BindXTypeTexture(sdata); + BindEAMTextures( sdata);// initialize only on first call + + + MYDBG( printf("# CUDA: Cuda_PairEAMCuda: kernel start eflag: %i vflag: %i\n",eflag,vflag); ) + CUT_CHECK_ERROR("Cuda_PairEAMCuda: pre pair Kernel 1 problems before kernel invocation"); + PairEAMCuda_Kernel1<<>> 
(eflag, vflag,eflag_atom,vflag_atom); + cudaThreadSynchronize(); + CUT_CHECK_ERROR("Cuda_PairEAMCuda: pair Kernel 1 execution failed"); + + + + MYDBG( printf("# CUDA: Cuda_PairEAMCoulLongCuda: kernel done\n"); ) + +} + +void Cuda_PairEAM2Cuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom) +{ + int sharedperproc=0; + if(eflag||eflag_atom) sharedperproc=1; + if(vflag||vflag_atom) sharedperproc=7; + int3 layout=getgrid(sneighlist->inum,sharedperproc*sizeof(ENERGY_FLOAT)); + dim3 threads(layout.z, 1, 1); + dim3 grid(layout.x, layout.y, 1); + + BindXTypeTexture(sdata); + BindEAMTextures( sdata);// initialize only on first call + // initialize only on first call + sdata->pair.lastgridsize=grid.x*grid.y; + sdata->pair.n_energy_virial=sharedperproc; + + MYDBG( printf("# CUDA: Cuda_PairEAMCuda: kernel start eflag: %i vflag: %i\n",eflag,vflag); ) + CUT_CHECK_ERROR("Cuda_PairEAMCuda: pre pair Kernel 2 problems before kernel invocation"); + PairEAMCuda_Kernel2<<>> (eflag, vflag,eflag_atom,vflag_atom); + CUT_CHECK_ERROR("Cuda_PairEAMCuda: pair Kernel 2 start failed"); + cudaThreadSynchronize(); + CUT_CHECK_ERROR("Cuda_PairEAMCuda: pair Kernel 2 execution failed"); + + if(eflag||vflag) + { + int n=grid.x*grid.y; + grid.x=sharedperproc; + grid.y=1; + threads.x=256; + MY_AP(PairVirialCompute_reduce)<<>>(n); + cudaThreadSynchronize(); + CUT_CHECK_ERROR("Cuda_PairEAMCuda: virial compute Kernel execution failed"); + } + + MYDBG( printf("# CUDA: Cuda_PairEAMCoulLongCuda: kernel done\n"); ) + +} + +void Cuda_PairEAMCuda_PackComm(cuda_shared_data* sdata,int n,int iswap,void* buf_send) +{ + int3 layout=getgrid(n,0); + dim3 threads(layout.z, 1, 1); + dim3 grid(layout.x, layout.y, 1); + F_FLOAT* buf=(F_FLOAT*) (& ((double*)sdata->buffer)[eam_buff_offset]); + + PairEAMCuda_PackComm_Kernel<<>> ((int*) sdata->comm.sendlist.dev_data,n + ,sdata->comm.maxlistlength,iswap,buf); + cudaThreadSynchronize(); + cudaMemcpy(buf_send, 
buf, n*sizeof(F_FLOAT), cudaMemcpyDeviceToHost); + cudaThreadSynchronize(); +} + +void Cuda_PairEAMCuda_UnpackComm(cuda_shared_data* sdata,int n,int first,void* buf_recv,void* fp) +{ + F_FLOAT* fp_first = &(((F_FLOAT*) fp)[first]); + cudaMemcpy(fp_first,buf_recv, n*sizeof(F_FLOAT), cudaMemcpyHostToDevice); +} + +#undef _type2frho +#undef _type2rhor +#undef _type2z2r + + +/* ---------------------------------------------------------------------- + tally eng_vdwl and virial into global and per-atom accumulators + need i < nlocal test since called by bond_quartic and dihedral_charmm +------------------------------------------------------------------------- */ + diff --git a/lib/cuda/pair_eam_cuda_cu.h b/lib/cuda/pair_eam_cuda_cu.h new file mode 100644 index 0000000000..dee4a036e2 --- /dev/null +++ b/lib/cuda/pair_eam_cuda_cu.h @@ -0,0 +1,33 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ + +#include "cuda_shared.h" +extern "C" void Cuda_PairEAMCuda_Init(cuda_shared_data* sdata,double rdr,double rdrho,int nfrho, int nrhor,int nr, int nrho,int nz2r, +void* frho_spline,void* rhor_spline,void* z2r_spline,void* rho,void* fp, +int* type2frho,int** type2z2r,int** type2rhor); +extern "C" void Cuda_PairEAM1Cuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,int eflag_atom,int vflag_atom); +extern "C" void Cuda_PairEAM2Cuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,int eflag_atom,int vflag_atom); +extern "C" void Cuda_PairEAMCuda_PackComm(cuda_shared_data* sdata,int n,int iswap,void* buf_send); +extern "C" void Cuda_PairEAMCuda_UnpackComm(cuda_shared_data* sdata,int n,int first,void* buf_recv,void* fp); + +#define EAM_COEFF_LENGTH 8 diff --git a/lib/cuda/pair_eam_cuda_kernel_nc.cu b/lib/cuda/pair_eam_cuda_kernel_nc.cu new file mode 100644 index 0000000000..a3dc30f397 --- /dev/null +++ b/lib/cuda/pair_eam_cuda_kernel_nc.cu @@ -0,0 +1,340 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ + + + + +static __device__ inline F_FLOAT4 fetchRhor(int i) +{ + #ifdef CUDA_USE_TEXTURE + #if F_PRECISION == 1 + return tex1Dfetch(_rhor_spline_tex,i); + #else + return tex1Dfetch_double_f(_rhor_spline_tex,i); + #endif + #else + return _rhor_spline[i]; + #endif +} + +static __device__ inline F_FLOAT4 fetchZ2r(int i) +{ + #ifdef CUDA_USE_TEXTURE + #if F_PRECISION == 1 + return tex1Dfetch(_z2r_spline_tex,i); + #else + return tex1Dfetch_double_f(_z2r_spline_tex,i); + #endif + #else + return _z2r_spline[i]; + #endif +} + +__global__ void PairEAMCuda_Kernel1(int eflag, int vflag,int eflag_atom,int vflag_atom) +{ + ENERGY_FLOAT* sharedE; + ENERGY_FLOAT* sharedV = &sharedmem[threadIdx.x]; + + + if(eflag||eflag_atom) + { + sharedE = &sharedmem[threadIdx.x]; + sharedE[0] = ENERGY_F(0.0); + sharedV += blockDim.x; + } + if(vflag||vflag_atom) + { + sharedV[0*blockDim.x] = ENERGY_F(0.0); + sharedV[1*blockDim.x] = ENERGY_F(0.0); + sharedV[2*blockDim.x] = ENERGY_F(0.0); + sharedV[3*blockDim.x] = ENERGY_F(0.0); + sharedV[4*blockDim.x] = ENERGY_F(0.0); + sharedV[5*blockDim.x] = ENERGY_F(0.0); + } + + int ii = (blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x; + + X_FLOAT xtmp,ytmp,ztmp; + X_FLOAT4 myxtype; + F_FLOAT delx,dely,delz; + int itype; + int i=_nlocal; + int jnum=0; + int* jlist; + + if(ii < _inum) + { + i = _ilist[ii]; + + myxtype=fetchXType(i); + xtmp=myxtype.x; + ytmp=myxtype.y; + ztmp=myxtype.z; + itype=static_cast (myxtype.w); + + jnum = _numneigh[i]; + + jlist = &_neighbors[i]; + if(i<_nlocal) + _rho[i]=F_F(0.0); + } + __syncthreads(); + + for (int jj = 0; jj < jnum; jj++) + { + if(ii < _inum) + if(jj (myxtype.w); + const F_FLOAT rsq = delx*delx + dely*dely + delz*delz; + if (rsq < _cutsq_global) + { + F_FLOAT p = sqrt(rsq)*_rdr + F_F(1.0); + int m = static_cast (p); + m = MIN(m,_nr-1); + p -= m; + p = MIN(p,F_F(1.0)); + + int k=(static_cast 
(_type2rhor[jtype*_cuda_ntypes+itype])*(_nr+1)+m)*2; + F_FLOAT4 c=fetchRhor(k+1); + _rho[i] += ((c.w*p+c.x)*p+c.y)*p+c.z; + } + } + } + + if(ii < _inum) + { + + F_FLOAT p = _rho[i]*_rdrho + F_F(1.0); + int m = static_cast (p); + m = MAX(1,MIN(m,_nrho-1)); + p -= m; + p = MIN(p,F_F(1.0)); + F_FLOAT* coeff = &_frho_spline[(static_cast (_type2frho[itype])*(_nrho+1)+m)*EAM_COEFF_LENGTH]; + _fp[i] = (coeff[0]*p + coeff[1])*p + coeff[2]; + if (eflag||eflag_atom) { + sharedmem[threadIdx.x] += ((coeff[3]*p + coeff[4])*p + coeff[5])*p + coeff[6]; + } + + } + __syncthreads(); + if(eflag||eflag_atom) + { + if(i<_nlocal&&eflag_atom) + _eatom[i]+=sharedmem[threadIdx.x]; + reduceBlock(sharedmem); + ENERGY_FLOAT* buffer = (ENERGY_FLOAT*) _buffer; + buffer[blockIdx.x * gridDim.y + blockIdx.y] = ENERGY_F(2.0)*sharedmem[0]; + } +} + +__global__ void PairEAMCuda_Kernel2(int eflag, int vflag,int eflag_atom,int vflag_atom) +{ + ENERGY_FLOAT evdwl = ENERGY_F(0.0); + + ENERGY_FLOAT* sharedE; + ENERGY_FLOAT* sharedV = &sharedmem[threadIdx.x]; + + + if(eflag||eflag_atom) + { + sharedE = &sharedmem[threadIdx.x]; + sharedE[0] = ENERGY_F(0.0); + sharedV += blockDim.x; + } + if(vflag||vflag_atom) + { + sharedV[0*blockDim.x] = ENERGY_F(0.0); + sharedV[1*blockDim.x] = ENERGY_F(0.0); + sharedV[2*blockDim.x] = ENERGY_F(0.0); + sharedV[3*blockDim.x] = ENERGY_F(0.0); + sharedV[4*blockDim.x] = ENERGY_F(0.0); + sharedV[5*blockDim.x] = ENERGY_F(0.0); + } + + int ii = (blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x; + + X_FLOAT xtmp,ytmp,ztmp; + X_FLOAT4 myxtype; + F_FLOAT fxtmp,fytmp,fztmp,fpair; + F_FLOAT delx,dely,delz; + int itype,i; + int jnum=0; + int* jlist; + + if(ii < _inum) + { + i = _ilist[ii]; + + myxtype=fetchXType(i); + xtmp=myxtype.x; + ytmp=myxtype.y; + ztmp=myxtype.z; + itype=static_cast (myxtype.w); + fxtmp = F_F(0.0); + fytmp = F_F(0.0); + fztmp = F_F(0.0); + + jnum = _numneigh[i]; + + jlist = &_neighbors[i]; + if(i<_nlocal) + _rho[i]=F_F(0.0); + } + if(ii (myxtype.w); + 
const F_FLOAT rsq = delx*delx + dely*dely + delz*delz; + + if (rsq < _cutsq_global) + { + F_FLOAT r = _SQRT_(rsq); + F_FLOAT p = r*_rdr + F_F(1.0); + int m = static_cast (p); + m = MIN(m,_nr-1); + p -= m; + p = MIN(p,F_F(1.0)); + + int k=(static_cast (_type2rhor[itype*_cuda_ntypes+jtype])*(_nr+1)+m)*2; + F_FLOAT4 c=fetchRhor(k); + F_FLOAT rhoip = (c.x*p + c.y)*p + c.z; + k=(static_cast (_type2rhor[jtype*_cuda_ntypes+itype])*(_nr+1)+m)*2; + c=fetchRhor(k); + F_FLOAT rhojp = (c.x*p + c.y)*p + c.z; + k=(static_cast (_type2z2r[itype*_cuda_ntypes+jtype])*(_nr+1)+m)*2; + c=fetchZ2r(k); + F_FLOAT z2p = (c.x*p + c.y)*p + c.z; + c=fetchZ2r(k+1); + F_FLOAT z2 = ((c.w*p + c.x)*p + c.y)*p+c.z; + + F_FLOAT recip = F_F(1.0)/r; + F_FLOAT phi = z2*recip; + F_FLOAT phip = z2p*recip - phi*recip; + F_FLOAT psip = _fp[i]*rhojp + _fp[j]*rhoip + phip; + fpair = -psip*recip; + + F_FLOAT dxfp,dyfp,dzfp; + fxtmp += dxfp = delx*fpair; + fytmp += dyfp = dely*fpair; + fztmp += dzfp = delz*fpair; + evdwl+=phi; + if(vflag||vflag_atom) + { + sharedV[0 * blockDim.x]+= delx*dxfp; + sharedV[1 * blockDim.x]+= dely*dyfp; + sharedV[2 * blockDim.x]+= delz*dzfp; + sharedV[3 * blockDim.x]+= delx*dyfp; + sharedV[4 * blockDim.x]+= delx*dzfp; + sharedV[5 * blockDim.x]+= dely*dzfp; + } + } + } + } + + __syncthreads(); + if(ii < _inum) + { + F_FLOAT* my_f; + if(_collect_forces_later) + { + ENERGY_FLOAT* buffer = (ENERGY_FLOAT*) _buffer; + if(eflag) + { + buffer=&buffer[1 * gridDim.x * gridDim.y]; + } + if(vflag) + { + buffer=&buffer[6 * gridDim.x * gridDim.y]; + } + my_f = (F_FLOAT*) buffer; + my_f += i; + *my_f = fxtmp; my_f += _nmax; + *my_f = fytmp; my_f += _nmax; + *my_f = fztmp; + } + else + { + my_f = _f + i; + *my_f += fxtmp; my_f += _nmax; + *my_f += fytmp; my_f += _nmax; + *my_f += fztmp; + } + } + __syncthreads(); + + if(eflag) + { + sharedE[0] = evdwl; + } + if(eflag_atom && i<_nlocal) + { + _eatom[i] += evdwl; + } + + if(vflag_atom && i<_nlocal) + { + _vatom[i] += ENERGY_F(0.5) * sharedV[0 * 
blockDim.x]; + _vatom[i+_nmax] += ENERGY_F(0.5) * sharedV[1 * blockDim.x]; + _vatom[i+2*_nmax] += ENERGY_F(0.5) * sharedV[2 * blockDim.x]; + _vatom[i+3*_nmax] += ENERGY_F(0.5) * sharedV[3 * blockDim.x]; + _vatom[i+4*_nmax] += ENERGY_F(0.5) * sharedV[4 * blockDim.x]; + _vatom[i+5*_nmax] += ENERGY_F(0.5) * sharedV[5 * blockDim.x]; + } + if(vflag||eflag) PairVirialCompute_A_Kernel(eflag,vflag,0); +} + +__global__ void PairEAMCuda_PackComm_Kernel(int* sendlist,int n,int maxlistlength,int iswap,F_FLOAT* buffer) +{ + int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x; + int* list=sendlist+iswap*maxlistlength; + if(i + +#define _kn MY_AP(coeff1) //[0] +#define _kt MY_AP(coeff1) //[1] +#define _gamman MY_AP(coeff1) //[2] +#define _gammat MY_AP(coeff3) //[0] +#define _xmu MY_AP(coeff2) //[0] +#define _dampflag MY_AP(coeff2) //[1] + +#include "pair_gran_hooke_cuda_cu.h" +#include "pair_gran_hooke_cuda_kernel_nc.cu" +#include + +void Cuda_PairGranHookeCuda_UpdateBuffer(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist) +{ + CUT_CHECK_ERROR("Cuda_PairGranHookeCuda: before updateBuffer failed"); + int3 layout=getgrid(sneighlist->inum,7*sizeof(ENERGY_FLOAT)); + dim3 threads(layout.z, 1, 1); + dim3 grid(layout.x, layout.y, 1); + int size=(unsigned)(layout.y*layout.x)*7*sizeof(ENERGY_FLOAT); + if(sdata->buffersizebuffer,sdata->buffersize);) + if(sdata->buffer!=NULL) cudaFree(sdata->buffer); + cudaMalloc((void**)&sdata->buffer,size); + sdata->buffersize=size; + sdata->buffer_new++; + MYDBG(printf("New buffer at %p with %i kB\n",sdata->buffer,sdata->buffersize);) + } + cudaMemcpyToSymbol(MY_CONST(buffer), & sdata->buffer, sizeof(int*) ); + CUT_CHECK_ERROR("Cuda_PairGranHookeCuda: updateBuffer failed"); +} + +void Cuda_PairGranHookeCuda_UpdateNmax(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist) +{ + CUT_CHECK_ERROR("Cuda_PairGranHookeCuda: before updateNmax failed"); + cudaMemcpyToSymbol(MY_CONST(neighbor_maxlocal) , & sneighlist->firstneigh.dim[0] , 
sizeof(unsigned) ); + //cudaMemcpyToSymbol(MY_CONST(firstneigh), & sneighlist->firstneigh.dev_data, sizeof(int*) ); + cudaMemcpyToSymbol(MY_CONST(ilist) , & sneighlist->ilist .dev_data, sizeof(int*) ); + cudaMemcpyToSymbol(MY_CONST(inum) , & sneighlist->inum , sizeof(int) ); + cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); + cudaMemcpyToSymbol(MY_CONST(nall) , & sdata->atom.nall , sizeof(int) ); + cudaMemcpyToSymbol(MY_CONST(nmax) , & sdata->atom.nmax , sizeof(int) ); + cudaMemcpyToSymbol(MY_CONST(numneigh) , & sneighlist->numneigh .dev_data, sizeof(int*) ); + cudaMemcpyToSymbol(MY_CONST(neighbors) , & sneighlist->neighbors .dev_data, sizeof(int*) ); + cudaMemcpyToSymbol(MY_CONST(type) , & sdata->atom.type .dev_data, sizeof(int*) ); + cudaMemcpyToSymbol(MY_CONST(tag) , & sdata->atom.tag .dev_data, sizeof(int*) ); + cudaMemcpyToSymbol(MY_CONST(mask) , & sdata->atom.mask .dev_data, sizeof(int*) ); + cudaMemcpyToSymbol(MY_CONST(f) , & sdata->atom.f .dev_data, sizeof(F_FLOAT*) ); + cudaMemcpyToSymbol(MY_CONST(x) , & sdata->atom.x .dev_data, sizeof(X_FLOAT*) ); + cudaMemcpyToSymbol(MY_CONST(x_type) , & sdata->atom.x_type .dev_data, sizeof(X_FLOAT4*) ); + cudaMemcpyToSymbol(MY_CONST(v_radius) , & sdata->atom.v_radius .dev_data, sizeof(V_FLOAT4*) ); + cudaMemcpyToSymbol(MY_CONST(omega_rmass),& sdata->atom.omega_rmass.dev_data,sizeof(V_FLOAT4*) ); + cudaMemcpyToSymbol(MY_CONST(torque) , & sdata->atom.torque .dev_data, sizeof(F_FLOAT*) ); + cudaMemcpyToSymbol(MY_CONST(maxneighbors),&sneighlist->maxneighbors , sizeof(int) ); + cudaMemcpyToSymbol(MY_CONST(eatom) , & sdata->atom.eatom .dev_data, sizeof(ENERGY_FLOAT*) ); + cudaMemcpyToSymbol(MY_CONST(vatom) , & sdata->atom.vatom .dev_data, sizeof(ENERGY_FLOAT*) ); + cudaMemcpyToSymbol(MY_CONST(debugdata) , & sdata->debugdata , sizeof(int*) ); + cudaMemcpyToSymbol(MY_CONST(freeze_group_bit) , & sdata->pair.freeze_group_bit, sizeof(int) ); + + + CUT_CHECK_ERROR("Cuda_PairGranHookeCuda: updateNmax 
failed"); +} + + +void Cuda_PairGranHookeCuda_Init(cuda_shared_data* sdata) +{ + // !! LAMMPS indexes atom types starting with 1 !! + + unsigned cuda_ntypes = sdata->atom.ntypes + 2; + if(cuda_ntypes*cuda_ntypes > CUDA_MAX_TYPES2) + printf("# CUDA: Cuda_PairGranHookeCuda_Init: you need %u types. this is more than %u " + "(assumed at compile time). re-compile with -DCUDA_MAX_TYPES_PLUS_ONE=32 " + "or ajust this in cuda_common.h\n", cuda_ntypes, CUDA_MAX_TYPES_PLUS_ONE-1); + unsigned cuda_ntypes2 = cuda_ntypes * cuda_ntypes; + unsigned n = sizeof(F_FLOAT) * cuda_ntypes2; + + F_FLOAT coeffs1[cuda_ntypes2]; + coeffs1[0]= (F_FLOAT) sdata->pair.coeff1[0][0]; + coeffs1[1]= (F_FLOAT) sdata->pair.coeff1[0][1]; + coeffs1[2]= (F_FLOAT) sdata->pair.coeff1[1][0]; + F_FLOAT coeffs3[cuda_ntypes2]; + coeffs3[0]= (F_FLOAT) sdata->pair.coeff1[1][1]; + F_FLOAT coeffs2[cuda_ntypes2]; + coeffs2[0]= (F_FLOAT) sdata->pair.coeff2[0][0]; + coeffs2[1]= (F_FLOAT) sdata->pair.coeff2[0][1]; + + + X_FLOAT box_size[3] = + { + sdata->domain.subhi[0] - sdata->domain.sublo[0], + sdata->domain.subhi[1] - sdata->domain.sublo[1], + sdata->domain.subhi[2] - sdata->domain.sublo[2] + }; + //printf("n: %i %i\n",n,CUDA_MAX_TYPES2); + cudaMemcpyToSymbol(MY_CONST(box_size) , box_size , sizeof(X_FLOAT)*3); + cudaMemcpyToSymbol(MY_CONST(cuda_ntypes), & cuda_ntypes , sizeof(unsigned) ); + cudaMemcpyToSymbol(MY_CONST(coeff1) , coeffs1 , n ); + cudaMemcpyToSymbol(MY_CONST(coeff2) , coeffs2 , n ); + cudaMemcpyToSymbol(MY_CONST(coeff3) , coeffs3 , n ); + cudaMemcpyToSymbol(MY_CONST(virial) , &sdata->pair.virial.dev_data , sizeof(ENERGY_FLOAT*) ); + cudaMemcpyToSymbol(MY_CONST(eng_vdwl) , &sdata->pair.eng_vdwl.dev_data , sizeof(ENERGY_FLOAT*) ); + cudaMemcpyToSymbol(MY_CONST(periodicity), sdata->domain.periodicity, sizeof(int)*3 ); + CUT_CHECK_ERROR("Cuda_PairGranHookeCuda: init failed"); +} + + + +void Cuda_PairGranHookeCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,int 
eflag_atom,int vflag_atom) +{ + + //if(sdata->atom.update_nmax) + Cuda_PairGranHookeCuda_UpdateNmax(sdata,sneighlist); + //if(sdata->atom.update_nlocal) + { + cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); + cudaMemcpyToSymbol(MY_CONST(nall) , & sdata->atom.nall , sizeof(int) ); + } + //if(sdata->buffer_new) + Cuda_PairGranHookeCuda_UpdateBuffer(sdata,sneighlist); + + BindXTypeTexture(sdata); + BindVRadiusTexture(sdata); + BindOmegaRmassTexture(sdata); + + int sharedperproc=0; + if(eflag) sharedperproc+=1; + if(vflag) sharedperproc+=6; + + int3 layout=getgrid(sneighlist->inum,sharedperproc*sizeof(ENERGY_FLOAT),128); + dim3 threads(layout.z, 1, 1); + dim3 grid(layout.x, layout.y, 1); + + // initialize only on first call + static short init=0; + if(! init) + { + init = 1; + Cuda_PairGranHookeCuda_Init(sdata); + } + + MYDBG( printf("# CUDA: Cuda_PairGranHookeCuda: kernel start eflag: %i vflag: %i config: %i %i %i %i\n",eflag,vflag,grid.x,grid.y, threads.x,sharedperproc*sizeof(ENERGY_FLOAT)*threads.x); ) + + CUT_CHECK_ERROR("Cuda_PairGranHookeCuda: (no binning) pre pair lj cut Kernel problems before kernel invocation"); + PairGranHookeCuda_Kernel<<>> (eflag, vflag,eflag_atom,vflag_atom,(int**)sneighlist->firstneigh.dev_data,sneighlist->binned_id + ,(F_FLOAT) sdata->pair.coeff1[0][0],(F_FLOAT) sdata->pair.coeff1[1][0],(F_FLOAT) sdata->pair.coeff1[1][1],(F_FLOAT) sdata->pair.coeff2[0][0]); + cudaThreadSynchronize(); + CUT_CHECK_ERROR("Cuda_PairGranHookeCuda: (no binning) pair lj cut Kernel execution failed"); + + if(eflag||vflag) + { + int n=grid.x*grid.y; + grid.x=sharedperproc; + grid.y=1; + threads.x=256; + MY_AP(PairVirialCompute_reduce)<<>>(n); + cudaThreadSynchronize(); + CUT_CHECK_ERROR("Cuda_PairGranHookeCuda: (no binning) virial compute Kernel execution failed"); + } + + MYDBG( printf("# CUDA: Cuda_PairGranHookeCoulLongCuda: kernel done\n"); ) + +} + + +#undef _kn +#undef _kt +#undef _gamman +#undef _gammat +#undef _xmu +#undef 
_dampflag + + diff --git a/lib/cuda/pair_gran_hooke_cuda_cu.h b/lib/cuda/pair_gran_hooke_cuda_cu.h new file mode 100644 index 0000000000..03cbd36519 --- /dev/null +++ b/lib/cuda/pair_gran_hooke_cuda_cu.h @@ -0,0 +1,26 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#include "cuda_shared.h" + +extern "C" void Cuda_PairGranHookeCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,int eflag_atom,int vflag_atom); diff --git a/lib/cuda/pair_gran_hooke_cuda_kernel_nc.cu b/lib/cuda/pair_gran_hooke_cuda_kernel_nc.cu new file mode 100644 index 0000000000..f063def443 --- /dev/null +++ b/lib/cuda/pair_gran_hooke_cuda_kernel_nc.cu @@ -0,0 +1,219 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. 
+ + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + + +__global__ void PairGranHookeCuda_Kernel(int eflag, int vflag,int eflag_atom,int vflag_atom,int** firstneight,int* binned_id +,F_FLOAT kn,F_FLOAT gamman,F_FLOAT gammat, F_FLOAT xmu) +{ + ENERGY_FLOAT evdwl = ENERGY_F(0.0); + + ENERGY_FLOAT* sharedE; + ENERGY_FLOAT* sharedV; + + if(eflag||eflag_atom) + { + sharedE = &sharedmem[threadIdx.x]; + sharedV = &sharedmem[0]; + sharedE[0] = ENERGY_F(0.0); sharedV+=blockDim.x; + } + if(vflag||vflag_atom) + { + sharedV += threadIdx.x; + sharedV[0*blockDim.x] = ENERGY_F(0.0); + sharedV[1*blockDim.x] = ENERGY_F(0.0); + sharedV[2*blockDim.x] = ENERGY_F(0.0); + sharedV[3*blockDim.x] = ENERGY_F(0.0); + sharedV[4*blockDim.x] = ENERGY_F(0.0); + sharedV[5*blockDim.x] = ENERGY_F(0.0); + } + + int ii = (blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x; + MYEMUDBG( if(ii==0) printf("# CUDA: PairGranHookeCuda_Kernel: -- no binning --\n"); ) + + X_FLOAT xtmp,ytmp,ztmp; + X_FLOAT4 myxtype; + V_FLOAT4 myvradius, ovradius; + F_FLOAT fxtmp,fytmp,fztmp,torquextmp,torqueytmp,torqueztmp; + F_FLOAT delx,dely,delz; + F_FLOAT radi,radj,radsum,r,rsqinv; + F_FLOAT vr1,vr2,vr3,vnnr,vn1,vn2,vn3,vt1,vt2,vt3; + F_FLOAT wr1,wr2,wr3; + F_FLOAT vtr1,vtr2,vtr3,vrel; + F_FLOAT meff,damp,ccel,tor1,tor2,tor3; + F_FLOAT fn,fs,ft,fs1,fs2,fs3; + + int jnum =0; + int i,j; + int* jlist; + + if(ii < _inum) + { + i = _ilist[ii]; + + myxtype = fetchXType(i); + myvradius = fetchVRadius(i); + + xtmp=myxtype.x; + 
ytmp=myxtype.y; + ztmp=myxtype.z; + radi = myvradius.w; + + fxtmp = F_F(0.0); + fytmp = F_F(0.0); + fztmp = F_F(0.0); + torquextmp = F_F(0.0); + torqueytmp = F_F(0.0); + torqueztmp = F_F(0.0); + + jnum = _numneigh[i]; + + jlist = &_neighbors[i]; + } + __syncthreads(); + + for (int jj = 0; jj < jnum; jj++) + { + if(ii < _inum) + if(jj + +#define _lj1 MY_AP(coeff1) +#define _lj2 MY_AP(coeff2) +#define _lj3 MY_AP(coeff3) +#define _lj4 MY_AP(coeff4) + +#include "pair_lj96_cut_cuda_cu.h" +#include "pair_lj96_cut_cuda_kernel_nc.cu" +#include + + + + +void Cuda_PairLJ96CutCuda_Init(cuda_shared_data* sdata) +{ + Cuda_Pair_Init_AllStyles(sdata, 4, false, false ); +} + + + + +void Cuda_PairLJ96CutCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,int eflag_atom,int vflag_atom) +{ + + // initialize only on first call + static short init=0; + if(! init) + { + init = 1; + Cuda_PairLJ96CutCuda_Init(sdata); + } + + dim3 grid,threads; + int sharedperproc; + + Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc,false,256); + + cudaStream_t* streams = (cudaStream_t*) CudaWrapper_returnStreams(); + if(sdata->pair.use_block_per_atom) + Pair_Kernel_BpA + <<>> (eflag, vflag,eflag_atom,vflag_atom); + else + Pair_Kernel_TpA + <<>> (eflag, vflag,eflag_atom,vflag_atom); + + Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag); +} + +#undef _lj1 +#undef _lj2 +#undef _lj3 +#undef _lj4 + + diff --git a/lib/cuda/pair_lj96_cut_cuda_cu.h b/lib/cuda/pair_lj96_cut_cuda_cu.h new file mode 100644 index 0000000000..24763103a7 --- /dev/null +++ b/lib/cuda/pair_lj96_cut_cuda_cu.h @@ -0,0 +1,26 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. 
+ + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#include "cuda_shared.h" + +extern "C" void Cuda_PairLJ96CutCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,int eflag_atom,int vflag_atom); diff --git a/lib/cuda/pair_lj96_cut_cuda_kernel_nc.cu b/lib/cuda/pair_lj96_cut_cuda_kernel_nc.cu new file mode 100644 index 0000000000..28ccb839ba --- /dev/null +++ b/lib/cuda/pair_lj96_cut_cuda_kernel_nc.cu @@ -0,0 +1,33 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ + +__device__ inline F_FLOAT PairLJ96CutCuda_Eval(const F_FLOAT& rsq,const int ij_type,F_FLOAT& factor_lj,int& eflag, ENERGY_FLOAT& evdwl) +{ + const F_FLOAT r2inv = F_F(1.0)/rsq; + const F_FLOAT r6inv = r2inv*r2inv*r2inv; + const F_FLOAT r3inv = _SQRT_(r6inv); + const F_FLOAT forcelj = r6inv * (_lj1[ij_type]*r3inv - _lj2[ij_type]); + if(eflag) evdwl += factor_lj*(r6inv*(_lj3[ij_type]*r3inv-_lj4[ij_type]) - _offset[ij_type]); + return factor_lj*forcelj*r2inv; +} + diff --git a/lib/cuda/pair_lj_charmm_coul_charmm_cuda.cu b/lib/cuda/pair_lj_charmm_coul_charmm_cuda.cu new file mode 100644 index 0000000000..b5a12755da --- /dev/null +++ b/lib/cuda/pair_lj_charmm_coul_charmm_cuda.cu @@ -0,0 +1,78 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ + +#include + +#define _lj1 MY_AP(coeff1_gm) +#define _lj2 MY_AP(coeff2_gm) +#define _lj3 MY_AP(coeff3_gm) +#define _lj4 MY_AP(coeff4_gm) + +#include "pair_lj_charmm_coul_charmm_cuda_cu.h" +#include "pair_lj_charmm_coul_charmm_cuda_kernel_nc.cu" + +#include + +void Cuda_PairLJCharmmCoulCharmmCuda_Init(cuda_shared_data* sdata,F_FLOAT cut_coul_innersq,F_FLOAT denom_lj_inv,F_FLOAT denom_coul_inv) +{ + Cuda_Pair_Init_AllStyles(sdata, 4,true,true,true); + cudaMemcpyToSymbol(MY_CONST(cut_coul_innersq_global) , &cut_coul_innersq , sizeof(F_FLOAT) ); + cudaMemcpyToSymbol(MY_CONST(denom_lj_inv) , &denom_lj_inv , sizeof(F_FLOAT) ); + cudaMemcpyToSymbol(MY_CONST(denom_coul_inv) , &denom_coul_inv , sizeof(F_FLOAT) ); + + return; +} + + + +void Cuda_PairLJCharmmCoulCharmmCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, + int eflag_atom,int vflag_atom,F_FLOAT denom_lj,F_FLOAT cut_coul_innersq,F_FLOAT denom_coul) +{ + + static short init=0; + if(! 
init) + { + init = 1; + Cuda_PairLJCharmmCoulCharmmCuda_Init(sdata,cut_coul_innersq,1.0/denom_lj,1.0/denom_coul); + } + + dim3 grid,threads; + int sharedperproc; + + Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc,true,192); + + cudaStream_t* streams = (cudaStream_t*) CudaWrapper_returnStreams(); + if(sdata->pair.use_block_per_atom) + Pair_Kernel_BpA + <<>> (eflag, vflag,eflag_atom,vflag_atom); + else + Pair_Kernel_TpA + <<>> (eflag, vflag,eflag_atom,vflag_atom); + + Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag); +} + +#undef _lj1 +#undef _lj2 +#undef _lj3 +#undef _lj4 diff --git a/lib/cuda/pair_lj_charmm_coul_charmm_cuda_cu.h b/lib/cuda/pair_lj_charmm_coul_charmm_cuda_cu.h new file mode 100644 index 0000000000..3b96ab4481 --- /dev/null +++ b/lib/cuda/pair_lj_charmm_coul_charmm_cuda_cu.h @@ -0,0 +1,26 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ + +#include "cuda_shared.h" + +extern "C" void Cuda_PairLJCharmmCoulCharmmCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,int eflag_atom,int vflag_atom,F_FLOAT denom_lj,F_FLOAT cut_coul_innersq,F_FLOAT denom_coul); diff --git a/lib/cuda/pair_lj_charmm_coul_charmm_cuda_kernel_nc.cu b/lib/cuda/pair_lj_charmm_coul_charmm_cuda_kernel_nc.cu new file mode 100644 index 0000000000..baaea5d4e5 --- /dev/null +++ b/lib/cuda/pair_lj_charmm_coul_charmm_cuda_kernel_nc.cu @@ -0,0 +1,72 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ +__device__ inline F_FLOAT PairLJCharmmCuda_Eval(const F_FLOAT& rsq,const int ij_type,F_FLOAT& factor_lj,int& eflag, ENERGY_FLOAT& evdwl) +{ + const F_FLOAT r2inv = F_F(1.0)/rsq; + const F_FLOAT r6inv = r2inv*r2inv*r2inv; + F_FLOAT forcelj = r6inv * (_lj1[ij_type]*r6inv - _lj2[ij_type]); + F_FLOAT philj,switch1; + if(rsq > _cut_innersq_global) + { + switch1 = (_cutsq_global-rsq) * (_cutsq_global-rsq) * + (_cutsq_global + F_F(2.0)*rsq - F_F(3.0)*_cut_innersq_global) * _denom_lj_inv; + const F_FLOAT switch2 = F_F(12.0)*rsq * (_cutsq_global-rsq) * + (rsq-_cut_innersq_global) * _denom_lj_inv; + philj = r6inv * (_lj3[ij_type]*r6inv - _lj4[ij_type]); + forcelj = forcelj*switch1 + philj*switch2; + } + + if (eflag) + { + ENERGY_FLOAT evdwl_tmp = factor_lj; + if (rsq > _cut_innersq_global) + { + evdwl_tmp*=philj*switch1; + } + else + evdwl_tmp*= r6inv * (_lj3[ij_type]*r6inv - _lj4[ij_type]); + evdwl+=evdwl_tmp; + } + + return factor_lj*forcelj*r2inv; +} + +__device__ inline F_FLOAT CoulCharmmCuda_Eval(const F_FLOAT& rsq,F_FLOAT& factor_coul,int& eflag, ENERGY_FLOAT& ecoul, F_FLOAT qij) +{ + F_FLOAT forcecoul; + ENERGY_FLOAT ecoul_tmp = forcecoul = _qqrd2e * qij *_RSQRT_(rsq)*factor_coul; + if (rsq > _cut_coul_innersq_global) { + const F_FLOAT switch1 = (_cut_coulsq_global-rsq) * (_cut_coulsq_global-rsq) * + (_cut_coulsq_global + F_F(2.0)*rsq - F_F(3.0)*_cut_coul_innersq_global) * _denom_coul_inv; + ecoul_tmp *= switch1; + const F_FLOAT switch2 = F_F(12.0)*rsq * (_cut_coulsq_global-rsq) * + (rsq-_cut_coul_innersq_global) * _denom_coul_inv; + forcecoul *= switch1 + switch2; + } + if(eflag) + { + ecoul += ecoul_tmp*factor_coul; + } + return forcecoul*(F_F(1.0)/rsq); +} + diff --git a/lib/cuda/pair_lj_charmm_coul_charmm_implicit_cuda.cu b/lib/cuda/pair_lj_charmm_coul_charmm_implicit_cuda.cu new file mode 100644 index 0000000000..9bfb0bcc0e --- /dev/null +++ 
b/lib/cuda/pair_lj_charmm_coul_charmm_implicit_cuda.cu @@ -0,0 +1,85 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#include + +#define _lj1 MY_AP(coeff1_gm) +#define _lj2 MY_AP(coeff2_gm) +#define _lj3 MY_AP(coeff3_gm) +#define _lj4 MY_AP(coeff4_gm) +#define _cut_coul_innersq_global MY_AP(cut_coul_innersq_global) +#define _denom_lj_inv MY_AP(denom_lj_inv) +#define _denom_coul_inv MY_AP(denom_coul_inv) +__device__ __constant__ F_FLOAT _cut_coul_innersq_global; +__device__ __constant__ F_FLOAT _denom_lj_inv; +__device__ __constant__ F_FLOAT _denom_coul_inv; + + +#include "pair_lj_charmm_coul_charmm_implicit_cuda_cu.h" +#include "pair_lj_charmm_coul_charmm_implicit_cuda_kernel_nc.cu" + +#include + +void Cuda_PairLJCharmmCoulCharmmImplicitCuda_Init(cuda_shared_data* sdata,F_FLOAT cut_coul_innersq,F_FLOAT denom_lj_inv,F_FLOAT denom_coul_inv) +{ + Cuda_Pair_Init_AllStyles(sdata, 4,true,true,true); + cudaMemcpyToSymbol(MY_CONST(cut_coul_innersq_global) , &cut_coul_innersq , sizeof(F_FLOAT) ); + cudaMemcpyToSymbol(MY_CONST(denom_lj_inv) , &denom_lj_inv , sizeof(F_FLOAT) ); + cudaMemcpyToSymbol(MY_CONST(denom_coul_inv) , &denom_coul_inv , sizeof(F_FLOAT) ); + + 
return; +} + + + +void Cuda_PairLJCharmmCoulCharmmImplicitCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, + int eflag_atom,int vflag_atom,F_FLOAT denom_lj,F_FLOAT cut_coul_innersq,F_FLOAT denom_coul) +{ + + static short init=0; + if(! init) + { + init = 1; + Cuda_PairLJCharmmCoulCharmmImplicitCuda_Init(sdata,cut_coul_innersq,1.0/denom_lj,1.0/denom_coul); + } + + dim3 grid,threads; + int sharedperproc; + + Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc,true,192); + + cudaStream_t* streams = (cudaStream_t*) CudaWrapper_returnStreams(); + if(sdata->pair.use_block_per_atom) + Pair_Kernel_BpA + <<>> (eflag, vflag,eflag_atom,vflag_atom); + else + Pair_Kernel_TpA + <<>> (eflag, vflag,eflag_atom,vflag_atom); + + Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag); +} + +#undef _lj1 +#undef _lj2 +#undef _lj3 +#undef _lj4 diff --git a/lib/cuda/pair_lj_charmm_coul_charmm_implicit_cuda_cu.h b/lib/cuda/pair_lj_charmm_coul_charmm_implicit_cuda_cu.h new file mode 100644 index 0000000000..119163b291 --- /dev/null +++ b/lib/cuda/pair_lj_charmm_coul_charmm_implicit_cuda_cu.h @@ -0,0 +1,26 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ + +#include "cuda_shared.h" + +extern "C" void Cuda_PairLJCharmmCoulCharmmImplicitCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,int eflag_atom,int vflag_atom,F_FLOAT denom_lj,F_FLOAT cut_coul_innersq,F_FLOAT denom_coul); diff --git a/lib/cuda/pair_lj_charmm_coul_charmm_implicit_cuda_kernel_nc.cu b/lib/cuda/pair_lj_charmm_coul_charmm_implicit_cuda_kernel_nc.cu new file mode 100644 index 0000000000..c67037b7ce --- /dev/null +++ b/lib/cuda/pair_lj_charmm_coul_charmm_implicit_cuda_kernel_nc.cu @@ -0,0 +1,42 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ + +__device__ inline F_FLOAT CoulCharmmImplicitCuda_Eval(const F_FLOAT& rsq,F_FLOAT& factor_coul,int& eflag, ENERGY_FLOAT& ecoul, F_FLOAT qij) +{ + F_FLOAT forcecoul; + ENERGY_FLOAT ecoul_tmp = forcecoul = _qqrd2e * qij *(F_F(1.0)/rsq)*factor_coul; + if (rsq > _cut_coul_innersq_global) { + const F_FLOAT switch1 = (_cut_coulsq_global-rsq) * (_cut_coulsq_global-rsq) * + (_cut_coulsq_global + F_F(2.0)*rsq - F_F(3.0)*_cut_coul_innersq_global) * _denom_coul_inv; + ecoul_tmp *= switch1; + const F_FLOAT switch2 = F_F(12.0)*rsq * (_cut_coulsq_global-rsq) * + (rsq-_cut_coul_innersq_global) * _denom_coul_inv; + forcecoul *= (switch1 + switch2); + } + if(eflag) + { + ecoul += ecoul_tmp*factor_coul; + } + return F_F(2.0)*forcecoul*(F_F(1.0)/rsq); +} + diff --git a/lib/cuda/pair_lj_charmm_coul_long_cuda.cu b/lib/cuda/pair_lj_charmm_coul_long_cuda.cu new file mode 100644 index 0000000000..7c1a5ac46c --- /dev/null +++ b/lib/cuda/pair_lj_charmm_coul_long_cuda.cu @@ -0,0 +1,75 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ + +#include + +#define _lj1 MY_AP(coeff1_gm) +#define _lj2 MY_AP(coeff2_gm) +#define _lj3 MY_AP(coeff3_gm) +#define _lj4 MY_AP(coeff4_gm) + +#include "pair_lj_charmm_coul_long_cuda_cu.h" + +#include + +void Cuda_PairLJCharmmCoulLongCuda_Init(cuda_shared_data* sdata,F_FLOAT denom_lj_inv) +{ + Cuda_Pair_Init_AllStyles(sdata, 4,true,true,true); + cudaMemcpyToSymbol(MY_CONST(denom_lj_inv) , &denom_lj_inv , sizeof(F_FLOAT) ); + + return; +} + + + +void Cuda_PairLJCharmmCoulLongCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, + int eflag_atom,int vflag_atom,F_FLOAT denom_lj) +{ + + static short init=0; + if(! init) + { + init = 1; + Cuda_PairLJCharmmCoulLongCuda_Init(sdata,1.0/denom_lj); + } + + dim3 grid,threads; + int sharedperproc; + + Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc,true,192); + + cudaStream_t* streams = (cudaStream_t*) CudaWrapper_returnStreams(); + if(sdata->pair.use_block_per_atom) + Pair_Kernel_BpA + <<>> (eflag, vflag,eflag_atom,vflag_atom); + else + Pair_Kernel_TpA + <<>> (eflag, vflag,eflag_atom,vflag_atom); + + Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag); +} + +#undef _lj1 +#undef _lj2 +#undef _lj3 +#undef _lj4 diff --git a/lib/cuda/pair_lj_charmm_coul_long_cuda_cu.h b/lib/cuda/pair_lj_charmm_coul_long_cuda_cu.h new file mode 100644 index 0000000000..0f29e8f97b --- /dev/null +++ b/lib/cuda/pair_lj_charmm_coul_long_cuda_cu.h @@ -0,0 +1,26 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. 
+ + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#include "cuda_shared.h" + +extern "C" void Cuda_PairLJCharmmCoulLongCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,int eflag_atom,int vflag_atom,F_FLOAT denom_lj); diff --git a/lib/cuda/pair_lj_class2_coul_cut_cuda.cu b/lib/cuda/pair_lj_class2_coul_cut_cuda.cu new file mode 100644 index 0000000000..7cd53d31ff --- /dev/null +++ b/lib/cuda/pair_lj_class2_coul_cut_cuda.cu @@ -0,0 +1,72 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ + +#include + +#define _lj1 MY_AP(coeff1) +#define _lj2 MY_AP(coeff2) +#define _lj3 MY_AP(coeff3) +#define _lj4 MY_AP(coeff4) + +#include "pair_lj_class2_coul_cut_cuda_cu.h" + +#include + +void Cuda_PairLJClass2CoulCutCuda_Init(cuda_shared_data* sdata) +{ + Cuda_Pair_Init_AllStyles(sdata, 4,true); +} + +void Cuda_PairLJClass2CoulCutCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,int eflag_atom,int vflag_atom) +{ + + + static short init=0; + if(! init) + { + init = 1; + Cuda_PairLJClass2CoulCutCuda_Init(sdata); + } + + dim3 grid,threads; + int sharedperproc; + + + Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc,true,192); + + cudaStream_t* streams = (cudaStream_t*) CudaWrapper_returnStreams(); + if(sdata->pair.use_block_per_atom) + Pair_Kernel_BpA + <<>> (eflag, vflag,eflag_atom,vflag_atom); + else + Pair_Kernel_TpA + <<>> (eflag, vflag,eflag_atom,vflag_atom); + + Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag); +} + + +#undef _lj1 +#undef _lj2 +#undef _lj3 +#undef _lj4 diff --git a/lib/cuda/pair_lj_class2_coul_cut_cuda_cu.h b/lib/cuda/pair_lj_class2_coul_cut_cuda_cu.h new file mode 100644 index 0000000000..a656ebbd89 --- /dev/null +++ b/lib/cuda/pair_lj_class2_coul_cut_cuda_cu.h @@ -0,0 +1,26 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. 
+ + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#include "cuda_shared.h" + +extern "C" void Cuda_PairLJClass2CoulCutCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,int eflag_atom,int vflag_atom); diff --git a/lib/cuda/pair_lj_class2_coul_long_cuda.cu b/lib/cuda/pair_lj_class2_coul_long_cuda.cu new file mode 100644 index 0000000000..4f15d42936 --- /dev/null +++ b/lib/cuda/pair_lj_class2_coul_long_cuda.cu @@ -0,0 +1,72 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ + +#include + +#define _lj1 MY_AP(coeff1) +#define _lj2 MY_AP(coeff2) +#define _lj3 MY_AP(coeff3) +#define _lj4 MY_AP(coeff4) + +#include "pair_lj_class2_coul_long_cuda_cu.h" + +#include + +void Cuda_PairLJClass2CoulLongCuda_Init(cuda_shared_data* sdata) +{ + Cuda_Pair_Init_AllStyles(sdata, 4,true); +} + +void Cuda_PairLJClass2CoulLongCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,int eflag_atom,int vflag_atom) +{ + + + static short init=0; + if(! init) + { + init = 1; + Cuda_PairLJClass2CoulLongCuda_Init(sdata); + } + + dim3 grid,threads; + int sharedperproc; + + + Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc,true,192); + + cudaStream_t* streams = (cudaStream_t*) CudaWrapper_returnStreams(); + if(sdata->pair.use_block_per_atom) + Pair_Kernel_BpA + <<>> (eflag, vflag,eflag_atom,vflag_atom); + else + Pair_Kernel_TpA + <<>> (eflag, vflag,eflag_atom,vflag_atom); + + Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag); +} + + +#undef _lj1 +#undef _lj2 +#undef _lj3 +#undef _lj4 diff --git a/lib/cuda/pair_lj_class2_coul_long_cuda_cu.h b/lib/cuda/pair_lj_class2_coul_long_cuda_cu.h new file mode 100644 index 0000000000..dea620defe --- /dev/null +++ b/lib/cuda/pair_lj_class2_coul_long_cuda_cu.h @@ -0,0 +1,26 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. 
+ + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#include "cuda_shared.h" + +extern "C" void Cuda_PairLJClass2CoulLongCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,int eflag_atom,int vflag_atom); diff --git a/lib/cuda/pair_lj_class2_cuda.cu b/lib/cuda/pair_lj_class2_cuda.cu new file mode 100644 index 0000000000..1064d12cf6 --- /dev/null +++ b/lib/cuda/pair_lj_class2_cuda.cu @@ -0,0 +1,74 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ + +#include + +#define _lj1 MY_AP(coeff1) +#define _lj2 MY_AP(coeff2) +#define _lj3 MY_AP(coeff3) +#define _lj4 MY_AP(coeff4) + +#include "pair_lj_class2_cuda_cu.h" +#include "pair_lj_class2_cuda_kernel_nc.cu" + +#include + +void Cuda_PairLJClass2Cuda_Init(cuda_shared_data* sdata) +{ + Cuda_Pair_Init_AllStyles(sdata, 4); +} + +void Cuda_PairLJClass2Cuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,int eflag_atom,int vflag_atom) +{ + + + static short init=0; + if(! init) + { + init = 1; + Cuda_PairLJClass2Cuda_Init(sdata); + } + + dim3 grid,threads; + int sharedperproc; + + //int maxthreads=192*sizeof(double)/sizeof(F_FLOAT); + //if(CUDA_ARCH==20) maxthreads*=2; + //cudaFuncSetCacheConfig(Pair_Kernel_TpA_opt,cudaFuncCachePreferL1); + Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc,false,192); + cudaStream_t* streams = (cudaStream_t*) CudaWrapper_returnStreams(); + if(sdata->pair.use_block_per_atom) + Pair_Kernel_BpA + <<>> (eflag, vflag,eflag_atom,vflag_atom); + else + Pair_Kernel_TpA + <<>> (eflag, vflag,eflag_atom,vflag_atom); + + Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag); +} + + +#undef _lj1 +#undef _lj2 +#undef _lj3 +#undef _lj4 diff --git a/lib/cuda/pair_lj_class2_cuda_cu.h b/lib/cuda/pair_lj_class2_cuda_cu.h new file mode 100644 index 0000000000..cc14d9eda4 --- /dev/null +++ b/lib/cuda/pair_lj_class2_cuda_cu.h @@ -0,0 +1,26 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. 
+ + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#include "cuda_shared.h" + +extern "C" void Cuda_PairLJClass2Cuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,int eflag_atom,int vflag_atom); diff --git a/lib/cuda/pair_lj_class2_cuda_kernel_nc.cu b/lib/cuda/pair_lj_class2_cuda_kernel_nc.cu new file mode 100644 index 0000000000..e5674d8b74 --- /dev/null +++ b/lib/cuda/pair_lj_class2_cuda_kernel_nc.cu @@ -0,0 +1,33 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ + +__device__ inline F_FLOAT PairLJClass2Cuda_Eval(const F_FLOAT& rsq,const int ij_type,F_FLOAT& factor_lj,int& eflag, ENERGY_FLOAT& evdwl) +{ + const F_FLOAT r2inv = F_F(1.0)/rsq; + const F_FLOAT r6inv = r2inv*r2inv*r2inv; + const F_FLOAT r3inv = _SQRT_(r6inv); + if (eflag) evdwl += factor_lj*(r6inv*(_lj3[ij_type]*r3inv- + _lj4[ij_type]) - _offset[ij_type]); + return factor_lj*r6inv * (_lj1[ij_type]*r3inv - _lj2[ij_type])*r2inv; +} + diff --git a/lib/cuda/pair_lj_cut_coul_cut_cuda.cu b/lib/cuda/pair_lj_cut_coul_cut_cuda.cu new file mode 100644 index 0000000000..c3b4a40749 --- /dev/null +++ b/lib/cuda/pair_lj_cut_coul_cut_cuda.cu @@ -0,0 +1,72 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ + +#include + +#define _lj1 MY_AP(coeff1) +#define _lj2 MY_AP(coeff2) +#define _lj3 MY_AP(coeff3) +#define _lj4 MY_AP(coeff4) + +#include "pair_lj_cut_coul_cut_cuda_cu.h" + +#include + +void Cuda_PairLJCutCoulCutCuda_Init(cuda_shared_data* sdata) +{ + Cuda_Pair_Init_AllStyles(sdata, 4,true); +} + +void Cuda_PairLJCutCoulCutCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,int eflag_atom,int vflag_atom) +{ + + + static short init=0; + if(! init) + { + init = 1; + Cuda_PairLJCutCoulCutCuda_Init(sdata); + } + + dim3 grid,threads; + int sharedperproc; + + + Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc,true,192); + + cudaStream_t* streams = (cudaStream_t*) CudaWrapper_returnStreams(); + if(sdata->pair.use_block_per_atom) + Pair_Kernel_BpA + <<>> (eflag, vflag,eflag_atom,vflag_atom); + else + Pair_Kernel_TpA + <<>> (eflag, vflag,eflag_atom,vflag_atom); + + Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag); +} + + +#undef _lj1 +#undef _lj2 +#undef _lj3 +#undef _lj4 diff --git a/lib/cuda/pair_lj_cut_coul_cut_cuda_cu.h b/lib/cuda/pair_lj_cut_coul_cut_cuda_cu.h new file mode 100644 index 0000000000..95fadcd39b --- /dev/null +++ b/lib/cuda/pair_lj_cut_coul_cut_cuda_cu.h @@ -0,0 +1,26 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. 
+ + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#include "cuda_shared.h" + +extern "C" void Cuda_PairLJCutCoulCutCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,int eflag_atom,int vflag_atom); diff --git a/lib/cuda/pair_lj_cut_coul_debye_cuda.cu b/lib/cuda/pair_lj_cut_coul_debye_cuda.cu new file mode 100644 index 0000000000..f5e074ba82 --- /dev/null +++ b/lib/cuda/pair_lj_cut_coul_debye_cuda.cu @@ -0,0 +1,71 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ + +#include + +#define _lj1 MY_AP(coeff1) +#define _lj2 MY_AP(coeff2) +#define _lj3 MY_AP(coeff3) +#define _lj4 MY_AP(coeff4) + +#include "pair_lj_cut_coul_debye_cuda_cu.h" + +#include + +void Cuda_PairLJCutCoulDebyeCuda_Init(cuda_shared_data* sdata) +{ + Cuda_Pair_Init_AllStyles(sdata, 4,true); +} + +void Cuda_PairLJCutCoulDebyeCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,int eflag_atom,int vflag_atom) +{ + + + static short init=0; + if(! init) + { + init = 1; + Cuda_PairLJCutCoulDebyeCuda_Init(sdata); + } + + dim3 grid,threads; + int sharedperproc; + + + Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc,true,192); + + cudaStream_t* streams = (cudaStream_t*) CudaWrapper_returnStreams(); + if(sdata->pair.use_block_per_atom) + Pair_Kernel_BpA + <<>> (eflag, vflag,eflag_atom,vflag_atom); + else + Pair_Kernel_TpA + <<>> (eflag, vflag,eflag_atom,vflag_atom); + + Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag); +} + +#undef _lj1 +#undef _lj2 +#undef _lj3 +#undef _lj4 diff --git a/lib/cuda/pair_lj_cut_coul_debye_cuda_cu.h b/lib/cuda/pair_lj_cut_coul_debye_cuda_cu.h new file mode 100644 index 0000000000..b6df066ac1 --- /dev/null +++ b/lib/cuda/pair_lj_cut_coul_debye_cuda_cu.h @@ -0,0 +1,26 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. 
+ + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#include "cuda_shared.h" + +extern "C" void Cuda_PairLJCutCoulDebyeCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,int eflag_atom,int vflag_atom); diff --git a/lib/cuda/pair_lj_cut_coul_long_cuda.cu b/lib/cuda/pair_lj_cut_coul_long_cuda.cu new file mode 100644 index 0000000000..dd3e1df978 --- /dev/null +++ b/lib/cuda/pair_lj_cut_coul_long_cuda.cu @@ -0,0 +1,72 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ + +#include + +#define _lj1 MY_AP(coeff1) +#define _lj2 MY_AP(coeff2) +#define _lj3 MY_AP(coeff3) +#define _lj4 MY_AP(coeff4) + +#include "pair_lj_cut_coul_long_cuda_cu.h" + +#include + +void Cuda_PairLJCutCoulLongCuda_Init(cuda_shared_data* sdata) +{ + Cuda_Pair_Init_AllStyles(sdata, 4,true); +} + +void Cuda_PairLJCutCoulLongCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,int eflag_atom,int vflag_atom) +{ + + + static short init=0; + if(! init) + { + init = 1; + Cuda_PairLJCutCoulLongCuda_Init(sdata); + } + + dim3 grid,threads; + int sharedperproc; + + + Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc,true,192); + + cudaStream_t* streams = (cudaStream_t*) CudaWrapper_returnStreams(); + if(sdata->pair.use_block_per_atom) + Pair_Kernel_BpA + <<>> (eflag, vflag,eflag_atom,vflag_atom); + else + Pair_Kernel_TpA + <<>> (eflag, vflag,eflag_atom,vflag_atom); + + Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag); +} + + +#undef _lj1 +#undef _lj2 +#undef _lj3 +#undef _lj4 diff --git a/lib/cuda/pair_lj_cut_coul_long_cuda_cu.h b/lib/cuda/pair_lj_cut_coul_long_cuda_cu.h new file mode 100644 index 0000000000..9cac5457bd --- /dev/null +++ b/lib/cuda/pair_lj_cut_coul_long_cuda_cu.h @@ -0,0 +1,30 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. 
+ + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#include "cuda_shared.h" + +#ifdef CUDA_USE_BINNING +extern "C" void Cuda_PairLJCutCoulLongCuda(cuda_shared_data* sdata, int eflag, int vflag); +#else +extern "C" void Cuda_PairLJCutCoulLongCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,int eflag_atom,int vflag_atom); +#endif diff --git a/lib/cuda/pair_lj_cut_cuda.cu b/lib/cuda/pair_lj_cut_cuda.cu new file mode 100644 index 0000000000..8f0c862004 --- /dev/null +++ b/lib/cuda/pair_lj_cut_cuda.cu @@ -0,0 +1,74 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ + +#include <stdio.h> + +#define _lj1 MY_AP(coeff1) +#define _lj2 MY_AP(coeff2) +#define _lj3 MY_AP(coeff3) +#define _lj4 MY_AP(coeff4) + +#include "pair_lj_cut_cuda_cu.h" +#include "pair_lj_cut_cuda_kernel_nc.cu" + +#include <time.h> + +void Cuda_PairLJCutCuda_Init(cuda_shared_data* sdata) +{ + Cuda_Pair_Init_AllStyles<PAIR_LJ_CUT,COUL_NONE,DATA_NONE>(sdata, 4); +} + +void Cuda_PairLJCutCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,int eflag_atom,int vflag_atom) +{ + + + static short init=0; + if(! init) + { + init = 1; + Cuda_PairLJCutCuda_Init(sdata); + } + + dim3 grid,threads; + int sharedperproc; + + //int maxthreads=192*sizeof(double)/sizeof(F_FLOAT); + //if(CUDA_ARCH==20) maxthreads*=2; + //cudaFuncSetCacheConfig(Pair_Kernel_TpA_opt,cudaFuncCachePreferL1); + Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc,false,192); + cudaStream_t* streams = (cudaStream_t*) CudaWrapper_returnStreams(); + if(sdata->pair.use_block_per_atom) + Pair_Kernel_BpA<PAIR_LJ_CUT,COUL_NONE,DATA_NONE> + <<<grid, threads,sharedperproc*sizeof(ENERGY_FLOAT)*threads.x,streams[1]>>> (eflag, vflag,eflag_atom,vflag_atom); + else + Pair_Kernel_TpA<PAIR_LJ_CUT,COUL_NONE,DATA_NONE> + <<<grid, threads,sharedperproc*sizeof(ENERGY_FLOAT)*threads.x,streams[1]>>> (eflag, vflag,eflag_atom,vflag_atom); + + Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag); +} + + +#undef _lj1 +#undef _lj2 +#undef _lj3 +#undef _lj4 diff --git a/lib/cuda/pair_lj_cut_cuda_cu.h b/lib/cuda/pair_lj_cut_cuda_cu.h new file mode 100644 index 0000000000..9d9722501f --- /dev/null +++ b/lib/cuda/pair_lj_cut_cuda_cu.h @@ -0,0 +1,30 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. 
+ + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#include "cuda_shared.h" + +#ifdef CUDA_USE_BINNING +extern "C" void Cuda_PairLJCutCuda(cuda_shared_data* sdata, int eflag, int vflag); +#else +extern "C" void Cuda_PairLJCutCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,int eflag_atom,int vflag_atom); +#endif diff --git a/lib/cuda/pair_lj_cut_cuda_kernel_nc.cu b/lib/cuda/pair_lj_cut_cuda_kernel_nc.cu new file mode 100644 index 0000000000..d263e4a5cf --- /dev/null +++ b/lib/cuda/pair_lj_cut_cuda_kernel_nc.cu @@ -0,0 +1,32 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ + +__device__ inline F_FLOAT PairLJCutCuda_Eval(const F_FLOAT& rsq,const int ij_type,F_FLOAT& factor_lj,int& eflag, ENERGY_FLOAT& evdwl) +{ + const F_FLOAT r2inv = F_F(1.0)/rsq; + const F_FLOAT r6inv = r2inv*r2inv*r2inv; + if (eflag) evdwl += factor_lj*(r6inv*(_lj3[ij_type]*r6inv- + _lj4[ij_type]) - _offset[ij_type]); + return factor_lj*r6inv * (_lj1[ij_type]*r6inv - _lj2[ij_type])*r2inv; +} + diff --git a/lib/cuda/pair_lj_cut_experimental_cuda.cu b/lib/cuda/pair_lj_cut_experimental_cuda.cu new file mode 100644 index 0000000000..6996c02236 --- /dev/null +++ b/lib/cuda/pair_lj_cut_experimental_cuda.cu @@ -0,0 +1,75 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ + +#include <stdio.h> + +#define _lj1 MY_AP(coeff1) +#define _lj2 MY_AP(coeff2) +#define _lj3 MY_AP(coeff3) +#define _lj4 MY_AP(coeff4) + +#include "pair_lj_cut_experimental_cuda_cu.h" + +#include <time.h> + +void Cuda_PairLJCutExperimentalCuda_Init(cuda_shared_data* sdata) +{ + Cuda_Pair_Init_AllStyles<PAIR_LJ_CUT,COUL_NONE,DATA_NONE>(sdata, 4); +} + +void Cuda_PairLJCutExperimentalCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,int eflag_atom,int vflag_atom) +{ + + + static short init=0; + if(! init) + { + init = 1; + Cuda_PairLJCutExperimentalCuda_Init(sdata); + } + + dim3 grid,threads; + int sharedperproc; + + //int maxthreads=192*sizeof(double)/sizeof(F_FLOAT); + //if(CUDA_ARCH==20) maxthreads*=2; + //cudaFuncSetCacheConfig(Pair_Kernel_TpA_opt,cudaFuncCachePreferL1); + Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc,false,192); + if (sharedperproc==0) sharedperproc++; + //printf("comm_phase: %i\n",sdata->comm.comm_phase); + + cudaStream_t* streams = (cudaStream_t*) CudaWrapper_returnStreams(); + if(sdata->pair.use_block_per_atom) + Pair_Kernel_BpA<PAIR_LJ_CUT,COUL_NONE,DATA_NONE> + <<<grid, threads,sharedperproc*sizeof(ENERGY_FLOAT)*threads.x,streams[1]>>> (eflag, vflag,eflag_atom,vflag_atom); + else + Pair_Kernel_TpA_opt<PAIR_LJ_CUT,COUL_NONE,DATA_NONE> + <<<grid, threads,sharedperproc*sizeof(ENERGY_FLOAT)*threads.x,streams[1]>>> (eflag, vflag,eflag_atom,vflag_atom,sdata->comm.comm_phase); + Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag); +} + + +#undef _lj1 +#undef _lj2 +#undef _lj3 +#undef _lj4 diff --git a/lib/cuda/pair_lj_cut_experimental_cuda_cu.h b/lib/cuda/pair_lj_cut_experimental_cuda_cu.h new file mode 100644 index 0000000000..4cc1f6de36 --- /dev/null +++ b/lib/cuda/pair_lj_cut_experimental_cuda_cu.h @@ -0,0 +1,26 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS 
directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#include "cuda_shared.h" + +extern "C" void Cuda_PairLJCutExperimentalCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,int eflag_atom,int vflag_atom); diff --git a/lib/cuda/pair_lj_expand_cuda.cu b/lib/cuda/pair_lj_expand_cuda.cu new file mode 100644 index 0000000000..e1fa43d050 --- /dev/null +++ b/lib/cuda/pair_lj_expand_cuda.cu @@ -0,0 +1,77 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ + +#include <stdio.h> + +#define _lj1 MY_AP(coeff1) +#define _lj2 MY_AP(coeff2) +#define _lj3 MY_AP(coeff3) +#define _lj4 MY_AP(coeff4) +#define _shift MY_AP(coeff5) + +#include "pair_lj_expand_cuda_cu.h" +#include "pair_lj_expand_cuda_kernel_nc.cu" +#include <time.h> + + +void Cuda_PairLJExpandCuda_Init(cuda_shared_data* sdata) +{ + Cuda_Pair_Init_AllStyles<PAIR_LJ_EXPAND,COUL_NONE,DATA_NONE>(sdata, 5); +} + + + + +void Cuda_PairLJExpandCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,int eflag_atom,int vflag_atom) +{ + + // initialize only on first call + static short init=0; + if(! init) + { + init = 1; + Cuda_PairLJExpandCuda_Init(sdata); + } + + dim3 grid,threads; + int sharedperproc; + + Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc,false,256); + + cudaStream_t* streams = (cudaStream_t*) CudaWrapper_returnStreams(); + if(sdata->pair.use_block_per_atom) + Pair_Kernel_BpA<PAIR_LJ_EXPAND,COUL_NONE,DATA_NONE> + <<<grid, threads,sharedperproc*sizeof(ENERGY_FLOAT)*threads.x,streams[1]>>> (eflag, vflag,eflag_atom,vflag_atom); + else + Pair_Kernel_TpA<PAIR_LJ_EXPAND,COUL_NONE,DATA_NONE> + <<<grid, threads,sharedperproc*sizeof(ENERGY_FLOAT)*threads.x,streams[1]>>> (eflag, vflag,eflag_atom,vflag_atom); + + Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag); +} + +#undef _lj1 +#undef _lj2 +#undef _lj3 +#undef _lj4 + + diff --git a/lib/cuda/pair_lj_expand_cuda_cu.h b/lib/cuda/pair_lj_expand_cuda_cu.h new file mode 100644 index 0000000000..24164b6fa7 --- /dev/null +++ b/lib/cuda/pair_lj_expand_cuda_cu.h @@ -0,0 +1,26 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. 
+ + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#include "cuda_shared.h" + +extern "C" void Cuda_PairLJExpandCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,int eflag_atom,int vflag_atom); diff --git a/lib/cuda/pair_lj_expand_cuda_kernel_nc.cu b/lib/cuda/pair_lj_expand_cuda_kernel_nc.cu new file mode 100644 index 0000000000..533bd761fc --- /dev/null +++ b/lib/cuda/pair_lj_expand_cuda_kernel_nc.cu @@ -0,0 +1,33 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ +__device__ inline F_FLOAT PairLJExpandCuda_Eval(const F_FLOAT& rsq,const int ij_type,F_FLOAT& factor_lj,int& eflag, ENERGY_FLOAT& evdwl) +{ + const F_FLOAT r = _SQRT_(rsq); + const F_FLOAT rshift = r - _shift[ij_type]; + const F_FLOAT rshiftsq = rshift*rshift; + const F_FLOAT r2inv = F_F(1.0)/rshiftsq; + const F_FLOAT r6inv = r2inv*r2inv*r2inv; + const F_FLOAT forcelj = r6inv * (_lj1[ij_type]*r6inv - _lj2[ij_type]); + if(eflag) evdwl += factor_lj*(r6inv*(_lj3[ij_type]*r6inv-_lj4[ij_type]) - _offset[ij_type]); + return factor_lj*forcelj*(F_F(1.0)/rshift)*(F_F(1.0)/r); +} diff --git a/lib/cuda/pair_lj_gromacs_coul_gromacs_cuda.cu b/lib/cuda/pair_lj_gromacs_coul_gromacs_cuda.cu new file mode 100644 index 0000000000..7532e4b643 --- /dev/null +++ b/lib/cuda/pair_lj_gromacs_coul_gromacs_cuda.cu @@ -0,0 +1,102 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ + +#include <stdio.h> + +#define _lj1 MY_AP(coeff1_gm) +#define _lj2 MY_AP(coeff2_gm) +#define _lj3 MY_AP(coeff3_gm) +#define _lj4 MY_AP(coeff4_gm) +#define _ljsw1 MY_AP(coeff5_gm) +#define _ljsw2 MY_AP(coeff6_gm) +#define _ljsw3 MY_AP(coeff7_gm) +#define _ljsw4 MY_AP(coeff8_gm) +#define _ljsw5 MY_AP(coeff9_gm) + +#define _cut_coul_inner_global MY_AP(cut_coul_inner_global) +#define _coulsw1 MY_AP(coulsw1) +#define _coulsw2 MY_AP(coulsw2) +#define _coulsw5 MY_AP(coulsw5) +__device__ __constant__ F_FLOAT _cut_coul_inner_global; +__device__ __constant__ F_FLOAT _coulsw1; +__device__ __constant__ F_FLOAT _coulsw2; +__device__ __constant__ F_FLOAT _coulsw5; + + +#include "pair_lj_gromacs_coul_gromacs_cuda_cu.h" +#include "pair_lj_gromacs_coul_gromacs_cuda_kernel_nc.cu" + +#include <time.h> + +void Cuda_PairLJGromacsCoulGromacsCuda_Init(cuda_shared_data* sdata,F_FLOAT cut_coul_inner,F_FLOAT coulsw1,F_FLOAT coulsw2,F_FLOAT coulsw5) +{ + Cuda_Pair_Init_AllStyles<PAIR_LJ_GROMACS,COUL_GROMACS,DATA_NONE>(sdata, 9,true,true,true); + cudaMemcpyToSymbol(MY_CONST(cut_coul_inner_global) , &cut_coul_inner , sizeof(F_FLOAT) ); + cudaMemcpyToSymbol(MY_CONST(coulsw1) , &coulsw1 , sizeof(F_FLOAT) ); + cudaMemcpyToSymbol(MY_CONST(coulsw2) , &coulsw2 , sizeof(F_FLOAT) ); + cudaMemcpyToSymbol(MY_CONST(coulsw5) , &coulsw5 , sizeof(F_FLOAT) ); + + return; +} + + + +void Cuda_PairLJGromacsCoulGromacsCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, + int eflag_atom,int vflag_atom,F_FLOAT cut_coul_inner,F_FLOAT coulsw1,F_FLOAT coulsw2,F_FLOAT coulsw5) +{ + static short init=0; + if(! 
init) + { + init = 1; + Cuda_PairLJGromacsCoulGromacsCuda_Init(sdata,cut_coul_inner,coulsw1,coulsw2,coulsw5); + } + + dim3 grid,threads; + int sharedperproc; + + Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc,true,192); + + cudaStream_t* streams = (cudaStream_t*) CudaWrapper_returnStreams(); + if(sdata->pair.use_block_per_atom) + Pair_Kernel_BpA<PAIR_LJ_GROMACS,COUL_GROMACS,DATA_NONE> + <<<grid, threads,sharedperproc*sizeof(ENERGY_FLOAT)*threads.x,streams[1]>>> (eflag, vflag,eflag_atom,vflag_atom); + else + Pair_Kernel_TpA<PAIR_LJ_GROMACS,COUL_GROMACS,DATA_NONE> + <<<grid, threads,sharedperproc*sizeof(ENERGY_FLOAT)*threads.x,streams[1]>>> (eflag, vflag,eflag_atom,vflag_atom); + + Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag); +} + +#undef _lj1 +#undef _lj2 +#undef _lj3 +#undef _lj4 +#undef _ljsw1 +#undef _ljsw2 +#undef _ljsw3 +#undef _ljsw4 +#undef _ljsw5 +#undef _cut_coul_inner_global +#undef _coulsw1 +#undef _coulsw2 +#undef _coulsw5 diff --git a/lib/cuda/pair_lj_gromacs_coul_gromacs_cuda_cu.h b/lib/cuda/pair_lj_gromacs_coul_gromacs_cuda_cu.h new file mode 100644 index 0000000000..8dc5f8fcde --- /dev/null +++ b/lib/cuda/pair_lj_gromacs_coul_gromacs_cuda_cu.h @@ -0,0 +1,26 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ + +#include "cuda_shared.h" + +extern "C" void Cuda_PairLJGromacsCoulGromacsCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,int eflag_atom,int vflag_atom,F_FLOAT cut_coul_inner,F_FLOAT coulsw1,F_FLOAT coulsw2,F_FLOAT coulsw5); diff --git a/lib/cuda/pair_lj_gromacs_coul_gromacs_cuda_kernel_nc.cu b/lib/cuda/pair_lj_gromacs_coul_gromacs_cuda_kernel_nc.cu new file mode 100644 index 0000000000..29e0a63c90 --- /dev/null +++ b/lib/cuda/pair_lj_gromacs_coul_gromacs_cuda_kernel_nc.cu @@ -0,0 +1,46 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ + +__device__ inline F_FLOAT CoulGromacsCuda_Eval(const F_FLOAT& rsq,const int ij_type,F_FLOAT& factor_coul,int& eflag, ENERGY_FLOAT& ecoul, F_FLOAT qij) +{ + if (qij != F_F(0.0)) + { + F_FLOAT ecoul_tmp; + F_FLOAT forcecoul = _RSQRT_(rsq); + if(eflag) ecoul_tmp=forcecoul - _coulsw5; + if (rsq > _cut_coul_inner_global*_cut_coul_inner_global) { + const F_FLOAT r = F_F(1.0)/forcecoul; + const F_FLOAT tc = r - _cut_coul_inner_global; + forcecoul += r*tc*tc*(_coulsw1 + _coulsw2*tc); + if(eflag) ecoul_tmp-=tc*tc*tc*(_coulsw1*(F_F(1.0)/F_F(3.0)) + _coulsw2*tc*(F_F(1.0)/F_F(4.0))); + } + F_FLOAT qprod=_qqrd2e * qij*factor_coul; + forcecoul*=qprod; + if(eflag) + { + ecoul += ecoul_tmp*qprod; + } + return forcecoul*(F_F(1.0)/rsq); + } + return F_F(0.0); +} diff --git a/lib/cuda/pair_lj_gromacs_cuda.cu b/lib/cuda/pair_lj_gromacs_cuda.cu new file mode 100644 index 0000000000..ce0c08f6f0 --- /dev/null +++ b/lib/cuda/pair_lj_gromacs_cuda.cu @@ -0,0 +1,83 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ + +#include <stdio.h> + +#define _lj1 MY_AP(coeff1_gm) +#define _lj2 MY_AP(coeff2_gm) +#define _lj3 MY_AP(coeff3_gm) +#define _lj4 MY_AP(coeff4_gm) +#define _ljsw1 MY_AP(coeff5_gm) +#define _ljsw2 MY_AP(coeff6_gm) +#define _ljsw3 MY_AP(coeff7_gm) +#define _ljsw4 MY_AP(coeff8_gm) +#define _ljsw5 MY_AP(coeff9_gm) + +#include "pair_lj_gromacs_cuda_cu.h" +#include "pair_lj_gromacs_cuda_kernel_nc.cu" + +#include <time.h> + +void Cuda_PairLJGromacsCuda_Init(cuda_shared_data* sdata) +{ + Cuda_Pair_Init_AllStyles<PAIR_LJ_GROMACS,COUL_NONE,DATA_NONE>(sdata, 9,false,true,true); +} + + + +void Cuda_PairLJGromacsCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, + int eflag_atom,int vflag_atom) +{ + static short init=0; + if(! init) + { + init = 1; + Cuda_PairLJGromacsCuda_Init(sdata); + } + + dim3 grid,threads; + int sharedperproc; + + Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc,false,192); + + cudaStream_t* streams = (cudaStream_t*) CudaWrapper_returnStreams(); + if(sdata->pair.use_block_per_atom) + Pair_Kernel_BpA<PAIR_LJ_GROMACS,COUL_NONE,DATA_NONE> + <<<grid, threads,sharedperproc*sizeof(ENERGY_FLOAT)*threads.x,streams[1]>>> (eflag, vflag,eflag_atom,vflag_atom); + else + Pair_Kernel_TpA<PAIR_LJ_GROMACS,COUL_NONE,DATA_NONE> + <<<grid, threads,sharedperproc*sizeof(ENERGY_FLOAT)*threads.x,streams[1]>>> (eflag, vflag,eflag_atom,vflag_atom); + + Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag); + +} + +#undef _lj1 +#undef _lj2 +#undef _lj3 +#undef _lj4 +#undef _ljsw1 +#undef _ljsw2 +#undef _ljsw3 +#undef _ljsw4 +#undef _ljsw5 diff --git a/lib/cuda/pair_lj_gromacs_cuda_cu.h b/lib/cuda/pair_lj_gromacs_cuda_cu.h new file mode 100644 index 0000000000..970eb1f832 --- /dev/null +++ b/lib/cuda/pair_lj_gromacs_cuda_cu.h @@ -0,0 +1,26 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. 
+ + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#include "cuda_shared.h" + +extern "C" void Cuda_PairLJGromacsCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,int eflag_atom,int vflag_atom); diff --git a/lib/cuda/pair_lj_gromacs_cuda_kernel_nc.cu b/lib/cuda/pair_lj_gromacs_cuda_kernel_nc.cu new file mode 100644 index 0000000000..818c9f55fc --- /dev/null +++ b/lib/cuda/pair_lj_gromacs_cuda_kernel_nc.cu @@ -0,0 +1,51 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ + +__device__ inline F_FLOAT PairLJGromacsCuda_Eval(const F_FLOAT& rsq,const int ij_type,F_FLOAT& factor_lj,int& eflag, ENERGY_FLOAT& evdwl) +{ + F_FLOAT tlj; + const F_FLOAT r2inv = F_F(1.0)/rsq; + const F_FLOAT r = _RSQRT_(r2inv); + const F_FLOAT r6inv = r2inv*r2inv*r2inv; + F_FLOAT forcelj = r6inv * (_lj1[ij_type]*r6inv - _lj2[ij_type]); + const X_FLOAT cut_lj_innersq=(_cut_innersq_global > X_F(0.0)? _cut_innersq_global : _cut_innersq[ij_type]); + if (rsq > cut_lj_innersq) + { + tlj = r - _SQRT_(cut_lj_innersq); + forcelj += r*tlj*tlj*(_ljsw1[ij_type] + _ljsw2[ij_type]*tlj); + } + + if (eflag) + { + ENERGY_FLOAT evdwl_tmp = r6inv*(_lj3[ij_type]*r6inv-_lj4[ij_type]); + + if (rsq > cut_lj_innersq) + { + evdwl_tmp += tlj*tlj*tlj* + (_ljsw3[ij_type] + _ljsw4[ij_type]*tlj) + _ljsw5[ij_type];; + } + + evdwl+=evdwl_tmp*factor_lj; + } + return factor_lj*forcelj*r2inv; +} diff --git a/lib/cuda/pair_lj_smooth_cuda.cu b/lib/cuda/pair_lj_smooth_cuda.cu new file mode 100644 index 0000000000..5723ffc94c --- /dev/null +++ b/lib/cuda/pair_lj_smooth_cuda.cu @@ -0,0 +1,83 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ + +#include <stdio.h> + +#define _lj1 MY_AP(coeff1_gm) +#define _lj2 MY_AP(coeff2_gm) +#define _lj3 MY_AP(coeff3_gm) +#define _lj4 MY_AP(coeff4_gm) +#define _ljsw1 MY_AP(coeff5_gm) +#define _ljsw2 MY_AP(coeff6_gm) +#define _ljsw3 MY_AP(coeff7_gm) +#define _ljsw4 MY_AP(coeff8_gm) +#define _ljsw0 MY_AP(coeff9_gm) + +#include "pair_lj_smooth_cuda_cu.h" +#include "pair_lj_smooth_cuda_kernel_nc.cu" + +#include <time.h> + +void Cuda_PairLJSmoothCuda_Init(cuda_shared_data* sdata) +{ + Cuda_Pair_Init_AllStyles<PAIR_LJ_SMOOTH,COUL_NONE,DATA_NONE>(sdata, 9,false,true,true); +} + + + +void Cuda_PairLJSmoothCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, + int eflag_atom,int vflag_atom) +{ + // initialize only on first call + static short init=0; + if(! init) + { + init = 1; + Cuda_PairLJSmoothCuda_Init(sdata); + } + + dim3 grid,threads; + int sharedperproc; + + Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc,false,192); + + cudaStream_t* streams = (cudaStream_t*) CudaWrapper_returnStreams(); + if(sdata->pair.use_block_per_atom) + Pair_Kernel_BpA<PAIR_LJ_SMOOTH,COUL_NONE,DATA_NONE> + <<<grid, threads,sharedperproc*sizeof(ENERGY_FLOAT)*threads.x,streams[1]>>> (eflag, vflag,eflag_atom,vflag_atom); + else + Pair_Kernel_TpA<PAIR_LJ_SMOOTH,COUL_NONE,DATA_NONE> + <<<grid, threads,sharedperproc*sizeof(ENERGY_FLOAT)*threads.x,streams[1]>>> (eflag, vflag,eflag_atom,vflag_atom); + + Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag); +} + +#undef _lj1 +#undef _lj2 +#undef _lj3 +#undef _lj4 +#undef _ljsw1 +#undef _ljsw2 +#undef _ljsw3 +#undef _ljsw4 +#undef _ljsw0 diff --git a/lib/cuda/pair_lj_smooth_cuda_cu.h b/lib/cuda/pair_lj_smooth_cuda_cu.h new file mode 100644 index 0000000000..504cf19f98 --- /dev/null +++ b/lib/cuda/pair_lj_smooth_cuda_cu.h @@ -0,0 +1,26 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS 
directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#include "cuda_shared.h" + +extern "C" void Cuda_PairLJSmoothCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,int eflag_atom,int vflag_atom); diff --git a/lib/cuda/pair_lj_smooth_cuda_kernel_nc.cu b/lib/cuda/pair_lj_smooth_cuda_kernel_nc.cu new file mode 100644 index 0000000000..bcac8bf88a --- /dev/null +++ b/lib/cuda/pair_lj_smooth_cuda_kernel_nc.cu @@ -0,0 +1,66 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ + +__device__ inline F_FLOAT PairLJSmoothCuda_Eval(const F_FLOAT& rsq,const int ij_type,F_FLOAT& factor_lj,int& eflag, ENERGY_FLOAT& evdwl) +{ + F_FLOAT fskin,t,tsq,forcelj; + const F_FLOAT r2inv = F_F(1.0)/rsq; + const F_FLOAT r = _RSQRT_(r2inv); + const F_FLOAT r6inv = r2inv*r2inv*r2inv; + + + X_FLOAT cut_lj_innersq=(_cut_innersq_global > X_F(0.0)? _cut_innersq_global : _cut_innersq[ij_type]); + if (rsq < cut_lj_innersq) + { + forcelj = r6inv * (_lj1[ij_type]*r6inv - _lj2[ij_type]); + } + else + { + t = r - _SQRT_(cut_lj_innersq); + tsq = t*t; + fskin = _ljsw1[ij_type] + _ljsw2[ij_type]*t + + _ljsw3[ij_type]*tsq + _ljsw4[ij_type]*tsq*t; + forcelj = fskin*r; + + } + + if (eflag) + { + ENERGY_FLOAT evdwl_tmp; + + if (rsq < cut_lj_innersq) + { + evdwl_tmp = r6inv*(_lj3[ij_type]*r6inv-_lj4[ij_type]) - + _offset[ij_type]; + } + else + { + evdwl_tmp = _ljsw0[ij_type] - _ljsw1[ij_type]*t - + _ljsw2[ij_type]*tsq*F_F(0.5) - _ljsw3[ij_type]*tsq*t*(F_F(1.0)/F_F(3.0)) - + _ljsw4[ij_type]*tsq*tsq*(F_F(1.0)/F_F(4.0)) - _offset[ij_type]; + } + + evdwl+=evdwl_tmp*factor_lj; + } + return factor_lj*forcelj * r2inv; +} diff --git a/lib/cuda/pair_morse_coul_long_cuda.cu b/lib/cuda/pair_morse_coul_long_cuda.cu new file mode 100644 index 0000000000..cb226b58f4 --- /dev/null +++ b/lib/cuda/pair_morse_coul_long_cuda.cu @@ -0,0 +1,79 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. 
+ + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#include <stdio.h> + +#define _r0 MY_AP(coeff1) +#define _alpha MY_AP(coeff2) +#define _morse1 MY_AP(coeff3) +#define _d0 MY_AP(coeff4) +#define _c0 MY_AP(coeff5) + +#include "pair_morse_coul_long_cuda_cu.h" +#include "pair_morse_coul_long_cuda_kernel_nc.cu" + +#include <time.h> + +void Cuda_PairMorseCoulLongCuda_Init(cuda_shared_data* sdata) +{ + Cuda_Pair_Init_AllStyles<PAIR_MORSE,COUL_LONG,DATA_NONE>(sdata, 5,true); +} + +void Cuda_PairMorseCoulLongCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,int eflag_atom,int vflag_atom) +{ + + + static short init=0; + if(! 
init) + { + init = 1; + Cuda_PairMorseCoulLongCuda_Init(sdata); + } + + dim3 grid,threads; + int sharedperproc; + + + Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc,true,192); + + cudaStream_t* streams = (cudaStream_t*) CudaWrapper_returnStreams(); + + if(sdata->pair.use_block_per_atom) + Pair_Kernel_BpA + <<>> (eflag, vflag,eflag_atom,vflag_atom); + else + Pair_Kernel_TpA + <<>> (eflag, vflag,eflag_atom,vflag_atom); + + Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag); +} + + + +/* Undo the per-style coefficient aliases #defined at the top of this file + (_r0/_alpha/_morse1/_d0/_c0 -> MY_AP(coeff1..5)). The previous list + (_rhoinv/_sigma/_a/_c/_d) was copied from the born/buck style and did not + match the macros actually defined here, leaving them defined past this file. */ +#undef _r0 +#undef _alpha +#undef _morse1 +#undef _d0 +#undef _c0 + diff --git a/lib/cuda/pair_morse_coul_long_cuda_cu.h b/lib/cuda/pair_morse_coul_long_cuda_cu.h new file mode 100644 index 0000000000..63055289f4 --- /dev/null +++ b/lib/cuda/pair_morse_coul_long_cuda_cu.h @@ -0,0 +1,30 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ + +#include "cuda_shared.h" + +#ifdef CUDA_USE_BINNING +extern "C" void Cuda_PairMorseCoulLongCuda(cuda_shared_data* sdata, int eflag, int vflag); +#else +extern "C" void Cuda_PairMorseCoulLongCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,int eflag_atom,int vflag_atom); +#endif diff --git a/lib/cuda/pair_morse_coul_long_cuda_kernel_nc.cu b/lib/cuda/pair_morse_coul_long_cuda_kernel_nc.cu new file mode 100644 index 0000000000..b367914a78 --- /dev/null +++ b/lib/cuda/pair_morse_coul_long_cuda_kernel_nc.cu @@ -0,0 +1,33 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ +__device__ inline F_FLOAT PairMorseR6Cuda_Eval(const F_FLOAT& rsq,const int ij_type,F_FLOAT& factor_lj,int& eflag, ENERGY_FLOAT& evdwl) +{ + const F_FLOAT r2inv = F_F(1.0)/rsq; + const F_FLOAT r = _SQRT_(rsq); + const F_FLOAT r4inv = r2inv*r2inv; + const F_FLOAT dr = r-_r0[ij_type]; + const F_FLOAT dexp = _EXP_(-_alpha[ij_type]*dr); + if(eflag) evdwl += factor_lj*(_d0[ij_type]*(dexp*dexp-F_F(2.0)*dexp) + _c0[ij_type]*r4inv*r4inv*r4inv + - _offset[ij_type]); + return factor_lj*(_morse1[ij_type]*(dexp*dexp-dexp)*(F_F(1.0)/r)- F_F(12.0)*_c0[ij_type]*r4inv*r4inv*r4inv*r2inv); +} diff --git a/lib/cuda/pair_morse_cuda.cu b/lib/cuda/pair_morse_cuda.cu new file mode 100644 index 0000000000..d33ac842d3 --- /dev/null +++ b/lib/cuda/pair_morse_cuda.cu @@ -0,0 +1,77 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ + +#include + +#define _r0 MY_AP(coeff1) +#define _alpha MY_AP(coeff2) +#define _morse1 MY_AP(coeff3) +#define _d0 MY_AP(coeff4) + +#include "pair_morse_cuda_cu.h" +#include "pair_morse_cuda_kernel_nc.cu" +#include + + + +void Cuda_PairMorseCuda_Init(cuda_shared_data* sdata) +{ + Cuda_Pair_Init_AllStyles(sdata, 4); +} + + + + +void Cuda_PairMorseCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,int eflag_atom,int vflag_atom) +{ + + // initialize only on first call + static short init=0; + if(! init) + { + init = 1; + Cuda_PairMorseCuda_Init(sdata); + } + + dim3 grid,threads; + int sharedperproc; + + Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc,false,256); + + cudaStream_t* streams = (cudaStream_t*) CudaWrapper_returnStreams(); + if(sdata->pair.use_block_per_atom) + Pair_Kernel_BpA + <<>> (eflag, vflag,eflag_atom,vflag_atom); + else + Pair_Kernel_TpA + <<>> (eflag, vflag,eflag_atom,vflag_atom); + + Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag); +} + +#undef _r0 +#undef _alpha +#undef _morse1 +#undef _d0 + + diff --git a/lib/cuda/pair_morse_cuda_cu.h b/lib/cuda/pair_morse_cuda_cu.h new file mode 100644 index 0000000000..2cfe350458 --- /dev/null +++ b/lib/cuda/pair_morse_cuda_cu.h @@ -0,0 +1,26 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. 
+ + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#include "cuda_shared.h" + +extern "C" void Cuda_PairMorseCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,int eflag_atom,int vflag_atom); diff --git a/lib/cuda/pair_morse_cuda_kernel_nc.cu b/lib/cuda/pair_morse_cuda_kernel_nc.cu new file mode 100644 index 0000000000..ead1c54fb2 --- /dev/null +++ b/lib/cuda/pair_morse_cuda_kernel_nc.cu @@ -0,0 +1,32 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ +/* Evaluate the Morse pair interaction for a single i-j pair. + rsq: squared pair distance; ij_type: index into the per-type-pair + coefficient arrays (_r0 = coeff1, _alpha = coeff2, _morse1 = coeff3, + _d0 = coeff4 -- aliased in pair_morse_cuda.cu). factor_lj is the + special-bonds scale factor. When eflag is set, the scaled pair energy + is accumulated into evdwl. + NOTE(review): the return value appears to be the scalar force divided + by r (fpair), ready for component-wise dx*fpair accumulation by the + Pair_Kernel_TpA/BpA callers -- confirm against those kernels. */ +__device__ inline F_FLOAT PairMorseCuda_Eval(const F_FLOAT& rsq,const int ij_type,F_FLOAT& factor_lj,int& eflag, ENERGY_FLOAT& evdwl) +{ + const F_FLOAT r = _SQRT_(rsq); /* pair distance |r_ij| */ + const F_FLOAT dr = r-_r0[ij_type]; /* displacement from equilibrium distance r0 */ + const F_FLOAT dexp = _EXP_(-_alpha[ij_type]*dr); /* exp(-alpha*(r-r0)) */ + /* Morse energy: d0*(e^2 - 2e) shifted by the cutoff offset */ + if(eflag) evdwl += factor_lj*(_d0[ij_type]*(dexp*dexp-F_F(2.0)*dexp) + - _offset[ij_type]); + return factor_lj*_morse1[ij_type]*(dexp*dexp-dexp)*(F_F(1.0)/r); +} + diff --git a/lib/cuda/pair_virial_compute_cu.h b/lib/cuda/pair_virial_compute_cu.h new file mode 100644 index 0000000000..fdd2cecb8c --- /dev/null +++ b/lib/cuda/pair_virial_compute_cu.h @@ -0,0 +1,26 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ + +#include "cuda_shared.h" + +extern "C" void Cuda_PairVirialCompute(cuda_shared_data* sdata, int offset, int end); diff --git a/lib/cuda/pppm_cuda.cu b/lib/cuda/pppm_cuda.cu new file mode 100644 index 0000000000..cabea885d3 --- /dev/null +++ b/lib/cuda/pppm_cuda.cu @@ -0,0 +1,579 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#include "cuda_precision.h" +//#define FFT_CUFFT +#define MY_PREFIX pppm +#include "cuda_shared.h" +#include "cuda_common.h" +#include "pppm_cuda_cu.h" +#include "cuda_runtime.h" +#include + +//#include "crm_cuda_utils.cu" +#define MIN(a,b) ((a) < (b) ? (a) : (b)) +#define MAX(a,b) ((a) > (b) ? 
(a) : (b)) + + __device__ __constant__ FFT_FLOAT* work1; + __device__ __constant__ FFT_FLOAT* work2; + __device__ __constant__ FFT_FLOAT* work3; + __device__ __constant__ PPPM_FLOAT* greensfn; + __device__ __constant__ PPPM_FLOAT* gf_b; + __device__ __constant__ PPPM_FLOAT* fkx; + __device__ __constant__ PPPM_FLOAT* fky; + __device__ __constant__ PPPM_FLOAT* fkz; + __device__ __constant__ PPPM_FLOAT* vg; + __device__ __constant__ int* part2grid; + __device__ __constant__ PPPM_FLOAT* density_brick; + __device__ __constant__ int* density_brick_int; + __device__ __constant__ PPPM_FLOAT density_intScale; + __device__ __constant__ PPPM_FLOAT* vdx_brick; + __device__ __constant__ PPPM_FLOAT* vdy_brick; + __device__ __constant__ PPPM_FLOAT* vdz_brick; + __device__ __constant__ PPPM_FLOAT* density_fft; + __device__ __constant__ ENERGY_FLOAT* energy; + __device__ __constant__ ENERGY_FLOAT* virial; + __device__ __constant__ int nxlo_in; + __device__ __constant__ int nxhi_in; + __device__ __constant__ int nxlo_out; + __device__ __constant__ int nxhi_out; + __device__ __constant__ int nylo_in; + __device__ __constant__ int nyhi_in; + __device__ __constant__ int nylo_out; + __device__ __constant__ int nyhi_out; + __device__ __constant__ int nzlo_in; + __device__ __constant__ int nzhi_in; + __device__ __constant__ int nzlo_out; + __device__ __constant__ int nzhi_out; + __device__ __constant__ int nxlo_fft; + __device__ __constant__ int nxhi_fft; + __device__ __constant__ int nylo_fft; + __device__ __constant__ int nyhi_fft; + __device__ __constant__ int nzlo_fft; + __device__ __constant__ int nzhi_fft; + __device__ __constant__ int nx_pppm; + __device__ __constant__ int ny_pppm; + __device__ __constant__ int nz_pppm; + __device__ __constant__ int slabflag; + __device__ __constant__ PPPM_FLOAT qqrd2e; + __device__ __constant__ int order; + //__device__ __constant__ float3 sublo; + __device__ __constant__ PPPM_FLOAT* rho_coeff; + __device__ __constant__ int nmax; + __device__ 
__constant__ int nlocal; + __device__ __constant__ PPPM_FLOAT* debugdata; + __device__ __constant__ PPPM_FLOAT delxinv; + __device__ __constant__ PPPM_FLOAT delyinv; + __device__ __constant__ PPPM_FLOAT delzinv; + __device__ __constant__ int nlower; + __device__ __constant__ int nupper; + __device__ __constant__ PPPM_FLOAT shiftone; + + +#include "pppm_cuda_kernel.cu" +#include "stdio.h" +void pppm_device_init(void* cu_density_brick, void* cu_vdx_brick, void* cu_vdy_brick, void* cu_vdz_brick, void* cu_density_fft, void* cu_energy, void* cu_virial + ,void* cu_work1,void* cu_work2, void* cu_work3,void* cu_greensfn, void* cu_fkx, void* cu_fky, void* cu_fkz, void* cu_vg + ,int cu_nxlo_in,int cu_nxhi_in,int cu_nylo_in,int cu_nyhi_in,int cu_nzlo_in,int cu_nzhi_in,int cu_nxlo_out,int cu_nxhi_out,int cu_nylo_out,int cu_nyhi_out,int cu_nzlo_out,int cu_nzhi_out,int cu_nx_pppm,int cu_ny_pppm,int cu_nz_pppm + ,int cu_nxlo_fft,int cu_nxhi_fft,int cu_nylo_fft,int cu_nyhi_fft,int cu_nzlo_fft,int cu_nzhi_fft,void* cu_gf_b + ,double cu_qqrd2e, int cu_order, void* cu_rho_coeff,void* cu_debugdata,void* cu_density_brick_int,int cu_slabflag + ) +{ + CUT_CHECK_ERROR("ERROR-CUDA poisson_init Start"); + cudaMemcpyToSymbol("density_brick",&cu_density_brick, sizeof(PPPM_FLOAT*)); + cudaMemcpyToSymbol("density_brick_int",&cu_density_brick_int, sizeof(PPPM_FLOAT*)); + cudaMemcpyToSymbol("vdx_brick",&cu_vdx_brick, sizeof(PPPM_FLOAT*)); + cudaMemcpyToSymbol("vdy_brick",&cu_vdy_brick, sizeof(PPPM_FLOAT*)); + cudaMemcpyToSymbol("vdz_brick",&cu_vdz_brick, sizeof(PPPM_FLOAT*)); + cudaMemcpyToSymbol("density_fft",&cu_density_fft, sizeof(PPPM_FLOAT*)); + cudaMemcpyToSymbol("energy",&cu_energy, sizeof(ENERGY_FLOAT*)); + cudaMemcpyToSymbol("virial",&cu_virial, sizeof(ENERGY_FLOAT*)); + cudaMemcpyToSymbol("nxlo_in",&cu_nxlo_in, sizeof(int)); + cudaMemcpyToSymbol("nxhi_in",&cu_nxhi_in, sizeof(int)); + cudaMemcpyToSymbol("nxlo_out",&cu_nxlo_out, sizeof(int)); + cudaMemcpyToSymbol("nxhi_out",&cu_nxhi_out, 
sizeof(int)); + cudaMemcpyToSymbol("nylo_in",&cu_nylo_in, sizeof(int)); + cudaMemcpyToSymbol("nyhi_in",&cu_nyhi_in, sizeof(int)); + cudaMemcpyToSymbol("nylo_out",&cu_nylo_out, sizeof(int)); + cudaMemcpyToSymbol("nyhi_out",&cu_nyhi_out, sizeof(int)); + cudaMemcpyToSymbol("nzlo_in",&cu_nzlo_in, sizeof(int)); + cudaMemcpyToSymbol("nzhi_in",&cu_nzhi_in, sizeof(int)); + cudaMemcpyToSymbol("nzlo_out",&cu_nzlo_out, sizeof(int)); + cudaMemcpyToSymbol("nzhi_out",&cu_nzhi_out, sizeof(int)); + cudaMemcpyToSymbol("nxlo_fft",&cu_nxlo_fft, sizeof(int)); + cudaMemcpyToSymbol("nxhi_fft",&cu_nxhi_fft, sizeof(int)); + cudaMemcpyToSymbol("nylo_fft",&cu_nylo_fft, sizeof(int)); + cudaMemcpyToSymbol("nyhi_fft",&cu_nyhi_fft, sizeof(int)); + cudaMemcpyToSymbol("nzlo_fft",&cu_nzlo_fft, sizeof(int)); + cudaMemcpyToSymbol("nzhi_fft",&cu_nzhi_fft, sizeof(int)); + cudaMemcpyToSymbol("slabflag",&cu_slabflag, sizeof(int)); + cudaMemcpyToSymbol("nx_pppm",&cu_nx_pppm, sizeof(int)); + cudaMemcpyToSymbol("ny_pppm",&cu_ny_pppm, sizeof(int)); + cudaMemcpyToSymbol("nz_pppm",&cu_nz_pppm, sizeof(int)); + cudaMemcpyToSymbol("work1",&cu_work1, sizeof(FFT_FLOAT*)); + cudaMemcpyToSymbol("work2",&cu_work2, sizeof(FFT_FLOAT*)); + cudaMemcpyToSymbol("work3",&cu_work3, sizeof(FFT_FLOAT*)); + cudaMemcpyToSymbol("greensfn",&cu_greensfn, sizeof(PPPM_FLOAT*)); + cudaMemcpyToSymbol("gf_b",&cu_gf_b, sizeof(PPPM_FLOAT*)); + cudaMemcpyToSymbol("fkx",&cu_fkx, sizeof(PPPM_FLOAT*)); + cudaMemcpyToSymbol("fky",&cu_fky, sizeof(PPPM_FLOAT*)); + cudaMemcpyToSymbol("fkz",&cu_fkz, sizeof(PPPM_FLOAT*)); + cudaMemcpyToSymbol("vg",&cu_vg, sizeof(PPPM_FLOAT*)); + + PPPM_FLOAT cu_qqrd2e_a=cu_qqrd2e; + cudaMemcpyToSymbol("qqrd2e",&cu_qqrd2e_a, sizeof(PPPM_FLOAT)); + cudaMemcpyToSymbol("order",&cu_order, sizeof(int)); + cudaMemcpyToSymbol("rho_coeff",&cu_rho_coeff, sizeof(PPPM_FLOAT*)); + cudaMemcpyToSymbol("debugdata",&cu_debugdata, sizeof(PPPM_FLOAT*)); + + CUT_CHECK_ERROR("ERROR-CUDA poisson_init"); + 
+/*if(sizeof(CUDA_FLOAT)==sizeof(float)) printf("PPPMCuda Kernel: Using single precision\n"); + +#ifdef PPPM_PRECISION +if(sizeof(PPPM_FLOAT)==sizeof(float)) printf("PPPMCuda Kernel: Using single precision for pppm core\n"); +if(sizeof(PPPM_FLOAT)==sizeof(double)) printf("PPPMCuda Kernel: Using double precision for pppm core\n"); +#endif +#ifdef ENERGY_PRECISION +if(sizeof(ENERGY_FLOAT)==sizeof(float)) printf("PPPMCuda Kernel: Using single precision for energy\n"); +if(sizeof(ENERGY_FLOAT)==sizeof(double)) printf("PPPMCuda Kernel: Using double precision for energy\n"); +#endif +#ifdef ENERGY_PRECISION +if(sizeof(FFT_FLOAT)==sizeof(float)) printf("PPPMCuda Kernel: Using single precision for fft\n"); +if(sizeof(FFT_FLOAT)==sizeof(double)) printf("PPPMCuda Kernel: Using double precision for fft\n"); +#endif +#ifdef X_PRECISION +if(sizeof(X_FLOAT)==sizeof(float)) printf("PPPMCuda Kernel: Using single precision for positions\n"); +if(sizeof(X_FLOAT)==sizeof(double)) printf("PPPMCuda Kernel: Using double precision for positions\n"); +#endif +#ifdef F_PRECISION +if(sizeof(F_FLOAT)==sizeof(float)) printf("PPPMCuda Kernel: Using single precision for forces\n"); +if(sizeof(F_FLOAT)==sizeof(double)) printf("PPPMCuda Kernel: Using double precision for forces\n"); +#endif*/ +} + +void pppm_device_init_setup(cuda_shared_data* sdata,PPPM_FLOAT cu_shiftone,PPPM_FLOAT cu_delxinv,PPPM_FLOAT cu_delyinv,PPPM_FLOAT cu_delzinv,int cu_nlower,int cu_nupper) +{ + cudaMemcpyToSymbol("delxinv",&cu_delxinv, sizeof(PPPM_FLOAT)); + cudaMemcpyToSymbol("delyinv",&cu_delyinv, sizeof(PPPM_FLOAT)); + cudaMemcpyToSymbol("delzinv",&cu_delzinv, sizeof(PPPM_FLOAT)); + cudaMemcpyToSymbol("shiftone",&cu_shiftone, sizeof(PPPM_FLOAT)); + cudaMemcpyToSymbol("nlower",&cu_nlower, sizeof(int)); + cudaMemcpyToSymbol("nupper",&cu_nupper, sizeof(int)); + cudaMemcpyToSymbol(MY_CONST(sublo) , sdata->domain.sublo, 3*sizeof(X_FLOAT)); + cudaMemcpyToSymbol(MY_CONST(subhi) , sdata->domain.subhi, 3*sizeof(X_FLOAT)); + 
cudaMemcpyToSymbol(MY_CONST(boxlo) , sdata->domain.boxlo, 3*sizeof(X_FLOAT)); + CUT_CHECK_ERROR("ERROR-CUDA pppm_init_setup"); +} + +void pppm_device_update(cuda_shared_data* sdata,void* cu_part2grid, int nlocala,int nmaxa) +{ + cudaMemcpyToSymbol("part2grid",&cu_part2grid, sizeof(int*)); + cudaMemcpyToSymbol(MY_CONST(x) , & sdata->atom.x .dev_data, sizeof(X_FLOAT*)); + cudaMemcpyToSymbol(MY_CONST(f) , & sdata->atom.f .dev_data, sizeof(F_FLOAT*)); + cudaMemcpyToSymbol(MY_CONST(q) , & sdata->atom.q .dev_data, sizeof(F_FLOAT*)); + cudaMemcpyToSymbol(MY_CONST(tag) , & sdata->atom.tag .dev_data, sizeof(int*)); + //cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal .dev_data, sizeof(int)); + cudaMemcpyToSymbol("nlocal" , &nlocala, sizeof(int)); + cudaMemcpyToSymbol("nmax" , &nmaxa, sizeof(int)); + CUT_CHECK_ERROR("ERROR-CUDA pppm_device_update"); + +} + +void pppm_update_nlocal(int nlocala) +{ + cudaMemcpyToSymbol("nlocal" , &nlocala, sizeof(int)); + CUT_CHECK_ERROR("ERROR-CUDA update_nlocal b"); +} + + +void Cuda_PPPM_Setup_fkxyz_vg(int nx_pppma,int ny_pppma,int nz_pppma,PPPM_FLOAT unitkx,PPPM_FLOAT unitky,PPPM_FLOAT unitkz,PPPM_FLOAT g_ewald) +{ + dim3 grid; + dim3 threads; + grid.x=nz_pppma; + grid.y=ny_pppma; + grid.z=1; + threads.x=nx_pppma; + threads.y=1; + threads.z=1; + setup_fkxyz_vg<<>>(unitkx,unitky,unitkz,g_ewald); + cudaThreadSynchronize(); + + CUT_CHECK_ERROR("ERROR-CUDA Cuda_PPPM_Setup_fkxyz_vg "); +} + +void Cuda_PPPM_setup_greensfn(int nx_pppma,int ny_pppma,int nz_pppma,PPPM_FLOAT unitkx,PPPM_FLOAT unitky,PPPM_FLOAT unitkz,PPPM_FLOAT g_ewald, +int nbx,int nby,int nbz,PPPM_FLOAT xprd,PPPM_FLOAT yprd,PPPM_FLOAT zprd_slab) +{ + dim3 grid; + dim3 threads; + grid.x=nz_pppma; + grid.y=ny_pppma; + grid.z=1; + threads.x=nx_pppma; + threads.y=1; + threads.z=1; + setup_greensfn<<>>(unitkx,unitky,unitkz,g_ewald,nbx,nby,nbz,xprd,yprd, zprd_slab); + cudaThreadSynchronize(); + CUT_CHECK_ERROR("ERROR-CUDA Cuda_PPPM_Setup_greensfn "); +} + +void 
poisson_scale(int nx_pppma,int ny_pppma,int nz_pppma) +{ + dim3 grid; + dim3 threads; + grid.x=nz_pppma; + grid.y=ny_pppma; + grid.z=1; + threads.x=nx_pppma; + threads.y=1; + threads.z=1; + poisson_scale_kernel<<>>(); + CUT_CHECK_ERROR("ERROR-CUDA poisson_scale "); + +} + +void poisson_xgrad(int nx_pppma,int ny_pppma,int nz_pppma) +{ + dim3 grid; + dim3 threads; + grid.x=nz_pppma; + grid.y=ny_pppma; + grid.z=1; + threads.x=nx_pppma; + threads.y=1; + threads.z=1; + poisson_xgrad_kernel<<>>(); + CUT_CHECK_ERROR("ERROR-CUDA poisson_xgrad "); +} + +void poisson_ygrad(int nx_pppma,int ny_pppma,int nz_pppma) +{ + dim3 grid; + dim3 threads; + grid.x=nz_pppma; + grid.y=ny_pppma; + grid.z=1; + threads.x=nx_pppma; + threads.y=1; + threads.z=1; + poisson_ygrad_kernel<<>>(); + CUT_CHECK_ERROR("ERROR-CUDA poisson_ygrad "); +} + +void poisson_zgrad(int nx_pppma,int ny_pppma,int nz_pppma) +{ + dim3 grid; + dim3 threads; + grid.x=nz_pppma; + grid.y=ny_pppma; + grid.z=1; + threads.x=nx_pppma; + threads.y=1; + threads.z=1; + poisson_zgrad_kernel<<>>(); + CUT_CHECK_ERROR("ERROR-CUDA poisson_zgrad "); +} + +void poisson_vdx_brick(int ihi,int ilo,int jhi,int jlo,int khi,int klo,int nx_pppma,int ny_pppma,int nz_pppma) +{ + + dim3 grid; + dim3 threads; + grid.x=khi-klo+1; + grid.y=jhi-jlo+1; + grid.z=1; + threads.x=ihi-ilo+1; + threads.y=1; + threads.z=1; + //printf("VDX_BRICK CUDA: %i %i %i\n",grid.x,grid.y,threads.x); + poisson_vdx_brick_kernel<<>>(ilo,jlo,klo); + CUT_CHECK_ERROR("ERROR-CUDA poisson_vdxbrick "); + cudaThreadSynchronize(); +} + +void poisson_vdy_brick(int ihi,int ilo,int jhi,int jlo,int khi,int klo,int nx_pppm,int ny_pppm,int nz_pppm) +{ + dim3 grid; + dim3 threads; + grid.x=khi-klo+1; + grid.y=jhi-jlo+1; + grid.z=1; + threads.x=ihi-ilo+1; + threads.y=1; + threads.z=1; + poisson_vdy_brick_kernel<<>>(ilo,jlo,klo); + CUT_CHECK_ERROR("ERROR-CUDA poisson_vdybrick "); + cudaThreadSynchronize(); +} + +void poisson_vdz_brick(int ihi,int ilo,int jhi,int jlo,int khi,int klo,int 
nx_pppm,int ny_pppm,int nz_pppm) +{ + dim3 grid; + dim3 threads; + grid.x=khi-klo+1; + grid.y=jhi-jlo+1; + grid.z=1; + threads.x=ihi-ilo+1; + threads.y=1; + threads.z=1; + poisson_vdz_brick_kernel<<>>(ilo,jlo,klo); + CUT_CHECK_ERROR("ERROR-CUDA poisson_vdzbrick "); + cudaThreadSynchronize(); +} + + +void poisson_energy(int nxlo_fft,int nxhi_fft,int nylo_fft,int nyhi_fft,int nzlo_fft,int nzhi_fft,int vflag) +{ + //printf("VFLAG_GPU: %i\n",vflag); + CUT_CHECK_ERROR("ERROR-CUDA poisson_energy start "); + dim3 grid; + dim3 threads; + grid.x=nzhi_fft-nzlo_fft+1; + grid.y=nyhi_fft-nylo_fft+1; + grid.z=1; + threads.x=nxhi_fft-nxlo_fft+1; + threads.y=1; + threads.z=1; + poisson_energy_kernel<<>>(nxlo_fft,nylo_fft,nzlo_fft,vflag); + + cudaThreadSynchronize(); + CUT_CHECK_ERROR("ERROR-CUDA poisson_energy end "); +} + +ENERGY_FLOAT sum_energy(void* cu_virial,void* cu_energy,int nx_pppma,int ny_pppma,int nz_pppma,int vflag,ENERGY_FLOAT* cpu_virial) +{ + ENERGY_FLOAT host_energy=0; + dim3 grid; + dim3 threads; + + grid.x=nz_pppma; + grid.y=1; + grid.z=1; + threads.x=ny_pppma; + threads.y=1; + threads.z=1; + sum_energy_kernel1<<>>(vflag); + cudaThreadSynchronize(); + CUT_CHECK_ERROR("ERROR-CUDA sumenergy_kernel1 "); + + grid.x=1; + grid.y=1; + grid.z=1; + threads.x=nz_pppma; + threads.y=1; + threads.z=1; + sum_energy_kernel2<<>>(vflag); + cudaThreadSynchronize(); + CUT_CHECK_ERROR("ERROR-CUDA sumenergy_kernel2 "); + + cudaMemcpy((void*) (&host_energy), cu_energy, sizeof(ENERGY_FLOAT),cudaMemcpyDeviceToHost); + if(vflag) + cudaMemcpy((void*) cpu_virial, (void*) cu_virial, 6*sizeof(ENERGY_FLOAT),cudaMemcpyDeviceToHost); + CUT_CHECK_ERROR("ERROR-CUDA sumenergy_memcopy"); + + return host_energy; +} + +void cuda_make_rho(cuda_shared_data* sdata,void* flag,PPPM_FLOAT* cu_density_intScale,int ihi,int ilo,int jhi,int jlo,int khi,int klo,void* cu_density_brick,void* cu_density_brick_int) +{ + CUT_CHECK_ERROR("cuda_make_rho begin"); + dim3 grid,threads; + int cpu_flag[3]; + 
grid.x=(sdata->atom.nlocal+31)/32; + grid.y=1; + grid.z=1; + threads.x=32; + threads.y=1; + threads.z=1; + int sharedmemsize=(32+32*(sdata->pppm.nupper-sdata->pppm.nlower+1)+sdata->pppm.order*(sdata->pppm.order/2-(1-sdata->pppm.order)/2+1))*sizeof(PPPM_FLOAT); + do + { + cpu_flag[0]=0; + cpu_flag[1]=0; + cpu_flag[2]=0; + /* copy the current fixed-point scale by VALUE: sizeof(PPPM_FLOAT), not + sizeof(PPPM_FLOAT*). The old pointer-size count over-read the 4-byte + host float and over-wrote the 4-byte device symbol on LP64 builds. */ + cudaMemcpyToSymbol("density_intScale",cu_density_intScale,sizeof(PPPM_FLOAT)); + CUT_CHECK_ERROR("ERROR-CUDA make_rho pre Z"); + cudaMemset(flag,0,3*sizeof(int)); + CUT_CHECK_ERROR("ERROR-CUDA make_rho pre A"); + cudaMemset(cu_density_brick,0,(khi-klo+1)*(jhi-jlo+1)*(ihi-ilo+1)*sizeof(PPPM_FLOAT)); + CUT_CHECK_ERROR("ERROR-CUDA make_rho pre B"); + cudaMemset(cu_density_brick_int,0,(khi-klo+1)*(jhi-jlo+1)*(ihi-ilo+1)*sizeof(int)); + CUT_CHECK_ERROR("ERROR-CUDA make_rho pre C"); + make_rho_kernel<<>>((int*) flag,32/(sdata->pppm.nupper-sdata->pppm.nlower+1)); + cudaThreadSynchronize(); + CUT_CHECK_ERROR("ERROR-CUDA make_rho A"); + cudaMemcpy((void*) &cpu_flag, flag, 3*sizeof(int),cudaMemcpyDeviceToHost); + /* flag[0]: fixed-point overflow -> halve the scale and retry; + flag[0]==0 && flag[1]==0: headroom available -> double the scale and retry */ + if(cpu_flag[0]!=0) {(*cu_density_intScale)/=2; MYDBG(printf("PPPM_Cuda::cuda_make_rho: Decrease cu_density_intScale to: %e\n",*cu_density_intScale);)} + if((cpu_flag[0]==0)&&(cpu_flag[1]==0)) {(*cu_density_intScale)*=2; MYDBG(printf("PPPM_Cuda::cuda_make_rho: Increase cu_density_intScale to: %e\n",*cu_density_intScale);)} + /* if((*cu_density_intScale)>0xe0000000) + { + printf("Error Scaling\n"); + cpu_flag[0]=0; + cpu_flag[1]=1; + }*/ + CUT_CHECK_ERROR("ERROR-CUDA make_rho B"); + } while((cpu_flag[0]!=0)||(cpu_flag[1]==0)); + + + grid.x=khi-klo+1; + grid.y=jhi-jlo+1; + threads.x=ihi-ilo+1; + scale_rho_kernel<<>>(); + cudaThreadSynchronize(); + CUT_CHECK_ERROR("ERROR-CUDA make_rho_scale"); +} + + +int cuda_particle_map(cuda_shared_data* sdata,void* flag) +{ + dim3 grid,threads; + int cpu_flag; + grid.x=(sdata->atom.nlocal+31)/32; + grid.y=1; + grid.z=1; + threads.x=32; + threads.y=1; + threads.z=1; + CUT_CHECK_ERROR("ERROR-CUDA particla_map 
..pre"); + particle_map_kernel<<>>((int*) flag); + cudaThreadSynchronize(); + CUT_CHECK_ERROR("ERROR-CUDA particla_map a"); + cudaMemcpy((void*) &cpu_flag, flag, sizeof(int),cudaMemcpyDeviceToHost); + CUT_CHECK_ERROR("ERROR-CUDA particla_map b"); + return cpu_flag; +} + + +void cuda_fieldforce(cuda_shared_data* sdata,void* flag) +{ + dim3 grid,threads; + grid.x=(sdata->atom.nlocal+31)/32; + grid.y=1; + grid.z=1; + threads.x=32; + threads.y=1; + threads.z=1; + int sharedmemsize=(32+3*32*(sdata->pppm.nupper-sdata->pppm.nlower+1)+sdata->pppm.order*(sdata->pppm.order/2-(1-sdata->pppm.order)/2+1))*sizeof(PPPM_FLOAT); + fieldforce_kernel<<>> + (sdata->pppm.nupper-sdata->pppm.nlower+1,32/(sdata->pppm.nupper-sdata->pppm.nlower+1),(int*) flag); + cudaThreadSynchronize(); + CUT_CHECK_ERROR("ERROR-CUDA fieldforce"); +} + +double cuda_slabcorr_energy(cuda_shared_data* sdata, ENERGY_FLOAT* buf, ENERGY_FLOAT* dev_buf) +{ + dim3 grid,threads; + grid.x=(sdata->atom.nlocal+31)/32; + grid.y=1; + grid.z=1; + threads.x=32; + threads.y=1; + threads.z=1; + slabcorr_energy_kernel<<>>(dev_buf); + cudaThreadSynchronize(); + cudaMemcpy((void*) buf, dev_buf, grid.x*sizeof(ENERGY_FLOAT),cudaMemcpyDeviceToHost); + + double dipole_all=0.0; + for(int i=0;iatom.nlocal+31)/32; + grid.y=1; + grid.z=1; + threads.x=32; + threads.y=1; + threads.z=1; + slabcorr_force_kernel<<>>(ffact); + cudaThreadSynchronize(); +} + +void sum_virial(double* host_virial) +{ +} + +void pppm_initfftdata(cuda_shared_data* sdata,PPPM_FLOAT* in,FFT_FLOAT* out) +{ + int nslow=sdata->pppm.nzhi_in-sdata->pppm.nzlo_in; + int nmid=sdata->pppm.nyhi_in-sdata->pppm.nylo_in; + int nfast=sdata->pppm.nxhi_in-sdata->pppm.nxlo_in; + int nrimz=MAX(sdata->pppm.nzlo_in-sdata->pppm.nzlo_out,sdata->pppm.nzhi_out-sdata->pppm.nzhi_in); + int nrimy=MAX(sdata->pppm.nylo_in-sdata->pppm.nylo_out,sdata->pppm.nyhi_out-sdata->pppm.nyhi_in); + int nrimx=MAX(sdata->pppm.nxlo_in-sdata->pppm.nxlo_out,sdata->pppm.nxhi_out-sdata->pppm.nxhi_in); + dim3 
grid; + grid.x=nslow+1; + grid.y=nmid+1; + grid.z=1; + dim3 threads; + threads.x=nfast+1; + threads.y=1; + threads.z=1; + cudaThreadSynchronize(); + initfftdata_core_kernel<<>>(in,out); + cudaThreadSynchronize(); + grid.x=nrimz; + grid.y=nmid+1; + threads.x=nfast+1; + initfftdata_z_kernel<<>>(in,out); + cudaThreadSynchronize(); + grid.x=nslow+1; + grid.y=nrimy; + threads.x=nfast+1; + initfftdata_y_kernel<<>>(in,out); + cudaThreadSynchronize(); + grid.x=nslow+1; + grid.y=nmid+1; + threads.x=nrimx; + initfftdata_x_kernel<<>>(in,out); + cudaThreadSynchronize(); + grid.x=nrimz; + grid.y=nrimy; + threads.x=nfast+1; + initfftdata_yz_kernel<<>>(in,out); + cudaThreadSynchronize(); + grid.x=nrimz; + grid.y=nmid+1; + threads.x=nrimx; + initfftdata_xz_kernel<<>>(in,out); + cudaThreadSynchronize(); + grid.x=nslow+1; + grid.y=nrimy; + threads.x=nrimx; + initfftdata_xy_kernel<<>>(in,out); + cudaThreadSynchronize(); + grid.x=nrimz; + grid.y=nrimy; + threads.x=nrimx; + initfftdata_xyz_kernel<<>>(in,out); + cudaThreadSynchronize(); + CUT_CHECK_ERROR("ERROR-CUDA initfftdata_kernel"); +} + + diff --git a/lib/cuda/pppm_cuda_cu.h b/lib/cuda/pppm_cuda_cu.h new file mode 100644 index 0000000000..b594715b7c --- /dev/null +++ b/lib/cuda/pppm_cuda_cu.h @@ -0,0 +1,55 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. 
+ + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#ifndef PPPM_CUDA_CU_H_ +#define PPPM_CUDA_CU_H_ + +extern "C" void pppm_device_init(void* cu_density_brick, void* cu_vdx_brick, void* cu_vdy_brick, void* cu_vdz_brick, void* cu_density_fft, void* cu_energy, void* cu_virial + ,void* cu_work1,void* cu_work2, void* cu_work3,void* cu_greensfn, void* cu_fkx, void* cu_fky, void* cu_fkz, void* cu_vg + ,int nxlo_in,int nxhi_in,int nylo_in,int nyhi_in,int nzlo_in,int nzhi_in,int nxlo_out,int nxhi_out,int nylo_out,int nyhi_out,int nzlo_out,int nzhi_out, int nx_pppm,int ny_pppm,int nz_pppm + ,int cu_nxlo_fft,int cu_nxhi_fft,int cu_nylo_fft,int cu_nyhi_fft,int cu_nzlo_fft,int cu_nzhi_fft,void* cu_gf_b + ,double cu_qqrd2e, int cu_order,void* cu_rho_coeff,void* cu_debugdata,void* cu_density_brick_lock,int slabflag + ); +extern "C" void pppm_device_init_setup(cuda_shared_data* sdata,PPPM_FLOAT shiftone,PPPM_FLOAT delxinv,PPPM_FLOAT delyinv,PPPM_FLOAT delzinv,int nlower,int nupper); +extern "C" void Cuda_PPPM_Setup_fkxyz_vg(int nx_pppma,int ny_pppma,int nz_pppma,PPPM_FLOAT unitkx,PPPM_FLOAT unitky,PPPM_FLOAT unitkz,PPPM_FLOAT g_ewald); +extern "C" void Cuda_PPPM_setup_greensfn(int nx_pppma,int ny_pppma,int nz_pppma,PPPM_FLOAT unitkx,PPPM_FLOAT unitky,PPPM_FLOAT unitkz,PPPM_FLOAT g_ewald, + int nbx,int nby,int nbz,PPPM_FLOAT xprd,PPPM_FLOAT yprd,PPPM_FLOAT zprd_slab); + +extern "C" void pppm_device_update(cuda_shared_data* sdata,void* cu_part2grid, int nlocala,int nmaxa); +extern "C" void pppm_update_nlocal(int nlocala); +extern "C" void poisson_scale(int nx_pppm,int ny_pppm,int nz_pppm); +extern "C" void poisson_xgrad(int nx_pppm,int ny_pppm,int nz_pppm); +extern "C" void poisson_ygrad(int nx_pppm,int ny_pppm,int nz_pppm); +extern "C" void poisson_zgrad(int nx_pppm,int ny_pppm,int nz_pppm); +extern "C" void poisson_vdx_brick(int ihi,int ilo,int jhi,int jlo,int khi,int klo,int 
nx_pppm,int ny_pppm,int nz_pppm); +extern "C" void poisson_vdy_brick(int ihi,int ilo,int jhi,int jlo,int khi,int klo,int nx_pppm,int ny_pppm,int nz_pppm); +extern "C" void poisson_vdz_brick(int ihi,int ilo,int jhi,int jlo,int khi,int klo,int nx_pppm,int ny_pppm,int nz_pppm); +extern "C" void poisson_energy(int nxlo_fft,int nxhi_fft,int nylo_fft,int nyhi_fft,int nzlo_fft,int nzhi_fft,int vflag); +extern "C" ENERGY_FLOAT sum_energy(void* cu_virial,void* cu_energy,int nx_pppma,int ny_pppma,int nz_pppma,int vflag,ENERGY_FLOAT* cpu_virial); +extern "C" int cuda_particle_map(cuda_shared_data* sdata,void* flag); +extern "C" void cuda_make_rho(cuda_shared_data* sdata,void* flag,PPPM_FLOAT* cu_density_intScale,int ihi,int ilo,int jhi,int jlo,int khi,int klo,void* cu_density_brick,void* cu_density_brick_int); +extern "C" void cuda_fieldforce(cuda_shared_data* sdata,void* flag); +extern "C" double cuda_slabcorr_energy(cuda_shared_data* sdata, ENERGY_FLOAT* buf, ENERGY_FLOAT* dev_buf); +extern "C" void cuda_slabcorr_force(cuda_shared_data* sdata, F_FLOAT ffact); +extern "C" void pppm_initfftdata(cuda_shared_data* sdata,PPPM_FLOAT* in,FFT_FLOAT* out); +#endif /*PPPM_CUDA_CU_H_*/ diff --git a/lib/cuda/pppm_cuda_kernel.cu b/lib/cuda/pppm_cuda_kernel.cu new file mode 100644 index 0000000000..808c98fe39 --- /dev/null +++ b/lib/cuda/pppm_cuda_kernel.cu @@ -0,0 +1,816 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. 
+ + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#define OFFSET 4096 +__device__ int negativCUDA(float f) +{ + return ((unsigned int)1<<31&(__float_as_int(f)))>>31; +} + +__device__ void reduceBlock(float* data) +{ + int p2=1; + while(p2*2= nzlo_fft)&&(blockIdx.x <=nzhi_fft)&& + (blockIdx.y >= nylo_fft)&&(blockIdx.y <=nyhi_fft)&& + (threadIdx.x>= nxlo_fft)&&(threadIdx.x<=nxhi_fft)) + { + int n=((int(blockIdx.x)-nzlo_fft)*(nyhi_fft-nylo_fft+1)+int(blockIdx.y)-nylo_fft)*(nxhi_fft-nxlo_fft+1)+int(threadIdx.x)-nxlo_fft; + PPPM_FLOAT sqk = my_fkx*my_fkx + my_fky*my_fky + my_fkz*my_fkz; + PPPM_FLOAT vterm = (sqk==PPPM_F(0.0)) ? PPPM_F(0.0) : PPPM_F(-2.0) * (PPPM_F(1.0)/sqk + PPPM_F(0.25)/(g_ewald*g_ewald)); + vg[6*n+0] = (sqk == PPPM_F(0.0)) ? PPPM_F(0.0) : PPPM_F(1.0) + vterm * my_fkx*my_fkx; + vg[6*n+1] = (sqk == PPPM_F(0.0)) ? PPPM_F(0.0) : PPPM_F(1.0) + vterm * my_fky*my_fky; + vg[6*n+2] = (sqk == PPPM_F(0.0)) ? PPPM_F(0.0) : PPPM_F(1.0) + vterm * my_fkz*my_fkz; + vg[6*n+3] = (sqk == PPPM_F(0.0)) ? PPPM_F(0.0) : vterm * my_fkx*my_fky; + vg[6*n+4] = (sqk == PPPM_F(0.0)) ? PPPM_F(0.0) : vterm * my_fkx*my_fkz; + vg[6*n+5] = (sqk == PPPM_F(0.0)) ? 
PPPM_F(0.0) : vterm * my_fky*my_fkz; + + } +} + +__device__ PPPM_FLOAT gf_denom(PPPM_FLOAT x, PPPM_FLOAT y, PPPM_FLOAT z) +{ + PPPM_FLOAT sx,sy,sz; + sz = sy = sx = PPPM_F(0.0); + for (int l = order-1; l >= 0; l--) { + sx = gf_b[l] + sx*x; + sy = gf_b[l] + sy*y; + sz = gf_b[l] + sz*z; + } + PPPM_FLOAT s = sx*sy*sz; + return s*s; +} + +__global__ void setup_greensfn(PPPM_FLOAT unitkx,PPPM_FLOAT unitky,PPPM_FLOAT unitkz,PPPM_FLOAT g_ewald, +int nbx,int nby,int nbz, +PPPM_FLOAT xprd,PPPM_FLOAT yprd,PPPM_FLOAT zprd_slab) +{ + PPPM_FLOAT sqk; + int nx,ny,nz,kper,lper,mper,k,l,m; + PPPM_FLOAT snx,sny,snz,snx2,sny2,snz2; + PPPM_FLOAT argx,argy,argz,wx,wy,wz,sx,sy,sz,qx,qy,qz; + PPPM_FLOAT sum1,dot1,dot2; + PPPM_FLOAT numerator,denominator; + + PPPM_FLOAT form=PPPM_F(1.0); + int n=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x; + m=blockIdx.x; + l=blockIdx.y; + k=threadIdx.x; + + mper = m - nz_pppm*(2*m/nz_pppm); + snz = sin(PPPM_F(0.5)*unitkz*mper*zprd_slab/nz_pppm); + snz2 = snz*snz; + + + lper = l - ny_pppm*(2*l/ny_pppm); + sny = sin(PPPM_F(0.5)*unitky*lper*yprd/ny_pppm); + sny2 = sny*sny; + + kper = k - nx_pppm*(2*k/nx_pppm); + snx = sin(PPPM_F(0.5)*unitkx*kper*xprd/nx_pppm); + snx2 = snx*snx; + + sqk = pow(unitkx*kper,PPPM_F(2.0)) + pow(unitky*lper,PPPM_F(2.0)) + + pow(unitkz*mper,PPPM_F(2.0)); + + if (sqk != PPPM_F(0.0)) { + numerator = form*PPPM_F(12.5663706)/sqk; + denominator = gf_denom(snx2,sny2,snz2); + sum1 = PPPM_F(0.0); + for (nx = -nbx; nx <= nbx; nx++) { + qx = unitkx*(kper+nx_pppm*nx); + sx = exp(PPPM_F(-.25)*pow(qx/g_ewald,PPPM_F(2.0))); + wx = PPPM_F(1.0); + argx = PPPM_F(0.5)*qx*xprd/nx_pppm; + if (argx != PPPM_F(0.0)) wx = pow(sin(argx)/argx,order); + for (ny = -nby; ny <= nby; ny++) { + qy = unitky*(lper+ny_pppm*ny); + sy = exp(PPPM_F(-.25)*pow(qy/g_ewald,PPPM_F(2.0))); + wy = PPPM_F(1.0); + argy = PPPM_F(0.5)*qy*yprd/ny_pppm; + if (argy != PPPM_F(0.0)) wy = pow(sin(argy)/argy,order); + for (nz = -nbz; nz <= nbz; nz++) { + qz = 
unitkz*(mper+nz_pppm*nz); + sz = exp(PPPM_F(-.25)*pow(qz/g_ewald,PPPM_F(2.0))); + wz = PPPM_F(1.0); + argz = PPPM_F(0.5)*qz*zprd_slab/nz_pppm; + if (argz != PPPM_F(0.0)) wz = pow(sin(argz)/argz,order); + + dot1 = unitkx*kper*qx + unitky*lper*qy + unitkz*mper*qz; + dot2 = qx*qx+qy*qy+qz*qz; + sum1 += (dot1/dot2) * sx*sy*sz * pow(wx*wy*wz,PPPM_F(2.0)); + } + } + } + greensfn[n] = numerator*sum1/denominator; + } else greensfn[n] = PPPM_F(0.0); +} + +__global__ void poisson_scale_kernel() +{ + int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x; + FFT_FLOAT scaleinv=FFT_F(1.0)/(gridDim.x*gridDim.y*blockDim.x); + work1[2*i] *= scaleinv * greensfn[i]; + work1[2*i+1] *= scaleinv * greensfn[i]; +} + +__global__ void poisson_xgrad_kernel() +{ + int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x; + work2[2*i] = fkx[threadIdx.x] * work1[2*i+1]; + work2[2*i+1] = -fkx[threadIdx.x] * work1[2*i]; +} + +__global__ void poisson_ygrad_kernel() +{ + int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x; + work2[2*i] = fky[blockIdx.y] * work1[2*i+1]; + work2[2*i+1] = -fky[blockIdx.y] * work1[2*i]; +} + +__global__ void poisson_zgrad_kernel() +{ + int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x; + work2[2*i] = fkz[blockIdx.x] * work1[2*i+1]; + work2[2*i+1] = -fkz[blockIdx.x] * work1[2*i]; +} + +__global__ void poisson_vdx_brick_kernel(int ilo,int jlo,int klo) +{ + int k=blockIdx.x+klo; + k+=nz_pppm*negativCUDA(CUDA_F(1.0)*k)-nz_pppm*negativCUDA(CUDA_F(1.0)*(nz_pppm-k-1)); + int j=blockIdx.y+jlo; + j+=ny_pppm*negativCUDA(CUDA_F(1.0)*j)-ny_pppm*negativCUDA(CUDA_F(1.0)*(ny_pppm-j-1)); + int i=threadIdx.x+ilo; + i+=nx_pppm*negativCUDA(CUDA_F(1.0)*i)-nx_pppm*negativCUDA(CUDA_F(1.0)*(nx_pppm-i-1)); + vdx_brick[((blockIdx.x)*(nyhi_out-nylo_out+1)+blockIdx.y)*(nxhi_out-nxlo_out+1)+threadIdx.x] = work3[2*(((k)*ny_pppm+(j))*nx_pppm+i)]; +} + +__global__ void poisson_vdy_brick_kernel(int ilo,int jlo,int klo) +{ + int k=blockIdx.x+klo; + 
k+=nz_pppm*negativCUDA(CUDA_F(1.0)*k)-nz_pppm*negativCUDA(CUDA_F(1.0)*(nz_pppm-k-1)); + int j=blockIdx.y+jlo; + j+=ny_pppm*negativCUDA(CUDA_F(1.0)*j)-ny_pppm*negativCUDA(CUDA_F(1.0)*(ny_pppm-j-1)); + int i=threadIdx.x+ilo; + i+=nx_pppm*negativCUDA(CUDA_F(1.0)*i)-nx_pppm*negativCUDA(CUDA_F(1.0)*(nx_pppm-i-1)); + vdy_brick[((blockIdx.x)*(nyhi_out-nylo_out+1)+blockIdx.y)*(nxhi_out-nxlo_out+1)+threadIdx.x] = work3[2*(((k)*ny_pppm+(j))*nx_pppm+i)]; +} + +__global__ void poisson_vdz_brick_kernel(int ilo,int jlo,int klo) +{ + int k=blockIdx.x+klo; + k+=nz_pppm*negativCUDA(CUDA_F(1.0)*k)-nz_pppm*negativCUDA(CUDA_F(1.0)*(nz_pppm-k-1)); + int j=blockIdx.y+jlo; + j+=ny_pppm*negativCUDA(CUDA_F(1.0)*j)-ny_pppm*negativCUDA(CUDA_F(1.0)*(ny_pppm-j-1)); + int i=threadIdx.x+ilo; + i+=nx_pppm*negativCUDA(CUDA_F(1.0)*i)-nx_pppm*negativCUDA(CUDA_F(1.0)*(nx_pppm-i-1)); + vdz_brick[((blockIdx.x)*(nyhi_out-nylo_out+1)+blockIdx.y)*(nxhi_out-nxlo_out+1)+threadIdx.x] = work3[2*(((k)*ny_pppm+(j))*nx_pppm+i)]; +} + +__global__ void poisson_energy_kernel(int nxlo_fft,int nylo_fft,int nzlo_fft,int vflag) +{ + ENERGY_FLOAT scaleinv=FFT_F(1.0)/(nx_pppm*ny_pppm*nz_pppm); + int i=(blockIdx.x+nzlo_fft)*ny_pppm*nx_pppm+(blockIdx.y+nylo_fft)*nx_pppm+threadIdx.x+nxlo_fft; + ENERGY_FLOAT* s_energy=(ENERGY_FLOAT*) sharedmem; + ENERGY_FLOAT myenergy= scaleinv*scaleinv * greensfn[i] * (work1[2*i]*work1[2*i] + work1[2*i+1]*work1[2*i+1]); + s_energy[threadIdx.x]=myenergy; + + __syncthreads(); + reduceBlock(s_energy); + if(threadIdx.x==0) + energy[blockIdx.x*ny_pppm+blockIdx.y]=s_energy[0]; + if(vflag) + { + __syncthreads(); + for (int j = 0; j < 6; j++) + { + s_energy[threadIdx.x]= myenergy*vg[((blockIdx.x*gridDim.y+blockIdx.y)*(blockDim.x)+threadIdx.x)*6+j]; + __syncthreads(); + reduceBlock(s_energy); + if(threadIdx.x==0) + virial[blockIdx.x*ny_pppm+blockIdx.y+j*nz_pppm*ny_pppm]=s_energy[0]; + } + } +} + + +__global__ void sum_energy_kernel1(int vflag) +{ + ENERGY_FLOAT 
myenergy=energy[(blockIdx.x*ny_pppm+threadIdx.x)]; + ENERGY_FLOAT* s_energy=(ENERGY_FLOAT*) sharedmem; + s_energy[threadIdx.x]=myenergy; + __syncthreads(); + reduceBlock(s_energy); + if(threadIdx.x==0) + energy[blockIdx.x*ny_pppm]=s_energy[0]; + if(vflag) + { + __syncthreads(); + for (int j = 0; j < 6; j++) + { + myenergy=virial[blockIdx.x*ny_pppm+threadIdx.x+j*ny_pppm*nz_pppm]; + s_energy[threadIdx.x]=myenergy; + __syncthreads(); + reduceBlock(s_energy); + if(threadIdx.x==0) + virial[blockIdx.x*ny_pppm+j*ny_pppm*nz_pppm]=s_energy[0]; + } + } + +} + +__global__ void sum_energy_kernel2(int vflag) +{ + ENERGY_FLOAT myenergy=energy[threadIdx.x*ny_pppm]; + ENERGY_FLOAT* s_energy=(ENERGY_FLOAT*) sharedmem; + s_energy[threadIdx.x]=myenergy; + __syncthreads(); + reduceBlock(s_energy); + if(threadIdx.x==0) + energy[0]=s_energy[0]; + if(vflag) + { + __syncthreads(); + for (int j = 0; j < 6; j++) + { + myenergy=virial[threadIdx.x*ny_pppm+j*ny_pppm*nz_pppm]; + s_energy[threadIdx.x]=myenergy; + __syncthreads(); + reduceBlock(s_energy); + if(threadIdx.x==0) + virial[j]=s_energy[0]; + } + } +} + +__device__ PPPM_FLOAT rho1d(int k,PPPM_FLOAT d,PPPM_FLOAT* srho_coeff) +{ + PPPM_FLOAT rho1d_tmp=PPPM_F(0.0); + for (int l = order-1; l >= 0; l--) + rho1d_tmp = srho_coeff[l*order+k-(1-order)/2] + rho1d_tmp*d; + return rho1d_tmp; +} + +__global__ void particle_map_kernel(int* flag) +{ + int i=blockIdx.x*blockDim.x+threadIdx.x; + if(i nxhi_out || + ny+nlower < nylo_out || ny+nupper > nyhi_out || + nz+nlower < nzlo_out || nz+nupper > nzhi_out) + {flag[0]++; + debugdata[0]=i; + debugdata[1]=_boxlo[0]; + debugdata[2]=_boxlo[1]; + debugdata[3]=_boxlo[2]; + debugdata[4]=nx; + debugdata[5]=ny; + debugdata[6]=nz; + debugdata[7]=_x[i]; + debugdata[8]=_x[i+_nmax]; + debugdata[9]=_x[i+2*_nmax]; + debugdata[10]=nlocal; + + } + } +} + +__global__ void make_rho_kernelA() +{ + int i,l,m,n,nx,ny,nz,mx,my,mz; + + // clear 3d density array + + + // loop over my charges, add their contribution to nearby 
grid points + // (nx,ny,nz) = global coords of grid pt to "lower left" of charge + // (dx,dy,dz) = distance to "lower left" grid pt + // (mx,my,mz) = global coords of moving stencil pt + + i=blockIdx.x*blockDim.x+threadIdx.x; + + if(i < nlocal) { + + PPPM_FLOAT dx,dy,dz,x0,y0,z0; + nx = part2grid[i]; + ny = part2grid[i+nmax]; + nz = part2grid[i+2*nmax]; + dx = nx+shiftone - (_x[i]-_boxlo[0])*delxinv; + dy = ny+shiftone - (_x[i+nmax]-_boxlo[1])*delyinv; + dz = nz+shiftone - (_x[i+2*nmax]-_boxlo[2])*delzinv; + + z0 = delxinv*delyinv*delzinv * _q[i]; + for (n = nlower; n <= nupper; n++) + { + mz = n+nz; + y0 = z0*rho1d(n,dz,rho_coeff); + for (m = nlower; m <= nupper; m++) + { + my = m+ny; + x0 = y0*rho1d(m,dy,rho_coeff); + for (l = nlower; l <= nupper; l++) + { + mx = l+nx; + int mzyx=((mz-nzlo_out)*(nyhi_out-nylo_out+1)+my-nylo_out)*(nxhi_out-nxlo_out+1)+mx-nxlo_out; + while(atomicAdd(&density_brick_int[mzyx],1)!=0) atomicAdd(&density_brick_int[mzyx],-1); + density_brick[mzyx]+=x0*rho1d(l,dx,rho_coeff); + __threadfence(); + atomicAdd(&density_brick_int[mzyx],-1); + __syncthreads(); + + } + } + } + } +} + +__global__ void make_rho_kernel(int* flag,int read_threads_at_same_time) +{ + int i,l,m,n,nx,ny,nz,mx,my,mz,a,b; + + // clear 3d density array + + + // loop over my charges, add their contribution to nearby grid points + // (nx,ny,nz) = global coords of grid pt to "lower left" of charge + // (dx,dy,dz) = distance to "lower left" grid pt + // (mx,my,mz) = global coords of moving stencil pt + // int nzxy=blockIdx.x*gridDim.y+blockIdx.y; + + int nelements=nupper-nlower+1; + int* idx=(int*) sharedmem; + int* sdensity_brick_int=&idx[blockDim.x]; + PPPM_FLOAT* srho_coeff=(PPPM_FLOAT*) &sdensity_brick_int[nelements*blockDim.x]; + if(threadIdx.x-1)) + { + a=sdensity_brick_int[ii*nelements+threadIdx.x]; + //if(a*a>1e-100) + b=(atomicAdd(&density_brick_int[idx[ii+kk]+threadIdx.x-kk*nelements],a)|a); + //else + //b=(density_brick_int[idx[ii+kk]+threadIdx.x-kk*nelements]|a); + 
if(((b)&(0x7c000000))&&(not((b)&(0x80000000)))) + { + flag[1]++; + if((b)&(0x60000000)) flag[0]++; + } + } + } + __syncthreads(); //*/ + } + } + + } +} + +__global__ void scale_rho_kernel() +{ + int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x; + density_brick[i]=(1.0/density_intScale)*density_brick_int[i]; +} + +__global__ void fieldforce_kernel(int elements_per_thread,int read_threads_at_same_time,int* flag) //20*x64 0.36 +{ + int i; + + // loop over my charges, interpolate electric field from nearby grid points + // (nx,ny,nz) = global coords of grid pt to "lower left" of charge + // (dx,dy,dz) = distance to "lower left" grid pt + // (mx,my,mz) = global coords of moving stencil pt + // ek = 3 components of E-field on particle + i=blockIdx.x*blockDim.x+threadIdx.x; + int* idx=(int*) sharedmem; + PPPM_FLOAT* tmp_brick=(PPPM_FLOAT*) &idx[blockDim.x]; + PPPM_FLOAT* srho_coeff=(PPPM_FLOAT*) &tmp_brick[3*blockDim.x*elements_per_thread]; + if(threadIdx.x-1)) + { + tmp_brick[ii*elements_per_thread+threadIdx.x]=vdx_brick[idx[ii+kk]+threadIdx.x-kk*elements_per_thread]; + tmp_brick[(ii+blockDim.x)*elements_per_thread+threadIdx.x]=vdy_brick[idx[ii+kk]+threadIdx.x-kk*elements_per_thread]; + tmp_brick[(ii+2*blockDim.x)*elements_per_thread+threadIdx.x]=vdz_brick[idx[ii+kk]+threadIdx.x-kk*elements_per_thread]; + } + } + __syncthreads(); + + if(i