From 5a8719e38f33a86e730325e53706a74200e61ac1 Mon Sep 17 00:00:00 2001
From: sjplimp <sjplimp@f3b2605a-c512-4ea7-a41b-209d697bcdaa>
Date: Thu, 26 May 2011 22:16:44 +0000
Subject: [PATCH] git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@6219
 f3b2605a-c512-4ea7-a41b-209d697bcdaa

---
 src/USER-CUDA/Install.sh          |   74 ++
 src/USER-CUDA/comm_cuda.cpp       | 1431 +++++++++++++++++++++++++++++
 src/USER-CUDA/comm_cuda.h         |   69 ++
 src/USER-CUDA/cuda.cpp            |  837 +++++++++++++++++
 src/USER-CUDA/cuda.h              |  153 +++
 src/USER-CUDA/cuda_common.h       |  344 +++++++
 src/USER-CUDA/cuda_data.h         |  796 ++++++++++++++++
 src/USER-CUDA/cuda_modify_flags.h |   39 +
 src/USER-CUDA/cuda_neigh_list.cpp |  180 ++++
 src/USER-CUDA/cuda_neigh_list.h   |   83 ++
 src/USER-CUDA/cuda_precision.h    |  269 ++++++
 src/USER-CUDA/cuda_shared.h       |  378 ++++++++
 src/USER-CUDA/domain_cuda.cpp     |  270 ++++++
 src/USER-CUDA/domain_cuda.h       |   41 +
 src/USER-CUDA/modify_cuda.cpp     |  442 +++++++++
 src/USER-CUDA/modify_cuda.h       |   82 ++
 src/USER-CUDA/neigh_full_cuda.cpp |  317 +++++++
 src/USER-CUDA/neighbor_cuda.cpp   |  221 +++++
 src/USER-CUDA/neighbor_cuda.h     |   39 +
 src/USER-CUDA/verlet_cuda.cpp     | 1103 ++++++++++++++++++++++
 src/USER-CUDA/verlet_cuda.h       |   63 ++
 21 files changed, 7231 insertions(+)
 create mode 100755 src/USER-CUDA/Install.sh
 create mode 100644 src/USER-CUDA/comm_cuda.cpp
 create mode 100644 src/USER-CUDA/comm_cuda.h
 create mode 100644 src/USER-CUDA/cuda.cpp
 create mode 100644 src/USER-CUDA/cuda.h
 create mode 100644 src/USER-CUDA/cuda_common.h
 create mode 100644 src/USER-CUDA/cuda_data.h
 create mode 100644 src/USER-CUDA/cuda_modify_flags.h
 create mode 100644 src/USER-CUDA/cuda_neigh_list.cpp
 create mode 100644 src/USER-CUDA/cuda_neigh_list.h
 create mode 100644 src/USER-CUDA/cuda_precision.h
 create mode 100644 src/USER-CUDA/cuda_shared.h
 create mode 100644 src/USER-CUDA/domain_cuda.cpp
 create mode 100644 src/USER-CUDA/domain_cuda.h
 create mode 100644 src/USER-CUDA/modify_cuda.cpp
 create mode 100644 src/USER-CUDA/modify_cuda.h
 create mode 100644 src/USER-CUDA/neigh_full_cuda.cpp
 create mode 100644 src/USER-CUDA/neighbor_cuda.cpp
 create mode 100644 src/USER-CUDA/neighbor_cuda.h
 create mode 100644 src/USER-CUDA/verlet_cuda.cpp
 create mode 100644 src/USER-CUDA/verlet_cuda.h

diff --git a/src/USER-CUDA/Install.sh b/src/USER-CUDA/Install.sh
new file mode 100755
index 0000000000..c11e58b4f6
--- /dev/null
+++ b/src/USER-CUDA/Install.sh
@@ -0,0 +1,74 @@
+# Install/unInstall package files in LAMMPS
+# edit Makefile.package to include/exclude CUDA library
+# do not copy child files if parent does not exist
+
+if (test $1 = 1) then
+
+  if (test -e ../Makefile.package) then
+      sed -i -e '/include ..\/..\/lib\/cuda\/Makefile.common/d' ../Makefile.package
+      sed -i -e 's/-llammpscuda -lcuda -lcudart -lrt //' ../Makefile.package
+      sed -i -e 's/-I..\/..\/lib\/cuda -I$(CUDA_INSTALL_PATH)\/include //' ../Makefile.package
+      sed -i -e 's/-L..\/..\/lib\/cuda -L$(CUDA_INSTALL_PATH)\/lib64 -L$(CUDA_INSTALL_PATH)\/lib $(USRLIB_CONDITIONAL) -DLMP_USER_CUDA //' ../Makefile.package
+      sed -i '1 i include ..\/..\/lib\/cuda\/Makefile.common' ../Makefile.package
+      sed -i -e 's|^PKG_INC =[ \t]*|&-I..\/..\/lib\/cuda -I$(CUDA_INSTALL_PATH)\/include |' ../Makefile.package
+      sed -i -e 's|^PKG_PATH =[ \t]*|&-L..\/..\/lib\/cuda -L$(CUDA_INSTALL_PATH)\/lib64 -L$(CUDA_INSTALL_PATH)\/lib $(USRLIB_CONDITIONAL) |' ../Makefile.package
+      sed -i -e 's|^PKG_LIB =[ \t]*|&-llammpscuda -lcuda -lcudart -lrt |' ../Makefile.package
+  fi
+
+  cp comm_cuda.cpp ..
+  cp domain_cuda.cpp ..
+  cp modify_cuda.cpp ..
+  cp neighbor_cuda.cpp ..
+  cp neigh_full_cuda.cpp ..
+  cp verlet_cuda.cpp ..
+
+  cp cuda.cpp ..
+  cp cuda_neigh_list.cpp ..
+
+  cp comm_cuda.h ..
+  cp domain_cuda.h ..
+  cp modify_cuda.h ..
+  cp neighbor_cuda.h ..
+  cp verlet_cuda.h ..
+
+  cp cuda.h ..
+  cp cuda_common.h ..
+  cp cuda_data.h ..
+  cp cuda_modify_flags.h ..
+  cp cuda_neigh_list.h ..
+  cp cuda_precision.h ..
+  cp cuda_shared.h ..
+
+elif (test $1 = 0) then
+
+  if (test -e ../Makefile.package) then
+    sed -i -e '/include ..\/..\/lib\/cuda\/Makefile.common/d' ../Makefile.package
+    sed -i -e 's/-llammpscuda -lcuda -lcudart -lrt //' ../Makefile.package
+    sed -i -e 's/-I..\/..\/lib\/cuda -I$(CUDA_INSTALL_PATH)\/include //' ../Makefile.package
+    sed -i -e 's/-L..\/..\/lib\/cuda -L$(CUDA_INSTALL_PATH)\/lib64 -L$(CUDA_INSTALL_PATH)\/lib $(USRLIB_CONDITIONAL) -DLMP_USER_CUDA //' ../Makefile.package
+  fi
+
+  rm ../comm_cuda.cpp
+  rm ../domain_cuda.cpp
+  rm ../modify_cuda.cpp
+  rm ../neighbor_cuda.cpp
+  rm ../neigh_full_cuda.cpp
+  rm ../verlet_cuda.cpp
+
+  rm ../cuda.cpp
+  rm ../cuda_neigh_list.cpp
+
+  rm ../comm_cuda.h
+  rm ../domain_cuda.h
+  rm ../modify_cuda.h
+  rm ../neighbor_cuda.h
+  rm ../verlet_cuda.h
+
+  rm ../cuda.h
+  rm ../cuda_common.h
+  rm ../cuda_data.h
+  rm ../cuda_modify_flags.h
+  rm ../cuda_neigh_list.h
+  rm ../cuda_precision.h
+  rm ../cuda_shared.h
+fi
diff --git a/src/USER-CUDA/comm_cuda.cpp b/src/USER-CUDA/comm_cuda.cpp
new file mode 100644
index 0000000000..6f90227112
--- /dev/null
+++ b/src/USER-CUDA/comm_cuda.cpp
@@ -0,0 +1,1431 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under 
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author (triclinic) : Pieter in 't Veld (SNL)
+------------------------------------------------------------------------- */
+
+#include "mpi.h"
+#include <cmath>
+#include <cstring>
+#include <cstdio>
+#include <cstdlib>
+#include "comm_cuda.h"
+#include "atom.h"
+#include "atom_vec.h"
+#include "force.h"
+#include "pair.h"
+#include "domain.h"
+#include "neighbor.h"
+#include "modify.h"
+#include "fix.h"
+#include "group.h"
+#include "compute.h"
+#include "cuda.h"
+#include "error.h"
+#include "memory.h"
+#include "comm_cuda_cu.h"
+
+using namespace LAMMPS_NS;
+
+#define BUFFACTOR 1.5
+#define BUFMIN 1000
+#define BUFEXTRA 1000
+
+#define MIN(a,b) ((a) < (b) ? (a) : (b))
+#define MAX(a,b) ((a) > (b) ? (a) : (b))
+#define BIG 1.0e20
+
+enum{SINGLE,MULTI};
+
+/* ----------------------------------------------------------------------
+   setup MPI and allocate buffer space 
+------------------------------------------------------------------------- */
+
+CommCuda::CommCuda(LAMMPS *lmp):Comm(lmp) 
+{
+  cuda = lmp->cuda;
+
+  cu_pbc=NULL;
+  cu_slablo=NULL;
+  cu_slabhi=NULL;
+  cu_multilo=NULL;
+  cu_multihi=NULL;
+  cu_sendlist=NULL;
+
+
+  memory->sfree(buf_send);
+  memory->sfree(buf_recv);
+  buf_send = NULL;
+  buf_recv = NULL;
+
+  Comm::free_swap();
+  allocate_swap(maxswap);
+}
+
+/* ---------------------------------------------------------------------- */
+
+CommCuda::~CommCuda()
+{
+  delete cu_sendlist;
+  if(cuda->pinned)
+  {
+    CudaWrapper_FreePinnedHostData((void*)buf_send);
+    CudaWrapper_FreePinnedHostData((void*)buf_recv);
+  }
+  else
+  {
+    memory->sfree(buf_send);
+    memory->sfree(buf_recv);
+  }
+  buf_send=NULL;
+  buf_recv=NULL;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void CommCuda::init()
+{
+  int factor = 1;
+  if(cuda->shared_data.overlap_comm) factor=maxswap;
+  if(not buf_send)
+  grow_send(maxsend,0);
+  if(not buf_recv)
+  grow_recv(maxrecv);
+  if(not cu_sendlist)
+  {
+    cu_sendlist=new cCudaData<int, int, xy> ((int*)sendlist,maxswap,BUFMIN);
+    cuda->shared_data.comm.sendlist.dev_data=cu_sendlist->dev_data();
+    cuda->shared_data.comm.maxswap=maxswap;
+    cuda->shared_data.comm.maxlistlength=BUFMIN;
+    cu_sendlist->upload();
+  }
+  delete cu_pbc;
+  cu_pbc=new cCudaData<int, int, xy> ((int*)pbc,cuda->shared_data.comm.maxswap,6);
+  cu_pbc->upload();
+
+  delete cu_slablo;
+  cu_slablo = new cCudaData<double, X_FLOAT,x>(slablo,cuda->shared_data.comm.maxswap);
+  cu_slablo->upload();
+
+  delete cu_slabhi;
+  cu_slabhi = new cCudaData<double, X_FLOAT,x>(slabhi,cuda->shared_data.comm.maxswap);
+  cu_slabhi->upload();
+  
+  cuda->shared_data.comm.pbc.dev_data=cu_pbc->dev_data();
+  cuda->shared_data.comm.slablo.dev_data=cu_slablo->dev_data();
+  cuda->shared_data.comm.slabhi.dev_data=cu_slabhi->dev_data();
+ 
+  Comm::init();
+}
+
+/* ----------------------------------------------------------------------
+   setup spatial-decomposition communication patterns
+   function of neighbor cutoff(s) & cutghostuser & current box size
+   single style sets slab boundaries (slablo,slabhi) based on max cutoff
+   multi style sets type-dependent slab boundaries (multilo,multihi)
+------------------------------------------------------------------------- */
+
+void CommCuda::setup()
+{
+  Comm::setup();
+   
+  //upload changed geometry to device
+    if(style == SINGLE)
+    {
+    	if(cu_slablo) cu_slablo->upload();
+    	if(cu_slabhi) cu_slabhi->upload();
+    }
+	else
+    {
+    	if(cu_multilo) cu_multilo->upload();
+    	if(cu_multihi) cu_multihi->upload();
+    }
+}
+
+/* ----------------------------------------------------------------------
+   forward communication of atom coords every timestep
+   other per-atom attributes may also be sent via pack/unpack routines
+------------------------------------------------------------------------- */
+
+void CommCuda::forward_comm(int mode)
+{
+  if(mode==0) return forward_comm_cuda();
+  if(mode==1) return forward_comm_pack_cuda();
+  if(mode==2) return forward_comm_transfer_cuda();
+  if(mode==3) return forward_comm_unpack_cuda();
+}
+
+
+void CommCuda::forward_comm_cuda()
+{
+  static int count=0;
+  static double kerneltime=0.0;
+  static double copytime=0.0;
+  timespec time1,time2,time3;
+  
+  int n;
+  MPI_Request request;
+  MPI_Status status;
+  AtomVec *avec = atom->avec;
+  double **x = atom->x;
+
+  cuda->shared_data.domain.xy=domain->xy;
+  cuda->shared_data.domain.xz=domain->xz;
+  cuda->shared_data.domain.yz=domain->yz;
+  cuda->shared_data.domain.prd[0]=domain->prd[0];
+  cuda->shared_data.domain.prd[1]=domain->prd[1];
+  cuda->shared_data.domain.prd[2]=domain->prd[2];
+  cuda->shared_data.domain.triclinic=domain->triclinic;
+  if(not comm_x_only && not avec->cudable) 
+  {
+  	cuda->downloadAll(); 
+    Comm::forward_comm();
+    cuda->uploadAll();
+    return;
+  } 
+  
+  // exchange data with another proc
+  // if other proc is self, just copy
+  // if comm_x_only set, exchange or copy directly to x, don't unpack
+
+  for (int iswap = 0; iswap < nswap; iswap++) {
+    if (sendproc[iswap] != me) 
+    {   
+      if (comm_x_only) 
+      {
+
+        int size_forward_recv_now=0;
+        
+        if((sizeof(X_FLOAT)!=sizeof(double)) && size_forward_recv[iswap]) //some complicated way to safe some transfer size if single precision is used
+          size_forward_recv_now=(size_forward_recv[iswap]+1)*sizeof(X_FLOAT)/sizeof(double);
+        else
+          size_forward_recv_now=size_forward_recv[iswap];
+clock_gettime(CLOCK_REALTIME,&time1);
+        
+        MPI_Irecv(buf_recv,size_forward_recv_now,MPI_DOUBLE,
+                 recvproc[iswap],0,world,&request);
+        n = Cuda_CommCuda_PackComm(&cuda->shared_data,sendnum[iswap],iswap,(void*) buf_send,pbc[iswap],pbc_flag[iswap]);
+		
+clock_gettime(CLOCK_REALTIME,&time2);
+		
+        if((sizeof(X_FLOAT)!=sizeof(double)) && n) //some complicated way to safe some transfer size if single precision is used
+          n=(n+1)*sizeof(X_FLOAT)/sizeof(double);
+
+		//printf("RecvSize: %i SendSize: %i\n",size_forward_recv_now,n);
+	    MPI_Send(buf_send,n,MPI_DOUBLE,sendproc[iswap],0,world);
+	    MPI_Wait(&request,&status);
+       
+clock_gettime(CLOCK_REALTIME,&time3);
+cuda->shared_data.cuda_timings.comm_forward_mpi_upper+=
+      time3.tv_sec-time1.tv_sec+1.0*(time3.tv_nsec-time1.tv_nsec)/1000000000;
+cuda->shared_data.cuda_timings.comm_forward_mpi_lower+=
+      time3.tv_sec-time2.tv_sec+1.0*(time3.tv_nsec-time2.tv_nsec)/1000000000;
+ 
+        Cuda_CommCuda_UnpackComm(&cuda->shared_data,recvnum[iswap],firstrecv[iswap],(void*)buf_recv,iswap); //Unpack for cpu exchange happens implicitely since buf==x[firstrecv]
+ 	
+      } 
+      else if (ghost_velocity) 
+      {
+	    MPI_Irecv(buf_recv,size_forward_recv[iswap],MPI_DOUBLE,
+		  recvproc[iswap],0,world,&request);
+        
+        if(avec->cudable)
+          n = avec->pack_comm_vel(sendnum[iswap],&iswap,
+                           buf_send,pbc_flag[iswap],pbc[iswap]);
+        else
+	      n = avec->pack_comm_vel(sendnum[iswap],sendlist[iswap],
+				buf_send,pbc_flag[iswap],pbc[iswap]);
+	    
+	    MPI_Send(buf_send,n,MPI_DOUBLE,sendproc[iswap],0,world);
+	    MPI_Wait(&request,&status);
+	    avec->unpack_comm_vel(recvnum[iswap],firstrecv[iswap],buf_recv);
+      } 
+      else 
+      {
+	    MPI_Irecv(buf_recv,size_forward_recv[iswap],MPI_DOUBLE,
+		  recvproc[iswap],0,world,&request);
+        
+        if(avec->cudable)
+          n = avec->pack_comm(sendnum[iswap],&iswap,
+                           buf_send,pbc_flag[iswap],pbc[iswap]);
+        else
+	      n = avec->pack_comm(sendnum[iswap],sendlist[iswap],
+			    buf_send,pbc_flag[iswap],pbc[iswap]);
+	
+	    MPI_Send(buf_send,n,MPI_DOUBLE,sendproc[iswap],0,world);
+	    MPI_Wait(&request,&status);
+	    avec->unpack_comm(recvnum[iswap],firstrecv[iswap],buf_recv);
+      }
+
+    }
+    else  //sendproc == me
+    {
+      cuda->self_comm=1;
+      if (comm_x_only) 
+      {     	
+	    if (sendnum[iswap])
+		{
+          n = Cuda_CommCuda_PackComm_Self(&cuda->shared_data,sendnum[iswap],iswap,firstrecv[iswap],pbc[iswap],pbc_flag[iswap]);
+          if(n<0) error->all(" # CUDA ERRROR on PackComm_Self");
+          if((sizeof(X_FLOAT)!=sizeof(double)) && n)
+            n=(n+1)*sizeof(X_FLOAT)/sizeof(double);
+		}
+      } 
+      else if (ghost_velocity) 
+      {
+		n = avec->pack_comm_vel(sendnum[iswap],&iswap,
+				(double*) firstrecv,pbc_flag[iswap],pbc[iswap]);
+	    //avec->unpack_comm_vel(recvnum[iswap],firstrecv[iswap],(double*) firstrecv);
+      } 
+      else 
+      {
+		n = avec->pack_comm(sendnum[iswap],&iswap,
+			    (double*) firstrecv,pbc_flag[iswap],pbc[iswap]);
+		//avec->unpack_comm(recvnum[iswap],firstrecv[iswap],(double*) firstrecv);
+      }
+      cuda->self_comm=0;
+    }
+  }
+}
+
+void CommCuda::forward_comm_pack_cuda()
+{
+	static int count=0;
+	static double kerneltime=0.0;
+	static double copytime=0.0;
+    timespec time1,time2,time3;
+  int n;  // initialize comm buffers & exchange memory
+
+  MPI_Request request;
+  MPI_Status status;
+  AtomVec *avec = atom->avec;
+  double **x = atom->x;
+
+  cuda->shared_data.domain.xy=domain->xy;
+  cuda->shared_data.domain.xz=domain->xz;
+  cuda->shared_data.domain.yz=domain->yz;
+  cuda->shared_data.domain.prd[0]=domain->prd[0];
+  cuda->shared_data.domain.prd[1]=domain->prd[1];
+  cuda->shared_data.domain.prd[2]=domain->prd[2];
+  cuda->shared_data.domain.triclinic=domain->triclinic;
+  if(not comm_x_only && not avec->cudable) cuda->downloadAll();  //if not comm_x_only the communication routine of the atom_vec style class is used 
+
+  // exchange data with another proc
+  // if other proc is self, just copy
+  // if comm_x_only set, exchange or copy directly to x, don't unpack
+
+  for (int iswap = 0; iswap < nswap; iswap++) {
+    if (sendproc[iswap] != me) 
+    {   
+      if (comm_x_only) 
+      {
+
+		
+clock_gettime(CLOCK_REALTIME,&time1);
+        
+      //  n = Cuda_CommCuda_PackComm(&cuda->shared_data,sendnum[iswap],iswap,(void*) cuda->shared_data.comm.buf_send[iswap],pbc[iswap],pbc_flag[iswap]);
+		  n = Cuda_CommCuda_PackComm(&cuda->shared_data,sendnum[iswap],iswap,(void*)buf_send,pbc[iswap],pbc_flag[iswap]);
+		
+clock_gettime(CLOCK_REALTIME,&time2);
+		
+        if((sizeof(X_FLOAT)!=sizeof(double)) && n) //some complicated way to safe some transfer size if single precision is used
+          n=(n+1)*sizeof(X_FLOAT)/sizeof(double);
+		cuda->shared_data.comm.send_size[iswap]=n;
+      } 
+      else if (ghost_velocity) 
+      {
+clock_gettime(CLOCK_REALTIME,&time1);
+
+       // n = Cuda_CommCuda_PackComm_Vel(&cuda->shared_data,sendnum[iswap],iswap,(void*) &buf_send[iswap*maxsend],pbc[iswap],pbc_flag[iswap]);
+		
+clock_gettime(CLOCK_REALTIME,&time2);
+		
+        if((sizeof(X_FLOAT)!=sizeof(double)) && n) //some complicated way to safe some transfer size if single precision is used
+          n=(n+1)*sizeof(X_FLOAT)/sizeof(double);
+		cuda->shared_data.comm.send_size[iswap]=n;
+       } 
+      else 
+      {
+	    MPI_Irecv(buf_recv,size_forward_recv[iswap],MPI_DOUBLE,
+		  recvproc[iswap],0,world,&request);
+        
+        if(avec->cudable)
+          n = avec->pack_comm(sendnum[iswap],&iswap,
+                           cuda->shared_data.comm.buf_send[iswap],pbc_flag[iswap],pbc[iswap]);
+        else
+	      n = avec->pack_comm(sendnum[iswap],sendlist[iswap],
+			    cuda->shared_data.comm.buf_send[iswap],pbc_flag[iswap],pbc[iswap]);
+	
+	    MPI_Send(buf_send,n,MPI_DOUBLE,sendproc[iswap],0,world);
+	    MPI_Wait(&request,&status);
+	    avec->unpack_comm(recvnum[iswap],firstrecv[iswap],buf_recv);
+      }
+
+    }
+    else  //sendproc == me
+    {
+      if (comm_x_only) 
+      {     	
+	    if (sendnum[iswap])
+		{
+          n = Cuda_CommCuda_PackComm_Self(&cuda->shared_data,sendnum[iswap],iswap,firstrecv[iswap],pbc[iswap],pbc_flag[iswap]);
+          if(n<0) error->all(" # CUDA ERRROR on PackComm_Self");
+          if((sizeof(X_FLOAT)!=sizeof(double)) && n)
+            n=(n+1)*sizeof(X_FLOAT)/sizeof(double);
+		}
+      } 
+      else if (ghost_velocity) 
+      {
+		n = avec->pack_comm_vel(sendnum[iswap],sendlist[iswap],
+				buf_send,pbc_flag[iswap],pbc[iswap]);
+	    avec->unpack_comm_vel(recvnum[iswap],firstrecv[iswap],buf_send);
+      } 
+      else 
+      {
+		n = avec->pack_comm(sendnum[iswap],sendlist[iswap],
+			    buf_send,pbc_flag[iswap],pbc[iswap]);
+		avec->unpack_comm(recvnum[iswap],firstrecv[iswap],buf_send);
+      }
+    }
+  }
+  if(not comm_x_only && not avec->cudable) cuda->uploadAll(); 
+}
+
+void CommCuda::forward_comm_transfer_cuda()
+{
+	static int count=0;
+	static double kerneltime=0.0;
+	static double copytime=0.0;
+    timespec time1,time2,time3;
+  int n;
+  MPI_Request request;
+  MPI_Status status;
+  AtomVec *avec = atom->avec;
+  double **x = atom->x;
+  cuda->shared_data.domain.xy=domain->xy;
+  cuda->shared_data.domain.xz=domain->xz;
+  cuda->shared_data.domain.yz=domain->yz;
+  cuda->shared_data.domain.prd[0]=domain->prd[0];
+  cuda->shared_data.domain.prd[1]=domain->prd[1];
+  cuda->shared_data.domain.prd[2]=domain->prd[2];
+  cuda->shared_data.domain.triclinic=domain->triclinic;
+  if(not comm_x_only && not avec->cudable) cuda->downloadAll();  //if not comm_x_only the communication routine of the atom_vec style class is used 
+//printf("A\n");
+  // exchange data with another proc
+  // if other proc is self, just copy
+  // if comm_x_only set, exchange or copy directly to x, don't unpack
+
+  for (int iswap = 0; iswap < nswap; iswap++) {
+    if (sendproc[iswap] != me) 
+    {   
+      if (comm_x_only) 
+      {
+
+        int size_forward_recv_now=0;
+        
+        if((sizeof(X_FLOAT)!=sizeof(double)) && size_forward_recv[iswap]) //some complicated way to safe some transfer size if single precision is used
+          size_forward_recv_now=(size_forward_recv[iswap]+1)*sizeof(X_FLOAT)/sizeof(double);
+        else
+          size_forward_recv_now=size_forward_recv[iswap];
+		
+        //printf("A: %i \n",size_forward_recv_now/1024*4);
+        //MPI_Irecv(cuda->shared_data.comm.buf_recv[iswap],size_forward_recv_now,MPI_DOUBLE,
+        //         recvproc[iswap],0,world,&request);
+        MPI_Irecv(buf_recv,size_forward_recv_now,MPI_DOUBLE,
+                 recvproc[iswap],0,world,&request);
+		//printf("%p %p %i\n",buf_send, cuda->shared_data.comm.buf_send_dev[iswap], cuda->shared_data.comm.send_size[iswap]*sizeof(double));
+        //memcpy(buf_send,cuda->shared_data.comm.buf_send[iswap],cuda->shared_data.comm.send_size[iswap]*sizeof(double));
+	//	CudaWrapper_SyncStream(1);
+        //printf("B: %i \n",cuda->shared_data.comm.send_size[iswap]/1024*4);
+		CudaWrapper_DownloadCudaDataAsync((void*) buf_send, cuda->shared_data.comm.buf_send_dev[iswap], cuda->shared_data.comm.send_size[iswap]*sizeof(double),2);
+	    //MPI_Send(cuda->shared_data.comm.buf_send[iswap],cuda->shared_data.comm.send_size[iswap],MPI_DOUBLE,sendproc[iswap],0,world);
+clock_gettime(CLOCK_REALTIME,&time1);
+        CudaWrapper_SyncStream(2);
+        //printf("C: %i \n",cuda->shared_data.comm.send_size[iswap]/1024*4);
+clock_gettime(CLOCK_REALTIME,&time2);
+cuda->shared_data.cuda_timings.comm_forward_download+=
+      time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000;
+	    MPI_Send(buf_send,cuda->shared_data.comm.send_size[iswap],MPI_DOUBLE,sendproc[iswap],0,world);
+	    MPI_Wait(&request,&status);
+        //printf("D: %i \n",cuda->shared_data.comm.send_size[iswap]/1024*4);
+		CudaWrapper_UploadCudaDataAsync((void*) buf_recv,cuda->shared_data.comm.buf_recv_dev[iswap], size_forward_recv_now*sizeof(double),2);
+clock_gettime(CLOCK_REALTIME,&time1);
+        CudaWrapper_SyncStream(2);
+        //printf("E: %i \n",cuda->shared_data.comm.send_size[iswap]/1024*4);
+        //memcpy(cuda->shared_data.comm.buf_recv[iswap],buf_recv,size_forward_recv_now*sizeof(double));
+ 		//printf("RecvSize: %i SendSize: %i\n",size_forward_recv_now*sizeof(double),cuda->shared_data.comm.send_size[iswap]*sizeof(double));      
+clock_gettime(CLOCK_REALTIME,&time3);
+cuda->shared_data.cuda_timings.comm_forward_upload+=
+      time3.tv_sec-time1.tv_sec+1.0*(time3.tv_nsec-time1.tv_nsec)/1000000000;
+cuda->shared_data.cuda_timings.comm_forward_mpi_lower+=
+      time3.tv_sec-time2.tv_sec+1.0*(time3.tv_nsec-time2.tv_nsec)/1000000000;
+clock_gettime(CLOCK_REALTIME,&time3);
+cuda->shared_data.cuda_timings.comm_forward_mpi_upper+=
+      time3.tv_sec-time1.tv_sec+1.0*(time3.tv_nsec-time1.tv_nsec)/1000000000;
+      } 
+      else if (ghost_velocity) 
+      {
+ /*       int size_forward_recv_now=0;
+        
+        if((sizeof(X_FLOAT)!=sizeof(double)) && size_forward_recv[iswap]) //some complicated way to safe some transfer size if single precision is used
+          size_forward_recv_now=(size_forward_recv[iswap]+1)*sizeof(X_FLOAT)/sizeof(double);
+        else
+          size_forward_recv_now=size_forward_recv[iswap];
+		
+clock_gettime(CLOCK_REALTIME,&time1);
+        
+        MPI_Irecv(cuda->shared_data.comm.buf_recv[iswap],size_forward_recv_now,MPI_DOUBLE,
+                 recvproc[iswap],0,world,&request);
+		
+clock_gettime(CLOCK_REALTIME,&time2);
+		
+	    MPI_Send(cuda->shared_data.comm.buf_send[iswap],cuda->shared_data.comm.send_size[iswap],MPI_DOUBLE,sendproc[iswap],0,world);
+	    MPI_Wait(&request,&status);
+       
+clock_gettime(CLOCK_REALTIME,&time3);
+cuda->shared_data.cuda_timings.comm_forward_mpi_upper+=
+      time3.tv_sec-time1.tv_sec+1.0*(time3.tv_nsec-time1.tv_nsec)/1000000000;
+cuda->shared_data.cuda_timings.comm_forward_mpi_lower+=
+      time3.tv_sec-time2.tv_sec+1.0*(time3.tv_nsec-time2.tv_nsec)/1000000000;*/
+ 
+       } 
+      else 
+      {
+	    MPI_Irecv(buf_recv,size_forward_recv[iswap],MPI_DOUBLE,
+		  recvproc[iswap],0,world,&request);
+        
+        if(avec->cudable)
+          n = avec->pack_comm(sendnum[iswap],&iswap,
+                           buf_send,pbc_flag[iswap],pbc[iswap]);
+        else
+	      n = avec->pack_comm(sendnum[iswap],sendlist[iswap],
+			    buf_send,pbc_flag[iswap],pbc[iswap]);
+	
+	    MPI_Send(buf_send,n,MPI_DOUBLE,sendproc[iswap],0,world);
+	    MPI_Wait(&request,&status);
+	    avec->unpack_comm(recvnum[iswap],firstrecv[iswap],buf_recv);
+      }
+
+    }
+    else  //sendproc == me
+    {
+      if (comm_x_only) 
+      {     	
+	    if (sendnum[iswap])
+		{
+		}
+      } 
+      else if (ghost_velocity) 
+      {
+      } 
+      else 
+      {
+		n = avec->pack_comm(sendnum[iswap],sendlist[iswap],
+			    buf_send,pbc_flag[iswap],pbc[iswap]);
+		avec->unpack_comm(recvnum[iswap],firstrecv[iswap],buf_send);
+      }
+    }
+  }
+  if(not comm_x_only && not avec->cudable) cuda->uploadAll(); 
+}
+
+void CommCuda::forward_comm_unpack_cuda()
+{
+	static int count=0;
+	static double kerneltime=0.0;
+	static double copytime=0.0;
+    timespec time1,time2,time3;
+  int n;
+  MPI_Request request;
+  MPI_Status status;
+  AtomVec *avec = atom->avec;
+  double **x = atom->x;
+
+  cuda->shared_data.domain.xy=domain->xy;
+  cuda->shared_data.domain.xz=domain->xz;
+  cuda->shared_data.domain.yz=domain->yz;
+  cuda->shared_data.domain.prd[0]=domain->prd[0];
+  cuda->shared_data.domain.prd[1]=domain->prd[1];
+  cuda->shared_data.domain.prd[2]=domain->prd[2];
+  cuda->shared_data.domain.triclinic=domain->triclinic;
+  if(not comm_x_only && not avec->cudable) cuda->downloadAll();  //if not comm_x_only the communication routine of the atom_vec style class is used 
+
+  // exchange data with another proc
+  // if other proc is self, just copy
+  // if comm_x_only set, exchange or copy directly to x, don't unpack
+
+  for (int iswap = 0; iswap < nswap; iswap++) {
+    if (sendproc[iswap] != me) 
+    {   
+      if (comm_x_only) 
+      {   
+ 
+        //Cuda_CommCuda_UnpackComm(&cuda->shared_data,recvnum[iswap],firstrecv[iswap],cuda->shared_data.comm.buf_recv[iswap],iswap); //Unpack for cpu exchange happens implicitely since buf==x[firstrecv]
+        Cuda_CommCuda_UnpackComm(&cuda->shared_data,recvnum[iswap],firstrecv[iswap],buf_recv,iswap); //Unpack for cpu exchange happens implicitely since buf==x[firstrecv]
+ 	
+      } 
+      else if (ghost_velocity) 
+      { 
+        //Cuda_CommCuda_UnpackComm_Vel(&cuda->shared_data,recvnum[iswap],firstrecv[iswap],(void*)&buf_recv[iswap*maxrecv]); //Unpack for cpu exchange happens implicitely since buf==x[firstrecv]
+      } 
+      else 
+      {
+	    MPI_Irecv(buf_recv,size_forward_recv[iswap],MPI_DOUBLE,
+		  recvproc[iswap],0,world,&request);
+        
+        if(avec->cudable)
+          n = avec->pack_comm(sendnum[iswap],&iswap,
+                           buf_send,pbc_flag[iswap],pbc[iswap]);
+        else
+	      n = avec->pack_comm(sendnum[iswap],sendlist[iswap],
+			    buf_send,pbc_flag[iswap],pbc[iswap]);
+	
+	    MPI_Send(buf_send,n,MPI_DOUBLE,sendproc[iswap],0,world);
+	    MPI_Wait(&request,&status);
+	    avec->unpack_comm(recvnum[iswap],firstrecv[iswap],buf_recv);
+      }
+
+    }
+    else  //sendproc == me
+    {
+      if (comm_x_only) 
+      {     	
+	    if (sendnum[iswap])
+		{
+		}
+      } 
+      else if (ghost_velocity) 
+      {
+      } 
+      else 
+      {
+		n = avec->pack_comm(sendnum[iswap],sendlist[iswap],
+			    buf_send,pbc_flag[iswap],pbc[iswap]);
+		avec->unpack_comm(recvnum[iswap],firstrecv[iswap],buf_send);
+      }
+    }
+  }
+  if(not comm_x_only && not avec->cudable) cuda->uploadAll(); 
+}
+
+void CommCuda::forward_comm_pair(Pair *pair)
+{
+  if(not cuda->shared_data.pair.cudable_force)
+  {
+  	return Comm::forward_comm_pair(pair);
+  }
+  
+  int iswap,n;
+  double *buf;
+  MPI_Request request;
+  MPI_Status status;
+
+  for (iswap = 0; iswap < nswap; iswap++) {
+
+    // pack buffer
+
+    n = pair->pack_comm(sendnum[iswap],&iswap,
+			buf_send,pbc_flag[iswap],pbc[iswap]);
+	int nrecv = recvnum[iswap]*n;
+	if(nrecv<0) nrecv=-(nrecv+1)/2;
+	int nsend = sendnum[iswap]*n;
+	if(nsend<0) nsend=-(nsend+1)/2;
+	
+    // exchange with another proc
+    // if self, set recv buffer to send buffer
+
+    if (sendproc[iswap] != me) {
+      MPI_Irecv(buf_recv,nrecv,MPI_DOUBLE,recvproc[iswap],0,
+		world,&request);
+      MPI_Send(buf_send,nsend,MPI_DOUBLE,sendproc[iswap],0,world);
+      MPI_Wait(&request,&status);
+      buf = buf_recv;
+    } else buf = buf_send;
+
+    // unpack buffer
+
+    pair->unpack_comm(recvnum[iswap],firstrecv[iswap],buf);
+  }
+}
+
+/* ----------------------------------------------------------------------
+   reverse communication of forces on atoms every timestep 
+   other per-atom attributes may also be sent via pack/unpack routines
+------------------------------------------------------------------------- */
+      
+void CommCuda::reverse_comm()
+{
+  int n;
+  MPI_Request request;
+  MPI_Status status;
+  AtomVec *avec = atom->avec;
+  double **f = atom->f;
+  double *buf;
+
+  if(not comm_f_only && not avec->cudable) cuda->downloadAll();  //not yet implemented in CUDA but only needed for non standard atom styles
+
+  // exchange data with another proc
+  // if other proc is self, just copy
+  // if comm_f_only set, exchange or copy directly from f, don't pack
+
+  for (int iswap = nswap-1; iswap >= 0; iswap--) {
+    if (sendproc[iswap] != me) {
+      if (comm_f_only) {
+
+    int size_recv_now=size_reverse_recv[iswap];
+	if((sizeof(F_FLOAT)!=sizeof(double))&& size_reverse_recv[iswap])
+	  size_recv_now=(size_recv_now+1)*sizeof(F_FLOAT)/sizeof(double);
+	MPI_Irecv(buf_recv,size_recv_now,MPI_DOUBLE,
+		  sendproc[iswap],0,world,&request);
+
+    buf=buf_send;
+    if (size_reverse_send[iswap])
+    {
+      Cuda_CommCuda_PackReverse(&cuda->shared_data,size_reverse_send[iswap]/3,firstrecv[iswap],buf);
+    }
+    else buf=NULL;
+    int size_reverse_send_now=size_reverse_send[iswap];
+	if((sizeof(F_FLOAT)!=sizeof(double))&& size_reverse_send[iswap])
+	  size_reverse_send_now=(size_reverse_send_now+1)*sizeof(F_FLOAT)/sizeof(double);
+	MPI_Send(buf,size_reverse_send_now,MPI_DOUBLE,
+		 recvproc[iswap],0,world);
+	MPI_Wait(&request,&status);
+	Cuda_CommCuda_UnpackReverse(&cuda->shared_data,sendnum[iswap],iswap,buf_recv);
+
+      } else {
+	MPI_Irecv(buf_recv,size_reverse_recv[iswap],MPI_DOUBLE,
+		  sendproc[iswap],0,world,&request);
+	n = avec->pack_reverse(recvnum[iswap],firstrecv[iswap],buf_send);
+	MPI_Send(buf_send,n,MPI_DOUBLE,recvproc[iswap],0,world);
+	MPI_Wait(&request,&status);
+
+      avec->unpack_reverse(sendnum[iswap],sendlist[iswap],buf_recv);
+      }
+
+    } else {
+      if (comm_f_only) {
+	if (sendnum[iswap])
+      	Cuda_CommCuda_UnpackReverse_Self(&cuda->shared_data,sendnum[iswap],iswap,firstrecv[iswap]);
+      } else {
+	n = avec->pack_reverse(recvnum[iswap],firstrecv[iswap],buf_send);
+	avec->unpack_reverse(sendnum[iswap],sendlist[iswap],buf_send);
+      }
+    }
+  }
+  if(not comm_f_only && not avec->cudable) cuda->uploadAll();  //not yet implemented in CUDA but only needed for non standard atom styles
+}
+
+/* ----------------------------------------------------------------------
+   exchange: move atoms to correct processors
+   atoms exchanged with all 6 stencil neighbors
+   send out atoms that have left my box, receive ones entering my box
+   atoms will be lost if not inside some proc's box
+     can happen if atom moves outside of non-periodic bounary
+     or if atom moves more than one proc away
+   this routine called before every reneighboring
+   for triclinic, atoms must be in lamda coords (0-1) before exchange is called
+------------------------------------------------------------------------- */
+
+void CommCuda::exchange()
+{
+  AtomVec *avec = atom->avec;
+	
+  if(not cuda->oncpu && avec->cudable)
+    	return exchange_cuda();
+
+  if(not cuda->oncpu) cuda->downloadAll();  
+  
+  Comm::exchange();
+}
+
+
+void CommCuda::exchange_cuda()
+{
+  int i,m,nsend,nrecv,nrecv1,nrecv2,nlocal;
+  double lo,hi,value;
+  double **x;
+  double *sublo,*subhi,*buf;
+  MPI_Request request;
+  MPI_Status status;
+  AtomVec *avec = atom->avec;
+    timespec time1,time2,time3;
+
+  // clear global->local map for owned and ghost atoms
+  // b/c atoms migrate to new procs in exchange() and 
+  // new ghosts are created in borders()
+  // map_set() is done at end of borders()
+
+  
+  if(map_style) cuda->cu_tag->download();
+
+  if (map_style) atom->map_clear();
+
+  // subbox bounds for orthogonal or triclinic
+
+  if (triclinic == 0) {
+    sublo = domain->sublo;
+    subhi = domain->subhi;
+  } else {
+    sublo = domain->sublo_lamda;
+    subhi = domain->subhi_lamda;
+  }
+
+  // loop over dimensions
+	
+  for (int dim = 0; dim < 3; dim++) {
+    // fill buffer with atoms leaving my box, using < and >=
+    // when atom is deleted, fill it in with last atom
+
+  	cuda->shared_data.exchange_dim=dim;
+    
+    nlocal = atom->nlocal;
+    avec->maxsend=&maxsend;
+    nsend=avec->pack_exchange(dim,(double*) &buf_send);
+    nlocal = atom->nlocal;
+
+
+    atom->nlocal = nlocal;
+
+    // send/recv atoms in both directions
+    // if 1 proc in dimension, no send/recv, set recv buf to send buf
+    // if 2 procs in dimension, single send/recv
+    // if more than 2 procs in dimension, send/recv to both neighbors
+
+ clock_gettime(CLOCK_REALTIME,&time1);
+ 
+    if (procgrid[dim] == 1) {
+      nrecv = nsend;
+      buf = buf_send;
+
+    } else {
+      MPI_Sendrecv(&nsend,1,MPI_INT,procneigh[dim][0],0,
+		   &nrecv1,1,MPI_INT,procneigh[dim][1],0,world,&status);
+      nrecv = nrecv1;
+      if (procgrid[dim] > 2) {
+	MPI_Sendrecv(&nsend,1,MPI_INT,procneigh[dim][1],0,
+		     &nrecv2,1,MPI_INT,procneigh[dim][0],0,world,&status);
+	nrecv += nrecv2;
+      }
+      if (nrecv+1 > maxrecv) grow_recv(nrecv+1);
+      
+      MPI_Irecv(buf_recv,nrecv1,MPI_DOUBLE,procneigh[dim][1],0,
+		world,&request);
+      MPI_Send(buf_send,nsend,MPI_DOUBLE,procneigh[dim][0],0,world);
+      MPI_Wait(&request,&status);
+      
+      if (procgrid[dim] > 2) {
+	MPI_Irecv(&buf_recv[nrecv1],nrecv2,MPI_DOUBLE,procneigh[dim][0],0,
+		  world,&request);
+	MPI_Send(buf_send,nsend,MPI_DOUBLE,procneigh[dim][1],0,world);
+	MPI_Wait(&request,&status);
+
+    	if((nrecv1==0)||(nrecv2==0)) buf_recv[nrecv]=0;
+      }
+      
+      buf = buf_recv;
+    }
+	//printf("nsend: %i nrecv: %i\n",nsend,nrecv);
+    // check incoming atoms to see if they are in my box
+    // if so, add to my list
+clock_gettime(CLOCK_REALTIME,&time2);
+cuda->shared_data.cuda_timings.comm_exchange_mpi+=
+      time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000;
+
+    if(nrecv)
+    {
+      avec->maxsend=&maxsend;
+      avec->unpack_exchange(buf);
+    }
+  }
+
+  if(atom->firstgroupname) cuda->downloadAll();
+
+  if(atom->firstgroupname) atom->first_reorder();
+
+  if(atom->firstgroupname) cuda->uploadAll(); 
+}
+
+/* ----------------------------------------------------------------------
+   borders: list nearby atoms to send to neighboring procs at every timestep
+   one list is created for every swap that will be made
+   as list is made, actually do swaps
+   this does equivalent of a communicate (so don't need to explicitly
+     call communicate routine on reneighboring timestep)
+   this routine is called before every reneighboring
+   for triclinic, atoms must be in lamda coords (0-1) before borders is called
+------------------------------------------------------------------------- */
+
+
+void CommCuda::borders()
+{
+  AtomVec *avec = atom->avec;
+  if(not cuda->oncpu && avec->cudable)
+  {
+  	if(cuda->shared_data.overlap_comm&&cuda->finished_setup)
+   	  borders_cuda_overlap_forward_comm();
+   	else
+   	  borders_cuda();
+   	  
+   	return;
+  }
+
+  Comm::borders();
+ 
+  cuda->setSystemParams();
+  if(cuda->finished_setup) {cuda->checkResize(); cuda->uploadAll();}
+  cuda->shared_data.atom.nghost=atom->nghost;  
+  cu_sendlist->upload();
+}
+
+void CommCuda::borders_cuda()
+{
+  int i,n,itype,iswap,dim,ineed,maxneed,smax,rmax;
+  int nsend,nrecv,nfirst,nlast,ngroup;
+  double lo,hi;
+  int *type;
+  double **x;
+  double *buf,*mlo,*mhi;
+  MPI_Request request;
+  MPI_Status status;
+  AtomVec *avec = atom->avec;
+    timespec time1,time2,time3;
+
+  // clear old ghosts
+
+  atom->nghost = 0;
+
+  // do swaps over all 3 dimensions
+
+  iswap = 0;
+  smax = rmax = 0;
+  
+  cuda->shared_data.comm.nsend=0;
+  for (dim = 0; dim < 3; dim++) {
+    nlast = 0;
+    maxneed = 2*need[dim];
+    for (ineed = 0; ineed < maxneed; ineed++) {
+
+      // find atoms within slab boundaries lo/hi using <= and >=
+      // check atoms between nfirst and nlast
+      //   for first swaps in a dim, check owned and ghost
+      //   for later swaps in a dim, only check newly arrived ghosts
+      // store sent atom indices in list for use in future timesteps
+
+      x = atom->x;
+      if (style == SINGLE) {
+	lo = slablo[iswap];
+	hi = slabhi[iswap];
+      } else {
+	type = atom->type;
+	mlo = multilo[iswap];
+	mhi = multihi[iswap];
+      }
+      if (ineed % 2 == 0) {
+	nfirst = nlast;
+	nlast = atom->nlocal + atom->nghost;
+      }
+
+      nsend = 0;
+
+      // find send atoms according to SINGLE vs MULTI
+      // all atoms eligible versus atoms in bordergroup
+      // only need to limit loop to bordergroup for first sends (ineed < 2)
+      // on these sends, break loop in two: owned (in group) and ghost
+     do
+     {
+       if(nsend>=maxsendlist[iswap]) grow_list(iswap,static_cast <int> (nsend*1.05));
+   	    nsend=Cuda_CommCuda_BuildSendlist(&cuda->shared_data,bordergroup,ineed,style==SINGLE?1:0,atom->nfirst,nfirst,nlast,dim,iswap);
+     }while(nsend>=maxsendlist[iswap]);
+      // pack up list of border atoms
+
+      if (nsend*size_border > maxsend)
+	grow_send(nsend*size_border,0);
+  
+      if (ghost_velocity)
+        n = avec->pack_border_vel(nsend,&iswap,buf_send,
+                           pbc_flag[iswap],pbc[iswap]);
+      else
+        n = avec->pack_border(nsend,&iswap,buf_send,
+                           pbc_flag[iswap],pbc[iswap]);
+
+      // swap atoms with other proc
+      // put incoming ghosts at end of my atom arrays
+      // if swapping with self, simply copy, no messages
+
+clock_gettime(CLOCK_REALTIME,&time1);
+      if (sendproc[iswap] != me) {
+	MPI_Sendrecv(&nsend,1,MPI_INT,sendproc[iswap],0,
+		     &nrecv,1,MPI_INT,recvproc[iswap],0,world,&status);
+	if (nrecv*size_border > maxrecv) 
+	  grow_recv(nrecv*size_border);
+	MPI_Irecv(buf_recv,nrecv*size_border,MPI_DOUBLE,
+		  recvproc[iswap],0,world,&request);
+	MPI_Send(buf_send,n,MPI_DOUBLE,sendproc[iswap],0,world);
+	MPI_Wait(&request,&status);
+	buf = buf_recv;
+      } else {
+	nrecv = nsend;
+	buf = buf_send;
+      }
+
+clock_gettime(CLOCK_REALTIME,&time2);
+cuda->shared_data.cuda_timings.comm_border_mpi+=
+      time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000;
+
+      // unpack buffer
+
+      if (ghost_velocity)
+	avec->unpack_border_vel(nrecv,atom->nlocal+atom->nghost,buf);
+      else
+	avec->unpack_border(nrecv,atom->nlocal+atom->nghost,buf);
+
+      // set all pointers & counters
+
+      smax = MAX(smax,nsend);
+      rmax = MAX(rmax,nrecv);
+      sendnum[iswap] = nsend;
+      recvnum[iswap] = nrecv;
+      size_forward_recv[iswap] = nrecv*size_forward;
+      size_reverse_send[iswap] = nrecv*size_reverse;
+      size_reverse_recv[iswap] = nsend*size_reverse;
+      firstrecv[iswap] = atom->nlocal + atom->nghost;
+      atom->nghost += nrecv;
+      iswap++;
+    }
+  }
+
+  // insure send/recv buffers are long enough for all forward & reverse comm
+
+  int max = MAX(maxforward*smax,maxreverse*rmax);
+  if (max > maxsend) grow_send(max,0);
+  max = MAX(maxforward*rmax,maxreverse*smax);
+  if (max > maxrecv) grow_recv(max);
+
+  // reset global->local map
+  if(map_style) 
+  {
+  	cuda->cu_tag->download();
+ 	atom->map_set();
+  }
+  
+  cuda->setSystemParams();
+  cuda->shared_data.atom.nghost+=n;  
+}
+
+void CommCuda::borders_cuda_overlap_forward_comm()
+{
+  int i,n,itype,iswap,dim,ineed,maxneed,smax,rmax;
+  int nsend,nrecv,nfirst,nlast,ngroup;
+  double lo,hi;
+  int *type;
+  double **x;
+  double *buf,*mlo,*mhi;
+  MPI_Request request;
+  MPI_Status status;
+  AtomVec *avec = atom->avec;
+    timespec time1,time2,time3;
+
+  // clear old ghosts
+
+  atom->nghost = 0;
+
+  // do swaps over all 3 dimensions
+
+  iswap = 0;
+  smax = rmax = 0;
+  
+  cuda->shared_data.comm.nsend=0;
+  for (dim = 0; dim < 3; dim++) {
+    nlast = 0;
+    maxneed = 2*need[dim];
+    for (ineed = 0; ineed < maxneed; ineed++) {
+
+      // find atoms within slab boundaries lo/hi using <= and >=
+      // check atoms between nfirst and nlast
+      //   for first swaps in a dim, check owned and ghost
+      //   for later swaps in a dim, only check newly arrived ghosts
+      // store sent atom indices in list for use in future timesteps
+
+      x = atom->x;
+      if (style == SINGLE) {
+	lo = slablo[iswap];
+	hi = slabhi[iswap];
+      } else {
+	type = atom->type;
+	mlo = multilo[iswap];
+	mhi = multihi[iswap];
+      }
+      if (ineed % 2 == 0) {
+	nfirst = nlast;
+	nlast = atom->nlocal + atom->nghost;
+      }
+
+      nsend = 0;
+
+      // find send atoms according to SINGLE vs MULTI
+      // all atoms eligible versus atoms in bordergroup
+      // only need to limit loop to bordergroup for first sends (ineed < 2)
+      // on these sends, break loop in two: owned (in group) and ghost
+     do
+     {
+       if(nsend>=maxsendlist[iswap]) grow_list(iswap,static_cast <int> (nsend*1.05));
+   	    nsend=Cuda_CommCuda_BuildSendlist(&cuda->shared_data,bordergroup,ineed,style==SINGLE?1:0,atom->nfirst,nfirst,nlast,dim,iswap);
+     }while(nsend>=maxsendlist[iswap]);
+	 cuda->shared_data.comm.nsend_swap[iswap]=nsend;
+	  // pack up list of border atoms
+
+      if (nsend*size_border > maxsend)
+	grow_send(nsend*size_border,0);
+  
+      if (ghost_velocity)
+        n = avec->pack_border_vel(nsend,&iswap,buf_send,
+                           pbc_flag[iswap],pbc[iswap]);
+      else
+        n = avec->pack_border(nsend,&iswap,buf_send,
+                           pbc_flag[iswap],pbc[iswap]);
+
+      // swap atoms with other proc
+      // put incoming ghosts at end of my atom arrays
+      // if swapping with self, simply copy, no messages
+
+clock_gettime(CLOCK_REALTIME,&time1);
+      if (sendproc[iswap] != me) {
+	MPI_Sendrecv(&nsend,1,MPI_INT,sendproc[iswap],0,
+		     &nrecv,1,MPI_INT,recvproc[iswap],0,world,&status);
+	if (nrecv*size_border > maxrecv) 
+	  grow_recv(nrecv*size_border);
+	MPI_Irecv(buf_recv,nrecv*size_border,MPI_DOUBLE,
+		  recvproc[iswap],0,world,&request);
+	MPI_Send(buf_send,n,MPI_DOUBLE,sendproc[iswap],0,world);
+	MPI_Wait(&request,&status);
+	buf = buf_recv;
+      } else {
+	nrecv = nsend;
+	buf = buf_send;
+      }
+
+clock_gettime(CLOCK_REALTIME,&time2);
+cuda->shared_data.cuda_timings.comm_border_mpi+=
+      time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000;
+
+      // unpack buffer
+
+      if (ghost_velocity)
+	avec->unpack_border_vel(nrecv,atom->nlocal+atom->nghost,buf);
+      else
+	avec->unpack_border(nrecv,atom->nlocal+atom->nghost,buf);
+
+      // set all pointers & counters
+
+      smax = MAX(smax,nsend);
+      rmax = MAX(rmax,nrecv);
+      sendnum[iswap] = nsend;
+      recvnum[iswap] = nrecv;
+      size_forward_recv[iswap] = nrecv*size_forward;
+      size_reverse_send[iswap] = nrecv*size_reverse;
+      size_reverse_recv[iswap] = nsend*size_reverse;
+      firstrecv[iswap] = atom->nlocal + atom->nghost;
+      atom->nghost += nrecv;
+      iswap++;
+    }
+  }
+
+  // insure send/recv buffers are long enough for all forward & reverse comm
+
+  int max = MAX(maxforward*smax,maxreverse*rmax);
+  if (max > maxsend) grow_send(max,0);
+  max = MAX(maxforward*rmax,maxreverse*smax);
+  if (max > maxrecv) grow_recv(max);
+
+  // reset global->local map
+  if(map_style) 
+  {
+  	cuda->cu_tag->download();
+ 	atom->map_set();
+  }
+  
+  cuda->setSystemParams();
+  cuda->shared_data.atom.nghost+=n;  
+}
+
+
+
+
+void CommCuda::forward_comm_fix(Fix *fix)
+{
+  int iswap,n;
+  double *buf;
+  MPI_Request request;
+  MPI_Status status;
+
+  for (iswap = 0; iswap < nswap; iswap++) {
+    // pack buffer
+    if(fix->cudable_comm&&cuda->finished_setup)
+    {
+    	int swap=iswap;
+        if(sendproc[iswap] == me) {swap=-iswap-1; buf=(double*)&(firstrecv[iswap]);}
+        else buf=buf_send;
+        
+        n = fix->pack_comm(sendnum[iswap],&swap,
+                        buf,pbc_flag[iswap],pbc[iswap]);
+        if(sendproc[iswap] == me) 
+        {
+        	continue;
+        }
+    }
+    else
+    n = fix->pack_comm(sendnum[iswap],sendlist[iswap],
+		       buf_send,pbc_flag[iswap],pbc[iswap]);
+
+     // exchange with another proc
+    // if self, set recv buffer to send buffer
+
+    if (sendproc[iswap] != me) {
+      MPI_Irecv(buf_recv,n*recvnum[iswap],MPI_DOUBLE,recvproc[iswap],0,
+		world,&request);
+      MPI_Send(buf_send,n*sendnum[iswap],MPI_DOUBLE,sendproc[iswap],0,world);
+      MPI_Wait(&request,&status);
+      buf = buf_recv;
+    } else buf = buf_send;
+
+    // unpack buffer
+
+    fix->unpack_comm(recvnum[iswap],firstrecv[iswap],buf);
+  }
+}
+
+
+void CommCuda::grow_send(int n, int flag)
+{
+  int oldmaxsend = (maxsend+BUFEXTRA)*sizeof(double);
+  maxsend = static_cast<int> (BUFFACTOR * n);
+  if (flag){
+    if(cuda->pinned)
+    {
+      double* tmp = new double[oldmaxsend];
+      memcpy((void*) tmp,(void*) buf_send,oldmaxsend*sizeof(double));
+      if(buf_send) CudaWrapper_FreePinnedHostData((void*) (buf_send));
+      buf_send = (double*) CudaWrapper_AllocPinnedHostData((maxsend+BUFEXTRA)*sizeof(double),false);
+      memcpy(buf_send,tmp,oldmaxsend*sizeof(double));
+      delete [] tmp;	        	
+    }
+    else
+    {
+    buf_send = (double *) 
+      memory->srealloc(buf_send,(maxsend+BUFEXTRA)*sizeof(double),
+		       "comm:buf_send");printf("srealloc\n"); 
+    }
+  }
+  else {
+    if(cuda->pinned)
+    {
+      if(buf_send) CudaWrapper_FreePinnedHostData((void*) buf_send);
+      buf_send = (double*) CudaWrapper_AllocPinnedHostData((maxsend+BUFEXTRA)*sizeof(double),false);
+    }
+    else
+    {
+      memory->sfree(buf_send);
+      buf_send = (double *) memory->smalloc((maxsend+BUFEXTRA)*sizeof(double),
+					  "comm:buf_send");
+    }
+    for(int i=0;i<maxswap;i++)
+    {
+      if(cuda->shared_data.comm.buf_send_dev[i]) CudaWrapper_FreeCudaData(cuda->shared_data.comm.buf_send_dev[i],oldmaxsend);
+      cuda->shared_data.comm.buf_send_dev[i]=CudaWrapper_AllocCudaData((maxsend+BUFEXTRA)*sizeof(double));
+    }
+  }
+}
+/* ----------------------------------------------------------------------
+   free/malloc the size of the recv buffer as needed with BUFFACTOR 
+------------------------------------------------------------------------- */
+
+
+void CommCuda::grow_recv(int n)
+{
+  int oldmaxrecv = maxrecv*sizeof(double);
+  maxrecv = static_cast<int> (BUFFACTOR * n);
+  if(cuda->pinned)
+  {
+    if(buf_recv) CudaWrapper_FreePinnedHostData((void*)buf_recv);
+    buf_recv = (double*) CudaWrapper_AllocPinnedHostData(maxrecv*sizeof(double), false,true);
+  }
+  else
+  {
+    memory->sfree(buf_recv);
+    buf_recv = (double *) memory->smalloc(maxrecv*sizeof(double),
+					"comm:buf_recv");
+  }
+  for(int i=0;i<maxswap;i++)
+  {
+    if(cuda->shared_data.comm.buf_recv_dev[i]) CudaWrapper_FreeCudaData(cuda->shared_data.comm.buf_recv_dev[i],oldmaxrecv);
+    cuda->shared_data.comm.buf_recv_dev[i]=CudaWrapper_AllocCudaData((maxrecv)*sizeof(double));
+  }
+}
+
+/* ----------------------------------------------------------------------
+   realloc the size of the iswap sendlist as needed with BUFFACTOR 
+------------------------------------------------------------------------- */
+
+void CommCuda::grow_list(int iswap, int n)
+{
+  
+  MYDBG(printf(" # CUDA CommCuda::grow_list\n");)
+  if(cuda->finished_setup&&cu_sendlist) cu_sendlist->download();
+  if(!cu_sendlist||n*BUFFACTOR>cu_sendlist->get_dim()[1]||n*BUFFACTOR>maxsendlist[iswap])
+  {
+  	for(int i=0;i<maxswap;i++)
+  	{
+  	  maxsendlist[i] = static_cast<int> (BUFFACTOR * n);
+  	  sendlist[i] = (int *) 
+    		memory->srealloc(sendlist[i],maxsendlist[i]*sizeof(int),
+		     		"comm:sendlist[iswap]");
+  	}
+  	delete cu_sendlist;
+  	cu_sendlist=new cCudaData<int, int, xy> ((int*)sendlist,maxswap,maxsendlist[iswap]);
+  	cuda->shared_data.comm.sendlist.dev_data=cu_sendlist->dev_data();
+    cuda->shared_data.comm.maxlistlength=maxsendlist[iswap];
+    cu_sendlist->upload();
+  }
+ }
+
+/* ----------------------------------------------------------------------
+   realloc the buffers needed for swaps 
+------------------------------------------------------------------------- */
+
+void CommCuda::grow_swap(int n)
+{
+  int oldmaxswap=maxswap;
+  Comm::grow_swap(n);
+  if(n>cu_sendlist->get_dim()[0])
+  {
+   MYDBG(printf(" # CUDA CommCuda::grow_swap\n");)
+    
+  	delete cu_sendlist;
+  	cu_sendlist=new cCudaData<int, int, xy> ((int*)sendlist,n,BUFMIN);
+  	cuda->shared_data.comm.sendlist.dev_data=cu_sendlist->dev_data();
+    cuda->shared_data.comm.maxlistlength=BUFMIN;
+    cuda->shared_data.comm.maxswap=n;
+    cuda->shared_data.comm.nsend_swap=new int[n];
+    cuda->shared_data.comm.send_size=new int[n];
+    cuda->shared_data.comm.recv_size=new int[n]; 
+  }
+  for(int i=0;i<oldmaxswap;i++)
+  {
+    if(cuda->shared_data.comm.buf_recv_dev[i]) CudaWrapper_FreeCudaData(cuda->shared_data.comm.buf_recv_dev[i],maxrecv*sizeof(double));
+    if(cuda->shared_data.comm.buf_send_dev[i]) CudaWrapper_FreeCudaData(cuda->shared_data.comm.buf_send_dev[i],maxsend*sizeof(double));
+    cuda->shared_data.comm.buf_recv_dev[i]=NULL;
+    cuda->shared_data.comm.buf_send_dev[i]=NULL;    
+  }
+  cuda->shared_data.comm.buf_send= new double*[n];
+  cuda->shared_data.comm.buf_recv= new double*[n];
+  cuda->shared_data.comm.buf_send_dev= new void*[n];
+  cuda->shared_data.comm.buf_recv_dev= new void*[n];
+  for(int i=0;i<n;i++)
+  {
+    cuda->shared_data.comm.buf_recv[i]=NULL;
+    cuda->shared_data.comm.buf_send[i]=NULL;      	
+    cuda->shared_data.comm.buf_recv_dev[i]=NULL;
+    cuda->shared_data.comm.buf_send_dev[i]=NULL;      	
+  }
+  grow_send(maxsend,0);
+  grow_recv(maxrecv);
+
+  maxswap=n;
+}
+
+/* ----------------------------------------------------------------------
+   allocation of swap info 
+------------------------------------------------------------------------- */
+
+void CommCuda::allocate_swap(int n)
+{
+   Comm::allocate_swap(n);
+   
+  	delete cu_pbc;
+  	delete cu_slablo;
+  	delete cu_slabhi;
+  
+    cuda->shared_data.comm.maxswap=n;
+  	if(cu_sendlist)
+  	{
+  	  cu_pbc=new cCudaData<int, int, xy> ((int*)pbc,n,6);
+  	  cu_slablo = new cCudaData<double, X_FLOAT,x>(slablo,n);
+  	  cu_slabhi = new cCudaData<double, X_FLOAT,x>(slabhi,n);
+  	
+  	  cuda->shared_data.comm.pbc.dev_data=cu_pbc->dev_data();
+  	  cuda->shared_data.comm.slablo.dev_data=cu_slablo->dev_data();
+  	  cuda->shared_data.comm.slabhi.dev_data=cu_slabhi->dev_data();
+  	}
+    cuda->shared_data.comm.nsend_swap=new int[n];
+    cuda->shared_data.comm.send_size=new int[n];
+    cuda->shared_data.comm.recv_size=new int[n];
+    cuda->shared_data.comm.buf_send= new double*[n];
+    cuda->shared_data.comm.buf_recv= new double*[n];
+    cuda->shared_data.comm.buf_send_dev= new void*[n];
+    cuda->shared_data.comm.buf_recv_dev= new void*[n];
+    for(int i=0;i<n;i++) cuda->shared_data.comm.buf_send_dev[i]=NULL;
+    for(int i=0;i<n;i++) cuda->shared_data.comm.buf_recv_dev[i]=NULL;
+}
+
+
+/* ----------------------------------------------------------------------
+   allocation of multi-type swap info
+------------------------------------------------------------------------- */
+
+void CommCuda::allocate_multi(int n)
+{
+  Comm::allocate_multi(n);
+
+  	delete cu_multilo;
+  	delete cu_multihi;
+  	cu_multilo = new cCudaData<double, X_FLOAT,xy>(slablo,n,atom->ntypes+1);
+  	cu_multihi = new cCudaData<double, X_FLOAT,xy>(slabhi,n,atom->ntypes+1);
+  	
+  	cuda->shared_data.comm.multilo.dev_data=cu_multilo->dev_data();
+  	cuda->shared_data.comm.multihi.dev_data=cu_multihi->dev_data();
+}
+
+/* ----------------------------------------------------------------------
+   free memory for swaps 
+------------------------------------------------------------------------- */
+
+void CommCuda::free_swap()
+{
+
+  Comm::free_swap();
+
+  delete cuda->shared_data.comm.nsend_swap; cuda->shared_data.comm.nsend_swap=NULL;
+  delete cu_pbc; cu_pbc = NULL;
+  delete cu_slablo; cu_slablo = NULL;
+  delete cu_slabhi; cu_slabhi = NULL;
+  for(int i=0;i<maxswap;i++)
+  {
+    if(cuda->shared_data.comm.buf_recv_dev[i]) CudaWrapper_FreeCudaData(cuda->shared_data.comm.buf_recv_dev[i],maxrecv*sizeof(double));
+    if(cuda->shared_data.comm.buf_send_dev[i]) CudaWrapper_FreeCudaData(cuda->shared_data.comm.buf_send_dev[i],maxsend*sizeof(double));
+  }
+  
+
+}
+
+/* ----------------------------------------------------------------------
+   free memory for multi-type swaps
+------------------------------------------------------------------------- */
+
+void CommCuda::free_multi()
+{
+  Comm::free_multi();
+  delete cu_multilo; cu_multilo = NULL;
+  delete cu_multihi; cu_multihi = NULL;
+}
+
diff --git a/src/USER-CUDA/comm_cuda.h b/src/USER-CUDA/comm_cuda.h
new file mode 100644
index 0000000000..933d7364c1
--- /dev/null
+++ b/src/USER-CUDA/comm_cuda.h
@@ -0,0 +1,69 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under 
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#ifndef LMP_COMM_CUDA_H
+#define LMP_COMM_CUDA_H
+
+#include "pointers.h"
+
+#include "cuda_data.h"
+#include "comm.h"
+
+namespace LAMMPS_NS {
+
+class CommCuda : public Comm {
+public:
+  CommCuda(class LAMMPS *);
+  ~CommCuda();
+
+  virtual void init();
+  virtual void setup();                     // setup 3d communication pattern
+  virtual void forward_comm(int mode=0);              // forward communication of atom coords
+  virtual void forward_comm_cuda();
+  virtual void forward_comm_pack_cuda();
+  virtual void forward_comm_transfer_cuda();
+  virtual void forward_comm_unpack_cuda();
+  virtual void forward_comm_pair(Pair *pair);    
+  virtual void reverse_comm();              // reverse communication of forces
+  virtual void exchange();                  // move atoms to new procs
+  virtual void exchange_cuda();                  // move atoms to new procs
+  virtual void borders();                   // setup list of atoms to communicate
+  virtual void borders_cuda();                   // setup list of atoms to communicate
+  virtual void borders_cuda_overlap_forward_comm();
+  virtual void forward_comm_fix(class Fix *);          // forward comm from a Fix
+
+
+  
+
+ protected:
+  class Cuda *cuda;
+  cCudaData<int, int, xy>* cu_pbc;
+  cCudaData<double, X_FLOAT, x>* cu_slablo;
+  cCudaData<double, X_FLOAT, x>* cu_slabhi;
+  cCudaData<double, X_FLOAT, xy>* cu_multilo;
+  cCudaData<double, X_FLOAT, xy>* cu_multihi;
+  
+  cCudaData<int, int, xy>* cu_sendlist;
+  virtual void grow_send(int,int);          // reallocate send buffer
+  virtual void grow_recv(int);              // free/allocate recv buffer
+  virtual void grow_list(int, int);         // reallocate one sendlist
+  virtual void grow_swap(int);              // grow swap and multi arrays
+  virtual void allocate_swap(int);          // allocate swap arrays
+  virtual void allocate_multi(int);         // allocate multi arrays
+  virtual void free_swap();                 // free swap arrays
+  virtual void free_multi();                // free multi arrays
+};
+
+}
+
+#endif
diff --git a/src/USER-CUDA/cuda.cpp b/src/USER-CUDA/cuda.cpp
new file mode 100644
index 0000000000..f5ff1ea72d
--- /dev/null
+++ b/src/USER-CUDA/cuda.cpp
@@ -0,0 +1,837 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#include <cstdlib>
+#include <cstdio>
+#include <cstring>
+#include "cuda.h"
+#include "atom.h"
+#include "domain.h"
+#include "force.h"
+#include "pair.h"
+#include "update.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+#include "universe.h"
+#include "input.h"
+#include "error.h"
+#include "cuda_neigh_list.h"
+//#include "pre_binning_cu.h"
+#include "binning_cu.h"
+//#include "reverse_binning_cu.h"
+#include <ctime>
+#include <cmath>
+#include "cuda_pair_cu.h"
+#include "cuda_cu.h"
+
+using namespace LAMMPS_NS;
+
+#define MAX(a,b) ((a) > (b) ? (a) : (b))
+
+Cuda::Cuda(LAMMPS *lmp) : Pointers(lmp)
+{
+	cuda_exists=true;
+	lmp->cuda=this;
+	if(universe->me==0)
+	printf("# Using LAMMPS_CUDA \n");
+	shared_data.me=universe->me;
+	device_set=false;
+	
+	Cuda_Cuda_GetCompileSettings(&shared_data);
+	
+	if(shared_data.compile_settings.prec_glob!=sizeof(CUDA_FLOAT)/4) printf("\n\n # CUDA WARNING: Compile Settings of cuda and cpp code differ! \n # CUDA WARNING: Global Precision: cuda %i cpp %i\n\n",shared_data.compile_settings.prec_glob, sizeof(CUDA_FLOAT)/4);
+	if(shared_data.compile_settings.prec_x!=sizeof(X_FLOAT)/4) printf("\n\n # CUDA WARNING: Compile Settings of cuda and cpp code differ! \n # CUDA WARNING: X Precision: cuda %i cpp %i\n\n",shared_data.compile_settings.prec_x, sizeof(X_FLOAT)/4);
+	if(shared_data.compile_settings.prec_v!=sizeof(V_FLOAT)/4) printf("\n\n # CUDA WARNING: Compile Settings of cuda and cpp code differ! \n # CUDA WARNING: V Precision: cuda %i cpp %i\n\n",shared_data.compile_settings.prec_v, sizeof(V_FLOAT)/4);
+	if(shared_data.compile_settings.prec_f!=sizeof(F_FLOAT)/4) printf("\n\n # CUDA WARNING: Compile Settings of cuda and cpp code differ! \n # CUDA WARNING: F Precision: cuda %i cpp %i\n\n",shared_data.compile_settings.prec_f, sizeof(F_FLOAT)/4);
+	if(shared_data.compile_settings.prec_pppm!=sizeof(PPPM_FLOAT)/4) printf("\n\n # CUDA WARNING: Compile Settings of cuda and cpp code differ! \n # CUDA WARNING: PPPM Precision: cuda %i cpp %i\n\n",shared_data.compile_settings.prec_pppm, sizeof(PPPM_FLOAT)/4);
+	if(shared_data.compile_settings.prec_fft!=sizeof(FFT_FLOAT)/4) printf("\n\n # CUDA WARNING: Compile Settings of cuda and cpp code differ! \n # CUDA WARNING: FFT Precision: cuda %i cpp %i\n\n",shared_data.compile_settings.prec_fft, sizeof(FFT_FLOAT)/4);
+    #ifdef FFT_CUFFT
+      if(shared_data.compile_settings.cufft!=1) printf("\n\n # CUDA WARNING: Compile Settings of cuda and cpp code differ! \n # CUDA WARNING: cufft: cuda %i cpp %i\n\n",shared_data.compile_settings.cufft, 1);
+    #else	
+      if(shared_data.compile_settings.cufft!=0) printf("\n\n # CUDA WARNING: Compile Settings of cuda and cpp code differ! \n # CUDA WARNING: cufft: cuda %i cpp %i\n\n",shared_data.compile_settings.cufft, 0);
+    #endif
+    
+    if(shared_data.compile_settings.arch!=CUDA_ARCH)  printf("\n\n # CUDA WARNING: Compile Settings of cuda and cpp code differ! \n # CUDA WARNING: arch: cuda %i cpp %i\n\n",shared_data.compile_settings.cufft, CUDA_ARCH);
+	
+	cu_x          = 0;
+	cu_v          = 0;
+	cu_f          = 0;
+	cu_tag        = 0;
+	cu_type       = 0;
+	cu_mask       = 0;
+	cu_image      = 0;
+	cu_xhold      = 0;
+	cu_q          = 0;
+	cu_rmass      = 0;
+	cu_mass       = 0;
+	cu_virial     = 0;
+	cu_eatom      = 0;
+	cu_vatom      = 0;
+	cu_radius	  = 0;
+	cu_density	  = 0;
+	cu_omega	  = 0;
+	cu_torque	  = 0;
+	
+	cu_special 	  = 0;
+	cu_nspecial   = 0;
+	
+	cu_molecule   = 0;
+	
+	cu_x_type 	  = 0;
+	x_type		  = 0;
+	cu_v_radius	  = 0;
+	v_radius	  = 0;
+	cu_omega_rmass	  = 0;
+	omega_rmass	  = 0;
+	
+	binned_id = 0;
+	cu_binned_id  = 0;
+	binned_idnew = 0;
+	cu_binned_idnew = 0;
+	
+	cu_map_array = 0;
+	
+	copy_buffer=0;
+	copy_buffersize=0;
+	
+	neighbor_decide_by_integrator=0;
+	pinned=true;
+	
+	debugdata=0;
+	new int[2*CUDA_MAX_DEBUG_SIZE];
+	
+	finished_setup = false;
+	begin_setup = false;
+	finished_run = false;
+
+	setSharedDataZero();
+	
+	uploadtime=0;
+	downloadtime=0;
+	dotiming=false;
+
+    dotestatom = false;
+    testatom = 0;	
+	oncpu = true;
+
+    self_comm = 0;
+	MYDBG( printf("# CUDA: Cuda::Cuda Done...\n");)
+	//cCudaData<double, float, yx >  
+}
+
+Cuda::~Cuda()
+{
+	
+	print_timings();
+	
+	if(universe->me==0) printf("# CUDA: Free memory...\n");
+	
+	delete cu_q;
+	delete cu_x;
+	delete cu_v;
+	delete cu_f;
+	delete cu_tag;
+	delete cu_type;
+	delete cu_mask;
+	delete cu_image;
+	delete cu_xhold;
+	delete cu_mass;
+	delete cu_rmass;
+	delete cu_virial;
+	delete cu_eng_vdwl;
+	delete cu_eng_coul;
+	delete cu_eatom;
+	delete cu_vatom;
+	delete cu_radius;
+	delete cu_density;
+	delete cu_omega;
+	delete cu_torque;
+	delete cu_molecule;
+	
+	delete cu_x_type;
+	delete [] x_type;
+	delete cu_v_radius;
+	delete [] v_radius;
+	delete cu_omega_rmass;
+	delete [] omega_rmass;
+	
+	delete cu_map_array;
+	
+	std::map<NeighList*, CudaNeighList*>::iterator p = neigh_lists.begin();
+	while(p != neigh_lists.end())
+	{
+		delete p->second;
+		++p;
+	}
+}
+
+void Cuda::accelerator(int narg, char** arg)
+{
+	if(device_set) return;
+	if(universe->me==0)
+	printf("# CUDA: Activate GPU \n");
+	
+	int* devicelist=NULL;
+	int pppn=2;
+    for(int i=0;i<narg;i++)
+	{
+	  if(strcmp(arg[i],"gpu/node")==0) 
+	  {
+	  	if(++i==narg) 
+	  	  error->all("Invalid Options for 'accelerator' command. Expecting a number or keyword 'special' after 'gpu/node' option."); 
+	    if(strcmp(arg[i],"special")==0)
+	    {
+	  	   if(++i==narg) 
+	  	     error->all("Invalid Options for 'accelerator' command. Expecting number of GPUs to be used per node after keyword 'gpu/node special'."); 
+	       pppn=atoi(arg[i]);
+	       if(pppn<1) error->all("Invalid Options for 'accelerator' command. Expecting number of GPUs to be used per node after keyword 'gpu/node special'."); 
+	  	   if(i+pppn==narg) 
+	  	     error->all("Invalid Options for 'accelerator' command. Expecting list of device ids after keyword 'gpu/node special'."); 
+	       devicelist=new int[pppn];
+	       for(int k=0;k<pppn;k++)
+	         {i++;devicelist[k]=atoi(arg[i]);}
+	  	   
+	    }
+	    else
+	    pppn=atoi(arg[i]);
+	  }
+	  if(strcmp(arg[i],"pinned")==0) 
+	  {
+	  	if(++i==narg) 
+	  	  error->all("Invalid Options for 'accelerator' command. Expecting a number after 'pinned' option."); 
+	    pinned=atoi(arg[i])==0?false:true;
+            if((pinned==false)&&(universe->me==0)) printf(" #CUDA: Pinned memory is not used for communication\n");
+	  }
+	  if(strcmp(arg[i],"dotiming")==0) 
+	  {
+	  	dotiming=true;
+	  }
+	  if(strcmp(arg[i],"suffix")==0) 
+	  {
+	  	if(++i==narg) 
+	  	  error->all("Invalid Options for 'accelerator' command. Expecting a string after 'suffix' option."); 
+	  	strcpy(lmp->asuffix,arg[i]);
+	  }
+	  if(strcmp(arg[i],"overlap_comm")==0) 
+	  {
+	  	shared_data.overlap_comm=1;
+	  }
+	  if(strcmp(arg[i],"dotest")==0) 
+	  {
+	  	if(++i==narg) 
+	  	  error->all("Invalid Options for 'accelerator' command. Expecting a number after 'dotest' option."); 
+	    testatom=atof(arg[i]);
+	    dotestatom=true;
+	  }
+	  if(strcmp(arg[i],"override_bpa")==0) 
+	  {
+	  	if(++i==narg) 
+	  	  error->all("Invalid Options for 'accelerator' command. Expecting a number after 'override_bpa' option."); 
+	      shared_data.pair.override_block_per_atom = atoi(arg[i]);
+	  }
+	}
+	CudaWrapper_Init(0, (char**)0,universe->me,pppn,devicelist);
+	//if(shared_data.overlap_comm)
+	  	CudaWrapper_AddStreams(3);
+	cu_x          = 0;
+	cu_v          = 0;
+	cu_f          = 0;
+	cu_tag        = 0;
+	cu_type       = 0;
+	cu_mask       = 0;
+	cu_image      = 0;
+	cu_xhold      = 0;
+	cu_q          = 0;
+	cu_rmass      = 0;
+	cu_mass       = 0;
+	cu_virial     = 0;
+	cu_eatom      = 0;
+	cu_vatom      = 0;
+	cu_radius	  = 0;
+	cu_density	  = 0;
+	cu_omega	  = 0;
+	cu_torque	  = 0;
+	
+	cu_special 	  = 0;
+	cu_nspecial   = 0;
+	
+	cu_molecule   = 0;
+	
+	cu_x_type 	  = 0;
+	cu_v_radius	  = 0;
+	cu_omega_rmass	  = 0;
+	
+	cu_binned_id  = 0;
+	cu_binned_idnew = 0;
+	device_set=true;
+	allocate();
+	delete devicelist;
+}
+
+void Cuda::setSharedDataZero()
+{
+	MYDBG(printf("# CUDA: Cuda::setSharedDataZero ...\n");)
+	shared_data.atom.nlocal = 0;
+	shared_data.atom.nghost = 0;
+	shared_data.atom.nall = 0;
+	shared_data.atom.nmax = 0;
+	shared_data.atom.ntypes = 0;
+	shared_data.atom.q_flag = 0;
+	shared_data.atom.need_eatom = 0;
+	shared_data.atom.need_vatom = 0;
+	
+    shared_data.pair.cudable_force = 0;
+	shared_data.pair.collect_forces_later = 0;
+	shared_data.pair.use_block_per_atom = 0;
+	shared_data.pair.override_block_per_atom = -1;
+	shared_data.pair.cut = 0;
+	shared_data.pair.cutsq = 0;
+	shared_data.pair.cut_inner = 0;
+	shared_data.pair.cut_coul = 0;
+	shared_data.pair.special_lj = 0;
+	shared_data.pair.special_coul = 0;
+	
+	
+	shared_data.pppm.cudable_force = 0;
+	
+	shared_data.buffersize = 0;
+	shared_data.buffer_new = 1;
+	shared_data.buffer = NULL;
+	
+	shared_data.comm.comm_phase=0;
+	shared_data.overlap_comm=0;
+
+	shared_data.comm.buffer = NULL;
+	shared_data.comm.buffer_size=0;
+	shared_data.comm.overlap_split_ratio=0;
+   // setTimingsZero();
+}
+
+void Cuda::allocate()
+{
+	accelerator(0,NULL);
+	MYDBG(printf("# CUDA: Cuda::allocate ...\n");)
+	if(not cu_virial)
+	{
+	  cu_virial    = new cCudaData<double, ENERGY_FLOAT, x > (NULL, & shared_data.pair.virial , 6);
+	  cu_eng_vdwl  = new cCudaData<double, ENERGY_FLOAT, x > (NULL, & shared_data.pair.eng_vdwl ,1);
+	  cu_eng_coul  = new cCudaData<double, ENERGY_FLOAT, x > (NULL, & shared_data.pair.eng_coul ,1);
+	  cu_extent 	 = new cCudaData<double, double, x> (extent, 6);
+	  shared_data.flag = CudaWrapper_AllocCudaData(sizeof(int));
+	  int size=2*CUDA_MAX_DEBUG_SIZE;
+	  debugdata = new int[size];
+	  cu_debugdata    = new cCudaData<int, int, x > (debugdata , size);
+	  shared_data.debugdata=cu_debugdata->dev_data();
+	}
+	checkResize();
+	setSystemParams();
+	MYDBG(printf("# CUDA: Cuda::allocate done...\n");)
+}
+
+void Cuda::setSystemParams()
+{
+    MYDBG(printf("# CUDA: Cuda::setSystemParams ...\n");)
+	shared_data.atom.nlocal = atom->nlocal;
+	shared_data.atom.nghost = atom->nghost;
+	shared_data.atom.nall = atom->nlocal + atom->nghost;
+	shared_data.atom.ntypes = atom->ntypes;
+	shared_data.atom.q_flag = atom->q_flag;
+	shared_data.atom.rmass_flag = atom->rmass_flag;
+    MYDBG(printf("# CUDA: Cuda::setSystemParams done ...\n");)
+}
+
+void Cuda::setDomainParams()
+{
+    MYDBG(printf("# CUDA: Cuda::setDomainParams ...\n");)
+ 	cuda_shared_domain* cu_domain = &shared_data.domain;
+ 
+    cu_domain->triclinic = domain->triclinic;
+	for(short i=0; i<3; ++i)
+	{
+		cu_domain->periodicity[i] = domain->periodicity[i];
+		cu_domain->sublo[i] = domain->sublo[i];
+		cu_domain->subhi[i] = domain->subhi[i];
+		cu_domain->boxlo[i] = domain->boxlo[i];
+		cu_domain->boxhi[i] = domain->boxhi[i];
+		cu_domain->prd[i] = domain->prd[i];
+	}
+	if(domain->triclinic)
+    {
+	  for(short i=0; i<3; ++i)
+	  {
+	    cu_domain->boxlo_lamda[i] = domain->boxlo_lamda[i];
+	    cu_domain->boxhi_lamda[i] = domain->boxhi_lamda[i];
+	    cu_domain->prd_lamda[i] = domain->prd_lamda[i];
+	  }
+	  cu_domain->xy = domain->xy;
+	  cu_domain->xz = domain->xz;
+	  cu_domain->yz = domain->yz;
+	}
+
+    for(int i=0;i<6;i++) 
+	{
+	  cu_domain->h[i]=domain->h[i];
+	  cu_domain->h_inv[i]=domain->h_inv[i];
+	  cu_domain->h_rate[i]=domain->h_rate[i];
+	}
+	
+	cu_domain->update=2;
+    MYDBG(printf("# CUDA: Cuda::setDomainParams done ...\n");)
+}
+
+void Cuda::checkResize()
+{
+    MYDBG(printf("# CUDA: Cuda::checkResize ...\n");)
+    accelerator(0,NULL);
+	cuda_shared_atom* cu_atom = & shared_data.atom;
+	cuda_shared_pair* cu_pair = & shared_data.pair;
+	cu_atom->q_flag      = atom->q_flag;
+	cu_atom->rmass_flag  = atom->rmass ? 1 : 0;
+	cu_atom->nall = atom->nlocal + atom->nghost;
+	cu_atom->nlocal      = atom->nlocal;
+	cu_atom->nghost      = atom->nghost;
+	
+	// do we have more atoms to upload than currently allocated memory on device? (also true if nothing yet allocated)
+	if(atom->nmax > cu_atom->nmax || cu_tag == NULL)
+	{
+		delete cu_x;               cu_x         = new cCudaData<double, X_FLOAT, yx> ((double*)atom->x , & cu_atom->x        , atom->nmax, 3,0,true); //cu_x->set_buffer(&(shared_data.buffer),&(shared_data.buffersize),true);
+		delete cu_v;               cu_v         = new cCudaData<double, V_FLOAT, yx> ((double*)atom->v, & cu_atom->v         , atom->nmax, 3);
+		delete cu_f;               cu_f         = new cCudaData<double, F_FLOAT, yx> ((double*)atom->f, & cu_atom->f         , atom->nmax, 3,0,true);
+		delete cu_tag;             cu_tag       = new cCudaData<int   , int    , x > (atom->tag       , & cu_atom->tag       , atom->nmax   );
+		delete cu_type;            cu_type      = new cCudaData<int   , int    , x > (atom->type      , & cu_atom->type      , atom->nmax   );
+		delete cu_mask;            cu_mask      = new cCudaData<int   , int    , x > (atom->mask      , & cu_atom->mask      , atom->nmax   );
+		delete cu_image;           cu_image     = new cCudaData<int   , int    , x > (atom->image     , & cu_atom->image     , atom->nmax   );
+
+		if(atom->rmass)
+			{delete cu_rmass;      cu_rmass     = new cCudaData<double, V_FLOAT, x > (atom->rmass     , & cu_atom->rmass     , atom->nmax  );}
+
+		if(cu_atom->q_flag)
+			{delete cu_q;          cu_q         = new cCudaData<double, F_FLOAT, x > ((double*)atom->q, & cu_atom->q         , atom->nmax  );}// cu_q->set_buffer(&(copy_buffer),&(copy_buffersize),true);}
+
+/*
+		if(force->pair)
+		if(force->pair->eatom)
+			{delete cu_eatom;          cu_eatom         = new cCudaData<double, ENERGY_FLOAT, x > (force->pair->eatom, & cu_atom->eatom         , atom->nmax  );}// cu_eatom->set_buffer(&(copy_buffer),&(copy_buffersize),true);}
+		if(force->pair)
+		if(force->pair->vatom)
+			{delete cu_vatom;          cu_vatom         = new cCudaData<double, ENERGY_FLOAT, yx > ((double*)force->pair->vatom, & cu_atom->vatom         , atom->nmax,6  );}// cu_vatom->set_buffer(&(copy_buffer),&(copy_buffersize),true);}
+*/
+		if(atom->radius)
+		{
+			delete cu_radius;     cu_radius    = new cCudaData<double, X_FLOAT, x > (atom->radius    , & cu_atom->radius     , atom->nmax  );
+		    delete cu_v_radius;   cu_v_radius  = new cCudaData<V_FLOAT, V_FLOAT, x> (v_radius , & cu_atom->v_radius      , atom->nmax*4);
+		    delete cu_omega_rmass;   cu_omega_rmass  = new cCudaData<V_FLOAT, V_FLOAT, x> (omega_rmass , & cu_atom->omega_rmass      , atom->nmax*4);
+		}
+
+		/*		
+		if(atom->density)
+			{delete cu_density;    cu_density   = new cCudaData<double, F_FLOAT, x > (atom->density   , & cu_atom->density     , atom->nmax  );}
+		*/
+
+		if(atom->omega)
+			{delete cu_omega;      cu_omega     = new cCudaData<double, V_FLOAT, yx > (((double*) atom->omega)    , & cu_atom->omega     , atom->nmax,3  );}
+
+		if(atom->torque)
+			{delete cu_torque;     cu_torque    = new cCudaData<double, F_FLOAT, yx > (((double*) atom->torque)   , & cu_atom->torque     , atom->nmax,3  );}
+
+		if(atom->special)
+			{delete cu_special;     cu_special    = new cCudaData<int, int, yx > (((int*) &(atom->special[0][0]))   , & cu_atom->special     , atom->nmax,atom->maxspecial  ); shared_data.atom.maxspecial=atom->maxspecial;}
+		if(atom->nspecial)
+			{delete cu_nspecial;     cu_nspecial    = new cCudaData<int, int, yx > (((int*) atom->nspecial)  , & cu_atom->nspecial     , atom->nmax,3  );}
+		if(atom->molecule)
+			{delete cu_molecule;     cu_molecule    = new cCudaData<int, int, x > (((int*) atom->molecule)  , & cu_atom->molecule     , atom->nmax  );}
+		shared_data.atom.special_flag = neighbor->special_flag;
+		shared_data.atom.molecular = atom->molecular;
+		   
+  	    cu_atom->update_nmax = 2;
+	    cu_atom->nmax        = atom->nmax;
+	    
+	    //delete [] x_type; 			x_type 		= new X_FLOAT4[atom->nmax];
+		delete cu_x_type;           cu_x_type   = new cCudaData<X_FLOAT, X_FLOAT, x> (x_type , & cu_atom->x_type      , atom->nmax*4);
+	   // shared_data.buffer_new = 2;
+	}
+
+	if(((cu_xhold==NULL)||(cu_xhold->get_dim()[0]<neighbor->maxhold))&&neighbor->xhold)
+	{
+		delete cu_xhold;           cu_xhold     = new cCudaData<double, X_FLOAT, yx> ((double*)neighbor->xhold, & cu_atom->xhold         , neighbor->maxhold, 3);
+		shared_data.atom.maxhold=neighbor->maxhold;
+	}
+	
+	if(atom->mass && !cu_mass) 
+	{cu_mass      = new cCudaData<double, V_FLOAT, x > (atom->mass      , & cu_atom->mass      , atom->ntypes+1);}
+	cu_atom->mass_host   = atom->mass;
+	
+	if(atom->map_style==1)
+	{
+	  if((cu_map_array==NULL))
+	  {
+	  	cu_map_array   = new cCudaData<int, int, x > (atom->get_map_array()   , & cu_atom->map_array     , atom->get_map_size()  );
+	  }
+	}
+	
+	
+	// if any of the host pointers have changed (e.g. re-allocated somewhere else), set to correct pointer
+	if(cu_x   ->get_host_data() != atom->x)    cu_x   ->set_host_data((double*) (atom->x));
+	if(cu_v   ->get_host_data() != atom->v)    cu_v   ->set_host_data((double*) (atom->v));
+	if(cu_f   ->get_host_data() != atom->f)    cu_f   ->set_host_data((double*) (atom->f));
+	if(cu_tag ->get_host_data() != atom->tag)  cu_tag ->set_host_data(atom->tag);
+	if(cu_type->get_host_data() != atom->type) cu_type->set_host_data(atom->type);
+	if(cu_mask->get_host_data() != atom->mask) cu_mask->set_host_data(atom->mask);
+	if(cu_image->get_host_data() != atom->image) cu_mask->set_host_data(atom->image);
+	
+	if(cu_xhold)
+	if(cu_xhold->get_host_data()!= neighbor->xhold) cu_xhold->set_host_data((double*)(neighbor->xhold));
+	
+	if(atom->rmass)
+	if(cu_rmass->get_host_data() != atom->rmass) cu_rmass->set_host_data((double*) (atom->rmass));
+
+	if(cu_atom->q_flag)
+	if(cu_q->get_host_data() != atom->q) cu_q->set_host_data((double*) (atom->q));
+	
+	if(atom->radius)
+	if(cu_radius->get_host_data() != atom->radius) cu_radius->set_host_data((double*) (atom->radius));
+
+	/*
+	if(atom->density)
+	if(cu_density->get_host_data() != atom->density) cu_density->set_host_data((double*) (atom->density));
+	*/
+
+	if(atom->omega)
+	if(cu_omega->get_host_data() != atom->omega) cu_omega->set_host_data((double*) (atom->omega));
+
+	if(atom->torque)
+	if(cu_torque->get_host_data() != atom->torque) cu_torque->set_host_data((double*) (atom->torque));
+
+	if(atom->special)
+	if(cu_special->get_host_data() != atom->special)
+			{delete cu_special;     cu_special    = new cCudaData<int, int, yx > (((int*) atom->special)   , & cu_atom->special     , atom->nmax,atom->maxspecial  ); shared_data.atom.maxspecial=atom->maxspecial;}
+	
+	if(atom->nspecial)
+	if(cu_nspecial->get_host_data() != atom->nspecial) cu_nspecial->set_host_data((int*) (atom->nspecial));
+
+	if(atom->molecule)
+	if(cu_molecule->get_host_data() != atom->molecule) cu_molecule->set_host_data((int*) (atom->molecule));
+
+	if(force)
+	if(cu_virial   ->get_host_data() != force->pair->virial)    cu_virial   ->set_host_data(force->pair->virial);
+	if(force)
+	if(cu_eng_vdwl ->get_host_data() != &force->pair->eng_vdwl)    cu_eng_vdwl  ->set_host_data(&force->pair->eng_vdwl);
+	if(force)
+	if(cu_eng_coul ->get_host_data() != &force->pair->eng_coul)    cu_eng_coul   ->set_host_data(&force->pair->eng_coul);
+
+ 	cu_atom->update_nlocal = 2;
+	MYDBG(printf("# CUDA: Cuda::checkResize done...\n");)
+}
+
+void Cuda::evsetup_eatom_vatom(int eflag_atom,int vflag_atom)
+{
+    if(eflag_atom)
+    {
+    	if(not cu_eatom) 
+    		cu_eatom         = new cCudaData<double, ENERGY_FLOAT, x > (force->pair->eatom, & (shared_data.atom.eatom)         , atom->nmax  );// cu_eatom->set_buffer(&(copy_buffer),&(copy_buffersize),true);}
+    	cu_eatom->set_host_data(force->pair->eatom); 
+		cu_eatom->memset_device(0);
+    }
+    if(vflag_atom)
+    {	
+    	if(not cu_vatom) 
+    		cu_vatom         = new cCudaData<double, ENERGY_FLOAT, yx > ((double*)force->pair->vatom, & (shared_data.atom.vatom)         , atom->nmax ,6 );// cu_vatom->set_buffer(&(copy_buffer),&(copy_buffersize),true);}
+    	cu_vatom->set_host_data((double*)force->pair->vatom); 
+		cu_vatom->memset_device(0);
+    }
+}
+
+void Cuda::uploadAll()
+{
+	MYDBG(printf("# CUDA: Cuda::uploadAll() ... start\n");)
+	timespec starttime;
+	timespec endtime;
+
+	if(atom->nmax!=shared_data.atom.nmax) checkResize();
+	clock_gettime(CLOCK_REALTIME,&starttime);
+	cu_x   ->upload();
+	cu_v   ->upload();
+	cu_f   ->upload();
+	cu_tag ->upload();
+	cu_type->upload();
+	cu_mask->upload();
+	cu_image->upload();
+	if(shared_data.atom.q_flag) cu_q    ->upload();
+	
+	//printf("A3\n");
+	//if(shared_data.atom.need_eatom) cu_eatom->upload();
+	//printf("A4\n");
+	//if(shared_data.atom.need_vatom) cu_vatom->upload();
+	//printf("A5\n");
+	
+	if(atom->rmass)             cu_rmass->upload();
+
+	if(atom->radius)            cu_radius->upload();
+	//	if(atom->density)           cu_density->upload();
+	if(atom->omega)             cu_omega->upload();
+	if(atom->torque)            cu_torque->upload();
+	if(atom->special)           cu_special->upload();
+	if(atom->nspecial)          cu_nspecial->upload();
+	if(atom->molecule)          cu_molecule->upload();
+	if(cu_eatom) cu_eatom->upload();
+	if(cu_vatom) cu_vatom->upload();
+
+	clock_gettime(CLOCK_REALTIME,&endtime);
+	uploadtime+=(endtime.tv_sec-starttime.tv_sec+1.0*(endtime.tv_nsec-starttime.tv_nsec)/1000000000);
+	CUDA_IF_BINNING(Cuda_PreBinning(& shared_data);)
+	CUDA_IF_BINNING(Cuda_Binning   (& shared_data);)
+	
+	shared_data.atom.triggerneighsq=neighbor->triggersq;
+	MYDBG(printf("# CUDA: Cuda::uploadAll() ... end\n");)
+}
+
+void Cuda::downloadAll()
+{
+	MYDBG(printf("# CUDA: Cuda::downloadAll() ... start\n");)
+	timespec starttime;
+	timespec endtime;
+
+	if(atom->nmax!=shared_data.atom.nmax) checkResize();
+
+	CUDA_IF_BINNING( Cuda_ReverseBinning(& shared_data); )
+	clock_gettime(CLOCK_REALTIME,&starttime);
+	cu_x   ->download();
+	cu_v   ->download();
+	cu_f   ->download();
+	cu_type->download();
+	cu_tag ->download();
+	cu_mask->download();
+	cu_image->download();
+
+	//if(shared_data.atom.need_eatom) cu_eatom->download();
+	//if(shared_data.atom.need_vatom) cu_vatom->download();
+
+	if(shared_data.atom.q_flag) cu_q    ->download();
+	if(atom->rmass)             cu_rmass->download();
+	
+	if(atom->radius)            cu_radius->download();
+	//	if(atom->density)           cu_density->download();
+	if(atom->omega)             cu_omega->download();
+	if(atom->torque)            cu_torque->download();
+	if(atom->special)           cu_special->download();
+	if(atom->nspecial)          cu_nspecial->download();
+	if(atom->molecule)          cu_molecule->download();
+	if(cu_eatom) cu_eatom->download();
+	if(cu_vatom) cu_vatom->download();
+	
+	clock_gettime(CLOCK_REALTIME,&endtime);
+	downloadtime+=(endtime.tv_sec-starttime.tv_sec+1.0*(endtime.tv_nsec-starttime.tv_nsec)/1000000000);
+	MYDBG(printf("# CUDA: Cuda::downloadAll() ... end\n");)
+}
+
+void Cuda::downloadX()
+{
+	Cuda_Pair_RevertXType(& this->shared_data);
+	cu_x->download();
+}
+
+CudaNeighList* Cuda::registerNeighborList(class NeighList* neigh_list)
+{
+	MYDBG(printf("# CUDA: Cuda::registerNeighborList() ... start a\n");)
+	std::map<NeighList*, CudaNeighList*>::iterator p = neigh_lists.find(neigh_list);
+
+	if(p != neigh_lists.end()) return p->second;
+	else
+	{
+		CudaNeighList* neigh_list_cuda = new CudaNeighList(lmp, neigh_list);
+		neigh_lists.insert(std::pair<NeighList*, CudaNeighList*>(neigh_list, neigh_list_cuda));
+		return neigh_list_cuda;
+	}
+	MYDBG(printf("# CUDA: Cuda::registerNeighborList() ... end b\n");)
+}
+
+void Cuda::uploadAllNeighborLists()
+{
+	MYDBG(printf("# CUDA: Cuda::uploadAllNeighborList() ... start\n");)
+	std::map<NeighList*, CudaNeighList*>::iterator p = neigh_lists.begin();
+	while(p != neigh_lists.end())
+	{
+		p->second->nl_upload();
+		if(not (p->second->neigh_list->cuda_list->build_cuda))
+		for(int i=0;i<atom->nlocal;i++)
+		p->second->sneighlist.maxneighbors=MAX(p->second->neigh_list->numneigh[i],p->second->sneighlist.maxneighbors) ;
+		++p;
+	}
+	MYDBG(printf("# CUDA: Cuda::uploadAllNeighborList() ... done\n");)
+}
+
+void Cuda::downloadAllNeighborLists()
+{
+	MYDBG(printf("# CUDA: Cuda::downloadAllNeighborList() ... start\n");)
+	std::map<NeighList*, CudaNeighList*>::iterator p = neigh_lists.begin();
+	while(p != neigh_lists.end())
+	{
+		p->second->nl_download();
+		++p;
+	}
+}
+
+void Cuda::update_xhold(int &maxhold,double* xhold)
+{
+     if(this->shared_data.atom.maxhold<atom->nmax)
+     {
+        maxhold = atom->nmax;
+		delete this->cu_xhold;           this->cu_xhold     = new cCudaData<double, X_FLOAT, yx> ((double*)xhold, & this->shared_data.atom.xhold         , maxhold, 3);
+     }
+     this->shared_data.atom.maxhold=maxhold;
+ 	 CudaWrapper_CopyData(this->cu_xhold->dev_data(),this->cu_x->dev_data(),3*atom->nmax*sizeof(X_FLOAT));
+}
+
+void Cuda::setTimingsZero()
+{
+	shared_data.cuda_timings.test1=0;
+	shared_data.cuda_timings.test2=0;
+	
+	//communication
+	shared_data.cuda_timings.comm_forward_total = 0;
+	shared_data.cuda_timings.comm_forward_mpi_upper = 0;
+	shared_data.cuda_timings.comm_forward_mpi_lower = 0;
+	shared_data.cuda_timings.comm_forward_kernel_pack = 0;
+	shared_data.cuda_timings.comm_forward_kernel_unpack = 0;
+	shared_data.cuda_timings.comm_forward_upload = 0;
+	shared_data.cuda_timings.comm_forward_download = 0;
+
+	shared_data.cuda_timings.comm_exchange_total = 0;
+	shared_data.cuda_timings.comm_exchange_mpi = 0;
+	shared_data.cuda_timings.comm_exchange_kernel_pack = 0;
+	shared_data.cuda_timings.comm_exchange_kernel_unpack = 0;
+	shared_data.cuda_timings.comm_exchange_kernel_fill = 0;
+	shared_data.cuda_timings.comm_exchange_cpu_pack= 0;
+	shared_data.cuda_timings.comm_exchange_upload = 0;
+	shared_data.cuda_timings.comm_exchange_download = 0;
+
+	shared_data.cuda_timings.comm_border_total = 0;
+	shared_data.cuda_timings.comm_border_mpi = 0;
+	shared_data.cuda_timings.comm_border_kernel_pack = 0;
+	shared_data.cuda_timings.comm_border_kernel_unpack = 0;
+	shared_data.cuda_timings.comm_border_kernel_buildlist = 0;
+	shared_data.cuda_timings.comm_border_kernel_self = 0;
+	shared_data.cuda_timings.comm_border_upload = 0;
+	shared_data.cuda_timings.comm_border_download = 0;
+	
+	//pair forces
+	shared_data.cuda_timings.pair_xtype_conversion = 0;
+	shared_data.cuda_timings.pair_kernel = 0;
+	shared_data.cuda_timings.pair_virial = 0;
+	shared_data.cuda_timings.pair_force_collection = 0;
+	
+	//neighbor
+	shared_data.cuda_timings.neigh_bin = 0;
+	shared_data.cuda_timings.neigh_build = 0;
+	shared_data.cuda_timings.neigh_special = 0;
+	
+	//PPPM
+ 	shared_data.cuda_timings.pppm_particle_map; 
+    shared_data.cuda_timings.pppm_make_rho; 
+    shared_data.cuda_timings.pppm_brick2fft; 
+    shared_data.cuda_timings.pppm_poisson; 
+    shared_data.cuda_timings.pppm_fillbrick; 
+    shared_data.cuda_timings.pppm_fieldforce; 
+    shared_data.cuda_timings.pppm_compute; 
+	
+	CudaWrapper_CheckUploadTime(true);
+	CudaWrapper_CheckDownloadTime(true);
+	CudaWrapper_CheckCPUBufUploadTime(true);
+	CudaWrapper_CheckCPUBufDownloadTime(true);	
+}
+
+void Cuda::print_timings()
+{
+	if(universe->me!=0) return;
+	if(not dotiming) return;
+	printf("\n # CUDA: Special timings\n\n");
+	printf("\n Transfer Times\n");
+	printf(" PCIe Upload:  \t %lf s\n",CudaWrapper_CheckUploadTime());
+	printf(" PCIe Download:\t %lf s\n",CudaWrapper_CheckDownloadTime());
+	printf(" CPU Tempbbuf Upload:   \t %lf \n",CudaWrapper_CheckCPUBufUploadTime());
+	printf(" CPU Tempbbuf Download: \t %lf \n",CudaWrapper_CheckCPUBufDownloadTime());
+	
+	printf("\n Communication \n");
+    
+	printf(" Forward Total           \t %lf \n",shared_data.cuda_timings.comm_forward_total);
+	printf(" Forward MPI Upper Bound \t %lf \n",shared_data.cuda_timings.comm_forward_mpi_upper);
+	printf(" Forward MPI Lower Bound \t %lf \n",shared_data.cuda_timings.comm_forward_mpi_lower);
+	printf(" Forward Kernel Pack     \t %lf \n",shared_data.cuda_timings.comm_forward_kernel_pack);
+	printf(" Forward Kernel Unpack   \t %lf \n",shared_data.cuda_timings.comm_forward_kernel_unpack);
+	printf(" Forward Kernel Self     \t %lf \n",shared_data.cuda_timings.comm_forward_kernel_self);
+	printf(" Forward Upload          \t %lf \n",shared_data.cuda_timings.comm_forward_upload);
+	printf(" Forward Download        \t %lf \n",shared_data.cuda_timings.comm_forward_download);
+	printf(" Forward Overlap Split Ratio\t %lf \n",shared_data.comm.overlap_split_ratio);
+	printf("\n");
+
+	printf(" Exchange Total          \t %lf \n",shared_data.cuda_timings.comm_exchange_total);
+	printf(" Exchange MPI            \t %lf \n",shared_data.cuda_timings.comm_exchange_mpi);
+	printf(" Exchange Kernel Pack    \t %lf \n",shared_data.cuda_timings.comm_exchange_kernel_pack);
+	printf(" Exchange Kernel Unpack  \t %lf \n",shared_data.cuda_timings.comm_exchange_kernel_unpack);
+    printf(" Exchange Kernel Fill    \t %lf \n",shared_data.cuda_timings.comm_exchange_kernel_fill);
+    printf(" Exchange CPU Pack	     \t %lf \n",shared_data.cuda_timings.comm_exchange_cpu_pack);
+	printf(" Exchange Upload         \t %lf \n",shared_data.cuda_timings.comm_exchange_upload);
+	printf(" Exchange Download       \t %lf \n",shared_data.cuda_timings.comm_exchange_download);
+	printf("\n");
+
+	printf(" Border Total            \t %lf \n",shared_data.cuda_timings.comm_border_total);
+	printf(" Border MPI              \t %lf \n",shared_data.cuda_timings.comm_border_mpi);
+	printf(" Border Kernel Pack      \t %lf \n",shared_data.cuda_timings.comm_border_kernel_pack);
+	printf(" Border Kernel Unpack    \t %lf \n",shared_data.cuda_timings.comm_border_kernel_unpack);
+	printf(" Border Kernel Self      \t %lf \n",shared_data.cuda_timings.comm_border_kernel_self);
+	printf(" Border Kernel BuildList \t %lf \n",shared_data.cuda_timings.comm_border_kernel_buildlist);
+	printf(" Border Upload           \t %lf \n",shared_data.cuda_timings.comm_border_upload);
+	printf(" Border Download 	     \t %lf \n",shared_data.cuda_timings.comm_border_download);
+	printf("\n");
+	
+	//pair forces
+	printf(" Pair XType Conversion   \t %lf \n",shared_data.cuda_timings.pair_xtype_conversion );
+	printf(" Pair Kernel             \t %lf \n",shared_data.cuda_timings.pair_kernel );
+	printf(" Pair Virial             \t %lf \n",shared_data.cuda_timings.pair_virial );
+	printf(" Pair Force Collection   \t %lf \n",shared_data.cuda_timings.pair_force_collection );
+	printf("\n");
+	
+	//neighbor
+	printf(" Neighbor Binning        \t %lf \n",shared_data.cuda_timings.neigh_bin );
+	printf(" Neighbor Build          \t %lf \n",shared_data.cuda_timings.neigh_build );
+	printf(" Neighbor Special        \t %lf \n",shared_data.cuda_timings.neigh_special );	
+	printf("\n");
+	
+	//pppm
+	if(force->kspace)
+	{
+	printf(" PPPM Total              \t %lf \n",shared_data.cuda_timings.pppm_compute );
+	printf(" PPPM Particle Map       \t %lf \n",shared_data.cuda_timings.pppm_particle_map );
+	printf(" PPPM Make Rho           \t %lf \n",shared_data.cuda_timings.pppm_make_rho );
+	printf(" PPPM Brick2fft          \t %lf \n",shared_data.cuda_timings.pppm_brick2fft );
+	printf(" PPPM Poisson            \t %lf \n",shared_data.cuda_timings.pppm_poisson );
+	printf(" PPPM Fillbrick          \t %lf \n",shared_data.cuda_timings.pppm_fillbrick );
+	printf(" PPPM Fieldforce         \t %lf \n",shared_data.cuda_timings.pppm_fieldforce );
+	printf("\n");
+	}	
+
+	printf(" Debug Test 1            \t %lf \n",shared_data.cuda_timings.test1);
+	printf(" Debug Test 2            \t %lf \n",shared_data.cuda_timings.test2);
+	
+	printf("\n");
+}
diff --git a/src/USER-CUDA/cuda.h b/src/USER-CUDA/cuda.h
new file mode 100644
index 0000000000..21d47610fb
--- /dev/null
+++ b/src/USER-CUDA/cuda.h
@@ -0,0 +1,153 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#ifndef CUDA_H
+#define CUDA_H
+
+#include "pointers.h"
+#include "cuda_shared.h"
+#include "cuda_data.h"
+#include "cuda_precision.h"
+#include <map>
+
+#ifdef _DEBUG
+#define MYDBG(a) a
+#else
+#define MYDBG(a) 
+#endif
+
+namespace LAMMPS_NS
+{
+	class Cuda : protected Pointers
+	{
+		public:
+		Cuda(class LAMMPS *);
+		~Cuda();
+		//static void setDevice(class LAMMPS*);
+		void allocate();
+
+		void accelerator(int, char **);
+
+		void setSharedDataZero();
+		void setSystemParams();
+		
+		void setDomainParams();
+		
+		void checkResize();
+		void evsetup_eatom_vatom(int eflag_atom,int vflag_atom);
+		void uploadAll();
+		void downloadAll();
+		void downloadX();
+		
+		class CudaNeighList* registerNeighborList(class NeighList* neigh_list);
+		void uploadAllNeighborLists();
+		void downloadAllNeighborLists();
+		void set_neighinit(int dist_check, double triggerneighsq)
+		{
+    	  shared_data.atom.dist_check=dist_check;  
+    	  shared_data.atom.triggerneighsq = triggerneighsq;	  	
+		}
+		bool decide_by_integrator()
+		{
+		 return neighbor_decide_by_integrator  && cu_xhold && finished_setup;
+	    }
+	    void update_xhold(int &maxhold,double* xhold);
+	    
+		void setTimingsZero();
+		void print_timings();
+		
+		void cu_x_download() {cu_x->download();}
+		bool device_set;
+		bool dotiming;
+		bool dotestatom;
+		int testatom;
+		
+		double uploadtime,downloadtime;
+		bool finished_setup,begin_setup;
+		bool oncpu;
+		bool finished_run;
+		
+		int self_comm;
+
+		int cuda_exists;
+
+		double extent[6];
+		int* debugdata;
+		// data shared between host code and device code
+		// (number of atoms, device pointers for up- & download)
+		cuda_shared_data shared_data;
+		
+		cCudaData<double  , F_FLOAT , x >* cu_q;
+		cCudaData<double  , F_FLOAT , yx>* cu_f;
+		cCudaData<double  , V_FLOAT , x >* cu_mass;
+		cCudaData<double  , V_FLOAT , x >* cu_rmass;
+		cCudaData<double  , V_FLOAT , yx>* cu_v;
+		cCudaData<double  , X_FLOAT , yx>* cu_x;
+		cCudaData<double  , X_FLOAT , yx>* cu_xhold;
+		cCudaData<int     , int     , x >* cu_mask;
+		cCudaData<int     , int     , x >* cu_tag;
+		cCudaData<int     , int     , x >* cu_type;
+		cCudaData<int     , int     , x >* cu_image;
+		cCudaData<double  , ENERGY_FLOAT, x >* cu_eatom;
+		cCudaData<double  , ENERGY_FLOAT, yx>* cu_vatom;
+		cCudaData<double  , ENERGY_FLOAT, x >* cu_virial;
+		cCudaData<double  , ENERGY_FLOAT, x >* cu_eng_vdwl;
+		cCudaData<double  , ENERGY_FLOAT, x >* cu_eng_coul;
+		cCudaData<double  , double  , x >* cu_extent;
+		int* binned_id;
+		cCudaData<int 	  , int	    , xx >* cu_binned_id;
+		int* binned_idnew;
+		cCudaData<int 	  , int	    , xx >* cu_binned_idnew;
+		cCudaData<int 	  , int	    , x >* cu_debugdata;
+		cCudaData<double  , X_FLOAT , x>* cu_radius;
+		cCudaData<double  , F_FLOAT , x>* cu_density;
+		cCudaData<double  , V_FLOAT , yx>* cu_omega;
+		cCudaData<double  , F_FLOAT , yx>* cu_torque;
+		cCudaData<int 	  , int	    , yx >* cu_special;
+		cCudaData<int 	  , int	    , yx >* cu_nspecial;
+		cCudaData<int     , int     , x >* cu_molecule;
+
+		
+		cCudaData<X_FLOAT  , X_FLOAT , x>* cu_x_type;
+		X_FLOAT* x_type;
+
+		cCudaData<V_FLOAT  , V_FLOAT , x>* cu_v_radius;
+		V_FLOAT* v_radius;
+
+		cCudaData<V_FLOAT  , V_FLOAT , x>* cu_omega_rmass;
+		V_FLOAT* omega_rmass;
+
+		cCudaData<int     , int     , x >* cu_map_array;
+		int neighbor_decide_by_integrator;
+
+        bool pinned;
+        
+		void* copy_buffer;
+		int copy_buffersize;
+
+		private:
+		std::map<class NeighList*, class CudaNeighList*> neigh_lists;
+	};
+}
+
+#endif // CUDA_H
diff --git a/src/USER-CUDA/cuda_common.h b/src/USER-CUDA/cuda_common.h
new file mode 100644
index 0000000000..d4687ebd06
--- /dev/null
+++ b/src/USER-CUDA/cuda_common.h
@@ -0,0 +1,344 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#ifndef _CUDA_COMMON_H_
+#define _CUDA_COMMON_H_
+
+//#include "cutil.h"
+#include "cuda_precision.h"
+#include "cuda_wrapper_cu.h"
+
+#define CUDA_MAX_TYPES_PLUS_ONE 12 //for pair styles which use constant space for parameters, this needs to be one larger than the number of atom types
+//this can not be arbitrarly large, since constant space is limited. 
+//in principle one could alter potentials to use global memory for parameters, some du that already since the first examples I encountered had a high number (20+) of atom types
+//Christian   
+#define CUDA_MAX_TYPES2 (CUDA_MAX_TYPES_PLUS_ONE * CUDA_MAX_TYPES_PLUS_ONE)
+#define CUDA_MAX_NSPECIAL 25
+
+// define some easy-to-use debug and emulation macros
+#ifdef _DEBUG
+#define MYDBG(a) a
+#else
+#define MYDBG(a) 
+#endif
+
+#if __DEVICE_EMULATION__
+#define MYEMU(a) a
+#else
+#define MYEMU(a) 
+#endif
+
+#define MYEMUDBG(a) MYEMU(MYDBG(a))
+
+// Add Prefix (needed as workaround, same constant's names in different files causes conflict)
+#define MY_ADD_PREFIX(prefix, var) prefix##_##var
+#define MY_ADD_PREFIX2(prefix, var) MY_ADD_PREFIX(prefix, var)
+#define MY_AP(var) MY_ADD_PREFIX2(MY_PREFIX, var)
+
+#define MY_VAR_TO_STR(var) #var
+#define MY_VAR_TO_STR2(var) MY_VAR_TO_STR(var)
+#define MY_CONST(var) (MY_VAR_TO_STR2(MY_PREFIX) "_" MY_VAR_TO_STR2(var))
+
+#define CUDA_USE_TEXTURE
+#define CUDA_USE_FLOAT4
+
+//constants used by many classes
+
+//domain
+#define _boxhi       MY_AP(boxhi)
+#define _boxlo       MY_AP(boxlo)
+#define _subhi       MY_AP(subhi)
+#define _sublo       MY_AP(sublo)
+#define _box_size    MY_AP(box_size)
+#define _prd         MY_AP(prd)
+#define _periodicity MY_AP(periodicity)
+#define _triclinic	 MY_AP(triclinic)
+#define _boxhi_lamda MY_AP(boxhi_lamda)
+#define _boxlo_lamda MY_AP(boxlo_lamda)
+#define _prd_lamda   MY_AP(prd_lamda)
+#define _h		 	 MY_AP(h)
+#define _h_inv	 	 MY_AP(h_inv)
+#define _h_rate		 MY_AP(h_rate)
+__device__ __constant__ X_FLOAT _boxhi[3];
+__device__ __constant__ X_FLOAT _boxlo[3];
+__device__ __constant__ X_FLOAT _subhi[3];
+__device__ __constant__ X_FLOAT _sublo[3];
+__device__ __constant__ X_FLOAT _box_size[3];
+__device__ __constant__ X_FLOAT _prd[3];
+__device__ __constant__ int _periodicity[3];
+__device__ __constant__ int _triclinic;
+__device__ __constant__ X_FLOAT _boxhi_lamda[3];
+__device__ __constant__ X_FLOAT _boxlo_lamda[3];
+__device__ __constant__ X_FLOAT _prd_lamda[3];
+__device__ __constant__ X_FLOAT _h[6];
+__device__ __constant__ X_FLOAT _h_inv[6];
+__device__ __constant__ V_FLOAT _h_rate[6];
+
+
+//atom properties
+#define _x           MY_AP(x)
+#define _v           MY_AP(v)
+#define _f           MY_AP(f)
+#define _tag         MY_AP(tag)
+#define _type        MY_AP(type)
+#define _mask        MY_AP(mask)
+#define _image       MY_AP(image)
+#define _q           MY_AP(q)
+#define _mass        MY_AP(mass)
+#define _rmass       MY_AP(rmass)
+#define _rmass_flag  MY_AP(rmass_flag)
+#define _eatom       MY_AP(eatom)
+#define _vatom       MY_AP(vatom)
+#define _x_type      MY_AP(x_type)
+#define _radius      MY_AP(radius)
+#define _density     MY_AP(density)
+#define _omega       MY_AP(omega)
+#define _torque      MY_AP(torque)
+#define _special     MY_AP(special)
+#define _maxspecial  MY_AP(maxspecial)
+#define _nspecial    MY_AP(nspecial)
+#define _special_flag  MY_AP(special_flag)
+#define _molecule    MY_AP(molecule)
+#define _v_radius    MY_AP(v_radius)
+#define _omega_rmass MY_AP(omega_rmass)
+#define _freeze_group_bit MY_AP(freeze_group_bit)
+#define _map_array   MY_AP(map_array)
+__device__ __constant__ X_FLOAT* _x;  //holds pointer to positions
+__device__ __constant__ V_FLOAT* _v;
+__device__ __constant__ F_FLOAT* _f;
+__device__ __constant__ int* _tag;
+__device__ __constant__ int* _type;
+__device__ __constant__ int* _mask;
+__device__ __constant__ int* _image;
+__device__ __constant__ V_FLOAT* _mass;
+__device__ __constant__ F_FLOAT* _q;
+__device__ __constant__ V_FLOAT* _rmass;
+__device__ __constant__ int _rmass_flag;
+__device__ __constant__ ENERGY_FLOAT* _eatom;
+__device__ __constant__ ENERGY_FLOAT* _vatom;
+__device__ __constant__ X_FLOAT4* _x_type;  //holds pointer to positions
+__device__ __constant__ X_FLOAT* _radius;  
+__device__ __constant__ F_FLOAT* _density;  
+__device__ __constant__ V_FLOAT* _omega;  
+__device__ __constant__ F_FLOAT* _torque;  
+__device__ __constant__ int* _special;
+__device__ __constant__ int _maxspecial;
+__device__ __constant__ int* _nspecial;
+__device__ __constant__ int _special_flag[4];
+__device__ __constant__ int* _molecule;
+__device__ __constant__ V_FLOAT4* _v_radius;  //holds pointer to positions
+__device__ __constant__ V_FLOAT4* _omega_rmass;  //holds pointer to positions
+__device__ __constant__ int _freeze_group_bit;
+__device__ __constant__ int* _map_array;
+
+#ifdef CUDA_USE_TEXTURE
+	
+	#define _x_tex         MY_AP(x_tex)
+	#if X_PRECISION == 1
+	texture<float> _x_tex;
+	#else
+	texture<int2,1> _x_tex;
+	#endif
+	
+	#define _type_tex         MY_AP(type_tex)
+	texture<int> _type_tex;
+	
+	#define _x_type_tex         MY_AP(x_type_tex)
+	#if X_PRECISION == 1
+	texture<float4,1> _x_type_tex;
+	#else
+	texture<int4,1> _x_type_tex;
+	#endif
+	
+	#define _v_radius_tex         MY_AP(v_radius_tex)
+	#if V_PRECISION == 1
+	texture<float4,1> _v_radius_tex;
+	#else
+	texture<int4,1> _v_radius_tex;
+	#endif
+	
+	#define _omega_rmass_tex         MY_AP(omega_rmass_tex)
+	#if V_PRECISION == 1
+	texture<float4,1> _omega_rmass_tex;
+	#else
+	texture<int4,1> _omega_rmass_tex;
+	#endif
+
+	#define _q_tex         MY_AP(q_tex)
+	#if F_PRECISION == 1
+	texture<float> _q_tex;
+	#else
+	texture<int2,1> _q_tex;
+	#endif
+
+#endif
+
+//neighbor
+#ifdef IncludeCommonNeigh
+#define _inum        	MY_AP(inum)
+#define _inum_border    MY_AP(inum_border)
+#define _ilist       	MY_AP(ilist)
+#define _ilist_border 	MY_AP(ilist_border)
+#define _numneigh    	MY_AP(numneigh)
+#define _numneigh_border 	MY_AP(numneigh_border)
+#define _numneigh_inner		MY_AP(numneigh_inner)
+#define _firstneigh  	MY_AP(firstneigh)
+#define _neighbors 	MY_AP(neighbors)
+#define _neighbors_border 	MY_AP(neighbors_border)
+#define _neighbors_inner  	MY_AP(neighbors_inner)
+#define _reneigh_flag 	MY_AP(reneigh_flag)
+#define _triggerneighsq MY_AP(triggerneighsq)
+#define _xhold       	MY_AP(xhold)
+#define _maxhold     	MY_AP(maxhold)
+#define _dist_check     MY_AP(dist_check)
+#define _neighbor_maxlocal MY_AP(neighbor_maxlocal)
+#define _maxneighbors   MY_AP(maxneighbors)
+#define _overlap_comm   MY_AP(overlap_comm)
+__device__ __constant__ int _inum;
+__device__ __constant__ int* _inum_border;
+__device__ __constant__ int* _ilist;
+__device__ __constant__ int* _ilist_border;
+__device__ __constant__ int* _numneigh;
+__device__ __constant__ int* _numneigh_border;
+__device__ __constant__ int* _numneigh_inner;
+__device__ __constant__ int** _firstneigh;
+__device__ __constant__ int* _neighbors;
+__device__ __constant__ int* _neighbors_border;
+__device__ __constant__ int* _neighbors_inner;
+__device__ __constant__ int* _reneigh_flag;
+__device__ __constant__ X_FLOAT _triggerneighsq;
+__device__ __constant__ X_FLOAT* _xhold;  //holds pointer to positions
+__device__ __constant__ int _maxhold;
+__device__ __constant__ int _dist_check;
+__device__ __constant__ int _neighbor_maxlocal;
+__device__ __constant__ int _maxneighbors;
+__device__ __constant__ int _overlap_comm;
+#endif
+
+//system properties
+#define _nall        MY_AP(nall)
+#define _nghost      MY_AP(nghost)
+#define _nlocal      MY_AP(nlocal)
+#define _nmax        MY_AP(nmax)
+#define _cuda_ntypes MY_AP(cuda_ntypes)
+#define _dtf         MY_AP(dtf)
+#define _dtv         MY_AP(dtv)
+#define _factor      MY_AP(factor)
+#define _virial      MY_AP(virial)
+#define _eng_vdwl    MY_AP(eng_vdwl)
+#define _eng_coul    MY_AP(eng_coul)
+#define _molecular   MY_AP(molecular)
+__device__ __constant__ unsigned _nall;
+__device__ __constant__ unsigned _nghost;
+__device__ __constant__ unsigned _nlocal;
+__device__ __constant__ unsigned _nmax;
+__device__ __constant__ unsigned _cuda_ntypes;
+__device__ __constant__ V_FLOAT _dtf;
+__device__ __constant__ X_FLOAT _dtv;
+__device__ __constant__ V_FLOAT _factor;
+__device__ __constant__ ENERGY_FLOAT* _virial;
+__device__ __constant__ ENERGY_FLOAT* _eng_vdwl;
+__device__ __constant__ ENERGY_FLOAT* _eng_coul;
+__device__ __constant__ int _molecular;
+
+//other general constants
+#define _buffer      MY_AP(buffer)
+#define _flag		 MY_AP(flag)
+#define _debugdata   MY_AP(debugdata)
+__device__ __constant__ void* _buffer;
+__device__ __constant__ int* _flag;
+__device__ __constant__ int* _debugdata;
+
+// pointers to data fields on GPU are hold in constant space 
+// -> reduces register usage and number of parameters for kernelcalls 
+// will be variables of file scope in cuda files
+
+
+
+
+// maybe used to output cudaError_t
+#define MY_OUTPUT_RESULT(result) \
+	switch(result) \
+	{ \
+		case cudaSuccess: printf(" => cudaSuccess\n"); break; \
+		case cudaErrorInvalidValue: printf(" => cudaErrorInvalidValue\n"); break; \
+		case cudaErrorInvalidSymbol: printf(" => cudaErrorInvalidSymbol\n"); break; \
+		case cudaErrorInvalidDevicePointer: printf(" => cudaErrorInvalidDevicePointer\n"); break; \
+		case cudaErrorInvalidMemcpyDirection: printf(" => cudaErrorInvalidMemcpyDirection\n"); break; \
+		default: printf(" => unknown\n"); break; \
+	}
+
+#ifdef _DEBUG
+#  define CUT_CHECK_ERROR(errorMessage) {                                    \
+    cudaError_t err = cudaGetLastError();                                    \
+    if( cudaSuccess != err) {                                                \
+        fprintf(stderr, "Cuda error: %s in file '%s' in line %i : %s.\n",    \
+                errorMessage, __FILE__, __LINE__, cudaGetErrorString( err) );\
+        exit(EXIT_FAILURE);                                                  \
+    }                                                                        \
+    err = cudaThreadSynchronize();                                           \
+    if( cudaSuccess != err) {                                                \
+        fprintf(stderr, "Cuda error: %s in file '%s' in line %i : %s.\n",    \
+                errorMessage, __FILE__, __LINE__, cudaGetErrorString( err) );\
+        exit(EXIT_FAILURE);                                                  \
+    }                                                                        \
+    }
+#else
+#  define CUT_CHECK_ERROR(errorMessage) {                                    \
+    cudaError_t err = cudaGetLastError();                                    \
+    if( cudaSuccess != err) {                                                \
+        fprintf(stderr, "Cuda error: %s in file '%s' in line %i : %s.\n",    \
+                errorMessage, __FILE__, __LINE__, cudaGetErrorString( err) );\
+        exit(EXIT_FAILURE);                                                  \
+    }                                                                        \
+    }
+#endif
+
+#  define CUDA_SAFE_CALL_NO_SYNC( call) {                                    \
+    cudaError err = call;                                                    \
+    if( cudaSuccess != err) {                                                \
+        fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n",        \
+                __FILE__, __LINE__, cudaGetErrorString( err) );              \
+        exit(EXIT_FAILURE);                                                  \
+    } }
+
+#  define CUDA_SAFE_CALL( call)     CUDA_SAFE_CALL_NO_SYNC(call);
+
+#define X_MASK 1
+#define V_MASK 2
+#define F_MASK 4
+#define TAG_MASK 8
+#define TYPE_MASK 16
+#define MASK_MASK 32
+#define IMAGE_MASK 64
+#define Q_MASK 128
+#define MOLECULE_MASK 256
+#define RMASS_MASK 512
+#define RADIUS_MASK 1024
+#define DENSITY_MASK 2048
+#define OMEGA_MASK 4096
+#define TORQUE_MASK 8192
+
+
+
+#endif // #ifdef _CUDA_COMMON_H_
diff --git a/src/USER-CUDA/cuda_data.h b/src/USER-CUDA/cuda_data.h
new file mode 100644
index 0000000000..6311598332
--- /dev/null
+++ b/src/USER-CUDA/cuda_data.h
@@ -0,0 +1,796 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#ifndef _CUDA_DATA_H_
+#define _CUDA_DATA_H_
+
+
+enum copy_mode {x, xx, xy, yx, xyz, xzy}; // yxz, yzx, zxy, zyx not yet implemented since they were not needed yet
+//xx==x in atom_vec x is a member therefore copymode x produces compile errors
+#include "cuda_shared.h"
+#include "cuda_wrapper_cu.h"
+#include "cuda_data_cu.h"
+#include <ctime>
+
+#include <cstdio>
+#include <typeinfo>
+template <typename host_type, typename dev_type, copy_mode mode>
+class cCudaData
+{
+	protected:
+	void** buffer;
+	int* buf_size;
+	host_type* host_data;
+	dev_array* dev_data_array;
+	dev_type* temp_data;
+	unsigned nbytes;
+	bool owns_dev_array;
+	bool current_data_on_device; //this is not yet working as intended and therefore deactivated
+	bool current_data_on_host;
+	bool is_continues;
+	bool pinned;
+	
+	public:
+	cCudaData(host_type* host_data, dev_array* dev_data_array, unsigned dim_x, unsigned dim_y=0, unsigned dim_z=0, bool is_pinned=false);
+	cCudaData(host_type* host_data, unsigned dim_x, unsigned dim_y=0, unsigned dim_z=0, bool is_pinned=false);
+	~cCudaData();
+	void* dev_data() {if(dev_data_array!=NULL) return dev_data_array->dev_data; else return NULL;};
+	void set_dev_data(void* adev_data) {dev_data_array->dev_data=adev_data;};
+	void set_dev_array(dev_array* adev_array) {dev_data_array=adev_array;};
+	void set_host_data(host_type* host_data);
+	void* get_host_data() { return host_data;};
+	void set_buffer(void** buffer,int* buf_size,bool ais_continues);
+	unsigned int* get_dim() {return dev_data_array->dim;};
+	// if you want to upload data to the gpu, which will not change there, then set will_be_changed=false
+	// if you want to upload data to the gpu and update it there, then set will_be_changed=true (default)
+	void upload(bool will_be_changed=true);
+	void uploadAsync(int stream, bool will_be_changed=true ); 
+	// if you want to download data just to have a look at it, then set will_be_changed=false
+	// if you are going to modify the downloaded data, then set will_be_changed=true (default)
+	void download(bool will_be_changed=true);
+	void downloadAsync(int stream); 
+	void memset_device(int value);
+	void device_data_has_changed() {current_data_on_device=false;}
+	void host_data_has_changed() {current_data_on_host=false;}
+	int dev_size() {
+		int size = dev_data_array->dim[0]*sizeof(dev_type);
+		if(dev_data_array->dim[1]) size*=dev_data_array->dim[1];
+		if(dev_data_array->dim[2]) size*=dev_data_array->dim[2];
+		return size;}
+};
+
+
+template <typename host_type, typename dev_type, copy_mode mode>
+cCudaData<host_type, dev_type, mode>
+::cCudaData(host_type* host_data, dev_array* dev_data_array, unsigned dim_x, unsigned dim_y, unsigned dim_z, bool is_pinned)
+{
+	pinned=is_pinned;
+	owns_dev_array = false;
+	current_data_on_device = false;
+	current_data_on_host = false;
+	is_continues = false;
+	this->host_data = host_data;
+	this->dev_data_array = dev_data_array;
+	unsigned ndev;
+	if((mode == x)||(mode==xx))
+	{
+		ndev = dim_x;
+		dev_data_array->dim[0] = dim_x;
+		dev_data_array->dim[1] = 0;
+		dev_data_array->dim[2] = 0;
+	}
+	else if(mode == xy || mode == yx )
+	{
+		ndev = dim_x * dim_y;
+		dev_data_array->dim[0] = dim_x;
+		dev_data_array->dim[1] = dim_y;
+		dev_data_array->dim[2] = 0;
+	}
+	else
+	{
+		ndev = dim_x * dim_y * dim_z;
+		dev_data_array->dim[0] = dim_x;
+		dev_data_array->dim[1] = dim_y;
+		dev_data_array->dim[2] = dim_z;
+	}
+	nbytes = ndev * sizeof(dev_type);
+	if(nbytes<=0)
+	{
+		host_data=NULL;
+		temp_data=NULL;
+		dev_data_array->dev_data=NULL;
+		return;
+	}
+	
+	dev_data_array->dev_data = CudaWrapper_AllocCudaData(nbytes);
+	if(((mode!=x)&&(mode!=xx)) || typeid(host_type) != typeid(dev_type))
+	{
+		if(not pinned)
+		temp_data = new dev_type[ndev];
+		else
+		{
+			temp_data = (dev_type*) CudaWrapper_AllocPinnedHostData(ndev*sizeof(dev_type));
+		}
+	}
+}
+
+template <typename host_type, typename dev_type, copy_mode mode>
+cCudaData<host_type, dev_type, mode>
+::cCudaData(host_type* host_data, unsigned dim_x, unsigned dim_y, unsigned dim_z, bool is_pinned)
+{
+	pinned=is_pinned;
+	this->dev_data_array = new dev_array;
+	this->owns_dev_array = true;
+	current_data_on_device = false;
+	current_data_on_host = false;
+	is_continues = false;
+	this->host_data = host_data;
+	unsigned ndev;
+	if((mode == x)||(mode==xx))
+	{
+		ndev = dim_x;
+		dev_data_array->dim[0] = dim_x;
+		dev_data_array->dim[1] = 0;
+		dev_data_array->dim[2] = 0;
+	}
+	else if(mode == xy || mode == yx )
+	{
+		ndev = dim_x * dim_y;
+		dev_data_array->dim[0] = dim_x;
+		dev_data_array->dim[1] = dim_y;
+		dev_data_array->dim[2] = 0;
+	}
+	else
+	{
+		ndev = dim_x * dim_y * dim_z;
+		dev_data_array->dim[0] = dim_x;
+		dev_data_array->dim[1] = dim_y;
+		dev_data_array->dim[2] = dim_z;
+	}
+	nbytes = ndev * sizeof(dev_type);
+	if(nbytes<=0)
+	{
+		host_data=NULL;
+		temp_data=NULL;
+		dev_data_array->dev_data=NULL;
+		return;
+	}
+	
+	dev_data_array->dev_data = CudaWrapper_AllocCudaData(nbytes);
+	if(((mode!=x)&&(mode!=xx)) || (typeid(host_type) != typeid(dev_type)))
+	{
+		if(not pinned)
+		temp_data = new dev_type[ndev];
+		else
+		{
+			temp_data = (dev_type*) CudaWrapper_AllocPinnedHostData(ndev*sizeof(dev_type));
+		}
+	}
+}
+
+template <typename host_type, typename dev_type, copy_mode mode>
+cCudaData<host_type, dev_type, mode>
+::~cCudaData()
+{
+	if(((mode!=x)&&(mode!=xx)) || typeid(host_type) != typeid(dev_type))
+	{
+		if(not pinned)
+		delete [] temp_data;
+		else
+		{
+			CudaWrapper_FreePinnedHostData((void*)temp_data);
+		}
+	}
+	if((dev_data_array->dev_data)&&(nbytes>0))
+	CudaWrapper_FreeCudaData(dev_data_array->dev_data,nbytes);
+	if(owns_dev_array) delete dev_data_array;
+}
+
+template <typename host_type, typename dev_type, copy_mode mode>
+void cCudaData<host_type, dev_type, mode>
+::set_host_data(host_type* host_data)
+{
+	this->host_data = host_data;
+}
+
+template <typename host_type, typename dev_type, copy_mode mode>
+void cCudaData<host_type, dev_type, mode>
+::upload(bool will_be_changed)
+{
+	// if current data is already up, do not re-upload it
+//	if(current_data_on_device) return;
+    if(buffer&&is_continues)
+    {
+   	printf("Actual Buffer: %p %i\n",*buffer,*buf_size);
+    	if(typeid(host_type)==typeid(double))
+    	{
+    	  if(typeid(dev_type)==typeid(double))
+    	  {
+    	  	CudaData_Upload_DoubleDouble((void*) host_data,dev_data_array->dev_data,
+    	  										dev_data_array->dim,mode,*buffer);
+			current_data_on_device = true;
+			if(will_be_changed) current_data_on_host = false;
+			return;
+    	  }
+    	  else if(typeid(dev_type)==typeid(float))
+    	  {
+    	  	CudaData_Upload_DoubleFloat((void*) host_data,dev_data_array->dev_data,
+    	  										dev_data_array->dim,mode,*buffer);
+			current_data_on_device = true;
+			if(will_be_changed) current_data_on_host = false;
+			return;
+    	  }
+    	}
+    	else if(typeid(host_type)==typeid(float))
+    	{
+    	  if(typeid(dev_type)==typeid(double))
+    	  {
+    	  	CudaData_Upload_FloatDouble((void*) host_data,dev_data_array->dev_data,
+    	  										dev_data_array->dim,mode,*buffer);
+			current_data_on_device = true;
+			if(will_be_changed) current_data_on_host = false;
+			return;
+    	  }
+    	  else if(typeid(dev_type)==typeid(float))
+    	  {
+    	  	CudaData_Upload_FloatFloat((void*) host_data,dev_data_array->dev_data,
+    	  										dev_data_array->dim,mode,*buffer);
+			current_data_on_device = true;
+			if(will_be_changed) current_data_on_host = false;
+			return;
+    	  }
+    	}
+    	else if(typeid(host_type)==typeid(int))
+    	{
+    	  if(typeid(dev_type)==typeid(int))
+    	  {
+    	  	CudaData_Upload_IntInt((void*) host_data,dev_data_array->dev_data,
+    	  										dev_data_array->dim,mode,*buffer);
+			current_data_on_device = true;
+			if(will_be_changed) current_data_on_host = false;
+			return;
+    	  }
+    	}
+    }
+	switch(mode)
+	{
+		case x:
+		{
+			if(typeid(host_type) == typeid(dev_type))
+				CudaWrapper_UploadCudaData(host_data, dev_data_array->dev_data, nbytes);
+			else
+			{
+    timespec time1,time2;
+    clock_gettime(CLOCK_REALTIME,&time1);
+			  for(unsigned i=0; i<dev_data_array->dim[0]; ++i) temp_data[i] = static_cast<dev_type>(host_data[i]);
+	clock_gettime(CLOCK_REALTIME,&time2);
+	CudaWrapper_AddCPUBufUploadTime(
+        time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000);
+			  CudaWrapper_UploadCudaData(temp_data, dev_data_array->dev_data, nbytes);
+			}
+			break;
+		}
+		
+		case xx:
+		{
+			if(typeid(host_type) == typeid(dev_type))
+				CudaWrapper_UploadCudaData(host_data, dev_data_array->dev_data, nbytes);
+			else
+			{
+    timespec time1,time2;
+    clock_gettime(CLOCK_REALTIME,&time1);
+				for(unsigned i=0; i<dev_data_array->dim[0]; ++i) temp_data[i] = static_cast<dev_type>(host_data[i]);
+	clock_gettime(CLOCK_REALTIME,&time2);
+	CudaWrapper_AddCPUBufUploadTime(
+        time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000);
+				CudaWrapper_UploadCudaData(temp_data, dev_data_array->dev_data, nbytes);
+			}
+			break;
+		}
+
+		case xy:
+		{
+    timespec time1,time2;
+    clock_gettime(CLOCK_REALTIME,&time1);
+			for(unsigned i=0; i<dev_data_array->dim[0]; ++i)
+			{
+				dev_type* temp = &temp_data[i * dev_data_array->dim[1]];
+				for(unsigned j=0; j<dev_data_array->dim[1]; ++j)
+				{
+					temp[j] = static_cast<dev_type>((reinterpret_cast<host_type**>(host_data))[i][j]);
+				}
+			}
+	clock_gettime(CLOCK_REALTIME,&time2);
+	CudaWrapper_AddCPUBufUploadTime(
+        time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000);
+			CudaWrapper_UploadCudaData(temp_data, dev_data_array->dev_data, nbytes);
+			break;
+		}
+		
+		case yx:
+		{
+    timespec time1,time2;
+    clock_gettime(CLOCK_REALTIME,&time1);
+			for(unsigned j=0; j<dev_data_array->dim[1]; ++j)
+			{
+				dev_type* temp = &temp_data[j*dev_data_array->dim[0]];
+				for(unsigned i=0; i<dev_data_array->dim[0]; ++i)
+				{
+					temp[i] = static_cast<dev_type>(reinterpret_cast<host_type**>(host_data)[i][j]);
+				}
+			}
+	clock_gettime(CLOCK_REALTIME,&time2);
+	CudaWrapper_AddCPUBufUploadTime(
+        time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000);
+			CudaWrapper_UploadCudaData(temp_data, dev_data_array->dev_data, nbytes);
+			break;
+		}	
+		case xyz:
+		{
+    timespec time1,time2;
+    clock_gettime(CLOCK_REALTIME,&time1);
+			for(unsigned i=0; i<dev_data_array->dim[0]; ++i)
+			for(unsigned j=0; j<dev_data_array->dim[1]; ++j)
+			{
+				dev_type* temp = &temp_data[(i*dev_data_array->dim[1]+j)*dev_data_array->dim[2]];
+				for(unsigned k=0; k<dev_data_array->dim[2]; ++k)
+				{
+					temp[k] = static_cast<dev_type>(reinterpret_cast<host_type***>(host_data)[i][j][k]);
+				}
+			}
+	clock_gettime(CLOCK_REALTIME,&time2);
+	CudaWrapper_AddCPUBufUploadTime(
+        time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000);
+			CudaWrapper_UploadCudaData(temp_data, dev_data_array->dev_data, nbytes);
+			break;
+		}	
+
+		case xzy:
+		{
+    timespec time1,time2;
+    clock_gettime(CLOCK_REALTIME,&time1);
+			for(unsigned i=0; i<dev_data_array->dim[0]; ++i)
+			for(unsigned k=0; k<dev_data_array->dim[2]; ++k)
+			{
+				dev_type* temp = &temp_data[(i*dev_data_array->dim[2]+k)*dev_data_array->dim[1]];
+				for(unsigned j=0; j<dev_data_array->dim[1]; ++j)
+				{
+					temp[j] = static_cast<dev_type>(reinterpret_cast<host_type***>(host_data)[i][j][k]);
+				}
+			}
+	clock_gettime(CLOCK_REALTIME,&time2);
+	CudaWrapper_AddCPUBufUploadTime(
+        time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000);
+			CudaWrapper_UploadCudaData(temp_data, dev_data_array->dev_data, nbytes);
+			break;
+		}	
+	}
+	// we have uploaded the data to the device, i.e.:
+	current_data_on_device = true;
+	// the data is going to change on the device, making the host data out-dated
+	if(will_be_changed) current_data_on_host = false;
+}
+
+template <typename host_type, typename dev_type, copy_mode mode>
+void cCudaData<host_type, dev_type, mode>
+::uploadAsync(int stream,bool will_be_changed)
+{
+	// if current data is already up, do not re-upload it
+//	if(current_data_on_device) return;
+    if(buffer&&is_continues)
+    {
+   	printf("Actual Buffer: %p %i\n",*buffer,*buf_size);
+    	if(typeid(host_type)==typeid(double))
+    	{
+    	  if(typeid(dev_type)==typeid(double))
+    	  {
+    	  	CudaData_Upload_DoubleDouble((void*) host_data,dev_data_array->dev_data,
+    	  										dev_data_array->dim,mode,*buffer);
+			current_data_on_device = true;
+			if(will_be_changed) current_data_on_host = false;
+			return;
+    	  }
+    	  else if(typeid(dev_type)==typeid(float))
+    	  {
+    	  	CudaData_Upload_DoubleFloat((void*) host_data,dev_data_array->dev_data,
+    	  										dev_data_array->dim,mode,*buffer);
+			current_data_on_device = true;
+			if(will_be_changed) current_data_on_host = false;
+			return;
+    	  }
+    	}
+    	else if(typeid(host_type)==typeid(float))
+    	{
+    	  if(typeid(dev_type)==typeid(double))
+    	  {
+    	  	CudaData_Upload_FloatDouble((void*) host_data,dev_data_array->dev_data,
+    	  										dev_data_array->dim,mode,*buffer);
+			current_data_on_device = true;
+			if(will_be_changed) current_data_on_host = false;
+			return;
+    	  }
+    	  else if(typeid(dev_type)==typeid(float))
+    	  {
+    	  	CudaData_Upload_FloatFloat((void*) host_data,dev_data_array->dev_data,
+    	  										dev_data_array->dim,mode,*buffer);
+			current_data_on_device = true;
+			if(will_be_changed) current_data_on_host = false;
+			return;
+    	  }
+    	}
+    	else if(typeid(host_type)==typeid(int))
+    	{
+    	  if(typeid(dev_type)==typeid(int))
+    	  {
+    	  	CudaData_Upload_IntInt((void*) host_data,dev_data_array->dev_data,
+    	  										dev_data_array->dim,mode,*buffer);
+			current_data_on_device = true;
+			if(will_be_changed) current_data_on_host = false;
+			return;
+    	  }
+    	}
+    }
+	switch(mode)
+	{
+		case x:
+		{
+			if(typeid(host_type) == typeid(dev_type))
+				CudaWrapper_UploadCudaDataAsync(host_data, dev_data_array->dev_data, nbytes,stream);
+			else
+			{
+    timespec time1,time2;
+    clock_gettime(CLOCK_REALTIME,&time1);
+			  for(unsigned i=0; i<dev_data_array->dim[0]; ++i) temp_data[i] = static_cast<dev_type>(host_data[i]);
+	clock_gettime(CLOCK_REALTIME,&time2);
+	CudaWrapper_AddCPUBufUploadTime(
+        time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000);
+			  CudaWrapper_UploadCudaDataAsync(temp_data, dev_data_array->dev_data, nbytes,stream);
+			}
+			break;
+		}
+		
+		case xx:
+		{
+			if(typeid(host_type) == typeid(dev_type))
+				CudaWrapper_UploadCudaDataAsync(host_data, dev_data_array->dev_data, nbytes,stream);
+			else
+			{
+    timespec time1,time2;
+    clock_gettime(CLOCK_REALTIME,&time1);
+				for(unsigned i=0; i<dev_data_array->dim[0]; ++i) temp_data[i] = static_cast<dev_type>(host_data[i]);
+	clock_gettime(CLOCK_REALTIME,&time2);
+	CudaWrapper_AddCPUBufUploadTime(
+        time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000);
+				CudaWrapper_UploadCudaDataAsync(temp_data, dev_data_array->dev_data, nbytes,stream);
+			}
+			break;
+		}
+
+		case xy:
+		{
+    timespec time1,time2;
+    clock_gettime(CLOCK_REALTIME,&time1);
+			for(unsigned i=0; i<dev_data_array->dim[0]; ++i)
+			{
+				dev_type* temp = &temp_data[i * dev_data_array->dim[1]];
+				for(unsigned j=0; j<dev_data_array->dim[1]; ++j)
+				{
+					temp[j] = static_cast<dev_type>((reinterpret_cast<host_type**>(host_data))[i][j]);
+				}
+			}
+	clock_gettime(CLOCK_REALTIME,&time2);
+	CudaWrapper_AddCPUBufUploadTime(
+        time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000);
+			CudaWrapper_UploadCudaDataAsync(temp_data, dev_data_array->dev_data, nbytes,stream);
+			break;
+		}
+		
+		case yx:
+		{
+    timespec time1,time2;
+    clock_gettime(CLOCK_REALTIME,&time1);
+			for(unsigned j=0; j<dev_data_array->dim[1]; ++j)
+			{
+				dev_type* temp = &temp_data[j*dev_data_array->dim[0]];
+				for(unsigned i=0; i<dev_data_array->dim[0]; ++i)
+				{
+					temp[i] = static_cast<dev_type>(reinterpret_cast<host_type**>(host_data)[i][j]);
+				}
+			}
+	clock_gettime(CLOCK_REALTIME,&time2);
+	CudaWrapper_AddCPUBufUploadTime(
+        time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000);
+			CudaWrapper_UploadCudaDataAsync(temp_data, dev_data_array->dev_data, nbytes,stream);
+			break;
+		}	
+		case xyz:
+		{
+    timespec time1,time2;
+    clock_gettime(CLOCK_REALTIME,&time1);
+			for(unsigned i=0; i<dev_data_array->dim[0]; ++i)
+			for(unsigned j=0; j<dev_data_array->dim[1]; ++j)
+			{
+				dev_type* temp = &temp_data[(i*dev_data_array->dim[1]+j)*dev_data_array->dim[2]];
+				for(unsigned k=0; k<dev_data_array->dim[2]; ++k)
+				{
+					temp[k] = static_cast<dev_type>(reinterpret_cast<host_type***>(host_data)[i][j][k]);
+				}
+			}
+	clock_gettime(CLOCK_REALTIME,&time2);
+	CudaWrapper_AddCPUBufUploadTime(
+        time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000);
+			CudaWrapper_UploadCudaDataAsync(temp_data, dev_data_array->dev_data, nbytes,stream);
+			break;
+		}	
+
+		case xzy:
+		{
+    timespec time1,time2;
+    clock_gettime(CLOCK_REALTIME,&time1);
+			for(unsigned i=0; i<dev_data_array->dim[0]; ++i)
+			for(unsigned k=0; k<dev_data_array->dim[2]; ++k)
+			{
+				dev_type* temp = &temp_data[(i*dev_data_array->dim[2]+k)*dev_data_array->dim[1]];
+				for(unsigned j=0; j<dev_data_array->dim[1]; ++j)
+				{
+					temp[j] = static_cast<dev_type>(reinterpret_cast<host_type***>(host_data)[i][j][k]);
+				}
+			}
+	clock_gettime(CLOCK_REALTIME,&time2);
+	CudaWrapper_AddCPUBufUploadTime(
+        time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000);
+			CudaWrapper_UploadCudaDataAsync(temp_data, dev_data_array->dev_data, nbytes,stream);
+			break;
+		}	
+	}
+	// we have uploaded the data to the device, i.e.:
+	current_data_on_device = true;
+	// the data is going to change on the device, making the host data out-dated
+	if(will_be_changed) current_data_on_host = false;
+}
+
+template <typename host_type, typename dev_type, copy_mode mode>
+void cCudaData<host_type, dev_type, mode>
+::download(bool will_be_changed)
+{
+	// if current data is already down, do not re-download it
+//	if(current_data_on_host) return;
+	switch(mode)
+	{
+		case x:
+		{
+			if(typeid(host_type) == typeid(dev_type))
+				CudaWrapper_DownloadCudaData(host_data, dev_data_array->dev_data, nbytes);
+			else
+			{
+				CudaWrapper_DownloadCudaData(temp_data, dev_data_array->dev_data, nbytes);
+    timespec time1,time2;
+    clock_gettime(CLOCK_REALTIME,&time1);
+				for(unsigned i=0; i<dev_data_array->dim[0]; ++i) host_data[i] = static_cast<host_type>(temp_data[i]);
+	clock_gettime(CLOCK_REALTIME,&time2);
+	CudaWrapper_AddCPUBufDownloadTime(
+        time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000);
+			}
+			break;
+		}
+		
+		case xx:
+		{
+			if(typeid(host_type) == typeid(dev_type))
+				CudaWrapper_DownloadCudaData(host_data, dev_data_array->dev_data, nbytes);
+			else
+			{
+				CudaWrapper_DownloadCudaData(temp_data, dev_data_array->dev_data, nbytes);
+    timespec time1,time2;
+    clock_gettime(CLOCK_REALTIME,&time1);
+				for(unsigned i=0; i<dev_data_array->dim[0]; ++i) host_data[i] = static_cast<host_type>(temp_data[i]);
+	clock_gettime(CLOCK_REALTIME,&time2);
+	CudaWrapper_AddCPUBufDownloadTime(
+        time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000);
+			}
+			break;
+		}
+		
+		case xy:
+		{
+			CudaWrapper_DownloadCudaData(temp_data, dev_data_array->dev_data, nbytes);
+    timespec time1,time2;
+    clock_gettime(CLOCK_REALTIME,&time1);
+			for(unsigned i=0; i<dev_data_array->dim[0]; ++i)
+			{
+				dev_type* temp = &temp_data[i * dev_data_array->dim[1]];
+				for(unsigned j=0; j<dev_data_array->dim[1]; ++j)
+				{
+					reinterpret_cast<host_type**>(host_data)[i][j] = static_cast<host_type>(temp[j]);
+				}
+			}
+	clock_gettime(CLOCK_REALTIME,&time2);
+	CudaWrapper_AddCPUBufDownloadTime(
+        time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000);
+			break;
+		}
+		
+		case yx:
+		{
+			CudaWrapper_DownloadCudaData(temp_data, dev_data_array->dev_data, nbytes);
+    timespec time1,time2;
+    clock_gettime(CLOCK_REALTIME,&time1);
+			for(unsigned j=0; j<dev_data_array->dim[1]; ++j)
+			{
+				dev_type* temp = &temp_data[j*dev_data_array->dim[0]];
+				for(unsigned i=0; i<dev_data_array->dim[0]; ++i)
+				{
+					reinterpret_cast<host_type**>(host_data)[i][j] = static_cast<host_type>(temp[i]);
+				}
+			}
+	clock_gettime(CLOCK_REALTIME,&time2);
+	CudaWrapper_AddCPUBufDownloadTime(
+        time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000);
+			break;
+		}
+
+		case xyz:
+		{
+			CudaWrapper_DownloadCudaData(temp_data, dev_data_array->dev_data, nbytes);
+    timespec time1,time2;
+    clock_gettime(CLOCK_REALTIME,&time1);
+			for(unsigned i=0; i<dev_data_array->dim[0]; ++i)
+			for(unsigned j=0; j<dev_data_array->dim[1]; ++j)
+			{
+				dev_type* temp = &temp_data[(i * dev_data_array->dim[1]+j)*dev_data_array->dim[2]];
+				for(unsigned k=0; k<dev_data_array->dim[2]; ++k)
+				{
+					reinterpret_cast<host_type***>(host_data)[i][j][k] = static_cast<host_type>(temp[k]);
+				}
+			}
+	clock_gettime(CLOCK_REALTIME,&time2);
+	CudaWrapper_AddCPUBufDownloadTime(
+        time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000);
+			break;
+		}
+
+		case xzy:
+		{
+			CudaWrapper_DownloadCudaData(temp_data, dev_data_array->dev_data, nbytes);
+    timespec time1,time2;
+    clock_gettime(CLOCK_REALTIME,&time1);
+			for(unsigned i=0; i<dev_data_array->dim[0]; ++i)
+			for(unsigned k=0; k<dev_data_array->dim[2]; ++k)
+			{
+				dev_type* temp = &temp_data[(i * dev_data_array->dim[2]+k)*dev_data_array->dim[1]];
+				for(unsigned j=0; j<dev_data_array->dim[1]; ++j)
+				{
+					reinterpret_cast<host_type***>(host_data)[i][j][k] = static_cast<host_type>(temp[j]);
+				}
+			}
+	clock_gettime(CLOCK_REALTIME,&time2);
+	CudaWrapper_AddCPUBufDownloadTime(
+        time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000);
+			break;
+		}
+	}
+	// we have downloaded the data to the host, i.e.:
+	current_data_on_host = true;
+	// the data is going to change on the host, making the device data out-dated
+	if(will_be_changed) current_data_on_device = false;
+}
+
+template <typename host_type, typename dev_type, copy_mode mode>
+void cCudaData<host_type, dev_type, mode>
+::downloadAsync(int stream)
+{
+	switch(mode)
+	{
+		case x:
+		{
+			if(typeid(host_type) == typeid(dev_type))
+			{
+				CudaWrapper_DownloadCudaDataAsync(host_data, dev_data_array->dev_data, nbytes, stream);
+				CudaWrapper_SyncStream(stream);
+			}		
+			else
+			{
+				CudaWrapper_DownloadCudaDataAsync(temp_data, dev_data_array->dev_data, nbytes, stream);
+				CudaWrapper_SyncStream(stream);
+				for(unsigned i=0; i<dev_data_array->dim[0]; ++i) host_data[i] = static_cast<host_type>(temp_data[i]);
+			}
+			break;
+		}
+		
+		case xx:
+		{
+			if(typeid(host_type) == typeid(dev_type))
+			{
+				CudaWrapper_DownloadCudaDataAsync(host_data, dev_data_array->dev_data, nbytes, stream);
+			    CudaWrapper_SyncStream(stream);
+			}
+			else
+			{
+				CudaWrapper_DownloadCudaDataAsync(temp_data, dev_data_array->dev_data, nbytes, stream);
+ 			    CudaWrapper_SyncStream(stream);
+				for(unsigned i=0; i<dev_data_array->dim[0]; ++i) host_data[i] = static_cast<host_type>(temp_data[i]);
+			}
+			break;
+		}
+
+		case xy:
+		{
+			CudaWrapper_DownloadCudaDataAsync(temp_data, dev_data_array->dev_data, nbytes, stream);
+			CudaWrapper_SyncStream(stream);
+			for(unsigned i=0; i<dev_data_array->dim[0]; ++i)
+			{
+				dev_type* temp = &temp_data[i * dev_data_array->dim[1]];
+				for(unsigned j=0; j<dev_data_array->dim[1]; ++j)
+				{
+					reinterpret_cast<host_type**>(host_data)[i][j] = static_cast<host_type>(temp[j]);
+				}
+			}
+			break;
+		}
+		
+		case yx:
+		{
+			CudaWrapper_DownloadCudaDataAsync(temp_data, dev_data_array->dev_data, nbytes, stream);
+			CudaWrapper_SyncStream(stream);
+			for(unsigned j=0; j<dev_data_array->dim[1]; ++j)
+			{
+				dev_type* temp = &temp_data[j*dev_data_array->dim[0]];
+				for(unsigned i=0; i<dev_data_array->dim[0]; ++i)
+				{
+					reinterpret_cast<host_type**>(host_data)[i][j] = static_cast<host_type>(temp[i]);
+				}
+			}
+			break;
+		}
+	}
+}
+
+
+template <typename host_type, typename dev_type, copy_mode mode>
+void cCudaData<host_type, dev_type, mode>
+::memset_device(int value)
+{
+   CudaWrapper_Memset(dev_data_array->dev_data,value, nbytes);
+}
+
+template <typename host_type, typename dev_type, copy_mode mode>
+void cCudaData<host_type, dev_type, mode>
+::set_buffer(void** abuffer,int* abuf_size,bool ais_continues)
+{
+   buffer = abuffer;
+   buf_size = abuf_size;
+   unsigned nbytes_buf=(nbytes/sizeof(dev_type))*sizeof(host_type);
+   if(buffer!=NULL)
+   if(not((typeid(host_type) == typeid(dev_type))&&(mode == x || mode == xx)))
+   {
+   	printf("Allocate Buffer: %p %i\n",*buffer,*buf_size);
+   	 if(((*buffer)!=NULL)&&(*buf_size<nbytes_buf))
+   	 CudaWrapper_FreeCudaData(*buffer,*buf_size);
+   	 if(*buf_size<nbytes_buf)
+   	 {*buffer=CudaWrapper_AllocCudaData(nbytes_buf);*buf_size=nbytes_buf;}
+   	printf("Allocate Buffer2: %p %i\n",*buffer,*buf_size);
+   	 
+   }
+   is_continues=ais_continues;
+}
+#endif // _CUDA_DATA_H_
diff --git a/src/USER-CUDA/cuda_modify_flags.h b/src/USER-CUDA/cuda_modify_flags.h
new file mode 100644
index 0000000000..f604cb5da3
--- /dev/null
+++ b/src/USER-CUDA/cuda_modify_flags.h
@@ -0,0 +1,39 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#ifndef CUDA_MODIFY_FLAGS_H
+#define CUDA_MODIFY_FLAGS_H
+
+#define INITIAL_INTEGRATE_CUDA  (1 << 16)
+#define POST_INTEGRATE_CUDA     (1 << 17)
+#define PRE_EXCHANGE_CUDA       (1 << 18)
+#define PRE_NEIGHBOR_CUDA       (1 << 19)
+#define PRE_FORCE_CUDA          (1 << 20)
+#define POST_FORCE_CUDA         (1 << 21)
+#define FINAL_INTEGRATE_CUDA    (1 << 22)
+#define END_OF_STEP_CUDA        (1 << 23)
+#define THERMO_ENERGY_CUDA      (1 << 24)
+#define MIN_POST_FORCE_CUDA      (1 << 25)
+// remember not to shift over 31 bits
+
+#endif // CUDA_MODIFY_FLAGS_H
diff --git a/src/USER-CUDA/cuda_neigh_list.cpp b/src/USER-CUDA/cuda_neigh_list.cpp
new file mode 100644
index 0000000000..e6d7a6f516
--- /dev/null
+++ b/src/USER-CUDA/cuda_neigh_list.cpp
@@ -0,0 +1,180 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#include "cuda_neigh_list.h"
+#include "neigh_list.h"
+#include <cstring>
+#include <vector>
+#include <map>
+#include <algorithm>
+#include "cuda.h"
+#include "atom.h"
+
+using namespace LAMMPS_NS;
+
+CudaNeighList::CudaNeighList(LAMMPS *lmp, class NeighList* neigh_list) : Pointers(lmp)
+{
+        cuda = lmp->cuda;
+	MYDBG(printf("# CUDA: CudaNeighList::cudaNeighList() ... start\n");)
+	this->neigh_list = neigh_list;
+	neigh_list->cuda_list=this;
+	sneighlist.maxlocal = neigh_list->get_maxlocal();
+	sneighlist.maxneighbors = 32;
+	sneighlist.maxcut = 0.0;
+	sneighlist.cutneighsq = NULL;
+	cu_neighbors = NULL;
+	cu_neighbors_border = NULL;
+	cu_neighbors_inner = NULL;
+	cu_numneigh_border = NULL;
+	cu_numneigh_inner = NULL;
+	cu_numneigh = NULL;
+	cu_ilist = NULL;
+	cu_ilist_border = NULL;
+	cu_inum_border = NULL;
+	inum_border = 0;
+	neighbors = NULL;
+	neighbors_inner = NULL;
+	neighbors_border = NULL;
+	numneigh_border = NULL;
+	numneigh_inner = NULL;
+	ilist_border = NULL;
+
+	build_cuda = false;
+	sneighlist.binned_id=NULL;
+	sneighlist.bin_dim=new int[3];
+	sneighlist.bin_dim[0]=0;
+	sneighlist.bin_dim[1]=0;
+	sneighlist.bin_dim[2]=0;
+	
+	cu_ex_type = NULL;
+	cu_ex1_bit = NULL;
+	cu_ex2_bit = NULL;
+	cu_ex_mol_bit = NULL;
+	sneighlist.nex_type=0;
+	sneighlist.nex_group=0;
+	sneighlist.nex_mol=0;
+	
+	sneighlist.bin_nmax=0;
+	sneighlist.bin_extraspace=0.05;
+	MYDBG(printf("# CUDA: CudaNeighList::cudaNeighList() ... end\n");)
+	
+}
+
+CudaNeighList::~CudaNeighList()
+{
+	dev_free();
+}
+
+void CudaNeighList::dev_alloc()
+{ 
+	MYDBG( printf("# CUDA: CudaNeighList::dev_alloc() ... start\n"); )
+	cu_ilist         = new cCudaData<int , int , x> (neigh_list->ilist   , & sneighlist.ilist     , sneighlist.maxlocal );
+	cu_numneigh      = new cCudaData<int , int , x> (neigh_list->numneigh, & sneighlist.numneigh  , sneighlist.maxlocal );
+	neighbors = new int[atom->nmax*sneighlist.maxneighbors];
+	cu_neighbors= new cCudaData<int, int, x> (neighbors			 		 , & sneighlist.neighbors, atom->nmax*sneighlist.maxneighbors );
+
+	if(cuda->shared_data.overlap_comm)
+	{
+	ilist_border  = new int[sneighlist.maxlocal];
+	numneigh_border	= new int[sneighlist.maxlocal];
+	numneigh_inner	= new int[sneighlist.maxlocal];
+	cu_inum_border  = new cCudaData<int , int , x> (&inum_border		 , & sneighlist.inum_border      , 1 );
+	cu_ilist_border  = new cCudaData<int , int , x> (ilist_border		 , & sneighlist.ilist_border     , sneighlist.maxlocal );
+	cu_numneigh_border	= new cCudaData<int , int , x> (numneigh_border  , & sneighlist.numneigh_border  , sneighlist.maxlocal );
+	cu_numneigh_inner 	= new cCudaData<int , int , x> (numneigh_inner   , & sneighlist.numneigh_inner   , sneighlist.maxlocal );
+	neighbors_border = new int[sneighlist.maxlocal*sneighlist.maxneighbors];
+	cu_neighbors_border= new cCudaData<int, int, x> (neighbors_border	 , & sneighlist.neighbors_border, sneighlist.maxlocal*sneighlist.maxneighbors );
+	neighbors_inner = new int[sneighlist.maxlocal*sneighlist.maxneighbors];
+	cu_neighbors_inner = new cCudaData<int, int, x> (neighbors_inner	 , & sneighlist.neighbors_inner , sneighlist.maxlocal*sneighlist.maxneighbors );
+	}
+	MYDBG( printf("# CUDA: CudaNeighList::dev_alloc() ... end\n"); )
+}
+
+void CudaNeighList::dev_free()
+{
+	MYDBG( printf("# CUDA: CudaNeighList::dev_free() ... start\n"); )
+	delete cu_numneigh;
+	delete cu_ilist;
+	delete [] neighbors;
+	delete cu_neighbors;
+
+	if(cuda->shared_data.overlap_comm)
+	{
+	delete [] ilist_border;
+	delete [] numneigh_border;
+	delete [] numneigh_inner;
+	delete [] neighbors_border;
+	delete [] neighbors_inner;
+	delete cu_inum_border;
+	delete cu_neighbors_border;
+	delete cu_neighbors_inner;
+	delete cu_numneigh_border;
+	delete cu_numneigh_inner;
+	delete cu_ilist_border;
+	}
+	MYDBG( printf("# CUDA: CudaNeighList::dev_free() ... end\n"); )
+}
+
+void CudaNeighList::grow_device()
+{
+	MYDBG(printf("# CUDA: CudaNeighList::grow_device() ... start\n");)
+	// if host has allocated more memory for atom arrays than device has, then allocate more memory on device
+	int new_maxlocal = neigh_list->get_maxlocal();
+	if(sneighlist.maxlocal < new_maxlocal)
+	{
+		sneighlist.maxlocal = new_maxlocal;
+		dev_free();
+		dev_alloc();
+	}
+
+	if(!cu_ilist || !cu_numneigh) dev_alloc();
+
+	// check, if hosts data has been allocated somewhere else
+	if(cu_ilist   ->get_host_data() != neigh_list->ilist)    cu_ilist   ->set_host_data(neigh_list->ilist);
+	if(cu_numneigh->get_host_data() != neigh_list->numneigh) cu_numneigh->set_host_data(neigh_list->numneigh);
+		
+	MYDBG(printf("# CUDA: CudaNeighList::grow_device() ... end\n");)
+}
+
+
+void CudaNeighList::nl_upload(bool will_be_changed)
+{
+	//return;
+	MYDBG(printf("# CUDA: CudaNeighList::nl_upload() ... start\n");)
+	if(cu_ilist)
+	cu_ilist->upload();
+	if(cu_numneigh)
+	cu_numneigh->upload();
+	MYDBG(printf("# CUDA: CudaNeighList::nl_upload() ... end\n");)
+}
+
+void CudaNeighList::nl_download(bool will_be_changed)
+{
+	MYDBG(printf("# CUDA: CudaNeighList::nl_download() ... start\n");)
+	if(cu_ilist)
+	cu_ilist->download();
+	if(cu_numneigh)
+	cu_numneigh->download();
+	MYDBG(printf("# CUDA: CudaNeighList::nl_download() ... end\n");)
+}
+
diff --git a/src/USER-CUDA/cuda_neigh_list.h b/src/USER-CUDA/cuda_neigh_list.h
new file mode 100644
index 0000000000..97e75b9390
--- /dev/null
+++ b/src/USER-CUDA/cuda_neigh_list.h
@@ -0,0 +1,83 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#ifndef LMP_NEIGH_LIST_CUDA_H
+#define LMP_NEIGH_LIST_CUDA_H
+
+#include "pointers.h"
+#include "cuda_data.h"
+#include "neigh_list.h"
+
+namespace LAMMPS_NS
+{
+
+class CudaNeighList : protected Pointers
+{
+	public:
+		cCudaData<int , int , x>*  cu_ilist;
+		cCudaData<int , int , x>*  cu_numneigh;
+		cCudaData<int , int , x>*  cu_inum_border;
+		cCudaData<int , int , x>*  cu_ilist_border;
+		cCudaData<int , int , x>*  cu_numneigh_border;
+		cCudaData<int , int , x>*  cu_numneigh_inner;
+		cCudaData<int , int , x>*  cu_neighbors;
+		cCudaData<int , int , x>*  cu_neighbors_border;
+		cCudaData<int , int , x>*  cu_neighbors_inner;
+		cCudaData<int , int , x>*  cu_ex_type;
+		cCudaData<int , int , x>*  cu_ex1_bit;
+		cCudaData<int , int , x>*  cu_ex2_bit;
+		cCudaData<int , int , x>*  cu_ex_mol_bit;
+		
+		
+		cuda_shared_neighlist sneighlist;
+		
+		int* neighbors;
+		int* neighbors_inner;
+		int* neighbors_border;
+		int inum_border;
+		int* ilist_border;
+		int* numneigh_border;
+		int* numneigh_inner;
+		int nex_type;
+		int nex_group;
+		int nex_mol;
+		
+		bool build_cuda;
+		
+		CudaNeighList(class LAMMPS *, class NeighList* neigh_list);
+		~CudaNeighList();
+		void grow_device(); // will grow pages memory on device, keeping old pages. will grow lists memory on device, deleting old lists
+		void nl_upload(bool will_be_changed=true);
+		void nl_download(bool will_be_changed=true);
+		NeighList* neigh_list;
+			
+		void dev_alloc();
+		void dev_free();
+
+ private:
+  class Cuda *cuda;
+};
+
+}
+
+#endif
diff --git a/src/USER-CUDA/cuda_precision.h b/src/USER-CUDA/cuda_precision.h
new file mode 100644
index 0000000000..5b7d6a6843
--- /dev/null
+++ b/src/USER-CUDA/cuda_precision.h
@@ -0,0 +1,269 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#ifndef CUDA_PRECISION_H_
+#define CUDA_PRECISION_H_
+/* This File gives Type definitions for mixed precision calculation in the cuda part of LAMMPS-CUDA.
+ * Predefined behaviour is given by global CUDA_PRECISION (can be overwritten during compilation).
+ * ***_FLOAT: type definition of given property
+ * ***_F: constant extension in code (1.0 is interpreted as double while 1.0f is interpreted as float, now use: 1.0CUDA_F)
+ */
+
+#ifdef CUDA_USE_BINNING
+#define CUDA_IF_BINNING(a) a
+#else
+#define CUDA_IF_BINNING(a) 
+#endif
+
+//GLOBAL
+
+#ifdef CUDA_PRECISION
+  #if CUDA_PRECISION == 1
+    #define CUDA_FLOAT float
+    #define CUDA_F(x) x##f
+  #endif 
+  #if CUDA_PRECISION == 2
+    #define CUDA_FLOAT double
+    #define CUDA_F(x) x
+  #endif
+#endif
+ 
+#ifndef CUDA_PRECISION
+  #define CUDA_FLOAT double
+  #define CUDA_F(x) x
+  #define CUDA_PRECISION 2
+#endif
+//--------------------------------
+//-----------FFT-----------------
+//--------------------------------
+
+#ifdef FFT_PRECISION_CU
+  #if FFT_PRECISION_CU == 1
+    #define FFT_FLOAT float
+    #define FFT_F(x) x##f 
+  #endif 
+  #if FFT_PRECISION_CU == 2
+    #define FFT_FLOAT double
+    #define FFT_F(x) x 
+  #endif
+#endif
+
+#ifndef FFT_PRECISION_CU
+  #define FFT_FLOAT CUDA_FLOAT
+  #define FFT_F(x) CUDA_F(x)
+  #define FFT_PRECISION_CU CUDA_PRECISION 
+#endif
+
+//--------------------------------
+//-----------PPPM-----------------
+//--------------------------------
+
+#ifdef PPPM_PRECISION
+  #if PPPM_PRECISION == 1
+    #define PPPM_FLOAT float
+    #define PPPM_F(x) x##f 
+  #endif 
+  #if PPPM_PRECISION == 2
+    #define PPPM_FLOAT double
+    #define PPPM_F(x) x 
+  #endif
+#endif
+
+#ifndef PPPM_PRECISION
+  #define PPPM_FLOAT CUDA_FLOAT
+  #define PPPM_F(x) CUDA_F(x) 
+  #define PPPM_PRECISION CUDA_PRECISION 
+#endif
+
+//--------------------------------
+//-----------FORCE-----------------
+//--------------------------------
+
+
+#ifdef F_PRECISION
+  #if F_PRECISION == 1
+    #define F_FLOAT float
+    #define F_F(x) x##f 
+  #endif 
+  #if F_PRECISION == 2
+    #define F_FLOAT double
+    #define F_F(x) x 
+  #endif
+#endif
+
+#ifndef F_PRECISION
+  #define F_FLOAT CUDA_FLOAT
+  #define F_F(x) CUDA_F(x) 
+  #define F_PRECISION CUDA_PRECISION 
+#endif
+
+#if F_PRECISION == 1
+#define _SQRT_ sqrtf
+#define _RSQRT_ rsqrtf
+#define _EXP_ expf
+#else
+#define _SQRT_ sqrt
+#define _RSQRT_ rsqrt
+#define _EXP_ exp
+#endif
+
+#if F_PRECISION == 2
+struct F_FLOAT2
+{
+  F_FLOAT x;
+  F_FLOAT y;
+};
+struct F_FLOAT3
+{
+  F_FLOAT x;
+  F_FLOAT y;
+  F_FLOAT z;
+};
+struct F_FLOAT4
+{
+  F_FLOAT x;
+  F_FLOAT y;
+  F_FLOAT z;
+  F_FLOAT w;
+};
+#else
+#define F_FLOAT2 float2
+#define F_FLOAT3 float3
+#define F_FLOAT4 float4
+#endif
+//--------------------------------
+//-----------ENERGY-----------------
+//--------------------------------
+
+#ifndef ENERGY_PRECISION
+  #define ENERGY_FLOAT CUDA_FLOAT
+  #define ENERGY_F(x) CUDA_F(x) 
+#endif
+
+#ifdef ENERGY_PRECISION
+  #if ENERGY_PRECISION == 1
+    #define ENERGY_FLOAT float
+    #define ENERGY_F(x) x##f 
+  #endif 
+  #if ENERGY_PRECISION == 2
+    #define ENERGY_FLOAT double
+    #define ENERGY_F(x) x 
+  #endif
+#endif
+
+#ifndef ENERGY_PRECISION
+  #define ENERGY_FLOAT CUDA_FLOAT
+  #define ENERGY_F(x) CUDA_F(x) 
+  #define ENERGY_PRECISION CUDA_PRECISION 
+#endif
+
+//--------------------------------
+//-----------POSITIONS------------
+//--------------------------------
+
+#ifdef X_PRECISION
+  #if X_PRECISION == 1
+    #define X_FLOAT float
+    #define X_F(x) x##f 
+  #endif 
+  #if X_PRECISION == 2
+    #define X_FLOAT double
+    #define X_F(x) x 
+  #endif
+#endif
+
+#ifndef X_PRECISION
+  #define X_FLOAT CUDA_FLOAT
+  #define X_F(x) CUDA_F(x) 
+  #define X_PRECISION CUDA_PRECISION 
+#endif
+
+#if X_PRECISION == 2
+struct X_FLOAT2
+{
+  X_FLOAT x;
+  X_FLOAT y;
+};
+struct X_FLOAT3
+{
+  X_FLOAT x;
+  X_FLOAT y;
+  X_FLOAT z;
+};
+struct X_FLOAT4
+{
+  X_FLOAT x;
+  X_FLOAT y;
+  X_FLOAT z;
+  X_FLOAT w;
+};
+#else
+#define X_FLOAT2 float2
+#define X_FLOAT3 float3
+#define X_FLOAT4 float4
+#endif
+
+//--------------------------------
+//-----------velocities-----------
+//--------------------------------
+
+#ifdef V_PRECISION
+  #if V_PRECISION == 1
+    #define V_FLOAT float
+    #define V_F(x) x##f  
+  #endif 
+  #if V_PRECISION == 2
+    #define V_FLOAT double
+    #define V_F(x) x  
+  #endif
+#endif
+
+#ifndef V_PRECISION
+  #define V_FLOAT CUDA_FLOAT
+  #define V_F(x) CUDA_F(x) 
+  #define V_PRECISION CUDA_PRECISION 
+#endif
+
+#if V_PRECISION == 2
+struct V_FLOAT4
+{
+  V_FLOAT x;
+  V_FLOAT y;
+  V_FLOAT z;
+  V_FLOAT w;
+};
+#else
+#define V_FLOAT4 float4
+#endif
+
+#ifdef NO_PREC_TIMING
+struct timespec_2
+{
+	unsigned int tv_sec;
+	unsigned int tv_nsec;
+};
+
+#define timespec timespec_2
+#define clock_gettime(a,b) 
+#endif
+#endif /*CUDA_PRECISION_H_*/
diff --git a/src/USER-CUDA/cuda_shared.h b/src/USER-CUDA/cuda_shared.h
new file mode 100644
index 0000000000..f7983fff05
--- /dev/null
+++ b/src/USER-CUDA/cuda_shared.h
@@ -0,0 +1,378 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#ifndef _CUDA_SHARED_H_
+#define _CUDA_SHARED_H_
+#include "cuda_precision.h"
+
+#define CUDA_MAX_DEBUG_SIZE 1000 //size of debugdata array (allows for so many doubles or twice as many int)
+
+struct dev_array
+{
+	void* dev_data;			// pointer to memory address on cuda device
+	unsigned dim[3];		// array dimensions
+};
+
+struct cuda_shared_atom		// relevent data from atom class
+{
+	dev_array dx; 			// cumulated distance for binning settings
+	dev_array x;			// position
+	dev_array v;			// velocity
+	dev_array f;			// force
+	dev_array tag;
+	dev_array type; 		// global ID number, there are ghosttype = ntypes  (ntypescuda=ntypes+1)   
+	dev_array mask;
+	dev_array image; 		
+	dev_array q;			// charges
+	dev_array mass;			// per-type masses
+	dev_array rmass;		// per-atom masses
+	dev_array radius;		// per-atom radius
+	dev_array density;
+	dev_array omega;
+	dev_array torque;
+	dev_array molecule;
+	
+	dev_array special;
+	int maxspecial;
+	dev_array nspecial;
+	int* special_flag;
+	int molecular;
+	
+	dev_array eatom;		// per-atom energy
+	dev_array vatom;		// per-atom virial
+	int need_eatom;
+	int need_vatom;
+	
+	dev_array x_type;		// position + type in X_FLOAT4 struct
+	dev_array v_radius;		// velociyt + radius in V_FLOAT4 struct currently only used for granular atom_style
+	dev_array omega_rmass;		// velociyt + radius in V_FLOAT4 struct currently only used for granular atom_style
+
+	double* mass_host;		// remember per-type host pointer to masses
+	//int natoms;				// total # of atoms in system, could be 0
+	int nghost;				// and ghost atoms on this proc
+	int nlocal;				// # of owned
+	int nall;			    // total # of atoms in this proc
+	int nmax;				// max # of owned+ghost in arrays on this proc 	
+	int ntypes;
+	int q_flag;				// do we have charges?
+	int rmass_flag;			// do we have per-atom masses?
+	int firstgroup;
+	int nfirst;
+	
+ 	int update_nlocal;
+ 	int update_nmax;
+ 	
+	dev_array xhold;	    // position at last neighboring
+ 	X_FLOAT triggerneighsq;		// maximum square movement before reneighboring
+ 	int reneigh_flag;		// is reneighboring necessary
+ 	int maxhold;			// size of xhold
+ 	int dist_check; 		//perform distance check for reneighboring
+ 	dev_array binned_id;    //id of each binned atom (not tag!!)
+ 	dev_array binned_idnew; //new id of each binned atom for sorting basically setting atom[binned_id[k]] at atom[binned_newid[k]]
+    float bin_extraspace;	
+	int bin_dim[3];
+	int bin_nmax;
+	dev_array map_array;
+};
+
+struct cuda_shared_pair		// relevent data from pair class
+{
+	char cudable_force;		// check for (cudable_force!=0)
+	X_FLOAT cut_global;
+	X_FLOAT cut_inner_global;
+	X_FLOAT cut_coul_global;
+	double** cut;			// type-type cutoff 
+	double** cutsq;			// type-type cutoff 
+	double** cut_inner;			// type-type cutoff for coul
+	double** cut_coul;			// type-type cutoff for coul
+	double** coeff1;		// tpye-type pair parameters
+	double** coeff2;
+	double** coeff3;
+	double** coeff4;
+	double** coeff5;		
+	double** coeff6;		
+	double** coeff7;		
+	double** coeff8;		
+	double** coeff9;		
+	double** coeff10;		
+	double** offset;
+	double* special_lj;
+	double* special_coul;
+	dev_array virial; // ENERGY_FLOAT
+	dev_array eng_vdwl; // ENERGY_FLOAT
+	dev_array eng_coul; // ENERGY_FLOAT
+	X_FLOAT cut_coulsq_global;
+	F_FLOAT g_ewald,kappa;
+	int freeze_group_bit;
+	
+	dev_array coeff1_gm;
+	dev_array coeff2_gm;
+	dev_array coeff3_gm;
+	dev_array coeff4_gm;
+	dev_array coeff5_gm;
+	dev_array coeff6_gm;
+	dev_array coeff7_gm;
+	dev_array coeff8_gm;
+	dev_array coeff9_gm;
+	dev_array coeff10_gm;
+	
+	int lastgridsize;
+	int n_energy_virial;
+	int collect_forces_later;
+	int use_block_per_atom;
+	int override_block_per_atom;
+	
+};
+
+struct cuda_shared_domain	// relevent data from domain class
+{
+	X_FLOAT sublo[3];			// orthogonal box -> sub-box bounds on this proc
+	X_FLOAT subhi[3];
+	X_FLOAT boxlo[3];
+	X_FLOAT boxhi[3];
+	X_FLOAT prd[3];
+	int periodicity[3];		// xyz periodicity as array
+
+	int triclinic;
+	X_FLOAT xy;
+	X_FLOAT xz;
+	X_FLOAT yz;
+	X_FLOAT boxlo_lamda[3];
+	X_FLOAT boxhi_lamda[3];
+	X_FLOAT prd_lamda[3];
+	X_FLOAT h[6];
+	X_FLOAT h_inv[6];
+	V_FLOAT h_rate[6];
+	int update;
+};
+
+struct cuda_shared_pppm
+{
+   char cudable_force;
+#ifdef FFT_CUFFT  
+   FFT_FLOAT* work1;
+   FFT_FLOAT* work2;
+   FFT_FLOAT* work3;
+   PPPM_FLOAT* greensfn;
+   PPPM_FLOAT* fkx;
+   PPPM_FLOAT* fky;
+   PPPM_FLOAT* fkz;
+   PPPM_FLOAT* vg;
+#endif
+   int* part2grid;
+   PPPM_FLOAT* density_brick;
+   int* density_brick_int;
+   PPPM_FLOAT density_intScale;
+   PPPM_FLOAT* vdx_brick;
+   PPPM_FLOAT* vdy_brick;
+   PPPM_FLOAT* vdz_brick;
+   PPPM_FLOAT* density_fft;
+   ENERGY_FLOAT* energy;
+   ENERGY_FLOAT* virial;
+   int nxlo_in;
+   int nxhi_in;
+   int nxlo_out;
+   int nxhi_out;
+   int nylo_in;
+   int nyhi_in;
+   int nylo_out;
+   int nyhi_out;
+   int nzlo_in;
+   int nzhi_in;
+   int nzlo_out;
+   int nzhi_out;
+   int nx_pppm;
+   int ny_pppm;
+   int nz_pppm;
+   PPPM_FLOAT qqrd2e;
+   int order;
+  // float3 sublo;
+   PPPM_FLOAT* rho_coeff;
+   int nmax;
+   int nlocal;
+   PPPM_FLOAT* debugdata;
+   PPPM_FLOAT delxinv;
+   PPPM_FLOAT delyinv;
+   PPPM_FLOAT delzinv;
+   int nlower;
+   int nupper;
+   PPPM_FLOAT shiftone;
+   
+};
+
+struct cuda_shared_comm
+{
+   int maxswap;
+   int maxlistlength;
+   dev_array pbc;
+   dev_array slablo;
+   dev_array slabhi;
+   dev_array multilo;
+   dev_array multihi;
+   dev_array sendlist;
+   int grow_flag;
+   int comm_phase;
+   
+   int nsend;
+   int* nsend_swap;
+   int* send_size;
+   int* recv_size; 
+   double** buf_send;
+   void** buf_send_dev;
+   double** buf_recv;
+   void** buf_recv_dev;
+   void* buffer;
+   int buffer_size;
+   double overlap_split_ratio;
+};
+
+struct cuda_shared_neighlist // member of CudaNeighList, has no instance in cuda_shared_data
+{
+	int maxlocal;
+	int inum;                // # of I atoms neighbors are stored for local indices of I atoms
+	int inum_border2;
+	dev_array inum_border;         // # of atoms which interact with border atoms
+	dev_array ilist;
+	dev_array ilist_border;
+	dev_array numneigh;
+	dev_array numneigh_inner;
+	dev_array numneigh_border;
+	dev_array firstneigh;
+	dev_array neighbors;
+	dev_array neighbors_border;
+	dev_array neighbors_inner;
+	int maxpage;
+	dev_array page_pointers;
+	dev_array* pages;
+	int maxneighbors;
+	int neigh_lists_per_page;
+	double** cutneighsq;
+	CUDA_FLOAT* cu_cutneighsq;
+	int* binned_id;
+	int* bin_dim;
+	int bin_nmax;
+	float bin_extraspace;
+	double maxcut;
+	dev_array ex_type;
+	int nex_type;
+	dev_array ex1_bit;
+	dev_array ex2_bit;
+	int nex_group;
+	dev_array ex_mol_bit;
+	int nex_mol;
+	
+};
+
+struct cuda_compile_settings		// this is used to compare compile settings (i.e. precision) of the cu files, and the cpp files
+{
+    int prec_glob;
+    int prec_x;
+    int prec_v;
+    int prec_f;
+    int prec_pppm;
+    int prec_fft;
+    int cufft;
+    int arch;
+};
+
+struct cuda_timings_struct
+{
+	//Debug:
+	double test1;
+	double test2;
+	//transfers
+	double transfer_upload_tmp_constr;
+	double transfer_download_tmp_deconstr;
+	
+	//communication
+	double comm_forward_total;
+	double comm_forward_mpi_upper;
+	double comm_forward_mpi_lower;
+	double comm_forward_kernel_pack;
+	double comm_forward_kernel_unpack;
+	double comm_forward_kernel_self;
+	double comm_forward_upload;
+	double comm_forward_download;
+
+	double comm_exchange_total;
+	double comm_exchange_mpi;
+	double comm_exchange_kernel_pack;
+	double comm_exchange_kernel_unpack;
+	double comm_exchange_kernel_fill;
+	double comm_exchange_cpu_pack;
+	double comm_exchange_upload;
+	double comm_exchange_download;
+
+	double comm_border_total;
+	double comm_border_mpi;
+	double comm_border_kernel_pack;
+	double comm_border_kernel_unpack;
+	double comm_border_kernel_self;
+	double comm_border_kernel_buildlist;
+	double comm_border_upload;
+	double comm_border_download;
+	
+	//pair forces
+	double pair_xtype_conversion;
+	double pair_kernel;
+	double pair_virial;
+	double pair_force_collection;
+	
+	//neighbor
+	double neigh_bin;
+	double neigh_build;
+	double neigh_special;
+	
+	//PPPM
+ 	double pppm_particle_map; 
+    double pppm_make_rho; 
+    double pppm_brick2fft; 
+    double pppm_poisson; 
+    double pppm_fillbrick; 
+    double pppm_fieldforce; 
+    double pppm_compute; 
+	
+};
+
+struct cuda_shared_data		// holds space for all relevent data from the different classes
+{
+	void* buffer; //holds temporary GPU data [data used in subroutines, which has not to be consistend outside of that routine]
+	int buffersize; //maxsize of buffer
+	int buffer_new; //should be 1 if the pointer to buffer has changed
+	void* flag;
+	void* debugdata;  //array for easily collecting debugdata from device class cuda contains the corresponding cu_debugdata and host array
+	cuda_shared_atom atom;
+	cuda_shared_pair pair;	
+	cuda_shared_domain domain;
+	cuda_shared_pppm pppm;
+	cuda_shared_comm comm;
+	cuda_compile_settings compile_settings;
+	cuda_timings_struct cuda_timings;
+	int exchange_dim;
+	int me; //mpi rank
+	unsigned int datamask;
+	int overlap_comm;
+};
+
+
+#endif // #ifndef _CUDA_SHARED_H_
diff --git a/src/USER-CUDA/domain_cuda.cpp b/src/USER-CUDA/domain_cuda.cpp
new file mode 100644
index 0000000000..fc8d8bb498
--- /dev/null
+++ b/src/USER-CUDA/domain_cuda.cpp
@@ -0,0 +1,270 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under 
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author (triclinic) : Pieter in 't Veld (SNL)
+------------------------------------------------------------------------- */
+
+#include "mpi.h"
+#include "stdlib.h"
+#include "string.h"
+#include "stdio.h"
+#include "math.h"
+#include "domain_cuda.h"
+#include "style_region.h"
+#include "atom.h"
+#include "force.h"
+#include "update.h"
+#include "modify.h"
+#include "fix.h"
+#include "fix_deform.h"
+#include "region.h"
+#include "lattice.h"
+#include "comm.h"
+#include "memory.h"
+#include "error.h"
+
+#include "cuda.h"
+#include "domain_cu.h"
+
+using namespace LAMMPS_NS;
+
+#define BIG   1.0e20
+#define SMALL 1.0e-4
+#define DELTA 1
+#define MIN(a,b) ((a) < (b) ? (a) : (b))
+#define MAX(a,b) ((a) > (b) ? (a) : (b))
+
+enum{NO_REMAP,X_REMAP,V_REMAP};                   // same as fix_deform.cpp
+
+/* ----------------------------------------------------------------------
+   default is periodic 
+------------------------------------------------------------------------- */
+
+DomainCuda::DomainCuda(LAMMPS *lmp) : Domain(lmp)
+{
+  cuda = lmp->cuda;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void DomainCuda::init()
+{
+  Domain::init();
+
+  if(not cuda->finished_run)
+  {
+    cuda->setDomainParams();
+    Cuda_Domain_Init(&cuda->shared_data);
+  }
+}
+
+/* ----------------------------------------------------------------------
+   set global box params
+   assumes boxlo/hi and triclinic tilts are already set
+------------------------------------------------------------------------- */
+
+void DomainCuda::set_global_box()
+{
+  Domain::set_global_box();
+
+  if(not cuda->finished_run)
+  {
+    cuda->setDomainParams();
+  }
+}
+
+/* ----------------------------------------------------------------------
+   set lamda box params, only need be done one time
+   assumes global box is defined and proc assignment has been made by comm
+   for uppermost proc, insure subhi = 1.0 (in case round-off occurs)
+------------------------------------------------------------------------- */
+
+void DomainCuda::set_lamda_box()
+{
+  Domain::set_lamda_box();
+
+  if(not cuda->finished_run)
+  {
+    cuda->setDomainParams();
+  }
+}
+
+/* ----------------------------------------------------------------------
+   set local subbox params
+   assumes global box is defined and proc assignment has been made
+   for uppermost proc, insure subhi = boxhi (in case round-off occurs)
+------------------------------------------------------------------------- */
+
+void DomainCuda::set_local_box()
+{
+  Domain::set_local_box();
+
+  if(not cuda->finished_run)
+  {
+   // cuda->setDomainParams();
+    //Cuda_Domain_Init(&cuda->shared_data);
+  }
+}
+
+/* ----------------------------------------------------------------------
+   reset global & local boxes due to global box boundary changes
+   if shrink-wrapped, determine atom extent and reset boxlo/hi
+   if shrink-wrapped and triclinic, perform shrink-wrap in box coords
+------------------------------------------------------------------------- */
+
+void DomainCuda::reset_box()
+{
+  if (nonperiodic == 2) {
+
+    // convert back to box coords for shrink-wrap operation
+
+    if (triclinic) lamda2x(atom->nlocal);
+
+    // compute extent of atoms on this proc
+
+    double extent[3][2],all[3][2];
+
+    extent[2][0] = extent[1][0] = extent[0][0] = BIG;
+    extent[2][1] = extent[1][1] = extent[0][1] = -BIG;
+
+    double **x = atom->x;
+    int nlocal = atom->nlocal;
+
+    if (cuda->finished_setup&&(!cuda->oncpu))
+      {
+        extent[0][0]=cuda->extent[0];
+        extent[0][1]=cuda->extent[1];
+        extent[1][0]=cuda->extent[2];
+        extent[1][1]=cuda->extent[3];
+        extent[2][0]=cuda->extent[4];
+        extent[2][1]=cuda->extent[5];
+      }
+    else
+      for (int i = 0; i < nlocal; i++) {
+        extent[0][0] = MIN(extent[0][0],x[i][0]);
+        extent[0][1] = MAX(extent[0][1],x[i][0]);
+        extent[1][0] = MIN(extent[1][0],x[i][1]);
+        extent[1][1] = MAX(extent[1][1],x[i][1]);
+        extent[2][0] = MIN(extent[2][0],x[i][2]);
+        extent[2][1] = MAX(extent[2][1],x[i][2]);
+      }
+
+    // compute extent across all procs
+    // flip sign of MIN to do it in one Allreduce MAX
+
+    extent[0][0] = -extent[0][0];
+    extent[1][0] = -extent[1][0];
+    extent[2][0] = -extent[2][0];
+
+    MPI_Allreduce(extent,all,6,MPI_DOUBLE,MPI_MAX,world);
+
+    // in shrink-wrapped dims, set box by atom extent
+    // if minimum set, enforce min box size settings
+
+    if (xperiodic == 0) {
+      if (boundary[0][0] == 2) boxlo[0] = -all[0][0] - SMALL;
+      else if (boundary[0][0] == 3) boxlo[0] = MIN(-all[0][0]-SMALL,minxlo);
+      if (boundary[0][1] == 2) boxhi[0] = all[0][1] + SMALL;
+      else if (boundary[0][1] == 3) boxhi[0] = MAX(all[0][1]+SMALL,minxhi);
+      if (boxlo[0] > boxhi[0]) error->all("Illegal simulation box");
+    }
+    if (yperiodic == 0) {
+      if (boundary[1][0] == 2) boxlo[1] = -all[1][0] - SMALL;
+      else if (boundary[1][0] == 3) boxlo[1] = MIN(-all[1][0]-SMALL,minylo);
+      if (boundary[1][1] == 2) boxhi[1] = all[1][1] + SMALL;
+      else if (boundary[1][1] == 3) boxhi[1] = MAX(all[1][1]+SMALL,minyhi);
+      if (boxlo[1] > boxhi[1]) error->all("Illegal simulation box");
+    }
+    if (zperiodic == 0) {
+      if (boundary[2][0] == 2) boxlo[2] = -all[2][0] - SMALL;
+      else if (boundary[2][0] == 3) boxlo[2] = MIN(-all[2][0]-SMALL,minzlo);
+      if (boundary[2][1] == 2) boxhi[2] = all[2][1] + SMALL;
+      else if (boundary[2][1] == 3) boxhi[2] = MAX(all[2][1]+SMALL,minzhi);
+      if (boxlo[2] > boxhi[2]) error->all("Illegal simulation box");
+    }
+  }
+
+  set_global_box();
+  set_local_box();
+
+  if(not cuda->finished_run)
+  {
+    cuda->setDomainParams();
+    Cuda_Domain_Init(&cuda->shared_data);
+  }
+
+  // if shrink-wrapped, convert to lamda coords for new box
+  // must re-invoke pbc() b/c x2lamda result can be outside 0,1 due to roundoff
+
+  if (nonperiodic == 2 && triclinic) {
+    x2lamda(atom->nlocal);
+    pbc();
+  }
+}
+
+/* ----------------------------------------------------------------------
+   enforce PBC and modify box image flags for each atom
+   called every reneighboring and by other commands that change atoms
+   resulting coord must satisfy lo <= coord < hi
+   MAX is important since coord - prd < lo can happen when coord = hi
+   if fix deform, remap velocity of fix group atoms by box edge velocities
+   for triclinic, atoms must be in lamda coords (0-1) before pbc is called
+   image = 10 bits for each dimension
+   increment/decrement in wrap-around fashion
+------------------------------------------------------------------------- */
+
+void DomainCuda::pbc()
+{
+  if(cuda->finished_setup&&(!cuda->oncpu))
+  {
+  	cuda->setDomainParams();
+    Cuda_Domain_PBC(&cuda->shared_data, deform_vremap, deform_groupbit,cuda->extent);
+    return;
+  }
+  
+  Domain::pbc();
+}
+
+
+/* ----------------------------------------------------------------------
+   convert triclinic 0-1 lamda coords to box coords for all N atoms
+   x = H lamda + x0;
+------------------------------------------------------------------------- */
+
+void DomainCuda::lamda2x(int n)
+{
+  if(cuda->finished_setup&&(!cuda->oncpu))
+  {
+    Cuda_Domain_lamda2x(&cuda->shared_data,n);
+    return;
+  }
+
+  Domain::lamda2x(n);
+}
+
+/* ----------------------------------------------------------------------
+   convert box coords to triclinic 0-1 lamda coords for all N atoms
+   lamda = H^-1 (x - x0)
+------------------------------------------------------------------------- */
+
+void DomainCuda::x2lamda(int n)
+{
+  if(cuda->finished_setup&&(!cuda->oncpu))
+  {
+    Cuda_Domain_x2lamda(&cuda->shared_data,n);
+    return;
+  }
+ 
+  Domain::x2lamda(n);
+}
diff --git a/src/USER-CUDA/domain_cuda.h b/src/USER-CUDA/domain_cuda.h
new file mode 100644
index 0000000000..1ff4f75946
--- /dev/null
+++ b/src/USER-CUDA/domain_cuda.h
@@ -0,0 +1,41 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under 
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#ifndef LMP_DOMAIN_CUDA_H
+#define LMP_DOMAIN_CUDA_H
+
+#include "pointers.h"
+#include "domain.h"
+
+namespace LAMMPS_NS {
+
+class DomainCuda : public Domain {
+ public:
+  DomainCuda(class LAMMPS *);
+  void init();
+  void set_global_box();
+  void set_lamda_box();
+  void set_local_box();
+  void reset_box();
+  void pbc();
+ 
+  void lamda2x(int);
+  void x2lamda(int);
+
+ protected:
+  class Cuda *cuda;
+};
+
+}
+
+#endif
diff --git a/src/USER-CUDA/modify_cuda.cpp b/src/USER-CUDA/modify_cuda.cpp
new file mode 100644
index 0000000000..7f8d7f8c5a
--- /dev/null
+++ b/src/USER-CUDA/modify_cuda.cpp
@@ -0,0 +1,442 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under 
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#include <cstdio>
+#include <cstring>
+#include "modify_cuda.h"
+#include "style_compute.h"
+#include "style_fix.h"
+#include "atom.h"
+#include "comm.h"
+#include "fix.h"
+#include "compute.h"
+#include "group.h"
+#include "update.h"
+#include "domain.h"
+#include "cuda.h"
+#include "memory.h"
+#include "error.h"
+
+using namespace LAMMPS_NS;
+
+#define DELTA 4
+
+// mask settings - same as in fix.cpp
+
+#define INITIAL_INTEGRATE   1
+#define POST_INTEGRATE      2
+#define PRE_EXCHANGE        4
+#define PRE_NEIGHBOR        8
+#define PRE_FORCE          16
+#define POST_FORCE         32
+#define FINAL_INTEGRATE    64
+#define END_OF_STEP       128
+#define THERMO_ENERGY     256
+#define INITIAL_INTEGRATE_RESPA   512
+#define POST_INTEGRATE_RESPA     1024
+#define PRE_FORCE_RESPA          2048
+#define POST_FORCE_RESPA         4096
+#define FINAL_INTEGRATE_RESPA    8192
+#define MIN_PRE_EXCHANGE        16384
+#define MIN_POST_FORCE          32768
+#define MIN_ENERGY              65536
+
+#include "cuda_modify_flags.h"
+
+#define MIN(A,B) ((A) < (B)) ? (A) : (B)
+#define MAX(A,B) ((A) > (B)) ? (A) : (B)
+
+#define BIG 1.0e20
+
+/* ---------------------------------------------------------------------- */
+
+ModifyCuda::ModifyCuda(LAMMPS *lmp) : Modify(lmp)
+{
+  cuda = lmp->cuda;
+
+  n_initial_integrate_cuda = 0;
+  n_post_integrate_cuda = 0;
+  n_pre_exchange = 0;
+  n_pre_neighbor_cuda = 0;
+  n_pre_force_cuda = 0;
+  n_post_force_cuda = 0;
+  n_final_integrate_cuda = 0;
+  n_end_of_step_cuda = 0;
+  n_thermo_energy_cuda = 0;
+
+  n_initial_integrate_host = 0;
+  n_post_integrate_host = 0;
+  n_pre_exchange = 0;
+  n_pre_neighbor_host = 0;
+  n_pre_force_host = 0;
+  n_post_force_host = 0;
+  n_final_integrate_host = 0;
+  n_end_of_step_host = 0;
+  n_thermo_energy_host = 0;
+
+  list_initial_integrate_cuda = NULL;
+  list_post_integrate_cuda = NULL;
+  list_pre_exchange_cuda = NULL;
+  list_pre_neighbor_cuda = NULL;
+  list_pre_force_cuda = NULL;
+  list_post_force_cuda = NULL;
+  list_final_integrate_cuda = NULL;
+  list_end_of_step_cuda = NULL;
+  list_thermo_energy_cuda = NULL;
+  end_of_step_every_cuda = NULL;
+}
+
+/* ---------------------------------------------------------------------- */
+
+ModifyCuda::~ModifyCuda()
+{
+  delete [] list_initial_integrate_cuda;
+  delete [] list_post_integrate_cuda;
+  delete [] list_pre_exchange_cuda;
+  delete [] list_pre_neighbor_cuda;
+  delete [] list_pre_force_cuda;
+  delete [] list_post_force_cuda;
+  delete [] list_final_integrate_cuda;
+  delete [] list_end_of_step_cuda;
+  delete [] list_thermo_energy_cuda;
+  delete [] end_of_step_every_cuda;
+}
+
+/* ----------------------------------------------------------------------
+   initialize all fixes and computes
+------------------------------------------------------------------------- */
+
+void ModifyCuda::init()
+{
+  int i,j;
+
+  // delete storage of restart info since it is not valid after 1st run
+
+  restart_deallocate();
+
+  // create lists of fixes to call at each stage of run
+
+  list_init(INITIAL_INTEGRATE,n_initial_integrate,list_initial_integrate);
+  list_init(POST_INTEGRATE,n_post_integrate,list_post_integrate);
+  list_init(PRE_EXCHANGE,n_pre_exchange,list_pre_exchange);
+  list_init(PRE_NEIGHBOR,n_pre_neighbor,list_pre_neighbor);
+  list_init(PRE_FORCE,n_pre_force,list_pre_force);
+  list_init(POST_FORCE,n_post_force,list_post_force);
+  list_init(FINAL_INTEGRATE,n_final_integrate,list_final_integrate);
+  list_init_end_of_step(END_OF_STEP,n_end_of_step,list_end_of_step);
+  list_init_thermo_energy(THERMO_ENERGY,n_thermo_energy,list_thermo_energy);
+
+  list_init(INITIAL_INTEGRATE_CUDA, n_initial_integrate_cuda, list_initial_integrate_cuda);
+  list_init(POST_INTEGRATE_CUDA, n_post_integrate_cuda, list_post_integrate_cuda);
+  list_init(PRE_EXCHANGE_CUDA, n_pre_exchange_cuda, list_pre_exchange_cuda);
+  list_init(PRE_NEIGHBOR_CUDA, n_pre_neighbor_cuda, list_pre_neighbor_cuda);
+  list_init(PRE_FORCE_CUDA, n_pre_force_cuda, list_pre_force_cuda);
+  list_init(POST_FORCE_CUDA, n_post_force_cuda, list_post_force_cuda);
+  list_init(FINAL_INTEGRATE_CUDA, n_final_integrate_cuda, list_final_integrate_cuda);
+  list_init_end_of_step_cuda(END_OF_STEP_CUDA, n_end_of_step_cuda, list_end_of_step_cuda);
+  list_init_thermo_energy(THERMO_ENERGY_CUDA, n_thermo_energy_cuda, list_thermo_energy_cuda);
+
+  n_initial_integrate_host = n_initial_integrate;
+  n_post_integrate_host = n_post_integrate;
+  n_pre_exchange_host = n_pre_exchange;
+  n_pre_neighbor_host = n_pre_neighbor;
+  n_pre_force_host = n_pre_force;
+  n_post_force_host = n_post_force;
+  n_final_integrate_host = n_final_integrate;
+  n_end_of_step_host = n_end_of_step;
+  n_thermo_energy_host = n_thermo_energy;
+  
+  n_initial_integrate = n_initial_integrate_cuda+n_initial_integrate_host;
+  n_post_integrate = n_post_integrate_cuda+n_post_integrate_host;
+  n_pre_exchange = n_pre_exchange_cuda+n_pre_exchange_host;
+  n_pre_neighbor = n_pre_neighbor_cuda+n_pre_neighbor_host;
+  n_pre_force = n_pre_force_cuda+n_pre_force_host;
+  n_post_force = n_post_force_cuda+n_post_force_host;
+  n_final_integrate = n_final_integrate_cuda+n_final_integrate_host;
+  n_end_of_step = n_end_of_step_cuda+n_end_of_step_host;
+  n_thermo_energy = n_thermo_energy_cuda+n_thermo_energy_host;
+  
+  list_init(INITIAL_INTEGRATE_RESPA,
+	    n_initial_integrate_respa,list_initial_integrate_respa);
+  list_init(POST_INTEGRATE_RESPA,
+	    n_post_integrate_respa,list_post_integrate_respa);
+  list_init(POST_FORCE_RESPA,
+	    n_post_force_respa,list_post_force_respa);
+  list_init(PRE_FORCE_RESPA,
+	    n_pre_force_respa,list_pre_force_respa);
+  list_init(FINAL_INTEGRATE_RESPA,
+	    n_final_integrate_respa,list_final_integrate_respa);
+
+  list_init(MIN_PRE_EXCHANGE,n_min_pre_exchange,list_min_pre_exchange);
+  list_init(MIN_POST_FORCE,n_min_post_force,list_min_post_force);
+  list_init(MIN_ENERGY,n_min_energy,list_min_energy);
+
+  // init each fix
+  // needs to come before compute init
+  // this is b/c some computes call fix->dof()
+  // FixRigid::dof() depends on its own init having been called
+
+  comm->maxforward_fix = comm->maxreverse_fix = 0;
+  for (i = 0; i < nfix; i++) fix[i]->init();
+
+  // set global flag if any fix has its restart_pbc flag set
+
+  restart_pbc_any = 0;
+  for (i = 0; i < nfix; i++)
+    if (fix[i]->restart_pbc) restart_pbc_any = 1;
+
+  // create list of computes that store invocation times
+
+  list_init_compute();
+
+  // init each compute
+  // set invoked_scalar,vector,etc to -1 to force new run to re-compute them
+  // add initial timestep to all computes that store invocation times
+  //   since any of them may be invoked by initial thermo
+  // do not clear out invocation times stored within a compute,
+  //   b/c some may be holdovers from previous run, like for ave fixes
+
+  for (i = 0; i < ncompute; i++) {
+    compute[i]->init();
+    compute[i]->invoked_scalar = -1;
+    compute[i]->invoked_vector = -1;
+    compute[i]->invoked_array = -1;
+    compute[i]->invoked_peratom = -1;
+    compute[i]->invoked_local = -1;
+  }
+  addstep_compute_all(update->ntimestep);
+
+  // warn if any particle is time integrated more than once
+
+  int nlocal = atom->nlocal;
+  int *mask = atom->mask;
+
+  int *flag = new int[nlocal];
+  for (i = 0; i < nlocal; i++) flag[i] = 0;
+
+  int groupbit;
+  for (i = 0; i < nfix; i++) {
+    if (fix[i]->time_integrate == 0) continue;
+    groupbit = fix[i]->groupbit;
+    for (j = 0; j < nlocal; j++)
+      if (mask[j] & groupbit) flag[j]++;
+  }
+
+  int check = 0;
+  for (i = 0; i < nlocal; i++)
+    if (flag[i] > 1) check = 1;
+
+  delete [] flag;
+
+  int checkall;
+  MPI_Allreduce(&check,&checkall,1,MPI_INT,MPI_SUM,world);
+  if (comm->me == 0 && checkall)
+    error->warning("One or more atoms are time integrated more than once");
+}
+
+/* ----------------------------------------------------------------------
+   1st half of integrate call, only for relevant fixes
+------------------------------------------------------------------------- */
+
+void ModifyCuda::initial_integrate(int vflag)
+{
+	for(int i = 0; i < n_initial_integrate_cuda; i++)
+		fix[list_initial_integrate_cuda[i]]->initial_integrate(vflag);
+
+	if(n_initial_integrate_host != 0)
+	{
+		cuda->downloadAll(); cuda->oncpu = true;
+		for (int i = 0; i < n_initial_integrate_host; i++)
+			fix[list_initial_integrate[i]]->initial_integrate(vflag);
+		cuda->uploadAll(); cuda->oncpu = false;
+	}
+}
+
+/* ----------------------------------------------------------------------
+   post_integrate call, only for relevant fixes
+------------------------------------------------------------------------- */
+
+void ModifyCuda::post_integrate()
+{
+	for(int i = 0; i < n_post_integrate_cuda; i++)
+		fix[list_post_integrate_cuda[i]]->post_integrate();
+	
+	if(n_post_integrate_host != 0)
+	{
+		cuda->downloadAll(); cuda->oncpu = true;
+		for (int i = 0; i < n_post_integrate_host; i++)
+			fix[list_post_integrate[i]]->post_integrate();
+		cuda->uploadAll(); cuda->oncpu = false;
+	}
+}
+
+/* ----------------------------------------------------------------------
+   pre_exchange call, only for relevant fixes
+------------------------------------------------------------------------- */
+
+void ModifyCuda::pre_exchange()
+{
+	for(int i = 0; i < n_pre_exchange_cuda; i++)
+		fix[list_pre_exchange_cuda[i]]->pre_exchange();
+
+	if(n_pre_exchange_host != 0)
+	{
+		cuda->downloadAll(); cuda->oncpu = true;
+		for (int i = 0; i < n_pre_exchange_host; i++)
+			fix[list_pre_exchange[i]]->pre_exchange();
+		cuda->uploadAll(); cuda->oncpu = false;
+	}
+}
+
+/* ----------------------------------------------------------------------
+   pre_neighbor call, only for relevant fixes
+------------------------------------------------------------------------- */
+
+void ModifyCuda::pre_neighbor()
+{
+	for(int i = 0; i < n_pre_neighbor_cuda; i++)
+		fix[list_pre_neighbor_cuda[i]]->pre_neighbor();
+	
+	if(n_pre_neighbor_host != 0)
+	{
+		cuda->downloadAll(); cuda->oncpu = true;
+		for (int i = 0; i < n_pre_neighbor_host; i++)
+			fix[list_pre_neighbor[i]]->pre_neighbor();
+		cuda->uploadAll(); cuda->oncpu = false;
+	}
+}
+
+/* ----------------------------------------------------------------------
+   pre_force call, only for relevant fixes
+------------------------------------------------------------------------- */
+
+void ModifyCuda::pre_force(int vflag)
+{
+	for(int i = 0; i < n_pre_force_cuda; i++)
+		fix[list_pre_force_cuda[i]]->pre_force(vflag);
+
+	if(n_pre_force_host != 0)
+	{
+		cuda->downloadAll(); cuda->oncpu = true;
+		for (int i = 0; i < n_pre_force_host; i++)
+			fix[list_pre_force[i]]->pre_force(vflag);
+		cuda->uploadAll(); cuda->oncpu = false;
+	}
+}
+
+/* ----------------------------------------------------------------------
+   post_force call, only for relevant fixes
+------------------------------------------------------------------------- */
+
+void ModifyCuda::post_force(int vflag)
+{
+	for(int i = 0; i < n_post_force_cuda; i++)
+			fix[list_post_force_cuda[i]]->post_force(vflag);
+
+	if(n_post_force_host != 0)
+	{
+		cuda->downloadAll(); cuda->oncpu = true;
+		for (int i = 0; i < n_post_force_host; i++)
+			fix[list_post_force[i]]->post_force(vflag);
+		cuda->uploadAll(); cuda->oncpu = false;
+	}
+}
+
+/* ----------------------------------------------------------------------
+   2nd half of integrate call, only for relevant fixes
+------------------------------------------------------------------------- */
+
+void ModifyCuda::final_integrate()
+{
+	for (int i = 0; i < n_final_integrate_cuda; i++)
+		fix[list_final_integrate_cuda[i]]->final_integrate();
+	
+	if(n_final_integrate_host != 0)
+	{
+		cuda->downloadAll(); cuda->oncpu = true;
+		for (int i = 0; i < n_final_integrate_host; i++)
+			fix[list_final_integrate[i]]->final_integrate();
+		cuda->uploadAll(); cuda->oncpu = false;
+	}
+}
+
+/* ----------------------------------------------------------------------
+   end-of-timestep call, only for relevant fixes
+   only call fix->end_of_step() on timesteps that are multiples of nevery
+------------------------------------------------------------------------- */
+
+void ModifyCuda::end_of_step()
+{
+	for (int i = 0; i < n_end_of_step_cuda; i++)
+		if (update->ntimestep % end_of_step_every_cuda[i] == 0)
+			fix[list_end_of_step_cuda[i]]->end_of_step();
+	
+	if(n_end_of_step_host != 0)
+	{
+		int do_thisstep=0;
+		for (int i = 0; i < n_end_of_step_host; i++)
+			if (update->ntimestep % end_of_step_every[i] == 0) do_thisstep=1;
+		if(do_thisstep)
+		{
+		  cuda->downloadAll(); cuda->oncpu = true;
+		  for (int i = 0; i < n_end_of_step_host; i++)
+			 if (update->ntimestep % end_of_step_every[i] == 0)
+				fix[list_end_of_step[i]]->end_of_step();
+		  cuda->uploadAll(); cuda->oncpu = false;
+		}
+	}
+}
+
+/* ----------------------------------------------------------------------
+   thermo energy call, only for relevant fixes
+   called by Thermo class
+   compute_scalar() is fix call to return energy
+------------------------------------------------------------------------- */
+
+double ModifyCuda::thermo_energy()
+{
+	double energy = 0.0;
+	
+	for (int i = 0; i < n_thermo_energy_cuda; i++)
+		energy += fix[list_thermo_energy_cuda[i]]->compute_scalar();
+	
+	if(n_thermo_energy_host != 0)
+	{
+		cuda->downloadAll(); cuda->oncpu = true;
+		for (int i = 0; i < n_thermo_energy_host; i++)
+			energy += fix[list_thermo_energy[i]]->compute_scalar();
+		cuda->uploadAll(); cuda->oncpu = false;
+	}
+	
+	return energy;
+}
+
+
+
+void ModifyCuda::list_init_end_of_step_cuda(int mask, int &n, int *&list)
+{
+  delete [] list;
+  delete [] end_of_step_every_cuda;
+
+  n = 0;
+  for (int i = 0; i < nfix; i++) if (fmask[i] & mask) n++;
+  list = new int[n];
+  end_of_step_every_cuda = new int[n];
+
+  n = 0;
+  for (int i = 0; i < nfix; i++)
+    if (fmask[i] & mask) {
+      list[n] = i;
+      end_of_step_every_cuda[n++] = fix[i]->nevery;
+    }
+}
diff --git a/src/USER-CUDA/modify_cuda.h b/src/USER-CUDA/modify_cuda.h
new file mode 100644
index 0000000000..7d0d670f9c
--- /dev/null
+++ b/src/USER-CUDA/modify_cuda.h
@@ -0,0 +1,82 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under 
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#ifndef LMP_MODIFY_CUDA_H
+#define LMP_MODIFY_CUDA_H
+
+#include <cstdio>
+#include "modify.h"
+
+namespace LAMMPS_NS {
+
+class ModifyCuda : public Modify {
+ public:
+
+  int n_initial_integrate_cuda;
+  int n_post_integrate_cuda;
+  int n_pre_exchange_cuda;
+  int n_pre_neighbor_cuda;
+  int n_pre_force_cuda;
+  int n_post_force_cuda;
+  int n_final_integrate_cuda;
+  int n_end_of_step_cuda;
+  int n_thermo_energy_cuda;
+
+  int n_initial_integrate_host;
+  int n_post_integrate_host;
+  int n_pre_exchange_host;
+  int n_pre_neighbor_host;
+  int n_pre_force_host;
+  int n_post_force_host;
+  int n_final_integrate_host;
+  int n_end_of_step_host;
+  int n_thermo_energy_host;
+
+  ModifyCuda(class LAMMPS *);
+  ~ModifyCuda();
+  void init();
+  void initial_integrate(int);
+  void post_integrate();
+  //void pre_decide();
+  void pre_exchange();
+  void pre_neighbor();
+  void pre_force(int);
+  void post_force(int);
+  void final_integrate();
+  void end_of_step();
+  double thermo_energy();
+
+
+ protected:
+  class Cuda *cuda;
+
+  // lists of fixes to apply at different stages of timestep
+
+  // list of cuda fixes
+  int *list_initial_integrate_cuda;
+  int *list_post_integrate_cuda;
+  int *list_pre_exchange_cuda;
+  int *list_pre_neighbor_cuda;
+  int *list_pre_force_cuda;
+  int *list_post_force_cuda;
+  int *list_final_integrate_cuda;
+  int *list_end_of_step_cuda;
+  int *list_thermo_energy_cuda;
+  int *end_of_step_every_cuda;
+
+  void list_init_end_of_step_cuda(int, int &, int *&);
+};
+
+}
+
+#endif
diff --git a/src/USER-CUDA/neigh_full_cuda.cpp b/src/USER-CUDA/neigh_full_cuda.cpp
new file mode 100644
index 0000000000..49678c1d06
--- /dev/null
+++ b/src/USER-CUDA/neigh_full_cuda.cpp
@@ -0,0 +1,317 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#ifdef CUDA
+#include "neighbor_cuda.h"
+#include "neigh_list.h"
+#include "atom.h"
+#include "domain.h"
+#include "group.h"
+#include "error.h"
+#include "cuda_neigh_list.h"
+#include "cuda.h"
+#include "neighbor_cu.h"
+#include <cmath>
+using namespace LAMMPS_NS;
+
+/* ----------------------------------------------------------------------
+   N^2 search for all neighbors
+   every neighbor pair appears in list of both atoms i and j
+------------------------------------------------------------------------- */
+void NeighborCuda::full_bin_cuda(NeighList *list)
+{
+  MYDBG(printf(" # CUDA::NeighFullBinCuda ... start\n");)
+  if(includegroup) error->warning("Warning using inlcudegroup neighborbuild. This is not yet supported by CUDA neighborbuild styles.\n");
+  int nlocal = atom->nlocal;
+  int nall = nlocal + atom->nghost;
+ 
+  if(nlocal==0) return;
+  CudaNeighList* clist=list->cuda_list;
+  cuda_shared_neighlist* slist=&clist->sneighlist;
+  
+  if(not clist) cuda->registerNeighborList(list);
+  
+  clist->build_cuda=true;
+  
+  if(slist->bin_extraspace<0.09)
+  {
+    for(int i=1;i<=atom->ntypes;i++)
+    for(int j=1;j<=atom->ntypes;j++)
+    {
+    	if(slist->maxcut<cutneighsq[i][j]) slist->maxcut=cutneighsq[i][j];
+    }
+    slist->maxcut=sqrt(slist->maxcut);
+  }
+  int bin_dim_tmp[3];
+  int bin_nmax_tmp;
+//printf("Hallo\n");
+  timespec starttime,endtime;
+  do
+  {
+    do
+    {
+      bin_dim_tmp[0]=static_cast <int> ((domain->subhi[0]-domain->sublo[0])/slist->maxcut);
+      bin_dim_tmp[1]=static_cast <int> ((domain->subhi[1]-domain->sublo[1])/slist->maxcut);
+      bin_dim_tmp[2]=static_cast <int> ((domain->subhi[2]-domain->sublo[2])/slist->maxcut);
+      if(bin_dim_tmp[0]==0) bin_dim_tmp[0]+=1;
+      if(bin_dim_tmp[1]==0) bin_dim_tmp[1]+=1;
+      if(bin_dim_tmp[2]==0) bin_dim_tmp[2]+=1;
+      bin_nmax_tmp=static_cast <int> ((1.0+slist->bin_extraspace)*nlocal/(bin_dim_tmp[0]*bin_dim_tmp[1]*bin_dim_tmp[2]));
+      bin_dim_tmp[0]+=4;
+      bin_dim_tmp[1]+=4;
+      bin_dim_tmp[2]+=4;
+ 	  if(bin_nmax_tmp<32) slist->maxcut*=1.2;
+ 	 // printf("slist->maxcut: %lf\n", slist->maxcut);
+    } while(bin_nmax_tmp<32);
+    if((slist->bin_dim[0]!=bin_dim_tmp[0])||(slist->bin_dim[1]!=bin_dim_tmp[1])||(slist->bin_dim[2]!=bin_dim_tmp[2])||(slist->bin_nmax!=bin_nmax_tmp))
+    {
+    	if(slist->binned_id!=NULL) 
+    	CudaWrapper_FreeCudaData(slist->binned_id,slist->bin_dim[0]*slist->bin_dim[1]*slist->bin_dim[2]*slist->bin_nmax*sizeof(int));
+    	slist->bin_dim[0] = bin_dim_tmp[0];
+    	slist->bin_dim[1] = bin_dim_tmp[1];
+    	slist->bin_dim[2] = bin_dim_tmp[2];
+    	slist->bin_nmax = bin_nmax_tmp;
+    	slist->binned_id=(int*) CudaWrapper_AllocCudaData(slist->bin_dim[0]*slist->bin_dim[1]*slist->bin_dim[2]*slist->bin_nmax*sizeof(int));
+ 	  //printf("slist->bin: %i %i %i %i \n", bin_dim_tmp[0],bin_dim_tmp[1],bin_dim_tmp[2],bin_nmax_tmp);
+    }
+    //if(list->cuda_list->sneighlist.bin_nmax>512) error->all("To many atoms per bin. Likely cause is very long pair cutoff. This needs major rewrite of code and is not yet scheduled to be done.\n");
+  }while(Cuda_BinAtoms(&cuda->shared_data, &list->cuda_list->sneighlist));
+
+ // cuda->cu_debugdata->memset_device(0);
+  int maxneighbors=slist->maxneighbors;
+  int *ilist = list->ilist;
+  int *numneigh = list->numneigh;
+
+  if((nex_type!=slist->nex_type)||
+  (nex_group!=slist->nex_group)||
+  (nex_mol!=slist->nex_mol))
+  {
+  	slist->nex_type=nex_type;
+  	slist->nex_group=nex_group;
+  	slist->nex_mol=nex_mol;
+  	//printf("%i %i %i\n",nex_type,nex_group,nex_mol);
+  	if(nex_type)
+  	{
+  	delete clist->cu_ex_type;
+  	clist->cu_ex_type=new cCudaData<int , int , x> (&ex_type[0][0]   , & slist->ex_type     , (atom->ntypes+1)*(atom->ntypes+1) );
+  	clist->cu_ex_type->upload();
+  	}
+ 	//printf("AA %i %i %i\n",nex_type,nex_group,nex_mol);
+  	if(nex_group)
+  	{
+   	delete clist->cu_ex1_bit;
+  	clist->cu_ex1_bit=new cCudaData<int , int , x> (ex1_bit   , & slist->ex1_bit     , nex_group );
+  	clist->cu_ex1_bit->upload();
+  	//printf("A %i %i %i\n",nex_type,nex_group,nex_mol);
+  	delete clist->cu_ex2_bit;
+  	clist->cu_ex2_bit=new cCudaData<int , int , x> (ex2_bit   , & slist->ex2_bit     , nex_group );
+  	clist->cu_ex2_bit->upload();
+  	}
+  	//printf("B %i %i %i\n",nex_type,nex_group,nex_mol);
+  	if(nex_mol)
+  	{
+  	delete clist->cu_ex_mol_bit;
+  	clist->cu_ex_mol_bit=new cCudaData<int , int , x> (ex_mol_bit   , & slist->ex_mol_bit     , nex_mol );
+  	clist->cu_ex_mol_bit->upload();
+  	}
+  	//printf("C %i %i %i\n",nex_type,nex_group,nex_mol);
+  }
+  int overflow = 0;
+  int inum = 0;
+  int npnt = 0;
+  do
+  {
+  	npnt=0;
+  	inum=0;
+    overflow=0;
+    clist->grow_device();
+    slist->cutneighsq=cutneighsq;
+    slist->maxneighbors=maxneighbors;
+    slist->inum = list->inum = nlocal;
+    //list->cuda_list->grow_device();
+    if(cuda->shared_data.overlap_comm)
+    {
+	  list->cuda_list->inum_border=0;
+	  list->cuda_list->cu_inum_border->upload();
+    }
+    
+    cuda->shared_data.atom.nall=nall;
+    //Cuda_NeighborReBuildFirstneigh(&cuda->shared_data, &list->cuda_list->sneighlist);
+    overflow= Cuda_NeighborBuildFullBin(&cuda->shared_data, &list->cuda_list->sneighlist);
+	
+	/*cuda->cu_debugdata->download();
+	printf("Debugdata: %i ",cuda->debugdata[0]);
+	for(int i=0;i<cuda->debugdata[0];i+=3) printf("// %i %i %i",cuda->debugdata[i+1],cuda->debugdata[i+2],cuda->debugdata[i+3]);
+	printf("\n");*/
+	//printf("maxneighborsA: %i %i %i %i\n",maxneighbors,pgsize,oneatom,atom->nmax);
+	
+    if(overflow<0) 
+    {
+    	maxneighbors+=32; 
+    	if(-overflow>maxneighbors) maxneighbors=((-overflow+37)/32)*32;
+    	delete list->cuda_list->cu_neighbors;
+    	delete [] list->cuda_list->neighbors;
+    	list->cuda_list->neighbors= new int[slist->maxlocal*maxneighbors];
+    	list->cuda_list->sneighlist.maxneighbors=maxneighbors;
+	//printf("maxneighborsA1: %i %i %i %i %i\n",maxneighbors,pgsize,oneatom,atom->nmax,slist->maxlocal);
+    	list->cuda_list->cu_neighbors= new cCudaData<int, int, x> (list->cuda_list->neighbors	 		 , & list->cuda_list->sneighlist.neighbors, slist->maxlocal*maxneighbors );
+	//printf("maxneighborsA2: %i %i %i %i\n",maxneighbors,pgsize,oneatom,atom->nmax);
+    	
+    	if(cuda->shared_data.overlap_comm)
+    	{
+    	  list->cuda_list->sneighlist.maxneighbors=maxneighbors;
+    	  list->cuda_list->dev_free();
+    	  list->cuda_list->dev_alloc();
+    	}
+	//printf("maxneighborsA3: %i %i %i %i\n",maxneighbors,pgsize,oneatom,atom->nmax);
+    }
+	//printf("maxneighborsB: %i %i %i %i\n",maxneighbors,pgsize,oneatom,atom->nmax);
+    if(cuda->shared_data.overlap_comm)
+    {
+  		list->cuda_list->cu_inum_border->download();
+  		list->cuda_list->sneighlist.inum_border2=list->cuda_list->inum_border;
+    }
+  }
+  while(overflow<0);
+  
+  //cuda->cu_debugdata->download();
+ // printf("Differences in: %i\n",cuda->debugdata[0]);
+ // for(int i=0;i<20;i++) printf("%i %i %i %i// ",cuda->debugdata[4*i+1],cuda->debugdata[4*i+2],cuda->debugdata[4*i+3],cuda->debugdata[4*i+4]);
+//  printf("\n");
+/*for(int i=0;i<10;i++)
+{
+	printf("%i %i // ",i,numneigh[i]);
+	for(int j=0;j<numneigh[i];j++)
+	 printf("%i ",list->cuda_list->neighbors[i+j*nlocal]);
+	printf("\n");
+}*/
+/*  int count=0;
+  if(cuda->shared_data.overlap_comm)
+  {
+  list->cuda_list->cu_inum_border->download();
+  list->cuda_list->cu_ilist_border->download();
+  list->cuda_list->cu_numneigh_border->download();
+  list->cuda_list->cu_numneigh_inner->download();
+  list->cuda_list->cu_neighbors->download();
+  list->cuda_list->cu_neighbors_inner->download();
+  list->cuda_list->cu_neighbors_border->download();
+  
+  //list->cuda_list->cu_firstneigh->download();
+ // list->cuda_list->nl_download();
+  list->cuda_list->cu_numneigh->download();
+  int diff=0;
+  //for(int i=0;i<nlocal;i++)*/
+ /* int i=123;
+  {
+  	int k=-1;
+  	//printf("inum_border: %i\n",list->cuda_list->inum_border);
+  	//for(int j=0;j<list->numneigh[i];j++) printf("%i ",list->firstneigh[i][j]);printf("\n");
+  	for(int j=0;j<list->cuda_list->inum_border;j++)
+  	if(list->cuda_list->ilist_border[j]==i) k=j;
+  	int d=numneigh[i]-list->cuda_list->numneigh_inner[i];
+  	if(k>-1) d-=list->cuda_list->numneigh_border[k];
+  	if(d!=0) {printf("Error at %i %i %i %i %i\n",i,k,d,numneigh[i],list->cuda_list->numneigh_inner[i]); diff++;}
+  	if(k>-1 && count<10) 
+  	{
+  		printf("Numneighs: %i %i %i  Border_i: %i %i\n",numneigh[i],list->cuda_list->numneigh_inner[i],list->cuda_list->numneigh_border[k],k,(int)list->cuda_list->cu_ilist_border->dev_data());
+	cuda->shared_data.me=k;
+	for(int j=0;j<numneigh[i];j++)
+	 printf("%i ",list->cuda_list->neighbors[i+j*nlocal]);
+  	 printf("\n");
+	for(int j=0;j<list->cuda_list->numneigh_inner[i];j++)
+	 printf("%i ",list->cuda_list->neighbors_inner[i+j*nlocal]);
+	 printf(" // ");
+	for(int j=0;j<list->cuda_list->numneigh_border[k];j++)
+	 printf("%i ",list->cuda_list->neighbors_border[k+j*nlocal]);
+  	 printf("\n");
+  	 count++;
+  	}
+  }
+  printf("%i\n",diff);
+  }*/
+  list->cuda_list->cu_numneigh->download();
+  list->cuda_list->cu_ilist->download();
+	//printf("Done\n");
+  
+  MYDBG(printf(" # CUDA::NeighFullBinCuda ... end\n");)
+  
+}
+
+
+void NeighborCuda::full_nsq_cuda(NeighList *list)
+{
+	printf("Full_Nsq cuda neighbor list build is not implemented anymore.\n");
+return;
+/*
+  MYDBG(printf(" # CUDA::NeighFullNSQCuda ... start\n");)
+  int nlocal = atom->nlocal;
+  int nall = nlocal + atom->nghost;
+ 
+  if(cuda->cu_xhold) cuda->cu_xhold->upload();
+  
+
+  if(not list->cuda_list) cuda->registerNeighborList(list);
+  list->cuda_list->build_cuda=true;
+  int maxneighbors=list->cuda_list->sneighlist.maxneighbors;
+  int neigh_lists_per_page=pgsize/maxneighbors;
+  int *ilist = list->ilist;
+  int *numneigh = list->numneigh;
+  int **firstneigh = list->firstneigh;
+  int **pages = list->pages;
+
+  int overflow = 0;
+  int inum = 0;
+  int npage = 0;
+  int npnt = 0;
+  do
+  {
+  	npage=0;
+  	npnt=0;
+  	inum=0;
+    overflow=0;
+  	neigh_lists_per_page=pgsize/maxneighbors;
+    npage=(2*nlocal*maxneighbors-1)/pgsize;
+    while(npage>list->maxpage) list->add_pages();
+    pages = list->pages;
+    npage=0;
+  	list->cuda_list->sneighlist.neigh_lists_per_page=pgsize/maxneighbors;
+    list->cuda_list->grow_device();
+    list->cuda_list->sneighlist.cutneighsq=cutneighsq;
+    list->cuda_list->sneighlist.maxneighbors=maxneighbors;
+    list->cuda_list->sneighlist.inum = list->inum = nlocal;
+    
+    cuda->shared_data.atom.nall=nall;
+    Cuda_NeighborReBuildFirstneigh(&cuda->shared_data, &list->cuda_list->sneighlist);
+    overflow= not Cuda_NeighborBuildFullNsq(&cuda->shared_data, &list->cuda_list->sneighlist);
+	
+	
+	    
+     if(overflow) maxneighbors+=32;
+  }
+  while(overflow);
+   if(not cudable) list->cuda_list->nl_download();
+  MYDBG(printf(" # CUDA::NeighFullNSQCuda ... end\n");)
+  */
+}
+#endif
+
diff --git a/src/USER-CUDA/neighbor_cuda.cpp b/src/USER-CUDA/neighbor_cuda.cpp
new file mode 100644
index 0000000000..4575ce2acc
--- /dev/null
+++ b/src/USER-CUDA/neighbor_cuda.cpp
@@ -0,0 +1,221 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under 
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#include "math.h"
+#include "neighbor_cuda.h"
+#include "cuda.h"
+#include "atom.h"
+#include "atom_vec.h"
+#include "domain.h"
+#include "neigh_list.h"
+#include "neigh_request.h"
+#include "force.h"
+#include "group.h"
+#include "memory.h"
+#include "error.h"
+
+using namespace LAMMPS_NS;
+
+#define MIN(a,b) ((a) < (b) ? (a) : (b))
+#define MAX(a,b) ((a) > (b) ? (a) : (b))
+
+enum{NSQ,BIN,MULTI};     // also in neigh_list.cpp
+
+/* ---------------------------------------------------------------------- */
+
+NeighborCuda::NeighborCuda(LAMMPS *lmp) : Neighbor(lmp)
+{
+  cuda = lmp->cuda;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void NeighborCuda::init()
+{
+  cuda->set_neighinit(dist_check,0.25*skin*skin);
+  cudable = 1;
+
+  Neighbor::init();
+}
+
+/* ----------------------------------------------------------------------
+   overwrite either full_nsq or full_bin with CUDA-equivalent methods
+   any other neighbor build method is unchanged
+------------------------------------------------------------------------- */
+
+void NeighborCuda::choose_build(int index, NeighRequest *rq)
+{
+  Neighbor::choose_build(index,rq);
+  
+  if (rq->full && style == NSQ && rq->ghost == 0 && rq->cudable)
+    pair_build[index] = (Neighbor::PairPtr) &NeighborCuda::full_nsq_cuda;
+  else if (rq->full && style == BIN && rq->ghost == 0 && rq->cudable)
+    pair_build[index] = (Neighbor::PairPtr) &NeighborCuda::full_bin_cuda;
+}
+
+/* ---------------------------------------------------------------------- */
+
+int NeighborCuda::check_distance()
+{
+  double delx,dely,delz,rsq;
+  double delta,deltasq,delta1,delta2;
+
+  if (boxcheck) {
+    if (triclinic == 0) {
+      delx = bboxlo[0] - boxlo_hold[0];
+      dely = bboxlo[1] - boxlo_hold[1];
+      delz = bboxlo[2] - boxlo_hold[2];
+      delta1 = sqrt(delx*delx + dely*dely + delz*delz);
+      delx = bboxhi[0] - boxhi_hold[0];
+      dely = bboxhi[1] - boxhi_hold[1];
+      delz = bboxhi[2] - boxhi_hold[2];
+      delta2 = sqrt(delx*delx + dely*dely + delz*delz);
+      delta = 0.5 * (skin - (delta1+delta2));
+      deltasq = delta*delta;
+    } else {
+      domain->box_corners();
+      delta1 = delta2 = 0.0;
+      for (int i = 0; i < 8; i++) {
+	delx = corners[i][0] - corners_hold[i][0];
+	dely = corners[i][1] - corners_hold[i][1];
+	delz = corners[i][2] - corners_hold[i][2];
+	delta = sqrt(delx*delx + dely*dely + delz*delz);
+	if (delta > delta1) delta1 = delta;
+	else if (delta > delta2) delta2 = delta;
+      }
+      delta = 0.5 * (skin - (delta1+delta2));
+      deltasq = delta*delta;
+    }
+  } else deltasq = triggersq;
+
+  double **x = atom->x;
+  int nlocal = atom->nlocal;
+  if (includegroup) nlocal = atom->nfirst;
+
+  int flag = 0;
+
+  if (not cuda->neighbor_decide_by_integrator) {
+    cuda->cu_x_download();
+    for (int i = 0; i < nlocal; i++) {
+      delx = x[i][0] - xhold[i][0];
+      dely = x[i][1] - xhold[i][1];
+      delz = x[i][2] - xhold[i][2];
+      rsq = delx*delx + dely*dely + delz*delz;
+      if (rsq > deltasq) flag = 1;
+    }
+  }
+  else flag = cuda->shared_data.atom.reneigh_flag;
+  
+  int flagall;
+  MPI_Allreduce(&flag,&flagall,1,MPI_INT,MPI_MAX,world);
+  if (flagall && ago == MAX(every,delay)) ndanger++;
+  return flagall;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void NeighborCuda::build()
+{
+  int i;
+
+  ago = 0;
+  ncalls++;
+
+  // store current atom positions and box size if needed
+
+  if (dist_check) {
+    if (cuda->decide_by_integrator())
+      cuda->update_xhold(maxhold, &xhold[0][0]);
+    else {
+      if (cuda->finished_setup) cuda->cu_x_download();
+      
+      double **x = atom->x;
+      int nlocal = atom->nlocal;
+      if (includegroup) nlocal = atom->nfirst;
+      if (nlocal > maxhold) {
+	maxhold = atom->nmax;
+	memory->destroy(xhold);
+	memory->create(xhold,maxhold,3,"neigh:xhold");
+      }
+      for (i = 0; i < nlocal; i++) {
+	xhold[i][0] = x[i][0];
+	xhold[i][1] = x[i][1];
+	xhold[i][2] = x[i][2];
+      }
+      if (boxcheck) {
+	if (triclinic == 0) {
+	  boxlo_hold[0] = bboxlo[0];
+	  boxlo_hold[1] = bboxlo[1];
+	  boxlo_hold[2] = bboxlo[2];
+	  boxhi_hold[0] = bboxhi[0];
+	  boxhi_hold[1] = bboxhi[1];
+	  boxhi_hold[2] = bboxhi[2];
+	} else {
+	  domain->box_corners();
+	  corners = domain->corners;
+	  for (i = 0; i < 8; i++) {
+	    corners_hold[i][0] = corners[i][0];
+	    corners_hold[i][1] = corners[i][1];
+	    corners_hold[i][2] = corners[i][2];
+	  }
+	}
+      }
+    }
+  }
+  
+  if (not cudable && cuda->finished_setup && atom->avec->cudable)
+    cuda->downloadAll(); 
+  if (cudable && (not cuda->finished_setup)) {
+    cuda->checkResize();
+    cuda->uploadAll();
+  }
+
+  // if any lists store neighbors of ghosts:
+  // invoke grow() if nlocal+nghost exceeds previous list size
+  // else only invoke grow() if nlocal exceeds previous list size
+  // only done for lists with growflag set and which are perpetual
+
+  if (anyghostlist && atom->nlocal+atom->nghost > maxatom) {
+    maxatom = atom->nmax;
+    for (i = 0; i < nglist; i++) lists[glist[i]]->grow(maxatom);
+  } else if (atom->nlocal > maxatom) {
+    maxatom = atom->nmax;
+    for (i = 0; i < nglist; i++) lists[glist[i]]->grow(maxatom);
+  }
+  
+  // extend atom bin list if necessary
+
+  if (style != NSQ && atom->nmax > maxbin) {
+    maxbin = atom->nmax;
+    memory->destroy(bins);
+    memory->create(bins,maxbin,"bins");
+  }
+
+  // check that neighbor list with special bond flags will not overflow
+
+  if (atom->nlocal+atom->nghost > NEIGHMASK)
+    error->one("Too many local+ghost atoms for neighbor list");
+
+  // invoke building of pair and molecular neighbor lists
+  // only for pairwise lists with buildflag set
+
+  for (i = 0; i < nblist; i++)
+    (this->*pair_build[blist[i]])(lists[blist[i]]);
+
+  if (atom->molecular) {
+    if (force->bond) (this->*bond_build)();
+    if (force->angle) (this->*angle_build)();
+    if (force->dihedral) (this->*dihedral_build)();
+    if (force->improper) (this->*improper_build)();
+  }
+}
diff --git a/src/USER-CUDA/neighbor_cuda.h b/src/USER-CUDA/neighbor_cuda.h
new file mode 100644
index 0000000000..c171b23be3
--- /dev/null
+++ b/src/USER-CUDA/neighbor_cuda.h
@@ -0,0 +1,39 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under 
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#ifndef LMP_NEIGHBOR_CUDA_H
+#define LMP_NEIGHBOR_CUDA_H
+
+#include "neighbor.h"
+
+namespace LAMMPS_NS {
+
+class NeighborCuda : public Neighbor {
+ public:
+  NeighborCuda(class LAMMPS *);
+  void init();
+  int check_distance();
+  void build();
+
+ private:
+  class Cuda *cuda;
+
+  void choose_build(int, class NeighRequest *);
+  typedef void (NeighborCuda::*PairPtr)(class NeighList *);
+  void full_nsq_cuda(class NeighList *);
+  void full_bin_cuda(class NeighList *);
+};
+
+}
+
+#endif
diff --git a/src/USER-CUDA/verlet_cuda.cpp b/src/USER-CUDA/verlet_cuda.cpp
new file mode 100644
index 0000000000..259ae8815d
--- /dev/null
+++ b/src/USER-CUDA/verlet_cuda.cpp
@@ -0,0 +1,1103 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+
+#include <cstdlib>
+#include <cstdio>
+#include <cstring>
+#include "verlet_cuda.h"
+#include "neighbor.h"
+#include "domain.h"
+#include "comm.h"
+#include "atom.h"
+#include "atom_vec.h"
+#include "force.h"
+#include "pair.h"
+#include "bond.h"
+#include "angle.h"
+#include "dihedral.h"
+#include "improper.h"
+#include "kspace.h"
+#include "output.h"
+#include "update.h"
+#include "modify_cuda.h"
+#include "compute.h"
+#include "fix.h"
+#include "timer.h"
+#include "memory.h"
+#include "error.h"
+#include "cuda_wrapper_cu.h"
+#include "thermo.h"
+#include "cuda_pair_cu.h"
+#include "cuda.h"
+#include <ctime>
+#include <cmath>
+
+using namespace LAMMPS_NS;
+
+#define MAX(a, b) ((a)>(b) ? (a) : (b))
+#define MAKETIMEING
+
+
+VerletCuda::VerletCuda(LAMMPS *lmp, int narg, char **arg) : Verlet(lmp, narg, arg) {	
+  cuda = lmp->cuda;
+
+	modify_cuda=(ModifyCuda*) modify;
+}
+
+/* ----------------------------------------------------------------------
+   setup before run
+------------------------------------------------------------------------- */
+
+void VerletCuda::setup()
+{
+	//debug related variables
+	cuda->debugdata[0]=0;
+	cuda->cu_debugdata->upload();
+	dotestatom=cuda->dotestatom;
+	int testatom=cuda->testatom;//48267;
+	
+	
+	MYDBG(printf("# CUDA VerletCuda::setup start\n"); )
+
+	cuda->oncpu = true;
+	cuda->begin_setup = true;
+	cuda->finished_run = false;
+	strcpy(update->integrate_style,"verlet");
+	
+	time_pair=0;
+	time_kspace=0;
+	time_comm=0;
+	time_modify=0;
+	time_fulliterate=0;
+		
+    atom->setup();
+	
+	cuda_shared_atom*   cu_atom   = & cuda->shared_data.atom;
+	cuda_shared_domain* cu_domain = & cuda->shared_data.domain;
+	cuda_shared_pair*   cu_pair   = & cuda->shared_data.pair;
+	cu_atom->update_nlocal=1;
+	cu_atom->update_nmax=1;
+	
+	if(atom->molecular||(force->kspace&&(not cuda->shared_data.pppm.cudable_force))) cuda->shared_data.pair.collect_forces_later = true;
+
+	cuda->setDomainParams();
+	
+
+	if(cuda->shared_data.me==0)
+	printf("# CUDA: VerletCuda::setup: Allocate memory on device for maximum of %i atoms...\n", atom->nmax);
+	if(cuda->shared_data.me==0)
+	printf("# CUDA: Using precision: Global: %u X: %u V: %u F: %u PPPM: %u \n", CUDA_PRECISION==1?4:8,sizeof(X_FLOAT),sizeof(V_FLOAT),sizeof(F_FLOAT),sizeof(PPPM_FLOAT));
+	cuda->allocate();
+	
+
+  if (comm->me == 0 && screen) fprintf(screen,"Setting up run ...\n");
+
+  // setup domain, communication and neighboring
+  // acquire ghosts
+  // build neighbor lists
+
+  if (triclinic) domain->x2lamda(atom->nlocal);
+  domain->pbc();
+  domain->reset_box();
+  comm->setup();
+  if (neighbor->style) neighbor->setup_bins();
+  comm->exchange();
+  if (atom->sortfreq > 0) atom->sort();
+  comm->borders();
+  if (triclinic) domain->lamda2x(atom->nlocal+atom->nghost);
+  cuda->setSystemParams();
+  cuda->checkResize();
+
+  if(cuda->shared_data.me==0)
+  printf("# CUDA: VerletCuda::setup: Upload data...\n");
+  cuda->uploadAll();
+  neighbor->build();
+  neighbor->ncalls = 0;
+  cuda->uploadAllNeighborLists();
+  if(atom->mass)
+  cuda->cu_mass->upload();
+ 
+  if(cuda->cu_map_array)
+  cuda->cu_map_array->upload();
+ 
+  // compute all forces
+
+  ev_set(update->ntimestep);
+  if(elist_atom) cuda->shared_data.atom.need_eatom = 1;
+  if(vlist_atom) cuda->shared_data.atom.need_vatom = 1;
+  if(elist_atom||vlist_atom) cuda->checkResize();
+  
+
+  int test_BpA_vs_TpA = true;
+  timespec starttime;
+  timespec endtime;
+  #ifdef NO_PREC_TIMING
+    double startsec,endsec;
+  #endif
+ 	//if(atom->molecular||(force->kspace&&(not cuda->shared_data.pppm.cudable_force))) cuda->shared_data.pair.collect_forces_later = false;
+   if(test_BpA_vs_TpA && cuda->shared_data.pair.cudable_force && force->pair &&(cuda->shared_data.pair.override_block_per_atom<0))
+   {
+     int StyleLoops=10;
+     if(cuda->shared_data.me==0)
+    printf("Test TpA\n");
+     cuda->shared_data.pair.use_block_per_atom = 0;
+     neighbor->build();
+     Cuda_Pair_GenerateXType(&cuda->shared_data);
+	 if(cuda->cu_v_radius) 
+	   Cuda_Pair_GenerateVRadius(&cuda->shared_data);
+	 if(cuda->cu_omega_rmass) 
+	   Cuda_Pair_GenerateOmegaRmass(&cuda->shared_data);
+     force->pair->compute(eflag,vflag);
+       CudaWrapper_Sync();
+     #ifdef NO_PREC_TIMING
+       startsec = 1.0*clock()/CLOCKS_PER_SEC;
+     #endif
+     clock_gettime(CLOCK_REALTIME,&starttime);
+     for(int i=0;i<StyleLoops;i++)
+     {
+	   Cuda_Pair_GenerateXType(&cuda->shared_data);
+	   if(cuda->cu_v_radius) 
+	     Cuda_Pair_GenerateVRadius(&cuda->shared_data);
+	   if(cuda->cu_omega_rmass) 
+	     Cuda_Pair_GenerateOmegaRmass(&cuda->shared_data);
+       force->pair->compute(eflag,vflag);
+       CudaWrapper_Sync();
+     }
+     clock_gettime(CLOCK_REALTIME,&endtime);
+     
+  	 double TpAtime=endtime.tv_sec-starttime.tv_sec+1.0*(endtime.tv_nsec-starttime.tv_nsec)/1000000000;
+     #ifdef NO_PREC_TIMING
+       endsec = 1.0*clock()/CLOCKS_PER_SEC;
+       TpAtime = endsec - startsec;
+     #endif
+     if(cuda->shared_data.me==0)
+     printf("Test BpA\n");
+     cuda->shared_data.pair.use_block_per_atom = 1;
+     neighbor->build();
+     Cuda_Pair_GenerateXType(&cuda->shared_data);
+	 if(cuda->cu_v_radius) 
+	   Cuda_Pair_GenerateVRadius(&cuda->shared_data);
+	 if(cuda->cu_omega_rmass) 
+	   Cuda_Pair_GenerateOmegaRmass(&cuda->shared_data);
+     force->pair->compute(eflag,vflag);
+       CudaWrapper_Sync();
+
+     clock_gettime(CLOCK_REALTIME,&starttime);
+     #ifdef NO_PREC_TIMING
+       startsec = 1.0*clock()/CLOCKS_PER_SEC;
+     #endif
+     for(int i=0;i<StyleLoops;i++)
+     {
+	   Cuda_Pair_GenerateXType(&cuda->shared_data);
+	   if(cuda->cu_v_radius) 
+	     Cuda_Pair_GenerateVRadius(&cuda->shared_data);
+	   if(cuda->cu_omega_rmass) 
+	     Cuda_Pair_GenerateOmegaRmass(&cuda->shared_data);
+       force->pair->compute(eflag,vflag);
+       CudaWrapper_Sync();
+     }
+     clock_gettime(CLOCK_REALTIME,&endtime);
+  	 double BpAtime=endtime.tv_sec-starttime.tv_sec+1.0*(endtime.tv_nsec-starttime.tv_nsec)/1000000000;
+      #ifdef NO_PREC_TIMING
+       endsec = 1.0*clock()/CLOCKS_PER_SEC;
+       BpAtime = endsec - startsec;
+     #endif
+  	 
+	if(cuda->shared_data.me==0)
+  	 printf("\n# CUDA: Timing of parallelisation layout with %i loops:\n",StyleLoops);
+	if(cuda->shared_data.me==0)
+  	 printf("# CUDA: BpA TpA\n %lf %lf\n",BpAtime,TpAtime);
+  	 if(BpAtime>TpAtime) cuda->shared_data.pair.use_block_per_atom = 0;
+   }
+   else
+   cuda->shared_data.pair.use_block_per_atom = cuda->shared_data.pair.override_block_per_atom;
+   //cuda->shared_data.pair.use_block_per_atom = 0;
+   if(atom->molecular||(force->kspace&&(not cuda->shared_data.pppm.cudable_force))) cuda->shared_data.pair.collect_forces_later = true;
+   neighbor->build();
+   neighbor->ncalls = 0;
+   
+   force_clear();
+  
+   cuda->cu_f->download();
+   if(cuda->cu_torque)
+   cuda->cu_torque->download();
+   
+  //printf("# Verlet::setup: g f[0] = (%f, %f, %f)\n", atom->f[0][0], atom->f[0][1], atom->f[0][2]);
+  
+  MYDBG( printf("# CUDA: VerletCuda::setup: initial force compute\n"); )
+
+  test_atom(testatom,"pre pair force");
+    
+  if(cuda->shared_data.pair.cudable_force)
+  {
+	cuda->uploadAll();
+	Cuda_Pair_GenerateXType(&cuda->shared_data);
+	if(cuda->cu_v_radius) 
+	Cuda_Pair_GenerateVRadius(&cuda->shared_data);
+	if(cuda->cu_omega_rmass) 
+	Cuda_Pair_GenerateOmegaRmass(&cuda->shared_data);
+  }
+
+  if (force->pair) force->pair->compute(eflag,vflag);
+  
+  if(cuda->shared_data.pair.cudable_force)
+  {
+	if(cuda->shared_data.pair.collect_forces_later)
+	{
+	  if(eflag) cuda->cu_eng_vdwl->upload();
+	  if(eflag) cuda->cu_eng_coul->upload();
+	  if(vflag) cuda->cu_virial->upload();
+      Cuda_Pair_CollectForces(&cuda->shared_data,eflag,vflag);
+	  if(eflag) cuda->cu_eng_vdwl->download();
+	  if(eflag) cuda->cu_eng_coul->download();
+	  if(vflag) cuda->cu_virial->download();
+	}		
+	cuda->downloadAll();
+  }
+
+  test_atom(testatom,"post pair force");
+
+  MYDBG( printf("# CUDA: VerletCuda::setup: initial force compute done\n"); )
+    //printf("# Verlet::setup: h f[0] = (%f, %f, %f)\n", atom->f[0][0], atom->f[0][1], atom->f[0][2]);
+
+  if (atom->molecular) {
+    if (force->bond) force->bond->compute(eflag,vflag);
+    if (force->angle) force->angle->compute(eflag,vflag);
+    if (force->dihedral) force->dihedral->compute(eflag,vflag);
+    if (force->improper) force->improper->compute(eflag,vflag);
+  }
+
+
+  if(cuda->shared_data.pppm.cudable_force)
+  {
+	cuda->cu_tag ->upload();
+	cuda->cu_type->upload();
+	cuda->cu_x   ->upload();
+	cuda->cu_v   ->upload();
+	cuda->cu_f   ->upload();
+	if(cu_atom->q_flag) cuda->cu_q->upload();
+  }
+  if (force->kspace) {
+    force->kspace->setup();
+    force->kspace->compute(eflag,vflag);
+  }
+  if(cuda->shared_data.pppm.cudable_force)
+  {
+	cuda->cu_f   ->download();
+  }
+  
+  test_atom(testatom,"post kspace");
+
+  cuda->uploadAll();
+  if (force->newton) comm->reverse_comm();
+  cuda->downloadAll();
+
+  test_atom(testatom,"post reverse comm");
+  
+ if(cuda->shared_data.me==0)
+  printf("# CUDA: Total Device Memory useage post setup: %lf MB\n",1.0*CudaWrapper_CheckMemUseage()/1024/1024);
+
+  MYDBG( printf("# CUDA: VerletCuda::setup: call modify setup\n"); )
+  modify->setup(vflag);
+
+  MYDBG( printf("# CUDA: VerletCuda::setup: call modify setup done\n"); )
+  output->setup(1);
+
+  test_atom(testatom,"post setup");
+  
+  MYDBG( printf("# CUDA: VerletCuda::setup: done\n"); )
+  cuda->finished_setup = true;
+  cuda->oncpu = false;
+}
+
+
+//this routine is in a messy state
+void VerletCuda::setup_minimal(int flag)
+{
+
+	
+	dotestatom=0;
+	int testatom=104;
+	cuda->oncpu = true;
+	cuda->begin_setup = true;
+	cuda->finished_run = false;
+	MYDBG(printf("# CUDA VerletCuda::setup start\n"); )
+	    strcpy(update->integrate_style,"verlet");
+	    time_pair=0;
+	    time_kspace=0;
+	    time_comm=0;
+	    time_modify=0;
+		time_fulliterate=0;
+		
+	//cuda->allocate();
+	
+	cuda_shared_atom*   cu_atom   = & cuda->shared_data.atom;
+	cuda_shared_domain* cu_domain = & cuda->shared_data.domain;
+	cuda_shared_pair*   cu_pair   = & cuda->shared_data.pair;
+	cu_atom->update_nlocal=1;
+	cu_atom->update_nmax=1;
+	
+	if(atom->molecular) cuda->shared_data.pair.collect_forces_later = true;
+
+	cuda->setDomainParams();
+	
+
+
+	if(cuda->shared_data.me==0)
+	printf("# CUDA: VerletCuda::setup: Allocate memory on device for maximum of %i atoms...\n", atom->nmax);
+	cuda->allocate();
+	
+	
+
+
+  // setup domain, communication and neighboring
+  // acquire ghosts
+  // build neighbor lists
+
+  if (flag) {
+    if (triclinic) domain->x2lamda(atom->nlocal);
+    domain->pbc();
+    domain->reset_box();
+    comm->setup();
+    if (neighbor->style) neighbor->setup_bins();
+    comm->exchange();
+    comm->borders();
+    if (triclinic) domain->lamda2x(atom->nlocal+atom->nghost);
+    cuda->setSystemParams();
+    cuda->checkResize();
+    neighbor->build();
+    neighbor->ncalls = 0;
+  }
+
+	if(cuda->shared_data.me==0)
+	printf("# CUDA: VerletCuda::setup: Upload data...\n");
+	cuda->uploadAll();
+    cuda->uploadAllNeighborLists();
+ 	if(atom->mass)
+ 	cuda->cu_mass->upload();
+ 
+ 	if(cuda->cu_map_array)
+ 	cuda->cu_map_array->upload();
+ 
+  // compute all forces
+
+  ev_set(update->ntimestep);
+  if(elist_atom) cuda->shared_data.atom.need_eatom = 1;
+  if(vlist_atom) cuda->shared_data.atom.need_vatom = 1;
+  if(elist_atom||vlist_atom) cuda->checkResize();
+   
+  force_clear();
+  cuda->cu_f->download();
+   
+  //printf("# Verlet::setup: g f[0] = (%f, %f, %f)\n", atom->f[0][0], atom->f[0][1], atom->f[0][2]);
+  
+  cuda->cu_mass->upload();
+  MYDBG( printf("# CUDA: VerletCuda::setup: initial force compute\n"); )
+
+  test_atom(testatom,"pre pair force");
+    
+  if(cuda->shared_data.pair.cudable_force)
+  {
+	cuda->uploadAll();
+	Cuda_Pair_GenerateXType(&cuda->shared_data);
+	if(cuda->cu_v_radius) 
+	Cuda_Pair_GenerateVRadius(&cuda->shared_data);
+	if(cuda->cu_omega_rmass) 
+	Cuda_Pair_GenerateOmegaRmass(&cuda->shared_data);
+  }
+  
+  if (force->pair) force->pair->compute(eflag,vflag);
+  
+  if(cuda->shared_data.pair.cudable_force)
+  {
+	if(cuda->shared_data.pair.collect_forces_later)
+	{
+	  if(eflag) cuda->cu_eng_vdwl->upload();
+	  if(eflag) cuda->cu_eng_coul->upload();
+	  if(vflag) cuda->cu_virial->upload();
+      Cuda_Pair_CollectForces(&cuda->shared_data,eflag,vflag);
+	  if(eflag) cuda->cu_eng_vdwl->download();
+	  if(eflag) cuda->cu_eng_coul->download();
+	  if(vflag) cuda->cu_virial->download();
+	}		
+	cuda->downloadAll();
+  }
+  
+  test_atom(testatom,"post pair force");
+
+  MYDBG( printf("# CUDA: VerletCuda::setup: initial force compute done\n"); )
+    //printf("# Verlet::setup: h f[0] = (%f, %f, %f)\n", atom->f[0][0], atom->f[0][1], atom->f[0][2]);
+
+  if (atom->molecular) {
+    if (force->bond) force->bond->compute(eflag,vflag);
+    if (force->angle) force->angle->compute(eflag,vflag);
+    if (force->dihedral) force->dihedral->compute(eflag,vflag);
+    if (force->improper) force->improper->compute(eflag,vflag);
+  }
+
+
+  if(cuda->shared_data.pppm.cudable_force)
+  {
+	cuda->cu_tag ->upload();
+	cuda->cu_type->upload();
+	cuda->cu_x   ->upload();
+	cuda->cu_v   ->upload();
+	cuda->cu_f   ->upload();
+	if(cu_atom->q_flag) cuda->cu_q->upload();
+  }
+  if (force->kspace) {
+    force->kspace->setup();
+    force->kspace->compute(eflag,vflag);
+  }
+  if(cuda->shared_data.pppm.cudable_force)
+  {
+	cuda->cu_f   ->download();
+  }
+  
+  test_atom(testatom,"post kspace");
+
+  cuda->uploadAll();
+  if (force->newton) comm->reverse_comm();
+  cuda->downloadAll();
+
+  test_atom(testatom,"post reverse comm");
+  
+ if(cuda->shared_data.me==0)
+  printf("# CUDA: Total Device Memory useage post setup: %lf MB\n",1.0*CudaWrapper_CheckMemUseage()/1024/1024);
+
+  MYDBG( printf("# CUDA: VerletCuda::setup: call modify setup\n"); )
+  modify->setup(vflag);
+
+  MYDBG( printf("# CUDA: VerletCuda::setup: done\n"); )
+  cuda->finished_setup=true;
+  cuda->oncpu=false;
+}
+
+//#define TESTATOM
+/* ----------------------------------------------------------------------
+   iterate for n steps
+------------------------------------------------------------------------- */
+
+void VerletCuda::run(int n)
+{
+  dotestatom=cuda->dotestatom;
+  int testatom=cuda->testatom;//48267;
+
+
+  timespec starttime;
+  timespec endtime;
+  timespec starttotal;
+  timespec endtotal;
+
+  cuda->setTimingsZero();
+
+  static double testtime=0.0;
+//				clock_gettime(CLOCK_REALTIME,&starttime);
+//  				clock_gettime(CLOCK_REALTIME,&endtime);
+//				testtime+=endtime.tv_sec-starttime.tv_sec+1.0*(endtime.tv_nsec-starttime.tv_nsec)/1000000000;
+// 				printf("Time: %lf\n",testtime);*/
+
+
+  cuda_shared_domain* cu_domain = & cuda->shared_data.domain;
+
+  int nflag,ntimestep,sortflag;
+		
+  int n_initial_integrate = modify_cuda->n_initial_integrate;
+  int n_post_integrate = modify_cuda->n_post_integrate;
+  int n_final_integrate = modify_cuda->n_final_integrate;
+  int n_pre_exchange = modify_cuda->n_pre_exchange;
+  int n_pre_neighbor = modify_cuda->n_pre_neighbor;
+  int n_pre_force = modify_cuda->n_pre_force;
+  int n_post_force = modify_cuda->n_post_force;
+  int n_end_of_step = modify_cuda->n_end_of_step;
+  MYDBG(printf("# CUDA: Fixes: i_int: %i p_int: %i f_int: %i pr_exc: %i pr_neigh: %i pr_f: %i p_f: %i eos: %i\n",
+	n_initial_integrate,n_post_integrate,n_final_integrate,n_pre_exchange,n_pre_neighbor,n_pre_force,n_post_force,n_end_of_step);)
+
+  if (atom->sortfreq > 0) sortflag = 1;
+  else sortflag = 0;
+
+
+	if(cuda->shared_data.me==0)
+	{
+	  if((not cuda->shared_data.pair.cudable_force)&&(force->pair))
+		  error->warning("# CUDA: You asked for a Verlet integration using Cuda, "
+			             "but selected a pair force which has not yet been ported to Cuda");
+	  if((not cuda->shared_data.pppm.cudable_force)&&(force->kspace))
+		  error->warning("# CUDA: You asked for a Verlet integration using Cuda, "
+						 "but selected a kspace force which has not yet been ported to Cuda");
+      if(modify_cuda->n_post_integrate_host+modify_cuda->n_pre_exchange_host+modify_cuda->n_pre_neighbor_host+modify_cuda->n_pre_force_host+modify_cuda->n_post_force_host+modify_cuda->n_end_of_step_host+modify_cuda->n_initial_integrate_host+modify_cuda->n_final_integrate_host)
+		  error->warning("# CUDA: You asked for a Verlet integration using Cuda, "
+						 "but several fixes have not yet been ported to Cuda.\n"
+						 "This can cause a severe speed penalty due to frequent data synchronization between host and GPU.");
+	  if(atom->firstgroupname) 
+		  error->warning("Warning: firstgroupname is used, this will cause additional data transfers.");
+	}
+    cuda->uploadAll();
+  
+  if(cuda->neighbor_decide_by_integrator && cuda->cu_xhold)
+  {
+  	 const int n=cuda->shared_data.atom.maxhold;
+ 	 CudaWrapper_CopyData(cuda->cu_xhold->dev_data(),cuda->cu_x->dev_data(),n*sizeof(X_FLOAT));
+ 	 CudaWrapper_CopyData((void*) &((X_FLOAT*)cuda->cu_xhold->dev_data())[n],(void*) &((X_FLOAT*)cuda->cu_x->dev_data())[atom->nmax],n*sizeof(X_FLOAT));
+ 	 CudaWrapper_CopyData((void*) &((X_FLOAT*)cuda->cu_xhold->dev_data())[2*n],(void*) &((X_FLOAT*)cuda->cu_x->dev_data())[2*atom->nmax],n*sizeof(X_FLOAT));
+  }
+  
+  cuda->shared_data.atom.reneigh_flag=0;
+  cuda->shared_data.atom.update_nlocal=1;
+  cuda->shared_data.atom.update_nmax=1;
+  cuda->shared_data.domain.update=1;
+  cuda->shared_data.buffer_new=1;
+  cuda->uploadtime=0;
+  cuda->downloadtime=0;
+  int firstreneigh=1;
+
+		for(int i = 0; i < n; i++)
+		{
+			ntimestep = ++update->ntimestep;
+			ev_set(ntimestep);
+			
+			// initial time integration
+				
+			test_atom(testatom,"Pre initial");
+
+			MYDBG( printf("# CUDA VerletCuda::iterate: before initial_integrate\n"); )
+ 
+ 			modify->initial_integrate(vflag);
+			
+			MYDBG( printf("# CUDA VerletCuda::iterate: after initial_integrate\n"); )
+
+			if(n_post_integrate) modify->post_integrate();
+			
+			
+			
+			// regular communication vs neighbor list rebuild
+			
+			test_atom(testatom,"Pre Exchange");
+			
+			MYDBG( printf("# CUDA VerletCuda::iterate: before neighbor decide\n"); )
+			nflag = neighbor->decide(); 
+			if(nflag == 0)
+			{
+				MYDBG( printf("# CUDA VerletCuda::iterate: communicate\n"); )
+				timer->stamp();
+				
+			    if((not (eflag||vflag))&&(cuda->shared_data.overlap_comm))
+			    {
+				  //overlap forward communication of ghost atom positions with inner force calculation (interactions between local atoms)
+			      //build communication buffers
+			//      printf("Pre forward_comm(1)\n");
+			      clock_gettime(CLOCK_REALTIME,&starttotal);
+	 			  cuda->shared_data.atom.reneigh_flag=0;
+				  clock_gettime(CLOCK_REALTIME,&starttime);
+ 				  timer->stamp();
+				  comm->forward_comm(1);
+ 				  timer->stamp(TIME_COMM);
+				  clock_gettime(CLOCK_REALTIME,&endtime);
+				  cuda->shared_data.cuda_timings.comm_forward_total+=
+ 						endtime.tv_sec-starttime.tv_sec+1.0*(endtime.tv_nsec-starttime.tv_nsec)/1000000000;
+			      
+			      //prepare force calculation
+			 //     printf("Pre force_clear\n");
+			      force_clear();
+			 //     printf("Pre Generate XType\n");
+				  Cuda_Pair_GenerateXType(&cuda->shared_data);
+				  if(cuda->cu_v_radius) 
+				    Cuda_Pair_GenerateVRadius(&cuda->shared_data);
+				  if(cuda->cu_omega_rmass) 
+				    Cuda_Pair_GenerateOmegaRmass(&cuda->shared_data);
+		          
+		          //start force calculation asynchronus
+			      cuda->shared_data.comm.comm_phase=1;
+			    //  printf("Pre Force Compute\n");
+		          force->pair->compute(eflag, vflag);
+			      timer->stamp(TIME_PAIR);
+                  //CudaWrapper_Sync();
+				
+				  //download comm buffers from GPU, perform MPI communication and upload buffers again
+				  clock_gettime(CLOCK_REALTIME,&starttime);
+			   //   printf("Pre forward_comm(2)\n");
+				  comm->forward_comm(2);
+ 				  clock_gettime(CLOCK_REALTIME,&endtime);
+				  cuda->shared_data.cuda_timings.comm_forward_total+=
+ 						endtime.tv_sec-starttime.tv_sec+1.0*(endtime.tv_nsec-starttime.tv_nsec)/1000000000;
+ 				  timer->stamp(TIME_COMM);
+ 				  
+ 				  //wait for force calculation
+			      //printf("Pre Synch\n");
+				  CudaWrapper_Sync();
+				  timer->stamp(TIME_PAIR);			
+				
+				  //unpack communication buffers
+				  clock_gettime(CLOCK_REALTIME,&starttime);
+			    //  printf("Pre forward_comm(3)\n");
+				  comm->forward_comm(3);
+				  clock_gettime(CLOCK_REALTIME,&endtime);
+			  //    printf("Post forward_comm(3)\n");
+				  cuda->shared_data.cuda_timings.comm_forward_total+=
+ 						endtime.tv_sec-starttime.tv_sec+1.0*(endtime.tv_nsec-starttime.tv_nsec)/1000000000;
+				
+ 				  timer->stamp(TIME_COMM);
+				  MYDBG( printf("# CUDA VerletCuda::iterate: communicate done\n"); )
+				  cuda->shared_data.cuda_timings.test1+=
+ 						endtotal.tv_sec-starttotal.tv_sec+1.0*(endtotal.tv_nsec-starttotal.tv_nsec)/1000000000;
+			    }
+			    else
+			    {
+			  	  //perform standard forward communication
+				//printf("Forward_comm\n");
+				  clock_gettime(CLOCK_REALTIME,&starttime);
+				  comm->forward_comm();
+				  clock_gettime(CLOCK_REALTIME,&endtime);
+				//printf("Forward_comm_done\n");
+				  cuda->shared_data.cuda_timings.comm_forward_total+=
+ 					endtime.tv_sec-starttime.tv_sec+1.0*(endtime.tv_nsec-starttime.tv_nsec)/1000000000;
+ 				  timer->stamp(TIME_COMM);
+				  MYDBG( printf("# CUDA VerletCuda::iterate: communicate done\n"); )
+			    }
+			}
+			else
+			{
+ 				int nlocalold=cuda->shared_data.atom.nlocal;
+ 				//if(firstreneigh)
+ 				{
+ 				  cuda->shared_data.atom.update_nlocal=1; 
+  				  cuda->shared_data.atom.update_nmax=1;
+ 				  firstreneigh=0;
+ 				}
+ 				  cuda->shared_data.buffer_new=1;
+				MYDBG( printf("# CUDA VerletCuda::iterate: neighbor\n"); )
+ 				cuda->setDomainParams();
+				if(n_pre_exchange) modify->pre_exchange();
+				if(atom->nlocal!=cuda->shared_data.atom.nlocal) //did someone add atoms during pre_exchange?
+				{
+					cuda->checkResize();
+					cuda->uploadAll();
+				}
+				
+				//check domain changes
+				if(domain->triclinic) domain->x2lamda(atom->nlocal);
+				MYDBG( printf("# CUDA VerletCuda::iterate: neighbor pbc\n"); )
+				domain->pbc();
+				if(domain->box_change)
+				{
+					domain->reset_box();
+					comm->setup(); 
+					if (neighbor->style) neighbor->setup_bins(); 
+					
+				}
+				timer->stamp();
+				MYDBG( printf("# CUDA VerletCuda::iterate: neighbor exchange\n"); )
+
+				//perform exchange of local atoms
+				clock_gettime(CLOCK_REALTIME,&starttime);
+				comm->exchange();
+				clock_gettime(CLOCK_REALTIME,&endtime);
+
+				//special and nspecial fields of the atom data are not currently transfered via the GPU buffer might be changed in the future
+				if(comm->nprocs>1)
+				{
+				  clock_gettime(CLOCK_REALTIME,&starttime);
+				  if(atom->special)
+					cuda->cu_special->upload();
+				  if(atom->nspecial)
+					cuda->cu_nspecial->upload();
+				  clock_gettime(CLOCK_REALTIME,&endtime);
+				  cuda->shared_data.cuda_timings.test1+=
+ 					  endtime.tv_sec-starttime.tv_sec+1.0*(endtime.tv_nsec-starttime.tv_nsec)/1000000000;
+				}
+
+				cuda->shared_data.cuda_timings.comm_exchange_total+=
+ 					endtime.tv_sec-starttime.tv_sec+1.0*(endtime.tv_nsec-starttime.tv_nsec)/1000000000;
+ 				
+ 				if(nlocalold!=cuda->shared_data.atom.nlocal) cuda->shared_data.atom.update_nlocal=2;
+
+				//sort atoms
+      			if (sortflag && ntimestep >= atom->nextsort) atom->sort();
+				MYDBG( printf("# CUDA VerletCuda::iterate: neighbor borders\n"); )
+
+				//generate ghost atom lists, and transfer ghost atom data
+				clock_gettime(CLOCK_REALTIME,&starttime);
+				comm->borders();
+				clock_gettime(CLOCK_REALTIME,&endtime);
+				cuda->shared_data.cuda_timings.comm_border_total+=
+ 					endtime.tv_sec-starttime.tv_sec+1.0*(endtime.tv_nsec-starttime.tv_nsec)/1000000000;
+
+				clock_gettime(CLOCK_REALTIME,&starttime);
+				//atom index maps are generated on CPU, and need to be transfered to GPU if they are used
+				if(cuda->cu_map_array)
+				  cuda->cu_map_array->upload();
+				  
+				
+				if(domain->triclinic) domain->lamda2x(atom->nlocal+atom->nghost);
+				
+				if(n_pre_neighbor) modify->pre_neighbor();
+
+  				cuda->shared_data.buffer_new=2;
+  				if(atom->molecular) cuda->cu_molecule->download();
+				MYDBG( printf("# CUDA VerletCuda::iterate: neighbor build\n"); )
+  			    timer->stamp(TIME_COMM);
+				clock_gettime(CLOCK_REALTIME,&endtime);
+				cuda->shared_data.cuda_timings.test2+=
+ 					endtime.tv_sec-starttime.tv_sec+1.0*(endtime.tv_nsec-starttime.tv_nsec)/1000000000;
+			    
+			    //rebuild neighbor list
+			    test_atom(testatom,"Pre Neighbor");
+				neighbor->build();
+				timer->stamp(TIME_NEIGHBOR);
+				MYDBG( printf("# CUDA VerletCuda::iterate: neighbor done\n"); )
+				
+				//if bonded interactions are used (in this case collect_forces_later is true), transfer data which only changes upon exchange/border routines from GPU to CPU 
+				if(cuda->shared_data.pair.collect_forces_later)
+				{
+					if(cuda->cu_molecule) cuda->cu_molecule->download();
+					cuda->cu_tag->download();
+					cuda->cu_type->download();
+					cuda->cu_mask->download();
+		      		if(cuda->cu_q) cuda->cu_q->download();
+				}
+				cuda->shared_data.comm.comm_phase=3;
+			}
+	
+			test_atom(testatom,"Post Exchange");
+			
+			// force computations
+			
+			//only do force_clear if it has not been done during overlap of communication with local interactions
+			if(not((not (eflag||vflag))&&(cuda->shared_data.overlap_comm)&&(cuda->shared_data.comm.comm_phase<3)))
+			force_clear();
+			
+			if(n_pre_force) modify->pre_force(vflag);
+   			
+			timer->stamp();
+  			
+  			//if overlap of bonded interactions with nonbonded interactions takes place, download forces and positions
+ 	/*	    if(cuda->shared_data.pair.collect_forces_later)
+		    {
+		      cuda->cu_x->downloadAsync(2);
+		      cuda->cu_f->downloadAsync(2);
+		    }*/
+		    
+			if(force->pair)
+			{
+ 				if((not (eflag||vflag))&&(cuda->shared_data.overlap_comm)&&(cuda->shared_data.comm.comm_phase<3)&&cuda->shared_data.pair.cudable_force)
+				{
+				  //second part of force calculations in case of overlaping it with commuincation. Only interactions between local and ghost atoms are done now
+				  //regenerate data layout for force computations, its actually only needed for the ghost atoms
+				  cuda->shared_data.comm.comm_phase=2;
+      			 
+      			  timespec atime1,atime2;
+	  			  clock_gettime(CLOCK_REALTIME,&atime1);
+ 				  
+ 				  Cuda_Pair_GenerateXType(&cuda->shared_data);
+				  if(cuda->cu_v_radius) 
+				    Cuda_Pair_GenerateVRadius(&cuda->shared_data);
+				  if(cuda->cu_omega_rmass) 
+				    Cuda_Pair_GenerateOmegaRmass(&cuda->shared_data);
+				  
+	  			  clock_gettime(CLOCK_REALTIME,&atime2);
+	  			  cuda->shared_data.cuda_timings.pair_xtype_conversion+=
+        			atime2.tv_sec-atime1.tv_sec+1.0*(atime2.tv_nsec-atime1.tv_nsec)/1000000000;
+				  force->pair->compute(eflag, vflag);
+		          				
+				}
+				else
+				{
+				  //calculate complete pair interactions
+ 			      if(not cuda->shared_data.pair.cudable_force) cuda->downloadAll();
+				  else
+				  {
+				    //regenerate data layout for force computations, its actually only needed for the ghost atoms
+      			    timespec atime1,atime2;
+	  			    clock_gettime(CLOCK_REALTIME,&atime1);
+				   
+				    Cuda_Pair_GenerateXType(&cuda->shared_data);
+				    if(cuda->cu_v_radius) 
+				      Cuda_Pair_GenerateVRadius(&cuda->shared_data);
+				    if(cuda->cu_omega_rmass) 
+				      Cuda_Pair_GenerateOmegaRmass(&cuda->shared_data);
+	  			 
+	  			    clock_gettime(CLOCK_REALTIME,&atime2);
+	  			    cuda->shared_data.cuda_timings.pair_xtype_conversion+=
+        				atime2.tv_sec-atime1.tv_sec+1.0*(atime2.tv_nsec-atime1.tv_nsec)/1000000000;
+				  }
+				  cuda->shared_data.comm.comm_phase=0;	
+				  force->pair->compute(eflag, vflag);
+				}
+
+ 			    if(not cuda->shared_data.pair.cudable_force) cuda->uploadAll(); 
+ 			    
+ 			    //wait for force calculation in case of not using overlap with bonded interactions
+ 			    if(not cuda->shared_data.pair.collect_forces_later)
+ 				  CudaWrapper_Sync();
+ 				  
+				timer->stamp(TIME_PAIR);
+			}
+   			
+   			//calculate bonded interactions
+		    if(atom->molecular)
+			{
+	          cuda->cu_x->downloadAsync(2);
+		      if(n_pre_force==0) Verlet::force_clear();
+		      else  cuda->cu_f->downloadAsync(2);
+		      
+			  timer->stamp(TIME_PAIR);
+			  
+			  test_atom(testatom,"pre bond force");
+			  if(force->bond) force->bond->compute(eflag, vflag);
+			  if(force->angle) force->angle->compute(eflag, vflag);
+			  if(force->dihedral) force->dihedral->compute(eflag, vflag);
+			  if(force->improper) force->improper->compute(eflag, vflag);
+			  timer->stamp(TIME_BOND);
+		    }
+		
+		   //collect forces in case pair force and bonded interactions were overlapped, and either no KSPACE or a GPU KSPACE style is used
+	       	if(cuda->shared_data.pair.collect_forces_later&&cuda->shared_data.pair.cudable_force&&(not (force->kspace&&(not cuda->shared_data.pppm.cudable_force))))
+	       	{
+		      clock_gettime(CLOCK_REALTIME,&starttime);
+		      cuda->cu_f->uploadAsync(2);
+
+  		      test_atom(testatom,"post molecular force");
+		      
+ 	          
+ 	          if(eflag) cuda->cu_eng_vdwl->upload();
+	          if(eflag) cuda->cu_eng_coul->upload();
+	          if(vflag) cuda->cu_virial->upload();
+              Cuda_Pair_CollectForces(&cuda->shared_data,eflag,vflag);
+	          if(eflag) cuda->cu_eng_vdwl->download();
+	          if(eflag) cuda->cu_eng_coul->download();
+	          if(vflag) cuda->cu_virial->download();
+		      timer->stamp(TIME_PAIR);
+			  
+			  clock_gettime(CLOCK_REALTIME,&endtime);
+			  cuda->shared_data.cuda_timings.pair_force_collection+=
+ 					endtime.tv_sec-starttime.tv_sec+1.0*(endtime.tv_nsec-starttime.tv_nsec)/1000000000;
+	       	}		
+		
+		    //compute kspace force
+		   	if(force->kspace)
+		   	{
+			  if((not cuda->shared_data.pppm.cudable_force) && (not cuda->shared_data.pair.collect_forces_later)) 
+				cuda->downloadAll();
+			  if((not cuda->shared_data.pppm.cudable_force) && (cuda->shared_data.pair.collect_forces_later) && (not atom->molecular))
+			  {
+	            cuda->cu_x->downloadAsync(2);
+		        if(n_pre_force==0) Verlet::force_clear();
+		        else  cuda->cu_f->downloadAsync(2);
+		      
+			    timer->stamp(TIME_PAIR);
+			  }
+			  
+			  force->kspace->compute(eflag,vflag);
+			  if((not cuda->shared_data.pppm.cudable_force) && (not cuda->shared_data.pair.collect_forces_later)) 
+				cuda->uploadAll();
+			  timer->stamp(TIME_KSPACE);
+		   	}
+			
+			//collect forces in case pair forces and kspace was overlaped
+	   	    if(cuda->shared_data.pair.collect_forces_later&&cuda->shared_data.pair.cudable_force&&((force->kspace&&(not cuda->shared_data.pppm.cudable_force))))
+	       	{
+		      cuda->cu_f->uploadAsync(2);
+
+			  clock_gettime(CLOCK_REALTIME,&starttime);
+	          
+	          if(eflag) cuda->cu_eng_vdwl->upload();
+	          if(eflag) cuda->cu_eng_coul->upload();
+	      	  if(vflag) cuda->cu_virial->upload();
+          	  Cuda_Pair_CollectForces(&cuda->shared_data,eflag,vflag);
+	      	  if(eflag) cuda->cu_eng_vdwl->download();
+	      	  if(eflag) cuda->cu_eng_coul->download();
+	      	  if(vflag) cuda->cu_virial->download();
+		  	  timer->stamp(TIME_PAIR);
+			  
+			  clock_gettime(CLOCK_REALTIME,&endtime);
+  			  cuda->shared_data.cuda_timings.pair_force_collection+=
+ 					endtime.tv_sec-starttime.tv_sec+1.0*(endtime.tv_nsec-starttime.tv_nsec)/1000000000;
+	    	}		
+				
+			//send forces on ghost atoms back to other GPU: THIS SHOULD NEVER HAPPEN
+			if(force->newton)
+			{
+			  comm->reverse_comm();
+			  timer->stamp(TIME_COMM);
+			}
+   		    test_atom(testatom,"post force");
+			// force modifications, final time integration, diagnostics
+			
+			if(n_post_force) modify->post_force(vflag);
+
+			test_atom(testatom,"pre final");
+
+			modify->final_integrate();
+
+			test_atom(testatom,"post final");
+
+			if(n_end_of_step) modify->end_of_step();
+  
+			// all output
+			
+			test_atom(testatom,"pre output");
+			
+			if(ntimestep == output->next)
+			{
+			  if(not output->thermo->cudable)
+			  cuda->downloadAll();
+			  timer->stamp();
+			  output->write(ntimestep);
+			  timer->stamp(TIME_OUTPUT);
+			}
+
+			
+			test_atom(testatom,"post output");
+			
+			if(cuda->shared_data.atom.update_nlocal>0)
+			cuda->shared_data.atom.update_nlocal--;
+  			if(cuda->shared_data.atom.update_nmax>0)
+  			cuda->shared_data.atom.update_nmax--;
+  			if(cuda->shared_data.domain.update>0)
+  			cuda->shared_data.domain.update--;
+  			if(cuda->shared_data.buffer_new>0)
+  			cuda->shared_data.buffer_new--;
+    		cuda->shared_data.atom.reneigh_flag=0;
+		}
+
+
+  		cuda->downloadAll();
+ 		cuda->downloadAllNeighborLists();
+  		cuda->shared_data.atom.update_nlocal=1;
+  		cuda->shared_data.atom.update_nmax=1;
+  		cuda->shared_data.buffer_new=1;
+  		cuda->shared_data.domain.update=1;
+  		cuda->oncpu = true;
+  		cuda->finished_run = true;
+}
+
+
+/* ----------------------------------------------------------------------
+   clear force on own & ghost atoms
+   setup and clear other arrays as needed
+------------------------------------------------------------------------- */
+
+void VerletCuda::force_clear()
+{
+  cuda->cu_f->memset_device(0);
+  if(cuda->cu_torque) cuda->cu_torque->memset_device(0);
+  return;
+  
+  //The rest should not be necessary
+  int i;
+  for(i=0;i<atom->nlocal;i++)
+  {
+  	atom->f[i][0]=0.0;
+  	atom->f[i][1]=0.0;
+  	atom->f[i][2]=0.0;  	
+  }
+  // clear force on all particles
+  // if either newton flag is set, also include ghosts
+
+  if (neighbor->includegroup == 0) {
+    int nall;
+    if (force->newton) nall = atom->nlocal + atom->nghost;
+    else nall = atom->nlocal;
+    if (torqueflag) {
+      double **torque = atom->torque;
+      for (i = 0; i < nall; i++) {
+	torque[i][0] = 0.0;
+	torque[i][1] = 0.0;
+	torque[i][2] = 0.0;
+      }
+    }
+
+  // neighbor includegroup flag is set
+  // clear force only on initial nfirst particles
+  // if either newton flag is set, also include ghosts
+
+  } else {
+    int nall = atom->nfirst;
+
+    
+    if (torqueflag) {
+      double **torque = atom->torque;
+      for (i = 0; i < nall; i++) {
+	torque[i][0] = 0.0;
+	torque[i][1] = 0.0;
+	torque[i][2] = 0.0;
+      }
+    }
+
+    if (force->newton) {
+      nall = atom->nlocal + atom->nghost;
+ 
+      if (torqueflag) {
+	double **torque = atom->torque;
+	for (i = atom->nlocal; i < nall; i++) {
+	  torque[i][0] = 0.0;
+	  torque[i][1] = 0.0;
+	  torque[i][2] = 0.0;
+	}
+      }
+    }
+  }
+}
+
+void VerletCuda::test_atom(int aatom, char* string)  //printing properties of one atom for test purposes
+{
+  if(not dotestatom) return;
+  bool check=false;
+  if(cuda->finished_setup) cuda->downloadAll();
+  for(int i=0;i<atom->nlocal+atom->nghost;i++)
+  {
+  if((atom->tag[i]==aatom)&&(i<atom->nlocal))
+  	{
+  	  
+  	  printf("%i # CUDA %s: %i %i %e %e %e %i ",comm->me,string,update->ntimestep,atom->tag[i],atom->x[i][0],atom->v[i][0],atom->f[i][0],i);  
+  	  if(atom->molecular && (i<atom->nlocal))
+	  {
+	  	printf(" // %i %i %i ",atom->num_bond[i],atom->num_angle[i],atom->num_dihedral[i]);
+	  	for(int k=0;k<atom->num_bond[i];k++)
+	  	printf("// %i %i ",atom->bond_type[i][k],atom->bond_atom[i][k]);
+	  }
+	  printf("\n");
+  	}
+  if(i<atom->nlocal)
+  {
+  if((atom->v[i][0]<-100||atom->v[i][0]>100)||
+  	 (atom->v[i][1]<-100||atom->v[i][1]>100)||
+  	 (atom->v[i][2]<-100||atom->v[i][2]>100)||
+  	 (atom->v[i][0]!=atom->v[i][0])||
+  	 (atom->v[i][1]!=atom->v[i][1])||
+  	 (atom->v[i][2]!=atom->v[i][2]))
+  	{printf("%i # CUDA %s velocity: %i %e %e %e %i\n",comm->me,string,atom->tag[i],atom->x[i][0],atom->v[i][0],atom->f[i][0],i);  check=true;}
+  if((atom->f[i][0]<-10000||atom->f[i][0]>10000)||
+  	 (atom->f[i][1]<-10000||atom->f[i][1]>10000)||
+  	 (atom->f[i][2]<-10000||atom->f[i][2]>10000)||
+  	 (atom->f[i][0]!=atom->f[i][0])||
+  	 (atom->f[i][1]!=atom->f[i][1])||
+  	 (atom->f[i][2]!=atom->f[i][2]))
+  	{printf("%i # CUDA %s force: %i %e %e %e %i\n",comm->me,string,atom->tag[i],atom->x[i][0],atom->v[i][0],atom->f[i][0],i);    check=true;}
+  if(atom->tag[i]<=0)
+  	printf("%i # CUDA %s tag: %i %e %e %e %i\n",comm->me,string,atom->tag[i],atom->x[i][0],atom->v[i][0],atom->f[i][0],i);  
+  }
+  }
+  if(check) exit(0);
+}
diff --git a/src/USER-CUDA/verlet_cuda.h b/src/USER-CUDA/verlet_cuda.h
new file mode 100644
index 0000000000..6992f438a4
--- /dev/null
+++ b/src/USER-CUDA/verlet_cuda.h
@@ -0,0 +1,63 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#ifdef INTEGRATE_CLASS
+
+IntegrateStyle(verlet/cuda,VerletCuda)
+
+#else
+
+
+#ifndef LMP_VERLET_CUDA_H
+#define LMP_VERLET_CUDA_H
+#include "verlet.h"
+#include "modify_cuda.h"
+
+namespace LAMMPS_NS {
+
+class VerletCuda : public Verlet
+{
+	public:
+		VerletCuda(class LAMMPS *, int, char **);
+		void setup();
+ 		void setup_minimal(int);
+  		void run(int);
+		
+		void test_atom(int atom,char* astring); //debugging purpose
+		int dotestatom;	//debugging purpose
+	
+	protected:
+		class Cuda *cuda;
+		void force_clear();	
+	    double time_pair;
+	    double time_kspace;
+	    double time_comm;
+	    double time_modify;
+	    double time_fulliterate;
+	    ModifyCuda* modify_cuda;
+};
+
+}
+
+#endif
+#endif