git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@6219 f3b2605a-c512-4ea7-a41b-209d697bcdaa

Author: sjplimp
Date: 2011-05-26 22:16:44 +00:00
Parent: cfa91c0611
Commit: 5a8719e38f

21 changed files with 7231 additions and 0 deletions

src/USER-CUDA/Install.sh (Executable file, 74 lines added)

@@ -0,0 +1,74 @@
# Install/unInstall package files in LAMMPS
# edit Makefile.package to include/exclude CUDA library
# do not copy child files if parent does not exist
if (test $1 = 1) then
if (test -e ../Makefile.package) then
sed -i -e '/include ..\/..\/lib\/cuda\/Makefile.common/d' ../Makefile.package
sed -i -e 's/-llammpscuda -lcuda -lcudart -lrt //' ../Makefile.package
sed -i -e 's/-I..\/..\/lib\/cuda -I$(CUDA_INSTALL_PATH)\/include //' ../Makefile.package
sed -i -e 's/-L..\/..\/lib\/cuda -L$(CUDA_INSTALL_PATH)\/lib64 -L$(CUDA_INSTALL_PATH)\/lib $(USRLIB_CONDITIONAL) -DLMP_USER_CUDA //' ../Makefile.package
sed -i '1 i include ..\/..\/lib\/cuda\/Makefile.common' ../Makefile.package
sed -i -e 's|^PKG_INC =[ \t]*|&-I..\/..\/lib\/cuda -I$(CUDA_INSTALL_PATH)\/include |' ../Makefile.package
sed -i -e 's|^PKG_PATH =[ \t]*|&-L..\/..\/lib\/cuda -L$(CUDA_INSTALL_PATH)\/lib64 -L$(CUDA_INSTALL_PATH)\/lib $(USRLIB_CONDITIONAL) |' ../Makefile.package
sed -i -e 's|^PKG_LIB =[ \t]*|&-llammpscuda -lcuda -lcudart -lrt |' ../Makefile.package
fi
cp comm_cuda.cpp ..
cp domain_cuda.cpp ..
cp modify_cuda.cpp ..
cp neighbor_cuda.cpp ..
cp neigh_full_cuda.cpp ..
cp verlet_cuda.cpp ..
cp cuda.cpp ..
cp cuda_neigh_list.cpp ..
cp comm_cuda.h ..
cp domain_cuda.h ..
cp modify_cuda.h ..
cp neighbor_cuda.h ..
cp verlet_cuda.h ..
cp cuda.h ..
cp cuda_common.h ..
cp cuda_data.h ..
cp cuda_modify_flags.h ..
cp cuda_neigh_list.h ..
cp cuda_precision.h ..
cp cuda_shared.h ..
elif (test $1 = 0) then
if (test -e ../Makefile.package) then
sed -i -e '/include ..\/..\/lib\/cuda\/Makefile.common/d' ../Makefile.package
sed -i -e 's/-llammpscuda -lcuda -lcudart -lrt //' ../Makefile.package
sed -i -e 's/-I..\/..\/lib\/cuda -I$(CUDA_INSTALL_PATH)\/include //' ../Makefile.package
sed -i -e 's/-L..\/..\/lib\/cuda -L$(CUDA_INSTALL_PATH)\/lib64 -L$(CUDA_INSTALL_PATH)\/lib $(USRLIB_CONDITIONAL) -DLMP_USER_CUDA //' ../Makefile.package
fi
rm ../comm_cuda.cpp
rm ../domain_cuda.cpp
rm ../modify_cuda.cpp
rm ../neighbor_cuda.cpp
rm ../neigh_full_cuda.cpp
rm ../verlet_cuda.cpp
rm ../cuda.cpp
rm ../cuda_neigh_list.cpp
rm ../comm_cuda.h
rm ../domain_cuda.h
rm ../modify_cuda.h
rm ../neighbor_cuda.h
rm ../verlet_cuda.h
rm ../cuda.h
rm ../cuda_common.h
rm ../cuda_data.h
rm ../cuda_modify_flags.h
rm ../cuda_neigh_list.h
rm ../cuda_precision.h
rm ../cuda_shared.h
fi
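
For reference, a successful install leaves ../Makefile.package with the include line prepended and the three PKG_ settings appended by the sed commands above; a sketch of the resulting fragment, assuming a stock Makefile.package whose PKG_INC, PKG_PATH, and PKG_LIB assignments start out empty:

include ../../lib/cuda/Makefile.common
PKG_INC = -I../../lib/cuda -I$(CUDA_INSTALL_PATH)/include
PKG_PATH = -L../../lib/cuda -L$(CUDA_INSTALL_PATH)/lib64 -L$(CUDA_INSTALL_PATH)/lib $(USRLIB_CONDITIONAL)
PKG_LIB = -llammpscuda -lcuda -lcudart -lrt

Note that the uninstall branch also strips a -DLMP_USER_CUDA flag from PKG_PATH, which the install seds here never add.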

src/USER-CUDA/comm_cuda.cpp (Normal file, 1431 lines added)

(File diff suppressed because it is too large.)

src/USER-CUDA/comm_cuda.h (Normal file, 69 lines added)

@@ -0,0 +1,69 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
#ifndef LMP_COMM_CUDA_H
#define LMP_COMM_CUDA_H
#include "pointers.h"
#include "cuda_data.h"
#include "comm.h"
namespace LAMMPS_NS {
class CommCuda : public Comm {
public:
CommCuda(class LAMMPS *);
~CommCuda();
virtual void init();
virtual void setup(); // setup 3d communication pattern
virtual void forward_comm(int mode=0); // forward communication of atom coords
virtual void forward_comm_cuda();
virtual void forward_comm_pack_cuda();
virtual void forward_comm_transfer_cuda();
virtual void forward_comm_unpack_cuda();
virtual void forward_comm_pair(Pair *pair);
virtual void reverse_comm(); // reverse communication of forces
virtual void exchange(); // move atoms to new procs
virtual void exchange_cuda(); // move atoms to new procs
virtual void borders(); // setup list of atoms to communicate
virtual void borders_cuda(); // setup list of atoms to communicate
virtual void borders_cuda_overlap_forward_comm();
virtual void forward_comm_fix(class Fix *); // forward comm from a Fix
protected:
class Cuda *cuda;
cCudaData<int, int, xy>* cu_pbc;
cCudaData<double, X_FLOAT, x>* cu_slablo;
cCudaData<double, X_FLOAT, x>* cu_slabhi;
cCudaData<double, X_FLOAT, xy>* cu_multilo;
cCudaData<double, X_FLOAT, xy>* cu_multihi;
cCudaData<int, int, xy>* cu_sendlist;
virtual void grow_send(int,int); // reallocate send buffer
virtual void grow_recv(int); // free/allocate recv buffer
virtual void grow_list(int, int); // reallocate one sendlist
virtual void grow_swap(int); // grow swap and multi arrays
virtual void allocate_swap(int); // allocate swap arrays
virtual void allocate_multi(int); // allocate multi arrays
virtual void free_swap(); // free swap arrays
virtual void free_multi(); // free multi arrays
};
}
#endif
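
The pack/transfer/unpack split declared above appears intended to let forward communication be staged and overlapped with other work (cf. the overlap_comm option handled in cuda.cpp below). A standalone mock illustrating only the stage ordering implied by the method names; the real methods drive GPU buffers and MPI:

#include <cstdio>

// Mock of the staged forward communication declared in CommCuda above;
// only the pack -> transfer -> unpack ordering is illustrated here.
struct MockCommCuda {
  void forward_comm_pack_cuda()     { std::printf("pack border-atom coords into a GPU buffer\n"); }
  void forward_comm_transfer_cuda() { std::printf("transfer packed buffers between procs\n"); }
  void forward_comm_unpack_cuda()   { std::printf("unpack received coords into ghost slots\n"); }
};

int main()
{
  MockCommCuda comm;
  comm.forward_comm_pack_cuda();      // stage 1
  comm.forward_comm_transfer_cuda();  // stage 2
  comm.forward_comm_unpack_cuda();    // stage 3
  return 0;
}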

src/USER-CUDA/cuda.cpp (Normal file, 837 lines added)

@@ -0,0 +1,837 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include <cstdlib>
#include <cstdio>
#include <cstring>
#include "cuda.h"
#include "atom.h"
#include "domain.h"
#include "force.h"
#include "pair.h"
#include "update.h"
#include "neighbor.h"
#include "neigh_list.h"
#include "universe.h"
#include "input.h"
#include "error.h"
#include "cuda_neigh_list.h"
//#include "pre_binning_cu.h"
#include "binning_cu.h"
//#include "reverse_binning_cu.h"
#include <ctime>
#include <cmath>
#include "cuda_pair_cu.h"
#include "cuda_cu.h"
using namespace LAMMPS_NS;
#define MAX(a,b) ((a) > (b) ? (a) : (b))
Cuda::Cuda(LAMMPS *lmp) : Pointers(lmp)
{
cuda_exists=true;
lmp->cuda=this;
if(universe->me==0)
printf("# Using LAMMPS_CUDA \n");
shared_data.me=universe->me;
device_set=false;
Cuda_Cuda_GetCompileSettings(&shared_data);
if(shared_data.compile_settings.prec_glob!=sizeof(CUDA_FLOAT)/4) printf("\n\n # CUDA WARNING: Compile Settings of cuda and cpp code differ! \n # CUDA WARNING: Global Precision: cuda %i cpp %i\n\n",shared_data.compile_settings.prec_glob, sizeof(CUDA_FLOAT)/4);
if(shared_data.compile_settings.prec_x!=sizeof(X_FLOAT)/4) printf("\n\n # CUDA WARNING: Compile Settings of cuda and cpp code differ! \n # CUDA WARNING: X Precision: cuda %i cpp %i\n\n",shared_data.compile_settings.prec_x, sizeof(X_FLOAT)/4);
if(shared_data.compile_settings.prec_v!=sizeof(V_FLOAT)/4) printf("\n\n # CUDA WARNING: Compile Settings of cuda and cpp code differ! \n # CUDA WARNING: V Precision: cuda %i cpp %i\n\n",shared_data.compile_settings.prec_v, sizeof(V_FLOAT)/4);
if(shared_data.compile_settings.prec_f!=sizeof(F_FLOAT)/4) printf("\n\n # CUDA WARNING: Compile Settings of cuda and cpp code differ! \n # CUDA WARNING: F Precision: cuda %i cpp %i\n\n",shared_data.compile_settings.prec_f, sizeof(F_FLOAT)/4);
if(shared_data.compile_settings.prec_pppm!=sizeof(PPPM_FLOAT)/4) printf("\n\n # CUDA WARNING: Compile Settings of cuda and cpp code differ! \n # CUDA WARNING: PPPM Precision: cuda %i cpp %i\n\n",shared_data.compile_settings.prec_pppm, sizeof(PPPM_FLOAT)/4);
if(shared_data.compile_settings.prec_fft!=sizeof(FFT_FLOAT)/4) printf("\n\n # CUDA WARNING: Compile Settings of cuda and cpp code differ! \n # CUDA WARNING: FFT Precision: cuda %i cpp %i\n\n",shared_data.compile_settings.prec_fft, sizeof(FFT_FLOAT)/4);
#ifdef FFT_CUFFT
if(shared_data.compile_settings.cufft!=1) printf("\n\n # CUDA WARNING: Compile Settings of cuda and cpp code differ! \n # CUDA WARNING: cufft: cuda %i cpp %i\n\n",shared_data.compile_settings.cufft, 1);
#else
if(shared_data.compile_settings.cufft!=0) printf("\n\n # CUDA WARNING: Compile Settings of cuda and cpp code differ! \n # CUDA WARNING: cufft: cuda %i cpp %i\n\n",shared_data.compile_settings.cufft, 0);
#endif
if(shared_data.compile_settings.arch!=CUDA_ARCH) printf("\n\n # CUDA WARNING: Compile Settings of cuda and cpp code differ! \n # CUDA WARNING: arch: cuda %i cpp %i\n\n",shared_data.compile_settings.arch, CUDA_ARCH);
cu_x = 0;
cu_v = 0;
cu_f = 0;
cu_tag = 0;
cu_type = 0;
cu_mask = 0;
cu_image = 0;
cu_xhold = 0;
cu_q = 0;
cu_rmass = 0;
cu_mass = 0;
cu_virial = 0;
cu_eatom = 0;
cu_vatom = 0;
cu_radius = 0;
cu_density = 0;
cu_omega = 0;
cu_torque = 0;
cu_special = 0;
cu_nspecial = 0;
cu_molecule = 0;
cu_x_type = 0;
x_type = 0;
cu_v_radius = 0;
v_radius = 0;
cu_omega_rmass = 0;
omega_rmass = 0;
binned_id = 0;
cu_binned_id = 0;
binned_idnew = 0;
cu_binned_idnew = 0;
cu_map_array = 0;
copy_buffer=0;
copy_buffersize=0;
neighbor_decide_by_integrator=0;
pinned=true;
debugdata=0; // the debug buffer is allocated later in Cuda::allocate()
finished_setup = false;
begin_setup = false;
finished_run = false;
setSharedDataZero();
uploadtime=0;
downloadtime=0;
dotiming=false;
dotestatom = false;
testatom = 0;
oncpu = true;
self_comm = 0;
MYDBG( printf("# CUDA: Cuda::Cuda Done...\n");)
//cCudaData<double, float, yx >
}
Cuda::~Cuda()
{
print_timings();
if(universe->me==0) printf("# CUDA: Free memory...\n");
delete cu_q;
delete cu_x;
delete cu_v;
delete cu_f;
delete cu_tag;
delete cu_type;
delete cu_mask;
delete cu_image;
delete cu_xhold;
delete cu_mass;
delete cu_rmass;
delete cu_virial;
delete cu_eng_vdwl;
delete cu_eng_coul;
delete cu_eatom;
delete cu_vatom;
delete cu_radius;
delete cu_density;
delete cu_omega;
delete cu_torque;
delete cu_molecule;
delete cu_x_type;
delete [] x_type;
delete cu_v_radius;
delete [] v_radius;
delete cu_omega_rmass;
delete [] omega_rmass;
delete cu_map_array;
std::map<NeighList*, CudaNeighList*>::iterator p = neigh_lists.begin();
while(p != neigh_lists.end())
{
delete p->second;
++p;
}
}
void Cuda::accelerator(int narg, char** arg)
{
if(device_set) return;
if(universe->me==0)
printf("# CUDA: Activate GPU \n");
int* devicelist=NULL;
int pppn=2;
for(int i=0;i<narg;i++)
{
if(strcmp(arg[i],"gpu/node")==0)
{
if(++i==narg)
error->all("Invalid Options for 'accelerator' command. Expecting a number or keyword 'special' after 'gpu/node' option.");
if(strcmp(arg[i],"special")==0)
{
if(++i==narg)
error->all("Invalid Options for 'accelerator' command. Expecting number of GPUs to be used per node after keyword 'gpu/node special'.");
pppn=atoi(arg[i]);
if(pppn<1) error->all("Invalid Options for 'accelerator' command. Expecting number of GPUs to be used per node after keyword 'gpu/node special'.");
if(i+pppn>=narg)
error->all("Invalid Options for 'accelerator' command. Expecting list of device ids after keyword 'gpu/node special'.");
devicelist=new int[pppn];
for(int k=0;k<pppn;k++)
{i++;devicelist[k]=atoi(arg[i]);}
}
else
pppn=atoi(arg[i]);
}
if(strcmp(arg[i],"pinned")==0)
{
if(++i==narg)
error->all("Invalid Options for 'accelerator' command. Expecting a number after 'pinned' option.");
pinned=atoi(arg[i])==0?false:true;
if((pinned==false)&&(universe->me==0)) printf(" #CUDA: Pinned memory is not used for communication\n");
}
if(strcmp(arg[i],"dotiming")==0)
{
dotiming=true;
}
if(strcmp(arg[i],"suffix")==0)
{
if(++i==narg)
error->all("Invalid Options for 'accelerator' command. Expecting a string after 'suffix' option.");
strcpy(lmp->asuffix,arg[i]);
}
if(strcmp(arg[i],"overlap_comm")==0)
{
shared_data.overlap_comm=1;
}
if(strcmp(arg[i],"dotest")==0)
{
if(++i==narg)
error->all("Invalid Options for 'accelerator' command. Expecting a number after 'dotest' option.");
testatom=atoi(arg[i]);
dotestatom=true;
}
if(strcmp(arg[i],"override_bpa")==0)
{
if(++i==narg)
error->all("Invalid Options for 'accelerator' command. Expecting a number after 'override_bpa' option.");
shared_data.pair.override_block_per_atom = atoi(arg[i]);
}
}
CudaWrapper_Init(0, (char**)0,universe->me,pppn,devicelist);
//if(shared_data.overlap_comm)
CudaWrapper_AddStreams(3);
cu_x = 0;
cu_v = 0;
cu_f = 0;
cu_tag = 0;
cu_type = 0;
cu_mask = 0;
cu_image = 0;
cu_xhold = 0;
cu_q = 0;
cu_rmass = 0;
cu_mass = 0;
cu_virial = 0;
cu_eatom = 0;
cu_vatom = 0;
cu_radius = 0;
cu_density = 0;
cu_omega = 0;
cu_torque = 0;
cu_special = 0;
cu_nspecial = 0;
cu_molecule = 0;
cu_x_type = 0;
cu_v_radius = 0;
cu_omega_rmass = 0;
cu_binned_id = 0;
cu_binned_idnew = 0;
device_set=true;
allocate();
delete [] devicelist;
}
void Cuda::setSharedDataZero()
{
MYDBG(printf("# CUDA: Cuda::setSharedDataZero ...\n");)
shared_data.atom.nlocal = 0;
shared_data.atom.nghost = 0;
shared_data.atom.nall = 0;
shared_data.atom.nmax = 0;
shared_data.atom.ntypes = 0;
shared_data.atom.q_flag = 0;
shared_data.atom.need_eatom = 0;
shared_data.atom.need_vatom = 0;
shared_data.pair.cudable_force = 0;
shared_data.pair.collect_forces_later = 0;
shared_data.pair.use_block_per_atom = 0;
shared_data.pair.override_block_per_atom = -1;
shared_data.pair.cut = 0;
shared_data.pair.cutsq = 0;
shared_data.pair.cut_inner = 0;
shared_data.pair.cut_coul = 0;
shared_data.pair.special_lj = 0;
shared_data.pair.special_coul = 0;
shared_data.pppm.cudable_force = 0;
shared_data.buffersize = 0;
shared_data.buffer_new = 1;
shared_data.buffer = NULL;
shared_data.comm.comm_phase=0;
shared_data.overlap_comm=0;
shared_data.comm.buffer = NULL;
shared_data.comm.buffer_size=0;
shared_data.comm.overlap_split_ratio=0;
// setTimingsZero();
}
void Cuda::allocate()
{
accelerator(0,NULL);
MYDBG(printf("# CUDA: Cuda::allocate ...\n");)
if(not cu_virial)
{
cu_virial = new cCudaData<double, ENERGY_FLOAT, x > (NULL, & shared_data.pair.virial , 6);
cu_eng_vdwl = new cCudaData<double, ENERGY_FLOAT, x > (NULL, & shared_data.pair.eng_vdwl ,1);
cu_eng_coul = new cCudaData<double, ENERGY_FLOAT, x > (NULL, & shared_data.pair.eng_coul ,1);
cu_extent = new cCudaData<double, double, x> (extent, 6);
shared_data.flag = CudaWrapper_AllocCudaData(sizeof(int));
int size=2*CUDA_MAX_DEBUG_SIZE;
debugdata = new int[size];
cu_debugdata = new cCudaData<int, int, x > (debugdata , size);
shared_data.debugdata=cu_debugdata->dev_data();
}
checkResize();
setSystemParams();
MYDBG(printf("# CUDA: Cuda::allocate done...\n");)
}
void Cuda::setSystemParams()
{
MYDBG(printf("# CUDA: Cuda::setSystemParams ...\n");)
shared_data.atom.nlocal = atom->nlocal;
shared_data.atom.nghost = atom->nghost;
shared_data.atom.nall = atom->nlocal + atom->nghost;
shared_data.atom.ntypes = atom->ntypes;
shared_data.atom.q_flag = atom->q_flag;
shared_data.atom.rmass_flag = atom->rmass_flag;
MYDBG(printf("# CUDA: Cuda::setSystemParams done ...\n");)
}
void Cuda::setDomainParams()
{
MYDBG(printf("# CUDA: Cuda::setDomainParams ...\n");)
cuda_shared_domain* cu_domain = &shared_data.domain;
cu_domain->triclinic = domain->triclinic;
for(short i=0; i<3; ++i)
{
cu_domain->periodicity[i] = domain->periodicity[i];
cu_domain->sublo[i] = domain->sublo[i];
cu_domain->subhi[i] = domain->subhi[i];
cu_domain->boxlo[i] = domain->boxlo[i];
cu_domain->boxhi[i] = domain->boxhi[i];
cu_domain->prd[i] = domain->prd[i];
}
if(domain->triclinic)
{
for(short i=0; i<3; ++i)
{
cu_domain->boxlo_lamda[i] = domain->boxlo_lamda[i];
cu_domain->boxhi_lamda[i] = domain->boxhi_lamda[i];
cu_domain->prd_lamda[i] = domain->prd_lamda[i];
}
cu_domain->xy = domain->xy;
cu_domain->xz = domain->xz;
cu_domain->yz = domain->yz;
}
for(int i=0;i<6;i++)
{
cu_domain->h[i]=domain->h[i];
cu_domain->h_inv[i]=domain->h_inv[i];
cu_domain->h_rate[i]=domain->h_rate[i];
}
cu_domain->update=2;
MYDBG(printf("# CUDA: Cuda::setDomainParams done ...\n");)
}
void Cuda::checkResize()
{
MYDBG(printf("# CUDA: Cuda::checkResize ...\n");)
accelerator(0,NULL);
cuda_shared_atom* cu_atom = & shared_data.atom;
cuda_shared_pair* cu_pair = & shared_data.pair;
cu_atom->q_flag = atom->q_flag;
cu_atom->rmass_flag = atom->rmass ? 1 : 0;
cu_atom->nall = atom->nlocal + atom->nghost;
cu_atom->nlocal = atom->nlocal;
cu_atom->nghost = atom->nghost;
// do we have more atoms to upload than currently allocated memory on device? (also true if nothing yet allocated)
if(atom->nmax > cu_atom->nmax || cu_tag == NULL)
{
delete cu_x; cu_x = new cCudaData<double, X_FLOAT, yx> ((double*)atom->x , & cu_atom->x , atom->nmax, 3,0,true); //cu_x->set_buffer(&(shared_data.buffer),&(shared_data.buffersize),true);
delete cu_v; cu_v = new cCudaData<double, V_FLOAT, yx> ((double*)atom->v, & cu_atom->v , atom->nmax, 3);
delete cu_f; cu_f = new cCudaData<double, F_FLOAT, yx> ((double*)atom->f, & cu_atom->f , atom->nmax, 3,0,true);
delete cu_tag; cu_tag = new cCudaData<int , int , x > (atom->tag , & cu_atom->tag , atom->nmax );
delete cu_type; cu_type = new cCudaData<int , int , x > (atom->type , & cu_atom->type , atom->nmax );
delete cu_mask; cu_mask = new cCudaData<int , int , x > (atom->mask , & cu_atom->mask , atom->nmax );
delete cu_image; cu_image = new cCudaData<int , int , x > (atom->image , & cu_atom->image , atom->nmax );
if(atom->rmass)
{delete cu_rmass; cu_rmass = new cCudaData<double, V_FLOAT, x > (atom->rmass , & cu_atom->rmass , atom->nmax );}
if(cu_atom->q_flag)
{delete cu_q; cu_q = new cCudaData<double, F_FLOAT, x > ((double*)atom->q, & cu_atom->q , atom->nmax );}// cu_q->set_buffer(&(copy_buffer),&(copy_buffersize),true);}
/*
if(force->pair)
if(force->pair->eatom)
{delete cu_eatom; cu_eatom = new cCudaData<double, ENERGY_FLOAT, x > (force->pair->eatom, & cu_atom->eatom , atom->nmax );}// cu_eatom->set_buffer(&(copy_buffer),&(copy_buffersize),true);}
if(force->pair)
if(force->pair->vatom)
{delete cu_vatom; cu_vatom = new cCudaData<double, ENERGY_FLOAT, yx > ((double*)force->pair->vatom, & cu_atom->vatom , atom->nmax,6 );}// cu_vatom->set_buffer(&(copy_buffer),&(copy_buffersize),true);}
*/
if(atom->radius)
{
delete cu_radius; cu_radius = new cCudaData<double, X_FLOAT, x > (atom->radius , & cu_atom->radius , atom->nmax );
delete cu_v_radius; cu_v_radius = new cCudaData<V_FLOAT, V_FLOAT, x> (v_radius , & cu_atom->v_radius , atom->nmax*4);
delete cu_omega_rmass; cu_omega_rmass = new cCudaData<V_FLOAT, V_FLOAT, x> (omega_rmass , & cu_atom->omega_rmass , atom->nmax*4);
}
/*
if(atom->density)
{delete cu_density; cu_density = new cCudaData<double, F_FLOAT, x > (atom->density , & cu_atom->density , atom->nmax );}
*/
if(atom->omega)
{delete cu_omega; cu_omega = new cCudaData<double, V_FLOAT, yx > (((double*) atom->omega) , & cu_atom->omega , atom->nmax,3 );}
if(atom->torque)
{delete cu_torque; cu_torque = new cCudaData<double, F_FLOAT, yx > (((double*) atom->torque) , & cu_atom->torque , atom->nmax,3 );}
if(atom->special)
{delete cu_special; cu_special = new cCudaData<int, int, yx > (((int*) &(atom->special[0][0])) , & cu_atom->special , atom->nmax,atom->maxspecial ); shared_data.atom.maxspecial=atom->maxspecial;}
if(atom->nspecial)
{delete cu_nspecial; cu_nspecial = new cCudaData<int, int, yx > (((int*) atom->nspecial) , & cu_atom->nspecial , atom->nmax,3 );}
if(atom->molecule)
{delete cu_molecule; cu_molecule = new cCudaData<int, int, x > (((int*) atom->molecule) , & cu_atom->molecule , atom->nmax );}
shared_data.atom.special_flag = neighbor->special_flag;
shared_data.atom.molecular = atom->molecular;
cu_atom->update_nmax = 2;
cu_atom->nmax = atom->nmax;
//delete [] x_type; x_type = new X_FLOAT4[atom->nmax];
delete cu_x_type; cu_x_type = new cCudaData<X_FLOAT, X_FLOAT, x> (x_type , & cu_atom->x_type , atom->nmax*4);
// shared_data.buffer_new = 2;
}
if(((cu_xhold==NULL)||(cu_xhold->get_dim()[0]<neighbor->maxhold))&&neighbor->xhold)
{
delete cu_xhold; cu_xhold = new cCudaData<double, X_FLOAT, yx> ((double*)neighbor->xhold, & cu_atom->xhold , neighbor->maxhold, 3);
shared_data.atom.maxhold=neighbor->maxhold;
}
if(atom->mass && !cu_mass)
{cu_mass = new cCudaData<double, V_FLOAT, x > (atom->mass , & cu_atom->mass , atom->ntypes+1);}
cu_atom->mass_host = atom->mass;
if(atom->map_style==1)
{
if((cu_map_array==NULL))
{
cu_map_array = new cCudaData<int, int, x > (atom->get_map_array() , & cu_atom->map_array , atom->get_map_size() );
}
}
// if any of the host pointers have changed (e.g. re-allocated somewhere else), set to correct pointer
if(cu_x ->get_host_data() != atom->x) cu_x ->set_host_data((double*) (atom->x));
if(cu_v ->get_host_data() != atom->v) cu_v ->set_host_data((double*) (atom->v));
if(cu_f ->get_host_data() != atom->f) cu_f ->set_host_data((double*) (atom->f));
if(cu_tag ->get_host_data() != atom->tag) cu_tag ->set_host_data(atom->tag);
if(cu_type->get_host_data() != atom->type) cu_type->set_host_data(atom->type);
if(cu_mask->get_host_data() != atom->mask) cu_mask->set_host_data(atom->mask);
if(cu_image->get_host_data() != atom->image) cu_image->set_host_data(atom->image);
if(cu_xhold)
if(cu_xhold->get_host_data()!= neighbor->xhold) cu_xhold->set_host_data((double*)(neighbor->xhold));
if(atom->rmass)
if(cu_rmass->get_host_data() != atom->rmass) cu_rmass->set_host_data((double*) (atom->rmass));
if(cu_atom->q_flag)
if(cu_q->get_host_data() != atom->q) cu_q->set_host_data((double*) (atom->q));
if(atom->radius)
if(cu_radius->get_host_data() != atom->radius) cu_radius->set_host_data((double*) (atom->radius));
/*
if(atom->density)
if(cu_density->get_host_data() != atom->density) cu_density->set_host_data((double*) (atom->density));
*/
if(atom->omega)
if(cu_omega->get_host_data() != atom->omega) cu_omega->set_host_data((double*) (atom->omega));
if(atom->torque)
if(cu_torque->get_host_data() != atom->torque) cu_torque->set_host_data((double*) (atom->torque));
if(atom->special)
if(cu_special->get_host_data() != atom->special)
{delete cu_special; cu_special = new cCudaData<int, int, yx > (((int*) atom->special) , & cu_atom->special , atom->nmax,atom->maxspecial ); shared_data.atom.maxspecial=atom->maxspecial;}
if(atom->nspecial)
if(cu_nspecial->get_host_data() != atom->nspecial) cu_nspecial->set_host_data((int*) (atom->nspecial));
if(atom->molecule)
if(cu_molecule->get_host_data() != atom->molecule) cu_molecule->set_host_data((int*) (atom->molecule));
if(force->pair)
if(cu_virial ->get_host_data() != force->pair->virial) cu_virial ->set_host_data(force->pair->virial);
if(force->pair)
if(cu_eng_vdwl ->get_host_data() != &force->pair->eng_vdwl) cu_eng_vdwl ->set_host_data(&force->pair->eng_vdwl);
if(force->pair)
if(cu_eng_coul ->get_host_data() != &force->pair->eng_coul) cu_eng_coul ->set_host_data(&force->pair->eng_coul);
cu_atom->update_nlocal = 2;
MYDBG(printf("# CUDA: Cuda::checkResize done...\n");)
}
void Cuda::evsetup_eatom_vatom(int eflag_atom,int vflag_atom)
{
if(eflag_atom)
{
if(not cu_eatom)
cu_eatom = new cCudaData<double, ENERGY_FLOAT, x > (force->pair->eatom, & (shared_data.atom.eatom) , atom->nmax );// cu_eatom->set_buffer(&(copy_buffer),&(copy_buffersize),true);}
cu_eatom->set_host_data(force->pair->eatom);
cu_eatom->memset_device(0);
}
if(vflag_atom)
{
if(not cu_vatom)
cu_vatom = new cCudaData<double, ENERGY_FLOAT, yx > ((double*)force->pair->vatom, & (shared_data.atom.vatom) , atom->nmax ,6 );// cu_vatom->set_buffer(&(copy_buffer),&(copy_buffersize),true);}
cu_vatom->set_host_data((double*)force->pair->vatom);
cu_vatom->memset_device(0);
}
}
void Cuda::uploadAll()
{
MYDBG(printf("# CUDA: Cuda::uploadAll() ... start\n");)
timespec starttime;
timespec endtime;
if(atom->nmax!=shared_data.atom.nmax) checkResize();
clock_gettime(CLOCK_REALTIME,&starttime);
cu_x ->upload();
cu_v ->upload();
cu_f ->upload();
cu_tag ->upload();
cu_type->upload();
cu_mask->upload();
cu_image->upload();
if(shared_data.atom.q_flag) cu_q ->upload();
//printf("A3\n");
//if(shared_data.atom.need_eatom) cu_eatom->upload();
//printf("A4\n");
//if(shared_data.atom.need_vatom) cu_vatom->upload();
//printf("A5\n");
if(atom->rmass) cu_rmass->upload();
if(atom->radius) cu_radius->upload();
// if(atom->density) cu_density->upload();
if(atom->omega) cu_omega->upload();
if(atom->torque) cu_torque->upload();
if(atom->special) cu_special->upload();
if(atom->nspecial) cu_nspecial->upload();
if(atom->molecule) cu_molecule->upload();
if(cu_eatom) cu_eatom->upload();
if(cu_vatom) cu_vatom->upload();
clock_gettime(CLOCK_REALTIME,&endtime);
uploadtime+=(endtime.tv_sec-starttime.tv_sec+1.0*(endtime.tv_nsec-starttime.tv_nsec)/1000000000);
CUDA_IF_BINNING(Cuda_PreBinning(& shared_data);)
CUDA_IF_BINNING(Cuda_Binning (& shared_data);)
shared_data.atom.triggerneighsq=neighbor->triggersq;
MYDBG(printf("# CUDA: Cuda::uploadAll() ... end\n");)
}
void Cuda::downloadAll()
{
MYDBG(printf("# CUDA: Cuda::downloadAll() ... start\n");)
timespec starttime;
timespec endtime;
if(atom->nmax!=shared_data.atom.nmax) checkResize();
CUDA_IF_BINNING( Cuda_ReverseBinning(& shared_data); )
clock_gettime(CLOCK_REALTIME,&starttime);
cu_x ->download();
cu_v ->download();
cu_f ->download();
cu_type->download();
cu_tag ->download();
cu_mask->download();
cu_image->download();
//if(shared_data.atom.need_eatom) cu_eatom->download();
//if(shared_data.atom.need_vatom) cu_vatom->download();
if(shared_data.atom.q_flag) cu_q ->download();
if(atom->rmass) cu_rmass->download();
if(atom->radius) cu_radius->download();
// if(atom->density) cu_density->download();
if(atom->omega) cu_omega->download();
if(atom->torque) cu_torque->download();
if(atom->special) cu_special->download();
if(atom->nspecial) cu_nspecial->download();
if(atom->molecule) cu_molecule->download();
if(cu_eatom) cu_eatom->download();
if(cu_vatom) cu_vatom->download();
clock_gettime(CLOCK_REALTIME,&endtime);
downloadtime+=(endtime.tv_sec-starttime.tv_sec+1.0*(endtime.tv_nsec-starttime.tv_nsec)/1000000000);
MYDBG(printf("# CUDA: Cuda::downloadAll() ... end\n");)
}
void Cuda::downloadX()
{
Cuda_Pair_RevertXType(& this->shared_data);
cu_x->download();
}
CudaNeighList* Cuda::registerNeighborList(class NeighList* neigh_list)
{
MYDBG(printf("# CUDA: Cuda::registerNeighborList() ... start a\n");)
std::map<NeighList*, CudaNeighList*>::iterator p = neigh_lists.find(neigh_list);
if(p != neigh_lists.end()) return p->second;
else
{
CudaNeighList* neigh_list_cuda = new CudaNeighList(lmp, neigh_list);
neigh_lists.insert(std::pair<NeighList*, CudaNeighList*>(neigh_list, neigh_list_cuda));
return neigh_list_cuda;
}
MYDBG(printf("# CUDA: Cuda::registerNeighborList() ... end b\n");)
}
void Cuda::uploadAllNeighborLists()
{
MYDBG(printf("# CUDA: Cuda::uploadAllNeighborList() ... start\n");)
std::map<NeighList*, CudaNeighList*>::iterator p = neigh_lists.begin();
while(p != neigh_lists.end())
{
p->second->nl_upload();
if(not (p->second->neigh_list->cuda_list->build_cuda))
for(int i=0;i<atom->nlocal;i++)
p->second->sneighlist.maxneighbors=MAX(p->second->neigh_list->numneigh[i],p->second->sneighlist.maxneighbors) ;
++p;
}
MYDBG(printf("# CUDA: Cuda::uploadAllNeighborList() ... done\n");)
}
void Cuda::downloadAllNeighborLists()
{
MYDBG(printf("# CUDA: Cuda::downloadAllNeighborList() ... start\n");)
std::map<NeighList*, CudaNeighList*>::iterator p = neigh_lists.begin();
while(p != neigh_lists.end())
{
p->second->nl_download();
++p;
}
}
void Cuda::update_xhold(int &maxhold,double* xhold)
{
if(this->shared_data.atom.maxhold<atom->nmax)
{
maxhold = atom->nmax;
delete this->cu_xhold; this->cu_xhold = new cCudaData<double, X_FLOAT, yx> ((double*)xhold, & this->shared_data.atom.xhold , maxhold, 3);
}
this->shared_data.atom.maxhold=maxhold;
CudaWrapper_CopyData(this->cu_xhold->dev_data(),this->cu_x->dev_data(),3*atom->nmax*sizeof(X_FLOAT));
}
void Cuda::setTimingsZero()
{
shared_data.cuda_timings.test1=0;
shared_data.cuda_timings.test2=0;
//communication
shared_data.cuda_timings.comm_forward_total = 0;
shared_data.cuda_timings.comm_forward_mpi_upper = 0;
shared_data.cuda_timings.comm_forward_mpi_lower = 0;
shared_data.cuda_timings.comm_forward_kernel_pack = 0;
shared_data.cuda_timings.comm_forward_kernel_unpack = 0;
shared_data.cuda_timings.comm_forward_upload = 0;
shared_data.cuda_timings.comm_forward_download = 0;
shared_data.cuda_timings.comm_exchange_total = 0;
shared_data.cuda_timings.comm_exchange_mpi = 0;
shared_data.cuda_timings.comm_exchange_kernel_pack = 0;
shared_data.cuda_timings.comm_exchange_kernel_unpack = 0;
shared_data.cuda_timings.comm_exchange_kernel_fill = 0;
shared_data.cuda_timings.comm_exchange_cpu_pack= 0;
shared_data.cuda_timings.comm_exchange_upload = 0;
shared_data.cuda_timings.comm_exchange_download = 0;
shared_data.cuda_timings.comm_border_total = 0;
shared_data.cuda_timings.comm_border_mpi = 0;
shared_data.cuda_timings.comm_border_kernel_pack = 0;
shared_data.cuda_timings.comm_border_kernel_unpack = 0;
shared_data.cuda_timings.comm_border_kernel_buildlist = 0;
shared_data.cuda_timings.comm_border_kernel_self = 0;
shared_data.cuda_timings.comm_border_upload = 0;
shared_data.cuda_timings.comm_border_download = 0;
//pair forces
shared_data.cuda_timings.pair_xtype_conversion = 0;
shared_data.cuda_timings.pair_kernel = 0;
shared_data.cuda_timings.pair_virial = 0;
shared_data.cuda_timings.pair_force_collection = 0;
//neighbor
shared_data.cuda_timings.neigh_bin = 0;
shared_data.cuda_timings.neigh_build = 0;
shared_data.cuda_timings.neigh_special = 0;
//PPPM
shared_data.cuda_timings.pppm_particle_map = 0;
shared_data.cuda_timings.pppm_make_rho = 0;
shared_data.cuda_timings.pppm_brick2fft = 0;
shared_data.cuda_timings.pppm_poisson = 0;
shared_data.cuda_timings.pppm_fillbrick = 0;
shared_data.cuda_timings.pppm_fieldforce = 0;
shared_data.cuda_timings.pppm_compute = 0;
CudaWrapper_CheckUploadTime(true);
CudaWrapper_CheckDownloadTime(true);
CudaWrapper_CheckCPUBufUploadTime(true);
CudaWrapper_CheckCPUBufDownloadTime(true);
}
void Cuda::print_timings()
{
if(universe->me!=0) return;
if(not dotiming) return;
printf("\n # CUDA: Special timings\n\n");
printf("\n Transfer Times\n");
printf(" PCIe Upload: \t %lf s\n",CudaWrapper_CheckUploadTime());
printf(" PCIe Download:\t %lf s\n",CudaWrapper_CheckDownloadTime());
printf(" CPU Tempbbuf Upload: \t %lf \n",CudaWrapper_CheckCPUBufUploadTime());
printf(" CPU Tempbbuf Download: \t %lf \n",CudaWrapper_CheckCPUBufDownloadTime());
printf("\n Communication \n");
printf(" Forward Total \t %lf \n",shared_data.cuda_timings.comm_forward_total);
printf(" Forward MPI Upper Bound \t %lf \n",shared_data.cuda_timings.comm_forward_mpi_upper);
printf(" Forward MPI Lower Bound \t %lf \n",shared_data.cuda_timings.comm_forward_mpi_lower);
printf(" Forward Kernel Pack \t %lf \n",shared_data.cuda_timings.comm_forward_kernel_pack);
printf(" Forward Kernel Unpack \t %lf \n",shared_data.cuda_timings.comm_forward_kernel_unpack);
printf(" Forward Kernel Self \t %lf \n",shared_data.cuda_timings.comm_forward_kernel_self);
printf(" Forward Upload \t %lf \n",shared_data.cuda_timings.comm_forward_upload);
printf(" Forward Download \t %lf \n",shared_data.cuda_timings.comm_forward_download);
printf(" Forward Overlap Split Ratio\t %lf \n",shared_data.comm.overlap_split_ratio);
printf("\n");
printf(" Exchange Total \t %lf \n",shared_data.cuda_timings.comm_exchange_total);
printf(" Exchange MPI \t %lf \n",shared_data.cuda_timings.comm_exchange_mpi);
printf(" Exchange Kernel Pack \t %lf \n",shared_data.cuda_timings.comm_exchange_kernel_pack);
printf(" Exchange Kernel Unpack \t %lf \n",shared_data.cuda_timings.comm_exchange_kernel_unpack);
printf(" Exchange Kernel Fill \t %lf \n",shared_data.cuda_timings.comm_exchange_kernel_fill);
printf(" Exchange CPU Pack \t %lf \n",shared_data.cuda_timings.comm_exchange_cpu_pack);
printf(" Exchange Upload \t %lf \n",shared_data.cuda_timings.comm_exchange_upload);
printf(" Exchange Download \t %lf \n",shared_data.cuda_timings.comm_exchange_download);
printf("\n");
printf(" Border Total \t %lf \n",shared_data.cuda_timings.comm_border_total);
printf(" Border MPI \t %lf \n",shared_data.cuda_timings.comm_border_mpi);
printf(" Border Kernel Pack \t %lf \n",shared_data.cuda_timings.comm_border_kernel_pack);
printf(" Border Kernel Unpack \t %lf \n",shared_data.cuda_timings.comm_border_kernel_unpack);
printf(" Border Kernel Self \t %lf \n",shared_data.cuda_timings.comm_border_kernel_self);
printf(" Border Kernel BuildList \t %lf \n",shared_data.cuda_timings.comm_border_kernel_buildlist);
printf(" Border Upload \t %lf \n",shared_data.cuda_timings.comm_border_upload);
printf(" Border Download \t %lf \n",shared_data.cuda_timings.comm_border_download);
printf("\n");
//pair forces
printf(" Pair XType Conversion \t %lf \n",shared_data.cuda_timings.pair_xtype_conversion );
printf(" Pair Kernel \t %lf \n",shared_data.cuda_timings.pair_kernel );
printf(" Pair Virial \t %lf \n",shared_data.cuda_timings.pair_virial );
printf(" Pair Force Collection \t %lf \n",shared_data.cuda_timings.pair_force_collection );
printf("\n");
//neighbor
printf(" Neighbor Binning \t %lf \n",shared_data.cuda_timings.neigh_bin );
printf(" Neighbor Build \t %lf \n",shared_data.cuda_timings.neigh_build );
printf(" Neighbor Special \t %lf \n",shared_data.cuda_timings.neigh_special );
printf("\n");
//pppm
if(force->kspace)
{
printf(" PPPM Total \t %lf \n",shared_data.cuda_timings.pppm_compute );
printf(" PPPM Particle Map \t %lf \n",shared_data.cuda_timings.pppm_particle_map );
printf(" PPPM Make Rho \t %lf \n",shared_data.cuda_timings.pppm_make_rho );
printf(" PPPM Brick2fft \t %lf \n",shared_data.cuda_timings.pppm_brick2fft );
printf(" PPPM Poisson \t %lf \n",shared_data.cuda_timings.pppm_poisson );
printf(" PPPM Fillbrick \t %lf \n",shared_data.cuda_timings.pppm_fillbrick );
printf(" PPPM Fieldforce \t %lf \n",shared_data.cuda_timings.pppm_fieldforce );
printf("\n");
}
printf(" Debug Test 1 \t %lf \n",shared_data.cuda_timings.test1);
printf(" Debug Test 2 \t %lf \n",shared_data.cuda_timings.test2);
printf("\n");
}
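
Cuda::registerNeighborList() above is a plain find-or-insert cache keyed on the host NeighList pointer, and Cuda::~Cuda() walks the same map to free the wrappers. A standalone sketch of that pattern with stand-in types:

#include <cstdio>
#include <map>

struct NeighList { };                        // stand-in for LAMMPS_NS::NeighList
struct CudaNeighList { NeighList *host; };   // stand-in for the device-side wrapper

static std::map<NeighList*, CudaNeighList*> neigh_lists;

// Return the cached wrapper for this list, or create and cache a new one,
// mirroring the control flow of Cuda::registerNeighborList() above.
CudaNeighList *register_list(NeighList *nl)
{
  std::map<NeighList*, CudaNeighList*>::iterator p = neigh_lists.find(nl);
  if (p != neigh_lists.end()) return p->second;
  CudaNeighList *wrapper = new CudaNeighList;
  wrapper->host = nl;
  neigh_lists.insert(std::pair<NeighList*, CudaNeighList*>(nl, wrapper));
  return wrapper;
}

int main()
{
  NeighList nl;
  CudaNeighList *a = register_list(&nl);
  CudaNeighList *b = register_list(&nl);
  std::printf("same wrapper returned: %s\n", a == b ? "yes" : "no");
  // free wrappers the same way Cuda::~Cuda() walks the map
  for (std::map<NeighList*, CudaNeighList*>::iterator p = neigh_lists.begin();
       p != neigh_lists.end(); ++p)
    delete p->second;
  return 0;
}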

src/USER-CUDA/cuda.h (Normal file, 153 lines added)

@@ -0,0 +1,153 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#ifndef CUDA_H
#define CUDA_H
#include "pointers.h"
#include "cuda_shared.h"
#include "cuda_data.h"
#include "cuda_precision.h"
#include <map>
#ifdef _DEBUG
#define MYDBG(a) a
#else
#define MYDBG(a)
#endif
namespace LAMMPS_NS
{
class Cuda : protected Pointers
{
public:
Cuda(class LAMMPS *);
~Cuda();
//static void setDevice(class LAMMPS*);
void allocate();
void accelerator(int, char **);
void setSharedDataZero();
void setSystemParams();
void setDomainParams();
void checkResize();
void evsetup_eatom_vatom(int eflag_atom,int vflag_atom);
void uploadAll();
void downloadAll();
void downloadX();
class CudaNeighList* registerNeighborList(class NeighList* neigh_list);
void uploadAllNeighborLists();
void downloadAllNeighborLists();
void set_neighinit(int dist_check, double triggerneighsq)
{
shared_data.atom.dist_check=dist_check;
shared_data.atom.triggerneighsq = triggerneighsq;
}
bool decide_by_integrator()
{
return neighbor_decide_by_integrator && cu_xhold && finished_setup;
}
void update_xhold(int &maxhold,double* xhold);
void setTimingsZero();
void print_timings();
void cu_x_download() {cu_x->download();}
bool device_set;
bool dotiming;
bool dotestatom;
int testatom;
double uploadtime,downloadtime;
bool finished_setup,begin_setup;
bool oncpu;
bool finished_run;
int self_comm;
int cuda_exists;
double extent[6];
int* debugdata;
// data shared between host code and device code
// (number of atoms, device pointers for up- & download)
cuda_shared_data shared_data;
cCudaData<double , F_FLOAT , x >* cu_q;
cCudaData<double , F_FLOAT , yx>* cu_f;
cCudaData<double , V_FLOAT , x >* cu_mass;
cCudaData<double , V_FLOAT , x >* cu_rmass;
cCudaData<double , V_FLOAT , yx>* cu_v;
cCudaData<double , X_FLOAT , yx>* cu_x;
cCudaData<double , X_FLOAT , yx>* cu_xhold;
cCudaData<int , int , x >* cu_mask;
cCudaData<int , int , x >* cu_tag;
cCudaData<int , int , x >* cu_type;
cCudaData<int , int , x >* cu_image;
cCudaData<double , ENERGY_FLOAT, x >* cu_eatom;
cCudaData<double , ENERGY_FLOAT, yx>* cu_vatom;
cCudaData<double , ENERGY_FLOAT, x >* cu_virial;
cCudaData<double , ENERGY_FLOAT, x >* cu_eng_vdwl;
cCudaData<double , ENERGY_FLOAT, x >* cu_eng_coul;
cCudaData<double , double , x >* cu_extent;
int* binned_id;
cCudaData<int , int , xx >* cu_binned_id;
int* binned_idnew;
cCudaData<int , int , xx >* cu_binned_idnew;
cCudaData<int , int , x >* cu_debugdata;
cCudaData<double , X_FLOAT , x>* cu_radius;
cCudaData<double , F_FLOAT , x>* cu_density;
cCudaData<double , V_FLOAT , yx>* cu_omega;
cCudaData<double , F_FLOAT , yx>* cu_torque;
cCudaData<int , int , yx >* cu_special;
cCudaData<int , int , yx >* cu_nspecial;
cCudaData<int , int , x >* cu_molecule;
cCudaData<X_FLOAT , X_FLOAT , x>* cu_x_type;
X_FLOAT* x_type;
cCudaData<V_FLOAT , V_FLOAT , x>* cu_v_radius;
V_FLOAT* v_radius;
cCudaData<V_FLOAT , V_FLOAT , x>* cu_omega_rmass;
V_FLOAT* omega_rmass;
cCudaData<int , int , x >* cu_map_array;
int neighbor_decide_by_integrator;
bool pinned;
void* copy_buffer;
int copy_buffersize;
private:
std::map<class NeighList*, class CudaNeighList*> neigh_lists;
};
}
#endif // CUDA_H
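
The precision codes checked by the constructor's compile-settings warnings in cuda.cpp above (prec_glob, prec_x, ...) are simply sizeof(type)/4, so 1 denotes single and 2 denotes double precision for the typedefs from cuda_precision.h. A standalone sketch of that encoding:

#include <cstdio>

// sizeof(float)/4 == 1 (single precision), sizeof(double)/4 == 2 (double),
// matching the "cuda %i cpp %i" codes printed by the constructor's warnings.
template <typename T> int prec_code() { return (int)(sizeof(T) / 4); }

int main()
{
  std::printf("float  -> %d\n", prec_code<float>());   // prints 1
  std::printf("double -> %d\n", prec_code<double>());  // prints 2
  return 0;
}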

src/USER-CUDA/cuda_common.h (Normal file, 344 lines added)

@@ -0,0 +1,344 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#ifndef _CUDA_COMMON_H_
#define _CUDA_COMMON_H_
//#include "cutil.h"
#include "cuda_precision.h"
#include "cuda_wrapper_cu.h"
#define CUDA_MAX_TYPES_PLUS_ONE 12 //for pair styles which use constant space for parameters, this needs to be one larger than the number of atom types
//this cannot be arbitrarily large, since constant space is limited.
//in principle one could alter potentials to use global memory for parameters; some do that already, since the first examples I encountered had a high number (20+) of atom types
//Christian
#define CUDA_MAX_TYPES2 (CUDA_MAX_TYPES_PLUS_ONE * CUDA_MAX_TYPES_PLUS_ONE)
#define CUDA_MAX_NSPECIAL 25
// define some easy-to-use debug and emulation macros
#ifdef _DEBUG
#define MYDBG(a) a
#else
#define MYDBG(a)
#endif
#if __DEVICE_EMULATION__
#define MYEMU(a) a
#else
#define MYEMU(a)
#endif
#define MYEMUDBG(a) MYEMU(MYDBG(a))
// Add prefix (needed as a workaround: identical constant names in different files would otherwise conflict)
#define MY_ADD_PREFIX(prefix, var) prefix##_##var
#define MY_ADD_PREFIX2(prefix, var) MY_ADD_PREFIX(prefix, var)
#define MY_AP(var) MY_ADD_PREFIX2(MY_PREFIX, var)
#define MY_VAR_TO_STR(var) #var
#define MY_VAR_TO_STR2(var) MY_VAR_TO_STR(var)
#define MY_CONST(var) (MY_VAR_TO_STR2(MY_PREFIX) "_" MY_VAR_TO_STR2(var))
#define CUDA_USE_TEXTURE
#define CUDA_USE_FLOAT4
//constants used by many classes
//domain
#define _boxhi MY_AP(boxhi)
#define _boxlo MY_AP(boxlo)
#define _subhi MY_AP(subhi)
#define _sublo MY_AP(sublo)
#define _box_size MY_AP(box_size)
#define _prd MY_AP(prd)
#define _periodicity MY_AP(periodicity)
#define _triclinic MY_AP(triclinic)
#define _boxhi_lamda MY_AP(boxhi_lamda)
#define _boxlo_lamda MY_AP(boxlo_lamda)
#define _prd_lamda MY_AP(prd_lamda)
#define _h MY_AP(h)
#define _h_inv MY_AP(h_inv)
#define _h_rate MY_AP(h_rate)
__device__ __constant__ X_FLOAT _boxhi[3];
__device__ __constant__ X_FLOAT _boxlo[3];
__device__ __constant__ X_FLOAT _subhi[3];
__device__ __constant__ X_FLOAT _sublo[3];
__device__ __constant__ X_FLOAT _box_size[3];
__device__ __constant__ X_FLOAT _prd[3];
__device__ __constant__ int _periodicity[3];
__device__ __constant__ int _triclinic;
__device__ __constant__ X_FLOAT _boxhi_lamda[3];
__device__ __constant__ X_FLOAT _boxlo_lamda[3];
__device__ __constant__ X_FLOAT _prd_lamda[3];
__device__ __constant__ X_FLOAT _h[6];
__device__ __constant__ X_FLOAT _h_inv[6];
__device__ __constant__ V_FLOAT _h_rate[6];
//atom properties
#define _x MY_AP(x)
#define _v MY_AP(v)
#define _f MY_AP(f)
#define _tag MY_AP(tag)
#define _type MY_AP(type)
#define _mask MY_AP(mask)
#define _image MY_AP(image)
#define _q MY_AP(q)
#define _mass MY_AP(mass)
#define _rmass MY_AP(rmass)
#define _rmass_flag MY_AP(rmass_flag)
#define _eatom MY_AP(eatom)
#define _vatom MY_AP(vatom)
#define _x_type MY_AP(x_type)
#define _radius MY_AP(radius)
#define _density MY_AP(density)
#define _omega MY_AP(omega)
#define _torque MY_AP(torque)
#define _special MY_AP(special)
#define _maxspecial MY_AP(maxspecial)
#define _nspecial MY_AP(nspecial)
#define _special_flag MY_AP(special_flag)
#define _molecule MY_AP(molecule)
#define _v_radius MY_AP(v_radius)
#define _omega_rmass MY_AP(omega_rmass)
#define _freeze_group_bit MY_AP(freeze_group_bit)
#define _map_array MY_AP(map_array)
__device__ __constant__ X_FLOAT* _x; //holds pointer to positions
__device__ __constant__ V_FLOAT* _v;
__device__ __constant__ F_FLOAT* _f;
__device__ __constant__ int* _tag;
__device__ __constant__ int* _type;
__device__ __constant__ int* _mask;
__device__ __constant__ int* _image;
__device__ __constant__ V_FLOAT* _mass;
__device__ __constant__ F_FLOAT* _q;
__device__ __constant__ V_FLOAT* _rmass;
__device__ __constant__ int _rmass_flag;
__device__ __constant__ ENERGY_FLOAT* _eatom;
__device__ __constant__ ENERGY_FLOAT* _vatom;
__device__ __constant__ X_FLOAT4* _x_type; //holds pointer to packed position+type array
__device__ __constant__ X_FLOAT* _radius;
__device__ __constant__ F_FLOAT* _density;
__device__ __constant__ V_FLOAT* _omega;
__device__ __constant__ F_FLOAT* _torque;
__device__ __constant__ int* _special;
__device__ __constant__ int _maxspecial;
__device__ __constant__ int* _nspecial;
__device__ __constant__ int _special_flag[4];
__device__ __constant__ int* _molecule;
__device__ __constant__ V_FLOAT4* _v_radius; //holds pointer to packed velocity+radius array
__device__ __constant__ V_FLOAT4* _omega_rmass; //holds pointer to packed omega+rmass array
__device__ __constant__ int _freeze_group_bit;
__device__ __constant__ int* _map_array;
#ifdef CUDA_USE_TEXTURE
#define _x_tex MY_AP(x_tex)
#if X_PRECISION == 1
texture<float> _x_tex;
#else
texture<int2,1> _x_tex;
#endif
#define _type_tex MY_AP(type_tex)
texture<int> _type_tex;
#define _x_type_tex MY_AP(x_type_tex)
#if X_PRECISION == 1
texture<float4,1> _x_type_tex;
#else
texture<int4,1> _x_type_tex;
#endif
#define _v_radius_tex MY_AP(v_radius_tex)
#if V_PRECISION == 1
texture<float4,1> _v_radius_tex;
#else
texture<int4,1> _v_radius_tex;
#endif
#define _omega_rmass_tex MY_AP(omega_rmass_tex)
#if V_PRECISION == 1
texture<float4,1> _omega_rmass_tex;
#else
texture<int4,1> _omega_rmass_tex;
#endif
#define _q_tex MY_AP(q_tex)
#if F_PRECISION == 1
texture<float> _q_tex;
#else
texture<int2,1> _q_tex;
#endif
#endif
//neighbor
#ifdef IncludeCommonNeigh
#define _inum MY_AP(inum)
#define _inum_border MY_AP(inum_border)
#define _ilist MY_AP(ilist)
#define _ilist_border MY_AP(ilist_border)
#define _numneigh MY_AP(numneigh)
#define _numneigh_border MY_AP(numneigh_border)
#define _numneigh_inner MY_AP(numneigh_inner)
#define _firstneigh MY_AP(firstneigh)
#define _neighbors MY_AP(neighbors)
#define _neighbors_border MY_AP(neighbors_border)
#define _neighbors_inner MY_AP(neighbors_inner)
#define _reneigh_flag MY_AP(reneigh_flag)
#define _triggerneighsq MY_AP(triggerneighsq)
#define _xhold MY_AP(xhold)
#define _maxhold MY_AP(maxhold)
#define _dist_check MY_AP(dist_check)
#define _neighbor_maxlocal MY_AP(neighbor_maxlocal)
#define _maxneighbors MY_AP(maxneighbors)
#define _overlap_comm MY_AP(overlap_comm)
__device__ __constant__ int _inum;
__device__ __constant__ int* _inum_border;
__device__ __constant__ int* _ilist;
__device__ __constant__ int* _ilist_border;
__device__ __constant__ int* _numneigh;
__device__ __constant__ int* _numneigh_border;
__device__ __constant__ int* _numneigh_inner;
__device__ __constant__ int** _firstneigh;
__device__ __constant__ int* _neighbors;
__device__ __constant__ int* _neighbors_border;
__device__ __constant__ int* _neighbors_inner;
__device__ __constant__ int* _reneigh_flag;
__device__ __constant__ X_FLOAT _triggerneighsq;
__device__ __constant__ X_FLOAT* _xhold; //holds pointer to positions
__device__ __constant__ int _maxhold;
__device__ __constant__ int _dist_check;
__device__ __constant__ int _neighbor_maxlocal;
__device__ __constant__ int _maxneighbors;
__device__ __constant__ int _overlap_comm;
#endif
//system properties
#define _nall MY_AP(nall)
#define _nghost MY_AP(nghost)
#define _nlocal MY_AP(nlocal)
#define _nmax MY_AP(nmax)
#define _cuda_ntypes MY_AP(cuda_ntypes)
#define _dtf MY_AP(dtf)
#define _dtv MY_AP(dtv)
#define _factor MY_AP(factor)
#define _virial MY_AP(virial)
#define _eng_vdwl MY_AP(eng_vdwl)
#define _eng_coul MY_AP(eng_coul)
#define _molecular MY_AP(molecular)
__device__ __constant__ unsigned _nall;
__device__ __constant__ unsigned _nghost;
__device__ __constant__ unsigned _nlocal;
__device__ __constant__ unsigned _nmax;
__device__ __constant__ unsigned _cuda_ntypes;
__device__ __constant__ V_FLOAT _dtf;
__device__ __constant__ X_FLOAT _dtv;
__device__ __constant__ V_FLOAT _factor;
__device__ __constant__ ENERGY_FLOAT* _virial;
__device__ __constant__ ENERGY_FLOAT* _eng_vdwl;
__device__ __constant__ ENERGY_FLOAT* _eng_coul;
__device__ __constant__ int _molecular;
//other general constants
#define _buffer MY_AP(buffer)
#define _flag MY_AP(flag)
#define _debugdata MY_AP(debugdata)
__device__ __constant__ void* _buffer;
__device__ __constant__ int* _flag;
__device__ __constant__ int* _debugdata;
// pointers to data fields on the GPU are held in constant space
// -> reduces register usage and number of parameters for kernel calls
// will be variables of file scope in cuda files
// may be used to output a cudaError_t
#define MY_OUTPUT_RESULT(result) \
switch(result) \
{ \
case cudaSuccess: printf(" => cudaSuccess\n"); break; \
case cudaErrorInvalidValue: printf(" => cudaErrorInvalidValue\n"); break; \
case cudaErrorInvalidSymbol: printf(" => cudaErrorInvalidSymbol\n"); break; \
case cudaErrorInvalidDevicePointer: printf(" => cudaErrorInvalidDevicePointer\n"); break; \
case cudaErrorInvalidMemcpyDirection: printf(" => cudaErrorInvalidMemcpyDirection\n"); break; \
default: printf(" => unknown\n"); break; \
}
#ifdef _DEBUG
# define CUT_CHECK_ERROR(errorMessage) { \
cudaError_t err = cudaGetLastError(); \
if( cudaSuccess != err) { \
fprintf(stderr, "Cuda error: %s in file '%s' in line %i : %s.\n", \
errorMessage, __FILE__, __LINE__, cudaGetErrorString( err) );\
exit(EXIT_FAILURE); \
} \
err = cudaThreadSynchronize(); \
if( cudaSuccess != err) { \
fprintf(stderr, "Cuda error: %s in file '%s' in line %i : %s.\n", \
errorMessage, __FILE__, __LINE__, cudaGetErrorString( err) );\
exit(EXIT_FAILURE); \
} \
}
#else
# define CUT_CHECK_ERROR(errorMessage) { \
cudaError_t err = cudaGetLastError(); \
if( cudaSuccess != err) { \
fprintf(stderr, "Cuda error: %s in file '%s' in line %i : %s.\n", \
errorMessage, __FILE__, __LINE__, cudaGetErrorString( err) );\
exit(EXIT_FAILURE); \
} \
}
#endif
# define CUDA_SAFE_CALL_NO_SYNC( call) { \
cudaError err = call; \
if( cudaSuccess != err) { \
fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n", \
__FILE__, __LINE__, cudaGetErrorString( err) ); \
exit(EXIT_FAILURE); \
} }
# define CUDA_SAFE_CALL( call) CUDA_SAFE_CALL_NO_SYNC(call);
#define X_MASK 1
#define V_MASK 2
#define F_MASK 4
#define TAG_MASK 8
#define TYPE_MASK 16
#define MASK_MASK 32
#define IMAGE_MASK 64
#define Q_MASK 128
#define MOLECULE_MASK 256
#define RMASS_MASK 512
#define RADIUS_MASK 1024
#define DENSITY_MASK 2048
#define OMEGA_MASK 4096
#define TORQUE_MASK 8192
#endif // _CUDA_COMMON_H_
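
As the "Add prefix" comment above notes, the MY_AP/MY_CONST machinery exists so that each compilation unit gets uniquely named copies of these constant symbols instead of conflicting definitions. A standalone sketch of the expansion, using a hypothetical MY_PREFIX:

#include <cstdio>

#define MY_ADD_PREFIX(prefix, var) prefix##_##var
#define MY_ADD_PREFIX2(prefix, var) MY_ADD_PREFIX(prefix, var)
#define MY_AP(var) MY_ADD_PREFIX2(MY_PREFIX, var)
#define MY_VAR_TO_STR(var) #var
#define MY_VAR_TO_STR2(var) MY_VAR_TO_STR(var)
#define MY_CONST(var) (MY_VAR_TO_STR2(MY_PREFIX) "_" MY_VAR_TO_STR2(var))

#define MY_PREFIX pair_lj_cut   // hypothetical prefix; each compilation unit sets its own

int MY_AP(boxhi);               // expands to: int pair_lj_cut_boxhi;

int main()
{
  pair_lj_cut_boxhi = 42;
  // MY_CONST(boxhi) expands to the string literal "pair_lj_cut_boxhi"
  std::printf("%s = %d\n", MY_CONST(boxhi), pair_lj_cut_boxhi);
  return 0;
}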

src/USER-CUDA/cuda_data.h (Normal file, 796 lines added)

@@ -0,0 +1,796 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#ifndef _CUDA_DATA_H_
#define _CUDA_DATA_H_
enum copy_mode {x, xx, xy, yx, xyz, xzy}; // yxz, yzx, zxy, zyx not yet implemented since they were not needed yet
//xx==x; but in atom_vec, x is a member, so copy mode x would produce compile errors there (hence the xx alias)
#include "cuda_shared.h"
#include "cuda_wrapper_cu.h"
#include "cuda_data_cu.h"
#include <ctime>
#include <cstdio>
#include <typeinfo>
template <typename host_type, typename dev_type, copy_mode mode>
class cCudaData
{
protected:
void** buffer;
int* buf_size;
host_type* host_data;
dev_array* dev_data_array;
dev_type* temp_data;
unsigned nbytes;
bool owns_dev_array;
bool current_data_on_device; //this is not yet working as intended and therefore deactivated
bool current_data_on_host;
bool is_continues;
bool pinned;
public:
cCudaData(host_type* host_data, dev_array* dev_data_array, unsigned dim_x, unsigned dim_y=0, unsigned dim_z=0, bool is_pinned=false);
cCudaData(host_type* host_data, unsigned dim_x, unsigned dim_y=0, unsigned dim_z=0, bool is_pinned=false);
~cCudaData();
void* dev_data() {if(dev_data_array!=NULL) return dev_data_array->dev_data; else return NULL;};
void set_dev_data(void* adev_data) {dev_data_array->dev_data=adev_data;};
void set_dev_array(dev_array* adev_array) {dev_data_array=adev_array;};
void set_host_data(host_type* host_data);
void* get_host_data() { return host_data;};
void set_buffer(void** buffer,int* buf_size,bool ais_continues);
unsigned int* get_dim() {return dev_data_array->dim;};
// if you want to upload data to the gpu, which will not change there, then set will_be_changed=false
// if you want to upload data to the gpu and update it there, then set will_be_changed=true (default)
void upload(bool will_be_changed=true);
void uploadAsync(int stream, bool will_be_changed=true );
// if you want to download data just to have a look at it, then set will_be_changed=false
// if you are going to modify the downloaded data, then set will_be_changed=true (default)
void download(bool will_be_changed=true);
void downloadAsync(int stream);
void memset_device(int value);
void device_data_has_changed() {current_data_on_device=false;}
void host_data_has_changed() {current_data_on_host=false;}
int dev_size() {
int size = dev_data_array->dim[0]*sizeof(dev_type);
if(dev_data_array->dim[1]) size*=dev_data_array->dim[1];
if(dev_data_array->dim[2]) size*=dev_data_array->dim[2];
return size;}
};
template <typename host_type, typename dev_type, copy_mode mode>
cCudaData<host_type, dev_type, mode>
::cCudaData(host_type* host_data, dev_array* dev_data_array, unsigned dim_x, unsigned dim_y, unsigned dim_z, bool is_pinned)
{
pinned=is_pinned;
owns_dev_array = false;
current_data_on_device = false;
current_data_on_host = false;
is_continues = false;
this->host_data = host_data;
this->dev_data_array = dev_data_array;
unsigned ndev;
if((mode == x)||(mode==xx))
{
ndev = dim_x;
dev_data_array->dim[0] = dim_x;
dev_data_array->dim[1] = 0;
dev_data_array->dim[2] = 0;
}
else if(mode == xy || mode == yx )
{
ndev = dim_x * dim_y;
dev_data_array->dim[0] = dim_x;
dev_data_array->dim[1] = dim_y;
dev_data_array->dim[2] = 0;
}
else
{
ndev = dim_x * dim_y * dim_z;
dev_data_array->dim[0] = dim_x;
dev_data_array->dim[1] = dim_y;
dev_data_array->dim[2] = dim_z;
}
nbytes = ndev * sizeof(dev_type);
if(nbytes<=0)
{
host_data=NULL;
temp_data=NULL;
dev_data_array->dev_data=NULL;
return;
}
dev_data_array->dev_data = CudaWrapper_AllocCudaData(nbytes);
if(((mode!=x)&&(mode!=xx)) || typeid(host_type) != typeid(dev_type))
{
if(not pinned)
temp_data = new dev_type[ndev];
else
{
temp_data = (dev_type*) CudaWrapper_AllocPinnedHostData(ndev*sizeof(dev_type));
}
}
}
template <typename host_type, typename dev_type, copy_mode mode>
cCudaData<host_type, dev_type, mode>
::cCudaData(host_type* host_data, unsigned dim_x, unsigned dim_y, unsigned dim_z, bool is_pinned)
{
pinned=is_pinned;
this->dev_data_array = new dev_array;
this->owns_dev_array = true;
current_data_on_device = false;
current_data_on_host = false;
is_continues = false;
this->host_data = host_data;
unsigned ndev;
if((mode == x)||(mode==xx))
{
ndev = dim_x;
dev_data_array->dim[0] = dim_x;
dev_data_array->dim[1] = 0;
dev_data_array->dim[2] = 0;
}
else if(mode == xy || mode == yx )
{
ndev = dim_x * dim_y;
dev_data_array->dim[0] = dim_x;
dev_data_array->dim[1] = dim_y;
dev_data_array->dim[2] = 0;
}
else
{
ndev = dim_x * dim_y * dim_z;
dev_data_array->dim[0] = dim_x;
dev_data_array->dim[1] = dim_y;
dev_data_array->dim[2] = dim_z;
}
nbytes = ndev * sizeof(dev_type);
if(nbytes<=0)
{
host_data=NULL;
temp_data=NULL;
dev_data_array->dev_data=NULL;
return;
}
dev_data_array->dev_data = CudaWrapper_AllocCudaData(nbytes);
if(((mode!=x)&&(mode!=xx)) || (typeid(host_type) != typeid(dev_type)))
{
if(not pinned)
temp_data = new dev_type[ndev];
else
{
temp_data = (dev_type*) CudaWrapper_AllocPinnedHostData(ndev*sizeof(dev_type));
}
}
}
template <typename host_type, typename dev_type, copy_mode mode>
cCudaData<host_type, dev_type, mode>
::~cCudaData()
{
if(((mode!=x)&&(mode!=xx)) || typeid(host_type) != typeid(dev_type))
{
if(not pinned)
delete [] temp_data;
else
{
CudaWrapper_FreePinnedHostData((void*)temp_data);
}
}
if((dev_data_array->dev_data)&&(nbytes>0))
CudaWrapper_FreeCudaData(dev_data_array->dev_data,nbytes);
if(owns_dev_array) delete dev_data_array;
}
template <typename host_type, typename dev_type, copy_mode mode>
void cCudaData<host_type, dev_type, mode>
::set_host_data(host_type* host_data)
{
this->host_data = host_data;
}
template <typename host_type, typename dev_type, copy_mode mode>
void cCudaData<host_type, dev_type, mode>
::upload(bool will_be_changed)
{
// if current data is already up, do not re-upload it
// if(current_data_on_device) return;
if(buffer&&is_continues)
{
printf("Actual Buffer: %p %i\n",*buffer,*buf_size);
if(typeid(host_type)==typeid(double))
{
if(typeid(dev_type)==typeid(double))
{
CudaData_Upload_DoubleDouble((void*) host_data,dev_data_array->dev_data,
dev_data_array->dim,mode,*buffer);
current_data_on_device = true;
if(will_be_changed) current_data_on_host = false;
return;
}
else if(typeid(dev_type)==typeid(float))
{
CudaData_Upload_DoubleFloat((void*) host_data,dev_data_array->dev_data,
dev_data_array->dim,mode,*buffer);
current_data_on_device = true;
if(will_be_changed) current_data_on_host = false;
return;
}
}
else if(typeid(host_type)==typeid(float))
{
if(typeid(dev_type)==typeid(double))
{
CudaData_Upload_FloatDouble((void*) host_data,dev_data_array->dev_data,
dev_data_array->dim,mode,*buffer);
current_data_on_device = true;
if(will_be_changed) current_data_on_host = false;
return;
}
else if(typeid(dev_type)==typeid(float))
{
CudaData_Upload_FloatFloat((void*) host_data,dev_data_array->dev_data,
dev_data_array->dim,mode,*buffer);
current_data_on_device = true;
if(will_be_changed) current_data_on_host = false;
return;
}
}
else if(typeid(host_type)==typeid(int))
{
if(typeid(dev_type)==typeid(int))
{
CudaData_Upload_IntInt((void*) host_data,dev_data_array->dev_data,
dev_data_array->dim,mode,*buffer);
current_data_on_device = true;
if(will_be_changed) current_data_on_host = false;
return;
}
}
}
switch(mode)
{
case x:
{
if(typeid(host_type) == typeid(dev_type))
CudaWrapper_UploadCudaData(host_data, dev_data_array->dev_data, nbytes);
else
{
timespec time1,time2;
clock_gettime(CLOCK_REALTIME,&time1);
for(unsigned i=0; i<dev_data_array->dim[0]; ++i) temp_data[i] = static_cast<dev_type>(host_data[i]);
clock_gettime(CLOCK_REALTIME,&time2);
CudaWrapper_AddCPUBufUploadTime(
time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000);
CudaWrapper_UploadCudaData(temp_data, dev_data_array->dev_data, nbytes);
}
break;
}
case xx:
{
if(typeid(host_type) == typeid(dev_type))
CudaWrapper_UploadCudaData(host_data, dev_data_array->dev_data, nbytes);
else
{
timespec time1,time2;
clock_gettime(CLOCK_REALTIME,&time1);
for(unsigned i=0; i<dev_data_array->dim[0]; ++i) temp_data[i] = static_cast<dev_type>(host_data[i]);
clock_gettime(CLOCK_REALTIME,&time2);
CudaWrapper_AddCPUBufUploadTime(
time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000);
CudaWrapper_UploadCudaData(temp_data, dev_data_array->dev_data, nbytes);
}
break;
}
case xy:
{
timespec time1,time2;
clock_gettime(CLOCK_REALTIME,&time1);
for(unsigned i=0; i<dev_data_array->dim[0]; ++i)
{
dev_type* temp = &temp_data[i * dev_data_array->dim[1]];
for(unsigned j=0; j<dev_data_array->dim[1]; ++j)
{
temp[j] = static_cast<dev_type>((reinterpret_cast<host_type**>(host_data))[i][j]);
}
}
clock_gettime(CLOCK_REALTIME,&time2);
CudaWrapper_AddCPUBufUploadTime(
time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000);
CudaWrapper_UploadCudaData(temp_data, dev_data_array->dev_data, nbytes);
break;
}
case yx:
{
timespec time1,time2;
clock_gettime(CLOCK_REALTIME,&time1);
for(unsigned j=0; j<dev_data_array->dim[1]; ++j)
{
dev_type* temp = &temp_data[j*dev_data_array->dim[0]];
for(unsigned i=0; i<dev_data_array->dim[0]; ++i)
{
temp[i] = static_cast<dev_type>(reinterpret_cast<host_type**>(host_data)[i][j]);
}
}
clock_gettime(CLOCK_REALTIME,&time2);
CudaWrapper_AddCPUBufUploadTime(
time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000);
CudaWrapper_UploadCudaData(temp_data, dev_data_array->dev_data, nbytes);
break;
}
case xyz:
{
timespec time1,time2;
clock_gettime(CLOCK_REALTIME,&time1);
for(unsigned i=0; i<dev_data_array->dim[0]; ++i)
for(unsigned j=0; j<dev_data_array->dim[1]; ++j)
{
dev_type* temp = &temp_data[(i*dev_data_array->dim[1]+j)*dev_data_array->dim[2]];
for(unsigned k=0; k<dev_data_array->dim[2]; ++k)
{
temp[k] = static_cast<dev_type>(reinterpret_cast<host_type***>(host_data)[i][j][k]);
}
}
clock_gettime(CLOCK_REALTIME,&time2);
CudaWrapper_AddCPUBufUploadTime(
time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000);
CudaWrapper_UploadCudaData(temp_data, dev_data_array->dev_data, nbytes);
break;
}
case xzy:
{
timespec time1,time2;
clock_gettime(CLOCK_REALTIME,&time1);
for(unsigned i=0; i<dev_data_array->dim[0]; ++i)
for(unsigned k=0; k<dev_data_array->dim[2]; ++k)
{
dev_type* temp = &temp_data[(i*dev_data_array->dim[2]+k)*dev_data_array->dim[1]];
for(unsigned j=0; j<dev_data_array->dim[1]; ++j)
{
temp[j] = static_cast<dev_type>(reinterpret_cast<host_type***>(host_data)[i][j][k]);
}
}
clock_gettime(CLOCK_REALTIME,&time2);
CudaWrapper_AddCPUBufUploadTime(
time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000);
CudaWrapper_UploadCudaData(temp_data, dev_data_array->dev_data, nbytes);
break;
}
}
// we have uploaded the data to the device, i.e.:
current_data_on_device = true;
// the data is going to change on the device, making the host data out-dated
if(will_be_changed) current_data_on_host = false;
}
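
/* Editor's note (illustrative, not part of the original commit): for the 2-D
   and 3-D modes the host data is a pointer array (e.g. a double** allocated
   by LAMMPS' Memory class), so it cannot be copied in one memcpy.  The loops
   above therefore cast and pack it into the contiguous temp_data staging
   buffer, timed as CPU buffer time, and a single transfer is issued after:

     host x[i][j]  ->  temp_data[i*dim[1]+j] = static_cast<dev_type>(x[i][j])
                   ->  CudaWrapper_UploadCudaData(temp_data, dev_data, nbytes)
*/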
template <typename host_type, typename dev_type, copy_mode mode>
void cCudaData<host_type, dev_type, mode>
::uploadAsync(int stream,bool will_be_changed)
{
// if current data is already up, do not re-upload it
// if(current_data_on_device) return;
if(buffer&&is_continues)
{
printf("Actual Buffer: %p %i\n",*buffer,*buf_size);
if(typeid(host_type)==typeid(double))
{
if(typeid(dev_type)==typeid(double))
{
CudaData_Upload_DoubleDouble((void*) host_data,dev_data_array->dev_data,
dev_data_array->dim,mode,*buffer);
current_data_on_device = true;
if(will_be_changed) current_data_on_host = false;
return;
}
else if(typeid(dev_type)==typeid(float))
{
CudaData_Upload_DoubleFloat((void*) host_data,dev_data_array->dev_data,
dev_data_array->dim,mode,*buffer);
current_data_on_device = true;
if(will_be_changed) current_data_on_host = false;
return;
}
}
else if(typeid(host_type)==typeid(float))
{
if(typeid(dev_type)==typeid(double))
{
CudaData_Upload_FloatDouble((void*) host_data,dev_data_array->dev_data,
dev_data_array->dim,mode,*buffer);
current_data_on_device = true;
if(will_be_changed) current_data_on_host = false;
return;
}
else if(typeid(dev_type)==typeid(float))
{
CudaData_Upload_FloatFloat((void*) host_data,dev_data_array->dev_data,
dev_data_array->dim,mode,*buffer);
current_data_on_device = true;
if(will_be_changed) current_data_on_host = false;
return;
}
}
else if(typeid(host_type)==typeid(int))
{
if(typeid(dev_type)==typeid(int))
{
CudaData_Upload_IntInt((void*) host_data,dev_data_array->dev_data,
dev_data_array->dim,mode,*buffer);
current_data_on_device = true;
if(will_be_changed) current_data_on_host = false;
return;
}
}
}
switch(mode)
{
case x:
{
if(typeid(host_type) == typeid(dev_type))
CudaWrapper_UploadCudaDataAsync(host_data, dev_data_array->dev_data, nbytes,stream);
else
{
timespec time1,time2;
clock_gettime(CLOCK_REALTIME,&time1);
for(unsigned i=0; i<dev_data_array->dim[0]; ++i) temp_data[i] = static_cast<dev_type>(host_data[i]);
clock_gettime(CLOCK_REALTIME,&time2);
CudaWrapper_AddCPUBufUploadTime(
time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000);
CudaWrapper_UploadCudaDataAsync(temp_data, dev_data_array->dev_data, nbytes,stream);
}
break;
}
case xx:
{
if(typeid(host_type) == typeid(dev_type))
CudaWrapper_UploadCudaDataAsync(host_data, dev_data_array->dev_data, nbytes,stream);
else
{
timespec time1,time2;
clock_gettime(CLOCK_REALTIME,&time1);
for(unsigned i=0; i<dev_data_array->dim[0]; ++i) temp_data[i] = static_cast<dev_type>(host_data[i]);
clock_gettime(CLOCK_REALTIME,&time2);
CudaWrapper_AddCPUBufUploadTime(
time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000);
CudaWrapper_UploadCudaDataAsync(temp_data, dev_data_array->dev_data, nbytes,stream);
}
break;
}
case xy:
{
timespec time1,time2;
clock_gettime(CLOCK_REALTIME,&time1);
for(unsigned i=0; i<dev_data_array->dim[0]; ++i)
{
dev_type* temp = &temp_data[i * dev_data_array->dim[1]];
for(unsigned j=0; j<dev_data_array->dim[1]; ++j)
{
temp[j] = static_cast<dev_type>((reinterpret_cast<host_type**>(host_data))[i][j]);
}
}
clock_gettime(CLOCK_REALTIME,&time2);
CudaWrapper_AddCPUBufUploadTime(
time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000);
CudaWrapper_UploadCudaDataAsync(temp_data, dev_data_array->dev_data, nbytes,stream);
break;
}
case yx:
{
timespec time1,time2;
clock_gettime(CLOCK_REALTIME,&time1);
for(unsigned j=0; j<dev_data_array->dim[1]; ++j)
{
dev_type* temp = &temp_data[j*dev_data_array->dim[0]];
for(unsigned i=0; i<dev_data_array->dim[0]; ++i)
{
temp[i] = static_cast<dev_type>(reinterpret_cast<host_type**>(host_data)[i][j]);
}
}
clock_gettime(CLOCK_REALTIME,&time2);
CudaWrapper_AddCPUBufUploadTime(
time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000);
CudaWrapper_UploadCudaDataAsync(temp_data, dev_data_array->dev_data, nbytes,stream);
break;
}
case xyz:
{
timespec time1,time2;
clock_gettime(CLOCK_REALTIME,&time1);
for(unsigned i=0; i<dev_data_array->dim[0]; ++i)
for(unsigned j=0; j<dev_data_array->dim[1]; ++j)
{
dev_type* temp = &temp_data[(i*dev_data_array->dim[1]+j)*dev_data_array->dim[2]];
for(unsigned k=0; k<dev_data_array->dim[2]; ++k)
{
temp[k] = static_cast<dev_type>(reinterpret_cast<host_type***>(host_data)[i][j][k]);
}
}
clock_gettime(CLOCK_REALTIME,&time2);
CudaWrapper_AddCPUBufUploadTime(
time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000);
CudaWrapper_UploadCudaDataAsync(temp_data, dev_data_array->dev_data, nbytes,stream);
break;
}
case xzy:
{
timespec time1,time2;
clock_gettime(CLOCK_REALTIME,&time1);
for(unsigned i=0; i<dev_data_array->dim[0]; ++i)
for(unsigned k=0; k<dev_data_array->dim[2]; ++k)
{
dev_type* temp = &temp_data[(i*dev_data_array->dim[2]+k)*dev_data_array->dim[1]];
for(unsigned j=0; j<dev_data_array->dim[1]; ++j)
{
temp[j] = static_cast<dev_type>(reinterpret_cast<host_type***>(host_data)[i][j][k]);
}
}
clock_gettime(CLOCK_REALTIME,&time2);
CudaWrapper_AddCPUBufUploadTime(
time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000);
CudaWrapper_UploadCudaDataAsync(temp_data, dev_data_array->dev_data, nbytes,stream);
break;
}
}
// we have uploaded the data to the device, i.e.:
current_data_on_device = true;
// the data is going to change on the device, making the host data out-dated
if(will_be_changed) current_data_on_host = false;
}
template <typename host_type, typename dev_type, copy_mode mode>
void cCudaData<host_type, dev_type, mode>
::download(bool will_be_changed)
{
// if current data is already down, do not re-download it
// if(current_data_on_host) return;
switch(mode)
{
case x:
{
if(typeid(host_type) == typeid(dev_type))
CudaWrapper_DownloadCudaData(host_data, dev_data_array->dev_data, nbytes);
else
{
CudaWrapper_DownloadCudaData(temp_data, dev_data_array->dev_data, nbytes);
timespec time1,time2;
clock_gettime(CLOCK_REALTIME,&time1);
for(unsigned i=0; i<dev_data_array->dim[0]; ++i) host_data[i] = static_cast<host_type>(temp_data[i]);
clock_gettime(CLOCK_REALTIME,&time2);
CudaWrapper_AddCPUBufDownloadTime(
time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000);
}
break;
}
case xx:
{
if(typeid(host_type) == typeid(dev_type))
CudaWrapper_DownloadCudaData(host_data, dev_data_array->dev_data, nbytes);
else
{
CudaWrapper_DownloadCudaData(temp_data, dev_data_array->dev_data, nbytes);
timespec time1,time2;
clock_gettime(CLOCK_REALTIME,&time1);
for(unsigned i=0; i<dev_data_array->dim[0]; ++i) host_data[i] = static_cast<host_type>(temp_data[i]);
clock_gettime(CLOCK_REALTIME,&time2);
CudaWrapper_AddCPUBufDownloadTime(
time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000);
}
break;
}
case xy:
{
CudaWrapper_DownloadCudaData(temp_data, dev_data_array->dev_data, nbytes);
timespec time1,time2;
clock_gettime(CLOCK_REALTIME,&time1);
for(unsigned i=0; i<dev_data_array->dim[0]; ++i)
{
dev_type* temp = &temp_data[i * dev_data_array->dim[1]];
for(unsigned j=0; j<dev_data_array->dim[1]; ++j)
{
reinterpret_cast<host_type**>(host_data)[i][j] = static_cast<host_type>(temp[j]);
}
}
clock_gettime(CLOCK_REALTIME,&time2);
CudaWrapper_AddCPUBufDownloadTime(
time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000);
break;
}
case yx:
{
CudaWrapper_DownloadCudaData(temp_data, dev_data_array->dev_data, nbytes);
timespec time1,time2;
clock_gettime(CLOCK_REALTIME,&time1);
for(unsigned j=0; j<dev_data_array->dim[1]; ++j)
{
dev_type* temp = &temp_data[j*dev_data_array->dim[0]];
for(unsigned i=0; i<dev_data_array->dim[0]; ++i)
{
reinterpret_cast<host_type**>(host_data)[i][j] = static_cast<host_type>(temp[i]);
}
}
clock_gettime(CLOCK_REALTIME,&time2);
CudaWrapper_AddCPUBufDownloadTime(
time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000);
break;
}
case xyz:
{
CudaWrapper_DownloadCudaData(temp_data, dev_data_array->dev_data, nbytes);
timespec time1,time2;
clock_gettime(CLOCK_REALTIME,&time1);
for(unsigned i=0; i<dev_data_array->dim[0]; ++i)
for(unsigned j=0; j<dev_data_array->dim[1]; ++j)
{
dev_type* temp = &temp_data[(i * dev_data_array->dim[1]+j)*dev_data_array->dim[2]];
for(unsigned k=0; k<dev_data_array->dim[2]; ++k)
{
reinterpret_cast<host_type***>(host_data)[i][j][k] = static_cast<host_type>(temp[k]);
}
}
clock_gettime(CLOCK_REALTIME,&time2);
CudaWrapper_AddCPUBufDownloadTime(
time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000);
break;
}
case xzy:
{
CudaWrapper_DownloadCudaData(temp_data, dev_data_array->dev_data, nbytes);
timespec time1,time2;
clock_gettime(CLOCK_REALTIME,&time1);
for(unsigned i=0; i<dev_data_array->dim[0]; ++i)
for(unsigned k=0; k<dev_data_array->dim[2]; ++k)
{
dev_type* temp = &temp_data[(i * dev_data_array->dim[2]+k)*dev_data_array->dim[1]];
for(unsigned j=0; j<dev_data_array->dim[1]; ++j)
{
reinterpret_cast<host_type***>(host_data)[i][j][k] = static_cast<host_type>(temp[j]);
}
}
clock_gettime(CLOCK_REALTIME,&time2);
CudaWrapper_AddCPUBufDownloadTime(
time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000);
break;
}
}
// we have downloaded the data to the host, i.e.:
current_data_on_host = true;
// the data is going to change on the host, making the device data out-dated
if(will_be_changed) current_data_on_device = false;
}
template <typename host_type, typename dev_type, copy_mode mode>
void cCudaData<host_type, dev_type, mode>
::downloadAsync(int stream)
{
switch(mode)
{
case x:
{
if(typeid(host_type) == typeid(dev_type))
{
CudaWrapper_DownloadCudaDataAsync(host_data, dev_data_array->dev_data, nbytes, stream);
CudaWrapper_SyncStream(stream);
}
else
{
CudaWrapper_DownloadCudaDataAsync(temp_data, dev_data_array->dev_data, nbytes, stream);
CudaWrapper_SyncStream(stream);
for(unsigned i=0; i<dev_data_array->dim[0]; ++i) host_data[i] = static_cast<host_type>(temp_data[i]);
}
break;
}
case xx:
{
if(typeid(host_type) == typeid(dev_type))
{
CudaWrapper_DownloadCudaDataAsync(host_data, dev_data_array->dev_data, nbytes, stream);
CudaWrapper_SyncStream(stream);
}
else
{
CudaWrapper_DownloadCudaDataAsync(temp_data, dev_data_array->dev_data, nbytes, stream);
CudaWrapper_SyncStream(stream);
for(unsigned i=0; i<dev_data_array->dim[0]; ++i) host_data[i] = static_cast<host_type>(temp_data[i]);
}
break;
}
case xy:
{
CudaWrapper_DownloadCudaDataAsync(temp_data, dev_data_array->dev_data, nbytes, stream);
CudaWrapper_SyncStream(stream);
for(unsigned i=0; i<dev_data_array->dim[0]; ++i)
{
dev_type* temp = &temp_data[i * dev_data_array->dim[1]];
for(unsigned j=0; j<dev_data_array->dim[1]; ++j)
{
reinterpret_cast<host_type**>(host_data)[i][j] = static_cast<host_type>(temp[j]);
}
}
break;
}
case yx:
{
CudaWrapper_DownloadCudaDataAsync(temp_data, dev_data_array->dev_data, nbytes, stream);
CudaWrapper_SyncStream(stream);
for(unsigned j=0; j<dev_data_array->dim[1]; ++j)
{
dev_type* temp = &temp_data[j*dev_data_array->dim[0]];
for(unsigned i=0; i<dev_data_array->dim[0]; ++i)
{
reinterpret_cast<host_type**>(host_data)[i][j] = static_cast<host_type>(temp[i]);
}
}
break;
}
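      // editor's note: unlike download(), the 3-D modes xyz and xzy are not
      // handled here; callers holding 3-D data must use the synchronous download()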
}
}
template <typename host_type, typename dev_type, copy_mode mode>
void cCudaData<host_type, dev_type, mode>
::memset_device(int value)
{
CudaWrapper_Memset(dev_data_array->dev_data,value, nbytes);
}
template <typename host_type, typename dev_type, copy_mode mode>
void cCudaData<host_type, dev_type, mode>
::set_buffer(void** abuffer,int* abuf_size,bool ais_continues)
{
buffer = abuffer;
buf_size = abuf_size;
unsigned nbytes_buf=(nbytes/sizeof(dev_type))*sizeof(host_type);
if(buffer!=NULL)
if(not((typeid(host_type) == typeid(dev_type))&&(mode == x || mode == xx)))
{
printf("Allocate Buffer: %p %i\n",*buffer,*buf_size);
if(((*buffer)!=NULL)&&(*buf_size<nbytes_buf))
CudaWrapper_FreeCudaData(*buffer,*buf_size);
if(*buf_size<nbytes_buf)
{
*buffer = CudaWrapper_AllocCudaData(nbytes_buf);
*buf_size = nbytes_buf;
}
printf("Allocate Buffer2: %p %i\n",*buffer,*buf_size);
}
is_continues=ais_continues;
}
#endif // _CUDA_DATA_H_
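
// ----------------------------------------------------------------------
// Editor's usage sketch (illustrative, not part of the original commit).
// Assuming the unused dimensions of the constructor above default to 0, a
// per-atom host array is mirrored on the device and kept coherent through
// the current_data_on_host / current_data_on_device flags:
//
//   double* q = new double[nmax];                 // host charges
//   cCudaData<double, F_FLOAT, x> cu_q(q, nmax);  // allocates device memory
//   cu_q.upload(true);   // host -> device; device copy is about to change
//   // ... kernels modify the device copy ...
//   cu_q.download();     // device -> host; both copies current again
// ----------------------------------------------------------------------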

39
src/USER-CUDA/cuda_modify_flags.h Normal file
View File

@ -0,0 +1,39 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#ifndef CUDA_MODIFY_FLAGS_H
#define CUDA_MODIFY_FLAGS_H
#define INITIAL_INTEGRATE_CUDA (1 << 16)
#define POST_INTEGRATE_CUDA (1 << 17)
#define PRE_EXCHANGE_CUDA (1 << 18)
#define PRE_NEIGHBOR_CUDA (1 << 19)
#define PRE_FORCE_CUDA (1 << 20)
#define POST_FORCE_CUDA (1 << 21)
#define FINAL_INTEGRATE_CUDA (1 << 22)
#define END_OF_STEP_CUDA (1 << 23)
#define THERMO_ENERGY_CUDA (1 << 24)
#define MIN_POST_FORCE_CUDA (1 << 25)
// remember not to shift over 31 bits
#endif // CUDA_MODIFY_FLAGS_H
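
// Editor's sketch (illustrative, not part of the original commit): a
// CUDA-capable fix advertises the device variants of its callbacks through
// these bits in its setmask(), e.g. for a hypothetical FixNVECuda:
//
//   int FixNVECuda::setmask()
//   {
//     return INITIAL_INTEGRATE_CUDA | FINAL_INTEGRATE_CUDA;
//   }
//
// ModifyCuda::init() (see modify_cuda.cpp below) then sorts such a fix into
// the *_cuda call lists instead of the host lists.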

180
src/USER-CUDA/cuda_neigh_list.cpp Normal file
View File

@ -0,0 +1,180 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include "cuda_neigh_list.h"
#include "neigh_list.h"
#include <cstring>
#include <vector>
#include <map>
#include <algorithm>
#include "cuda.h"
#include "atom.h"
using namespace LAMMPS_NS;
CudaNeighList::CudaNeighList(LAMMPS *lmp, class NeighList* neigh_list) : Pointers(lmp)
{
cuda = lmp->cuda;
MYDBG(printf("# CUDA: CudaNeighList::cudaNeighList() ... start\n");)
this->neigh_list = neigh_list;
neigh_list->cuda_list=this;
sneighlist.maxlocal = neigh_list->get_maxlocal();
sneighlist.maxneighbors = 32;
sneighlist.maxcut = 0.0;
sneighlist.cutneighsq = NULL;
cu_neighbors = NULL;
cu_neighbors_border = NULL;
cu_neighbors_inner = NULL;
cu_numneigh_border = NULL;
cu_numneigh_inner = NULL;
cu_numneigh = NULL;
cu_ilist = NULL;
cu_ilist_border = NULL;
cu_inum_border = NULL;
inum_border = 0;
neighbors = NULL;
neighbors_inner = NULL;
neighbors_border = NULL;
numneigh_border = NULL;
numneigh_inner = NULL;
ilist_border = NULL;
build_cuda = false;
sneighlist.binned_id=NULL;
sneighlist.bin_dim=new int[3];
sneighlist.bin_dim[0]=0;
sneighlist.bin_dim[1]=0;
sneighlist.bin_dim[2]=0;
cu_ex_type = NULL;
cu_ex1_bit = NULL;
cu_ex2_bit = NULL;
cu_ex_mol_bit = NULL;
sneighlist.nex_type=0;
sneighlist.nex_group=0;
sneighlist.nex_mol=0;
sneighlist.bin_nmax=0;
sneighlist.bin_extraspace=0.05;
MYDBG(printf("# CUDA: CudaNeighList::cudaNeighList() ... end\n");)
}
CudaNeighList::~CudaNeighList()
{
dev_free();
}
void CudaNeighList::dev_alloc()
{
MYDBG( printf("# CUDA: CudaNeighList::dev_alloc() ... start\n"); )
cu_ilist = new cCudaData<int , int , x> (neigh_list->ilist , & sneighlist.ilist , sneighlist.maxlocal );
cu_numneigh = new cCudaData<int , int , x> (neigh_list->numneigh, & sneighlist.numneigh , sneighlist.maxlocal );
neighbors = new int[atom->nmax*sneighlist.maxneighbors];
cu_neighbors= new cCudaData<int, int, x> (neighbors , & sneighlist.neighbors, atom->nmax*sneighlist.maxneighbors );
if(cuda->shared_data.overlap_comm)
{
ilist_border = new int[sneighlist.maxlocal];
numneigh_border = new int[sneighlist.maxlocal];
numneigh_inner = new int[sneighlist.maxlocal];
cu_inum_border = new cCudaData<int , int , x> (&inum_border , & sneighlist.inum_border , 1 );
cu_ilist_border = new cCudaData<int , int , x> (ilist_border , & sneighlist.ilist_border , sneighlist.maxlocal );
cu_numneigh_border = new cCudaData<int , int , x> (numneigh_border , & sneighlist.numneigh_border , sneighlist.maxlocal );
cu_numneigh_inner = new cCudaData<int , int , x> (numneigh_inner , & sneighlist.numneigh_inner , sneighlist.maxlocal );
neighbors_border = new int[sneighlist.maxlocal*sneighlist.maxneighbors];
cu_neighbors_border= new cCudaData<int, int, x> (neighbors_border , & sneighlist.neighbors_border, sneighlist.maxlocal*sneighlist.maxneighbors );
neighbors_inner = new int[sneighlist.maxlocal*sneighlist.maxneighbors];
cu_neighbors_inner = new cCudaData<int, int, x> (neighbors_inner , & sneighlist.neighbors_inner , sneighlist.maxlocal*sneighlist.maxneighbors );
}
MYDBG( printf("# CUDA: CudaNeighList::dev_alloc() ... end\n"); )
}
void CudaNeighList::dev_free()
{
MYDBG( printf("# CUDA: CudaNeighList::dev_free() ... start\n"); )
delete cu_numneigh;
delete cu_ilist;
delete [] neighbors;
delete cu_neighbors;
if(cuda->shared_data.overlap_comm)
{
delete [] ilist_border;
delete [] numneigh_border;
delete [] numneigh_inner;
delete [] neighbors_border;
delete [] neighbors_inner;
delete cu_inum_border;
delete cu_neighbors_border;
delete cu_neighbors_inner;
delete cu_numneigh_border;
delete cu_numneigh_inner;
delete cu_ilist_border;
}
MYDBG( printf("# CUDA: CudaNeighList::dev_free() ... end\n"); )
}
void CudaNeighList::grow_device()
{
MYDBG(printf("# CUDA: CudaNeighList::grow_device() ... start\n");)
// if host has allocated more memory for atom arrays than device has, then allocate more memory on device
int new_maxlocal = neigh_list->get_maxlocal();
if(sneighlist.maxlocal < new_maxlocal)
{
sneighlist.maxlocal = new_maxlocal;
dev_free();
dev_alloc();
}
if(!cu_ilist || !cu_numneigh) dev_alloc();
// check if the host data has been reallocated somewhere else
if(cu_ilist ->get_host_data() != neigh_list->ilist) cu_ilist ->set_host_data(neigh_list->ilist);
if(cu_numneigh->get_host_data() != neigh_list->numneigh) cu_numneigh->set_host_data(neigh_list->numneigh);
MYDBG(printf("# CUDA: CudaNeighList::grow_device() ... end\n");)
}
void CudaNeighList::nl_upload(bool will_be_changed)
{
//return;
MYDBG(printf("# CUDA: CudaNeighList::nl_upload() ... start\n");)
if(cu_ilist)
cu_ilist->upload();
if(cu_numneigh)
cu_numneigh->upload();
MYDBG(printf("# CUDA: CudaNeighList::nl_upload() ... end\n");)
}
void CudaNeighList::nl_download(bool will_be_changed)
{
MYDBG(printf("# CUDA: CudaNeighList::nl_download() ... start\n");)
if(cu_ilist)
cu_ilist->download();
if(cu_numneigh)
cu_numneigh->download();
MYDBG(printf("# CUDA: CudaNeighList::nl_download() ... end\n");)
}
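
/* Editor's usage sketch (illustrative, not part of the original commit):
   after a host-side neighbor build, the wrapper attached to the list is
   typically refreshed and pushed to the device like this:

     CudaNeighList* cu_list = neigh_list->cuda_list; // set in the constructor
     cu_list->grow_device(); // re-allocate if maxlocal grew, re-bind host pointers
     cu_list->nl_upload();   // copy ilist and numneigh to the GPU
*/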

83
src/USER-CUDA/cuda_neigh_list.h Normal file
View File

@ -0,0 +1,83 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#ifndef LMP_NEIGH_LIST_CUDA_H
#define LMP_NEIGH_LIST_CUDA_H
#include "pointers.h"
#include "cuda_data.h"
#include "neigh_list.h"
namespace LAMMPS_NS
{
class CudaNeighList : protected Pointers
{
public:
cCudaData<int , int , x>* cu_ilist;
cCudaData<int , int , x>* cu_numneigh;
cCudaData<int , int , x>* cu_inum_border;
cCudaData<int , int , x>* cu_ilist_border;
cCudaData<int , int , x>* cu_numneigh_border;
cCudaData<int , int , x>* cu_numneigh_inner;
cCudaData<int , int , x>* cu_neighbors;
cCudaData<int , int , x>* cu_neighbors_border;
cCudaData<int , int , x>* cu_neighbors_inner;
cCudaData<int , int , x>* cu_ex_type;
cCudaData<int , int , x>* cu_ex1_bit;
cCudaData<int , int , x>* cu_ex2_bit;
cCudaData<int , int , x>* cu_ex_mol_bit;
cuda_shared_neighlist sneighlist;
int* neighbors;
int* neighbors_inner;
int* neighbors_border;
int inum_border;
int* ilist_border;
int* numneigh_border;
int* numneigh_inner;
int nex_type;
int nex_group;
int nex_mol;
bool build_cuda;
CudaNeighList(class LAMMPS *, class NeighList* neigh_list);
~CudaNeighList();
void grow_device(); // grows page memory on the device keeping old pages; grows list memory on the device deleting old lists
void nl_upload(bool will_be_changed=true);
void nl_download(bool will_be_changed=true);
NeighList* neigh_list;
void dev_alloc();
void dev_free();
private:
class Cuda *cuda;
};
}
#endif

269
src/USER-CUDA/cuda_precision.h Normal file
View File

@ -0,0 +1,269 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#ifndef CUDA_PRECISION_H_
#define CUDA_PRECISION_H_
/* This file gives type definitions for mixed-precision calculation in the cuda part of LAMMPS-CUDA.
 * Default behaviour is set by the global CUDA_PRECISION (can be overwritten during compilation).
 * ***_FLOAT: type definition of the given property
 * ***_F: literal suffix in code (1.0 is interpreted as double while 1.0f as float; write CUDA_F(1.0) to get the literal that matches the chosen precision)
 */
#ifdef CUDA_USE_BINNING
#define CUDA_IF_BINNING(a) a
#else
#define CUDA_IF_BINNING(a)
#endif
//GLOBAL
#ifdef CUDA_PRECISION
#if CUDA_PRECISION == 1
#define CUDA_FLOAT float
#define CUDA_F(x) x##f
#endif
#if CUDA_PRECISION == 2
#define CUDA_FLOAT double
#define CUDA_F(x) x
#endif
#endif
#ifndef CUDA_PRECISION
#define CUDA_FLOAT double
#define CUDA_F(x) x
#define CUDA_PRECISION 2
#endif
//--------------------------------
//-----------FFT-----------------
//--------------------------------
#ifdef FFT_PRECISION_CU
#if FFT_PRECISION_CU == 1
#define FFT_FLOAT float
#define FFT_F(x) x##f
#endif
#if FFT_PRECISION_CU == 2
#define FFT_FLOAT double
#define FFT_F(x) x
#endif
#endif
#ifndef FFT_PRECISION_CU
#define FFT_FLOAT CUDA_FLOAT
#define FFT_F(x) CUDA_F(x)
#define FFT_PRECISION_CU CUDA_PRECISION
#endif
//--------------------------------
//-----------PPPM-----------------
//--------------------------------
#ifdef PPPM_PRECISION
#if PPPM_PRECISION == 1
#define PPPM_FLOAT float
#define PPPM_F(x) x##f
#endif
#if PPPM_PRECISION == 2
#define PPPM_FLOAT double
#define PPPM_F(x) x
#endif
#endif
#ifndef PPPM_PRECISION
#define PPPM_FLOAT CUDA_FLOAT
#define PPPM_F(x) CUDA_F(x)
#define PPPM_PRECISION CUDA_PRECISION
#endif
//--------------------------------
//-----------FORCE-----------------
//--------------------------------
#ifdef F_PRECISION
#if F_PRECISION == 1
#define F_FLOAT float
#define F_F(x) x##f
#endif
#if F_PRECISION == 2
#define F_FLOAT double
#define F_F(x) x
#endif
#endif
#ifndef F_PRECISION
#define F_FLOAT CUDA_FLOAT
#define F_F(x) CUDA_F(x)
#define F_PRECISION CUDA_PRECISION
#endif
#if F_PRECISION == 1
#define _SQRT_ sqrtf
#define _RSQRT_ rsqrtf
#define _EXP_ expf
#else
#define _SQRT_ sqrt
#define _RSQRT_ rsqrt
#define _EXP_ exp
#endif
#if F_PRECISION == 2
struct F_FLOAT2
{
F_FLOAT x;
F_FLOAT y;
};
struct F_FLOAT3
{
F_FLOAT x;
F_FLOAT y;
F_FLOAT z;
};
struct F_FLOAT4
{
F_FLOAT x;
F_FLOAT y;
F_FLOAT z;
F_FLOAT w;
};
#else
#define F_FLOAT2 float2
#define F_FLOAT3 float3
#define F_FLOAT4 float4
#endif
//--------------------------------
//-----------ENERGY-----------------
//--------------------------------
#ifdef ENERGY_PRECISION
#if ENERGY_PRECISION == 1
#define ENERGY_FLOAT float
#define ENERGY_F(x) x##f
#endif
#if ENERGY_PRECISION == 2
#define ENERGY_FLOAT double
#define ENERGY_F(x) x
#endif
#endif
#ifndef ENERGY_PRECISION
#define ENERGY_FLOAT CUDA_FLOAT
#define ENERGY_F(x) CUDA_F(x)
#define ENERGY_PRECISION CUDA_PRECISION
#endif
//--------------------------------
//-----------POSITIONS------------
//--------------------------------
#ifdef X_PRECISION
#if X_PRECISION == 1
#define X_FLOAT float
#define X_F(x) x##f
#endif
#if X_PRECISION == 2
#define X_FLOAT double
#define X_F(x) x
#endif
#endif
#ifndef X_PRECISION
#define X_FLOAT CUDA_FLOAT
#define X_F(x) CUDA_F(x)
#define X_PRECISION CUDA_PRECISION
#endif
#if X_PRECISION == 2
struct X_FLOAT2
{
X_FLOAT x;
X_FLOAT y;
};
struct X_FLOAT3
{
X_FLOAT x;
X_FLOAT y;
X_FLOAT z;
};
struct X_FLOAT4
{
X_FLOAT x;
X_FLOAT y;
X_FLOAT z;
X_FLOAT w;
};
#else
#define X_FLOAT2 float2
#define X_FLOAT3 float3
#define X_FLOAT4 float4
#endif
//--------------------------------
//-----------velocities-----------
//--------------------------------
#ifdef V_PRECISION
#if V_PRECISION == 1
#define V_FLOAT float
#define V_F(x) x##f
#endif
#if V_PRECISION == 2
#define V_FLOAT double
#define V_F(x) x
#endif
#endif
#ifndef V_PRECISION
#define V_FLOAT CUDA_FLOAT
#define V_F(x) CUDA_F(x)
#define V_PRECISION CUDA_PRECISION
#endif
#if V_PRECISION == 2
struct V_FLOAT4
{
V_FLOAT x;
V_FLOAT y;
V_FLOAT z;
V_FLOAT w;
};
#else
#define V_FLOAT4 float4
#endif
#ifdef NO_PREC_TIMING
struct timespec_2
{
unsigned int tv_sec;
unsigned int tv_nsec;
};
#define timespec timespec_2
#define clock_gettime(a,b)
#endif
#endif /*CUDA_PRECISION_H_*/
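
// Editor's sketch (illustrative, not part of the original commit): kernel
// code is written once against these typedefs, so a single source compiles
// in either precision depending on F_PRECISION:
//
//   __device__ F_FLOAT lj_pair_force(F_FLOAT rsq, F_FLOAT lj1, F_FLOAT lj2)
//   {
//     F_FLOAT r2inv = F_F(1.0) / rsq;
//     F_FLOAT r6inv = r2inv * r2inv * r2inv;
//     return r6inv * (lj1 * r6inv - lj2) * r2inv;
//   }
//
// With F_PRECISION == 1 the literal F_F(1.0) expands to 1.0f and _SQRT_ maps
// to sqrtf, so no accidental double-precision promotion occurs on the GPU.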

378
src/USER-CUDA/cuda_shared.h Normal file
View File

@ -0,0 +1,378 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#ifndef _CUDA_SHARED_H_
#define _CUDA_SHARED_H_
#include "cuda_precision.h"
#define CUDA_MAX_DEBUG_SIZE 1000 // size of the debugdata array (holds that many doubles, or twice as many ints)
struct dev_array
{
void* dev_data; // pointer to memory address on cuda device
unsigned dim[3]; // array dimensions
};
struct cuda_shared_atom // relevant data from atom class
{
dev_array dx; // accumulated distance for binning settings
dev_array x; // position
dev_array v; // velocity
dev_array f; // force
dev_array tag; // global atom ID number
dev_array type; // atom type; a ghost type = ntypes exists (ntypescuda = ntypes+1)
dev_array mask;
dev_array image;
dev_array q; // charges
dev_array mass; // per-type masses
dev_array rmass; // per-atom masses
dev_array radius; // per-atom radius
dev_array density;
dev_array omega;
dev_array torque;
dev_array molecule;
dev_array special;
int maxspecial;
dev_array nspecial;
int* special_flag;
int molecular;
dev_array eatom; // per-atom energy
dev_array vatom; // per-atom virial
int need_eatom;
int need_vatom;
dev_array x_type; // position + type in X_FLOAT4 struct
dev_array v_radius; // velocity + radius in V_FLOAT4 struct, currently only used for granular atom_style
dev_array omega_rmass; // omega + rmass in V_FLOAT4 struct, currently only used for granular atom_style
double* mass_host; // host pointer to per-type masses
//int natoms; // total # of atoms in system, could be 0
int nghost; // # of ghost atoms on this proc
int nlocal; // # of owned atoms on this proc
int nall; // total # of owned + ghost atoms on this proc
int nmax; // max # of owned+ghost in arrays on this proc
int ntypes;
int q_flag; // do we have charges?
int rmass_flag; // do we have per-atom masses?
int firstgroup;
int nfirst;
int update_nlocal;
int update_nmax;
dev_array xhold; // position at last neighboring
X_FLOAT triggerneighsq; // maximum square movement before reneighboring
int reneigh_flag; // is reneighboring necessary
int maxhold; // size of xhold
int dist_check; //perform distance check for reneighboring
dev_array binned_id; //id of each binned atom (not tag!!)
dev_array binned_idnew; // new id of each binned atom for sorting; basically sets atom[binned_id[k]] to atom[binned_newid[k]]
float bin_extraspace;
int bin_dim[3];
int bin_nmax;
dev_array map_array;
};
struct cuda_shared_pair // relevant data from pair class
{
char cudable_force; // check for (cudable_force!=0)
X_FLOAT cut_global;
X_FLOAT cut_inner_global;
X_FLOAT cut_coul_global;
double** cut; // type-type cutoff
double** cutsq; // type-type cutoff
double** cut_inner; // type-type cutoff for coul
double** cut_coul; // type-type cutoff for coul
double** coeff1; // type-type pair parameters
double** coeff2;
double** coeff3;
double** coeff4;
double** coeff5;
double** coeff6;
double** coeff7;
double** coeff8;
double** coeff9;
double** coeff10;
double** offset;
double* special_lj;
double* special_coul;
dev_array virial; // ENERGY_FLOAT
dev_array eng_vdwl; // ENERGY_FLOAT
dev_array eng_coul; // ENERGY_FLOAT
X_FLOAT cut_coulsq_global;
F_FLOAT g_ewald,kappa;
int freeze_group_bit;
dev_array coeff1_gm;
dev_array coeff2_gm;
dev_array coeff3_gm;
dev_array coeff4_gm;
dev_array coeff5_gm;
dev_array coeff6_gm;
dev_array coeff7_gm;
dev_array coeff8_gm;
dev_array coeff9_gm;
dev_array coeff10_gm;
int lastgridsize;
int n_energy_virial;
int collect_forces_later;
int use_block_per_atom;
int override_block_per_atom;
};
struct cuda_shared_domain // relevant data from domain class
{
X_FLOAT sublo[3]; // orthogonal box -> sub-box bounds on this proc
X_FLOAT subhi[3];
X_FLOAT boxlo[3];
X_FLOAT boxhi[3];
X_FLOAT prd[3];
int periodicity[3]; // xyz periodicity as array
int triclinic;
X_FLOAT xy;
X_FLOAT xz;
X_FLOAT yz;
X_FLOAT boxlo_lamda[3];
X_FLOAT boxhi_lamda[3];
X_FLOAT prd_lamda[3];
X_FLOAT h[6];
X_FLOAT h_inv[6];
V_FLOAT h_rate[6];
int update;
};
struct cuda_shared_pppm
{
char cudable_force;
#ifdef FFT_CUFFT
FFT_FLOAT* work1;
FFT_FLOAT* work2;
FFT_FLOAT* work3;
PPPM_FLOAT* greensfn;
PPPM_FLOAT* fkx;
PPPM_FLOAT* fky;
PPPM_FLOAT* fkz;
PPPM_FLOAT* vg;
#endif
int* part2grid;
PPPM_FLOAT* density_brick;
int* density_brick_int;
PPPM_FLOAT density_intScale;
PPPM_FLOAT* vdx_brick;
PPPM_FLOAT* vdy_brick;
PPPM_FLOAT* vdz_brick;
PPPM_FLOAT* density_fft;
ENERGY_FLOAT* energy;
ENERGY_FLOAT* virial;
int nxlo_in;
int nxhi_in;
int nxlo_out;
int nxhi_out;
int nylo_in;
int nyhi_in;
int nylo_out;
int nyhi_out;
int nzlo_in;
int nzhi_in;
int nzlo_out;
int nzhi_out;
int nx_pppm;
int ny_pppm;
int nz_pppm;
PPPM_FLOAT qqrd2e;
int order;
// float3 sublo;
PPPM_FLOAT* rho_coeff;
int nmax;
int nlocal;
PPPM_FLOAT* debugdata;
PPPM_FLOAT delxinv;
PPPM_FLOAT delyinv;
PPPM_FLOAT delzinv;
int nlower;
int nupper;
PPPM_FLOAT shiftone;
};
struct cuda_shared_comm
{
int maxswap;
int maxlistlength;
dev_array pbc;
dev_array slablo;
dev_array slabhi;
dev_array multilo;
dev_array multihi;
dev_array sendlist;
int grow_flag;
int comm_phase;
int nsend;
int* nsend_swap;
int* send_size;
int* recv_size;
double** buf_send;
void** buf_send_dev;
double** buf_recv;
void** buf_recv_dev;
void* buffer;
int buffer_size;
double overlap_split_ratio;
};
struct cuda_shared_neighlist // member of CudaNeighList, has no instance in cuda_shared_data
{
int maxlocal;
int inum; // # of I atoms neighbors are stored for
int inum_border2;
dev_array inum_border; // # of atoms which interact with border atoms
dev_array ilist; // local indices of I atoms
dev_array ilist_border;
dev_array numneigh;
dev_array numneigh_inner;
dev_array numneigh_border;
dev_array firstneigh;
dev_array neighbors;
dev_array neighbors_border;
dev_array neighbors_inner;
int maxpage;
dev_array page_pointers;
dev_array* pages;
int maxneighbors;
int neigh_lists_per_page;
double** cutneighsq;
CUDA_FLOAT* cu_cutneighsq;
int* binned_id;
int* bin_dim;
int bin_nmax;
float bin_extraspace;
double maxcut;
dev_array ex_type;
int nex_type;
dev_array ex1_bit;
dev_array ex2_bit;
int nex_group;
dev_array ex_mol_bit;
int nex_mol;
};
struct cuda_compile_settings // this is used to compare compile settings (i.e. precision) of the cu files, and the cpp files
{
int prec_glob;
int prec_x;
int prec_v;
int prec_f;
int prec_pppm;
int prec_fft;
int cufft;
int arch;
};
struct cuda_timings_struct
{
//Debug:
double test1;
double test2;
//transfers
double transfer_upload_tmp_constr;
double transfer_download_tmp_deconstr;
//communication
double comm_forward_total;
double comm_forward_mpi_upper;
double comm_forward_mpi_lower;
double comm_forward_kernel_pack;
double comm_forward_kernel_unpack;
double comm_forward_kernel_self;
double comm_forward_upload;
double comm_forward_download;
double comm_exchange_total;
double comm_exchange_mpi;
double comm_exchange_kernel_pack;
double comm_exchange_kernel_unpack;
double comm_exchange_kernel_fill;
double comm_exchange_cpu_pack;
double comm_exchange_upload;
double comm_exchange_download;
double comm_border_total;
double comm_border_mpi;
double comm_border_kernel_pack;
double comm_border_kernel_unpack;
double comm_border_kernel_self;
double comm_border_kernel_buildlist;
double comm_border_upload;
double comm_border_download;
//pair forces
double pair_xtype_conversion;
double pair_kernel;
double pair_virial;
double pair_force_collection;
//neighbor
double neigh_bin;
double neigh_build;
double neigh_special;
//PPPM
double pppm_particle_map;
double pppm_make_rho;
double pppm_brick2fft;
double pppm_poisson;
double pppm_fillbrick;
double pppm_fieldforce;
double pppm_compute;
};
struct cuda_shared_data // holds space for all relevant data from the different classes
{
void* buffer; // holds temporary GPU data [used within subroutines; does not have to stay consistent outside of them]
int buffersize; // max size of buffer
int buffer_new; // should be 1 if the pointer to buffer has changed
void* flag;
void* debugdata; // array for easily collecting debug data from the device; class Cuda holds the corresponding cu_debugdata and the host array
cuda_shared_atom atom;
cuda_shared_pair pair;
cuda_shared_domain domain;
cuda_shared_pppm pppm;
cuda_shared_comm comm;
cuda_compile_settings compile_settings;
cuda_timings_struct cuda_timings;
int exchange_dim;
int me; //mpi rank
unsigned int datamask;
int overlap_comm;
};
#endif // #ifndef _CUDA_SHARED_H_
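
// Editor's sketch (illustrative, not part of the original commit): dev_array
// is the descriptor a cCudaData object fills in, so host classes can hand
// kernels a raw device pointer plus dimensions through cuda_shared_data:
//
//   // as in CudaNeighList::dev_alloc():
//   cu_ilist = new cCudaData<int, int, x>(neigh_list->ilist,
//                                         &sneighlist.ilist,
//                                         sneighlist.maxlocal);
//   // sneighlist.ilist.dev_data now points at the GPU copy and
//   // sneighlist.ilist.dim[0] == maxlocal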

270
src/USER-CUDA/domain_cuda.cpp Normal file
View File

@ -0,0 +1,270 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing author (triclinic) : Pieter in 't Veld (SNL)
------------------------------------------------------------------------- */
#include "mpi.h"
#include "stdlib.h"
#include "string.h"
#include "stdio.h"
#include "math.h"
#include "domain_cuda.h"
#include "style_region.h"
#include "atom.h"
#include "force.h"
#include "update.h"
#include "modify.h"
#include "fix.h"
#include "fix_deform.h"
#include "region.h"
#include "lattice.h"
#include "comm.h"
#include "memory.h"
#include "error.h"
#include "cuda.h"
#include "domain_cu.h"
using namespace LAMMPS_NS;
#define BIG 1.0e20
#define SMALL 1.0e-4
#define DELTA 1
#define MIN(a,b) ((a) < (b) ? (a) : (b))
#define MAX(a,b) ((a) > (b) ? (a) : (b))
enum{NO_REMAP,X_REMAP,V_REMAP}; // same as fix_deform.cpp
/* ----------------------------------------------------------------------
default is periodic
------------------------------------------------------------------------- */
DomainCuda::DomainCuda(LAMMPS *lmp) : Domain(lmp)
{
cuda = lmp->cuda;
}
/* ---------------------------------------------------------------------- */
void DomainCuda::init()
{
Domain::init();
if(not cuda->finished_run)
{
cuda->setDomainParams();
Cuda_Domain_Init(&cuda->shared_data);
}
}
/* ----------------------------------------------------------------------
set global box params
assumes boxlo/hi and triclinic tilts are already set
------------------------------------------------------------------------- */
void DomainCuda::set_global_box()
{
Domain::set_global_box();
if(not cuda->finished_run)
{
cuda->setDomainParams();
}
}
/* ----------------------------------------------------------------------
set lamda box params, only need be done one time
assumes global box is defined and proc assignment has been made by comm
for uppermost proc, ensure subhi = 1.0 (in case round-off occurs)
------------------------------------------------------------------------- */
void DomainCuda::set_lamda_box()
{
Domain::set_lamda_box();
if(not cuda->finished_run)
{
cuda->setDomainParams();
}
}
/* ----------------------------------------------------------------------
set local subbox params
assumes global box is defined and proc assignment has been made
for uppermost proc, ensure subhi = boxhi (in case round-off occurs)
------------------------------------------------------------------------- */
void DomainCuda::set_local_box()
{
Domain::set_local_box();
if(not cuda->finished_run)
{
// cuda->setDomainParams();
//Cuda_Domain_Init(&cuda->shared_data);
}
}
/* ----------------------------------------------------------------------
reset global & local boxes due to global box boundary changes
if shrink-wrapped, determine atom extent and reset boxlo/hi
if shrink-wrapped and triclinic, perform shrink-wrap in box coords
------------------------------------------------------------------------- */
void DomainCuda::reset_box()
{
if (nonperiodic == 2) {
// convert back to box coords for shrink-wrap operation
if (triclinic) lamda2x(atom->nlocal);
// compute extent of atoms on this proc
double extent[3][2],all[3][2];
extent[2][0] = extent[1][0] = extent[0][0] = BIG;
extent[2][1] = extent[1][1] = extent[0][1] = -BIG;
double **x = atom->x;
int nlocal = atom->nlocal;
if (cuda->finished_setup&&(!cuda->oncpu))
{
extent[0][0]=cuda->extent[0];
extent[0][1]=cuda->extent[1];
extent[1][0]=cuda->extent[2];
extent[1][1]=cuda->extent[3];
extent[2][0]=cuda->extent[4];
extent[2][1]=cuda->extent[5];
}
else
for (int i = 0; i < nlocal; i++) {
extent[0][0] = MIN(extent[0][0],x[i][0]);
extent[0][1] = MAX(extent[0][1],x[i][0]);
extent[1][0] = MIN(extent[1][0],x[i][1]);
extent[1][1] = MAX(extent[1][1],x[i][1]);
extent[2][0] = MIN(extent[2][0],x[i][2]);
extent[2][1] = MAX(extent[2][1],x[i][2]);
}
// compute extent across all procs
// flip sign of MIN to do it in one Allreduce MAX
extent[0][0] = -extent[0][0];
extent[1][0] = -extent[1][0];
extent[2][0] = -extent[2][0];
MPI_Allreduce(extent,all,6,MPI_DOUBLE,MPI_MAX,world);
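  // (editor's note: this works because min_i a_i == -max_i(-a_i), so a single
  // MPI_MAX reduction yields both extrema; the minima are read back below
  // as -all[d][0])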
// in shrink-wrapped dims, set box by atom extent
// if minimum set, enforce min box size settings
if (xperiodic == 0) {
if (boundary[0][0] == 2) boxlo[0] = -all[0][0] - SMALL;
else if (boundary[0][0] == 3) boxlo[0] = MIN(-all[0][0]-SMALL,minxlo);
if (boundary[0][1] == 2) boxhi[0] = all[0][1] + SMALL;
else if (boundary[0][1] == 3) boxhi[0] = MAX(all[0][1]+SMALL,minxhi);
if (boxlo[0] > boxhi[0]) error->all("Illegal simulation box");
}
if (yperiodic == 0) {
if (boundary[1][0] == 2) boxlo[1] = -all[1][0] - SMALL;
else if (boundary[1][0] == 3) boxlo[1] = MIN(-all[1][0]-SMALL,minylo);
if (boundary[1][1] == 2) boxhi[1] = all[1][1] + SMALL;
else if (boundary[1][1] == 3) boxhi[1] = MAX(all[1][1]+SMALL,minyhi);
if (boxlo[1] > boxhi[1]) error->all("Illegal simulation box");
}
if (zperiodic == 0) {
if (boundary[2][0] == 2) boxlo[2] = -all[2][0] - SMALL;
else if (boundary[2][0] == 3) boxlo[2] = MIN(-all[2][0]-SMALL,minzlo);
if (boundary[2][1] == 2) boxhi[2] = all[2][1] + SMALL;
else if (boundary[2][1] == 3) boxhi[2] = MAX(all[2][1]+SMALL,minzhi);
if (boxlo[2] > boxhi[2]) error->all("Illegal simulation box");
}
}
set_global_box();
set_local_box();
if(not cuda->finished_run)
{
cuda->setDomainParams();
Cuda_Domain_Init(&cuda->shared_data);
}
// if shrink-wrapped, convert to lamda coords for new box
// must re-invoke pbc() b/c x2lamda result can be outside 0,1 due to roundoff
if (nonperiodic == 2 && triclinic) {
x2lamda(atom->nlocal);
pbc();
}
}
/* ----------------------------------------------------------------------
enforce PBC and modify box image flags for each atom
called every reneighboring and by other commands that change atoms
resulting coord must satisfy lo <= coord < hi
MAX is important since coord - prd < lo can happen when coord = hi
if fix deform, remap velocity of fix group atoms by box edge velocities
for triclinic, atoms must be in lamda coords (0-1) before pbc is called
image = 10 bits for each dimension
increment/decrement in wrap-around fashion
------------------------------------------------------------------------- */
void DomainCuda::pbc()
{
if(cuda->finished_setup&&(!cuda->oncpu))
{
cuda->setDomainParams();
Cuda_Domain_PBC(&cuda->shared_data, deform_vremap, deform_groupbit,cuda->extent);
return;
}
Domain::pbc();
}
/* ----------------------------------------------------------------------
convert triclinic 0-1 lamda coords to box coords for all N atoms
x = H lamda + x0;
------------------------------------------------------------------------- */
void DomainCuda::lamda2x(int n)
{
if(cuda->finished_setup&&(!cuda->oncpu))
{
Cuda_Domain_lamda2x(&cuda->shared_data,n);
return;
}
Domain::lamda2x(n);
}
/* ----------------------------------------------------------------------
convert box coords to triclinic 0-1 lamda coords for all N atoms
lamda = H^-1 (x - x0)
------------------------------------------------------------------------- */
void DomainCuda::x2lamda(int n)
{
if(cuda->finished_setup&&(!cuda->oncpu))
{
Cuda_Domain_x2lamda(&cuda->shared_data,n);
return;
}
Domain::x2lamda(n);
}
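
/* Editor's note (illustrative, not part of the original commit): every
   override in this class follows the same pattern -- if the device holds the
   current data (finished_setup && !oncpu), run the Cuda_Domain_* kernel and
   return; otherwise fall through to the base-class CPU implementation. */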

41
src/USER-CUDA/domain_cuda.h Normal file
View File

@ -0,0 +1,41 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
#ifndef LMP_DOMAIN_CUDA_H
#define LMP_DOMAIN_CUDA_H
#include "pointers.h"
#include "domain.h"
namespace LAMMPS_NS {
class DomainCuda : public Domain {
public:
DomainCuda(class LAMMPS *);
void init();
void set_global_box();
void set_lamda_box();
void set_local_box();
void reset_box();
void pbc();
void lamda2x(int);
void x2lamda(int);
protected:
class Cuda *cuda;
};
}
#endif

442
src/USER-CUDA/modify_cuda.cpp Normal file
View File

@ -0,0 +1,442 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
#include <cstdio>
#include <cstring>
#include "modify_cuda.h"
#include "style_compute.h"
#include "style_fix.h"
#include "atom.h"
#include "comm.h"
#include "fix.h"
#include "compute.h"
#include "group.h"
#include "update.h"
#include "domain.h"
#include "cuda.h"
#include "memory.h"
#include "error.h"
using namespace LAMMPS_NS;
#define DELTA 4
// mask settings - same as in fix.cpp
#define INITIAL_INTEGRATE 1
#define POST_INTEGRATE 2
#define PRE_EXCHANGE 4
#define PRE_NEIGHBOR 8
#define PRE_FORCE 16
#define POST_FORCE 32
#define FINAL_INTEGRATE 64
#define END_OF_STEP 128
#define THERMO_ENERGY 256
#define INITIAL_INTEGRATE_RESPA 512
#define POST_INTEGRATE_RESPA 1024
#define PRE_FORCE_RESPA 2048
#define POST_FORCE_RESPA 4096
#define FINAL_INTEGRATE_RESPA 8192
#define MIN_PRE_EXCHANGE 16384
#define MIN_POST_FORCE 32768
#define MIN_ENERGY 65536
#include "cuda_modify_flags.h"
#define MIN(A,B) (((A) < (B)) ? (A) : (B))
#define MAX(A,B) (((A) > (B)) ? (A) : (B))
#define BIG 1.0e20
/* ---------------------------------------------------------------------- */
ModifyCuda::ModifyCuda(LAMMPS *lmp) : Modify(lmp)
{
cuda = lmp->cuda;
n_initial_integrate_cuda = 0;
n_post_integrate_cuda = 0;
n_pre_exchange_cuda = 0;
n_pre_neighbor_cuda = 0;
n_pre_force_cuda = 0;
n_post_force_cuda = 0;
n_final_integrate_cuda = 0;
n_end_of_step_cuda = 0;
n_thermo_energy_cuda = 0;
n_initial_integrate_host = 0;
n_post_integrate_host = 0;
n_pre_exchange_host = 0;
n_pre_neighbor_host = 0;
n_pre_force_host = 0;
n_post_force_host = 0;
n_final_integrate_host = 0;
n_end_of_step_host = 0;
n_thermo_energy_host = 0;
list_initial_integrate_cuda = NULL;
list_post_integrate_cuda = NULL;
list_pre_exchange_cuda = NULL;
list_pre_neighbor_cuda = NULL;
list_pre_force_cuda = NULL;
list_post_force_cuda = NULL;
list_final_integrate_cuda = NULL;
list_end_of_step_cuda = NULL;
list_thermo_energy_cuda = NULL;
end_of_step_every_cuda = NULL;
}
/* ---------------------------------------------------------------------- */
ModifyCuda::~ModifyCuda()
{
delete [] list_initial_integrate_cuda;
delete [] list_post_integrate_cuda;
delete [] list_pre_exchange_cuda;
delete [] list_pre_neighbor_cuda;
delete [] list_pre_force_cuda;
delete [] list_post_force_cuda;
delete [] list_final_integrate_cuda;
delete [] list_end_of_step_cuda;
delete [] list_thermo_energy_cuda;
delete [] end_of_step_every_cuda;
}
/* ----------------------------------------------------------------------
initialize all fixes and computes
------------------------------------------------------------------------- */
void ModifyCuda::init()
{
int i,j;
// delete storage of restart info since it is not valid after 1st run
restart_deallocate();
// create lists of fixes to call at each stage of run
list_init(INITIAL_INTEGRATE,n_initial_integrate,list_initial_integrate);
list_init(POST_INTEGRATE,n_post_integrate,list_post_integrate);
list_init(PRE_EXCHANGE,n_pre_exchange,list_pre_exchange);
list_init(PRE_NEIGHBOR,n_pre_neighbor,list_pre_neighbor);
list_init(PRE_FORCE,n_pre_force,list_pre_force);
list_init(POST_FORCE,n_post_force,list_post_force);
list_init(FINAL_INTEGRATE,n_final_integrate,list_final_integrate);
list_init_end_of_step(END_OF_STEP,n_end_of_step,list_end_of_step);
list_init_thermo_energy(THERMO_ENERGY,n_thermo_energy,list_thermo_energy);
list_init(INITIAL_INTEGRATE_CUDA, n_initial_integrate_cuda, list_initial_integrate_cuda);
list_init(POST_INTEGRATE_CUDA, n_post_integrate_cuda, list_post_integrate_cuda);
list_init(PRE_EXCHANGE_CUDA, n_pre_exchange_cuda, list_pre_exchange_cuda);
list_init(PRE_NEIGHBOR_CUDA, n_pre_neighbor_cuda, list_pre_neighbor_cuda);
list_init(PRE_FORCE_CUDA, n_pre_force_cuda, list_pre_force_cuda);
list_init(POST_FORCE_CUDA, n_post_force_cuda, list_post_force_cuda);
list_init(FINAL_INTEGRATE_CUDA, n_final_integrate_cuda, list_final_integrate_cuda);
list_init_end_of_step_cuda(END_OF_STEP_CUDA, n_end_of_step_cuda, list_end_of_step_cuda);
list_init_thermo_energy(THERMO_ENERGY_CUDA, n_thermo_energy_cuda, list_thermo_energy_cuda);
n_initial_integrate_host = n_initial_integrate;
n_post_integrate_host = n_post_integrate;
n_pre_exchange_host = n_pre_exchange;
n_pre_neighbor_host = n_pre_neighbor;
n_pre_force_host = n_pre_force;
n_post_force_host = n_post_force;
n_final_integrate_host = n_final_integrate;
n_end_of_step_host = n_end_of_step;
n_thermo_energy_host = n_thermo_energy;
n_initial_integrate = n_initial_integrate_cuda+n_initial_integrate_host;
n_post_integrate = n_post_integrate_cuda+n_post_integrate_host;
n_pre_exchange = n_pre_exchange_cuda+n_pre_exchange_host;
n_pre_neighbor = n_pre_neighbor_cuda+n_pre_neighbor_host;
n_pre_force = n_pre_force_cuda+n_pre_force_host;
n_post_force = n_post_force_cuda+n_post_force_host;
n_final_integrate = n_final_integrate_cuda+n_final_integrate_host;
n_end_of_step = n_end_of_step_cuda+n_end_of_step_host;
n_thermo_energy = n_thermo_energy_cuda+n_thermo_energy_host;
list_init(INITIAL_INTEGRATE_RESPA,
n_initial_integrate_respa,list_initial_integrate_respa);
list_init(POST_INTEGRATE_RESPA,
n_post_integrate_respa,list_post_integrate_respa);
list_init(POST_FORCE_RESPA,
n_post_force_respa,list_post_force_respa);
list_init(PRE_FORCE_RESPA,
n_pre_force_respa,list_pre_force_respa);
list_init(FINAL_INTEGRATE_RESPA,
n_final_integrate_respa,list_final_integrate_respa);
list_init(MIN_PRE_EXCHANGE,n_min_pre_exchange,list_min_pre_exchange);
list_init(MIN_POST_FORCE,n_min_post_force,list_min_post_force);
list_init(MIN_ENERGY,n_min_energy,list_min_energy);
// init each fix
// needs to come before compute init
// this is b/c some computes call fix->dof()
// FixRigid::dof() depends on its own init having been called
comm->maxforward_fix = comm->maxreverse_fix = 0;
for (i = 0; i < nfix; i++) fix[i]->init();
// set global flag if any fix has its restart_pbc flag set
restart_pbc_any = 0;
for (i = 0; i < nfix; i++)
if (fix[i]->restart_pbc) restart_pbc_any = 1;
// create list of computes that store invocation times
list_init_compute();
// init each compute
// set invoked_scalar,vector,etc to -1 to force new run to re-compute them
// add initial timestep to all computes that store invocation times
// since any of them may be invoked by initial thermo
// do not clear out invocation times stored within a compute,
// b/c some may be holdovers from previous run, like for ave fixes
for (i = 0; i < ncompute; i++) {
compute[i]->init();
compute[i]->invoked_scalar = -1;
compute[i]->invoked_vector = -1;
compute[i]->invoked_array = -1;
compute[i]->invoked_peratom = -1;
compute[i]->invoked_local = -1;
}
addstep_compute_all(update->ntimestep);
// warn if any particle is time integrated more than once
int nlocal = atom->nlocal;
int *mask = atom->mask;
int *flag = new int[nlocal];
for (i = 0; i < nlocal; i++) flag[i] = 0;
int groupbit;
for (i = 0; i < nfix; i++) {
if (fix[i]->time_integrate == 0) continue;
groupbit = fix[i]->groupbit;
for (j = 0; j < nlocal; j++)
if (mask[j] & groupbit) flag[j]++;
}
int check = 0;
for (i = 0; i < nlocal; i++)
if (flag[i] > 1) check = 1;
delete [] flag;
int checkall;
MPI_Allreduce(&check,&checkall,1,MPI_INT,MPI_SUM,world);
if (comm->me == 0 && checkall)
error->warning("One or more atoms are time integrated more than once");
}
/* ----------------------------------------------------------------------
1st half of integrate call, only for relevant fixes
------------------------------------------------------------------------- */
void ModifyCuda::initial_integrate(int vflag)
{
for(int i = 0; i < n_initial_integrate_cuda; i++)
fix[list_initial_integrate_cuda[i]]->initial_integrate(vflag);
if(n_initial_integrate_host != 0)
{
cuda->downloadAll(); cuda->oncpu = true;
for (int i = 0; i < n_initial_integrate_host; i++)
fix[list_initial_integrate[i]]->initial_integrate(vflag);
cuda->uploadAll(); cuda->oncpu = false;
}
}
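/* Editor's note (illustrative, not part of the original commit): each hook
   below repeats this split -- the *_cuda fixes run directly on device data,
   and the full downloadAll()/uploadAll() round trip is paid only when
   host-side fixes are present:

     run device fixes;
     if (n_<hook>_host) { downloadAll; run host fixes; uploadAll; }
*/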
/* ----------------------------------------------------------------------
post_integrate call, only for relevant fixes
------------------------------------------------------------------------- */
void ModifyCuda::post_integrate()
{
for(int i = 0; i < n_post_integrate_cuda; i++)
fix[list_post_integrate_cuda[i]]->post_integrate();
if(n_post_integrate_host != 0)
{
cuda->downloadAll(); cuda->oncpu = true;
for (int i = 0; i < n_post_integrate_host; i++)
fix[list_post_integrate[i]]->post_integrate();
cuda->uploadAll(); cuda->oncpu = false;
}
}
/* ----------------------------------------------------------------------
pre_exchange call, only for relevant fixes
------------------------------------------------------------------------- */
void ModifyCuda::pre_exchange()
{
for(int i = 0; i < n_pre_exchange_cuda; i++)
fix[list_pre_exchange_cuda[i]]->pre_exchange();
if(n_pre_exchange_host != 0)
{
cuda->downloadAll(); cuda->oncpu = true;
for (int i = 0; i < n_pre_exchange_host; i++)
fix[list_pre_exchange[i]]->pre_exchange();
cuda->uploadAll(); cuda->oncpu = false;
}
}
/* ----------------------------------------------------------------------
pre_neighbor call, only for relevant fixes
------------------------------------------------------------------------- */
void ModifyCuda::pre_neighbor()
{
for(int i = 0; i < n_pre_neighbor_cuda; i++)
fix[list_pre_neighbor_cuda[i]]->pre_neighbor();
if(n_pre_neighbor_host != 0)
{
cuda->downloadAll(); cuda->oncpu = true;
for (int i = 0; i < n_pre_neighbor_host; i++)
fix[list_pre_neighbor[i]]->pre_neighbor();
cuda->uploadAll(); cuda->oncpu = false;
}
}
/* ----------------------------------------------------------------------
pre_force call, only for relevant fixes
------------------------------------------------------------------------- */
void ModifyCuda::pre_force(int vflag)
{
for(int i = 0; i < n_pre_force_cuda; i++)
fix[list_pre_force_cuda[i]]->pre_force(vflag);
if(n_pre_force_host != 0)
{
cuda->downloadAll(); cuda->oncpu = true;
for (int i = 0; i < n_pre_force_host; i++)
fix[list_pre_force[i]]->pre_force(vflag);
cuda->uploadAll(); cuda->oncpu = false;
}
}
/* ----------------------------------------------------------------------
post_force call, only for relevant fixes
------------------------------------------------------------------------- */
void ModifyCuda::post_force(int vflag)
{
for(int i = 0; i < n_post_force_cuda; i++)
fix[list_post_force_cuda[i]]->post_force(vflag);
if(n_post_force_host != 0)
{
cuda->downloadAll(); cuda->oncpu = true;
for (int i = 0; i < n_post_force_host; i++)
fix[list_post_force[i]]->post_force(vflag);
cuda->uploadAll(); cuda->oncpu = false;
}
}
/* ----------------------------------------------------------------------
2nd half of integrate call, only for relevant fixes
------------------------------------------------------------------------- */
void ModifyCuda::final_integrate()
{
for (int i = 0; i < n_final_integrate_cuda; i++)
fix[list_final_integrate_cuda[i]]->final_integrate();
if(n_final_integrate_host != 0)
{
cuda->downloadAll(); cuda->oncpu = true;
for (int i = 0; i < n_final_integrate_host; i++)
fix[list_final_integrate[i]]->final_integrate();
cuda->uploadAll(); cuda->oncpu = false;
}
}
/* ----------------------------------------------------------------------
end-of-timestep call, only for relevant fixes
only call fix->end_of_step() on timesteps that are multiples of nevery
------------------------------------------------------------------------- */
void ModifyCuda::end_of_step()
{
for (int i = 0; i < n_end_of_step_cuda; i++)
if (update->ntimestep % end_of_step_every_cuda[i] == 0)
fix[list_end_of_step_cuda[i]]->end_of_step();
if(n_end_of_step_host != 0)
{
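// check nevery values first so the costly download/upload pair happens
// only on steps where at least one host fix actually runs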
int do_thisstep=0;
for (int i = 0; i < n_end_of_step_host; i++)
if (update->ntimestep % end_of_step_every[i] == 0) do_thisstep=1;
if(do_thisstep)
{
cuda->downloadAll(); cuda->oncpu = true;
for (int i = 0; i < n_end_of_step_host; i++)
if (update->ntimestep % end_of_step_every[i] == 0)
fix[list_end_of_step[i]]->end_of_step();
cuda->uploadAll(); cuda->oncpu = false;
}
}
}
/* ----------------------------------------------------------------------
thermo energy call, only for relevant fixes
called by Thermo class
compute_scalar() is fix call to return energy
------------------------------------------------------------------------- */
double ModifyCuda::thermo_energy()
{
double energy = 0.0;
for (int i = 0; i < n_thermo_energy_cuda; i++)
energy += fix[list_thermo_energy_cuda[i]]->compute_scalar();
if(n_thermo_energy_host != 0)
{
cuda->downloadAll(); cuda->oncpu = true;
for (int i = 0; i < n_thermo_energy_host; i++)
energy += fix[list_thermo_energy[i]]->compute_scalar();
cuda->uploadAll(); cuda->oncpu = false;
}
return energy;
}
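/* ----------------------------------------------------------------------
   create list of end_of_step fixes with CUDA support
   also store each fix's nevery setting for use by end_of_step()
------------------------------------------------------------------------- */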
void ModifyCuda::list_init_end_of_step_cuda(int mask, int &n, int *&list)
{
delete [] list;
delete [] end_of_step_every_cuda;
n = 0;
for (int i = 0; i < nfix; i++) if (fmask[i] & mask) n++;
list = new int[n];
end_of_step_every_cuda = new int[n];
n = 0;
for (int i = 0; i < nfix; i++)
if (fmask[i] & mask) {
list[n] = i;
end_of_step_every_cuda[n++] = fix[i]->nevery;
}
}

82
src/USER-CUDA/modify_cuda.h Normal file
View File

@ -0,0 +1,82 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
#ifndef LMP_MODIFY_CUDA_H
#define LMP_MODIFY_CUDA_H
#include <cstdio>
#include "modify.h"
namespace LAMMPS_NS {
class ModifyCuda : public Modify {
public:
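// per-stage counts: fixes with device (CUDA) implementations vs. host-only fixes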
int n_initial_integrate_cuda;
int n_post_integrate_cuda;
int n_pre_exchange_cuda;
int n_pre_neighbor_cuda;
int n_pre_force_cuda;
int n_post_force_cuda;
int n_final_integrate_cuda;
int n_end_of_step_cuda;
int n_thermo_energy_cuda;
int n_initial_integrate_host;
int n_post_integrate_host;
int n_pre_exchange_host;
int n_pre_neighbor_host;
int n_pre_force_host;
int n_post_force_host;
int n_final_integrate_host;
int n_end_of_step_host;
int n_thermo_energy_host;
ModifyCuda(class LAMMPS *);
~ModifyCuda();
void init();
void initial_integrate(int);
void post_integrate();
//void pre_decide();
void pre_exchange();
void pre_neighbor();
void pre_force(int);
void post_force(int);
void final_integrate();
void end_of_step();
double thermo_energy();
protected:
class Cuda *cuda;
// lists of fixes to apply at different stages of timestep
// list of cuda fixes
int *list_initial_integrate_cuda;
int *list_post_integrate_cuda;
int *list_pre_exchange_cuda;
int *list_pre_neighbor_cuda;
int *list_pre_force_cuda;
int *list_post_force_cuda;
int *list_final_integrate_cuda;
int *list_end_of_step_cuda;
int *list_thermo_energy_cuda;
int *end_of_step_every_cuda;
void list_init_end_of_step_cuda(int, int &, int *&);
};
}
#endif

317
src/USER-CUDA/neigh_full_cuda.cpp Normal file
View File

@ -0,0 +1,317 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#ifdef CUDA
#include "neighbor_cuda.h"
#include "neigh_list.h"
#include "atom.h"
#include "domain.h"
#include "group.h"
#include "error.h"
#include "cuda_neigh_list.h"
#include "cuda.h"
#include "neighbor_cu.h"
#include <cmath>
using namespace LAMMPS_NS;
/* ----------------------------------------------------------------------
   binned neighbor list construction on the GPU, for all neighbors
   every neighbor pair appears in list of both atoms i and j
------------------------------------------------------------------------- */
void NeighborCuda::full_bin_cuda(NeighList *list)
{
MYDBG(printf(" # CUDA::NeighFullBinCuda ... start\n");)
if(includegroup) error->warning("Using includegroup with a neighbor build is not yet supported by the CUDA neighbor build styles.\n");
int nlocal = atom->nlocal;
int nall = nlocal + atom->nghost;
if(nlocal==0) return;
if(not list->cuda_list) cuda->registerNeighborList(list);
CudaNeighList* clist=list->cuda_list;
cuda_shared_neighlist* slist=&clist->sneighlist;
clist->build_cuda=true;
if(slist->bin_extraspace<0.09)
{
for(int i=1;i<=atom->ntypes;i++)
for(int j=1;j<=atom->ntypes;j++)
{
if(slist->maxcut<cutneighsq[i][j]) slist->maxcut=cutneighsq[i][j];
}
slist->maxcut=sqrt(slist->maxcut);
}
int bin_dim_tmp[3];
int bin_nmax_tmp;
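// pick bin dimensions from the subdomain extent and the largest cutoff;
// if the average atom count per bin drops below 32, grow the effective
// cutoff by 20% and re-bin so each bin offers enough parallel work;
// the +4 per dimension presumably leaves room for ghost-atom bins;
// redo everything whenever Cuda_BinAtoms reports a bin overflow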
do
{
do
{
bin_dim_tmp[0]=static_cast <int> ((domain->subhi[0]-domain->sublo[0])/slist->maxcut);
bin_dim_tmp[1]=static_cast <int> ((domain->subhi[1]-domain->sublo[1])/slist->maxcut);
bin_dim_tmp[2]=static_cast <int> ((domain->subhi[2]-domain->sublo[2])/slist->maxcut);
if(bin_dim_tmp[0]==0) bin_dim_tmp[0]+=1;
if(bin_dim_tmp[1]==0) bin_dim_tmp[1]+=1;
if(bin_dim_tmp[2]==0) bin_dim_tmp[2]+=1;
bin_nmax_tmp=static_cast <int> ((1.0+slist->bin_extraspace)*nlocal/(bin_dim_tmp[0]*bin_dim_tmp[1]*bin_dim_tmp[2]));
bin_dim_tmp[0]+=4;
bin_dim_tmp[1]+=4;
bin_dim_tmp[2]+=4;
if(bin_nmax_tmp<32) slist->maxcut*=1.2;
// printf("slist->maxcut: %lf\n", slist->maxcut);
} while(bin_nmax_tmp<32);
if((slist->bin_dim[0]!=bin_dim_tmp[0])||(slist->bin_dim[1]!=bin_dim_tmp[1])||(slist->bin_dim[2]!=bin_dim_tmp[2])||(slist->bin_nmax!=bin_nmax_tmp))
{
if(slist->binned_id!=NULL)
CudaWrapper_FreeCudaData(slist->binned_id,slist->bin_dim[0]*slist->bin_dim[1]*slist->bin_dim[2]*slist->bin_nmax*sizeof(int));
slist->bin_dim[0] = bin_dim_tmp[0];
slist->bin_dim[1] = bin_dim_tmp[1];
slist->bin_dim[2] = bin_dim_tmp[2];
slist->bin_nmax = bin_nmax_tmp;
slist->binned_id=(int*) CudaWrapper_AllocCudaData(slist->bin_dim[0]*slist->bin_dim[1]*slist->bin_dim[2]*slist->bin_nmax*sizeof(int));
//printf("slist->bin: %i %i %i %i \n", bin_dim_tmp[0],bin_dim_tmp[1],bin_dim_tmp[2],bin_nmax_tmp);
}
//if(list->cuda_list->sneighlist.bin_nmax>512) error->all("Too many atoms per bin. Likely cause is a very long pair cutoff. This needs a major rewrite of the code and is not yet scheduled to be done.\n");
}while(Cuda_BinAtoms(&cuda->shared_data, &list->cuda_list->sneighlist));
int maxneighbors=slist->maxneighbors;
int *ilist = list->ilist;
int *numneigh = list->numneigh;
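// mirror the neighbor exclusion settings (by type, group, or molecule) into
// device buffers, but only when they changed since the previous build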
if((nex_type!=slist->nex_type)||
(nex_group!=slist->nex_group)||
(nex_mol!=slist->nex_mol))
{
slist->nex_type=nex_type;
slist->nex_group=nex_group;
slist->nex_mol=nex_mol;
//printf("%i %i %i\n",nex_type,nex_group,nex_mol);
if(nex_type)
{
delete clist->cu_ex_type;
clist->cu_ex_type=new cCudaData<int , int , x> (&ex_type[0][0] , & slist->ex_type , (atom->ntypes+1)*(atom->ntypes+1) );
clist->cu_ex_type->upload();
}
//printf("AA %i %i %i\n",nex_type,nex_group,nex_mol);
if(nex_group)
{
delete clist->cu_ex1_bit;
clist->cu_ex1_bit=new cCudaData<int , int , x> (ex1_bit , & slist->ex1_bit , nex_group );
clist->cu_ex1_bit->upload();
//printf("A %i %i %i\n",nex_type,nex_group,nex_mol);
delete clist->cu_ex2_bit;
clist->cu_ex2_bit=new cCudaData<int , int , x> (ex2_bit , & slist->ex2_bit , nex_group );
clist->cu_ex2_bit->upload();
}
//printf("B %i %i %i\n",nex_type,nex_group,nex_mol);
if(nex_mol)
{
delete clist->cu_ex_mol_bit;
clist->cu_ex_mol_bit=new cCudaData<int , int , x> (ex_mol_bit , & slist->ex_mol_bit , nex_mol );
clist->cu_ex_mol_bit->upload();
}
//printf("C %i %i %i\n",nex_type,nex_group,nex_mol);
}
int overflow = 0;
int inum = 0;
int npnt = 0;
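// device-side build with retry: a negative return from the build kernel
// signals overflow and encodes the per-atom neighbor capacity that was
// needed, so maxneighbors is grown in multiples of 32 and the device
// arrays are reallocated before trying again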
do
{
npnt=0;
inum=0;
overflow=0;
clist->grow_device();
slist->cutneighsq=cutneighsq;
slist->maxneighbors=maxneighbors;
slist->inum = list->inum = nlocal;
if(cuda->shared_data.overlap_comm)
{
list->cuda_list->inum_border=0;
list->cuda_list->cu_inum_border->upload();
}
cuda->shared_data.atom.nall=nall;
overflow= Cuda_NeighborBuildFullBin(&cuda->shared_data, &list->cuda_list->sneighlist);
if(overflow<0)
{
maxneighbors+=32;
if(-overflow>maxneighbors) maxneighbors=((-overflow+37)/32)*32;
delete list->cuda_list->cu_neighbors;
delete [] list->cuda_list->neighbors;
list->cuda_list->neighbors= new int[slist->maxlocal*maxneighbors];
list->cuda_list->sneighlist.maxneighbors=maxneighbors;
//printf("maxneighborsA1: %i %i %i %i %i\n",maxneighbors,pgsize,oneatom,atom->nmax,slist->maxlocal);
list->cuda_list->cu_neighbors= new cCudaData<int, int, x> (list->cuda_list->neighbors , & list->cuda_list->sneighlist.neighbors, slist->maxlocal*maxneighbors );
//printf("maxneighborsA2: %i %i %i %i\n",maxneighbors,pgsize,oneatom,atom->nmax);
if(cuda->shared_data.overlap_comm)
{
list->cuda_list->sneighlist.maxneighbors=maxneighbors;
list->cuda_list->dev_free();
list->cuda_list->dev_alloc();
}
//printf("maxneighborsA3: %i %i %i %i\n",maxneighbors,pgsize,oneatom,atom->nmax);
}
//printf("maxneighborsB: %i %i %i %i\n",maxneighbors,pgsize,oneatom,atom->nmax);
if(cuda->shared_data.overlap_comm)
{
list->cuda_list->cu_inum_border->download();
list->cuda_list->sneighlist.inum_border2=list->cuda_list->inum_border;
}
}
while(overflow<0);
list->cuda_list->cu_numneigh->download();
list->cuda_list->cu_ilist->download();
//printf("Done\n");
MYDBG(printf(" # CUDA::NeighFullBinCuda ... end\n");)
}
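/* ----------------------------------------------------------------------
   N^2 search for all neighbors: no longer supported on the GPU
------------------------------------------------------------------------- */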
void NeighborCuda::full_nsq_cuda(NeighList *list)
{
printf("Full_Nsq cuda neighbor list build is not implemented anymore.\n");
return;
/*
MYDBG(printf(" # CUDA::NeighFullNSQCuda ... start\n");)
int nlocal = atom->nlocal;
int nall = nlocal + atom->nghost;
if(cuda->cu_xhold) cuda->cu_xhold->upload();
if(not list->cuda_list) cuda->registerNeighborList(list);
list->cuda_list->build_cuda=true;
int maxneighbors=list->cuda_list->sneighlist.maxneighbors;
int neigh_lists_per_page=pgsize/maxneighbors;
int *ilist = list->ilist;
int *numneigh = list->numneigh;
int **firstneigh = list->firstneigh;
int **pages = list->pages;
int overflow = 0;
int inum = 0;
int npage = 0;
int npnt = 0;
do
{
npage=0;
npnt=0;
inum=0;
overflow=0;
neigh_lists_per_page=pgsize/maxneighbors;
npage=(2*nlocal*maxneighbors-1)/pgsize;
while(npage>list->maxpage) list->add_pages();
pages = list->pages;
npage=0;
list->cuda_list->sneighlist.neigh_lists_per_page=pgsize/maxneighbors;
list->cuda_list->grow_device();
list->cuda_list->sneighlist.cutneighsq=cutneighsq;
list->cuda_list->sneighlist.maxneighbors=maxneighbors;
list->cuda_list->sneighlist.inum = list->inum = nlocal;
cuda->shared_data.atom.nall=nall;
Cuda_NeighborReBuildFirstneigh(&cuda->shared_data, &list->cuda_list->sneighlist);
overflow= not Cuda_NeighborBuildFullNsq(&cuda->shared_data, &list->cuda_list->sneighlist);
if(overflow) maxneighbors+=32;
}
while(overflow);
if(not cudable) list->cuda_list->nl_download();
MYDBG(printf(" # CUDA::NeighFullNSQCuda ... end\n");)
*/
}
#endif

221
src/USER-CUDA/neighbor_cuda.cpp Normal file
View File

@ -0,0 +1,221 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
#include "math.h"
#include "neighbor_cuda.h"
#include "cuda.h"
#include "atom.h"
#include "atom_vec.h"
#include "domain.h"
#include "neigh_list.h"
#include "neigh_request.h"
#include "force.h"
#include "group.h"
#include "memory.h"
#include "error.h"
using namespace LAMMPS_NS;
#define MIN(a,b) ((a) < (b) ? (a) : (b))
#define MAX(a,b) ((a) > (b) ? (a) : (b))
enum{NSQ,BIN,MULTI}; // also in neigh_list.cpp
/* ---------------------------------------------------------------------- */
NeighborCuda::NeighborCuda(LAMMPS *lmp) : Neighbor(lmp)
{
cuda = lmp->cuda;
}
/* ---------------------------------------------------------------------- */
void NeighborCuda::init()
{
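// hand the device the rebuild trigger: (skin/2)^2, the same half-skin
// displacement criterion used by the host-side check_distance()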
cuda->set_neighinit(dist_check,0.25*skin*skin);
cudable = 1;
Neighbor::init();
}
/* ----------------------------------------------------------------------
overwrite either full_nsq or full_bin with CUDA-equivalent methods
any other neighbor build method is unchanged
------------------------------------------------------------------------- */
void NeighborCuda::choose_build(int index, NeighRequest *rq)
{
Neighbor::choose_build(index,rq);
if (rq->full && style == NSQ && rq->ghost == 0 && rq->cudable)
pair_build[index] = (Neighbor::PairPtr) &NeighborCuda::full_nsq_cuda;
else if (rq->full && style == BIN && rq->ghost == 0 && rq->cudable)
pair_build[index] = (Neighbor::PairPtr) &NeighborCuda::full_bin_cuda;
}
/* ---------------------------------------------------------------------- */
int NeighborCuda::check_distance()
{
double delx,dely,delz,rsq;
double delta,deltasq,delta1,delta2;
if (boxcheck) {
if (triclinic == 0) {
delx = bboxlo[0] - boxlo_hold[0];
dely = bboxlo[1] - boxlo_hold[1];
delz = bboxlo[2] - boxlo_hold[2];
delta1 = sqrt(delx*delx + dely*dely + delz*delz);
delx = bboxhi[0] - boxhi_hold[0];
dely = bboxhi[1] - boxhi_hold[1];
delz = bboxhi[2] - boxhi_hold[2];
delta2 = sqrt(delx*delx + dely*dely + delz*delz);
delta = 0.5 * (skin - (delta1+delta2));
deltasq = delta*delta;
} else {
domain->box_corners();
delta1 = delta2 = 0.0;
for (int i = 0; i < 8; i++) {
delx = corners[i][0] - corners_hold[i][0];
dely = corners[i][1] - corners_hold[i][1];
delz = corners[i][2] - corners_hold[i][2];
delta = sqrt(delx*delx + dely*dely + delz*delz);
if (delta > delta1) delta1 = delta;
else if (delta > delta2) delta2 = delta;
}
delta = 0.5 * (skin - (delta1+delta2));
deltasq = delta*delta;
}
} else deltasq = triggersq;
double **x = atom->x;
int nlocal = atom->nlocal;
if (includegroup) nlocal = atom->nfirst;
int flag = 0;
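// two paths: download positions and check displacements on the host, or
// reuse the reneighbor flag the device-side integrator has already computed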
if (not cuda->neighbor_decide_by_integrator) {
cuda->cu_x_download();
for (int i = 0; i < nlocal; i++) {
delx = x[i][0] - xhold[i][0];
dely = x[i][1] - xhold[i][1];
delz = x[i][2] - xhold[i][2];
rsq = delx*delx + dely*dely + delz*delz;
if (rsq > deltasq) flag = 1;
}
}
else flag = cuda->shared_data.atom.reneigh_flag;
int flagall;
MPI_Allreduce(&flag,&flagall,1,MPI_INT,MPI_MAX,world);
if (flagall && ago == MAX(every,delay)) ndanger++;
return flagall;
}
/* ---------------------------------------------------------------------- */
void NeighborCuda::build()
{
int i;
ago = 0;
ncalls++;
// store current atom positions and box size if needed
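// if the integrator decides when to reneighbor, the reference positions
// (xhold) are updated on the device; otherwise they are copied on the host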
if (dist_check) {
if (cuda->decide_by_integrator())
cuda->update_xhold(maxhold, &xhold[0][0]);
else {
if (cuda->finished_setup) cuda->cu_x_download();
double **x = atom->x;
int nlocal = atom->nlocal;
if (includegroup) nlocal = atom->nfirst;
if (nlocal > maxhold) {
maxhold = atom->nmax;
memory->destroy(xhold);
memory->create(xhold,maxhold,3,"neigh:xhold");
}
for (i = 0; i < nlocal; i++) {
xhold[i][0] = x[i][0];
xhold[i][1] = x[i][1];
xhold[i][2] = x[i][2];
}
if (boxcheck) {
if (triclinic == 0) {
boxlo_hold[0] = bboxlo[0];
boxlo_hold[1] = bboxlo[1];
boxlo_hold[2] = bboxlo[2];
boxhi_hold[0] = bboxhi[0];
boxhi_hold[1] = bboxhi[1];
boxhi_hold[2] = bboxhi[2];
} else {
domain->box_corners();
corners = domain->corners;
for (i = 0; i < 8; i++) {
corners_hold[i][0] = corners[i][0];
corners_hold[i][1] = corners[i][1];
corners_hold[i][2] = corners[i][2];
}
}
}
}
}
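// reconcile data residency before building: a non-CUDA build path needs
// current data on the host; a CUDA build before setup completes needs an upload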
if (not cudable && cuda->finished_setup && atom->avec->cudable)
cuda->downloadAll();
if (cudable && (not cuda->finished_setup)) {
cuda->checkResize();
cuda->uploadAll();
}
// if any lists store neighbors of ghosts:
// invoke grow() if nlocal+nghost exceeds previous list size
// else only invoke grow() if nlocal exceeds previous list size
// only done for lists with growflag set and which are perpetual
if (anyghostlist && atom->nlocal+atom->nghost > maxatom) {
maxatom = atom->nmax;
for (i = 0; i < nglist; i++) lists[glist[i]]->grow(maxatom);
} else if (atom->nlocal > maxatom) {
maxatom = atom->nmax;
for (i = 0; i < nglist; i++) lists[glist[i]]->grow(maxatom);
}
// extend atom bin list if necessary
if (style != NSQ && atom->nmax > maxbin) {
maxbin = atom->nmax;
memory->destroy(bins);
memory->create(bins,maxbin,"bins");
}
// check that neighbor list with special bond flags will not overflow
if (atom->nlocal+atom->nghost > NEIGHMASK)
error->one("Too many local+ghost atoms for neighbor list");
// invoke building of pair and molecular neighbor lists
// only for pairwise lists with buildflag set
for (i = 0; i < nblist; i++)
(this->*pair_build[blist[i]])(lists[blist[i]]);
if (atom->molecular) {
if (force->bond) (this->*bond_build)();
if (force->angle) (this->*angle_build)();
if (force->dihedral) (this->*dihedral_build)();
if (force->improper) (this->*improper_build)();
}
}

View File

@ -0,0 +1,39 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
#ifndef LMP_NEIGHBOR_CUDA_H
#define LMP_NEIGHBOR_CUDA_H
#include "neighbor.h"
namespace LAMMPS_NS {
class NeighborCuda : public Neighbor {
public:
NeighborCuda(class LAMMPS *);
void init();
int check_distance();
void build();
private:
class Cuda *cuda;
void choose_build(int, class NeighRequest *);
typedef void (NeighborCuda::*PairPtr)(class NeighList *);
void full_nsq_cuda(class NeighList *);
void full_bin_cuda(class NeighList *);
};
}
#endif

src/USER-CUDA/verlet_cuda.cpp Normal file

File diff suppressed because it is too large Load Diff

63
src/USER-CUDA/verlet_cuda.h Normal file
View File

@ -0,0 +1,63 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#ifdef INTEGRATE_CLASS
IntegrateStyle(verlet/cuda,VerletCuda)
#else
#ifndef LMP_VERLET_CUDA_H
#define LMP_VERLET_CUDA_H
#include "verlet.h"
#include "modify_cuda.h"
namespace LAMMPS_NS {
class VerletCuda : public Verlet
{
public:
VerletCuda(class LAMMPS *, int, char **);
void setup();
void setup_minimal(int);
void run(int);
void test_atom(int atom, char* astring);   // debugging aid
int dotestatom;                            // debugging aid
protected:
class Cuda *cuda;
void force_clear();
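// accumulated timings for the major phases of a run (presumably updated in run())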
double time_pair;
double time_kspace;
double time_comm;
double time_modify;
double time_fulliterate;
ModifyCuda* modify_cuda;
};
}
#endif
#endif