git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@6219 f3b2605a-c512-4ea7-a41b-209d697bcdaa

2011-05-26 22:16:44 +00:00
parent cfa91c0611
commit 5a8719e38f
21 changed files with 7231 additions and 0 deletions
--- a/src/USER-CUDA/Install.sh
+++ b/src/USER-CUDA/Install.sh
@ -0,0 +1,74 @@
 # Install/unInstall package files in LAMMPS
 # edit Makefile.package to include/exclude CUDA library
 #
 # usage: Install.sh 1   copy the package sources into src/ and add the
 #                       CUDA settings to ../Makefile.package
 #        Install.sh 0   remove the package sources from src/ and strip
 #                       the CUDA settings from ../Makefile.package

 # package files that are copied into (or removed from) the parent directory
 files="comm_cuda.cpp domain_cuda.cpp modify_cuda.cpp neighbor_cuda.cpp \
        neigh_full_cuda.cpp verlet_cuda.cpp cuda.cpp cuda_neigh_list.cpp \
        comm_cuda.h domain_cuda.h modify_cuda.h neighbor_cuda.h \
        verlet_cuda.h cuda.h cuda_common.h cuda_data.h cuda_modify_flags.h \
        cuda_neigh_list.h cuda_precision.h cuda_shared.h"

 # remove every setting this script adds to ../Makefile.package
 # the PKG_PATH pattern accepts an optional trailing "-DLMP_USER_CUDA " so that
 # it matches the text inserted below (which has no such flag) as well as the
 # form the previous pattern expected; before this change the PKG_PATH entry
 # was never removed and accumulated on every re-install
 strip_cuda_settings () {
   sed -i -e '/include ..\/..\/lib\/cuda\/Makefile.common/d' ../Makefile.package
   sed -i -e 's/-llammpscuda -lcuda -lcudart -lrt //' ../Makefile.package
   sed -i -e 's/-I..\/..\/lib\/cuda -I$(CUDA_INSTALL_PATH)\/include //' ../Makefile.package
   sed -i -e 's/-L..\/..\/lib\/cuda -L$(CUDA_INSTALL_PATH)\/lib64 -L$(CUDA_INSTALL_PATH)\/lib $(USRLIB_CONDITIONAL) \(-DLMP_USER_CUDA \)*//' ../Makefile.package
 }

 # "$1" is quoted so that a missing argument does not break the test expression
 if (test "$1" = 1) then

   if (test -e ../Makefile.package) then
     # strip first so that installing twice does not duplicate the settings
     strip_cuda_settings
     sed -i '1 i include ..\/..\/lib\/cuda\/Makefile.common' ../Makefile.package
     sed -i -e 's|^PKG_INC =[ \t]*|&-I..\/..\/lib\/cuda -I$(CUDA_INSTALL_PATH)\/include |' ../Makefile.package
     sed -i -e 's|^PKG_PATH =[ \t]*|&-L..\/..\/lib\/cuda -L$(CUDA_INSTALL_PATH)\/lib64 -L$(CUDA_INSTALL_PATH)\/lib $(USRLIB_CONDITIONAL) |' ../Makefile.package
     sed -i -e 's|^PKG_LIB =[ \t]*|&-llammpscuda -lcuda -lcudart -lrt |' ../Makefile.package
   fi

   for file in $files; do
     cp $file ..
   done

 elif (test "$1" = 0) then

   if (test -e ../Makefile.package) then
     strip_cuda_settings
   fi

   # -f: uninstalling a package that is not (fully) installed is not an error
   for file in $files; do
     rm -f ../$file
   done

 fi
--- a/src/USER-CUDA/comm_cuda.cpp
+++ b/src/USER-CUDA/comm_cuda.cpp
--- a/src/USER-CUDA/comm_cuda.h
+++ b/src/USER-CUDA/comm_cuda.h
@ -0,0 +1,69 @@
 /* ----------------------------------------------------------------------
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
   http://lammps.sandia.gov, Sandia National Laboratories
   Steve Plimpton, sjplimp@sandia.gov
   Copyright (2003) Sandia Corporation.  Under the terms of Contract
   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
   certain rights in this software.  This software is distributed under 
   the GNU General Public License.
   See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
 #ifndef LMP_COMM_CUDA_H
 #define LMP_COMM_CUDA_H
 #include "pointers.h"
 #include "cuda_data.h"
 #include "comm.h"
 namespace LAMMPS_NS {
 // Communication class of the USER-CUDA package.
 // Derives from Comm and declares the Comm-style entry points (init, setup,
 // forward/reverse communication, exchange, borders) as virtual, together
 // with additional *_cuda variants. The implementation lives in
 // comm_cuda.cpp, which is not part of this header.
 class CommCuda : public Comm {
 public:
  CommCuda(class LAMMPS *);
  ~CommCuda();
  virtual void init();
  virtual void setup();                     // setup 3d communication pattern
  virtual void forward_comm(int mode=0);              // forward communication of atom coords
  // NOTE(review): the four functions below look like the device-side forward
  // communication and its pack / transfer / unpack stages - confirm against
  // comm_cuda.cpp
  virtual void forward_comm_cuda();
  virtual void forward_comm_pack_cuda();
  virtual void forward_comm_transfer_cuda();
  virtual void forward_comm_unpack_cuda();
  virtual void forward_comm_pair(Pair *pair);    
  virtual void reverse_comm();              // reverse communication of forces
  virtual void exchange();                  // move atoms to new procs
  virtual void exchange_cuda();                  // move atoms to new procs
  virtual void borders();                   // setup list of atoms to communicate
  virtual void borders_cuda();                   // setup list of atoms to communicate
  // NOTE(review): name suggests border setup overlapped with forward
  // communication - confirm against comm_cuda.cpp
  virtual void borders_cuda_overlap_forward_comm();
  virtual void forward_comm_fix(class Fix *);          // forward comm from a Fix
 protected:
  class Cuda *cuda;                         // pointer to the package's Cuda object
  // cCudaData wrappers (see cuda_data.h); presumably mirror the Comm arrays
  // of the same name (pbc, slablo, slabhi, multilo, multihi, sendlist) - verify
  cCudaData<int, int, xy>* cu_pbc;
  cCudaData<double, X_FLOAT, x>* cu_slablo;
  cCudaData<double, X_FLOAT, x>* cu_slabhi;
  cCudaData<double, X_FLOAT, xy>* cu_multilo;
  cCudaData<double, X_FLOAT, xy>* cu_multihi;
  cCudaData<int, int, xy>* cu_sendlist;
  virtual void grow_send(int,int);          // reallocate send buffer
  virtual void grow_recv(int);              // free/allocate recv buffer
  virtual void grow_list(int, int);         // reallocate one sendlist
  virtual void grow_swap(int);              // grow swap and multi arrays
  virtual void allocate_swap(int);          // allocate swap arrays
  virtual void allocate_multi(int);         // allocate multi arrays
  virtual void free_swap();                 // free swap arrays
  virtual void free_multi();                // free multi arrays
 };
 }
 #endif
--- a/src/USER-CUDA/cuda.cpp
+++ b/src/USER-CUDA/cuda.cpp
@ -0,0 +1,837 @@
 /* ----------------------------------------------------------------------
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
   Original Version:
   http://lammps.sandia.gov, Sandia National Laboratories
   Steve Plimpton, sjplimp@sandia.gov 
   See the README file in the top-level LAMMPS directory. 
   ----------------------------------------------------------------------- 
   USER-CUDA Package and associated modifications:
   https://sourceforge.net/projects/lammpscuda/ 
   Christian Trott, christian.trott@tu-ilmenau.de
   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
   Theoretical Physics II, University of Technology Ilmenau, Germany 
   See the README file in the USER-CUDA directory. 
   This software is distributed under the GNU General Public License.
 ------------------------------------------------------------------------- */
 #include <cstdlib>
 #include <cstdio>
 #include <cstring>
 #include "cuda.h"
 #include "atom.h"
 #include "domain.h"
 #include "force.h"
 #include "pair.h"
 #include "update.h"
 #include "neighbor.h"
 #include "neigh_list.h"
 #include "universe.h"
 #include "input.h"
 #include "error.h"
 #include "cuda_neigh_list.h"
 //#include "pre_binning_cu.h"
 #include "binning_cu.h"
 //#include "reverse_binning_cu.h"
 #include <ctime>
 #include <cmath>
 #include "cuda_pair_cu.h"
 #include "cuda_cu.h"
 using namespace LAMMPS_NS;
 #define MAX(a,b) ((a) > (b) ? (a) : (b))
 /* ----------------------------------------------------------------------
    Constructor: registers this object as lmp->cuda, compares the compile
    settings reported by Cuda_Cuda_GetCompileSettings() with the ones this
    file was compiled with (printing a warning for every mismatch) and puts
    all members into a defined "nothing allocated yet" state.
    No device is selected here; that happens in accelerator().
 ------------------------------------------------------------------------- */
 Cuda::Cuda(LAMMPS *lmp) : Pointers(lmp)
 {
   cuda_exists=true;
   lmp->cuda=this;
   if(universe->me==0)
     printf("# Using LAMMPS_CUDA \n");
   shared_data.me=universe->me;
   device_set=false;

   Cuda_Cuda_GetCompileSettings(&shared_data);

   // sizeof() yields a size_t; it is cast to int so that the argument type
   // matches the %i conversion
   if(shared_data.compile_settings.prec_glob!=sizeof(CUDA_FLOAT)/4)
     printf("\n\n # CUDA WARNING: Compile Settings of cuda and cpp code differ! \n # CUDA WARNING: Global Precision: cuda %i cpp %i\n\n",shared_data.compile_settings.prec_glob, (int) (sizeof(CUDA_FLOAT)/4));
   if(shared_data.compile_settings.prec_x!=sizeof(X_FLOAT)/4)
     printf("\n\n # CUDA WARNING: Compile Settings of cuda and cpp code differ! \n # CUDA WARNING: X Precision: cuda %i cpp %i\n\n",shared_data.compile_settings.prec_x, (int) (sizeof(X_FLOAT)/4));
   if(shared_data.compile_settings.prec_v!=sizeof(V_FLOAT)/4)
     printf("\n\n # CUDA WARNING: Compile Settings of cuda and cpp code differ! \n # CUDA WARNING: V Precision: cuda %i cpp %i\n\n",shared_data.compile_settings.prec_v, (int) (sizeof(V_FLOAT)/4));
   if(shared_data.compile_settings.prec_f!=sizeof(F_FLOAT)/4)
     printf("\n\n # CUDA WARNING: Compile Settings of cuda and cpp code differ! \n # CUDA WARNING: F Precision: cuda %i cpp %i\n\n",shared_data.compile_settings.prec_f, (int) (sizeof(F_FLOAT)/4));
   if(shared_data.compile_settings.prec_pppm!=sizeof(PPPM_FLOAT)/4)
     printf("\n\n # CUDA WARNING: Compile Settings of cuda and cpp code differ! \n # CUDA WARNING: PPPM Precision: cuda %i cpp %i\n\n",shared_data.compile_settings.prec_pppm, (int) (sizeof(PPPM_FLOAT)/4));
   if(shared_data.compile_settings.prec_fft!=sizeof(FFT_FLOAT)/4)
     printf("\n\n # CUDA WARNING: Compile Settings of cuda and cpp code differ! \n # CUDA WARNING: FFT Precision: cuda %i cpp %i\n\n",shared_data.compile_settings.prec_fft, (int) (sizeof(FFT_FLOAT)/4));
   #ifdef FFT_CUFFT
     if(shared_data.compile_settings.cufft!=1) printf("\n\n # CUDA WARNING: Compile Settings of cuda and cpp code differ! \n # CUDA WARNING: cufft: cuda %i cpp %i\n\n",shared_data.compile_settings.cufft, 1);
   #else
     if(shared_data.compile_settings.cufft!=0) printf("\n\n # CUDA WARNING: Compile Settings of cuda and cpp code differ! \n # CUDA WARNING: cufft: cuda %i cpp %i\n\n",shared_data.compile_settings.cufft, 0);
   #endif
   // report the arch setting itself (the cufft setting was printed here before)
   if(shared_data.compile_settings.arch!=CUDA_ARCH)
     printf("\n\n # CUDA WARNING: Compile Settings of cuda and cpp code differ! \n # CUDA WARNING: arch: cuda %i cpp %i\n\n",shared_data.compile_settings.arch, CUDA_ARCH);

   // device data wrappers: nothing is allocated yet
   cu_x          = 0;
   cu_v          = 0;
   cu_f          = 0;
   cu_tag        = 0;
   cu_type       = 0;
   cu_mask       = 0;
   cu_image      = 0;
   cu_xhold      = 0;
   cu_q          = 0;
   cu_rmass      = 0;
   cu_mass       = 0;
   cu_virial     = 0;
   cu_eatom      = 0;
   cu_vatom      = 0;
   cu_radius     = 0;
   cu_density    = 0;
   cu_omega      = 0;
   cu_torque     = 0;
   cu_special    = 0;
   cu_nspecial   = 0;
   cu_molecule   = 0;
   cu_x_type     = 0;
   x_type        = 0;
   cu_v_radius   = 0;
   v_radius      = 0;
   cu_omega_rmass = 0;
   omega_rmass   = 0;
   binned_id     = 0;
   cu_binned_id  = 0;
   binned_idnew  = 0;
   cu_binned_idnew = 0;
   cu_map_array  = 0;

   // these are assigned in allocate(); the destructor deletes cu_eng_vdwl and
   // cu_eng_coul, so they must not be left indeterminate when allocate() is
   // never reached
   cu_eng_vdwl   = 0;
   cu_eng_coul   = 0;
   cu_extent     = 0;
   cu_debugdata  = 0;

   copy_buffer=0;
   copy_buffersize=0;
   neighbor_decide_by_integrator=0;
   pinned=true;

   // the debug buffer is allocated in allocate(); the allocation that used to
   // follow this line was discarded immediately and therefore leaked
   debugdata=0;

   finished_setup = false;
   begin_setup = false;
   finished_run = false;

   setSharedDataZero();

   uploadtime=0;
   downloadtime=0;
   dotiming=false;
   dotestatom = false;
   testatom = 0;
   oncpu = true;
   self_comm = 0;
   MYDBG( printf("# CUDA: Cuda::Cuda Done...\n");)
 }
 /* ----------------------------------------------------------------------
    Destructor: prints the timing summary (see print_timings()) and frees
    the device data wrappers, the host helper arrays and all registered
    CudaNeighList objects.
 ------------------------------------------------------------------------- */
 Cuda::~Cuda()
 {
   print_timings();
   if(universe->me==0) printf("# CUDA: Free memory...\n");

   delete cu_q;
   delete cu_x;
   delete cu_v;
   delete cu_f;
   delete cu_tag;
   delete cu_type;
   delete cu_mask;
   delete cu_image;
   delete cu_xhold;
   delete cu_mass;
   delete cu_rmass;

   // cu_eng_vdwl, cu_eng_coul, cu_extent and cu_debugdata are only assigned in
   // allocate(), in the same branch that assigns cu_virial. Deleting them
   // only when cu_virial is set avoids deleting pointers that were never
   // assigned.
   if(cu_virial)
   {
     delete cu_eng_vdwl;
     delete cu_eng_coul;
     delete cu_extent;
     delete cu_debugdata;
   }
   delete cu_virial;
   delete [] debugdata;      // host buffer allocated with new[] in allocate()

   delete cu_eatom;
   delete cu_vatom;
   delete cu_radius;
   delete cu_density;
   delete cu_omega;
   delete cu_torque;
   delete cu_special;        // allocated in checkResize(), was never freed
   delete cu_nspecial;       // allocated in checkResize(), was never freed
   delete cu_molecule;

   delete cu_x_type;
   delete [] x_type;
   delete cu_v_radius;
   delete [] v_radius;
   delete cu_omega_rmass;
   delete [] omega_rmass;

   delete cu_map_array;

   // the map owns the CudaNeighList objects created in registerNeighborList()
   std::map<NeighList*, CudaNeighList*>::iterator p = neigh_lists.begin();
   while(p != neigh_lists.end())
   {
     delete p->second;
     ++p;
   }
 }
 void Cuda::accelerator(int narg, char** arg)
 {
 	if(device_set) return;
 	if(universe->me==0)
 	printf("# CUDA: Activate GPU \n");
 	int* devicelist=NULL;
 	int pppn=2;
    for(int i=0;i<narg;i++)
 	{
 	  if(strcmp(arg[i],"gpu/node")==0) 
 	  {
 	  	if(++i==narg) 
 	  	  error->all("Invalid Options for 'accelerator' command. Expecting a number or keyword 'special' after 'gpu/node' option."); 
 	    if(strcmp(arg[i],"special")==0)
 	    {
 	  	   if(++i==narg) 
 	  	     error->all("Invalid Options for 'accelerator' command. Expecting number of GPUs to be used per node after keyword 'gpu/node special'."); 
 	       pppn=atoi(arg[i]);
 	       if(pppn<1) error->all("Invalid Options for 'accelerator' command. Expecting number of GPUs to be used per node after keyword 'gpu/node special'."); 
 	  	   if(i+pppn==narg) 
 	  	     error->all("Invalid Options for 'accelerator' command. Expecting list of device ids after keyword 'gpu/node special'."); 
 	       devicelist=new int[pppn];
 	       for(int k=0;k<pppn;k++)
 	         {i++;devicelist[k]=atoi(arg[i]);}
 	    }
 	    else
 	    pppn=atoi(arg[i]);
 	  }
 	  if(strcmp(arg[i],"pinned")==0) 
 	  {
 	  	if(++i==narg) 
 	  	  error->all("Invalid Options for 'accelerator' command. Expecting a number after 'pinned' option."); 
 	    pinned=atoi(arg[i])==0?false:true;
            if((pinned==false)&&(universe->me==0)) printf(" #CUDA: Pinned memory is not used for communication\n");
 	  }
 	  if(strcmp(arg[i],"dotiming")==0) 
 	  {
 	  	dotiming=true;
 	  }
 	  if(strcmp(arg[i],"suffix")==0) 
 	  {
 	  	if(++i==narg) 
 	  	  error->all("Invalid Options for 'accelerator' command. Expecting a string after 'suffix' option."); 
 	  	strcpy(lmp->asuffix,arg[i]);
 	  }
 	  if(strcmp(arg[i],"overlap_comm")==0) 
 	  {
 	  	shared_data.overlap_comm=1;
 	  }
 	  if(strcmp(arg[i],"dotest")==0) 
 	  {
 	  	if(++i==narg) 
 	  	  error->all("Invalid Options for 'accelerator' command. Expecting a number after 'dotest' option."); 
 	    testatom=atof(arg[i]);
 	    dotestatom=true;
 	  }
 	  if(strcmp(arg[i],"override_bpa")==0) 
 	  {
 	  	if(++i==narg) 
 	  	  error->all("Invalid Options for 'accelerator' command. Expecting a number after 'override_bpa' option."); 
 	      shared_data.pair.override_block_per_atom = atoi(arg[i]);
 	  }
 	}
 	CudaWrapper_Init(0, (char**)0,universe->me,pppn,devicelist);
 	//if(shared_data.overlap_comm)
 	  	CudaWrapper_AddStreams(3);
 	cu_x          = 0;
 	cu_v          = 0;
 	cu_f          = 0;
 	cu_tag        = 0;
 	cu_type       = 0;
 	cu_mask       = 0;
 	cu_image      = 0;
 	cu_xhold      = 0;
 	cu_q          = 0;
 	cu_rmass      = 0;
 	cu_mass       = 0;
 	cu_virial     = 0;
 	cu_eatom      = 0;
 	cu_vatom      = 0;
 	cu_radius	  = 0;
 	cu_density	  = 0;
 	cu_omega	  = 0;
 	cu_torque	  = 0;
 	cu_special 	  = 0;
 	cu_nspecial   = 0;
 	cu_molecule   = 0;
 	cu_x_type 	  = 0;
 	cu_v_radius	  = 0;
 	cu_omega_rmass	  = 0;
 	cu_binned_id  = 0;
 	cu_binned_idnew = 0;
 	device_set=true;
 	allocate();
 	delete devicelist;
 }
 void Cuda::setSharedDataZero()
 {
 	MYDBG(printf("# CUDA: Cuda::setSharedDataZero ...\n");)
 	shared_data.atom.nlocal = 0;
 	shared_data.atom.nghost = 0;
 	shared_data.atom.nall = 0;
 	shared_data.atom.nmax = 0;
 	shared_data.atom.ntypes = 0;
 	shared_data.atom.q_flag = 0;
 	shared_data.atom.need_eatom = 0;
 	shared_data.atom.need_vatom = 0;
    shared_data.pair.cudable_force = 0;
 	shared_data.pair.collect_forces_later = 0;
 	shared_data.pair.use_block_per_atom = 0;
 	shared_data.pair.override_block_per_atom = -1;
 	shared_data.pair.cut = 0;
 	shared_data.pair.cutsq = 0;
 	shared_data.pair.cut_inner = 0;
 	shared_data.pair.cut_coul = 0;
 	shared_data.pair.special_lj = 0;
 	shared_data.pair.special_coul = 0;
 	shared_data.pppm.cudable_force = 0;
 	shared_data.buffersize = 0;
 	shared_data.buffer_new = 1;
 	shared_data.buffer = NULL;
 	shared_data.comm.comm_phase=0;
 	shared_data.overlap_comm=0;
 	shared_data.comm.buffer = NULL;
 	shared_data.comm.buffer_size=0;
 	shared_data.comm.overlap_split_ratio=0;
   // setTimingsZero();
 }
 /* ----------------------------------------------------------------------
    Make sure the device is set (accelerator()), create the virial/energy,
    extent, flag and debug data on first use, then bring the per-atom data
    and the system parameters up to date.
 ------------------------------------------------------------------------- */
 void Cuda::allocate()
 {
   accelerator(0,NULL);
   MYDBG(printf("# CUDA: Cuda::allocate ...\n");)

   // one-time allocations; cu_virial doubles as the "already done" marker
   if(cu_virial == NULL)
   {
     cu_virial   = new cCudaData<double, ENERGY_FLOAT, x > (NULL, &shared_data.pair.virial, 6);
     cu_eng_vdwl = new cCudaData<double, ENERGY_FLOAT, x > (NULL, &shared_data.pair.eng_vdwl, 1);
     cu_eng_coul = new cCudaData<double, ENERGY_FLOAT, x > (NULL, &shared_data.pair.eng_coul, 1);
     cu_extent   = new cCudaData<double, double, x> (extent, 6);

     shared_data.flag = CudaWrapper_AllocCudaData(sizeof(int));

     int ndebug = 2*CUDA_MAX_DEBUG_SIZE;
     debugdata    = new int[ndebug];
     cu_debugdata = new cCudaData<int, int, x > (debugdata, ndebug);
     shared_data.debugdata = cu_debugdata->dev_data();
   }

   checkResize();
   setSystemParams();
   MYDBG(printf("# CUDA: Cuda::allocate done...\n");)
 }
 void Cuda::setSystemParams()
 {
    MYDBG(printf("# CUDA: Cuda::setSystemParams ...\n");)
 	shared_data.atom.nlocal = atom->nlocal;
 	shared_data.atom.nghost = atom->nghost;
 	shared_data.atom.nall = atom->nlocal + atom->nghost;
 	shared_data.atom.ntypes = atom->ntypes;
 	shared_data.atom.q_flag = atom->q_flag;
 	shared_data.atom.rmass_flag = atom->rmass_flag;
    MYDBG(printf("# CUDA: Cuda::setSystemParams done ...\n");)
 }
 /* ----------------------------------------------------------------------
    Copy the box description from the Domain class into shared_data.domain
    and mark it as updated (update = 2).
    The lamda bounds and the tilt factors are only copied for triclinic
    boxes.
 ------------------------------------------------------------------------- */
 void Cuda::setDomainParams()
 {
   MYDBG(printf("# CUDA: Cuda::setDomainParams ...\n");)

   cuda_shared_domain* cu_domain = &shared_data.domain;
   cu_domain->triclinic = domain->triclinic;

   // per-dimension box data
   for(int dim=0; dim<3; dim++)
   {
     cu_domain->periodicity[dim] = domain->periodicity[dim];
     cu_domain->sublo[dim]       = domain->sublo[dim];
     cu_domain->subhi[dim]       = domain->subhi[dim];
     cu_domain->boxlo[dim]       = domain->boxlo[dim];
     cu_domain->boxhi[dim]       = domain->boxhi[dim];
     cu_domain->prd[dim]         = domain->prd[dim];
   }

   if(domain->triclinic)
   {
     for(int dim=0; dim<3; dim++)
     {
       cu_domain->boxlo_lamda[dim] = domain->boxlo_lamda[dim];
       cu_domain->boxhi_lamda[dim] = domain->boxhi_lamda[dim];
       cu_domain->prd_lamda[dim]   = domain->prd_lamda[dim];
     }
     cu_domain->xy = domain->xy;
     cu_domain->xz = domain->xz;
     cu_domain->yz = domain->yz;
   }

   // h, h_inv and h_rate (6 entries each) are copied for every box type
   for(int k=0; k<6; k++)
   {
     cu_domain->h[k]      = domain->h[k];
     cu_domain->h_inv[k]  = domain->h_inv[k];
     cu_domain->h_rate[k] = domain->h_rate[k];
   }

   cu_domain->update=2;
   MYDBG(printf("# CUDA: Cuda::setDomainParams done ...\n");)
 }
 void Cuda::checkResize()
 {
    MYDBG(printf("# CUDA: Cuda::checkResize ...\n");)
    accelerator(0,NULL);
 	cuda_shared_atom* cu_atom = & shared_data.atom;
 	cuda_shared_pair* cu_pair = & shared_data.pair;
 	cu_atom->q_flag      = atom->q_flag;
 	cu_atom->rmass_flag  = atom->rmass ? 1 : 0;
 	cu_atom->nall = atom->nlocal + atom->nghost;
 	cu_atom->nlocal      = atom->nlocal;
 	cu_atom->nghost      = atom->nghost;
 	// do we have more atoms to upload than currently allocated memory on device? (also true if nothing yet allocated)
 	if(atom->nmax > cu_atom->nmax || cu_tag == NULL)
 	{
 		delete cu_x;               cu_x         = new cCudaData<double, X_FLOAT, yx> ((double*)atom->x , & cu_atom->x        , atom->nmax, 3,0,true); //cu_x->set_buffer(&(shared_data.buffer),&(shared_data.buffersize),true);
 		delete cu_v;               cu_v         = new cCudaData<double, V_FLOAT, yx> ((double*)atom->v, & cu_atom->v         , atom->nmax, 3);
 		delete cu_f;               cu_f         = new cCudaData<double, F_FLOAT, yx> ((double*)atom->f, & cu_atom->f         , atom->nmax, 3,0,true);
 		delete cu_tag;             cu_tag       = new cCudaData<int   , int    , x > (atom->tag       , & cu_atom->tag       , atom->nmax   );
 		delete cu_type;            cu_type      = new cCudaData<int   , int    , x > (atom->type      , & cu_atom->type      , atom->nmax   );
 		delete cu_mask;            cu_mask      = new cCudaData<int   , int    , x > (atom->mask      , & cu_atom->mask      , atom->nmax   );
 		delete cu_image;           cu_image     = new cCudaData<int   , int    , x > (atom->image     , & cu_atom->image     , atom->nmax   );
 		if(atom->rmass)
 			{delete cu_rmass;      cu_rmass     = new cCudaData<double, V_FLOAT, x > (atom->rmass     , & cu_atom->rmass     , atom->nmax  );}
 		if(cu_atom->q_flag)
 			{delete cu_q;          cu_q         = new cCudaData<double, F_FLOAT, x > ((double*)atom->q, & cu_atom->q         , atom->nmax  );}// cu_q->set_buffer(&(copy_buffer),&(copy_buffersize),true);}
 /*
 		if(force->pair)
 		if(force->pair->eatom)
 			{delete cu_eatom;          cu_eatom         = new cCudaData<double, ENERGY_FLOAT, x > (force->pair->eatom, & cu_atom->eatom         , atom->nmax  );}// cu_eatom->set_buffer(&(copy_buffer),&(copy_buffersize),true);}
 		if(force->pair)
 		if(force->pair->vatom)
 			{delete cu_vatom;          cu_vatom         = new cCudaData<double, ENERGY_FLOAT, yx > ((double*)force->pair->vatom, & cu_atom->vatom         , atom->nmax,6  );}// cu_vatom->set_buffer(&(copy_buffer),&(copy_buffersize),true);}
 */
 		if(atom->radius)
 		{
 			delete cu_radius;     cu_radius    = new cCudaData<double, X_FLOAT, x > (atom->radius    , & cu_atom->radius     , atom->nmax  );
 		    delete cu_v_radius;   cu_v_radius  = new cCudaData<V_FLOAT, V_FLOAT, x> (v_radius , & cu_atom->v_radius      , atom->nmax*4);
 		    delete cu_omega_rmass;   cu_omega_rmass  = new cCudaData<V_FLOAT, V_FLOAT, x> (omega_rmass , & cu_atom->omega_rmass      , atom->nmax*4);
 		}
 		/*		
 		if(atom->density)
 			{delete cu_density;    cu_density   = new cCudaData<double, F_FLOAT, x > (atom->density   , & cu_atom->density     , atom->nmax  );}
 		*/
 		if(atom->omega)
 			{delete cu_omega;      cu_omega     = new cCudaData<double, V_FLOAT, yx > (((double*) atom->omega)    , & cu_atom->omega     , atom->nmax,3  );}
 		if(atom->torque)
 			{delete cu_torque;     cu_torque    = new cCudaData<double, F_FLOAT, yx > (((double*) atom->torque)   , & cu_atom->torque     , atom->nmax,3  );}
 		if(atom->special)
 			{delete cu_special;     cu_special    = new cCudaData<int, int, yx > (((int*) &(atom->special[0][0]))   , & cu_atom->special     , atom->nmax,atom->maxspecial  ); shared_data.atom.maxspecial=atom->maxspecial;}
 		if(atom->nspecial)
 			{delete cu_nspecial;     cu_nspecial    = new cCudaData<int, int, yx > (((int*) atom->nspecial)  , & cu_atom->nspecial     , atom->nmax,3  );}
 		if(atom->molecule)
 			{delete cu_molecule;     cu_molecule    = new cCudaData<int, int, x > (((int*) atom->molecule)  , & cu_atom->molecule     , atom->nmax  );}
 		shared_data.atom.special_flag = neighbor->special_flag;
 		shared_data.atom.molecular = atom->molecular;
  	    cu_atom->update_nmax = 2;
 	    cu_atom->nmax        = atom->nmax;
 	    //delete [] x_type; 			x_type 		= new X_FLOAT4[atom->nmax];
 		delete cu_x_type;           cu_x_type   = new cCudaData<X_FLOAT, X_FLOAT, x> (x_type , & cu_atom->x_type      , atom->nmax*4);
 	   // shared_data.buffer_new = 2;
 	}
 	if(((cu_xhold==NULL)||(cu_xhold->get_dim()[0]<neighbor->maxhold))&&neighbor->xhold)
 	{
 		delete cu_xhold;           cu_xhold     = new cCudaData<double, X_FLOAT, yx> ((double*)neighbor->xhold, & cu_atom->xhold         , neighbor->maxhold, 3);
 		shared_data.atom.maxhold=neighbor->maxhold;
 	}
 	if(atom->mass && !cu_mass) 
 	{cu_mass      = new cCudaData<double, V_FLOAT, x > (atom->mass      , & cu_atom->mass      , atom->ntypes+1);}
 	cu_atom->mass_host   = atom->mass;
 	if(atom->map_style==1)
 	{
 	  if((cu_map_array==NULL))
 	  {
 	  	cu_map_array   = new cCudaData<int, int, x > (atom->get_map_array()   , & cu_atom->map_array     , atom->get_map_size()  );
 	  }
 	}
 	// if any of the host pointers have changed (e.g. re-allocated somewhere else), set to correct pointer
 	if(cu_x   ->get_host_data() != atom->x)    cu_x   ->set_host_data((double*) (atom->x));
 	if(cu_v   ->get_host_data() != atom->v)    cu_v   ->set_host_data((double*) (atom->v));
 	if(cu_f   ->get_host_data() != atom->f)    cu_f   ->set_host_data((double*) (atom->f));
 	if(cu_tag ->get_host_data() != atom->tag)  cu_tag ->set_host_data(atom->tag);
 	if(cu_type->get_host_data() != atom->type) cu_type->set_host_data(atom->type);
 	if(cu_mask->get_host_data() != atom->mask) cu_mask->set_host_data(atom->mask);
 	if(cu_image->get_host_data() != atom->image) cu_mask->set_host_data(atom->image);
 	if(cu_xhold)
 	if(cu_xhold->get_host_data()!= neighbor->xhold) cu_xhold->set_host_data((double*)(neighbor->xhold));
 	if(atom->rmass)
 	if(cu_rmass->get_host_data() != atom->rmass) cu_rmass->set_host_data((double*) (atom->rmass));
 	if(cu_atom->q_flag)
 	if(cu_q->get_host_data() != atom->q) cu_q->set_host_data((double*) (atom->q));
 	if(atom->radius)
 	if(cu_radius->get_host_data() != atom->radius) cu_radius->set_host_data((double*) (atom->radius));
 	/*
 	if(atom->density)
 	if(cu_density->get_host_data() != atom->density) cu_density->set_host_data((double*) (atom->density));
 	*/
 	if(atom->omega)
 	if(cu_omega->get_host_data() != atom->omega) cu_omega->set_host_data((double*) (atom->omega));
 	if(atom->torque)
 	if(cu_torque->get_host_data() != atom->torque) cu_torque->set_host_data((double*) (atom->torque));
 	if(atom->special)
 	if(cu_special->get_host_data() != atom->special)
 			{delete cu_special;     cu_special    = new cCudaData<int, int, yx > (((int*) atom->special)   , & cu_atom->special     , atom->nmax,atom->maxspecial  ); shared_data.atom.maxspecial=atom->maxspecial;}
 	if(atom->nspecial)
 	if(cu_nspecial->get_host_data() != atom->nspecial) cu_nspecial->set_host_data((int*) (atom->nspecial));
 	if(atom->molecule)
 	if(cu_molecule->get_host_data() != atom->molecule) cu_molecule->set_host_data((int*) (atom->molecule));
 	if(force)
 	if(cu_virial   ->get_host_data() != force->pair->virial)    cu_virial   ->set_host_data(force->pair->virial);
 	if(force)
 	if(cu_eng_vdwl ->get_host_data() != &force->pair->eng_vdwl)    cu_eng_vdwl  ->set_host_data(&force->pair->eng_vdwl);
 	if(force)
 	if(cu_eng_coul ->get_host_data() != &force->pair->eng_coul)    cu_eng_coul   ->set_host_data(&force->pair->eng_coul);
 	cu_atom->update_nlocal = 2;
 	MYDBG(printf("# CUDA: Cuda::checkResize done...\n");)
 }
 /* ----------------------------------------------------------------------
    Prepare the per-atom energy / virial device data for the pair style:
    create the wrapper on first use, point it at the current host array of
    force->pair and zero the device copy.

    eflag_atom  non-zero: set up per-atom energy (force->pair->eatom)
    vflag_atom  non-zero: set up per-atom virial (force->pair->vatom)
 ------------------------------------------------------------------------- */
 void Cuda::evsetup_eatom_vatom(int eflag_atom,int vflag_atom)
 {
   if(eflag_atom)
   {
     if(cu_eatom == NULL)
     {
       cu_eatom = new cCudaData<double, ENERGY_FLOAT, x > (force->pair->eatom, & (shared_data.atom.eatom), atom->nmax  );
     }
     cu_eatom->set_host_data(force->pair->eatom);
     cu_eatom->memset_device(0);
   }

   if(vflag_atom)
   {
     if(cu_vatom == NULL)
     {
       cu_vatom = new cCudaData<double, ENERGY_FLOAT, yx > ((double*)force->pair->vatom, & (shared_data.atom.vatom), atom->nmax ,6 );
     }
     cu_vatom->set_host_data((double*)force->pair->vatom);
     cu_vatom->memset_device(0);
   }
 }
 /* ----------------------------------------------------------------------
    Upload all per-atom data that is in use from the host to the device.
    The wall clock time spent in the upload() calls is added to uploadtime.
 ------------------------------------------------------------------------- */
 void Cuda::uploadAll()
 {
   MYDBG(printf("# CUDA: Cuda::uploadAll() ... start\n");)

   // the wrappers have to match the current host array size first
   if(atom->nmax!=shared_data.atom.nmax) checkResize();

   timespec t_begin;
   timespec t_end;
   clock_gettime(CLOCK_REALTIME,&t_begin);

   // always present
   cu_x    ->upload();
   cu_v    ->upload();
   cu_f    ->upload();
   cu_tag  ->upload();
   cu_type ->upload();
   cu_mask ->upload();
   cu_image->upload();

   // optional, depending on the atom style
   if(shared_data.atom.q_flag) cu_q       ->upload();
   if(atom->rmass)             cu_rmass   ->upload();
   if(atom->radius)            cu_radius  ->upload();
   if(atom->omega)             cu_omega   ->upload();
   if(atom->torque)            cu_torque  ->upload();
   if(atom->special)           cu_special ->upload();
   if(atom->nspecial)          cu_nspecial->upload();
   if(atom->molecule)          cu_molecule->upload();

   // per-atom energy / virial, if they have been set up
   if(cu_eatom) cu_eatom->upload();
   if(cu_vatom) cu_vatom->upload();

   clock_gettime(CLOCK_REALTIME,&t_end);
   uploadtime+=(t_end.tv_sec-t_begin.tv_sec+1.0*(t_end.tv_nsec-t_begin.tv_nsec)/1000000000);

   CUDA_IF_BINNING(Cuda_PreBinning(& shared_data);)
   CUDA_IF_BINNING(Cuda_Binning   (& shared_data);)

   shared_data.atom.triggerneighsq=neighbor->triggersq;
   MYDBG(printf("# CUDA: Cuda::uploadAll() ... end\n");)
 }
 void Cuda::downloadAll()
 {
 	MYDBG(printf("# CUDA: Cuda::downloadAll() ... start\n");)
 	timespec starttime;
 	timespec endtime;
 	if(atom->nmax!=shared_data.atom.nmax) checkResize();
 	CUDA_IF_BINNING( Cuda_ReverseBinning(& shared_data); )
 	clock_gettime(CLOCK_REALTIME,&starttime);
 	cu_x   ->download();
 	cu_v   ->download();
 	cu_f   ->download();
 	cu_type->download();
 	cu_tag ->download();
 	cu_mask->download();
 	cu_image->download();
 	//if(shared_data.atom.need_eatom) cu_eatom->download();
 	//if(shared_data.atom.need_vatom) cu_vatom->download();
 	if(shared_data.atom.q_flag) cu_q    ->download();
 	if(atom->rmass)             cu_rmass->download();
 	if(atom->radius)            cu_radius->download();
 	//	if(atom->density)           cu_density->download();
 	if(atom->omega)             cu_omega->download();
 	if(atom->torque)            cu_torque->download();
 	if(atom->special)           cu_special->download();
 	if(atom->nspecial)          cu_nspecial->download();
 	if(atom->molecule)          cu_molecule->download();
 	if(cu_eatom) cu_eatom->download();
 	if(cu_vatom) cu_vatom->download();
 	clock_gettime(CLOCK_REALTIME,&endtime);
 	downloadtime+=(endtime.tv_sec-starttime.tv_sec+1.0*(endtime.tv_nsec-starttime.tv_nsec)/1000000000);
 	MYDBG(printf("# CUDA: Cuda::downloadAll() ... end\n");)
 }
 void Cuda::downloadX()
 {
 	Cuda_Pair_RevertXType(& this->shared_data);
 	cu_x->download();
 }
 /* ----------------------------------------------------------------------
    Return the CudaNeighList that belongs to neigh_list; it is created and
    stored in neigh_lists on the first request. The object stays owned by
    this class (it is deleted in the destructor).

    neigh_list  host neighbor list used as lookup key
    returns     the associated CudaNeighList
 ------------------------------------------------------------------------- */
 CudaNeighList* Cuda::registerNeighborList(class NeighList* neigh_list)
 {
   MYDBG(printf("# CUDA: Cuda::registerNeighborList() ... start a\n");)

   CudaNeighList* neigh_list_cuda;
   std::map<NeighList*, CudaNeighList*>::iterator p = neigh_lists.find(neigh_list);
   if(p != neigh_lists.end())
     neigh_list_cuda = p->second;
   else
   {
     neigh_list_cuda = new CudaNeighList(lmp, neigh_list);
     neigh_lists.insert(std::pair<NeighList*, CudaNeighList*>(neigh_list, neigh_list_cuda));
   }

   // single exit point, so that the closing debug message is reachable
   MYDBG(printf("# CUDA: Cuda::registerNeighborList() ... end b\n");)
   return neigh_list_cuda;
 }
 void Cuda::uploadAllNeighborLists()
 {
 	MYDBG(printf("# CUDA: Cuda::uploadAllNeighborList() ... start\n");)
 	std::map<NeighList*, CudaNeighList*>::iterator p = neigh_lists.begin();
 	while(p != neigh_lists.end())
 	{
 		p->second->nl_upload();
 		if(not (p->second->neigh_list->cuda_list->build_cuda))
 		for(int i=0;i<atom->nlocal;i++)
 		p->second->sneighlist.maxneighbors=MAX(p->second->neigh_list->numneigh[i],p->second->sneighlist.maxneighbors) ;
 		++p;
 	}
 	MYDBG(printf("# CUDA: Cuda::uploadAllNeighborList() ... done\n");)
 }
 void Cuda::downloadAllNeighborLists()
 {
 	MYDBG(printf("# CUDA: Cuda::downloadAllNeighborList() ... start\n");)
 	std::map<NeighList*, CudaNeighList*>::iterator p = neigh_lists.begin();
 	while(p != neigh_lists.end())
 	{
 		p->second->nl_download();
 		++p;
 	}
 }
 void Cuda::update_xhold(int &maxhold,double* xhold)
 {
     if(this->shared_data.atom.maxhold<atom->nmax)
     {
        maxhold = atom->nmax;
 		delete this->cu_xhold;           this->cu_xhold     = new cCudaData<double, X_FLOAT, yx> ((double*)xhold, & this->shared_data.atom.xhold         , maxhold, 3);
     }
     this->shared_data.atom.maxhold=maxhold;
 	 CudaWrapper_CopyData(this->cu_xhold->dev_data(),this->cu_x->dev_data(),3*atom->nmax*sizeof(X_FLOAT));
 }
 /* ----------------------------------------------------------------------
    Reset all timing counters in shared_data.cuda_timings to zero and call
    the CudaWrapper_Check*Time() functions with the argument true.
 ------------------------------------------------------------------------- */
 void Cuda::setTimingsZero()
 {
   shared_data.cuda_timings.test1=0;
   shared_data.cuda_timings.test2=0;

   //communication
   shared_data.cuda_timings.comm_forward_total = 0;
   shared_data.cuda_timings.comm_forward_mpi_upper = 0;
   shared_data.cuda_timings.comm_forward_mpi_lower = 0;
   shared_data.cuda_timings.comm_forward_kernel_pack = 0;
   shared_data.cuda_timings.comm_forward_kernel_unpack = 0;
   shared_data.cuda_timings.comm_forward_upload = 0;
   shared_data.cuda_timings.comm_forward_download = 0;

   shared_data.cuda_timings.comm_exchange_total = 0;
   shared_data.cuda_timings.comm_exchange_mpi = 0;
   shared_data.cuda_timings.comm_exchange_kernel_pack = 0;
   shared_data.cuda_timings.comm_exchange_kernel_unpack = 0;
   shared_data.cuda_timings.comm_exchange_kernel_fill = 0;
   shared_data.cuda_timings.comm_exchange_cpu_pack= 0;
   shared_data.cuda_timings.comm_exchange_upload = 0;
   shared_data.cuda_timings.comm_exchange_download = 0;

   shared_data.cuda_timings.comm_border_total = 0;
   shared_data.cuda_timings.comm_border_mpi = 0;
   shared_data.cuda_timings.comm_border_kernel_pack = 0;
   shared_data.cuda_timings.comm_border_kernel_unpack = 0;
   shared_data.cuda_timings.comm_border_kernel_buildlist = 0;
   shared_data.cuda_timings.comm_border_kernel_self = 0;
   shared_data.cuda_timings.comm_border_upload = 0;
   shared_data.cuda_timings.comm_border_download = 0;

   //pair forces
   shared_data.cuda_timings.pair_xtype_conversion = 0;
   shared_data.cuda_timings.pair_kernel = 0;
   shared_data.cuda_timings.pair_virial = 0;
   shared_data.cuda_timings.pair_force_collection = 0;

   //neighbor
   shared_data.cuda_timings.neigh_bin = 0;
   shared_data.cuda_timings.neigh_build = 0;
   shared_data.cuda_timings.neigh_special = 0;

   //PPPM
   // these used to be bare expression statements without "= 0", so the
   // PPPM counters were never reset
   shared_data.cuda_timings.pppm_particle_map = 0;
   shared_data.cuda_timings.pppm_make_rho = 0;
   shared_data.cuda_timings.pppm_brick2fft = 0;
   shared_data.cuda_timings.pppm_poisson = 0;
   shared_data.cuda_timings.pppm_fillbrick = 0;
   shared_data.cuda_timings.pppm_fieldforce = 0;
   shared_data.cuda_timings.pppm_compute = 0;

   // NOTE(review): the argument true presumably resets the counters kept
   // inside the wrapper - confirm against the CudaWrapper implementation
   CudaWrapper_CheckUploadTime(true);
   CudaWrapper_CheckDownloadTime(true);
   CudaWrapper_CheckCPUBufUploadTime(true);
   CudaWrapper_CheckCPUBufDownloadTime(true);
 }
 // Print the collected timing counters to stdout.
 // Nothing is printed unless universe->me is 0 and dotiming is set.
 // The PPPM section is only printed when force->kspace is non-null.
 void Cuda::print_timings()
 {
 	if(universe->me != 0 || !dotiming) return;

 	printf("\n # CUDA: Special timings\n\n");

 	// host <-> device transfer timers kept by the CUDA wrapper
 	printf("\n Transfer Times\n");
 	printf(" PCIe Upload:  \t %lf s\n", CudaWrapper_CheckUploadTime());
 	printf(" PCIe Download:\t %lf s\n", CudaWrapper_CheckDownloadTime());
 	printf(" CPU Tempbbuf Upload:   \t %lf \n", CudaWrapper_CheckCPUBufUploadTime());
 	printf(" CPU Tempbbuf Download: \t %lf \n", CudaWrapper_CheckCPUBufDownloadTime());

 	// communication: forward
 	printf("\n Communication \n");
 	printf(" Forward Total           \t %lf \n", shared_data.cuda_timings.comm_forward_total);
 	printf(" Forward MPI Upper Bound \t %lf \n", shared_data.cuda_timings.comm_forward_mpi_upper);
 	printf(" Forward MPI Lower Bound \t %lf \n", shared_data.cuda_timings.comm_forward_mpi_lower);
 	printf(" Forward Kernel Pack     \t %lf \n", shared_data.cuda_timings.comm_forward_kernel_pack);
 	printf(" Forward Kernel Unpack   \t %lf \n", shared_data.cuda_timings.comm_forward_kernel_unpack);
 	printf(" Forward Kernel Self     \t %lf \n", shared_data.cuda_timings.comm_forward_kernel_self);
 	printf(" Forward Upload          \t %lf \n", shared_data.cuda_timings.comm_forward_upload);
 	printf(" Forward Download        \t %lf \n", shared_data.cuda_timings.comm_forward_download);
 	printf(" Forward Overlap Split Ratio\t %lf \n", shared_data.comm.overlap_split_ratio);
 	printf("\n");

 	// communication: exchange
 	printf(" Exchange Total          \t %lf \n", shared_data.cuda_timings.comm_exchange_total);
 	printf(" Exchange MPI            \t %lf \n", shared_data.cuda_timings.comm_exchange_mpi);
 	printf(" Exchange Kernel Pack    \t %lf \n", shared_data.cuda_timings.comm_exchange_kernel_pack);
 	printf(" Exchange Kernel Unpack  \t %lf \n", shared_data.cuda_timings.comm_exchange_kernel_unpack);
 	printf(" Exchange Kernel Fill    \t %lf \n", shared_data.cuda_timings.comm_exchange_kernel_fill);
 	printf(" Exchange CPU Pack	     \t %lf \n", shared_data.cuda_timings.comm_exchange_cpu_pack);
 	printf(" Exchange Upload         \t %lf \n", shared_data.cuda_timings.comm_exchange_upload);
 	printf(" Exchange Download       \t %lf \n", shared_data.cuda_timings.comm_exchange_download);
 	printf("\n");

 	// communication: border
 	printf(" Border Total            \t %lf \n", shared_data.cuda_timings.comm_border_total);
 	printf(" Border MPI              \t %lf \n", shared_data.cuda_timings.comm_border_mpi);
 	printf(" Border Kernel Pack      \t %lf \n", shared_data.cuda_timings.comm_border_kernel_pack);
 	printf(" Border Kernel Unpack    \t %lf \n", shared_data.cuda_timings.comm_border_kernel_unpack);
 	printf(" Border Kernel Self      \t %lf \n", shared_data.cuda_timings.comm_border_kernel_self);
 	printf(" Border Kernel BuildList \t %lf \n", shared_data.cuda_timings.comm_border_kernel_buildlist);
 	printf(" Border Upload           \t %lf \n", shared_data.cuda_timings.comm_border_upload);
 	printf(" Border Download 	     \t %lf \n", shared_data.cuda_timings.comm_border_download);
 	printf("\n");

 	// pair forces
 	printf(" Pair XType Conversion   \t %lf \n", shared_data.cuda_timings.pair_xtype_conversion);
 	printf(" Pair Kernel             \t %lf \n", shared_data.cuda_timings.pair_kernel);
 	printf(" Pair Virial             \t %lf \n", shared_data.cuda_timings.pair_virial);
 	printf(" Pair Force Collection   \t %lf \n", shared_data.cuda_timings.pair_force_collection);
 	printf("\n");

 	// neighbor
 	printf(" Neighbor Binning        \t %lf \n", shared_data.cuda_timings.neigh_bin);
 	printf(" Neighbor Build          \t %lf \n", shared_data.cuda_timings.neigh_build);
 	printf(" Neighbor Special        \t %lf \n", shared_data.cuda_timings.neigh_special);
 	printf("\n");

 	// pppm, only reported when a kspace style is present
 	if(force->kspace)
 	{
 		printf(" PPPM Total              \t %lf \n", shared_data.cuda_timings.pppm_compute);
 		printf(" PPPM Particle Map       \t %lf \n", shared_data.cuda_timings.pppm_particle_map);
 		printf(" PPPM Make Rho           \t %lf \n", shared_data.cuda_timings.pppm_make_rho);
 		printf(" PPPM Brick2fft          \t %lf \n", shared_data.cuda_timings.pppm_brick2fft);
 		printf(" PPPM Poisson            \t %lf \n", shared_data.cuda_timings.pppm_poisson);
 		printf(" PPPM Fillbrick          \t %lf \n", shared_data.cuda_timings.pppm_fillbrick);
 		printf(" PPPM Fieldforce         \t %lf \n", shared_data.cuda_timings.pppm_fieldforce);
 		printf("\n");
 	}

 	// debug counters
 	printf(" Debug Test 1            \t %lf \n", shared_data.cuda_timings.test1);
 	printf(" Debug Test 2            \t %lf \n", shared_data.cuda_timings.test2);
 	printf("\n");
 }
--- a/src/USER-CUDA/cuda.h
+++ b/src/USER-CUDA/cuda.h
@ -0,0 +1,153 @@
 /* ----------------------------------------------------------------------
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
   Original Version:
   http://lammps.sandia.gov, Sandia National Laboratories
   Steve Plimpton, sjplimp@sandia.gov 
   See the README file in the top-level LAMMPS directory. 
   ----------------------------------------------------------------------- 
   USER-CUDA Package and associated modifications:
   https://sourceforge.net/projects/lammpscuda/ 
   Christian Trott, christian.trott@tu-ilmenau.de
   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
   Theoretical Physics II, University of Technology Ilmenau, Germany 
   See the README file in the USER-CUDA directory. 
   This software is distributed under the GNU General Public License.
 ------------------------------------------------------------------------- */
 #ifndef CUDA_H
 #define CUDA_H
 #include "pointers.h"
 #include "cuda_shared.h"
 #include "cuda_data.h"
 #include "cuda_precision.h"
 #include <map>
 // MYDBG(a): expands to its argument when _DEBUG is defined and to nothing
 // otherwise, so debug-only statements can be written inline.
 #ifdef _DEBUG
 #define MYDBG(a) a
 #else
 #define MYDBG(a) 
 #endif
 namespace LAMMPS_NS
 {
 	// Host-side state of the USER-CUDA package: the cuda_shared_data structure
 	// shared between host and device code, plus one cCudaData wrapper pointer
 	// per array listed below. Derives (protected) from Pointers.
 	class Cuda : protected Pointers
 	{
 		public:
 		Cuda(class LAMMPS *);
 		~Cuda();
 		//static void setDevice(class LAMMPS*);
 		void allocate();
 		void accelerator(int, char **);
 		void setSharedDataZero();
 		void setSystemParams();
 		void setDomainParams();
 		void checkResize();
 		void evsetup_eatom_vatom(int eflag_atom,int vflag_atom);
 		void uploadAll();
 		void downloadAll();
 		void downloadX();
 		// neighbor list bookkeeping; lists are kept in the private map neigh_lists
 		class CudaNeighList* registerNeighborList(class NeighList* neigh_list);
 		void uploadAllNeighborLists();
 		void downloadAllNeighborLists();
 		// store the two neighbor-rebuild parameters in shared_data.atom
 		void set_neighinit(int dist_check, double triggerneighsq)
 		{
    	  shared_data.atom.dist_check=dist_check;  
    	  shared_data.atom.triggerneighsq = triggerneighsq;	  	
 		}
 		// true only if neighbor_decide_by_integrator is non-zero, cu_xhold has
 		// been allocated and finished_setup is set
 		bool decide_by_integrator()
 		{
 		 return neighbor_decide_by_integrator  && cu_xhold && finished_setup;
 	    }
 	    // update the device-side xhold array; maxhold may be raised (in/out)
 	    void update_xhold(int &maxhold,double* xhold);
 		// reset / print the counters in shared_data.cuda_timings
 		void setTimingsZero();
 		void print_timings();
 		// forwards to cu_x->download()
 		void cu_x_download() {cu_x->download();}
 		bool device_set;
 		bool dotiming;       // print_timings() prints nothing unless this is set
 		bool dotestatom;
 		int testatom;
 		double uploadtime,downloadtime;
 		bool finished_setup,begin_setup;
 		bool oncpu;
 		bool finished_run;
 		int self_comm;
 		int cuda_exists;
 		double extent[6];
 		int* debugdata;
 		// data shared between host code and device code
 		// (number of atoms, device pointers for up- & download)
 		cuda_shared_data shared_data;
 		// cCudaData<host type, device type, copy mode> wrappers, one per array
 		cCudaData<double  , F_FLOAT , x >* cu_q;
 		cCudaData<double  , F_FLOAT , yx>* cu_f;
 		cCudaData<double  , V_FLOAT , x >* cu_mass;
 		cCudaData<double  , V_FLOAT , x >* cu_rmass;
 		cCudaData<double  , V_FLOAT , yx>* cu_v;
 		cCudaData<double  , X_FLOAT , yx>* cu_x;
 		cCudaData<double  , X_FLOAT , yx>* cu_xhold;
 		cCudaData<int     , int     , x >* cu_mask;
 		cCudaData<int     , int     , x >* cu_tag;
 		cCudaData<int     , int     , x >* cu_type;
 		cCudaData<int     , int     , x >* cu_image;
 		cCudaData<double  , ENERGY_FLOAT, x >* cu_eatom;
 		cCudaData<double  , ENERGY_FLOAT, yx>* cu_vatom;
 		cCudaData<double  , ENERGY_FLOAT, x >* cu_virial;
 		cCudaData<double  , ENERGY_FLOAT, x >* cu_eng_vdwl;
 		cCudaData<double  , ENERGY_FLOAT, x >* cu_eng_coul;
 		cCudaData<double  , double  , x >* cu_extent;
 		int* binned_id;
 		cCudaData<int 	  , int	    , xx >* cu_binned_id;
 		int* binned_idnew;
 		cCudaData<int 	  , int	    , xx >* cu_binned_idnew;
 		cCudaData<int 	  , int	    , x >* cu_debugdata;
 		cCudaData<double  , X_FLOAT , x>* cu_radius;
 		cCudaData<double  , F_FLOAT , x>* cu_density;
 		cCudaData<double  , V_FLOAT , yx>* cu_omega;
 		cCudaData<double  , F_FLOAT , yx>* cu_torque;
 		cCudaData<int 	  , int	    , yx >* cu_special;
 		cCudaData<int 	  , int	    , yx >* cu_nspecial;
 		cCudaData<int     , int     , x >* cu_molecule;
 		// arrays whose host and device element types are identical; each has a
 		// separate raw host pointer declared next to its wrapper
 		cCudaData<X_FLOAT  , X_FLOAT , x>* cu_x_type;
 		X_FLOAT* x_type;
 		cCudaData<V_FLOAT  , V_FLOAT , x>* cu_v_radius;
 		V_FLOAT* v_radius;
 		cCudaData<V_FLOAT  , V_FLOAT , x>* cu_omega_rmass;
 		V_FLOAT* omega_rmass;
 		cCudaData<int     , int     , x >* cu_map_array;
 		int neighbor_decide_by_integrator;  // read by decide_by_integrator()
        bool pinned;
 		void* copy_buffer;
 		int copy_buffersize;
 		private:
 		// registered neighbor lists: host NeighList -> its CudaNeighList
 		std::map<class NeighList*, class CudaNeighList*> neigh_lists;
 	};
 }
 #endif // CUDA_H
--- a/src/USER-CUDA/cuda_common.h
+++ b/src/USER-CUDA/cuda_common.h
@ -0,0 +1,344 @@
 /* ----------------------------------------------------------------------
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
   Original Version:
   http://lammps.sandia.gov, Sandia National Laboratories
   Steve Plimpton, sjplimp@sandia.gov 
   See the README file in the top-level LAMMPS directory. 
   ----------------------------------------------------------------------- 
   USER-CUDA Package and associated modifications:
   https://sourceforge.net/projects/lammpscuda/ 
   Christian Trott, christian.trott@tu-ilmenau.de
   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
   Theoretical Physics II, University of Technology Ilmenau, Germany 
   See the README file in the USER-CUDA directory. 
   This software is distributed under the GNU General Public License.
 ------------------------------------------------------------------------- */
 #ifndef _CUDA_COMMON_H_
 #define _CUDA_COMMON_H_
 //#include "cutil.h"
 #include "cuda_precision.h"
 #include "cuda_wrapper_cu.h"
 #define CUDA_MAX_TYPES_PLUS_ONE 12 //for pair styles which use constant space for parameters, this needs to be one larger than the number of atom types
 //This cannot be arbitrarily large, since constant space is limited.
 //In principle one could alter potentials to use global memory for parameters; some do that already, since the first examples I encountered had a high number (20+) of atom types.
 //Christian
 // size of a (type x type) parameter table
 #define CUDA_MAX_TYPES2 (CUDA_MAX_TYPES_PLUS_ONE * CUDA_MAX_TYPES_PLUS_ONE)
 // NOTE(review): presumably the upper bound on special-neighbor entries per atom - confirm against users of this constant
 #define CUDA_MAX_NSPECIAL 25
 // define some easy-to-use debug and emulation macros
 // MYDBG(a):    expands to 'a' only when _DEBUG is defined
 // MYEMU(a):    expands to 'a' only when __DEVICE_EMULATION__ evaluates to non-zero
 // MYEMUDBG(a): expands to 'a' only when both conditions hold
 #ifdef _DEBUG
 #define MYDBG(a) a
 #else
 #define MYDBG(a) 
 #endif
 #if __DEVICE_EMULATION__
 #define MYEMU(a) a
 #else
 #define MYEMU(a) 
 #endif
 #define MYEMUDBG(a) MYEMU(MYDBG(a))
 // Add Prefix (needed as workaround, same constant's names in different files causes conflict)
 // MY_PREFIX is not defined in the visible part of this header; it is expected
 // to be defined by the including file before these macros are used.
 // MY_ADD_PREFIX pastes prefix##_##var; the extra MY_ADD_PREFIX2 level makes
 // sure MY_PREFIX is macro-expanded before the paste, so MY_AP(x) yields
 // <value of MY_PREFIX>_x.
 #define MY_ADD_PREFIX(prefix, var) prefix##_##var
 #define MY_ADD_PREFIX2(prefix, var) MY_ADD_PREFIX(prefix, var)
 #define MY_AP(var) MY_ADD_PREFIX2(MY_PREFIX, var)
 // Stringification with the same two-level trick; MY_CONST(var) produces the
 // string literal formed from the expanded prefix, "_" and the expanded var.
 #define MY_VAR_TO_STR(var) #var
 #define MY_VAR_TO_STR2(var) MY_VAR_TO_STR(var)
 #define MY_CONST(var) (MY_VAR_TO_STR2(MY_PREFIX) "_" MY_VAR_TO_STR2(var))
 // feature switches; CUDA_USE_TEXTURE enables the texture declarations below
 #define CUDA_USE_TEXTURE
 #define CUDA_USE_FLOAT4
 //constants used by many classes
 //domain
 // Each _name macro maps to the MY_PREFIX-prefixed symbol; the symbols
 // themselves are declared in device constant memory right below.
 #define _boxhi       MY_AP(boxhi)
 #define _boxlo       MY_AP(boxlo)
 #define _subhi       MY_AP(subhi)
 #define _sublo       MY_AP(sublo)
 #define _box_size    MY_AP(box_size)
 #define _prd         MY_AP(prd)
 #define _periodicity MY_AP(periodicity)
 #define _triclinic	 MY_AP(triclinic)
 #define _boxhi_lamda MY_AP(boxhi_lamda)
 #define _boxlo_lamda MY_AP(boxlo_lamda)
 #define _prd_lamda   MY_AP(prd_lamda)
 #define _h		 	 MY_AP(h)
 #define _h_inv	 	 MY_AP(h_inv)
 #define _h_rate		 MY_AP(h_rate)
 // 3-component vectors
 __device__ __constant__ X_FLOAT _boxhi[3];
 __device__ __constant__ X_FLOAT _boxlo[3];
 __device__ __constant__ X_FLOAT _subhi[3];
 __device__ __constant__ X_FLOAT _sublo[3];
 __device__ __constant__ X_FLOAT _box_size[3];
 __device__ __constant__ X_FLOAT _prd[3];
 __device__ __constant__ int _periodicity[3];
 __device__ __constant__ int _triclinic;
 __device__ __constant__ X_FLOAT _boxhi_lamda[3];
 __device__ __constant__ X_FLOAT _boxlo_lamda[3];
 __device__ __constant__ X_FLOAT _prd_lamda[3];
 // 6-component arrays
 __device__ __constant__ X_FLOAT _h[6];
 __device__ __constant__ X_FLOAT _h_inv[6];
 __device__ __constant__ V_FLOAT _h_rate[6];
 //atom properties
 // Prefixed names for the per-atom device pointers and flags declared below.
 #define _x           MY_AP(x)
 #define _v           MY_AP(v)
 #define _f           MY_AP(f)
 #define _tag         MY_AP(tag)
 #define _type        MY_AP(type)
 #define _mask        MY_AP(mask)
 #define _image       MY_AP(image)
 #define _q           MY_AP(q)
 #define _mass        MY_AP(mass)
 #define _rmass       MY_AP(rmass)
 #define _rmass_flag  MY_AP(rmass_flag)
 #define _eatom       MY_AP(eatom)
 #define _vatom       MY_AP(vatom)
 #define _x_type      MY_AP(x_type)
 #define _radius      MY_AP(radius)
 #define _density     MY_AP(density)
 #define _omega       MY_AP(omega)
 #define _torque      MY_AP(torque)
 #define _special     MY_AP(special)
 #define _maxspecial  MY_AP(maxspecial)
 #define _nspecial    MY_AP(nspecial)
 #define _special_flag  MY_AP(special_flag)
 #define _molecule    MY_AP(molecule)
 #define _v_radius    MY_AP(v_radius)
 #define _omega_rmass MY_AP(omega_rmass)
 #define _freeze_group_bit MY_AP(freeze_group_bit)
 #define _map_array   MY_AP(map_array)
 // The pointers live in constant memory; the arrays they point to are not declared here.
 __device__ __constant__ X_FLOAT* _x;  //holds pointer to positions
 __device__ __constant__ V_FLOAT* _v;
 __device__ __constant__ F_FLOAT* _f;
 __device__ __constant__ int* _tag;
 __device__ __constant__ int* _type;
 __device__ __constant__ int* _mask;
 __device__ __constant__ int* _image;
 __device__ __constant__ V_FLOAT* _mass;
 __device__ __constant__ F_FLOAT* _q;
 __device__ __constant__ V_FLOAT* _rmass;
 __device__ __constant__ int _rmass_flag;
 __device__ __constant__ ENERGY_FLOAT* _eatom;
 __device__ __constant__ ENERGY_FLOAT* _vatom;
 __device__ __constant__ X_FLOAT4* _x_type;  // 4-component entries; presumably position plus type - TODO confirm
 __device__ __constant__ X_FLOAT* _radius;  
 __device__ __constant__ F_FLOAT* _density;  
 __device__ __constant__ V_FLOAT* _omega;  
 __device__ __constant__ F_FLOAT* _torque;  
 __device__ __constant__ int* _special;
 __device__ __constant__ int _maxspecial;
 __device__ __constant__ int* _nspecial;
 __device__ __constant__ int _special_flag[4];
 __device__ __constant__ int* _molecule;
 __device__ __constant__ V_FLOAT4* _v_radius;  // 4-component entries; presumably velocity plus radius - TODO confirm
 __device__ __constant__ V_FLOAT4* _omega_rmass;  // 4-component entries; presumably omega plus rmass - TODO confirm
 __device__ __constant__ int _freeze_group_bit;
 __device__ __constant__ int* _map_array;
 // Texture references, only declared when CUDA_USE_TEXTURE is defined.
 // For each array the element type depends on the matching *_PRECISION macro:
 // a float-based texture when it equals 1, an int2/int4-based one otherwise.
 // NOTE(review): the int2/int4 variants presumably carry double values as raw
 // integer words - confirm against the kernels that fetch from them.
 #ifdef CUDA_USE_TEXTURE
 	#define _x_tex         MY_AP(x_tex)
 	#if X_PRECISION == 1
 	texture<float> _x_tex;
 	#else
 	texture<int2,1> _x_tex;
 	#endif
 	#define _type_tex         MY_AP(type_tex)
 	texture<int> _type_tex;
 	#define _x_type_tex         MY_AP(x_type_tex)
 	#if X_PRECISION == 1
 	texture<float4,1> _x_type_tex;
 	#else
 	texture<int4,1> _x_type_tex;
 	#endif
 	#define _v_radius_tex         MY_AP(v_radius_tex)
 	#if V_PRECISION == 1
 	texture<float4,1> _v_radius_tex;
 	#else
 	texture<int4,1> _v_radius_tex;
 	#endif
 	#define _omega_rmass_tex         MY_AP(omega_rmass_tex)
 	#if V_PRECISION == 1
 	texture<float4,1> _omega_rmass_tex;
 	#else
 	texture<int4,1> _omega_rmass_tex;
 	#endif
 	#define _q_tex         MY_AP(q_tex)
 	#if F_PRECISION == 1
 	texture<float> _q_tex;
 	#else
 	texture<int2,1> _q_tex;
 	#endif
 #endif
 //neighbor
 // This whole section is compiled only when the including file defines
 // IncludeCommonNeigh before including this header.
 #ifdef IncludeCommonNeigh
 #define _inum        	MY_AP(inum)
 #define _inum_border    MY_AP(inum_border)
 #define _ilist       	MY_AP(ilist)
 #define _ilist_border 	MY_AP(ilist_border)
 #define _numneigh    	MY_AP(numneigh)
 #define _numneigh_border 	MY_AP(numneigh_border)
 #define _numneigh_inner		MY_AP(numneigh_inner)
 #define _firstneigh  	MY_AP(firstneigh)
 #define _neighbors 	MY_AP(neighbors)
 #define _neighbors_border 	MY_AP(neighbors_border)
 #define _neighbors_inner  	MY_AP(neighbors_inner)
 #define _reneigh_flag 	MY_AP(reneigh_flag)
 #define _triggerneighsq MY_AP(triggerneighsq)
 #define _xhold       	MY_AP(xhold)
 #define _maxhold     	MY_AP(maxhold)
 #define _dist_check     MY_AP(dist_check)
 #define _neighbor_maxlocal MY_AP(neighbor_maxlocal)
 #define _maxneighbors   MY_AP(maxneighbors)
 #define _overlap_comm   MY_AP(overlap_comm)
 // neighbor list pointers and scalars held in constant memory
 __device__ __constant__ int _inum;
 __device__ __constant__ int* _inum_border;
 __device__ __constant__ int* _ilist;
 __device__ __constant__ int* _ilist_border;
 __device__ __constant__ int* _numneigh;
 __device__ __constant__ int* _numneigh_border;
 __device__ __constant__ int* _numneigh_inner;
 __device__ __constant__ int** _firstneigh;
 __device__ __constant__ int* _neighbors;
 __device__ __constant__ int* _neighbors_border;
 __device__ __constant__ int* _neighbors_inner;
 __device__ __constant__ int* _reneigh_flag;
 __device__ __constant__ X_FLOAT _triggerneighsq;
 __device__ __constant__ X_FLOAT* _xhold;  // NOTE(review): presumably the positions saved at the last neighbor build - confirm
 __device__ __constant__ int _maxhold;
 __device__ __constant__ int _dist_check;
 __device__ __constant__ int _neighbor_maxlocal;
 __device__ __constant__ int _maxneighbors;
 __device__ __constant__ int _overlap_comm;
 #endif
 //system properties
 // Prefixed names for the scalar counters, time-step values and accumulator
 // pointers declared below.
 #define _nall        MY_AP(nall)
 #define _nghost      MY_AP(nghost)
 #define _nlocal      MY_AP(nlocal)
 #define _nmax        MY_AP(nmax)
 #define _cuda_ntypes MY_AP(cuda_ntypes)
 #define _dtf         MY_AP(dtf)
 #define _dtv         MY_AP(dtv)
 #define _factor      MY_AP(factor)
 #define _virial      MY_AP(virial)
 #define _eng_vdwl    MY_AP(eng_vdwl)
 #define _eng_coul    MY_AP(eng_coul)
 #define _molecular   MY_AP(molecular)
 // note: the atom counts are unsigned
 __device__ __constant__ unsigned _nall;
 __device__ __constant__ unsigned _nghost;
 __device__ __constant__ unsigned _nlocal;
 __device__ __constant__ unsigned _nmax;
 __device__ __constant__ unsigned _cuda_ntypes;
 __device__ __constant__ V_FLOAT _dtf;
 __device__ __constant__ X_FLOAT _dtv;
 __device__ __constant__ V_FLOAT _factor;
 __device__ __constant__ ENERGY_FLOAT* _virial;
 __device__ __constant__ ENERGY_FLOAT* _eng_vdwl;
 __device__ __constant__ ENERGY_FLOAT* _eng_coul;
 __device__ __constant__ int _molecular;
 //other general constants
 #define _buffer      MY_AP(buffer)
 #define _flag		 MY_AP(flag)
 #define _debugdata   MY_AP(debugdata)
 // untyped scratch buffer pointer, flag pointer and debug output pointer
 __device__ __constant__ void* _buffer;
 __device__ __constant__ int* _flag;
 __device__ __constant__ int* _debugdata;
 // Pointers to data fields on the GPU are held in constant space
 // -> reduces register usage and the number of parameters for kernel calls.
 // They will be variables of file scope in the cuda files.

 // MY_OUTPUT_RESULT(result): prints a short text for a cudaError_t value.
 // Only the five listed error codes are named; anything else prints "unknown".
 // The macro expands to a bare switch statement.
 #define MY_OUTPUT_RESULT(result) \
 	switch(result) \
 	{ \
 		case cudaSuccess: printf(" => cudaSuccess\n"); break; \
 		case cudaErrorInvalidValue: printf(" => cudaErrorInvalidValue\n"); break; \
 		case cudaErrorInvalidSymbol: printf(" => cudaErrorInvalidSymbol\n"); break; \
 		case cudaErrorInvalidDevicePointer: printf(" => cudaErrorInvalidDevicePointer\n"); break; \
 		case cudaErrorInvalidMemcpyDirection: printf(" => cudaErrorInvalidMemcpyDirection\n"); break; \
 		default: printf(" => unknown\n"); break; \
 	}
 // CUT_CHECK_ERROR(errorMessage): queries cudaGetLastError() and, on failure,
 // prints errorMessage together with file, line and the CUDA error string to
 // stderr and terminates the program with exit(EXIT_FAILURE).
 // With _DEBUG defined it additionally calls cudaThreadSynchronize() and
 // applies the same check to its result.
 // The macro expands to a brace-enclosed block.
 #ifdef _DEBUG
 #  define CUT_CHECK_ERROR(errorMessage) {                                    \
    cudaError_t err = cudaGetLastError();                                    \
    if( cudaSuccess != err) {                                                \
        fprintf(stderr, "Cuda error: %s in file '%s' in line %i : %s.\n",    \
                errorMessage, __FILE__, __LINE__, cudaGetErrorString( err) );\
        exit(EXIT_FAILURE);                                                  \
    }                                                                        \
    err = cudaThreadSynchronize();                                           \
    if( cudaSuccess != err) {                                                \
        fprintf(stderr, "Cuda error: %s in file '%s' in line %i : %s.\n",    \
                errorMessage, __FILE__, __LINE__, cudaGetErrorString( err) );\
        exit(EXIT_FAILURE);                                                  \
    }                                                                        \
    }
 #else
 #  define CUT_CHECK_ERROR(errorMessage) {                                    \
    cudaError_t err = cudaGetLastError();                                    \
    if( cudaSuccess != err) {                                                \
        fprintf(stderr, "Cuda error: %s in file '%s' in line %i : %s.\n",    \
                errorMessage, __FILE__, __LINE__, cudaGetErrorString( err) );\
        exit(EXIT_FAILURE);                                                  \
    }                                                                        \
    }
 #endif
 // CUDA_SAFE_CALL_NO_SYNC(call): evaluates 'call' once, and if the result is
 // not cudaSuccess prints file, line and the CUDA error string to stderr and
 // terminates the program with exit(EXIT_FAILURE).
 #  define CUDA_SAFE_CALL_NO_SYNC( call) {                                    \
    cudaError err = call;                                                    \
    if( cudaSuccess != err) {                                                \
        fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n",        \
                __FILE__, __LINE__, cudaGetErrorString( err) );              \
        exit(EXIT_FAILURE);                                                  \
    } }
 // CUDA_SAFE_CALL(call): currently identical to CUDA_SAFE_CALL_NO_SYNC.
 // NOTE(review): the expansion already ends in ';', so a use such as
 // "if(c) CUDA_SAFE_CALL(f()); else ..." does not compile.
 #  define CUDA_SAFE_CALL( call)     CUDA_SAFE_CALL_NO_SYNC(call);
 // Single-bit flags (powers of two), one per atom data field, so that several
 // fields can be combined with bitwise OR.
 // NOTE(review): presumably used to select which arrays are transferred - the
 // users of these flags are not visible in this header; confirm.
 #define X_MASK 1
 #define V_MASK 2
 #define F_MASK 4
 #define TAG_MASK 8
 #define TYPE_MASK 16
 #define MASK_MASK 32
 #define IMAGE_MASK 64
 #define Q_MASK 128
 #define MOLECULE_MASK 256
 #define RMASS_MASK 512
 #define RADIUS_MASK 1024
 #define DENSITY_MASK 2048
 #define OMEGA_MASK 4096
 #define TORQUE_MASK 8192
 #endif // #ifndef _CUDA_COMMON_H_
--- a/src/USER-CUDA/cuda_data.h
+++ b/src/USER-CUDA/cuda_data.h
@ -0,0 +1,796 @@
 /* ----------------------------------------------------------------------
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
   Original Version:
   http://lammps.sandia.gov, Sandia National Laboratories
   Steve Plimpton, sjplimp@sandia.gov 
   See the README file in the top-level LAMMPS directory. 
   ----------------------------------------------------------------------- 
   USER-CUDA Package and associated modifications:
   https://sourceforge.net/projects/lammpscuda/ 
   Christian Trott, christian.trott@tu-ilmenau.de
   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
   Theoretical Physics II, University of Technology Ilmenau, Germany 
   See the README file in the USER-CUDA directory. 
   This software is distributed under the GNU General Public License.
 ------------------------------------------------------------------------- */
 #ifndef _CUDA_DATA_H_
 #define _CUDA_DATA_H_
 // Layout selector for cCudaData: which host dimensions exist and in which
 // order they are stored on the device.
 enum copy_mode {x, xx, xy, yx, xyz, xzy}; // yxz, yzx, zxy, zyx not yet implemented since they were not needed yet
 // xx is equivalent to x. It exists because atom_vec has a member named x, so
 // using copy mode x there produces compile errors.
 #include "cuda_shared.h"
 #include "cuda_wrapper_cu.h"
 #include "cuda_data_cu.h"
 #include <ctime>
 #include <cstdio>
 #include <typeinfo>
 // Pairs a host array with a device array and moves data between them.
 //   host_type : element type of the host array
 //   dev_type  : element type of the device array
 //   mode      : copy_mode describing the dimensionality / storage order
 // The device side is described by a dev_array (device pointer plus dim[3]).
 template <typename host_type, typename dev_type, copy_mode mode>
 class cCudaData
 {
 	protected:
 	void** buffer;            // optional external buffer, installed via set_buffer()
 	int* buf_size;            // size belonging to *buffer
 	host_type* host_data;     // host array (not allocated by this class)
 	dev_array* dev_data_array; // device pointer and dimensions
 	dev_type* temp_data;      // host staging array, allocated by the constructors when mode is not x/xx or host_type differs from dev_type
 	unsigned nbytes;          // size of the device allocation in bytes
 	bool owns_dev_array;      // true if dev_data_array was created by the constructor and is deleted by the destructor
 	bool current_data_on_device; //this is not yet working as intended and therefore deactivated
 	bool current_data_on_host;
 	bool is_continues;        // NOTE(review): presumably "host data is contiguous" - confirm in set_buffer()/upload()
 	bool pinned;              // temp_data comes from CudaWrapper_AllocPinnedHostData instead of new[]
 	public:
 	// variant using a caller-provided dev_array
 	cCudaData(host_type* host_data, dev_array* dev_data_array, unsigned dim_x, unsigned dim_y=0, unsigned dim_z=0, bool is_pinned=false);
 	// variant allocating its own dev_array
 	cCudaData(host_type* host_data, unsigned dim_x, unsigned dim_y=0, unsigned dim_z=0, bool is_pinned=false);
 	~cCudaData();
 	// device pointer, or NULL when no dev_array is attached
 	void* dev_data() {if(dev_data_array!=NULL) return dev_data_array->dev_data; else return NULL;};
 	void set_dev_data(void* adev_data) {dev_data_array->dev_data=adev_data;};
 	void set_dev_array(dev_array* adev_array) {dev_data_array=adev_array;};
 	void set_host_data(host_type* host_data);
 	void* get_host_data() { return host_data;};
 	void set_buffer(void** buffer,int* buf_size,bool ais_continues);
 	// pointer to the dimension array stored in the dev_array
 	unsigned int* get_dim() {return dev_data_array->dim;};
 	// if you want to upload data to the gpu, which will not change there, then set will_be_changed=false
 	// if you want to upload data to the gpu and update it there, then set will_be_changed=true (default)
 	void upload(bool will_be_changed=true);
 	void uploadAsync(int stream, bool will_be_changed=true ); 
 	// if you want to download data just to have a look at it, then set will_be_changed=false
 	// if you are going to modify the downloaded data, then set will_be_changed=true (default)
 	void download(bool will_be_changed=true);
 	void downloadAsync(int stream); 
 	void memset_device(int value);
 	void device_data_has_changed() {current_data_on_device=false;}
 	void host_data_has_changed() {current_data_on_host=false;}
 	// size of the device array in bytes: dim[0]*sizeof(dev_type), multiplied
 	// by dim[1] and dim[2] whenever those are non-zero
 	int dev_size() {
 		int size = dev_data_array->dim[0]*sizeof(dev_type);
 		if(dev_data_array->dim[1]) size*=dev_data_array->dim[1];
 		if(dev_data_array->dim[2]) size*=dev_data_array->dim[2];
 		return size;}
 };
 // Construct a wrapper around a caller-provided dev_array.
 //   host_data      : host array to mirror (not owned)
 //   dev_data_array : descriptor that receives the dimensions and the device
 //                    pointer; it is not deleted by the destructor
 //   dim_x/y/z      : extents; which of them are used depends on 'mode'
 //   is_pinned      : allocate the staging array as pinned host memory
 // For an empty array (zero bytes) no device memory is allocated and the host
 // pointer, staging pointer and device pointer are all set to NULL.
 template <typename host_type, typename dev_type, copy_mode mode>
 cCudaData<host_type, dev_type, mode>
 ::cCudaData(host_type* host_data, dev_array* dev_data_array, unsigned dim_x, unsigned dim_y, unsigned dim_z, bool is_pinned)
 {
 	pinned=is_pinned;
 	owns_dev_array = false;
 	current_data_on_device = false;
 	current_data_on_host = false;
 	is_continues = false;
 	// upload() tests 'buffer' before anything else, so it must not be left
 	// uninitialized; it stays NULL until set_buffer() installs one
 	buffer = NULL;
 	buf_size = NULL;
 	// only allocated below when a staging array is actually needed
 	temp_data = NULL;
 	this->host_data = host_data;
 	this->dev_data_array = dev_data_array;
 	unsigned ndev;
 	if((mode == x)||(mode==xx))
 	{
 		ndev = dim_x;
 		dev_data_array->dim[0] = dim_x;
 		dev_data_array->dim[1] = 0;
 		dev_data_array->dim[2] = 0;
 	}
 	else if(mode == xy || mode == yx )
 	{
 		ndev = dim_x * dim_y;
 		dev_data_array->dim[0] = dim_x;
 		dev_data_array->dim[1] = dim_y;
 		dev_data_array->dim[2] = 0;
 	}
 	else
 	{
 		ndev = dim_x * dim_y * dim_z;
 		dev_data_array->dim[0] = dim_x;
 		dev_data_array->dim[1] = dim_y;
 		dev_data_array->dim[2] = dim_z;
 	}
 	nbytes = ndev * sizeof(dev_type);
 	if(nbytes==0)
 	{
 		// the parameter shadows the member, so the member has to be named
 		// explicitly (the old code only cleared the parameter)
 		this->host_data=NULL;
 		dev_data_array->dev_data=NULL;
 		return;
 	}
 	dev_data_array->dev_data = CudaWrapper_AllocCudaData(nbytes);
 	// a staging array is needed whenever the data has to be reordered or converted
 	if(((mode!=x)&&(mode!=xx)) || typeid(host_type) != typeid(dev_type))
 	{
 		if(not pinned)
 		temp_data = new dev_type[ndev];
 		else
 		{
 			temp_data = (dev_type*) CudaWrapper_AllocPinnedHostData(ndev*sizeof(dev_type));
 		}
 	}
 }
 // Construct a wrapper that allocates its own dev_array descriptor, which is
 // deleted again by the destructor.
 //   host_data : host array to mirror (not owned)
 //   dim_x/y/z : extents; which of them are used depends on 'mode'
 //   is_pinned : allocate the staging array as pinned host memory
 // For an empty array (zero bytes) no device memory is allocated and the host
 // pointer, staging pointer and device pointer are all set to NULL.
 template <typename host_type, typename dev_type, copy_mode mode>
 cCudaData<host_type, dev_type, mode>
 ::cCudaData(host_type* host_data, unsigned dim_x, unsigned dim_y, unsigned dim_z, bool is_pinned)
 {
 	pinned=is_pinned;
 	this->dev_data_array = new dev_array;
 	this->owns_dev_array = true;
 	current_data_on_device = false;
 	current_data_on_host = false;
 	is_continues = false;
 	// upload() tests 'buffer' before anything else, so it must not be left
 	// uninitialized; it stays NULL until set_buffer() installs one
 	buffer = NULL;
 	buf_size = NULL;
 	// only allocated below when a staging array is actually needed
 	temp_data = NULL;
 	this->host_data = host_data;
 	unsigned ndev;
 	if((mode == x)||(mode==xx))
 	{
 		ndev = dim_x;
 		dev_data_array->dim[0] = dim_x;
 		dev_data_array->dim[1] = 0;
 		dev_data_array->dim[2] = 0;
 	}
 	else if(mode == xy || mode == yx )
 	{
 		ndev = dim_x * dim_y;
 		dev_data_array->dim[0] = dim_x;
 		dev_data_array->dim[1] = dim_y;
 		dev_data_array->dim[2] = 0;
 	}
 	else
 	{
 		ndev = dim_x * dim_y * dim_z;
 		dev_data_array->dim[0] = dim_x;
 		dev_data_array->dim[1] = dim_y;
 		dev_data_array->dim[2] = dim_z;
 	}
 	nbytes = ndev * sizeof(dev_type);
 	if(nbytes==0)
 	{
 		// the parameter shadows the member, so the member has to be named
 		// explicitly (the old code only cleared the parameter)
 		this->host_data=NULL;
 		dev_data_array->dev_data=NULL;
 		return;
 	}
 	dev_data_array->dev_data = CudaWrapper_AllocCudaData(nbytes);
 	// a staging array is needed whenever the data has to be reordered or converted
 	if(((mode!=x)&&(mode!=xx)) || (typeid(host_type) != typeid(dev_type)))
 	{
 		if(not pinned)
 		temp_data = new dev_type[ndev];
 		else
 		{
 			temp_data = (dev_type*) CudaWrapper_AllocPinnedHostData(ndev*sizeof(dev_type));
 		}
 	}
 }
 // Release the staging array (pinned or new[]-allocated), the device memory
 // and, if this object created it, the dev_array descriptor.
 template <typename host_type, typename dev_type, copy_mode mode>
 cCudaData<host_type, dev_type, mode>
 ::~cCudaData()
 {
 	// same condition the constructors use when deciding to create temp_data
 	const bool uses_temp_data =
 		!((mode==x)||(mode==xx)) || (typeid(host_type) != typeid(dev_type));

 	if(uses_temp_data)
 	{
 		if(pinned)
 		{
 			CudaWrapper_FreePinnedHostData((void*)temp_data);
 		}
 		else
 		{
 			delete [] temp_data;
 		}
 	}

 	if((dev_data_array->dev_data)&&(nbytes>0))
 	{
 		CudaWrapper_FreeCudaData(dev_data_array->dev_data,nbytes);
 	}

 	if(owns_dev_array)
 	{
 		delete dev_data_array;
 	}
 }
 // Point this wrapper at a different host array. Only the stored pointer is
 // replaced; no data is transferred and no sizes are changed.
 template <typename host_type, typename dev_type, copy_mode mode>
 void cCudaData<host_type, dev_type, mode>::set_host_data(host_type* host_data)
 {
 	// 'this->' is required because the parameter shadows the member
 	this->host_data = host_data;
 }
 template <typename host_type, typename dev_type, copy_mode mode>
 void cCudaData<host_type, dev_type, mode>
 ::upload(bool will_be_changed)
 {
 	// if current data is already up, do not re-upload it
 //	if(current_data_on_device) return;
    if(buffer&&is_continues)
    {
   	printf("Actual Buffer: %p %i\n",*buffer,*buf_size);
    	if(typeid(host_type)==typeid(double))
    	{
    	  if(typeid(dev_type)==typeid(double))
    	  {
    	  	CudaData_Upload_DoubleDouble((void*) host_data,dev_data_array->dev_data,
    	  										dev_data_array->dim,mode,*buffer);
 			current_data_on_device = true;
 			if(will_be_changed) current_data_on_host = false;
 			return;
    	  }
    	  else if(typeid(dev_type)==typeid(float))
    	  {
    	  	CudaData_Upload_DoubleFloat((void*) host_data,dev_data_array->dev_data,
    	  										dev_data_array->dim,mode,*buffer);
 			current_data_on_device = true;
 			if(will_be_changed) current_data_on_host = false;
 			return;
    	  }
    	}
    	else if(typeid(host_type)==typeid(float))
    	{
    	  if(typeid(dev_type)==typeid(double))
    	  {
    	  	CudaData_Upload_FloatDouble((void*) host_data,dev_data_array->dev_data,
    	  										dev_data_array->dim,mode,*buffer);
 			current_data_on_device = true;
 			if(will_be_changed) current_data_on_host = false;
 			return;
    	  }
    	  else if(typeid(dev_type)==typeid(float))
    	  {
    	  	CudaData_Upload_FloatFloat((void*) host_data,dev_data_array->dev_data,
    	  										dev_data_array->dim,mode,*buffer);
 			current_data_on_device = true;
 			if(will_be_changed) current_data_on_host = false;
 			return;
    	  }
    	}
    	else if(typeid(host_type)==typeid(int))
    	{
    	  if(typeid(dev_type)==typeid(int))
    	  {
    	  	CudaData_Upload_IntInt((void*) host_data,dev_data_array->dev_data,
    	  										dev_data_array->dim,mode,*buffer);
 			current_data_on_device = true;
 			if(will_be_changed) current_data_on_host = false;
 			return;
    	  }
    	}
    }
 	switch(mode)
 	{
 		case x:
 		{
 			if(typeid(host_type) == typeid(dev_type))
 				CudaWrapper_UploadCudaData(host_data, dev_data_array->dev_data, nbytes);
 			else
 			{
    timespec time1,time2;
    clock_gettime(CLOCK_REALTIME,&time1);
 			  for(unsigned i=0; i<dev_data_array->dim[0]; ++i) temp_data[i] = static_cast<dev_type>(host_data[i]);
 	clock_gettime(CLOCK_REALTIME,&time2);
 	CudaWrapper_AddCPUBufUploadTime(
        time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000);
 			  CudaWrapper_UploadCudaData(temp_data, dev_data_array->dev_data, nbytes);
 			}
 			break;
 		}
 		case xx:
 		{
 			if(typeid(host_type) == typeid(dev_type))
 				CudaWrapper_UploadCudaData(host_data, dev_data_array->dev_data, nbytes);
 			else
 			{
    timespec time1,time2;
    clock_gettime(CLOCK_REALTIME,&time1);
 				for(unsigned i=0; i<dev_data_array->dim[0]; ++i) temp_data[i] = static_cast<dev_type>(host_data[i]);
 	clock_gettime(CLOCK_REALTIME,&time2);
 	CudaWrapper_AddCPUBufUploadTime(
        time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000);
 				CudaWrapper_UploadCudaData(temp_data, dev_data_array->dev_data, nbytes);
 			}
 			break;
 		}
 		case xy:
 		{
    timespec time1,time2;
    clock_gettime(CLOCK_REALTIME,&time1);
 			for(unsigned i=0; i<dev_data_array->dim[0]; ++i)
 			{
 				dev_type* temp = &temp_data[i * dev_data_array->dim[1]];
 				for(unsigned j=0; j<dev_data_array->dim[1]; ++j)
 				{
 					temp[j] = static_cast<dev_type>((reinterpret_cast<host_type**>(host_data))[i][j]);
 				}
 			}
 	clock_gettime(CLOCK_REALTIME,&time2);
 	CudaWrapper_AddCPUBufUploadTime(
        time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000);
 			CudaWrapper_UploadCudaData(temp_data, dev_data_array->dev_data, nbytes);
 			break;
 		}
 		case yx:
 		{
    timespec time1,time2;
    clock_gettime(CLOCK_REALTIME,&time1);
 			for(unsigned j=0; j<dev_data_array->dim[1]; ++j)
 			{
 				dev_type* temp = &temp_data[j*dev_data_array->dim[0]];
 				for(unsigned i=0; i<dev_data_array->dim[0]; ++i)
 				{
 					temp[i] = static_cast<dev_type>(reinterpret_cast<host_type**>(host_data)[i][j]);
 				}
 			}
 	clock_gettime(CLOCK_REALTIME,&time2);
 	CudaWrapper_AddCPUBufUploadTime(
        time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000);
 			CudaWrapper_UploadCudaData(temp_data, dev_data_array->dev_data, nbytes);
 			break;
 		}	
 		case xyz:
 		{
    timespec time1,time2;
    clock_gettime(CLOCK_REALTIME,&time1);
 			for(unsigned i=0; i<dev_data_array->dim[0]; ++i)
 			for(unsigned j=0; j<dev_data_array->dim[1]; ++j)
 			{
 				dev_type* temp = &temp_data[(i*dev_data_array->dim[1]+j)*dev_data_array->dim[2]];
 				for(unsigned k=0; k<dev_data_array->dim[2]; ++k)
 				{
 					temp[k] = static_cast<dev_type>(reinterpret_cast<host_type***>(host_data)[i][j][k]);
 				}
 			}
 	clock_gettime(CLOCK_REALTIME,&time2);
 	CudaWrapper_AddCPUBufUploadTime(
        time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000);
 			CudaWrapper_UploadCudaData(temp_data, dev_data_array->dev_data, nbytes);
 			break;
 		}	
 		case xzy:
 		{
    timespec time1,time2;
    clock_gettime(CLOCK_REALTIME,&time1);
 			for(unsigned i=0; i<dev_data_array->dim[0]; ++i)
 			for(unsigned k=0; k<dev_data_array->dim[2]; ++k)
 			{
 				dev_type* temp = &temp_data[(i*dev_data_array->dim[2]+k)*dev_data_array->dim[1]];
 				for(unsigned j=0; j<dev_data_array->dim[1]; ++j)
 				{
 					temp[j] = static_cast<dev_type>(reinterpret_cast<host_type***>(host_data)[i][j][k]);
 				}
 			}
 	clock_gettime(CLOCK_REALTIME,&time2);
 	CudaWrapper_AddCPUBufUploadTime(
        time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000);
 			CudaWrapper_UploadCudaData(temp_data, dev_data_array->dev_data, nbytes);
 			break;
 		}	
 	}
 	// we have uploaded the data to the device, i.e.:
 	current_data_on_device = true;
 	// the data is going to change on the device, making the host data out-dated
 	if(will_be_changed) current_data_on_host = false;
 }
 // Host -> device copy that uses the asynchronous upload wrapper.
 //
 // When a staging buffer is attached and is_continues is set, the copy is
 // handed to the CudaData_Upload_<Host><Dev> routine that matches the
 // host/device type pair (note: `stream` is not passed to those routines).
 // For every other type pair, or without a buffer, the host data is converted
 // and/or re-ordered into temp_data on the CPU according to `mode` and then
 // sent with CudaWrapper_UploadCudaDataAsync on `stream`. The time spent in the
 // CPU conversion loops is reported through CudaWrapper_AddCPUBufUploadTime.
 //
 // stream           forwarded to CudaWrapper_UploadCudaDataAsync
 // will_be_changed  when true, the host copy is flagged as out-dated afterwards
 template <typename host_type, typename dev_type, copy_mode mode>
 void cCudaData<host_type, dev_type, mode>
 ::uploadAsync(int stream,bool will_be_changed)
 {
 	// the "skip when current_data_on_device" shortcut is disabled, so the
 	// data is always sent
 	bool sent_via_buffer = false;

 	if(buffer && is_continues)
 	{
 		printf("Actual Buffer: %p %i\n",*buffer,*buf_size);

 		const bool host_dbl = (typeid(host_type)==typeid(double));
 		const bool host_flt = (typeid(host_type)==typeid(float));
 		const bool host_int = (typeid(host_type)==typeid(int));
 		const bool dev_dbl  = (typeid(dev_type)==typeid(double));
 		const bool dev_flt  = (typeid(dev_type)==typeid(float));
 		const bool dev_int  = (typeid(dev_type)==typeid(int));

 		if(host_dbl && dev_dbl)
 		{
 			CudaData_Upload_DoubleDouble((void*) host_data,dev_data_array->dev_data,
 			                             dev_data_array->dim,mode,*buffer);
 			sent_via_buffer = true;
 		}
 		else if(host_dbl && dev_flt)
 		{
 			CudaData_Upload_DoubleFloat((void*) host_data,dev_data_array->dev_data,
 			                            dev_data_array->dim,mode,*buffer);
 			sent_via_buffer = true;
 		}
 		else if(host_flt && dev_dbl)
 		{
 			CudaData_Upload_FloatDouble((void*) host_data,dev_data_array->dev_data,
 			                            dev_data_array->dim,mode,*buffer);
 			sent_via_buffer = true;
 		}
 		else if(host_flt && dev_flt)
 		{
 			CudaData_Upload_FloatFloat((void*) host_data,dev_data_array->dev_data,
 			                           dev_data_array->dim,mode,*buffer);
 			sent_via_buffer = true;
 		}
 		else if(host_int && dev_int)
 		{
 			CudaData_Upload_IntInt((void*) host_data,dev_data_array->dev_data,
 			                       dev_data_array->dim,mode,*buffer);
 			sent_via_buffer = true;
 		}
 		// any other type combination falls back to the CPU path below
 	}

 	if(!sent_via_buffer)
 	{
 		switch(mode)
 		{
 			case x:
 			case xx:
 			{
 				// identical types: the host array can be sent as it is
 				if(typeid(host_type) == typeid(dev_type))
 				{
 					CudaWrapper_UploadCudaDataAsync(host_data, dev_data_array->dev_data, nbytes,stream);
 					break;
 				}
 				const unsigned count = dev_data_array->dim[0];
 				timespec t_begin,t_end;
 				clock_gettime(CLOCK_REALTIME,&t_begin);
 				for(unsigned idx=0; idx<count; ++idx)
 					temp_data[idx] = static_cast<dev_type>(host_data[idx]);
 				clock_gettime(CLOCK_REALTIME,&t_end);
 				CudaWrapper_AddCPUBufUploadTime(
 					t_end.tv_sec-t_begin.tv_sec+1.0*(t_end.tv_nsec-t_begin.tv_nsec)/1000000000);
 				CudaWrapper_UploadCudaDataAsync(temp_data, dev_data_array->dev_data, nbytes,stream);
 				break;
 			}
 			case xy:
 			{
 				// host [i][j] -> temp_data laid out with j running fastest
 				const unsigned n0 = dev_data_array->dim[0];
 				const unsigned n1 = dev_data_array->dim[1];
 				host_type** const src = reinterpret_cast<host_type**>(host_data);
 				timespec t_begin,t_end;
 				clock_gettime(CLOCK_REALTIME,&t_begin);
 				for(unsigned i=0; i<n0; ++i)
 				{
 					dev_type* const row = temp_data + i*n1;
 					for(unsigned j=0; j<n1; ++j)
 						row[j] = static_cast<dev_type>(src[i][j]);
 				}
 				clock_gettime(CLOCK_REALTIME,&t_end);
 				CudaWrapper_AddCPUBufUploadTime(
 					t_end.tv_sec-t_begin.tv_sec+1.0*(t_end.tv_nsec-t_begin.tv_nsec)/1000000000);
 				CudaWrapper_UploadCudaDataAsync(temp_data, dev_data_array->dev_data, nbytes,stream);
 				break;
 			}
 			case yx:
 			{
 				// host [i][j] -> temp_data laid out with i running fastest
 				const unsigned n0 = dev_data_array->dim[0];
 				const unsigned n1 = dev_data_array->dim[1];
 				host_type** const src = reinterpret_cast<host_type**>(host_data);
 				timespec t_begin,t_end;
 				clock_gettime(CLOCK_REALTIME,&t_begin);
 				for(unsigned j=0; j<n1; ++j)
 				{
 					dev_type* const col = temp_data + j*n0;
 					for(unsigned i=0; i<n0; ++i)
 						col[i] = static_cast<dev_type>(src[i][j]);
 				}
 				clock_gettime(CLOCK_REALTIME,&t_end);
 				CudaWrapper_AddCPUBufUploadTime(
 					t_end.tv_sec-t_begin.tv_sec+1.0*(t_end.tv_nsec-t_begin.tv_nsec)/1000000000);
 				CudaWrapper_UploadCudaDataAsync(temp_data, dev_data_array->dev_data, nbytes,stream);
 				break;
 			}
 			case xyz:
 			{
 				// host [i][j][k] -> temp_data laid out with k running fastest
 				const unsigned n0 = dev_data_array->dim[0];
 				const unsigned n1 = dev_data_array->dim[1];
 				const unsigned n2 = dev_data_array->dim[2];
 				host_type*** const src = reinterpret_cast<host_type***>(host_data);
 				timespec t_begin,t_end;
 				clock_gettime(CLOCK_REALTIME,&t_begin);
 				for(unsigned i=0; i<n0; ++i)
 				{
 					for(unsigned j=0; j<n1; ++j)
 					{
 						dev_type* const line = temp_data + (i*n1+j)*n2;
 						for(unsigned k=0; k<n2; ++k)
 							line[k] = static_cast<dev_type>(src[i][j][k]);
 					}
 				}
 				clock_gettime(CLOCK_REALTIME,&t_end);
 				CudaWrapper_AddCPUBufUploadTime(
 					t_end.tv_sec-t_begin.tv_sec+1.0*(t_end.tv_nsec-t_begin.tv_nsec)/1000000000);
 				CudaWrapper_UploadCudaDataAsync(temp_data, dev_data_array->dev_data, nbytes,stream);
 				break;
 			}
 			case xzy:
 			{
 				// host [i][j][k] -> temp_data laid out with j running fastest
 				const unsigned n0 = dev_data_array->dim[0];
 				const unsigned n1 = dev_data_array->dim[1];
 				const unsigned n2 = dev_data_array->dim[2];
 				host_type*** const src = reinterpret_cast<host_type***>(host_data);
 				timespec t_begin,t_end;
 				clock_gettime(CLOCK_REALTIME,&t_begin);
 				for(unsigned i=0; i<n0; ++i)
 				{
 					for(unsigned k=0; k<n2; ++k)
 					{
 						dev_type* const line = temp_data + (i*n2+k)*n1;
 						for(unsigned j=0; j<n1; ++j)
 							line[j] = static_cast<dev_type>(src[i][j][k]);
 					}
 				}
 				clock_gettime(CLOCK_REALTIME,&t_end);
 				CudaWrapper_AddCPUBufUploadTime(
 					t_end.tv_sec-t_begin.tv_sec+1.0*(t_end.tv_nsec-t_begin.tv_nsec)/1000000000);
 				CudaWrapper_UploadCudaDataAsync(temp_data, dev_data_array->dev_data, nbytes,stream);
 				break;
 			}
 		}
 	}

 	// the device copy is now current ...
 	current_data_on_device = true;
 	// ... and if the device is going to modify it, the host copy becomes stale
 	if(will_be_changed) current_data_on_host = false;
 }
 // Synchronous device -> host copy.
 //
 // For modes x/xx with identical host and device types the data is copied
 // straight into host_data. In every other case the device data is first
 // copied into temp_data and then converted/re-ordered into the host array
 // according to `mode`; the time spent in that CPU loop is reported through
 // CudaWrapper_AddCPUBufDownloadTime.
 //
 // will_be_changed  when true, the device copy is flagged as out-dated afterwards
 template <typename host_type, typename dev_type, copy_mode mode>
 void cCudaData<host_type, dev_type, mode>
 ::download(bool will_be_changed)
 {
 	// the "skip when current_data_on_host" shortcut is disabled, so the
 	// data is always fetched
 	switch(mode)
 	{
 		case x:
 		case xx:
 		{
 			if(typeid(host_type) == typeid(dev_type))
 			{
 				CudaWrapper_DownloadCudaData(host_data, dev_data_array->dev_data, nbytes);
 				break;
 			}
 			CudaWrapper_DownloadCudaData(temp_data, dev_data_array->dev_data, nbytes);
 			const unsigned count = dev_data_array->dim[0];
 			timespec t_begin,t_end;
 			clock_gettime(CLOCK_REALTIME,&t_begin);
 			for(unsigned idx=0; idx<count; ++idx)
 				host_data[idx] = static_cast<host_type>(temp_data[idx]);
 			clock_gettime(CLOCK_REALTIME,&t_end);
 			CudaWrapper_AddCPUBufDownloadTime(
 				t_end.tv_sec-t_begin.tv_sec+1.0*(t_end.tv_nsec-t_begin.tv_nsec)/1000000000);
 			break;
 		}
 		case xy:
 		{
 			// temp_data has j running fastest -> host [i][j]
 			CudaWrapper_DownloadCudaData(temp_data, dev_data_array->dev_data, nbytes);
 			const unsigned n0 = dev_data_array->dim[0];
 			const unsigned n1 = dev_data_array->dim[1];
 			host_type** const dst = reinterpret_cast<host_type**>(host_data);
 			timespec t_begin,t_end;
 			clock_gettime(CLOCK_REALTIME,&t_begin);
 			for(unsigned i=0; i<n0; ++i)
 			{
 				dev_type* const row = temp_data + i*n1;
 				for(unsigned j=0; j<n1; ++j)
 					dst[i][j] = static_cast<host_type>(row[j]);
 			}
 			clock_gettime(CLOCK_REALTIME,&t_end);
 			CudaWrapper_AddCPUBufDownloadTime(
 				t_end.tv_sec-t_begin.tv_sec+1.0*(t_end.tv_nsec-t_begin.tv_nsec)/1000000000);
 			break;
 		}
 		case yx:
 		{
 			// temp_data has i running fastest -> host [i][j]
 			CudaWrapper_DownloadCudaData(temp_data, dev_data_array->dev_data, nbytes);
 			const unsigned n0 = dev_data_array->dim[0];
 			const unsigned n1 = dev_data_array->dim[1];
 			host_type** const dst = reinterpret_cast<host_type**>(host_data);
 			timespec t_begin,t_end;
 			clock_gettime(CLOCK_REALTIME,&t_begin);
 			for(unsigned j=0; j<n1; ++j)
 			{
 				dev_type* const col = temp_data + j*n0;
 				for(unsigned i=0; i<n0; ++i)
 					dst[i][j] = static_cast<host_type>(col[i]);
 			}
 			clock_gettime(CLOCK_REALTIME,&t_end);
 			CudaWrapper_AddCPUBufDownloadTime(
 				t_end.tv_sec-t_begin.tv_sec+1.0*(t_end.tv_nsec-t_begin.tv_nsec)/1000000000);
 			break;
 		}
 		case xyz:
 		{
 			// temp_data has k running fastest -> host [i][j][k]
 			CudaWrapper_DownloadCudaData(temp_data, dev_data_array->dev_data, nbytes);
 			const unsigned n0 = dev_data_array->dim[0];
 			const unsigned n1 = dev_data_array->dim[1];
 			const unsigned n2 = dev_data_array->dim[2];
 			host_type*** const dst = reinterpret_cast<host_type***>(host_data);
 			timespec t_begin,t_end;
 			clock_gettime(CLOCK_REALTIME,&t_begin);
 			for(unsigned i=0; i<n0; ++i)
 			{
 				for(unsigned j=0; j<n1; ++j)
 				{
 					dev_type* const line = temp_data + (i*n1+j)*n2;
 					for(unsigned k=0; k<n2; ++k)
 						dst[i][j][k] = static_cast<host_type>(line[k]);
 				}
 			}
 			clock_gettime(CLOCK_REALTIME,&t_end);
 			CudaWrapper_AddCPUBufDownloadTime(
 				t_end.tv_sec-t_begin.tv_sec+1.0*(t_end.tv_nsec-t_begin.tv_nsec)/1000000000);
 			break;
 		}
 		case xzy:
 		{
 			// temp_data has j running fastest -> host [i][j][k]
 			CudaWrapper_DownloadCudaData(temp_data, dev_data_array->dev_data, nbytes);
 			const unsigned n0 = dev_data_array->dim[0];
 			const unsigned n1 = dev_data_array->dim[1];
 			const unsigned n2 = dev_data_array->dim[2];
 			host_type*** const dst = reinterpret_cast<host_type***>(host_data);
 			timespec t_begin,t_end;
 			clock_gettime(CLOCK_REALTIME,&t_begin);
 			for(unsigned i=0; i<n0; ++i)
 			{
 				for(unsigned k=0; k<n2; ++k)
 				{
 					dev_type* const line = temp_data + (i*n2+k)*n1;
 					for(unsigned j=0; j<n1; ++j)
 						dst[i][j][k] = static_cast<host_type>(line[j]);
 				}
 			}
 			clock_gettime(CLOCK_REALTIME,&t_end);
 			CudaWrapper_AddCPUBufDownloadTime(
 				t_end.tv_sec-t_begin.tv_sec+1.0*(t_end.tv_nsec-t_begin.tv_nsec)/1000000000);
 			break;
 		}
 	}

 	// the host copy is now current ...
 	current_data_on_host = true;
 	// ... and if the host is going to modify it, the device copy becomes stale
 	if(will_be_changed) current_data_on_device = false;
 }
 // Device -> host copy issued through the asynchronous download wrapper.
 //
 // The copy is queued on `stream` and the stream is synchronized immediately
 // afterwards (CudaWrapper_SyncStream), so the host data is complete when this
 // function returns. For modes x/xx with identical host and device types the
 // data lands directly in host_data; otherwise it is fetched into temp_data
 // and converted/re-ordered into the host array according to `mode`.
 //
 // The three-dimensional modes (xyz, xzy) are handled the same way download()
 // handles them; previously they were silently ignored here.
 //
 // Unlike download(), this function does not touch current_data_on_host /
 // current_data_on_device.
 //
 // stream  forwarded to CudaWrapper_DownloadCudaDataAsync / CudaWrapper_SyncStream
 template <typename host_type, typename dev_type, copy_mode mode>
 void cCudaData<host_type, dev_type, mode>
 ::downloadAsync(int stream)
 {
 	switch(mode)
 	{
 		case x:
 		case xx:
 		{
 			if(typeid(host_type) == typeid(dev_type))
 			{
 				// identical types: copy straight into the host array
 				CudaWrapper_DownloadCudaDataAsync(host_data, dev_data_array->dev_data, nbytes, stream);
 				CudaWrapper_SyncStream(stream);
 			}
 			else
 			{
 				CudaWrapper_DownloadCudaDataAsync(temp_data, dev_data_array->dev_data, nbytes, stream);
 				// temp_data must be complete before it is converted on the CPU
 				CudaWrapper_SyncStream(stream);
 				for(unsigned i=0; i<dev_data_array->dim[0]; ++i) host_data[i] = static_cast<host_type>(temp_data[i]);
 			}
 			break;
 		}
 		case xy:
 		{
 			CudaWrapper_DownloadCudaDataAsync(temp_data, dev_data_array->dev_data, nbytes, stream);
 			CudaWrapper_SyncStream(stream);
 			for(unsigned i=0; i<dev_data_array->dim[0]; ++i)
 			{
 				dev_type* temp = &temp_data[i * dev_data_array->dim[1]];
 				for(unsigned j=0; j<dev_data_array->dim[1]; ++j)
 				{
 					reinterpret_cast<host_type**>(host_data)[i][j] = static_cast<host_type>(temp[j]);
 				}
 			}
 			break;
 		}
 		case yx:
 		{
 			CudaWrapper_DownloadCudaDataAsync(temp_data, dev_data_array->dev_data, nbytes, stream);
 			CudaWrapper_SyncStream(stream);
 			for(unsigned j=0; j<dev_data_array->dim[1]; ++j)
 			{
 				dev_type* temp = &temp_data[j*dev_data_array->dim[0]];
 				for(unsigned i=0; i<dev_data_array->dim[0]; ++i)
 				{
 					reinterpret_cast<host_type**>(host_data)[i][j] = static_cast<host_type>(temp[i]);
 				}
 			}
 			break;
 		}
 		case xyz:
 		{
 			CudaWrapper_DownloadCudaDataAsync(temp_data, dev_data_array->dev_data, nbytes, stream);
 			CudaWrapper_SyncStream(stream);
 			for(unsigned i=0; i<dev_data_array->dim[0]; ++i)
 			for(unsigned j=0; j<dev_data_array->dim[1]; ++j)
 			{
 				dev_type* temp = &temp_data[(i * dev_data_array->dim[1]+j)*dev_data_array->dim[2]];
 				for(unsigned k=0; k<dev_data_array->dim[2]; ++k)
 				{
 					reinterpret_cast<host_type***>(host_data)[i][j][k] = static_cast<host_type>(temp[k]);
 				}
 			}
 			break;
 		}
 		case xzy:
 		{
 			CudaWrapper_DownloadCudaDataAsync(temp_data, dev_data_array->dev_data, nbytes, stream);
 			CudaWrapper_SyncStream(stream);
 			for(unsigned i=0; i<dev_data_array->dim[0]; ++i)
 			for(unsigned k=0; k<dev_data_array->dim[2]; ++k)
 			{
 				dev_type* temp = &temp_data[(i * dev_data_array->dim[2]+k)*dev_data_array->dim[1]];
 				for(unsigned j=0; j<dev_data_array->dim[1]; ++j)
 				{
 					reinterpret_cast<host_type***>(host_data)[i][j][k] = static_cast<host_type>(temp[j]);
 				}
 			}
 			break;
 		}
 	}
 }
 // Fills the whole device allocation (nbytes bytes starting at
 // dev_data_array->dev_data) via CudaWrapper_Memset with `value`.
 template <typename host_type, typename dev_type, copy_mode mode>
 void cCudaData<host_type, dev_type, mode>
 ::memset_device(int value)
 {
 	void* const device_ptr = dev_data_array->dev_data;
 	CudaWrapper_Memset(device_ptr, value, nbytes);
 }
 // Attaches an externally owned staging buffer to this array.
 //
 // abuffer        address of the buffer pointer (may be NULL to detach)
 // abuf_size      address of the buffer's current size in bytes
 // ais_continues  stored in is_continues; together with a non-NULL buffer it
 //                enables the buffered path of the upload functions
 //
 // Unless the array is one-dimensional (mode x/xx) with identical host and
 // device types, the buffer is grown here so that it can hold the array in
 // host precision: a too-small buffer is released and a new one of the
 // required size is allocated, with *abuffer / *abuf_size updated in place.
 template <typename host_type, typename dev_type, copy_mode mode>
 void cCudaData<host_type, dev_type, mode>
 ::set_buffer(void** abuffer,int* abuf_size,bool ais_continues)
 {
 	buffer = abuffer;
 	buf_size = abuf_size;

 	// bytes needed to stage every element in host precision
 	unsigned required_bytes=(nbytes/sizeof(dev_type))*sizeof(host_type);

 	const bool plain_copy = (typeid(host_type) == typeid(dev_type)) && (mode == x || mode == xx);

 	if(buffer!=NULL && !plain_copy)
 	{
 		printf("Allocate Buffer: %p %i\n",*buffer,*buf_size);

 		const bool too_small = (*buf_size<required_bytes);
 		if(too_small)
 		{
 			// drop the old allocation (if any) before replacing it
 			if((*buffer)!=NULL) CudaWrapper_FreeCudaData(*buffer,*buf_size);
 			*buffer=CudaWrapper_AllocCudaData(required_bytes);
 			*buf_size=required_bytes;
 		}

 		printf("Allocate Buffer2: %p %i\n",*buffer,*buf_size);
 	}

 	is_continues=ais_continues;
 }
 #endif // _CUDA_DATA_H_
--- a/src/USER-CUDA/cuda_modify_flags.h
+++ b/src/USER-CUDA/cuda_modify_flags.h
@ -0,0 +1,39 @@
 /* ----------------------------------------------------------------------
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
   Original Version:
   http://lammps.sandia.gov, Sandia National Laboratories
   Steve Plimpton, sjplimp@sandia.gov 
   See the README file in the top-level LAMMPS directory. 
   ----------------------------------------------------------------------- 
   USER-CUDA Package and associated modifications:
   https://sourceforge.net/projects/lammpscuda/ 
   Christian Trott, christian.trott@tu-ilmenau.de
   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
   Theoretical Physics II, University of Technology Ilmenau, Germany 
   See the README file in the USER-CUDA directory. 
   This software is distributed under the GNU General Public License.
 ------------------------------------------------------------------------- */
 #ifndef CUDA_MODIFY_FLAGS_H
 #define CUDA_MODIFY_FLAGS_H
 // Single-bit mask constants, one per *_CUDA name below, occupying bits 16..25.
 // NOTE(review): presumably these are OR-ed into the same mask as the non-CUDA
 // modify flags, which would explain why numbering starts at bit 16 - confirm.
 #define INITIAL_INTEGRATE_CUDA  (1 << 16)
 #define POST_INTEGRATE_CUDA     (1 << 17)
 #define PRE_EXCHANGE_CUDA       (1 << 18)
 #define PRE_NEIGHBOR_CUDA       (1 << 19)
 #define PRE_FORCE_CUDA          (1 << 20)
 #define POST_FORCE_CUDA         (1 << 21)
 #define FINAL_INTEGRATE_CUDA    (1 << 22)
 #define END_OF_STEP_CUDA        (1 << 23)
 #define THERMO_ENERGY_CUDA      (1 << 24)
 #define MIN_POST_FORCE_CUDA      (1 << 25)
 // when adding flags, keep the shift count at 30 or below: the literal 1 is a
 // signed int, so (1 << 31) does not fit where int is 32 bits wide
 #endif // CUDA_MODIFY_FLAGS_H
--- a/src/USER-CUDA/cuda_neigh_list.cpp
+++ b/src/USER-CUDA/cuda_neigh_list.cpp
@ -0,0 +1,180 @@
 /* ----------------------------------------------------------------------
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
   Original Version:
   http://lammps.sandia.gov, Sandia National Laboratories
   Steve Plimpton, sjplimp@sandia.gov 
   See the README file in the top-level LAMMPS directory. 
   ----------------------------------------------------------------------- 
   USER-CUDA Package and associated modifications:
   https://sourceforge.net/projects/lammpscuda/ 
   Christian Trott, christian.trott@tu-ilmenau.de
   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
   Theoretical Physics II, University of Technology Ilmenau, Germany 
   See the README file in the USER-CUDA directory. 
   This software is distributed under the GNU General Public License.
 ------------------------------------------------------------------------- */
 #include "cuda_neigh_list.h"
 #include "neigh_list.h"
 #include <cstring>
 #include <vector>
 #include <map>
 #include <algorithm>
 #include "cuda.h"
 #include "atom.h"
 using namespace LAMMPS_NS;
 // Creates the CUDA companion object of a host neighbor list.
 //
 // lmp         LAMMPS instance; its `cuda` member is cached in this object
 // neigh_list  host neighbor list; its cuda_list member is pointed back at
 //             this object
 //
 // No list arrays are allocated here (see dev_alloc()); every pointer member
 // is set to NULL and every counter to zero so that dev_free() and the
 // destructor are safe on a freshly constructed object. Only the three-entry
 // sneighlist.bin_dim array is allocated.
 CudaNeighList::CudaNeighList(LAMMPS *lmp, class NeighList* neigh_list) : Pointers(lmp)
 {
 	cuda = lmp->cuda;
 	MYDBG(printf("# CUDA: CudaNeighList::cudaNeighList() ... start\n");)

 	// link host list and CUDA list to each other
 	this->neigh_list = neigh_list;
 	neigh_list->cuda_list=this;

 	// initial capacities
 	sneighlist.maxlocal = neigh_list->get_maxlocal();
 	sneighlist.maxneighbors = 32;
 	sneighlist.maxcut = 0.0;
 	sneighlist.cutneighsq = NULL;

 	// device wrappers: nothing allocated yet
 	cu_neighbors = NULL;
 	cu_neighbors_border = NULL;
 	cu_neighbors_inner = NULL;
 	cu_numneigh_border = NULL;
 	cu_numneigh_inner = NULL;
 	cu_numneigh = NULL;
 	cu_ilist = NULL;
 	cu_ilist_border = NULL;
 	cu_inum_border = NULL;
 	cu_ex_type = NULL;
 	cu_ex1_bit = NULL;
 	cu_ex2_bit = NULL;
 	cu_ex_mol_bit = NULL;

 	// host arrays: nothing allocated yet
 	inum_border = 0;
 	neighbors = NULL;
 	neighbors_inner = NULL;
 	neighbors_border = NULL;
 	numneigh_border = NULL;
 	numneigh_inner = NULL;
 	ilist_border = NULL;
 	build_cuda = false;

 	// binning data
 	sneighlist.binned_id=NULL;
 	sneighlist.bin_dim=new int[3];
 	sneighlist.bin_dim[0]=0;
 	sneighlist.bin_dim[1]=0;
 	sneighlist.bin_dim[2]=0;
 	sneighlist.bin_nmax=0;
 	sneighlist.bin_extraspace=0.05;

 	// exclusion counters, both the copies in sneighlist and the public
 	// members of this class (the latter were previously left uninitialized)
 	sneighlist.nex_type=0;
 	sneighlist.nex_group=0;
 	sneighlist.nex_mol=0;
 	nex_type = 0;
 	nex_group = 0;
 	nex_mol = 0;

 	MYDBG(printf("# CUDA: CudaNeighList::cudaNeighList() ... end\n");)
 }
 // Releases everything this object allocated: the list arrays handled by
 // dev_free() and the bin_dim array created with new[] in the constructor
 // (which was previously never freed).
 CudaNeighList::~CudaNeighList()
 {
 	dev_free();
 	delete [] sneighlist.bin_dim;
 	sneighlist.bin_dim = NULL;
 }
 void CudaNeighList::dev_alloc()
 { 
 	MYDBG( printf("# CUDA: CudaNeighList::dev_alloc() ... start\n"); )
 	cu_ilist         = new cCudaData<int , int , x> (neigh_list->ilist   , & sneighlist.ilist     , sneighlist.maxlocal );
 	cu_numneigh      = new cCudaData<int , int , x> (neigh_list->numneigh, & sneighlist.numneigh  , sneighlist.maxlocal );
 	neighbors = new int[atom->nmax*sneighlist.maxneighbors];
 	cu_neighbors= new cCudaData<int, int, x> (neighbors			 		 , & sneighlist.neighbors, atom->nmax*sneighlist.maxneighbors );
 	if(cuda->shared_data.overlap_comm)
 	{
 	ilist_border  = new int[sneighlist.maxlocal];
 	numneigh_border	= new int[sneighlist.maxlocal];
 	numneigh_inner	= new int[sneighlist.maxlocal];
 	cu_inum_border  = new cCudaData<int , int , x> (&inum_border		 , & sneighlist.inum_border      , 1 );
 	cu_ilist_border  = new cCudaData<int , int , x> (ilist_border		 , & sneighlist.ilist_border     , sneighlist.maxlocal );
 	cu_numneigh_border	= new cCudaData<int , int , x> (numneigh_border  , & sneighlist.numneigh_border  , sneighlist.maxlocal );
 	cu_numneigh_inner 	= new cCudaData<int , int , x> (numneigh_inner   , & sneighlist.numneigh_inner   , sneighlist.maxlocal );
 	neighbors_border = new int[sneighlist.maxlocal*sneighlist.maxneighbors];
 	cu_neighbors_border= new cCudaData<int, int, x> (neighbors_border	 , & sneighlist.neighbors_border, sneighlist.maxlocal*sneighlist.maxneighbors );
 	neighbors_inner = new int[sneighlist.maxlocal*sneighlist.maxneighbors];
 	cu_neighbors_inner = new cCudaData<int, int, x> (neighbors_inner	 , & sneighlist.neighbors_inner , sneighlist.maxlocal*sneighlist.maxneighbors );
 	}
 	MYDBG( printf("# CUDA: CudaNeighList::dev_alloc() ... end\n"); )
 }
 void CudaNeighList::dev_free()
 {
 	MYDBG( printf("# CUDA: CudaNeighList::dev_free() ... start\n"); )
 	delete cu_numneigh;
 	delete cu_ilist;
 	delete [] neighbors;
 	delete cu_neighbors;
 	if(cuda->shared_data.overlap_comm)
 	{
 	delete [] ilist_border;
 	delete [] numneigh_border;
 	delete [] numneigh_inner;
 	delete [] neighbors_border;
 	delete [] neighbors_inner;
 	delete cu_inum_border;
 	delete cu_neighbors_border;
 	delete cu_neighbors_inner;
 	delete cu_numneigh_border;
 	delete cu_numneigh_inner;
 	delete cu_ilist_border;
 	}
 	MYDBG( printf("# CUDA: CudaNeighList::dev_free() ... end\n"); )
 }
 void CudaNeighList::grow_device()
 {
 	MYDBG(printf("# CUDA: CudaNeighList::grow_device() ... start\n");)
 	// if host has allocated more memory for atom arrays than device has, then allocate more memory on device
 	int new_maxlocal = neigh_list->get_maxlocal();
 	if(sneighlist.maxlocal < new_maxlocal)
 	{
 		sneighlist.maxlocal = new_maxlocal;
 		dev_free();
 		dev_alloc();
 	}
 	if(!cu_ilist || !cu_numneigh) dev_alloc();
 	// check, if hosts data has been allocated somewhere else
 	if(cu_ilist   ->get_host_data() != neigh_list->ilist)    cu_ilist   ->set_host_data(neigh_list->ilist);
 	if(cu_numneigh->get_host_data() != neigh_list->numneigh) cu_numneigh->set_host_data(neigh_list->numneigh);
 	MYDBG(printf("# CUDA: CudaNeighList::grow_device() ... end\n");)
 }
 // Uploads the ilist and numneigh arrays to the device, if their wrappers
 // have been created.
 //
 // will_be_changed  forwarded to cCudaData::upload(); previously the argument
 //                  was accepted but ignored, so the callee's default was
 //                  always used
 void CudaNeighList::nl_upload(bool will_be_changed)
 {
 	MYDBG(printf("# CUDA: CudaNeighList::nl_upload() ... start\n");)
 	if(cu_ilist)
 		cu_ilist->upload(will_be_changed);
 	if(cu_numneigh)
 		cu_numneigh->upload(will_be_changed);
 	MYDBG(printf("# CUDA: CudaNeighList::nl_upload() ... end\n");)
 }
 // Downloads the ilist and numneigh arrays from the device, if their wrappers
 // have been created.
 //
 // will_be_changed  forwarded to cCudaData::download(); previously the
 //                  argument was accepted but ignored, so the callee's default
 //                  was always used
 void CudaNeighList::nl_download(bool will_be_changed)
 {
 	MYDBG(printf("# CUDA: CudaNeighList::nl_download() ... start\n");)
 	if(cu_ilist)
 		cu_ilist->download(will_be_changed);
 	if(cu_numneigh)
 		cu_numneigh->download(will_be_changed);
 	MYDBG(printf("# CUDA: CudaNeighList::nl_download() ... end\n");)
 }
--- a/src/USER-CUDA/cuda_neigh_list.h
+++ b/src/USER-CUDA/cuda_neigh_list.h
@ -0,0 +1,83 @@
 /* ----------------------------------------------------------------------
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
   Original Version:
   http://lammps.sandia.gov, Sandia National Laboratories
   Steve Plimpton, sjplimp@sandia.gov 
   See the README file in the top-level LAMMPS directory. 
   ----------------------------------------------------------------------- 
   USER-CUDA Package and associated modifications:
   https://sourceforge.net/projects/lammpscuda/ 
   Christian Trott, christian.trott@tu-ilmenau.de
   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
   Theoretical Physics II, University of Technology Ilmenau, Germany 
   See the README file in the USER-CUDA directory. 
   This software is distributed under the GNU General Public License.
 ------------------------------------------------------------------------- */
 #ifndef LMP_NEIGH_LIST_CUDA_H
 #define LMP_NEIGH_LIST_CUDA_H
 #include "pointers.h"
 #include "cuda_data.h"
 #include "neigh_list.h"
 namespace LAMMPS_NS
 {
 // CUDA companion of a host NeighList: holds the host arrays and the cCudaData
 // wrappers through which a neighbor list is mirrored on the device.
 // All data members are public.
 class CudaNeighList : protected Pointers
 {
 	public:
 		// wrappers around neigh_list->ilist / neigh_list->numneigh
 		cCudaData<int , int , x>*  cu_ilist;
 		cCudaData<int , int , x>*  cu_numneigh;
 		// border/inner wrappers; dev_alloc() creates them only when
 		// cuda->shared_data.overlap_comm is set
 		cCudaData<int , int , x>*  cu_inum_border;
 		cCudaData<int , int , x>*  cu_ilist_border;
 		cCudaData<int , int , x>*  cu_numneigh_border;
 		cCudaData<int , int , x>*  cu_numneigh_inner;
 		// wrapper around the `neighbors` array below
 		cCudaData<int , int , x>*  cu_neighbors;
 		cCudaData<int , int , x>*  cu_neighbors_border;
 		cCudaData<int , int , x>*  cu_neighbors_inner;
 		// exclusion wrappers; set to NULL by the constructor and neither
 		// allocated nor freed by this class's own methods
 		cCudaData<int , int , x>*  cu_ex_type;
 		cCudaData<int , int , x>*  cu_ex1_bit;
 		cCudaData<int , int , x>*  cu_ex2_bit;
 		cCudaData<int , int , x>*  cu_ex_mol_bit;
 		// capacities, settings and per-array device descriptors of this list
 		cuda_shared_neighlist sneighlist;
 		// host arrays allocated by dev_alloc() and released by dev_free()
 		int* neighbors;
 		int* neighbors_inner;
 		int* neighbors_border;
 		int inum_border;
 		int* ilist_border;
 		int* numneigh_border;
 		int* numneigh_inner;
 		// NOTE(review): these look like host-side counterparts of
 		// sneighlist.nex_type/nex_group/nex_mol - confirm who maintains them
 		int nex_type;
 		int nex_group;
 		int nex_mol;
 		bool build_cuda;
 		// links this object and neigh_list to each other; allocates no list arrays
 		CudaNeighList(class LAMMPS *, class NeighList* neigh_list);
 		~CudaNeighList();
 		// if the host list's maxlocal has grown, releases and re-creates all arrays
 		// (dev_free + dev_alloc); also re-points the ilist/numneigh wrappers when
 		// the host arrays have moved
 		void grow_device();
 		// upload / download cu_ilist and cu_numneigh when they exist
 		void nl_upload(bool will_be_changed=true);
 		void nl_download(bool will_be_changed=true);
 		// the host neighbor list this object mirrors
 		NeighList* neigh_list;
 		// create / release the host arrays and their wrappers
 		void dev_alloc();
 		void dev_free();
 private:
  // taken from lmp->cuda in the constructor
  class Cuda *cuda;
 };
 }
 #endif
--- a/src/USER-CUDA/cuda_precision.h
+++ b/src/USER-CUDA/cuda_precision.h
@ -0,0 +1,269 @@
 /* ----------------------------------------------------------------------
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
   Original Version:
   http://lammps.sandia.gov, Sandia National Laboratories
   Steve Plimpton, sjplimp@sandia.gov 
   See the README file in the top-level LAMMPS directory. 
   ----------------------------------------------------------------------- 
   USER-CUDA Package and associated modifications:
   https://sourceforge.net/projects/lammpscuda/ 
   Christian Trott, christian.trott@tu-ilmenau.de
   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
   Theoretical Physics II, University of Technology Ilmenau, Germany 
   See the README file in the USER-CUDA directory. 
   This software is distributed under the GNU General Public License.
 ------------------------------------------------------------------------- */
 #ifndef CUDA_PRECISION_H_
 #define CUDA_PRECISION_H_
 /* This file provides the type definitions for mixed-precision calculation in the CUDA part of LAMMPS-CUDA.
 * The default behaviour is set by the global CUDA_PRECISION (1 = float, 2 = double), which can be overridden at compile time.
 * ***_FLOAT: type definition of the given property
 * ***_F(x): literal helper for that type (1.0 is interpreted as double while 1.0f is interpreted as float; write e.g. CUDA_F(1.0))
 */
 // CUDA_IF_BINNING(a) expands to `a` when CUDA_USE_BINNING is defined and to
 // nothing otherwise.
 #ifndef CUDA_USE_BINNING
 #define CUDA_IF_BINNING(a)
 #else
 #define CUDA_IF_BINNING(a) a
 #endif
 //GLOBAL
 // Default floating-point type of the CUDA code.
 //   CUDA_PRECISION == 1 : CUDA_FLOAT is float,  CUDA_F(x) appends the f suffix
 //   CUDA_PRECISION == 2 : CUDA_FLOAT is double, CUDA_F(x) is x
 //   not defined         : same as 2
 // Any other value is rejected here; before, it silently left CUDA_FLOAT and
 // CUDA_F undefined and produced errors far away from the cause.
 #ifdef CUDA_PRECISION
  #if CUDA_PRECISION == 1
    #define CUDA_FLOAT float
    #define CUDA_F(x) x##f
  #elif CUDA_PRECISION == 2
    #define CUDA_FLOAT double
    #define CUDA_F(x) x
  #else
    #error "CUDA_PRECISION must be 1 (float) or 2 (double)"
  #endif
 #else
  #define CUDA_FLOAT double
  #define CUDA_F(x) x
  #define CUDA_PRECISION 2
 #endif
 //--------------------------------
 //-----------FFT-----------------
 //--------------------------------
 // FFT_PRECISION_CU selects FFT_FLOAT / FFT_F: 1 = float, 2 = double.
 // When it is not defined, the global CUDA_* settings are used.
 // Any other value is rejected instead of leaving the macros undefined.
 #ifdef FFT_PRECISION_CU
  #if FFT_PRECISION_CU == 1
    #define FFT_FLOAT float
    #define FFT_F(x) x##f
  #elif FFT_PRECISION_CU == 2
    #define FFT_FLOAT double
    #define FFT_F(x) x
  #else
    #error "FFT_PRECISION_CU must be 1 (float) or 2 (double)"
  #endif
 #else
  #define FFT_FLOAT CUDA_FLOAT
  #define FFT_F(x) CUDA_F(x)
  #define FFT_PRECISION_CU CUDA_PRECISION
 #endif
 //--------------------------------
 //-----------PPPM-----------------
 //--------------------------------
 // PPPM_PRECISION selects PPPM_FLOAT / PPPM_F: 1 = float, 2 = double.
 // When it is not defined, the global CUDA_* settings are used.
 // Any other value is rejected instead of leaving the macros undefined.
 #ifdef PPPM_PRECISION
  #if PPPM_PRECISION == 1
    #define PPPM_FLOAT float
    #define PPPM_F(x) x##f
  #elif PPPM_PRECISION == 2
    #define PPPM_FLOAT double
    #define PPPM_F(x) x
  #else
    #error "PPPM_PRECISION must be 1 (float) or 2 (double)"
  #endif
 #else
  #define PPPM_FLOAT CUDA_FLOAT
  #define PPPM_F(x) CUDA_F(x)
  #define PPPM_PRECISION CUDA_PRECISION
 #endif
 //--------------------------------
 //-----------FORCE-----------------
 //--------------------------------
 // F_PRECISION selects F_FLOAT / F_F: 1 = float, 2 = double.
 // When it is not defined, the global CUDA_* settings are used.
 // Any other value is rejected instead of leaving the macros undefined.
 #ifdef F_PRECISION
  #if F_PRECISION == 1
    #define F_FLOAT float
    #define F_F(x) x##f
  #elif F_PRECISION == 2
    #define F_FLOAT double
    #define F_F(x) x
  #else
    #error "F_PRECISION must be 1 (float) or 2 (double)"
  #endif
 #else
  #define F_FLOAT CUDA_FLOAT
  #define F_F(x) CUDA_F(x)
  #define F_PRECISION CUDA_PRECISION
 #endif
 // math functions matching the force precision
 #if F_PRECISION == 1
 #define _SQRT_ sqrtf
 #define _RSQRT_ rsqrtf
 #define _EXP_ expf
 #else
 #define _SQRT_ sqrt
 #define _RSQRT_ rsqrt
 #define _EXP_ exp
 #endif
 // 2-, 3- and 4-component force vectors: own structs in double precision,
 // otherwise aliases for float2/float3/float4
 #if F_PRECISION == 2
 struct F_FLOAT2
 {
  F_FLOAT x;
  F_FLOAT y;
 };
 struct F_FLOAT3
 {
  F_FLOAT x;
  F_FLOAT y;
  F_FLOAT z;
 };
 struct F_FLOAT4
 {
  F_FLOAT x;
  F_FLOAT y;
  F_FLOAT z;
  F_FLOAT w;
 };
 #else
 #define F_FLOAT2 float2
 #define F_FLOAT3 float3
 #define F_FLOAT4 float4
 #endif
 //--------------------------------
 //-----------ENERGY-----------------
 //--------------------------------
 // ENERGY_PRECISION selects ENERGY_FLOAT / ENERGY_F: 1 = float, 2 = double.
 // When it is not defined, the global CUDA_* settings are used.
 // Any other value is rejected instead of leaving the macros undefined.
 // (The default definitions used to appear twice in this section; the
 // redundant first copy has been removed.)
 #ifdef ENERGY_PRECISION
  #if ENERGY_PRECISION == 1
    #define ENERGY_FLOAT float
    #define ENERGY_F(x) x##f
  #elif ENERGY_PRECISION == 2
    #define ENERGY_FLOAT double
    #define ENERGY_F(x) x
  #else
    #error "ENERGY_PRECISION must be 1 (float) or 2 (double)"
  #endif
 #else
  #define ENERGY_FLOAT CUDA_FLOAT
  #define ENERGY_F(x) CUDA_F(x)
  #define ENERGY_PRECISION CUDA_PRECISION
 #endif
 //--------------------------------
 //-----------POSITIONS------------
 //--------------------------------
 // X_PRECISION selects X_FLOAT / X_F: 1 = float, 2 = double.
 // When it is not defined, the global CUDA_* settings are used.
 // Any other value is rejected instead of leaving the macros undefined.
 #ifdef X_PRECISION
  #if X_PRECISION == 1
    #define X_FLOAT float
    #define X_F(x) x##f
  #elif X_PRECISION == 2
    #define X_FLOAT double
    #define X_F(x) x
  #else
    #error "X_PRECISION must be 1 (float) or 2 (double)"
  #endif
 #else
  #define X_FLOAT CUDA_FLOAT
  #define X_F(x) CUDA_F(x)
  #define X_PRECISION CUDA_PRECISION
 #endif
 // 2-, 3- and 4-component position vectors: own structs in double precision,
 // otherwise aliases for float2/float3/float4
 #if X_PRECISION == 2
 struct X_FLOAT2
 {
  X_FLOAT x;
  X_FLOAT y;
 };
 struct X_FLOAT3
 {
  X_FLOAT x;
  X_FLOAT y;
  X_FLOAT z;
 };
 struct X_FLOAT4
 {
  X_FLOAT x;
  X_FLOAT y;
  X_FLOAT z;
  X_FLOAT w;
 };
 #else
 #define X_FLOAT2 float2
 #define X_FLOAT3 float3
 #define X_FLOAT4 float4
 #endif
 //--------------------------------
 //-----------velocities-----------
 //--------------------------------
 // V_PRECISION selects V_FLOAT / V_F: 1 = float, 2 = double.
 // When it is not defined, the global CUDA_* settings are used.
 // Any other value is rejected instead of leaving the macros undefined.
 #ifdef V_PRECISION
  #if V_PRECISION == 1
    #define V_FLOAT float
    #define V_F(x) x##f
  #elif V_PRECISION == 2
    #define V_FLOAT double
    #define V_F(x) x
  #else
    #error "V_PRECISION must be 1 (float) or 2 (double)"
  #endif
 #else
  #define V_FLOAT CUDA_FLOAT
  #define V_F(x) CUDA_F(x)
  #define V_PRECISION CUDA_PRECISION
 #endif
 // 4-component velocity vector: own struct in double precision, otherwise an
 // alias for float4
 #if V_PRECISION == 2
 struct V_FLOAT4
 {
  V_FLOAT x;
  V_FLOAT y;
  V_FLOAT z;
  V_FLOAT w;
 };
 #else
 #define V_FLOAT4 float4
 #endif
 // With NO_PREC_TIMING defined, the timing calls are compiled out: from this
 // point on the name `timespec` refers to the stand-in struct below and every
 // clock_gettime(a,b) call expands to nothing.
 // NOTE(review): code that still reads tv_sec/tv_nsec after such a "call" then
 // works on uninitialized values - confirm the resulting times are never used.
 #ifdef NO_PREC_TIMING
 struct timespec_2
 {
 	unsigned int tv_sec;
 	unsigned int tv_nsec;
 };
 #define timespec timespec_2
 #define clock_gettime(a,b) 
 #endif
 #endif /*CUDA_PRECISION_H_*/
--- a/src/USER-CUDA/cuda_shared.h
+++ b/src/USER-CUDA/cuda_shared.h
@ -0,0 +1,378 @@
 /* ----------------------------------------------------------------------
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
   Original Version:
   http://lammps.sandia.gov, Sandia National Laboratories
   Steve Plimpton, sjplimp@sandia.gov 
   See the README file in the top-level LAMMPS directory. 
   ----------------------------------------------------------------------- 
   USER-CUDA Package and associated modifications:
   https://sourceforge.net/projects/lammpscuda/ 
   Christian Trott, christian.trott@tu-ilmenau.de
   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
   Theoretical Physics II, University of Technology Ilmenau, Germany 
   See the README file in the USER-CUDA directory. 
   This software is distributed under the GNU General Public License.
 ------------------------------------------------------------------------- */
 #ifndef _CUDA_SHARED_H_
 #define _CUDA_SHARED_H_
 #include "cuda_precision.h"
 #define CUDA_MAX_DEBUG_SIZE 1000 //size of debugdata array (allows for so many doubles or twice as many int)
 // Untyped handle to an array on the CUDA device: a raw device pointer plus
 // up to three extents. The element type is not recorded here.
 struct dev_array
 {
 	void* dev_data;			// pointer to memory address on cuda device
 	unsigned dim[3];		// array dimensions
 };
 // Relevant data from the atom class: device array handles, a few host
 // pointers, and per-proc counters/flags. Embedded in cuda_shared_data as
 // member `atom`.
 struct cuda_shared_atom		// relevant data from atom class
 {
 	dev_array dx; 			// cumulated distance for binning settings
 	dev_array x;			// position
 	dev_array v;			// velocity
 	dev_array f;			// force
 	dev_array tag;			// presumably the global atom ID (see note on `type`) - TODO confirm
 	dev_array type; 		// original note: "global ID number, there are ghosttype = ntypes  (ntypescuda=ntypes+1)"
 					// NOTE(review): "global ID number" looks like it belongs to `tag` - confirm
 	dev_array mask;
 	dev_array image; 		
 	dev_array q;			// charges
 	dev_array mass;			// per-type masses
 	dev_array rmass;		// per-atom masses
 	dev_array radius;		// per-atom radius
 	dev_array density;
 	dev_array omega;
 	dev_array torque;
 	dev_array molecule;
 	dev_array special;
 	int maxspecial;
 	dev_array nspecial;
 	int* special_flag;
 	int molecular;
 	dev_array eatom;		// per-atom energy
 	dev_array vatom;		// per-atom virial
 	int need_eatom;
 	int need_vatom;
 	dev_array x_type;		// position + type in X_FLOAT4 struct
 	dev_array v_radius;		// velocity + radius in V_FLOAT4 struct, currently only used for granular atom_style
 	dev_array omega_rmass;		// presumably omega + rmass in V_FLOAT4 struct (the original comment repeated the v_radius text) - TODO confirm
 	double* mass_host;		// remember per-type host pointer to masses
 	//int natoms;				// total # of atoms in system, could be 0
 	int nghost;				// and ghost atoms on this proc
 	int nlocal;				// # of owned
 	int nall;			    // total # of atoms in this proc
 	int nmax;				// max # of owned+ghost in arrays on this proc 	
 	int ntypes;
 	int q_flag;				// do we have charges?
 	int rmass_flag;			// do we have per-atom masses?
 	int firstgroup;
 	int nfirst;
 	int update_nlocal;
 	int update_nmax;
 	dev_array xhold;	    // position at last neighboring
 	X_FLOAT triggerneighsq;		// maximum square movement before reneighboring
 	int reneigh_flag;		// is reneighboring necessary
 	int maxhold;			// size of xhold
 	int dist_check; 		// perform distance check for reneighboring
 	dev_array binned_id;    // id of each binned atom (not tag!!)
 	dev_array binned_idnew; // new id of each binned atom for sorting; basically moves atom[binned_id[k]] to atom[binned_idnew[k]]
    float bin_extraspace;	
 	int bin_dim[3];
 	int bin_nmax;
 	dev_array map_array;
 };
 // Relevant data from the pair class: global cutoffs, host-side type-type
 // tables (double**), device handles for accumulators and coefficient copies
 // (*_gm), and kernel configuration flags. Embedded in cuda_shared_data as
 // member `pair`.
 struct cuda_shared_pair		// relevant data from pair class
 {
 	char cudable_force;		// check for (cudable_force!=0)
 	X_FLOAT cut_global;
 	X_FLOAT cut_inner_global;
 	X_FLOAT cut_coul_global;
 	double** cut;			// type-type cutoff 
 	double** cutsq;			// type-type cutoff (presumably squared, going by the name - TODO confirm)
 	double** cut_inner;			// type-type cutoff for coul (NOTE(review): same text as cut_coul; may be a copy-paste - confirm)
 	double** cut_coul;			// type-type cutoff for coul
 	double** coeff1;		// type-type pair parameters
 	double** coeff2;
 	double** coeff3;
 	double** coeff4;
 	double** coeff5;		
 	double** coeff6;		
 	double** coeff7;		
 	double** coeff8;		
 	double** coeff9;		
 	double** coeff10;		
 	double** offset;
 	double* special_lj;
 	double* special_coul;
 	dev_array virial; // ENERGY_FLOAT
 	dev_array eng_vdwl; // ENERGY_FLOAT
 	dev_array eng_coul; // ENERGY_FLOAT
 	X_FLOAT cut_coulsq_global;
 	F_FLOAT g_ewald,kappa;
 	int freeze_group_bit;
 	// presumably device copies of coeff1..coeff10 ("_gm" = global memory?) - TODO confirm
 	dev_array coeff1_gm;
 	dev_array coeff2_gm;
 	dev_array coeff3_gm;
 	dev_array coeff4_gm;
 	dev_array coeff5_gm;
 	dev_array coeff6_gm;
 	dev_array coeff7_gm;
 	dev_array coeff8_gm;
 	dev_array coeff9_gm;
 	dev_array coeff10_gm;
 	int lastgridsize;
 	int n_energy_virial;
 	int collect_forces_later;
 	int use_block_per_atom;
 	int override_block_per_atom;
 };
 // Relevant data from the domain class: box and sub-box bounds, periodicity,
 // and triclinic data, stored in position precision (X_FLOAT) except for
 // h_rate, which uses velocity precision (V_FLOAT). Embedded in
 // cuda_shared_data as member `domain`.
 struct cuda_shared_domain	// relevant data from domain class
 {
 	X_FLOAT sublo[3];			// orthogonal box -> sub-box bounds on this proc
 	X_FLOAT subhi[3];
 	X_FLOAT boxlo[3];
 	X_FLOAT boxhi[3];
 	X_FLOAT prd[3];
 	int periodicity[3];		// xyz periodicity as array
 	int triclinic;
 	X_FLOAT xy;
 	X_FLOAT xz;
 	X_FLOAT yz;
 	X_FLOAT boxlo_lamda[3];
 	X_FLOAT boxhi_lamda[3];
 	X_FLOAT prd_lamda[3];
 	X_FLOAT h[6];
 	X_FLOAT h_inv[6];
 	V_FLOAT h_rate[6];
 	int update;
 };
 // Data shared with the PPPM code: raw pointers to grid/brick buffers, grid
 // index bounds and scalar parameters. Embedded in cuda_shared_data as
 // member `pppm`.
 // The FFT work buffers and Green's-function/k-vector arrays exist only when
 // FFT_CUFFT is defined, so the struct layout depends on that macro.
 // NOTE(review): whether each pointer refers to host or device memory is not
 // visible here - confirm at the allocation sites.
 struct cuda_shared_pppm
 {
   char cudable_force;
 #ifdef FFT_CUFFT  
   FFT_FLOAT* work1;
   FFT_FLOAT* work2;
   FFT_FLOAT* work3;
   PPPM_FLOAT* greensfn;
   PPPM_FLOAT* fkx;
   PPPM_FLOAT* fky;
   PPPM_FLOAT* fkz;
   PPPM_FLOAT* vg;
 #endif
   int* part2grid;
   PPPM_FLOAT* density_brick;
   int* density_brick_int;
   PPPM_FLOAT density_intScale;
   PPPM_FLOAT* vdx_brick;
   PPPM_FLOAT* vdy_brick;
   PPPM_FLOAT* vdz_brick;
   PPPM_FLOAT* density_fft;
   ENERGY_FLOAT* energy;
   ENERGY_FLOAT* virial;
   // grid index bounds per dimension (lo/hi, in/out)
   int nxlo_in;
   int nxhi_in;
   int nxlo_out;
   int nxhi_out;
   int nylo_in;
   int nyhi_in;
   int nylo_out;
   int nyhi_out;
   int nzlo_in;
   int nzhi_in;
   int nzlo_out;
   int nzhi_out;
   int nx_pppm;
   int ny_pppm;
   int nz_pppm;
   PPPM_FLOAT qqrd2e;
   int order;
  // float3 sublo;
   PPPM_FLOAT* rho_coeff;
   int nmax;
   int nlocal;
   PPPM_FLOAT* debugdata;
   PPPM_FLOAT delxinv;
   PPPM_FLOAT delyinv;
   PPPM_FLOAT delzinv;
   int nlower;
   int nupper;
   PPPM_FLOAT shiftone;
 };
 // Data shared with the communication code: device array handles for the
 // swap description (pbc, slab and multi bounds, send lists), send/receive
 // buffer pointers and bookkeeping counters. Embedded in cuda_shared_data as
 // member `comm`.
 struct cuda_shared_comm
 {
   int maxswap;
   int maxlistlength;
   dev_array pbc;
   dev_array slablo;
   dev_array slabhi;
   dev_array multilo;
   dev_array multihi;
   dev_array sendlist;
   int grow_flag;
   int comm_phase;
   int nsend;
   int* nsend_swap;
   int* send_size;
   int* recv_size; 
   // buf_send/buf_recv are typed double**, the *_dev variants are untyped;
   // presumably host and device sides of the same buffers - TODO confirm
   double** buf_send;
   void** buf_send_dev;
   double** buf_recv;
   void** buf_recv_dev;
   void* buffer;
   int buffer_size;
   double overlap_split_ratio;
 };
 // Neighbor-list data: device handles for the list arrays (with separate
 // "inner" and "border" variants), paging information, binning parameters
 // and exclusion tables.
 // Unlike the other cuda_shared_* structs this one is not embedded in
 // cuda_shared_data; it is a member of CudaNeighList.
 struct cuda_shared_neighlist // member of CudaNeighList, has no instance in cuda_shared_data
 {
 	int maxlocal;
 	int inum;                // # of I atoms neighbors are stored for local indices of I atoms
 	int inum_border2;
 	dev_array inum_border;         // # of atoms which interact with border atoms
 	dev_array ilist;
 	dev_array ilist_border;
 	dev_array numneigh;
 	dev_array numneigh_inner;
 	dev_array numneigh_border;
 	dev_array firstneigh;
 	dev_array neighbors;
 	dev_array neighbors_border;
 	dev_array neighbors_inner;
 	int maxpage;
 	dev_array page_pointers;
 	dev_array* pages;
 	int maxneighbors;
 	int neigh_lists_per_page;
 	double** cutneighsq;
 	CUDA_FLOAT* cu_cutneighsq;
 	int* binned_id;
 	int* bin_dim;
 	int bin_nmax;
 	float bin_extraspace;
 	double maxcut;
 	// exclusion tables (type-, group- and molecule-based, going by the names)
 	dev_array ex_type;
 	int nex_type;
 	dev_array ex1_bit;
 	dev_array ex2_bit;
 	int nex_group;
 	dev_array ex_mol_bit;
 	int nex_mol;
 };
 // Snapshot of compile-time settings, used to compare the configuration
 // (i.e. precision) the .cu files were built with against the .cpp files.
 // The prec_* fields presumably hold the corresponding *_PRECISION macro
 // values (1 = float, 2 = double) - TODO confirm where they are filled in.
 struct cuda_compile_settings		// this is used to compare compile settings (i.e. precision) of the cu files, and the cpp files
 {
    int prec_glob;
    int prec_x;
    int prec_v;
    int prec_f;
    int prec_pppm;
    int prec_fft;
    int cufft;
    int arch;
 };
 // Collection of timing accumulators, grouped by subsystem (transfers,
 // communication, pair forces, neighboring, PPPM). Embedded in
 // cuda_shared_data as member `cuda_timings`.
 // NOTE(review): units (presumably seconds) and reset policy are not visible
 // here - confirm where the fields are updated.
 struct cuda_timings_struct
 {
 	//Debug:
 	double test1;
 	double test2;
 	//transfers
 	double transfer_upload_tmp_constr;
 	double transfer_download_tmp_deconstr;
 	//communication
 	double comm_forward_total;
 	double comm_forward_mpi_upper;
 	double comm_forward_mpi_lower;
 	double comm_forward_kernel_pack;
 	double comm_forward_kernel_unpack;
 	double comm_forward_kernel_self;
 	double comm_forward_upload;
 	double comm_forward_download;
 	double comm_exchange_total;
 	double comm_exchange_mpi;
 	double comm_exchange_kernel_pack;
 	double comm_exchange_kernel_unpack;
 	double comm_exchange_kernel_fill;
 	double comm_exchange_cpu_pack;
 	double comm_exchange_upload;
 	double comm_exchange_download;
 	double comm_border_total;
 	double comm_border_mpi;
 	double comm_border_kernel_pack;
 	double comm_border_kernel_unpack;
 	double comm_border_kernel_self;
 	double comm_border_kernel_buildlist;
 	double comm_border_upload;
 	double comm_border_download;
 	//pair forces
 	double pair_xtype_conversion;
 	double pair_kernel;
 	double pair_virial;
 	double pair_force_collection;
 	//neighbor
 	double neigh_bin;
 	double neigh_build;
 	double neigh_special;
 	//PPPM
 	double pppm_particle_map; 
    double pppm_make_rho; 
    double pppm_brick2fft; 
    double pppm_poisson; 
    double pppm_fillbrick; 
    double pppm_fieldforce; 
    double pppm_compute; 
 };
 // Top-level container holding space for all relevant data from the different
 // classes: a scratch buffer, flag/debug pointers, one sub-struct per
 // subsystem, and a few global settings. Functions such as
 // Cuda_Domain_Init() receive a pointer to this struct.
 struct cuda_shared_data		// holds space for all relevant data from the different classes
 {
 	void* buffer; // holds temporary GPU data [data used in subroutines, which does not have to stay consistent outside of that routine]
 	int buffersize; // max size of buffer
 	int buffer_new; // should be 1 if the pointer to buffer has changed
 	void* flag;
 	void* debugdata;  // array for easily collecting debug data from the device; class Cuda contains the corresponding cu_debugdata and host array
 	cuda_shared_atom atom;
 	cuda_shared_pair pair;	
 	cuda_shared_domain domain;
 	cuda_shared_pppm pppm;
 	cuda_shared_comm comm;
 	cuda_compile_settings compile_settings;
 	cuda_timings_struct cuda_timings;
 	int exchange_dim;
 	int me; // mpi rank
 	unsigned int datamask;
 	int overlap_comm;
 };
 #endif // #ifndef _CUDA_SHARED_H_
--- a/src/USER-CUDA/domain_cuda.cpp
+++ b/src/USER-CUDA/domain_cuda.cpp
@ -0,0 +1,270 @@
 /* ----------------------------------------------------------------------
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
   http://lammps.sandia.gov, Sandia National Laboratories
   Steve Plimpton, sjplimp@sandia.gov
   Copyright (2003) Sandia Corporation.  Under the terms of Contract
   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
   certain rights in this software.  This software is distributed under 
   the GNU General Public License.
   See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
 /* ----------------------------------------------------------------------
   Contributing author (triclinic) : Pieter in 't Veld (SNL)
 ------------------------------------------------------------------------- */
 #include "mpi.h"
 #include "stdlib.h"
 #include "string.h"
 #include "stdio.h"
 #include "math.h"
 #include "domain_cuda.h"
 #include "style_region.h"
 #include "atom.h"
 #include "force.h"
 #include "update.h"
 #include "modify.h"
 #include "fix.h"
 #include "fix_deform.h"
 #include "region.h"
 #include "lattice.h"
 #include "comm.h"
 #include "memory.h"
 #include "error.h"
 #include "cuda.h"
 #include "domain_cu.h"
 using namespace LAMMPS_NS;
 // Sentinel magnitude used to seed the min/max extent search in reset_box().
 #define BIG   1.0e20
 // Padding added around the atom extent when reset_box() resizes a
 // shrink-wrapped box.
 #define SMALL 1.0e-4
 // NOTE(review): DELTA is not referenced in the visible part of this file.
 #define DELTA 1
 // Function-like min/max; an argument may be evaluated twice, so do not pass
 // expressions with side effects.
 #define MIN(a,b) ((a) < (b) ? (a) : (b))
 #define MAX(a,b) ((a) > (b) ? (a) : (b))
 enum{NO_REMAP,X_REMAP,V_REMAP};                   // same as fix_deform.cpp
 /* ----------------------------------------------------------------------
   construct on top of Domain and remember the Cuda object owned by lmp
 ------------------------------------------------------------------------- */
 DomainCuda::DomainCuda(LAMMPS *lmp) : Domain(lmp), cuda(lmp->cuda)
 {
 }
 /* ----------------------------------------------------------------------
   run Domain::init(), then (unless cuda->finished_run is set) copy the
   domain parameters into the shared data and call Cuda_Domain_Init()
 ------------------------------------------------------------------------- */
 void DomainCuda::init()
 {
  Domain::init();

  if (cuda->finished_run) return;

  cuda->setDomainParams();
  Cuda_Domain_Init(&cuda->shared_data);
 }
 /* ----------------------------------------------------------------------
   set global box params via Domain::set_global_box()
   assumes boxlo/hi and triclinic tilts are already set
   afterwards refresh the shared domain params unless the run has finished
 ------------------------------------------------------------------------- */
 void DomainCuda::set_global_box()
 {
  Domain::set_global_box();

  if (cuda->finished_run) return;

  cuda->setDomainParams();
 }
 /* ----------------------------------------------------------------------
   set lamda box params via Domain::set_lamda_box(), only need be done once
   assumes global box is defined and proc assignment has been made by comm
   for uppermost proc, ensure subhi = 1.0 (in case round-off occurs)
   afterwards refresh the shared domain params unless the run has finished
 ------------------------------------------------------------------------- */
 void DomainCuda::set_lamda_box()
 {
  Domain::set_lamda_box();

  if (cuda->finished_run) return;

  cuda->setDomainParams();
 }
 /* ----------------------------------------------------------------------
   set local subbox params via Domain::set_local_box()
   assumes global box is defined and proc assignment has been made
   for uppermost proc, ensure subhi = boxhi (in case round-off occurs)
   no device-side refresh happens here: the calls are disabled (see below)
 ------------------------------------------------------------------------- */
 void DomainCuda::set_local_box()
 {
  Domain::set_local_box();

  // disabled refresh, kept for reference (was guarded by !cuda->finished_run):
  //   cuda->setDomainParams();
  //   Cuda_Domain_Init(&cuda->shared_data);
 }
 /* ----------------------------------------------------------------------
   reset global & local boxes due to global box boundary changes
   if shrink-wrapped, determine atom extent and reset boxlo/hi
   if shrink-wrapped and triclinic, perform shrink-wrap in box coords
 ------------------------------------------------------------------------- */
 void DomainCuda::reset_box()
 {
  if (nonperiodic == 2) {
    // convert back to box coords for shrink-wrap operation
    if (triclinic) lamda2x(atom->nlocal);
    // compute extent of atoms on this proc
    double extent[3][2],all[3][2];
    extent[2][0] = extent[1][0] = extent[0][0] = BIG;
    extent[2][1] = extent[1][1] = extent[0][1] = -BIG;
    double **x = atom->x;
    int nlocal = atom->nlocal;
    if (cuda->finished_setup&&(!cuda->oncpu))
      {
        extent[0][0]=cuda->extent[0];
        extent[0][1]=cuda->extent[1];
        extent[1][0]=cuda->extent[2];
        extent[1][1]=cuda->extent[3];
        extent[2][0]=cuda->extent[4];
        extent[2][1]=cuda->extent[5];
      }
    else
      for (int i = 0; i < nlocal; i++) {
        extent[0][0] = MIN(extent[0][0],x[i][0]);
        extent[0][1] = MAX(extent[0][1],x[i][0]);
        extent[1][0] = MIN(extent[1][0],x[i][1]);
        extent[1][1] = MAX(extent[1][1],x[i][1]);
        extent[2][0] = MIN(extent[2][0],x[i][2]);
        extent[2][1] = MAX(extent[2][1],x[i][2]);
      }
    // compute extent across all procs
    // flip sign of MIN to do it in one Allreduce MAX
    extent[0][0] = -extent[0][0];
    extent[1][0] = -extent[1][0];
    extent[2][0] = -extent[2][0];
    MPI_Allreduce(extent,all,6,MPI_DOUBLE,MPI_MAX,world);
    // in shrink-wrapped dims, set box by atom extent
    // if minimum set, enforce min box size settings
    if (xperiodic == 0) {
      if (boundary[0][0] == 2) boxlo[0] = -all[0][0] - SMALL;
      else if (boundary[0][0] == 3) boxlo[0] = MIN(-all[0][0]-SMALL,minxlo);
      if (boundary[0][1] == 2) boxhi[0] = all[0][1] + SMALL;
      else if (boundary[0][1] == 3) boxhi[0] = MAX(all[0][1]+SMALL,minxhi);
      if (boxlo[0] > boxhi[0]) error->all("Illegal simulation box");
    }
    if (yperiodic == 0) {
      if (boundary[1][0] == 2) boxlo[1] = -all[1][0] - SMALL;
      else if (boundary[1][0] == 3) boxlo[1] = MIN(-all[1][0]-SMALL,minylo);
      if (boundary[1][1] == 2) boxhi[1] = all[1][1] + SMALL;
      else if (boundary[1][1] == 3) boxhi[1] = MAX(all[1][1]+SMALL,minyhi);
      if (boxlo[1] > boxhi[1]) error->all("Illegal simulation box");
    }
    if (zperiodic == 0) {
      if (boundary[2][0] == 2) boxlo[2] = -all[2][0] - SMALL;
      else if (boundary[2][0] == 3) boxlo[2] = MIN(-all[2][0]-SMALL,minzlo);
      if (boundary[2][1] == 2) boxhi[2] = all[2][1] + SMALL;
      else if (boundary[2][1] == 3) boxhi[2] = MAX(all[2][1]+SMALL,minzhi);
      if (boxlo[2] > boxhi[2]) error->all("Illegal simulation box");
    }
  }
  set_global_box();
  set_local_box();
  if(not cuda->finished_run)
  {
    cuda->setDomainParams();
    Cuda_Domain_Init(&cuda->shared_data);
  }
  // if shrink-wrapped, convert to lamda coords for new box
  // must re-invoke pbc() b/c x2lamda result can be outside 0,1 due to roundoff
  if (nonperiodic == 2 && triclinic) {
    x2lamda(atom->nlocal);
    pbc();
  }
 }
 /* ----------------------------------------------------------------------
   enforce PBC and modify box image flags for each atom
   called every reneighboring and by other commands that change atoms
   resulting coord must satisfy lo <= coord < hi
   if fix deform, remap velocity of fix group atoms by box edge velocities
   for triclinic, atoms must be in lamda coords (0-1) before pbc is called
   image = 10 bits for each dimension, incremented/decremented with wrap-around
   once setup has finished and data is not on the CPU, the work is handed to
   Cuda_Domain_PBC(); otherwise Domain::pbc() is used
 ------------------------------------------------------------------------- */
 void DomainCuda::pbc()
 {
  const bool on_device = cuda->finished_setup && !cuda->oncpu;

  if (!on_device) {
    Domain::pbc();
    return;
  }

  cuda->setDomainParams();
  Cuda_Domain_PBC(&cuda->shared_data, deform_vremap, deform_groupbit,cuda->extent);
 }
 /* ----------------------------------------------------------------------
   convert triclinic 0-1 lamda coords to box coords for all N atoms
   x = H lamda + x0;
   once setup has finished and data is not on the CPU, the conversion is
   handed to Cuda_Domain_lamda2x(); otherwise Domain::lamda2x() is used
 ------------------------------------------------------------------------- */
 void DomainCuda::lamda2x(int n)
 {
  const bool on_device = cuda->finished_setup && !cuda->oncpu;

  if (!on_device) {
    Domain::lamda2x(n);
    return;
  }

  Cuda_Domain_lamda2x(&cuda->shared_data,n);
 }
 /* ----------------------------------------------------------------------
   convert box coords to triclinic 0-1 lamda coords for all N atoms
   lamda = H^-1 (x - x0)
   once setup has finished and data is not on the CPU, the conversion is
   handed to Cuda_Domain_x2lamda(); otherwise Domain::x2lamda() is used
 ------------------------------------------------------------------------- */
 void DomainCuda::x2lamda(int n)
 {
  const bool on_device = cuda->finished_setup && !cuda->oncpu;

  if (!on_device) {
    Domain::x2lamda(n);
    return;
  }

  Cuda_Domain_x2lamda(&cuda->shared_data,n);
 }
--- a/src/USER-CUDA/domain_cuda.h
+++ b/src/USER-CUDA/domain_cuda.h
@ -0,0 +1,41 @@
 /* ----------------------------------------------------------------------
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
   http://lammps.sandia.gov, Sandia National Laboratories
   Steve Plimpton, sjplimp@sandia.gov
   Copyright (2003) Sandia Corporation.  Under the terms of Contract
   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
   certain rights in this software.  This software is distributed under 
   the GNU General Public License.
   See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
 #ifndef LMP_DOMAIN_CUDA_H
 #define LMP_DOMAIN_CUDA_H
 #include "pointers.h"
 #include "domain.h"
 namespace LAMMPS_NS {
 // Domain subclass for the USER-CUDA package. Each method below shares its
 // name with a Domain method that the implementation calls as Domain::xxx();
 // the CUDA versions additionally refresh device-side domain data or hand the
 // work to Cuda_Domain_* functions.
 // NOTE(review): whether these hide or override the base versions depends on
 // Domain declaring them virtual - confirm in domain.h.
 class DomainCuda : public Domain {
 public:
  DomainCuda(class LAMMPS *);
  void init();              // Domain::init() plus device domain initialisation
  void set_global_box();    // Domain::set_global_box() plus parameter refresh
  void set_lamda_box();     // Domain::set_lamda_box() plus parameter refresh
  void set_local_box();     // currently only forwards to Domain::set_local_box()
  void reset_box();         // shrink-wrap aware box reset
  void pbc();               // periodic wrapping, on device when possible
  void lamda2x(int);        // lamda -> box coords for the first N atoms
  void x2lamda(int);        // box -> lamda coords for the first N atoms
 protected:
  class Cuda *cuda;         // taken from lmp->cuda in the constructor
 };
 }
 #endif
--- a/src/USER-CUDA/modify_cuda.cpp
+++ b/src/USER-CUDA/modify_cuda.cpp
@ -0,0 +1,442 @@
 /* ----------------------------------------------------------------------
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
   http://lammps.sandia.gov, Sandia National Laboratories
   Steve Plimpton, sjplimp@sandia.gov
   Copyright (2003) Sandia Corporation.  Under the terms of Contract
   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
   certain rights in this software.  This software is distributed under 
   the GNU General Public License.
   See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
 #include <cstdio>
 #include <cstring>
 #include "modify_cuda.h"
 #include "style_compute.h"
 #include "style_fix.h"
 #include "atom.h"
 #include "comm.h"
 #include "fix.h"
 #include "compute.h"
 #include "group.h"
 #include "update.h"
 #include "domain.h"
 #include "cuda.h"
 #include "memory.h"
 #include "error.h"
 using namespace LAMMPS_NS;
 // NOTE(review): DELTA is not referenced in the visible part of this file.
 #define DELTA 4
 // mask settings - same as in fix.cpp
 // Each value is a distinct bit; ModifyCuda::init() passes them to list_init()
 // and its variants to select the fixes that take part in each stage.
 #define INITIAL_INTEGRATE   1
 #define POST_INTEGRATE      2
 #define PRE_EXCHANGE        4
 #define PRE_NEIGHBOR        8
 #define PRE_FORCE          16
 #define POST_FORCE         32
 #define FINAL_INTEGRATE    64
 #define END_OF_STEP       128
 #define THERMO_ENERGY     256
 #define INITIAL_INTEGRATE_RESPA   512
 #define POST_INTEGRATE_RESPA     1024
 #define PRE_FORCE_RESPA          2048
 #define POST_FORCE_RESPA         4096
 #define FINAL_INTEGRATE_RESPA    8192
 #define MIN_PRE_EXCHANGE        16384
 #define MIN_POST_FORCE          32768
 #define MIN_ENERGY              65536
 #include "cuda_modify_flags.h"
 // Function-like min/max. The whole conditional expression is wrapped in
 // parentheses so the macros compose safely inside larger expressions
 // (without them, MIN(a,b)+1 parses as (a<b) ? a : (b+1)).
 // An argument may be evaluated twice; do not pass expressions with side effects.
 #define MIN(A,B) (((A) < (B)) ? (A) : (B))
 #define MAX(A,B) (((A) > (B)) ? (A) : (B))
 // Large sentinel magnitude.
 #define BIG 1.0e20
 /* ----------------------------------------------------------------------
   construct on top of Modify, remember the Cuda object owned by lmp and
   start with empty cuda/host fix lists
   every n_*_cuda and n_*_host counter is zeroed here so that the per-stage
   methods are safe to call before init() has built the lists
 ------------------------------------------------------------------------- */
 ModifyCuda::ModifyCuda(LAMMPS *lmp) : Modify(lmp)
 {
  cuda = lmp->cuda;

  // number of CUDA-enabled fixes per stage

  n_initial_integrate_cuda = 0;
  n_post_integrate_cuda = 0;
  n_pre_exchange_cuda = 0;    // was "n_pre_exchange = 0", leaving this counter uninitialized
  n_pre_neighbor_cuda = 0;
  n_pre_force_cuda = 0;
  n_post_force_cuda = 0;
  n_final_integrate_cuda = 0;
  n_end_of_step_cuda = 0;
  n_thermo_energy_cuda = 0;

  // number of host-only fixes per stage

  n_initial_integrate_host = 0;
  n_post_integrate_host = 0;
  n_pre_exchange_host = 0;    // was "n_pre_exchange = 0", leaving this counter uninitialized
  n_pre_neighbor_host = 0;
  n_pre_force_host = 0;
  n_post_force_host = 0;
  n_final_integrate_host = 0;
  n_end_of_step_host = 0;
  n_thermo_energy_host = 0;

  // index lists of CUDA-enabled fixes, allocated in init()

  list_initial_integrate_cuda = NULL;
  list_post_integrate_cuda = NULL;
  list_pre_exchange_cuda = NULL;
  list_pre_neighbor_cuda = NULL;
  list_pre_force_cuda = NULL;
  list_post_force_cuda = NULL;
  list_final_integrate_cuda = NULL;
  list_end_of_step_cuda = NULL;
  list_thermo_energy_cuda = NULL;
  end_of_step_every_cuda = NULL;
 }
 /* ----------------------------------------------------------------------
   release the index lists of CUDA-enabled fixes and the nevery table
 ------------------------------------------------------------------------- */
 ModifyCuda::~ModifyCuda()
 {
  int *owned_arrays[] = {
    list_initial_integrate_cuda,
    list_post_integrate_cuda,
    list_pre_exchange_cuda,
    list_pre_neighbor_cuda,
    list_pre_force_cuda,
    list_post_force_cuda,
    list_final_integrate_cuda,
    list_end_of_step_cuda,
    list_thermo_energy_cuda,
    end_of_step_every_cuda
  };
  const int narrays = sizeof(owned_arrays)/sizeof(owned_arrays[0]);

  // delete [] on a NULL pointer is a no-op, so never-allocated lists are fine
  for (int k = 0; k < narrays; k++) delete [] owned_arrays[k];
 }
 /* ----------------------------------------------------------------------
   initialize all fixes and computes
   builds two families of per-stage fix lists: the regular ones selected with
   the plain masks (treated as host fixes by the per-stage methods) and the
   *_cuda ones selected with the *_CUDA masks
   afterwards the regular counters n_xxx are overwritten with the sum of
   cuda and host counts, while the host-only counts are kept in n_xxx_host
 ------------------------------------------------------------------------- */
 void ModifyCuda::init()
 {
  int i,j;
  // delete storage of restart info since it is not valid after 1st run
  restart_deallocate();
  // create lists of fixes to call at each stage of run
  list_init(INITIAL_INTEGRATE,n_initial_integrate,list_initial_integrate);
  list_init(POST_INTEGRATE,n_post_integrate,list_post_integrate);
  list_init(PRE_EXCHANGE,n_pre_exchange,list_pre_exchange);
  list_init(PRE_NEIGHBOR,n_pre_neighbor,list_pre_neighbor);
  list_init(PRE_FORCE,n_pre_force,list_pre_force);
  list_init(POST_FORCE,n_post_force,list_post_force);
  list_init(FINAL_INTEGRATE,n_final_integrate,list_final_integrate);
  list_init_end_of_step(END_OF_STEP,n_end_of_step,list_end_of_step);
  list_init_thermo_energy(THERMO_ENERGY,n_thermo_energy,list_thermo_energy);
  // same again for the fixes selected by the *_CUDA masks
  list_init(INITIAL_INTEGRATE_CUDA, n_initial_integrate_cuda, list_initial_integrate_cuda);
  list_init(POST_INTEGRATE_CUDA, n_post_integrate_cuda, list_post_integrate_cuda);
  list_init(PRE_EXCHANGE_CUDA, n_pre_exchange_cuda, list_pre_exchange_cuda);
  list_init(PRE_NEIGHBOR_CUDA, n_pre_neighbor_cuda, list_pre_neighbor_cuda);
  list_init(PRE_FORCE_CUDA, n_pre_force_cuda, list_pre_force_cuda);
  list_init(POST_FORCE_CUDA, n_post_force_cuda, list_post_force_cuda);
  list_init(FINAL_INTEGRATE_CUDA, n_final_integrate_cuda, list_final_integrate_cuda);
  list_init_end_of_step_cuda(END_OF_STEP_CUDA, n_end_of_step_cuda, list_end_of_step_cuda);
  list_init_thermo_energy(THERMO_ENERGY_CUDA, n_thermo_energy_cuda, list_thermo_energy_cuda);
  // remember how many entries of the regular lists are host fixes ...
  n_initial_integrate_host = n_initial_integrate;
  n_post_integrate_host = n_post_integrate;
  n_pre_exchange_host = n_pre_exchange;
  n_pre_neighbor_host = n_pre_neighbor;
  n_pre_force_host = n_pre_force;
  n_post_force_host = n_post_force;
  n_final_integrate_host = n_final_integrate;
  n_end_of_step_host = n_end_of_step;
  n_thermo_energy_host = n_thermo_energy;
  // ... then turn the regular counters into cuda+host totals
  // NOTE(review): after this the n_xxx totals can exceed the length of the
  // corresponding list_xxx arrays, which only hold the host entries - confirm
  // that no base-class code iterates list_xxx up to n_xxx
  n_initial_integrate = n_initial_integrate_cuda+n_initial_integrate_host;
  n_post_integrate = n_post_integrate_cuda+n_post_integrate_host;
  n_pre_exchange = n_pre_exchange_cuda+n_pre_exchange_host;
  n_pre_neighbor = n_pre_neighbor_cuda+n_pre_neighbor_host;
  n_pre_force = n_pre_force_cuda+n_pre_force_host;
  n_post_force = n_post_force_cuda+n_post_force_host;
  n_final_integrate = n_final_integrate_cuda+n_final_integrate_host;
  n_end_of_step = n_end_of_step_cuda+n_end_of_step_host;
  n_thermo_energy = n_thermo_energy_cuda+n_thermo_energy_host;
  // rRESPA and minimizer stages have no cuda counterpart here
  list_init(INITIAL_INTEGRATE_RESPA,
 	    n_initial_integrate_respa,list_initial_integrate_respa);
  list_init(POST_INTEGRATE_RESPA,
 	    n_post_integrate_respa,list_post_integrate_respa);
  list_init(POST_FORCE_RESPA,
 	    n_post_force_respa,list_post_force_respa);
  list_init(PRE_FORCE_RESPA,
 	    n_pre_force_respa,list_pre_force_respa);
  list_init(FINAL_INTEGRATE_RESPA,
 	    n_final_integrate_respa,list_final_integrate_respa);
  list_init(MIN_PRE_EXCHANGE,n_min_pre_exchange,list_min_pre_exchange);
  list_init(MIN_POST_FORCE,n_min_post_force,list_min_post_force);
  list_init(MIN_ENERGY,n_min_energy,list_min_energy);
  // init each fix
  // needs to come before compute init
  // this is b/c some computes call fix->dof()
  // FixRigid::dof() depends on its own init having been called
  comm->maxforward_fix = comm->maxreverse_fix = 0;
  for (i = 0; i < nfix; i++) fix[i]->init();
  // set global flag if any fix has its restart_pbc flag set
  restart_pbc_any = 0;
  for (i = 0; i < nfix; i++)
    if (fix[i]->restart_pbc) restart_pbc_any = 1;
  // create list of computes that store invocation times
  list_init_compute();
  // init each compute
  // set invoked_scalar,vector,etc to -1 to force new run to re-compute them
  // add initial timestep to all computes that store invocation times
  //   since any of them may be invoked by initial thermo
  // do not clear out invocation times stored within a compute,
  //   b/c some may be holdovers from previous run, like for ave fixes
  for (i = 0; i < ncompute; i++) {
    compute[i]->init();
    compute[i]->invoked_scalar = -1;
    compute[i]->invoked_vector = -1;
    compute[i]->invoked_array = -1;
    compute[i]->invoked_peratom = -1;
    compute[i]->invoked_local = -1;
  }
  addstep_compute_all(update->ntimestep);
  // warn if any particle is time integrated more than once
  // flag[j] counts the time-integrating fixes whose group contains local atom j
  int nlocal = atom->nlocal;
  int *mask = atom->mask;
  int *flag = new int[nlocal];
  for (i = 0; i < nlocal; i++) flag[i] = 0;
  int groupbit;
  for (i = 0; i < nfix; i++) {
    if (fix[i]->time_integrate == 0) continue;
    groupbit = fix[i]->groupbit;
    for (j = 0; j < nlocal; j++)
      if (mask[j] & groupbit) flag[j]++;
  }
  int check = 0;
  for (i = 0; i < nlocal; i++)
    if (flag[i] > 1) check = 1;
  delete [] flag;
  // combine the per-proc result; only rank 0 prints the warning
  int checkall;
  MPI_Allreduce(&check,&checkall,1,MPI_INT,MPI_SUM,world);
  if (comm->me == 0 && checkall)
    error->warning("One or more atoms are time integrated more than once");
 }
 /* ----------------------------------------------------------------------
   1st half of integrate call, only for relevant fixes
   cuda fixes run first; if host fixes exist, all data is downloaded,
   the host fixes run with cuda->oncpu set, and the data is uploaded again
 ------------------------------------------------------------------------- */
 void ModifyCuda::initial_integrate(int vflag)
 {
 	for (int k = 0; k < n_initial_integrate_cuda; ++k) {
 		fix[list_initial_integrate_cuda[k]]->initial_integrate(vflag);
 	}

 	// nothing to do on the host: skip the device<->host round trip
 	if (n_initial_integrate_host == 0) return;

 	cuda->downloadAll();
 	cuda->oncpu = true;

 	for (int k = 0; k < n_initial_integrate_host; ++k) {
 		fix[list_initial_integrate[k]]->initial_integrate(vflag);
 	}

 	cuda->uploadAll();
 	cuda->oncpu = false;
 }
 /* ----------------------------------------------------------------------
   post_integrate call, only for relevant fixes
   cuda fixes run first; if host fixes exist, all data is downloaded,
   the host fixes run with cuda->oncpu set, and the data is uploaded again
 ------------------------------------------------------------------------- */
 void ModifyCuda::post_integrate()
 {
 	for (int k = 0; k < n_post_integrate_cuda; ++k) {
 		fix[list_post_integrate_cuda[k]]->post_integrate();
 	}

 	// nothing to do on the host: skip the device<->host round trip
 	if (n_post_integrate_host == 0) return;

 	cuda->downloadAll();
 	cuda->oncpu = true;

 	for (int k = 0; k < n_post_integrate_host; ++k) {
 		fix[list_post_integrate[k]]->post_integrate();
 	}

 	cuda->uploadAll();
 	cuda->oncpu = false;
 }
 /* ----------------------------------------------------------------------
   pre_exchange call, only for relevant fixes
   cuda fixes run first; if host fixes exist, all data is downloaded,
   the host fixes run with cuda->oncpu set, and the data is uploaded again
 ------------------------------------------------------------------------- */
 void ModifyCuda::pre_exchange()
 {
 	for (int k = 0; k < n_pre_exchange_cuda; ++k) {
 		fix[list_pre_exchange_cuda[k]]->pre_exchange();
 	}

 	// nothing to do on the host: skip the device<->host round trip
 	if (n_pre_exchange_host == 0) return;

 	cuda->downloadAll();
 	cuda->oncpu = true;

 	for (int k = 0; k < n_pre_exchange_host; ++k) {
 		fix[list_pre_exchange[k]]->pre_exchange();
 	}

 	cuda->uploadAll();
 	cuda->oncpu = false;
 }
 /* ----------------------------------------------------------------------
   pre_neighbor call, only for relevant fixes
   cuda fixes run first; if host fixes exist, all data is downloaded,
   the host fixes run with cuda->oncpu set, and the data is uploaded again
 ------------------------------------------------------------------------- */
 void ModifyCuda::pre_neighbor()
 {
 	for (int k = 0; k < n_pre_neighbor_cuda; ++k) {
 		fix[list_pre_neighbor_cuda[k]]->pre_neighbor();
 	}

 	// nothing to do on the host: skip the device<->host round trip
 	if (n_pre_neighbor_host == 0) return;

 	cuda->downloadAll();
 	cuda->oncpu = true;

 	for (int k = 0; k < n_pre_neighbor_host; ++k) {
 		fix[list_pre_neighbor[k]]->pre_neighbor();
 	}

 	cuda->uploadAll();
 	cuda->oncpu = false;
 }
 /* ----------------------------------------------------------------------
   pre_force call, only for relevant fixes
   cuda fixes run first; if host fixes exist, all data is downloaded,
   the host fixes run with cuda->oncpu set, and the data is uploaded again
 ------------------------------------------------------------------------- */
 void ModifyCuda::pre_force(int vflag)
 {
 	for (int k = 0; k < n_pre_force_cuda; ++k) {
 		fix[list_pre_force_cuda[k]]->pre_force(vflag);
 	}

 	// nothing to do on the host: skip the device<->host round trip
 	if (n_pre_force_host == 0) return;

 	cuda->downloadAll();
 	cuda->oncpu = true;

 	for (int k = 0; k < n_pre_force_host; ++k) {
 		fix[list_pre_force[k]]->pre_force(vflag);
 	}

 	cuda->uploadAll();
 	cuda->oncpu = false;
 }
 /* ----------------------------------------------------------------------
   post_force call, only for relevant fixes
   cuda fixes run first; if host fixes exist, all data is downloaded,
   the host fixes run with cuda->oncpu set, and the data is uploaded again
 ------------------------------------------------------------------------- */
 void ModifyCuda::post_force(int vflag)
 {
 	for (int k = 0; k < n_post_force_cuda; ++k) {
 		fix[list_post_force_cuda[k]]->post_force(vflag);
 	}

 	// nothing to do on the host: skip the device<->host round trip
 	if (n_post_force_host == 0) return;

 	cuda->downloadAll();
 	cuda->oncpu = true;

 	for (int k = 0; k < n_post_force_host; ++k) {
 		fix[list_post_force[k]]->post_force(vflag);
 	}

 	cuda->uploadAll();
 	cuda->oncpu = false;
 }
 /* ----------------------------------------------------------------------
   2nd half of integrate call, only for relevant fixes
   cuda fixes run first; if host fixes exist, all data is downloaded,
   the host fixes run with cuda->oncpu set, and the data is uploaded again
 ------------------------------------------------------------------------- */
 void ModifyCuda::final_integrate()
 {
 	for (int k = 0; k < n_final_integrate_cuda; ++k) {
 		fix[list_final_integrate_cuda[k]]->final_integrate();
 	}

 	// nothing to do on the host: skip the device<->host round trip
 	if (n_final_integrate_host == 0) return;

 	cuda->downloadAll();
 	cuda->oncpu = true;

 	for (int k = 0; k < n_final_integrate_host; ++k) {
 		fix[list_final_integrate[k]]->final_integrate();
 	}

 	cuda->uploadAll();
 	cuda->oncpu = false;
 }
 /* ----------------------------------------------------------------------
   end-of-timestep call, only for relevant fixes
   only call fix->end_of_step() on timesteps that are multiples of nevery
   host fixes cause a download/upload of all data, but only on timesteps
   where at least one of them is actually due
 ------------------------------------------------------------------------- */
 void ModifyCuda::end_of_step()
 {
 	for (int k = 0; k < n_end_of_step_cuda; ++k) {
 		if (update->ntimestep % end_of_step_every_cuda[k] != 0) continue;
 		fix[list_end_of_step_cuda[k]]->end_of_step();
 	}

 	if (n_end_of_step_host == 0) return;

 	// is any host fix due on this timestep?
 	bool any_due = false;
 	for (int k = 0; k < n_end_of_step_host; ++k) {
 		if (update->ntimestep % end_of_step_every[k] == 0) any_due = true;
 	}
 	if (!any_due) return;

 	cuda->downloadAll();
 	cuda->oncpu = true;

 	for (int k = 0; k < n_end_of_step_host; ++k) {
 		if (update->ntimestep % end_of_step_every[k] != 0) continue;
 		fix[list_end_of_step[k]]->end_of_step();
 	}

 	cuda->uploadAll();
 	cuda->oncpu = false;
 }
 /* ----------------------------------------------------------------------
   thermo energy call, only for relevant fixes
   called by Thermo class
   compute_scalar() is fix call to return energy
   returns the sum over cuda fixes followed by host fixes; host fixes are
   evaluated between a download and an upload of all data
 ------------------------------------------------------------------------- */
 double ModifyCuda::thermo_energy()
 {
 	double total = 0.0;

 	for (int k = 0; k < n_thermo_energy_cuda; ++k) {
 		total += fix[list_thermo_energy_cuda[k]]->compute_scalar();
 	}

 	// nothing to do on the host: skip the device<->host round trip
 	if (n_thermo_energy_host == 0) return total;

 	cuda->downloadAll();
 	cuda->oncpu = true;

 	for (int k = 0; k < n_thermo_energy_host; ++k) {
 		total += fix[list_thermo_energy[k]]->compute_scalar();
 	}

 	cuda->uploadAll();
 	cuda->oncpu = false;

 	return total;
 }
 void ModifyCuda::list_init_end_of_step_cuda(int mask, int &n, int *&list)
 {
  delete [] list;
  delete [] end_of_step_every_cuda;
  n = 0;
  for (int i = 0; i < nfix; i++) if (fmask[i] & mask) n++;
  list = new int[n];
  end_of_step_every_cuda = new int[n];
  n = 0;
  for (int i = 0; i < nfix; i++)
    if (fmask[i] & mask) {
      list[n] = i;
      end_of_step_every_cuda[n++] = fix[i]->nevery;
    }
 }
--- a/src/USER-CUDA/modify_cuda.h
+++ b/src/USER-CUDA/modify_cuda.h
@ -0,0 +1,82 @@
 /* ----------------------------------------------------------------------
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
   http://lammps.sandia.gov, Sandia National Laboratories
   Steve Plimpton, sjplimp@sandia.gov
   Copyright (2003) Sandia Corporation.  Under the terms of Contract
   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
   certain rights in this software.  This software is distributed under 
   the GNU General Public License.
   See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
 #ifndef LMP_MODIFY_CUDA_H
 #define LMP_MODIFY_CUDA_H
 #include <cstdio>
 #include "modify.h"
 namespace LAMMPS_NS {
 // Modify subclass of the USER-CUDA package.
 // end_of_step() and thermo_energy() first invoke the fixes from the "_cuda"
 // lists and then, if "_host" fixes exist, invoke those between
 // cuda->downloadAll() and cuda->uploadAll().
 // NOTE(review): the other per-stage methods presumably follow the same
 // pattern -- confirm in modify_cuda.cpp.
 class ModifyCuda : public Modify {
  public:
   // number of entries in the matching list_*_cuda array
   // (shown for end_of_step and thermo_energy; presumably the same for the rest)
   int n_initial_integrate_cuda;
   int n_post_integrate_cuda;
   int n_pre_exchange_cuda;
   int n_pre_neighbor_cuda;
   int n_pre_force_cuda;
   int n_post_force_cuda;
   int n_final_integrate_cuda;
   int n_end_of_step_cuda;
   int n_thermo_energy_cuda;
   // number of fixes handled on the host for each stage; end_of_step() and
   // thermo_energy() use these to bound loops over list_end_of_step /
   // list_thermo_energy, which are not declared in this class
   // (presumably inherited from Modify -- TODO confirm)
   int n_initial_integrate_host;
   int n_post_integrate_host;
   int n_pre_exchange_host;
   int n_pre_neighbor_host;
   int n_pre_force_host;
   int n_post_force_host;
   int n_final_integrate_host;
   int n_end_of_step_host;
   int n_thermo_energy_host;
   ModifyCuda(class LAMMPS *);
   ~ModifyCuda();
   void init();
   void initial_integrate(int);
   void post_integrate();
   //void pre_decide();
   void pre_exchange();
   void pre_neighbor();
   void pre_force(int);
   void post_force(int);
   void final_integrate();
   // calls end_of_step() of each listed fix on timesteps that are multiples
   // of that fix's nevery
   void end_of_step();
   // returns the sum of compute_scalar() over the thermo-energy fixes
   double thermo_energy();
  protected:
   class Cuda *cuda;
   // lists of fixes to apply at different stages of timestep
   // list of cuda fixes
   int *list_initial_integrate_cuda;
   int *list_post_integrate_cuda;
   int *list_pre_exchange_cuda;
   int *list_pre_neighbor_cuda;
   int *list_pre_force_cuda;
   int *list_post_force_cuda;
   int *list_final_integrate_cuda;
   int *list_end_of_step_cuda;
   int *list_thermo_energy_cuda;
   // nevery of each fix in the list built by list_init_end_of_step_cuda()
   int *end_of_step_every_cuda;
   // (mask, n, list): frees and rebuilds list with the indices of all fixes
   // whose fmask has a bit of mask set, stores their count in n and their
   // nevery values in end_of_step_every_cuda
   void list_init_end_of_step_cuda(int, int &, int *&);
 };
 }
 #endif
--- a/src/USER-CUDA/neigh_full_cuda.cpp
+++ b/src/USER-CUDA/neigh_full_cuda.cpp
@ -0,0 +1,317 @@
 /* ----------------------------------------------------------------------
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
   Original Version:
   http://lammps.sandia.gov, Sandia National Laboratories
   Steve Plimpton, sjplimp@sandia.gov 
   See the README file in the top-level LAMMPS directory. 
   ----------------------------------------------------------------------- 
   USER-CUDA Package and associated modifications:
   https://sourceforge.net/projects/lammpscuda/ 
   Christian Trott, christian.trott@tu-ilmenau.de
   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
   Theoretical Physics II, University of Technology Ilmenau, Germany 
   See the README file in the USER-CUDA directory. 
   This software is distributed under the GNU General Public License.
 ------------------------------------------------------------------------- */
 #ifdef CUDA
 #include "neighbor_cuda.h"
 #include "neigh_list.h"
 #include "atom.h"
 #include "domain.h"
 #include "group.h"
 #include "error.h"
 #include "cuda_neigh_list.h"
 #include "cuda.h"
 #include "neighbor_cu.h"
 #include <cmath>
 using namespace LAMMPS_NS;
 /* ----------------------------------------------------------------------
   binned build of a full neighbor list on the GPU
   every neighbor pair appears in list of both atoms i and j
   list = neighbor list to build; its cuda_list is registered with the
          Cuda object on first use
   steps: choose bin dimensions, bin the atoms (Cuda_BinAtoms), upload
          exclusion data if it changed, then call Cuda_NeighborBuildFullBin
          until it no longer reports an overflow of maxneighbors
 ------------------------------------------------------------------------- */
 void NeighborCuda::full_bin_cuda(NeighList *list)
 {
  MYDBG(printf(" # CUDA::NeighFullBinCuda ... start\n");)
  if(includegroup) error->warning("Warning using inlcudegroup neighborbuild. This is not yet supported by CUDA neighborbuild styles.\n");
  int nlocal = atom->nlocal;
  int nall = nlocal + atom->nghost;
  if(nlocal==0) return;
  // register the list BEFORE taking pointers into its cuda_list: the pointers
  // must refer to the object created by registerNeighborList(), otherwise a
  // first-time list would be dereferenced through a null pointer
  if(!list->cuda_list) cuda->registerNeighborList(list);
  CudaNeighList* clist=list->cuda_list;
  cuda_shared_neighlist* slist=&clist->sneighlist;
  clist->build_cuda=true;
  // with (nearly) no extra bin space configured, raise maxcut to the largest
  // pair cutoff; cutneighsq holds squared values, hence the sqrt afterwards
  if(slist->bin_extraspace<0.09)
  {
    for(int i=1;i<=atom->ntypes;i++)
    for(int j=1;j<=atom->ntypes;j++)
    {
      if(slist->maxcut<cutneighsq[i][j]) slist->maxcut=cutneighsq[i][j];
    }
    slist->maxcut=sqrt(slist->maxcut);
  }
  int bin_dim_tmp[3];
  int bin_nmax_tmp;
  // repeat the bin setup for as long as Cuda_BinAtoms() returns non-zero
  do
  {
    // enlarge maxcut by 20% until a bin offers room for at least 32 atoms
    do
    {
      for(int d=0;d<3;d++)
      {
        bin_dim_tmp[d]=static_cast <int> ((domain->subhi[d]-domain->sublo[d])/slist->maxcut);
        if(bin_dim_tmp[d]==0) bin_dim_tmp[d]+=1;
      }
      bin_nmax_tmp=static_cast <int> ((1.0+slist->bin_extraspace)*nlocal/(bin_dim_tmp[0]*bin_dim_tmp[1]*bin_dim_tmp[2]));
      // four additional bins per dimension
      // NOTE(review): presumably padding for ghost atoms -- confirm
      for(int d=0;d<3;d++) bin_dim_tmp[d]+=4;
      if(bin_nmax_tmp<32) slist->maxcut*=1.2;
    } while(bin_nmax_tmp<32);
    // re-allocate the device bin array only when its shape changed
    if((slist->bin_dim[0]!=bin_dim_tmp[0])||(slist->bin_dim[1]!=bin_dim_tmp[1])||(slist->bin_dim[2]!=bin_dim_tmp[2])||(slist->bin_nmax!=bin_nmax_tmp))
    {
      if(slist->binned_id!=NULL)
        CudaWrapper_FreeCudaData(slist->binned_id,slist->bin_dim[0]*slist->bin_dim[1]*slist->bin_dim[2]*slist->bin_nmax*sizeof(int));
      slist->bin_dim[0] = bin_dim_tmp[0];
      slist->bin_dim[1] = bin_dim_tmp[1];
      slist->bin_dim[2] = bin_dim_tmp[2];
      slist->bin_nmax = bin_nmax_tmp;
      slist->binned_id=(int*) CudaWrapper_AllocCudaData(slist->bin_dim[0]*slist->bin_dim[1]*slist->bin_dim[2]*slist->bin_nmax*sizeof(int));
    }
    // NOTE: a check rejecting bin_nmax>512 (very long pair cutoffs) used to
    // sit here in disabled form; no such limit is enforced
  }while(Cuda_BinAtoms(&cuda->shared_data, slist));
  int maxneighbors=slist->maxneighbors;
  // refresh the exclusion data when any of the exclusion counts changed
  if((nex_type!=slist->nex_type)||
  (nex_group!=slist->nex_group)||
  (nex_mol!=slist->nex_mol))
  {
    slist->nex_type=nex_type;
    slist->nex_group=nex_group;
    slist->nex_mol=nex_mol;
    if(nex_type)
    {
      delete clist->cu_ex_type;
      clist->cu_ex_type=new cCudaData<int , int , x> (&ex_type[0][0]   , & slist->ex_type     , (atom->ntypes+1)*(atom->ntypes+1) );
      clist->cu_ex_type->upload();
    }
    if(nex_group)
    {
      delete clist->cu_ex1_bit;
      clist->cu_ex1_bit=new cCudaData<int , int , x> (ex1_bit   , & slist->ex1_bit     , nex_group );
      clist->cu_ex1_bit->upload();
      delete clist->cu_ex2_bit;
      clist->cu_ex2_bit=new cCudaData<int , int , x> (ex2_bit   , & slist->ex2_bit     , nex_group );
      clist->cu_ex2_bit->upload();
    }
    if(nex_mol)
    {
      delete clist->cu_ex_mol_bit;
      clist->cu_ex_mol_bit=new cCudaData<int , int , x> (ex_mol_bit   , & slist->ex_mol_bit     , nex_mol );
      clist->cu_ex_mol_bit->upload();
    }
  }
  // build the list; a negative return value signals that maxneighbors was
  // too small, in which case the neighbor storage is enlarged and the build
  // is repeated
  int overflow = 0;
  do
  {
    overflow=0;
    clist->grow_device();
    slist->cutneighsq=cutneighsq;
    slist->maxneighbors=maxneighbors;
    slist->inum = list->inum = nlocal;
    if(cuda->shared_data.overlap_comm)
    {
      clist->inum_border=0;
      clist->cu_inum_border->upload();
    }
    cuda->shared_data.atom.nall=nall;
    overflow= Cuda_NeighborBuildFullBin(&cuda->shared_data, slist);
    if(overflow<0)
    {
      // grow by 32; if -overflow still exceeds that, jump to a multiple of 32
      // above -overflow
      maxneighbors+=32;
      if(-overflow>maxneighbors) maxneighbors=((-overflow+37)/32)*32;
      delete clist->cu_neighbors;
      delete [] clist->neighbors;
      clist->neighbors= new int[slist->maxlocal*maxneighbors];
      slist->maxneighbors=maxneighbors;
      clist->cu_neighbors= new cCudaData<int, int, x> (clist->neighbors , & slist->neighbors, slist->maxlocal*maxneighbors );
      if(cuda->shared_data.overlap_comm)
      {
        // re-create the device buffers for the new maxneighbors
        clist->dev_free();
        clist->dev_alloc();
      }
    }
    if(cuda->shared_data.overlap_comm)
    {
      clist->cu_inum_border->download();
      slist->inum_border2=clist->inum_border;
    }
  }
  while(overflow<0);
  // bring the per-atom neighbor counts and the atom index list back to the host
  clist->cu_numneigh->download();
  clist->cu_ilist->download();
  MYDBG(printf(" # CUDA::NeighFullBinCuda ... end\n");)
 }
 /* ----------------------------------------------------------------------
   placeholder for the retired N^2 GPU neighbor build
   only reports on stdout that the build style is unavailable; the list
   passed in is left untouched
   the former implementation (a retry loop around Cuda_NeighborBuildFullNsq
   that raised maxneighbors by 32 after each overflow) is disabled and can
   be found in the version control history
 ------------------------------------------------------------------------- */
 void NeighborCuda::full_nsq_cuda(NeighList *list)
 {
   fputs("Full_Nsq cuda neighbor list build is not implemented anymore.\n",stdout);
 }
 #endif
--- a/src/USER-CUDA/neighbor_cuda.cpp
+++ b/src/USER-CUDA/neighbor_cuda.cpp
@ -0,0 +1,221 @@
 /* ----------------------------------------------------------------------
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
   http://lammps.sandia.gov, Sandia National Laboratories
   Steve Plimpton, sjplimp@sandia.gov
   Copyright (2003) Sandia Corporation.  Under the terms of Contract
   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
   certain rights in this software.  This software is distributed under 
   the GNU General Public License.
   See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
 #include "math.h"
 #include "neighbor_cuda.h"
 #include "cuda.h"
 #include "atom.h"
 #include "atom_vec.h"
 #include "domain.h"
 #include "neigh_list.h"
 #include "neigh_request.h"
 #include "force.h"
 #include "group.h"
 #include "memory.h"
 #include "error.h"
 using namespace LAMMPS_NS;
 // function-like min/max helpers; each argument appears twice in the
 // expansion, so an argument with side effects is evaluated more than once
 #define MIN(a,b) ((a) < (b) ? (a) : (b))
 #define MAX(a,b) ((a) > (b) ? (a) : (b))
 // values compared against the style member in choose_build()
 enum{NSQ,BIN,MULTI};     // also in neigh_list.cpp
 /* ---------------------------------------------------------------------- */
 // construct the base Neighbor and keep the Cuda object owned by the LAMMPS instance
 NeighborCuda::NeighborCuda(LAMMPS *lmp) : Neighbor(lmp), cuda(lmp->cuda)
 {
 }
 /* ---------------------------------------------------------------------- */
 void NeighborCuda::init()
 {
   // pass the distance-check switch and (skin/2)^2 to the Cuda object
   // before the regular Neighbor initialization runs
   const double half_skin_sq = 0.25*skin*skin;
   cuda->set_neighinit(dist_check,half_skin_sq);
   cudable = 1;
   Neighbor::init();
 }
 /* ----------------------------------------------------------------------
   overwrite either full_nsq or full_bin with CUDA-equivalent methods
   any other neighbor build method is unchanged
 ------------------------------------------------------------------------- */
 void NeighborCuda::choose_build(int index, NeighRequest *rq)
 {
   // start from the build method the base class selects
   Neighbor::choose_build(index,rq);
   // only full, ghost-free, cudable requests are redirected
   if (!rq->full || rq->ghost != 0 || !rq->cudable) return;
   if (style == NSQ) {
     pair_build[index] = (Neighbor::PairPtr) &NeighborCuda::full_nsq_cuda;
   } else if (style == BIN) {
     pair_build[index] = (Neighbor::PairPtr) &NeighborCuda::full_bin_cuda;
   }
 }
 /* ---------------------------------------------------------------------- */
 int NeighborCuda::check_distance()
 {
  double delx,dely,delz,rsq;
  double delta,deltasq,delta1,delta2;
  if (boxcheck) {
    if (triclinic == 0) {
      delx = bboxlo[0] - boxlo_hold[0];
      dely = bboxlo[1] - boxlo_hold[1];
      delz = bboxlo[2] - boxlo_hold[2];
      delta1 = sqrt(delx*delx + dely*dely + delz*delz);
      delx = bboxhi[0] - boxhi_hold[0];
      dely = bboxhi[1] - boxhi_hold[1];
      delz = bboxhi[2] - boxhi_hold[2];
      delta2 = sqrt(delx*delx + dely*dely + delz*delz);
      delta = 0.5 * (skin - (delta1+delta2));
      deltasq = delta*delta;
    } else {
      domain->box_corners();
      delta1 = delta2 = 0.0;
      for (int i = 0; i < 8; i++) {
 	delx = corners[i][0] - corners_hold[i][0];
 	dely = corners[i][1] - corners_hold[i][1];
 	delz = corners[i][2] - corners_hold[i][2];
 	delta = sqrt(delx*delx + dely*dely + delz*delz);
 	if (delta > delta1) delta1 = delta;
 	else if (delta > delta2) delta2 = delta;
      }
      delta = 0.5 * (skin - (delta1+delta2));
      deltasq = delta*delta;
    }
  } else deltasq = triggersq;
  double **x = atom->x;
  int nlocal = atom->nlocal;
  if (includegroup) nlocal = atom->nfirst;
  int flag = 0;
  if (not cuda->neighbor_decide_by_integrator) {
    cuda->cu_x_download();
    for (int i = 0; i < nlocal; i++) {
      delx = x[i][0] - xhold[i][0];
      dely = x[i][1] - xhold[i][1];
      delz = x[i][2] - xhold[i][2];
      rsq = delx*delx + dely*dely + delz*delz;
      if (rsq > deltasq) flag = 1;
    }
  }
  else flag = cuda->shared_data.atom.reneigh_flag;
  int flagall;
  MPI_Allreduce(&flag,&flagall,1,MPI_INT,MPI_MAX,world);
  if (flagall && ago == MAX(every,delay)) ndanger++;
  return flagall;
 }
 /* ---------------------------------------------------------------------- */
 void NeighborCuda::build()
 {
  int i;
  ago = 0;
  ncalls++;
  // store current atom positions and box size if needed
  if (dist_check) {
    if (cuda->decide_by_integrator())
      cuda->update_xhold(maxhold, &xhold[0][0]);
    else {
      if (cuda->finished_setup) cuda->cu_x_download();
      double **x = atom->x;
      int nlocal = atom->nlocal;
      if (includegroup) nlocal = atom->nfirst;
      if (nlocal > maxhold) {
 	maxhold = atom->nmax;
 	memory->destroy(xhold);
 	memory->create(xhold,maxhold,3,"neigh:xhold");
      }
      for (i = 0; i < nlocal; i++) {
 	xhold[i][0] = x[i][0];
 	xhold[i][1] = x[i][1];
 	xhold[i][2] = x[i][2];
      }
      if (boxcheck) {
 	if (triclinic == 0) {
 	  boxlo_hold[0] = bboxlo[0];
 	  boxlo_hold[1] = bboxlo[1];
 	  boxlo_hold[2] = bboxlo[2];
 	  boxhi_hold[0] = bboxhi[0];
 	  boxhi_hold[1] = bboxhi[1];
 	  boxhi_hold[2] = bboxhi[2];
 	} else {
 	  domain->box_corners();
 	  corners = domain->corners;
 	  for (i = 0; i < 8; i++) {
 	    corners_hold[i][0] = corners[i][0];
 	    corners_hold[i][1] = corners[i][1];
 	    corners_hold[i][2] = corners[i][2];
 	  }
 	}
      }
    }
  }
  if (not cudable && cuda->finished_setup && atom->avec->cudable)
    cuda->downloadAll(); 
  if (cudable && (not cuda->finished_setup)) {
    cuda->checkResize();
    cuda->uploadAll();
  }
  // if any lists store neighbors of ghosts:
  // invoke grow() if nlocal+nghost exceeds previous list size
  // else only invoke grow() if nlocal exceeds previous list size
  // only done for lists with growflag set and which are perpetual
  if (anyghostlist && atom->nlocal+atom->nghost > maxatom) {
    maxatom = atom->nmax;
    for (i = 0; i < nglist; i++) lists[glist[i]]->grow(maxatom);
  } else if (atom->nlocal > maxatom) {
    maxatom = atom->nmax;
    for (i = 0; i < nglist; i++) lists[glist[i]]->grow(maxatom);
  }
  // extend atom bin list if necessary
  if (style != NSQ && atom->nmax > maxbin) {
    maxbin = atom->nmax;
    memory->destroy(bins);
    memory->create(bins,maxbin,"bins");
  }
  // check that neighbor list with special bond flags will not overflow
  if (atom->nlocal+atom->nghost > NEIGHMASK)
    error->one("Too many local+ghost atoms for neighbor list");
  // invoke building of pair and molecular neighbor lists
  // only for pairwise lists with buildflag set
  for (i = 0; i < nblist; i++)
    (this->*pair_build[blist[i]])(lists[blist[i]]);
  if (atom->molecular) {
    if (force->bond) (this->*bond_build)();
    if (force->angle) (this->*angle_build)();
    if (force->dihedral) (this->*dihedral_build)();
    if (force->improper) (this->*improper_build)();
  }
 }
--- a/src/USER-CUDA/neighbor_cuda.h
+++ b/src/USER-CUDA/neighbor_cuda.h
@ -0,0 +1,39 @@
 /* ----------------------------------------------------------------------
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
   http://lammps.sandia.gov, Sandia National Laboratories
   Steve Plimpton, sjplimp@sandia.gov
   Copyright (2003) Sandia Corporation.  Under the terms of Contract
   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
   certain rights in this software.  This software is distributed under 
   the GNU General Public License.
   See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
 #ifndef LMP_NEIGHBOR_CUDA_H
 #define LMP_NEIGHBOR_CUDA_H
 #include "neighbor.h"
 namespace LAMMPS_NS {
 // Neighbor subclass of the USER-CUDA package: redirects full neighbor
 // list builds to GPU build methods and adapts the distance check and
 // build() to data kept by the Cuda object.
 class NeighborCuda : public Neighbor {
  public:
   NeighborCuda(class LAMMPS *);
   // forwards dist_check and 0.25*skin*skin to the Cuda object, sets
   // cudable, then runs Neighbor::init()
   void init();
   // returns non-zero (reduced over all procs) if re-neighboring is needed
   int check_distance();
   // stores hold positions, syncs host/device data and builds all lists
   void build();
  private:
   class Cuda *cuda;
   // after Neighbor::choose_build(), replaces the build method of full,
   // ghost-free, cudable requests with full_nsq_cuda / full_bin_cuda
   void choose_build(int, class NeighRequest *);
   typedef void (NeighborCuda::*PairPtr)(class NeighList *);
   // only prints that this build style is no longer implemented
   void full_nsq_cuda(class NeighList *);
   // binned full neighbor list build executed through the Cuda_* routines
   void full_bin_cuda(class NeighList *);
 };
 }
 #endif
--- a/src/USER-CUDA/verlet_cuda.cpp
+++ b/src/USER-CUDA/verlet_cuda.cpp
--- a/src/USER-CUDA/verlet_cuda.h
+++ b/src/USER-CUDA/verlet_cuda.h
@ -0,0 +1,63 @@
 /* ----------------------------------------------------------------------
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
   Original Version:
   http://lammps.sandia.gov, Sandia National Laboratories
   Steve Plimpton, sjplimp@sandia.gov 
   See the README file in the top-level LAMMPS directory. 
   ----------------------------------------------------------------------- 
   USER-CUDA Package and associated modifications:
   https://sourceforge.net/projects/lammpscuda/ 
   Christian Trott, christian.trott@tu-ilmenau.de
   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
   Theoretical Physics II, University of Technology Ilmenau, Germany 
   See the README file in the USER-CUDA directory. 
   This software is distributed under the GNU General Public License.
 ------------------------------------------------------------------------- */
 #ifdef INTEGRATE_CLASS
 IntegrateStyle(verlet/cuda,VerletCuda)
 #else
 #ifndef LMP_VERLET_CUDA_H
 #define LMP_VERLET_CUDA_H
 #include "verlet.h"
 #include "modify_cuda.h"
 namespace LAMMPS_NS {
 // Integrator registered as style "verlet/cuda" (see the IntegrateStyle
 // entry in this header); derives from Verlet.
 // NOTE(review): setup(), setup_minimal(), run() and force_clear() presumably
 // replace the Verlet versions -- confirm against verlet.h / verlet_cuda.cpp.
 class VerletCuda : public Verlet
 {
 	public:
 		VerletCuda(class LAMMPS *, int, char **);
 		void setup();
 		void setup_minimal(int);
  		void run(int);
 		void test_atom(int atom,char* astring); // for debugging purposes
 		int dotestatom;	// for debugging purposes
 	protected:
 		class Cuda *cuda;
 		void force_clear();	
 	    // NOTE(review): presumably timings of the respective phases of a
 	    // run -- units and accumulation are not visible here, confirm
 	    double time_pair;
 	    double time_kspace;
 	    double time_comm;
 	    double time_modify;
 	    double time_fulliterate;
 	    // NOTE(review): presumably the Modify object viewed as ModifyCuda -- confirm
 	    ModifyCuda* modify_cuda;
 };
 }
 #endif
 #endif