diff --git a/src/USER-CUDA/Install.sh b/src/USER-CUDA/Install.sh
index c11e58b4f6..13cc0fd222 100755
--- a/src/USER-CUDA/Install.sh
+++ b/src/USER-CUDA/Install.sh
@@ -4,31 +4,189 @@
 
 if (test $1 = 1) then
 
-  if (test -e ../Makefile.package) then
-      sed -i -e '/include ..\/..\/lib\/cuda\/Makefile.common/d' ../Makefile.package
-      sed -i -e 's/-llammpscuda -lcuda -lcudart -lrt //' ../Makefile.package
-      sed -i -e 's/-I..\/..\/lib\/cuda -I$(CUDA_INSTALL_PATH)\/include //' ../Makefile.package
-      sed -i -e 's/-L..\/..\/lib\/cuda -L$(CUDA_INSTALL_PATH)\/lib64 -L$(CUDA_INSTALL_PATH)\/lib $(USRLIB_CONDITIONAL) -DLMP_USER_CUDA //' ../Makefile.package
-      sed -i '1 i include ..\/..\/lib\/cuda\/Makefile.common' ../Makefile.package
-      sed -i -e 's|^PKG_INC =[ \t]*|&-I..\/..\/lib\/cuda -I$(CUDA_INSTALL_PATH)\/include |' ../Makefile.package
-      sed -i -e 's|^PKG_PATH =[ \t]*|&-L..\/..\/lib\/cuda -L$(CUDA_INSTALL_PATH)\/lib64 -L$(CUDA_INSTALL_PATH)\/lib $(USRLIB_CONDITIONAL) |' ../Makefile.package
-      sed -i -e 's|^PKG_LIB =[ \t]*|&-llammpscuda -lcuda -lcudart -lrt |' ../Makefile.package
+  if (test ! -e ../Makefile.package) then
+    cp ../Makefile.package.empty ../Makefile.package
   fi
 
+  sed -i -e '/^include.*cuda.*$/d' ../Makefile.package
+  sed -i -e 's/[^ \t]*cuda[^ \t]* //g' ../Makefile.package
+  sed -i -e 's/[^ \t]*CUDA[^ \t]* //g' ../Makefile.package
+  sed -i -e 's/[^ \t]*lrt[^ \t]* //g' ../Makefile.package
+  sed -i '4 i include ..\/..\/lib\/cuda\/Makefile.common' ../Makefile.package
+  sed -i -e 's|^PKG_INC =[ \t]*|&-I..\/..\/lib\/cuda -DLMP_USER_CUDA |' ../Makefile.package
+  sed -i -e 's|^PKG_PATH =[ \t]*|&-L..\/..\/lib\/cuda |' ../Makefile.package
+  sed -i -e 's|^PKG_LIB =[ \t]*|&-llammpscuda |' ../Makefile.package
+  sed -i -e 's|^PKG_SYSINC =[ \t]*|&-I$(CUDA_INSTALL_PATH)\/include |' ../Makefile.package
+  sed -i -e 's|^PKG_SYSPATH =[ \t]*|&-L$(CUDA_INSTALL_PATH)\/lib64 -L$(CUDA_INSTALL_PATH)\/lib $(CUDA_USRLIB_CONDITIONAL) |' ../Makefile.package
+  sed -i -e 's|^PKG_SYSLIB =[ \t]*|&-lcuda -lcudart -lrt |' ../Makefile.package
+ 
+  if (test -e ../atom_vec_angle.cpp) then
+    cp atom_vec_angle_cuda.cpp ..
+    cp atom_vec_angle_cuda.h ..
+  fi
+
+  if (test -e ../atom_vec_full.cpp) then
+    cp atom_vec_full_cuda.cpp ..
+    cp atom_vec_full_cuda.h ..
+  fi
+
+  if (test -e ../fix_freeze.cpp) then
+    cp fix_freeze_cuda.cpp ..
+    cp fix_freeze_cuda.h ..
+  fi
+
+  if (test -e ../pair_born_coul_long.cpp) then
+    cp pair_born_coul_long_cuda.cpp ..
+    cp pair_born_coul_long_cuda.h ..
+  fi
+
+  if (test -e ../pair_buck_coul_long.cpp) then
+    cp pair_buck_coul_long_cuda.cpp ..
+    cp pair_buck_coul_long_cuda.h ..
+  fi
+
+  if (test -e ../pair_cg_cmm.cpp) then
+    cp pair_cg_cmm_cuda.cpp ..
+    cp pair_cg_cmm_coul_cut_cuda.cpp ..
+    cp pair_cg_cmm_coul_debye_cuda.cpp ..
+    cp pair_cg_cmm_cuda.h ..
+    cp pair_cg_cmm_coul_cut_cuda.h ..
+    cp pair_cg_cmm_coul_debye_cuda.h ..
+  fi
+
+  if (test -e ../pair_cg_cmm_coul_long.cpp) then
+    cp pair_cg_cmm_coul_long_cuda.cpp ..
+    cp pair_cg_cmm_coul_long_cuda.h ..
+  fi
+
+  if (test -e ../pppm.cpp) then
+    cp pppm_cuda.cpp ..
+    cp fft3d_cuda.cpp ..
+    cp fft3d_wrap_cuda.cpp ..
+    cp pppm_cuda.h ..
+    cp fft3d_cuda.h ..
+    cp fft3d_wrap_cuda.h ..
+    cp pair_lj_cut_coul_long_cuda.cpp ..
+    cp pair_lj_cut_coul_long_cuda.h ..
+  fi
+  
+
+  if (test -e ../pair_eam.cpp) then
+    cp pair_eam_alloy_cuda.cpp ..
+    cp pair_eam_cuda.cpp ..
+    cp pair_eam_fs_cuda.cpp ..
+    cp pair_eam_alloy_cuda.h ..
+    cp pair_eam_cuda.h ..
+    cp pair_eam_fs_cuda.h ..
+  fi
+
+  if (test -e ../pair_gran_hooke.cpp) then
+    cp pair_gran_hooke_cuda.cpp ..
+    cp pair_gran_hooke_cuda.h ..
+  fi
+
+  if (test -e ../pair_lj_charmm_coul_charmm.cpp) then
+    cp pair_lj_charmm_coul_charmm_cuda.cpp ..
+    cp pair_lj_charmm_coul_charmm_implicit_cuda.cpp ..
+    cp pair_lj_charmm_coul_charmm_cuda.h ..
+    cp pair_lj_charmm_coul_charmm_implicit_cuda.h ..
+    if (test -e ../pair_lj_charmm_coul_long.cpp) then  
+      cp pair_lj_charmm_coul_long_cuda.cpp ..
+      cp pair_lj_charmm_coul_long_cuda.h ..
+    fi
+  fi
+
+  if (test -e ../pair_lj_class2.cpp) then
+    cp pair_lj_class2_coul_cut_cuda.cpp ..
+    cp pair_lj_class2_cuda.cpp ..
+    cp pair_lj_class2_coul_cut_cuda.h ..
+    cp pair_lj_class2_cuda.h ..
+    if (test -e ../pair_lj_class2_coul_long.cpp) then
+      cp pair_lj_class2_coul_long_cuda.cpp ..
+      cp pair_lj_class2_coul_long_cuda.h ..
+    fi
+  fi 
+
+  cp atom_vec_atomic_cuda.cpp ..
+  cp atom_vec_charge_cuda.cpp ..
   cp comm_cuda.cpp ..
+  cp compute_pe_cuda.cpp ..
+  cp compute_pressure_cuda.cpp ..
+  cp compute_temp_cuda.cpp ..
+  cp compute_temp_partial_cuda.cpp ..
   cp domain_cuda.cpp ..
+  cp fix_addforce_cuda.cpp ..
+  cp fix_aveforce_cuda.cpp ..
+  cp fix_enforce2d_cuda.cpp ..
+  cp fix_gravity_cuda.cpp ..
+  cp fix_nh_cuda.cpp ..
+  cp fix_npt_cuda.cpp ..
+  cp fix_nve_cuda.cpp ..
+  cp fix_nvt_cuda.cpp ..
+  cp fix_set_force_cuda.cpp ..
+  cp fix_shake_cuda.cpp ..
+  cp fix_temp_berendsen_cuda.cpp ..
+  cp fix_temp_rescale_cuda.cpp ..
+  cp fix_temp_rescale_limit_cuda.cpp ..
+  cp fix_viscous_cuda.cpp ..
   cp modify_cuda.cpp ..
   cp neighbor_cuda.cpp ..
   cp neigh_full_cuda.cpp ..
+  cp pair_buck_coul_cut_cuda.cpp ..
+  cp pair_buck_cuda.cpp ..
+  cp pair_lj96_cut_cuda.cpp ..
+  cp pair_lj_cut_coul_cut_cuda.cpp ..
+  cp pair_lj_cut_coul_debye_cuda.cpp ..
+  cp pair_lj_cut_cuda.cpp ..
+  cp pair_lj_cut_experimental_cuda.cpp ..
+  cp pair_lj_expand_cuda.cpp ..
+  cp pair_lj_gromacs_coul_gromacs_cuda.cpp ..
+  cp pair_lj_gromacs_cuda.cpp ..
+  cp pair_lj_smooth_cuda.cpp ..
+  cp pair_morse_cuda.cpp ..
+  if (test -e ../pppm.cpp) then cp pppm_cuda.cpp ..; fi  # pppm_cuda.h is only copied when ../pppm.cpp exists, so the .cpp must follow the same condition
   cp verlet_cuda.cpp ..
 
   cp cuda.cpp ..
   cp cuda_neigh_list.cpp ..
 
+  cp atom_vec_atomic_cuda.h ..
+  cp atom_vec_charge_cuda.h ..
   cp comm_cuda.h ..
+  cp compute_pe_cuda.h ..
+  cp compute_pressure_cuda.h ..
+  cp compute_temp_cuda.h ..
+  cp compute_temp_partial_cuda.h ..
   cp domain_cuda.h ..
+  cp fix_addforce_cuda.h ..
+  cp fix_aveforce_cuda.h ..
+  cp fix_enforce2d_cuda.h ..
+  cp fix_gravity_cuda.h ..
+  cp fix_nh_cuda.h ..
+  cp fix_npt_cuda.h ..
+  cp fix_nve_cuda.h ..
+  cp fix_nvt_cuda.h ..
+  cp fix_set_force_cuda.h ..
+  cp fix_shake_cuda.h ..
+  cp fix_temp_berendsen_cuda.h ..
+  cp fix_temp_rescale_cuda.h ..
+  cp fix_temp_rescale_limit_cuda.h ..
+  cp fix_viscous_cuda.h ..
   cp modify_cuda.h ..
   cp neighbor_cuda.h ..
+  cp pair_buck_coul_cut_cuda.h ..
+  cp pair_buck_cuda.h ..
+
+  cp pair_lj96_cut_cuda.h ..
+  cp pair_lj_cut_coul_cut_cuda.h ..
+  cp pair_lj_cut_coul_debye_cuda.h ..
+  cp pair_lj_cut_cuda.h ..
+  cp pair_lj_cut_experimental_cuda.h ..
+  cp pair_lj_expand_cuda.h ..
+  cp pair_lj_gromacs_coul_gromacs_cuda.h ..
+  cp pair_lj_gromacs_cuda.h ..
+  cp pair_lj_smooth_cuda.h ..
+  cp pair_morse_cuda.h ..
   cp verlet_cuda.h ..
 
   cp cuda.h ..
@@ -42,26 +200,136 @@ if (test $1 = 1) then
 elif (test $1 = 0) then
 
   if (test -e ../Makefile.package) then
-    sed -i -e '/include ..\/..\/lib\/cuda\/Makefile.common/d' ../Makefile.package
-    sed -i -e 's/-llammpscuda -lcuda -lcudart -lrt //' ../Makefile.package
-    sed -i -e 's/-I..\/..\/lib\/cuda -I$(CUDA_INSTALL_PATH)\/include //' ../Makefile.package
-    sed -i -e 's/-L..\/..\/lib\/cuda -L$(CUDA_INSTALL_PATH)\/lib64 -L$(CUDA_INSTALL_PATH)\/lib $(USRLIB_CONDITIONAL) -DLMP_USER_CUDA //' ../Makefile.package
+    sed -i -e '/^include.*cuda.*$/d' ../Makefile.package
+    sed -i -e 's/[^ \t]*cuda[^ \t]* //g' ../Makefile.package
+    sed -i -e 's/[^ \t]*CUDA[^ \t]* //g' ../Makefile.package
+    sed -i -e 's/[^ \t]*lrt[^ \t]* //g' ../Makefile.package
   fi
 
+  rm ../atom_vec_angle_cuda.cpp
+  rm ../atom_vec_atomic_cuda.cpp
+  rm ../atom_vec_charge_cuda.cpp
+  rm ../atom_vec_full_cuda.cpp
   rm ../comm_cuda.cpp
+  rm ../compute_pe_cuda.cpp
+  rm ../compute_pressure_cuda.cpp
+  rm ../compute_temp_cuda.cpp
+  rm ../compute_temp_partial_cuda.cpp
   rm ../domain_cuda.cpp
+  rm ../fft3d_cuda.cpp
+  rm ../fft3d_wrap_cuda.cpp
+  rm ../fix_addforce_cuda.cpp
+  rm ../fix_aveforce_cuda.cpp
+  rm ../fix_enforce2d_cuda.cpp
+  rm ../fix_freeze_cuda.cpp
+  rm ../fix_gravity_cuda.cpp
+  rm ../fix_nh_cuda.cpp
+  rm ../fix_npt_cuda.cpp
+  rm ../fix_nve_cuda.cpp
+  rm ../fix_nvt_cuda.cpp
+  rm ../fix_set_force_cuda.cpp
+  rm ../fix_shake_cuda.cpp
+  rm ../fix_temp_berendsen_cuda.cpp
+  rm ../fix_temp_rescale_cuda.cpp
+  rm ../fix_temp_rescale_limit_cuda.cpp
+  rm ../fix_viscous_cuda.cpp
   rm ../modify_cuda.cpp
   rm ../neighbor_cuda.cpp
   rm ../neigh_full_cuda.cpp
+  rm ../pair_born_coul_long_cuda.cpp
+  rm ../pair_buck_coul_cut_cuda.cpp
+  rm ../pair_buck_coul_long_cuda.cpp
+  rm ../pair_buck_cuda.cpp
+  rm ../pair_cg_cmm_coul_cut_cuda.cpp
+  rm ../pair_cg_cmm_coul_debye_cuda.cpp
+  rm ../pair_cg_cmm_coul_long_cuda.cpp
+  rm ../pair_cg_cmm_cuda.cpp
+  rm ../pair_eam_alloy_cuda.cpp
+  rm ../pair_eam_cuda.cpp
+  rm ../pair_eam_fs_cuda.cpp
+  rm ../pair_gran_hooke_cuda.cpp
+  rm ../pair_lj96_cut_cuda.cpp
+  rm ../pair_lj_charmm_coul_charmm_cuda.cpp
+  rm ../pair_lj_charmm_coul_charmm_implicit_cuda.cpp
+  rm ../pair_lj_charmm_coul_long_cuda.cpp
+  rm ../pair_lj_class2_coul_cut_cuda.cpp
+  rm ../pair_lj_class2_coul_long_cuda.cpp
+  rm ../pair_lj_class2_cuda.cpp
+  rm ../pair_lj_cut_coul_cut_cuda.cpp
+  rm ../pair_lj_cut_coul_debye_cuda.cpp
+  rm ../pair_lj_cut_coul_long_cuda.cpp
+  rm ../pair_lj_cut_cuda.cpp
+  rm ../pair_lj_cut_experimental_cuda.cpp
+  rm ../pair_lj_expand_cuda.cpp
+  rm ../pair_lj_gromacs_coul_gromacs_cuda.cpp
+  rm ../pair_lj_gromacs_cuda.cpp
+  rm ../pair_lj_smooth_cuda.cpp
+  rm ../pair_morse_cuda.cpp
+  rm ../pppm_cuda.cpp
   rm ../verlet_cuda.cpp
 
   rm ../cuda.cpp
   rm ../cuda_neigh_list.cpp
 
+  rm ../atom_vec_angle_cuda.h
+  rm ../atom_vec_atomic_cuda.h
+  rm ../atom_vec_charge_cuda.h
+  rm ../atom_vec_full_cuda.h
   rm ../comm_cuda.h
+  rm ../compute_pe_cuda.h
+  rm ../compute_pressure_cuda.h
+  rm ../compute_temp_cuda.h
+  rm ../compute_temp_partial_cuda.h
   rm ../domain_cuda.h
+  rm ../fft3d_cuda.h
+  rm ../fft3d_wrap_cuda.h
+  rm ../fix_addforce_cuda.h
+  rm ../fix_aveforce_cuda.h
+  rm ../fix_enforce2d_cuda.h
+  rm ../fix_freeze_cuda.h
+  rm ../fix_gravity_cuda.h
+  rm ../fix_nh_cuda.h
+  rm ../fix_npt_cuda.h
+  rm ../fix_nve_cuda.h
+  rm ../fix_nvt_cuda.h
+  rm ../fix_set_force_cuda.h
+  rm ../fix_shake_cuda.h
+  rm ../fix_temp_berendsen_cuda.h
+  rm ../fix_temp_rescale_cuda.h
+  rm ../fix_temp_rescale_limit_cuda.h
+  rm ../fix_viscous_cuda.h
   rm ../modify_cuda.h
   rm ../neighbor_cuda.h
+  rm ../pair_born_coul_long_cuda.h
+  rm ../pair_buck_coul_cut_cuda.h
+  rm ../pair_buck_coul_long_cuda.h
+  rm ../pair_buck_cuda.h
+  rm ../pair_cg_cmm_coul_cut_cuda.h
+  rm ../pair_cg_cmm_coul_debye_cuda.h
+  rm ../pair_cg_cmm_coul_long_cuda.h
+  rm ../pair_cg_cmm_cuda.h
+  rm ../pair_eam_alloy_cuda.h
+  rm ../pair_eam_cuda.h
+  rm ../pair_eam_fs_cuda.h
+  rm ../pair_gran_hooke_cuda.h
+  rm ../pair_lj96_cut_cuda.h
+  rm ../pair_lj_charmm_coul_charmm_cuda.h
+  rm ../pair_lj_charmm_coul_charmm_implicit_cuda.h
+  rm ../pair_lj_charmm_coul_long_cuda.h
+  rm ../pair_lj_class2_coul_cut_cuda.h
+  rm ../pair_lj_class2_coul_long_cuda.h
+  rm ../pair_lj_class2_cuda.h
+  rm ../pair_lj_cut_coul_cut_cuda.h
+  rm ../pair_lj_cut_coul_debye_cuda.h
+  rm ../pair_lj_cut_coul_long_cuda.h
+  rm ../pair_lj_cut_cuda.h
+  rm ../pair_lj_cut_experimental_cuda.h
+  rm ../pair_lj_expand_cuda.h
+  rm ../pair_lj_gromacs_coul_gromacs_cuda.h
+  rm ../pair_lj_gromacs_cuda.h
+  rm ../pair_lj_smooth_cuda.h
+  rm ../pair_morse_cuda.h
+  rm ../pppm_cuda.h
   rm ../verlet_cuda.h
 
   rm ../cuda.h
diff --git a/src/USER-CUDA/atom_vec_angle_cuda.cpp b/src/USER-CUDA/atom_vec_angle_cuda.cpp
new file mode 100644
index 0000000000..3064533649
--- /dev/null
+++ b/src/USER-CUDA/atom_vec_angle_cuda.cpp
@@ -0,0 +1,476 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under 
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#include <cstdlib>
+#include <cstdio>
+#include <cstring>
+#include "atom_vec_angle_cuda.h"
+#include "comm_cuda_cu.h"
+#include "atom_vec_angle_cuda_cu.h"
+#include "atom.h"
+#include "domain.h"
+#include "modify.h"
+#include "fix.h"
+#include "memory.h"
+#include "error.h"
+#include "cuda.h"
+#include "universe.h"
+#include "comm.h"
+
+using namespace LAMMPS_NS;
+
+#define DELTA 10000
+#define BUFFACTOR 1.5
+#define BUFEXTRA 1000
+#define NCUDAEXCHANGE 12 //nextra x y z vx vy vz tag type mask image molecule
+
+#define BUF_FLOAT double
+/* ---------------------------------------------------------------------- */
+
+// Construct the angle atom style on top of AtomVecAngle and reset the CUDA exchange bookkeeping.
+AtomVecAngleCuda::AtomVecAngleCuda(LAMMPS *lmp, int narg, char **arg) :
+  AtomVecAngle(lmp, narg, arg),
+  cuda(lmp->cuda), cuda_init_done(false),
+  copylist(NULL), copylist2(NULL), cu_copylist(NULL), max_nsend(0)
+{
+  // a /cuda style cannot work without the Cuda object
+  if(!cuda)
+    error->all("You cannot use a /cuda class, without activating 'cuda' acceleration. Use no '-a' command line argument, or '-a cuda'."); 
+
+  // members not declared in this class cannot go into the initializer list
+  maxsend = 0;
+  cudable = true;
+
+}
+
+void AtomVecAngleCuda::grow_copylist(int new_max_nsend)  // re-create copylist, copylist2 and cu_copylist with new_max_nsend entries; previous contents are discarded
+{
+  delete cu_copylist;
+  delete [] copylist2;
+  if(copylist != NULL) CudaWrapper_FreePinnedHostData((void*) copylist);
+  max_nsend = new_max_nsend;
+  copylist2 = new int[new_max_nsend];
+  copylist = (int*) CudaWrapper_AllocPinnedHostData(new_max_nsend*sizeof(int),false);
+  cu_copylist = new cCudaData<int, int, xx > (copylist, new_max_nsend);  // wrapper is built over the pinned host list
+}
+
+void AtomVecAngleCuda::grow_send(int n,double** buf_send,int flag)  //grow the comm send buffer to BUFFACTOR*n+BUFEXTRA doubles (the array shall be copied from the gpu in whole); flag!=0 keeps the old contents
+{
+  int old_size=*maxsend+BUFEXTRA;  // number of doubles carried over when flag is set
+  *maxsend = static_cast<int> (BUFFACTOR * n);
+  int new_size=*maxsend+BUFEXTRA;
+  if (flag)
+  {
+    if(cuda->pinned)
+    {
+      double* grown = (double*) CudaWrapper_AllocPinnedHostData(new_size*sizeof(double),false);  // no realloc for pinned memory: allocate, copy, release
+      int ncopy = old_size<new_size ? old_size : new_size;  // never write past the end of the new buffer
+      if(*buf_send) memcpy((void*) grown,(void*) *buf_send,ncopy*sizeof(double));  // a NULL buffer has nothing to preserve
+      if(*buf_send) CudaWrapper_FreePinnedHostData((void*) (*buf_send));
+      *buf_send = grown;
+    }
+    else
+    {
+      *buf_send = (double *) 
+        memory->srealloc(*buf_send,new_size*sizeof(double),
+                         "comm:buf_send");
+    }
+  }
+  else {
+    if(cuda->pinned)
+    {
+      if(*buf_send) CudaWrapper_FreePinnedHostData((void*) (*buf_send));
+      *buf_send = (double*) CudaWrapper_AllocPinnedHostData(new_size*sizeof(double),false);
+    }
+    else
+    {
+      memory->sfree(*buf_send);
+      *buf_send = (double *) memory->smalloc(new_size*sizeof(double),
+                                             "comm:buf_send");
+    }
+  }
+}
+
+void AtomVecAngleCuda::grow_both(int n)  // run AtomVecAngle::grow(n); once setup has finished, wrap it in downloadAll() / checkResize() + uploadAll()
+{
+  // before setup has finished only the host-side grow is performed
+  if(cuda->finished_setup) cuda->downloadAll();
+
+  AtomVecAngle::grow(n);
+  if(not cuda->finished_setup) return;
+
+  cuda->checkResize();
+  cuda->uploadAll();
+}
+
+int AtomVecAngleCuda::pack_comm(int n, int* iswap, double *buf,
+                                int pbc_flag, int *pbc) //usually this should not be called since comm->communicate handles the communication if only positions are exchanged
+{
+  if(cuda->finished_setup && not cuda->oncpu)
+  {
+    int nvalues = Cuda_CommCuda_PackComm(&cuda->shared_data,n,*iswap,(void*) buf,pbc,pbc_flag);
+    const bool rescale = (sizeof(X_FLOAT)!=sizeof(double)) && nvalues;  // presumably converts a count of X_FLOAT values into a count of doubles - confirm
+    return rescale ? static_cast<int> ((nvalues+1)*sizeof(X_FLOAT)/sizeof(double)) : nvalues;
+  }
+  return AtomVecAngle::pack_comm(n,iswap,buf,pbc_flag,pbc);  // host fallback: setup not finished or running on the cpu
+}
+
+int AtomVecAngleCuda::pack_comm_vel(int n, int* iswap, double *buf,
+                                    int pbc_flag, int *pbc) //usually this should not be called since comm->communicate handles the communication if only positions are exchanged
+{
+  if(cuda->finished_setup && not cuda->oncpu)
+  {
+    int nvalues = Cuda_CommCuda_PackCommVel(&cuda->shared_data,n,*iswap,(void*) buf,pbc,pbc_flag);
+    const bool rescale = (sizeof(X_FLOAT)!=sizeof(double)) && nvalues;  // presumably converts a count of X_FLOAT values into a count of doubles - confirm
+    return rescale ? static_cast<int> ((nvalues+1)*sizeof(X_FLOAT)/sizeof(double)) : nvalues;
+  }
+  return AtomVecAngle::pack_comm_vel(n,iswap,buf,pbc_flag,pbc);  // host fallback: setup not finished or running on the cpu
+}
+
+/* ---------------------------------------------------------------------- */
+
+void AtomVecAngleCuda::unpack_comm(int n, int first, double *buf) //usually this should not be called since comm->communicate handles the communication if only positions are exchanged
+{
+  if(cuda->finished_setup && not cuda->oncpu)
+    Cuda_CommCuda_UnpackComm(&cuda->shared_data,n,first,(void*)buf);
+  else
+    AtomVecAngle::unpack_comm(n,first,buf);  // host fallback
+}
+
+void AtomVecAngleCuda::unpack_comm_vel(int n, int first, double *buf) //usually this should not be called since comm->communicate handles the communication if only positions are exchanged
+{
+  if(cuda->finished_setup && not cuda->oncpu)
+    Cuda_CommCuda_UnpackCommVel(&cuda->shared_data,n,first,(void*)buf);
+  else
+    AtomVecAngle::unpack_comm_vel(n,first,buf);  // host fallback
+}
+
+/* ---------------------------------------------------------------------- */
+
+int AtomVecAngleCuda::pack_reverse(int n, int first, double *buf) //usually this should not be called since comm->communicate handles the communication if only forces are exchanged
+{
+  if(not cuda->finished_setup || cuda->oncpu)
+    return AtomVecAngle::pack_reverse(n,first,buf);
+
+  // the forces are packed from the host array f, bracketed by cu_f download/upload
+  cuda->cu_f->download();
+
+  double *out = buf;
+  const int last = first + n;
+  for (int iatom = first; iatom < last; iatom++)
+    for (int d = 0; d < 3; d++)
+      *out++ = f[iatom][d];
+
+  cuda->cu_f->upload();
+  return static_cast<int> (out - buf);  // number of values written
+}
+
+/* ---------------------------------------------------------------------- */
+
+void AtomVecAngleCuda::unpack_reverse(int n, int *list, double *buf)//usually this should not be called since comm->communicate handles the communication if only forces are exchanged
+{
+  if(cuda->finished_setup && not cuda->oncpu)
+  {
+    cuda->cu_f->download();
+    // add the three received force components to each listed atom
+    const double *in = buf;
+    for (int k = 0; k < n; k++, in += 3)
+    {
+      const int j = list[k];
+      for (int d = 0; d < 3; d++) f[j][d] += in[d];
+    }
+    cuda->cu_f->upload();
+    return;
+  }
+  AtomVecAngle::unpack_reverse(n,list,buf);  // host fallback
+}
+
+/* ---------------------------------------------------------------------- */
+
+int AtomVecAngleCuda::pack_border(int n, int *iswap, double *buf,
+                                  int pbc_flag, int *pbc)  // pack border atoms into buf; returns the value reported by the packing routine used
+{
+  // device path once setup has finished and the run is not on the cpu
+  if(cuda->finished_setup && not cuda->oncpu)
+    return Cuda_AtomVecAngleCuda_PackBorder(&cuda->shared_data,n,*iswap,(void*) buf,pbc,pbc_flag);
+
+  return AtomVecAngle::pack_border(n,iswap,buf,pbc_flag,pbc);
+}
+
+int AtomVecAngleCuda::pack_border_vel(int n, int *iswap, double *buf,
+                                      int pbc_flag, int *pbc)  // velocity variant of pack_border; returns the value reported by the packing routine used
+{
+  const bool use_host = not cuda->finished_setup || cuda->oncpu;
+  if(use_host)
+    return AtomVecAngle::pack_border_vel(n,iswap,buf,pbc_flag,pbc);
+
+  // device path
+  return Cuda_AtomVecAngleCuda_PackBorderVel(&cuda->shared_data,n,*iswap,(void*) buf,pbc,pbc_flag);
+}
+
+/* ---------------------------------------------------------------------- */
+
+void AtomVecAngleCuda::unpack_border(int n, int first, double *buf)  // unpack n border atoms starting at index first; device path once setup has finished
+{
+  if(cuda->finished_setup && not cuda->oncpu)
+  {
+    while(cuda->shared_data.atom.nmax<=atom->nghost+atom->nlocal+n) grow_both(0);  //ensure there is enough space on device to unpack data
+    if(Cuda_AtomVecAngleCuda_UnpackBorder(&cuda->shared_data,n,first,(void*)buf))
+      printf(" # CUDA: Error: Failed to unpack Border atoms (This might be a bug).\n");
+    return;
+  }
+  AtomVecAngle::unpack_border(n,first,buf);  // host fallback
+}
+
+void AtomVecAngleCuda::unpack_border_vel(int n, int first, double *buf)  // velocity variant of unpack_border; device path once setup has finished
+{
+  if(cuda->finished_setup && not cuda->oncpu)
+  {
+    while(cuda->shared_data.atom.nmax<=atom->nghost+atom->nlocal+n) grow_both(0);  //ensure there is enough space on device to unpack data
+    if(Cuda_AtomVecAngleCuda_UnpackBorderVel(&cuda->shared_data,n,first,(void*)buf))
+      printf(" # CUDA: Error: Failed to unpack Border atoms (This might be a bug).\n");
+    return;
+  }
+  AtomVecAngle::unpack_border_vel(n,first,buf);  // host fallback
+}
+
+/* ----------------------------------------------------------------------
+   pack data for atom I for sending to another proc
+   xyz must be 1st 3 values, so comm::exchange() can test on them 
+------------------------------------------------------------------------- */
+
+
+int AtomVecAngleCuda::pack_exchange(int dim, double *buf)  // pack all atoms leaving along dim; NOTE: on the device path buf is reinterpreted as double** (see buf_pointer) so the buffer can be regrown here
+{
+  if(cuda->oncpu)
+  	return AtomVecAngle::pack_exchange(dim,buf);
+
+  if(not cuda_init_done||domain->box_change)  // (re)run the device-side init on first use and whenever the box changes
+  {
+  	Cuda_AtomVecAngleCuda_Init(&cuda->shared_data);
+  	cuda_init_done=true;
+  }
+  double** buf_pointer=(double**) buf;  // buf is treated as the address of the send-buffer pointer
+  if(*maxsend<atom->nghost || *buf_pointer==NULL)  // buffer missing or smaller than nghost: reallocate without preserving contents
+  {
+  	grow_send(atom->nghost>*maxsend?atom->nghost:*maxsend,buf_pointer,0);
+  	*maxsend=atom->nghost>*maxsend?atom->nghost:*maxsend;
+  }
+  
+  if(max_nsend==0) grow_copylist(200);  // first call: create the copy lists
+
+  int nsend_atoms = Cuda_AtomVecAngleCuda_PackExchangeList(&cuda->shared_data,*maxsend,dim,*buf_pointer);  // afterwards slots 1..nsend_atoms of the buffer are read below as local indices of leaving atoms
+  
+  if(nsend_atoms>max_nsend) grow_copylist(nsend_atoms+100);
+  if(nsend_atoms*NCUDAEXCHANGE>*maxsend)  // fixed-size part would not fit: regrow, then rebuild the list in the new buffer
+  {
+  	grow_send((int) (nsend_atoms+100)*NCUDAEXCHANGE,buf_pointer,0);
+  	Cuda_AtomVecAngleCuda_PackExchangeList(&cuda->shared_data,*maxsend,dim,*buf_pointer);
+  }
+
+  int nlocal=atom->nlocal-nsend_atoms;  // number of local atoms once the leaving ones are removed
+  
+  for(int i=0;i<nsend_atoms;i++) copylist2[i]=1;  // copylist2[k] describes tail slot nlocal+k: 1 = stays, -1 = leaves
+  for(int j=1;j<nsend_atoms+1;j++)
+  {
+  	int i = static_cast <int> ((*buf_pointer)[j]);
+  	if(i>=nlocal) copylist2[i-nlocal]=-1;
+  }
+  
+  int actpos=0;
+  for(int j=1;j<nsend_atoms+1;j++)  // for every leaving atom below nlocal pick the next staying tail atom to move into its slot
+  {
+  	int i = static_cast <int> ((*buf_pointer)[j]);
+  	if(i<nlocal) 
+  	{
+  	  while(copylist2[actpos]==-1) actpos++;
+    	  copylist[j-1]=nlocal+actpos;
+  	  actpos++;
+  	}
+  }
+  cu_copylist->upload();
+  
+  cuda->shared_data.atom.nlocal=nlocal;
+  
+  int m = Cuda_AtomVecAngleCuda_PackExchange(&cuda->shared_data,nsend_atoms,*buf_pointer,cu_copylist->dev_data());  // m is used below as the next free offset in the buffer
+  
+  timespec time1,time2;
+  clock_gettime(CLOCK_REALTIME,&time1);
+ 
+  double* buf_p=*buf_pointer;  // refreshed after every grow_send, which may move the buffer
+  for(int j=0;j<nsend_atoms;j++)  // append the variable-length per-atom data on the host
+  {
+    int i=static_cast <int> (buf_p[j+1]);
+    int nextra=0;  // number of doubles appended for this atom
+    int k;
+    buf_p[m++] = num_bond[i];
+    for (k = 0; k < num_bond[i]; k++) {
+      buf_p[m++] = bond_type[i][k];
+      buf_p[m++] = bond_atom[i][k];
+    }
+    nextra+=2*num_bond[i]+1;
+    if(m>*maxsend) {grow_send(m,buf_pointer,1); buf_p=*buf_pointer;}  // flag=1: keep what has been packed so far
+
+    buf_p[m++] = num_angle[i];
+    for (k = 0; k < num_angle[i]; k++) {
+      buf_p[m++] = angle_type[i][k];
+      buf_p[m++] = angle_atom1[i][k];
+      buf_p[m++] = angle_atom2[i][k];
+      buf_p[m++] = angle_atom3[i][k];
+    }
+    nextra+=4*num_angle[i]+1;
+    if(m>*maxsend) {grow_send(m,buf_pointer,1); buf_p=*buf_pointer;}
+
+    buf_p[m++] = nspecial[i][0];
+    buf_p[m++] = nspecial[i][1];
+    buf_p[m++] = nspecial[i][2];
+    for (k = 0; k < nspecial[i][2]; k++) buf_p[m++] = special[i][k];
+    nextra+=nspecial[i][2]+3;
+    if(m>*maxsend) {grow_send(m,buf_pointer,1); buf_p=*buf_pointer;}
+  
+    if (atom->nextra_grow)
+      for (int iextra = 0; iextra < atom->nextra_grow; iextra++) 
+      {
+        int dm= modify->fix[atom->extra_grow[iextra]]->pack_exchange(i,&buf_p[m]);
+        m+=dm;
+  		nextra+=dm;
+        if(i<nlocal)modify->fix[atom->extra_grow[iextra]]->copy_arrays(copylist[j],i);
+    	if(m>*maxsend) {grow_send(m,buf_pointer,1); buf_p=*buf_pointer;}
+      }
+
+    if(i<nlocal)AtomVecAngle::copy(copylist[j],i,1);  
+    (*buf_pointer)[j+1] = nextra;  // slot j+1 now holds the appended length instead of the atom index
+  }
+	  
+	  clock_gettime(CLOCK_REALTIME,&time2);
+	  cuda->shared_data.cuda_timings.comm_exchange_cpu_pack+=
+        time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000;
+
+  (*buf_pointer)[0] = nsend_atoms;  // slot 0 carries the number of packed atoms
+  atom->nlocal-=nsend_atoms;
+  cuda->shared_data.atom.update_nlocal=2;
+ //printf("End Pack Exchange\n");
+  if(m==1) return 0;  // m==1 is reported to the caller as an empty message
+  return m;
+}
+
+/* ---------------------------------------------------------------------- */
+
+int AtomVecAngleCuda::unpack_exchange(double *buf)  // unpack the atoms received in buf (layout as written by pack_exchange); returns the number of doubles consumed
+{
+  if(cuda->oncpu)
+    return AtomVecAngle::unpack_exchange(buf);
+
+  int dim=cuda->shared_data.exchange_dim;
+  if(domain->box_change)
+    Cuda_AtomVecAngleCuda_Init(&cuda->shared_data);
+
+  // cu_copylist is dereferenced unconditionally below but stays NULL until
+  // grow_copylist() has run once; allocate it here the same way
+  // pack_exchange() does, so an empty first message cannot hit a NULL pointer
+  if(max_nsend==0) grow_copylist(200);
+
+  int mfirst=0;
+  for(int pi=0;pi<(comm->procgrid[dim]>2?2:1);pi++)  // two consecutive messages are read from buf when procgrid[dim]>2, otherwise one
+  {
+    int nlocal = atom->nlocal;
+    int nsend_atoms=static_cast<int> (buf[0]);  // slot 0: number of atoms in this message
+    if(nsend_atoms>max_nsend) grow_copylist(nsend_atoms+100);
+
+    //ensure there is enough space on device to unpack data
+    if (nlocal+nsend_atoms+atom->nghost>=atom->nmax)
+      grow_both(nlocal+nsend_atoms*2+atom->nghost);
+
+    // after the download copylist[j] is the local index used for atom j of the
+    // message, or a value below zero if that atom is skipped (see loop below)
+    int naccept = Cuda_AtomVecAngleCuda_UnpackExchange(&cuda->shared_data,nsend_atoms,buf,cu_copylist->dev_data());
+    cu_copylist->download();
+    int m = nsend_atoms*NCUDAEXCHANGE + 1;  // variable-length data starts behind the fixed-size records
+    nlocal+=naccept;
+
+    timespec time1,time2;
+    clock_gettime(CLOCK_REALTIME,&time1);
+
+    for(int j=0;j<nsend_atoms;j++)
+    {
+      if(copylist[j]>-1)
+      {
+        int k;
+        int i=copylist[j];
+        num_bond[i] = static_cast<int> (buf[m++]);
+        for (k = 0; k < num_bond[i]; k++) {
+          bond_type[i][k] = static_cast<int> (buf[m++]);
+          bond_atom[i][k] = static_cast<int> (buf[m++]);
+        }
+
+        num_angle[i] = static_cast<int> (buf[m++]);
+        for (k = 0; k < num_angle[i]; k++) {
+          angle_type[i][k] = static_cast<int> (buf[m++]);
+          angle_atom1[i][k] = static_cast<int> (buf[m++]);
+          angle_atom2[i][k] = static_cast<int> (buf[m++]);
+          angle_atom3[i][k] = static_cast<int> (buf[m++]);
+        }
+
+        nspecial[i][0] = static_cast<int> (buf[m++]);
+        nspecial[i][1] = static_cast<int> (buf[m++]);
+        nspecial[i][2] = static_cast<int> (buf[m++]);
+        for (k = 0; k < nspecial[i][2]; k++)
+          special[i][k] = static_cast<int> (buf[m++]);
+
+        if (atom->nextra_grow)
+          for (int iextra = 0; iextra < atom->nextra_grow; iextra++)
+            m += modify->fix[atom->extra_grow[iextra]]->
+              unpack_exchange(i,&buf[m]);
+      }
+      else
+        m+=static_cast <int> (buf[j+1]);  // skipped atom: jump over its variable-length data (length stored in slot j+1)
+    }
+
+    clock_gettime(CLOCK_REALTIME,&time2);
+    cuda->shared_data.cuda_timings.comm_exchange_cpu_pack+=
+      time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000;
+
+    cuda->shared_data.atom.nlocal=nlocal;
+    cuda->shared_data.atom.update_nlocal=2;
+    atom->nlocal=nlocal;
+    mfirst+=m;
+    buf=&buf[m];  // advance to the next message
+  }
+  return mfirst;
+}
+
+
+
diff --git a/src/USER-CUDA/atom_vec_angle_cuda.h b/src/USER-CUDA/atom_vec_angle_cuda.h
new file mode 100644
index 0000000000..0687058aca
--- /dev/null
+++ b/src/USER-CUDA/atom_vec_angle_cuda.h
@@ -0,0 +1,69 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#ifdef ATOM_CLASS
+
+AtomStyle(angle/cuda,AtomVecAngleCuda)
+
+#else
+
+#ifndef LMP_ATOM_VEC_ANGLE_CUDA_H
+#define LMP_ATOM_VEC_ANGLE_CUDA_H
+
+#include "atom_vec_angle.h"
+#include "cuda_data.h"
+
+namespace LAMMPS_NS {
+
+class AtomVecAngleCuda : public AtomVecAngle {  // variant of AtomVecAngle that routes packing/unpacking through the Cuda object
+ public:
+  AtomVecAngleCuda(class LAMMPS *, int, char **);
+  virtual ~AtomVecAngleCuda() {}  // NOTE(review): empty - confirm that copylist, copylist2 and cu_copylist are released elsewhere
+  void grow_copylist(int n);  // re-create the three copy lists with n entries
+  void grow_send(int n,double** buf_send,int flag);  // resize *buf_send; a non-zero flag keeps the old contents
+  void grow_both(int n);  // AtomVecAngle::grow(n) plus the matching Cuda download/resize/upload calls
+  int pack_comm(int, int *, double *, int, int *);
+  int pack_comm_vel(int, int *, double *, int, int *);
+  void unpack_comm(int, int, double *);
+  void unpack_comm_vel(int, int, double *);
+  int pack_reverse(int, int, double *);
+  void unpack_reverse(int, int *, double *);
+  int pack_border(int, int *, double *, int, int *);
+  int pack_border_vel(int, int *, double *, int, int *);
+  void unpack_border(int, int, double *);
+  void unpack_border_vel(int, int, double *);
+  int pack_exchange(int, double *);
+  int unpack_exchange(double *);
+ private:
+  class Cuda *cuda;  // taken from lmp->cuda in the constructor
+  bool cuda_init_done;  // set once Cuda_AtomVecAngleCuda_Init has been called from pack_exchange
+  int* copylist;  // pinned host array wrapped by cu_copylist
+  int* copylist2;  // host-only scratch list used by pack_exchange
+  cCudaData<int, int, xx >* cu_copylist;  // wrapper constructed over copylist in grow_copylist
+  int max_nsend;  // current number of entries in the copy lists (0 = not allocated yet)
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-CUDA/atom_vec_atomic_cuda.cpp b/src/USER-CUDA/atom_vec_atomic_cuda.cpp
new file mode 100644
index 0000000000..210d712db2
--- /dev/null
+++ b/src/USER-CUDA/atom_vec_atomic_cuda.cpp
@@ -0,0 +1,407 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under 
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#include <cstdlib>
+#include <cstdio>
+#include <cstring>
+#include "atom_vec_atomic_cuda.h"
+#include "comm_cuda_cu.h"
+#include "atom_vec_atomic_cuda_cu.h"
+#include "atom.h"
+#include "domain.h"
+#include "modify.h"
+#include "fix.h"
+#include "memory.h"
+#include "error.h"
+#include "cuda.h"
+#include "comm.h"
+
+using namespace LAMMPS_NS;
+
+#define DELTA 10000
+#define BUFFACTOR 1.5
+#define BUFEXTRA 1000
+#define NCUDAEXCHANGE 11 //nextra x y z vx vy vz tag type mask image
+
+
+#define BUF_FLOAT double
+/* ---------------------------------------------------------------------- */
+
+AtomVecAtomicCuda::AtomVecAtomicCuda(LAMMPS *lmp, int narg, char **arg) :
+  AtomVecAtomic(lmp, narg, arg)
+{
+   cuda = lmp->cuda;
+   if(cuda == NULL)
+        error->all("You cannot use a /cuda class, without activating 'cuda' acceleration. Use no '-a' command line argument, or '-a cuda'.");
+
+   maxsend=0;
+   cudable=true;
+   cuda_init_done=false;
+   max_nsend=0;
+   cu_copylist=NULL;
+   copylist=NULL;
+   copylist2=NULL;
+}
+
+// Throw away the copy lists (contents are not kept) and reallocate them for new_max_nsend entries.
+void AtomVecAtomicCuda::grow_copylist(int new_max_nsend) {
+  delete cu_copylist;                                            // the wrapper goes before the host array it was built on
+  if(copylist) CudaWrapper_FreePinnedHostData((void*) copylist);
+  delete [] copylist2;
+  max_nsend   = new_max_nsend;
+  copylist    = (int*) CudaWrapper_AllocPinnedHostData(max_nsend*sizeof(int),false);
+  copylist2   = new int[max_nsend];
+  cu_copylist = new cCudaData<int, int, xx > (copylist, max_nsend);
+}
+
+/* Resize the send buffer *buf_send to BUFFACTOR*n + BUFEXTRA doubles and store
+   BUFFACTOR*n in *maxsend.  A nonzero flag keeps the old contents, flag == 0
+   discards them.  With cuda->pinned the buffer uses the CudaWrapper pinned
+   allocator, otherwise the memory-> allocator. */
+void AtomVecAtomicCuda::grow_send(int n,double** buf_send,int flag)
+{
+  const int old_size = *maxsend+BUFEXTRA;
+  *maxsend = static_cast<int> (BUFFACTOR * n);
+  const int new_size = *maxsend+BUFEXTRA;
+
+  if(!cuda->pinned)
+  {
+    if(flag)
+      *buf_send = (double *) memory->srealloc(*buf_send,new_size*sizeof(double),"comm:buf_send");
+    else
+    {
+      memory->sfree(*buf_send);
+      *buf_send = (double *) memory->smalloc(new_size*sizeof(double),"comm:buf_send");
+    }
+    return;
+  }
+
+  // the pinned path has no realloc call: stage the old contents on the heap, but only
+  // when an old buffer exists, and never copy more than the smaller buffer holds
+  const int ncopy = (flag && *buf_send) ? (old_size<new_size ? old_size : new_size) : 0;
+  double* saved = NULL;
+  if(ncopy > 0)
+  {
+    saved = new double[ncopy];
+    memcpy(saved,*buf_send,ncopy*sizeof(double));
+  }
+  if(*buf_send) CudaWrapper_FreePinnedHostData((void*) (*buf_send));
+  *buf_send = (double*) CudaWrapper_AllocPinnedHostData(new_size*sizeof(double),false);
+  if(ncopy > 0) memcpy(*buf_send,saved,ncopy*sizeof(double));
+  delete [] saved;
+}
+
+// Grow the host arrays via AtomVecAtomic::grow(n); once setup has finished, the call is
+// bracketed by cuda->downloadAll() before and checkResize() + uploadAll() after.
+void AtomVecAtomicCuda::grow_both(int n) {
+  if(cuda->finished_setup) cuda->downloadAll();
+
+  AtomVecAtomic::grow(n);
+
+  if(!cuda->finished_setup) return;
+  cuda->checkResize();
+  cuda->uploadAll();
+}
+
+// pack_comm: base-class host path until setup has finished or when running on the CPU, else Cuda_CommCuda_PackComm.
+int AtomVecAtomicCuda::pack_comm(int n, int* iswap, double *buf, int pbc_flag, int *pbc)
+{
+  const bool host_path = !cuda->finished_setup || cuda->oncpu;
+  if(host_path) return AtomVecAtomic::pack_comm(n,iswap,buf,pbc_flag,pbc);
+  int nvalues = Cuda_CommCuda_PackComm(&cuda->shared_data,n,*iswap,(void*) buf,pbc,pbc_flag);
+  // a nonzero count is rescaled by sizeof(X_FLOAT)/sizeof(double) when the two sizes differ
+  if(nvalues && (sizeof(X_FLOAT)!=sizeof(double)))
+    nvalues = (nvalues+1)*sizeof(X_FLOAT)/sizeof(double);
+  return nvalues;
+}
+
+// pack_comm_vel: base-class host path until setup has finished or when running on the CPU, else Cuda_CommCuda_PackCommVel.
+int AtomVecAtomicCuda::pack_comm_vel(int n, int* iswap, double *buf, int pbc_flag, int *pbc)
+{
+  const bool host_path = !cuda->finished_setup || cuda->oncpu;
+  if(host_path) return AtomVecAtomic::pack_comm_vel(n,iswap,buf,pbc_flag,pbc);
+  int nvalues = Cuda_CommCuda_PackCommVel(&cuda->shared_data,n,*iswap,(void*) buf,pbc,pbc_flag);
+  // a nonzero count is rescaled by sizeof(X_FLOAT)/sizeof(double) when the two sizes differ
+  if(nvalues && (sizeof(X_FLOAT)!=sizeof(double)))
+    nvalues = (nvalues+1)*sizeof(X_FLOAT)/sizeof(double);
+  return nvalues;
+}
+/* ---------------------------------------------------------------------- */
+
+// unpack_comm: hand buf to Cuda_CommCuda_UnpackComm once setup has finished and the run is not on the CPU.
+void AtomVecAtomicCuda::unpack_comm(int n, int first, double *buf) {
+  if(cuda->finished_setup && !cuda->oncpu)
+    Cuda_CommCuda_UnpackComm(&cuda->shared_data,n,first,(void*)buf);
+  else
+    AtomVecAtomic::unpack_comm(n,first,buf);  // host fallback in the base class
+}
+
+// unpack_comm_vel: hand buf to Cuda_CommCuda_UnpackCommVel once setup has finished and the run is not on the CPU.
+void AtomVecAtomicCuda::unpack_comm_vel(int n, int first, double *buf) {
+  if(cuda->finished_setup && !cuda->oncpu)
+    Cuda_CommCuda_UnpackCommVel(&cuda->shared_data,n,first,(void*)buf);
+  else
+    AtomVecAtomic::unpack_comm_vel(n,first,buf);  // host fallback in the base class
+}
+/* ---------------------------------------------------------------------- */
+
+// pack_reverse: write f[i][0..2] for i in [first, first+n) to buf; returns the number of values written.
+int AtomVecAtomicCuda::pack_reverse(int n, int first, double *buf)
+{
+  if(!cuda->finished_setup || cuda->oncpu)
+    return AtomVecAtomic::pack_reverse(n,first,buf);
+
+  // NOTE(review): f is read without a cuda->cu_f->download(), unlike
+  // AtomVecChargeCuda::pack_reverse -- confirm that f is current here
+  int nvalues = 0;
+  const int last = first + n;
+  for (int iatom = first; iatom < last; iatom++) {
+    for (int d = 0; d < 3; d++)
+      buf[nvalues++] = f[iatom][d];
+  }
+  return nvalues;
+}
+
+/* ---------------------------------------------------------------------- */
+
+// unpack_reverse: add three values per entry of list from buf onto f[list[k]].
+void AtomVecAtomicCuda::unpack_reverse(int n, int *list, double *buf)
+{
+  if(!cuda->finished_setup || cuda->oncpu) {
+    AtomVecAtomic::unpack_reverse(n,list,buf);
+    return;
+  }
+
+  // NOTE(review): no cu_f download/upload here, unlike AtomVecChargeCuda::unpack_reverse -- confirm
+  const double *src = buf;
+  for (int k = 0; k < n; k++, src += 3) {
+    const int j = list[k];
+    for (int d = 0; d < 3; d++) f[j][d] += src[d];
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+// pack_border: returns the base-class result on the host path, otherwise the value
+// returned by Cuda_AtomVecAtomicCuda_PackBorder (which receives *iswap, not the pointer).
+int AtomVecAtomicCuda::pack_border(int n, int *iswap, double *buf, int pbc_flag, int *pbc)
+{
+  // host path: setup not finished yet, or the run is on the CPU
+  const bool host_path = !cuda->finished_setup || cuda->oncpu;
+  if(host_path) return AtomVecAtomic::pack_border(n,iswap,buf,pbc_flag,pbc);
+
+  return Cuda_AtomVecAtomicCuda_PackBorder(&cuda->shared_data,n,*iswap,(void*) buf,pbc,pbc_flag);
+}
+
+// pack_border_vel: returns the base-class result on the host path, otherwise the value
+// returned by Cuda_AtomVecAtomicCuda_PackBorderVel (which receives *iswap, not the pointer).
+int AtomVecAtomicCuda::pack_border_vel(int n, int *iswap, double *buf, int pbc_flag, int *pbc)
+{
+  // host path: setup not finished yet, or the run is on the CPU
+  const bool host_path = !cuda->finished_setup || cuda->oncpu;
+  if(host_path) return AtomVecAtomic::pack_border_vel(n,iswap,buf,pbc_flag,pbc);
+
+  return Cuda_AtomVecAtomicCuda_PackBorderVel(&cuda->shared_data,n,*iswap,(void*) buf,pbc,pbc_flag);
+}
+/* ---------------------------------------------------------------------- */
+
+// unpack_border: on the device path, call grow_both(0) until nghost+nlocal+n is below
+// cuda->shared_data.atom.nmax, then unpack; a nonzero return is only reported via printf.
+void AtomVecAtomicCuda::unpack_border(int n, int first, double *buf)
+{
+  if(!cuda->finished_setup || cuda->oncpu) {
+    AtomVecAtomic::unpack_border(n,first,buf);
+    return;
+  }
+  while(atom->nghost+atom->nlocal+n >= cuda->shared_data.atom.nmax) grow_both(0);
+  if(Cuda_AtomVecAtomicCuda_UnpackBorder(&cuda->shared_data,n,first,(void*)buf))
+    printf(" # CUDA: Error: Failed to unpack Border atoms (This might be a bug).\n");
+}
+
+// unpack_border_vel: on the device path, call grow_both(0) until nghost+nlocal+n is below
+// cuda->shared_data.atom.nmax, then unpack; a nonzero return is only reported via printf.
+void AtomVecAtomicCuda::unpack_border_vel(int n, int first, double *buf) {
+  if(!cuda->finished_setup || cuda->oncpu) {
+    AtomVecAtomic::unpack_border_vel(n,first,buf);
+    return;
+  }
+  while(atom->nghost+atom->nlocal+n >= cuda->shared_data.atom.nmax) grow_both(0);
+  if(Cuda_AtomVecAtomicCuda_UnpackBorderVel(&cuda->shared_data,n,first,(void*)buf))
+    printf(" # CUDA: Error: Failed to unpack Border atoms (This might be a bug).\n");
+}
+/* ----------------------------------------------------------------------
+   pack data for atom I for sending to another proc
+   xyz must be 1st 3 values, so comm::exchange() can test on them 
+------------------------------------------------------------------------- */
+
+
+int AtomVecAtomicCuda::pack_exchange(int dim, double *buf) // device-path exchange pack for dimension dim; returns the number of doubles packed, or 0 when only the count word was written
+{
+  if(cuda->oncpu)
+  	return AtomVecAtomic::pack_exchange(dim,buf); // CPU mode: base-class implementation
+
+  if(not cuda_init_done||domain->box_change) // run the device init on first use and whenever domain->box_change is set
+  {
+  	Cuda_AtomVecAtomicCuda_Init(&cuda->shared_data);
+  	cuda_init_done=true;
+  }
+  double** buf_pointer=(double**) buf; // buf is reinterpreted as the address of the send-buffer pointer, so grow_send() can replace the buffer
+  if(*maxsend<atom->nghost || *buf_pointer==NULL)
+  {
+  	grow_send(atom->nghost>*maxsend?atom->nghost:*maxsend,buf_pointer,0);
+  	*maxsend=atom->nghost>*maxsend?atom->nghost:*maxsend; // evaluated with the *maxsend that grow_send() just stored
+  }
+  
+  if(max_nsend==0) grow_copylist(200); // first use: allocate the copy lists
+  
+  int nsend_atoms = Cuda_AtomVecAtomicCuda_PackExchangeList(&cuda->shared_data,*maxsend,dim,*buf_pointer); // number of atoms to send; the loops below read their indices from (*buf_pointer)[1..nsend_atoms]
+  
+  if(nsend_atoms>max_nsend) {grow_copylist(nsend_atoms+100);}
+  if(nsend_atoms*NCUDAEXCHANGE>*maxsend) // buffer too small for NCUDAEXCHANGE values per atom: grow it and build the list again
+  {
+  	grow_send((int) (nsend_atoms+100)*NCUDAEXCHANGE,buf_pointer,0);
+  	Cuda_AtomVecAtomicCuda_PackExchangeList(&cuda->shared_data,*maxsend,dim,*buf_pointer);
+  }
+  
+  int nlocal=atom->nlocal-nsend_atoms; // local atom count once the sent atoms are removed
+  
+  for(int i=0;i<nsend_atoms;i++) copylist2[i]=1; // copylist2[k] describes slot nlocal+k; 1 = not marked as sent
+  for(int j=1;j<nsend_atoms+1;j++)
+  {
+  	int i = static_cast <int> ((*buf_pointer)[j]);
+  	if(i>=nlocal) copylist2[i-nlocal]=-1; // slot i lies past the new end and is itself sent
+  }
+  
+  int actpos=0; // scan position within copylist2
+  for(int j=1;j<nsend_atoms+1;j++)
+  {
+  	int i = static_cast <int> ((*buf_pointer)[j]);
+  	if(i<nlocal) // sent atom below the new end: pair it with a slot past the end that is not sent
+  	{
+  	  while(copylist2[actpos]==-1) actpos++; // skip slots that are sent themselves
+    	  copylist[j-1]=nlocal+actpos;
+  	  actpos++;
+  	}
+  }
+  cu_copylist->upload(); // NOTE(review): entries of copylist for atoms with i>=nlocal are not written above -- confirm the device code ignores them
+  
+  cuda->shared_data.atom.nlocal=nlocal;
+  
+  int m = Cuda_AtomVecAtomicCuda_PackExchange(&cuda->shared_data,nsend_atoms,*buf_pointer,cu_copylist->dev_data()); // m = number of doubles in the buffer so far
+  if (atom->nextra_grow) // append the fix data of every sent atom behind the atom data
+  for(int j=0;j<nsend_atoms;j++)
+  {
+      int i=static_cast <int> ((*buf_pointer)[j+1]);
+      int nextra=0;
+      for (int iextra = 0; iextra < atom->nextra_grow; iextra++) {
+      	
+        int dm = modify->fix[atom->extra_grow[iextra]]->pack_exchange(i,&((*buf_pointer)[m])); // dm = number of values this fix wrote for atom i
+        m+=dm;
+  		nextra+=dm;
+        if(i<nlocal)modify->fix[atom->extra_grow[iextra]]->copy_arrays(copylist[j],i); // NOTE(review): presumably moves fix data from slot copylist[j] into slot i -- confirm the argument order of Fix::copy_arrays
+        if(m>*maxsend)  grow_send(m,buf_pointer,1); // flag 1: keep what has been packed so far
+      }
+      (*buf_pointer)[j+1] = nextra; // the index word of atom j now holds the length of its fix data
+      
+  }
+
+  (*buf_pointer)[0] = nsend_atoms; // count word at the start of the buffer
+  atom->nlocal-=nsend_atoms;
+  cuda->shared_data.atom.update_nlocal=2;
+
+  if(m==1) return 0;//m is at least 1 in cuda since buf[0] contains number of atoms
+  return m;
+}
+
+/* ---------------------------------------------------------------------- */
+
+/* Device-path unpack of the exchange data in buf; buf[0] holds the atom count
+   of a message.  One message is processed, or two back to back when
+   comm->procgrid[dim] > 2.  Returns the number of doubles consumed from buf. */
+int AtomVecAtomicCuda::unpack_exchange(double *buf)
+{
+  if(cuda->oncpu)
+    return AtomVecAtomic::unpack_exchange(buf);
+
+  const int dim = cuda->shared_data.exchange_dim;
+  if(domain->box_change)
+    Cuda_AtomVecAtomicCuda_Init(&cuda->shared_data);
+
+  const int nmessages = comm->procgrid[dim]>2 ? 2 : 1;
+  int mfirst = 0;
+  for(int pi = 0; pi < nmessages; pi++)
+  {
+    int nlocal = atom->nlocal;
+    const int nsend_atoms = static_cast<int> (buf[0]);
+    if(nsend_atoms > max_nsend) grow_copylist(nsend_atoms+100);
+
+    // grow via grow_both() first when the incoming atoms might not fit
+    if(nlocal+nsend_atoms+atom->nghost >= atom->nmax)
+      grow_both(nlocal+nsend_atoms*2+atom->nghost);
+
+    const int naccept = Cuda_AtomVecAtomicCuda_UnpackExchange(&cuda->shared_data,nsend_atoms,buf,cu_copylist->dev_data());
+    cu_copylist->download();
+    nlocal += naccept;
+
+    // m starts past the count word and the NCUDAEXCHANGE values per atom
+    int m = nsend_atoms*NCUDAEXCHANGE + 1;
+
+    // appended fix data is consumed only when atom->nextra_grow is nonzero
+    if(atom->nextra_grow)
+    {
+      for(int j = 0; j < nsend_atoms; j++)
+      {
+        if(copylist[j] > -1)
+        {
+          for(int iextra = 0; iextra < atom->nextra_grow; iextra++)
+            m += modify->fix[atom->extra_grow[iextra]]->unpack_exchange(copylist[j],&buf[m]);
+        }
+        else
+          m += static_cast<int> (buf[j+1]);  // skip; buf[j+1] is read as this atom's fix-data length
+      }
+    }
+
+    cuda->shared_data.atom.nlocal = nlocal;
+    cuda->shared_data.atom.update_nlocal = 2;
+    atom->nlocal = nlocal;
+    mfirst += m;
+    buf = &buf[m];   // advance to the next message
+  }
+  return mfirst;
+}
+
+
+
diff --git a/src/USER-CUDA/atom_vec_atomic_cuda.h b/src/USER-CUDA/atom_vec_atomic_cuda.h
new file mode 100644
index 0000000000..da6dfb4d3a
--- /dev/null
+++ b/src/USER-CUDA/atom_vec_atomic_cuda.h
@@ -0,0 +1,81 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under 
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+#ifdef ATOM_CLASS
+
+AtomStyle(atomic/cuda,AtomVecAtomicCuda)
+
+#else
+
+#ifndef LMP_ATOM_VEC_ATOMIC_CUDA_H
+#define LMP_ATOM_VEC_ATOMIC_CUDA_H
+
+#include "atom_vec_atomic.h"
+#include "cuda_data.h"
+
+namespace LAMMPS_NS {
+
+class AtomVecAtomicCuda : public AtomVecAtomic {  // atom style registered above under the name atomic/cuda
+ public:
+  AtomVecAtomicCuda(class LAMMPS *lmp, int narg, char **arg);
+  virtual ~AtomVecAtomicCuda() {}  // empty body: none of the pointer members below is released here
+  void grow_copylist(int n);                           // reallocates copylist, copylist2 and cu_copylist
+  void grow_send(int n, double **buf_send, int flag);  // resizes *buf_send; a nonzero flag keeps the old contents
+  void grow_both(int n);                               // AtomVecAtomic::grow(n) plus device download/upload after setup
+  int  pack_comm(int n, int *iswap, double *buf, int pbc_flag, int *pbc);
+  int  pack_comm_vel(int n, int *iswap, double *buf, int pbc_flag, int *pbc);
+  void unpack_comm(int n, int first, double *buf);
+  void unpack_comm_vel(int n, int first, double *buf);
+  int  pack_reverse(int n, int first, double *buf);
+  void unpack_reverse(int n, int *list, double *buf);
+  int  pack_border(int n, int *iswap, double *buf, int pbc_flag, int *pbc);
+  int  pack_border_vel(int n, int *iswap, double *buf, int pbc_flag, int *pbc);
+  void unpack_border(int n, int first, double *buf);
+  void unpack_border_vel(int n, int first, double *buf);
+  int  pack_exchange(int dim, double *buf);
+  int  unpack_exchange(double *buf);
+ private:
+  class Cuda *cuda;                        // lmp->cuda, checked against NULL in the constructor
+  bool cuda_init_done;                     // set after pack_exchange() has called Cuda_AtomVecAtomicCuda_Init
+  int *copylist;                           // pinned host list that cu_copylist is built on
+  int *copylist2;                          // host scratch list used by pack_exchange()
+  cCudaData<int, int, xx > *cu_copylist;   // provides upload()/download()/dev_data() for copylist
+  int max_nsend;                           // number of entries the copy lists were allocated for
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-CUDA/atom_vec_charge_cuda.cpp b/src/USER-CUDA/atom_vec_charge_cuda.cpp
new file mode 100644
index 0000000000..476846909a
--- /dev/null
+++ b/src/USER-CUDA/atom_vec_charge_cuda.cpp
@@ -0,0 +1,407 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under 
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#include <cstdlib>
+#include <cstdio>
+#include <cstring>
+#include "atom_vec_charge_cuda.h"
+#include "comm_cuda_cu.h"
+#include "atom_vec_charge_cuda_cu.h"
+#include "atom.h"
+#include "domain.h"
+#include "modify.h"
+#include "fix.h"
+#include "memory.h"
+#include "error.h"
+#include "cuda.h"
+#include "comm.h"
+
+using namespace LAMMPS_NS;
+
+#define DELTA 10000
+#define BUFFACTOR 1.5
+#define BUFEXTRA 1000
+#define NCUDAEXCHANGE 12 //nextra x y z vx vy vz tag type mask image q
+
+#define BUF_FLOAT double
+/* ---------------------------------------------------------------------- */
+
+AtomVecChargeCuda::AtomVecChargeCuda(LAMMPS *lmp, int narg, char **arg) :
+  AtomVecCharge(lmp, narg, arg)
+{
+   cuda = lmp->cuda;
+   if(cuda == NULL)
+        error->all("You cannot use a /cuda class, without activating 'cuda' acceleration. Use no '-a' command line argument, or '-a cuda'.");
+
+   maxsend=0;
+   cudable=true;
+   cuda_init_done=false;
+   max_nsend=0;
+   cu_copylist=NULL;
+   copylist=NULL;
+   copylist2=NULL;
+}
+
+// Throw away the copy lists (contents are not kept) and reallocate them for new_max_nsend entries.
+void AtomVecChargeCuda::grow_copylist(int new_max_nsend) {
+  delete cu_copylist;                                            // the wrapper goes before the host array it was built on
+  if(copylist) CudaWrapper_FreePinnedHostData((void*) copylist);
+  delete [] copylist2;
+  max_nsend   = new_max_nsend;
+  copylist    = (int*) CudaWrapper_AllocPinnedHostData(max_nsend*sizeof(int),false);
+  copylist2   = new int[max_nsend];
+  cu_copylist = new cCudaData<int, int, xx > (copylist, max_nsend);
+}
+
+/* Resize the send buffer *buf_send to BUFFACTOR*n + BUFEXTRA doubles and store
+   BUFFACTOR*n in *maxsend.  A nonzero flag keeps the old contents, flag == 0
+   discards them.  With cuda->pinned the buffer uses the CudaWrapper pinned
+   allocator, otherwise the memory-> allocator.  (Author's note: the comm send
+   buffer must be growable here because the array is copied from the GPU whole.) */
+void AtomVecChargeCuda::grow_send(int n,double** buf_send,int flag)
+{
+  const int old_size = *maxsend+BUFEXTRA;
+  *maxsend = static_cast<int> (BUFFACTOR * n);
+  const int new_size = *maxsend+BUFEXTRA;
+  if(!cuda->pinned)
+  {
+    if(flag)
+      *buf_send = (double *) memory->srealloc(*buf_send,new_size*sizeof(double),"comm:buf_send");
+    else
+    {
+      memory->sfree(*buf_send);
+      *buf_send = (double *) memory->smalloc(new_size*sizeof(double),"comm:buf_send");
+    }
+    return;
+  }
+
+  // the pinned path has no realloc call: stage the old contents on the heap, but only
+  // when an old buffer exists, and never copy more than the smaller buffer holds
+  const int ncopy = (flag && *buf_send) ? (old_size<new_size ? old_size : new_size) : 0;
+  double* saved = NULL;
+  if(ncopy > 0)
+  {
+    saved = new double[ncopy];
+    memcpy(saved,*buf_send,ncopy*sizeof(double));
+  }
+  if(*buf_send) CudaWrapper_FreePinnedHostData((void*) (*buf_send));
+  *buf_send = (double*) CudaWrapper_AllocPinnedHostData(new_size*sizeof(double),false);
+  if(ncopy > 0) memcpy(*buf_send,saved,ncopy*sizeof(double));
+  delete [] saved;
+}
+
+// Grow the host arrays via AtomVecCharge::grow(n); once setup has finished, the call is
+// bracketed by cuda->downloadAll() before and checkResize() + uploadAll() after.
+void AtomVecChargeCuda::grow_both(int n) {
+  if(cuda->finished_setup) cuda->downloadAll();
+
+  AtomVecCharge::grow(n);
+
+  if(!cuda->finished_setup) return;
+  cuda->checkResize();
+  cuda->uploadAll();
+}
+
+// pack_comm: base-class host path until setup has finished or when running on the CPU, else Cuda_CommCuda_PackComm.
+// (author's note: normally not called, comm->communicate handles the exchange when only positions are sent)
+int AtomVecChargeCuda::pack_comm(int n, int* iswap, double *buf, int pbc_flag, int *pbc)
+{
+  const bool host_path = !cuda->finished_setup || cuda->oncpu;
+  if(host_path) return AtomVecCharge::pack_comm(n,iswap,buf,pbc_flag,pbc);
+  int nvalues = Cuda_CommCuda_PackComm(&cuda->shared_data,n,*iswap,(void*) buf,pbc,pbc_flag);
+  if(nvalues && (sizeof(X_FLOAT)!=sizeof(double)))   // rescale a nonzero count when the two sizes differ
+    nvalues = (nvalues+1)*sizeof(X_FLOAT)/sizeof(double);
+  return nvalues;
+}
+
+// pack_comm_vel: base-class host path until setup has finished or when running on the CPU, else Cuda_CommCuda_PackCommVel.
+// (author's note: normally not called, comm->communicate handles the exchange when only positions are sent)
+int AtomVecChargeCuda::pack_comm_vel(int n, int* iswap, double *buf, int pbc_flag, int *pbc)
+{
+  const bool host_path = !cuda->finished_setup || cuda->oncpu;
+  if(host_path) return AtomVecCharge::pack_comm_vel(n,iswap,buf,pbc_flag,pbc);
+  int nvalues = Cuda_CommCuda_PackCommVel(&cuda->shared_data,n,*iswap,(void*) buf,pbc,pbc_flag);
+  if(nvalues && (sizeof(X_FLOAT)!=sizeof(double)))   // rescale a nonzero count when the two sizes differ
+    nvalues = (nvalues+1)*sizeof(X_FLOAT)/sizeof(double);
+  return nvalues;
+}
+
+/* ---------------------------------------------------------------------- */
+
+// unpack_comm: device unpack once setup has finished and the run is not on the CPU (author's note: normally not called).
+void AtomVecChargeCuda::unpack_comm(int n, int first, double *buf) {
+  if(cuda->finished_setup && !cuda->oncpu)
+    Cuda_CommCuda_UnpackComm(&cuda->shared_data,n,first,(void*)buf);
+  else
+    AtomVecCharge::unpack_comm(n,first,buf);  // host fallback in the base class
+}
+
+// unpack_comm_vel: device unpack once setup has finished and the run is not on the CPU (author's note: normally not called).
+void AtomVecChargeCuda::unpack_comm_vel(int n, int first, double *buf) {
+  if(cuda->finished_setup && !cuda->oncpu)
+    Cuda_CommCuda_UnpackCommVel(&cuda->shared_data,n,first,(void*)buf);
+  else
+    AtomVecCharge::unpack_comm_vel(n,first,buf);  // host fallback in the base class
+}
+
+/* ---------------------------------------------------------------------- */
+
+// pack_reverse: cu_f->download(), write f[i][0..2] for i in [first, first+n) to buf, cu_f->upload(),
+// return the number of values written (author's note: normally not called when only forces are sent).
+int AtomVecChargeCuda::pack_reverse(int n, int first, double *buf)
+{
+  if(!cuda->finished_setup || cuda->oncpu)
+    return AtomVecCharge::pack_reverse(n,first,buf);
+
+  cuda->cu_f->download();
+  int nvalues = 0;
+  const int last = first + n;
+  for (int iatom = first; iatom < last; iatom++) {
+    for (int d = 0; d < 3; d++)
+      buf[nvalues++] = f[iatom][d];
+  }
+  cuda->cu_f->upload();  // NOTE(review): f is not modified above -- confirm this upload is needed
+  return nvalues;
+}
+
+/* ---------------------------------------------------------------------- */
+
+// unpack_reverse: cu_f->download(), add three values per entry of list from buf onto
+// f[list[k]], then cu_f->upload() (author's note: normally not called, comm->communicate
+// handles the exchange when only forces are sent).
+void AtomVecChargeCuda::unpack_reverse(int n, int *list, double *buf)
+{
+  if(!cuda->finished_setup || cuda->oncpu) {
+    AtomVecCharge::unpack_reverse(n,list,buf);
+    return;
+  }
+  cuda->cu_f->download();
+  const double *src = buf;
+  for (int k = 0; k < n; k++, src += 3) {
+    const int j = list[k];
+    for (int d = 0; d < 3; d++) f[j][d] += src[d];
+  }
+  cuda->cu_f->upload();
+}
+
+/* ---------------------------------------------------------------------- */
+
+// pack_border: returns the base-class result on the host path, otherwise the value
+// returned by Cuda_AtomVecChargeCuda_PackBorder (which receives *iswap, not the pointer).
+int AtomVecChargeCuda::pack_border(int n, int *iswap, double *buf, int pbc_flag, int *pbc)
+{
+  // host path: setup not finished yet, or the run is on the CPU
+  const bool host_path = !cuda->finished_setup || cuda->oncpu;
+  if(host_path) return AtomVecCharge::pack_border(n,iswap,buf,pbc_flag,pbc);
+
+  return Cuda_AtomVecChargeCuda_PackBorder(&cuda->shared_data,n,*iswap,(void*) buf,pbc,pbc_flag);
+}
+
+// pack_border_vel: returns the base-class result on the host path, otherwise the value
+// returned by Cuda_AtomVecChargeCuda_PackBorderVel (which receives *iswap, not the pointer).
+int AtomVecChargeCuda::pack_border_vel(int n, int *iswap, double *buf, int pbc_flag, int *pbc)
+{
+  // host path: setup not finished yet, or the run is on the CPU
+  const bool host_path = !cuda->finished_setup || cuda->oncpu;
+  if(host_path) return AtomVecCharge::pack_border_vel(n,iswap,buf,pbc_flag,pbc);
+
+  return Cuda_AtomVecChargeCuda_PackBorderVel(&cuda->shared_data,n,*iswap,(void*) buf,pbc,pbc_flag);
+}
+
+/* ---------------------------------------------------------------------- */
+
+// unpack_border: on the device path, call grow_both(0) until nghost+nlocal+n is below cuda->shared_data.atom.nmax
+// (author's note: room on the device for the data), then unpack; a nonzero return is only reported via printf.
+void AtomVecChargeCuda::unpack_border(int n, int first, double *buf) {
+  if(!cuda->finished_setup || cuda->oncpu) {
+    AtomVecCharge::unpack_border(n,first,buf);
+    return;
+  }
+  while(atom->nghost+atom->nlocal+n >= cuda->shared_data.atom.nmax) grow_both(0);
+  if(Cuda_AtomVecChargeCuda_UnpackBorder(&cuda->shared_data,n,first,(void*)buf))
+    printf(" # CUDA: Error: Failed to unpack Border atoms (This might be a bug).\n");
+}
+
+// unpack_border_vel: on the device path, call grow_both(0) until nghost+nlocal+n is below cuda->shared_data.atom.nmax
+// (author's note: room on the device for the data), then unpack; a nonzero return is only reported via printf.
+void AtomVecChargeCuda::unpack_border_vel(int n, int first, double *buf) {
+  if(!cuda->finished_setup || cuda->oncpu) {
+    AtomVecCharge::unpack_border_vel(n,first,buf);
+    return;
+  }
+  while(atom->nghost+atom->nlocal+n >= cuda->shared_data.atom.nmax) grow_both(0);
+  if(Cuda_AtomVecChargeCuda_UnpackBorderVel(&cuda->shared_data,n,first,(void*)buf))
+    printf(" # CUDA: Error: Failed to unpack Border atoms (This might be a bug).\n");
+}
+
+/* ----------------------------------------------------------------------
+   pack data for atom I for sending to another proc
+   xyz must be 1st 3 values, so comm::exchange() can test on them 
+------------------------------------------------------------------------- */
+
+
+int AtomVecChargeCuda::pack_exchange(int dim, double *buf) // device-path exchange pack for dimension dim; returns the number of doubles packed, or 0 when only the count word was written
+{
+  if(cuda->oncpu)
+  	return AtomVecCharge::pack_exchange(dim,buf); // CPU mode: base-class implementation
+
+  if(not cuda_init_done||domain->box_change) // run the device init on first use and whenever domain->box_change is set
+  {
+  	Cuda_AtomVecChargeCuda_Init(&cuda->shared_data);
+  	cuda_init_done=true;
+  }
+  double** buf_pointer=(double**) buf; // buf is reinterpreted as the address of the send-buffer pointer, so grow_send() can replace the buffer
+  if(*maxsend<atom->nghost || *buf_pointer==NULL)
+  {
+  	grow_send(atom->nghost>*maxsend?atom->nghost:*maxsend,buf_pointer,0);
+  	*maxsend=atom->nghost>*maxsend?atom->nghost:*maxsend; // evaluated with the *maxsend that grow_send() just stored
+  }
+  
+  if(max_nsend==0) grow_copylist(200); // first use: allocate the copy lists
+
+  int nsend_atoms = Cuda_AtomVecChargeCuda_PackExchangeList(&cuda->shared_data,*maxsend,dim,*buf_pointer); // number of atoms to send; the loops below read their indices from (*buf_pointer)[1..nsend_atoms]
+  
+  if(nsend_atoms>max_nsend) grow_copylist(nsend_atoms+100);
+  if(nsend_atoms*NCUDAEXCHANGE>*maxsend) // buffer too small for NCUDAEXCHANGE values per atom: grow it and build the list again
+  {
+  	grow_send((int) (nsend_atoms+100)*NCUDAEXCHANGE,buf_pointer,0);
+  	Cuda_AtomVecChargeCuda_PackExchangeList(&cuda->shared_data,*maxsend,dim,*buf_pointer);
+  }
+  
+  int nlocal=atom->nlocal-nsend_atoms; // local atom count once the sent atoms are removed
+  
+  for(int i=0;i<nsend_atoms;i++) copylist2[i]=1; // copylist2[k] describes slot nlocal+k; 1 = not marked as sent
+  for(int j=1;j<nsend_atoms+1;j++)
+  {
+  	int i = static_cast <int> ((*buf_pointer)[j]);
+  	if(i>=nlocal) copylist2[i-nlocal]=-1; // slot i lies past the new end and is itself sent
+  }
+  
+  int actpos=0; // scan position within copylist2
+  for(int j=1;j<nsend_atoms+1;j++)
+  {
+  	int i = static_cast <int> ((*buf_pointer)[j]);
+  	if(i<nlocal) // sent atom below the new end: pair it with a slot past the end that is not sent
+  	{
+  	  while(copylist2[actpos]==-1) actpos++; // skip slots that are sent themselves
+    	  copylist[j-1]=nlocal+actpos;
+  	  actpos++;
+  	}
+  }
+  cu_copylist->upload(); // NOTE(review): entries of copylist for atoms with i>=nlocal are not written above -- confirm the device code ignores them
+    
+  cuda->shared_data.atom.nlocal=nlocal;
+  
+  int m = Cuda_AtomVecChargeCuda_PackExchange(&cuda->shared_data,nsend_atoms,*buf_pointer,cu_copylist->dev_data()); // m = number of doubles in the buffer so far
+  
+  if (atom->nextra_grow) // append the fix data of every sent atom behind the atom data
+  for(int j=0;j<nsend_atoms;j++)
+  {
+      int i=static_cast <int> ((*buf_pointer)[j+1]);
+      int nextra=0;
+      for (int iextra = 0; iextra < atom->nextra_grow; iextra++) {
+      	
+        int dm = modify->fix[atom->extra_grow[iextra]]->pack_exchange(i,&((*buf_pointer)[m])); // dm = number of values this fix wrote for atom i
+        m+=dm;
+  		nextra+=dm;
+        if(i<nlocal)modify->fix[atom->extra_grow[iextra]]->copy_arrays(copylist[j],i); // NOTE(review): presumably moves fix data from slot copylist[j] into slot i -- confirm the argument order of Fix::copy_arrays
+        if(m>*maxsend)  grow_send(m,buf_pointer,1); // flag 1: keep what has been packed so far
+      }
+      (*buf_pointer)[j+1] = nextra; // the index word of atom j now holds the length of its fix data
+  }
+
+  (*buf_pointer)[0] = nsend_atoms; // count word at the start of the buffer
+  atom->nlocal-=nsend_atoms;
+  cuda->shared_data.atom.update_nlocal=2;
+
+  if(m==1) return 0;//m is at least 1 in cuda since buf[0] contains number of atoms
+  return m;
+}
+
+/* ---------------------------------------------------------------------- */
+
+/* Device-path unpack of the exchange data in buf; buf[0] holds the atom count
+   of a message.  One message is processed, or two back to back when
+   comm->procgrid[dim] > 2.  Returns the number of doubles consumed from buf. */
+int AtomVecChargeCuda::unpack_exchange(double *buf)
+{
+  if(cuda->oncpu)
+    return AtomVecCharge::unpack_exchange(buf);
+
+  const int dim = cuda->shared_data.exchange_dim;
+  if(domain->box_change)
+    Cuda_AtomVecChargeCuda_Init(&cuda->shared_data);
+
+  const int nmessages = comm->procgrid[dim]>2 ? 2 : 1;
+  int mfirst = 0;
+  for(int pi = 0; pi < nmessages; pi++)
+  {
+    int nlocal = atom->nlocal;
+    const int nsend_atoms = static_cast<int> (buf[0]);
+    if(nsend_atoms > max_nsend) grow_copylist(nsend_atoms+100);
+    // grow via grow_both() first when the incoming atoms might not fit
+    if(nlocal+nsend_atoms+atom->nghost >= atom->nmax)
+      grow_both(nlocal+nsend_atoms*2+atom->nghost);
+
+    const int naccept = Cuda_AtomVecChargeCuda_UnpackExchange(&cuda->shared_data,nsend_atoms,buf,cu_copylist->dev_data());
+    cu_copylist->download();
+    nlocal += naccept;
+    // m starts past the count word and the NCUDAEXCHANGE values per atom
+    int m = nsend_atoms*NCUDAEXCHANGE + 1;
+    if(atom->nextra_grow)
+    {
+      for(int j = 0; j < nsend_atoms; j++)
+      {
+        if(copylist[j] > -1)
+        {
+          for(int iextra = 0; iextra < atom->nextra_grow; iextra++)
+            m += modify->fix[atom->extra_grow[iextra]]->unpack_exchange(copylist[j],&buf[m]);
+        }
+        else
+          m += static_cast<int> (buf[j+1]);  // skip; buf[j+1] is read as this atom's fix-data length
+      }
+    }
+    cuda->shared_data.atom.nlocal = nlocal;
+    cuda->shared_data.atom.update_nlocal = 2;
+    atom->nlocal = nlocal;
+    mfirst += m;
+    buf = &buf[m];   // advance to the next message
+  }
+  return mfirst;
+}
+
+
+
diff --git a/src/USER-CUDA/atom_vec_charge_cuda.h b/src/USER-CUDA/atom_vec_charge_cuda.h
new file mode 100644
index 0000000000..924dd55c85
--- /dev/null
+++ b/src/USER-CUDA/atom_vec_charge_cuda.h
@@ -0,0 +1,69 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#ifdef ATOM_CLASS
+
+AtomStyle(charge/cuda,AtomVecChargeCuda)
+
+#else
+
+#ifndef LMP_ATOM_VEC_CHARGE_CUDA_H
+#define LMP_ATOM_VEC_CHARGE_CUDA_H
+
+#include "atom_vec_charge.h"
+#include "cuda_data.h"
+
+namespace LAMMPS_NS {
+
+class AtomVecChargeCuda : public AtomVecCharge {  // atom style registered above under the name charge/cuda
+ public:
+  AtomVecChargeCuda(class LAMMPS *lmp, int narg, char **arg);
+  virtual ~AtomVecChargeCuda() {}  // empty body: none of the pointer members below is released here
+  void grow_copylist(int n);                           // reallocates copylist, copylist2 and cu_copylist
+  void grow_send(int n, double **buf_send, int flag);  // resizes *buf_send; a nonzero flag keeps the old contents
+  void grow_both(int n);                               // AtomVecCharge::grow(n) plus device download/upload after setup
+  int  pack_comm(int n, int *iswap, double *buf, int pbc_flag, int *pbc);
+  int  pack_comm_vel(int n, int *iswap, double *buf, int pbc_flag, int *pbc);
+  void unpack_comm(int n, int first, double *buf);
+  void unpack_comm_vel(int n, int first, double *buf);
+  int  pack_reverse(int n, int first, double *buf);
+  void unpack_reverse(int n, int *list, double *buf);
+  int  pack_border(int n, int *iswap, double *buf, int pbc_flag, int *pbc);
+  int  pack_border_vel(int n, int *iswap, double *buf, int pbc_flag, int *pbc);
+  void unpack_border(int n, int first, double *buf);
+  void unpack_border_vel(int n, int first, double *buf);
+  int  pack_exchange(int dim, double *buf);
+  int  unpack_exchange(double *buf);
+ private:
+  class Cuda *cuda;                        // lmp->cuda, checked against NULL in the constructor
+  bool cuda_init_done;                     // set after pack_exchange() has called Cuda_AtomVecChargeCuda_Init
+  int *copylist;                           // pinned host list that cu_copylist is built on
+  int *copylist2;                          // host scratch list used by pack_exchange()
+  cCudaData<int, int, xx > *cu_copylist;   // provides upload()/download()/dev_data() for copylist
+  int max_nsend;                           // number of entries the copy lists were allocated for
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-CUDA/atom_vec_full_cuda.cpp b/src/USER-CUDA/atom_vec_full_cuda.cpp
new file mode 100644
index 0000000000..e81213bfef
--- /dev/null
+++ b/src/USER-CUDA/atom_vec_full_cuda.cpp
@@ -0,0 +1,516 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under 
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#include <cstdlib>
+#include <cstdio>
+#include <cstring>
+#include "atom_vec_full_cuda.h"
+#include "comm_cuda_cu.h"
+#include "atom_vec_full_cuda_cu.h"
+#include "atom.h"
+#include "domain.h"
+#include "modify.h"
+#include "fix.h"
+#include "memory.h"
+#include "error.h"
+#include "cuda.h"
+#include "universe.h"
+#include "comm.h"
+
+using namespace LAMMPS_NS;
+
+#define DELTA 10000
+#define BUFFACTOR 1.5
+#define BUFEXTRA 1000
+#define NCUDAEXCHANGE 13 //nextra x y z vx vy vz tag type mask image q molecule
+
+#define BUF_FLOAT double
+/* ---------------------------------------------------------------------- */
+
+// Builds the full/cuda atom style on top of AtomVecFull. Requires that the
+// LAMMPS instance carries a Cuda object; otherwise error->all() is raised.
+AtomVecFullCuda::AtomVecFullCuda(LAMMPS *lmp, int narg, char **arg)
+  : AtomVecFull(lmp, narg, arg)
+{
+  cuda = lmp->cuda;
+  if (!cuda) {
+    error->all("You cannot use a /cuda class, without activating 'cuda' acceleration. Use no '-a' command line argument, or '-a cuda'.");
+  }
+
+  // Copy-list storage starts out empty; pack_exchange() creates it on first
+  // use through grow_copylist().
+  copylist = NULL;
+  copylist2 = NULL;
+  cu_copylist = NULL;
+  max_nsend = 0;
+
+  cuda_init_done = false;
+  cudable = true;
+  maxsend = 0;
+}
+
+// Replaces the copy lists with fresh ones sized for new_max_nsend entries.
+// Previous contents are discarded, not carried over.
+void AtomVecFullCuda::grow_copylist(int new_max_nsend)
+{
+  // Drop the previous generation. The cCudaData wrapper goes first because it
+  // was constructed around the pinned host array released just below.
+  delete cu_copylist;
+  delete [] copylist2;
+  if (copylist != NULL) {
+    CudaWrapper_FreePinnedHostData((void*) copylist);
+  }
+
+  max_nsend = new_max_nsend;
+
+  // copylist lives in pinned host memory, copylist2 on the ordinary heap.
+  const size_t pinned_bytes = max_nsend*sizeof(int);
+  copylist = (int*) CudaWrapper_AllocPinnedHostData(pinned_bytes,false);
+  copylist2 = new int[max_nsend];
+  cu_copylist = new cCudaData<int, int, xx > (copylist, max_nsend);
+}
+
+/* ----------------------------------------------------------------------
+   (Re)allocate the comm send buffer. It must be growable from here because
+   the array is copied from the GPU as a whole.
+
+   n        : requested capacity; *maxsend becomes BUFFACTOR*n and the
+              allocation holds *maxsend+BUFEXTRA doubles
+   buf_send : address of the buffer pointer; updated in place
+   flag     : nonzero = keep the old contents, zero = contents may be dropped
+
+   The buffer lives in pinned host memory when cuda->pinned is set, otherwise
+   it is managed through memory->smalloc/srealloc/sfree.
+------------------------------------------------------------------------- */
+
+void AtomVecFullCuda::grow_send(int n,double** buf_send,int flag)
+{
+  // size of the buffer being replaced, as this function last allocated it
+  const int old_size = *maxsend+BUFEXTRA;
+  *maxsend = static_cast<int> (BUFFACTOR * n);
+  const int new_size = *maxsend+BUFEXTRA;
+
+  if(cuda->pinned)
+  {
+    double* old_buf = *buf_send;
+    double* new_buf;
+    if(flag && old_buf)
+    {
+      // Copy straight from the old pinned buffer into the new one. Never copy
+      // more than the smaller of the two sizes, so a shrinking request cannot
+      // write past the end of the new allocation.
+      new_buf = (double*) CudaWrapper_AllocPinnedHostData(new_size*sizeof(double),false);
+      const int ncopy = old_size<new_size ? old_size : new_size;
+      memcpy((void*) new_buf,(void*) old_buf,ncopy*sizeof(double));
+      CudaWrapper_FreePinnedHostData((void*) old_buf);
+    }
+    else
+    {
+      // Nothing to preserve (or no old buffer at all): release first so both
+      // buffers are never pinned at the same time.
+      if(old_buf) CudaWrapper_FreePinnedHostData((void*) old_buf);
+      new_buf = (double*) CudaWrapper_AllocPinnedHostData(new_size*sizeof(double),false);
+    }
+    *buf_send = new_buf;
+  }
+  else if(flag)
+  {
+    *buf_send = (double *)
+      memory->srealloc(*buf_send,new_size*sizeof(double),
+		       "comm:buf_send");
+  }
+  else
+  {
+    memory->sfree(*buf_send);
+    *buf_send = (double *) memory->smalloc(new_size*sizeof(double),
+					  "comm:buf_send");
+  }
+}
+
+// Grows the per-atom arrays through AtomVecFull::grow(n). Once CUDA setup has
+// finished, the growth is bracketed by downloadAll() before and
+// checkResize() + uploadAll() after.
+void AtomVecFullCuda::grow_both(int n)
+{
+  if (cuda->finished_setup) {
+    cuda->downloadAll();
+  }
+
+  AtomVecFull::grow(n);
+
+  if (!cuda->finished_setup) return;
+  cuda->checkResize();
+  cuda->uploadAll();
+}
+
+// Forward-communication pack of positions. Usually not called, since
+// comm->communicate handles the exchange itself when only positions travel.
+// Delegates to AtomVecFull until CUDA setup is finished or while running on
+// the CPU; otherwise the first element of iswap is passed on as the swap index.
+int AtomVecFullCuda::pack_comm(int n, int* iswap, double *buf,
+			     int pbc_flag, int *pbc)
+{
+  const bool use_device = cuda->finished_setup && !cuda->oncpu;
+  if (!use_device) {
+    return AtomVecFull::pack_comm(n,iswap,buf,pbc_flag,pbc);
+  }
+
+  int nvalues = Cuda_CommCuda_PackComm(&cuda->shared_data,n,*iswap,(void*) buf,pbc,pbc_flag);
+
+  // A nonzero count is rescaled when X_FLOAT and double differ in size.
+  const bool same_width = (sizeof(X_FLOAT)==sizeof(double));
+  if (!same_width && nvalues) {
+    nvalues = (nvalues+1)*sizeof(X_FLOAT)/sizeof(double);
+  }
+  return nvalues;
+}
+
+// Forward-communication pack of positions and velocities. Usually not called,
+// since comm->communicate handles the exchange itself when only positions
+// travel. Delegates to AtomVecFull until CUDA setup is finished or while
+// running on the CPU; otherwise the first element of iswap is the swap index.
+int AtomVecFullCuda::pack_comm_vel(int n, int* iswap, double *buf,
+			     int pbc_flag, int *pbc)
+{
+  const bool use_device = cuda->finished_setup && !cuda->oncpu;
+  if (!use_device) {
+    return AtomVecFull::pack_comm_vel(n,iswap,buf,pbc_flag,pbc);
+  }
+
+  int nvalues = Cuda_CommCuda_PackCommVel(&cuda->shared_data,n,*iswap,(void*) buf,pbc,pbc_flag);
+
+  // A nonzero count is rescaled when X_FLOAT and double differ in size.
+  const bool same_width = (sizeof(X_FLOAT)==sizeof(double));
+  if (!same_width && nvalues) {
+    nvalues = (nvalues+1)*sizeof(X_FLOAT)/sizeof(double);
+  }
+  return nvalues;
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Forward-communication unpack of positions. Usually not called, since
+// comm->communicate handles the exchange itself when only positions travel.
+void AtomVecFullCuda::unpack_comm(int n, int first, double *buf)
+{
+  // Device path only after CUDA setup is finished and while not on the CPU.
+  if (cuda->finished_setup && !cuda->oncpu) {
+    Cuda_CommCuda_UnpackComm(&cuda->shared_data,n,first,(void*)buf);
+  } else {
+    AtomVecFull::unpack_comm(n,first,buf);
+  }
+}
+
+// Forward-communication unpack of positions and velocities. Usually not
+// called, since comm->communicate handles the exchange itself when only
+// positions travel.
+void AtomVecFullCuda::unpack_comm_vel(int n, int first, double *buf)
+{
+  // Device path only after CUDA setup is finished and while not on the CPU.
+  if (cuda->finished_setup && !cuda->oncpu) {
+    Cuda_CommCuda_UnpackCommVel(&cuda->shared_data,n,first,(void*)buf);
+  } else {
+    AtomVecFull::unpack_comm_vel(n,first,buf);
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Reverse-communication pack of forces for atoms [first, first+n). Usually not
+// called, since comm->communicate handles the exchange itself when only forces
+// travel. On the device path the force array is downloaded, packed on the
+// host, then uploaded again. Returns the number of values written to buf.
+int AtomVecFullCuda::pack_reverse(int n, int first, double *buf)
+{
+  const bool use_device = cuda->finished_setup && !cuda->oncpu;
+  if (!use_device) {
+    return AtomVecFull::pack_reverse(n,first,buf);
+  }
+
+  cuda->cu_f->download();
+
+  int nwritten = 0;
+  const int end = first + n;
+  for (int iatom = first; iatom < end; ++iatom) {
+    for (int d = 0; d < 3; ++d) {
+      buf[nwritten++] = f[iatom][d];
+    }
+  }
+
+  cuda->cu_f->upload();
+  return nwritten;
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Reverse-communication unpack: adds the received force triples in buf onto
+// the atoms named by list[0..n). Usually not called, since comm->communicate
+// handles the exchange itself when only forces travel. On the device path the
+// force array is downloaded, updated on the host, then uploaded again.
+void AtomVecFullCuda::unpack_reverse(int n, int *list, double *buf)
+{
+  const bool use_device = cuda->finished_setup && !cuda->oncpu;
+  if (!use_device) {
+    AtomVecFull::unpack_reverse(n,list,buf);
+    return;
+  }
+
+  cuda->cu_f->download();
+
+  int nread = 0;
+  for (int k = 0; k < n; ++k) {
+    const int iatom = list[k];
+    for (int d = 0; d < 3; ++d) {
+      f[iatom][d] += buf[nread++];
+    }
+  }
+
+  cuda->cu_f->upload();
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Border pack. Delegates to AtomVecFull until CUDA setup is finished or while
+// running on the CPU; otherwise hands off to the CUDA packer, passing the
+// first element of iswap as the swap index.
+int AtomVecFullCuda::pack_border(int n, int *iswap, double *buf,
+			       int pbc_flag, int *pbc)
+{
+  if (cuda->finished_setup && !cuda->oncpu) {
+    const int npacked = Cuda_AtomVecFullCuda_PackBorder(&cuda->shared_data,n,*iswap,(void*) buf,pbc,pbc_flag);
+    return npacked;
+  }
+  return AtomVecFull::pack_border(n,iswap,buf,pbc_flag,pbc);
+}
+
+// Border pack including velocities. Delegates to AtomVecFull until CUDA setup
+// is finished or while running on the CPU; otherwise hands off to the CUDA
+// packer, passing the first element of iswap as the swap index.
+int AtomVecFullCuda::pack_border_vel(int n, int *iswap, double *buf,
+			       int pbc_flag, int *pbc)
+{
+  if (cuda->finished_setup && !cuda->oncpu) {
+    const int npacked = Cuda_AtomVecFullCuda_PackBorderVel(&cuda->shared_data,n,*iswap,(void*) buf,pbc,pbc_flag);
+    return npacked;
+  }
+  return AtomVecFull::pack_border_vel(n,iswap,buf,pbc_flag,pbc);
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Border unpack. On the device path the atom arrays are grown until
+// nghost+nlocal+n fits below the device-side nmax, then the CUDA unpacker
+// runs. A nonzero result from the unpacker is reported on stdout only.
+void AtomVecFullCuda::unpack_border(int n, int first, double *buf)
+{
+  const bool use_device = cuda->finished_setup && !cuda->oncpu;
+  if (!use_device) {
+    AtomVecFull::unpack_border(n,first,buf);
+    return;
+  }
+
+  // ensure there is enough space on device to unpack data
+  while (cuda->shared_data.atom.nmax <= atom->nghost+atom->nlocal+n) {
+    grow_both(0);
+  }
+
+  const int failed = Cuda_AtomVecFullCuda_UnpackBorder(&cuda->shared_data,n,first,(void*)buf);
+  if (failed) {
+    printf(" # CUDA: Error: Failed to unpack Border atoms (This might be a bug).\n");
+  }
+}
+
+// Border unpack including velocities. On the device path the atom arrays are
+// grown until nghost+nlocal+n fits below the device-side nmax, then the CUDA
+// unpacker runs. A nonzero result from the unpacker is reported on stdout only.
+void AtomVecFullCuda::unpack_border_vel(int n, int first, double *buf)
+{
+  const bool use_device = cuda->finished_setup && !cuda->oncpu;
+  if (!use_device) {
+    AtomVecFull::unpack_border_vel(n,first,buf);
+    return;
+  }
+
+  // ensure there is enough space on device to unpack data
+  while (cuda->shared_data.atom.nmax <= atom->nghost+atom->nlocal+n) {
+    grow_both(0);
+  }
+
+  const int failed = Cuda_AtomVecFullCuda_UnpackBorderVel(&cuda->shared_data,n,first,(void*)buf);
+  if (failed) {
+    printf(" # CUDA: Error: Failed to unpack Border atoms (This might be a bug).\n");
+  }
+}
+
+/* ----------------------------------------------------------------------
+   pack data for atom I for sending to another proc
+   xyz must be 1st 3 values, so comm::exchange() can test on them 
+------------------------------------------------------------------------- */
+
+
+// Packs all atoms leaving this process along dimension dim into the send
+// buffer and compacts the local atom arrays.
+//
+// On the device path `buf` is NOT a data buffer: it is reinterpreted as the
+// address of the send-buffer pointer (double**) so the buffer can be
+// reallocated from here through grow_send().
+//
+// Layout written into the send buffer, as far as this function shows:
+//   [0]              number of atoms sent (nsend_atoms)
+//   [1..nsend_atoms] first the local index of each leaving atom (read back
+//                    below), later overwritten with that atom's count of
+//                    extra values (nextra)
+//   [...]            data written by Cuda_AtomVecFullCuda_PackExchange, followed
+//                    by per-atom bond/angle/dihedral/improper/special/fix data
+//                    appended on the host starting at the returned offset m
+// Returns the number of values in the buffer, or 0 when m ends up as 1.
+int AtomVecFullCuda::pack_exchange(int dim, double *buf)
+{
+  if(cuda->oncpu)
+  	return AtomVecFull::pack_exchange(dim,buf);
+
+  // (re)initialise the device side on first use and whenever the box changes
+  if(not cuda_init_done||domain->box_change)
+  {
+  	Cuda_AtomVecFullCuda_Init(&cuda->shared_data);
+  	cuda_init_done=true;
+  }
+  double** buf_pointer=(double**) buf;
+  // make sure a send buffer exists and has room for at least nghost values
+  if(*maxsend<atom->nghost || *buf_pointer==NULL)
+  {
+  	grow_send(atom->nghost>*maxsend?atom->nghost:*maxsend,buf_pointer,0);
+  	*maxsend=atom->nghost>*maxsend?atom->nghost:*maxsend;
+  }
+  
+  if(max_nsend==0) grow_copylist(200);
+
+  int nsend_atoms = Cuda_AtomVecFullCuda_PackExchangeList(&cuda->shared_data,*maxsend,dim,*buf_pointer);
+  
+  if(nsend_atoms>max_nsend) grow_copylist(nsend_atoms+100);
+  // buffer too small for the fixed-size part: grow it (contents dropped) and
+  // build the list again
+  if(nsend_atoms*NCUDAEXCHANGE>*maxsend) 
+  {
+  	grow_send((int) (nsend_atoms+100)*NCUDAEXCHANGE,buf_pointer,0);
+  	Cuda_AtomVecFullCuda_PackExchangeList(&cuda->shared_data,*maxsend,dim,*buf_pointer);
+  }
+
+  // local atom count after the leaving atoms are removed
+  int nlocal=atom->nlocal-nsend_atoms;
+  
+  // copylist2[k] describes slot nlocal+k of the old arrays: -1 if that atom is
+  // itself leaving, 1 otherwise
+  for(int i=0;i<nsend_atoms;i++) copylist2[i]=1;
+  for(int j=1;j<nsend_atoms+1;j++)
+  {
+  	int i = static_cast <int> ((*buf_pointer)[j]);
+  	if(i>=nlocal) copylist2[i-nlocal]=-1;
+  }
+  
+  // for every leaving atom with index < nlocal, pick the next slot at or above
+  // nlocal that is not leaving; entries for leaving atoms with index >= nlocal
+  // are left unset
+  int actpos=0;
+  for(int j=1;j<nsend_atoms+1;j++)
+  {
+  	int i = static_cast <int> ((*buf_pointer)[j]);
+  	if(i<nlocal) 
+  	{
+  	  while(copylist2[actpos]==-1) actpos++;
+    	  copylist[j-1]=nlocal+actpos;
+  	  actpos++;
+  	}
+  }
+  cu_copylist->upload();
+  
+  cuda->shared_data.atom.nlocal=nlocal;
+  
+  int m = Cuda_AtomVecFullCuda_PackExchange(&cuda->shared_data,nsend_atoms,*buf_pointer,cu_copylist->dev_data());
+  
+  // host-side part: time spent here is added to comm_exchange_cpu_pack
+  timespec time1,time2;
+  clock_gettime(CLOCK_REALTIME,&time1);
+ 
+  double* buf_p=*buf_pointer;
+  for(int j=0;j<nsend_atoms;j++)
+  {
+    int i=static_cast <int> (buf_p[j+1]);
+    int nextra=0;
+    int k;
+    // NOTE(review): each capacity check below runs after the section has been
+    // written; this presumably relies on the BUFEXTRA slack that grow_send
+    // allocates beyond *maxsend - confirm.
+    buf_p[m++] = num_bond[i];
+    for (k = 0; k < num_bond[i]; k++) {
+      buf_p[m++] = bond_type[i][k];
+      buf_p[m++] = bond_atom[i][k];
+    }
+    nextra+=2*num_bond[i]+1;
+    if(m>*maxsend) {grow_send(m,buf_pointer,1); buf_p=*buf_pointer;}
+    
+    buf_p[m++] = num_angle[i];
+    for (k = 0; k < num_angle[i]; k++) {
+      buf_p[m++] = angle_type[i][k];
+      buf_p[m++] = angle_atom1[i][k];
+      buf_p[m++] = angle_atom2[i][k];
+      buf_p[m++] = angle_atom3[i][k];
+    }
+    nextra+=4*num_angle[i]+1;
+    if(m>*maxsend) {grow_send(m,buf_pointer,1); buf_p=*buf_pointer;}
+
+    buf_p[m++] = num_dihedral[i];
+    for (k = 0; k < num_dihedral[i]; k++) {
+      buf_p[m++] = dihedral_type[i][k];
+      buf_p[m++] = dihedral_atom1[i][k];
+      buf_p[m++] = dihedral_atom2[i][k];
+      buf_p[m++] = dihedral_atom3[i][k];
+      buf_p[m++] = dihedral_atom4[i][k];
+    }
+    nextra+=5*num_dihedral[i]+1;
+    if(m>*maxsend) {grow_send(m,buf_pointer,1); buf_p=*buf_pointer;}
+
+    buf_p[m++] = num_improper[i];
+    for (k = 0; k < num_improper[i]; k++) {
+      buf_p[m++] = improper_type[i][k];
+      buf_p[m++] = improper_atom1[i][k];
+      buf_p[m++] = improper_atom2[i][k];
+      buf_p[m++] = improper_atom3[i][k];
+      buf_p[m++] = improper_atom4[i][k];
+    }
+    nextra+=5*num_improper[i]+1;
+    if(m>*maxsend) {grow_send(m,buf_pointer,1); buf_p=*buf_pointer;}
+
+    buf_p[m++] = nspecial[i][0];
+    buf_p[m++] = nspecial[i][1];
+    buf_p[m++] = nspecial[i][2];
+    for (k = 0; k < nspecial[i][2]; k++) buf_p[m++] = special[i][k];
+    nextra+=nspecial[i][2]+3;
+    if(m>*maxsend) {grow_send(m,buf_pointer,1); buf_p=*buf_pointer;}
+  
+    // let every fix that stores per-atom data append its values, and move its
+    // data for the replacement atom when slot i stays in use
+    if (atom->nextra_grow)
+      for (int iextra = 0; iextra < atom->nextra_grow; iextra++) 
+      {
+        int dm= modify->fix[atom->extra_grow[iextra]]->pack_exchange(i,&buf_p[m]);
+        m+=dm;
+  		nextra+=dm;
+        if(i<nlocal)modify->fix[atom->extra_grow[iextra]]->copy_arrays(copylist[j],i);
+        if(m>*maxsend) {grow_send(m,buf_pointer,1); buf_p=*buf_pointer;}
+      }
+
+    // fill the vacated slot from the atom chosen in copylist
+    if(i<nlocal)AtomVecFull::copy(copylist[j],i,1);  
+    // slot j+1 held the atom index; from here on it holds the extra-value count
+    (*buf_pointer)[j+1] = nextra;
+  }
+	  
+	  clock_gettime(CLOCK_REALTIME,&time2);
+	  cuda->shared_data.cuda_timings.comm_exchange_cpu_pack+=
+        time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000;
+
+  (*buf_pointer)[0] = nsend_atoms;
+  atom->nlocal-=nsend_atoms;
+  cuda->shared_data.atom.update_nlocal=2;
+ //printf("End Pack Exchange\n");
+  if(m==1) return 0;
+  return m;
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Unpacks exchanged atoms from buf into the local arrays.
+//
+// buf may hold one or two consecutive messages: the loop below runs twice
+// when comm->procgrid[dim] > 2 and once otherwise, advancing buf past each
+// message. Each message starts with its atom count in buf[0]; the values in
+// buf[1..nsend_atoms] are used as the number of extra values to skip for
+// atoms that are not accepted. Returns the total number of values consumed.
+int AtomVecFullCuda::unpack_exchange(double *buf)
+{
+// printf("Begin UnPack Exchange\n");
+  if(cuda->oncpu)
+  	return AtomVecFull::unpack_exchange(buf);
+  
+  // NOTE(review): sublo/subhi are assigned below but never read in this function
+  double *sublo,*subhi;
+  int dim=cuda->shared_data.exchange_dim;
+  if(domain->box_change) 
+  Cuda_AtomVecFullCuda_Init(&cuda->shared_data);
+  if (domain->triclinic == 0) {
+    sublo = domain->sublo;
+    subhi = domain->subhi;
+  } else {
+    sublo = domain->sublo_lamda;
+    subhi = domain->subhi_lamda;
+  }
+
+  // running total of values consumed over all messages
+  int mfirst=0;
+  for(int pi=0;pi<(comm->procgrid[dim]>2?2:1);pi++)
+  {
+  int nlocal = atom->nlocal;
+  int nsend_atoms=static_cast<int> (buf[0]);
+  if(nsend_atoms>max_nsend) grow_copylist(nsend_atoms+100);
+ 
+  if (nlocal+nsend_atoms+atom->nghost>=atom->nmax) grow_both(nlocal+nsend_atoms*2+atom->nghost); //ensure there is enough space on device to unpack data
+  int naccept = Cuda_AtomVecFullCuda_UnpackExchange(&cuda->shared_data,nsend_atoms,buf,cu_copylist->dev_data());
+  // after the download, copylist[j] is read as the local index assigned to
+  // incoming atom j; values not greater than -1 mark atoms that were not accepted
+  cu_copylist->download();
+  // host-side data starts after the count and NCUDAEXCHANGE values per atom
+  int m = nsend_atoms*NCUDAEXCHANGE + 1;
+  nlocal+=naccept;
+
+  timespec time1,time2;
+  clock_gettime(CLOCK_REALTIME,&time1);
+
+  for(int j=0;j<nsend_atoms;j++)
+  {
+    if(copylist[j]>-1)
+    {
+ 	  int k;
+	  int i=copylist[j];
+      num_bond[i] = static_cast<int> (buf[m++]);
+      for (k = 0; k < num_bond[i]; k++) {
+    	bond_type[i][k] = static_cast<int> (buf[m++]);
+    	bond_atom[i][k] = static_cast<int> (buf[m++]);
+  	  }
+
+  	  num_angle[i] = static_cast<int> (buf[m++]);
+  	  for (k = 0; k < num_angle[i]; k++) {
+    	angle_type[i][k] = static_cast<int> (buf[m++]);
+    	angle_atom1[i][k] = static_cast<int> (buf[m++]);
+    	angle_atom2[i][k] = static_cast<int> (buf[m++]);
+    	angle_atom3[i][k] = static_cast<int> (buf[m++]);
+  	  }
+
+  	  num_dihedral[i] = static_cast<int> (buf[m++]);
+  	  for (k = 0; k < num_dihedral[i]; k++) {
+    	dihedral_type[i][k] = static_cast<int> (buf[m++]);
+    	dihedral_atom1[i][k] = static_cast<int> (buf[m++]);
+    	dihedral_atom2[i][k] = static_cast<int> (buf[m++]);
+    	dihedral_atom3[i][k] = static_cast<int> (buf[m++]);
+    	dihedral_atom4[i][k] = static_cast<int> (buf[m++]);
+  	  }
+
+  	  num_improper[i] = static_cast<int> (buf[m++]);
+  	  for (k = 0; k < num_improper[i]; k++) {
+    	improper_type[i][k] = static_cast<int> (buf[m++]);
+    	improper_atom1[i][k] = static_cast<int> (buf[m++]);
+    	improper_atom2[i][k] = static_cast<int> (buf[m++]);
+    	improper_atom3[i][k] = static_cast<int> (buf[m++]);
+    	improper_atom4[i][k] = static_cast<int> (buf[m++]);
+  	  }
+
+  	  nspecial[i][0] = static_cast<int> (buf[m++]);
+  	  nspecial[i][1] = static_cast<int> (buf[m++]);
+  	  nspecial[i][2] = static_cast<int> (buf[m++]);
+  	  for (k = 0; k < nspecial[i][2]; k++)
+    	special[i][k] = static_cast<int> (buf[m++]);
+    	
+  	  // let every fix that stores per-atom data read back its values
+  	  if (atom->nextra_grow)
+        for (int iextra = 0; iextra < atom->nextra_grow; iextra++) 
+      				m += modify->fix[atom->extra_grow[iextra]]->
+					unpack_exchange(i,&buf[m]);
+    	
+    }
+    else 
+    // atom not accepted: skip its extra values
+    m+=static_cast <int> (buf[j+1]);
+  }
+	  
+	  // NOTE(review): unpack time is accumulated into the same
+	  // comm_exchange_cpu_pack counter that pack_exchange uses
+	  clock_gettime(CLOCK_REALTIME,&time2);
+	  cuda->shared_data.cuda_timings.comm_exchange_cpu_pack+=
+        time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000;
+
+  cuda->shared_data.atom.nlocal=nlocal;
+  cuda->shared_data.atom.update_nlocal=2;
+  atom->nlocal=nlocal;
+  // advance to the next message, if any
+  mfirst+=m;
+  buf=&buf[m];
+  }
+  return mfirst;
+}
+
+
+
diff --git a/src/USER-CUDA/atom_vec_full_cuda.h b/src/USER-CUDA/atom_vec_full_cuda.h
new file mode 100644
index 0000000000..f16fd7703d
--- /dev/null
+++ b/src/USER-CUDA/atom_vec_full_cuda.h
@@ -0,0 +1,69 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#ifdef ATOM_CLASS
+
+AtomStyle(full/cuda,AtomVecFullCuda)
+
+#else
+
+#ifndef LMP_ATOM_VEC_FULL_CUDA_H
+#define LMP_ATOM_VEC_FULL_CUDA_H
+
+#include "atom_vec_full.h"
+#include "cuda_data.h"
+
+namespace LAMMPS_NS {
+
+// Atom style class registered above as "full/cuda". It derives from
+// AtomVecFull and overrides the communication pack/unpack entry points; each
+// override in atom_vec_full_cuda.cpp falls back to the AtomVecFull version
+// while cuda->oncpu is set (and, for most of them, until cuda->finished_setup).
+class AtomVecFullCuda : public AtomVecFull {
+ public:
+  AtomVecFullCuda(class LAMMPS *, int, char **);
+  // Empty body: nothing is released here.
+  // NOTE(review): grow_copylist() allocates copylist, copylist2 and
+  // cu_copylist; no code visible in this class frees them on destruction.
+  virtual ~AtomVecFullCuda() {}
+  // Buffer-growth helpers.
+  void grow_copylist(int n);
+  void grow_send(int n,double** buf_send,int flag);
+  void grow_both(int n);
+  // Forward communication (with and without velocities).
+  int pack_comm(int, int *, double *, int, int *);
+  int pack_comm_vel(int, int *, double *, int, int *);
+  void unpack_comm(int, int, double *);
+  void unpack_comm_vel(int, int, double *);
+  // Reverse communication.
+  int pack_reverse(int, int, double *);
+  void unpack_reverse(int, int *, double *);
+  // Border communication (with and without velocities).
+  int pack_border(int, int *, double *, int, int *);
+  int pack_border_vel(int, int *, double *, int, int *);
+  void unpack_border(int, int, double *);
+  void unpack_border_vel(int, int, double *);
+  // Atom exchange.
+  int pack_exchange(int, double *);
+  int unpack_exchange(double *);
+ private:
+  class Cuda *cuda;                        // taken from lmp->cuda in the constructor
+  bool cuda_init_done;                     // set once pack_exchange() has run Cuda_AtomVecFullCuda_Init
+  int* copylist;                           // pinned host array, allocated in grow_copylist()
+  int* copylist2;                          // heap scratch array (new int[]), allocated in grow_copylist()
+  cCudaData<int, int, xx >* cu_copylist;   // cCudaData wrapper constructed over copylist
+  int max_nsend;                           // entry count the three lists were last sized for
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-CUDA/comm_cuda.cpp b/src/USER-CUDA/comm_cuda.cpp
index 6f90227112..aac1e53239 100644
--- a/src/USER-CUDA/comm_cuda.cpp
+++ b/src/USER-CUDA/comm_cuda.cpp
@@ -55,6 +55,8 @@ enum{SINGLE,MULTI};
 CommCuda::CommCuda(LAMMPS *lmp):Comm(lmp) 
 {
   cuda = lmp->cuda;
+   if(cuda == NULL)
+        error->all("You cannot use a /cuda class, without activating 'cuda' acceleration. Use no '-a' command line argument, or '-a cuda'.");
 
   cu_pbc=NULL;
   cu_slablo=NULL;
diff --git a/src/USER-CUDA/comm_cuda.cu b/src/USER-CUDA/comm_cuda.cu
new file mode 100644
index 0000000000..0233f3ee13
--- /dev/null
+++ b/src/USER-CUDA/comm_cuda.cu
@@ -0,0 +1,483 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#include <stdio.h>
+#define MY_PREFIX comm_cuda
+#include "cuda_shared.h"
+#include "cuda_common.h"
+
+#include "crm_cuda_utils.cu"
+
+#include "comm_cuda_cu.h"
+#include "comm_cuda_kernel.cu"
+#include <ctime>
+
+// Makes sure sdata->buffer holds at least n*3 X_FLOAT values, reallocating it
+// when it is too small, and then (always) refreshes the device-side `buffer`
+// symbol with the current pointer.
+void Cuda_CommCuda_UpdateBuffer(cuda_shared_data* sdata,int n)
+{
+	const int needed=n*3*sizeof(X_FLOAT);
+	const bool too_small=sdata->buffersize<needed;
+
+	if(too_small)
+	{
+		MYDBG(printf("Cuda_ComputeTempCuda Resizing Buffer at %p with %i kB to\n",sdata->buffer,sdata->buffersize);)
+		CudaWrapper_FreeCudaData(sdata->buffer,sdata->buffersize);
+		sdata->buffer = CudaWrapper_AllocCudaData(needed);
+		sdata->buffersize=needed;
+		// bump the counter that the callers test as sdata->buffer_new
+		sdata->buffer_new++;
+		MYDBG(printf("New buffer at %p with %i kB\n",sdata->buffer,sdata->buffersize);)
+	}
+
+	cudaMemcpyToSymbol(MY_CONST(buffer), & sdata->buffer, sizeof(int*)     );
+}
+
+
+// Copies the current atom counts and the device pointers of the x, v, f and
+// type arrays from sdata into this translation unit's device symbols.
+void Cuda_CommCuda_UpdateNmax(cuda_shared_data* sdata)
+{
+	// atom counts
+	cudaMemcpyToSymbol(MY_CONST(nlocal), &sdata->atom.nlocal, sizeof(int));
+	cudaMemcpyToSymbol(MY_CONST(nmax), &sdata->atom.nmax, sizeof(int));
+
+	// device array pointers
+	cudaMemcpyToSymbol(MY_CONST(x), &sdata->atom.x.dev_data, sizeof(X_FLOAT*));
+	cudaMemcpyToSymbol(MY_CONST(v), &sdata->atom.v.dev_data, sizeof(X_FLOAT*));
+	cudaMemcpyToSymbol(MY_CONST(f), &sdata->atom.f.dev_data, sizeof(F_FLOAT*));
+	cudaMemcpyToSymbol(MY_CONST(type), &sdata->atom.type.dev_data, sizeof(int*));
+}
+
+
+// One-time upload of the device symbols used by the comm kernels: everything
+// Cuda_CommCuda_UpdateNmax sets, plus ntypes+1, the box lengths prd, and the
+// flag / debugdata pointers.
+void Cuda_CommCuda_Init(cuda_shared_data* sdata)
+{
+	Cuda_CommCuda_UpdateNmax(sdata);
+
+	// the device symbol stores ntypes+1
+	int ntypes_plus_one=sdata->atom.ntypes+1;
+	cudaMemcpyToSymbol(MY_CONST(cuda_ntypes), &ntypes_plus_one, sizeof(int));
+
+	cudaMemcpyToSymbol(MY_CONST(prd), sdata->domain.prd, 3*sizeof(X_FLOAT));
+	cudaMemcpyToSymbol(MY_CONST(flag), &sdata->flag, sizeof(int*));
+	cudaMemcpyToSymbol(MY_CONST(debugdata), &sdata->debugdata, sizeof(int*));
+}
+
+// Host wrapper for the forward-communication pack kernel (3 values per atom).
+// Launches Cuda_CommCuda_PackComm_Kernel for the n entries of send list iswap
+// and, unless overlap_comm is set, copies the result into buf_send.
+// Kernel and download times are accumulated in sdata->cuda_timings.
+// Always returns 3*n, even when nothing is launched.
+int Cuda_CommCuda_PackComm(cuda_shared_data* sdata,int n,int iswap,void* buf_send,int* pbc,int pbc_flag)
+{
+	timespec t_a,t_b;
+
+	// re-upload device constants the host has flagged as changed
+	if(sdata->atom.update_nmax)
+		Cuda_CommCuda_UpdateNmax(sdata);
+	if(sdata->atom.update_nlocal)
+		cudaMemcpyToSymbol(MY_CONST(nlocal)  , & sdata->atom.nlocal        , sizeof(int)      );
+
+	// staging buffer needs three X_FLOAT values per atom
+	const int nbytes=n*3*sizeof(X_FLOAT);
+	if(sdata->buffer_new or (nbytes>sdata->buffersize))
+		Cuda_CommCuda_UpdateBuffer(sdata,n);
+
+	// periodic shift handed to the kernel; stays zero when pbc_flag is 0
+	X_FLOAT shift_x=0.0;
+	X_FLOAT shift_y=0.0;
+	X_FLOAT shift_z=0.0;
+	if(pbc_flag != 0)
+	{
+		if(sdata->domain.triclinic == 0)
+		{
+			shift_x = pbc[0]*sdata->domain.prd[0];
+			shift_y = pbc[1]*sdata->domain.prd[1];
+			shift_z = pbc[2]*sdata->domain.prd[2];
+		}
+		else
+		{
+			shift_x = pbc[0]*sdata->domain.prd[0] + pbc[5]*sdata->domain.xy + pbc[4]*sdata->domain.xz;
+			shift_y = pbc[1]*sdata->domain.prd[1] + pbc[3]*sdata->domain.yz;
+			shift_z = pbc[2]*sdata->domain.prd[2];
+		}
+	}
+
+	int3 layout=getgrid(n);
+	dim3 threads(layout.z, 1, 1);
+	dim3 grid(layout.x, layout.y, 1);
+
+	// nothing is launched or copied unless sdata->atom.nlocal is positive
+	if(sdata->atom.nlocal<=0)
+		return 3*n;
+
+	cudaMemset( sdata->flag,0,sizeof(int));
+
+	clock_gettime(CLOCK_REALTIME,&t_a);
+
+	// with overlap_comm the kernel writes into the per-swap device send buffer,
+	// otherwise into the shared staging buffer
+	void* dev_buf=sdata->overlap_comm?sdata->comm.buf_send_dev[iswap]:sdata->buffer;
+	Cuda_CommCuda_PackComm_Kernel<<<grid, threads,0>>>((int*) sdata->comm.sendlist.dev_data,n
+		,sdata->comm.maxlistlength,iswap,shift_x,shift_y,shift_z,dev_buf);
+	cudaThreadSynchronize();
+
+	clock_gettime(CLOCK_REALTIME,&t_b);
+	sdata->cuda_timings.comm_forward_kernel_pack+=
+		t_b.tv_sec-t_a.tv_sec+1.0*(t_b.tv_nsec-t_a.tv_nsec)/1000000000;
+
+	CUT_CHECK_ERROR("Cuda_CommCuda_PackComm: Kernel execution failed");
+	if(not sdata->overlap_comm)
+		cudaMemcpy(buf_send, sdata->buffer, n*3*sizeof(X_FLOAT), cudaMemcpyDeviceToHost);
+
+	clock_gettime(CLOCK_REALTIME,&t_a);
+	sdata->cuda_timings.comm_forward_download+=
+		t_a.tv_sec-t_b.tv_sec+1.0*(t_a.tv_nsec-t_b.tv_nsec)/1000000000;
+
+	// read back the device-side flag and report it when nonzero
+	int aflag;
+	cudaMemcpy(&aflag, sdata->flag, sizeof(int), cudaMemcpyDeviceToHost);
+	if(aflag!=0) printf("aflag PackComm: %i\n",aflag);
+	CUT_CHECK_ERROR("Cuda_CommCuda_PackComm: Kernel execution failed");
+
+	return 3*n;
+}
+
+// Host wrapper for the forward-communication pack with velocities (6 values
+// per atom). Unless overlap_comm is set, n*6 X_FLOAT values are copied from
+// the staging buffer into buf_send. Kernel and download times are accumulated
+// in sdata->cuda_timings. Always returns 6*n, even when nothing is launched.
+//
+// NOTE(review): this launches Cuda_CommCuda_PackComm_Kernel, the same kernel
+// the position-only wrapper uses. If that kernel writes only 3 values per
+// atom, the velocity half of the buffer is never filled - confirm against
+// comm_cuda_kernel.cu whether a velocity variant should be launched here.
+int Cuda_CommCuda_PackCommVel(cuda_shared_data* sdata,int n,int iswap,void* buf_send,int* pbc,int pbc_flag)
+{
+
+    timespec time1,time2;
+	if(sdata->atom.update_nmax) 
+		Cuda_CommCuda_UpdateNmax(sdata);
+	if(sdata->atom.update_nlocal) 		
+		cudaMemcpyToSymbol(MY_CONST(nlocal)  , & sdata->atom.nlocal        , sizeof(int)      );
+	int size=n*6*sizeof(X_FLOAT);
+	// Cuda_CommCuda_UpdateBuffer(sdata,k) guarantees k*3 X_FLOAT values, so ask
+	// for 2*n to really get the n*6 values that are copied out below.
+	if(sdata->buffer_new or (size>sdata->buffersize))
+		Cuda_CommCuda_UpdateBuffer(sdata,2*n);
+
+	// periodic shift handed to the kernel; stays zero when pbc_flag is 0
+	X_FLOAT dx=0.0;
+	X_FLOAT dy=0.0;
+	X_FLOAT dz=0.0;
+ 	if (pbc_flag != 0) {
+    if (sdata->domain.triclinic == 0) {
+      dx = pbc[0]*sdata->domain.prd[0];
+      dy = pbc[1]*sdata->domain.prd[1];
+      dz = pbc[2]*sdata->domain.prd[2];
+    } else {
+      dx = pbc[0]*sdata->domain.prd[0] + pbc[5]*sdata->domain.xy + pbc[4]*sdata->domain.xz;
+      dy = pbc[1]*sdata->domain.prd[1] + pbc[3]*sdata->domain.yz;
+      dz = pbc[2]*sdata->domain.prd[2];
+    }}	
+
+	int3 layout=getgrid(n);
+	dim3 threads(layout.z, 1, 1);
+	dim3 grid(layout.x, layout.y, 1);
+	  
+	if(sdata->atom.nlocal>0)
+	{
+	  cudaMemset( sdata->flag,0,sizeof(int));
+
+clock_gettime(CLOCK_REALTIME,&time1);
+
+	  // with overlap_comm the kernel writes into the per-swap device send
+	  // buffer, otherwise into the shared staging buffer
+	  void* buf=sdata->overlap_comm?sdata->comm.buf_send_dev[iswap]:sdata->buffer;
+	  Cuda_CommCuda_PackComm_Kernel<<<grid, threads,0>>>((int*) sdata->comm.sendlist.dev_data,n
+	  ,sdata->comm.maxlistlength,iswap,dx,dy,dz,buf);
+	  cudaThreadSynchronize();
+
+clock_gettime(CLOCK_REALTIME,&time2);
+sdata->cuda_timings.comm_forward_kernel_pack+=
+      time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000;
+      
+	  CUT_CHECK_ERROR("Cuda_CommCuda_PackCommVel: Kernel execution failed");
+      if(not sdata->overlap_comm)
+        cudaMemcpy(buf_send, sdata->buffer, n*6*sizeof(X_FLOAT), cudaMemcpyDeviceToHost);
+
+clock_gettime(CLOCK_REALTIME,&time1);
+sdata->cuda_timings.comm_forward_download+=
+      time1.tv_sec-time2.tv_sec+1.0*(time1.tv_nsec-time2.tv_nsec)/1000000000;
+
+	  // read back the device-side flag and report it when nonzero
+	  int aflag;
+	  cudaMemcpy(&aflag, sdata->flag, sizeof(int), cudaMemcpyDeviceToHost);
+	  if(aflag!=0) printf("aflag PackCommVel: %i\n",aflag);
+	  CUT_CHECK_ERROR("Cuda_CommCuda_PackCommVel: Kernel execution failed");
+		
+	}		
+    return 6*n;
+}
+
+// Host wrapper for the self-communication kernel (3 values per atom): launches
+// Cuda_CommCuda_PackComm_Self_Kernel for the n entries of send list iswap,
+// passing `first` and the periodic shift. The kernel time is accumulated in
+// cuda_timings.comm_forward_kernel_self. Always returns 3*n, even when nothing
+// is launched.
+int Cuda_CommCuda_PackComm_Self(cuda_shared_data* sdata,int n,int iswap,int first,int* pbc,int pbc_flag)
+{
+	MYDBG(printf(" # CUDA: CommCuda_PackComm_Self\n");)
+    timespec time1,time2;
+	if(sdata->atom.update_nmax) 
+		Cuda_CommCuda_UpdateNmax(sdata);
+	if(sdata->atom.update_nlocal) 		
+		cudaMemcpyToSymbol(MY_CONST(nlocal)  , & sdata->atom.nlocal        , sizeof(int)      );
+	int size=n*3*sizeof(X_FLOAT);
+	if(sdata->buffer_new or (size>sdata->buffersize))
+		Cuda_CommCuda_UpdateBuffer(sdata,n);
+
+	// periodic shift handed to the kernel; stays zero when pbc_flag is 0
+	X_FLOAT dx=0.0;
+	X_FLOAT dy=0.0;
+	X_FLOAT dz=0.0;
+ 	if (pbc_flag != 0) {
+    if (sdata->domain.triclinic == 0) {
+      dx = pbc[0]*sdata->domain.prd[0];
+      dy = pbc[1]*sdata->domain.prd[1];
+      dz = pbc[2]*sdata->domain.prd[2];
+    } else {
+      dx = pbc[0]*sdata->domain.prd[0] + pbc[5]*sdata->domain.xy + pbc[4]*sdata->domain.xz;
+      dy = pbc[1]*sdata->domain.prd[1] + pbc[3]*sdata->domain.yz;
+      dz = pbc[2]*sdata->domain.prd[2];
+    }}	
+
+	int3 layout=getgrid(n);
+	dim3 threads(layout.z, 1, 1);
+	dim3 grid(layout.x, layout.y, 1);
+	if(sdata->atom.nlocal>0)
+	{
+
+clock_gettime(CLOCK_REALTIME,&time1);
+
+	  Cuda_CommCuda_PackComm_Self_Kernel<<<grid, threads,0>>>((int*) sdata->comm.sendlist.dev_data,n,sdata->comm.maxlistlength,iswap,dx,dy,dz,first);
+	  cudaThreadSynchronize();
+
+clock_gettime(CLOCK_REALTIME,&time2);
+sdata->cuda_timings.comm_forward_kernel_self+=
+      time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000;
+
+	  CUT_CHECK_ERROR("Cuda_CommCuda_PackComm_Self: Kernel execution failed");
+	}	
+	
+    return 3*n;
+}
+
+// Host wrapper for self-communication with velocities (6 values per atom).
+// The kernel time is accumulated in cuda_timings.comm_forward_kernel_self.
+// Always returns 6*n, even when nothing is launched.
+//
+// NOTE(review): this launches Cuda_CommCuda_PackComm_Self_Kernel, the same
+// kernel the position-only wrapper uses - confirm against comm_cuda_kernel.cu
+// whether a velocity variant should be launched here.
+int Cuda_CommCuda_PackCommVel_Self(cuda_shared_data* sdata,int n,int iswap,int first,int* pbc,int pbc_flag)
+{
+	MYDBG(printf(" # CUDA: CommCuda_PackCommVel_Self\n");)
+    timespec time1,time2;
+	if(sdata->atom.update_nmax) 
+		Cuda_CommCuda_UpdateNmax(sdata);
+	if(sdata->atom.update_nlocal) 		
+		cudaMemcpyToSymbol(MY_CONST(nlocal)  , & sdata->atom.nlocal        , sizeof(int)      );
+	int size=n*6*sizeof(X_FLOAT);
+	// Cuda_CommCuda_UpdateBuffer(sdata,k) guarantees k*3 X_FLOAT values, so ask
+	// for 2*n; with plain n the size test above would stay true on every call.
+	if(sdata->buffer_new or (size>sdata->buffersize))
+		Cuda_CommCuda_UpdateBuffer(sdata,2*n);
+
+	// periodic shift handed to the kernel; stays zero when pbc_flag is 0
+	X_FLOAT dx=0.0;
+	X_FLOAT dy=0.0;
+	X_FLOAT dz=0.0;
+ 	if (pbc_flag != 0) {
+    if (sdata->domain.triclinic == 0) {
+      dx = pbc[0]*sdata->domain.prd[0];
+      dy = pbc[1]*sdata->domain.prd[1];
+      dz = pbc[2]*sdata->domain.prd[2];
+    } else {
+      dx = pbc[0]*sdata->domain.prd[0] + pbc[5]*sdata->domain.xy + pbc[4]*sdata->domain.xz;
+      dy = pbc[1]*sdata->domain.prd[1] + pbc[3]*sdata->domain.yz;
+      dz = pbc[2]*sdata->domain.prd[2];
+    }}	
+
+	int3 layout=getgrid(n);
+	dim3 threads(layout.z, 1, 1);
+	dim3 grid(layout.x, layout.y, 1);
+	if(sdata->atom.nlocal>0)
+	{
+
+clock_gettime(CLOCK_REALTIME,&time1);
+
+	  Cuda_CommCuda_PackComm_Self_Kernel<<<grid, threads,0>>>((int*) sdata->comm.sendlist.dev_data,n,sdata->comm.maxlistlength,iswap,dx,dy,dz,first);
+	  cudaThreadSynchronize();
+
+clock_gettime(CLOCK_REALTIME,&time2);
+sdata->cuda_timings.comm_forward_kernel_self+=
+      time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000;
+
+	  CUT_CHECK_ERROR("Cuda_CommCuda_PackCommVel_Self: Kernel execution failed");
+	}	
+	
+    return 6*n;
+}
+
+// Host wrapper for the forward-communication unpack kernel (3 values per
+// atom). The kernel reads from the per-swap device receive buffer when
+// overlap_comm is set and iswap >= 0; otherwise buf_recv is first uploaded
+// into the shared staging buffer. Upload and kernel times are accumulated in
+// sdata->cuda_timings.
+void Cuda_CommCuda_UnpackComm(cuda_shared_data* sdata,int n,int first,void* buf_recv,int iswap)
+{
+	timespec t_a,t_b;
+
+	// re-upload device constants the host has flagged as changed
+	if(sdata->atom.update_nmax)
+		Cuda_CommCuda_UpdateNmax(sdata);
+	if(sdata->atom.update_nlocal)
+		cudaMemcpyToSymbol(MY_CONST(nlocal)  , & sdata->atom.nlocal        , sizeof(int)      );
+
+	const int nbytes=n*3*sizeof(X_FLOAT);
+	if(sdata->buffer_new or (nbytes>sdata->buffersize))
+		Cuda_CommCuda_UpdateBuffer(sdata,n);
+
+	int3 layout=getgrid(n);
+	dim3 threads(layout.z, 1, 1);
+	dim3 grid(layout.x, layout.y, 1);
+
+	// nothing is copied or launched unless sdata->atom.nlocal is positive
+	if(sdata->atom.nlocal<=0)
+		return;
+
+	clock_gettime(CLOCK_REALTIME,&t_a);
+	if(not sdata->overlap_comm||iswap<0)
+		cudaMemcpy(sdata->buffer,(void*)buf_recv, n*3*sizeof(X_FLOAT), cudaMemcpyHostToDevice);
+
+	clock_gettime(CLOCK_REALTIME,&t_b);
+	sdata->cuda_timings.comm_forward_upload+=
+		t_b.tv_sec-t_a.tv_sec+1.0*(t_b.tv_nsec-t_a.tv_nsec)/1000000000;
+
+	void* dev_buf=(sdata->overlap_comm&&iswap>=0)?sdata->comm.buf_recv_dev[iswap]:sdata->buffer;
+	Cuda_CommCuda_UnpackComm_Kernel<<<grid, threads,0>>>(n,first,dev_buf);
+	cudaThreadSynchronize();
+
+	clock_gettime(CLOCK_REALTIME,&t_a);
+	sdata->cuda_timings.comm_forward_kernel_unpack+=
+		t_a.tv_sec-t_b.tv_sec+1.0*(t_a.tv_nsec-t_b.tv_nsec)/1000000000;
+
+	CUT_CHECK_ERROR("Cuda_CommCuda_UnpackComm: Kernel execution failed");
+}
+
+void Cuda_CommCuda_UnpackCommVel(cuda_shared_data* sdata,int n,int first,void* buf_recv,int iswap)
+{
+  // Forward-comm unpack for messages carrying n*6 X_FLOAT values: uploads buf_recv into
+  // sdata->buffer (unless overlap_comm is set and iswap>=0, in which case
+  // comm.buf_recv_dev[iswap] is used) and launches the unpack kernel on that buffer.
+  // NOTE(review): the kernel launched is Cuda_CommCuda_UnpackComm_Kernel, the same one
+  // Cuda_CommCuda_UnpackComm uses for n*3 values; confirm it handles the 6-value layout.
+  timespec time1,time2;
+
+  if(sdata->atom.update_nmax) Cuda_CommCuda_UpdateNmax(sdata);
+  if(sdata->atom.update_nlocal)
+    cudaMemcpyToSymbol(MY_CONST(nlocal)  , & sdata->atom.nlocal        , sizeof(int)      );
+  int size=n*6*sizeof(X_FLOAT);
+  if(sdata->buffer_new or (size>sdata->buffersize))
+    Cuda_CommCuda_UpdateBuffer(sdata,n);
+
+  int3 layout=getgrid(n);
+  dim3 threads(layout.z, 1, 1);
+  dim3 grid(layout.x, layout.y, 1);
+  if(sdata->atom.nlocal>0)
+  {
+    clock_gettime(CLOCK_REALTIME,&time1);
+    if(not sdata->overlap_comm||iswap<0)
+      cudaMemcpy(sdata->buffer,(void*)buf_recv, n*6*sizeof(X_FLOAT), cudaMemcpyHostToDevice);
+    clock_gettime(CLOCK_REALTIME,&time2);
+    sdata->cuda_timings.comm_forward_upload+=
+      time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000;
+    void* buf=(sdata->overlap_comm&&iswap>=0)?sdata->comm.buf_recv_dev[iswap]:sdata->buffer;
+    Cuda_CommCuda_UnpackComm_Kernel<<<grid, threads,0>>>(n,first,buf);
+    cudaThreadSynchronize();
+    clock_gettime(CLOCK_REALTIME,&time1);  // time1 is reused as the post-kernel stamp
+    sdata->cuda_timings.comm_forward_kernel_unpack+=
+      time1.tv_sec-time2.tv_sec+1.0*(time1.tv_nsec-time2.tv_nsec)/1000000000;
+    // name this function in the message; it previously reported "Cuda_CommCuda_UnpackComm"
+    CUT_CHECK_ERROR("Cuda_CommCuda_UnpackCommVel: Kernel execution failed");
+  }
+}
+
+int Cuda_CommCuda_PackReverse(cuda_shared_data* sdata,int n,int first,void* buf_send)
+{
+  // Downloads three runs of n F_FLOAT values from the device array atom.f into buf_send.
+  // Run c is read at device offset first + c*nmax and written at host offset c*n.
+  // Returns 3*n.
+  if(sdata->atom.update_nmax) Cuda_CommCuda_UpdateNmax(sdata);
+  if(sdata->atom.update_nlocal)
+    cudaMemcpyToSymbol(MY_CONST(nlocal), &sdata->atom.nlocal, sizeof(int));
+
+  int bytes_needed = n*3*sizeof(F_FLOAT);
+  if(sdata->buffer_new or (bytes_needed > sdata->buffersize)) Cuda_CommCuda_UpdateBuffer(sdata,n);
+
+  F_FLOAT* host_dst = (F_FLOAT*)buf_send;
+  F_FLOAT* dev_src = (F_FLOAT*)sdata->atom.f.dev_data + first;
+  for(int c = 0; c < 3; c++)
+  {
+    // offsets are computed per run, so neither pointer is advanced past the last copy
+    cudaMemcpy(host_dst + c*n, dev_src + c*sdata->atom.nmax, n*sizeof(F_FLOAT), cudaMemcpyDeviceToHost);
+  }
+  return n*3;
+}
+
+
+void Cuda_CommCuda_UnpackReverse(cuda_shared_data* sdata,int n,int iswap,void* buf_recv)
+{
+  // Uploads n*3 F_FLOAT values from buf_recv into sdata->buffer and launches
+  // Cuda_CommCuda_UnpackReverse_Kernel with the device send list and swap index iswap.
+  if(sdata->atom.update_nmax) Cuda_CommCuda_UpdateNmax(sdata);
+  if(sdata->atom.update_nlocal)
+    cudaMemcpyToSymbol(MY_CONST(nlocal), &sdata->atom.nlocal, sizeof(int));
+
+  int upload_bytes = n*3*sizeof(F_FLOAT);
+  if(sdata->buffer_new or (upload_bytes > sdata->buffersize)) Cuda_CommCuda_UpdateBuffer(sdata,n);
+
+  int3 layout = getgrid(n);
+  dim3 threads(layout.z, 1, 1);
+  dim3 grid(layout.x, layout.y, 1);
+  if(sdata->atom.nlocal <= 0) return;   // no upload or launch unless atom.nlocal is positive
+
+  int* sendlist_dev = (int*) sdata->comm.sendlist.dev_data;
+  cudaMemcpy(sdata->buffer, buf_recv, upload_bytes, cudaMemcpyHostToDevice);
+  Cuda_CommCuda_UnpackReverse_Kernel<<<grid, threads,0>>>(sendlist_dev,n,sdata->comm.maxlistlength,iswap);
+  cudaThreadSynchronize();
+  CUT_CHECK_ERROR("Cuda_CommCuda_UnpackReverse: Kernel execution failed");
+}
+
+void Cuda_CommCuda_UnpackReverse_Self(cuda_shared_data* sdata,int n,int iswap,int first)
+{
+  // Launches Cuda_CommCuda_UnpackReverse_Self_Kernel with the device send list; no host
+  // buffer is transferred. The launch is skipped unless atom.nlocal is positive.
+  if(sdata->atom.update_nmax)
+    Cuda_CommCuda_UpdateNmax(sdata);
+  if(sdata->atom.update_nlocal)
+    cudaMemcpyToSymbol(MY_CONST(nlocal)  , & sdata->atom.nlocal        , sizeof(int)      );
+  int size=n*3*sizeof(X_FLOAT);
+  if(sdata->buffer_new or (size>sdata->buffersize))
+    Cuda_CommCuda_UpdateBuffer(sdata,n);
+
+  int3 layout=getgrid(n);
+  dim3 threads(layout.z, 1, 1);
+  dim3 grid(layout.x, layout.y, 1);
+  if(sdata->atom.nlocal>0)
+  {
+    Cuda_CommCuda_UnpackReverse_Self_Kernel<<<grid, threads,0>>>((int*) sdata->comm.sendlist.dev_data,n,sdata->comm.maxlistlength,iswap,first);
+    cudaThreadSynchronize();
+    CUT_CHECK_ERROR("Cuda_CommCuda_UnpackReverse_Self: Kernel execution failed");  // was "PackReverse_Self"
+  }
+}
+
+
+int Cuda_CommCuda_BuildSendlist(cuda_shared_data* sdata,int bordergroup,int ineed,int style,int atom_nfirst,int nfirst,int nlast,int dim,int iswap)
+{
+  // Launches the send-list kernel (Single when style==1, Multi otherwise) and returns
+  // the int read back from the start of sdata->buffer, which is zeroed before the launch.
+  MYDBG(printf(" # CUDA: CommCuda_BuildSendlist\n");)
+  timespec launch_time, done_time;
+  // nmax and nlocal are refreshed unconditionally here
+  Cuda_CommCuda_UpdateNmax(sdata);
+  cudaMemcpyToSymbol(MY_CONST(nlocal), &sdata->atom.nlocal, sizeof(int));
+  if(sdata->buffer_new or (80 > sdata->buffersize)) { Cuda_CommCuda_UpdateBuffer(sdata,10); }
+
+  // n only sizes the launch grid: nlast-nfirst+1, or the larger of atom_nfirst and
+  // nlast-nlocal+1 when bordergroup is set and ineed < 2.
+  int n;
+  if(bordergroup && ineed < 2)
+    n = (nlast-sdata->atom.nlocal+1 > atom_nfirst) ? nlast-sdata->atom.nlocal+1 : atom_nfirst;
+  else
+    n = nlast-nfirst+1;
+
+  int3 layout = getgrid(n,0,512,true);
+  dim3 threads(layout.z, 1, 1);
+  dim3 grid(layout.x+1, layout.y, 1);
+  const size_t shared_bytes = (threads.x+1)*sizeof(int);   // dynamic shared memory per block
+  int* sendlist_dev = (int*) sdata->comm.sendlist.dev_data;
+
+  cudaMemset((int*) (sdata->buffer),0,sizeof(int));   // zero the int read back as nsend below
+  clock_gettime(CLOCK_REALTIME,&launch_time);
+  if(style==1)
+    Cuda_CommCuda_BuildSendlist_Single<<<grid, threads,shared_bytes>>>(bordergroup, ineed, atom_nfirst, nfirst, nlast, dim, iswap,(X_FLOAT*) sdata->comm.slablo.dev_data,(X_FLOAT*) sdata->comm.slabhi.dev_data,sendlist_dev,sdata->comm.maxlistlength);
+  else
+    Cuda_CommCuda_BuildSendlist_Multi<<<grid, threads,shared_bytes>>>(bordergroup, ineed, atom_nfirst, nfirst, nlast, dim, iswap,(X_FLOAT*) sdata->comm.multilo.dev_data,(X_FLOAT*) sdata->comm.multihi.dev_data,sendlist_dev,sdata->comm.maxlistlength);
+  cudaThreadSynchronize();
+  clock_gettime(CLOCK_REALTIME,&done_time);
+  sdata->cuda_timings.comm_border_kernel_buildlist +=
+    done_time.tv_sec-launch_time.tv_sec+1.0*(done_time.tv_nsec-launch_time.tv_nsec)/1000000000;
+  CUT_CHECK_ERROR("Cuda_CommCuda_BuildSendlist: Kernel execution failed");
+  int nsend;
+  cudaMemcpy(&nsend, sdata->buffer, sizeof(int), cudaMemcpyDeviceToHost);
+  return nsend;
+}
+
diff --git a/src/USER-CUDA/compute_pe_cuda.cpp b/src/USER-CUDA/compute_pe_cuda.cpp
new file mode 100644
index 0000000000..0d93aea249
--- /dev/null
+++ b/src/USER-CUDA/compute_pe_cuda.cpp
@@ -0,0 +1,61 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under 
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#include "mpi.h"
+#include <cstring>
+#include "compute_pe_cuda.h"
+#include "atom.h"
+#include "update.h"
+#include "force.h"
+#include "pair.h"
+#include "bond.h"
+#include "angle.h"
+#include "dihedral.h"
+#include "improper.h"
+#include "kspace.h"
+#include "modify.h"
+#include "domain.h"
+#include "error.h"
+
+using namespace LAMMPS_NS;
+
+/* ---------------------------------------------------------------------- */
+
+// Forwards all arguments to ComputePE, then sets the cudable flag.
+ComputePECuda::ComputePECuda(LAMMPS *lmp, int narg, char **arg) : ComputePE(lmp, narg, arg)
+{
+  this->cudable = 1;
+}
diff --git a/src/USER-CUDA/compute_pe_cuda.h b/src/USER-CUDA/compute_pe_cuda.h
new file mode 100644
index 0000000000..71444f671c
--- /dev/null
+++ b/src/USER-CUDA/compute_pe_cuda.h
@@ -0,0 +1,59 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under 
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#ifdef COMPUTE_CLASS
+
+ComputeStyle(pe/cuda,ComputePECuda)
+
+#else
+
+#ifndef LMP_COMPUTE_PE_CUDA_H
+#define LMP_COMPUTE_PE_CUDA_H
+
+#include "compute_pe.h"
+
+namespace LAMMPS_NS {
+
+class ComputePECuda : public ComputePE {   // registered above as compute style pe/cuda
+ public:
+  ComputePECuda(class LAMMPS *lmp, int narg, char **arg);
+  ~ComputePECuda() {}                      // no members of its own to release
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-CUDA/compute_pressure_cuda.cpp b/src/USER-CUDA/compute_pressure_cuda.cpp
new file mode 100644
index 0000000000..bb3e49e8e9
--- /dev/null
+++ b/src/USER-CUDA/compute_pressure_cuda.cpp
@@ -0,0 +1,97 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under 
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#include "mpi.h"
+#include <cstring>
+#include <cstdlib>
+#include "compute_pressure_cuda.h"
+#include "atom.h"
+#include "update.h"
+#include "domain.h"
+#include "modify.h"
+#include "fix.h"
+#include "force.h"
+#include "pair.h"
+#include "bond.h"
+#include "angle.h"
+#include "dihedral.h"
+#include "improper.h"
+#include "kspace.h"
+#include "error.h"
+#include "cuda.h"
+
+using namespace LAMMPS_NS;
+
+enum{DUMMY0,INVOKED_SCALAR,INVOKED_VECTOR,DUMMMY3,INVOKED_PERATOM};
+
+/* ---------------------------------------------------------------------- */
+
+ComputePressureCuda::ComputePressureCuda(LAMMPS *lmp, int narg, char **arg) :
+  ComputePressure(lmp, narg, arg)
+{
+  // Requires lmp->cuda to be set. The compute stays cudable only if the compute whose
+  // ID is arg[3] (the temperature compute) is itself cudable.
+  cuda = lmp->cuda;
+  if(cuda == NULL)
+    error->all("You cannot use a /cuda class, without activating 'cuda' acceleration. Use no '-a' command line argument, or '-a cuda'.");
+  cudable = 1;
+
+  // look the temperature compute up through a private copy of its ID string
+  int n = strlen(arg[3]) + 1;
+  char* id_temp = new char[n];
+  strcpy(id_temp,arg[3]);
+  int icompute = modify->find_compute(id_temp);
+  delete [] id_temp;   // allocated with new[], so plain delete was undefined behaviour
+  // presumably find_compute signals an unknown ID with a negative index (confirm); never index with it
+  if (icompute < 0) error->all("Could not find compute pressure/cuda temperature ID");
+  if (modify->compute[icompute]->cudable == 0)
+  {
+    error->warning("Compute pressure/cuda temperature ID is not cudable! Try a temp/cuda style.");
+    cudable = 0;
+  }
+}
+
+double ComputePressureCuda::compute_scalar()
+{
+  if(not temperature->cudable && cuda->finished_setup) cuda->downloadAll();  // presumably refreshes host data for the base class (confirm)
+  return ComputePressure::compute_scalar();  // hand the base-class result to the caller; it used to be dropped
+}
+
+void ComputePressureCuda::compute_vector()
+{
+  if(!temperature->cudable && cuda->finished_setup) { cuda->downloadAll(); }  // runs before the base-class computation
+  ComputePressure::compute_vector();   // the vector itself is computed by ComputePressure
+}
diff --git a/src/USER-CUDA/compute_pressure_cuda.h b/src/USER-CUDA/compute_pressure_cuda.h
new file mode 100644
index 0000000000..d99f4a5cca
--- /dev/null
+++ b/src/USER-CUDA/compute_pressure_cuda.h
@@ -0,0 +1,63 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under 
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+#ifdef COMPUTE_CLASS
+
+ComputeStyle(pressure/cuda,ComputePressureCuda)
+
+#else
+
+#ifndef LMP_COMPUTE_PRESSURE_CUDA_H
+#define LMP_COMPUTE_PRESSURE_CUDA_H
+
+#include "compute_pressure.h"
+
+namespace LAMMPS_NS {
+
+class ComputePressureCuda : public ComputePressure {   // registered above as pressure/cuda
+ public:
+  ComputePressureCuda(class LAMMPS *lmp, int narg, char **arg);
+  ~ComputePressureCuda() {}
+  double compute_scalar();   // both forward to ComputePressure, after an optional
+  void compute_vector();     // cuda->downloadAll() (see the .cpp definitions)
+
+ private:
+  class Cuda *cuda;          // copied from lmp->cuda in the constructor
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-CUDA/compute_temp_cuda.cpp b/src/USER-CUDA/compute_temp_cuda.cpp
new file mode 100644
index 0000000000..a16939f95c
--- /dev/null
+++ b/src/USER-CUDA/compute_temp_cuda.cpp
@@ -0,0 +1,212 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under 
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#include "mpi.h"
+#include <cstdlib>
+#include <cstdio>
+#include <cstring>
+#include "compute_temp_cuda.h"
+#include "compute_temp_cuda_cu.h"
+#include "atom.h"
+#include "update.h"
+#include "force.h"
+#include "domain.h"
+#include "modify.h"
+#include "fix.h"
+#include "group.h"
+#include "error.h"
+#include "cuda.h"
+
+using namespace LAMMPS_NS;
+
+/* ---------------------------------------------------------------------- */
+
+ComputeTempCuda::ComputeTempCuda(LAMMPS *lmp, int narg, char **arg)
+  : Compute(lmp, narg, arg)
+{
+  // Needs lmp->cuda to be set and exactly three arguments.
+  cuda = lmp->cuda;
+  if (!cuda) error->all("You cannot use a /cuda class, without activating 'cuda' acceleration. Use no '-a' command line argument, or '-a cuda'.");
+  if (narg != 3) { error->all("Illegal compute temp/cuda command"); }
+
+  // Output flags: a scalar and a 6-entry vector are both provided.
+  scalar_flag = 1;
+  vector_flag = 1;
+  size_vector = 6;
+  extscalar = 0;
+  extvector = 1;
+  tempflag = 1;
+  vector = new double[6];
+  // The cCudaData wrappers start out null; compute_scalar()/compute_vector() create them.
+  cu_t_vector = 0;
+  cu_t_scalar = 0;
+  cudable = true;
+}
+
+/* ---------------------------------------------------------------------- */
+
+ComputeTempCuda::~ComputeTempCuda()
+{
+  delete [] this->vector;      // result array allocated in the constructor
+  delete this->cu_t_vector;    // cCudaData wrappers; still 0 (which delete accepts)
+  delete this->cu_t_scalar;    // if no compute call ever created them
+}
+
+/* ---------------------------------------------------------------------- */
+
+void ComputeTempCuda::init()
+{
+  // Accumulate dof(igroup) over all fixes into fix_dof, then let dof_compute() refresh dof and tfactor.
+  fix_dof = 0;
+  for (int ifix = 0; ifix < modify->nfix; ++ifix) fix_dof += modify->fix[ifix]->dof(igroup);
+  dof_compute();
+}
+
+/* ---------------------------------------------------------------------- */
+
+void ComputeTempCuda::dof_compute()
+{
+  // dof = dimension*group->count(igroup) - (extra_dof + fix_dof); tfactor = mvv2e/(dof*boltz), or 0 if dof <= 0
+  const double group_atoms = group->count(igroup);
+  dof = domain->dimension * group_atoms;
+  dof -= extra_dof + fix_dof;
+  tfactor = (dof > 0.0) ? force->mvv2e / (dof * force->boltz) : 0.0;
+}
+
+/* ---------------------------------------------------------------------- */
+
+double ComputeTempCuda::compute_scalar()
+{
+  // Returns tfactor times the MPI sum of t_scalar. While cuda->begin_setup is set,
+  // t_scalar is downloaded from the device after Cuda_ComputeTempCuda_Scalar; otherwise
+  // the host loop stores the sum of m*(vx^2+vy^2+vz^2) over atoms matching groupbit.
+  invoked_scalar = update->ntimestep;
+
+  if (cuda->begin_setup) {
+    // the cCudaData wrappers around t_vector/t_scalar are created on first use
+    if (!cu_t_vector) cu_t_vector = new cCudaData<double, ENERGY_FLOAT, x> (t_vector,6);
+    if (!cu_t_scalar) cu_t_scalar = new cCudaData<double, ENERGY_FLOAT, x> (&t_scalar,1);
+    Cuda_ComputeTempCuda_Scalar(&cuda->shared_data,groupbit,(ENERGY_FLOAT*) cu_t_scalar->dev_data());
+    cu_t_scalar->download();
+  } else {
+    double **v = atom->v;
+    double *mass = atom->mass;
+    double *rmass = atom->rmass;
+    int *type = atom->type;
+    int *mask = atom->mask;
+    const int nlocal = atom->nlocal;
+
+    // mass comes per atom from rmass when that array exists, else per type from mass
+    double mv2_sum = 0.0;
+    for (int i = 0; i < nlocal; i++) {
+      if (!(mask[i] & groupbit)) continue;
+      const double vsq = v[i][0]*v[i][0] + v[i][1]*v[i][1] + v[i][2]*v[i][2];
+      mv2_sum += vsq * (rmass ? rmass[i] : mass[type[i]]);
+    }
+    t_scalar = mv2_sum;
+  }
+
+  // combine the per-rank values, refresh dof when dynamic is set, then scale
+  MPI_Allreduce(&t_scalar,&scalar,1,MPI_DOUBLE,MPI_SUM,world);
+  if (dynamic) dof_compute();
+  scalar *= tfactor;
+
+  // Diagnostic path: above 1e15, print each local atom with |v|^2 > 1e5 and call error->all.
+  if (scalar > 1e15) {
+    cuda->cu_v->download();
+    cuda->cu_x->download();
+    cuda->cu_type->download();
+    double **v = atom->v;
+    double **x = atom->x;
+    printf("Out of v-range atoms:  \n");
+    for (int i = 0; i < atom->nlocal; i++) {
+      const double vsq = v[i][0]*v[i][0] + v[i][1]*v[i][1] + v[i][2]*v[i][2];
+      if (vsq > 1e5)
+        printf("%i %i // %lf %lf %lf // %lf %lf %lf\n",atom->tag[i],atom->type[i],x[i][0], x[i][1], x[i][2],v[i][0], v[i][1], v[i][2]);
+    }
+    error->all("Temperature out of range. Simulations will be abortet.\n");
+  }
+
+  return scalar;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void ComputeTempCuda::compute_vector()
+{
+  // Fills vector[0..5] with mvv2e times the MPI sum of t_vector. While cuda->begin_setup
+  // is set, t_vector is downloaded from the device after Cuda_ComputeTempCuda_Vector;
+  // otherwise the host loop stores sum(m*v_a*v_b) in the order xx, yy, zz, xy, xz, yz.
+  invoked_vector = update->ntimestep;
+
+  if (cuda->begin_setup) {
+    // the cCudaData wrappers around t_vector/t_scalar are created on first use
+    if (!cu_t_vector) cu_t_vector = new cCudaData<double, ENERGY_FLOAT, x> (t_vector,6);
+    if (!cu_t_scalar) cu_t_scalar = new cCudaData<double, ENERGY_FLOAT, x> (&t_scalar,1);
+    Cuda_ComputeTempCuda_Vector(&cuda->shared_data,groupbit,(ENERGY_FLOAT*) cu_t_vector->dev_data());
+    cu_t_vector->download();
+  } else {
+    double **v = atom->v;
+    double *mass = atom->mass;
+    double *rmass = atom->rmass;
+    int *type = atom->type;
+    int *mask = atom->mask;
+    const int nlocal = atom->nlocal;
+
+    // accumulate in a local array, then copy the six sums into t_vector
+    double sums[6] = {0.0, 0.0, 0.0, 0.0, 0.0, 0.0};
+    for (int iatom = 0; iatom < nlocal; iatom++) {
+      if (!(mask[iatom] & groupbit)) continue;
+
+      // per-atom mass when rmass exists, per-type mass otherwise
+      const double massone = rmass ? rmass[iatom] : mass[type[iatom]];
+      const double *vel = v[iatom];
+
+      sums[0] += massone * vel[0]*vel[0];
+      sums[1] += massone * vel[1]*vel[1];
+      sums[2] += massone * vel[2]*vel[2];
+      sums[3] += massone * vel[0]*vel[1];
+      sums[4] += massone * vel[0]*vel[2];
+      sums[5] += massone * vel[1]*vel[2];
+    }
+
+    for (int k = 0; k < 6; k++) t_vector[k] = sums[k];
+  }
+
+  // combine the per-rank sums, then apply the mvv2e factor
+  MPI_Allreduce(t_vector,vector,6,MPI_DOUBLE,MPI_SUM,world);
+  for (int k = 0; k < 6; k++) vector[k] *= force->mvv2e;
+}
diff --git a/src/USER-CUDA/compute_temp_cuda.h b/src/USER-CUDA/compute_temp_cuda.h
new file mode 100644
index 0000000000..35ae0bbf3f
--- /dev/null
+++ b/src/USER-CUDA/compute_temp_cuda.h
@@ -0,0 +1,75 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under 
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#ifdef COMPUTE_CLASS
+
+ComputeStyle(temp/cuda,ComputeTempCuda)
+
+#else
+
+#ifndef LMP_COMPUTE_TEMP_CUDA_H
+#define LMP_COMPUTE_TEMP_CUDA_H
+
+#include "compute.h"
+#include "cuda_data.h"
+
+namespace LAMMPS_NS {
+
+class ComputeTempCuda : public Compute {   // registered above as compute style temp/cuda
+ public:
+  ComputeTempCuda(class LAMMPS *lmp, int narg, char **arg);
+  ~ComputeTempCuda();
+  void init();
+  double compute_scalar();
+  void compute_vector();
+
+ private:
+  class Cuda *cuda;     // copied from lmp->cuda in the constructor
+  int fix_dof;          // sum of fix->dof(igroup), set in init()
+  double tfactor;       // scale applied to the scalar result, set in dof_compute()
+
+  void dof_compute();
+  // host-side per-rank accumulators and the cCudaData wrappers created around them
+  double t_vector[6];
+  double t_scalar;
+  cCudaData<double, ENERGY_FLOAT, x>* cu_t_scalar;
+  cCudaData<double, ENERGY_FLOAT, x>* cu_t_vector;
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-CUDA/compute_temp_partial_cuda.cpp b/src/USER-CUDA/compute_temp_partial_cuda.cpp
new file mode 100644
index 0000000000..2965e273cd
--- /dev/null
+++ b/src/USER-CUDA/compute_temp_partial_cuda.cpp
@@ -0,0 +1,357 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under 
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#include "mpi.h"
+#include <cstdlib>
+#include <cstdio>
+#include <cstring>
+#include "compute_temp_partial_cuda.h"
+#include "compute_temp_partial_cuda_cu.h"
+#include "atom.h"
+#include "update.h"
+#include "force.h"
+#include "domain.h"
+#include "modify.h"
+#include "fix.h"
+#include "group.h"
+#include "memory.h"
+#include "error.h"
+#include "cuda.h"
+
+using namespace LAMMPS_NS;
+
+/* ---------------------------------------------------------------------- */
+
+ComputeTempPartialCuda::ComputeTempPartialCuda(LAMMPS *lmp, int narg, char **arg)
+  : Compute(lmp, narg, arg)
+{
+  // Needs lmp->cuda to be set and exactly six arguments; arg[3..5] are parsed with
+  // atoi() into xflag, yflag and zflag.
+  cuda = lmp->cuda;
+  if (!cuda) error->all("You cannot use a /cuda class, without activating 'cuda' acceleration. Use no '-a' command line argument, or '-a cuda'.");
+  if (narg != 6) { error->all("Illegal compute temp/partial command"); }
+
+  // Output flags: a scalar and a 6-entry vector; tempbias is set as well.
+  scalar_flag = 1;
+  vector_flag = 1;
+  size_vector = 6;
+  extscalar = 0;
+  extvector = 1;
+  tempflag = 1;
+  tempbias = 1;
+
+  xflag = atoi(arg[3]);
+  yflag = atoi(arg[4]);
+  zflag = atoi(arg[5]);
+  if (zflag && domain->dimension == 2) { error->all("Compute temp/partial cannot use vz for 2d systemx"); }
+
+  // The bias array and the cCudaData wrappers start out empty/null.
+  maxbias = 0;
+  vbiasall = NULL;
+  vector = new double[6];
+  cu_t_vector = 0;
+  cu_t_scalar = 0;
+  cu_vbiasall = NULL;
+  cudable = true;
+}
+
+/* ---------------------------------------------------------------------- */
+
+ComputeTempPartialCuda::~ComputeTempPartialCuda()
+{
+  memory->destroy(vbiasall);   // bias array; NULL until remove_bias_all() allocates it
+  delete [] this->vector;      // result array allocated in the constructor
+  delete this->cu_t_vector;    // cCudaData wrappers; any of them may still be null,
+  delete this->cu_t_scalar;    // which delete accepts
+  delete this->cu_vbiasall;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void ComputeTempPartialCuda::init()
+{
+  // Accumulate dof(igroup) over all fixes into fix_dof, then let dof_compute() refresh dof and tfactor.
+  fix_dof = 0;
+  for (int ifix = 0; ifix < modify->nfix; ++ifix) fix_dof += modify->fix[ifix]->dof(igroup);
+  dof_compute();
+}
+
+/* ---------------------------------------------------------------------- */
+
+void ComputeTempPartialCuda::dof_compute()
+{
+  // dof = nper*natoms - ((nper/dimension)*fix_dof + extra_dof) with nper = xflag+yflag+zflag; tfactor is 0 if dof <= 0
+  const int nper = xflag+yflag+zflag;
+  const double natoms = group->count(igroup);
+  dof = nper * natoms;
+  dof -= (1.0*nper/domain->dimension)*fix_dof + extra_dof;
+  tfactor = (dof > 0) ? force->mvv2e / (dof * force->boltz) : 0.0;
+}
+
+/* ---------------------------------------------------------------------- */
+
+int ComputeTempPartialCuda::dof_remove(int i)
+{
+  // The argument i is not used: the result is dimension minus the sum of the three flags.
+  return domain->dimension - (xflag + yflag + zflag);
+}
+
+/* ---------------------------------------------------------------------- */
+
+double ComputeTempPartialCuda::compute_scalar()
+{
+  // Returns tfactor times the MPI sum of t_scalar. While cuda->begin_setup is set,
+  // t_scalar is downloaded from the device after Cuda_ComputeTempPartialCuda_Scalar;
+  // otherwise the host loop sums m*(xflag*vx^2 + yflag*vy^2 + zflag*vz^2) over the group.
+  invoked_scalar = update->ntimestep;
+
+  if (cuda->begin_setup) {
+    // the cCudaData wrappers around t_vector/t_scalar are created on first use
+    if (!cu_t_vector) cu_t_vector = new cCudaData<double, ENERGY_FLOAT, x> (t_vector,6);
+    if (!cu_t_scalar) cu_t_scalar = new cCudaData<double, ENERGY_FLOAT, x> (&t_scalar,1);
+    Cuda_ComputeTempPartialCuda_Scalar(&cuda->shared_data,groupbit,(ENERGY_FLOAT*) cu_t_scalar->dev_data(),xflag,yflag,zflag);
+    cu_t_scalar->download();
+  } else {
+    double **v = atom->v;
+    double *mass = atom->mass;
+    double *rmass = atom->rmass;
+    int *type = atom->type;
+    int *mask = atom->mask;
+    const int nlocal = atom->nlocal;
+
+    // mass comes per atom from rmass when that array exists, else per type from mass
+    double mv2_sum = 0.0;
+    for (int i = 0; i < nlocal; i++) {
+      if (!(mask[i] & groupbit)) continue;
+      const double vsq = xflag*v[i][0]*v[i][0] + yflag*v[i][1]*v[i][1] + zflag*v[i][2]*v[i][2];
+      mv2_sum += vsq * (rmass ? rmass[i] : mass[type[i]]);
+    }
+    t_scalar = mv2_sum;
+  }
+
+  // combine the per-rank values, refresh dof when dynamic is set, then scale
+  MPI_Allreduce(&t_scalar,&scalar,1,MPI_DOUBLE,MPI_SUM,world);
+  if (dynamic) dof_compute();
+  scalar *= tfactor;
+
+  // Diagnostic path: above 1e15, print each local atom with |v|^2 > 1e5 and call error->all.
+  if (scalar > 1e15) {
+    cuda->cu_v->download();
+    cuda->cu_x->download();
+    cuda->cu_type->download();
+    double **v = atom->v;
+    double **x = atom->x;
+    printf("Out of v-range atoms:  \n");
+    for (int i = 0; i < atom->nlocal; i++) {
+      const double vsq = v[i][0]*v[i][0] + v[i][1]*v[i][1] + v[i][2]*v[i][2];
+      if (vsq > 1e5)
+        printf("%i %i // %lf %lf %lf // %lf %lf %lf\n",atom->tag[i],atom->type[i],x[i][0], x[i][1], x[i][2],v[i][0], v[i][1], v[i][2]);
+    }
+    error->all("Temperature out of range. Simulations will be abortet.\n");
+  }
+
+  return scalar;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void ComputeTempPartialCuda::compute_vector()
+{
+  // Fills vector[0..5] with mvv2e times the MPI sum of t_vector. While cuda->begin_setup
+  // is set, t_vector is downloaded after Cuda_ComputeTempPartialCuda_Vector; otherwise the
+  // host loop stores flag-weighted sum(m*v_a*v_b) in the order xx, yy, zz, xy, xz, yz.
+  invoked_vector = update->ntimestep;
+
+  if (cuda->begin_setup) {
+    // the cCudaData wrappers around t_vector/t_scalar are created on first use
+    if (!cu_t_vector) cu_t_vector = new cCudaData<double, ENERGY_FLOAT, x> (t_vector,6);
+    if (!cu_t_scalar) cu_t_scalar = new cCudaData<double, ENERGY_FLOAT, x> (&t_scalar,1);
+    Cuda_ComputeTempPartialCuda_Vector(&cuda->shared_data,groupbit,(ENERGY_FLOAT*) cu_t_vector->dev_data(),xflag,yflag,zflag);
+    cu_t_vector->download();
+  } else {
+    double **v = atom->v;
+    double *mass = atom->mass;
+    double *rmass = atom->rmass;
+    int *type = atom->type;
+    int *mask = atom->mask;
+    const int nlocal = atom->nlocal;
+
+    // accumulate in a local array, then copy the six sums into t_vector
+    double sums[6] = {0.0, 0.0, 0.0, 0.0, 0.0, 0.0};
+    for (int iatom = 0; iatom < nlocal; iatom++) {
+      if (!(mask[iatom] & groupbit)) continue;
+
+      // per-atom mass when rmass exists, per-type mass otherwise
+      const double massone = rmass ? rmass[iatom] : mass[type[iatom]];
+      const double *vel = v[iatom];
+
+      sums[0] += massone * xflag*vel[0]*vel[0];
+      sums[1] += massone * yflag*vel[1]*vel[1];
+      sums[2] += massone * zflag*vel[2]*vel[2];
+      sums[3] += massone * xflag*yflag*vel[0]*vel[1];
+      sums[4] += massone * xflag*zflag*vel[0]*vel[2];
+      sums[5] += massone * yflag*zflag*vel[1]*vel[2];
+    }
+
+    for (int k = 0; k < 6; k++) t_vector[k] = sums[k];
+  }
+
+  // combine the per-rank sums, then apply the mvv2e factor
+  MPI_Allreduce(t_vector,vector,6,MPI_DOUBLE,MPI_SUM,world);
+  for (int k = 0; k < 6; k++) vector[k] *= force->mvv2e;
+}
+
+/* ----------------------------------------------------------------------
+   remove velocity bias from atom I to leave thermal velocity
+------------------------------------------------------------------------- */
+
+void ComputeTempPartialCuda::remove_bias(int i, double *v)
+{
+  // Velocity components whose flag is off are treated as bias:
+  // each one is stashed in vbias[] and zeroed in the caller's vector.
+  // Components whose flag is on are left untouched, and so is the
+  // matching vbias[] entry.
+  // The atom index i is not used.
+  const int enabled[3] = {xflag,yflag,zflag};
+
+  for (int dim = 0; dim < 3; dim++) {
+    if (enabled[dim]) continue;
+    vbias[dim] = v[dim];
+    v[dim] = 0.0;
+  }
+}
+
+/* ----------------------------------------------------------------------
+   remove velocity bias from all atoms to leave thermal velocity
+------------------------------------------------------------------------- */
+
+void ComputeTempPartialCuda::remove_bias_all()
+{  // stash and zero the disabled velocity components of every group atom
+  double **v = atom->v;
+  int *mask = atom->mask;
+  int nlocal = atom->nlocal;
+  // grow the stash (host array plus its cCudaData wrapper) when it is too small
+  if (nlocal > maxbias) {
+    memory->destroy(vbiasall);
+    maxbias = atom->nmax;
+    memory->create(vbiasall,maxbias,3,"temp/partial:vbiasall");  // maxbias rows of 3 doubles
+	delete cu_vbiasall;  // old wrapper referred to the array destroyed above
+	cu_vbiasall = new cCudaData<double, V_FLOAT, yx> ((double*)vbiasall, atom->nmax, 3);
+  }
+  if(cuda->begin_setup)
+  {  // device path: the library routine gets the device buffer of cu_vbiasall
+  		Cuda_ComputeTempPartialCuda_RemoveBiasAll(&cuda->shared_data,groupbit,xflag,yflag,zflag,cu_vbiasall->dev_data());
+  }
+  else
+  {  // host path: one pass over the local atoms per disabled component
+  if (!xflag) {
+    for (int i = 0; i < nlocal; i++)
+      if (mask[i] & groupbit) {
+	vbiasall[i][0] = v[i][0];  // remember the removed x component
+	v[i][0] = 0.0;
+      }
+  }
+  if (!yflag) {
+    for (int i = 0; i < nlocal; i++)
+      if (mask[i] & groupbit) {
+	vbiasall[i][1] = v[i][1];  // remember the removed y component
+	v[i][1] = 0.0;
+      }
+  }
+  if (!zflag) {
+    for (int i = 0; i < nlocal; i++)
+      if (mask[i] & groupbit) {
+	vbiasall[i][2] = v[i][2];  // remember the removed z component
+	v[i][2] = 0.0;
+      }
+  }
+  }
+}
+
+/* ----------------------------------------------------------------------
+   add back in velocity bias to atom I removed by remove_bias()
+   assume remove_bias() was previously called
+------------------------------------------------------------------------- */
+
+void ComputeTempPartialCuda::restore_bias(int i, double *v)
+{
+  // add back each component stashed in vbias[] by remove_bias(); enabled ones are left alone
+  const int enabled[3] = {xflag,yflag,zflag};
+  for (int dim = 0; dim < 3; dim++) if (!enabled[dim]) v[dim] += vbias[dim];
+}
+
+/* ----------------------------------------------------------------------
+   add back in velocity bias to all atoms removed by remove_bias_all()
+   assume remove_bias_all() was previously called
+------------------------------------------------------------------------- */
+
+void ComputeTempPartialCuda::restore_bias_all()
+{  // add the values stashed by remove_bias_all() back onto the group atoms
+  double **v = atom->v;
+  int *mask = atom->mask;
+  int nlocal = atom->nlocal;
+  if(cuda->begin_setup)
+  {  // device path: the library routine gets the device buffer of cu_vbiasall
+  		Cuda_ComputeTempPartialCuda_RestoreBiasAll(&cuda->shared_data,groupbit,xflag,yflag,zflag,cu_vbiasall->dev_data());
+  }
+  else
+  {
+  // host path: only the disabled components were removed, so only those are restored
+  if (!xflag) {
+    for (int i = 0; i < nlocal; i++)
+      if (mask[i] & groupbit)
+	v[i][0] += vbiasall[i][0];
+  }
+  if (!yflag) {
+    for (int i = 0; i < nlocal; i++)
+      if (mask[i] & groupbit)
+	v[i][1] += vbiasall[i][1];
+  }
+  if (!zflag) {
+    for (int i = 0; i < nlocal; i++)
+      if (mask[i] & groupbit)
+	v[i][2] += vbiasall[i][2];
+  }
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+double ComputeTempPartialCuda::memory_usage()
+{
+  // host bytes held by vbiasall, which remove_bias_all() creates as maxbias rows of 3 doubles
+  return (double) maxbias * 3 * sizeof(double);
+}
diff --git a/src/USER-CUDA/compute_temp_partial_cuda.h b/src/USER-CUDA/compute_temp_partial_cuda.h
new file mode 100644
index 0000000000..4412adc88a
--- /dev/null
+++ b/src/USER-CUDA/compute_temp_partial_cuda.h
@@ -0,0 +1,83 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under 
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#ifdef COMPUTE_CLASS
+
+ComputeStyle(temp/partial/cuda,ComputeTempPartialCuda)
+
+#else
+
+#ifndef LMP_COMPUTE_TEMP_PARTIAL_CUDA_H
+#define LMP_COMPUTE_TEMP_PARTIAL_CUDA_H
+
+#include "compute.h"
+#include "cuda_data.h"
+
+namespace LAMMPS_NS {
+
+class ComputeTempPartialCuda : public Compute {  // compute style temp/partial/cuda
+ public:
+  ComputeTempPartialCuda(class LAMMPS *, int, char **);
+  ~ComputeTempPartialCuda();
+  void init();
+  double compute_scalar();           // returns the reduced scalar after scaling by tfactor
+  void compute_vector();             // fills the 6-entry vector (xx,yy,zz,xy,xz,yz)
+
+  int dof_remove(int);
+  void remove_bias(int, double *);   // single atom: stash and zero the disabled components
+  void remove_bias_all();            // same for all atoms of the group
+  void restore_bias(int, double *);  // single atom: add the stashed components back
+  void restore_bias_all();           // same for all atoms of the group
+  double memory_usage();
+
+ private:
+  class Cuda *cuda;                  // begin_setup selects the device code paths
+  int xflag,yflag,zflag;             // 0 = component is excluded and treated as bias
+  int fix_dof;
+  double tfactor;                    // multiplies the reduced scalar in compute_scalar()
+
+  void dof_compute();
+  double t_vector[6];                // per-proc values, send buffer of the MPI reduction
+  double t_scalar;                   // per-proc value, send buffer of the MPI reduction
+  cCudaData<double     , ENERGY_FLOAT   		, x>* cu_t_scalar;	// wraps &t_scalar
+  cCudaData<double     , ENERGY_FLOAT   		, x>* cu_t_vector;	// wraps t_vector
+  cCudaData<double, V_FLOAT, yx>* cu_vbiasall;  // wraps vbiasall, rebuilt when it is reallocated
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-CUDA/cuda.cpp b/src/USER-CUDA/cuda.cpp
index f5ff1ea72d..66273775e7 100644
--- a/src/USER-CUDA/cuda.cpp
+++ b/src/USER-CUDA/cuda.cpp
@@ -230,7 +230,7 @@ void Cuda::accelerator(int narg, char** arg)
 	  {
 	  	if(++i==narg) 
 	  	  error->all("Invalid Options for 'accelerator' command. Expecting a string after 'suffix' option."); 
-	  	strcpy(lmp->asuffix,arg[i]);
+	  	strcpy(lmp->suffix,arg[i]);
 	  }
 	  if(strcmp(arg[i],"overlap_comm")==0) 
 	  {
diff --git a/src/USER-CUDA/cuda_neigh_list.cpp b/src/USER-CUDA/cuda_neigh_list.cpp
index e6d7a6f516..5715d8cb8b 100644
--- a/src/USER-CUDA/cuda_neigh_list.cpp
+++ b/src/USER-CUDA/cuda_neigh_list.cpp
@@ -29,12 +29,16 @@
 #include <algorithm>
 #include "cuda.h"
 #include "atom.h"
+#include "error.h"
 
 using namespace LAMMPS_NS;
 
 CudaNeighList::CudaNeighList(LAMMPS *lmp, class NeighList* neigh_list) : Pointers(lmp)
 {
         cuda = lmp->cuda;
+   if(cuda == NULL)
+        error->all("You cannot use a /cuda class, without activating 'cuda' acceleration. Use no '-a' command line argument, or '-a cuda'.");
+
 	MYDBG(printf("# CUDA: CudaNeighList::cudaNeighList() ... start\n");)
 	this->neigh_list = neigh_list;
 	neigh_list->cuda_list=this;
diff --git a/src/USER-CUDA/domain_cuda.cpp b/src/USER-CUDA/domain_cuda.cpp
index fc8d8bb498..438b47b28c 100644
--- a/src/USER-CUDA/domain_cuda.cpp
+++ b/src/USER-CUDA/domain_cuda.cpp
@@ -54,6 +54,8 @@ enum{NO_REMAP,X_REMAP,V_REMAP};                   // same as fix_deform.cpp
 DomainCuda::DomainCuda(LAMMPS *lmp) : Domain(lmp)
 {
   cuda = lmp->cuda;
+   if(cuda == NULL)
+        error->all("You cannot use a /cuda class, without activating 'cuda' acceleration. Use no '-a' command line argument, or '-a cuda'.");
 }
 
 /* ---------------------------------------------------------------------- */
diff --git a/src/USER-CUDA/fft3d_cuda.cpp b/src/USER-CUDA/fft3d_cuda.cpp
new file mode 100644
index 0000000000..bb1278bb75
--- /dev/null
+++ b/src/USER-CUDA/fft3d_cuda.cpp
@@ -0,0 +1,608 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under 
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing authors: Jim Shepherd (GA Tech) added SGI SCSL support
+------------------------------------------------------------------------- */
+
+#include "mpi.h"
+#include <cstdio>
+#include <cstdlib>
+#include <cmath>
+#include "fft3d_cuda.h"
+#include "fft3d_cuda_cu.h"
+#include "remap.h"
+#include <ctime>
+#include "cuda_wrapper_cu.h"
+
+// Fully parenthesized so the macros stay correct inside larger expressions
+// such as 2*MAX(a,b) or MIN(a,b)+1.  Arguments may be evaluated twice.
+#define MIN(A,B) (((A) < (B)) ? (A) : (B))
+#define MAX(A,B) (((A) > (B)) ? (A) : (B))
+
+/* ----------------------------------------------------------------------
+   Data layout for 3d FFTs:
+
+   data set of Nfast x Nmid x Nslow elements is owned by P procs
+   on input, each proc owns a subsection of the elements
+   on output, each proc will own a (possibly different) subsection
+   my subsection must not overlap with any other proc's subsection,
+     i.e. the union of all proc's input (or output) subsections must
+     exactly tile the global Nfast x Nmid x Nslow data set
+   when called from C, all subsection indices are 
+     C-style from 0 to N-1 where N = Nfast or Nmid or Nslow
+   when called from F77, all subsection indices are 
+     F77-style from 1 to N where N = Nfast or Nmid or Nslow
+   a proc can own 0 elements on input or output
+     by specifying hi index < lo index
+   on both input and output, data is stored contiguously on a processor
+     with a fast-varying, mid-varying, and slow-varying index
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Perform 3d FFT 
+
+   Arguments:
+   in           starting address of input data on this proc
+   out          starting address of where output data for this proc
+                  will be placed (can be same as in)
+   flag         1 for forward FFT, -1 for inverse FFT
+   plan         plan returned by previous call to fft_3d_create_plan
+------------------------------------------------------------------------- */
+
+void fft_3d_cuda(FFT_DATA *in, FFT_DATA *out, int flag, struct fft_plan_3d *plan)
+{
+#ifdef FFT_CUFFT
+  plan->iterate++;  // counts the calls made with this plan
+  timespec starttime,starttime2;
+  timespec endtime,endtime2;
+	
+  int i,total,length,offset,num;
+  double norm;
+  FFT_DATA *data,*copy;
+  // system specific constants 
+  // NOTE(review): of the locals above only starttime, data and copy are touched below;
+  // copy is assigned but never read, and starttime is only written by clock_gettime
+  // pre-remap to prepare for 1st FFTs if needed
+  // copy = loc for remap result 
+  int nprocs=plan->nprocs;
+if(nprocs>1)
+{  // host-side staging is only done for multi-proc runs
+  if(plan->init)
+  clock_gettime(CLOCK_REALTIME,&starttime);
+  if (plan->pre_plan) {
+    if (plan->pre_target == 0) copy = out;
+    else copy = plan->copy;
+    if(plan->init) remap_3d((double *) in, (double *) out, (double *) plan->scratch,plan->pre_plan);  // remap result lands in out
+    data = out;
+  }
+  else
+    data = in;
+}
+  cufftResult retvalc;
+  if(plan->init)
+  {
+	if(nprocs>1)
+	{  // upload: half of cudatasize when FFT_FLOAT is double-sized, all of it when float-sized
+      if(sizeof(FFT_FLOAT)==sizeof(double))cudaMemcpy((void*) (plan->cudata2), (void*) data, plan->cudatasize/2,cudaMemcpyHostToDevice);
+      if(sizeof(FFT_FLOAT)==sizeof(float)) cudaMemcpy((void*) (plan->cudata2), (void*) data, plan->cudatasize,cudaMemcpyHostToDevice);
+      initfftdata((double*)plan->cudata2,(FFT_FLOAT*)plan->cudata,plan->nfast,plan->nmid,plan->nslow);  // presumably expands the uploaded values from cudata2 into cudata -- confirm in fft3d_cuda_cu.h
+    }
+  }
+    if (flag == -1)  // flag == -1 runs CUFFT_FORWARD, every other value CUFFT_INVERSE
+    {
+      retvalc=cufft(plan->plan_3d, plan->cudata, plan->cudata2,CUFFT_FORWARD);
+    }
+    else
+    {
+      retvalc=cufft(plan->plan_3d, plan->cudata, plan->cudata2,CUFFT_INVERSE);
+    }
+    if(retvalc!=CUFFT_SUCCESS) {printf("ErrorCUFFT: %i\n",retvalc);exit(EXIT_FAILURE);}  // a failed transform ends the program
+    // the transform output is in plan->cudata2; nothing is copied back to the host here
+    FFTsyncthreads();
+#endif
+}
+/* ----------------------------------------------------------------------
+   Create plan for performing a 3d FFT 
+
+   Arguments:
+   comm                 MPI communicator for the P procs which own the data
+   nfast,nmid,nslow     size of global 3d matrix
+   in_ilo,in_ihi        input bounds of data I own in fast index
+   in_jlo,in_jhi        input bounds of data I own in mid index
+   in_klo,in_khi        input bounds of data I own in slow index
+   out_ilo,out_ihi      output bounds of data I own in fast index
+   out_jlo,out_jhi      output bounds of data I own in mid index
+   out_klo,out_khi      output bounds of data I own in slow index
+   scaled               0 = no scaling of result, 1 = scaling
+   permute              permutation in storage order of indices on output
+                          0 = no permutation
+			  1 = permute once = mid->fast, slow->mid, fast->slow
+			  2 = permute twice = slow->fast, fast->mid, mid->slow
+   nbuf                 returns size of internal storage buffers used by FFT
+------------------------------------------------------------------------- */
+
+struct fft_plan_3d *fft_3d_create_plan_cuda(
+       MPI_Comm comm, int nfast, int nmid, int nslow,
+       int in_ilo, int in_ihi, int in_jlo, int in_jhi,
+       int in_klo, int in_khi,
+       int out_ilo, int out_ihi, int out_jlo, int out_jhi,
+       int out_klo, int out_khi,
+       int scaled, int permute, int *nbuf,bool ainit)
+{
+#ifdef FFT_CUFFT
+  struct fft_plan_3d *plan;
+  int me,nprocs;
+  int i,num,flag,remapflag,fftflag;
+  int first_ilo,first_ihi,first_jlo,first_jhi,first_klo,first_khi;
+  int second_ilo,second_ihi,second_jlo,second_jhi,second_klo,second_khi;
+  int third_ilo,third_ihi,third_jlo,third_jhi,third_klo,third_khi;
+  int out_size,first_size,second_size,third_size,copy_size,scratch_size;
+  int np1,np2,ip1,ip2;
+  int list[50];
+
+  // system specific variables 
+
+  // query MPI info 
+
+  MPI_Comm_rank(comm,&me);
+  MPI_Comm_size(comm,&nprocs);
+
+  // compute division of procs in 2 dimensions not on-processor 
+  bifactor_cuda(nprocs,&np1,&np2);
+  ip1 = me % np1;
+  ip2 = me/np1;
+
+  // in case of CUDA FFT every proc does the full FFT in order to avoid data transfers (the problem is otherwise heavily bandwidth limited)
+
+  int ip1out = ip1;
+  int ip2out = ip2;
+  int np1out = np1;
+  int np2out = np2;
+  
+  ip1 = 0;
+  ip2 = 0;
+  np1 = 1;
+  np2 = 1;
+
+  // allocate memory for plan data struct 
+
+  plan = (struct fft_plan_3d *) malloc(sizeof(struct fft_plan_3d));
+  if (plan == NULL) return NULL;
+  plan->init=ainit;  // gates the pre-remap and the host-to-device upload in fft_3d_cuda
+
+  // remap from initial distribution to layout needed for 1st set of 1d FFTs
+  // not needed if all procs own entire fast axis initially
+  // first indices = distribution after 1st set of FFTs 
+
+  if (in_ilo == 0 && in_ihi == nfast-1)
+    flag = 0;
+  else
+    flag = 1;
+
+  if(nprocs>1)flag=1;  // multi-proc runs always get a pre-remap plan
+
+  MPI_Allreduce(&flag,&remapflag,1,MPI_INT,MPI_MAX,comm);
+
+  if (remapflag == 0) {
+    first_ilo = in_ilo;
+    first_ihi = in_ihi;
+    first_jlo = in_jlo;
+    first_jhi = in_jhi;
+    first_klo = in_klo;
+    first_khi = in_khi;
+    plan->pre_plan = NULL;
+  }
+  else {
+    first_ilo = 0;
+    first_ihi = nfast - 1;
+    first_jlo = ip1*nmid/np1;
+    first_jhi = (ip1+1)*nmid/np1 - 1;
+    first_klo = ip2*nslow/np2;
+    first_khi = (ip2+1)*nslow/np2 - 1;
+    int members=2;
+    if(plan->init) members=1;
+    plan->pre_plan =
+      remap_3d_create_plan(comm,in_ilo,in_ihi,in_jlo,in_jhi,in_klo,in_khi,
+			   first_ilo,first_ihi,first_jlo,first_jhi,
+			   first_klo,first_khi,
+			   members,0,0,2);
+    if (plan->pre_plan == NULL) return NULL;
+  }
+
+  // 1d FFTs along fast axis 
+
+  plan->length1 = nfast;
+  plan->total1 = nfast * nmid * nslow;
+
+  // remap from 1st to 2nd FFT
+  // choose which axis is split over np1 vs np2 to minimize communication
+  // second indices = distribution after 2nd set of FFTs 
+
+  second_ilo = ip1*nfast/np1;
+  second_ihi = (ip1+1)*nfast/np1 - 1;
+  second_jlo = 0;
+  second_jhi = nmid - 1;
+  second_klo = ip2*nslow/np2;
+  second_khi = (ip2+1)*nslow/np2 - 1;
+  plan->mid1_plan =
+      remap_3d_create_plan(comm,
+			   first_ilo,first_ihi,first_jlo,first_jhi,
+			   first_klo,first_khi,
+			   second_ilo,second_ihi,second_jlo,second_jhi,
+			   second_klo,second_khi,
+			   2,1,0,2);
+  if (plan->mid1_plan == NULL) return NULL;
+
+  // 1d FFTs along mid axis 
+
+  plan->length2 = nmid;
+  plan->total2 = nfast * nmid * nslow;
+
+  // remap from 2nd to 3rd FFT
+  // if final distribution is permute=2 with all procs owning entire slow axis
+  //   then this remapping goes directly to final distribution
+  //  third indices = distribution after 3rd set of FFTs 
+
+  flag=1;
+
+  MPI_Allreduce(&flag,&remapflag,1,MPI_INT,MPI_MAX,comm);
+
+  if (remapflag == 0) {
+    third_ilo = out_ilo;
+    third_ihi = out_ihi;
+    third_jlo = out_jlo;
+    third_jhi = out_jhi;
+    third_klo = out_klo;
+    third_khi = out_khi;
+  }
+  else {
+    third_ilo = ip1*nfast/np1;
+    third_ihi = (ip1+1)*nfast/np1 - 1;
+    third_jlo = ip2*nmid/np2;
+    third_jhi = (ip2+1)*nmid/np2 - 1;
+    third_klo = 0;
+    third_khi = nslow - 1;
+  }
+  
+  plan->mid2_plan =
+    remap_3d_create_plan(comm,
+			 second_jlo,second_jhi,second_klo,second_khi,
+			 second_ilo,second_ihi,
+			 third_jlo,third_jhi,third_klo,third_khi,
+			 third_ilo,third_ihi,
+			 2,1,0,2);
+  if (plan->mid2_plan == NULL) return NULL;
+
+  // 1d FFTs along slow axis 
+
+  plan->length3 = nslow;
+  plan->total3 = nfast * nmid * nslow;
+
+  // remap from 3rd FFT to final distribution
+  //  not needed if permute = 2 and third indices = out indices on all procs 
+
+  flag=1;
+
+  MPI_Allreduce(&flag,&remapflag,1,MPI_INT,MPI_MAX,comm);
+
+  if (remapflag == 0)
+    plan->post_plan = NULL;
+  else {
+    plan->post_plan =
+      remap_3d_create_plan(comm,
+			   third_klo,third_khi,third_ilo,third_ihi,
+			   third_jlo,third_jhi,
+			   out_klo,out_khi,out_ilo,out_ihi,
+			   out_jlo,out_jhi,
+			   2,(permute+1)%3,0,2);
+    if (plan->post_plan == NULL) return NULL;
+  }
+
+  // configure plan memory pointers and allocate work space
+  // out_size = amount of memory given to FFT by user
+  // first/second/third_size = amount of memory needed after pre,mid1,mid2 remaps
+  // copy_size = amount needed internally for extra copy of data
+  // scratch_size = amount needed internally for remap scratch space
+  // for each remap:
+  //   out space used for result if big enough, else require copy buffer
+  //   accumulate largest required remap scratch space 
+
+  out_size = (out_ihi-out_ilo+1) * (out_jhi-out_jlo+1) * (out_khi-out_klo+1);
+  first_size = (first_ihi-first_ilo+1) * (first_jhi-first_jlo+1) * 
+    (first_khi-first_klo+1);
+  second_size = (second_ihi-second_ilo+1) * (second_jhi-second_jlo+1) * 
+    (second_khi-second_klo+1);
+  third_size = (third_ihi-third_ilo+1) * (third_jhi-third_jlo+1) * 
+    (third_khi-third_klo+1);
+
+  plan->ihi_out=out_ihi;
+  plan->ilo_out=out_ilo;
+  plan->jhi_out=out_jhi;
+  plan->jlo_out=out_jlo;
+  plan->khi_out=out_khi;
+  plan->klo_out=out_klo;
+
+  copy_size = 0;
+  scratch_size = 0;
+
+  if (plan->pre_plan) {
+    if (first_size <= out_size)
+      plan->pre_target = 0;
+    else {
+      plan->pre_target = 1;
+      copy_size = MAX(copy_size,first_size);
+    }
+    scratch_size = MAX(scratch_size,first_size);
+  }
+
+  if (plan->mid1_plan) {
+    if (second_size <= out_size)
+      plan->mid1_target = 0;
+    else {
+      plan->mid1_target = 1;
+      copy_size = MAX(copy_size,second_size);
+    }
+    scratch_size = MAX(scratch_size,second_size);
+  }
+
+  if (plan->mid2_plan) {
+    if (third_size <= out_size)
+      plan->mid2_target = 0;
+    else {
+      plan->mid2_target = 1;
+      copy_size = MAX(copy_size,third_size);
+    }
+    scratch_size = MAX(scratch_size,third_size);
+  }
+
+  if (plan->post_plan)
+    scratch_size = MAX(scratch_size,out_size);
+
+  *nbuf = copy_size + scratch_size;
+
+  if (copy_size) {
+    plan->copy = (FFT_DATA *) malloc(copy_size*sizeof(FFT_DATA));
+    if (plan->copy == NULL) return NULL;
+  }
+  else plan->copy = NULL;
+
+  if (scratch_size) {
+    plan->scratch = (FFT_DATA *) malloc(scratch_size*sizeof(FFT_DATA));
+    if (plan->scratch == NULL) return NULL;
+  }
+  else plan->scratch = NULL;
+
+  // system specific pre-computation of 1d FFT coeffs 
+  // and scaling normalization 
+
+  cufftResult retvalc;
+  int nfft = (in_ihi-in_ilo+1) * (in_jhi-in_jlo+1) *
+    (in_khi-in_klo+1);
+  int nfft_brick = (out_ihi-out_ilo+1) * (out_jhi-out_jlo+1) *
+    (out_khi-out_klo+1);
+    
+  int nfft_both = MAX(nfft,nfft_brick);
+  nfft_both=nfast*nmid*nslow;  // overrides the line above: the device buffer covers the full grid
+
+  plan->cudatasize=nfft_both*sizeof(FFT_DATA);
+
+  //retvalc=cufftPlan1d(&(plan->plan_fast), nfast, CUFFT_PLAN,plan->total1/nfast);
+  //if(retvalc!=CUFFT_SUCCESS) printf("ErrorCUFFT1: %i\n",retvalc);
+  plan->nfast=nfast;
+
+  //retvalc=cufftPlan1d(&(plan->plan_mid), nmid, CUFFT_PLAN,plan->total2/nmid);
+  //if(retvalc!=CUFFT_SUCCESS) printf("ErrorCUFFT2: %i\n",retvalc);
+  plan->nmid=nmid;
+
+  //retvalc=cufftPlan1d(&(plan->plan_slow), nslow, CUFFT_PLAN,plan->total3/nslow);
+  //if(retvalc!=CUFFT_SUCCESS) printf("ErrorCUFFT3: %i\n",retvalc);
+  plan->nslow=nslow;
+
+  retvalc=cufftPlan3d(&(plan->plan_3d), nslow,nmid,nfast, CUFFT_PLAN);
+  if(retvalc!=CUFFT_SUCCESS) printf("ErrorCUFFT3: %i\n",retvalc);
+
+  plan->nprocs=nprocs;
+  plan->me=me;
+  if (scaled == 0)
+    plan->scaled = 0;
+  else {
+    plan->scaled = 1;
+    plan->norm = 1.0/(nfast*nmid*nslow);
+    plan->normnum = (out_ihi-out_ilo+1) * (out_jhi-out_jlo+1) *
+      (out_khi-out_klo+1);
+  }
+
+  plan->coretime=0;
+  plan->iterate=0;
+  plan->ffttime=0;
+  return plan;
+#else
+  printf("ERROR: Trying to use cuda fft without FFT_CUFFT set. Recompile with make option 'cufft=1'.\n");
+  return NULL;
+#endif
+}
+
+/* ----------------------------------------------------------------------
+   Destroy a 3d fft plan 
+------------------------------------------------------------------------- */
+
+void fft_3d_destroy_plan_cuda(struct fft_plan_3d *plan)
+{
+#ifdef FFT_CUFFT
+  // a NULL plan (e.g. from a failed fft_3d_create_plan_cuda) is ignored
+  if (plan == NULL) return;
+  // remap plans exist only for the stages that needed one
+  if (plan->pre_plan) remap_3d_destroy_plan(plan->pre_plan);
+  if (plan->mid1_plan) remap_3d_destroy_plan(plan->mid1_plan);
+  if (plan->mid2_plan) remap_3d_destroy_plan(plan->mid2_plan);
+  if (plan->post_plan) remap_3d_destroy_plan(plan->post_plan);
+
+  // host work buffers; free() accepts NULL
+  free(plan->copy);
+  free(plan->scratch);
+  // plan_3d is the only cuFFT handle created; cudata/cudata2 are not released here
+  cufftDestroy(plan->plan_3d);
+  free(plan);
+#endif
+}
+
+/* ----------------------------------------------------------------------
+   recursively divide n into small factors, return them in list
+------------------------------------------------------------------------- */
+
+void factor_cuda(int n, int *num, int *list)  // *num is only incremented: the caller must zero it first
+{
+  if (n <= 1) {  // finished; also stops n <= 0 (n == 0 used to recurse forever since 0 % 2 == 0)
+    return;
+  }
+  else if (n % 2 == 0) {  // otherwise peel off the smallest of 2,3,5,7,11,13 that divides n
+    *list = 2;
+    (*num)++;
+    factor_cuda(n/2,num,list+1);  // next factor goes into the following slot of list
+  }
+  else if (n % 3 == 0) {
+    *list = 3;
+    (*num)++;
+    factor_cuda(n/3,num,list+1);
+  }
+  else if (n % 5 == 0) {
+    *list = 5;
+    (*num)++;
+    factor_cuda(n/5,num,list+1);
+  }
+  else if (n % 7 == 0) {
+    *list = 7;
+    (*num)++;
+    factor_cuda(n/7,num,list+1);
+  }
+  else if (n % 11 == 0) {
+    *list = 11;
+    (*num)++;
+    factor_cuda(n/11,num,list+1);
+  }
+  else if (n % 13 == 0) {
+    *list = 13;
+    (*num)++;
+    factor_cuda(n/13,num,list+1);
+  }
+  else {  // no small factor left: the remainder itself becomes the last entry
+    *list = n;
+    (*num)++;
+    return;
+  }
+}
+
+/* ----------------------------------------------------------------------
+   divide n into 2 factors of as equal size as possible 
+------------------------------------------------------------------------- */
+
+void bifactor_cuda(int n, int *factor1, int *factor2)
+{
+  // Start at floor(sqrt(n)) and step down to the first value that divides n;
+  // that value and its cofactor form the most balanced factor pair, with
+  // *factor1 <= *factor2.
+  int lo = static_cast<int> (sqrt((double) n));
+
+  while (lo > 0 && n % lo != 0) lo--;
+
+  // no divisor found: the outputs are left unmodified
+  if (lo <= 0) return;
+
+  *factor1 = lo;
+  *factor2 = n/lo;
+}
+
+/* ----------------------------------------------------------------------
+   perform just the 1d FFTs needed by a 3d FFT, no data movement
+   used for timing purposes
+
+   Arguments:
+   in           starting address of input data on this proc, all set to 0.0
+   nsize        size of in
+   flag         1 for forward FFT, -1 for inverse FFT
+   plan         plan returned by previous call to fft_3d_create_plan
+------------------------------------------------------------------------- */
+
+void fft_1d_only_cuda(FFT_DATA *data, int nsize, int flag, struct fft_plan_3d *plan)
+{
+#ifdef FFT_CUFFT
+  // Timing helper: upload data, run one in-place 3d cuFFT on the plan's
+  // device buffer, download the result again.
+  //   data   host buffer, read before and overwritten after the transform
+  //   nsize  number of FFT_DATA elements available in data
+  //   flag   -1 runs CUFFT_FORWARD, any other value CUFFT_INVERSE
+  //          (the same mapping fft_3d_cuda uses)
+  //   plan   supplies plan_3d, the cudata device buffer and cudatasize
+  // CUDA/cuFFT failures are reported on stdout and end the call early.
+
+  if (nsize <= 0) return;
+
+  // never touch more host memory than the caller says data holds
+  size_t nbytes = plan->cudatasize;
+  const size_t navail = static_cast<size_t> (nsize) * sizeof(FFT_DATA);
+  if (navail < nbytes) nbytes = navail;
+
+  // the device address is the value of plan->cudata; &plan->cudata would be
+  // the host address of the pointer member inside the plan struct
+  cudaError_t cuerr = cudaMemcpy((void*) plan->cudata, (void*) data, nbytes,
+                                 cudaMemcpyHostToDevice);
+  if (cuerr != cudaSuccess) {
+    printf("ErrorCUDA: %s\n",cudaGetErrorString(cuerr));
+    return;
+  }
+
+  // the transform direction follows flag
+  cufftResult retvalc;
+  if (flag == -1)
+    retvalc = cufft(plan->plan_3d, plan->cudata, plan->cudata,CUFFT_FORWARD);
+  else
+    retvalc = cufft(plan->plan_3d, plan->cudata, plan->cudata,CUFFT_INVERSE);
+  if (retvalc != CUFFT_SUCCESS) {
+    printf("ErrorCUFFT: %i\n",retvalc);
+    return;
+  }
+
+  // copy the transformed values back into the host buffer
+  cuerr = cudaMemcpy((void*) data, (void*) plan->cudata, nbytes,
+                     cudaMemcpyDeviceToHost);
+  if (cuerr != cudaSuccess)
+    printf("ErrorCUDA: %s\n",cudaGetErrorString(cuerr));
+
+  // no scaling is applied here
+
+#endif
+}
diff --git a/src/USER-CUDA/fft3d_cuda.h b/src/USER-CUDA/fft3d_cuda.h
new file mode 100644
index 0000000000..648d7d6584
--- /dev/null
+++ b/src/USER-CUDA/fft3d_cuda.h
@@ -0,0 +1,148 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under 
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+// User-settable FFT precision 
+
+// FFT_PRECISION = 1 is single-precision complex (4-byte real, 4-byte imag) 
+// FFT_PRECISION = 2 is double-precision complex (8-byte real, 8-byte imag) 
+#include "cuda_precision.h"
+//#define FFT_PRECISION 2
+
+// ------------------------------------------------------------------------- 
+
+// Data types for single-precision complex 
+
+#if FFT_PRECISION_CU == 1
+
+#ifdef FFT_CUFFT
+#include "cuda_runtime.h"
+#include "cufft.h"
+typedef struct {
+  float re;
+  float im;
+} FFT_DATA;
+typedef cufftComplex cufftData;
+typedef cufftReal cufftDataInit;
+#define cufft cufftExecC2C
+#define cufftinit cufftExecR2C
+#define CUFFT_PLAN CUFFT_C2C
+#define CUFFT_PLAN_INIT CUFFT_R2C
+#else
+typedef struct {
+  float re;
+  float im;
+} FFT_DATA;
+#endif
+
+#endif
+
+// ------------------------------------------------------------------------- 
+
+// Data types for double-precision complex 
+
+#if FFT_PRECISION_CU == 2
+
+// FFT_DATA is used by the prototypes below, so define it with or without cuFFT
+typedef struct {
+  double re;
+  double im;
+} FFT_DATA;
+#ifdef FFT_CUFFT
+#include "cuda_runtime.h"
+#include "cufft.h"
+typedef cufftDoubleComplex cufftData;
+typedef cufftDoubleReal cufftDataInit;
+#define cufft cufftExecZ2Z
+#define cufftinit cufftExecD2Z
+#define CUFFT_PLAN CUFFT_Z2Z
+#define CUFFT_PLAN_INIT CUFFT_D2Z
+#endif
+
+#endif
+
+// ------------------------------------------------------------------------- 
+
+// details of how to do a 3d FFT 
+
+struct fft_plan_3d {
+  struct remap_plan_3d *pre_plan;       // remap from input -> 1st FFTs 
+  struct remap_plan_3d *mid1_plan;      // remap from 1st -> 2nd FFTs 
+  struct remap_plan_3d *mid2_plan;      // remap from 2nd -> 3rd FFTs 
+  struct remap_plan_3d *post_plan;      // remap from 3rd FFTs -> output 
+  FFT_DATA *copy;                   // memory for remap results (if needed) 
+  FFT_DATA *scratch;                // scratch space for remaps 
+  int total1,total2,total3;         // # of 1st,2nd,3rd FFTs (times length) 
+  int length1,length2,length3;      // length of 1st,2nd,3rd FFTs 
+  int pre_target;                   // where to put remap results 
+  int mid1_target,mid2_target;      // 0 = result fits the user's out buffer, 1 = copy buffer needed
+  int scaled;                       // whether to scale FFT results 
+  int normnum;                      // # of values to rescale 
+  double norm;                      // normalization factor for rescaling 
+
+  double coretime;                  // zeroed at plan creation
+  double ffttime;                   // zeroed at plan creation
+  int iterate;                      // incremented by every fft_3d_cuda call
+                                    // system specific 1d FFT info 
+
+#ifdef FFT_CUFFT
+  //CUdeviceptr cudata;
+  cufftData* cudata;                // device buffer: FFT input in fft_3d_cuda
+  cufftData* cudata2;               // device buffer: upload target and FFT output in fft_3d_cuda
+  unsigned int cudatasize;          // nfast*nmid*nslow*sizeof(FFT_DATA), in bytes
+  cufftHandle plan_fast;            // not created by fft_3d_create_plan_cuda (call is commented out)
+  cufftHandle plan_mid;             // not created by fft_3d_create_plan_cuda (call is commented out)
+  cufftHandle plan_slow;            // not created by fft_3d_create_plan_cuda (call is commented out)
+  cufftHandle plan_3d;              // created with cufftPlan3d(nslow,nmid,nfast)
+  int nfast;                        // global grid size along the fast axis
+  int nmid;                         // global grid size along the mid axis
+  int nslow;                        // global grid size along the slow axis
+  int ihi_out,ilo_out,jhi_out,jlo_out,khi_out,klo_out;  // copies of the output bounds passed at creation
+  int me,nprocs;                    // rank and size of the communicator used at creation
+#endif
+  int init;                         // from the ainit argument; gates remap + upload in fft_3d_cuda
+};
+
+// function prototypes 
+
+void fft_3d_destroy_plan_cuda(struct fft_plan_3d *);
+void factor_cuda(int, int *, int *);
+void bifactor_cuda(int, int *, int *);
+void fft_1d_only_cuda(FFT_DATA *, int, int, struct fft_plan_3d *);
+void fft_3d_cudaA(FFT_DATA *, FFT_DATA *, int, struct fft_plan_3d *);
+void fft_3d_cuda(FFT_DATA *, FFT_DATA *, int, struct fft_plan_3d *);
+struct fft_plan_3d *fft_3d_create_plan_cuda(MPI_Comm, int, int, int,
+  int, int, int, int, int, int, int, int, int, int, int, int,
+  int, int, int *,bool init);
diff --git a/src/USER-CUDA/fft3d_wrap_cuda.cpp b/src/USER-CUDA/fft3d_wrap_cuda.cpp
new file mode 100644
index 0000000000..5fa45bd85c
--- /dev/null
+++ b/src/USER-CUDA/fft3d_wrap_cuda.cpp
@@ -0,0 +1,111 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under 
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#include "mpi.h"
+#include "fft3d_wrap_cuda.h"
+#include "error.h"
+
+using namespace LAMMPS_NS;
+
+/* ---------------------------------------------------------------------- */
+// Creates the 3d FFT plan owned by this wrapper.  With FFT_CUFFT defined the
+// CUDA plan routine is called (the only variant that receives 'init');
+// otherwise the regular fft3d routine is called.  A NULL plan is reported
+// through error->one().
+FFT3dCuda::FFT3dCuda(LAMMPS *lmp, MPI_Comm comm, int nfast, int nmid, int nslow,
+                     int in_ilo, int in_ihi, int in_jlo, int in_jhi, int in_klo, int in_khi,
+                     int out_ilo, int out_ihi, int out_jlo, int out_jhi, int out_klo, int out_khi,
+                     int scaled, int permute, int *nbuf, bool init) : Pointers(lmp)
+{
+#ifdef FFT_CUFFT
+  plan = fft_3d_create_plan_cuda(comm, nfast, nmid, nslow,
+                                 in_ilo, in_ihi, in_jlo, in_jhi, in_klo, in_khi,
+                                 out_ilo, out_ihi, out_jlo, out_jhi, out_klo, out_khi,
+                                 scaled, permute, nbuf, init);
+#else
+  plan = fft_3d_create_plan(comm, nfast, nmid, nslow,
+                            in_ilo, in_ihi, in_jlo, in_jhi, in_klo, in_khi,
+                            out_ilo, out_ihi, out_jlo, out_jhi, out_klo, out_khi,
+                            scaled, permute, nbuf);
+#endif
+  if (!plan) error->one("Could not create 3d FFT plan");
+}
+
+/* ---------------------------------------------------------------------- */
+// Destroys the plan with the routine that matches the backend selected
+// by FFT_CUFFT at compile time.
+FFT3dCuda::~FFT3dCuda()
+{
+#ifdef FFT_CUFFT
+  fft_3d_destroy_plan_cuda(plan);
+#else
+  fft_3d_destroy_plan(plan);
+#endif
+}
+
+/* ---------------------------------------------------------------------- */
+// Hands 'in' and 'out' (cast to FFT_DATA pointers), 'flag' and the plan to
+// the 3d FFT routine of the backend selected by FFT_CUFFT.
+void FFT3dCuda::compute(double *in, double *out, int flag)
+{
+#ifdef FFT_CUFFT
+  fft_3d_cuda((FFT_DATA *) in, (FFT_DATA *) out, flag, plan);
+#else
+  fft_3d((FFT_DATA *) in, (FFT_DATA *) out, flag, plan);
+#endif
+}
+
+/* ---------------------------------------------------------------------- */
+// Hands 'in' (cast to an FFT_DATA pointer), 'nsize', 'flag' and the plan to
+// the fft_1d_only routine of the backend selected by FFT_CUFFT.
+void FFT3dCuda::timing1d(double *in, int nsize, int flag)
+{
+#ifdef FFT_CUFFT
+  fft_1d_only_cuda((FFT_DATA *) in, nsize, flag, plan);
+#else
+  fft_1d_only((FFT_DATA *) in, nsize, flag, plan);
+#endif
+}
+
+#ifdef FFT_CUFFT
+// Records the two caller-supplied buffers in the plan as cufftData
+// pointers; only the pointers are stored, nothing is copied here.
+void FFT3dCuda::set_cudata(void* cudata,void* cudata2)
+{
+  plan->cudata  = static_cast<cufftData*>(cudata);
+  plan->cudata2 = static_cast<cufftData*>(cudata2);
+}
+#endif
diff --git a/src/USER-CUDA/fft3d_wrap_cuda.h b/src/USER-CUDA/fft3d_wrap_cuda.h
new file mode 100644
index 0000000000..911057cbec
--- /dev/null
+++ b/src/USER-CUDA/fft3d_wrap_cuda.h
@@ -0,0 +1,68 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under 
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#ifndef FFT3D_WRAP_CUDA_H_
+#define FFT3D_WRAP_CUDA_H_
+
+#include "pointers.h"
+
+#ifdef FFT_CUFFT
+  #include "fft3d_cuda.h"
+#endif
+#ifndef FFT_CUFFT
+  #include "fft3d.h"
+#endif
+
+namespace LAMMPS_NS {
+
+class FFT3dCuda : protected Pointers {   // owns one fft_plan_3d; backend chosen at compile time by FFT_CUFFT
+ public:
+  FFT3dCuda(class LAMMPS *, MPI_Comm,int,int,int,int,int,int,int,int,int,   // lmp, comm, nfast, nmid, nslow, in_ i/j/k lo/hi
+	int,int,int,int,int,int,int,int,int *,bool);                           // out_ i/j/k lo/hi, scaled, permute, nbuf, init
+  ~FFT3dCuda();                             // destroys the plan
+  void compute(double *, double *, int);    // (in, out, flag): forwards to the 3d FFT routine
+  void timing1d(double *, int, int);        // (in, nsize, flag): forwards to the 1d-only routine
+
+#ifdef FFT_CUFFT
+  void set_cudata(void* cudata,void* cudata2);   // stores both pointers in the plan as cufftData*
+#endif
+ private:
+  struct fft_plan_3d *plan;                 // created in the constructor, released in the destructor
+};
+
+}
+
+#endif /*FFT3D_WRAP_CUDA_H_*/
diff --git a/src/USER-CUDA/fix_addforce_cuda.cpp b/src/USER-CUDA/fix_addforce_cuda.cpp
new file mode 100644
index 0000000000..a259068365
--- /dev/null
+++ b/src/USER-CUDA/fix_addforce_cuda.cpp
@@ -0,0 +1,190 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+
+#include <cstring>
+#include <cstdlib>
+#include "fix_addforce_cuda.h"
+#include "fix_addforce_cuda_cu.h"
+#include "atom.h"
+#include "update.h"
+#include "respa.h"
+#include "error.h"
+#include "domain.h"
+#include "cuda.h"
+#include "memory.h"
+#include "cuda_modify_flags.h"
+
+
+using namespace LAMMPS_NS;
+
+/* ---------------------------------------------------------------------- */
+// Parses "fix ID group addforce/cuda fx fy fz [region ID]".  An active Cuda
+// instance is required.  The 'region' keyword is parsed and its ID checked,
+// but a valid region is then rejected because this style does not support it.
+FixAddForceCuda::FixAddForceCuda(LAMMPS *lmp, int narg, char **arg) :
+  Fix(lmp, narg, arg)
+{
+  cuda = lmp->cuda;
+  if (!cuda)
+    error->all("You cannot use a /cuda class, without activating 'cuda' acceleration. Use no '-a' command line argument, or '-a cuda'.");
+  if (narg < 6) error->all("Illegal fix addforce/cuda command");
+
+  // this fix exposes a global scalar and a global vector of length 3
+  scalar_flag = vector_flag = 1;
+  size_vector = 3;
+  global_freq = 1;
+  extscalar = extvector = 1;
+
+  // force components to add
+  xvalue = atof(arg[3]);
+  yvalue = atof(arg[4]);
+  zvalue = atof(arg[5]);
+
+  // optional keyword/value pairs start at arg[6]
+  iregion = -1;
+  for (int iarg = 6; iarg < narg; ) {
+    if (strcmp(arg[iarg],"region") != 0) error->all("Illegal fix addforce/cuda command");
+    else {
+      if (iarg+2 > narg) error->all("Illegal fix addforce/cuda command");
+      iregion = domain->find_region(arg[iarg+1]);
+      if (iregion == -1) error->all("Fix addforce/cuda region ID does not exist");
+      iarg += 2;
+    }
+  }
+
+  if (iregion != -1) error->all("Error: fix addforce/cuda does not currently support 'region' option");
+
+  // nothing accumulated yet; the cCudaData wrapper is created in init()
+  force_flag = 0;
+  for (int k = 0; k < 4; k++) foriginal[k] = 0.0;
+  cu_foriginal = NULL;
+}
+
+/* ---------------------------------------------------------------------- */
+// Returns the callback bitmask of this fix, built by OR-ing the
+// POST_FORCE_CUDA, THERMO_ENERGY_CUDA, POST_FORCE_RESPA and
+// MIN_POST_FORCE_CUDA flags.
+int FixAddForceCuda::setmask()
+{
+  return POST_FORCE_CUDA |
+         THERMO_ENERGY_CUDA |
+         POST_FORCE_RESPA |
+         MIN_POST_FORCE_CUDA;
+}
+
+/* ---------------------------------------------------------------------- */
+// Creates the cCudaData wrapper around foriginal on the first call only, and
+// caches the number of rRESPA levels when the integrate style is "respa".
+void FixAddForceCuda::init()
+{
+  if (cu_foriginal == NULL) cu_foriginal = new cCudaData<double, F_FLOAT, x>(foriginal, 4);
+  const bool use_respa = (strcmp(update->integrate_style,"respa") == 0);
+  if (use_respa) nlevels_respa = ((Respa *) update->integrate)->nlevels;
+}
+
+/* ---------------------------------------------------------------------- */
+// Applies the fix once before a run.  "verlet" initializes the CUDA side and brackets post_force()
+// with an upload/download of cu_f; any other style takes the rRESPA path on the outermost level.
+// NOTE(review): the rRESPA path transfers in the reverse order (download, call, upload) - confirm intent.
+void FixAddForceCuda::setup(int vflag)
+{
+  MYDBG( printf("# CUDA: FixAddForceCuda::setup\n"); )
+  if (strcmp(update->integrate_style,"verlet") != 0) {
+    Respa *respa = (Respa *) update->integrate;
+    const int outer = nlevels_respa-1;
+    respa->copy_flevel_f(outer);
+    cuda->cu_f->download();
+    post_force_respa(vflag,outer,0);
+    cuda->cu_f->upload();
+    respa->copy_f_flevel(outer);
+  } else {
+    Cuda_FixAddForceCuda_Init(&cuda->shared_data);
+    cuda->cu_f->upload();
+    post_force(vflag);
+    cuda->cu_f->download();
+  }
+  MYDBG( printf("# CUDA: FixAddForceCuda::setup done\n"); )
+}
+
+/* ---------------------------------------------------------------------- */
+// Delegates to post_force() with the same vflag.
+void FixAddForceCuda::min_setup(int vflag)
+{
+  this->post_force(vflag);
+}
+
+/* ---------------------------------------------------------------------- */
+// Zeroes the device side of cu_foriginal, calls the CUDA post-force routine for this group, then downloads cu_foriginal.
+void FixAddForceCuda::post_force(int vflag)
+{
+  MYDBG( printf("# CUDA: FixAddForceCuda::postforce start\n"); )
+  force_flag = 0;                    // makes compute_scalar/compute_vector redo the MPI sum
+  cu_foriginal->memset_device(0);    // the routine below receives a zeroed device buffer
+  Cuda_FixAddForceCuda_PostForce(&cuda->shared_data, groupbit, xvalue, yvalue, zvalue, (F_FLOAT*) cu_foriginal->dev_data());
+  cu_foriginal->download();          // presumably refreshes the host array foriginal
+}
+
+/* ---------------------------------------------------------------------- */
+// Calls post_force() only when ilevel is the last rRESPA level; iloop is unused.
+void FixAddForceCuda::post_force_respa(int vflag, int ilevel, int iloop)
+{
+  if (nlevels_respa-1 == ilevel) this->post_force(vflag);
+}
+
+/* ---------------------------------------------------------------------- */
+// Delegates to post_force() with the same vflag.
+void FixAddForceCuda::min_post_force(int vflag)
+{
+  this->post_force(vflag);
+}
+
+/* ----------------------------------------------------------------------
+   global scalar: potential energy of the added force, i.e. entry 0 of
+   foriginal summed over all procs
+------------------------------------------------------------------------- */
+
+double FixAddForceCuda::compute_scalar()
+{
+  // reduce only once per post_force(), which resets force_flag
+  if (!force_flag) {
+    force_flag = 1;
+    MPI_Allreduce(foriginal,foriginal_all,4,MPI_DOUBLE,MPI_SUM,world);
+  }
+  return foriginal_all[0];
+}
+
+/* ----------------------------------------------------------------------
+   global vector: component n of the total force on the fix group before
+   the force was changed; stored behind the scalar, hence index n+1
+------------------------------------------------------------------------- */
+
+double FixAddForceCuda::compute_vector(int n)
+{
+  // reduce only once per post_force(), which resets force_flag
+  if (!force_flag) {
+    force_flag = 1;
+    MPI_Allreduce(foriginal,foriginal_all,4,MPI_DOUBLE,MPI_SUM,world);
+  }
+  return foriginal_all[n+1];
+}
diff --git a/src/USER-CUDA/fix_addforce_cuda.h b/src/USER-CUDA/fix_addforce_cuda.h
new file mode 100644
index 0000000000..38efa0528d
--- /dev/null
+++ b/src/USER-CUDA/fix_addforce_cuda.h
@@ -0,0 +1,64 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#ifdef FIX_CLASS
+
+FixStyle(addforce/cuda,FixAddForceCuda)
+
+#else
+
+#ifndef LMP_FIX_ADD_FORCE_CUDA_H
+#define LMP_FIX_ADD_FORCE_CUDA_H
+
+#include "fix.h"
+#include "cuda_data.h"
+
+namespace LAMMPS_NS {
+
+class FixAddForceCuda : public Fix {   // fix style addforce/cuda
+ public:
+  FixAddForceCuda(class LAMMPS *, int, char **);
+  ~FixAddForceCuda() { delete cu_foriginal; }  // frees the wrapper allocated in init(); still NULL (a no-op) if init() never ran
+  int setmask();
+  void init();
+  void setup(int);
+  void min_setup(int);
+  void post_force(int);
+  void post_force_respa(int, int, int);
+  void min_post_force(int);
+  double compute_scalar();       // MPI sum of foriginal[0]
+  double compute_vector(int);    // MPI sum of foriginal[n+1]
+
+ private:
+  class Cuda *cuda;                             // lmp->cuda, verified non-NULL in the constructor
+  int iregion;                                  // region index; anything but -1 is rejected in the constructor
+  double xvalue,yvalue,zvalue;                  // force components parsed from the fix command
+  double foriginal[4],foriginal_all[4];         // per-proc accumulators and their sums over all procs
+  cCudaData<double, F_FLOAT, x>* cu_foriginal;  // device-side counterpart of foriginal
+  int force_flag,nlevels_respa;                 // 1 once foriginal_all is current; cached rRESPA level count
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-CUDA/fix_aveforce_cuda.cpp b/src/USER-CUDA/fix_aveforce_cuda.cpp
new file mode 100644
index 0000000000..8d2b4ddd0f
--- /dev/null
+++ b/src/USER-CUDA/fix_aveforce_cuda.cpp
@@ -0,0 +1,229 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+
+#include "mpi.h"
+#include <cstring>
+#include <cstdlib>
+#include "fix_aveforce_cuda.h"
+#include "fix_aveforce_cuda_cu.h"
+#include "atom.h"
+#include "update.h"
+#include "respa.h"
+#include "error.h"
+#include "domain.h"
+#include "cuda.h"
+#include "cuda_modify_flags.h"
+
+using namespace LAMMPS_NS;
+
+/* ---------------------------------------------------------------------- */
+// Parses "fix ID group aveforce/cuda fx fy fz [region ID]"; a component given as
+// NULL leaves that dimension untouched.  An active Cuda instance is required and
+// the 'region' keyword, although parsed, is rejected as unsupported.
+FixAveForceCuda::FixAveForceCuda(LAMMPS *lmp, int narg, char **arg) :
+  Fix(lmp, narg, arg)
+{
+  cuda = lmp->cuda;
+  if(cuda == NULL)
+    error->all("You cannot use a /cuda class, without activating 'cuda' acceleration. Use no '-a' command line argument, or '-a cuda'.");
+
+  // fewer than 6 args is malformed; extras reach the keyword loop below (unreachable with the old 'narg != 6' test)
+  if (narg < 6) error->all("Illegal fix aveforce command");
+
+  vector_flag = 1;
+  size_vector = 3;
+  global_freq = 1;
+  extvector = 1;
+
+  // offsets default to 0.0: post_force() reads all three even when a component is NULL
+  xflag = yflag = zflag = 1;
+  xvalue = yvalue = zvalue = 0.0;
+  if (strcmp(arg[3],"NULL") == 0) xflag = 0;
+  else xvalue = atof(arg[3]);
+  if (strcmp(arg[4],"NULL") == 0) yflag = 0;
+  else yvalue = atof(arg[4]);
+  if (strcmp(arg[5],"NULL") == 0) zflag = 0;
+  else zvalue = atof(arg[5]);
+
+  // optional args
+  iregion = -1;
+  int iarg = 6;
+  while (iarg < narg) {
+    if (strcmp(arg[iarg],"region") == 0) {
+      if (iarg+2 > narg) error->all("Illegal fix aveforce command");
+      iregion = domain->find_region(arg[iarg+1]);
+      if (iregion == -1) error->all("Fix aveforce region ID does not exist");
+      iarg += 2;
+    } else error->all("Illegal fix aveforce command");
+  }
+  if(iregion!=-1) error->all("Error: fix aveforce/cuda does not currently support 'region' option");
+
+  foriginal_all[0] = foriginal_all[1] = foriginal_all[2] = foriginal_all[3] = 0.0;
+  foriginal[0] = foriginal[1] = foriginal[2] = foriginal[3] = 0.0;
+  cu_foriginal = NULL;
+}
+
+/* ---------------------------------------------------------------------- */
+// Returns the callback bitmask of this fix, built by OR-ing the
+// POST_FORCE_CUDA, POST_FORCE_RESPA and MIN_POST_FORCE_CUDA
+// flags.
+int FixAveForceCuda::setmask()
+{
+  return POST_FORCE_CUDA |
+         POST_FORCE_RESPA |
+         MIN_POST_FORCE_CUDA;
+}
+
+/* ---------------------------------------------------------------------- */
+// Creates the cCudaData wrapper around foriginal on the first call only, and
+// caches the number of rRESPA levels when the integrate style is "respa".
+// The group size is not computed here: post_force() obtains it from entry 3
+// of the summed foriginal_all, so the former unused 'mask' and 'nlocal'
+// locals (and the stale "ncount" comment) were dropped.
+void FixAveForceCuda::init()
+{
+  if(not cu_foriginal)
+    cu_foriginal = new cCudaData<double, F_FLOAT, x> (foriginal,4);
+
+  if (strcmp(update->integrate_style,"respa") == 0)
+    nlevels_respa = ((Respa *) update->integrate)->nlevels;
+}
+
+/* ---------------------------------------------------------------------- */
+// Applies the fix once before a run: "verlet" runs post_force() between an upload and a download of cu_f.
+// NOTE(review): on inner levels post_force_respa() downloads cu_f again after copy_flevel_f() - confirm intent.
+void FixAveForceCuda::setup(int vflag)
+{
+  if (strcmp(update->integrate_style,"verlet") != 0) {
+    // any other style: visit every rRESPA level in ascending order
+    Respa *respa = (Respa *) update->integrate;
+    cuda->cu_f->download();
+    for (int level = 0; level < nlevels_respa; ++level) {
+      respa->copy_flevel_f(level);
+      post_force_respa(vflag,level,0);
+      respa->copy_f_flevel(level);
+    }
+    cuda->cu_f->upload();
+    return;
+  }
+
+  Cuda_FixAveForceCuda_Init(&cuda->shared_data);
+  cuda->cu_f->upload();
+  post_force(vflag);
+  cuda->cu_f->download();
+}
+
+/* ---------------------------------------------------------------------- */
+// Delegates to post_force() with the same vflag.
+void FixAveForceCuda::min_setup(int vflag)
+{
+  this->post_force(vflag);
+}
+
+/* ---------------------------------------------------------------------- */
+// Replaces the force on every group atom by the group average plus the
+// requested offset.  Returns early, leaving forces untouched, when the
+// summed atom count is zero.
+void FixAveForceCuda::post_force(int vflag)
+{
+  // step 1: sum forces on participating atoms; the CUDA routine receives a
+  // zeroed device buffer and cu_foriginal is downloaded afterwards
+  cu_foriginal->memset_device(0);
+  F_FLOAT *dev_sums = (F_FLOAT*) cu_foriginal->dev_data();
+  Cuda_FixAveForceCuda_PostForce_FOrg(&cuda->shared_data, groupbit, dev_sums);
+  cu_foriginal->download();
+
+  // step 2: sums over all procs; entry 3 is used as the atom count
+  MPI_Allreduce(foriginal,foriginal_all,4,MPI_DOUBLE,MPI_SUM,world);
+  const int ncount = static_cast<int> (foriginal_all[3]);
+  if (!ncount) return;
+
+  // step 3: average force per atom plus the requested amount
+  const double fx = foriginal_all[0]/ncount + xvalue;
+  const double fy = foriginal_all[1]/ncount + yvalue;
+  const double fz = foriginal_all[2]/ncount + zvalue;
+
+  // step 4: set force of all participating atoms, only for active dimensions
+  Cuda_FixAveForceCuda_PostForce_Set(&cuda->shared_data, groupbit, xflag, yflag, zflag, fx, fy, fz);
+}
+
+/* ---------------------------------------------------------------------- */
+// rRESPA variant.  The outermost level goes through post_force() (average
+// plus extra force).  Inner levels only average: forces and masks are
+// downloaded, averaged on the host over all procs, written back for the
+// active dimensions and uploaded again.
+void FixAveForceCuda::post_force_respa(int vflag, int ilevel, int iloop)
+{
+  // ave + extra force on outermost level
+  if (ilevel == nlevels_respa-1) {
+    post_force(vflag);
+    return;
+  }
+
+  // just ave on inner levels, computed on the host
+  cuda->cu_f->download();
+  cuda->cu_mask->download();
+  double **f = atom->f;
+  int *mask = atom->mask;
+  const int nlocal = atom->nlocal;
+
+  // local accumulator; it shadows the member foriginal inside this branch
+  double foriginal[4] = {0.0, 0.0, 0.0, 0.0};
+  for (int i = 0; i < nlocal; i++) {
+    if (!(mask[i] & groupbit)) continue;
+    for (int d = 0; d < 3; d++) foriginal[d] += f[i][d];
+    foriginal[3] += 1;
+  }
+
+  MPI_Allreduce(foriginal,foriginal_all,4,MPI_DOUBLE,MPI_SUM,world);
+  const int ncount = static_cast<int> (foriginal_all[3]);
+  if (ncount == 0) return;
+
+  double fave[3];
+  for (int d = 0; d < 3; d++) fave[d] = foriginal_all[d]/ncount;
+  const int active[3] = {xflag, yflag, zflag};
+
+  for (int i = 0; i < nlocal; i++) {
+    if (!(mask[i] & groupbit)) continue;
+    for (int d = 0; d < 3; d++)
+      if (active[d]) f[i][d] = fave[d];
+  }
+  cuda->cu_f->upload();
+}
+
+/* ---------------------------------------------------------------------- */
+// Delegates to post_force() with the same vflag.
+void FixAveForceCuda::min_post_force(int vflag)
+{
+  this->post_force(vflag);
+}
+
+/* ----------------------------------------------------------------------
+   global vector: component n of the total force on the fix group before
+   the force was changed, as left in foriginal_all by the last reduction
+------------------------------------------------------------------------- */
+double FixAveForceCuda::compute_vector(int n)
+{
+  return foriginal_all[n];   // no reduction here; post_force()/post_force_respa() already summed
+}
diff --git a/src/USER-CUDA/fix_aveforce_cuda.h b/src/USER-CUDA/fix_aveforce_cuda.h
new file mode 100644
index 0000000000..987ee9d996
--- /dev/null
+++ b/src/USER-CUDA/fix_aveforce_cuda.h
@@ -0,0 +1,64 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#ifdef FIX_CLASS
+
+FixStyle(aveforce/cuda,FixAveForceCuda)
+
+#else
+
+
+#ifndef LMP_FIX_AVE_FORCE_CUDA_H
+#define LMP_FIX_AVE_FORCE_CUDA_H
+
+#include "fix.h"
+#include "cuda_data.h"
+
+namespace LAMMPS_NS {
+
+class FixAveForceCuda : public Fix {   // fix style aveforce/cuda
+ public:
+  FixAveForceCuda(class LAMMPS *, int, char **);
+  ~FixAveForceCuda() { delete cu_foriginal; }  // frees the wrapper allocated in init(); still NULL (a no-op) if init() never ran
+  int setmask();
+  void init();
+  void setup(int);
+  void min_setup(int);
+  void post_force(int);
+  void post_force_respa(int, int, int);
+  void min_post_force(int);
+  double compute_vector(int);    // returns foriginal_all[n]
+
+ private:
+  class Cuda *cuda;                             // lmp->cuda, verified non-NULL in the constructor
+  int xflag,yflag,zflag,iregion;                // 0 for a NULL component; region index (must stay -1)
+  double xvalue,yvalue,zvalue;                  // offsets added to the averaged force
+  double foriginal_all[4],foriginal[4];         // sums over all procs / per-proc sums; entry 3 is the atom count
+  cCudaData<double, F_FLOAT, x>* cu_foriginal;  // device-side counterpart of foriginal
+  int nlevels_respa;                            // set in init() when the integrate style is "respa"
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-CUDA/fix_enforce2d_cuda.cpp b/src/USER-CUDA/fix_enforce2d_cuda.cpp
new file mode 100644
index 0000000000..b9ac0341ad
--- /dev/null
+++ b/src/USER-CUDA/fix_enforce2d_cuda.cpp
@@ -0,0 +1,169 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under 
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#include <cstring>
+#include "fix_enforce2d_cuda.h"
+#include "fix_enforce2d_cuda_cu.h"
+#include "atom.h"
+#include "update.h"
+#include "domain.h"
+#include "respa.h"
+#include "error.h"
+#include "cuda.h"
+#include "cuda_modify_flags.h"
+
+using namespace LAMMPS_NS;
+
+/* ---------------------------------------------------------------------- */
+// Takes exactly 3 args (no options) and requires an active Cuda
+// instance.
+FixEnforce2DCuda::FixEnforce2DCuda(LAMMPS *lmp, int narg, char **arg) :
+  Fix(lmp, narg, arg)
+{
+  cuda = lmp->cuda;
+  if (!cuda)
+    error->all("You cannot use a /cuda class, without activating 'cuda' acceleration. Use no '-a' command line argument, or '-a cuda'.");
+  if (3 != narg) error->all("Illegal fix enforce2d command");
+}
+
+/* ---------------------------------------------------------------------- */
+// Returns the callback bitmask of this fix, built by OR-ing the
+// POST_FORCE_CUDA, POST_FORCE_RESPA and MIN_POST_FORCE_CUDA
+// flags.
+int FixEnforce2DCuda::setmask()
+{
+  return POST_FORCE_CUDA |
+         POST_FORCE_RESPA |
+         MIN_POST_FORCE_CUDA;
+}
+
+/* ---------------------------------------------------------------------- */
+// Rejects 3d simulations and warns, separately for omega, angmom and torque,
+// that an enabled attribute is handled on the cpu instead of the gpu.
+void FixEnforce2DCuda::init()
+{
+  if (3 == domain->dimension)
+    error->all("Cannot use fix enforce2d/cuda with 3d simulation");
+  // the three warnings are independent of each other
+  if (atom->omega_flag)
+    error->warning("Enforce2d/cuda does not support omega_flag on gpu yet. Will be handled on cpu.");
+  if (atom->angmom_flag)
+    error->warning("Enforce2d/cuda does not support angmom_flag (angular momentum) on gpu yet. Will be handled on cpu.");
+  if (atom->torque_flag)
+    error->warning("Enforce2d/cuda does not support torque_flag on gpu yet. Will be handled on cpu.");
+}
+
+/* ---------------------------------------------------------------------- */
+// Applies the fix once before a run: "verlet" brackets post_force() with uploads/downloads of cu_f and cu_v; other styles loop over all rRESPA levels.
+void FixEnforce2DCuda::setup(int vflag)
+{
+  if (strcmp(update->integrate_style,"verlet") != 0) {
+    Respa *respa = (Respa *) update->integrate;
+    const int nlevels_respa = respa->nlevels;
+    for (int level = 0; level < nlevels_respa; ++level) {
+      respa->copy_flevel_f(level);
+      post_force_respa(vflag,level,0);
+      respa->copy_f_flevel(level);
+    }
+    return;
+  }
+
+  Cuda_FixEnforce2dCuda_Init(&cuda->shared_data);
+  cuda->cu_f->upload();
+  cuda->cu_v->upload();
+  post_force(vflag);
+  cuda->cu_f->download();
+  cuda->cu_v->download();
+}
+
+/* ---------------------------------------------------------------------- */
+// Delegates to post_force() with the same vflag.
+void FixEnforce2DCuda::min_setup(int vflag)
+{
+  this->post_force(vflag);
+}
+
+/* ----------------------------------------------------------------------
+   helper: zero components 0 and 1 of a per-atom array for the first n
+   atoms whose mask matches groupbit
+------------------------------------------------------------------------- */
+
+static void enforce2d_zero_xy(double **vec, const int *mask, int groupbit, int n)
+{
+  for (int i = 0; i < n; i++) {
+    if (!(mask[i] & groupbit)) continue;
+    vec[i][0] = 0.0;
+    vec[i][1] = 0.0;
+  }
+}
+
+/* ----------------------------------------------------------------------
+   run the CUDA post-force routine for the group, then zero components
+   0 and 1 of omega, angmom and torque on the host, for whichever of
+   these arrays the corresponding atom flag enables
+------------------------------------------------------------------------- */
+
+void FixEnforce2DCuda::post_force(int vflag)
+{
+  // device side
+  Cuda_FixEnforce2dCuda_PostForce(&cuda->shared_data, groupbit);
+
+  // host side: only nfirst atoms are visited when igroup is atom->firstgroup
+  const int *mask = atom->mask;
+  const int count = (igroup == atom->firstgroup) ? atom->nfirst : atom->nlocal;
+
+  // same order as before: omega, angmom, torque
+  if (atom->omega_flag)
+    enforce2d_zero_xy(atom->omega, mask, groupbit, count);
+  if (atom->angmom_flag)
+    enforce2d_zero_xy(atom->angmom, mask, groupbit, count);
+  if (atom->torque_flag)
+    enforce2d_zero_xy(atom->torque, mask, groupbit, count);
+}
+
+/* ---------------------------------------------------------------------- */
+// Calls post_force() regardless of level; ilevel and iloop are not used.
+void FixEnforce2DCuda::post_force_respa(int vflag, int ilevel, int iloop)
+{
+  this->post_force(vflag);
+}
+
+/* ---------------------------------------------------------------------- */
+// Delegates to post_force() with the same vflag.
+void FixEnforce2DCuda::min_post_force(int vflag)
+{
+  this->post_force(vflag);
+}
diff --git a/src/USER-CUDA/fix_enforce2d_cuda.h b/src/USER-CUDA/fix_enforce2d_cuda.h
new file mode 100644
index 0000000000..2abb1ffa18
--- /dev/null
+++ b/src/USER-CUDA/fix_enforce2d_cuda.h
@@ -0,0 +1,55 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#ifdef FIX_CLASS
+
+FixStyle(enforce2d/cuda,FixEnforce2DCuda)
+
+#else
+
+#ifndef LMP_FIX_ENFORCE2D_CUDA_H
+#define LMP_FIX_ENFORCE2D_CUDA_H
+
+#include "fix.h"
+
+namespace LAMMPS_NS {
+
+class FixEnforce2DCuda : public Fix {   // fix style enforce2d/cuda
+ public:
+  FixEnforce2DCuda(class LAMMPS *, int, char **);   // requires narg == 3 and an active Cuda instance
+  int setmask();                          // POST_FORCE_CUDA | POST_FORCE_RESPA | MIN_POST_FORCE_CUDA
+  void init();                            // rejects 3d runs, warns about attributes handled on the cpu
+  void setup(int);                        // one application of the fix before a run
+  void min_setup(int);                    // forwards to post_force
+  void post_force(int);                   // CUDA routine plus host-side zeroing of omega/angmom/torque
+  void post_force_respa(int, int, int);   // forwards to post_force on every level
+  void min_post_force(int);               // forwards to post_force
+
+  private:
+  class Cuda *cuda;                       // lmp->cuda, verified non-NULL in the constructor
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-CUDA/fix_freeze_cuda.cpp b/src/USER-CUDA/fix_freeze_cuda.cpp
new file mode 100644
index 0000000000..c13dc02cdc
--- /dev/null
+++ b/src/USER-CUDA/fix_freeze_cuda.cpp
@@ -0,0 +1,135 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+#include <cstring>
+#include <cstdlib>
+#include "fix_freeze_cuda.h"
+#include "fix_freeze_cuda_cu.h"
+#include "atom.h"
+#include "update.h"
+#include "respa.h"
+#include "error.h"
+#include "cuda.h"
+#include "memory.h"
+#include "modify.h"
+#include "cuda_modify_flags.h"
+
+
+using namespace LAMMPS_NS;
+
+/* ---------------------------------------------------------------------- */
+// Takes exactly 3 args (no options).  Requires an active Cuda instance
+// and an atom style that provides torque.
+FixFreezeCuda::FixFreezeCuda(LAMMPS *lmp, int narg, char **arg) :
+  Fix(lmp, narg, arg)
+{
+  cuda = lmp->cuda;
+  if (!cuda)
+    error->all("You cannot use a /cuda class, without activating 'cuda' acceleration. Use no '-a' command line argument, or '-a cuda'.");
+  if (3 != narg) error->all("Illegal fix freeze command");
+
+  if (atom->torque_flag == 0)
+    error->all("Fix freeze requires atom attribute torque");
+
+  // this fix exposes a global vector of length 3
+  vector_flag = 1;
+  size_vector = 3;
+  global_freq = 1;
+  extvector = 1;
+
+  // nothing accumulated yet; the cCudaData wrapper is created in init()
+  force_flag = 0;
+  for (int k = 0; k < 3; k++) foriginal[k] = 0.0;
+  cu_foriginal = NULL;
+}
+
+/* ---------------------------------------------------------------------- */
+// Returns the callback bitmask of this fix: POST_FORCE_CUDA OR-ed with
+// THERMO_ENERGY_CUDA.  No rRESPA or minimizer flag is set for this
+// style.
+int FixFreezeCuda::setmask()
+{
+  return POST_FORCE_CUDA |
+         THERMO_ENERGY_CUDA;
+}
+
+/* ---------------------------------------------------------------------- */
+// Creates the cCudaData wrapper around foriginal on first use, then enforces a single freeze fix.
+// This style registers as "freeze/cuda", so both spellings are counted (only "freeze" was before).
+void FixFreezeCuda::init()
+{
+  if (!cu_foriginal) cu_foriginal = new cCudaData<double, F_FLOAT, x> (foriginal,3);
+  int count = 0;
+  for (int i = 0; i < modify->nfix; i++)
+    if (!strcmp(modify->fix[i]->style,"freeze") || !strcmp(modify->fix[i]->style,"freeze/cuda")) count++;
+  if (count > 1) error->all("More than one fix freeze");
+}
+
+/* ---------------------------------------------------------------------- */
+// Applies the fix once before a run, for the "verlet" integrate style only:
+// CUDA-side init, then post_force() between an upload and a download of cu_f.
+void FixFreezeCuda::setup(int vflag)
+{
+  MYDBG( printf("# CUDA: FixFreezeCuda::setup\n"); )
+
+  const bool verlet = (strcmp(update->integrate_style,"verlet") == 0);
+  if (verlet) {
+    Cuda_FixFreezeCuda_Init(&cuda->shared_data);
+    cuda->cu_f->upload();
+    post_force(vflag);
+    cuda->cu_f->download();
+  }
+
+  MYDBG( printf("# CUDA: FixFreezeCuda::setup done\n"); )
+}
+
+/* ---------------------------------------------------------------------- */
+// Zeroes the device side of cu_foriginal, calls the CUDA post-force routine
+// for this group, then downloads cu_foriginal.  Clearing force_flag makes
+// compute_vector() redo its MPI sum.
+void FixFreezeCuda::post_force(int vflag)
+{
+  MYDBG( printf("# CUDA: FixFreezeCuda::postforce start\n"); )
+  force_flag = 0;                    // cached MPI sums are stale from here on
+  cu_foriginal->memset_device(0);    // the routine below receives a zeroed device buffer
+  Cuda_FixFreezeCuda_PostForce(&cuda->shared_data, groupbit, (F_FLOAT*) cu_foriginal->dev_data());
+  cu_foriginal->download();          // presumably refreshes the host array foriginal
+}
+
+/* ---------------------------------------------------------------------- */
+
+
+
+/* ----------------------------------------------------------------------
+   return component n (0..2) of total force on fix group before force was changed
+------------------------------------------------------------------------- */
+double FixFreezeCuda::compute_vector(int n)
+{
+  // only sum across procs one time; post_force() clears force_flag again
+  if (force_flag == 0) {
+    MPI_Allreduce(foriginal,foriginal_all,3,MPI_DOUBLE,MPI_SUM,world);
+    force_flag = 1;
+  }
+  // foriginal_all holds exactly 3 entries with no leading scalar slot, so component n is at
+  // index n; the former n+1 skipped entry 0 and read past the end of the array for n == 2
+  return foriginal_all[n];
+}
diff --git a/src/USER-CUDA/fix_freeze_cuda.h b/src/USER-CUDA/fix_freeze_cuda.h
new file mode 100644
index 0000000000..019301096c
--- /dev/null
+++ b/src/USER-CUDA/fix_freeze_cuda.h
@@ -0,0 +1,57 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#ifdef FIX_CLASS
+
+// Expanded only when FIX_CLASS is defined: passes the style name freeze/cuda
+// and the class FixFreezeCuda to the FixStyle macro (defined by the includer).
+FixStyle(freeze/cuda,FixFreezeCuda)
+
+#else
+
+#ifndef LMP_FIX_FREEZE_CUDA_H
+#define LMP_FIX_FREEZE_CUDA_H
+
+#include "fix.h"
+#include "cuda_data.h"
+
+namespace LAMMPS_NS {
+
+// Fix subclass paired with the style name freeze/cuda by the FixStyle entry
+// in this header. Its post_force() step is delegated to
+// Cuda_FixFreezeCuda_PostForce; compute_vector() returns entries of the
+// MPI-summed foriginal array.
+class FixFreezeCuda : public Fix {
+ public:
+  FixFreezeCuda(class LAMMPS *, int, char **);
+  int setmask();
+  // creates cu_foriginal on first call and checks for duplicate freeze fixes
+  void init();
+  // for the "verlet" integrate style: CUDA init plus one post_force() call
+  void setup(int);
+  void post_force(int);
+  // returns one entry of foriginal_all, reducing across processes when needed
+  double compute_vector(int);
+
+ private:
+  // CUDA object whose shared_data is handed to the Cuda_FixFreezeCuda_* calls
+  class Cuda *cuda;
+  // foriginal: local 3-entry array wrapped by cu_foriginal
+  // foriginal_all: MPI_SUM reduction of foriginal over all processes
+  double foriginal[3],foriginal_all[3];
+  // wrapper around foriginal, allocated lazily in init()
+  cCudaData<double     , F_FLOAT   		, x>* cu_foriginal;	
+  // 0 after post_force(), set to 1 once compute_vector() has reduced
+  int force_flag;
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-CUDA/fix_gravity_cuda.cpp b/src/USER-CUDA/fix_gravity_cuda.cpp
new file mode 100644
index 0000000000..650e9f7ae8
--- /dev/null
+++ b/src/USER-CUDA/fix_gravity_cuda.cpp
@@ -0,0 +1,181 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+#include <cmath>
+#include <cstdio>
+#include <cstring>
+#include <cstdlib>
+#include "fix_gravity_cuda.h"
+#include "fix_gravity_cuda_cu.h"
+#include "atom.h"
+#include "update.h"
+#include "domain.h"
+#include "respa.h"
+#include "error.h"
+#include "cuda.h"
+#include "cuda_modify_flags.h"
+
+
+using namespace LAMMPS_NS;
+
+// values of FixGravityCuda::style, selected by the keyword in arg[4]
+enum{CHUTE,SPHERICAL,GRADIENT,VECTOR};
+
+/* ---------------------------------------------------------------------- */
+
+// Constructor: arg[3] is the magnitude and arg[4] the style keyword
+// (chute, spherical, gradient or vector), followed by a fixed number of
+// style-specific values. Stores the parsed parameters and derives the
+// initial gravity direction (xgrav,ygrav,zgrav).
+FixGravityCuda::FixGravityCuda(LAMMPS *lmp, int narg, char **arg) :
+  Fix(lmp, narg, arg)
+{
+  cuda = lmp->cuda;
+  if (cuda == NULL)
+    error->all("You cannot use a /cuda class, without activating 'cuda' acceleration. Use no '-a' command line argument, or '-a cuda'.");
+
+  if (narg < 5) error->all("Illegal fix gravity command");
+
+  time_depend = 1;
+  magnitude = atof(arg[3]);
+
+  // each keyword requires an exact argument count
+
+  const char *keyword = arg[4];
+  if (strcmp(keyword,"chute") == 0) {
+    if (narg != 6) error->all("Illegal fix gravity command");
+    style = CHUTE;
+    phi = 0.0;
+    theta = 180.0 - atof(arg[5]);
+  } else if (strcmp(keyword,"spherical") == 0) {
+    if (narg != 7) error->all("Illegal fix gravity command");
+    style = SPHERICAL;
+    phi = atof(arg[5]);
+    theta = atof(arg[6]);
+  } else if (strcmp(keyword,"gradient") == 0) {
+    if (narg != 9) error->all("Illegal fix gravity command");
+    style = GRADIENT;
+    phi = atof(arg[5]);
+    theta = atof(arg[6]);
+    phigrad = atof(arg[7]);
+    thetagrad = atof(arg[8]);
+  } else if (strcmp(keyword,"vector") == 0) {
+    if (narg != 8) error->all("Illegal fix gravity command");
+    style = VECTOR;
+    xdir = atof(arg[5]);
+    ydir = atof(arg[6]);
+    zdir = atof(arg[7]);
+  } else error->all("Illegal fix gravity command");
+
+  const double pi = 4.0*atan(1.0);
+  degree2rad = pi/180.0;
+
+  // direction vector: from the angles for the angular styles,
+  // from the normalized input vector for the vector style
+
+  const bool three_d = (domain->dimension == 3);
+
+  switch (style) {
+  case CHUTE:
+  case SPHERICAL:
+  case GRADIENT:
+    if (three_d) {
+      xgrav = sin(degree2rad * theta) * cos(degree2rad * phi);
+      ygrav = sin(degree2rad * theta) * sin(degree2rad * phi);
+      zgrav = cos(degree2rad * theta);
+    } else {
+      xgrav = sin(degree2rad * theta);
+      ygrav = cos(degree2rad * theta);
+      zgrav = 0.0;
+    }
+    break;
+
+  case VECTOR:
+    if (three_d) {
+      const double norm = sqrt(xdir*xdir + ydir*ydir + zdir*zdir);
+      xgrav = xdir/norm;
+      ygrav = ydir/norm;
+      zgrav = zdir/norm;
+    } else {
+      const double norm = sqrt(xdir*xdir + ydir*ydir);
+      xgrav = xdir/norm;
+      ygrav = ydir/norm;
+      zgrav = 0.0;
+    }
+    break;
+  }
+
+  // reference step for the gradient style in post_force()
+
+  time_origin = update->ntimestep;
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Returns the hook mask of this fix: only POST_FORCE_CUDA is set.
+int FixGravityCuda::setmask()
+{
+  const int mask = 0 | POST_FORCE_CUDA;
+  return mask;
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Caches the timestep size and computes the acceleration components as
+// magnitude times the current direction vector.
+void FixGravityCuda::init()
+{
+  // dt is used by post_force() when the style is GRADIENT
+  dt = update->dt;
+
+  xacc = magnitude*xgrav;
+  yacc = magnitude*ygrav;
+  zacc = magnitude*zgrav;
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Setup hook. For the "verlet" integrate style it calls
+// Cuda_FixGravityCuda_Init and runs post_force() once between
+// cuda->cu_f->upload() and cuda->cu_f->download(); any other integrate
+// style does nothing besides the debug output.
+void FixGravityCuda::setup(int vflag)
+{
+  MYDBG( printf("# CUDA: FixGravityCuda::setup\n"); )
+
+  const bool verlet_run = (strcmp(update->integrate_style,"verlet") == 0);
+  if (verlet_run) {
+    Cuda_FixGravityCuda_Init(&cuda->shared_data);
+
+    cuda->cu_f->upload();
+    post_force(vflag);
+    cuda->cu_f->download();
+  }
+
+  MYDBG( printf("# CUDA: FixGravityCuda::setup done\n"); )
+}
+
+/* ---------------------------------------------------------------------- */
+
+// For the GRADIENT style, recomputes the direction vector and the
+// acceleration from the steps elapsed since time_origin; then passes the
+// acceleration to Cuda_FixGravityCuda_PostForce for this fix's group.
+void FixGravityCuda::post_force(int vflag)
+{
+  if (style == GRADIENT) {
+    // both angles advance linearly with (ntimestep - time_origin)*dt
+    const double elapsed = (update->ntimestep - time_origin)*dt;
+    const double theta_now = degree2rad * (theta + elapsed*thetagrad*360.0);
+
+    if (domain->dimension == 3) {
+      const double phi_now = degree2rad * (phi + elapsed*phigrad*360.0);
+      xgrav = sin(theta_now) * cos(phi_now);
+      ygrav = sin(theta_now) * sin(phi_now);
+      zgrav = cos(theta_now);
+    } else {
+      // zgrav is left untouched in 2d
+      xgrav = sin(theta_now);
+      ygrav = cos(theta_now);
+    }
+
+    xacc = magnitude*xgrav;
+    yacc = magnitude*ygrav;
+    zacc = magnitude*zgrav;
+  }
+
+  MYDBG( printf("# CUDA: FixGravityCuda::postforce start\n"); )
+  Cuda_FixGravityCuda_PostForce(&cuda->shared_data, groupbit, xacc,yacc,zacc);
+}
+
+
diff --git a/src/USER-CUDA/fix_gravity_cuda.h b/src/USER-CUDA/fix_gravity_cuda.h
new file mode 100644
index 0000000000..f4aef37790
--- /dev/null
+++ b/src/USER-CUDA/fix_gravity_cuda.h
@@ -0,0 +1,60 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#ifdef FIX_CLASS
+
+// Expanded only when FIX_CLASS is defined: passes the style name gravity/cuda
+// and the class FixGravityCuda to the FixStyle macro (defined by the includer).
+FixStyle(gravity/cuda,FixGravityCuda)
+
+#else
+
+#ifndef LMP_FIX_GRAVITY_CUDA_H
+#define LMP_FIX_GRAVITY_CUDA_H
+
+#include "fix.h"
+#include "cuda_data.h"
+
+namespace LAMMPS_NS {
+
+// Fix subclass paired with the style name gravity/cuda by the FixStyle entry
+// in this header. The constructor parses magnitude and direction; post_force()
+// hands the acceleration vector to Cuda_FixGravityCuda_PostForce.
+class FixGravityCuda : public Fix {
+ public:
+  FixGravityCuda(class LAMMPS *, int, char **);
+  int setmask();
+  // caches update->dt and computes xacc/yacc/zacc
+  void init();
+  // for the "verlet" integrate style: CUDA init plus one post_force() call
+  void setup(int);
+  void post_force(int);
+
+ private:
+  // lmp->cuda; the constructor raises an error when it is NULL
+  class Cuda *cuda;
+  // one of CHUTE, SPHERICAL, GRADIENT, VECTOR (enum in the .cpp file)
+  int style;
+  // magnitude: atof(arg[3]); dt: copy of update->dt taken in init()
+  double magnitude,dt;
+  // phi,theta: angles, multiplied by degree2rad before use
+  // phigrad,thetagrad: only parsed and used for the GRADIENT style
+  double phi,theta,phigrad,thetagrad;
+  // input direction for the VECTOR style, normalized into xgrav,ygrav,zgrav
+  double xdir,ydir,zdir;
+  // x/y/zgrav: direction vector; x/y/zacc: magnitude times that direction
+  double xgrav,ygrav,zgrav,xacc,yacc,zacc;
+  // PI/180
+  double degree2rad;
+  // update->ntimestep at construction; reference point for GRADIENT
+  int time_origin;
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-CUDA/fix_nh_cuda.cpp b/src/USER-CUDA/fix_nh_cuda.cpp
new file mode 100644
index 0000000000..b495850d0c
--- /dev/null
+++ b/src/USER-CUDA/fix_nh_cuda.cpp
@@ -0,0 +1,2077 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under 
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing authors: Mark Stevens (SNL), Aidan Thompson (SNL)
+------------------------------------------------------------------------- */
+
+#include <cstring>
+#include <cstdlib>
+#include <cmath>
+#include "fix_nh_cuda.h"
+#include "atom.h"
+#include "force.h"
+#include "comm.h"
+#include "modify.h"
+#include "fix_deform.h"
+#include "compute.h"
+#include "kspace.h"
+#include "update.h"
+#include "respa.h"
+#include "domain.h"
+#include "memory.h"
+#include "error.h"
+#include "math_extra.h"
+#include "cuda.h"
+#include "fix_nh_cuda_cu.h"
+#include "cuda_modify_flags.h"
+
+using namespace LAMMPS_NS;
+
+// the whole expansion is parenthesized so the conditional operator cannot
+// capture neighbouring operands when the macro is used inside an expression
+#define MIN(A,B) (((A) < (B)) ? (A) : (B))
+#define MAX(A,B) (((A) > (B)) ? (A) : (B))
+
+// values of `which`, chosen in init() from temperature->tempbias
+enum{NOBIAS,BIAS};
+// values of pcouple, set by the iso/aniso/tri/couple keywords
+enum{NONE,XYZ,XY,YZ,XZ};
+// values of pstyle, derived in the constructor from p_flag[] and pcouple
+enum{ISO,ANISO,TRICLINIC};
+
+/* ----------------------------------------------------------------------
+   NVT,NPH,NPT integrators for improved Nose-Hoover equations of motion 
+ ---------------------------------------------------------------------- */
+
+// Constructor: parses the keyword list that starts at arg[3], validates the
+// thermostat/barostat settings against the dimension and periodicity of the
+// box, converts damping periods into frequencies and allocates the
+// Nose-Hoover chain arrays.
+// NOTE(review): id_temp, id_press, tflag and pflag are read by init() and the
+// destructor but are not assigned here - presumably derived classes set
+// them; confirm.
+FixNHCuda::FixNHCuda(LAMMPS *lmp, int narg, char **arg) : Fix(lmp, narg, arg)
+{
+  cuda = lmp->cuda;
+   if(cuda == NULL)
+        error->all("You cannot use a /cuda class, without activating 'cuda' acceleration. Use no '-a' command line argument, or '-a cuda'.");
+
+  if (narg < 4) error->all("Illegal fix nvt/npt/nph command");
+
+  restart_global = 1;
+  time_integrate = 1;
+  scalar_flag = 1;
+  vector_flag = 1;
+  global_freq = 1;
+  extscalar = 1;
+  extvector = 0;
+
+  // default values
+
+  pcouple = NONE;
+  drag = 0.0;
+  allremap = 1;
+  mtchain = mpchain = 3;
+  nc_tchain = nc_pchain = 1;
+  mtk_flag = 1;
+  deviatoric_flag = 0;
+  nreset_h0 = 0;
+
+  // Used by FixNVTSllod to preserve non-default value
+
+  mtchain_default_flag = 1;
+
+  tstat_flag = 0;
+  double t_period = 0.0;
+
+  // indices 0-2 are set by x,y,z and indices 3,4,5 by yz,xz,xy (see below)
+
+  double p_period[6];
+  for (int i = 0; i < 6; i++) {
+    p_start[i] = p_stop[i] = p_period[i] = 0.0;
+    p_flag[i] = 0;
+  }
+
+  // process keywords
+
+  dimension = domain->dimension;
+
+  int iarg = 3;
+
+  while (iarg < narg) {
+    // temp: thermostat start/stop temperature and damping period
+    if (strcmp(arg[iarg],"temp") == 0) {
+      if (iarg+4 > narg) error->all("Illegal fix nvt/npt/nph command");
+      tstat_flag = 1;
+      t_start = atof(arg[iarg+1]);
+      t_stop = atof(arg[iarg+2]);
+      t_period = atof(arg[iarg+3]);
+      if (t_start < 0.0 || t_stop <= 0.0)
+	error->all("Target T for fix nvt/npt/nph cannot be 0.0");
+      iarg += 4;
+
+    // iso/aniso/tri: one start/stop/period triple applied to x,y,z at once;
+    // they differ in pcouple, and tri also enables the components 3-5
+    } else if (strcmp(arg[iarg],"iso") == 0) {
+      if (iarg+4 > narg) error->all("Illegal fix nvt/npt/nph command");
+      pcouple = XYZ;
+      p_start[0] = p_start[1] = p_start[2] = atof(arg[iarg+1]);
+      p_stop[0] = p_stop[1] = p_stop[2] = atof(arg[iarg+2]);
+      p_period[0] = p_period[1] = p_period[2] = atof(arg[iarg+3]);
+      p_flag[0] = p_flag[1] = p_flag[2] = 1;
+      if (dimension == 2) {
+	p_start[2] = p_stop[2] = p_period[2] = 0.0;
+	p_flag[2] = 0;
+      }
+      iarg += 4; 
+    } else if (strcmp(arg[iarg],"aniso") == 0) {
+      if (iarg+4 > narg) error->all("Illegal fix nvt/npt/nph command");
+      pcouple = NONE;
+      p_start[0] = p_start[1] = p_start[2] = atof(arg[iarg+1]);
+      p_stop[0] = p_stop[1] = p_stop[2] = atof(arg[iarg+2]);
+      p_period[0] = p_period[1] = p_period[2] = atof(arg[iarg+3]);
+      p_flag[0] = p_flag[1] = p_flag[2] = 1;
+      if (dimension == 2) {
+	p_start[2] = p_stop[2] = p_period[2] = 0.0;
+	p_flag[2] = 0;
+      }
+      iarg += 4;
+    } else if (strcmp(arg[iarg],"tri") == 0) {
+      if (iarg+4 > narg) error->all("Illegal fix nvt/npt/nph command");
+      pcouple = NONE;
+      p_start[0] = p_start[1] = p_start[2] = atof(arg[iarg+1]);
+      p_stop[0] = p_stop[1] = p_stop[2] = atof(arg[iarg+2]);
+      p_period[0] = p_period[1] = p_period[2] = atof(arg[iarg+3]);
+      p_flag[0] = p_flag[1] = p_flag[2] = 1;
+      p_start[3] = p_start[4] = p_start[5] = 0.0;
+      p_stop[3] = p_stop[4] = p_stop[5] = 0.0;
+      p_period[3] = p_period[4] = p_period[5] = atof(arg[iarg+3]);
+      p_flag[3] = p_flag[4] = p_flag[5] = 1;
+      if (dimension == 2) {
+	p_start[2] = p_stop[2] = p_period[2] = 0.0;
+	p_flag[2] = 0;
+	p_start[3] = p_stop[3] = p_period[3] = 0.0;
+	p_flag[3] = 0;
+	p_start[4] = p_stop[4] = p_period[4] = 0.0;
+	p_flag[4] = 0;
+      }
+      iarg += 4;
+
+    // x,y,z: individual components 0,1,2; each also sets deviatoric_flag
+    } else if (strcmp(arg[iarg],"x") == 0) {
+      if (iarg+4 > narg) error->all("Illegal fix nvt/npt/nph command");
+      p_start[0] = atof(arg[iarg+1]);
+      p_stop[0] = atof(arg[iarg+2]);
+      p_period[0] = atof(arg[iarg+3]);
+      p_flag[0] = 1;
+      deviatoric_flag = 1;
+      iarg += 4; 
+    } else if (strcmp(arg[iarg],"y") == 0) {
+      if (iarg+4 > narg) error->all("Illegal fix nvt/npt/nph command");
+      p_start[1] = atof(arg[iarg+1]);
+      p_stop[1] = atof(arg[iarg+2]);
+      p_period[1] = atof(arg[iarg+3]);
+      p_flag[1] = 1;
+      deviatoric_flag = 1;
+      iarg += 4; 
+    } else if (strcmp(arg[iarg],"z") == 0) {
+      if (iarg+4 > narg) error->all("Illegal fix nvt/npt/nph command");
+      p_start[2] = atof(arg[iarg+1]);
+      p_stop[2] = atof(arg[iarg+2]);
+      p_period[2] = atof(arg[iarg+3]);
+      p_flag[2] = 1;
+      deviatoric_flag = 1;
+      iarg += 4; 
+      if (dimension == 2)
+	error->all("Invalid fix nvt/npt/nph command for a 2d simulation");
+
+    // yz,xz,xy: off-diagonal components stored at indices 3,4,5
+    } else if (strcmp(arg[iarg],"yz") == 0) {
+      if (iarg+4 > narg) error->all("Illegal fix nvt/npt/nph command");
+      p_start[3] = atof(arg[iarg+1]);
+      p_stop[3] = atof(arg[iarg+2]);
+      p_period[3] = atof(arg[iarg+3]);
+      p_flag[3] = 1;
+      deviatoric_flag = 1;
+      iarg += 4; 
+      if (dimension == 2)
+	error->all("Invalid fix nvt/npt/nph command for a 2d simulation");
+    } else if (strcmp(arg[iarg],"xz") == 0) {
+      if (iarg+4 > narg) error->all("Illegal fix nvt/npt/nph command");
+      p_start[4] = atof(arg[iarg+1]);
+      p_stop[4] = atof(arg[iarg+2]);
+      p_period[4] = atof(arg[iarg+3]);
+      p_flag[4] = 1;
+      deviatoric_flag = 1;
+      iarg += 4; 
+      if (dimension == 2)
+	error->all("Invalid fix nvt/npt/nph command for a 2d simulation");
+    } else if (strcmp(arg[iarg],"xy") == 0) {
+      if (iarg+4 > narg) error->all("Illegal fix nvt/npt/nph command");
+      p_start[5] = atof(arg[iarg+1]);
+      p_stop[5] = atof(arg[iarg+2]);
+      p_period[5] = atof(arg[iarg+3]);
+      p_flag[5] = 1;
+      deviatoric_flag = 1;
+      iarg += 4; 
+
+    // remaining keywords take a single value
+    } else if (strcmp(arg[iarg],"couple") == 0) {
+      if (iarg+2 > narg) error->all("Illegal fix nvt/npt/nph command");
+      if (strcmp(arg[iarg+1],"xyz") == 0) pcouple = XYZ;
+      else if (strcmp(arg[iarg+1],"xy") == 0) pcouple = XY;
+      else if (strcmp(arg[iarg+1],"yz") == 0) pcouple = YZ;
+      else if (strcmp(arg[iarg+1],"xz") == 0) pcouple = XZ;
+      else if (strcmp(arg[iarg+1],"none") == 0) pcouple = NONE;
+      else error->all("Illegal fix nvt/npt/nph command");
+      iarg += 2;
+
+    } else if (strcmp(arg[iarg],"drag") == 0) {
+      if (iarg+2 > narg) error->all("Illegal fix nvt/npt/nph command");
+      drag = atof(arg[iarg+1]);
+      if (drag < 0.0) error->all("Illegal fix nvt/npt/nph command");
+      iarg += 2;
+    } else if (strcmp(arg[iarg],"dilate") == 0) {
+      if (iarg+2 > narg) error->all("Illegal fix nvt/npt/nph command");
+      if (strcmp(arg[iarg+1],"all") == 0) allremap = 1;
+      else if (strcmp(arg[iarg+1],"partial") == 0) allremap = 0;
+      else error->all("Illegal fix nvt/npt/nph command");
+      iarg += 2;
+    } else if (strcmp(arg[iarg],"tchain") == 0) {
+      if (iarg+2 > narg) error->all("Illegal fix nvt/npt/nph command");
+      mtchain = atoi(arg[iarg+1]);
+      if (mtchain < 1) error->all("Illegal fix nvt/npt/nph command");
+      iarg += 2;
+    } else if (strcmp(arg[iarg],"pchain") == 0) {
+      if (iarg+2 > narg) error->all("Illegal fix nvt/npt/nph command");
+      mpchain = atoi(arg[iarg+1]);
+      if (mpchain < 0) error->all("Illegal fix nvt/npt/nph command");
+      iarg += 2;
+    } else if (strcmp(arg[iarg],"mtk") == 0) {
+      if (iarg+2 > narg) error->all("Illegal fix nvt/npt/nph command");
+      if (strcmp(arg[iarg+1],"yes") == 0) mtk_flag = 1;
+      else if (strcmp(arg[iarg+1],"no") == 0) mtk_flag = 0;
+      else error->all("Illegal fix nvt/npt/nph command");
+      iarg += 2;
+    } else if (strcmp(arg[iarg],"tloop") == 0) {
+      if (iarg+2 > narg) error->all("Illegal fix nvt/npt/nph command");
+      nc_tchain = atoi(arg[iarg+1]);
+      if (nc_tchain < 0) error->all("Illegal fix nvt/npt/nph command");
+      iarg += 2;
+    } else if (strcmp(arg[iarg],"ploop") == 0) {
+      if (iarg+2 > narg) error->all("Illegal fix nvt/npt/nph command");
+      nc_pchain = atoi(arg[iarg+1]);
+      if (nc_pchain < 0) error->all("Illegal fix nvt/npt/nph command");
+      iarg += 2;
+    } else if (strcmp(arg[iarg],"nreset") == 0) {
+      if (iarg+2 > narg) error->all("Illegal fix nvt/npt/nph command");
+      nreset_h0 = atoi(arg[iarg+1]);
+      if (nreset_h0 < 0) error->all("Illegal fix nvt/npt/nph command");
+      iarg += 2;
+    } else error->all("Illegal fix nvt/npt/nph command");
+  }
+
+  // error checks
+
+  if (dimension == 2 && (p_flag[2] || p_flag[3] || p_flag[4]))
+    error->all("Invalid fix nvt/npt/nph command for a 2d simulation");
+  if (dimension == 2 && (pcouple == YZ || pcouple == XZ))
+    error->all("Invalid fix nvt/npt/nph command for a 2d simulation");
+
+  // every coupled component must itself be barostatted
+
+  if (pcouple == XYZ && (p_flag[0] == 0 || p_flag[1] == 0))
+    error->all("Invalid fix nvt/npt/nph command pressure settings");
+  if (pcouple == XYZ && dimension == 3 && p_flag[2] == 0)
+    error->all("Invalid fix nvt/npt/nph command pressure settings");
+  if (pcouple == XY && (p_flag[0] == 0 || p_flag[1] == 0))
+    error->all("Invalid fix nvt/npt/nph command pressure settings");
+  if (pcouple == YZ && (p_flag[1] == 0 || p_flag[2] == 0))
+    error->all("Invalid fix nvt/npt/nph command pressure settings");
+  if (pcouple == XZ && (p_flag[0] == 0 || p_flag[2] == 0))
+    error->all("Invalid fix nvt/npt/nph command pressure settings");
+
+  // barostatted components require periodicity in the matching dimension
+
+  if (p_flag[0] && domain->xperiodic == 0)
+    error->all("Cannot use fix nvt/npt/nph on a non-periodic dimension");
+  if (p_flag[1] && domain->yperiodic == 0)
+    error->all("Cannot use fix nvt/npt/nph on a non-periodic dimension");
+  if (p_flag[2] && domain->zperiodic == 0)
+    error->all("Cannot use fix nvt/npt/nph on a non-periodic dimension");
+  if (p_flag[3] && domain->zperiodic == 0)
+    error->all("Cannot use fix nvt/npt/nph on a 2nd non-periodic dimension");
+  if (p_flag[4] && domain->zperiodic == 0)
+    error->all("Cannot use fix nvt/npt/nph on a 2nd non-periodic dimension");
+  if (p_flag[5] && domain->yperiodic == 0)
+    error->all("Cannot use fix nvt/npt/nph on a 2nd non-periodic dimension");
+
+  if (!domain->triclinic && (p_flag[3] || p_flag[4] || p_flag[5])) 
+    error->all("Can not specify Pxy/Pxz/Pyz in "
+	       "fix nvt/npt/nph with non-triclinic box");
+
+  // coupled components must share identical start/stop/period values
+
+  if (pcouple == XYZ && dimension == 3 &&
+      (p_start[0] != p_start[1] || p_start[0] != p_start[2] || 
+       p_stop[0] != p_stop[1] || p_stop[0] != p_stop[2] || 
+       p_period[0] != p_period[1] || p_period[0] != p_period[2]))
+    error->all("Invalid fix nvt/npt/nph pressure settings");
+  if (pcouple == XYZ && dimension == 2 &&
+      (p_start[0] != p_start[1] || p_stop[0] != p_stop[1] || 
+       p_period[0] != p_period[1]))
+    error->all("Invalid fix nvt/npt/nph pressure settings");
+  if (pcouple == XY && 
+      (p_start[0] != p_start[1] || p_stop[0] != p_stop[1] || 
+       p_period[0] != p_period[1]))
+    error->all("Invalid fix nvt/npt/nph pressure settings");
+  if (pcouple == YZ && 
+      (p_start[1] != p_start[2] || p_stop[1] != p_stop[2] ||
+       p_period[1] != p_period[2]))
+    error->all("Invalid fix nvt/npt/nph pressure settings");
+  if (pcouple == XZ && 
+      (p_start[0] != p_start[2] || p_stop[0] != p_stop[2] ||
+       p_period[0] != p_period[2]))
+    error->all("Invalid fix nvt/npt/nph pressure settings");
+
+  // periods are inverted below, so every active one must be positive
+
+  if ((tstat_flag && t_period <= 0.0) || 
+      (p_flag[0] && p_period[0] <= 0.0) || 
+      (p_flag[1] && p_period[1] <= 0.0) || 
+      (p_flag[2] && p_period[2] <= 0.0) ||
+      (p_flag[3] && p_period[3] <= 0.0) || 
+      (p_flag[4] && p_period[4] <= 0.0) || 
+      (p_flag[5] && p_period[5] <= 0.0))
+    error->all("Fix nvt/npt/nph damping parameters must be > 0.0");
+
+  // set pstat_flag and box change variables
+
+  pstat_flag = 0;
+  for (int i = 0; i < 6; i++)
+    if (p_flag[i]) pstat_flag = 1;
+
+  if (pstat_flag) {
+    box_change = 1;
+    if (p_flag[0] || p_flag[1] || p_flag[2]) box_change_size = 1;
+    if (p_flag[3] || p_flag[4] || p_flag[5]) box_change_shape = 1;
+    no_change_box = 1;
+    if (allremap == 0) restart_pbc = 1;
+  }
+
+  // pstyle = TRICLINIC if any off-diagonal term is controlled -> 6 dof
+  // else pstyle = ISO if XYZ coupling or XY coupling in 2d -> 1 dof
+  // else pstyle = ANISO -> 3 dof
+
+  if (p_flag[3] || p_flag[4] || p_flag[5]) pstyle = TRICLINIC;
+  else if (pcouple == XYZ || (dimension == 2 && pcouple == XY)) pstyle = ISO;
+  else pstyle = ANISO;
+
+  // convert input periods to frequencies
+
+  t_freq = 0.0;
+  p_freq[0] = p_freq[1] = p_freq[2] = p_freq[3] = p_freq[4] = p_freq[5] = 0.0;
+
+  if (tstat_flag) t_freq = 1.0 / t_period;
+  if (p_flag[0]) p_freq[0] = 1.0 / p_period[0];
+  if (p_flag[1]) p_freq[1] = 1.0 / p_period[1];
+  if (p_flag[2]) p_freq[2] = 1.0 / p_period[2];
+  if (p_flag[3]) p_freq[3] = 1.0 / p_period[3];
+  if (p_flag[4]) p_freq[4] = 1.0 / p_period[4];
+  if (p_flag[5]) p_freq[5] = 1.0 / p_period[5];
+
+  // Nose/Hoover temp and pressure init
+  // size_vector is accumulated from the pieces allocated below
+
+  size_vector = 0;
+
+  if (tstat_flag) {
+    int ich;
+    eta = new double[mtchain];
+
+    // add one extra dummy thermostat, set to zero
+
+    eta_dot = new double[mtchain+1];
+    eta_dot[mtchain] = 0.0;
+    eta_dotdot = new double[mtchain];
+    for (ich = 0; ich < mtchain; ich++) {
+      eta[ich] = eta_dot[ich] = eta_dotdot[ich] = 0.0;
+    }
+    // eta_mass is allocated here but not initialized; setup() fills it
+    eta_mass = new double[mtchain];
+    size_vector += 2*2*mtchain;
+  }
+
+  if (pstat_flag) {
+    omega[0] = omega[1] = omega[2] = 0.0;
+    omega_dot[0] = omega_dot[1] = omega_dot[2] = 0.0;
+    omega_mass[0] = omega_mass[1] = omega_mass[2] = 0.0;
+    omega[3] = omega[4] = omega[5] = 0.0;
+    omega_dot[3] = omega_dot[4] = omega_dot[5] = 0.0;
+    omega_mass[3] = omega_mass[4] = omega_mass[5] = 0.0;
+    if (pstyle == ISO) size_vector += 2*2*1;
+    else if (pstyle == ANISO) size_vector += 2*2*3;
+    else if (pstyle == TRICLINIC) size_vector += 2*2*6;
+    
+    // mpchain == 0 is accepted by the pchain keyword; no chain is allocated then
+    if (mpchain) {
+      int ich;
+      etap = new double[mpchain];
+
+      // add one extra dummy thermostat, set to zero
+
+      etap_dot = new double[mpchain+1];
+      etap_dot[mpchain] = 0.0;
+      etap_dotdot = new double[mpchain];
+      for (ich = 0; ich < mpchain; ich++) {
+	etap[ich] = etap_dot[ich] = 
+	  etap_dotdot[ich] = 0.0;
+      }
+      // etap_mass is allocated here but not initialized; setup() fills it
+      etap_mass = new double[mpchain];
+      size_vector += 2*2*mpchain;
+    }
+
+    if (deviatoric_flag) size_vector += 1;
+  }
+
+  // rfix is rebuilt in init()
+
+  nrigid = 0;
+  rfix = NULL;
+
+  // initialize vol0,t0 to zero to signal uninitialized
+  // values then assigned in init(), if necessary
+
+  vol0 = t0 = 0.0;
+}
+  
+/* ---------------------------------------------------------------------- */
+  
+// Destructor: releases the rigid-fix index list, the temperature/pressure
+// computes (only when tflag/pflag are set) together with their ID strings,
+// and the chain arrays that the constructor allocated.
+FixNHCuda::~FixNHCuda()
+{
+  delete [] rfix;
+
+  // temperature compute first, then (for barostatted runs) the pressure one
+
+  if (tflag) modify->delete_compute(id_temp);
+  delete [] id_temp;
+
+  if (pstat_flag) {
+    if (pflag) modify->delete_compute(id_press);
+    delete [] id_press;
+  }
+
+  // thermostat chain exists only when a thermostat was requested
+
+  if (tstat_flag) {
+    delete [] eta;
+    delete [] eta_dot;
+    delete [] eta_dotdot;
+    delete [] eta_mass;
+  }
+
+  // barostat chain exists only for a barostat with a non-zero chain length
+
+  if (pstat_flag && mpchain) {
+    delete [] etap;
+    delete [] etap_dot;
+    delete [] etap_dotdot;
+    delete [] etap_mass;
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Returns the hook mask of this fix: INITIAL_INTEGRATE_CUDA,
+// FINAL_INTEGRATE_CUDA and THERMO_ENERGY_CUDA. The rRESPA hooks are not set.
+int FixNHCuda::setmask()
+{
+  const int mask = 0
+    | INITIAL_INTEGRATE_CUDA
+    | FINAL_INTEGRATE_CUDA
+    | THERMO_ENERGY_CUDA;
+  return mask;
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Per-run initialization: checks for a conflict with fix deform, resolves the
+// temperature/pressure compute IDs, caches timestep-derived constants and
+// drag factors, records the reference volume and cell on first use, collects
+// the indices of rigid fixes and calls Cuda_FixNHCuda_Init.
+void FixNHCuda::init()
+{
+  // a barostatted component must not also be controlled by a fix deform
+
+  if (pstat_flag) {
+    for (int ifix = 0; ifix < modify->nfix; ifix++) {
+      if (strcmp(modify->fix[ifix]->style,"deform") != 0) continue;
+      int *dimflag = ((FixDeform *) modify->fix[ifix])->dimflag;
+      for (int k = 0; k < 6; k++) {
+        if (p_flag[k] && dimflag[k]) {
+          error->all("Cannot use fix npt and fix deform on "
+                     "same component of stress tensor");
+          break;
+        }
+      }
+    }
+  }
+
+  // look up the computes named by id_temp and id_press
+
+  const int itemp = modify->find_compute(id_temp);
+  if (itemp < 0)
+    error->all("Temperature ID for fix nvt/nph/npt does not exist");
+  temperature = modify->compute[itemp];
+
+  which = temperature->tempbias ? BIAS : NOBIAS;
+
+  if (pstat_flag) {
+    const int ipress = modify->find_compute(id_press);
+    if (ipress < 0) error->all("Pressure ID for fix npt/nph does not exist");
+    pressure = modify->compute[ipress];
+  }
+
+  // timestep fractions used by the integrator
+
+  dtv = update->dt;
+  dtf = 0.5 * update->dt * force->ftm2v;
+  dthalf = 0.5 * update->dt;
+  dt4 = 0.25 * update->dt;
+  dt8 = 0.125 * update->dt;
+  dto = dthalf;
+
+  // largest barostat frequency; the off-diagonal entries count only
+  // for the triclinic style
+
+  p_freq_max = 0.0;
+  if (pstat_flag) {
+    const int nfreq = (pstyle == TRICLINIC) ? 6 : 3;
+    p_freq_max = p_freq[0];
+    for (int k = 1; k < nfreq; k++)
+      p_freq_max = (p_freq_max > p_freq[k]) ? p_freq_max : p_freq[k];
+    pdrag_factor = 1.0 - (update->dt * p_freq_max * drag / nc_pchain);
+  }
+
+  if (tstat_flag)
+    tdrag_factor = 1.0 - (update->dt * t_freq * drag / nc_tchain);
+
+  // number of barostatted dimensions; the reference volume and inverse
+  // cell are stored only while vol0 still has its initial value 0.0
+
+  if (pstat_flag) {
+    pdim = p_flag[0] + p_flag[1] + p_flag[2];
+    if (vol0 == 0.0) {
+      if (dimension == 3) vol0 = domain->xprd * domain->yprd * domain->zprd;
+      else vol0 = domain->xprd * domain->yprd;
+      for (int k = 0; k < 6; k++) h0_inv[k] = domain->h_inv[k];
+    }
+  }
+
+  boltz = force->boltz;
+  nktv2p = force->nktv2p;
+
+  kspace_flag = force->kspace ? 1 : 0;
+
+  if (strcmp(update->integrate_style,"respa") == 0) {
+    Respa *respa = (Respa *) update->integrate;
+    nlevels_respa = respa->nlevels;
+    step_respa = respa->step;
+    dto = 0.5*step_respa[0];
+  }
+
+  // rebuild rfix[], the list of indices of fixes with rigid_flag set,
+  // so rigid bodies move when the box is remapped
+
+  delete [] rfix;
+  rfix = NULL;
+
+  int nfound = 0;
+  for (int ifix = 0; ifix < modify->nfix; ifix++)
+    if (modify->fix[ifix]->rigid_flag) nfound++;
+
+  nrigid = 0;
+  if (nfound) {
+    rfix = new int[nfound];
+    for (int ifix = 0; ifix < modify->nfix; ifix++)
+      if (modify->fix[ifix]->rigid_flag) rfix[nrigid++] = ifix;
+  }
+
+  // remember the current trigger value so initial_integrate() can detect
+  // a change, then hand the timestep constants to the CUDA side
+
+  triggerneighsq= cuda->shared_data.atom.triggerneighsq;
+  cuda->neighbor_decide_by_integrator=1;
+  Cuda_FixNHCuda_Init(&cuda->shared_data,dtv,dtf);
+}
+
+/* ----------------------------------------------------------------------
+   compute T,P before integrator starts 
+------------------------------------------------------------------------- */
+
+// Setup before the run: picks the target temperature, evaluates the current
+// temperature (and pressure when barostatting) and initializes the masses
+// and initial accelerations of the thermostat and barostat chains.
+void FixNHCuda::setup(int vflag)
+{
+  tdof = temperature->dof;
+
+  // t_target is used by compute_scalar(), even for NPH
+
+  if (tstat_flag) {
+    t_target = t_start;
+  } else if (pstat_flag) {
+
+    // t0 == 0.0 means "not yet set" (it may also come from a restart file);
+    // take the current temperature, or a unit-dependent guess when that
+    // temperature is itself 0.0
+
+    if (t0 == 0.0) {
+      t0 = temperature->compute_scalar();
+      if (t0 == 0.0)
+        t0 = (strcmp(update->unit_style,"lj") == 0) ? 1.0 : 300.0;
+    }
+    t_target = t0;
+  }
+
+  if (pstat_flag) compute_press_target();
+
+  t_current = temperature->compute_scalar();
+  if (pstat_flag) {
+    if (pstyle == ISO) pressure->compute_scalar();
+    else pressure->compute_vector();
+    couple();
+    pressure->addstep(update->ntimestep+1);
+  }
+
+  // thermostat chain: masses and initial accelerations
+
+  if (tstat_flag) {
+    const double kt_target = boltz*t_target;
+    eta_mass[0] = tdof * boltz * t_target / (t_freq*t_freq);
+    for (int ich = 1; ich < mtchain; ich++) {
+      eta_mass[ich] = boltz * t_target / (t_freq*t_freq);
+      eta_dotdot[ich] = (eta_mass[ich-1]*eta_dot[ich-1]*eta_dot[ich-1] -
+                         kt_target) / eta_mass[ich];
+    }
+  }
+
+  if (pstat_flag) {
+    const double kt = boltz * t_target;
+    const double nkt = atom->natoms * kt;
+
+    // barostat masses; off-diagonal entries only for the triclinic style
+
+    const int ncomp = (pstyle == TRICLINIC) ? 6 : 3;
+    for (int i = 0; i < ncomp; i++)
+      if (p_flag[i]) omega_mass[i] = nkt/(p_freq[i]*p_freq[i]);
+
+    // barostat chain: every link gets the same mass
+
+    if (mpchain) {
+      for (int ich = 0; ich < mpchain; ich++)
+        etap_mass[ich] = boltz * t_target / (p_freq_max*p_freq_max);
+      for (int ich = 1; ich < mpchain; ich++)
+        etap_dotdot[ich] =
+          (etap_mass[ich-1]*etap_dot[ich-1]*etap_dot[ich-1] - kt) /
+          etap_mass[ich];
+    }
+  }
+}
+
+/* ----------------------------------------------------------------------
+   1st half of Verlet update 
+------------------------------------------------------------------------- */
+
+void FixNHCuda::initial_integrate(int vflag)
+{
+  if(!temperature->cudable) cuda->downloadAll();  // temperature compute is not cudable: download first
+
+  if(triggerneighsq!=cuda->shared_data.atom.triggerneighsq)  // cached value is stale: redo device init
+  {
+    triggerneighsq= cuda->shared_data.atom.triggerneighsq;
+    Cuda_FixNHCuda_Init(&cuda->shared_data,dtv,dtf);
+  }
+
+  // update eta_press_dot
+
+  if (pstat_flag && mpchain) nhc_press_integrate();
+
+  // update eta_dot; t_target ramps linearly from t_start to t_stop over the run
+
+  if (tstat_flag) {
+    double delta = update->ntimestep - update->beginstep;
+    if (delta != 0.0) delta /= update->endstep - update->beginstep;  // skip divide at delta 0 (no 0/0)
+    t_target = t_start + delta * (t_stop-t_start);
+    eta_mass[0] = tdof * boltz * t_target / (t_freq*t_freq);
+    for (int ich = 1; ich < mtchain; ich++)
+      eta_mass[ich] = boltz * t_target / (t_freq*t_freq);
+    nhc_temp_integrate();
+  }
+
+  // need to recompute pressure to account for change in KE
+  // t_current is up-to-date, but compute_temperature is not
+  // compute appropriately coupled elements of mvv_current
+
+  if (pstat_flag) {
+    if (pstyle == ISO) {
+      temperature->compute_scalar();
+      pressure->compute_scalar();  // value is picked up from pressure->scalar in couple()
+    } else {
+      temperature->compute_vector();
+      pressure->compute_vector();
+    }
+    couple();
+    pressure->addstep(update->ntimestep+1);
+    //if (mtk_flag) couple_ke();
+  }
+
+  if(which==NOBIAS)
+  {
+    if (pstat_flag) {
+      compute_press_target();
+      nh_omega_dot();
+      factor[0] = exp(-dt4*(omega_dot[0]+mtk_term2));
+      factor[1] = exp(-dt4*(omega_dot[1]+mtk_term2));
+      factor[2] = exp(-dt4*(omega_dot[2]+mtk_term2));
+      Cuda_FixNHCuda_nh_v_press_and_nve_v_NoBias(&cuda->shared_data, groupbit, factor,(igroup == atom->firstgroup)?atom->nfirst:atom->nlocal,(pstyle == TRICLINIC)?1:0);
+    }
+    else
+      Cuda_FixNHCuda_nve_v(&cuda->shared_data,groupbit,(igroup == atom->firstgroup)?atom->nfirst:atom->nlocal);
+  }
+  else if(which==BIAS)
+  {
+    if(pstat_flag)
+    {
+      compute_press_target();
+      nh_omega_dot();
+      factor[0] = exp(-dt4*(omega_dot[0]+mtk_term2));
+      factor[1] = exp(-dt4*(omega_dot[1]+mtk_term2));
+      factor[2] = exp(-dt4*(omega_dot[2]+mtk_term2));
+      if(!temperature->cudable)  // host path: scale v with nh_v_press(), then upload v
+      {
+        nh_v_press();
+        cuda->cu_v->upload();
+      }
+      else
+      {
+        int groupbit_org=temperature->groupbit;
+        temperature->groupbit=groupbit;  // use this fix's group mask while the bias is removed
+        temperature->remove_bias_all();
+        Cuda_FixNHCuda_nh_v_press(&cuda->shared_data, groupbit, factor,(igroup == atom->firstgroup)?atom->nfirst:atom->nlocal,(pstyle == TRICLINIC)?1:0);
+        temperature->restore_bias_all();
+        temperature->groupbit=groupbit_org;
+      }
+    }
+    Cuda_FixNHCuda_nve_v(&cuda->shared_data,groupbit,(igroup == atom->firstgroup)?atom->nfirst:atom->nlocal);
+  }
+
+  // remap simulation box by 1/2 step
+
+  if (pstat_flag) remap();
+
+  Cuda_FixNHCuda_nve_x(&cuda->shared_data,groupbit,(igroup == atom->firstgroup)?atom->nfirst:atom->nlocal);
+
+  // remap simulation box by 1/2 step
+  // redo KSpace coeffs since volume has changed
+
+  if (pstat_flag) {
+    remap();
+    if (kspace_flag) force->kspace->setup();
+  }
+}
+
+/* ----------------------------------------------------------------------
+   2nd half of Verlet update 
+------------------------------------------------------------------------- */
+
+void FixNHCuda::final_integrate()
+{
+  if(!temperature->cudable) cuda->downloadAll();  // temperature compute is not cudable: download first
+
+  if(which==NOBIAS)
+  {
+    if(pstat_flag)
+    {
+      factor[0] = exp(-dt4*(omega_dot[0]+mtk_term2));
+      factor[1] = exp(-dt4*(omega_dot[1]+mtk_term2));
+      factor[2] = exp(-dt4*(omega_dot[2]+mtk_term2));
+
+      Cuda_FixNHCuda_nve_v_and_nh_v_press_NoBias(&cuda->shared_data, groupbit, factor,(igroup == atom->firstgroup)?atom->nfirst:atom->nlocal,(pstyle == TRICLINIC)?1:0);
+    }
+    else
+      Cuda_FixNHCuda_nve_v(&cuda->shared_data,groupbit,(igroup == atom->firstgroup)?atom->nfirst:atom->nlocal);
+  }
+  else if(which==BIAS)
+  {
+    Cuda_FixNHCuda_nve_v(&cuda->shared_data,groupbit,(igroup == atom->firstgroup)?atom->nfirst:atom->nlocal);
+
+    if(pstat_flag)
+    {
+      factor[0] = exp(-dt4*(omega_dot[0]+mtk_term2));
+      factor[1] = exp(-dt4*(omega_dot[1]+mtk_term2));
+      factor[2] = exp(-dt4*(omega_dot[2]+mtk_term2));
+      if(!temperature->cudable)  // host path: download v, scale with nh_v_press(), upload v
+      {
+        cuda->cu_v->download();
+        nh_v_press();
+        cuda->cu_v->upload();
+      }
+      else
+      {
+        int groupbit_org=temperature->groupbit;
+        temperature->groupbit=groupbit;  // use this fix's group mask while the bias is removed
+        temperature->remove_bias_all();
+        Cuda_FixNHCuda_nh_v_press(&cuda->shared_data, groupbit, factor,(igroup == atom->firstgroup)?atom->nfirst:atom->nlocal,(pstyle == TRICLINIC)?1:0);
+        temperature->restore_bias_all();
+        temperature->groupbit=groupbit_org;
+      }
+    }
+  }
+  // compute new T,P
+  // compute appropriately coupled elements of mvv_current
+
+  if(!temperature->cudable) cuda->cu_v->download();
+  t_current = temperature->compute_scalar();
+  if (pstat_flag) {
+    if (pstyle == ISO) pressure->compute_scalar();  // value is picked up from pressure->scalar in couple()
+    else pressure->compute_vector();
+    couple();
+    pressure->addstep(update->ntimestep+1);
+  }
+
+  if (pstat_flag) nh_omega_dot();
+
+  // update eta_dot
+  // update eta_press_dot
+
+  if (tstat_flag) nhc_temp_integrate();
+  if (pstat_flag && mpchain) nhc_press_integrate();
+}
+
+/* ---------------------------------------------------------------------- */
+
+void FixNHCuda::initial_integrate_respa(int vflag, int ilevel, int iloop)
+{
+  // v and x updates here go through the host-side nve_v()/nh_v_press()/nve_x() loops
+
+  // set timesteps by level
+
+  dtv = step_respa[ilevel];
+  dtf = 0.5 * step_respa[ilevel] * force->ftm2v;
+  dthalf = 0.5 * step_respa[ilevel];
+
+  // outermost level - update eta_dot and omega_dot, apply to v, remap box
+  // all other levels - NVE update of v
+  // x,v updates only performed for atoms in group
+
+  if (ilevel == nlevels_respa-1) {
+
+    // update eta_press_dot
+
+    if (pstat_flag && mpchain) nhc_press_integrate();
+
+    // update eta_dot; t_target ramps linearly from t_start to t_stop over the run
+
+    if (tstat_flag) {
+      double delta = update->ntimestep - update->beginstep;
+      if (delta != 0.0) delta /= update->endstep - update->beginstep;  // skip divide at delta 0 (no 0/0)
+      t_target = t_start + delta * (t_stop-t_start);
+      eta_mass[0] = tdof * boltz * t_target / (t_freq*t_freq);
+      for (int ich = 1; ich < mtchain; ich++)
+        eta_mass[ich] = boltz * t_target / (t_freq*t_freq);
+      nhc_temp_integrate();
+    }
+
+    // recompute pressure to account for change in KE
+    // t_current is up-to-date, but compute_temperature is not
+    // compute appropriately coupled elements of mvv_current
+
+    if (pstat_flag) {
+      if (pstyle == ISO) {
+        temperature->compute_scalar();
+        pressure->compute_scalar();  // value is picked up from pressure->scalar in couple()
+      } else {
+        temperature->compute_vector();
+        pressure->compute_vector();
+      }
+      couple();
+      pressure->addstep(update->ntimestep+1);
+      if (mtk_flag) couple_ke();
+    }
+
+    if (pstat_flag) {
+      compute_press_target();
+      nh_omega_dot();
+      nh_v_press();
+    }
+
+    nve_v();
+
+  } else nve_v();
+
+  // innermost level - also update x only for atoms in group
+  // if barostat, perform 1/2 step remap before and after
+
+  if (ilevel == 0) {
+    if (pstat_flag) remap();
+    nve_x();
+    if (pstat_flag) remap();
+  }
+
+  // if barostat, redo KSpace coeffs at outermost level, 
+  // since volume has changed
+
+  if (ilevel == nlevels_respa-1 && kspace_flag && pstat_flag) 
+    force->kspace->setup();
+}
+
+/* ---------------------------------------------------------------------- */
+
+void FixNHCuda::final_integrate_respa(int ilevel, int iloop)
+{
+  // half-step sizes for this respa level
+
+  dthalf = 0.5 * step_respa[ilevel];
+  dtf = 0.5 * step_respa[ilevel] * force->ftm2v;
+
+  // inner levels - plain NVE update of v
+  // outermost level - full final_integrate(), incl. eta_dot and omega_dot updates
+
+  if (ilevel != nlevels_respa-1) nve_v();
+  else final_integrate();
+}
+
+/* ---------------------------------------------------------------------- */
+
+void FixNHCuda::couple()
+{
+  const double *pvec = pressure->vector;
+
+  if (pstyle == ISO) {
+    p_current[0] = p_current[1] = p_current[2] = pressure->scalar;
+  } else if (pcouple == XYZ) {
+    const double mean = 1.0/3.0 * (pvec[0] + pvec[1] + pvec[2]);
+    p_current[0] = p_current[1] = p_current[2] = mean;
+  } else if (pcouple == XY) {
+    const double mean = 0.5 * (pvec[0] + pvec[1]);
+    p_current[2] = pvec[2];
+    p_current[0] = p_current[1] = mean;
+  } else if (pcouple == YZ) {
+    const double mean = 0.5 * (pvec[1] + pvec[2]);
+    p_current[0] = pvec[0];
+    p_current[1] = p_current[2] = mean;
+  } else if (pcouple == XZ) {
+    const double mean = 0.5 * (pvec[0] + pvec[2]);
+    p_current[1] = pvec[1];
+    p_current[0] = p_current[2] = mean;
+  } else {
+    // no coupling: each diagonal component is taken as-is
+    for (int d = 0; d < 3; d++)
+      p_current[d] = pvec[d];
+  }
+
+  // off-diagonals: the compute's xy-xz-yz order is reversed into Voigt order
+
+  if (pstyle == TRICLINIC) {
+    p_current[5] = pvec[3];
+    p_current[4] = pvec[4];
+    p_current[3] = pvec[5];
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+void FixNHCuda::couple_ke()
+{
+  const double *kevec = temperature->vector;
+  if (pstyle == ISO) {
+    mvv_current[0] = mvv_current[1] = mvv_current[2] = 
+      tdof * boltz * t_current/dimension;
+  } else if (pcouple == XYZ) {
+    const double mean = 1.0/3.0 * (kevec[0] + kevec[1] + kevec[2]);
+    mvv_current[0] = mvv_current[1] = mvv_current[2] = mean;
+  } else if (pcouple == XY) {
+    const double mean = 0.5 * (kevec[0] + kevec[1]);
+    mvv_current[2] = kevec[2];
+    mvv_current[0] = mvv_current[1] = mean;
+  } else if (pcouple == YZ) {
+    const double mean = 0.5 * (kevec[1] + kevec[2]);
+    mvv_current[0] = kevec[0];
+    mvv_current[1] = mvv_current[2] = mean;
+  } else if (pcouple == XZ) {
+    const double mean = 0.5 * (kevec[0] + kevec[2]);
+    mvv_current[1] = kevec[1];
+    mvv_current[0] = mvv_current[2] = mean;
+  } else {
+    // no coupling: each diagonal component is taken as-is
+    for (int d = 0; d < 3; d++)
+      mvv_current[d] = kevec[d];
+  }
+}
+
+/* ----------------------------------------------------------------------
+   change box size
+   remap all atoms or fix group atoms depending on allremap flag
+   if rigid bodies exist, scale rigid body centers-of-mass
+------------------------------------------------------------------------- */
+
+void FixNHCuda::remap()
+{
+  int i;
+  double oldlo,oldhi,ctr;
+
+  double **x = atom->x;
+  int *mask = atom->mask;
+  int nlocal = atom->nlocal;
+  double *h = domain->h;  // tilt entries as copied out below: h[3]=yz, h[4]=xz, h[5]=xy
+
+  // omega is not used, except for book-keeping
+
+  for (i = 0; i < 6; i++) omega[i] += dto*omega_dot[i];
+
+  // convert pertinent atoms and rigid bodies to lamda coords
+  if (allremap) domain->x2lamda(nlocal);
+  else {
+    for (i = 0; i < nlocal; i++)
+      if (mask[i] & groupbit)
+        domain->x2lamda(x[i],x[i]);
+  }
+
+  if (nrigid)
+    for (i = 0; i < nrigid; i++)
+      modify->fix[rfix[i]]->deform(0);
+
+  // reset global and local box to new size/shape
+
+  // This operation corresponds to applying the
+  // translate and scale operations 
+  // corresponding to the solution of the following ODE:
+  //
+  // h_dot = omega_dot * h
+  //
+  // where h_dot, omega_dot and h are all upper-triangular
+  // 3x3 tensors. In Voigt notation, the elements of the 
+  // RHS product tensor are:
+  // h_dot = [0*0, 1*1, 2*2, 1*3+3*2, 0*4+5*3+4*2, 0*5+5*1]
+  // 
+  // Ordering of operations preserves time symmetry.
+
+  double dto2 = dto/2.0;
+  double dto4 = dto/4.0;
+  double dto8 = dto/8.0;
+
+  if (pstyle == TRICLINIC) {
+
+    h[4] *= exp(dto8*omega_dot[0]);
+    h[4] += dto4*(omega_dot[5]*h[3]+omega_dot[4]*h[2]);
+    h[4] *= exp(dto8*omega_dot[0]);
+
+    h[3] *= exp(dto4*omega_dot[1]);
+    h[3] += dto2*(omega_dot[3]*h[2]);
+    h[3] *= exp(dto4*omega_dot[1]);
+
+    h[5] *= exp(dto4*omega_dot[0]);
+    h[5] += dto2*(omega_dot[5]*h[1]);
+    h[5] *= exp(dto4*omega_dot[0]);
+
+    h[4] *= exp(dto8*omega_dot[0]);
+    h[4] += dto4*(omega_dot[5]*h[3]+omega_dot[4]*h[2]);
+    h[4] *= exp(dto8*omega_dot[0]);
+
+  }
+
+  for (i = 0; i < 3; i++) {  // scale each flagged box dimension about its own center
+    if (p_flag[i]) {
+      oldlo = domain->boxlo[i];
+      oldhi = domain->boxhi[i];
+      ctr = 0.5 * (oldlo + oldhi);
+      domain->boxlo[i] = (oldlo-ctr)*exp(dto*omega_dot[i]) + ctr;
+      domain->boxhi[i] = (oldhi-ctr)*exp(dto*omega_dot[i]) + ctr;
+    }
+  }
+
+  if (pstyle == TRICLINIC) {
+
+    h[4] *= exp(dto8*omega_dot[0]);
+    h[4] += dto4*(omega_dot[5]*h[3]+omega_dot[4]*h[2]);
+    h[4] *= exp(dto8*omega_dot[0]);
+
+    h[3] *= exp(dto4*omega_dot[1]);
+    h[3] += dto2*(omega_dot[3]*h[2]);
+    h[3] *= exp(dto4*omega_dot[1]);
+
+    h[5] *= exp(dto4*omega_dot[0]);
+    h[5] += dto2*(omega_dot[5]*h[1]);
+    h[5] *= exp(dto4*omega_dot[0]);
+
+    h[4] *= exp(dto8*omega_dot[0]);
+    h[4] += dto4*(omega_dot[5]*h[3]+omega_dot[4]*h[2]);
+    h[4] *= exp(dto8*omega_dot[0]);
+
+    domain->yz = h[3];
+    domain->xz = h[4];
+    domain->xy = h[5];
+
+    if (domain->yz < -0.5*domain->yprd || domain->yz > 0.5*domain->yprd ||
+        domain->xz < -0.5*domain->xprd || domain->xz > 0.5*domain->xprd ||
+        domain->xy < -0.5*domain->xprd || domain->xy > 0.5*domain->xprd)
+      error->all("Fix npt/nph has tilted box too far - "
+                 "box flips are not yet implemented");
+  }
+
+  domain->set_global_box();
+  domain->set_local_box();
+
+  // convert pertinent atoms and rigid bodies back to box coords
+
+  if (allremap) domain->lamda2x(nlocal);
+  else {
+    for (i = 0; i < nlocal; i++)
+      if (mask[i] & groupbit)
+        domain->lamda2x(x[i],x[i]);
+  }
+
+  if (nrigid)
+    for (i = 0; i < nrigid; i++)
+      modify->fix[rfix[i]]->deform(1);
+}
+
+/* ----------------------------------------------------------------------
+   pack entire state of Fix into one write 
+------------------------------------------------------------------------- */
+
+void FixNHCuda::write_restart(FILE *fp)
+{
+  // layout of the packed record (every value is stored as a double):
+  //   tstat_flag
+  //   mtchain, eta[mtchain], eta_dot[mtchain]        (only if tstat_flag)
+  //   pstat_flag
+  //   omega[6], omega_dot[6], vol0, t0, mpchain,
+  //   etap[mpchain], etap_dot[mpchain],
+  //   deviatoric_flag                                (only if pstat_flag)
+  //   h0_inv[6]                 (only if pstat_flag and deviatoric_flag)
+
+  // size in doubles: the two flags plus whichever sections are enabled
+  int ndoubles = 2;
+  if (tstat_flag) ndoubles += 1 + 2*mtchain;
+  if (pstat_flag) ndoubles += 16 + 2*mpchain;
+  if (pstat_flag && deviatoric_flag) ndoubles += 6;
+
+  double *rec = (double *) memory->smalloc(ndoubles*sizeof(double),"nh:list");
+  int pos = 0;
+
+  // thermostat section
+
+  rec[pos++] = tstat_flag;
+  if (tstat_flag) {
+    rec[pos++] = mtchain;
+    for (int k = 0; k < mtchain; k++)
+      rec[pos++] = eta[k];
+    for (int k = 0; k < mtchain; k++)
+      rec[pos++] = eta_dot[k];
+  }
+
+  // barostat section
+
+  rec[pos++] = pstat_flag;
+  if (pstat_flag) {
+    for (int k = 0; k < 6; k++)
+      rec[pos++] = omega[k];
+    for (int k = 0; k < 6; k++)
+      rec[pos++] = omega_dot[k];
+    rec[pos++] = vol0;
+    rec[pos++] = t0;
+
+    rec[pos++] = mpchain;
+    for (int k = 0; k < mpchain; k++)
+      rec[pos++] = etap[k];
+    for (int k = 0; k < mpchain; k++)
+      rec[pos++] = etap_dot[k];
+
+    rec[pos++] = deviatoric_flag;
+    if (deviatoric_flag)
+      for (int k = 0; k < 6; k++)
+        rec[pos++] = h0_inv[k];
+  }
+
+  // only the process with comm->me == 0 writes:
+  // the byte count first, then the packed doubles
+
+  if (comm->me == 0) {
+    int nbytes = ndoubles * sizeof(double);
+    fwrite(&nbytes,sizeof(int),1,fp);
+    fwrite(rec,sizeof(double),ndoubles,fp);
+  }
+
+  memory->sfree(rec);
+}
+
+/* ----------------------------------------------------------------------
+   use state info from restart file to restart the Fix 
+------------------------------------------------------------------------- */
+
+void FixNHCuda::restart(char *buf)
+{
+  double *rec = (double *) buf;
+  int pos = 0;
+
+  // thermostat section: the chain state is adopted only when this fix
+  // thermostats with the same chain length; otherwise it is skipped over
+
+  const int saved_tstat = static_cast<int> (rec[pos++]);
+  if (saved_tstat) {
+    const int nsaved = static_cast<int> (rec[pos++]);
+    if (tstat_flag && nsaved == mtchain) {
+      for (int k = 0; k < mtchain; k++)
+        eta[k] = rec[pos++];
+      for (int k = 0; k < mtchain; k++)
+        eta_dot[k] = rec[pos++];
+    } else pos += 2*nsaved;
+  }
+
+  // barostat section: nothing further is read when the saved flag is unset
+
+  const int saved_pstat = static_cast<int> (rec[pos++]);
+  if (!saved_pstat) return;
+
+  for (int k = 0; k < 6; k++)
+    omega[k] = rec[pos++];
+  for (int k = 0; k < 6; k++)
+    omega_dot[k] = rec[pos++];
+  vol0 = rec[pos++];
+  t0 = rec[pos++];
+
+  // barostat chain: same adopt-or-skip rule as the thermostat chain
+
+  const int nsaved = static_cast<int> (rec[pos++]);
+  if (pstat_flag && nsaved == mpchain) {
+    for (int k = 0; k < mpchain; k++)
+      etap[k] = rec[pos++];
+    for (int k = 0; k < mpchain; k++)
+      etap_dot[k] = rec[pos++];
+  } else pos += 2*nsaved;
+
+  // h0_inv[6] follows only when the saved deviatoric flag is set
+
+  const int saved_deviatoric = static_cast<int> (rec[pos++]);
+  if (saved_deviatoric)
+    for (int k = 0; k < 6; k++)
+      h0_inv[k] = rec[pos++];
+}
+
+/* ---------------------------------------------------------------------- */
+
+int FixNHCuda::modify_param(int narg, char **arg)
+{
+  // "temp": switch to the temperature compute named by arg[1]
+  if (strcmp(arg[0],"temp") == 0) {
+    if (narg < 2) error->all("Illegal fix_modify command");
+    if (tflag) {
+      modify->delete_compute(id_temp);
+      tflag = 0;
+    }
+    delete [] id_temp;
+    id_temp = new char[strlen(arg[1]) + 1];
+    strcpy(id_temp,arg[1]);
+
+    const int itemp = modify->find_compute(arg[1]);
+    if (itemp < 0) error->all("Could not find fix_modify temperature ID");
+    temperature = modify->compute[itemp];
+
+    if (temperature->tempflag == 0)
+      error->all("Fix_modify temperature ID does not compute temperature");
+    if (comm->me == 0 && temperature->igroup != 0)
+      error->warning("Temperature for fix modify is not for group all");
+
+    // the pressure compute must be pointed at the new temperature ID
+
+    if (pstat_flag) {
+      const int ipress = modify->find_compute(id_press);
+      if (ipress < 0)
+        error->all("Pressure ID for fix modify does not exist");
+      modify->compute[ipress]->reset_extra_compute_fix(id_temp);
+    }
+    return 2;
+  }
+
+  // "press": switch to the pressure compute named by arg[1] (needs pstat_flag)
+  if (strcmp(arg[0],"press") == 0) {
+    if (narg < 2) error->all("Illegal fix_modify command");
+    if (!pstat_flag) error->all("Illegal fix_modify command");
+    if (pflag) {
+      modify->delete_compute(id_press);
+      pflag = 0;
+    }
+    delete [] id_press;
+    id_press = new char[strlen(arg[1]) + 1];
+    strcpy(id_press,arg[1]);
+
+    const int ipress = modify->find_compute(arg[1]);
+    if (ipress < 0) error->all("Could not find fix_modify pressure ID");
+    pressure = modify->compute[ipress];
+
+    if (pressure->pressflag == 0)
+      error->all("Fix_modify pressure ID does not compute pressure");
+    return 2;
+  }
+
+  return 0;
+}
+
+/* ---------------------------------------------------------------------- */
+
+double FixNHCuda::compute_scalar()
+{
+  // prefactors shared by the chain terms below
+
+  const double kt = boltz * t_target;
+  const double lkt = tdof * kt;
+
+  // box volume (xprd*yprd only when dimension is not 3)
+  const double volume = (dimension == 3) ?
+    domain->xprd * domain->yprd * domain->zprd :
+    domain->xprd * domain->yprd;
+
+  double etot = 0.0;
+
+  // thermostat chain energy, equivalent to Eq. (2) in
+  // Martyna, Tuckerman, Tobias, Klein, Mol Phys, 87, 1117:
+  // Sum(0.5*p_eta_k^2/Q_k,k=1,M) + L*k*T*eta_1 + Sum(k*T*eta_k,k=2,M)
+  // with  L = tdof
+  //       M = mtchain
+  //       p_eta_k = Q_k*eta_dot[k-1]
+  //       Q_1 = L*k*T/t_freq^2
+  //       Q_k = k*T/t_freq^2, k > 1
+
+  if (tstat_flag) {
+    etot += lkt * eta[0] + 0.5*eta_mass[0]*eta_dot[0]*eta_dot[0];
+    for (int k = 1; k < mtchain; k++)
+      etot += kt * eta[k] + 0.5*eta_mass[k]*eta_dot[k]*eta_dot[k];
+  }
+
+  // barostat energy, equivalent to Eq. (8) in
+  // Martyna, Tuckerman, Tobias, Klein, Mol Phys, 87, 1117:
+  // Sum(0.5*p_omega^2/W + P*V)
+  // with  N = natoms
+  //       p_omega = W*omega_dot
+  //       W = N*k*T/p_freq^2
+  //       the sum running over the barostatted dimensions
+
+  if (pstat_flag) {
+    for (int d = 0; d < 3; d++)
+      if (p_flag[d])
+        etot += 0.5*omega_dot[d]*omega_dot[d]*omega_mass[d] +
+          p_hydro*(volume-vol0) / (pdim*nktv2p);
+
+    if (pstyle == TRICLINIC) {
+      for (int d = 3; d < 6; d++)
+        if (p_flag[d])
+          etot += 0.5*omega_dot[d]*omega_dot[d]*omega_mass[d];
+    }
+
+    // the barostat's own thermostat chain adds terms of the same form
+
+    if (mpchain) {
+      etot += kt * etap[0] + 0.5*etap_mass[0]*etap_dot[0]*etap_dot[0];
+      for (int k = 1; k < mpchain; k++)
+        etot += kt * etap[k] + 
+          0.5*etap_mass[k]*etap_dot[k]*etap_dot[k];
+    }
+
+    // strain energy, only with deviatoric_flag
+
+    if (deviatoric_flag) etot += compute_strain_energy();
+  }
+
+  return etot;
+}
+
+/* ----------------------------------------------------------------------
+   return a single element of the following vectors, in this order:
+      eta[tchain], eta_dot[tchain], omega[ndof], omega_dot[ndof]
+      etap[pchain], etap_dot[pchain], PE_eta[tchain], KE_eta_dot[tchain]
+      PE_omega[ndof], KE_omega_dot[ndof], PE_etap[pchain], KE_etap_dot[pchain]
+      PE_strain[1]
+  if no thermostat exists, related quantities are omitted from the list
+  if no barostat exists, related quantities are omitted from the list
+  ndof = 1,3,6 degrees of freedom for pstyle = ISO,ANISO,TRI
+------------------------------------------------------------------------- */
+
+double FixNHCuda::compute_vector(int n)
+{
+  int ilen;  // length of the sub-vector currently being tested; n is rebased past each one
+
+  if (tstat_flag) {
+    ilen = mtchain;
+    if (n < ilen) return eta[n];
+    n -= ilen;
+    ilen = mtchain;
+    if (n < ilen) return eta_dot[n];
+    n -= ilen;
+  }
+
+  if (pstat_flag) {
+    if (pstyle == ISO) {
+      ilen = 1;
+      if (n < ilen) return omega[n];
+      n -= ilen;
+    } else if (pstyle == ANISO) {
+      ilen = 3;
+      if (n < ilen) return omega[n];
+      n -= ilen;
+    } else {
+      ilen = 6;
+      if (n < ilen) return omega[n];
+      n -= ilen;
+    }
+
+    if (pstyle == ISO) {
+      ilen = 1;
+      if (n < ilen) return omega_dot[n];
+      n -= ilen;
+    } else if (pstyle == ANISO) {
+      ilen = 3;
+      if (n < ilen) return omega_dot[n];
+      n -= ilen;
+    } else {
+      ilen = 6;
+      if (n < ilen) return omega_dot[n];
+      n -= ilen;
+    }
+
+    if (mpchain) {
+      ilen = mpchain;
+      if (n < ilen) return etap[n];
+      n -= ilen;
+      ilen = mpchain;
+      if (n < ilen) return etap_dot[n];
+      n -= ilen;
+    }
+  }
+
+  // from here on the entries are energies (PE_* / KE_* in the list above the function)
+  double volume;
+  double kt = boltz * t_target;
+  double lkt = tdof * kt;
+  double lkt_press = kt;
+  int ich;
+  if (dimension == 3) volume = domain->xprd * domain->yprd * domain->zprd;
+  else volume = domain->xprd * domain->yprd;
+
+  if (tstat_flag) {
+    ilen = mtchain;
+    if (n < ilen) {
+      ich = n;
+      if (ich == 0)
+        return lkt * eta[0];
+      else
+        return kt * eta[ich];
+    }
+    n -= ilen;
+    ilen = mtchain;
+    if (n < ilen) {
+      ich = n;
+      if (ich == 0)
+        return 0.5*eta_mass[0]*eta_dot[0]*eta_dot[0];
+      else
+        return 0.5*eta_mass[ich]*eta_dot[ich]*eta_dot[ich];
+    }
+    n -= ilen;
+  }
+
+  if (pstat_flag) {
+    if (pstyle == ISO) {
+      ilen = 1;
+      if (n < ilen)
+        return p_hydro*(volume-vol0) / nktv2p;
+      n -= ilen;
+    } else if (pstyle == ANISO) {
+      ilen = 3;
+      if (n < ilen) {
+        if (p_flag[n])
+          return p_hydro*(volume-vol0) / (pdim*nktv2p);
+        return 0.0;
+      }
+      n -= ilen;
+    } else {
+      ilen = 6;
+      if (n < ilen) {
+        if (n > 2) return 0.0;
+        if (p_flag[n])
+          return p_hydro*(volume-vol0) / (pdim*nktv2p);
+        return 0.0;
+      }
+      n -= ilen;
+    }
+
+    if (pstyle == ISO) {
+      ilen = 1;
+      if (n < ilen)
+        return pdim*0.5*omega_dot[n]*omega_dot[n]*omega_mass[n];
+      n -= ilen;
+    } else if (pstyle == ANISO) {
+      ilen = 3;
+      if (n < ilen) {
+        if (p_flag[n]) return 0.5*omega_dot[n]*omega_dot[n]*omega_mass[n];
+        return 0.0;
+      }
+      n -= ilen;
+    } else {
+      ilen = 6;
+      if (n < ilen) {
+        if (p_flag[n]) return 0.5*omega_dot[n]*omega_dot[n]*omega_mass[n];
+        return 0.0;
+      }
+      n -= ilen;
+    }
+
+    if (mpchain) {
+      ilen = mpchain;
+      if (n < ilen) {
+        ich = n;
+        if (ich == 0) return lkt_press * etap[0];
+        else return kt * etap[ich];
+      }
+      n -= ilen;
+      ilen = mpchain;
+      if (n < ilen) {
+        ich = n;
+        if (ich == 0)
+          return 0.5*etap_mass[0]*etap_dot[0]*etap_dot[0];
+        else
+          return 0.5*etap_mass[ich]*etap_dot[ich]*etap_dot[ich];
+      }
+      n -= ilen;
+    }
+
+    if (deviatoric_flag) {
+      ilen = 1;
+      if (n < ilen)
+        return compute_strain_energy();
+      n -= ilen;
+    }
+  }
+
+  return 0.0;  // n lies past every enabled sub-vector
+}
+
+/* ---------------------------------------------------------------------- */
+
+void FixNHCuda::reset_dt()
+{
+  // fractions of the timestep used by the integrator
+
+  dtv = update->dt;
+  dthalf = 0.5 * update->dt;
+  dt4 = 0.25 * update->dt;
+  dt8 = 0.125 * update->dt;
+  dtf = 0.5 * update->dt * force->ftm2v;
+
+  // remap step: dthalf by default, but with respa the remap is done
+  // in the innermost level, so half of that level's step is used
+
+  const bool respa = (strcmp(update->integrate_style,"respa") == 0);
+  dto = respa ? 0.5*step_respa[0] : dthalf;
+
+  // drag factors; the barostat one uses the largest barostat frequency
+
+  p_freq_max = 0.0;
+  if (pstat_flag) {
+    const int ndof = (pstyle == TRICLINIC) ? 6 : 3;
+    p_freq_max = MAX(p_freq[0],p_freq[1]);
+    for (int k = 2; k < ndof; k++)
+      p_freq_max = MAX(p_freq_max,p_freq[k]);
+    pdrag_factor = 1.0 - (update->dt * p_freq_max * drag / nc_pchain);
+  }
+
+  if (tstat_flag) tdrag_factor = 1.0 - (update->dt * t_freq * drag / nc_tchain);
+}
+
+/* ----------------------------------------------------------------------
+   perform half-step update of chain thermostat variables
+------------------------------------------------------------------------- */
+
+void FixNHCuda::nhc_temp_integrate()
+{
+  int link;
+  double damp;
+
+  const double ke_target = tdof * boltz * t_target;
+  double ke_now = tdof * boltz * t_current;
+  eta_dotdot[0] = (ke_now - ke_target)/eta_mass[0];
+
+  const double inv_nc = 1.0/nc_tchain;
+  for (int pass = 0; pass < nc_tchain; pass++) {
+
+    for (link = mtchain-1; link > 0; link--) {
+      damp = exp(-inv_nc*dt8*eta_dot[link+1]);
+      eta_dot[link] *= damp;
+      eta_dot[link] += eta_dotdot[link] * inv_nc*dt4;
+      eta_dot[link] *= tdrag_factor;
+      eta_dot[link] *= damp;
+    }
+
+    damp = exp(-inv_nc*dt8*eta_dot[1]);
+    eta_dot[0] *= damp;
+    eta_dot[0] += eta_dotdot[0] * inv_nc*dt4;
+    eta_dot[0] *= tdrag_factor;
+    eta_dot[0] *= damp;
+
+    factor_eta = exp(-inv_nc*dthalf*eta_dot[0]);  // velocity scale factor for this sub-step
+    if(which==NOBIAS)
+      Cuda_FixNHCuda_nh_v_temp(&cuda->shared_data,groupbit,factor_eta,(igroup == atom->firstgroup)?atom->nfirst:atom->nlocal);
+    else if(which==BIAS)
+    {
+      if(!temperature->cudable)  // host path: download, scale with nh_v_temp(), upload v
+      {
+        cuda->downloadAll();
+        nh_v_temp();
+        cuda->cu_v->upload();
+      }
+      else
+      {
+        int groupbit_org=temperature->groupbit;
+        temperature->groupbit=groupbit;  // use this fix's group mask while the bias is removed
+        temperature->remove_bias_all();
+        Cuda_FixNHCuda_nh_v_temp(&cuda->shared_data,groupbit,factor_eta,(igroup == atom->firstgroup)?atom->nfirst:atom->nlocal);
+        temperature->restore_bias_all();
+        temperature->groupbit=groupbit_org;
+      }
+
+    }
+    // velocities were scaled by factor_eta, so t_current scales by its square;
+    // the temperature is deliberately not recomputed here
+
+    t_current *= factor_eta*factor_eta;
+    ke_now = tdof * boltz * t_current;
+    eta_dotdot[0] = (ke_now - ke_target)/eta_mass[0];
+
+    for (link = 0; link < mtchain; link++)
+      eta[link] += inv_nc*dthalf*eta_dot[link];
+
+    eta_dot[0] *= damp;
+    eta_dot[0] += eta_dotdot[0] * inv_nc*dt4;
+    eta_dot[0] *= damp;
+
+    for (link = 1; link < mtchain; link++) {
+      damp = exp(-inv_nc*dt8*eta_dot[link+1]);
+      eta_dot[link] *= damp;
+      eta_dotdot[link] = (eta_mass[link-1]*eta_dot[link-1]*eta_dot[link-1] 
+                          - boltz * t_target)/eta_mass[link];
+      eta_dot[link] += eta_dotdot[link] * inv_nc*dt4;
+      eta_dot[link] *= damp;
+    }
+  }
+}
+
+/* ----------------------------------------------------------------------
+   perform half-step update of chain thermostat variables for barostat
+   scale barostat velocities
+------------------------------------------------------------------------- */
+
+void FixNHCuda::nhc_press_integrate()
+{
+  int link,d;
+  double damp,scale,ke_baro;
+  const double kt = boltz * t_target;
+
+  // barostat components in play: 0-2 always, 3-5 only for TRICLINIC;
+  // each one still has to be enabled through p_flag
+
+  const int ndof = (pstyle == TRICLINIC) ? 6 : 3;
+
+  // force on the first chain element from sum(omega_mass*omega_dot^2)
+
+  ke_baro = 0.0;
+  for (d = 0; d < ndof; d++)
+    if (p_flag[d]) ke_baro += omega_mass[d]*omega_dot[d]*omega_dot[d];
+
+  etap_dotdot[0] = (ke_baro - kt)/etap_mass[0];
+
+  // nc_pchain sub-steps, each weighted by 1/nc_pchain
+
+  const double inv_nc = 1.0/nc_pchain;
+  for (int pass = 0; pass < nc_pchain; pass++) {
+    // chain velocities, from the last element down to the second
+
+    for (link = mpchain-1; link > 0; link--) {
+      damp = exp(-inv_nc*dt8*etap_dot[link+1]);
+      etap_dot[link] *= damp;
+      etap_dot[link] += etap_dotdot[link] * inv_nc*dt4;
+      etap_dot[link] *= pdrag_factor;
+      etap_dot[link] *= damp;
+    }
+
+    damp = exp(-inv_nc*dt8*etap_dot[1]);
+    etap_dot[0] *= damp;
+    etap_dot[0] += etap_dotdot[0] * inv_nc*dt4;
+    etap_dot[0] *= pdrag_factor;
+    etap_dot[0] *= damp;
+
+    for (link = 0; link < mpchain; link++)
+      etap[link] += inv_nc*dthalf*etap_dot[link];
+
+    // scale every enabled barostat velocity by the same chain factor
+
+    scale = exp(-inv_nc*dthalf*etap_dot[0]);
+    for (d = 0; d < ndof; d++)
+      if (p_flag[d]) omega_dot[d] *= scale;
+
+    // refresh the force on the first chain element
+
+    ke_baro = 0.0;
+    for (d = 0; d < ndof; d++)
+      if (p_flag[d]) ke_baro += omega_mass[d]*omega_dot[d]*omega_dot[d];
+
+    etap_dotdot[0] = (ke_baro - kt)/etap_mass[0];
+
+    etap_dot[0] *= damp;
+    etap_dot[0] += etap_dotdot[0] * inv_nc*dt4;
+    etap_dot[0] *= damp;
+
+    // remaining chain velocities, from the second element upward
+
+    for (link = 1; link < mpchain; link++) {
+      damp = exp(-inv_nc*dt8*etap_dot[link+1]);
+      etap_dot[link] *= damp;
+      etap_dotdot[link] = 
+        (etap_mass[link-1]*etap_dot[link-1]*etap_dot[link-1] - boltz*t_target) / 
+        etap_mass[link];
+      etap_dot[link] += etap_dotdot[link] * inv_nc*dt4;
+      etap_dot[link] *= damp;
+    }
+  }
+}
+
+/* ----------------------------------------------------------------------
+   perform half-step barostat scaling of velocities
+-----------------------------------------------------------------------*/
+
+void FixNHCuda::nh_v_press()
+{
+  double **vel = atom->v;
+  int *mask = atom->mask;
+  const int nmax = (igroup == atom->firstgroup) ? atom->nfirst : atom->nlocal;
+
+  // only the NOBIAS and BIAS modes touch the velocities
+
+  const bool biased = (which == BIAS);
+  if (!biased && which != NOBIAS) return;
+
+  // diagonal scale factors; each is applied twice per atom,
+  // once before and once after the triclinic off-diagonal term
+
+  double scale[3];
+  for (int d = 0; d < 3; d++)
+    scale[d] = exp(-dt4*(omega_dot[d]+mtk_term2));
+
+  const bool tri = (pstyle == TRICLINIC);
+
+  // host loop over the atoms of this fix's group
+
+  for (int i = 0; i < nmax; i++) {
+    if (!(mask[i] & groupbit)) continue;
+
+    // BIAS mode: the temperature compute strips its bias first ...
+
+    if (biased) temperature->remove_bias(i,vel[i]);
+
+    vel[i][0] *= scale[0];
+    vel[i][1] *= scale[1];
+    vel[i][2] *= scale[2];
+
+    if (tri) {
+      vel[i][0] += -dthalf*(vel[i][1]*omega_dot[5] + vel[i][2]*omega_dot[4]);
+      vel[i][1] += -dthalf*vel[i][2]*omega_dot[3];
+    }
+    vel[i][0] *= scale[0];
+    vel[i][1] *= scale[1];
+    vel[i][2] *= scale[2];
+
+    // ... and puts it back once the scaling is done
+
+    if (biased) temperature->restore_bias(i,vel[i]);
+  }
+}
+
+/* ----------------------------------------------------------------------
+   perform half-step update of velocities: v += (dtf/m) * f
+   m is the per-atom mass when atom->rmass is set, else the per-type mass
+-----------------------------------------------------------------------*/
+
+void FixNHCuda::nve_v()
+{
+  int nlocal = atom->nlocal;
+  if (igroup == atom->firstgroup) nlocal = atom->nfirst;
+
+  double **v = atom->v;
+  double **f = atom->f;
+  int *mask = atom->mask;
+
+  // only one of the two mass sources is consulted for the whole loop
+
+  double *rmass = atom->rmass;
+  double *mass = atom->mass;
+  int *type = atom->type;
+
+  for (int i = 0; i < nlocal; i++) {
+    if (!(mask[i] & groupbit)) continue;
+
+    // dtfm = dtf / mass of atom i
+    double dtfm;
+    if (rmass) dtfm = dtf / rmass[i];
+    else dtfm = dtf / mass[type[i]];
+
+    double *vi = v[i];
+    const double *fi = f[i];
+
+    vi[0] += dtfm*fi[0];
+    vi[1] += dtfm*fi[1];
+    vi[2] += dtfm*fi[2];
+  }
+}
+
+/* ----------------------------------------------------------------------
+   perform full-step update of positions: x += dtv * v
+-----------------------------------------------------------------------*/
+
+void FixNHCuda::nve_x()
+{
+  int nlocal = atom->nlocal;
+  if (igroup == atom->firstgroup) nlocal = atom->nfirst;
+
+  double **x = atom->x;
+  double **v = atom->v;
+  int *mask = atom->mask;
+
+  // atoms outside the fix group keep their positions
+
+  for (int i = 0; i < nlocal; i++) {
+    if (!(mask[i] & groupbit)) continue;
+    double *xi = x[i];
+    const double *vi = v[i];
+    for (int d = 0; d < 3; d++) xi[d] += dtv * vi[d];
+  }
+}
+
+/* ----------------------------------------------------------------------
+   perform half-step thermostat scaling of velocities by factor_eta
+   for a biased temperature compute the bias is removed before the
+   scaling and restored after it
+-----------------------------------------------------------------------*/
+
+void FixNHCuda::nh_v_temp()
+{
+  int nlocal = atom->nlocal;
+  if (igroup == atom->firstgroup) nlocal = atom->nfirst;
+
+  double **v = atom->v;
+  int *mask = atom->mask;
+
+  // any value of "which" other than NOBIAS or BIAS changes nothing
+
+  const bool biased = (which == BIAS);
+  if (!biased && which != NOBIAS) return;
+
+  for (int i = 0; i < nlocal; i++) {
+    if (!(mask[i] & groupbit)) continue;
+    double *vi = v[i];
+
+    if (biased) temperature->remove_bias(i,vi);
+    vi[0] *= factor_eta;
+    vi[1] *= factor_eta;
+    vi[2] *= factor_eta;
+
+    if (biased) temperature->restore_bias(i,vi);
+  }
+}
+  
+/* ----------------------------------------------------------------------
+   compute sigma tensor
+   needed whenever p_target or h0_inv changes
+-----------------------------------------------------------------------*/
+
+void FixNHCuda::compute_sigma()
+{
+  // if nreset_h0 > 0, reset vol0 and h0_inv from the current box
+  // every nreset_h0 timesteps, counted from update->beginstep
+
+  if (nreset_h0 > 0) {
+    int delta = update->ntimestep - update->beginstep;
+    if (delta % nreset_h0 == 0) {
+      if (dimension == 3) vol0 = domain->xprd * domain->yprd * domain->zprd;
+      else vol0 = domain->xprd * domain->yprd;
+      h0_inv[0] = domain->h_inv[0];
+      h0_inv[1] = domain->h_inv[1];
+      h0_inv[2] = domain->h_inv[2];
+      h0_inv[3] = domain->h_inv[3];
+      h0_inv[4] = domain->h_inv[4];
+      h0_inv[5] = domain->h_inv[5];
+    }
+  }
+
+  // generate upper-triangular half of
+  // sigma = vol0*h0inv*(p_target-p_hydro)*h0inv^t
+  // units of sigma are PV/L^2 e.g. atm.A
+  // p_hydro is subtracted on the diagonal only; digits below = array index
+  // [ 0 5 4 ]   [ 0 5 4 ] [ 0 5 4 ] [ 0 - - ]
+  // [ 5 1 3 ] = [ - 1 3 ] [ 5 1 3 ] [ 5 1 - ]
+  // [ 4 3 2 ]   [ - - 2 ] [ 4 3 2 ] [ 4 3 2 ]
+
+  sigma[0] = 
+    vol0*(h0_inv[0]*((p_target[0]-p_hydro)*h0_inv[0] +
+		     p_target[5]*h0_inv[5]+p_target[4]*h0_inv[4]) +
+	  h0_inv[5]*(p_target[5]*h0_inv[0] +
+		     (p_target[1]-p_hydro)*h0_inv[5]+p_target[3]*h0_inv[4]) +
+	  h0_inv[4]*(p_target[4]*h0_inv[0]+p_target[3]*h0_inv[5] +
+		     (p_target[2]-p_hydro)*h0_inv[4]));
+  sigma[1] = 
+    vol0*(h0_inv[1]*((p_target[1]-p_hydro)*h0_inv[1] +
+		     p_target[3]*h0_inv[3]) +
+	  h0_inv[3]*(p_target[3]*h0_inv[1] + 
+		     (p_target[2]-p_hydro)*h0_inv[3]));
+  sigma[2] = 
+    vol0*(h0_inv[2]*((p_target[2]-p_hydro)*h0_inv[2]));
+  sigma[3] = 
+    vol0*(h0_inv[1]*(p_target[3]*h0_inv[2]) +
+	  h0_inv[3]*((p_target[2]-p_hydro)*h0_inv[2]));
+  sigma[4] = 
+    vol0*(h0_inv[0]*(p_target[4]*h0_inv[2]) +
+	  h0_inv[5]*(p_target[3]*h0_inv[2]) +
+	  h0_inv[4]*((p_target[2]-p_hydro)*h0_inv[2]));
+  sigma[5] = 
+    vol0*(h0_inv[0]*(p_target[5]*h0_inv[1]+p_target[4]*h0_inv[3]) +
+	  h0_inv[5]*((p_target[1]-p_hydro)*h0_inv[1]+p_target[3]*h0_inv[3]) +
+	  h0_inv[4]*(p_target[3]*h0_inv[1]+(p_target[2]-p_hydro)*h0_inv[3]));
+}
+
+/* ----------------------------------------------------------------------
+   compute strain energy = 0.5*Tr(sigma*h*h^t), returned in energy units
+-----------------------------------------------------------------------*/
+
+double FixNHCuda::compute_strain_energy()
+{
+  const double *box = domain->h;
+
+  // the three diagonal entries of sigma*h*h^t; their sum is the trace
+
+  const double diag0 =
+    sigma[0]*(box[0]*box[0]+box[5]*box[5]+box[4]*box[4]) +
+    sigma[5]*(box[1]*box[5]+box[3]*box[4]) +
+    sigma[4]*(box[2]*box[4]);
+
+  const double diag1 =
+    sigma[5]*(box[5]*box[1]+box[4]*box[3]) +
+    sigma[1]*(box[1]*box[1]+box[3]*box[3]) +
+    sigma[3]*(box[2]*box[3]);
+
+  const double diag2 =
+    sigma[4]*(box[4]*box[2]) +
+    sigma[3]*(box[3]*box[2]) +
+    sigma[2]*(box[2]*box[2]);
+
+  return 0.5*(diag0+diag1+diag2)/nktv2p;
+}
+
+/* ----------------------------------------------------------------------
+   compute deviatoric barostat force = h*sigma*h^t
+-----------------------------------------------------------------------*/
+
+void FixNHCuda::compute_deviatoric()
+{
+  // generate upper-triangular part of h*sigma*h^t
+  // units of fdev are PV, e.g. atm*A^3
+  // [ 0 5 4 ]   [ 0 5 4 ] [ 0 5 4 ] [ 0 - - ]
+  // [ 5 1 3 ] = [ - 1 3 ] [ 5 1 3 ] [ 5 1 - ]
+  // [ 4 3 2 ]   [ - - 2 ] [ 4 3 2 ] [ 4 3 2 ]
+  // digits in the diagram are the array index of each tensor entry
+  double* h = domain->h;
+   
+  fdev[0] = 
+    h[0]*(sigma[0]*h[0]+sigma[5]*h[5]+sigma[4]*h[4]) +
+    h[5]*(sigma[5]*h[0]+sigma[1]*h[5]+sigma[3]*h[4]) +
+    h[4]*(sigma[4]*h[0]+sigma[3]*h[5]+sigma[2]*h[4]);
+  fdev[1] = 
+    h[1]*(              sigma[1]*h[1]+sigma[3]*h[3]) +
+    h[3]*(              sigma[3]*h[1]+sigma[2]*h[3]);
+  fdev[2] = 
+    h[2]*(                            sigma[2]*h[2]);
+  fdev[3] = 
+    h[1]*(                            sigma[3]*h[2]) +
+    h[3]*(                            sigma[2]*h[2]);
+  fdev[4] = 
+    h[0]*(                            sigma[4]*h[2]) +
+    h[5]*(                            sigma[3]*h[2]) +
+    h[4]*(                            sigma[2]*h[2]);
+  fdev[5] = 
+    h[0]*(              sigma[5]*h[1]+sigma[4]*h[3]) +
+    h[5]*(              sigma[1]*h[1]+sigma[3]*h[3]) +
+    h[4]*(              sigma[3]*h[1]+sigma[2]*h[3]);
+}
+
+/* ----------------------------------------------------------------------
+   compute hydrostatic target pressure
+   targets move linearly from p_start to p_stop between beginstep/endstep
+-----------------------------------------------------------------------*/
+
+void FixNHCuda::compute_press_target()
+{
+  // elapsed fraction of the step range, 0.0 if the range has no extent
+  double frac = 0.0;
+  if (update->endstep > update->beginstep) {
+    frac = update->ntimestep - update->beginstep;
+    frac /= update->endstep - update->beginstep;
+  }
+
+  // components 3-5 are only ramped for triclinic, regardless of p_flag
+
+  const int nramp = (pstyle == TRICLINIC) ? 6 : 3;
+  p_hydro = 0.0;
+  for (int i = 0; i < nramp; i++) {
+    if (i < 3 && !p_flag[i]) continue;
+    p_target[i] = p_start[i] + frac * (p_stop[i]-p_start[i]);
+    if (i < 3) p_hydro += p_target[i];
+  }
+  p_hydro /= pdim;
+
+  if (deviatoric_flag) compute_sigma();    // sigma depends on p_target
+}
+
+/* ----------------------------------------------------------------------
+   update omega_dot for every barostatted component
+   also refreshes mtk_term1 and mtk_term2
+-----------------------------------------------------------------------*/
+
+void FixNHCuda::nh_omega_dot()
+{
+  double volume = domain->xprd*domain->yprd;
+  if (dimension == 3) volume = domain->xprd*domain->yprd*domain->zprd;
+
+  if (deviatoric_flag) compute_deviatoric();
+
+  // mtk_term1: tdof*boltz*t_current for ISO, else the sum of the
+  // barostatted entries of temperature->vector; both per pdim*natoms
+  mtk_term1 = 0.0;
+  if (mtk_flag) {
+    if (pstyle == ISO) {
+      mtk_term1 = tdof * boltz * t_current;
+    } else {
+      const double *ke_diag = temperature->vector;
+      for (int i = 0; i < 3; i++)
+        if (p_flag[i]) mtk_term1 += ke_diag[i];
+    }
+    mtk_term1 /= pdim * atom->natoms;
+  }
+
+  // diagonal components
+  for (int i = 0; i < 3; i++) {
+    if (!p_flag[i]) continue;
+    double f_omega = (p_current[i]-p_hydro)*volume /
+      (omega_mass[i] * nktv2p) + mtk_term1 / omega_mass[i];
+    if (deviatoric_flag) f_omega -= fdev[i]/(omega_mass[i] * nktv2p);
+    omega_dot[i] += f_omega*dthalf;
+    omega_dot[i] *= pdrag_factor;
+  }
+
+  // mtk_term2 is built from the omega_dot values updated just above
+  mtk_term2 = 0.0;
+  if (mtk_flag) {
+    for (int i = 0; i < 3; i++)
+      if (p_flag[i]) mtk_term2 += omega_dot[i];
+    mtk_term2 /= pdim * atom->natoms;
+  }
+
+  // off-diagonal components, triclinic only; no p_hydro or mtk term
+  if (pstyle != TRICLINIC) return;
+
+  for (int i = 3; i < 6; i++) {
+    if (!p_flag[i]) continue;
+    double f_omega = p_current[i]*volume/(omega_mass[i] * nktv2p);
+    if (deviatoric_flag) f_omega -= fdev[i]/(omega_mass[i] * nktv2p);
+    omega_dot[i] += f_omega*dthalf;
+    omega_dot[i] *= pdrag_factor;
+  }
+}
+
diff --git a/src/USER-CUDA/fix_nh_cuda.h b/src/USER-CUDA/fix_nh_cuda.h
new file mode 100644
index 0000000000..8c192b56dd
--- /dev/null
+++ b/src/USER-CUDA/fix_nh_cuda.h
@@ -0,0 +1,126 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under 
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#ifndef LMP_FIX_NH_CUDA_H
+#define LMP_FIX_NH_CUDA_H
+
+#include "fix.h"
+#include "cuda_precision.h"
+
+namespace LAMMPS_NS {
+
+class FixNHCuda : public Fix {      // base class of FixNVTCuda and FixNPTCuda
+ public:
+  FixNHCuda(class LAMMPS *, int, char **);
+  virtual ~FixNHCuda();
+  int setmask();
+  virtual void init();
+  void setup(int);
+  virtual void initial_integrate(int);
+  virtual void final_integrate();
+  void initial_integrate_respa(int, int, int);
+  void final_integrate_respa(int, int);
+  double compute_scalar();
+  double compute_vector(int);
+  void write_restart(FILE *);
+  void restart(char *);
+  int modify_param(int, char **);
+  void reset_dt();
+
+ protected:
+  class Cuda *cuda;                // FixNPTCuda assigns lmp->cuda to it
+  int dimension,which;             // which: NOBIAS or BIAS, see nh_v_press()
+  double dtv,dtf,dthalf,dt4,dt8,dto; // step factors: nve_x() uses dtv, nve_v() dtf
+  double boltz,nktv2p,tdof;
+  double vol0,t0;
+
+  double t_start,t_stop;
+  double t_current,t_target;
+  double t_freq;
+
+  int tstat_flag;                   // 1 if control T
+  int pstat_flag;                   // 1 if control P
+
+  int pstyle,pcouple,allremap;
+  int p_flag[6];                   // 1 if control P on this dim, 0 if not
+  double p_start[6],p_stop[6];
+  double p_freq[6],p_target[6];
+  double omega[6],omega_dot[6];
+  double omega_mass[6];
+  double p_current[6],dilation[6];
+  double drag,tdrag_factor;        // drag factor on particle thermostat
+  double pdrag_factor;             // drag factor on barostat
+  double factor[6];                // velocity scaling due to barostat (nh_v_press() uses a local factor[3])
+  int kspace_flag;                 // 1 if KSpace invoked, 0 if not
+  int nrigid;                      // number of rigid fixes
+  int *rfix;                       // indices of rigid fixes
+
+  int nlevels_respa;
+  double *step_respa;
+
+  char *id_temp,*id_press;
+  class Compute *temperature,*pressure;
+  int tflag,pflag;
+
+  double *eta,*eta_dot;            // chain thermostat for particles
+  double *eta_dotdot;
+  double *eta_mass;
+  int mtchain;                     // length of chain
+                                   
+  double *etap;                    // chain thermostat for barostat
+  double *etap_dot;
+  double *etap_dotdot;
+  double *etap_mass;
+  int mpchain;                     // length of chain
+                                   
+  int mtk_flag;                    // 0 if using Hoover barostat
+  double mtk_term1,mtk_term2;
+  int mtchain_default_flag;
+  int pdim;                        // number of barostatted dims
+  double mvv_current[3];           // diagonal of KE tensor (nh_omega_dot() shadows it with a local)
+  double mtk_factor;               // MTK factor
+  double p_freq_max;               // maximum barostat frequency
+
+  double p_hydro;                  // hydrostatic target pressure
+
+  int nc_tchain,nc_pchain;
+  double factor_eta;
+  double sigma[6];                 // scaled target stress
+  double fdev[6];                  // deviatoric force on barostat
+  int deviatoric_flag;             // 0 if target stress tensor is hydrostatic
+  double h0_inv[6];                // h_inv of reference (zero strain) box
+  int nreset_h0;                   // interval for resetting h0
+
+  void couple();
+  void couple_ke();
+  void remap();
+  void nhc_temp_integrate();
+  void nhc_press_integrate();
+
+  virtual void nve_x();            // may be overwritten by child classes
+  virtual void nve_v();
+  virtual void nh_v_press();
+  virtual void nh_v_temp();
+
+  void compute_sigma();
+  void compute_deviatoric();
+  double compute_strain_energy();
+  void compute_press_target();
+  void nh_omega_dot();
+  
+  X_FLOAT triggerneighsq;          // NOTE(review): presumably a cached cuda->shared_data.atom.triggerneighsq as in FixNVECuda - confirm
+};
+
+}
+
+#endif
diff --git a/src/USER-CUDA/fix_npt_cuda.cpp b/src/USER-CUDA/fix_npt_cuda.cpp
new file mode 100644
index 0000000000..f254b4b7d1
--- /dev/null
+++ b/src/USER-CUDA/fix_npt_cuda.cpp
@@ -0,0 +1,71 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under 
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#include <cstring>
+#include "fix_npt_cuda.h"
+#include "modify.h"
+#include "error.h"
+
+using namespace LAMMPS_NS;
+
+/* ---------------------------------------------------------------------- */
+
+FixNPTCuda::FixNPTCuda(LAMMPS *lmp, int narg, char **arg) :
+  FixNHCuda(lmp, narg, arg)
+{
+  // /cuda styles need the Cuda object of the LAMMPS instance
+
+  cuda = lmp->cuda;
+  if (cuda == NULL)
+    error->all("You cannot use a /cuda class, without activating 'cuda' acceleration. Use no '-a' command line argument, or '-a cuda'.");
+
+  // npt requires both a thermostat and a barostat
+
+  if (!tstat_flag)
+    error->all("Temperature control must be used with fix npt");
+  if (!pstat_flag)
+    error->all("Pressure control must be used with fix npt");
+
+  // temperature compute: ID = fix-ID + "_temp", style temp/cuda
+  // group = all since pressure is always global (group all)
+  // and thus its KE/temperature contribution should use group all
+  // the buffer holds the ID, the 5-character suffix and the terminator
+
+  id_temp = new char[strlen(id) + 6];
+  strcpy(id_temp,id);
+  strcat(id_temp,"_temp");
+
+  char *temp_args[3];
+  temp_args[0] = id_temp;
+  temp_args[1] = (char *) "all";
+  temp_args[2] = (char *) "temp/cuda";
+
+  modify->add_compute(3,temp_args);
+  tflag = 1;
+
+  // pressure compute: ID = fix-ID + "_press", style pressure/cuda
+  // group = all, id_temp is handed over as the 4th argument
+
+  id_press = new char[strlen(id) + 7];
+  strcpy(id_press,id);
+  strcat(id_press,"_press");
+
+  char *press_args[4];
+  press_args[0] = id_press;
+  press_args[1] = (char *) "all";
+  press_args[2] = (char *) "pressure/cuda";
+  press_args[3] = id_temp;
+
+  modify->add_compute(4,press_args);
+  pflag = 1;
+}
diff --git a/src/USER-CUDA/fix_npt_cuda.h b/src/USER-CUDA/fix_npt_cuda.h
new file mode 100644
index 0000000000..1dc5f5af35
--- /dev/null
+++ b/src/USER-CUDA/fix_npt_cuda.h
@@ -0,0 +1,36 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under 
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#ifdef FIX_CLASS
+
+FixStyle(npt/cuda,FixNPTCuda)
+
+#else
+
+#ifndef LMP_FIX_NPTCuda_H
+#define LMP_FIX_NPTCuda_H
+
+#include "fix_nh_cuda.h"
+
+namespace LAMMPS_NS {
+
+class FixNPTCuda : public FixNHCuda {     // registered above as fix style npt/cuda
+ public:
+  FixNPTCuda(class LAMMPS *, int, char **);
+  ~FixNPTCuda() {}     // NOTE(review): id_temp/id_press are allocated in the constructor; presumably freed by FixNHCuda - confirm
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-CUDA/fix_nve_cuda.cpp b/src/USER-CUDA/fix_nve_cuda.cpp
new file mode 100644
index 0000000000..367dd7c24e
--- /dev/null
+++ b/src/USER-CUDA/fix_nve_cuda.cpp
@@ -0,0 +1,155 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under 
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#include <cstdio>
+#include <cstring>
+#include "fix_nve_cuda.h"
+#include "fix_nve_cuda_cu.h"
+#include "atom.h"
+#include "force.h"
+#include "update.h"
+#include "respa.h"
+#include "error.h"
+#include "cuda.h"
+#include "cuda_modify_flags.h"
+
+using namespace LAMMPS_NS;
+
+/* ---------------------------------------------------------------------- */
+
+FixNVECuda::FixNVECuda(LAMMPS *lmp, int narg, char **arg) :
+  Fix(lmp, narg, arg)
+{
+  // refuse to run without the Cuda object of the LAMMPS instance
+  cuda = lmp->cuda;
+  if (cuda == NULL)
+    error->all("You cannot use a /cuda class, without activating 'cuda' acceleration. Use no '-a' command line argument, or '-a cuda'.");
+
+  // every style except nve/sphere needs at least 3 arguments
+  if (narg < 3 && strcmp(style,"nve/sphere") != 0)
+    error->all("Illegal fix nve command");
+  time_integrate = 1;
+}
+
+/* ---------------------------------------------------------------------- */
+
+int FixNVECuda::setmask()
+{
+  // flags returned to the caller; the RESPA variants stay disabled:
+  //   INITIAL_INTEGRATE_RESPA_CUDA
+  //   FINAL_INTEGRATE_RESPA_CUDA
+  // each flag is parenthesized in case it is an unparenthesized macro
+
+  return (INITIAL_INTEGRATE_CUDA) | (FINAL_INTEGRATE_CUDA);
+}
+
+/* ---------------------------------------------------------------------- */
+
+void FixNVECuda::init()
+{
+  // step factors: dtv is the full timestep, dtf half of it times ftm2v
+  dtv = update->dt;
+  dtf = 0.5 * update->dt * force->ftm2v;
+  if (strcmp(update->integrate_style,"respa") == 0)
+    step_respa = ((Respa *) update->integrate)->step;
+
+  // cache triggerneighsq so initial_integrate() can detect a change
+  triggerneighsq = cuda->shared_data.atom.triggerneighsq;
+  cuda->neighbor_decide_by_integrator = 1;
+  Cuda_FixNVECuda_Init(&cuda->shared_data,dtv,dtf);
+}
+
+/* ----------------------------------------------------------------------
+   initial integration step, delegated to Cuda_FixNVECuda_InitialIntegrate
+   vflag is not used
+------------------------------------------------------------------------- */
+
+void FixNVECuda::initial_integrate(int vflag)
+{
+  // repeat the init call whenever the shared triggerneighsq has changed
+  if (cuda->shared_data.atom.triggerneighsq != triggerneighsq) {
+    triggerneighsq = cuda->shared_data.atom.triggerneighsq;
+    Cuda_FixNVECuda_Init(&cuda->shared_data,dtv,dtf);
+  }
+
+  const int nlocal = (igroup == atom->firstgroup) ? atom->nfirst : atom->nlocal;
+  Cuda_FixNVECuda_InitialIntegrate(&cuda->shared_data,groupbit,nlocal);
+}
+
+/* ---------------------------------------------------------------------- */
+
+void FixNVECuda::final_integrate()
+{
+  // atom count handed to the call: nfirst if this is the "first" group
+  const int nlocal =
+    (igroup == atom->firstgroup) ? atom->nfirst : atom->nlocal;
+  Cuda_FixNVECuda_FinalIntegrate(&cuda->shared_data,groupbit,nlocal);
+}
+
+/* ---------------------------------------------------------------------- */
+
+void FixNVECuda::initial_integrate_respa(int vflag, int ilevel, int flag)
+{
+  // this point should not be reached yet since RESPA is not supported
+  // NOTE(review): dtv/dtf set here are not passed to
+  // Cuda_FixNVECuda_Init, unlike in init() and reset_dt() - confirm
+  if (flag) return;             // only used by NPT,NPH
+  const double dt_level = step_respa[ilevel];
+  dtv = dt_level;
+  dtf = 0.5 * dt_level * force->ftm2v;
+
+  // innermost level: NVE update of v and x, all other levels: v only
+  if (ilevel != 0) final_integrate();
+  else initial_integrate(vflag);
+}
+
+/* ---------------------------------------------------------------------- */
+
+void FixNVECuda::final_integrate_respa(int ilevel, int iloop)
+{
+	//this point should not be reached yet since RESPA is not supported
+	dtf = 0.5 * step_respa[ilevel] * force->ftm2v;   // step factor of this RESPA level; iloop is unused
+	final_integrate();                               // same call as in the non-RESPA path
+}
+
+/* ---------------------------------------------------------------------- */
+
+void FixNVECuda::reset_dt()
+{
+	dtv = update->dt;                                   // full timestep
+	dtf = 0.5 * update->dt * force->ftm2v;              // half timestep times ftm2v
+	Cuda_FixNVECuda_Init(&cuda->shared_data,dtv,dtf);   // same init call as in init(), now with the new factors
+}
diff --git a/src/USER-CUDA/fix_nve_cuda.h b/src/USER-CUDA/fix_nve_cuda.h
new file mode 100644
index 0000000000..6968297610
--- /dev/null
+++ b/src/USER-CUDA/fix_nve_cuda.h
@@ -0,0 +1,63 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#ifdef FIX_CLASS
+
+FixStyle(nve/cuda,FixNVECuda)
+
+#else
+
+#ifndef LMP_FIX_NVE_CUDA_H
+#define LMP_FIX_NVE_CUDA_H
+
+#include "fix.h"
+#include "cuda_precision.h"
+
+namespace LAMMPS_NS {
+
+class FixNVECuda : public Fix     // registered above as fix style nve/cuda
+{
+	public:
+		FixNVECuda(class LAMMPS *, int, char **);
+		int setmask();
+		virtual void init();
+		virtual void initial_integrate(int);
+		virtual void final_integrate();
+		void initial_integrate_respa(int, int, int);
+		void final_integrate_respa(int, int);
+		void reset_dt();
+	
+		X_FLOAT triggerneighsq;   // last seen cuda->shared_data.atom.triggerneighsq
+		
+	protected:
+		class Cuda *cuda;         // taken from lmp->cuda in the constructor
+		double dtv, dtf;          // step factors passed to Cuda_FixNVECuda_Init
+		double *step_respa;       // set in init() only for the respa integrate style
+		int mass_require;         // NOTE(review): not referenced in fix_nve_cuda.cpp - confirm it is needed
+		
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-CUDA/fix_nvt_cuda.cpp b/src/USER-CUDA/fix_nvt_cuda.cpp
new file mode 100644
index 0000000000..49a3c63013
--- /dev/null
+++ b/src/USER-CUDA/fix_nvt_cuda.cpp
@@ -0,0 +1,48 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under 
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#include <cstring>
+#include "fix_nvt_cuda.h"
+#include "group.h"
+#include "modify.h"
+#include "error.h"
+
+using namespace LAMMPS_NS;
+
+/* ---------------------------------------------------------------------- */
+
+FixNVTCuda::FixNVTCuda(LAMMPS *lmp, int narg, char **arg) :
+  FixNHCuda(lmp, narg, arg)
+{
+  // nvt means thermostat without barostat
+
+  if (!tstat_flag)
+    error->all("Temperature control must be used with fix nvt");
+  if (pstat_flag)
+    error->all("Pressure control can not be used with fix nvt");
+
+  // temperature compute: ID = fix-ID + "_temp", style temp/cuda,
+  // defined on the group of this fix
+
+  id_temp = new char[strlen(id) + 6];
+  strcpy(id_temp,id);
+  strcat(id_temp,"_temp");
+
+  char *temp_args[3];
+  temp_args[0] = id_temp;
+  temp_args[1] = group->names[igroup];
+  temp_args[2] = (char *) "temp/cuda";
+
+  modify->add_compute(3,temp_args);
+  tflag = 1;
+}
diff --git a/src/USER-CUDA/fix_nvt_cuda.h b/src/USER-CUDA/fix_nvt_cuda.h
new file mode 100644
index 0000000000..02e5ca3d58
--- /dev/null
+++ b/src/USER-CUDA/fix_nvt_cuda.h
@@ -0,0 +1,36 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under 
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#ifdef FIX_CLASS
+
+FixStyle(nvt/cuda,FixNVTCuda)
+
+#else
+
+#ifndef LMP_FIX_NVTCuda_H
+#define LMP_FIX_NVTCuda_H
+
+#include "fix_nh_cuda.h"
+
+namespace LAMMPS_NS {
+
+class FixNVTCuda : public FixNHCuda {     // registered above as fix style nvt/cuda
+ public:
+  FixNVTCuda(class LAMMPS *, int, char **);
+  ~FixNVTCuda() {}     // NOTE(review): id_temp is allocated in the constructor; presumably freed by FixNHCuda - confirm
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-CUDA/fix_set_force_cuda.cpp b/src/USER-CUDA/fix_set_force_cuda.cpp
new file mode 100644
index 0000000000..8f8c87c82f
--- /dev/null
+++ b/src/USER-CUDA/fix_set_force_cuda.cpp
@@ -0,0 +1,181 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+#include <cstring>
+#include <cstdlib>
+#include "fix_set_force_cuda.h"
+#include "fix_set_force_cuda_cu.h"
+#include "atom.h"
+#include "update.h"
+#include "respa.h"
+#include "error.h"
+#include "cuda.h"
+#include "memory.h"
+#include "cuda_modify_flags.h"
+
+
+using namespace LAMMPS_NS;
+
+/* ---------------------------------------------------------------------- */
+
+FixSetForceCuda::FixSetForceCuda(LAMMPS *lmp, int narg, char **arg) :
+  Fix(lmp, narg, arg)
+{
+  cuda = lmp->cuda;
+  if (cuda == NULL)
+    error->all("You cannot use a /cuda class, without activating 'cuda' acceleration. Use no '-a' command line argument, or '-a cuda'.");
+  if (narg != 6) error->all("Illegal fix setforce/cuda command");
+
+  vector_flag = 1;
+  size_vector = 3;
+  global_freq = 1;
+  extvector = 1;
+
+  // arg[3..5]: value per force component, "NULL" clears that flag
+  // unset values are zeroed so that post_force() never hands an
+  // uninitialized double to the device call
+  flagx = (strcmp(arg[3],"NULL") != 0);
+  flagy = (strcmp(arg[4],"NULL") != 0);
+  flagz = (strcmp(arg[5],"NULL") != 0);
+  xvalue = flagx ? atof(arg[3]) : 0.0;
+  yvalue = flagy ? atof(arg[4]) : 0.0;
+  zvalue = flagz ? atof(arg[5]) : 0.0;
+  force_flag = 0;
+  foriginal[0] = foriginal[1] = foriginal[2] = 0.0;
+  cu_foriginal = NULL;
+}
+
+/* ---------------------------------------------------------------------- */
+
+int FixSetForceCuda::setmask()
+{
+  // flags returned to the caller, combined in one expression
+  // NOTE(review): POST_FORCE_RESPA is the only flag without the _CUDA
+  // suffix - confirm that this is intended
+
+  return (POST_FORCE_CUDA) | (THERMO_ENERGY_CUDA) |
+    (POST_FORCE_RESPA) | (MIN_POST_FORCE_CUDA);
+}
+
+/* ---------------------------------------------------------------------- */
+
+void FixSetForceCuda::init()
+{
+  // the cCudaData wrapper around foriginal is created once and reused
+  if (cu_foriginal == NULL) cu_foriginal = new cCudaData<double, F_FLOAT, x> (foriginal,3);
+  if (strcmp(update->integrate_style,"respa") == 0)
+    nlevels_respa = ((Respa *) update->integrate)->nlevels;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void FixSetForceCuda::setup(int vflag)
+{
+  MYDBG( printf("# CUDA: FixSetForceCuda::setup\n"); )
+  // verlet: post_force() runs between an upload and a download of cu_f
+  if (strcmp(update->integrate_style,"verlet") == 0)
+  {
+    Cuda_FixSetForceCuda_Init(&cuda->shared_data);
+    cuda->cu_f->upload();
+    post_force(vflag);
+    cuda->cu_f->download();
+    
+  }
+  else {     // any other integrate style is handled as respa
+    ((Respa *) update->integrate)->copy_flevel_f(nlevels_respa-1);
+    cuda->cu_f->download();     // NOTE(review): download/upload order is the reverse of the verlet branch - confirm
+    post_force_respa(vflag,nlevels_respa-1,0);
+    cuda->cu_f->upload();
+    ((Respa *) update->integrate)->copy_f_flevel(nlevels_respa-1);
+  }
+  MYDBG( printf("# CUDA: FixSetForceCuda::setup done\n"); )
+}
+
+/* ---------------------------------------------------------------------- */
+
+void FixSetForceCuda::min_setup(int vflag)
+{
+  post_force(vflag);     // setup for minimization is a plain post_force() call
+}
+
+/* ---------------------------------------------------------------------- */
+
+void FixSetForceCuda::post_force(int vflag)
+{
+  MYDBG( printf("# CUDA: FixSetForceCuda::postforce start\n"); )
+  force_flag = 0;                     // compute_vector() has to reduce again
+  cu_foriginal->memset_device(0);     // cleared before the call below receives its dev_data()
+  Cuda_FixSetForceCuda_PostForce(&cuda->shared_data, groupbit, xvalue, yvalue,zvalue,(F_FLOAT*) cu_foriginal->dev_data(),flagx,flagy,flagz);
+  cu_foriginal->download();           // foriginal is what compute_vector() reduces; vflag is unused
+}
+
+/* ---------------------------------------------------------------------- */
+
+void FixSetForceCuda::post_force_respa(int vflag, int ilevel, int iloop)
+{
+  // outermost level: identical to the non-RESPA path
+  if (ilevel == nlevels_respa-1) {
+    post_force(vflag);
+    return;
+  }
+  // other levels: work on atom->f between a download and an upload,
+  // summing the old force and zeroing the flagged components
+  cuda->cu_f->download();
+  cuda->cu_mask->download();
+  double **f = atom->f;
+  int *mask = atom->mask;
+  const int nlocal = atom->nlocal;
+  force_flag = 0;
+  for (int d = 0; d < 3; d++) foriginal[d] = 0.0;
+  const int zero[3] = {flagx,flagy,flagz};
+  for (int i = 0; i < nlocal; i++) {
+    if (!(mask[i] & groupbit)) continue;
+    for (int d = 0; d < 3; d++) {
+      foriginal[d] += f[i][d];
+      if (zero[d]) f[i][d] = 0.0;
+    }
+  }
+  cuda->cu_f->upload();
+}
+
+/* ---------------------------------------------------------------------- */
+
+void FixSetForceCuda::min_post_force(int vflag)
+{
+  post_force(vflag);     // minimization uses the same post_force() call
+}
+
+
+/* ----------------------------------------------------------------------
+   return components of total force on fix group before force was changed
+------------------------------------------------------------------------- */
+
+double FixSetForceCuda::compute_vector(int n)
+{
+  // only sum across procs one time
+  if (force_flag == 0) {
+    MPI_Allreduce(foriginal,foriginal_all,3,MPI_DOUBLE,MPI_SUM,world);
+    force_flag = 1;
+  }
+  // foriginal_all has 3 entries, so index with n itself (n+1 overran it)
+  return foriginal_all[n];
+}
diff --git a/src/USER-CUDA/fix_set_force_cuda.h b/src/USER-CUDA/fix_set_force_cuda.h
new file mode 100644
index 0000000000..c233294a5b
--- /dev/null
+++ b/src/USER-CUDA/fix_set_force_cuda.h
@@ -0,0 +1,63 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#ifdef FIX_CLASS
+
+FixStyle(setforce/cuda,FixSetForceCuda)
+
+#else
+
+#ifndef LMP_FIX_SET_FORCE_CUDA_H
+#define LMP_FIX_SET_FORCE_CUDA_H
+
+#include "fix.h"
+#include "cuda_data.h"
+
+namespace LAMMPS_NS {
+
+class FixSetForceCuda : public Fix {
+ public:
+  FixSetForceCuda(class LAMMPS *, int, char **);
+
+  // hooks invoked by the framework
+  int setmask();
+  void init();
+  void setup(int);
+  void min_setup(int);
+  void post_force(int);
+  void post_force_respa(int, int, int);
+  void min_post_force(int);
+
+  // component n of the group force summed before the fix changed it
+  double compute_vector(int);
+
+ private:
+  // accelerator object; its shared_data is handed to the CUDA routine
+  class Cuda *cuda;
+
+  // per-component switches: non-zero means that component is overwritten
+  int flagx;
+  int flagy;
+  int flagz;
+
+  // per-component values passed to the CUDA routine
+  double xvalue;
+  double yvalue;
+  double zvalue;
+
+  // group force before modification: this processor / all processors
+  double foriginal[3];
+  double foriginal_all[3];
+
+  // device-side accumulator that is downloaded after the CUDA routine ran
+  // (presumably mirrors foriginal; the allocation is not in this header)
+  cCudaData<double, F_FLOAT, x>* cu_foriginal;
+
+  // 0 = foriginal_all must be reduced again, 1 = already reduced
+  int force_flag;
+
+  // number of rRESPA levels, compared against ilevel in post_force_respa()
+  int nlevels_respa;
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-CUDA/fix_shake_cuda.cpp b/src/USER-CUDA/fix_shake_cuda.cpp
new file mode 100644
index 0000000000..dd221e8536
--- /dev/null
+++ b/src/USER-CUDA/fix_shake_cuda.cpp
@@ -0,0 +1,2619 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under 
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#include "mpi.h"
+#include <cmath>
+#include <cstdlib>
+#include <cstring>
+#include <cstdio>
+#include <ctime>
+#include "fix_shake_cuda.h"
+#include "fix_shake_cuda_cu.h"
+#include "atom.h"
+#include "update.h"
+#include "respa.h"
+#include "modify.h"
+#include "domain.h"
+#include "force.h"
+#include "bond.h"
+#include "angle.h"
+#include "comm.h"
+#include "group.h"
+#include "fix_respa.h"
+#include "memory.h"
+#include "error.h"
+#include "cuda.h"
+#include "cuda_modify_flags.h"
+
+using namespace LAMMPS_NS;
+
+#define BIG 1.0e20
+#define MASSDELTA 0.1
+#define MIN(a,b) ((a) < (b) ? (a) : (b))
+#define MAX(a,b) ((a) > (b) ? (a) : (b))
+
+/* ---------------------------------------------------------------------- */
+
+FixShakeCuda::FixShakeCuda(LAMMPS *lmp, int narg, char **arg) :
+  Fix(lmp, narg, arg)
+{
+  // Constructor: parses the fix arguments, allocates the per-type and
+  // per-atom SHAKE arrays together with their device wrappers, identifies
+  // the SHAKE clusters and passes the device pointers to the CUDA layer.
+  //   arg[3] = tolerance, arg[4] = max_iter, arg[5] = output_every
+  //   arg[6..] = keywords "b", "a", "t", "m", each followed by its values
+
+  cuda = lmp->cuda;
+   if(cuda == NULL)
+        error->all("You cannot use a /cuda class, without activating 'cuda' acceleration. Use no '-a' command line argument, or '-a cuda'.");
+
+  cuda->accelerator(0,NULL);
+  MPI_Comm_rank(world,&me);
+  MPI_Comm_size(world,&nprocs);
+  neighbor_step=true;
+  PI = 4.0*atan(1.0);
+
+  virial_flag = 1;
+  create_attribute = 1;
+
+  // timestep factors are passed by value to Cuda_FixShakeCuda_Init() at the
+  // end of this constructor; give them a defined value here, the real
+  // values are assigned in setup()
+
+  dtv = 0.0;
+  dtfsq = 0.0;
+
+  // error check
+
+  if (atom->molecular == 0)
+    error->all("Cannot use fix shake with non-molecular system");
+
+  // perform initial allocation of atom-based arrays
+  // register with Atom class
+
+  shake_flag = NULL;
+  shake_atom = shake_type = NULL;
+  xshake = NULL;
+  cu_shake_flag = NULL;
+  cu_shake_atom = NULL;
+  cu_shake_type = NULL;
+  cu_xshake = NULL;
+  cu_list = NULL;
+  cu_bond_distance = NULL;
+  cu_angle_distance = NULL;
+  cu_virial = new cCudaData<double 	  , ENERGY_FLOAT , xx >(virial,6);
+  grow_arrays(atom->nmax);
+  atom->add_callback(0);
+
+  // set comm size needed by this fix
+
+  comm_forward = 3;
+
+  // parse SHAKE args
+
+  if (narg < 8) error->all("Illegal fix shake command");
+
+  tolerance = atof(arg[3]);
+  max_iter = atoi(arg[4]);
+  output_every = atoi(arg[5]);
+
+  // parse SHAKE args for bond and angle types
+  // will be used by find_clusters
+  // store args for "b" "a" "t" as flags in (1:n) list for fast access
+  // store args for "m" in list of length nmass for looping over
+  // for "m" verify that atom masses have been set
+
+  bond_flag = new int[atom->nbondtypes+1];
+  for (int i = 1; i <= atom->nbondtypes; i++) bond_flag[i] = 0;
+  angle_flag = new int[atom->nangletypes+1];
+  for (int i = 1; i <= atom->nangletypes; i++) angle_flag[i] = 0;
+  type_flag = new int[atom->ntypes+1];
+  for (int i = 1; i <= atom->ntypes; i++) type_flag[i] = 0;
+  mass_list = new double[atom->ntypes];
+  nmass = 0;
+
+  // mode remembers the most recent keyword; every following non-keyword
+  // argument is interpreted as a value for that keyword
+
+  char mode = '\0';
+  int next = 6;
+  while (next < narg) {
+
+    if (strcmp(arg[next],"b") == 0) mode = 'b';
+    else if (strcmp(arg[next],"a") == 0) mode = 'a';
+    else if (strcmp(arg[next],"t") == 0) mode = 't';
+    else if (strcmp(arg[next],"m") == 0) {
+      mode = 'm';
+      atom->check_mass();
+
+    } else if (mode == 'b') {
+      int i = atoi(arg[next]);
+      if (i < 1 || i > atom->nbondtypes)
+        error->all("Invalid bond type index for fix shake");
+      bond_flag[i] = 1;
+
+    } else if (mode == 'a') {
+      int i = atoi(arg[next]);
+      if (i < 1 || i > atom->nangletypes)
+        error->all("Invalid angle type index for fix shake");
+      angle_flag[i] = 1;
+
+    } else if (mode == 't') {
+      int i = atoi(arg[next]);
+      if (i < 1 || i > atom->ntypes)
+        error->all("Invalid atom type index for fix shake");
+      type_flag[i] = 1;
+
+    } else if (mode == 'm') {
+      double massone = atof(arg[next]);
+      if (massone == 0.0) error->all("Invalid atom mass for fix shake");
+      if (nmass == atom->ntypes) error->all("Too many masses for fix shake");
+      mass_list[nmass++] = massone;
+
+    // a value that appears before any keyword
+    } else error->all("Illegal fix shake command");
+    next++;
+  }
+
+  // allocate bond and angle distance arrays, indexed from 1 to n
+
+  bond_distance = new double[atom->nbondtypes+1];
+  angle_distance = new double[atom->nangletypes+1];
+
+  cu_bond_distance = new cCudaData<double, X_FLOAT, xx> (bond_distance, atom->nbondtypes+1);
+  cu_angle_distance = new cCudaData<double, X_FLOAT, xx> (angle_distance, atom->nangletypes+1);
+
+  // allocate statistics arrays
+  // only allocated when output is requested; the destructor uses the same test
+
+  if (output_every) {
+    int nb = atom->nbondtypes + 1;
+    b_count = new int[nb];
+    b_count_all = new int[nb];
+    b_ave = new double[nb];
+    b_ave_all = new double[nb];
+    b_max = new double[nb];
+    b_max_all = new double[nb];
+    b_min = new double[nb];
+    b_min_all = new double[nb];
+
+    int na = atom->nangletypes + 1;
+    a_count = new int[na];
+    a_count_all = new int[na];
+    a_ave = new double[na];
+    a_ave_all = new double[na];
+    a_max = new double[na];
+    a_max_all = new double[na];
+    a_min = new double[na];
+    a_min_all = new double[na];
+  }
+
+  cudable_comm=true;
+  // identify all SHAKE clusters
+
+  find_clusters();
+
+  // initialize list of SHAKE clusters to constrain
+
+  maxlist = 0;
+  list = NULL;
+
+  // hand the device pointers and solver parameters to the CUDA layer
+
+  Cuda_FixShakeCuda_Init(&cuda->shared_data,dtv, dtfsq,
+  	cu_shake_flag->dev_data(),cu_shake_atom->dev_data(),cu_shake_type->dev_data(), cu_xshake->dev_data(),
+  	cu_bond_distance->dev_data(),cu_angle_distance->dev_data(),cu_virial->dev_data(),
+	max_iter,tolerance);
+
+
+}
+
+/* ---------------------------------------------------------------------- */
+
+FixShakeCuda::~FixShakeCuda()
+{
+  // Destructor: unregisters the fix from the Atom class, flips the sign of
+  // the bond/angle types of all SHAKE clusters back, and releases every
+  // host array and device wrapper allocated by this fix.
+
+  // unregister callbacks to this fix from Atom class
+
+  atom->delete_callback(id,0);
+
+  // set bond_type and angle_type back to positive for SHAKE clusters
+  // must set for all SHAKE bonds and angles stored by each atom
+
+  int **bond_type = atom->bond_type;
+  int **angle_type = atom->angle_type;
+  int nlocal = atom->nlocal;
+
+  int n;
+  for (int i = 0; i < nlocal; i++) {
+    if (shake_flag[i] == 0) continue;
+    else if (shake_flag[i] == 1) {
+      // flag 1: two bonds plus one angle
+      n = bondfind(i,shake_atom[i][0],shake_atom[i][1]);
+      if (n >= 0) bond_type[i][n] = -bond_type[i][n];
+      n = bondfind(i,shake_atom[i][0],shake_atom[i][2]);
+      if (n >= 0) bond_type[i][n] = -bond_type[i][n];
+      n = anglefind(i,shake_atom[i][1],shake_atom[i][2]);
+      if (n >= 0) angle_type[i][n] = -angle_type[i][n];
+    } else if (shake_flag[i] == 2) {
+      // flag 2: one bond
+      n = bondfind(i,shake_atom[i][0],shake_atom[i][1]);
+      if (n >= 0) bond_type[i][n] = -bond_type[i][n];
+    } else if (shake_flag[i] == 3) {
+      // flag 3: two bonds
+      n = bondfind(i,shake_atom[i][0],shake_atom[i][1]);
+      if (n >= 0) bond_type[i][n] = -bond_type[i][n];
+      n = bondfind(i,shake_atom[i][0],shake_atom[i][2]);
+      if (n >= 0) bond_type[i][n] = -bond_type[i][n];
+    } else if (shake_flag[i] == 4) {
+      // flag 4: three bonds
+      n = bondfind(i,shake_atom[i][0],shake_atom[i][1]);
+      if (n >= 0) bond_type[i][n] = -bond_type[i][n];
+      n = bondfind(i,shake_atom[i][0],shake_atom[i][2]);
+      if (n >= 0) bond_type[i][n] = -bond_type[i][n];
+      n = bondfind(i,shake_atom[i][0],shake_atom[i][3]);
+      if (n >= 0) bond_type[i][n] = -bond_type[i][n];
+    }
+  }
+
+  // delete locally stored arrays
+
+  memory->destroy(shake_flag);
+  memory->destroy(shake_atom);
+  memory->destroy(shake_type);
+  memory->destroy(xshake);
+
+  delete [] bond_flag;
+  delete [] angle_flag;
+  delete [] type_flag;
+  delete [] mass_list;
+
+  delete [] bond_distance;
+  delete [] angle_distance;
+
+  // statistics arrays exist only if output was requested in the constructor
+
+  if (output_every) {
+    delete [] b_count;
+    delete [] b_count_all;
+    delete [] b_ave;
+    delete [] b_ave_all;
+    delete [] b_max;
+    delete [] b_max_all;
+    delete [] b_min;
+    delete [] b_min_all;
+
+    delete [] a_count;
+    delete [] a_count_all;
+    delete [] a_ave;
+    delete [] a_ave_all;
+    delete [] a_max;
+    delete [] a_max_all;
+    delete [] a_min;
+    delete [] a_min_all;
+  }
+
+  memory->destroy(list);
+
+  // delete the device wrappers
+  // cu_virial is allocated in the constructor and has to be released as well
+
+  delete cu_shake_flag;
+  delete cu_shake_atom;
+  delete cu_shake_type;
+  delete cu_xshake;
+  delete cu_list;
+  delete cu_bond_distance;
+  delete cu_angle_distance;
+  delete cu_virial;
+}
+
+/* ---------------------------------------------------------------------- */
+
+int FixShakeCuda::setmask()
+{
+  // bit mask of the hooks this fix takes part in
+  int hooks = PRE_NEIGHBOR_CUDA | POST_FORCE_CUDA | POST_FORCE_RESPA;
+  return hooks;
+}
+
+/* ----------------------------------------------------------------------
+   set bond and angle distances
+   this init must happen after force->bond and force->angle inits 
+------------------------------------------------------------------------- */
+
+void FixShakeCuda::init()
+{
+  // Performs consistency checks against the other fixes and the run style,
+  // caches the rRESPA parameters, and fills bond_distance[] and
+  // angle_distance[] (both indexed from 1) from the bond and angle styles.
+
+  int i,m,flag,flag_all,type1,type2,bond1_type,bond2_type;
+  double rsq,angle;
+
+  // error if more than one shake fix
+  // NOTE(review): the comparisons below match the exact style string "shake";
+  //   other fixes in this package register with a "/cuda" suffix
+  //   (e.g. setforce/cuda) - confirm that this fix is actually counted
+
+  int count = 0;
+  for (i = 0; i < modify->nfix; i++)
+    if (strcmp(modify->fix[i]->style,"shake") == 0) count++;
+  if (count > 1) error->all("More than one fix shake");
+
+  // cannot use with minimization since SHAKE turns off bonds
+  // that should contribute to potential energy
+
+  if (update->whichflag == 2)
+    error->all("Fix shake cannot be used with minimization");
+
+  // error if npt,nph fix comes before shake fix
+  // i stops at the first npt/nph fix; any shake fix at or after that
+  //   position triggers the error
+
+  for (i = 0; i < modify->nfix; i++) {
+    if (strcmp(modify->fix[i]->style,"npt") == 0) break;
+    if (strcmp(modify->fix[i]->style,"nph") == 0) break;
+  }
+  if (i < modify->nfix) {
+    for (int j = i; j < modify->nfix; j++)
+      if (strcmp(modify->fix[j]->style,"shake") == 0)
+	error->all("Shake fix must come before NPT/NPH fix");
+  }
+
+  // if rRESPA, find associated fix that must exist
+  // could have changed locations in fix list since created
+  // set ptrs to rRESPA variables
+  // NOTE(review): ifix_respa is left unassigned here if no fix with
+  //   style "RESPA" is found
+
+  if (strcmp(update->integrate_style,"respa") == 0) {
+    for (i = 0; i < modify->nfix; i++)
+      if (strcmp(modify->fix[i]->style,"RESPA") == 0) ifix_respa = i;
+    nlevels_respa = ((Respa *) update->integrate)->nlevels;
+    loop_respa = ((Respa *) update->integrate)->loop;
+    step_respa = ((Respa *) update->integrate)->step;
+  }
+
+  // set equilibrium bond distances
+
+  if (force->bond == NULL)
+    error->all("Bond potential must be defined for SHAKE");
+  for (i = 1; i <= atom->nbondtypes; i++) 
+    bond_distance[i] = force->bond->equilibrium_distance(i);
+
+  // set equilibrium angle distances
+  // only angle types flagged in the fix arguments are processed
+
+  int nlocal = atom->nlocal;
+
+  for (i = 1; i <= atom->nangletypes; i++) {
+    if (angle_flag[i] == 0) continue;
+    if (force->angle == NULL)
+      error->all("Angle potential must be defined for SHAKE");
+
+    // scan all atoms for a SHAKE angle cluster
+    // extract bond types for the 2 bonds in the cluster
+    // bond types must be same in all clusters of this angle type,
+    //   else set error flag
+    
+    flag = 0;
+    bond1_type = bond2_type = 0;
+    for (m = 0; m < nlocal; m++) {
+      if (shake_flag[m] != 1) continue;
+      if (shake_type[m][2] != i) continue;
+      // order the pair so that clusters can be compared directly
+      type1 = MIN(shake_type[m][0],shake_type[m][1]);
+      type2 = MAX(shake_type[m][0],shake_type[m][1]);
+      if (bond1_type > 0) {
+	if (type1 != bond1_type || type2 != bond2_type) {
+	  flag = 1;
+	  break;
+	}
+      }
+      bond1_type = type1;
+      bond2_type = type2;
+    }
+
+    // error check for any bond types that are not the same
+    
+    MPI_Allreduce(&flag,&flag_all,1,MPI_INT,MPI_MAX,world);
+    if (flag_all) error->all("Shake angles have different bond types");
+    
+    // ensure all procs have bond types
+    // flag_all is reused as the receive buffer of the reductions
+    
+    MPI_Allreduce(&bond1_type,&flag_all,1,MPI_INT,MPI_MAX,world);
+    bond1_type = flag_all;
+    MPI_Allreduce(&bond2_type,&flag_all,1,MPI_INT,MPI_MAX,world);
+    bond2_type = flag_all;
+    
+    // if bond types are 0, no SHAKE angles of this type exist
+    // just skip this angle
+    
+    if (bond1_type == 0) {
+      angle_distance[i] = 0.0;
+      continue;
+    }
+
+    // compute the angle distance as a function of 2 bond distances
+    // angle_distance = sqrt(2 * d1 * d2 * (1 - cos(angle)))
+    
+    angle = force->angle->equilibrium_angle(i);
+    rsq = 2.0*bond_distance[bond1_type]*bond_distance[bond2_type] * 
+      (1.0-cos(angle));
+    angle_distance[i] = sqrt(rsq);
+  }
+}
+
+/* ----------------------------------------------------------------------
+   SHAKE as pre-integrator constraint 
+------------------------------------------------------------------------- */
+
+void FixShakeCuda::setup(int vflag)
+{
+  // Run setup: builds the cluster list, schedules the statistics output,
+  // applies one constraint pass with half-step factors, switches to the
+  // full-step factors and re-registers the parameters with the CUDA layer.
+
+  pre_neighbor();
+
+  if (output_every) stats();
+
+  // setup SHAKE output
+  //   no output requested    -> a step after the end of the run
+  //   not on an output step  -> next multiple of output_every
+  //   on an output step      -> one interval later
+
+  int ntimestep = update->ntimestep;
+  if (output_every == 0)
+    next_output = update->laststep + 1;
+  else if (ntimestep % output_every != 0)
+    next_output = (ntimestep/output_every)*output_every + output_every;
+  else
+    next_output = ntimestep + output_every;
+
+  // half timestep constraint on pre-step, full timestep thereafter
+
+  if (strcmp(update->integrate_style,"verlet") != 0) {
+    Respa *respa = (Respa *) update->integrate;
+
+    dtv = step_respa[0];
+    dtf_innerhalf = 0.5 * step_respa[0] * force->ftm2v;
+    dtf_inner = dtf_innerhalf;
+
+    respa->copy_flevel_f(nlevels_respa-1);
+    post_force_respa(vflag,nlevels_respa-1,0);
+    respa->copy_f_flevel(nlevels_respa-1);
+
+    dtf_inner = step_respa[0] * force->ftm2v;
+  } else {
+    dtv = update->dt;
+    dtfsq = 0.5 * update->dt * update->dt * force->ftm2v;
+
+    post_force(vflag);
+
+    dtfsq = update->dt * update->dt * force->ftm2v;
+  }
+
+  // pass the current timestep factors and device pointers to the CUDA layer
+
+  Cuda_FixShakeCuda_Init(&cuda->shared_data,
+                         dtv, dtfsq,
+                         cu_shake_flag->dev_data(),
+                         cu_shake_atom->dev_data(),
+                         cu_shake_type->dev_data(),
+                         cu_xshake->dev_data(),
+                         cu_bond_distance->dev_data(),
+                         cu_angle_distance->dev_data(),
+                         cu_virial->dev_data(),
+                         max_iter,tolerance);
+}
+
+/* ----------------------------------------------------------------------
+   build list of SHAKE clusters to constrain
+   if one or more atoms in cluster are on this proc,
+     this proc lists the cluster exactly once 
+------------------------------------------------------------------------- */
+
+void FixShakeCuda::pre_neighbor()
+{
+  // Rebuilds list[] / nlist for the current set of owned atoms, reorders
+  // the leading entries by cluster flag (2, then 3, then 4) and uploads
+  // the list and the SHAKE arrays through their device wrappers.
+
+  int atom1,atom2,atom3,atom4;
+
+  // local copies of atom quantities
+  // used by SHAKE until next re-neighboring
+
+  x = atom->x;
+  v = atom->v;
+  f = atom->f;
+  mass = atom->mass;
+  rmass = atom->rmass;
+  type = atom->type;
+  nlocal = atom->nlocal;
+
+  // extend size of SHAKE list if necessary
+  // the device wrapper is recreated so that it refers to the new host array
+
+  if (nlocal > maxlist) {
+    maxlist = nlocal;
+    memory->destroy(list);
+    memory->create(list,maxlist,"shake:list");
+    delete cu_list; cu_list = new cCudaData<int 	  , int	    , xx >(list,maxlist);
+  }
+
+  // build list of SHAKE clusters I compute
+  // count2/count3/count4/count3a count owned atoms by shake_flag value
+  // an atom is listed only if its index is the smallest local index
+  //   among the atoms of its cluster
+
+  nlist = 0;
+  int count2=0,count3=0,count4=0,count3a=0;
+  for (int i = 0; i < nlocal; i++)
+    if (shake_flag[i]) {
+    	if(shake_flag[i] == 2) count2++;
+    	if(shake_flag[i] == 3) count3++;
+    	if(shake_flag[i] == 4) count4++;
+    	if(shake_flag[i] == 1) count3a++;
+    	
+      if (shake_flag[i] == 2) {
+	atom1 = atom->map(shake_atom[i][0]);
+	atom2 = atom->map(shake_atom[i][1]);
+	if (atom1 == -1 || atom2 == -1) {
+	  char str[128];
+	  sprintf(str,
+		  "Shake atoms %d %d missing on proc %d at step " BIGINT_FORMAT,
+		  shake_atom[i][0],shake_atom[i][1],me,update->ntimestep);
+	  error->one(str);
+	}
+	if (i <= atom1 && i <= atom2) list[nlist++] = i;
+      } else if (shake_flag[i] % 2 == 1) {
+	// odd flags (1 and 3) are three-atom clusters
+	atom1 = atom->map(shake_atom[i][0]);
+	atom2 = atom->map(shake_atom[i][1]);
+	atom3 = atom->map(shake_atom[i][2]);
+	if (atom1 == -1 || atom2 == -1 || atom3 == -1) {
+	  char str[128];
+	  sprintf(str,
+		  "Shake atoms %d %d %d missing on proc %d at step " 
+		  BIGINT_FORMAT,
+		  shake_atom[i][0],shake_atom[i][1],shake_atom[i][2],
+		  me,update->ntimestep);
+	  error->one(str);
+	}
+	if (i <= atom1 && i <= atom2 && i <= atom3) list[nlist++] = i;
+      } else {
+	// remaining case: four-atom clusters
+	atom1 = atom->map(shake_atom[i][0]);
+	atom2 = atom->map(shake_atom[i][1]);
+	atom3 = atom->map(shake_atom[i][2]);
+	atom4 = atom->map(shake_atom[i][3]);
+	if (atom1 == -1 || atom2 == -1 || atom3 == -1 || atom4 == -1) {
+	  char str[128];
+	  sprintf(str,
+		  "Shake atoms %d %d %d %d missing on proc %d at step " 
+		  BIGINT_FORMAT,
+		  shake_atom[i][0],shake_atom[i][1],
+		  shake_atom[i][2],shake_atom[i][3],
+		  me,update->ntimestep);
+	  error->one(str);
+	}
+	if (i <= atom1 && i <= atom2 && i <= atom3 && i <= atom4) 
+	  list[nlist++] = i;
+      }
+    }
+    // convert the atom counts into cluster counts (integer division by the
+    // cluster size), then into cumulative end offsets within list[]
+    // NOTE(review): count3a is not read again in this function
+    count2/=2;
+    count3/=3;
+    count4/=4;
+    count3a/=3;
+    count3+=count2;
+    count4+=count3;
+    count3a+=count4;
+    // move flag-2 entries into list[0 .. count2-1] by swapping with the
+    // next flag-2 entry found at or behind position l
+    // NOTE(review): on failure the function prints a message and returns
+    //   before the uploads at the end are executed
+    for(int k = 0,l = count2; k < count2; k++)
+    {
+      if(shake_flag[list[k]]!=2)
+      {
+        while(shake_flag[list[l]]!=2 && l<nlist-1) l++;
+        if(shake_flag[list[l]]!=2) {printf("FixShakeCuda: Error in List SortA %i %i\n",k,l);return;}
+        int tmp = list[k]; list[k]=list[l]; list[l]=tmp;
+      }
+    }
+    
+    // same procedure for flag-3 entries in list[count2 .. count3-1]
+    for(int k = count2,l = count3; k < count3; k++)
+    {
+      if(shake_flag[list[k]]!=3)
+      {
+        while(shake_flag[list[l]]!=3 && l<nlist-1) l++;
+        if(shake_flag[list[l]]!=3) {printf("FixShakeCuda: Error in List SortB %i %i\n",k,l);return;}
+        int tmp = list[k]; list[k]=list[l]; list[l]=tmp;
+      }
+    }
+    
+    // same procedure for flag-4 entries in list[count3 .. count4-1]
+    for(int k = count3,l = count4; k < count4; k++)
+    {
+      if(shake_flag[list[k]]!=4)
+      {
+        while(shake_flag[list[l]]!=4 && l<nlist-1) l++;
+        if(shake_flag[list[l]]!=4) {printf("FixShakeCuda: Error in List SortC %i %i\n",k,l);return;}
+        int tmp = list[k]; list[k]=list[l]; list[l]=tmp;
+      }
+    }
+    // push the list and the SHAKE arrays through their device wrappers
+  	cu_list->upload();
+  	cu_bond_distance->upload();
+  	cu_angle_distance->upload();
+  	cu_shake_flag->upload();
+  	cu_shake_atom->upload();
+  	cu_shake_type->upload();
+  	
+    // post_force() checks this flag before calling Cuda_FixShakeCuda_Init
+    neighbor_step=true;
+}
+
+/* ----------------------------------------------------------------------
+   compute the force adjustment for SHAKE constraint 
+------------------------------------------------------------------------- */
+
+void FixShakeCuda::post_force(int vflag)
+{
+  // Applies the SHAKE constraints. Once cuda->finished_setup is set the
+  // clusters are handled by Cuda_FixShakeCuda_Shake; before that the host
+  // routines shake2/shake3/shake4/shake3angle are used and the forces are
+  // uploaded afterwards.
+
+  // re-register parameters with the CUDA layer while neighbor_step is set
+  // (pre_neighbor() sets it)
+
+  if (cuda->finished_setup && neighbor_step) {
+    Cuda_FixShakeCuda_Init(&cuda->shared_data,
+                           dtv, dtfsq,
+                           cu_shake_flag->dev_data(),
+                           cu_shake_atom->dev_data(),
+                           cu_shake_type->dev_data(),
+                           cu_xshake->dev_data(),
+                           cu_bond_distance->dev_data(),
+                           cu_angle_distance->dev_data(),
+                           cu_virial->dev_data(),
+                           max_iter,tolerance);
+  }
+
+  // during setup the host path is used, so fetch the data first
+
+  if (!cuda->finished_setup) {
+    cuda->downloadAll();
+  }
+
+  // statistics output; in device mode the positions are downloaded first
+
+  if (update->ntimestep == next_output) {
+    if (cuda->finished_setup) {
+      cuda->cu_x->download();
+    }
+    stats();
+  }
+
+  // xshake = unconstrained move with current v,f
+
+  unconstrained_update();
+
+  // communicate results if necessary
+
+  if (nprocs > 1) {
+    comm->forward_comm_fix(this);
+  }
+
+  // virial setup
+
+  if (vflag) {
+    v_setup(vflag);
+  } else {
+    evflag = 0;
+  }
+
+  // loop over clusters, timed with the wall clock
+
+  timespec t_begin;
+  timespec t_end;
+  clock_gettime(CLOCK_REALTIME,&t_begin);
+
+  if (cuda->finished_setup) {
+    cu_virial->upload();
+    if (vflag_atom) cuda->cu_vatom->upload();
+
+    Cuda_FixShakeCuda_Shake(&cuda->shared_data,vflag,vflag_atom,
+                            (int*)cu_list->dev_data(),nlist);
+
+    cu_virial->download();
+    if (vflag_atom) cuda->cu_vatom->download();
+  } else {
+    for (int idx = 0; idx < nlist; idx++) {
+      int m = list[idx];
+      switch (shake_flag[m]) {
+        case 2:  shake2(m); break;
+        case 3:  shake3(m); break;
+        case 4:  shake4(m); break;
+        default: shake3angle(m); break;
+      }
+    }
+  }
+
+  if (!cuda->finished_setup) {
+    cuda->cu_f->upload();
+  }
+
+  clock_gettime(CLOCK_REALTIME,&t_end);
+
+  // accumulate the elapsed time in device mode, reset it otherwise
+
+  if (cuda->finished_setup) {
+    time_postforce += (t_end.tv_sec-t_begin.tv_sec+1.0*(t_end.tv_nsec-t_begin.tv_nsec)/1000000000);
+  } else {
+    time_postforce = 0.0;
+  }
+}
+
+/* ----------------------------------------------------------------------
+   count # of degrees-of-freedom removed by SHAKE for atoms in igroup 
+------------------------------------------------------------------------- */
+
+int FixShakeCuda::dof(int igroup)
+{
+  // Returns the number of constraints, summed over all processors, of the
+  // clusters whose first listed atom is owned here and belongs to igroup.
+
+  const int groupbit = group->bitmask[igroup];
+
+  int *mask = atom->mask;
+  int *tag = atom->tag;
+  const int nowned = atom->nlocal;
+
+  // count dof in a cluster if and only if
+  // the central atom is in group and atom i is the central atom
+
+  int removed = 0;
+  for (int i = 0; i < nowned; i++) {
+    if ((mask[i] & groupbit) && shake_flag[i] != 0 &&
+        shake_atom[i][0] == tag[i]) {
+      switch (shake_flag[i]) {
+        case 1:            // 2 bonds + 1 angle
+        case 4:            // 3 bonds
+          removed += 3;
+          break;
+        case 2:            // 1 bond
+          removed += 1;
+          break;
+        case 3:            // 2 bonds
+          removed += 2;
+          break;
+        default:
+          break;
+      }
+    }
+  }
+
+  int removed_all;
+  MPI_Allreduce(&removed,&removed_all,1,MPI_INT,MPI_SUM,world);
+  return removed_all;
+}
+
+/* ----------------------------------------------------------------------
+   identify whether each atom is in a SHAKE cluster
+   only include atoms in fix group and those bonds/angles specified in input
+   test whether all clusters are valid
+   set shake_flag, shake_atom, shake_type values
+   set bond,angle types negative so will be ignored in neighbor lists 
+------------------------------------------------------------------------- */
+
+void FixShakeCuda::find_clusters()
+{
+  int i,j,m,n;
+  int flag,flag_all,messtag,loop,nbuf,nbufmax,size;
+  double massone;
+  int *buf,*bufcopy;
+  MPI_Request request;
+  MPI_Status status;
+
+  if (me == 0 && screen) fprintf(screen,"Finding SHAKE clusters ...\n");
+
+  // local copies of atom ptrs
+
+  int *tag = atom->tag;
+  int *type = atom->type;
+  int *mask = atom->mask;
+  double *mass = atom->mass;
+  double *rmass = atom->rmass;
+  int **bond_type = atom->bond_type;
+  int **angle_type = atom->angle_type;
+  int **nspecial = atom->nspecial;
+  int **special = atom->special;
+  int nlocal = atom->nlocal;
+
+  // setup ring of procs
+
+  int next = me + 1;
+  int prev = me -1; 
+  if (next == nprocs) next = 0;
+  if (prev < 0) prev = nprocs - 1;
+
+  // -----------------------------------------------------
+  // allocate arrays for self (1d) and bond partners (2d)
+  // max = max # of bond partners for owned atoms = 2nd dim of partner arrays
+  // npartner[i] = # of bonds attached to atom i
+  // nshake[i] = # of SHAKE bonds attached to atom i
+  // partner_tag[i][] = global IDs of each partner
+  // partner_mask[i][] = mask of each partner
+  // partner_type[i][] = type of each partner
+  // partner_massflag[i][] = 1 if partner meets mass criterion, 0 if not
+  // partner_bondtype[i][] = type of bond attached to each partner
+  // partner_shake[i][] = 1 if SHAKE bonded to partner, 0 if not
+  // partner_nshake[i][] = nshake value for each partner
+  // -----------------------------------------------------
+
+  int max = 0;
+  for (i = 0; i < nlocal; i++) max = MAX(max,nspecial[i][0]);
+
+  int *npartner,*nshake;
+  memory->create(npartner,nlocal,"shake:npartner");
+  memory->create(nshake,nlocal,"shake:nshake");
+
+  int **partner_tag,**partner_mask,**partner_type,**partner_massflag;
+  int ** partner_bondtype,**partner_shake,**partner_nshake;
+  memory->create(partner_tag,nlocal,max,"shake:partner_tag");
+  memory->create(partner_mask,nlocal,max,"shake:partner_mask");
+  memory->create(partner_type,nlocal,max,"shake:partner_type");
+  memory->create(partner_massflag,nlocal,max,"shake:partner_massflag");
+  memory->create(partner_bondtype,nlocal,max,"shake:partner_bondtype");
+  memory->create(partner_shake,nlocal,max,"shake:partner_shake");
+  memory->create(partner_nshake,nlocal,max,"shake:partner_nshake");
+
+  // -----------------------------------------------------
+  // set npartner and partner_tag from special arrays
+  // -----------------------------------------------------
+
+  for (i = 0; i < nlocal; i++) {
+    npartner[i] = nspecial[i][0];
+    for (j = 0; j < npartner[i]; j++) partner_tag[i][j] = special[i][j];
+  }
+
+  // -----------------------------------------------------
+  // set partner_mask, partner_type, partner_massflag, partner_bondtype
+  //   for bonded partners
+  // requires communication for off-proc partners
+  // -----------------------------------------------------
+
+  // fill in mask, type, massflag, bondtype if own bond partner
+  // info to store in buf for each off-proc bond = nper = 6
+  //   2 atoms IDs in bond, space for mask, type, massflag, bondtype
+  // nbufmax = largest buffer needed to hold info from any proc
+
+  int nper = 6;
+
+  nbuf = 0;
+  for (i = 0; i < nlocal; i++) {
+    for (j = 0; j < npartner[i]; j++) {
+      partner_mask[i][j] = 0;
+      partner_type[i][j] = 0;
+      partner_massflag[i][j] = 0;
+      partner_bondtype[i][j] = 0;
+
+      m = atom->map(partner_tag[i][j]);
+      if (m >= 0 && m < nlocal) {
+	partner_mask[i][j] = mask[m];
+	partner_type[i][j] = type[m];
+	if (nmass) {
+	  if (rmass) massone = rmass[m];
+	  else massone = mass[type[m]];
+	  partner_massflag[i][j] = masscheck(massone);
+	}
+	n = bondfind(i,tag[i],partner_tag[i][j]);
+	if (n >= 0) partner_bondtype[i][j] = bond_type[i][n];
+	else {
+	  n = bondfind(m,tag[i],partner_tag[i][j]);
+	  if (n >= 0) partner_bondtype[i][j] = bond_type[m][n];
+	}
+      } else nbuf += nper;
+    }
+  }
+
+  MPI_Allreduce(&nbuf,&nbufmax,1,MPI_INT,MPI_MAX,world);
+
+  buf = new int[nbufmax];
+  bufcopy = new int[nbufmax];
+
+  // fill buffer with info
+
+  size = 0;
+  for (i = 0; i < nlocal; i++) {
+    for (j = 0; j < npartner[i]; j++) {
+      m = atom->map(partner_tag[i][j]);
+      if (m < 0 || m >= nlocal) {
+	buf[size] = tag[i];
+	buf[size+1] = partner_tag[i][j];
+	buf[size+2] = 0;
+	buf[size+3] = 0;
+	buf[size+4] = 0;
+	n = bondfind(i,tag[i],partner_tag[i][j]);
+	if (n >= 0) buf[size+5] = bond_type[i][n];
+	else buf[size+5] = 0;
+	size += nper;
+      }
+    }
+  }
+
+  // cycle buffer around ring of procs back to self
+  // when receive buffer, scan bond partner IDs for atoms I own
+  // if I own partner:
+  //   fill in mask and type and massflag
+  //   search for bond with 1st atom and fill in bondtype
+
+  messtag = 1;
+  for (loop = 0; loop < nprocs; loop++) {
+    i = 0;
+    while (i < size) {
+      m = atom->map(buf[i+1]);
+      if (m >= 0 && m < nlocal) {
+	buf[i+2] = mask[m];
+	buf[i+3] = type[m];
+	if (nmass) {
+	  if (rmass) massone = rmass[m];
+	  else massone = mass[type[m]];
+	  buf[i+4] = masscheck(massone);
+	}
+	if (buf[i+5] == 0) {
+	  n = bondfind(m,buf[i],buf[i+1]);
+	  if (n >= 0) buf[i+5] = bond_type[m][n];
+	}
+      }
+      i += nper;
+    }
+    if (me != next) {
+      MPI_Irecv(bufcopy,nbufmax,MPI_INT,prev,messtag,world,&request);
+      MPI_Send(buf,size,MPI_INT,next,messtag,world);
+      MPI_Wait(&request,&status);
+      MPI_Get_count(&status,MPI_INT,&size);
+      for (j = 0; j < size; j++) buf[j] = bufcopy[j];
+    }
+  }
+
+  // store partner info returned to me
+
+  m = 0;
+  while (m < size) {
+    i = atom->map(buf[m]);
+    for (j = 0; j < npartner[i]; j++)
+      if (buf[m+1] == partner_tag[i][j]) break;
+    partner_mask[i][j] = buf[m+2];
+    partner_type[i][j] = buf[m+3];
+    partner_massflag[i][j] = buf[m+4];
+    partner_bondtype[i][j] = buf[m+5];
+    m += nper;
+  }
+
+  delete [] buf;
+  delete [] bufcopy;
+
+  // error check for unfilled partner info
+  // if partner_type not set, is an error
+  // partner_bondtype may not be set if special list is not consistent
+  //   with bondatom (e.g. due to delete_bonds command)
+  // this is OK if one or both atoms are not in fix group, since
+  //   bond won't be SHAKEn anyway
+  // else it's an error
+
+  flag = 0;
+  for (i = 0; i < nlocal; i++)
+    for (j = 0; j < npartner[i]; j++) {
+      if (partner_type[i][j] == 0) flag = 1;
+      if (!(mask[i] & groupbit)) continue;
+      if (!(partner_mask[i][j] & groupbit)) continue;
+      if (partner_bondtype[i][j] == 0) flag = 1;
+    }
+
+  MPI_Allreduce(&flag,&flag_all,1,MPI_INT,MPI_SUM,world);
+  if (flag_all) error->all("Did not find fix shake partner info");
+
+  // -----------------------------------------------------
+  // identify SHAKEable bonds
+  // set nshake[i] = # of SHAKE bonds attached to atom i
+  // set partner_shake[i][] = 1 if SHAKE bonded to partner, 0 if not
+  // both atoms must be in group, bondtype must be > 0
+  // check if bondtype is in input bond_flag
+  // check if type of either atom is in input type_flag
+  // check if mass of either atom is in input mass_list
+  // -----------------------------------------------------
+
+  int np;
+
+  for (i = 0; i < nlocal; i++) {
+    nshake[i] = 0;
+    np = npartner[i];
+    for (j = 0; j < np; j++) {
+      partner_shake[i][j] = 0;
+
+      if (!(mask[i] & groupbit)) continue;
+      if (!(partner_mask[i][j] & groupbit)) continue;
+      if (partner_bondtype[i][j] <= 0) continue;
+
+      if (bond_flag[partner_bondtype[i][j]]) {
+	partner_shake[i][j] = 1;
+	nshake[i]++;
+	continue;
+      }
+      if (type_flag[type[i]] || type_flag[partner_type[i][j]]) {
+	partner_shake[i][j] = 1;
+	nshake[i]++;
+	continue;
+      }
+      if (nmass) {
+	if (partner_massflag[i][j]) {
+	  partner_shake[i][j] = 1;
+	  nshake[i]++;
+	  continue;
+	} else {
+	  if (rmass) massone = rmass[i];
+	  else massone = mass[type[i]];
+	  if (masscheck(massone)) {
+	    partner_shake[i][j] = 1;
+	    nshake[i]++;
+	    continue;
+	  }
+	}
+      }
+    }
+  }
+
+  // -----------------------------------------------------
+  // set partner_nshake for bonded partners
+  // requires communication for off-proc partners
+  // -----------------------------------------------------
+
+  // fill in partner_nshake if own bond partner
+  // info to store in buf for each off-proc bond =
+  //   2 atoms IDs in bond, space for nshake value
+  // nbufmax = largest buffer needed to hold info from any proc
+
+  nbuf = 0;
+  for (i = 0; i < nlocal; i++) {
+    for (j = 0; j < npartner[i]; j++) {
+      m = atom->map(partner_tag[i][j]);
+      if (m >= 0 && m < nlocal) partner_nshake[i][j] = nshake[m];
+      else nbuf += 3;
+    }
+  }
+  
+  MPI_Allreduce(&nbuf,&nbufmax,1,MPI_INT,MPI_MAX,world);
+
+  buf = new int[nbufmax];
+  bufcopy = new int[nbufmax];
+
+  // fill buffer with info
+
+  size = 0;
+  for (i = 0; i < nlocal; i++) {
+    for (j = 0; j < npartner[i]; j++) {
+      m = atom->map(partner_tag[i][j]);
+      if (m < 0 || m >= nlocal) {
+	buf[size] = tag[i];
+	buf[size+1] = partner_tag[i][j];
+	size += 3;
+      }
+    }
+  }
+
+  // cycle buffer around ring of procs back to self
+  // when receive buffer, scan bond partner IDs for atoms I own
+  // if I own partner, fill in nshake value
+
+  messtag = 2;
+  for (loop = 0; loop < nprocs; loop++) {
+    i = 0;
+    while (i < size) {
+      m = atom->map(buf[i+1]);
+      if (m >= 0 && m < nlocal) buf[i+2] = nshake[m];
+      i += 3;
+    }
+    if (me != next) {
+      MPI_Irecv(bufcopy,nbufmax,MPI_INT,prev,messtag,world,&request);
+      MPI_Send(buf,size,MPI_INT,next,messtag,world);
+      MPI_Wait(&request,&status);
+      MPI_Get_count(&status,MPI_INT,&size);
+      for (j = 0; j < size; j++) buf[j] = bufcopy[j];
+    }
+  }
+
+  // store partner info returned to me
+
+  m = 0;
+  while (m < size) {
+    i = atom->map(buf[m]);
+    for (j = 0; j < npartner[i]; j++)
+      if (buf[m+1] == partner_tag[i][j]) break;
+    partner_nshake[i][j] = buf[m+2];
+    m += 3;
+  }
+
+  delete [] buf;
+  delete [] bufcopy;
+
+  // -----------------------------------------------------
+  // error checks
+  // no atom with nshake > 3
+  // no connected atoms which both have nshake > 1
+  // -----------------------------------------------------
+
+  flag = 0;
+  for (i = 0; i < nlocal; i++) if (nshake[i] > 3) flag = 1;
+  MPI_Allreduce(&flag,&flag_all,1,MPI_INT,MPI_SUM,world);
+  if (flag_all) error->all("Shake cluster of more than 4 atoms");
+
+  flag = 0;
+  for (i = 0; i < nlocal; i++) {
+    if (nshake[i] <= 1) continue;
+    for (j = 0; j < npartner[i]; j++)
+      if (partner_shake[i][j] && partner_nshake[i][j] > 1) flag = 1;
+  }
+  MPI_Allreduce(&flag,&flag_all,1,MPI_INT,MPI_SUM,world);
+  if (flag_all) error->all("Shake clusters are connected");
+
+  // -----------------------------------------------------
+  // set SHAKE arrays that are stored with atoms & add angle constraints
+  // zero shake arrays for all owned atoms
+  // if I am central atom set shake_flag & shake_atom & shake_type 
+  // for 2-atom clusters, I am central atom if my atom ID < partner ID
+  // for 3-atom clusters, test for angle constraint
+  //   angle will be stored by this atom if it exists
+  //   if angle type matches angle_flag, then it is angle-constrained
+  // shake_flag[] = 0 if atom not in SHAKE cluster
+  //                2,3,4 = size of bond-only cluster
+  //                1 = 3-atom angle cluster
+  // shake_atom[][] = global IDs of 2,3,4 atoms in cluster
+  //                  central atom is 1st
+  //                  for 2-atom cluster, lowest ID is 1st
+  // shake_type[][] = bondtype of each bond in cluster
+  //                  for 3-atom angle cluster, 3rd value is angletype
+  // -----------------------------------------------------
+
+  for (i = 0; i < nlocal; i++) {
+    shake_flag[i] = 0;
+    shake_atom[i][0] = 0;
+    shake_atom[i][1] = 0;
+    shake_atom[i][2] = 0;
+    shake_atom[i][3] = 0;
+    shake_type[i][0] = 0;
+    shake_type[i][1] = 0;
+    shake_type[i][2] = 0;
+
+    if (nshake[i] == 1) {
+      for (j = 0; j < npartner[i]; j++)
+	if (partner_shake[i][j]) break;
+      if (partner_nshake[i][j] == 1 && tag[i] < partner_tag[i][j]) {
+	shake_flag[i] = 2;
+	shake_atom[i][0] = tag[i];
+	shake_atom[i][1] = partner_tag[i][j];
+	shake_type[i][0] = partner_bondtype[i][j];
+      }
+    }
+
+    if (nshake[i] > 1) {
+      shake_flag[i] = 1;
+      shake_atom[i][0] = tag[i];
+      for (j = 0; j < npartner[i]; j++)
+	if (partner_shake[i][j]) {
+	  m = shake_flag[i];
+	  shake_atom[i][m] = partner_tag[i][j];
+	  shake_type[i][m-1] = partner_bondtype[i][j];
+	  shake_flag[i]++;
+	}
+    }
+
+    if (nshake[i] == 2) {
+      n = anglefind(i,shake_atom[i][1],shake_atom[i][2]);
+      if (n < 0) continue;
+      if (angle_type[i][n] < 0) continue;
+      if (angle_flag[angle_type[i][n]]) {
+	shake_flag[i] = 1;
+	shake_type[i][2] = angle_type[i][n];
+      }
+    }
+  }
+
+  // -----------------------------------------------------
+  // set shake_flag,shake_atom,shake_type for non-central atoms
+  // requires communication for off-proc atoms
+  // -----------------------------------------------------
+
+  // fill in shake arrays for each bond partner I own
+  // info to store in buf for each off-proc bond =
+  //   all values from shake_flag, shake_atom, shake_type
+  // nbufmax = largest buffer needed to hold info from any proc
+
+  nbuf = 0;
+  for (i = 0; i < nlocal; i++) {
+    if (shake_flag[i] == 0) continue;
+    for (j = 0; j < npartner[i]; j++) {
+      if (partner_shake[i][j] == 0) continue;
+      m = atom->map(partner_tag[i][j]);
+      if (m >= 0 && m < nlocal) {
+	shake_flag[m] = shake_flag[i];
+	shake_atom[m][0] = shake_atom[i][0];
+	shake_atom[m][1] = shake_atom[i][1];
+	shake_atom[m][2] = shake_atom[i][2];
+	shake_atom[m][3] = shake_atom[i][3];
+	shake_type[m][0] = shake_type[i][0];
+	shake_type[m][1] = shake_type[i][1];
+	shake_type[m][2] = shake_type[i][2];
+      } else nbuf += 9;
+    }
+  }
+
+  MPI_Allreduce(&nbuf,&nbufmax,1,MPI_INT,MPI_MAX,world);
+
+  buf = new int[nbufmax];
+  bufcopy = new int[nbufmax];
+
+  // fill buffer with info
+
+  size = 0;
+  for (i = 0; i < nlocal; i++) {
+    if (shake_flag[i] == 0) continue;
+    for (j = 0; j < npartner[i]; j++) {
+      if (partner_shake[i][j] == 0) continue;
+      m = atom->map(partner_tag[i][j]);
+      if (m < 0 || m >= nlocal) {
+	buf[size] = partner_tag[i][j];
+	buf[size+1] = shake_flag[i];
+	buf[size+2] = shake_atom[i][0];
+	buf[size+3] = shake_atom[i][1];
+	buf[size+4] = shake_atom[i][2];
+	buf[size+5] = shake_atom[i][3];
+	buf[size+6] = shake_type[i][0];
+	buf[size+7] = shake_type[i][1];
+	buf[size+8] = shake_type[i][2];
+	size += 9;
+      }
+    }
+  }
+
+  // cycle buffer around ring of procs back to self
+  // when receive buffer, scan for ID that I own
+  // if I own ID, fill in shake array values
+
+  messtag = 3;
+  for (loop = 0; loop < nprocs; loop++) {
+    i = 0;
+    while (i < size) {
+      m = atom->map(buf[i]);
+      if (m >= 0 && m < nlocal) {
+	shake_flag[m] = buf[i+1];
+	shake_atom[m][0] = buf[i+2];
+	shake_atom[m][1] = buf[i+3];
+	shake_atom[m][2] = buf[i+4];
+	shake_atom[m][3] = buf[i+5];
+	shake_type[m][0] = buf[i+6];
+	shake_type[m][1] = buf[i+7];
+	shake_type[m][2] = buf[i+8];
+      }
+      i += 9;
+    }
+    if (me != next) {
+      MPI_Irecv(bufcopy,nbufmax,MPI_INT,prev,messtag,world,&request);
+      MPI_Send(buf,size,MPI_INT,next,messtag,world);
+      MPI_Wait(&request,&status);
+      MPI_Get_count(&status,MPI_INT,&size);
+      for (j = 0; j < size; j++) buf[j] = bufcopy[j];
+    }
+  }
+
+  delete [] buf;
+  delete [] bufcopy;
+
+  // -----------------------------------------------------
+  // free local memory
+  // -----------------------------------------------------
+
+  memory->destroy(npartner);
+  memory->destroy(nshake);
+  memory->destroy(partner_tag);
+  memory->destroy(partner_mask);
+  memory->destroy(partner_type);
+  memory->destroy(partner_massflag);
+  memory->destroy(partner_bondtype);
+  memory->destroy(partner_shake);
+  memory->destroy(partner_nshake);
+
+  // -----------------------------------------------------
+  // set bond_type and angle_type negative for SHAKE clusters
+  // must set for all SHAKE bonds and angles stored by each atom
+  // -----------------------------------------------------
+
+  for (i = 0; i < nlocal; i++) {
+    if (shake_flag[i] == 0) continue;
+    else if (shake_flag[i] == 1) {
+      n = bondfind(i,shake_atom[i][0],shake_atom[i][1]);
+      if (n >= 0) bond_type[i][n] = -bond_type[i][n];
+      n = bondfind(i,shake_atom[i][0],shake_atom[i][2]);
+      if (n >= 0) bond_type[i][n] = -bond_type[i][n];
+      n = anglefind(i,shake_atom[i][1],shake_atom[i][2]);
+      if (n >= 0) angle_type[i][n] = -angle_type[i][n];
+    } else if (shake_flag[i] == 2) {
+      n = bondfind(i,shake_atom[i][0],shake_atom[i][1]);
+      if (n >= 0) bond_type[i][n] = -bond_type[i][n];
+    } else if (shake_flag[i] == 3) {
+      n = bondfind(i,shake_atom[i][0],shake_atom[i][1]);
+      if (n >= 0) bond_type[i][n] = -bond_type[i][n];
+      n = bondfind(i,shake_atom[i][0],shake_atom[i][2]);
+      if (n >= 0) bond_type[i][n] = -bond_type[i][n];
+    } else if (shake_flag[i] == 4) {
+      n = bondfind(i,shake_atom[i][0],shake_atom[i][1]);
+      if (n >= 0) bond_type[i][n] = -bond_type[i][n];
+      n = bondfind(i,shake_atom[i][0],shake_atom[i][2]);
+      if (n >= 0) bond_type[i][n] = -bond_type[i][n];
+      n = bondfind(i,shake_atom[i][0],shake_atom[i][3]);
+      if (n >= 0) bond_type[i][n] = -bond_type[i][n];
+    }
+  }
+
+  // -----------------------------------------------------
+  // print info on SHAKE clusters
+  // -----------------------------------------------------
+
+  int count1,count2,count3,count4;
+  count1 = count2 = count3 = count4 = 0;
+  for (i = 0; i < nlocal; i++) {
+    if (shake_flag[i] == 1) count1++;
+    else if (shake_flag[i] == 2) count2++;
+    else if (shake_flag[i] == 3) count3++;
+    else if (shake_flag[i] == 4) count4++;
+  }
+  
+  for(int i=0;i<nlocal;i++)
+  {
+  }
+  
+
+  int tmp;
+  tmp = count1;
+  MPI_Allreduce(&tmp,&count1,1,MPI_INT,MPI_SUM,world);
+  tmp = count2;
+  MPI_Allreduce(&tmp,&count2,1,MPI_INT,MPI_SUM,world);
+  tmp = count3;
+  MPI_Allreduce(&tmp,&count3,1,MPI_INT,MPI_SUM,world);
+  tmp = count4;
+  MPI_Allreduce(&tmp,&count4,1,MPI_INT,MPI_SUM,world);
+
+  if (me == 0) {
+    if (screen) {
+      fprintf(screen,"  %d = # of size 2 clusters\n",count2/2);
+      fprintf(screen,"  %d = # of size 3 clusters\n",count3/3);
+      fprintf(screen,"  %d = # of size 4 clusters\n",count4/4);
+      fprintf(screen,"  %d = # of frozen angles\n",count1/3);
+    }
+    if (logfile) {
+      fprintf(logfile,"  %d = # of size 2 clusters\n",count2/2);
+      fprintf(logfile,"  %d = # of size 3 clusters\n",count3/3);
+      fprintf(logfile,"  %d = # of size 4 clusters\n",count4/4);
+      fprintf(logfile,"  %d = # of frozen angles\n",count1/3);
+    }
+  }
+  cu_shake_flag->upload();
+  cu_shake_atom->upload();
+  cu_shake_type->upload();
+  Cuda_FixShakeCuda_Init(&cuda->shared_data,dtv, dtfsq,
+  		cu_shake_flag->dev_data(),cu_shake_atom->dev_data(),cu_shake_type->dev_data(), cu_xshake->dev_data(),
+  		cu_bond_distance->dev_data(),cu_angle_distance->dev_data(),cu_virial->dev_data(),
+	    max_iter,tolerance);
+  
+}
+
+/* ----------------------------------------------------------------------
+   exchange the SHAKE cluster records held in array slots i and j
+   swaps shake_flag, the 4 entries of shake_atom and the 3 of shake_type
+------------------------------------------------------------------------- */
+
+void FixShakeCuda::swap_clusters(int i, int j)
+{
+  int held,k;
+
+  // cluster flag
+
+  held = shake_flag[i];
+  shake_flag[i] = shake_flag[j];
+  shake_flag[j] = held;
+
+  // the four atom entries of the cluster
+
+  for (k = 0; k < 4; k++) {
+    held = shake_atom[i][k];
+    shake_atom[i][k] = shake_atom[j][k];
+    shake_atom[j][k] = held;
+  }
+
+  // the three type entries of the cluster
+
+  for (k = 0; k < 3; k++) {
+    held = shake_type[i][k];
+    shake_type[i][k] = shake_type[j][k];
+    shake_type[j][k] = held;
+  }
+}
+
+/* ----------------------------------------------------------------------
+   check if massone is within MASSDELTA of any mass in mass_list
+   return 1 if yes, 0 if not
+------------------------------------------------------------------------- */
+
+int FixShakeCuda::masscheck(double massone)
+{
+  // scan the nmass entries of mass_list, stopping at the first match
+
+  int matched = 0;
+  int k = 0;
+  while (k < nmass && !matched) {
+    const double diff = fabs(mass_list[k]-massone);
+    if (diff <= MASSDELTA) matched = 1;
+    k++;
+  }
+  return matched;
+}
+
+/* ----------------------------------------------------------------------
+   update the unconstrained position of each atom
+   only for SHAKE clusters, else set to 0.0
+   assumes NVE update, seems to be accurate enough for NVT,NPT,NPH as well 
+------------------------------------------------------------------------- */
+
+void FixShakeCuda::unconstrained_update()
+{
+  // once cuda->finished_setup is set, hand the update to the CUDA routine
+
+  if (cuda->finished_setup) {
+    Cuda_FixShakeCuda_UnconstrainedUpdate(&cuda->shared_data);
+    return;
+  }
+
+  // otherwise fill xshake here for every atom below nlocal
+  // atoms outside any SHAKE cluster get 0.0 in all three components
+
+  for (int n = 0; n < nlocal; n++) {
+    if (!shake_flag[n]) {
+      xshake[n][0] = 0.0;
+      xshake[n][1] = 0.0;
+      xshake[n][2] = 0.0;
+      continue;
+    }
+
+    // per-atom mass if rmass is available, else per-type mass
+
+    double dtfmsq;
+    if (rmass) dtfmsq = dtfsq / rmass[n];
+    else dtfmsq = dtfsq / mass[type[n]];
+
+    for (int d = 0; d < 3; d++)
+      xshake[n][d] = x[n][d] + dtv*v[n][d] + dtfmsq*f[n][d];
+  }
+
+  // push the freshly computed xshake through its cu_xshake wrapper
+
+  cu_xshake->upload();
+}
+
+/* ----------------------------------------------------------------------
+   SHAKE for the 2-atom cluster recorded with atom m (one bond constraint)
+   solves the quadratic for the multiplier lamda in closed form,
+   keeps the root of smaller magnitude, adds the resulting force to
+   atoms with index < nlocal and tallies the virial if evflag is set
+------------------------------------------------------------------------- */
+
+void FixShakeCuda::shake2(int m)
+{
+  int k;
+
+  // local indices of both atoms and the target bond length
+
+  const int ia = atom->map(shake_atom[m][0]);
+  const int ib = atom->map(shake_atom[m][1]);
+  const double bondlen = bond_distance[shake_type[m][0]];
+
+  // rold = separation at current positions, with PBC
+
+  double rold[3];
+  for (k = 0; k < 3; k++) rold[k] = x[ia][k] - x[ib][k];
+  domain->minimum_image(rold);
+
+  // rnew = separation after the unconstrained update, with PBC
+
+  double rnew[3];
+  for (k = 0; k < 3; k++) rnew[k] = xshake[ia][k] - xshake[ib][k];
+  domain->minimum_image(rnew);
+
+  // squared lengths and the mixed dot product
+
+  const double roldsq = rold[0]*rold[0] + rold[1]*rold[1] + rold[2]*rold[2];
+  const double rnewsq = rnew[0]*rnew[0] + rnew[1]*rnew[1] + rnew[2]*rnew[2];
+  const double rdot = rnew[0]*rold[0] + rnew[1]*rold[1] + rnew[2]*rold[2];
+
+  // inverse masses: per-atom if rmass is available, else per-type
+
+  double invma,invmb;
+  if (rmass) {
+    invma = 1.0/rmass[ia];
+    invmb = 1.0/rmass[ib];
+  } else {
+    invma = 1.0/mass[type[ia]];
+    invmb = 1.0/mass[type[ib]];
+  }
+  const double invsum = invma+invmb;
+
+  // coefficients of a*lamda^2 + b*lamda + c = 0
+
+  const double a = invsum*invsum * roldsq;
+  const double b = 2.0 * invsum * rdot;
+  const double c = rnewsq - bondlen*bondlen;
+
+  // a negative discriminant is reported and then clamped to zero
+
+  double discrim = b*b - 4.0*a*c;
+  if (discrim < 0.0) {
+    error->warning("Shake determinant < 0.0");
+    discrim = 0.0;
+  }
+
+  // both roots; the one closer to zero is used
+
+  const double root = sqrt(discrim);
+  const double twoa = 2.0*a;
+  const double lamplus = (-b+root) / twoa;
+  const double lamminus = (-b-root) / twoa;
+
+  double lamda = (fabs(lamplus) <= fabs(lamminus)) ? lamplus : lamminus;
+  lamda /= dtfsq;
+
+  // apply equal and opposite forces, only to atoms owned by this processor
+
+  const int own_a = (ia < nlocal);
+  const int own_b = (ib < nlocal);
+
+  if (own_a)
+    for (k = 0; k < 3; k++) f[ia][k] += lamda*rold[k];
+
+  if (own_b)
+    for (k = 0; k < 3; k++) f[ib][k] -= lamda*rold[k];
+
+  // virial tally over the owned atoms of the pair
+
+  if (evflag) {
+    int list[2];
+    int nlist = 0;
+    if (own_a) list[nlist++] = ia;
+    if (own_b) list[nlist++] = ib;
+
+    double vir[6];
+    vir[0] = lamda*rold[0]*rold[0];
+    vir[1] = lamda*rold[1]*rold[1];
+    vir[2] = lamda*rold[2]*rold[2];
+    vir[3] = lamda*rold[0]*rold[1];
+    vir[4] = lamda*rold[0]*rold[2];
+    vir[5] = lamda*rold[1]*rold[2];
+
+    v_tally(nlist,list,2.0,vir);
+  }
+}
+
+/* ----------------------------------------------------------------------
+   SHAKE for the 3-atom cluster recorded with atom m
+   two bond constraints: atom 0 - atom 1 and atom 0 - atom 2
+   the multipliers lamda01,lamda02 are found by repeatedly solving the
+   linearized 2x2 system with the quadratic terms moved to the rhs,
+   until both change by no more than tolerance or max_iter is reached
+   constraint forces are added to f for atoms with index < nlocal
+   virial contribution is tallied if evflag is set
+------------------------------------------------------------------------- */
+
+void FixShakeCuda::shake3(int m)
+{
+  int nlist,list[3];
+  double v[6];  // local 6-component virial terms (not the per-atom v[][] read in unconstrained_update)
+  double invmass0,invmass1,invmass2;
+
+  // local atom indices (atom->map of the stored IDs) and the two target bond lengths
+
+  int i0 = atom->map(shake_atom[m][0]);
+  int i1 = atom->map(shake_atom[m][1]);
+  int i2 = atom->map(shake_atom[m][2]);
+  double bond1 = bond_distance[shake_type[m][0]];
+  double bond2 = bond_distance[shake_type[m][1]];
+
+  // r01,r02 = distance vec between atoms at current positions, with PBC
+
+  double r01[3];
+  r01[0] = x[i0][0] - x[i1][0];
+  r01[1] = x[i0][1] - x[i1][1];
+  r01[2] = x[i0][2] - x[i1][2];
+  domain->minimum_image(r01);
+
+  double r02[3];
+  r02[0] = x[i0][0] - x[i2][0];
+  r02[1] = x[i0][1] - x[i2][1];
+  r02[2] = x[i0][2] - x[i2][2];
+  domain->minimum_image(r02);
+
+  // s01,s02 = distance vec after unconstrained update (from xshake), with PBC
+
+  double s01[3];
+  s01[0] = xshake[i0][0] - xshake[i1][0];
+  s01[1] = xshake[i0][1] - xshake[i1][1];
+  s01[2] = xshake[i0][2] - xshake[i1][2];
+  domain->minimum_image(s01);
+
+  double s02[3];
+  s02[0] = xshake[i0][0] - xshake[i2][0];
+  s02[1] = xshake[i0][1] - xshake[i2][1];
+  s02[2] = xshake[i0][2] - xshake[i2][2];
+  domain->minimum_image(s02);
+
+  // squared scalar distances between atoms
+
+  double r01sq = r01[0]*r01[0] + r01[1]*r01[1] + r01[2]*r01[2];
+  double r02sq = r02[0]*r02[0] + r02[1]*r02[1] + r02[2]*r02[2];
+  double s01sq = s01[0]*s01[0] + s01[1]*s01[1] + s01[2]*s01[2];
+  double s02sq = s02[0]*s02[0] + s02[1]*s02[1] + s02[2]*s02[2];
+
+  // matrix coeffs and rhs for lamda equations
+  // inverse masses come from per-atom rmass if set, else per-type mass
+
+  if (rmass) {
+    invmass0 = 1.0/rmass[i0];
+    invmass1 = 1.0/rmass[i1];
+    invmass2 = 1.0/rmass[i2];
+  } else {
+    invmass0 = 1.0/mass[type[i0]];
+    invmass1 = 1.0/mass[type[i1]];
+    invmass2 = 1.0/mass[type[i2]];
+  }
+
+  double a11 = 2.0 * (invmass0+invmass1) *
+    (s01[0]*r01[0] + s01[1]*r01[1] + s01[2]*r01[2]);
+  double a12 = 2.0 * invmass0 *
+    (s01[0]*r02[0] + s01[1]*r02[1] + s01[2]*r02[2]);
+  double a21 = 2.0 * invmass0 *
+    (s02[0]*r01[0] + s02[1]*r01[1] + s02[2]*r01[2]);
+  double a22 = 2.0 * (invmass0+invmass2) *
+    (s02[0]*r02[0] + s02[1]*r02[1] + s02[2]*r02[2]);
+
+  // inverse of the 2x2 matrix; only an exactly zero determinant is rejected
+
+  double determ = a11*a22 - a12*a21;
+  if (determ == 0.0) error->one("Shake determinant = 0.0");
+  double determinv = 1.0/determ;
+  
+  double a11inv = a22*determinv;
+  double a12inv = -a12*determinv;
+  double a21inv = -a21*determinv;
+  double a22inv = a11*determinv;
+
+  // quadratic correction coeffs (terms second order in the lamdas)
+
+  double r0102 = (r01[0]*r02[0] + r01[1]*r02[1] + r01[2]*r02[2]);
+
+  double quad1_0101 = (invmass0+invmass1)*(invmass0+invmass1) * r01sq;
+  double quad1_0202 = invmass0*invmass0 * r02sq;
+  double quad1_0102 = 2.0 * (invmass0+invmass1)*invmass0 * r0102;
+
+  double quad2_0202 = (invmass0+invmass2)*(invmass0+invmass2) * r02sq;
+  double quad2_0101 = invmass0*invmass0 * r01sq;
+  double quad2_0102 = 2.0 * (invmass0+invmass2)*invmass0 * r0102;
+
+  // iterate until converged, starting from lamda01 = lamda02 = 0
+
+  double lamda01 = 0.0;
+  double lamda02 = 0.0;
+  int niter = 0;
+  int done = 0;
+
+  double quad1,quad2,b1,b2,lamda01_new,lamda02_new;
+
+  while (!done && niter < max_iter) {  // no warning is issued if max_iter is hit before convergence
+    quad1 = quad1_0101 * lamda01*lamda01 + quad1_0202 * lamda02*lamda02 + 
+      quad1_0102 * lamda01*lamda02;
+    quad2 = quad2_0101 * lamda01*lamda01 + quad2_0202 * lamda02*lamda02 + 
+      quad2_0102 * lamda01*lamda02;
+        
+    b1 = bond1*bond1 - s01sq - quad1;  // rhs uses the lamdas of the previous pass
+    b2 = bond2*bond2 - s02sq - quad2;
+        
+    lamda01_new = a11inv*b1 + a12inv*b2;
+    lamda02_new = a21inv*b1 + a22inv*b2;
+
+    done = 1;  // cleared again below if either lamda moved by more than tolerance
+    if (fabs(lamda01_new-lamda01) > tolerance) done = 0;
+    if (fabs(lamda02_new-lamda02) > tolerance) done = 0;
+
+    lamda01 = lamda01_new;
+    lamda02 = lamda02_new;
+    niter++;
+  }
+
+  // rescale lamdas by 1/dtfsq, then update forces if atom is owned by this processor
+
+  lamda01 = lamda01/dtfsq;
+  lamda02 = lamda02/dtfsq;
+
+  if (i0 < nlocal) {
+    f[i0][0] += lamda01*r01[0] + lamda02*r02[0];
+    f[i0][1] += lamda01*r01[1] + lamda02*r02[1];
+    f[i0][2] += lamda01*r01[2] + lamda02*r02[2];
+  }
+
+  if (i1 < nlocal) {
+    f[i1][0] -= lamda01*r01[0];
+    f[i1][1] -= lamda01*r01[1];
+    f[i1][2] -= lamda01*r01[2];
+  }
+
+  if (i2 < nlocal) {
+    f[i2][0] -= lamda02*r02[0];
+    f[i2][1] -= lamda02*r02[1];
+    f[i2][2] -= lamda02*r02[2];
+  }
+
+  if (evflag) {
+    nlist = 0;  // list collects only the cluster atoms with index < nlocal
+    if (i0 < nlocal) list[nlist++] = i0;
+    if (i1 < nlocal) list[nlist++] = i1;
+    if (i2 < nlocal) list[nlist++] = i2;
+
+    v[0] = lamda01*r01[0]*r01[0] + lamda02*r02[0]*r02[0];
+    v[1] = lamda01*r01[1]*r01[1] + lamda02*r02[1]*r02[1];
+    v[2] = lamda01*r01[2]*r01[2] + lamda02*r02[2]*r02[2];
+    v[3] = lamda01*r01[0]*r01[1] + lamda02*r02[0]*r02[1];
+    v[4] = lamda01*r01[0]*r01[2] + lamda02*r02[0]*r02[2];
+    v[5] = lamda01*r01[1]*r01[2] + lamda02*r02[1]*r02[2];
+
+    v_tally(nlist,list,3.0,v);  // presumably 3.0 is the cluster size the virial is shared over; confirm in v_tally
+  }
+}
+
+/* ----------------------------------------------------------------------
+   SHAKE for the 4-atom cluster recorded with atom m
+   three bond constraints: atom 0 - atom 1, atom 0 - atom 2, atom 0 - atom 3
+   the multipliers lamda01,lamda02,lamda03 are found by repeatedly solving
+   the linearized 3x3 system with the quadratic terms moved to the rhs,
+   until all change by no more than tolerance or max_iter is reached
+   constraint forces are added to f for atoms with index < nlocal
+   virial contribution is tallied if evflag is set
+------------------------------------------------------------------------- */
+
+void FixShakeCuda::shake4(int m)
+{
+  int nlist,list[4];
+  double v[6];  // local 6-component virial terms (not the per-atom v[][] read in unconstrained_update)
+  double invmass0,invmass1,invmass2,invmass3;
+
+  // local atom indices (atom->map of the stored IDs) and the three target bond lengths
+
+  int i0 = atom->map(shake_atom[m][0]);
+  int i1 = atom->map(shake_atom[m][1]);
+  int i2 = atom->map(shake_atom[m][2]);
+  int i3 = atom->map(shake_atom[m][3]);
+  double bond1 = bond_distance[shake_type[m][0]];
+  double bond2 = bond_distance[shake_type[m][1]];
+  double bond3 = bond_distance[shake_type[m][2]];
+
+  // r01,r02,r03 = distance vec between atoms at current positions, with PBC
+
+  double r01[3];
+  r01[0] = x[i0][0] - x[i1][0];
+  r01[1] = x[i0][1] - x[i1][1];
+  r01[2] = x[i0][2] - x[i1][2];
+  domain->minimum_image(r01);
+
+  double r02[3];
+  r02[0] = x[i0][0] - x[i2][0];
+  r02[1] = x[i0][1] - x[i2][1];
+  r02[2] = x[i0][2] - x[i2][2];
+  domain->minimum_image(r02);
+
+  double r03[3];
+  r03[0] = x[i0][0] - x[i3][0];
+  r03[1] = x[i0][1] - x[i3][1];
+  r03[2] = x[i0][2] - x[i3][2];
+  domain->minimum_image(r03);
+
+  // s01,s02,s03 = distance vec after unconstrained update (from xshake), with PBC
+
+  double s01[3];
+  s01[0] = xshake[i0][0] - xshake[i1][0];
+  s01[1] = xshake[i0][1] - xshake[i1][1];
+  s01[2] = xshake[i0][2] - xshake[i1][2];
+  domain->minimum_image(s01);
+
+  double s02[3];
+  s02[0] = xshake[i0][0] - xshake[i2][0];
+  s02[1] = xshake[i0][1] - xshake[i2][1];
+  s02[2] = xshake[i0][2] - xshake[i2][2];
+  domain->minimum_image(s02);
+
+  double s03[3];
+  s03[0] = xshake[i0][0] - xshake[i3][0];
+  s03[1] = xshake[i0][1] - xshake[i3][1];
+  s03[2] = xshake[i0][2] - xshake[i3][2];
+  domain->minimum_image(s03);
+
+  // squared scalar distances between atoms
+
+  double r01sq = r01[0]*r01[0] + r01[1]*r01[1] + r01[2]*r01[2];
+  double r02sq = r02[0]*r02[0] + r02[1]*r02[1] + r02[2]*r02[2];
+  double r03sq = r03[0]*r03[0] + r03[1]*r03[1] + r03[2]*r03[2];
+  double s01sq = s01[0]*s01[0] + s01[1]*s01[1] + s01[2]*s01[2];
+  double s02sq = s02[0]*s02[0] + s02[1]*s02[1] + s02[2]*s02[2];
+  double s03sq = s03[0]*s03[0] + s03[1]*s03[1] + s03[2]*s03[2];
+
+  // matrix coeffs and rhs for lamda equations
+  // inverse masses come from per-atom rmass if set, else per-type mass
+
+  if (rmass) {
+    invmass0 = 1.0/rmass[i0];
+    invmass1 = 1.0/rmass[i1];
+    invmass2 = 1.0/rmass[i2];
+    invmass3 = 1.0/rmass[i3];
+  } else {
+    invmass0 = 1.0/mass[type[i0]];
+    invmass1 = 1.0/mass[type[i1]];
+    invmass2 = 1.0/mass[type[i2]];
+    invmass3 = 1.0/mass[type[i3]];
+  }
+
+  double a11 = 2.0 * (invmass0+invmass1) *
+    (s01[0]*r01[0] + s01[1]*r01[1] + s01[2]*r01[2]);
+  double a12 = 2.0 * invmass0 *
+    (s01[0]*r02[0] + s01[1]*r02[1] + s01[2]*r02[2]);
+  double a13 = 2.0 * invmass0 *
+    (s01[0]*r03[0] + s01[1]*r03[1] + s01[2]*r03[2]);
+  double a21 = 2.0 * invmass0 *
+    (s02[0]*r01[0] + s02[1]*r01[1] + s02[2]*r01[2]);
+  double a22 = 2.0 * (invmass0+invmass2) *
+    (s02[0]*r02[0] + s02[1]*r02[1] + s02[2]*r02[2]);
+  double a23 = 2.0 * invmass0 *
+    (s02[0]*r03[0] + s02[1]*r03[1] + s02[2]*r03[2]);
+  double a31 = 2.0 * invmass0 *
+    (s03[0]*r01[0] + s03[1]*r01[1] + s03[2]*r01[2]);
+  double a32 = 2.0 * invmass0 *
+    (s03[0]*r02[0] + s03[1]*r02[1] + s03[2]*r02[2]);
+  double a33 = 2.0 * (invmass0+invmass3) *
+    (s03[0]*r03[0] + s03[1]*r03[1] + s03[2]*r03[2]);
+  
+  // inverse of the 3x3 matrix via cofactors; only an exactly zero determinant is rejected
+
+  double determ = a11*a22*a33 + a12*a23*a31 + a13*a21*a32 -
+    a11*a23*a32 - a12*a21*a33 - a13*a22*a31;
+  if (determ == 0.0) error->one("Shake determinant = 0.0");
+  double determinv = 1.0/determ;
+  
+  double a11inv = determinv * (a22*a33 - a23*a32);
+  double a12inv = -determinv * (a12*a33 - a13*a32);
+  double a13inv = determinv * (a12*a23 - a13*a22);
+  double a21inv = -determinv * (a21*a33 - a23*a31);
+  double a22inv = determinv * (a11*a33 - a13*a31);
+  double a23inv = -determinv * (a11*a23 - a13*a21);
+  double a31inv = determinv * (a21*a32 - a22*a31);
+  double a32inv = -determinv * (a11*a32 - a12*a31);
+  double a33inv = determinv * (a11*a22 - a12*a21);
+
+  // quadratic correction coeffs (terms second order in the lamdas)
+
+  double r0102 = (r01[0]*r02[0] + r01[1]*r02[1] + r01[2]*r02[2]);
+  double r0103 = (r01[0]*r03[0] + r01[1]*r03[1] + r01[2]*r03[2]);
+  double r0203 = (r02[0]*r03[0] + r02[1]*r03[1] + r02[2]*r03[2]);
+
+  double quad1_0101 = (invmass0+invmass1)*(invmass0+invmass1) * r01sq;
+  double quad1_0202 = invmass0*invmass0 * r02sq;
+  double quad1_0303 = invmass0*invmass0 * r03sq;
+  double quad1_0102 = 2.0 * (invmass0+invmass1)*invmass0 * r0102;
+  double quad1_0103 = 2.0 * (invmass0+invmass1)*invmass0 * r0103;
+  double quad1_0203 = 2.0 * invmass0*invmass0 * r0203;
+
+  double quad2_0101 = invmass0*invmass0 * r01sq;
+  double quad2_0202 = (invmass0+invmass2)*(invmass0+invmass2) * r02sq;
+  double quad2_0303 = invmass0*invmass0 * r03sq;
+  double quad2_0102 = 2.0 * (invmass0+invmass2)*invmass0 * r0102;
+  double quad2_0103 = 2.0 * invmass0*invmass0 * r0103;
+  double quad2_0203 = 2.0 * (invmass0+invmass2)*invmass0 * r0203;
+
+  double quad3_0101 = invmass0*invmass0 * r01sq;
+  double quad3_0202 = invmass0*invmass0 * r02sq;
+  double quad3_0303 = (invmass0+invmass3)*(invmass0+invmass3) * r03sq;
+  double quad3_0102 = 2.0 * invmass0*invmass0 * r0102;
+  double quad3_0103 = 2.0 * (invmass0+invmass3)*invmass0 * r0103;
+  double quad3_0203 = 2.0 * (invmass0+invmass3)*invmass0 * r0203;
+
+  // iterate until converged, starting from all lamdas = 0
+
+  double lamda01 = 0.0;
+  double lamda02 = 0.0;
+  double lamda03 = 0.0;
+  int niter = 0;
+  int done = 0;
+
+  double quad1,quad2,quad3,b1,b2,b3,lamda01_new,lamda02_new,lamda03_new;
+
+  while (!done && niter < max_iter) {  // no warning is issued if max_iter is hit before convergence
+    quad1 = quad1_0101 * lamda01*lamda01 + 
+      quad1_0202 * lamda02*lamda02 +
+      quad1_0303 * lamda03*lamda03 + 
+      quad1_0102 * lamda01*lamda02 +
+      quad1_0103 * lamda01*lamda03 +
+      quad1_0203 * lamda02*lamda03;
+
+    quad2 = quad2_0101 * lamda01*lamda01 + 
+      quad2_0202 * lamda02*lamda02 +
+      quad2_0303 * lamda03*lamda03 + 
+      quad2_0102 * lamda01*lamda02 +
+      quad2_0103 * lamda01*lamda03 +
+      quad2_0203 * lamda02*lamda03;
+
+    quad3 = quad3_0101 * lamda01*lamda01 + 
+      quad3_0202 * lamda02*lamda02 +
+      quad3_0303 * lamda03*lamda03 + 
+      quad3_0102 * lamda01*lamda02 +
+      quad3_0103 * lamda01*lamda03 +
+      quad3_0203 * lamda02*lamda03;
+
+    b1 = bond1*bond1 - s01sq - quad1;  // rhs uses the lamdas of the previous pass
+    b2 = bond2*bond2 - s02sq - quad2;
+    b3 = bond3*bond3 - s03sq - quad3;
+        
+    lamda01_new = a11inv*b1 + a12inv*b2 + a13inv*b3;
+    lamda02_new = a21inv*b1 + a22inv*b2 + a23inv*b3;
+    lamda03_new = a31inv*b1 + a32inv*b2 + a33inv*b3;
+
+    done = 1;  // cleared again below if any lamda moved by more than tolerance
+    if (fabs(lamda01_new-lamda01) > tolerance) done = 0;
+    if (fabs(lamda02_new-lamda02) > tolerance) done = 0;
+    if (fabs(lamda03_new-lamda03) > tolerance) done = 0;
+
+    lamda01 = lamda01_new;
+    lamda02 = lamda02_new;
+    lamda03 = lamda03_new;
+    niter++;
+  }
+
+  // rescale lamdas by 1/dtfsq, then update forces if atom is owned by this processor
+
+  lamda01 = lamda01/dtfsq;
+  lamda02 = lamda02/dtfsq;
+  lamda03 = lamda03/dtfsq;
+
+  if (i0 < nlocal) {
+    f[i0][0] += lamda01*r01[0] + lamda02*r02[0] + lamda03*r03[0];
+    f[i0][1] += lamda01*r01[1] + lamda02*r02[1] + lamda03*r03[1];
+    f[i0][2] += lamda01*r01[2] + lamda02*r02[2] + lamda03*r03[2];
+  }
+
+  if (i1 < nlocal) {
+    f[i1][0] -= lamda01*r01[0];
+    f[i1][1] -= lamda01*r01[1];
+    f[i1][2] -= lamda01*r01[2];
+  }
+
+  if (i2 < nlocal) {
+    f[i2][0] -= lamda02*r02[0];
+    f[i2][1] -= lamda02*r02[1];
+    f[i2][2] -= lamda02*r02[2];
+  }
+
+  if (i3 < nlocal) {
+    f[i3][0] -= lamda03*r03[0];
+    f[i3][1] -= lamda03*r03[1];
+    f[i3][2] -= lamda03*r03[2];
+  }
+
+  if (evflag) {
+    nlist = 0;  // list collects only the cluster atoms with index < nlocal
+    if (i0 < nlocal) list[nlist++] = i0;
+    if (i1 < nlocal) list[nlist++] = i1;
+    if (i2 < nlocal) list[nlist++] = i2;
+    if (i3 < nlocal) list[nlist++] = i3;
+
+    v[0] = lamda01*r01[0]*r01[0]+lamda02*r02[0]*r02[0]+lamda03*r03[0]*r03[0];
+    v[1] = lamda01*r01[1]*r01[1]+lamda02*r02[1]*r02[1]+lamda03*r03[1]*r03[1];
+    v[2] = lamda01*r01[2]*r01[2]+lamda02*r02[2]*r02[2]+lamda03*r03[2]*r03[2];
+    v[3] = lamda01*r01[0]*r01[1]+lamda02*r02[0]*r02[1]+lamda03*r03[0]*r03[1];
+    v[4] = lamda01*r01[0]*r01[2]+lamda02*r02[0]*r02[2]+lamda03*r03[0]*r03[2];
+    v[5] = lamda01*r01[1]*r01[2]+lamda02*r02[1]*r02[2]+lamda03*r03[1]*r03[2];
+    // presumably the 4.0 below is the cluster size the virial is shared over; confirm in v_tally
+
+    v_tally(nlist,list,4.0,v);
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+void FixShakeCuda::shake3angle(int m)
+{
+  int nlist,list[3];
+  double v[6];
+  double invmass0,invmass1,invmass2;
+
+  // local atom IDs and constraint distances
+
+  int i0 = atom->map(shake_atom[m][0]);
+  int i1 = atom->map(shake_atom[m][1]);
+  int i2 = atom->map(shake_atom[m][2]);
+  double bond1 = bond_distance[shake_type[m][0]];
+  double bond2 = bond_distance[shake_type[m][1]];
+  double bond12 = angle_distance[shake_type[m][2]];
+
+  // r01,r02,r12 = distance vec between atoms, with PBC
+
+  double r01[3];
+  r01[0] = x[i0][0] - x[i1][0];
+  r01[1] = x[i0][1] - x[i1][1];
+  r01[2] = x[i0][2] - x[i1][2];
+  domain->minimum_image(r01);
+
+  double r02[3];
+  r02[0] = x[i0][0] - x[i2][0];
+  r02[1] = x[i0][1] - x[i2][1];
+  r02[2] = x[i0][2] - x[i2][2];
+  domain->minimum_image(r02);
+
+  double r12[3];
+  r12[0] = x[i1][0] - x[i2][0];
+  r12[1] = x[i1][1] - x[i2][1];
+  r12[2] = x[i1][2] - x[i2][2];
+  domain->minimum_image(r12);
+
+  // s01,s02,s12 = distance vec after unconstrained update, with PBC
+
+  double s01[3];
+  s01[0] = xshake[i0][0] - xshake[i1][0];
+  s01[1] = xshake[i0][1] - xshake[i1][1];
+  s01[2] = xshake[i0][2] - xshake[i1][2];
+  domain->minimum_image(s01);
+
+  double s02[3];
+  s02[0] = xshake[i0][0] - xshake[i2][0];
+  s02[1] = xshake[i0][1] - xshake[i2][1];
+  s02[2] = xshake[i0][2] - xshake[i2][2];
+  domain->minimum_image(s02);
+
+  double s12[3];
+  s12[0] = xshake[i1][0] - xshake[i2][0];
+  s12[1] = xshake[i1][1] - xshake[i2][1];
+  s12[2] = xshake[i1][2] - xshake[i2][2];
+  domain->minimum_image(s12);
+
+  // scalar distances between atoms
+
+  double r01sq = r01[0]*r01[0] + r01[1]*r01[1] + r01[2]*r01[2];
+  double r02sq = r02[0]*r02[0] + r02[1]*r02[1] + r02[2]*r02[2];
+  double r12sq = r12[0]*r12[0] + r12[1]*r12[1] + r12[2]*r12[2];
+  double s01sq = s01[0]*s01[0] + s01[1]*s01[1] + s01[2]*s01[2];
+  double s02sq = s02[0]*s02[0] + s02[1]*s02[1] + s02[2]*s02[2];
+  double s12sq = s12[0]*s12[0] + s12[1]*s12[1] + s12[2]*s12[2];
+
+  // matrix coeffs and rhs for lamda equations
+
+  if (rmass) {
+    invmass0 = 1.0/rmass[i0];
+    invmass1 = 1.0/rmass[i1];
+    invmass2 = 1.0/rmass[i2];
+  } else {
+    invmass0 = 1.0/mass[type[i0]];
+    invmass1 = 1.0/mass[type[i1]];
+    invmass2 = 1.0/mass[type[i2]];
+  }
+
+  double a11 = 2.0 * (invmass0+invmass1) *
+    (s01[0]*r01[0] + s01[1]*r01[1] + s01[2]*r01[2]);
+  double a12 = 2.0 * invmass0 *
+    (s01[0]*r02[0] + s01[1]*r02[1] + s01[2]*r02[2]);
+  double a13 = - 2.0 * invmass1 *
+    (s01[0]*r12[0] + s01[1]*r12[1] + s01[2]*r12[2]);
+  double a21 = 2.0 * invmass0 *
+    (s02[0]*r01[0] + s02[1]*r01[1] + s02[2]*r01[2]);
+  double a22 = 2.0 * (invmass0+invmass2) *
+    (s02[0]*r02[0] + s02[1]*r02[1] + s02[2]*r02[2]);
+  double a23 = 2.0 * invmass2 *
+    (s02[0]*r12[0] + s02[1]*r12[1] + s02[2]*r12[2]);
+  double a31 = - 2.0 * invmass1 *
+    (s12[0]*r01[0] + s12[1]*r01[1] + s12[2]*r01[2]);
+  double a32 = 2.0 * invmass2 *
+    (s12[0]*r02[0] + s12[1]*r02[1] + s12[2]*r02[2]);
+  double a33 = 2.0 * (invmass1+invmass2) *
+    (s12[0]*r12[0] + s12[1]*r12[1] + s12[2]*r12[2]);
+
+  // inverse of matrix
+
+  double determ = a11*a22*a33 + a12*a23*a31 + a13*a21*a32 -
+    a11*a23*a32 - a12*a21*a33 - a13*a22*a31;
+  if (determ == 0.0) error->one("Shake determinant = 0.0");
+  double determinv = 1.0/determ;
+  
+  double a11inv = determinv * (a22*a33 - a23*a32);
+  double a12inv = -determinv * (a12*a33 - a13*a32);
+  double a13inv = determinv * (a12*a23 - a13*a22);
+  double a21inv = -determinv * (a21*a33 - a23*a31);
+  double a22inv = determinv * (a11*a33 - a13*a31);
+  double a23inv = -determinv * (a11*a23 - a13*a21);
+  double a31inv = determinv * (a21*a32 - a22*a31);
+  double a32inv = -determinv * (a11*a32 - a12*a31);
+  double a33inv = determinv * (a11*a22 - a12*a21);
+
+  // quadratic correction coeffs
+
+  double r0102 = (r01[0]*r02[0] + r01[1]*r02[1] + r01[2]*r02[2]);
+  double r0112 = (r01[0]*r12[0] + r01[1]*r12[1] + r01[2]*r12[2]);
+  double r0212 = (r02[0]*r12[0] + r02[1]*r12[1] + r02[2]*r12[2]);
+
+  double quad1_0101 = (invmass0+invmass1)*(invmass0+invmass1) * r01sq;
+  double quad1_0202 = invmass0*invmass0 * r02sq;
+  double quad1_1212 = invmass1*invmass1 * r12sq;
+  double quad1_0102 = 2.0 * (invmass0+invmass1)*invmass0 * r0102;
+  double quad1_0112 = - 2.0 * (invmass0+invmass1)*invmass1 * r0112;
+  double quad1_0212 = - 2.0 * invmass0*invmass1 * r0212;
+
+  double quad2_0101 = invmass0*invmass0 * r01sq;
+  double quad2_0202 = (invmass0+invmass2)*(invmass0+invmass2) * r02sq;
+  double quad2_1212 = invmass2*invmass2 * r12sq;
+  double quad2_0102 = 2.0 * (invmass0+invmass2)*invmass0 * r0102;
+  double quad2_0112 = 2.0 * invmass0*invmass2 * r0112;
+  double quad2_0212 = 2.0 * (invmass0+invmass2)*invmass2 * r0212;
+
+  double quad3_0101 = invmass1*invmass1 * r01sq;
+  double quad3_0202 = invmass2*invmass2 * r02sq;
+  double quad3_1212 = (invmass1+invmass2)*(invmass1+invmass2) * r12sq;
+  double quad3_0102 = - 2.0 * invmass1*invmass2 * r0102;
+  double quad3_0112 = - 2.0 * (invmass1+invmass2)*invmass1 * r0112;
+  double quad3_0212 = 2.0 * (invmass1+invmass2)*invmass2 * r0212;
+
+  // iterate until converged
+
+  double lamda01 = 0.0;
+  double lamda02 = 0.0;
+  double lamda12 = 0.0;
+  int niter = 0;
+  int done = 0;
+
+  double quad1,quad2,quad3,b1,b2,b3,lamda01_new,lamda02_new,lamda12_new;
+
+  while (!done && niter < max_iter) {
+    quad1 = quad1_0101 * lamda01*lamda01 + 
+      quad1_0202 * lamda02*lamda02 +
+      quad1_1212 * lamda12*lamda12 + 
+      quad1_0102 * lamda01*lamda02 +
+      quad1_0112 * lamda01*lamda12 +
+      quad1_0212 * lamda02*lamda12;
+
+    quad2 = quad2_0101 * lamda01*lamda01 + 
+      quad2_0202 * lamda02*lamda02 +
+      quad2_1212 * lamda12*lamda12 + 
+      quad2_0102 * lamda01*lamda02 +
+      quad2_0112 * lamda01*lamda12 +
+      quad2_0212 * lamda02*lamda12;
+      
+    quad3 = quad3_0101 * lamda01*lamda01 + 
+      quad3_0202 * lamda02*lamda02 +
+      quad3_1212 * lamda12*lamda12 + 
+      quad3_0102 * lamda01*lamda02 +
+      quad3_0112 * lamda01*lamda12 +
+      quad3_0212 * lamda02*lamda12;
+
+    b1 = bond1*bond1 - s01sq - quad1;
+    b2 = bond2*bond2 - s02sq - quad2;
+    b3 = bond12*bond12 - s12sq - quad3;
+        
+    lamda01_new = a11inv*b1 + a12inv*b2 + a13inv*b3;
+    lamda02_new = a21inv*b1 + a22inv*b2 + a23inv*b3;
+    lamda12_new = a31inv*b1 + a32inv*b2 + a33inv*b3;
+
+    done = 1;
+    if (fabs(lamda01_new-lamda01) > tolerance) done = 0;
+    if (fabs(lamda02_new-lamda02) > tolerance) done = 0;
+    if (fabs(lamda12_new-lamda12) > tolerance) done = 0;
+
+    lamda01 = lamda01_new;
+    lamda02 = lamda02_new;
+    lamda12 = lamda12_new;
+    niter++;
+  }
+
+  // update forces if atom is owned by this processor
+
+  lamda01 = lamda01/dtfsq;
+  lamda02 = lamda02/dtfsq;
+  lamda12 = lamda12/dtfsq;
+
+  if (i0 < nlocal) {
+    f[i0][0] += lamda01*r01[0] + lamda02*r02[0];
+    f[i0][1] += lamda01*r01[1] + lamda02*r02[1];
+    f[i0][2] += lamda01*r01[2] + lamda02*r02[2];
+  }
+
+  if (i1 < nlocal) {
+    f[i1][0] -= lamda01*r01[0] - lamda12*r12[0];
+    f[i1][1] -= lamda01*r01[1] - lamda12*r12[1];
+    f[i1][2] -= lamda01*r01[2] - lamda12*r12[2];
+  }
+
+  if (i2 < nlocal) {
+    f[i2][0] -= lamda02*r02[0] + lamda12*r12[0];
+    f[i2][1] -= lamda02*r02[1] + lamda12*r12[1];
+    f[i2][2] -= lamda02*r02[2] + lamda12*r12[2];
+  }
+
+  if (evflag) {
+    nlist = 0;
+    if (i0 < nlocal) list[nlist++] = i0;
+    if (i1 < nlocal) list[nlist++] = i1;
+    if (i2 < nlocal) list[nlist++] = i2;
+
+    v[0] = lamda01*r01[0]*r01[0]+lamda02*r02[0]*r02[0]+lamda12*r12[0]*r12[0];
+    v[1] = lamda01*r01[1]*r01[1]+lamda02*r02[1]*r02[1]+lamda12*r12[1]*r12[1];
+    v[2] = lamda01*r01[2]*r01[2]+lamda02*r02[2]*r02[2]+lamda12*r12[2]*r12[2];
+    v[3] = lamda01*r01[0]*r01[1]+lamda02*r02[0]*r02[1]+lamda12*r12[0]*r12[1];
+    v[4] = lamda01*r01[0]*r01[2]+lamda02*r02[0]*r02[2]+lamda12*r12[0]*r12[2];
+    v[5] = lamda01*r01[1]*r01[2]+lamda02*r02[1]*r02[2]+lamda12*r12[1]*r12[2];
+
+    v_tally(nlist,list,3.0,v);
+  }
+}
+
+/* ----------------------------------------------------------------------
+   print-out bond & angle statistics 
+------------------------------------------------------------------------- */
+
+// Accumulate and print per-type SHAKE statistics: average bond length /
+// angle and the spread (max - min) over all constrained clusters owned
+// by this processor, reduced across all processors.
+// Output goes to screen and logfile on rank 0 only; afterwards
+// next_output is advanced by output_every.
+void FixShakeCuda::stats()
+{
+  int i,j,m,n,iatom,jatom,katom;
+  double delx,dely,delz;
+  double r,r1,r2,r3,angle,cosang;
+
+  // zero out accumulators
+  // arrays hold ntypes+1 entries so they can be indexed directly by type
+
+  int nb = atom->nbondtypes + 1;
+  int na = atom->nangletypes + 1;
+
+  for (i = 0; i < nb; i++) {
+    b_count[i] = 0;
+    b_ave[i] = b_max[i] = 0.0;
+    b_min[i] = BIG;
+  }
+  for (i = 0; i < na; i++) {
+    a_count[i] = 0;
+    a_ave[i] = a_max[i] = 0.0;
+    a_min[i] = BIG;
+  }
+
+  // log stats for each bond & angle
+  // OK to double count since are just averaging
+
+  double **x = atom->x;
+  int nlocal = atom->nlocal;
+
+  for (i = 0; i < nlocal; i++) {
+    if (shake_flag[i] == 0) continue;
+
+    // bond stats
+    // an angle cluster (flag 1) has 3 atoms, i.e. 2 bonds from atom 0
+
+    n = shake_flag[i];
+    if (n == 1) n = 3;
+    iatom = atom->map(shake_atom[i][0]);
+    for (j = 1; j < n; j++) {
+      jatom = atom->map(shake_atom[i][j]);
+      delx = x[iatom][0] - x[jatom][0];
+      dely = x[iatom][1] - x[jatom][1];
+      delz = x[iatom][2] - x[jatom][2];
+      domain->minimum_image(delx,dely,delz);
+      r = sqrt(delx*delx + dely*dely + delz*delz);
+
+      m = shake_type[i][j-1];
+      b_count[m]++;
+      b_ave[m] += r;
+      b_max[m] = MAX(b_max[m],r);
+      b_min[m] = MIN(b_min[m],r);
+    }
+
+    // angle stats
+
+    if (shake_flag[i] == 1) {
+      iatom = atom->map(shake_atom[i][0]);
+      jatom = atom->map(shake_atom[i][1]);
+      katom = atom->map(shake_atom[i][2]);
+
+      delx = x[iatom][0] - x[jatom][0];
+      dely = x[iatom][1] - x[jatom][1];
+      delz = x[iatom][2] - x[jatom][2];
+      domain->minimum_image(delx,dely,delz);
+      r1 = sqrt(delx*delx + dely*dely + delz*delz);
+
+      delx = x[iatom][0] - x[katom][0];
+      dely = x[iatom][1] - x[katom][1];
+      delz = x[iatom][2] - x[katom][2];
+      domain->minimum_image(delx,dely,delz);
+      r2 = sqrt(delx*delx + dely*dely + delz*delz);
+
+      delx = x[jatom][0] - x[katom][0];
+      dely = x[jatom][1] - x[katom][1];
+      delz = x[jatom][2] - x[katom][2];
+      domain->minimum_image(delx,dely,delz);
+      r3 = sqrt(delx*delx + dely*dely + delz*delz);
+
+      // law of cosines for the angle at the central atom
+      // clamp the cosine so rounding error cannot push it outside the
+      // [-1,1] domain of acos(), which would produce NaN
+
+      cosang = (r1*r1 + r2*r2 - r3*r3) / (2.0*r1*r2);
+      if (cosang > 1.0) cosang = 1.0;
+      else if (cosang < -1.0) cosang = -1.0;
+      angle = acos(cosang);
+      angle *= 180.0/PI;
+      m = shake_type[i][2];
+      a_count[m]++;
+      a_ave[m] += angle;
+      a_max[m] = MAX(a_max[m],angle);
+      a_min[m] = MIN(a_min[m],angle);
+    }
+  }
+
+  // sum across all procs
+
+  MPI_Allreduce(b_count,b_count_all,nb,MPI_INT,MPI_SUM,world);
+  MPI_Allreduce(b_ave,b_ave_all,nb,MPI_DOUBLE,MPI_SUM,world);
+  MPI_Allreduce(b_max,b_max_all,nb,MPI_DOUBLE,MPI_MAX,world);
+  MPI_Allreduce(b_min,b_min_all,nb,MPI_DOUBLE,MPI_MIN,world);
+
+  MPI_Allreduce(a_count,a_count_all,na,MPI_INT,MPI_SUM,world);
+  MPI_Allreduce(a_ave,a_ave_all,na,MPI_DOUBLE,MPI_SUM,world);
+  MPI_Allreduce(a_max,a_max_all,na,MPI_DOUBLE,MPI_MAX,world);
+  MPI_Allreduce(a_min,a_min_all,na,MPI_DOUBLE,MPI_MIN,world);
+
+  // print stats only for non-zero counts
+  // both outputs start at index 1 so screen and logfile always agree
+
+  if (me == 0) {
+    if (screen) {
+      fprintf(screen,
+              "SHAKE stats (type/ave/delta) on step " BIGINT_FORMAT "\n",
+              update->ntimestep);
+      for (i = 1; i < nb; i++)
+        if (b_count_all[i])
+          fprintf(screen,"  %d %g %g\n",i,
+                  b_ave_all[i]/b_count_all[i],b_max_all[i]-b_min_all[i]);
+      for (i = 1; i < na; i++)
+        if (a_count_all[i])
+          fprintf(screen,"  %d %g %g\n",i,
+                  a_ave_all[i]/a_count_all[i],a_max_all[i]-a_min_all[i]);
+    }
+    if (logfile) {
+      fprintf(logfile,
+              "SHAKE stats (type/ave/delta) on step " BIGINT_FORMAT "\n",
+              update->ntimestep);
+      for (i = 1; i < nb; i++)
+        if (b_count_all[i])
+          fprintf(logfile,"  %d %g %g\n",i,
+                  b_ave_all[i]/b_count_all[i],b_max_all[i]-b_min_all[i]);
+      for (i = 1; i < na; i++)
+        if (a_count_all[i])
+          fprintf(logfile,"  %d %g %g\n",i,
+                  a_ave_all[i]/a_count_all[i],a_max_all[i]-a_min_all[i]);
+    }
+  }
+
+  // next timestep for stats
+
+  next_output += output_every;
+}
+
+/* ----------------------------------------------------------------------
+   find a bond between global tags n1 and n2 stored with local atom i
+   return -1 if don't find it
+   return bond index if do find it
+------------------------------------------------------------------------- */
+
+int FixShakeCuda::bondfind(int i, int n1, int n2)
+{
+  int *tag = atom->tag;
+  int **bond_atom = atom->bond_atom;
+  const int nbonds = atom->num_bond[i];
+
+  // scan the bonds stored with atom i; the pair (n1,n2) may match in
+  // either order
+  for (int ibond = 0; ibond < nbonds; ibond++) {
+    const int partner = bond_atom[i][ibond];
+    const bool forward = (n1 == tag[i]) && (n2 == partner);
+    const bool reverse = (n1 == partner) && (n2 == tag[i]);
+    if (forward || reverse) return ibond;
+  }
+
+  // no bond between n1 and n2 is stored with atom i
+  return -1;
+}
+
+/* ----------------------------------------------------------------------
+   find an angle with global end atoms n1 and n2 stored with local atom i
+   return -1 if don't find it
+   return angle index if do find it
+------------------------------------------------------------------------- */
+
+int FixShakeCuda::anglefind(int i, int n1, int n2)
+{
+  int **angle_atom1 = atom->angle_atom1;
+  int **angle_atom3 = atom->angle_atom3;
+  const int nangles = atom->num_angle[i];
+
+  // scan the angles stored with atom i; the end atoms (n1,n2) may match
+  // in either order
+  for (int iangle = 0; iangle < nangles; iangle++) {
+    const int end1 = angle_atom1[i][iangle];
+    const int end3 = angle_atom3[i][iangle];
+    const bool forward = (n1 == end1) && (n2 == end3);
+    const bool reverse = (n1 == end3) && (n2 == end1);
+    if (forward || reverse) return iangle;
+  }
+
+  // no angle with end atoms n1 and n2 is stored with atom i
+  return -1;
+}
+
+/* ----------------------------------------------------------------------
+   memory usage of local atom-based arrays 
+------------------------------------------------------------------------- */
+
+double FixShakeCuda::memory_usage()
+{
+  const int nmax = atom->nmax;
+
+  // per-atom arrays sized as in grow_arrays()
+  double bytes = 0.0;
+  bytes += nmax * sizeof(int);            // shake_flag
+  bytes += nmax*4 * sizeof(int);          // shake_atom
+  bytes += nmax*3 * sizeof(int);          // shake_type
+  bytes += nmax*3 * sizeof(double);       // xshake
+
+  // presumably the per-atom virial storage (6 doubles each) - confirm
+  bytes += maxvatom*6 * sizeof(double);
+
+  return bytes;
+}
+
+/* ----------------------------------------------------------------------
+   allocate local atom-based arrays 
+------------------------------------------------------------------------- */
+
+// Resize the per-atom SHAKE arrays to hold nmax atoms and rebuild the
+// matching cCudaData wrappers.
+//   nmax = new per-atom capacity
+void FixShakeCuda::grow_arrays(int nmax)
+{
+  memory->grow(shake_flag,nmax,"shake:shake_flag");
+  memory->grow(shake_atom,nmax,4,"shake:shake_atom");
+  memory->grow(shake_type,nmax,3,"shake:shake_type");
+  // xshake is destroyed and re-created rather than grown
+  memory->destroy(xshake);
+  memory->create(xshake,nmax,3,"shake:xshake");
+
+  // the old wrappers refer to the previous host arrays, so replace them
+  // NOTE(review): the 2d arrays are passed as (int*)/(double*) casts of the
+  // row-pointer arrays; confirm this is what cCudaData expects
+  delete cu_shake_flag; cu_shake_flag = new cCudaData<int, int, xx > (shake_flag, nmax );
+  delete cu_shake_atom; cu_shake_atom = new cCudaData<int, int, yx> ((int*)shake_atom, nmax, 4);
+  delete cu_shake_type; cu_shake_type = new cCudaData<int, int, yx> ((int*)shake_type, nmax, 3);
+  delete cu_xshake; cu_xshake = new cCudaData<double, X_FLOAT, xy> ((double*)xshake, nmax, 3);
+  // only flag/atom/type are uploaded here; xshake is not
+  cu_shake_flag->upload();
+  cu_shake_atom->upload();
+  cu_shake_type->upload();
+  // re-run the CUDA-side init with the new device pointers, but only once
+  // cu_bond_distance exists
+  if(cu_bond_distance)
+  Cuda_FixShakeCuda_Init(&cuda->shared_data,dtv, dtfsq,
+  	cu_shake_flag->dev_data(),cu_shake_atom->dev_data(),cu_shake_type->dev_data(), cu_xshake->dev_data(),
+  	cu_bond_distance->dev_data(),cu_angle_distance->dev_data(),cu_virial->dev_data(),
+	max_iter,tolerance);
+}
+
+/* ----------------------------------------------------------------------
+   copy values within local atom-based arrays 
+------------------------------------------------------------------------- */
+
+void FixShakeCuda::copy_arrays(int i, int j)
+{
+  const int flag = shake_flag[j] = shake_flag[i];
+
+  // number of atom IDs and type entries in use for each cluster kind
+  int natoms,ntypes;
+  switch (flag) {
+    case 1: natoms = 3; ntypes = 3; break;   // angle cluster
+    case 2: natoms = 2; ntypes = 1; break;   // bond-only cluster of size 2
+    case 3: natoms = 3; ntypes = 2; break;   // bond-only cluster of size 3
+    case 4: natoms = 4; ntypes = 3; break;   // bond-only cluster of size 4
+    default: return;                         // nothing else to copy
+  }
+
+  for (int k = 0; k < natoms; k++) shake_atom[j][k] = shake_atom[i][k];
+  for (int k = 0; k < ntypes; k++) shake_type[j][k] = shake_type[i][k];
+}
+
+/* ----------------------------------------------------------------------
+   initialize one atom's array values, called when atom is created
+------------------------------------------------------------------------- */
+
+// Reset the SHAKE flag of atom i to 0, i.e. not part of any SHAKE cluster.
+void FixShakeCuda::set_arrays(int i)
+{
+  shake_flag[i] = 0;
+}
+
+/* ----------------------------------------------------------------------
+   pack values in local atom-based arrays for exchange with another proc 
+------------------------------------------------------------------------- */
+
+int FixShakeCuda::pack_exchange(int i, double *buf)
+{
+  const int flag = shake_flag[i];
+
+  // the flag always goes first; it tells the receiver how much follows
+  int m = 0;
+  buf[m++] = shake_flag[i];
+
+  // number of atom IDs and type entries in use for each cluster kind
+  int natoms = 0;
+  int ntypes = 0;
+  switch (flag) {
+    case 1: natoms = 3; ntypes = 3; break;
+    case 2: natoms = 2; ntypes = 1; break;
+    case 3: natoms = 3; ntypes = 2; break;
+    case 4: natoms = 4; ntypes = 3; break;
+    default: break;
+  }
+
+  for (int k = 0; k < natoms; k++) buf[m++] = shake_atom[i][k];
+  for (int k = 0; k < ntypes; k++) buf[m++] = shake_type[i][k];
+
+  // number of values written to buf
+  return m;
+}
+
+/* ----------------------------------------------------------------------
+   unpack values in local atom-based arrays from exchange with another proc 
+------------------------------------------------------------------------- */
+
+int FixShakeCuda::unpack_exchange(int nlocal, double *buf)
+{
+  // the first value is the cluster flag, which determines what follows
+  int m = 0;
+  const int flag = static_cast<int> (buf[m++]);
+  shake_flag[nlocal] = flag;
+
+  // number of atom IDs and type entries in use for each cluster kind
+  int natoms = 0;
+  int ntypes = 0;
+  switch (flag) {
+    case 1: natoms = 3; ntypes = 3; break;
+    case 2: natoms = 2; ntypes = 1; break;
+    case 3: natoms = 3; ntypes = 2; break;
+    case 4: natoms = 4; ntypes = 3; break;
+    default: break;
+  }
+
+  for (int k = 0; k < natoms; k++)
+    shake_atom[nlocal][k] = static_cast<int> (buf[m++]);
+  for (int k = 0; k < ntypes; k++)
+    shake_type[nlocal][k] = static_cast<int> (buf[m++]);
+
+  // number of values consumed from buf
+  return m;
+}
+
+/* ----------------------------------------------------------------------
+   enforce SHAKE constraints from rRESPA
+   prediction portion is different than Verlet
+   rRESPA updating of atom coords is done with full v, but only portions of f
+------------------------------------------------------------------------- */
+/*
+void FixShakeCuda::post_force_respa(int vflag, int ilevel, int iloop)
+{
+  // call stats only on outermost level
+
+  if (ilevel == nlevels_respa-1 && update->ntimestep == next_output) stats();
+
+  // perform SHAKE on every loop iteration of every rRESPA level
+  // except last loop iteration of inner levels
+
+  if (ilevel < nlevels_respa-1 && iloop == loop_respa[ilevel]-1) return;
+  
+  // xshake = atom coords after next x update in innermost loop
+  // depends on rRESPA level
+  // for levels > 0 this includes more than one velocity update
+  // xshake = predicted position from call to this routine at level N =
+  // x + dt0 (v + dtN/m fN + 1/2 dt(N-1)/m f(N-1) + ... + 1/2 dt0/m f0)
+
+  double ***f_level = ((FixRespa *) modify->fix[ifix_respa])->f_level;
+  dtfsq = dtf_inner * step_respa[ilevel];
+
+  double invmass,dtfmsq;
+  int jlevel;
+
+  if (rmass) {
+    for (int i = 0; i < nlocal; i++) {
+      if (shake_flag[i]) {
+	invmass = 1.0 / rmass[i];
+	dtfmsq = dtfsq * invmass;
+	xshake[i][0] = x[i][0] + dtv*v[i][0] + dtfmsq*f[i][0];
+	xshake[i][1] = x[i][1] + dtv*v[i][1] + dtfmsq*f[i][1];
+	xshake[i][2] = x[i][2] + dtv*v[i][2] + dtfmsq*f[i][2];
+	for (jlevel = 0; jlevel < ilevel; jlevel++) {
+	  dtfmsq = dtf_innerhalf * step_respa[jlevel] * invmass;
+	  xshake[i][0] += dtfmsq*f_level[i][jlevel][0];
+	  xshake[i][1] += dtfmsq*f_level[i][jlevel][1];
+	  xshake[i][2] += dtfmsq*f_level[i][jlevel][2];
+	}
+      } else xshake[i][2] = xshake[i][1] = xshake[i][0] = 0.0;
+    }
+
+  } else {
+    for (int i = 0; i < nlocal; i++) {
+      if (shake_flag[i]) {
+	invmass = 1.0 / mass[type[i]];
+	dtfmsq = dtfsq * invmass;
+	xshake[i][0] = x[i][0] + dtv*v[i][0] + dtfmsq*f[i][0];
+	xshake[i][1] = x[i][1] + dtv*v[i][1] + dtfmsq*f[i][1];
+	xshake[i][2] = x[i][2] + dtv*v[i][2] + dtfmsq*f[i][2];
+	for (jlevel = 0; jlevel < ilevel; jlevel++) {
+	  dtfmsq = dtf_innerhalf * step_respa[jlevel] * invmass;
+	  xshake[i][0] += dtfmsq*f_level[i][jlevel][0];
+	  xshake[i][1] += dtfmsq*f_level[i][jlevel][1];
+	  xshake[i][2] += dtfmsq*f_level[i][jlevel][2];
+	}
+      } else xshake[i][2] = xshake[i][1] = xshake[i][0] = 0.0;
+    }
+  }
+
+  // communicate results if necessary
+
+  if (nprocs > 1) comm->forward_comm_fix(this);
+
+  // virial setup
+
+  if (vflag) v_setup(vflag);
+  else evflag = 0;
+
+  // loop over clusters
+
+  int m;
+  for (int i = 0; i < nlist; i++) {
+    m = list[i];
+    if (shake_flag[m] == 2) shake2(m);
+    else if (shake_flag[m] == 3) shake3(m);
+    else if (shake_flag[m] == 4) shake4(m);
+    else shake3angle(m);
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+int FixShakeCuda::pack_comm(int n, int *list, double *buf, int pbc_flag, int *pbc)
+{
+  // after CUDA setup has finished, packing is delegated to the CUDA
+  // routines; list[0] then carries a swap code instead of an atom index
+  if (cuda->finished_setup) {
+    int iswap = *list;
+    if (iswap >= 0) {
+      Cuda_FixShakeCuda_PackComm(&cuda->shared_data,n,iswap,(void*) buf,pbc,pbc_flag);
+    } else {
+      // negative code selects the "self" variant; the first int stored
+      // in buf is handed on as "first"
+      iswap = -iswap-1;
+      int first = ((int*) buf)[0];
+      Cuda_FixShakeCuda_PackComm_Self(&cuda->shared_data,n,iswap,first,pbc,pbc_flag);
+    }
+    return 3;
+  }
+
+  // host path: copy xshake of the listed atoms into buf
+  int m = 0;
+
+  if (pbc_flag == 0) {
+    for (int k = 0; k < n; k++) {
+      const double *xs = xshake[list[k]];
+      buf[m++] = xs[0];
+      buf[m++] = xs[1];
+      buf[m++] = xs[2];
+    }
+    return 3;
+  }
+
+  // periodic shift applied to every packed coordinate
+  double dx,dy,dz;
+  if (domain->triclinic != 0) {
+    dx = pbc[0]*domain->xprd + pbc[5]*domain->xy + pbc[4]*domain->xz;
+    dy = pbc[1]*domain->yprd + pbc[3]*domain->yz;
+    dz = pbc[2]*domain->zprd;
+  } else {
+    dx = pbc[0]*domain->xprd;
+    dy = pbc[1]*domain->yprd;
+    dz = pbc[2]*domain->zprd;
+  }
+
+  for (int k = 0; k < n; k++) {
+    const double *xs = xshake[list[k]];
+    buf[m++] = xs[0] + dx;
+    buf[m++] = xs[1] + dy;
+    buf[m++] = xs[2] + dz;
+  }
+  return 3;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void FixShakeCuda::unpack_comm(int n, int first, double *buf)
+{
+  // after CUDA setup has finished, unpacking is delegated to the CUDA routine
+  if (cuda->finished_setup) {
+    Cuda_FixShakeCuda_UnpackComm(&cuda->shared_data,n,first,(void*)buf);
+    return;
+  }
+
+  // host path: buf holds 3 values per atom for atoms first .. first+n-1
+  const double *src = buf;
+  for (int k = 0; k < n; k++) {
+    double *dest = xshake[first+k];
+    dest[0] = src[0];
+    dest[1] = src[1];
+    dest[2] = src[2];
+    src += 3;
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Recompute the timestep-dependent factors from update->dt (verlet) or
+// step_respa[0] (any other integrate style) and pass them on to the
+// CUDA-side SHAKE setup.
+void FixShakeCuda::reset_dt()
+{
+  if (strcmp(update->integrate_style,"verlet") == 0) {
+    dtv = update->dt;
+    dtfsq = update->dt * update->dt * force->ftm2v;
+  } else {
+    // NOTE(review): dtfsq is not assigned in this branch, yet it is passed
+    // to Cuda_FixShakeCuda_Init below
+    dtv = step_respa[0];
+    dtf_innerhalf = 0.5 * step_respa[0] * force->ftm2v;
+    dtf_inner = step_respa[0] * force->ftm2v;
+  }
+  // NOTE(review): only cu_shake_atom is checked, but the call also
+  // dereferences cu_bond_distance, cu_angle_distance and cu_virial;
+  // grow_arrays() guards on cu_bond_distance instead - confirm both are safe
+  if(cu_shake_atom)
+  Cuda_FixShakeCuda_Init(&cuda->shared_data,dtv, dtfsq,
+  	cu_shake_flag->dev_data(),cu_shake_atom->dev_data(),cu_shake_type->dev_data(), cu_xshake->dev_data(),
+  	cu_bond_distance->dev_data(),cu_angle_distance->dev_data(),cu_virial->dev_data(),
+	max_iter,tolerance);
+}
diff --git a/src/USER-CUDA/fix_shake_cuda.h b/src/USER-CUDA/fix_shake_cuda.h
new file mode 100644
index 0000000000..18ea64f983
--- /dev/null
+++ b/src/USER-CUDA/fix_shake_cuda.h
@@ -0,0 +1,133 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under 
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#ifdef FIX_CLASS
+
+FixStyle(shake/cuda,FixShakeCuda)
+
+#else
+
+#ifndef LMP_FIX_SHAKE_CUDA_H
+#define LMP_FIX_SHAKE_CUDA_H
+
+#include "fix.h"
+#include "cuda_data.h"
+#include "cuda_precision.h"
+
+namespace LAMMPS_NS {
+
+// SHAKE constraint fix for the USER-CUDA package, registered above as fix
+// style "shake/cuda". Host-side per-atom arrays are paired with cCudaData
+// wrappers (cu_* members) whose device pointers are handed to the
+// Cuda_FixShakeCuda_* routines.
+class FixShakeCuda : public Fix {
+ public:
+  FixShakeCuda(class LAMMPS *, int, char **);
+  ~FixShakeCuda();
+  int setmask();
+  void init();
+  void setup(int);
+  void pre_neighbor();
+  void post_force(int);
+  // rRESPA entry point is disabled in this class
+  //void post_force_respa(int, int, int);
+
+  double memory_usage();                 // bytes used by per-atom arrays
+  void grow_arrays(int);                 // resize per-atom arrays + wrappers
+  void copy_arrays(int, int);            // copy atom i's entries to atom j
+  void set_arrays(int);                  // reset shake_flag of a new atom
+  int pack_exchange(int, double *);      // returns # of values packed
+  int unpack_exchange(int, double *);    // returns # of values unpacked
+  int pack_comm(int, int *, double *, int, int *);
+  void unpack_comm(int, int, double *);
+
+  int dof(int);
+  void reset_dt();
+
+  double time_postforce;
+ private:
+  class Cuda *cuda;
+  int me,nprocs;
+  double PI;
+  double tolerance;                      // SHAKE tolerance
+  int max_iter;                          // max # of SHAKE iterations
+  int output_every;                      // SHAKE stat output every so often
+  int next_output;                       // timestep for next output
+
+                                         // settings from input command
+  int *bond_flag,*angle_flag;            // bond/angle types to constrain
+  int *type_flag;                        // constrain bonds to these types
+  double *mass_list;                     // constrain bonds to these masses
+  int nmass;                             // # of masses in mass_list
+  bool neighbor_step;					 // was neighboring done in this step -> need to run the Cuda_FixShake_Init
+
+  double *bond_distance,*angle_distance; // constraint distances
+                                         // cCudaData wrappers for the two arrays above
+  cCudaData<double 	  , X_FLOAT , xx >* cu_bond_distance;
+  cCudaData<double 	  , X_FLOAT , xx >* cu_angle_distance;
+
+  int ifix_respa;                        // rRESPA fix needed by SHAKE
+  int nlevels_respa;                     // copies of needed rRESPA variables
+  int *loop_respa;
+  double *step_respa;
+
+  double **x,**v,**f;                    // local ptrs to atom class quantities
+  double *mass,*rmass;
+  int *type;
+  int nlocal;
+                                         // atom-based arrays
+  int *shake_flag;                       // 0 if atom not in SHAKE cluster
+                                         // 1 = size 3 angle cluster
+                                         // 2,3,4 = size of bond-only cluster
+  int **shake_atom;                      // global IDs of atoms in cluster
+                                         // central atom is 1st
+                                         // lowest global ID is 1st for size 2
+                                            
+  int **shake_type;                      // bondtype of each bond in cluster
+                                         // for angle cluster, 3rd value
+                                         //   is angletype
+  double **xshake;                       // unconstrained atom coords
+                                         // cCudaData wrappers for the atom-based arrays,
+                                         //   re-created in grow_arrays()
+  cCudaData<int 	  , int	    , xx >* cu_shake_flag;
+  cCudaData<int 	  , int	    , yx >* cu_shake_atom;
+  cCudaData<int 	  , int	    , yx >* cu_shake_type;
+  cCudaData<double 	  , X_FLOAT , xy >* cu_xshake;
+  cCudaData<int 	  , int	    , xx >* cu_list;     // presumably wraps list - confirm
+  cCudaData<double 	  , ENERGY_FLOAT , xx >* cu_virial;
+  int* countoccur;
+
+  int vflag;                            // virial flag
+  double dtv,dtfsq;                     // timesteps for trial move
+  double dtf_inner,dtf_innerhalf;       // timesteps for rRESPA trial move
+
+  int *list;                            // list of clusters to SHAKE
+  int nlist,maxlist;                    // size and max-size of list
+
+                                        // stat quantities
+  int *b_count,*b_count_all;            // counts for each bond type
+  double *b_ave,*b_max,*b_min;          // ave/max/min dist for each bond type
+  double *b_ave_all,*b_max_all,*b_min_all;   // MPI summing arrays
+  int *a_count,*a_count_all;            // ditto for angle types
+  double *a_ave,*a_max,*a_min;
+  double *a_ave_all,*a_max_all,*a_min_all;
+
+  void find_clusters();
+  void swap_clusters(int i,int j);
+  int masscheck(double);
+  void unconstrained_update();
+  void shake2(int);
+  void shake3(int);
+  void shake4(int);
+  void shake3angle(int);
+  void stats();                         // print bond & angle statistics
+  int bondfind(int, int, int);          // bond index or -1 if not found
+  int anglefind(int, int, int);         // angle index or -1 if not found
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-CUDA/fix_temp_berendsen_cuda.cpp b/src/USER-CUDA/fix_temp_berendsen_cuda.cpp
new file mode 100644
index 0000000000..2c9853c7c2
--- /dev/null
+++ b/src/USER-CUDA/fix_temp_berendsen_cuda.cpp
@@ -0,0 +1,220 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under 
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#include <cstring>
+#include <cstdlib>
+#include <cmath>
+#include "fix_temp_berendsen_cuda.h"
+#include "fix_temp_berendsen_cuda_cu.h"
+#include "atom.h"
+#include "force.h"
+#include "group.h"
+#include "update.h"
+#include "comm.h"
+#include "modify.h"
+#include "compute.h"
+#include "error.h"
+#include "cuda.h"
+#include "cuda_modify_flags.h"
+
+using namespace LAMMPS_NS;
+
+// values of "which": set in init() from temperature->tempbias
+enum{NOBIAS,BIAS};
+
+/* ---------------------------------------------------------------------- */
+
+FixTempBerendsenCuda::FixTempBerendsenCuda(LAMMPS *lmp, int narg, char **arg) :
+  Fix(lmp, narg, arg)
+{
+  cuda = lmp->cuda;
+  if (cuda == NULL)
+    error->all("You cannot use a /cuda class, without activating 'cuda' acceleration. Use no '-a' command line argument, or '-a cuda'.");
+
+  if (narg != 6) error->all("Illegal fix temp/berendsen/cuda command");
+
+  // the thermostat acts on every timestep
+
+  nevery = 1;
+
+  // arg[3], arg[4], arg[5] = start temperature, stop temperature, period
+
+  t_start = atof(arg[3]);
+  t_stop = atof(arg[4]);
+  t_period = atof(arg[5]);
+
+  if (t_period <= 0.0) error->all("Fix temp/berendsen/cuda period must be > 0.0");
+
+  // ID of the temperature compute created below = fix-ID + "_temp"
+
+  static const char suffix[] = "_temp";
+  id_temp = new char[strlen(id) + sizeof(suffix)];
+  strcpy(id_temp,id);
+  strcat(id_temp,suffix);
+
+  // create a temp/cuda compute on the fix group
+
+  char *computearg[3];
+  computearg[0] = id_temp;
+  computearg[1] = group->names[igroup];
+  computearg[2] = (char *) "temp/cuda";
+  modify->add_compute(3,computearg);
+
+  // remember that this fix owns the compute so it can delete it later
+
+  tflag = 1;
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Release the temperature compute (when tflag says this fix created it)
+// and the ID string.
+FixTempBerendsenCuda::~FixTempBerendsenCuda()
+{
+  // delete temperature if fix created it
+
+  if (tflag) modify->delete_compute(id_temp);
+  delete [] id_temp;
+}
+
+/* ---------------------------------------------------------------------- */
+
+int FixTempBerendsenCuda::setmask()
+{
+  // the only hook this fix uses is the CUDA end-of-step one
+  return 0 | END_OF_STEP_CUDA;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void FixTempBerendsenCuda::init()
+{
+  // look up the temperature compute by ID; it must exist
+  const int icompute = modify->find_compute(id_temp);
+  if (icompute < 0)
+    error->all("Temperature ID for fix temp/berendsen/cuda does not exist");
+
+  temperature = modify->compute[icompute];
+
+  // a non-cudable compute is allowed, but the user is warned
+  if (!temperature->cudable)
+    error->warning("Fix temp/berendsen/cuda uses non cudable temperature compute");
+
+  // remember whether the compute removes a velocity bias
+  which = temperature->tempbias ? BIAS : NOBIAS;
+
+  //temperature->init();        //not in original berendsen possible error?
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Apply the Berendsen thermostat: compute the current temperature, ramp
+// the target linearly from t_start to t_stop over the run, and rescale
+// the velocities of atoms in the fix group by lamda.
+// Raises an error if the computed temperature is exactly 0.0.
+void FixTempBerendsenCuda::end_of_step()
+{
+  double t_current;
+
+  // a non-cudable temperature compute gets x and v downloaded first
+  if (!temperature->cudable) {cuda->cu_x->download();cuda->cu_v->download();}
+  t_current = temperature->compute_scalar();
+  if (t_current == 0.0)
+    error->all("Computed temperature for fix temp/berendsen/cuda cannot be 0.0");
+
+  // fraction of the run completed so far
+  // skip the division when delta is 0 so that a zero-length run
+  // (endstep == beginstep) does not evaluate 0/0 and poison t_target
+
+  double delta = update->ntimestep - update->beginstep;
+  if (delta != 0.0) delta /= update->endstep - update->beginstep;
+  t_target = t_start + delta * (t_stop-t_start);
+
+  // rescale velocities by lamda
+
+  double lamda = sqrt(1.0 + update->dt/t_period*(t_target/t_current - 1.0));
+
+  double **v = atom->v;
+  int *mask = atom->mask;
+  int nlocal = atom->nlocal;
+
+  if (which == NOBIAS) {
+    Cuda_FixTempBerendsenCuda_EndOfStep(&cuda->shared_data, groupbit,lamda);
+
+  } else {
+    if (!temperature->cudable) {
+      // bias removal is done per atom here: download, rescale the
+      // unbiased velocities in the host loop, then upload v
+      cuda->cu_x->download();cuda->cu_v->download();
+      for (int i = 0; i < nlocal; i++) {
+        if (mask[i] & groupbit) {
+          temperature->remove_bias(i,v[i]);
+          v[i][0] *= lamda;
+          v[i][1] *= lamda;
+          v[i][2] *= lamda;
+          temperature->restore_bias(i,v[i]);
+        }
+      }
+      cuda->cu_v->upload();
+    } else {
+      // cudable compute: remove bias for all atoms, rescale via the
+      // CUDA routine, then restore the bias
+      temperature->remove_bias_all();
+      Cuda_FixTempBerendsenCuda_EndOfStep(&cuda->shared_data, groupbit,lamda);
+      temperature->restore_bias_all();
+    }
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+int FixTempBerendsenCuda::modify_param(int narg, char **arg)
+{
+  // only the "temp" keyword is handled here; 0 = nothing consumed
+  if (strcmp(arg[0],"temp") != 0) return 0;
+
+  if (narg < 2) error->all("Illegal fix_modify command");
+
+  // drop the compute this fix created itself, if it still owns one
+  if (tflag) {
+    modify->delete_compute(id_temp);
+    tflag = 0;
+  }
+
+  // replace the stored ID with the user-supplied one
+  delete [] id_temp;
+  id_temp = new char[strlen(arg[1]) + 1];
+  strcpy(id_temp,arg[1]);
+
+  const int icompute = modify->find_compute(id_temp);
+  if (icompute < 0) error->all("Could not find fix_modify temperature ID");
+  temperature = modify->compute[icompute];
+
+  if (temperature->tempflag == 0)
+    error->all("Fix_modify temperature ID does not compute temperature");
+  if (temperature->igroup != igroup && comm->me == 0)
+    error->warning("Group for fix_modify temp != fix group");
+
+  // keyword + ID = 2 arguments consumed
+  return 2;
+}
+
+
+/* ---------------------------------------------------------------------- */
+
+// Set both ends of the temperature ramp to t_new, so the target computed
+// in end_of_step() becomes the constant t_new.
+void FixTempBerendsenCuda::reset_target(double t_new)
+{
+  t_start = t_stop = t_new;
+}
+
+
+
diff --git a/src/USER-CUDA/fix_temp_berendsen_cuda.h b/src/USER-CUDA/fix_temp_berendsen_cuda.h
new file mode 100644
index 0000000000..391cd07e72
--- /dev/null
+++ b/src/USER-CUDA/fix_temp_berendsen_cuda.h
@@ -0,0 +1,58 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+#ifdef FIX_CLASS
+
+FixStyle(temp/berendsen/cuda,FixTempBerendsenCuda)
+
+#else
+
+#ifndef LMP_FIX_TEMP_BERENDSEN_CUDA_H
+#define LMP_FIX_TEMP_BERENDSEN_CUDA_H
+
+#include "fix.h"
+
+namespace LAMMPS_NS {
+class FixTempBerendsenCuda : public Fix {   // fix style temp/berendsen/cuda
+ public:
+  FixTempBerendsenCuda(class LAMMPS *, int, char **);
+  ~FixTempBerendsenCuda();
+  int setmask();
+  void init();
+  void end_of_step();
+  int modify_param(int, char **);   // handles fix_modify "temp"; returns 2 if handled, else 0
+  void reset_target(double);        // sets t_start and t_stop to the given value
+
+ private:
+  class Cuda *cuda;
+  int which;                        // compared against NOBIAS in end_of_step()
+  double t_start,t_stop,t_target,t_period;
+  // temperature compute used by this fix
+  char *id_temp;                    // heap-allocated ID of the temperature compute
+  class Compute *temperature;       // compute looked up from id_temp
+  int tflag;                        // nonzero: modify_param() deletes compute id_temp before replacing it
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-CUDA/fix_temp_rescale_cuda.cpp b/src/USER-CUDA/fix_temp_rescale_cuda.cpp
new file mode 100644
index 0000000000..42f038c6b0
--- /dev/null
+++ b/src/USER-CUDA/fix_temp_rescale_cuda.cpp
@@ -0,0 +1,222 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#include <cstring>
+#include <cstdlib>
+#include <cmath>
+#include "fix_temp_rescale_cuda.h"
+#include "fix_temp_rescale_cuda_cu.h"
+#include "atom.h"
+#include "force.h"
+#include "group.h"
+#include "update.h"
+#include "domain.h"
+#include "region.h"
+#include "comm.h"
+#include "modify.h"
+#include "compute.h"
+#include "error.h"
+#include "cuda.h"
+#include "cuda_modify_flags.h"
+
+using namespace LAMMPS_NS;
+
+enum{NOBIAS,BIAS};
+
+/* ---------------------------------------------------------------------- */
+
+FixTempRescaleCuda::FixTempRescaleCuda(LAMMPS *lmp, int narg, char **arg) :
+  Fix(lmp, narg, arg)
+{
+  // Constructor: requires an active Cuda instance, reads nevery from arg[3]
+  // and t_start, t_stop, t_window, fraction from arg[4..7], then registers
+  // a "temp/cuda" compute named "<fix-ID>_temp" on the fix group.
+  cuda = lmp->cuda;
+  if (!cuda)
+    error->all("You cannot use a /cuda class, without activating 'cuda' acceleration. Use no '-a' command line argument, or '-a cuda'.");
+  if (narg < 8) error->all("Illegal fix temp/rescale/cuda command");
+
+  nevery = atoi(arg[3]);
+  if (nevery <= 0) error->all("Illegal fix temp/rescale/cuda command");
+
+  // scalar output bookkeeping; global_freq follows nevery
+  scalar_flag = 1;
+  extscalar = 1;
+  global_freq = nevery;
+
+  t_start  = atof(arg[4]);
+  t_stop   = atof(arg[5]);
+  t_window = atof(arg[6]);
+  fraction = atof(arg[7]);
+
+  // build the compute ID: fix ID followed by "_temp" (sizeof counts the NUL)
+  static const char suffix[] = "_temp";
+  id_temp = new char[strlen(id) + sizeof(suffix)];
+  strcpy(id_temp,id);
+  strcat(id_temp,suffix);
+
+  // compute group = fix group; tflag records that this fix created the compute
+  char *newarg[3];
+  newarg[0] = id_temp;
+  newarg[1] = group->names[igroup];
+  newarg[2] = (char *) "temp/cuda";
+  modify->add_compute(3,newarg);
+  tflag = 1;
+  energy = 0.0;
+}
+
+/* ---------------------------------------------------------------------- */
+
+FixTempRescaleCuda::~FixTempRescaleCuda()
+{
+  // a set tflag means this fix created the compute named by id_temp,
+  // so remove that compute before releasing the ID string
+  if (tflag != 0) { modify->delete_compute(id_temp); }
+  delete [] id_temp;
+}
+
+/* ---------------------------------------------------------------------- */
+
+int FixTempRescaleCuda::setmask()
+{
+  // flags returned to the caller: END_OF_STEP_CUDA and THERMO_ENERGY_CUDA
+  const int flags = END_OF_STEP_CUDA | THERMO_ENERGY_CUDA;
+
+  return flags;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void FixTempRescaleCuda::init()
+{
+  // resolve the temperature compute by ID, cache the pointer, pick the bias mode
+  const int idx = modify->find_compute(id_temp);
+  if (idx < 0)
+    error->all("Temperature ID for fix temp/rescale/cuda does not exist");
+  temperature = modify->compute[idx];
+  if (!temperature->cudable)
+    error->warning("Fix temp/rescale/cuda uses non cudable temperature compute");
+  which = temperature->tempbias ? BIAS : NOBIAS;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void FixTempRescaleCuda::end_of_step()
+{
+  // Rescales velocities of the fix group toward the ramped target temperature
+  // whenever the current temperature is outside +/- t_window of the target.
+
+  // non-cudable compute: refresh host x and v before it is evaluated
+  if (!temperature->cudable) { cuda->cu_x->download(); cuda->cu_v->download(); }
+  const double t_current = temperature->compute_scalar();
+  if (t_current == 0.0)
+    error->all("Computed temperature for fix temp/rescale/cuda cannot be 0.0");
+
+  // fraction of the run completed; skip the division for a zero-length run
+  // (endstep == beginstep), which would otherwise yield 0/0 = NaN
+  double delta = update->ntimestep - update->beginstep;
+  if (delta != 0.0) delta /= update->endstep - update->beginstep;
+  double t_target = t_start + delta * (t_stop-t_start);
+
+  // nothing to do while the temperature stays inside the window
+  if (!(fabs(t_current-t_target) > t_window)) return;
+
+  // move only `fraction` of the way from the current to the target value
+  t_target = t_current - fraction*(t_current-t_target);
+  const double factor = sqrt(t_target/t_current);
+  const double efactor = 0.5 * force->boltz * temperature->dof;
+
+  if (which == NOBIAS) {
+    energy += (t_current-t_target) * efactor;
+    Cuda_FixTempRescaleCuda_EndOfStep(&cuda->shared_data, groupbit,factor);
+
+  } else if (which == BIAS) {
+    energy += (t_current-t_target) * efactor;
+    if (temperature->cudable) {
+      // bias removal and restoration bracket the device-side rescale
+      temperature->remove_bias_all();
+      Cuda_FixTempRescaleCuda_EndOfStep(&cuda->shared_data, groupbit,factor);
+      temperature->restore_bias_all();
+    } else {
+      // host fallback: rescale per atom on the CPU, then push v to the device
+      cuda->cu_x->download(); cuda->cu_v->download();
+      double **v = atom->v;
+      const int *mask = atom->mask;
+      const int nlocal = atom->nlocal;
+      for (int i = 0; i < nlocal; i++) {
+        if (!(mask[i] & groupbit)) continue;
+        temperature->remove_bias(i,v[i]);
+        v[i][0] *= factor;
+        v[i][1] *= factor;
+        v[i][2] *= factor;
+        temperature->restore_bias(i,v[i]);
+      }
+      cuda->cu_v->upload();
+    }
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+int FixTempRescaleCuda::modify_param(int narg, char **arg)
+{
+  // Handles the "temp" keyword of fix_modify: swaps in the temperature
+  // compute named by arg[1].  Returns 2 when handled, 0 for other keywords.
+  if (strcmp(arg[0],"temp") != 0) return 0;
+  if (narg < 2) error->all("Illegal fix_modify command");
+  // when tflag is set, delete the compute this fix created for itself
+  if (tflag) {
+    modify->delete_compute(id_temp);
+    tflag = 0;
+  }
+  // store a private copy of the new compute ID
+  delete [] id_temp;
+  id_temp = new char[strlen(arg[1]) + 1];
+  strcpy(id_temp,arg[1]);
+
+  const int idx = modify->find_compute(id_temp);
+  if (idx < 0) error->all("Could not find fix_modify temperature ID");
+  temperature = modify->compute[idx];
+  if (!temperature->tempflag)
+    error->all("Fix_modify temperature ID does not compute temperature");
+  if (comm->me == 0 && temperature->igroup != igroup)
+    error->warning("Group for fix_modify temp != fix group");
+  if (!temperature->cudable)
+    error->warning("Fix temp/rescale/cuda uses non cudable temperature compute");
+  return 2;
+}
+
+
+/* ---------------------------------------------------------------------- */
+
+void FixTempRescaleCuda::reset_target(double t_new)
+{
+  t_start = t_stop = t_new;   // both end points of the target ramp take the new value
+}
+
+/* ---------------------------------------------------------------------- */
+
+double FixTempRescaleCuda::compute_scalar()
+{
+  return energy;   // running sum accumulated in end_of_step()
+}
diff --git a/src/USER-CUDA/fix_temp_rescale_cuda.h b/src/USER-CUDA/fix_temp_rescale_cuda.h
new file mode 100644
index 0000000000..75876e60d3
--- /dev/null
+++ b/src/USER-CUDA/fix_temp_rescale_cuda.h
@@ -0,0 +1,61 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#ifdef FIX_CLASS
+
+FixStyle(temp/rescale/cuda,FixTempRescaleCuda)
+
+#else
+
+#ifndef FIX_TEMP_RESCALE_CUDA_H
+#define FIX_TEMP_RESCALE_CUDA_H
+
+#include "fix.h"
+
+namespace LAMMPS_NS {
+class FixTempRescaleCuda : public Fix {   // fix style temp/rescale/cuda
+ public:
+  FixTempRescaleCuda(class LAMMPS *, int, char **);
+  ~FixTempRescaleCuda();
+  int setmask();
+  void init();
+  void end_of_step();
+  int modify_param(int, char **);   // handles fix_modify "temp"; returns 2 if handled, else 0
+  void reset_target(double);        // sets t_start and t_stop to the given value
+  double compute_scalar();          // returns the accumulated `energy`
+
+ private:
+  class Cuda *cuda;                 // copied from lmp->cuda in the constructor
+  int which;                        // NOBIAS or BIAS, chosen in init() from temperature->tempbias
+  double t_start,t_stop,t_window;   // parsed from arg[4], arg[5], arg[6]
+  double fraction,energy,efactor;   // NOTE(review): end_of_step() uses a local efactor; this member looks unused
+  // temperature compute used by this fix
+  char *id_temp;                    // heap-allocated ID of the temperature compute
+  class Compute *temperature;       // resolved from id_temp in init() and modify_param()
+  int tflag;                        // 1 while the compute created in the constructor is still in use
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-CUDA/fix_temp_rescale_limit_cuda.cpp b/src/USER-CUDA/fix_temp_rescale_limit_cuda.cpp
new file mode 100644
index 0000000000..c8730a1728
--- /dev/null
+++ b/src/USER-CUDA/fix_temp_rescale_limit_cuda.cpp
@@ -0,0 +1,237 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#include <cstring>
+#include <cstdlib>
+#include <cmath>
+#include "fix_temp_rescale_limit_cuda.h"
+#include "fix_temp_rescale_limit_cuda_cu.h"
+#include "atom.h"
+#include "force.h"
+#include "group.h"
+#include "update.h"
+#include "domain.h"
+#include "region.h"
+#include "comm.h"
+#include "modify.h"
+#include "compute.h"
+#include "error.h"
+#include "cuda.h"
+#include "cuda_modify_flags.h"
+
+using namespace LAMMPS_NS;
+#define MIN(A,B) (((A) < (B)) ? (A) : (B))  // outer parentheses keep the ternary intact inside larger expressions
+#define MAX(A,B) (((A) > (B)) ? (A) : (B))  // note: each argument may be evaluated twice
+
+enum{NOBIAS,BIAS};
+
+/* ---------------------------------------------------------------------- */
+
+FixTempRescaleLimitCuda::FixTempRescaleLimitCuda(LAMMPS *lmp, int narg, char **arg) :
+  Fix(lmp, narg, arg)
+{
+  // Constructor: requires an active Cuda instance, reads nevery from arg[3],
+  // t_start, t_stop, t_window, fraction, limit from arg[4..8], then registers
+  // a "temp/cuda" compute named "<fix-ID>_temp" on the fix group.
+  cuda = lmp->cuda;
+  if (!cuda)
+    error->all("You cannot use a /cuda class, without activating 'cuda' acceleration. Use no '-a' command line argument, or '-a cuda'.");
+  if (narg < 9) error->all("Illegal fix temp/rescale/limit/cuda command");
+
+  nevery = atoi(arg[3]);
+  if (nevery <= 0) error->all("Illegal fix temp/rescale/limit/cuda command");
+
+  // scalar output bookkeeping; global_freq follows nevery
+  scalar_flag = 1;
+  extscalar = 1;
+  global_freq = nevery;
+
+  t_start  = atof(arg[4]);
+  t_stop   = atof(arg[5]);
+  t_window = atof(arg[6]);
+  fraction = atof(arg[7]);
+  limit    = atof(arg[8]);
+  if (!(limit > 1.0))
+    error->all("Illegal fix temp/rescale/limit/cuda command (limit must be > 1.0)");
+
+  // build the compute ID: fix ID followed by "_temp" (sizeof counts the NUL)
+  static const char suffix[] = "_temp";
+  id_temp = new char[strlen(id) + sizeof(suffix)];
+  strcpy(id_temp,id);
+  strcat(id_temp,suffix);
+
+  // compute group = fix group; tflag records that this fix created the compute
+  char *newarg[3];
+  newarg[0] = id_temp;
+  newarg[1] = group->names[igroup];
+  newarg[2] = (char *) "temp/cuda";
+  modify->add_compute(3,newarg);
+  tflag = 1;
+  energy = 0.0;
+}
+
+/* ---------------------------------------------------------------------- */
+
+FixTempRescaleLimitCuda::~FixTempRescaleLimitCuda()
+{
+  // a set tflag means this fix created the compute named by id_temp,
+  // so remove that compute before releasing the ID string
+  if (tflag != 0) { modify->delete_compute(id_temp); }
+  delete [] id_temp;
+}
+
+/* ---------------------------------------------------------------------- */
+
+int FixTempRescaleLimitCuda::setmask()
+{
+  // flags returned to the caller: END_OF_STEP_CUDA and THERMO_ENERGY_CUDA
+  const int flags = END_OF_STEP_CUDA | THERMO_ENERGY_CUDA;
+
+  return flags;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void FixTempRescaleLimitCuda::init()
+{
+  // resolve the temperature compute by ID, cache the pointer, pick the bias mode
+  const int idx = modify->find_compute(id_temp);
+  if (idx < 0)
+    error->all("Temperature ID for fix temp/rescale/limit/cuda does not exist");
+  temperature = modify->compute[idx];
+  if (!temperature->cudable)
+    error->warning("Fix temp/rescale/limit/cuda uses non cudable temperature compute");
+  which = temperature->tempbias ? BIAS : NOBIAS;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void FixTempRescaleLimitCuda::end_of_step()
+{
+  // Rescales group velocities toward the ramped target temperature when the
+  // current value is outside +/- t_window; on the host path each scaled
+  // component is limited to +/- current_limit (the kernels get the same limit).
+
+  // non-cudable compute: refresh host x and v before it is evaluated
+  if (!temperature->cudable) { cuda->cu_x->download(); cuda->cu_v->download(); }
+  const double t_current = temperature->compute_scalar();
+  if (t_current == 0.0)
+    error->all("Computed temperature for fix temp/rescale/limit/cuda cannot be 0.0");
+
+  // fraction of the run completed; skip the division for a zero-length run
+  // (endstep == beginstep), which would otherwise yield 0/0 = NaN
+  double delta = update->ntimestep - update->beginstep;
+  if (delta != 0.0) delta /= update->endstep - update->beginstep;
+  double t_target = t_start + delta * (t_stop-t_start);
+
+  // nothing to do while the temperature stays inside the window
+  if (!(fabs(t_current-t_target) > t_window)) return;
+
+  // move only `fraction` of the way from the current to the target value
+  t_target = t_current - fraction*(t_current-t_target);
+  const double factor = sqrt(t_target/t_current);
+  const double efactor = 0.5 * force->boltz * temperature->dof;
+
+  // reference mass for the velocity limit: first local atom when per-atom
+  // masses exist, otherwise atom type 1.  This used to read atom->mass[0];
+  // NOTE(review): per-type arrays are indexed from 1 in LAMMPS, so slot 0
+  // is never assigned -- confirm type 1 is the intended reference.
+  const double massone = atom->rmass ? atom->rmass[0] : atom->mass[1];
+  const double current_limit =
+    sqrt(limit*force->boltz*t_target*temperature->dof/massone/force->mvv2e);
+
+  if (which == NOBIAS) {
+    energy += (t_current-t_target) * efactor;
+    Cuda_FixTempRescaleLimitCuda_EndOfStep(&cuda->shared_data, groupbit,factor,current_limit);
+
+  } else if (which == BIAS) {
+    energy += (t_current-t_target) * efactor;
+    if (temperature->cudable) {
+      // bias removal and restoration bracket the device-side rescale
+      temperature->remove_bias_all();
+      Cuda_FixTempRescaleLimitCuda_EndOfStep(&cuda->shared_data, groupbit,factor,current_limit);
+      temperature->restore_bias_all();
+    } else {
+      // host fallback: scale and clamp per atom, then push v to the device
+      cuda->cu_x->download(); cuda->cu_v->download();
+      double **v = atom->v;
+      const int *mask = atom->mask;
+      const int nlocal = atom->nlocal;
+      for (int i = 0; i < nlocal; i++) {
+        if (!(mask[i] & groupbit)) continue;
+        temperature->remove_bias(i,v[i]);
+        for (int k = 0; k < 3; k++) {
+          const double vk = v[i][k] * factor;
+          v[i][k] = (vk > 0) ? (MIN(vk,current_limit)) : (MAX(vk,-current_limit));
+        }
+        temperature->restore_bias(i,v[i]);
+      }
+      cuda->cu_v->upload();
+    }
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+int FixTempRescaleLimitCuda::modify_param(int narg, char **arg)
+{
+  // Handles the "temp" keyword of fix_modify: swaps in the temperature
+  // compute named by arg[1].  Returns 2 when handled, 0 for other keywords.
+  if (strcmp(arg[0],"temp") != 0) return 0;
+  if (narg < 2) error->all("Illegal fix_modify command");
+  // when tflag is set, delete the compute this fix created for itself
+  if (tflag) {
+    modify->delete_compute(id_temp);
+    tflag = 0;
+  }
+  // store a private copy of the new compute ID
+  delete [] id_temp;
+  id_temp = new char[strlen(arg[1]) + 1];
+  strcpy(id_temp,arg[1]);
+
+  const int idx = modify->find_compute(id_temp);
+  if (idx < 0) error->all("Could not find fix_modify temperature ID");
+  temperature = modify->compute[idx];
+  if (!temperature->tempflag)
+    error->all("Fix_modify temperature ID does not compute temperature");
+  if (comm->me == 0 && temperature->igroup != igroup)
+    error->warning("Group for fix_modify temp != fix group");
+  if (!temperature->cudable)
+    error->warning("Fix temp/rescale/limit/cuda uses non cudable temperature compute");
+  return 2;
+}
+
+
+/* ---------------------------------------------------------------------- */
+
+void FixTempRescaleLimitCuda::reset_target(double t_new)
+{
+  t_start = t_stop = t_new;   // both end points of the target ramp take the new value
+}
+
+/* ---------------------------------------------------------------------- */
+
+double FixTempRescaleLimitCuda::compute_scalar()
+{
+  return energy;   // running sum accumulated in end_of_step()
+}
diff --git a/src/USER-CUDA/fix_temp_rescale_limit_cuda.h b/src/USER-CUDA/fix_temp_rescale_limit_cuda.h
new file mode 100644
index 0000000000..7ee49d3c40
--- /dev/null
+++ b/src/USER-CUDA/fix_temp_rescale_limit_cuda.h
@@ -0,0 +1,61 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#ifdef FIX_CLASS
+
+FixStyle(temp/rescale/limit/cuda,FixTempRescaleLimitCuda)
+
+#else
+
+#ifndef FIX_TEMP_RESCALE_LIMIT_CUDA_H
+#define FIX_TEMP_RESCALE_LIMIT_CUDA_H
+
+#include "fix.h"
+
+namespace LAMMPS_NS {
+class FixTempRescaleLimitCuda : public Fix {   // fix style temp/rescale/limit/cuda
+ public:
+  FixTempRescaleLimitCuda(class LAMMPS *, int, char **);
+  ~FixTempRescaleLimitCuda();
+  int setmask();
+  void init();
+  void end_of_step();
+  int modify_param(int, char **);   // handles fix_modify "temp"; returns 2 if handled, else 0
+  void reset_target(double);        // sets t_start and t_stop to the given value
+  double compute_scalar();          // returns the accumulated `energy`
+
+ private:
+  class Cuda *cuda;                 // copied from lmp->cuda in the constructor
+  int which;                        // NOBIAS or BIAS, chosen in init() from temperature->tempbias
+  double t_start,t_stop,t_window;   // parsed from arg[4], arg[5], arg[6]
+  double fraction,energy,efactor;   // NOTE(review): end_of_step() uses a local efactor; this member looks unused
+  double limit;                     // parsed from arg[8]; the constructor rejects values <= 1.0
+  char *id_temp;                    // heap-allocated ID of the temperature compute
+  class Compute *temperature;       // resolved from id_temp in init() and modify_param()
+  int tflag;                        // 1 while the compute created in the constructor is still in use
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-CUDA/fix_viscous_cuda.cpp b/src/USER-CUDA/fix_viscous_cuda.cpp
new file mode 100644
index 0000000000..c167105027
--- /dev/null
+++ b/src/USER-CUDA/fix_viscous_cuda.cpp
@@ -0,0 +1,103 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#include <cmath>
+#include <cstdlib>
+#include <cstring>
+#include "fix_viscous_cuda.h"
+#include "fix_viscous_cuda_cu.h"
+#include "atom.h"
+#include "update.h"
+#include "respa.h"
+#include "error.h"
+#include "cuda_modify_flags.h"
+#include "cuda.h"
+
+using namespace LAMMPS_NS;
+
+/* ---------------------------------------------------------------------- */
+
+FixViscousCuda::FixViscousCuda(LAMMPS *lmp, int narg, char **arg) :
+  FixViscous(lmp, narg, arg)
+{
+  // the device copy of gamma does not exist yet; setup() creates it
+  cu_gamma = NULL;
+  cuda = lmp->cuda;
+  if (!cuda)
+    error->all("You cannot use a /cuda class, without activating 'cuda' acceleration. Use no '-a' command line argument, or '-a cuda'.");
+}
+
+/* ---------------------------------------------------------------------- */
+
+FixViscousCuda::~FixViscousCuda()
+{
+	delete cu_gamma;   // still NULL (a no-op delete) when setup() never ran
+}
+
+/* ---------------------------------------------------------------------- */
+
+int FixViscousCuda::setmask()
+{
+  // only the CUDA post-force hook is requested; these two flags were
+  // left disabled by the original author:
+  //   POST_FORCE_RESPA, MIN_POST_FORCE
+  const int flags = POST_FORCE_CUDA;
+  return flags;
+}
+
+
+/* ---------------------------------------------------------------------- */
+
+void FixViscousCuda::setup(int vflag)
+{
+   if(not cu_gamma)   // first call only: wrap the host gamma array (ntypes+1 entries)
+   cu_gamma = new cCudaData<double, F_FLOAT, x> (gamma,atom->ntypes+1);
+   Cuda_FixViscousCuda_Init(&cuda->shared_data);
+   cu_gamma->upload();   // executed on every setup() call
+ // if (strcmp(update->integrate_style,"verlet/cuda") == 0)
+    post_force(vflag);   // unconditional: the rRESPA alternative below is commented out
+ /* else {
+    ((Respa *) update->integrate)->copy_flevel_f(nlevels_respa-1);
+    post_force_respa(vflag,nlevels_respa-1,0);
+    ((Respa *) update->integrate)->copy_f_flevel(nlevels_respa-1);
+  }*/
+}
+
+/* ---------------------------------------------------------------------- */
+
+void FixViscousCuda::min_setup(int vflag)
+{
+  // reuse setup(): it creates/uploads cu_gamma (NULL until then) before post_force()
+  FixViscousCuda::setup(vflag);   // also performs the Cuda_FixViscousCuda_Init call
+}
+
+/* ---------------------------------------------------------------------- */
+
+void FixViscousCuda::post_force(int vflag)
+{
+  // apply drag force to atoms in group
+  // direction is opposed to velocity vector
+  // magnitude depends on atom type
+  // vflag is not used here; cu_gamma must already exist (it is created in setup())
+  Cuda_FixViscousCuda_PostForce(&cuda->shared_data, groupbit,cu_gamma->dev_data());
+}
diff --git a/src/USER-CUDA/fix_viscous_cuda.h b/src/USER-CUDA/fix_viscous_cuda.h
new file mode 100644
index 0000000000..54e75cc0d2
--- /dev/null
+++ b/src/USER-CUDA/fix_viscous_cuda.h
@@ -0,0 +1,55 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#ifdef FIX_CLASS
+
+FixStyle(viscous/cuda,FixViscousCuda)
+
+#else
+
+#ifndef LMP_FIX_VISCOUS_CUDA_H
+#define LMP_FIX_VISCOUS_CUDA_H
+
+#include "fix_viscous.h"
+#include "cuda_data.h"
+
+namespace LAMMPS_NS {
+
+class FixViscousCuda : public FixViscous {   // fix style viscous/cuda
+ public:
+  FixViscousCuda(class LAMMPS *, int, char **);
+  ~FixViscousCuda();
+  int setmask();                 // returns POST_FORCE_CUDA only
+  void setup(int);               // creates cu_gamma on first use, uploads it, calls post_force()
+  void min_setup(int);
+  void post_force(int);          // launches Cuda_FixViscousCuda_PostForce with cu_gamma's device data
+  cCudaData<double, F_FLOAT, x>* cu_gamma;   // wrapper around the host gamma array; NULL until setup()
+
+  private:
+  class Cuda *cuda;              // copied from lmp->cuda in the constructor
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-CUDA/modify_cuda.cpp b/src/USER-CUDA/modify_cuda.cpp
index 7f8d7f8c5a..9f1716ac7a 100644
--- a/src/USER-CUDA/modify_cuda.cpp
+++ b/src/USER-CUDA/modify_cuda.cpp
@@ -63,6 +63,8 @@ using namespace LAMMPS_NS;
 ModifyCuda::ModifyCuda(LAMMPS *lmp) : Modify(lmp)
 {
   cuda = lmp->cuda;
+   if(cuda == NULL)
+        error->all("You cannot use a /cuda class, without activating 'cuda' acceleration. Use no '-a' command line argument, or '-a cuda'.");
 
   n_initial_integrate_cuda = 0;
   n_post_integrate_cuda = 0;
diff --git a/src/USER-CUDA/neigh_full_cuda.cpp b/src/USER-CUDA/neigh_full_cuda.cpp
index 49678c1d06..197b62a0ac 100644
--- a/src/USER-CUDA/neigh_full_cuda.cpp
+++ b/src/USER-CUDA/neigh_full_cuda.cpp
@@ -21,7 +21,6 @@
    This software is distributed under the GNU General Public License.
 ------------------------------------------------------------------------- */
 
-#ifdef CUDA
 #include "neighbor_cuda.h"
 #include "neigh_list.h"
 #include "atom.h"
@@ -313,5 +312,4 @@ return;
   MYDBG(printf(" # CUDA::NeighFullNSQCuda ... end\n");)
   */
 }
-#endif
 
diff --git a/src/USER-CUDA/neighbor_cuda.cpp b/src/USER-CUDA/neighbor_cuda.cpp
index 4575ce2acc..9626650ee8 100644
--- a/src/USER-CUDA/neighbor_cuda.cpp
+++ b/src/USER-CUDA/neighbor_cuda.cpp
@@ -36,6 +36,8 @@ enum{NSQ,BIN,MULTI};     // also in neigh_list.cpp
 NeighborCuda::NeighborCuda(LAMMPS *lmp) : Neighbor(lmp)
 {
   cuda = lmp->cuda;
+   if(cuda == NULL)
+        error->all("You cannot use a /cuda class, without activating 'cuda' acceleration. Use no '-a' command line argument, or '-a cuda'.");
 }
 
 /* ---------------------------------------------------------------------- */
diff --git a/src/USER-CUDA/pair_born_coul_long_cuda.cpp b/src/USER-CUDA/pair_born_coul_long_cuda.cpp
new file mode 100644
index 0000000000..fa19e5c9a2
--- /dev/null
+++ b/src/USER-CUDA/pair_born_coul_long_cuda.cpp
@@ -0,0 +1,186 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   Contributing author: Paul Crozier (SNL)
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#include <cmath>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include "pair_born_coul_long_cuda.h"
+#include "pair_born_coul_long_cuda_cu.h"
+#include "cuda_data.h"
+#include "atom.h"
+#include "comm.h"
+#include "force.h"
+#include "kspace.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+#include "neigh_request.h"
+#include "cuda_neigh_list.h"
+#include "update.h"
+#include "integrate.h"
+#include "respa.h"
+#include "memory.h"
+#include "error.h"
+#include "cuda.h"
+
+using namespace LAMMPS_NS;
+
+#define MIN(a,b) ((a) < (b) ? (a) : (b))
+#define MAX(a,b) ((a) > (b) ? (a) : (b))
+
+#define EWALD_F   1.12837917
+#define EWALD_P   0.3275911
+#define A1        0.254829592
+#define A2       -0.284496736
+#define A3        1.421413741
+#define A4       -1.453152027
+#define A5        1.061405429
+/* ---------------------------------------------------------------------- */
+
+PairBornCoulLongCuda::PairBornCoulLongCuda(LAMMPS *lmp) : PairBornCoulLong(lmp)
+{
+  cuda = lmp->cuda;   // a /cuda pair style is unusable without an active Cuda instance
+  if (cuda == NULL)
+    error->all("You cannot use a /cuda class, without activating 'cuda' acceleration. Use no '-a' command line argument, or '-a cuda'.");
+  cuda_neigh_list = NULL;   // assigned in init_list(); do not leave it indeterminate
+  allocated2 = false;       // allocate() has not yet published arrays to shared_data
+  cuda->shared_data.pair.cudable_force = 1;
+  cuda->shared_data.pair.use_block_per_atom = 0;
+  cuda->setSystemParams();
+}
+
+/* ----------------------------------------------------------------------
+   remember pointer to arrays in cuda shared data
+------------------------------------------------------------------------- */
+
+void PairBornCoulLongCuda::allocate()
+{
+  // base-class allocation first, then hand the coefficient arrays to the
+  // CUDA shared data exactly once (allocated2 guards the hand-over)
+  if (!allocated) PairBornCoulLong::allocate();
+  if (allocated2) return;
+  allocated2 = true;
+  cuda->shared_data.pair.cut          = cut_lj;
+  cuda->shared_data.pair.coeff1       = rhoinv;
+  cuda->shared_data.pair.coeff2       = sigma;
+  cuda->shared_data.pair.coeff3       = a;
+  cuda->shared_data.pair.coeff4       = c;
+  cuda->shared_data.pair.coeff5       = d;
+  cuda->shared_data.pair.offset       = offset;
+  cuda->shared_data.pair.special_lj   = force->special_lj;
+  cuda->shared_data.pair.special_coul = force->special_coul;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairBornCoulLongCuda::compute(int eflag, int vflag)
+{
+	MYDBG( printf("PairBornCoulLongCuda compute start\n"); fflush(stdout);)
+	if (eflag || vflag) ev_setup(eflag,vflag);
+	if(eflag) cuda->cu_eng_vdwl->upload();   // energy/virial buffers are uploaded only when requested
+	if(eflag) cuda->cu_eng_coul->upload();
+	if(vflag) cuda->cu_virial->upload();
+	#ifdef CUDA_USE_BINNING
+	Cuda_PairBornCoulLongCuda(& cuda->shared_data, eflag, vflag);
+	#else
+	Cuda_PairBornCoulLongCuda(& cuda->shared_data, & cuda_neigh_list->sneighlist, eflag, vflag, eflag_atom, vflag_atom);
+	#endif
+    if(not cuda->shared_data.pair.collect_forces_later)   // otherwise the downloads are skipped here
+    {
+	  if(eflag) cuda->cu_eng_vdwl->download();
+	  if(eflag) cuda->cu_eng_coul->download();
+	  if(vflag) cuda->cu_virial->download();
+    }
+	MYDBG( printf("PairBornCoulLongCuda compute end\n"); fflush(stdout);)
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairBornCoulLongCuda::settings(int narg, char **arg)
+{
+	PairBornCoulLong::settings(narg, arg);   // base class parses the arguments
+	cuda->shared_data.pair.cut_global = (F_FLOAT) cut_lj_global;   // copy the global LJ cutoff into shared data
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairBornCoulLongCuda::coeff(int narg, char **arg)
+{
+	PairBornCoulLong::coeff(narg, arg);   // base class parses the coefficients
+	allocate();   // one-time hand-over of the coefficient arrays to cuda->shared_data
+}
+
+void PairBornCoulLongCuda::init_style()
+{
+  // Validates prerequisites (charges, no rRESPA, a KSpace solver), requests
+  // a full cudable neighbor list and copies the Coulomb parameters into
+  // the CUDA shared data.
+  if (!atom->q_flag)
+    error->all("Pair style born/coul/long requires atom attribute q");
+
+  // rRESPA is rejected; the message must name this style (born, not buck)
+  if (strcmp(update->integrate_style,"respa") == 0)
+    error->all("Integrate Style Respa is not supported by pair style born/coul/long/cuda");
+
+  const int irequest = neighbor->request(this);
+  neighbor->requests[irequest]->full = 1;
+  neighbor->requests[irequest]->half = 0;
+  neighbor->requests[irequest]->cudable = 1;
+
+  cut_coulsq = cut_coul * cut_coul;
+  cuda->shared_data.pair.cut_coulsq_global=cut_coulsq;
+
+  if (force->kspace == NULL)
+    error->all("Pair style is incompatible with KSpace style");
+  g_ewald = force->kspace->g_ewald;
+  cuda->shared_data.pair.g_ewald=g_ewald;
+  cuda->shared_data.pppm.qqrd2e=force->qqrd2e;
+
+  if(ncoultablebits) error->warning("# CUDA: You asked for the useage of Coulomb Tables. This is not supported in CUDA Pair forces. Setting is ignored.\n");
+}
+
+void PairBornCoulLongCuda::init_list(int id, NeighList *ptr)
+{
+	MYDBG(printf("# CUDA PairBornCoulLongCuda::init_list\n");)
+	PairBornCoulLong::init_list(id, ptr);   // base class sees the list first
+	#ifndef CUDA_USE_BINNING
+	// right now we can only handle verlet (id 0), not respa
+	if(id == 0) cuda_neigh_list = cuda->registerNeighborList(ptr);   // compute() passes cuda_neigh_list->sneighlist to the kernel
+	// see Neighbor::init() for details on lammps lists' logic
+	#endif
+	MYDBG(printf("# CUDA PairBornCoulLongCuda::init_list end\n");)
+}
+
+void PairBornCoulLongCuda::ev_setup(int eflag, int vflag)
+{
+  // Rebuilds the cCudaData wrappers around eatom/vatom when atom->nmax exceeds
+  // the sizes recorded before the base-class call (which may update them).
+  const int maxeatomold = maxeatom, maxvatomold = maxvatom;
+  PairBornCoulLong::ev_setup(eflag,vflag);
+  if (eflag_atom && atom->nmax > maxeatomold)
+    {delete cuda->cu_eatom; cuda->cu_eatom = new cCudaData<double, ENERGY_FLOAT, x > ((double*)eatom, & cuda->shared_data.atom.eatom , atom->nmax  );}
+  // per-atom virial: keyed on vflag_atom/maxvatom; NOTE(review): assumes shared_data.atom has a vatom member -- confirm
+  if (vflag_atom && atom->nmax > maxvatomold)
+    {delete cuda->cu_vatom; cuda->cu_vatom = new cCudaData<double, ENERGY_FLOAT, yx > ((double*)vatom, & cuda->shared_data.atom.vatom , atom->nmax, 6  );}
+}
diff --git a/src/USER-CUDA/pair_born_coul_long_cuda.h b/src/USER-CUDA/pair_born_coul_long_cuda.h
new file mode 100644
index 0000000000..91f6f650ae
--- /dev/null
+++ b/src/USER-CUDA/pair_born_coul_long_cuda.h
@@ -0,0 +1,57 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(born/coul/long/cuda,PairBornCoulLongCuda)
+
+#else
+
+#ifndef LMP_PAIR_BORN_COUL_LONG_CUDA_H
+#define LMP_PAIR_BORN_COUL_LONG_CUDA_H
+
+#include "pair_born_coul_long.h"
+
+namespace LAMMPS_NS {
+
+// Pair style born/coul/long/cuda: PairBornCoulLong plus the CUDA-side state declared below.
+class PairBornCoulLongCuda : public PairBornCoulLong {
+ public:
+  PairBornCoulLongCuda(class LAMMPS *);
+  void compute(int, int);
+  void settings(int, char **);
+  void coeff(int, char **);
+  void init_list(int, class NeighList *);
+  void init_style();
+  void ev_setup(int eflag, int vflag);
+ protected:
+  class Cuda *cuda;                      // Cuda object used by init_list()/ev_setup()
+  void allocate();
+  bool allocated2;                       // presumably guards one-time setup in allocate() -- confirm in the .cpp
+  class CudaNeighList* cuda_neigh_list;  // assigned in init_list() via cuda->registerNeighborList()
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-CUDA/pair_buck_coul_cut_cuda.cpp b/src/USER-CUDA/pair_buck_coul_cut_cuda.cpp
new file mode 100644
index 0000000000..5d7fd4fc3f
--- /dev/null
+++ b/src/USER-CUDA/pair_buck_coul_cut_cuda.cpp
@@ -0,0 +1,173 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   Contributing author: Paul Crozier (SNL)
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#include <cmath>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include "pair_buck_coul_cut_cuda.h"
+#include "pair_buck_coul_cut_cuda_cu.h"
+#include "cuda_data.h"
+#include "atom.h"
+#include "comm.h"
+#include "force.h"
+#include "kspace.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+#include "neigh_request.h"
+#include "cuda_neigh_list.h"
+#include "update.h"
+#include "integrate.h"
+#include "respa.h"
+#include "memory.h"
+#include "error.h"
+#include "cuda.h"
+
+using namespace LAMMPS_NS;
+
+#define MIN(a,b) ((a) < (b) ? (a) : (b))
+#define MAX(a,b) ((a) > (b) ? (a) : (b))
+
+/* ---------------------------------------------------------------------- */
+
+// Build on PairBuckCoulCut, require an active Cuda object and set the pair flags in its shared data.
+PairBuckCoulCutCuda::PairBuckCoulCutCuda(LAMMPS *lmp) : PairBuckCoulCut(lmp) {
+  cuda = lmp->cuda;
+  if(cuda == NULL)
+    error->all("You cannot use a /cuda class, without activating 'cuda' acceleration. Use no '-a' command line argument, or '-a cuda'.");
+  allocated2 = false;
+  cuda_neigh_list = NULL;  // otherwise uninitialized until init_list() registers list 0
+  cuda->shared_data.pair.cudable_force = 1;
+  cuda->shared_data.pair.use_block_per_atom = 0;
+  cuda->setSystemParams();
+}
+
+/* ----------------------------------------------------------------------
+   remember pointer to arrays in cuda shared data
+------------------------------------------------------------------------- */
+
+// Let the base class allocate if it has not done so, then -- once only -- record the
+// cutoff/coefficient/special arrays in cuda->shared_data.pair.
+void PairBuckCoulCutCuda::allocate() {
+  if (!allocated) { PairBuckCoulCut::allocate(); }
+  if (allocated2) return;   // shared data was already filled in by an earlier call
+  allocated2 = true;
+
+  cuda->shared_data.pair.cut          = cut_lj;
+  cuda->shared_data.pair.cut_coul     = cut_coul;
+  cuda->shared_data.pair.offset       = offset;
+  cuda->shared_data.pair.coeff1       = rhoinv;
+  cuda->shared_data.pair.coeff2       = buck1;
+  cuda->shared_data.pair.coeff3       = buck2;
+  cuda->shared_data.pair.coeff4       = a;
+  cuda->shared_data.pair.coeff5       = c;
+  cuda->shared_data.pair.special_lj   = force->special_lj;
+  cuda->shared_data.pair.special_coul = force->special_coul;
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Run ev_setup() if tallying was requested, call Cuda_PairBuckCoulCutCuda(), and move the
+// global energy/virial accumulators through upload()/download() around that call.
+void PairBuckCoulCutCuda::compute(int eflag, int vflag)
+{
+  MYDBG( printf("PairBuckCoulCutCuda compute start\n"); fflush(stdout);)
+  if (eflag || vflag) { ev_setup(eflag,vflag); }
+  if (eflag) { cuda->cu_eng_vdwl->upload(); cuda->cu_eng_coul->upload(); }
+  if (vflag) { cuda->cu_virial->upload(); }
+
+  Cuda_PairBuckCoulCutCuda(& cuda->shared_data, & cuda_neigh_list->sneighlist, eflag, vflag, eflag_atom, vflag_atom);
+
+  // With collect_forces_later set, the downloads are skipped here.
+  const bool fetch_now = !cuda->shared_data.pair.collect_forces_later;
+  if (fetch_now && eflag) { cuda->cu_eng_vdwl->download(); cuda->cu_eng_coul->download(); }
+  if (fetch_now && vflag) { cuda->cu_virial->download(); }
+
+  MYDBG( printf("PairBuckCoulCutCuda compute end\n"); fflush(stdout);)
+}
+
+/* ---------------------------------------------------------------------- */
+
+// After the base class handles the arguments, copy both global cutoffs (as F_FLOAT) into the CUDA shared data.
+void PairBuckCoulCutCuda::settings(int narg, char **arg) {
+  PairBuckCoulCut::settings(narg, arg);
+  cuda->shared_data.pair.cut_global = static_cast<F_FLOAT>(cut_lj_global);
+  cuda->shared_data.pair.cut_coul_global = static_cast<F_FLOAT>(cut_coul_global);
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Forward the coefficients to PairBuckCoulCut, then run this class's allocate().
+void PairBuckCoulCutCuda::coeff(int narg, char **arg) {
+  PairBuckCoulCut::coeff(narg, arg);
+  allocate();
+}
+
+// Check prerequisites (charges present, no rRESPA), request a full cudable neighbor
+// list and copy qqrd2e and the squared global coulomb cutoff into the CUDA shared data.
+void PairBuckCoulCutCuda::init_style()
+{
+  // Fix: both messages now name this style; they used to say buck/coul/long.
+  if (!atom->q_flag)
+    error->all("Pair style buck/coul/cut/cuda requires atom attribute q");
+  if (strcmp(update->integrate_style,"respa") == 0)
+    error->all("Integrate Style Respa is not supported by pair style buck/coul/cut/cuda");
+
+  // full (not half) neighbor list, flagged cudable
+  int irequest = neighbor->request(this);
+  neighbor->requests[irequest]->full = 1;
+  neighbor->requests[irequest]->half = 0;
+  neighbor->requests[irequest]->cudable = 1;
+
+  cuda->shared_data.pppm.qqrd2e=force->qqrd2e;
+  cuda->shared_data.pair.cut_coulsq_global=cut_coul_global * cut_coul_global;
+
+  // a Coulomb-table request is only reported, nothing else is done with it here
+  if(ncoultablebits) error->warning("# CUDA: You asked for the useage of Coulomb Tables. This is not supported in CUDA Pair forces. Setting is ignored.\n");
+}
+
+// Pass the list to PairBuckCoulCut, then (unless CUDA_USE_BINNING is defined) register list 0 with the Cuda object.
+void PairBuckCoulCutCuda::init_list(int id, NeighList *ptr) {
+  MYDBG(printf("# CUDA PairBuckCoulCutCuda::init_list\n");)
+  PairBuckCoulCut::init_list(id, ptr);
+#ifndef CUDA_USE_BINNING
+  // Only the verlet list (id 0) is handled for now, not respa;
+  // Neighbor::init() explains how LAMMPS organises its lists.
+  if (id == 0) { cuda_neigh_list = cuda->registerNeighborList(ptr); }
+#endif
+  MYDBG(printf("# CUDA PairBuckCoulCutCuda::init_list end\n");)
+}
+
+// Run the base-class ev_setup(), then rebuild the cCudaData wrappers around the per-atom
+// energy/virial arrays whenever atom->nmax exceeds the maxeatom value seen before the call.
+void PairBuckCoulCutCuda::ev_setup(int eflag, int vflag)
+{
+  int maxeatomold=maxeatom;
+  PairBuckCoulCut::ev_setup(eflag,vflag);
+  if (eflag_atom && atom->nmax > maxeatomold)
+    {delete cuda->cu_eatom; cuda->cu_eatom = new cCudaData<double, ENERGY_FLOAT, x > ((double*)eatom, & cuda->shared_data.atom.eatom , atom->nmax  );}
+  // Fix: the virial wrapper is keyed on vflag_atom and bound to atom.vatom (it used eflag_atom / atom.eatom).
+  if (vflag_atom && atom->nmax > maxeatomold)
+    {delete cuda->cu_vatom; cuda->cu_vatom = new cCudaData<double, ENERGY_FLOAT, yx > ((double*)vatom, & cuda->shared_data.atom.vatom , atom->nmax, 6  );}
+}
diff --git a/src/USER-CUDA/pair_buck_coul_cut_cuda.h b/src/USER-CUDA/pair_buck_coul_cut_cuda.h
new file mode 100644
index 0000000000..b46a18364a
--- /dev/null
+++ b/src/USER-CUDA/pair_buck_coul_cut_cuda.h
@@ -0,0 +1,57 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(buck/coul/cut/cuda,PairBuckCoulCutCuda)
+
+#else
+
+#ifndef LMP_PAIR_BUCK_COUL_CUT_CUDA_H
+#define LMP_PAIR_BUCK_COUL_CUT_CUDA_H
+
+#include "pair_buck_coul_cut.h"
+
+namespace LAMMPS_NS {
+
+// Pair style buck/coul/cut/cuda: PairBuckCoulCut plus the CUDA-side state declared below.
+class PairBuckCoulCutCuda : public PairBuckCoulCut {
+ public:
+  PairBuckCoulCutCuda(class LAMMPS *);
+  void compute(int, int);
+  void settings(int, char **);
+  void coeff(int, char **);
+  void init_list(int, class NeighList *);
+  void init_style();
+  void ev_setup(int eflag, int vflag);
+ protected:
+  class Cuda *cuda;                      // taken from lmp->cuda in the constructor
+  void allocate();
+  bool allocated2;                       // set once allocate() has filled the CUDA shared data
+  class CudaNeighList* cuda_neigh_list;  // assigned in init_list() via cuda->registerNeighborList()
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-CUDA/pair_buck_coul_long_cuda.cpp b/src/USER-CUDA/pair_buck_coul_long_cuda.cpp
new file mode 100644
index 0000000000..558d42a29d
--- /dev/null
+++ b/src/USER-CUDA/pair_buck_coul_long_cuda.cpp
@@ -0,0 +1,184 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   Contributing author: Paul Crozier (SNL)
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#include <cmath>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include "pair_buck_coul_long_cuda.h"
+#include "pair_buck_coul_long_cuda_cu.h"
+#include "cuda_data.h"
+#include "atom.h"
+#include "comm.h"
+#include "force.h"
+#include "kspace.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+#include "neigh_request.h"
+#include "cuda_neigh_list.h"
+#include "update.h"
+#include "integrate.h"
+#include "respa.h"
+#include "memory.h"
+#include "error.h"
+#include "cuda.h"
+
+using namespace LAMMPS_NS;
+
+#define MIN(a,b) ((a) < (b) ? (a) : (b))
+#define MAX(a,b) ((a) > (b) ? (a) : (b))
+
+#define EWALD_F   1.12837917
+#define EWALD_P   0.3275911
+#define A1        0.254829592
+#define A2       -0.284496736
+#define A3        1.421413741
+#define A4       -1.453152027
+#define A5        1.061405429
+/* ---------------------------------------------------------------------- */
+
+// Build on PairBuckCoulLong, require an active Cuda object and set the pair flags in its shared data.
+PairBuckCoulLongCuda::PairBuckCoulLongCuda(LAMMPS *lmp) : PairBuckCoulLong(lmp) {
+  cuda = lmp->cuda;
+  if(cuda == NULL)
+    error->all("You cannot use a /cuda class, without activating 'cuda' acceleration. Use no '-a' command line argument, or '-a cuda'.");
+  allocated2 = false;
+  cuda_neigh_list = NULL;  // otherwise uninitialized until init_list() registers list 0
+  cuda->shared_data.pair.cudable_force = 1;
+  cuda->shared_data.pair.use_block_per_atom = 0;
+  cuda->setSystemParams();
+}
+
+/* ----------------------------------------------------------------------
+   remember pointer to arrays in cuda shared data
+------------------------------------------------------------------------- */
+
+// Let the base class allocate if it has not done so, then -- once only -- record the
+// cutoff/coefficient/special arrays in cuda->shared_data.pair.
+void PairBuckCoulLongCuda::allocate() {
+  if (!allocated) { PairBuckCoulLong::allocate(); }
+  if (allocated2) return;   // shared data was already filled in by an earlier call
+  allocated2 = true;
+
+  cuda->shared_data.pair.cut          = cut_lj;
+  cuda->shared_data.pair.offset       = offset;
+  cuda->shared_data.pair.coeff1       = rhoinv;
+  cuda->shared_data.pair.coeff2       = buck1;
+  cuda->shared_data.pair.coeff3       = buck2;
+  cuda->shared_data.pair.coeff4       = a;
+  cuda->shared_data.pair.coeff5       = c;
+  cuda->shared_data.pair.special_lj   = force->special_lj;
+  cuda->shared_data.pair.special_coul = force->special_coul;
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Run ev_setup() if tallying was requested, call Cuda_PairBuckCoulLongCuda(), and move the
+// global energy/virial accumulators through upload()/download() around that call.
+void PairBuckCoulLongCuda::compute(int eflag, int vflag)
+{
+  MYDBG( printf("PairBuckCoulLongCuda compute start\n"); fflush(stdout);)
+  if (eflag || vflag) { ev_setup(eflag,vflag); }
+  if (eflag) { cuda->cu_eng_vdwl->upload(); cuda->cu_eng_coul->upload(); }
+  if (vflag) { cuda->cu_virial->upload(); }
+
+  Cuda_PairBuckCoulLongCuda(& cuda->shared_data, & cuda_neigh_list->sneighlist, eflag, vflag, eflag_atom, vflag_atom);
+
+  // With collect_forces_later set, the downloads are skipped here.
+  const bool fetch_now = !cuda->shared_data.pair.collect_forces_later;
+  if (fetch_now && eflag) { cuda->cu_eng_vdwl->download(); cuda->cu_eng_coul->download(); }
+  if (fetch_now && vflag) { cuda->cu_virial->download(); }
+
+  MYDBG( printf("PairBuckCoulLongCuda compute end\n"); fflush(stdout);)
+}
+
+/* ---------------------------------------------------------------------- */
+
+// After the base class handles the arguments, copy cut_lj_global (as F_FLOAT) into the CUDA shared data.
+void PairBuckCoulLongCuda::settings(int narg, char **arg) {
+  PairBuckCoulLong::settings(narg, arg);
+  cuda->shared_data.pair.cut_global = static_cast<F_FLOAT>(cut_lj_global);
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Forward the coefficients to PairBuckCoulLong, then run this class's allocate().
+void PairBuckCoulLongCuda::coeff(int narg, char **arg) {
+  PairBuckCoulLong::coeff(narg, arg);
+  allocate();
+}
+
+// Check prerequisites (charges, no rRESPA, a KSpace style), request a full cudable
+// neighbor list and copy the coulomb cutoff, g_ewald and qqrd2e into the CUDA shared data.
+void PairBuckCoulLongCuda::init_style()
+{
+  if (!atom->q_flag) {
+    error->all("Pair style buck/coul/long requires atom attribute q");
+  }
+  const bool respa_run = (strcmp(update->integrate_style,"respa") == 0);
+  if (respa_run) {
+    error->all("Integrate Style Respa is not supported by pair style buck/coul/long/cuda");
+  }
+
+  // one full (not half) neighbor list, flagged cudable
+  const int req = neighbor->request(this);
+  neighbor->requests[req]->cudable = 1;
+  neighbor->requests[req]->half = 0;
+  neighbor->requests[req]->full = 1;
+
+  cut_coulsq = cut_coul * cut_coul;
+  cuda->shared_data.pair.cut_coulsq_global = cut_coulsq;
+
+  if (force->kspace == NULL) { error->all("Pair style is incompatible with KSpace style"); }
+  g_ewald = force->kspace->g_ewald;
+  cuda->shared_data.pair.g_ewald = g_ewald;
+  cuda->shared_data.pppm.qqrd2e = force->qqrd2e;
+
+  if (ncoultablebits) { error->warning("# CUDA: You asked for the useage of Coulomb Tables. This is not supported in CUDA Pair forces. Setting is ignored.\n"); }
+}
+
+// Pass the list to PairBuckCoulLong, then (unless CUDA_USE_BINNING is defined) register list 0 with the Cuda object.
+void PairBuckCoulLongCuda::init_list(int id, NeighList *ptr) {
+  MYDBG(printf("# CUDA PairBuckCoulLongCuda::init_list\n");)
+  PairBuckCoulLong::init_list(id, ptr);
+#ifndef CUDA_USE_BINNING
+  // Only the verlet list (id 0) is handled for now, not respa;
+  // Neighbor::init() explains how LAMMPS organises its lists.
+  if (id == 0) { cuda_neigh_list = cuda->registerNeighborList(ptr); }
+#endif
+  MYDBG(printf("# CUDA PairBuckCoulLongCuda::init_list end\n");)
+}
+
+// Run the base-class ev_setup(), then rebuild the cCudaData wrappers around the per-atom
+// energy/virial arrays whenever atom->nmax exceeds the maxeatom value seen before the call.
+void PairBuckCoulLongCuda::ev_setup(int eflag, int vflag)
+{
+  int maxeatomold=maxeatom;
+  PairBuckCoulLong::ev_setup(eflag,vflag);
+  if (eflag_atom && atom->nmax > maxeatomold)
+    {delete cuda->cu_eatom; cuda->cu_eatom = new cCudaData<double, ENERGY_FLOAT, x > ((double*)eatom, & cuda->shared_data.atom.eatom , atom->nmax  );}
+  // Fix: the virial wrapper is keyed on vflag_atom and bound to atom.vatom (it used eflag_atom / atom.eatom).
+  if (vflag_atom && atom->nmax > maxeatomold)
+    {delete cuda->cu_vatom; cuda->cu_vatom = new cCudaData<double, ENERGY_FLOAT, yx > ((double*)vatom, & cuda->shared_data.atom.vatom , atom->nmax, 6  );}
+}
diff --git a/src/USER-CUDA/pair_buck_coul_long_cuda.h b/src/USER-CUDA/pair_buck_coul_long_cuda.h
new file mode 100644
index 0000000000..39a3791031
--- /dev/null
+++ b/src/USER-CUDA/pair_buck_coul_long_cuda.h
@@ -0,0 +1,57 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(buck/coul/long/cuda,PairBuckCoulLongCuda)
+
+#else
+
+#ifndef LMP_PAIR_BUCK_COUL_LONG_CUDA_H
+#define LMP_PAIR_BUCK_COUL_LONG_CUDA_H
+
+#include "pair_buck_coul_long.h"
+
+namespace LAMMPS_NS {
+
+// Pair style buck/coul/long/cuda: PairBuckCoulLong plus the CUDA-side state declared below.
+class PairBuckCoulLongCuda : public PairBuckCoulLong {
+ public:
+  PairBuckCoulLongCuda(class LAMMPS *);
+  void compute(int, int);
+  void settings(int, char **);
+  void coeff(int, char **);
+  void init_list(int, class NeighList *);
+  void init_style();
+  void ev_setup(int eflag, int vflag);
+ protected:
+  class Cuda *cuda;                      // taken from lmp->cuda in the constructor
+  void allocate();
+  bool allocated2;                       // set once allocate() has filled the CUDA shared data
+  class CudaNeighList* cuda_neigh_list;  // assigned in init_list() via cuda->registerNeighborList()
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-CUDA/pair_buck_cuda.cpp b/src/USER-CUDA/pair_buck_cuda.cpp
new file mode 100644
index 0000000000..b8f164b923
--- /dev/null
+++ b/src/USER-CUDA/pair_buck_cuda.cpp
@@ -0,0 +1,169 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   Contributing author: Paul Crozier (SNL)
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#include <cmath>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include "pair_buck_cuda.h"
+#include "pair_buck_cuda_cu.h"
+#include "cuda_data.h"
+#include "atom.h"
+#include "comm.h"
+#include "force.h"
+#include "kspace.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+#include "neigh_request.h"
+#include "cuda_neigh_list.h"
+#include "update.h"
+#include "integrate.h"
+#include "respa.h"
+#include "memory.h"
+#include "error.h"
+#include "cuda.h"
+
+using namespace LAMMPS_NS;
+
+#define MIN(a,b) ((a) < (b) ? (a) : (b))
+#define MAX(a,b) ((a) > (b) ? (a) : (b))
+
+/* ---------------------------------------------------------------------- */
+
+// Build on PairBuck, require an active Cuda object and set the pair flags in its shared data.
+PairBuckCuda::PairBuckCuda(LAMMPS *lmp) : PairBuck(lmp) {
+  cuda = lmp->cuda;
+  if(cuda == NULL)
+    error->all("You cannot use a /cuda class, without activating 'cuda' acceleration. Use no '-a' command line argument, or '-a cuda'.");
+  allocated2 = false;
+  cuda_neigh_list = NULL;  // otherwise uninitialized until init_list() registers list 0
+  cuda->shared_data.pair.cudable_force = 1;
+  cuda->shared_data.pair.use_block_per_atom = 0;
+  cuda->setSystemParams();
+}
+
+/* ----------------------------------------------------------------------
+   remember pointer to arrays in cuda shared data
+------------------------------------------------------------------------- */
+
+// Let the base class allocate if it has not done so, then -- once only -- record the
+// cutoff/coefficient/special arrays in cuda->shared_data.pair.
+void PairBuckCuda::allocate() {
+  if (!allocated) { PairBuck::allocate(); }
+  if (allocated2) return;   // shared data was already filled in by an earlier call
+  allocated2 = true;
+
+  cuda->shared_data.pair.cut        = cut;
+  cuda->shared_data.pair.offset     = offset;
+  cuda->shared_data.pair.coeff1     = rhoinv;
+  cuda->shared_data.pair.coeff2     = buck1;
+  cuda->shared_data.pair.coeff3     = buck2;
+  cuda->shared_data.pair.coeff4     = a;
+  cuda->shared_data.pair.coeff5     = c;
+  cuda->shared_data.pair.special_lj = force->special_lj;
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Run ev_setup() if tallying was requested, call Cuda_PairBuckCuda(), and move the
+// global energy/virial accumulators through upload()/download() around that call.
+void PairBuckCuda::compute(int eflag, int vflag)
+{
+  MYDBG( printf("PairBuckCuda compute start\n"); fflush(stdout);)
+  if (eflag || vflag) { ev_setup(eflag,vflag); }
+  if (eflag) { cuda->cu_eng_vdwl->upload(); cuda->cu_eng_coul->upload(); }
+  if (vflag) { cuda->cu_virial->upload(); }
+
+  Cuda_PairBuckCuda(& cuda->shared_data, & cuda_neigh_list->sneighlist, eflag, vflag, eflag_atom, vflag_atom);
+
+  // With collect_forces_later set, the downloads are skipped here.
+  const bool fetch_now = !cuda->shared_data.pair.collect_forces_later;
+  if (fetch_now && eflag) { cuda->cu_eng_vdwl->download(); cuda->cu_eng_coul->download(); }
+  if (fetch_now && vflag) { cuda->cu_virial->download(); }
+
+  MYDBG( printf("PairBuckCuda compute end\n"); fflush(stdout);)
+}
+
+/* ---------------------------------------------------------------------- */
+
+// After the base class handles the arguments, copy cut_global (as F_FLOAT) into the CUDA shared data.
+void PairBuckCuda::settings(int narg, char **arg) {
+  PairBuck::settings(narg, arg);
+  cuda->shared_data.pair.cut_global = static_cast<F_FLOAT>(cut_global);
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Forward the coefficients to PairBuck, then run this class's allocate().
+void PairBuckCuda::coeff(int narg, char **arg) {
+  PairBuck::coeff(narg, arg);
+  allocate();
+}
+
+// Check prerequisites, request a full cudable neighbor list and copy qqrd2e into the CUDA shared data.
+void PairBuckCuda::init_style()
+{
+  // NOTE(review): the charge requirement looks carried over from the coulomb variants; this
+  // block does not show whether plain buck needs q -- confirm before relaxing it.
+  if (!atom->q_flag)
+    error->all("Pair style buck/cuda requires atom attribute q");
+  // Fix: both messages now name this style; they used to say buck/coul/long.
+  if (strcmp(update->integrate_style,"respa") == 0)
+    error->all("Integrate Style Respa is not supported by pair style buck/cuda");
+
+  // full (not half) neighbor list, flagged cudable
+  int irequest = neighbor->request(this);
+  neighbor->requests[irequest]->full = 1;
+  neighbor->requests[irequest]->half = 0;
+  neighbor->requests[irequest]->cudable = 1;
+
+  cuda->shared_data.pppm.qqrd2e=force->qqrd2e;
+
+  if(ncoultablebits) error->warning("# CUDA: You asked for the useage of Coulomb Tables. This is not supported in CUDA Pair forces. Setting is ignored.\n");
+}
+
+// Pass the list to PairBuck, then (unless CUDA_USE_BINNING is defined) register list 0 with the Cuda object.
+void PairBuckCuda::init_list(int id, NeighList *ptr) {
+  MYDBG(printf("# CUDA PairBuckCuda::init_list\n");)
+  PairBuck::init_list(id, ptr);
+#ifndef CUDA_USE_BINNING
+  // Only the verlet list (id 0) is handled for now, not respa;
+  // Neighbor::init() explains how LAMMPS organises its lists.
+  if (id == 0) { cuda_neigh_list = cuda->registerNeighborList(ptr); }
+#endif
+  MYDBG(printf("# CUDA PairBuckCuda::init_list end\n");)
+}
+
+// Run the base-class ev_setup(), then rebuild the cCudaData wrappers around the per-atom
+// energy/virial arrays whenever atom->nmax exceeds the maxeatom value seen before the call.
+void PairBuckCuda::ev_setup(int eflag, int vflag)
+{
+  int maxeatomold=maxeatom;
+  PairBuck::ev_setup(eflag,vflag);
+  if (eflag_atom && atom->nmax > maxeatomold)
+    {delete cuda->cu_eatom; cuda->cu_eatom = new cCudaData<double, ENERGY_FLOAT, x > ((double*)eatom, & cuda->shared_data.atom.eatom , atom->nmax  );}
+  // Fix: the virial wrapper is keyed on vflag_atom and bound to atom.vatom (it used eflag_atom / atom.eatom).
+  if (vflag_atom && atom->nmax > maxeatomold)
+    {delete cuda->cu_vatom; cuda->cu_vatom = new cCudaData<double, ENERGY_FLOAT, yx > ((double*)vatom, & cuda->shared_data.atom.vatom , atom->nmax, 6  );}
+}
diff --git a/src/USER-CUDA/pair_buck_cuda.h b/src/USER-CUDA/pair_buck_cuda.h
new file mode 100644
index 0000000000..9ec29e1662
--- /dev/null
+++ b/src/USER-CUDA/pair_buck_cuda.h
@@ -0,0 +1,57 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(buck/cuda,PairBuckCuda)
+
+#else
+
+#ifndef LMP_PAIR_BUCK_CUDA_H
+#define LMP_PAIR_BUCK_CUDA_H
+
+#include "pair_buck.h"
+
+namespace LAMMPS_NS {
+
+// Pair style buck/cuda: PairBuck plus the CUDA-side state declared below.
+class PairBuckCuda : public PairBuck {
+ public:
+  PairBuckCuda(class LAMMPS *);
+  void compute(int, int);
+  void settings(int, char **);
+  void coeff(int, char **);
+  void init_list(int, class NeighList *);
+  void init_style();
+  void ev_setup(int eflag, int vflag);
+ protected:
+  class Cuda *cuda;                      // taken from lmp->cuda in the constructor
+  void allocate();
+  bool allocated2;                       // set once allocate() has filled the CUDA shared data
+  class CudaNeighList* cuda_neigh_list;  // assigned in init_list() via cuda->registerNeighborList()
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-CUDA/pair_cg_cmm_coul_cut_cuda.cpp b/src/USER-CUDA/pair_cg_cmm_coul_cut_cuda.cpp
new file mode 100644
index 0000000000..f0b50469ce
--- /dev/null
+++ b/src/USER-CUDA/pair_cg_cmm_coul_cut_cuda.cpp
@@ -0,0 +1,204 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under 
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Paul Crozier (SNL)
+------------------------------------------------------------------------- */
+
+#include <cmath>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include "pair_cg_cmm_coul_cut_cuda.h"
+#include "pair_cg_cmm_coul_cut_cuda_cu.h"
+#include "cuda_data.h"
+#include "atom.h"
+#include "comm.h"
+#include "force.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+#include "neigh_request.h"
+#include "cuda_neigh_list.h"
+#include "update.h"
+#include "integrate.h"
+#include "respa.h"
+#include "memory.h"
+#include "error.h"
+#include "cuda.h"
+
+using namespace LAMMPS_NS;
+
+#define MIN(a,b) ((a) < (b) ? (a) : (b))
+#define MAX(a,b) ((a) > (b) ? (a) : (b))
+
+/* ---------------------------------------------------------------------- */
+
+// Build on PairCGCMMCoulCut, require an active Cuda object and flag the pair force as cudable.
+PairCGCMMCoulCutCuda::PairCGCMMCoulCutCuda(LAMMPS *lmp) : PairCGCMMCoulCut(lmp) {
+  cuda = lmp->cuda;
+  if(cuda == NULL)
+    error->all("You cannot use a /cuda class, without activating 'cuda' acceleration. Use no '-a' command line argument, or '-a cuda'.");
+  allocated2 = false;
+  cg_type_double = NULL;
+  cuda_neigh_list = NULL;  // otherwise uninitialized until init_list() registers list 0
+  cuda->shared_data.pair.cudable_force = 1;
+  cuda->setSystemParams();
+}
+
+/* ----------------------------------------------------------------------
+   remember pointer to arrays in cuda shared data
+------------------------------------------------------------------------- */
+
+// First call only: create cg_type_double and record the pair arrays in the CUDA shared data.
+// Every call: refresh cg_type_double as a symmetric, double-valued copy of cg_type.
+void PairCGCMMCoulCutCuda::allocate()
+{
+  if (!allocated) { PairCGCMMCoulCut::allocate(); }
+  const int ntypes = atom->ntypes;
+
+  if (!allocated2) {
+    allocated2 = true;
+    // sized (ntypes+1) x (ntypes+1) because the copy loop below indexes 1..ntypes
+    memory->create(cg_type_double,ntypes+1,ntypes+1,"paircg:cgtypedouble");
+
+    cuda->shared_data.pair.cut          = cut_lj;
+    cuda->shared_data.pair.cut_coul     = cut_coul;
+    cuda->shared_data.pair.offset       = offset;
+    cuda->shared_data.pair.coeff1       = lj1;
+    cuda->shared_data.pair.coeff2       = lj2;
+    cuda->shared_data.pair.coeff3       = lj3;
+    cuda->shared_data.pair.coeff4       = lj4;
+    cuda->shared_data.pair.coeff5       = cg_type_double;
+    cuda->shared_data.pair.special_lj   = force->special_lj;
+    cuda->shared_data.pair.special_coul = force->special_coul;
+  }
+
+  // the upper triangle of cg_type is written to both [row][col] and [col][row]
+  for (int row = 1; row <= ntypes; ++row)
+    for (int col = row; col <= ntypes; ++col)
+      cg_type_double[row][col] = cg_type_double[col][row] = cg_type[row][col];
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Run ev_setup() if tallying was requested, call Cuda_PairCGCMMCoulCutCuda(), and move the
+// global energy/virial accumulators through upload()/download() around that call.
+void PairCGCMMCoulCutCuda::compute(int eflag, int vflag)
+{
+  if (eflag || vflag) { ev_setup(eflag,vflag); }
+
+  if (eflag) { cuda->cu_eng_vdwl->upload(); cuda->cu_eng_coul->upload(); }
+  if (vflag) { cuda->cu_virial->upload(); }
+
+  Cuda_PairCGCMMCoulCutCuda(& cuda->shared_data, & cuda_neigh_list->sneighlist, eflag, vflag, eflag_atom, vflag_atom);
+
+  // With collect_forces_later set, the downloads are skipped here;
+  // otherwise the accumulators requested by eflag/vflag are downloaded.
+  const bool fetch_now = !cuda->shared_data.pair.collect_forces_later;
+  if (fetch_now && eflag) { cuda->cu_eng_vdwl->download(); cuda->cu_eng_coul->download(); }
+  if (fetch_now && vflag) { cuda->cu_virial->download(); }
+}
+
+/* ---------------------------------------------------------------------- */
+
+// After the base class handles the arguments, copy kappa and both global cutoffs (as F_FLOAT) into the CUDA shared data.
+void PairCGCMMCoulCutCuda::settings(int narg, char **arg) {
+  PairCGCMMCoulCut::settings(narg, arg);
+  cuda->shared_data.pair.kappa = static_cast<F_FLOAT>(kappa);
+  cuda->shared_data.pair.cut_global = static_cast<F_FLOAT>(cut_lj_global);
+  cuda->shared_data.pair.cut_coul_global = static_cast<F_FLOAT>(cut_coul_global);
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Forward the coefficients to PairCGCMMCoulCut, then run this class's allocate().
+void PairCGCMMCoulCutCuda::coeff(int narg, char **arg) {
+  PairCGCMMCoulCut::coeff(narg, arg);
+  allocate();
+}
+
+// Request the full cudable neighbor list (skipped only for rRESPA with whichflag == 0),
+// copy qqrd2e into the CUDA shared data and clear cut_respa.
+void PairCGCMMCoulCutCuda::init_style()
+{
+  MYDBG(printf("# CUDA PairCGCMMCoulCutCuda::init_style start\n"); )
+
+  const bool respa_setup = update->whichflag == 0 && strcmp(update->integrate_style,"respa") == 0;
+  if (!respa_setup)
+  {
+    // full (not half) neighbor list, flagged cudable
+    int irequest = neighbor->request(this);
+    neighbor->requests[irequest]->full = 1;
+    neighbor->requests[irequest]->half = 0;
+    neighbor->requests[irequest]->cudable = 1;
+  }
+
+  cuda->shared_data.pppm.qqrd2e=force->qqrd2e;
+  cut_respa=NULL;
+
+  // Fix: the warning text used to read "Pair style uses does not use ...".
+  if (force->newton)
+    error->warning("Pair style does not use \"newton\" setting. You might test if \"newton off\" makes the simulation run faster.");
+
+  MYDBG(printf("# CUDA PairCGCMMCoulCutCuda::init_style end\n"); )
+}
+
+// Pass the list to PairCGCMMCoulCut, then (unless CUDA_USE_BINNING is defined) register list 0 with the Cuda object.
+void PairCGCMMCoulCutCuda::init_list(int id, NeighList *ptr) {
+  MYDBG(printf("# CUDA PairCGCMMCoulCutCuda::init_list\n");)
+  PairCGCMMCoulCut::init_list(id, ptr);
+#ifndef CUDA_USE_BINNING
+  // Only the verlet list (id 0) is handled for now, not respa;
+  // Neighbor::init() explains how LAMMPS organises its lists.
+  if (id == 0) { cuda_neigh_list = cuda->registerNeighborList(ptr); }
+#endif
+  MYDBG(printf("# CUDA PairCGCMMCoulCutCuda::init_list end\n");)
+}
+
+// Run the base-class ev_setup(), then rebuild the cCudaData wrappers around the per-atom
+// energy/virial arrays whenever atom->nmax exceeds the maxeatom value seen before the call.
+void PairCGCMMCoulCutCuda::ev_setup(int eflag, int vflag)
+{
+  int maxeatomold=maxeatom;
+  PairCGCMMCoulCut::ev_setup(eflag,vflag);
+  if (eflag_atom && atom->nmax > maxeatomold)
+    {delete cuda->cu_eatom; cuda->cu_eatom = new cCudaData<double, ENERGY_FLOAT, x > ((double*)eatom, & cuda->shared_data.atom.eatom , atom->nmax  );}
+  // Fix: the virial wrapper is keyed on vflag_atom and bound to atom.vatom (it used eflag_atom / atom.eatom).
+  if (vflag_atom && atom->nmax > maxeatomold)
+    {delete cuda->cu_vatom; cuda->cu_vatom = new cCudaData<double, ENERGY_FLOAT, yx > ((double*)vatom, & cuda->shared_data.atom.vatom , atom->nmax, 6  );}
+}
+
+
diff --git a/src/USER-CUDA/pair_cg_cmm_coul_cut_cuda.h b/src/USER-CUDA/pair_cg_cmm_coul_cut_cuda.h
new file mode 100644
index 0000000000..467b8a3feb
--- /dev/null
+++ b/src/USER-CUDA/pair_cg_cmm_coul_cut_cuda.h
@@ -0,0 +1,58 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(cg/cmm/coul/cut/cuda,PairCGCMMCoulCutCuda)
+
+#else
+
+#ifndef PAIR_CG_CMM_COUL_CUT_CUDA_H
+#define PAIR_CG_CMM_COUL_CUT_CUDA_H
+
+#include "pair_cg_cmm_coul_cut.h"
+
+namespace LAMMPS_NS {
+
+// CUDA variant of the cg/cmm/coul/cut pair style; overrides the entry
+// points of PairCGCMMCoulCut listed below.
+class PairCGCMMCoulCutCuda : public PairCGCMMCoulCut
+{
+ public:
+  PairCGCMMCoulCutCuda(class LAMMPS *lmp);
+
+  void compute(int eflag, int vflag);
+  void settings(int narg, char **arg);
+  void coeff(int narg, char **arg);
+  void init_list(int id, class NeighList *ptr);
+  void init_style();
+  void ev_setup(int eflag, int vflag);
+
+ protected:
+  // Cuda object whose shared_data and per-atom wrappers this style uses
+  class Cuda *cuda;
+
+  void allocate();
+
+  // presumably records that the CUDA-side setup in allocate() has run
+  // - confirm against the .cpp
+  bool allocated2;
+
+  // assigned in init_list() from cuda->registerNeighborList()
+  class CudaNeighList *cuda_neigh_list;
+
+  // presumably a double-valued copy of cg_type - confirm against the .cpp
+  double **cg_type_double;
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-CUDA/pair_cg_cmm_coul_debye_cuda.cpp b/src/USER-CUDA/pair_cg_cmm_coul_debye_cuda.cpp
new file mode 100644
index 0000000000..43f0f22b09
--- /dev/null
+++ b/src/USER-CUDA/pair_cg_cmm_coul_debye_cuda.cpp
@@ -0,0 +1,204 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under 
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Paul Crozier (SNL)
+------------------------------------------------------------------------- */
+
+#include <cmath>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include "pair_cg_cmm_coul_debye_cuda.h"
+#include "pair_cg_cmm_coul_debye_cuda_cu.h"
+#include "cuda_data.h"
+#include "atom.h"
+#include "comm.h"
+#include "force.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+#include "neigh_request.h"
+#include "cuda_neigh_list.h"
+#include "update.h"
+#include "integrate.h"
+#include "respa.h"
+#include "memory.h"
+#include "error.h"
+#include "cuda.h"
+
+using namespace LAMMPS_NS;
+
+#define MIN(a,b) ((a) < (b) ? (a) : (b))
+#define MAX(a,b) ((a) > (b) ? (a) : (b))
+
+/* ---------------------------------------------------------------------- */
+
+PairCGCMMCoulDebyeCuda::PairCGCMMCoulDebyeCuda(LAMMPS *lmp) : PairCGCMMCoulCut(lmp)
+{
+  // a /cuda style needs the Cuda object created by the cuda acceleration
+  cuda = lmp->cuda;
+  if(cuda == NULL)
+    error->all("You cannot use a /cuda class, without activating 'cuda' acceleration. Use no '-a' command line argument, or '-a cuda'.");
+
+  allocated2 = false;
+  cg_type_double = NULL;
+
+  // init_list() assigns this only for list id 0 (and never when
+  // CUDA_USE_BINNING is defined), while compute() dereferences it;
+  // give it a defined value instead of leaving it uninitialized
+  cuda_neigh_list = NULL;
+
+  cuda->shared_data.pair.cudable_force = 1;
+  cuda->setSystemParams();
+}
+
+/* ----------------------------------------------------------------------
+   remember pointer to arrays in cuda shared data
+------------------------------------------------------------------------- */
+
+void PairCGCMMCoulDebyeCuda::allocate()
+{
+  // base-class arrays first, if they do not exist yet
+  if (!allocated) {
+    PairCGCMMCoulCut::allocate();
+  }
+
+  const int ntypes = atom->ntypes;
+
+  // one-time part: create cg_type_double and store the host array
+  // pointers in the CUDA shared data
+  if (!allocated2) {
+    allocated2 = true;
+
+    memory->create(cg_type_double,ntypes+1,ntypes+1,"paircg:cgtypedouble");
+
+    cuda->shared_data.pair.cut          = cut_lj;
+    cuda->shared_data.pair.cut_coul     = cut_coul;
+    cuda->shared_data.pair.coeff1       = lj1;
+    cuda->shared_data.pair.coeff2       = lj2;
+    cuda->shared_data.pair.coeff3       = lj3;
+    cuda->shared_data.pair.coeff4       = lj4;
+    cuda->shared_data.pair.coeff5       = cg_type_double;
+    cuda->shared_data.pair.offset       = offset;
+    cuda->shared_data.pair.special_lj   = force->special_lj;
+    cuda->shared_data.pair.special_coul = force->special_coul;
+  }
+
+  // every call: copy cg_type[row][col] for col >= row into both
+  // [row][col] and [col][row] of cg_type_double
+  for (int row = 1; row <= ntypes; ++row) {
+    for (int col = row; col <= ntypes; ++col) {
+      cg_type_double[row][col] = cg_type[row][col];
+      cg_type_double[col][row] = cg_type[row][col];
+    }
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairCGCMMCoulDebyeCuda::compute(int eflag, int vflag)
+{
+  // energy/virial setup is only run when one of the flags is set
+  if (eflag || vflag) {
+    ev_setup(eflag,vflag);
+  }
+
+  // upload the accumulators that were asked for
+  if (eflag) {
+    cuda->cu_eng_vdwl->upload();
+    cuda->cu_eng_coul->upload();
+  }
+  if (vflag) {
+    cuda->cu_virial->upload();
+  }
+
+  Cuda_PairCGCMMCoulDebyeCuda(& cuda->shared_data, & cuda_neigh_list->sneighlist,
+                              eflag, vflag, eflag_atom, vflag_atom);
+
+  // nothing is downloaded here when collect_forces_later is set
+  if (cuda->shared_data.pair.collect_forces_later) {
+    return;
+  }
+
+  if (eflag) {
+    cuda->cu_eng_vdwl->download();
+    cuda->cu_eng_coul->download();
+  }
+  if (vflag) {
+    cuda->cu_virial->download();
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairCGCMMCoulDebyeCuda::settings(int narg, char **arg)
+{
+  // the base class processes the arguments
+  PairCGCMMCoulCut::settings(narg, arg);
+
+  // copy the resulting global values, converted to F_FLOAT, into the
+  // CUDA shared data
+  cuda->shared_data.pair.cut_global      = static_cast<F_FLOAT>(cut_lj_global);
+  cuda->shared_data.pair.cut_coul_global = static_cast<F_FLOAT>(cut_coul_global);
+  cuda->shared_data.pair.kappa           = static_cast<F_FLOAT>(kappa);
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairCGCMMCoulDebyeCuda::coeff(int narg, char **arg)
+{
+  // base class handles the coefficient arguments; allocate() then
+  // refreshes cg_type_double from cg_type
+  PairCGCMMCoulCut::coeff(narg, arg);
+  this->allocate();
+}
+
+/* ----------------------------------------------------------------------
+   request the neighbor list used by the CUDA kernel and copy
+   force->qqrd2e into the CUDA shared data
+------------------------------------------------------------------------- */
+
+void PairCGCMMCoulDebyeCuda::init_style()
+{
+  MYDBG(printf("# CUDA PairCGCMMCoulDebyeCuda::init_style start\n"); )
+
+  // request regular or rRESPA neighbor lists
+  if (update->whichflag == 0 && strcmp(update->integrate_style,"respa") == 0) {
+    // NOTE(review): nothing is requested on this path, so this style
+    // gets no neighbor list here - confirm that this is intended
+  }
+  else
+  {
+    // full (not half) list, flagged as usable by the CUDA code
+    int irequest = neighbor->request(this);
+    neighbor->requests[irequest]->full = 1;
+    neighbor->requests[irequest]->half = 0;
+    neighbor->requests[irequest]->cudable = 1;
+    //neighbor->style=0; //0=NSQ neighboring
+  }
+
+  cuda->shared_data.pppm.qqrd2e=force->qqrd2e;
+  cut_respa=NULL;
+
+  // wording fixed: the message used to read "Pair style uses does not use"
+  if (force->newton) error->warning("Pair style does not use \"newton\" setting. You might test if \"newton off\" makes the simulation run faster.");
+
+  MYDBG(printf("# CUDA PairCGCMMCoulDebyeCuda::init_style end\n"); )
+}
+
+/* ----------------------------------------------------------------------
+   hand the list to the base class and, unless CUDA_USE_BINNING is
+   defined, register the list with id 0 with the Cuda object
+------------------------------------------------------------------------- */
+
+void PairCGCMMCoulDebyeCuda::init_list(int id, NeighList *ptr)
+{
+  MYDBG(printf("# CUDA PairCGCMMCoulDebyeCuda::init_list\n");)
+
+  PairCGCMMCoulCut::init_list(id, ptr);
+
+#ifndef CUDA_USE_BINNING
+  // only the verlet list (id 0) is supported for now, respa lists are not;
+  // Neighbor::init() explains how the lammps lists are organized
+  if (0 == id) {
+    cuda_neigh_list = cuda->registerNeighborList(ptr);
+  }
+#endif
+
+  MYDBG(printf("# CUDA PairCGCMMCoulDebyeCuda::init_list end\n");)
+}
+
+/* ----------------------------------------------------------------------
+   run the base-class energy/virial setup, then re-create the CUDA
+   wrappers of the per-atom energy and virial arrays when atom->nmax
+   exceeds the capacity recorded before the base-class call
+------------------------------------------------------------------------- */
+
+void PairCGCMMCoulDebyeCuda::ev_setup(int eflag, int vflag)
+{
+  // capacities as they were before the base class ran
+  // (presumably the base class grows eatom/vatom and updates them)
+  const int maxeatomold = maxeatom;
+  const int maxvatomold = maxvatom;
+
+  PairCGCMMCoulCut::ev_setup(eflag,vflag);
+
+  // per-atom energy
+  if (eflag_atom && atom->nmax > maxeatomold) {
+    delete cuda->cu_eatom;
+    cuda->cu_eatom = new cCudaData<double, ENERGY_FLOAT, x > ((double*)eatom, & cuda->shared_data.atom.eatom , atom->nmax  );
+  }
+
+  // per-atom virial: test vflag_atom against the vatom capacity and bind
+  // the wrapper to the vatom device array (previously this branch was a
+  // copy of the eatom branch: eflag_atom, maxeatom and atom.eatom)
+  if (vflag_atom && atom->nmax > maxvatomold) {
+    delete cuda->cu_vatom;
+    cuda->cu_vatom = new cCudaData<double, ENERGY_FLOAT, yx > ((double*)vatom, & cuda->shared_data.atom.vatom , atom->nmax, 6  );
+  }
+}
+
+
diff --git a/src/USER-CUDA/pair_cg_cmm_coul_debye_cuda.h b/src/USER-CUDA/pair_cg_cmm_coul_debye_cuda.h
new file mode 100644
index 0000000000..a392125161
--- /dev/null
+++ b/src/USER-CUDA/pair_cg_cmm_coul_debye_cuda.h
@@ -0,0 +1,58 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(cg/cmm/coul/debye/cuda,PairCGCMMCoulDebyeCuda)
+
+#else
+
+#ifndef PAIR_CG_CMM_COUL_DEBYE_CUDA_H
+#define PAIR_CG_CMM_COUL_DEBYE_CUDA_H
+
+#include "pair_cg_cmm_coul_cut.h"
+
+namespace LAMMPS_NS {
+
+// CUDA variant of the cg/cmm/coul/debye pair style; derives from
+// PairCGCMMCoulCut and overrides the entry points listed below.
+class PairCGCMMCoulDebyeCuda : public PairCGCMMCoulCut
+{
+ public:
+  PairCGCMMCoulDebyeCuda(class LAMMPS *lmp);
+
+  void compute(int eflag, int vflag);
+  void settings(int narg, char **arg);
+  void coeff(int narg, char **arg);
+  void init_list(int id, class NeighList *ptr);
+  void init_style();
+  void ev_setup(int eflag, int vflag);
+
+ protected:
+  // Cuda object taken from lmp->cuda in the constructor
+  class Cuda *cuda;
+
+  void allocate();
+
+  // set once allocate() has created cg_type_double and stored the
+  // array pointers in the CUDA shared data
+  bool allocated2;
+
+  // assigned in init_list() from cuda->registerNeighborList()
+  class CudaNeighList *cuda_neigh_list;
+
+  // double-valued copy of cg_type, filled in allocate()
+  double **cg_type_double;
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-CUDA/pair_cg_cmm_coul_long_cuda.cpp b/src/USER-CUDA/pair_cg_cmm_coul_long_cuda.cpp
new file mode 100644
index 0000000000..680daaf1c0
--- /dev/null
+++ b/src/USER-CUDA/pair_cg_cmm_coul_long_cuda.cpp
@@ -0,0 +1,206 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under 
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Paul Crozier (SNL)
+------------------------------------------------------------------------- */
+
+#include <cmath>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include "pair_cg_cmm_coul_long_cuda.h"
+#include "pair_cg_cmm_coul_long_cuda_cu.h"
+#include "cuda_data.h"
+#include "atom.h"
+#include "comm.h"
+#include "force.h"
+#include "kspace.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+#include "neigh_request.h"
+#include "cuda_neigh_list.h"
+#include "update.h"
+#include "integrate.h"
+#include "respa.h"
+#include "memory.h"
+#include "error.h"
+#include "cuda.h"
+
+using namespace LAMMPS_NS;
+
+#define MIN(a,b) ((a) < (b) ? (a) : (b))
+#define MAX(a,b) ((a) > (b) ? (a) : (b))
+
+/* ---------------------------------------------------------------------- */
+
+PairCGCMMCoulLongCuda::PairCGCMMCoulLongCuda(LAMMPS *lmp) : PairCGCMMCoulLong(lmp)
+{
+  // a /cuda style needs the Cuda object created by the cuda acceleration
+  cuda = lmp->cuda;
+  if(cuda == NULL)
+    error->all("You cannot use a /cuda class, without activating 'cuda' acceleration. Use no '-a' command line argument, or '-a cuda'.");
+
+  allocated2 = false;
+  cg_type_double = NULL;
+
+  // init_list() assigns this only for list id 0 (and never when
+  // CUDA_USE_BINNING is defined), while compute() dereferences it;
+  // give it a defined value instead of leaving it uninitialized
+  cuda_neigh_list = NULL;
+
+  cuda->shared_data.pair.cudable_force = 1;
+  cuda->setSystemParams();
+}
+
+/* ----------------------------------------------------------------------
+   remember pointer to arrays in cuda shared data
+------------------------------------------------------------------------- */
+
+void PairCGCMMCoulLongCuda::allocate()
+{
+  // base-class arrays first, if they do not exist yet
+  if (!allocated) {
+    PairCGCMMCoulLong::allocate();
+  }
+
+  const int ntypes = atom->ntypes;
+
+  // one-time part: create cg_type_double and store the host array
+  // pointers in the CUDA shared data
+  if (!allocated2) {
+    allocated2 = true;
+
+    memory->create(cg_type_double,ntypes+1,ntypes+1,"paircg:cgtypedouble");
+
+    cuda->shared_data.pair.cut          = cut_lj;
+    cuda->shared_data.pair.cut_coul     = cut_coul;
+    cuda->shared_data.pair.coeff1       = lj1;
+    cuda->shared_data.pair.coeff2       = lj2;
+    cuda->shared_data.pair.coeff3       = lj3;
+    cuda->shared_data.pair.coeff4       = lj4;
+    cuda->shared_data.pair.coeff5       = cg_type_double;
+    cuda->shared_data.pair.offset       = offset;
+    cuda->shared_data.pair.special_lj   = force->special_lj;
+    cuda->shared_data.pair.special_coul = force->special_coul;
+  }
+
+  // every call: copy cg_type[row][col] for col >= row into both
+  // [row][col] and [col][row] of cg_type_double
+  for (int row = 1; row <= ntypes; ++row) {
+    for (int col = row; col <= ntypes; ++col) {
+      cg_type_double[row][col] = cg_type[row][col];
+      cg_type_double[col][row] = cg_type[row][col];
+    }
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairCGCMMCoulLongCuda::compute(int eflag, int vflag)
+{
+  // energy/virial setup is only run when one of the flags is set
+  if (eflag || vflag) {
+    ev_setup(eflag,vflag);
+  }
+
+  // upload the accumulators that were asked for
+  if (eflag) {
+    cuda->cu_eng_vdwl->upload();
+    cuda->cu_eng_coul->upload();
+  }
+  if (vflag) {
+    cuda->cu_virial->upload();
+  }
+
+  Cuda_PairCGCMMCoulLongCuda(& cuda->shared_data, & cuda_neigh_list->sneighlist,
+                             eflag, vflag, eflag_atom, vflag_atom);
+
+  // nothing is downloaded here when collect_forces_later is set
+  if (cuda->shared_data.pair.collect_forces_later) {
+    return;
+  }
+
+  if (eflag) {
+    cuda->cu_eng_vdwl->download();
+    cuda->cu_eng_coul->download();
+  }
+  if (vflag) {
+    cuda->cu_virial->download();
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairCGCMMCoulLongCuda::settings(int narg, char **arg)
+{
+  // the base class processes the arguments
+  PairCGCMMCoulLong::settings(narg, arg);
+
+  // copy the resulting global values, converted to F_FLOAT, into the
+  // CUDA shared data
+  cuda->shared_data.pair.cut_global      = static_cast<F_FLOAT>(cut_lj_global);
+  cuda->shared_data.pair.cut_coul_global = static_cast<F_FLOAT>(cut_coul_global);
+  cuda->shared_data.pair.kappa           = static_cast<F_FLOAT>(kappa);
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairCGCMMCoulLongCuda::coeff(int narg, char **arg)
+{
+  // base class handles the coefficient arguments; allocate() then
+  // refreshes cg_type_double from cg_type
+  PairCGCMMCoulLong::coeff(narg, arg);
+  this->allocate();
+}
+
+/* ----------------------------------------------------------------------
+   request the neighbor list used by the CUDA kernel and copy g_ewald
+   and force->qqrd2e into the CUDA shared data
+------------------------------------------------------------------------- */
+
+void PairCGCMMCoulLongCuda::init_style()
+{
+  MYDBG(printf("# CUDA PairCGCMMCoulLongCuda::init_style start\n"); )
+
+  // request regular or rRESPA neighbor lists
+  if (update->whichflag == 0 && strcmp(update->integrate_style,"respa") == 0) {
+    // NOTE(review): nothing is requested on this path, so this style
+    // gets no neighbor list here - confirm that this is intended
+  }
+  else
+  {
+    // full (not half) list, flagged as usable by the CUDA code
+    int irequest = neighbor->request(this);
+    neighbor->requests[irequest]->full = 1;
+    neighbor->requests[irequest]->half = 0;
+    neighbor->requests[irequest]->cudable = 1;
+    //neighbor->style=0; //0=NSQ neighboring
+  }
+
+  // g_ewald is read from the KSpace object, so one has to be defined;
+  // report that instead of dereferencing a NULL pointer
+  if (force->kspace == NULL)
+    error->all("Pair style is incompatible with KSpace style");
+
+  g_ewald = force->kspace->g_ewald;
+  cuda->shared_data.pair.g_ewald=g_ewald;
+  cuda->shared_data.pppm.qqrd2e=force->qqrd2e;
+  cut_respa=NULL;
+
+  // wording fixed: the message used to read "Pair style uses does not use"
+  if (force->newton) error->warning("Pair style does not use \"newton\" setting. You might test if \"newton off\" makes the simulation run faster.");
+
+  MYDBG(printf("# CUDA PairCGCMMCoulLongCuda::init_style end\n"); )
+}
+
+/* ----------------------------------------------------------------------
+   hand the list to the base class and, unless CUDA_USE_BINNING is
+   defined, register the list with id 0 with the Cuda object
+------------------------------------------------------------------------- */
+
+void PairCGCMMCoulLongCuda::init_list(int id, NeighList *ptr)
+{
+  MYDBG(printf("# CUDA PairCGCMMCoulLongCuda::init_list\n");)
+
+  PairCGCMMCoulLong::init_list(id, ptr);
+
+#ifndef CUDA_USE_BINNING
+  // only the verlet list (id 0) is supported for now, respa lists are not;
+  // Neighbor::init() explains how the lammps lists are organized
+  if (0 == id) {
+    cuda_neigh_list = cuda->registerNeighborList(ptr);
+  }
+#endif
+
+  MYDBG(printf("# CUDA PairCGCMMCoulLongCuda::init_list end\n");)
+}
+
+/* ----------------------------------------------------------------------
+   run the base-class energy/virial setup, then re-create the CUDA
+   wrappers of the per-atom energy and virial arrays when atom->nmax
+   exceeds the capacity recorded before the base-class call
+------------------------------------------------------------------------- */
+
+void PairCGCMMCoulLongCuda::ev_setup(int eflag, int vflag)
+{
+  // capacities as they were before the base class ran
+  // (presumably the base class grows eatom/vatom and updates them)
+  const int maxeatomold = maxeatom;
+  const int maxvatomold = maxvatom;
+
+  PairCGCMMCoulLong::ev_setup(eflag,vflag);
+
+  // per-atom energy
+  if (eflag_atom && atom->nmax > maxeatomold) {
+    delete cuda->cu_eatom;
+    cuda->cu_eatom = new cCudaData<double, ENERGY_FLOAT, x > ((double*)eatom, & cuda->shared_data.atom.eatom , atom->nmax  );
+  }
+
+  // per-atom virial: test vflag_atom against the vatom capacity and bind
+  // the wrapper to the vatom device array (previously this branch was a
+  // copy of the eatom branch: eflag_atom, maxeatom and atom.eatom)
+  if (vflag_atom && atom->nmax > maxvatomold) {
+    delete cuda->cu_vatom;
+    cuda->cu_vatom = new cCudaData<double, ENERGY_FLOAT, yx > ((double*)vatom, & cuda->shared_data.atom.vatom , atom->nmax, 6  );
+  }
+}
+
+
diff --git a/src/USER-CUDA/pair_cg_cmm_coul_long_cuda.h b/src/USER-CUDA/pair_cg_cmm_coul_long_cuda.h
new file mode 100644
index 0000000000..cad37fc8ca
--- /dev/null
+++ b/src/USER-CUDA/pair_cg_cmm_coul_long_cuda.h
@@ -0,0 +1,58 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(cg/cmm/coul/long/cuda,PairCGCMMCoulLongCuda)
+
+#else
+
+#ifndef PAIR_CG_CMM_COUL_LONG_CUDA_H
+#define PAIR_CG_CMM_COUL_LONG_CUDA_H
+
+#include "pair_cg_cmm_coul_long.h"
+
+namespace LAMMPS_NS {
+
+// CUDA variant of the cg/cmm/coul/long pair style; overrides the entry
+// points of PairCGCMMCoulLong listed below.
+class PairCGCMMCoulLongCuda : public PairCGCMMCoulLong
+{
+ public:
+  PairCGCMMCoulLongCuda(class LAMMPS *lmp);
+
+  void compute(int eflag, int vflag);
+  void settings(int narg, char **arg);
+  void coeff(int narg, char **arg);
+  void init_list(int id, class NeighList *ptr);
+  void init_style();
+  void ev_setup(int eflag, int vflag);
+
+ protected:
+  // Cuda object taken from lmp->cuda in the constructor
+  class Cuda *cuda;
+
+  void allocate();
+
+  // set once allocate() has created cg_type_double and stored the
+  // array pointers in the CUDA shared data
+  bool allocated2;
+
+  // assigned in init_list() from cuda->registerNeighborList()
+  class CudaNeighList *cuda_neigh_list;
+
+  // double-valued copy of cg_type, filled in allocate()
+  double **cg_type_double;
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-CUDA/pair_cg_cmm_cuda.cpp b/src/USER-CUDA/pair_cg_cmm_cuda.cpp
new file mode 100644
index 0000000000..faaf190b7a
--- /dev/null
+++ b/src/USER-CUDA/pair_cg_cmm_cuda.cpp
@@ -0,0 +1,201 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under 
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Paul Crozier (SNL)
+------------------------------------------------------------------------- */
+
+#include <cmath>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include "pair_cg_cmm_cuda.h"
+#include "pair_cg_cmm_cuda_cu.h"
+#include "cuda_data.h"
+#include "atom.h"
+#include "comm.h"
+#include "force.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+#include "neigh_request.h"
+#include "cuda_neigh_list.h"
+#include "update.h"
+#include "integrate.h"
+#include "respa.h"
+#include "memory.h"
+#include "error.h"
+#include "cuda.h"
+
+using namespace LAMMPS_NS;
+
+#define MIN(a,b) ((a) < (b) ? (a) : (b))
+#define MAX(a,b) ((a) > (b) ? (a) : (b))
+
+/* ---------------------------------------------------------------------- */
+
+PairCGCMMCuda::PairCGCMMCuda(LAMMPS *lmp) : PairCGCMM(lmp)
+{
+  // a /cuda style needs the Cuda object created by the cuda acceleration
+  cuda = lmp->cuda;
+  if(cuda == NULL)
+    error->all("You cannot use a /cuda class, without activating 'cuda' acceleration. Use no '-a' command line argument, or '-a cuda'.");
+
+  allocated2 = false;
+  cg_type_double = NULL;
+
+  // init_list() assigns this only for list id 0 (and never when
+  // CUDA_USE_BINNING is defined), while compute() dereferences it;
+  // give it a defined value instead of leaving it uninitialized
+  cuda_neigh_list = NULL;
+
+  // the code creating these wrappers in allocate() is commented out,
+  // so they would otherwise stay uninitialized
+  cu_lj1_gm = NULL;
+  cu_lj2_gm = NULL;
+  cu_lj3_gm = NULL;
+  cu_lj4_gm = NULL;
+  cu_cg_type_double_gm = NULL;
+
+  cuda->shared_data.pair.cudable_force = 1;
+  cuda->setSystemParams();
+}
+
+/* ----------------------------------------------------------------------
+   remember pointer to arrays in cuda shared data
+------------------------------------------------------------------------- */
+
+void PairCGCMMCuda::allocate()
+{
+  // base-class arrays first, if they do not exist yet
+  if (!allocated) {
+    PairCGCMM::allocate();
+  }
+
+  const int ntypes = atom->ntypes;
+
+  // one-time part: create cg_type_double and store the host array
+  // pointers in the CUDA shared data
+  if (!allocated2) {
+    allocated2 = true;
+
+    memory->create(cg_type_double,ntypes+1,ntypes+1,"paircg:cgtypedouble");
+
+    cuda->shared_data.pair.cut    = cut;
+    cuda->shared_data.pair.coeff1 = lj1;
+    cuda->shared_data.pair.coeff2 = lj2;
+    cuda->shared_data.pair.coeff3 = lj3;
+    cuda->shared_data.pair.coeff4 = lj4;
+    cuda->shared_data.pair.coeff5 = cg_type_double;
+
+    // disabled: creation of the cu_*_gm wrappers for the coefficient tables
+    /*cu_lj1_gm = new cCudaData<double, F_FLOAT, x> ((double*)lj1, &cuda->shared_data.pair.coeff1_gm, (atom->ntypes+1)*(atom->ntypes+1));
+    cu_lj2_gm = new cCudaData<double, F_FLOAT, x> ((double*)lj2, &cuda->shared_data.pair.coeff2_gm, (atom->ntypes+1)*(atom->ntypes+1));
+    cu_lj3_gm = new cCudaData<double, F_FLOAT, x> ((double*)lj3, &cuda->shared_data.pair.coeff3_gm, (atom->ntypes+1)*(atom->ntypes+1));
+    cu_lj4_gm = new cCudaData<double, F_FLOAT, x> ((double*)lj4, &cuda->shared_data.pair.coeff4_gm, (atom->ntypes+1)*(atom->ntypes+1));
+    cu_cg_type_double_gm = new cCudaData<double, F_FLOAT, x> ((double*)cg_type_double, &cuda->shared_data.pair.coeff5_gm, (atom->ntypes+1)*(atom->ntypes+1));*/
+
+    cuda->shared_data.pair.offset     = offset;
+    cuda->shared_data.pair.special_lj = force->special_lj;
+  }
+
+  // every call: copy cg_type[row][col] for col >= row into both
+  // [row][col] and [col][row] of cg_type_double
+  for (int row = 1; row <= ntypes; ++row) {
+    for (int col = row; col <= ntypes; ++col) {
+      cg_type_double[row][col] = cg_type[row][col];
+      cg_type_double[col][row] = cg_type[row][col];
+    }
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairCGCMMCuda::compute(int eflag, int vflag)
+{
+  // energy/virial setup is only run when one of the flags is set
+  if (eflag || vflag) {
+    ev_setup(eflag,vflag);
+  }
+
+  // upload the accumulators that were asked for
+  if (eflag) {
+    cuda->cu_eng_vdwl->upload();
+  }
+  if (vflag) {
+    cuda->cu_virial->upload();
+  }
+
+  Cuda_PairCGCMMCuda(& cuda->shared_data, & cuda_neigh_list->sneighlist,
+                     eflag, vflag, eflag_atom, vflag_atom);
+
+  // nothing is downloaded here when collect_forces_later is set
+  if (cuda->shared_data.pair.collect_forces_later) {
+    return;
+  }
+
+  if (eflag) {
+    cuda->cu_eng_vdwl->download();
+  }
+  if (vflag) {
+    cuda->cu_virial->download();
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairCGCMMCuda::settings(int narg, char **arg)
+{
+  // the base class processes the arguments
+  PairCGCMM::settings(narg, arg);
+
+  // copy the resulting global cutoff, converted to F_FLOAT, into the
+  // CUDA shared data
+  cuda->shared_data.pair.cut_global = static_cast<F_FLOAT>(cut_lj_global);
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairCGCMMCuda::coeff(int narg, char **arg)
+{
+  // base class handles the coefficient arguments; allocate() then
+  // refreshes cg_type_double from cg_type
+  PairCGCMM::coeff(narg, arg);
+  this->allocate();
+}
+
+/* ----------------------------------------------------------------------
+   request the neighbor list used by the CUDA kernel
+------------------------------------------------------------------------- */
+
+void PairCGCMMCuda::init_style()
+{
+  MYDBG(printf("# CUDA PairCGCMMCuda::init_style start\n"); )
+
+  // request regular or rRESPA neighbor lists:
+  // no request is made when whichflag is 0 and the integrate style is
+  // "respa"; otherwise a full, cudable list is requested
+  const bool skip_request =
+    (update->whichflag == 0 && strcmp(update->integrate_style,"respa") == 0);
+
+  if (!skip_request) {
+    const int irequest = neighbor->request(this);
+    neighbor->requests[irequest]->full = 1;
+    neighbor->requests[irequest]->half = 0;
+    neighbor->requests[irequest]->cudable = 1;
+    //neighbor->style=0; //0=NSQ neighboring
+  }
+
+  cut_respa = NULL;
+
+  MYDBG(printf("# CUDA PairCGCMMCuda::init_style end\n"); )
+}
+
+/* ----------------------------------------------------------------------
+   hand the list to the base class and, unless CUDA_USE_BINNING is
+   defined, register the list with id 0 with the Cuda object
+------------------------------------------------------------------------- */
+
+void PairCGCMMCuda::init_list(int id, NeighList *ptr)
+{
+  MYDBG(printf("# CUDA PairCGCMMCuda::init_list\n");)
+
+  PairCGCMM::init_list(id, ptr);
+
+#ifndef CUDA_USE_BINNING
+  // only the verlet list (id 0) is supported for now, respa lists are not;
+  // Neighbor::init() explains how the lammps lists are organized
+  if (0 == id) {
+    cuda_neigh_list = cuda->registerNeighborList(ptr);
+  }
+#endif
+
+  MYDBG(printf("# CUDA PairCGCMMCuda::init_list end\n");)
+}
+
+/* ----------------------------------------------------------------------
+   run the base-class energy/virial setup, then re-create the CUDA
+   wrappers of the per-atom energy and virial arrays when atom->nmax
+   exceeds the capacity recorded before the base-class call
+------------------------------------------------------------------------- */
+
+void PairCGCMMCuda::ev_setup(int eflag, int vflag)
+{
+  // capacities as they were before the base class ran
+  // (presumably the base class grows eatom/vatom and updates them)
+  const int maxeatomold = maxeatom;
+  const int maxvatomold = maxvatom;
+
+  PairCGCMM::ev_setup(eflag,vflag);
+
+  // per-atom energy
+  if (eflag_atom && atom->nmax > maxeatomold) {
+    delete cuda->cu_eatom;
+    cuda->cu_eatom = new cCudaData<double, ENERGY_FLOAT, x > ((double*)eatom, & cuda->shared_data.atom.eatom , atom->nmax  );
+  }
+
+  // per-atom virial: test vflag_atom against the vatom capacity and bind
+  // the wrapper to the vatom device array (previously this branch was a
+  // copy of the eatom branch: eflag_atom, maxeatom and atom.eatom)
+  if (vflag_atom && atom->nmax > maxvatomold) {
+    delete cuda->cu_vatom;
+    cuda->cu_vatom = new cCudaData<double, ENERGY_FLOAT, yx > ((double*)vatom, & cuda->shared_data.atom.vatom , atom->nmax, 6  );
+  }
+}
+
+
diff --git a/src/USER-CUDA/pair_cg_cmm_cuda.h b/src/USER-CUDA/pair_cg_cmm_cuda.h
new file mode 100644
index 0000000000..74236b889f
--- /dev/null
+++ b/src/USER-CUDA/pair_cg_cmm_cuda.h
@@ -0,0 +1,64 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(cg/cmm/cuda,PairCGCMMCuda)
+
+#else
+
+#ifndef PAIR_CG_CMM_CUDA_H
+#define PAIR_CG_CMM_CUDA_H
+
+#include "pair_cg_cmm.h"
+#include "cuda_data.h"
+
+namespace LAMMPS_NS {
+
+// CUDA variant of the cg/cmm pair style; overrides the entry points of
+// PairCGCMM listed below.
+class PairCGCMMCuda : public PairCGCMM
+{
+ public:
+  PairCGCMMCuda(class LAMMPS *lmp);
+
+  void compute(int eflag, int vflag);
+  void settings(int narg, char **arg);
+  void coeff(int narg, char **arg);
+  void init_list(int id, class NeighList *ptr);
+  void init_style();
+  void ev_setup(int eflag, int vflag);
+
+ protected:
+  // Cuda object taken from lmp->cuda in the constructor
+  class Cuda *cuda;
+
+  void allocate();
+
+  // set once allocate() has created cg_type_double and stored the
+  // array pointers in the CUDA shared data
+  bool allocated2;
+
+  // assigned in init_list() from cuda->registerNeighborList()
+  class CudaNeighList *cuda_neigh_list;
+
+  // double-valued copy of cg_type, filled in allocate()
+  double **cg_type_double;
+
+  // wrappers for the coefficient tables; the code that would create
+  // them in allocate() is currently commented out
+  cCudaData<double, F_FLOAT, x> *cu_lj1_gm;
+  cCudaData<double, F_FLOAT, x> *cu_lj2_gm;
+  cCudaData<double, F_FLOAT, x> *cu_lj3_gm;
+  cCudaData<double, F_FLOAT, x> *cu_lj4_gm;
+  cCudaData<double, F_FLOAT, x> *cu_cg_type_double_gm;
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-CUDA/pair_eam_alloy_cuda.cpp b/src/USER-CUDA/pair_eam_alloy_cuda.cpp
new file mode 100644
index 0000000000..238c7520d9
--- /dev/null
+++ b/src/USER-CUDA/pair_eam_alloy_cuda.cpp
@@ -0,0 +1,326 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under 
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing authors: Stephen Foiles (SNL), Murray Daw (SNL)
+------------------------------------------------------------------------- */
+
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include "pair_eam_alloy_cuda.h"
+#include "atom.h"
+#include "comm.h"
+#include "memory.h"
+#include "error.h"
+
+using namespace LAMMPS_NS;
+
+#define MAXLINE 1024
+
+/* ---------------------------------------------------------------------- */
+
+PairEAMAlloyCuda::PairEAMAlloyCuda(LAMMPS *lmp) : PairEAMCuda(lmp)
+{
+  // a /cuda pair style cannot work when the LAMMPS instance holds no Cuda object
+  cuda = lmp->cuda;
+  if (!cuda)
+    error->all("You cannot use a /cuda class, without activating 'cuda' acceleration. Use no '-a' command line argument, or '-a cuda'.");
+  one_coeff = 1;
+}
+
+/* ----------------------------------------------------------------------
+   set coeffs for one or more type pairs
+   read DYNAMO setfl file
+------------------------------------------------------------------------- */
+
+void PairEAMAlloyCuda::coeff(int narg, char **arg)
+{
+  if (!allocated) allocate();
+
+  // expected arguments: * * <setfl file> <element name or NULL for each atom type>
+
+  const int ntypes = atom->ntypes;
+  if (narg != 3 + ntypes)
+    error->all("Incorrect args for pair coefficients");
+
+  // the I,J arguments must both be the wildcard *
+
+  if (!(strcmp(arg[0],"*") == 0 && strcmp(arg[1],"*") == 0))
+    error->all("Incorrect args for pair coefficients");
+
+  // drop any setfl data left from an earlier call, then read the new file
+
+  if (setfl != NULL) {
+    for (int e = 0; e < setfl->nelements; e++) delete [] setfl->elements[e];
+    delete [] setfl->elements;
+    delete [] setfl->mass;
+    memory->destroy(setfl->frho);
+    memory->destroy(setfl->rhor);
+    memory->destroy(setfl->z2r);
+    delete setfl;
+  }
+  setfl = new Setfl();
+  read_file(arg[2]);
+
+  // arg[3] ... arg[narg-1] name the element of atom types 1 ... ntypes
+  // map[itype] = index of that element in the potential file, -1 if NULL
+
+  for (int iarg = 3; iarg < narg; iarg++) {
+    const int itype = iarg - 2;
+    if (strcmp(arg[iarg],"NULL") == 0) {
+      map[itype] = -1;
+      continue;
+    }
+    int ielem = 0;
+    while (ielem < setfl->nelements &&
+           strcmp(arg[iarg],setfl->elements[ielem]) != 0) ielem++;
+    if (ielem == setfl->nelements)
+      error->all("No matching element in EAM potential file");
+    else map[itype] = ielem;
+  }
+
+  // rebuild setflag for all pairs with itype <= jtype:
+  //   1 when both types are mapped to an element, 0 otherwise
+  // a mapped type also takes its mass from the potential file
+
+  int nset = 0;
+  for (int itype = 1; itype <= ntypes; itype++) {
+    for (int jtype = itype; jtype <= ntypes; jtype++) {
+      if (map[itype] < 0 || map[jtype] < 0) {
+        setflag[itype][jtype] = 0;
+        continue;
+      }
+      setflag[itype][jtype] = 1;
+      if (itype == jtype) atom->set_mass(itype,setfl->mass[map[itype]]);
+      nset++;
+    }
+  }
+
+  if (nset == 0) error->all("Incorrect args for pair coefficients");
+}
+
+/* ----------------------------------------------------------------------
+   read a multi-element DYNAMO setfl file
+------------------------------------------------------------------------- */
+
+void PairEAMAlloyCuda::read_file(char *filename)
+{
+  Setfl *file = setfl;
+
+  // filename is opened on rank 0 only; every value read is then MPI_Bcast to all ranks
+
+  int me = comm->me;
+  FILE *fptr = NULL;
+  char line[MAXLINE];
+
+  if (me == 0) {
+    fptr = fopen(filename,"r");
+    if (fptr == NULL) {
+      char str[128];
+      // bounded write: a long path must not overflow the message buffer
+      snprintf(str,sizeof(str),"Cannot open EAM potential file %s",filename);
+      error->one(str);
+    }
+  }
+
+  // header: four lines are read into the same buffer, so only the 4th survives
+  // that line holds the element count followed by the element names
+
+  int n;
+  if (me == 0) {
+    for (int k = 0; k < 4; k++)
+      if (fgets(line,MAXLINE,fptr) == NULL)
+        error->one("Unexpected end of EAM potential file");
+    n = strlen(line) + 1;
+  }
+  MPI_Bcast(&n,1,MPI_INT,0,world);
+  MPI_Bcast(line,n,MPI_CHAR,0,world);
+
+  sscanf(line,"%d",&file->nelements);
+  int nwords = atom->count_words(line);
+  if (nwords != file->nelements + 1)
+    error->all("Incorrect element names in EAM potential file");
+  // tokenize in place; words[] has one extra slot for strtok's terminating NULL
+  char **words = new char*[file->nelements+1];
+  nwords = 0;
+  strtok(line," \t\n\r\f");
+  while ((words[nwords++] = strtok(NULL," \t\n\r\f")) != NULL) continue;
+
+  file->elements = new char*[file->nelements];
+  for (int i = 0; i < file->nelements; i++) {
+    n = strlen(words[i]) + 1;
+    file->elements[i] = new char[n];
+    strcpy(file->elements[i],words[i]);
+  }
+  delete [] words;
+  // next line: nrho drho nr dr cut (parsed on rank 0, then broadcast one by one)
+  if (me == 0) {
+    if (fgets(line,MAXLINE,fptr) == NULL) error->one("Unexpected end of EAM potential file");
+    sscanf(line,"%d %lg %d %lg %lg",
+           &file->nrho,&file->drho,&file->nr,&file->dr,&file->cut);
+  }
+
+  MPI_Bcast(&file->nrho,1,MPI_INT,0,world);
+  MPI_Bcast(&file->drho,1,MPI_DOUBLE,0,world);
+  MPI_Bcast(&file->nr,1,MPI_INT,0,world);
+  MPI_Bcast(&file->dr,1,MPI_DOUBLE,0,world);
+  MPI_Bcast(&file->cut,1,MPI_DOUBLE,0,world);
+  // tables are filled starting at index 1, hence the +1 in the last dimension
+  file->mass = new double[file->nelements];
+  memory->create(file->frho,file->nelements,file->nrho+1,"pair:frho");
+  memory->create(file->rhor,file->nelements,file->nr+1,"pair:rhor");
+  memory->create(file->z2r,file->nelements,file->nelements,file->nr+1,
+                 "pair:z2r");
+  int i,j,tmp;
+  for (i = 0; i < file->nelements; i++) {
+    if (me == 0) {
+      if (fgets(line,MAXLINE,fptr) == NULL) error->one("Unexpected end of EAM potential file");
+      sscanf(line,"%d %lg",&tmp,&file->mass[i]);
+    }
+    MPI_Bcast(&file->mass[i],1,MPI_DOUBLE,0,world);
+
+    if (me == 0) grab(fptr,file->nrho,&file->frho[i][1]);
+    MPI_Bcast(&file->frho[i][1],file->nrho,MPI_DOUBLE,0,world);
+    if (me == 0) grab(fptr,file->nr,&file->rhor[i][1]);
+    MPI_Bcast(&file->rhor[i][1],file->nr,MPI_DOUBLE,0,world);
+  }
+
+  for (i = 0; i < file->nelements; i++)
+    for (j = 0; j <= i; j++) {
+      if (me == 0) grab(fptr,file->nr,&file->z2r[i][j][1]);
+      MPI_Bcast(&file->z2r[i][j][1],file->nr,MPI_DOUBLE,0,world);
+    }
+
+  // close the potential file
+
+  if (me == 0) fclose(fptr);
+}
+
+/* ----------------------------------------------------------------------
+   copy read-in setfl potential to standard array format
+------------------------------------------------------------------------- */
+
+void PairEAMAlloyCuda::file2array()
+{
+  const int ntypes = atom->ntypes;
+  const int nelem = setfl->nelements;
+
+  // tabulation parameters are taken unchanged from the setfl data
+
+  nrho = setfl->nrho;
+  drho = setfl->drho;
+  nr = setfl->nr;
+  dr = setfl->dr;
+
+  // ------------------------------------------------------------------
+  // frho tables
+  // ------------------------------------------------------------------
+
+  // (re)allocate the global frho array
+  // nfrho = # of setfl elements + 1 for the all-zero table
+
+  nfrho = nelem + 1;
+  memory->destroy(frho);
+  memory->create(frho,nfrho,nrho+1,"pair:frho");
+
+  // element rows are copied from the setfl data, entries 1 to nrho
+
+  for (int ielem = 0; ielem < nelem; ielem++)
+    for (int k = 1; k <= nrho; k++) frho[ielem][k] = setfl->frho[ielem][k];
+
+  // last row is all zeroes, for non-EAM types to point to (pair hybrid)
+  // this is necessary b/c fp is still computed for non-EAM atoms
+
+  for (int k = 1; k <= nrho; k++) frho[nfrho-1][k] = 0.0;
+
+  // type2frho[itype] = which frho row (0 to nfrho-1) an atom type uses
+  // a mapped type uses the row of its element
+  // an unmapped type (non-EAM atom in pair hybrid) uses the zero row
+
+  for (int itype = 1; itype <= ntypes; itype++)
+    type2frho[itype] =
+      (map[itype] >= 0) ? map[itype] : nfrho-1;
+
+  // ------------------------------------------------------------------
+  // rhor tables
+  // ------------------------------------------------------------------
+
+  // (re)allocate the global rhor array
+  // nrhor = # of setfl elements
+
+  nrhor = nelem;
+  memory->destroy(rhor);
+  memory->create(rhor,nrhor,nr+1,"pair:rhor");
+
+  // element rows are copied from the setfl data, entries 1 to nr
+
+  for (int ielem = 0; ielem < nelem; ielem++)
+    for (int k = 1; k <= nr; k++) rhor[ielem][k] = setfl->rhor[ielem][k];
+
+  // type2rhor[itype][jtype] = which rhor row (0 to nrhor-1) a type pair uses
+  // for setfl files the row depends on the first type only
+  // OK if map = -1 (non-EAM atom in pair hybrid) b/c type2rhor not used
+
+  for (int itype = 1; itype <= ntypes; itype++)
+    for (int jtype = 1; jtype <= ntypes; jtype++)
+      type2rhor[itype][jtype] = map[itype];
+
+  // ------------------------------------------------------------------
+  // z2r tables
+  // ------------------------------------------------------------------
+
+  // (re)allocate the global z2r array
+  // nz2r = N*(N+1)/2 where N = # of setfl elements
+
+  nz2r = nelem * (nelem+1) / 2;
+  memory->destroy(z2r);
+  memory->create(z2r,nz2r,nr+1,"pair:z2r");
+
+  // pack the element pairs with I >= J row after row into the global z2r
+
+  int row = 0;
+  for (int ielem = 0; ielem < nelem; ielem++)
+    for (int jelem = 0; jelem <= ielem; jelem++) {
+      for (int k = 1; k <= nr; k++) z2r[row][k] = setfl->z2r[ielem][jelem][k];
+      row++;
+    }
+
+  // type2z2r[itype][jtype] = which z2r row (0 to nz2r-1) a type pair uses
+  // the z2r rows only fill a lower-triangular Nelement matrix
+  // row index = # of entries in the triangle rows before irow, plus icol
+  // the larger element index is used as irow to stay lower triangular
+  // if map = -1 (non-EAM atom in pair hybrid):
+  //   type2z2r is not used by non-opt
+  //   but set type2z2r to 0 since accessed by opt
+
+  for (int itype = 1; itype <= ntypes; itype++) {
+    for (int jtype = 1; jtype <= ntypes; jtype++) {
+      const int ei = map[itype];
+      const int ej = map[jtype];
+      if (ei == -1 || ej == -1) {
+        type2z2r[itype][jtype] = 0;
+        continue;
+      }
+
+      // order the pair so that irow >= icol
+      const int irow = (ei < ej) ? ej : ei;
+      const int icol = (ei < ej) ? ei : ej;
+
+      // triangle rows 0 ... irow-1 hold 1, 2, ..., irow entries
+      int index = icol;
+      for (int r = 1; r <= irow; r++) index += r;
+      type2z2r[itype][jtype] = index;
+    }
+  }
+}
diff --git a/src/USER-CUDA/pair_eam_alloy_cuda.h b/src/USER-CUDA/pair_eam_alloy_cuda.h
new file mode 100644
index 0000000000..d17d9f5c79
--- /dev/null
+++ b/src/USER-CUDA/pair_eam_alloy_cuda.h
@@ -0,0 +1,44 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under 
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(eam/alloy/cuda,PairEAMAlloyCuda)
+
+#else
+
+#ifndef LMP_PAIR_EAM_CUDA_ALLOY_H
+#define LMP_PAIR_EAM_CUDA_ALLOY_H
+
+#include "pair_eam_cuda.h"
+
+namespace LAMMPS_NS {
+
+// use virtual public since this class is parent in multiple inheritance
+
+class PairEAMAlloyCuda : virtual public PairEAMCuda {
+ public:
+  PairEAMAlloyCuda(class LAMMPS *);
+  virtual ~PairEAMAlloyCuda() {}
+  void coeff(int, char **);  // reads a DYNAMO setfl file and maps atom types to its elements (see the .cpp)
+
+ protected:
+  class Cuda *cuda;  // NOTE(review): PairEAMCuda already declares a protected `cuda`; this member hides it -- confirm the duplicate is intended
+  void read_file(char *);  // loads the setfl file named by the argument into `setfl`
+  void file2array();  // copies the loaded setfl data into the frho/rhor/z2r tables and type2* maps
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-CUDA/pair_eam_cuda.cpp b/src/USER-CUDA/pair_eam_cuda.cpp
new file mode 100644
index 0000000000..0ca7289c6b
--- /dev/null
+++ b/src/USER-CUDA/pair_eam_cuda.cpp
@@ -0,0 +1,239 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under 
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Paul Crozier (SNL)
+------------------------------------------------------------------------- */
+
+#include <cmath>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include "pair_eam_cuda.h"
+#include "pair_eam_cuda_cu.h"
+#include "pair_virial_compute_cu.h"
+#include "cuda_data.h"
+#include "atom.h"
+#include "comm.h"
+#include "force.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+#include "neigh_request.h"
+#include "cuda_neigh_list.h"
+#include "update.h"
+#include "integrate.h"
+#include "respa.h"
+#include "memory.h"
+#include "error.h"
+#include "cuda.h"
+
+using namespace LAMMPS_NS;
+
+#define MIN(a,b) ((a) < (b) ? (a) : (b))
+#define MAX(a,b) ((a) > (b) ? (a) : (b))
+
+/* ---------------------------------------------------------------------- */
+
+PairEAMCuda::PairEAMCuda(LAMMPS *lmp) : PairEAM(lmp)
+{
+  // a /cuda pair style cannot work when the LAMMPS instance holds no Cuda object
+  cuda = lmp->cuda;
+  if (cuda == NULL)
+    error->all("You cannot use a /cuda class, without activating 'cuda' acceleration. Use no '-a' command line argument, or '-a cuda'.");
+
+  allocated2 = false;
+  cuda->shared_data.pair.cudable_force = 1;
+  cuda->shared_data.pair.override_block_per_atom = 0;
+  cuda->setSystemParams();
+
+  // start every handle at NULL: init_style()/compute() delete them, and compute() reads cuda_neigh_list
+  cuda_neigh_list = NULL;
+  cu_rho = cu_fp = NULL;
+  cu_frho_spline = cu_z2r_spline = cu_rhor_spline = NULL;
+}
+
+/* ----------------------------------------------------------------------
+   remember pointer to arrays in cuda shared data
+------------------------------------------------------------------------- */
+
+void PairEAMCuda::allocate()
+{
+  if (allocated == 0) PairEAM::allocate();  // base-class allocation is skipped once `allocated` is set
+  cuda->shared_data.pair.cutsq = cutsq;     // the two assignments below run on every call
+  cuda->shared_data.pair.cut_global = static_cast<F_FLOAT>(cutforcesq);
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairEAMCuda::compute(int eflag, int vflag)
+{
+  cuda->shared_data.pair.cut_global = (F_FLOAT) cutforcesq;
+  cuda->shared_data.pair.use_block_per_atom = 0;
+  cuda->shared_data.pair.collect_forces_later = 0;
+  // grow rho/fp and rebuild their cCudaData wrappers once atom->nmax exceeds nmax
+  if (nmax < atom->nmax) {
+    memory->destroy(rho);
+    memory->destroy(fp);
+    nmax = atom->nmax;
+    memory->create(rho,nmax,"pair:rho");
+    memory->create(fp,nmax,"pair:fp");
+    delete cu_rho;
+    delete cu_fp;
+    cu_rho = new cCudaData<double, F_FLOAT, x> (rho, nmax);
+    cu_fp  = new cCudaData<double, F_FLOAT, x> (fp, nmax);
+    Cuda_PairEAMCuda_Init(&cuda->shared_data,rdr,rdrho,nfrho,nrhor,nr,nrho,nz2r,
+                          cu_frho_spline->dev_data(),cu_rhor_spline->dev_data(),cu_z2r_spline->dev_data(),
+                          cu_rho->dev_data(),cu_fp->dev_data(),type2frho,type2z2r,type2rhor);
+  }
+
+  if (eflag || vflag) ev_setup(eflag,vflag);
+  if (eflag) cuda->cu_eng_vdwl->upload();
+  if (vflag) cuda->cu_virial->upload();
+
+  // first pass, forward communication of this pair style, then second pass
+  Cuda_PairEAM1Cuda(&cuda->shared_data, &cuda_neigh_list->sneighlist, eflag, vflag, eflag_atom, vflag_atom);
+  comm->forward_comm_pair(this);
+  Cuda_PairEAM2Cuda(&cuda->shared_data, &cuda_neigh_list->sneighlist, eflag, vflag, eflag_atom, vflag_atom);
+
+  if (eflag) cuda->cu_eng_vdwl->download();
+  if (vflag) cuda->cu_virial->download();
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairEAMCuda::settings(int narg, char **arg)
+{
+  PairEAM::settings(narg,arg);  // argument handling is left to the base class
+  cuda->shared_data.pair.cut_global = static_cast<F_FLOAT>(cutforcesq);
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairEAMCuda::coeff(int narg, char **arg)
+{
+  PairEAM::coeff(narg,arg);  // the base class processes the arguments
+  this->allocate();          // then cutsq / cut_global are stored in cuda->shared_data again
+}
+
+void PairEAMCuda::init_style()
+{
+  MYDBG(printf("# CUDA PairEAMCuda::init_style start\n"); )
+  // rebuild the tabulated potential arrays and their spline coefficients
+  file2array();
+  array2spline();
+
+  // request a full (not half) neighbor list that is flagged as cudable
+  const int irequest = neighbor->request(this);
+  neighbor->requests[irequest]->full = 1;
+  neighbor->requests[irequest]->half = 0;
+  neighbor->requests[irequest]->cudable = 1;
+
+  // replace the cCudaData wrappers so they refer to the new spline arrays
+  delete cu_rhor_spline;
+  delete cu_z2r_spline;
+  delete cu_frho_spline;
+  cu_rhor_spline = new cCudaData<double, F_FLOAT, xyz>((double*)rhor_spline,nrhor,nr+1,EAM_COEFF_LENGTH);
+  cu_z2r_spline = new cCudaData<double, F_FLOAT, xyz>((double*)z2r_spline,nz2r,nr+1,EAM_COEFF_LENGTH);
+  cu_frho_spline = new cCudaData<double, F_FLOAT, xyz>((double*)frho_spline,nfrho,nrho+1,EAM_COEFF_LENGTH);
+
+  // upload() each wrapper right after it has been rebuilt
+  cu_rhor_spline->upload();
+  cu_z2r_spline->upload();
+  cu_frho_spline->upload();
+
+  MYDBG(printf("# CUDA PairEAMCuda::init_style end\n"); )
+}
+
+void PairEAMCuda::init_list(int id, NeighList *ptr)
+{
+  MYDBG(printf("# CUDA PairEAMCuda::init_list\n");)
+  PairEAM::init_list(id,ptr);
+#ifndef CUDA_USE_BINNING
+  // only list id 0 (verlet) is registered with the Cuda object; respa is not handled yet
+  // (see Neighbor::init() for details on lammps lists' logic)
+  if (0 == id) cuda_neigh_list = cuda->registerNeighborList(ptr);
+#endif
+  MYDBG(printf("# CUDA PairEAMCuda::init_list end\n");)
+}
+
+void PairEAMCuda::array2spline()
+{
+  rdr = 1.0/dr;
+  rdrho = 1.0/drho;
+
+  // 8 values per knot: slot 7 is written below, so the former length of 7 overflowed
+  // NOTE(review): confirm EAM_COEFF_LENGTH (third dimension used in init_style) is also 8
+  const int ncoeff = 8;
+  memory->destroy(frho_spline);
+  memory->destroy(rhor_spline);
+  memory->destroy(z2r_spline);
+  memory->create(frho_spline,nfrho,nrho+1,ncoeff,"pair:frho");
+  memory->create(rhor_spline,nrhor,nr+1,ncoeff,"pair:rhor");
+  memory->create(z2r_spline,nz2r,nr+1,ncoeff,"pair:z2r");
+
+  for (int i = 0; i < nfrho; i++){
+    interpolate(nrho,drho,frho[i],frho_spline[i]);
+    for(int j=0;j<nrho+1;j++)
+      frho_spline[i][j][7]=frho_spline[i][j][3];  // slot 7 duplicates slot 3
+  }
+  for (int i = 0; i < nrhor; i++){
+    interpolate(nr,dr,rhor[i],rhor_spline[i]);
+    for(int j=0;j<nr+1;j++)
+      rhor_spline[i][j][7]=rhor_spline[i][j][3];
+  }
+  for (int i = 0; i < nz2r; i++){
+    interpolate(nr,dr,z2r[i],z2r_spline[i]);
+    for(int j=0;j<nr+1;j++)
+      z2r_spline[i][j][7]=z2r_spline[i][j][3];
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+int PairEAMCuda::pack_comm(int n, int *iswap, double *buf, int pbc_flag, int *pbc)
+{
+  // only *iswap is forwarded; pbc_flag and pbc are not used here
+  Cuda_PairEAMCuda_PackComm(&cuda->shared_data, n, *iswap, buf);
+  return 1;  // same value whether or not F_FLOAT is narrower than double
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairEAMCuda::unpack_comm(int n, int first, double *buf)
+{ // forwards n, first and buf to the CUDA-side routine together with cu_fp->dev_data()
+  Cuda_PairEAMCuda_UnpackComm(&(cuda->shared_data), n, first, buf, cu_fp->dev_data());
+}
+
diff --git a/src/USER-CUDA/pair_eam_cuda.h b/src/USER-CUDA/pair_eam_cuda.h
new file mode 100644
index 0000000000..e560fabd62
--- /dev/null
+++ b/src/USER-CUDA/pair_eam_cuda.h
@@ -0,0 +1,78 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under 
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+#ifdef PAIR_CLASS
+
+PairStyle(eam/cuda,PairEAMCuda)
+
+#else
+
+#ifndef PAIR_EAM_CUDA_H
+#define PAIR_EAM_CUDA_H
+
+#include "cuda_data.h"
+#include "pair_eam.h"
+
+namespace LAMMPS_NS {
+
+class PairEAMCuda : public PairEAM  // CUDA flavour of PairEAM; registered above as pair style eam/cuda
+{
+	public:
+		PairEAMCuda(class LAMMPS *);  // requires lmp->cuda to be non-NULL
+		void compute(int, int);  // arguments are (eflag, vflag) in the .cpp
+		void settings(int, char **);  // calls PairEAM::settings, then stores cutforcesq as cut_global
+		void coeff(int, char **);  // calls PairEAM::coeff, then allocate()
+		void init_list(int, class NeighList *);  // registers list id 0 with the Cuda object unless CUDA_USE_BINNING is defined
+		void init_style();  // rebuilds tables and splines, requests a full cudable neighbor list
+		void array2spline();  // builds frho/rhor/z2r spline arrays via interpolate()
+		int pack_comm(int n, int *iswap, double *buf, int pbc_flag, int *pbc);  // always returns 1
+		void unpack_comm(int n, int first, double *buf);
+	protected:
+		class Cuda *cuda;  // copy of lmp->cuda taken in the constructor
+		void allocate();  // PairEAM::allocate() on first use, then stores cutsq / cut_global in cuda->shared_data
+		bool allocated2;  // set to false in the constructor; not read in pair_eam_cuda.cpp
+		class CudaNeighList* cuda_neigh_list;  // assigned only in init_list(); dereferenced in compute()
+		cCudaData<double, F_FLOAT, x>* cu_rho;  // wraps rho; recreated in compute() when atom->nmax grows
+		cCudaData<double, F_FLOAT, x>* cu_fp;  // wraps fp; recreated in compute() when atom->nmax grows
+	    cCudaData<double, F_FLOAT, xyz>* cu_rhor_spline;  // wraps rhor_spline; recreated in init_style()
+	    cCudaData<double, F_FLOAT, xyz>* cu_z2r_spline;  // wraps z2r_spline; recreated in init_style()
+	    cCudaData<double, F_FLOAT, xyz>* cu_frho_spline;  // wraps frho_spline; recreated in init_style()
+	// NOTE(review): no destructor is declared, so the cu_* objects above are never deleted -- confirm whether that is intended
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-CUDA/pair_eam_fs_cuda.cpp b/src/USER-CUDA/pair_eam_fs_cuda.cpp
new file mode 100644
index 0000000000..56219d4e31
--- /dev/null
+++ b/src/USER-CUDA/pair_eam_fs_cuda.cpp
@@ -0,0 +1,335 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under 
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing authors: Tim Lau (MIT)
+------------------------------------------------------------------------- */
+
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include "pair_eam_fs_cuda.h"
+#include "atom.h"
+#include "comm.h"
+#include "memory.h"
+#include "error.h"
+
+using namespace LAMMPS_NS;
+
+#define MAXLINE 1024
+
+/* ---------------------------------------------------------------------- */
+
+PairEAMFSCuda::PairEAMFSCuda(LAMMPS *lmp) : PairEAMCuda(lmp)
+{
+  // a /cuda pair style cannot work when the LAMMPS instance holds no Cuda object
+  cuda = lmp->cuda;
+  if (!cuda)
+    error->all("You cannot use a /cuda class, without activating 'cuda' acceleration. Use no '-a' command line argument, or '-a cuda'.");
+  one_coeff = 1;
+}
+
+/* ----------------------------------------------------------------------
+   set coeffs for one or more type pairs
+   read EAM Finnis-Sinclair file
+------------------------------------------------------------------------- */
+
+void PairEAMFSCuda::coeff(int narg, char **arg)
+{
+  if (!allocated) allocate();
+
+  // expected arguments: * * <fs file> <element name or NULL for each atom type>
+
+  const int ntypes = atom->ntypes;
+  if (narg != 3 + ntypes)
+    error->all("Incorrect args for pair coefficients");
+
+  // the I,J arguments must both be the wildcard *
+
+  if (!(strcmp(arg[0],"*") == 0 && strcmp(arg[1],"*") == 0))
+    error->all("Incorrect args for pair coefficients");
+
+  // drop any Finnis-Sinclair data left from an earlier call, then read the new file
+
+  if (fs != NULL) {
+    for (int e = 0; e < fs->nelements; e++) delete [] fs->elements[e];
+    delete [] fs->elements;
+    delete [] fs->mass;
+    memory->destroy(fs->frho);
+    memory->destroy(fs->rhor);
+    memory->destroy(fs->z2r);
+    delete fs;
+  }
+  fs = new Fs();
+  read_file(arg[2]);
+
+  // arg[3] ... arg[narg-1] name the element of atom types 1 ... ntypes
+  // map[itype] = index of that element in the potential file, -1 if NULL
+
+  for (int iarg = 3; iarg < narg; iarg++) {
+    const int itype = iarg - 2;
+    if (strcmp(arg[iarg],"NULL") == 0) {
+      map[itype] = -1;
+      continue;
+    }
+    int ielem = 0;
+    while (ielem < fs->nelements &&
+           strcmp(arg[iarg],fs->elements[ielem]) != 0) ielem++;
+    if (ielem == fs->nelements)
+      error->all("No matching element in EAM potential file");
+    else map[itype] = ielem;
+  }
+
+  // rebuild setflag for all pairs with itype <= jtype:
+  //   1 when both types are mapped to an element, 0 otherwise
+  // a mapped type also takes its mass from the potential file
+
+  int nset = 0;
+  for (int itype = 1; itype <= ntypes; itype++) {
+    for (int jtype = itype; jtype <= ntypes; jtype++) {
+      if (map[itype] < 0 || map[jtype] < 0) {
+        setflag[itype][jtype] = 0;
+        continue;
+      }
+      setflag[itype][jtype] = 1;
+      if (itype == jtype) atom->set_mass(itype,fs->mass[map[itype]]);
+      nset++;
+    }
+  }
+
+  if (nset == 0) error->all("Incorrect args for pair coefficients");
+}
+
+/* ----------------------------------------------------------------------
+   read a multi-element EAM Finnis-Sinclair (fs) potential file
+------------------------------------------------------------------------- */
+
+void PairEAMFSCuda::read_file(char *filename)
+{
+  Fs *file = fs;
+
+  // filename is opened on rank 0 only; every value read is then MPI_Bcast to all ranks
+
+  int me = comm->me;
+  FILE *fptr = NULL;
+  char line[MAXLINE];
+
+  if (me == 0) {
+    fptr = fopen(filename,"r");
+    if (fptr == NULL) {
+      char str[128];
+      // bounded write: a long path must not overflow the message buffer
+      snprintf(str,sizeof(str),"Cannot open EAM potential file %s",filename);
+      error->one(str);
+    }
+  }
+
+  // header: four lines are read into the same buffer, so only the 4th survives
+  // that line holds the element count followed by the element names
+
+  int n;
+  if (me == 0) {
+    for (int k = 0; k < 4; k++)
+      if (fgets(line,MAXLINE,fptr) == NULL)
+        error->one("Unexpected end of EAM potential file");
+    n = strlen(line) + 1;
+  }
+  MPI_Bcast(&n,1,MPI_INT,0,world);
+  MPI_Bcast(line,n,MPI_CHAR,0,world);
+
+  sscanf(line,"%d",&file->nelements);
+  int nwords = atom->count_words(line);
+  if (nwords != file->nelements + 1)
+    error->all("Incorrect element names in EAM potential file");
+  // tokenize in place; words[] has one extra slot for strtok's terminating NULL
+  char **words = new char*[file->nelements+1];
+  nwords = 0;
+  strtok(line," \t\n\r\f");
+  while ((words[nwords++] = strtok(NULL," \t\n\r\f")) != NULL) continue;
+
+  file->elements = new char*[file->nelements];
+  for (int i = 0; i < file->nelements; i++) {
+    n = strlen(words[i]) + 1;
+    file->elements[i] = new char[n];
+    strcpy(file->elements[i],words[i]);
+  }
+  delete [] words;
+  // next line: nrho drho nr dr cut (parsed on rank 0, then broadcast one by one)
+  if (me == 0) {
+    if (fgets(line,MAXLINE,fptr) == NULL) error->one("Unexpected end of EAM potential file");
+    sscanf(line,"%d %lg %d %lg %lg",
+           &file->nrho,&file->drho,&file->nr,&file->dr,&file->cut);
+  }
+
+  MPI_Bcast(&file->nrho,1,MPI_INT,0,world);
+  MPI_Bcast(&file->drho,1,MPI_DOUBLE,0,world);
+  MPI_Bcast(&file->nr,1,MPI_INT,0,world);
+  MPI_Bcast(&file->dr,1,MPI_DOUBLE,0,world);
+  MPI_Bcast(&file->cut,1,MPI_DOUBLE,0,world);
+  // tables are filled starting at index 1, hence the +1 in the last dimension
+  file->mass = new double[file->nelements];
+  memory->create(file->frho,file->nelements,file->nrho+1,
+                 "pair:frho");
+  memory->create(file->rhor,file->nelements,file->nelements,
+                 file->nr+1,"pair:rhor");
+  memory->create(file->z2r,file->nelements,file->nelements,
+                 file->nr+1,"pair:z2r");
+  int i,j,tmp;
+  for (i = 0; i < file->nelements; i++) {
+    if (me == 0) {
+      if (fgets(line,MAXLINE,fptr) == NULL) error->one("Unexpected end of EAM potential file");
+      sscanf(line,"%d %lg",&tmp,&file->mass[i]);
+    }
+    MPI_Bcast(&file->mass[i],1,MPI_DOUBLE,0,world);
+
+    if (me == 0) grab(fptr,file->nrho,&file->frho[i][1]);
+    MPI_Bcast(&file->frho[i][1],file->nrho,MPI_DOUBLE,0,world);
+
+    for (j = 0; j < file->nelements; j++) {
+      if (me == 0) grab(fptr,file->nr,&file->rhor[i][j][1]);
+      MPI_Bcast(&file->rhor[i][j][1],file->nr,MPI_DOUBLE,0,world);
+    }
+  }
+
+  for (i = 0; i < file->nelements; i++)
+    for (j = 0; j <= i; j++) {
+      if (me == 0) grab(fptr,file->nr,&file->z2r[i][j][1]);
+      MPI_Bcast(&file->z2r[i][j][1],file->nr,MPI_DOUBLE,0,world);
+    }
+
+  // close the potential file
+
+  if (me == 0) fclose(fptr);
+}
+
+/* ----------------------------------------------------------------------
+   copy read-in fs potential to standard array format
+------------------------------------------------------------------------- */
+
+void PairEAMFSCuda::file2array()
+{
+  const int ntypes = atom->ntypes;
+  const int nelem = fs->nelements;
+
+  // tabulation parameters are taken unchanged from the fs data
+
+  nrho = fs->nrho;
+  drho = fs->drho;
+  nr = fs->nr;
+  dr = fs->dr;
+
+  // ------------------------------------------------------------------
+  // frho tables
+  // ------------------------------------------------------------------
+
+  // (re)allocate the global frho array
+  // nfrho = # of fs elements + 1 for the all-zero table
+
+  nfrho = nelem + 1;
+  memory->destroy(frho);
+  memory->create(frho,nfrho,nrho+1,"pair:frho");
+
+  // element rows are copied from the fs data, entries 1 to nrho
+
+  for (int ielem = 0; ielem < nelem; ielem++)
+    for (int k = 1; k <= nrho; k++) frho[ielem][k] = fs->frho[ielem][k];
+
+  // last row is all zeroes, for non-EAM types to point to (pair hybrid)
+  // this is necessary b/c fp is still computed for non-EAM atoms
+
+  for (int k = 1; k <= nrho; k++) frho[nfrho-1][k] = 0.0;
+
+  // type2frho[itype] = which frho row (0 to nfrho-1) an atom type uses
+  // a mapped type uses the row of its element
+  // an unmapped type (non-EAM atom in pair hybrid) uses the zero row
+
+  for (int itype = 1; itype <= ntypes; itype++)
+    type2frho[itype] =
+      (map[itype] >= 0) ? map[itype] : nfrho-1;
+
+  // ------------------------------------------------------------------
+  // rhor tables
+  // ------------------------------------------------------------------
+
+  // (re)allocate the global rhor array
+  // nrhor = square of # of fs elements
+
+  nrhor = nelem * nelem;
+  memory->destroy(rhor);
+  memory->create(rhor,nrhor,nr+1,"pair:rhor");
+
+  // flatten the full NxN set of element pair tables row after row
+
+  int row = 0;
+  for (int ielem = 0; ielem < nelem; ielem++)
+    for (int jelem = 0; jelem < nelem; jelem++) {
+      for (int k = 1; k <= nr; k++) rhor[row][k] = fs->rhor[ielem][jelem][k];
+      row++;
+    }
+
+  // type2rhor[itype][jtype] = which rhor row (0 to nrhor-1) a type pair uses
+  // for fs files, there is a full NxN set of rhor arrays
+  // OK if map = -1 (non-EAM atom in pair hybrid) b/c type2rhor not used
+
+  for (int itype = 1; itype <= ntypes; itype++)
+    for (int jtype = 1; jtype <= ntypes; jtype++)
+      type2rhor[itype][jtype] = map[itype] * nelem + map[jtype];
+
+  // ------------------------------------------------------------------
+  // z2r tables
+  // ------------------------------------------------------------------
+
+  // (re)allocate the global z2r array
+  // nz2r = N*(N+1)/2 where N = # of fs elements
+
+  nz2r = nelem * (nelem+1) / 2;
+  memory->destroy(z2r);
+  memory->create(z2r,nz2r,nr+1,"pair:z2r");
+
+  // pack the element pairs with I >= J row after row into the global z2r
+
+  row = 0;
+  for (int ielem = 0; ielem < nelem; ielem++)
+    for (int jelem = 0; jelem <= ielem; jelem++) {
+      for (int k = 1; k <= nr; k++) z2r[row][k] = fs->z2r[ielem][jelem][k];
+      row++;
+    }
+
+  // type2z2r[itype][jtype] = which z2r row (0 to nz2r-1) a type pair uses
+  // the z2r rows only fill a lower-triangular Nelement matrix
+  // row index = # of entries in the triangle rows before irow, plus icol
+  // the larger element index is used as irow to stay lower triangular
+  // if map = -1 (non-EAM atom in pair hybrid):
+  //   type2z2r is not used by non-opt
+  //   but set type2z2r to 0 since accessed by opt
+
+  for (int itype = 1; itype <= ntypes; itype++) {
+    for (int jtype = 1; jtype <= ntypes; jtype++) {
+      const int ei = map[itype];
+      const int ej = map[jtype];
+      if (ei == -1 || ej == -1) {
+        type2z2r[itype][jtype] = 0;
+        continue;
+      }
+
+      // order the pair so that irow >= icol
+      const int irow = (ei < ej) ? ej : ei;
+      const int icol = (ei < ej) ? ei : ej;
+
+      // triangle rows 0 ... irow-1 hold 1, 2, ..., irow entries
+      int index = icol;
+      for (int r = 1; r <= irow; r++) index += r;
+      type2z2r[itype][jtype] = index;
+    }
+  }
+}
diff --git a/src/USER-CUDA/pair_eam_fs_cuda.h b/src/USER-CUDA/pair_eam_fs_cuda.h
new file mode 100644
index 0000000000..c2d4a5504d
--- /dev/null
+++ b/src/USER-CUDA/pair_eam_fs_cuda.h
@@ -0,0 +1,44 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under 
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(eam/fs/cuda,PairEAMFSCuda)
+
+#else
+
+#ifndef LMP_PAIR_EAM_FS_CUDA_H
+#define LMP_PAIR_EAM_FS_CUDA_H
+
+#include "pair_eam_cuda.h"
+
+namespace LAMMPS_NS {
+
+// use virtual public since this class is parent in multiple inheritance
+// class behind pair style eam/fs/cuda; only declarations are visible in this header
+class PairEAMFSCuda : virtual public PairEAMCuda {
+ public:
+  PairEAMFSCuda(class LAMMPS *);
+  virtual ~PairEAMFSCuda() {}
+  void coeff(int, char **);
+
+ protected:
+  class Cuda *cuda;          // NOTE(review): PairEAMCuda may already carry a Cuda pointer - confirm this is not an unintended shadow
+  void read_file(char *);    // presumably parses the fs potential file named by its argument - confirm
+  void file2array();         // presumably fills the rhor/z2r tables and type2rhor/type2z2r maps - confirm
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-CUDA/pair_gran_hooke_cuda.cpp b/src/USER-CUDA/pair_gran_hooke_cuda.cpp
new file mode 100644
index 0000000000..2b46f422fa
--- /dev/null
+++ b/src/USER-CUDA/pair_gran_hooke_cuda.cpp
@@ -0,0 +1,247 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under 
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Paul Crozier (SNL)
+------------------------------------------------------------------------- */
+
+#include <cmath>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include "pair_gran_hooke_cuda.h"
+#include "pair_gran_hooke_cuda_cu.h"
+#include "cuda_data.h"
+#include "atom.h"
+#include "comm.h"
+#include "force.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+#include "neigh_request.h"
+#include "modify.h"
+#include "fix_pour.h"
+#include "cuda_neigh_list.h"
+#include "update.h"
+#include "integrate.h"
+#include "respa.h"
+#include "memory.h"
+#include "error.h"
+#include "cuda.h"
+
+using namespace LAMMPS_NS;
+
+#define MIN(a,b) ((a) < (b) ? (a) : (b))
+#define MAX(a,b) ((a) > (b) ? (a) : (b))
+
+/* ---------------------------------------------------------------------- */
+
+PairGranHookeCuda::PairGranHookeCuda(LAMMPS *lmp) : PairGranHooke(lmp)
+{
+  // a /cuda style needs the Cuda object owned by the LAMMPS instance
+  cuda = lmp->cuda;
+  if (!cuda) error->all("You cannot use a /cuda class, without activating 'cuda' acceleration. Use no '-a' command line argument, or '-a cuda'.");
+
+  allocated2 = false;                          // allocate() has not filled cuda->shared_data.pair yet
+  cuda->shared_data.pair.cudable_force = 1;
+  cuda->setSystemParams();
+}
+
+/* ----------------------------------------------------------------------
+   remember pointer to arrays in cuda shared data
+------------------------------------------------------------------------- */
+
+void PairGranHookeCuda::allocate()
+{
+  if (!allocated) PairGranHooke::allocate();
+  if (allocated2) return;    // shared pair data was filled on an earlier call
+  allocated2 = true;
+
+  // the two (ntypes+1) x (ntypes+1) tables only carry six scalars in fixed slots
+  const int ntab = atom->ntypes + 1;
+  cuda->shared_data.pair.cutsq = cutsq;
+  memory->create(cuda->shared_data.pair.coeff1,ntab,ntab,"pair:cuda_coeff1");
+  memory->create(cuda->shared_data.pair.coeff2,ntab,ntab,"pair:cuda_coeff2");
+
+  cuda->shared_data.pair.coeff1[0][0] = kn;
+  cuda->shared_data.pair.coeff1[0][1] = kt;
+  cuda->shared_data.pair.coeff1[1][0] = gamman;
+  cuda->shared_data.pair.coeff1[1][1] = gammat;
+
+  cuda->shared_data.pair.coeff2[0][0] = xmu;
+  cuda->shared_data.pair.coeff2[0][1] = dampflag;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairGranHookeCuda::compute(int eflag, int vflag)
+{
+  cuda->shared_data.pair.use_block_per_atom = 0;
+  //cuda->cu_debugdata->memset_device(0);
+  if (eflag || vflag) ev_setup(eflag,vflag);
+
+  // upload the energy/virial accumulators before calling the CUDA pair routine
+  if (eflag) cuda->cu_eng_vdwl->upload();
+  if (vflag) cuda->cu_virial->upload();
+
+  Cuda_PairGranHookeCuda(&cuda->shared_data,&cuda_neigh_list->sneighlist,
+                         eflag,vflag,eflag_atom,vflag_atom);
+
+  // with collect_forces_later set, energy and virial are not downloaded here
+  const bool fetch_now = !cuda->shared_data.pair.collect_forces_later;
+  if (fetch_now && eflag) cuda->cu_eng_vdwl->download();
+  if (fetch_now && vflag) cuda->cu_virial->download();
+  //cuda->cu_debugdata->download();
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairGranHookeCuda::settings(int narg, char **arg)
+{
+  PairGranHooke::settings(narg,arg);   // nothing CUDA-specific here; the host style handles the arguments
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairGranHookeCuda::coeff(int narg, char **arg)
+{
+  PairGranHooke::coeff(narg,arg);   // host style handles the pair_coeff arguments
+  allocate();                       // then fill cuda->shared_data.pair (guarded, runs once)
+}
+
+void PairGranHookeCuda::init_style()
+{
+  MYDBG(printf("# CUDA PairGranHookeCuda::init_style start\n"); )
+
+  // request a full, granular, CUDA-enabled neighbor list;
+  // no request is made when whichflag is 0 and the integrate style is respa
+
+  const bool respa_setup =
+    update->whichflag == 0 && strcmp(update->integrate_style,"respa") == 0;
+  if (!respa_setup) {
+    const int irequest = neighbor->request(this);
+    neighbor->requests[irequest]->full = 1;
+    neighbor->requests[irequest]->half = 0;
+    neighbor->requests[irequest]->gran = 1;
+    neighbor->requests[irequest]->cudable = 1;
+    //neighbor->style=0; //0=NSQ neighboring
+  }
+
+  // required per-atom attributes and ghost-atom velocities
+
+  if (!atom->radius_flag || !atom->omega_flag || !atom->torque_flag)
+    error->all("Pair granular requires atom attributes radius, omega, torque");
+  if (comm->ghost_velocity == 0)
+    error->all("Pair granular requires ghost atoms store velocity");
+
+  dt = update->dt;
+
+  // locate Fix freeze (if any) and record its group bit, also in the shared CUDA data
+
+  int ifix;
+  const int nfix = modify->nfix;
+  for (ifix = 0; ifix < nfix; ifix++)
+    if (strcmp(modify->fix[ifix]->style,"freeze") == 0) break;
+  freeze_group_bit = (ifix < nfix) ? modify->fix[ifix]->groupbit : 0;
+  cuda->shared_data.pair.freeze_group_bit = freeze_group_bit;
+
+  // locate Fix pour (if any) and read its atom type and upper radius
+
+  int pour_type = 0;
+  double pour_maxrad = 0.0;
+  for (ifix = 0; ifix < nfix; ifix++)
+    if (strcmp(modify->fix[ifix]->style,"pour") == 0) break;
+  if (ifix < nfix) {
+    FixPour *pour = (FixPour *) modify->fix[ifix];
+    pour_type = pour->ntype;
+    pour_maxrad = pour->radius_hi;
+  }
+
+  // per-type largest radius among local atoms, split into frozen and dynamic
+  // particles a Fix pour will insert later count as dynamic
+
+  const int ntypes = atom->ntypes;
+  for (int itype = 1; itype <= ntypes; itype++)
+    onerad_dynamic[itype] = onerad_frozen[itype] = 0.0;
+  if (pour_type) onerad_dynamic[pour_type] = pour_maxrad;
+
+  const double *radius = atom->radius;
+  const int *mask = atom->mask;
+  const int *type = atom->type;
+  const int nlocal = atom->nlocal;
+
+  for (int iatom = 0; iatom < nlocal; iatom++) {
+    double *onerad =
+      (mask[iatom] & freeze_group_bit) ? onerad_frozen : onerad_dynamic;
+    const int itype = type[iatom];
+    onerad[itype] = MAX(onerad[itype],radius[iatom]);
+  }
+
+  // MPI_MAX reduction of the per-type values over the world communicator
+
+  MPI_Allreduce(&onerad_dynamic[1],&maxrad_dynamic[1],ntypes,
+                MPI_DOUBLE,MPI_MAX,world);
+  MPI_Allreduce(&onerad_frozen[1],&maxrad_frozen[1],ntypes,
+                MPI_DOUBLE,MPI_MAX,world);
+
+  MYDBG(printf("# CUDA PairGranHookeCuda::init_style end\n"); )
+}
+
+void PairGranHookeCuda::init_list(int id, NeighList *ptr)
+{
+  MYDBG(printf("# CUDA PairGranHookeCuda::init_list\n");)
+  PairGranHooke::init_list(id,ptr);
+#ifndef CUDA_USE_BINNING
+  // only the verlet list (id 0) is registered with the Cuda object, respa lists are not
+  // (see Neighbor::init() for details on lammps lists' logic)
+  if (id == 0) cuda_neigh_list = cuda->registerNeighborList(ptr);
+#endif
+  MYDBG(printf("# CUDA PairGranHookeCuda::init_list end\n");)
+}
+
+void PairGranHookeCuda::ev_setup(int eflag, int vflag)
+{
+  // remember both capacities first; NOTE(review): assumes the base ev_setup() updates maxeatom/maxvatom when it resizes - confirm
+  int maxeatomold=maxeatom, maxvatomold=maxvatom;
+  PairGranHooke::ev_setup(eflag,vflag);
+
+  if (eflag_atom && atom->nmax > maxeatomold)
+    {delete cuda->cu_eatom; cuda->cu_eatom = new cCudaData<double, ENERGY_FLOAT, x > ((double*)eatom, & cuda->shared_data.atom.eatom , atom->nmax  );}
+  // fix: the per-atom virial wrapper depends on vflag_atom/maxvatom and binds to atom.vatom (was eflag_atom/maxeatom/atom.eatom)
+  if (vflag_atom && atom->nmax > maxvatomold)
+    {delete cuda->cu_vatom; cuda->cu_vatom = new cCudaData<double, ENERGY_FLOAT, yx > ((double*)vatom, & cuda->shared_data.atom.vatom , atom->nmax, 6  );}
+}
+
+
diff --git a/src/USER-CUDA/pair_gran_hooke_cuda.h b/src/USER-CUDA/pair_gran_hooke_cuda.h
new file mode 100644
index 0000000000..727082f1f8
--- /dev/null
+++ b/src/USER-CUDA/pair_gran_hooke_cuda.h
@@ -0,0 +1,57 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(gran/hooke/cuda,PairGranHookeCuda)
+
+#else
+
+#ifndef PAIR_GRAN_HOOKE_CUDA_H
+#define PAIR_GRAN_HOOKE_CUDA_H
+
+#include "pair_gran_hooke.h"
+
+namespace LAMMPS_NS {
+
+class PairGranHookeCuda : public PairGranHooke
+{
+	public:
+		PairGranHookeCuda(class LAMMPS *);	// stops via error->all() when lmp->cuda is NULL
+		void compute(int, int);	// uploads energy/virial, calls Cuda_PairGranHookeCuda(), downloads unless deferred
+		void settings(int, char **);	// forwards to PairGranHooke::settings()
+		void coeff(int, char **);	// PairGranHooke::coeff() followed by allocate()
+		void init_list(int, class NeighList *);	// registers list id 0 with the Cuda object unless CUDA_USE_BINNING is defined
+		void init_style();	// neighbor request, freeze/pour lookup, per-type maximum radii
+		void ev_setup(int eflag, int vflag);	// base ev_setup() plus re-creation of cuda->cu_eatom / cu_vatom
+	protected:
+		class Cuda *cuda;	// copied from lmp->cuda in the constructor
+		void allocate();	// one-time fill of cuda->shared_data.pair
+		bool allocated2;	// true once allocate() has filled the shared pair data
+		class CudaNeighList* cuda_neigh_list;	// set in init_list(); its sneighlist is handed to the CUDA pair routine
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-CUDA/pair_lj96_cut_cuda.cpp b/src/USER-CUDA/pair_lj96_cut_cuda.cpp
new file mode 100644
index 0000000000..10e43d3278
--- /dev/null
+++ b/src/USER-CUDA/pair_lj96_cut_cuda.cpp
@@ -0,0 +1,184 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under 
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Paul Crozier (SNL)
+------------------------------------------------------------------------- */
+
+#include <cmath>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include "pair_lj96_cut_cuda.h"
+#include "pair_lj96_cut_cuda_cu.h"
+#include "cuda_data.h"
+#include "atom.h"
+#include "comm.h"
+#include "force.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+#include "neigh_request.h"
+#include "cuda_neigh_list.h"
+#include "update.h"
+#include "integrate.h"
+#include "respa.h"
+#include "memory.h"
+#include "error.h"
+#include "cuda.h"
+
+using namespace LAMMPS_NS;
+
+#define MIN(a,b) ((a) < (b) ? (a) : (b))
+#define MAX(a,b) ((a) > (b) ? (a) : (b))
+
+/* ---------------------------------------------------------------------- */
+
+PairLJ96CutCuda::PairLJ96CutCuda(LAMMPS *lmp) : PairLJ96Cut(lmp)
+{
+  // a /cuda style needs the Cuda object owned by the LAMMPS instance
+  cuda = lmp->cuda;
+  if (!cuda) error->all("You cannot use a /cuda class, without activating 'cuda' acceleration. Use no '-a' command line argument, or '-a cuda'.");
+
+  allocated2 = false;                          // allocate() has not filled cuda->shared_data.pair yet
+  cuda->shared_data.pair.cudable_force = 1;
+  cuda->setSystemParams();
+}
+
+/* ----------------------------------------------------------------------
+   remember pointer to arrays in cuda shared data
+------------------------------------------------------------------------- */
+
+void PairLJ96CutCuda::allocate()
+{
+  if (!allocated) PairLJ96Cut::allocate();
+  if (allocated2) return;    // pointers were already stored on an earlier call
+  allocated2 = true;
+
+  // store the host tables' pointers in the shared CUDA data (nothing is copied here)
+  cuda->shared_data.pair.cut = cut;
+  cuda->shared_data.pair.offset = offset;
+  cuda->shared_data.pair.coeff1 = lj1;
+  cuda->shared_data.pair.coeff2 = lj2;
+  cuda->shared_data.pair.coeff3 = lj3;
+  cuda->shared_data.pair.coeff4 = lj4;
+  cuda->shared_data.pair.special_lj = force->special_lj;
+  cuda->shared_data.pair.special_coul = force->special_coul;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairLJ96CutCuda::compute(int eflag, int vflag)
+{
+  if (eflag || vflag) ev_setup(eflag,vflag);
+  // upload the energy/virial accumulators before calling the CUDA pair routine
+  if (eflag) cuda->cu_eng_vdwl->upload();
+  if (vflag) cuda->cu_virial->upload();
+
+  Cuda_PairLJ96CutCuda(&cuda->shared_data,&cuda_neigh_list->sneighlist,
+                       eflag,vflag,eflag_atom,vflag_atom);
+
+  // with collect_forces_later set, energy and virial are not downloaded here
+  const bool fetch_now = !cuda->shared_data.pair.collect_forces_later;
+  if (fetch_now && eflag) cuda->cu_eng_vdwl->download();
+  if (fetch_now && vflag) cuda->cu_virial->download();
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairLJ96CutCuda::settings(int narg, char **arg)
+{
+  PairLJ96Cut::settings(narg,arg);
+  cuda->shared_data.pair.cut_global = static_cast<F_FLOAT>(cut_global);   // mirror the global cutoff into the shared CUDA data
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairLJ96CutCuda::coeff(int narg, char **arg)
+{
+  PairLJ96Cut::coeff(narg,arg);   // host style handles the pair_coeff arguments
+  allocate();                     // then fill cuda->shared_data.pair (guarded, runs once)
+}
+
+void PairLJ96CutCuda::init_style()
+{
+  MYDBG(printf("# CUDA PairLJ96CutCuda::init_style start\n"); )
+
+  // request a full, CUDA-enabled neighbor list;
+  // no request is made when whichflag is 0 and the integrate style is respa
+
+  const bool respa_setup =
+    update->whichflag == 0 && strcmp(update->integrate_style,"respa") == 0;
+  if (!respa_setup) {
+    const int irequest = neighbor->request(this);
+    neighbor->requests[irequest]->full = 1;
+    neighbor->requests[irequest]->half = 0;
+    neighbor->requests[irequest]->cudable = 1;
+    //neighbor->style=0; //0=NSQ neighboring
+  }
+
+  // no rRESPA cutoffs are set up by this style
+
+  cut_respa = NULL;
+
+  MYDBG(printf("# CUDA PairLJ96CutCuda::init_style end\n"); )
+}
+
+void PairLJ96CutCuda::init_list(int id, NeighList *ptr)
+{
+  MYDBG(printf("# CUDA PairLJ96CutCuda::init_list\n");)
+  PairLJ96Cut::init_list(id,ptr);
+#ifndef CUDA_USE_BINNING
+  // only the verlet list (id 0) is registered with the Cuda object, respa lists are not
+  // (see Neighbor::init() for details on lammps lists' logic)
+  if (id == 0) cuda_neigh_list = cuda->registerNeighborList(ptr);
+#endif
+  MYDBG(printf("# CUDA PairLJ96CutCuda::init_list end\n");)
+}
+
+void PairLJ96CutCuda::ev_setup(int eflag, int vflag)
+{
+  // remember both capacities first; NOTE(review): assumes the base ev_setup() updates maxeatom/maxvatom when it resizes - confirm
+  int maxeatomold=maxeatom, maxvatomold=maxvatom;
+  PairLJ96Cut::ev_setup(eflag,vflag);
+
+  if (eflag_atom && atom->nmax > maxeatomold)
+    {delete cuda->cu_eatom; cuda->cu_eatom = new cCudaData<double, ENERGY_FLOAT, x > ((double*)eatom, & cuda->shared_data.atom.eatom , atom->nmax  );}
+  // fix: the per-atom virial wrapper depends on vflag_atom/maxvatom and binds to atom.vatom (was eflag_atom/maxeatom/atom.eatom)
+  if (vflag_atom && atom->nmax > maxvatomold)
+    {delete cuda->cu_vatom; cuda->cu_vatom = new cCudaData<double, ENERGY_FLOAT, yx > ((double*)vatom, & cuda->shared_data.atom.vatom , atom->nmax, 6  );}
+}
+
+
diff --git a/src/USER-CUDA/pair_lj96_cut_cuda.h b/src/USER-CUDA/pair_lj96_cut_cuda.h
new file mode 100644
index 0000000000..0abb66f6aa
--- /dev/null
+++ b/src/USER-CUDA/pair_lj96_cut_cuda.h
@@ -0,0 +1,57 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(lj96/cut/cuda,PairLJ96CutCuda)
+
+#else
+
+#ifndef PAIR_LJ96_CUT_CUDA_H
+#define PAIR_LJ96_CUT_CUDA_H
+
+#include "pair_lj96_cut.h"
+
+namespace LAMMPS_NS {
+
+class PairLJ96CutCuda : public PairLJ96Cut
+{
+	public:
+		PairLJ96CutCuda(class LAMMPS *);	// stops via error->all() when lmp->cuda is NULL
+		void compute(int, int);	// uploads energy/virial, calls Cuda_PairLJ96CutCuda(), downloads unless deferred
+		void settings(int, char **);	// base settings(), then stores cut_global in the shared CUDA data
+		void coeff(int, char **);	// PairLJ96Cut::coeff() followed by allocate()
+		void init_list(int, class NeighList *);	// registers list id 0 with the Cuda object unless CUDA_USE_BINNING is defined
+		void init_style();	// requests a full, cudable neighbor list and clears cut_respa
+		void ev_setup(int eflag, int vflag);	// base ev_setup() plus re-creation of cuda->cu_eatom / cu_vatom
+	protected:
+		class Cuda *cuda;	// copied from lmp->cuda in the constructor
+		void allocate();	// one-time hand-over of coefficient table pointers to cuda->shared_data.pair
+		bool allocated2;	// true once allocate() has filled the shared pair data
+		class CudaNeighList* cuda_neigh_list;	// set in init_list(); its sneighlist is handed to the CUDA pair routine
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-CUDA/pair_lj_charmm_coul_charmm_cuda.cpp b/src/USER-CUDA/pair_lj_charmm_coul_charmm_cuda.cpp
new file mode 100644
index 0000000000..8e74daf70f
--- /dev/null
+++ b/src/USER-CUDA/pair_lj_charmm_coul_charmm_cuda.cpp
@@ -0,0 +1,193 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   Contributing author: Paul Crozier (SNL)
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#include <cmath>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include "pair_lj_charmm_coul_charmm_cuda.h"
+#include "pair_lj_charmm_coul_charmm_cuda_cu.h"
+#include "cuda_data.h"
+#include "atom.h"
+#include "comm.h"
+#include "force.h"
+#include "kspace.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+#include "neigh_request.h"
+#include "cuda_neigh_list.h"
+#include "update.h"
+#include "integrate.h"
+#include "respa.h"
+#include "memory.h"
+#include "error.h"
+#include "cuda.h"
+
+using namespace LAMMPS_NS;
+
+#define MIN(a,b) ((a) < (b) ? (a) : (b))
+#define MAX(a,b) ((a) > (b) ? (a) : (b))
+
+/* ---------------------------------------------------------------------- */
+
+PairLJCharmmCoulCharmmCuda::PairLJCharmmCoulCharmmCuda(LAMMPS *lmp) : PairLJCharmmCoulCharmm(lmp)
+{
+  // a /cuda style needs the Cuda object owned by the LAMMPS instance
+  cuda = lmp->cuda;
+  if (!cuda) error->all("You cannot use a /cuda class, without activating 'cuda' acceleration. Use no '-a' command line argument, or '-a cuda'.");
+
+  allocated2 = false;                          // allocate() has not filled cuda->shared_data.pair yet
+  cuda->shared_data.pair.cudable_force = 1;
+  cuda->shared_data.pair.use_block_per_atom = 0;
+  cuda->setSystemParams();
+}
+
+/* ----------------------------------------------------------------------
+   remember pointer to arrays in cuda shared data
+------------------------------------------------------------------------- */
+
+void PairLJCharmmCoulCharmmCuda::allocate()
+{
+  if (!allocated) PairLJCharmmCoulCharmm::allocate();
+  if (allocated2) return;    // shared pair data was filled on an earlier call
+  allocated2 = true;
+  cuda->shared_data.pair.coeff1 = lj1;
+  cuda->shared_data.pair.coeff2 = lj2;
+  cuda->shared_data.pair.coeff3 = lj3;
+  cuda->shared_data.pair.coeff4 = lj4;
+  cuda->shared_data.pair.special_lj = force->special_lj;
+  cuda->shared_data.pair.special_coul = force->special_coul;
+  // cCudaData wrappers tying each LJ table to its *_gm entry; NOTE(review): no matching delete is visible in this file
+  const int ncoeff = (atom->ntypes+1)*(atom->ntypes+1);
+  cu_lj1_gm = new cCudaData<double, F_FLOAT, x> ((double*)lj1, &cuda->shared_data.pair.coeff1_gm, ncoeff);
+  cu_lj2_gm = new cCudaData<double, F_FLOAT, x> ((double*)lj2, &cuda->shared_data.pair.coeff2_gm, ncoeff);
+  cu_lj3_gm = new cCudaData<double, F_FLOAT, x> ((double*)lj3, &cuda->shared_data.pair.coeff3_gm, ncoeff);
+  cu_lj4_gm = new cCudaData<double, F_FLOAT, x> ((double*)lj4, &cuda->shared_data.pair.coeff4_gm, ncoeff);
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairLJCharmmCoulCharmmCuda::compute(int eflag, int vflag)
+{
+  if (eflag || vflag) ev_setup(eflag,vflag);
+
+  // energy/virial accumulators are only transferred here when collection is not deferred
+  if (!cuda->shared_data.pair.collect_forces_later) {
+    if (eflag) { cuda->cu_eng_vdwl->upload(); cuda->cu_eng_coul->upload(); }
+    if (vflag) cuda->cu_virial->upload();
+  }
+
+  Cuda_PairLJCharmmCoulCharmmCuda(&cuda->shared_data,&cuda_neigh_list->sneighlist,
+                                  eflag,vflag,eflag_atom,vflag_atom,
+                                  denom_lj,cut_coul_innersq,denom_coul);
+
+  if (!cuda->shared_data.pair.collect_forces_later) {
+    if (eflag) { cuda->cu_eng_vdwl->download(); cuda->cu_eng_coul->download(); }
+    if (vflag) cuda->cu_virial->download();
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairLJCharmmCoulCharmmCuda::settings(int narg, char **arg)
+{
+  PairLJCharmmCoulCharmm::settings(narg, arg);   // host style handles the pair_style arguments
+  cuda->shared_data.pair.cut_global = (X_FLOAT) cut_lj;
+  cuda->shared_data.pair.cut_coulsq_global = (X_FLOAT) (cut_coul * cut_coul);   // fix: cut_coulsq is only assigned in init_style(), so square cut_coul here
+  cuda->shared_data.pair.cut_inner_global = (F_FLOAT) cut_lj_inner;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairLJCharmmCoulCharmmCuda::coeff(int narg, char **arg)
+{
+  PairLJCharmmCoulCharmm::coeff(narg,arg);   // host style handles the pair_coeff arguments
+  allocate();                                // then fill cuda->shared_data.pair (guarded, runs once)
+}
+
+void PairLJCharmmCoulCharmmCuda::init_style()
+{
+  // fix: the message now names this style (it used to say lj/charmm/coul/long)
+  if (!atom->q_flag)
+    error->all("Pair style lj/charmm/coul/charmm/cuda requires atom attribute q");
+
+  // molecular systems set collect_forces_later, which makes compute() skip its energy/virial transfers
+  if (atom->molecular) cuda->shared_data.pair.collect_forces_later = 1;
+
+  // always request a full, CUDA-enabled neighbor list
+  int irequest = neighbor->request(this);
+  neighbor->requests[irequest]->full = 1;
+  neighbor->requests[irequest]->half = 0;
+  neighbor->requests[irequest]->cudable = 1;
+
+  if (cut_lj_inner >= cut_lj || cut_coul_inner >= cut_coul)
+    error->all("Pair inner cutoff >= Pair outer cutoff");
+
+  // squared cutoffs
+
+  cut_lj_innersq = cut_lj_inner * cut_lj_inner;
+  cut_ljsq = cut_lj * cut_lj;
+  cut_coul_innersq = cut_coul_inner * cut_coul_inner;
+  cut_coulsq = cut_coul * cut_coul;
+  cut_bothsq = MAX(cut_ljsq,cut_coulsq);
+
+  // cubes of (outer^2 - inner^2); compute() passes them to the CUDA pair routine
+
+  denom_lj = (cut_ljsq-cut_lj_innersq) * (cut_ljsq-cut_lj_innersq) *
+    (cut_ljsq-cut_lj_innersq);
+  denom_coul = (cut_coulsq-cut_coul_innersq) * (cut_coulsq-cut_coul_innersq) *
+    (cut_coulsq-cut_coul_innersq);
+
+  // store derived values in the shared CUDA data; cut_coulsq is already final,
+  // so the duplicate recomputation that used to sit here was dropped
+  cuda->shared_data.pair.cut_coulsq_global = cut_coulsq;
+  cuda->shared_data.pppm.qqrd2e = force->qqrd2e;
+}
+
+void PairLJCharmmCoulCharmmCuda::init_list(int id, NeighList *ptr)
+{
+  MYDBG(printf("# CUDA PairLJCharmmCoulCharmmCuda::init_list\n");)
+  PairLJCharmmCoulCharmm::init_list(id,ptr);
+#ifndef CUDA_USE_BINNING
+  // only the verlet list (id 0) is registered with the Cuda object, respa lists are not
+  // (see Neighbor::init() for details on lammps lists' logic)
+  if (id == 0) cuda_neigh_list = cuda->registerNeighborList(ptr);
+#endif
+  MYDBG(printf("# CUDA PairLJCharmmCoulCharmmCuda::init_list end\n");)
+}
+
+void PairLJCharmmCoulCharmmCuda::ev_setup(int eflag, int vflag)
+{
+  // remember both capacities first; NOTE(review): assumes the base ev_setup() updates maxeatom/maxvatom when it resizes - confirm
+  int maxeatomold=maxeatom, maxvatomold=maxvatom;
+  PairLJCharmmCoulCharmm::ev_setup(eflag,vflag);
+
+  if (eflag_atom && atom->nmax > maxeatomold)
+    {delete cuda->cu_eatom; cuda->cu_eatom = new cCudaData<double, ENERGY_FLOAT, x > ((double*)eatom, & cuda->shared_data.atom.eatom , atom->nmax  );}
+  // fix: the per-atom virial wrapper depends on vflag_atom/maxvatom and binds to atom.vatom (was eflag_atom/maxeatom/atom.eatom)
+  if (vflag_atom && atom->nmax > maxvatomold)
+    {delete cuda->cu_vatom; cuda->cu_vatom = new cCudaData<double, ENERGY_FLOAT, yx > ((double*)vatom, & cuda->shared_data.atom.vatom , atom->nmax, 6  );}
+}
+
+
diff --git a/src/USER-CUDA/pair_lj_charmm_coul_charmm_cuda.h b/src/USER-CUDA/pair_lj_charmm_coul_charmm_cuda.h
new file mode 100644
index 0000000000..39ec4735ef
--- /dev/null
+++ b/src/USER-CUDA/pair_lj_charmm_coul_charmm_cuda.h
@@ -0,0 +1,63 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(lj/charmm/coul/charmm/cuda,PairLJCharmmCoulCharmmCuda)
+
+#else
+
+#ifndef LMP_PAIR_LJ_CHARMM_COUL_CHARMM_CUDA_H
+#define LMP_PAIR_LJ_CHARMM_COUL_CHARMM_CUDA_H
+
+#include "pair_lj_charmm_coul_charmm.h"
+#include "cuda_data.h"
+
+namespace LAMMPS_NS {
+
+class PairLJCharmmCoulCharmmCuda : public PairLJCharmmCoulCharmm
+{
+	public:
+		PairLJCharmmCoulCharmmCuda(class LAMMPS *);	// stops via error->all() when lmp->cuda is NULL
+		void compute(int, int);	// calls Cuda_PairLJCharmmCoulCharmmCuda(); energy/virial transfers skipped when deferred
+		void settings(int, char **);	// base settings(), then stores the global cutoffs in the shared CUDA data
+		void coeff(int, char **);	// PairLJCharmmCoulCharmm::coeff() followed by allocate()
+		void init_list(int, class NeighList *);	// registers list id 0 with the Cuda object unless CUDA_USE_BINNING is defined
+		void init_style();	// checks q and cutoffs, requests a full cudable list, derives squared cutoffs and denominators
+		void ev_setup(int eflag, int vflag);	// base ev_setup() plus re-creation of cuda->cu_eatom / cu_vatom
+	protected:
+		class Cuda *cuda;	// copied from lmp->cuda in the constructor
+		void allocate();	// one-time fill of cuda->shared_data.pair and creation of the cu_lj*_gm wrappers
+		bool allocated2;	// true once allocate() has filled the shared pair data
+		class CudaNeighList* cuda_neigh_list;	// set in init_list(); its sneighlist is handed to the CUDA pair routine
+		cCudaData<double  , F_FLOAT , x >* cu_lj1_gm;	// created in allocate() over lj1
+		cCudaData<double  , F_FLOAT , x >* cu_lj2_gm;	// created in allocate() over lj2
+		cCudaData<double  , F_FLOAT , x >* cu_lj3_gm;	// created in allocate() over lj3
+		cCudaData<double  , F_FLOAT , x >* cu_lj4_gm;	// created in allocate() over lj4
+		// NOTE(review): no destructor is declared here, so the cu_lj*_gm objects look like they are never freed - confirm
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-CUDA/pair_lj_charmm_coul_charmm_implicit_cuda.cpp b/src/USER-CUDA/pair_lj_charmm_coul_charmm_implicit_cuda.cpp
new file mode 100644
index 0000000000..9a4bed09eb
--- /dev/null
+++ b/src/USER-CUDA/pair_lj_charmm_coul_charmm_implicit_cuda.cpp
@@ -0,0 +1,188 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   Contributing author: Paul Crozier (SNL)
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#include <cmath>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include "pair_lj_charmm_coul_charmm_implicit_cuda.h"
+#include "pair_lj_charmm_coul_charmm_implicit_cuda_cu.h"
+#include "cuda_data.h"
+#include "atom.h"
+#include "comm.h"
+#include "force.h"
+#include "kspace.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+#include "neigh_request.h"
+#include "cuda_neigh_list.h"
+#include "update.h"
+#include "integrate.h"
+#include "respa.h"
+#include "memory.h"
+#include "error.h"
+#include "cuda.h"
+
+using namespace LAMMPS_NS;
+
+#define MIN(a,b) ((a) < (b) ? (a) : (b))
+#define MAX(a,b) ((a) > (b) ? (a) : (b))
+
+/* ---------------------------------------------------------------------- */
+
+PairLJCharmmCoulCharmmImplicitCuda::PairLJCharmmCoulCharmmImplicitCuda(LAMMPS *lmp) : PairLJCharmmCoulCharmmImplicit(lmp)
+{
+  cuda = lmp->cuda;
+  if (cuda == NULL)  // cuda is dereferenced below; error->all() is relied on not to return
+    error->all("You cannot use a /cuda class, without activating 'cuda' acceleration. Use no '-a' command line argument, or '-a cuda'.");
+  cuda_neigh_list = NULL;  // only assigned later in init_list(); start from a known value, not garbage
+  allocated2 = false;      // allocate() has not yet registered the coefficient arrays
+  cuda->shared_data.pair.cudable_force = 1;
+  cuda->shared_data.pair.collect_forces_later = 1;
+  cuda->setSystemParams();
+}
+
+/* ----------------------------------------------------------------------
+   remember pointer to arrays in cuda shared data
+------------------------------------------------------------------------- */
+
+void PairLJCharmmCoulCharmmImplicitCuda::allocate()
+{
+  // base-class allocation first (lj1..lj4 are used below), then a one-time hand-over to CUDA
+  if (!allocated) PairLJCharmmCoulCharmmImplicit::allocate();
+  if (allocated2) return;
+  allocated2 = true;
+  const int ncoeff = (atom->ntypes+1)*(atom->ntypes+1);  // element count given to every wrapper
+  cuda->shared_data.pair.coeff1 = lj1;
+  cuda->shared_data.pair.coeff2 = lj2;
+  cuda->shared_data.pair.coeff3 = lj3;
+  cuda->shared_data.pair.coeff4 = lj4;
+  cuda->shared_data.pair.special_lj = force->special_lj;
+  cuda->shared_data.pair.special_coul = force->special_coul;
+  cu_lj1_gm = new cCudaData<double, F_FLOAT, x> ((double*)lj1, &cuda->shared_data.pair.coeff1_gm, ncoeff);
+  cu_lj2_gm = new cCudaData<double, F_FLOAT, x> ((double*)lj2, &cuda->shared_data.pair.coeff2_gm, ncoeff);
+  cu_lj3_gm = new cCudaData<double, F_FLOAT, x> ((double*)lj3, &cuda->shared_data.pair.coeff3_gm, ncoeff);
+  cu_lj4_gm = new cCudaData<double, F_FLOAT, x> ((double*)lj4, &cuda->shared_data.pair.coeff4_gm, ncoeff);
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairLJCharmmCoulCharmmImplicitCuda::compute(int eflag, int vflag)
+{
+  // tally setup is needed only when energy or virial output was requested
+  if (eflag || vflag) ev_setup(eflag,vflag);
+  if (!cuda->shared_data.pair.collect_forces_later) {
+    // upload the energy/virial buffers (host -> device, judging by the method name)
+    if (eflag) { cuda->cu_eng_vdwl->upload(); cuda->cu_eng_coul->upload(); }
+    if (vflag) cuda->cu_virial->upload();
+  }
+
+  Cuda_PairLJCharmmCoulCharmmImplicitCuda(&cuda->shared_data, &cuda_neigh_list->sneighlist, eflag, vflag, eflag_atom, vflag_atom, denom_lj, cut_coul_innersq, denom_coul);
+
+  // the flag is read again after the call rather than cached
+  if (!cuda->shared_data.pair.collect_forces_later) {
+    // download the same buffers once the force routine has returned
+    if (eflag) { cuda->cu_eng_vdwl->download(); cuda->cu_eng_coul->download(); }
+    if (vflag) cuda->cu_virial->download();
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairLJCharmmCoulCharmmImplicitCuda::settings(int narg, char **arg)
+{
+  PairLJCharmmCoulCharmmImplicit::settings(narg, arg);  // base-class argument handling first
+  cuda->shared_data.pair.cut_global = static_cast<X_FLOAT>(cut_lj);  // then copy the cutoff members into the CUDA shared data
+  cuda->shared_data.pair.cut_coulsq_global = static_cast<X_FLOAT>(cut_coulsq);
+  cuda->shared_data.pair.cut_inner_global = static_cast<F_FLOAT>(cut_lj_inner);
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairLJCharmmCoulCharmmImplicitCuda::coeff(int narg, char **arg)
+{
+  PairLJCharmmCoulCharmmImplicit::coeff(narg, arg);  // delegate argument handling to the base class
+  this->allocate();  // CUDA-side registration; guarded by allocated2 so it runs once
+}
+
+void PairLJCharmmCoulCharmmImplicitCuda::init_style()
+{
+  // Validates the setup, requests the neighbor list this style needs and
+  // derives the squared cutoffs and denominators that compute() passes on.
+  // The charge error names this style as registered via PairStyle.
+  if (!atom->q_flag)
+    error->all("Pair style lj/charmm/coul/charmm/implicit/cuda requires atom attribute q");
+
+  // a single full (not half) list, flagged cudable; no rRESPA lists are requested
+  const int irequest = neighbor->request(this);
+  neighbor->requests[irequest]->full = 1;
+  neighbor->requests[irequest]->half = 0;
+  neighbor->requests[irequest]->cudable = 1;
+
+  if (cut_lj_inner >= cut_lj || cut_coul_inner >= cut_coul)
+    error->all("Pair inner cutoff >= Pair outer cutoff");
+
+  cut_lj_innersq = cut_lj_inner * cut_lj_inner;
+  cut_ljsq = cut_lj * cut_lj;
+  cut_coul_innersq = cut_coul_inner * cut_coul_inner;
+  cut_coulsq = cut_coul * cut_coul;
+  cut_bothsq = MAX(cut_ljsq,cut_coulsq);
+
+  // cube of (outer^2 - inner^2), once for LJ and once for Coulomb
+  denom_lj = (cut_ljsq-cut_lj_innersq) * (cut_ljsq-cut_lj_innersq) *
+    (cut_ljsq-cut_lj_innersq);
+  denom_coul = (cut_coulsq-cut_coul_innersq) * (cut_coulsq-cut_coul_innersq) *
+    (cut_coulsq-cut_coul_innersq);
+
+  // cut_coulsq is already final here, so it is copied without recomputing it
+  cuda->shared_data.pair.cut_coulsq_global=cut_coulsq;
+  cuda->shared_data.pppm.qqrd2e=force->qqrd2e;
+}
+
+void PairLJCharmmCoulCharmmImplicitCuda::init_list(int id, NeighList *ptr)
+{
+  MYDBG(printf("# CUDA PairLJCharmmCoulCharmmImplicitCuda::init_list\n");)
+  PairLJCharmmCoulCharmmImplicit::init_list(id, ptr);  // base-class handling first
+#ifndef CUDA_USE_BINNING
+  // Only list id 0 (the Verlet list) is registered with the Cuda object; rRESPA
+  // lists are not handled here. See Neighbor::init() for the list-id logic.
+  if (id == 0) { cuda_neigh_list = cuda->registerNeighborList(ptr); }
+#endif
+  MYDBG(printf("# CUDA PairLJCharmmCoulCharmmImplicitCuda::init_list end\n");)
+}
+
+void PairLJCharmmCoulCharmmImplicitCuda::ev_setup(int eflag, int vflag)
+{
+  // Re-wraps eatom/vatom for CUDA when atom->nmax exceeds the capacity recorded
+  // before the base-class call (the base class is expected to reallocate them then).
+  const int maxeatomold = maxeatom;
+  const int maxvatomold = maxvatom;
+  PairLJCharmmCoulCharmmImplicit::ev_setup(eflag,vflag);
+  if (eflag_atom && atom->nmax > maxeatomold)
+    {delete cuda->cu_eatom; cuda->cu_eatom = new cCudaData<double, ENERGY_FLOAT, x > ((double*)eatom, & cuda->shared_data.atom.eatom , atom->nmax  );}
+  if (vflag_atom && atom->nmax > maxvatomold)  // virial wrapper: keyed on vflag_atom/maxvatom, bound to atom.vatom
+    {delete cuda->cu_vatom; cuda->cu_vatom = new cCudaData<double, ENERGY_FLOAT, yx > ((double*)vatom, & cuda->shared_data.atom.vatom , atom->nmax, 6  );}
+}
+
+
diff --git a/src/USER-CUDA/pair_lj_charmm_coul_charmm_implicit_cuda.h b/src/USER-CUDA/pair_lj_charmm_coul_charmm_implicit_cuda.h
new file mode 100644
index 0000000000..94d8d09543
--- /dev/null
+++ b/src/USER-CUDA/pair_lj_charmm_coul_charmm_implicit_cuda.h
@@ -0,0 +1,62 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(lj/charmm/coul/charmm/implicit/cuda,PairLJCharmmCoulCharmmImplicitCuda)
+
+#else
+
+#ifndef LMP_PAIR_LJ_CHARMM_COUL_CHARMM_IMPLICIT_CUDA_H
+#define LMP_PAIR_LJ_CHARMM_COUL_CHARMM_IMPLICIT_CUDA_H
+
+#include "pair_lj_charmm_coul_charmm_implicit.h"
+#include "cuda_data.h"
+
+namespace LAMMPS_NS {
+
+class PairLJCharmmCoulCharmmImplicitCuda : public PairLJCharmmCoulCharmmImplicit  // pair style lj/charmm/coul/charmm/implicit/cuda
+{
+	public:
+		PairLJCharmmCoulCharmmImplicitCuda(class LAMMPS *);  // reports an error when lmp->cuda is NULL
+		void compute(int, int);  // (eflag, vflag): calls Cuda_PairLJCharmmCoulCharmmImplicitCuda
+		void settings(int, char **);  // base settings(), then copies cutoffs into cuda->shared_data.pair
+		void coeff(int, char **);  // base coeff(), then allocate()
+		void init_list(int, class NeighList *);  // registers list id 0 with the Cuda object unless CUDA_USE_BINNING is defined
+		void init_style();  // requests a full, cudable neighbor list and derives cutoff terms
+		void ev_setup(int eflag, int vflag);  // base ev_setup(), then re-creates cuda->cu_eatom / cuda->cu_vatom
+	protected:
+		class Cuda *cuda;  // taken from lmp->cuda in the constructor
+		void allocate();  // one-time hand-over of lj1..lj4 to the CUDA shared data
+		bool allocated2;  // true once allocate() has done that hand-over
+		class CudaNeighList* cuda_neigh_list;  // set in init_list(), passed to the force routine in compute()
+		cCudaData<double  , F_FLOAT , x >* cu_lj1_gm;  // created in allocate() around lj1
+		cCudaData<double  , F_FLOAT , x >* cu_lj2_gm;  // created in allocate() around lj2
+		cCudaData<double  , F_FLOAT , x >* cu_lj3_gm;  // created in allocate() around lj3
+		cCudaData<double  , F_FLOAT , x >* cu_lj4_gm;  // created in allocate() around lj4
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-CUDA/pair_lj_charmm_coul_long_cuda.cpp b/src/USER-CUDA/pair_lj_charmm_coul_long_cuda.cpp
new file mode 100644
index 0000000000..4ba45efd54
--- /dev/null
+++ b/src/USER-CUDA/pair_lj_charmm_coul_long_cuda.cpp
@@ -0,0 +1,201 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   Contributing author: Paul Crozier (SNL)
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#include <cmath>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include "pair_lj_charmm_coul_long_cuda.h"
+#include "pair_lj_charmm_coul_long_cuda_cu.h"
+#include "cuda_data.h"
+#include "atom.h"
+#include "comm.h"
+#include "force.h"
+#include "kspace.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+#include "neigh_request.h"
+#include "cuda_neigh_list.h"
+#include "update.h"
+#include "integrate.h"
+#include "respa.h"
+#include "memory.h"
+#include "error.h"
+#include "cuda.h"
+
+using namespace LAMMPS_NS;
+
+#define MIN(a,b) ((a) < (b) ? (a) : (b))
+#define MAX(a,b) ((a) > (b) ? (a) : (b))
+
+#define EWALD_F   1.12837917
+#define EWALD_P   0.3275911
+#define A1        0.254829592
+#define A2       -0.284496736
+#define A3        1.421413741
+#define A4       -1.453152027
+#define A5        1.061405429
+/* ---------------------------------------------------------------------- */
+
+PairLJCharmmCoulLongCuda::PairLJCharmmCoulLongCuda(LAMMPS *lmp) : PairLJCharmmCoulLong(lmp)
+{
+  cuda = lmp->cuda;
+  if (cuda == NULL)  // cuda is dereferenced below; error->all() is relied on not to return
+    error->all("You cannot use a /cuda class, without activating 'cuda' acceleration. Use no '-a' command line argument, or '-a cuda'.");
+  cuda_neigh_list = NULL;  // only assigned later in init_list(); start from a known value, not garbage
+  allocated2 = false;      // allocate() has not yet registered the coefficient arrays
+  cuda->shared_data.pair.cudable_force = 1;
+  cuda->shared_data.pair.collect_forces_later = 1;
+  cuda->setSystemParams();
+}
+
+/* ----------------------------------------------------------------------
+   remember pointer to arrays in cuda shared data
+------------------------------------------------------------------------- */
+
+void PairLJCharmmCoulLongCuda::allocate()
+{
+  // base-class allocation first (lj1..lj4 and offset are used below), then a one-time hand-over to CUDA
+  if (!allocated) PairLJCharmmCoulLong::allocate();
+  if (allocated2) return;
+  allocated2 = true;
+  // note: shared_data.pair.cut is left untouched here (its assignment was disabled in the original)
+  const int ncoeff = (atom->ntypes+1)*(atom->ntypes+1);  // element count given to every wrapper
+  cuda->shared_data.pair.coeff1 = lj1;
+  cuda->shared_data.pair.coeff2 = lj2;
+  cuda->shared_data.pair.coeff3 = lj3;
+  cuda->shared_data.pair.coeff4 = lj4;
+  cuda->shared_data.pair.offset = offset;
+  cuda->shared_data.pair.special_lj = force->special_lj;
+  cuda->shared_data.pair.special_coul = force->special_coul;
+  cu_lj1_gm = new cCudaData<double, F_FLOAT, x> ((double*)lj1, &cuda->shared_data.pair.coeff1_gm, ncoeff);
+  cu_lj2_gm = new cCudaData<double, F_FLOAT, x> ((double*)lj2, &cuda->shared_data.pair.coeff2_gm, ncoeff);
+  cu_lj3_gm = new cCudaData<double, F_FLOAT, x> ((double*)lj3, &cuda->shared_data.pair.coeff3_gm, ncoeff);
+  cu_lj4_gm = new cCudaData<double, F_FLOAT, x> ((double*)lj4, &cuda->shared_data.pair.coeff4_gm, ncoeff);
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairLJCharmmCoulLongCuda::compute(int eflag, int vflag)
+{
+  // tally setup is needed only when energy or virial output was requested
+  if (eflag || vflag) ev_setup(eflag,vflag);
+  if (!cuda->shared_data.pair.collect_forces_later) {
+    // upload the energy/virial buffers (host -> device, judging by the method name)
+    if (eflag) { cuda->cu_eng_vdwl->upload(); cuda->cu_eng_coul->upload(); }
+    if (vflag) cuda->cu_virial->upload();
+  }
+
+  Cuda_PairLJCharmmCoulLongCuda(&cuda->shared_data, &cuda_neigh_list->sneighlist, eflag, vflag, eflag_atom, vflag_atom, denom_lj);
+
+  // the flag is read again after the call rather than cached
+  if (!cuda->shared_data.pair.collect_forces_later) {
+    // download the same buffers once the force routine has returned
+    if (eflag) { cuda->cu_eng_vdwl->download(); cuda->cu_eng_coul->download(); }
+    if (vflag) cuda->cu_virial->download();
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairLJCharmmCoulLongCuda::settings(int narg, char **arg)
+{
+  PairLJCharmmCoulLong::settings(narg, arg);  // base-class argument handling first
+  cuda->shared_data.pair.cut_global = static_cast<X_FLOAT>(cut_lj);  // then copy the cutoff members into the CUDA shared data
+  cuda->shared_data.pair.cut_coulsq_global = static_cast<X_FLOAT>(cut_coulsq);
+  cuda->shared_data.pair.cut_inner_global = static_cast<F_FLOAT>(cut_lj_inner);
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairLJCharmmCoulLongCuda::coeff(int narg, char **arg)
+{
+  PairLJCharmmCoulLong::coeff(narg, arg);  // delegate argument handling to the base class
+  this->allocate();  // CUDA-side registration; guarded by allocated2 so it runs once
+}
+
+void PairLJCharmmCoulLongCuda::init_style()
+{
+  // The checks run in a fixed order: charge attribute, neighbor request,
+  // LJ cutoffs, then the KSpace lookup that provides g_ewald.
+  if (!atom->q_flag) {
+    error->all("Pair style lj/charmm/coul/long requires atom attribute q");
+  }
+
+  // ask for a full (not half) neighbor list that is flagged as cudable
+  const int irequest = neighbor->request(this);
+  neighbor->requests[irequest]->full = 1;
+  neighbor->requests[irequest]->half = 0;
+  neighbor->requests[irequest]->cudable = 1;
+
+  if (cut_lj_inner >= cut_lj) {
+    error->all("Pair inner cutoff >= Pair outer cutoff");
+  }
+
+  // squared cutoffs, and the cube of (cut_lj^2 - cut_lj_inner^2)
+  cut_lj_innersq = cut_lj_inner * cut_lj_inner;
+  cut_ljsq = cut_lj * cut_lj;
+  cut_coulsq = cut_coul * cut_coul;
+  cut_bothsq = MAX(cut_ljsq,cut_coulsq);
+  denom_lj = (cut_ljsq-cut_lj_innersq) * (cut_ljsq-cut_lj_innersq) *
+    (cut_ljsq-cut_lj_innersq);
+  cuda->shared_data.pair.cut_coulsq_global = cut_coulsq;
+
+  // g_ewald is read from force->kspace, so a KSpace style must be defined
+  if (force->kspace == NULL) {
+    error->all("Pair style is incompatible with KSpace style");
+  }
+  g_ewald = force->kspace->g_ewald;
+  cuda->shared_data.pair.g_ewald = g_ewald;
+  cuda->shared_data.pppm.qqrd2e = force->qqrd2e;
+
+  if (ncoultablebits) error->warning("# CUDA: You asked for the useage of Coulomb Tables. This is not supported in CUDA Pair forces. Setting is ignored.\n");
+}
+
+void PairLJCharmmCoulLongCuda::init_list(int id, NeighList *ptr)
+{
+  MYDBG(printf("# CUDA PairLJCharmmCoulLongCuda::init_list\n");)
+  PairLJCharmmCoulLong::init_list(id, ptr);  // base-class handling first
+#ifndef CUDA_USE_BINNING
+  // Only list id 0 (the Verlet list) is registered with the Cuda object; rRESPA
+  // lists are not handled here. See Neighbor::init() for the list-id logic.
+  if (id == 0) { cuda_neigh_list = cuda->registerNeighborList(ptr); }
+#endif
+  MYDBG(printf("# CUDA PairLJCharmmCoulLongCuda::init_list end\n");)
+}
+
+void PairLJCharmmCoulLongCuda::ev_setup(int eflag, int vflag)
+{
+  // Re-wraps eatom/vatom for CUDA when atom->nmax exceeds the capacity recorded
+  // before the base-class call (the base class is expected to reallocate them then).
+  const int maxeatomold = maxeatom;
+  const int maxvatomold = maxvatom;
+  PairLJCharmmCoulLong::ev_setup(eflag,vflag);
+  if (eflag_atom && atom->nmax > maxeatomold)
+    {delete cuda->cu_eatom; cuda->cu_eatom = new cCudaData<double, ENERGY_FLOAT, x > ((double*)eatom, & cuda->shared_data.atom.eatom , atom->nmax  );}
+  if (vflag_atom && atom->nmax > maxvatomold)  // virial wrapper: keyed on vflag_atom/maxvatom, bound to atom.vatom
+    {delete cuda->cu_vatom; cuda->cu_vatom = new cCudaData<double, ENERGY_FLOAT, yx > ((double*)vatom, & cuda->shared_data.atom.vatom , atom->nmax, 6  );}
+}
+
+
diff --git a/src/USER-CUDA/pair_lj_charmm_coul_long_cuda.h b/src/USER-CUDA/pair_lj_charmm_coul_long_cuda.h
new file mode 100644
index 0000000000..4548883aaa
--- /dev/null
+++ b/src/USER-CUDA/pair_lj_charmm_coul_long_cuda.h
@@ -0,0 +1,62 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(lj/charmm/coul/long/cuda,PairLJCharmmCoulLongCuda)
+
+#else
+
+#ifndef LMP_PAIR_LJ_CHARMM_COUL_LONG_CUDA_H
+#define LMP_PAIR_LJ_CHARMM_COUL_LONG_CUDA_H
+
+#include "pair_lj_charmm_coul_long.h"
+#include "cuda_data.h"
+
+namespace LAMMPS_NS {
+
+class PairLJCharmmCoulLongCuda : public PairLJCharmmCoulLong  // pair style lj/charmm/coul/long/cuda
+{
+	public:
+		PairLJCharmmCoulLongCuda(class LAMMPS *);  // reports an error when lmp->cuda is NULL
+		void compute(int, int);  // (eflag, vflag): calls Cuda_PairLJCharmmCoulLongCuda
+		void settings(int, char **);  // base settings(), then copies cutoffs into cuda->shared_data.pair
+		void coeff(int, char **);  // base coeff(), then allocate()
+		void init_list(int, class NeighList *);  // registers list id 0 with the Cuda object unless CUDA_USE_BINNING is defined
+		void init_style();  // requests a full, cudable neighbor list; reads g_ewald from force->kspace
+		void ev_setup(int eflag, int vflag);  // base ev_setup(), then re-creates cuda->cu_eatom / cuda->cu_vatom
+	protected:
+		class Cuda *cuda;  // taken from lmp->cuda in the constructor
+		void allocate();  // one-time hand-over of lj1..lj4 and offset to the CUDA shared data
+		bool allocated2;  // true once allocate() has done that hand-over
+		class CudaNeighList* cuda_neigh_list;  // set in init_list(), passed to the force routine in compute()
+		cCudaData<double  , F_FLOAT , x >* cu_lj1_gm;  // created in allocate() around lj1
+		cCudaData<double  , F_FLOAT , x >* cu_lj2_gm;  // created in allocate() around lj2
+		cCudaData<double  , F_FLOAT , x >* cu_lj3_gm;  // created in allocate() around lj3
+		cCudaData<double  , F_FLOAT , x >* cu_lj4_gm;  // created in allocate() around lj4
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-CUDA/pair_lj_class2_coul_cut_cuda.cpp b/src/USER-CUDA/pair_lj_class2_coul_cut_cuda.cpp
new file mode 100644
index 0000000000..6ef1e7116a
--- /dev/null
+++ b/src/USER-CUDA/pair_lj_class2_coul_cut_cuda.cpp
@@ -0,0 +1,167 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   Contributing author: Paul Crozier (SNL)
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#include <cmath>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include "pair_lj_class2_coul_cut_cuda.h"
+#include "pair_lj_class2_coul_cut_cuda_cu.h"
+#include "cuda_data.h"
+#include "atom.h"
+#include "comm.h"
+#include "force.h"
+#include "kspace.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+#include "neigh_request.h"
+#include "cuda_neigh_list.h"
+#include "update.h"
+#include "integrate.h"
+#include "respa.h"
+#include "memory.h"
+#include "error.h"
+#include "cuda.h"
+
+using namespace LAMMPS_NS;
+
+#define MIN(a,b) ((a) < (b) ? (a) : (b))
+#define MAX(a,b) ((a) > (b) ? (a) : (b))
+
+/* ---------------------------------------------------------------------- */
+
+PairLJClass2CoulCutCuda::PairLJClass2CoulCutCuda(LAMMPS *lmp) : PairLJClass2CoulCut(lmp)
+{
+  cuda = lmp->cuda;
+  if (cuda == NULL)  // cuda is dereferenced below; error->all() is relied on not to return
+    error->all("You cannot use a /cuda class, without activating 'cuda' acceleration. Use no '-a' command line argument, or '-a cuda'.");
+  cuda_neigh_list = NULL;  // only assigned later in init_list(); start from a known value, not garbage
+  allocated2 = false;      // allocate() has not yet registered the coefficient arrays
+  cuda->shared_data.pair.cudable_force = 1;
+  cuda->setSystemParams();
+}
+
+/* ----------------------------------------------------------------------
+   remember pointer to arrays in cuda shared data
+------------------------------------------------------------------------- */
+
+void PairLJClass2CoulCutCuda::allocate()
+{
+  // Base-class allocation first; the arrays used below are then handed to the
+  // CUDA shared data, and that hand-over happens only once (allocated2).
+  if (!allocated) PairLJClass2CoulCut::allocate();
+  if (allocated2) return;
+  allocated2 = true;
+  cuda->shared_data.pair.cut = cut_lj;
+  cuda->shared_data.pair.cut_coul = cut_coul;
+  cuda->shared_data.pair.coeff1 = lj1;
+  cuda->shared_data.pair.coeff2 = lj2;
+  cuda->shared_data.pair.coeff3 = lj3;
+  cuda->shared_data.pair.coeff4 = lj4;
+  cuda->shared_data.pair.offset = offset;
+  cuda->shared_data.pair.special_lj = force->special_lj;
+  cuda->shared_data.pair.special_coul = force->special_coul;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairLJClass2CoulCutCuda::compute(int eflag, int vflag)
+{
+  if (eflag || vflag) ev_setup(eflag,vflag);
+  // uploads are unconditional in this style; only the downloads depend on collect_forces_later
+  if (eflag) { cuda->cu_eng_vdwl->upload(); cuda->cu_eng_coul->upload(); }
+  if (vflag) cuda->cu_virial->upload();
+
+  Cuda_PairLJClass2CoulCutCuda(&cuda->shared_data, &cuda_neigh_list->sneighlist, eflag, vflag, eflag_atom, vflag_atom);
+
+  // download the energy/virial buffers right away unless
+  // collect_forces_later is set in the shared data
+  if (!cuda->shared_data.pair.collect_forces_later) {
+    if (eflag) { cuda->cu_eng_vdwl->download(); cuda->cu_eng_coul->download(); }
+    if (vflag) cuda->cu_virial->download();
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairLJClass2CoulCutCuda::settings(int narg, char **arg)
+{
+  PairLJClass2CoulCut::settings(narg, arg);  // base-class argument handling first
+  cuda->shared_data.pair.cut_global = static_cast<F_FLOAT>(cut_lj_global);  // then copy both global cutoffs
+  cuda->shared_data.pair.cut_coul_global = static_cast<F_FLOAT>(cut_coul_global);  // into the CUDA shared data
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairLJClass2CoulCutCuda::coeff(int narg, char **arg)
+{
+  PairLJClass2CoulCut::coeff(narg, arg);  // delegate argument handling to the base class
+  this->allocate();  // CUDA-side registration; guarded by allocated2 so it runs once
+}
+
+void PairLJClass2CoulCutCuda::init_style()
+{
+  // Checks for the per-atom charge attribute, requests the neighbor list this
+  // style needs and copies force->qqrd2e into the CUDA shared data.
+
+  // The message names this style as registered via PairStyle (lj/class2/...).
+  if (!atom->q_flag)
+    error->all("Pair style lj/class2/coul/cut/cuda requires atom attribute q");
+
+  // a single full (not half) list, flagged cudable; no rRESPA lists are requested
+  const int irequest = neighbor->request(this);
+  neighbor->requests[irequest]->full = 1;
+  neighbor->requests[irequest]->half = 0;
+  neighbor->requests[irequest]->cudable = 1;
+
+  // stored under shared_data.pppm even though this style has no KSpace part
+  cuda->shared_data.pppm.qqrd2e=force->qqrd2e;
+}
+
+void PairLJClass2CoulCutCuda::init_list(int id, NeighList *ptr)
+{
+  MYDBG(printf("# CUDA PairLJClass2CoulCutCuda::init_list\n");)
+  PairLJClass2CoulCut::init_list(id, ptr);  // base-class handling first
+#ifndef CUDA_USE_BINNING
+  // Only list id 0 (the Verlet list) is registered with the Cuda object; rRESPA
+  // lists are not handled here. See Neighbor::init() for the list-id logic.
+  if (id == 0) { cuda_neigh_list = cuda->registerNeighborList(ptr); }
+#endif
+  MYDBG(printf("# CUDA PairLJClass2CoulCutCuda::init_list end\n");)
+}
+
+void PairLJClass2CoulCutCuda::ev_setup(int eflag, int vflag)
+{
+  // Re-wraps eatom/vatom for CUDA when atom->nmax exceeds the capacity recorded
+  // before the base-class call (the base class is expected to reallocate them then).
+  const int maxeatomold = maxeatom;
+  const int maxvatomold = maxvatom;
+  PairLJClass2CoulCut::ev_setup(eflag,vflag);
+  if (eflag_atom && atom->nmax > maxeatomold)
+    {delete cuda->cu_eatom; cuda->cu_eatom = new cCudaData<double, ENERGY_FLOAT, x > ((double*)eatom, & cuda->shared_data.atom.eatom , atom->nmax  );}
+  if (vflag_atom && atom->nmax > maxvatomold)  // virial wrapper: keyed on vflag_atom/maxvatom, bound to atom.vatom
+    {delete cuda->cu_vatom; cuda->cu_vatom = new cCudaData<double, ENERGY_FLOAT, yx > ((double*)vatom, & cuda->shared_data.atom.vatom , atom->nmax, 6  );}
+}
+
+
diff --git a/src/USER-CUDA/pair_lj_class2_coul_cut_cuda.h b/src/USER-CUDA/pair_lj_class2_coul_cut_cuda.h
new file mode 100644
index 0000000000..e9edf9839b
--- /dev/null
+++ b/src/USER-CUDA/pair_lj_class2_coul_cut_cuda.h
@@ -0,0 +1,57 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(lj/class2/coul/cut/cuda,PairLJClass2CoulCutCuda)
+
+#else
+
+#ifndef LMP_PAIR_LJ_CLASS2_COUL_CUT_CUDA_H
+#define LMP_PAIR_LJ_CLASS2_COUL_CUT_CUDA_H
+
+#include "pair_lj_class2_coul_cut.h"
+
+namespace LAMMPS_NS {
+
+class PairLJClass2CoulCutCuda : public PairLJClass2CoulCut  // pair style lj/class2/coul/cut/cuda
+{
+	public:
+		PairLJClass2CoulCutCuda(class LAMMPS *);  // reports an error when lmp->cuda is NULL
+		void compute(int, int);  // (eflag, vflag): calls Cuda_PairLJClass2CoulCutCuda
+		void settings(int, char **);  // base settings(), then copies the global cutoffs into cuda->shared_data.pair
+		void coeff(int, char **);  // base coeff(), then allocate()
+		void init_list(int, class NeighList *);  // registers list id 0 with the Cuda object unless CUDA_USE_BINNING is defined
+		void init_style();  // requests a full, cudable neighbor list and copies force->qqrd2e
+		void ev_setup(int eflag, int vflag);  // base ev_setup(), then re-creates cuda->cu_eatom / cuda->cu_vatom
+	protected:
+		class Cuda *cuda;  // taken from lmp->cuda in the constructor
+		void allocate();  // one-time hand-over of cutoffs, lj1..lj4 and offset to the CUDA shared data
+		bool allocated2;  // true once allocate() has done that hand-over
+		class CudaNeighList* cuda_neigh_list;  // set in init_list(), passed to the force routine in compute()
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-CUDA/pair_lj_class2_coul_long_cuda.cpp b/src/USER-CUDA/pair_lj_class2_coul_long_cuda.cpp
new file mode 100644
index 0000000000..6cf036e300
--- /dev/null
+++ b/src/USER-CUDA/pair_lj_class2_coul_long_cuda.cpp
@@ -0,0 +1,180 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   Contributing author: Paul Crozier (SNL)
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#include <cmath>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include "pair_lj_class2_coul_long_cuda.h"
+#include "pair_lj_class2_coul_long_cuda_cu.h"
+#include "cuda_data.h"
+#include "atom.h"
+#include "comm.h"
+#include "force.h"
+#include "kspace.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+#include "neigh_request.h"
+#include "cuda_neigh_list.h"
+#include "update.h"
+#include "integrate.h"
+#include "respa.h"
+#include "memory.h"
+#include "error.h"
+#include "cuda.h"
+
+using namespace LAMMPS_NS;
+
+#define MIN(a,b) ((a) < (b) ? (a) : (b))
+#define MAX(a,b) ((a) > (b) ? (a) : (b))
+
+#define EWALD_F   1.12837917
+#define EWALD_P   0.3275911
+#define A1        0.254829592
+#define A2       -0.284496736
+#define A3        1.421413741
+#define A4       -1.453152027
+#define A5        1.061405429
+/* ---------------------------------------------------------------------- */
+
+PairLJClass2CoulLongCuda::PairLJClass2CoulLongCuda(LAMMPS *lmp) : PairLJClass2CoulLong(lmp)
+{
+  cuda = lmp->cuda;
+  if (cuda == NULL)  // cuda is dereferenced below; error->all() is relied on not to return
+    error->all("You cannot use a /cuda class, without activating 'cuda' acceleration. Use no '-a' command line argument, or '-a cuda'.");
+  cuda_neigh_list = NULL;  // only assigned later in init_list() (for id 0); start from a known value
+  allocated2 = false;      // allocate() has not yet registered the coefficient arrays
+  cuda->shared_data.pair.cudable_force = 1;
+  cuda->setSystemParams();
+}
+
+/* ----------------------------------------------------------------------
+   remember pointer to arrays in cuda shared data
+------------------------------------------------------------------------- */
+
+void PairLJClass2CoulLongCuda::allocate()
+{
+  // Base-class allocation first; the arrays used below are then handed to the
+  // CUDA shared data, and that hand-over happens only once (allocated2).
+  if (!allocated) PairLJClass2CoulLong::allocate();
+  if (allocated2) return;
+  allocated2 = true;
+  cuda->shared_data.pair.cut = cut_lj;
+  cuda->shared_data.pair.coeff1 = lj1;
+  cuda->shared_data.pair.coeff2 = lj2;
+  cuda->shared_data.pair.coeff3 = lj3;
+  cuda->shared_data.pair.coeff4 = lj4;
+  cuda->shared_data.pair.offset = offset;
+  cuda->shared_data.pair.special_lj = force->special_lj;
+  cuda->shared_data.pair.special_coul = force->special_coul;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairLJClass2CoulLongCuda::compute(int eflag, int vflag)
+{
+  if (eflag || vflag) ev_setup(eflag,vflag);
+  // uploads are unconditional in this style; only the downloads depend on collect_forces_later
+  if (eflag) { cuda->cu_eng_vdwl->upload(); cuda->cu_eng_coul->upload(); }
+  if (vflag) cuda->cu_virial->upload();
+
+  Cuda_PairLJClass2CoulLongCuda(&cuda->shared_data, &cuda_neigh_list->sneighlist, eflag, vflag, eflag_atom, vflag_atom);
+
+  // download the energy/virial buffers right away unless
+  // collect_forces_later is set in the shared data
+  if (!cuda->shared_data.pair.collect_forces_later) {
+    if (eflag) { cuda->cu_eng_vdwl->download(); cuda->cu_eng_coul->download(); }
+    if (vflag) cuda->cu_virial->download();
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairLJClass2CoulLongCuda::settings(int narg, char **arg)
+{
+  PairLJClass2CoulLong::settings(narg, arg);  // base-class argument handling first
+  cuda->shared_data.pair.cut_global = static_cast<F_FLOAT>(cut_lj_global);  // then copy the global LJ cutoff for CUDA
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairLJClass2CoulLongCuda::coeff(int narg, char **arg)
+{
+  PairLJClass2CoulLong::coeff(narg, arg);  // delegate argument handling to the base class
+  this->allocate();  // CUDA-side registration; guarded by allocated2 so it runs once
+}
+
+void PairLJClass2CoulLongCuda::init_style()
+{
+  // Validates the setup, requests the neighbor list this style needs and copies
+  // the Coulomb cutoff, g_ewald and qqrd2e into the CUDA shared data.
+  if (!atom->q_flag)
+    error->all("Pair style lj/class2/coul/long/cuda requires atom attribute q");
+
+  // a single full (not half) list, flagged cudable; no rRESPA lists are requested
+  const int irequest = neighbor->request(this);
+  neighbor->requests[irequest]->full = 1;
+  neighbor->requests[irequest]->half = 0;
+  neighbor->requests[irequest]->cudable = 1;
+
+  // Coulomb cutoff and its square, as stored in the shared data
+  cut_coulsq = cut_coul * cut_coul;
+  cuda->shared_data.pair.cut_coul_global=cut_coul;
+  cuda->shared_data.pair.cut_coulsq_global=cut_coulsq;
+
+  if (force->newton) error->warning("Pair style uses does not use \"newton\" setting. You might test if \"newton off\" makes the simulation run faster.");
+
+  // g_ewald is read from force->kspace, so a KSpace style must be defined
+  if (force->kspace == NULL)
+    error->all("Pair style is incompatible with KSpace style");
+  g_ewald = force->kspace->g_ewald;
+  cuda->shared_data.pair.g_ewald=g_ewald;
+  cuda->shared_data.pppm.qqrd2e=force->qqrd2e;
+
+  if(ncoultablebits) error->warning("# CUDA: You asked for the useage of Coulomb Tables. This is not supported in CUDA Pair forces. Setting is ignored.\n");
+}
+
+void PairLJClass2CoulLongCuda::init_list(int id, NeighList *ptr)
+{
+  MYDBG(printf("# CUDA PairLJClass2CoulLongCuda::init_list\n");)
+  PairLJClass2CoulLong::init_list(id, ptr);  // base-class handling first
+  // only list id 0 is registered with the Cuda object; see Neighbor::init() for the list-id logic
+  if (id == 0) { cuda_neigh_list = cuda->registerNeighborList(ptr); }
+  MYDBG(printf("# CUDA PairLJClass2CoulLongCuda::init_list end\n");)
+}
+
+void PairLJClass2CoulLongCuda::ev_setup(int eflag, int vflag)
+{
+  // Re-wraps eatom/vatom for CUDA when atom->nmax exceeds the capacity recorded
+  // before the base-class call (the base class is expected to reallocate them then).
+  const int maxeatomold = maxeatom;
+  const int maxvatomold = maxvatom;
+  PairLJClass2CoulLong::ev_setup(eflag,vflag);
+  if (eflag_atom && atom->nmax > maxeatomold)
+    {delete cuda->cu_eatom; cuda->cu_eatom = new cCudaData<double, ENERGY_FLOAT, x > ((double*)eatom, & cuda->shared_data.atom.eatom , atom->nmax  );}
+  if (vflag_atom && atom->nmax > maxvatomold)  // virial wrapper: keyed on vflag_atom/maxvatom, bound to atom.vatom
+    {delete cuda->cu_vatom; cuda->cu_vatom = new cCudaData<double, ENERGY_FLOAT, yx > ((double*)vatom, & cuda->shared_data.atom.vatom , atom->nmax, 6  );}
+}
+
+
diff --git a/src/USER-CUDA/pair_lj_class2_coul_long_cuda.h b/src/USER-CUDA/pair_lj_class2_coul_long_cuda.h
new file mode 100644
index 0000000000..6bf4a71e16
--- /dev/null
+++ b/src/USER-CUDA/pair_lj_class2_coul_long_cuda.h
@@ -0,0 +1,57 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(lj/class2/coul/long/cuda,PairLJClass2CoulLongCuda)
+
+#else
+
+#ifndef LMP_PAIR_LJ_CLASS2_COUL_LONG_CUDA_H
+#define LMP_PAIR_LJ_CLASS2_COUL_LONG_CUDA_H
+
+#include "pair_lj_class2_coul_long.h"
+
+namespace LAMMPS_NS {
+
+// Subclass of PairLJClass2CoulLong registered above as pair style lj/class2/coul/long/cuda.
+class PairLJClass2CoulLongCuda : public PairLJClass2CoulLong {
+ public:
+  PairLJClass2CoulLongCuda(class LAMMPS *);
+  void compute(int, int);
+  void settings(int, char **);
+  void coeff(int, char **);
+  void init_list(int, class NeighList *);
+  void init_style();
+  void ev_setup(int eflag, int vflag);
+ protected:
+  class Cuda *cuda;                      // Cuda object this pair style talks to
+  void allocate();
+  bool allocated2;                       // NOTE(review): presumably guards allocate() - confirm in the .cpp
+  class CudaNeighList *cuda_neigh_list;  // assigned in init_list() for list id 0
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-CUDA/pair_lj_class2_cuda.cpp b/src/USER-CUDA/pair_lj_class2_cuda.cpp
new file mode 100644
index 0000000000..0d253c940a
--- /dev/null
+++ b/src/USER-CUDA/pair_lj_class2_cuda.cpp
@@ -0,0 +1,172 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under 
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Paul Crozier (SNL)
+------------------------------------------------------------------------- */
+
+#include <cmath>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include "pair_lj_class2_cuda.h"
+#include "pair_lj_class2_cuda_cu.h"
+#include "cuda_data.h"
+#include "atom.h"
+#include "comm.h"
+#include "force.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+#include "neigh_request.h"
+#include "cuda_neigh_list.h"
+#include "update.h"
+#include "integrate.h"
+#include "respa.h"
+#include "memory.h"
+#include "error.h"
+#include "cuda.h"
+
+using namespace LAMMPS_NS;
+
+#define MIN(a,b) ((a) < (b) ? (a) : (b))
+#define MAX(a,b) ((a) > (b) ? (a) : (b))
+
+/* ---------------------------------------------------------------------- */
+
+PairLJClass2Cuda::PairLJClass2Cuda(LAMMPS *lmp) : PairLJClass2(lmp)
+{ // Take the Cuda object from the LAMMPS instance (fatal error if absent) and set pair.cudable_force.
+  cuda = lmp->cuda;
+  if(cuda == NULL)
+    error->all("You cannot use a /cuda class, without activating 'cuda' acceleration. Use no '-a' command line argument, or '-a cuda'.");
+  cuda_neigh_list = NULL; // was left uninitialized; it is only assigned later, in init_list()
+  allocated2 = false;
+  cuda->shared_data.pair.cudable_force = 1;
+  cuda->setSystemParams();
+}
+
+/* ----------------------------------------------------------------------
+   remember pointer to arrays in cuda shared data
+------------------------------------------------------------------------- */
+
+void PairLJClass2Cuda::allocate()
+{
+  // run the base-class allocate() unless `allocated` is already set
+  if (!allocated) PairLJClass2::allocate();
+  // the assignments below are done only once per object
+  if (allocated2) return;
+  allocated2 = true;
+  cuda->shared_data.pair.cut          = cut;
+  cuda->shared_data.pair.coeff1       = lj1;
+  cuda->shared_data.pair.coeff2       = lj2;
+  cuda->shared_data.pair.coeff3       = lj3;
+  cuda->shared_data.pair.coeff4       = lj4;
+  cuda->shared_data.pair.offset       = offset;
+  cuda->shared_data.pair.special_lj   = force->special_lj;
+  cuda->shared_data.pair.special_coul = force->special_coul;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairLJClass2Cuda::compute(int eflag, int vflag)
+{
+  // ev_setup() is only needed when energy or virial was requested
+  if (eflag || vflag) ev_setup(eflag,vflag);
+  // upload the energy / virial buffers selected by the flags
+  if (eflag) cuda->cu_eng_vdwl->upload();
+  if (vflag) cuda->cu_virial->upload();
+
+  Cuda_PairLJClass2Cuda(&cuda->shared_data, &cuda_neigh_list->sneighlist, eflag, vflag, eflag_atom, vflag_atom);
+
+  // downloads are skipped when shared_data.pair.collect_forces_later is set
+  const bool fetch_now = !cuda->shared_data.pair.collect_forces_later;
+  if (fetch_now && eflag) cuda->cu_eng_vdwl->download();
+  if (fetch_now && vflag) cuda->cu_virial->download();
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairLJClass2Cuda::settings(int narg, char **arg)
+{ // Delegate to the base class, then copy cut_global (as F_FLOAT) into shared_data.
+  PairLJClass2::settings(narg, arg);
+  cuda->shared_data.pair.cut_global = static_cast<F_FLOAT>(cut_global);
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairLJClass2Cuda::coeff(int narg, char **arg)
+{ // Delegate to the base class, then call this class's allocate().
+  PairLJClass2::coeff(narg, arg);
+  this->allocate();
+}
+
+void PairLJClass2Cuda::init_style()
+{
+  MYDBG(printf("# CUDA PairLJClass2Cuda::init_style start\n"); )
+
+  // file a single neighbor request and flag it: full list, not half, cudable
+  const int irequest = neighbor->request(this);
+
+  neighbor->requests[irequest]->cudable = 1;
+  neighbor->requests[irequest]->half = 0;
+  neighbor->requests[irequest]->full = 1;
+  //neighbor->style=0; //0=NSQ neighboring
+
+  MYDBG(printf("# CUDA PairLJClass2Cuda::init_style end\n"); )
+}
+
+void PairLJClass2Cuda::init_list(int id, NeighList *ptr)
+{ // Hand the list to the base class; additionally register list id 0 with the Cuda object.
+  MYDBG(printf("# CUDA PairLJClass2Cuda::init_list\n");)
+  PairLJClass2::init_list(id, ptr);
+  // only id 0 is registered; see Neighbor::init() for details on lammps lists' logic
+  if (0 == id) { cuda_neigh_list = cuda->registerNeighborList(ptr); }
+  MYDBG(printf("# CUDA PairLJClass2Cuda::init_list end\n");)
+}
+
+void PairLJClass2Cuda::ev_setup(int eflag, int vflag)
+{ // Run the base-class setup, then rebuild the cCudaData wrappers of the per-atom arrays when nmax grew.
+	int maxeatomold=maxeatom, maxvatomold=maxvatom; // values before the base call may change them
+	PairLJClass2::ev_setup(eflag,vflag);
+  // per-atom energy: re-wrap eatom when eflag_atom is set and nmax exceeds the previous maxeatom
+  if (eflag_atom && atom->nmax > maxeatomold) 
+	{delete cuda->cu_eatom; cuda->cu_eatom = new cCudaData<double, ENERGY_FLOAT, x > ((double*)eatom, & cuda->shared_data.atom.eatom , atom->nmax  );}
+  // per-atom virial: test vflag_atom/maxvatom and bind atom.vatom (previously eflag_atom and atom.eatom)
+  if (vflag_atom && atom->nmax > maxvatomold) 
+	{delete cuda->cu_vatom; cuda->cu_vatom = new cCudaData<double, ENERGY_FLOAT, yx > ((double*)vatom, & cuda->shared_data.atom.vatom , atom->nmax, 6  );}
+	// NOTE(review): maxvatom and shared_data.atom.vatom are declared outside this file - confirm
+}
+
+
diff --git a/src/USER-CUDA/pair_lj_class2_cuda.h b/src/USER-CUDA/pair_lj_class2_cuda.h
new file mode 100644
index 0000000000..8643ad94aa
--- /dev/null
+++ b/src/USER-CUDA/pair_lj_class2_cuda.h
@@ -0,0 +1,57 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(lj/class2/cuda,PairLJClass2Cuda)
+
+#else
+
+#ifndef PAIR_LJ_CLASS2_CUDA_H
+#define PAIR_LJ_CLASS2_CUDA_H
+
+#include "pair_lj_class2.h"
+
+namespace LAMMPS_NS {
+
+// Subclass of PairLJClass2 registered above as pair style lj/class2/cuda.
+class PairLJClass2Cuda : public PairLJClass2 {
+ public:
+  PairLJClass2Cuda(class LAMMPS *);
+  void compute(int, int);
+  void settings(int, char **);
+  void coeff(int, char **);
+  void init_list(int, class NeighList *);
+  void init_style();
+  void ev_setup(int eflag, int vflag);
+ protected:
+  class Cuda *cuda;                      // Cuda object taken from the LAMMPS instance
+  void allocate();
+  bool allocated2;                       // set once allocate() has filled shared_data.pair
+  class CudaNeighList *cuda_neigh_list;  // assigned in init_list() for list id 0
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-CUDA/pair_lj_cut_coul_cut_cuda.cpp b/src/USER-CUDA/pair_lj_cut_coul_cut_cuda.cpp
new file mode 100644
index 0000000000..399d8c6758
--- /dev/null
+++ b/src/USER-CUDA/pair_lj_cut_coul_cut_cuda.cpp
@@ -0,0 +1,167 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   Contributing author: Paul Crozier (SNL)
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#include <cmath>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include "pair_lj_cut_coul_cut_cuda.h"
+#include "pair_lj_cut_coul_cut_cuda_cu.h"
+#include "cuda_data.h"
+#include "atom.h"
+#include "comm.h"
+#include "force.h"
+#include "kspace.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+#include "neigh_request.h"
+#include "cuda_neigh_list.h"
+#include "update.h"
+#include "integrate.h"
+#include "respa.h"
+#include "memory.h"
+#include "error.h"
+#include "cuda.h"
+
+using namespace LAMMPS_NS;
+
+#define MIN(a,b) ((a) < (b) ? (a) : (b))
+#define MAX(a,b) ((a) > (b) ? (a) : (b))
+
+/* ---------------------------------------------------------------------- */
+
+PairLJCutCoulCutCuda::PairLJCutCoulCutCuda(LAMMPS *lmp) : PairLJCutCoulCut(lmp)
+{ // Take the Cuda object from the LAMMPS instance (fatal error if absent) and set pair.cudable_force.
+  cuda = lmp->cuda;
+  if(cuda == NULL)
+    error->all("You cannot use a /cuda class, without activating 'cuda' acceleration. Use no '-a' command line argument, or '-a cuda'.");
+  cuda_neigh_list = NULL; // was left uninitialized; it is only assigned later, in init_list()
+  allocated2 = false;
+  cuda->shared_data.pair.cudable_force = 1;
+  cuda->setSystemParams();
+}
+
+/* ----------------------------------------------------------------------
+   remember pointer to arrays in cuda shared data
+------------------------------------------------------------------------- */
+
+void PairLJCutCoulCutCuda::allocate()
+{
+  // run the base-class allocate() unless `allocated` is already set
+  if (!allocated) PairLJCutCoulCut::allocate();
+  // the assignments below are done only once per object
+  if (allocated2) return;
+  allocated2 = true;
+  cuda->shared_data.pair.cut          = cut_lj;
+  cuda->shared_data.pair.cut_coul     = cut_coul;
+  cuda->shared_data.pair.coeff1       = lj1;
+  cuda->shared_data.pair.coeff2       = lj2;
+  cuda->shared_data.pair.coeff3       = lj3;
+  cuda->shared_data.pair.coeff4       = lj4;
+  cuda->shared_data.pair.offset       = offset;
+  cuda->shared_data.pair.special_lj   = force->special_lj;
+  cuda->shared_data.pair.special_coul = force->special_coul;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairLJCutCoulCutCuda::compute(int eflag, int vflag)
+{
+  // ev_setup() is only needed when energy or virial was requested
+  if (eflag || vflag) ev_setup(eflag,vflag);
+  // upload the energy / virial buffers selected by the flags
+  if (eflag) cuda->cu_eng_vdwl->upload();
+  if (eflag) cuda->cu_eng_coul->upload();
+  if (vflag) cuda->cu_virial->upload();
+
+  Cuda_PairLJCutCoulCutCuda(&cuda->shared_data, &cuda_neigh_list->sneighlist, eflag, vflag, eflag_atom, vflag_atom);
+  // downloads are skipped when shared_data.pair.collect_forces_later is set
+  const bool fetch_now = !cuda->shared_data.pair.collect_forces_later;
+  if (fetch_now && eflag) cuda->cu_eng_vdwl->download();
+  if (fetch_now && eflag) cuda->cu_eng_coul->download();
+  if (fetch_now && vflag) cuda->cu_virial->download();
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairLJCutCoulCutCuda::settings(int narg, char **arg)
+{ // Delegate to the base class, then copy both global cutoffs (as F_FLOAT) into shared_data.
+  PairLJCutCoulCut::settings(narg, arg);
+  cuda->shared_data.pair.cut_coul_global = static_cast<F_FLOAT>(cut_coul_global);
+  cuda->shared_data.pair.cut_global      = static_cast<F_FLOAT>(cut_lj_global);
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairLJCutCoulCutCuda::coeff(int narg, char **arg)
+{ // Delegate to the base class, then call this class's allocate().
+  PairLJCutCoulCut::coeff(narg, arg);
+  this->allocate();
+}
+
+void PairLJCutCoulCutCuda::init_style()
+{
+  // the atom style must provide the attribute q
+  if (!atom->q_flag)
+    error->all("Pair style lj/cut/coul/cut/cuda requires atom attribute q");
+
+  // file a single neighbor request and flag it: full list, not half, cudable
+  const int irequest = neighbor->request(this);
+
+  neighbor->requests[irequest]->cudable = 1;
+  neighbor->requests[irequest]->half = 0;
+  neighbor->requests[irequest]->full = 1;
+
+  // copy qqrd2e from Force into the pppm section of shared_data
+  cuda->shared_data.pppm.qqrd2e = force->qqrd2e;
+
+
+}
+
+void PairLJCutCoulCutCuda::init_list(int id, NeighList *ptr)
+{ // Hand the list to the base class; unless CUDA_USE_BINNING is defined, also register list id 0.
+  MYDBG(printf("# CUDA PairLJCutCoulCutCuda::init_list\n");)
+  PairLJCutCoulCut::init_list(id, ptr);
+#ifndef CUDA_USE_BINNING
+  // right now we can only handle verlet (id 0), not respa;
+  // see Neighbor::init() for details on lammps lists' logic
+  if (0 == id) { cuda_neigh_list = cuda->registerNeighborList(ptr); }
+#endif
+  MYDBG(printf("# CUDA PairLJCutCoulCutCuda::init_list end\n");)
+}
+
+void PairLJCutCoulCutCuda::ev_setup(int eflag, int vflag)
+{ // Run the base-class setup, then rebuild the cCudaData wrappers of the per-atom arrays when nmax grew.
+	int maxeatomold=maxeatom, maxvatomold=maxvatom; // values before the base call may change them
+	PairLJCutCoulCut::ev_setup(eflag,vflag);
+  // per-atom energy: re-wrap eatom when eflag_atom is set and nmax exceeds the previous maxeatom
+  if (eflag_atom && atom->nmax > maxeatomold) 
+	{delete cuda->cu_eatom; cuda->cu_eatom = new cCudaData<double, ENERGY_FLOAT, x > ((double*)eatom, & cuda->shared_data.atom.eatom , atom->nmax  );}
+  // per-atom virial: test vflag_atom/maxvatom and bind atom.vatom (previously eflag_atom and atom.eatom)
+  if (vflag_atom && atom->nmax > maxvatomold) 
+	{delete cuda->cu_vatom; cuda->cu_vatom = new cCudaData<double, ENERGY_FLOAT, yx > ((double*)vatom, & cuda->shared_data.atom.vatom , atom->nmax, 6  );}
+	// NOTE(review): maxvatom and shared_data.atom.vatom are declared outside this file - confirm
+}
+
+
diff --git a/src/USER-CUDA/pair_lj_cut_coul_cut_cuda.h b/src/USER-CUDA/pair_lj_cut_coul_cut_cuda.h
new file mode 100644
index 0000000000..130140d6ce
--- /dev/null
+++ b/src/USER-CUDA/pair_lj_cut_coul_cut_cuda.h
@@ -0,0 +1,57 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(lj/cut/coul/cut/cuda,PairLJCutCoulCutCuda)
+
+#else
+
+#ifndef LMP_PAIR_LJ_CUT_COUL_CUT_CUDA_H
+#define LMP_PAIR_LJ_CUT_COUL_CUT_CUDA_H
+
+#include "pair_lj_cut_coul_cut.h"
+
+namespace LAMMPS_NS {
+
+// Subclass of PairLJCutCoulCut registered above as pair style lj/cut/coul/cut/cuda.
+class PairLJCutCoulCutCuda : public PairLJCutCoulCut {
+ public:
+  PairLJCutCoulCutCuda(class LAMMPS *);
+  void compute(int, int);
+  void settings(int, char **);
+  void coeff(int, char **);
+  void init_list(int, class NeighList *);
+  void init_style();
+  void ev_setup(int eflag, int vflag);
+ protected:
+  class Cuda *cuda;                      // Cuda object taken from the LAMMPS instance
+  void allocate();
+  bool allocated2;                       // set once allocate() has filled shared_data.pair
+  class CudaNeighList *cuda_neigh_list;  // assigned in init_list() for list id 0
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-CUDA/pair_lj_cut_coul_debye_cuda.cpp b/src/USER-CUDA/pair_lj_cut_coul_debye_cuda.cpp
new file mode 100644
index 0000000000..dd745ca7da
--- /dev/null
+++ b/src/USER-CUDA/pair_lj_cut_coul_debye_cuda.cpp
@@ -0,0 +1,168 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   Contributing author: Paul Crozier (SNL)
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#include <cmath>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include "pair_lj_cut_coul_debye_cuda.h"
+#include "pair_lj_cut_coul_debye_cuda_cu.h"
+#include "cuda_data.h"
+#include "atom.h"
+#include "comm.h"
+#include "force.h"
+#include "kspace.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+#include "neigh_request.h"
+#include "cuda_neigh_list.h"
+#include "update.h"
+#include "integrate.h"
+#include "respa.h"
+#include "memory.h"
+#include "error.h"
+#include "cuda.h"
+
+using namespace LAMMPS_NS;
+
+#define MIN(a,b) ((a) < (b) ? (a) : (b))
+#define MAX(a,b) ((a) > (b) ? (a) : (b))
+
+/* ---------------------------------------------------------------------- */
+
+PairLJCutCoulDebyeCuda::PairLJCutCoulDebyeCuda(LAMMPS *lmp) : PairLJCutCoulDebye(lmp)
+{ // Take the Cuda object from the LAMMPS instance (fatal error if absent) and set pair.cudable_force.
+  cuda = lmp->cuda;
+  if(cuda == NULL)
+    error->all("You cannot use a /cuda class, without activating 'cuda' acceleration. Use no '-a' command line argument, or '-a cuda'.");
+  cuda_neigh_list = NULL; // was left uninitialized; it is only assigned later, in init_list()
+  allocated2 = false;
+  cuda->shared_data.pair.cudable_force = 1;
+  cuda->setSystemParams();
+}
+
+/* ----------------------------------------------------------------------
+   remember pointer to arrays in cuda shared data
+------------------------------------------------------------------------- */
+
+void PairLJCutCoulDebyeCuda::allocate()
+{
+  // run the base-class allocate() unless `allocated` is already set
+  if (!allocated) PairLJCutCoulDebye::allocate();
+  // the assignments below are done only once per object
+  if (allocated2) return;
+  allocated2 = true;
+  cuda->shared_data.pair.cut          = cut_lj;
+  cuda->shared_data.pair.cut_coul     = cut_coul;
+  cuda->shared_data.pair.coeff1       = lj1;
+  cuda->shared_data.pair.coeff2       = lj2;
+  cuda->shared_data.pair.coeff3       = lj3;
+  cuda->shared_data.pair.coeff4       = lj4;
+  cuda->shared_data.pair.offset       = offset;
+  cuda->shared_data.pair.special_lj   = force->special_lj;
+  cuda->shared_data.pair.special_coul = force->special_coul;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairLJCutCoulDebyeCuda::compute(int eflag, int vflag)
+{
+  // ev_setup() is only needed when energy or virial was requested
+  if (eflag || vflag) ev_setup(eflag,vflag);
+  // upload the energy / virial buffers selected by the flags
+  if (eflag) cuda->cu_eng_vdwl->upload();
+  if (eflag) cuda->cu_eng_coul->upload();
+  if (vflag) cuda->cu_virial->upload();
+
+  Cuda_PairLJCutCoulDebyeCuda(&cuda->shared_data, &cuda_neigh_list->sneighlist, eflag, vflag, eflag_atom, vflag_atom);
+  // downloads are skipped when shared_data.pair.collect_forces_later is set
+  const bool fetch_now = !cuda->shared_data.pair.collect_forces_later;
+  if (fetch_now && eflag) cuda->cu_eng_vdwl->download();
+  if (fetch_now && eflag) cuda->cu_eng_coul->download();
+  if (fetch_now && vflag) cuda->cu_virial->download();
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairLJCutCoulDebyeCuda::settings(int narg, char **arg)
+{ // Delegate to the base class, then copy kappa and both global cutoffs (as F_FLOAT) into shared_data.
+  PairLJCutCoulDebye::settings(narg, arg);
+  cuda->shared_data.pair.kappa           = static_cast<F_FLOAT>(kappa);
+  cuda->shared_data.pair.cut_coul_global = static_cast<F_FLOAT>(cut_coul_global);
+  cuda->shared_data.pair.cut_global      = static_cast<F_FLOAT>(cut_lj_global);
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairLJCutCoulDebyeCuda::coeff(int narg, char **arg)
+{ // Delegate to the base class, then call this class's allocate().
+  PairLJCutCoulDebye::coeff(narg, arg);
+  this->allocate();
+}
+
+void PairLJCutCoulDebyeCuda::init_style()
+{
+  // the atom style must provide the attribute q
+  if (!atom->q_flag)
+    error->all("Pair style lj/cut/coul/debye/cuda requires atom attribute q");
+
+  // file a single neighbor request and flag it: full list, not half, cudable
+  const int irequest = neighbor->request(this);
+
+  neighbor->requests[irequest]->cudable = 1;
+  neighbor->requests[irequest]->half = 0;
+  neighbor->requests[irequest]->full = 1;
+
+  // copy qqrd2e from Force into the pppm section of shared_data
+  cuda->shared_data.pppm.qqrd2e = force->qqrd2e;
+
+
+}
+
+void PairLJCutCoulDebyeCuda::init_list(int id, NeighList *ptr)
+{ // Hand the list to the base class; unless CUDA_USE_BINNING is defined, also register list id 0.
+  MYDBG(printf("# CUDA PairLJCutCoulDebyeCuda::init_list\n");)
+  PairLJCutCoulDebye::init_list(id, ptr);
+#ifndef CUDA_USE_BINNING
+  // right now we can only handle verlet (id 0), not respa;
+  // see Neighbor::init() for details on lammps lists' logic
+  if (0 == id) { cuda_neigh_list = cuda->registerNeighborList(ptr); }
+#endif
+  MYDBG(printf("# CUDA PairLJCutCoulDebyeCuda::init_list end\n");)
+}
+
+void PairLJCutCoulDebyeCuda::ev_setup(int eflag, int vflag)
+{ // Run the base-class setup, then rebuild the cCudaData wrappers of the per-atom arrays when nmax grew.
+	int maxeatomold=maxeatom, maxvatomold=maxvatom; // values before the base call may change them
+	PairLJCutCoulDebye::ev_setup(eflag,vflag);
+  // per-atom energy: re-wrap eatom when eflag_atom is set and nmax exceeds the previous maxeatom
+  if (eflag_atom && atom->nmax > maxeatomold) 
+	{delete cuda->cu_eatom; cuda->cu_eatom = new cCudaData<double, ENERGY_FLOAT, x > ((double*)eatom, & cuda->shared_data.atom.eatom , atom->nmax  );}
+  // per-atom virial: test vflag_atom/maxvatom and bind atom.vatom (previously eflag_atom and atom.eatom)
+  if (vflag_atom && atom->nmax > maxvatomold) 
+	{delete cuda->cu_vatom; cuda->cu_vatom = new cCudaData<double, ENERGY_FLOAT, yx > ((double*)vatom, & cuda->shared_data.atom.vatom , atom->nmax, 6  );}
+	// NOTE(review): maxvatom and shared_data.atom.vatom are declared outside this file - confirm
+}
+
+
diff --git a/src/USER-CUDA/pair_lj_cut_coul_debye_cuda.h b/src/USER-CUDA/pair_lj_cut_coul_debye_cuda.h
new file mode 100644
index 0000000000..853c428143
--- /dev/null
+++ b/src/USER-CUDA/pair_lj_cut_coul_debye_cuda.h
@@ -0,0 +1,57 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(lj/cut/coul/debye/cuda,PairLJCutCoulDebyeCuda)
+
+#else
+
+#ifndef LMP_PAIR_LJ_CUT_COUL_DEBYE_CUDA_H
+#define LMP_PAIR_LJ_CUT_COUL_DEBYE_CUDA_H
+
+#include "pair_lj_cut_coul_debye.h"
+
+namespace LAMMPS_NS {
+
+// Subclass of PairLJCutCoulDebye registered above as pair style lj/cut/coul/debye/cuda.
+class PairLJCutCoulDebyeCuda : public PairLJCutCoulDebye {
+ public:
+  PairLJCutCoulDebyeCuda(class LAMMPS *);
+  void compute(int, int);
+  void settings(int, char **);
+  void coeff(int, char **);
+  void init_list(int, class NeighList *);
+  void init_style();
+  void ev_setup(int eflag, int vflag);
+ protected:
+  class Cuda *cuda;                      // Cuda object taken from the LAMMPS instance
+  void allocate();
+  bool allocated2;                       // set once allocate() has filled shared_data.pair
+  class CudaNeighList *cuda_neigh_list;  // assigned in init_list() for list id 0
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-CUDA/pair_lj_cut_coul_long_cuda.cpp b/src/USER-CUDA/pair_lj_cut_coul_long_cuda.cpp
new file mode 100644
index 0000000000..53e65182a5
--- /dev/null
+++ b/src/USER-CUDA/pair_lj_cut_coul_long_cuda.cpp
@@ -0,0 +1,221 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   Contributing author: Paul Crozier (SNL)
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#include <cmath>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include "pair_lj_cut_coul_long_cuda.h"
+#include "pair_lj_cut_coul_long_cuda_cu.h"
+#include "cuda_data.h"
+#include "atom.h"
+#include "comm.h"
+#include "force.h"
+#include "kspace.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+#include "neigh_request.h"
+#include "cuda_neigh_list.h"
+#include "update.h"
+#include "integrate.h"
+#include "respa.h"
+#include "memory.h"
+#include "error.h"
+#include "cuda.h"
+
+using namespace LAMMPS_NS;
+
+#define MIN(a,b) ((a) < (b) ? (a) : (b))
+#define MAX(a,b) ((a) > (b) ? (a) : (b))
+
+#define EWALD_F   1.12837917
+#define EWALD_P   0.3275911
+#define A1        0.254829592
+#define A2       -0.284496736
+#define A3        1.421413741
+#define A4       -1.453152027
+#define A5        1.061405429
+/* ---------------------------------------------------------------------- */
+
+PairLJCutCoulLongCuda::PairLJCutCoulLongCuda(LAMMPS *lmp) : PairLJCutCoulLong(lmp)
+{ // Take the Cuda object from the LAMMPS instance (fatal error if absent) and set pair.cudable_force.
+  cuda = lmp->cuda;
+  if(cuda == NULL)
+    error->all("You cannot use a /cuda class, without activating 'cuda' acceleration. Use no '-a' command line argument, or '-a cuda'.");
+  cuda_neigh_list = NULL; // was left uninitialized; it is only assigned later, in init_list()
+  allocated2 = false;
+  cuda->shared_data.pair.cudable_force = 1;
+  cuda->setSystemParams();
+}
+
+/* ----------------------------------------------------------------------
+   remember pointer to arrays in cuda shared data
+------------------------------------------------------------------------- */
+
+void PairLJCutCoulLongCuda::allocate()
+{
+  // run the base-class allocate() unless `allocated` is already set
+  if (!allocated) PairLJCutCoulLong::allocate();
+  // the assignments below are done only once per object
+  if (allocated2) return;
+  allocated2 = true;
+  cuda->shared_data.pair.cut          = cut_lj;
+  cuda->shared_data.pair.coeff1       = lj1;
+  cuda->shared_data.pair.coeff2       = lj2;
+  cuda->shared_data.pair.coeff3       = lj3;
+  cuda->shared_data.pair.coeff4       = lj4;
+  cuda->shared_data.pair.offset       = offset;
+  cuda->shared_data.pair.special_lj   = force->special_lj;
+  cuda->shared_data.pair.special_coul = force->special_coul;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairLJCutCoulLongCuda::compute(int eflag, int vflag)
+{
+  // ev_setup() is only needed when energy or virial was requested
+  if (eflag || vflag) ev_setup(eflag,vflag);
+  // upload the energy / virial buffers selected by the flags
+  if (eflag) cuda->cu_eng_vdwl->upload();
+  if (eflag) cuda->cu_eng_coul->upload();
+  if (vflag) cuda->cu_virial->upload();
+
+  Cuda_PairLJCutCoulLongCuda(&cuda->shared_data, &cuda_neigh_list->sneighlist, eflag, vflag, eflag_atom, vflag_atom);
+  // downloads are skipped when shared_data.pair.collect_forces_later is set
+  const bool fetch_now = !cuda->shared_data.pair.collect_forces_later;
+  if (fetch_now && eflag) cuda->cu_eng_vdwl->download();
+  if (fetch_now && eflag) cuda->cu_eng_coul->download();
+  if (fetch_now && vflag) cuda->cu_virial->download();
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairLJCutCoulLongCuda::settings(int narg, char **arg)
+{ // Delegate to the base class, then copy cut_lj_global (as F_FLOAT) into shared_data.pair.cut_global.
+  PairLJCutCoulLong::settings(narg, arg);
+  cuda->shared_data.pair.cut_global = static_cast<F_FLOAT>(cut_lj_global);
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairLJCutCoulLongCuda::coeff(int narg, char **arg)
+{ // Delegate to the base class, then call this class's allocate().
+  PairLJCutCoulLong::coeff(narg, arg);
+  this->allocate();
+}
+
+void PairLJCutCoulLongCuda::init_style()
+{ // Check prerequisites, file the neighbor-list request(s) and copy the Coulomb parameters into shared_data.
+  if (!atom->q_flag)
+    error->all("Pair style lj/cut/coul/long requires atom attribute q");
+  // request regular or rRESPA neighbor lists
+  // (the rRESPA branch is taken only when update->whichflag == 0 and integrate_style is "respa")
+  int irequest;
+ 
+  if (update->whichflag == 0 && strcmp(update->integrate_style,"respa") == 0) {
+    int respa = 0;
+    if (((Respa *) update->integrate)->level_inner >= 0) respa = 1;
+    if (((Respa *) update->integrate)->level_middle >= 0) respa = 2;
+    // respa 0: one plain request; 1: inner (id 1) + outer (id 3); 2: inner (id 1), middle (id 2), outer (id 3)
+    if (respa == 0) irequest = neighbor->request(this);
+    else if (respa == 1) {
+      irequest = neighbor->request(this);
+      neighbor->requests[irequest]->id = 1;
+      neighbor->requests[irequest]->half = 0;
+      neighbor->requests[irequest]->respainner = 1;
+      irequest = neighbor->request(this);
+      neighbor->requests[irequest]->id = 3;
+      neighbor->requests[irequest]->half = 0;
+      neighbor->requests[irequest]->respaouter = 1;
+    } else {
+      irequest = neighbor->request(this);
+      neighbor->requests[irequest]->id = 1;
+      neighbor->requests[irequest]->half = 0;
+      neighbor->requests[irequest]->respainner = 1;
+      irequest = neighbor->request(this);
+      neighbor->requests[irequest]->id = 2;
+      neighbor->requests[irequest]->half = 0;
+      neighbor->requests[irequest]->respamiddle = 1;
+      irequest = neighbor->request(this);
+      neighbor->requests[irequest]->id = 3;
+      neighbor->requests[irequest]->half = 0;
+      neighbor->requests[irequest]->respaouter = 1;
+    }
+    // NOTE(review): these requests never set cudable and init_list() registers only id 0 - confirm this path works with compute()
+  } 
+  else 
+  {
+  	irequest = neighbor->request(this);
+    neighbor->requests[irequest]->full = 1;
+    neighbor->requests[irequest]->half = 0;
+    neighbor->requests[irequest]->cudable = 1;
+  }
+  // store the Coulomb cutoff and its square in shared_data.pair
+  cut_coulsq = cut_coul * cut_coul;
+  cuda->shared_data.pair.cut_coul_global=cut_coul;
+  cuda->shared_data.pair.cut_coulsq_global=cut_coulsq;
+  // set rRESPA cutoffs
+
+  if (strcmp(update->integrate_style,"respa") == 0 &&
+      ((Respa *) update->integrate)->level_inner >= 0)
+    cut_respa = ((Respa *) update->integrate)->cutoff;
+  else cut_respa = NULL;
+  // force->newton being set only produces a warning; a missing kspace style is a fatal error
+  if (force->newton) error->warning("Pair style uses does not use \"newton\" setting. You might test if \"newton off\" makes the simulation run faster.");
+  if (force->kspace == NULL)
+    error->all("Pair style is incompatible with KSpace style");
+  g_ewald = force->kspace->g_ewald;
+  cuda->shared_data.pair.g_ewald=g_ewald;
+  cuda->shared_data.pppm.qqrd2e=force->qqrd2e;
+  
+  // a non-zero ncoultablebits only triggers the warning below; per its text the setting is ignored
+  if(ncoultablebits) error->warning("# CUDA: You asked for the useage of Coulomb Tables. This is not supported in CUDA Pair forces. Setting is ignored.\n");
+}
+
+void PairLJCutCoulLongCuda::init_list(int id, NeighList *ptr)
+{ // Hand the list to the base class; unless CUDA_USE_BINNING is defined, also register list id 0.
+  MYDBG(printf("# CUDA PairLJCutCoulLongCuda::init_list\n");)
+  PairLJCutCoulLong::init_list(id, ptr);
+#ifndef CUDA_USE_BINNING
+  // right now we can only handle verlet (id 0), not respa;
+  // see Neighbor::init() for details on lammps lists' logic
+  if (0 == id) { cuda_neigh_list = cuda->registerNeighborList(ptr); }
+#endif
+  MYDBG(printf("# CUDA PairLJCutCoulLongCuda::init_list end\n");)
+}
+
+void PairLJCutCoulLongCuda::ev_setup(int eflag, int vflag)
+{ // Run the base-class setup, then rebuild the cCudaData wrappers of the per-atom arrays when nmax grew.
+	int maxeatomold=maxeatom, maxvatomold=maxvatom; // values before the base call may change them
+	PairLJCutCoulLong::ev_setup(eflag,vflag);
+  // per-atom energy: re-wrap eatom when eflag_atom is set and nmax exceeds the previous maxeatom
+  if (eflag_atom && atom->nmax > maxeatomold) 
+	{delete cuda->cu_eatom; cuda->cu_eatom = new cCudaData<double, ENERGY_FLOAT, x > ((double*)eatom, & cuda->shared_data.atom.eatom , atom->nmax  );}
+  // per-atom virial: test vflag_atom/maxvatom and bind atom.vatom (previously eflag_atom and atom.eatom)
+  if (vflag_atom && atom->nmax > maxvatomold) 
+	{delete cuda->cu_vatom; cuda->cu_vatom = new cCudaData<double, ENERGY_FLOAT, yx > ((double*)vatom, & cuda->shared_data.atom.vatom , atom->nmax, 6  );}
+	// NOTE(review): maxvatom and shared_data.atom.vatom are declared outside this file - confirm
+}
+
+
diff --git a/src/USER-CUDA/pair_lj_cut_coul_long_cuda.h b/src/USER-CUDA/pair_lj_cut_coul_long_cuda.h
new file mode 100644
index 0000000000..2f14357408
--- /dev/null
+++ b/src/USER-CUDA/pair_lj_cut_coul_long_cuda.h
@@ -0,0 +1,57 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(lj/cut/coul/long/cuda,PairLJCutCoulLongCuda)
+
+#else
+
+#ifndef LMP_PAIR_LJ_CUT_COUL_LONG_CUDA_H
+#define LMP_PAIR_LJ_CUT_COUL_LONG_CUDA_H
+
+#include "pair_lj_cut_coul_long.h"
+
+namespace LAMMPS_NS {
+
+// Pair style registered as lj/cut/coul/long/cuda; derives from
+// PairLJCutCoulLong and re-declares the entry points listed below.
+class PairLJCutCoulLongCuda : public PairLJCutCoulLong
+{
+ public:
+  PairLJCutCoulLongCuda(class LAMMPS *lmp);
+  void compute(int eflag, int vflag);
+  void settings(int narg, char **arg);
+  void coeff(int narg, char **arg);
+  void init_list(int id, class NeighList *ptr);
+  void init_style();
+  void ev_setup(int eflag, int vflag);
+
+ protected:
+  // Cuda object whose shared_data and per-atom wrappers this style uses.
+  class Cuda *cuda;
+  void allocate();
+  // presumably marks that allocate() already did its one-time work -
+  // TODO confirm against the .cpp file
+  bool allocated2;
+  // Handle returned by cuda->registerNeighborList() in init_list().
+  class CudaNeighList* cuda_neigh_list;
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-CUDA/pair_lj_cut_cuda.cpp b/src/USER-CUDA/pair_lj_cut_cuda.cpp
new file mode 100644
index 0000000000..d63134c3c3
--- /dev/null
+++ b/src/USER-CUDA/pair_lj_cut_cuda.cpp
@@ -0,0 +1,184 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under 
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Paul Crozier (SNL)
+------------------------------------------------------------------------- */
+
+#include <cmath>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include "pair_lj_cut_cuda.h"
+#include "pair_lj_cut_cuda_cu.h"
+#include "cuda_data.h"
+#include "atom.h"
+#include "comm.h"
+#include "force.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+#include "neigh_request.h"
+#include "cuda_neigh_list.h"
+#include "update.h"
+#include "integrate.h"
+#include "respa.h"
+#include "memory.h"
+#include "error.h"
+#include "cuda.h"
+
+using namespace LAMMPS_NS;
+
+#define MIN(a,b) ((a) < (b) ? (a) : (b))
+#define MAX(a,b) ((a) > (b) ? (a) : (b))
+
+/* ---------------------------------------------------------------------- */
+
+// Binds the style to the Cuda object owned by the LAMMPS instance, reports
+// an error if there is none, flags the pair force as cudable and asks the
+// Cuda object to set its system parameters.
+// NOTE(review): the code after the check assumes error->all() does not
+// return - confirm.
+PairLJCutCuda::PairLJCutCuda(LAMMPS *lmp)
+  : PairLJCut(lmp), cuda(lmp->cuda), allocated2(false)
+{
+  if (!cuda)
+    error->all("You cannot use a /cuda class, without activating 'cuda' acceleration. Use no '-a' command line argument, or '-a cuda'.");
+
+  cuda->shared_data.pair.cudable_force = 1;
+  cuda->setSystemParams();
+}
+
+/* ----------------------------------------------------------------------
+   remember pointer to arrays in cuda shared data
+------------------------------------------------------------------------- */
+
+// Lets the base class allocate its arrays if it has not done so yet, then
+// (once only, guarded by allocated2) stores the array pointers in the cuda
+// shared pair data.
+void PairLJCutCuda::allocate()
+{
+  if (!allocated) PairLJCut::allocate();
+
+  // The pointers below are recorded a single time.
+  if (allocated2) return;
+  allocated2 = true;
+
+  cuda->shared_data.pair.cut          = cut;
+  cuda->shared_data.pair.coeff1       = lj1;
+  cuda->shared_data.pair.coeff2       = lj2;
+  cuda->shared_data.pair.coeff3       = lj3;
+  cuda->shared_data.pair.coeff4       = lj4;
+  cuda->shared_data.pair.offset       = offset;
+  cuda->shared_data.pair.special_lj   = force->special_lj;
+  cuda->shared_data.pair.special_coul = force->special_coul;
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Runs the CUDA LJ/cut pair computation.
+//   eflag, vflag : energy / virial flags; when either is non-zero
+//                  ev_setup() is called first.
+// The vdW energy and virial accumulators are uploaded before the call when
+// requested, and downloaded afterwards unless
+// shared_data.pair.collect_forces_later is set.
+void PairLJCutCuda::compute(int eflag, int vflag)
+{
+  if (eflag || vflag) ev_setup(eflag,vflag);
+
+  if (eflag) cuda->cu_eng_vdwl->upload();
+  if (vflag) cuda->cu_virial->upload();
+
+  Cuda_PairLJCutCuda(&cuda->shared_data, &cuda_neigh_list->sneighlist,
+                     eflag, vflag, eflag_atom, vflag_atom);
+
+  // Nothing to fetch here when the results are collected later.
+  if (cuda->shared_data.pair.collect_forces_later) return;
+
+  if (eflag) cuda->cu_eng_vdwl->download();
+  if (vflag) cuda->cu_virial->download();
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Lets the base class process the pair_style arguments, then copies the
+// resulting cut_global, converted to F_FLOAT, into the cuda shared data.
+void PairLJCutCuda::settings(int narg, char **arg)
+{
+  PairLJCut::settings(narg, arg);
+
+  cuda->shared_data.pair.cut_global = static_cast<F_FLOAT>(cut_global);
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Lets the base class process the pair_coeff arguments, then calls
+// allocate() so the array pointers are stored in the cuda shared data.
+void PairLJCutCuda::coeff(int narg, char **arg)
+{
+  PairLJCut::coeff(narg, arg);
+
+  allocate();
+}
+
+// Requests a full, cudable neighbor list for this style and clears
+// cut_respa. No request is made when update->whichflag is 0 and the
+// integrate style is "respa".
+void PairLJCutCuda::init_style()
+{
+  MYDBG(printf("# CUDA PairLJCutCuda::init_style start\n"); )
+
+  // The one combination for which no neighbor list is requested.
+  const bool skip_request =
+    update->whichflag == 0 && strcmp(update->integrate_style,"respa") == 0;
+
+  if (!skip_request) {
+    const int irequest = neighbor->request(this);
+    neighbor->requests[irequest]->full = 1;
+    neighbor->requests[irequest]->half = 0;
+    neighbor->requests[irequest]->cudable = 1;
+  }
+
+  cut_respa = NULL;
+
+  MYDBG(printf("# CUDA PairLJCutCuda::init_style end\n"); )
+}
+
+// Forwards the neighbor list to the base class and, unless CUDA_USE_BINNING
+// is defined, registers the list with id 0 with the Cuda object and keeps
+// the returned handle in cuda_neigh_list.
+void PairLJCutCuda::init_list(int id, NeighList *ptr)
+{
+  MYDBG(printf("# CUDA PairLJCutCuda::init_list\n");)
+
+  PairLJCut::init_list(id, ptr);
+
+#if !defined(CUDA_USE_BINNING)
+  // Only the verlet list (id 0) is handled for now; respa lists are not.
+  // Neighbor::init() describes how LAMMPS assigns the list ids.
+  if (id == 0) {
+    cuda_neigh_list = cuda->registerNeighborList(ptr);
+  }
+#endif
+
+  MYDBG(printf("# CUDA PairLJCutCuda::init_list end\n");)
+}
+
+// Runs the base-class energy/virial setup and then rebuilds the CUDA
+// wrappers around the per-atom host arrays whose size limit was exceeded.
+//
+//   eflag, vflag : passed through unchanged to PairLJCut::ev_setup.
+//
+// The per-atom energy wrapper is rebuilt when eflag_atom is set, the
+// per-atom virial wrapper when vflag_atom is set; each is bound to its own
+// device array descriptor (atom.eatom / atom.vatom).
+// NOTE(review): presumably Pair::ev_setup reallocates eatom/vatom whenever
+// atom->nmax exceeds maxeatom/maxvatom - confirm against pair.cpp. This also
+// relies on Pair::maxvatom and shared_data.atom.vatom being declared.
+void PairLJCutCuda::ev_setup(int eflag, int vflag)
+{
+  // Remember the old array limits before the base class can update them.
+  const int maxeatomold = maxeatom;
+  const int maxvatomold = maxvatom;
+
+  PairLJCut::ev_setup(eflag,vflag);
+
+  if (eflag_atom && atom->nmax > maxeatomold) {
+    delete cuda->cu_eatom;
+    cuda->cu_eatom = new cCudaData<double, ENERGY_FLOAT, x > ((double*)eatom, & cuda->shared_data.atom.eatom , atom->nmax  );
+  }
+
+  // The virial wrapper follows the virial flag, limit and descriptor.
+  if (vflag_atom && atom->nmax > maxvatomold) {
+    delete cuda->cu_vatom;
+    cuda->cu_vatom = new cCudaData<double, ENERGY_FLOAT, yx > ((double*)vatom, & cuda->shared_data.atom.vatom , atom->nmax, 6  );
+  }
+}
+
+
diff --git a/src/USER-CUDA/pair_lj_cut_cuda.h b/src/USER-CUDA/pair_lj_cut_cuda.h
new file mode 100644
index 0000000000..f81d47952d
--- /dev/null
+++ b/src/USER-CUDA/pair_lj_cut_cuda.h
@@ -0,0 +1,57 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(lj/cut/cuda,PairLJCutCuda)
+
+#else
+
+#ifndef PAIR_LJ_CUT_CUDA_H
+#define PAIR_LJ_CUT_CUDA_H
+
+#include "pair_lj_cut.h"
+
+namespace LAMMPS_NS {
+
+// Pair style registered as lj/cut/cuda; derives from PairLJCut and
+// re-declares the entry points listed below.
+class PairLJCutCuda : public PairLJCut
+{
+ public:
+  PairLJCutCuda(class LAMMPS *lmp);
+  void compute(int eflag, int vflag);
+  void settings(int narg, char **arg);
+  void coeff(int narg, char **arg);
+  void init_list(int id, class NeighList *ptr);
+  void init_style();
+  void ev_setup(int eflag, int vflag);
+
+ protected:
+  // Cuda object whose shared_data and per-atom wrappers this style uses.
+  class Cuda *cuda;
+  void allocate();
+  // Set once allocate() has stored the array pointers in the shared data.
+  bool allocated2;
+  // Handle returned by cuda->registerNeighborList() in init_list().
+  class CudaNeighList* cuda_neigh_list;
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-CUDA/pair_lj_cut_experimental_cuda.cpp b/src/USER-CUDA/pair_lj_cut_experimental_cuda.cpp
new file mode 100644
index 0000000000..029ce05151
--- /dev/null
+++ b/src/USER-CUDA/pair_lj_cut_experimental_cuda.cpp
@@ -0,0 +1,183 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under 
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Paul Crozier (SNL)
+------------------------------------------------------------------------- */
+
+#include <cmath>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include "pair_lj_cut_experimental_cuda.h"
+#include "pair_lj_cut_experimental_cuda_cu.h"
+#include "cuda_data.h"
+#include "atom.h"
+#include "comm.h"
+#include "force.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+#include "neigh_request.h"
+#include "cuda_neigh_list.h"
+#include "update.h"
+#include "integrate.h"
+#include "respa.h"
+#include "memory.h"
+#include "error.h"
+#include "cuda.h"
+
+using namespace LAMMPS_NS;
+
+#define MIN(a,b) ((a) < (b) ? (a) : (b))
+#define MAX(a,b) ((a) > (b) ? (a) : (b))
+
+/* ---------------------------------------------------------------------- */
+
+// Binds the style to the Cuda object owned by the LAMMPS instance, reports
+// an error if there is none, flags the pair force as cudable and asks the
+// Cuda object to set its system parameters.
+// NOTE(review): the code after the check assumes error->all() does not
+// return - confirm.
+PairLJCutExperimentalCuda::PairLJCutExperimentalCuda(LAMMPS *lmp)
+  : PairLJCut(lmp), cuda(lmp->cuda), allocated2(false)
+{
+  if (!cuda)
+    error->all("You cannot use a /cuda class, without activating 'cuda' acceleration. Use no '-a' command line argument, or '-a cuda'.");
+
+  cuda->shared_data.pair.cudable_force = 1;
+  cuda->setSystemParams();
+}
+
+/* ----------------------------------------------------------------------
+   remember pointer to arrays in cuda shared data
+------------------------------------------------------------------------- */
+
+// Lets the base class allocate its arrays if it has not done so yet, then
+// (once only, guarded by allocated2) stores the array pointers in the cuda
+// shared pair data.
+void PairLJCutExperimentalCuda::allocate()
+{
+  if (!allocated) PairLJCut::allocate();
+
+  // The pointers below are recorded a single time.
+  if (allocated2) return;
+  allocated2 = true;
+
+  cuda->shared_data.pair.cut          = cut;
+  cuda->shared_data.pair.coeff1       = lj1;
+  cuda->shared_data.pair.coeff2       = lj2;
+  cuda->shared_data.pair.coeff3       = lj3;
+  cuda->shared_data.pair.coeff4       = lj4;
+  cuda->shared_data.pair.offset       = offset;
+  cuda->shared_data.pair.special_lj   = force->special_lj;
+  cuda->shared_data.pair.special_coul = force->special_coul;
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Runs the experimental CUDA LJ/cut pair computation.
+//   eflag, vflag : energy / virial flags; when either is non-zero
+//                  ev_setup() is called first.
+// The vdW energy and virial accumulators are uploaded before the call when
+// requested. Unless shared_data.pair.collect_forces_later is set,
+// CudaWrapper_Sync() is called and the accumulators are downloaded again.
+void PairLJCutExperimentalCuda::compute(int eflag, int vflag)
+{
+  if (eflag || vflag) ev_setup(eflag,vflag);
+
+  if (eflag) cuda->cu_eng_vdwl->upload();
+  if (vflag) cuda->cu_virial->upload();
+
+  Cuda_PairLJCutExperimentalCuda(&cuda->shared_data, &cuda_neigh_list->sneighlist,
+                                 eflag, vflag, eflag_atom, vflag_atom);
+
+  // Nothing to fetch here when the results are collected later.
+  if (cuda->shared_data.pair.collect_forces_later) return;
+
+  CudaWrapper_Sync();
+  if (eflag) cuda->cu_eng_vdwl->download();
+  if (vflag) cuda->cu_virial->download();
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Lets the base class process the pair_style arguments, then copies the
+// resulting cut_global, converted to F_FLOAT, into the cuda shared data.
+void PairLJCutExperimentalCuda::settings(int narg, char **arg)
+{
+  PairLJCut::settings(narg, arg);
+
+  cuda->shared_data.pair.cut_global = static_cast<F_FLOAT>(cut_global);
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Lets the base class process the pair_coeff arguments, then calls
+// allocate() so the array pointers are stored in the cuda shared data.
+void PairLJCutExperimentalCuda::coeff(int narg, char **arg)
+{
+  PairLJCut::coeff(narg, arg);
+
+  allocate();
+}
+
+// Requests a full, cudable neighbor list for this style and clears
+// cut_respa. No request is made when update->whichflag is 0 and the
+// integrate style is "respa".
+void PairLJCutExperimentalCuda::init_style()
+{
+  MYDBG(printf("# CUDA PairLJCutExperimentalCuda::init_style start\n"); )
+
+  // The one combination for which no neighbor list is requested.
+  const bool skip_request =
+    update->whichflag == 0 && strcmp(update->integrate_style,"respa") == 0;
+
+  if (!skip_request) {
+    const int irequest = neighbor->request(this);
+    neighbor->requests[irequest]->full = 1;
+    neighbor->requests[irequest]->half = 0;
+    neighbor->requests[irequest]->cudable = 1;
+  }
+
+  cut_respa = NULL;
+
+  MYDBG(printf("# CUDA PairLJCutExperimentalCuda::init_style end\n"); )
+}
+
+// Forwards the neighbor list to the base class and, unless CUDA_USE_BINNING
+// is defined, registers the list with id 0 with the Cuda object and keeps
+// the returned handle in cuda_neigh_list.
+void PairLJCutExperimentalCuda::init_list(int id, NeighList *ptr)
+{
+  MYDBG(printf("# CUDA PairLJCutExperimentalCuda::init_list\n");)
+
+  PairLJCut::init_list(id, ptr);
+
+#if !defined(CUDA_USE_BINNING)
+  // Only the verlet list (id 0) is handled for now; respa lists are not.
+  // Neighbor::init() describes how LAMMPS assigns the list ids.
+  if (id == 0) {
+    cuda_neigh_list = cuda->registerNeighborList(ptr);
+  }
+#endif
+
+  MYDBG(printf("# CUDA PairLJCutExperimentalCuda::init_list end\n");)
+}
+
+// Runs the base-class energy/virial setup and then rebuilds the CUDA
+// wrappers around the per-atom host arrays whose size limit was exceeded.
+//
+//   eflag, vflag : passed through unchanged to PairLJCut::ev_setup.
+//
+// The per-atom energy wrapper is rebuilt when eflag_atom is set, the
+// per-atom virial wrapper when vflag_atom is set; each is bound to its own
+// device array descriptor (atom.eatom / atom.vatom).
+// NOTE(review): presumably Pair::ev_setup reallocates eatom/vatom whenever
+// atom->nmax exceeds maxeatom/maxvatom - confirm against pair.cpp. This also
+// relies on Pair::maxvatom and shared_data.atom.vatom being declared.
+void PairLJCutExperimentalCuda::ev_setup(int eflag, int vflag)
+{
+  // Remember the old array limits before the base class can update them.
+  const int maxeatomold = maxeatom;
+  const int maxvatomold = maxvatom;
+
+  PairLJCut::ev_setup(eflag,vflag);
+
+  if (eflag_atom && atom->nmax > maxeatomold) {
+    delete cuda->cu_eatom;
+    cuda->cu_eatom = new cCudaData<double, ENERGY_FLOAT, x > ((double*)eatom, & cuda->shared_data.atom.eatom , atom->nmax  );
+  }
+
+  // The virial wrapper follows the virial flag, limit and descriptor.
+  if (vflag_atom && atom->nmax > maxvatomold) {
+    delete cuda->cu_vatom;
+    cuda->cu_vatom = new cCudaData<double, ENERGY_FLOAT, yx > ((double*)vatom, & cuda->shared_data.atom.vatom , atom->nmax, 6  );
+  }
+}
+
+
diff --git a/src/USER-CUDA/pair_lj_cut_experimental_cuda.h b/src/USER-CUDA/pair_lj_cut_experimental_cuda.h
new file mode 100644
index 0000000000..9deb686524
--- /dev/null
+++ b/src/USER-CUDA/pair_lj_cut_experimental_cuda.h
@@ -0,0 +1,57 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(lj/cut/experimental/cuda,PairLJCutExperimentalCuda)
+
+#else
+
+#ifndef PAIR_LJ_CUT_EXPERIMENTAL_CUDA_H
+#define PAIR_LJ_CUT_EXPERIMENTAL_CUDA_H
+
+#include "pair_lj_cut.h"
+
+namespace LAMMPS_NS {
+
+// Pair style registered as lj/cut/experimental/cuda; derives from PairLJCut
+// and re-declares the entry points listed below.
+class PairLJCutExperimentalCuda : public PairLJCut
+{
+ public:
+  PairLJCutExperimentalCuda(class LAMMPS *lmp);
+  void compute(int eflag, int vflag);
+  void settings(int narg, char **arg);
+  void coeff(int narg, char **arg);
+  void init_list(int id, class NeighList *ptr);
+  void init_style();
+  void ev_setup(int eflag, int vflag);
+
+ protected:
+  // Cuda object whose shared_data and per-atom wrappers this style uses.
+  class Cuda *cuda;
+  void allocate();
+  // Set once allocate() has stored the array pointers in the shared data.
+  bool allocated2;
+  // Handle returned by cuda->registerNeighborList() in init_list().
+  class CudaNeighList* cuda_neigh_list;
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-CUDA/pair_lj_expand_cuda.cpp b/src/USER-CUDA/pair_lj_expand_cuda.cpp
new file mode 100644
index 0000000000..a9f2a6561f
--- /dev/null
+++ b/src/USER-CUDA/pair_lj_expand_cuda.cpp
@@ -0,0 +1,185 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under 
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Paul Crozier (SNL)
+------------------------------------------------------------------------- */
+
+#include <cmath>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include "pair_lj_expand_cuda.h"
+#include "pair_lj_expand_cuda_cu.h"
+#include "cuda_data.h"
+#include "atom.h"
+#include "comm.h"
+#include "force.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+#include "neigh_request.h"
+#include "cuda_neigh_list.h"
+#include "update.h"
+#include "integrate.h"
+#include "respa.h"
+#include "memory.h"
+#include "error.h"
+#include "cuda.h"
+
+using namespace LAMMPS_NS;
+
+#define MIN(a,b) ((a) < (b) ? (a) : (b))
+#define MAX(a,b) ((a) > (b) ? (a) : (b))
+
+/* ---------------------------------------------------------------------- */
+
+// Binds the style to the Cuda object owned by the LAMMPS instance, reports
+// an error if there is none, flags the pair force as cudable and asks the
+// Cuda object to set its system parameters.
+// NOTE(review): the code after the check assumes error->all() does not
+// return - confirm.
+PairLJExpandCuda::PairLJExpandCuda(LAMMPS *lmp)
+  : PairLJExpand(lmp), cuda(lmp->cuda), allocated2(false)
+{
+  if (!cuda)
+    error->all("You cannot use a /cuda class, without activating 'cuda' acceleration. Use no '-a' command line argument, or '-a cuda'.");
+
+  cuda->shared_data.pair.cudable_force = 1;
+  cuda->setSystemParams();
+}
+
+/* ----------------------------------------------------------------------
+   remember pointer to arrays in cuda shared data
+------------------------------------------------------------------------- */
+
+// Lets the base class allocate its arrays if it has not done so yet, then
+// (once only, guarded by allocated2) stores the array pointers, including
+// cutsq and the shift table (as coeff5), in the cuda shared pair data.
+void PairLJExpandCuda::allocate()
+{
+  if (!allocated) PairLJExpand::allocate();
+
+  // The pointers below are recorded a single time.
+  if (allocated2) return;
+  allocated2 = true;
+
+  cuda->shared_data.pair.cut          = cut;
+  cuda->shared_data.pair.cutsq        = cutsq;
+  cuda->shared_data.pair.coeff1       = lj1;
+  cuda->shared_data.pair.coeff2       = lj2;
+  cuda->shared_data.pair.coeff3       = lj3;
+  cuda->shared_data.pair.coeff4       = lj4;
+  cuda->shared_data.pair.coeff5       = shift;
+  cuda->shared_data.pair.offset       = offset;
+  cuda->shared_data.pair.special_lj   = force->special_lj;
+  cuda->shared_data.pair.special_coul = force->special_coul;
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Runs the CUDA LJ/expand pair computation.
+//   eflag, vflag : energy / virial flags; when either is non-zero
+//                  ev_setup() is called first.
+// The vdW energy and virial accumulators are uploaded before the call when
+// requested, and downloaded afterwards unless
+// shared_data.pair.collect_forces_later is set.
+void PairLJExpandCuda::compute(int eflag, int vflag)
+{
+  if (eflag || vflag) ev_setup(eflag,vflag);
+
+  if (eflag) cuda->cu_eng_vdwl->upload();
+  if (vflag) cuda->cu_virial->upload();
+
+  Cuda_PairLJExpandCuda(&cuda->shared_data, &cuda_neigh_list->sneighlist,
+                        eflag, vflag, eflag_atom, vflag_atom);
+
+  // Nothing to fetch here when the results are collected later.
+  if (cuda->shared_data.pair.collect_forces_later) return;
+
+  if (eflag) cuda->cu_eng_vdwl->download();
+  if (vflag) cuda->cu_virial->download();
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Lets the base class process the pair_style arguments, then copies the
+// resulting cut_global, converted to F_FLOAT, into the cuda shared data.
+void PairLJExpandCuda::settings(int narg, char **arg)
+{
+  PairLJExpand::settings(narg, arg);
+
+  cuda->shared_data.pair.cut_global = static_cast<F_FLOAT>(cut_global);
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Lets the base class process the pair_coeff arguments, then calls
+// allocate() so the array pointers are stored in the cuda shared data.
+void PairLJExpandCuda::coeff(int narg, char **arg)
+{
+  PairLJExpand::coeff(narg, arg);
+
+  allocate();
+}
+
+// Requests a full, cudable neighbor list for this style. No request is
+// made when update->whichflag is 0 and the integrate style is "respa".
+void PairLJExpandCuda::init_style()
+{
+  MYDBG(printf("# CUDA PairLJExpandCuda::init_style start\n"); )
+
+  // The one combination for which no neighbor list is requested.
+  const bool skip_request =
+    update->whichflag == 0 && strcmp(update->integrate_style,"respa") == 0;
+
+  if (!skip_request) {
+    const int irequest = neighbor->request(this);
+    neighbor->requests[irequest]->full = 1;
+    neighbor->requests[irequest]->half = 0;
+    neighbor->requests[irequest]->cudable = 1;
+  }
+
+  MYDBG(printf("# CUDA PairLJExpandCuda::init_style end\n"); )
+}
+
+// Forwards the neighbor list to the base class and, unless CUDA_USE_BINNING
+// is defined, registers the list with id 0 with the Cuda object and keeps
+// the returned handle in cuda_neigh_list.
+void PairLJExpandCuda::init_list(int id, NeighList *ptr)
+{
+  MYDBG(printf("# CUDA PairLJExpandCuda::init_list\n");)
+
+  PairLJExpand::init_list(id, ptr);
+
+#if !defined(CUDA_USE_BINNING)
+  // Only the verlet list (id 0) is handled for now; respa lists are not.
+  // Neighbor::init() describes how LAMMPS assigns the list ids.
+  if (id == 0) {
+    cuda_neigh_list = cuda->registerNeighborList(ptr);
+  }
+#endif
+
+  MYDBG(printf("# CUDA PairLJExpandCuda::init_list end\n");)
+}
+
+// Runs the base-class energy/virial setup and then rebuilds the CUDA
+// wrappers around the per-atom host arrays whose size limit was exceeded.
+//
+//   eflag, vflag : passed through unchanged to PairLJExpand::ev_setup.
+//
+// The per-atom energy wrapper is rebuilt when eflag_atom is set, the
+// per-atom virial wrapper when vflag_atom is set; each is bound to its own
+// device array descriptor (atom.eatom / atom.vatom).
+// NOTE(review): presumably Pair::ev_setup reallocates eatom/vatom whenever
+// atom->nmax exceeds maxeatom/maxvatom - confirm against pair.cpp. This also
+// relies on Pair::maxvatom and shared_data.atom.vatom being declared.
+void PairLJExpandCuda::ev_setup(int eflag, int vflag)
+{
+  // Remember the old array limits before the base class can update them.
+  const int maxeatomold = maxeatom;
+  const int maxvatomold = maxvatom;
+
+  PairLJExpand::ev_setup(eflag,vflag);
+
+  if (eflag_atom && atom->nmax > maxeatomold) {
+    delete cuda->cu_eatom;
+    cuda->cu_eatom = new cCudaData<double, ENERGY_FLOAT, x > ((double*)eatom, & cuda->shared_data.atom.eatom , atom->nmax  );
+  }
+
+  // The virial wrapper follows the virial flag, limit and descriptor.
+  if (vflag_atom && atom->nmax > maxvatomold) {
+    delete cuda->cu_vatom;
+    cuda->cu_vatom = new cCudaData<double, ENERGY_FLOAT, yx > ((double*)vatom, & cuda->shared_data.atom.vatom , atom->nmax, 6  );
+  }
+}
+
+
diff --git a/src/USER-CUDA/pair_lj_expand_cuda.h b/src/USER-CUDA/pair_lj_expand_cuda.h
new file mode 100644
index 0000000000..67d1030edb
--- /dev/null
+++ b/src/USER-CUDA/pair_lj_expand_cuda.h
@@ -0,0 +1,57 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(lj/expand/cuda,PairLJExpandCuda)
+
+#else
+
+#ifndef PAIR_LJ_EXPAND_CUDA_H
+#define PAIR_LJ_EXPAND_CUDA_H
+
+#include "pair_lj_expand.h"
+
+namespace LAMMPS_NS {
+
+// Pair style registered as lj/expand/cuda; derives from PairLJExpand and
+// re-declares the entry points listed below.
+class PairLJExpandCuda : public PairLJExpand
+{
+ public:
+  PairLJExpandCuda(class LAMMPS *lmp);
+  void compute(int eflag, int vflag);
+  void settings(int narg, char **arg);
+  void coeff(int narg, char **arg);
+  void init_list(int id, class NeighList *ptr);
+  void init_style();
+  void ev_setup(int eflag, int vflag);
+
+ protected:
+  // Cuda object whose shared_data and per-atom wrappers this style uses.
+  class Cuda *cuda;
+  void allocate();
+  // Set once allocate() has stored the array pointers in the shared data.
+  bool allocated2;
+  // Handle returned by cuda->registerNeighborList() in init_list().
+  class CudaNeighList* cuda_neigh_list;
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-CUDA/pair_lj_gromacs_coul_gromacs_cuda.cpp b/src/USER-CUDA/pair_lj_gromacs_coul_gromacs_cuda.cpp
new file mode 100644
index 0000000000..e3ead377ca
--- /dev/null
+++ b/src/USER-CUDA/pair_lj_gromacs_coul_gromacs_cuda.cpp
@@ -0,0 +1,199 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   Contributing author: Paul Crozier (SNL)
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#include <cmath>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include "pair_lj_gromacs_coul_gromacs_cuda.h"
+#include "pair_lj_gromacs_coul_gromacs_cuda_cu.h"
+#include "cuda_data.h"
+#include "atom.h"
+#include "comm.h"
+#include "force.h"
+#include "kspace.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+#include "neigh_request.h"
+#include "cuda_neigh_list.h"
+#include "update.h"
+#include "integrate.h"
+#include "respa.h"
+#include "memory.h"
+#include "error.h"
+#include "cuda.h"
+
+using namespace LAMMPS_NS;
+
+#define MIN(a,b) ((a) < (b) ? (a) : (b))
+#define MAX(a,b) ((a) > (b) ? (a) : (b))
+
+/* ---------------------------------------------------------------------- */
+
+// Binds the style to the Cuda object owned by the LAMMPS instance, reports
+// an error if there is none, flags the pair force as cudable, clears
+// use_block_per_atom and asks the Cuda object to set its system parameters.
+// NOTE(review): the code after the check assumes error->all() does not
+// return - confirm.
+PairLJGromacsCoulGromacsCuda::PairLJGromacsCoulGromacsCuda(LAMMPS *lmp)
+  : PairLJGromacsCoulGromacs(lmp), cuda(lmp->cuda), allocated2(false)
+{
+  if (!cuda)
+    error->all("You cannot use a /cuda class, without activating 'cuda' acceleration. Use no '-a' command line argument, or '-a cuda'.");
+
+  cuda->shared_data.pair.cudable_force = 1;
+  cuda->shared_data.pair.use_block_per_atom = 0;
+  cuda->setSystemParams();
+}
+
+/* ----------------------------------------------------------------------
+   remember pointer to arrays in cuda shared data
+------------------------------------------------------------------------- */
+
+// Lets the base class allocate its arrays if it has not done so yet, then
+// (once only, guarded by allocated2) stores the coefficient array pointers
+// in the cuda shared pair data and creates one cCudaData wrapper per
+// coefficient table, bound to the matching coeffN_gm descriptor.
+void PairLJGromacsCoulGromacsCuda::allocate()
+{
+  if (!allocated) PairLJGromacsCoulGromacs::allocate();
+
+  // Everything below is done a single time.
+  if (allocated2) return;
+  allocated2 = true;
+
+  cuda->shared_data.pair.coeff1       = lj1;
+  cuda->shared_data.pair.coeff2       = lj2;
+  cuda->shared_data.pair.coeff3       = lj3;
+  cuda->shared_data.pair.coeff4       = lj4;
+  cuda->shared_data.pair.coeff5       = ljsw1;
+  cuda->shared_data.pair.coeff6       = ljsw2;
+  cuda->shared_data.pair.coeff7       = ljsw3;
+  cuda->shared_data.pair.coeff8       = ljsw4;
+  cuda->shared_data.pair.coeff9       = ljsw5;
+  cuda->shared_data.pair.special_lj   = force->special_lj;
+  cuda->shared_data.pair.special_coul = force->special_coul;
+
+  // Each wrapper is given (ntypes+1)*(ntypes+1) as its element count.
+  const int ntable = (atom->ntypes+1)*(atom->ntypes+1);
+
+  cu_lj1_gm   = new cCudaData<double, F_FLOAT, x> ((double*)lj1,   &cuda->shared_data.pair.coeff1_gm, ntable);
+  cu_lj2_gm   = new cCudaData<double, F_FLOAT, x> ((double*)lj2,   &cuda->shared_data.pair.coeff2_gm, ntable);
+  cu_lj3_gm   = new cCudaData<double, F_FLOAT, x> ((double*)lj3,   &cuda->shared_data.pair.coeff3_gm, ntable);
+  cu_lj4_gm   = new cCudaData<double, F_FLOAT, x> ((double*)lj4,   &cuda->shared_data.pair.coeff4_gm, ntable);
+  cu_ljsw1_gm = new cCudaData<double, F_FLOAT, x> ((double*)ljsw1, &cuda->shared_data.pair.coeff5_gm, ntable);
+  cu_ljsw2_gm = new cCudaData<double, F_FLOAT, x> ((double*)ljsw2, &cuda->shared_data.pair.coeff6_gm, ntable);
+  cu_ljsw3_gm = new cCudaData<double, F_FLOAT, x> ((double*)ljsw3, &cuda->shared_data.pair.coeff7_gm, ntable);
+  cu_ljsw4_gm = new cCudaData<double, F_FLOAT, x> ((double*)ljsw4, &cuda->shared_data.pair.coeff8_gm, ntable);
+  cu_ljsw5_gm = new cCudaData<double, F_FLOAT, x> ((double*)ljsw5, &cuda->shared_data.pair.coeff9_gm, ntable);
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Runs the CUDA lj/gromacs/coul/gromacs pair computation.
+//   eflag, vflag : energy / virial flags; when either is non-zero
+//                  ev_setup() is called first.
+// Unless shared_data.pair.collect_forces_later is set, the vdW energy,
+// Coulomb energy and virial accumulators are uploaded before the call and
+// downloaded after it, each only when its flag is set. The flag is re-read
+// after the call, as in the original code.
+void PairLJGromacsCoulGromacsCuda::compute(int eflag, int vflag)
+{
+  if (eflag || vflag) ev_setup(eflag,vflag);
+
+  if (!cuda->shared_data.pair.collect_forces_later) {
+    if (eflag) {
+      cuda->cu_eng_vdwl->upload();
+      cuda->cu_eng_coul->upload();
+    }
+    if (vflag) cuda->cu_virial->upload();
+  }
+
+  Cuda_PairLJGromacsCoulGromacsCuda(&cuda->shared_data, &cuda_neigh_list->sneighlist,
+                                    eflag, vflag, eflag_atom, vflag_atom,
+                                    cut_coul_inner, coulsw1, coulsw2, coulsw5);
+
+  if (!cuda->shared_data.pair.collect_forces_later) {
+    if (eflag) {
+      cuda->cu_eng_vdwl->download();
+      cuda->cu_eng_coul->download();
+    }
+    if (vflag) cuda->cu_virial->download();
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Lets the base class process the pair_style arguments, then copies
+// cut_lj, cut_coulsq and cut_lj_inner into the cuda shared pair data,
+// converted to X_FLOAT, X_FLOAT and F_FLOAT respectively.
+void PairLJGromacsCoulGromacsCuda::settings(int narg, char **arg)
+{
+  PairLJGromacsCoulGromacs::settings(narg, arg);
+
+  cuda->shared_data.pair.cut_global        = static_cast<X_FLOAT>(cut_lj);
+  cuda->shared_data.pair.cut_coulsq_global = static_cast<X_FLOAT>(cut_coulsq);
+  cuda->shared_data.pair.cut_inner_global  = static_cast<F_FLOAT>(cut_lj_inner);
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Lets the base class process the pair_coeff arguments, then calls
+// allocate() so the coefficient tables are made known to the cuda side.
+void PairLJGromacsCoulGromacsCuda::coeff(int narg, char **arg)
+{
+  PairLJGromacsCoulGromacs::coeff(narg, arg);
+
+  allocate();
+}
+
+// Style-specific initialization:
+//   - reports an error if the atom style has no charge attribute q,
+//   - sets collect_forces_later for molecular systems,
+//   - requests a full, cudable neighbor list,
+//   - reports an error if an inner cutoff is not below its outer cutoff,
+//   - computes the squared cutoffs and copies cut_coulsq and
+//     force->qqrd2e into the cuda shared data.
+void PairLJGromacsCoulGromacsCuda::init_style()
+{
+  if (!atom->q_flag)
+    error->all("Pair style lj/gromacs/coul/gromacs requires atom attribute q");
+
+  if (atom->molecular)
+    cuda->shared_data.pair.collect_forces_later = 1;
+
+  // This style always asks for a full neighbor list usable on the device.
+  const int irequest = neighbor->request(this);
+  neighbor->requests[irequest]->full = 1;
+  neighbor->requests[irequest]->half = 0;
+  neighbor->requests[irequest]->cudable = 1;
+
+  if (cut_lj_inner >= cut_lj || cut_coul_inner >= cut_coul)
+    error->all("Pair inner cutoff >= Pair outer cutoff");
+
+  // Each squared cutoff is computed once (the original recomputed
+  // cut_coulsq a second time with the same value).
+  cut_lj_innersq = cut_lj_inner * cut_lj_inner;
+  cut_ljsq = cut_lj * cut_lj;
+  cut_coul_innersq = cut_coul_inner * cut_coul_inner;
+  cut_coulsq = cut_coul * cut_coul;
+  cut_bothsq = MAX(cut_ljsq,cut_coulsq);
+
+  cuda->shared_data.pair.cut_coulsq_global=cut_coulsq;
+  cuda->shared_data.pppm.qqrd2e=force->qqrd2e;
+}
+
+// Forwards the neighbor list to the base class and, unless CUDA_USE_BINNING
+// is defined, registers the list with id 0 with the Cuda object and keeps
+// the returned handle in cuda_neigh_list.
+void PairLJGromacsCoulGromacsCuda::init_list(int id, NeighList *ptr)
+{
+  MYDBG(printf("# CUDA PairLJGromacsCoulGromacsCuda::init_list\n");)
+
+  PairLJGromacsCoulGromacs::init_list(id, ptr);
+
+#if !defined(CUDA_USE_BINNING)
+  // Only the verlet list (id 0) is handled for now; respa lists are not.
+  // Neighbor::init() describes how LAMMPS assigns the list ids.
+  if (id == 0) {
+    cuda_neigh_list = cuda->registerNeighborList(ptr);
+  }
+#endif
+
+  MYDBG(printf("# CUDA PairLJGromacsCoulGromacsCuda::init_list end\n");)
+}
+
+// Runs the base-class energy/virial setup and then rebuilds the CUDA
+// wrappers around the per-atom host arrays whose size limit was exceeded.
+//
+//   eflag, vflag : passed through unchanged to
+//                  PairLJGromacsCoulGromacs::ev_setup.
+//
+// The per-atom energy wrapper is rebuilt when eflag_atom is set, the
+// per-atom virial wrapper when vflag_atom is set; each is bound to its own
+// device array descriptor (atom.eatom / atom.vatom).
+// NOTE(review): presumably Pair::ev_setup reallocates eatom/vatom whenever
+// atom->nmax exceeds maxeatom/maxvatom - confirm against pair.cpp. This also
+// relies on Pair::maxvatom and shared_data.atom.vatom being declared.
+void PairLJGromacsCoulGromacsCuda::ev_setup(int eflag, int vflag)
+{
+  // Remember the old array limits before the base class can update them.
+  const int maxeatomold = maxeatom;
+  const int maxvatomold = maxvatom;
+
+  PairLJGromacsCoulGromacs::ev_setup(eflag,vflag);
+
+  if (eflag_atom && atom->nmax > maxeatomold) {
+    delete cuda->cu_eatom;
+    cuda->cu_eatom = new cCudaData<double, ENERGY_FLOAT, x > ((double*)eatom, & cuda->shared_data.atom.eatom , atom->nmax  );
+  }
+
+  // The virial wrapper follows the virial flag, limit and descriptor.
+  if (vflag_atom && atom->nmax > maxvatomold) {
+    delete cuda->cu_vatom;
+    cuda->cu_vatom = new cCudaData<double, ENERGY_FLOAT, yx > ((double*)vatom, & cuda->shared_data.atom.vatom , atom->nmax, 6  );
+  }
+}
+
+
diff --git a/src/USER-CUDA/pair_lj_gromacs_coul_gromacs_cuda.h b/src/USER-CUDA/pair_lj_gromacs_coul_gromacs_cuda.h
new file mode 100644
index 0000000000..333bbc0088
--- /dev/null
+++ b/src/USER-CUDA/pair_lj_gromacs_coul_gromacs_cuda.h
@@ -0,0 +1,68 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(lj/gromacs/coul/gromacs/cuda,PairLJGromacsCoulGromacsCuda)
+
+#else
+
+#ifndef LMP_PAIR_LJ_GROMACS_COUL_GROMACS_CUDA_H
+#define LMP_PAIR_LJ_GROMACS_COUL_GROMACS_CUDA_H
+
+#include "pair_lj_gromacs_coul_gromacs.h"
+#include "cuda_data.h"
+
+namespace LAMMPS_NS {
+
+class PairLJGromacsCoulGromacsCuda : public PairLJGromacsCoulGromacs {
+  // Pair style lj/gromacs/coul/gromacs/cuda: re-declares the PairLJGromacsCoulGromacs entry points below.
+ public:
+  PairLJGromacsCoulGromacsCuda(class LAMMPS *lmp);
+  void compute(int eflag, int vflag);
+  void settings(int narg, char **arg);
+  void coeff(int narg, char **arg);
+  void init_list(int id, class NeighList *ptr);
+  void init_style();
+  void ev_setup(int eflag, int vflag);
+ protected:
+  class Cuda *cuda;                      // Cuda object used by init_list(), init_style() and ev_setup()
+  void allocate();
+  bool allocated2;                       // presumably guards one-time setup in allocate() - TODO confirm
+  class CudaNeighList *cuda_neigh_list;  // assigned in init_list() for list id 0
+  // cCudaData wrappers for the lj*/ljsw* tables; presumably created in allocate() - TODO confirm
+  cCudaData<double, F_FLOAT, x> *cu_lj1_gm;
+  cCudaData<double, F_FLOAT, x> *cu_lj2_gm;
+  cCudaData<double, F_FLOAT, x> *cu_lj3_gm;
+  cCudaData<double, F_FLOAT, x> *cu_lj4_gm;
+  cCudaData<double, F_FLOAT, x> *cu_ljsw1_gm;
+  cCudaData<double, F_FLOAT, x> *cu_ljsw2_gm;
+  cCudaData<double, F_FLOAT, x> *cu_ljsw3_gm;
+  cCudaData<double, F_FLOAT, x> *cu_ljsw4_gm;
+  cCudaData<double, F_FLOAT, x> *cu_ljsw5_gm;
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-CUDA/pair_lj_gromacs_cuda.cpp b/src/USER-CUDA/pair_lj_gromacs_cuda.cpp
new file mode 100644
index 0000000000..97bbbe16f4
--- /dev/null
+++ b/src/USER-CUDA/pair_lj_gromacs_cuda.cpp
@@ -0,0 +1,182 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   Contributing author: Paul Crozier (SNL)
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#include <cmath>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include "pair_lj_gromacs_cuda.h"
+#include "pair_lj_gromacs_cuda_cu.h"
+#include "cuda_data.h"
+#include "atom.h"
+#include "comm.h"
+#include "force.h"
+#include "kspace.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+#include "neigh_request.h"
+#include "cuda_neigh_list.h"
+#include "update.h"
+#include "integrate.h"
+#include "respa.h"
+#include "memory.h"
+#include "error.h"
+#include "cuda.h"
+
+using namespace LAMMPS_NS;
+
+#define MIN(a,b) ((a) < (b) ? (a) : (b))
+#define MAX(a,b) ((a) > (b) ? (a) : (b))
+
+/* ---------------------------------------------------------------------- */
+
+PairLJGromacsCuda::PairLJGromacsCuda(LAMMPS *lmp) : PairLJGromacs(lmp)
+{
+  // A /cuda style needs lmp->cuda; report an error through error->all() when it is NULL.
+  cuda = lmp->cuda;
+  if (!cuda)
+    error->all("You cannot use a /cuda class, without activating 'cuda' acceleration. Use no '-a' command line argument, or '-a cuda'.");
+  allocated2 = false;  // allocate() has not yet done its CUDA-side setup
+  cuda->shared_data.pair.cudable_force = 1;
+  cuda->shared_data.pair.use_block_per_atom = 0;
+  cuda->setSystemParams();
+}
+
+/* ----------------------------------------------------------------------
+   remember pointer to arrays in cuda shared data
+------------------------------------------------------------------------- */
+
+void PairLJGromacsCuda::allocate()
+{
+  if (!allocated) PairLJGromacs::allocate();
+  if (allocated2) return;  // the CUDA-side setup below runs only once
+  allocated2 = true;
+  cuda->shared_data.pair.cut = cut;
+  cuda->shared_data.pair.cut_inner = cut_inner;
+  cuda->shared_data.pair.coeff1 = lj1;
+  cuda->shared_data.pair.coeff2 = lj2;
+  cuda->shared_data.pair.coeff3 = lj3;
+  cuda->shared_data.pair.coeff4 = lj4;
+  cuda->shared_data.pair.coeff5 = ljsw1;
+  cuda->shared_data.pair.coeff6 = ljsw2;
+  cuda->shared_data.pair.coeff7 = ljsw3;
+  cuda->shared_data.pair.coeff8 = ljsw4;
+  cuda->shared_data.pair.coeff9 = ljsw5;
+  cuda->shared_data.pair.special_lj = force->special_lj;
+  // wrap each coefficient table in a cCudaData; (ntypes+1)*(ntypes+1) is passed as its extent
+  const int ncoeff = (atom->ntypes+1)*(atom->ntypes+1);
+  cu_lj1_gm = new cCudaData<double, F_FLOAT, x>((double*)lj1, &cuda->shared_data.pair.coeff1_gm, ncoeff);
+  cu_lj2_gm = new cCudaData<double, F_FLOAT, x>((double*)lj2, &cuda->shared_data.pair.coeff2_gm, ncoeff);
+  cu_lj3_gm = new cCudaData<double, F_FLOAT, x>((double*)lj3, &cuda->shared_data.pair.coeff3_gm, ncoeff);
+  cu_lj4_gm = new cCudaData<double, F_FLOAT, x>((double*)lj4, &cuda->shared_data.pair.coeff4_gm, ncoeff);
+  cu_ljsw1_gm = new cCudaData<double, F_FLOAT, x>((double*)ljsw1, &cuda->shared_data.pair.coeff5_gm, ncoeff);
+  cu_ljsw2_gm = new cCudaData<double, F_FLOAT, x>((double*)ljsw2, &cuda->shared_data.pair.coeff6_gm, ncoeff);
+  cu_ljsw3_gm = new cCudaData<double, F_FLOAT, x>((double*)ljsw3, &cuda->shared_data.pair.coeff7_gm, ncoeff);
+  cu_ljsw4_gm = new cCudaData<double, F_FLOAT, x>((double*)ljsw4, &cuda->shared_data.pair.coeff8_gm, ncoeff);
+  cu_ljsw5_gm = new cCudaData<double, F_FLOAT, x>((double*)ljsw5, &cuda->shared_data.pair.coeff9_gm, ncoeff);
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairLJGromacsCuda::compute(int eflag, int vflag)
+{
+  // Set up energy/virial bookkeeping if requested, then call the CUDA pair routine.
+  if (eflag || vflag) ev_setup(eflag, vflag);
+
+  // upload()/download() of the energy and virial data are skipped while
+  // collect_forces_later is set (init_style() sets it for molecular systems).
+  if (!cuda->shared_data.pair.collect_forces_later) {
+    if (eflag) cuda->cu_eng_vdwl->upload();
+    if (vflag) cuda->cu_virial->upload();
+  }
+  Cuda_PairLJGromacsCuda(&cuda->shared_data, &cuda_neigh_list->sneighlist, eflag, vflag, eflag_atom, vflag_atom);
+  if (!cuda->shared_data.pair.collect_forces_later) {
+    if (eflag) cuda->cu_eng_vdwl->download();
+    if (vflag) cuda->cu_virial->download();
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairLJGromacsCuda::settings(int narg, char **arg)
+{
+  PairLJGromacs::settings(narg, arg);  // base class handles the arguments; the global cutoffs are then copied as F_FLOAT
+  cuda->shared_data.pair.cut_global = static_cast<F_FLOAT>(cut_global);
+  cuda->shared_data.pair.cut_inner_global = static_cast<F_FLOAT>(cut_inner_global);
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairLJGromacsCuda::coeff(int narg, char **arg)
+{
+  PairLJGromacs::coeff(narg, arg);  // coefficients are handled by the base class
+  this->allocate();                 // then run the one-time CUDA-side setup (no-op once allocated2 is set)
+}
+
+void PairLJGromacsCuda::init_style()
+{
+  // Neighbor-list setup for this style.
+  // NOTE(review): the base-class init_style() is not invoked from here;
+  // confirm that is intended.
+
+  // For molecular systems set collect_forces_later; compute() then skips its
+  // upload()/download() calls for the energy and virial data.
+  if (atom->molecular)
+    cuda->shared_data.pair.collect_forces_later = 1;
+
+  // Request one neighbor list for this pair style and configure it as a
+  // full (not half) list that is flagged as cudable. The request index is
+  // only needed to address the entry in neighbor->requests.
+  const int irequest = neighbor->request(this);
+  neighbor->requests[irequest]->full = 1;
+  neighbor->requests[irequest]->half = 0;
+  neighbor->requests[irequest]->cudable = 1;
+}
+
+void PairLJGromacsCuda::init_list(int id, NeighList *ptr)
+{
+  // Forward to the base class; without CUDA_USE_BINNING also register list 0 with cuda.
+  MYDBG( printf("# CUDA PairLJGromacsCuda::init_list\n"); )
+  PairLJGromacs::init_list(id, ptr);
+#ifndef CUDA_USE_BINNING
+  // only the verlet list (id 0) is handled so far, not respa; see Neighbor::init()
+  if (0 == id) cuda_neigh_list = cuda->registerNeighborList(ptr);
+#endif
+  MYDBG( printf("# CUDA PairLJGromacsCuda::init_list end\n"); )
+}
+
+void PairLJGromacsCuda::ev_setup(int eflag, int vflag)
+{
+  // Base-class ev_setup(), then recreate the cCudaData wrappers of the per-atom arrays once atom->nmax exceeds the old maxeatom.
+  const int maxeatomold = maxeatom;
+  PairLJGromacs::ev_setup(eflag, vflag);
+  if (eflag_atom && atom->nmax > maxeatomold)
+    {delete cuda->cu_eatom; cuda->cu_eatom = new cCudaData<double, ENERGY_FLOAT, x > ((double*)eatom, & cuda->shared_data.atom.eatom , atom->nmax  );}
+  // Fixed: the vatom wrapper is gated on vflag_atom and bound to atom.vatom (was eflag_atom / atom.eatom, the energy flag and slot).
+  // NOTE(review): growth is still judged against maxeatom only; confirm shared_data.atom.vatom is the per-atom virial slot.
+  if (vflag_atom && atom->nmax > maxeatomold)
+    {delete cuda->cu_vatom; cuda->cu_vatom = new cCudaData<double, ENERGY_FLOAT, yx > ((double*)vatom, & cuda->shared_data.atom.vatom , atom->nmax, 6  );}
+}
+
+
diff --git a/src/USER-CUDA/pair_lj_gromacs_cuda.h b/src/USER-CUDA/pair_lj_gromacs_cuda.h
new file mode 100644
index 0000000000..64e38aa763
--- /dev/null
+++ b/src/USER-CUDA/pair_lj_gromacs_cuda.h
@@ -0,0 +1,68 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(lj/gromacs/cuda,PairLJGromacsCuda)
+
+#else
+
+#ifndef LMP_PAIR_LJ_GROMACS_CUDA_H
+#define LMP_PAIR_LJ_GROMACS_CUDA_H
+
+#include "pair_lj_gromacs.h"
+#include "cuda_data.h"
+
+namespace LAMMPS_NS {
+
+class PairLJGromacsCuda : public PairLJGromacs {
+  // Pair style lj/gromacs/cuda: re-declares the PairLJGromacs entry points below.
+ public:
+  PairLJGromacsCuda(class LAMMPS *lmp);
+  void compute(int eflag, int vflag);
+  void settings(int narg, char **arg);
+  void coeff(int narg, char **arg);
+  void init_list(int id, class NeighList *ptr);
+  void init_style();
+  void ev_setup(int eflag, int vflag);
+ protected:
+  class Cuda *cuda;                      // lmp->cuda, fetched in the constructor
+  void allocate();
+  bool allocated2;                       // true once allocate() has done its CUDA-side setup
+  class CudaNeighList *cuda_neigh_list;  // assigned in init_list() for list id 0
+  // cCudaData wrappers created in allocate(), one per coefficient table
+  cCudaData<double, F_FLOAT, x> *cu_lj1_gm;
+  cCudaData<double, F_FLOAT, x> *cu_lj2_gm;
+  cCudaData<double, F_FLOAT, x> *cu_lj3_gm;
+  cCudaData<double, F_FLOAT, x> *cu_lj4_gm;
+  cCudaData<double, F_FLOAT, x> *cu_ljsw1_gm;
+  cCudaData<double, F_FLOAT, x> *cu_ljsw2_gm;
+  cCudaData<double, F_FLOAT, x> *cu_ljsw3_gm;
+  cCudaData<double, F_FLOAT, x> *cu_ljsw4_gm;
+  cCudaData<double, F_FLOAT, x> *cu_ljsw5_gm;
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-CUDA/pair_lj_smooth_cuda.cpp b/src/USER-CUDA/pair_lj_smooth_cuda.cpp
new file mode 100644
index 0000000000..c8aef2ec00
--- /dev/null
+++ b/src/USER-CUDA/pair_lj_smooth_cuda.cpp
@@ -0,0 +1,182 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   Contributing author: Paul Crozier (SNL)
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#include <cmath>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include "pair_lj_smooth_cuda.h"
+#include "pair_lj_smooth_cuda_cu.h"
+#include "cuda_data.h"
+#include "atom.h"
+#include "comm.h"
+#include "force.h"
+#include "kspace.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+#include "neigh_request.h"
+#include "cuda_neigh_list.h"
+#include "update.h"
+#include "integrate.h"
+#include "respa.h"
+#include "memory.h"
+#include "error.h"
+#include "cuda.h"
+
+using namespace LAMMPS_NS;
+
+#define MIN(a,b) ((a) < (b) ? (a) : (b))
+#define MAX(a,b) ((a) > (b) ? (a) : (b))
+
+/* ---------------------------------------------------------------------- */
+
+PairLJSmoothCuda::PairLJSmoothCuda(LAMMPS *lmp) : PairLJSmooth(lmp)
+{
+  // A /cuda style needs lmp->cuda; report an error through error->all() when it is NULL.
+  cuda = lmp->cuda;
+  if (!cuda)
+    error->all("You cannot use a /cuda class, without activating 'cuda' acceleration. Use no '-a' command line argument, or '-a cuda'.");
+  allocated2 = false;  // allocate() has not yet done its CUDA-side setup
+  cuda->shared_data.pair.cudable_force = 1;
+  cuda->shared_data.pair.use_block_per_atom = 0;
+  cuda->setSystemParams();
+}
+
+/* ----------------------------------------------------------------------
+   remember pointer to arrays in cuda shared data
+------------------------------------------------------------------------- */
+
+void PairLJSmoothCuda::allocate()
+{
+  if (!allocated) PairLJSmooth::allocate();
+  if (allocated2) return;  // the CUDA-side setup below runs only once
+  allocated2 = true;
+  cuda->shared_data.pair.cut = cut;
+  cuda->shared_data.pair.cut_inner = cut_inner;
+  cuda->shared_data.pair.coeff1 = lj1;
+  cuda->shared_data.pair.coeff2 = lj2;
+  cuda->shared_data.pair.coeff3 = lj3;
+  cuda->shared_data.pair.coeff4 = lj4;
+  cuda->shared_data.pair.coeff5 = ljsw1;
+  cuda->shared_data.pair.coeff6 = ljsw2;
+  cuda->shared_data.pair.coeff7 = ljsw3;
+  cuda->shared_data.pair.coeff8 = ljsw4;
+  cuda->shared_data.pair.coeff9 = ljsw0;
+  cuda->shared_data.pair.special_lj = force->special_lj;
+  // wrap each coefficient table in a cCudaData; ljsw0 goes to slot 9, and (ntypes+1)*(ntypes+1) is passed as the extent
+  const int ncoeff = (atom->ntypes+1)*(atom->ntypes+1);
+  cu_lj1_gm = new cCudaData<double, F_FLOAT, x>((double*)lj1, &cuda->shared_data.pair.coeff1_gm, ncoeff);
+  cu_lj2_gm = new cCudaData<double, F_FLOAT, x>((double*)lj2, &cuda->shared_data.pair.coeff2_gm, ncoeff);
+  cu_lj3_gm = new cCudaData<double, F_FLOAT, x>((double*)lj3, &cuda->shared_data.pair.coeff3_gm, ncoeff);
+  cu_lj4_gm = new cCudaData<double, F_FLOAT, x>((double*)lj4, &cuda->shared_data.pair.coeff4_gm, ncoeff);
+  cu_ljsw0_gm = new cCudaData<double, F_FLOAT, x>((double*)ljsw0, &cuda->shared_data.pair.coeff9_gm, ncoeff);
+  cu_ljsw1_gm = new cCudaData<double, F_FLOAT, x>((double*)ljsw1, &cuda->shared_data.pair.coeff5_gm, ncoeff);
+  cu_ljsw2_gm = new cCudaData<double, F_FLOAT, x>((double*)ljsw2, &cuda->shared_data.pair.coeff6_gm, ncoeff);
+  cu_ljsw3_gm = new cCudaData<double, F_FLOAT, x>((double*)ljsw3, &cuda->shared_data.pair.coeff7_gm, ncoeff);
+  cu_ljsw4_gm = new cCudaData<double, F_FLOAT, x>((double*)ljsw4, &cuda->shared_data.pair.coeff8_gm, ncoeff);
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairLJSmoothCuda::compute(int eflag, int vflag)
+{
+  // Set up energy/virial bookkeeping if requested, then call the CUDA pair routine.
+  if (eflag || vflag) ev_setup(eflag, vflag);
+
+  // upload()/download() of the energy and virial data are skipped while
+  // collect_forces_later is set (init_style() sets it for molecular systems).
+  if (!cuda->shared_data.pair.collect_forces_later) {
+    if (eflag) cuda->cu_eng_vdwl->upload();
+    if (vflag) cuda->cu_virial->upload();
+  }
+  Cuda_PairLJSmoothCuda(&cuda->shared_data, &cuda_neigh_list->sneighlist, eflag, vflag, eflag_atom, vflag_atom);
+  if (!cuda->shared_data.pair.collect_forces_later) {
+    if (eflag) cuda->cu_eng_vdwl->download();
+    if (vflag) cuda->cu_virial->download();
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairLJSmoothCuda::settings(int narg, char **arg)
+{
+  PairLJSmooth::settings(narg, arg);  // base class handles the arguments; the global cutoffs are then copied as F_FLOAT
+  cuda->shared_data.pair.cut_global = static_cast<F_FLOAT>(cut_global);
+  cuda->shared_data.pair.cut_inner_global = static_cast<F_FLOAT>(cut_inner_global);
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairLJSmoothCuda::coeff(int narg, char **arg)
+{
+  PairLJSmooth::coeff(narg, arg);  // coefficients are handled by the base class
+  this->allocate();                // then run the one-time CUDA-side setup (no-op once allocated2 is set)
+}
+
+void PairLJSmoothCuda::init_style()
+{
+  // Neighbor-list setup for this style.
+  // NOTE(review): the base-class init_style() is not invoked from here;
+  // confirm that is intended.
+
+  // For molecular systems set collect_forces_later; compute() then skips its
+  // upload()/download() calls for the energy and virial data.
+  if (atom->molecular)
+    cuda->shared_data.pair.collect_forces_later = 1;
+
+  // Request one neighbor list for this pair style and configure it as a
+  // full (not half) list that is flagged as cudable. The request index is
+  // only needed to address the entry in neighbor->requests.
+  const int irequest = neighbor->request(this);
+  neighbor->requests[irequest]->full = 1;
+  neighbor->requests[irequest]->half = 0;
+  neighbor->requests[irequest]->cudable = 1;
+}
+
+void PairLJSmoothCuda::init_list(int id, NeighList *ptr)
+{
+  // Forward to the base class; without CUDA_USE_BINNING also register list 0 with cuda.
+  MYDBG( printf("# CUDA PairLJSmoothCuda::init_list\n"); )
+  PairLJSmooth::init_list(id, ptr);
+#ifndef CUDA_USE_BINNING
+  // only the verlet list (id 0) is handled so far, not respa; see Neighbor::init()
+  if (0 == id) cuda_neigh_list = cuda->registerNeighborList(ptr);
+#endif
+  MYDBG( printf("# CUDA PairLJSmoothCuda::init_list end\n"); )
+}
+
+void PairLJSmoothCuda::ev_setup(int eflag, int vflag)
+{
+  // Base-class ev_setup(), then recreate the cCudaData wrappers of the per-atom arrays once atom->nmax exceeds the old maxeatom.
+  const int maxeatomold = maxeatom;
+  PairLJSmooth::ev_setup(eflag, vflag);
+  if (eflag_atom && atom->nmax > maxeatomold)
+    {delete cuda->cu_eatom; cuda->cu_eatom = new cCudaData<double, ENERGY_FLOAT, x > ((double*)eatom, & cuda->shared_data.atom.eatom , atom->nmax  );}
+  // Fixed: the vatom wrapper is gated on vflag_atom and bound to atom.vatom (was eflag_atom / atom.eatom, the energy flag and slot).
+  // NOTE(review): growth is still judged against maxeatom only; confirm shared_data.atom.vatom is the per-atom virial slot.
+  if (vflag_atom && atom->nmax > maxeatomold)
+    {delete cuda->cu_vatom; cuda->cu_vatom = new cCudaData<double, ENERGY_FLOAT, yx > ((double*)vatom, & cuda->shared_data.atom.vatom , atom->nmax, 6  );}
+}
+
+
diff --git a/src/USER-CUDA/pair_lj_smooth_cuda.h b/src/USER-CUDA/pair_lj_smooth_cuda.h
new file mode 100644
index 0000000000..32f6b4fabc
--- /dev/null
+++ b/src/USER-CUDA/pair_lj_smooth_cuda.h
@@ -0,0 +1,68 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(lj/smooth/cuda,PairLJSmoothCuda)
+
+#else
+
+#ifndef LMP_PAIR_LJ_SMOOTH_CUDA_H
+#define LMP_PAIR_LJ_SMOOTH_CUDA_H
+
+#include "pair_lj_smooth.h"
+#include "cuda_data.h"
+
+namespace LAMMPS_NS {
+
+class PairLJSmoothCuda : public PairLJSmooth {
+  // Pair style lj/smooth/cuda: re-declares the PairLJSmooth entry points below.
+ public:
+  PairLJSmoothCuda(class LAMMPS *lmp);
+  void compute(int eflag, int vflag);
+  void settings(int narg, char **arg);
+  void coeff(int narg, char **arg);
+  void init_list(int id, class NeighList *ptr);
+  void init_style();
+  void ev_setup(int eflag, int vflag);
+ protected:
+  class Cuda *cuda;                      // lmp->cuda, fetched in the constructor
+  void allocate();
+  bool allocated2;                       // true once allocate() has done its CUDA-side setup
+  class CudaNeighList *cuda_neigh_list;  // assigned in init_list() for list id 0
+  // cCudaData wrappers created in allocate(), one per coefficient table
+  cCudaData<double, F_FLOAT, x> *cu_lj1_gm;
+  cCudaData<double, F_FLOAT, x> *cu_lj2_gm;
+  cCudaData<double, F_FLOAT, x> *cu_lj3_gm;
+  cCudaData<double, F_FLOAT, x> *cu_lj4_gm;
+  cCudaData<double, F_FLOAT, x> *cu_ljsw0_gm;
+  cCudaData<double, F_FLOAT, x> *cu_ljsw1_gm;
+  cCudaData<double, F_FLOAT, x> *cu_ljsw2_gm;
+  cCudaData<double, F_FLOAT, x> *cu_ljsw3_gm;
+  cCudaData<double, F_FLOAT, x> *cu_ljsw4_gm;
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-CUDA/pair_morse_cuda.cpp b/src/USER-CUDA/pair_morse_cuda.cpp
new file mode 100644
index 0000000000..b556c158d6
--- /dev/null
+++ b/src/USER-CUDA/pair_morse_cuda.cpp
@@ -0,0 +1,182 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under 
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Paul Crozier (SNL)
+------------------------------------------------------------------------- */
+
+#include <cmath>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include "pair_morse_cuda.h"
+#include "pair_morse_cuda_cu.h"
+#include "cuda_data.h"
+#include "atom.h"
+#include "comm.h"
+#include "force.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+#include "neigh_request.h"
+#include "cuda_neigh_list.h"
+#include "update.h"
+#include "integrate.h"
+#include "respa.h"
+#include "memory.h"
+#include "error.h"
+#include "cuda.h"
+
+using namespace LAMMPS_NS;
+
+#define MIN(a,b) ((a) < (b) ? (a) : (b))
+#define MAX(a,b) ((a) > (b) ? (a) : (b))
+
+/* ---------------------------------------------------------------------- */
+
+PairMorseCuda::PairMorseCuda(LAMMPS *lmp) : PairMorse(lmp)
+{
+  // A /cuda style needs lmp->cuda; report an error through error->all() when it is NULL.
+  cuda = lmp->cuda;
+  if (!cuda)
+    error->all("You cannot use a /cuda class, without activating 'cuda' acceleration. Use no '-a' command line argument, or '-a cuda'.");
+  allocated2 = false;  // allocate() has not yet done its CUDA-side setup
+  cuda->shared_data.pair.cudable_force = 1;
+  cuda->setSystemParams();
+}
+
+/* ----------------------------------------------------------------------
+   remember pointer to arrays in cuda shared data
+------------------------------------------------------------------------- */
+
+void PairMorseCuda::allocate()
+{
+  if (!allocated) PairMorse::allocate();
+  if (allocated2) return;  // the CUDA-side setup below runs only once
+  allocated2 = true;
+
+  // store this style's cutoff, coefficient and offset members in the shared pair data
+  cuda->shared_data.pair.cut = cut;
+  cuda->shared_data.pair.coeff1 = r0;
+  cuda->shared_data.pair.coeff2 = alpha;
+  cuda->shared_data.pair.coeff3 = morse1;
+  cuda->shared_data.pair.coeff4 = d0;
+  cuda->shared_data.pair.offset = offset;
+  cuda->shared_data.pair.special_lj = force->special_lj;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairMorseCuda::compute(int eflag, int vflag)
+{
+  // Set up energy/virial bookkeeping if requested, then call the CUDA pair routine.
+  if (eflag || vflag) ev_setup(eflag, vflag);
+
+  // NOTE(review): the uploads are unconditional here; only the downloads
+  // depend on collect_forces_later - confirm the asymmetry is intended.
+  if (eflag) cuda->cu_eng_vdwl->upload();
+  if (vflag) cuda->cu_virial->upload();
+  Cuda_PairMorseCuda(&cuda->shared_data, &cuda_neigh_list->sneighlist, eflag, vflag, eflag_atom, vflag_atom);
+  if (!cuda->shared_data.pair.collect_forces_later) {
+    if (eflag) cuda->cu_eng_vdwl->download();
+    if (vflag) cuda->cu_virial->download();
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairMorseCuda::settings(int narg, char **arg)
+{
+  PairMorse::settings(narg, arg);  // base class handles the arguments; the global cutoff is then copied as F_FLOAT
+  cuda->shared_data.pair.cut_global = static_cast<F_FLOAT>(cut_global);
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairMorseCuda::coeff(int narg, char **arg)
+{
+  PairMorse::coeff(narg, arg);  // coefficients are handled by the base class
+  this->allocate();             // then run the one-time CUDA-side setup (no-op once allocated2 is set)
+}
+
+void PairMorseCuda::init_style()
+{
+  MYDBG( printf("# CUDA PairMorseCuda::init_style start\n"); )
+
+  // No neighbor list is requested when update->whichflag is 0 and the
+  // integrator style is "respa"; in every other case one full (not half),
+  // cudable list is requested for this pair style.
+  // NOTE(review): nothing in this function sets up a list for the respa
+  // case, yet compute() reads cuda_neigh_list unconditionally - confirm.
+  const bool respa_setup = (update->whichflag == 0) &&
+                           (strcmp(update->integrate_style,"respa") == 0);
+
+  if (!respa_setup) {
+    const int irequest = neighbor->request(this);
+    neighbor->requests[irequest]->full = 1;
+    neighbor->requests[irequest]->half = 0;
+    neighbor->requests[irequest]->cudable = 1;
+    //neighbor->style=0; //0=NSQ neighboring
+  }
+
+  MYDBG( printf("# CUDA PairMorseCuda::init_style end\n"); )
+}
+
+void PairMorseCuda::init_list(int id, NeighList *ptr)
+{
+  // Forward to the base class; without CUDA_USE_BINNING also register list 0 with cuda.
+  MYDBG( printf("# CUDA PairMorseCuda::init_list\n"); )
+  PairMorse::init_list(id, ptr);
+#ifndef CUDA_USE_BINNING
+  // only the verlet list (id 0) is handled so far, not respa; see Neighbor::init()
+  if (0 == id) cuda_neigh_list = cuda->registerNeighborList(ptr);
+#endif
+  MYDBG( printf("# CUDA PairMorseCuda::init_list end\n"); )
+}
+
+void PairMorseCuda::ev_setup(int eflag, int vflag)
+{
+  // Base-class ev_setup(), then recreate the cCudaData wrappers of the per-atom arrays once atom->nmax exceeds the old maxeatom.
+  const int maxeatomold = maxeatom;
+  PairMorse::ev_setup(eflag, vflag);
+  if (eflag_atom && atom->nmax > maxeatomold)
+    {delete cuda->cu_eatom; cuda->cu_eatom = new cCudaData<double, ENERGY_FLOAT, x > ((double*)eatom, & cuda->shared_data.atom.eatom , atom->nmax  );}
+  // Fixed: the vatom wrapper is gated on vflag_atom and bound to atom.vatom (was eflag_atom / atom.eatom, the energy flag and slot).
+  // NOTE(review): growth is still judged against maxeatom only; confirm shared_data.atom.vatom is the per-atom virial slot.
+  if (vflag_atom && atom->nmax > maxeatomold)
+    {delete cuda->cu_vatom; cuda->cu_vatom = new cCudaData<double, ENERGY_FLOAT, yx > ((double*)vatom, & cuda->shared_data.atom.vatom , atom->nmax, 6  );}
+}
+
+
diff --git a/src/USER-CUDA/pair_morse_cuda.h b/src/USER-CUDA/pair_morse_cuda.h
new file mode 100644
index 0000000000..aae40294ba
--- /dev/null
+++ b/src/USER-CUDA/pair_morse_cuda.h
@@ -0,0 +1,57 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(morse/cuda,PairMorseCuda)
+
+#else
+
+#ifndef PAIR_MORSE_CUDA_H
+#define PAIR_MORSE_CUDA_H
+
+#include "pair_morse.h"
+
+namespace LAMMPS_NS {
+
+class PairMorseCuda : public PairMorse {
+  // Pair style morse/cuda: re-declares the PairMorse entry points below.
+ public:
+  PairMorseCuda(class LAMMPS *lmp);
+  void compute(int eflag, int vflag);
+  void settings(int narg, char **arg);
+  void coeff(int narg, char **arg);
+  void init_list(int id, class NeighList *ptr);
+  void init_style();
+  void ev_setup(int eflag, int vflag);
+ protected:
+  class Cuda *cuda;                      // lmp->cuda, fetched in the constructor
+  void allocate();
+  bool allocated2;                       // true once allocate() has done its CUDA-side setup
+  class CudaNeighList *cuda_neigh_list;  // assigned in init_list() for list id 0
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-CUDA/pppm_cuda.cpp b/src/USER-CUDA/pppm_cuda.cpp
new file mode 100644
index 0000000000..16ef9ae49f
--- /dev/null
+++ b/src/USER-CUDA/pppm_cuda.cpp
@@ -0,0 +1,1741 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under 
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing authors: Roy Pollock (LLNL), Paul Crozier (SNL)
+------------------------------------------------------------------------- */
+
+
+#include "mpi.h"
+#include <cstring>
+#include <cstdio>
+#include <cstdlib>
+#include <cmath>
+#include "pppm_cuda.h"
+#include "atom.h"
+#include "comm.h"
+#include "neighbor.h"
+#include "force.h"
+#include "pair.h"
+#include "bond.h"
+#include "angle.h"
+#include "domain.h"
+#include "fft3d_wrap_cuda.h"
+#include "remap_wrap.h"
+#include "memory.h"
+#include "error.h"
+#include <ctime> //crmadd
+#include "cuda_wrapper_cu.h"
+#include "pppm_cuda_cu.h"
+#include "cuda.h"
+
+using namespace LAMMPS_NS;
+
+// largest interpolation order init() accepts
+#define MAXORDER 7
+// added before and subtracted after the int truncation in init() so that
+// negative grid coordinates round down; init() also rejects grids whose
+// size in any dimension reaches this value
+#define OFFSET 4096
+// init() warns about a non-neutral system when |net charge| exceeds this
+#define SMALL 0.00001
+#define LARGE 10000.0
+// enters setup() through pow(-log(EPS_HOC),0.25) when sizing nbx/nby/nbz
+#define EPS_HOC 1.0e-7
+
+// NOTE: function-like macros, each argument may be evaluated twice
+#define MIN(a,b) ((a) < (b) ? (a) : (b))
+#define MAX(a,b) ((a) > (b) ? (a) : (b))
+
+
+// Debug helper: prints a flat array to stdout.  For every (i,j) pair with
+// i<nz and j<ny it writes an "i j" header line followed by the nx values
+// found at index 2*(i*ny*nx+j*nx+k), i.e. every second element is read.
+void printArray(double* data,int nx, int ny, int nz)
+{
+  for (int iz = 0; iz < nz; ++iz) {
+    for (int iy = 0; iy < ny; ++iy) {
+      printf("%i %i\n",iz,iy);
+      for (int ix = 0; ix < nx; ++ix) {
+        printf("%e ",data[2*(iz*ny*nx+iy*nx+ix)]);
+      }
+      printf("\n\n");
+    }
+  }
+}
+// Debug helper: prints a 3d array to stdout.  For every (i,j) pair with
+// i<nx and j<ny it writes an "i j" header line followed by the nz values
+// data[i][j][0..nz-1].
+void printArray(double*** data,int nx, int ny, int nz)
+{
+  for (int ix = 0; ix < nx; ++ix) {
+    for (int iy = 0; iy < ny; ++iy) {
+      printf("%i %i\n",ix,iy);
+      for (int iz = 0; iz < nz; ++iz) {
+        printf("%e ",data[ix][iy][iz]);
+      }
+      printf("\n\n");
+    }
+  }
+}
+/* ---------------------------------------------------------------------- */
+
+// Builds the CUDA PPPM solver.  arg[0] is parsed as the precision; the first
+// character of an optional arg[1] is stored in precisionmodify ('=' when it
+// is absent).  When two arguments are given, the base PPPM constructor is
+// told there is only one.  All grid/work pointers start out NULL.
+PPPMCuda::PPPMCuda(LAMMPS *lmp, int narg, char **arg) : PPPM(lmp, (narg==2?1:narg), arg)
+{
+  cuda = lmp->cuda;
+  if (cuda == NULL)
+    error->all("You cannot use a /cuda class, without activating 'cuda' acceleration. Use no '-a' command line argument, or '-a cuda'.");
+
+  if (narg < 1 || narg > 3)
+    error->all("Illegal kspace_style pppm/cuda command");
+#ifndef FFT_CUFFT
+  error->all("Using kspace_style pppm/cuda without cufft is not possible. Compile with cufft=1 to include cufft. Aborting.");
+#endif
+
+  precision = atof(arg[0]);
+  precisionmodify = (narg > 1) ? arg[1][0] : '=';
+  PI = 4.0*atan(1.0);
+
+  // factor table {2,3,5}
+  static const int factor_values[3] = {2, 3, 5};
+  nfactors = 3;
+  factors = new int[nfactors];
+  for (int n = 0; n < nfactors; n++) factors[n] = factor_values[n];
+
+  MPI_Comm_rank(world,&me);
+  MPI_Comm_size(world,&nprocs);
+
+  // host-side grids, work arrays and helper objects: nothing allocated yet
+  density_brick = NULL;
+  density_brick_int = NULL;
+  density_fft = NULL;
+  vdx_brick = NULL;
+  vdy_brick = NULL;
+  vdz_brick = NULL;
+  vdx_brick_tmp = NULL;
+  greensfn = NULL;
+  work1 = NULL;
+  work2 = NULL;
+  vg = NULL;
+  fkx = NULL;
+  fky = NULL;
+  fkz = NULL;
+  buf1 = NULL;
+  buf2 = NULL;
+  gf_b = NULL;
+  rho1d = NULL;
+  rho_coeff = NULL;
+  fft1c = NULL;
+  fft2c = NULL;
+  remap = NULL;
+  slabbuf = NULL;
+  part2grid = NULL;
+
+  // cu_* handles and the dev_array backing cu_part2grid
+  cu_vdx_brick = NULL;
+  cu_vdy_brick = NULL;
+  cu_vdz_brick = NULL;
+  cu_density_brick = NULL;
+  cu_density_brick_int = NULL;
+  cu_density_fft = NULL;
+  cu_energy = NULL;
+  cu_greensfn = NULL;
+  cu_work1 = NULL;
+  cu_work2 = NULL;
+  cu_work3 = NULL;
+  cu_vg = NULL;
+  cu_fkx = NULL;
+  cu_fky = NULL;
+  cu_fkz = NULL;
+  cu_flag = NULL;
+  cu_debugdata = NULL;
+  cu_rho_coeff = NULL;
+  cu_virial = NULL;
+  cu_gf_b = NULL;
+  cu_slabbuf = NULL;
+  cu_part2grid = NULL;
+  adev_data_array = NULL;
+  cu_pppm_grid_n = NULL;
+  cu_pppm_grid_ids = NULL;
+
+  // counters, scale factors and flags
+  density_intScale = 1000000;
+  nmax = 0;
+  old_nmax = 0;
+  poissontime = 0;
+  pppm_grid_nmax = 0;
+  firstpass = true;
+  scale = 1.0;
+
+  // small fixed-size index arrays
+  pppm2partgrid = new int[3];
+  pppm_grid = new int[3];
+}
+
+
+/* ----------------------------------------------------------------------
+   free all memory 
+------------------------------------------------------------------------- */
+
+// Releases the buffers this class owns directly, then everything handled by
+// deallocate(), then the per-atom particle->grid map and its cu_ wrapper.
+PPPMCuda::~PPPMCuda()
+{
+  delete [] slabbuf;
+  delete cu_slabbuf;
+  
+  delete [] factors;
+  factors=NULL;
+  deallocate();
+  delete cu_part2grid;
+  cu_part2grid=NULL;
+  // adev_data_array is created with new dev_array[1] in compute() next to
+  // cu_part2grid and was never released here; free it after the wrapper
+  // that was constructed with it (delete [] on NULL is a no-op)
+  delete [] adev_data_array;
+  adev_data_array=NULL;
+  memory->destroy(part2grid);
+  part2grid = NULL;
+  // NOTE(review): pppm2partgrid and pppm_grid (new int[3] in the constructor)
+  // are not freed in this function; confirm whether deallocate() owns them
+}
+
+/* ----------------------------------------------------------------------
+   called once before run 
+------------------------------------------------------------------------- */
+
+// Checks the simulation settings, picks the PPPM grid (reducing the order
+// if the stencil would reach past a neighbor proc), derives this proc's
+// brick/FFT index ranges and communication buffer size, copies the results
+// into cuda->shared_data.pppm, then reallocates the grids and precomputes
+// the Green's function denominator and charge-assignment coefficients.
+void PPPMCuda::init()
+{
+	
+	cuda->shared_data.pppm.cudable_force=1;
+	
+    //if(cuda->finished_run) {PPPM::init(); return;}
+    
+  if (me == 0) {
+    if (screen) fprintf(screen,"PPPMCuda initialization ...\n");
+    if (logfile) fprintf(logfile,"PPPMCuda initialization ...\n");
+  }
+
+  // error check
+
+  if (domain->triclinic)
+    error->all("Cannot (yet) use PPPMCuda with triclinic box");
+  if (domain->dimension == 2) error->all("Cannot use PPPMCuda with 2d simulation");
+
+  if (!atom->q_flag) error->all("Kspace style requires atom attribute q");
+
+  if (slabflag == 0 && domain->nonperiodic > 0)
+    error->all("Cannot use nonperiodic boundaries with PPPMCuda");
+  if (slabflag == 1) {
+    if (domain->xperiodic != 1 || domain->yperiodic != 1 || 
+	domain->boundary[2][0] != 1 || domain->boundary[2][1] != 1)
+      error->all("Incorrect boundaries with slab PPPMCuda");
+  }
+
+  if (order > MAXORDER) {
+    char str[128];
+    sprintf(str,"PPPMCuda order cannot be greater than %d",MAXORDER);
+    error->all(str);
+  }
+  // free all arrays previously allocated
+
+  deallocate();
+
+  // extract short-range Coulombic cutoff from pair style
+
+  qqrd2e = force->qqrd2e;
+
+  if (force->pair == NULL)
+    error->all("KSpace style is incompatible with Pair style");
+  int itmp=0;
+  double *p_cutoff = (double *) force->pair->extract("cut_coul",itmp);
+  if (p_cutoff == NULL)
+    error->all("KSpace style is incompatible with Pair style");
+  cutoff = *p_cutoff;
+
+  // if kspace is TIP4P, extract TIP4P params from pair style
+  // NOTE(review): the style name tested here is "pppm/tip4p"; confirm
+  // whether this branch can be taken for kspace_style pppm/cuda
+
+  qdist = 0.0;
+
+  if (strcmp(force->kspace_style,"pppm/tip4p") == 0) {
+    if (force->pair == NULL)
+      error->all("KSpace style is incompatible with Pair style");
+    double *p_qdist = (double *) force->pair->extract("qdist",itmp);
+    int *p_typeO = (int *) force->pair->extract("typeO",itmp);
+    int *p_typeH = (int *) force->pair->extract("typeH",itmp);
+    int *p_typeA = (int *) force->pair->extract("typeA",itmp);
+    int *p_typeB = (int *) force->pair->extract("typeB",itmp);
+    if (!p_qdist || !p_typeO || !p_typeH || !p_typeA || !p_typeB)
+      error->all("KSpace style is incompatible with Pair style");
+    qdist = *p_qdist;
+    typeO = *p_typeO;
+    typeH = *p_typeH;
+    int typeA = *p_typeA;
+    int typeB = *p_typeB;
+
+    if (force->angle == NULL || force->bond == NULL)
+      error->all("Bond and angle potentials must be defined for TIP4P");
+    double theta = force->angle->equilibrium_angle(typeA);
+    double blen = force->bond->equilibrium_distance(typeB);
+    alpha = qdist / (2.0 * cos(0.5*theta) * blen);
+  }
+
+  // compute qsum & qsqsum and warn if not charge-neutral
+
+  qsum = qsqsum = 0.0;
+  for (int i = 0; i < atom->nlocal; i++) {
+    qsum += atom->q[i];
+    qsqsum += atom->q[i]*atom->q[i];
+  }
+
+  double tmp;
+  MPI_Allreduce(&qsum,&tmp,1,MPI_DOUBLE,MPI_SUM,world);
+  qsum = tmp;
+  MPI_Allreduce(&qsqsum,&tmp,1,MPI_DOUBLE,MPI_SUM,world);
+  qsqsum = tmp;
+
+  if (qsqsum == 0.0)
+    error->all("Cannot use kspace solver on system with no charge");
+  if (fabs(qsum) > SMALL && me == 0) {
+    char str[128];
+    sprintf(str,"System is not charge neutral, net charge = %g",qsum);
+    error->warning(str);
+  }
+
+  // setup FFT grid resolution and g_ewald
+  // normally one iteration thru while loop is all that is required
+  // if grid stencil extends beyond neighbor proc, reduce order and try again
+
+  int iteration = 0;
+
+  while (order > 0) {
+
+    if (iteration && me == 0)
+      error->warning("Reducing PPPMCuda order b/c stencil extends "
+		     "beyond neighbor processor");
+    iteration++;
+
+    set_grid();
+
+    if (nx_pppm >= OFFSET || ny_pppm >= OFFSET || nz_pppm >= OFFSET)
+      error->all("PPPMCuda grid is too large");
+
+    // global indices of PPPMCuda grid range from 0 to N-1
+    // nlo_in,nhi_in = lower/upper limits of the 3d sub-brick of
+    //   global PPPMCuda grid that I own without ghost cells
+    // for slab PPPMCuda, assign z grid as if it were not extended
+
+    nxlo_in = comm->myloc[0]*nx_pppm / comm->procgrid[0];
+    nxhi_in = (comm->myloc[0]+1)*nx_pppm / comm->procgrid[0] - 1;
+    nylo_in = comm->myloc[1]*ny_pppm / comm->procgrid[1];
+    nyhi_in = (comm->myloc[1]+1)*ny_pppm / comm->procgrid[1] - 1;
+    nzlo_in = comm->myloc[2] * 
+      (static_cast<int> (nz_pppm/slab_volfactor)) / comm->procgrid[2];
+    nzhi_in = (comm->myloc[2]+1) * 
+      (static_cast<int> (nz_pppm/slab_volfactor)) / comm->procgrid[2] - 1;
+
+    // nlower,nupper = stencil size for mapping particles to PPPMCuda grid
+
+    nlower = -(order-1)/2;
+    nupper = order/2;
+
+    // shift values for particle <-> grid mapping
+    // add/subtract OFFSET to avoid int(-0.75) = 0 when want it to be -1
+
+    if (order % 2) shift = OFFSET + 0.5;
+    else shift = OFFSET;
+    if (order % 2) shiftone = 0.0;
+    else shiftone = 0.5;
+
+    // nlo_out,nhi_out = lower/upper limits of the 3d sub-brick of
+    //   global PPPMCuda grid that my particles can contribute charge to
+    // effectively nlo_in,nhi_in + ghost cells
+    // nlo,nhi = global coords of grid pt to "lower left" of smallest/largest
+    //           position a particle in my box can be at
+    // dist[3] = particle position bound = subbox + skin/2.0 + qdist
+    //   qdist = offset due to TIP4P fictitious charge
+    //   convert to triclinic if necessary
+    // nlo_out,nhi_out = nlo,nhi + stencil size for particle mapping
+    // for slab PPPMCuda, assign z grid as if it were not extended
+
+
+    triclinic = domain->triclinic;
+    double *prd,*sublo,*subhi;
+
+    // a triclinic box is rejected by the error check at the top of init(),
+    // so the else branches below are presumably never taken
+    if (triclinic == 0) {
+      prd = domain->prd;
+      boxlo = domain->boxlo;
+      sublo = domain->sublo;
+      subhi = domain->subhi;
+    } else {
+      prd = domain->prd_lamda;
+      boxlo = domain->boxlo_lamda;
+      sublo = domain->sublo_lamda;
+      subhi = domain->subhi_lamda;
+    }
+
+    double xprd = prd[0];
+    double yprd = prd[1];
+    double zprd = prd[2];
+    double zprd_slab = zprd*slab_volfactor;
+
+    double dist[3];
+    double cuthalf = 0.5*neighbor->skin + qdist;
+    if (triclinic == 0) dist[0] = dist[1] = dist[2] = cuthalf;
+    else {
+      dist[0] = cuthalf/domain->prd[0];
+      dist[1] = cuthalf/domain->prd[1];
+      dist[2] = cuthalf/domain->prd[2];
+    }
+    
+    int nlo,nhi;
+    
+    nlo = static_cast<int> ((sublo[0]-dist[0]-boxlo[0]) * 
+			    nx_pppm/xprd + shift) - OFFSET;
+    nhi = static_cast<int> ((subhi[0]+dist[0]-boxlo[0]) * 
+			    nx_pppm/xprd + shift) - OFFSET;
+    nxlo_out = nlo + nlower;
+    nxhi_out = nhi + nupper;
+
+    nlo = static_cast<int> ((sublo[1]-dist[1]-boxlo[1]) * 
+			    ny_pppm/yprd + shift) - OFFSET;
+    nhi = static_cast<int> ((subhi[1]+dist[1]-boxlo[1]) * 
+			    ny_pppm/yprd + shift) - OFFSET;
+    nylo_out = nlo + nlower;
+    nyhi_out = nhi + nupper;
+
+    nlo = static_cast<int> ((sublo[2]-dist[2]-boxlo[2]) * 
+			    nz_pppm/zprd_slab + shift) - OFFSET;
+    nhi = static_cast<int> ((subhi[2]+dist[2]-boxlo[2]) * 
+			    nz_pppm/zprd_slab + shift) - OFFSET;
+    nzlo_out = nlo + nlower;
+    nzhi_out = nhi + nupper;
+
+    // for slab PPPMCuda, change the grid boundary for processors at +z end
+    //   to include the empty volume between periodically repeating slabs
+    // for slab PPPMCuda, want charge data communicated from -z proc to +z proc,
+    //   but not vice versa, also want field data communicated from +z proc to
+    //   -z proc, but not vice versa
+    // this is accomplished by nzhi_in = nzhi_out on +z end (no ghost cells)
+
+    if (slabflag && ((comm->myloc[2]+1) == (comm->procgrid[2]))) {
+      nzhi_in =  nz_pppm - 1;
+      nzhi_out = nz_pppm - 1;
+    }
+  
+    // nlo_ghost,nhi_ghost = # of planes I will recv from 6 directions
+    //   that overlay domain I own
+    // proc in that direction tells me via sendrecv()
+    // if no neighbor proc, value is from self since I have ghosts regardless
+
+    int nplanes;
+    MPI_Status status;
+
+    nplanes = nxlo_in - nxlo_out;
+    if (comm->procneigh[0][0] != me)
+      MPI_Sendrecv(&nplanes,1,MPI_INT,comm->procneigh[0][0],0,
+		   &nxhi_ghost,1,MPI_INT,comm->procneigh[0][1],0,
+		   world,&status);
+    else nxhi_ghost = nplanes;
+
+    nplanes = nxhi_out - nxhi_in;
+    if (comm->procneigh[0][1] != me)
+      MPI_Sendrecv(&nplanes,1,MPI_INT,comm->procneigh[0][1],0,
+		   &nxlo_ghost,1,MPI_INT,comm->procneigh[0][0],
+		   0,world,&status);
+    else nxlo_ghost = nplanes;
+
+    nplanes = nylo_in - nylo_out;
+    if (comm->procneigh[1][0] != me)
+      MPI_Sendrecv(&nplanes,1,MPI_INT,comm->procneigh[1][0],0,
+		   &nyhi_ghost,1,MPI_INT,comm->procneigh[1][1],0,
+		   world,&status);
+    else nyhi_ghost = nplanes;
+
+    nplanes = nyhi_out - nyhi_in;
+    if (comm->procneigh[1][1] != me)
+      MPI_Sendrecv(&nplanes,1,MPI_INT,comm->procneigh[1][1],0,
+		   &nylo_ghost,1,MPI_INT,comm->procneigh[1][0],0,
+		   world,&status);
+    else nylo_ghost = nplanes;
+
+    nplanes = nzlo_in - nzlo_out;
+    if (comm->procneigh[2][0] != me)
+      MPI_Sendrecv(&nplanes,1,MPI_INT,comm->procneigh[2][0],0,
+		   &nzhi_ghost,1,MPI_INT,comm->procneigh[2][1],0,
+		   world,&status);
+    else nzhi_ghost = nplanes;
+
+    nplanes = nzhi_out - nzhi_in;
+    if (comm->procneigh[2][1] != me)
+      MPI_Sendrecv(&nplanes,1,MPI_INT,comm->procneigh[2][1],0,
+		   &nzlo_ghost,1,MPI_INT,comm->procneigh[2][0],0,
+		   world,&status);
+    else nzlo_ghost = nplanes;
+
+    // test that ghost overlap is not bigger than my sub-domain
+
+    int flag = 0;
+    if (nxlo_ghost > nxhi_in-nxlo_in+1) flag = 1;
+    if (nxhi_ghost > nxhi_in-nxlo_in+1) flag = 1;
+    if (nylo_ghost > nyhi_in-nylo_in+1) flag = 1;
+    if (nyhi_ghost > nyhi_in-nylo_in+1) flag = 1;
+    if (nzlo_ghost > nzhi_in-nzlo_in+1) flag = 1;
+    if (nzhi_ghost > nzhi_in-nzlo_in+1) flag = 1;
+
+    // every proc must agree before leaving the loop or lowering the order
+    int flag_all;
+    MPI_Allreduce(&flag,&flag_all,1,MPI_INT,MPI_SUM,world);
+
+    if (flag_all == 0) break;
+    order--;
+  }
+
+  if (order == 0) error->all("PPPMCuda order has been reduced to 0");
+  //printf("PPPMCuda: order is %i\n");
+
+
+
+  // decomposition of FFT mesh
+  // global indices range from 0 to N-1
+  // proc owns entire x-dimension, clump of columns in y,z dimensions
+  // npey_fft,npez_fft = # of procs in y,z dims
+  // if nprocs is small enough, proc can own 1 or more entire xy planes,
+  //   else proc owns 2d sub-blocks of yz plane
+  // me_y,me_z = which proc (0-npe_fft-1) I am in y,z dimensions
+  // nlo_fft,nhi_fft = lower/upper limit of the section
+  //   of the global FFT mesh that I own
+
+  int npey_fft,npez_fft;
+  if (nz_pppm >= nprocs) {
+    npey_fft = 1;
+    npez_fft = nprocs;
+  } else procs2grid2d(nprocs,ny_pppm,nz_pppm,&npey_fft,&npez_fft);
+
+  int me_y = me % npey_fft;
+  int me_z = me / npey_fft;
+
+  nxlo_fft = 0;
+  nxhi_fft = nx_pppm - 1;
+  nylo_fft = me_y*ny_pppm/npey_fft;
+  nyhi_fft = (me_y+1)*ny_pppm/npey_fft - 1;
+  nzlo_fft = me_z*nz_pppm/npez_fft;
+  nzhi_fft = (me_z+1)*nz_pppm/npez_fft - 1;
+
+  // PPPMCuda grid for this proc, including ghosts
+
+  ngrid = (nxhi_out-nxlo_out+1) * (nyhi_out-nylo_out+1) *
+    (nzhi_out-nzlo_out+1);
+
+  // FFT arrays on this proc, without ghosts
+  // nfft = FFT points in FFT decomposition on this proc
+  // nfft_brick = FFT points in 3d brick-decomposition on this proc
+  // nfft_both = greater of 2 values
+
+  nfft = (nxhi_fft-nxlo_fft+1) * (nyhi_fft-nylo_fft+1) *
+    (nzhi_fft-nzlo_fft+1);
+  int nfft_brick = (nxhi_in-nxlo_in+1) * (nyhi_in-nylo_in+1) *
+    (nzhi_in-nzlo_in+1);
+  nfft_both = MAX(nfft,nfft_brick);
+
+  // buffer space for use in brick2fft and fillbrick
+  // idel = max # of ghost planes to send or recv in +/- dir of each dim
+  // nx,ny,nz = owned planes (including ghosts) in each dim
+  // nxx,nyy,nzz = max # of grid cells to send in each dim
+  // nbuf = max in any dim, augment by 3x for components of vd_xyz in fillbrick
+
+  int idelx,idely,idelz,nx,ny,nz,nxx,nyy,nzz;
+
+  idelx = MAX(nxlo_ghost,nxhi_ghost);
+  idelx = MAX(idelx,nxhi_out-nxhi_in);
+  idelx = MAX(idelx,nxlo_in-nxlo_out);
+
+  idely = MAX(nylo_ghost,nyhi_ghost);
+  idely = MAX(idely,nyhi_out-nyhi_in);
+  idely = MAX(idely,nylo_in-nylo_out);
+
+  idelz = MAX(nzlo_ghost,nzhi_ghost);
+  idelz = MAX(idelz,nzhi_out-nzhi_in);
+  idelz = MAX(idelz,nzlo_in-nzlo_out);
+
+  nx = nxhi_out - nxlo_out + 1;
+  ny = nyhi_out - nylo_out + 1;
+  nz = nzhi_out - nzlo_out + 1;
+
+  nxx = idelx * ny * nz;
+  nyy = idely * nx * nz;
+  nzz = idelz * nx * ny;
+
+  nbuf = MAX(nxx,nyy);
+  nbuf = MAX(nbuf,nzz);
+  nbuf *= 3;
+
+  // print stats
+
+  int ngrid_max,nfft_both_max,nbuf_max;
+  MPI_Allreduce(&ngrid,&ngrid_max,1,MPI_INT,MPI_MAX,world);
+  MPI_Allreduce(&nfft_both,&nfft_both_max,1,MPI_INT,MPI_MAX,world);
+  MPI_Allreduce(&nbuf,&nbuf_max,1,MPI_INT,MPI_MAX,world);
+
+  if (me == 0) {
+    if (screen) fprintf(screen,"  brick FFT buffer size/proc = %d %d %d\n",
+			ngrid_max,nfft_both_max,nbuf_max);
+    if (logfile) fprintf(logfile,"  brick FFT buffer size/proc = %d %d %d\n",
+			 ngrid_max,nfft_both_max,nbuf_max);
+  }
+
+  // mirror the grid description into the shared PPPM data block
+cuda_shared_pppm* ap=&(cuda->shared_data.pppm);
+
+   ap->density_intScale=density_intScale;
+   ap->nxlo_in=nxlo_in;
+   ap->nxhi_in=nxhi_in;
+   ap->nxlo_out=nxlo_out;
+   ap->nxhi_out=nxhi_out;
+   ap->nylo_in=nylo_in;
+   ap->nyhi_in=nyhi_in;
+   ap->nylo_out=nylo_out;
+   ap->nyhi_out=nyhi_out;
+   ap->nzlo_in=nzlo_in;
+   ap->nzhi_in=nzhi_in;
+   ap->nzlo_out=nzlo_out;
+   ap->nzhi_out=nzhi_out;
+   // NOTE(review): the next six lines overwrite the n?lo_in/n?hi_in fields
+   // stored just above with the FFT-decomposition bounds; this looks like it
+   // was meant to fill separate *_fft fields - confirm against the
+   // cuda_shared_pppm declaration
+   ap->nxlo_in=nxlo_fft;
+   ap->nxhi_in=nxhi_fft;
+   ap->nylo_in=nylo_fft;
+   ap->nyhi_in=nyhi_fft;
+   ap->nzlo_in=nzlo_fft;
+   ap->nzhi_in=nzhi_fft;
+   ap->nx_pppm=nx_pppm;
+   ap->ny_pppm=ny_pppm;
+   ap->nz_pppm=nz_pppm;
+   ap->qqrd2e=qqrd2e;
+   ap->order=order;
+   ap->nmax=nmax;
+   ap->nlocal=atom->nlocal;
+   // NOTE(review): delxinv/delyinv/delzinv are assigned in setup(); the
+   // values copied here are whatever the members hold when init() runs
+   ap->delxinv=delxinv;
+   ap->delyinv=delyinv;
+   ap->delzinv=delzinv;
+   ap->nlower=nlower;
+   ap->nupper=nupper;
+   ap->shiftone=shiftone;
+   
+  // allocate K-space dependent memory
+
+
+  allocate();
+
+  // pre-compute Green's function denominator expansion
+  // pre-compute 1d charge distribution coefficients
+
+  compute_gf_denom();
+  compute_rho_coeff();
+}
+
+/* ----------------------------------------------------------------------
+   adjust PPPMCuda coeffs, called initially and whenever volume has changed 
+------------------------------------------------------------------------- */
+
+// Recomputes the box-dependent PPPM quantities (volume, inverse grid
+// spacings, unit k-vectors) from the current domain and hands them to the
+// Cuda_PPPM_* / pppm_device_* routines.  The host loops that previously
+// filled fkx/fky/fkz, vg and greensfn were already disabled (commented out)
+// and have been dropped together with the locals only they used.
+void PPPMCuda::setup()
+{
+  double *prd;
+  cu_gf_b->upload();
+
+  // volume-dependent factors
+  // adjust z dimension for 2d slab PPPMCuda
+  // z dimension for 3d PPPMCuda is zprd since slab_volfactor = 1.0
+
+  if (triclinic == 0) prd = domain->prd;
+  else prd = domain->prd_lamda;
+
+  double xprd = prd[0];
+  double yprd = prd[1];
+  double zprd = prd[2];
+  double zprd_slab = zprd*slab_volfactor;
+  volume = xprd * yprd * zprd_slab;
+
+  delxinv = nx_pppm/xprd;
+  delyinv = ny_pppm/yprd;
+  delzinv = nz_pppm/zprd_slab;
+
+  delvolinv = delxinv*delyinv*delzinv;
+
+  double unitkx = (2.0*PI/xprd);
+  double unitky = (2.0*PI/yprd);
+  double unitkz = (2.0*PI/zprd_slab);
+
+  // fkx,fky,fkz for my FFT grid pts and the virial coefficients
+  // (set up by the call below instead of on the host)
+
+  Cuda_PPPM_Setup_fkxyz_vg(nx_pppm, ny_pppm,nz_pppm,unitkx,unitky,unitkz,g_ewald);
+
+  // modified (Hockney-Eastwood) Coulomb Green's function
+  // nbx,nby,nbz bounded the -nb..+nb image loops of the former host code
+  // and are passed on unchanged
+
+  const double hoc_factor = pow(-log(EPS_HOC),0.25);
+  int nbx = static_cast<int> ((g_ewald*xprd/(PI*nx_pppm)) * hoc_factor);
+  int nby = static_cast<int> ((g_ewald*yprd/(PI*ny_pppm)) * hoc_factor);
+  int nbz = static_cast<int> ((g_ewald*zprd_slab/(PI*nz_pppm)) * hoc_factor);
+
+  Cuda_PPPM_setup_greensfn(nx_pppm,ny_pppm,nz_pppm,unitkx,unitky,unitkz,g_ewald,
+                           nbx,nby,nbz,xprd,yprd,zprd_slab);
+
+#ifdef FFT_CUFFT
+  cu_vdx_brick->upload();
+  cu_vdy_brick->upload();
+  cu_vdz_brick->upload();
+#endif
+  cu_rho_coeff->upload();
+  cu_density_brick->memset_device(0);
+  pppm_device_init_setup(&cuda->shared_data,shiftone,delxinv,delyinv,delzinv,nlower,nupper);
+}
+
+/* ----------------------------------------------------------------------
+   compute the PPPMCuda long-range force, energy, virial 
+------------------------------------------------------------------------- */
+
+// Seconds elapsed between two clock_gettime() samples.
+static inline double pppm_cuda_elapsed(const timespec &start, const timespec &stop)
+{
+  return (stop.tv_sec-start.tv_sec+1.0*(stop.tv_nsec-start.tv_nsec)/1000000000);
+}
+
+// Runs one PPPM evaluation: (re)sizes the per-atom grid map when needed,
+// maps particles to the grid, builds the charge density, solves the Poisson
+// problem, interpolates forces, and finally reduces energy (if eflag) and
+// virial (if vflag) over all procs.  Each stage is timed into
+// cuda->shared_data.cuda_timings.
+void PPPMCuda::compute(int eflag, int vflag)
+{
+  cuda_shared_atom* cu_atom = & cuda->shared_data.atom;
+
+  int i;
+  timespec starttime;
+  timespec endtime;
+  timespec starttotal;
+  timespec endtotal;
+
+  // convert atoms from box to lamda coords
+
+  if (triclinic == 0) boxlo = domain->boxlo;
+  else {
+    boxlo = domain->boxlo_lamda;
+    domain->x2lamda(atom->nlocal);
+  }
+
+  // extend size of per-atom arrays if necessary
+
+  if ((cu_atom->update_nmax)||(old_nmax==0)) {
+    memory->destroy(part2grid);
+    nmax = atom->nmax;
+    memory->create(part2grid,nmax,3,"pppm:part2grid");
+    delete cu_part2grid;
+    // adev_data_array comes from new dev_array[1], so it must be released
+    // with the array form of delete (plain delete here was undefined behavior)
+    delete [] adev_data_array;
+    adev_data_array=new dev_array[1];
+    cu_part2grid = new cCudaData<int  , int   , yx > ((int*)part2grid,adev_data_array, nmax,3);
+
+    pppm_device_update(&cuda->shared_data,cu_part2grid->dev_data(),atom->nlocal,atom->nmax);
+    old_nmax=nmax;
+  }
+  if(cu_atom->update_nlocal) {pppm_update_nlocal(cu_atom->nlocal);}
+
+  energy = 0.0;
+  if (vflag)
+  {
+    for (i = 0; i < 6; i++) virial[i] = 0.0;
+    cu_virial->memset_device(0);
+  }
+  if(eflag) cu_energy->memset_device(0);
+  clock_gettime(CLOCK_REALTIME,&starttotal);
+
+  // find grid points for all my particles
+  // map my particle charge onto my local 3d density grid
+
+  clock_gettime(CLOCK_REALTIME,&starttime);
+  particle_map();
+  clock_gettime(CLOCK_REALTIME,&endtime);
+  cuda->shared_data.cuda_timings.pppm_particle_map+=pppm_cuda_elapsed(starttime,endtime);
+
+  //cu_part2grid->download();
+  clock_gettime(CLOCK_REALTIME,&starttime);
+  make_rho();
+  clock_gettime(CLOCK_REALTIME,&endtime);
+  cuda->shared_data.cuda_timings.pppm_make_rho+=pppm_cuda_elapsed(starttime,endtime);
+
+  // all procs communicate density values from their ghost cells
+  //   to fully sum contribution in their 3d bricks
+  // remap from 3d decomposition to FFT decomposition
+
+  clock_gettime(CLOCK_REALTIME,&starttime);
+
+  if(comm->nprocs>1)
+  {
+    // multi-proc run: bring the density back to the host and use brick2fft()
+    cu_density_brick->download();
+    brick2fft();
+  }
+  else
+  {
+   #ifdef FFT_CUFFT
+    pppm_initfftdata(&cuda->shared_data,(PPPM_FLOAT*)cu_density_brick->dev_data(),(FFT_FLOAT*)cu_work2->dev_data());
+   #endif
+  }
+
+  clock_gettime(CLOCK_REALTIME,&endtime);
+  cuda->shared_data.cuda_timings.pppm_brick2fft+=pppm_cuda_elapsed(starttime,endtime);
+
+  // compute potential gradient on my FFT grid and
+  //   portion of e_long on this proc's FFT grid
+  // return gradients (electric fields) in 3d brick decomposition
+
+  clock_gettime(CLOCK_REALTIME,&starttime);
+  poisson(eflag,vflag);
+  clock_gettime(CLOCK_REALTIME,&endtime);
+  cuda->shared_data.cuda_timings.pppm_poisson+=pppm_cuda_elapsed(starttime,endtime);
+
+  // all procs communicate E-field values to fill ghost cells
+  //   surrounding their 3d bricks
+
+  // not necessary since all the calculations are done on one proc
+
+  // calculate the force on my particles
+  //cu_vdx_brick->download();
+  //cu_vdy_brick->download();
+  //cu_vdz_brick->download();
+
+  clock_gettime(CLOCK_REALTIME,&starttime);
+  fieldforce();
+  clock_gettime(CLOCK_REALTIME,&endtime);
+  cuda->shared_data.cuda_timings.pppm_fieldforce+=pppm_cuda_elapsed(starttime,endtime);
+
+  clock_gettime(CLOCK_REALTIME,&endtotal);
+  cuda->shared_data.cuda_timings.pppm_compute+=pppm_cuda_elapsed(starttotal,endtotal);
+
+  // sum energy across procs and add in volume-dependent term
+
+  if (eflag) {
+    double energy_all;
+    MPI_Allreduce(&energy,&energy_all,1,MPI_DOUBLE,MPI_SUM,world);
+    energy = energy_all;
+
+    energy *= 0.5*volume;
+    energy -= g_ewald*qsqsum/1.772453851 +
+      0.5*PI*qsum*qsum / (g_ewald*g_ewald*volume);
+    energy *= qqrd2e;
+  }
+
+  // sum virial across procs
+
+  if (vflag) {
+    double virial_all[6];
+    MPI_Allreduce(virial,virial_all,6,MPI_DOUBLE,MPI_SUM,world);
+    for (i = 0; i < 6; i++) virial[i] = 0.5*qqrd2e*volume*virial_all[i];
+  }
+
+  // 2d slab correction
+
+  if (slabflag) slabcorr(eflag);
+
+  // convert atoms back from lamda to box coords
+
+  if (triclinic) domain->lamda2x(atom->nlocal);
+
+  if(firstpass) firstpass=false;
+}
+
+
+/* ----------------------------------------------------------------------
+   allocate memory that depends on # of K-vectors and order 
+------------------------------------------------------------------------- */
+
+
+void PPPMCuda::allocate()
+{
+	//printf("PPPMCuda::allocate START Mem: %i\n",CudaWrapper_CheckMemUseage());
+/*if(sizeof(CUDA_FLOAT)==sizeof(float)) printf("PPPMCuda: Using single precision\n");
+
+#ifdef PPPM_PRECISION
+if(sizeof(PPPM_FLOAT)==sizeof(float)) printf("PPPMCuda: Using single precision for pppm core\n");
+if(sizeof(PPPM_FLOAT)==sizeof(double)) printf("PPPMCuda: Using double precision for pppm core\n");
+#endif
+#ifdef ENERGY_PRECISION
+if(sizeof(ENERGY_FLOAT)==sizeof(float)) printf("PPPMCuda: Using single precision for energy\n");
+if(sizeof(ENERGY_FLOAT)==sizeof(double)) printf("PPPMCuda: Using double precision for energy\n");
+#endif
+#ifdef ENERGY_PRECISION
+if(sizeof(FFT_FLOAT)==sizeof(float)) printf("PPPMCuda: Using single precision for fft\n");
+if(sizeof(FFT_FLOAT)==sizeof(double)) printf("PPPMCuda: Using double precision for fft\n");
+#endif
+#ifdef X_PRECISION
+if(sizeof(X_FLOAT)==sizeof(float)) printf("PPPMCuda: Using single precision for positions\n");
+if(sizeof(X_FLOAT)==sizeof(double)) printf("PPPMCuda: Using double precision for positions\n");
+#endif
+#ifdef F_PRECISION
+if(sizeof(F_FLOAT)==sizeof(float)) printf("PPPMCuda: Using single precision for forces\n");
+if(sizeof(F_FLOAT)==sizeof(double)) printf("PPPMCuda: Using double precision for forces\n");
+#endif*/
+
+//if(sizeof(PPPM_FLOAT)==sizeof(float)) printf("PPPMCuda: Using single precision\n");
+  // one dev_array slot is consumed per cCudaData created below (20 slots, 20 uses)
+  struct dev_array* dev_tmp=new struct dev_array[20];
+int n_cudata=0;
+  // NOTE(review): dev_tmp is never deleted in this function - confirm who owns it
+
+  memory->create3d_offset(density_brick,nzlo_out,nzhi_out,nylo_out,nyhi_out,
+			  nxlo_out,nxhi_out,"pppm:density_brick");
+  memory->create3d_offset(density_brick_int,nzlo_out,nzhi_out,nylo_out,nyhi_out,
+			  nxlo_out,nxhi_out,"pppm:density_brick_int");
+
+  // each brick is wrapped starting at its lowest-index element, with the product of its extents as length
+  cu_density_brick = new cCudaData<double, PPPM_FLOAT, x> ((double*) &(density_brick[nzlo_out][nylo_out][nxlo_out]), & (dev_tmp[n_cudata++]),
+  				   (nzhi_out-nzlo_out+1)*(nyhi_out-nylo_out+1)*(nxhi_out-nxlo_out+1));
+
+  cu_density_brick_int = new cCudaData<int, int, x> ((int*) &(density_brick_int[nzlo_out][nylo_out][nxlo_out]), & (dev_tmp[n_cudata++]),
+  				   (nzhi_out-nzlo_out+1)*(nyhi_out-nylo_out+1)*(nxhi_out-nxlo_out+1));
+
+  memory->create3d_offset(vdx_brick,nzlo_out,nzhi_out,nylo_out,nyhi_out,
+			  nxlo_out,nxhi_out,"pppm:vdx_brick");
+  memory->create3d_offset(vdx_brick_tmp,nzlo_out,nzhi_out,nylo_out,nyhi_out,
+			  nxlo_out,nxhi_out,"pppm:vdx_brick_tmp");
+
+  cu_vdx_brick = new cCudaData<double, PPPM_FLOAT, x> ((double*) &(vdx_brick[nzlo_out][nylo_out][nxlo_out]), & (dev_tmp[n_cudata++]),
+  				   (nzhi_out-nzlo_out+1)*(nyhi_out-nylo_out+1)*(nxhi_out-nxlo_out+1));
+
+  memory->create3d_offset(vdy_brick,nzlo_out,nzhi_out,nylo_out,nyhi_out,
+			  nxlo_out,nxhi_out,"pppm:vdy_brick");
+  cu_vdy_brick = new cCudaData<double, PPPM_FLOAT, x> ((double*) &(vdy_brick[nzlo_out][nylo_out][nxlo_out]), & (dev_tmp[n_cudata++]),
+  				   (nzhi_out-nzlo_out+1)*(nyhi_out-nylo_out+1)*(nxhi_out-nxlo_out+1));
+
+  memory->create3d_offset(vdz_brick,nzlo_out,nzhi_out,nylo_out,nyhi_out,
+			  nxlo_out,nxhi_out,"pppm:vdz_brick");
+  cu_vdz_brick = new cCudaData<double, PPPM_FLOAT, x> ((double*) &(vdz_brick[nzlo_out][nylo_out][nxlo_out]), & (dev_tmp[n_cudata++]),
+  				   (nzhi_out-nzlo_out+1)*(nyhi_out-nylo_out+1)*(nxhi_out-nxlo_out+1));
+
+  memory->create(density_fft,nfft_both,"pppm:density_fft");
+
+  cu_density_fft = new cCudaData<double, PPPM_FLOAT, x> (density_fft, & (dev_tmp[n_cudata++]),nfft_both);
+
+  cu_energy = new cCudaData<double, ENERGY_FLOAT, x> (NULL, &(dev_tmp[n_cudata++]),ny_pppm*nz_pppm);    
+  cu_virial = new cCudaData<double, ENERGY_FLOAT, x> (NULL, &(dev_tmp[n_cudata++]),ny_pppm*nz_pppm*6);    
+  // NOTE(review): greensfn is created with nfft_both entries but wrapped with nx_pppm*ny_pppm*nz_pppm - confirm these agree
+  memory->create(greensfn,nfft_both,"pppm:greensfn");
+  cu_greensfn = new cCudaData<double, PPPM_FLOAT, x> (greensfn, & (dev_tmp[n_cudata++]) , nx_pppm*ny_pppm*nz_pppm);
+
+  memory->create(work1,2*nx_pppm*ny_pppm*nz_pppm,"pppm:work1");
+  memory->create(work2,2*nx_pppm*ny_pppm*nz_pppm,"pppm:work2");
+  memory->create(work3,2*nx_pppm*ny_pppm*nz_pppm,"pppm:work3");
+  
+  cu_work1 = new cCudaData<double, FFT_FLOAT, x> (work1, & (dev_tmp[n_cudata++]) , 2*nx_pppm*ny_pppm*nz_pppm);
+  cu_work2 = new cCudaData<double, FFT_FLOAT, x> (work2, & (dev_tmp[n_cudata++]) , 2*nx_pppm*ny_pppm*nz_pppm);
+  cu_work3 = new cCudaData<double, FFT_FLOAT, x> (work3, & (dev_tmp[n_cudata++]) , 2*nx_pppm*ny_pppm*nz_pppm);
+  
+
+  memory->create(fkx,nx_pppm,"pppmcuda:fkx");
+  cu_fkx = new cCudaData<double, PPPM_FLOAT, x> (fkx, & (dev_tmp[n_cudata++]) , nx_pppm);
+  memory->create(fky,ny_pppm,"pppmcuda:fky");
+  cu_fky = new cCudaData<double, PPPM_FLOAT, x> (fky, & (dev_tmp[n_cudata++]) , ny_pppm);
+  memory->create(fkz,nz_pppm,"pppmcuda:fkz");
+  cu_fkz = new cCudaData<double, PPPM_FLOAT, x> (fkz, & (dev_tmp[n_cudata++]) , nz_pppm);
+
+  memory->create(vg,nfft_both,6,"pppm:vg");
+
+  cu_vg = new cCudaData<double, PPPM_FLOAT, xy> ((double*)vg, & (dev_tmp[n_cudata++]) , nfft_both,6);
+
+  memory->create(buf1,nbuf,"pppm:buf1");
+  memory->create(buf2,nbuf,"pppm:buf2");
+
+
+  // summation coeffs
+
+
+  gf_b = new double[order];
+  cu_gf_b = new cCudaData<double,PPPM_FLOAT,x> (gf_b, &(dev_tmp[n_cudata++]) , order);
+  memory->create2d_offset(rho1d,3,-order/2,order/2,"pppm:rho1d");
+  memory->create2d_offset(rho_coeff,order,(1-order)/2,order/2,"pppm:rho_coeff");
+
+  cu_rho_coeff = new cCudaData<double, PPPM_FLOAT, x> ((double*) &(rho_coeff[0][(1-order)/2]), & (dev_tmp[n_cudata++]) , order*(order/2-(1-order)/2+1));
+
+  debugdata=new PPPM_FLOAT[100];
+  cu_debugdata = new cCudaData<PPPM_FLOAT, PPPM_FLOAT, x> (debugdata,& (dev_tmp[n_cudata++]),100);
+  cu_flag = new cCudaData<int, int, x> (&global_flag,& (dev_tmp[n_cudata++]),3);
+  // NOTE(review): cu_flag wraps &global_flag with a length of 3 - confirm global_flag provides that many ints
+  // create 2 FFTs and a Remap
+  // 1st FFT keeps data in FFT decomposition
+  // 2nd FFT returns data in 3d brick decomposition
+  // remap takes data from 3d brick to FFT decomposition
+
+  int tmp;
+  // tmp is passed by address to both FFT3dCuda constructors; its value is not read here
+
+
+
+  fft1c = new FFT3dCuda(lmp,world,nx_pppm,ny_pppm,nz_pppm,
+		   nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft,
+		   nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft,
+		   0,0,&tmp,true);
+
+  fft2c = new FFT3dCuda(lmp,world,nx_pppm,ny_pppm,nz_pppm,
+		   nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft,
+		   nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in,
+		   0,0,&tmp,false);
+
+ 
+#ifdef FFT_CUFFT  
+  fft1c->set_cudata(cu_work2->dev_data(),cu_work1->dev_data());
+  fft2c->set_cudata(cu_work2->dev_data(),cu_work3->dev_data());
+#endif
+
+  remap = new Remap(lmp,world,
+		    nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in,
+		    nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft,
+		    1,0,0,2);
+
+  // hand the device pointers, grid bounds and parameters to pppm_device_init()
+pppm_device_init(cu_density_brick->dev_data(), cu_vdx_brick->dev_data(), cu_vdy_brick->dev_data(), cu_vdz_brick->dev_data(), cu_density_fft->dev_data(),cu_energy->dev_data(),cu_virial->dev_data()
+	    , cu_work1->dev_data(), cu_work2->dev_data(), cu_work3->dev_data(), cu_greensfn->dev_data(), cu_fkx->dev_data(), cu_fky->dev_data(), cu_fkz->dev_data(), cu_vg->dev_data()
+	    ,nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in,nxlo_out,nxhi_out,nylo_out,nyhi_out,nzlo_out,nzhi_out,nx_pppm,ny_pppm,nz_pppm
+	    ,nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft,cu_gf_b->dev_data()
+	    ,qqrd2e,order,cu_rho_coeff->dev_data(),cu_debugdata->dev_data(),cu_density_brick_int->dev_data(),slabflag
+	 );  
+}
+
+
+
+/* ----------------------------------------------------------------------
+   deallocate memory that depends on # of K-vectors and order 
+ ---------------------------------------------------------------------- */
+
+void PPPMCuda::deallocate()
+{
+  memory->destroy3d_offset(density_brick,nzlo_out,nylo_out,nxlo_out);
+  memory->destroy3d_offset(vdx_brick,nzlo_out,nylo_out,nxlo_out);
+  memory->destroy3d_offset(vdy_brick,nzlo_out,nylo_out,nxlo_out);
+  memory->destroy3d_offset(vdz_brick,nzlo_out,nylo_out,nxlo_out);
+  // NOTE(review): density_brick_int and vdx_brick_tmp are also created in allocate() but have no destroy3d_offset here
+  density_brick = vdx_brick = vdy_brick = vdz_brick = NULL;
+
+  memory->destroy(density_fft);
+  memory->destroy(greensfn);
+  memory->destroy(work1);
+  memory->destroy(work2);
+  memory->destroy(vg);
+  // NOTE(review): work3 is created in allocate() but is not destroyed here
+  density_fft = NULL;
+  greensfn = NULL;
+  work1 = NULL;
+  work2 = NULL;
+  vg = NULL;
+
+  memory->destroy(fkx);
+  memory->destroy(fky);
+  memory->destroy(fkz);
+
+  fkx = NULL;
+  fky = NULL;
+  fkz = NULL;
+
+  delete cu_density_brick;
+  delete cu_density_brick_int;
+  delete cu_vdx_brick;
+  delete cu_vdy_brick;
+  delete cu_vdz_brick;
+  delete cu_density_fft;
+  delete cu_energy;
+  delete cu_virial;
+#ifdef FFT_CUFFT 
+  delete cu_greensfn;
+  delete cu_gf_b;
+  delete cu_vg;
+  delete cu_work1;
+  delete cu_work2;
+  delete cu_work3;
+  delete cu_fkx;
+  delete cu_fky;
+  delete cu_fkz;
+#endif
+  // NOTE(review): the nine objects above are created unconditionally in allocate() but deleted only with FFT_CUFFT
+  delete cu_flag;
+  delete cu_debugdata;
+  delete cu_rho_coeff;
+  // NOTE(review): the debugdata host array (new[] in allocate()) is not released here
+  
+  cu_vdx_brick = cu_vdy_brick = cu_vdz_brick = NULL;
+  cu_density_brick = NULL;
+  cu_density_brick_int = NULL;
+  cu_density_fft = NULL;
+  cu_energy=NULL;
+  cu_virial=NULL;
+#ifdef FFT_CUFFT
+  cu_greensfn = NULL;
+  cu_gf_b = NULL;
+  cu_work1 = cu_work2 = cu_work3 = NULL;
+  cu_vg = NULL;
+  cu_fkx = cu_fky = cu_fkz = NULL;
+#endif
+  
+  cu_flag = NULL;
+  cu_debugdata = NULL;
+  cu_rho_coeff = NULL;
+  cu_part2grid = NULL;
+  // cu_part2grid is only reset above, not deleted - presumably owned elsewhere; verify
+  memory->destroy(buf1);
+  memory->destroy(buf2);
+
+  delete [] gf_b;
+  gf_b = NULL;
+  memory->destroy2d_offset(rho1d,-order/2); rho1d = NULL;
+  memory->destroy2d_offset(rho_coeff,(1-order)/2); rho_coeff = NULL;
+  // NOTE(review): 'end' below is computed but never read
+  delete fft1c;
+  fft1c = NULL;
+  double end=CudaWrapper_CheckMemUseage()/1024/1024;
+  delete fft2c;
+  fft2c = NULL;
+  delete remap;
+  remap = NULL;
+  buf1 = NULL;
+  buf2 = NULL;
+}
+
+/* ----------------------------------------------------------------------
+   set size of FFT grid (nx,ny,nz_pppm) and g_ewald 
+-------------------------------------------------------------------------*/
+
+void PPPMCuda::set_grid()
+{
+  // see JCP 109, pg 7698 for derivation of coefficients
+  // higher order coefficients may be computed if needed
+  // acons is filled here, passed to rms() and diffpr(), and freed before the summary is printed
+  double **acons;
+  memory->create(acons,8,7,"pppm:acons");
+
+  acons[1][0] = 2.0 / 3.0;
+  acons[2][0] = 1.0 / 50.0;
+  acons[2][1] = 5.0 / 294.0;
+  acons[3][0] = 1.0 / 588.0;
+  acons[3][1] = 7.0 / 1440.0;
+  acons[3][2] = 21.0 / 3872.0;
+  acons[4][0] = 1.0 / 4320.0;
+  acons[4][1] = 3.0 / 1936.0;
+  acons[4][2] = 7601.0 / 2271360.0;
+  acons[4][3] = 143.0 / 28800.0;
+  acons[5][0] = 1.0 / 23232.0;
+  acons[5][1] = 7601.0 / 13628160.0;
+  acons[5][2] = 143.0 / 69120.0;
+  acons[5][3] = 517231.0 / 106536960.0;
+  acons[5][4] = 106640677.0 / 11737571328.0;
+  acons[6][0] = 691.0 / 68140800.0;
+  acons[6][1] = 13.0 / 57600.0;
+  acons[6][2] = 47021.0 / 35512320.0;
+  acons[6][3] = 9694607.0 / 2095994880.0;
+  acons[6][4] = 733191589.0 / 59609088000.0;
+  acons[6][5] = 326190917.0 / 11700633600.0;
+  acons[7][0] = 1.0 / 345600.0;
+  acons[7][1] = 3617.0 / 35512320.0;
+  acons[7][2] = 745739.0 / 838397952.0;
+  acons[7][3] = 56399353.0 / 12773376000.0;
+  acons[7][4] = 25091609.0 / 1560084480.0;
+  acons[7][5] = 1755948832039.0 / 36229939200000.0;
+  acons[7][6] = 4887769399.0 / 37838389248.0;
+
+  double q2 = qsqsum / force->dielectric;
+  bigint natoms = atom->natoms;
+
+  // use xprd,yprd,zprd even if triclinic so grid size is the same
+  // adjust z dimension for 2d slab PPPMCuda
+  // 3d PPPMCuda just uses zprd since slab_volfactor = 1.0
+
+  double xprd = domain->xprd;
+  double yprd = domain->yprd;
+  double zprd = domain->zprd;
+  double zprd_slab = zprd*slab_volfactor;
+  
+  // make initial g_ewald estimate
+  // based on desired error and real space cutoff
+  // fluid-occupied volume used to estimate real-space error
+  // zprd used rather than zprd_slab
+
+  double hx,hy,hz;
+
+  if (!gewaldflag)
+    g_ewald = sqrt(-log(precision*sqrt(natoms*cutoff*xprd*yprd*zprd) / 
+			(2.0*q2))) / cutoff;
+
+  // set optimal nx_pppm,ny_pppm,nz_pppm based on order and precision
+  // nz_pppm uses extended zprd_slab instead of zprd
+  // h = 1/g_ewald is upper bound on h such that h*g_ewald <= 1
+  // reduce it until precision target is met
+
+  if (!gridflag) {
+    double err;
+    hx = hy = hz = 1/g_ewald;  
+
+    nx_pppm = static_cast<int> (xprd/hx + 1);
+    ny_pppm = static_cast<int> (yprd/hy + 1);
+    nz_pppm = static_cast<int> (zprd_slab/hz + 1);
+    // per dimension: each pass evaluates rms() at the current spacing, then adds one grid point
+    err = rms(hx,xprd,natoms,q2,acons);
+    while (err > precision) {
+      err = rms(hx,xprd,natoms,q2,acons);
+      nx_pppm++;
+      hx = xprd/nx_pppm;
+    }
+
+    err = rms(hy,yprd,natoms,q2,acons);
+    while (err > precision) {
+      err = rms(hy,yprd,natoms,q2,acons);
+      ny_pppm++;
+      hy = yprd/ny_pppm;
+    }
+
+    err = rms(hz,zprd_slab,natoms,q2,acons);
+    while (err > precision) {
+      err = rms(hz,zprd_slab,natoms,q2,acons);
+      nz_pppm++;
+      hz = zprd_slab/nz_pppm;
+    }
+  }
+
+  // boost grid size until it is factorable
+
+  while (!factorable(nx_pppm)) nx_pppm++;
+  while (!factorable(ny_pppm)) ny_pppm++;
+  while (!factorable(nz_pppm)) nz_pppm++;
+
+  // if allowed try to change grid size until it is a power of a single prime factor
+  if(precisionmodify!='=')
+  {
+    if (me == 0) {
+      if (screen) {
+        fprintf(screen,"Initial grid = %d %d %d\n",nx_pppm,ny_pppm,nz_pppm);
+      }
+      if (logfile) {
+        fprintf(logfile,"Initial grid = %d %d %d\n",nx_pppm,ny_pppm,nz_pppm);
+      }
+    }
+  	make_power_of_prime(&nx_pppm);
+  	make_power_of_prime(&ny_pppm);
+  	make_power_of_prime(&nz_pppm);
+    if (me == 0) {
+      if (screen) {
+        fprintf(screen,"Modified grid = %d %d %d\n",nx_pppm,ny_pppm,nz_pppm);
+      }
+      if (logfile) {
+        fprintf(logfile,"Modified grid = %d %d %d\n",nx_pppm,ny_pppm,nz_pppm);
+      }
+    }
+  }
+  
+  // adjust g_ewald for new grid size
+
+  hx = xprd/nx_pppm;
+  hy = yprd/ny_pppm;
+  hz = zprd_slab/nz_pppm;
+
+  if (!gewaldflag) {
+    double gew1,gew2,dgew,f,fmid,hmin,rtb;
+    int ncount;
+    // bisection on g_ewald between 0 and 10/hmin; g_ewald is set before each diffpr() call, which presumably reads it
+    gew1 = 0.0;
+    g_ewald = gew1;
+    f = diffpr(hx,hy,hz,q2,acons);
+
+    hmin = MIN(hx,MIN(hy,hz));
+    gew2 = 10/hmin;
+    g_ewald = gew2;
+    fmid = diffpr(hx,hy,hz,q2,acons);
+
+    if (f*fmid >= 0.0) error->all("Cannot compute PPPMCuda G");
+    rtb = f < 0.0 ? (dgew=gew2-gew1,gew1) : (dgew=gew1-gew2,gew2);
+    ncount = 0;
+    while (fabs(dgew) > SMALL && fmid != 0.0) {
+      dgew *= 0.5;
+      g_ewald = rtb + dgew;
+      fmid = diffpr(hx,hy,hz,q2,acons);      
+      if (fmid <= 0.0) rtb = g_ewald;
+      ncount++;
+      if (ncount > LARGE) error->all("Cannot compute PPPMCuda G");
+    }
+  }
+
+  // final RMS precision
+
+  double lprx = rms(hx,xprd,natoms,q2,acons);
+  double lpry = rms(hy,yprd,natoms,q2,acons);
+  double lprz = rms(hz,zprd_slab,natoms,q2,acons);
+  double lpr = sqrt(lprx*lprx + lpry*lpry + lprz*lprz) / sqrt(3.0);
+  double spr = 2.0*q2 * exp(-g_ewald*g_ewald*cutoff*cutoff) / 
+    sqrt(natoms*cutoff*xprd*yprd*zprd_slab);
+
+  // free local memory
+
+  memory->destroy(acons);
+
+  // print info
+
+  if (me == 0) {
+    if (screen) {
+      fprintf(screen,"  G vector = %g\n",g_ewald);
+      fprintf(screen,"  grid = %d %d %d\n",nx_pppm,ny_pppm,nz_pppm);
+      fprintf(screen,"  stencil order = %d\n",order);
+      fprintf(screen,"  RMS precision = %g\n",MAX(lpr,spr));
+    }
+    if (logfile) {
+      fprintf(logfile,"  G vector = %g\n",g_ewald);
+      fprintf(logfile,"  grid = %d %d %d\n",nx_pppm,ny_pppm,nz_pppm);
+      fprintf(logfile,"  stencil order = %d\n",order);
+      fprintf(logfile,"  RMS precision = %g\n",MAX(lpr,spr));
+    }
+  }
+}
+
+
+/* ----------------------------------------------------------------------
+   replace grid size *n by a nearby power of a single prime (2, 3, 5 or 7)
+   precisionmodify selects the direction: '-' smaller, '+' larger, 'c' closest
+-------------------------------------------------------------------------*/
+
+void PPPMCuda::make_power_of_prime(int* n)
+{
+  // Replace *n by a nearby power of a single prime (2, 3, 5 or 7).
+  // precisionmodify selects the direction: '-' rounds down, '+' rounds up
+  // (only if the grid volume less than doubles), 'c' takes the closer one.
+  // A value that already is such a power is left unchanged.
+  if((precisionmodify!='+')&&(precisionmodify!='-')&&(precisionmodify!='c'))
+    {error->all("Unknown Option for PPPMCuda, assumeing '='");return;}
+  const int oldn=*n;
+  // candidates: powers of 2, 3, 5 and 7, each up to the first one >= 2000
+  int primelist[32];   // 11+7+5+4 = 27 entries are generated
+  int count=0;
+  const int bases[4]={2,3,5,7};
+  for(int b=0;b<4;b++)
+  {
+    int power=1;
+    while(power<2000) {power*=bases[b]; primelist[count++]=power;}
+  }
+
+  // ascending exchange sort; the list is tiny
+  for(int i=0;i<count-1;i++)
+    for(int j=0;j<count-i-1;j++)
+      if(primelist[j]>primelist[j+1])
+      {
+        const int a=primelist[j+1];
+        primelist[j+1]=primelist[j];
+        primelist[j]=a;
+      }
+
+  // largest candidate <= oldn (or the smallest candidate if none is);
+  // the index is tested before the element so no read leaves the list
+  int nextsmaller=0;
+  while((nextsmaller+1<count)&&(primelist[nextsmaller+1]<=oldn)) nextsmaller++;
+
+  // smallest candidate >= oldn (or the largest candidate if none is)
+  int nextlarger=count-1;
+  while((nextlarger>0)&&(primelist[nextlarger-1]>=oldn)) nextlarger--;
+
+  // cubes are formed in double because the int products overflow for large grids
+  const double vol=1.0*oldn*oldn*oldn;
+  const double volsmaller=1.0*primelist[nextsmaller]*primelist[nextsmaller]*primelist[nextsmaller];
+  const double vollarger=1.0*primelist[nextlarger]*primelist[nextlarger]*primelist[nextlarger];
+
+  if(precisionmodify=='-')
+    *n=primelist[nextsmaller];
+  if((precisionmodify=='+')&&(primelist[nextlarger]>=oldn)&&(vollarger<2.0*vol))
+    *n=primelist[nextlarger];
+  if(precisionmodify=='c')
+  {
+    const double factorsmaller=vol/volsmaller;
+    const double factorlarger=vollarger/vol;
+    if((factorlarger<factorsmaller)&&(factorlarger<2)) *n=primelist[nextlarger];
+    else *n=primelist[nextsmaller];
+  }
+
+  if((*n<0.75*oldn)&&(me == 0)) {
+    if (screen) fprintf(screen,"\n\npppm/cuda WARNING:   \t\tSignificantly lower gridsize than requested.\n\t\t\t\t\tYou should most likely use '=' or '+' as precision modify option\n");
+    if (logfile) fprintf(logfile,"\n\npppm/cuda WARNING:   \t\tSignificantly lower gridsize than requested.\n\t\t\t\t\tYou should most likely use '=' or '+' as precision modify option\n.");
+  }
+}
+
+
+
+/* ----------------------------------------------------------------------
+   find center grid pt for each of my particles
+   check that full stencil for the particle will fit in my 3d brick
+   store central grid pt indices in part2grid array 
+------------------------------------------------------------------------- */
+
+
+void PPPMCuda::particle_map()
+{
+  MYDBG(printf("# CUDA PPPMCuda::particle_map() ... start\n");) 
+  // a nonzero return from cuda_particle_map() is treated as "out of range atom":
+  // diagnostics are printed locally, then all procs abort via error->all()
+  cu_flag->memset_device(0);
+  int flag=cuda_particle_map(&cuda->shared_data,cu_flag->dev_data());
+  if(flag)
+  {
+    // fetch the debug buffer (presumably filled by the device code) and print it
+    cu_debugdata->download();
+    printf("Out of range atom: ");
+    printf("ID: %i ",atom->tag[int(debugdata[0])]);
+    printf("x: %e ",debugdata[7]);
+    printf("y: %e ",debugdata[8]);
+    printf("z: %e ",debugdata[9]);
+    printf("nx: %e ",debugdata[4]);
+    printf("ny: %e ",debugdata[5]);
+    printf("\n");
+
+    // repeat the stencil check on the host after cu_x->download() and list
+    // every local atom whose stencil does not fit into this proc's brick
+    cuda->cu_x->download();
+    double **x = atom->x;
+    const int nlocal = atom->nlocal;
+    for (int i = 0; i < nlocal; i++) {
+      const int nx = static_cast<int> ((x[i][0]-boxlo[0])*delxinv+shift) - OFFSET;
+      const int ny = static_cast<int> ((x[i][1]-boxlo[1])*delyinv+shift) - OFFSET;
+      const int nz = static_cast<int> ((x[i][2]-boxlo[2])*delzinv+shift) - OFFSET;
+
+      if (nx+nlower < nxlo_out || nx+nupper > nxhi_out ||
+          ny+nlower < nylo_out || ny+nupper > nyhi_out ||
+          nz+nlower < nzlo_out || nz+nupper > nzhi_out)
+        printf("Outside Atom: %i %e %e %e (%i %i %i)\n",i,x[i][0],x[i][1],x[i][2],nx,ny,nz);
+    }
+  }
+
+  // a flag raised on any proc stops the whole run
+  int flag_all;
+  MPI_Allreduce(&flag,&flag_all,1,MPI_INT,MPI_SUM,world);
+  if (flag_all) error->all("Out of range atoms - cannot compute PPPMCuda!");
+}
+
+/* ----------------------------------------------------------------------
+   create discretized "density" on section of global grid due to my particles
+   density(x,y,z) = charge "density" at grid points of my 3d brick
+   (nxlo:nxhi,nylo:nyhi,nzlo:nzhi) is extent of my brick (including ghosts)
+   in global grid 
+------------------------------------------------------------------------- */
+
+
+void PPPMCuda::make_rho()
+{
+  // the density assignment is delegated to cuda_make_rho()
+  cuda_make_rho(&cuda->shared_data, cu_flag->dev_data(), &density_intScale, nxhi_out, nxlo_out, nyhi_out, nylo_out,
+                nzhi_out, nzlo_out, cu_density_brick->dev_data(), cu_density_brick_int->dev_data());
+}
+/* ----------------------------------------------------------------------
+   FFT-based Poisson solver 
+------------------------------------------------------------------------- */
+void PPPMCuda::poisson(int eflag, int vflag)
+{
+  // Without FFT_CUFFT the base-class solver PPPM::poisson() is used.
+  // With FFT_CUFFT: forward FFT of the density, optional energy/virial sums,
+  // scaling, then per dimension a gradient step, a backward FFT and a copy
+  // into the vdx/vdy/vdz bricks.  The wall time spent inside the FFT calls
+  // is accumulated in poissontime (seconds).
+
+#ifndef FFT_CUFFT
+  PPPM::poisson(eflag,vflag);
+#else
+
+  timespec starttime;
+  timespec endtime;
+
+  // forward FFT of the charge density
+  clock_gettime(CLOCK_REALTIME,&starttime);
+  fft1c->compute(density_fft,work1,1);
+  clock_gettime(CLOCK_REALTIME,&endtime);
+  poissontime+=(endtime.tv_sec-starttime.tv_sec+1.0*(endtime.tv_nsec-starttime.tv_nsec)/1000000000);
+
+  // energy and virial contributions, added to the running totals
+
+  if (eflag || vflag) {
+    poisson_energy(nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft,vflag);
+    ENERGY_FLOAT gpuvirial[6];
+    energy+=sum_energy(cu_virial->dev_data(),cu_energy->dev_data(),nx_pppm,ny_pppm,nz_pppm,vflag,gpuvirial);
+    if(vflag)
+    {
+      for(int j=0;j<6;j++) virial[j]+=gpuvirial[j];
+    }
+  }
+
+  // scale by 1/total-grid-pts to get rho(k)
+  // multiply by Green's function to get V(k)
+
+  poisson_scale(nx_pppm,ny_pppm,nz_pppm);
+
+  // compute gradients of V(r) in each of 3 dims by transforming -ik*V(k)
+  // FFT leaves data in 3d brick decomposition
+  // copy it into inner portion of vdx,vdy,vdz arrays
+
+  // x direction gradient
+
+  poisson_xgrad(nx_pppm,ny_pppm,nz_pppm);
+
+  clock_gettime(CLOCK_REALTIME,&starttime);
+  fft2c->compute(work2,work2,-1);
+  clock_gettime(CLOCK_REALTIME,&endtime);
+  poissontime+=(endtime.tv_sec-starttime.tv_sec+1.0*(endtime.tv_nsec-starttime.tv_nsec)/1000000000);
+
+  poisson_vdx_brick(nxhi_out,nxlo_out,nyhi_out,nylo_out,nzhi_out,nzlo_out,nx_pppm,ny_pppm,nz_pppm);
+
+  // y direction gradient
+
+  poisson_ygrad(nx_pppm,ny_pppm,nz_pppm);
+
+  clock_gettime(CLOCK_REALTIME,&starttime);
+  fft2c->compute(work2,work2,-1);
+  clock_gettime(CLOCK_REALTIME,&endtime);
+  poissontime+=(endtime.tv_sec-starttime.tv_sec+1.0*(endtime.tv_nsec-starttime.tv_nsec)/1000000000);
+
+  poisson_vdy_brick(nxhi_out,nxlo_out,nyhi_out,nylo_out,nzhi_out,nzlo_out,nx_pppm,ny_pppm,nz_pppm);
+
+  // z direction gradient
+
+  poisson_zgrad(nx_pppm,ny_pppm,nz_pppm);
+
+  clock_gettime(CLOCK_REALTIME,&starttime);
+  fft2c->compute(work2,work2,-1);
+  clock_gettime(CLOCK_REALTIME,&endtime);
+  poissontime+=(endtime.tv_sec-starttime.tv_sec+1.0*(endtime.tv_nsec-starttime.tv_nsec)/1000000000);
+
+  poisson_vdz_brick(nxhi_out,nxlo_out,nyhi_out,nylo_out,nzhi_out,nzlo_out,nx_pppm,ny_pppm,nz_pppm);
+
+#endif
+}
+
+/*----------------------------------------------------------------------
+   interpolate from grid to get electric field & force on my particles 
+-------------------------------------------------------------------------*/
+
+void PPPMCuda::fieldforce()
+{
+  // the interpolation itself is delegated to cuda_fieldforce()
+  cuda_fieldforce(&cuda->shared_data, cu_flag);
+}
+
+
+
+
+/* ----------------------------------------------------------------------
+   perform and time the 4 FFTs required for N timesteps
+------------------------------------------------------------------------- */
+
+void PPPMCuda::timing(int n, double &time3d, double &time1d)
+{
+  // time3d receives the wall time of n rounds of the four FFT calls (one fft1c, three fft2c)
+  double time1,time2;
+
+  for (int i = 0; i < 2*nfft_both; i++) work1[i] = 0.0;
+
+  MPI_Barrier(world);
+  time1 = MPI_Wtime();
+
+  for (int i = 0; i < n; i++) {
+    fft1c->compute(work1,work1,1);
+    fft2c->compute(work1,work1,-1);
+    fft2c->compute(work1,work1,-1);
+    fft2c->compute(work1,work1,-1);
+  }
+
+  MPI_Barrier(world);
+  time2 = MPI_Wtime();
+  time3d = time2 - time1;
+
+  MPI_Barrier(world);
+  /*time1 = MPI_Wtime();
+
+  for (int i = 0; i < n; i++) {
+    fft1c->timing1d(work1,nfft_both,1);
+    fft2c->timing1d(work1,nfft_both,-1);
+    fft2c->timing1d(work1,nfft_both,-1);
+    fft2c->timing1d(work1,nfft_both,-1);
+  }
+
+  MPI_Barrier(world);
+  time2 = MPI_Wtime();
+  time1d = time2 - time1;*/
+  time1d = 0.0;  // 1d timing is disabled above; report zero instead of leaving the output unset
+}
+
+void PPPMCuda::slabcorr(int eflag)
+{
+  // buffer length: one entry per 32 atoms, derived from atom->nmax
+  const int nentries = (atom->nmax+31)/32;
+
+  // first use: create the host buffer and its cCudaData wrapper
+  if (!slabbuf) {
+    slabbuf = new ENERGY_FLOAT[nentries];
+    cu_slabbuf = new cCudaData<ENERGY_FLOAT,ENERGY_FLOAT, x> (slabbuf, nentries);
+  }
+
+  // re-create both once the local atom count reaches what dev_size() reports
+  if ((atom->nlocal+31)/32*sizeof(ENERGY_FLOAT) >= cu_slabbuf->dev_size()) {
+    delete [] slabbuf;
+    delete cu_slabbuf;
+    slabbuf = new ENERGY_FLOAT[nentries];
+    cu_slabbuf = new cCudaData<ENERGY_FLOAT,ENERGY_FLOAT, x> (slabbuf, nentries);
+  }
+
+  // local contribution to the global dipole moment, then sum over all procs
+  double dipole_local = cuda_slabcorr_energy(&cuda->shared_data,slabbuf,(ENERGY_FLOAT*) cu_slabbuf->dev_data());
+  double dipole_all;
+  MPI_Allreduce(&dipole_local,&dipole_all,1,MPI_DOUBLE,MPI_SUM,world);
+
+  // energy correction, added only when eflag is set
+  const double e_slabcorr = 2.0*PI*dipole_all*dipole_all/volume;
+  if (eflag) energy += qqrd2e*scale * e_slabcorr;
+
+  // force correction factor handed to cuda_slabcorr_force()
+  double ffact = -4.0*PI*dipole_all/volume;
+  cuda_slabcorr_force(&cuda->shared_data,ffact);
+}
diff --git a/src/USER-CUDA/pppm_cuda.cu b/src/USER-CUDA/pppm_cuda.cu
new file mode 100644
index 0000000000..cabea885d3
--- /dev/null
+++ b/src/USER-CUDA/pppm_cuda.cu
@@ -0,0 +1,579 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#include "cuda_precision.h"
+//#define FFT_CUFFT
+#define MY_PREFIX pppm
+#include "cuda_shared.h"
+#include "cuda_common.h"
+#include "pppm_cuda_cu.h"
+#include "cuda_runtime.h"
+#include <stdio.h>
+
+//#include "crm_cuda_utils.cu"
+#define MIN(a,b) ((a) < (b) ? (a) : (b))
+#define MAX(a,b) ((a) > (b) ? (a) : (b))
+
+  // Constant-memory symbols read by the kernels in pppm_cuda_kernel.cu.
+  // The host wrappers further down fill them with cudaMemcpyToSymbol().
+  __device__ __constant__ FFT_FLOAT* work1;
+  __device__ __constant__ FFT_FLOAT* work2;
+  __device__ __constant__ FFT_FLOAT* work3;
+  __device__ __constant__ PPPM_FLOAT* greensfn;
+  __device__ __constant__ PPPM_FLOAT* gf_b;
+  __device__ __constant__ PPPM_FLOAT* fkx;
+  __device__ __constant__ PPPM_FLOAT* fky;
+  __device__ __constant__ PPPM_FLOAT* fkz;
+  __device__ __constant__ PPPM_FLOAT* vg;
+  __device__ __constant__ int* part2grid;
+  __device__ __constant__ PPPM_FLOAT* density_brick;
+  __device__ __constant__ int* density_brick_int;
+  __device__ __constant__ PPPM_FLOAT density_intScale;
+  __device__ __constant__ PPPM_FLOAT* vdx_brick;
+  __device__ __constant__ PPPM_FLOAT* vdy_brick;
+  __device__ __constant__ PPPM_FLOAT* vdz_brick;
+  __device__ __constant__ PPPM_FLOAT* density_fft;
+  __device__ __constant__ ENERGY_FLOAT* energy;
+  __device__ __constant__ ENERGY_FLOAT* virial;
+  __device__ __constant__ int nxlo_in;
+  __device__ __constant__ int nxhi_in;
+  __device__ __constant__ int nxlo_out;
+  __device__ __constant__ int nxhi_out;
+  __device__ __constant__ int nylo_in;
+  __device__ __constant__ int nyhi_in;
+  __device__ __constant__ int nylo_out;
+  __device__ __constant__ int nyhi_out;
+  __device__ __constant__ int nzlo_in;
+  __device__ __constant__ int nzhi_in;
+  __device__ __constant__ int nzlo_out;
+  __device__ __constant__ int nzhi_out;
+  __device__ __constant__ int nxlo_fft;
+  __device__ __constant__ int nxhi_fft;
+  __device__ __constant__ int nylo_fft;
+  __device__ __constant__ int nyhi_fft;
+  __device__ __constant__ int nzlo_fft;
+  __device__ __constant__ int nzhi_fft;
+  __device__ __constant__ int nx_pppm;
+  __device__ __constant__ int ny_pppm;
+  __device__ __constant__ int nz_pppm;
+  __device__ __constant__ int slabflag;
+  __device__ __constant__ PPPM_FLOAT qqrd2e;
+  __device__ __constant__ int order;
+  //__device__ __constant__ float3 sublo;
+  __device__ __constant__ PPPM_FLOAT* rho_coeff;
+  __device__ __constant__ int nmax;
+  __device__ __constant__ int nlocal;
+  __device__ __constant__ PPPM_FLOAT* debugdata;
+  __device__ __constant__ PPPM_FLOAT delxinv;
+  __device__ __constant__ PPPM_FLOAT delyinv;
+  __device__ __constant__ PPPM_FLOAT delzinv;
+  __device__ __constant__ int nlower;
+  __device__ __constant__ int nupper;
+  __device__ __constant__ PPPM_FLOAT shiftone;
+#include "pppm_cuda_kernel.cu"
+#include "stdio.h"
+// Uploads the device buffer addresses and grid bounds passed in as cu_* to the
+// __constant__ symbols of this file.  Symbols are passed directly instead of
+// by string name, because string lookup was removed from the CUDA runtime.
+void pppm_device_init(void* cu_density_brick, void* cu_vdx_brick, void* cu_vdy_brick, void* cu_vdz_brick, void* cu_density_fft, void* cu_energy, void* cu_virial
+	    ,void* cu_work1,void* cu_work2, void* cu_work3,void* cu_greensfn, void* cu_fkx, void* cu_fky, void* cu_fkz, void* cu_vg
+	    ,int cu_nxlo_in,int cu_nxhi_in,int cu_nylo_in,int cu_nyhi_in,int cu_nzlo_in,int cu_nzhi_in,int cu_nxlo_out,int cu_nxhi_out,int cu_nylo_out,int cu_nyhi_out,int cu_nzlo_out,int cu_nzhi_out,int cu_nx_pppm,int cu_ny_pppm,int cu_nz_pppm
+	    ,int cu_nxlo_fft,int cu_nxhi_fft,int cu_nylo_fft,int cu_nyhi_fft,int cu_nzlo_fft,int cu_nzhi_fft,void* cu_gf_b
+	    ,double cu_qqrd2e, int cu_order, void* cu_rho_coeff,void* cu_debugdata,void* cu_density_brick_int,int cu_slabflag
+	 )
+{
+  CUT_CHECK_ERROR("ERROR-CUDA poisson_init Start");
+  cudaMemcpyToSymbol(density_brick,&cu_density_brick, sizeof(PPPM_FLOAT*));
+  cudaMemcpyToSymbol(density_brick_int,&cu_density_brick_int, sizeof(int*));
+  cudaMemcpyToSymbol(vdx_brick,&cu_vdx_brick, sizeof(PPPM_FLOAT*));
+  cudaMemcpyToSymbol(vdy_brick,&cu_vdy_brick, sizeof(PPPM_FLOAT*));
+  cudaMemcpyToSymbol(vdz_brick,&cu_vdz_brick, sizeof(PPPM_FLOAT*));
+  cudaMemcpyToSymbol(density_fft,&cu_density_fft, sizeof(PPPM_FLOAT*));
+  cudaMemcpyToSymbol(energy,&cu_energy, sizeof(ENERGY_FLOAT*));
+  cudaMemcpyToSymbol(virial,&cu_virial, sizeof(ENERGY_FLOAT*));
+  cudaMemcpyToSymbol(nxlo_in,&cu_nxlo_in, sizeof(int));
+  cudaMemcpyToSymbol(nxhi_in,&cu_nxhi_in, sizeof(int));
+  cudaMemcpyToSymbol(nxlo_out,&cu_nxlo_out, sizeof(int));
+  cudaMemcpyToSymbol(nxhi_out,&cu_nxhi_out, sizeof(int));
+  cudaMemcpyToSymbol(nylo_in,&cu_nylo_in, sizeof(int));
+  cudaMemcpyToSymbol(nyhi_in,&cu_nyhi_in, sizeof(int));
+  cudaMemcpyToSymbol(nylo_out,&cu_nylo_out, sizeof(int));
+  cudaMemcpyToSymbol(nyhi_out,&cu_nyhi_out, sizeof(int));
+  cudaMemcpyToSymbol(nzlo_in,&cu_nzlo_in, sizeof(int));
+  cudaMemcpyToSymbol(nzhi_in,&cu_nzhi_in, sizeof(int));
+  cudaMemcpyToSymbol(nzlo_out,&cu_nzlo_out, sizeof(int));
+  cudaMemcpyToSymbol(nzhi_out,&cu_nzhi_out, sizeof(int));
+  cudaMemcpyToSymbol(nxlo_fft,&cu_nxlo_fft, sizeof(int));
+  cudaMemcpyToSymbol(nxhi_fft,&cu_nxhi_fft, sizeof(int));
+  cudaMemcpyToSymbol(nylo_fft,&cu_nylo_fft, sizeof(int));
+  cudaMemcpyToSymbol(nyhi_fft,&cu_nyhi_fft, sizeof(int));
+  cudaMemcpyToSymbol(nzlo_fft,&cu_nzlo_fft, sizeof(int));
+  cudaMemcpyToSymbol(nzhi_fft,&cu_nzhi_fft, sizeof(int));
+  cudaMemcpyToSymbol(slabflag,&cu_slabflag, sizeof(int));
+  cudaMemcpyToSymbol(nx_pppm,&cu_nx_pppm, sizeof(int));
+  cudaMemcpyToSymbol(ny_pppm,&cu_ny_pppm, sizeof(int));
+  cudaMemcpyToSymbol(nz_pppm,&cu_nz_pppm, sizeof(int));
+  cudaMemcpyToSymbol(work1,&cu_work1, sizeof(FFT_FLOAT*));
+  cudaMemcpyToSymbol(work2,&cu_work2, sizeof(FFT_FLOAT*));
+  cudaMemcpyToSymbol(work3,&cu_work3, sizeof(FFT_FLOAT*));
+  cudaMemcpyToSymbol(greensfn,&cu_greensfn, sizeof(PPPM_FLOAT*));
+  cudaMemcpyToSymbol(gf_b,&cu_gf_b, sizeof(PPPM_FLOAT*));
+  cudaMemcpyToSymbol(fkx,&cu_fkx, sizeof(PPPM_FLOAT*));
+  cudaMemcpyToSymbol(fky,&cu_fky, sizeof(PPPM_FLOAT*));
+  cudaMemcpyToSymbol(fkz,&cu_fkz, sizeof(PPPM_FLOAT*));
+  cudaMemcpyToSymbol(vg,&cu_vg, sizeof(PPPM_FLOAT*));
+  // cu_qqrd2e arrives as double; convert to PPPM_FLOAT before the upload
+  PPPM_FLOAT cu_qqrd2e_a=cu_qqrd2e;
+  cudaMemcpyToSymbol(qqrd2e,&cu_qqrd2e_a, sizeof(PPPM_FLOAT));
+  cudaMemcpyToSymbol(order,&cu_order, sizeof(int));
+  cudaMemcpyToSymbol(rho_coeff,&cu_rho_coeff, sizeof(PPPM_FLOAT*));
+  cudaMemcpyToSymbol(debugdata,&cu_debugdata, sizeof(PPPM_FLOAT*));
+  CUT_CHECK_ERROR("ERROR-CUDA poisson_init");
+/*if(sizeof(CUDA_FLOAT)==sizeof(float)) printf("PPPMCuda Kernel: Using single precision\n");
+
+#ifdef PPPM_PRECISION
+if(sizeof(PPPM_FLOAT)==sizeof(float)) printf("PPPMCuda Kernel: Using single precision for pppm core\n");
+if(sizeof(PPPM_FLOAT)==sizeof(double)) printf("PPPMCuda Kernel: Using double precision for pppm core\n");
+#endif
+#ifdef ENERGY_PRECISION
+if(sizeof(ENERGY_FLOAT)==sizeof(float)) printf("PPPMCuda Kernel: Using single precision for energy\n");
+if(sizeof(ENERGY_FLOAT)==sizeof(double)) printf("PPPMCuda Kernel: Using double precision for energy\n");
+#endif
+#ifdef ENERGY_PRECISION
+if(sizeof(FFT_FLOAT)==sizeof(float)) printf("PPPMCuda Kernel: Using single precision for fft\n");
+if(sizeof(FFT_FLOAT)==sizeof(double)) printf("PPPMCuda Kernel: Using double precision for fft\n");
+#endif
+#ifdef X_PRECISION
+if(sizeof(X_FLOAT)==sizeof(float)) printf("PPPMCuda Kernel: Using single precision for positions\n");
+if(sizeof(X_FLOAT)==sizeof(double)) printf("PPPMCuda Kernel: Using double precision for positions\n");
+#endif
+#ifdef F_PRECISION
+if(sizeof(F_FLOAT)==sizeof(float)) printf("PPPMCuda Kernel: Using single precision for forces\n");
+if(sizeof(F_FLOAT)==sizeof(double)) printf("PPPMCuda Kernel: Using double precision for forces\n");
+#endif*/
+}
+
+void pppm_device_init_setup(cuda_shared_data* sdata,PPPM_FLOAT cu_shiftone,PPPM_FLOAT cu_delxinv,PPPM_FLOAT cu_delyinv,PPPM_FLOAT cu_delzinv,int cu_nlower,int cu_nupper)
+{ // uploads the cu_* values and the sdata->domain bounds; symbols are passed directly, not by string name
+  cudaMemcpyToSymbol(delxinv,&cu_delxinv, sizeof(PPPM_FLOAT));
+  cudaMemcpyToSymbol(delyinv,&cu_delyinv, sizeof(PPPM_FLOAT));
+  cudaMemcpyToSymbol(delzinv,&cu_delzinv, sizeof(PPPM_FLOAT));
+  cudaMemcpyToSymbol(shiftone,&cu_shiftone, sizeof(PPPM_FLOAT));
+  cudaMemcpyToSymbol(nlower,&cu_nlower, sizeof(int));
+  cudaMemcpyToSymbol(nupper,&cu_nupper, sizeof(int));
+  cudaMemcpyToSymbol(MY_CONST(sublo)   , sdata->domain.sublo, 3*sizeof(X_FLOAT));
+  cudaMemcpyToSymbol(MY_CONST(subhi)   , sdata->domain.subhi, 3*sizeof(X_FLOAT));
+  cudaMemcpyToSymbol(MY_CONST(boxlo)   , sdata->domain.boxlo, 3*sizeof(X_FLOAT));
+  CUT_CHECK_ERROR("ERROR-CUDA pppm_init_setup");
+}
+
+void pppm_device_update(cuda_shared_data* sdata,void* cu_part2grid, int nlocala,int nmaxa)
+{
+  // Refreshes the per-atom device pointers taken from sdata->atom and the
+  // atom counts.  Symbols are passed directly rather than by string name.
+  cudaMemcpyToSymbol(part2grid,&cu_part2grid, sizeof(int*));
+  cudaMemcpyToSymbol(MY_CONST(x)   , & sdata->atom.x   .dev_data, sizeof(X_FLOAT*));
+  cudaMemcpyToSymbol(MY_CONST(f)   , & sdata->atom.f   .dev_data, sizeof(F_FLOAT*));
+  cudaMemcpyToSymbol(MY_CONST(q)   , & sdata->atom.q   .dev_data, sizeof(F_FLOAT*));
+  cudaMemcpyToSymbol(MY_CONST(tag)   , & sdata->atom.tag   .dev_data, sizeof(int*));
+  cudaMemcpyToSymbol(nlocal   , &nlocala, sizeof(int));
+  cudaMemcpyToSymbol(nmax   , &nmaxa, sizeof(int));
+  CUT_CHECK_ERROR("ERROR-CUDA pppm_device_update");
+}
+
+void pppm_update_nlocal(int nlocala)
+{ // uploads nlocala to the nlocal constant; the symbol is passed directly, not by string name
+ cudaMemcpyToSymbol(nlocal   , &nlocala, sizeof(int));
+ CUT_CHECK_ERROR("ERROR-CUDA update_nlocal b");
+}
+
+
+void Cuda_PPPM_Setup_fkxyz_vg(int nx_pppma,int ny_pppma,int nz_pppma,PPPM_FLOAT unitkx,PPPM_FLOAT unitky,PPPM_FLOAT unitkz,PPPM_FLOAT g_ewald)
+{
+  // Runs setup_fkxyz_vg with an nz_pppma x ny_pppma grid of blocks and
+  // nx_pppma threads per block, then waits for it to finish.
+  const dim3 blocks(nz_pppma,ny_pppma,1);
+  const dim3 threadsPerBlock(nx_pppma,1,1);
+
+  // no dynamic shared memory is requested for this launch
+  setup_fkxyz_vg<<<blocks,threadsPerBlock,0>>>(unitkx,unitky,unitkz,g_ewald);
+
+  // block the host until the kernel has finished, then check for
+  // errors
+  cudaThreadSynchronize();
+  CUT_CHECK_ERROR("ERROR-CUDA Cuda_PPPM_Setup_fkxyz_vg ");
+}
+
+void Cuda_PPPM_setup_greensfn(int nx_pppma,int ny_pppma,int nz_pppma,PPPM_FLOAT unitkx,PPPM_FLOAT unitky,PPPM_FLOAT unitkz,PPPM_FLOAT g_ewald,
+int nbx,int nby,int nbz,PPPM_FLOAT xprd,PPPM_FLOAT yprd,PPPM_FLOAT zprd_slab)
+{
+  // Runs setup_greensfn with an nz_pppma x ny_pppma grid of blocks and
+  // nx_pppma threads per block, then waits for it to finish.
+  const dim3 blocks(nz_pppma,ny_pppma,1);
+  const dim3 threadsPerBlock(nx_pppma,1,1);
+
+  setup_greensfn<<<blocks,threadsPerBlock,0>>>(unitkx,unitky,unitkz,g_ewald,nbx,nby,nbz,xprd,yprd, zprd_slab);
+
+  // block the host until the kernel has finished, then check for
+  // errors
+  cudaThreadSynchronize();
+  CUT_CHECK_ERROR("ERROR-CUDA Cuda_PPPM_Setup_greensfn ");
+}
+
+void poisson_scale(int nx_pppma,int ny_pppma,int nz_pppma)
+{
+  // Launches poisson_scale_kernel with an nz_pppma x ny_pppma grid of
+  // blocks and nx_pppma threads per block.  No explicit
+  // cudaThreadSynchronize() is issued here.
+  const dim3 blocks(nz_pppma,ny_pppma,1);
+  const dim3 threadsPerBlock(nx_pppma,1,1);
+
+  poisson_scale_kernel<<<blocks,threadsPerBlock,0>>>();
+
+  // error check directly after the launch
+  CUT_CHECK_ERROR("ERROR-CUDA poisson_scale ");
+
+}
+
+void poisson_xgrad(int nx_pppma,int ny_pppma,int nz_pppma)
+{
+  // Launches poisson_xgrad_kernel with an nz_pppma x ny_pppma grid of
+  // blocks and nx_pppma threads per block.  No explicit
+  // cudaThreadSynchronize() is issued here.
+  const dim3 blocks(nz_pppma,ny_pppma,1);
+  const dim3 threadsPerBlock(nx_pppma,1,1);
+
+  poisson_xgrad_kernel<<<blocks,threadsPerBlock,0>>>();
+
+  // error check directly after the launch
+  CUT_CHECK_ERROR("ERROR-CUDA poisson_xgrad ");
+}
+
+void poisson_ygrad(int nx_pppma,int ny_pppma,int nz_pppma)
+{
+  // Launches poisson_ygrad_kernel with an nz_pppma x ny_pppma grid of
+  // blocks and nx_pppma threads per block.  No explicit
+  // cudaThreadSynchronize() is issued here.
+  const dim3 blocks(nz_pppma,ny_pppma,1);
+  const dim3 threadsPerBlock(nx_pppma,1,1);
+
+  poisson_ygrad_kernel<<<blocks,threadsPerBlock,0>>>();
+
+  // error check directly after the launch
+  CUT_CHECK_ERROR("ERROR-CUDA poisson_ygrad ");
+}
+
+void poisson_zgrad(int nx_pppma,int ny_pppma,int nz_pppma)
+{
+  // Launches poisson_zgrad_kernel with an nz_pppma x ny_pppma grid of
+  // blocks and nx_pppma threads per block.  No explicit
+  // cudaThreadSynchronize() is issued here.
+  const dim3 blocks(nz_pppma,ny_pppma,1);
+  const dim3 threadsPerBlock(nx_pppma,1,1);
+
+  poisson_zgrad_kernel<<<blocks,threadsPerBlock,0>>>();
+
+  // error check directly after the launch
+  CUT_CHECK_ERROR("ERROR-CUDA poisson_zgrad ");
+}
+
+void poisson_vdx_brick(int ihi,int ilo,int jhi,int jlo,int khi,int klo,int nx_pppma,int ny_pppma,int nz_pppma)
+{
+  // Launches poisson_vdx_brick_kernel with a (khi-klo+1) x (jhi-jlo+1) grid
+  // of blocks and ihi-ilo+1 threads per block; ilo, jlo and klo are passed
+  // to the kernel.  nx_pppma, ny_pppma and nz_pppma are not used.
+  const dim3 blocks(khi-klo+1,jhi-jlo+1,1);
+  const dim3 threadsPerBlock(ihi-ilo+1,1,1);
+
+  poisson_vdx_brick_kernel<<<blocks,threadsPerBlock,0>>>(ilo,jlo,klo);
+
+  // Synchronize before checking, so that the kernel has finished when the
+  // error state is read; this matches the order used by the other
+  // synchronizing wrappers in this file.
+  cudaThreadSynchronize();
+  CUT_CHECK_ERROR("ERROR-CUDA poisson_vdxbrick ");
+}
+
+void poisson_vdy_brick(int ihi,int ilo,int jhi,int jlo,int khi,int klo,int nx_pppm,int ny_pppm,int nz_pppm)
+{
+  // Launches poisson_vdy_brick_kernel with a (khi-klo+1) x (jhi-jlo+1) grid
+  // of blocks and ihi-ilo+1 threads per block; the n*_pppm arguments are unused.
+  const dim3 blocks(khi-klo+1,jhi-jlo+1,1);
+  const dim3 threadsPerBlock(ihi-ilo+1,1,1);
+
+  poisson_vdy_brick_kernel<<<blocks,threadsPerBlock,0>>>(ilo,jlo,klo);
+
+  // Synchronize before checking, so that the kernel has finished when the
+  // error state is read (same order as the other synchronizing wrappers).
+  cudaThreadSynchronize();
+  CUT_CHECK_ERROR("ERROR-CUDA poisson_vdybrick ");
+}
+
+void poisson_vdz_brick(int ihi,int ilo,int jhi,int jlo,int khi,int klo,int nx_pppm,int ny_pppm,int nz_pppm)
+{
+  // Launches poisson_vdz_brick_kernel with a (khi-klo+1) x (jhi-jlo+1) grid
+  // of blocks and ihi-ilo+1 threads per block; the n*_pppm arguments are unused.
+  const dim3 blocks(khi-klo+1,jhi-jlo+1,1);
+  const dim3 threadsPerBlock(ihi-ilo+1,1,1);
+
+  poisson_vdz_brick_kernel<<<blocks,threadsPerBlock,0>>>(ilo,jlo,klo);
+
+  // Synchronize before checking, so that the kernel has finished when the
+  // error state is read (same order as the other synchronizing wrappers).
+  cudaThreadSynchronize();
+  CUT_CHECK_ERROR("ERROR-CUDA poisson_vdzbrick ");
+}
+
+
+void poisson_energy(int nxlo_fft,int nxhi_fft,int nylo_fft,int nyhi_fft,int nzlo_fft,int nzhi_fft,int vflag)
+{
+  // Launches poisson_energy_kernel over the inclusive FFT index ranges given
+  // by the arguments: blocks span the z and y ranges, threads span the x
+  // range.  The host waits for the kernel before returning.
+  CUT_CHECK_ERROR("ERROR-CUDA poisson_energy start ");
+
+  const dim3 blocks(nzhi_fft-nzlo_fft+1,nyhi_fft-nylo_fft+1,1);
+  const dim3 threadsPerBlock(nxhi_fft-nxlo_fft+1,1,1);
+
+  // dynamic shared memory: one ENERGY_FLOAT per thread of a block
+  poisson_energy_kernel<<<blocks,threadsPerBlock,threadsPerBlock.x*sizeof(ENERGY_FLOAT)>>>(nxlo_fft,nylo_fft,nzlo_fft,vflag);
+
+  // wait for completion, then check for errors
+  cudaThreadSynchronize();
+  CUT_CHECK_ERROR("ERROR-CUDA poisson_energy end ");
+}
+
+ENERGY_FLOAT sum_energy(void* cu_virial,void* cu_energy,int nx_pppma,int ny_pppma,int nz_pppma,int vflag,ENERGY_FLOAT* cpu_virial)
+{
+  // Runs the two summation kernels back to back, then copies the energy
+  // (and, when vflag is set, six virial values) from the device.
+  // Returns the energy value read from cu_energy.
+
+  // stage 1: nz_pppma blocks of ny_pppma threads, one ENERGY_FLOAT of
+  // dynamic shared memory per thread
+  const dim3 blocks1(nz_pppma,1,1);
+  const dim3 threads1(ny_pppma,1,1);
+  sum_energy_kernel1<<<blocks1,threads1,ny_pppma*sizeof(ENERGY_FLOAT)>>>(vflag);
+  cudaThreadSynchronize();
+  CUT_CHECK_ERROR("ERROR-CUDA sumenergy_kernel1 ");
+
+  // stage 2: a single block of nz_pppma threads, again one ENERGY_FLOAT
+  // of dynamic shared memory per thread
+  const dim3 blocks2(1,1,1);
+  const dim3 threads2(nz_pppma,1,1);
+  sum_energy_kernel2<<<blocks2,threads2,nz_pppma*sizeof(ENERGY_FLOAT)>>>(vflag);
+  cudaThreadSynchronize();
+  CUT_CHECK_ERROR("ERROR-CUDA sumenergy_kernel2 ");
+
+  // fetch the results into host memory
+  ENERGY_FLOAT energy_on_host=0;
+  cudaMemcpy((void*) (&energy_on_host), cu_energy, sizeof(ENERGY_FLOAT),cudaMemcpyDeviceToHost);
+  if(vflag)
+  {
+    cudaMemcpy((void*) cpu_virial, (void*) cu_virial, 6*sizeof(ENERGY_FLOAT),cudaMemcpyDeviceToHost);
+  }
+  CUT_CHECK_ERROR("ERROR-CUDA sumenergy_memcopy");
+
+  return energy_on_host;
+}
+
+void cuda_make_rho(cuda_shared_data* sdata,void* flag,PPPM_FLOAT* cu_density_intScale,int ihi,int ilo,int jhi,int jlo,int khi,int klo,void* cu_density_brick,void* cu_density_brick_int)
+{
+  // Repeats make_rho_kernel with an adjusted *cu_density_intScale until the
+  // flags it reports satisfy flag[0]==0 and flag[1]!=0, then runs
+  // scale_rho_kernel over the (khi-klo+1) x (jhi-jlo+1) x (ihi-ilo+1) brick.
+  CUT_CHECK_ERROR("cuda_make_rho begin");
+  dim3 grid,threads;
+  int cpu_flag[3];
+  grid.x=(sdata->atom.nlocal+31)/32;
+  grid.y=1;
+  grid.z=1;
+  threads.x=32;
+  threads.y=1;
+  threads.z=1;
+  int sharedmemsize=(32+32*(sdata->pppm.nupper-sdata->pppm.nlower+1)+sdata->pppm.order*(sdata->pppm.order/2-(1-sdata->pppm.order)/2+1))*sizeof(PPPM_FLOAT);
+  do
+  { 
+    cpu_flag[0]=0;
+    cpu_flag[1]=0;
+    cpu_flag[2]=0;
+    // density_intScale holds a PPPM_FLOAT value, so upload sizeof(PPPM_FLOAT)
+    // bytes; sizeof(PPPM_FLOAT*) overruns it when a pointer is wider than PPPM_FLOAT
+    cudaMemcpyToSymbol(density_intScale,cu_density_intScale,sizeof(PPPM_FLOAT));
+    CUT_CHECK_ERROR("ERROR-CUDA make_rho pre Z");
+    cudaMemset(flag,0,3*sizeof(int));
+    CUT_CHECK_ERROR("ERROR-CUDA make_rho pre A");
+    cudaMemset(cu_density_brick,0,(khi-klo+1)*(jhi-jlo+1)*(ihi-ilo+1)*sizeof(PPPM_FLOAT));
+    CUT_CHECK_ERROR("ERROR-CUDA make_rho pre B");
+    cudaMemset(cu_density_brick_int,0,(khi-klo+1)*(jhi-jlo+1)*(ihi-ilo+1)*sizeof(int));
+    CUT_CHECK_ERROR("ERROR-CUDA make_rho pre C");
+     make_rho_kernel<<<grid,threads,sharedmemsize>>>((int*) flag,32/(sdata->pppm.nupper-sdata->pppm.nlower+1));
+    cudaThreadSynchronize();
+    CUT_CHECK_ERROR("ERROR-CUDA make_rho A");
+    cudaMemcpy((void*) &cpu_flag, flag, 3*sizeof(int),cudaMemcpyDeviceToHost);
+    // flag[0] set: halve the scale for the next attempt
+    if(cpu_flag[0]!=0) {(*cu_density_intScale)/=2; MYDBG(printf("PPPM_Cuda::cuda_make_rho: Decrease cu_density_intScale to: %e\n",*cu_density_intScale);)}
+    // flag[0] and flag[1] both clear: double the scale for the next attempt
+    if((cpu_flag[0]==0)&&(cpu_flag[1]==0)) {(*cu_density_intScale)*=2; MYDBG(printf("PPPM_Cuda::cuda_make_rho: Increase cu_density_intScale to: %e\n",*cu_density_intScale);)}
+    CUT_CHECK_ERROR("ERROR-CUDA make_rho B");
+  } while((cpu_flag[0]!=0)||(cpu_flag[1]==0));
+  // second launch reuses grid/threads, resized to the brick extents
+  grid.x=khi-klo+1;
+  grid.y=jhi-jlo+1;
+  threads.x=ihi-ilo+1;
+  scale_rho_kernel<<<grid,threads,0>>>();
+  cudaThreadSynchronize();
+  CUT_CHECK_ERROR("ERROR-CUDA make_rho_scale");
+}
+
+
+int cuda_particle_map(cuda_shared_data* sdata,void* flag)
+{
+  // Runs particle_map_kernel with 32 threads per block and enough blocks
+  // to cover sdata->atom.nlocal, then returns the first int stored at flag.
+  const dim3 blocks((sdata->atom.nlocal+31)/32,1,1);
+  const dim3 threadsPerBlock(32,1,1);
+
+  CUT_CHECK_ERROR("ERROR-CUDA particla_map ..pre");
+  particle_map_kernel<<<blocks,threadsPerBlock,0>>>((int*) flag);
+  cudaThreadSynchronize();
+  CUT_CHECK_ERROR("ERROR-CUDA particla_map a");
+
+  // read the flag back from the device
+  int host_flag;
+  cudaMemcpy((void*) &host_flag, flag, sizeof(int),cudaMemcpyDeviceToHost);
+  CUT_CHECK_ERROR("ERROR-CUDA particla_map b");
+  return host_flag;
+}
+
+
+void cuda_fieldforce(cuda_shared_data* sdata,void* flag)
+{
+  // Runs fieldforce_kernel with 32 threads per block and enough blocks to
+  // cover sdata->atom.nlocal, then waits for it to finish.
+  const int nrange=sdata->pppm.nupper-sdata->pppm.nlower+1;
+  const dim3 blocks((sdata->atom.nlocal+31)/32,1,1);
+  const dim3 threadsPerBlock(32,1,1);
+
+  // dynamic shared memory request, counted in PPPM_FLOAT elements
+  int sharedmemsize=(32+3*32*nrange+sdata->pppm.order*(sdata->pppm.order/2-(1-sdata->pppm.order)/2+1))*sizeof(PPPM_FLOAT);
+
+  fieldforce_kernel<<<blocks,threadsPerBlock,sharedmemsize>>>(nrange,32/nrange,(int*) flag);
+  cudaThreadSynchronize();
+  CUT_CHECK_ERROR("ERROR-CUDA fieldforce");
+}
+
+double cuda_slabcorr_energy(cuda_shared_data* sdata, ENERGY_FLOAT* buf, ENERGY_FLOAT* dev_buf)
+{
+  // Runs slabcorr_energy_kernel with 32 threads per block, copies one
+  // ENERGY_FLOAT per block from dev_buf into buf and returns the sum of
+  // those values.
+  const dim3 blocks((sdata->atom.nlocal+31)/32,1,1);
+  const dim3 threadsPerBlock(32,1,1);
+
+  slabcorr_energy_kernel<<<blocks,threadsPerBlock,32*sizeof(ENERGY_FLOAT)>>>(dev_buf);
+  cudaThreadSynchronize();
+
+  // bring the per-block values to the host
+  cudaMemcpy((void*) buf, dev_buf, blocks.x*sizeof(ENERGY_FLOAT),cudaMemcpyDeviceToHost);
+
+  double dipole_sum=0.0;
+  for(unsigned int b=0;b<blocks.x;++b) dipole_sum+=buf[b];
+
+  return dipole_sum;
+}
+
+void cuda_slabcorr_force(cuda_shared_data* sdata, F_FLOAT ffact)
+{
+  // Runs slabcorr_force_kernel(ffact) with 32 threads per block and
+  // enough blocks to cover sdata->atom.nlocal, then waits for it.
+  // No error check is performed here.
+  const dim3 blocks((sdata->atom.nlocal+31)/32,1,1);
+  const dim3 threadsPerBlock(32,1,1);
+
+  slabcorr_force_kernel<<<blocks,threadsPerBlock>>>(ffact);
+  // block the host until the kernel has finished
+  cudaThreadSynchronize();
+}
+
+void sum_virial(double* host_virial)
+{ // empty stub: host_virial is neither read nor written
+}
+
+void pppm_initfftdata(cuda_shared_data* sdata,PPPM_FLOAT* in,FFT_FLOAT* out)
+{
+  // Runs the eight initfftdata_* kernels on (in,out): one for the core
+  // region and one for each combination of x/y/z rims.
+  // Every launch is followed by a host-side synchronization.
+
+  // extents of the "in" index ranges, minus one
+  const int nslow=sdata->pppm.nzhi_in-sdata->pppm.nzlo_in;
+  const int nmid=sdata->pppm.nyhi_in-sdata->pppm.nylo_in;
+  const int nfast=sdata->pppm.nxhi_in-sdata->pppm.nxlo_in;
+
+  // rim size per dimension: the larger of the low-side and high-side
+  // differences between the "in" and "out" bounds
+  const int nrimz=MAX(sdata->pppm.nzlo_in-sdata->pppm.nzlo_out,sdata->pppm.nzhi_out-sdata->pppm.nzhi_in);
+  const int nrimy=MAX(sdata->pppm.nylo_in-sdata->pppm.nylo_out,sdata->pppm.nyhi_out-sdata->pppm.nyhi_in);
+  const int nrimx=MAX(sdata->pppm.nxlo_in-sdata->pppm.nxlo_out,sdata->pppm.nxhi_out-sdata->pppm.nxhi_in);
+
+  // the two thread-block shapes used below; grids are built per launch
+  const dim3 threadsCore(nfast+1,1,1);
+  const dim3 threadsRimX(nrimx,1,1);
+
+  cudaThreadSynchronize();
+
+  // core region
+  initfftdata_core_kernel<<<dim3(nslow+1,nmid+1,1),threadsCore,0>>>(in,out);
+  cudaThreadSynchronize();
+
+  // z rim
+  initfftdata_z_kernel<<<dim3(nrimz,nmid+1,1),threadsCore,0>>>(in,out);
+  cudaThreadSynchronize();
+
+  // y rim
+  initfftdata_y_kernel<<<dim3(nslow+1,nrimy,1),threadsCore,0>>>(in,out);
+  cudaThreadSynchronize();
+
+  // x rim
+  initfftdata_x_kernel<<<dim3(nslow+1,nmid+1,1),threadsRimX,0>>>(in,out);
+  cudaThreadSynchronize();
+
+  // y and z rims combined
+  initfftdata_yz_kernel<<<dim3(nrimz,nrimy,1),threadsCore,0>>>(in,out);
+  cudaThreadSynchronize();
+
+  // x and z rims combined
+  initfftdata_xz_kernel<<<dim3(nrimz,nmid+1,1),threadsRimX,0>>>(in,out);
+  cudaThreadSynchronize();
+
+  // x and y rims combined
+  initfftdata_xy_kernel<<<dim3(nslow+1,nrimy,1),threadsRimX,0>>>(in,out);
+  cudaThreadSynchronize();
+
+  // x, y and z rims combined
+  initfftdata_xyz_kernel<<<dim3(nrimz,nrimy,1),threadsRimX,0>>>(in,out);
+  cudaThreadSynchronize();
+  CUT_CHECK_ERROR("ERROR-CUDA initfftdata_kernel");
+}
+
+
diff --git a/src/USER-CUDA/pppm_cuda.h b/src/USER-CUDA/pppm_cuda.h
new file mode 100644
index 0000000000..0becd762c0
--- /dev/null
+++ b/src/USER-CUDA/pppm_cuda.h
@@ -0,0 +1,114 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#ifdef KSPACE_CLASS
+
+KSpaceStyle(pppm/cuda,PPPMCuda)
+
+#else
+
+#ifndef LMP_PPPM_CUDA_H
+#define LMP_PPPM_CUDA_H
+
+#include "pppm.h"
+#include "cuda_data.h"
+#include "cuda_precision.h"
+
+namespace LAMMPS_NS {
+
+// PPPM subclass registered above as the k-space style pppm/cuda.
+class PPPMCuda : public PPPM {
+ public:
+  PPPMCuda(class LAMMPS *, int, char **);
+  ~PPPMCuda();
+  void init();
+  void setup();
+  void compute(int, int);
+  void timing(int, double &, double &);
+
+  double poissontime;
+ protected:
+  class Cuda *cuda;
+  class FFT3dCuda *fft1c,*fft2c;
+  double* work3;
+  // cCudaData wrappers (presumably declared in cuda_data.h, included above)
+  cCudaData<double     , FFT_FLOAT      , x >* cu_work1;
+  cCudaData<double     , FFT_FLOAT      , x >* cu_work2;
+  cCudaData<double     , FFT_FLOAT      , x >* cu_work3;
+  cCudaData<double     , PPPM_FLOAT     , x >* cu_greensfn;
+  cCudaData<double     , PPPM_FLOAT     , x >* cu_gf_b;
+  cCudaData<double     , PPPM_FLOAT     , x >* cu_fkx;
+  cCudaData<double     , PPPM_FLOAT     , x >* cu_fky;
+  cCudaData<double     , PPPM_FLOAT     , x >* cu_fkz;
+  cCudaData<double     , PPPM_FLOAT     , xy>* cu_vg;
+  cCudaData<double     , PPPM_FLOAT     , x >* cu_density_brick;
+  cCudaData<int        , int     		, x >* cu_density_brick_int;
+  cCudaData<double     , PPPM_FLOAT     , x >* cu_vdx_brick;
+  cCudaData<double     , PPPM_FLOAT     , x >* cu_vdy_brick;
+  cCudaData<double     , PPPM_FLOAT     , x >* cu_vdz_brick;
+  cCudaData<double     , PPPM_FLOAT     , x >* cu_density_fft;
+  cCudaData<double     , ENERGY_FLOAT   , x >* cu_energy;
+  cCudaData<double     , ENERGY_FLOAT   , x >* cu_virial;
+  cCudaData<double     , X_FLOAT   		, yx>* cu_x;
+  cCudaData<double     , V_FLOAT   		, yx>* cu_v;
+  cCudaData<double     , F_FLOAT   		, yx>* cu_f;	
+  cCudaData<double     , F_FLOAT   		, yx>* cu_q;	
+  cCudaData<int        , int   			, yx>* cu_part2grid;	
+  cCudaData<double	   , PPPM_FLOAT		, x >* cu_rho_coeff;
+  cCudaData<PPPM_FLOAT , PPPM_FLOAT		, x >* cu_debugdata;
+  cCudaData<int        , int   			, x >* cu_flag;	
+  cCudaData<int        , int   			, x >* cu_pppm_grid_n;	
+  cCudaData<int        , int   			, x >* cu_pppm_grid_ids;	
+  // host buffer and its device wrapper, allocated and used by slabcorr()
+  ENERGY_FLOAT* slabbuf;
+  cCudaData<ENERGY_FLOAT, ENERGY_FLOAT, x >* cu_slabbuf;
+  
+  int*** density_brick_int;
+  PPPM_FLOAT density_intScale;
+  int pppm_grid_nmax;
+  int* pppm2partgrid;
+  int* pppm_grid; 
+  PPPM_FLOAT* debugdata;
+  bool firstpass;
+  
+  void set_grid();
+  void make_power_of_prime(int* n);
+  void allocate();
+  void deallocate();
+  
+  virtual void particle_map();
+  virtual void make_rho();
+  void poisson(int, int);
+  virtual void fieldforce();
+  virtual void slabcorr(int);
+  double*** vdx_brick_tmp;
+  int old_nmax;
+  int global_flag;
+  dev_array* adev_data_array;
+  char precisionmodify;
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-CUDA/verlet_cuda.cpp b/src/USER-CUDA/verlet_cuda.cpp
index 259ae8815d..0183368407 100644
--- a/src/USER-CUDA/verlet_cuda.cpp
+++ b/src/USER-CUDA/verlet_cuda.cpp
@@ -61,6 +61,8 @@ using namespace LAMMPS_NS;
 
 VerletCuda::VerletCuda(LAMMPS *lmp, int narg, char **arg) : Verlet(lmp, narg, arg) {	
   cuda = lmp->cuda;
+   if(cuda == NULL)  // stop with an error when lmp->cuda is NULL
+        error->all("You cannot use a /cuda class, without activating 'cuda' acceleration. Use no '-a' command line argument, or '-a cuda'.");
 
 	modify_cuda=(ModifyCuda*) modify;
 }