diff --git a/lib/cuda/Makefile.common b/lib/cuda/Makefile.common index 1952fad37b..d98f75f3b5 100644 --- a/lib/cuda/Makefile.common +++ b/lib/cuda/Makefile.common @@ -14,7 +14,8 @@ SHELL = /bin/sh # System-specific settings -CUDA_INSTALL_PATH = /usr/local/cuda +#CUDA_INSTALL_PATH = /usr/local/cuda +CUDA_INSTALL_PATH = /home/crtrott/lib/cuda # e.g. in Gentoo # CUDA_INSTALL_PATH = /opt/cuda @@ -97,8 +98,22 @@ else NVCC_FLAGS += -ftz=true -prec-div=false -prec-sqrt=false SMVERSIONFLAGS := -arch sm_21 else - CUDA_FLAGS += -DCUDA_ARCH=99 - SMVERSIONFLAGS := -arch sm_13 + ifeq ($(strip $(arch)), 30) + CUDA_FLAGS += -DCUDA_ARCH=20 + #NVCC_FLAGS += -ftz=false -prec-div=true -prec-sqrt=true + NVCC_FLAGS += -ftz=true -prec-div=false -prec-sqrt=false + SMVERSIONFLAGS := -arch sm_30 + else + ifeq ($(strip $(arch)), 35) + CUDA_FLAGS += -DCUDA_ARCH=20 + #NVCC_FLAGS += -ftz=false -prec-div=true -prec-sqrt=true + NVCC_FLAGS += -ftz=true -prec-div=false -prec-sqrt=false + SMVERSIONFLAGS := -arch sm_35 + else + CUDA_FLAGS += -DCUDA_ARCH=99 + SMVERSIONFLAGS := -arch sm_13 + endif + endif endif endif endif diff --git a/lib/cuda/crm_cuda_utils.cu b/lib/cuda/crm_cuda_utils.cu index be66bb417a..6337d0d015 100644 --- a/lib/cuda/crm_cuda_utils.cu +++ b/lib/cuda/crm_cuda_utils.cu @@ -1,22 +1,22 @@ /* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator Original Version: http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov + Steve Plimpton, sjplimp@sandia.gov - See the README file in the top-level LAMMPS directory. + See the README file in the top-level LAMMPS directory. - ----------------------------------------------------------------------- + ----------------------------------------------------------------------- USER-CUDA Package and associated modifications: - https://sourceforge.net/projects/lammpscuda/ + https://sourceforge.net/projects/lammpscuda/ Christian Trott, christian.trott@tu-ilmenau.de Lars Winterfeld, lars.winterfeld@tu-ilmenau.de - Theoretical Physics II, University of Technology Ilmenau, Germany + Theoretical Physics II, University of Technology Ilmenau, Germany - See the README file in the USER-CUDA directory. + See the README file in the USER-CUDA directory. This software is distributed under the GNU General Public License. ------------------------------------------------------------------------- */ @@ -28,36 +28,43 @@ #define MIN(a,b) ((a) < (b) ? (a) : (b)) #define MAX(a,b) ((a) > (b) ? (a) : (b)) -inline int3 getgrid(int n,int shared_per_thread=0,int threadsmax=256, bool p2=false) +inline int3 getgrid(int n, int shared_per_thread = 0, int threadsmax = 256, bool p2 = false) { int3 gridparams; - int sharedsize=16000; - if(shared_per_thread>0) threadsmax= sharedsize/shared_per_thread 0) threadsmax = sharedsize / shared_per_thread < threadsmax ? 
sharedsize / shared_per_thread : threadsmax; + + if((n < 60 * 32) || (threadsmax < 64)) + gridparams.z = 32; + else if((n < 60 * 64) || (threadsmax < 128)) + gridparams.z = 64; + else if((n < 60 * 128) || (threadsmax < 256)) + gridparams.z = 128; + else if((n < 60 * 256) || (threadsmax < 512)) + gridparams.z = 256; + else gridparams.z = 512; + + if(p2) { + gridparams.z = 16; + + while(gridparams.z * 2 <= threadsmax) gridparams.z *= 2; + } + + + int blocks = (n + gridparams.z - 1) / gridparams.z; + + if(blocks > 10000) + gridparams.x = gridparams.y = int(sqrt(blocks)); + else { + gridparams.x = blocks; + gridparams.y = 1; + } + + while(gridparams.x * gridparams.y * gridparams.z < n) gridparams.x++; + + if(gridparams.x == 0) gridparams.x = 1; - int blocks=(n+gridparams.z-1)/gridparams.z; - if(blocks>10000) - gridparams.x=gridparams.y=int(sqrt(blocks)); - else - {gridparams.x=blocks; gridparams.y=1;} - while(gridparams.x*gridparams.y*gridparams.z>31; + return ((unsigned int)1 << 31 & (__float_as_int(f))) >> 31; } -//return value: -1 if f<0; else +1 +//return value: -1 if f<0; else +1 static inline __device__ float fsignCUDA(float f) { - return f<0.0f?-1.0f:1.0f; + return f < 0.0f ? -1.0f : 1.0f; } //functions to copy data between global and shared memory (indeed you can copy data between two arbitrary memory regims on device - as long as you have read respectively write rights) //blockDim.y and blockDim.z are assumed to be 1 -static inline __device__ void copySharedToGlob(int* shared, int* glob,const int& n) +static inline __device__ void copySharedToGlob(int* shared, int* glob, const int &n) { - int i,k; - k=n-blockDim.x; - for(i=0;i t, int i) { - int2 v = tex1Dfetch(t,i); - return __hiloint2double(v.y, v.x); + int2 v = tex1Dfetch(t, i); + return __hiloint2double(v.y, v.x); } static __device__ inline X_FLOAT4 tex1Dfetch_double(texture t, int i) { - int4 v = tex1Dfetch(t,2*i); - int4 u = tex1Dfetch(t,2*i+1); - X_FLOAT4 w; - - w.x= __hiloint2double(v.y, v.x); - w.y= __hiloint2double(v.w, v.z); - w.z= __hiloint2double(u.y, u.x); - w.w= __hiloint2double(u.w, u.z); - return w; + int4 v = tex1Dfetch(t, 2 * i); + int4 u = tex1Dfetch(t, 2 * i + 1); + X_FLOAT4 w; + + w.x = __hiloint2double(v.y, v.x); + w.y = __hiloint2double(v.w, v.z); + w.z = __hiloint2double(u.y, u.x); + w.w = __hiloint2double(u.w, u.z); + return w; } #endif inline void BindXTypeTexture(cuda_shared_data* sdata) { - #ifdef CUDA_USE_TEXTURE - _x_type_tex.normalized = false; // access with normalized texture coordinates - _x_type_tex.filterMode = cudaFilterModePoint; // Point mode, so no - _x_type_tex.addressMode[0] = cudaAddressModeWrap; // wrap texture coordinates - const textureReference* x_type_texture_ptr; - cudaGetTextureReference(&x_type_texture_ptr, MY_CONST(x_type_tex)); +#ifdef CUDA_USE_TEXTURE + _x_type_tex.normalized = false; // access with normalized texture coordinates + _x_type_tex.filterMode = cudaFilterModePoint; // Point mode, so no + _x_type_tex.addressMode[0] = cudaAddressModeWrap; // wrap texture coordinates + const textureReference* x_type_texture_ptr = &MY_AP(x_type_tex); - #if X_PRECISION == 1 - cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc(); - cudaBindTexture(0,x_type_texture_ptr, sdata->atom.x_type.dev_data, &channelDescXType, sdata->atom.nmax*sizeof(X_FLOAT4)); - #else - cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc(); - cudaBindTexture(0,x_type_texture_ptr, sdata->atom.x_type.dev_data, &channelDescXType, sdata->atom.nmax*2*sizeof(int4)); - #endif - #endif +#if X_PRECISION 
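Reviewer note on getgrid() above, since the heuristic is easy to misread: the block size goes into gridparams.z, sized so roughly 60 blocks' worth of work exists at each step, and once the block count exceeds 10000 it is split over a 2D grid (the top-up loop compensates for sqrt() rounding down). A minimal host-only sketch of the same logic; Geom and make_launch_geometry are illustrative names, not LAMMPS API, and the 16000-byte shared-memory budget is taken from the code above.

    // Host-only sketch of the getgrid() launch-geometry heuristic (illustrative names).
    #include <cmath>
    #include <cstdio>

    struct Geom { int gx, gy, threads; };          // gx*gy blocks, 'threads' per block

    static Geom make_launch_geometry(int n, int shared_per_thread = 0,
                                     int threadsmax = 256, bool p2 = false)
    {
      const int sharedsize = 16000;                // per-block shared memory budget, bytes
      if (shared_per_thread > 0) {
        int cap = sharedsize / shared_per_thread;  // threads that fit in the budget
        if (cap < threadsmax) threadsmax = cap;
      }

      int threads;                                 // step block size up with problem size
      if      (n < 60 *  32 || threadsmax <  64) threads =  32;
      else if (n < 60 *  64 || threadsmax < 128) threads =  64;
      else if (n < 60 * 128 || threadsmax < 256) threads = 128;
      else if (n < 60 * 256 || threadsmax < 512) threads = 256;
      else                                       threads = 512;

      if (p2) {                                    // largest power of two <= threadsmax
        threads = 16;
        while (threads * 2 <= threadsmax) threads *= 2;
      }

      int blocks = (n + threads - 1) / threads;
      Geom g;
      g.threads = threads;
      if (blocks > 10000) {                        // split over a 2D grid (per-dim limits)
        g.gx = g.gy = (int)sqrt((double)blocks);
      } else {
        g.gx = blocks;
        g.gy = 1;
      }
      while (g.gx * g.gy * threads < n) g.gx++;    // sqrt() rounds down; top up
      if (g.gx == 0) g.gx = 1;
      return g;
    }

    int main()
    {
      Geom g = make_launch_geometry(100000);
      printf("grid %d x %d, %d threads/block\n", g.gx, g.gy, g.threads);  // 196 x 1, 512
      return 0;
    }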
== 1 + cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc(); + cudaBindTexture(0, x_type_texture_ptr, sdata->atom.x_type.dev_data, &channelDescXType, sdata->atom.nmax * sizeof(X_FLOAT4)); +#else + cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc(); + cudaBindTexture(0, x_type_texture_ptr, sdata->atom.x_type.dev_data, &channelDescXType, sdata->atom.nmax * 2 * sizeof(int4)); +#endif +#endif } static __device__ inline X_FLOAT4 fetchXType(int i) { - #ifdef CUDA_USE_TEXTURE - #if X_PRECISION == 1 - return tex1Dfetch(_x_type_tex,i); - #else - return tex1Dfetch_double(_x_type_tex,i); - #endif - #else - return _x_type[i]; - #endif +#ifdef CUDA_USE_TEXTURE +#if X_PRECISION == 1 + return tex1Dfetch(_x_type_tex, i); +#else + return tex1Dfetch_double(_x_type_tex, i); +#endif +#else + return _x_type[i]; +#endif } #if V_PRECISION == 2 static __device__ inline double tex1Dfetch_double_v(texture t, int i) { - int2 v = tex1Dfetch(t,i); - return __hiloint2double(v.y, v.x); + int2 v = tex1Dfetch(t, i); + return __hiloint2double(v.y, v.x); } static __device__ inline V_FLOAT4 tex1Dfetch_double_v(texture t, int i) { - int4 v = tex1Dfetch(t,2*i); - int4 u = tex1Dfetch(t,2*i+1); - V_FLOAT4 w; - - w.x= __hiloint2double(v.y, v.x); - w.y= __hiloint2double(v.w, v.z); - w.z= __hiloint2double(u.y, u.x); - w.w= __hiloint2double(u.w, u.z); - return w; + int4 v = tex1Dfetch(t, 2 * i); + int4 u = tex1Dfetch(t, 2 * i + 1); + V_FLOAT4 w; + + w.x = __hiloint2double(v.y, v.x); + w.y = __hiloint2double(v.w, v.z); + w.z = __hiloint2double(u.y, u.x); + w.w = __hiloint2double(u.w, u.z); + return w; } #endif inline void BindVRadiusTexture(cuda_shared_data* sdata) { - #ifdef CUDA_USE_TEXTURE - _v_radius_tex.normalized = false; // access with normalized texture coordinates - _v_radius_tex.filterMode = cudaFilterModePoint; // Point mode, so no - _v_radius_tex.addressMode[0] = cudaAddressModeWrap; // wrap texture coordinates - const textureReference* v_radius_texture_ptr; - cudaGetTextureReference(&v_radius_texture_ptr, MY_CONST(v_radius_tex)); +#ifdef CUDA_USE_TEXTURE + _v_radius_tex.normalized = false; // access with normalized texture coordinates + _v_radius_tex.filterMode = cudaFilterModePoint; // Point mode, so no + _v_radius_tex.addressMode[0] = cudaAddressModeWrap; // wrap texture coordinates + const textureReference* v_radius_texture_ptr = &MY_AP(v_radius_tex); - #if V_PRECISION == 1 - cudaChannelFormatDesc channelDescVRadius = cudaCreateChannelDesc(); - cudaBindTexture(0,v_radius_texture_ptr, sdata->atom.v_radius.dev_data, &channelDescVRadius, sdata->atom.nmax*sizeof(X_FLOAT4)); - #else - cudaChannelFormatDesc channelDescVRadius = cudaCreateChannelDesc(); - cudaBindTexture(0,v_radius_texture_ptr, sdata->atom.v_radius.dev_data, &channelDescVRadius, sdata->atom.nmax*2*sizeof(int4)); - #endif - #endif +#if V_PRECISION == 1 + cudaChannelFormatDesc channelDescVRadius = cudaCreateChannelDesc(); + cudaBindTexture(0, v_radius_texture_ptr, sdata->atom.v_radius.dev_data, &channelDescVRadius, sdata->atom.nmax * sizeof(X_FLOAT4)); +#else + cudaChannelFormatDesc channelDescVRadius = cudaCreateChannelDesc(); + cudaBindTexture(0, v_radius_texture_ptr, sdata->atom.v_radius.dev_data, &channelDescVRadius, sdata->atom.nmax * 2 * sizeof(int4)); +#endif +#endif } static __device__ inline V_FLOAT4 fetchVRadius(int i) { - #ifdef CUDA_USE_TEXTURE - #if V_PRECISION == 1 - return tex1Dfetch(_v_radius_tex,i); - #else - return tex1Dfetch_double_v(_v_radius_tex,i); - #endif - #else - return _v_radius[i]; - #endif +#ifdef 
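For readers unfamiliar with the tex1Dfetch_double* helpers above: texture fetches cannot return double, so the arrays are bound as int2 (int4 for 4-vectors) and each double is reassembled from its high and low 32-bit words with __hiloint2double. Below is a self-contained sketch of just the reassembly step; the kernel name is hypothetical, the real code fetches the int2 through a texture rather than a plain pointer, and the host-side memcpy split assumes a little-endian host, as this package does.

    // Sketch: reassembling a double from two 32-bit words, as tex1Dfetch_double does.
    #include <cstdio>
    #include <cstring>

    __global__ void reassemble(const int2* in, double* out, int n)
    {
      int i = blockIdx.x * blockDim.x + threadIdx.x;
      if (i < n)
        out[i] = __hiloint2double(in[i].y, in[i].x);   // y = high word, x = low word
    }

    int main()
    {
      double h = 3.141592653589793;
      int2 words;
      memcpy(&words, &h, sizeof(double));   // little-endian: x = low word, y = high word

      int2* d_in;
      double* d_out;
      cudaMalloc(&d_in, sizeof(int2));
      cudaMalloc(&d_out, sizeof(double));
      cudaMemcpy(d_in, &words, sizeof(int2), cudaMemcpyHostToDevice);

      reassemble<<<1, 1>>>(d_in, d_out, 1);

      double back = 0.0;
      cudaMemcpy(&back, d_out, sizeof(double), cudaMemcpyDeviceToHost);
      printf("%.15f\n", back);              // 3.141592653589793
      cudaFree(d_in);
      cudaFree(d_out);
      return 0;
    }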
CUDA_USE_TEXTURE +#if V_PRECISION == 1 + return tex1Dfetch(_v_radius_tex, i); +#else + return tex1Dfetch_double_v(_v_radius_tex, i); +#endif +#else + return _v_radius[i]; +#endif } inline void BindOmegaRmassTexture(cuda_shared_data* sdata) { - #ifdef CUDA_USE_TEXTURE - _omega_rmass_tex.normalized = false; // access with normalized texture coordinates - _omega_rmass_tex.filterMode = cudaFilterModePoint; // Point mode, so no - _omega_rmass_tex.addressMode[0] = cudaAddressModeWrap; // wrap texture coordinates - const textureReference* omega_rmass_texture_ptr; - cudaGetTextureReference(&omega_rmass_texture_ptr, MY_CONST(omega_rmass_tex)); +#ifdef CUDA_USE_TEXTURE + _omega_rmass_tex.normalized = false; // access with normalized texture coordinates + _omega_rmass_tex.filterMode = cudaFilterModePoint; // Point mode, so no + _omega_rmass_tex.addressMode[0] = cudaAddressModeWrap; // wrap texture coordinates + const textureReference* omega_rmass_texture_ptr = &MY_AP(omega_rmass_tex); - #if V_PRECISION == 1 - cudaChannelFormatDesc channelDescOmegaRmass = cudaCreateChannelDesc(); - cudaBindTexture(0,omega_rmass_texture_ptr, sdata->atom.omega_rmass.dev_data, &channelDescOmegaRmass, sdata->atom.nmax*sizeof(X_FLOAT4)); - #else - cudaChannelFormatDesc channelDescOmegaRmass = cudaCreateChannelDesc(); - cudaBindTexture(0,omega_rmass_texture_ptr, sdata->atom.omega_rmass.dev_data, &channelDescOmegaRmass, sdata->atom.nmax*2*sizeof(int4)); - #endif - #endif +#if V_PRECISION == 1 + cudaChannelFormatDesc channelDescOmegaRmass = cudaCreateChannelDesc(); + cudaBindTexture(0, omega_rmass_texture_ptr, sdata->atom.omega_rmass.dev_data, &channelDescOmegaRmass, sdata->atom.nmax * sizeof(X_FLOAT4)); +#else + cudaChannelFormatDesc channelDescOmegaRmass = cudaCreateChannelDesc(); + cudaBindTexture(0, omega_rmass_texture_ptr, sdata->atom.omega_rmass.dev_data, &channelDescOmegaRmass, sdata->atom.nmax * 2 * sizeof(int4)); +#endif +#endif } static __device__ inline V_FLOAT4 fetchOmegaRmass(int i) { - #ifdef CUDA_USE_TEXTURE - #if V_PRECISION == 1 - return tex1Dfetch(_omega_rmass_tex,i); - #else - return tex1Dfetch_double_v(_omega_rmass_tex,i); - #endif - #else - return _omega_rmass[i]; - #endif +#ifdef CUDA_USE_TEXTURE +#if V_PRECISION == 1 + return tex1Dfetch(_omega_rmass_tex, i); +#else + return tex1Dfetch_double_v(_omega_rmass_tex, i); +#endif +#else + return _omega_rmass[i]; +#endif } #if F_PRECISION == 2 static __device__ inline double tex1Dfetch_double_f(texture t, int i) { - int2 v = tex1Dfetch(t,i); - return __hiloint2double(v.y, v.x); + int2 v = tex1Dfetch(t, i); + return __hiloint2double(v.y, v.x); } static __device__ inline F_FLOAT4 tex1Dfetch_double_f(texture t, int i) { - int4 v = tex1Dfetch(t,2*i); - int4 u = tex1Dfetch(t,2*i+1); - F_FLOAT4 w; - - w.x= __hiloint2double(v.y, v.x); - w.y= __hiloint2double(v.w, v.z); - w.z= __hiloint2double(u.y, u.x); - w.w= __hiloint2double(u.w, u.z); - return w; + int4 v = tex1Dfetch(t, 2 * i); + int4 u = tex1Dfetch(t, 2 * i + 1); + F_FLOAT4 w; + + w.x = __hiloint2double(v.y, v.x); + w.y = __hiloint2double(v.w, v.z); + w.z = __hiloint2double(u.y, u.x); + w.w = __hiloint2double(u.w, u.z); + return w; } #endif inline void BindQTexture(cuda_shared_data* sdata) { - #ifdef CUDA_USE_TEXTURE - _q_tex.normalized = false; // access with normalized texture coordinates - _q_tex.filterMode = cudaFilterModePoint; // Point mode, so no - _q_tex.addressMode[0] = cudaAddressModeWrap; // wrap texture coordinates - const textureReference* q_texture_ptr; - cudaGetTextureReference(&q_texture_ptr, 
MY_CONST(q_tex)); +#ifdef CUDA_USE_TEXTURE + _q_tex.normalized = false; // access with normalized texture coordinates + _q_tex.filterMode = cudaFilterModePoint; // Point mode, so no + _q_tex.addressMode[0] = cudaAddressModeWrap; // wrap texture coordinates + const textureReference* q_texture_ptr = &MY_AP(q_tex); - #if F_PRECISION == 1 - cudaChannelFormatDesc channelDescQ = cudaCreateChannelDesc(); - cudaBindTexture(0,q_texture_ptr, sdata->atom.q.dev_data, &channelDescQ, sdata->atom.nmax*sizeof(F_FLOAT)); - #else - cudaChannelFormatDesc channelDescQ = cudaCreateChannelDesc(); - cudaBindTexture(0,q_texture_ptr, sdata->atom.q.dev_data, &channelDescQ, sdata->atom.nmax*sizeof(int2)); - #endif - #endif +#if F_PRECISION == 1 + cudaChannelFormatDesc channelDescQ = cudaCreateChannelDesc(); + cudaBindTexture(0, q_texture_ptr, sdata->atom.q.dev_data, &channelDescQ, sdata->atom.nmax * sizeof(F_FLOAT)); +#else + cudaChannelFormatDesc channelDescQ = cudaCreateChannelDesc(); + cudaBindTexture(0, q_texture_ptr, sdata->atom.q.dev_data, &channelDescQ, sdata->atom.nmax * sizeof(int2)); +#endif +#endif } static __device__ inline F_FLOAT fetchQ(int i) { - #ifdef CUDA_USE_TEXTURE - #if F_PRECISION == 1 - return tex1Dfetch(_q_tex,i); - #else - return tex1Dfetch_double_f(_q_tex,i); - #endif - #else - return _q[i]; - #endif +#ifdef CUDA_USE_TEXTURE +#if F_PRECISION == 1 + return tex1Dfetch(_q_tex, i); +#else + return tex1Dfetch_double_f(_q_tex, i); +#endif +#else + return _q[i]; +#endif } #endif @@ -776,7 +831,7 @@ inline void BindPairCoeffTypeTexture(cuda_shared_data* sdata,coeff_tex) _coeff_tex.filterMode = cudaFilterModePoint; // Point mode, so no _coeff_tex.addressMode[0] = cudaAddressModeWrap; // wrap texture coordinates const textureReference* coeff_texture_ptr; - cudaGetTextureReference(&coeff_texture_ptr, MY_CONST(coeff_tex)); + cudaGetTextureReference(&coeff_texture_ptr, &MY_AP(coeff_tex)); #if F_PRECISION == 1 cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc(); @@ -798,62 +853,67 @@ static __device__ inline X_FLOAT4 fetchXType(int i) #endif #else return _x_type[i]; - #endif + #endif } */ #define SBBITS 30 -static inline __device__ int sbmask(int j) { +static inline __device__ int sbmask(int j) +{ return j >> SBBITS & 3; } -static inline __device__ void minimum_image(X_FLOAT4& delta) +static inline __device__ void minimum_image(X_FLOAT4 &delta) { - if (_triclinic == 0) { - if (_periodicity[0]) { - delta.x += delta.x < -X_F(0.5)*_prd[0] ? _prd[0] : - (delta.x > X_F(0.5)*_prd[0] ?-_prd[0] : X_F(0.0)); + if(_triclinic == 0) { + if(_periodicity[0]) { + delta.x += delta.x < -X_F(0.5) * _prd[0] ? _prd[0] : + (delta.x > X_F(0.5) * _prd[0] ? -_prd[0] : X_F(0.0)); } - if (_periodicity[1]) { - delta.y += delta.y < -X_F(0.5)*_prd[1] ? _prd[1] : - (delta.y > X_F(0.5)*_prd[1] ?-_prd[1] : X_F(0.0)); + + if(_periodicity[1]) { + delta.y += delta.y < -X_F(0.5) * _prd[1] ? _prd[1] : + (delta.y > X_F(0.5) * _prd[1] ? -_prd[1] : X_F(0.0)); } - if (_periodicity[2]) { - delta.z += delta.z < -X_F(0.5)*_prd[2] ? _prd[2] : - (delta.z > X_F(0.5)*_prd[2] ?-_prd[2] : X_F(0.0)); + + if(_periodicity[2]) { + delta.z += delta.z < -X_F(0.5) * _prd[2] ? _prd[2] : + (delta.z > X_F(0.5) * _prd[2] ? -_prd[2] : X_F(0.0)); } } else { - if (_periodicity[1]) { - delta.z += delta.z < -X_F(0.5)*_prd[2] ? _prd[2] : - (delta.z > X_F(0.5)*_prd[2] ?-_prd[2] : X_F(0.0)); - delta.y += delta.z < -X_F(0.5)*_prd[2] ? _h[3] : - (delta.z > X_F(0.5)*_prd[2] ?-_h[3] : X_F(0.0)); - delta.x += delta.z < -X_F(0.5)*_prd[2] ? 
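Context for sbmask() above: LAMMPS packs a neighbor's special-bond class into the top two bits of its index (SBBITS = 30), so j >> SBBITS & 3 recovers a code 0..3 and masking those bits off recovers the plain index. A host-only sketch of the round trip; sbpack and sbstrip are illustrative helpers and not LAMMPS API, only sbmask matches the patch, and the decode relies on the arithmetic right shift that LAMMPS already assumes on its target platforms.

    // Host-only sketch of the special-bond bit packing behind sbmask().
    #include <cstdio>

    #define SBBITS 30

    static int sbmask(int j) { return j >> SBBITS & 3; }   // as in the patch

    // Illustrative helpers (not LAMMPS API): pack/strip the 2-bit class.
    static int sbpack(int j, int sb) { return (int)((unsigned)j | ((unsigned)sb << SBBITS)); }
    static int sbstrip(int j)        { return (int)((unsigned)j & ~(3u << SBBITS)); }

    int main()
    {
      int j = sbpack(12345, 2);    // neighbor 12345 tagged with special-bond class 2
      printf("class=%d index=%d\n", sbmask(j), sbstrip(j));   // class=2 index=12345
      return 0;
    }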
_h[4] : - (delta.z > X_F(0.5)*_prd[2] ?-_h[4] : X_F(0.0)); + if(_periodicity[1]) { + delta.z += delta.z < -X_F(0.5) * _prd[2] ? _prd[2] : + (delta.z > X_F(0.5) * _prd[2] ? -_prd[2] : X_F(0.0)); + delta.y += delta.z < -X_F(0.5) * _prd[2] ? _h[3] : + (delta.z > X_F(0.5) * _prd[2] ? -_h[3] : X_F(0.0)); + delta.x += delta.z < -X_F(0.5) * _prd[2] ? _h[4] : + (delta.z > X_F(0.5) * _prd[2] ? -_h[4] : X_F(0.0)); } - if (_periodicity[1]) { - delta.y += delta.y < -X_F(0.5)*_prd[1] ? _prd[1] : - (delta.y > X_F(0.5)*_prd[1] ?-_prd[1] : X_F(0.0)); - delta.x += delta.y < -X_F(0.5)*_prd[1] ? _h[5] : - (delta.y > X_F(0.5)*_prd[1] ?-_h[5] : X_F(0.0)); + + if(_periodicity[1]) { + delta.y += delta.y < -X_F(0.5) * _prd[1] ? _prd[1] : + (delta.y > X_F(0.5) * _prd[1] ? -_prd[1] : X_F(0.0)); + delta.x += delta.y < -X_F(0.5) * _prd[1] ? _h[5] : + (delta.y > X_F(0.5) * _prd[1] ? -_h[5] : X_F(0.0)); } - if (_periodicity[0]) { - delta.x += delta.x < -X_F(0.5)*_prd[0] ? _prd[0] : - (delta.x > X_F(0.5)*_prd[0] ?-_prd[0] : X_F(0.0)); + + if(_periodicity[0]) { + delta.x += delta.x < -X_F(0.5) * _prd[0] ? _prd[0] : + (delta.x > X_F(0.5) * _prd[0] ? -_prd[0] : X_F(0.0)); } } } -static inline __device__ void closest_image(X_FLOAT4& x1,X_FLOAT4& x2,X_FLOAT4& ci) +static inline __device__ void closest_image(X_FLOAT4 &x1, X_FLOAT4 &x2, X_FLOAT4 &ci) { - ci.x=x2.x-x1.x; - ci.y=x2.y-x1.y; - ci.z=x2.z-x1.z; + ci.x = x2.x - x1.x; + ci.y = x2.y - x1.y; + ci.z = x2.z - x1.z; minimum_image(ci); - ci.x+=x1.x; - ci.y+=x1.y; - ci.z+=x1.z; + ci.x += x1.x; + ci.y += x1.y; + ci.z += x1.z; } diff --git a/lib/cuda/cuda_pair.cu b/lib/cuda/cuda_pair.cu index 0685ce4490..9f9900a2d8 100644 --- a/lib/cuda/cuda_pair.cu +++ b/lib/cuda/cuda_pair.cu @@ -1,28 +1,28 @@ /* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator Original Version: http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov + Steve Plimpton, sjplimp@sandia.gov - See the README file in the top-level LAMMPS directory. + See the README file in the top-level LAMMPS directory. - ----------------------------------------------------------------------- + ----------------------------------------------------------------------- USER-CUDA Package and associated modifications: - https://sourceforge.net/projects/lammpscuda/ + https://sourceforge.net/projects/lammpscuda/ Christian Trott, christian.trott@tu-ilmenau.de Lars Winterfeld, lars.winterfeld@tu-ilmenau.de - Theoretical Physics II, University of Technology Ilmenau, Germany + Theoretical Physics II, University of Technology Ilmenau, Germany - See the README file in the USER-CUDA directory. + See the README file in the USER-CUDA directory. This software is distributed under the GNU General Public License. 
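Two remarks on minimum_image() above. First, the wrap is branch-free per component: add plus or minus prd whenever the displacement component leaves [-prd/2, prd/2]. Second, in the triclinic branch the z-wrap is guarded by _periodicity[1] in both the old and the new code; that looks like it should be _periodicity[2], and deserves a second pair of eyes since this hunk is otherwise formatting-only. A minimal host-only sketch of the orthogonal case, with a plain double[3] standing in for X_FLOAT4:

    // Host-only sketch of the orthogonal minimum-image convention used above.
    #include <cstdio>

    static void minimum_image(double d[3], const double prd[3], const int periodic[3])
    {
      for (int k = 0; k < 3; ++k)
        if (periodic[k])
          d[k] += d[k] < -0.5 * prd[k] ?  prd[k]
                : (d[k] >  0.5 * prd[k] ? -prd[k] : 0.0);
    }

    int main()
    {
      double prd[3] = {10.0, 10.0, 10.0};
      int    per[3] = {1, 1, 1};
      double d[3]   = {9.0, -7.5, 0.25};          // raw displacement x2 - x1
      minimum_image(d, prd, per);
      printf("%g %g %g\n", d[0], d[1], d[2]);     // -1 2.5 0.25
      return 0;
    }

closest_image() is then just x1 plus this wrapped displacement.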
------------------------------------------------------------------------- */ -enum PAIR_FORCES {PAIR_NONE,PAIR_BORN,PAIR_BUCK,PAIR_CG_CMM,PAIR_LJ_CHARMM,PAIR_LJ_CLASS2,PAIR_LJ_CUT, PAIR_LJ_EXPAND, PAIR_LJ_GROMACS, PAIR_LJ_SMOOTH, PAIR_LJ96_CUT, PAIR_MORSE, PAIR_MORSE_R6}; -enum COUL_FORCES {COUL_NONE,COUL_CHARMM,COUL_CHARMM_IMPLICIT,COUL_CUT,COUL_LONG, COUL_DEBYE, COUL_GROMACS,COUL_SPECIAL}; +enum PAIR_FORCES {PAIR_NONE, PAIR_BORN, PAIR_BUCK, PAIR_CG_CMM, PAIR_LJ_CHARMM, PAIR_LJ_CLASS2, PAIR_LJ_CUT, PAIR_LJ_EXPAND, PAIR_LJ_GROMACS, PAIR_LJ_SMOOTH, PAIR_LJ96_CUT, PAIR_MORSE, PAIR_MORSE_R6}; +enum COUL_FORCES {COUL_NONE, COUL_CHARMM, COUL_CHARMM_IMPLICIT, COUL_CUT, COUL_LONG, COUL_DEBYE, COUL_GROMACS, COUL_SPECIAL}; #define DATA_NONE 0 #define DATA_V 1 #define DATA_TAG 2 @@ -80,75 +80,75 @@ __device__ __constant__ F_FLOAT* MY_AP(coeff8_gm); __device__ __constant__ F_FLOAT* MY_AP(coeff9_gm); __device__ __constant__ F_FLOAT* MY_AP(coeff10_gm); - #define _coeff1_gm_tex MY_AP(coeff1_gm_tex) - #if F_PRECISION == 1 - texture _coeff1_gm_tex; - #else - texture _coeff1_gm_tex; - #endif +#define _coeff1_gm_tex MY_AP(coeff1_gm_tex) +#if F_PRECISION == 1 +texture _coeff1_gm_tex; +#else +texture _coeff1_gm_tex; +#endif - #define _coeff2_gm_tex MY_AP(coeff2_gm_tex) - #if F_PRECISION == 1 - texture _coeff2_gm_tex; - #else - texture _coeff2_gm_tex; - #endif +#define _coeff2_gm_tex MY_AP(coeff2_gm_tex) +#if F_PRECISION == 1 +texture _coeff2_gm_tex; +#else +texture _coeff2_gm_tex; +#endif - #define _coeff3_gm_tex MY_AP(coeff3_gm_tex) - #if F_PRECISION == 1 - texture _coeff3_gm_tex; - #else - texture _coeff3_gm_tex; - #endif +#define _coeff3_gm_tex MY_AP(coeff3_gm_tex) +#if F_PRECISION == 1 +texture _coeff3_gm_tex; +#else +texture _coeff3_gm_tex; +#endif - #define _coeff4_gm_tex MY_AP(coeff4_gm_tex) - #if F_PRECISION == 1 - texture _coeff4_gm_tex; - #else - texture _coeff4_gm_tex; - #endif +#define _coeff4_gm_tex MY_AP(coeff4_gm_tex) +#if F_PRECISION == 1 +texture _coeff4_gm_tex; +#else +texture _coeff4_gm_tex; +#endif - #define _coeff5_gm_tex MY_AP(coeff5_gm_tex) - #if F_PRECISION == 1 - texture _coeff5_gm_tex; - #else - texture _coeff5_gm_tex; - #endif +#define _coeff5_gm_tex MY_AP(coeff5_gm_tex) +#if F_PRECISION == 1 +texture _coeff5_gm_tex; +#else +texture _coeff5_gm_tex; +#endif - #define _coeff6_gm_tex MY_AP(coeff6_gm_tex) - #if F_PRECISION == 1 - texture _coeff6_gm_tex; - #else - texture _coeff6_gm_tex; - #endif +#define _coeff6_gm_tex MY_AP(coeff6_gm_tex) +#if F_PRECISION == 1 +texture _coeff6_gm_tex; +#else +texture _coeff6_gm_tex; +#endif - #define _coeff7_gm_tex MY_AP(coeff7_gm_tex) - #if F_PRECISION == 1 - texture _coeff7_gm_tex; - #else - texture _coeff7_gm_tex; - #endif +#define _coeff7_gm_tex MY_AP(coeff7_gm_tex) +#if F_PRECISION == 1 +texture _coeff7_gm_tex; +#else +texture _coeff7_gm_tex; +#endif - #define _coeff8_gm_tex MY_AP(coeff8_gm_tex) - #if F_PRECISION == 1 - texture _coeff8_gm_tex; - #else - texture _coeff8_gm_tex; - #endif +#define _coeff8_gm_tex MY_AP(coeff8_gm_tex) +#if F_PRECISION == 1 +texture _coeff8_gm_tex; +#else +texture _coeff8_gm_tex; +#endif - #define _coeff9_gm_tex MY_AP(coeff9_gm_tex) - #if F_PRECISION == 1 - texture _coeff9_gm_tex; - #else - texture _coeff9_gm_tex; - #endif +#define _coeff9_gm_tex MY_AP(coeff9_gm_tex) +#if F_PRECISION == 1 +texture _coeff9_gm_tex; +#else +texture _coeff9_gm_tex; +#endif - #define _coeff10_gm_tex MY_AP(coeff10_gm_tex) - #if F_PRECISION == 1 - texture _coeff10_gm_tex; - #else - texture _coeff10_gm_tex; - #endif +#define _coeff10_gm_tex 
MY_AP(coeff10_gm_tex) +#if F_PRECISION == 1 +texture _coeff10_gm_tex; +#else +texture _coeff10_gm_tex; +#endif //if more than 5 coefficients are needed for a pair potential add them here @@ -172,17 +172,17 @@ __device__ __constant__ X_FLOAT _cut_innersq[CUDA_MAX_TYPES2]; __device__ __constant__ X_FLOAT _cut_innersq_global; -template -__global__ void Pair_Kernel_TpA(int eflag, int vflag,int eflag_atom,int vflag_atom); +template +__global__ void Pair_Kernel_TpA(int eflag, int vflag, int eflag_atom, int vflag_atom); -template -__global__ void Pair_Kernel_BpA(int eflag, int vflag,int eflag_atom,int vflag_atom); +template +__global__ void Pair_Kernel_BpA(int eflag, int vflag, int eflag_atom, int vflag_atom); -template -__global__ void Pair_Kernel_TpA_opt(int eflag, int vflag,int eflag_atom,int vflag_atom, int comm_phase); +template +__global__ void Pair_Kernel_TpA_opt(int eflag, int vflag, int eflag_atom, int vflag_atom, int comm_phase); -template -__global__ void Pair_Kernel_BpA_opt(int eflag, int vflag,int eflag_atom,int vflag_atom, int comm_phase); +template +__global__ void Pair_Kernel_BpA_opt(int eflag, int vflag, int eflag_atom, int vflag_atom, int comm_phase); #include #include "cuda_pair_cu.h" @@ -191,654 +191,635 @@ __global__ void Pair_Kernel_BpA_opt(int eflag, int vflag,int eflag_atom,int vfla //Functions which are shared by pair styles //Update Buffersize -void Cuda_UpdateBuffer(cuda_shared_data* sdata,int size) +void Cuda_UpdateBuffer(cuda_shared_data* sdata, int size) { - CUT_CHECK_ERROR("Cuda_Pair_UpdateBuffer_AllStyles: before updateBuffer failed"); - if(sdata->buffersizebuffer,sdata->buffersize);) - CudaWrapper_FreeCudaData(sdata->buffer,sdata->buffersize); - sdata->buffer = CudaWrapper_AllocCudaData(size); - sdata->buffersize=size; - sdata->buffer_new++; - MYDBG(printf("New buffer at %p with %i kB\n",sdata->buffer,sdata->buffersize);) - } - cudaMemcpyToSymbol(MY_CONST(buffer), & sdata->buffer, sizeof(int*) ); - CUT_CHECK_ERROR("Cuda_Pair_UpdateBuffer_AllStyles failed"); + CUT_CHECK_ERROR("Cuda_Pair_UpdateBuffer_AllStyles: before updateBuffer failed"); + + if(sdata->buffersize < size) { + MYDBG(printf("Resizing Buffer at %p with %i kB to\n", sdata->buffer, sdata->buffersize);) + CudaWrapper_FreeCudaData(sdata->buffer, sdata->buffersize); + sdata->buffer = CudaWrapper_AllocCudaData(size); + sdata->buffersize = size; + sdata->buffer_new++; + MYDBG(printf("New buffer at %p with %i kB\n", sdata->buffer, sdata->buffersize);) + } + + cudaMemcpyToSymbol(MY_AP(buffer), & sdata->buffer, sizeof(int*)); + CUT_CHECK_ERROR("Cuda_Pair_UpdateBuffer_AllStyles failed"); } void Cuda_Pair_UpdateNeighbor_AllStyles(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist) { //Neighbor - cudaMemcpyToSymbol(MY_CONST(neighbor_maxlocal) , & sneighlist->firstneigh.dim[0] , sizeof(unsigned) ); - cudaMemcpyToSymbol(MY_CONST(firstneigh) , & sneighlist->firstneigh.dev_data, sizeof(int*) ); - cudaMemcpyToSymbol(MY_CONST(ilist) , & sneighlist->ilist .dev_data, sizeof(int*) ); - cudaMemcpyToSymbol(MY_CONST(inum) , & sneighlist->inum , sizeof(int) ); - cudaMemcpyToSymbol(MY_CONST(numneigh) , & sneighlist->numneigh .dev_data, sizeof(int*) ); - cudaMemcpyToSymbol(MY_CONST(neighbors) , & sneighlist->neighbors .dev_data, sizeof(int*) ); - cudaMemcpyToSymbol(MY_CONST(maxneighbors) , & sneighlist->maxneighbors , sizeof(int) ); - cudaMemcpyToSymbol(MY_CONST(overlap_comm) , & sdata->overlap_comm, sizeof(int) ); + cudaMemcpyToSymbol(MY_AP(neighbor_maxlocal) , & sneighlist->firstneigh.dim[0] , sizeof(unsigned)); + 
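Cuda_UpdateBuffer() above is a grow-only scratch allocator: reallocate only when the request exceeds capacity, bump buffer_new so dependent state knows the pointer moved, and re-publish the pointer through the constant-memory symbol on every call. A stripped-down sketch of the same pattern with raw CUDA calls; Scratch, update_buffer, and d_buffer are illustrative names, and the real code routes allocation through the CudaWrapper_* layer.

    // Grow-only device scratch buffer, mirroring Cuda_UpdateBuffer() (sketch).
    #include <cstdio>

    __device__ __constant__ void* d_buffer;    // kernels read the current pointer here

    struct Scratch { void* ptr; int capacity; int generation; };

    static void update_buffer(Scratch* s, int size)
    {
      if (s->capacity < size) {                // never shrink: realloc only on growth
        cudaFree(s->ptr);                      // cudaFree(NULL) is a no-op
        cudaMalloc(&s->ptr, size);
        s->capacity = size;
        s->generation++;                       // dependent state must be rebuilt
      }
      cudaMemcpyToSymbol(d_buffer, &s->ptr, sizeof(void*));   // re-publish every call
    }

    int main()
    {
      Scratch s = {0, 0, 0};
      update_buffer(&s, 1 << 20);              // allocates 1 MiB
      update_buffer(&s, 1 << 16);              // smaller request: no reallocation
      printf("capacity=%d generation=%d\n", s.capacity, s.generation);
      cudaFree(s.ptr);
      return 0;
    }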
cudaMemcpyToSymbol(MY_AP(firstneigh) , & sneighlist->firstneigh.dev_data, sizeof(int*)); + cudaMemcpyToSymbol(MY_AP(ilist) , & sneighlist->ilist .dev_data, sizeof(int*)); + cudaMemcpyToSymbol(MY_AP(inum) , & sneighlist->inum , sizeof(int)); + cudaMemcpyToSymbol(MY_AP(numneigh) , & sneighlist->numneigh .dev_data, sizeof(int*)); + cudaMemcpyToSymbol(MY_AP(neighbors) , & sneighlist->neighbors .dev_data, sizeof(int*)); + cudaMemcpyToSymbol(MY_AP(maxneighbors) , & sneighlist->maxneighbors , sizeof(int)); + cudaMemcpyToSymbol(MY_AP(overlap_comm) , & sdata->overlap_comm, sizeof(int)); -if(sdata->overlap_comm) -{ - cudaMemcpyToSymbol(MY_CONST(numneigh_border) , & sneighlist->numneigh_border .dev_data, sizeof(int*)); - cudaMemcpyToSymbol(MY_CONST(numneigh_inner) , & sneighlist->numneigh_inner .dev_data, sizeof(int*)); - cudaMemcpyToSymbol(MY_CONST(neighbors_border) , & sneighlist->neighbors_border.dev_data, sizeof(int*)); - cudaMemcpyToSymbol(MY_CONST(neighbors_inner) , & sneighlist->neighbors_inner .dev_data, sizeof(int*)); - cudaMemcpyToSymbol(MY_CONST(ilist_border) , & sneighlist->ilist_border .dev_data, sizeof(int*)); - cudaMemcpyToSymbol(MY_CONST(inum_border) , & sneighlist->inum_border .dev_data, sizeof(int*) ); -} + if(sdata->overlap_comm) { + cudaMemcpyToSymbol(MY_AP(numneigh_border) , & sneighlist->numneigh_border .dev_data, sizeof(int*)); + cudaMemcpyToSymbol(MY_AP(numneigh_inner) , & sneighlist->numneigh_inner .dev_data, sizeof(int*)); + cudaMemcpyToSymbol(MY_AP(neighbors_border) , & sneighlist->neighbors_border.dev_data, sizeof(int*)); + cudaMemcpyToSymbol(MY_AP(neighbors_inner) , & sneighlist->neighbors_inner .dev_data, sizeof(int*)); + cudaMemcpyToSymbol(MY_AP(ilist_border) , & sneighlist->ilist_border .dev_data, sizeof(int*)); + cudaMemcpyToSymbol(MY_AP(inum_border) , & sneighlist->inum_border .dev_data, sizeof(int*)); + } } -//Update constants after nmax change which are generally needed by all pair styles +//Update constants after nmax change which are generally needed by all pair styles void Cuda_Pair_UpdateNmax_AllStyles(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist) { - CUT_CHECK_ERROR("Cuda_Pair_UpdateNmax_AllStyles: Begin"); + CUT_CHECK_ERROR("Cuda_Pair_UpdateNmax_AllStyles: Begin"); - //System - cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); - cudaMemcpyToSymbol(MY_CONST(nall) , & sdata->atom.nall , sizeof(int) ); - cudaMemcpyToSymbol(MY_CONST(nmax) , & sdata->atom.nmax , sizeof(int) ); - - //Atom - cudaMemcpyToSymbol(MY_CONST(x) , & sdata->atom.x .dev_data, sizeof(X_FLOAT*) ); - cudaMemcpyToSymbol(MY_CONST(x_type) , & sdata->atom.x_type .dev_data, sizeof(X_FLOAT4*) ); - cudaMemcpyToSymbol(MY_CONST(f) , & sdata->atom.f .dev_data, sizeof(F_FLOAT*) ); - cudaMemcpyToSymbol(MY_CONST(type) , & sdata->atom.type .dev_data, sizeof(int*) ); - cudaMemcpyToSymbol(MY_CONST(q) , & sdata->atom.q .dev_data, sizeof(F_FLOAT*) ); - cudaMemcpyToSymbol(MY_CONST(tag) , & sdata->atom.tag .dev_data, sizeof(int*) ); - cudaMemcpyToSymbol(MY_CONST(eatom) , & sdata->atom.eatom .dev_data, sizeof(ENERGY_FLOAT*) ); - cudaMemcpyToSymbol(MY_CONST(vatom) , & sdata->atom.vatom .dev_data, sizeof(ENERGY_FLOAT*) ); + //System + cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int)); + cudaMemcpyToSymbol(MY_AP(nall) , & sdata->atom.nall , sizeof(int)); + cudaMemcpyToSymbol(MY_AP(nmax) , & sdata->atom.nmax , sizeof(int)); + + //Atom + cudaMemcpyToSymbol(MY_AP(x) , & sdata->atom.x .dev_data, sizeof(X_FLOAT*)); + cudaMemcpyToSymbol(MY_AP(x_type) , & 
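This block is the core of the MY_CONST to MY_AP change: every device array is reached through a pointer stored in __constant__ memory, and the host re-publishes those pointers with cudaMemcpyToSymbol whenever an array is reallocated. Since CUDA 5 the symbol is passed as the variable itself rather than a string, which is what the MY_AP form resolves to. A minimal sketch of the pattern; _ilist here is a stand-in for the real symbols.

    // Sketch: publishing a device pointer through a __constant__ symbol (MY_AP pattern).
    #include <cstdio>

    __device__ __constant__ int* _ilist;       // kernels read the list via this symbol

    __global__ void first_of(int* out) { *out = _ilist[0]; }

    int main()
    {
      int h[4] = {7, 8, 9, 10};
      int *d_list, *d_out;
      cudaMalloc(&d_list, sizeof(h));
      cudaMalloc(&d_out, sizeof(int));
      cudaMemcpy(d_list, h, sizeof(h), cudaMemcpyHostToDevice);

      // Equivalent of: cudaMemcpyToSymbol(MY_AP(ilist), &sneighlist->ilist.dev_data, sizeof(int*));
      cudaMemcpyToSymbol(_ilist, &d_list, sizeof(int*));

      first_of<<<1, 1>>>(d_out);
      int v;
      cudaMemcpy(&v, d_out, sizeof(int), cudaMemcpyDeviceToHost);
      printf("%d\n", v);                       // 7
      cudaFree(d_list);
      cudaFree(d_out);
      return 0;
    }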
sdata->atom.x_type .dev_data, sizeof(X_FLOAT4*)); + cudaMemcpyToSymbol(MY_AP(f) , & sdata->atom.f .dev_data, sizeof(F_FLOAT*)); + cudaMemcpyToSymbol(MY_AP(type) , & sdata->atom.type .dev_data, sizeof(int*)); + cudaMemcpyToSymbol(MY_AP(q) , & sdata->atom.q .dev_data, sizeof(F_FLOAT*)); + cudaMemcpyToSymbol(MY_AP(tag) , & sdata->atom.tag .dev_data, sizeof(int*)); + cudaMemcpyToSymbol(MY_AP(eatom) , & sdata->atom.eatom .dev_data, sizeof(ENERGY_FLOAT*)); + cudaMemcpyToSymbol(MY_AP(vatom) , & sdata->atom.vatom .dev_data, sizeof(ENERGY_FLOAT*)); - //Other - cudaMemcpyToSymbol(MY_CONST(debugdata) , & sdata->debugdata , sizeof(int*) ); - CUT_CHECK_ERROR("Cuda_Pair_UpdateNmax_AllStyles: End"); + //Other + cudaMemcpyToSymbol(MY_AP(debugdata) , & sdata->debugdata , sizeof(int*)); + CUT_CHECK_ERROR("Cuda_Pair_UpdateNmax_AllStyles: End"); } //Initialisation of GPU Constants which rarely change -void Cuda_Pair_Init_AllStyles(cuda_shared_data* sdata, int ncoeff, bool need_q=false, bool use_global_params=false, bool need_innercut=false, bool need_cut=true ) +void Cuda_Pair_Init_AllStyles(cuda_shared_data* sdata, int ncoeff, bool need_q = false, bool use_global_params = false, bool need_innercut = false, bool need_cut = true) { - unsigned cuda_ntypes = sdata->atom.ntypes + 1; - unsigned cuda_ntypes2 = cuda_ntypes * cuda_ntypes; - unsigned n = sizeof(F_FLOAT) * cuda_ntypes2; - unsigned nx = sizeof(X_FLOAT) * cuda_ntypes2; + unsigned cuda_ntypes = sdata->atom.ntypes + 1; + unsigned cuda_ntypes2 = cuda_ntypes * cuda_ntypes; + unsigned n = sizeof(F_FLOAT) * cuda_ntypes2; + unsigned nx = sizeof(X_FLOAT) * cuda_ntypes2; - //check if enough constant memory is available - if((cuda_ntypes2 > CUDA_MAX_TYPES2 )&& !use_global_params) - printf("# CUDA: Cuda_Pair_Init: you need %u types. this is more than %u " - "(assumed at compile time). re-compile with -DCUDA_MAX_TYPES_PLUS_ONE=32 " - "or ajust this in cuda_common.h\n", cuda_ntypes, CUDA_MAX_TYPES_PLUS_ONE-1); - if((cuda_ntypes2 > CUDA_MAX_TYPES2 )&& !use_global_params) - exit(0); - //type conversion of cutoffs and parameters - if(need_cut) - { - X_FLOAT cutsq[cuda_ntypes2]; - for(int i=1; i<=sdata->atom.ntypes; ++i) - { - for(int j=1; j<=sdata->atom.ntypes; ++j) - { - cutsq[i * cuda_ntypes + j] = (X_FLOAT) (sdata->pair.cut_global * sdata->pair.cut_global); - } - } - - int cutsqdiffer=0; - X_FLOAT cutsq_global; - cutsq_global = (X_FLOAT) (sdata->pair.cut_global * sdata->pair.cut_global); - if(sdata->pair.cut) - { - for(int i=1; i<=sdata->atom.ntypes; ++i) - { - for(int j=i; j<=sdata->atom.ntypes; ++j) - { - if(sdata->pair.cut[i][j]>1e-6) - { - cutsq[i * cuda_ntypes + j] = (X_FLOAT) (sdata->pair.cut[i][j] * sdata->pair.cut[i][j]); - cutsq[j * cuda_ntypes + i] = (X_FLOAT) (sdata->pair.cut[i][j] * sdata->pair.cut[i][j]); - } + //check if enough constant memory is available + if((cuda_ntypes2 > CUDA_MAX_TYPES2) && !use_global_params) + printf("# CUDA: Cuda_Pair_Init: you need %u types. this is more than %u " + "(assumed at compile time). 
re-compile with -DCUDA_MAX_TYPES_PLUS_ONE=32 " + "or ajust this in cuda_common.h\n", cuda_ntypes, CUDA_MAX_TYPES_PLUS_ONE - 1); - if(i==1&&j==1) cutsq_global = cutsq[i * cuda_ntypes + j]; - if((cutsq_global - cutsq[i * cuda_ntypes + j])*(cutsq_global - cutsq[i * cuda_ntypes + j]) > 1e-6) - cutsqdiffer++; - } - } - } + if((cuda_ntypes2 > CUDA_MAX_TYPES2) && !use_global_params) + exit(0); - if(sdata->pair.cutsq) - { - for(int i=1; i<=sdata->atom.ntypes; ++i) - { - for(int j=i; j<=sdata->atom.ntypes; ++j) - { - if(sdata->pair.cut[i][j]>1e-6) - { - cutsq[i * cuda_ntypes + j] = (X_FLOAT) (sdata->pair.cutsq[i][j]); - cutsq[j * cuda_ntypes + i] = (X_FLOAT) (sdata->pair.cutsq[i][j]); - } + //type conversion of cutoffs and parameters + if(need_cut) { + X_FLOAT cutsq[cuda_ntypes2]; - if(i==1&&j==1) cutsq_global = cutsq[i * cuda_ntypes + j]; - if((cutsq_global - cutsq[i * cuda_ntypes + j])*(cutsq_global - cutsq[i * cuda_ntypes + j]) > 1e-6) - cutsqdiffer++; - } - } - } - //printf("CUTSQGLOB: %i %e\n",cutsqdiffer,cutsq_global); - if(cutsqdiffer) - { - - cutsq_global = -1.0; - cudaMemcpyToSymbol(MY_CONST(cutsq) , cutsq , nx ); - } - cudaMemcpyToSymbol(MY_CONST(cutsq_global) ,&cutsq_global , sizeof(X_FLOAT) ); - } - - if(need_innercut) - { - X_FLOAT cut_innersq[cuda_ntypes2]; - for(int i=1; i<=sdata->atom.ntypes; ++i) - { - for(int j=1; j<=sdata->atom.ntypes; ++j) - { - cut_innersq[i * cuda_ntypes + j] = (X_FLOAT) (sdata->pair.cut_inner_global * sdata->pair.cut_inner_global); - } - } - - int cutsqdiffer=0; - X_FLOAT cut_innersq_global; - cut_innersq_global = (X_FLOAT) (sdata->pair.cut_inner_global * sdata->pair.cut_inner_global); - if(sdata->pair.cut_inner) - { - for(int i=1; i<=sdata->atom.ntypes; ++i) - { - for(int j=i; j<=sdata->atom.ntypes; ++j) - { - if(sdata->pair.cut_inner[i][j]>1e-6) - { - cut_innersq[i * cuda_ntypes + j] = (X_FLOAT) (sdata->pair.cut_inner[i][j] * sdata->pair.cut_inner[i][j]); - cut_innersq[j * cuda_ntypes + i] = (X_FLOAT) (sdata->pair.cut_inner[i][j] * sdata->pair.cut_inner[i][j]); - } - if(i==1&&j==1) cut_innersq_global = cut_innersq[i * cuda_ntypes + j]; - if((cut_innersq_global - cut_innersq[i * cuda_ntypes + j])*(cut_innersq_global - cut_innersq[i * cuda_ntypes + j]) > 1e-6) - cutsqdiffer++; - } - } - } - if(cutsqdiffer) - { - cut_innersq_global = -1.0; - cudaMemcpyToSymbol(MY_CONST(cut_innersq) , cut_innersq , nx ); - } - cudaMemcpyToSymbol(MY_CONST(cut_innersq_global) ,&cut_innersq_global , sizeof(X_FLOAT) ); + for(int i = 1; i <= sdata->atom.ntypes; ++i) { + for(int j = 1; j <= sdata->atom.ntypes; ++j) { + cutsq[i * cuda_ntypes + j] = (X_FLOAT)(sdata->pair.cut_global * sdata->pair.cut_global); + } } - - if(need_q) - { - X_FLOAT cut_coulsq[cuda_ntypes2]; - for(int i=1; i<=sdata->atom.ntypes; ++i) - { - for(int j=1; j<=sdata->atom.ntypes; ++j) - { - cut_coulsq[i * cuda_ntypes + j] = (X_FLOAT) (sdata->pair.cut_coul_global * sdata->pair.cut_coul_global); - } - } - - int cutsqdiffer=0; - X_FLOAT cut_coulsq_global; - cut_coulsq_global = (X_FLOAT) (sdata->pair.cut_coul_global * sdata->pair.cut_coul_global); - if(sdata->pair.cut_coulsq_global> cut_coulsq_global) cut_coulsq_global = (X_FLOAT) sdata->pair.cut_coulsq_global; - if(sdata->pair.cut_coul) - { - for(int i=1; i<=sdata->atom.ntypes; ++i) - { - for(int j=i; j<=sdata->atom.ntypes; ++j) - { - if(sdata->pair.cut_coul[i][j]>1e-6) - { - cut_coulsq[i * cuda_ntypes + j] = (X_FLOAT) (sdata->pair.cut_coul[i][j] * sdata->pair.cut_coul[i][j]); - cut_coulsq[j * cuda_ntypes + i] = (X_FLOAT) (sdata->pair.cut_coul[i][j] * 
sdata->pair.cut_coul[i][j]); - } - if(i==1&&j==1) cut_coulsq_global = cut_coulsq[i * cuda_ntypes + j]; - if((cut_coulsq_global - cut_coulsq[i * cuda_ntypes + j])*(cut_coulsq_global - cut_coulsq[i * cuda_ntypes + j]) > 1e-6) - cutsqdiffer++; - } - } - } - if(cutsqdiffer) - { - cut_coulsq_global = -1.0; - cudaMemcpyToSymbol(MY_CONST(cut_coulsq) , cut_coulsq , nx ); - } - cudaMemcpyToSymbol(MY_CONST(cut_coulsq_global),&cut_coulsq_global , sizeof(X_FLOAT) ); + + int cutsqdiffer = 0; + X_FLOAT cutsq_global; + cutsq_global = (X_FLOAT)(sdata->pair.cut_global * sdata->pair.cut_global); + + if(sdata->pair.cut) { + for(int i = 1; i <= sdata->atom.ntypes; ++i) { + for(int j = i; j <= sdata->atom.ntypes; ++j) { + if(sdata->pair.cut[i][j] > 1e-6) { + cutsq[i * cuda_ntypes + j] = (X_FLOAT)(sdata->pair.cut[i][j] * sdata->pair.cut[i][j]); + cutsq[j * cuda_ntypes + i] = (X_FLOAT)(sdata->pair.cut[i][j] * sdata->pair.cut[i][j]); + } + + if(i == 1 && j == 1) cutsq_global = cutsq[i * cuda_ntypes + j]; + + if((cutsq_global - cutsq[i * cuda_ntypes + j]) * (cutsq_global - cutsq[i * cuda_ntypes + j]) > 1e-6) + cutsqdiffer++; + } + } } - CUT_CHECK_ERROR("Cuda_Pair: init pre Coeff failed"); - if(ncoeff>0) - { - F_FLOAT coeff1[cuda_ntypes2]; - for(int i=1; i<=sdata->atom.ntypes; ++i) - { - for(int j=1; j<=sdata->atom.ntypes; ++j) - { - coeff1[i * cuda_ntypes + j] = (F_FLOAT) sdata->pair.coeff1[i][j]; - } - } + if(sdata->pair.cutsq) { + for(int i = 1; i <= sdata->atom.ntypes; ++i) { + for(int j = i; j <= sdata->atom.ntypes; ++j) { + if(sdata->pair.cut[i][j] > 1e-6) { + cutsq[i * cuda_ntypes + j] = (X_FLOAT)(sdata->pair.cutsq[i][j]); + cutsq[j * cuda_ntypes + i] = (X_FLOAT)(sdata->pair.cutsq[i][j]); + } - if(use_global_params) - { - cudaMemcpyToSymbol(MY_CONST(coeff1_gm) , &sdata->pair.coeff1_gm.dev_data , sizeof(F_FLOAT*) ); - cudaMemcpy((sdata->pair.coeff1_gm.dev_data),coeff1, n,cudaMemcpyHostToDevice); - - _coeff1_gm_tex.normalized = false; // access with normalized texture coordinates - _coeff1_gm_tex.filterMode = cudaFilterModePoint; // Point mode, so no - _coeff1_gm_tex.addressMode[0] = cudaAddressModeWrap; // wrap texture coordinates - const textureReference* coeff1_gm_texture_ptr; - cudaGetTextureReference(&coeff1_gm_texture_ptr, MY_CONST(coeff1_gm_tex)); - CUT_CHECK_ERROR("Cuda_Pair: init Coeff0 a failed"); + if(i == 1 && j == 1) cutsq_global = cutsq[i * cuda_ntypes + j]; - #if F_PRECISION == 1 - cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc(); - CUT_CHECK_ERROR("Cuda_Pair: init Coeff0 b failed"); - cudaBindTexture(0,coeff1_gm_texture_ptr, sdata->pair.coeff1_gm.dev_data, &channelDescXType, sdata->atom.nmax*sizeof(F_FLOAT)); - CUT_CHECK_ERROR("Cuda_Pair: init Coeff0 c failed"); - #else - cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc(); - CUT_CHECK_ERROR("Cuda_Pair: init Coeff0 b-d failed"); - cudaBindTexture(0,coeff1_gm_texture_ptr, sdata->pair.coeff1_gm.dev_data, &channelDescXType, sdata->atom.nmax*2*sizeof(int2)); - CUT_CHECK_ERROR("Cuda_Pair: init Coeff0 c-d failed"); - #endif - - } - else - cudaMemcpyToSymbol(MY_AP(coeff1), coeff1 , n); - } - CUT_CHECK_ERROR("Cuda_Pair: init Coeff0 failed"); + if((cutsq_global - cutsq[i * cuda_ntypes + j]) * (cutsq_global - cutsq[i * cuda_ntypes + j]) > 1e-6) + cutsqdiffer++; + } + } + } - if(ncoeff>1) - { - F_FLOAT coeff2[cuda_ntypes2]; - for(int i=1; i<=sdata->atom.ntypes; ++i) - { - for(int j=1; j<=sdata->atom.ntypes; ++j) - { - coeff2[i * cuda_ntypes + j] = (F_FLOAT) sdata->pair.coeff2[i][j]; - } - } + //printf("CUTSQGLOB: %i 
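The cutoff setup spread over this hunk follows one pattern three times (cutsq, cut_innersq, cut_coulsq): fill a flattened (ntypes+1)-squared matrix at index i*cuda_ntypes+j, track whether any pair deviates from the [1][1] value, and if so set the corresponding *_global scalar to -1.0 as a sentinel telling kernels to read the per-pair array instead. A host-only sketch of one round, with illustrative names:

    // Host-only sketch of the per-type cutoff flattening in Cuda_Pair_Init_AllStyles.
    #include <cstdio>
    #include <vector>

    int main()
    {
      const int ntypes = 2, nt = ntypes + 1;   // types are 1-based; row/column 0 unused
      double cut[3][3] = {{0, 0, 0}, {0, 2.5, 3.0}, {0, 3.0, 2.5}};

      std::vector<double> cutsq(nt * nt, 0.0);
      double cutsq_global = cut[1][1] * cut[1][1];
      int differ = 0;

      for (int i = 1; i <= ntypes; ++i)
        for (int j = i; j <= ntypes; ++j) {
          double c2 = cut[i][j] * cut[i][j];
          cutsq[i * nt + j] = cutsq[j * nt + i] = c2;            // symmetric fill
          if ((cutsq_global - c2) * (cutsq_global - c2) > 1e-6)  // same test as above
            differ++;
        }

      if (differ) cutsq_global = -1.0;   // sentinel: kernels must use the per-pair array
      printf("cutsq[1][2]=%g cutsq_global=%g\n", cutsq[1 * nt + 2], cutsq_global);
      return 0;
    }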
%e\n",cutsqdiffer,cutsq_global); + if(cutsqdiffer) { - if(use_global_params) - { - cudaMemcpyToSymbol(MY_CONST(coeff2_gm) , &sdata->pair.coeff2_gm.dev_data , sizeof(F_FLOAT*) ); - cudaMemcpy(sdata->pair.coeff2_gm.dev_data, coeff2, n,cudaMemcpyHostToDevice); + cutsq_global = -1.0; + cudaMemcpyToSymbol(MY_AP(cutsq) , cutsq , nx); + } - _coeff2_gm_tex.normalized = false; // access with normalized texture coordinates - _coeff2_gm_tex.filterMode = cudaFilterModePoint; // Point mode, so no - _coeff2_gm_tex.addressMode[0] = cudaAddressModeWrap; // wrap texture coordinates - const textureReference* coeff2_gm_texture_ptr; - cudaGetTextureReference(&coeff2_gm_texture_ptr, MY_CONST(coeff2_gm_tex)); + cudaMemcpyToSymbol(MY_AP(cutsq_global) , &cutsq_global , sizeof(X_FLOAT)); + } - #if F_PRECISION == 1 - cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc(); - cudaBindTexture(0,coeff2_gm_texture_ptr, sdata->pair.coeff2_gm.dev_data, &channelDescXType, sdata->atom.nmax*sizeof(F_FLOAT)); - #else - cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc(); - cudaBindTexture(0,coeff2_gm_texture_ptr, sdata->pair.coeff2_gm.dev_data, &channelDescXType, sdata->atom.nmax*2*sizeof(int2)); - #endif + if(need_innercut) { + X_FLOAT cut_innersq[cuda_ntypes2]; - } - else - cudaMemcpyToSymbol(MY_AP(coeff2), coeff2 , n); - } - CUT_CHECK_ERROR("Cuda_Pair: init Coeff1 failed"); - - if(ncoeff>2) - { - F_FLOAT coeff3[cuda_ntypes2]; - for(int i=1; i<=sdata->atom.ntypes; ++i) - { - for(int j=1; j<=sdata->atom.ntypes; ++j) - { - coeff3[i * cuda_ntypes + j] = (F_FLOAT) sdata->pair.coeff3[i][j]; - } - } + for(int i = 1; i <= sdata->atom.ntypes; ++i) { + for(int j = 1; j <= sdata->atom.ntypes; ++j) { + cut_innersq[i * cuda_ntypes + j] = (X_FLOAT)(sdata->pair.cut_inner_global * sdata->pair.cut_inner_global); + } + } - if(use_global_params) - { - cudaMemcpyToSymbol(MY_CONST(coeff3_gm) , &sdata->pair.coeff3_gm.dev_data , sizeof(F_FLOAT*) ); - cudaMemcpy(sdata->pair.coeff3_gm.dev_data, coeff3, n,cudaMemcpyHostToDevice); - _coeff3_gm_tex.normalized = false; // access with normalized texture coordinates - _coeff3_gm_tex.filterMode = cudaFilterModePoint; // Point mode, so no - _coeff3_gm_tex.addressMode[0] = cudaAddressModeWrap; // wrap texture coordinates - const textureReference* coeff3_gm_texture_ptr; - cudaGetTextureReference(&coeff3_gm_texture_ptr, MY_CONST(coeff3_gm_tex)); + int cutsqdiffer = 0; + X_FLOAT cut_innersq_global; + cut_innersq_global = (X_FLOAT)(sdata->pair.cut_inner_global * sdata->pair.cut_inner_global); - #if F_PRECISION == 1 - cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc(); - cudaBindTexture(0,coeff3_gm_texture_ptr, sdata->pair.coeff3_gm.dev_data, &channelDescXType, sdata->atom.nmax*sizeof(F_FLOAT)); - #else - cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc(); - cudaBindTexture(0,coeff3_gm_texture_ptr, sdata->pair.coeff3_gm.dev_data, &channelDescXType, sdata->atom.nmax*2*sizeof(int2)); - #endif - } - else - cudaMemcpyToSymbol(MY_AP(coeff3), coeff3 , n); - } - CUT_CHECK_ERROR("Cuda_Pair: init Coeff3 failed"); - - if(ncoeff>3) - { - F_FLOAT coeff4[cuda_ntypes2]; - for(int i=1; i<=sdata->atom.ntypes; ++i) - { - for(int j=1; j<=sdata->atom.ntypes; ++j) - { - coeff4[i * cuda_ntypes + j] = (F_FLOAT) sdata->pair.coeff4[i][j]; - } - } - - if(use_global_params) - { - cudaMemcpyToSymbol(MY_CONST(coeff4_gm) , &sdata->pair.coeff4_gm.dev_data , sizeof(F_FLOAT*) ); - cudaMemcpy(sdata->pair.coeff4_gm.dev_data, coeff4, n,cudaMemcpyHostToDevice); - _coeff4_gm_tex.normalized = 
false; // access with normalized texture coordinates - _coeff4_gm_tex.filterMode = cudaFilterModePoint; // Point mode, so no - _coeff4_gm_tex.addressMode[0] = cudaAddressModeWrap; // wrap texture coordinates - const textureReference* coeff4_gm_texture_ptr; - cudaGetTextureReference(&coeff4_gm_texture_ptr, MY_CONST(coeff4_gm_tex)); + if(sdata->pair.cut_inner) { + for(int i = 1; i <= sdata->atom.ntypes; ++i) { + for(int j = i; j <= sdata->atom.ntypes; ++j) { + if(sdata->pair.cut_inner[i][j] > 1e-6) { + cut_innersq[i * cuda_ntypes + j] = (X_FLOAT)(sdata->pair.cut_inner[i][j] * sdata->pair.cut_inner[i][j]); + cut_innersq[j * cuda_ntypes + i] = (X_FLOAT)(sdata->pair.cut_inner[i][j] * sdata->pair.cut_inner[i][j]); + } - #if F_PRECISION == 1 - cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc(); - cudaBindTexture(0,coeff4_gm_texture_ptr, sdata->pair.coeff4_gm.dev_data, &channelDescXType, sdata->atom.nmax*sizeof(F_FLOAT)); - #else - cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc(); - cudaBindTexture(0,coeff4_gm_texture_ptr, sdata->pair.coeff4_gm.dev_data, &channelDescXType, sdata->atom.nmax*2*sizeof(int2)); - #endif - } - else - cudaMemcpyToSymbol(MY_AP(coeff4), coeff4 , n); - } - CUT_CHECK_ERROR("Cuda_Pair: init Coeff4 failed"); + if(i == 1 && j == 1) cut_innersq_global = cut_innersq[i * cuda_ntypes + j]; - if(ncoeff>4) - { - F_FLOAT coeff5[cuda_ntypes2]; - for(int i=1; i<=sdata->atom.ntypes; ++i) - { - for(int j=1; j<=sdata->atom.ntypes; ++j) - { - coeff5[i * cuda_ntypes + j] = (F_FLOAT) sdata->pair.coeff5[i][j]; - } - } - - if(use_global_params) - { - cudaMemcpyToSymbol(MY_CONST(coeff5_gm) , &sdata->pair.coeff5_gm.dev_data , sizeof(F_FLOAT*) ); - cudaMemcpy(sdata->pair.coeff5_gm.dev_data, coeff5, n,cudaMemcpyHostToDevice); - _coeff5_gm_tex.normalized = false; // access with normalized texture coordinates - _coeff5_gm_tex.filterMode = cudaFilterModePoint; // Point mode, so no - _coeff5_gm_tex.addressMode[0] = cudaAddressModeWrap; // wrap texture coordinates - const textureReference* coeff5_gm_texture_ptr; - cudaGetTextureReference(&coeff5_gm_texture_ptr, MY_CONST(coeff5_gm_tex)); + if((cut_innersq_global - cut_innersq[i * cuda_ntypes + j]) * (cut_innersq_global - cut_innersq[i * cuda_ntypes + j]) > 1e-6) + cutsqdiffer++; + } + } + } - #if F_PRECISION == 1 - cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc(); - cudaBindTexture(0,coeff5_gm_texture_ptr, sdata->pair.coeff5_gm.dev_data, &channelDescXType, sdata->atom.nmax*sizeof(F_FLOAT)); - #else - cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc(); - cudaBindTexture(0,coeff5_gm_texture_ptr, sdata->pair.coeff5_gm.dev_data, &channelDescXType, sdata->atom.nmax*2*sizeof(int2)); - #endif - } - else - cudaMemcpyToSymbol(MY_AP(coeff5), coeff5 , n); - } + if(cutsqdiffer) { + cut_innersq_global = -1.0; + cudaMemcpyToSymbol(MY_AP(cut_innersq) , cut_innersq , nx); + } - CUT_CHECK_ERROR("Cuda_Pair: init Coeff5 failed"); - if(ncoeff>5) - { - F_FLOAT coeff6[cuda_ntypes2]; - for(int i=1; i<=sdata->atom.ntypes; ++i) - { - for(int j=1; j<=sdata->atom.ntypes; ++j) - { - coeff6[i * cuda_ntypes + j] = (F_FLOAT) sdata->pair.coeff6[i][j]; - } - } - - if(use_global_params) - { - cudaMemcpyToSymbol(MY_CONST(coeff6_gm) , &sdata->pair.coeff6_gm.dev_data , sizeof(F_FLOAT*) ); - cudaMemcpy(sdata->pair.coeff6_gm.dev_data, coeff6, n,cudaMemcpyHostToDevice); - _coeff6_gm_tex.normalized = false; // access with normalized texture coordinates - _coeff6_gm_tex.filterMode = cudaFilterModePoint; // Point mode, so no - 
_coeff6_gm_tex.addressMode[0] = cudaAddressModeWrap; // wrap texture coordinates - const textureReference* coeff6_gm_texture_ptr; - cudaGetTextureReference(&coeff6_gm_texture_ptr, MY_CONST(coeff6_gm_tex)); + cudaMemcpyToSymbol(MY_AP(cut_innersq_global) , &cut_innersq_global , sizeof(X_FLOAT)); + } - #if F_PRECISION == 1 - cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc(); - cudaBindTexture(0,coeff6_gm_texture_ptr, sdata->pair.coeff6_gm.dev_data, &channelDescXType, sdata->atom.nmax*sizeof(F_FLOAT)); - #else - cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc(); - cudaBindTexture(0,coeff6_gm_texture_ptr, sdata->pair.coeff6_gm.dev_data, &channelDescXType, sdata->atom.nmax*2*sizeof(int2)); - #endif - } - } - CUT_CHECK_ERROR("Cuda_Pair: init Coeff6 failed"); + if(need_q) { + X_FLOAT cut_coulsq[cuda_ntypes2]; - if(ncoeff>6) - { - F_FLOAT coeff7[cuda_ntypes2]; - for(int i=1; i<=sdata->atom.ntypes; ++i) - { - for(int j=1; j<=sdata->atom.ntypes; ++j) - { - coeff7[i * cuda_ntypes + j] = (F_FLOAT) sdata->pair.coeff7[i][j]; - } - } - - if(use_global_params) - { - cudaMemcpyToSymbol(MY_CONST(coeff7_gm) , &sdata->pair.coeff7_gm.dev_data , sizeof(F_FLOAT*) ); - cudaMemcpy(sdata->pair.coeff7_gm.dev_data, coeff7, n,cudaMemcpyHostToDevice); - _coeff7_gm_tex.normalized = false; // access with normalized texture coordinates - _coeff7_gm_tex.filterMode = cudaFilterModePoint; // Point mode, so no - _coeff7_gm_tex.addressMode[0] = cudaAddressModeWrap; // wrap texture coordinates - const textureReference* coeff7_gm_texture_ptr; - cudaGetTextureReference(&coeff7_gm_texture_ptr, MY_CONST(coeff7_gm_tex)); + for(int i = 1; i <= sdata->atom.ntypes; ++i) { + for(int j = 1; j <= sdata->atom.ntypes; ++j) { + cut_coulsq[i * cuda_ntypes + j] = (X_FLOAT)(sdata->pair.cut_coul_global * sdata->pair.cut_coul_global); + } + } - #if F_PRECISION == 1 - cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc(); - cudaBindTexture(0,coeff7_gm_texture_ptr, sdata->pair.coeff7_gm.dev_data, &channelDescXType, sdata->atom.nmax*sizeof(F_FLOAT)); - #else - cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc(); - cudaBindTexture(0,coeff7_gm_texture_ptr, sdata->pair.coeff7_gm.dev_data, &channelDescXType, sdata->atom.nmax*2*sizeof(int2)); - #endif - } - } - CUT_CHECK_ERROR("Cuda_Pair: init Coeff7 failed"); + int cutsqdiffer = 0; + X_FLOAT cut_coulsq_global; + cut_coulsq_global = (X_FLOAT)(sdata->pair.cut_coul_global * sdata->pair.cut_coul_global); - if(ncoeff>7) - { - F_FLOAT coeff8[cuda_ntypes2]; - for(int i=1; i<=sdata->atom.ntypes; ++i) - { - for(int j=1; j<=sdata->atom.ntypes; ++j) - { - coeff8[i * cuda_ntypes + j] = (F_FLOAT) sdata->pair.coeff8[i][j]; - } - } - - if(use_global_params) - { - cudaMemcpyToSymbol(MY_CONST(coeff8_gm) , &sdata->pair.coeff8_gm.dev_data , sizeof(F_FLOAT*) ); - cudaMemcpy(sdata->pair.coeff8_gm.dev_data, coeff8, n,cudaMemcpyHostToDevice); - _coeff8_gm_tex.normalized = false; // access with normalized texture coordinates - _coeff8_gm_tex.filterMode = cudaFilterModePoint; // Point mode, so no - _coeff8_gm_tex.addressMode[0] = cudaAddressModeWrap; // wrap texture coordinates - const textureReference* coeff8_gm_texture_ptr; - cudaGetTextureReference(&coeff8_gm_texture_ptr, MY_CONST(coeff8_gm_tex)); + if(sdata->pair.cut_coulsq_global > cut_coulsq_global) cut_coulsq_global = (X_FLOAT) sdata->pair.cut_coulsq_global; - #if F_PRECISION == 1 - cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc(); - cudaBindTexture(0,coeff8_gm_texture_ptr, 
sdata->pair.coeff8_gm.dev_data, &channelDescXType, sdata->atom.nmax*sizeof(F_FLOAT)); - #else - cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc(); - cudaBindTexture(0,coeff8_gm_texture_ptr, sdata->pair.coeff8_gm.dev_data, &channelDescXType, sdata->atom.nmax*2*sizeof(int2)); - #endif - } - } - CUT_CHECK_ERROR("Cuda_Pair: init Coeff8 failed"); + if(sdata->pair.cut_coul) { + for(int i = 1; i <= sdata->atom.ntypes; ++i) { + for(int j = i; j <= sdata->atom.ntypes; ++j) { + if(sdata->pair.cut_coul[i][j] > 1e-6) { + cut_coulsq[i * cuda_ntypes + j] = (X_FLOAT)(sdata->pair.cut_coul[i][j] * sdata->pair.cut_coul[i][j]); + cut_coulsq[j * cuda_ntypes + i] = (X_FLOAT)(sdata->pair.cut_coul[i][j] * sdata->pair.cut_coul[i][j]); + } - if(ncoeff>8) - { - F_FLOAT coeff9[cuda_ntypes2]; - for(int i=1; i<=sdata->atom.ntypes; ++i) - { - for(int j=1; j<=sdata->atom.ntypes; ++j) - { - coeff9[i * cuda_ntypes + j] = (F_FLOAT) sdata->pair.coeff9[i][j]; - } - } - - if(use_global_params) - { - cudaMemcpyToSymbol(MY_CONST(coeff9_gm) , &sdata->pair.coeff9_gm.dev_data , sizeof(F_FLOAT*) ); - cudaMemcpy(sdata->pair.coeff9_gm.dev_data, coeff9, n,cudaMemcpyHostToDevice); - _coeff9_gm_tex.normalized = false; // access with normalized texture coordinates - _coeff9_gm_tex.filterMode = cudaFilterModePoint; // Point mode, so no - _coeff9_gm_tex.addressMode[0] = cudaAddressModeWrap; // wrap texture coordinates - const textureReference* coeff9_gm_texture_ptr; - cudaGetTextureReference(&coeff9_gm_texture_ptr, MY_CONST(coeff9_gm_tex)); + if(i == 1 && j == 1) cut_coulsq_global = cut_coulsq[i * cuda_ntypes + j]; - #if F_PRECISION == 1 - cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc(); - cudaBindTexture(0,coeff9_gm_texture_ptr, sdata->pair.coeff9_gm.dev_data, &channelDescXType, sdata->atom.nmax*sizeof(F_FLOAT)); - #else - cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc(); - cudaBindTexture(0,coeff9_gm_texture_ptr, sdata->pair.coeff9_gm.dev_data, &channelDescXType, sdata->atom.nmax*2*sizeof(int2)); - #endif - } - } - CUT_CHECK_ERROR("Cuda_Pair: init Coeff9 failed"); - - F_FLOAT special_lj[4]; - special_lj[0]=sdata->pair.special_lj[0]; - special_lj[1]=sdata->pair.special_lj[1]; - special_lj[2]=sdata->pair.special_lj[2]; - special_lj[3]=sdata->pair.special_lj[3]; + if((cut_coulsq_global - cut_coulsq[i * cuda_ntypes + j]) * (cut_coulsq_global - cut_coulsq[i * cuda_ntypes + j]) > 1e-6) + cutsqdiffer++; + } + } + } + + if(cutsqdiffer) { + cut_coulsq_global = -1.0; + cudaMemcpyToSymbol(MY_AP(cut_coulsq) , cut_coulsq , nx); + } + + cudaMemcpyToSymbol(MY_AP(cut_coulsq_global), &cut_coulsq_global , sizeof(X_FLOAT)); + } + + CUT_CHECK_ERROR("Cuda_Pair: init pre Coeff failed"); + + if(ncoeff > 0) { + F_FLOAT coeff1[cuda_ntypes2]; + + for(int i = 1; i <= sdata->atom.ntypes; ++i) { + for(int j = 1; j <= sdata->atom.ntypes; ++j) { + coeff1[i * cuda_ntypes + j] = (F_FLOAT) sdata->pair.coeff1[i][j]; + } + } + + if(use_global_params) { + cudaMemcpyToSymbol(MY_AP(coeff1_gm) , &sdata->pair.coeff1_gm.dev_data , sizeof(F_FLOAT*)); + cudaMemcpy((sdata->pair.coeff1_gm.dev_data), coeff1, n, cudaMemcpyHostToDevice); + + _coeff1_gm_tex.normalized = false; // access with normalized texture coordinates + _coeff1_gm_tex.filterMode = cudaFilterModePoint; // Point mode, so no + _coeff1_gm_tex.addressMode[0] = cudaAddressModeWrap; // wrap texture coordinates + const textureReference* coeff1_gm_texture_ptr = &MY_AP(coeff1_gm_tex); + CUT_CHECK_ERROR("Cuda_Pair: init Coeff0 a failed"); + +#if F_PRECISION == 1 + 
cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc(); + CUT_CHECK_ERROR("Cuda_Pair: init Coeff0 b failed"); + cudaBindTexture(0, coeff1_gm_texture_ptr, sdata->pair.coeff1_gm.dev_data, &channelDescXType, sdata->atom.nmax * sizeof(F_FLOAT)); + CUT_CHECK_ERROR("Cuda_Pair: init Coeff0 c failed"); +#else + cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc(); + CUT_CHECK_ERROR("Cuda_Pair: init Coeff0 b-d failed"); + cudaBindTexture(0, coeff1_gm_texture_ptr, sdata->pair.coeff1_gm.dev_data, &channelDescXType, sdata->atom.nmax * 2 * sizeof(int2)); + CUT_CHECK_ERROR("Cuda_Pair: init Coeff0 c-d failed"); +#endif + + } else + cudaMemcpyToSymbol(MY_AP(coeff1), coeff1 , n); + } + + CUT_CHECK_ERROR("Cuda_Pair: init Coeff0 failed"); + + if(ncoeff > 1) { + F_FLOAT coeff2[cuda_ntypes2]; + + for(int i = 1; i <= sdata->atom.ntypes; ++i) { + for(int j = 1; j <= sdata->atom.ntypes; ++j) { + coeff2[i * cuda_ntypes + j] = (F_FLOAT) sdata->pair.coeff2[i][j]; + } + } + + if(use_global_params) { + cudaMemcpyToSymbol(MY_AP(coeff2_gm) , &sdata->pair.coeff2_gm.dev_data , sizeof(F_FLOAT*)); + cudaMemcpy(sdata->pair.coeff2_gm.dev_data, coeff2, n, cudaMemcpyHostToDevice); + + _coeff2_gm_tex.normalized = false; // access with normalized texture coordinates + _coeff2_gm_tex.filterMode = cudaFilterModePoint; // Point mode, so no + _coeff2_gm_tex.addressMode[0] = cudaAddressModeWrap; // wrap texture coordinates + const textureReference* coeff2_gm_texture_ptr = &MY_AP(coeff2_gm_tex); + +#if F_PRECISION == 1 + cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc(); + cudaBindTexture(0, coeff2_gm_texture_ptr, sdata->pair.coeff2_gm.dev_data, &channelDescXType, sdata->atom.nmax * sizeof(F_FLOAT)); +#else + cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc(); + cudaBindTexture(0, coeff2_gm_texture_ptr, sdata->pair.coeff2_gm.dev_data, &channelDescXType, sdata->atom.nmax * 2 * sizeof(int2)); +#endif + + } else + cudaMemcpyToSymbol(MY_AP(coeff2), coeff2 , n); + } + + CUT_CHECK_ERROR("Cuda_Pair: init Coeff1 failed"); + + if(ncoeff > 2) { + F_FLOAT coeff3[cuda_ntypes2]; + + for(int i = 1; i <= sdata->atom.ntypes; ++i) { + for(int j = 1; j <= sdata->atom.ntypes; ++j) { + coeff3[i * cuda_ntypes + j] = (F_FLOAT) sdata->pair.coeff3[i][j]; + } + } + + if(use_global_params) { + cudaMemcpyToSymbol(MY_AP(coeff3_gm) , &sdata->pair.coeff3_gm.dev_data , sizeof(F_FLOAT*)); + cudaMemcpy(sdata->pair.coeff3_gm.dev_data, coeff3, n, cudaMemcpyHostToDevice); + _coeff3_gm_tex.normalized = false; // access with normalized texture coordinates + _coeff3_gm_tex.filterMode = cudaFilterModePoint; // Point mode, so no + _coeff3_gm_tex.addressMode[0] = cudaAddressModeWrap; // wrap texture coordinates + const textureReference* coeff3_gm_texture_ptr = &MY_AP(coeff3_gm_tex); + +#if F_PRECISION == 1 + cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc(); + cudaBindTexture(0, coeff3_gm_texture_ptr, sdata->pair.coeff3_gm.dev_data, &channelDescXType, sdata->atom.nmax * sizeof(F_FLOAT)); +#else + cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc(); + cudaBindTexture(0, coeff3_gm_texture_ptr, sdata->pair.coeff3_gm.dev_data, &channelDescXType, sdata->atom.nmax * 2 * sizeof(int2)); +#endif + } else + cudaMemcpyToSymbol(MY_AP(coeff3), coeff3 , n); + } + + CUT_CHECK_ERROR("Cuda_Pair: init Coeff3 failed"); + + if(ncoeff > 3) { + F_FLOAT coeff4[cuda_ntypes2]; + + for(int i = 1; i <= sdata->atom.ntypes; ++i) { + for(int j = 1; j <= sdata->atom.ntypes; ++j) { + coeff4[i * cuda_ntypes + j] = (F_FLOAT) 
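Two things worth noting in the coefficient-texture binding this hunk reflows. The channel format switches on F_PRECISION because textures cannot carry double: single precision binds a float channel, double precision binds int2 and the fetch side reassembles with __hiloint2double as shown earlier. Separately, the bind length uses sdata->atom.nmax even though the coefficient table holds only cuda_ntypes-squared entries; that looks copied from the per-atom binders and deserves a look, though over-binding is benign in practice. A sketch of the precision switch using the legacy texture-reference API (deprecated in CUDA 11, removed in 12); F_FLOAT and coeff_tex are stand-ins:

    // Sketch of the F_PRECISION channel-format switch for a coefficient texture.
    #include <cstdio>

    #define F_PRECISION 2
    #if F_PRECISION == 1
    typedef float F_FLOAT;
    texture<float, 1> coeff_tex;     // one float per fetch
    #else
    typedef double F_FLOAT;
    texture<int2, 1> coeff_tex;      // a double travels as two 32-bit words
    #endif

    int main()
    {
      const int n = 16;
      F_FLOAT* d_coeff;
      cudaMalloc(&d_coeff, n * sizeof(F_FLOAT));

    #if F_PRECISION == 1
      cudaChannelFormatDesc desc = cudaCreateChannelDesc<float>();
    #else
      cudaChannelFormatDesc desc = cudaCreateChannelDesc<int2>();
    #endif
      cudaBindTexture(0, &coeff_tex, d_coeff, &desc, n * sizeof(F_FLOAT));
      printf("bind: %s\n", cudaGetErrorString(cudaGetLastError()));
      cudaFree(d_coeff);
      return 0;
    }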
+  if(ncoeff > 3) {
+    F_FLOAT coeff4[cuda_ntypes2];
+
+    for(int i = 1; i <= sdata->atom.ntypes; ++i) {
+      for(int j = 1; j <= sdata->atom.ntypes; ++j) {
+        coeff4[i * cuda_ntypes + j] = (F_FLOAT) sdata->pair.coeff4[i][j];
+      }
+    }
+
+    if(use_global_params) {
+      cudaMemcpyToSymbol(MY_AP(coeff4_gm) , &sdata->pair.coeff4_gm.dev_data , sizeof(F_FLOAT*));
+      cudaMemcpy(sdata->pair.coeff4_gm.dev_data, coeff4, n, cudaMemcpyHostToDevice);
+      _coeff4_gm_tex.normalized = false; // access with normalized texture coordinates
+      _coeff4_gm_tex.filterMode = cudaFilterModePoint; // Point mode, so no
+      _coeff4_gm_tex.addressMode[0] = cudaAddressModeWrap; // wrap texture coordinates
+      const textureReference* coeff4_gm_texture_ptr = &MY_AP(coeff4_gm_tex);
+
+#if F_PRECISION == 1
+      cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc<float>();
+      cudaBindTexture(0, coeff4_gm_texture_ptr, sdata->pair.coeff4_gm.dev_data, &channelDescXType, sdata->atom.nmax * sizeof(F_FLOAT));
+#else
+      cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc<int2>();
+      cudaBindTexture(0, coeff4_gm_texture_ptr, sdata->pair.coeff4_gm.dev_data, &channelDescXType, sdata->atom.nmax * 2 * sizeof(int2));
+#endif
+    } else
+      cudaMemcpyToSymbol(MY_AP(coeff4), coeff4 , n);
+  }
+
+  CUT_CHECK_ERROR("Cuda_Pair: init Coeff4 failed");
+
+  if(ncoeff > 4) {
+    F_FLOAT coeff5[cuda_ntypes2];
+
+    for(int i = 1; i <= sdata->atom.ntypes; ++i) {
+      for(int j = 1; j <= sdata->atom.ntypes; ++j) {
+        coeff5[i * cuda_ntypes + j] = (F_FLOAT) sdata->pair.coeff5[i][j];
+      }
+    }
+
+    if(use_global_params) {
+      cudaMemcpyToSymbol(MY_AP(coeff5_gm) , &sdata->pair.coeff5_gm.dev_data , sizeof(F_FLOAT*));
+      cudaMemcpy(sdata->pair.coeff5_gm.dev_data, coeff5, n, cudaMemcpyHostToDevice);
+      _coeff5_gm_tex.normalized = false; // access with normalized texture coordinates
+      _coeff5_gm_tex.filterMode = cudaFilterModePoint; // Point mode, so no
+      _coeff5_gm_tex.addressMode[0] = cudaAddressModeWrap; // wrap texture coordinates
+      const textureReference* coeff5_gm_texture_ptr = &MY_AP(coeff5_gm_tex);
+
+#if F_PRECISION == 1
+      cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc<float>();
+      cudaBindTexture(0, coeff5_gm_texture_ptr, sdata->pair.coeff5_gm.dev_data, &channelDescXType, sdata->atom.nmax * sizeof(F_FLOAT));
+#else
+      cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc<int2>();
+      cudaBindTexture(0, coeff5_gm_texture_ptr, sdata->pair.coeff5_gm.dev_data, &channelDescXType, sdata->atom.nmax * 2 * sizeof(int2));
+#endif
+    } else
+      cudaMemcpyToSymbol(MY_AP(coeff5), coeff5 , n);
+  }
+
+  CUT_CHECK_ERROR("Cuda_Pair: init Coeff5 failed");
+
+  if(ncoeff > 5) {
+    F_FLOAT coeff6[cuda_ntypes2];
+
+    for(int i = 1; i <= sdata->atom.ntypes; ++i) {
+      for(int j = 1; j <= sdata->atom.ntypes; ++j) {
+        coeff6[i * cuda_ntypes + j] = (F_FLOAT) sdata->pair.coeff6[i][j];
+      }
+    }
+
+    if(use_global_params) {
+      cudaMemcpyToSymbol(MY_AP(coeff6_gm) , &sdata->pair.coeff6_gm.dev_data , sizeof(F_FLOAT*));
+      cudaMemcpy(sdata->pair.coeff6_gm.dev_data, coeff6, n, cudaMemcpyHostToDevice);
+      _coeff6_gm_tex.normalized = false; // access with normalized texture coordinates
+      _coeff6_gm_tex.filterMode = cudaFilterModePoint; // Point mode, so no
+      _coeff6_gm_tex.addressMode[0] = cudaAddressModeWrap; // wrap texture coordinates
+      const textureReference* coeff6_gm_texture_ptr = &MY_AP(coeff6_gm_tex);
+
+#if F_PRECISION == 1
+      cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc<float>();
+      cudaBindTexture(0, coeff6_gm_texture_ptr, sdata->pair.coeff6_gm.dev_data, &channelDescXType, sdata->atom.nmax * sizeof(F_FLOAT));
+#else
+      cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc<int2>();
+      cudaBindTexture(0, coeff6_gm_texture_ptr, sdata->pair.coeff6_gm.dev_data, &channelDescXType, sdata->atom.nmax * 2 * sizeof(int2));
+#endif
+    }
+  }
+
+  CUT_CHECK_ERROR("Cuda_Pair: init Coeff6 failed");
+
+  if(ncoeff > 6) {
+    F_FLOAT coeff7[cuda_ntypes2];
+
+    for(int i = 1; i <= sdata->atom.ntypes; ++i) {
+      for(int j = 1; j <= sdata->atom.ntypes; ++j) {
+        coeff7[i * cuda_ntypes + j] = (F_FLOAT) sdata->pair.coeff7[i][j];
+      }
+    }
+
+    if(use_global_params) {
+      cudaMemcpyToSymbol(MY_AP(coeff7_gm) , &sdata->pair.coeff7_gm.dev_data , sizeof(F_FLOAT*));
+      cudaMemcpy(sdata->pair.coeff7_gm.dev_data, coeff7, n, cudaMemcpyHostToDevice);
+      _coeff7_gm_tex.normalized = false; // access with normalized texture coordinates
+      _coeff7_gm_tex.filterMode = cudaFilterModePoint; // Point mode, so no
+      _coeff7_gm_tex.addressMode[0] = cudaAddressModeWrap; // wrap texture coordinates
+      const textureReference* coeff7_gm_texture_ptr = &MY_AP(coeff7_gm_tex);
+
+#if F_PRECISION == 1
+      cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc<float>();
+      cudaBindTexture(0, coeff7_gm_texture_ptr, sdata->pair.coeff7_gm.dev_data, &channelDescXType, sdata->atom.nmax * sizeof(F_FLOAT));
+#else
+      cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc<int2>();
+      cudaBindTexture(0, coeff7_gm_texture_ptr, sdata->pair.coeff7_gm.dev_data, &channelDescXType, sdata->atom.nmax * 2 * sizeof(int2));
+#endif
+    }
+  }
+
+  CUT_CHECK_ERROR("Cuda_Pair: init Coeff7 failed");
+
+  if(ncoeff > 7) {
+    F_FLOAT coeff8[cuda_ntypes2];
+
+    for(int i = 1; i <= sdata->atom.ntypes; ++i) {
+      for(int j = 1; j <= sdata->atom.ntypes; ++j) {
+        coeff8[i * cuda_ntypes + j] = (F_FLOAT) sdata->pair.coeff8[i][j];
+      }
+    }
+
+    if(use_global_params) {
+      cudaMemcpyToSymbol(MY_AP(coeff8_gm) , &sdata->pair.coeff8_gm.dev_data , sizeof(F_FLOAT*));
+      cudaMemcpy(sdata->pair.coeff8_gm.dev_data, coeff8, n, cudaMemcpyHostToDevice);
+      _coeff8_gm_tex.normalized = false; // access with normalized texture coordinates
+      _coeff8_gm_tex.filterMode = cudaFilterModePoint; // Point mode, so no
+      _coeff8_gm_tex.addressMode[0] = cudaAddressModeWrap; // wrap texture coordinates
+      const textureReference* coeff8_gm_texture_ptr = &MY_AP(coeff8_gm_tex);
+
+#if F_PRECISION == 1
+      cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc<float>();
+      cudaBindTexture(0, coeff8_gm_texture_ptr, sdata->pair.coeff8_gm.dev_data, &channelDescXType, sdata->atom.nmax * sizeof(F_FLOAT));
+#else
+      cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc<int2>();
+      cudaBindTexture(0, coeff8_gm_texture_ptr, sdata->pair.coeff8_gm.dev_data, &channelDescXType, sdata->atom.nmax * 2 * sizeof(int2));
+#endif
+    }
+  }
+
+  CUT_CHECK_ERROR("Cuda_Pair: init Coeff8 failed");
+
+  if(ncoeff > 8) {
+    F_FLOAT coeff9[cuda_ntypes2];
+
+    for(int i = 1; i <= sdata->atom.ntypes; ++i) {
+      for(int j = 1; j <= sdata->atom.ntypes; ++j) {
+        coeff9[i * cuda_ntypes + j] = (F_FLOAT) sdata->pair.coeff9[i][j];
+      }
+    }
+
+    if(use_global_params) {
+      cudaMemcpyToSymbol(MY_AP(coeff9_gm) , &sdata->pair.coeff9_gm.dev_data , sizeof(F_FLOAT*));
+      cudaMemcpy(sdata->pair.coeff9_gm.dev_data, coeff9, n, cudaMemcpyHostToDevice);
+      _coeff9_gm_tex.normalized = false; // access with normalized texture coordinates
+      _coeff9_gm_tex.filterMode = cudaFilterModePoint; // Point mode, so no
+      _coeff9_gm_tex.addressMode[0] = cudaAddressModeWrap; // wrap texture coordinates
+      const textureReference* coeff9_gm_texture_ptr = &MY_AP(coeff9_gm_tex);
+
+#if F_PRECISION == 1
+      cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc<float>();
+      cudaBindTexture(0, coeff9_gm_texture_ptr, sdata->pair.coeff9_gm.dev_data, &channelDescXType, sdata->atom.nmax * sizeof(F_FLOAT));
+#else
+      cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc<int2>();
+      cudaBindTexture(0, coeff9_gm_texture_ptr, sdata->pair.coeff9_gm.dev_data, &channelDescXType, sdata->atom.nmax * 2 * sizeof(int2));
+#endif
+    }
+  }
+
+  CUT_CHECK_ERROR("Cuda_Pair: init Coeff9 failed");
+
+  F_FLOAT special_lj[4];
+  special_lj[0] = sdata->pair.special_lj[0];
+  special_lj[1] = sdata->pair.special_lj[1];
+  special_lj[2] = sdata->pair.special_lj[2];
+  special_lj[3] = sdata->pair.special_lj[3];
-  X_FLOAT box_size[3] =
-  {
-  sdata->domain.subhi[0] - sdata->domain.sublo[0],
-  sdata->domain.subhi[1] - sdata->domain.sublo[1],
-  sdata->domain.subhi[2] - sdata->domain.sublo[2]
-  };
-
-  cudaMemcpyToSymbol(MY_CONST(box_size) , box_size , sizeof(X_FLOAT)*3);
-  cudaMemcpyToSymbol(MY_CONST(cuda_ntypes) ,&cuda_ntypes , sizeof(unsigned) );
-  cudaMemcpyToSymbol(MY_CONST(special_lj) , special_lj , sizeof(F_FLOAT)*4);
-  cudaMemcpyToSymbol(MY_CONST(virial) ,&sdata->pair.virial.dev_data , sizeof(ENERGY_FLOAT*) );
-  cudaMemcpyToSymbol(MY_CONST(eng_vdwl) ,&sdata->pair.eng_vdwl.dev_data , sizeof(ENERGY_FLOAT*) );
-  cudaMemcpyToSymbol(MY_CONST(periodicity) , sdata->domain.periodicity , sizeof(int)*3 );
-  cudaMemcpyToSymbol(MY_CONST(collect_forces_later), &sdata->pair.collect_forces_later , sizeof(int) );
-
-  if(need_q)
-  {
-  F_FLOAT qqrd2e_tmp=sdata->pppm.qqrd2e;
-  F_FLOAT special_coul[4];
-  special_coul[0]=sdata->pair.special_coul[0];
-  special_coul[1]=sdata->pair.special_coul[1];
-  special_coul[2]=sdata->pair.special_coul[2];
-  special_coul[3]=sdata->pair.special_coul[3];
-
-  cudaMemcpyToSymbol(MY_CONST(special_coul) , special_coul , sizeof(F_FLOAT)*4);
-  cudaMemcpyToSymbol(MY_CONST(g_ewald) ,&sdata->pair.g_ewald , sizeof(F_FLOAT) );
-  cudaMemcpyToSymbol(MY_CONST(qqrd2e) ,&qqrd2e_tmp , sizeof(F_FLOAT) );
-  cudaMemcpyToSymbol(MY_CONST(kappa) ,&sdata->pair.kappa , sizeof(F_FLOAT) );
-  cudaMemcpyToSymbol(MY_CONST(eng_coul) ,&sdata->pair.eng_coul.dev_data , sizeof(ENERGY_FLOAT*) );
-  }
-  CUT_CHECK_ERROR("Cuda_Pair: init failed");
+  X_FLOAT box_size[3] = {
+    sdata->domain.subhi[0] - sdata->domain.sublo[0],
+    sdata->domain.subhi[1] - sdata->domain.sublo[1],
+    sdata->domain.subhi[2] - sdata->domain.sublo[2]
+  };
+
+  cudaMemcpyToSymbol(MY_AP(box_size) , box_size , sizeof(X_FLOAT) * 3);
+  cudaMemcpyToSymbol(MY_AP(cuda_ntypes) , &cuda_ntypes , sizeof(unsigned));
+  cudaMemcpyToSymbol(MY_AP(special_lj) , special_lj , sizeof(F_FLOAT) * 4);
+  cudaMemcpyToSymbol(MY_AP(virial) , &sdata->pair.virial.dev_data , sizeof(ENERGY_FLOAT*));
+  cudaMemcpyToSymbol(MY_AP(eng_vdwl) , &sdata->pair.eng_vdwl.dev_data , sizeof(ENERGY_FLOAT*));
+  cudaMemcpyToSymbol(MY_AP(periodicity) , sdata->domain.periodicity , sizeof(int) * 3);
+  cudaMemcpyToSymbol(MY_AP(collect_forces_later), &sdata->pair.collect_forces_later , sizeof(int));
+
+  if(need_q) {
+    F_FLOAT qqrd2e_tmp = sdata->pppm.qqrd2e;
+    F_FLOAT special_coul[4];
+    special_coul[0] = sdata->pair.special_coul[0];
+    special_coul[1] = sdata->pair.special_coul[1];
+    special_coul[2] = sdata->pair.special_coul[2];
+    special_coul[3] = sdata->pair.special_coul[3];
+
+    cudaMemcpyToSymbol(MY_AP(special_coul) , special_coul , sizeof(F_FLOAT) * 4);
+    cudaMemcpyToSymbol(MY_AP(g_ewald) , &sdata->pair.g_ewald , sizeof(F_FLOAT));
+    cudaMemcpyToSymbol(MY_AP(qqrd2e) , &qqrd2e_tmp , sizeof(F_FLOAT));
+    cudaMemcpyToSymbol(MY_AP(kappa) , &sdata->pair.kappa , sizeof(F_FLOAT));
+    cudaMemcpyToSymbol(MY_AP(eng_coul) , &sdata->pair.eng_coul.dev_data , sizeof(ENERGY_FLOAT*));
+  }
+
+  CUT_CHECK_ERROR("Cuda_Pair: init failed");
 }
 
 timespec startpairtime, endpairtime;
 
 //Function which is called prior to kernel invocation, determines grid, binds textures, updates constant memory if necessary
-void Cuda_Pair_PreKernel_AllStyles(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist,int eflag, int vflag, dim3& grid, dim3& threads, int& sharedperproc,bool need_q=false,int maxthreads=256)
+void Cuda_Pair_PreKernel_AllStyles(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, dim3 &grid, dim3 &threads, int &sharedperproc, bool need_q = false, int maxthreads = 256)
 {
-  if(sdata->atom.nlocal==0) return;
-  if(sdata->atom.update_neigh)
-    Cuda_Pair_UpdateNeighbor_AllStyles(sdata,sneighlist);
-  if(sdata->atom.update_nmax)
-    Cuda_Pair_UpdateNmax_AllStyles(sdata,sneighlist);
-  if(sdata->atom.update_nlocal)
-  {
-    cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) );
-    cudaMemcpyToSymbol(MY_CONST(nall) , & sdata->atom.nall , sizeof(int) );
-  }
+  if(sdata->atom.nlocal == 0) return;
+
+  if(sdata->atom.update_neigh)
+    Cuda_Pair_UpdateNeighbor_AllStyles(sdata, sneighlist);
+
+  if(sdata->atom.update_nmax)
+    Cuda_Pair_UpdateNmax_AllStyles(sdata, sneighlist);
+
+  if(sdata->atom.update_nlocal) {
+    cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
+    cudaMemcpyToSymbol(MY_AP(nall) , & sdata->atom.nall , sizeof(int));
+  }
 
-  BindXTypeTexture(sdata);
-  if(need_q) BindQTexture(sdata);
-
+  BindXTypeTexture(sdata);
 
-  sharedperproc=0;
-  if(sdata->pair.use_block_per_atom) sharedperproc+=3;
-  if(eflag) sharedperproc+=1;
-  if(need_q && eflag) sharedperproc+=1;
-  if(vflag) sharedperproc+=6;
+  if(need_q) BindQTexture(sdata);
 
-  int threadnum = sneighlist->inum;
-  if (sdata->comm.comm_phase==2)threadnum=sneighlist->inum_border2;
-  if(sdata->pair.use_block_per_atom) {threadnum*=64; maxthreads=64;}
-  int3 layout=getgrid(threadnum,sharedperproc*sizeof(ENERGY_FLOAT),maxthreads,true); //need to limit to 192 threads due to register limit
-  threads.x = layout.z; threads.y = 1; threads.z = 1;
-  grid.x = layout.x; grid.y = layout.y; grid.z = 1;
-
-  int size=(unsigned)(layout.y*layout.x)*sharedperproc*sizeof(ENERGY_FLOAT);
-  if(sdata->pair.collect_forces_later) size+=(unsigned)(sdata->atom.nmax*3*sizeof(F_FLOAT));
-  Cuda_UpdateBuffer(sdata,size);
-
-  if(sdata->pair.use_block_per_atom)
-    cudaMemset(sdata->buffer, 0, size);
-  sdata->pair.lastgridsize=grid.x*grid.y;
-  sdata->pair.n_energy_virial=sharedperproc;
-  if(sdata->pair.use_block_per_atom) sdata->pair.n_energy_virial-=3;
+  sharedperproc = 0;
 
-  clock_gettime(CLOCK_REALTIME,&startpairtime);
+  if(sdata->pair.use_block_per_atom) sharedperproc += 3;
 
-  MYDBG( printf("# CUDA: Cuda_Pair: kernel start eflag: %i vflag: %i config: %i %i %i %i\n",eflag,vflag,grid.x,grid.y, threads.x,sharedperproc*sizeof(ENERGY_FLOAT)*threads.x); )
+  if(eflag) sharedperproc += 1;
+
+  if(need_q && eflag) sharedperproc += 1;
+
+  if(vflag) sharedperproc += 6;
+
+  int threadnum = sneighlist->inum;
+
+  if(sdata->comm.comm_phase == 2)threadnum = sneighlist->inum_border2;
+
+  if(sdata->pair.use_block_per_atom) {
+    threadnum *= 64;
+    maxthreads = 64;
+  }
+
+  int3 layout = getgrid(threadnum, sharedperproc * sizeof(ENERGY_FLOAT), maxthreads, true); //need to limit to 192 threads due to register limit
+  threads.x = layout.z;
+  threads.y = 1;
+  threads.z = 1;
+  grid.x = layout.x;
+  grid.y = layout.y;
+  grid.z = 1;
+
+  int size = (unsigned)(layout.y * layout.x) * sharedperproc * sizeof(ENERGY_FLOAT);
+
+  if(sdata->pair.collect_forces_later) size += (unsigned)(sdata->atom.nmax * 3 * sizeof(F_FLOAT));
+
+  Cuda_UpdateBuffer(sdata, size);
+
+  if(sdata->pair.use_block_per_atom)
+    cudaMemset(sdata->buffer, 0, size);
+
+  sdata->pair.lastgridsize = grid.x * grid.y;
+  sdata->pair.n_energy_virial = sharedperproc;
+
+  if(sdata->pair.use_block_per_atom) sdata->pair.n_energy_virial -= 3;
+
+  clock_gettime(CLOCK_REALTIME, &startpairtime);
+
+  MYDBG(printf("# CUDA: Cuda_Pair: kernel start eflag: %i vflag: %i config: %i %i %i %i\n", eflag, vflag, grid.x, grid.y, threads.x, sharedperproc * sizeof(ENERGY_FLOAT)*threads.x);)
 }
 
 //Function which is called after the kernel invocation, collects energy and virial
-void Cuda_Pair_PostKernel_AllStyles(cuda_shared_data* sdata, dim3& grid, int& sharedperproc,int eflag, int vflag)
+void Cuda_Pair_PostKernel_AllStyles(cuda_shared_data* sdata, dim3 &grid, int &sharedperproc, int eflag, int vflag)
 {
-  if((not sdata->pair.collect_forces_later) && (eflag||vflag))//not sdata->comm.comm_phase==2))
-  {
-    cudaThreadSynchronize();
-    clock_gettime(CLOCK_REALTIME,&endpairtime);
-    sdata->cuda_timings.pair_kernel+=
-      endpairtime.tv_sec-startpairtime.tv_sec+1.0*(endpairtime.tv_nsec-startpairtime.tv_nsec)/1000000000;
-    CUT_CHECK_ERROR("Cuda_Pair: Kernel execution failed");
+  if((not sdata->pair.collect_forces_later) && (eflag || vflag)) { //not sdata->comm.comm_phase==2))
+    cudaThreadSynchronize();
+    clock_gettime(CLOCK_REALTIME, &endpairtime);
+    sdata->cuda_timings.pair_kernel +=
+      endpairtime.tv_sec - startpairtime.tv_sec + 1.0 * (endpairtime.tv_nsec - startpairtime.tv_nsec) / 1000000000;
+    CUT_CHECK_ERROR("Cuda_Pair: Kernel execution failed");
 
-    if(eflag||vflag)
-    {
-      int n=grid.x*grid.y;
-      if(sdata->pair.use_block_per_atom)
-        grid.x=sharedperproc-3;
-      else
-        grid.x=sharedperproc;
-      grid.y=1;
-      dim3 threads(128,1,1);
-      MYDBG( printf("# CUDA: Cuda_Pair: virial compute kernel start eflag: %i vflag: %i config: %i %i %i %i\n",eflag,vflag,grid.x,grid.y, threads.x,sharedperproc*sizeof(ENERGY_FLOAT)*threads.x); )
-      MY_AP(PairVirialCompute_reduce)<<<grid,threads,threads.x*sizeof(ENERGY_FLOAT)>>>(n);
-      cudaThreadSynchronize();
-      CUT_CHECK_ERROR("Cuda_Pair: virial compute Kernel execution failed");
-    }
+    if(eflag || vflag) {
+      int n = grid.x * grid.y;
 
-    MYDBG( printf("# CUDA: Cuda_Pair: kernel done\n"); )
-  }
+      if(sdata->pair.use_block_per_atom)
+        grid.x = sharedperproc - 3;
+      else
+        grid.x = sharedperproc;
+
+      grid.y = 1;
+      dim3 threads(128, 1, 1);
+      MYDBG(printf("# CUDA: Cuda_Pair: virial compute kernel start eflag: %i vflag: %i config: %i %i %i %i\n", eflag, vflag, grid.x, grid.y, threads.x, sharedperproc * sizeof(ENERGY_FLOAT)*threads.x);)
+      MY_AP(PairVirialCompute_reduce) <<< grid, threads, threads.x* sizeof(ENERGY_FLOAT)>>>(n);
+      cudaThreadSynchronize();
+      CUT_CHECK_ERROR("Cuda_Pair: virial compute Kernel execution failed");
+    }
+
+    MYDBG(printf("# CUDA: Cuda_Pair: kernel done\n");)
+  }
 }
-
+
 #include "pair_born_coul_long_cuda.cu"
 #include "pair_buck_coul_cut_cuda.cu"
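For orientation: the PairVirialCompute_reduce launch in Cuda_Pair_PostKernel_AllStyles above exists because every pair-kernel block first leaves one partial energy/virial sum per quantity in the shared buffer; one reduction block per quantity then folds the grid.x*grid.y partials. A simplified, hypothetical version of such a reduction kernel (the shipped one lives elsewhere in the package and differs in detail):

// illustrative reduction kernel, not the one shipped with the package;
// assumes blockDim.x is a power of two
__global__ void reduce_partials(double* buffer, double* out, int n)
{
  extern __shared__ double s[];
  double sum = 0.0;

  // blockIdx.x selects the quantity (energy, then the 6 virial terms)
  for(int i = threadIdx.x; i < n; i += blockDim.x)
    sum += buffer[blockIdx.x * n + i];

  s[threadIdx.x] = sum;
  __syncthreads();

  for(int stride = blockDim.x / 2; stride > 0; stride >>= 1) {
    if(threadIdx.x < stride) s[threadIdx.x] += s[threadIdx.x + stride];

    __syncthreads();
  }

  if(threadIdx.x == 0) out[blockIdx.x] = s[0];   // one value per quantity
}
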
@@ -877,149 +858,158 @@ void Cuda_Pair_PostKernel_AllStyles(cuda_shared_data* sdata, dim3& grid, int& sh
 
 void Cuda_Pair_UpdateNmax(cuda_shared_data* sdata)
 {
-  CUT_CHECK_ERROR("Cuda_Pair: before updateNmax failed");
-  cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) );
-  cudaMemcpyToSymbol(MY_CONST(nall) , & sdata->atom.nall , sizeof(int) );
-  cudaMemcpyToSymbol(MY_CONST(nmax) , & sdata->atom.nmax , sizeof(int) );
-  cudaMemcpyToSymbol(MY_CONST(type) , & sdata->atom.type .dev_data, sizeof(int*) );
-  cudaMemcpyToSymbol(MY_CONST(x) , & sdata->atom.x .dev_data, sizeof(X_FLOAT*) );
-  cudaMemcpyToSymbol(MY_CONST(x_type) , & sdata->atom.x_type .dev_data, sizeof(X_FLOAT4*) );
-  cudaMemcpyToSymbol(MY_CONST(xhold) , & sdata->atom.xhold .dev_data, sizeof(X_FLOAT*) );
-  cudaMemcpyToSymbol(MY_CONST(v) , & sdata->atom.v .dev_data, sizeof(V_FLOAT*) );
-  cudaMemcpyToSymbol(MY_CONST(radius) , & sdata->atom.radius .dev_data, sizeof(X_FLOAT*) );
-  cudaMemcpyToSymbol(MY_CONST(v_radius) , & sdata->atom.v_radius .dev_data, sizeof(V_FLOAT4*) );
-  cudaMemcpyToSymbol(MY_CONST(omega) , & sdata->atom.omega .dev_data, sizeof(V_FLOAT*) );
-  cudaMemcpyToSymbol(MY_CONST(rmass) , & sdata->atom.rmass .dev_data, sizeof(V_FLOAT*) );
-  cudaMemcpyToSymbol(MY_CONST(omega_rmass),& sdata->atom.omega_rmass.dev_data, sizeof(V_FLOAT4*) );
-  cudaMemcpyToSymbol(MY_CONST(map_array), & sdata->atom.map_array .dev_data, sizeof(int*) );
-  CUT_CHECK_ERROR("Cuda_Pair: updateNmax failed");
+  CUT_CHECK_ERROR("Cuda_Pair: before updateNmax failed");
+  cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
+  cudaMemcpyToSymbol(MY_AP(nall) , & sdata->atom.nall , sizeof(int));
+  cudaMemcpyToSymbol(MY_AP(nmax) , & sdata->atom.nmax , sizeof(int));
+  cudaMemcpyToSymbol(MY_AP(type) , & sdata->atom.type .dev_data, sizeof(int*));
+  cudaMemcpyToSymbol(MY_AP(x) , & sdata->atom.x .dev_data, sizeof(X_FLOAT*));
+  cudaMemcpyToSymbol(MY_AP(x_type) , & sdata->atom.x_type .dev_data, sizeof(X_FLOAT4*));
+  cudaMemcpyToSymbol(MY_AP(xhold) , & sdata->atom.xhold .dev_data, sizeof(X_FLOAT*));
+  cudaMemcpyToSymbol(MY_AP(v) , & sdata->atom.v .dev_data, sizeof(V_FLOAT*));
+  cudaMemcpyToSymbol(MY_AP(radius) , & sdata->atom.radius .dev_data, sizeof(X_FLOAT*));
+  cudaMemcpyToSymbol(MY_AP(v_radius) , & sdata->atom.v_radius .dev_data, sizeof(V_FLOAT4*));
+  cudaMemcpyToSymbol(MY_AP(omega) , & sdata->atom.omega .dev_data, sizeof(V_FLOAT*));
+  cudaMemcpyToSymbol(MY_AP(rmass) , & sdata->atom.rmass .dev_data, sizeof(V_FLOAT*));
+  cudaMemcpyToSymbol(MY_AP(omega_rmass), & sdata->atom.omega_rmass.dev_data, sizeof(V_FLOAT4*));
+  cudaMemcpyToSymbol(MY_AP(map_array), & sdata->atom.map_array .dev_data, sizeof(int*));
+  CUT_CHECK_ERROR("Cuda_Pair: updateNmax failed");
 }
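These cudaMemcpyToSymbol calls refresh pointers, not data: the kernels reach the atom arrays through addresses kept in constant memory, so every host-side reallocation (a grown nmax) must be followed by re-publishing the new addresses. A minimal sketch of the idiom, with hypothetical names:

// sketch (hypothetical d_x/grow_x): kernels dereference a device pointer
// stored in constant memory, so the address must be pushed again after
// every reallocation
__device__ __constant__ double* d_x;

__global__ void touch_x(int n)
{
  int i = blockIdx.x * blockDim.x + threadIdx.x;

  if(i < n) d_x[i] += 1.0;                         // reads the published address
}

void grow_x(double** dev_x, int nmax_new)
{
  cudaFree(*dev_x);
  cudaMalloc((void**)dev_x, 3 * nmax_new * sizeof(double));
  cudaMemcpyToSymbol(d_x, dev_x, sizeof(double*)); // re-publish the new address
}
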
 
 void Cuda_Pair_GenerateXType(cuda_shared_data* sdata)
 {
-  MYDBG(printf(" # CUDA: GenerateXType ... start %i %i %i %p %p %p %p\n",sdata->atom.nlocal,sdata->atom.nall,sdata->atom.nmax,sdata->atom.x.dev_data,sdata->atom.x_type.dev_data,sdata->atom.xhold.dev_data,sdata->atom.type.dev_data); )
-  if(sdata->atom.update_nmax)
-    Cuda_Pair_UpdateNmax(sdata);
-  if(sdata->atom.update_nlocal)
-  {
-    cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) );
-    cudaMemcpyToSymbol(MY_CONST(nall) , & sdata->atom.nall , sizeof(int) );
-  }
-  MYDBG(printf(" # CUDA: GenerateXType ... getgrid\n"); fflush(stdout); )
+  MYDBG(printf(" # CUDA: GenerateXType ... start %i %i %i %p %p %p %p\n", sdata->atom.nlocal, sdata->atom.nall, sdata->atom.nmax, sdata->atom.x.dev_data, sdata->atom.x_type.dev_data, sdata->atom.xhold.dev_data, sdata->atom.type.dev_data);)
 
-  int3 layout=getgrid(sdata->atom.nall);
-  dim3 threads(layout.z, 1, 1);
-  dim3 grid(layout.x, layout.y, 1);
-
-  MYDBG(printf(" # CUDA: GenerateXType ... kernel start test\n"); fflush(stdout);)
-  Pair_GenerateXType_Kernel<<<grid, threads,0>>>();
-  cudaThreadSynchronize();
-  CUT_CHECK_ERROR("Cuda_Pair GenerateXType: Kernel failed");
-  MYDBG(printf(" # CUDA: GenerateXType ... end\n"); fflush(stdout); )
+  if(sdata->atom.update_nmax)
+    Cuda_Pair_UpdateNmax(sdata);
+
+  if(sdata->atom.update_nlocal) {
+    cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
+    cudaMemcpyToSymbol(MY_AP(nall) , & sdata->atom.nall , sizeof(int));
+  }
+
+  MYDBG(printf(" # CUDA: GenerateXType ... getgrid\n"); fflush(stdout);)
+
+  int3 layout = getgrid(sdata->atom.nall);
+  dim3 threads(layout.z, 1, 1);
+  dim3 grid(layout.x, layout.y, 1);
+
+  MYDBG(printf(" # CUDA: GenerateXType ... kernel start test\n"); fflush(stdout);)
+  Pair_GenerateXType_Kernel <<< grid, threads, 0>>>();
+  cudaThreadSynchronize();
+  CUT_CHECK_ERROR("Cuda_Pair GenerateXType: Kernel failed");
+  MYDBG(printf(" # CUDA: GenerateXType ... end\n"); fflush(stdout);)
 }
 
 void Cuda_Pair_RevertXType(cuda_shared_data* sdata)
 {
-  MYDBG(printf(" # CUDA: RevertXType ... start\n"); )
-  if(sdata->atom.update_nmax)
-    Cuda_Pair_UpdateNmax(sdata);
-  cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) );
-  cudaMemcpyToSymbol(MY_CONST(nall) , & sdata->atom.nall , sizeof(int) );
+  MYDBG(printf(" # CUDA: RevertXType ... start\n");)
 
-  int3 layout=getgrid(sdata->atom.nall);
-  dim3 threads(layout.z, 1, 1);
-  dim3 grid(layout.x, layout.y, 1);
-
-  Pair_RevertXType_Kernel<<<grid, threads,0>>>();
-  cudaThreadSynchronize();
-  CUT_CHECK_ERROR("Cuda_Pair GenerateXType: Kernel failed");
-  MYDBG(printf(" # CUDA: RevertXType ... end\n"); )
+  if(sdata->atom.update_nmax)
+    Cuda_Pair_UpdateNmax(sdata);
+
+  cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
+  cudaMemcpyToSymbol(MY_AP(nall) , & sdata->atom.nall , sizeof(int));
+
+  int3 layout = getgrid(sdata->atom.nall);
+  dim3 threads(layout.z, 1, 1);
+  dim3 grid(layout.x, layout.y, 1);
+
+  Pair_RevertXType_Kernel <<< grid, threads, 0>>>();
+  cudaThreadSynchronize();
+  CUT_CHECK_ERROR("Cuda_Pair GenerateXType: Kernel failed");
+  MYDBG(printf(" # CUDA: RevertXType ... end\n");)
 }
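The recurring getgrid/dim3 pattern above maps an atom count to a launch configuration: layout.z is the block size, and layout.x/layout.y span a possibly 2D grid with x*y*z >= nall. Inside a kernel the flat index is then recovered as in this hypothetical sketch:

// sketch of the index recovery matching the getgrid() launches above
__global__ void per_atom_kernel(int nall)
{
  // 2D grid, 1D blocks: fold (blockIdx.y, blockIdx.x) back into one index
  int i = (blockIdx.y * gridDim.x + blockIdx.x) * blockDim.x + threadIdx.x;

  if(i >= nall) return;                            // the grid may overshoot nall

  /* per-atom work goes here */
}
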
 
 void Cuda_Pair_GenerateVRadius(cuda_shared_data* sdata)
 {
-  MYDBG(printf(" # CUDA: GenerateVRadius ... start %i %i %i %p %p %p %p\n",sdata->atom.nlocal,sdata->atom.nall,sdata->atom.nmax,sdata->atom.x.dev_data,sdata->atom.x_type.dev_data,sdata->atom.xhold.dev_data,sdata->atom.type.dev_data); )
-  if(sdata->atom.update_nmax)
-    Cuda_Pair_UpdateNmax(sdata);
-  cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) );
-  cudaMemcpyToSymbol(MY_CONST(nall) , & sdata->atom.nall , sizeof(int) );
-  MYDBG(printf(" # CUDA: GenerateVRadius ... getgrid\n"); fflush(stdout); )
+  MYDBG(printf(" # CUDA: GenerateVRadius ... start %i %i %i %p %p %p %p\n", sdata->atom.nlocal, sdata->atom.nall, sdata->atom.nmax, sdata->atom.x.dev_data, sdata->atom.x_type.dev_data, sdata->atom.xhold.dev_data, sdata->atom.type.dev_data);)
 
-  int3 layout=getgrid(sdata->atom.nall);
-  dim3 threads(layout.z, 1, 1);
-  dim3 grid(layout.x, layout.y, 1);
-
-  MYDBG(printf(" # CUDA: GenerateVRadius ... kernel start test\n"); fflush(stdout);)
-  Pair_GenerateVRadius_Kernel<<<grid, threads,0>>>();
-  cudaThreadSynchronize();
-  CUT_CHECK_ERROR("Cuda_Pair GenerateVRadius: Kernel failed");
-  MYDBG(printf(" # CUDA: GenerateVRadius ... end\n"); fflush(stdout); )
+  if(sdata->atom.update_nmax)
+    Cuda_Pair_UpdateNmax(sdata);
+
+  cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
+  cudaMemcpyToSymbol(MY_AP(nall) , & sdata->atom.nall , sizeof(int));
+  MYDBG(printf(" # CUDA: GenerateVRadius ... getgrid\n"); fflush(stdout);)
+
+  int3 layout = getgrid(sdata->atom.nall);
+  dim3 threads(layout.z, 1, 1);
+  dim3 grid(layout.x, layout.y, 1);
+
+  MYDBG(printf(" # CUDA: GenerateVRadius ... kernel start test\n"); fflush(stdout);)
+  Pair_GenerateVRadius_Kernel <<< grid, threads, 0>>>();
+  cudaThreadSynchronize();
+  CUT_CHECK_ERROR("Cuda_Pair GenerateVRadius: Kernel failed");
+  MYDBG(printf(" # CUDA: GenerateVRadius ... end\n"); fflush(stdout);)
 }
 
 void Cuda_Pair_GenerateOmegaRmass(cuda_shared_data* sdata)
 {
-  MYDBG(printf(" # CUDA: GenerateOmegaRmass ... start %i %i %i %p %p %p %p\n",sdata->atom.nlocal,sdata->atom.nall,sdata->atom.nmax,sdata->atom.x.dev_data,sdata->atom.x_type.dev_data,sdata->atom.xhold.dev_data,sdata->atom.type.dev_data); )
-  if(sdata->atom.update_nmax)
-    Cuda_Pair_UpdateNmax(sdata);
-  cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) );
-  cudaMemcpyToSymbol(MY_CONST(nall) , & sdata->atom.nall , sizeof(int) );
-  MYDBG(printf(" # CUDA: GenerateOmegaRmass ... getgrid\n"); fflush(stdout); )
+  MYDBG(printf(" # CUDA: GenerateOmegaRmass ... start %i %i %i %p %p %p %p\n", sdata->atom.nlocal, sdata->atom.nall, sdata->atom.nmax, sdata->atom.x.dev_data, sdata->atom.x_type.dev_data, sdata->atom.xhold.dev_data, sdata->atom.type.dev_data);)
 
-  int3 layout=getgrid(sdata->atom.nall);
-  dim3 threads(layout.z, 1, 1);
-  dim3 grid(layout.x, layout.y, 1);
-
-  MYDBG(printf(" # CUDA: GenerateOmegaRmass ... kernel start test\n"); fflush(stdout);)
-  Pair_GenerateOmegaRmass_Kernel<<<grid, threads,0>>>();
-  cudaThreadSynchronize();
-  CUT_CHECK_ERROR("Cuda_Pair GenerateOmegaRmass: Kernel failed");
-  MYDBG(printf(" # CUDA: GenerateOmegaRmass ... end\n"); fflush(stdout); )
+  if(sdata->atom.update_nmax)
+    Cuda_Pair_UpdateNmax(sdata);
+
+  cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
+  cudaMemcpyToSymbol(MY_AP(nall) , & sdata->atom.nall , sizeof(int));
+  MYDBG(printf(" # CUDA: GenerateOmegaRmass ... getgrid\n"); fflush(stdout);)
+
+  int3 layout = getgrid(sdata->atom.nall);
+  dim3 threads(layout.z, 1, 1);
+  dim3 grid(layout.x, layout.y, 1);
+
+  MYDBG(printf(" # CUDA: GenerateOmegaRmass ... kernel start test\n"); fflush(stdout);)
+  Pair_GenerateOmegaRmass_Kernel <<< grid, threads, 0>>>();
+  cudaThreadSynchronize();
+  CUT_CHECK_ERROR("Cuda_Pair GenerateOmegaRmass: Kernel failed");
+  MYDBG(printf(" # CUDA: GenerateOmegaRmass ... end\n"); fflush(stdout);)
 }
 
 void Cuda_Pair_BuildXHold(cuda_shared_data* sdata)
 {
   if(sdata->atom.update_nmax)
-    Cuda_Pair_UpdateNmax(sdata);
-  cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) );
-  cudaMemcpyToSymbol(MY_CONST(nall) , & sdata->atom.nall , sizeof(int) );
+    Cuda_Pair_UpdateNmax(sdata);
 
-  int3 layout=getgrid(sdata->atom.nall);
-  dim3 threads(layout.z, 1, 1);
-  dim3 grid(layout.x, layout.y, 1);
-
-  Pair_BuildXHold_Kernel<<<grid, threads,0>>>();
-  cudaThreadSynchronize();
-  CUT_CHECK_ERROR("Cuda_Pair GenerateXType: Kernel failed");
+  cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
+  cudaMemcpyToSymbol(MY_AP(nall) , & sdata->atom.nall , sizeof(int));
+
+  int3 layout = getgrid(sdata->atom.nall);
+  dim3 threads(layout.z, 1, 1);
+  dim3 grid(layout.x, layout.y, 1);
+
+  Pair_BuildXHold_Kernel <<< grid, threads, 0>>>();
+  cudaThreadSynchronize();
+  CUT_CHECK_ERROR("Cuda_Pair GenerateXType: Kernel failed");
 }
 
-void Cuda_Pair_CollectForces(cuda_shared_data* sdata,int eflag, int vflag)
+void Cuda_Pair_CollectForces(cuda_shared_data* sdata, int eflag, int vflag)
 {
-  cudaThreadSynchronize();
-  clock_gettime(CLOCK_REALTIME,&endpairtime);
-  sdata->cuda_timings.pair_kernel+=
-    endpairtime.tv_sec-startpairtime.tv_sec+1.0*(endpairtime.tv_nsec-startpairtime.tv_nsec)/1000000000;
+  cudaThreadSynchronize();
+  clock_gettime(CLOCK_REALTIME, &endpairtime);
+  sdata->cuda_timings.pair_kernel +=
+    endpairtime.tv_sec - startpairtime.tv_sec + 1.0 * (endpairtime.tv_nsec - startpairtime.tv_nsec) / 1000000000;
   CUT_CHECK_ERROR("Cuda_Pair: Kernel execution failed");
-  dim3 threads;
-  dim3 grid;
-
-  if(eflag||vflag)
-  {
-    int n=sdata->pair.lastgridsize;
-    grid.x=sdata->pair.n_energy_virial;
-    grid.y=1;
-    threads.x=128;
-    //printf("A grid.x: %i\n",grid.x);
-    MY_AP(PairVirialCompute_reduce)<<<grid,threads,threads.x*sizeof(ENERGY_FLOAT)>>>(n);
-    cudaThreadSynchronize();
-    CUT_CHECK_ERROR("Cuda_Pair_CollectForces: virial compute Kernel execution failed");
-  }
-  int3 layout=getgrid(sdata->atom.nlocal);
-  threads.x = layout.z;
-  grid.x = layout.x;
-  grid.y = layout.y;
-  Pair_CollectForces_Kernel<<<grid,threads,0>>>(sdata->pair.n_energy_virial,sdata->pair.lastgridsize);
-  cudaThreadSynchronize();
-  CUT_CHECK_ERROR("Cuda_Pair_CollectForces: Force Summation Kernel execution failed");
-
+  dim3 threads;
+  dim3 grid;
+
+  if(eflag || vflag) {
+    int n = sdata->pair.lastgridsize;
+    grid.x = sdata->pair.n_energy_virial;
+    grid.y = 1;
+    threads.x = 128;
+    //printf("A grid.x: %i\n",grid.x);
+    MY_AP(PairVirialCompute_reduce) <<< grid, threads, threads.x* sizeof(ENERGY_FLOAT)>>>(n);
+    cudaThreadSynchronize();
+    CUT_CHECK_ERROR("Cuda_Pair_CollectForces: virial compute Kernel execution failed");
+  }
+
+  int3 layout = getgrid(sdata->atom.nlocal);
+  threads.x = layout.z;
+  grid.x = layout.x;
+  grid.y = layout.y;
+  Pair_CollectForces_Kernel <<< grid, threads, 0>>>(sdata->pair.n_energy_virial, sdata->pair.lastgridsize);
+  cudaThreadSynchronize();
+  CUT_CHECK_ERROR("Cuda_Pair_CollectForces: Force Summation Kernel execution failed");
+
 }
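The timing bookkeeping above relies on the fact that kernel launches are asynchronous: the end timestamp is only meaningful after a synchronize. The idiom, reduced to its core (the launch line is a hypothetical placeholder):

// sketch of the timing idiom used throughout these files
#include <time.h>
#include <cuda_runtime.h>

double time_gpu_section()
{
  timespec t0, t1;
  clock_gettime(CLOCK_REALTIME, &t0);
  // some_kernel<<<grid, threads>>>(...);          // asynchronous launch (hypothetical)
  cudaThreadSynchronize();                         // wait for the GPU to finish
  clock_gettime(CLOCK_REALTIME, &t1);
  return t1.tv_sec - t0.tv_sec + 1.0 * (t1.tv_nsec - t0.tv_nsec) / 1000000000;
}
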
 
diff --git a/lib/cuda/pair_eam_cuda.cu b/lib/cuda/pair_eam_cuda.cu
index d97143a0c7..cb20343770 100644
--- a/lib/cuda/pair_eam_cuda.cu
+++ b/lib/cuda/pair_eam_cuda.cu
@@ -1,22 +1,22 @@
 /* ----------------------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
    Original Version: http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
+   Steve Plimpton, sjplimp@sandia.gov
 
-   See the README file in the top-level LAMMPS directory.
+   See the README file in the top-level LAMMPS directory.
 
-   -----------------------------------------------------------------------
+   -----------------------------------------------------------------------
 
    USER-CUDA Package and associated modifications:
-   https://sourceforge.net/projects/lammpscuda/
+   https://sourceforge.net/projects/lammpscuda/
 
    Christian Trott, christian.trott@tu-ilmenau.de
    Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
-   Theoretical Physics II, University of Technology Ilmenau, Germany
+   Theoretical Physics II, University of Technology Ilmenau, Germany
 
-   See the README file in the USER-CUDA directory.
+   See the README file in the USER-CUDA directory.
 
    This software is distributed under the GNU General Public License.
 
------------------------------------------------------------------------- */
@@ -54,17 +54,17 @@ __device__ __constant__ F_FLOAT* MY_AP(fp);
 
 #define _rhor_spline_tex MY_AP(rhor_spline_tex)
 #if F_PRECISION == 1
-texture<float,1> _rhor_spline_tex;
+texture<float, 1> _rhor_spline_tex;
 #else
-texture<int2,1> _rhor_spline_tex;
+texture<int2, 1> _rhor_spline_tex;
 #endif
 
 #define _z2r_spline_tex MY_AP(z2r_spline_tex)
 #if F_PRECISION == 1
-texture<float,1> _z2r_spline_tex;
+texture<float, 1> _z2r_spline_tex;
 #else
-texture<int2,1> _z2r_spline_tex;
+texture<int2, 1> _z2r_spline_tex;
 #endif
 
 
@@ -85,243 +85,258 @@ inline void BindEAMTextures(cuda_shared_data* sdata)
   _rhor_spline_tex.normalized = false; // access with normalized texture coordinates
   _rhor_spline_tex.filterMode = cudaFilterModePoint; // Point mode, so no
   _rhor_spline_tex.addressMode[0] = cudaAddressModeWrap; // wrap texture coordinates
-
-  const textureReference* rhor_spline_texture_ptr;
-  cudaGetTextureReference(&rhor_spline_texture_ptr, MY_CONST(rhor_spline_tex));
-  #if F_PRECISION == 1
+  const textureReference* rhor_spline_texture_ptr = &MY_AP(rhor_spline_tex);
+
+#if F_PRECISION == 1
   cudaChannelFormatDesc channelDescRhor = cudaCreateChannelDesc<float>();
-  cudaBindTexture(0,rhor_spline_texture_ptr, rhor_spline_pointer, &channelDescRhor, rhor_spline_size);
-  #else
+  cudaBindTexture(0, rhor_spline_texture_ptr, rhor_spline_pointer, &channelDescRhor, rhor_spline_size);
+#else
   cudaChannelFormatDesc channelDescRhor = cudaCreateChannelDesc<int2>();
-  cudaBindTexture(0,rhor_spline_texture_ptr, rhor_spline_pointer, &channelDescRhor, rhor_spline_size);
-  #endif
-
+  cudaBindTexture(0, rhor_spline_texture_ptr, rhor_spline_pointer, &channelDescRhor, rhor_spline_size);
+#endif
+
   _z2r_spline_tex.normalized = false; // access with normalized texture coordinates
   _z2r_spline_tex.filterMode = cudaFilterModePoint; // Point mode, so no
   _z2r_spline_tex.addressMode[0] = cudaAddressModeWrap; // wrap texture coordinates
-  const textureReference* z2r_spline_texture_ptr;
-  cudaGetTextureReference(&z2r_spline_texture_ptr, MY_CONST(z2r_spline_tex));
+  const textureReference* z2r_spline_texture_ptr = &MY_AP(z2r_spline_tex);
 
-  #if F_PRECISION == 1
+#if F_PRECISION == 1
   cudaChannelFormatDesc channelDescZ2r = cudaCreateChannelDesc<float>();
-  cudaBindTexture(0,z2r_spline_texture_ptr, z2r_spline_pointer, &channelDescZ2r, z2r_spline_size);
-  #else
+  cudaBindTexture(0, z2r_spline_texture_ptr, z2r_spline_pointer, &channelDescZ2r, z2r_spline_size);
+#else
   cudaChannelFormatDesc channelDescZ2r = cudaCreateChannelDesc<int2>();
-  cudaBindTexture(0,z2r_spline_texture_ptr, z2r_spline_pointer, &channelDescZ2r, z2r_spline_size);
-  #endif
-
+  cudaBindTexture(0, z2r_spline_texture_ptr, z2r_spline_pointer, &channelDescZ2r, z2r_spline_size);
+#endif
+
 }
 
 void Cuda_PairEAMCuda_UpdateBuffer(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist)
 {
-  CUT_CHECK_ERROR("Cuda_PairEAMCuda: before updateBuffer failed");
-  int3 layout=getgrid(sneighlist->inum,7*sizeof(F_FLOAT));
-  dim3 threads(layout.z, 1, 1);
-  dim3 grid(layout.x, layout.y, 1);
-  int size=(unsigned)(layout.y*layout.x)*7*sizeof(F_FLOAT);
-  if(sdata->buffersize<size)
-  {
-    MYDBG(printf("Cuda_PairEAMCuda Resizing Buffer at %p with %i kB to\n",sdata->buffer,sdata->buffersize);)
-    if(sdata->buffer!=NULL) cudaFree(sdata->buffer);
-    cudaMalloc((void**)&sdata->buffer,size);
-    sdata->buffersize=size;
-    sdata->buffer_new++;
-    MYDBG(printf("New buffer at %p with %i kB\n",sdata->buffer,sdata->buffersize);)
-  }
-  cudaMemcpyToSymbol(MY_CONST(buffer), & sdata->buffer, sizeof(int*) );
-  CUT_CHECK_ERROR("Cuda_PairEAMCuda: updateBuffer failed");
+  CUT_CHECK_ERROR("Cuda_PairEAMCuda: before updateBuffer failed");
+  int3 layout = getgrid(sneighlist->inum, 7 * sizeof(F_FLOAT));
+  dim3 threads(layout.z, 1, 1);
+  dim3 grid(layout.x, layout.y, 1);
+  int size = (unsigned)(layout.y * layout.x) * 7 * sizeof(F_FLOAT);
+
+  if(sdata->buffersize < size) {
+    MYDBG(printf("Cuda_PairEAMCuda Resizing Buffer at %p with %i kB to\n", sdata->buffer, sdata->buffersize);)
+
+    if(sdata->buffer != NULL) cudaFree(sdata->buffer);
+
+    cudaMalloc((void**)&sdata->buffer, size);
+    sdata->buffersize = size;
+    sdata->buffer_new++;
+    MYDBG(printf("New buffer at %p with %i kB\n", sdata->buffer, sdata->buffersize);)
+  }
+
+  cudaMemcpyToSymbol(MY_AP(buffer), & sdata->buffer, sizeof(int*));
+  CUT_CHECK_ERROR("Cuda_PairEAMCuda: updateBuffer failed");
 }
 
 void Cuda_PairEAMCuda_UpdateNeighbor(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist)
 {
-cudaMemcpyToSymbol(MY_CONST(neighbor_maxlocal) , & sneighlist->firstneigh.dim[0] , sizeof(unsigned) );
-cudaMemcpyToSymbol(MY_CONST(firstneigh), & sneighlist->firstneigh.dev_data, sizeof(int*) );
-cudaMemcpyToSymbol(MY_CONST(ilist) , & sneighlist->ilist .dev_data, sizeof(int*) );
-cudaMemcpyToSymbol(MY_CONST(inum) , & sneighlist->inum , sizeof(int) );
-cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) );
-cudaMemcpyToSymbol(MY_CONST(nmax) , & sdata->atom.nmax , sizeof(int) );
-cudaMemcpyToSymbol(MY_CONST(numneigh) , & sneighlist->numneigh .dev_data, sizeof(int*) );
-cudaMemcpyToSymbol(MY_CONST(neighbors) , & sneighlist->neighbors .dev_data, sizeof(int*) );
-cudaMemcpyToSymbol(MY_CONST(maxneighbors) , & sneighlist->maxneighbors , sizeof(int) );
+  cudaMemcpyToSymbol(MY_AP(neighbor_maxlocal) , & sneighlist->firstneigh.dim[0] , sizeof(unsigned));
+  cudaMemcpyToSymbol(MY_AP(firstneigh), & sneighlist->firstneigh.dev_data, sizeof(int*));
+  cudaMemcpyToSymbol(MY_AP(ilist) , & sneighlist->ilist .dev_data, sizeof(int*));
+  cudaMemcpyToSymbol(MY_AP(inum) , & sneighlist->inum , sizeof(int));
+  cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
+  cudaMemcpyToSymbol(MY_AP(nmax) , & sdata->atom.nmax , sizeof(int));
+  cudaMemcpyToSymbol(MY_AP(numneigh) , & sneighlist->numneigh .dev_data, sizeof(int*));
+  cudaMemcpyToSymbol(MY_AP(neighbors) , & sneighlist->neighbors .dev_data, sizeof(int*));
+  cudaMemcpyToSymbol(MY_AP(maxneighbors) , & sneighlist->maxneighbors , sizeof(int));
 }
 
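Cuda_PairEAMCuda_UpdateBuffer above uses a grow-only allocation strategy: the shared buffer is reallocated only when a larger size is requested, and the buffer_new counter signals consumers that the address changed. The core of the pattern as a hypothetical helper:

// sketch (assumed names): grow-only device buffer with address republishing
__device__ __constant__ void* d_buffer;

void ensure_buffer(void** buffer, int* buffersize, int* buffer_new, int needed)
{
  if(*buffersize < needed) {
    if(*buffer != NULL) cudaFree(*buffer);

    cudaMalloc(buffer, needed);                    // grow, never shrink
    *buffersize = needed;
    (*buffer_new)++;                               // tells dependent code to re-bind

    cudaMemcpyToSymbol(d_buffer, buffer, sizeof(void*));
  }
}
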
 void Cuda_PairEAMCuda_UpdateNmax(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist)
 {
-  CUT_CHECK_ERROR("Cuda_PairEAMCuda: before updateNmax failed");
-  cudaMemcpyToSymbol(MY_CONST(x) , & sdata->atom.x .dev_data, sizeof(X_FLOAT*) );
-  cudaMemcpyToSymbol(MY_CONST(x_type) , & sdata->atom.x_type .dev_data, sizeof(X_FLOAT4*) );
-  cudaMemcpyToSymbol(MY_CONST(f) , & sdata->atom.f .dev_data, sizeof(F_FLOAT*) );
-  cudaMemcpyToSymbol(MY_CONST(type) , & sdata->atom.type .dev_data, sizeof(int*) );
-  cudaMemcpyToSymbol(MY_CONST(tag) , & sdata->atom.tag .dev_data, sizeof(int*) );
-  cudaMemcpyToSymbol(MY_CONST(eatom) , & sdata->atom.eatom .dev_data, sizeof(ENERGY_FLOAT*) );
-  cudaMemcpyToSymbol(MY_CONST(vatom) , & sdata->atom.vatom .dev_data, sizeof(ENERGY_FLOAT*) );
-  CUT_CHECK_ERROR("Cuda_PairEAMCuda: updateNmax failed");
+  CUT_CHECK_ERROR("Cuda_PairEAMCuda: before updateNmax failed");
+  cudaMemcpyToSymbol(MY_AP(x) , & sdata->atom.x .dev_data, sizeof(X_FLOAT*));
+  cudaMemcpyToSymbol(MY_AP(x_type) , & sdata->atom.x_type .dev_data, sizeof(X_FLOAT4*));
+  cudaMemcpyToSymbol(MY_AP(f) , & sdata->atom.f .dev_data, sizeof(F_FLOAT*));
+  cudaMemcpyToSymbol(MY_AP(type) , & sdata->atom.type .dev_data, sizeof(int*));
+  cudaMemcpyToSymbol(MY_AP(tag) , & sdata->atom.tag .dev_data, sizeof(int*));
+  cudaMemcpyToSymbol(MY_AP(eatom) , & sdata->atom.eatom .dev_data, sizeof(ENERGY_FLOAT*));
+  cudaMemcpyToSymbol(MY_AP(vatom) , & sdata->atom.vatom .dev_data, sizeof(ENERGY_FLOAT*));
+  CUT_CHECK_ERROR("Cuda_PairEAMCuda: updateNmax failed");
 }
 
-void Cuda_PairEAMCuda_Init(cuda_shared_data* sdata,double rdr,double rdrho,int nfrho, int nrhor,int nr, int nrho,int nz2r,
-void* frho_spline,void* rhor_spline,void* z2r_spline,void* rho,void* fp,
-int* type2frho,int** type2z2r,int** type2rhor)
+void Cuda_PairEAMCuda_Init(cuda_shared_data* sdata, double rdr, double rdrho, int nfrho, int nrhor, int nr, int nrho, int nz2r,
+                           void* frho_spline, void* rhor_spline, void* z2r_spline, void* rho, void* fp,
+                           int* type2frho, int** type2z2r, int** type2rhor)
 {
-  // !! LAMMPS indexes atom types starting with 1 !!
-
-  unsigned cuda_ntypes = sdata->atom.ntypes + 1;
-  if(cuda_ntypes*cuda_ntypes > CUDA_MAX_TYPES2)
-    printf("# CUDA: Cuda_PairEAMCuda_Init: you need %u types. this is more than %u "
-           "(assumed at compile time). re-compile with -DCUDA_MAX_TYPES_PLUS_ONE=99 "
-           "or ajust this in cuda_common.h\n", cuda_ntypes, CUDA_MAX_TYPES2);
-  unsigned nI = sizeof(F_FLOAT) * cuda_ntypes * cuda_ntypes;
+  // !! LAMMPS indexes atom types starting with 1 !!
+
+  unsigned cuda_ntypes = sdata->atom.ntypes + 1;
+
+  if(cuda_ntypes * cuda_ntypes > CUDA_MAX_TYPES2)
+    printf("# CUDA: Cuda_PairEAMCuda_Init: you need %u types. this is more than %u "
+           "(assumed at compile time). re-compile with -DCUDA_MAX_TYPES_PLUS_ONE=99 "
+           "or adjust this in cuda_common.h\n", cuda_ntypes, CUDA_MAX_TYPES2);
+
+  unsigned nI = sizeof(F_FLOAT) * cuda_ntypes * cuda_ntypes;
+
+  X_FLOAT cutsq_global;
+  cutsq_global = (X_FLOAT)(sdata->pair.cut_global);
+  cudaMemcpyToSymbol(MY_AP(cutsq_global) , &cutsq_global , sizeof(X_FLOAT));
+
+
+  F_FLOAT* coeff_buf = new F_FLOAT[cuda_ntypes * cuda_ntypes];
+
+  for(int i = 0; i < cuda_ntypes; i++) coeff_buf[i] = type2frho[i];
+
+  cudaMemcpyToSymbol(MY_AP(coeff1) , coeff_buf , cuda_ntypes * sizeof(F_FLOAT));
+
+  for(int i = 0; i < cuda_ntypes * cuda_ntypes; i++) coeff_buf[i] = (&type2rhor[0][0])[i];
+
+  cudaMemcpyToSymbol(MY_AP(coeff2) , coeff_buf , nI);
+
+  for(int i = 0; i < cuda_ntypes * cuda_ntypes; i++) coeff_buf[i] = (&type2z2r[0][0])[i];
+
+  cudaMemcpyToSymbol(MY_AP(coeff3) , coeff_buf , nI);
+
+  delete [] coeff_buf;
+  X_FLOAT box_size[3] = {
+    sdata->domain.subhi[0] - sdata->domain.sublo[0],
+    sdata->domain.subhi[1] - sdata->domain.sublo[1],
+    sdata->domain.subhi[2] - sdata->domain.sublo[2]
+  };
+  F_FLOAT rdr_F = rdr;
+  F_FLOAT rdrho_F = rdrho;
+  cudaMemcpyToSymbol(MY_AP(box_size) , box_size , sizeof(X_FLOAT) * 3);
+  cudaMemcpyToSymbol(MY_AP(cuda_ntypes), & cuda_ntypes , sizeof(unsigned));
+  cudaMemcpyToSymbol(MY_AP(virial) , &sdata->pair.virial.dev_data , sizeof(ENERGY_FLOAT*));
+  cudaMemcpyToSymbol(MY_AP(eng_vdwl) , &sdata->pair.eng_vdwl.dev_data , sizeof(ENERGY_FLOAT*));
+  cudaMemcpyToSymbol(MY_AP(periodicity), sdata->domain.periodicity, sizeof(int) * 3);
+  cudaMemcpyToSymbol(MY_AP(collect_forces_later), &sdata->pair.collect_forces_later , sizeof(int));
+  cudaMemcpyToSymbol(MY_AP(rdr), &rdr_F, sizeof(F_FLOAT));
+  cudaMemcpyToSymbol(MY_AP(rdrho), &rdrho_F, sizeof(F_FLOAT));
+  cudaMemcpyToSymbol(MY_AP(nr), &nr, sizeof(int));
+  cudaMemcpyToSymbol(MY_AP(nrho), &nrho, sizeof(int));
+  cudaMemcpyToSymbol(MY_AP(nfrho), &nfrho, sizeof(int));
+  cudaMemcpyToSymbol(MY_AP(nrhor), &nrhor, sizeof(int));
+  cudaMemcpyToSymbol(MY_AP(rho), &rho, sizeof(F_FLOAT*));
+  cudaMemcpyToSymbol(MY_AP(fp), &fp, sizeof(F_FLOAT*));
+  cudaMemcpyToSymbol(MY_AP(frho_spline), &frho_spline, sizeof(F_FLOAT*));
+  cudaMemcpyToSymbol(MY_AP(rhor_spline), &rhor_spline, sizeof(F_FLOAT*));
+  cudaMemcpyToSymbol(MY_AP(z2r_spline), &z2r_spline, sizeof(F_FLOAT*));
+  cudaMemcpyToSymbol(MY_AP(nrhor), &nrhor, sizeof(int));
+
+  rhor_spline_size = nrhor * (nr + 1) * EAM_COEFF_LENGTH * sizeof(F_FLOAT);
+  z2r_spline_size = nz2r * (nr + 1) * EAM_COEFF_LENGTH * sizeof(F_FLOAT);
+  rhor_spline_pointer = rhor_spline;
+  z2r_spline_pointer = z2r_spline;
+
+  CUT_CHECK_ERROR("Cuda_PairEAMCuda: init failed");
-  X_FLOAT cutsq_global;
-  cutsq_global = (X_FLOAT) (sdata->pair.cut_global);
-  cudaMemcpyToSymbol(MY_CONST(cutsq_global) ,&cutsq_global , sizeof(X_FLOAT) );
-
-
-  F_FLOAT* coeff_buf=new F_FLOAT[cuda_ntypes*cuda_ntypes];
-  for(int i=0;i<cuda_ntypes;i++) coeff_buf[i]=type2frho[i];
-  cudaMemcpyToSymbol(MY_CONST(coeff1) , coeff_buf , cuda_ntypes*sizeof(F_FLOAT) );
-  for(int i=0;i<cuda_ntypes*cuda_ntypes;i++) coeff_buf[i]=(&type2rhor[0][0])[i];
-  cudaMemcpyToSymbol(MY_CONST(coeff2) , coeff_buf , nI );
-  for(int i=0;i<cuda_ntypes*cuda_ntypes;i++) coeff_buf[i]=(&type2z2r[0][0])[i];
-  cudaMemcpyToSymbol(MY_CONST(coeff3) , coeff_buf , nI );
-  delete [] coeff_buf;
-  X_FLOAT box_size[3] =
-  {
-  sdata->domain.subhi[0] - sdata->domain.sublo[0],
-  sdata->domain.subhi[1] - sdata->domain.sublo[1],
-  sdata->domain.subhi[2] - sdata->domain.sublo[2]
-  };
-  F_FLOAT rdr_F=rdr;
-  F_FLOAT rdrho_F=rdrho;
-  cudaMemcpyToSymbol(MY_CONST(box_size) , box_size , sizeof(X_FLOAT)*3);
-  cudaMemcpyToSymbol(MY_CONST(cuda_ntypes), & cuda_ntypes , sizeof(unsigned) );
-  cudaMemcpyToSymbol(MY_CONST(virial) , &sdata->pair.virial.dev_data , sizeof(ENERGY_FLOAT*) );
-  cudaMemcpyToSymbol(MY_CONST(eng_vdwl) , &sdata->pair.eng_vdwl.dev_data , sizeof(ENERGY_FLOAT*) );
-  cudaMemcpyToSymbol(MY_CONST(periodicity), sdata->domain.periodicity, sizeof(int)*3 );
-  cudaMemcpyToSymbol(MY_CONST(collect_forces_later), &sdata->pair.collect_forces_later , sizeof(int) );
-  cudaMemcpyToSymbol(MY_CONST(rdr), &rdr_F, sizeof(F_FLOAT) );
-  cudaMemcpyToSymbol(MY_CONST(rdrho), &rdrho_F, sizeof(F_FLOAT) );
-  cudaMemcpyToSymbol(MY_CONST(nr), &nr, sizeof(int) );
-  cudaMemcpyToSymbol(MY_CONST(nrho), &nrho, sizeof(int) );
-  cudaMemcpyToSymbol(MY_CONST(nfrho), &nfrho, sizeof(int) );
-  cudaMemcpyToSymbol(MY_CONST(nrhor), &nrhor, sizeof(int) );
-  cudaMemcpyToSymbol(MY_CONST(rho), &rho, sizeof(F_FLOAT*) );
-  cudaMemcpyToSymbol(MY_CONST(fp), &fp, sizeof(F_FLOAT*) );
-  cudaMemcpyToSymbol(MY_CONST(frho_spline), &frho_spline, sizeof(F_FLOAT*) );
-  cudaMemcpyToSymbol(MY_CONST(rhor_spline), &rhor_spline, sizeof(F_FLOAT*) );
-  cudaMemcpyToSymbol(MY_CONST(z2r_spline), &z2r_spline, sizeof(F_FLOAT*) );
-  cudaMemcpyToSymbol(MY_CONST(nrhor), &nrhor, sizeof(int) );
-
-  rhor_spline_size = nrhor*(nr+1)*EAM_COEFF_LENGTH*sizeof(F_FLOAT);
-  z2r_spline_size = nz2r*(nr+1)*EAM_COEFF_LENGTH*sizeof(F_FLOAT);
-  rhor_spline_pointer = rhor_spline;
-  z2r_spline_pointer = z2r_spline;
-
-  CUT_CHECK_ERROR("Cuda_PairEAMCuda: init failed");
-
 }
 
 void Cuda_PairEAM1Cuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom)
 {
-
-  if(sdata->atom.update_nmax)
-    Cuda_PairEAMCuda_UpdateNmax(sdata,sneighlist);
+
+  if(sdata->atom.update_nmax)
+    Cuda_PairEAMCuda_UpdateNmax(sdata, sneighlist);
+
   if(sdata->atom.update_neigh)
-    Cuda_PairEAMCuda_UpdateNeighbor(sdata,sneighlist);
-  if(sdata->atom.update_nlocal)
-    cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) );
-  if(sdata->buffer_new)
-    Cuda_PairEAMCuda_UpdateBuffer(sdata,sneighlist);
-  cudaMemcpyToSymbol(MY_CONST(eatom) , & sdata->atom.eatom .dev_data, sizeof(ENERGY_FLOAT*) );
-  cudaMemcpyToSymbol(MY_CONST(vatom) , & sdata->atom.vatom .dev_data, sizeof(ENERGY_FLOAT*) );
+    Cuda_PairEAMCuda_UpdateNeighbor(sdata, sneighlist);
 
-  int sharedperproc=0;
-  if(eflag||eflag_atom) sharedperproc=1;
-  if(vflag||vflag_atom) sharedperproc=7;
+  if(sdata->atom.update_nlocal)
+    cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
 
-  int3 layout=getgrid(sneighlist->inum,sharedperproc*sizeof(ENERGY_FLOAT));
-  dim3 threads(layout.z, 1, 1);
-  dim3 grid(layout.x, layout.y, 1);
-
-  eam_buff_offset=grid.x*grid.y;
-
-  BindXTypeTexture(sdata);
-  BindEAMTextures( sdata);// initialize only on first call
+  if(sdata->buffer_new)
+    Cuda_PairEAMCuda_UpdateBuffer(sdata, sneighlist);
 
-
-  MYDBG( printf("# CUDA: Cuda_PairEAMCuda: kernel start eflag: %i vflag: %i\n",eflag,vflag); )
-  CUT_CHECK_ERROR("Cuda_PairEAMCuda: pre pair Kernel 1 problems before kernel invocation");
-  PairEAMCuda_Kernel1<<<grid,threads,sharedperproc*sizeof(ENERGY_FLOAT)*threads.x>>> (eflag, vflag,eflag_atom,vflag_atom);
-  cudaThreadSynchronize();
-  CUT_CHECK_ERROR("Cuda_PairEAMCuda: pair Kernel 1 execution failed");
-
+  cudaMemcpyToSymbol(MY_AP(eatom) , & sdata->atom.eatom .dev_data, sizeof(ENERGY_FLOAT*));
+  cudaMemcpyToSymbol(MY_AP(vatom) , & sdata->atom.vatom .dev_data, sizeof(ENERGY_FLOAT*));
+
+  int sharedperproc = 0;
+
+  if(eflag || eflag_atom) sharedperproc = 1;
+
+  if(vflag || vflag_atom) sharedperproc = 7;
+
+  int3 layout = getgrid(sneighlist->inum, sharedperproc * sizeof(ENERGY_FLOAT));
+  dim3 threads(layout.z, 1, 1);
+  dim3 grid(layout.x, layout.y, 1);
+
+  eam_buff_offset = grid.x * grid.y;
+
+  BindXTypeTexture(sdata);
+  BindEAMTextures(sdata); // initialize only on first call
 
-  MYDBG( printf("# CUDA: Cuda_PairEAMCoulLongCuda: kernel done\n"); )
-
+  MYDBG(printf("# CUDA: Cuda_PairEAMCuda: kernel start eflag: %i vflag: %i\n", eflag, vflag);)
+  CUT_CHECK_ERROR("Cuda_PairEAMCuda: pre pair Kernel 1 problems before kernel invocation");
+  PairEAMCuda_Kernel1 <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x>>> (eflag, vflag, eflag_atom, vflag_atom);
+  cudaThreadSynchronize();
+  CUT_CHECK_ERROR("Cuda_PairEAMCuda: pair Kernel 1 execution failed");
+
+
+
+  MYDBG(printf("# CUDA: Cuda_PairEAMCoulLongCuda: kernel done\n");)
+
 }
 
 void Cuda_PairEAM2Cuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom)
 {
-  int sharedperproc=0;
-  if(eflag||eflag_atom) sharedperproc=1;
-  if(vflag||vflag_atom) sharedperproc=7;
-  int3 layout=getgrid(sneighlist->inum,sharedperproc*sizeof(ENERGY_FLOAT));
-  dim3 threads(layout.z, 1, 1);
-  dim3 grid(layout.x, layout.y, 1);
-
-  BindXTypeTexture(sdata);
-  BindEAMTextures( sdata);// initialize only on first call
-  // initialize only on first call
-  sdata->pair.lastgridsize=grid.x*grid.y;
-  sdata->pair.n_energy_virial=sharedperproc;
+  int sharedperproc = 0;
 
-  MYDBG( printf("# CUDA: Cuda_PairEAMCuda: kernel start eflag: %i vflag: %i\n",eflag,vflag); )
-  CUT_CHECK_ERROR("Cuda_PairEAMCuda: pre pair Kernel 2 problems before kernel invocation");
-  PairEAMCuda_Kernel2<<<grid,threads,sharedperproc*sizeof(ENERGY_FLOAT)*threads.x>>> (eflag, vflag,eflag_atom,vflag_atom);
-  CUT_CHECK_ERROR("Cuda_PairEAMCuda: pair Kernel 2 start failed");
-  cudaThreadSynchronize();
-  CUT_CHECK_ERROR("Cuda_PairEAMCuda: pair Kernel 2 execution failed");
-
-  if(eflag||vflag)
-  {
-    int n=grid.x*grid.y;
-    grid.x=sharedperproc;
-    grid.y=1;
-    threads.x=256;
-    MY_AP(PairVirialCompute_reduce)<<<grid,threads,threads.x*sizeof(ENERGY_FLOAT)*sharedperproc>>>(n);
-    cudaThreadSynchronize();
-    CUT_CHECK_ERROR("Cuda_PairEAMCuda: virial compute Kernel execution failed");
-  }
+  if(eflag || eflag_atom) sharedperproc = 1;
+
+  if(vflag || vflag_atom) sharedperproc = 7;
+
+  int3 layout = getgrid(sneighlist->inum, sharedperproc * sizeof(ENERGY_FLOAT));
+  dim3 threads(layout.z, 1, 1);
+  dim3 grid(layout.x, layout.y, 1);
+
+  BindXTypeTexture(sdata);
+  BindEAMTextures(sdata); // initialize only on first call
+  // initialize only on first call
+  sdata->pair.lastgridsize = grid.x * grid.y;
+  sdata->pair.n_energy_virial = sharedperproc;
+
+  MYDBG(printf("# CUDA: Cuda_PairEAMCuda: kernel start eflag: %i vflag: %i\n", eflag, vflag);)
+  CUT_CHECK_ERROR("Cuda_PairEAMCuda: pre pair Kernel 2 problems before kernel invocation");
+  PairEAMCuda_Kernel2 <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x>>> (eflag, vflag, eflag_atom, vflag_atom);
+  CUT_CHECK_ERROR("Cuda_PairEAMCuda: pair Kernel 2 start failed");
+  cudaThreadSynchronize();
+  CUT_CHECK_ERROR("Cuda_PairEAMCuda: pair Kernel 2 execution failed");
+
+  if(eflag || vflag) {
+    int n = grid.x * grid.y;
+    grid.x = sharedperproc;
+    grid.y = 1;
+    threads.x = 256;
+    MY_AP(PairVirialCompute_reduce) <<< grid, threads, threads.x* sizeof(ENERGY_FLOAT)*sharedperproc>>>(n);
+    cudaThreadSynchronize();
+    CUT_CHECK_ERROR("Cuda_PairEAMCuda: virial compute Kernel execution failed");
+  }
+
+  MYDBG(printf("# CUDA: Cuda_PairEAMCoulLongCuda: kernel done\n");)
 
-  MYDBG( printf("# CUDA: Cuda_PairEAMCoulLongCuda: kernel done\n"); )
-
 }
 
-void Cuda_PairEAMCuda_PackComm(cuda_shared_data* sdata,int n,int iswap,void* buf_send)
+void Cuda_PairEAMCuda_PackComm(cuda_shared_data* sdata, int n, int iswap, void* buf_send)
 {
-  int3 layout=getgrid(n,0);
-  dim3 threads(layout.z, 1, 1);
-  dim3 grid(layout.x, layout.y, 1);
-  F_FLOAT* buf=(F_FLOAT*) (& ((double*)sdata->buffer)[eam_buff_offset]);
-
-  PairEAMCuda_PackComm_Kernel<<<grid,threads,0>>> ((int*) sdata->comm.sendlist.dev_data,n
-      ,sdata->comm.maxlistlength,iswap,buf);
-  cudaThreadSynchronize();
-  cudaMemcpy(buf_send, buf, n*sizeof(F_FLOAT), cudaMemcpyDeviceToHost);
-  cudaThreadSynchronize();
+  int3 layout = getgrid(n, 0);
+  dim3 threads(layout.z, 1, 1);
+  dim3 grid(layout.x, layout.y, 1);
+  F_FLOAT* buf = (F_FLOAT*)(& ((double*)sdata->buffer)[eam_buff_offset]);
+
+  PairEAMCuda_PackComm_Kernel <<< grid, threads, 0>>> ((int*) sdata->comm.sendlist.dev_data, n
+      , sdata->comm.maxlistlength, iswap, buf);
+  cudaThreadSynchronize();
+  cudaMemcpy(buf_send, buf, n* sizeof(F_FLOAT), cudaMemcpyDeviceToHost);
+  cudaThreadSynchronize();
 }
 
-void Cuda_PairEAMCuda_UnpackComm(cuda_shared_data* sdata,int n,int first,void* buf_recv,void* fp)
+void Cuda_PairEAMCuda_UnpackComm(cuda_shared_data* sdata, int n, int first, void* buf_recv, void* fp)
 {
-  F_FLOAT* fp_first = &(((F_FLOAT*) fp)[first]);
-  cudaMemcpy(fp_first,buf_recv, n*sizeof(F_FLOAT), cudaMemcpyHostToDevice);
+  F_FLOAT* fp_first = &(((F_FLOAT*) fp)[first]);
+  cudaMemcpy(fp_first, buf_recv, n * sizeof(F_FLOAT), cudaMemcpyHostToDevice);
 }
 
 #undef _type2frho
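PackComm/UnpackComm above implement the device side of the halo exchange for the EAM fp values: a kernel gathers fp[sendlist[k]] into a contiguous staging area inside the shared buffer, which is then copied to the host send buffer; on the receiving side a plain contiguous copy suffices because ghost atoms are stored consecutively from index first. A compact sketch with hypothetical names:

// sketch of the gather-and-copy halo pattern above (names hypothetical)
__global__ void gather_fp(int* sendlist, int n, double* fp, double* staging)
{
  int k = blockIdx.x * blockDim.x + threadIdx.x;

  if(k < n) staging[k] = fp[sendlist[k]];          // gather into contiguous layout
}

void pack_and_download(int* d_sendlist, int n, double* d_fp,
                       double* d_staging, double* h_send)
{
  gather_fp<<<(n + 127) / 128, 128>>>(d_sendlist, n, d_fp, d_staging);
  cudaMemcpy(h_send, d_staging, n * sizeof(double), cudaMemcpyDeviceToHost);
}
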
 
diff --git a/lib/cuda/pair_sw_cuda.cu b/lib/cuda/pair_sw_cuda.cu
index f5b0807fcd..491d4d666f 100644
--- a/lib/cuda/pair_sw_cuda.cu
+++ b/lib/cuda/pair_sw_cuda.cu
@@ -1,22 +1,22 @@
 /* ----------------------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
    Original Version: http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
+   Steve Plimpton, sjplimp@sandia.gov
 
-   See the README file in the top-level LAMMPS directory.
+   See the README file in the top-level LAMMPS directory.
 
-   -----------------------------------------------------------------------
+   -----------------------------------------------------------------------
 
    USER-CUDA Package and associated modifications:
-   https://sourceforge.net/projects/lammpscuda/
+   https://sourceforge.net/projects/lammpscuda/
 
    Christian Trott, christian.trott@tu-ilmenau.de
    Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
-   Theoretical Physics II, University of Technology Ilmenau, Germany
+   Theoretical Physics II, University of Technology Ilmenau, Germany
 
-   See the README file in the USER-CUDA directory.
+   See the README file in the USER-CUDA directory.
 
    This software is distributed under the GNU General Public License.
 
------------------------------------------------------------------------- */
@@ -24,116 +24,115 @@
 
 #include
 #include "pair_sw_cuda_cu.h"
 
-__device__ __constant__ ParamSW_Float params_sw[MANYBODY_NPAIR*MANYBODY_NPAIR*MANYBODY_NPAIR];
+__device__ __constant__ ParamSW_Float params_sw[MANYBODY_NPAIR* MANYBODY_NPAIR* MANYBODY_NPAIR];
 
 #include "pair_sw_cuda_kernel_nc.cu"
 #include
 
-void Cuda_PairSWCuda_Init(cuda_shared_data* sdata,ParamSW_Float* params_host,void* map_host, void* elem2param_host,int nelements_h)
+void Cuda_PairSWCuda_Init(cuda_shared_data* sdata, ParamSW_Float* params_host, void* map_host, void* elem2param_host, int nelements_h)
 {
   unsigned cuda_ntypes = sdata->atom.ntypes + 1;
-  X_FLOAT box_size[3] =
-  {
+  X_FLOAT box_size[3] = {
    sdata->domain.subhi[0] - sdata->domain.sublo[0],
    sdata->domain.subhi[1] - sdata->domain.sublo[1],
    sdata->domain.subhi[2] - sdata->domain.sublo[2]
   };
-  cudaMemcpyToSymbol(MY_CONST(box_size) , box_size , sizeof(X_FLOAT)*3);
-  cudaMemcpyToSymbol(MY_CONST(cuda_ntypes) ,&cuda_ntypes , sizeof(unsigned) );
-  cudaMemcpyToSymbol(MY_CONST(virial) ,&sdata->pair.virial.dev_data , sizeof(ENERGY_FLOAT*) );
-  cudaMemcpyToSymbol(MY_CONST(eng_vdwl) ,&sdata->pair.eng_vdwl.dev_data , sizeof(ENERGY_FLOAT*) );
-  cudaMemcpyToSymbol(MY_CONST(periodicity) , sdata->domain.periodicity , sizeof(int)*3 );
-  cudaMemcpyToSymbol(MY_CONST(collect_forces_later), &sdata->pair.collect_forces_later , sizeof(int) );
-  cudaMemcpyToSymbol("params_sw", params_host , sizeof(ParamSW_Float)*nelements_h*nelements_h*nelements_h );
-  cudaMemcpyToSymbol("elem2param",elem2param_host , sizeof(int)*nelements_h*nelements_h*nelements_h );
-  cudaMemcpyToSymbol("map",map_host , sizeof(int)*cuda_ntypes );
-  cudaMemcpyToSymbol("nelements",&nelements_h, sizeof(int));
+  cudaMemcpyToSymbol(MY_AP(box_size) , box_size , sizeof(X_FLOAT) * 3);
+  cudaMemcpyToSymbol(MY_AP(cuda_ntypes) , &cuda_ntypes , sizeof(unsigned));
+  cudaMemcpyToSymbol(MY_AP(virial) , &sdata->pair.virial.dev_data , sizeof(ENERGY_FLOAT*));
+  cudaMemcpyToSymbol(MY_AP(eng_vdwl) , &sdata->pair.eng_vdwl.dev_data , sizeof(ENERGY_FLOAT*));
+  cudaMemcpyToSymbol(MY_AP(periodicity) , sdata->domain.periodicity , sizeof(int) * 3);
+  cudaMemcpyToSymbol(MY_AP(collect_forces_later), &sdata->pair.collect_forces_later , sizeof(int));
+  cudaMemcpyToSymbol(params_sw, params_host , sizeof(ParamSW_Float)*nelements_h * nelements_h * nelements_h);
+  cudaMemcpyToSymbol(elem2param, elem2param_host , sizeof(int)*nelements_h * nelements_h * nelements_h);
+  cudaMemcpyToSymbol(map, map_host , sizeof(int)*cuda_ntypes);
+  cudaMemcpyToSymbol(nelements, &nelements_h, sizeof(int));
 }
 
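The change from quoted names ("params_sw", "elem2param", "map", "nelements") to the symbols themselves is not cosmetic: the char*-based symbol lookup in cudaMemcpyToSymbol was deprecated by NVIDIA and later removed from the CUDA runtime, while the symbol form is resolved at compile time. In miniature, with a hypothetical constant:

// why the string arguments above were replaced by symbols
__device__ __constant__ int nelements_demo;        // hypothetical constant

void set_nelements(int n)
{
  // old API, deprecated and eventually removed:
  //   cudaMemcpyToSymbol("nelements_demo", &n, sizeof(int));
  cudaMemcpyToSymbol(nelements_demo, &n, sizeof(int));   // checked at compile time
}
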
-void Cuda_PairSWCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,int eflag_atom,int vflag_atom)
+void Cuda_PairSWCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom)
 {
-  static int glob_ij_size=0;
-  static F_FLOAT4* glob_r_ij=NULL;
-  static int* glob_numneigh_red=NULL;
-  static int* glob_neighbors_red=NULL;
-  static int* glob_neightype_red=NULL;
+  static int glob_ij_size = 0;
+  static F_FLOAT4* glob_r_ij = NULL;
+  static int* glob_numneigh_red = NULL;
+  static int* glob_neighbors_red = NULL;
+  static int* glob_neightype_red = NULL;
 
-  if(glob_ij_size < sdata->atom.nall*sneighlist->maxneighbors*sizeof(F_FLOAT))
-  {
-    glob_ij_size = sdata->atom.nall*sneighlist->maxneighbors*sizeof(F_FLOAT);
+  if(glob_ij_size < sdata->atom.nall * sneighlist->maxneighbors * sizeof(F_FLOAT)) {
+    glob_ij_size = sdata->atom.nall * sneighlist->maxneighbors * sizeof(F_FLOAT);
    cudaFree(glob_r_ij);
    cudaFree(glob_numneigh_red);
    cudaFree(glob_neighbors_red);
    cudaFree(glob_neightype_red);
-    cudaMalloc(&glob_r_ij,glob_ij_size*4);
-    cudaMalloc(&glob_numneigh_red,sdata->atom.nall*sizeof(int));
-    cudaMalloc(&glob_neighbors_red,sdata->atom.nall*sneighlist->maxneighbors*sizeof(int));
-    cudaMalloc(&glob_neightype_red,sdata->atom.nall*sneighlist->maxneighbors*sizeof(int));
-    cudaMemcpyToSymbol("_glob_numneigh_red", &glob_numneigh_red , sizeof(int*) );
-    cudaMemcpyToSymbol("_glob_neighbors_red", &glob_neighbors_red , sizeof(int*) );
-    cudaMemcpyToSymbol("_glob_neightype_red", &glob_neightype_red , sizeof(int*) );
-    cudaMemcpyToSymbol("_glob_r_ij", &glob_r_ij , sizeof(F_FLOAT4*) );
-  }
-  dim3 grid,threads;
-  int sharedperproc;
+    cudaMalloc(&glob_r_ij, glob_ij_size * 4);
+    cudaMalloc(&glob_numneigh_red, sdata->atom.nall * sizeof(int));
+    cudaMalloc(&glob_neighbors_red, sdata->atom.nall * sneighlist->maxneighbors * sizeof(int));
+    cudaMalloc(&glob_neightype_red, sdata->atom.nall * sneighlist->maxneighbors * sizeof(int));
+    cudaMemcpyToSymbol(_glob_numneigh_red, &glob_numneigh_red , sizeof(int*));
+    cudaMemcpyToSymbol(_glob_neighbors_red, &glob_neighbors_red , sizeof(int*));
+    cudaMemcpyToSymbol(_glob_neightype_red, &glob_neightype_red , sizeof(int*));
+    cudaMemcpyToSymbol(_glob_r_ij, &glob_r_ij , sizeof(F_FLOAT4*));
+  }
 
-  Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc,false,64);
-  cudaStream_t* streams = (cudaStream_t*) CudaWrapper_returnStreams();
+  dim3 grid, threads;
+  int sharedperproc;
+
+  Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc, false, 64);
+  cudaStream_t* streams = (cudaStream_t*) CudaWrapper_returnStreams();
 
-  dim3 grid2;
-  if(sdata->atom.nall<=256*64000){
-    grid2.x = (sdata->atom.nall+255)/256;
-    grid2.y = 1;
-  } else {
-    grid2.x = (sdata->atom.nall+256*128-1)/(256*128);
+  dim3 grid2;
+
+  if(sdata->atom.nall <= 256 * 64000) {
+    grid2.x = (sdata->atom.nall + 255) / 256;
+    grid2.y = 1;
+  } else {
+    grid2.x = (sdata->atom.nall + 256 * 128 - 1) / (256 * 128);
    grid2.y = 128;
   }
-  grid2.z = 1;
+
+  grid2.z = 1;
   dim3 threads2;
   threads2.x = 256;
   threads2.y = 1;
   threads2.z = 1;
-  timespec time1,time2;
+  timespec time1, time2;
 
 //pre-calculate all neighbor distances and zeta_ij
-  clock_gettime(CLOCK_REALTIME,&time1);
-  Pair_SW_Kernel_TpA_RIJ<<<grid2,threads2,0,streams[1]>>>();
+  clock_gettime(CLOCK_REALTIME, &time1);
+  Pair_SW_Kernel_TpA_RIJ <<< grid2, threads2, 0, streams[1]>>>();
   cudaThreadSynchronize();
-  clock_gettime(CLOCK_REALTIME,&time2);
-  sdata->cuda_timings.test1+=
-    time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000;
-  clock_gettime(CLOCK_REALTIME,&time1);
+  clock_gettime(CLOCK_REALTIME, &time2);
+  sdata->cuda_timings.test1 +=
+    time2.tv_sec - time1.tv_sec + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000;
+  clock_gettime(CLOCK_REALTIME, &time1);
 
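The grid2 computation just above works around the per-dimension grid limit of 65535 blocks on these GPUs: beyond 256*64000 atoms the blocks are folded into a second grid dimension. The same intent as a small, hypothetical helper:

// sketch: split n work items into a grid that respects the 65535 limit
static dim3 make_grid(int n, int block = 256)
{
  int blocks = (n + block - 1) / block;
  dim3 g(blocks, 1, 1);

  if(blocks > 65535) {                             // fold excess blocks into y
    g.y = (blocks + 65534) / 65535;
    g.x = (blocks + g.y - 1) / g.y;
  }

  return g;
}
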
(eflag_atom,vflag_atom); - } - else - { + Pair_SW_Kernel_TpA<1, 0> <<< grid, threads, sharedsize, streams[1]>>> + (eflag_atom, vflag_atom); + } else { if(vflag) - Pair_SW_Kernel_TpA<0,1><<>> - (eflag_atom,vflag_atom); + Pair_SW_Kernel_TpA<0, 1> <<< grid, threads, sharedsize, streams[1]>>> + (eflag_atom, vflag_atom); else - Pair_SW_Kernel_TpA<0,0><<>> - (eflag_atom,vflag_atom); + Pair_SW_Kernel_TpA<0, 0> <<< grid, threads, sharedsize, streams[1]>>> + (eflag_atom, vflag_atom); } cudaThreadSynchronize(); - clock_gettime(CLOCK_REALTIME,&time2); - sdata->cuda_timings.test2+= - time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000; + clock_gettime(CLOCK_REALTIME, &time2); + sdata->cuda_timings.test2 += + time2.tv_sec - time1.tv_sec + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000; Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag); } diff --git a/lib/cuda/pair_tersoff_cuda.cu b/lib/cuda/pair_tersoff_cuda.cu index abbf39ecf0..0ae5e846a0 100644 --- a/lib/cuda/pair_tersoff_cuda.cu +++ b/lib/cuda/pair_tersoff_cuda.cu @@ -1,22 +1,22 @@ /* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator Original Version: http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov + Steve Plimpton, sjplimp@sandia.gov - See the README file in the top-level LAMMPS directory. + See the README file in the top-level LAMMPS directory. - ----------------------------------------------------------------------- + ----------------------------------------------------------------------- USER-CUDA Package and associated modifications: - https://sourceforge.net/projects/lammpscuda/ + https://sourceforge.net/projects/lammpscuda/ Christian Trott, christian.trott@tu-ilmenau.de Lars Winterfeld, lars.winterfeld@tu-ilmenau.de - Theoretical Physics II, University of Technology Ilmenau, Germany + Theoretical Physics II, University of Technology Ilmenau, Germany - See the README file in the USER-CUDA directory. + See the README file in the USER-CUDA directory. This software is distributed under the GNU General Public License. ------------------------------------------------------------------------- */ @@ -25,7 +25,7 @@ #include "pair_tersoff_cuda_cu.h" -__device__ __constant__ Param_Float params[MANYBODY_NPAIR*MANYBODY_NPAIR*MANYBODY_NPAIR]; +__device__ __constant__ Param_Float params[MANYBODY_NPAIR* MANYBODY_NPAIR* MANYBODY_NPAIR]; __device__ __constant__ F_FLOAT* _glob_zeta_ij; //zeta_ij __device__ __constant__ F_FLOAT4* _glob_r_ij; //r_ij (x,y,z,r^2) for pairs within force cutoff __device__ __constant__ bool _zbl; //is tersoff zbl? @@ -36,119 +36,118 @@ __device__ __constant__ bool _zbl; //is tersoff zbl? 
#include -void Cuda_PairTersoffCuda_Init(cuda_shared_data* sdata,Param_Float* params_host,void* map_host, void* elem2param_host,int nelements_h,bool zbl) +void Cuda_PairTersoffCuda_Init(cuda_shared_data* sdata, Param_Float* params_host, void* map_host, void* elem2param_host, int nelements_h, bool zbl) { unsigned cuda_ntypes = sdata->atom.ntypes + 1; - X_FLOAT box_size[3] = - { + X_FLOAT box_size[3] = { sdata->domain.subhi[0] - sdata->domain.sublo[0], sdata->domain.subhi[1] - sdata->domain.sublo[1], sdata->domain.subhi[2] - sdata->domain.sublo[2] }; - cudaMemcpyToSymbol(MY_CONST(box_size) , box_size , sizeof(X_FLOAT)*3); - cudaMemcpyToSymbol(MY_CONST(cuda_ntypes) ,&cuda_ntypes , sizeof(unsigned) ); - cudaMemcpyToSymbol(MY_CONST(virial) ,&sdata->pair.virial.dev_data , sizeof(ENERGY_FLOAT*) ); - cudaMemcpyToSymbol(MY_CONST(eng_vdwl) ,&sdata->pair.eng_vdwl.dev_data , sizeof(ENERGY_FLOAT*) ); - cudaMemcpyToSymbol(MY_CONST(periodicity) , sdata->domain.periodicity , sizeof(int)*3 ); - cudaMemcpyToSymbol(MY_CONST(collect_forces_later), &sdata->pair.collect_forces_later , sizeof(int) ); - cudaMemcpyToSymbol("params", params_host , sizeof(Param_Float)*nelements_h*nelements_h*nelements_h ); - cudaMemcpyToSymbol("elem2param",elem2param_host , sizeof(int)*nelements_h*nelements_h*nelements_h ); - cudaMemcpyToSymbol("map",map_host , sizeof(int)*cuda_ntypes ); - cudaMemcpyToSymbol("nelements",&nelements_h, sizeof(int)); - cudaMemcpyToSymbol("_zbl",&zbl,sizeof(bool)); + cudaMemcpyToSymbol(MY_AP(box_size) , box_size , sizeof(X_FLOAT) * 3); + cudaMemcpyToSymbol(MY_AP(cuda_ntypes) , &cuda_ntypes , sizeof(unsigned)); + cudaMemcpyToSymbol(MY_AP(virial) , &sdata->pair.virial.dev_data , sizeof(ENERGY_FLOAT*)); + cudaMemcpyToSymbol(MY_AP(eng_vdwl) , &sdata->pair.eng_vdwl.dev_data , sizeof(ENERGY_FLOAT*)); + cudaMemcpyToSymbol(MY_AP(periodicity) , sdata->domain.periodicity , sizeof(int) * 3); + cudaMemcpyToSymbol(MY_AP(collect_forces_later), &sdata->pair.collect_forces_later , sizeof(int)); + cudaMemcpyToSymbol(params, params_host , sizeof(Param_Float)*nelements_h * nelements_h * nelements_h); + cudaMemcpyToSymbol(elem2param, elem2param_host , sizeof(int)*nelements_h * nelements_h * nelements_h); + cudaMemcpyToSymbol(map, map_host , sizeof(int)*cuda_ntypes); + cudaMemcpyToSymbol(nelements, &nelements_h, sizeof(int)); + cudaMemcpyToSymbol(_zbl, &zbl, sizeof(bool)); } -void Cuda_PairTersoffCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,int eflag_atom,int vflag_atom) +void Cuda_PairTersoffCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom) { - static F_FLOAT* glob_zeta_ij=NULL; - static int glob_zeta_ij_size=0; - static F_FLOAT4* glob_r_ij=NULL; - static int* glob_numneigh_red=NULL; - static int* glob_neighbors_red=NULL; - static int* glob_neightype_red=NULL; + static F_FLOAT* glob_zeta_ij = NULL; + static int glob_zeta_ij_size = 0; + static F_FLOAT4* glob_r_ij = NULL; + static int* glob_numneigh_red = NULL; + static int* glob_neighbors_red = NULL; + static int* glob_neightype_red = NULL; - if(glob_zeta_ij_size < sdata->atom.nall*sneighlist->maxneighbors*sizeof(F_FLOAT)) - { - glob_zeta_ij_size = sdata->atom.nall*sneighlist->maxneighbors*sizeof(F_FLOAT); - cudaFree(glob_zeta_ij); + if(glob_zeta_ij_size < sdata->atom.nall * sneighlist->maxneighbors * sizeof(F_FLOAT)) { + glob_zeta_ij_size = sdata->atom.nall * sneighlist->maxneighbors * sizeof(F_FLOAT); + cudaFree(glob_zeta_ij); cudaFree(glob_r_ij); 
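A note on the hunk that follows: Cuda_PairTersoffCuda (like Cuda_PairSWCuda above) caches its scratch arrays in function-local statics and frees and reallocates them only when the required size grows. A minimal sketch of that grow-only pattern; the function and parameter names are illustrative and not part of this patch:

  // Sketch only: grow-only device scratch buffer (CUDA runtime API).
  // 'buf' and 'capacity' persist across calls, e.g. as function statics.
  static void grow_device_buffer(void** buf, size_t* capacity, size_t needed)
  {
    if(needed > *capacity) {      // grow, never shrink
      cudaFree(*buf);             // cudaFree(NULL) is a harmless no-op
      cudaMalloc(buf, needed);    // error handling omitted for brevity
      *capacity = needed;
    }
  }

This trades a larger standing memory footprint for avoiding repeated cudaMalloc/cudaFree cycles when the neighbor-list size fluctuates from step to step.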
-void Cuda_PairTersoffCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,int eflag_atom,int vflag_atom) +void Cuda_PairTersoffCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom) { - static F_FLOAT* glob_zeta_ij=NULL; - static int glob_zeta_ij_size=0; - static F_FLOAT4* glob_r_ij=NULL; - static int* glob_numneigh_red=NULL; - static int* glob_neighbors_red=NULL; - static int* glob_neightype_red=NULL; + static F_FLOAT* glob_zeta_ij = NULL; + static int glob_zeta_ij_size = 0; + static F_FLOAT4* glob_r_ij = NULL; + static int* glob_numneigh_red = NULL; + static int* glob_neighbors_red = NULL; + static int* glob_neightype_red = NULL; - if(glob_zeta_ij_size < sdata->atom.nall*sneighlist->maxneighbors*sizeof(F_FLOAT)) - { - glob_zeta_ij_size = sdata->atom.nall*sneighlist->maxneighbors*sizeof(F_FLOAT); - cudaFree(glob_zeta_ij); + if(glob_zeta_ij_size < sdata->atom.nall * sneighlist->maxneighbors * sizeof(F_FLOAT)) { + glob_zeta_ij_size = sdata->atom.nall * sneighlist->maxneighbors * sizeof(F_FLOAT); + cudaFree(glob_zeta_ij); cudaFree(glob_r_ij); cudaFree(glob_numneigh_red); cudaFree(glob_neighbors_red); cudaFree(glob_neightype_red); - cudaMalloc(&glob_zeta_ij,glob_zeta_ij_size); - cudaMalloc(&glob_r_ij,glob_zeta_ij_size*4); - cudaMalloc(&glob_numneigh_red,sdata->atom.nall*sizeof(int)); - cudaMalloc(&glob_neighbors_red,sdata->atom.nall*sneighlist->maxneighbors*sizeof(int)); - cudaMalloc(&glob_neightype_red,sdata->atom.nall*sneighlist->maxneighbors*sizeof(int)); - cudaMemcpyToSymbol("_glob_numneigh_red", &glob_numneigh_red , sizeof(int*) ); - cudaMemcpyToSymbol("_glob_neighbors_red", &glob_neighbors_red , sizeof(int*) ); - cudaMemcpyToSymbol("_glob_neightype_red", &glob_neightype_red , sizeof(int*) ); - cudaMemcpyToSymbol("_glob_r_ij", &glob_r_ij , sizeof(F_FLOAT4*) ); - cudaMemcpyToSymbol("_glob_zeta_ij", &glob_zeta_ij , sizeof(F_FLOAT*) ); - } - dim3 grid,threads; - int sharedperproc; + cudaMalloc(&glob_zeta_ij, glob_zeta_ij_size); + cudaMalloc(&glob_r_ij, glob_zeta_ij_size * 4); + cudaMalloc(&glob_numneigh_red, sdata->atom.nall * sizeof(int)); + cudaMalloc(&glob_neighbors_red, sdata->atom.nall * sneighlist->maxneighbors * sizeof(int)); + cudaMalloc(&glob_neightype_red, sdata->atom.nall * sneighlist->maxneighbors * sizeof(int)); + cudaMemcpyToSymbol(_glob_numneigh_red, &glob_numneigh_red , sizeof(int*)); + cudaMemcpyToSymbol(_glob_neighbors_red, &glob_neighbors_red , sizeof(int*)); + cudaMemcpyToSymbol(_glob_neightype_red, &glob_neightype_red , sizeof(int*)); + cudaMemcpyToSymbol(_glob_r_ij, &glob_r_ij , sizeof(F_FLOAT4*)); + cudaMemcpyToSymbol(_glob_zeta_ij, &glob_zeta_ij , sizeof(F_FLOAT*)); + } - Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc,false,64); - cudaStream_t* streams = (cudaStream_t*) CudaWrapper_returnStreams(); + dim3 grid, threads; + int sharedperproc; + + Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc, false, 64); + cudaStream_t* streams = (cudaStream_t*) CudaWrapper_returnStreams(); - dim3 grid2; - if(sdata->atom.nall<=256*64000){ - grid2.x = (sdata->atom.nall+255)/256; - grid2.y = 1; - } else { - grid2.x = (sdata->atom.nall+256*128-1)/(256*128); + dim3 grid2; + + if(sdata->atom.nall <= 256 * 64000) { + grid2.x = (sdata->atom.nall + 255) / 256; + grid2.y = 1; + } else { + grid2.x = (sdata->atom.nall + 256 * 128 - 1) / (256 * 128); grid2.y = 128; } - grid2.z = 1; + + grid2.z = 1; dim3 threads2; threads2.x = 256; threads2.y = 1; threads2.z = 1; - timespec time1,time2; + timespec time1, time2; //pre-calculate all neighbor distances and zeta_ij - clock_gettime(CLOCK_REALTIME,&time1); - Pair_Tersoff_Kernel_TpA_RIJ<<<grid2,threads2,0,streams[1]>>> - (); + clock_gettime(CLOCK_REALTIME, &time1); + Pair_Tersoff_Kernel_TpA_RIJ <<< grid2, threads2, 0, streams[1]>>> + (); cudaThreadSynchronize(); - Pair_Tersoff_Kernel_TpA_ZetaIJ<<<grid2,threads2,0,streams[1]>>> - (); + Pair_Tersoff_Kernel_TpA_ZetaIJ <<< grid2, threads2, 0, streams[1]>>> + (); cudaThreadSynchronize(); - clock_gettime(CLOCK_REALTIME,&time2); - sdata->cuda_timings.test1+= - time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000; - clock_gettime(CLOCK_REALTIME,&time1); + clock_gettime(CLOCK_REALTIME, &time2); + sdata->cuda_timings.test1 += + time2.tv_sec - time1.tv_sec + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000; + clock_gettime(CLOCK_REALTIME, &time1); //actual force calculation - unsigned int sharedsize=(sharedperproc*sizeof(ENERGY_FLOAT)+4*sizeof(F_FLOAT))*threads.x; //extra 4 floats per thread used to reduce register pressure - if(eflag) - { + unsigned int sharedsize = (sharedperproc * 
sizeof(ENERGY_FLOAT) + 4 * sizeof(F_FLOAT)) * threads.x; //extra 4 floats per thread used to reduce register pressure + + if(eflag) { if(vflag) - Pair_Tersoff_Kernel_TpA<1,1><<<grid,threads,sharedsize,streams[1]>>> - (eflag_atom,vflag_atom); + Pair_Tersoff_Kernel_TpA<1, 1> <<< grid, threads, sharedsize, streams[1]>>> + (eflag_atom, vflag_atom); else - Pair_Tersoff_Kernel_TpA<1,0><<<grid,threads,sharedsize,streams[1]>>> - (eflag_atom,vflag_atom); - } - else - { + Pair_Tersoff_Kernel_TpA<1, 0> <<< grid, threads, sharedsize, streams[1]>>> + (eflag_atom, vflag_atom); + } else { if(vflag) - Pair_Tersoff_Kernel_TpA<0,1><<<grid,threads,sharedsize,streams[1]>>> - (eflag_atom,vflag_atom); + Pair_Tersoff_Kernel_TpA<0, 1> <<< grid, threads, sharedsize, streams[1]>>> + (eflag_atom, vflag_atom); else - Pair_Tersoff_Kernel_TpA<0,0><<<grid,threads,sharedsize,streams[1]>>> - (eflag_atom,vflag_atom); + Pair_Tersoff_Kernel_TpA<0, 0> <<< grid, threads, sharedsize, streams[1]>>> + (eflag_atom, vflag_atom); } cudaThreadSynchronize(); - clock_gettime(CLOCK_REALTIME,&time2); - sdata->cuda_timings.test2+= - time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000; + clock_gettime(CLOCK_REALTIME, &time2); + sdata->cuda_timings.test2 += + time2.tv_sec - time1.tv_sec + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000; Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag); } diff --git a/lib/cuda/pppm_cuda.cu b/lib/cuda/pppm_cuda.cu index cabea885d3..dd434b9bbf 100644 --- a/lib/cuda/pppm_cuda.cu +++ b/lib/cuda/pppm_cuda.cu @@ -1,22 +1,22 @@ /* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator Original Version: http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov + Steve Plimpton, sjplimp@sandia.gov - See the README file in the top-level LAMMPS directory. + See the README file in the top-level LAMMPS directory. - ----------------------------------------------------------------------- + ----------------------------------------------------------------------- USER-CUDA Package and associated modifications: - https://sourceforge.net/projects/lammpscuda/ + https://sourceforge.net/projects/lammpscuda/ Christian Trott, christian.trott@tu-ilmenau.de Lars Winterfeld, lars.winterfeld@tu-ilmenau.de - Theoretical Physics II, University of Technology Ilmenau, Germany + Theoretical Physics II, University of Technology Ilmenau, Germany - See the README file in the USER-CUDA directory. + See the README file in the USER-CUDA directory. This software is distributed under the GNU General Public License. ------------------------------------------------------------------------- */ @@ -34,484 +34,493 @@ #define MIN(a,b) ((a) < (b) ? (a) : (b)) #define MAX(a,b) ((a) > (b) ? 
(a) : (b)) - __device__ __constant__ FFT_FLOAT* work1; - __device__ __constant__ FFT_FLOAT* work2; - __device__ __constant__ FFT_FLOAT* work3; - __device__ __constant__ PPPM_FLOAT* greensfn; - __device__ __constant__ PPPM_FLOAT* gf_b; - __device__ __constant__ PPPM_FLOAT* fkx; - __device__ __constant__ PPPM_FLOAT* fky; - __device__ __constant__ PPPM_FLOAT* fkz; - __device__ __constant__ PPPM_FLOAT* vg; - __device__ __constant__ int* part2grid; - __device__ __constant__ PPPM_FLOAT* density_brick; - __device__ __constant__ int* density_brick_int; - __device__ __constant__ PPPM_FLOAT density_intScale; - __device__ __constant__ PPPM_FLOAT* vdx_brick; - __device__ __constant__ PPPM_FLOAT* vdy_brick; - __device__ __constant__ PPPM_FLOAT* vdz_brick; - __device__ __constant__ PPPM_FLOAT* density_fft; - __device__ __constant__ ENERGY_FLOAT* energy; - __device__ __constant__ ENERGY_FLOAT* virial; - __device__ __constant__ int nxlo_in; - __device__ __constant__ int nxhi_in; - __device__ __constant__ int nxlo_out; - __device__ __constant__ int nxhi_out; - __device__ __constant__ int nylo_in; - __device__ __constant__ int nyhi_in; - __device__ __constant__ int nylo_out; - __device__ __constant__ int nyhi_out; - __device__ __constant__ int nzlo_in; - __device__ __constant__ int nzhi_in; - __device__ __constant__ int nzlo_out; - __device__ __constant__ int nzhi_out; - __device__ __constant__ int nxlo_fft; - __device__ __constant__ int nxhi_fft; - __device__ __constant__ int nylo_fft; - __device__ __constant__ int nyhi_fft; - __device__ __constant__ int nzlo_fft; - __device__ __constant__ int nzhi_fft; - __device__ __constant__ int nx_pppm; - __device__ __constant__ int ny_pppm; - __device__ __constant__ int nz_pppm; - __device__ __constant__ int slabflag; - __device__ __constant__ PPPM_FLOAT qqrd2e; - __device__ __constant__ int order; - //__device__ __constant__ float3 sublo; - __device__ __constant__ PPPM_FLOAT* rho_coeff; - __device__ __constant__ int nmax; - __device__ __constant__ int nlocal; - __device__ __constant__ PPPM_FLOAT* debugdata; - __device__ __constant__ PPPM_FLOAT delxinv; - __device__ __constant__ PPPM_FLOAT delyinv; - __device__ __constant__ PPPM_FLOAT delzinv; - __device__ __constant__ int nlower; - __device__ __constant__ int nupper; - __device__ __constant__ PPPM_FLOAT shiftone; - - +__device__ __constant__ FFT_FLOAT* work1; +__device__ __constant__ FFT_FLOAT* work2; +__device__ __constant__ FFT_FLOAT* work3; +__device__ __constant__ PPPM_FLOAT* greensfn; +__device__ __constant__ PPPM_FLOAT* gf_b; +__device__ __constant__ PPPM_FLOAT* fkx; +__device__ __constant__ PPPM_FLOAT* fky; +__device__ __constant__ PPPM_FLOAT* fkz; +__device__ __constant__ PPPM_FLOAT* vg; +__device__ __constant__ int* part2grid; +__device__ __constant__ PPPM_FLOAT* density_brick; +__device__ __constant__ int* density_brick_int; +__device__ __constant__ PPPM_FLOAT density_intScale; +__device__ __constant__ PPPM_FLOAT* vdx_brick; +__device__ __constant__ PPPM_FLOAT* vdy_brick; +__device__ __constant__ PPPM_FLOAT* vdz_brick; +__device__ __constant__ PPPM_FLOAT* density_fft; +__device__ __constant__ ENERGY_FLOAT* energy; +__device__ __constant__ ENERGY_FLOAT* virial; +__device__ __constant__ int nxlo_in; +__device__ __constant__ int nxhi_in; +__device__ __constant__ int nxlo_out; +__device__ __constant__ int nxhi_out; +__device__ __constant__ int nylo_in; +__device__ __constant__ int nyhi_in; +__device__ __constant__ int nylo_out; +__device__ __constant__ int nyhi_out; +__device__ __constant__ int nzlo_in; 
+__device__ __constant__ int nzhi_in; +__device__ __constant__ int nzlo_out; +__device__ __constant__ int nzhi_out; +__device__ __constant__ int nxlo_fft; +__device__ __constant__ int nxhi_fft; +__device__ __constant__ int nylo_fft; +__device__ __constant__ int nyhi_fft; +__device__ __constant__ int nzlo_fft; +__device__ __constant__ int nzhi_fft; +__device__ __constant__ int nx_pppm; +__device__ __constant__ int ny_pppm; +__device__ __constant__ int nz_pppm; +__device__ __constant__ int slabflag; +__device__ __constant__ PPPM_FLOAT qqrd2e; +__device__ __constant__ int order; +//__device__ __constant__ float3 sublo; +__device__ __constant__ PPPM_FLOAT* rho_coeff; +__device__ __constant__ int nmax; +__device__ __constant__ int nlocal; +__device__ __constant__ PPPM_FLOAT* debugdata; +__device__ __constant__ PPPM_FLOAT delxinv; +__device__ __constant__ PPPM_FLOAT delyinv; +__device__ __constant__ PPPM_FLOAT delzinv; +__device__ __constant__ int nlower; +__device__ __constant__ int nupper; +__device__ __constant__ PPPM_FLOAT shiftone; + + #include "pppm_cuda_kernel.cu" #include "stdio.h" void pppm_device_init(void* cu_density_brick, void* cu_vdx_brick, void* cu_vdy_brick, void* cu_vdz_brick, void* cu_density_fft, void* cu_energy, void* cu_virial - ,void* cu_work1,void* cu_work2, void* cu_work3,void* cu_greensfn, void* cu_fkx, void* cu_fky, void* cu_fkz, void* cu_vg - ,int cu_nxlo_in,int cu_nxhi_in,int cu_nylo_in,int cu_nyhi_in,int cu_nzlo_in,int cu_nzhi_in,int cu_nxlo_out,int cu_nxhi_out,int cu_nylo_out,int cu_nyhi_out,int cu_nzlo_out,int cu_nzhi_out,int cu_nx_pppm,int cu_ny_pppm,int cu_nz_pppm - ,int cu_nxlo_fft,int cu_nxhi_fft,int cu_nylo_fft,int cu_nyhi_fft,int cu_nzlo_fft,int cu_nzhi_fft,void* cu_gf_b - ,double cu_qqrd2e, int cu_order, void* cu_rho_coeff,void* cu_debugdata,void* cu_density_brick_int,int cu_slabflag - ) + , void* cu_work1, void* cu_work2, void* cu_work3, void* cu_greensfn, void* cu_fkx, void* cu_fky, void* cu_fkz, void* cu_vg + , int cu_nxlo_in, int cu_nxhi_in, int cu_nylo_in, int cu_nyhi_in, int cu_nzlo_in, int cu_nzhi_in, int cu_nxlo_out, int cu_nxhi_out, int cu_nylo_out, int cu_nyhi_out, int cu_nzlo_out, int cu_nzhi_out, int cu_nx_pppm, int cu_ny_pppm, int cu_nz_pppm + , int cu_nxlo_fft, int cu_nxhi_fft, int cu_nylo_fft, int cu_nyhi_fft, int cu_nzlo_fft, int cu_nzhi_fft, void* cu_gf_b + , double cu_qqrd2e, int cu_order, void* cu_rho_coeff, void* cu_debugdata, void* cu_density_brick_int, int cu_slabflag + ) { CUT_CHECK_ERROR("ERROR-CUDA poisson_init Start"); - cudaMemcpyToSymbol("density_brick",&cu_density_brick, sizeof(PPPM_FLOAT*)); - cudaMemcpyToSymbol("density_brick_int",&cu_density_brick_int, sizeof(PPPM_FLOAT*)); - cudaMemcpyToSymbol("vdx_brick",&cu_vdx_brick, sizeof(PPPM_FLOAT*)); - cudaMemcpyToSymbol("vdy_brick",&cu_vdy_brick, sizeof(PPPM_FLOAT*)); - cudaMemcpyToSymbol("vdz_brick",&cu_vdz_brick, sizeof(PPPM_FLOAT*)); - cudaMemcpyToSymbol("density_fft",&cu_density_fft, sizeof(PPPM_FLOAT*)); - cudaMemcpyToSymbol("energy",&cu_energy, sizeof(ENERGY_FLOAT*)); - cudaMemcpyToSymbol("virial",&cu_virial, sizeof(ENERGY_FLOAT*)); - cudaMemcpyToSymbol("nxlo_in",&cu_nxlo_in, sizeof(int)); - cudaMemcpyToSymbol("nxhi_in",&cu_nxhi_in, sizeof(int)); - cudaMemcpyToSymbol("nxlo_out",&cu_nxlo_out, sizeof(int)); - cudaMemcpyToSymbol("nxhi_out",&cu_nxhi_out, sizeof(int)); - cudaMemcpyToSymbol("nylo_in",&cu_nylo_in, sizeof(int)); - cudaMemcpyToSymbol("nyhi_in",&cu_nyhi_in, sizeof(int)); - cudaMemcpyToSymbol("nylo_out",&cu_nylo_out, sizeof(int)); - 
cudaMemcpyToSymbol("nyhi_out",&cu_nyhi_out, sizeof(int)); - cudaMemcpyToSymbol("nzlo_in",&cu_nzlo_in, sizeof(int)); - cudaMemcpyToSymbol("nzhi_in",&cu_nzhi_in, sizeof(int)); - cudaMemcpyToSymbol("nzlo_out",&cu_nzlo_out, sizeof(int)); - cudaMemcpyToSymbol("nzhi_out",&cu_nzhi_out, sizeof(int)); - cudaMemcpyToSymbol("nxlo_fft",&cu_nxlo_fft, sizeof(int)); - cudaMemcpyToSymbol("nxhi_fft",&cu_nxhi_fft, sizeof(int)); - cudaMemcpyToSymbol("nylo_fft",&cu_nylo_fft, sizeof(int)); - cudaMemcpyToSymbol("nyhi_fft",&cu_nyhi_fft, sizeof(int)); - cudaMemcpyToSymbol("nzlo_fft",&cu_nzlo_fft, sizeof(int)); - cudaMemcpyToSymbol("nzhi_fft",&cu_nzhi_fft, sizeof(int)); - cudaMemcpyToSymbol("slabflag",&cu_slabflag, sizeof(int)); - cudaMemcpyToSymbol("nx_pppm",&cu_nx_pppm, sizeof(int)); - cudaMemcpyToSymbol("ny_pppm",&cu_ny_pppm, sizeof(int)); - cudaMemcpyToSymbol("nz_pppm",&cu_nz_pppm, sizeof(int)); - cudaMemcpyToSymbol("work1",&cu_work1, sizeof(FFT_FLOAT*)); - cudaMemcpyToSymbol("work2",&cu_work2, sizeof(FFT_FLOAT*)); - cudaMemcpyToSymbol("work3",&cu_work3, sizeof(FFT_FLOAT*)); - cudaMemcpyToSymbol("greensfn",&cu_greensfn, sizeof(PPPM_FLOAT*)); - cudaMemcpyToSymbol("gf_b",&cu_gf_b, sizeof(PPPM_FLOAT*)); - cudaMemcpyToSymbol("fkx",&cu_fkx, sizeof(PPPM_FLOAT*)); - cudaMemcpyToSymbol("fky",&cu_fky, sizeof(PPPM_FLOAT*)); - cudaMemcpyToSymbol("fkz",&cu_fkz, sizeof(PPPM_FLOAT*)); - cudaMemcpyToSymbol("vg",&cu_vg, sizeof(PPPM_FLOAT*)); + cudaMemcpyToSymbol(density_brick, &cu_density_brick, sizeof(PPPM_FLOAT*)); + cudaMemcpyToSymbol(density_brick_int, &cu_density_brick_int, sizeof(PPPM_FLOAT*)); + cudaMemcpyToSymbol(vdx_brick, &cu_vdx_brick, sizeof(PPPM_FLOAT*)); + cudaMemcpyToSymbol(vdy_brick, &cu_vdy_brick, sizeof(PPPM_FLOAT*)); + cudaMemcpyToSymbol(vdz_brick, &cu_vdz_brick, sizeof(PPPM_FLOAT*)); + cudaMemcpyToSymbol(density_fft, &cu_density_fft, sizeof(PPPM_FLOAT*)); + cudaMemcpyToSymbol(energy, &cu_energy, sizeof(ENERGY_FLOAT*)); + cudaMemcpyToSymbol(virial, &cu_virial, sizeof(ENERGY_FLOAT*)); + cudaMemcpyToSymbol(nxlo_in, &cu_nxlo_in, sizeof(int)); + cudaMemcpyToSymbol(nxhi_in, &cu_nxhi_in, sizeof(int)); + cudaMemcpyToSymbol(nxlo_out, &cu_nxlo_out, sizeof(int)); + cudaMemcpyToSymbol(nxhi_out, &cu_nxhi_out, sizeof(int)); + cudaMemcpyToSymbol(nylo_in, &cu_nylo_in, sizeof(int)); + cudaMemcpyToSymbol(nyhi_in, &cu_nyhi_in, sizeof(int)); + cudaMemcpyToSymbol(nylo_out, &cu_nylo_out, sizeof(int)); + cudaMemcpyToSymbol(nyhi_out, &cu_nyhi_out, sizeof(int)); + cudaMemcpyToSymbol(nzlo_in, &cu_nzlo_in, sizeof(int)); + cudaMemcpyToSymbol(nzhi_in, &cu_nzhi_in, sizeof(int)); + cudaMemcpyToSymbol(nzlo_out, &cu_nzlo_out, sizeof(int)); + cudaMemcpyToSymbol(nzhi_out, &cu_nzhi_out, sizeof(int)); + cudaMemcpyToSymbol(nxlo_fft, &cu_nxlo_fft, sizeof(int)); + cudaMemcpyToSymbol(nxhi_fft, &cu_nxhi_fft, sizeof(int)); + cudaMemcpyToSymbol(nylo_fft, &cu_nylo_fft, sizeof(int)); + cudaMemcpyToSymbol(nyhi_fft, &cu_nyhi_fft, sizeof(int)); + cudaMemcpyToSymbol(nzlo_fft, &cu_nzlo_fft, sizeof(int)); + cudaMemcpyToSymbol(nzhi_fft, &cu_nzhi_fft, sizeof(int)); + cudaMemcpyToSymbol(slabflag, &cu_slabflag, sizeof(int)); + cudaMemcpyToSymbol(nx_pppm, &cu_nx_pppm, sizeof(int)); + cudaMemcpyToSymbol(ny_pppm, &cu_ny_pppm, sizeof(int)); + cudaMemcpyToSymbol(nz_pppm, &cu_nz_pppm, sizeof(int)); + cudaMemcpyToSymbol(work1, &cu_work1, sizeof(FFT_FLOAT*)); + cudaMemcpyToSymbol(work2, &cu_work2, sizeof(FFT_FLOAT*)); + cudaMemcpyToSymbol(work3, &cu_work3, sizeof(FFT_FLOAT*)); + cudaMemcpyToSymbol(greensfn, &cu_greensfn, sizeof(PPPM_FLOAT*)); + cudaMemcpyToSymbol(gf_b, 
&cu_gf_b, sizeof(PPPM_FLOAT*)); + cudaMemcpyToSymbol(fkx, &cu_fkx, sizeof(PPPM_FLOAT*)); + cudaMemcpyToSymbol(fky, &cu_fky, sizeof(PPPM_FLOAT*)); + cudaMemcpyToSymbol(fkz, &cu_fkz, sizeof(PPPM_FLOAT*)); + cudaMemcpyToSymbol(vg, &cu_vg, sizeof(PPPM_FLOAT*)); + + PPPM_FLOAT cu_qqrd2e_a = cu_qqrd2e; + cudaMemcpyToSymbol(qqrd2e, &cu_qqrd2e_a, sizeof(PPPM_FLOAT)); + cudaMemcpyToSymbol(order, &cu_order, sizeof(int)); + cudaMemcpyToSymbol(rho_coeff, &cu_rho_coeff, sizeof(PPPM_FLOAT*)); + cudaMemcpyToSymbol(debugdata, &cu_debugdata, sizeof(PPPM_FLOAT*)); - PPPM_FLOAT cu_qqrd2e_a=cu_qqrd2e; - cudaMemcpyToSymbol("qqrd2e",&cu_qqrd2e_a, sizeof(PPPM_FLOAT)); - cudaMemcpyToSymbol("order",&cu_order, sizeof(int)); - cudaMemcpyToSymbol("rho_coeff",&cu_rho_coeff, sizeof(PPPM_FLOAT*)); - cudaMemcpyToSymbol("debugdata",&cu_debugdata, sizeof(PPPM_FLOAT*)); - CUT_CHECK_ERROR("ERROR-CUDA poisson_init"); -/*if(sizeof(CUDA_FLOAT)==sizeof(float)) printf("PPPMCuda Kernel: Using single precision\n"); + /*if(sizeof(CUDA_FLOAT)==sizeof(float)) printf("PPPMCuda Kernel: Using single precision\n"); -#ifdef PPPM_PRECISION -if(sizeof(PPPM_FLOAT)==sizeof(float)) printf("PPPMCuda Kernel: Using single precision for pppm core\n"); -if(sizeof(PPPM_FLOAT)==sizeof(double)) printf("PPPMCuda Kernel: Using double precision for pppm core\n"); -#endif -#ifdef ENERGY_PRECISION -if(sizeof(ENERGY_FLOAT)==sizeof(float)) printf("PPPMCuda Kernel: Using single precision for energy\n"); -if(sizeof(ENERGY_FLOAT)==sizeof(double)) printf("PPPMCuda Kernel: Using double precision for energy\n"); -#endif -#ifdef ENERGY_PRECISION -if(sizeof(FFT_FLOAT)==sizeof(float)) printf("PPPMCuda Kernel: Using single precision for fft\n"); -if(sizeof(FFT_FLOAT)==sizeof(double)) printf("PPPMCuda Kernel: Using double precision for fft\n"); -#endif -#ifdef X_PRECISION -if(sizeof(X_FLOAT)==sizeof(float)) printf("PPPMCuda Kernel: Using single precision for positions\n"); -if(sizeof(X_FLOAT)==sizeof(double)) printf("PPPMCuda Kernel: Using double precision for positions\n"); -#endif -#ifdef F_PRECISION -if(sizeof(F_FLOAT)==sizeof(float)) printf("PPPMCuda Kernel: Using single precision for forces\n"); -if(sizeof(F_FLOAT)==sizeof(double)) printf("PPPMCuda Kernel: Using double precision for forces\n"); -#endif*/ + #ifdef PPPM_PRECISION + if(sizeof(PPPM_FLOAT)==sizeof(float)) printf("PPPMCuda Kernel: Using single precision for pppm core\n"); + if(sizeof(PPPM_FLOAT)==sizeof(double)) printf("PPPMCuda Kernel: Using double precision for pppm core\n"); + #endif + #ifdef ENERGY_PRECISION + if(sizeof(ENERGY_FLOAT)==sizeof(float)) printf("PPPMCuda Kernel: Using single precision for energy\n"); + if(sizeof(ENERGY_FLOAT)==sizeof(double)) printf("PPPMCuda Kernel: Using double precision for energy\n"); + #endif + #ifdef ENERGY_PRECISION + if(sizeof(FFT_FLOAT)==sizeof(float)) printf("PPPMCuda Kernel: Using single precision for fft\n"); + if(sizeof(FFT_FLOAT)==sizeof(double)) printf("PPPMCuda Kernel: Using double precision for fft\n"); + #endif + #ifdef X_PRECISION + if(sizeof(X_FLOAT)==sizeof(float)) printf("PPPMCuda Kernel: Using single precision for positions\n"); + if(sizeof(X_FLOAT)==sizeof(double)) printf("PPPMCuda Kernel: Using double precision for positions\n"); + #endif + #ifdef F_PRECISION + if(sizeof(F_FLOAT)==sizeof(float)) printf("PPPMCuda Kernel: Using single precision for forces\n"); + if(sizeof(F_FLOAT)==sizeof(double)) printf("PPPMCuda Kernel: Using double precision for forces\n"); + #endif*/ } -void pppm_device_init_setup(cuda_shared_data* sdata,PPPM_FLOAT 
cu_shiftone,PPPM_FLOAT cu_delxinv,PPPM_FLOAT cu_delyinv,PPPM_FLOAT cu_delzinv,int cu_nlower,int cu_nupper) +void pppm_device_init_setup(cuda_shared_data* sdata, PPPM_FLOAT cu_shiftone, PPPM_FLOAT cu_delxinv, PPPM_FLOAT cu_delyinv, PPPM_FLOAT cu_delzinv, int cu_nlower, int cu_nupper) { - cudaMemcpyToSymbol("delxinv",&cu_delxinv, sizeof(PPPM_FLOAT)); - cudaMemcpyToSymbol("delyinv",&cu_delyinv, sizeof(PPPM_FLOAT)); - cudaMemcpyToSymbol("delzinv",&cu_delzinv, sizeof(PPPM_FLOAT)); - cudaMemcpyToSymbol("shiftone",&cu_shiftone, sizeof(PPPM_FLOAT)); - cudaMemcpyToSymbol("nlower",&cu_nlower, sizeof(int)); - cudaMemcpyToSymbol("nupper",&cu_nupper, sizeof(int)); - cudaMemcpyToSymbol(MY_CONST(sublo) , sdata->domain.sublo, 3*sizeof(X_FLOAT)); - cudaMemcpyToSymbol(MY_CONST(subhi) , sdata->domain.subhi, 3*sizeof(X_FLOAT)); - cudaMemcpyToSymbol(MY_CONST(boxlo) , sdata->domain.boxlo, 3*sizeof(X_FLOAT)); + cudaMemcpyToSymbol(delxinv, &cu_delxinv, sizeof(PPPM_FLOAT)); + cudaMemcpyToSymbol(delyinv, &cu_delyinv, sizeof(PPPM_FLOAT)); + cudaMemcpyToSymbol(delzinv, &cu_delzinv, sizeof(PPPM_FLOAT)); + cudaMemcpyToSymbol(shiftone, &cu_shiftone, sizeof(PPPM_FLOAT)); + cudaMemcpyToSymbol(nlower, &cu_nlower, sizeof(int)); + cudaMemcpyToSymbol(nupper, &cu_nupper, sizeof(int)); + cudaMemcpyToSymbol(MY_AP(sublo) , sdata->domain.sublo, 3 * sizeof(X_FLOAT)); + cudaMemcpyToSymbol(MY_AP(subhi) , sdata->domain.subhi, 3 * sizeof(X_FLOAT)); + cudaMemcpyToSymbol(MY_AP(boxlo) , sdata->domain.boxlo, 3 * sizeof(X_FLOAT)); CUT_CHECK_ERROR("ERROR-CUDA pppm_init_setup"); } -void pppm_device_update(cuda_shared_data* sdata,void* cu_part2grid, int nlocala,int nmaxa) +void pppm_device_update(cuda_shared_data* sdata, void* cu_part2grid, int nlocala, int nmaxa) { - cudaMemcpyToSymbol("part2grid",&cu_part2grid, sizeof(int*)); - cudaMemcpyToSymbol(MY_CONST(x) , & sdata->atom.x .dev_data, sizeof(X_FLOAT*)); - cudaMemcpyToSymbol(MY_CONST(f) , & sdata->atom.f .dev_data, sizeof(F_FLOAT*)); - cudaMemcpyToSymbol(MY_CONST(q) , & sdata->atom.q .dev_data, sizeof(F_FLOAT*)); - cudaMemcpyToSymbol(MY_CONST(tag) , & sdata->atom.tag .dev_data, sizeof(int*)); - //cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal .dev_data, sizeof(int)); - cudaMemcpyToSymbol("nlocal" , &nlocala, sizeof(int)); - cudaMemcpyToSymbol("nmax" , &nmaxa, sizeof(int)); + cudaMemcpyToSymbol(part2grid, &cu_part2grid, sizeof(int*)); + cudaMemcpyToSymbol(MY_AP(x) , & sdata->atom.x .dev_data, sizeof(X_FLOAT*)); + cudaMemcpyToSymbol(MY_AP(f) , & sdata->atom.f .dev_data, sizeof(F_FLOAT*)); + cudaMemcpyToSymbol(MY_AP(q) , & sdata->atom.q .dev_data, sizeof(F_FLOAT*)); + cudaMemcpyToSymbol(MY_AP(tag) , & sdata->atom.tag .dev_data, sizeof(int*)); + //cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal .dev_data, sizeof(int)); + cudaMemcpyToSymbol(nlocal , &nlocala, sizeof(int)); + cudaMemcpyToSymbol(nmax , &nmaxa, sizeof(int)); CUT_CHECK_ERROR("ERROR-CUDA pppm_device_update"); - + } void pppm_update_nlocal(int nlocala) { - cudaMemcpyToSymbol("nlocal" , &nlocala, sizeof(int)); - CUT_CHECK_ERROR("ERROR-CUDA update_nlocal b"); + cudaMemcpyToSymbol(nlocal , &nlocala, sizeof(int)); + CUT_CHECK_ERROR("ERROR-CUDA update_nlocal b"); }
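As in the pair styles, every cudaMemcpyToSymbol() call in these pppm hunks is migrated from the string form ("name") to the symbol itself; the string overload is deprecated and was later removed from the CUDA runtime. A minimal sketch of the pattern, using an illustrative constant that is not part of this patch:

  // Sketch: updating a __device__ __constant__ value by symbol, not by name.
  __device__ __constant__ int ngrid;   // 'ngrid' is an illustrative symbol

  void set_ngrid(int n)
  {
    // deprecated string form: cudaMemcpyToSymbol("ngrid", &n, sizeof(int));
    cudaMemcpyToSymbol(ngrid, &n, sizeof(int));  // count = size of the symbol's type
  }

The symbol form also lets the compiler verify that the symbol exists, which the string form could only catch at run time.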
-void Cuda_PPPM_Setup_fkxyz_vg(int nx_pppma,int ny_pppma,int nz_pppma,PPPM_FLOAT unitkx,PPPM_FLOAT unitky,PPPM_FLOAT unitkz,PPPM_FLOAT g_ewald) +void Cuda_PPPM_Setup_fkxyz_vg(int nx_pppma, int ny_pppma, int nz_pppma, PPPM_FLOAT unitkx, PPPM_FLOAT unitky, PPPM_FLOAT unitkz, PPPM_FLOAT g_ewald) { dim3 grid; dim3 threads; - grid.x=nz_pppma; - grid.y=ny_pppma; - grid.z=1; - threads.x=nx_pppma; - threads.y=1; - threads.z=1; - setup_fkxyz_vg<<<grid,threads,0>>>(unitkx,unitky,unitkz,g_ewald); + grid.x = nz_pppma; + grid.y = ny_pppma; + grid.z = 1; + threads.x = nx_pppma; + threads.y = 1; + threads.z = 1; + setup_fkxyz_vg <<< grid, threads, 0>>>(unitkx, unitky, unitkz, g_ewald); cudaThreadSynchronize(); - + CUT_CHECK_ERROR("ERROR-CUDA Cuda_PPPM_Setup_fkxyz_vg "); } -void Cuda_PPPM_setup_greensfn(int nx_pppma,int ny_pppma,int nz_pppma,PPPM_FLOAT unitkx,PPPM_FLOAT unitky,PPPM_FLOAT unitkz,PPPM_FLOAT g_ewald, -int nbx,int nby,int nbz,PPPM_FLOAT xprd,PPPM_FLOAT yprd,PPPM_FLOAT zprd_slab) +void Cuda_PPPM_setup_greensfn(int nx_pppma, int ny_pppma, int nz_pppma, PPPM_FLOAT unitkx, PPPM_FLOAT unitky, PPPM_FLOAT unitkz, PPPM_FLOAT g_ewald, + int nbx, int nby, int nbz, PPPM_FLOAT xprd, PPPM_FLOAT yprd, PPPM_FLOAT zprd_slab) { dim3 grid; dim3 threads; - grid.x=nz_pppma; - grid.y=ny_pppma; - grid.z=1; - threads.x=nx_pppma; - threads.y=1; - threads.z=1; - setup_greensfn<<<grid,threads,0>>>(unitkx,unitky,unitkz,g_ewald,nbx,nby,nbz,xprd,yprd, zprd_slab); + grid.x = nz_pppma; + grid.y = ny_pppma; + grid.z = 1; + threads.x = nx_pppma; + threads.y = 1; + threads.z = 1; + setup_greensfn <<< grid, threads, 0>>>(unitkx, unitky, unitkz, g_ewald, nbx, nby, nbz, xprd, yprd, zprd_slab); cudaThreadSynchronize(); CUT_CHECK_ERROR("ERROR-CUDA Cuda_PPPM_Setup_greensfn "); } -void poisson_scale(int nx_pppma,int ny_pppma,int nz_pppma) +void poisson_scale(int nx_pppma, int ny_pppma, int nz_pppma) { dim3 grid; dim3 threads; - grid.x=nz_pppma; - grid.y=ny_pppma; - grid.z=1; - threads.x=nx_pppma; - threads.y=1; - threads.z=1; - poisson_scale_kernel<<<grid,threads,0>>>(); + grid.x = nz_pppma; + grid.y = ny_pppma; + grid.z = 1; + threads.x = nx_pppma; + threads.y = 1; + threads.z = 1; + poisson_scale_kernel <<< grid, threads, 0>>>(); CUT_CHECK_ERROR("ERROR-CUDA poisson_scale "); } -void poisson_xgrad(int nx_pppma,int ny_pppma,int nz_pppma) +void poisson_xgrad(int nx_pppma, int ny_pppma, int nz_pppma) { dim3 grid; dim3 threads; - grid.x=nz_pppma; - grid.y=ny_pppma; - grid.z=1; - threads.x=nx_pppma; - threads.y=1; - threads.z=1; - poisson_xgrad_kernel<<<grid,threads,0>>>(); + grid.x = nz_pppma; + grid.y = ny_pppma; + grid.z = 1; + threads.x = nx_pppma; + threads.y = 1; + threads.z = 1; + poisson_xgrad_kernel <<< grid, threads, 0>>>(); CUT_CHECK_ERROR("ERROR-CUDA poisson_xgrad "); } -void poisson_ygrad(int nx_pppma,int ny_pppma,int nz_pppma) +void poisson_ygrad(int nx_pppma, int ny_pppma, int nz_pppma) { dim3 grid; dim3 threads; - grid.x=nz_pppma; - grid.y=ny_pppma; - grid.z=1; - threads.x=nx_pppma; - threads.y=1; - threads.z=1; - poisson_ygrad_kernel<<<grid,threads,0>>>(); + grid.x = nz_pppma; + grid.y = ny_pppma; + grid.z = 1; + threads.x = nx_pppma; + threads.y = 1; + threads.z = 1; + poisson_ygrad_kernel <<< grid, threads, 0>>>(); CUT_CHECK_ERROR("ERROR-CUDA poisson_ygrad "); } -void poisson_zgrad(int nx_pppma,int ny_pppma,int nz_pppma) +void poisson_zgrad(int nx_pppma, int ny_pppma, int nz_pppma) { dim3 grid; dim3 threads; - grid.x=nz_pppma; - grid.y=ny_pppma; - grid.z=1; - threads.x=nx_pppma; - threads.y=1; - threads.z=1; - poisson_zgrad_kernel<<<grid,threads,0>>>(); + grid.x = nz_pppma; + grid.y = ny_pppma; + grid.z = 1; + threads.x = nx_pppma; + threads.y = 1; + threads.z = 1; + poisson_zgrad_kernel <<< grid, threads, 0>>>(); CUT_CHECK_ERROR("ERROR-CUDA poisson_zgrad "); } -void poisson_vdx_brick(int ihi,int ilo,int jhi,int jlo,int khi,int klo,int nx_pppma,int ny_pppma,int nz_pppma) +void poisson_vdx_brick(int ihi, int ilo, int jhi, int jlo, int khi, int klo, int nx_pppma, 
int ny_pppma, int nz_pppma) { - + dim3 grid; dim3 threads; - grid.x=khi-klo+1; - grid.y=jhi-jlo+1; - grid.z=1; - threads.x=ihi-ilo+1; - threads.y=1; - threads.z=1; + grid.x = khi - klo + 1; + grid.y = jhi - jlo + 1; + grid.z = 1; + threads.x = ihi - ilo + 1; + threads.y = 1; + threads.z = 1; //printf("VDX_BRICK CUDA: %i %i %i\n",grid.x,grid.y,threads.x); - poisson_vdx_brick_kernel<<<grid,threads,0>>>(ilo,jlo,klo); + poisson_vdx_brick_kernel <<< grid, threads, 0>>>(ilo, jlo, klo); CUT_CHECK_ERROR("ERROR-CUDA poisson_vdxbrick "); cudaThreadSynchronize(); } -void poisson_vdy_brick(int ihi,int ilo,int jhi,int jlo,int khi,int klo,int nx_pppm,int ny_pppm,int nz_pppm) +void poisson_vdy_brick(int ihi, int ilo, int jhi, int jlo, int khi, int klo, int nx_pppm, int ny_pppm, int nz_pppm) { dim3 grid; dim3 threads; - grid.x=khi-klo+1; - grid.y=jhi-jlo+1; - grid.z=1; - threads.x=ihi-ilo+1; - threads.y=1; - threads.z=1; - poisson_vdy_brick_kernel<<<grid,threads,0>>>(ilo,jlo,klo); + grid.x = khi - klo + 1; + grid.y = jhi - jlo + 1; + grid.z = 1; + threads.x = ihi - ilo + 1; + threads.y = 1; + threads.z = 1; + poisson_vdy_brick_kernel <<< grid, threads, 0>>>(ilo, jlo, klo); CUT_CHECK_ERROR("ERROR-CUDA poisson_vdybrick "); cudaThreadSynchronize(); } -void poisson_vdz_brick(int ihi,int ilo,int jhi,int jlo,int khi,int klo,int nx_pppm,int ny_pppm,int nz_pppm) +void poisson_vdz_brick(int ihi, int ilo, int jhi, int jlo, int khi, int klo, int nx_pppm, int ny_pppm, int nz_pppm) { dim3 grid; dim3 threads; - grid.x=khi-klo+1; - grid.y=jhi-jlo+1; - grid.z=1; - threads.x=ihi-ilo+1; - threads.y=1; - threads.z=1; - poisson_vdz_brick_kernel<<<grid,threads,0>>>(ilo,jlo,klo); + grid.x = khi - klo + 1; + grid.y = jhi - jlo + 1; + grid.z = 1; + threads.x = ihi - ilo + 1; + threads.y = 1; + threads.z = 1; + poisson_vdz_brick_kernel <<< grid, threads, 0>>>(ilo, jlo, klo); CUT_CHECK_ERROR("ERROR-CUDA poisson_vdzbrick "); cudaThreadSynchronize(); } -void poisson_energy(int nxlo_fft,int nxhi_fft,int nylo_fft,int nyhi_fft,int nzlo_fft,int nzhi_fft,int vflag) +void poisson_energy(int nxlo_fft, int nxhi_fft, int nylo_fft, int nyhi_fft, int nzlo_fft, int nzhi_fft, int vflag) { //printf("VFLAG_GPU: %i\n",vflag); CUT_CHECK_ERROR("ERROR-CUDA poisson_energy start "); dim3 grid; dim3 threads; - grid.x=nzhi_fft-nzlo_fft+1; - grid.y=nyhi_fft-nylo_fft+1; - grid.z=1; - threads.x=nxhi_fft-nxlo_fft+1; - threads.y=1; - threads.z=1; - poisson_energy_kernel<<<grid,threads,threads.x*sizeof(ENERGY_FLOAT)>>>(nxlo_fft,nylo_fft,nzlo_fft,vflag); - + grid.x = nzhi_fft - nzlo_fft + 1; + grid.y = nyhi_fft - nylo_fft + 1; + grid.z = 1; + threads.x = nxhi_fft - nxlo_fft + 1; + threads.y = 1; + threads.z = 1; + poisson_energy_kernel <<< grid, threads, threads.x* sizeof(ENERGY_FLOAT)>>>(nxlo_fft, nylo_fft, nzlo_fft, vflag); + cudaThreadSynchronize(); CUT_CHECK_ERROR("ERROR-CUDA poisson_energy end "); }
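poisson_energy_kernel above and the sum_energy kernels below pass a dynamic shared-memory size as the third launch-configuration argument and accumulate per-cell energies in shared memory in two stages. For reference, a minimal single-block shared-memory reduction in the same spirit; the kernel name and the power-of-two block-size assumption are illustrative, not taken from this patch:

  // Sketch: tree reduction in dynamically sized shared memory.
  // Launch as: reduce1<<<1, n, n * sizeof(float)>>>(in, out);
  // where n == blockDim.x is a power of two.
  __global__ void reduce1(const float* in, float* out)
  {
    extern __shared__ float s[];               // sized at launch time
    int i = threadIdx.x;
    s[i] = in[i];
    __syncthreads();

    for(int stride = blockDim.x / 2; stride > 0; stride >>= 1) {
      if(i < stride) s[i] += s[i + stride];    // pairwise partial sums
      __syncthreads();
    }

    if(i == 0) *out = s[0];                    // thread 0 writes the total
  }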
-ENERGY_FLOAT sum_energy(void* cu_virial,void* cu_energy,int nx_pppma,int ny_pppma,int nz_pppma,int vflag,ENERGY_FLOAT* cpu_virial) +ENERGY_FLOAT sum_energy(void* cu_virial, void* cu_energy, int nx_pppma, int ny_pppma, int nz_pppma, int vflag, ENERGY_FLOAT* cpu_virial) { - ENERGY_FLOAT host_energy=0; + ENERGY_FLOAT host_energy = 0; dim3 grid; dim3 threads; - grid.x=nz_pppma; - grid.y=1; - grid.z=1; - threads.x=ny_pppma; - threads.y=1; - threads.z=1; - sum_energy_kernel1<<<grid,threads,ny_pppma*sizeof(ENERGY_FLOAT)>>>(vflag); + grid.x = nz_pppma; + grid.y = 1; + grid.z = 1; + threads.x = ny_pppma; + threads.y = 1; + threads.z = 1; + sum_energy_kernel1 <<< grid, threads, ny_pppma* sizeof(ENERGY_FLOAT)>>>(vflag); cudaThreadSynchronize(); CUT_CHECK_ERROR("ERROR-CUDA sumenergy_kernel1 "); - grid.x=1; - grid.y=1; - grid.z=1; - threads.x=nz_pppma; - threads.y=1; - threads.z=1; - sum_energy_kernel2<<<grid,threads,nz_pppma*sizeof(ENERGY_FLOAT)>>>(vflag); + grid.x = 1; + grid.y = 1; + grid.z = 1; + threads.x = nz_pppma; + threads.y = 1; + threads.z = 1; + sum_energy_kernel2 <<< grid, threads, nz_pppma* sizeof(ENERGY_FLOAT)>>>(vflag); cudaThreadSynchronize(); CUT_CHECK_ERROR("ERROR-CUDA sumenergy_kernel2 "); - cudaMemcpy((void*) (&host_energy), cu_energy, sizeof(ENERGY_FLOAT),cudaMemcpyDeviceToHost); + cudaMemcpy((void*)(&host_energy), cu_energy, sizeof(ENERGY_FLOAT), cudaMemcpyDeviceToHost); + if(vflag) - cudaMemcpy((void*) cpu_virial, (void*) cu_virial, 6*sizeof(ENERGY_FLOAT),cudaMemcpyDeviceToHost); + cudaMemcpy((void*) cpu_virial, (void*) cu_virial, 6 * sizeof(ENERGY_FLOAT), cudaMemcpyDeviceToHost); CUT_CHECK_ERROR("ERROR-CUDA sumenergy_memcopy"); - + return host_energy; } -void cuda_make_rho(cuda_shared_data* sdata,void* flag,PPPM_FLOAT* cu_density_intScale,int ihi,int ilo,int jhi,int jlo,int khi,int klo,void* cu_density_brick,void* cu_density_brick_int) +void cuda_make_rho(cuda_shared_data* sdata, void* flag, PPPM_FLOAT* cu_density_intScale, int ihi, int ilo, int jhi, int jlo, int khi, int klo, void* cu_density_brick, void* cu_density_brick_int) { CUT_CHECK_ERROR("cuda_make_rho begin"); - dim3 grid,threads; + dim3 grid, threads; int cpu_flag[3]; - grid.x=(sdata->atom.nlocal+31)/32; - grid.y=1; - grid.z=1; - threads.x=32; - threads.y=1; - threads.z=1; - int sharedmemsize=(32+32*(sdata->pppm.nupper-sdata->pppm.nlower+1)+sdata->pppm.order*(sdata->pppm.order/2-(1-sdata->pppm.order)/2+1))*sizeof(PPPM_FLOAT); - do - { - cpu_flag[0]=0; - cpu_flag[1]=0; - cpu_flag[2]=0; - cudaMemcpyToSymbol("density_intScale",cu_density_intScale,sizeof(PPPM_FLOAT*)); + grid.x = (sdata->atom.nlocal + 31) / 32; + grid.y = 1; + grid.z = 1; + threads.x = 32; + threads.y = 1; + threads.z = 1; + int sharedmemsize = (32 + 32 * (sdata->pppm.nupper - sdata->pppm.nlower + 1) + sdata->pppm.order * (sdata->pppm.order / 2 - (1 - sdata->pppm.order) / 2 + 1)) * sizeof(PPPM_FLOAT); + + do { + cpu_flag[0] = 0; + cpu_flag[1] = 0; + cpu_flag[2] = 0; + cudaMemcpyToSymbol(density_intScale, cu_density_intScale, sizeof(PPPM_FLOAT)); CUT_CHECK_ERROR("ERROR-CUDA make_rho pre Z"); - cudaMemset(flag,0,3*sizeof(int)); + cudaMemset(flag, 0, 3 * sizeof(int)); CUT_CHECK_ERROR("ERROR-CUDA make_rho pre A"); - cudaMemset(cu_density_brick,0,(khi-klo+1)*(jhi-jlo+1)*(ihi-ilo+1)*sizeof(PPPM_FLOAT)); + cudaMemset(cu_density_brick, 0, (khi - klo + 1) * (jhi - jlo + 1) * (ihi - ilo + 1)*sizeof(PPPM_FLOAT)); CUT_CHECK_ERROR("ERROR-CUDA make_rho pre B"); - cudaMemset(cu_density_brick_int,0,(khi-klo+1)*(jhi-jlo+1)*(ihi-ilo+1)*sizeof(int)); + cudaMemset(cu_density_brick_int, 0, (khi - klo + 1) * (jhi - jlo + 1) * (ihi - ilo + 1)*sizeof(int)); CUT_CHECK_ERROR("ERROR-CUDA make_rho pre C"); - make_rho_kernel<<<grid,threads,sharedmemsize>>>((int*) flag,32/(sdata->pppm.nupper-sdata->pppm.nlower+1)); + make_rho_kernel <<< grid, threads, sharedmemsize>>>((int*) flag, 32 / (sdata->pppm.nupper - sdata->pppm.nlower + 1)); cudaThreadSynchronize(); CUT_CHECK_ERROR("ERROR-CUDA make_rho A"); - cudaMemcpy((void*) &cpu_flag, flag, 3*sizeof(int),cudaMemcpyDeviceToHost); - if(cpu_flag[0]!=0) {(*cu_density_intScale)/=2; MYDBG(printf("PPPM_Cuda::cuda_make_rho: Decrease cu_density_intScale to: %e\n",*cu_density_intScale);)} - if((cpu_flag[0]==0)&&(cpu_flag[1]==0)) {(*cu_density_intScale)*=2; MYDBG(printf("PPPM_Cuda::cuda_make_rho: Increase cu_density_intScale to: %e\n",*cu_density_intScale);)} - /* if((*cu_density_intScale)>0xe0000000) - { - 
printf("Error Scaling\n"); - cpu_flag[0]=0; - cpu_flag[1]=1; - }*/ + cudaMemcpy((void*) &cpu_flag, flag, 3 * sizeof(int), cudaMemcpyDeviceToHost); + + if(cpu_flag[0] != 0) { + (*cu_density_intScale) /= 2; + MYDBG(printf("PPPM_Cuda::cuda_make_rho: Decrease cu_density_intScale to: %e\n", *cu_density_intScale);) + } + if((cpu_flag[0] == 0) && (cpu_flag[1] == 0)) { + (*cu_density_intScale) *= 2; + MYDBG(printf("PPPM_Cuda::cuda_make_rho: Increase cu_density_intScale to: %e\n", *cu_density_intScale);) + } + /* if((*cu_density_intScale)>0xe0000000) + { + printf("Error Scaling\n"); + cpu_flag[0]=0; + cpu_flag[1]=1; + }*/ CUT_CHECK_ERROR("ERROR-CUDA make_rho B"); - } while((cpu_flag[0]!=0)||(cpu_flag[1]==0)); - - - grid.x=khi-klo+1; - grid.y=jhi-jlo+1; - threads.x=ihi-ilo+1; - scale_rho_kernel<<<grid,threads,0>>>(); + } while((cpu_flag[0] != 0) || (cpu_flag[1] == 0)); + + + grid.x = khi - klo + 1; + grid.y = jhi - jlo + 1; + threads.x = ihi - ilo + 1; + scale_rho_kernel <<< grid, threads, 0>>>(); cudaThreadSynchronize(); CUT_CHECK_ERROR("ERROR-CUDA make_rho_scale"); } -int cuda_particle_map(cuda_shared_data* sdata,void* flag) +int cuda_particle_map(cuda_shared_data* sdata, void* flag) { - dim3 grid,threads; + dim3 grid, threads; int cpu_flag; - grid.x=(sdata->atom.nlocal+31)/32; - grid.y=1; - grid.z=1; - threads.x=32; - threads.y=1; - threads.z=1; + grid.x = (sdata->atom.nlocal + 31) / 32; + grid.y = 1; + grid.z = 1; + threads.x = 32; + threads.y = 1; + threads.z = 1; CUT_CHECK_ERROR("ERROR-CUDA particle_map ..pre"); - particle_map_kernel<<<grid,threads,0>>>((int*) flag); + particle_map_kernel <<< grid, threads, 0>>>((int*) flag); cudaThreadSynchronize(); CUT_CHECK_ERROR("ERROR-CUDA particle_map a"); - cudaMemcpy((void*) &cpu_flag, flag, sizeof(int),cudaMemcpyDeviceToHost); + cudaMemcpy((void*) &cpu_flag, flag, sizeof(int), cudaMemcpyDeviceToHost); CUT_CHECK_ERROR("ERROR-CUDA particle_map b"); return cpu_flag; } -void cuda_fieldforce(cuda_shared_data* sdata,void* flag) +void cuda_fieldforce(cuda_shared_data* sdata, void* flag) { - dim3 grid,threads; - grid.x=(sdata->atom.nlocal+31)/32; - grid.y=1; - grid.z=1; - threads.x=32; - threads.y=1; - threads.z=1; - int sharedmemsize=(32+3*32*(sdata->pppm.nupper-sdata->pppm.nlower+1)+sdata->pppm.order*(sdata->pppm.order/2-(1-sdata->pppm.order)/2+1))*sizeof(PPPM_FLOAT); - fieldforce_kernel<<<grid,threads,sharedmemsize>>> - (sdata->pppm.nupper-sdata->pppm.nlower+1,32/(sdata->pppm.nupper-sdata->pppm.nlower+1),(int*) flag); + dim3 grid, threads; + grid.x = (sdata->atom.nlocal + 31) / 32; + grid.y = 1; + grid.z = 1; + threads.x = 32; + threads.y = 1; + threads.z = 1; + int sharedmemsize = (32 + 3 * 32 * (sdata->pppm.nupper - sdata->pppm.nlower + 1) + sdata->pppm.order * (sdata->pppm.order / 2 - (1 - sdata->pppm.order) / 2 + 1)) * sizeof(PPPM_FLOAT); + fieldforce_kernel <<< grid, threads, sharedmemsize>>> + (sdata->pppm.nupper - sdata->pppm.nlower + 1, 32 / (sdata->pppm.nupper - sdata->pppm.nlower + 1), (int*) flag); cudaThreadSynchronize(); CUT_CHECK_ERROR("ERROR-CUDA fieldforce"); } double cuda_slabcorr_energy(cuda_shared_data* sdata, ENERGY_FLOAT* buf, ENERGY_FLOAT* dev_buf) { - dim3 grid,threads; - grid.x=(sdata->atom.nlocal+31)/32; - grid.y=1; - grid.z=1; - threads.x=32; - threads.y=1; - threads.z=1; - slabcorr_energy_kernel<<<grid,threads,32*sizeof(ENERGY_FLOAT)>>>(dev_buf); + dim3 grid, threads; + grid.x = (sdata->atom.nlocal + 31) / 32; + grid.y = 1; + grid.z = 1; + threads.x = 32; + threads.y = 1; + threads.z = 1; + slabcorr_energy_kernel <<< grid, threads, 32* sizeof(ENERGY_FLOAT)>>>(dev_buf); cudaThreadSynchronize(); - cudaMemcpy((void*) buf, 
dev_buf, grid.x*sizeof(ENERGY_FLOAT),cudaMemcpyDeviceToHost); - - double dipole_all=0.0; - for(int i=0;i<grid.x;i++) - dipole_all+=buf[i]; - return dipole_all; + cudaMemcpy((void*) buf, dev_buf, grid.x * sizeof(ENERGY_FLOAT), cudaMemcpyDeviceToHost); + + double dipole_all = 0.0; + + for(int i = 0; i < grid.x; i++) + dipole_all += buf[i]; + + return dipole_all; } -void cuda_slabcorr_force(cuda_shared_data* sdata,F_FLOAT ffact) +void cuda_slabcorr_force(cuda_shared_data* sdata, F_FLOAT ffact) { - dim3 grid,threads; - grid.x=(sdata->atom.nlocal+31)/32; - grid.y=1; - grid.z=1; - threads.x=32; - threads.y=1; - threads.z=1; - slabcorr_force_kernel<<<grid,threads>>>(ffact); + dim3 grid, threads; + grid.x = (sdata->atom.nlocal + 31) / 32; + grid.y = 1; + grid.z = 1; + threads.x = 32; + threads.y = 1; + threads.z = 1; + slabcorr_force_kernel <<< grid, threads>>>(ffact); cudaThreadSynchronize(); } @@ -519,59 +528,59 @@ void sum_virial(double* host_virial) { } -void pppm_initfftdata(cuda_shared_data* sdata,PPPM_FLOAT* in,FFT_FLOAT* out) +void pppm_initfftdata(cuda_shared_data* sdata, PPPM_FLOAT* in, FFT_FLOAT* out) { - int nslow=sdata->pppm.nzhi_in-sdata->pppm.nzlo_in; - int nmid=sdata->pppm.nyhi_in-sdata->pppm.nylo_in; - int nfast=sdata->pppm.nxhi_in-sdata->pppm.nxlo_in; - int nrimz=MAX(sdata->pppm.nzlo_in-sdata->pppm.nzlo_out,sdata->pppm.nzhi_out-sdata->pppm.nzhi_in); - int nrimy=MAX(sdata->pppm.nylo_in-sdata->pppm.nylo_out,sdata->pppm.nyhi_out-sdata->pppm.nyhi_in); - int nrimx=MAX(sdata->pppm.nxlo_in-sdata->pppm.nxlo_out,sdata->pppm.nxhi_out-sdata->pppm.nxhi_in); + int nslow = sdata->pppm.nzhi_in - sdata->pppm.nzlo_in; + int nmid = sdata->pppm.nyhi_in - sdata->pppm.nylo_in; + int nfast = sdata->pppm.nxhi_in - sdata->pppm.nxlo_in; + int nrimz = MAX(sdata->pppm.nzlo_in - sdata->pppm.nzlo_out, sdata->pppm.nzhi_out - sdata->pppm.nzhi_in); + int nrimy = MAX(sdata->pppm.nylo_in - sdata->pppm.nylo_out, sdata->pppm.nyhi_out - sdata->pppm.nyhi_in); + int nrimx = MAX(sdata->pppm.nxlo_in - sdata->pppm.nxlo_out, sdata->pppm.nxhi_out - sdata->pppm.nxhi_in); dim3 grid; - grid.x=nslow+1; - grid.y=nmid+1; - grid.z=1; + grid.x = nslow + 1; + grid.y = nmid + 1; + grid.z = 1; dim3 threads; - threads.x=nfast+1; - threads.y=1; - threads.z=1; + threads.x = nfast + 1; + threads.y = 1; + threads.z = 1; cudaThreadSynchronize(); - initfftdata_core_kernel<<<grid,threads,0>>>(in,out); + initfftdata_core_kernel <<< grid, threads, 0>>>(in, out); cudaThreadSynchronize(); - grid.x=nrimz; - grid.y=nmid+1; - threads.x=nfast+1; - initfftdata_z_kernel<<<grid,threads,0>>>(in,out); + grid.x = nrimz; + grid.y = nmid + 1; + threads.x = nfast + 1; + initfftdata_z_kernel <<< grid, threads, 0>>>(in, out); cudaThreadSynchronize(); - grid.x=nslow+1; - grid.y=nrimy; - threads.x=nfast+1; - initfftdata_y_kernel<<<grid,threads,0>>>(in,out); + grid.x = nslow + 1; + grid.y = nrimy; + threads.x = nfast + 1; + initfftdata_y_kernel <<< grid, threads, 0>>>(in, out); cudaThreadSynchronize(); - grid.x=nslow+1; - grid.y=nmid+1; - threads.x=nrimx; - initfftdata_x_kernel<<<grid,threads,0>>>(in,out); + grid.x = nslow + 1; + grid.y = nmid + 1; + threads.x = nrimx; + initfftdata_x_kernel <<< grid, threads, 0>>>(in, out); cudaThreadSynchronize(); - grid.x=nrimz; - grid.y=nrimy; - threads.x=nfast+1; - initfftdata_yz_kernel<<<grid,threads,0>>>(in,out); + grid.x = nrimz; + grid.y = nrimy; + threads.x = nfast + 1; + initfftdata_yz_kernel <<< grid, threads, 0>>>(in, out); cudaThreadSynchronize(); - grid.x=nrimz; - grid.y=nmid+1; - threads.x=nrimx; - initfftdata_xz_kernel<<<grid,threads,0>>>(in,out); + grid.x = nrimz; + grid.y = nmid + 1; + threads.x = nrimx; + initfftdata_xz_kernel <<< grid, threads, 0>>>(in, out); cudaThreadSynchronize(); - grid.x=nslow+1; - grid.y=nrimy; - threads.x=nrimx; - initfftdata_xy_kernel<<<grid,threads,0>>>(in,out); + grid.x = nslow + 1; + grid.y = nrimy; + threads.x = nrimx; + initfftdata_xy_kernel <<< grid, threads, 0>>>(in, out); cudaThreadSynchronize(); - grid.x=nrimz; - grid.y=nrimy; - threads.x=nrimx; - initfftdata_xyz_kernel<<<grid,threads,0>>>(in,out); + grid.x = nrimz; + grid.y = nrimy; + 
threads.x = nrimx; + initfftdata_xyz_kernel <<< grid, threads, 0>>>(in, out); cudaThreadSynchronize(); CUT_CHECK_ERROR("ERROR-CUDA initfftdata_kernel"); }
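A closing note on the timing code scattered through these files: each kernel is bracketed by clock_gettime(CLOCK_REALTIME, ...) samples with a synchronization in between, and the tv_sec/tv_nsec arithmetic is repeated inline every time. cudaThreadSynchronize() is deprecated in current CUDA releases in favor of cudaDeviceSynchronize(). A small helper in the same style; the helper name is illustrative and not from this patch:

  #include <time.h>

  // Sketch: wall-clock interval in seconds between two CLOCK_REALTIME samples,
  // matching the inline tv_sec/tv_nsec arithmetic used above.
  static double elapsed_seconds(const timespec& t1, const timespec& t2)
  {
    return t2.tv_sec - t1.tv_sec + 1.0e-9 * (t2.tv_nsec - t1.tv_nsec);
  }

  // Usage on the host:
  //   timespec t1, t2;
  //   clock_gettime(CLOCK_REALTIME, &t1);
  //   some_kernel<<<grid, threads>>>();
  //   cudaDeviceSynchronize();   // wait for the kernel before sampling the clock
  //   clock_gettime(CLOCK_REALTIME, &t2);
  //   timings += elapsed_seconds(t1, t2);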