diff --git a/lib/gpu/cg_cmm.cu b/lib/gpu/cg_cmm.cu
index eef3c823db..b009c8618e 100644
--- a/lib/gpu/cg_cmm.cu
+++ b/lib/gpu/cg_cmm.cu
@@ -17,7 +17,7 @@
 #include "preprocessor.h"
 texture<float4> pos_tex;
 #ifndef _DOUBLE_DOUBLE
-__inline float4 fetch_pos(const int& i, const float4 *pos) 
+ucl_inline float4 fetch_pos(const int& i, const float4 *pos) // returns texel i of pos_tex; 'pos' is unused
   { return tex1Dfetch(pos_tex, i); }
 #endif
 #endif
diff --git a/lib/gpu/cg_cmm_long.cu b/lib/gpu/cg_cmm_long.cu
index 712656c765..cc6bf35760 100644
--- a/lib/gpu/cg_cmm_long.cu
+++ b/lib/gpu/cg_cmm_long.cu
@@ -18,9 +18,9 @@
 texture<float4> pos_tex;
 texture<float> q_tex;
 #ifndef _DOUBLE_DOUBLE
-__inline float4 fetch_pos(const int& i, const float4 *pos) 
+ucl_inline float4 fetch_pos(const int& i, const float4 *pos) // returns texel i of pos_tex; 'pos' is unused
   { return tex1Dfetch(pos_tex, i); }
-__inline float fetch_q(const int& i, const float *q) 
+ucl_inline float fetch_q(const int& i, const float *q) // returns texel i of q_tex; 'q' is unused
   { return tex1Dfetch(q_tex, i); }
 #endif
 #endif
diff --git a/lib/gpu/cg_cmm_msm.cu b/lib/gpu/cg_cmm_msm.cu
index 7f1aa812f4..190610cc2e 100644
--- a/lib/gpu/cg_cmm_msm.cu
+++ b/lib/gpu/cg_cmm_msm.cu
@@ -18,9 +18,9 @@
 texture<float4> pos_tex;
 texture<float> q_tex;
 #ifndef _DOUBLE_DOUBLE
-__inline float4 fetch_pos(const int& i, const float4 *pos) 
+ucl_inline float4 fetch_pos(const int& i, const float4 *pos) // returns texel i of pos_tex; 'pos' is unused
   { return tex1Dfetch(pos_tex, i); }
-__inline float fetch_q(const int& i, const float *q) 
+ucl_inline float fetch_q(const int& i, const float *q) // returns texel i of q_tex; 'q' is unused
   { return tex1Dfetch(q_tex, i); }
 #endif
 #endif
diff --git a/lib/gpu/charmm_long.cu b/lib/gpu/charmm_long.cu
index 5119e61947..84e3601ac8 100644
--- a/lib/gpu/charmm_long.cu
+++ b/lib/gpu/charmm_long.cu
@@ -18,9 +18,9 @@
 texture<float4> pos_tex;
 texture<float> q_tex;
 #ifndef _DOUBLE_DOUBLE
-__inline float4 fetch_pos(const int& i, const float4 *pos) 
+ucl_inline float4 fetch_pos(const int& i, const float4 *pos) // returns texel i of pos_tex; 'pos' is unused
   { return tex1Dfetch(pos_tex, i); }
-__inline float fetch_q(const int& i, const float *q) 
+ucl_inline float fetch_q(const int& i, const float *q) // returns texel i of q_tex; 'q' is unused
   { return tex1Dfetch(q_tex, i); }
 #endif
 #endif
diff --git a/lib/gpu/coul_long.cu b/lib/gpu/coul_long.cu
index 0857a1cc77..bf9d320e83 100644
--- a/lib/gpu/coul_long.cu
+++ b/lib/gpu/coul_long.cu
@@ -18,9 +18,9 @@
 texture<float4> pos_tex;
 texture<float> q_tex;
 #ifndef _DOUBLE_DOUBLE
-__inline float4 fetch_pos(const int& i, const float4 *pos) 
+ucl_inline float4 fetch_pos(const int& i, const float4 *pos) // returns texel i of pos_tex; 'pos' is unused
   { return tex1Dfetch(pos_tex, i); }
-__inline float fetch_q(const int& i, const float *q) 
+ucl_inline float fetch_q(const int& i, const float *q) // returns texel i of q_tex; 'q' is unused
   { return tex1Dfetch(q_tex, i); }
 #endif
 #endif
diff --git a/lib/gpu/ellipsoid_extra.h b/lib/gpu/ellipsoid_extra.h
index 540a032930..b8db7828a9 100644
--- a/lib/gpu/ellipsoid_extra.h
+++ b/lib/gpu/ellipsoid_extra.h
@@ -26,7 +26,7 @@ enum{SPHERE_SPHERE,SPHERE_ELLIPSE,ELLIPSE_SPHERE,ELLIPSE_ELLIPSE};
    dot product of 2 vectors
 ------------------------------------------------------------------------- */
 
-__inline numtyp gpu_dot3(const numtyp *v1, const numtyp *v2)
+ucl_inline numtyp gpu_dot3(const numtyp *v1, const numtyp *v2) // reads elements 0..2 of v1 and v2
 {
   return v1[0]*v2[0]+v1[1]*v2[1]+v1[2]*v2[2];
 }
@@ -35,7 +35,7 @@ __inline numtyp gpu_dot3(const numtyp *v1, const numtyp *v2)
    cross product of 2 vectors
 ------------------------------------------------------------------------- */
 
-__inline void gpu_cross3(const numtyp *v1, const numtyp *v2, numtyp *ans)
+ucl_inline void gpu_cross3(const numtyp *v1, const numtyp *v2, numtyp *ans)
 {
   ans[0] = v1[1]*v2[2]-v1[2]*v2[1];
   ans[1] = v1[2]*v2[0]-v1[0]*v2[2];
@@ -46,7 +46,7 @@ __inline void gpu_cross3(const numtyp *v1, const numtyp *v2, numtyp *ans)
    determinant of a matrix
 ------------------------------------------------------------------------- */
 
-__inline numtyp gpu_det3(const numtyp m[9])
+ucl_inline numtyp gpu_det3(const numtyp m[9])
 {
   numtyp ans = m[0]*m[4]*m[8] - m[0]*m[5]*m[7] - 
     m[3]*m[1]*m[8] + m[3]*m[2]*m[7] + 
@@ -58,7 +58,7 @@ __inline numtyp gpu_det3(const numtyp m[9])
    diagonal matrix times a full matrix
 ------------------------------------------------------------------------- */
 
-__inline void gpu_diag_times3(const numtyp4 shape, const numtyp m[9], 
+ucl_inline void gpu_diag_times3(const numtyp4 shape, const numtyp m[9], 
                               numtyp ans[9])
 {
   ans[0] = shape.x*m[0];
@@ -76,7 +76,7 @@ __inline void gpu_diag_times3(const numtyp4 shape, const numtyp m[9],
    add two matrices
 ------------------------------------------------------------------------- */
 
-__inline void gpu_plus3(const numtyp m[9], const numtyp m2[9], numtyp ans[9])
+ucl_inline void gpu_plus3(const numtyp m[9], const numtyp m2[9], numtyp ans[9])
 {
   ans[0] = m[0]+m2[0];
   ans[1] = m[1]+m2[1];
@@ -93,7 +93,7 @@ __inline void gpu_plus3(const numtyp m[9], const numtyp m2[9], numtyp ans[9])
    multiply the transpose of mat1 times mat2
 ------------------------------------------------------------------------- */
 
-__inline void gpu_transpose_times3(const numtyp m[9], const numtyp m2[9],
+ucl_inline void gpu_transpose_times3(const numtyp m[9], const numtyp m2[9],
                                    numtyp ans[9])
 {
   ans[0] = m[0]*m2[0]+m[3]*m2[3]+m[6]*m2[6];
@@ -111,7 +111,7 @@ __inline void gpu_transpose_times3(const numtyp m[9], const numtyp m2[9],
    row vector times matrix
 ------------------------------------------------------------------------- */
 
-__inline void gpu_row_times3(const numtyp *v, const numtyp m[9], numtyp *ans)
+ucl_inline void gpu_row_times3(const numtyp *v, const numtyp m[9], numtyp *ans)
 {
   ans[0] = m[0]*v[0]+v[1]*m[3]+v[2]*m[6];
   ans[1] = v[0]*m[1]+m[4]*v[1]+v[2]*m[7];
@@ -124,7 +124,7 @@ __inline void gpu_row_times3(const numtyp *v, const numtyp m[9], numtyp *ans)
    error_flag set to 2 if bad matrix inversion attempted
 ------------------------------------------------------------------------- */
 
-__inline void gpu_mldivide3(const numtyp m[9], const numtyp *v, numtyp *ans,
+ucl_inline void gpu_mldivide3(const numtyp m[9], const numtyp *v, numtyp *ans,
                             __global int *error_flag)
 {
   // create augmented matrix for pivoting
@@ -243,7 +243,7 @@ __inline void gpu_mldivide3(const numtyp m[9], const numtyp *v, numtyp *ans,
    quat = [w i j k]
 ------------------------------------------------------------------------- */
 
-__inline void gpu_quat_to_mat_trans(__global const numtyp4 *qif, const int qi, 
+ucl_inline void gpu_quat_to_mat_trans(__global const numtyp4 *qif, const int qi, 
                                     numtyp mat[9])
 {
   numtyp4 q=qif[qi];
@@ -276,7 +276,7 @@ __inline void gpu_quat_to_mat_trans(__global const numtyp4 *qif, const int qi,
    transposed matrix times diagonal matrix
 ------------------------------------------------------------------------- */
 
-__inline void gpu_transpose_times_diag3(const numtyp m[9],
+ucl_inline void gpu_transpose_times_diag3(const numtyp m[9],
                                         const numtyp4 d, numtyp ans[9])
 {
   ans[0] = m[0]*d.x;
@@ -294,7 +294,7 @@ __inline void gpu_transpose_times_diag3(const numtyp m[9],
    multiply mat1 times mat2
 ------------------------------------------------------------------------- */
 
-__inline void gpu_times3(const numtyp m[9], const numtyp m2[9],
+ucl_inline void gpu_times3(const numtyp m[9], const numtyp m2[9],
                          numtyp ans[9])
 {
   ans[0] = m[0]*m2[0] + m[1]*m2[3] + m[2]*m2[6];
@@ -312,7 +312,7 @@ __inline void gpu_times3(const numtyp m[9], const numtyp m2[9],
    Apply principal rotation generator about x to rotation matrix m
 ------------------------------------------------------------------------- */
 
-__inline void gpu_rotation_generator_x(const numtyp m[9], numtyp ans[9])
+ucl_inline void gpu_rotation_generator_x(const numtyp m[9], numtyp ans[9])
 {
   ans[0] = 0;
   ans[1] = -m[2];
@@ -329,7 +329,7 @@ __inline void gpu_rotation_generator_x(const numtyp m[9], numtyp ans[9])
    Apply principal rotation generator about y to rotation matrix m
 ------------------------------------------------------------------------- */
 
-__inline void gpu_rotation_generator_y(const numtyp m[9], numtyp ans[9])
+ucl_inline void gpu_rotation_generator_y(const numtyp m[9], numtyp ans[9])
 {
   ans[0] = m[2];
   ans[1] = 0;
@@ -346,7 +346,7 @@ __inline void gpu_rotation_generator_y(const numtyp m[9], numtyp ans[9])
    Apply principal rotation generator about z to rotation matrix m
 ------------------------------------------------------------------------- */
 
-__inline void gpu_rotation_generator_z(const numtyp m[9], numtyp ans[9])
+ucl_inline void gpu_rotation_generator_z(const numtyp m[9], numtyp ans[9])
 {
   ans[0] = -m[1];
   ans[1] = m[0];
@@ -363,7 +363,7 @@ __inline void gpu_rotation_generator_z(const numtyp m[9], numtyp ans[9])
    matrix times vector
 ------------------------------------------------------------------------- */
 
-__inline void gpu_times_column3(const numtyp m[9], const numtyp v[3],
+ucl_inline void gpu_times_column3(const numtyp m[9], const numtyp v[3],
                                 numtyp ans[3]) 
 {
   ans[0] = m[0]*v[0] + m[1]*v[1] + m[2]*v[2];
diff --git a/lib/gpu/gayberne.cu b/lib/gpu/gayberne.cu
index 27f6f7e5ba..f7e2b13b47 100644
--- a/lib/gpu/gayberne.cu
+++ b/lib/gpu/gayberne.cu
@@ -17,7 +17,7 @@
 #include "ellipsoid_extra.h"
 #endif
 
-__inline void compute_eta_torque(numtyp m[9],numtyp m2[9], const numtyp4 shape, 
+ucl_inline void compute_eta_torque(numtyp m[9],numtyp m2[9], const numtyp4 shape, 
                                  numtyp ans[9])
 {
   numtyp den = m[3]*m[2]*m[7]-m[0]*m[5]*m[7]-
diff --git a/lib/gpu/geryon/ucl_nv_kernel.h b/lib/gpu/geryon/ucl_nv_kernel.h
index 5a5f739d84..bdba8ff7ae 100644
--- a/lib/gpu/geryon/ucl_nv_kernel.h
+++ b/lib/gpu/geryon/ucl_nv_kernel.h
@@ -55,10 +55,7 @@ typedef struct _double4 double4;
 #define __local __shared__
 #define __global  
 #define atom_add atomicAdd
-
-#ifndef __inline
-#define __inline static __inline__ __device__ 
-#endif
+#define ucl_inline static __inline__ __device__ // function qualifier: file-local, inlined, device-side
 
 #endif
 
diff --git a/lib/gpu/lj.cu b/lib/gpu/lj.cu
index 075d020bd7..8d615af54f 100644
--- a/lib/gpu/lj.cu
+++ b/lib/gpu/lj.cu
@@ -17,7 +17,7 @@
 #include "preprocessor.h"
 texture<float4> pos_tex;
 #ifndef _DOUBLE_DOUBLE
-__inline float4 fetch_pos(const int& i, const float4 *pos) 
+ucl_inline float4 fetch_pos(const int& i, const float4 *pos) // returns texel i of pos_tex; 'pos' is unused
   { return tex1Dfetch(pos_tex, i); }
 #endif
 #endif
diff --git a/lib/gpu/lj96.cu b/lib/gpu/lj96.cu
index 3e0f1d2e38..a4129e289b 100644
--- a/lib/gpu/lj96.cu
+++ b/lib/gpu/lj96.cu
@@ -17,7 +17,7 @@
 #include "preprocessor.h"
 texture<float4> pos_tex;
 #ifndef _DOUBLE_DOUBLE
-__inline float4 fetch_pos(const int& i, const float4 *pos) 
+ucl_inline float4 fetch_pos(const int& i, const float4 *pos) // returns texel i of pos_tex; 'pos' is unused
   { return tex1Dfetch(pos_tex, i); }
 #endif
 #endif
diff --git a/lib/gpu/lj_class2_long.cu b/lib/gpu/lj_class2_long.cu
index 80c4281db8..61e1e02640 100644
--- a/lib/gpu/lj_class2_long.cu
+++ b/lib/gpu/lj_class2_long.cu
@@ -18,9 +18,9 @@
 texture<float4> pos_tex;
 texture<float> q_tex;
 #ifndef _DOUBLE_DOUBLE
-__inline float4 fetch_pos(const int& i, const float4 *pos) 
+ucl_inline float4 fetch_pos(const int& i, const float4 *pos) // returns texel i of pos_tex; 'pos' is unused
   { return tex1Dfetch(pos_tex, i); }
-__inline float fetch_q(const int& i, const float *q) 
+ucl_inline float fetch_q(const int& i, const float *q) // returns texel i of q_tex; 'q' is unused
   { return tex1Dfetch(q_tex, i); }
 #endif
 #endif
diff --git a/lib/gpu/lj_coul.cu b/lib/gpu/lj_coul.cu
index 581f280009..24a0997863 100644
--- a/lib/gpu/lj_coul.cu
+++ b/lib/gpu/lj_coul.cu
@@ -18,9 +18,9 @@
 texture<float4> pos_tex;
 texture<float> q_tex;
 #ifndef _DOUBLE_DOUBLE
-__inline float4 fetch_pos(const int& i, const float4 *pos) 
+ucl_inline float4 fetch_pos(const int& i, const float4 *pos) // returns texel i of pos_tex; 'pos' is unused
   { return tex1Dfetch(pos_tex, i); }
-__inline float fetch_q(const int& i, const float *q) 
+ucl_inline float fetch_q(const int& i, const float *q) // returns texel i of q_tex; 'q' is unused
   { return tex1Dfetch(q_tex, i); }
 #endif
 #endif
diff --git a/lib/gpu/lj_coul_long.cu b/lib/gpu/lj_coul_long.cu
index eec03fcb28..c9720fdc52 100644
--- a/lib/gpu/lj_coul_long.cu
+++ b/lib/gpu/lj_coul_long.cu
@@ -18,9 +18,9 @@
 texture<float4> pos_tex;
 texture<float> q_tex;
 #ifndef _DOUBLE_DOUBLE
-__inline float4 fetch_pos(const int& i, const float4 *pos) 
+ucl_inline float4 fetch_pos(const int& i, const float4 *pos) // returns texel i of pos_tex; 'pos' is unused
   { return tex1Dfetch(pos_tex, i); }
-__inline float fetch_q(const int& i, const float *q) 
+ucl_inline float fetch_q(const int& i, const float *q) // returns texel i of q_tex; 'q' is unused
   { return tex1Dfetch(q_tex, i); }
 #endif
 #endif
diff --git a/lib/gpu/lj_expand.cu b/lib/gpu/lj_expand.cu
index c173d0e8ca..a2a905cf78 100644
--- a/lib/gpu/lj_expand.cu
+++ b/lib/gpu/lj_expand.cu
@@ -17,7 +17,7 @@
 #include "preprocessor.h"
 texture<float4> pos_tex;
 #ifndef _DOUBLE_DOUBLE
-__inline float4 fetch_pos(const int& i, const float4 *pos) 
+ucl_inline float4 fetch_pos(const int& i, const float4 *pos) // returns texel i of pos_tex; 'pos' is unused
   { return tex1Dfetch(pos_tex, i); }
 #endif
 #endif
diff --git a/lib/gpu/morse.cu b/lib/gpu/morse.cu
index 3dfa643fa6..a1a42582ad 100644
--- a/lib/gpu/morse.cu
+++ b/lib/gpu/morse.cu
@@ -17,7 +17,7 @@
 #include "preprocessor.h"
 texture<float4> pos_tex;
 #ifndef _DOUBLE_DOUBLE
-__inline float4 fetch_pos(const int& i, const float4 *pos) 
+ucl_inline float4 fetch_pos(const int& i, const float4 *pos) // returns texel i of pos_tex; 'pos' is unused
   { return tex1Dfetch(pos_tex, i); }
 #endif
 #endif
diff --git a/lib/gpu/neighbor_gpu.cu b/lib/gpu/neighbor_gpu.cu
index 3142e88ab1..98ae06b9e6 100644
--- a/lib/gpu/neighbor_gpu.cu
+++ b/lib/gpu/neighbor_gpu.cu
@@ -18,7 +18,7 @@
 #include "preprocessor.h"
 texture<float4> neigh_tex;
 #ifndef _DOUBLE_DOUBLE
-__inline float4 fetch_pos(const int& i, const float4 *pos) 
+ucl_inline float4 fetch_pos(const int& i, const float4 *pos) // returns texel i of neigh_tex; 'pos' is unused
   { return tex1Dfetch(neigh_tex, i); }
 #endif
 #endif
diff --git a/lib/gpu/pppm.cu b/lib/gpu/pppm.cu
index cb93ac8076..bbc3d8e454 100644
--- a/lib/gpu/pppm.cu
+++ b/lib/gpu/pppm.cu
@@ -18,9 +18,9 @@
 texture<float4> pos_tex;
 texture<float> q_tex;
 #ifndef _DOUBLE_DOUBLE
-__inline float4 fetch_pos(const int& i, const float4 *pos) 
+ucl_inline float4 fetch_pos(const int& i, const float4 *pos) // returns texel i of pos_tex; 'pos' is unused
   { return tex1Dfetch(pos_tex, i); }
-__inline float fetch_q(const int& i, const float *q) 
+ucl_inline float fetch_q(const int& i, const float *q) // returns texel i of q_tex; 'q' is unused
   { return tex1Dfetch(q_tex, i); }
 #endif
 
diff --git a/lib/gpu/preprocessor.h b/lib/gpu/preprocessor.h
index 78f73cd327..569a34ab79 100644
--- a/lib/gpu/preprocessor.h
+++ b/lib/gpu/preprocessor.h
@@ -100,8 +100,8 @@
 #define MAX_BIO_SHARED_TYPES 128
 
 #ifdef _DOUBLE_DOUBLE
-__inline double4 fetch_pos(const int& i, const double4 *pos) { return pos[i]; }
-__inline double fetch_q(const int& i, const double *q) { return q[i]; }
+ucl_inline double4 fetch_pos(const int& i, const double4 *pos) { return pos[i]; } // _DOUBLE_DOUBLE variant: direct load of pos[i]
+ucl_inline double fetch_q(const int& i, const double *q) { return q[i]; } // _DOUBLE_DOUBLE variant: direct load of q[i]
 #endif
 
 #if (__CUDA_ARCH__ < 200)
@@ -134,7 +134,7 @@ typedef struct _double4 double4;
 #define __local __shared__
 #define __global  
 #define atom_add atomicAdd
-#define __inline static __inline__ __device__ 
+#define ucl_inline static __inline__ __device__ // function qualifier: file-local, inlined, device-side
 
 #endif
 
@@ -166,7 +166,7 @@ typedef struct _double4 double4;
 #define MAX_BIO_SHARED_TYPES 128
 
 #define __syncthreads() barrier(CLK_LOCAL_MEM_FENCE)
-#define __inline inline
+#define ucl_inline inline // plain inline in the branch that maps __syncthreads() to barrier()
 #define fetch_pos(i,y) x_[i]
 #define fetch_q(i,y) q_[i]
 
@@ -214,5 +214,5 @@ typedef struct _double4 double4;
 
 #define SBBITS 30
 #define NEIGHMASK 0x3FFFFFFF
-__inline int sbmask(int j) { return j >> SBBITS & 3; }
+ucl_inline int sbmask(int j) { return j >> SBBITS & 3; } // parsed as (j >> SBBITS) & 3: the two bits of j starting at bit SBBITS
 
diff --git a/lib/gpu/re_squared.cu b/lib/gpu/re_squared.cu
index fd6c6b0577..87a98d23be 100644
--- a/lib/gpu/re_squared.cu
+++ b/lib/gpu/re_squared.cu
@@ -17,7 +17,7 @@
 #include "ellipsoid_extra.h"
 #endif
 
-__inline numtyp det_prime(const numtyp m[9], const numtyp m2[9])
+ucl_inline numtyp det_prime(const numtyp m[9], const numtyp m2[9])
 {
   numtyp ans;
   ans = m2[0]*m[4]*m[8] - m2[0]*m[5]*m[7] -