diff --git a/lib/gpu/Makefile.xk7 b/lib/gpu/Makefile.xk7
new file mode 100644
index 0000000000..f4bd399e08
--- /dev/null
+++ b/lib/gpu/Makefile.xk7
@@ -0,0 +1,29 @@
+# /* ----------------------------------------------------------------------   
+#  Makefile for Cray XK7 Architecture supporting Hyper-Q with Proxy
+#  - Uses PrgEnv-gnu
+# ------------------------------------------------------------------------- */
+
+# CUDA toolkit root (read from CRAY_CUDATOOLKIT_DIR) and the CUDA compiler command.
+CUDA_HOME = $(CRAY_CUDATOOLKIT_DIR)
+NVCC = nvcc 
+# Target GPU architecture flag: sm_35, i.e. compute capability 3.5.
+CUDA_ARCH = -arch=sm_35
+# Precision define (NOTE(review): presumably mixed single/double; confirm), CUDA paths, nvcc flags.
+CUDA_PRECISION = -D_SINGLE_DOUBLE
+CUDA_INCLUDE = -I$(CUDA_HOME)/include
+CUDA_LIB = -L$(CUDA_HOME)/lib64
+CUDA_OPTS = -DUNIX -O3 -Xptxas -v --use_fast_math
+# Host compile command (CC plus defines) and host flags; bdver1 targets AMD family 15h CPUs.
+CUDR_CPP = CC -DCUDA_PROXY -DMPI_GERYON -DUCL_NO_EXIT -DMPICH_IGNORE_CXX_SEEK
+CUDR_OPTS = -O2 -march=bdver1 -ftree-vectorize 
+# Output directories (all the current directory), archiver, and shell.
+BIN_DIR = ./
+OBJ_DIR = ./
+LIB_DIR = ./
+AR = ar
+BSH = /bin/sh
+# CUDPP flags are left empty; the USE_CUDPP settings are commented out.
+CUDPP_OPT = # -DUSE_CUDPP -Icudpp_mini
+# NOTE(review): Nvidia.makefile is expected to consume the variables above - confirm.
+include Nvidia.makefile
+
diff --git a/lib/gpu/lal_base_ellipsoid.cpp b/lib/gpu/lal_base_ellipsoid.cpp
index 2b45b109db..fca3151b82 100644
--- a/lib/gpu/lal_base_ellipsoid.cpp
+++ b/lib/gpu/lal_base_ellipsoid.cpp
@@ -109,10 +109,10 @@ int BaseEllipsoidT::init_base(const int nlocal, const int nall,
     for (int j=i; j<ntypes; j++)
       if (_host_form[i][j]!=ELLIPSE_ELLIPSE)
         _multiple_forms=true;
-  if (_multiple_forms && host_nlocal>0) {
-    std::cerr << "Cannot use Gayberne with multiple forms and GPU neighbor.\n";
-    exit(1);
-  }
+  if (_multiple_forms && host_nlocal>0)
+    return -8;
+  if (_multiple_forms && gpu_nbor!=0)
+    return -9;
   
   if (_multiple_forms)
     ans->force.zero();
diff --git a/lib/gpu/lal_preprocessor.h b/lib/gpu/lal_preprocessor.h
index f681268f1c..16e4c29610 100644
--- a/lib/gpu/lal_preprocessor.h
+++ b/lib/gpu/lal_preprocessor.h
@@ -114,9 +114,9 @@
 #define THREADS_PER_ATOM 4
 #define THREADS_PER_CHARGE 8
 #define BLOCK_NBOR_BUILD 128
-#define BLOCK_PAIR 512
-#define BLOCK_BIO_PAIR 512
-#define BLOCK_ELLIPSE 256
+#define BLOCK_PAIR 256
+#define BLOCK_BIO_PAIR 256
+#define BLOCK_ELLIPSE 128
 #define MAX_SHARED_TYPES 11
 
 #ifdef _SINGLE_SINGLE
diff --git a/lib/gpu/lal_yukawa.cu b/lib/gpu/lal_yukawa.cu
index 1e644b8a64..ece54b854f 100644
--- a/lib/gpu/lal_yukawa.cu
+++ b/lib/gpu/lal_yukawa.cu
@@ -77,9 +77,9 @@ __kernel void k_yukawa(const __global numtyp4 *restrict x_,
         
       int mtype=itype*lj_types+jtype;
       if (rsq<coeff[mtype].z) {
-        numtyp r2inv = (numtyp)1.0/rsq;
-        numtyp r = ucl_rsqrt(r2inv);
-        numtyp rinv = 1.0/r;
+        numtyp r2inv = ucl_recip(rsq);
+        numtyp r = ucl_sqrt(rsq);
+        numtyp rinv = ucl_recip(r);
         numtyp screening = exp(-kappa*r);
         numtyp force = coeff[mtype].x*screening*(kappa + rinv)*r2inv;
         force*=factor_lj;
@@ -165,9 +165,9 @@ __kernel void k_yukawa_fast(const __global numtyp4 *restrict x_,
       numtyp rsq = delx*delx+dely*dely+delz*delz;
         
       if (rsq<coeff[mtype].z) {
-        numtyp r2inv = (numtyp)1.0/rsq;
-        numtyp r = ucl_rsqrt(r2inv);
-        numtyp rinv = 1.0/r;
+        numtyp r2inv = ucl_recip(rsq);
+        numtyp r = ucl_sqrt(rsq);
+        numtyp rinv = ucl_recip(r);
         numtyp screening = exp(-kappa*r);
         numtyp force = coeff[mtype].x*screening*(kappa + rinv)*r2inv;
         force*=factor_lj;