diff --git a/lib/gpu/re_squared.cpp b/lib/gpu/re_squared.cpp
index 83946dac98..a96454a1fa 100644
--- a/lib/gpu/re_squared.cpp
+++ b/lib/gpu/re_squared.cpp
@@ -96,12 +96,12 @@ int RESquaredT::init(const int ntypes, double **host_shape, double **host_well,
     
   // Allocate, cast and asynchronous memcpy of constant data
   // Copy data for bonded interactions
-  gamma_upsilon_mu.alloc(4,*(this->ucl_device),UCL_READ_ONLY);
+  special_lj.alloc(4,*(this->ucl_device),UCL_READ_ONLY);
   host_write[0]=static_cast<numtyp>(host_special_lj[0]);
   host_write[1]=static_cast<numtyp>(host_special_lj[1]);
   host_write[2]=static_cast<numtyp>(host_special_lj[2]);
   host_write[3]=static_cast<numtyp>(host_special_lj[3]);
-  ucl_copy(gamma_upsilon_mu,host_write,4,false);
+  ucl_copy(special_lj,host_write,4,false);
 
   lshape.alloc(ntypes,*(this->ucl_device),UCL_READ_ONLY);
   UCL_H_Vec<double> d_view;
@@ -131,7 +131,7 @@ int RESquaredT::init(const int ntypes, double **host_shape, double **host_well,
   
   _allocated=true;
   this->_max_bytes=sigma_epsilon.row_bytes()+this->cut_form.row_bytes()+
-                   lj1.row_bytes()+lj3.row_bytes()+gamma_upsilon_mu.row_bytes()+
+                   lj1.row_bytes()+lj3.row_bytes()+special_lj.row_bytes()+
                    lshape.row_bytes()+shape.row_bytes()+well.row_bytes();
 
   return 0;
@@ -159,7 +159,7 @@ void RESquaredT::clear() {
   shape.clear();
   well.clear();
   lshape.clear();
-  gamma_upsilon_mu.clear();
+  special_lj.clear();
   
   this->clear_base();
 }
@@ -207,7 +207,7 @@ void RESquaredT::loop(const bool _eflag, const bool _vflag) {
       this->k_ellipsoid.set_size(GX,BX);
       this->k_ellipsoid.run(&this->atom->dev_x.begin(),
        &this->atom->dev_quat.begin(), &this->shape.begin(), &this->well.begin(),
-       &this->gamma_upsilon_mu.begin(), &this->sigma_epsilon.begin(), 
+       &this->special_lj.begin(), &this->sigma_epsilon.begin(), 
        &this->_lj_types, &this->lshape.begin(), &this->nbor->dev_nbor.begin(),
        &stride, &this->ans->dev_ans.begin(),&ainum,&this->ans->dev_engv.begin(),
        &this->dev_error.begin(), &eflag, &vflag, &this->_last_ellipse, &anall,
@@ -238,7 +238,7 @@ void RESquaredT::loop(const bool _eflag, const bool _vflag) {
       this->k_sphere_ellipsoid.set_size(GX,BX);
       this->k_sphere_ellipsoid.run(&this->atom->dev_x.begin(),
         &this->atom->dev_quat.begin(), &this->shape.begin(), 
-        &this->well.begin(), &this->gamma_upsilon_mu.begin(), 
+        &this->well.begin(), &this->special_lj.begin(), 
         &this->sigma_epsilon.begin(), &this->_lj_types, &this->lshape.begin(), 
         &this->nbor->dev_nbor.begin(), &stride, &this->ans->dev_ans.begin(),
         &this->ans->dev_engv.begin(), &this->dev_error.begin(), &eflag,
@@ -262,7 +262,7 @@ void RESquaredT::loop(const bool _eflag, const bool _vflag) {
       if (this->_shared_types) {
         this->k_lj_fast.set_size(GX,BX);
         this->k_lj_fast.run(&this->atom->dev_x.begin(), &this->lj1.begin(),
-          &this->lj3.begin(), &this->gamma_upsilon_mu.begin(), &stride,
+          &this->lj3.begin(), &this->special_lj.begin(), &stride,
           &this->nbor->dev_packed.begin(), &this->ans->dev_ans.begin(),
           &this->ans->dev_engv.begin(), &this->dev_error.begin(),
           &eflag, &vflag, &this->_last_ellipse, &ainum, &anall,
@@ -270,7 +270,7 @@ void RESquaredT::loop(const bool _eflag, const bool _vflag) {
       } else {
         this->k_lj.set_size(GX,BX);
         this->k_lj.run(&this->atom->dev_x.begin(), &this->lj1.begin(),
-          &this->lj3.begin(), &this->_lj_types, &this->gamma_upsilon_mu.begin(),
+          &this->lj3.begin(), &this->_lj_types, &this->special_lj.begin(),
           &stride, &this->nbor->dev_packed.begin(), &this->ans->dev_ans.begin(),
           &this->ans->dev_engv.begin(), &this->dev_error.begin(), &eflag,
           &vflag, &this->_last_ellipse, &ainum, &anall, 
@@ -287,7 +287,7 @@ void RESquaredT::loop(const bool _eflag, const bool _vflag) {
     this->k_ellipsoid.set_size(GX,BX);
     this->k_ellipsoid.run(&this->atom->dev_x.begin(), 
       &this->atom->dev_quat.begin(), &this->shape.begin(), &this->well.begin(), 
-      &this->gamma_upsilon_mu.begin(), &this->sigma_epsilon.begin(), 
+      &this->special_lj.begin(), &this->sigma_epsilon.begin(), 
       &this->_lj_types, &this->lshape.begin(), &this->nbor->dev_nbor.begin(),
       &stride, &this->ans->dev_ans.begin(), &ainum, 
       &this->ans->dev_engv.begin(), &this->dev_error.begin(),
diff --git a/lib/gpu/re_squared.cu b/lib/gpu/re_squared.cu
index 9d8c49b7f0..62b242ea48 100644
--- a/lib/gpu/re_squared.cu
+++ b/lib/gpu/re_squared.cu
@@ -41,7 +41,7 @@ __inline numtyp det_prime(const numtyp m[9], const numtyp m2[9])
 
 __kernel void kernel_ellipsoid(__global numtyp4* x_,__global numtyp4 *q,
                                __global numtyp4* shape, __global numtyp4* well, 
-                               __global numtyp *gum, __global numtyp2* sig_eps, 
+                               __global numtyp *splj, __global numtyp2* sig_eps, 
                                const int ntypes, __global numtyp *lshape, 
                                __global int *dev_nbor, const int stride, 
                                __global acctyp4 *ans, const int astride, 
@@ -54,10 +54,10 @@ __kernel void kernel_ellipsoid(__global numtyp4* x_,__global numtyp4 *q,
   int offset=tid%t_per_atom;
 
   __local numtyp sp_lj[4];
-  sp_lj[0]=gum[0];    
-  sp_lj[1]=gum[1];    
-  sp_lj[2]=gum[2];    
-  sp_lj[3]=gum[3];
+  sp_lj[0]=splj[0];    
+  sp_lj[1]=splj[1];    
+  sp_lj[2]=splj[2];    
+  sp_lj[3]=splj[3];
   
   __local numtyp b_alpha, cr60;
   b_alpha=(numtyp)45.0/(numtyp)56.0;
@@ -242,8 +242,7 @@ __kernel void kernel_ellipsoid(__global numtyp4* x_,__global numtyp4 *q,
       Ur = ((numtyp)1.0+b_alpha*tprod)*sprod/Ur;
       Ur = epsilon*Ur*pow(sigh,(numtyp)6.0)/(numtyp)2025.0;
 
-      if (eflag>0)
-        energy+=Ua+Ur;
+      energy+=Ua+Ur;
 
       // force
 
@@ -399,7 +398,7 @@ __kernel void kernel_ellipsoid(__global numtyp4* x_,__global numtyp4 *q,
 
         dUa = pbsu*(eta*dchi + deta*chi)-dh12*dspu;
         dUr = pbsr*(eta*dchi + deta*chi)-dh12*dspr;
-        tor.x += -(dUa*Ua+dUr*Ur);
+        tor.x -= (dUa*Ua+dUr*Ur);
       }
 
       {
@@ -425,7 +424,7 @@ __kernel void kernel_ellipsoid(__global numtyp4* x_,__global numtyp4 *q,
 
         dUa = pbsu*(eta*dchi + deta*chi)-dh12*dspu;
         dUr = pbsr*(eta*dchi + deta*chi)-dh12*dspr;
-        tor.y += -(dUa*Ua+dUr*Ur);
+        tor.y -= (dUa*Ua+dUr*Ur);
       }
 
       {
@@ -451,7 +450,7 @@ __kernel void kernel_ellipsoid(__global numtyp4* x_,__global numtyp4 *q,
 
         dUa = pbsu*(eta*dchi + deta*chi)-dh12*dspu;
         dUr = pbsr*(eta*dchi + deta*chi)-dh12*dspr;
-        tor.z += -(dUa*Ua+dUr*Ur);
+        tor.z -= (dUa*Ua+dUr*Ur);
       }
 
     } // for nbor
diff --git a/lib/gpu/re_squared.h b/lib/gpu/re_squared.h
index 85fc738d07..aadb365dc3 100644
--- a/lib/gpu/re_squared.h
+++ b/lib/gpu/re_squared.h
@@ -70,8 +70,8 @@ class RESquared : public BaseEllipsoid<numtyp, acctyp> {
   UCL_D_Vec<numtyp4> lj3;
   /// sigma_epsilon.x = sigma, sigma_epsilon.y = epsilon
   UCL_D_Vec<numtyp2> sigma_epsilon;
-  // 0 - gamma, 1-upsilon, 2-mu, 3-special_lj[0], 4-special_lj[1], ...
-  UCL_D_Vec<numtyp> gamma_upsilon_mu;
+  /// special lj 0-4
+  UCL_D_Vec<numtyp> special_lj;
   
   /// If atom type constants fit in shared memory, use fast kernels
   bool _shared_types;
diff --git a/lib/gpu/re_squared_lj.cu b/lib/gpu/re_squared_lj.cu
index 910c1e4ae3..febd880ba5 100644
--- a/lib/gpu/re_squared_lj.cu
+++ b/lib/gpu/re_squared_lj.cu
@@ -24,9 +24,320 @@
 #define NEIGHMASK 0x3FFFFFFF
 __inline int sbmask(int j) { return j >> SBBITS & 3; }
 
+
+__kernel void kernel_ellipsoid_sphere(__global numtyp4* x_,__global numtyp4 *q,
+                   __global numtyp4* shape, __global numtyp4* well, 
+                   __global numtyp *splj, __global numtyp2* sig_eps,
+                   const int ntypes, __global numtyp *lshape, 
+                   __global int *dev_nbor, const int stride, 
+                   __global acctyp4 *ans, const int astride, 
+                   __global acctyp *engv, __global int *err_flag, 
+                   const int eflag, const int vflag, const int inum,
+                   const int nall, const int t_per_atom) {
+  int tid=THREAD_ID_X;
+  int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
+  ii+=tid/t_per_atom;
+  int offset=tid%t_per_atom;
+
+  __local numtyp sp_lj[4];
+  sp_lj[0]=splj[0];    
+  sp_lj[1]=splj[1];    
+  sp_lj[2]=splj[2];    
+  sp_lj[3]=splj[3];
+  
+  __local numtyp b_alpha, cr60, solv_f_a, solv_f_r;
+  b_alpha=(numtyp)45.0/(numtyp)56.0;
+  cr60=pow((numtyp)60.0,(numtyp)1.0/(numtyp)3.0);    
+  solv_f_a = (numtyp)3.0/((numtyp)16.0*atan((numtyp)1.0)*-(numtyp)36.0);
+  solv_f_r = (numtyp)3.0/((numtyp)16.0*atan((numtyp)1.0)*(numtyp)2025.0);
+
+  acctyp energy=(acctyp)0;
+  acctyp4 f;
+  f.x=(acctyp)0;
+  f.y=(acctyp)0;
+  f.z=(acctyp)0;
+  acctyp4 tor;
+  tor.x=(acctyp)0;
+  tor.y=(acctyp)0;
+  tor.z=(acctyp)0;
+  acctyp virial[6];
+  for (int i=0; i<6; i++)
+    virial[i]=(acctyp)0;
+
+  if (ii<inum) {
+    __global int *nbor=dev_nbor+ii;
+    int i=*nbor;
+    nbor+=stride;
+    int numj=*nbor;
+    nbor+=stride;
+    __global int *nbor_end=nbor+mul24(stride,numj);
+    nbor+=mul24(offset,stride);
+    int n_stride=mul24(t_per_atom,stride);
+  
+    numtyp4 ix=x_[i];
+    int itype=ix.w;
+
+    numtyp a[9];       // Rotation matrix (lab->body)
+    numtyp aTe[9];     // A'*E
+    numtyp lA_0[9], lA_1[9], lA_2[9]; // -A*rotation generator (x,y, or z)
+    numtyp4 ishape;
+    
+    ishape=shape[itype];
+    
+    {
+      gpu_quat_to_mat_trans(q,i,a);
+      gpu_transpose_times_diag3(a,well[itype],aTe);
+      gpu_rotation_generator_x(a,lA_0);
+      gpu_rotation_generator_y(a,lA_1);
+      gpu_rotation_generator_z(a,lA_2);
+    }
+
+    numtyp factor_lj;
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      int j=*nbor;
+      factor_lj = sp_lj[sbmask(j)];
+      j &= NEIGHMASK;
+
+      numtyp4 jx=x_[j];
+      int jtype=jx.w;
+
+      // Compute r12
+      numtyp r[3], rhat[3];
+      numtyp rnorm;
+      r[0] = jx.x-ix.x;
+      r[1] = jx.y-ix.y;
+      r[2] = jx.z-ix.z;
+      rnorm = gpu_dot3(r,r);
+      rnorm = rsqrt(rnorm);
+      rhat[0] = r[0]*rnorm;
+      rhat[1] = r[1]*rnorm;
+      rhat[2] = r[2]*rnorm;
+
+      numtyp sigma, epsilon;
+      int mtype=mul24(ntypes,itype)+jtype;
+      sigma = sig_eps[mtype].x;
+      epsilon = sig_eps[mtype].y;
+
+      numtyp aTs[9]; 
+      numtyp4 scorrect;
+      numtyp half_sigma=sigma / (numtyp)2.0;
+      scorrect.x = ishape.x+half_sigma;
+      scorrect.y = ishape.y+half_sigma;
+      scorrect.z = ishape.z+half_sigma;
+      scorrect.x = scorrect.x * scorrect.x / (numtyp)2.0;
+      scorrect.y = scorrect.y * scorrect.y / (numtyp)2.0;
+      scorrect.z = scorrect.z * scorrect.z / (numtyp)2.0;
+      gpu_transpose_times_diag3(a,scorrect,aTs);
+      
+      // energy
+
+      numtyp gamma[9], s[3];
+      gpu_times3(aTs,a,gamma);
+      gpu_mldivide3(gamma,rhat,s,err_flag);
+
+      numtyp sigma12 = (numtyp)1.0/sqrt((numtyp)0.5*gpu_dot3(s,rhat));
+      numtyp temp[9], w[3];
+      gpu_times3(aTe,a,temp);
+      temp[0] += (numtyp)1.0;
+      temp[4] += (numtyp)1.0;
+      temp[8] += (numtyp)1.0;
+      gpu_mldivide3(temp,rhat,w,err_flag);
+
+      numtyp h12 = rnorm-sigma12;
+      numtyp chi = (numtyp)2.0*gpu_dot3(rhat,w);
+      numtyp sigh = sigma/h12;
+      numtyp tprod = chi*sigh;
+
+      numtyp Ua, Ur;
+      numtyp h12p3 = pow(h12,(numtyp)3.0);
+      numtyp sigmap3 = pow(sigma,(numtyp)3.0);
+      numtyp stemp = h12/(numtyp)2.0;
+      Ua = (ishape.x+stemp)*(ishape.y+stemp)*(ishape.z+stemp)*h12p3/(numtyp)8.0;
+      Ua = ((numtyp)1.0+(numtyp)3.0*tprod)*lshape[itype]/Ua;
+      Ua = epsilon*Ua*sigmap3*solv_f_a;
+    
+      stemp = h12/cr60;
+      Ur = (ishape.x+stemp)*(ishape.y+stemp)*(ishape.z+stemp)*h12p3/
+           (numtyp)60.0;
+      Ur = ((numtyp)1.0+b_alpha*tprod)*lshape[itype]/Ur;
+      Ur = epsilon*Ur*sigmap3*pow(sigh,(numtyp)6.0)*solv_f_r;
+
+      energy+=Ua+Ur;
+
+      // force
+
+      numtyp fourw[3], spr[3];
+      numtyp sec = sigma*chi;
+      numtyp sigma12p3 = pow(sigma12,(numtyp)3.0);
+      fourw[0] = (numtyp)4.0*w[0];
+      fourw[1] = (numtyp)4.0*w[1];
+      fourw[2] = (numtyp)4.0*w[2];
+      spr[0] = (numtyp)0.5*sigma12p3*s[0];
+      spr[1] = (numtyp)0.5*sigma12p3*s[1];
+      spr[2] = (numtyp)0.5*sigma12p3*s[2];
+
+      stemp = (numtyp)1.0/(ishape.x*(numtyp)2.0+h12)+
+              (numtyp)1.0/(ishape.y*(numtyp)2.0+h12)+
+              (numtyp)1.0/(ishape.z*(numtyp)2.0+h12)+
+              (numtyp)3.0/h12;
+      numtyp hsec = h12+(numtyp)3.0*sec;
+      numtyp dspu = (numtyp)1.0/h12-(numtyp)1.0/hsec+stemp;
+      numtyp pbsu = (numtyp)3.0*sigma/hsec;
+  
+      stemp = (numtyp)1.0/(ishape.x*cr60+h12)+
+              (numtyp)1.0/(ishape.y*cr60+h12)+
+              (numtyp)1.0/(ishape.z*cr60+h12)+
+              (numtyp)3.0/h12;
+      hsec = h12+b_alpha*sec;
+      numtyp dspr = (numtyp)7.0/h12-(numtyp)1.0/hsec+stemp;
+      numtyp pbsr = b_alpha*sigma/hsec;
+  
+      #pragma unroll
+      for (int i=0; i<3; i++) {
+        numtyp u[3];
+        u[0] = -rhat[i]*rhat[0];
+        u[1] = -rhat[i]*rhat[1];
+        u[2] = -rhat[i]*rhat[2];
+        u[i] += (numtyp)1.0;
+        u[0] /= rnorm;
+        u[1] /= rnorm;
+        u[2] /= rnorm;
+        numtyp dchi = gpu_dot3(u,fourw);
+        numtyp dh12 = rhat[i]+gpu_dot3(u,spr);
+        numtyp dUa = pbsu*dchi-dh12*dspu;
+        numtyp dUr = pbsr*dchi-dh12*dspr;
+        numtyp force=dUr*Ur+dUa*Ua;
+        if (i==0) {
+          f.x+=force;
+          if (vflag>0)
+            virial[0]+=-r[0]*force;
+        } else if (i==1) {
+          f.y+=force;
+          if (vflag>0) {
+            virial[1]+=-r[1]*force;
+            virial[3]+=-r[0]*force;
+          }
+        } else {
+          f.z+=force;
+          if (vflag>0) {
+            virial[2]+=-r[2]*force;
+            virial[4]+=-r[0]*force;
+            virial[5]+=-r[1]*force;
+          }
+        }
+
+      }
+    
+      // torque on i
+      numtyp fwae[3];
+      gpu_row_times3(fourw,aTe,fwae);
+      {
+        numtyp tempv[3], p[3], lAtwo[9];
+        gpu_times_column3(lA_0,rhat,p);
+        gpu_times_column3(lA_0,w,tempv);
+        numtyp dchi = -gpu_dot3(fwae,tempv);
+        gpu_times3(aTs,lA_0,lAtwo);
+        gpu_times_column3(lAtwo,spr,tempv);
+        numtyp dh12 = -gpu_dot3(s,tempv);
+        numtyp dUa = pbsu*dchi-dh12*dspu;
+        numtyp dUr = pbsr*dchi-dh12*dspr;
+        tor.x -= (dUa*Ua+dUr*Ur);
+      }
+
+      {
+        numtyp tempv[3], p[3], lAtwo[9];
+        gpu_times_column3(lA_1,rhat,p);
+        gpu_times_column3(lA_1,w,tempv);
+        numtyp dchi = -gpu_dot3(fwae,tempv);
+        gpu_times3(aTs,lA_1,lAtwo);
+        gpu_times_column3(lAtwo,spr,tempv);
+        numtyp dh12 = -gpu_dot3(s,tempv);
+        numtyp dUa = pbsu*dchi-dh12*dspu;
+        numtyp dUr = pbsr*dchi-dh12*dspr;
+        tor.y -= (dUa*Ua+dUr*Ur);
+      }
+
+      {
+        numtyp tempv[3], p[3], lAtwo[9];
+        gpu_times_column3(lA_2,rhat,p);
+        gpu_times_column3(lA_2,w,tempv);
+        numtyp dchi = -gpu_dot3(fwae,tempv);
+        gpu_times3(aTs,lA_2,lAtwo);
+        gpu_times_column3(lAtwo,spr,tempv);
+        numtyp dh12 = -gpu_dot3(s,tempv);
+        numtyp dUa = pbsu*dchi-dh12*dspu;
+        numtyp dUr = pbsr*dchi-dh12*dspr;
+        tor.z -= (dUa*Ua+dUr*Ur);
+      }
+
+    } // for nbor
+  } // if ii
+  
+  // Reduce answers
+  if (t_per_atom>1) {
+    __local acctyp red_acc[7][BLOCK_PAIR];
+    
+    red_acc[0][tid]=f.x;
+    red_acc[1][tid]=f.y;
+    red_acc[2][tid]=f.z;
+    red_acc[3][tid]=tor.x;
+    red_acc[4][tid]=tor.y;
+    red_acc[5][tid]=tor.z;
+
+    for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
+      if (offset < s) {
+        for (int r=0; r<6; r++)
+          red_acc[r][tid] += red_acc[r][tid+s];
+      }
+    }
+    
+    f.x=red_acc[0][tid];
+    f.y=red_acc[1][tid];
+    f.z=red_acc[2][tid];
+    tor.x=red_acc[3][tid];
+    tor.y=red_acc[4][tid];
+    tor.z=red_acc[5][tid];
+
+    if (eflag>0 || vflag>0) {
+      for (int r=0; r<6; r++)
+        red_acc[r][tid]=virial[r];
+      red_acc[6][tid]=energy;
+
+      for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
+        if (offset < s) {
+          for (int r=0; r<7; r++)
+            red_acc[r][tid] += red_acc[r][tid+s];
+        }
+      }
+    
+      for (int r=0; r<6; r++)
+        virial[r]=red_acc[r][tid];
+      energy=red_acc[6][tid];
+    }
+  }
+
+  // Store answers
+  if (ii<inum && offset==0) {
+    __global acctyp *ap1=engv+ii;
+    if (eflag>0) {
+      *ap1=energy;
+      ap1+=astride;
+    }
+    if (vflag>0) {
+      for (int i=0; i<6; i++) {
+        *ap1=virial[i];
+        ap1+=astride;
+      }
+    }
+    ans[ii]=f;
+    ans[ii+astride]=tor;
+  } // if ii
+
+}
+
 __kernel void kernel_sphere_ellipsoid(__global numtyp4 *x_,__global numtyp4 *q,
                                __global numtyp4* shape,__global numtyp4* well, 
-                               __global numtyp *gum, __global numtyp2* sig_eps, 
+                               __global numtyp *splj, __global numtyp2* sig_eps, 
                                const int ntypes, __global numtyp *lshape, 
                                __global int *dev_nbor, const int stride, 
                                __global acctyp4 *ans, __global acctyp *engv, 
@@ -39,10 +350,16 @@ __kernel void kernel_sphere_ellipsoid(__global numtyp4 *x_,__global numtyp4 *q,
   int offset=tid%t_per_atom;
 
   __local numtyp sp_lj[4];
-  sp_lj[0]=gum[3];    
-  sp_lj[1]=gum[4];    
-  sp_lj[2]=gum[5];    
-  sp_lj[3]=gum[6];    
+  sp_lj[0]=splj[0];    
+  sp_lj[1]=splj[1];    
+  sp_lj[2]=splj[2];    
+  sp_lj[3]=splj[3];
+  
+  __local numtyp b_alpha, cr60, solv_f_a, solv_f_r;
+  b_alpha=(numtyp)45.0/(numtyp)56.0;
+  cr60=pow((numtyp)60.0,(numtyp)1.0/(numtyp)3.0);    
+  solv_f_a = (numtyp)3.0/((numtyp)16.0*atan((numtyp)1.0)*-(numtyp)36.0);
+  solv_f_r = (numtyp)3.0/((numtyp)16.0*atan((numtyp)1.0)*(numtyp)2025.0);
 
   acctyp energy=(acctyp)0;
   acctyp4 f;
@@ -55,208 +372,175 @@ __kernel void kernel_sphere_ellipsoid(__global numtyp4 *x_,__global numtyp4 *q,
 
   if (ii<inum) {
     __global int *nbor=dev_nbor+ii;
-    int i=*nbor;
+    int j=*nbor;
     nbor+=stride;
     int numj=*nbor;
     nbor+=stride;
-    __global int *nbor_end=nbor+stride*numj;
+    __global int *nbor_end=nbor+mul24(stride,numj);
     nbor+=mul24(offset,stride);
     int n_stride=mul24(t_per_atom,stride);
   
-    numtyp4 ix=x_[i];
-    int itype=ix.w;
-      
-    numtyp oner=shape[itype].x;
-    numtyp one_well=well[itype].x;
-  
+    numtyp4 jx=x_[j];
+    int jtype=jx.w;
+
     numtyp factor_lj;
     for ( ; nbor<nbor_end; nbor+=n_stride) {
-  
-      int j=*nbor;
-      factor_lj = sp_lj[sbmask(j)];
-      j &= NEIGHMASK;
+      int i=*nbor;
+      factor_lj = sp_lj[sbmask(i)];
+      i &= NEIGHMASK;
 
-      numtyp4 jx=x_[j];
-      int jtype=jx.w;
+      numtyp4 ix=x_[i];
+      int itype=ix.w;
+
+      numtyp a[9];       // Rotation matrix (lab->body)
+      numtyp aTe[9];     // A'*E
+      numtyp4 ishape;
+    
+      ishape=shape[itype];
+      gpu_quat_to_mat_trans(q,i,a);
+      gpu_transpose_times_diag3(a,well[itype],aTe);
 
       // Compute r12
-      numtyp r12[3];
-      r12[0] = jx.x-ix.x;
-      r12[1] = jx.y-ix.y;
-      r12[2] = jx.z-ix.z;
-      numtyp ir = gpu_dot3(r12,r12);
+      numtyp r[3], rhat[3];
+      numtyp rnorm;
+      r[0] = jx.x-ix.x;
+      r[1] = jx.y-ix.y;
+      r[2] = jx.z-ix.z;
+      rnorm = gpu_dot3(r,r);
+      rnorm = rsqrt(rnorm);
+      rhat[0] = r[0]*rnorm;
+      rhat[1] = r[1]*rnorm;
+      rhat[2] = r[2]*rnorm;
 
-      ir = rsqrt(ir);
-      numtyp r = (numtyp)1.0/ir;
+      numtyp sigma, epsilon;
+      int mtype=mul24(ntypes,itype)+jtype;
+      sigma = sig_eps[mtype].x;
+      epsilon = sig_eps[mtype].y;
+
+      numtyp aTs[9]; 
+      numtyp4 scorrect;
+      numtyp half_sigma=sigma / (numtyp)2.0;
+      scorrect.x = ishape.x+half_sigma;
+      scorrect.y = ishape.y+half_sigma;
+      scorrect.z = ishape.z+half_sigma;
+      scorrect.x = scorrect.x * scorrect.x / (numtyp)2.0;
+      scorrect.y = scorrect.y * scorrect.y / (numtyp)2.0;
+      scorrect.z = scorrect.z * scorrect.z / (numtyp)2.0;
+      gpu_transpose_times_diag3(a,scorrect,aTs);
       
-      numtyp r12hat[3];
-      r12hat[0]=r12[0]*ir;
-      r12hat[1]=r12[1]*ir;
-      r12hat[2]=r12[2]*ir;
+      // energy
 
-      numtyp a2[9];
-      gpu_quat_to_mat_trans(q,j,a2);
-  
-      numtyp u_r, dUr[3], eta;
-      { // Compute U_r, dUr, eta, and teta
-        // Compute g12
-        numtyp g12[9];
-        {
-          {
-            numtyp g2[9];
-            gpu_diag_times3(shape[jtype],a2,g12);
-            gpu_transpose_times3(a2,g12,g2);
-            g12[0]=g2[0]+oner;
-            g12[4]=g2[4]+oner;
-            g12[8]=g2[8]+oner;
-            g12[1]=g2[1];
-            g12[2]=g2[2];
-            g12[3]=g2[3];
-            g12[5]=g2[5];
-            g12[6]=g2[6];
-            g12[7]=g2[7];    
-          }
-  
-          { // Compute U_r and dUr
+      numtyp gamma[9], s[3];
+      gpu_times3(aTs,a,gamma);
+      gpu_mldivide3(gamma,rhat,s,err_flag);
+
+      numtyp sigma12 = (numtyp)1.0/sqrt((numtyp)0.5*gpu_dot3(s,rhat));
+      numtyp temp[9], w[3];
+      gpu_times3(aTe,a,temp);
+      temp[0] += (numtyp)1.0;
+      temp[4] += (numtyp)1.0;
+      temp[8] += (numtyp)1.0;
+      gpu_mldivide3(temp,rhat,w,err_flag);
+
+      numtyp h12 = rnorm-sigma12;
+      numtyp chi = (numtyp)2.0*gpu_dot3(rhat,w);
+      numtyp sigh = sigma/h12;
+      numtyp tprod = chi*sigh;
+
+      numtyp Ua, Ur;
+      numtyp h12p3 = pow(h12,(numtyp)3.0);
+      numtyp sigmap3 = pow(sigma,(numtyp)3.0);
+      numtyp stemp = h12/(numtyp)2.0;
+      Ua = (ishape.x+stemp)*(ishape.y+stemp)*(ishape.z+stemp)*h12p3/(numtyp)8.0;
+      Ua = ((numtyp)1.0+(numtyp)3.0*tprod)*lshape[itype]/Ua;
+      Ua = epsilon*Ua*sigmap3*solv_f_a;
     
-            // Compute kappa
-            numtyp kappa[3];
-            gpu_mldivide3(g12,r12,kappa,err_flag);
+      stemp = h12/cr60;
+      Ur = (ishape.x+stemp)*(ishape.y+stemp)*(ishape.z+stemp)*h12p3/
+           (numtyp)60.0;
+      Ur = ((numtyp)1.0+b_alpha*tprod)*lshape[itype]/Ur;
+      Ur = epsilon*Ur*sigmap3*pow(sigh,(numtyp)6.0)*solv_f_r;
 
-            // -- kappa is now / r
-            kappa[0]*=ir;
-            kappa[1]*=ir;
-            kappa[2]*=ir;
+      energy+=Ua+Ur;
+
+      // force
+
+      numtyp fourw[3], spr[3];
+      numtyp sec = sigma*chi;
+      numtyp sigma12p3 = pow(sigma12,(numtyp)3.0);
+      fourw[0] = (numtyp)4.0*w[0];
+      fourw[1] = (numtyp)4.0*w[1];
+      fourw[2] = (numtyp)4.0*w[2];
+      spr[0] = (numtyp)0.5*sigma12p3*s[0];
+      spr[1] = (numtyp)0.5*sigma12p3*s[1];
+      spr[2] = (numtyp)0.5*sigma12p3*s[2];
+
+      stemp = (numtyp)1.0/(ishape.x*(numtyp)2.0+h12)+
+              (numtyp)1.0/(ishape.y*(numtyp)2.0+h12)+
+              (numtyp)1.0/(ishape.z*(numtyp)2.0+h12)+
+              (numtyp)3.0/h12;
+      numtyp hsec = h12+(numtyp)3.0*sec;
+      numtyp dspu = (numtyp)1.0/h12-(numtyp)1.0/hsec+stemp;
+      numtyp pbsu = (numtyp)3.0*sigma/hsec;
   
-            // energy
+      stemp = (numtyp)1.0/(ishape.x*cr60+h12)+
+              (numtyp)1.0/(ishape.y*cr60+h12)+
+              (numtyp)1.0/(ishape.z*cr60+h12)+
+              (numtyp)3.0/h12;
+      hsec = h12+b_alpha*sec;
+      numtyp dspr = (numtyp)7.0/h12-(numtyp)1.0/hsec+stemp;
+      numtyp pbsr = b_alpha*sigma/hsec;
   
-            // compute u_r and dUr
-            numtyp uslj_rsq;
-            {
-              // Compute distance of closest approach
-              numtyp h12, sigma12;
-              sigma12 = gpu_dot3(r12hat,kappa);
-              sigma12 = rsqrt((numtyp)0.5*sigma12);
-              h12 = r-sigma12;
-
-              // -- kappa is now ok
-              kappa[0]*=r;
-              kappa[1]*=r;
-              kappa[2]*=r;
-          
-              int mtype=mul24(ntypes,itype)+jtype;
-              numtyp sigma = sig_eps[mtype].x;
-              numtyp epsilon = sig_eps[mtype].y;
-              numtyp varrho = sigma/(h12+gum[0]*sigma);
-              numtyp varrho6 = varrho*varrho*varrho;
-              varrho6*=varrho6;
-              numtyp varrho12 = varrho6*varrho6;
-              u_r = (numtyp)4.0*epsilon*(varrho12-varrho6);
-
-              numtyp temp1 = ((numtyp)2.0*varrho12*varrho-varrho6*varrho)/sigma;
-              temp1 = temp1*(numtyp)24.0*epsilon;
-              uslj_rsq = temp1*sigma12*sigma12*sigma12*(numtyp)0.5;
-              numtyp temp2 = gpu_dot3(kappa,r12hat);
-              uslj_rsq = uslj_rsq*ir*ir;
-
-              dUr[0] = temp1*r12hat[0]+uslj_rsq*(kappa[0]-temp2*r12hat[0]);
-              dUr[1] = temp1*r12hat[1]+uslj_rsq*(kappa[1]-temp2*r12hat[1]);
-              dUr[2] = temp1*r12hat[2]+uslj_rsq*(kappa[2]-temp2*r12hat[2]);
-            }
+      #pragma unroll
+      for (int i=0; i<3; i++) {
+        numtyp u[3];
+        u[0] = -rhat[i]*rhat[0];
+        u[1] = -rhat[i]*rhat[1];
+        u[2] = -rhat[i]*rhat[2];
+        u[i] += (numtyp)1.0;
+        u[0] /= rnorm;
+        u[1] /= rnorm;
+        u[2] /= rnorm;
+        numtyp dchi = gpu_dot3(u,fourw);
+        numtyp dh12 = rhat[i]+gpu_dot3(u,spr);
+        numtyp dUa = pbsu*dchi-dh12*dspu;
+        numtyp dUr = pbsr*dchi-dh12*dspr;
+        numtyp force=dUr*Ur+dUa*Ua;
+        if (i==0) {
+          f.x+=force;
+          if (vflag>0)
+            virial[0]+=-r[0]*force;
+        } else if (i==1) {
+          f.y+=force;
+          if (vflag>0) {
+            virial[1]+=-r[1]*force;
+            virial[3]+=-r[0]*force;
+          }
+        } else {
+          f.z+=force;
+          if (vflag>0) {
+            virial[2]+=-r[2]*force;
+            virial[4]+=-r[0]*force;
+            virial[5]+=-r[1]*force;
           }
         }
-     
-        // Compute eta
-        {
-          eta = (numtyp)2.0*lshape[itype]*lshape[jtype];
-          numtyp det_g12 = gpu_det3(g12);
-          eta = pow(eta/det_g12,gum[1]);
-        }
-      }
-  
-      numtyp chi, dchi[3];
-      { // Compute chi and dchi
-
-        // Compute b12
-        numtyp b12[9];
-        {
-          numtyp b2[9];
-          gpu_diag_times3(well[jtype],a2,b12);
-          gpu_transpose_times3(a2,b12,b2);
-          b12[0]=b2[0]+one_well;
-          b12[4]=b2[4]+one_well;
-          b12[8]=b2[8]+one_well;
-          b12[1]=b2[1];
-          b12[2]=b2[2];
-          b12[3]=b2[3];
-          b12[5]=b2[5];
-          b12[6]=b2[6];
-          b12[7]=b2[7];    
-        }
-
-        // compute chi_12
-        numtyp iota[3];
-        gpu_mldivide3(b12,r12,iota,err_flag);
-        // -- iota is now iota/r
-        iota[0]*=ir;
-        iota[1]*=ir;
-        iota[2]*=ir;
-        chi = gpu_dot3(r12hat,iota);
-        chi = pow(chi*(numtyp)2.0,gum[2]);
-
-        // -- iota is now ok
-        iota[0]*=r;
-        iota[1]*=r;
-        iota[2]*=r;
-
-        numtyp temp1 = gpu_dot3(iota,r12hat);
-        numtyp temp2 = (numtyp)-4.0*ir*ir*gum[2]*pow(chi,(gum[2]-(numtyp)1.0)/
-                                                     gum[2]);
-        dchi[0] = temp2*(iota[0]-temp1*r12hat[0]);
-        dchi[1] = temp2*(iota[1]-temp1*r12hat[1]);
-        dchi[2] = temp2*(iota[2]-temp1*r12hat[2]);
-      }
-
-      numtyp temp2 = factor_lj*eta*chi;
-      if (eflag>0)
-        energy+=u_r*temp2;
-      numtyp temp1 = -eta*u_r*factor_lj;
-      if (vflag>0) {
-        r12[0]*=-1;
-        r12[1]*=-1;
-        r12[2]*=-1;
-        numtyp ft=temp1*dchi[0]-temp2*dUr[0];
-        f.x+=ft;
-        virial[0]+=r12[0]*ft;
-        ft=temp1*dchi[1]-temp2*dUr[1];
-        f.y+=ft;
-        virial[1]+=r12[1]*ft;
-        virial[3]+=r12[0]*ft;
-        ft=temp1*dchi[2]-temp2*dUr[2];
-        f.z+=ft;
-        virial[2]+=r12[2]*ft;
-        virial[4]+=r12[0]*ft;
-        virial[5]+=r12[1]*ft;
-      } else {
-        f.x+=temp1*dchi[0]-temp2*dUr[0];
-        f.y+=temp1*dchi[1]-temp2*dUr[1];
-        f.z+=temp1*dchi[2]-temp2*dUr[2];
+
       }
+    
     } // for nbor
   } // if ii
   
   // Reduce answers
   if (t_per_atom>1) {
-    __local acctyp red_acc[6][BLOCK_PAIR];
+    __local acctyp red_acc[7][BLOCK_PAIR];
     
     red_acc[0][tid]=f.x;
     red_acc[1][tid]=f.y;
     red_acc[2][tid]=f.z;
-    red_acc[3][tid]=energy;
 
     for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
       if (offset < s) {
-        for (int r=0; r<4; r++)
+        for (int r=0; r<3; r++)
           red_acc[r][tid] += red_acc[r][tid+s];
       }
     }
@@ -264,21 +548,22 @@ __kernel void kernel_sphere_ellipsoid(__global numtyp4 *x_,__global numtyp4 *q,
     f.x=red_acc[0][tid];
     f.y=red_acc[1][tid];
     f.z=red_acc[2][tid];
-    energy=red_acc[3][tid];
 
-    if (vflag>0) {
+    if (eflag>0 || vflag>0) {
       for (int r=0; r<6; r++)
         red_acc[r][tid]=virial[r];
+      red_acc[6][tid]=energy;
 
       for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
         if (offset < s) {
-          for (int r=0; r<6; r++)
+          for (int r=0; r<7; r++)
             red_acc[r][tid] += red_acc[r][tid+s];
         }
       }
     
       for (int r=0; r<6; r++)
         virial[r]=red_acc[r][tid];
+      energy=red_acc[6][tid];
     }
   }