diff --git a/lib/gpu/lal_lj_smooth.cpp b/lib/gpu/lal_lj_smooth.cpp index 59ebd0f636..4457ce79d2 100644 --- a/lib/gpu/lal_lj_smooth.cpp +++ b/lib/gpu/lal_lj_smooth.cpp @@ -51,16 +51,31 @@ int LJSMOOTHT::init(const int ntypes, double **host_ljsw0, double **host_ljsw1, double **host_ljsw2, double **host_ljsw3, double **host_ljsw4, double **cut_inner, double **cut_inner_sq) { + const int max_shared_types=this->device->max_shared_types(); + + int onetype=0; + #ifdef USE_OPENCL + if (maxspecial==0) + for (int i=1; i<ntypes; i++) + for (int j=i; j<ntypes; j++) + if (host_cutsq[i][j]>0) { + if (onetype>0) + onetype=-1; + else if (onetype==0) + onetype=i*max_shared_types+j; + } + if (onetype<0) onetype=0; + #endif + int success; success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split, - _screen,lj_smooth,"k_lj_smooth"); + _screen,lj_smooth,"k_lj_smooth",onetype); if (success!=0) return success; // If atom type constants fit in shared memory use fast kernel int lj_types=ntypes; shared_types=false; - int max_shared_types=this->device->max_shared_types(); if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) { lj_types=max_shared_types; shared_types=true; @@ -145,19 +160,9 @@ double LJSMOOTHT::host_memory_usage() const { // Calculate energies, forces, and torques // --------------------------------------------------------------------------- template <class numtyp, class acctyp> -int LJSMOOTHT::loop(const int _eflag, const int _vflag) { +int LJSMOOTHT::loop(const int eflag, const int vflag) { // Compute the block size and grid size to keep all cores busy const int BX=this->block_size(); - int eflag, vflag; - if (_eflag) - eflag=1; - else - eflag=0; - - if (_vflag) - vflag=1; - else - vflag=0; int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -166,8 +171,8 @@ int LJSMOOTHT::loop(const int _eflag, const int _vflag) { int nbor_pitch=this->nbor->nbor_pitch(); this->time_pair.start(); if (shared_types) { - this->k_pair_fast.set_size(GX,BX); - this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &ljsw, 
&ljsw0, &sp_lj, + this->k_pair_sel->set_size(GX,BX); + this->k_pair_sel->run(&this->atom->x, &lj1, &lj3, &ljsw, &ljsw0, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, @@ -180,6 +185,7 @@ int LJSMOOTHT::loop(const int _eflag, const int _vflag) { &ainum, &nbor_pitch, &this->_threads_per_atom); } this->time_pair.stop(); + return GX; } template class LJSMOOTH; diff --git a/lib/gpu/lal_lj_smooth.cu b/lib/gpu/lal_lj_smooth.cu index fa87e6fcee..346395513c 100644 --- a/lib/gpu/lal_lj_smooth.cu +++ b/lib/gpu/lal_lj_smooth.cu @@ -40,16 +40,20 @@ __kernel void k_lj_smooth(const __global numtyp4 *restrict x_, int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); - acctyp energy=(acctyp)0; + int n_stride; + local_allocate_store_pair(); + acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } if (ii0) { + if (EVFLAG && eflag) { numtyp e; if (rsq < lj1[mtype].w) e = r6inv * (lj3[mtype].x*r6inv - lj3[mtype].y) - lj3[mtype].z; @@ -108,7 +112,7 @@ __kernel void k_lj_smooth(const __global numtyp4 *restrict x_, //numtyp e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y); energy+=factor_lj*e; } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -119,9 +123,9 @@ __kernel void k_lj_smooth(const __global numtyp4 *restrict x_, } } // for nbor - store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, - ans,engv); } // if ii + store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); } __kernel void k_lj_smooth_fast(const __global numtyp4 *restrict x_, @@ -139,6 +143,7 @@ __kernel void k_lj_smooth_fast(const __global numtyp4 *restrict x_, int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); + #ifndef 
ONETYPE __local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[4]; @@ -146,40 +151,60 @@ __kernel void k_lj_smooth_fast(const __global numtyp4 *restrict x_, sp_lj[tid]=sp_lj_in[tid]; if (tid0) + if (EVFLAG && eflag) lj3[tid]=lj3_in[tid]; } + __syncthreads(); + #else + const numtyp lj1x=lj1_in[ONETYPE].x; + const numtyp lj1y=lj1_in[ONETYPE].y; + const numtyp cutsq=lj1_in[ONETYPE].z; + numtyp lj3x, lj3y, lj3z; + if (EVFLAG && eflag) { + lj3x=lj3_in[ONETYPE].x; + lj3y=lj3_in[ONETYPE].y; + lj3z=lj3_in[ONETYPE].z; + } + #endif + + int n_stride; + local_allocate_store_pair(); - acctyp energy=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; - - __syncthreads(); + acctyp energy, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } if (ii0) { + if (EVFLAG && eflag) { numtyp e; if (rsq < lj1[mtype].w) e = r6inv * (lj3[mtype].x*r6inv - lj3[mtype].y) - lj3[mtype].z; @@ -218,7 +243,7 @@ __kernel void k_lj_smooth_fast(const __global numtyp4 *restrict x_, energy+=factor_lj*e; } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -229,7 +254,7 @@ __kernel void k_lj_smooth_fast(const __global numtyp4 *restrict x_, } } // for nbor - store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, - ans,engv); } // if ii + store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); } diff --git a/lib/gpu/lal_lj_smooth_ext.cpp b/lib/gpu/lal_lj_smooth_ext.cpp index aaebbe1493..7312c8b257 100644 --- a/lib/gpu/lal_lj_smooth_ext.cpp +++ b/lib/gpu/lal_lj_smooth_ext.cpp @@ -57,7 +57,7 @@ int ljsmt_gpu_init(const int ntypes, double **cutsq, double **host_lj1, int init_ok=0; if (world_me==0) init_ok=LJSMTMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, - host_lj4, 
offset, special_lj, inum, nall, 300, + host_lj4, offset, special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen, host_ljsw0, host_ljsw1, host_ljsw2, host_ljsw3, host_ljsw4, cut_inner, cut_inner_sq); @@ -76,7 +76,7 @@ int ljsmt_gpu_init(const int ntypes, double **cutsq, double **host_lj1, } if (gpu_rank==i && world_me!=0) init_ok=LJSMTMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, - offset, special_lj, inum, nall, 300, maxspecial, + offset, special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen, host_ljsw0, host_ljsw1, host_ljsw2, host_ljsw3, host_ljsw4, cut_inner, cut_inner_sq); diff --git a/src/GPU/pair_lj_smooth_gpu.cpp b/src/GPU/pair_lj_smooth_gpu.cpp index 0203350507..282e189180 100644 --- a/src/GPU/pair_lj_smooth_gpu.cpp +++ b/src/GPU/pair_lj_smooth_gpu.cpp @@ -171,9 +171,10 @@ void PairLJSmoothGPU::init_style() int maxspecial=0; if (atom->molecular) maxspecial=atom->maxspecial; + int mnf = 5e-2 * neighbor->oneatom; int success = ljsmt_gpu_init(atom->ntypes+1, cutsq, lj1, lj2, lj3, lj4, offset, force->special_lj, atom->nlocal, - atom->nlocal+atom->nghost, 300, maxspecial, + atom->nlocal+atom->nghost, mnf, maxspecial, cell_size, gpu_mode, screen, ljsw0, ljsw1, ljsw2, ljsw3, ljsw4, cut_inner, cut_inner_sq); GPU_EXTRA::check_flag(success,error,world);