diff --git a/lib/gpu/crml_gpu_kernel2.cu b/lib/gpu/crml_gpu_kernel2.cu
index 731c50078c..5febfc77f2 100644
--- a/lib/gpu/crml_gpu_kernel2.cu
+++ b/lib/gpu/crml_gpu_kernel2.cu
@@ -280,6 +280,7 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp2 *ljd_in,
   f[tid].z=(acctyp)0;
   for (int o=0; o<6; o++)
     virial[tid][o]=(acctyp)0;
+  __syncthreads();
   
   if (ii<inum) {
     nbor=dev_nbor+ii;
@@ -293,7 +294,6 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp2 *ljd_in,
     qtmp=fetch_q(i,q_);
     itype=ix.w;
   }
-  __syncthreads();
 
   if (ii<inum) {
     for (int jj=offset; jj<numj; jj+=t_per_atom) {
@@ -409,7 +409,7 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp2 *ljd_in,
     }
     if (vflag>0) {
       for (int v=0; v<6; v++) {
-        *ap1=virial[tid][i];
+        *ap1=virial[tid][v];
         ap1+=inum;
       }
     }
diff --git a/lib/gpu/crml_gpu_memory2.cpp b/lib/gpu/crml_gpu_memory2.cpp
index fa2b5fe840..764fa5f48c 100644
--- a/lib/gpu/crml_gpu_memory2.cpp
+++ b/lib/gpu/crml_gpu_memory2.cpp
@@ -127,7 +127,7 @@ double CRML_GPU_Memory2T::host_memory_usage() const {
 // ---------------------------------------------------------------------------
 template <class numtyp, class acctyp>
 void CRML_GPU_Memory2T::loop(const bool _eflag, const bool _vflag) {
-  const int threads_per_atom=32;
+  const int threads_per_atom=16;
 
   // Compute the block size and grid size to keep all cores busy
   const int BX=this->block_size();