diff --git a/lib/cuda/Makefile.defaults b/lib/cuda/Makefile.defaults
index 590435446c..16445cca18 100644
--- a/lib/cuda/Makefile.defaults
+++ b/lib/cuda/Makefile.defaults
@@ -1,6 +1,6 @@
 
 #precision setting: 1 single, 2 double, 4 mixed
-precision ?= 2
+precision ?= 4
 
 #verbose setting: 0 no, 1 yes
 verbose ?= 1
diff --git a/lib/cuda/Makefile.lammps b/lib/cuda/Makefile.lammps
index a488404f31..cb98690689 100644
--- a/lib/cuda/Makefile.lammps
+++ b/lib/cuda/Makefile.lammps
@@ -1,5 +1,5 @@
 # Settings that the LAMMPS build will import when this package library is used
-CUDA_FLAGS := -I/usr/local/cuda/include -DUNIX  -DFFT_CUFFT -DCUDA_PRECISION=2 -DCUDA_ARCH=20 
+CUDA_FLAGS := -I/usr/local/cuda/include -DUNIX  -DFFT_CUFFT -DCUDA_PRECISION=1 -DX_PRECISION=2 -DV_PRECISION=2 -DCUDA_ARCH=20 
 CUDA_USRLIB_CONDITIONAL := -L/usr/local/cuda/lib -L/usr/local/cuda/lib64 -lcufft
  
 user-cuda_SYSINC = ${CUDA_FLAGS}
diff --git a/lib/cuda/cuda_wrapper.cu b/lib/cuda/cuda_wrapper.cu
index c8bda6ecc3..051d37d56c 100644
--- a/lib/cuda/cuda_wrapper.cu
+++ b/lib/cuda/cuda_wrapper.cu
@@ -254,7 +254,7 @@ void cuda_check_error(char* comment)
   printf("ERROR-CUDA %s %s\n", comment, cudaGetErrorString(cudaGetLastError()));
 }
 
-int CudaWrapper_CheckMemUseage()
+int CudaWrapper_CheckMemUsage()
 {
   size_t free, total;
   cudaMemGetInfo(&free, &total);
diff --git a/lib/cuda/cuda_wrapper_cu.h b/lib/cuda/cuda_wrapper_cu.h
index 5bcfaffd44..3f574463f9 100644
--- a/lib/cuda/cuda_wrapper_cu.h
+++ b/lib/cuda/cuda_wrapper_cu.h
@@ -36,7 +36,7 @@ extern "C" void  CudaWrapper_CopyData(void* dev_dest, void* dev_source, unsigned
 extern "C" void* CudaWrapper_AllocPinnedHostData(unsigned nbytes, bool mapped = false, bool writeCombind = false);
 extern "C" void  CudaWrapper_FreePinnedHostData(void* dev_data);
 extern "C" void  cuda_check_error(char* comment);
-extern "C" int   CudaWrapper_CheckMemUseage();
+extern "C" int   CudaWrapper_CheckMemUsage();
 extern "C" double CudaWrapper_CheckUploadTime(bool reset = false);
 extern "C" double CudaWrapper_CheckDownloadTime(bool reset = false);
 extern "C" double CudaWrapper_CheckCPUBufUploadTime(bool reset = false);
diff --git a/lib/gpu/lal_device.cpp b/lib/gpu/lal_device.cpp
index 0859062345..76000bd07b 100644
--- a/lib/gpu/lal_device.cpp
+++ b/lib/gpu/lal_device.cpp
@@ -492,6 +492,7 @@ void DeviceT::output_times(UCL_Timer &time_pair, Answer<numtyp,acctyp> &ans,
                            const double driver_overhead, 
                            const int threads_per_atom, FILE *screen) {
   double single[9], times[9];
+  int post_final=0;
 
   single[0]=atom.transfer_time()+ans.transfer_time();
   single[1]=nbor.time_nbor.total_seconds()+nbor.time_hybrid1.total_seconds()+
@@ -504,6 +505,9 @@ void DeviceT::output_times(UCL_Timer &time_pair, Answer<numtyp,acctyp> &ans,
   single[7]=ans.cpu_idle_time();
   single[8]=nbor.bin_time();
 
+  MPI_Finalized(&post_final);
+  if (post_final) return;
+
   MPI_Reduce(single,times,9,MPI_DOUBLE,MPI_SUM,0,_comm_replica);
 
   double my_max_bytes=max_bytes+atom.max_gpu_bytes();