From 86f7f6cd42a81143c5b72e71882bf0bb07ad8af8 Mon Sep 17 00:00:00 2001
From: "W. Michael Brown" <brownw@ornl.gov>
Date: Sun, 3 Apr 2011 11:51:26 -0400
Subject: [PATCH] Adding better error output to LAMMPS routines.

---
 lib/gpu/atomic_gpu_memory.cpp            | 20 ++++----
 lib/gpu/atomic_gpu_memory.h              | 17 +++++--
 lib/gpu/charge_gpu_memory.cpp            | 20 ++++----
 lib/gpu/charge_gpu_memory.h              | 17 +++++--
 lib/gpu/cmm_cut_gpu.cpp                  | 43 ++++++++--------
 lib/gpu/cmm_cut_gpu_memory.cpp           | 25 +++++-----
 lib/gpu/cmm_cut_gpu_memory.h             | 21 +++++---
 lib/gpu/cmmc_long_gpu.cpp                | 52 +++++++++-----------
 lib/gpu/cmmc_long_gpu_memory.cpp         | 33 +++++++------
 lib/gpu/cmmc_long_gpu_memory.h           | 25 ++++++----
 lib/gpu/cmmc_msm_gpu.cpp                 | 53 ++++++++++----------
 lib/gpu/cmmc_msm_gpu_memory.cpp          | 33 +++++++------
 lib/gpu/cmmc_msm_gpu_memory.h            | 25 ++++++----
 lib/gpu/crml_gpu.cpp                     | 62 +++++++++++-------------
 lib/gpu/crml_gpu_memory.cpp              | 11 +++--
 lib/gpu/crml_gpu_memory.h                | 29 ++++++-----
 lib/gpu/gb_gpu.cpp                       | 51 +++++++++----------
 lib/gpu/gb_gpu_memory.cpp                | 34 +++++++------
 lib/gpu/gb_gpu_memory.h                  | 27 +++++++----
 lib/gpu/geryon/nvc_device.h              |  4 +-
 lib/gpu/geryon/nvd_device.h              | 16 ++++--
 lib/gpu/geryon/ucl_d_mat.h               | 40 ++++++++-------
 lib/gpu/geryon/ucl_d_vec.h               | 33 ++++++++-----
 lib/gpu/geryon/ucl_h_mat.h               | 24 ++++++---
 lib/gpu/geryon/ucl_h_vec.h               | 38 ++++++++++-----
 lib/gpu/lj96_cut_gpu.cpp                 | 41 ++++++++--------
 lib/gpu/lj96_cut_gpu_memory.cpp          | 11 +++--
 lib/gpu/lj96_cut_gpu_memory.h            | 21 +++++---
 lib/gpu/lj_cut_gpu.cpp                   | 41 +++++++---------
 lib/gpu/lj_cut_gpu_memory.cpp            | 25 +++++-----
 lib/gpu/lj_cut_gpu_memory.h              | 21 +++++---
 lib/gpu/lj_expand_gpu.cpp                | 42 ++++++++--------
 lib/gpu/lj_expand_gpu_memory.cpp         | 25 +++++-----
 lib/gpu/lj_expand_gpu_memory.h           | 21 +++++---
 lib/gpu/ljc_cut_gpu.cpp                  | 50 +++++++++----------
 lib/gpu/ljc_cut_gpu_memory.cpp           | 29 ++++++-----
 lib/gpu/ljc_cut_gpu_memory.h             | 25 ++++++----
 lib/gpu/ljcl_cut_gpu.cpp                 | 52 +++++++++-----------
 lib/gpu/ljcl_cut_gpu_memory.cpp          | 11 +++--
 lib/gpu/ljcl_cut_gpu_memory.h            | 25 ++++++----
 lib/gpu/morse_gpu.cpp                    | 43 ++++++++--------
 lib/gpu/morse_gpu_memory.cpp             | 25 +++++-----
 lib/gpu/morse_gpu_memory.h               | 21 +++++---
 lib/gpu/pair_gpu_ans.h                   |  1 -
 lib/gpu/pair_gpu_device.cpp              | 52 ++++++++++++--------
 lib/gpu/pair_gpu_device.h                | 27 ++++++++---
 lib/gpu/pair_gpu_nbor.cpp                |  7 +++
 lib/gpu/pppm_gpu_memory.cpp              |  5 +-
 lib/gpu/pppm_l_gpu.cpp                   |  7 ++-
 src/GPU/Install.sh                       |  2 +
 src/GPU/fix_gpu.cpp                      |  5 +-
 src/GPU/gpu_extra.h                      | 47 ++++++++++++++++++
 src/GPU/pair_cg_cmm_coul_long_gpu.cpp    | 32 ++++++------
 src/GPU/pair_cg_cmm_coul_msm_gpu.cpp     | 32 ++++++------
 src/GPU/pair_cg_cmm_gpu.cpp              | 24 ++++-----
 src/GPU/pair_gayberne_gpu.cpp            | 30 ++++++------
 src/GPU/pair_lj96_cut_gpu.cpp            | 22 ++++-----
 src/GPU/pair_lj_charmm_coul_long_gpu.cpp | 38 +++++++--------
 src/GPU/pair_lj_cut_coul_cut_gpu.cpp     | 28 +++++------
 src/GPU/pair_lj_cut_coul_long_gpu.cpp    | 22 ++++-----
 src/GPU/pair_lj_cut_gpu.cpp              | 22 ++++-----
 src/GPU/pair_lj_cut_tgpu.cpp             | 16 +++---
 src/GPU/pair_lj_expand_gpu.cpp           | 13 ++---
 src/GPU/pair_morse_gpu.cpp               | 22 ++++-----
 src/GPU/pppm_gpu.cpp                     |  2 +-
 src/GPU/pppm_gpu_double.cpp              | 20 ++------
 src/GPU/pppm_gpu_single.cpp              | 20 ++------
 67 files changed, 983 insertions(+), 795 deletions(-)
 create mode 100755 src/GPU/gpu_extra.h

diff --git a/lib/gpu/atomic_gpu_memory.cpp b/lib/gpu/atomic_gpu_memory.cpp
index 8e06c36413..2e5d91b993 100644
--- a/lib/gpu/atomic_gpu_memory.cpp
+++ b/lib/gpu/atomic_gpu_memory.cpp
@@ -40,11 +40,11 @@ int AtomicGPUMemoryT::bytes_per_atom_atomic(const int max_nbors) const {
 }
 
 template <class numtyp, class acctyp>
-bool AtomicGPUMemoryT::init_atomic(const int nlocal, const int nall,
-                                   const int max_nbors, const int maxspecial,
-                                   const double cell_size,
-                                   const double gpu_split, FILE *_screen,
-                                   const char *pair_program) {
+int AtomicGPUMemoryT::init_atomic(const int nlocal, const int nall,
+                                  const int max_nbors, const int maxspecial,
+                                  const double cell_size,
+                                  const double gpu_split, FILE *_screen,
+                                  const char *pair_program) {
   nbor_time_avail=false;
   screen=_screen;
 
@@ -57,9 +57,11 @@ bool AtomicGPUMemoryT::init_atomic(const int nlocal, const int nall,
   if (host_nlocal>0)
     _gpu_host=1;
 
-  if (!device->init(*ans,false,false,nlocal,host_nlocal,nall,nbor,maxspecial,
-                    _gpu_host,max_nbors,cell_size,false))
-    return false;
+  int success=device->init(*ans,false,false,nlocal,host_nlocal,nall,nbor,
+                           maxspecial,_gpu_host,max_nbors,cell_size,false);
+  if (success!=0)
+    return success;
+    
   ucl_device=device->gpu;
   atom=&device->atom;
 
@@ -79,7 +81,7 @@ bool AtomicGPUMemoryT::init_atomic(const int nlocal, const int nall,
 
   _max_an_bytes=ans->gpu_bytes()+nbor->gpu_bytes();
 
-  return true;
+  return 0;
 }
 
 template <class numtyp, class acctyp>
diff --git a/lib/gpu/atomic_gpu_memory.h b/lib/gpu/atomic_gpu_memory.h
index 08728e9c75..c7117c0288 100644
--- a/lib/gpu/atomic_gpu_memory.h
+++ b/lib/gpu/atomic_gpu_memory.h
@@ -39,11 +39,18 @@ class AtomicGPUMemory {
   /// Clear any previous data and set up for a new LAMMPS run
   /** \param max_nbors initial number of rows in the neighbor matrix
     * \param cell_size cutoff + skin
-    * \param gpu_split fraction of particles handled by device **/
-  bool init_atomic(const int nlocal, const int nall, const int max_nbors,
-                   const int maxspecial, const double cell_size, 
-                   const double gpu_split, FILE *screen, 
-                   const char *pair_program);
+    * \param gpu_split fraction of particles handled by device
+    * 
+    * Returns:
+    * -  0 if successfull
+    * - -1 if fix gpu not found
+    * - -3 if there is an out of memory error
+    * - -4 if the GPU library was not compiled for GPU
+    * - -5 Double precision is not supported on card **/
+  int init_atomic(const int nlocal, const int nall, const int max_nbors,
+                  const int maxspecial, const double cell_size, 
+                  const double gpu_split, FILE *screen, 
+                  const char *pair_program);
 
   /// Estimate the overhead for GPU context changes and CPU driver
   void estimate_gpu_overhead();
diff --git a/lib/gpu/charge_gpu_memory.cpp b/lib/gpu/charge_gpu_memory.cpp
index 35891b5854..e78282064c 100644
--- a/lib/gpu/charge_gpu_memory.cpp
+++ b/lib/gpu/charge_gpu_memory.cpp
@@ -40,11 +40,11 @@ int ChargeGPUMemoryT::bytes_per_atom_atomic(const int max_nbors) const {
 }
 
 template <class numtyp, class acctyp>
-bool ChargeGPUMemoryT::init_atomic(const int nlocal, const int nall,
-                                   const int max_nbors, const int maxspecial,
-                                   const double cell_size,
-                                   const double gpu_split, FILE *_screen,
-                                   const char *pair_program) {
+int ChargeGPUMemoryT::init_atomic(const int nlocal, const int nall,
+                                  const int max_nbors, const int maxspecial,
+                                  const double cell_size,
+                                  const double gpu_split, FILE *_screen,
+                                  const char *pair_program) {
   nbor_time_avail=false;
   screen=_screen;
 
@@ -57,9 +57,11 @@ bool ChargeGPUMemoryT::init_atomic(const int nlocal, const int nall,
   if (host_nlocal>0)
     _gpu_host=1;
 
-  if (!device->init(*ans,true,false,nlocal,host_nlocal,nall,nbor,maxspecial,
-                    _gpu_host,max_nbors,cell_size,false))
-    return false;
+  int success=device->init(*ans,true,false,nlocal,host_nlocal,nall,nbor,
+                           maxspecial,_gpu_host,max_nbors,cell_size,false);
+  if (success!=0)
+    return success;
+
   ucl_device=device->gpu;
   atom=&device->atom;
 
@@ -80,7 +82,7 @@ bool ChargeGPUMemoryT::init_atomic(const int nlocal, const int nall,
 
   _max_an_bytes=ans->gpu_bytes()+nbor->gpu_bytes();
 
-  return true;
+  return success;
 }
 
 template <class numtyp, class acctyp>
diff --git a/lib/gpu/charge_gpu_memory.h b/lib/gpu/charge_gpu_memory.h
index c75eed072f..4dc33fd2dc 100644
--- a/lib/gpu/charge_gpu_memory.h
+++ b/lib/gpu/charge_gpu_memory.h
@@ -39,11 +39,18 @@ class ChargeGPUMemory {
   /// Clear any previous data and set up for a new LAMMPS run
   /** \param max_nbors initial number of rows in the neighbor matrix
     * \param cell_size cutoff + skin
-    * \param gpu_split fraction of particles handled by device **/
-  bool init_atomic(const int nlocal, const int nall, const int max_nbors,
-                   const int maxspecial, const double cell_size,
-                   const double gpu_split, FILE *screen,
-                   const char *pair_program);
+    * \param gpu_split fraction of particles handled by device
+    * 
+    * Returns:
+    * -  0 if successfull
+    * - -1 if fix gpu not found
+    * - -3 if there is an out of memory error
+    * - -4 if the GPU library was not compiled for GPU
+    * - -5 Double precision is not supported on card **/
+  int init_atomic(const int nlocal, const int nall, const int max_nbors,
+                  const int maxspecial, const double cell_size,
+                  const double gpu_split, FILE *screen,
+                  const char *pair_program);
 
   /// Estimate the overhead for GPU context changes and CPU driver
   void estimate_gpu_overhead();
diff --git a/lib/gpu/cmm_cut_gpu.cpp b/lib/gpu/cmm_cut_gpu.cpp
index 3070513dc9..7be958615a 100644
--- a/lib/gpu/cmm_cut_gpu.cpp
+++ b/lib/gpu/cmm_cut_gpu.cpp
@@ -28,12 +28,12 @@ static CMM_GPU_Memory<PRECISION,ACC_PRECISION> CMMMF;
 // ---------------------------------------------------------------------------
 // Allocate memory on host and device and copy constants to device
 // ---------------------------------------------------------------------------
-bool cmm_gpu_init(const int ntypes, double **cutsq, int **cg_types,
-                  double **host_lj1, double **host_lj2, double **host_lj3, 
-                  double **host_lj4, double **offset, double *special_lj,
-                  const int inum, const int nall, const int max_nbors, 
-                  const int maxspecial, const double cell_size, int &gpu_mode,
-                  FILE *screen) {
+int cmm_gpu_init(const int ntypes, double **cutsq, int **cg_types,
+                 double **host_lj1, double **host_lj2, double **host_lj3, 
+                 double **host_lj4, double **offset, double *special_lj,
+                 const int inum, const int nall, const int max_nbors, 
+                 const int maxspecial, const double cell_size, int &gpu_mode,
+                 FILE *screen) {
   CMMMF.clear();
   gpu_mode=CMMMF.device->gpu_mode();
   double gpu_split=CMMMF.device->particle_split();
@@ -54,13 +54,11 @@ bool cmm_gpu_init(const int ntypes, double **cutsq, int **cg_types,
     fflush(screen);
   }
 
-  if (world_me==0) {
-    bool init_ok=CMMMF.init(ntypes,cutsq,cg_types,host_lj1,host_lj2,host_lj3, 
-                            host_lj4, offset, special_lj, inum, nall, 300,
-                            maxspecial, cell_size, gpu_split, screen);
-    if (!init_ok)
-      return false;
-  }
+  int init_ok=0;
+  if (world_me==0)
+    init_ok=CMMMF.init(ntypes,cutsq,cg_types,host_lj1,host_lj2,host_lj3, 
+                       host_lj4, offset, special_lj, inum, nall, 300,
+                       maxspecial, cell_size, gpu_split, screen);
 
   CMMMF.device->world_barrier();
   if (message)
@@ -75,22 +73,21 @@ bool cmm_gpu_init(const int ntypes, double **cutsq, int **cg_types,
                 last_gpu,i);
       fflush(screen);
     }
-    if (gpu_rank==i && world_me!=0) {
-      bool init_ok=CMMMF.init(ntypes,cutsq,cg_types,host_lj1,host_lj2,host_lj3,
-                              host_lj4, offset, special_lj, inum, nall, 300,
-                              maxspecial, cell_size, gpu_split,
-			      screen);
-      if (!init_ok)
-        return false;
-    }
+    if (gpu_rank==i && world_me!=0)
+      init_ok=CMMMF.init(ntypes,cutsq,cg_types,host_lj1,host_lj2,host_lj3,
+                         host_lj4, offset, special_lj, inum, nall, 300,
+                         maxspecial, cell_size, gpu_split, screen);
+
     CMMMF.device->gpu_barrier();
     if (message) 
       fprintf(screen,"Done.\n");
   }
   if (message)
     fprintf(screen,"\n");
-  CMMMF.estimate_gpu_overhead();
-  return true;
+
+  if (init_ok==0)
+    CMMMF.estimate_gpu_overhead();
+  return init_ok;
 }
 
 void cmm_gpu_clear() {
diff --git a/lib/gpu/cmm_cut_gpu_memory.cpp b/lib/gpu/cmm_cut_gpu_memory.cpp
index 53f219274c..f82390097a 100644
--- a/lib/gpu/cmm_cut_gpu_memory.cpp
+++ b/lib/gpu/cmm_cut_gpu_memory.cpp
@@ -42,16 +42,19 @@ int CMM_GPU_MemoryT::bytes_per_atom(const int max_nbors) const {
 }
 
 template <class numtyp, class acctyp>
-bool CMM_GPU_MemoryT::init(const int ntypes, double **host_cutsq, 
-                           int **host_cg_type, double **host_lj1, 
-                           double **host_lj2, double **host_lj3, 
-                           double **host_lj4, double **host_offset, 
-                           double *host_special_lj, const int nlocal,
-                           const int nall, const int max_nbors,
-                           const int maxspecial, const double cell_size, 
-                           const double gpu_split, FILE *_screen) {
-  this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
-                    _screen,cmm_cut_gpu_kernel);
+int CMM_GPU_MemoryT::init(const int ntypes, double **host_cutsq, 
+                          int **host_cg_type, double **host_lj1, 
+                          double **host_lj2, double **host_lj3, 
+                          double **host_lj4, double **host_offset, 
+                          double *host_special_lj, const int nlocal,
+                          const int nall, const int max_nbors,
+                          const int maxspecial, const double cell_size, 
+                          const double gpu_split, FILE *_screen) {
+  int success;
+  success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
+                            _screen,cmm_cut_gpu_kernel);
+  if (success!=0)
+    return success;
 
   // If atom type constants fit in shared memory use fast kernel
   int cmm_types=ntypes;
@@ -84,7 +87,7 @@ bool CMM_GPU_MemoryT::init(const int ntypes, double **host_cutsq,
 
   _allocated=true;
   this->_max_bytes=lj1.row_bytes()+lj3.row_bytes()+sp_lj.row_bytes();
-  return true;
+  return 0;
 }
 
 template <class numtyp, class acctyp>
diff --git a/lib/gpu/cmm_cut_gpu_memory.h b/lib/gpu/cmm_cut_gpu_memory.h
index 8099d5b9c4..fff90e477d 100644
--- a/lib/gpu/cmm_cut_gpu_memory.h
+++ b/lib/gpu/cmm_cut_gpu_memory.h
@@ -29,13 +29,20 @@ class CMM_GPU_Memory : public AtomicGPUMemory<numtyp, acctyp> {
   /// Clear any previous data and set up for a new LAMMPS run
   /** \param max_nbors initial number of rows in the neighbor matrix
     * \param cell_size cutoff + skin
-    * \param gpu_split fraction of particles handled by device **/
-  bool init(const int ntypes, double **host_cutsq, int **host_cg_type,
-            double **host_lj1, double **host_lj2, double **host_lj3,
-            double **host_lj4, double **host_offset, double *host_special_lj,
-            const int nlocal, const int nall, const int max_nbors, 
-            const int maxspecial, const double cell_size,
-            const double gpu_split, FILE *screen);
+    * \param gpu_split fraction of particles handled by device
+    * 
+    * Returns:
+    * -  0 if successfull
+    * - -1 if fix gpu not found
+    * - -3 if there is an out of memory error
+    * - -4 if the GPU library was not compiled for GPU
+    * - -5 Double precision is not supported on card **/
+  int init(const int ntypes, double **host_cutsq, int **host_cg_type,
+           double **host_lj1, double **host_lj2, double **host_lj3,
+           double **host_lj4, double **host_offset, double *host_special_lj,
+           const int nlocal, const int nall, const int max_nbors, 
+           const int maxspecial, const double cell_size,
+           const double gpu_split, FILE *screen);
 
   /// Clear all host and device data
   /** \note This is called at the beginning of the init() routine **/
diff --git a/lib/gpu/cmmc_long_gpu.cpp b/lib/gpu/cmmc_long_gpu.cpp
index 7b100df2d3..a6f3d090af 100644
--- a/lib/gpu/cmmc_long_gpu.cpp
+++ b/lib/gpu/cmmc_long_gpu.cpp
@@ -28,14 +28,14 @@ static CMML_GPU_Memory<PRECISION,ACC_PRECISION> CMMLMF;
 // ---------------------------------------------------------------------------
 // Allocate memory on host and device and copy constants to device
 // ---------------------------------------------------------------------------
-bool cmml_gpu_init(const int ntypes, double **cutsq, int **cg_type,
-                   double **host_lj1, double **host_lj2, double **host_lj3, 
-                   double **host_lj4, double **offset, double *special_lj,
-                   const int inum, const int nall, const int max_nbors, 
-                   const int maxspecial, const double cell_size, int &gpu_mode,
-                   FILE *screen, double **host_cut_ljsq, double host_cut_coulsq,
-                   double *host_special_coul, const double qqrd2e,
-                   const double g_ewald) {
+int cmml_gpu_init(const int ntypes, double **cutsq, int **cg_type,
+                  double **host_lj1, double **host_lj2, double **host_lj3, 
+                  double **host_lj4, double **offset, double *special_lj,
+                  const int inum, const int nall, const int max_nbors, 
+                  const int maxspecial, const double cell_size, int &gpu_mode,
+                  FILE *screen, double **host_cut_ljsq, double host_cut_coulsq,
+                  double *host_special_coul, const double qqrd2e,
+                  const double g_ewald) {
   CMMLMF.clear();
   gpu_mode=CMMLMF.device->gpu_mode();
   double gpu_split=CMMLMF.device->particle_split();
@@ -56,15 +56,12 @@ bool cmml_gpu_init(const int ntypes, double **cutsq, int **cg_type,
     fflush(screen);
   }
 
-  if (world_me==0) {
-    bool init_ok=CMMLMF.init(ntypes, cutsq, cg_type, host_lj1, host_lj2, 
-                             host_lj3, host_lj4, offset, special_lj, inum, 
-                             nall, 300, maxspecial, cell_size, gpu_split, 
-                             screen, host_cut_ljsq, host_cut_coulsq,
-                             host_special_coul, qqrd2e,g_ewald);
-    if (!init_ok)
-      return false;
-  }
+  int init_ok=0;
+  if (world_me==0)
+    init_ok=CMMLMF.init(ntypes, cutsq, cg_type, host_lj1, host_lj2, host_lj3,
+                        host_lj4, offset, special_lj, inum, nall, 300, 
+                        maxspecial, cell_size, gpu_split, screen, host_cut_ljsq,
+                        host_cut_coulsq, host_special_coul, qqrd2e,g_ewald);
 
   CMMLMF.device->world_barrier();
   if (message)
@@ -79,23 +76,22 @@ bool cmml_gpu_init(const int ntypes, double **cutsq, int **cg_type,
                 last_gpu,i);
       fflush(screen);
     }
-    if (gpu_rank==i && world_me!=0) {
-      bool init_ok=CMMLMF.init(ntypes, cutsq, cg_type, host_lj1, host_lj2, 
-                               host_lj3, host_lj4, offset, special_lj, inum, 
-                               nall, 300, maxspecial, cell_size, gpu_split,
-                               screen, host_cut_ljsq, host_cut_coulsq,
-                               host_special_coul, qqrd2e, g_ewald);
-      if (!init_ok)
-        return false;
-    }
+    if (gpu_rank==i && world_me!=0)
+      init_ok=CMMLMF.init(ntypes, cutsq, cg_type, host_lj1, host_lj2, host_lj3,
+                          host_lj4, offset, special_lj, inum,  nall, 300,
+                          maxspecial, cell_size, gpu_split, screen,
+                          host_cut_ljsq, host_cut_coulsq, host_special_coul,
+                          qqrd2e, g_ewald);
     CMMLMF.device->gpu_barrier();
     if (message) 
       fprintf(screen,"Done.\n");
   }
   if (message)
     fprintf(screen,"\n");
-  CMMLMF.estimate_gpu_overhead();
-  return true;
+
+  if (init_ok==0)
+    CMMLMF.estimate_gpu_overhead();
+  return init_ok;
 }
 
 void cmml_gpu_clear() {
diff --git a/lib/gpu/cmmc_long_gpu_memory.cpp b/lib/gpu/cmmc_long_gpu_memory.cpp
index 2091122832..020bbde549 100644
--- a/lib/gpu/cmmc_long_gpu_memory.cpp
+++ b/lib/gpu/cmmc_long_gpu_memory.cpp
@@ -43,20 +43,23 @@ int CMML_GPU_MemoryT::bytes_per_atom(const int max_nbors) const {
 }
 
 template <class numtyp, class acctyp>
-bool CMML_GPU_MemoryT::init(const int ntypes, double **host_cutsq, 
-                            int **host_cg_type, double **host_lj1, 
-                            double **host_lj2, double **host_lj3, 
-                            double **host_lj4, double **host_offset, 
-                            double *host_special_lj, const int nlocal,
-                            const int nall, const int max_nbors,
-                            const int maxspecial, const double cell_size,
-                            const double gpu_split, FILE *_screen,
-                            double **host_cut_ljsq, 
-                            const double host_cut_coulsq,
-                            double *host_special_coul, const double qqrd2e,
-                            const double g_ewald) {
-  this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
-                    _screen,cmmc_long_gpu_kernel);
+int CMML_GPU_MemoryT::init(const int ntypes, double **host_cutsq, 
+                           int **host_cg_type, double **host_lj1, 
+                           double **host_lj2, double **host_lj3, 
+                           double **host_lj4, double **host_offset, 
+                           double *host_special_lj, const int nlocal,
+                           const int nall, const int max_nbors,
+                           const int maxspecial, const double cell_size,
+                           const double gpu_split, FILE *_screen,
+                           double **host_cut_ljsq, 
+                           const double host_cut_coulsq,
+                           double *host_special_coul, const double qqrd2e,
+                           const double g_ewald) {
+  int success;
+  success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
+                            _screen,cmmc_long_gpu_kernel);
+  if (success!=0)
+    return success;
 
   // If atom type constants fit in shared memory use fast kernel
   int lj_types=ntypes;
@@ -95,7 +98,7 @@ bool CMML_GPU_MemoryT::init(const int ntypes, double **host_cutsq,
 
   _allocated=true;
   this->_max_bytes=lj1.row_bytes()+lj3.row_bytes()+sp_lj.row_bytes();
-  return true;
+  return 0;
 }
 
 template <class numtyp, class acctyp>
diff --git a/lib/gpu/cmmc_long_gpu_memory.h b/lib/gpu/cmmc_long_gpu_memory.h
index 8192c78249..45090368a5 100644
--- a/lib/gpu/cmmc_long_gpu_memory.h
+++ b/lib/gpu/cmmc_long_gpu_memory.h
@@ -29,15 +29,22 @@ class CMML_GPU_Memory : public ChargeGPUMemory<numtyp, acctyp> {
   /// Clear any previous data and set up for a new LAMMPS run
   /** \param max_nbors initial number of rows in the neighbor matrix
     * \param cell_size cutoff + skin
-    * \param gpu_split fraction of particles handled by device **/
-  bool init(const int ntypes, double **host_cutsq, int ** cg_type,
-            double **host_lj1, double **host_lj2, double **host_lj3,
-            double **host_lj4, double **host_offset, double *host_special_lj,
-            const int nlocal, const int nall, const int max_nbors, 
-            const int maxspecial, const double cell_size, 
-            const double gpu_split, FILE *screen, double **host_cut_ljsq,
-            const double host_cut_coulsq, double *host_special_coul,
-            const double qqrd2e, const double g_ewald);
+    * \param gpu_split fraction of particles handled by device
+    * 
+    * Returns:
+    * -  0 if successfull
+    * - -1 if fix gpu not found
+    * - -3 if there is an out of memory error
+    * - -4 if the GPU library was not compiled for GPU
+    * - -5 Double precision is not supported on card **/
+  int init(const int ntypes, double **host_cutsq, int ** cg_type,
+           double **host_lj1, double **host_lj2, double **host_lj3,
+           double **host_lj4, double **host_offset, double *host_special_lj,
+           const int nlocal, const int nall, const int max_nbors, 
+           const int maxspecial, const double cell_size, 
+           const double gpu_split, FILE *screen, double **host_cut_ljsq,
+           const double host_cut_coulsq, double *host_special_coul,
+           const double qqrd2e, const double g_ewald);
 
   /// Clear all host and device data
   /** \note This is called at the beginning of the init() routine **/
diff --git a/lib/gpu/cmmc_msm_gpu.cpp b/lib/gpu/cmmc_msm_gpu.cpp
index df0a226a73..cfa6c50453 100644
--- a/lib/gpu/cmmc_msm_gpu.cpp
+++ b/lib/gpu/cmmc_msm_gpu.cpp
@@ -28,14 +28,14 @@ static CMMM_GPU_Memory<PRECISION,ACC_PRECISION> CMMMMF;
 // ---------------------------------------------------------------------------
 // Allocate memory on host and device and copy constants to device
 // ---------------------------------------------------------------------------
-bool cmmm_gpu_init(const int ntypes, double **cutsq, int **cg_type,
-                   double **host_lj1, double **host_lj2, double **host_lj3, 
-                   double **host_lj4, double **offset, double *special_lj,
-                   const int inum, const int nall, const int max_nbors, 
-                   const int maxspecial, const double cell_size, int &gpu_mode,
-                   FILE *screen, double **host_cut_ljsq, double host_cut_coulsq,
-                   double *host_special_coul, const double qqrd2e, 
-                   const int smooth) {
+int cmmm_gpu_init(const int ntypes, double **cutsq, int **cg_type,
+                  double **host_lj1, double **host_lj2, double **host_lj3, 
+                  double **host_lj4, double **offset, double *special_lj,
+                  const int inum, const int nall, const int max_nbors, 
+                  const int maxspecial, const double cell_size, int &gpu_mode,
+                  FILE *screen, double **host_cut_ljsq, double host_cut_coulsq,
+                  double *host_special_coul, const double qqrd2e, 
+                  const int smooth) {
   CMMMMF.clear();
   gpu_mode=CMMMMF.device->gpu_mode();
   double gpu_split=CMMMMF.device->particle_split();
@@ -56,15 +56,12 @@ bool cmmm_gpu_init(const int ntypes, double **cutsq, int **cg_type,
     fflush(screen);
   }
 
-  if (world_me==0) {
-    bool init_ok=CMMMMF.init(ntypes, cutsq, cg_type, host_lj1, host_lj2, 
-                             host_lj3, host_lj4, offset, special_lj, inum, 
-                             nall, 300, maxspecial, cell_size, gpu_split, 
-                             screen, host_cut_ljsq, host_cut_coulsq,
-                             host_special_coul, qqrd2e,smooth);
-    if (!init_ok)
-      return false;
-  }
+  int init_ok=0;
+  if (world_me==0)
+    init_ok=CMMMMF.init(ntypes, cutsq, cg_type, host_lj1, host_lj2, host_lj3,
+                        host_lj4, offset, special_lj, inum,  nall, 300,
+                        maxspecial, cell_size, gpu_split, screen, host_cut_ljsq,
+                        host_cut_coulsq, host_special_coul, qqrd2e,smooth);
 
   CMMMMF.device->world_barrier();
   if (message)
@@ -79,23 +76,23 @@ bool cmmm_gpu_init(const int ntypes, double **cutsq, int **cg_type,
                 last_gpu,i);
       fflush(screen);
     }
-    if (gpu_rank==i && world_me!=0) {
-      bool init_ok=CMMMMF.init(ntypes, cutsq, cg_type, host_lj1, host_lj2, 
-                               host_lj3, host_lj4, offset, special_lj, inum, 
-                               nall, 300, maxspecial, cell_size, gpu_split,
-                               screen, host_cut_ljsq, host_cut_coulsq,
-                               host_special_coul, qqrd2e,smooth);
-      if (!init_ok)
-        return false;
-    }
+    if (gpu_rank==i && world_me!=0)
+      init_ok=CMMMMF.init(ntypes, cutsq, cg_type, host_lj1, host_lj2, host_lj3,
+                          host_lj4, offset, special_lj, inum,  nall, 300,
+                          maxspecial, cell_size, gpu_split, screen,
+                          host_cut_ljsq, host_cut_coulsq, host_special_coul,
+                          qqrd2e,smooth);
+
     CMMMMF.device->gpu_barrier();
     if (message) 
       fprintf(screen,"Done.\n");
   }
   if (message)
     fprintf(screen,"\n");
-  CMMMMF.estimate_gpu_overhead();
-  return true;
+
+  if (init_ok==0)
+    CMMMMF.estimate_gpu_overhead();
+  return init_ok;
 }
 
 void cmmm_gpu_clear() {
diff --git a/lib/gpu/cmmc_msm_gpu_memory.cpp b/lib/gpu/cmmc_msm_gpu_memory.cpp
index 7675005348..aa170a476d 100644
--- a/lib/gpu/cmmc_msm_gpu_memory.cpp
+++ b/lib/gpu/cmmc_msm_gpu_memory.cpp
@@ -43,20 +43,23 @@ int CMMM_GPU_MemoryT::bytes_per_atom(const int max_nbors) const {
 }
 
 template <class numtyp, class acctyp>
-bool CMMM_GPU_MemoryT::init(const int ntypes, double **host_cutsq, 
-                            int **host_cg_type, double **host_lj1, 
-                            double **host_lj2, double **host_lj3, 
-                            double **host_lj4, double **host_offset, 
-                            double *host_special_lj, const int nlocal,
-                            const int nall, const int max_nbors,
-                            const int maxspecial, const double cell_size,
-                            const double gpu_split, FILE *_screen,
-                            double **host_cut_ljsq, 
-                            const double host_cut_coulsq,
-                            double *host_special_coul, const double qqrd2e,
-                            const int smooth) {
-  this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
-                    _screen,cmmc_msm_gpu_kernel);
+int CMMM_GPU_MemoryT::init(const int ntypes, double **host_cutsq, 
+                           int **host_cg_type, double **host_lj1, 
+                           double **host_lj2, double **host_lj3, 
+                           double **host_lj4, double **host_offset, 
+                           double *host_special_lj, const int nlocal,
+                           const int nall, const int max_nbors,
+                           const int maxspecial, const double cell_size,
+                           const double gpu_split, FILE *_screen,
+                           double **host_cut_ljsq, 
+                           const double host_cut_coulsq,
+                           double *host_special_coul, const double qqrd2e,
+                           const int smooth) {
+  int success;
+  success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
+                            _screen,cmmc_msm_gpu_kernel);
+  if (success!=0)
+    return success;
 
   // If atom type constants fit in shared memory use fast kernel
   int lj_types=ntypes;
@@ -95,7 +98,7 @@ bool CMMM_GPU_MemoryT::init(const int ntypes, double **host_cutsq,
 
   _allocated=true;
   this->_max_bytes=lj1.row_bytes()+lj3.row_bytes()+sp_lj.row_bytes();
-  return true;
+  return 0;
 }
 
 template <class numtyp, class acctyp>
diff --git a/lib/gpu/cmmc_msm_gpu_memory.h b/lib/gpu/cmmc_msm_gpu_memory.h
index ddaef5944d..9b5205f702 100644
--- a/lib/gpu/cmmc_msm_gpu_memory.h
+++ b/lib/gpu/cmmc_msm_gpu_memory.h
@@ -29,15 +29,22 @@ class CMMM_GPU_Memory : public ChargeGPUMemory<numtyp, acctyp> {
   /// Clear any previous data and set up for a new LAMMPS run
   /** \param max_nbors initial number of rows in the neighbor matrix
     * \param cell_size cutoff + skin
-    * \param gpu_split fraction of particles handled by device **/
-  bool init(const int ntypes, double **host_cutsq, int ** cg_type,
-            double **host_lj1, double **host_lj2, double **host_lj3,
-            double **host_lj4, double **host_offset, double *host_special_lj,
-            const int nlocal, const int nall, const int max_nbors, 
-            const int maxspecial, const double cell_size, 
-            const double gpu_split, FILE *screen, double **host_cut_ljsq,
-            const double host_cut_coulsq, double *host_special_coul,
-            const double qqrd2e, const int smooth);
+    * \param gpu_split fraction of particles handled by device
+    * 
+    * Returns:
+    * -  0 if successfull
+    * - -1 if fix gpu not found
+    * - -3 if there is an out of memory error
+    * - -4 if the GPU library was not compiled for GPU
+    * - -5 Double precision is not supported on card **/
+  int init(const int ntypes, double **host_cutsq, int ** cg_type,
+           double **host_lj1, double **host_lj2, double **host_lj3,
+           double **host_lj4, double **host_offset, double *host_special_lj,
+           const int nlocal, const int nall, const int max_nbors, 
+           const int maxspecial, const double cell_size, 
+           const double gpu_split, FILE *screen, double **host_cut_ljsq,
+           const double host_cut_coulsq, double *host_special_coul,
+           const double qqrd2e, const int smooth);
 
   /// Clear all host and device data
   /** \note This is called at the beginning of the init() routine **/
diff --git a/lib/gpu/crml_gpu.cpp b/lib/gpu/crml_gpu.cpp
index f55d73390d..1e59562ed5 100644
--- a/lib/gpu/crml_gpu.cpp
+++ b/lib/gpu/crml_gpu.cpp
@@ -28,16 +28,16 @@ static CRML_GPU_Memory<PRECISION,ACC_PRECISION> CRMLMF;
 // ---------------------------------------------------------------------------
 // Allocate memory on host and device and copy constants to device
 // ---------------------------------------------------------------------------
-bool crml_gpu_init(const int ntypes, double cut_bothsq, double **host_lj1,
-                   double **host_lj2, double **host_lj3, double **host_lj4,
-                   double **offset, double *special_lj, const int inum,
-                   const int nall, const int max_nbors, const int maxspecial,
-                   const double cell_size, int &gpu_mode, FILE *screen,
-                   double host_cut_ljsq, double host_cut_coulsq,
-                   double *host_special_coul, const double qqrd2e,
-                   const double g_ewald, const double cut_lj_innersq,
-                   const double denom_lj, double **epsilon,
-                   double **sigma, const bool mix_arithmetic) {
+int crml_gpu_init(const int ntypes, double cut_bothsq, double **host_lj1,
+                  double **host_lj2, double **host_lj3, double **host_lj4,
+                  double **offset, double *special_lj, const int inum,
+                  const int nall, const int max_nbors, const int maxspecial,
+                  const double cell_size, int &gpu_mode, FILE *screen,
+                  double host_cut_ljsq, double host_cut_coulsq,
+                  double *host_special_coul, const double qqrd2e,
+                  const double g_ewald, const double cut_lj_innersq,
+                  const double denom_lj, double **epsilon,
+                  double **sigma, const bool mix_arithmetic) {
   CRMLMF.clear();
   gpu_mode=CRMLMF.device->gpu_mode();
   double gpu_split=CRMLMF.device->particle_split();
@@ -58,16 +58,13 @@ bool crml_gpu_init(const int ntypes, double cut_bothsq, double **host_lj1,
     fflush(screen);
   }
 
-  if (world_me==0) {
-    bool init_ok=CRMLMF.init(ntypes, cut_bothsq, host_lj1, host_lj2, host_lj3,
-                             host_lj4, offset, special_lj, inum, nall, 300,
-                             maxspecial, cell_size, gpu_split, screen,
-                             host_cut_ljsq, host_cut_coulsq, host_special_coul,
-                             qqrd2e, g_ewald, cut_lj_innersq, denom_lj,
-                             epsilon,sigma,mix_arithmetic);
-    if (!init_ok)
-      return false;
-  }
+  int init_ok=0;
+  if (world_me==0)
+    CRMLMF.init(ntypes, cut_bothsq, host_lj1, host_lj2, host_lj3, host_lj4,
+                offset, special_lj, inum, nall, 300, maxspecial, cell_size,
+                gpu_split, screen, host_cut_ljsq, host_cut_coulsq,
+                host_special_coul, qqrd2e, g_ewald, cut_lj_innersq, denom_lj,
+                epsilon,sigma,mix_arithmetic);
 
   CRMLMF.device->world_barrier();
   if (message)
@@ -82,25 +79,24 @@ bool crml_gpu_init(const int ntypes, double cut_bothsq, double **host_lj1,
                 last_gpu,i);
       fflush(screen);
     }
-    if (gpu_rank==i && world_me!=0) {
-      bool init_ok=CRMLMF.init(ntypes, cut_bothsq, host_lj1, host_lj2, host_lj3,
-                               host_lj4, offset, special_lj, inum, nall, 300,
-                               maxspecial, cell_size, gpu_split,
-                               screen, host_cut_ljsq, host_cut_coulsq,
-                               host_special_coul, qqrd2e, g_ewald, 
-                               cut_lj_innersq, denom_lj, epsilon, sigma,
-                               mix_arithmetic);
-      if (!init_ok)
-        return false;
-    }
+    if (gpu_rank==i && world_me!=0)
+      init_ok=CRMLMF.init(ntypes, cut_bothsq, host_lj1, host_lj2, host_lj3,
+                          host_lj4, offset, special_lj, inum, nall, 300,
+                          maxspecial, cell_size, gpu_split, screen,
+                          host_cut_ljsq, host_cut_coulsq, host_special_coul,
+                          qqrd2e, g_ewald,  cut_lj_innersq, denom_lj, epsilon,
+                          sigma, mix_arithmetic);
+
     CRMLMF.device->gpu_barrier();
     if (message) 
       fprintf(screen,"Done.\n");
   }
   if (message)
     fprintf(screen,"\n");
-  CRMLMF.estimate_gpu_overhead();
-  return true;
+
+  if (init_ok==0)
+    CRMLMF.estimate_gpu_overhead();
+  return init_ok;
 }
 
 void crml_gpu_clear() {
diff --git a/lib/gpu/crml_gpu_memory.cpp b/lib/gpu/crml_gpu_memory.cpp
index f051a349f9..3b6a07ee7e 100644
--- a/lib/gpu/crml_gpu_memory.cpp
+++ b/lib/gpu/crml_gpu_memory.cpp
@@ -43,7 +43,7 @@ int CRML_GPU_MemoryT::bytes_per_atom(const int max_nbors) const {
 }
 
 template <class numtyp, class acctyp>
-bool CRML_GPU_MemoryT::init(const int ntypes,
+int CRML_GPU_MemoryT::init(const int ntypes,
                            double host_cut_bothsq, double **host_lj1, 
                            double **host_lj2, double **host_lj3, 
                            double **host_lj4, double **host_offset, 
@@ -56,8 +56,11 @@ bool CRML_GPU_MemoryT::init(const int ntypes,
                            const double g_ewald, const double cut_lj_innersq,
                            const double denom_lj, double **epsilon,
                            double **sigma, const bool mix_arithmetic) {
-  this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
-                    _screen,crml_gpu_kernel);
+  int success;
+  success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
+                            _screen,crml_gpu_kernel);
+  if (success!=0)
+    return success;
 
   // If atom type constants fit in shared memory use fast kernel
   int lj_types=ntypes;
@@ -99,7 +102,7 @@ bool CRML_GPU_MemoryT::init(const int ntypes,
 
   _allocated=true;
   this->_max_bytes=lj1.row_bytes()+ljd.row_bytes()+sp_lj.row_bytes();
-  return true;
+  return 0;
 }
 
 template <class numtyp, class acctyp>
diff --git a/lib/gpu/crml_gpu_memory.h b/lib/gpu/crml_gpu_memory.h
index 5520cd3a17..a474d5982d 100644
--- a/lib/gpu/crml_gpu_memory.h
+++ b/lib/gpu/crml_gpu_memory.h
@@ -29,17 +29,24 @@ class CRML_GPU_Memory : public ChargeGPUMemory<numtyp, acctyp> {
   /// Clear any previous data and set up for a new LAMMPS run
   /** \param max_nbors initial number of rows in the neighbor matrix
     * \param cell_size cutoff + skin
-    * \param gpu_split fraction of particles handled by device **/
-  bool init(const int ntypes, double host_cut_bothsq,
-            double **host_lj1, double **host_lj2, double **host_lj3,
-            double **host_lj4, double **host_offset, double *host_special_lj,
-            const int nlocal, const int nall, const int max_nbors, 
-            const int maxspecial, const double cell_size, 
-            const double gpu_split, FILE *screen, double host_cut_ljsq,
-            const double host_cut_coulsq, double *host_special_coul,
-            const double qqrd2e, const double g_ewald,
-            const double cut_lj_innersq, const double denom_lj, 
-            double **epsilon, double **sigma, const bool mix_arithmetic);
+    * \param gpu_split fraction of particles handled by device
+    * 
+    * Returns:
+    * -  0 if successfull
+    * - -1 if fix gpu not found
+    * - -3 if there is an out of memory error
+    * - -4 if the GPU library was not compiled for GPU
+    * - -5 Double precision is not supported on card **/
+  int init(const int ntypes, double host_cut_bothsq,
+           double **host_lj1, double **host_lj2, double **host_lj3,
+           double **host_lj4, double **host_offset, double *host_special_lj,
+           const int nlocal, const int nall, const int max_nbors, 
+           const int maxspecial, const double cell_size, 
+           const double gpu_split, FILE *screen, double host_cut_ljsq,
+           const double host_cut_coulsq, double *host_special_coul,
+           const double qqrd2e, const double g_ewald,
+           const double cut_lj_innersq, const double denom_lj, 
+           double **epsilon, double **sigma, const bool mix_arithmetic);
 
   /// Clear all host and device data
   /** \note This is called at the beginning of the init() routine **/
diff --git a/lib/gpu/gb_gpu.cpp b/lib/gpu/gb_gpu.cpp
index 45ad59d667..4593ba5452 100644
--- a/lib/gpu/gb_gpu.cpp
+++ b/lib/gpu/gb_gpu.cpp
@@ -49,14 +49,14 @@ void gb_gpu_pack_nbors(GBMT &gbm, const int GX, const int BX, const int start,
 // ---------------------------------------------------------------------------
 // Allocate memory on host and device and copy constants to device
 // ---------------------------------------------------------------------------
-bool gb_gpu_init(const int ntypes, const double gamma,
-                 const double upsilon, const double mu, double **shape,
-                 double **well, double **cutsq, double **sigma,
-                 double **epsilon, double *host_lshape, int **form,
-                 double **host_lj1, double **host_lj2, double **host_lj3,
-                 double **host_lj4, double **offset, double *special_lj,
-                 const int inum, const int nall, const int max_nbors, 
-                 const double cell_size, int &gpu_mode, FILE *screen) {
+int gb_gpu_init(const int ntypes, const double gamma,
+                const double upsilon, const double mu, double **shape,
+                double **well, double **cutsq, double **sigma,
+                double **epsilon, double *host_lshape, int **form,
+                double **host_lj1, double **host_lj2, double **host_lj3,
+                double **host_lj4, double **offset, double *special_lj,
+                const int inum, const int nall, const int max_nbors, 
+                const double cell_size, int &gpu_mode, FILE *screen) {
   GBMF.clear();
   gpu_mode=GBMF.device->gpu_mode();
   double gpu_split=GBMF.device->particle_split();
@@ -77,14 +77,12 @@ bool gb_gpu_init(const int ntypes, const double gamma,
     fflush(screen);
   }
 
-  if (world_me==0) {
-    bool init_ok=GBMF.init(ntypes, gamma, upsilon, mu, shape, well, cutsq, 
-                           sigma, epsilon, host_lshape, form, host_lj1, 
-                           host_lj2, host_lj3, host_lj4, offset, special_lj, 
-                           inum, nall, max_nbors, cell_size, gpu_split, screen);
-    if (!init_ok)
-      return false;
-  }
+  int init_ok=0;
+  if (world_me==0)
+    init_ok=GBMF.init(ntypes, gamma, upsilon, mu, shape, well, cutsq, 
+                      sigma, epsilon, host_lshape, form, host_lj1, 
+                      host_lj2, host_lj3, host_lj4, offset, special_lj, 
+                      inum, nall, max_nbors, cell_size, gpu_split, screen);
 
   GBMF.device->world_barrier();
   if (message)
@@ -99,23 +97,22 @@ bool gb_gpu_init(const int ntypes, const double gamma,
                 last_gpu,i);
       fflush(screen);
     }
-    if (gpu_rank==i && world_me!=0) {
-      bool init_ok=GBMF.init(ntypes, gamma, upsilon, mu, shape, well, cutsq, 
-                             sigma, epsilon, host_lshape, form, host_lj1, 
-                             host_lj2, host_lj3, host_lj4, offset, special_lj, 
-                             inum, nall, max_nbors, cell_size, gpu_split, 
-                             screen);
-      if (!init_ok)
-        return false;
-    }
+    if (gpu_rank==i && world_me!=0)
+      init_ok=GBMF.init(ntypes, gamma, upsilon, mu, shape, well, cutsq,  sigma,
+                        epsilon, host_lshape, form, host_lj1, host_lj2,
+                        host_lj3, host_lj4, offset, special_lj,  inum, nall,
+                        max_nbors, cell_size, gpu_split,  screen);
+
     GBMF.device->gpu_barrier();
     if (message) 
       fprintf(screen,"Done.\n");
   }
   if (message)
     fprintf(screen,"\n");
-  GBMF.estimate_gpu_overhead();
-  return true;
+
+  if (init_ok==0)
+    GBMF.estimate_gpu_overhead();
+  return init_ok;
 }
 
 // ---------------------------------------------------------------------------
diff --git a/lib/gpu/gb_gpu_memory.cpp b/lib/gpu/gb_gpu_memory.cpp
index 338751f34c..a22e582725 100644
--- a/lib/gpu/gb_gpu_memory.cpp
+++ b/lib/gpu/gb_gpu_memory.cpp
@@ -50,17 +50,17 @@ int GB_GPU_MemoryT::bytes_per_atom(const int max_nbors) const {
 }
 
 template <class numtyp, class acctyp>
-bool GB_GPU_MemoryT::init(const int ntypes, const double gamma, 
-                          const double upsilon, const double mu, 
-                          double **host_shape, double **host_well, 
-                          double **host_cutsq, double **host_sigma, 
-                          double **host_epsilon, double *host_lshape, 
-                          int **h_form, double **host_lj1, double **host_lj2,
-                          double **host_lj3, double **host_lj4,
-                          double **host_offset, const double *host_special_lj,
-                          const int nlocal, const int nall,
-                          const int max_nbors, const double cell_size,
-                          const double gpu_split, FILE *_screen) {
+int GB_GPU_MemoryT::init(const int ntypes, const double gamma, 
+                         const double upsilon, const double mu, 
+                         double **host_shape, double **host_well, 
+                         double **host_cutsq, double **host_sigma, 
+                         double **host_epsilon, double *host_lshape, 
+                         int **h_form, double **host_lj1, double **host_lj2,
+                         double **host_lj3, double **host_lj4,
+                         double **host_offset, const double *host_special_lj,
+                         const int nlocal, const int nall,
+                         const int max_nbors, const double cell_size,
+                         const double gpu_split, FILE *_screen) {
   nbor_time_avail=false;
   screen=_screen;
 
@@ -73,9 +73,11 @@ bool GB_GPU_MemoryT::init(const int ntypes, const double gamma,
   if (host_nlocal>0)
     _gpu_host=1;
   
-  if (!device->init(*ans,false,true,nlocal,host_nlocal,nall,nbor,0,
-                    _gpu_host,max_nbors,cell_size,true))
-    return false;
+  int success=device->init(*ans,false,true,nlocal,host_nlocal,nall,nbor,0,
+                           _gpu_host,max_nbors,cell_size,true);
+  if (success!=0)
+    return success;
+    
   ucl_device=device->gpu;
   atom=&device->atom;
 
@@ -195,7 +197,9 @@ bool GB_GPU_MemoryT::init(const int ntypes, const double gamma,
   _max_bytes=ans->gpu_bytes()+nbor->gpu_bytes();
 
   // Memory for ilist ordered by particle type
-  return (host_olist.alloc(nbor->max_atoms(),*ucl_device)==UCL_SUCCESS);
+  if (host_olist.alloc(nbor->max_atoms(),*ucl_device)==UCL_SUCCESS)
+    return 0;
+  else return -3;
 }
 
 template <class numtyp, class acctyp>
diff --git a/lib/gpu/gb_gpu_memory.h b/lib/gpu/gb_gpu_memory.h
index edba318ace..f47f9ff758 100644
--- a/lib/gpu/gb_gpu_memory.h
+++ b/lib/gpu/gb_gpu_memory.h
@@ -35,16 +35,23 @@ class GB_GPU_Memory {
     * \param max_nbors initial number of rows in the neighbor matrix
     * \param cell_size cutoff + skin
     * \param gpu_split fraction of particles handled by device 
-    * \return false if there is not sufficient memory or device init prob **/
-  bool init(const int ntypes, const double gamma,
-            const double upsilon, const double mu, double **host_shape,
-            double **host_well, double **host_cutsq, double **host_sigma, 
-            double **host_epsilon, double *host_lshape, int **h_form,
-            double **host_lj1, double **host_lj2, double **host_lj3, 
-            double **host_lj4, double **host_offset, 
-            const double *host_special_lj, const int nlocal, const int nall, 
-            const int max_nbors, const double cell_size,
-            const double gpu_split, FILE *screen);
+    * \return false if there is not sufficient memory or device init prob
+    * 
+    * Returns:
+    * -  0 if successfull
+    * - -1 if fix gpu not found
+    * - -3 if there is an out of memory error
+    * - -4 if the GPU library was not compiled for GPU
+    * - -5 Double precision is not supported on card **/
+  int init(const int ntypes, const double gamma,
+           const double upsilon, const double mu, double **host_shape,
+           double **host_well, double **host_cutsq, double **host_sigma, 
+           double **host_epsilon, double *host_lshape, int **h_form,
+           double **host_lj1, double **host_lj2, double **host_lj3, 
+           double **host_lj4, double **host_offset, 
+           const double *host_special_lj, const int nlocal, const int nall, 
+           const int max_nbors, const double cell_size,
+           const double gpu_split, FILE *screen);
 
   /// Estimate the overhead for GPU context changes and CPU driver
   void estimate_gpu_overhead();
diff --git a/lib/gpu/geryon/nvc_device.h b/lib/gpu/geryon/nvc_device.h
index ed445716f6..6a232986ff 100644
--- a/lib/gpu/geryon/nvc_device.h
+++ b/lib/gpu/geryon/nvc_device.h
@@ -167,6 +167,7 @@ class UCL_Device {
   int _device, _num_devices;
   std::vector<cudaDeviceProp> _properties;
   std::vector<cudaStream_t> _cq;
+  std::vector<int> _device_ids;
 };
 
 // Grabs the properties for all devices
@@ -178,6 +179,7 @@ inline UCL_Device::UCL_Device() {
     if (deviceProp.major == 9999 && deviceProp.minor == 9999)
       break;
     _properties.push_back(deviceProp);
+    _device_ids.push_back(dev);
   }
   _device=-1;
   _cq.push_back(cudaStream_t());
@@ -194,7 +196,7 @@ inline void UCL_Device::set(int num) {
     return;
   for (int i=1; i<num_queues(); i++) pop_command_queue();
   cudaThreadExit();
-  CUDA_SAFE_CALL_NS(cudaSetDevice(num));
+  CUDA_SAFE_CALL_NS(cudaSetDevice(_device_ids[num]));
   _device=num;
 }
 
diff --git a/lib/gpu/geryon/nvd_device.h b/lib/gpu/geryon/nvd_device.h
index 6b70964ba1..fd466362d6 100644
--- a/lib/gpu/geryon/nvd_device.h
+++ b/lib/gpu/geryon/nvd_device.h
@@ -42,6 +42,7 @@ inline void ucl_sync(CUstream &stream) {
 }
 
 struct NVDProperties {
+  int device_id;
   std::string name;
   int major;
   int minor;
@@ -208,15 +209,20 @@ inline UCL_Device::UCL_Device() {
   for (int dev=0; dev<_num_devices; ++dev) {
     CUdevice m;
     CU_SAFE_CALL_NS(cuDeviceGet(&m,dev));
+    int major, minor;
+    CU_SAFE_CALL_NS(cuDeviceComputeCapability(&major,&minor,m));
+    if (major==9999)
+      continue;
+      
     _properties.push_back(NVDProperties());
+    _properties.back().device_id=dev;
+    _properties.back().major=major;
+    _properties.back().minor=minor;
     
     char namecstr[1024];
     CU_SAFE_CALL_NS(cuDeviceGetName(namecstr,1024,m));
     _properties.back().name=namecstr;
     
-    CU_SAFE_CALL_NS(cuDeviceComputeCapability(&_properties.back().major,
-                                              &_properties.back().minor,m));
-    
     CU_SAFE_CALL_NS(cuDeviceTotalMem(&_properties.back().totalGlobalMem,m));
     CU_SAFE_CALL_NS(cuDeviceGetAttribute(&_properties.back().multiProcessorCount,
                                        CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
@@ -262,9 +268,9 @@ inline void UCL_Device::set(int num) {
     CU_SAFE_CALL_NS(cuCtxDestroy(_context));
     for (int i=1; i<num_queues(); i++) pop_command_queue();
   }
-  CU_SAFE_CALL_NS(cuDeviceGet(&_cu_device,num));
+  _device=_properties[num].device_id;
+  CU_SAFE_CALL_NS(cuDeviceGet(&_cu_device,_device));
   CU_SAFE_CALL_NS(cuCtxCreate(&_context,0,_cu_device));
-  _device=num;
 }
 
 // List all devices along with all properties
diff --git a/lib/gpu/geryon/ucl_d_mat.h b/lib/gpu/geryon/ucl_d_mat.h
index c0531b2f29..11ec58629a 100644
--- a/lib/gpu/geryon/ucl_d_mat.h
+++ b/lib/gpu/geryon/ucl_d_mat.h
@@ -13,7 +13,7 @@
     copyright            : (C) 2009 by W. Michael Brown
     email                : brownw@ornl.gov
  ***************************************************************************/
- 
+
 /* -----------------------------------------------------------------------
    Copyright (2009) Sandia Corporation.  Under the terms of Contract
    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
@@ -61,20 +61,23 @@ class UCL_D_Mat : public UCL_BaseMat {
   inline int alloc(const size_t rows, const size_t cols, mat_type &cq,
                    const enum UCL_MEMOPT kind=UCL_READ_WRITE) {
     clear();
-    _kind=kind;
-    _rows=rows;
-    _cols=cols;
+
     int err=_device_alloc(*this,cq,rows,cols,_pitch,kind);
-    _row_size=_pitch/sizeof(numtyp);
-    #ifndef _UCL_DEVICE_PTR_MAT
-    _end=_array+_row_size*cols;
-    #endif
-    #ifndef UCL_NO_EXIT
     if (err!=UCL_SUCCESS) {
+      #ifndef UCL_NO_EXIT
       std::cerr << "UCL Error: Could not allocate " 
                 << rows*cols*sizeof(numtyp) << " bytes on device.\n";
       exit(1);
+      #endif
+      return err;
     }
+
+    _kind=kind;
+    _rows=rows;
+    _cols=cols;
+    _row_size=_pitch/sizeof(numtyp);
+    #ifndef _UCL_DEVICE_PTR_MAT
+    _end=_array+_row_size*cols;
     #endif
     #ifdef _OCL_MAT
     _offset=0;
@@ -94,20 +97,23 @@ class UCL_D_Mat : public UCL_BaseMat {
   inline int alloc(const size_t rows, const size_t cols, UCL_Device &device,
                    const enum UCL_MEMOPT kind=UCL_READ_WRITE) {
     clear();
-    _kind=kind;
-    _rows=rows;
-    _cols=cols;
+
     int err=_device_alloc(*this,device,rows,cols,_pitch,kind);
-    _row_size=_pitch/sizeof(numtyp);
-    #ifndef _UCL_DEVICE_PTR_MAT
-    _end=_array+_row_size*cols;
-    #endif
-    #ifndef UCL_NO_EXIT
     if (err!=UCL_SUCCESS) {
+      #ifndef UCL_NO_EXIT
       std::cerr << "UCL Error: Could not allocate "
                 << rows*cols*sizeof(numtyp) << " bytes on device.\n";
       exit(1);
+      #endif
+      return err;
     }
+
+    _kind=kind;
+    _rows=rows;
+    _cols=cols;
+    _row_size=_pitch/sizeof(numtyp);
+    #ifndef _UCL_DEVICE_PTR_MAT
+    _end=_array+_row_size*cols;
     #endif
     #ifdef _OCL_MAT
     _offset=0;
diff --git a/lib/gpu/geryon/ucl_d_vec.h b/lib/gpu/geryon/ucl_d_vec.h
index 45c94bee82..d3cf4c8ada 100644
--- a/lib/gpu/geryon/ucl_d_vec.h
+++ b/lib/gpu/geryon/ucl_d_vec.h
@@ -13,7 +13,7 @@
     copyright            : (C) 2009 by W. Michael Brown
     email                : brownw@ornl.gov
  ***************************************************************************/
- 
+
 /* -----------------------------------------------------------------------
    Copyright (2009) Sandia Corporation.  Under the terms of Contract
    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
@@ -60,19 +60,23 @@ class UCL_D_Vec : public UCL_BaseMat {
                    const enum UCL_MEMOPT kind=UCL_READ_WRITE) {
                         
     clear();
-    _kind=kind;
-    _cols=cols;
+
     _row_bytes=cols*sizeof(numtyp);
     int err=_device_alloc(*this,cq,_row_bytes,kind);
-    #ifndef _UCL_DEVICE_PTR_MAT
-    _end=_array+cols;
-    #endif
-    #ifndef UCL_NO_EXIT
     if (err!=UCL_SUCCESS) {
+      _row_bytes=0;
+      #ifndef UCL_NO_EXIT
       std::cerr << "UCL Error: Could not allocate " << _row_bytes
                 << " bytes on device.\n";
       exit(1);
+      #endif
+      return err;
     }
+
+    _kind=kind;
+    _cols=cols;
+    #ifndef _UCL_DEVICE_PTR_MAT
+    _end=_array+cols;
     #endif
     #ifdef _OCL_MAT
     _offset=0;
@@ -90,19 +94,22 @@ class UCL_D_Vec : public UCL_BaseMat {
   inline int alloc(const size_t cols, UCL_Device &device,
                    const enum UCL_MEMOPT kind=UCL_READ_WRITE) {
     clear();
-    _kind=kind;
-    _cols=cols;
     _row_bytes=cols*sizeof(numtyp);
     int err=_device_alloc(*this,device,_row_bytes,kind);
-    #ifndef _UCL_DEVICE_PTR_MAT
-    _end=_array+cols;
-    #endif
-    #ifndef UCL_NO_EXIT
     if (err!=UCL_SUCCESS) {
+      _row_bytes=0;
+      #ifndef UCL_NO_EXIT
       std::cerr << "UCL Error: Could not allocate " << _row_bytes
                 << " bytes on device.\n";
       exit(1);
+      #endif
+      return err;
     }
+
+    _kind=kind;
+    _cols=cols;
+    #ifndef _UCL_DEVICE_PTR_MAT
+    _end=_array+cols;
     #endif
     #ifdef _OCL_MAT
     _offset=0;
diff --git a/lib/gpu/geryon/ucl_h_mat.h b/lib/gpu/geryon/ucl_h_mat.h
index 51593cfa23..ec7d313cae 100644
--- a/lib/gpu/geryon/ucl_h_mat.h
+++ b/lib/gpu/geryon/ucl_h_mat.h
@@ -13,7 +13,7 @@
     copyright            : (C) 2009 by W. Michael Brown
     email                : brownw@ornl.gov
  ***************************************************************************/
- 
+
 /* -----------------------------------------------------------------------
    Copyright (2009) Sandia Corporation.  Under the terms of Contract
    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
@@ -39,7 +39,11 @@ class UCL_H_Mat : public UCL_BaseMat {
    };
    typedef numtyp data_type; 
    
-  UCL_H_Mat() : _kind(UCL_VIEW), _rows(0) { }
+  UCL_H_Mat() : _kind(UCL_VIEW), _rows(0) {
+    #ifdef _OCL_MAT
+    _carray=(cl_mem)(0);
+    #endif
+  }
   ~UCL_H_Mat() { if (_kind!=UCL_VIEW) _host_free(*this,_kind); }
   
   /// Construct with specied number of rows and columns
@@ -59,18 +63,22 @@ class UCL_H_Mat : public UCL_BaseMat {
   inline int alloc(const size_t rows, const size_t cols, mat_type &cq,
                    const enum UCL_MEMOPT kind=UCL_RW_OPTIMIZED) {
     clear();
-    _cols=cols;
-    _rows=rows;
+
     _row_bytes=cols*sizeof(numtyp);
-    _kind=kind;
-    int err=_host_alloc(*this,cq,_row_bytes*_rows,kind);
-    #ifndef UCL_NO_EXIT
+    int err=_host_alloc(*this,cq,_row_bytes*rows,kind);
     if (err!=UCL_SUCCESS) {
+      _row_bytes=0;
+      #ifndef UCL_NO_EXIT
       std::cerr << "UCL Error: Could not allocate " << _row_bytes*_rows
                 << " bytes on host.\n";
       exit(1);
+      #endif 
+      return err;
     }
-    #endif 
+
+    _cols=cols;
+    _rows=rows;
+    _kind=kind;
     _end=_array+rows*cols;
     return err;
   }    
diff --git a/lib/gpu/geryon/ucl_h_vec.h b/lib/gpu/geryon/ucl_h_vec.h
index ca1dd12a47..5d9770d4ea 100644
--- a/lib/gpu/geryon/ucl_h_vec.h
+++ b/lib/gpu/geryon/ucl_h_vec.h
@@ -13,7 +13,7 @@
     copyright            : (C) 2009 by W. Michael Brown
     email                : brownw@ornl.gov
  ***************************************************************************/
- 
+
 /* -----------------------------------------------------------------------
    Copyright (2009) Sandia Corporation.  Under the terms of Contract
    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
@@ -39,7 +39,11 @@ class UCL_H_Vec : public UCL_BaseMat {
    };
    typedef numtyp data_type; 
    
-  UCL_H_Vec() : _kind(UCL_VIEW), _cols(0) { }
+  UCL_H_Vec() : _kind(UCL_VIEW), _cols(0) {
+    #ifdef _OCL_MAT
+    _carray=(cl_mem)(0);
+    #endif
+  }
   ~UCL_H_Vec() { if (_kind!=UCL_VIEW) _host_free(*this,_kind); }
   
   /// Construct with n columns
@@ -59,18 +63,23 @@ class UCL_H_Vec : public UCL_BaseMat {
   inline int alloc(const size_t cols, mat_type &cq,
                    const enum UCL_MEMOPT kind=UCL_RW_OPTIMIZED) {
     clear();
-    _cols=cols;
+
     _row_bytes=cols*sizeof(numtyp);
-    _kind=kind;
     int err=_host_alloc(*this,cq,_row_bytes,kind);
-    _end=_array+cols;
-    #ifndef UCL_NO_EXIT
+
     if (err!=UCL_SUCCESS) {
+      _row_bytes=0;
+      #ifndef UCL_NO_EXIT
       std::cerr << "UCL Error: Could not allocate " << _row_bytes
                 << " bytes on host.\n";
       exit(1);
+      #endif
+      return err;
     }
-    #endif 
+
+    _cols=cols;
+    _kind=kind;
+    _end=_array+cols;
     return err;
   }    
 
@@ -84,18 +93,23 @@ class UCL_H_Vec : public UCL_BaseMat {
   inline int alloc(const size_t cols, UCL_Device &device,
                    const enum UCL_MEMOPT kind=UCL_RW_OPTIMIZED) {
     clear();
-    _cols=cols;
+
     _row_bytes=cols*sizeof(numtyp);
-    _kind=kind;
     int err=_host_alloc(*this,device,_row_bytes,kind);
-    _end=_array+cols;
-    #ifndef UCL_NO_EXIT
+
     if (err!=UCL_SUCCESS) {
+      _row_bytes=0;
+      #ifndef UCL_NO_EXIT
       std::cerr << "UCL Error: Could not allocate " << _row_bytes
                 << " bytes on host.\n";
       exit(1);
+      #endif 
+      return err;
     }
-    #endif 
+
+    _cols=cols;
+    _kind=kind;
+    _end=_array+cols;
     return err;
   }
   
diff --git a/lib/gpu/lj96_cut_gpu.cpp b/lib/gpu/lj96_cut_gpu.cpp
index a1634e33ed..df83afd521 100644
--- a/lib/gpu/lj96_cut_gpu.cpp
+++ b/lib/gpu/lj96_cut_gpu.cpp
@@ -28,11 +28,11 @@ static LJ96_GPU_Memory<PRECISION,ACC_PRECISION> LJ96MF;
 // ---------------------------------------------------------------------------
 // Allocate memory on host and device and copy constants to device
 // ---------------------------------------------------------------------------
-bool lj96_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
-                   double **host_lj2, double **host_lj3, double **host_lj4,
-                   double **offset, double *special_lj, const int inum,
-                   const int nall, const int max_nbors, const int maxspecial,
-                   const double cell_size, int &gpu_mode, FILE *screen) {
+int lj96_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
+                  double **host_lj2, double **host_lj3, double **host_lj4,
+                  double **offset, double *special_lj, const int inum,
+                  const int nall, const int max_nbors, const int maxspecial,
+                  const double cell_size, int &gpu_mode, FILE *screen) {
   LJ96MF.clear();
   gpu_mode=LJ96MF.device->gpu_mode();
   double gpu_split=LJ96MF.device->particle_split();
@@ -53,13 +53,11 @@ bool lj96_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
     fflush(screen);
   }
 
-  if (world_me==0) {
-    bool init_ok=LJ96MF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3,
-                             host_lj4, offset, special_lj, inum, nall, 300,
-                             maxspecial, cell_size, gpu_split, screen);
-    if (!init_ok)
-      return false;
-  }
+  int init_ok=0;
+  if (world_me==0)
+    init_ok=LJ96MF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3,
+                        host_lj4, offset, special_lj, inum, nall, 300,
+                        maxspecial, cell_size, gpu_split, screen);
 
   LJ96MF.device->world_barrier();
   if (message)
@@ -74,22 +72,21 @@ bool lj96_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
                 last_gpu,i);
       fflush(screen);
     }
-    if (gpu_rank==i && world_me!=0) {
-      bool init_ok=LJ96MF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3,
-                              host_lj4, offset, special_lj, inum, 
-                              nall, 300, maxspecial, cell_size, gpu_split,
-			      screen);
-      if (!init_ok)
-        return false;
-    }
+    if (gpu_rank==i && world_me!=0)
+      init_ok=LJ96MF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4,
+                          offset, special_lj, inum,  nall, 300, maxspecial,
+                          cell_size, gpu_split, screen);
+
     LJ96MF.device->gpu_barrier();
     if (message) 
       fprintf(screen,"Done.\n");
   }
   if (message)
     fprintf(screen,"\n");
-  LJ96MF.estimate_gpu_overhead();
-  return true;
+
+  if (init_ok==0)
+    LJ96MF.estimate_gpu_overhead();
+  return init_ok;
 }
 
 void lj96_gpu_clear() {
diff --git a/lib/gpu/lj96_cut_gpu_memory.cpp b/lib/gpu/lj96_cut_gpu_memory.cpp
index c30c5d8caf..8f0a9bae0c 100644
--- a/lib/gpu/lj96_cut_gpu_memory.cpp
+++ b/lib/gpu/lj96_cut_gpu_memory.cpp
@@ -42,7 +42,7 @@ int LJ96_GPU_MemoryT::bytes_per_atom(const int max_nbors) const {
 }
 
 template <class numtyp, class acctyp>
-bool LJ96_GPU_MemoryT::init(const int ntypes,
+int LJ96_GPU_MemoryT::init(const int ntypes,
                            double **host_cutsq, double **host_lj1, 
                            double **host_lj2, double **host_lj3, 
                            double **host_lj4, double **host_offset, 
@@ -50,8 +50,11 @@ bool LJ96_GPU_MemoryT::init(const int ntypes,
                            const int nall, const int max_nbors,
                            const int maxspecial, const double cell_size,
                            const double gpu_split, FILE *_screen) {
-  this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
-                    _screen,lj96_cut_gpu_kernel);
+  int success;
+  success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
+                            _screen,lj96_cut_gpu_kernel);
+  if (success!=0)
+    return success;
 
   // If atom type constants fit in shared memory use fast kernel
   int lj_types=ntypes;
@@ -84,7 +87,7 @@ bool LJ96_GPU_MemoryT::init(const int ntypes,
 
   _allocated=true;
   this->_max_bytes=lj1.row_bytes()+lj3.row_bytes()+sp_lj.row_bytes();
-  return true;
+  return 0;
 }
 
 template <class numtyp, class acctyp>
diff --git a/lib/gpu/lj96_cut_gpu_memory.h b/lib/gpu/lj96_cut_gpu_memory.h
index 483ef05570..fe0a0b1665 100644
--- a/lib/gpu/lj96_cut_gpu_memory.h
+++ b/lib/gpu/lj96_cut_gpu_memory.h
@@ -29,13 +29,20 @@ class LJ96_GPU_Memory : public AtomicGPUMemory<numtyp, acctyp> {
   /// Clear any previous data and set up for a new LAMMPS run
   /** \param max_nbors initial number of rows in the neighbor matrix
     * \param cell_size cutoff + skin
-    * \param gpu_split fraction of particles handled by device **/
-  bool init(const int ntypes, double **host_cutsq, double **host_lj1,
-            double **host_lj2, double **host_lj3, double **host_lj4,
-            double **host_offset, double *host_special_lj,
-            const int nlocal, const int nall, const int max_nbors, 
-            const int maxspecial, const double cell_size, 
-            const double gpu_split, FILE *screen);
+    * \param gpu_split fraction of particles handled by device
+    * 
+    * Returns:
+    * -  0 if successfull
+    * - -1 if fix gpu not found
+    * - -3 if there is an out of memory error
+    * - -4 if the GPU library was not compiled for GPU
+    * - -5 Double precision is not supported on card **/
+  int init(const int ntypes, double **host_cutsq, double **host_lj1,
+           double **host_lj2, double **host_lj3, double **host_lj4,
+           double **host_offset, double *host_special_lj,
+           const int nlocal, const int nall, const int max_nbors, 
+           const int maxspecial, const double cell_size, 
+           const double gpu_split, FILE *screen);
 
   /// Clear all host and device data
   /** \note This is called at the beginning of the init() routine **/
diff --git a/lib/gpu/lj_cut_gpu.cpp b/lib/gpu/lj_cut_gpu.cpp
index a40d0a7102..aef085f7c9 100644
--- a/lib/gpu/lj_cut_gpu.cpp
+++ b/lib/gpu/lj_cut_gpu.cpp
@@ -28,12 +28,11 @@ static LJL_GPU_Memory<PRECISION,ACC_PRECISION> LJLMF;
 // ---------------------------------------------------------------------------
 // Allocate memory on host and device and copy constants to device
 // ---------------------------------------------------------------------------
-bool ljl_gpu_init(const int ntypes, double **cutsq,
-                  double **host_lj1, double **host_lj2, double **host_lj3, 
-                  double **host_lj4, double **offset, double *special_lj,
-                  const int inum, const int nall, const int max_nbors, 
-                  const int maxspecial, const double cell_size, int &gpu_mode,
-                  FILE *screen) {
+int ljl_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
+                 double **host_lj2, double **host_lj3, double **host_lj4,
+                 double **offset, double *special_lj, const int inum,
+                 const int nall, const int max_nbors,  const int maxspecial,
+                 const double cell_size, int &gpu_mode, FILE *screen) {
   LJLMF.clear();
   gpu_mode=LJLMF.device->gpu_mode();
   double gpu_split=LJLMF.device->particle_split();
@@ -54,13 +53,11 @@ bool ljl_gpu_init(const int ntypes, double **cutsq,
     fflush(screen);
   }
 
-  if (world_me==0) {
-    bool init_ok=LJLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, 
-                            host_lj4, offset, special_lj, inum, nall, 300,
-                            maxspecial, cell_size, gpu_split, screen);
-    if (!init_ok)
-      return false;
-  }
+  int init_ok=0;
+  if (world_me==0)
+    init_ok=LJLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, 
+                       host_lj4, offset, special_lj, inum, nall, 300,
+                       maxspecial, cell_size, gpu_split, screen);
 
   LJLMF.device->world_barrier();
   if (message)
@@ -75,23 +72,21 @@ bool ljl_gpu_init(const int ntypes, double **cutsq,
                 last_gpu,i);
       fflush(screen);
     }
-    if (gpu_rank==i && world_me!=0) {
-      bool init_ok=LJLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, 
-                              host_lj4, offset, special_lj, inum, nall, 300,
-                              maxspecial, cell_size, gpu_split,
-			      screen);
-      if (!init_ok)
-        return false;
-    }
+    if (gpu_rank==i && world_me!=0)
+      init_ok=LJLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4,
+                         offset, special_lj, inum, nall, 300, maxspecial,
+                         cell_size, gpu_split, screen);
+
     LJLMF.device->gpu_barrier();
     if (message) 
       fprintf(screen,"Done.\n");
   }
   if (message)
     fprintf(screen,"\n");
-  LJLMF.estimate_gpu_overhead();
 
-  return true;
+  if (init_ok==0)
+    LJLMF.estimate_gpu_overhead();
+  return init_ok;
 }
 
 void ljl_gpu_clear() {
diff --git a/lib/gpu/lj_cut_gpu_memory.cpp b/lib/gpu/lj_cut_gpu_memory.cpp
index 27a759fec1..32911e22a0 100644
--- a/lib/gpu/lj_cut_gpu_memory.cpp
+++ b/lib/gpu/lj_cut_gpu_memory.cpp
@@ -42,16 +42,19 @@ int LJL_GPU_MemoryT::bytes_per_atom(const int max_nbors) const {
 }
 
 template <class numtyp, class acctyp>
-bool LJL_GPU_MemoryT::init(const int ntypes, 
-                           double **host_cutsq, double **host_lj1, 
-                           double **host_lj2, double **host_lj3, 
-                           double **host_lj4, double **host_offset, 
-                           double *host_special_lj, const int nlocal,
-                           const int nall, const int max_nbors,
-                           const int maxspecial, const double cell_size,
-                           const double gpu_split, FILE *_screen) {
-  this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
-                    _screen,lj_cut_gpu_kernel);
+int LJL_GPU_MemoryT::init(const int ntypes, 
+                          double **host_cutsq, double **host_lj1, 
+                          double **host_lj2, double **host_lj3, 
+                          double **host_lj4, double **host_offset, 
+                          double *host_special_lj, const int nlocal,
+                          const int nall, const int max_nbors,
+                          const int maxspecial, const double cell_size,
+                          const double gpu_split, FILE *_screen) {
+  int success;
+  success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
+                            _screen,lj_cut_gpu_kernel);
+  if (success!=0)
+    return success;
 
   // If atom type constants fit in shared memory use fast kernel
   int lj_types=ntypes;
@@ -84,7 +87,7 @@ bool LJL_GPU_MemoryT::init(const int ntypes,
 
   _allocated=true;
   this->_max_bytes=lj1.row_bytes()+lj3.row_bytes()+sp_lj.row_bytes();
-  return true;
+  return 0;
 }
 
 template <class numtyp, class acctyp>
diff --git a/lib/gpu/lj_cut_gpu_memory.h b/lib/gpu/lj_cut_gpu_memory.h
index 123b739649..4b86b133a1 100644
--- a/lib/gpu/lj_cut_gpu_memory.h
+++ b/lib/gpu/lj_cut_gpu_memory.h
@@ -29,13 +29,20 @@ class LJL_GPU_Memory : public AtomicGPUMemory<numtyp, acctyp> {
   /// Clear any previous data and set up for a new LAMMPS run
   /** \param max_nbors initial number of rows in the neighbor matrix
     * \param cell_size cutoff + skin
-    * \param gpu_split fraction of particles handled by device **/
-  bool init(const int ntypes, double **host_cutsq,
-            double **host_lj1, double **host_lj2, double **host_lj3,
-            double **host_lj4, double **host_offset, double *host_special_lj,
-            const int nlocal, const int nall, const int max_nbors, 
-            const int maxspecial, const double cell_size, 
-            const double gpu_split, FILE *screen);
+    * \param gpu_split fraction of particles handled by device
+    * 
+    * Returns:
+    * -  0 if successfull
+    * - -1 if fix gpu not found
+    * - -3 if there is an out of memory error
+    * - -4 if the GPU library was not compiled for GPU
+    * - -5 Double precision is not supported on card **/
+  int init(const int ntypes, double **host_cutsq,
+           double **host_lj1, double **host_lj2, double **host_lj3,
+           double **host_lj4, double **host_offset, double *host_special_lj,
+           const int nlocal, const int nall, const int max_nbors, 
+           const int maxspecial, const double cell_size, 
+           const double gpu_split, FILE *screen);
 
   /// Clear all host and device data
   /** \note This is called at the beginning of the init() routine **/
diff --git a/lib/gpu/lj_expand_gpu.cpp b/lib/gpu/lj_expand_gpu.cpp
index 00ae841d94..fd0392f3fc 100644
--- a/lib/gpu/lj_expand_gpu.cpp
+++ b/lib/gpu/lj_expand_gpu.cpp
@@ -28,12 +28,12 @@ static LJE_GPU_Memory<PRECISION,ACC_PRECISION> LJEMF;
 // ---------------------------------------------------------------------------
 // Allocate memory on host and device and copy constants to device
 // ---------------------------------------------------------------------------
-bool lje_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
-                  double **host_lj2, double **host_lj3, double **host_lj4,
-                  double **offset, double **shift, double *special_lj,
-                  const int inum, const int nall, const int max_nbors, 
-                  const int maxspecial, const double cell_size, int &gpu_mode,
-                  FILE *screen) {
+int lje_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
+                 double **host_lj2, double **host_lj3, double **host_lj4,
+                 double **offset, double **shift, double *special_lj,
+                 const int inum, const int nall, const int max_nbors, 
+                 const int maxspecial, const double cell_size, int &gpu_mode,
+                 FILE *screen) {
   LJEMF.clear();
   gpu_mode=LJEMF.device->gpu_mode();
   double gpu_split=LJEMF.device->particle_split();
@@ -54,13 +54,11 @@ bool lje_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
     fflush(screen);
   }
 
-  if (world_me==0) {
-    bool init_ok=LJEMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3,
-                            host_lj4, offset, shift, special_lj, inum, nall, 300,
-                            maxspecial, cell_size, gpu_split, screen);
-    if (!init_ok)
-      return false;
-  }
+  int init_ok=0;
+  if (world_me==0)
+    init_ok=LJEMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3,
+                       host_lj4, offset, shift, special_lj, inum, nall, 300,
+                       maxspecial, cell_size, gpu_split, screen);
 
   LJEMF.device->world_barrier();
   if (message)
@@ -75,21 +73,21 @@ bool lje_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
                 last_gpu,i);
       fflush(screen);
     }
-    if (gpu_rank==i && world_me!=0) {
-      bool init_ok=LJEMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3,
-                              host_lj4, offset, shift, special_lj, inum, nall, 
-                              300,maxspecial, cell_size, gpu_split,screen);
-      if (!init_ok)
-        return false;
-    }
+    if (gpu_rank==i && world_me!=0)
+      init_ok=LJEMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4,
+                         offset, shift, special_lj, inum, nall, 300, maxspecial,
+                         cell_size, gpu_split,screen);
+
     LJEMF.device->world_barrier();
     if (message) 
       fprintf(screen,"Done.\n");
   }
   if (message)
     fprintf(screen,"\n");
-  LJEMF.estimate_gpu_overhead();
-  return true;
+
+  if (init_ok==0)
+    LJEMF.estimate_gpu_overhead();
+  return init_ok;
 }
 
 void lje_gpu_clear() {
diff --git a/lib/gpu/lj_expand_gpu_memory.cpp b/lib/gpu/lj_expand_gpu_memory.cpp
index fe05b4fc2f..22eccab165 100644
--- a/lib/gpu/lj_expand_gpu_memory.cpp
+++ b/lib/gpu/lj_expand_gpu_memory.cpp
@@ -42,16 +42,19 @@ int LJE_GPU_MemoryT::bytes_per_atom(const int max_nbors) const {
 }
 
 template <class numtyp, class acctyp>
-bool LJE_GPU_MemoryT::init(const int ntypes, double **host_cutsq,
-                           double **host_lj1, double **host_lj2,
-                           double **host_lj3, double **host_lj4,
-                           double **host_offset, double **host_shift,
-                           double *host_special_lj, const int nlocal,
-                           const int nall, const int max_nbors,
-                           const int maxspecial, const double cell_size,
-                           const double gpu_split, FILE *_screen) {
-  this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
-                    _screen,lj_expand_gpu_kernel);
+int LJE_GPU_MemoryT::init(const int ntypes, double **host_cutsq,
+                          double **host_lj1, double **host_lj2,
+                          double **host_lj3, double **host_lj4,
+                          double **host_offset, double **host_shift,
+                          double *host_special_lj, const int nlocal,
+                          const int nall, const int max_nbors,
+                          const int maxspecial, const double cell_size,
+                          const double gpu_split, FILE *_screen) {
+  int success;
+  success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
+                            _screen,lj_expand_gpu_kernel);
+  if (success!=0)
+    return success;
 
   // If atom type constants fit in shared memory use fast kernel
   int lj_types=ntypes;
@@ -84,7 +87,7 @@ bool LJE_GPU_MemoryT::init(const int ntypes, double **host_cutsq,
 
   _allocated=true;
   this->_max_bytes=lj1.row_bytes()+lj3.row_bytes()+sp_lj.row_bytes();
-  return true;
+  return 0;
 }
 
 template <class numtyp, class acctyp>
diff --git a/lib/gpu/lj_expand_gpu_memory.h b/lib/gpu/lj_expand_gpu_memory.h
index 516af0c93d..a04b11efd1 100644
--- a/lib/gpu/lj_expand_gpu_memory.h
+++ b/lib/gpu/lj_expand_gpu_memory.h
@@ -29,13 +29,20 @@ class LJE_GPU_Memory : public AtomicGPUMemory<numtyp, acctyp> {
   /// Clear any previous data and set up for a new LAMMPS run
   /** \param max_nbors initial number of rows in the neighbor matrix
     * \param cell_size cutoff + skin
-    * \param gpu_split fraction of particles handled by device **/
-  bool init(const int ntypes, double **host_cutsq, double **host_lj1,
-            double **host_lj2, double **host_lj3, double **host_lj4,
-            double **host_offset, double **host_shift, double *host_special_lj,
-            const int nlocal, const int nall, const int max_nbors, 
-            const int maxspecial, const double cell_size, 
-            const double gpu_split, FILE *screen);
+    * \param gpu_split fraction of particles handled by device
+    * 
+    * Returns:
+    * -  0 if successfull
+    * - -1 if fix gpu not found
+    * - -3 if there is an out of memory error
+    * - -4 if the GPU library was not compiled for GPU
+    * - -5 Double precision is not supported on card **/
+  int init(const int ntypes, double **host_cutsq, double **host_lj1,
+           double **host_lj2, double **host_lj3, double **host_lj4,
+           double **host_offset, double **host_shift, double *host_special_lj,
+           const int nlocal, const int nall, const int max_nbors, 
+           const int maxspecial, const double cell_size, 
+           const double gpu_split, FILE *screen);
 
   /// Clear all host and device data
   /** \note This is called at the beginning of the init() routine **/
diff --git a/lib/gpu/ljc_cut_gpu.cpp b/lib/gpu/ljc_cut_gpu.cpp
index 6703c072d9..de6f4f3e62 100644
--- a/lib/gpu/ljc_cut_gpu.cpp
+++ b/lib/gpu/ljc_cut_gpu.cpp
@@ -28,13 +28,13 @@ static LJC_GPU_Memory<PRECISION,ACC_PRECISION> LJCMF;
 // ---------------------------------------------------------------------------
 // Allocate memory on host and device and copy constants to device
 // ---------------------------------------------------------------------------
-bool ljc_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
-                  double **host_lj2, double **host_lj3, double **host_lj4,
-                  double **offset, double *special_lj, const int inum,
-                  const int nall, const int max_nbors, const int maxspecial,
-                  const double cell_size, int &gpu_mode, FILE *screen,
-                  double **host_cut_ljsq, double **host_cut_coulsq,
-                  double *host_special_coul, const double qqrd2e) {
+int ljc_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
+                 double **host_lj2, double **host_lj3, double **host_lj4,
+                 double **offset, double *special_lj, const int inum,
+                 const int nall, const int max_nbors, const int maxspecial,
+                 const double cell_size, int &gpu_mode, FILE *screen,
+                 double **host_cut_ljsq, double **host_cut_coulsq,
+                 double *host_special_coul, const double qqrd2e) {
   LJCMF.clear();
   gpu_mode=LJCMF.device->gpu_mode();
   double gpu_split=LJCMF.device->particle_split();
@@ -55,15 +55,12 @@ bool ljc_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
     fflush(screen);
   }
 
-  if (world_me==0) {
-    bool init_ok=LJCMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3,
-                            host_lj4, offset, special_lj, inum, nall, 300,
-                            maxspecial, cell_size, gpu_split, screen,
-                            host_cut_ljsq, host_cut_coulsq, host_special_coul,
-                            qqrd2e);
-    if (!init_ok)
-      return false;
-  }
+  int init_ok=0;
+  if (world_me==0)
+    init_ok=LJCMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3,
+                       host_lj4, offset, special_lj, inum, nall, 300,
+                       maxspecial, cell_size, gpu_split, screen, host_cut_ljsq,
+                       host_cut_coulsq, host_special_coul, qqrd2e);
 
   LJCMF.device->world_barrier();
   if (message)
@@ -78,23 +75,22 @@ bool ljc_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
                 last_gpu,i);
       fflush(screen);
     }
-    if (gpu_rank==i && world_me!=0) {
-      bool init_ok=LJCMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3,
-                              host_lj4, offset, special_lj, inum, nall, 300,
-                              maxspecial, cell_size, gpu_split,
-			      screen, host_cut_ljsq, host_cut_coulsq,
-                              host_special_coul, qqrd2e);
-      if (!init_ok)
-        return false;
-    }
+    if (gpu_rank==i && world_me!=0)
+      init_ok=LJCMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4,
+                         offset, special_lj, inum, nall, 300, maxspecial,
+                         cell_size, gpu_split, screen, host_cut_ljsq,
+                         host_cut_coulsq, host_special_coul, qqrd2e);
+
     LJCMF.device->gpu_barrier();
     if (message) 
       fprintf(screen,"Done.\n");
   }
   if (message)
     fprintf(screen,"\n");
-  LJCMF.estimate_gpu_overhead();
-  return true;
+
+  if (init_ok==0)
+    LJCMF.estimate_gpu_overhead();
+  return init_ok;
 }
 
 void ljc_gpu_clear() {
diff --git a/lib/gpu/ljc_cut_gpu_memory.cpp b/lib/gpu/ljc_cut_gpu_memory.cpp
index b662b653fe..7387c34aff 100644
--- a/lib/gpu/ljc_cut_gpu_memory.cpp
+++ b/lib/gpu/ljc_cut_gpu_memory.cpp
@@ -43,18 +43,21 @@ int LJC_GPU_MemoryT::bytes_per_atom(const int max_nbors) const {
 }
 
 template <class numtyp, class acctyp>
-bool LJC_GPU_MemoryT::init(const int ntypes,
-                           double **host_cutsq, double **host_lj1, 
-                           double **host_lj2, double **host_lj3, 
-                           double **host_lj4, double **host_offset, 
-                           double *host_special_lj, const int nlocal,
-                           const int nall, const int max_nbors,
-                           const int maxspecial, const double cell_size,
-                           const double gpu_split, FILE *_screen,
-                           double **host_cut_ljsq, double **host_cut_coulsq,
-                           double *host_special_coul, const double qqrd2e) {
-  this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
-                    _screen,ljc_cut_gpu_kernel);
+int LJC_GPU_MemoryT::init(const int ntypes,
+                          double **host_cutsq, double **host_lj1, 
+                          double **host_lj2, double **host_lj3, 
+                          double **host_lj4, double **host_offset, 
+                          double *host_special_lj, const int nlocal,
+                          const int nall, const int max_nbors,
+                          const int maxspecial, const double cell_size,
+                          const double gpu_split, FILE *_screen,
+                          double **host_cut_ljsq, double **host_cut_coulsq,
+                          double *host_special_coul, const double qqrd2e) {
+  int success;
+  success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
+                            _screen,ljc_cut_gpu_kernel);
+  if (success!=0)
+    return success;
 
   // If atom type constants fit in shared memory use fast kernel
   int lj_types=ntypes;
@@ -95,7 +98,7 @@ bool LJC_GPU_MemoryT::init(const int ntypes,
   _allocated=true;
   this->_max_bytes=lj1.row_bytes()+lj3.row_bytes()+cutsq.row_bytes()+
                    sp_lj.row_bytes();
-  return true;
+  return 0;
 }
 
 template <class numtyp, class acctyp>
diff --git a/lib/gpu/ljc_cut_gpu_memory.h b/lib/gpu/ljc_cut_gpu_memory.h
index 4dedce957a..552f9d9881 100644
--- a/lib/gpu/ljc_cut_gpu_memory.h
+++ b/lib/gpu/ljc_cut_gpu_memory.h
@@ -29,15 +29,22 @@ class LJC_GPU_Memory : public ChargeGPUMemory<numtyp, acctyp> {
   /// Clear any previous data and set up for a new LAMMPS run
   /** \param max_nbors initial number of rows in the neighbor matrix
     * \param cell_size cutoff + skin
-    * \param gpu_split fraction of particles handled by device **/
-  bool init(const int ntypes, double **host_cutsq, double **host_lj1,
-            double **host_lj2, double **host_lj3, double **host_lj4,
-            double **host_offset, double *host_special_lj,
-            const int nlocal, const int nall, const int max_nbors, 
-            const int maxspecial, const double cell_size,
-            const double gpu_split, FILE *screen, double **host_cut_ljsq,
-            double **host_cut_coulsq, double *host_special_coul,
-            const double qqrd2e);
+    * \param gpu_split fraction of particles handled by device
+    * 
+    * Returns:
+    * -  0 if successfull
+    * - -1 if fix gpu not found
+    * - -3 if there is an out of memory error
+    * - -4 if the GPU library was not compiled for GPU
+    * - -5 Double precision is not supported on card **/
+  int init(const int ntypes, double **host_cutsq, double **host_lj1,
+           double **host_lj2, double **host_lj3, double **host_lj4,
+           double **host_offset, double *host_special_lj,
+           const int nlocal, const int nall, const int max_nbors, 
+           const int maxspecial, const double cell_size,
+           const double gpu_split, FILE *screen, double **host_cut_ljsq,
+           double **host_cut_coulsq, double *host_special_coul,
+           const double qqrd2e);
 
   /// Clear all host and device data
   /** \note This is called at the beginning of the init() routine **/
diff --git a/lib/gpu/ljcl_cut_gpu.cpp b/lib/gpu/ljcl_cut_gpu.cpp
index 04cdab81d9..167f41b374 100644
--- a/lib/gpu/ljcl_cut_gpu.cpp
+++ b/lib/gpu/ljcl_cut_gpu.cpp
@@ -28,14 +28,14 @@ static LJCL_GPU_Memory<PRECISION,ACC_PRECISION> LJCLMF;
 // ---------------------------------------------------------------------------
 // Allocate memory on host and device and copy constants to device
 // ---------------------------------------------------------------------------
-bool ljcl_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
-                   double **host_lj2, double **host_lj3, double **host_lj4,
-                   double **offset, double *special_lj, const int inum,
-                   const int nall, const int max_nbors, const int maxspecial,
-                   const double cell_size, int &gpu_mode, FILE *screen,
-                   double **host_cut_ljsq, double host_cut_coulsq,
-                   double *host_special_coul, const double qqrd2e,
-                   const double g_ewald) {
+int ljcl_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
+                  double **host_lj2, double **host_lj3, double **host_lj4,
+                  double **offset, double *special_lj, const int inum,
+                  const int nall, const int max_nbors, const int maxspecial,
+                  const double cell_size, int &gpu_mode, FILE *screen,
+                  double **host_cut_ljsq, double host_cut_coulsq,
+                  double *host_special_coul, const double qqrd2e,
+                  const double g_ewald) {
   LJCLMF.clear();
   gpu_mode=LJCLMF.device->gpu_mode();
   double gpu_split=LJCLMF.device->particle_split();
@@ -56,15 +56,12 @@ bool ljcl_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
     fflush(screen);
   }
 
-  if (world_me==0) {
-    bool init_ok=LJCLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3,
-                            host_lj4, offset, special_lj, inum, nall, 300,
-                            maxspecial, cell_size, gpu_split, screen,
-                            host_cut_ljsq, host_cut_coulsq, host_special_coul,
-                            qqrd2e,g_ewald);
-    if (!init_ok)
-      return false;
-  }
+  int init_ok=0;
+  if (world_me==0)
+    init_ok=LJCLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4,
+                        offset, special_lj, inum, nall, 300, maxspecial,
+                        cell_size, gpu_split, screen, host_cut_ljsq,
+                        host_cut_coulsq, host_special_coul, qqrd2e, g_ewald);
 
   LJCLMF.device->world_barrier();
   if (message)
@@ -79,23 +76,22 @@ bool ljcl_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
                 last_gpu,i);
       fflush(screen);
     }
-    if (gpu_rank==i && world_me!=0) {
-      bool init_ok=LJCLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3,
-                              host_lj4, offset, special_lj, inum, nall, 300,
-                              maxspecial, cell_size, gpu_split,
-			      screen, host_cut_ljsq, host_cut_coulsq,
-                              host_special_coul, qqrd2e, g_ewald);
-      if (!init_ok)
-        return false;
-    }
+    if (gpu_rank==i && world_me!=0)
+      init_ok=LJCLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4,
+                          offset, special_lj, inum, nall, 300, maxspecial,
+                          cell_size, gpu_split, screen, host_cut_ljsq,
+                          host_cut_coulsq, host_special_coul, qqrd2e, g_ewald);
+
     LJCLMF.device->gpu_barrier();
     if (message) 
       fprintf(screen,"Done.\n");
   }
   if (message)
     fprintf(screen,"\n");
-  LJCLMF.estimate_gpu_overhead();
-  return true;
+
+  if (init_ok==0)
+    LJCLMF.estimate_gpu_overhead();
+  return init_ok;
 }
 
 void ljcl_gpu_clear() {
diff --git a/lib/gpu/ljcl_cut_gpu_memory.cpp b/lib/gpu/ljcl_cut_gpu_memory.cpp
index 1f638c030c..ca7ac7d21f 100644
--- a/lib/gpu/ljcl_cut_gpu_memory.cpp
+++ b/lib/gpu/ljcl_cut_gpu_memory.cpp
@@ -43,7 +43,7 @@ int LJCL_GPU_MemoryT::bytes_per_atom(const int max_nbors) const {
 }
 
 template <class numtyp, class acctyp>
-bool LJCL_GPU_MemoryT::init(const int ntypes,
+int LJCL_GPU_MemoryT::init(const int ntypes,
                            double **host_cutsq, double **host_lj1, 
                            double **host_lj2, double **host_lj3, 
                            double **host_lj4, double **host_offset, 
@@ -54,8 +54,11 @@ bool LJCL_GPU_MemoryT::init(const int ntypes,
                            double **host_cut_ljsq, const double host_cut_coulsq,
                            double *host_special_coul, const double qqrd2e,
                            const double g_ewald) {
-  this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
-                    _screen,ljcl_cut_gpu_kernel);
+  int success;
+  success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
+                            _screen,ljcl_cut_gpu_kernel);
+  if (success!=0)
+    return success;
 
   // If atom type constants fit in shared memory use fast kernel
   int lj_types=ntypes;
@@ -94,7 +97,7 @@ bool LJCL_GPU_MemoryT::init(const int ntypes,
 
   _allocated=true;
   this->_max_bytes=lj1.row_bytes()+lj3.row_bytes()+sp_lj.row_bytes();
-  return true;
+  return 0;
 }
 
 template <class numtyp, class acctyp>
diff --git a/lib/gpu/ljcl_cut_gpu_memory.h b/lib/gpu/ljcl_cut_gpu_memory.h
index 056ba0e41f..fae4c07040 100644
--- a/lib/gpu/ljcl_cut_gpu_memory.h
+++ b/lib/gpu/ljcl_cut_gpu_memory.h
@@ -29,15 +29,22 @@ class LJCL_GPU_Memory : public ChargeGPUMemory<numtyp, acctyp> {
   /// Clear any previous data and set up for a new LAMMPS run
   /** \param max_nbors initial number of rows in the neighbor matrix
     * \param cell_size cutoff + skin
-    * \param gpu_split fraction of particles handled by device **/
-  bool init(const int ntypes, double **host_cutsq,
-            double **host_lj1, double **host_lj2, double **host_lj3,
-            double **host_lj4, double **host_offset, double *host_special_lj,
-            const int nlocal, const int nall, const int max_nbors, 
-            const int maxspecial, const double cell_size, 
-            const double gpu_split, FILE *screen, double **host_cut_ljsq,
-            const double host_cut_coulsq, double *host_special_coul,
-            const double qqrd2e, const double g_ewald);
+    * \param gpu_split fraction of particles handled by device
+    * 
+    * Returns:
+    * -  0 if successfull
+    * - -1 if fix gpu not found
+    * - -3 if there is an out of memory error
+    * - -4 if the GPU library was not compiled for GPU
+    * - -5 Double precision is not supported on card **/
+  int init(const int ntypes, double **host_cutsq,
+           double **host_lj1, double **host_lj2, double **host_lj3,
+           double **host_lj4, double **host_offset, double *host_special_lj,
+           const int nlocal, const int nall, const int max_nbors, 
+           const int maxspecial, const double cell_size, 
+           const double gpu_split, FILE *screen, double **host_cut_ljsq,
+           const double host_cut_coulsq, double *host_special_coul,
+           const double qqrd2e, const double g_ewald);
 
   /// Clear all host and device data
   /** \note This is called at the beginning of the init() routine **/
diff --git a/lib/gpu/morse_gpu.cpp b/lib/gpu/morse_gpu.cpp
index ea86b4ef2a..96636128db 100644
--- a/lib/gpu/morse_gpu.cpp
+++ b/lib/gpu/morse_gpu.cpp
@@ -28,12 +28,12 @@ static MOR_GPU_Memory<PRECISION,ACC_PRECISION> MORMF;
 // ---------------------------------------------------------------------------
 // Allocate memory on host and device and copy constants to device
 // ---------------------------------------------------------------------------
-bool mor_gpu_init(const int ntypes, double **cutsq,
-                  double **host_lj1, double **host_lj2, double **host_lj3, 
-                  double **host_lj4, double **offset, double *special_lj,
-                  const int inum, const int nall, const int max_nbors, 
-                  const int maxspecial, const double cell_size, int &gpu_mode,
-                  FILE *screen) {
+int mor_gpu_init(const int ntypes, double **cutsq,
+                 double **host_lj1, double **host_lj2, double **host_lj3, 
+                 double **host_lj4, double **offset, double *special_lj,
+                 const int inum, const int nall, const int max_nbors, 
+                 const int maxspecial, const double cell_size, int &gpu_mode,
+                 FILE *screen) {
   MORMF.clear();
   gpu_mode=MORMF.device->gpu_mode();
   double gpu_split=MORMF.device->particle_split();
@@ -54,13 +54,11 @@ bool mor_gpu_init(const int ntypes, double **cutsq,
     fflush(screen);
   }
 
-  if (world_me==0) {
-    bool init_ok=MORMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, 
-                            host_lj4, offset, special_lj, inum, nall, 300,
-                            maxspecial, cell_size, gpu_split, screen);
-    if (!init_ok)
-      return false;
-  }
+  int init_ok=0;
+  if (world_me==0)
+    init_ok=MORMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, 
+                       host_lj4, offset, special_lj, inum, nall, 300,
+                       maxspecial, cell_size, gpu_split, screen);
 
   MORMF.device->world_barrier();
   if (message)
@@ -75,22 +73,21 @@ bool mor_gpu_init(const int ntypes, double **cutsq,
                 last_gpu,i);
       fflush(screen);
     }
-    if (gpu_rank==i && world_me!=0) {
-      bool init_ok=MORMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, 
-                              host_lj4, offset, special_lj, inum, nall, 300,
-                              maxspecial, cell_size, gpu_split,
-			      screen);
-      if (!init_ok)
-        return false;
-    }
+    if (gpu_rank==i && world_me!=0)
+      init_ok=MORMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4,
+                         offset, special_lj, inum, nall, 300, maxspecial,
+                         cell_size, gpu_split, screen);
+
     MORMF.device->gpu_barrier();
     if (message) 
       fprintf(screen,"Done.\n");
   }
   if (message)
     fprintf(screen,"\n");
-  MORMF.estimate_gpu_overhead();
-  return true;
+
+  if (init_ok==0)
+    MORMF.estimate_gpu_overhead();
+  return init_ok;
 }
 
 void mor_gpu_clear() {
diff --git a/lib/gpu/morse_gpu_memory.cpp b/lib/gpu/morse_gpu_memory.cpp
index 037341f23c..28f980cdb6 100644
--- a/lib/gpu/morse_gpu_memory.cpp
+++ b/lib/gpu/morse_gpu_memory.cpp
@@ -42,16 +42,19 @@ int MOR_GPU_MemoryT::bytes_per_atom(const int max_nbors) const {
 }
 
 template <class numtyp, class acctyp>
-bool MOR_GPU_MemoryT::init(const int ntypes, 
-                           double **host_cutsq, double **host_morse1, 
-                           double **host_r0, double **host_alpha, 
-                           double **host_d0, double **host_offset, 
-                           double *host_special_lj, const int nlocal,
-                           const int nall, const int max_nbors,
-                           const int maxspecial, const double cell_size,
-                           const double gpu_split, FILE *_screen) {
-  this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
-                    _screen,morse_gpu_kernel);
+int MOR_GPU_MemoryT::init(const int ntypes, 
+                          double **host_cutsq, double **host_morse1, 
+                          double **host_r0, double **host_alpha, 
+                          double **host_d0, double **host_offset, 
+                          double *host_special_lj, const int nlocal,
+                          const int nall, const int max_nbors,
+                          const int maxspecial, const double cell_size,
+                          const double gpu_split, FILE *_screen) {
+  int success;
+  success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
+                            _screen,morse_gpu_kernel);
+  if (success!=0)
+    return success;
 
   // If atom type constants fit in shared memory use fast kernel
   int types=ntypes;
@@ -83,7 +86,7 @@ bool MOR_GPU_MemoryT::init(const int ntypes,
 
   _allocated=true;
   this->_max_bytes=mor1.row_bytes()+mor2.row_bytes()+sp_lj.row_bytes();
-  return true;
+  return 0;
 }
 
 template <class numtyp, class acctyp>
diff --git a/lib/gpu/morse_gpu_memory.h b/lib/gpu/morse_gpu_memory.h
index b7480dbb84..a7bbaa1f9f 100644
--- a/lib/gpu/morse_gpu_memory.h
+++ b/lib/gpu/morse_gpu_memory.h
@@ -29,13 +29,20 @@ class MOR_GPU_Memory : public AtomicGPUMemory<numtyp, acctyp> {
   /// Clear any previous data and set up for a new LAMMPS run
   /** \param max_nbors initial number of rows in the neighbor matrix
     * \param cell_size cutoff + skin
-    * \param gpu_split fraction of particles handled by device **/
-  bool init(const int ntypes, double **host_cutsq,
-            double **host_morse1, double **host_r0, double **host_alpha,
-            double **host_d0, double **host_offset, double *host_special_lj,
-            const int nlocal, const int nall, const int max_nbors, 
-            const int maxspecial, const double cell_size, 
-            const double gpu_split, FILE *screen);
+    * \param gpu_split fraction of particles handled by device
+    * 
+    * Returns:
+    * -  0 if successfull
+    * - -1 if fix gpu not found
+    * - -3 if there is an out of memory error
+    * - -4 if the GPU library was not compiled for GPU
+    * - -5 Double precision is not supported on card **/
+  int init(const int ntypes, double **host_cutsq,
+           double **host_morse1, double **host_r0, double **host_alpha,
+           double **host_d0, double **host_offset, double *host_special_lj,
+           const int nlocal, const int nall, const int max_nbors, 
+           const int maxspecial, const double cell_size, 
+           const double gpu_split, FILE *screen);
 
   /// Clear all host and device data
   /** \note This is called at the beginning of the init() routine **/
diff --git a/lib/gpu/pair_gpu_ans.h b/lib/gpu/pair_gpu_ans.h
index ae737fe7c0..ccd1f40b2d 100644
--- a/lib/gpu/pair_gpu_ans.h
+++ b/lib/gpu/pair_gpu_ans.h
@@ -29,7 +29,6 @@ using namespace ucl_opencl;
 
 #else
 
-#include "cudpp.h"
 #include "geryon/nvd_timer.h"
 #include "geryon/nvd_mat.h"
 using namespace ucl_cudadr;
diff --git a/lib/gpu/pair_gpu_device.cpp b/lib/gpu/pair_gpu_device.cpp
index 83d20a51c9..5c68cf73d0 100644
--- a/lib/gpu/pair_gpu_device.cpp
+++ b/lib/gpu/pair_gpu_device.cpp
@@ -130,15 +130,22 @@ bool PairGPUDeviceT::init_device(MPI_Comm world, MPI_Comm replica,
   return true;
 }
 
+  /** Success will be:
+    * -  0 if successfull
+    * - -1 if fix gpu not found
+    * - -3 if there is an out of memory error
+    * - -4 if the GPU library was not compiled for GPU
+    * - -5 Double precision is not supported on card **/
+
 template <class numtyp, class acctyp>
-bool PairGPUDeviceT::init(PairGPUAns<numtyp,acctyp> &ans, const bool charge,
-                          const bool rot, const int nlocal, 
-                          const int host_nlocal, const int nall,
-                          PairGPUNbor *nbor, const int maxspecial,
-                          const int gpu_host, const int max_nbors, 
-                          const double cell_size, const bool pre_cut) {
+int PairGPUDeviceT::init(PairGPUAns<numtyp,acctyp> &ans, const bool charge,
+                         const bool rot, const int nlocal, 
+                         const int host_nlocal, const int nall,
+                         PairGPUNbor *nbor, const int maxspecial,
+                         const int gpu_host, const int max_nbors, 
+                         const double cell_size, const bool pre_cut) {
   if (!_device_init)
-    return false;                          
+    return -1;                          
 
   // Counts of data transfers for timing overhead estimates
   _data_in_estimate=0;
@@ -156,7 +163,7 @@ bool PairGPUDeviceT::init(PairGPUAns<numtyp,acctyp> &ans, const bool charge,
   if (_init_count==0) {
     // Initialize atom and nbor data
     if (!atom.init(nall,charge,rot,*gpu,gpu_nbor,gpu_nbor && maxspecial>0))
-      return false;
+      return -3;
     compile_kernels();
     _data_in_estimate++;
     if (charge)
@@ -168,40 +175,42 @@ bool PairGPUDeviceT::init(PairGPUAns<numtyp,acctyp> &ans, const bool charge,
       _data_in_estimate++;
     if (atom.quat()==false && rot)
       _data_in_estimate++;
-    atom.add_fields(charge,rot,gpu_nbor,gpu_nbor && maxspecial);
+    if (!atom.add_fields(charge,rot,gpu_nbor,gpu_nbor && maxspecial))
+      return -3;
   }
   
   if (!ans.init(ef_nlocal,charge,rot,*gpu))
-    return false;
+    return -3;
 
   if (!nbor->init(&_nbor_shared,ef_nlocal,host_nlocal,max_nbors,maxspecial,
                   *gpu,gpu_nbor,gpu_host,pre_cut))
-    return false;
+    return -3;
   nbor->cell_size(cell_size);
 
   _init_count++;
-  return true;
+  return 0;
 }
 
 template <class numtyp, class acctyp>
-bool PairGPUDeviceT::init(PairGPUAns<numtyp,acctyp> &ans, const int nlocal,
-                          const int nall) {
+int PairGPUDeviceT::init(PairGPUAns<numtyp,acctyp> &ans, const int nlocal,
+                         const int nall) {
   if (!_device_init)
-    return false;                          
+    return -1;                          
 
   if (_init_count==0) {
     // Initialize atom and nbor data
     if (!atom.init(nall,true,false,*gpu,false,false))
-      return false;
+      return -3;
     compile_kernels();
   } else
-    atom.add_fields(true,false,false,false);
+    if (!atom.add_fields(true,false,false,false))
+      return -3;
 
   if (!ans.init(nlocal,true,false,*gpu))
-    return false;
+    return -3;
 
   _init_count++;
-  return true;
+  return 0;
 }
 
 template <class numtyp, class acctyp>
@@ -238,7 +247,10 @@ void PairGPUDeviceT::init_message(FILE *screen, const char *name,
     fprintf(screen,"-------------------------------------");
     fprintf(screen,"-------------------------------------\n");
 
-    for (int i=first_gpu; i<=last_gpu; i++) {
+    int last=last_gpu+1;
+    if (last>gpu->num_devices())
+      last=gpu->num_devices();
+    for (int i=first_gpu; i<last; i++) {
       std::string sname=gpu->name(i)+", "+toa(gpu->cores(i))+" cores, "+fs+
                         toa(gpu->gigabytes(i))+" GB, "+toa(gpu->clock_rate(i))+
                         " GHZ (";
diff --git a/lib/gpu/pair_gpu_device.h b/lib/gpu/pair_gpu_device.h
index 77882d7f3f..6f40ef160e 100644
--- a/lib/gpu/pair_gpu_device.h
+++ b/lib/gpu/pair_gpu_device.h
@@ -56,16 +56,29 @@ class PairGPUDevice {
     * \param max_nbors Initial number of rows in the neighbor matrix
     * \param cell_size cutoff+skin 
     * \param pre_cut True if cutoff test will be performed in separate kernel
-    *                than the force kernel **/
-  bool init(PairGPUAns<numtyp,acctyp> &a, const bool charge, const bool rot,
-            const int nlocal, const int host_nlocal, const int nall,
-            PairGPUNbor *nbor, const int maxspecial, const int gpu_host,
-            const int max_nbors, const double cell_size, const bool pre_cut);
+    *                than the force kernel 
+    * Returns:
+    * -  0 if successfull
+    * - -1 if fix gpu not found
+    * - -3 if there is an out of memory error
+    * - -4 if the GPU library was not compiled for GPU
+    * - -5 Double precision is not supported on card **/
+  int init(PairGPUAns<numtyp,acctyp> &a, const bool charge, const bool rot,
+           const int nlocal, const int host_nlocal, const int nall,
+           PairGPUNbor *nbor, const int maxspecial, const int gpu_host,
+           const int max_nbors, const double cell_size, const bool pre_cut);
 
   /// Initialize the device for Atom storage only
   /** \param nlocal Total number of local particles to allocate memory for
-    * \param nall Total number of local+ghost particles **/
-  bool init(PairGPUAns<numtyp,acctyp> &ans, const int nlocal, const int nall);
+    * \param nall Total number of local+ghost particles
+    *
+    * Returns:
+    * -  0 if successfull
+    * - -1 if fix gpu not found
+    * - -3 if there is an out of memory error
+    * - -4 if the GPU library was not compiled for GPU
+    * - -5 Double precision is not supported on card **/
+  int init(PairGPUAns<numtyp,acctyp> &ans, const int nlocal, const int nall);
 
   /// Output a message for pair_style acceleration with device stats
   void init_message(FILE *screen, const char *name,
diff --git a/lib/gpu/pair_gpu_nbor.cpp b/lib/gpu/pair_gpu_nbor.cpp
index 182541260e..aed1da0a12 100644
--- a/lib/gpu/pair_gpu_nbor.cpp
+++ b/lib/gpu/pair_gpu_nbor.cpp
@@ -75,6 +75,9 @@ bool PairGPUNbor::init(PairGPUNborShared *shared, const int inum,
     success=success && (host_packed.alloc(2*IJ_SIZE,*dev,
                                           UCL_WRITE_OPTIMIZED)==UCL_SUCCESS);
   alloc(success);
+  if (!success)
+    return false;
+    
   if (_use_packing==false)
     _shared->compile_kernels(devi,gpu_nbor);
 
@@ -115,10 +118,14 @@ void PairGPUNbor::alloc(bool &success) {
     success=success && (dev_host_numj.alloc(_max_host,*dev,
                                             UCL_WRITE_ONLY)==UCL_SUCCESS);
     success=success && (host_ilist.alloc(nt,*dev,UCL_NOT_PINNED)==UCL_SUCCESS);
+    if (!success)
+      return;
     for (int i=0; i<nt; i++)
       host_ilist[i]=i;
     success=success && (host_jlist.alloc(_max_host,*dev,
                                          UCL_NOT_PINNED)==UCL_SUCCESS);
+    if (!success)
+      return;
     int *ptr=host_nbor.begin();
     for (int i=0; i<_max_host; i++) {
       host_jlist[i]=ptr;
diff --git a/lib/gpu/pppm_gpu_memory.cpp b/lib/gpu/pppm_gpu_memory.cpp
index 9694149c2e..a8b403cb66 100644
--- a/lib/gpu/pppm_gpu_memory.cpp
+++ b/lib/gpu/pppm_gpu_memory.cpp
@@ -71,10 +71,9 @@ grdtyp * PPPMGPUMemoryT::init(const int nlocal, const int nall, FILE *_screen,
   screen=_screen;
   bool success=true;
 
-  if (!device->init(*ans,nlocal,nall)) {
-    flag=-2;
+  flag=device->init(*ans,nlocal,nall);
+  if (flag!=0)
     return 0;
-  }
   if (sizeof(grdtyp)==sizeof(double) && device->double_precision()==false) {
     flag=-5;
     return 0;
diff --git a/lib/gpu/pppm_l_gpu.cpp b/lib/gpu/pppm_l_gpu.cpp
index 9396fd0bf5..200d2f1685 100644
--- a/lib/gpu/pppm_l_gpu.cpp
+++ b/lib/gpu/pppm_l_gpu.cpp
@@ -58,11 +58,10 @@ grdtyp * pppm_gpu_init(memtyp &pppm, const int nlocal, const int nall,
 
   success=0;
   grdtyp * host_brick=NULL;
-  if (world_me==0) {
+  if (world_me==0)
     host_brick=pppm.init(nlocal,nall,screen,order,nxlo_out,nylo_out,nzlo_out,
                          nxhi_out,nyhi_out,nzhi_out,rho_coeff,vd_brick,
                          slab_volfactor,nx_pppm,ny_pppm,nz_pppm,success);
-  }
 
   pppm.device->world_barrier();
   if (message)
@@ -77,12 +76,12 @@ grdtyp * pppm_gpu_init(memtyp &pppm, const int nlocal, const int nall,
                 last_gpu,i);
       fflush(screen);
     }
-    if (gpu_rank==i && world_me!=0) {
+    if (gpu_rank==i && world_me!=0)
       host_brick=pppm.init(nlocal,nall,screen,order,nxlo_out,nylo_out,
                            nzlo_out,nxhi_out,nyhi_out,nzhi_out,rho_coeff,
                            vd_brick,slab_volfactor,nx_pppm,ny_pppm,nz_pppm,
                            success);
-    }
+
     pppm.device->gpu_barrier();
     if (message) 
       fprintf(screen,"Done.\n");
diff --git a/src/GPU/Install.sh b/src/GPU/Install.sh
index 9541f82782..83c1f61aa1 100644
--- a/src/GPU/Install.sh
+++ b/src/GPU/Install.sh
@@ -67,6 +67,7 @@ if (test $1 = 1) then
   
   cp fix_gpu.cpp ..
   cp fix_gpu.h ..
+  cp gpu_extra.h ..
 
   cp pair_omp_gpu.cpp ..
   cp pair_omp_gpu.h ..
@@ -114,6 +115,7 @@ elif (test $1 = 0) then
   rm ../pair_cg_cmm_coul_msm.h
   rm ../pair_cg_cmm_coul_msm_gpu.h
   rm ../fix_gpu.h
+  rm ../gpu_extra.h
   rm ../pair_omp_gpu.h
   
 fi
diff --git a/src/GPU/fix_gpu.cpp b/src/GPU/fix_gpu.cpp
index 8c7b1837f6..626eb24292 100644
--- a/src/GPU/fix_gpu.cpp
+++ b/src/GPU/fix_gpu.cpp
@@ -24,6 +24,7 @@
 #include "modify.h"
 #include "domain.h"
 #include "universe.h"
+#include "gpu_extra.h"
 
 using namespace LAMMPS_NS;
 
@@ -82,9 +83,11 @@ FixGPU::FixGPU(LAMMPS *lmp, int narg, char **arg) :
     error->all("No OpenMP support compiled in.");
   #endif
 
+  int gpu_flag = 0;
   if (!lmp_init_device(universe->uworld,world,first_gpu,last_gpu,_gpu_mode,
                        _particle_split,nthreads))
-    error->one("Could not find or initialize a specified accelerator device.");
+    gpu_flag = -2;
+  GPU_EXTRA::check_flag(gpu_flag,error,world);
 }
 
 /* ---------------------------------------------------------------------- */
diff --git a/src/GPU/gpu_extra.h b/src/GPU/gpu_extra.h
new file mode 100755
index 0000000000..806757575a
--- /dev/null
+++ b/src/GPU/gpu_extra.h
@@ -0,0 +1,47 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under 
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Mike Brown (ORNL)
+------------------------------------------------------------------------- */
+
+#ifndef LMP_GPU_EXTRA_H
+#define LMP_GPU_EXTRA_H
+
+#include "error.h"
+
+namespace GPU_EXTRA {
+
+  inline void check_flag(int error_flag, LAMMPS_NS::Error *error,
+                         MPI_Comm &world) { 
+    int all_success;
+    MPI_Allreduce(&error_flag, &all_success, 1, MPI_INT, MPI_MIN, world);
+    if (all_success != 0) {
+      if (all_success == -1)
+	error->all("Could not find fix gpu"); 
+      else if (all_success == -2)
+	error->all("Could not find/initialize a specified accelerator device.");
+      else if (all_success == -3)
+	error->all("Out of memory on GPU.");
+      else if (all_success == -4)
+	error->all("GPU library not compiled for this GPU.");
+      else if (all_success == -5)
+	error->all("Double precision is not supported on this GPU.");
+      else
+	error->all("Unknown error in GPU library.");
+    }
+  }
+
+}
+
+#endif
diff --git a/src/GPU/pair_cg_cmm_coul_long_gpu.cpp b/src/GPU/pair_cg_cmm_coul_long_gpu.cpp
index d25f5860cb..b01400dfdc 100644
--- a/src/GPU/pair_cg_cmm_coul_long_gpu.cpp
+++ b/src/GPU/pair_cg_cmm_coul_long_gpu.cpp
@@ -35,6 +35,7 @@
 #include "domain.h"
 #include "string.h"
 #include "kspace.h"
+#include "gpu_extra.h"
 
 #define MIN(a,b) ((a) < (b) ? (a) : (b))
 #define MAX(a,b) ((a) > (b) ? (a) : (b))
@@ -49,14 +50,14 @@
 
 // External functions from cuda library for atom decomposition
 
-bool cmml_gpu_init(const int ntypes, double **cutsq, int **cg_type,
-                   double **host_lj1, double **host_lj2, double **host_lj3,
-                   double **host_lj4, double **offset, double *special_lj,
-                   const int nlocal, const int nall, const int max_nbors,
-                   const int maxspecial, const double cell_size, int &gpu_mode,
-                   FILE *screen, double **host_cut_ljsq, double host_cut_coulsq,
-                   double *host_special_coul, const double qqrd2e,
-                   const double g_ewald);
+int cmml_gpu_init(const int ntypes, double **cutsq, int **cg_type,
+		  double **host_lj1, double **host_lj2, double **host_lj3,
+		  double **host_lj4, double **offset, double *special_lj,
+		  const int nlocal, const int nall, const int max_nbors,
+		  const int maxspecial, const double cell_size, int &gpu_mode,
+		  FILE *screen, double **host_cut_ljsq, double host_cut_coulsq,
+		  double *host_special_coul, const double qqrd2e,
+		  const double g_ewald);
 void cmml_gpu_clear();
 int ** cmml_gpu_compute_n(const int ago, const int inum, const int nall,
 			  double **host_x, int *host_type, double *sublo,
@@ -177,14 +178,13 @@ void PairCGCMMCoulLongGPU::init_style()
   int maxspecial=0;
   if (atom->molecular)
     maxspecial=atom->maxspecial;
-  bool init_ok = cmml_gpu_init(atom->ntypes+1, cutsq, cg_type, lj1, lj2, lj3,
-                               lj4, offset, force->special_lj, atom->nlocal,
-                               atom->nlocal+atom->nghost, 300, maxspecial,
-                               cell_size, gpu_mode, screen, cut_ljsq,
-                               cut_coulsq_global, force->special_coul,
-                               force->qqrd2e, g_ewald);
-  if (!init_ok)
-    error->one("Insufficient memory on accelerator (or no fix gpu).\n"); 
+  int success = cmml_gpu_init(atom->ntypes+1, cutsq, cg_type, lj1, lj2, lj3,
+			      lj4, offset, force->special_lj, atom->nlocal,
+			      atom->nlocal+atom->nghost, 300, maxspecial,
+			      cell_size, gpu_mode, screen, cut_ljsq,
+			      cut_coulsq_global, force->special_coul,
+			      force->qqrd2e, g_ewald);
+  GPU_EXTRA::check_flag(success,error,world);
 
   if (gpu_mode != GPU_NEIGH) {
     int irequest = neighbor->request(this);
diff --git a/src/GPU/pair_cg_cmm_coul_msm_gpu.cpp b/src/GPU/pair_cg_cmm_coul_msm_gpu.cpp
index 8ca92c4227..b3f934946c 100644
--- a/src/GPU/pair_cg_cmm_coul_msm_gpu.cpp
+++ b/src/GPU/pair_cg_cmm_coul_msm_gpu.cpp
@@ -34,6 +34,7 @@
 #include "domain.h"
 #include "string.h"
 #include "kspace.h"
+#include "gpu_extra.h"
 
 #define MIN(a,b) ((a) < (b) ? (a) : (b))
 #define MAX(a,b) ((a) > (b) ? (a) : (b))
@@ -41,14 +42,14 @@ enum {C3=0,C4=1};
 
 // External functions from cuda library for atom decomposition
 
-bool cmmm_gpu_init(const int ntypes, double **cutsq, int **cg_type,
-                   double **host_lj1, double **host_lj2, double **host_lj3,
-                   double **host_lj4, double **offset, double *special_lj,
-                   const int nlocal, const int nall, const int max_nbors,
-                   const int maxspecial, const double cell_size, int &gpu_mode,
-                   FILE *screen, double **host_cut_ljsq, double host_cut_coulsq,
-                   double *host_special_coul, const double qqrd2e,
-                   const int smooth);
+int cmmm_gpu_init(const int ntypes, double **cutsq, int **cg_type,
+		  double **host_lj1, double **host_lj2, double **host_lj3,
+		  double **host_lj4, double **offset, double *special_lj,
+		  const int nlocal, const int nall, const int max_nbors,
+		  const int maxspecial, const double cell_size, int &gpu_mode,
+		  FILE *screen, double **host_cut_ljsq, double host_cut_coulsq,
+		  double *host_special_coul, const double qqrd2e,
+		  const int smooth);
 void cmmm_gpu_clear();
 int ** cmmm_gpu_compute_n(const int ago, const int inum, const int nall,
 			  double **host_x, int *host_type, double *sublo,
@@ -157,14 +158,13 @@ void PairCGCMMCoulMSMGPU::init_style()
   int maxspecial=0;
   if (atom->molecular)
     maxspecial=atom->maxspecial;
-  bool init_ok = cmmm_gpu_init(atom->ntypes+1, cutsq, cg_type, lj1, lj2, lj3,
-                               lj4, offset, force->special_lj, atom->nlocal,
-                               atom->nlocal+atom->nghost, 300, maxspecial,
-                               cell_size, gpu_mode, screen, cut_ljsq,
-                               cut_coulsq_global, force->special_coul,
-                               force->qqrd2e,_smooth);
-  if (!init_ok)
-    error->one("Insufficient memory on accelerator (or no fix gpu).\n"); 
+  int success = cmmm_gpu_init(atom->ntypes+1, cutsq, cg_type, lj1, lj2, lj3,
+			      lj4, offset, force->special_lj, atom->nlocal,
+			      atom->nlocal+atom->nghost, 300, maxspecial,
+			      cell_size, gpu_mode, screen, cut_ljsq,
+			      cut_coulsq_global, force->special_coul,
+			      force->qqrd2e,_smooth);
+  GPU_EXTRA::check_flag(success,error,world);
 
   if (gpu_mode != GPU_NEIGH) {
     int irequest = neighbor->request(this);
diff --git a/src/GPU/pair_cg_cmm_gpu.cpp b/src/GPU/pair_cg_cmm_gpu.cpp
index 158845846f..a252303712 100644
--- a/src/GPU/pair_cg_cmm_gpu.cpp
+++ b/src/GPU/pair_cg_cmm_gpu.cpp
@@ -34,18 +34,19 @@
 #include "update.h"
 #include "domain.h"
 #include "string.h"
+#include "gpu_extra.h"
 
 #define MIN(a,b) ((a) < (b) ? (a) : (b))
 #define MAX(a,b) ((a) > (b) ? (a) : (b))
 
 // External functions from cuda library for atom decomposition
 
-bool cmm_gpu_init(const int ntypes, double **cutsq, int **cg_types, 
-                  double **host_lj1, double **host_lj2, double **host_lj3,
-                  double **host_lj4, double **offset, double *special_lj,
-                  const int nlocal, const int nall, const int max_nbors,
-                  const int maxspecial, const double cell_size, int &gpu_mode,
-                  FILE *screen);
+int cmm_gpu_init(const int ntypes, double **cutsq, int **cg_types, 
+		 double **host_lj1, double **host_lj2, double **host_lj3,
+		 double **host_lj4, double **offset, double *special_lj,
+		 const int nlocal, const int nall, const int max_nbors,
+		 const int maxspecial, const double cell_size, int &gpu_mode,
+		 FILE *screen);
 void cmm_gpu_clear();
 int ** cmm_gpu_compute_n(const int ago, const int inum, const int nall,
 			 double **host_x, int *host_type, double *sublo,
@@ -150,12 +151,11 @@ void PairCGCMMGPU::init_style()
   int maxspecial=0;
   if (atom->molecular)
     maxspecial=atom->maxspecial;
-  bool init_ok = cmm_gpu_init(atom->ntypes+1,cutsq,cg_type,lj1,lj2,lj3,lj4,
-                              offset, force->special_lj, atom->nlocal,
-                              atom->nlocal+atom->nghost, 300, maxspecial,
-                              cell_size, gpu_mode, screen);
-  if (!init_ok)
-    error->one("Insufficient memory on accelerator (or no fix gpu).\n"); 
+  int success = cmm_gpu_init(atom->ntypes+1,cutsq,cg_type,lj1,lj2,lj3,lj4,
+			     offset, force->special_lj, atom->nlocal,
+			     atom->nlocal+atom->nghost, 300, maxspecial,
+			     cell_size, gpu_mode, screen);
+  GPU_EXTRA::check_flag(success,error,world);
 
   if (gpu_mode != GPU_NEIGH) {
     int irequest = neighbor->request(this);
diff --git a/src/GPU/pair_gayberne_gpu.cpp b/src/GPU/pair_gayberne_gpu.cpp
index 66673cedd6..85a20faf7f 100644
--- a/src/GPU/pair_gayberne_gpu.cpp
+++ b/src/GPU/pair_gayberne_gpu.cpp
@@ -35,20 +35,21 @@
 #include "domain.h"
 #include "update.h"
 #include "string.h"
+#include "gpu_extra.h"
 
 #define MIN(a,b) ((a) < (b) ? (a) : (b))
 #define MAX(a,b) ((a) > (b) ? (a) : (b))
 
 // External functions from cuda library for atom decomposition
 
-bool gb_gpu_init(const int ntypes, const double gamma, const double upsilon,
-                 const double mu, double **shape, double **well, double **cutsq,
-                 double **sigma, double **epsilon, double *host_lshape,
-                 int **form, double **host_lj1, double **host_lj2,
-                 double **host_lj3, double **host_lj4, double **offset,
-                 double *special_lj, const int nlocal, const int nall,
-                 const int max_nbors, const double cell_size,
-                 int &gpu_mode, FILE *screen);
+int gb_gpu_init(const int ntypes, const double gamma, const double upsilon,
+		const double mu, double **shape, double **well, double **cutsq,
+		double **sigma, double **epsilon, double *host_lshape,
+		int **form, double **host_lj1, double **host_lj2,
+		double **host_lj3, double **host_lj4, double **offset,
+		double *special_lj, const int nlocal, const int nall,
+		const int max_nbors, const double cell_size,
+		int &gpu_mode, FILE *screen);
 void gb_gpu_clear();
 int ** gb_gpu_compute_n(const int ago, const int inum, const int nall,
 			double **host_x, int *host_type, double *sublo,
@@ -165,13 +166,12 @@ void PairGayBerneGPU::init_style()
 
   double cell_size = sqrt(maxcut) + neighbor->skin;
 
-  bool init_ok = gb_gpu_init(atom->ntypes+1, gamma, upsilon, mu, 
-                             shape, well, cutsq, sigma, epsilon, lshape, form,
-                             lj1, lj2, lj3, lj4, offset, force->special_lj, 
-                             atom->nlocal, atom->nlocal+atom->nghost, 300, 
-                             cell_size, gpu_mode, screen);
-  if (!init_ok)
-    error->one("Insufficient memory on accelerator (or no fix gpu).");
+  int success = gb_gpu_init(atom->ntypes+1, gamma, upsilon, mu, 
+			    shape, well, cutsq, sigma, epsilon, lshape, form,
+			    lj1, lj2, lj3, lj4, offset, force->special_lj, 
+			    atom->nlocal, atom->nlocal+atom->nghost, 300, 
+			    cell_size, gpu_mode, screen);
+  GPU_EXTRA::check_flag(success,error,world);
 
   if (gpu_mode != GPU_NEIGH) {
     int irequest = neighbor->request(this);
diff --git a/src/GPU/pair_lj96_cut_gpu.cpp b/src/GPU/pair_lj96_cut_gpu.cpp
index e00fe0a2be..f83646704a 100644
--- a/src/GPU/pair_lj96_cut_gpu.cpp
+++ b/src/GPU/pair_lj96_cut_gpu.cpp
@@ -34,17 +34,18 @@
 #include "update.h"
 #include "domain.h"
 #include "string.h"
+#include "gpu_extra.h"
 
 #define MIN(a,b) ((a) < (b) ? (a) : (b))
 #define MAX(a,b) ((a) > (b) ? (a) : (b))
 
 // External functions from cuda library for atom decomposition
 
-bool lj96_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
-                   double **host_lj2, double **host_lj3, double **host_lj4, 
-                   double **offset, double *special_lj, const int nlocal, 
-                   const int nall, const int max_nbors, const int maxspecial,
-                   const double cell_size, int &gpu_mode, FILE *screen);
+int lj96_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
+		  double **host_lj2, double **host_lj3, double **host_lj4, 
+		  double **offset, double *special_lj, const int nlocal, 
+		  const int nall, const int max_nbors, const int maxspecial,
+		  const double cell_size, int &gpu_mode, FILE *screen);
 void lj96_gpu_clear();
 int ** lj96_gpu_compute_n(const int ago, const int inum, const int nall,
 			  double **host_x, int *host_type, double *sublo,
@@ -149,12 +150,11 @@ void PairLJ96CutGPU::init_style()
   int maxspecial=0;
   if (atom->molecular)
     maxspecial=atom->maxspecial;
-  bool init_ok = lj96_gpu_init(atom->ntypes+1, cutsq, lj1, lj2, lj3, lj4,
-                               offset, force->special_lj, atom->nlocal,
-                               atom->nlocal+atom->nghost, 300, maxspecial,
-                               cell_size, gpu_mode, screen);
-  if (!init_ok)
-    error->one("Insufficient memory on accelerator (or no fix gpu).\n"); 
+  int success = lj96_gpu_init(atom->ntypes+1, cutsq, lj1, lj2, lj3, lj4,
+			      offset, force->special_lj, atom->nlocal,
+			      atom->nlocal+atom->nghost, 300, maxspecial,
+			      cell_size, gpu_mode, screen);
+  GPU_EXTRA::check_flag(success,error,world);
 
   if (gpu_mode != GPU_NEIGH) {
     int irequest = neighbor->request(this);
diff --git a/src/GPU/pair_lj_charmm_coul_long_gpu.cpp b/src/GPU/pair_lj_charmm_coul_long_gpu.cpp
index 851225d07c..802c21a2e7 100644
--- a/src/GPU/pair_lj_charmm_coul_long_gpu.cpp
+++ b/src/GPU/pair_lj_charmm_coul_long_gpu.cpp
@@ -35,6 +35,7 @@
 #include "domain.h"
 #include "string.h"
 #include "kspace.h"
+#include "gpu_extra.h"
 
 #define MIN(a,b) ((a) < (b) ? (a) : (b))
 #define MAX(a,b) ((a) > (b) ? (a) : (b))
@@ -49,16 +50,16 @@
 
 // External functions from cuda library for atom decomposition
 
-bool crml_gpu_init(const int ntypes, double cut_bothsq, double **host_lj1,
-		   double **host_lj2, double **host_lj3, double **host_lj4, 
-		   double **offset, double *special_lj, const int nlocal, 
-		   const int nall, const int max_nbors, const int maxspecial,
-		   const double cell_size, int &gpu_mode, FILE *screen,
-		   double host_cut_ljsq, double host_cut_coulsq,
-		   double *host_special_coul, const double qqrd2e,
-		   const double g_ewald, const double cut_lj_innersq,
-		   const double denom_lj, double **epsilon, double **sigma,
-		   const bool mix_arithmetic);
+int crml_gpu_init(const int ntypes, double cut_bothsq, double **host_lj1,
+		  double **host_lj2, double **host_lj3, double **host_lj4, 
+		  double **offset, double *special_lj, const int nlocal, 
+		  const int nall, const int max_nbors, const int maxspecial,
+		  const double cell_size, int &gpu_mode, FILE *screen,
+		  double host_cut_ljsq, double host_cut_coulsq,
+		  double *host_special_coul, const double qqrd2e,
+		  const double g_ewald, const double cut_lj_innersq,
+		  const double denom_lj, double **epsilon, double **sigma,
+		  const bool mix_arithmetic);
 void crml_gpu_clear();
 int ** crml_gpu_compute_n(const int ago, const int inum,
 			  const int nall, double **host_x, int *host_type, 
@@ -192,15 +193,14 @@ void PairLJCharmmCoulLongGPU::init_style()
 	arithmetic = false;
     }
 
-  bool init_ok = crml_gpu_init(atom->ntypes+1, cut_bothsq, lj1, lj2, lj3, lj4,
-			       offset, force->special_lj, atom->nlocal,
-			       atom->nlocal+atom->nghost, 300, maxspecial,
-			       cell_size, gpu_mode, screen, cut_ljsq, 
-			       cut_coulsq, force->special_coul, force->qqrd2e,
-			       g_ewald, cut_lj_innersq,denom_lj,epsilon,sigma,
-			       arithmetic);
-  if (!init_ok)
-    error->one("Insufficient memory on accelerator (or no fix gpu).\n"); 
+  int success = crml_gpu_init(atom->ntypes+1, cut_bothsq, lj1, lj2, lj3, lj4,
+			      offset, force->special_lj, atom->nlocal,
+			      atom->nlocal+atom->nghost, 300, maxspecial,
+			      cell_size, gpu_mode, screen, cut_ljsq, 
+			      cut_coulsq, force->special_coul, force->qqrd2e,
+			      g_ewald, cut_lj_innersq,denom_lj,epsilon,sigma,
+			      arithmetic);
+  GPU_EXTRA::check_flag(success,error,world);
 
   if (gpu_mode != GPU_NEIGH) {
     int irequest = neighbor->request(this);
diff --git a/src/GPU/pair_lj_cut_coul_cut_gpu.cpp b/src/GPU/pair_lj_cut_coul_cut_gpu.cpp
index 7001f61f7a..c960081341 100644
--- a/src/GPU/pair_lj_cut_coul_cut_gpu.cpp
+++ b/src/GPU/pair_lj_cut_coul_cut_gpu.cpp
@@ -34,19 +34,20 @@
 #include "update.h"
 #include "domain.h"
 #include "string.h"
+#include "gpu_extra.h"
 
 #define MIN(a,b) ((a) < (b) ? (a) : (b))
 #define MAX(a,b) ((a) > (b) ? (a) : (b))
 
 // External functions from cuda library for atom decomposition
 
-bool ljc_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
-                  double **host_lj2, double **host_lj3, double **host_lj4, 
-                  double **offset, double *special_lj, const int nlocal, 
-                  const int nall, const int max_nbors, const int maxspecial,
-                  const double cell_size, int &gpu_mode, FILE *screen,
-                  double **host_cut_ljsq, double **host_cut_coulsq,
-                  double *host_special_coul, const double qqrd2e);
+int ljc_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
+                 double **host_lj2, double **host_lj3, double **host_lj4, 
+                 double **offset, double *special_lj, const int nlocal, 
+                 const int nall, const int max_nbors, const int maxspecial,
+                 const double cell_size, int &gpu_mode, FILE *screen,
+                 double **host_cut_ljsq, double **host_cut_coulsq,
+                 double *host_special_coul, const double qqrd2e);
 void ljc_gpu_clear();
 int ** ljc_gpu_compute_n(const int ago, const int inum,
 			 const int nall, double **host_x, int *host_type, 
@@ -157,13 +158,12 @@ void PairLJCutCoulCutGPU::init_style()
   int maxspecial=0;
   if (atom->molecular)
     maxspecial=atom->maxspecial;
-  bool init_ok = ljc_gpu_init(atom->ntypes+1, cutsq, lj1, lj2, lj3, lj4,
-                              offset, force->special_lj, atom->nlocal,
-                              atom->nlocal+atom->nghost, 300, maxspecial,
-                              cell_size, gpu_mode, screen, cut_ljsq, cut_coulsq,
-                              force->special_coul, force->qqrd2e);
-  if (!init_ok)
-    error->one("Insufficient memory on accelerator (or no fix gpu).\n"); 
+  int success = ljc_gpu_init(atom->ntypes+1, cutsq, lj1, lj2, lj3, lj4,
+			     offset, force->special_lj, atom->nlocal,
+			     atom->nlocal+atom->nghost, 300, maxspecial,
+			     cell_size, gpu_mode, screen, cut_ljsq, cut_coulsq,
+			     force->special_coul, force->qqrd2e);
+  GPU_EXTRA::check_flag(success,error,world);
 
   if (gpu_mode != GPU_NEIGH) {
     int irequest = neighbor->request(this);
diff --git a/src/GPU/pair_lj_cut_coul_long_gpu.cpp b/src/GPU/pair_lj_cut_coul_long_gpu.cpp
index 656006a3de..ac8f983828 100644
--- a/src/GPU/pair_lj_cut_coul_long_gpu.cpp
+++ b/src/GPU/pair_lj_cut_coul_long_gpu.cpp
@@ -35,6 +35,7 @@
 #include "domain.h"
 #include "string.h"
 #include "kspace.h"
+#include "gpu_extra.h"
 
 #define MIN(a,b) ((a) < (b) ? (a) : (b))
 #define MAX(a,b) ((a) > (b) ? (a) : (b))
@@ -49,14 +50,14 @@
 
 // External functions from cuda library for atom decomposition
 
-bool ljcl_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
-		   double **host_lj2, double **host_lj3, double **host_lj4, 
-		   double **offset, double *special_lj, const int nlocal, 
-		   const int nall, const int max_nbors, const int maxspecial,
-		   const double cell_size, int &gpu_mode, FILE *screen,
-		   double **host_cut_ljsq, double host_cut_coulsq,
-		   double *host_special_coul, const double qqrd2e,
-		   const double g_ewald);
+int ljcl_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
+		  double **host_lj2, double **host_lj3, double **host_lj4, 
+		  double **offset, double *special_lj, const int nlocal, 
+		  const int nall, const int max_nbors, const int maxspecial,
+		  const double cell_size, int &gpu_mode, FILE *screen,
+		  double **host_cut_ljsq, double host_cut_coulsq,
+		  double *host_special_coul, const double qqrd2e,
+		  const double g_ewald);
 void ljcl_gpu_clear();
 int ** ljcl_gpu_compute_n(const int ago, const int inum,
 			  const int nall, double **host_x, int *host_type, 
@@ -180,13 +181,12 @@ void PairLJCutCoulLongGPU::init_style()
   int maxspecial=0;
   if (atom->molecular)
     maxspecial=atom->maxspecial;
-  bool init_ok = ljcl_gpu_init(atom->ntypes+1, cutsq, lj1, lj2, lj3, lj4,
+  int success = ljcl_gpu_init(atom->ntypes+1, cutsq, lj1, lj2, lj3, lj4,
                               offset, force->special_lj, atom->nlocal,
                               atom->nlocal+atom->nghost, 300, maxspecial,
                               cell_size, gpu_mode, screen, cut_ljsq, cut_coulsq,
                               force->special_coul, force->qqrd2e, g_ewald);
-  if (!init_ok)
-    error->one("Insufficient memory on accelerator (or no fix gpu).\n"); 
+  GPU_EXTRA::check_flag(success,error,world);
 
   if (gpu_mode != GPU_NEIGH) {
     int irequest = neighbor->request(this);
diff --git a/src/GPU/pair_lj_cut_gpu.cpp b/src/GPU/pair_lj_cut_gpu.cpp
index 55017144d5..cb84a889e9 100644
--- a/src/GPU/pair_lj_cut_gpu.cpp
+++ b/src/GPU/pair_lj_cut_gpu.cpp
@@ -34,17 +34,18 @@
 #include "update.h"
 #include "domain.h"
 #include "string.h"
+#include "gpu_extra.h"
 
 #define MIN(a,b) ((a) < (b) ? (a) : (b))
 #define MAX(a,b) ((a) > (b) ? (a) : (b))
 
 // External functions from cuda library for atom decomposition
 
-bool ljl_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
-                  double **host_lj2, double **host_lj3, double **host_lj4, 
-                  double **offset, double *special_lj, const int nlocal, 
-                  const int nall, const int max_nbors, const int maxspecial,
-                  const double cell_size, int &gpu_mode, FILE *screen);
+int ljl_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
+		 double **host_lj2, double **host_lj3, double **host_lj4, 
+		 double **offset, double *special_lj, const int nlocal, 
+		 const int nall, const int max_nbors, const int maxspecial,
+		 const double cell_size, int &gpu_mode, FILE *screen);
 void ljl_gpu_clear();
 int ** ljl_gpu_compute_n(const int ago, const int inum,
 			 const int nall, double **host_x, int *host_type, 
@@ -149,12 +150,11 @@ void PairLJCutGPU::init_style()
   int maxspecial=0;
   if (atom->molecular)
     maxspecial=atom->maxspecial;
-  bool init_ok = ljl_gpu_init(atom->ntypes+1, cutsq, lj1, lj2, lj3, lj4,
-                              offset, force->special_lj, atom->nlocal,
-                              atom->nlocal+atom->nghost, 300, maxspecial,
-                              cell_size, gpu_mode, screen);
-  if (!init_ok)
-    error->one("Insufficient memory on accelerator (or no fix gpu).\n"); 
+  int success = ljl_gpu_init(atom->ntypes+1, cutsq, lj1, lj2, lj3, lj4,
+			     offset, force->special_lj, atom->nlocal,
+			     atom->nlocal+atom->nghost, 300, maxspecial,
+			     cell_size, gpu_mode, screen);
+  GPU_EXTRA::check_flag(success,error,world);
 
   if (gpu_mode != GPU_NEIGH) {
     int irequest = neighbor->request(this);
diff --git a/src/GPU/pair_lj_cut_tgpu.cpp b/src/GPU/pair_lj_cut_tgpu.cpp
index 8f4871742c..31a2d95a71 100644
--- a/src/GPU/pair_lj_cut_tgpu.cpp
+++ b/src/GPU/pair_lj_cut_tgpu.cpp
@@ -33,17 +33,18 @@
 #include "update.h"
 #include "domain.h"
 #include "string.h"
+#include "gpu_extra.h"
 
 #define MIN(a,b) ((a) < (b) ? (a) : (b))
 #define MAX(a,b) ((a) > (b) ? (a) : (b))
 
 // External functions from cuda library for atom decomposition
 
-bool ljl_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
-                  double **host_lj2, double **host_lj3, double **host_lj4, 
-                  double **offset, double *special_lj, const int nlocal, 
-                  const int nall, const int max_nbors, const int maxspecial,
-                  const double cell_size, int &gpu_mode, FILE *screen);
+int ljl_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
+		 double **host_lj2, double **host_lj3, double **host_lj4, 
+		 double **offset, double *special_lj, const int nlocal, 
+		 const int nall, const int max_nbors, const int maxspecial,
+		 const double cell_size, int &gpu_mode, FILE *screen);
 void ljl_gpu_clear();
 int ** ljl_gpu_compute_n(const int ago, const int inum,
 			 const int nall, double **host_x, int *host_type, 
@@ -161,12 +162,11 @@ void PairLJCutTGPU::init_style()
   int maxspecial=0;
   if (atom->molecular)
     maxspecial=atom->maxspecial;
-  bool init_ok = ljl_gpu_init(atom->ntypes+1, cutsq, lj1, lj2, lj3, lj4,
+  int success = ljl_gpu_init(atom->ntypes+1, cutsq, lj1, lj2, lj3, lj4,
                               offset, force->special_lj, atom->nlocal,
                               atom->nlocal+atom->nghost, 300, maxspecial,
                               cell_size, gpu_mode, screen);
-  if (!init_ok)
-    error->one("Insufficient memory on accelerator (or no fix gpu).\n"); 
+  GPU_EXTRA::check_flag(success,error,world);
 
   if (gpu_mode != GPU_NEIGH) {
     int irequest = neighbor->request(this);
diff --git a/src/GPU/pair_lj_expand_gpu.cpp b/src/GPU/pair_lj_expand_gpu.cpp
index ec6306546e..a55f558072 100644
--- a/src/GPU/pair_lj_expand_gpu.cpp
+++ b/src/GPU/pair_lj_expand_gpu.cpp
@@ -34,18 +34,19 @@
 #include "update.h"
 #include "domain.h"
 #include "string.h"
+#include "gpu_extra.h"
 
 #define MIN(a,b) ((a) < (b) ? (a) : (b))
 #define MAX(a,b) ((a) > (b) ? (a) : (b))
 
 // External functions from cuda library for atom decomposition
 
-bool lje_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
-                  double **host_lj2, double **host_lj3, double **host_lj4, 
-                  double **offset, double **shift, double *special_lj, 
-		  const int nlocal, const int nall, const int max_nbors,
-		  const int maxspecial, const double cell_size, int &gpu_mode,
-		  FILE *screen);
+int lje_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
+		 double **host_lj2, double **host_lj3, double **host_lj4, 
+		 double **offset, double **shift, double *special_lj, 
+		 const int nlocal, const int nall, const int max_nbors,
+		 const int maxspecial, const double cell_size, int &gpu_mode,
+		 FILE *screen);
 void lje_gpu_clear();
 int ** lje_gpu_compute_n(const int ago, const int inum, const int nall,
 			 double **host_x, int *host_type, double *sublo,
diff --git a/src/GPU/pair_morse_gpu.cpp b/src/GPU/pair_morse_gpu.cpp
index df65203f68..35b1b99eb0 100644
--- a/src/GPU/pair_morse_gpu.cpp
+++ b/src/GPU/pair_morse_gpu.cpp
@@ -34,17 +34,18 @@
 #include "update.h"
 #include "domain.h"
 #include "string.h"
+#include "gpu_extra.h"
 
 #define MIN(a,b) ((a) < (b) ? (a) : (b))
 #define MAX(a,b) ((a) > (b) ? (a) : (b))
 
 // External functions from cuda library for atom decomposition
 
-bool mor_gpu_init(const int ntypes, double **cutsq, double **host_morse1,
-                  double **host_r0, double **host_alpha, double **host_d0, 
-                  double **offset, double *special_lj, const int nlocal, 
-                  const int nall, const int max_nbors, const int maxspecial,
-                  const double cell_size, int &gpu_mode, FILE *screen);
+int mor_gpu_init(const int ntypes, double **cutsq, double **host_morse1,
+		 double **host_r0, double **host_alpha, double **host_d0, 
+		 double **offset, double *special_lj, const int nlocal, 
+		 const int nall, const int max_nbors, const int maxspecial,
+		 const double cell_size, int &gpu_mode, FILE *screen);
 void mor_gpu_clear();
 int ** mor_gpu_compute_n(const int ago, const int inum,
 			 const int nall, double **host_x, int *host_type, 
@@ -146,12 +147,11 @@ void PairMorseGPU::init_style()
   int maxspecial=0;
   if (atom->molecular)
     maxspecial=atom->maxspecial;
-  bool init_ok = mor_gpu_init(atom->ntypes+1, cutsq, morse1, r0, alpha, d0,
-                              offset, force->special_lj, atom->nlocal,
-                              atom->nlocal+atom->nghost, 300, maxspecial,
-                              cell_size, gpu_mode, screen);
-  if (!init_ok)
-    error->one("Insufficient memory on accelerator (or no fix gpu).\n"); 
+  int success = mor_gpu_init(atom->ntypes+1, cutsq, morse1, r0, alpha, d0,
+			     offset, force->special_lj, atom->nlocal,
+			     atom->nlocal+atom->nghost, 300, maxspecial,
+			     cell_size, gpu_mode, screen);
+  GPU_EXTRA::check_flag(success,error,world);
 
   if (gpu_mode != GPU_NEIGH) {
     int irequest = neighbor->request(this);
diff --git a/src/GPU/pppm_gpu.cpp b/src/GPU/pppm_gpu.cpp
index af445e6e8b..d447419ce7 100644
--- a/src/GPU/pppm_gpu.cpp
+++ b/src/GPU/pppm_gpu.cpp
@@ -12,7 +12,7 @@
 ------------------------------------------------------------------------- */
 
 /* ----------------------------------------------------------------------
-   Contributing authors: Roy Pollock (LLNL), Paul Crozier (SNL)
+   Contributing authors: Mike Brown (ORNL)
 ------------------------------------------------------------------------- */
 
 #include "mpi.h"
diff --git a/src/GPU/pppm_gpu_double.cpp b/src/GPU/pppm_gpu_double.cpp
index 33a6bf8d0f..4ddc7e5ad5 100644
--- a/src/GPU/pppm_gpu_double.cpp
+++ b/src/GPU/pppm_gpu_double.cpp
@@ -12,7 +12,7 @@
 ------------------------------------------------------------------------- */
 
 /* ----------------------------------------------------------------------
-   Contributing authors: Roy Pollock (LLNL), Paul Crozier (SNL)
+   Contributing authors: Mike Brown (ORNL)
 ------------------------------------------------------------------------- */
 
 #include "mpi.h"
@@ -34,6 +34,7 @@
 #include "remap_wrap.h"
 #include "memory.h"
 #include "error.h"
+#include "gpu_extra.h"
 
 #define grdtyp double
 
@@ -102,22 +103,7 @@ void PPPMGPUDouble::init()
 			    slab_volfactor, nx_pppm, ny_pppm, nz_pppm, 
 			    success);
 
-  int all_success;
-  MPI_Allreduce(&success, &all_success, 1, MPI_INT, MPI_MIN, world);
-  if (all_success != 0) {
-    if (all_success == -1)
-      error->all("Could not find fix gpu"); 
-    else if (all_success == -2)
-      error->all("At least one node could not find specified GPU.");
-    else if (all_success == -3)
-      error->all("Out of memory on GPU.");
-    else if (all_success == -4)
-      error->all("GPU library not compiled for this GPU.");
-    else if (all_success == -5)
-      error->all("Double precision is not supported on this GPU.");
-    else
-      error->all("Unknown error in GPU library.");
-  }
+  GPU_EXTRA::check_flag(success,error,world);
 
   density_brick =
     create_3d_offset(nzlo_out,nzhi_out,nylo_out,nyhi_out,
diff --git a/src/GPU/pppm_gpu_single.cpp b/src/GPU/pppm_gpu_single.cpp
index b57ec0abbd..cee6b438bf 100644
--- a/src/GPU/pppm_gpu_single.cpp
+++ b/src/GPU/pppm_gpu_single.cpp
@@ -12,7 +12,7 @@
 ------------------------------------------------------------------------- */
 
 /* ----------------------------------------------------------------------
-   Contributing authors: Roy Pollock (LLNL), Paul Crozier (SNL)
+   Contributing authors: Mike Brown (ORNL)
 ------------------------------------------------------------------------- */
 
 #include "mpi.h"
@@ -34,6 +34,7 @@
 #include "remap_wrap.h"
 #include "memory.h"
 #include "error.h"
+#include "gpu_extra.h"
 
 #define grdtyp float
 
@@ -101,22 +102,7 @@ void PPPMGPUSingle::init()
 			    nyhi_out, nzhi_out, rho_coeff, &data, 
 			    slab_volfactor,nx_pppm,ny_pppm,nz_pppm,success);
 
-  int all_success;
-  MPI_Allreduce(&success, &all_success, 1, MPI_INT, MPI_MIN, world);
-  if (all_success != 0) {
-    if (all_success == -1)
-      error->all("Could not find fix gpu"); 
-    else if (all_success == -2)
-      error->all("At least one node could not find specified GPU.");
-    else if (all_success == -3)
-      error->all("Out of memory on GPU.");
-    else if (all_success == -4)
-      error->all("GPU library not compiled for this GPU.");
-    else if (all_success == -5)
-      error->all("Double precision is not supported on this GPU.");
-    else
-      error->all("Unknown error in GPU library.");
-  }
+  GPU_EXTRA::check_flag(success,error,world);
 
   density_brick =
     create_3d_offset(nzlo_out,nzhi_out,nylo_out,nyhi_out,