From 8f2257ecbf1235c6a6bd7f027beadfbf9d1d2e45 Mon Sep 17 00:00:00 2001 From: sjplimp Date: Wed, 12 Jan 2011 15:24:04 +0000 Subject: [PATCH] git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@5537 f3b2605a-c512-4ea7-a41b-209d697bcdaa --- lib/gpu/Makefile.fermi | 2 +- lib/gpu/Makefile.lens | 4 +- lib/gpu/Makefile.lincoln | 2 +- lib/gpu/Makefile.linux | 2 +- lib/gpu/Makefile.linux_opencl | 2 +- lib/gpu/Makefile.longhorn | 2 +- lib/gpu/Makefile.mac | 2 +- lib/gpu/Makefile.mac_opencl | 2 +- lib/gpu/Nvidia.makefile | 16 ++++++- lib/gpu/Opencl.makefile | 11 +++++ lib/gpu/cmm_cut_gpu.cpp | 8 ++-- lib/gpu/cmm_cut_gpu_kernel.cu | 20 ++++----- lib/gpu/cmmc_long_gpu.cpp | 8 ++-- lib/gpu/cmmc_long_gpu_kernel.cu | 24 +++++------ lib/gpu/gb_gpu.cpp | 8 ++-- lib/gpu/gb_gpu_kernel.cu | 16 +++---- lib/gpu/gb_gpu_kernel_lj.cu | 30 ++++++------- lib/gpu/gb_gpu_memory.cpp | 23 +++++----- lib/gpu/geryon/nvc_device.h | 6 +-- lib/gpu/geryon/nvc_macros.h | 21 +++++++++- lib/gpu/geryon/nvd_device.h | 6 +-- lib/gpu/geryon/nvd_macros.h | 19 ++++++++- lib/gpu/geryon/nvd_memory.h | 6 +-- lib/gpu/geryon/nvd_timer.h | 6 +-- lib/gpu/geryon/ocl_device.h | 6 +-- lib/gpu/geryon/ocl_kernel.h | 8 ++-- lib/gpu/geryon/ocl_timer.h | 4 +- lib/gpu/lj96_cut_gpu.cpp | 8 ++-- lib/gpu/lj96_cut_gpu_kernel.cu | 20 ++++----- lib/gpu/lj_cut_gpu.cpp | 8 ++-- lib/gpu/lj_cut_gpu_kernel.cu | 20 ++++----- lib/gpu/ljc_cut_gpu.cpp | 8 ++-- lib/gpu/ljc_cut_gpu_kernel.cu | 24 +++++------ lib/gpu/ljcl_cut_gpu.cpp | 8 ++-- lib/gpu/ljcl_cut_gpu_kernel.cu | 24 +++++------ lib/gpu/pair_gpu_atom.cpp | 74 ++++++++++++++++++--------------- lib/gpu/pair_gpu_atom.h | 25 ++++++++--- lib/gpu/pair_gpu_balance.h | 12 +++--- lib/gpu/pair_gpu_device.cpp | 66 +++++++++++++++++------------ lib/gpu/pair_gpu_device.h | 33 +++++++++++---- lib/gpu/pair_gpu_precision.h | 1 + 41 files changed, 351 insertions(+), 244 deletions(-) diff --git a/lib/gpu/Makefile.fermi b/lib/gpu/Makefile.fermi index d292bcfc30..d830c8924c 100644 --- a/lib/gpu/Makefile.fermi +++ b/lib/gpu/Makefile.fermi @@ -30,7 +30,7 @@ CUDR_CPP = mpic++ -DMPI_GERYON -I$(CUDA_HOME)/include CUDR_OPTS = -O3 -ffast-math -funroll-loops -DMPI_GERYON BIN_DIR = ./ -OBJ_DIR = ./obj +OBJ_DIR = ./ LIB_DIR = ./ AR = ar BSH = /bin/sh diff --git a/lib/gpu/Makefile.lens b/lib/gpu/Makefile.lens index ceec99df7f..3b6301277f 100644 --- a/lib/gpu/Makefile.lens +++ b/lib/gpu/Makefile.lens @@ -26,11 +26,11 @@ CUDA_INCLUDE = -I$(CUDA_HOME)/include CUDA_LIB = -L$(CUDA_HOME)/lib64 CUDA_OPTS = -DUNIX -O3 -Xptxas -v --use_fast_math -CUDR_CPP = mpic++ -DMPI_GERYON +CUDR_CPP = mpic++ -DMPI_GERYON -openmp CUDR_OPTS = -O2 -xSSE2 -ip -use-intel-optimized-headers -fno-alias BIN_DIR = ./ -OBJ_DIR = ./obj +OBJ_DIR = ./ LIB_DIR = ./ AR = ar BSH = /bin/sh diff --git a/lib/gpu/Makefile.lincoln b/lib/gpu/Makefile.lincoln index c181fb08fb..97a7901811 100644 --- a/lib/gpu/Makefile.lincoln +++ b/lib/gpu/Makefile.lincoln @@ -28,7 +28,7 @@ CUDR_CPP = mpic++ -DMPI_GERYON CUDR_OPTS = -O3 -DMPI_GERYON -ffast-math -funroll-loops BIN_DIR = ./ -OBJ_DIR = ./obj +OBJ_DIR = ./ LIB_DIR = ./ AR = ar diff --git a/lib/gpu/Makefile.linux b/lib/gpu/Makefile.linux index 43ee31bdf9..c0001a54ab 100644 --- a/lib/gpu/Makefile.linux +++ b/lib/gpu/Makefile.linux @@ -30,7 +30,7 @@ CUDR_CPP = mpic++ -DMPI_GERYON -DMPICH_IGNORE_CXX_SEEK CUDR_OPTS = -O2 # -xHost -no-prec-div -ansi-alias BIN_DIR = ./ -OBJ_DIR = ./obj +OBJ_DIR = ./ LIB_DIR = ./ AR = ar BSH = /bin/sh diff --git a/lib/gpu/Makefile.linux_opencl b/lib/gpu/Makefile.linux_opencl index 44b5d5aa2d..69522298c5 100644 
--- a/lib/gpu/Makefile.linux_opencl +++ b/lib/gpu/Makefile.linux_opencl @@ -22,7 +22,7 @@ OCL_LINK = -lOpenCL OCL_PREC = -D_SINGLE_SINGLE BIN_DIR = ./ -OBJ_DIR = ./ocl_obj +OBJ_DIR = ./ LIB_DIR = ./ AR = ar BSH = /bin/sh diff --git a/lib/gpu/Makefile.longhorn b/lib/gpu/Makefile.longhorn index 33a02562cf..ba921f0f68 100644 --- a/lib/gpu/Makefile.longhorn +++ b/lib/gpu/Makefile.longhorn @@ -27,7 +27,7 @@ CUDR_CPP = mpicxx -DMPI_GERYON -DMPICH_IGNORE_CXX_SEEK CUDR_OPTS = -O2 # -xHost -no-prec-div -ansi-alias BIN_DIR = ./ -OBJ_DIR = ./obj +OBJ_DIR = ./ LIB_DIR = ./ AR = ar diff --git a/lib/gpu/Makefile.mac b/lib/gpu/Makefile.mac index f16fe197bc..f061a1a68a 100644 --- a/lib/gpu/Makefile.mac +++ b/lib/gpu/Makefile.mac @@ -30,7 +30,7 @@ CUDR_CPP = mpic++ CUDR_OPTS = -O2 -m32 -g BIN_DIR = ./ -OBJ_DIR = ./obj +OBJ_DIR = ./ LIB_DIR = ./ AR = ar BSH = /bin/sh diff --git a/lib/gpu/Makefile.mac_opencl b/lib/gpu/Makefile.mac_opencl index dae41dd3ad..53d6d466e2 100644 --- a/lib/gpu/Makefile.mac_opencl +++ b/lib/gpu/Makefile.mac_opencl @@ -22,7 +22,7 @@ OCL_LINK = -framework OpenCL OCL_PREC = -D_SINGLE_SINGLE BIN_DIR = ./ -OBJ_DIR = ./ocl_obj +OBJ_DIR = ./ LIB_DIR = ./ AR = ar BSH = /bin/sh diff --git a/lib/gpu/Nvidia.makefile b/lib/gpu/Nvidia.makefile index f7de72bd25..adf281e156 100644 --- a/lib/gpu/Nvidia.makefile +++ b/lib/gpu/Nvidia.makefile @@ -47,6 +47,7 @@ OBJS = $(OBJ_DIR)/pair_gpu_atom.o $(OBJ_DIR)/pair_gpu_nbor.o \ $(OBJ_DIR)/lj96_cut_gpu_memory.o $(OBJ_DIR)/lj96_cut_gpu.o \ $(OBJ_DIR)/ljc_cut_gpu_memory.o $(OBJ_DIR)/ljc_cut_gpu.o \ $(OBJ_DIR)/ljcl_cut_gpu_memory.o $(OBJ_DIR)/ljcl_cut_gpu.o \ + $(OBJ_DIR)/crml_gpu_memory.o $(OBJ_DIR)/crml_gpu.o \ $(OBJ_DIR)/cmm_cut_gpu_memory.o $(OBJ_DIR)/cmm_cut_gpu.o \ $(OBJ_DIR)/cmmc_long_gpu_memory.o $(OBJ_DIR)/cmmc_long_gpu.o \ $(CUDPP) @@ -59,8 +60,9 @@ PTXS = $(OBJ_DIR)/pair_gpu_atom_kernel.ptx $(OBJ_DIR)/pair_gpu_atom_ptx.h \ $(OBJ_DIR)/lj96_cut_gpu_kernel.ptx $(OBJ_DIR)/lj96_cut_gpu_ptx.h \ $(OBJ_DIR)/ljc_cut_gpu_kernel.ptx $(OBJ_DIR)/ljc_cut_gpu_ptx.h \ $(OBJ_DIR)/ljcl_cut_gpu_kernel.ptx $(OBJ_DIR)/ljcl_cut_gpu_ptx.h \ + $(OBJ_DIR)/crml_cut_gpu_kernel.ptx $(OBJ_DIR)/crml_cut_gpu_ptx.h \ $(OBJ_DIR)/cmm_cut_gpu_kernel.ptx $(OBJ_DIR)/cmm_cut_gpu_ptx.h \ - $(OBJ_DIR)/cmmc_long_gpu_kernel.ptx $(OBJ_DIR)/cmmc_long_gpu_ptx.h + $(OBJ_DIR)/cmmc_long_gpu_kernel.ptx $(OBJ_DIR)/cmmc_long_gpu_ptx.h all: $(GPU_LIB) $(EXECS) @@ -169,6 +171,18 @@ $(OBJ_DIR)/ljcl_cut_gpu_memory.o: $(ALL_H) ljcl_cut_gpu_memory.h ljcl_cut_gpu_me $(OBJ_DIR)/ljcl_cut_gpu.o: $(ALL_H) ljcl_cut_gpu_memory.h ljcl_cut_gpu.cpp $(CUDR) -o $@ -c ljcl_cut_gpu.cpp -I$(OBJ_DIR) +$(OBJ_DIR)/crml_gpu_kernel.ptx: crml_gpu_kernel.cu pair_gpu_precision.h + $(CUDA) --ptx -DNV_KERNEL -o $@ crml_gpu_kernel.cu + +$(OBJ_DIR)/crml_gpu_ptx.h: $(OBJ_DIR)/crml_gpu_kernel.ptx $(OBJ_DIR)/crml_gpu_kernel.ptx + $(BSH) ./geryon/file_to_cstr.sh $(OBJ_DIR)/crml_gpu_kernel.ptx $(OBJ_DIR)/crml_gpu_ptx.h + +$(OBJ_DIR)/crml_gpu_memory.o: $(ALL_H) crml_gpu_memory.h crml_gpu_memory.cpp $(OBJ_DIR)/crml_gpu_ptx.h $(OBJ_DIR)/charge_gpu_memory.o + $(CUDR) -o $@ -c crml_gpu_memory.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/crml_gpu.o: $(ALL_H) crml_gpu_memory.h crml_gpu.cpp + $(CUDR) -o $@ -c crml_gpu.cpp -I$(OBJ_DIR) + $(OBJ_DIR)/lj96_cut_gpu_kernel.ptx: lj96_cut_gpu_kernel.cu pair_gpu_precision.h $(CUDA) --ptx -DNV_KERNEL -o $@ lj96_cut_gpu_kernel.cu diff --git a/lib/gpu/Opencl.makefile b/lib/gpu/Opencl.makefile index 4da8ce5f12..ac7aecc2ee 100644 --- a/lib/gpu/Opencl.makefile +++ b/lib/gpu/Opencl.makefile @@ -37,12 +37,14 @@ OBJS = 
$(OBJ_DIR)/pair_gpu_atom.o $(OBJ_DIR)/pair_gpu_nbor.o \ $(OBJ_DIR)/lj96_cut_gpu_memory.o $(OBJ_DIR)/lj96_cut_gpu.o \ $(OBJ_DIR)/ljc_cut_gpu_memory.o $(OBJ_DIR)/ljc_cut_gpu.o \ $(OBJ_DIR)/ljcl_cut_gpu_memory.o $(OBJ_DIR)/ljcl_cut_gpu.o \ + $(OBJ_DIR)/crml_gpu_memory.o $(OBJ_DIR)/crml_gpu.o \ $(OBJ_DIR)/cmm_cut_gpu_memory.o $(OBJ_DIR)/cmm_cut_gpu.o \ $(OBJ_DIR)/cmmc_long_gpu_memory.o $(OBJ_DIR)/cmmc_long_gpu.o KERS = $(OBJ_DIR)/pair_gpu_atom_cl.h $(OBJ_DIR)/pair_gpu_nbor_cl.h \ $(OBJ_DIR)/gb_gpu_nbor_cl.h $(OBJ_DIR)/gb_gpu_cl.h \ $(OBJ_DIR)/lj_cut_gpu_cl.h $(OBJ_DIR)/lj96_cut_gpu_cl.h \ $(OBJ_DIR)/ljc_cut_gpu_cl.h $(OBJ_DIR)/ljcl_cut_gpu_cl.h \ + $(OBJ_DIR)/crml_gpu_cl.h \ $(OBJ_DIR)/cmm_cut_gpu_cl.h $(OBJ_DIR)/cmmc_long_gpu_cl.h OCL_EXECS = $(BIN_DIR)/ocl_get_devices @@ -112,6 +114,15 @@ $(OBJ_DIR)/ljcl_cut_gpu_memory.o: $(ALL_H) ljcl_cut_gpu_memory.h ljcl_cut_gpu_me $(OBJ_DIR)/ljcl_cut_gpu.o: $(ALL_H) ljcl_cut_gpu_memory.h ljcl_cut_gpu.cpp $(OCL) -o $@ -c ljcl_cut_gpu.cpp -I$(OBJ_DIR) +$(OBJ_DIR)/crml_gpu_cl.h: crml_gpu_kernel.cu + $(BSH) ./geryon/file_to_cstr.sh crml_gpu_kernel.cu $(OBJ_DIR)/crml_gpu_cl.h; + +$(OBJ_DIR)/crml_gpu_memory.o: $(ALL_H) crml_gpu_memory.h crml_gpu_memory.cpp $(OBJ_DIR)/crml_gpu_cl.h $(OBJ_DIR)/pair_gpu_nbor_cl.h $(OBJ_DIR)/crml_gpu_cl.h $(OBJ_DIR)/charge_gpu_memory.o + $(OCL) -o $@ -c crml_gpu_memory.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/crml_gpu.o: $(ALL_H) crml_gpu_memory.h crml_gpu.cpp + $(OCL) -o $@ -c crml_gpu.cpp -I$(OBJ_DIR) + $(OBJ_DIR)/lj96_cut_gpu_cl.h: lj96_cut_gpu_kernel.cu $(BSH) ./geryon/file_to_cstr.sh lj96_cut_gpu_kernel.cu $(OBJ_DIR)/lj96_cut_gpu_cl.h; diff --git a/lib/gpu/cmm_cut_gpu.cpp b/lib/gpu/cmm_cut_gpu.cpp index b09d713f91..53976ff7e8 100644 --- a/lib/gpu/cmm_cut_gpu.cpp +++ b/lib/gpu/cmm_cut_gpu.cpp @@ -46,7 +46,7 @@ bool cmm_gpu_init(const int ntypes, double **cutsq, int **cg_types, CMMMF.device->init_message(screen,"cg/cmm",first_gpu,last_gpu); bool message=false; - if (world_me==0 && screen) + if (CMMMF.device->replica_me()==0 && screen) message=true; if (message) { @@ -62,14 +62,14 @@ bool cmm_gpu_init(const int ntypes, double **cutsq, int **cg_types, return false; } - MPI_Barrier(MPI_COMM_WORLD); + CMMMF.device->world_barrier(); if (message) fprintf(screen,"Done.\n"); for (int i=0; igpu_comm); + CMMMF.device->gpu_barrier(); if (message) fprintf(screen,"Done.\n"); } diff --git a/lib/gpu/cmm_cut_gpu_kernel.cu b/lib/gpu/cmm_cut_gpu_kernel.cu index 2288cf1df9..efc6dbbd6a 100644 --- a/lib/gpu/cmm_cut_gpu_kernel.cu +++ b/lib/gpu/cmm_cut_gpu_kernel.cu @@ -91,14 +91,14 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1, if (iiinit_message(screen,"cg/cmm/coul/long",first_gpu,last_gpu); bool message=false; - if (world_me==0 && screen) + if (CMMLMF.device->replica_me()==0 && screen) message=true; if (message) { @@ -66,14 +66,14 @@ bool cmml_gpu_init(const int ntypes, double **cutsq, int **cg_type, return false; } - MPI_Barrier(MPI_COMM_WORLD); + CMMLMF.device->world_barrier(); if (message) fprintf(screen,"Done.\n"); for (int i=0; igpu_comm); + CMMLMF.device->gpu_barrier(); if (message) fprintf(screen,"Done.\n"); } diff --git a/lib/gpu/cmmc_long_gpu_kernel.cu b/lib/gpu/cmmc_long_gpu_kernel.cu index 23debb6b53..a43d73ffcf 100644 --- a/lib/gpu/cmmc_long_gpu_kernel.cu +++ b/lib/gpu/cmmc_long_gpu_kernel.cu @@ -114,15 +114,15 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1, sp_lj[7]=sp_lj_in[7]; if (iiinit_message(screen,"gayberne",first_gpu,last_gpu); bool message=false; - if (world_me==0 && screen) + if 
(GBMF.device->replica_me()==0 && screen) message=true; if (message) { @@ -86,14 +86,14 @@ bool gb_gpu_init(const int ntypes, const double gamma, return false; } - MPI_Barrier(MPI_COMM_WORLD); + GBMF.device->world_barrier(); if (message) fprintf(screen,"Done.\n"); for (int i=0; igpu_comm); + GBMF.device->gpu_barrier(); if (message) fprintf(screen,"Done.\n"); } diff --git a/lib/gpu/gb_gpu_kernel.cu b/lib/gpu/gb_gpu_kernel.cu index 41174660d3..347e6ede35 100644 --- a/lib/gpu/gb_gpu_kernel.cu +++ b/lib/gpu/gb_gpu_kernel.cu @@ -105,18 +105,18 @@ __kernel void kernel_gayberne(__global numtyp4* x_,__global numtyp4 *q, if (iicast_time(); - MPI_Reduce(single,times,6,MPI_DOUBLE,MPI_SUM,0,MPI_COMM_WORLD); + MPI_Reduce(single,times,6,MPI_DOUBLE,MPI_SUM,0,device->replica()); double avg_split=hd_balancer.all_avg_split(); _max_bytes+=dev_error.row_bytes()+lj1.row_bytes()+lj3.row_bytes()+ @@ -230,12 +230,13 @@ void GB_GPU_MemoryT::clear() { shape.row_bytes()+well.row_bytes()+lshape.row_bytes()+ gamma_upsilon_mu.row_bytes(); double mpi_max_bytes; - MPI_Reduce(&_max_bytes,&mpi_max_bytes,1,MPI_DOUBLE,MPI_MAX,0,MPI_COMM_WORLD); + MPI_Reduce(&_max_bytes,&mpi_max_bytes,1,MPI_DOUBLE,MPI_MAX,0, + device->replica()); double max_mb=mpi_max_bytes/(1024*1024); - if (device->world_me()==0) + if (device->replica_me()==0) if (screen && times[3]>0.0) { - int world_size=device->world_size(); + int replica_size=device->replica_size(); fprintf(screen,"\n\n-------------------------------------"); fprintf(screen,"--------------------------------\n"); @@ -244,15 +245,15 @@ void GB_GPU_MemoryT::clear() { fprintf(screen,"--------------------------------\n"); if (device->procs_per_gpu()==1) { - fprintf(screen,"Data Transfer: %.4f s.\n",times[0]/world_size); - fprintf(screen,"Data Cast/Pack: %.4f s.\n",times[5]/world_size); - fprintf(screen,"Neighbor copy: %.4f s.\n",times[1]/world_size); + fprintf(screen,"Data Transfer: %.4f s.\n",times[0]/replica_size); + fprintf(screen,"Data Cast/Pack: %.4f s.\n",times[5]/replica_size); + fprintf(screen,"Neighbor copy: %.4f s.\n",times[1]/replica_size); if (nbor->gpu_nbor()) - fprintf(screen,"Neighbor build: %.4f s.\n",times[2]/world_size); + fprintf(screen,"Neighbor build: %.4f s.\n",times[2]/replica_size); else - fprintf(screen,"Neighbor unpack: %.4f s.\n",times[2]/world_size); - fprintf(screen,"Force calc: %.4f s.\n",times[3]/world_size); - fprintf(screen,"LJ calc: %.4f s.\n",times[4]/world_size); + fprintf(screen,"Neighbor unpack: %.4f s.\n",times[2]/replica_size); + fprintf(screen,"Force calc: %.4f s.\n",times[3]/replica_size); + fprintf(screen,"LJ calc: %.4f s.\n",times[4]/replica_size); } fprintf(screen,"Average split: %.4f.\n",avg_split); fprintf(screen,"Max Mem / Proc: %.2f MB.\n",max_mb); diff --git a/lib/gpu/geryon/nvc_device.h b/lib/gpu/geryon/nvc_device.h index e5c2d801d0..ed445716f6 100644 --- a/lib/gpu/geryon/nvc_device.h +++ b/lib/gpu/geryon/nvc_device.h @@ -13,7 +13,7 @@ copyright : (C) 2009 by W. Michael Brown email : brownw@ornl.gov ***************************************************************************/ - + /* ----------------------------------------------------------------------- Copyright (2009) Sandia Corporation. Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, the U.S. 
Government retains @@ -95,7 +95,7 @@ class UCL_Device { /** \note You cannot delete the default stream **/ inline void pop_command_queue() { if (_cq.size()<2) return; - CUDA_SAFE_CALL_NS(cudaStreamDestroy(_cq.back())); + CUDA_DESTRUCT_CALL_NS(cudaStreamDestroy(_cq.back())); _cq.pop_back(); } @@ -290,7 +290,7 @@ inline void UCL_Device::print_all(std::ostream &out) { else out << "Unknown\n"; #endif - #if CUDART_VERSION >= 3000 + #if CUDART_VERSION >= 3010 out << " Concurrent kernel execution: "; if (_properties[i].concurrentKernels) out << "Yes\n"; diff --git a/lib/gpu/geryon/nvc_macros.h b/lib/gpu/geryon/nvc_macros.h index b9a4e8d6f3..916d268ade 100644 --- a/lib/gpu/geryon/nvc_macros.h +++ b/lib/gpu/geryon/nvc_macros.h @@ -6,7 +6,7 @@ #undef _GLIBCXX_ATOMIC_BUILTINS #endif // _GLIBCXX_ATOMIC_BUILTINS #endif // __APPLE__ - + #include #include #include @@ -18,6 +18,11 @@ #define NVC_GERYON_EXIT assert(0==1) #endif +#ifdef UCL_DEBUG +#define UCL_SYNC_DEBUG +#define UCL_DESTRUCT_CHECK +#endif + #ifndef UCL_NO_API_CHECK #define CUDA_SAFE_CALL_NS( call) do { \ @@ -32,7 +37,7 @@ #define CUDA_SAFE_CALL( call) do { \ CUDA_SAFE_CALL_NS( call); \ - cudaError err=cudaThreadSynchronize(); \ + cudaError err=cudaThreadSynchronize(); \ if( cudaSuccess != err) { \ fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n", \ __FILE__, __LINE__, cudaGetErrorString( err) ); \ @@ -53,5 +58,17 @@ #endif +#ifdef UCL_DESTRUCT_CHECK + +#define CUDA_DESTRUCT_CALL( call) CUDA_SAFE_CALL( call) +#define CUDA_DESTRUCT_CALL_NS( call) CUDA_SAFE_CALL_NS( call) + +#else + +#define CUDA_DESTRUCT_CALL( call) call +#define CUDA_DESTRUCT_CALL_NS( call) call + +#endif + #endif diff --git a/lib/gpu/geryon/nvd_device.h b/lib/gpu/geryon/nvd_device.h index b407c1ede3..6b70964ba1 100644 --- a/lib/gpu/geryon/nvd_device.h +++ b/lib/gpu/geryon/nvd_device.h @@ -20,7 +20,7 @@ certain rights in this software. This software is distributed under the Simplified BSD License. 
----------------------------------------------------------------------- */ - + #ifndef NVD_DEVICE #define NVD_DEVICE @@ -233,7 +233,7 @@ inline UCL_Device::UCL_Device() { &_properties.back().canMapHostMemory, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev)); #endif - #if CUDA_VERSION >= 3000 + #if CUDA_VERSION >= 3010 CU_SAFE_CALL_NS(cuDeviceGetAttribute( &_properties.back().concurrentKernels, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, dev)); @@ -339,7 +339,7 @@ inline void UCL_Device::print_all(std::ostream &out) { else out << "No\n"; #endif - #if CUDA_VERSION >= 3000 + #if CUDA_VERSION >= 3010 out << " Concurrent kernel execution: "; if (_properties[i].concurrentKernels) out << "Yes\n"; diff --git a/lib/gpu/geryon/nvd_macros.h b/lib/gpu/geryon/nvd_macros.h index 8cedf5c212..1fa1ce9cf8 100644 --- a/lib/gpu/geryon/nvd_macros.h +++ b/lib/gpu/geryon/nvd_macros.h @@ -1,6 +1,6 @@ #ifndef NVD_MACROS_H #define NVD_MACROS_H - + #include #include #include @@ -18,6 +18,11 @@ #define NVD_GERYON_EXIT assert(0==1) #endif +#ifdef UCL_DEBUG +#define UCL_SYNC_DEBUG +#define UCL_DESTRUCT_CHECK +#endif + #ifndef UCL_NO_API_CHECK #define CU_SAFE_CALL_NS( call ) do { \ @@ -53,5 +58,17 @@ #endif +#ifdef UCL_DESTRUCT_CHECK + +#define CU_DESTRUCT_CALL( call) CU_SAFE_CALL( call) +#define CU_DESTRUCT_CALL_NS( call) CU_SAFE_CALL_NS( call) + +#else + +#define CU_DESTRUCT_CALL( call) call +#define CU_DESTRUCT_CALL_NS( call) call + +#endif + #endif diff --git a/lib/gpu/geryon/nvd_memory.h b/lib/gpu/geryon/nvd_memory.h index 41d656b8cf..2023e8586f 100644 --- a/lib/gpu/geryon/nvd_memory.h +++ b/lib/gpu/geryon/nvd_memory.h @@ -20,7 +20,7 @@ certain rights in this software. This software is distributed under the Simplified BSD License. ----------------------------------------------------------------------- */ - + #ifndef NVD_MEMORY_H #define NVD_MEMORY_H @@ -78,7 +78,7 @@ inline int _host_alloc(mat_type &mat, UCL_Device &dev, const size_t n, template inline void _host_free(mat_type &mat, const enum UCL_MEMOPT kind) { if (kind!=UCL_NOT_PINNED) - CU_SAFE_CALL(cuMemFreeHost(mat.begin())); + CU_DESTRUCT_CALL(cuMemFreeHost(mat.begin())); else free(mat.begin()); } @@ -134,7 +134,7 @@ inline int _device_alloc(mat_type &mat, UCL_Device &d, const size_t rows, template inline void _device_free(mat_type &mat) { - CU_SAFE_CALL(cuMemFree(mat.cbegin())); + CU_DESTRUCT_CALL(cuMemFree(mat.cbegin())); } inline void _device_view(CUdeviceptr *ptr, CUdeviceptr &in) { diff --git a/lib/gpu/geryon/nvd_timer.h b/lib/gpu/geryon/nvd_timer.h index e068e53d8f..59001c03fd 100644 --- a/lib/gpu/geryon/nvd_timer.h +++ b/lib/gpu/geryon/nvd_timer.h @@ -20,7 +20,7 @@ certain rights in this software. This software is distributed under the Simplified BSD License. ----------------------------------------------------------------------- */ - + #ifndef NVD_TIMER_H #define NVD_TIMER_H @@ -41,8 +41,8 @@ class UCL_Timer { /** \note init() must be called to reuse timer after a clear() **/ inline void clear() { if (_initialized) { - CU_SAFE_CALL(cuEventDestroy(start_event)); - CU_SAFE_CALL(cuEventDestroy(stop_event)); + CU_DESTRUCT_CALL(cuEventDestroy(start_event)); + CU_DESTRUCT_CALL(cuEventDestroy(stop_event)); _initialized=false; _total_time=0.0; } diff --git a/lib/gpu/geryon/ocl_device.h b/lib/gpu/geryon/ocl_device.h index 97cbf7f1e7..57f83b533e 100644 --- a/lib/gpu/geryon/ocl_device.h +++ b/lib/gpu/geryon/ocl_device.h @@ -20,7 +20,7 @@ certain rights in this software. This software is distributed under the Simplified BSD License. 
----------------------------------------------------------------------- */ - + #ifndef OCL_DEVICE #define OCL_DEVICE @@ -265,10 +265,10 @@ inline UCL_Device::UCL_Device() { inline UCL_Device::~UCL_Device() { if (_device>-1) { for (size_t i=0; i<_cq.size(); i++) { - CL_SAFE_CALL(clReleaseCommandQueue(_cq.back())); + CL_DESTRUCT_CALL(clReleaseCommandQueue(_cq.back())); _cq.pop_back(); } - CL_SAFE_CALL(clReleaseContext(_context)); + CL_DESTRUCT_CALL(clReleaseContext(_context)); } } diff --git a/lib/gpu/geryon/ocl_kernel.h b/lib/gpu/geryon/ocl_kernel.h index 4a02f848a6..8b8807f153 100644 --- a/lib/gpu/geryon/ocl_kernel.h +++ b/lib/gpu/geryon/ocl_kernel.h @@ -13,7 +13,7 @@ copyright : (C) 2010 by W. Michael Brown email : brownw@ornl.gov ***************************************************************************/ - + /* ----------------------------------------------------------------------- Copyright (2010) Sandia Corporation. Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains @@ -51,9 +51,9 @@ class UCL_Program { /** \note Must call init() after each clear **/ inline void clear() { if (_init_done) { - CL_SAFE_CALL(clReleaseProgram(_program)); - CL_SAFE_CALL(clReleaseContext(_context)); - CL_SAFE_CALL(clReleaseCommandQueue(_cq)); + CL_DESTRUCT_CALL(clReleaseProgram(_program)); + CL_DESTRUCT_CALL(clReleaseContext(_context)); + CL_DESTRUCT_CALL(clReleaseCommandQueue(_cq)); _init_done=false; } } diff --git a/lib/gpu/geryon/ocl_timer.h b/lib/gpu/geryon/ocl_timer.h index 072c2f212d..aafb0aac4b 100644 --- a/lib/gpu/geryon/ocl_timer.h +++ b/lib/gpu/geryon/ocl_timer.h @@ -20,7 +20,7 @@ certain rights in this software. This software is distributed under the Simplified BSD License. ----------------------------------------------------------------------- */ - + #ifndef OCL_TIMER_H #define OCL_TIMER_H @@ -41,7 +41,7 @@ class UCL_Timer { /** \note init() must be called to reuse timer after a clear() **/ inline void clear() { if (_initialized) { - CL_SAFE_CALL(clReleaseCommandQueue(_cq)); + CL_DESTRUCT_CALL(clReleaseCommandQueue(_cq)); clReleaseEvent(start_event); clReleaseEvent(stop_event); _initialized=false; diff --git a/lib/gpu/lj96_cut_gpu.cpp b/lib/gpu/lj96_cut_gpu.cpp index 6411b20853..24fb5d8570 100644 --- a/lib/gpu/lj96_cut_gpu.cpp +++ b/lib/gpu/lj96_cut_gpu.cpp @@ -45,7 +45,7 @@ bool lj96_gpu_init(const int ntypes, double **cutsq, double **host_lj1, LJ96MF.device->init_message(screen,"lj96/cut",first_gpu,last_gpu); bool message=false; - if (world_me==0 && screen) + if (LJ96MF.device->replica_me()==0 && screen) message=true; if (message) { @@ -61,14 +61,14 @@ bool lj96_gpu_init(const int ntypes, double **cutsq, double **host_lj1, return false; } - MPI_Barrier(MPI_COMM_WORLD); + LJ96MF.device->world_barrier(); if (message) fprintf(screen,"Done.\n"); for (int i=0; igpu_comm); + LJ96MF.device->gpu_barrier(); if (message) fprintf(screen,"Done.\n"); } diff --git a/lib/gpu/lj96_cut_gpu_kernel.cu b/lib/gpu/lj96_cut_gpu_kernel.cu index a1faec8f24..083060a7a1 100644 --- a/lib/gpu/lj96_cut_gpu_kernel.cu +++ b/lib/gpu/lj96_cut_gpu_kernel.cu @@ -91,14 +91,14 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1, if (iiinit_message(screen,"lj/cut",first_gpu,last_gpu); bool message=false; - if (world_me==0 && screen) + if (LJLMF.device->replica_me()==0 && screen) message=true; if (message) { @@ -62,14 +62,14 @@ bool ljl_gpu_init(const int ntypes, double **cutsq, return false; } - MPI_Barrier(MPI_COMM_WORLD); + LJLMF.device->world_barrier(); if 
(message) fprintf(screen,"Done.\n"); for (int i=0; igpu_comm); + LJLMF.device->gpu_barrier(); if (message) fprintf(screen,"Done.\n"); } diff --git a/lib/gpu/lj_cut_gpu_kernel.cu b/lib/gpu/lj_cut_gpu_kernel.cu index 5c784084c4..25d00a202a 100644 --- a/lib/gpu/lj_cut_gpu_kernel.cu +++ b/lib/gpu/lj_cut_gpu_kernel.cu @@ -91,14 +91,14 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1, if (iiinit_message(screen,"lj/cut/coul/cut",first_gpu,last_gpu); bool message=false; - if (world_me==0 && screen) + if (LJCMF.device->replica_me()==0 && screen) message=true; if (message) { @@ -65,14 +65,14 @@ bool ljc_gpu_init(const int ntypes, double **cutsq, double **host_lj1, return false; } - MPI_Barrier(MPI_COMM_WORLD); + LJCMF.device->world_barrier(); if (message) fprintf(screen,"Done.\n"); for (int i=0; igpu_comm); + LJCMF.device->gpu_barrier(); if (message) fprintf(screen,"Done.\n"); } diff --git a/lib/gpu/ljc_cut_gpu_kernel.cu b/lib/gpu/ljc_cut_gpu_kernel.cu index b6d9610f0d..fac49b87ea 100644 --- a/lib/gpu/ljc_cut_gpu_kernel.cu +++ b/lib/gpu/ljc_cut_gpu_kernel.cu @@ -106,15 +106,15 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1, sp_lj[7]=sp_lj_in[7]; if (iiinit_message(screen,"lj/cut/coul/long",first_gpu,last_gpu); bool message=false; - if (world_me==0 && screen) + if (LJCLMF.device->replica_me()==0 && screen) message=true; if (message) { @@ -66,14 +66,14 @@ bool ljcl_gpu_init(const int ntypes, double **cutsq, double **host_lj1, return false; } - MPI_Barrier(MPI_COMM_WORLD); + LJCLMF.device->world_barrier(); if (message) fprintf(screen,"Done.\n"); for (int i=0; igpu_comm); + LJCLMF.device->gpu_barrier(); if (message) fprintf(screen,"Done.\n"); } diff --git a/lib/gpu/ljcl_cut_gpu_kernel.cu b/lib/gpu/ljcl_cut_gpu_kernel.cu index 9e2fb3c230..be2ae069e4 100644 --- a/lib/gpu/ljcl_cut_gpu_kernel.cu +++ b/lib/gpu/ljcl_cut_gpu_kernel.cu @@ -114,15 +114,15 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1, sp_lj[7]=sp_lj_in[7]; if (ii PairGPUAtomT::PairGPUAtom() : _compiled(false),_allocated(false),_eflag(false), - _vflag(false),_inum(0),_ilist(NULL) { + _vflag(false),_inum(0),_ilist(NULL), + _newton(false) { #ifndef USE_OPENCL sort_config.op = CUDPP_ADD; sort_config.datatype = CUDPP_UINT; @@ -64,7 +65,13 @@ int PairGPUAtomT::bytes_per_atom() const { } template -bool PairGPUAtomT::alloc(const int max_atoms) { +bool PairGPUAtomT::alloc(const int inum, const int nall) { + _max_atoms=static_cast(static_cast(nall)*1.10); + if (_newton) + _max_local=_max_atoms; + else + _max_local=static_cast(static_cast(inum)*1.10); + bool success=true; int ans_elements=4; @@ -79,10 +86,10 @@ bool PairGPUAtomT::alloc(const int max_atoms) { // Allocate storage for CUDPP sort #ifndef USE_OPENCL #ifdef WINDLL - _win_sort_alloc(max_atoms); + _win_sort_alloc(_max_atoms); #else if (_gpu_nbor) { - CUDPPResult result = cudppPlan(&sort_plan, sort_config, max_atoms, 1, 0); + CUDPPResult result = cudppPlan(&sort_plan, sort_config, _max_atoms, 1, 0); if (CUDPP_SUCCESS != result) return false; } @@ -92,23 +99,23 @@ bool PairGPUAtomT::alloc(const int max_atoms) { // -------------------------- Host allocations // Get a host write only buffer #ifdef GPU_CAST - success=success && (host_x_cast.alloc(max_atoms*3,*dev, + success=success && (host_x_cast.alloc(_max_atoms*3,*dev, UCL_WRITE_OPTIMIZED)==UCL_SUCCESS); - success=success && (host_type_cast.alloc(max_atoms,*dev, + success=success && (host_type_cast.alloc(_max_atoms,*dev, UCL_WRITE_OPTIMIZED)==UCL_SUCCESS); #else - 
success=success && (host_x.alloc(max_atoms*4,*dev, + success=success && (host_x.alloc(_max_atoms*4,*dev, UCL_WRITE_OPTIMIZED)==UCL_SUCCESS); #endif - success=success && (host_ans.alloc(ans_elements*max_atoms,*dev)==UCL_SUCCESS); - success=success && (host_engv.alloc(_ev_fields*max_atoms,*dev)==UCL_SUCCESS); + success=success &&(host_ans.alloc(ans_elements*_max_local,*dev)==UCL_SUCCESS); + success=success &&(host_engv.alloc(_ev_fields*_max_local,*dev)==UCL_SUCCESS); // Buffer for casting only if different precisions if (_charge) - success=success && (host_q.alloc(max_atoms,*dev, + success=success && (host_q.alloc(_max_atoms,*dev, UCL_WRITE_OPTIMIZED)==UCL_SUCCESS); // Buffer for casting only if different precisions if (_rot) - success=success && (host_quat.alloc(max_atoms*4,*dev, + success=success && (host_quat.alloc(_max_atoms*4,*dev, UCL_WRITE_OPTIMIZED)==UCL_SUCCESS); @@ -128,43 +135,44 @@ bool PairGPUAtomT::alloc(const int max_atoms) { dev_q.view(host_q); } else { #ifdef GPU_CAST - success=success && (UCL_SUCCESS==dev_x.alloc(max_atoms*4,*dev)); + success=success && (UCL_SUCCESS==dev_x.alloc(_max_atoms*4,*dev)); success=success && (UCL_SUCCESS== - dev_x_cast.alloc(max_atoms*3,*dev,UCL_READ_ONLY)); + dev_x_cast.alloc(_max_atoms*3,*dev,UCL_READ_ONLY)); success=success && (UCL_SUCCESS== - dev_type_cast.alloc(max_atoms,*dev,UCL_READ_ONLY)); + dev_type_cast.alloc(_max_atoms,*dev,UCL_READ_ONLY)); _gpu_bytes+=dev_x_cast.row_bytes()+dev_type_cast.row_bytes(); #else success=success && (UCL_SUCCESS== - dev_x.alloc(max_atoms*4,*dev,UCL_READ_ONLY)); + dev_x.alloc(_max_atoms*4,*dev,UCL_READ_ONLY)); #endif - success=success && (dev_engv.alloc(_ev_fields*max_atoms,*dev, + success=success && (dev_engv.alloc(_ev_fields*_max_local,*dev, UCL_WRITE_ONLY)==UCL_SUCCESS); - success=success && (dev_ans.alloc(ans_elements*max_atoms, + success=success && (dev_ans.alloc(ans_elements*_max_local, *dev,UCL_WRITE_ONLY)==UCL_SUCCESS); if (_charge) { - success=success && (dev_q.alloc(max_atoms,*dev, + success=success && (dev_q.alloc(_max_atoms,*dev, UCL_READ_ONLY)==UCL_SUCCESS); _gpu_bytes+=dev_q.row_bytes(); } if (_rot) { - success=success && (dev_quat.alloc(max_atoms*4,*dev, + success=success && (dev_quat.alloc(_max_atoms*4,*dev, UCL_READ_ONLY)==UCL_SUCCESS); _gpu_bytes+=dev_quat.row_bytes(); } } if (_gpu_nbor) { - success=success && (dev_cell_id.alloc(max_atoms,*dev)==UCL_SUCCESS); - success=success && (dev_particle_id.alloc(max_atoms,*dev)==UCL_SUCCESS); + success=success && (dev_cell_id.alloc(_max_atoms,*dev)==UCL_SUCCESS); + success=success && (dev_particle_id.alloc(_max_atoms,*dev)==UCL_SUCCESS); _gpu_bytes+=dev_cell_id.row_bytes()+dev_particle_id.row_bytes(); if (_bonds) { - success=success && (dev_tag.alloc(max_atoms,*dev)==UCL_SUCCESS); + success=success && (dev_tag.alloc(_max_atoms,*dev)==UCL_SUCCESS); _gpu_bytes+=dev_tag.row_bytes(); } } _gpu_bytes+=dev_x.row_bytes()+dev_engv.row_bytes()+dev_ans.row_bytes(); - + + _allocated=true; return success; } @@ -188,14 +196,13 @@ bool PairGPUAtomT::init(const int inum, const int nall, const bool charge, _ev_fields=6+_e_fields; // Initialize atom and nbor data - int max_local=static_cast(static_cast(inum)*1.10); - if (max_local==0) - max_local=1000; - if (nall<=inum) - _max_atoms=max_local*2; - else - _max_atoms=static_cast(static_cast(nall)*1.10); - + int ef_inum=inum; + if (ef_inum==0) + ef_inum=1000; + int ef_nall=nall; + if (ef_nall<=ef_inum) + ef_nall=ef_inum*2; + // Initialize timers for the selected device time_pos.init(*dev); time_other.init(*dev); @@ -209,8 
+216,7 @@ bool PairGPUAtomT::init(const int inum, const int nall, const bool charge, compile_kernels(*dev); #endif - _allocated=true; - return success && alloc(_max_atoms); + return success && alloc(ef_inum,ef_nall); } template @@ -285,7 +291,7 @@ double PairGPUAtomT::host_memory_usage() const { atom_bytes+=4; int ans_bytes=atom_bytes+_ev_fields; return _max_atoms*atom_bytes*sizeof(numtyp)+ - ans_bytes*(_max_atoms)*sizeof(acctyp)+ + ans_bytes*(_max_local)*sizeof(acctyp)+ sizeof(PairGPUAtom); } diff --git a/lib/gpu/pair_gpu_atom.h b/lib/gpu/pair_gpu_atom.h index 7cec73f98c..e0a1fd9fb1 100644 --- a/lib/gpu/pair_gpu_atom.h +++ b/lib/gpu/pair_gpu_atom.h @@ -77,11 +77,9 @@ class PairGPUAtom { inline bool resize(const int inum, const int nall, bool &success) { _inum=inum; _nall=nall; - if (nall>_max_atoms) { + if (inum>_max_local || nall>_max_atoms) { clear_resize(); - _max_atoms=static_cast(static_cast(nall)*1.10); - _allocated=true; - success = success && alloc(_max_atoms); + success = success && alloc(inum,nall); return true; } return false; @@ -203,6 +201,19 @@ class PairGPUAtom { ucl_copy(dev_v,view,false); } + /// Pack LAMMPS atom "self" type constants into 2 vectors and copy to device + template + inline void self_pack2(const int n, UCL_D_Vec &dev_v, + UCL_H_Vec &buffer, t1 **one, t2 **two) { + for (int i=0; i(one[i][i]); + buffer[i*2+1]=static_cast(two[i][i]); + } + UCL_H_Vec view; + view.view((dev_typ*)buffer.begin(),n,*dev); + ucl_copy(dev_v,view,false); + } + // -------------------------COPY TO GPU ---------------------------------- /// Cast positions and types to write buffer @@ -386,15 +397,17 @@ class PairGPUAtom { bool _compiled; - bool alloc(const int max_atoms); + bool alloc(const int inum, const int nall); bool _allocated, _eflag, _vflag, _ef_atom, _vf_atom, _rot, _charge, _other; - int _max_atoms, _nall, _inum, _e_fields, _ev_fields; + int _max_local, _max_atoms, _nall, _inum, _e_fields, _ev_fields; bool _gpu_nbor, _bonds; int *_ilist; double _time_cast; double _gpu_bytes; + + bool _newton; #ifndef USE_OPENCL CUDPPConfiguration sort_config; diff --git a/lib/gpu/pair_gpu_balance.h b/lib/gpu/pair_gpu_balance.h index 5d7b74dac9..a3a0f61a62 100644 --- a/lib/gpu/pair_gpu_balance.h +++ b/lib/gpu/pair_gpu_balance.h @@ -65,11 +65,9 @@ class PairGPUBalance { inline double all_avg_split() { if (_load_balance) { double _all_avg_split=0.0; - int nprocs; - MPI_Comm_size(MPI_COMM_WORLD,&nprocs); MPI_Reduce(&_avg_split,&_all_avg_split,1,MPI_DOUBLE,MPI_SUM,0, - MPI_COMM_WORLD); - _all_avg_split/=nprocs; + _device->replica()); + _all_avg_split/=_device->replica_size(); return _all_avg_split/_avg_count; } else return _actual_split; @@ -83,10 +81,10 @@ class PairGPUBalance { inline void start_timer() { if (_measure_this_step) { _device->gpu->sync(); - MPI_Barrier(_device->gpu_comm); + _device->gpu_barrier(); _device_time.start(); _device->gpu->sync(); - MPI_Barrier(_device->gpu_comm); + _device->gpu_barrier(); _device->start_host_timer(); } } @@ -178,7 +176,7 @@ void PairGPUBalanceT::balance(const double cpu_time, const bool gpu_nbor) { cpu_gpu_time[2]=(_device->host_time()-cpu_time)/_inum_full; MPI_Allreduce(cpu_gpu_time,max_times,3,MPI_DOUBLE,MPI_MAX, - _device->gpu_comm); + _device->gpu_comm()); double split=(max_times[0]+max_times[2])/(max_times[0]+max_times[1]); split*=_HD_BALANCE_GAP; diff --git a/lib/gpu/pair_gpu_device.cpp b/lib/gpu/pair_gpu_device.cpp index ab03e814b6..c2d980cf99 100644 --- a/lib/gpu/pair_gpu_device.cpp +++ b/lib/gpu/pair_gpu_device.cpp @@ -10,7 +10,7 @@ See the README 
file in the top-level LAMMPS directory. ------------------------------------------------------------------------- */ - + /* ---------------------------------------------------------------------- Contributing authors: Mike Brown (ORNL), brownw@ornl.gov ------------------------------------------------------------------------- */ @@ -34,19 +34,28 @@ PairGPUDeviceT::~PairGPUDevice() { } template -bool PairGPUDeviceT::init_device(const int first_gpu, const int last_gpu, - const int gpu_mode, const double p_split) { +bool PairGPUDeviceT::init_device(MPI_Comm world, MPI_Comm replica, + const int first_gpu, const int last_gpu, + const int gpu_mode, const double p_split, + const int nthreads) { + _nthreads=nthreads; + if (_device_init) return true; _device_init=true; + _comm_world=world; + _comm_replica=replica; _first_device=first_gpu; _last_device=last_gpu; _gpu_mode=gpu_mode; _particle_split=p_split; - // Get the rank within the world - MPI_Comm_rank(MPI_COMM_WORLD,&_world_me); - MPI_Comm_size(MPI_COMM_WORLD,&_world_size); + // Get the rank/size within the world + MPI_Comm_rank(_comm_world,&_world_me); + MPI_Comm_size(_comm_world,&_world_size); + // Get the rank/size within the replica + MPI_Comm_rank(_comm_replica,&_replica_me); + MPI_Comm_size(_comm_replica,&_replica_size); // Get the names of all nodes int name_length; @@ -54,7 +63,7 @@ bool PairGPUDeviceT::init_device(const int first_gpu, const int last_gpu, char node_names[MPI_MAX_PROCESSOR_NAME*_world_size]; MPI_Get_processor_name(node_name,&name_length); MPI_Allgather(&node_name,MPI_MAX_PROCESSOR_NAME,MPI_CHAR,&node_names, - MPI_MAX_PROCESSOR_NAME,MPI_CHAR,MPI_COMM_WORLD); + MPI_MAX_PROCESSOR_NAME,MPI_CHAR,_comm_world); std::string node_string=std::string(node_name); // Get the number of procs per node @@ -80,7 +89,7 @@ bool PairGPUDeviceT::init_device(const int first_gpu, const int last_gpu, // Set up a per node communicator and find rank within MPI_Comm node_comm; - MPI_Comm_split(MPI_COMM_WORLD, split_id, 0, &node_comm); + MPI_Comm_split(_comm_world, split_id, 0, &node_comm); int node_rank; MPI_Comm_rank(node_comm,&node_rank); @@ -90,8 +99,8 @@ bool PairGPUDeviceT::init_device(const int first_gpu, const int last_gpu, int my_gpu=node_rank/_procs_per_gpu; // Set up a per device communicator - MPI_Comm_split(node_comm,my_gpu,0,&gpu_comm); - MPI_Comm_rank(gpu_comm,&_gpu_rank); + MPI_Comm_split(node_comm,my_gpu,0,&_comm_gpu); + MPI_Comm_rank(_comm_gpu,&_gpu_rank); gpu=new UCL_Device(); if (my_gpu>=gpu->num_devices()) @@ -111,10 +120,13 @@ bool PairGPUDeviceT::init(const bool charge, const bool rot, const int nlocal, return false; if (_init_count==0) { // Initialize atom and nbor data - if (!atom.init(nlocal,nall,charge,rot,*gpu,gpu_nbor, + int ef_nlocal=nlocal; + if (_particle_split<1.0 && _particle_split>0.0) + ef_nlocal=static_cast(_particle_split*nlocal); + if (!atom.init(ef_nlocal,nall,charge,rot,*gpu,gpu_nbor, gpu_nbor && maxspecial>0)) return false; - if (!nbor.init(nlocal,host_nlocal,max_nbors,maxspecial,*gpu,gpu_nbor, + if (!nbor.init(ef_nlocal,host_nlocal,max_nbors,maxspecial,*gpu,gpu_nbor, gpu_host,pre_cut)) return false; nbor.cell_size(cell_size); @@ -136,7 +148,7 @@ void PairGPUDeviceT::init_message(FILE *screen, const char *name, std::string fs=toa(gpu->free_gigabytes())+"/"; #endif - if (_world_me == 0 && screen) { + if (_replica_me == 0 && screen) { fprintf(screen,"\n-------------------------------------"); fprintf(screen,"-------------------------------------\n"); fprintf(screen,"- Using GPGPU acceleration for %s:\n",name); 
@@ -175,14 +187,14 @@ void PairGPUDeviceT::output_times(UCL_Timer &time_pair, const double avg_split, single[3]=time_pair.total_seconds(); single[4]=atom.cast_time(); - MPI_Reduce(single,times,5,MPI_DOUBLE,MPI_SUM,0,MPI_COMM_WORLD); + MPI_Reduce(single,times,5,MPI_DOUBLE,MPI_SUM,0,_comm_replica); double my_max_bytes=max_bytes; double mpi_max_bytes; - MPI_Reduce(&my_max_bytes,&mpi_max_bytes,1,MPI_DOUBLE,MPI_MAX,0,MPI_COMM_WORLD); + MPI_Reduce(&my_max_bytes,&mpi_max_bytes,1,MPI_DOUBLE,MPI_MAX,0,_comm_replica); double max_mb=mpi_max_bytes/(1024.0*1024.0); - if (world_me()==0) + if (replica_me()==0) if (screen && times[3]>0.0) { fprintf(screen,"\n\n-------------------------------------"); fprintf(screen,"--------------------------------\n"); @@ -191,14 +203,14 @@ void PairGPUDeviceT::output_times(UCL_Timer &time_pair, const double avg_split, fprintf(screen,"--------------------------------\n"); if (procs_per_gpu()==1) { - fprintf(screen,"Data Transfer: %.4f s.\n",times[0]/_world_size); - fprintf(screen,"Data Cast/Pack: %.4f s.\n",times[4]/_world_size); - fprintf(screen,"Neighbor copy: %.4f s.\n",times[1]/_world_size); + fprintf(screen,"Data Transfer: %.4f s.\n",times[0]/_replica_size); + fprintf(screen,"Data Cast/Pack: %.4f s.\n",times[4]/_replica_size); + fprintf(screen,"Neighbor copy: %.4f s.\n",times[1]/_replica_size); if (nbor.gpu_nbor()) - fprintf(screen,"Neighbor build: %.4f s.\n",times[2]/_world_size); + fprintf(screen,"Neighbor build: %.4f s.\n",times[2]/_replica_size); else - fprintf(screen,"Neighbor unpack: %.4f s.\n",times[2]/_world_size); - fprintf(screen,"Force calc: %.4f s.\n",times[3]/_world_size); + fprintf(screen,"Neighbor unpack: %.4f s.\n",times[2]/_replica_size); + fprintf(screen,"Force calc: %.4f s.\n",times[3]/_replica_size); } fprintf(screen,"Average split: %.4f.\n",avg_split); fprintf(screen,"Max Mem / Proc: %.2f MB.\n",max_mb); @@ -239,10 +251,11 @@ double PairGPUDeviceT::host_memory_usage() const { template class PairGPUDevice; PairGPUDevice pair_gpu_device; -bool lmp_init_device(const int first_gpu, const int last_gpu, - const int gpu_mode, const double particle_split) { - return pair_gpu_device.init_device(first_gpu,last_gpu,gpu_mode, - particle_split); +bool lmp_init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu, + const int last_gpu, const int gpu_mode, + const double particle_split, const int nthreads) { + return pair_gpu_device.init_device(world,replica,first_gpu,last_gpu,gpu_mode, + particle_split,nthreads); } void lmp_clear_device() { @@ -261,3 +274,4 @@ double lmp_gpu_forces(double **f, double **tor, double *eatom, } return 0.0; } + diff --git a/lib/gpu/pair_gpu_device.h b/lib/gpu/pair_gpu_device.h index ed2baa0cc6..33aa54959b 100644 --- a/lib/gpu/pair_gpu_device.h +++ b/lib/gpu/pair_gpu_device.h @@ -10,7 +10,7 @@ See the README file in the top-level LAMMPS directory. 
------------------------------------------------------------------------- */ - + /* ---------------------------------------------------------------------- Contributing authors: Mike Brown (ORNL), brownw@ornl.gov ------------------------------------------------------------------------- */ @@ -34,8 +34,9 @@ class PairGPUDevice { /// Initialize the device for use by this process /** Sets up a per-device MPI communicator for load balancing and initializes * the device (>=first_gpu and <=last_gpu) that this proc will be using **/ - bool init_device(const int first_gpu, const int last_gpu, - const int gpu_mode, const double particle_split); + bool init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu, + const int last_gpu, const int gpu_mode, + const double particle_split, const int nthreads); /// Initialize the device for Atom and Neighbor storage /** \param rot True if quaternions need to be stored @@ -83,12 +84,26 @@ class PairGPUDevice { /// Return the number of procs sharing a device (size of device commincator) inline int procs_per_gpu() const { return _procs_per_gpu; } - /// Return my rank in the device communicator - inline int gpu_rank() const { return _gpu_rank; } + /// Return the number of threads per proc + inline int num_threads() const { return _nthreads; } /// My rank within all processes inline int world_me() const { return _world_me; } /// Total number of processes inline int world_size() const { return _world_size; } + /// MPI Barrier for world + inline void world_barrier() { MPI_Barrier(_comm_world); } + /// Return the replica MPI communicator + inline MPI_Comm & replica() { return _comm_replica; } + /// My rank within replica communicator + inline int replica_me() const { return _replica_me; } + /// Number of procs in replica communicator + inline int replica_size() const { return _replica_size; } + /// Return the per-GPU MPI communicator + inline MPI_Comm & gpu_comm() { return _comm_gpu; } + /// Return my rank in the device communicator + inline int gpu_rank() const { return _gpu_rank; } + /// MPI Barrier for gpu + inline void gpu_barrier() { MPI_Barrier(_comm_gpu); } /// Return the 'mode' for acceleration: GPU_FORCE or GPU_NEIGH inline int gpu_mode() const { return _gpu_mode; } /// Index of first device used by a node @@ -104,8 +119,6 @@ class PairGPUDevice { /// Geryon Device UCL_Device *gpu; - /// Device communicator - MPI_Comm gpu_comm; enum{GPU_FORCE, GPU_NEIGH}; @@ -122,8 +135,10 @@ class PairGPUDevice { private: int _init_count; bool _device_init; - int _procs_per_gpu, _gpu_rank, _world_me, _world_size; - int _gpu_mode, _first_device, _last_device; + MPI_Comm _comm_world, _comm_replica, _comm_gpu; + int _procs_per_gpu, _gpu_rank, _world_me, _world_size, _replica_me, + _replica_size; + int _gpu_mode, _first_device, _last_device, _nthreads; double _particle_split; double _cpu_full; diff --git a/lib/gpu/pair_gpu_precision.h b/lib/gpu/pair_gpu_precision.h index 554466a2fa..a5f57c1f95 100644 --- a/lib/gpu/pair_gpu_precision.h +++ b/lib/gpu/pair_gpu_precision.h @@ -85,6 +85,7 @@ inline std::ostream & operator<<(std::ostream &out, const _lgpu_double4 &v) { #endif #define MAX_SHARED_TYPES 8 +#define MAX_BIO_SHARED_TYPES 128 enum{SPHERE_SPHERE,SPHERE_ELLIPSE,ELLIPSE_SPHERE,ELLIPSE_ELLIPSE}; #endif
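---
Note (not part of the patch): the core change above is that the GPU library stops using MPI_COMM_WORLD directly and instead takes "world" and "replica" communicators in init_device(), then derives a per-node and a per-GPU communicator for world_barrier()/gpu_barrier() and the timing reductions. The following is a minimal standalone sketch of that communicator layout, under stated assumptions: it is not the patch's code, ngpus_per_node is a made-up example value, and MPI_Comm_split_type (MPI-3) is used as a shortcut where the patch gathers processor names with MPI_Allgather to group ranks by node.

```cpp
// comm_layout_sketch.cpp -- illustrative only; mirrors the world -> node -> GPU
// communicator splitting pattern added in PairGPUDevice::init_device().
#include <mpi.h>
#include <cstdio>

int main(int argc, char **argv) {
  MPI_Init(&argc, &argv);

  MPI_Comm world = MPI_COMM_WORLD;     // would be passed to lmp_init_device() as "world"
  int world_me, world_size;
  MPI_Comm_rank(world, &world_me);
  MPI_Comm_size(world, &world_size);

  // Group ranks that share a node.  The patch does this by gathering
  // MPI_Get_processor_name() strings; the sketch uses MPI_Comm_split_type.
  MPI_Comm node_comm;
  MPI_Comm_split_type(world, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &node_comm);
  int node_rank, node_size;
  MPI_Comm_rank(node_comm, &node_rank);
  MPI_Comm_size(node_comm, &node_size);

  // Assign each rank to a device and split again to obtain the per-GPU
  // communicator used by gpu_barrier() and the load-balance reductions.
  const int ngpus_per_node = 2;                 // assumption for the example
  int procs_per_gpu = node_size / ngpus_per_node;
  if (procs_per_gpu < 1) procs_per_gpu = 1;
  int my_gpu = node_rank / procs_per_gpu;

  MPI_Comm gpu_comm;
  MPI_Comm_split(node_comm, my_gpu, 0, &gpu_comm);
  int gpu_rank;
  MPI_Comm_rank(gpu_comm, &gpu_rank);

  printf("world %d/%d  node_rank %d  gpu %d  gpu_rank %d\n",
         world_me, world_size, node_rank, my_gpu, gpu_rank);

  MPI_Barrier(gpu_comm);                        // analogue of device->gpu_barrier()

  MPI_Comm_free(&gpu_comm);
  MPI_Comm_free(&node_comm);
  MPI_Finalize();
  return 0;
}
```

In the patch itself the "replica" communicator (LAMMPS's per-replica world) additionally replaces MPI_COMM_WORLD in the MPI_Reduce calls that accumulate timing and memory statistics, so output is reported per replica rather than across all replicas.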