From 0ecefcd1ba64b9580707de6ba247c32ec26e1cd1 Mon Sep 17 00:00:00 2001
From: "W. Michael Brown" <brownw@ornl.gov>
Date: Tue, 10 May 2011 16:38:17 -0400
Subject: [PATCH] Working re_squared.

---
 lib/gpu/Nvidia.makefile        |  32 ++--
 lib/gpu/Opencl.makefile        |   8 +-
 lib/gpu/base_ellipsoid.cpp     |  13 +-
 lib/gpu/base_ellipsoid.h       |   8 +-
 lib/gpu/gayberne.cpp           |   4 +-
 lib/gpu/re_squared.cpp         |  38 ++--
 lib/gpu/re_squared.cu          |  28 +--
 lib/gpu/re_squared.h           |  15 +-
 lib/gpu/re_squared_ext.cpp     |  35 ++--
 src/ASPHERE/pair_resquared.h   |   2 +-
 src/GPU/Install.sh             |   4 +
 src/GPU/pair_gayberne_gpu.h    |   5 +-
 src/GPU/pair_resquared_gpu.cpp | 325 +++++++++++++++++++++++++++++++++
 src/GPU/pair_resquared_gpu.h   |  49 +++++
 14 files changed, 468 insertions(+), 98 deletions(-)
 create mode 100644 src/GPU/pair_resquared_gpu.cpp
 create mode 100644 src/GPU/pair_resquared_gpu.h

diff --git a/lib/gpu/Nvidia.makefile b/lib/gpu/Nvidia.makefile
index aca8d8bba8..96df9c16e2 100644
--- a/lib/gpu/Nvidia.makefile
+++ b/lib/gpu/Nvidia.makefile
@@ -65,10 +65,10 @@ PTXS = $(OBJ_DIR)/pair_gpu_dev_kernel.ptx \
        $(OBJ_DIR)/pair_gpu_build_kernel.ptx $(OBJ_DIR)/pair_gpu_build_ptx.h \
        $(OBJ_DIR)/pppm_f_gpu_kernel.ptx $(OBJ_DIR)/pppm_f_gpu_ptx.h \
        $(OBJ_DIR)/pppm_d_gpu_kernel.ptx $(OBJ_DIR)/pppm_d_gpu_ptx.h \
-       $(OBJ_DIR)/ellipsoid_nbor.ptx $(OBJ_DIR)/gayberne.ptx \
-       $(OBJ_DIR)/gayberne_lj.ptx $(OBJ_DIR)/gayberne_ptx.h \
-       $(OBJ_DIR)/re_squared.ptx $(OBJ_DIR)/re_squared_lj.ptx \
-       $(OBJ_DIR)/re_squared_ptx.h \
+       $(OBJ_DIR)/ellipsoid_nbor.ptx $(OBJ_DIR)/ellipsoid_nbor_ptx.h \
+       $(OBJ_DIR)/gayberne.ptx $(OBJ_DIR)/gayberne_lj.ptx \
+       $(OBJ_DIR)/gayberne_ptx.h $(OBJ_DIR)/re_squared.ptx \
+       $(OBJ_DIR)/re_squared_lj.ptx $(OBJ_DIR)/re_squared_ptx.h \
        $(OBJ_DIR)/lj_cut_gpu_kernel.ptx $(OBJ_DIR)/lj_cut_gpu_ptx.h \
        $(OBJ_DIR)/lj96_cut_gpu_kernel.ptx $(OBJ_DIR)/lj96_cut_gpu_ptx.h \
        $(OBJ_DIR)/lj_expand_gpu_kernel.ptx $(OBJ_DIR)/lj_expand_gpu_ptx.h \
@@ -145,8 +145,8 @@ $(OBJ_DIR)/atomic_gpu_memory.o: $(ALL_H) atomic_gpu_memory.h atomic_gpu_memory.c
 $(OBJ_DIR)/charge_gpu_memory.o: $(ALL_H) charge_gpu_memory.h charge_gpu_memory.cpp
 	$(CUDR) -o $@ -c charge_gpu_memory.cpp
 
-$(OBJ_DIR)/base_ellipsoid.o: $(ALL_H) base_ellipsoid.h base_ellipsoid.cpp
-	$(CUDR) -o $@ -c base_ellipsoid.cpp
+$(OBJ_DIR)/base_ellipsoid.o: $(ALL_H) base_ellipsoid.h base_ellipsoid.cpp $(OBJ_DIR)/ellipsoid_nbor_ptx.h
+	$(CUDR) -o $@ -c base_ellipsoid.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/pppm_f_gpu_kernel.ptx: pppm_gpu_kernel.cu pair_gpu_precision.h
 	$(CUDA) --ptx -DNV_KERNEL -Dgrdtyp=float -Dgrdtyp4=float4 -o $@ pppm_gpu_kernel.cu
@@ -166,17 +166,20 @@ $(OBJ_DIR)/pppm_gpu_memory.o: $(ALL_H) pppm_gpu_memory.h pppm_gpu_memory.cpp $(O
 $(OBJ_DIR)/pppm_l_gpu.o: $(ALL_H) pppm_gpu_memory.h pppm_l_gpu.cpp
 	$(CUDR) -o $@ -c pppm_l_gpu.cpp -I$(OBJ_DIR)
 
+$(OBJ_DIR)/ellipsoid_nbor.ptx: ellipsoid_nbor.cu pair_gpu_precision.h
+	$(CUDA) --ptx -DNV_KERNEL -o $@ ellipsoid_nbor.cu
+
+$(OBJ_DIR)/ellipsoid_nbor_ptx.h: $(OBJ_DIR)/ellipsoid_nbor.ptx
+	$(BSH) ./geryon/file_to_cstr.sh $(OBJ_DIR)/ellipsoid_nbor.ptx $(OBJ_DIR)/ellipsoid_nbor_ptx.h
+
 $(OBJ_DIR)/gayberne.ptx: gayberne.cu pair_gpu_precision.h ellipsoid_extra.h
 	$(CUDA) --ptx -DNV_KERNEL -o $@ gayberne.cu
 
 $(OBJ_DIR)/gayberne_lj.ptx: gayberne_lj.cu pair_gpu_precision.h ellipsoid_extra.h
 	$(CUDA) --ptx -DNV_KERNEL -o $@ gayberne_lj.cu
 
-$(OBJ_DIR)/ellipsoid_nbor.ptx: ellipsoid_nbor.cu pair_gpu_precision.h
-	$(CUDA) --ptx -DNV_KERNEL -o $@ ellipsoid_nbor.cu
-
-$(OBJ_DIR)/gayberne_ptx.h: $(OBJ_DIR)/ellipsoid_nbor.ptx $(OBJ_DIR)/gayberne.ptx $(OBJ_DIR)/gayberne_lj.ptx
-	$(BSH) ./geryon/file_to_cstr.sh $(OBJ_DIR)/ellipsoid_nbor.ptx $(OBJ_DIR)/gayberne.ptx $(OBJ_DIR)/gayberne_lj.ptx $(OBJ_DIR)/gayberne_ptx.h
+$(OBJ_DIR)/gayberne_ptx.h: $(OBJ_DIR)/gayberne.ptx $(OBJ_DIR)/gayberne_lj.ptx
+	$(BSH) ./geryon/file_to_cstr.sh $(OBJ_DIR)/gayberne.ptx $(OBJ_DIR)/gayberne_lj.ptx $(OBJ_DIR)/gayberne_ptx.h
 
 $(OBJ_DIR)/gayberne.o: $(ALL_H) gayberne.h gayberne.cpp $(OBJ_DIR)/gayberne_ptx.h $(OBJ_DIR)/base_ellipsoid.o
 	$(CUDR) -o $@ -c gayberne.cpp -I$(OBJ_DIR)
@@ -190,11 +193,8 @@ $(OBJ_DIR)/re_squared.ptx: re_squared.cu pair_gpu_precision.h ellipsoid_extra.h
 $(OBJ_DIR)/re_squared_lj.ptx: re_squared_lj.cu pair_gpu_precision.h ellipsoid_extra.h
 	$(CUDA) --ptx -DNV_KERNEL -o $@ re_squared_lj.cu
 
-$(OBJ_DIR)/ellipsoid_nbor.ptx: ellipsoid_nbor.cu pair_gpu_precision.h
-	$(CUDA) --ptx -DNV_KERNEL -o $@ ellipsoid_nbor.cu
-
-$(OBJ_DIR)/re_squared_ptx.h: $(OBJ_DIR)/ellipsoid_nbor.ptx $(OBJ_DIR)/re_squared.ptx $(OBJ_DIR)/re_squared_lj.ptx
-	$(BSH) ./geryon/file_to_cstr.sh $(OBJ_DIR)/ellipsoid_nbor.ptx $(OBJ_DIR)/re_squared.ptx $(OBJ_DIR)/re_squared_lj.ptx $(OBJ_DIR)/re_squared_ptx.h
+$(OBJ_DIR)/re_squared_ptx.h: $(OBJ_DIR)/re_squared.ptx $(OBJ_DIR)/re_squared_lj.ptx
+	$(BSH) ./geryon/file_to_cstr.sh $(OBJ_DIR)/re_squared.ptx $(OBJ_DIR)/re_squared_lj.ptx $(OBJ_DIR)/re_squared_ptx.h
 
 $(OBJ_DIR)/re_squared.o: $(ALL_H) re_squared.h re_squared.cpp $(OBJ_DIR)/re_squared_ptx.h $(OBJ_DIR)/base_ellipsoid.o
 	$(CUDR) -o $@ -c re_squared.cpp -I$(OBJ_DIR)
diff --git a/lib/gpu/Opencl.makefile b/lib/gpu/Opencl.makefile
index 88138be91f..25a0a06656 100644
--- a/lib/gpu/Opencl.makefile
+++ b/lib/gpu/Opencl.makefile
@@ -91,8 +91,8 @@ $(OBJ_DIR)/atomic_gpu_memory.o: $(OCL_H) atomic_gpu_memory.h atomic_gpu_memory.c
 $(OBJ_DIR)/charge_gpu_memory.o: $(OCL_H) charge_gpu_memory.h charge_gpu_memory.cpp
 	$(OCL) -o $@ -c charge_gpu_memory.cpp
 
-$(OBJ_DIR)/base_ellipsoid.o: $(OCL_H) base_ellipsoid.h base_ellipsoid.cpp
-	$(OCL) -o $@ -c base_ellipsoid.cpp
+$(OBJ_DIR)/base_ellipsoid.o: $(OCL_H) base_ellipsoid.h base_ellipsoid.cpp $(OBJ_DIR)/ellipsoid_nbor_cl.h
+	$(OCL) -o $@ -c base_ellipsoid.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/pppm_gpu_cl.h: pppm_gpu_kernel.cu
 	$(BSH) ./geryon/file_to_cstr.sh pppm_gpu_kernel.cu $(OBJ_DIR)/pppm_gpu_cl.h;
@@ -112,7 +112,7 @@ $(OBJ_DIR)/gayberne_cl.h: gayberne.cu gayberne_lj.cu ellipsoid_extra.h
 	$(BSH) ./geryon/file_to_cstr.sh $(OBJ_DIR)/gayberne.tar $(OBJ_DIR)/gayberne_lj.tar $(OBJ_DIR)/gayberne_cl.h; \
 	rm -f $(OBJ_DIR)/gayberne.tar $(OBJ_DIR)/gayberne_lj.tar
 
-$(OBJ_DIR)/gayberne.o: $(ALL_H) gayberne.h gayberne.cpp $(OBJ_DIR)/ellipsoid_nbor_cl.h $(OBJ_DIR)/gayberne_cl.h $(OBJ_DIR)/base_ellipsoid.o
+$(OBJ_DIR)/gayberne.o: $(ALL_H) gayberne.h gayberne.cpp $(OBJ_DIR)/gayberne_cl.h $(OBJ_DIR)/base_ellipsoid.o
 	$(OCL) -o $@ -c gayberne.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/gayberne_ext.o: $(ALL_H) $(OBJ_DIR)/gayberne.o gayberne_ext.cpp
@@ -124,7 +124,7 @@ $(OBJ_DIR)/re_squared_cl.h: re_squared.cu re_squared_lj.cu ellipsoid_extra.h
 	$(BSH) ./geryon/file_to_cstr.sh $(OBJ_DIR)/re_squared.tar $(OBJ_DIR)/re_squared_lj.tar $(OBJ_DIR)/re_squared_cl.h; \
 	rm -f $(OBJ_DIR)/re_squared.tar $(OBJ_DIR)/re_squared_lj.tar
 
-$(OBJ_DIR)/re_squared.o: $(ALL_H) re_squared.h re_squared.cpp $(OBJ_DIR)/ellipsoid_nbor_cl.h $(OBJ_DIR)/re_squared_cl.h $(OBJ_DIR)/base_ellipsoid.o
+$(OBJ_DIR)/re_squared.o: $(ALL_H) re_squared.h re_squared.cpp $(OBJ_DIR)/re_squared_cl.h $(OBJ_DIR)/base_ellipsoid.o
 	$(OCL) -o $@ -c re_squared.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/re_squared_ext.o: $(ALL_H) $(OBJ_DIR)/re_squared.o re_squared_ext.cpp
diff --git a/lib/gpu/base_ellipsoid.cpp b/lib/gpu/base_ellipsoid.cpp
index 2f5c2e7aa1..65c5495345 100644
--- a/lib/gpu/base_ellipsoid.cpp
+++ b/lib/gpu/base_ellipsoid.cpp
@@ -16,6 +16,12 @@
 #include "base_ellipsoid.h"
 using namespace LAMMPS_AL;
 
+#ifdef USE_OPENCL
+#include "ellipsoid_nbor_cl.h"
+#else
+#include "ellipsoid_nbor_ptx.h"
+#endif
+
 #define BaseEllipsoidT BaseEllipsoid<numtyp, acctyp>
 extern PairGPUDevice<PRECISION,ACC_PRECISION> pair_gpu_device;
 
@@ -43,7 +49,6 @@ int BaseEllipsoidT::init_base(const int nlocal, const int nall,
                               const int max_nbors, const int maxspecial,
                               const double cell_size, const double gpu_split,
                               FILE *_screen, const int ntypes, int **h_form,
-                              const char *nbor_program, 
                               const char *ellipsoid_program,
                               const char *lj_program) {
   nbor_time_avail=false;
@@ -69,7 +74,7 @@ int BaseEllipsoidT::init_base(const int nlocal, const int nall,
   atom=&device->atom;
 
   _block_size=device->pair_block_size();
-  compile_kernels(*ucl_device,nbor_program,ellipsoid_program,lj_program);
+  compile_kernels(*ucl_device,ellipsoid_program,lj_program);
 
   // Initialize host-device load balancer
   hd_balancer.init(device,gpu_nbor,gpu_split);
@@ -423,7 +428,7 @@ double BaseEllipsoidT::host_memory_usage_base() const {
 }
 
 template <class numtyp, class acctyp>
-void BaseEllipsoidT::compile_kernels(UCL_Device &dev, const char *nbor_string,
+void BaseEllipsoidT::compile_kernels(UCL_Device &dev, 
                                      const char *ellipsoid_string,
                                      const char *lj_string) {
   if (_compiled)
@@ -433,7 +438,7 @@ void BaseEllipsoidT::compile_kernels(UCL_Device &dev, const char *nbor_string,
                     std::string(OCL_PRECISION_COMPILE);
 
   nbor_program=new UCL_Program(dev);
-  nbor_program->load_string(nbor_string,flags.c_str());
+  nbor_program->load_string(ellipsoid_nbor,flags.c_str());
   k_nbor_fast.set_function(*nbor_program,"kernel_nbor_fast");
   k_nbor.set_function(*nbor_program,"kernel_nbor");
 
diff --git a/lib/gpu/base_ellipsoid.h b/lib/gpu/base_ellipsoid.h
index 2c083175df..2665c6cfdc 100644
--- a/lib/gpu/base_ellipsoid.h
+++ b/lib/gpu/base_ellipsoid.h
@@ -48,8 +48,8 @@ class BaseEllipsoid {
   int init_base(const int nlocal, const int nall, const int max_nbors,
                 const int maxspecial, const double cell_size,
                 const double gpu_split, FILE *screen, const int ntypes,
-                int **h_form, const char *nbor_program, 
-                const char *ellipsoid_program, const char *lj_program);
+                int **h_form, const char *ellipsoid_program,
+                const char *lj_program);
 
   /// Estimate the overhead for GPU context changes and CPU driver
   void estimate_gpu_overhead();
@@ -233,8 +233,8 @@ class BaseEllipsoid {
   int **_host_form;
   int _last_ellipse, _max_last_ellipse;
 
-  void compile_kernels(UCL_Device &dev, const char *nbor_string,
-                       const char *ellipsoid_string, const char *lj_string);
+  void compile_kernels(UCL_Device &dev, const char *ellipsoid_string,
+                       const char *lj_string);
 
   virtual void loop(const bool _eflag, const bool _vflag) = 0;
 };
diff --git a/lib/gpu/gayberne.cpp b/lib/gpu/gayberne.cpp
index 123073e50a..d6299d9f94 100644
--- a/lib/gpu/gayberne.cpp
+++ b/lib/gpu/gayberne.cpp
@@ -15,7 +15,6 @@
 
 #ifdef USE_OPENCL
 #include "gayberne_cl.h"
-#include "ellipsoid_nbor_cl.h"
 #else
 #include "gayberne_ptx.h"
 #endif
@@ -56,8 +55,7 @@ int GayBerneT::init(const int ntypes, const double gamma,
                          const double gpu_split, FILE *_screen) {
   int success;
   success=this->init_base(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
-                          _screen,ntypes,h_form,ellipsoid_nbor,
-                          gayberne,gayberne_lj);
+                          _screen,ntypes,h_form,gayberne,gayberne_lj);
   if (success!=0)
     return success;
 
diff --git a/lib/gpu/re_squared.cpp b/lib/gpu/re_squared.cpp
index ce718560d4..83946dac98 100644
--- a/lib/gpu/re_squared.cpp
+++ b/lib/gpu/re_squared.cpp
@@ -15,7 +15,6 @@
 
 #ifdef USE_OPENCL
 #include "re_squared_cl.h"
-#include "ellipsoid_nbor_cl.h"
 #else
 #include "re_squared_ptx.h"
 #endif
@@ -43,20 +42,18 @@ int RESquaredT::bytes_per_atom(const int max_nbors) const {
 }
 
 template <class numtyp, class acctyp>
-int RESquaredT::init(const int ntypes, const double gamma, 
-                         const double upsilon, const double mu, 
-                         double **host_shape, double **host_well, 
-                         double **host_cutsq, double **host_sigma, 
-                         double **host_epsilon, double *host_lshape, 
-                         int **h_form, double **host_lj1, double **host_lj2,
-                         double **host_lj3, double **host_lj4,
-                         double **host_offset, const double *host_special_lj,
-                         const int nlocal, const int nall, const int max_nbors,
-                         const int maxspecial, const double cell_size,
-                         const double gpu_split, FILE *_screen) {
+int RESquaredT::init(const int ntypes, double **host_shape, double **host_well, 
+                     double **host_cutsq, double **host_sigma, 
+                     double **host_epsilon, double *host_lshape, int **h_form,
+                     double **host_lj1, double **host_lj2, double **host_lj3,
+                     double **host_lj4, double **host_offset,
+                     const double *host_special_lj, const int nlocal,
+                     const int nall, const int max_nbors, const int maxspecial,
+                     const double cell_size, const double gpu_split,
+                     FILE *_screen) {
   int success;
   success=this->init_base(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
-                          _screen,ntypes,h_form,ellipsoid_nbor,
+                          _screen,ntypes,h_form,
                           re_squared,re_squared_lj);
   if (success!=0)
     return success;
@@ -99,15 +96,12 @@ int RESquaredT::init(const int ntypes, const double gamma,
     
   // Allocate, cast and asynchronous memcpy of constant data
   // Copy data for bonded interactions
-  gamma_upsilon_mu.alloc(7,*(this->ucl_device),UCL_READ_ONLY);
-  host_write[0]=static_cast<numtyp>(gamma); 
-  host_write[1]=static_cast<numtyp>(upsilon);
-  host_write[2]=static_cast<numtyp>(mu);
-  host_write[3]=static_cast<numtyp>(host_special_lj[0]);
-  host_write[4]=static_cast<numtyp>(host_special_lj[1]);
-  host_write[5]=static_cast<numtyp>(host_special_lj[2]);
-  host_write[6]=static_cast<numtyp>(host_special_lj[3]);
-  ucl_copy(gamma_upsilon_mu,host_write,7,false);
+  gamma_upsilon_mu.alloc(4,*(this->ucl_device),UCL_READ_ONLY);
+  host_write[0]=static_cast<numtyp>(host_special_lj[0]);
+  host_write[1]=static_cast<numtyp>(host_special_lj[1]);
+  host_write[2]=static_cast<numtyp>(host_special_lj[2]);
+  host_write[3]=static_cast<numtyp>(host_special_lj[3]);
+  ucl_copy(gamma_upsilon_mu,host_write,4,false);
 
   lshape.alloc(ntypes,*(this->ucl_device),UCL_READ_ONLY);
   UCL_H_Vec<double> d_view;
diff --git a/lib/gpu/re_squared.cu b/lib/gpu/re_squared.cu
index e4603331d8..3170e8d8d5 100644
--- a/lib/gpu/re_squared.cu
+++ b/lib/gpu/re_squared.cu
@@ -101,15 +101,15 @@ __kernel void kernel_ellipsoid(__global numtyp4* x_,__global numtyp4 *q,
     ishape=shape[itype];
     numtyp4 ishape2;
     ishape2.x=ishape.x*ishape.x;
-    ishape2.y=ishape.x*ishape.y;
-    ishape2.z=ishape.x*ishape.z;
+    ishape2.y=ishape.y*ishape.y;
+    ishape2.z=ishape.z*ishape.z;
     
     {
       numtyp aTs[9];    // A1'*S1^2
       gpu_quat_to_mat_trans(q,i,a1);
       gpu_transpose_times_diag3(a1,well[itype],aTe1);
-      gpu_transpose_times_diag3(a1,ishape,aTs);
-      gpu_diag_times3(ishape,a1,sa1);
+      gpu_transpose_times_diag3(a1,ishape2,aTs);
+      gpu_diag_times3(ishape2,a1,sa1);
       gpu_times3(aTs,a1,gamma1);
       gpu_rotation_generator_x(a1,lA1_0);
       gpu_rotation_generator_y(a1,lA1_1);
@@ -159,14 +159,14 @@ __kernel void kernel_ellipsoid(__global numtyp4* x_,__global numtyp4 *q,
       jshape=shape[jtype];
       numtyp4 jshape2;
       jshape2.x=jshape.x*jshape.x;
-      jshape2.y=jshape.x*jshape.y;
-      jshape2.z=jshape.x*jshape.z;
+      jshape2.y=jshape.y*jshape.y;
+      jshape2.z=jshape.z*jshape.z;
       {
         numtyp aTs[9];    // A1'*S1^2
         gpu_quat_to_mat_trans(q,j,a2);
         gpu_transpose_times_diag3(a2,well[jtype],aTe2);
-        gpu_transpose_times_diag3(a2,jshape,aTs);
-        gpu_diag_times3(jshape,a2,sa2);
+        gpu_transpose_times_diag3(a2,jshape2,aTs);
+        gpu_diag_times3(jshape2,a2,sa2);
         gpu_times3(aTs,a2,gamma2);
         gpu_rotation_generator_x(a2,lA2_0);
         gpu_rotation_generator_y(a2,lA2_1);
@@ -357,19 +357,19 @@ __kernel void kernel_ellipsoid(__global numtyp4* x_,__global numtyp4 *q,
         if (i==0) {
           f.x+=force;
           if (vflag>0)
-            virial[0]+=r[0]*force;
+            virial[0]+=-r[0]*force;
         } else if (i==1) {
           f.y+=force;
           if (vflag>0) {
-            virial[1]+=r[1]*force;
-            virial[3]+=r[0]*force;
+            virial[1]+=-r[1]*force;
+            virial[3]+=-r[0]*force;
           }
         } else {
           f.z+=force;
           if (vflag>0) {
-            virial[2]+=r[2]*force;
-            virial[4]+=r[0]*force;
-            virial[5]+=r[1]*force;
+            virial[2]+=-r[2]*force;
+            virial[4]+=-r[0]*force;
+            virial[5]+=-r[1]*force;
           }
         }
       }
diff --git a/lib/gpu/re_squared.h b/lib/gpu/re_squared.h
index bebc12f204..85fc738d07 100644
--- a/lib/gpu/re_squared.h
+++ b/lib/gpu/re_squared.h
@@ -40,14 +40,13 @@ class RESquared : public BaseEllipsoid<numtyp, acctyp> {
     * - -3 if there is an out of memory error
     * - -4 if the GPU library was not compiled for GPU
     * - -5 Double precision is not supported on card **/
-  int init(const int ntypes, const double gamma,
-           const double upsilon, const double mu, double **host_shape,
-           double **host_well, double **host_cutsq, double **host_sigma, 
-           double **host_epsilon, double *host_lshape, int **h_form,
-           double **host_lj1, double **host_lj2, double **host_lj3, 
-           double **host_lj4, double **host_offset, 
-           const double *host_special_lj, const int nlocal, const int nall, 
-           const int max_nbors, const int maxspecial, const double cell_size,
+  int init(const int ntypes, double **host_shape, double **host_well,
+           double **host_cutsq, double **host_sigma,  double **host_epsilon,
+           double *host_lshape, int **h_form, double **host_lj1,
+           double **host_lj2, double **host_lj3, double **host_lj4,
+           double **host_offset, const double *host_special_lj,
+           const int nlocal, const int nall, const int max_nbors,
+           const int maxspecial, const double cell_size,
            const double gpu_split, FILE *screen);
 
   /// Clear all host and device data
diff --git a/lib/gpu/re_squared_ext.cpp b/lib/gpu/re_squared_ext.cpp
index f573496004..38a6acfaf4 100644
--- a/lib/gpu/re_squared_ext.cpp
+++ b/lib/gpu/re_squared_ext.cpp
@@ -27,15 +27,13 @@ static RESquared<PRECISION,ACC_PRECISION> REMF;
 // ---------------------------------------------------------------------------
 // Allocate memory on host and device and copy constants to device
 // ---------------------------------------------------------------------------
-int re_gpu_init(const int ntypes, const double gamma,
-                const double upsilon, const double mu, double **shape,
-                double **well, double **cutsq, double **sigma,
-                double **epsilon, double *host_lshape, int **form,
-                double **host_lj1, double **host_lj2, double **host_lj3,
-                double **host_lj4, double **offset, double *special_lj,
-                const int inum, const int nall, const int max_nbors, 
-                const int maxspecial, const double cell_size, int &gpu_mode,
-                FILE *screen) {
+int re_gpu_init(const int ntypes, double **shape, double **well, double **cutsq,
+                double **sigma, double **epsilon, double *host_lshape,
+                int **form, double **host_lj1, double **host_lj2, 
+                double **host_lj3, double **host_lj4, double **offset,
+                double *special_lj, const int inum, const int nall,
+                const int max_nbors, const int maxspecial,
+                const double cell_size, int &gpu_mode, FILE *screen) {
   REMF.clear();
   gpu_mode=REMF.device->gpu_mode();
   double gpu_split=REMF.device->particle_split();
@@ -45,7 +43,7 @@ int re_gpu_init(const int ntypes, const double gamma,
   int gpu_rank=REMF.device->gpu_rank();
   int procs_per_gpu=REMF.device->procs_per_gpu();
 
-  REMF.device->init_message(screen,"gayberne",first_gpu,last_gpu);
+  REMF.device->init_message(screen,"resquared",first_gpu,last_gpu);
 
   bool message=false;
   if (REMF.device->replica_me()==0 && screen)
@@ -58,11 +56,10 @@ int re_gpu_init(const int ntypes, const double gamma,
 
   int init_ok=0;
   if (world_me==0)
-    init_ok=REMF.init(ntypes, gamma, upsilon, mu, shape, well, cutsq, 
-                      sigma, epsilon, host_lshape, form, host_lj1, 
-                      host_lj2, host_lj3, host_lj4, offset, special_lj, 
-                      inum, nall, max_nbors, maxspecial, cell_size, gpu_split,
-                      screen);
+    init_ok=REMF.init(ntypes, shape, well, cutsq, sigma, epsilon, host_lshape,
+                      form, host_lj1, host_lj2, host_lj3, host_lj4, offset,
+                      special_lj, inum, nall, max_nbors, maxspecial, cell_size,
+                      gpu_split, screen);
 
   REMF.device->world_barrier();
   if (message)
@@ -78,10 +75,10 @@ int re_gpu_init(const int ntypes, const double gamma,
       fflush(screen);
     }
     if (gpu_rank==i && world_me!=0)
-      init_ok=REMF.init(ntypes, gamma, upsilon, mu, shape, well, cutsq,  sigma,
-                        epsilon, host_lshape, form, host_lj1, host_lj2,
-                        host_lj3, host_lj4, offset, special_lj,  inum, nall,
-                        max_nbors, maxspecial, cell_size, gpu_split,  screen);
+      init_ok=REMF.init(ntypes, shape, well, cutsq,  sigma, epsilon, 
+                        host_lshape, form, host_lj1, host_lj2, host_lj3,
+                        host_lj4, offset, special_lj,  inum, nall,
+                        max_nbors, maxspecial, cell_size, gpu_split, screen);
 
     REMF.device->gpu_barrier();
     if (message) 
diff --git a/src/ASPHERE/pair_resquared.h b/src/ASPHERE/pair_resquared.h
index 6aa2273c45..f3e1dcc58f 100755
--- a/src/ASPHERE/pair_resquared.h
+++ b/src/ASPHERE/pair_resquared.h
@@ -38,7 +38,7 @@ class PairRESquared : public Pair {
   void write_restart_settings(FILE *);
   void read_restart_settings(FILE *);
 
- private:
+ protected:
   double cut_global;
   double **cut;
 
diff --git a/src/GPU/Install.sh b/src/GPU/Install.sh
index dd5d7add03..202b4c956d 100644
--- a/src/GPU/Install.sh
+++ b/src/GPU/Install.sh
@@ -26,6 +26,8 @@ if (test $1 = 1) then
   if (test -e ../pair_gayberne.cpp) then
     cp pair_gayberne_gpu.cpp ..
     cp pair_gayberne_gpu.h ..
+    cp pair_resquared_gpu.cpp ..
+    cp pair_resquared_gpu.h ..
   fi
   
   if (test -e ../pair_lj_cut_coul_long.cpp) then
@@ -83,6 +85,7 @@ elif (test $1 = 0) then
   rm ../pppm_gpu_single.cpp
   rm ../pppm_gpu_double.cpp
   rm ../pair_gayberne_gpu.cpp
+  rm ../pair_resquared_gpu.cpp
   rm ../pair_lj_cut_gpu.cpp
   rm ../pair_morse_gpu.cpp
   rm ../pair_lj96_cut_gpu.cpp
@@ -102,6 +105,7 @@ elif (test $1 = 0) then
   rm ../pppm_gpu_single.h
   rm ../pppm_gpu_double.h
   rm ../pair_gayberne_gpu.h
+  rm ../pair_resquared_gpu.h
   rm ../pair_lj_cut_gpu.h
   rm ../pair_morse_gpu.h
   rm ../pair_lj96_cut_gpu.h
diff --git a/src/GPU/pair_gayberne_gpu.h b/src/GPU/pair_gayberne_gpu.h
index 76d3b516a9..9ecae3c939 100644
--- a/src/GPU/pair_gayberne_gpu.h
+++ b/src/GPU/pair_gayberne_gpu.h
@@ -17,11 +17,10 @@ PairStyle(gayberne/gpu,PairGayBerneGPU)
 
 #else
 
-#ifndef LMP_PAIR_GPU_H
-#define LMP_PAIR_GPU_H
+#ifndef LMP_PAIR_GAYBERNE_GPU_H
+#define LMP_PAIR_GAYBERNE_GPU_H
 
 #include "pair_gayberne.h"
-#define MAX_GPU_THREADS 1
 
 namespace LAMMPS_NS {
 
diff --git a/src/GPU/pair_resquared_gpu.cpp b/src/GPU/pair_resquared_gpu.cpp
new file mode 100644
index 0000000000..e8896d6dd9
--- /dev/null
+++ b/src/GPU/pair_resquared_gpu.cpp
@@ -0,0 +1,325 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under 
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Mike Brown (SNL)
+------------------------------------------------------------------------- */
+
+#include "lmptype.h"
+#include "math.h"
+#include "stdio.h"
+#include "stdlib.h"
+#include "pair_resquared_gpu.h"
+#include "math_extra.h"
+#include "atom.h"
+#include "atom_vec.h"
+#include "atom_vec_ellipsoid.h"
+#include "comm.h"
+#include "force.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+#include "integrate.h"
+#include "memory.h"
+#include "error.h"
+#include "neigh_request.h"
+#include "universe.h"
+#include "domain.h"
+#include "update.h"
+#include "string.h"
+#include "gpu_extra.h"
+
+#define MIN(a,b) ((a) < (b) ? (a) : (b))
+#define MAX(a,b) ((a) > (b) ? (a) : (b))
+
+// External functions from cuda library for atom decomposition
+
+int re_gpu_init(const int ntypes, double **shape, double **well,
+		double **cutsq, double **sigma, double **epsilon,
+		double *host_lshape, int **form, double **host_lj1,
+		double **host_lj2, double **host_lj3, double **host_lj4,
+		double **offset, double *special_lj, const int nlocal,
+		const int nall,	const int max_nbors, const int maxspecial,
+		const double cell_size,	int &gpu_mode, FILE *screen);
+void re_gpu_clear();
+int ** re_gpu_compute_n(const int ago, const int inum, const int nall,
+			double **host_x, int *host_type, double *sublo,
+			double *subhi, int *tag, int **nspecial, int **special,
+			const bool eflag, const bool vflag,
+			const bool eatom, const bool vatom, int &host_start,
+			int **ilist, int **jnum, const double cpu_time,
+			bool &success, double **host_quat);
+int * re_gpu_compute(const int ago, const int inum, const int nall,
+		     double **host_x, int *host_type, int *ilist, int *numj,
+		     int **firstneigh, const bool eflag, const bool vflag,
+		     const bool eatom, const bool vatom, int &host_start,
+		     const double cpu_time, bool &success, double **host_quat);
+double re_gpu_bytes();
+
+using namespace LAMMPS_NS;
+
+enum{SPHERE_SPHERE,SPHERE_ELLIPSE,ELLIPSE_SPHERE,ELLIPSE_ELLIPSE};
+
+/* ---------------------------------------------------------------------- */
+
+PairRESquaredGPU::PairRESquaredGPU(LAMMPS *lmp) : PairRESquared(lmp),
+                                                gpu_mode(GPU_PAIR)
+{
+  avec = (AtomVecEllipsoid *) atom->style_match("ellipsoid");
+  if (!avec) 
+    error->all("Pair gayberne requires atom style ellipsoid");
+  quat_nmax = 0;
+  quat = NULL;
+}
+
+/* ----------------------------------------------------------------------
+   free all arrays
+------------------------------------------------------------------------- */
+
+PairRESquaredGPU::~PairRESquaredGPU()
+{
+  re_gpu_clear();
+  cpu_time = 0.0;
+  memory->destroy(quat);
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairRESquaredGPU::compute(int eflag, int vflag)
+{
+  if (eflag || vflag) ev_setup(eflag,vflag);
+  else evflag = vflag_fdotr = 0;
+
+  int nall = atom->nlocal + atom->nghost;
+  int inum, host_start;
+
+  bool success = true;
+  int *ilist, *numneigh, **firstneigh;  
+
+  if (nall > quat_nmax) {
+    quat_nmax = static_cast<int>(1.1 * nall);
+    memory->grow(quat, quat_nmax, 4, "pair:quat");
+  }
+  AtomVecEllipsoid::Bonus *bonus = avec->bonus;
+  int *ellipsoid = atom->ellipsoid;
+  for (int i=0; i<nall; i++) {
+    int qi = ellipsoid[i];
+    if (qi > -1) {
+      quat[i][0] = bonus[qi].quat[0];
+      quat[i][1] = bonus[qi].quat[1];
+      quat[i][2] = bonus[qi].quat[2];
+      quat[i][3] = bonus[qi].quat[3];
+    }
+  }
+
+  if (gpu_mode == GPU_NEIGH) {
+    inum = atom->nlocal;
+    firstneigh = re_gpu_compute_n(neighbor->ago, inum, nall, atom->x,
+				  atom->type, domain->sublo, domain->subhi,
+				  atom->tag, atom->nspecial, atom->special,
+				  eflag, vflag, eflag_atom, vflag_atom,
+				  host_start, &ilist, &numneigh, cpu_time,
+				  success, quat);
+  } else {
+    inum = list->inum;
+    ilist = list->ilist;
+    numneigh = list->numneigh;
+    firstneigh = list->firstneigh;
+    olist = re_gpu_compute(neighbor->ago, inum, nall, atom->x, atom->type,
+			   ilist, numneigh, firstneigh, eflag, vflag,
+			   eflag_atom, vflag_atom, host_start,
+			   cpu_time, success, quat);
+  }
+  if (!success)
+    error->one("Out of memory on GPGPU");
+
+  if (host_start < inum) {
+    cpu_time = MPI_Wtime();
+    cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh);
+    cpu_time = MPI_Wtime() - cpu_time;
+  }
+}
+
+/* ----------------------------------------------------------------------
+   init specific to this pair style
+------------------------------------------------------------------------- */
+
+void PairRESquaredGPU::init_style()
+{
+  if (force->newton_pair) 
+    error->all("Cannot use newton pair with GPU RESquared pair style");
+  if (!atom->ellipsoid_flag)
+    error->all("Pair resquared requires atom style ellipsoid");
+
+  // per-type shape precalculations
+  // require that atom shapes are identical within each type
+  // if shape = 0 for point particle, set shape = 1 as required by Gay-Berne
+
+  for (int i = 1; i <= atom->ntypes; i++) {
+    if (!atom->shape_consistency(i,shape1[i][0],shape1[i][1],shape1[i][2]))
+      error->all("Pair resquared requires atoms with same type have same shape");
+    if (shape1[i][0] == 0.0)
+      shape1[i][0] = shape1[i][1] = shape1[i][2] = 1.0;
+    shape2[i][0] = shape1[i][0]*shape1[i][0];
+    shape2[i][1] = shape1[i][1]*shape1[i][1];
+    shape2[i][2] = shape1[i][2]*shape1[i][2];
+    lshape[i] = shape1[i][0]*shape1[i][1]*shape1[i][2];
+  }
+
+  // Repeat cutsq calculation because done after call to init_style
+  double maxcut = -1.0;
+  double cut;
+  for (int i = 1; i <= atom->ntypes; i++) {
+    for (int j = i; j <= atom->ntypes; j++) {
+      if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) {
+        cut = init_one(i,j);
+        cut *= cut;
+        if (cut > maxcut)
+          maxcut = cut;
+        cutsq[i][j] = cutsq[j][i] = cut;
+      } else
+        cutsq[i][j] = cutsq[j][i] = 0.0;
+    }
+  }
+
+  double cell_size = sqrt(maxcut) + neighbor->skin;
+
+  int maxspecial=0;
+  if (atom->molecular)
+    maxspecial=atom->maxspecial;
+  int success = re_gpu_init(atom->ntypes+1, shape1, well, cutsq, sigma,
+			    epsilon, lshape, form, lj1, lj2, lj3, lj4, offset,
+			    force->special_lj, atom->nlocal, 
+			    atom->nlocal+atom->nghost, 300, maxspecial,
+			    cell_size, gpu_mode, screen);
+  GPU_EXTRA::check_flag(success,error,world);
+
+  if (gpu_mode != GPU_NEIGH) {
+    int irequest = neighbor->request(this);
+    neighbor->requests[irequest]->half = 0;
+    neighbor->requests[irequest]->full = 1;
+  }
+  quat_nmax = static_cast<int>(1.1 * (atom->nlocal + atom->nghost));
+  memory->grow(quat, quat_nmax, 4, "pair:quat");
+}
+
+/* ---------------------------------------------------------------------- */
+
+double PairRESquaredGPU::memory_usage()
+{
+  double bytes = Pair::memory_usage();
+  return bytes + memory->usage(quat,quat_nmax)+re_gpu_bytes();
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairRESquaredGPU::cpu_compute(int start, int inum, int eflag, int vflag,
+				  int *ilist, int *numneigh, int **firstneigh)
+{
+  int i,j,ii,jj,jnum,itype,jtype;
+  double evdwl,one_eng,rsq,r2inv,r6inv,forcelj,factor_lj;
+  double fforce[3],ttor[3],rtor[3],r12[3];
+  int *jlist;
+  RE2Vars wi,wj;
+
+  evdwl = 0.0;
+  if (eflag || vflag) ev_setup(eflag,vflag);
+  else evflag = vflag_fdotr = 0;
+
+  double **x = atom->x;
+  double **f = atom->f;
+  double **tor = atom->torque;
+  int *type = atom->type;
+  int nlocal = atom->nlocal;
+  double *special_lj = force->special_lj;
+
+  // loop over neighbors of my atoms
+
+  for (ii = start; ii < inum; ii++) {
+    i = ilist[ii];
+    itype = type[i];
+
+    // not a LJ sphere
+
+    if (lshape[itype] != 0.0) precompute_i(i,wi);
+
+    jlist = firstneigh[i];
+    jnum = numneigh[i];
+
+    for (jj = 0; jj < jnum; jj++) {
+      j = jlist[jj];
+      factor_lj = special_lj[sbmask(j)];
+      j &= NEIGHMASK;
+
+      // r12 = center to center vector
+
+      r12[0] = x[j][0]-x[i][0];
+      r12[1] = x[j][1]-x[i][1];
+      r12[2] = x[j][2]-x[i][2];
+      rsq = MathExtra::dot3(r12,r12);
+      jtype = type[j];
+
+      // compute if less than cutoff
+
+      if (rsq < cutsq[itype][jtype]) {
+        switch (form[itype][jtype]) {
+
+         case SPHERE_SPHERE:
+          r2inv = 1.0/rsq;
+          r6inv = r2inv*r2inv*r2inv;
+          forcelj = r6inv * (lj1[itype][jtype]*r6inv - lj2[itype][jtype]);
+          forcelj *= -r2inv;
+          if (eflag) one_eng =
+              r6inv*(r6inv*lj3[itype][jtype]-lj4[itype][jtype]) -
+              offset[itype][jtype];
+          fforce[0] = r12[0]*forcelj;
+          fforce[1] = r12[1]*forcelj;
+          fforce[2] = r12[2]*forcelj;
+          break;
+
+         case SPHERE_ELLIPSE:
+          precompute_i(j,wj);
+          one_eng = resquared_lj(j,i,wj,r12,rsq,fforce,rtor,false);
+          break;
+
+         case ELLIPSE_SPHERE:
+          one_eng = resquared_lj(i,j,wi,r12,rsq,fforce,ttor,true);
+          tor[i][0] += ttor[0]*factor_lj;
+          tor[i][1] += ttor[1]*factor_lj;
+          tor[i][2] += ttor[2]*factor_lj;
+          break;
+
+         default:
+          precompute_i(j,wj);
+          one_eng = resquared_analytic(i,j,wi,wj,r12,rsq,fforce,ttor,rtor);
+          tor[i][0] += ttor[0]*factor_lj;
+          tor[i][1] += ttor[1]*factor_lj;
+          tor[i][2] += ttor[2]*factor_lj;
+
+         break;
+        }
+
+        fforce[0] *= factor_lj;
+        fforce[1] *= factor_lj;
+        fforce[2] *= factor_lj;
+        f[i][0] += fforce[0];
+        f[i][1] += fforce[1];
+        f[i][2] += fforce[2];
+
+        if (eflag) evdwl = factor_lj*one_eng;
+
+	if (evflag) ev_tally_xyz_full(i,evdwl,0.0,fforce[0],fforce[1],
+				      fforce[2],-r12[0],-r12[1],-r12[2]);
+      }
+    }
+  }
+}
diff --git a/src/GPU/pair_resquared_gpu.h b/src/GPU/pair_resquared_gpu.h
new file mode 100644
index 0000000000..0fd24ae6dc
--- /dev/null
+++ b/src/GPU/pair_resquared_gpu.h
@@ -0,0 +1,49 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under 
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(resquared/gpu,PairRESquaredGPU)
+
+#else
+
+#ifndef LMP_PAIR_RESQUARED_GPU_H
+#define LMP_PAIR_RESQUARED_GPU_H
+
+#include "pair_resquared.h"
+
+namespace LAMMPS_NS {
+
+class PairRESquaredGPU : public PairRESquared {
+ public:
+  PairRESquaredGPU(LAMMPS *lmp);
+  ~PairRESquaredGPU();
+  void cpu_compute(int, int, int, int, int *, int *, int **);
+  void compute(int, int);
+  void init_style();
+  double memory_usage();
+
+  enum { GPU_PAIR, GPU_NEIGH };
+
+ private:
+  int *olist;
+  int gpu_mode;
+  double cpu_time;
+  int *gpulist;
+  int quat_nmax;
+  double **quat;
+};
+
+}
+#endif
+#endif