git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@6136 f3b2605a-c512-4ea7-a41b-209d697bcdaa

2011-05-18 20:03:24 +00:00
parent 38acfd9fad
commit 14a348470c
18 changed files with 337 additions and 305 deletions
--- a/lib/gpu/Makefile.linux
+++ b/lib/gpu/Makefile.linux
@ -20,7 +20,11 @@
 CUDA_HOME = /usr/local/cuda
 NVCC = nvcc
 # newer CUDA
 CUDA_ARCH = -arch=sm_13
 # older CUDA
 #CUDA_ARCH = -arch=sm_10 -DCUDA_PRE_THREE
 CUDA_PRECISION = -D_SINGLE_SINGLE
 CUDA_INCLUDE = -I$(CUDA_HOME)/include
 CUDA_LIB = -L$(CUDA_HOME)/lib64
--- a/lib/gpu/README
+++ b/lib/gpu/README
@ -33,13 +33,17 @@ NOTE: Installation of the CUDA SDK is not required.
 Current pair styles supporting GPU acceleration:
-  1. lj/cut/gpu
+  1. lj/cut
-  2. lj/cut/coul/cut/gpu
+  2. lj96/cut
-  3. lj/cut/coul/long/gpu
+  3. lj/expand
-  4. lj96/cut/gpu
+  4. lj/cut/coul/cut
-  5. gayberne/gpu
+  5. lj/cut/coul/long
-  6. cmm/cg/gpu
+  6. lj/charmm/coul/long
-  7. cmm/cg/coul/long/gpu
+  7. morse
  8. cg/cmm
  9. cg/cmm/coul/long
 10. gayberne
 11. pppm
                     MULTIPLE LAMMPS PROCESSES
@ -52,12 +56,12 @@ LAMMPS user manual for details on running with GPU acceleration.
                    BUILDING AND PRECISION MODES
-To build, edit the CUDA_ARCH, CUDA_PRECISION, CUDA_HOME, NVCC, CUDA_INCLUD,
+To build, edit the CUDA_ARCH, CUDA_PRECISION, CUDA_HOME variables in one of 
-CUDA_LIB and CUDA_OPTS variables in one of the Makefiles. CUDA_ARCH should
+the Makefiles. CUDA_ARCH should be set based on the compute capability of 
-be set based on the compute capability of your GPU. This can be verified by
+your GPU. This can be verified by running the nvc_get_devices executable after
-running the nvc_get_devices executable after the build is complete.
+the build is complete. Additionally, the GPU package must be installed and
-Additionally, the GPU package must be installed and compiled for LAMMPS.
+compiled for LAMMPS. This may require editing the gpu_SYSPATH variable in the
-This may require editing the gpu_SYSPATH variable in the LAMMPS makefile.
+LAMMPS makefile.
 Please note that the GPU library accesses the CUDA driver library directly,
 so it needs to be linked not only to the CUDA runtime library (libcudart.so)
@ -74,6 +78,10 @@ the CUDA_PRECISION variable:
  CUDA_PREC = -D_DOUBLE_DOUBLE  # Double precision for all calculations
  CUDA_PREC = -D_SINGLE_DOUBLE  # Accumulation of forces, etc. in double
 NOTE: PPPM acceleration can only be run on GPUs with compute capability>=1.1.
      You will get the error "GPU library not compiled for this accelerator."
      when attempting to run PPPM on a GPU with compute capability 1.0.
 NOTE: Double precision is only supported on certain GPUs (with
      compute capability>=1.3).
@ -83,15 +91,17 @@ NOTE: For Tesla and other graphics cards with compute capability>=1.3,
 NOTE: For Fermi, make sure that -arch=sm_20 is set on the CUDA_ARCH line.
 NOTE: The gayberne/gpu pair style will only be installed if the ASPHERE
-      package has been installed before installing the GPU package in LAMMPS.
+      package has been installed.
 NOTE: The cg/cmm/gpu and cg/cmm/coul/long/gpu pair styles will only be
-      installed if the USER-CG-CMM package has been installed before
+      installed if the USER-CG-CMM package has been installed.
      installing the GPU package in LAMMPS.
-NOTE: The lj/cut/coul/long/gpu and cg/cmm/coul/long/gpu style will only be 
+NOTE: The lj/cut/coul/long/gpu, cg/cmm/coul/long/gpu, pppm/gpu/single, and
-      installed if the KSPACE package has been installed before installing
+      pppm/gpu/double styles will only be installed if the KSPACE package has
-      the GPU package in LAMMPS.
+      been installed.
 NOTE: The lj/charmm/coul/long will only be installed if the MOLECULE package
      has been installed.
                      EXAMPLE BUILD PROCESS
@ -105,7 +115,3 @@ make yes-asphere
 make yes-kspace
 make yes-gpu
 make linux
 ------------------------------------------------------------------------
 Last merge with gpulammps: r561 on 2010-11-12
 ------------------------------------------------------------------------
--- a/lib/gpu/cmm_cut_gpu_kernel.cu
+++ b/lib/gpu/cmm_cut_gpu_kernel.cu
@ -18,30 +18,6 @@
 #ifndef CMM_GPU_KERNEL
 #define CMM_GPU_KERNEL
 #ifdef _DOUBLE_DOUBLE
 #define numtyp double
 #define numtyp2 double2
 #define numtyp4 double4
 #define acctyp double
 #define acctyp4 double4
 #endif
 #ifdef _SINGLE_DOUBLE
 #define numtyp float
 #define numtyp2 float2
 #define numtyp4 float4
 #define acctyp double
 #define acctyp4 double4
 #endif
 #ifndef numtyp
 #define numtyp float
 #define numtyp2 float2
 #define numtyp4 float4
 #define acctyp float
 #define acctyp4 float4
 #endif
 #ifdef NV_KERNEL
 #include "nv_kernel_def.h"
@ -75,6 +51,30 @@ __inline float4 fetch_pos(const int& i, const float4 *pos)
 #endif
 #ifdef _DOUBLE_DOUBLE
 #define numtyp double
 #define numtyp2 double2
 #define numtyp4 double4
 #define acctyp double
 #define acctyp4 double4
 #endif
 #ifdef _SINGLE_DOUBLE
 #define numtyp float
 #define numtyp2 float2
 #define numtyp4 float4
 #define acctyp double
 #define acctyp4 double4
 #endif
 #ifndef numtyp
 #define numtyp float
 #define numtyp2 float2
 #define numtyp4 float4
 #define acctyp float
 #define acctyp4 float4
 #endif
 #define SBBITS 30
 #define NEIGHMASK 0x3FFFFFFF
 __inline int sbmask(int j) { return j >> SBBITS & 3; }
--- a/lib/gpu/cmmc_long_gpu_kernel.cu
+++ b/lib/gpu/cmmc_long_gpu_kernel.cu
@ -18,38 +18,6 @@
 #ifndef CMML_GPU_KERNEL
 #define CMML_GPU_KERNEL
 #ifdef _DOUBLE_DOUBLE
 #define numtyp double
 #define numtyp2 double2
 #define numtyp4 double4
 #define acctyp double
 #define acctyp4 double4
 #endif
 #ifdef _SINGLE_DOUBLE
 #define numtyp float
 #define numtyp2 float2
 #define numtyp4 float4
 #define acctyp double
 #define acctyp4 double4
 #endif
 #ifndef numtyp
 #define numtyp float
 #define numtyp2 float2
 #define numtyp4 float4
 #define acctyp float
 #define acctyp4 float4
 #endif
 #define EWALD_F (numtyp)1.12837917
 #define EWALD_P (numtyp)0.3275911
 #define A1 (numtyp)0.254829592
 #define A2 (numtyp)-0.284496736
 #define A3 (numtyp)1.421413741
 #define A4 (numtyp)-1.453152027
 #define A5 (numtyp)1.061405429
 #ifdef NV_KERNEL
 #include "nv_kernel_def.h"
@ -93,6 +61,38 @@ __inline float fetch_q(const int& i, const float *q)
 #endif
 #ifdef _DOUBLE_DOUBLE
 #define numtyp double
 #define numtyp2 double2
 #define numtyp4 double4
 #define acctyp double
 #define acctyp4 double4
 #endif
 #ifdef _SINGLE_DOUBLE
 #define numtyp float
 #define numtyp2 float2
 #define numtyp4 float4
 #define acctyp double
 #define acctyp4 double4
 #endif
 #ifndef numtyp
 #define numtyp float
 #define numtyp2 float2
 #define numtyp4 float4
 #define acctyp float
 #define acctyp4 float4
 #endif
 #define EWALD_F (numtyp)1.12837917
 #define EWALD_P (numtyp)0.3275911
 #define A1 (numtyp)0.254829592
 #define A2 (numtyp)-0.284496736
 #define A3 (numtyp)1.421413741
 #define A4 (numtyp)-1.453152027
 #define A5 (numtyp)1.061405429
 #define SBBITS 30
 #define NEIGHMASK 0x3FFFFFFF
 __inline int sbmask(int j) { return j >> SBBITS & 3; }
--- a/lib/gpu/crml_gpu_kernel.cu
+++ b/lib/gpu/crml_gpu_kernel.cu
@ -18,40 +18,6 @@
 #ifndef CRML_GPU_KERNEL
 #define CRML_GPU_KERNEL
 #define MAX_BIO_SHARED_TYPES 128
 #ifdef _DOUBLE_DOUBLE
 #define numtyp double
 #define numtyp2 double2
 #define numtyp4 double4
 #define acctyp double
 #define acctyp4 double4
 #endif
 #ifdef _SINGLE_DOUBLE
 #define numtyp float
 #define numtyp2 float2
 #define numtyp4 float4
 #define acctyp double
 #define acctyp4 double4
 #endif
 #ifndef numtyp
 #define numtyp float
 #define numtyp2 float2
 #define numtyp4 float4
 #define acctyp float
 #define acctyp4 float4
 #endif
 #define EWALD_F (numtyp)1.12837917
 #define EWALD_P (numtyp)0.3275911
 #define A1 (numtyp)0.254829592
 #define A2 (numtyp)-0.284496736
 #define A3 (numtyp)1.421413741
 #define A4 (numtyp)-1.453152027
 #define A5 (numtyp)1.061405429
 #ifdef NV_KERNEL
 #include "nv_kernel_def.h"
@ -94,6 +60,40 @@ __inline float fetch_q(const int& i, const float *q)
 #endif
 #define MAX_BIO_SHARED_TYPES 128
 #ifdef _DOUBLE_DOUBLE
 #define numtyp double
 #define numtyp2 double2
 #define numtyp4 double4
 #define acctyp double
 #define acctyp4 double4
 #endif
 #ifdef _SINGLE_DOUBLE
 #define numtyp float
 #define numtyp2 float2
 #define numtyp4 float4
 #define acctyp double
 #define acctyp4 double4
 #endif
 #ifndef numtyp
 #define numtyp float
 #define numtyp2 float2
 #define numtyp4 float4
 #define acctyp float
 #define acctyp4 float4
 #endif
 #define EWALD_F (numtyp)1.12837917
 #define EWALD_P (numtyp)0.3275911
 #define A1 (numtyp)0.254829592
 #define A2 (numtyp)-0.284496736
 #define A3 (numtyp)1.421413741
 #define A4 (numtyp)-1.453152027
 #define A5 (numtyp)1.061405429
 #define SBBITS 30
 #define NEIGHMASK 0x3FFFFFFF
 __inline int sbmask(int j) { return j >> SBBITS & 3; }
--- a/lib/gpu/gb_gpu_kernel_nbor.cu
+++ b/lib/gpu/gb_gpu_kernel_nbor.cu
@ -18,16 +18,6 @@
 #ifndef PAIR_GPU_KERNEL_H
 #define PAIR_GPU_KERNEL_H
 #ifdef _DOUBLE_DOUBLE
 #define numtyp double
 #define numtyp2 double2
 #define numtyp4 double4
 #else
 #define numtyp float
 #define numtyp2 float2
 #define numtyp4 float4
 #endif
 #ifdef NV_KERNEL
 #include "nv_kernel_def.h"
@ -44,6 +34,16 @@
 #endif
 #ifdef _DOUBLE_DOUBLE
 #define numtyp double
 #define numtyp2 double2
 #define numtyp4 double4
 #else
 #define numtyp float
 #define numtyp2 float2
 #define numtyp4 float4
 #endif
 // ---------------------------------------------------------------------------
 // Unpack neighbors from dev_ij array into dev_nbor matrix for coalesced access
 // -- Only unpack neighbors matching the specified inclusive range of forms
--- a/lib/gpu/geryon/ucl_nv_kernel.h
+++ b/lib/gpu/geryon/ucl_nv_kernel.h
@ -33,6 +33,14 @@
 #define MEM_THREADS 32
 #endif
 #ifdef CUDA_PRE_THREE
 struct __builtin_align__(16) _double4
 {
  double x, y, z, w;
 };
 typedef struct _double4 double4;
 #endif
 #define GLOBAL_ID_X threadIdx.x+mul24(blockIdx.x,blockDim.x)
 #define GLOBAL_ID_Y threadIdx.y+mul24(blockIdx.y,blockDim.y)
 #define GLOBAL_SIZE_X mul24(gridDim.x,blockDim.x);
--- a/lib/gpu/lj96_cut_gpu_kernel.cu
+++ b/lib/gpu/lj96_cut_gpu_kernel.cu
@ -18,30 +18,6 @@
 #ifndef LJ96_GPU_KERNEL
 #define LJ96_GPU_KERNEL
 #ifdef _DOUBLE_DOUBLE
 #define numtyp double
 #define numtyp2 double2
 #define numtyp4 double4
 #define acctyp double
 #define acctyp4 double4
 #endif
 #ifdef _SINGLE_DOUBLE
 #define numtyp float
 #define numtyp2 float2
 #define numtyp4 float4
 #define acctyp double
 #define acctyp4 double4
 #endif
 #ifndef numtyp
 #define numtyp float
 #define numtyp2 float2
 #define numtyp4 float4
 #define acctyp float
 #define acctyp4 float4
 #endif
 #ifdef NV_KERNEL
 #include "nv_kernel_def.h"
@ -75,6 +51,30 @@ __inline float4 fetch_pos(const int& i, const float4 *pos)
 #endif
 #ifdef _DOUBLE_DOUBLE
 #define numtyp double
 #define numtyp2 double2
 #define numtyp4 double4
 #define acctyp double
 #define acctyp4 double4
 #endif
 #ifdef _SINGLE_DOUBLE
 #define numtyp float
 #define numtyp2 float2
 #define numtyp4 float4
 #define acctyp double
 #define acctyp4 double4
 #endif
 #ifndef numtyp
 #define numtyp float
 #define numtyp2 float2
 #define numtyp4 float4
 #define acctyp float
 #define acctyp4 float4
 #endif
 #define SBBITS 30
 #define NEIGHMASK 0x3FFFFFFF
 __inline int sbmask(int j) { return j >> SBBITS & 3; }
--- a/lib/gpu/lj_cut_gpu_kernel.cu
+++ b/lib/gpu/lj_cut_gpu_kernel.cu
@ -18,30 +18,6 @@
 #ifndef LJ_GPU_KERNEL
 #define LJ_GPU_KERNEL
 #ifdef _DOUBLE_DOUBLE
 #define numtyp double
 #define numtyp2 double2
 #define numtyp4 double4
 #define acctyp double
 #define acctyp4 double4
 #endif
 #ifdef _SINGLE_DOUBLE
 #define numtyp float
 #define numtyp2 float2
 #define numtyp4 float4
 #define acctyp double
 #define acctyp4 double4
 #endif
 #ifndef numtyp
 #define numtyp float
 #define numtyp2 float2
 #define numtyp4 float4
 #define acctyp float
 #define acctyp4 float4
 #endif
 #ifdef NV_KERNEL
 #include "nv_kernel_def.h"
@ -75,6 +51,30 @@ __inline float4 fetch_pos(const int& i, const float4 *pos)
 #endif
 #ifdef _DOUBLE_DOUBLE
 #define numtyp double
 #define numtyp2 double2
 #define numtyp4 double4
 #define acctyp double
 #define acctyp4 double4
 #endif
 #ifdef _SINGLE_DOUBLE
 #define numtyp float
 #define numtyp2 float2
 #define numtyp4 float4
 #define acctyp double
 #define acctyp4 double4
 #endif
 #ifndef numtyp
 #define numtyp float
 #define numtyp2 float2
 #define numtyp4 float4
 #define acctyp float
 #define acctyp4 float4
 #endif
 #define SBBITS 30
 #define NEIGHMASK 0x3FFFFFFF
 __inline int sbmask(int j) { return j >> SBBITS & 3; }
--- a/lib/gpu/lj_expand_gpu_kernel.cu
+++ b/lib/gpu/lj_expand_gpu_kernel.cu
@ -18,30 +18,6 @@
 #ifndef LJE_GPU_KERNEL
 #define LJE_GPU_KERNEL
 #ifdef _DOUBLE_DOUBLE
 #define numtyp double
 #define numtyp2 double2
 #define numtyp4 double4
 #define acctyp double
 #define acctyp4 double4
 #endif
 #ifdef _SINGLE_DOUBLE
 #define numtyp float
 #define numtyp2 float2
 #define numtyp4 float4
 #define acctyp double
 #define acctyp4 double4
 #endif
 #ifndef numtyp
 #define numtyp float
 #define numtyp2 float2
 #define numtyp4 float4
 #define acctyp float
 #define acctyp4 float4
 #endif
 #ifdef NV_KERNEL
 #include "nv_kernel_def.h"
@ -75,6 +51,30 @@ __inline float4 fetch_pos(const int& i, const float4 *pos)
 #endif
 #ifdef _DOUBLE_DOUBLE
 #define numtyp double
 #define numtyp2 double2
 #define numtyp4 double4
 #define acctyp double
 #define acctyp4 double4
 #endif
 #ifdef _SINGLE_DOUBLE
 #define numtyp float
 #define numtyp2 float2
 #define numtyp4 float4
 #define acctyp double
 #define acctyp4 double4
 #endif
 #ifndef numtyp
 #define numtyp float
 #define numtyp2 float2
 #define numtyp4 float4
 #define acctyp float
 #define acctyp4 float4
 #endif
 #define SBBITS 30
 #define NEIGHMASK 0x3FFFFFFF
 __inline int sbmask(int j) { return j >> SBBITS & 3; }
--- a/lib/gpu/ljc_cut_gpu_kernel.cu
+++ b/lib/gpu/ljc_cut_gpu_kernel.cu
@ -18,30 +18,6 @@
 #ifndef LJC_GPU_KERNEL
 #define LJC_GPU_KERNEL
 #ifdef _DOUBLE_DOUBLE
 #define numtyp double
 #define numtyp2 double2
 #define numtyp4 double4
 #define acctyp double
 #define acctyp4 double4
 #endif
 #ifdef _SINGLE_DOUBLE
 #define numtyp float
 #define numtyp2 float2
 #define numtyp4 float4
 #define acctyp double
 #define acctyp4 double4
 #endif
 #ifndef numtyp
 #define numtyp float
 #define numtyp2 float2
 #define numtyp4 float4
 #define acctyp float
 #define acctyp4 float4
 #endif
 #ifdef NV_KERNEL
 #include "nv_kernel_def.h"
@ -85,6 +61,30 @@ __inline float fetch_q(const int& i, const float *q)
 #endif
 #ifdef _DOUBLE_DOUBLE
 #define numtyp double
 #define numtyp2 double2
 #define numtyp4 double4
 #define acctyp double
 #define acctyp4 double4
 #endif
 #ifdef _SINGLE_DOUBLE
 #define numtyp float
 #define numtyp2 float2
 #define numtyp4 float4
 #define acctyp double
 #define acctyp4 double4
 #endif
 #ifndef numtyp
 #define numtyp float
 #define numtyp2 float2
 #define numtyp4 float4
 #define acctyp float
 #define acctyp4 float4
 #endif
 #define SBBITS 30
 #define NEIGHMASK 0x3FFFFFFF
 __inline int sbmask(int j) { return j >> SBBITS & 3; }
--- a/lib/gpu/ljcl_cut_gpu_kernel.cu
+++ b/lib/gpu/ljcl_cut_gpu_kernel.cu
@ -18,38 +18,6 @@
 #ifndef LJCL_GPU_KERNEL
 #define LJCL_GPU_KERNEL
 #ifdef _DOUBLE_DOUBLE
 #define numtyp double
 #define numtyp2 double2
 #define numtyp4 double4
 #define acctyp double
 #define acctyp4 double4
 #endif
 #ifdef _SINGLE_DOUBLE
 #define numtyp float
 #define numtyp2 float2
 #define numtyp4 float4
 #define acctyp double
 #define acctyp4 double4
 #endif
 #ifndef numtyp
 #define numtyp float
 #define numtyp2 float2
 #define numtyp4 float4
 #define acctyp float
 #define acctyp4 float4
 #endif
 #define EWALD_F (numtyp)1.12837917
 #define EWALD_P (numtyp)0.3275911
 #define A1 (numtyp)0.254829592
 #define A2 (numtyp)-0.284496736
 #define A3 (numtyp)1.421413741
 #define A4 (numtyp)-1.453152027
 #define A5 (numtyp)1.061405429
 #ifdef NV_KERNEL
 #include "nv_kernel_def.h"
@ -93,6 +61,38 @@ __inline float fetch_q(const int& i, const float *q)
 #endif
 #ifdef _DOUBLE_DOUBLE
 #define numtyp double
 #define numtyp2 double2
 #define numtyp4 double4
 #define acctyp double
 #define acctyp4 double4
 #endif
 #ifdef _SINGLE_DOUBLE
 #define numtyp float
 #define numtyp2 float2
 #define numtyp4 float4
 #define acctyp double
 #define acctyp4 double4
 #endif
 #ifndef numtyp
 #define numtyp float
 #define numtyp2 float2
 #define numtyp4 float4
 #define acctyp float
 #define acctyp4 float4
 #endif
 #define EWALD_F (numtyp)1.12837917
 #define EWALD_P (numtyp)0.3275911
 #define A1 (numtyp)0.254829592
 #define A2 (numtyp)-0.284496736
 #define A3 (numtyp)1.421413741
 #define A4 (numtyp)-1.453152027
 #define A5 (numtyp)1.061405429
 #define SBBITS 30
 #define NEIGHMASK 0x3FFFFFFF
 __inline int sbmask(int j) { return j >> SBBITS & 3; }
--- a/lib/gpu/morse_gpu_kernel.cu
+++ b/lib/gpu/morse_gpu_kernel.cu
@ -18,30 +18,6 @@
 #ifndef MORSE_GPU_KERNEL
 #define MORSE_GPU_KERNEL
 #ifdef _DOUBLE_DOUBLE
 #define numtyp double
 #define numtyp2 double2
 #define numtyp4 double4
 #define acctyp double
 #define acctyp4 double4
 #endif
 #ifdef _SINGLE_DOUBLE
 #define numtyp float
 #define numtyp2 float2
 #define numtyp4 float4
 #define acctyp double
 #define acctyp4 double4
 #endif
 #ifndef numtyp
 #define numtyp float
 #define numtyp2 float2
 #define numtyp4 float4
 #define acctyp float
 #define acctyp4 float4
 #endif
 #ifdef NV_KERNEL
 #include "nv_kernel_def.h"
@ -75,6 +51,30 @@ __inline float4 fetch_pos(const int& i, const float4 *pos)
 #endif
 #ifdef _DOUBLE_DOUBLE
 #define numtyp double
 #define numtyp2 double2
 #define numtyp4 double4
 #define acctyp double
 #define acctyp4 double4
 #endif
 #ifdef _SINGLE_DOUBLE
 #define numtyp float
 #define numtyp2 float2
 #define numtyp4 float4
 #define acctyp double
 #define acctyp4 double4
 #endif
 #ifndef numtyp
 #define numtyp float
 #define numtyp2 float2
 #define numtyp4 float4
 #define acctyp float
 #define acctyp4 float4
 #endif
 #define SBBITS 30
 #define NEIGHMASK 0x3FFFFFFF
 __inline int sbmask(int j) { return j >> SBBITS & 3; }
--- a/lib/gpu/pair_gpu_atom_kernel.cu
+++ b/lib/gpu/pair_gpu_atom_kernel.cu
@ -15,6 +15,13 @@
   Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
 ------------------------------------------------------------------------- */
 #ifdef NV_KERNEL
 #include "geryon/ucl_nv_kernel.h"
 #else
 #pragma OPENCL EXTENSION cl_khr_fp64: enable
 #define GLOBAL_ID_X get_global_id(0)
 #endif
 #ifdef _DOUBLE_DOUBLE
 #define numtyp double
 #define numtyp4 double4
@ -23,13 +30,6 @@
 #define numtyp4 float4
 #endif
 #ifdef NV_KERNEL
 #include "geryon/ucl_nv_kernel.h"
 #else
 #pragma OPENCL EXTENSION cl_khr_fp64: enable
 #define GLOBAL_ID_X get_global_id(0)
 #endif
 __kernel void kernel_cast_x(__global numtyp4 *x_type, __global double *x,
                            __global int *type, const int nall) {
  int ii=GLOBAL_ID_X;
--- a/lib/gpu/pair_gpu_device.cpp
+++ b/lib/gpu/pair_gpu_device.cpp
@ -549,8 +549,9 @@ int PairGPUDeviceT::compile_kernels() {
  k_info.run(&d_gpu_lib_data.begin());
  ucl_copy(h_gpu_lib_data,d_gpu_lib_data,false);
  _ptx_arch=static_cast<double>(h_gpu_lib_data[0])/100.0;
  #ifndef USE_OPENCL
-  if (static_cast<double>(h_gpu_lib_data[0])/100.0>gpu->arch())
+  if (_ptx_arch>gpu->arch())
    return -4;
  #endif
--- a/lib/gpu/pair_gpu_device.h
+++ b/lib/gpu/pair_gpu_device.h
@ -226,6 +226,8 @@ class PairGPUDevice {
  inline int block_bio_pair() const { return _block_bio_pair; }
  /// Return the maximum number of atom types for shared mem with "bio" styles
  inline int max_bio_shared_types() const { return _max_bio_shared_types; }
  /// Architecture gpu code compiled for (returns 0 for OpenCL)
  inline double ptx_arch() const { return _ptx_arch; }
  // -------------------- SHARED DEVICE ROUTINES -------------------- 
  // Perform asynchronous zero of integer array 
@ -281,6 +283,7 @@ class PairGPUDevice {
  int _gpu_mode, _first_device, _last_device, _nthreads;
  double _particle_split;
  double _cpu_full;
  double _ptx_arch;
  int _num_mem_threads, _warp_size, _threads_per_atom, _threads_per_charge;
  int _pppm_max_spline, _pppm_block;
--- a/lib/gpu/pppm_gpu_kernel.cu
+++ b/lib/gpu/pppm_gpu_kernel.cu
@ -18,27 +18,6 @@
 #ifndef PPPM_GPU_KERNEL
 #define PPPM_GPU_KERNEL
 #ifdef _DOUBLE_DOUBLE
 #define numtyp double
 #define numtyp4 double4
 #define acctyp double
 #define acctyp4 double4
 #endif
 #ifdef _SINGLE_DOUBLE
 #define numtyp float
 #define numtyp4 float4
 #define acctyp double
 #define acctyp4 double4
 #endif
 #ifndef numtyp
 #define numtyp float
 #define numtyp4 float4
 #define acctyp float
 #define acctyp4 float4
 #endif
 #ifdef NV_KERNEL
 #include "geryon/ucl_nv_kernel.h"
@ -67,6 +46,12 @@ __inline float fetch_q(const int& i, const float *q)
 #endif
 // Allow PPPM to compile without atomics for NVIDIA 1.0 cards, error
 // generated at runtime with use of pppm/gpu
 #if (__CUDA_ARCH__ < 110)
 #define atom_add(x,y) 0
 #endif
 #else
 #pragma OPENCL EXTENSION cl_khr_fp64: enable
@ -85,6 +70,27 @@ __inline float fetch_q(const int& i, const float *q)
 #endif
 #ifdef _DOUBLE_DOUBLE
 #define numtyp double
 #define numtyp4 double4
 #define acctyp double
 #define acctyp4 double4
 #endif
 #ifdef _SINGLE_DOUBLE
 #define numtyp float
 #define numtyp4 float4
 #define acctyp double
 #define acctyp4 double4
 #endif
 #ifndef numtyp
 #define numtyp float
 #define numtyp4 float4
 #define acctyp float
 #define acctyp4 float4
 #endif
 // Maximum order for spline
 #define PPPM_MAX_SPLINE 8
 // Thread block size for PPPM kernels
--- a/lib/gpu/pppm_gpu_memory.cpp
+++ b/lib/gpu/pppm_gpu_memory.cpp
@ -66,6 +66,10 @@ grdtyp * PPPMGPUMemoryT::init(const int nlocal, const int nall, FILE *_screen,
    flag=-5;
    return 0;
  }
  if (device->ptx_arch()>0.0 && device->ptx_arch()<1.1) {
    flag=-4;
    return 0;
  }
  ucl_device=device->gpu;
  atom=&device->atom;