From 2728aab0e5e1c38e074f58a6b8df9a77f76cbe02 Mon Sep 17 00:00:00 2001 From: sjplimp Date: Fri, 2 Dec 2011 16:11:02 +0000 Subject: [PATCH] git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@7283 f3b2605a-c512-4ea7-a41b-209d697bcdaa --- lib/gpu/lal_neighbor.cpp | 7 ++++- lib/gpu/lal_neighbor_gpu.cu | 2 +- lib/gpu/lal_neighbor_shared.cpp | 3 ++- lib/gpu/lal_preprocessor.h | 45 +++++++++++++++++---------------- 4 files changed, 32 insertions(+), 25 deletions(-) diff --git a/lib/gpu/lal_neighbor.cpp b/lib/gpu/lal_neighbor.cpp index 5f0b1b5c98..a1ec1d98c4 100644 --- a/lib/gpu/lal_neighbor.cpp +++ b/lib/gpu/lal_neighbor.cpp @@ -146,6 +146,10 @@ void Neighbor::alloc(bool &success) { ptr+=_max_nbors; } _c_bytes+=dev_host_nbor.row_bytes()+dev_host_numj.row_bytes(); + } else { + // Some OpenCL implementations return errors for NULL pointers as args + dev_host_nbor.view(dev_nbor); + dev_host_numj.view(dev_nbor); } if (_maxspecial>0) { dev_nspecial.clear(); @@ -460,7 +464,8 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum, ptr+=mn; } _gpu_bytes+=dev_host_nbor.row_bytes(); - } + } else + dev_host_nbor.view(dev_nbor); if (_alloc_packed) { dev_packed.clear(); success=success && (dev_packed.alloc((mn+2)*_max_atoms,*dev, diff --git a/lib/gpu/lal_neighbor_gpu.cu b/lib/gpu/lal_neighbor_gpu.cu index 29007abe81..36cd8c42ff 100644 --- a/lib/gpu/lal_neighbor_gpu.cu +++ b/lib/gpu/lal_neighbor_gpu.cu @@ -200,7 +200,7 @@ __kernel void calc_neigh_list_cell(__global numtyp4 *x_, r2 = diff.x*diff.x + diff.y*diff.y + diff.z*diff.z; if (r2 < cell_size*cell_size && r2 > 1e-5) { cnt++; - if (cnt < neigh_bin_size) { + if (cnt <= neigh_bin_size) { *neigh_list = pid_j; neigh_list++; if ((cnt & (t_per_atom-1))==0) diff --git a/lib/gpu/lal_neighbor_shared.cpp b/lib/gpu/lal_neighbor_shared.cpp index 1547eac4f2..aeac76062a 100644 --- a/lib/gpu/lal_neighbor_shared.cpp +++ b/lib/gpu/lal_neighbor_shared.cpp @@ -50,7 +50,8 @@ void NeighborShared::compile_kernels(UCL_Device &dev, const int gpu_nbor) { return; _gpu_nbor=gpu_nbor; - std::string flags="-cl-fast-relaxed-math -cl-mad-enable -D"+ + std::string flags="-cl-fast-relaxed-math -cl-mad-enable "+ + std::string(OCL_PRECISION_COMPILE)+" -D"+ std::string(OCL_VENDOR); if (_gpu_nbor==0) { diff --git a/lib/gpu/lal_preprocessor.h b/lib/gpu/lal_preprocessor.h index 722860f512..28734b3439 100644 --- a/lib/gpu/lal_preprocessor.h +++ b/lib/gpu/lal_preprocessor.h @@ -67,6 +67,22 @@ #ifdef NV_KERNEL +#define GLOBAL_ID_X threadIdx.x+mul24(blockIdx.x,blockDim.x) +#define GLOBAL_ID_Y threadIdx.y+mul24(blockIdx.y,blockDim.y) +#define GLOBAL_SIZE_X mul24(gridDim.x,blockDim.x); +#define GLOBAL_SIZE_Y mul24(gridDim.y,blockDim.y); +#define THREAD_ID_X threadIdx.x +#define THREAD_ID_Y threadIdx.y +#define BLOCK_ID_X blockIdx.x +#define BLOCK_ID_Y blockIdx.y +#define BLOCK_SIZE_X blockDim.x +#define BLOCK_SIZE_Y blockDim.y +#define __kernel extern "C" __global__ +#define __local __shared__ +#define __global +#define atom_add atomicAdd +#define ucl_inline static __inline__ __device__ + #ifdef __CUDA_ARCH__ #define ARCH __CUDA_ARCH__ #else @@ -120,24 +136,7 @@ struct __builtin_align__(16) _double4 typedef struct _double4 double4; #endif -#define GLOBAL_ID_X threadIdx.x+mul24(blockIdx.x,blockDim.x) -#define GLOBAL_ID_Y threadIdx.y+mul24(blockIdx.y,blockDim.y) -#define GLOBAL_SIZE_X mul24(gridDim.x,blockDim.x); -#define GLOBAL_SIZE_Y mul24(gridDim.y,blockDim.y); -#define THREAD_ID_X threadIdx.x -#define THREAD_ID_Y threadIdx.y -#define BLOCK_ID_X blockIdx.x -#define BLOCK_ID_Y blockIdx.y -#define BLOCK_SIZE_X blockDim.x -#define BLOCK_SIZE_Y blockDim.y -#define __kernel extern "C" __global__ -#define __local __shared__ -#define __global -#define atom_add atomicAdd -#define ucl_inline static __inline__ __device__ - - -#ifndef _DOUBLE_DOUBLE +#ifdef _DOUBLE_DOUBLE #define ucl_exp exp #define ucl_powr pow @@ -156,20 +155,18 @@ typedef struct _double4 double4; #define ucl_ceil ceilf #define ucl_abs fabsf #define ucl_recip(x) ((numtyp)1.0/(x)) +#define ucl_rsqrt rsqrtf +#define ucl_sqrt sqrtf #ifdef NO_HARDWARE_TRANSCENDENTALS #define ucl_exp expf #define ucl_powr powf -#define ucl_rsqrt rsqrtf -#define ucl_sqrt sqrtf #else #define ucl_exp __expf #define ucl_powr __powf -#define ucl_rsqrt __rsqrtf -#define ucl_sqrt __sqrtf #endif @@ -255,6 +252,10 @@ typedef struct _double4 double4; #define ucl_ceil ceil #define ucl_abs fabs +#ifdef _DOUBLE_DOUBLE +#define NO_HARDWARE_TRANSCENDENTALS +#endif + #ifdef NO_HARDWARE_TRANSCENDENTALS #define ucl_exp exp