From db8cc02e18dd577cb84b07c716e0fd681e025a5a Mon Sep 17 00:00:00 2001 From: "W. Michael Brown" Date: Wed, 26 Oct 2011 17:42:21 -0400 Subject: [PATCH] Adding a capability for neighbor builds to openCL builds. --- lib/gpu/Makefile.firefly_opencl | 2 +- lib/gpu/Opencl.makefile | 7 ++- lib/gpu/device.cpp | 4 ++ lib/gpu/neighbor.cpp | 8 +-- lib/gpu/neighbor_gpu.cu | 86 ++++++++++++++++++--------------- lib/gpu/neighbor_shared.cpp | 13 ++--- lib/gpu/preprocessor.h | 2 + 7 files changed, 67 insertions(+), 55 deletions(-) diff --git a/lib/gpu/Makefile.firefly_opencl b/lib/gpu/Makefile.firefly_opencl index 7424846343..d035a4d29f 100644 --- a/lib/gpu/Makefile.firefly_opencl +++ b/lib/gpu/Makefile.firefly_opencl @@ -1,4 +1,4 @@ -OCL_CPP = mpic++ -O3 -DFERMI_OCL -DMPI_GERYON -DMPICH_IGNORE_CXX_SEEK -I/usr/local/cuda/include/ +OCL_CPP = mpic++ -O3 -g -DFERMI_OCL -DMPI_GERYON -DMPICH_IGNORE_CXX_SEEK -I/usr/local/cuda/include/ OCL_LINK = -lOpenCL OCL_PREC = -D_SINGLE_SINGLE diff --git a/lib/gpu/Opencl.makefile b/lib/gpu/Opencl.makefile index f47645e4e3..24703f71b0 100644 --- a/lib/gpu/Opencl.makefile +++ b/lib/gpu/Opencl.makefile @@ -40,7 +40,7 @@ KERS = $(OBJ_DIR)/device_cl.h $(OBJ_DIR)/atom_cl.h \ $(OBJ_DIR)/lj_coul_long_cl.h $(OBJ_DIR)/lj_class2_long_cl.h \ $(OBJ_DIR)/coul_long_cl.h $(OBJ_DIR)/morse_cl.h \ $(OBJ_DIR)/charmm_long_cl.h $(OBJ_DIR)/cg_cmm_cl.h \ - $(OBJ_DIR)/cg_cmm_long_cl.h + $(OBJ_DIR)/cg_cmm_long_cl.h $(OBJ_DIR)/neighbor_gpu_cl.h OCL_EXECS = $(BIN_DIR)/ocl_get_devices @@ -58,7 +58,10 @@ $(OBJ_DIR)/ans.o: answer.cpp answer.h $(OCL_H) $(OBJ_DIR)/neighbor_cpu_cl.h: neighbor_cpu.cu preprocessor.h $(BSH) ./geryon/file_to_cstr.sh neighbor_cpu preprocessor.h neighbor_cpu.cu $(OBJ_DIR)/neighbor_cpu_cl.h -$(OBJ_DIR)/neighbor_shared.o: neighbor_shared.cpp neighbor_shared.h $(OCL_H) $(OBJ_DIR)/neighbor_cpu_cl.h +$(OBJ_DIR)/neighbor_gpu_cl.h: neighbor_gpu.cu preprocessor.h + $(BSH) ./geryon/file_to_cstr.sh neighbor_gpu preprocessor.h neighbor_gpu.cu $(OBJ_DIR)/neighbor_gpu_cl.h + +$(OBJ_DIR)/neighbor_shared.o: neighbor_shared.cpp neighbor_shared.h $(OCL_H) $(OBJ_DIR)/neighbor_cpu_cl.h $(OBJ_DIR)/neighbor_gpu_cl.h $(OCL) -o $@ -c neighbor_shared.cpp -I$(OBJ_DIR) $(OBJ_DIR)/neighbor.o: neighbor.cpp neighbor.h $(OCL_H) neighbor_shared.h diff --git a/lib/gpu/device.cpp b/lib/gpu/device.cpp index ff68db0448..3e6e15d33c 100644 --- a/lib/gpu/device.cpp +++ b/lib/gpu/device.cpp @@ -160,6 +160,10 @@ int DeviceT::init(Answer &ans, const bool charge, gpu_nbor=1; else if (_gpu_mode==Device::GPU_HYB_NEIGH) gpu_nbor=2; + #ifdef USE_OPENCL + if (gpu_nbor==1) + gpu_nbor=2; + #endif if (_init_count==0) { // Initialize atom and nbor data diff --git a/lib/gpu/neighbor.cpp b/lib/gpu/neighbor.cpp index ee076bf430..d443887287 100644 --- a/lib/gpu/neighbor.cpp +++ b/lib/gpu/neighbor.cpp @@ -97,8 +97,8 @@ void Neighbor::alloc(bool &success) { host_acc.clear(); int nt=_max_atoms+_max_host; if (_use_packing==false || _gpu_nbor>0) - success=success && (dev_nbor.alloc((_max_nbors+2)*_max_atoms,*dev, - UCL_READ_ONLY)==UCL_SUCCESS); + success=success && + (dev_nbor.alloc((_max_nbors+2)*_max_atoms,*dev)==UCL_SUCCESS); else success=success && (dev_nbor.alloc(3*_max_atoms,*dev, UCL_READ_ONLY)==UCL_SUCCESS); @@ -428,8 +428,8 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum, if (mn>_max_nbors) { mn=static_cast(static_cast(mn)*1.10); dev_nbor.clear(); - success=success && (dev_nbor.alloc((mn+1)*_max_atoms,atom.dev_cell_id, - UCL_READ_ONLY)==UCL_SUCCESS); + success=success && + (dev_nbor.alloc((mn+1)*_max_atoms,atom.dev_x)==UCL_SUCCESS); _gpu_bytes=dev_nbor.row_bytes(); if (_max_host>0) { host_nbor.clear(); diff --git a/lib/gpu/neighbor_gpu.cu b/lib/gpu/neighbor_gpu.cu index ebdce7ccc1..6976eaac84 100644 --- a/lib/gpu/neighbor_gpu.cu +++ b/lib/gpu/neighbor_gpu.cu @@ -21,29 +21,6 @@ texture neigh_tex; ucl_inline float4 fetch_pos(const int& i, const float4 *pos) { return tex1Dfetch(neigh_tex, i); } #endif -#endif - -__kernel void transpose(int *out, int *in, int columns_in, int rows_in) -{ - __local float block[BLOCK_CELL_2D][BLOCK_CELL_2D+1]; - - unsigned ti=THREAD_ID_X; - unsigned tj=THREAD_ID_Y; - unsigned bi=BLOCK_ID_X; - unsigned bj=BLOCK_ID_Y; - - unsigned i=bi*BLOCK_CELL_2D+ti; - unsigned j=bj*BLOCK_CELL_2D+tj; - if ((iload_string(neighbor_cpu,flags.c_str()); k_nbor.set_function(*nbor_program,"kernel_unpack"); } else { build_program=new UCL_Program(dev); - #ifdef USE_OPENCL - if (gpu_nbor==1) { - std::cerr << "CANNOT CURRENTLY USE GPU NEIGHBORING WITH OPENCL\n"; - exit(1); - } - #else build_program->load_string(neighbor_gpu,flags.c_str()); - #endif - if (gpu_nbor==1) { + + if (_gpu_nbor==1) { k_cell_id.set_function(*build_program,"calc_cell_id"); k_cell_counts.set_function(*build_program,"kernel_calc_cell_counts"); } diff --git a/lib/gpu/preprocessor.h b/lib/gpu/preprocessor.h index 2a7441b5c4..ecc9ec1337 100644 --- a/lib/gpu/preprocessor.h +++ b/lib/gpu/preprocessor.h @@ -243,6 +243,8 @@ typedef struct _double4 double4; #define BLOCK_ID_X get_group_id(0) #define BLOCK_SIZE_X get_local_size(0) #define GLOBAL_SIZE_X get_global_size(0) +#define THREAD_ID_Y get_local_id(1) +#define BLOCK_ID_Y get_group_id(1) #define __syncthreads() barrier(CLK_LOCAL_MEM_FENCE) #define ucl_inline inline #define fetch_pos(i,y) x_[i]