From 8f2257ecbf1235c6a6bd7f027beadfbf9d1d2e45 Mon Sep 17 00:00:00 2001 From: sjplimp Date: Wed, 12 Jan 2011 15:24:04 +0000 Subject: [PATCH] git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@5537 f3b2605a-c512-4ea7-a41b-209d697bcdaa --- lib/gpu/Makefile.fermi | 2 +- lib/gpu/Makefile.lens | 4 +- lib/gpu/Makefile.lincoln | 2 +- lib/gpu/Makefile.linux | 2 +- lib/gpu/Makefile.linux_opencl | 2 +- lib/gpu/Makefile.longhorn | 2 +- lib/gpu/Makefile.mac | 2 +- lib/gpu/Makefile.mac_opencl | 2 +- lib/gpu/Nvidia.makefile | 16 ++++++- lib/gpu/Opencl.makefile | 11 +++++ lib/gpu/cmm_cut_gpu.cpp | 8 ++-- lib/gpu/cmm_cut_gpu_kernel.cu | 20 ++++----- lib/gpu/cmmc_long_gpu.cpp | 8 ++-- lib/gpu/cmmc_long_gpu_kernel.cu | 24 +++++------ lib/gpu/gb_gpu.cpp | 8 ++-- lib/gpu/gb_gpu_kernel.cu | 16 +++---- lib/gpu/gb_gpu_kernel_lj.cu | 30 ++++++------- lib/gpu/gb_gpu_memory.cpp | 23 +++++----- lib/gpu/geryon/nvc_device.h | 6 +-- lib/gpu/geryon/nvc_macros.h | 21 +++++++++- lib/gpu/geryon/nvd_device.h | 6 +-- lib/gpu/geryon/nvd_macros.h | 19 ++++++++- lib/gpu/geryon/nvd_memory.h | 6 +-- lib/gpu/geryon/nvd_timer.h | 6 +-- lib/gpu/geryon/ocl_device.h | 6 +-- lib/gpu/geryon/ocl_kernel.h | 8 ++-- lib/gpu/geryon/ocl_timer.h | 4 +- lib/gpu/lj96_cut_gpu.cpp | 8 ++-- lib/gpu/lj96_cut_gpu_kernel.cu | 20 ++++----- lib/gpu/lj_cut_gpu.cpp | 8 ++-- lib/gpu/lj_cut_gpu_kernel.cu | 20 ++++----- lib/gpu/ljc_cut_gpu.cpp | 8 ++-- lib/gpu/ljc_cut_gpu_kernel.cu | 24 +++++------ lib/gpu/ljcl_cut_gpu.cpp | 8 ++-- lib/gpu/ljcl_cut_gpu_kernel.cu | 24 +++++------ lib/gpu/pair_gpu_atom.cpp | 74 ++++++++++++++++++--------------- lib/gpu/pair_gpu_atom.h | 25 ++++++++--- lib/gpu/pair_gpu_balance.h | 12 +++--- lib/gpu/pair_gpu_device.cpp | 66 +++++++++++++++++------------ lib/gpu/pair_gpu_device.h | 33 +++++++++++---- lib/gpu/pair_gpu_precision.h | 1 + 41 files changed, 351 insertions(+), 244 deletions(-) diff --git a/lib/gpu/Makefile.fermi b/lib/gpu/Makefile.fermi index d292bcfc30..d830c8924c 100644 --- a/lib/gpu/Makefile.fermi +++ b/lib/gpu/Makefile.fermi @@ -30,7 +30,7 @@ CUDR_CPP = mpic++ -DMPI_GERYON -I$(CUDA_HOME)/include CUDR_OPTS = -O3 -ffast-math -funroll-loops -DMPI_GERYON BIN_DIR = ./ -OBJ_DIR = ./obj +OBJ_DIR = ./ LIB_DIR = ./ AR = ar BSH = /bin/sh diff --git a/lib/gpu/Makefile.lens b/lib/gpu/Makefile.lens index ceec99df7f..3b6301277f 100644 --- a/lib/gpu/Makefile.lens +++ b/lib/gpu/Makefile.lens @@ -26,11 +26,11 @@ CUDA_INCLUDE = -I$(CUDA_HOME)/include CUDA_LIB = -L$(CUDA_HOME)/lib64 CUDA_OPTS = -DUNIX -O3 -Xptxas -v --use_fast_math -CUDR_CPP = mpic++ -DMPI_GERYON +CUDR_CPP = mpic++ -DMPI_GERYON -openmp CUDR_OPTS = -O2 -xSSE2 -ip -use-intel-optimized-headers -fno-alias BIN_DIR = ./ -OBJ_DIR = ./obj +OBJ_DIR = ./ LIB_DIR = ./ AR = ar BSH = /bin/sh diff --git a/lib/gpu/Makefile.lincoln b/lib/gpu/Makefile.lincoln index c181fb08fb..97a7901811 100644 --- a/lib/gpu/Makefile.lincoln +++ b/lib/gpu/Makefile.lincoln @@ -28,7 +28,7 @@ CUDR_CPP = mpic++ -DMPI_GERYON CUDR_OPTS = -O3 -DMPI_GERYON -ffast-math -funroll-loops BIN_DIR = ./ -OBJ_DIR = ./obj +OBJ_DIR = ./ LIB_DIR = ./ AR = ar diff --git a/lib/gpu/Makefile.linux b/lib/gpu/Makefile.linux index 43ee31bdf9..c0001a54ab 100644 --- a/lib/gpu/Makefile.linux +++ b/lib/gpu/Makefile.linux @@ -30,7 +30,7 @@ CUDR_CPP = mpic++ -DMPI_GERYON -DMPICH_IGNORE_CXX_SEEK CUDR_OPTS = -O2 # -xHost -no-prec-div -ansi-alias BIN_DIR = ./ -OBJ_DIR = ./obj +OBJ_DIR = ./ LIB_DIR = ./ AR = ar BSH = /bin/sh diff --git a/lib/gpu/Makefile.linux_opencl b/lib/gpu/Makefile.linux_opencl index 44b5d5aa2d..69522298c5 100644 
--- a/lib/gpu/Makefile.linux_opencl +++ b/lib/gpu/Makefile.linux_opencl @@ -22,7 +22,7 @@ OCL_LINK = -lOpenCL OCL_PREC = -D_SINGLE_SINGLE BIN_DIR = ./ -OBJ_DIR = ./ocl_obj +OBJ_DIR = ./ LIB_DIR = ./ AR = ar BSH = /bin/sh diff --git a/lib/gpu/Makefile.longhorn b/lib/gpu/Makefile.longhorn index 33a02562cf..ba921f0f68 100644 --- a/lib/gpu/Makefile.longhorn +++ b/lib/gpu/Makefile.longhorn @@ -27,7 +27,7 @@ CUDR_CPP = mpicxx -DMPI_GERYON -DMPICH_IGNORE_CXX_SEEK CUDR_OPTS = -O2 # -xHost -no-prec-div -ansi-alias BIN_DIR = ./ -OBJ_DIR = ./obj +OBJ_DIR = ./ LIB_DIR = ./ AR = ar diff --git a/lib/gpu/Makefile.mac b/lib/gpu/Makefile.mac index f16fe197bc..f061a1a68a 100644 --- a/lib/gpu/Makefile.mac +++ b/lib/gpu/Makefile.mac @@ -30,7 +30,7 @@ CUDR_CPP = mpic++ CUDR_OPTS = -O2 -m32 -g BIN_DIR = ./ -OBJ_DIR = ./obj +OBJ_DIR = ./ LIB_DIR = ./ AR = ar BSH = /bin/sh diff --git a/lib/gpu/Makefile.mac_opencl b/lib/gpu/Makefile.mac_opencl index dae41dd3ad..53d6d466e2 100644 --- a/lib/gpu/Makefile.mac_opencl +++ b/lib/gpu/Makefile.mac_opencl @@ -22,7 +22,7 @@ OCL_LINK = -framework OpenCL OCL_PREC = -D_SINGLE_SINGLE BIN_DIR = ./ -OBJ_DIR = ./ocl_obj +OBJ_DIR = ./ LIB_DIR = ./ AR = ar BSH = /bin/sh diff --git a/lib/gpu/Nvidia.makefile b/lib/gpu/Nvidia.makefile index f7de72bd25..adf281e156 100644 --- a/lib/gpu/Nvidia.makefile +++ b/lib/gpu/Nvidia.makefile @@ -47,6 +47,7 @@ OBJS = $(OBJ_DIR)/pair_gpu_atom.o $(OBJ_DIR)/pair_gpu_nbor.o \ $(OBJ_DIR)/lj96_cut_gpu_memory.o $(OBJ_DIR)/lj96_cut_gpu.o \ $(OBJ_DIR)/ljc_cut_gpu_memory.o $(OBJ_DIR)/ljc_cut_gpu.o \ $(OBJ_DIR)/ljcl_cut_gpu_memory.o $(OBJ_DIR)/ljcl_cut_gpu.o \ + $(OBJ_DIR)/crml_gpu_memory.o $(OBJ_DIR)/crml_gpu.o \ $(OBJ_DIR)/cmm_cut_gpu_memory.o $(OBJ_DIR)/cmm_cut_gpu.o \ $(OBJ_DIR)/cmmc_long_gpu_memory.o $(OBJ_DIR)/cmmc_long_gpu.o \ $(CUDPP) @@ -59,8 +60,9 @@ PTXS = $(OBJ_DIR)/pair_gpu_atom_kernel.ptx $(OBJ_DIR)/pair_gpu_atom_ptx.h \ $(OBJ_DIR)/lj96_cut_gpu_kernel.ptx $(OBJ_DIR)/lj96_cut_gpu_ptx.h \ $(OBJ_DIR)/ljc_cut_gpu_kernel.ptx $(OBJ_DIR)/ljc_cut_gpu_ptx.h \ $(OBJ_DIR)/ljcl_cut_gpu_kernel.ptx $(OBJ_DIR)/ljcl_cut_gpu_ptx.h \ + $(OBJ_DIR)/crml_cut_gpu_kernel.ptx $(OBJ_DIR)/crml_cut_gpu_ptx.h \ $(OBJ_DIR)/cmm_cut_gpu_kernel.ptx $(OBJ_DIR)/cmm_cut_gpu_ptx.h \ - $(OBJ_DIR)/cmmc_long_gpu_kernel.ptx $(OBJ_DIR)/cmmc_long_gpu_ptx.h + $(OBJ_DIR)/cmmc_long_gpu_kernel.ptx $(OBJ_DIR)/cmmc_long_gpu_ptx.h all: $(GPU_LIB) $(EXECS) @@ -169,6 +171,18 @@ $(OBJ_DIR)/ljcl_cut_gpu_memory.o: $(ALL_H) ljcl_cut_gpu_memory.h ljcl_cut_gpu_me $(OBJ_DIR)/ljcl_cut_gpu.o: $(ALL_H) ljcl_cut_gpu_memory.h ljcl_cut_gpu.cpp $(CUDR) -o $@ -c ljcl_cut_gpu.cpp -I$(OBJ_DIR) +$(OBJ_DIR)/crml_gpu_kernel.ptx: crml_gpu_kernel.cu pair_gpu_precision.h + $(CUDA) --ptx -DNV_KERNEL -o $@ crml_gpu_kernel.cu + +$(OBJ_DIR)/crml_gpu_ptx.h: $(OBJ_DIR)/crml_gpu_kernel.ptx $(OBJ_DIR)/crml_gpu_kernel.ptx + $(BSH) ./geryon/file_to_cstr.sh $(OBJ_DIR)/crml_gpu_kernel.ptx $(OBJ_DIR)/crml_gpu_ptx.h + +$(OBJ_DIR)/crml_gpu_memory.o: $(ALL_H) crml_gpu_memory.h crml_gpu_memory.cpp $(OBJ_DIR)/crml_gpu_ptx.h $(OBJ_DIR)/charge_gpu_memory.o + $(CUDR) -o $@ -c crml_gpu_memory.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/crml_gpu.o: $(ALL_H) crml_gpu_memory.h crml_gpu.cpp + $(CUDR) -o $@ -c crml_gpu.cpp -I$(OBJ_DIR) + $(OBJ_DIR)/lj96_cut_gpu_kernel.ptx: lj96_cut_gpu_kernel.cu pair_gpu_precision.h $(CUDA) --ptx -DNV_KERNEL -o $@ lj96_cut_gpu_kernel.cu diff --git a/lib/gpu/Opencl.makefile b/lib/gpu/Opencl.makefile index 4da8ce5f12..ac7aecc2ee 100644 --- a/lib/gpu/Opencl.makefile +++ b/lib/gpu/Opencl.makefile @@ -37,12 +37,14 @@ OBJS = 
$(OBJ_DIR)/pair_gpu_atom.o $(OBJ_DIR)/pair_gpu_nbor.o \ $(OBJ_DIR)/lj96_cut_gpu_memory.o $(OBJ_DIR)/lj96_cut_gpu.o \ $(OBJ_DIR)/ljc_cut_gpu_memory.o $(OBJ_DIR)/ljc_cut_gpu.o \ $(OBJ_DIR)/ljcl_cut_gpu_memory.o $(OBJ_DIR)/ljcl_cut_gpu.o \ + $(OBJ_DIR)/crml_gpu_memory.o $(OBJ_DIR)/crml_gpu.o \ $(OBJ_DIR)/cmm_cut_gpu_memory.o $(OBJ_DIR)/cmm_cut_gpu.o \ $(OBJ_DIR)/cmmc_long_gpu_memory.o $(OBJ_DIR)/cmmc_long_gpu.o KERS = $(OBJ_DIR)/pair_gpu_atom_cl.h $(OBJ_DIR)/pair_gpu_nbor_cl.h \ $(OBJ_DIR)/gb_gpu_nbor_cl.h $(OBJ_DIR)/gb_gpu_cl.h \ $(OBJ_DIR)/lj_cut_gpu_cl.h $(OBJ_DIR)/lj96_cut_gpu_cl.h \ $(OBJ_DIR)/ljc_cut_gpu_cl.h $(OBJ_DIR)/ljcl_cut_gpu_cl.h \ + $(OBJ_DIR)/crml_gpu_cl.h \ $(OBJ_DIR)/cmm_cut_gpu_cl.h $(OBJ_DIR)/cmmc_long_gpu_cl.h OCL_EXECS = $(BIN_DIR)/ocl_get_devices @@ -112,6 +114,15 @@ $(OBJ_DIR)/ljcl_cut_gpu_memory.o: $(ALL_H) ljcl_cut_gpu_memory.h ljcl_cut_gpu_me $(OBJ_DIR)/ljcl_cut_gpu.o: $(ALL_H) ljcl_cut_gpu_memory.h ljcl_cut_gpu.cpp $(OCL) -o $@ -c ljcl_cut_gpu.cpp -I$(OBJ_DIR) +$(OBJ_DIR)/crml_gpu_cl.h: crml_gpu_kernel.cu + $(BSH) ./geryon/file_to_cstr.sh crml_gpu_kernel.cu $(OBJ_DIR)/crml_gpu_cl.h; + +$(OBJ_DIR)/crml_gpu_memory.o: $(ALL_H) crml_gpu_memory.h crml_gpu_memory.cpp $(OBJ_DIR)/crml_gpu_cl.h $(OBJ_DIR)/pair_gpu_nbor_cl.h $(OBJ_DIR)/crml_gpu_cl.h $(OBJ_DIR)/charge_gpu_memory.o + $(OCL) -o $@ -c crml_gpu_memory.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/crml_gpu.o: $(ALL_H) crml_gpu_memory.h crml_gpu.cpp + $(OCL) -o $@ -c crml_gpu.cpp -I$(OBJ_DIR) + $(OBJ_DIR)/lj96_cut_gpu_cl.h: lj96_cut_gpu_kernel.cu $(BSH) ./geryon/file_to_cstr.sh lj96_cut_gpu_kernel.cu $(OBJ_DIR)/lj96_cut_gpu_cl.h; diff --git a/lib/gpu/cmm_cut_gpu.cpp b/lib/gpu/cmm_cut_gpu.cpp index b09d713f91..53976ff7e8 100644 --- a/lib/gpu/cmm_cut_gpu.cpp +++ b/lib/gpu/cmm_cut_gpu.cpp @@ -46,7 +46,7 @@ bool cmm_gpu_init(const int ntypes, double **cutsq, int **cg_types, CMMMF.device->init_message(screen,"cg/cmm",first_gpu,last_gpu); bool message=false; - if (world_me==0 && screen) + if (CMMMF.device->replica_me()==0 && screen) message=true; if (message) { @@ -62,14 +62,14 @@ bool cmm_gpu_init(const int ntypes, double **cutsq, int **cg_types, return false; } - MPI_Barrier(MPI_COMM_WORLD); + CMMMF.device->world_barrier(); if (message) fprintf(screen,"Done.\n"); for (int i=0; igpu_comm); + CMMMF.device->gpu_barrier(); if (message) fprintf(screen,"Done.\n"); } diff --git a/lib/gpu/cmm_cut_gpu_kernel.cu b/lib/gpu/cmm_cut_gpu_kernel.cu index 2288cf1df9..efc6dbbd6a 100644 --- a/lib/gpu/cmm_cut_gpu_kernel.cu +++ b/lib/gpu/cmm_cut_gpu_kernel.cu @@ -91,14 +91,14 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1, if (iiinit_message(screen,"cg/cmm/coul/long",first_gpu,last_gpu); bool message=false; - if (world_me==0 && screen) + if (CMMLMF.device->replica_me()==0 && screen) message=true; if (message) { @@ -66,14 +66,14 @@ bool cmml_gpu_init(const int ntypes, double **cutsq, int **cg_type, return false; } - MPI_Barrier(MPI_COMM_WORLD); + CMMLMF.device->world_barrier(); if (message) fprintf(screen,"Done.\n"); for (int i=0; igpu_comm); + CMMLMF.device->gpu_barrier(); if (message) fprintf(screen,"Done.\n"); } diff --git a/lib/gpu/cmmc_long_gpu_kernel.cu b/lib/gpu/cmmc_long_gpu_kernel.cu index 23debb6b53..a43d73ffcf 100644 --- a/lib/gpu/cmmc_long_gpu_kernel.cu +++ b/lib/gpu/cmmc_long_gpu_kernel.cu @@ -114,15 +114,15 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1, sp_lj[7]=sp_lj_in[7]; if (iiinit_message(screen,"gayberne",first_gpu,last_gpu); bool message=false; - if (world_me==0 && screen) + if 
(GBMF.device->replica_me()==0 && screen) message=true; if (message) { @@ -86,14 +86,14 @@ bool gb_gpu_init(const int ntypes, const double gamma, return false; } - MPI_Barrier(MPI_COMM_WORLD); + GBMF.device->world_barrier(); if (message) fprintf(screen,"Done.\n"); for (int i=0; igpu_comm); + GBMF.device->gpu_barrier(); if (message) fprintf(screen,"Done.\n"); } diff --git a/lib/gpu/gb_gpu_kernel.cu b/lib/gpu/gb_gpu_kernel.cu index 41174660d3..347e6ede35 100644 --- a/lib/gpu/gb_gpu_kernel.cu +++ b/lib/gpu/gb_gpu_kernel.cu @@ -105,18 +105,18 @@ __kernel void kernel_gayberne(__global numtyp4* x_,__global numtyp4 *q, if (iicast_time(); - MPI_Reduce(single,times,6,MPI_DOUBLE,MPI_SUM,0,MPI_COMM_WORLD); + MPI_Reduce(single,times,6,MPI_DOUBLE,MPI_SUM,0,device->replica()); double avg_split=hd_balancer.all_avg_split(); _max_bytes+=dev_error.row_bytes()+lj1.row_bytes()+lj3.row_bytes()+ @@ -230,12 +230,13 @@ void GB_GPU_MemoryT::clear() { shape.row_bytes()+well.row_bytes()+lshape.row_bytes()+ gamma_upsilon_mu.row_bytes(); double mpi_max_bytes; - MPI_Reduce(&_max_bytes,&mpi_max_bytes,1,MPI_DOUBLE,MPI_MAX,0,MPI_COMM_WORLD); + MPI_Reduce(&_max_bytes,&mpi_max_bytes,1,MPI_DOUBLE,MPI_MAX,0, + device->replica()); double max_mb=mpi_max_bytes/(1024*1024); - if (device->world_me()==0) + if (device->replica_me()==0) if (screen && times[3]>0.0) { - int world_size=device->world_size(); + int replica_size=device->replica_size(); fprintf(screen,"\n\n-------------------------------------"); fprintf(screen,"--------------------------------\n"); @@ -244,15 +245,15 @@ void GB_GPU_MemoryT::clear() { fprintf(screen,"--------------------------------\n"); if (device->procs_per_gpu()==1) { - fprintf(screen,"Data Transfer: %.4f s.\n",times[0]/world_size); - fprintf(screen,"Data Cast/Pack: %.4f s.\n",times[5]/world_size); - fprintf(screen,"Neighbor copy: %.4f s.\n",times[1]/world_size); + fprintf(screen,"Data Transfer: %.4f s.\n",times[0]/replica_size); + fprintf(screen,"Data Cast/Pack: %.4f s.\n",times[5]/replica_size); + fprintf(screen,"Neighbor copy: %.4f s.\n",times[1]/replica_size); if (nbor->gpu_nbor()) - fprintf(screen,"Neighbor build: %.4f s.\n",times[2]/world_size); + fprintf(screen,"Neighbor build: %.4f s.\n",times[2]/replica_size); else - fprintf(screen,"Neighbor unpack: %.4f s.\n",times[2]/world_size); - fprintf(screen,"Force calc: %.4f s.\n",times[3]/world_size); - fprintf(screen,"LJ calc: %.4f s.\n",times[4]/world_size); + fprintf(screen,"Neighbor unpack: %.4f s.\n",times[2]/replica_size); + fprintf(screen,"Force calc: %.4f s.\n",times[3]/replica_size); + fprintf(screen,"LJ calc: %.4f s.\n",times[4]/replica_size); } fprintf(screen,"Average split: %.4f.\n",avg_split); fprintf(screen,"Max Mem / Proc: %.2f MB.\n",max_mb); diff --git a/lib/gpu/geryon/nvc_device.h b/lib/gpu/geryon/nvc_device.h index e5c2d801d0..ed445716f6 100644 --- a/lib/gpu/geryon/nvc_device.h +++ b/lib/gpu/geryon/nvc_device.h @@ -13,7 +13,7 @@ copyright : (C) 2009 by W. Michael Brown email : brownw@ornl.gov ***************************************************************************/ - + /* ----------------------------------------------------------------------- Copyright (2009) Sandia Corporation. Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, the U.S. 
Government retains @@ -95,7 +95,7 @@ class UCL_Device { /** \note You cannot delete the default stream **/ inline void pop_command_queue() { if (_cq.size()<2) return; - CUDA_SAFE_CALL_NS(cudaStreamDestroy(_cq.back())); + CUDA_DESTRUCT_CALL_NS(cudaStreamDestroy(_cq.back())); _cq.pop_back(); } @@ -290,7 +290,7 @@ inline void UCL_Device::print_all(std::ostream &out) { else out << "Unknown\n"; #endif - #if CUDART_VERSION >= 3000 + #if CUDART_VERSION >= 3010 out << " Concurrent kernel execution: "; if (_properties[i].concurrentKernels) out << "Yes\n"; diff --git a/lib/gpu/geryon/nvc_macros.h b/lib/gpu/geryon/nvc_macros.h index b9a4e8d6f3..916d268ade 100644 --- a/lib/gpu/geryon/nvc_macros.h +++ b/lib/gpu/geryon/nvc_macros.h @@ -6,7 +6,7 @@ #undef _GLIBCXX_ATOMIC_BUILTINS #endif // _GLIBCXX_ATOMIC_BUILTINS #endif // __APPLE__ - + #include #include #include @@ -18,6 +18,11 @@ #define NVC_GERYON_EXIT assert(0==1) #endif +#ifdef UCL_DEBUG +#define UCL_SYNC_DEBUG +#define UCL_DESTRUCT_CHECK +#endif + #ifndef UCL_NO_API_CHECK #define CUDA_SAFE_CALL_NS( call) do { \ @@ -32,7 +37,7 @@ #define CUDA_SAFE_CALL( call) do { \ CUDA_SAFE_CALL_NS( call); \ - cudaError err=cudaThreadSynchronize(); \ + cudaError err=cudaThreadSynchronize(); \ if( cudaSuccess != err) { \ fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n", \ __FILE__, __LINE__, cudaGetErrorString( err) ); \ @@ -53,5 +58,17 @@ #endif +#ifdef UCL_DESTRUCT_CHECK + +#define CUDA_DESTRUCT_CALL( call) CUDA_SAFE_CALL( call) +#define CUDA_DESTRUCT_CALL_NS( call) CUDA_SAFE_CALL_NS( call) + +#else + +#define CUDA_DESTRUCT_CALL( call) call +#define CUDA_DESTRUCT_CALL_NS( call) call + +#endif + #endif diff --git a/lib/gpu/geryon/nvd_device.h b/lib/gpu/geryon/nvd_device.h index b407c1ede3..6b70964ba1 100644 --- a/lib/gpu/geryon/nvd_device.h +++ b/lib/gpu/geryon/nvd_device.h @@ -20,7 +20,7 @@ certain rights in this software. This software is distributed under the Simplified BSD License. 
----------------------------------------------------------------------- */ - + #ifndef NVD_DEVICE #define NVD_DEVICE @@ -233,7 +233,7 @@ inline UCL_Device::UCL_Device() { &_properties.back().canMapHostMemory, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev)); #endif - #if CUDA_VERSION >= 3000 + #if CUDA_VERSION >= 3010 CU_SAFE_CALL_NS(cuDeviceGetAttribute( &_properties.back().concurrentKernels, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, dev)); @@ -339,7 +339,7 @@ inline void UCL_Device::print_all(std::ostream &out) { else out << "No\n"; #endif - #if CUDA_VERSION >= 3000 + #if CUDA_VERSION >= 3010 out << " Concurrent kernel execution: "; if (_properties[i].concurrentKernels) out << "Yes\n"; diff --git a/lib/gpu/geryon/nvd_macros.h b/lib/gpu/geryon/nvd_macros.h index 8cedf5c212..1fa1ce9cf8 100644 --- a/lib/gpu/geryon/nvd_macros.h +++ b/lib/gpu/geryon/nvd_macros.h @@ -1,6 +1,6 @@ #ifndef NVD_MACROS_H #define NVD_MACROS_H - + #include #include #include @@ -18,6 +18,11 @@ #define NVD_GERYON_EXIT assert(0==1) #endif +#ifdef UCL_DEBUG +#define UCL_SYNC_DEBUG +#define UCL_DESTRUCT_CHECK +#endif + #ifndef UCL_NO_API_CHECK #define CU_SAFE_CALL_NS( call ) do { \ @@ -53,5 +58,17 @@ #endif +#ifdef UCL_DESTRUCT_CHECK + +#define CU_DESTRUCT_CALL( call) CU_SAFE_CALL( call) +#define CU_DESTRUCT_CALL_NS( call) CU_SAFE_CALL_NS( call) + +#else + +#define CU_DESTRUCT_CALL( call) call +#define CU_DESTRUCT_CALL_NS( call) call + +#endif + #endif diff --git a/lib/gpu/geryon/nvd_memory.h b/lib/gpu/geryon/nvd_memory.h index 41d656b8cf..2023e8586f 100644 --- a/lib/gpu/geryon/nvd_memory.h +++ b/lib/gpu/geryon/nvd_memory.h @@ -20,7 +20,7 @@ certain rights in this software. This software is distributed under the Simplified BSD License. ----------------------------------------------------------------------- */ - + #ifndef NVD_MEMORY_H #define NVD_MEMORY_H @@ -78,7 +78,7 @@ inline int _host_alloc(mat_type &mat, UCL_Device &dev, const size_t n, template inline void _host_free(mat_type &mat, const enum UCL_MEMOPT kind) { if (kind!=UCL_NOT_PINNED) - CU_SAFE_CALL(cuMemFreeHost(mat.begin())); + CU_DESTRUCT_CALL(cuMemFreeHost(mat.begin())); else free(mat.begin()); } @@ -134,7 +134,7 @@ inline int _device_alloc(mat_type &mat, UCL_Device &d, const size_t rows, template inline void _device_free(mat_type &mat) { - CU_SAFE_CALL(cuMemFree(mat.cbegin())); + CU_DESTRUCT_CALL(cuMemFree(mat.cbegin())); } inline void _device_view(CUdeviceptr *ptr, CUdeviceptr &in) { diff --git a/lib/gpu/geryon/nvd_timer.h b/lib/gpu/geryon/nvd_timer.h index e068e53d8f..59001c03fd 100644 --- a/lib/gpu/geryon/nvd_timer.h +++ b/lib/gpu/geryon/nvd_timer.h @@ -20,7 +20,7 @@ certain rights in this software. This software is distributed under the Simplified BSD License. ----------------------------------------------------------------------- */ - + #ifndef NVD_TIMER_H #define NVD_TIMER_H @@ -41,8 +41,8 @@ class UCL_Timer { /** \note init() must be called to reuse timer after a clear() **/ inline void clear() { if (_initialized) { - CU_SAFE_CALL(cuEventDestroy(start_event)); - CU_SAFE_CALL(cuEventDestroy(stop_event)); + CU_DESTRUCT_CALL(cuEventDestroy(start_event)); + CU_DESTRUCT_CALL(cuEventDestroy(stop_event)); _initialized=false; _total_time=0.0; } diff --git a/lib/gpu/geryon/ocl_device.h b/lib/gpu/geryon/ocl_device.h index 97cbf7f1e7..57f83b533e 100644 --- a/lib/gpu/geryon/ocl_device.h +++ b/lib/gpu/geryon/ocl_device.h @@ -20,7 +20,7 @@ certain rights in this software. This software is distributed under the Simplified BSD License. 
----------------------------------------------------------------------- */ - + #ifndef OCL_DEVICE #define OCL_DEVICE @@ -265,10 +265,10 @@ inline UCL_Device::UCL_Device() { inline UCL_Device::~UCL_Device() { if (_device>-1) { for (size_t i=0; i<_cq.size(); i++) { - CL_SAFE_CALL(clReleaseCommandQueue(_cq.back())); + CL_DESTRUCT_CALL(clReleaseCommandQueue(_cq.back())); _cq.pop_back(); } - CL_SAFE_CALL(clReleaseContext(_context)); + CL_DESTRUCT_CALL(clReleaseContext(_context)); } } diff --git a/lib/gpu/geryon/ocl_kernel.h b/lib/gpu/geryon/ocl_kernel.h index 4a02f848a6..8b8807f153 100644 --- a/lib/gpu/geryon/ocl_kernel.h +++ b/lib/gpu/geryon/ocl_kernel.h @@ -13,7 +13,7 @@ copyright : (C) 2010 by W. Michael Brown email : brownw@ornl.gov ***************************************************************************/ - + /* ----------------------------------------------------------------------- Copyright (2010) Sandia Corporation. Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains @@ -51,9 +51,9 @@ class UCL_Program { /** \note Must call init() after each clear **/ inline void clear() { if (_init_done) { - CL_SAFE_CALL(clReleaseProgram(_program)); - CL_SAFE_CALL(clReleaseContext(_context)); - CL_SAFE_CALL(clReleaseCommandQueue(_cq)); + CL_DESTRUCT_CALL(clReleaseProgram(_program)); + CL_DESTRUCT_CALL(clReleaseContext(_context)); + CL_DESTRUCT_CALL(clReleaseCommandQueue(_cq)); _init_done=false; } } diff --git a/lib/gpu/geryon/ocl_timer.h b/lib/gpu/geryon/ocl_timer.h index 072c2f212d..aafb0aac4b 100644 --- a/lib/gpu/geryon/ocl_timer.h +++ b/lib/gpu/geryon/ocl_timer.h @@ -20,7 +20,7 @@ certain rights in this software. This software is distributed under the Simplified BSD License. ----------------------------------------------------------------------- */ - + #ifndef OCL_TIMER_H #define OCL_TIMER_H @@ -41,7 +41,7 @@ class UCL_Timer { /** \note init() must be called to reuse timer after a clear() **/ inline void clear() { if (_initialized) { - CL_SAFE_CALL(clReleaseCommandQueue(_cq)); + CL_DESTRUCT_CALL(clReleaseCommandQueue(_cq)); clReleaseEvent(start_event); clReleaseEvent(stop_event); _initialized=false; diff --git a/lib/gpu/lj96_cut_gpu.cpp b/lib/gpu/lj96_cut_gpu.cpp index 6411b20853..24fb5d8570 100644 --- a/lib/gpu/lj96_cut_gpu.cpp +++ b/lib/gpu/lj96_cut_gpu.cpp @@ -45,7 +45,7 @@ bool lj96_gpu_init(const int ntypes, double **cutsq, double **host_lj1, LJ96MF.device->init_message(screen,"lj96/cut",first_gpu,last_gpu); bool message=false; - if (world_me==0 && screen) + if (LJ96MF.device->replica_me()==0 && screen) message=true; if (message) { @@ -61,14 +61,14 @@ bool lj96_gpu_init(const int ntypes, double **cutsq, double **host_lj1, return false; } - MPI_Barrier(MPI_COMM_WORLD); + LJ96MF.device->world_barrier(); if (message) fprintf(screen,"Done.\n"); for (int i=0; igpu_comm); + LJ96MF.device->gpu_barrier(); if (message) fprintf(screen,"Done.\n"); } diff --git a/lib/gpu/lj96_cut_gpu_kernel.cu b/lib/gpu/lj96_cut_gpu_kernel.cu index a1faec8f24..083060a7a1 100644 --- a/lib/gpu/lj96_cut_gpu_kernel.cu +++ b/lib/gpu/lj96_cut_gpu_kernel.cu @@ -91,14 +91,14 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1, if (iiinit_message(screen,"lj/cut",first_gpu,last_gpu); bool message=false; - if (world_me==0 && screen) + if (LJLMF.device->replica_me()==0 && screen) message=true; if (message) { @@ -62,14 +62,14 @@ bool ljl_gpu_init(const int ntypes, double **cutsq, return false; } - MPI_Barrier(MPI_COMM_WORLD); + LJLMF.device->world_barrier(); if 
(message) fprintf(screen,"Done.\n"); for (int i=0; igpu_comm); + LJLMF.device->gpu_barrier(); if (message) fprintf(screen,"Done.\n"); } diff --git a/lib/gpu/lj_cut_gpu_kernel.cu b/lib/gpu/lj_cut_gpu_kernel.cu index 5c784084c4..25d00a202a 100644 --- a/lib/gpu/lj_cut_gpu_kernel.cu +++ b/lib/gpu/lj_cut_gpu_kernel.cu @@ -91,14 +91,14 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1, if (iiinit_message(screen,"lj/cut/coul/cut",first_gpu,last_gpu); bool message=false; - if (world_me==0 && screen) + if (LJCMF.device->replica_me()==0 && screen) message=true; if (message) { @@ -65,14 +65,14 @@ bool ljc_gpu_init(const int ntypes, double **cutsq, double **host_lj1, return false; } - MPI_Barrier(MPI_COMM_WORLD); + LJCMF.device->world_barrier(); if (message) fprintf(screen,"Done.\n"); for (int i=0; igpu_comm); + LJCMF.device->gpu_barrier(); if (message) fprintf(screen,"Done.\n"); } diff --git a/lib/gpu/ljc_cut_gpu_kernel.cu b/lib/gpu/ljc_cut_gpu_kernel.cu index b6d9610f0d..fac49b87ea 100644 --- a/lib/gpu/ljc_cut_gpu_kernel.cu +++ b/lib/gpu/ljc_cut_gpu_kernel.cu @@ -106,15 +106,15 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1, sp_lj[7]=sp_lj_in[7]; if (iiinit_message(screen,"lj/cut/coul/long",first_gpu,last_gpu); bool message=false; - if (world_me==0 && screen) + if (LJCLMF.device->replica_me()==0 && screen) message=true; if (message) { @@ -66,14 +66,14 @@ bool ljcl_gpu_init(const int ntypes, double **cutsq, double **host_lj1, return false; } - MPI_Barrier(MPI_COMM_WORLD); + LJCLMF.device->world_barrier(); if (message) fprintf(screen,"Done.\n"); for (int i=0; igpu_comm); + LJCLMF.device->gpu_barrier(); if (message) fprintf(screen,"Done.\n"); } diff --git a/lib/gpu/ljcl_cut_gpu_kernel.cu b/lib/gpu/ljcl_cut_gpu_kernel.cu index 9e2fb3c230..be2ae069e4 100644 --- a/lib/gpu/ljcl_cut_gpu_kernel.cu +++ b/lib/gpu/ljcl_cut_gpu_kernel.cu @@ -114,15 +114,15 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1, sp_lj[7]=sp_lj_in[7]; if (ii PairGPUAtomT::PairGPUAtom() : _compiled(false),_allocated(false),_eflag(false), - _vflag(false),_inum(0),_ilist(NULL) { + _vflag(false),_inum(0),_ilist(NULL), + _newton(false) { #ifndef USE_OPENCL sort_config.op = CUDPP_ADD; sort_config.datatype = CUDPP_UINT; @@ -64,7 +65,13 @@ int PairGPUAtomT::bytes_per_atom() const { } template -bool PairGPUAtomT::alloc(const int max_atoms) { +bool PairGPUAtomT::alloc(const int inum, const int nall) { + _max_atoms=static_cast(static_cast(nall)*1.10); + if (_newton) + _max_local=_max_atoms; + else + _max_local=static_cast(static_cast(inum)*1.10); + bool success=true; int ans_elements=4; @@ -79,10 +86,10 @@ bool PairGPUAtomT::alloc(const int max_atoms) { // Allocate storage for CUDPP sort #ifndef USE_OPENCL #ifdef WINDLL - _win_sort_alloc(max_atoms); + _win_sort_alloc(_max_atoms); #else if (_gpu_nbor) { - CUDPPResult result = cudppPlan(&sort_plan, sort_config, max_atoms, 1, 0); + CUDPPResult result = cudppPlan(&sort_plan, sort_config, _max_atoms, 1, 0); if (CUDPP_SUCCESS != result) return false; } @@ -92,23 +99,23 @@ bool PairGPUAtomT::alloc(const int max_atoms) { // -------------------------- Host allocations // Get a host write only buffer #ifdef GPU_CAST - success=success && (host_x_cast.alloc(max_atoms*3,*dev, + success=success && (host_x_cast.alloc(_max_atoms*3,*dev, UCL_WRITE_OPTIMIZED)==UCL_SUCCESS); - success=success && (host_type_cast.alloc(max_atoms,*dev, + success=success && (host_type_cast.alloc(_max_atoms,*dev, UCL_WRITE_OPTIMIZED)==UCL_SUCCESS); #else - 
success=success && (host_x.alloc(max_atoms*4,*dev, + success=success && (host_x.alloc(_max_atoms*4,*dev, UCL_WRITE_OPTIMIZED)==UCL_SUCCESS); #endif - success=success && (host_ans.alloc(ans_elements*max_atoms,*dev)==UCL_SUCCESS); - success=success && (host_engv.alloc(_ev_fields*max_atoms,*dev)==UCL_SUCCESS); + success=success &&(host_ans.alloc(ans_elements*_max_local,*dev)==UCL_SUCCESS); + success=success &&(host_engv.alloc(_ev_fields*_max_local,*dev)==UCL_SUCCESS); // Buffer for casting only if different precisions if (_charge) - success=success && (host_q.alloc(max_atoms,*dev, + success=success && (host_q.alloc(_max_atoms,*dev, UCL_WRITE_OPTIMIZED)==UCL_SUCCESS); // Buffer for casting only if different precisions if (_rot) - success=success && (host_quat.alloc(max_atoms*4,*dev, + success=success && (host_quat.alloc(_max_atoms*4,*dev, UCL_WRITE_OPTIMIZED)==UCL_SUCCESS); @@ -128,43 +135,44 @@ bool PairGPUAtomT::alloc(const int max_atoms) { dev_q.view(host_q); } else { #ifdef GPU_CAST - success=success && (UCL_SUCCESS==dev_x.alloc(max_atoms*4,*dev)); + success=success && (UCL_SUCCESS==dev_x.alloc(_max_atoms*4,*dev)); success=success && (UCL_SUCCESS== - dev_x_cast.alloc(max_atoms*3,*dev,UCL_READ_ONLY)); + dev_x_cast.alloc(_max_atoms*3,*dev,UCL_READ_ONLY)); success=success && (UCL_SUCCESS== - dev_type_cast.alloc(max_atoms,*dev,UCL_READ_ONLY)); + dev_type_cast.alloc(_max_atoms,*dev,UCL_READ_ONLY)); _gpu_bytes+=dev_x_cast.row_bytes()+dev_type_cast.row_bytes(); #else success=success && (UCL_SUCCESS== - dev_x.alloc(max_atoms*4,*dev,UCL_READ_ONLY)); + dev_x.alloc(_max_atoms*4,*dev,UCL_READ_ONLY)); #endif - success=success && (dev_engv.alloc(_ev_fields*max_atoms,*dev, + success=success && (dev_engv.alloc(_ev_fields*_max_local,*dev, UCL_WRITE_ONLY)==UCL_SUCCESS); - success=success && (dev_ans.alloc(ans_elements*max_atoms, + success=success && (dev_ans.alloc(ans_elements*_max_local, *dev,UCL_WRITE_ONLY)==UCL_SUCCESS); if (_charge) { - success=success && (dev_q.alloc(max_atoms,*dev, + success=success && (dev_q.alloc(_max_atoms,*dev, UCL_READ_ONLY)==UCL_SUCCESS); _gpu_bytes+=dev_q.row_bytes(); } if (_rot) { - success=success && (dev_quat.alloc(max_atoms*4,*dev, + success=success && (dev_quat.alloc(_max_atoms*4,*dev, UCL_READ_ONLY)==UCL_SUCCESS); _gpu_bytes+=dev_quat.row_bytes(); } } if (_gpu_nbor) { - success=success && (dev_cell_id.alloc(max_atoms,*dev)==UCL_SUCCESS); - success=success && (dev_particle_id.alloc(max_atoms,*dev)==UCL_SUCCESS); + success=success && (dev_cell_id.alloc(_max_atoms,*dev)==UCL_SUCCESS); + success=success && (dev_particle_id.alloc(_max_atoms,*dev)==UCL_SUCCESS); _gpu_bytes+=dev_cell_id.row_bytes()+dev_particle_id.row_bytes(); if (_bonds) { - success=success && (dev_tag.alloc(max_atoms,*dev)==UCL_SUCCESS); + success=success && (dev_tag.alloc(_max_atoms,*dev)==UCL_SUCCESS); _gpu_bytes+=dev_tag.row_bytes(); } } _gpu_bytes+=dev_x.row_bytes()+dev_engv.row_bytes()+dev_ans.row_bytes(); - + + _allocated=true; return success; } @@ -188,14 +196,13 @@ bool PairGPUAtomT::init(const int inum, const int nall, const bool charge, _ev_fields=6+_e_fields; // Initialize atom and nbor data - int max_local=static_cast(static_cast(inum)*1.10); - if (max_local==0) - max_local=1000; - if (nall<=inum) - _max_atoms=max_local*2; - else - _max_atoms=static_cast(static_cast(nall)*1.10); - + int ef_inum=inum; + if (ef_inum==0) + ef_inum=1000; + int ef_nall=nall; + if (ef_nall<=ef_inum) + ef_nall=ef_inum*2; + // Initialize timers for the selected device time_pos.init(*dev); time_other.init(*dev); @@ -209,8 
+216,7 @@ bool PairGPUAtomT::init(const int inum, const int nall, const bool charge, compile_kernels(*dev); #endif - _allocated=true; - return success && alloc(_max_atoms); + return success && alloc(ef_inum,ef_nall); } template @@ -285,7 +291,7 @@ double PairGPUAtomT::host_memory_usage() const { atom_bytes+=4; int ans_bytes=atom_bytes+_ev_fields; return _max_atoms*atom_bytes*sizeof(numtyp)+ - ans_bytes*(_max_atoms)*sizeof(acctyp)+ + ans_bytes*(_max_local)*sizeof(acctyp)+ sizeof(PairGPUAtom); } diff --git a/lib/gpu/pair_gpu_atom.h b/lib/gpu/pair_gpu_atom.h index 7cec73f98c..e0a1fd9fb1 100644 --- a/lib/gpu/pair_gpu_atom.h +++ b/lib/gpu/pair_gpu_atom.h @@ -77,11 +77,9 @@ class PairGPUAtom { inline bool resize(const int inum, const int nall, bool &success) { _inum=inum; _nall=nall; - if (nall>_max_atoms) { + if (inum>_max_local || nall>_max_atoms) { clear_resize(); - _max_atoms=static_cast(static_cast(nall)*1.10); - _allocated=true; - success = success && alloc(_max_atoms); + success = success && alloc(inum,nall); return true; } return false; @@ -203,6 +201,19 @@ class PairGPUAtom { ucl_copy(dev_v,view,false); } + /// Pack LAMMPS atom "self" type constants into 2 vectors and copy to device + template + inline void self_pack2(const int n, UCL_D_Vec &dev_v, + UCL_H_Vec &buffer, t1 **one, t2 **two) { + for (int i=0; i(one[i][i]); + buffer[i*2+1]=static_cast(two[i][i]); + } + UCL_H_Vec view; + view.view((dev_typ*)buffer.begin(),n,*dev); + ucl_copy(dev_v,view,false); + } + // -------------------------COPY TO GPU ---------------------------------- /// Cast positions and types to write buffer @@ -386,15 +397,17 @@ class PairGPUAtom { bool _compiled; - bool alloc(const int max_atoms); + bool alloc(const int inum, const int nall); bool _allocated, _eflag, _vflag, _ef_atom, _vf_atom, _rot, _charge, _other; - int _max_atoms, _nall, _inum, _e_fields, _ev_fields; + int _max_local, _max_atoms, _nall, _inum, _e_fields, _ev_fields; bool _gpu_nbor, _bonds; int *_ilist; double _time_cast; double _gpu_bytes; + + bool _newton; #ifndef USE_OPENCL CUDPPConfiguration sort_config; diff --git a/lib/gpu/pair_gpu_balance.h b/lib/gpu/pair_gpu_balance.h index 5d7b74dac9..a3a0f61a62 100644 --- a/lib/gpu/pair_gpu_balance.h +++ b/lib/gpu/pair_gpu_balance.h @@ -65,11 +65,9 @@ class PairGPUBalance { inline double all_avg_split() { if (_load_balance) { double _all_avg_split=0.0; - int nprocs; - MPI_Comm_size(MPI_COMM_WORLD,&nprocs); MPI_Reduce(&_avg_split,&_all_avg_split,1,MPI_DOUBLE,MPI_SUM,0, - MPI_COMM_WORLD); - _all_avg_split/=nprocs; + _device->replica()); + _all_avg_split/=_device->replica_size(); return _all_avg_split/_avg_count; } else return _actual_split; @@ -83,10 +81,10 @@ class PairGPUBalance { inline void start_timer() { if (_measure_this_step) { _device->gpu->sync(); - MPI_Barrier(_device->gpu_comm); + _device->gpu_barrier(); _device_time.start(); _device->gpu->sync(); - MPI_Barrier(_device->gpu_comm); + _device->gpu_barrier(); _device->start_host_timer(); } } @@ -178,7 +176,7 @@ void PairGPUBalanceT::balance(const double cpu_time, const bool gpu_nbor) { cpu_gpu_time[2]=(_device->host_time()-cpu_time)/_inum_full; MPI_Allreduce(cpu_gpu_time,max_times,3,MPI_DOUBLE,MPI_MAX, - _device->gpu_comm); + _device->gpu_comm()); double split=(max_times[0]+max_times[2])/(max_times[0]+max_times[1]); split*=_HD_BALANCE_GAP; diff --git a/lib/gpu/pair_gpu_device.cpp b/lib/gpu/pair_gpu_device.cpp index ab03e814b6..c2d980cf99 100644 --- a/lib/gpu/pair_gpu_device.cpp +++ b/lib/gpu/pair_gpu_device.cpp @@ -10,7 +10,7 @@ See the README 
file in the top-level LAMMPS directory. ------------------------------------------------------------------------- */ - + /* ---------------------------------------------------------------------- Contributing authors: Mike Brown (ORNL), brownw@ornl.gov ------------------------------------------------------------------------- */ @@ -34,19 +34,28 @@ PairGPUDeviceT::~PairGPUDevice() { } template -bool PairGPUDeviceT::init_device(const int first_gpu, const int last_gpu, - const int gpu_mode, const double p_split) { +bool PairGPUDeviceT::init_device(MPI_Comm world, MPI_Comm replica, + const int first_gpu, const int last_gpu, + const int gpu_mode, const double p_split, + const int nthreads) { + _nthreads=nthreads; + if (_device_init) return true; _device_init=true; + _comm_world=world; + _comm_replica=replica; _first_device=first_gpu; _last_device=last_gpu; _gpu_mode=gpu_mode; _particle_split=p_split; - // Get the rank within the world - MPI_Comm_rank(MPI_COMM_WORLD,&_world_me); - MPI_Comm_size(MPI_COMM_WORLD,&_world_size); + // Get the rank/size within the world + MPI_Comm_rank(_comm_world,&_world_me); + MPI_Comm_size(_comm_world,&_world_size); + // Get the rank/size within the replica + MPI_Comm_rank(_comm_replica,&_replica_me); + MPI_Comm_size(_comm_replica,&_replica_size); // Get the names of all nodes int name_length; @@ -54,7 +63,7 @@ bool PairGPUDeviceT::init_device(const int first_gpu, const int last_gpu, char node_names[MPI_MAX_PROCESSOR_NAME*_world_size]; MPI_Get_processor_name(node_name,&name_length); MPI_Allgather(&node_name,MPI_MAX_PROCESSOR_NAME,MPI_CHAR,&node_names, - MPI_MAX_PROCESSOR_NAME,MPI_CHAR,MPI_COMM_WORLD); + MPI_MAX_PROCESSOR_NAME,MPI_CHAR,_comm_world); std::string node_string=std::string(node_name); // Get the number of procs per node @@ -80,7 +89,7 @@ bool PairGPUDeviceT::init_device(const int first_gpu, const int last_gpu, // Set up a per node communicator and find rank within MPI_Comm node_comm; - MPI_Comm_split(MPI_COMM_WORLD, split_id, 0, &node_comm); + MPI_Comm_split(_comm_world, split_id, 0, &node_comm); int node_rank; MPI_Comm_rank(node_comm,&node_rank); @@ -90,8 +99,8 @@ bool PairGPUDeviceT::init_device(const int first_gpu, const int last_gpu, int my_gpu=node_rank/_procs_per_gpu; // Set up a per device communicator - MPI_Comm_split(node_comm,my_gpu,0,&gpu_comm); - MPI_Comm_rank(gpu_comm,&_gpu_rank); + MPI_Comm_split(node_comm,my_gpu,0,&_comm_gpu); + MPI_Comm_rank(_comm_gpu,&_gpu_rank); gpu=new UCL_Device(); if (my_gpu>=gpu->num_devices()) @@ -111,10 +120,13 @@ bool PairGPUDeviceT::init(const bool charge, const bool rot, const int nlocal, return false; if (_init_count==0) { // Initialize atom and nbor data - if (!atom.init(nlocal,nall,charge,rot,*gpu,gpu_nbor, + int ef_nlocal=nlocal; + if (_particle_split<1.0 && _particle_split>0.0) + ef_nlocal=static_cast(_particle_split*nlocal); + if (!atom.init(ef_nlocal,nall,charge,rot,*gpu,gpu_nbor, gpu_nbor && maxspecial>0)) return false; - if (!nbor.init(nlocal,host_nlocal,max_nbors,maxspecial,*gpu,gpu_nbor, + if (!nbor.init(ef_nlocal,host_nlocal,max_nbors,maxspecial,*gpu,gpu_nbor, gpu_host,pre_cut)) return false; nbor.cell_size(cell_size); @@ -136,7 +148,7 @@ void PairGPUDeviceT::init_message(FILE *screen, const char *name, std::string fs=toa(gpu->free_gigabytes())+"/"; #endif - if (_world_me == 0 && screen) { + if (_replica_me == 0 && screen) { fprintf(screen,"\n-------------------------------------"); fprintf(screen,"-------------------------------------\n"); fprintf(screen,"- Using GPGPU acceleration for %s:\n",name); 
@@ -175,14 +187,14 @@ void PairGPUDeviceT::output_times(UCL_Timer &time_pair, const double avg_split, single[3]=time_pair.total_seconds(); single[4]=atom.cast_time(); - MPI_Reduce(single,times,5,MPI_DOUBLE,MPI_SUM,0,MPI_COMM_WORLD); + MPI_Reduce(single,times,5,MPI_DOUBLE,MPI_SUM,0,_comm_replica); double my_max_bytes=max_bytes; double mpi_max_bytes; - MPI_Reduce(&my_max_bytes,&mpi_max_bytes,1,MPI_DOUBLE,MPI_MAX,0,MPI_COMM_WORLD); + MPI_Reduce(&my_max_bytes,&mpi_max_bytes,1,MPI_DOUBLE,MPI_MAX,0,_comm_replica); double max_mb=mpi_max_bytes/(1024.0*1024.0); - if (world_me()==0) + if (replica_me()==0) if (screen && times[3]>0.0) { fprintf(screen,"\n\n-------------------------------------"); fprintf(screen,"--------------------------------\n"); @@ -191,14 +203,14 @@ void PairGPUDeviceT::output_times(UCL_Timer &time_pair, const double avg_split, fprintf(screen,"--------------------------------\n"); if (procs_per_gpu()==1) { - fprintf(screen,"Data Transfer: %.4f s.\n",times[0]/_world_size); - fprintf(screen,"Data Cast/Pack: %.4f s.\n",times[4]/_world_size); - fprintf(screen,"Neighbor copy: %.4f s.\n",times[1]/_world_size); + fprintf(screen,"Data Transfer: %.4f s.\n",times[0]/_replica_size); + fprintf(screen,"Data Cast/Pack: %.4f s.\n",times[4]/_replica_size); + fprintf(screen,"Neighbor copy: %.4f s.\n",times[1]/_replica_size); if (nbor.gpu_nbor()) - fprintf(screen,"Neighbor build: %.4f s.\n",times[2]/_world_size); + fprintf(screen,"Neighbor build: %.4f s.\n",times[2]/_replica_size); else - fprintf(screen,"Neighbor unpack: %.4f s.\n",times[2]/_world_size); - fprintf(screen,"Force calc: %.4f s.\n",times[3]/_world_size); + fprintf(screen,"Neighbor unpack: %.4f s.\n",times[2]/_replica_size); + fprintf(screen,"Force calc: %.4f s.\n",times[3]/_replica_size); } fprintf(screen,"Average split: %.4f.\n",avg_split); fprintf(screen,"Max Mem / Proc: %.2f MB.\n",max_mb); @@ -239,10 +251,11 @@ double PairGPUDeviceT::host_memory_usage() const { template class PairGPUDevice; PairGPUDevice pair_gpu_device; -bool lmp_init_device(const int first_gpu, const int last_gpu, - const int gpu_mode, const double particle_split) { - return pair_gpu_device.init_device(first_gpu,last_gpu,gpu_mode, - particle_split); +bool lmp_init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu, + const int last_gpu, const int gpu_mode, + const double particle_split, const int nthreads) { + return pair_gpu_device.init_device(world,replica,first_gpu,last_gpu,gpu_mode, + particle_split,nthreads); } void lmp_clear_device() { @@ -261,3 +274,4 @@ double lmp_gpu_forces(double **f, double **tor, double *eatom, } return 0.0; } + diff --git a/lib/gpu/pair_gpu_device.h b/lib/gpu/pair_gpu_device.h index ed2baa0cc6..33aa54959b 100644 --- a/lib/gpu/pair_gpu_device.h +++ b/lib/gpu/pair_gpu_device.h @@ -10,7 +10,7 @@ See the README file in the top-level LAMMPS directory. 
------------------------------------------------------------------------- */ - + /* ---------------------------------------------------------------------- Contributing authors: Mike Brown (ORNL), brownw@ornl.gov ------------------------------------------------------------------------- */ @@ -34,8 +34,9 @@ class PairGPUDevice { /// Initialize the device for use by this process /** Sets up a per-device MPI communicator for load balancing and initializes * the device (>=first_gpu and <=last_gpu) that this proc will be using **/ - bool init_device(const int first_gpu, const int last_gpu, - const int gpu_mode, const double particle_split); + bool init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu, + const int last_gpu, const int gpu_mode, + const double particle_split, const int nthreads); /// Initialize the device for Atom and Neighbor storage /** \param rot True if quaternions need to be stored @@ -83,12 +84,26 @@ class PairGPUDevice { /// Return the number of procs sharing a device (size of device commincator) inline int procs_per_gpu() const { return _procs_per_gpu; } - /// Return my rank in the device communicator - inline int gpu_rank() const { return _gpu_rank; } + /// Return the number of threads per proc + inline int num_threads() const { return _nthreads; } /// My rank within all processes inline int world_me() const { return _world_me; } /// Total number of processes inline int world_size() const { return _world_size; } + /// MPI Barrier for world + inline void world_barrier() { MPI_Barrier(_comm_world); } + /// Return the replica MPI communicator + inline MPI_Comm & replica() { return _comm_replica; } + /// My rank within replica communicator + inline int replica_me() const { return _replica_me; } + /// Number of procs in replica communicator + inline int replica_size() const { return _replica_size; } + /// Return the per-GPU MPI communicator + inline MPI_Comm & gpu_comm() { return _comm_gpu; } + /// Return my rank in the device communicator + inline int gpu_rank() const { return _gpu_rank; } + /// MPI Barrier for gpu + inline void gpu_barrier() { MPI_Barrier(_comm_gpu); } /// Return the 'mode' for acceleration: GPU_FORCE or GPU_NEIGH inline int gpu_mode() const { return _gpu_mode; } /// Index of first device used by a node @@ -104,8 +119,6 @@ class PairGPUDevice { /// Geryon Device UCL_Device *gpu; - /// Device communicator - MPI_Comm gpu_comm; enum{GPU_FORCE, GPU_NEIGH}; @@ -122,8 +135,10 @@ class PairGPUDevice { private: int _init_count; bool _device_init; - int _procs_per_gpu, _gpu_rank, _world_me, _world_size; - int _gpu_mode, _first_device, _last_device; + MPI_Comm _comm_world, _comm_replica, _comm_gpu; + int _procs_per_gpu, _gpu_rank, _world_me, _world_size, _replica_me, + _replica_size; + int _gpu_mode, _first_device, _last_device, _nthreads; double _particle_split; double _cpu_full; diff --git a/lib/gpu/pair_gpu_precision.h b/lib/gpu/pair_gpu_precision.h index 554466a2fa..a5f57c1f95 100644 --- a/lib/gpu/pair_gpu_precision.h +++ b/lib/gpu/pair_gpu_precision.h @@ -85,6 +85,7 @@ inline std::ostream & operator<<(std::ostream &out, const _lgpu_double4 &v) { #endif #define MAX_SHARED_TYPES 8 +#define MAX_BIO_SHARED_TYPES 128 enum{SPHERE_SPHERE,SPHERE_ELLIPSE,ELLIPSE_SPHERE,ELLIPSE_ELLIPSE}; #endif
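---
Note (not part of the patch): the core change above is that the GPU library stops using MPI_COMM_WORLD directly and instead takes "world" and "replica" communicators in init_device(), then derives a per-node and a per-GPU communicator for world_barrier()/gpu_barrier() and the timing reductions. The following is a minimal standalone sketch of that communicator layout, under stated assumptions: it is not the patch's code, ngpus_per_node is a made-up example value, and MPI_Comm_split_type (MPI-3) is used as a shortcut where the patch gathers processor names with MPI_Allgather to group ranks by node.

```cpp
// comm_layout_sketch.cpp -- illustrative only; mirrors the world -> node -> GPU
// communicator splitting pattern added in PairGPUDevice::init_device().
#include <mpi.h>
#include <cstdio>

int main(int argc, char **argv) {
  MPI_Init(&argc, &argv);

  MPI_Comm world = MPI_COMM_WORLD;     // would be passed to lmp_init_device() as "world"
  int world_me, world_size;
  MPI_Comm_rank(world, &world_me);
  MPI_Comm_size(world, &world_size);

  // Group ranks that share a node.  The patch does this by gathering
  // MPI_Get_processor_name() strings; the sketch uses MPI_Comm_split_type.
  MPI_Comm node_comm;
  MPI_Comm_split_type(world, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &node_comm);
  int node_rank, node_size;
  MPI_Comm_rank(node_comm, &node_rank);
  MPI_Comm_size(node_comm, &node_size);

  // Assign each rank to a device and split again to obtain the per-GPU
  // communicator used by gpu_barrier() and the load-balance reductions.
  const int ngpus_per_node = 2;                 // assumption for the example
  int procs_per_gpu = node_size / ngpus_per_node;
  if (procs_per_gpu < 1) procs_per_gpu = 1;
  int my_gpu = node_rank / procs_per_gpu;

  MPI_Comm gpu_comm;
  MPI_Comm_split(node_comm, my_gpu, 0, &gpu_comm);
  int gpu_rank;
  MPI_Comm_rank(gpu_comm, &gpu_rank);

  printf("world %d/%d  node_rank %d  gpu %d  gpu_rank %d\n",
         world_me, world_size, node_rank, my_gpu, gpu_rank);

  MPI_Barrier(gpu_comm);                        // analogue of device->gpu_barrier()

  MPI_Comm_free(&gpu_comm);
  MPI_Comm_free(&node_comm);
  MPI_Finalize();
  return 0;
}
```

In the patch itself the "replica" communicator (LAMMPS's per-replica world) additionally replaces MPI_COMM_WORLD in the MPI_Reduce calls that accumulate timing and memory statistics, so output is reported per replica rather than across all replicas.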