diff --git a/lib/gpu/Makefile.cyg b/lib/gpu/Makefile.cyg deleted file mode 100644 index 3d2294b1a2..0000000000 --- a/lib/gpu/Makefile.cyg +++ /dev/null @@ -1,72 +0,0 @@ -# /* ---------------------------------------------------------------------- -# LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator -# http://lammps.sandia.gov, Sandia National Laboratories -# Steve Plimpton, sjplimp@sandia.gov -# -# Copyright (2003) Sandia Corporation. Under the terms of Contract -# DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains -# certain rights in this software. This software is distributed under -# the GNU General Public License. -# -# See the README file in the top-level LAMMPS directory. -# ------------------------------------------------------------------------- */ -# -# /* ---------------------------------------------------------------------- -# Contributing authors: Mike Brown (SNL), wmbrown@sandia.gov -# Peng Wang (Nvidia), penwang@nvidia.com -# Paul Crozier (SNL), pscrozi@sandia.gov -# ------------------------------------------------------------------------- */ - -BIN_DIR = . -OBJ_DIR = . -AR = ar -CUDA_CPP = /cygdrive/c/CUDA/bin/nvcc -I/cygdrive/c/CUDA/include -O3 -DWINDLL -DUNIX -Xptxas -v --use_fast_math -CUDA_ARCH = -arch=sm_13 -CUDA_PREC = -D_SINGLE_SINGLE -CUDA_LINK = -L/cygdrive/c/CUDA/lib -lcudart $(CUDA_LIB) - -CUDA = $(CUDA_CPP) $(CUDA_ARCH) $(CUDA_PREC) - -CUDA_LIB = $(OBJ_DIR)/gpu.dll - -# Headers for CUDA Stuff -NVC_H = nvc_macros.h nvc_device.h nvc_timer.h nvc_memory.h nvc_traits.h -# Headers for Pair Stuff -PAIR_H = pair_gpu_texture.h pair_gpu_atom.h pair_gpu_nbor.h pair_gpu_cell.h -# Dependencies for the Texture Tar -TAR_H = $(NVC_H) $(PAIR_H) pair_gpu_atom.cu lj_gpu_memory.h lj_gpu_memory.cu \ - lj_gpu_kernel.h lj_gpu.cu gb_gpu_memory.h gb_gpu_memory.cu \ - gb_gpu_extra.h gb_gpu_kernel.h gb_gpu.cu - -ALL_H = $(NVC_H) $(PAIR_H) - -EXECS = $(BIN_DIR)/nvc_get_devices -OBJS = $(OBJ_DIR)/nvc_device.obj $(OBJ_DIR)/pair_gpu_nbor.obj \ - $(OBJ_DIR)/pair_tex_tar.obj $(OBJ_DIR)/pair_gpu_cell.obj - -all: $(CUDA_LIB) $(EXECS) - -$(OBJ_DIR)/nvc_device.obj : nvc_device.cu $(NVC_H) - $(CUDA) -o $@ -c nvc_device.cu - -$(OBJ_DIR)/pair_gpu_nbor.obj: pair_gpu_nbor.cu pair_gpu_texture.h pair_gpu_nbor.h $(NVC_H) - $(CUDA) -o $@ -c pair_gpu_nbor.cu - -$(OBJ_DIR)/pair_tex_tar.obj: $(TAR_H) - $(CUDA) -o $@ -c pair_tex_tar.cu - -$(OBJ_DIR)/pair_gpu_cell.obj: pair_gpu_cell.cu pair_gpu_cell.h lj_gpu_memory.h - $(CUDA) -o $@ -c pair_gpu_cell.cu - -$(BIN_DIR)/nvc_get_devices: nvc_get_devices.cu $(NVC_H) $(OBJ_DIR)/nvc_device.obj - $(CUDA) -o $@ nvc_get_devices.cu $(CUDALNK) $(OBJ_DIR)/nvc_device.obj - -$(CUDA_LIB): $(OBJS) $(TAR_H) - $(CUDA) -o $@ -shared $(OBJS) - -clean: - rm -rf $(EXECS) $(CUDA_LIB) $(OBJS) *.exe *.exp *.lib *.dll *.linkinfo - -veryclean: clean - rm -rf *~ *.linkinfo - diff --git a/lib/gpu/Makefile.fermi b/lib/gpu/Makefile.fermi new file mode 100644 index 0000000000..d292bcfc30 --- /dev/null +++ b/lib/gpu/Makefile.fermi @@ -0,0 +1,39 @@ +# /* ---------------------------------------------------------------------- +# LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator +# http://lammps.sandia.gov, Sandia National Laboratories +# Steve Plimpton, sjplimp@sandia.gov +# +# Copyright (2003) Sandia Corporation. Under the terms of Contract +# DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains +# certain rights in this software. This software is distributed under +# the GNU General Public License. 
+# +# See the README file in the top-level LAMMPS directory. +# ------------------------------------------------------------------------- */ +# +# /* ---------------------------------------------------------------------- +# Contributing authors: Mike Brown (ORNL), brownw@ornl.gov +# Peng Wang (Nvidia), penwang@nvidia.com +# Paul Crozier (SNL), pscrozi@sandia.gov +# ------------------------------------------------------------------------- */ + +CUDA_HOME = $(HOME)/cuda +NVCC = $(CUDA_HOME)/bin/nvcc + +CUDA_ARCH = -arch=sm_13 +CUDA_PRECISION = -D_SINGLE_DOUBLE +CUDA_INCLUDE = -I$(CUDA_HOME)/include +CUDA_LIB = -L$(CUDA_HOME)/lib64 -Xlinker -rpath -Xlinker $(CUDA_HOME)/lib64 +CUDA_OPTS = -DUNIX -O3 -Xptxas -v --use_fast_math + +CUDR_CPP = mpic++ -DMPI_GERYON -I$(CUDA_HOME)/include +CUDR_OPTS = -O3 -ffast-math -funroll-loops -DMPI_GERYON + +BIN_DIR = ./ +OBJ_DIR = ./obj +LIB_DIR = ./ +AR = ar +BSH = /bin/sh + +include Nvidia.makefile + diff --git a/lib/gpu/Makefile.lens b/lib/gpu/Makefile.lens new file mode 100644 index 0000000000..ceec99df7f --- /dev/null +++ b/lib/gpu/Makefile.lens @@ -0,0 +1,39 @@ +# /* ---------------------------------------------------------------------- +# LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator +# http://lammps.sandia.gov, Sandia National Laboratories +# Steve Plimpton, sjplimp@sandia.gov +# +# Copyright (2003) Sandia Corporation. Under the terms of Contract +# DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains +# certain rights in this software. This software is distributed under +# the GNU General Public License. +# +# See the README file in the top-level LAMMPS directory. +# ------------------------------------------------------------------------- */ +# +# /* ---------------------------------------------------------------------- +# Contributing authors: Mike Brown (ORNL), brownw@ornl.gov +# Peng Wang (Nvidia), penwang@nvidia.com +# Paul Crozier (SNL), pscrozi@sandia.gov +# ------------------------------------------------------------------------- */ + +CUDA_HOME = /sw/analysis-x64/cuda/3.0/sl5.0_binary/ +NVCC = nvcc + +CUDA_ARCH = -arch=sm_13 +CUDA_PRECISION = -D_SINGLE_SINGLE +CUDA_INCLUDE = -I$(CUDA_HOME)/include +CUDA_LIB = -L$(CUDA_HOME)/lib64 +CUDA_OPTS = -DUNIX -O3 -Xptxas -v --use_fast_math + +CUDR_CPP = mpic++ -DMPI_GERYON +CUDR_OPTS = -O2 -xSSE2 -ip -use-intel-optimized-headers -fno-alias + +BIN_DIR = ./ +OBJ_DIR = ./obj +LIB_DIR = ./ +AR = ar +BSH = /bin/sh + +include Nvidia.makefile + diff --git a/lib/gpu/Makefile.lincoln b/lib/gpu/Makefile.lincoln new file mode 100644 index 0000000000..c181fb08fb --- /dev/null +++ b/lib/gpu/Makefile.lincoln @@ -0,0 +1,36 @@ +# /* ---------------------------------------------------------------------- +# LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator +# http://lammps.sandia.gov, Sandia National Laboratories +# Steve Plimpton, sjplimp@sandia.gov +# +# Copyright (2003) Sandia Corporation. Under the terms of Contract +# DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains +# certain rights in this software. This software is distributed under +# the GNU General Public License. +# +# See the README file in the top-level LAMMPS directory. +# ------------------------------------------------------------------------- */ +# +# /* ---------------------------------------------------------------------- +# Makefile for NCSA's lincoln GPU cluster. 
Tested with "soft +cuda-2.3" +# ------------------------------------------------------------------------- */ + +CUDA_HOME = /usr/local/cuda-2.3 +NVCC = $(CUDA_HOME)/bin/nvcc + +CUDA_ARCH = -arch=sm_13 +CUDA_PRECISION = -D_SINGLE_SINGLE +CUDA_INCLUDE = -I$(CUDA_HOME)/include +CUDA_LIB = -L$(CUDA_HOME)/lib64 -Wl,-rpath,$(CUDA_HOME)/lib64 +CUDA_OPTS = -DUNIX -O3 -Xptxas -v --use_fast_math + +CUDR_CPP = mpic++ -DMPI_GERYON +CUDR_OPTS = -O3 -DMPI_GERYON -ffast-math -funroll-loops + +BIN_DIR = ./ +OBJ_DIR = ./obj +LIB_DIR = ./ +AR = ar + +include Nvidia.makefile + diff --git a/lib/gpu/Makefile.linux b/lib/gpu/Makefile.linux new file mode 100644 index 0000000000..43ee31bdf9 --- /dev/null +++ b/lib/gpu/Makefile.linux @@ -0,0 +1,39 @@ +# /* ---------------------------------------------------------------------- +# LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator +# http://lammps.sandia.gov, Sandia National Laboratories +# Steve Plimpton, sjplimp@sandia.gov +# +# Copyright (2003) Sandia Corporation. Under the terms of Contract +# DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains +# certain rights in this software. This software is distributed under +# the GNU General Public License. +# +# See the README file in the top-level LAMMPS directory. +# ------------------------------------------------------------------------- */ +# +# /* ---------------------------------------------------------------------- +# Contributing authors: Mike Brown (ORNL), brownw@ornl.gov +# Peng Wang (Nvidia), penwang@nvidia.com +# Paul Crozier (SNL), pscrozi@sandia.gov +# ------------------------------------------------------------------------- */ + +CUDA_HOME = /usr/local/cuda +NVCC = nvcc + +CUDA_ARCH = -arch=sm_13 +CUDA_PRECISION = -D_SINGLE_SINGLE +CUDA_INCLUDE = -I$(CUDA_HOME)/include +CUDA_LIB = -L$(CUDA_HOME)/lib64 +CUDA_OPTS = -DUNIX -O3 -Xptxas -v --use_fast_math + +CUDR_CPP = mpic++ -DMPI_GERYON -DMPICH_IGNORE_CXX_SEEK +CUDR_OPTS = -O2 # -xHost -no-prec-div -ansi-alias + +BIN_DIR = ./ +OBJ_DIR = ./obj +LIB_DIR = ./ +AR = ar +BSH = /bin/sh + +include Nvidia.makefile + diff --git a/lib/gpu/Makefile.linux_opencl b/lib/gpu/Makefile.linux_opencl new file mode 100644 index 0000000000..44b5d5aa2d --- /dev/null +++ b/lib/gpu/Makefile.linux_opencl @@ -0,0 +1,31 @@ +# /* ---------------------------------------------------------------------- +# LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator +# http://lammps.sandia.gov, Sandia National Laboratories +# Steve Plimpton, sjplimp@sandia.gov +# +# Copyright (2003) Sandia Corporation. Under the terms of Contract +# DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains +# certain rights in this software. This software is distributed under +# the GNU General Public License. +# +# See the README file in the top-level LAMMPS directory. 
+# ------------------------------------------------------------------------- */ +# +# /* ---------------------------------------------------------------------- +# Contributing authors: Mike Brown (ORNL), brownw@ornl.gov +# Peng Wang (Nvidia), penwang@nvidia.com +# Paul Crozier (SNL), pscrozi@sandia.gov +# ------------------------------------------------------------------------- */ + +OCL_CPP = mpic++ -I./geryon/opencl -O3 -DMPI_GERYON -DMPICH_IGNORE_CXX_SEEK +OCL_LINK = -lOpenCL +OCL_PREC = -D_SINGLE_SINGLE + +BIN_DIR = ./ +OBJ_DIR = ./ocl_obj +LIB_DIR = ./ +AR = ar +BSH = /bin/sh + +include Opencl.makefile + diff --git a/lib/gpu/Makefile.longhorn b/lib/gpu/Makefile.longhorn new file mode 100644 index 0000000000..33a02562cf --- /dev/null +++ b/lib/gpu/Makefile.longhorn @@ -0,0 +1,35 @@ +# /* ---------------------------------------------------------------------- +# LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator +# http://lammps.sandia.gov, Sandia National Laboratories +# Steve Plimpton, sjplimp@sandia.gov +# +# Copyright (2003) Sandia Corporation. Under the terms of Contract +# DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains +# certain rights in this software. This software is distributed under +# the GNU General Public License. +# +# See the README file in the top-level LAMMPS directory. +# ------------------------------------------------------------------------- */ +# +# /* ---------------------------------------------------------------------- +# Makefile for the TACC longhorn cluster. Use "module load cuda". +# ------------------------------------------------------------------------- */ + +CUDA_HOME = $(TACC_CUDA_DIR) +NVCC = nvcc +CUDA_ARCH = -arch=sm_13 +CUDA_PRECISION = -D_SINGLE_SINGLE +CUDA_INCLUDE = -I$(CUDA_HOME)/include +CUDA_LIB = -L$(TACC_CUDA_LIB) -Wl,-rpath,$(TACC_CUDA_LIB) +CUDA_OPTS = -DUNIX -O3 -Xptxas -v --use_fast_math + +CUDR_CPP = mpicxx -DMPI_GERYON -DMPICH_IGNORE_CXX_SEEK +CUDR_OPTS = -O2 # -xHost -no-prec-div -ansi-alias + +BIN_DIR = ./ +OBJ_DIR = ./obj +LIB_DIR = ./ +AR = ar + +include Nvidia.makefile + diff --git a/lib/gpu/Makefile.mac b/lib/gpu/Makefile.mac new file mode 100644 index 0000000000..f16fe197bc --- /dev/null +++ b/lib/gpu/Makefile.mac @@ -0,0 +1,39 @@ +# /* ---------------------------------------------------------------------- +# LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator +# http://lammps.sandia.gov, Sandia National Laboratories +# Steve Plimpton, sjplimp@sandia.gov +# +# Copyright (2003) Sandia Corporation. Under the terms of Contract +# DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains +# certain rights in this software. This software is distributed under +# the GNU General Public License. +# +# See the README file in the top-level LAMMPS directory. 
+# ------------------------------------------------------------------------- */ +# +# /* ---------------------------------------------------------------------- +# Contributing authors: Mike Brown (ORNL), brownw@ornl.gov +# Peng Wang (Nvidia), penwang@nvidia.com +# Paul Crozier (SNL), pscrozi@sandia.gov +# ------------------------------------------------------------------------- */ + +CUDA_HOME = /usr/local/cuda +NVCC = nvcc + +CUDA_ARCH = -arch=sm_11 +CUDA_PRECISION = -D_SINGLE_SINGLE +CUDA_INCLUDE = -I$(CUDA_HOME)/include +CUDA_LIB = -L$(CUDA_HOME)/lib +CUDA_OPTS = -DUNIX -O3 -Xptxas -v --use_fast_math -m32 + +CUDR_CPP = mpic++ +CUDR_OPTS = -O2 -m32 -g + +BIN_DIR = ./ +OBJ_DIR = ./obj +LIB_DIR = ./ +AR = ar +BSH = /bin/sh + +include Nvidia.makefile + diff --git a/lib/gpu/Makefile.mac_opencl b/lib/gpu/Makefile.mac_opencl new file mode 100644 index 0000000000..dae41dd3ad --- /dev/null +++ b/lib/gpu/Makefile.mac_opencl @@ -0,0 +1,31 @@ +# /* ---------------------------------------------------------------------- +# LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator +# http://lammps.sandia.gov, Sandia National Laboratories +# Steve Plimpton, sjplimp@sandia.gov +# +# Copyright (2003) Sandia Corporation. Under the terms of Contract +# DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains +# certain rights in this software. This software is distributed under +# the GNU General Public License. +# +# See the README file in the top-level LAMMPS directory. +# ------------------------------------------------------------------------- */ +# +# /* ---------------------------------------------------------------------- +# Contributing authors: Mike Brown (ORNL), brownw@ornl.gov +# Peng Wang (Nvidia), penwang@nvidia.com +# Paul Crozier (SNL), pscrozi@sandia.gov +# ------------------------------------------------------------------------- */ + +OCL_CPP = mpic++ -I./geryon/opencl_1_0 -O3 -DMPI_GERYON +OCL_LINK = -framework OpenCL +OCL_PREC = -D_SINGLE_SINGLE + +BIN_DIR = ./ +OBJ_DIR = ./ocl_obj +LIB_DIR = ./ +AR = ar +BSH = /bin/sh + +include Opencl.makefile + diff --git a/lib/gpu/Makefile.nvidia b/lib/gpu/Makefile.nvidia deleted file mode 100644 index 78489850b4..0000000000 --- a/lib/gpu/Makefile.nvidia +++ /dev/null @@ -1,72 +0,0 @@ -# /* ---------------------------------------------------------------------- -# LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator -# http://lammps.sandia.gov, Sandia National Laboratories -# Steve Plimpton, sjplimp@sandia.gov -# -# Copyright (2003) Sandia Corporation. Under the terms of Contract -# DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains -# certain rights in this software. This software is distributed under -# the GNU General Public License. -# -# See the README file in the top-level LAMMPS directory. -# ------------------------------------------------------------------------- */ -# -# /* ---------------------------------------------------------------------- -# Contributing authors: Mike Brown (SNL), wmbrown@sandia.gov -# Peng Wang (Nvidia), penwang@nvidia.com -# Paul Crozier (SNL), pscrozi@sandia.gov -# ------------------------------------------------------------------------- */ - -BIN_DIR = . -OBJ_DIR = . 
-AR = ar -CUDA_CPP = nvcc -I/usr/local/cuda/include -DUNIX -O3 -Xptxas -v --use_fast_math -CUDA_ARCH = -arch=sm_13 -CUDA_PREC = -D_SINGLE_SINGLE -CUDA_LINK = -L/usr/local/cuda/lib -lcudart $(CUDA_LIB) - -CUDA = $(CUDA_CPP) $(CUDA_ARCH) $(CUDA_PREC) - -CUDA_LIB = $(OBJ_DIR)/libgpu.a - -# Headers for CUDA Stuff -NVC_H = nvc_macros.h nvc_device.h nvc_timer.h nvc_memory.h nvc_traits.h -# Headers for Pair Stuff -PAIR_H = pair_gpu_texture.h pair_gpu_atom.h pair_gpu_nbor.h pair_gpu_cell.h -# Dependencies for the Texture Tar -TAR_H = $(NVC_H) $(PAIR_H) pair_gpu_atom.cu lj_gpu_memory.h lj_gpu_memory.cu \ - lj_gpu_kernel.h lj_gpu.cu gb_gpu_memory.h gb_gpu_memory.cu \ - gb_gpu_extra.h gb_gpu_kernel.h gb_gpu.cu - -ALL_H = $(NVC_H) $(PAIR_H) - -EXECS = $(BIN_DIR)/nvc_get_devices -OBJS = $(OBJ_DIR)/nvc_device.o $(OBJ_DIR)/pair_gpu_nbor.cu_o \ - $(OBJ_DIR)/pair_tex_tar.cu_o $(OBJ_DIR)/pair_gpu_cell.cu_o - -all: $(CUDA_LIB) $(EXECS) - -$(OBJ_DIR)/nvc_device.o: nvc_device.cu $(NVC_H) - $(CUDA) -o $@ -c nvc_device.cu - -$(OBJ_DIR)/pair_gpu_nbor.cu_o: pair_gpu_nbor.cu pair_gpu_texture.h pair_gpu_nbor.h $(NVC_H) - $(CUDA) -o $@ -c pair_gpu_nbor.cu - -$(OBJ_DIR)/pair_tex_tar.cu_o: $(TAR_H) - $(CUDA) -o $@ -c pair_tex_tar.cu - -$(OBJ_DIR)/pair_gpu_cell.cu_o: pair_gpu_cell.cu pair_gpu_cell.h lj_gpu_memory.h - $(CUDA) -o $@ -c pair_gpu_cell.cu - -$(BIN_DIR)/nvc_get_devices: nvc_get_devices.cu $(NVC_H) $(OBJ_DIR)/nvc_device.o - $(CUDA) -o $@ nvc_get_devices.cu $(CUDALNK) $(OBJ_DIR)/nvc_device.o - -$(CUDA_LIB): $(OBJS) - $(AR) -crusv $(CUDA_LIB) $(OBJS) - -clean: - rm -rf $(EXECS) $(CUDA_LIB) $(OBJS) *.linkinfo - -veryclean: clean - rm -rf *~ *.linkinfo - diff --git a/lib/gpu/Nvidia.makefile b/lib/gpu/Nvidia.makefile new file mode 100644 index 0000000000..f7de72bd25 --- /dev/null +++ b/lib/gpu/Nvidia.makefile @@ -0,0 +1,218 @@ +# /* ---------------------------------------------------------------------- +# LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator +# http://lammps.sandia.gov, Sandia National Laboratories +# Steve Plimpton, sjplimp@sandia.gov +# +# Copyright (2003) Sandia Corporation. Under the terms of Contract +# DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains +# certain rights in this software. This software is distributed under +# the GNU General Public License. +# +# See the README file in the top-level LAMMPS directory. 
+# ------------------------------------------------------------------------- */ +# +# /* ---------------------------------------------------------------------- +# Contributing authors: Mike Brown (ORNL), brownw@ornl.gov +# Peng Wang (Nvidia), penwang@nvidia.com +# Paul Crozier (SNL), pscrozi@sandia.gov +# ------------------------------------------------------------------------- */ + +CUDA = $(NVCC) $(CUDA_INCLUDE) $(CUDA_OPTS) -Icudpp_mini $(CUDA_ARCH) \ + $(CUDA_PRECISION) +CUDR = $(CUDR_CPP) $(CUDR_OPTS) $(CUDA_PRECISION) $(CUDA_INCLUDE) \ + -Icudpp_mini +CUDA_LINK = $(CUDA_LIB) -lcudart + +GPU_LIB = $(LIB_DIR)/libgpu.a + +# Headers for Geryon +UCL_H = $(wildcard ./geryon/ucl*.h) +NVC_H = $(wildcard ./geryon/nvc*.h) $(UCL_H) +NVD_H = $(wildcard ./geryon/nvd*.h) $(UCL_H) +# Headers for Pair Stuff +PAIR_H = pair_gpu_atom.h pair_gpu_nbor.h pair_gpu_precision.h \ + pair_gpu_device.h pair_gpu_balance.h + +ALL_H = $(NVD_H) $(PAIR_H) + +EXECS = $(BIN_DIR)/nvc_get_devices +CUDPP = $(OBJ_DIR)/cudpp.o $(OBJ_DIR)/cudpp_plan.o \ + $(OBJ_DIR)/cudpp_maximal_launch.o $(OBJ_DIR)/cudpp_plan_manager.o \ + $(OBJ_DIR)/radixsort_app.cu_o $(OBJ_DIR)/scan_app.cu_o +OBJS = $(OBJ_DIR)/pair_gpu_atom.o $(OBJ_DIR)/pair_gpu_nbor.o \ + $(OBJ_DIR)/pair_gpu_device.o $(OBJ_DIR)/atomic_gpu_memory.o \ + $(OBJ_DIR)/charge_gpu_memory.o \ + $(OBJ_DIR)/gb_gpu_memory.o $(OBJ_DIR)/gb_gpu.o \ + $(OBJ_DIR)/lj_cut_gpu_memory.o $(OBJ_DIR)/lj_cut_gpu.o \ + $(OBJ_DIR)/lj96_cut_gpu_memory.o $(OBJ_DIR)/lj96_cut_gpu.o \ + $(OBJ_DIR)/ljc_cut_gpu_memory.o $(OBJ_DIR)/ljc_cut_gpu.o \ + $(OBJ_DIR)/ljcl_cut_gpu_memory.o $(OBJ_DIR)/ljcl_cut_gpu.o \ + $(OBJ_DIR)/cmm_cut_gpu_memory.o $(OBJ_DIR)/cmm_cut_gpu.o \ + $(OBJ_DIR)/cmmc_long_gpu_memory.o $(OBJ_DIR)/cmmc_long_gpu.o \ + $(CUDPP) +PTXS = $(OBJ_DIR)/pair_gpu_atom_kernel.ptx $(OBJ_DIR)/pair_gpu_atom_ptx.h \ + $(OBJ_DIR)/pair_gpu_nbor_kernel.ptx $(OBJ_DIR)/pair_gpu_nbor_ptx.h \ + $(OBJ_DIR)/pair_gpu_build_kernel.ptx $(OBJ_DIR)/pair_gpu_build_ptx.h \ + $(OBJ_DIR)/gb_gpu_kernel_nbor.ptx $(OBJ_DIR)/gb_gpu_kernel.ptx \ + $(OBJ_DIR)/gb_gpu_kernel_lj.ptx $(OBJ_DIR)/gb_gpu_ptx.h \ + $(OBJ_DIR)/lj_cut_gpu_kernel.ptx $(OBJ_DIR)/lj_cut_gpu_ptx.h \ + $(OBJ_DIR)/lj96_cut_gpu_kernel.ptx $(OBJ_DIR)/lj96_cut_gpu_ptx.h \ + $(OBJ_DIR)/ljc_cut_gpu_kernel.ptx $(OBJ_DIR)/ljc_cut_gpu_ptx.h \ + $(OBJ_DIR)/ljcl_cut_gpu_kernel.ptx $(OBJ_DIR)/ljcl_cut_gpu_ptx.h \ + $(OBJ_DIR)/cmm_cut_gpu_kernel.ptx $(OBJ_DIR)/cmm_cut_gpu_ptx.h \ + $(OBJ_DIR)/cmmc_long_gpu_kernel.ptx $(OBJ_DIR)/cmmc_long_gpu_ptx.h + +all: $(GPU_LIB) $(EXECS) + +$(OBJ_DIR)/cudpp.o: cudpp_mini/cudpp.cpp + $(CUDR) -o $@ -c cudpp_mini/cudpp.cpp -Icudpp_mini + +$(OBJ_DIR)/cudpp_plan.o: cudpp_mini/cudpp_plan.cpp + $(CUDR) -o $@ -c cudpp_mini/cudpp_plan.cpp -Icudpp_mini + +$(OBJ_DIR)/cudpp_maximal_launch.o: cudpp_mini/cudpp_maximal_launch.cpp + $(CUDR) -o $@ -c cudpp_mini/cudpp_maximal_launch.cpp -Icudpp_mini + +$(OBJ_DIR)/cudpp_plan_manager.o: cudpp_mini/cudpp_plan_manager.cpp + $(CUDR) -o $@ -c cudpp_mini/cudpp_plan_manager.cpp -Icudpp_mini + +$(OBJ_DIR)/radixsort_app.cu_o: cudpp_mini/radixsort_app.cu + $(CUDA) -o $@ -c cudpp_mini/radixsort_app.cu + +$(OBJ_DIR)/scan_app.cu_o: cudpp_mini/scan_app.cu + $(CUDA) -o $@ -c cudpp_mini/scan_app.cu + +$(OBJ_DIR)/pair_gpu_atom_kernel.ptx: pair_gpu_atom_kernel.cu + $(CUDA) --ptx -DNV_KERNEL -o $@ pair_gpu_atom_kernel.cu + +$(OBJ_DIR)/pair_gpu_atom_ptx.h: $(OBJ_DIR)/pair_gpu_atom_kernel.ptx + $(BSH) ./geryon/file_to_cstr.sh $(OBJ_DIR)/pair_gpu_atom_kernel.ptx $(OBJ_DIR)/pair_gpu_atom_ptx.h + 
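[Editorial note on the two rules just above: this is the pattern repeated for every kernel in Nvidia.makefile. Each *_kernel.cu is compiled to PTX by nvcc at build time, and geryon/file_to_cstr.sh then wraps the PTX into a *_ptx.h header holding it as a C string; the host objects pick these headers up via the -I$(OBJ_DIR) on their compile lines and hand the string to Geryon at run time. A rough sketch of what the generated header amounts to is below; the symbol name is a guess (whatever file_to_cstr.sh emits), not something specified by this patch, and the run-time consumer is the compile_kernels() routine shown later in atomic_gpu_memory.cpp and charge_gpu_memory.cpp.]

    // pair_gpu_atom_ptx.h, schematic only; the actual symbol name produced by
    // geryon/file_to_cstr.sh is an assumption here.
    const char *pair_gpu_atom_kernel = "...PTX text emitted by nvcc --ptx...";

    // Loaded at run time roughly the way compile_kernels() does further down:
    //   pair_program->load_string(<embedded string>, flags.c_str());
    //   k_pair.set_function(*pair_program, "kernel_pair");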
+$(OBJ_DIR)/pair_gpu_atom.o: pair_gpu_atom.cpp pair_gpu_atom.h $(NVD_H) $(OBJ_DIR)/pair_gpu_atom_ptx.h + $(CUDR) -o $@ -c pair_gpu_atom.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/pair_gpu_nbor_kernel.ptx: pair_gpu_nbor_kernel.cu + $(CUDA) --ptx -DNV_KERNEL -o $@ pair_gpu_nbor_kernel.cu + +$(OBJ_DIR)/pair_gpu_nbor_ptx.h: $(OBJ_DIR)/pair_gpu_nbor_kernel.ptx + $(BSH) ./geryon/file_to_cstr.sh $(OBJ_DIR)/pair_gpu_nbor_kernel.ptx $(OBJ_DIR)/pair_gpu_nbor_ptx.h + +$(OBJ_DIR)/pair_gpu_build_kernel.ptx: pair_gpu_build_kernel.cu + $(CUDA) --ptx -DNV_KERNEL -o $@ pair_gpu_build_kernel.cu + +$(OBJ_DIR)/pair_gpu_build_ptx.h: $(OBJ_DIR)/pair_gpu_build_kernel.ptx + $(BSH) ./geryon/file_to_cstr.sh $(OBJ_DIR)/pair_gpu_build_kernel.ptx $(OBJ_DIR)/pair_gpu_build_ptx.h + +$(OBJ_DIR)/pair_gpu_nbor.o: pair_gpu_nbor.cpp pair_gpu_nbor.h $(OBJ_DIR)/pair_gpu_nbor_ptx.h $(OBJ_DIR)/pair_gpu_build_ptx.h $(NVD_H) + $(CUDR) -o $@ -c pair_gpu_nbor.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/pair_gpu_device.o: pair_gpu_device.cpp pair_gpu_device.h $(NVD_H) + $(CUDR) -o $@ -c pair_gpu_device.cpp + +$(OBJ_DIR)/atomic_gpu_memory.o: $(ALL_H) atomic_gpu_memory.h atomic_gpu_memory.cpp + $(CUDR) -o $@ -c atomic_gpu_memory.cpp + +$(OBJ_DIR)/charge_gpu_memory.o: $(ALL_H) charge_gpu_memory.h charge_gpu_memory.cpp + $(CUDR) -o $@ -c charge_gpu_memory.cpp + +$(OBJ_DIR)/gb_gpu_kernel.ptx: gb_gpu_kernel.cu pair_gpu_precision.h gb_gpu_extra.h + $(CUDA) --ptx -DNV_KERNEL -o $@ gb_gpu_kernel.cu + +$(OBJ_DIR)/gb_gpu_kernel_lj.ptx: gb_gpu_kernel_lj.cu pair_gpu_precision.h gb_gpu_extra.h + $(CUDA) --ptx -DNV_KERNEL -o $@ gb_gpu_kernel_lj.cu + +$(OBJ_DIR)/gb_gpu_kernel_nbor.ptx: gb_gpu_kernel_nbor.cu pair_gpu_precision.h + $(CUDA) --ptx -DNV_KERNEL -o $@ gb_gpu_kernel_nbor.cu + +$(OBJ_DIR)/gb_gpu_ptx.h: $(OBJ_DIR)/gb_gpu_kernel_nbor.ptx $(OBJ_DIR)/gb_gpu_kernel.ptx $(OBJ_DIR)/gb_gpu_kernel_lj.ptx + $(BSH) ./geryon/file_to_cstr.sh $(OBJ_DIR)/gb_gpu_kernel_nbor.ptx $(OBJ_DIR)/gb_gpu_kernel.ptx $(OBJ_DIR)/gb_gpu_kernel_lj.ptx $(OBJ_DIR)/gb_gpu_ptx.h + +$(OBJ_DIR)/gb_gpu_memory.o: $(ALL_H) gb_gpu_memory.h gb_gpu_memory.cpp $(OBJ_DIR)/gb_gpu_ptx.h + $(CUDR) -o $@ -c gb_gpu_memory.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/gb_gpu.o: $(ALL_H) gb_gpu_memory.h gb_gpu.cpp + $(CUDR) -o $@ -c gb_gpu.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/lj_cut_gpu_kernel.ptx: lj_cut_gpu_kernel.cu pair_gpu_precision.h + $(CUDA) --ptx -DNV_KERNEL -o $@ lj_cut_gpu_kernel.cu + +$(OBJ_DIR)/lj_cut_gpu_ptx.h: $(OBJ_DIR)/lj_cut_gpu_kernel.ptx $(OBJ_DIR)/lj_cut_gpu_kernel.ptx + $(BSH) ./geryon/file_to_cstr.sh $(OBJ_DIR)/lj_cut_gpu_kernel.ptx $(OBJ_DIR)/lj_cut_gpu_ptx.h + +$(OBJ_DIR)/lj_cut_gpu_memory.o: $(ALL_H) lj_cut_gpu_memory.h lj_cut_gpu_memory.cpp $(OBJ_DIR)/lj_cut_gpu_ptx.h $(OBJ_DIR)/atomic_gpu_memory.o + $(CUDR) -o $@ -c lj_cut_gpu_memory.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/lj_cut_gpu.o: $(ALL_H) lj_cut_gpu_memory.h lj_cut_gpu.cpp + $(CUDR) -o $@ -c lj_cut_gpu.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/ljc_cut_gpu_kernel.ptx: ljc_cut_gpu_kernel.cu pair_gpu_precision.h + $(CUDA) --ptx -DNV_KERNEL -o $@ ljc_cut_gpu_kernel.cu + +$(OBJ_DIR)/ljc_cut_gpu_ptx.h: $(OBJ_DIR)/ljc_cut_gpu_kernel.ptx $(OBJ_DIR)/ljc_cut_gpu_kernel.ptx + $(BSH) ./geryon/file_to_cstr.sh $(OBJ_DIR)/ljc_cut_gpu_kernel.ptx $(OBJ_DIR)/ljc_cut_gpu_ptx.h + +$(OBJ_DIR)/ljc_cut_gpu_memory.o: $(ALL_H) ljc_cut_gpu_memory.h ljc_cut_gpu_memory.cpp $(OBJ_DIR)/ljc_cut_gpu_ptx.h $(OBJ_DIR)/charge_gpu_memory.o + $(CUDR) -o $@ -c ljc_cut_gpu_memory.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/ljc_cut_gpu.o: $(ALL_H) ljc_cut_gpu_memory.h ljc_cut_gpu.cpp + $(CUDR) -o $@ -c ljc_cut_gpu.cpp 
-I$(OBJ_DIR) + +$(OBJ_DIR)/ljcl_cut_gpu_kernel.ptx: ljcl_cut_gpu_kernel.cu pair_gpu_precision.h + $(CUDA) --ptx -DNV_KERNEL -o $@ ljcl_cut_gpu_kernel.cu + +$(OBJ_DIR)/ljcl_cut_gpu_ptx.h: $(OBJ_DIR)/ljcl_cut_gpu_kernel.ptx $(OBJ_DIR)/ljcl_cut_gpu_kernel.ptx + $(BSH) ./geryon/file_to_cstr.sh $(OBJ_DIR)/ljcl_cut_gpu_kernel.ptx $(OBJ_DIR)/ljcl_cut_gpu_ptx.h + +$(OBJ_DIR)/ljcl_cut_gpu_memory.o: $(ALL_H) ljcl_cut_gpu_memory.h ljcl_cut_gpu_memory.cpp $(OBJ_DIR)/ljcl_cut_gpu_ptx.h $(OBJ_DIR)/charge_gpu_memory.o + $(CUDR) -o $@ -c ljcl_cut_gpu_memory.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/ljcl_cut_gpu.o: $(ALL_H) ljcl_cut_gpu_memory.h ljcl_cut_gpu.cpp + $(CUDR) -o $@ -c ljcl_cut_gpu.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/lj96_cut_gpu_kernel.ptx: lj96_cut_gpu_kernel.cu pair_gpu_precision.h + $(CUDA) --ptx -DNV_KERNEL -o $@ lj96_cut_gpu_kernel.cu + +$(OBJ_DIR)/lj96_cut_gpu_ptx.h: $(OBJ_DIR)/lj96_cut_gpu_kernel.ptx $(OBJ_DIR)/lj96_cut_gpu_kernel.ptx + $(BSH) ./geryon/file_to_cstr.sh $(OBJ_DIR)/lj96_cut_gpu_kernel.ptx $(OBJ_DIR)/lj96_cut_gpu_ptx.h + +$(OBJ_DIR)/lj96_cut_gpu_memory.o: $(ALL_H) lj96_cut_gpu_memory.h lj96_cut_gpu_memory.cpp $(OBJ_DIR)/lj96_cut_gpu_ptx.h $(OBJ_DIR)/atomic_gpu_memory.o + $(CUDR) -o $@ -c lj96_cut_gpu_memory.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/lj96_cut_gpu.o: $(ALL_H) lj96_cut_gpu_memory.h lj96_cut_gpu.cpp + $(CUDR) -o $@ -c lj96_cut_gpu.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/cmm_cut_gpu_kernel.ptx: cmm_cut_gpu_kernel.cu pair_gpu_precision.h + $(CUDA) --ptx -DNV_KERNEL -o $@ cmm_cut_gpu_kernel.cu + +$(OBJ_DIR)/cmm_cut_gpu_ptx.h: $(OBJ_DIR)/cmm_cut_gpu_kernel.ptx $(OBJ_DIR)/cmm_cut_gpu_kernel.ptx + $(BSH) ./geryon/file_to_cstr.sh $(OBJ_DIR)/cmm_cut_gpu_kernel.ptx $(OBJ_DIR)/cmm_cut_gpu_ptx.h + +$(OBJ_DIR)/cmm_cut_gpu_memory.o: $(ALL_H) cmm_cut_gpu_memory.h cmm_cut_gpu_memory.cpp $(OBJ_DIR)/cmm_cut_gpu_ptx.h $(OBJ_DIR)/atomic_gpu_memory.o + $(CUDR) -o $@ -c cmm_cut_gpu_memory.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/cmm_cut_gpu.o: $(ALL_H) cmm_cut_gpu_memory.h cmm_cut_gpu.cpp + $(CUDR) -o $@ -c cmm_cut_gpu.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/cmmc_long_gpu_kernel.ptx: cmmc_long_gpu_kernel.cu pair_gpu_precision.h + $(CUDA) --ptx -DNV_KERNEL -o $@ cmmc_long_gpu_kernel.cu + +$(OBJ_DIR)/cmmc_long_gpu_ptx.h: $(OBJ_DIR)/cmmc_long_gpu_kernel.ptx $(OBJ_DIR)/cmmc_long_gpu_kernel.ptx + $(BSH) ./geryon/file_to_cstr.sh $(OBJ_DIR)/cmmc_long_gpu_kernel.ptx $(OBJ_DIR)/cmmc_long_gpu_ptx.h + +$(OBJ_DIR)/cmmc_long_gpu_memory.o: $(ALL_H) cmmc_long_gpu_memory.h cmmc_long_gpu_memory.cpp $(OBJ_DIR)/cmmc_long_gpu_ptx.h $(OBJ_DIR)/atomic_gpu_memory.o + $(CUDR) -o $@ -c cmmc_long_gpu_memory.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/cmmc_long_gpu.o: $(ALL_H) cmmc_long_gpu_memory.h cmmc_long_gpu.cpp + $(CUDR) -o $@ -c cmmc_long_gpu.cpp -I$(OBJ_DIR) + +$(BIN_DIR)/nvc_get_devices: ./geryon/ucl_get_devices.cpp $(NVC_H) + $(CUDR) -o $@ ./geryon/ucl_get_devices.cpp -DUCL_CUDART $(CUDA_LINK) + +$(GPU_LIB): $(OBJS) + $(AR) -crusv $(GPU_LIB) $(OBJS) + +clean: + rm -f $(EXECS) $(GPU_LIB) $(OBJS) $(PTXS) *.linkinfo + +veryclean: clean + rm -rf *~ *.linkinfo diff --git a/lib/gpu/Opencl.makefile b/lib/gpu/Opencl.makefile new file mode 100644 index 0000000000..4da8ce5f12 --- /dev/null +++ b/lib/gpu/Opencl.makefile @@ -0,0 +1,155 @@ +# /* ---------------------------------------------------------------------- +# LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator +# http://lammps.sandia.gov, Sandia National Laboratories +# Steve Plimpton, sjplimp@sandia.gov +# +# Copyright (2003) Sandia Corporation. 
Under the terms of Contract +# DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains +# certain rights in this software. This software is distributed under +# the GNU General Public License. +# +# See the README file in the top-level LAMMPS directory. +# ------------------------------------------------------------------------- */ +# +# /* ---------------------------------------------------------------------- +# Contributing authors: Mike Brown (ORNL), brownw@ornl.gov +# Peng Wang (Nvidia), penwang@nvidia.com +# Paul Crozier (SNL), pscrozi@sandia.gov +# ------------------------------------------------------------------------- */ + +OCL = $(OCL_CPP) $(OCL_PREC) -DUSE_OPENCL +OCL_LIB = $(LIB_DIR)/libgpu.a +# Headers for Geryon +UCL_H = $(wildcard ./geryon/ucl*.h) +OCL_H = $(wildcard ./geryon/ocl*.h) $(UCL_H) +# Headers for Pair Stuff +PAIR_H = pair_gpu_atom.h pair_gpu_nbor.h pair_gpu_precision.h \ + pair_gpu_device.h pair_gpu_balance.h + +ALL_H = $(OCL_H) $(PAIR_H) + +EXECS = $(BIN_DIR)/ocl_get_devices +OBJS = $(OBJ_DIR)/pair_gpu_atom.o $(OBJ_DIR)/pair_gpu_nbor.o \ + $(OBJ_DIR)/pair_gpu_device.o $(OBJ_DIR)/atomic_gpu_memory.o \ + $(OBJ_DIR)/charge_gpu_memory.o \ + $(OBJ_DIR)/gb_gpu_memory.o $(OBJ_DIR)/gb_gpu.o \ + $(OBJ_DIR)/lj_cut_gpu_memory.o $(OBJ_DIR)/lj_cut_gpu.o \ + $(OBJ_DIR)/lj96_cut_gpu_memory.o $(OBJ_DIR)/lj96_cut_gpu.o \ + $(OBJ_DIR)/ljc_cut_gpu_memory.o $(OBJ_DIR)/ljc_cut_gpu.o \ + $(OBJ_DIR)/ljcl_cut_gpu_memory.o $(OBJ_DIR)/ljcl_cut_gpu.o \ + $(OBJ_DIR)/cmm_cut_gpu_memory.o $(OBJ_DIR)/cmm_cut_gpu.o \ + $(OBJ_DIR)/cmmc_long_gpu_memory.o $(OBJ_DIR)/cmmc_long_gpu.o +KERS = $(OBJ_DIR)/pair_gpu_atom_cl.h $(OBJ_DIR)/pair_gpu_nbor_cl.h \ + $(OBJ_DIR)/gb_gpu_nbor_cl.h $(OBJ_DIR)/gb_gpu_cl.h \ + $(OBJ_DIR)/lj_cut_gpu_cl.h $(OBJ_DIR)/lj96_cut_gpu_cl.h \ + $(OBJ_DIR)/ljc_cut_gpu_cl.h $(OBJ_DIR)/ljcl_cut_gpu_cl.h \ + $(OBJ_DIR)/cmm_cut_gpu_cl.h $(OBJ_DIR)/cmmc_long_gpu_cl.h + +OCL_EXECS = $(BIN_DIR)/ocl_get_devices + +all: $(OCL_LIB) $(EXECS) + +$(OBJ_DIR)/pair_gpu_atom_cl.h: pair_gpu_atom_kernel.cu + $(BSH) ./geryon/file_to_cstr.sh pair_gpu_atom_kernel.cu $(OBJ_DIR)/pair_gpu_atom_cl.h + +$(OBJ_DIR)/pair_gpu_atom.o: pair_gpu_atom.cpp pair_gpu_atom.h $(OCL_H) $(OBJ_DIR)/pair_gpu_atom_cl.h + $(OCL) -o $@ -c pair_gpu_atom.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/pair_gpu_nbor_cl.h: pair_gpu_nbor_kernel.cu + $(BSH) ./geryon/file_to_cstr.sh pair_gpu_nbor_kernel.cu $(OBJ_DIR)/pair_gpu_nbor_cl.h + +$(OBJ_DIR)/pair_gpu_nbor.o: pair_gpu_nbor.cpp pair_gpu_nbor.h $(OCL_H) $(OBJ_DIR)/pair_gpu_nbor_cl.h + $(OCL) -o $@ -c pair_gpu_nbor.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/pair_gpu_device.o: pair_gpu_device.cpp pair_gpu_device.h $(OCL_H) + $(OCL) -o $@ -c pair_gpu_device.cpp + +$(OBJ_DIR)/atomic_gpu_memory.o: $(OCL_H) atomic_gpu_memory.h atomic_gpu_memory.cpp + $(OCL) -o $@ -c atomic_gpu_memory.cpp + +$(OBJ_DIR)/charge_gpu_memory.o: $(OCL_H) charge_gpu_memory.h charge_gpu_memory.cpp + $(OCL) -o $@ -c charge_gpu_memory.cpp + +$(OBJ_DIR)/gb_gpu_nbor_cl.h: gb_gpu_kernel_nbor.cu + $(BSH) ./geryon/file_to_cstr.sh gb_gpu_kernel_nbor.cu $(OBJ_DIR)/gb_gpu_nbor_cl.h + +$(OBJ_DIR)/gb_gpu_cl.h: gb_gpu_kernel.cu gb_gpu_kernel_lj.cu gb_gpu_extra.h + cat gb_gpu_extra.h gb_gpu_kernel.cu > $(OBJ_DIR)/gb_gpu_kernel.tar; \ + cat gb_gpu_extra.h gb_gpu_kernel_lj.cu > $(OBJ_DIR)/gb_gpu_kernel_lj.tar; \ + $(BSH) ./geryon/file_to_cstr.sh $(OBJ_DIR)/gb_gpu_kernel.tar $(OBJ_DIR)/gb_gpu_kernel_lj.tar $(OBJ_DIR)/gb_gpu_cl.h; \ + rm -f $(OBJ_DIR)/gb_gpu_kernel.tar $(OBJ_DIR)/gb_gpu_kernel_lj.tar + +$(OBJ_DIR)/gb_gpu_memory.o: 
$(ALL_H) gb_gpu_memory.h gb_gpu_memory.cpp $(OBJ_DIR)/gb_gpu_nbor_cl.h $(OBJ_DIR)/gb_gpu_cl.h + $(OCL) -o $@ -c gb_gpu_memory.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/gb_gpu.o: $(ALL_H) gb_gpu_memory.h gb_gpu.cpp + $(OCL) -o $@ -c gb_gpu.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/lj_cut_gpu_cl.h: lj_cut_gpu_kernel.cu + $(BSH) ./geryon/file_to_cstr.sh lj_cut_gpu_kernel.cu $(OBJ_DIR)/lj_cut_gpu_cl.h; + +$(OBJ_DIR)/lj_cut_gpu_memory.o: $(ALL_H) lj_cut_gpu_memory.h lj_cut_gpu_memory.cpp $(OBJ_DIR)/lj_cut_gpu_cl.h $(OBJ_DIR)/pair_gpu_nbor_cl.h $(OBJ_DIR)/lj_cut_gpu_cl.h $(OBJ_DIR)/atomic_gpu_memory.o + $(OCL) -o $@ -c lj_cut_gpu_memory.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/lj_cut_gpu.o: $(ALL_H) lj_cut_gpu_memory.h lj_cut_gpu.cpp + $(OCL) -o $@ -c lj_cut_gpu.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/ljc_cut_gpu_cl.h: ljc_cut_gpu_kernel.cu + $(BSH) ./geryon/file_to_cstr.sh ljc_cut_gpu_kernel.cu $(OBJ_DIR)/ljc_cut_gpu_cl.h; + +$(OBJ_DIR)/ljc_cut_gpu_memory.o: $(ALL_H) ljc_cut_gpu_memory.h ljc_cut_gpu_memory.cpp $(OBJ_DIR)/ljc_cut_gpu_cl.h $(OBJ_DIR)/pair_gpu_nbor_cl.h $(OBJ_DIR)/ljc_cut_gpu_cl.h $(OBJ_DIR)/charge_gpu_memory.o + $(OCL) -o $@ -c ljc_cut_gpu_memory.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/ljc_cut_gpu.o: $(ALL_H) ljc_cut_gpu_memory.h ljc_cut_gpu.cpp + $(OCL) -o $@ -c ljc_cut_gpu.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/ljcl_cut_gpu_cl.h: ljcl_cut_gpu_kernel.cu + $(BSH) ./geryon/file_to_cstr.sh ljcl_cut_gpu_kernel.cu $(OBJ_DIR)/ljcl_cut_gpu_cl.h; + +$(OBJ_DIR)/ljcl_cut_gpu_memory.o: $(ALL_H) ljcl_cut_gpu_memory.h ljcl_cut_gpu_memory.cpp $(OBJ_DIR)/ljcl_cut_gpu_cl.h $(OBJ_DIR)/pair_gpu_nbor_cl.h $(OBJ_DIR)/ljcl_cut_gpu_cl.h $(OBJ_DIR)/charge_gpu_memory.o + $(OCL) -o $@ -c ljcl_cut_gpu_memory.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/ljcl_cut_gpu.o: $(ALL_H) ljcl_cut_gpu_memory.h ljcl_cut_gpu.cpp + $(OCL) -o $@ -c ljcl_cut_gpu.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/lj96_cut_gpu_cl.h: lj96_cut_gpu_kernel.cu + $(BSH) ./geryon/file_to_cstr.sh lj96_cut_gpu_kernel.cu $(OBJ_DIR)/lj96_cut_gpu_cl.h; + +$(OBJ_DIR)/lj96_cut_gpu_memory.o: $(ALL_H) lj96_cut_gpu_memory.h lj96_cut_gpu_memory.cpp $(OBJ_DIR)/lj96_cut_gpu_cl.h $(OBJ_DIR)/pair_gpu_nbor_cl.h $(OBJ_DIR)/lj96_cut_gpu_cl.h $(OBJ_DIR)/atomic_gpu_memory.o + $(OCL) -o $@ -c lj96_cut_gpu_memory.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/lj96_cut_gpu.o: $(ALL_H) lj96_cut_gpu_memory.h lj96_cut_gpu.cpp + $(OCL) -o $@ -c lj96_cut_gpu.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/cmm_cut_gpu_cl.h: cmm_cut_gpu_kernel.cu + $(BSH) ./geryon/file_to_cstr.sh cmm_cut_gpu_kernel.cu $(OBJ_DIR)/cmm_cut_gpu_cl.h; + +$(OBJ_DIR)/cmm_cut_gpu_memory.o: $(ALL_H) cmm_cut_gpu_memory.h cmm_cut_gpu_memory.cpp $(OBJ_DIR)/cmm_cut_gpu_cl.h $(OBJ_DIR)/pair_gpu_nbor_cl.h $(OBJ_DIR)/cmm_cut_gpu_cl.h $(OBJ_DIR)/atomic_gpu_memory.o + $(OCL) -o $@ -c cmm_cut_gpu_memory.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/cmm_cut_gpu.o: $(ALL_H) cmm_cut_gpu_memory.h cmm_cut_gpu.cpp + $(OCL) -o $@ -c cmm_cut_gpu.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/cmmc_long_gpu_cl.h: cmmc_long_gpu_kernel.cu + $(BSH) ./geryon/file_to_cstr.sh cmmc_long_gpu_kernel.cu $(OBJ_DIR)/cmmc_long_gpu_cl.h; + +$(OBJ_DIR)/cmmc_long_gpu_memory.o: $(ALL_H) cmmc_long_gpu_memory.h cmmc_long_gpu_memory.cpp $(OBJ_DIR)/cmmc_long_gpu_cl.h $(OBJ_DIR)/pair_gpu_nbor_cl.h $(OBJ_DIR)/cmmc_long_gpu_cl.h $(OBJ_DIR)/atomic_gpu_memory.o + $(OCL) -o $@ -c cmmc_long_gpu_memory.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/cmmc_long_gpu.o: $(ALL_H) cmmc_long_gpu_memory.h cmmc_long_gpu.cpp + $(OCL) -o $@ -c cmmc_long_gpu.cpp -I$(OBJ_DIR) + +$(BIN_DIR)/ocl_get_devices: ./geryon/ucl_get_devices.cpp + $(OCL) -o $@ ./geryon/ucl_get_devices.cpp -DUCL_OPENCL $(OCL_LINK) + 
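[Editorial note: the OpenCL build above reuses the same *_kernel.cu sources as the CUDA build, but string-embeds them directly into *_cl.h headers (no PTX step) so the OpenCL runtime can compile them, and it adds -DUSE_OPENCL to the OCL compile line. That define is what selects the Geryon backend in the host sources; the selection block, as it appears in atomic_gpu_memory.h and charge_gpu_memory.h later in this patch, is:]

    #ifdef USE_OPENCL
    #include "geryon/ocl_texture.h"   // OpenCL flavor of the UCL_* wrappers
    #else
    #include "geryon/nvd_texture.h"   // CUDA driver-API flavor
    #endif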
+$(OCL_LIB): $(OBJS) $(PTXS) + $(AR) -crusv $(OCL_LIB) $(OBJS) + +opencl: $(OCL_EXECS) + +clean: + rm -rf $(EXECS) $(OCL_EXECS) $(OCL_LIB) $(OBJS) $(KERS) *.linkinfo + +veryclean: clean + rm -rf *~ *.linkinfo + diff --git a/lib/gpu/README b/lib/gpu/README index d459a59c96..567d81886b 100644 --- a/lib/gpu/README +++ b/lib/gpu/README @@ -12,7 +12,7 @@ ------------------------------------------------------------------------- */ /* ---------------------------------------------------------------------- - Contributing authors: Mike Brown (SNL), wmbrown@sandia.gov + Contributing authors: Mike Brown (ORNL), brownw@ornl.gov Peng Wang (Nvidia), penwang@nvidia.com Paul Crozier (SNL), pscrozi@sandia.gov ------------------------------------------------------------------------- */ @@ -20,57 +20,91 @@ GENERAL NOTES This library, libgpu.a, provides routines for GPU acceleration -of LAMMPS pair styles. Currently, only CUDA enabled GPUs are -supported. Compilation of this library requires installing the CUDA -GPU driver and CUDA toolkit for your operating system. In addition to -the LAMMPS library, the binary nvc_get_devices will also be -built. This can be used to query the names and properties of GPU -devices on your system. +of LAMMPS pair styles. Compilation of this library requires +installing the CUDA GPU driver and CUDA toolkit for your operating +system. In addition to the LAMMPS library, the binary nvc_get_devices +will also be built. This can be used to query the names and +properties of GPU devices on your system. A Makefile for OpenCL +compilation is provided, but support for OpenCL use is not currently +provided by the developers. NOTE: Installation of the CUDA SDK is not required. Current pair styles supporting GPU acceleration: 1. lj/cut/gpu - 2. gayberne/gpu + 2. lj/cut/coul/cut/gpu + 3. lj/cut/coul/long/gpu + 4. lj96/cut/gpu + 5. gayberne/gpu + 6. cmm/cg/gpu + 7. cmm/cg/coul/long/gpu MULTIPLE LAMMPS PROCESSES -When using GPU acceleration, you are restricted to one physical GPU -per LAMMPS process. This can be multiple GPUs on a single node or -across multiple nodes. Intructions on GPU assignment can be found in -the LAMMPS documentation. - - SPEEDUPS - -The speedups that can be obtained using this library are highly -dependent on the GPU architecture and the computational expense of the -pair potential. When comparing a single precision Tesla C1060 run to a -serial Intel Xeon 5140 2.33 GHz serial run, the speedup is ~4.42x for -lj/cut with a cutoff of 2.5. For gayberne with a cutoff of 7, the -speedup is >103x for 8000 particles. The speedup will improve with an -increase in the number of particles or an increase in the cutoff. +Multiple LAMMPS MPI processes can share GPUs on the system, but multiple +GPUs cannot be utilized by a single MPI process. In many cases, the +best performance will be obtained by running as many MPI processes as +CPU cores available with the condition that the number of MPI processes +is an integer multiple of the number of GPUs being used. See the +LAMMPS user manual for details on running with GPU acceleration. BUILDING AND PRECISION MODES -To build, edit the CUDA_CPP, CUDA_ARCH, CUDA_PREC, and CUDA_LINK files for -your machine. Type make. Additionally, the GPU package must be installed and -compiled for LAMMPS. The library supports 3 precision modes as determined by -the CUDA_PREC variable: +To build, edit the CUDA_ARCH, CUDA_PRECISION, CUDA_HOME, NVCC, CUDA_INCLUD, +CUDA_LIB and CUDA_OPTS variables in one of the Makefiles. 
CUDA_ARCH should +be set based on the compute capability of your GPU. This can be verified by +running the nvc_get_devices executable after the build is complete. +Additionally, the GPU package must be installed and compiled for LAMMPS. +This may require editing the gpu_SYSPATH variable in the LAMMPS makefile. + +Please note that the GPU library accesses the CUDA driver library directly, +so it needs to be linked not only to the CUDA runtime library (libcudart.so) +that ships with the CUDA toolkit, but also with the CUDA driver library +(libcuda.so) that ships with the Nvidia driver. If you are compiling LAMMPS +on the head node of a GPU cluster, this library may not be installed, +so you may need to copy it over from one of the compute nodes (best into +this directory). + +The gpu library supports 3 precision modes as determined by +the CUDA_PRECISION variable: CUDA_PREC = -D_SINGLE_SINGLE # Single precision for all calculations CUDA_PREC = -D_DOUBLE_DOUBLE # Double precision for all calculations CUDA_PREC = -D_SINGLE_DOUBLE # Accumulation of forces, etc. in double -NOTE: For the lj/cut pair style, only single precision will be used, even - if double precision is specified. - -NOTE: Double precision is only supported on certain GPUS (with +NOTE: Double precision is only supported on certain GPUs (with compute capability>=1.3). NOTE: For Tesla and other graphics cards with compute capability>=1.3, make sure that -arch=sm_13 is set on the CUDA_ARCH line. +NOTE: For Fermi, make sure that -arch=sm_20 is set on the CUDA_ARCH line. + NOTE: The gayberne/gpu pair style will only be installed if the ASPHERE package has been installed before installing the GPU package in LAMMPS. - + +NOTE: The cg/cmm/gpu and cg/cmm/coul/long/gpu pair styles will only be + installed if the USER-CG-CMM package has been installed before + installing the GPU package in LAMMPS. + +NOTE: The lj/cut/coul/long/gpu and cg/cmm/coul/long/gpu style will only be + installed if the KSPACE package has been installed before installing + the GPU package in LAMMPS. + + EXAMPLE BUILD PROCESS + +cd ~/lammps/lib/gpu +emacs Makefile.linux +make -f Makefile.linux +./nvc_get_devices +cd ../../src +emacs ./MAKE/Makefile.linux +make yes-asphere +make yes-kspace +make yes-gpu +make linux + +------------------------------------------------------------------------ +Last merge with gpulammps: r561 on 2010-11-12 +------------------------------------------------------------------------ diff --git a/lib/gpu/atomic_gpu_memory.cpp b/lib/gpu/atomic_gpu_memory.cpp new file mode 100644 index 0000000000..d23ac78523 --- /dev/null +++ b/lib/gpu/atomic_gpu_memory.cpp @@ -0,0 +1,262 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. 
+------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing authors: Mike Brown (ORNL), brownw@ornl.gov +------------------------------------------------------------------------- */ + +#include "atomic_gpu_memory.h" +#define AtomicGPUMemoryT AtomicGPUMemory + +extern PairGPUDevice pair_gpu_device; + +template +AtomicGPUMemoryT::AtomicGPUMemory() : _compiled(false), _max_bytes(0) { + device=&pair_gpu_device; +} + +template +AtomicGPUMemoryT::~AtomicGPUMemory() { +} + +template +int AtomicGPUMemoryT::bytes_per_atom_atomic(const int max_nbors) const { + return device->atom.bytes_per_atom()+device->nbor.bytes_per_atom(max_nbors); +} + +template +bool AtomicGPUMemoryT::init_atomic(const int nlocal, const int nall, + const int max_nbors, const int maxspecial, + const double cell_size, + const double gpu_split, FILE *_screen, + const char *pair_program) { + nbor_time_avail=false; + screen=_screen; + + bool gpu_nbor=false; + if (device->gpu_mode()==PairGPUDevice::GPU_NEIGH) + gpu_nbor=true; + + int _gpu_host=0; + int host_nlocal=hd_balancer.first_host_count(nlocal,gpu_nbor,gpu_split); + if (host_nlocal>0) + _gpu_host=1; + + if (!device->init(false,false,nlocal,host_nlocal,nall,maxspecial,gpu_nbor, + _gpu_host,max_nbors,cell_size,false)) + return false; + ucl_device=device->gpu; + atom=&device->atom; + nbor=&device->nbor; + + _block_size=BLOCK_1D; + if (static_cast(_block_size)>ucl_device->group_size()) + _block_size=ucl_device->group_size(); + compile_kernels(*ucl_device,pair_program); + + // Initialize host-device load balancer + hd_balancer.init(device,gpu_split); + + // Initialize timers for the selected GPU + time_pair.init(*ucl_device); + time_pair.zero(); + + pos_tex.bind_float(atom->dev_x,4); + + _max_an_bytes=atom->gpu_bytes()+nbor->gpu_bytes(); + + return true; +} + +template +void AtomicGPUMemoryT::clear_atomic() { + // Output any timing information + acc_timers(); + double avg_split=hd_balancer.all_avg_split(); + device->output_times(time_pair,avg_split,_max_bytes+_max_an_bytes,screen); + + if (_compiled) { + k_pair_fast.clear(); + k_pair.clear(); + delete pair_program; + _compiled=false; + } + + time_pair.clear(); + hd_balancer.clear(); + + device->clear(); +} + +// --------------------------------------------------------------------------- +// Copy neighbor list from host +// --------------------------------------------------------------------------- +template +int * AtomicGPUMemoryT::reset_nbors(const int nall, const int inum, int *ilist, + int *numj, int **firstneigh, bool &success) { + success=true; + + nbor_time_avail=true; + + int mn=nbor->max_nbor_loop(inum,numj); + resize_atom(inum,nall,success); + resize_local(inum,mn,success); + if (!success) + return false; + + nbor->get_host(inum,ilist,numj,firstneigh,block_size()); + + double bytes=atom->gpu_bytes()+nbor->gpu_bytes(); + if (bytes>_max_an_bytes) + _max_an_bytes=bytes; + + return ilist; +} + +// --------------------------------------------------------------------------- +// Build neighbor list on device +// --------------------------------------------------------------------------- +template +inline void AtomicGPUMemoryT::build_nbor_list(const int inum, + const int host_inum, + const int nall, double **host_x, + int *host_type, double *boxlo, + double *boxhi, int *tag, + int **nspecial, int **special, + bool &success) { + nbor_time_avail=true; + + success=true; + resize_atom(inum,nall,success); + 
resize_local(inum,host_inum,nbor->max_nbors(),success); + if (!success) + return; + atom->cast_copy_x(host_x,host_type); + + int mn; + nbor->build_nbor_list(inum, host_inum, nall, *atom, boxlo, boxhi, tag, + nspecial, special, success, mn); + + double bytes=atom->gpu_bytes()+nbor->gpu_bytes(); + if (bytes>_max_an_bytes) + _max_an_bytes=bytes; +} + +// --------------------------------------------------------------------------- +// Copy nbor list from host if necessary and then calculate forces, virials,.. +// --------------------------------------------------------------------------- +template +void AtomicGPUMemoryT::compute(const int timestep, const int f_ago, + const int inum_full, const int nall, + double **host_x, int *host_type, + int *ilist, int *numj, int **firstneigh, + const bool eflag, const bool vflag, + const bool eatom, const bool vatom, + int &host_start, const double cpu_time, + bool &success) { + acc_timers(); + if (inum_full==0) { + zero_timers(); + return; + } + + int ago=hd_balancer.ago_first(f_ago); + int inum=hd_balancer.balance(timestep,ago,inum_full,cpu_time, + nbor->gpu_nbor()); + atom->inum(inum); + host_start=inum; + + if (ago==0) { + reset_nbors(nall, inum, ilist, numj, firstneigh, success); + if (!success) + return; + } + + atom->cast_x_data(host_x,host_type); + hd_balancer.start_timer(); + atom->add_x_data(host_x,host_type); + + loop(eflag,vflag); + atom->copy_answers(eflag,vflag,eatom,vatom,ilist); + hd_balancer.stop_timer(); +} + +// --------------------------------------------------------------------------- +// Reneighbor on GPU if necessary and then compute forces, virials, energies +// --------------------------------------------------------------------------- +template +int * AtomicGPUMemoryT::compute(const int timestep, const int ago, + const int inum_full, const int nall, + double **host_x, int *host_type, double *boxlo, + double *boxhi, int *tag, int **nspecial, + int **special, const bool eflag, + const bool vflag, const bool eatom, + const bool vatom, int &host_start, + const double cpu_time, bool &success) { + acc_timers(); + if (inum_full==0) { + zero_timers(); + return NULL; + } + + hd_balancer.balance(cpu_time,nbor->gpu_nbor()); + int inum=hd_balancer.get_gpu_count(timestep,ago,inum_full); + atom->inum(inum); + host_start=inum; + + // Build neighbor list on GPU if necessary + if (ago==0) { + build_nbor_list(inum, inum_full-inum, nall, host_x, host_type, + boxlo, boxhi, tag, nspecial, special, success); + if (!success) + return NULL; + hd_balancer.start_timer(); + } else { + atom->cast_x_data(host_x,host_type); + hd_balancer.start_timer(); + atom->add_x_data(host_x,host_type); + } + + loop(eflag,vflag); + atom->copy_answers(eflag,vflag,eatom,vatom); + hd_balancer.stop_timer(); + + return device->nbor.host_nbor.begin(); +} + +template +double AtomicGPUMemoryT::host_memory_usage_atomic() const { + return device->atom.host_memory_usage()+ + device->nbor.host_memory_usage()+4*sizeof(numtyp)+ + sizeof(AtomicGPUMemory); +} + +template +void AtomicGPUMemoryT::compile_kernels(UCL_Device &dev, const char *pair_str) { + if (_compiled) + return; + + std::string flags="-cl-fast-relaxed-math -cl-mad-enable "+ + std::string(OCL_PRECISION_COMPILE); + + pair_program=new UCL_Program(dev); + pair_program->load_string(pair_str,flags.c_str()); + k_pair_fast.set_function(*pair_program,"kernel_pair_fast"); + k_pair.set_function(*pair_program,"kernel_pair"); + pos_tex.get_texture(*pair_program,"pos_tex"); + + _compiled=true; +} + +template class AtomicGPUMemory; + diff 
--git a/lib/gpu/atomic_gpu_memory.h b/lib/gpu/atomic_gpu_memory.h new file mode 100644 index 0000000000..91003f5c0d --- /dev/null +++ b/lib/gpu/atomic_gpu_memory.h @@ -0,0 +1,180 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing authors: Mike Brown (ORNL), brownw@ornl.gov +------------------------------------------------------------------------- */ + +#ifndef ATOMIC_GPU_MEMORY_H +#define ATOMIC_GPU_MEMORY_H + +#define BLOCK_1D 64 + +#include "pair_gpu_device.h" +#include "pair_gpu_balance.h" +#include "mpi.h" + +#ifdef USE_OPENCL +#include "geryon/ocl_texture.h" +#else +#include "geryon/nvd_texture.h" +#endif + +template +class AtomicGPUMemory { + public: + AtomicGPUMemory(); + virtual ~AtomicGPUMemory(); + + /// Clear any previous data and set up for a new LAMMPS run + /** \param max_nbors initial number of rows in the neighbor matrix + * \param cell_size cutoff + skin + * \param gpu_split fraction of particles handled by device **/ + bool init_atomic(const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, + const double gpu_split, FILE *screen, + const char *pair_program); + + /// Check if there is enough storage for atom arrays and realloc if not + /** \param success set to false if insufficient memory **/ + inline void resize_atom(const int inum, const int nall, bool &success) { + if (atom->resize(inum, nall, success)) + pos_tex.bind_float(atom->dev_x,4); + } + + /// Check if there is enough storage for neighbors and realloc if not + /** \param nlocal number of particles whose nbors must be stored on device + * \param host_inum number of particles whose nbors need to copied to host + * \param current maximum number of neighbors + * \note olist_size=total number of local particles **/ + inline void resize_local(const int inum, const int max_nbors, bool &success) { + nbor->resize(inum,max_nbors,success); + } + + /// Check if there is enough storage for neighbors and realloc if not + /** \param nlocal number of particles whose nbors must be stored on device + * \param host_inum number of particles whose nbors need to copied to host + * \param current maximum number of neighbors + * \note host_inum is 0 if the host is performing neighboring + * \note nlocal+host_inum=total number local particles + * \note olist_size=0 **/ + inline void resize_local(const int inum, const int host_inum, + const int max_nbors, bool &success) { + nbor->resize(inum,host_inum,max_nbors,success); + } + + /// Clear all host and device data + /** \note This is called at the beginning of the init() routine **/ + void clear_atomic(); + + /// Returns memory usage on device per atom + int bytes_per_atom_atomic(const int max_nbors) const; + + /// Total host memory used by library for pair style + double host_memory_usage_atomic() const; + + /// Accumulate timers + inline void acc_timers() { + if (nbor_time_avail) { + 
nbor->time_nbor.add_to_total(); + nbor->time_kernel.add_to_total(); + nbor_time_avail=false; + } + time_pair.add_to_total(); + atom->acc_timers(); + } + + /// Zero timers + inline void zero_timers() { + nbor_time_avail=false; + time_pair.zero(); + atom->zero_timers(); + } + + /// Copy neighbor list from host + int * reset_nbors(const int nall, const int inum, int *ilist, int *numj, + int **firstneigh, bool &success); + + /// Build neighbor list on device + void build_nbor_list(const int inum, const int host_inum, + const int nall, double **host_x, int *host_type, + double *boxlo, double *boxhi, int *tag, int **nspecial, + int **special, bool &success); + + /// Pair loop with host neighboring + void compute(const int timestep, const int f_ago, const int inum_full, + const int nall, double **host_x, int *host_type, + int *ilist, int *numj, int **firstneigh, const bool eflag, + const bool vflag, const bool eatom, const bool vatom, + int &host_start, const double cpu_time, bool &success); + + /// Pair loop with device neighboring + int * compute(const int timestep, const int ago, const int inum_full, + const int nall, double **host_x, int *host_type, double *boxlo, + double *boxhi, int *tag, int **nspecial, + int **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success); + + // -------------------------- DEVICE DATA ------------------------- + + /// Device Properties and Atom and Neighbor storage + PairGPUDevice *device; + + /// Geryon device + UCL_Device *ucl_device; + + /// Device Timers + UCL_Timer time_pair; + + /// Host device load balancer + PairGPUBalance hd_balancer; + + /// LAMMPS pointer for screen output + FILE *screen; + + // --------------------------- ATOM DATA -------------------------- + + /// Atom Data + PairGPUAtom *atom; + + + // --------------------------- NBOR DATA ---------------------------- + + /// Neighbor data + PairGPUNbor *nbor; + + /// True if we need to accumulate time for neighboring + bool nbor_time_avail; + + // ------------------------- DEVICE KERNELS ------------------------- + UCL_Program *pair_program; + UCL_Kernel k_pair_fast, k_pair; + inline int block_size() { return _block_size; } + + // --------------------------- TEXTURES ----------------------------- + UCL_Texture pos_tex; + + protected: + bool _compiled; + int _block_size; + double _max_bytes, _max_an_bytes; + + void compile_kernels(UCL_Device &dev, const char *pair_string); + + virtual void loop(const bool _eflag, const bool _vflag) = 0; +}; + +#endif + + diff --git a/lib/gpu/charge_gpu_memory.cpp b/lib/gpu/charge_gpu_memory.cpp new file mode 100644 index 0000000000..a14f7b7205 --- /dev/null +++ b/lib/gpu/charge_gpu_memory.cpp @@ -0,0 +1,270 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Charge/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. 
+------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing authors: Mike Brown (ORNL), brownw@ornl.gov +------------------------------------------------------------------------- */ + +#include "charge_gpu_memory.h" +#define ChargeGPUMemoryT ChargeGPUMemory + +extern PairGPUDevice pair_gpu_device; + +template +ChargeGPUMemoryT::ChargeGPUMemory() : _compiled(false), _max_bytes(0) { + device=&pair_gpu_device; +} + +template +ChargeGPUMemoryT::~ChargeGPUMemory() { +} + +template +int ChargeGPUMemoryT::bytes_per_atom_atomic(const int max_nbors) const { + return device->atom.bytes_per_atom()+device->nbor.bytes_per_atom(max_nbors); +} + +template +bool ChargeGPUMemoryT::init_atomic(const int nlocal, const int nall, + const int max_nbors, const int maxspecial, + const double cell_size, + const double gpu_split, FILE *_screen, + const char *pair_program) { + nbor_time_avail=false; + screen=_screen; + + bool gpu_nbor=false; + if (device->gpu_mode()==PairGPUDevice::GPU_NEIGH) + gpu_nbor=true; + + int _gpu_host=0; + int host_nlocal=hd_balancer.first_host_count(nlocal,gpu_nbor,gpu_split); + if (host_nlocal>0) + _gpu_host=1; + + if (!device->init(true,false,nlocal,host_nlocal,nall,maxspecial,gpu_nbor, + _gpu_host,max_nbors,cell_size,false)) + return false; + ucl_device=device->gpu; + atom=&device->atom; + nbor=&device->nbor; + + _block_size=BLOCK_1D; + if (static_cast(_block_size)>ucl_device->group_size()) + _block_size=ucl_device->group_size(); + compile_kernels(*ucl_device,pair_program); + + // Initialize host-device load balancer + hd_balancer.init(device,gpu_split); + + // Initialize timers for the selected GPU + time_pair.init(*ucl_device); + time_pair.zero(); + + pos_tex.bind_float(atom->dev_x,4); + q_tex.bind_float(atom->dev_q,1); + + _max_an_bytes=atom->gpu_bytes()+nbor->gpu_bytes(); + + return true; +} + +template +void ChargeGPUMemoryT::clear_atomic() { + // Output any timing information + acc_timers(); + double avg_split=hd_balancer.all_avg_split(); + device->output_times(time_pair,avg_split,_max_bytes+_max_an_bytes,screen); + + if (_compiled) { + k_pair_fast.clear(); + k_pair.clear(); + delete pair_program; + _compiled=false; + } + + time_pair.clear(); + hd_balancer.clear(); + + device->clear(); +} + +// --------------------------------------------------------------------------- +// Copy neighbor list from host +// --------------------------------------------------------------------------- +template +int * ChargeGPUMemoryT::reset_nbors(const int nall, const int inum, int *ilist, + int *numj, int **firstneigh, bool &success) { + success=true; + + nbor_time_avail=true; + + int mn=nbor->max_nbor_loop(inum,numj); + resize_atom(inum,nall,success); + resize_local(inum,mn,success); + if (!success) + return false; + + nbor->get_host(inum,ilist,numj,firstneigh,block_size()); + + double bytes=atom->gpu_bytes()+nbor->gpu_bytes(); + if (bytes>_max_an_bytes) + _max_an_bytes=bytes; + + return ilist; +} + +// --------------------------------------------------------------------------- +// Build neighbor list on device +// --------------------------------------------------------------------------- +template +inline void ChargeGPUMemoryT::build_nbor_list(const int inum, + const int host_inum, + const int nall, double **host_x, + int *host_type, double *boxlo, + double *boxhi, int *tag, + int **nspecial, int **special, + bool &success) { + nbor_time_avail=true; + + success=true; + 
resize_atom(inum,nall,success); + resize_local(inum,host_inum,nbor->max_nbors(),success); + if (!success) + return; + atom->cast_copy_x(host_x,host_type); + + int mn; + nbor->build_nbor_list(inum, host_inum, nall, *atom, boxlo, boxhi, tag, + nspecial, special, success, mn); + + double bytes=atom->gpu_bytes()+nbor->gpu_bytes(); + if (bytes>_max_an_bytes) + _max_an_bytes=bytes; +} + +// --------------------------------------------------------------------------- +// Copy nbor list from host if necessary and then calculate forces, virials,.. +// --------------------------------------------------------------------------- +template +void ChargeGPUMemoryT::compute(const int timestep, const int f_ago, + const int inum_full, const int nall, + double **host_x, int *host_type, + int *ilist, int *numj, int **firstneigh, + const bool eflag, const bool vflag, + const bool eatom, const bool vatom, + int &host_start, const double cpu_time, + bool &success, double *host_q) { + acc_timers(); + if (inum_full==0) { + zero_timers(); + return; + } + + int ago=hd_balancer.ago_first(f_ago); + int inum=hd_balancer.balance(timestep,ago,inum_full,cpu_time, + nbor->gpu_nbor()); + atom->inum(inum); + host_start=inum; + + if (ago==0) { + reset_nbors(nall, inum, ilist, numj, firstneigh, success); + if (!success) + return; + } + + atom->cast_x_data(host_x,host_type); + atom->cast_q_data(host_q); + hd_balancer.start_timer(); + atom->add_x_data(host_x,host_type); + atom->add_other_data(); + + loop(eflag,vflag); + atom->copy_answers(eflag,vflag,eatom,vatom,ilist); + hd_balancer.stop_timer(); +} + +// --------------------------------------------------------------------------- +// Reneighbor on GPU if necessary and then compute forces, virials, energies +// --------------------------------------------------------------------------- +template +int * ChargeGPUMemoryT::compute(const int timestep, const int ago, + const int inum_full, const int nall, + double **host_x, int *host_type, double *boxlo, + double *boxhi, int *tag, int **nspecial, + int **special, const bool eflag, + const bool vflag, const bool eatom, + const bool vatom, int &host_start, + const double cpu_time, bool &success, + double *host_q) { + acc_timers(); + if (inum_full==0) { + zero_timers(); + return NULL; + } + + hd_balancer.balance(cpu_time,nbor->gpu_nbor()); + int inum=hd_balancer.get_gpu_count(timestep,ago,inum_full); + atom->inum(inum); + host_start=inum; + + // Build neighbor list on GPU if necessary + if (ago==0) { + build_nbor_list(inum, inum_full-inum, nall, host_x, host_type, + boxlo, boxhi, tag, nspecial, special, success); + if (!success) + return NULL; + atom->cast_q_data(host_q); + hd_balancer.start_timer(); + } else { + atom->cast_x_data(host_x,host_type); + atom->cast_q_data(host_q); + hd_balancer.start_timer(); + atom->add_x_data(host_x,host_type); + } + atom->add_other_data(); + + loop(eflag,vflag); + atom->copy_answers(eflag,vflag,eatom,vatom); + hd_balancer.stop_timer(); + + return device->nbor.host_nbor.begin(); +} + +template +double ChargeGPUMemoryT::host_memory_usage_atomic() const { + return device->atom.host_memory_usage()+ + device->nbor.host_memory_usage()+4*sizeof(numtyp)+ + sizeof(ChargeGPUMemory); +} + +template +void ChargeGPUMemoryT::compile_kernels(UCL_Device &dev, const char *pair_str) { + if (_compiled) + return; + + std::string flags="-cl-fast-relaxed-math -cl-mad-enable "+ + std::string(OCL_PRECISION_COMPILE); + + pair_program=new UCL_Program(dev); + pair_program->load_string(pair_str,flags.c_str()); + 
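  // Resolve the kernel entry points and texture references in the newly
  // compiled program; loop() later dispatches to either the shared-memory
  // ("fast") kernel or the generic one depending on the number of atom types.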
k_pair_fast.set_function(*pair_program,"kernel_pair_fast"); + k_pair.set_function(*pair_program,"kernel_pair"); + pos_tex.get_texture(*pair_program,"pos_tex"); + q_tex.get_texture(*pair_program,"q_tex"); + + _compiled=true; +} + +template class ChargeGPUMemory; + diff --git a/lib/gpu/charge_gpu_memory.h b/lib/gpu/charge_gpu_memory.h new file mode 100644 index 0000000000..c53f897118 --- /dev/null +++ b/lib/gpu/charge_gpu_memory.h @@ -0,0 +1,183 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Charge/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing authors: Mike Brown (ORNL), brownw@ornl.gov +------------------------------------------------------------------------- */ + +#ifndef CHARGE_GPU_MEMORY_H +#define CHARGE_GPU_MEMORY_H + +#define BLOCK_1D 64 + +#include "pair_gpu_device.h" +#include "pair_gpu_balance.h" +#include "mpi.h" + +#ifdef USE_OPENCL +#include "geryon/ocl_texture.h" +#else +#include "geryon/nvd_texture.h" +#endif + +template +class ChargeGPUMemory { + public: + ChargeGPUMemory(); + virtual ~ChargeGPUMemory(); + + /// Clear any previous data and set up for a new LAMMPS run + /** \param max_nbors initial number of rows in the neighbor matrix + * \param cell_size cutoff + skin + * \param gpu_split fraction of particles handled by device **/ + bool init_atomic(const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, + const double gpu_split, FILE *screen, + const char *pair_program); + + /// Check if there is enough storage for atom arrays and realloc if not + /** \param success set to false if insufficient memory **/ + inline void resize_atom(const int inum, const int nall, bool &success) { + if (atom->resize(inum, nall, success)) { + pos_tex.bind_float(atom->dev_x,4); + q_tex.bind_float(atom->dev_q,1); + } + } + + /// Check if there is enough storage for neighbors and realloc if not + /** \param nlocal number of particles whose nbors must be stored on device + * \param host_inum number of particles whose nbors need to copied to host + * \param current maximum number of neighbors + * \note olist_size=total number of local particles **/ + inline void resize_local(const int inum, const int max_nbors, bool &success) { + nbor->resize(inum,max_nbors,success); + } + + /// Check if there is enough storage for neighbors and realloc if not + /** \param nlocal number of particles whose nbors must be stored on device + * \param host_inum number of particles whose nbors need to copied to host + * \param current maximum number of neighbors + * \note host_inum is 0 if the host is performing neighboring + * \note nlocal+host_inum=total number local particles + * \note olist_size=0 **/ + inline void resize_local(const int inum, const int host_inum, + const int max_nbors, bool &success) { + nbor->resize(inum,host_inum,max_nbors,success); + } + + /// Clear all host and device data + /** \note This is called at the beginning of the init() routine **/ + 
void clear_atomic(); + + /// Returns memory usage on device per atom + int bytes_per_atom_atomic(const int max_nbors) const; + + /// Total host memory used by library for pair style + double host_memory_usage_atomic() const; + + /// Accumulate timers + inline void acc_timers() { + if (nbor_time_avail) { + nbor->time_nbor.add_to_total(); + nbor->time_kernel.add_to_total(); + nbor_time_avail=false; + } + time_pair.add_to_total(); + atom->acc_timers(); + } + + /// Zero timers + inline void zero_timers() { + nbor_time_avail=false; + time_pair.zero(); + atom->zero_timers(); + } + + /// Copy neighbor list from host + int * reset_nbors(const int nall, const int inum, int *ilist, int *numj, + int **firstneigh, bool &success); + + /// Build neighbor list on device + void build_nbor_list(const int inum, const int host_inum, + const int nall, double **host_x, int *host_type, + double *boxlo, double *boxhi, int *tag, int **nspecial, + int **special, bool &success); + + /// Pair loop with host neighboring + void compute(const int timestep, const int f_ago, const int inum_full, + const int nall, double **host_x, int *host_type, + int *ilist, int *numj, int **firstneigh, const bool eflag, + const bool vflag, const bool eatom, const bool vatom, + int &host_start, const double cpu_time, bool &success, + double *charge); + + /// Pair loop with device neighboring + int * compute(const int timestep, const int ago, const int inum_full, + const int nall, double **host_x, int *host_type, double *boxlo, + double *boxhi, int *tag, int **nspecial, + int **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success, double *charge); + + // -------------------------- DEVICE DATA ------------------------- + + /// Device Properties and Atom and Neighbor storage + PairGPUDevice *device; + + /// Geryon device + UCL_Device *ucl_device; + + /// Device Timers + UCL_Timer time_pair; + + /// Host device load balancer + PairGPUBalance hd_balancer; + + /// LAMMPS pointer for screen output + FILE *screen; + + // --------------------------- ATOM DATA -------------------------- + + /// Atom Data + PairGPUAtom *atom; + + + // --------------------------- NBOR DATA ---------------------------- + + /// Neighbor data + PairGPUNbor *nbor; + + /// True if we need to accumulate time for neighboring + bool nbor_time_avail; + + // ------------------------- DEVICE KERNELS ------------------------- + UCL_Program *pair_program; + UCL_Kernel k_pair_fast, k_pair; + inline int block_size() { return _block_size; } + + // --------------------------- TEXTURES ----------------------------- + UCL_Texture pos_tex; + UCL_Texture q_tex; + + protected: + bool _compiled; + int _block_size; + double _max_bytes, _max_an_bytes; + + void compile_kernels(UCL_Device &dev, const char *pair_string); + + virtual void loop(const bool _eflag, const bool _vflag) = 0; +}; + +#endif + diff --git a/lib/gpu/cmm_cut_gpu.cpp b/lib/gpu/cmm_cut_gpu.cpp new file mode 100644 index 0000000000..11e0e912eb --- /dev/null +++ b/lib/gpu/cmm_cut_gpu.cpp @@ -0,0 +1,124 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. 
This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing authors: Mike Brown (ORNL), brownw@ornl.gov +------------------------------------------------------------------------- */ + +#include +#include +#include + +#include "cmm_cut_gpu_memory.h" + +using namespace std; + +static CMM_GPU_Memory CMMMF; + +// --------------------------------------------------------------------------- +// Allocate memory on host and device and copy constants to device +// --------------------------------------------------------------------------- +bool cmm_gpu_init(const int ntypes, double **cutsq, int **cg_types, + double **host_lj1, double **host_lj2, double **host_lj3, + double **host_lj4, double **offset, double *special_lj, + const int inum, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, int &gpu_mode, + FILE *screen) { + CMMMF.clear(); + gpu_mode=CMMMF.device->gpu_mode(); + double gpu_split=CMMMF.device->particle_split(); + int first_gpu=CMMMF.device->first_device(); + int last_gpu=CMMMF.device->last_device(); + int world_me=CMMMF.device->world_me(); + int gpu_rank=CMMMF.device->gpu_rank(); + int procs_per_gpu=CMMMF.device->procs_per_gpu(); + + CMMMF.device->init_message(screen,"cg/cmm",first_gpu,last_gpu); + + bool message=false; + if (world_me==0 && screen) + message=true; + + if (message) { + fprintf(screen,"Initializing GPU and compiling on process 0..."); + fflush(screen); + } + + if (world_me==0) { + bool init_ok=CMMMF.init(ntypes,cutsq,cg_types,host_lj1,host_lj2,host_lj3, + host_lj4, offset, special_lj, inum, nall, 300, + maxspecial, cell_size, gpu_split, screen); + if (!init_ok) + return false; + } + + MPI_Barrier(MPI_COMM_WORLD); + if (message) + fprintf(screen,"Done.\n"); + + for (int i=0; igpu_comm); + if (message) + fprintf(screen,"Done.\n"); + } + if (message) + fprintf(screen,"\n"); + return true; +} + +void cmm_gpu_clear() { + CMMMF.clear(); +} + +int * cmm_gpu_compute_n(const int timestep, const int ago, const int inum_full, + const int nall, double **host_x, int *host_type, + double *boxlo, double *boxhi, int *tag, int **nspecial, + int **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success) { + return CMMMF.compute(timestep, ago, inum_full, nall, host_x, host_type, boxlo, + boxhi, tag, nspecial, special, eflag, vflag, eatom, + vatom, host_start, cpu_time, success); +} + +void cmm_gpu_compute(const int timestep, const int ago, const int inum_full, + const int nall, double **host_x, int *host_type, + int *ilist, int *numj, int **firstneigh, + const bool eflag, const bool vflag, const bool eatom, + const bool vatom, int &host_start, const double cpu_time, + bool &success) { + CMMMF.compute(timestep,ago,inum_full,nall,host_x,host_type,ilist,numj, + firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success); +} + +double cmm_gpu_bytes() { + return CMMMF.host_memory_usage(); +} + + diff --git a/lib/gpu/cmm_cut_gpu_kernel.cu b/lib/gpu/cmm_cut_gpu_kernel.cu new file mode 100644 index 0000000000..2288cf1df9 --- /dev/null +++ b/lib/gpu/cmm_cut_gpu_kernel.cu @@ -0,0 +1,296 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + 
http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing authors: Mike Brown (ORNL), brownw@ornl.gov +------------------------------------------------------------------------- */ + +#ifndef CMM_GPU_KERNEL +#define CMM_GPU_KERNEL + +#define MAX_SHARED_TYPES 8 + +#ifdef _DOUBLE_DOUBLE +#define numtyp double +#define numtyp2 double2 +#define numtyp4 double4 +#define acctyp double +#define acctyp4 double4 +#endif + +#ifdef _SINGLE_DOUBLE +#define numtyp float +#define numtyp2 float2 +#define numtyp4 float4 +#define acctyp double +#define acctyp4 double4 +#endif + +#ifndef numtyp +#define numtyp float +#define numtyp2 float2 +#define numtyp4 float4 +#define acctyp float +#define acctyp4 float4 +#endif + +#ifdef NV_KERNEL + +#include "geryon/ucl_nv_kernel.h" +texture pos_tex; + +#ifdef _DOUBLE_DOUBLE +__inline double4 fetch_pos(const int& i, const double4 *pos) +{ + return pos[i]; +} +#else +__inline float4 fetch_pos(const int& i, const float4 *pos) +{ + return tex1Dfetch(pos_tex, i); +} +#endif + +#else + +#pragma OPENCL EXTENSION cl_khr_fp64: enable +#define GLOBAL_ID_X get_global_id(0) +#define THREAD_ID_X get_local_id(0) +#define BLOCK_ID_X get_group_id(0) +#define BLOCK_SIZE_X get_local_size(0) +#define __syncthreads() barrier(CLK_LOCAL_MEM_FENCE) +#define __inline inline + +#define fetch_pos(i,y) x_[i] + +#endif + +__kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1, + __global numtyp4* lj3, const int lj_types, + __global numtyp *sp_lj_in, __global int *dev_nbor, + __global acctyp4 *ans, __global acctyp *engv, + const int eflag, const int vflag, const int inum, + const int nall, const int nbor_pitch) { + // ii indexes the two interacting particles in gi + int ii=GLOBAL_ID_X; + __local numtyp sp_lj[4]; + sp_lj[0]=sp_lj_in[0]; + sp_lj[1]=sp_lj_in[1]; + sp_lj[2]=sp_lj_in[2]; + sp_lj[3]=sp_lj_in[3]; + + if (ii0) + energy += factor_lj*inv1*(lj3[mtype].x*inv2-lj3[mtype].y)- + lj3[mtype].z; + if (vflag>0) { + virial[0] += delx*delx*force; + virial[1] += dely*dely*force; + virial[2] += delz*delz*force; + virial[3] += delx*dely*force; + virial[4] += delx*delz*force; + virial[5] += dely*delz*force; + } + } + + } // for nbor + + // Store answers + __global acctyp *ap1=engv+ii; + if (eflag>0) { + *ap1=energy; + ap1+=inum; + } + if (vflag>0) { + for (int i=0; i<6; i++) { + *ap1=virial[i]; + ap1+=inum; + } + } + ans[ii]=f; + } // if ii +} + +__kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in, + __global numtyp4* lj3_in, + __global numtyp* sp_lj_in,__global int *dev_nbor, + __global acctyp4 *ans, __global acctyp *engv, + const int eflag, const int vflag, const int inum, + const int nall, const int nbor_pitch) { + // ii indexes the two interacting particles in gi + int ii=THREAD_ID_X; + __local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; + __local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; + __local numtyp sp_lj[4]; + if (ii<4) + sp_lj[ii]=sp_lj_in[ii]; + if (ii0) + lj3[ii]=lj3_in[ii]; + } + ii+=mul24((int)BLOCK_ID_X,(int)BLOCK_SIZE_X); + 
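  // Barrier: the per-type coefficient and special-LJ tables staged into
  // shared memory above must be complete before any thread enters its
  // neighbor loop.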
__syncthreads(); + + if (ii0) + energy += factor_lj*inv1*(lj3[mtype].x*inv2-lj3[mtype].y)- + lj3[mtype].z; + if (vflag>0) { + virial[0] += delx*delx*force; + virial[1] += dely*dely*force; + virial[2] += delz*delz*force; + virial[3] += delx*dely*force; + virial[4] += delx*delz*force; + virial[5] += dely*delz*force; + } + } + + } // for nbor + + // Store answers + __global acctyp *ap1=engv+ii; + if (eflag>0) { + *ap1=energy; + ap1+=inum; + } + if (vflag>0) { + for (int i=0; i<6; i++) { + *ap1=virial[i]; + ap1+=inum; + } + } + ans[ii]=f; + } // if ii*/ +} + +#endif + diff --git a/lib/gpu/cmm_cut_gpu_memory.cpp b/lib/gpu/cmm_cut_gpu_memory.cpp new file mode 100644 index 0000000000..bbef5e2119 --- /dev/null +++ b/lib/gpu/cmm_cut_gpu_memory.cpp @@ -0,0 +1,150 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing authors: Mike Brown (ORNL), brownw@ornl.gov +------------------------------------------------------------------------- */ + +#ifdef USE_OPENCL +#include "cmm_cut_gpu_cl.h" +#else +#include "cmm_cut_gpu_ptx.h" +#endif + +#include "cmm_cut_gpu_memory.h" +#include +#define CMM_GPU_MemoryT CMM_GPU_Memory + +extern PairGPUDevice pair_gpu_device; + +template +CMM_GPU_MemoryT::CMM_GPU_Memory() : AtomicGPUMemory(), _allocated(false) { +} + +template +CMM_GPU_MemoryT::~CMM_GPU_Memory() { + clear(); +} + +template +int CMM_GPU_MemoryT::bytes_per_atom(const int max_nbors) const { + return this->bytes_per_atom_atomic(max_nbors); +} + +template +bool CMM_GPU_MemoryT::init(const int ntypes, double **host_cutsq, + int **host_cg_type, double **host_lj1, + double **host_lj2, double **host_lj3, + double **host_lj4, double **host_offset, + double *host_special_lj, const int nlocal, + const int nall, const int max_nbors, + const int maxspecial, const double cell_size, + const double gpu_split, FILE *_screen) { + this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split, + _screen,cmm_cut_gpu_kernel); + + // If atom type constants fit in shared memory use fast kernel + int cmm_types=ntypes; + shared_types=false; + if (cmm_types<=MAX_SHARED_TYPES && this->_block_size>=MAX_SHARED_TYPES) { + cmm_types=MAX_SHARED_TYPES; + shared_types=true; + } + _cmm_types=cmm_types; + + // Allocate a host write buffer for data initialization + UCL_H_Vec host_write(cmm_types*cmm_types*32,*(this->ucl_device), + UCL_WRITE_OPTIMIZED); + + for (int i=0; iucl_device),UCL_READ_ONLY); + this->atom->type_pack4(ntypes,cmm_types,lj1,host_write,host_cutsq, + host_cg_type,host_lj1,host_lj2); + + lj3.alloc(cmm_types*cmm_types,*(this->ucl_device),UCL_READ_ONLY); + this->atom->type_pack4(ntypes,cmm_types,lj3,host_write,host_lj3,host_lj4, + host_offset); + + UCL_H_Vec dview; + sp_lj.alloc(4,*(this->ucl_device),UCL_READ_ONLY); + dview.view(host_special_lj,4,*(this->ucl_device)); + ucl_copy(sp_lj,dview,false); + + _allocated=true; + 
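  // Record device memory used by the per-type tables; this is reported
  // together with atom/neighbor storage when the style is cleared.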
this->_max_bytes=lj1.row_bytes()+lj3.row_bytes()+sp_lj.row_bytes(); + return true; +} + +template +void CMM_GPU_MemoryT::clear() { + if (!_allocated) + return; + _allocated=false; + + lj1.clear(); + lj3.clear(); + sp_lj.clear(); + this->clear_atomic(); +} + +template +double CMM_GPU_MemoryT::host_memory_usage() const { + return this->host_memory_usage_atomic()+sizeof(CMM_GPU_Memory); +} + +// --------------------------------------------------------------------------- +// Calculate energies, forces, and torques +// --------------------------------------------------------------------------- +template +void CMM_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) { + // Compute the block size and grid size to keep all cores busy + const int BX=this->block_size(); + int eflag, vflag; + if (_eflag) + eflag=1; + else + eflag=0; + + if (_vflag) + vflag=1; + else + vflag=0; + + int GX=static_cast(ceil(static_cast(this->atom->inum())/BX)); + + int ainum=this->atom->inum(); + int anall=this->atom->nall(); + int nbor_pitch=this->nbor->nbor_pitch(); + this->time_pair.start(); + if (shared_types) { + this->k_pair_fast.set_size(GX,BX); + this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(), + &lj3.begin(), &sp_lj.begin(), + &this->nbor->dev_nbor.begin(), + &this->atom->dev_ans.begin(), + &this->atom->dev_engv.begin(), &eflag, &vflag, + &ainum, &anall, &nbor_pitch); + } else { + this->k_pair.set_size(GX,BX); + this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(), + &_cmm_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(), + &this->atom->dev_ans.begin(), + &this->atom->dev_engv.begin(), &eflag, &vflag, &ainum, + &anall, &nbor_pitch); + } + this->time_pair.stop(); +} + +template class CMM_GPU_Memory; diff --git a/lib/gpu/cmm_cut_gpu_memory.h b/lib/gpu/cmm_cut_gpu_memory.h new file mode 100644 index 0000000000..b74b809e29 --- /dev/null +++ b/lib/gpu/cmm_cut_gpu_memory.h @@ -0,0 +1,71 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. 
+------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing authors: Mike Brown (ORNL), brownw@ornl.gov +------------------------------------------------------------------------- */ + +#ifndef CMM_GPU_MEMORY_H +#define CMM_GPU_MEMORY_H + +#include "atomic_gpu_memory.h" + +template +class CMM_GPU_Memory : public AtomicGPUMemory { + public: + CMM_GPU_Memory(); + ~CMM_GPU_Memory(); + + /// Clear any previous data and set up for a new LAMMPS run + /** \param max_nbors initial number of rows in the neighbor matrix + * \param cell_size cutoff + skin + * \param gpu_split fraction of particles handled by device **/ + bool init(const int ntypes, double **host_cutsq, int **host_cg_type, + double **host_lj1, double **host_lj2, double **host_lj3, + double **host_lj4, double **host_offset, double *host_special_lj, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, + const double gpu_split, FILE *screen); + + /// Clear all host and device data + /** \note This is called at the beginning of the init() routine **/ + void clear(); + + /// Returns memory usage on device per atom + int bytes_per_atom(const int max_nbors) const; + + /// Total host memory used by library for pair style + double host_memory_usage() const; + + // --------------------------- TYPE DATA -------------------------- + + /// lj1.x = cutsq, lj1.y=cg_type, lj1.z = lj1, lj1.w = lj2 + UCL_D_Vec lj1; + /// lj3.x = lj3, lj3.y = lj4, lj3.z = offset + UCL_D_Vec lj3; + /// Special LJ values + UCL_D_Vec sp_lj; + + /// If atom type constants fit in shared memory, use fast kernels + bool shared_types; + + /// Number of atom types + int _cmm_types; + + private: + bool _allocated; + void loop(const bool _eflag, const bool _vflag); +}; + +#endif + diff --git a/lib/gpu/cmmc_long_gpu.cpp b/lib/gpu/cmmc_long_gpu.cpp new file mode 100644 index 0000000000..a647a08e4d --- /dev/null +++ b/lib/gpu/cmmc_long_gpu.cpp @@ -0,0 +1,130 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. 
+------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing authors: Mike Brown (ORNL), brownw@ornl.gov +------------------------------------------------------------------------- */ + +#include +#include +#include + +#include "cmmc_long_gpu_memory.h" + +using namespace std; + +static CMML_GPU_Memory CMMLMF; + +// --------------------------------------------------------------------------- +// Allocate memory on host and device and copy constants to device +// --------------------------------------------------------------------------- +bool cmml_gpu_init(const int ntypes, double **cutsq, int **cg_type, + double **host_lj1, double **host_lj2, double **host_lj3, + double **host_lj4, double **offset, double *special_lj, + const int inum, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, int &gpu_mode, + FILE *screen, double **host_cut_ljsq, double host_cut_coulsq, + double *host_special_coul, const double qqrd2e, + const double g_ewald) { + CMMLMF.clear(); + gpu_mode=CMMLMF.device->gpu_mode(); + double gpu_split=CMMLMF.device->particle_split(); + int first_gpu=CMMLMF.device->first_device(); + int last_gpu=CMMLMF.device->last_device(); + int world_me=CMMLMF.device->world_me(); + int gpu_rank=CMMLMF.device->gpu_rank(); + int procs_per_gpu=CMMLMF.device->procs_per_gpu(); + + CMMLMF.device->init_message(screen,"cg/cmm/coul/long",first_gpu,last_gpu); + + bool message=false; + if (world_me==0 && screen) + message=true; + + if (message) { + fprintf(screen,"Initializing GPU and compiling on process 0..."); + fflush(screen); + } + + if (world_me==0) { + bool init_ok=CMMLMF.init(ntypes, cutsq, cg_type, host_lj1, host_lj2, + host_lj3, host_lj4, offset, special_lj, inum, + nall, 300, maxspecial, cell_size, gpu_split, + screen, host_cut_ljsq, host_cut_coulsq, + host_special_coul, qqrd2e,g_ewald); + if (!init_ok) + return false; + } + + MPI_Barrier(MPI_COMM_WORLD); + if (message) + fprintf(screen,"Done.\n"); + + for (int i=0; igpu_comm); + if (message) + fprintf(screen,"Done.\n"); + } + if (message) + fprintf(screen,"\n"); + return true; +} + +void cmml_gpu_clear() { + CMMLMF.clear(); +} + +int * cmml_gpu_compute_n(const int timestep, const int ago, const int inum_full, + const int nall, double **host_x, int *host_type, + double *boxlo, double *boxhi, int *tag, int **nspecial, + int **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success, double *host_q) { + return CMMLMF.compute(timestep, ago, inum_full, nall, host_x, host_type, boxlo, + boxhi, tag, nspecial, special, eflag, vflag, eatom, + vatom, host_start, cpu_time, success, host_q); +} + +void cmml_gpu_compute(const int timestep, const int ago, const int inum_full, + const int nall, double **host_x, int *host_type, + int *ilist, int *numj, int **firstneigh, + const bool eflag, const bool vflag, const bool eatom, + const bool vatom, int &host_start, const double cpu_time, + bool &success, double *host_q) { + CMMLMF.compute(timestep,ago,inum_full,nall,host_x,host_type,ilist,numj, + firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success, + host_q); +} + +double cmml_gpu_bytes() { + return CMMLMF.host_memory_usage(); +} + + diff --git a/lib/gpu/cmmc_long_gpu_kernel.cu b/lib/gpu/cmmc_long_gpu_kernel.cu new file mode 100644 index 0000000000..23debb6b53 --- /dev/null +++ b/lib/gpu/cmmc_long_gpu_kernel.cu @@ -0,0 +1,378 @@ +/* 
---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing authors: Mike Brown (ORNL), brownw@ornl.gov +------------------------------------------------------------------------- */ + +#ifndef CMML_GPU_KERNEL +#define CMML_GPU_KERNEL + +#define MAX_SHARED_TYPES 8 + +#ifdef _DOUBLE_DOUBLE +#define numtyp double +#define numtyp2 double2 +#define numtyp4 double4 +#define acctyp double +#define acctyp4 double4 +#endif + +#ifdef _SINGLE_DOUBLE +#define numtyp float +#define numtyp2 float2 +#define numtyp4 float4 +#define acctyp double +#define acctyp4 double4 +#endif + +#ifndef numtyp +#define numtyp float +#define numtyp2 float2 +#define numtyp4 float4 +#define acctyp float +#define acctyp4 float4 +#endif + +#define EWALD_F (numtyp)1.12837917 +#define EWALD_P (numtyp)0.3275911 +#define A1 (numtyp)0.254829592 +#define A2 (numtyp)-0.284496736 +#define A3 (numtyp)1.421413741 +#define A4 (numtyp)-1.453152027 +#define A5 (numtyp)1.061405429 + +#ifdef NV_KERNEL + +#include "geryon/ucl_nv_kernel.h" +texture pos_tex; +texture q_tex; + +#ifdef _DOUBLE_DOUBLE +__inline double4 fetch_pos(const int& i, const double4 *pos) +{ + return pos[i]; +} +__inline double fetch_q(const int& i, const double *q) +{ + return q[i]; +} +#else +__inline float4 fetch_pos(const int& i, const float4 *pos) +{ + return tex1Dfetch(pos_tex, i); +} +__inline float fetch_q(const int& i, const float *q) +{ + return tex1Dfetch(q_tex, i); +} +#endif + +#else + +#pragma OPENCL EXTENSION cl_khr_fp64: enable +#define GLOBAL_ID_X get_global_id(0) +#define THREAD_ID_X get_local_id(0) +#define BLOCK_ID_X get_group_id(0) +#define BLOCK_SIZE_X get_local_size(0) +#define __syncthreads() barrier(CLK_LOCAL_MEM_FENCE) +#define __inline inline + +#define fetch_pos(i,y) x_[i] +#define fetch_q(i,y) q_[i] + +#endif + +__kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1, + __global numtyp4* lj3, const int lj_types, + __global numtyp *sp_lj_in, __global int *dev_nbor, + __global acctyp4 *ans, __global acctyp *engv, + const int eflag, const int vflag, const int inum, + const int nall, const int nbor_pitch, + __global numtyp *q_ , const numtyp cut_coulsq, + const numtyp qqrd2e, const numtyp g_ewald) { + // ii indexes the two interacting particles in gi + int ii=GLOBAL_ID_X; + __local numtyp sp_lj[8]; + sp_lj[0]=sp_lj_in[0]; + sp_lj[1]=sp_lj_in[1]; + sp_lj[2]=sp_lj_in[2]; + sp_lj[3]=sp_lj_in[3]; + sp_lj[4]=sp_lj_in[4]; + sp_lj[5]=sp_lj_in[5]; + sp_lj[6]=sp_lj_in[6]; + sp_lj[7]=sp_lj_in[7]; + + if (ii0) { + e_coul += prefactor*(_erfc-factor_coul); + if (rsq < lj1[mtype].y) { + energy += factor_lj*inv1*(lj3[mtype].y*inv2-lj3[mtype].z)- + lj3[mtype].w; + } + } + if (vflag>0) { + virial[0] += delx*delx*force; + virial[1] += dely*dely*force; + virial[2] += delz*delz*force; + virial[3] += delx*dely*force; + virial[4] += delx*delz*force; + virial[5] += dely*delz*force; + } + } + + } // for nbor + + // Store 
answers + __global acctyp *ap1=engv+ii; + if (eflag>0) { + *ap1=energy; + ap1+=inum; + *ap1=e_coul; + ap1+=inum; + } + if (vflag>0) { + for (int i=0; i<6; i++) { + *ap1=virial[i]; + ap1+=inum; + } + } + ans[ii]=f; + } // if ii +} + +__kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in, + __global numtyp4* lj3_in, + __global numtyp* sp_lj_in, __global int *dev_nbor, + __global acctyp4 *ans, __global acctyp *engv, + const int eflag, const int vflag, const int inum, + const int nall, const int nbor_pitch, + __global numtyp *q_ , const numtyp cut_coulsq, + const numtyp qqrd2e, const numtyp g_ewald) { + // ii indexes the two interacting particles in gi + int ii=THREAD_ID_X; + __local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; + __local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; + __local numtyp sp_lj[8]; + if (ii<8) + sp_lj[ii]=sp_lj_in[ii]; + if (ii0) { + e_coul += prefactor*(_erfc-factor_coul); + if (rsq < lj1[mtype].y) { + energy += factor_lj*inv1*(lj3[mtype].y*inv2-lj3[mtype].z)- + lj3[mtype].w; + } + } + if (vflag>0) { + virial[0] += delx*delx*force; + virial[1] += dely*dely*force; + virial[2] += delz*delz*force; + virial[3] += delx*dely*force; + virial[4] += delx*delz*force; + virial[5] += dely*delz*force; + } + } + + } // for nbor + + // Store answers + __global acctyp *ap1=engv+ii; + if (eflag>0) { + *ap1=energy; + ap1+=inum; + *ap1=e_coul; + ap1+=inum; + } + if (vflag>0) { + for (int i=0; i<6; i++) { + *ap1=virial[i]; + ap1+=inum; + } + } + ans[ii]=f; + } // if ii*/ +} + +#endif + diff --git a/lib/gpu/cmmc_long_gpu_memory.cpp b/lib/gpu/cmmc_long_gpu_memory.cpp new file mode 100644 index 0000000000..3625ef1caf --- /dev/null +++ b/lib/gpu/cmmc_long_gpu_memory.cpp @@ -0,0 +1,164 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. 
+------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing authors: Mike Brown (ORNL), brownw@ornl.gov +------------------------------------------------------------------------- */ + +#ifdef USE_OPENCL +#include "cmmc_long_gpu_cl.h" +#else +#include "cmmc_long_gpu_ptx.h" +#endif + +#include "cmmc_long_gpu_memory.h" +#include +#define CMML_GPU_MemoryT CMML_GPU_Memory + +extern PairGPUDevice pair_gpu_device; + +template +CMML_GPU_MemoryT::CMML_GPU_Memory() : ChargeGPUMemory(), + _allocated(false) { +} + +template +CMML_GPU_MemoryT::~CMML_GPU_Memory() { + clear(); +} + +template +int CMML_GPU_MemoryT::bytes_per_atom(const int max_nbors) const { + return this->bytes_per_atom_atomic(max_nbors); +} + +template +bool CMML_GPU_MemoryT::init(const int ntypes, double **host_cutsq, + int **host_cg_type, double **host_lj1, + double **host_lj2, double **host_lj3, + double **host_lj4, double **host_offset, + double *host_special_lj, const int nlocal, + const int nall, const int max_nbors, + const int maxspecial, const double cell_size, + const double gpu_split, FILE *_screen, + double **host_cut_ljsq, + const double host_cut_coulsq, + double *host_special_coul, const double qqrd2e, + const double g_ewald) { + this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split, + _screen,cmmc_long_gpu_kernel); + + // If atom type constants fit in shared memory use fast kernel + int lj_types=ntypes; + shared_types=false; + if (lj_types<=MAX_SHARED_TYPES && this->_block_size>=MAX_SHARED_TYPES) { + lj_types=MAX_SHARED_TYPES; + shared_types=true; + } + _lj_types=lj_types; + + // Allocate a host write buffer for data initialization + UCL_H_Vec host_write(lj_types*lj_types*32,*(this->ucl_device), + UCL_WRITE_OPTIMIZED); + + for (int i=0; iucl_device),UCL_READ_ONLY); + this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_cutsq, + host_cut_ljsq,host_lj1,host_lj2); + + lj3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); + this->atom->type_pack4(ntypes,lj_types,lj3,host_write,host_cg_type,host_lj3, + host_lj4,host_offset); + + sp_lj.alloc(8,*(this->ucl_device),UCL_READ_ONLY); + for (int i=0; i<4; i++) { + host_write[i]=host_special_lj[i]; + host_write[i+4]=host_special_coul[i]; + } + ucl_copy(sp_lj,host_write,8,false); + + _cut_coulsq=host_cut_coulsq; + _qqrd2e=qqrd2e; + _g_ewald=g_ewald; + + _allocated=true; + this->_max_bytes=lj1.row_bytes()+lj3.row_bytes()+sp_lj.row_bytes(); + return true; +} + +template +void CMML_GPU_MemoryT::clear() { + if (!_allocated) + return; + _allocated=false; + + lj1.clear(); + lj3.clear(); + sp_lj.clear(); + this->clear_atomic(); +} + +template +double CMML_GPU_MemoryT::host_memory_usage() const { + return this->host_memory_usage_atomic()+sizeof(CMML_GPU_Memory); +} + +// --------------------------------------------------------------------------- +// Calculate energies, forces, and torques +// --------------------------------------------------------------------------- +template +void CMML_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) { + // Compute the block size and grid size to keep all cores busy + const int BX=this->block_size(); + int eflag, vflag; + if (_eflag) + eflag=1; + else + eflag=0; + + if (_vflag) + vflag=1; + else + vflag=0; + + int GX=static_cast(ceil(static_cast(this->atom->inum())/BX)); + + int ainum=this->atom->inum(); + int anall=this->atom->nall(); + int nbor_pitch=this->nbor->nbor_pitch(); + this->time_pair.start(); + if 
(shared_types) { + this->k_pair_fast.set_size(GX,BX); + this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(), + &lj3.begin(), &sp_lj.begin(), + &this->nbor->dev_nbor.begin(), + &this->atom->dev_ans.begin(), + &this->atom->dev_engv.begin(), &eflag, &vflag, + &ainum, &anall, &nbor_pitch, + &this->atom->dev_q.begin(), &_cut_coulsq, + &_qqrd2e, &_g_ewald); + } else { + this->k_pair.set_size(GX,BX); + this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(), + &_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(), + &this->atom->dev_ans.begin(), + &this->atom->dev_engv.begin(), &eflag, &vflag, &ainum, + &anall, &nbor_pitch, &this->atom->dev_q.begin(), + &_cut_coulsq, &_qqrd2e, &_g_ewald); + } + this->time_pair.stop(); +} + +template class CMML_GPU_Memory; diff --git a/lib/gpu/cmmc_long_gpu_memory.h b/lib/gpu/cmmc_long_gpu_memory.h new file mode 100644 index 0000000000..a9e6e56934 --- /dev/null +++ b/lib/gpu/cmmc_long_gpu_memory.h @@ -0,0 +1,75 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing authors: Mike Brown (ORNL), brownw@ornl.gov +------------------------------------------------------------------------- */ + +#ifndef CMML_GPU_MEMORY_H +#define CMML_GPU_MEMORY_H + +#include "charge_gpu_memory.h" + +template +class CMML_GPU_Memory : public ChargeGPUMemory { + public: + CMML_GPU_Memory(); + ~CMML_GPU_Memory(); + + /// Clear any previous data and set up for a new LAMMPS run + /** \param max_nbors initial number of rows in the neighbor matrix + * \param cell_size cutoff + skin + * \param gpu_split fraction of particles handled by device **/ + bool init(const int ntypes, double **host_cutsq, int ** cg_type, + double **host_lj1, double **host_lj2, double **host_lj3, + double **host_lj4, double **host_offset, double *host_special_lj, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, + const double gpu_split, FILE *screen, double **host_cut_ljsq, + const double host_cut_coulsq, double *host_special_coul, + const double qqrd2e, const double g_ewald); + + /// Clear all host and device data + /** \note This is called at the beginning of the init() routine **/ + void clear(); + + /// Returns memory usage on device per atom + int bytes_per_atom(const int max_nbors) const; + + /// Total host memory used by library for pair style + double host_memory_usage() const; + + // --------------------------- TYPE DATA -------------------------- + + /// lj1.x = cutsq, lj1.y = cutsq_vdw, lj1.z = lj1, lj1.w = lj2, + UCL_D_Vec lj1; + /// lj3.x = cg_type, lj3.y = lj3, lj3.z = lj4, lj3.w = offset + UCL_D_Vec lj3; + /// Special LJ values [0-3] and Special Coul values [4-7] + UCL_D_Vec sp_lj; + + /// If atom type constants fit in shared memory, use fast kernels + bool shared_types; + + /// Number of atom types + int _lj_types; + + numtyp _cut_coulsq, _qqrd2e, _g_ewald; + + private: + 
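  /// Set once init() has copied the per-type tables to the device; checked by clear()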
bool _allocated; + void loop(const bool _eflag, const bool _vflag); +}; + +#endif + diff --git a/lib/gpu/cudpp_mini/README b/lib/gpu/cudpp_mini/README new file mode 100644 index 0000000000..873e884e8b --- /dev/null +++ b/lib/gpu/cudpp_mini/README @@ -0,0 +1,5 @@ +This is a stripped down and customized version +of the CUDA performance primitives library for +use with the GPU package in LAMMPS. +Don't use for anything else, get the real thing +from http://code.google.com/p/cudpp/ instead! diff --git a/lib/gpu/cudpp_mini/cta/radixsort_cta.cu b/lib/gpu/cudpp_mini/cta/radixsort_cta.cu new file mode 100644 index 0000000000..e7551b2407 --- /dev/null +++ b/lib/gpu/cudpp_mini/cta/radixsort_cta.cu @@ -0,0 +1,337 @@ +// ------------------------------------------------------------- +// CUDPP -- CUDA Data Parallel Primitives library +// ------------------------------------------------------------- +// $Revision$ +// $Date$ +// ------------------------------------------------------------- +// This source code is distributed under the terms of license.txt +// in the root directory of this source distribution. +// ------------------------------------------------------------- +#include +#include "cudpp_radixsort.h" +#include "cta/scan_cta.cu" +#include +#include + +#include +#include +#include "sharedmem.h" + + +#ifdef __DEVICE_EMULATION__ +#define __EMUSYNC __syncthreads() +#else +#define __EMUSYNC +#endif + +/** + * @file + * sort_cta.cu + * + * @brief CUDPP CTA-level sort routines + */ + +/** \addtogroup cudpp_cta +* @{ +*/ + +/** @name Radix Sort Functions +* @{ +*/ + + +typedef unsigned int uint; + +/** + * @brief Flips bits of single-precision floating-point number (parameterized by doFlip) + * + * flip a float for sorting + * finds SIGN of fp number. + * if it's 1 (negative float), it flips all bits + * if it's 0 (positive float), it flips the sign only + * @param[in] f floating-point input (passed as unsigned int) + * @see floatUnflip +**/ + +template +__device__ uint floatFlip(uint f) +{ + if (doFlip) + { + uint mask = -int(f >> 31) | 0x80000000; + return f ^ mask; + } + else + return f; +} + +/** + * @brief Reverses bit-flip of single-precision floating-point number (parameterized by doFlip) + * + * flip a float back (invert FloatFlip) + * signed was flipped from above, so: + * if sign is 1 (negative), it flips the sign bit back + * if sign is 0 (positive), it flips all bits back + * @param[in] f floating-point input (passed as unsigned int) + * @see floatFlip +**/ +template +__device__ uint floatUnflip(uint f) +{ + if (doFlip) + { + uint mask = ((f >> 31) - 1) | 0x80000000; + return f ^ mask; + } + else + return f; +} + +/** + * @brief Scans one warp quickly, optimized for 32-element warps, using shared memory + * + * Scans each warp in parallel ("warp-scan"), one element per thread. 
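 * (Hillis-Steele style: each thread accumulates the values 1, 2, 4, 8 and 16
 *  slots to its left in shared memory, and the inclusive result is converted
 *  to an exclusive prefix sum on return.)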
+ * uses 2 numElements of shared memory per thread (64 numElements per warp) + * + * @param[in] val Elements per thread to scan + * @param[in,out] sData +**/ +template +__device__ T scanwarp(T val, volatile T* sData) +{ + // The following is the same as 2 * WARP_SIZE * warpId + threadInWarp = + // 64*(threadIdx.x >> 5) + (threadIdx.x & (WARP_SIZE - 1)) + int idx = 2 * threadIdx.x - (threadIdx.x & (WARP_SIZE - 1)); + sData[idx] = 0; + idx += WARP_SIZE; + T t = sData[idx] = val; __EMUSYNC; + +#ifdef __DEVICE_EMULATION__ + t = sData[idx - 1]; __EMUSYNC; + sData[idx] += t; __EMUSYNC; + t = sData[idx - 2]; __EMUSYNC; + sData[idx] += t; __EMUSYNC; + t = sData[idx - 4]; __EMUSYNC; + sData[idx] += t; __EMUSYNC; + t = sData[idx - 8]; __EMUSYNC; + sData[idx] += t; __EMUSYNC; + t = sData[idx - 16]; __EMUSYNC; + sData[idx] += t; __EMUSYNC; +#else + if (0 <= maxlevel) { sData[idx] = t = t + sData[idx - 1]; } __EMUSYNC; + if (1 <= maxlevel) { sData[idx] = t = t + sData[idx - 2]; } __EMUSYNC; + if (2 <= maxlevel) { sData[idx] = t = t + sData[idx - 4]; } __EMUSYNC; + if (3 <= maxlevel) { sData[idx] = t = t + sData[idx - 8]; } __EMUSYNC; + if (4 <= maxlevel) { sData[idx] = t = t + sData[idx -16]; } __EMUSYNC; +#endif + return sData[idx] - val; // convert inclusive -> exclusive +} + +/** + * @brief Scans 4*CTA_SIZE unsigned ints in a block + * + * scan4 scans 4*CTA_SIZE numElements in a block (4 per + * thread), using a warp-scan algorithm + * + * @param[in] idata 4-vector of integers to scan +**/ +__device__ uint4 scan4(uint4 idata) +{ + extern __shared__ uint ptr[]; + + uint idx = threadIdx.x; + + uint4 val4 = idata; + uint sum[3]; + sum[0] = val4.x; + sum[1] = val4.y + sum[0]; + sum[2] = val4.z + sum[1]; + + uint val = val4.w + sum[2]; + + val = scanwarp(val, ptr); + __syncthreads(); + + if ((idx & (WARP_SIZE - 1)) == WARP_SIZE - 1) + { + ptr[idx >> 5] = val + val4.w + sum[2]; + } + __syncthreads(); + +#ifndef __DEVICE_EMULATION__ + if (idx < WARP_SIZE) +#endif + { + ptr[idx] = scanwarp(ptr[idx], ptr); + } + __syncthreads(); + + val += ptr[idx >> 5]; + + val4.x = val; + val4.y = val + sum[0]; + val4.z = val + sum[1]; + val4.w = val + sum[2]; + + return val4; +} + +/** + * @brief Computes output position for each thread given predicate; trues come first then falses + * + * Rank is the core of the radix sort loop. Given a predicate, it + * computes the output position for each thread in an ordering where all + * True threads come first, followed by all False threads. + * This version handles 4 predicates per thread; hence, "rank4". + * + * @param[in] preds true/false values for each of the 4 elements in this thread + * + * @todo is the description of "preds" correct? +**/ +template +__device__ uint4 rank4(uint4 preds) +{ + uint4 address = scan4(preds); + + __shared__ uint numtrue; + if (threadIdx.x == ctasize-1) + { + numtrue = address.w + preds.w; + } + __syncthreads(); + + uint4 rank; + uint idx = threadIdx.x << 2; + rank.x = (preds.x) ? address.x : numtrue + idx - address.x; + rank.y = (preds.y) ? address.y : numtrue + idx + 1 - address.y; + rank.z = (preds.z) ? address.z : numtrue + idx + 2 - address.z; + rank.w = (preds.w) ? 
address.w : numtrue + idx + 3 - address.w; + + return rank; +} + +/** + * @brief Sorts one block + * + * Uses rank to sort one bit at a time: Sorts a block according + * to bits startbit -> nbits + startbit + * @param[in,out] key + * @param[in,out] value +**/ +template +__device__ void radixSortBlock(uint4 &key, uint4 &value) +{ + extern __shared__ uint sMem1[]; + for(uint shift = startbit; shift < (startbit + nbits); ++shift) + { + uint4 lsb; + lsb.x = !((key.x >> shift) & 0x1); + lsb.y = !((key.y >> shift) & 0x1); + lsb.z = !((key.z >> shift) & 0x1); + lsb.w = !((key.w >> shift) & 0x1); + + uint4 r = rank4<256>(lsb); + +#if 1 + // This arithmetic strides the ranks across 4 SORT_CTA_SIZE regions + sMem1[(r.x & 3) * SORT_CTA_SIZE + (r.x >> 2)] = key.x; + sMem1[(r.y & 3) * SORT_CTA_SIZE + (r.y >> 2)] = key.y; + sMem1[(r.z & 3) * SORT_CTA_SIZE + (r.z >> 2)] = key.z; + sMem1[(r.w & 3) * SORT_CTA_SIZE + (r.w >> 2)] = key.w; + __syncthreads(); + + // The above allows us to read without 4-way bank conflicts: + key.x = sMem1[threadIdx.x]; + key.y = sMem1[threadIdx.x + SORT_CTA_SIZE]; + key.z = sMem1[threadIdx.x + 2 * SORT_CTA_SIZE]; + key.w = sMem1[threadIdx.x + 3 * SORT_CTA_SIZE]; + + __syncthreads(); + + sMem1[(r.x & 3) * SORT_CTA_SIZE + (r.x >> 2)] = value.x; + sMem1[(r.y & 3) * SORT_CTA_SIZE + (r.y >> 2)] = value.y; + sMem1[(r.z & 3) * SORT_CTA_SIZE + (r.z >> 2)] = value.z; + sMem1[(r.w & 3) * SORT_CTA_SIZE + (r.w >> 2)] = value.w; + __syncthreads(); + + value.x = sMem1[threadIdx.x]; + value.y = sMem1[threadIdx.x + SORT_CTA_SIZE]; + value.z = sMem1[threadIdx.x + 2 * SORT_CTA_SIZE]; + value.w = sMem1[threadIdx.x + 3 * SORT_CTA_SIZE]; +#else + sMem1[r.x] = key.x; + sMem1[r.y] = key.y; + sMem1[r.z] = key.z; + sMem1[r.w] = key.w; + __syncthreads(); + + // This access has 4-way bank conflicts + key = sMem[threadIdx.x]; + + __syncthreads(); + + sMem1[r.x] = value.x; + sMem1[r.y] = value.y; + sMem1[r.z] = value.z; + sMem1[r.w] = value.w; + __syncthreads(); + + value = sMem[threadIdx.x]; +#endif + + __syncthreads(); + } +} + +/** + * @brief Sorts one block. Key-only version. 
+ * + * Uses rank to sort one bit at a time: Sorts a block according + * to bits startbit -> nbits + startbit + * @param[in,out] key +**/ + +template +__device__ void radixSortBlockKeysOnly(uint4 &key) +{ + extern __shared__ uint sMem1[]; + for(uint shift = startbit; shift < (startbit + nbits); ++shift) + { + uint4 lsb; + lsb.x = !((key.x >> shift) & 0x1); + lsb.y = !((key.y >> shift) & 0x1); + lsb.z = !((key.z >> shift) & 0x1); + lsb.w = !((key.w >> shift) & 0x1); + + uint4 r = rank4<256>(lsb); + +#if 1 + // This arithmetic strides the ranks across 4 CTA_SIZE regions + sMem1[(r.x & 3) * SORT_CTA_SIZE + (r.x >> 2)] = key.x; + sMem1[(r.y & 3) * SORT_CTA_SIZE + (r.y >> 2)] = key.y; + sMem1[(r.z & 3) * SORT_CTA_SIZE + (r.z >> 2)] = key.z; + sMem1[(r.w & 3) * SORT_CTA_SIZE + (r.w >> 2)] = key.w; + __syncthreads(); + + // The above allows us to read without 4-way bank conflicts: + key.x = sMem1[threadIdx.x]; + key.y = sMem1[threadIdx.x + SORT_CTA_SIZE]; + key.z = sMem1[threadIdx.x + 2 * SORT_CTA_SIZE]; + key.w = sMem1[threadIdx.x + 3 * SORT_CTA_SIZE]; +#else + sMem1[r.x] = key.x; + sMem1[r.y] = key.y; + sMem1[r.z] = key.z; + sMem1[r.w] = key.w; + __syncthreads(); + + // This access has 4-way bank conflicts + key = sMem[threadIdx.x]; +#endif + + __syncthreads(); + } +} + +/** @} */ // end radix sort functions +/** @} */ // end cudpp_cta diff --git a/lib/gpu/cudpp_mini/cta/scan_cta.cu b/lib/gpu/cudpp_mini/cta/scan_cta.cu new file mode 100644 index 0000000000..850b4f2800 --- /dev/null +++ b/lib/gpu/cudpp_mini/cta/scan_cta.cu @@ -0,0 +1,619 @@ +// ------------------------------------------------------------- +// cuDPP -- CUDA Data Parallel Primitives library +// ------------------------------------------------------------- +// $Revision: 5633 $ +// $Date: 2009-07-01 15:02:51 +1000 (Wed, 01 Jul 2009) $ +// ------------------------------------------------------------- +// This source code is distributed under the terms of license.txt +// in the root directory of this source distribution. +// ------------------------------------------------------------- + +/** + * @file + * scan_cta.cu + * + * @brief CUDPP CTA-level scan routines + */ + +/** \defgroup cudpp_cta CUDPP CTA-Level API + * The CUDPP CTA-Level API contains functions that run on the GPU + * device. These are CUDA \c __device__ functions that are called + * from within other CUDA device functions (typically + * \link cudpp_kernel CUDPP Kernel-Level API\endlink functions). + * They are called CTA-level functions because they typically process + * s_data "owned" by each CTA within shared memory, and are agnostic of + * any other CTAs that may be running (or how many CTAs are running), + * other than to compute appropriate global memory addresses. + * @{ + */ + +/** @name Scan Functions +* @{ +*/ + +#include +#include +#include +#include + +/** + * @brief Macro to insert necessary __syncthreads() in device emulation mode + */ +#ifdef __DEVICE_EMULATION__ +#define __EMUSYNC __syncthreads() +#else +#define __EMUSYNC +#endif + +/** + * @brief Template class containing compile-time parameters to the scan functions + * + * ScanTraits is passed as a template parameter to all scan functions. By + * using these compile-time functions we can enable generic code while + * maintaining the highest performance. This is crucial for the performance + * of low-level workhorse algorithms like scan. + * + * @param T The datatype of the scan + * @param oper The ::CUDPPOperator to use for the scan (add, max, etc.) 
+ * @param multiRow True if this is a multi-row scan + * @param unroll True if scan inner loops should be unrolled + * @param sums True if each block should write it's sum to the d_blockSums array (false for single-block scans) + * @param backward True if this is a backward scan + * @param fullBlock True if all blocks in this scan are full (CTA_SIZE * SCAN_ELEMENTS_PER_THREAD elements) + * @param exclusive True for exclusive scans, false for inclusive scans + */ +template +class ScanTraits +{ +public: + + //! Returns true if this is a backward scan + static __device__ bool isBackward() { return backward; }; + //! Returns true if this is an exclusive scan + static __device__ bool isExclusive() { return exclusive; }; + //! Returns true if this a multi-row scan. + static __device__ bool isMultiRow() { return multiRow; }; + //! Returns true if this scan writes the sum of each block to the d_blockSums array (multi-block scans) + static __device__ bool writeSums() { return sums; }; + //! Returns true if this is a full scan -- all blocks process CTA_SIZE * SCAN_ELEMENTS_PER_THREAD elements + static __device__ bool isFullBlock() { return fullBlock; }; + + + //! The operator function used for the scan + static __device__ T op(const T a, const T b) + { + return Operator::op(a, b); + } + + //! The identity value used by the scan + static __device__ T identity() { return Operator::identity(); } +}; + +//! This is used to insert syncthreads to avoid perf loss caused by 128-bit +//! load overlap that happens on G80. This gives about a 15% boost on scans on +//! G80. +//! @todo Parameterize this in case this perf detail changes on future GPUs. +#define DISALLOW_LOADSTORE_OVERLAP 1 + +/** +* @brief Handles loading input s_data from global memory to shared memory +* (vec4 version) +* +* Load a chunk of 8*blockDim.x elements from global memory into a +* shared memory array. Each thread loads two T4 elements (where +* T4 is, e.g. int4 or float4), computes the scan of those two vec4s in +* thread local arrays (in registers), and writes the two total sums of the +* vec4s into shared memory, where they will be cooperatively scanned with +* the other partial sums by all threads in the CTA. 
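* Elements past the end of a partial final block are read as the scan
* operator's identity value, so the vec4 loads stay correct for any array length.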
+* +* @param[out] s_out The output (shared) memory array +* @param[out] threadScan0 Intermediate per-thread partial sums array 1 +* @param[out] threadScan1 Intermediate per-thread partial sums array 2 +* @param[in] d_in The input (device) memory array +* @param[in] numElements The number of elements in the array being scanned +* @param[in] iDataOffset the offset of the input array in global memory for this +* thread block +* @param[out] ai The shared memory address for the thread's first element +* (returned for reuse) +* @param[out] bi The shared memory address for the thread's second element +* (returned for reuse) +* @param[out] aiDev The device memory address for this thread's first element +* (returned for reuse) +* @param[out] biDev The device memory address for this thread's second element +* (returned for reuse) +*/ +template +__device__ void loadSharedChunkFromMem4(T *s_out, + T threadScan0[4], + T threadScan1[4], + const T *d_in, + int numElements, + int iDataOffset, + int &ai, + int &bi, + int &aiDev, + int &biDev) +{ + int thid = threadIdx.x; + aiDev = iDataOffset + thid; + biDev = aiDev + blockDim.x; + + // convert to 4-vector + typename typeToVector::Result tempData; + typename typeToVector::Result* inData = (typename typeToVector::Result*)d_in; + + ai = thid; + bi = thid + blockDim.x; + + // read into tempData; + if (traits::isBackward()) + { + int i = aiDev * 4; + if (traits::isFullBlock() || i + 3 < numElements) + { + tempData = inData[aiDev]; + threadScan0[3] = tempData.w; + threadScan0[2] = traits::op(tempData.z, threadScan0[3]); + threadScan0[1] = traits::op(tempData.y, threadScan0[2]); + threadScan0[0] = s_out[ai] + = traits::op(tempData.x, threadScan0[1]); + } + else + { + threadScan0[3] = traits::identity(); + threadScan0[2] = traits::op(((i+2) < numElements) ? d_in[i+2] : traits::identity(), threadScan0[3]); + threadScan0[1] = traits::op(((i+1) < numElements) ? d_in[i+1] : traits::identity(), threadScan0[2]); + threadScan0[0] = s_out[ai] + = traits::op((i < numElements) ? d_in[i] : traits::identity(), threadScan0[1]); + } + +#ifdef DISALLOW_LOADSTORE_OVERLAP + __syncthreads(); +#endif + + i = biDev * 4; + if (traits::isFullBlock() || i + 3 < numElements) + { + tempData = inData[biDev]; + threadScan1[3] = tempData.w; + threadScan1[2] = traits::op(tempData.z, threadScan1[3]); + threadScan1[1] = traits::op(tempData.y, threadScan1[2]); + threadScan1[0] = s_out[bi] + = traits::op(tempData.x, threadScan1[1]); + } + else + { + threadScan1[3] = traits::identity(); + threadScan1[2] = traits::op(((i+2) < numElements) ? d_in[i+2] : traits::identity(), threadScan1[3]); + threadScan1[1] = traits::op(((i+1) < numElements) ? d_in[i+1] : traits::identity(), threadScan1[2]); + threadScan1[0] = s_out[bi] + = traits::op((i < numElements) ? d_in[i] : traits::identity(), threadScan1[1]); + } + __syncthreads(); + + // reverse s_data in shared memory + if (ai < CTA_SIZE) + { + unsigned int leftIdx = ai; + unsigned int rightIdx = (2 * CTA_SIZE - 1) - ai; + + if (leftIdx < rightIdx) + { + T tmp = s_out[leftIdx]; + s_out[leftIdx] = s_out[rightIdx]; + s_out[rightIdx] = tmp; + } + } + __syncthreads(); + } + else + { + int i = aiDev * 4; + if (traits::isFullBlock() || i + 3 < numElements) + { + tempData = inData[aiDev]; + threadScan0[0] = tempData.x; + threadScan0[1] = traits::op(tempData.y, threadScan0[0]); + threadScan0[2] = traits::op(tempData.z, threadScan0[1]); + threadScan0[3] = s_out[ai] + = traits::op(tempData.w, threadScan0[2]); + } + else + { + threadScan0[0] = (i < numElements) ? 
d_in[i] : traits::identity(); + threadScan0[1] = traits::op(((i+1) < numElements) ? d_in[i+1] : traits::identity(), threadScan0[0]); + threadScan0[2] = traits::op(((i+2) < numElements) ? d_in[i+2] : traits::identity(), threadScan0[1]); + threadScan0[3] = s_out[ai] + = traits::op(((i+3) < numElements) ? d_in[i+3] : traits::identity(), threadScan0[2]); + } + + +#ifdef DISALLOW_LOADSTORE_OVERLAP + __syncthreads(); +#endif + + i = biDev * 4; + if (traits::isFullBlock() || i + 3 < numElements) + { + tempData = inData[biDev]; + threadScan1[0] = tempData.x; + threadScan1[1] = traits::op(tempData.y, threadScan1[0]); + threadScan1[2] = traits::op(tempData.z, threadScan1[1]); + threadScan1[3] = s_out[bi] + = traits::op(tempData.w, threadScan1[2]); + } + else + { + threadScan1[0] = (i < numElements) ? d_in[i] : traits::identity(); + threadScan1[1] = traits::op(((i+1) < numElements) ? d_in[i+1] : traits::identity(), threadScan1[0]); + threadScan1[2] = traits::op(((i+2) < numElements) ? d_in[i+2] : traits::identity(), threadScan1[1]); + threadScan1[3] = s_out[bi] + = traits::op(((i+3) < numElements) ? d_in[i+3] : traits::identity(), threadScan1[2]); + } + __syncthreads(); + } +} + + +/** +* @brief Handles storing result s_data from shared memory to global memory +* (vec4 version) +* +* Store a chunk of SCAN_ELTS_PER_THREAD*blockDim.x elements from shared memory +* into a device memory array. Each thread stores reads two elements from shared +* memory, adds them to the intermediate sums computed in +* loadSharedChunkFromMem4(), and writes two T4 elements (where +* T4 is, e.g. int4 or float4) to global memory. +* +* @param[out] d_out The output (device) memory array +* @param[in] threadScan0 Intermediate per-thread partial sums array 1 +* (contents computed in loadSharedChunkFromMem4()) +* @param[in] threadScan1 Intermediate per-thread partial sums array 2 +* (contents computed in loadSharedChunkFromMem4()) +* @param[in] s_in The input (shared) memory array +* @param[in] numElements The number of elements in the array being scanned +* @param[in] oDataOffset the offset of the output array in global memory +* for this thread block +* @param[in] ai The shared memory address for the thread's first element +* (computed in loadSharedChunkFromMem4()) +* @param[in] bi The shared memory address for the thread's second element +* (computed in loadSharedChunkFromMem4()) +* @param[in] aiDev The device memory address for this thread's first element +* (computed in loadSharedChunkFromMem4()) +* @param[in] biDev The device memory address for this thread's second element +* (computed in loadSharedChunkFromMem4()) +*/ +template +__device__ void storeSharedChunkToMem4(T *d_out, + T threadScan0[4], + T threadScan1[4], + T *s_in, + int numElements, + int oDataOffset, + int ai, + int bi, + int aiDev, + int biDev) +{ + // Convert to 4-vector + typename typeToVector::Result tempData; + typename typeToVector::Result* outData = (typename typeToVector::Result*)d_out; + + // write results to global memory + if (traits::isBackward()) + { + if (ai < CTA_SIZE) + { + + unsigned int leftIdx = ai; + unsigned int rightIdx = (2 * CTA_SIZE - 1) - ai; + + if (leftIdx < rightIdx) + { + T tmp = s_in[leftIdx]; + s_in[leftIdx] = s_in[rightIdx]; + s_in[rightIdx] = tmp; + } + } + __syncthreads(); + + T temp = s_in[ai]; + + if (traits::isExclusive()) + { + tempData.w = temp; + tempData.z = traits::op(temp, threadScan0[3]); + tempData.y = traits::op(temp, threadScan0[2]); + tempData.x = traits::op(temp, threadScan0[1]); + } + else + { + tempData.w 
= traits::op(temp, threadScan0[3]); + tempData.z = traits::op(temp, threadScan0[2]); + tempData.y = traits::op(temp, threadScan0[1]); + tempData.x = traits::op(temp, threadScan0[0]); + } + + int i = aiDev * 4; + if (traits::isFullBlock() || i + 3 < numElements) + { + outData[aiDev] = tempData; + } + else + { + if (i < numElements) { d_out[i] = tempData.x; + if (i+1 < numElements) { d_out[i+1] = tempData.y; + if (i+2 < numElements) { d_out[i+2] = tempData.z; }}} + } + +#ifdef DISALLOW_LOADSTORE_OVERLAP + __syncthreads(); +#endif + + temp = s_in[bi]; + + if (traits::isExclusive()) + { + tempData.w = temp; + tempData.z = traits::op(temp, threadScan1[3]); + tempData.y = traits::op(temp, threadScan1[2]); + tempData.x = traits::op(temp, threadScan1[1]); + } + else + { + tempData.w = traits::op(temp, threadScan1[3]); + tempData.z = traits::op(temp, threadScan1[2]); + tempData.y = traits::op(temp, threadScan1[1]); + tempData.x = traits::op(temp, threadScan1[0]); + } + + i = biDev * 4; + if (traits::isFullBlock() || i + 3 < numElements) + { + outData[biDev] = tempData; + } + else + { + if (i < numElements) { d_out[i] = tempData.x; + if (i+1 < numElements) { d_out[i+1] = tempData.y; + if (i+2 < numElements) { d_out[i+2] = tempData.z; }}} + } + } + else + { + T temp; + temp = s_in[ai]; + + if (traits::isExclusive()) + { + tempData.x = temp; + tempData.y = traits::op(temp, threadScan0[0]); + tempData.z = traits::op(temp, threadScan0[1]); + tempData.w = traits::op(temp, threadScan0[2]); + } + else + { + tempData.x = traits::op(temp, threadScan0[0]); + tempData.y = traits::op(temp, threadScan0[1]); + tempData.z = traits::op(temp, threadScan0[2]); + tempData.w = traits::op(temp, threadScan0[3]); + } + + int i = aiDev * 4; + if (traits::isFullBlock() || i + 3 < numElements) + { + outData[aiDev] = tempData; + } + else + { + // we can't use vec4 because the original array isn't a multiple of + // 4 elements + if ( i < numElements) { d_out[i] = tempData.x; + if ((i+1) < numElements) { d_out[i+1] = tempData.y; + if ((i+2) < numElements) { d_out[i+2] = tempData.z; } } } + } + +#ifdef DISALLOW_LOADSTORE_OVERLAP + __syncthreads(); +#endif + + temp = s_in[bi]; + + if (traits::isExclusive()) + { + tempData.x = temp; + tempData.y = traits::op(temp, threadScan1[0]); + tempData.z = traits::op(temp, threadScan1[1]); + tempData.w = traits::op(temp, threadScan1[2]); + } + else + { + tempData.x = traits::op(temp, threadScan1[0]); + tempData.y = traits::op(temp, threadScan1[1]); + tempData.z = traits::op(temp, threadScan1[2]); + tempData.w = traits::op(temp, threadScan1[3]); + } + + i = biDev * 4; + if (traits::isFullBlock() || i + 3 < numElements) + { + outData[biDev] = tempData; + } + else + { + // we can't use vec4 because the original array isn't a multiple of + // 4 elements + if ( i < numElements) { d_out[i] = tempData.x; + if ((i+1) < numElements) { d_out[i+1] = tempData.y; + if ((i+2) < numElements) { d_out[i+2] = tempData.z; } } } + } + } +} + +/** @brief Scan all warps of a CTA without synchronization + * + * The warp-scan algorithm breaks a block of data into warp-sized chunks, and + * scans the chunks independently with a warp of threads each. Because warps + * execute instructions in SIMD fashion, there is no need to synchronize in + * order to share data within a warp (only across warps). Also, in SIMD the + * most efficient algorithm is a step-efficient algorithm. 
Therefore, within + * each warp we use a Hillis-and-Steele-style scan that takes log2(N) steps + * to scan the warp [Daniel Hillis and Guy Steele 1986], rather than the + * work-efficient tree-based algorithm described by Guy Blelloch [1990] that + * takes 2 * log(N) steps and is in general more complex to implement. + * Previous versions of CUDPP used the Blelloch algorithm. For current GPUs, + * the warp size is 32, so this takes five steps per warp. + * + * Each thread is responsible for a single element of the array to be scanned. + * Each thread inputs a single value to the scan via \a val and returns + * its own scanned result element. The threads of each warp cooperate + * via the shared memory array \a s_data to scan WARP_SIZE elements. + * + * Template parameter \a maxlevel allows this warpscan to be performed on + * partial warps. For example, if only the first 8 elements of each warp need + * to be scanned, then warpscan only performs log2(8)=3 steps rather than 5. + * + * The computation uses 2 * WARP_SIZE elements of shared memory per warp to + * enable warps to offset beyond their input data and receive the identity + * element without using any branch instructions. + * + * \note s_data is declared volatile here to prevent the compiler from + * optimizing away writes to shared memory, and ensure correct intrawarp + * communication in the absence of __syncthreads. + * + * @return The result of the warp scan for the current thread + * @param[in] val The current threads's input to the scan + * @param[in,out] s_data A pointer to a temporary shared array of 2*CTA_SIZE + * elements used to compute the warp scans + */ +template +__device__ T warpscan(T val, volatile T* s_data) +{ + // The following is the same as 2 * 32 * warpId + threadInWarp = + // 64*(threadIdx.x >> 5) + (threadIdx.x & (WARP_SIZE-1)) + int idx = 2 * threadIdx.x - (threadIdx.x & (WARP_SIZE-1)); + s_data[idx] = traits::identity(); + idx += WARP_SIZE; + T t = s_data[idx] = val; __EMUSYNC; + + // This code is needed because the warp size of device emulation + // is only 1 thread, so sync-less cooperation within a warp doesn't + // work. +#ifdef __DEVICE_EMULATION__ + t = s_data[idx - 1]; __EMUSYNC; + s_data[idx] = traits::op(s_data[idx],t); __EMUSYNC; + t = s_data[idx - 2]; __EMUSYNC; + s_data[idx] = traits::op(s_data[idx],t); __EMUSYNC; + t = s_data[idx - 4]; __EMUSYNC; + s_data[idx] = traits::op(s_data[idx],t); __EMUSYNC; + t = s_data[idx - 8]; __EMUSYNC; + s_data[idx] = traits::op(s_data[idx],t); __EMUSYNC; + t = s_data[idx - 16]; __EMUSYNC; + s_data[idx] = traits::op(s_data[idx],t); __EMUSYNC; +#else + if (0 <= maxlevel) { s_data[idx] = t = traits::op(t, s_data[idx - 1]); } + if (1 <= maxlevel) { s_data[idx] = t = traits::op(t, s_data[idx - 2]); } + if (2 <= maxlevel) { s_data[idx] = t = traits::op(t, s_data[idx - 4]); } + if (3 <= maxlevel) { s_data[idx] = t = traits::op(t, s_data[idx - 8]); } + if (4 <= maxlevel) { s_data[idx] = t = traits::op(t, s_data[idx -16]); } +#endif + + return s_data[idx-1]; // convert inclusive -> exclusive +} + +/** @brief Perform a full CTA scan using the warp-scan algorithm + * + * As described in the comment for warpscan(), the warp-scan algorithm breaks + * a block of data into warp-sized chunks, and scans the chunks independently + * with a warp of threads each. To complete the scan, each warp j then + * writes its last element to element j of a temporary shared array. + * Then a single warp exclusive-scans these "warp sums". 
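+ * Each of these warp-level scans is, as a sketch (sum operator and a full
+ * 32-thread warp assumed, so \a maxlevel enables all five steps), just the
+ * Hillis-Steele chain
+ * \code
+ * t += s_data[idx -  1];  s_data[idx] = t;
+ * t += s_data[idx -  2];  s_data[idx] = t;
+ * t += s_data[idx -  4];  s_data[idx] = t;
+ * t += s_data[idx -  8];  s_data[idx] = t;
+ * t += s_data[idx - 16];  s_data[idx] = t;
+ * \endcode
+ * where the identity elements written below each warp's segment make the
+ * out-of-range reads harmless.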
Finally, each thread + * adds the result of the warp sum scan to the result of the scan from the + * first pass. + * + * Because we scan 2*CTA_SIZE elements per thread, we have to call warpscan + * twice. + * + * @param x The first input value for the current thread + * @param y The second input value for the current thread + * @param s_data Temporary shared memory space of 2*CTA_SIZE elements for + * performing the scan + */ +template +__device__ void scanWarps(T x, T y, + T *s_data) +{ + T val = warpscan(x, s_data); + __syncthreads(); + T val2 = warpscan(y, s_data); + + int idx = threadIdx.x; + + if ((idx & 31)==31) + { + s_data[idx >> 5] = traits::op(val, x); + s_data[(idx + blockDim.x) >> 5] = traits::op(val2, y); + } + __syncthreads(); + +#ifndef __DEVICE_EMULATION__ + if (idx < 32) +#endif + { + s_data[idx] = warpscan(s_data[idx], s_data); + } + __syncthreads(); + + val = traits::op(val, s_data[idx >> 5]); + + val2 = traits::op(val2, s_data[(idx + blockDim.x) >> 5]); + + __syncthreads(); + + s_data[idx] = val; + s_data[idx+blockDim.x] = val2; +} + +/** +* @brief CTA-level scan routine; scans s_data in shared memory in each thread block +* +* This function is the main CTA-level scan function. It may be called by other +* CUDA __global__ or __device__ functions. This function scans 2 * CTA_SIZE elements. +* Each thread is responsible for one element in each half of the input array. +* \note This code is intended to be run on a CTA of 128 threads. Other sizes are +* untested. +* +* @param[in] s_data The array to be scanned in shared memory +* @param[out] d_blockSums Array of per-block sums +* @param[in] blockSumIndex Location in \a d_blockSums to which to write this block's sum +*/ +template +__device__ void scanCTA(T *s_data, + T *d_blockSums, + unsigned int blockSumIndex) +{ + T val = s_data[threadIdx.x]; + T val2 = s_data[threadIdx.x + blockDim.x]; + __syncthreads(); + + scanWarps(val, val2, s_data); + __syncthreads(); + + if (traits::writeSums() && threadIdx.x == blockDim.x - 1) + { + d_blockSums[blockSumIndex] = traits::op(val2, s_data[threadIdx.x + blockDim.x]); + } + + +#ifdef __DEVICE_EMULATION__ + // must sync in emulation mode when doing backward scans, because otherwise the + // shared memory array will get reversed before the block sums are read! + if (traits::isBackward()) + __syncthreads(); +#endif +} + + +/** @} */ // end scan functions +/** @} */ // end cudpp_cta diff --git a/lib/gpu/cudpp_mini/cudpp.cpp b/lib/gpu/cudpp_mini/cudpp.cpp new file mode 100644 index 0000000000..7a0dfac9f5 --- /dev/null +++ b/lib/gpu/cudpp_mini/cudpp.cpp @@ -0,0 +1,417 @@ +// ------------------------------------------------------------- +// cuDPP -- CUDA Data Parallel Primitives library +// ------------------------------------------------------------- +// $Revision: 5632 $ +// $Date: 2009-07-01 14:36:01 +1000 (Wed, 01 Jul 2009) $ +// ------------------------------------------------------------- +// This source code is distributed under the terms of license.txt in +// the root directory of this source distribution. +// ------------------------------------------------------------- + +/** + * @file + * cudpp.cpp + * + * @brief Main library source file. Implements wrappers for public + * interface. + * + * Main library source file. Implements wrappers for public + * interface. These wrappers call application-level operators. + * As this grows we may decide to partition into multiple source + * files. 
+ */ + +/** + * \defgroup publicInterface CUDPP Public Interface + * The CUDA public interface comprises the functions, structs, and enums + * defined in cudpp.h. Public interface functions call functions in the + * \link cudpp_app Application-Level\endlink interface. The public + * interface functions include Plan Interface functions and Algorithm + * Interface functions. Plan Inteface functions are used for creating + * CUDPP Plan objects which contain configuration details, intermediate + * storage space, and in the case of cudppSparseMatrix(), data. The + * Algorithm Interface is the set of functions that do the real work + * of CUDPP, such as cudppScan() and cudppSparseMatrixVectorMultiply. + * + * @{ + */ + +/** @name Algorithm Interface + * @{ + */ + +#include "cudpp.h" +#include "cudpp_plan_manager.h" +#include "cudpp_scan.h" +//#include "cudpp_segscan.h" +//#include "cudpp_compact.h" +//#include "cudpp_spmvmult.h" +#include "cudpp_radixsort.h" +//#include "cudpp_rand.h" + +/** + * @brief Performs a scan operation of numElements on its input in + * GPU memory (d_in) and places the output in GPU memory + * (d_out), with the scan parameters specified in the plan pointed to by + * planHandle. + + * The input to a scan operation is an input array, a binary associative + * operator (like + or max), and an identity element for that operator + * (+'s identity is 0). The output of scan is the same size as its input. + * Informally, the output at each element is the result of operator + * applied to each input that comes before it. For instance, the + * output of sum-scan at each element is the sum of all the input + * elements before that input. + * + * More formally, for associative operator + * @htmlonly⊕@endhtmlonly@latexonly$\oplus$@endlatexonly, + * outi = in0 + * @htmlonly⊕@endhtmlonly@latexonly$\oplus$@endlatexonly + * in1 + * @htmlonly⊕@endhtmlonly@latexonly$\oplus$@endlatexonly ... + * @htmlonly⊕@endhtmlonly@latexonly$\oplus$@endlatexonly + * ini-1. + * + * CUDPP supports "exclusive" and "inclusive" scans. For the ADD operator, + * an exclusive scan computes the sum of all input elements before the + * current element, while an inclusive scan computes the sum of all input + * elements up to and including the current element. + * + * Before calling scan, create an internal plan using cudppPlan(). + * + * After you are finished with the scan plan, clean up with cudppDestroyPlan(). + * + * @param[in] planHandle Handle to plan for this scan + * @param[out] d_out output of scan, in GPU memory + * @param[in] d_in input to scan, in GPU memory + * @param[in] numElements number of elements to scan + * + * @see cudppPlan, cudppDestroyPlan + */ +CUDPP_DLL +CUDPPResult cudppScan(CUDPPHandle planHandle, + void *d_out, + const void *d_in, + size_t numElements) +{ + CUDPPScanPlan *plan = (CUDPPScanPlan*)CUDPPPlanManager::GetPlan(planHandle); + if (plan != NULL) + { + cudppScanDispatch(d_out, d_in, numElements, 1, plan); + return CUDPP_SUCCESS; + } + else + { + return CUDPP_ERROR_UNKNOWN; //! @todo Return more specific errors + } +} + +/** + * @brief Performs a segmented scan operation of numElements on its input in + * GPU memory (d_idata) and places the output in GPU memory + * (d_out), with the scan parameters specified in the plan pointed to by + * planHandle. 
+ + * The input to a segmented scan operation is an input array of data, + * an input array of flags which demarcate segments, a binary associative + * operator (like + or max), and an identity element for that operator + * (+'s identity is 0). The array of flags is the same length as the input + * with 1 marking the the first element of a segment and 0 otherwise. The + * output of segmented scan is the same size as its input. Informally, the + * output at each element is the result of operator applied to each input + * that comes before it in that segment. For instance, the output of + * segmented sum-scan at each element is the sum of all the input elements + * before that input in that segment. + * + * More formally, for associative operator + * @htmlonly⊕@endhtmlonly@latexonly$\oplus$@endlatexonly, + * outi = ink + * @htmlonly⊕@endhtmlonly@latexonly$\oplus$@endlatexonly + * ink+1 + * @htmlonly⊕@endhtmlonly@latexonly$\oplus$@endlatexonly ... + * @htmlonly⊕@endhtmlonly@latexonly$\oplus$@endlatexonly + * ini-1. + * k is the index of the first element of the segment in which i lies + * + * We support both "exclusive" and "inclusive" variants. For a segmented sum-scan, + * the exclusive variant computes the sum of all input elements before the + * current element in that segment, while the inclusive variant computes the + * sum of all input elements up to and including the current element, in + * that segment. + * + * Before calling segmented scan, create an internal plan using cudppPlan(). + * + * After you are finished with the scan plan, clean up with cudppDestroyPlan(). + * @param[in] planHandle Handle to plan for this scan + * @param[out] d_out output of segmented scan, in GPU memory + * @param[in] d_idata input data to segmented scan, in GPU memory + * @param[in] d_iflags input flags to segmented scan, in GPU memory + * @param[in] numElements number of elements to perform segmented scan on + * + * @see cudppPlan, cudppDestroyPlan + +CUDPP_DLL +CUDPPResult cudppSegmentedScan(CUDPPHandle planHandle, + void *d_out, + const void *d_idata, + const unsigned int *d_iflags, + size_t numElements) +{ + CUDPPSegmentedScanPlan *plan = + (CUDPPSegmentedScanPlan*)CUDPPPlanManager::GetPlan(planHandle); + if (plan != NULL) + { + cudppSegmentedScanDispatch(d_out, d_idata, d_iflags, numElements, plan); + return CUDPP_SUCCESS; + } + else + { + return CUDPP_ERROR_UNKNOWN; //! @todo Return more specific errors + } +} +*/ +/** + * @brief Performs numRows parallel scan operations of numElements + * each on its input (d_in) and places the output in d_out, + * with the scan parameters set by config. Exactly like cudppScan + * except that it runs on multiple rows in parallel. + * + * Note that to achieve good performance with cudppMultiScan one should + * allocate the device arrays passed to it so that all rows are aligned + * to the correct boundaries for the architecture the app is running on. + * The easy way to do this is to use cudaMallocPitch() to allocate a + * 2D array on the device. Use the \a rowPitch parameter to cudppPlan() + * to specify this pitch. The easiest way is to pass the device pitch + * returned by cudaMallocPitch to cudppPlan() via \a rowPitch. 
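+ *
+ * For example (a sketch only; cudaMallocPitch() reports its pitch in bytes,
+ * while \a rowPitch is given to cudppPlan() in elements):
+ * \code
+ * float  *d_rows;
+ * size_t  pitchInBytes;
+ * cudaMallocPitch((void**)&d_rows, &pitchInBytes,
+ *                 numElements * sizeof(float), numRows);
+ * size_t rowPitch = pitchInBytes / sizeof(float);  // pass to cudppPlan()
+ * \endcode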
+ * + * @param[in] planHandle handle to CUDPPScanPlan + * @param[out] d_out output of scan, in GPU memory + * @param[in] d_in input to scan, in GPU memory + * @param[in] numElements number of elements (per row) to scan + * @param[in] numRows number of rows to scan in parallel + * + * @see cudppScan, cudppPlan + +CUDPP_DLL +CUDPPResult cudppMultiScan(CUDPPHandle planHandle, + void *d_out, + const void *d_in, + size_t numElements, + size_t numRows) +{ + CUDPPScanPlan *plan = (CUDPPScanPlan*)CUDPPPlanManager::GetPlan(planHandle); + if (plan != NULL) + { + cudppScanDispatch(d_out, d_in, numElements, numRows, plan); + return CUDPP_SUCCESS; + } + else + { + return CUDPP_ERROR_UNKNOWN; //! @todo Return more specific errors + } +} +*/ + +/** + * @brief Given an array \a d_in and an array of 1/0 flags in \a + * deviceValid, returns a compacted array in \a d_out of corresponding + * only the "valid" values from \a d_in. + * + * Takes as input an array of elements in GPU memory + * (\a d_in) and an equal-sized unsigned int array in GPU memory + * (\a deviceValid) that indicate which of those input elements are + * valid. The output is a packed array, in GPU memory, of only those + * elements marked as valid. + * + * Internally, uses cudppScan. + * + * Example: + * \code + * d_in = [ a b c d e f ] + * deviceValid = [ 1 0 1 1 0 1 ] + * d_out = [ a c d f ] + * \endcode + * + * @todo [MJH] We need to evaluate whether cudppCompact should be a core member + * of the public interface. It's not clear to me that what the user always + * wants is a final compacted array. Often one just wants the array of indices + * to which each input element should go in the output. The split() routine used + * in radix sort might make more sense to expose. + * + * @param[in] planHandle handle to CUDPPCompactPlan + * @param[out] d_out compacted output + * @param[out] d_numValidElements set during cudppCompact; is set with the + * number of elements valid flags in the d_isValid input array + * @param[in] d_in input to compact + * @param[in] d_isValid which elements in d_in are valid + * @param[in] numElements number of elements in d_in + +CUDPP_DLL +CUDPPResult cudppCompact(CUDPPHandle planHandle, + void *d_out, + size_t *d_numValidElements, + const void *d_in, + const unsigned int *d_isValid, + size_t numElements) +{ + CUDPPCompactPlan *plan = (CUDPPCompactPlan*)CUDPPPlanManager::GetPlan(planHandle); + if (plan != NULL) + { + cudppCompactDispatch(d_out, d_numValidElements, d_in, d_isValid, + numElements, plan); + return CUDPP_SUCCESS; + } + else + { + return CUDPP_ERROR_UNKNOWN; //! @todo Return more specific errors. + } +} +*/ +/** + * @brief Sorts key-value pairs or keys only + * + * Takes as input an array of keys in GPU memory + * (d_keys) and an optional array of corresponding values, + * and outputs sorted arrays of keys and (optionally) values in place. + * Key-value and key-only sort is selected through the configuration of + * the plan, using the options CUDPP_OPTION_KEYS_ONLY and + * CUDPP_OPTION_KEY_VALUE_PAIRS. + * + * Supported key types are CUDPP_FLOAT and CUDPP_UINT. Values can be + * any 32-bit type (internally, values are treated only as a payload + * and cast to unsigned int). + * + * @todo Determine if we need to provide an "out of place" sort interface. 
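+ *
+ * A minimal key-value usage sketch (the rows and pitch arguments of 1 and 0
+ * passed to cudppPlan() are assumptions for a flat 1-D array; 32 sorts on
+ * all key bits):
+ * \code
+ * CUDPPConfiguration config;
+ * config.algorithm = CUDPP_SORT_RADIX;
+ * config.datatype  = CUDPP_FLOAT;
+ * config.op        = CUDPP_ADD;  // operator is only used by scan-type algorithms
+ * config.options   = CUDPP_OPTION_KEY_VALUE_PAIRS;
+ *
+ * CUDPPHandle sortPlan;
+ * cudppPlan(&sortPlan, config, numElements, 1, 0);
+ * cudppSort(sortPlan, d_keys, d_values, 32, numElements);
+ * cudppDestroyPlan(sortPlan);
+ * \endcode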
+ * + * @param[in] planHandle handle to CUDPPSortPlan + * @param[out] d_keys keys by which key-value pairs will be sorted + * @param[in] d_values values to be sorted + * @param[in] keyBits the number of least significant bits in each element + * of d_keys to sort by + * @param[in] numElements number of elements in d_keys and d_values + * + * @see cudppPlan, CUDPPConfiguration, CUDPPAlgorithm + */ +CUDPP_DLL +CUDPPResult cudppSort(CUDPPHandle planHandle, + void *d_keys, + void *d_values, + int keyBits, + size_t numElements) +{ + CUDPPRadixSortPlan *plan = (CUDPPRadixSortPlan*)CUDPPPlanManager::GetPlan(planHandle); + if (plan != NULL) + { + cudppRadixSortDispatch(d_keys, d_values, numElements, keyBits, plan); + return CUDPP_SUCCESS; + } + else + { + return CUDPP_ERROR_UNKNOWN; //! @todo Return more specific errors. + } +} + +/** @brief Perform matrix-vector multiply y = A*x for arbitrary sparse matrix A and vector x + * + * Given a matrix object handle (which has been initialized using cudppSparseMatrix()), + * This function multiplies the input vector \a d_x by the matrix referred to by + * \a sparseMatrixHandle, returning the result in \a d_y. + * + * @param sparseMatrixHandle Handle to a sparse matrix object created with cudppSparseMatrix() + * @param d_y The output vector, y + * @param d_x The input vector, x + * + * @see cudppSparseMatrix, cudppDestroySparseMatrix + +CUDPP_DLL +CUDPPResult cudppSparseMatrixVectorMultiply(CUDPPHandle sparseMatrixHandle, + void *d_y, + const void *d_x) +{ + CUDPPSparseMatrixVectorMultiplyPlan *plan = + (CUDPPSparseMatrixVectorMultiplyPlan*)CUDPPPlanManager::GetPlan(sparseMatrixHandle); + + if (plan != NULL) + { + cudppSparseMatrixVectorMultiplyDispatch(d_y, d_x, plan); + return CUDPP_SUCCESS; + } + else + { + return CUDPP_ERROR_UNKNOWN; //! @todo Return more specific errors. + } +} +*/ +/** + * @brief Rand puts \a numElements random 32-bit elements into \a d_out + * + + * Outputs \a numElements random values to \a d_out. \a d_out must be of + * type unsigned int, allocated in device memory. + * + * The algorithm used for the random number generation is stored in \a planHandle. + * Depending on the specification of the pseudo random number generator(PRNG), + * the generator may have one or more seeds. To set the seed, use cudppRandSeed(). + * + * @todo Currently only MD5 PRNG is supported. We may provide more rand routines in + * the future. + * + * @param[in] planHandle Handle to plan for rand + * @param[in] numElements number of elements in d_out. + * @param[out] d_out output of rand, in GPU memory. Should be an array of unsigned integers. + * + * @see cudppPlan, CUDPPConfiguration, CUDPPAlgorithm + +CUDPP_DLL +CUDPPResult cudppRand(CUDPPHandle planHandle,void * d_out, size_t numElements) +{ + CUDPPRandPlan * plan = (CUDPPRandPlan *) CUDPPPlanManager::GetPlan(planHandle); + if(plan != NULL) + { + //dispatch the rand algorithm here + cudppRandDispatch(d_out, numElements, plan); + return CUDPP_SUCCESS; + } + else + return CUDPP_ERROR_UNKNOWN; //! @todo Return more specific errors +} +*/ + +/**@brief Sets the seed used for rand + * + * The seed is crucial to any random number generator as it allows a + * sequence of random numbers to be replicated. Since there may be + * multiple different rand algorithms in CUDPP, cudppRandSeed + * uses \a planHandle to determine which seed to set. Each rand + * algorithm has its own unique set of seeds depending on what + * the algorithm needs. 
+ * + * @param[in] planHandle the handle to the plan which specifies which rand seed to set + * @param[in] seed the value which the internal cudpp seed will be set to + +CUDPP_DLL +CUDPPResult cudppRandSeed(const CUDPPHandle planHandle, unsigned int seed) +{ + CUDPPRandPlan * plan = (CUDPPRandPlan *) CUDPPPlanManager::GetPlan(planHandle); + //switch on the plan to figure out which seed to update + switch(plan->m_config.algorithm) + { + case CUDPP_RAND_MD5: + plan->m_seed = seed; + break; + default: + break; + } + + return CUDPP_SUCCESS; +}//end cudppRandSeed +*/ +/** @} */ // end Algorithm Interface +/** @} */ // end of publicInterface group + +// Leave this at the end of the file +// Local Variables: +// mode:c++ +// c-file-style: "NVIDIA" +// End: + diff --git a/lib/gpu/cudpp_mini/cudpp.h b/lib/gpu/cudpp_mini/cudpp.h new file mode 100644 index 0000000000..3093681523 --- /dev/null +++ b/lib/gpu/cudpp_mini/cudpp.h @@ -0,0 +1,525 @@ +// ------------------------------------------------------------- +// CUDPP -- CUDA Data Parallel Primitives library +// ------------------------------------------------------------- +// $Revision$ +// $Date$ +// ------------------------------------------------------------- +// This source code is distributed under the terms of license.txt in +// the root directory of this source distribution. +// ------------------------------------------------------------- + +/** + * @file + * cudpp.h + * + * @brief Main library header file. Defines public interface. + * + * The CUDPP public interface is a C-only interface to enable + * linking with code written in other languages (e.g. C, C++, + * and Fortran). While the internals of CUDPP are not limited + * to C (C++ features are used), the public interface is + * entirely C (thus it is declared "extern C"). + */ + +/** + * \mainpage + * + * \section introduction Introduction + * + * CUDPP is the CUDA Data Parallel Primitives Library. CUDPP is a + * library of data-parallel algorithm primitives such as + * parallel-prefix-sum ("scan"), parallel sort and parallel reduction. + * Primitives such as these are important building blocks for a wide + * variety of data-parallel algorithms, including sorting, stream + * compaction, and building data structures such as trees and + * summed-area tables. + * + * \section overview Overview Presentation + * + * A brief set of slides that describe the features, design principles, + * applications and impact of CUDPP is available here: + * CUDPP Presentation. + * + * \section homepage Homepage + * Homepage for CUDPP: http://code.google.com/p/cudpp + * + * Announcements and discussion of CUDPP are hosted on the + * CUDPP Google Group. + * + * \section getting-started Getting Started with CUDPP + * + * You may want to start by browsing the \link publicInterface CUDPP Public + * Interface\endlink. For information on building CUDPP, see + * \ref building-cudpp "Building CUDPP". + * + * The "apps" subdirectory included with CUDPP has a few source code samples + * that use CUDPP: + * - \ref example_simpleCUDPP "simpleCUDPP", a simple example of using + * cudppScan() + * - satGL, an example of using cudppMultiScan() to generate a summed-area + * table (SAT) of a scene rendered in real time. The SAT is then used to simulate + * depth of field blur. + * - cudpp_testrig, a comprehensive test application for all the functionality + * of CUDPP + * + * We have also provided a code walkthrough of the + * \ref example_simpleCUDPP "simpleCUDPP" example. 
+ * + * \section getting-help Getting Help and Reporting Problems + * + * To get help using CUDPP, please use the + * CUDPP Google Group. + * + * To report CUDPP bugs or request features, you may use either the above + * CUDPP Google Group, or you can file an issue directly using + * Google Code. + * + * \section release-notes Release Notes + * + * For specific release details see the \ref changelog "Change Log". + * + * This release (1.1.1) is a bugfix release to CUDPP 1.1 that includes + * fixes to support CUDA 3.0 and the new NVIDIA Fermi architecture, + * including GeForce 400 series and Tesla 20 series GPUs. It also has + * bug fixes for 64-bit OSes. + * + * \section opSys Operating System Support + * + * This release (1.1.1) has been thoroughly tested on the following OSes. + * - Windows XP (32-bit) (CUDA 2.2, 3.0) + * - Windows 7 (64-bit) (CUDA 3.0) + * - Redhat Enterprise Linux 5 (64-bit) (CUDA 3.0) + * - and Mac OS X 10.6 (Snow Leopard, 64-bit) (CUDA 3.0) + * + * We expect CUDPP to build and run correctly on other flavors of Linux + * and Windows, but these are not actively tested by the developers at + * this time. + * + * Notes: CUDPP is not compatible with CUDA 2.1. A compiler bug in 2.1 + * causes the compiler to crash. Also, starting with CUDPP 1.1.1, we are + * no longer testing CUDA device emulation, because it is deprecated in + * CUDA 3.0 and will be removed from future CUDA versions. + * + * \section cuda CUDA + * CUDPP is implemented in + * CUDA C/C++. It requires the + * CUDA Toolkit version 2.2 or later. Please see the NVIDIA + * CUDA homepage to download + * CUDA as well as the CUDA Programming Guide and CUDA SDK, which includes many + * CUDA code examples. Some of the samples in the CUDA SDK (including + * "marchingCubes", "lineOfSight", and radixSort) also use CUDPP. + * + * \section design-goals Design Goals + * Design goals for CUDPP include: + * + * - Performance. We aim to provide best-of-class performance for our + * primitives. We welcome suggestions and contributions that will improve + * CUDPP performance. We also want to provide primitives that can be easily + * benchmarked, and compared against other implementations on GPUs and other + * processors. + * - Modularity. We want our primitives to be easily included in other + * applications. To that end we have made the following design decisions: + * - CUDPP is provided as a library that can link against other applications. + * - CUDPP calls run on the GPU on GPU data. Thus they can be used + * as standalone calls on the GPU (on GPU data initialized by the + * calling application) and, more importantly, as GPU components in larger + * CPU/GPU applications. + * - CUDPP is implemented as 4 layers: + * -# The \link publicInterface Public Interface\endlink is the external + * library interface, which is the intended entry point for most + * applications. The public interface calls into the + * \link cudpp_app Application-Level API\endlink. + * -# The \link cudpp_app Application-Level API\endlink comprises functions + * callable from CPU code. These functions execute code jointly on the + * CPU (host) and the GPU by calling into the + * \link cudpp_kernel Kernel-Level API\endlink below them. + * -# The \link cudpp_kernel Kernel-Level API\endlink comprises functions + * that run entirely on the GPU across an entire grid of thread blocks. + * These functions may call into the \link cudpp_cta CTA-Level API\endlink + * below them. 
+ * -# The \link cudpp_cta CTA-Level API\endlink comprises functions that run + * entirely on the GPU within a single Cooperative Thread Array (CTA, + * aka thread block). These are low-level functions that implement core + * data-parallel algorithms, typically by processing data within shared + * (CUDA \c __shared__) memory. + * + * Programmers may use any of the lower three CUDPP layers in their own + * programs by building the source directly into their application. However, + * the typical usage of CUDPP is to link to the library and invoke functions in + * the CUDPP \link publicInterface Public Interface\endlink, as in the + * \ref example_simpleCUDPP "simpleCUDPP", satGL, and cudpp_testrig application + * examples included in the CUDPP distribution. + * + * In the future, if and when CUDA supports building device-level libraries, we + * hope to enhance CUDPP to ease the use of CUDPP internal algorithms at all + * levels. + * + * \subsection uses Use Cases + * We expect the normal use of CUDPP will be in one of two ways: + * -# Linking the CUDPP library against another application. + * -# Running our "test" application, cudpp_testrig, that exercises + * CUDPP functionality. + * + * \section references References + * The following publications describe work incorporated in CUDPP. + * + * - Mark Harris, Shubhabrata Sengupta, and John D. Owens. "Parallel Prefix Sum (Scan) with CUDA". In Hubert Nguyen, editor, GPU Gems 3, chapter 39, pages 851–876. Addison Wesley, August 2007. http://graphics.idav.ucdavis.edu/publications/print_pub?pub_id=916 + * - Shubhabrata Sengupta, Mark Harris, Yao Zhang, and John D. Owens. "Scan Primitives for GPU Computing". In Graphics Hardware 2007, pages 97–106, August 2007. http://graphics.idav.ucdavis.edu/publications/print_pub?pub_id=915 + * - Shubhabrata Sengupta, Mark Harris, and Michael Garland. "Efficient parallel scan algorithms for GPUs". NVIDIA Technical Report NVR-2008-003, December 2008. http://mgarland.org/papers.html#segscan-tr + * - Nadathur Satish, Mark Harris, and Michael Garland. "Designing Efficient Sorting Algorithms for Manycore GPUs". In Proceedings of the 23rd IEEE International Parallel & Distributed Processing Symposium, May 2009. http://mgarland.org/papers.html#gpusort + * - Stanley Tzeng, Li-Yi Wei. "Parallel White Noise Generation on a GPU via Cryptographic Hash". In Proceedings of the 2008 Symposium on Interactive 3D Graphics and Games, pages 79–87, February 2008. http://research.microsoft.com/apps/pubs/default.aspx?id=70502 + * + * Many researchers are using CUDPP in their work, and there are many publications + * that have used it \ref cudpp_refs "(references)". If your work uses CUDPP, please + * let us know by sending us a reference (preferably in BibTeX format) to your work. + * + * \section citing Citing CUDPP + * + * If you make use of CUDPP primitives in your work and want to cite + * CUDPP (thanks!), we would prefer for you to cite the appropriate + * papers above, since they form the core of CUDPP. To be more specific, + * the GPU Gems paper describes (unsegmented) scan, multi-scan for + * summed-area tables, and stream compaction. The NVIDIA technical report + * describes the current scan and segmented scan algorithms used in the + * library, and the Graphics Hardware paper describes an earlier + * implementation of segmented scan, quicksort, and sparse matrix-vector + * multiply. The IPDPS paper describes the radix sort used in CUDPP, and + * the I3D paper describes the random number generation algorithm. 
+ * + * \section credits Credits + * \subsection developers CUDPP Developers + * - Mark Harris, NVIDIA Corporation + * - John D. Owens, University of California, Davis + * - Shubho Sengupta, University of California, Davis + * - Stanley Tzeng, University of California, Davis + * - Yao Zhang, University of California, Davis + * - Andrew Davidson, University of California, Davis (formerly Louisiana State University) + * + * \subsection contributors Other CUDPP Contributors + * - Nadatur Satish, University of California, Berkeley + * + * \subsection acknowledgments Acknowledgments + * + * Thanks to Jim Ahrens, Timo Aila, Nathan Bell, Ian Buck, Guy Blelloch, + * Jeff Bolz, Michael Garland, Jeff Inman, Eric Lengyel, Samuli Laine, + * David Luebke, Pat McCormick, and Richard Vuduc for their contributions + * during the development of this library. + * + * CUDPP Developers from UC Davis thank their funding agencies: + * - Department of Energy Early Career Principal Investigator Award + * DE-FG02-04ER25609 + * - SciDAC Institute for Ultrascale Visualization (http://www.iusv.org/) + * - Los Alamos National Laboratory + * - National Science Foundation (grant 0541448) + * - Generous hardware donations from NVIDIA + * + * \section license-overview CUDPP Copyright and Software License + * CUDPP is copyright The Regents of the University of California, Davis campus + * and NVIDIA Corporation. The library, examples, and all source code are + * released under the BSD license, designed to encourage reuse of this software + * in other projects, both commercial and non-commercial. For details, please + * see the \ref license page. + * + * Note that prior to release 1.1 of CUDPP, the license used was a modified + * BSD license. With release 1.1, this license was replaced with the pure BSD + * license to facilitate the use of open source hosting of the code. + */ + +/** + * @page license CUDPP License + * + * \section licenseBSD CUDPP License + * + * CUDPP is released under the + * BSD license. + * + * @include license.txt + * + */ + +/** + * @page changelog CUDPP Change Log + * + * @include changelog.txt + */ + +/** + * @page cudpp_refs Publications that use CUDPP + * + * @htmlinclude doc/bib/cudpp_refs.html + */ + +/** + * @page cudpp_refs_bib Bibliography for publications that use CUDPP + * + * @htmlinclude doc/bib/cudpp_refs_bib.html + */ + +/** + * @page building-cudpp Building CUDPP + * + * CUDPP has currently been tested in Windows XP, Windows Vista, Mac OS X + * and Linux. See \ref release-notes for release specific platform support. + * + * \section build-win32 Building CUDPP on Windows XP + * + * CUDPP can be built using either or MSVC 8 (2005) or MSVC 9 (2008). To + * build, open cudpp/cudpp.sln. Then you can build the library + * using the "build" command as you would with any other workspace. There are + * four configurations: debug, release, emudebug, and emurelease. The first + * two are self-explanatory. The second two are built to use CUDA device + * emulation, meaning they will be run (slowly) on the CPU. + * + * \section build-linux Building CUDPP on Linux and Mac OS X + * + * CUDPP can be built using standard g++ and Make tools on Linux, by typing + * "make" in the "cudpp/" subdirectory. Before building CUDPP, you should + * first build the CUDA Utility Library (libcutil) by typing "make; make dbg=1" + * in the "common/" subdirectory. This will generate libcutil.a and + * libcutilD.a. 
+ * + * The makefile for CUDPP and all sample applications take the optional + * arguments "emu=1" and "dbg=1". The former builds CUDPP for device emulation, + * and the latter for debugging. The two flags can be combined. "verbose=1" + * can be used to see all compiler output. + * + * \section build-apps Building CUDPP Sample Applications + * + * The sample applications in the "apps/" subdirectory can be built exactly + * like CUDPP is--either by opening the appropriate .sln/.vcproj file in MSVC + * in Windows, or using "make" in Linux. + * + * On some Linux installations you will get linker errors relating to "-lXi" + * and "-lXmu". To fix this, you will need to install libXi and libXmu. On + * Debian and Ubuntu, for example, you can simply run + * "sudo apt-get install libxi-dev", and + * "sudo apt-get install libxmu-dev" + * + */ + +#ifndef __CUDPP_H__ +#define __CUDPP_H__ + +#include // for size_t + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * @brief CUDPP Result codes returned by CUDPP API functions. + */ +enum CUDPPResult +{ + CUDPP_SUCCESS = 0, /**< No error. */ + CUDPP_ERROR_INVALID_HANDLE, /**< Specified handle (for example, + to a plan) is invalid. **/ + CUDPP_ERROR_ILLEGAL_CONFIGURATION, /**< Specified configuration is + illegal. For example, an + invalid or illogical + combination of options. */ + CUDPP_ERROR_UNKNOWN = 9999 /**< Unknown or untraceable error. */ +}; + +/** + * @brief Options for configuring CUDPP algorithms. + * + * @see CUDPPConfiguration, cudppPlan, CUDPPAlgorithm + */ +enum CUDPPOption +{ + CUDPP_OPTION_FORWARD = 0x1, /**< Algorithms operate forward: + * from start to end of input + * array */ + CUDPP_OPTION_BACKWARD = 0x2, /**< Algorithms operate backward: + * from end to start of array */ + CUDPP_OPTION_EXCLUSIVE = 0x4, /**< Exclusive (for scans) - scan + * includes all elements up to (but + * not including) the current + * element */ + CUDPP_OPTION_INCLUSIVE = 0x8, /**< Inclusive (for scans) - scan + * includes all elements up to and + * including the current element */ + CUDPP_OPTION_CTA_LOCAL = 0x10, /**< Algorithm performed only on + * the CTAs (blocks) with no + * communication between blocks. + * @todo Currently ignored. */ + CUDPP_OPTION_KEYS_ONLY = 0x20, /**< No associated value to a key + * (for global radix sort) */ + CUDPP_OPTION_KEY_VALUE_PAIRS = 0x40, /**< Each key has an associated value */ +}; + + +/** + * @brief Datatypes supported by CUDPP algorithms. + * + * @see CUDPPConfiguration, cudppPlan + */ +enum CUDPPDatatype +{ + CUDPP_CHAR, //!< Character type (C char) + CUDPP_UCHAR, //!< Unsigned character (byte) type (C unsigned char) + CUDPP_INT, //!< Integer type (C int) + CUDPP_UINT, //!< Unsigned integer type (C unsigned int) + CUDPP_FLOAT //!< Float type (C float) +}; + +/** + * @brief Operators supported by CUDPP algorithms (currently scan and + * segmented scan). + * + * These are all binary associative operators. + * + * @see CUDPPConfiguration, cudppPlan + */ +enum CUDPPOperator +{ + CUDPP_ADD, //!< Addition of two operands + CUDPP_MULTIPLY, //!< Multiplication of two operands + CUDPP_MIN, //!< Minimum of two operands + CUDPP_MAX //!< Maximum of two operands +}; + +/** +* @brief Algorithms supported by CUDPP. Used to create appropriate plans using +* cudppPlan. 
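+*
+* As an illustrative sketch, one algorithm is combined with an operator, a
+* datatype, and OR-ed options in a CUDPPConfiguration (the particular values
+* below are only an example):
+* \code
+* CUDPPConfiguration config;
+* config.algorithm = CUDPP_SCAN;
+* config.op        = CUDPP_MAX;
+* config.datatype  = CUDPP_FLOAT;
+* config.options   = CUDPP_OPTION_BACKWARD | CUDPP_OPTION_INCLUSIVE;
+* \endcode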
+* +* @see CUDPPConfiguration, cudppPlan +*/ +enum CUDPPAlgorithm +{ + CUDPP_SCAN, //!< Scan or prefix-sum + CUDPP_SEGMENTED_SCAN, //!< Segmented scan + CUDPP_COMPACT, //!< Stream compact + CUDPP_REDUCE, //!< Parallel reduction (NOTE: currently unimplemented) + CUDPP_SORT_RADIX, //!< Radix sort + CUDPP_SPMVMULT, //!< Sparse matrix-dense vector multiplication + CUDPP_RAND_MD5, //!< PseudoRandom Number Generator using MD5 hash algorithm + CUDPP_ALGORITHM_INVALID, //!< Placeholder at end of enum +}; + +/** +* @brief Configuration struct used to specify algorithm, datatype, +* operator, and options when creating a plan for CUDPP algorithms. +* +* @see cudppPlan +*/ +struct CUDPPConfiguration +{ + CUDPPAlgorithm algorithm; //!< The algorithm to be used + CUDPPOperator op; //!< The numerical operator to be applied + CUDPPDatatype datatype; //!< The datatype of the input arrays + unsigned int options; //!< Options to configure the algorithm +}; + +#define CUDPP_INVALID_HANDLE 0xC0DABAD1 +typedef size_t CUDPPHandle; + +/* To use CUDPP as a static library, #define CUDPP_STATIC_LIB before + * including cudpp.h + */ +#define CUDPP_STATIC_LIB +#ifndef CUDPP_DLL + #ifdef _WIN32 + #ifdef CUDPP_STATIC_LIB + #define CUDPP_DLL + #else + #ifdef BUILD_DLL + #define CUDPP_DLL __declspec(dllexport) + #else + #define CUDPP_DLL __declspec(dllimport) + #endif + #endif + #else + #define CUDPP_DLL + #endif +#endif + +// Plan allocation (for scan, sort, and compact) + +CUDPP_DLL +CUDPPResult cudppPlan(CUDPPHandle *planHandle, + CUDPPConfiguration config, + size_t n, + size_t rows, + size_t rowPitch); + +CUDPP_DLL +CUDPPResult cudppDestroyPlan(CUDPPHandle plan); + +// Scan and sort algorithms + +CUDPP_DLL +CUDPPResult cudppScan(CUDPPHandle planHandle, + void *d_out, + const void *d_in, + size_t numElements); + +CUDPP_DLL +CUDPPResult cudppMultiScan(CUDPPHandle planHandle, + void *d_out, + const void *d_in, + size_t numElements, + size_t numRows); + +CUDPP_DLL +CUDPPResult cudppSegmentedScan(CUDPPHandle planHandle, + void *d_out, + const void *d_idata, + const unsigned int *d_iflags, + size_t numElements); + +CUDPP_DLL +CUDPPResult cudppCompact(CUDPPHandle planHandle, + void *d_out, + size_t *d_numValidElements, + const void *d_in, + const unsigned int *d_isValid, + size_t numElements); + +CUDPP_DLL +CUDPPResult cudppSort(CUDPPHandle planHandle, + void *d_keys, + void *d_values, + int keybits, + size_t numElements); + +// Sparse matrix allocation + +CUDPP_DLL +CUDPPResult cudppSparseMatrix(CUDPPHandle *sparseMatrixHandle, + CUDPPConfiguration config, + size_t n, + size_t rows, + const void *A, + const unsigned int *h_rowIndices, + const unsigned int *h_indices); + +CUDPP_DLL +CUDPPResult cudppDestroySparseMatrix(CUDPPHandle sparseMatrixHandle); + +// Sparse matrix-vector algorithms + +CUDPP_DLL +CUDPPResult cudppSparseMatrixVectorMultiply(CUDPPHandle sparseMatrixHandle, + void *d_y, + const void *d_x); + +// random number generation algorithms +CUDPP_DLL +CUDPPResult cudppRand(CUDPPHandle planHandle,void * d_out, size_t numElements); + +CUDPP_DLL +CUDPPResult cudppRandSeed(const CUDPPHandle planHandle, unsigned int seed); + +#ifdef __cplusplus +} +#endif + +#endif + +// Leave this at the end of the file +// Local Variables: +// mode:c++ +// c-file-style: "NVIDIA" +// End: diff --git a/lib/gpu/cudpp_mini/cudpp_globals.h b/lib/gpu/cudpp_mini/cudpp_globals.h new file mode 100644 index 0000000000..3d18a5727c --- /dev/null +++ b/lib/gpu/cudpp_mini/cudpp_globals.h @@ -0,0 +1,66 @@ +// 
------------------------------------------------------------- +// cuDPP -- CUDA Data Parallel Primitives library +// ------------------------------------------------------------- +// $Revision$ +// $Date$ +// ------------------------------------------------------------- +// This source code is distributed under the terms of license.txt in +// the root directory of this source distribution. +// ------------------------------------------------------------- + +/** + * @file + * cudpp_globals.h + * + * @brief Global declarations defining machine characteristics of GPU target + * These are currently set for best performance on G8X GPUs. The optimal + * parameters may change on future GPUs. In the future, we hope to make + * CUDPP a self-tuning library. + */ + +#ifndef __CUDPP_GLOBALS_H__ +#define __CUDPP_GLOBALS_H__ + +const int NUM_BANKS = 16; /**< Number of shared memory banks */ +const int LOG_NUM_BANKS = 4; /**< log_2(NUM_BANKS) */ +const int CTA_SIZE = 128; /**< Number of threads in a CTA */ +const int WARP_SIZE = 32; /**< Number of threads in a warp */ +const int LOG_CTA_SIZE = 7; /**< log_2(CTA_SIZE) */ +const int LOG_WARP_SIZE = 5; /**< log_2(WARP_SIZE) */ +const int LOG_SIZEOF_FLOAT = 2; /**< log_2(sizeof(float)) */ +const int SCAN_ELTS_PER_THREAD = 8; /**< Number of elements per scan thread */ +const int SEGSCAN_ELTS_PER_THREAD = 8; /**< Number of elements per segmented scan thread */ + +const int maxSharedMemoryPerBlock = 16384; /**< Number of bytes of shared + memory in each block */ +const int maxThreadsPerBlock = CTA_SIZE; /**< Maximum number of + * threads in a CTA */ + +/** +* @brief Macro to insert necessary __syncthreads() in device emulation mode +*/ +#ifdef __DEVICE_EMULATION__ +#define __EMUSYNC __syncthreads() +#else +#define __EMUSYNC +#endif + + +#define AVOID_BANK_CONFLICTS /**< Set if by default, we want our + * shared memory allocation to perform + * additional computation to avoid bank + * conflicts */ + +#ifdef AVOID_BANK_CONFLICTS +#define CONFLICT_FREE_OFFSET(index) ((index) >> LOG_NUM_BANKS) +#else +#define CONFLICT_FREE_OFFSET(index) (0) +#endif + +#endif // __CUDPP_GLOBALS_H__ + +// Leave this at the end of the file +// Local Variables: +// mode:c++ +// c-file-style: "NVIDIA" +// End: diff --git a/lib/gpu/cudpp_mini/cudpp_maximal_launch.cpp b/lib/gpu/cudpp_mini/cudpp_maximal_launch.cpp new file mode 100644 index 0000000000..5bf3a55d98 --- /dev/null +++ b/lib/gpu/cudpp_mini/cudpp_maximal_launch.cpp @@ -0,0 +1,94 @@ +// ------------------------------------------------------------- +// cuDPP -- CUDA Data Parallel Primitives library +// ------------------------------------------------------------- +// $Revision$ +// $Date$ +// ------------------------------------------------------------- +// This source code is distributed under the terms of license.txt +// in the root directory of this source distribution. +// ------------------------------------------------------------- +#include "cudpp_maximal_launch.h" + +inline size_t min(size_t x, size_t y) +{ + return (x <= y) ? x : y; +} + +inline size_t max(size_t x, size_t y) +{ + return (x >= y) ? 
x : y; +} + +// computes next highest multiple of f from x +inline size_t multiple(size_t x, size_t f) +{ + return ((x + (f-1)) / f); +} + + +// MS Excel-style CEIL() function +// Rounds x up to nearest multiple of f +inline size_t ceiling(size_t x, size_t f) +{ + return multiple(x, f) * f; +} + +extern "C" +size_t maxBlocks(cudaFuncAttributes &attribs, + cudaDeviceProp &devprop, + size_t bytesDynamicSharedMem, + size_t threadsPerBlock) +{ + + // Determine the maximum number of CTAs that can be run simultaneously for each kernel + // This is equivalent to the calculation done in the CUDA Occupancy Calculator spreadsheet + const unsigned int regAllocationUnit = (devprop.major < 2 && devprop.minor < 2) ? 256 : 512; // in registers + const unsigned int warpAllocationMultiple = 2; + const unsigned int smemAllocationUnit = 512; // in bytes + const unsigned int maxThreadsPerSM = (devprop.major < 2 && devprop.minor < 2) ? 768 : 1024; // sm_12 GPUs increase threads/SM to 1024 + const unsigned int maxBlocksPerSM = 8; + + // Number of warps (round up to nearest whole multiple of warp size) + size_t numWarps = multiple(threadsPerBlock, devprop.warpSize); + // Round up to warp allocation multiple + numWarps = ceiling(numWarps, warpAllocationMultiple); + + // Number of regs is regs per thread times number of warps times warp size + size_t regsPerCTA = attribs.numRegs * devprop.warpSize * numWarps; + // Round up to multiple of register allocation unit size + regsPerCTA = ceiling(regsPerCTA, regAllocationUnit); + + size_t smemBytes = attribs.sharedSizeBytes + bytesDynamicSharedMem; + size_t smemPerCTA = ceiling(smemBytes, smemAllocationUnit); + + size_t ctaLimitRegs = regsPerCTA > 0 ? devprop.regsPerBlock / regsPerCTA : maxBlocksPerSM; + size_t ctaLimitSMem = smemPerCTA > 0 ? devprop.sharedMemPerBlock / smemPerCTA : maxBlocksPerSM; + size_t ctaLimitThreads = maxThreadsPerSM / threadsPerBlock; + + return devprop.multiProcessorCount * min(ctaLimitRegs, min(ctaLimitSMem, min(ctaLimitThreads, maxBlocksPerSM))); +} + +extern "C" +size_t maxBlocksFromPointer(void* kernel, + size_t bytesDynamicSharedMem, + size_t threadsPerBlock) +{ + cudaDeviceProp devprop; + int deviceID = -1; + cudaError_t err = cudaGetDevice(&deviceID); + if (err == cudaSuccess) + { + err = cudaGetDeviceProperties(&devprop, deviceID); + if (err != cudaSuccess) + return -1; + + cudaFuncAttributes attr; + err = cudaFuncGetAttributes(&attr, (const char*)kernel); + if (err != cudaSuccess) + return -1; + + return maxBlocks(attr, devprop, bytesDynamicSharedMem, threadsPerBlock); + } + + return -1; +} diff --git a/lib/gpu/cudpp_mini/cudpp_maximal_launch.h b/lib/gpu/cudpp_mini/cudpp_maximal_launch.h new file mode 100644 index 0000000000..54e31c352a --- /dev/null +++ b/lib/gpu/cudpp_mini/cudpp_maximal_launch.h @@ -0,0 +1,37 @@ +// ------------------------------------------------------------- +// cuDPP -- CUDA Data Parallel Primitives library +// ------------------------------------------------------------- +// $Revision$ +// $Date$ +// ------------------------------------------------------------- +// This source code is distributed under the terms of license.txt +// in the root directory of this source distribution. 
+// ------------------------------------------------------------- +#ifndef _MAXIMAL_LAUNCH_H_ +#define _MAXIMAL_LAUNCH_H_ + +#include "cuda_runtime.h" + +extern "C" +size_t maxBlocks(cudaFuncAttributes &attribs, + cudaDeviceProp &devprop, + size_t bytesDynamicSharedMem, + size_t threadsPerBlock); + +extern "C" +size_t maxBlocksFromPointer(void* kernel, + size_t bytesDynamicSharedMem, + size_t threadsPerBlock); + +#ifdef __cplusplus + +template +size_t maxBlocks(T kernel, + size_t bytesDynamicSharedMem, + size_t threadsPerBlock) +{ + return maxBlocksFromPointer((void*)kernel, bytesDynamicSharedMem, threadsPerBlock); +} +#endif + +#endif // _MAXIMAL_LAUNCH_H_ diff --git a/lib/gpu/cudpp_mini/cudpp_plan.cpp b/lib/gpu/cudpp_mini/cudpp_plan.cpp new file mode 100644 index 0000000000..62d6a3da69 --- /dev/null +++ b/lib/gpu/cudpp_mini/cudpp_plan.cpp @@ -0,0 +1,459 @@ +// ------------------------------------------------------------- +// cuDPP -- CUDA Data Parallel Primitives library +// ------------------------------------------------------------- +// $Revision: 3572$ +// $Date: 2007-11-19 13:58:06 +0000 (Mon, 19 Nov 2007) $ +// ------------------------------------------------------------- +// This source code is distributed under the terms of license.txt +// in the root directory of this source distribution. +// ------------------------------------------------------------- + +#include "cudpp.h" +#include "cudpp_plan_manager.h" +#include "cudpp_scan.h" +//#include "cudpp_segscan.h" +//#include "cudpp_compact.h" +//#include "cudpp_spmvmult.h" +#include "cudpp_radixsort.h" + +#include + +CUDPPPlanManager* CUDPPPlanManager::m_instance = NULL; + +CUDPPResult validateOptions(CUDPPConfiguration config, size_t /*numElements*/, size_t numRows, size_t /*rowPitch*/) +{ + CUDPPResult ret = CUDPP_SUCCESS; + if ((config.options & CUDPP_OPTION_BACKWARD) && (config.options & CUDPP_OPTION_FORWARD)) + ret = CUDPP_ERROR_ILLEGAL_CONFIGURATION; + if ((config.options & CUDPP_OPTION_EXCLUSIVE) && (config.options & CUDPP_OPTION_INCLUSIVE)) + ret = CUDPP_ERROR_ILLEGAL_CONFIGURATION; + + if (config.algorithm == CUDPP_COMPACT && numRows > 1) + ret = CUDPP_ERROR_ILLEGAL_CONFIGURATION; //!< @todo: add support for multi-row cudppCompact + + return ret; +} + +/** @addtogroup publicInterface + * @{ + */ + +/** @name Plan Interface + * @{ + */ + + +/** @brief Create a CUDPP plan + * + * A plan is a data structure containing state and intermediate storage space + * that CUDPP uses to execute algorithms on data. A plan is created by + * passing to cudppPlan() a CUDPPConfiguration that specifies the algorithm, + * operator, datatype, and options. The size of the data must also be passed + * to cudppPlan(), in the \a numElements, \a numRows, and \a rowPitch + * arguments. These sizes are used to allocate internal storage space at the + * time the plan is created. The CUDPP planner may use the sizes, options, + * and information about the present hardware to choose optimal settings. + * + * Note that \a numElements is the maximum size of the array to be processed + * with this plan. That means that a plan may be re-used to process (for + * example, to sort or scan) smaller arrays. 
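+ *
+ * A minimal lifecycle sketch for a forward exclusive sum-scan (the rows and
+ * rowPitch arguments of 1 and 0 are assumptions for a flat 1-D array):
+ * \code
+ * CUDPPConfiguration config;
+ * config.algorithm = CUDPP_SCAN;
+ * config.op        = CUDPP_ADD;
+ * config.datatype  = CUDPP_FLOAT;
+ * config.options   = CUDPP_OPTION_FORWARD | CUDPP_OPTION_EXCLUSIVE;
+ *
+ * CUDPPHandle scanPlan;
+ * cudppPlan(&scanPlan, config, maxNumElements, 1, 0);
+ * cudppScan(scanPlan, d_out, d_in, numElements);  // numElements <= maxNumElements
+ * cudppDestroyPlan(scanPlan);
+ * \endcode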
+ * + * @param[out] planHandle A pointer to an opaque handle to the internal plan + * @param[in] config The configuration struct specifying algorithm and options + * @param[in] numElements The maximum number of elements to be processed + * @param[in] numRows The number of rows (for 2D operations) to be processed + * @param[in] rowPitch The pitch of the rows of input data, in elements + */ +CUDPP_DLL +CUDPPResult cudppPlan(CUDPPHandle *planHandle, + CUDPPConfiguration config, + size_t numElements, + size_t numRows, + size_t rowPitch) +{ + CUDPPResult result = CUDPP_SUCCESS; + + CUDPPPlan *plan; + + result = validateOptions(config, numElements, numRows, rowPitch); + if (result != CUDPP_SUCCESS) + { + *planHandle = CUDPP_INVALID_HANDLE; + return result; + } + + switch (config.algorithm) + { + case CUDPP_SCAN: + { + plan = new CUDPPScanPlan(config, numElements, numRows, rowPitch); + break; + } +// case CUDPP_COMPACT: +// { +// plan = new CUDPPCompactPlan(config, numElements, numRows, rowPitch); +// break; +// } + case CUDPP_SORT_RADIX: + //case CUDPP_SORT_RADIX_GLOBAL: + { + plan = new CUDPPRadixSortPlan(config, numElements); + break; + } +/* case CUDPP_SEGMENTED_SCAN: + { + plan = new CUDPPSegmentedScanPlan(config, numElements); + break; + } + //new rand plan + case CUDPP_RAND_MD5: + { + plan = new CUDPPRandPlan(config, numElements); + break; + } + case CUDPP_REDUCE:*/ + default: + //! @todo: implement cudppReduce() + return CUDPP_ERROR_ILLEGAL_CONFIGURATION; + break; + } + + *planHandle = CUDPPPlanManager::AddPlan(plan); + if (CUDPP_INVALID_HANDLE == *planHandle) + return CUDPP_ERROR_UNKNOWN; + else + return CUDPP_SUCCESS; +} + +/** @brief Destroy a CUDPP Plan + * + * Deletes the plan referred to by \a planHandle and all associated internal + * storage. + * + * @param[in] planHandle The CUDPPHandle to the plan to be destroyed + */ +CUDPP_DLL +CUDPPResult cudppDestroyPlan(CUDPPHandle planHandle) +{ + if (CUDPPPlanManager::RemovePlan(planHandle) == false) + return CUDPP_ERROR_INVALID_HANDLE; + else + return CUDPP_SUCCESS; +} + +/** @brief Create a CUDPP Sparse Matrix Object + * + * The sparse matrix plan is a data structure containing state and intermediate storage space + * that CUDPP uses to perform sparse matrix dense vector multiply. This plan is created by + * passing to CUDPPSparseMatrixVectorMultiplyPlan() a CUDPPConfiguration that specifies the + * algorithm (sprarse matrix-dense vector multiply) and datatype, along with the sparse matrix + * itself in CSR format. The number of non-zero elements in the sparse matrix must also be passed + * as \a numNonZeroElements. This is used to allocate internal storage space at the time the + * sparse matrix plan is created. 
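// --- Illustrative usage sketch (editorial addition, not part of the original patch) ---
// How a caller might create and destroy a plan through cudppPlan()/cudppDestroyPlan()
// above. The configuration mirrors the scanConfig initializers used later in this file;
// the element count of 1048576 is an assumption made only for this example.
#include "cudpp.h"

void examplePlanLifetime()
{
    CUDPPConfiguration config = { CUDPP_SCAN, CUDPP_ADD, CUDPP_UINT,
                                  CUDPP_OPTION_FORWARD | CUDPP_OPTION_EXCLUSIVE };

    CUDPPHandle plan = CUDPP_INVALID_HANDLE;
    // numElements is the maximum problem size; the same plan may be reused for
    // smaller arrays, as noted in the cudppPlan() documentation above.
    if (cudppPlan(&plan, config, 1048576, 1, 0) == CUDPP_SUCCESS)
    {
        // ... execute scans with this plan ...
        cudppDestroyPlan(plan);
    }
}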
+ * + * @param[out] sparseMatrixHandle A pointer to an opaque handle to the sparse matrix object + * @param[in] config The configuration struct specifying algorithm and options + * @param[in] numNonZeroElements The number of non zero elements in the sparse matrix + * @param[in] numRows This is the number of rows in y, x and A for y = A * x + * @param[in] A The matrix data + * @param[in] h_rowIndices An array containing the index of the start of each row in \a A + * @param[in] h_indices An array containing the index of each nonzero element in \a A + +CUDPP_DLL +CUDPPResult cudppSparseMatrix(CUDPPHandle *sparseMatrixHandle, + CUDPPConfiguration config, + size_t numNonZeroElements, + size_t numRows, + const void *A, + const unsigned int *h_rowIndices, + const unsigned int *h_indices) +{ + CUDPPResult result = CUDPP_SUCCESS; + + CUDPPPlan *sparseMatrix; + + if ((config.algorithm != CUDPP_SPMVMULT) || + (numNonZeroElements <= 0) || (numRows <= 0)) + { + result = CUDPP_ERROR_ILLEGAL_CONFIGURATION; + } + + if (result != CUDPP_SUCCESS) + { + *sparseMatrixHandle = CUDPP_INVALID_HANDLE; + return result; + } + + sparseMatrix = + new CUDPPSparseMatrixVectorMultiplyPlan(config, numNonZeroElements, A, + h_rowIndices, h_indices, numRows); + + *sparseMatrixHandle = CUDPPPlanManager::AddPlan(sparseMatrix); + if (CUDPP_INVALID_HANDLE == *sparseMatrixHandle) + return CUDPP_ERROR_UNKNOWN; + else + return CUDPP_SUCCESS; +} +*/ +/** @brief Destroy a CUDPP Sparse Matrix Object + * + * Deletes the sparse matrix data and plan referred to by \a sparseMatrixHandle + * and all associated internal storage. + * + * @param[in] sparseMatrixHandle The CUDPPHandle to the matrix object to be destroyed + +CUDPP_DLL +CUDPPResult cudppDestroySparseMatrix(CUDPPHandle sparseMatrixHandle) +{ + return cudppDestroyPlan(sparseMatrixHandle); +} +*/ +/** @} */ // end Plan Interface +/** @} */ // end publicInterface + + +/** @brief Plan base class constructor + * + * @param[in] config The configuration struct specifying algorithm and options + * @param[in] numElements The maximum number of elements to be processed + * @param[in] numRows The number of rows (for 2D operations) to be processed + * @param[in] rowPitch The pitch of the rows of input data, in elements + */ +CUDPPPlan::CUDPPPlan(CUDPPConfiguration config, + size_t numElements, + size_t numRows, + size_t rowPitch) +: m_config(config), + m_numElements(numElements), + m_numRows(numRows), + m_rowPitch(rowPitch) +{ +} + +/** @brief Scan Plan constructor +* +* @param[in] config The configuration struct specifying algorithm and options +* @param[in] numElements The maximum number of elements to be scanned +* @param[in] numRows The maximum number of rows (for 2D operations) to be scanned +* @param[in] rowPitch The pitch of the rows of input data, in elements +*/ +CUDPPScanPlan::CUDPPScanPlan(CUDPPConfiguration config, + size_t numElements, + size_t numRows, + size_t rowPitch) +: CUDPPPlan(config, numElements, numRows, rowPitch), + m_blockSums(0), + m_rowPitches(0), + m_numEltsAllocated(0), + m_numRowsAllocated(0), + m_numLevelsAllocated(0) +{ + allocScanStorage(this); +} + +/** @brief CUDPP scan plan destructor */ +CUDPPScanPlan::~CUDPPScanPlan() +{ + freeScanStorage(this); +} + +/** @brief SegmentedScan Plan constructor +* +* @param[in] config The configuration struct specifying options +* @param[in] numElements The maximum number of elements to be scanned + +CUDPPSegmentedScanPlan::CUDPPSegmentedScanPlan(CUDPPConfiguration config, + size_t numElements) +: CUDPPPlan(config, 
numElements, 1, 0), + m_blockSums(0), + m_blockFlags(0), + m_blockIndices(0), + m_numEltsAllocated(0), + m_numLevelsAllocated(0) +{ + allocSegmentedScanStorage(this); +} +*/ +/** @brief SegmentedScan plan destructor +CUDPPSegmentedScanPlan::~CUDPPSegmentedScanPlan() +{ + freeSegmentedScanStorage(this); +} +*/ +/** @brief Compact Plan constructor +* +* @param[in] config The configuration struct specifying options +* @param[in] numElements The maximum number of elements to be compacted +* @param[in] numRows The number of rows (for 2D operations) to be compacted +* @param[in] rowPitch The pitch of the rows of input data, in elements + +CUDPPCompactPlan::CUDPPCompactPlan(CUDPPConfiguration config, + size_t numElements, + size_t numRows, + size_t rowPitch) +: CUDPPPlan(config, numElements, numRows, rowPitch), + m_d_outputIndices(0) +{ + assert(numRows == 1); //!< @todo Add support for multirow compaction + + CUDPPConfiguration scanConfig = + { + CUDPP_SCAN, + CUDPP_ADD, + CUDPP_UINT, + (config.options & CUDPP_OPTION_BACKWARD) ? + CUDPP_OPTION_BACKWARD | CUDPP_OPTION_EXCLUSIVE : + CUDPP_OPTION_FORWARD | CUDPP_OPTION_EXCLUSIVE + }; + m_scanPlan = new CUDPPScanPlan(scanConfig, numElements, numRows, rowPitch); + + allocCompactStorage(this); +} +*/ +/** @brief Compact plan destructor +CUDPPCompactPlan::~CUDPPCompactPlan() +{ + delete m_scanPlan; + freeCompactStorage(this); +} +*/ +/** @brief Sort Plan constructor +* +* @param[in] config The configuration struct specifying algorithm and options +* @param[in] numElements The maximum number of elements to be sorted +*/ +/*CUDPPSortPlan::CUDPPSortPlan(CUDPPConfiguration config, size_t numElements) +: CUDPPPlan(config, numElements, 1, 0), + m_scanPlan(0), + m_d_temp(0), + m_d_tempAddress(0) +{ + CUDPPConfiguration scanConfig = + { + CUDPP_SCAN, + CUDPP_ADD, + CUDPP_UINT, + CUDPP_OPTION_FORWARD | CUDPP_OPTION_EXCLUSIVE + }; + + //if (config.algorithm == CUDPP_SORT_RADIX_GLOBAL) + { + m_scanPlan = new CUDPPScanPlan(scanConfig, numElements, 1, 0); + } + + allocSortStorage(this); +}*/ + +/** @brief Sort plan destructor */ +/*CUDPPSortPlan::~CUDPPSortPlan() +{ + delete m_scanPlan; + freeSortStorage(this); +}*/ + +CUDPPRadixSortPlan::CUDPPRadixSortPlan(CUDPPConfiguration config, size_t numElements) +: CUDPPPlan(config, numElements, 1, 0), + m_scanPlan(0), + m_tempKeys(0), + m_tempValues(0), + m_counters(0), + m_countersSum(0), + m_blockOffsets(0) +{ + size_t numBlocks2 = ((numElements % (SORT_CTA_SIZE * 2)) == 0) ? 
+ (numElements / (SORT_CTA_SIZE * 2)) : (numElements / (SORT_CTA_SIZE * 2) + 1); + + CUDPPConfiguration scanConfig = + { + CUDPP_SCAN, + CUDPP_ADD, + CUDPP_UINT, + CUDPP_OPTION_FORWARD | CUDPP_OPTION_EXCLUSIVE + }; + + if(m_config.options == CUDPP_OPTION_KEYS_ONLY) + m_bKeysOnly = true; + else + m_bKeysOnly = false; + + m_scanPlan = new CUDPPScanPlan(scanConfig, numBlocks2*16, 1, 0); + + allocRadixSortStorage(this); +} + +CUDPPRadixSortPlan::~CUDPPRadixSortPlan() +{ + delete m_scanPlan; + freeRadixSortStorage(this); +} + +/** @brief SparseMatrixVectorMultiply Plan constructor +* +* @param[in] config The configuration struct specifying options +* @param[in] numNonZeroElements The number of non-zero elements in sparse matrix +* @param[in] A Array of non-zero matrix elements +* @param[in] rowIndex Array of indices of the first element of each row +* in the "flattened" version of the sparse matrix +* @param[in] index Array of indices of non-zero elements in the matrix +* @param[in] numRows The number of rows in the sparse matrix + +CUDPPSparseMatrixVectorMultiplyPlan::CUDPPSparseMatrixVectorMultiplyPlan( + CUDPPConfiguration config, + size_t numNonZeroElements, + const void *A, + const unsigned int *rowIndex, + const unsigned int *index, + size_t numRows + ) +: CUDPPPlan(config, numNonZeroElements, 1, 0), + m_segmentedScanPlan(0), + m_d_prod(0), + m_d_flags(0), + m_d_rowFinalIndex(0), + m_rowFinalIndex(0), + m_numRows(numRows), + m_numNonZeroElements(numNonZeroElements) +{ + CUDPPConfiguration segScanConfig = + { + CUDPP_SEGMENTED_SCAN, + CUDPP_ADD, + config.datatype, + (CUDPP_OPTION_FORWARD | CUDPP_OPTION_INCLUSIVE) + }; + m_segmentedScanPlan = new CUDPPSegmentedScanPlan(segScanConfig, m_numNonZeroElements); + + // Generate an array of the indices of the last element of each row + // in the "flattened" version of the sparse matrix + m_rowFinalIndex = new unsigned int [m_numRows]; + for (unsigned int i=0; i < m_numRows; ++i) + { + if (i < m_numRows-1) + m_rowFinalIndex[i] = rowIndex[i+1]; + else + m_rowFinalIndex[i] = (unsigned int)numNonZeroElements; + } + + allocSparseMatrixVectorMultiplyStorage(this, A, rowIndex, index); +} +*/ +/** @brief Sparse matrix-vector plan destructor +CUDPPSparseMatrixVectorMultiplyPlan::~CUDPPSparseMatrixVectorMultiplyPlan() +{ + freeSparseMatrixVectorMultiplyStorage(this); + delete m_segmentedScanPlan; + delete [] m_rowFinalIndex; +} +*/ +/** @brief CUDPP Rand Plan Constructor + * @param[in] config The configuration struct specifying options + * @param[in] num_elements The number of elements to generate random bits for + +CUDPPRandPlan::CUDPPRandPlan(CUDPPConfiguration config, size_t num_elements) + : CUDPPPlan(config, num_elements, 1, 0), + m_seed(0) +{ + +} +*/ + diff --git a/lib/gpu/cudpp_mini/cudpp_plan.h b/lib/gpu/cudpp_mini/cudpp_plan.h new file mode 100644 index 0000000000..4e4b3bafb7 --- /dev/null +++ b/lib/gpu/cudpp_mini/cudpp_plan.h @@ -0,0 +1,158 @@ +// ------------------------------------------------------------- +// CUDPP -- CUDA Data Parallel Primitives library +// ------------------------------------------------------------- +// $Revision: 3572$ +// $Date$ +// ------------------------------------------------------------- +// This source code is distributed under the terms of license.txt +// in the root directory of this source distribution. 
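// --- Illustrative worked example (editorial addition, not part of the original patch) ---
// The CUDPPRadixSortPlan constructor above sizes its helper scan from the number of sort
// blocks: each block handles SORT_CTA_SIZE * 2 = 512 keys, and the scan runs over 16
// per-block radix counters (numBlocks2 * 16). For an assumed numElements of 100000:
//   numBlocks2 = ceil(100000 / 512) = 196 blocks
//   scan size  = numBlocks2 * 16    = 3136 counters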
+// ------------------------------------------------------------- +#ifndef __CUDPP_PLAN_H__ +#define __CUDPP_PLAN_H__ + +typedef void* KernelPointer; + +extern "C" size_t getNumCTAs(KernelPointer kernel); +extern "C" void compNumCTAs(KernelPointer kernel, size_t bytesDynamicSharedMem, size_t threadsPerBlock); + +template +size_t numCTAs(T kernel) +{ + return getNumCTAs((KernelPointer)kernel); +} + +template +void computeNumCTAs(T kernel, unsigned int bytesDynamicSharedMem, size_t threadsPerBlock) +{ + compNumCTAs((KernelPointer)kernel, bytesDynamicSharedMem, threadsPerBlock); +} + +/** @brief Base class for CUDPP Plan data structures + * + * CUDPPPlan and its subclasses provide the internal (i.e. not visible to the + * library user) infrastructure for planning algorithm execution. They + * own intermediate storage for CUDPP algorithms as well as, in some cases, + * information about optimal execution configuration for the present hardware. + * + */ +class CUDPPPlan +{ +public: + CUDPPPlan(CUDPPConfiguration config, size_t numElements, size_t numRows, size_t rowPitch); + virtual ~CUDPPPlan() {} + + // Note anything passed to functions compiled by NVCC must be public + CUDPPConfiguration m_config; //!< @internal Options structure + size_t m_numElements; //!< @internal Maximum number of input elements + size_t m_numRows; //!< @internal Maximum number of input rows + size_t m_rowPitch; //!< @internal Pitch of input rows in elements +}; + +/** @brief Plan class for scan algorithm + * + */ +class CUDPPScanPlan : public CUDPPPlan +{ +public: + CUDPPScanPlan(CUDPPConfiguration config, size_t numElements, size_t numRows, size_t rowPitch); + virtual ~CUDPPScanPlan(); + + void **m_blockSums; //!< @internal Intermediate block sums array + size_t *m_rowPitches; //!< @internal Pitch of each row in elements (for cudppMultiScan()) + size_t m_numEltsAllocated; //!< @internal Number of elements allocated (maximum scan size) + size_t m_numRowsAllocated; //!< @internal Number of rows allocated (for cudppMultiScan()) + size_t m_numLevelsAllocated; //!< @internal Number of levels allocaed (in _scanBlockSums) +}; + +/** @brief Plan class for segmented scan algorithm +* +*/ +class CUDPPSegmentedScanPlan : public CUDPPPlan +{ +public: + CUDPPSegmentedScanPlan(CUDPPConfiguration config, size_t numElements); + virtual ~CUDPPSegmentedScanPlan(); + + void **m_blockSums; //!< @internal Intermediate block sums array + unsigned int **m_blockFlags; //!< @internal Intermediate block flags array + unsigned int **m_blockIndices; //!< @internal Intermediate block indices array + size_t m_numEltsAllocated; //!< @internal Number of elements allocated (maximum scan size) + size_t m_numLevelsAllocated; //!< @internal Number of levels allocaed (in _scanBlockSums) +}; + +/** @brief Plan class for compact algorithm +* +*/ +class CUDPPCompactPlan : public CUDPPPlan +{ +public: + CUDPPCompactPlan(CUDPPConfiguration config, size_t numElements, size_t numRows, size_t rowPitch); + virtual ~CUDPPCompactPlan(); + + CUDPPScanPlan *m_scanPlan; //!< @internal Compact performs a scan of type unsigned int using this plan + unsigned int* m_d_outputIndices; //!< @internal Output address of compacted elements; this is the result of scan + +}; + +class CUDPPRadixSortPlan : public CUDPPPlan +{ +public: + CUDPPRadixSortPlan(CUDPPConfiguration config, size_t numElements); + virtual ~CUDPPRadixSortPlan(); + + bool m_bKeysOnly; + bool m_bManualCoalesce; + bool m_bUsePersistentCTAs; + unsigned int m_persistentCTAThreshold[2]; + unsigned int 
m_persistentCTAThresholdFullBlocks[2]; + CUDPPScanPlan *m_scanPlan; //!< @internal Sort performs a scan of type unsigned int using this plan + unsigned int m_keyBits; + mutable void *m_tempKeys; //!< @internal Intermediate storage for keys + mutable void *m_tempValues; //!< @internal Intermediate storage for values + unsigned int *m_counters; //!< @internal Counter for each radix + unsigned int *m_countersSum; //!< @internal Prefix sum of radix counters + unsigned int *m_blockOffsets; //!< @internal Global offsets of each radix in each block + +}; + +/** @brief Plan class for sparse-matrix dense-vector multiply +* +*/ +class CUDPPSparseMatrixVectorMultiplyPlan : public CUDPPPlan +{ +public: + CUDPPSparseMatrixVectorMultiplyPlan(CUDPPConfiguration config, size_t numNZElts, + const void *A, + const unsigned int *rowindx, + const unsigned int *indx, size_t numRows); + virtual ~CUDPPSparseMatrixVectorMultiplyPlan(); + + CUDPPSegmentedScanPlan *m_segmentedScanPlan; //!< @internal Performs a segmented scan of type T using this plan + void *m_d_prod; //!< @internal Vector of products (of an element in A and its corresponding (thats is + //! belongs to the same row) element in x; this is the input and output of + //! segmented scan + unsigned int *m_d_flags; //!< @internal Vector of flags where a flag is set if an element of A is the first element + //! of its row; this is the flags vector for segmented scan + unsigned int *m_d_rowFinalIndex; //!< @internal Vector of row end indices, which for each row specifies an index in A + //! which is the last element of that row. Resides in GPU memory. + unsigned int *m_d_rowIndex; //!< @internal Vector of row end indices, which for each row specifies an index in A + //! which is the first element of that row. Resides in GPU memory. + unsigned int *m_d_index; //!<@internal Vector of column numbers one for each element in A + void *m_d_A; //!<@internal The A matrix + unsigned int *m_rowFinalIndex; //!< @internal Vector of row end indices, which for each row specifies an index in A + //! which is the last element of that row. Resides in CPU memory. 
+ size_t m_numRows; //!< Number of rows + size_t m_numNonZeroElements; //!::iterator it; + + for (it = m_instance->plans.begin(); it != m_instance->plans.end(); it++) + { + CUDPPPlan* plan = it->second; + delete plan; + plan = NULL; + } + m_instance->plans.clear(); + + m_instance->numCTAsTable.clear(); +} + +/** @brief Add a plan to the plan manager +* +* @returns a valid CUDPPHandle if the plan was successfully added, or +* CUDPP_INVALID_HANDLE otherwise +* @param[in] plan The plan to add +*/ +CUDPPHandle CUDPPPlanManager::AddPlan(CUDPPPlan* plan) +{ + Instantiate(); + + std::pair::iterator, bool> ret; + + CUDPPHandle handle = (CUDPPHandle)m_instance->plans.size(); + ret = m_instance->plans.insert(std::pair(handle, plan)); + if (ret.second == true) + return handle; + else + return CUDPP_INVALID_HANDLE; +} + +/** @brief Remove a plan from the plan manager +* +* @returns true if the plan was successfully removed, false otherwise +* @param[in] handle The handle to the plan to remove +*/ +bool CUDPPPlanManager::RemovePlan(CUDPPHandle handle) +{ + if (m_instance == NULL) + { + return false; + } + + std::map::iterator it; + it = m_instance->plans.find(handle); + + if (it != m_instance->plans.end()) + { + CUDPPPlan* plan = it->second; + delete plan; + plan = NULL; + m_instance->plans.erase(it); + + if (0 == m_instance->plans.size()) + { + Destroy(); + } + + return true; + } + else + { + return false; + } +} + +/** @brief Get a plan from the plan manager by handle +* +* @returns A pointer to the plan if found, or NULL otherwise +* @param handle The handle to the requested plan +*/ +CUDPPPlan* CUDPPPlanManager::GetPlan(CUDPPHandle handle) +{ + if (m_instance == NULL) + { + return NULL; + } + + std::map::iterator it; + it = m_instance->plans.find(handle); + if (it != m_instance->plans.end()) + { + return it->second; + } + else + { + return NULL; + } +} + +size_t CUDPPPlanManager::numCTAs(KernelPointer kernel) +{ + if (m_instance == NULL) + { + return 0; + } + + return m_instance->numCTAsTable[kernel]; +} + +void CUDPPPlanManager::computeNumCTAs(KernelPointer kernel, size_t bytesDynamicSharedMem, size_t threadsPerBlock) +{ + Instantiate(); + + m_instance->numCTAsTable[kernel] = maxBlocks(kernel, bytesDynamicSharedMem, threadsPerBlock); +} diff --git a/lib/gpu/cudpp_mini/cudpp_plan_manager.h b/lib/gpu/cudpp_mini/cudpp_plan_manager.h new file mode 100644 index 0000000000..fcf33a43ec --- /dev/null +++ b/lib/gpu/cudpp_mini/cudpp_plan_manager.h @@ -0,0 +1,56 @@ +// ------------------------------------------------------------- +// cuDPP -- CUDA Data Parallel Primitives library +// ------------------------------------------------------------- +// $Revision: 3572$ +// $Date$ +// ------------------------------------------------------------- +// This source code is distributed under the terms of license.txt +// in the root directory of this source distribution. +// ------------------------------------------------------------- +#ifndef __CUDPP_PLAN_MANAGER_H__ +#define __CUDPP_PLAN_MANAGER_H__ + +#include + +class CUDPPPlan; +typedef void* KernelPointer; + +/** @brief Singleton manager class for CUDPPPlan objects + * + * This class manages all active plans in CUDPP. It is a singleton class, + * meaning that only one instance can exist. It is created automatically the + * first time AddPlan() is called, and destroyed when the last plan is removed + * using RemovePlan(). 
+ */ +class CUDPPPlanManager +{ +public: + static CUDPPHandle AddPlan(CUDPPPlan* plan); + static bool RemovePlan(CUDPPHandle handle); + static CUDPPPlan* GetPlan(CUDPPHandle handle); + + static size_t numCTAs(KernelPointer kernel); + static void computeNumCTAs(KernelPointer kernel, + size_t bytesDynamicSharedMem, + size_t threadsPerBlock); + +protected: + static CUDPPPlanManager* m_instance; + std::map plans; + std::map numCTAsTable; + +private: + + + //! @internal Instantiate the plan manager singleton object + static void Instantiate(); + //! @internal Destroy the plan manager singleton object + static void Destroy(); + +private: + CUDPPPlanManager() {} + CUDPPPlanManager(const CUDPPPlanManager&) {} + ~CUDPPPlanManager(); +}; + +#endif // __CUDPP_PLAN_MANAGER_H__ diff --git a/lib/gpu/cudpp_mini/cudpp_radixsort.h b/lib/gpu/cudpp_mini/cudpp_radixsort.h new file mode 100644 index 0000000000..eee009cedb --- /dev/null +++ b/lib/gpu/cudpp_mini/cudpp_radixsort.h @@ -0,0 +1,34 @@ +// ------------------------------------------------------------- +// cuDPP -- CUDA Data Parallel Primitives library +// ------------------------------------------------------------- +// $Revision$ +// $Date$ +// ------------------------------------------------------------- +// This source code is distributed under the terms of license.txt +// in the root directory of this source distribution. +// ------------------------------------------------------------- +#ifndef __RADIXSORT_H__ +#define __RADIXSORT_H__ + +#define SORT_CTA_SIZE 256 //This CTA_SIZE must equal 16 * number of radices + +#include "cudpp_globals.h" +#include "cudpp.h" +#include "cudpp_plan.h" + + +extern "C" +void allocRadixSortStorage(CUDPPRadixSortPlan* plan); + +extern "C" +void freeRadixSortStorage(CUDPPRadixSortPlan* plan); + +extern "C" +void cudppRadixSortDispatch(void *keys, + void *values, + size_t numElements, + int keyBits, + const CUDPPRadixSortPlan *plan); + + +#endif // __RADIXSORT_H__ diff --git a/lib/gpu/cudpp_mini/cudpp_scan.h b/lib/gpu/cudpp_mini/cudpp_scan.h new file mode 100644 index 0000000000..6b55d80f70 --- /dev/null +++ b/lib/gpu/cudpp_mini/cudpp_scan.h @@ -0,0 +1,36 @@ +// ------------------------------------------------------------- +// cuDPP -- CUDA Data Parallel Primitives library +// ------------------------------------------------------------- +// $Revision$ +// $Date$ +// ------------------------------------------------------------- +// This source code is distributed under the terms of license.txt +// in the root directory of this source distribution. 
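// --- Illustrative sketch (editorial addition, not part of the original patch) ---
// The CUDPPPlanManager singleton above hands out integer handles that key a map of plan
// pointers. Below is a minimal, self-contained illustration of that handle-to-plan
// pattern; all names here (ExamplePlan, ExampleRegistry, ...) are invented for the
// example and are not CUDPP identifiers.
#include <cstddef>
#include <map>

struct ExamplePlan { std::size_t maxElements; };
typedef std::size_t ExampleHandle;

class ExampleRegistry
{
public:
    ExampleHandle add(ExamplePlan *p)
    {
        ExampleHandle h = m_plans.size();   // handle is simply the insertion index
        m_plans[h] = p;
        return h;
    }
    ExamplePlan *get(ExampleHandle h) const
    {
        std::map<ExampleHandle, ExamplePlan *>::const_iterator it = m_plans.find(h);
        return (it == m_plans.end()) ? 0 : it->second;
    }
    bool remove(ExampleHandle h)
    {
        std::map<ExampleHandle, ExamplePlan *>::iterator it = m_plans.find(h);
        if (it == m_plans.end()) return false;
        delete it->second;                  // the registry owns the plan, as in CUDPP
        m_plans.erase(it);
        return true;
    }
private:
    std::map<ExampleHandle, ExamplePlan *> m_plans;
};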
+// ------------------------------------------------------------- + +/** +* @file +* cudpp_scan.h +* +* @brief Scan functionality header file - contains CUDPP interface (not public) +*/ + +#ifndef _CUDPP_SCAN_H_ +#define _CUDPP_SCAN_H_ + +class CUDPPScanPlan; + +extern "C" +void allocScanStorage(CUDPPScanPlan *plan); + +extern "C" +void freeScanStorage(CUDPPScanPlan *plan); + +extern "C" +void cudppScanDispatch(void *d_out, + const void *d_in, + size_t numElements, + size_t numRows, + const CUDPPScanPlan *plan); + +#endif // _CUDPP_SCAN_H_ diff --git a/lib/gpu/cudpp_mini/cudpp_util.h b/lib/gpu/cudpp_mini/cudpp_util.h new file mode 100644 index 0000000000..8815b5bf5f --- /dev/null +++ b/lib/gpu/cudpp_mini/cudpp_util.h @@ -0,0 +1,363 @@ +// ------------------------------------------------------------- +// cuDPP -- CUDA Data Parallel Primitives library +// ------------------------------------------------------------- +// $Revision$ +// $Date$ +// ------------------------------------------------------------- +// This source code is distributed under the terms of license.txt in +// the root directory of this source distribution. +// ------------------------------------------------------------- + +/** + * @file + * cudpp_util.h + * + * @brief C++ utility functions and classes used internally to cuDPP + */ + +#ifndef __CUDPP_UTIL_H__ +#define __CUDPP_UTIL_H__ + +#ifdef WIN32 +#include +#endif + +#include +#include +#include +#include + +#if (CUDA_VERSION >= 3000) +#define LAUNCH_BOUNDS(x) __launch_bounds__((x)) +#define LAUNCH_BOUNDS_MINBLOCKs(x, y) __launch_bounds__((x),(y)) +#else +#define LAUNCH_BOUNDS(x) +#define LAUNCH_BOUNDS_MINBLOCKS(x, y) +#endif + + +/** @brief Determine if \a n is a power of two. + * @param n Value to be checked to see if it is a power of two + * @returns True if \a n is a power of two, false otherwise + */ +inline bool +isPowerOfTwo(int n) +{ + return ((n&(n-1))==0) ; +} + +/** @brief Determine if an integer \a n is a multiple of an integer \a f. + * @param n Multiple + * @param f Factor + * @returns True if \a n is a multiple of \a f, false otherwise + */ +inline bool +isMultiple(int n, int f) +{ + if (isPowerOfTwo(f)) + return ((n&(f-1))==0); + else + return (n%f==0); +} + +/** @brief Compute the smallest power of two larger than \a n. + * @param n Input value + * @returns The smallest power f two larger than \a n + */ +inline int +ceilPow2(int n) +{ + double log2n = log2((double)n); + if (isPowerOfTwo(n)) + return n; + else + return 1 << (int)ceil(log2n); +} + +/** @brief Compute the largest power of two smaller than \a n. + * @param n Input value + * @returns The largest power of two smaller than \a n. + */ +inline int +floorPow2(int n) +{ +#ifdef WIN32 + // method 2 + return 1 << (int)_logb((float)n); +#else + // method 3 + int exp; + frexp((float)n, &exp); + return 1 << (exp - 1); +#endif +} + +/** @brief Returns the maximum value for type \a T. + * + * Implemented using template specialization on \a T. + */ +template +__host__ __device__ inline T getMax() { return 0; } +/** @brief Returns the minimum value for type \a T. +* +* Implemented using template specialization on \a T. 
+*/
+template <typename T>
+__host__ __device__ inline T getMin() { return 0; }
+// type specializations for the above
+// getMax
+template <> __host__ __device__ inline int getMax<int>() { return INT_MAX; }
+template <> __host__ __device__ inline unsigned int getMax<unsigned int>() { return INT_MAX; }
+template <> __host__ __device__ inline float getMax<float>() { return FLT_MAX; }
+template <> __host__ __device__ inline char getMax<char>() { return (char)INT_MAX; }
+template <> __host__ __device__ inline unsigned char getMax<unsigned char>() { return (unsigned char)INT_MAX; }
+// getMin
+template <> __host__ __device__ inline int getMin<int>() { return INT_MIN; }
+template <> __host__ __device__ inline unsigned int getMin<unsigned int>() { return 0; }
+template <> __host__ __device__ inline float getMin<float>() { return -FLT_MAX; }
+template <> __host__ __device__ inline char getMin<char>() { return (char)INT_MIN; }
+template <> __host__ __device__ inline unsigned char getMin<unsigned char>() { return (unsigned char)0; }
+
+/** @brief Returns the maximum of three values.
+  * @param a First value.
+  * @param b Second value.
+  * @param c Third value.
+  * @returns The maximum of \a a, \a b and \a c.
+  */
+template <typename T>
+inline int max3(T a, T b, T c)
+{
+    return (a > b) ? ((a > c)? a : c) : ((b > c) ? b : c);
+}
+
+/** @brief Utility template struct for generating small vector types from scalar types
+  *
+  * Given a base scalar type (\c int, \c float, etc.) and a vector length (1 through 4) as
+  * template parameters, this struct defines a vector type (\c float3, \c int4, etc.) of the
+  * specified length and base type.  For example:
+  * \code
+  * template <typename T>
+  * __device__ void myKernel(T *data)
+  * {
+  *     typeToVector<T, 4>::Result myVec4;             // create a vec4 of type T
+  *     myVec4 = (typeToVector<T, 4>::Result*)data[0]; // load first element of data as a vec4
+  * }
+  * \endcode
+  *
+  * This functionality is implemented using template specialization.  Currently specializations
+  * for int, float, and unsigned int vectors of lengths 2-4 are defined.  Note that this results
+  * in types being generated at compile time -- there is no runtime cost. typeToVector is used by
+  * the optimized scan \c __device__ functions in scan_cta.cu.
+  */
+template <typename T, int N>
+struct typeToVector
+{
+    typedef T Result;
+};
+
+template<>
+struct typeToVector<int, 4>
+{
+    typedef int4 Result;
+};
+template<>
+struct typeToVector<unsigned int, 4>
+{
+    typedef uint4 Result;
+};
+template<>
+struct typeToVector<float, 4>
+{
+    typedef float4 Result;
+};
+template<>
+struct typeToVector<int, 3>
+{
+    typedef int3 Result;
+};
+template<>
+struct typeToVector<unsigned int, 3>
+{
+    typedef uint3 Result;
+};
+template<>
+struct typeToVector<float, 3>
+{
+    typedef float3 Result;
+};
+template<>
+struct typeToVector<int, 2>
+{
+    typedef int2 Result;
+};
+template<>
+struct typeToVector<unsigned int, 2>
+{
+    typedef uint2 Result;
+};
+template<>
+struct typeToVector<float, 2>
+{
+    typedef float2 Result;
+};
+
+/** @brief Templatized operator class used by scan and segmented scan
+  *
+  * This Operator class is used to allow generic support of binary
+  * associative operators in scan.  It defines two member functions,
+  * op() and identity(), that are used in place of + and 0 (for
+  * example) in the scan and segmented scan code. Because this is
+  * template code, all decisions in the code are made at compile
+  * time, resulting in optimal operator code. Currently the operators
+  * CUDPP_ADD, CUDPP_MULTIPLY, CUDPP_MIN, and CUDPP_MAX are supported.
+  * Operator is implemented using template specialization for the
+  * types \c int, \c unsigned int, and \c float.
+  */
+template <typename T, CUDPPOperator oper>
+class Operator
+{
+public:
+    /** Applies the operator to operands \a a and \a b.
+ * @param a First operand + * @param b Second operand + * @returns a OP b, where OP is defined by ::CUDPPOperator \a oper. + */ + static __device__ T op(const T a, const T b) + { + switch (oper) + { + case CUDPP_ADD: + return a + b; + case CUDPP_MULTIPLY: + return a * b; + case CUDPP_MIN: + return min(a, b); + case CUDPP_MAX: + return max(a, b); + } + } + + /** Returns the identity element defined for type \a T */ + static __device__ T identity() { return 0; } +}; + +// specializations for different types +template +class Operator +{ +public: + static __device__ int op(const int a, const int b) + { + switch (oper) + { + default: + case CUDPP_ADD: + return a + b; + case CUDPP_MULTIPLY: + return a * b; + case CUDPP_MIN: + return min(a, b); + case CUDPP_MAX: + return max(a, b); + } + } + + static __device__ int identity() + { + switch (oper) + { + default: + case CUDPP_ADD: + return 0; + case CUDPP_MULTIPLY: + return 1; + case CUDPP_MIN: + return INT_MAX; + case CUDPP_MAX: + return INT_MIN; + } + } +}; + +template +class Operator +{ +public: + static __device__ unsigned int op(const unsigned int a, const unsigned int b) + { + switch (oper) + { + default: + case CUDPP_ADD: + return a + b; + case CUDPP_MULTIPLY: + return a * b; + case CUDPP_MIN: + return min(a, b); + case CUDPP_MAX: + return max(a, b); + } + } + + static __device__ unsigned int identity() + { + switch (oper) + { + default: + case CUDPP_ADD: + return 0; + case CUDPP_MULTIPLY: + return 1; + case CUDPP_MIN: + return UINT_MAX; + case CUDPP_MAX: + return 0; + } + } +}; + + +template +class Operator +{ +public: + static __device__ float op(const float a, const float b) + { + switch (oper) + { + default: + case CUDPP_ADD: + return a + b; + case CUDPP_MULTIPLY: + return a * b; + case CUDPP_MIN: + return min(a, b); + case CUDPP_MAX: + return max(a, b); + } + } + + static __device__ float identity() + { + switch (oper) + { + default: + case CUDPP_ADD: + return 0.0f; + case CUDPP_MULTIPLY: + return 1.0f; + case CUDPP_MIN: + return FLT_MAX; + case CUDPP_MAX: + return -FLT_MAX; + } + } +}; + +#endif // __CUDPP_UTIL_H__ + +// Leave this at the end of the file +// Local Variables: +// mode:c++ +// c-file-style: "NVIDIA" +// End: diff --git a/lib/gpu/cudpp_mini/cutil.h b/lib/gpu/cudpp_mini/cutil.h new file mode 100644 index 0000000000..390b40615c --- /dev/null +++ b/lib/gpu/cudpp_mini/cutil.h @@ -0,0 +1,879 @@ +/* +* Copyright 1993-2006 NVIDIA Corporation. All rights reserved. +* +* NOTICE TO USER: +* +* This source code is subject to NVIDIA ownership rights under U.S. and +* international Copyright laws. +* +* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE +* CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR +* IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH +* REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF +* MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. +* IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL, +* OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS +* OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE +* OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE +* OR PERFORMANCE OF THIS SOURCE CODE. +* +* U.S. Government End Users. This source code is a "commercial item" as +* that term is defined at 48 C.F.R. 
2.101 (OCT 1995), consisting of +* "commercial computer software" and "commercial computer software +* documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995) +* and is provided to the U.S. Government only as a commercial end item. +* Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through +* 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the +* source code with only those rights set forth herein. +*/ + + +/* CUda UTility Library */ + +#ifndef _CUTIL_H_ +#define _CUTIL_H_ + +#include + +#ifdef _WIN32 +# pragma warning( disable : 4996 ) // disable deprecated warning +#endif + +#ifdef __cplusplus +extern "C" { +#endif + + // helper typedefs for building DLL +#ifdef _WIN32 +# ifdef BUILD_DLL +# define DLL_MAPPING __declspec(dllexport) +# else +# define DLL_MAPPING __declspec(dllimport) +# endif +#else +# define DLL_MAPPING +#endif + +#ifdef _WIN32 + #define CUTIL_API __stdcall +#else + #define CUTIL_API +#endif + + + //////////////////////////////////////////////////////////////////////////// + //! CUT bool type + //////////////////////////////////////////////////////////////////////////// + enum CUTBoolean + { + CUTFalse = 0, + CUTTrue = 1 + }; + + //////////////////////////////////////////////////////////////////////////// + //! Deallocate memory allocated within Cutil + //! @param pointer to memory + //////////////////////////////////////////////////////////////////////////// + DLL_MAPPING + void CUTIL_API + cutFree( void* ptr); + + //////////////////////////////////////////////////////////////////////////// + //! Helper for bank conflict checking (should only be used with the + //! CUT_BANK_CHECKER macro) + //! @param tidx thread id in x dimension of block + //! @param tidy thread id in y dimension of block + //! @param tidz thread id in z dimension of block + //! @param bdimx block size in x dimension + //! @param bdimy block size in y dimension + //! @param bdimz block size in z dimension + //! @param file name of the source file where the access takes place + //! @param line line in the source file where the access takes place + //! @param aname name of the array which is accessed + //! @param index index into the array + //////////////////////////////////////////////////////////////////////////// + DLL_MAPPING + void CUTIL_API + cutCheckBankAccess( unsigned int tidx, unsigned int tidy, unsigned int tidz, + unsigned int bdimx, unsigned int bdimy, + unsigned int bdimz, const char* file, const int line, + const char* aname, const int index); + + //////////////////////////////////////////////////////////////////////////// + //! Find the path for a filename within a hardcoded set of paths + //! @return the path if succeeded, otherwise 0 + //! @param filename name of the file + //! @param executablePath optional absolute path of the executable + //////////////////////////////////////////////////////////////////////////// + DLL_MAPPING + char* CUTIL_API + cutFindFilePath(const char* filename, const char* executablePath); + + //////////////////////////////////////////////////////////////////////////// + //! Find the path for a filename within a specified directory tree + //! @return the path if succeeded, otherwise 0 + //! @param filename name of the file + //! 
@param executablePath optional absolute path of the executable + //////////////////////////////////////////////////////////////////////////// + DLL_MAPPING + CUTBoolean CUTIL_API + cutFindFile(char * outputPath, const char * startDir, const char * dirName); + + //////////////////////////////////////////////////////////////////////////// + //! Find the path for a filename within a specified directory tree + //! @return the path if succeeded, otherwise 0 + //! @param filename name of the file + //! @param executablePath optional absolute path of the executable + //////////////////////////////////////////////////////////////////////////// + DLL_MAPPING + CUTBoolean CUTIL_API + cutFindDir(char * outputPath, const char * startDir, const char * dirName); + + //////////////////////////////////////////////////////////////////////////// + //! Read file \filename containing single precision floating point data + //! @return CUTTrue if reading the file succeeded, otherwise false + //! @param filename name of the source file + //! @param data uninitialized pointer, returned initialized and pointing to + //! the data read + //! @param len number of data elements in data, -1 on error + //! @note If a NULL pointer is passed to this function and it is + //! initialized within Cutil then cutFree() has to be used to + //! deallocate the memory + //////////////////////////////////////////////////////////////////////////// + DLL_MAPPING + CUTBoolean CUTIL_API + cutReadFilef( const char* filename, float** data, unsigned int* len, + bool verbose = false); + + //////////////////////////////////////////////////////////////////////////// + //! Read file \filename containing double precision floating point data + //! @return CUTTrue if reading the file succeeded, otherwise false + //! @param filename name of the source file + //! @param data uninitialized pointer, returned initialized and pointing to + //! the data read + //! @param len number of data elements in data, -1 on error + //! @note If a NULL pointer is passed to this function and it is + //! initialized within Cutil then cutFree() has to be used to + //! deallocate the memory + //////////////////////////////////////////////////////////////////////////// + DLL_MAPPING + CUTBoolean CUTIL_API + cutReadFiled( const char* filename, double** data, unsigned int* len, + bool verbose = false); + + //////////////////////////////////////////////////////////////////////////// + //! Read file \filename containing integer data + //! @return CUTTrue if reading the file succeeded, otherwise false + //! @param filename name of the source file + //! @param data uninitialized pointer, returned initialized and pointing to + //! the data read + //! @param len number of data elements in data, -1 on error + //! @note If a NULL pointer is passed to this function and it is + //! initialized within Cutil then cutFree() has to be used to + //! deallocate the memory + //////////////////////////////////////////////////////////////////////////// + DLL_MAPPING + CUTBoolean CUTIL_API + cutReadFilei( const char* filename, int** data, unsigned int* len, bool verbose = false); + + //////////////////////////////////////////////////////////////////////////// + //! Read file \filename containing unsigned integer data + //! @return CUTTrue if reading the file succeeded, otherwise false + //! @param filename name of the source file + //! @param data uninitialized pointer, returned initialized and pointing to + //! the data read + //! 
@param len number of data elements in data, -1 on error + //! @note If a NULL pointer is passed to this function and it is + //! initialized within Cutil then cutFree() has to be used to + //! deallocate the memory + //////////////////////////////////////////////////////////////////////////// + DLL_MAPPING + CUTBoolean CUTIL_API + cutReadFileui( const char* filename, unsigned int** data, + unsigned int* len, bool verbose = false); + + //////////////////////////////////////////////////////////////////////////// + //! Read file \filename containing char / byte data + //! @return CUTTrue if reading the file succeeded, otherwise false + //! @param filename name of the source file + //! @param data uninitialized pointer, returned initialized and pointing to + //! the data read + //! @param len number of data elements in data, -1 on error + //! @note If a NULL pointer is passed to this function and it is + //! initialized within Cutil then cutFree() has to be used to + //! deallocate the memory + //////////////////////////////////////////////////////////////////////////// + DLL_MAPPING + CUTBoolean CUTIL_API + cutReadFileb( const char* filename, char** data, unsigned int* len, + bool verbose = false); + + //////////////////////////////////////////////////////////////////////////// + //! Read file \filename containing unsigned char / byte data + //! @return CUTTrue if reading the file succeeded, otherwise false + //! @param filename name of the source file + //! @param data uninitialized pointer, returned initialized and pointing to + //! the data read + //! @param len number of data elements in data, -1 on error + //! @note If a NULL pointer is passed to this function and it is + //! initialized within Cutil then cutFree() has to be used to + //! deallocate the memory + //////////////////////////////////////////////////////////////////////////// + DLL_MAPPING + CUTBoolean CUTIL_API + cutReadFileub( const char* filename, unsigned char** data, + unsigned int* len, bool verbose = false); + + //////////////////////////////////////////////////////////////////////////// + //! Write a data file \filename containing single precision floating point + //! data + //! @return CUTTrue if writing the file succeeded, otherwise false + //! @param filename name of the file to write + //! @param data pointer to data to write + //! @param len number of data elements in data, -1 on error + //! @param epsilon epsilon for comparison + //////////////////////////////////////////////////////////////////////////// + DLL_MAPPING + CUTBoolean CUTIL_API + cutWriteFilef( const char* filename, const float* data, unsigned int len, + const float epsilon, bool verbose = false); + + //////////////////////////////////////////////////////////////////////////// + //! Write a data file \filename containing double precision floating point + //! data + //! @return CUTTrue if writing the file succeeded, otherwise false + //! @param filename name of the file to write + //! @param data pointer to data to write + //! @param len number of data elements in data, -1 on error + //! @param epsilon epsilon for comparison + //////////////////////////////////////////////////////////////////////////// + DLL_MAPPING + CUTBoolean CUTIL_API + cutWriteFiled( const char* filename, const float* data, unsigned int len, + const double epsilon, bool verbose = false); + + //////////////////////////////////////////////////////////////////////////// + //! Write a data file \filename containing integer data + //! 
@return CUTTrue if writing the file succeeded, otherwise false + //! @param filename name of the file to write + //! @param data pointer to data to write + //! @param len number of data elements in data, -1 on error + //////////////////////////////////////////////////////////////////////////// + DLL_MAPPING + CUTBoolean CUTIL_API + cutWriteFilei( const char* filename, const int* data, unsigned int len, + bool verbose = false); + + //////////////////////////////////////////////////////////////////////////// + //! Write a data file \filename containing unsigned integer data + //! @return CUTTrue if writing the file succeeded, otherwise false + //! @param filename name of the file to write + //! @param data pointer to data to write + //! @param len number of data elements in data, -1 on error + //////////////////////////////////////////////////////////////////////////// + DLL_MAPPING + CUTBoolean CUTIL_API + cutWriteFileui( const char* filename,const unsigned int* data, + unsigned int len, bool verbose = false); + + //////////////////////////////////////////////////////////////////////////// + //! Write a data file \filename containing char / byte data + //! @return CUTTrue if writing the file succeeded, otherwise false + //! @param filename name of the file to write + //! @param data pointer to data to write + //! @param len number of data elements in data, -1 on error + //////////////////////////////////////////////////////////////////////////// + DLL_MAPPING + CUTBoolean CUTIL_API + cutWriteFileb( const char* filename, const char* data, unsigned int len, + bool verbose = false); + + //////////////////////////////////////////////////////////////////////////// + //! Write a data file \filename containing unsigned char / byte data + //! @return CUTTrue if writing the file succeeded, otherwise false + //! @param filename name of the file to write + //! @param data pointer to data to write + //! @param len number of data elements in data, -1 on error + //////////////////////////////////////////////////////////////////////////// + DLL_MAPPING + CUTBoolean CUTIL_API + cutWriteFileub( const char* filename,const unsigned char* data, + unsigned int len, bool verbose = false); + + //////////////////////////////////////////////////////////////////////////// + //! Load PGM image file (with unsigned char as data element type) + //! @return CUTTrue if reading the file succeeded, otherwise false + //! @param file name of the image file + //! @param data handle to the data read + //! @param w width of the image + //! @param h height of the image + //! @note If a NULL pointer is passed to this function and it is + //! initialized within Cutil then cutFree() has to be used to + //! deallocate the memory + //////////////////////////////////////////////////////////////////////////// + DLL_MAPPING + CUTBoolean CUTIL_API + cutLoadPGMub( const char* file, unsigned char** data, + unsigned int *w,unsigned int *h); + + //////////////////////////////////////////////////////////////////////////// + //! Load PPM image file (with unsigned char as data element type) + //! @return CUTTrue if reading the file succeeded, otherwise false + //! @param file name of the image file + //! @param data handle to the data read + //! @param w width of the image + //! 
@param h height of the image + //////////////////////////////////////////////////////////////////////////// + DLL_MAPPING + CUTBoolean CUTIL_API + cutLoadPPMub( const char* file, unsigned char** data, + unsigned int *w,unsigned int *h); + + //////////////////////////////////////////////////////////////////////////// + //! Load PPM image file (with unsigned char as data element type), padding + //! 4th component + //! @return CUTTrue if reading the file succeeded, otherwise false + //! @param file name of the image file + //! @param data handle to the data read + //! @param w width of the image + //! @param h height of the image + //////////////////////////////////////////////////////////////////////////// + DLL_MAPPING + CUTBoolean CUTIL_API + cutLoadPPM4ub( const char* file, unsigned char** data, + unsigned int *w,unsigned int *h); + + //////////////////////////////////////////////////////////////////////////// + //! Load PGM image file (with unsigned int as data element type) + //! @return CUTTrue if reading the file succeeded, otherwise false + //! @param file name of the image file + //! @param data handle to the data read + //! @param w width of the image + //! @param h height of the image + //! @note If a NULL pointer is passed to this function and it is + //! initialized within Cutil then cutFree() has to be used to + //! deallocate the memory + //////////////////////////////////////////////////////////////////////////// + DLL_MAPPING + CUTBoolean CUTIL_API + cutLoadPGMi( const char* file, unsigned int** data, + unsigned int* w, unsigned int* h); + + //////////////////////////////////////////////////////////////////////////// + //! Load PGM image file (with unsigned short as data element type) + //! @return CUTTrue if reading the file succeeded, otherwise false + //! @param file name of the image file + //! @param data handle to the data read + //! @param w width of the image + //! @param h height of the image + //! @note If a NULL pointer is passed to this function and it is + //! initialized withing Cutil then cutFree() has to be used to + //! deallocate the memory + //////////////////////////////////////////////////////////////////////////// + DLL_MAPPING + CUTBoolean CUTIL_API + cutLoadPGMs( const char* file, unsigned short** data, + unsigned int* w, unsigned int* h); + + //////////////////////////////////////////////////////////////////////////// + //! Load PGM image file (with float as data element type) + //! @param file name of the image file + //! @param data handle to the data read + //! @param w width of the image + //! @param h height of the image + //! @note If a NULL pointer is passed to this function and it is + //! initialized withing Cutil then cutFree() has to be used to + //! deallocate the memory + //////////////////////////////////////////////////////////////////////////// + DLL_MAPPING + CUTBoolean CUTIL_API + cutLoadPGMf( const char* file, float** data, + unsigned int* w, unsigned int* h); + + //////////////////////////////////////////////////////////////////////////// + //! Save PGM image file (with unsigned char as data element type) + //! @param file name of the image file + //! @param data handle to the data read + //! @param w width of the image + //! 
@param h height of the image + //////////////////////////////////////////////////////////////////////////// + DLL_MAPPING + CUTBoolean CUTIL_API + cutSavePGMub( const char* file, unsigned char* data, + unsigned int w, unsigned int h); + + //////////////////////////////////////////////////////////////////////////// + //! Save PPM image file (with unsigned char as data element type) + //! @param file name of the image file + //! @param data handle to the data read + //! @param w width of the image + //! @param h height of the image + //////////////////////////////////////////////////////////////////////////// + DLL_MAPPING + CUTBoolean CUTIL_API + cutSavePPMub( const char* file, unsigned char *data, + unsigned int w, unsigned int h); + + //////////////////////////////////////////////////////////////////////////// + //! Save PPM image file (with unsigned char as data element type, padded to + //! 4 bytes) + //! @param file name of the image file + //! @param data handle to the data read + //! @param w width of the image + //! @param h height of the image + //////////////////////////////////////////////////////////////////////////// + DLL_MAPPING + CUTBoolean CUTIL_API + cutSavePPM4ub( const char* file, unsigned char *data, + unsigned int w, unsigned int h); + + //////////////////////////////////////////////////////////////////////////// + //! Save PGM image file (with unsigned int as data element type) + //! @param file name of the image file + //! @param data handle to the data read + //! @param w width of the image + //! @param h height of the image + //////////////////////////////////////////////////////////////////////////// + DLL_MAPPING + CUTBoolean CUTIL_API + cutSavePGMi( const char* file, unsigned int* data, + unsigned int w, unsigned int h); + + //////////////////////////////////////////////////////////////////////////// + //! Save PGM image file (with unsigned short as data element type) + //! @param file name of the image file + //! @param data handle to the data read + //! @param w width of the image + //! @param h height of the image + //////////////////////////////////////////////////////////////////////////// + DLL_MAPPING + CUTBoolean CUTIL_API + cutSavePGMs( const char* file, unsigned short* data, + unsigned int w, unsigned int h); + + //////////////////////////////////////////////////////////////////////////// + //! Save PGM image file (with float as data element type) + //! @param file name of the image file + //! @param data handle to the data read + //! @param w width of the image + //! @param h height of the image + //////////////////////////////////////////////////////////////////////////// + DLL_MAPPING + CUTBoolean CUTIL_API + cutSavePGMf( const char* file, float* data, + unsigned int w, unsigned int h); + + //////////////////////////////////////////////////////////////////////////// + // Command line arguments: General notes + // * All command line arguments begin with '--' followed by the token; + // token and value are seperated by '='; example --samples=50 + // * Arrays have the form --model=[one.obj,two.obj,three.obj] + // (without whitespaces) + //////////////////////////////////////////////////////////////////////////// + + //////////////////////////////////////////////////////////////////////////// + //! Check if command line argument \a flag-name is given + //! @return CUTTrue if command line argument \a flag_name has been given, + //! otherwise 0 + //! @param argc argc as passed to main() + //! @param argv argv as passed to main() + //! 
@param flag_name name of command line flag + //////////////////////////////////////////////////////////////////////////// + DLL_MAPPING + CUTBoolean CUTIL_API + cutCheckCmdLineFlag( const int argc, const char** argv, + const char* flag_name); + + //////////////////////////////////////////////////////////////////////////// + //! Get the value of a command line argument of type int + //! @return CUTTrue if command line argument \a arg_name has been given and + //! is of the requested type, otherwise CUTFalse + //! @param argc argc as passed to main() + //! @param argv argv as passed to main() + //! @param arg_name name of the command line argument + //! @param val value of the command line argument + //////////////////////////////////////////////////////////////////////////// + DLL_MAPPING + CUTBoolean CUTIL_API + cutGetCmdLineArgumenti( const int argc, const char** argv, + const char* arg_name, int* val); + + //////////////////////////////////////////////////////////////////////////// + //! Get the value of a command line argument of type float + //! @return CUTTrue if command line argument \a arg_name has been given and + //! is of the requested type, otherwise CUTFalse + //! @param argc argc as passed to main() + //! @param argv argv as passed to main() + //! @param arg_name name of the command line argument + //! @param val value of the command line argument + //////////////////////////////////////////////////////////////////////////// + DLL_MAPPING + CUTBoolean CUTIL_API + cutGetCmdLineArgumentf( const int argc, const char** argv, + const char* arg_name, float* val); + + //////////////////////////////////////////////////////////////////////////// + //! Get the value of a command line argument of type string + //! @return CUTTrue if command line argument \a arg_name has been given and + //! is of the requested type, otherwise CUTFalse + //! @param argc argc as passed to main() + //! @param argv argv as passed to main() + //! @param arg_name name of the command line argument + //! @param val value of the command line argument + //////////////////////////////////////////////////////////////////////////// + DLL_MAPPING + CUTBoolean CUTIL_API + cutGetCmdLineArgumentstr( const int argc, const char** argv, + const char* arg_name, char** val); + + //////////////////////////////////////////////////////////////////////////// + //! Get the value of a command line argument list those element are strings + //! @return CUTTrue if command line argument \a arg_name has been given and + //! is of the requested type, otherwise CUTFalse + //! @param argc argc as passed to main() + //! @param argv argv as passed to main() + //! @param arg_name name of the command line argument + //! @param val command line argument list + //! @param len length of the list / number of elements + //////////////////////////////////////////////////////////////////////////// + DLL_MAPPING + CUTBoolean CUTIL_API + cutGetCmdLineArgumentListstr( const int argc, const char** argv, + const char* arg_name, char** val, + unsigned int* len); + + //////////////////////////////////////////////////////////////////////////// + //! Extended assert + //! @return CUTTrue if the condition \a val holds, otherwise CUTFalse + //! @param val condition to test + //! @param file __FILE__ macro + //! @param line __LINE__ macro + //! 
@note This function should be used via the CONDITION(val) macro + //////////////////////////////////////////////////////////////////////////// + DLL_MAPPING + CUTBoolean CUTIL_API + cutCheckCondition( int val, const char* file, const int line); + + //////////////////////////////////////////////////////////////////////////// + //! Compare two float arrays + //! @return CUTTrue if \a reference and \a data are identical, + //! otherwise CUTFalse + //! @param reference handle to the reference data / gold image + //! @param data handle to the computed data + //! @param len number of elements in reference and data + //////////////////////////////////////////////////////////////////////////// + DLL_MAPPING + CUTBoolean CUTIL_API + cutComparef( const float* reference, const float* data, + const unsigned int len); + + //////////////////////////////////////////////////////////////////////////// + //! Compare two integer arrays + //! @return CUTTrue if \a reference and \a data are identical, + //! otherwise CUTFalse + //! @param reference handle to the reference data / gold image + //! @param data handle to the computed data + //! @param len number of elements in reference and data + //////////////////////////////////////////////////////////////////////////// + DLL_MAPPING + CUTBoolean CUTIL_API + cutComparei( const int* reference, const int* data, + const unsigned int len ); + + //////////////////////////////////////////////////////////////////////////// + //! Compare two unsigned char arrays + //! @return CUTTrue if \a reference and \a data are identical, + //! otherwise CUTFalse + //! @param reference handle to the reference data / gold image + //! @param data handle to the computed data + //! @param len number of elements in reference and data + //////////////////////////////////////////////////////////////////////////// + DLL_MAPPING + CUTBoolean CUTIL_API + cutCompareub( const unsigned char* reference, const unsigned char* data, + const unsigned int len ); + + //////////////////////////////////////////////////////////////////////////////// + //! Compare two integer arrays witha n epsilon tolerance for equality + //! @return CUTTrue if \a reference and \a data are identical, + //! otherwise CUTFalse + //! @param reference handle to the reference data / gold image + //! @param data handle to the computed data + //! @param len number of elements in reference and data + //! @param epsilon epsilon to use for the comparison + //////////////////////////////////////////////////////////////////////////////// + DLL_MAPPING + CUTBoolean CUTIL_API + cutCompareube( const unsigned char* reference, const unsigned char* data, + const unsigned int len, const int epsilon ); + + //////////////////////////////////////////////////////////////////////////// + //! Compare two float arrays with an epsilon tolerance for equality + //! @return CUTTrue if \a reference and \a data are identical, + //! otherwise CUTFalse + //! @param reference handle to the reference data / gold image + //! @param data handle to the computed data + //! @param len number of elements in reference and data + //! @param epsilon epsilon to use for the comparison + //////////////////////////////////////////////////////////////////////////// + DLL_MAPPING + CUTBoolean CUTIL_API + cutComparefe( const float* reference, const float* data, + const unsigned int len, const float epsilon ); + + //////////////////////////////////////////////////////////////////////////// + //! Compare two float arrays using L2-norm with an epsilon tolerance for + //! 
equality + //! @return CUTTrue if \a reference and \a data are identical, + //! otherwise CUTFalse + //! @param reference handle to the reference data / gold image + //! @param data handle to the computed data + //! @param len number of elements in reference and data + //! @param epsilon epsilon to use for the comparison + //////////////////////////////////////////////////////////////////////////// + DLL_MAPPING + CUTBoolean CUTIL_API + cutCompareL2fe( const float* reference, const float* data, + const unsigned int len, const float epsilon ); + + //////////////////////////////////////////////////////////////////////////// + //! Timer functionality + + //////////////////////////////////////////////////////////////////////////// + //! Create a new timer + //! @return CUTTrue if a time has been created, otherwise false + //! @param name of the new timer, 0 if the creation failed + //////////////////////////////////////////////////////////////////////////// + DLL_MAPPING + CUTBoolean CUTIL_API + cutCreateTimer( unsigned int* name); + + //////////////////////////////////////////////////////////////////////////// + //! Delete a timer + //! @return CUTTrue if a time has been deleted, otherwise false + //! @param name of the timer to delete + //////////////////////////////////////////////////////////////////////////// + DLL_MAPPING + CUTBoolean CUTIL_API + cutDeleteTimer( unsigned int name); + + //////////////////////////////////////////////////////////////////////////// + //! Start the time with name \a name + //! @param name name of the timer to start + //////////////////////////////////////////////////////////////////////////// + DLL_MAPPING + CUTBoolean CUTIL_API + cutStartTimer( const unsigned int name); + + //////////////////////////////////////////////////////////////////////////// + //! Stop the time with name \a name. Does not reset. + //! @param name name of the timer to stop + //////////////////////////////////////////////////////////////////////////// + DLL_MAPPING + CUTBoolean CUTIL_API + cutStopTimer( const unsigned int name); + + //////////////////////////////////////////////////////////////////////////// + //! Resets the timer's counter. + //! @param name name of the timer to reset. + //////////////////////////////////////////////////////////////////////////// + DLL_MAPPING + CUTBoolean CUTIL_API + cutResetTimer( const unsigned int name); + + //////////////////////////////////////////////////////////////////////////// + //! Returns total execution time in milliseconds for the timer over all + //! runs since the last reset or timer creation. + //! @param name name of the timer to return the time of + //////////////////////////////////////////////////////////////////////////// + DLL_MAPPING + float CUTIL_API + cutGetTimerValue( const unsigned int name); + + //////////////////////////////////////////////////////////////////////////// + //! Return the average time in milliseconds for timer execution as the + //! total time for the timer dividied by the number of completed (stopped) + //! runs the timer has made. + //! Excludes the current running time if the timer is currently running. + //! @param name name of the timer to return the time of + //////////////////////////////////////////////////////////////////////////// + DLL_MAPPING + float CUTIL_API + cutGetAverageTimerValue( const unsigned int name); + + //////////////////////////////////////////////////////////////////////////// + //! 
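
To make the timer and comparison declarations above concrete, here is a hedged host-side sketch (not code from this patch): it times some GPU work and checks the copied-back result against a CPU reference. runKernel(), h_gold, h_result and len are placeholders for the caller's own code; only the CUTIL calls come from the declarations above.

    // Sketch: time GPU work and validate it against a CPU "gold" array.
    static void timeAndCheck(const float* h_gold, const float* h_result,
                             unsigned int len)
    {
        unsigned int timer = 0;
        cutCreateTimer(&timer);
        cutStartTimer(timer);

        // runKernel();              // placeholder: launch + copy result back
        cudaThreadSynchronize();     // make sure the GPU work has finished

        cutStopTimer(timer);
        printf("GPU time: %f ms\n", cutGetTimerValue(timer));
        cutDeleteTimer(timer);

        if (cutCompareL2fe(h_gold, h_result, len, 1e-6f) != CUTTrue)
            fprintf(stderr, "TEST FAILED: result differs from reference\n");
    }
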
Macros + +#ifdef _DEBUG + +#if __DEVICE_EMULATION__ + // Interface for bank conflict checker +#define CUT_BANK_CHECKER( array, index) \ + (cutCheckBankAccess( threadIdx.x, threadIdx.y, threadIdx.z, blockDim.x, \ + blockDim.y, blockDim.z, \ + __FILE__, __LINE__, #array, index ), \ + array[index]) +#else +#define CUT_BANK_CHECKER( array, index) array[index] +#endif + +# define CU_SAFE_CALL_NO_SYNC( call ) do { \ + CUresult err = call; \ + if( CUDA_SUCCESS != err) { \ + fprintf(stderr, "Cuda driver error %x in file '%s' in line %i.\n", \ + err, __FILE__, __LINE__ ); \ + exit(EXIT_FAILURE); \ + } } while (0) + +# define CU_SAFE_CALL( call ) do { \ + CU_SAFE_CALL_NO_SYNC(call); \ + CUresult err = cuCtxSynchronize(); \ + if( CUDA_SUCCESS != err) { \ + fprintf(stderr, "Cuda driver error %x in file '%s' in line %i.\n", \ + err, __FILE__, __LINE__ ); \ + exit(EXIT_FAILURE); \ + } } while (0) + +# define CUDA_SAFE_CALL_NO_SYNC( call) do { \ + cudaError err = call; \ + if( cudaSuccess != err) { \ + fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n", \ + __FILE__, __LINE__, cudaGetErrorString( err) ); \ + exit(EXIT_FAILURE); \ + } } while (0) + +# define CUDA_SAFE_CALL( call) do { \ + CUDA_SAFE_CALL_NO_SYNC(call); \ + cudaError err = cudaThreadSynchronize(); \ + if( cudaSuccess != err) { \ + fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n", \ + __FILE__, __LINE__, cudaGetErrorString( err) ); \ + exit(EXIT_FAILURE); \ + } } while (0) + +# define CUFFT_SAFE_CALL( call) do { \ + cufftResult err = call; \ + if( CUFFT_SUCCESS != err) { \ + fprintf(stderr, "CUFFT error in file '%s' in line %i.\n", \ + __FILE__, __LINE__); \ + exit(EXIT_FAILURE); \ + } } while (0) + +# define CUT_SAFE_CALL( call) \ + if( CUTTrue != call) { \ + fprintf(stderr, "Cut error in file '%s' in line %i.\n", \ + __FILE__, __LINE__); \ + exit(EXIT_FAILURE); \ + } + + //! Check for CUDA error +# define CUT_CHECK_ERROR(errorMessage) do { \ + cudaError_t err = cudaGetLastError(); \ + if( cudaSuccess != err) { \ + fprintf(stderr, "Cuda error: %s in file '%s' in line %i : %s.\n", \ + errorMessage, __FILE__, __LINE__, cudaGetErrorString( err) );\ + exit(EXIT_FAILURE); \ + } \ + err = cudaThreadSynchronize(); \ + if( cudaSuccess != err) { \ + fprintf(stderr, "Cuda error: %s in file '%s' in line %i : %s.\n", \ + errorMessage, __FILE__, __LINE__, cudaGetErrorString( err) );\ + exit(EXIT_FAILURE); \ + } } while (0) + + //! Check for malloc error +# define CUT_SAFE_MALLOC( mallocCall ) do{ \ + if( !(mallocCall)) { \ + fprintf(stderr, "Host malloc failure in file '%s' in line %i\n", \ + __FILE__, __LINE__); \ + exit(EXIT_FAILURE); \ + } } while(0); + + //! 
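
A brief usage sketch of the checked-call macros defined below (illustrative rather than code from this patch): in release builds, where _DEBUG is unset, they reduce to the bare calls, so the checks cost nothing outside debugging. h_buf, n and the kernel launch are placeholders.

    // Sketch: wrap runtime calls and a launch with the CUTIL checking macros.
    static void roundTrip(const float* h_buf, size_t n)
    {
        float* d_buf = 0;
        CUDA_SAFE_CALL(cudaMalloc((void**)&d_buf, n * sizeof(float)));
        CUDA_SAFE_CALL(cudaMemcpy(d_buf, h_buf, n * sizeof(float),
                                  cudaMemcpyHostToDevice));

        // myKernel<<<grid, block>>>(d_buf, n);   // placeholder launch
        CUT_CHECK_ERROR("myKernel launch failed");

        CUDA_SAFE_CALL(cudaFree(d_buf));
    }
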
Check if conditon is true (flexible assert) +# define CUT_CONDITION( val) \ + if( CUTFalse == cutCheckCondition( val, __FILE__, __LINE__)) { \ + exit(EXIT_FAILURE); \ + } + +#else // not DEBUG + +#define CUT_BANK_CHECKER( array, index) array[index] + + // void macros for performance reasons +# define CUT_CHECK_ERROR(errorMessage) +# define CUT_CHECK_ERROR_GL() +# define CUT_CONDITION( val) +# define CU_SAFE_CALL_NO_SYNC( call) call +# define CU_SAFE_CALL( call) call +# define CUDA_SAFE_CALL_NO_SYNC( call) call +# define CUDA_SAFE_CALL( call) call +# define CUT_SAFE_CALL( call) call +# define CUFFT_SAFE_CALL( call) call +# define CUT_SAFE_MALLOC( mallocCall ) mallocCall + +#endif + +#if __DEVICE_EMULATION__ + +# define CUT_DEVICE_INIT(ARGC, ARGV) + +#else + +# define CUT_DEVICE_INIT(ARGC, ARGV) { \ + int deviceCount; \ + CUDA_SAFE_CALL_NO_SYNC(cudaGetDeviceCount(&deviceCount)); \ + if (deviceCount == 0) { \ + fprintf(stderr, "cutil error: no devices supporting CUDA.\n"); \ + exit(EXIT_FAILURE); \ + } \ + int dev = 0; \ + cutGetCmdLineArgumenti(ARGC, (const char **) ARGV, "device", &dev); \ + if (dev > deviceCount-1) dev = deviceCount - 1; \ + cudaDeviceProp deviceProp; \ + CUDA_SAFE_CALL_NO_SYNC(cudaGetDeviceProperties(&deviceProp, dev)); \ + if (deviceProp.major < 1) { \ + fprintf(stderr, "cutil error: device does not support CUDA.\n"); \ + exit(EXIT_FAILURE); \ + } \ + if (cutCheckCmdLineFlag(ARGC, (const char **) ARGV, "quiet") == CUTFalse) \ + fprintf(stderr, "Using device %d: %s\n", dev, deviceProp.name); \ + CUDA_SAFE_CALL(cudaSetDevice(dev)); \ +} + +#endif + +# define CUT_DEVICE_INIT_DRV(cuDevice, ARGC, ARGV) { \ + cuDevice = 0; \ + int deviceCount = 0; \ + CUresult err = cuInit(0); \ + if (CUDA_SUCCESS == err) \ + CU_SAFE_CALL_NO_SYNC(cuDeviceGetCount(&deviceCount)); \ + if (deviceCount == 0) { \ + fprintf(stderr, "cutil error: no devices supporting CUDA\n"); \ + exit(EXIT_FAILURE); \ + } \ + int dev = 0; \ + cutGetCmdLineArgumenti(ARGC, (const char **) ARGV, "device", &dev); \ + if (dev > deviceCount-1) dev = deviceCount - 1; \ + CU_SAFE_CALL_NO_SYNC(cuDeviceGet(&cuDevice, dev)); \ + char name[100]; \ + cuDeviceGetName(name, 100, cuDevice); \ + if (cutCheckCmdLineFlag(ARGC, (const char **) ARGV, "quiet") == CUTFalse) \ + fprintf(stderr, "Using device %d: %s\n", dev, name); \ +} + +#define CUT_EXIT(argc, argv) \ + if (!cutCheckCmdLineFlag(argc, (const char**)argv, "noprompt")) { \ + printf("\nPress ENTER to exit...\n"); \ + fflush( stdout); \ + fflush( stderr); \ + getchar(); \ + } \ + exit(EXIT_SUCCESS); + + +#ifdef __cplusplus +} +#endif // #ifdef _DEBUG (else branch) + +#endif // #ifndef _CUTIL_H_ diff --git a/lib/gpu/cudpp_mini/kernel/radixsort_kernel.cu b/lib/gpu/cudpp_mini/kernel/radixsort_kernel.cu new file mode 100644 index 0000000000..ac66b9a9f2 --- /dev/null +++ b/lib/gpu/cudpp_mini/kernel/radixsort_kernel.cu @@ -0,0 +1,868 @@ +// ------------------------------------------------------------- +// CUDPP -- CUDA Data Parallel Primitives library +// ------------------------------------------------------------- +// $Revision$ +// $Date$ +// ------------------------------------------------------------- +// This source code is distributed under the terms of license.txt +// in the root directory of this source distribution. 
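
For completeness, the device-init and exit macros above are normally used to bracket main() in CUTIL-based test programs; a minimal, hypothetical skeleton might look like this.

    // Sketch: the usual shape of a CUTIL-based driver program.
    int main(int argc, char** argv)
    {
        CUT_DEVICE_INIT(argc, argv);   // honors --device=N, prints the name

        // ... allocate, launch, validate ...

        CUT_EXIT(argc, argv);          // waits for ENTER unless --noprompt
    }
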
+// ------------------------------------------------------------- + +#include "cudpp_radixsort.h" +#include +#include "sharedmem.h" +#include "cta/radixsort_cta.cu" + +#ifdef __DEVICE_EMULATION__ +#define __EMUSYNC __syncthreads() +#else +#define __EMUSYNC +#endif + +/** + * @file + * radixsort_app.cu + * + * @brief CUDPP kernel-level radix sorting routines + */ + +/** \addtogroup cudpp_kernel + * @{ + */ + +/** @name RadixSort Functions + * @{ + */ + + + +typedef unsigned int uint; + +/** @brief And empty kernel used to reset CTA issue hardware + **/ +__global__ void emptyKernel() {} + + +/** @brief Does special binary arithmetic before sorting floats + * + * Uses floatFlip function to flip bits. + * @param[in,out] values Values to be manipulated + * @param[in] numValues Number of values to be flipped + **/ + +__global__ void +LAUNCH_BOUNDS(SORT_CTA_SIZE) +flipFloats(uint *values, uint numValues) +{ + uint index = __umul24(blockDim.x*4, blockIdx.x) + threadIdx.x; + if (index < numValues) values[index] = floatFlip(values[index]); + index += blockDim.x; + if (index < numValues) values[index] = floatFlip(values[index]); + index += blockDim.x; + if (index < numValues) values[index] = floatFlip(values[index]); + index += blockDim.x; + if (index < numValues) values[index] = floatFlip(values[index]); +} + +/** @brief Undoes the flips from flipFloats + * + * Uses floatUnflip function to unflip bits. + * @param[in,out] values Values to be manipulated + * @param[in] numValues Number of values to be unflipped + **/ +__global__ void +LAUNCH_BOUNDS(SORT_CTA_SIZE) +unflipFloats(uint *values, uint numValues) +{ + uint index = __umul24(blockDim.x*4, blockIdx.x) + threadIdx.x; + if (index < numValues) values[index] = floatUnflip(values[index]); + index += blockDim.x; + if (index < numValues) values[index] = floatUnflip(values[index]); + index += blockDim.x; + if (index < numValues) values[index] = floatUnflip(values[index]); + index += blockDim.x; + if (index < numValues) values[index] = floatUnflip(values[index]); +} + + +/** @brief Optimization for sorts of WARP_SIZE or fewer elements + * + * @param[in,out] keys Keys to be sorted. + * @param[in,out] values Associated values to be sorted (through keys). + * @param[in] numElements Number of elements in the sort. 
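
floatFlip() and floatUnflip() are defined in cta/radixsort_cta.cu, which is not part of this hunk. As background, the sketch below shows the standard order-preserving float-to-uint bit transform such helpers typically implement: negative floats have all bits inverted, non-negative floats have only the sign bit flipped, so unsigned integer ordering of the results matches IEEE-754 float ordering. The exact CUDPP code may differ in detail.

    // Sketch of the usual monotonic float<->uint mapping (not the CUDPP code).
    __device__ uint floatFlipSketch(uint f)
    {
        uint mask = -int(f >> 31) | 0x80000000;   // all ones for negatives,
                                                  // just the sign bit otherwise
        return f ^ mask;
    }

    __device__ uint floatUnflipSketch(uint f)
    {
        uint mask = ((f >> 31) - 1) | 0x80000000; // inverse of the mask above
        return f ^ mask;
    }
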
+ */ +template +__global__ +LAUNCH_BOUNDS(WARP_SIZE) +void radixSortSingleWarp(uint *keys, + uint *values, + uint numElements) +{ + volatile __shared__ uint sKeys[WARP_SIZE]; //remove class distinctions + volatile __shared__ uint sValues[WARP_SIZE]; + volatile __shared__ uint sFlags[WARP_SIZE]; + + sKeys[threadIdx.x] = floatFlip(keys[threadIdx.x]); + sValues[threadIdx.x] = values[threadIdx.x]; + + __EMUSYNC; // emulation only + + for(uint i = 1; i < numElements; i++) + { + uint key_i = sKeys[i]; + uint val_i = sValues[i]; + + sFlags[threadIdx.x] = 0; + + uint temp, tempval; + if( (threadIdx.x < i) && (sKeys[threadIdx.x] > key_i) ) + { + temp = sKeys[threadIdx.x]; + tempval = sValues[threadIdx.x]; + sFlags[threadIdx.x] = 1; + +#ifdef __DEVICE_EMULATION__ + } + __EMUSYNC; + if( (threadIdx.x < i) && (sKeys[threadIdx.x] > key_i) ) + { +#endif + sKeys[threadIdx.x + 1] = temp; + sValues[threadIdx.x + 1] = tempval; + sFlags[threadIdx.x + 1] = 0; + } + + + if(sFlags[threadIdx.x] == 1 ) + { + sKeys[threadIdx.x] = key_i; + sValues[threadIdx.x] = val_i; + } + + __EMUSYNC; // emulation only + + } + keys[threadIdx.x] = floatUnflip(sKeys[threadIdx.x]); + values[threadIdx.x] = sValues[threadIdx.x]; +} + + +/** @brief Optimization for sorts of WARP_SIZE or fewer elements. Keys-Only version. + * + * @param[in,out] keys Keys to be sorted + * @param[in] numElements Total number of elements to be sorted +**/ + +template +__global__ +LAUNCH_BOUNDS(WARP_SIZE) +void radixSortSingleWarpKeysOnly(uint *keys, + uint numElements) +{ + volatile __shared__ uint sKeys[WARP_SIZE]; + volatile __shared__ uint sFlags[WARP_SIZE]; + + sKeys[threadIdx.x] = floatFlip(keys[threadIdx.x]); + + __EMUSYNC; // emulation only + + for(uint i = 1; i < numElements; i++) + { + uint key_i = sKeys[i]; + + sFlags[threadIdx.x] = 0; + + uint temp; + if( (threadIdx.x < i) && (sKeys[threadIdx.x] > key_i) ) + { + temp = sKeys[threadIdx.x]; + sFlags[threadIdx.x] = 1; +#ifdef __DEVICE_EMULATION__ + } + __EMUSYNC; + if( (threadIdx.x < i) && (sKeys[threadIdx.x] > key_i) ) + { +#endif + sKeys[threadIdx.x + 1] = temp; + sFlags[threadIdx.x + 1] = 0; + } + if(sFlags[threadIdx.x] == 1 ) + { + sKeys[threadIdx.x] = key_i; + } + + __EMUSYNC; // emulation only + + } + keys[threadIdx.x] = floatUnflip(sKeys[threadIdx.x]); +} + +/** @brief sorts all blocks of data independently in shared memory. +* Each thread block (CTA) sorts one block of 4*CTA_SIZE elements +* +* The radix sort is done in two stages. This stage calls radixSortBlock on each +* block independently, sorting on the basis of bits (startbit) -> (startbit + nbits) +* +* Template parameters are used to generate efficient code for various special cases +* For example, we have to handle arrays that are a multiple of the block size (fullBlocks) +* differently than arrays that are not. "flip" is used to only compile in the +* float flip code when float keys are used. "loop" is used when persistent CTAs +* are used. +* +* By persistent CTAs we mean that we launch only as many thread blocks as can +* be resident in the GPU and no more, rather than launching as many threads as +* we have elements. Persistent CTAs loop over blocks of elements until all work +* is complete. This can be faster in some cases. In our tests it is faster +* for large sorts (and the threshold is higher on compute version 1.1 and earlier +* GPUs than it is on compute version 1.2 GPUs. 
+* +* @param[out] keysOut Output of sorted keys +* @param[out] valuesOut Output of associated values +* @param[in] keysIn Input of unsorted keys in GPU +* @param[in] valuesIn Input of associated input values +* @param[in] numElements Total number of elements to sort +* @param[in] totalBlocks The number of blocks of data to sort +*/ +template +__global__ void +LAUNCH_BOUNDS(SORT_CTA_SIZE) +radixSortBlocks(uint4* keysOut, uint4* valuesOut, + uint4* keysIn, uint4* valuesIn, + uint numElements, uint totalBlocks) +{ + extern __shared__ uint4 sMem[]; + + uint4 key, value; + + + uint blockId = blockIdx.x; + + while (!loop || blockId < totalBlocks) + { + uint i = blockId * blockDim.x + threadIdx.x; + uint idx = i << 2; + + // handle non-full last block if array is not multiple of 1024 numElements + if (!fullBlocks && idx+3 >= numElements) + { + if (idx >= numElements) + { + key = make_uint4(UINT_MAX, UINT_MAX, UINT_MAX, UINT_MAX); + value = make_uint4(UINT_MAX, UINT_MAX, UINT_MAX, UINT_MAX); + } + else + { + // for non-full block, we handle uint1 values instead of uint4 + uint *keys1 = (uint*)keysIn; + uint *values1 = (uint*)valuesIn; + + key.x = (idx < numElements) ? floatFlip(keys1[idx]) : UINT_MAX; + key.y = (idx+1 < numElements) ? floatFlip(keys1[idx+1]) : UINT_MAX; + key.z = (idx+2 < numElements) ? floatFlip(keys1[idx+2]) : UINT_MAX; + key.w = UINT_MAX; + + value.x = (idx < numElements) ? values1[idx] : UINT_MAX; + value.y = (idx+1 < numElements) ? values1[idx+1] : UINT_MAX; + value.z = (idx+2 < numElements) ? values1[idx+2] : UINT_MAX; + value.w = UINT_MAX; + } + } + else + { + key = keysIn[i]; + value = valuesIn[i]; + + if (flip) + { + key.x = floatFlip(key.x); + key.y = floatFlip(key.y); + key.z = floatFlip(key.z); + key.w = floatFlip(key.w); + } + } + __syncthreads(); + radixSortBlock(key, value); + + // handle non-full last block if array is not multiple of 1024 numElements + if(!fullBlocks && idx+3 >= numElements) + { + if (idx < numElements) + { + // for non-full block, we handle uint1 values instead of uint4 + uint *keys1 = (uint*)keysOut; + uint *values1 = (uint*)valuesOut; + + keys1[idx] = key.x; + values1[idx] = value.x; + + if (idx + 1 < numElements) + { + keys1[idx + 1] = key.y; + values1[idx + 1] = value.y; + + if (idx + 2 < numElements) + { + keys1[idx + 2] = key.z; + values1[idx + 2] = value.z; + } + } + } + } + else + { + keysOut[i] = key; + valuesOut[i] = value; + } + + if (loop) + blockId += gridDim.x; + else + break; + } +} + +/** @brief Computes the number of keys of each radix in each block stores offset. +* +* Given an array with blocks sorted according to a 4-bit radix group, each +* block counts the number of keys that fall into each radix in the group, and +* finds the starting offset of each radix in the block. It then writes the radix +* counts to the counters array, and the starting offsets to the blockOffsets array. +* +* Template parameters are used to generate efficient code for various special cases +* For example, we have to handle arrays that are a multiple of the block size +* (fullBlocks) differently than arrays that are not. "loop" is used when persistent +* CTAs are used. +* +* By persistent CTAs we mean that we launch only as many thread blocks as can +* be resident in the GPU and no more, rather than launching as many threads as +* we have elements. Persistent CTAs loop over blocks of elements until all work +* is complete. This can be faster in some cases. 
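
Because each thread of radixSortBlocks handles a uint4, a launcher sizes its grid in units of 4*SORT_CTA_SIZE keys. The snippet below is a simplified host-side sketch of that sizing; the actual dispatch in radixsort_app.cu (later in this patch) additionally caps the grid at 65535 blocks and derives the persistent-CTA count from measured occupancy. maxResidentCTAs here is a placeholder.

    // Simplified sketch of sizing the radixSortBlocks grid (not the real
    // dispatch code in radixsort_app.cu).
    static uint gridSizeForSortBlocks(uint numElements, bool persistent,
                                      uint maxResidentCTAs)
    {
        const uint eltsPerBlock = SORT_CTA_SIZE * 4;   // 4 keys per thread
        uint numBlocks = (numElements + eltsPerBlock - 1) / eltsPerBlock;

        // Persistent-CTA path: launch a bounded grid and rely on the kernel's
        // "blockId += gridDim.x" loop to cover the remaining blocks.
        if (persistent && numBlocks > maxResidentCTAs)
            return maxResidentCTAs;
        return numBlocks;
    }
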
In our tests it is faster +* for large sorts (and the threshold is higher on compute version 1.1 and earlier +* GPUs than it is on compute version 1.2 GPUs. +* +* @param[in] keys Input keys +* @param[out] counters Radix count for each block +* @param[out] blockOffsets The offset address for each block +* @param[in] numElements Total number of elements +* @param[in] totalBlocks Total number of blocks +**/ +template +__global__ void +LAUNCH_BOUNDS(SORT_CTA_SIZE) +findRadixOffsets(uint2 *keys, + uint *counters, + uint *blockOffsets, + uint numElements, + uint totalBlocks) +{ + extern __shared__ uint sRadix1[]; + __shared__ uint sStartPointers[16]; + + uint blockId = blockIdx.x; + + while (!loop || blockId < totalBlocks) + { + uint2 radix2; + + uint i = blockId * blockDim.x + threadIdx.x; + + // handle non-full last block if array is not multiple of 1024 numElements + if(!fullBlocks && ((i + 1) << 1 ) > numElements ) + { + // handle uint1 rather than uint2 for non-full blocks + uint *keys1 = (uint*)keys; + uint j = i << 1; + + radix2.x = (j < numElements) ? keys1[j] : UINT_MAX; + j++; + radix2.y = (j < numElements) ? keys1[j] : UINT_MAX; + } + else + { + radix2 = keys[i]; + } + + sRadix1[2 * threadIdx.x] = (radix2.x >> startbit) & 0xF; + sRadix1[2 * threadIdx.x + 1] = (radix2.y >> startbit) & 0xF; + + // Finds the position where the sRadix1 entries differ and stores start + // index for each radix. + if(threadIdx.x < 16) + { + sStartPointers[threadIdx.x] = 0; + } + __syncthreads(); + + if((threadIdx.x > 0) && (sRadix1[threadIdx.x] != sRadix1[threadIdx.x - 1]) ) + { + sStartPointers[sRadix1[threadIdx.x]] = threadIdx.x; + } + if(sRadix1[threadIdx.x + SORT_CTA_SIZE] != sRadix1[threadIdx.x + SORT_CTA_SIZE - 1]) + { + sStartPointers[sRadix1[threadIdx.x + SORT_CTA_SIZE]] = threadIdx.x + SORT_CTA_SIZE; + } + __syncthreads(); + + if(threadIdx.x < 16) + { + blockOffsets[blockId*16 + threadIdx.x] = sStartPointers[threadIdx.x]; + } + __syncthreads(); + + // Compute the sizes of each block. + if((threadIdx.x > 0) && (sRadix1[threadIdx.x] != sRadix1[threadIdx.x - 1]) ) + { + sStartPointers[sRadix1[threadIdx.x - 1]] = + threadIdx.x - sStartPointers[sRadix1[threadIdx.x - 1]]; + } + if(sRadix1[threadIdx.x + SORT_CTA_SIZE] != sRadix1[threadIdx.x + SORT_CTA_SIZE - 1] ) + { + sStartPointers[sRadix1[threadIdx.x + SORT_CTA_SIZE - 1]] = + threadIdx.x + SORT_CTA_SIZE - sStartPointers[sRadix1[threadIdx.x + SORT_CTA_SIZE - 1]]; + } + + + if(threadIdx.x == SORT_CTA_SIZE - 1) + { + sStartPointers[sRadix1[2 * SORT_CTA_SIZE - 1]] = + 2 * SORT_CTA_SIZE - sStartPointers[sRadix1[2 * SORT_CTA_SIZE - 1]]; + } + __syncthreads(); + + if(threadIdx.x < 16) + { + counters[threadIdx.x * totalBlocks + blockId] = + sStartPointers[threadIdx.x]; + } + + if (loop) + blockId += gridDim.x; + else + break; + } +} + + +/**@brief Reorders data in the global array. +* +* reorderData shuffles data in the array globally after the radix +* offsets have been found. On compute version 1.1 and earlier GPUs, this code depends +* on SORT_CTA_SIZE being 16 * number of radices (i.e. 16 * 2^nbits). +* +* On compute version 1.1 GPUs ("manualCoalesce=true") this function ensures +* that all writes are coalesced using extra work in the kernel. On later +* GPUs coalescing rules have been relaxed, so this extra overhead hurts +* performance. On these GPUs we set manualCoalesce=false and directly store +* the results. 
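
To pin down the data layout that findRadixOffsets produces, here is a serial reference for a single, already-sorted block of 2*SORT_CTA_SIZE keys (a sketch that assumes a full block and ignores the UINT_MAX padding of a partial last block). Note that counters is written radix-major while blockOffsets is block-major, exactly as in the kernel above.

    // Serial sketch of what findRadixOffsets computes for one sorted block.
    void findRadixOffsetsReference(const uint* blockKeys, uint blockId,
                                   uint startbit, uint totalBlocks,
                                   uint* counters, uint* blockOffsets)
    {
        uint count[16] = {0};   // how many keys carry each 4-bit digit
        uint first[16] = {0};   // index in the block where each digit starts

        for (uint i = 0; i < 2 * SORT_CTA_SIZE; ++i) {
            uint r = (blockKeys[i] >> startbit) & 0xF;
            if (count[r] == 0)
                first[r] = i;   // keys are sorted, so each digit is contiguous
            ++count[r];
        }
        for (uint r = 0; r < 16; ++r) {
            counters[r * totalBlocks + blockId] = count[r];   // radix-major
            blockOffsets[blockId * 16 + r]      = first[r];   // block-major
        }
    }
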
+* +* Template parameters are used to generate efficient code for various special cases +* For example, we have to handle arrays that are a multiple of the block size +* (fullBlocks) differently than arrays that are not. "loop" is used when persistent +* CTAs are used. +* +* By persistent CTAs we mean that we launch only as many thread blocks as can +* be resident in the GPU and no more, rather than launching as many threads as +* we have elements. Persistent CTAs loop over blocks of elements until all work +* is complete. This can be faster in some cases. In our tests it is faster +* for large sorts (and the threshold is higher on compute version 1.1 and earlier +* GPUs than it is on compute version 1.2 GPUs. +* +* @param[out] outKeys Output of sorted keys +* @param[out] outValues Output of associated values +* @param[in] keys Input of unsorted keys in GPU +* @param[in] values Input of associated input values +* @param[in] blockOffsets The offset address for each block +* @param[in] offsets Address of each radix within each block +* @param[in] sizes Number of elements in a block +* @param[in] numElements Total number of elements +* @param[in] totalBlocks Total number of data blocks to process +* +* @todo Args that are const below should be prototyped as const +**/ +template +__global__ void +LAUNCH_BOUNDS(SORT_CTA_SIZE) +reorderData(uint *outKeys, + uint *outValues, + uint2 *keys, + uint2 *values, + uint *blockOffsets, + uint *offsets, + uint *sizes, + uint numElements, + uint totalBlocks) +{ + __shared__ uint2 sKeys2[SORT_CTA_SIZE]; + __shared__ uint2 sValues2[SORT_CTA_SIZE]; + __shared__ uint sOffsets[16]; + __shared__ uint sBlockOffsets[16]; + + uint *sKeys1 = (uint*)sKeys2; + uint *sValues1 = (uint*)sValues2; + + uint blockId = blockIdx.x; + + while (!loop || blockId < totalBlocks) + { + uint i = blockId * blockDim.x + threadIdx.x; + + // handle non-full last block if array is not multiple of 1024 numElements + if(!fullBlocks && (((i + 1) << 1) > numElements)) + { + uint *keys1 = (uint*)keys; + uint *values1 = (uint*)values; + uint j = i << 1; + + sKeys1[threadIdx.x << 1] = (j < numElements) ? keys1[j] : UINT_MAX; + sValues1[threadIdx.x << 1] = (j < numElements) ? values1[j] : UINT_MAX; + j++; + sKeys1[(threadIdx.x << 1) + 1] = (j < numElements) ? keys1[j] : UINT_MAX; + sValues1[(threadIdx.x << 1) + 1] = (j < numElements) ? 
values1[j] : UINT_MAX; + } + else + { + sKeys2[threadIdx.x] = keys[i]; + sValues2[threadIdx.x] = values[i]; + } + + if (!manualCoalesce) + { + if(threadIdx.x < 16) + { + sOffsets[threadIdx.x] = offsets[threadIdx.x * totalBlocks + blockId]; + sBlockOffsets[threadIdx.x] = blockOffsets[blockId * 16 + threadIdx.x]; + } + __syncthreads(); + + uint radix = (sKeys1[threadIdx.x] >> startbit) & 0xF; + uint globalOffset = sOffsets[radix] + threadIdx.x - sBlockOffsets[radix]; + + if (fullBlocks || globalOffset < numElements) + { + outKeys[globalOffset] = floatUnflip(sKeys1[threadIdx.x]); + outValues[globalOffset] = sValues1[threadIdx.x]; + } + + radix = (sKeys1[threadIdx.x + SORT_CTA_SIZE] >> startbit) & 0xF; + globalOffset = sOffsets[radix] + threadIdx.x + SORT_CTA_SIZE - sBlockOffsets[radix]; + + if (fullBlocks || globalOffset < numElements) + { + outKeys[globalOffset] = floatUnflip(sKeys1[threadIdx.x + SORT_CTA_SIZE]); + outValues[globalOffset] = sValues1[threadIdx.x + SORT_CTA_SIZE]; + } + } + else + { + __shared__ uint sSizes[16]; + + if(threadIdx.x < 16) + { + sOffsets[threadIdx.x] = offsets[threadIdx.x * totalBlocks + blockId]; + sBlockOffsets[threadIdx.x] = blockOffsets[blockId * 16 + threadIdx.x]; + sSizes[threadIdx.x] = sizes[threadIdx.x * totalBlocks + blockId]; + } + __syncthreads(); + + // 1 half-warp is responsible for writing out all values for 1 radix. + // Loops if there are more than 16 values to be written out. + // All start indices are rounded down to the nearest multiple of 16, and + // all end indices are rounded up to the nearest multiple of 16. + // Thus it can do extra work if the start and end indices are not multiples of 16 + // This is bounded by a factor of 2 (it can do 2X more work at most). + + const uint halfWarpID = threadIdx.x >> 4; + + const uint halfWarpOffset = threadIdx.x & 0xF; + const uint leadingInvalid = sOffsets[halfWarpID] & 0xF; + + uint startPos = sOffsets[halfWarpID] & 0xFFFFFFF0; + uint endPos = (sOffsets[halfWarpID] + sSizes[halfWarpID]) + 15 - + ((sOffsets[halfWarpID] + sSizes[halfWarpID] - 1) & 0xF); + uint numIterations = endPos - startPos; + + uint outOffset = startPos + halfWarpOffset; + uint inOffset = sBlockOffsets[halfWarpID] - leadingInvalid + halfWarpOffset; + + for(uint j = 0; j < numIterations; j += 16, outOffset += 16, inOffset += 16) + { + if( (outOffset >= sOffsets[halfWarpID]) && + (inOffset - sBlockOffsets[halfWarpID] < sSizes[halfWarpID])) + { + if(blockId < totalBlocks - 1 || outOffset < numElements) + { + outKeys[outOffset] = floatUnflip(sKeys1[inOffset]); + outValues[outOffset] = sValues1[inOffset]; + } + } + } + } + + if (loop) + { + blockId += gridDim.x; + __syncthreads(); + } + else + break; + } +} + +/** @brief Sorts all blocks of data independently in shared memory. +* Each thread block (CTA) sorts one block of 4*CTA_SIZE elements +* +* The radix sort is done in two stages. This stage calls radixSortBlock on each +* block independently, sorting on the basis of bits (startbit) -> (startbit + nbits) +* +* Template parameters are used to generate efficient code for various special cases +* For example, we have to handle arrays that are a multiple of the block size (fullBlocks) +* differently than arrays that are not. "flip" is used to only compile in the +* float flip code when float keys are used. "loop" is used when persistent CTAs +* are used. +* +* By persistent CTAs we mean that we launch only as many thread blocks as can +* be resident in the GPU and no more, rather than launching as many threads as +* we have elements. 
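
The start/end arithmetic in the manual-coalescing branch above is just 16-element alignment of each radix's output range; an equivalent host-side formulation (a sketch, not replacement code) reads as follows.

    // Equivalent reading of the rounding above: widen [offset, offset + size)
    // to 16-aligned bounds so every half-warp writes whole 16-element rows.
    static uint alignedRange(uint offset, uint size,
                             uint* startPos, uint* endPos)
    {
        *startPos = offset & 0xFFFFFFF0;                 // round down to 16
        *endPos   = (offset + size + 15) & 0xFFFFFFF0;   // round up to 16
        return *endPos - *startPos;                      // loop trip count
    }
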
Persistent CTAs loop over blocks of elements until all work +* is complete. This can be faster in some cases. In our tests it is faster +* for large sorts (and the threshold is higher on compute version 1.1 and earlier +* GPUs than it is on compute version 1.2 GPUs. +* +* @param[out] keysOut Output of sorted keys GPU main memory +* @param[in] keysIn Input of unsorted keys in GPU main memory +* @param[in] numElements Total number of elements to sort +* @param[in] totalBlocks Total number of blocks to sort +* +*/ +template +__global__ void +LAUNCH_BOUNDS(SORT_CTA_SIZE) +radixSortBlocksKeysOnly(uint4* keysOut, uint4* keysIn, uint numElements, uint totalBlocks) +{ + extern __shared__ uint4 sMem[]; + + uint4 key; + + uint blockId = blockIdx.x; + + while (!loop || blockId < totalBlocks) + { + uint i = blockId * blockDim.x + threadIdx.x; + uint idx = i << 2; + + // handle non-full last block if array is not multiple of 1024 numElements + if (!fullBlocks && idx+3 >= numElements) + { + if (idx >= numElements) + { + key = make_uint4(UINT_MAX, UINT_MAX, UINT_MAX, UINT_MAX); + } + else + { + // for non-full block, we handle uint1 values instead of uint4 + uint *keys1 = (uint*)keysIn; + + key.x = (idx < numElements) ? floatFlip(keys1[idx]) : UINT_MAX; + key.y = (idx+1 < numElements) ? floatFlip(keys1[idx+1]) : UINT_MAX; + key.z = (idx+2 < numElements) ? floatFlip(keys1[idx+2]) : UINT_MAX; + key.w = UINT_MAX; + } + } + else + { + key = keysIn[i]; + if (flip) + { + key.x = floatFlip(key.x); + key.y = floatFlip(key.y); + key.z = floatFlip(key.z); + key.w = floatFlip(key.w); + } + } + __syncthreads(); + radixSortBlockKeysOnly(key); + + // handle non-full last block if array is not multiple of 1024 numElements + if(!fullBlocks && idx+3 >= numElements) + { + if (idx < numElements) + { + // for non-full block, we handle uint1 values instead of uint4 + uint *keys1 = (uint*)keysOut; + + keys1[idx] = key.x; + + if (idx + 1 < numElements) + { + keys1[idx + 1] = key.y; + + if (idx + 2 < numElements) + { + keys1[idx + 2] = key.z; + } + } + } + } + else + { + keysOut[i] = key; + } + + if (loop) + blockId += gridDim.x; + else + break; + } +} + +/** @brief Reorders data in the global array. +* +* reorderDataKeysOnly shuffles data in the array globally after the radix offsets +* have been found. On compute version 1.1 and earlier GPUs, this code depends +* on SORT_CTA_SIZE being 16 * number of radices (i.e. 16 * 2^nbits). +* +* On compute version 1.1 GPUs ("manualCoalesce=true") this function ensures +* that all writes are coalesced using extra work in the kernel. On later +* GPUs coalescing rules have been relaxed, so this extra overhead hurts +* performance. On these GPUs we set manualCoalesce=false and directly store +* the results. +* +* Template parameters are used to generate efficient code for various special cases +* For example, we have to handle arrays that are a multiple of the block size +* (fullBlocks) differently than arrays that are not. "loop" is used when persistent +* CTAs are used. +* +* By persistent CTAs we mean that we launch only as many thread blocks as can +* be resident in the GPU and no more, rather than launching as many threads as +* we have elements. Persistent CTAs loop over blocks of elements until all work +* is complete. This can be faster in some cases. In our tests it is faster +* for large sorts (and the threshold is higher on compute version 1.1 and earlier +* GPUs than it is on compute version 1.2 GPUs. 
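
Taken together, the kernels in this file implement one 4-bit pass of a least-significant-digit radix sort. The outline below summarizes the order in which the application level invokes them; the concrete dispatch, including the template specializations and the persistent-CTA handling, is in radixsort_app.cu later in this patch.

    // One 4-bit sorting pass, as driven from the application level (sketch):
    //
    //   1. radixSortBlocks / radixSortBlocksKeysOnly
    //        locally sort each chunk of 4*SORT_CTA_SIZE keys by bits
    //        [startbit, startbit + nbits), flipping float keys on the way in
    //   2. findRadixOffsets
    //        for each 2*SORT_CTA_SIZE block, count the 16 digits and record
    //        where each digit's run begins
    //   3. cudppScanDispatch
    //        exclusive scan of the 16 x numBlocks2 counters array to turn
    //        per-block counts into global write positions
    //   4. reorderData / reorderDataKeysOnly
    //        scatter keys (and values) to their final positions, unflipping
    //        float keys on the last pass
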
+* +* @param[out] outKeys Output result of reorderDataKeysOnly() +* @param[in] keys Keys to be reordered +* @param[in] blockOffsets Start offset for each block +* @param[in] offsets Offset of each radix within each block +* @param[in] sizes Number of elements in a block +* @param[in] numElements Total number of elements +* @param[in] totalBlocks Total number of blocks +*/ +template +__global__ void +LAUNCH_BOUNDS(SORT_CTA_SIZE) +reorderDataKeysOnly(uint *outKeys, + uint2 *keys, + uint *blockOffsets, + uint *offsets, + uint *sizes, + uint numElements, + uint totalBlocks) +{ + __shared__ uint2 sKeys2[SORT_CTA_SIZE]; + __shared__ uint sOffsets[16]; + __shared__ uint sBlockOffsets[16]; + + uint *sKeys1 = (uint*)sKeys2; + + uint blockId = blockIdx.x; + + while (!loop || blockId < totalBlocks) + { + uint i = blockId * blockDim.x + threadIdx.x; + + // handle non-full last block if array is not multiple of 1024 numElements + if(!fullBlocks && (((i + 1) << 1) > numElements)) + { + uint *keys1 = (uint*)keys; + uint j = i << 1; + + sKeys1[threadIdx.x << 1] = (j < numElements) ? keys1[j] : UINT_MAX; + j++; + sKeys1[(threadIdx.x << 1) + 1] = (j < numElements) ? keys1[j] : UINT_MAX; + } + else + { + sKeys2[threadIdx.x] = keys[i]; + } + + if (!manualCoalesce) + { + if(threadIdx.x < 16) + { + sOffsets[threadIdx.x] = offsets[threadIdx.x * totalBlocks + blockId]; + sBlockOffsets[threadIdx.x] = blockOffsets[blockId * 16 + threadIdx.x]; + } + __syncthreads(); + + uint radix = (sKeys1[threadIdx.x] >> startbit) & 0xF; + uint globalOffset = sOffsets[radix] + threadIdx.x - sBlockOffsets[radix]; + + if (fullBlocks || globalOffset < numElements) + { + outKeys[globalOffset] = floatUnflip(sKeys1[threadIdx.x]); + } + + radix = (sKeys1[threadIdx.x + SORT_CTA_SIZE] >> startbit) & 0xF; + globalOffset = sOffsets[radix] + threadIdx.x + SORT_CTA_SIZE - sBlockOffsets[radix]; + + if (fullBlocks || globalOffset < numElements) + { + outKeys[globalOffset] = floatUnflip(sKeys1[threadIdx.x + SORT_CTA_SIZE]); + } + } + else + { + __shared__ uint sSizes[16]; + + if(threadIdx.x < 16) + { + sOffsets[threadIdx.x] = offsets[threadIdx.x * totalBlocks + blockId]; + sBlockOffsets[threadIdx.x] = blockOffsets[blockId * 16 + threadIdx.x]; + sSizes[threadIdx.x] = sizes[threadIdx.x * totalBlocks + blockId]; + } + __syncthreads(); + + // 1 half-warp is responsible for writing out all values for 1 radix. + // Loops if there are more than 16 values to be written out. + // All start indices are rounded down to the nearest multiple of 16, and + // all end indices are rounded up to the nearest multiple of 16. + // Thus it can do extra work if the start and end indices are not multiples of 16 + // This is bounded by a factor of 2 (it can do 2X more work at most). 
+ + const uint halfWarpID = threadIdx.x >> 4; + + const uint halfWarpOffset = threadIdx.x & 0xF; + const uint leadingInvalid = sOffsets[halfWarpID] & 0xF; + + uint startPos = sOffsets[halfWarpID] & 0xFFFFFFF0; + uint endPos = (sOffsets[halfWarpID] + sSizes[halfWarpID]) + 15 - + ((sOffsets[halfWarpID] + sSizes[halfWarpID] - 1) & 0xF); + uint numIterations = endPos - startPos; + + uint outOffset = startPos + halfWarpOffset; + uint inOffset = sBlockOffsets[halfWarpID] - leadingInvalid + halfWarpOffset; + + for(uint j = 0; j < numIterations; j += 16, outOffset += 16, inOffset += 16) + { + if( (outOffset >= sOffsets[halfWarpID]) && + (inOffset - sBlockOffsets[halfWarpID] < sSizes[halfWarpID])) + { + if(blockId < totalBlocks - 1 || outOffset < numElements) + { + outKeys[outOffset] = floatUnflip(sKeys1[inOffset]); + } + } + } + } + + if (loop) + { + blockId += gridDim.x; + __syncthreads(); + } + else + break; + } +} + +/** @} */ // end radixsort functions +/** @} */ // end cudpp_kernel diff --git a/lib/gpu/cudpp_mini/kernel/scan_kernel.cu b/lib/gpu/cudpp_mini/kernel/scan_kernel.cu new file mode 100644 index 0000000000..966634c89b --- /dev/null +++ b/lib/gpu/cudpp_mini/kernel/scan_kernel.cu @@ -0,0 +1,113 @@ +// ------------------------------------------------------------- +// cuDPP -- CUDA Data Parallel Primitives library +// ------------------------------------------------------------- +// $Revision: 5633 $ +// $Date: 2009-07-01 15:02:51 +1000 (Wed, 01 Jul 2009) $ +// ------------------------------------------------------------- +// This source code is distributed under the terms of license.txt +// in the root directory of this source distribution. +// ------------------------------------------------------------- + +/** + * @file + * scan_kernel.cu + * + * @brief CUDPP kernel-level scan routines + */ + +/** \defgroup cudpp_kernel CUDPP Kernel-Level API + * The CUDPP Kernel-Level API contains functions that run on the GPU + * device across a grid of Cooperative Thread Array (CTA, aka Thread + * Block). These kernels are declared \c __global__ so that they + * must be invoked from host (CPU) code. They generally invoke GPU + * \c __device__ routines in the CUDPP \link cudpp_cta CTA-Level API\endlink. + * Kernel-Level API functions are used by CUDPP + * \link cudpp_app Application-Level\endlink functions to implement their + * functionality. + * @{ + */ + +/** @name Scan Functions +* @{ +*/ + +#include +#include "cta/scan_cta.cu" +#include "sharedmem.h" + +/** + * @brief Main scan kernel + * + * This __global__ device function performs one level of a multiblock scan on + * an arbitrary-dimensioned array in \a d_in, returning the result in \a d_out + * (which may point to the same array). The same function may be used for + * single or multi-row scans. To perform a multirow scan, pass the width of + * each row of the input row (in elements) in \a dataRowPitch, and the width of + * the rows of \a d_blockSums (in elements) in \a blockSumRowPitch, and invoke + * with a thread block grid with height greater than 1. + * + * This function peforms one level of a recursive, multiblock scan. At the + * app level, this function is called by cudppScan and cudppMultiScan and used + * in combination with vectorAddUniform4() to produce a complete scan. + * + * Template parameter \a T is the datatype of the array to be scanned. 
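
As a plain-C reference for what one level of this kernel computes in its default configuration (forward, exclusive scan), the sketch below scans each block independently and records the block totals; the ScanTraits template parameter selects the other variants (inclusive, backward, multi-row). This is an illustrative sketch, not the CUDPP code.

    // Serial reference for one scan level: per-block exclusive scan plus the
    // per-block sums that the next level will scan.
    void scanBlocksReference(float* out, const float* in, float* blockSums,
                             int numElements, int elemsPerBlock)
    {
        for (int b = 0; b * elemsPerBlock < numElements; ++b) {
            int   end     = b * elemsPerBlock + elemsPerBlock;
            float running = 0.0f;
            if (end > numElements) end = numElements;

            for (int i = b * elemsPerBlock; i < end; ++i) {
                float v  = in[i];    // read first so in == out also works
                out[i]   = running;  // exclusive: sum of elements before i
                running += v;
            }
            blockSums[b] = running;  // total of this block
        }
    }
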
+ * Template parameter \a traits is the ScanTraits struct containing + * compile-time options for the scan, such as whether it is forward or + * backward, exclusive or inclusive, multi- or single-row, etc. + * + * @param[out] d_out The output (scanned) array + * @param[in] d_in The input array to be scanned + * @param[out] d_blockSums The array of per-block sums + * @param[in] numElements The number of elements to scan + * @param[in] dataRowPitch The width of each row of \a d_in in elements + * (for multi-row scans) + * @param[in] blockSumRowPitch The with of each row of \a d_blockSums in elements + * (for multi-row scans) + */ +template +__global__ void scan4(T *d_out, + const T *d_in, + T *d_blockSums, + int numElements, + unsigned int dataRowPitch, + unsigned int blockSumRowPitch) +{ + SharedMemory smem; + T* temp = smem.getPointer(); + + int devOffset, ai, bi, aiDev, biDev; + T threadScan0[4], threadScan1[4]; + + unsigned int blockN = numElements; + unsigned int blockSumIndex = blockIdx.x; + + if (traits::isMultiRow()) + { + //int width = __mul24(gridDim.x, blockDim.x) << 1; + int yIndex = __umul24(blockDim.y, blockIdx.y) + threadIdx.y; + devOffset = __umul24(dataRowPitch, yIndex); + blockN += (devOffset << 2); + devOffset += __umul24(blockIdx.x, blockDim.x << 1); + blockSumIndex += __umul24(blockSumRowPitch << 2, yIndex) ; + } + else + { + devOffset = __umul24(blockIdx.x, (blockDim.x << 1)); + } + + // load data into shared memory + loadSharedChunkFromMem4 + (temp, threadScan0, threadScan1, d_in, + blockN, devOffset, ai, bi, aiDev, biDev); + + scanCTA(temp, d_blockSums, blockSumIndex); + + // write results to device memory + storeSharedChunkToMem4 + (d_out, threadScan0, threadScan1, temp, + blockN, devOffset, ai, bi, aiDev, biDev); + +} + +/** @} */ // end scan functions +/** @} */ // end cudpp_kernel diff --git a/lib/gpu/cudpp_mini/kernel/vector_kernel.cu b/lib/gpu/cudpp_mini/kernel/vector_kernel.cu new file mode 100644 index 0000000000..2da1de254a --- /dev/null +++ b/lib/gpu/cudpp_mini/kernel/vector_kernel.cu @@ -0,0 +1,469 @@ +// ------------------------------------------------------------- +// CUDPP -- CUDA Data Parallel Primitives library +// ------------------------------------------------------------- +// $Revision: 5632 $ +// $Date: 2009-07-01 14:36:01 +1000 (Wed, 01 Jul 2009) $ +// ------------------------------------------------------------- +// This source code is distributed under the terms of license.txt in +// the root directory of this source distribution. +// ------------------------------------------------------------- + +/** + * @file + * vector_kernel.cu + * + * @brief CUDA kernel methods for basic operations on vectors. + * + * CUDA kernel methods for basic operations on vectors. + * + * Examples: + * - vectorAddConstant(): d_vector + constant + * - vectorAddUniform(): d_vector + uniform (per-block constants) + * - vectorAddVectorVector(): d_vector + d_vector + */ + +// MJH: these functions assume there are 2N elements for N threads. +// Is this always going to be a good idea? There may be cases where +// we have as many threads as elements, but for large problems +// we are probably limited by max CTA size for simple kernels like +// this so we should process multiple elements per thread. +// we may want to extend these with looping versions that process +// many elements per thread. 
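
The comment above notes that scan4() is combined with vectorAddUniform4() to build a complete scan. The serial sketch below (using the scanBlocksReference sketch given earlier, with malloc/free from <stdlib.h>) shows that recursion in miniature: scan the blocks, recursively scan the block sums, then add each scanned sum back into its block, which is the role vectorAddUniform4() plays on the device.

    // Serial analogue of the recursive multiblock scan (sketch only).
    void recursiveScanReference(float* data, int n, int elemsPerBlock)
    {
        int    numBlocks = (n + elemsPerBlock - 1) / elemsPerBlock;
        float* blockSums = (float*)malloc(numBlocks * sizeof(float));

        scanBlocksReference(data, data, blockSums, n, elemsPerBlock);
        if (numBlocks > 1) {
            // scan the block sums themselves, then distribute them back:
            // this is what vectorAddUniform4() does in the GPU version
            recursiveScanReference(blockSums, numBlocks, elemsPerBlock);
            for (int i = 0; i < n; ++i)
                data[i] += blockSums[i / elemsPerBlock];
        }
        free(blockSums);
    }
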
+ +#include "cudpp_util.h" +#include "sharedmem.h" +#include "cudpp.h" + +/** \addtogroup cudpp_kernel + * @{ + */ + +/** @name Vector Functions + * CUDA kernel methods for basic operations on vectors. + * @{ + */ + +/** @brief Adds a constant value to all values in the input d_vector + * + * Each thread adds two pairs of elements. + * @todo Test this function -- it is currently not yet used. + * + * @param[in,out] d_vector The array of elements to be modified + * @param[in] constant The constant value to be added to elements of + * \a d_vector + * @param[in] n The number of elements in the d_vector to be modified + * @param[in] baseIndex An optional offset to the beginning of the + * elements in the input array to be processed + */ +template +__global__ void vectorAddConstant(T *d_vector, + T constant, + int n, + int baseIndex) +{ + // Compute this thread's output address + unsigned int address = baseIndex + threadIdx.x + + __mul24(blockIdx.x, (blockDim.x << 1)); + + // note two adds per thread: one in first half of the block, one in last + d_vector[address] += constant; + d_vector[address + blockDim.x] += (threadIdx.x + blockDim.x < n) * constant; +} + + /** @brief Add a uniform value to each data element of an array + * + * This function reads one value per CTA from \a d_uniforms into shared + * memory and adds that value to all values "owned" by the CTA in \a + * d_vector. Each thread adds two pairs of values. + * + * @param[out] d_vector The d_vector whose values will have the uniform added + * @param[in] d_uniforms The array of uniform values (one per CTA) + * @param[in] numElements The number of elements in \a d_vector to process + * @param[in] blockOffset an optional offset to the beginning of this block's + * data. + * @param[in] baseIndex an optional offset to the beginning of the array + * within \a d_vector. + */ +template +__global__ void vectorAddUniform(T *d_vector, + const T *d_uniforms, + int numElements, + int blockOffset, + int baseIndex) +{ + __shared__ T uni; + // Get this block's uniform value from the uniform array in device memory + // We store it in shared memory so that the hardware's shared memory + // broadcast capability can be used to share among all threads in each warp + // in a single cycle + if (threadIdx.x == 0) + { + uni = d_uniforms[blockIdx.x + __mul24(gridDim.x, blockIdx.y) + blockOffset]; + } + + // Compute this thread's output address + int width = __mul24(gridDim.x,(blockDim.x << 1)); + + unsigned int address = baseIndex + __mul24(width, blockIdx.y) + + threadIdx.x + __mul24(blockIdx.x, (blockDim.x << 1)); + + __syncthreads(); + + // note two adds per thread: one in first half of the block, one in last + d_vector[address] += uni; + if (threadIdx.x + blockDim.x < numElements) d_vector[address + blockDim.x] += uni; +} + + +/** @brief Add a uniform value to each data element of an array (vec4 version) + * + * This function reads one value per CTA from \a d_uniforms into shared + * memory and adds that value to all values "owned" by the CTA in \a d_vector. + * Each thread adds the uniform value to eight values in \a d_vector. + * + * @param[out] d_vector The d_vector whose values will have the uniform added + * @param[in] d_uniforms The array of uniform values (one per CTA) + * @param[in] numElements The number of elements in \a d_vector to process + * @param[in] vectorRowPitch For 2D arrays, the pitch (in elements) of the + * rows of \a d_vector. + * @param[in] uniformRowPitch For 2D arrays, the pitch (in elements) of the + * rows of \a d_uniforms. 
+ * @param[in] blockOffset an optional offset to the beginning of this block's + * data. + * @param[in] baseIndex an optional offset to the beginning of the array + * within \a d_vector. + */ +template +__global__ void vectorAddUniform4(T *d_vector, + const T *d_uniforms, + int numElements, + int vectorRowPitch, // width of input array in elements + int uniformRowPitch, // width of uniform array in elements + int blockOffset, + int baseIndex) +{ + __shared__ T uni; + // Get this block's uniform value from the uniform array in device memory + // We store it in shared memory so that the hardware's shared memory + // broadcast capability can be used to share among all threads in each warp + // in a single cycle + if (threadIdx.x == 0) + { + uni = d_uniforms[blockIdx.x + __umul24(uniformRowPitch, blockIdx.y) + blockOffset]; + } + + // Compute this thread's output address + //int width = __mul24(gridDim.x,(blockDim.x << 1)); + + unsigned int address = baseIndex + __umul24(vectorRowPitch, blockIdx.y) + + threadIdx.x + __umul24(blockIdx.x, (blockDim.x * elementsPerThread)); + numElements += __umul24(vectorRowPitch, blockIdx.y); + + __syncthreads(); + + switch (op) + { + case CUDPP_ADD: + for (int i = 0; i < elementsPerThread && address < numElements; i++) + { + d_vector[address] += uni; + address += blockDim.x; + } + break; + + case CUDPP_MULTIPLY: + for (int i = 0; i < elementsPerThread && address < numElements; i++) + { + d_vector[address] *= uni; + address += blockDim.x; + } + break; + + case CUDPP_MAX: + for (int i = 0; i < elementsPerThread && address < numElements; i++) + { + d_vector[address] = max(d_vector[address], uni); + address += blockDim.x; + } + break; + + case CUDPP_MIN: + for (int i = 0; i < elementsPerThread && address < numElements; i++) + { + d_vector[address] = min(d_vector[address], uni); + address += blockDim.x; + } + break; + default: + break; + } +} + +/** @brief Adds together two vectors + * + * Each thread adds two pairs of elements. + * @todo Test this function -- it is currently not yet used. + * + * @param[out] d_vectorA The left operand array and the result + * @param[in] d_vectorB The right operand array + * @param[in] numElements The number of elements in the vectors to be added. + * @param[in] baseIndex An optional offset to the beginning of the + * elements in the input arrays to be processed + */ +template +__global__ void vectorAddVector(T *d_vectorA, // A += B + const T *d_vectorB, + int numElements, + int baseIndex) +{ + // Compute this thread's output address + unsigned int address = baseIndex + threadIdx.x + + __mul24(blockIdx.x, (blockDim.x << 1)); + + // note two adds per thread: one in first half of the block, one in last + d_vectorA[address] += d_vectorB[address]; + d_vectorA[address + blockDim.x] += + (threadIdx.x + blockDim.x < numElements) * d_vectorB[address]; +} + +/** @brief Add a uniform value to data elements of an array (vec4 version) + * + * This function reads one value per CTA from \a d_uniforms into shared + * memory and adds that value to values "owned" by the CTA in \a d_vector. + * The uniform value is added to only those values "owned" by the CTA which + * have an index less than d_maxIndex. If d_maxIndex for that CTA is UINT_MAX + * it adds the uniform to all values "owned" by the CTA. + * Each thread adds the uniform value to eight values in \a d_vector. 
+ * + * @param[out] d_vector The d_vector whose values will have the uniform added + * @param[in] d_uniforms The array of uniform values (one per CTA) + * @param[in] d_maxIndices The array of maximum indices (one per CTA). This is + * index upto which the uniform would be added. If this is UINT_MAX + * the uniform is added to all elements of the CTA. This index is + * 1-based. + * @param[in] numElements The number of elements in \a d_vector to process + * @param[in] blockOffset an optional offset to the beginning of this block's + * data. + * @param[in] baseIndex an optional offset to the beginning of the array + * within \a d_vector. + */ +template +__global__ void vectorSegmentedAddUniform4(T *d_vector, + const T *d_uniforms, + const unsigned int *d_maxIndices, + unsigned int numElements, + int blockOffset, + int baseIndex) +{ + __shared__ T uni[2]; + + unsigned int blockAddress = + blockIdx.x + __mul24(gridDim.x, blockIdx.y) + blockOffset; + + // Get this block's uniform value from the uniform array in device memory + // We store it in shared memory so that the hardware's shared memory + // broadcast capability can be used to share among all threads in each warp + // in a single cycle + + if (threadIdx.x == 0) + { + if (blockAddress > 0) + uni[0] = d_uniforms[blockAddress-1]; + else + uni[0] = Operator::identity(); + + // Tacit assumption that T is four-byte wide + uni[1] = (T)(d_maxIndices[blockAddress]); + } + + // Compute this thread's output address + int width = __mul24(gridDim.x,(blockDim.x << 1)); + + unsigned int address = baseIndex + __mul24(width, blockIdx.y) + + threadIdx.x + __mul24(blockIdx.x, (blockDim.x << 3)); + + __syncthreads(); + + unsigned int maxIndex = (unsigned int)(uni[1]); + + bool isLastBlock = (blockIdx.x == (gridDim.x-1)); + + if (maxIndex < UINT_MAX) + { + // Since maxIndex is a 1 based index + --maxIndex; + bool leftLess = address < maxIndex; + bool rightLess = (address + 7 * blockDim.x) < maxIndex; + + if (leftLess) + { + if (rightLess) + { + for (unsigned int i = 0; i < 8; ++i) + d_vector[address + i * blockDim.x] = + Operator::op(d_vector[address + i * blockDim.x], uni[0]); + } + else + { + for (unsigned int i=0; i < 8; ++i) + { + if (address < maxIndex) + d_vector[address] = + Operator::op(d_vector[address], uni[0]); + + address += blockDim.x; + } + } + } + } + else + { + if (!isLastBlockFull && isLastBlock) + { + for (unsigned int i = 0; i < 8; ++i) + { + if (address < numElements) + d_vector[address] = + Operator::op(d_vector[address], uni[0]); + + address += blockDim.x; + } + } + else + { + for (unsigned int i=0; i<8; ++i) + { + d_vector[address] = + Operator::op(d_vector[address], uni[0]); + + address += blockDim.x; + } + } + } +} + +/** @brief Add a uniform value to data elements of an array (vec4 version) + * + * This function reads one value per CTA from \a d_uniforms into shared + * memory and adds that value to values "owned" by the CTA in \a d_vector. + * The uniform value is added to only those values "owned" by the CTA which + * have an index greater than d_minIndex. If d_minIndex for that CTA is 0 + * it adds the uniform to all values "owned" by the CTA. + * Each thread adds the uniform value to eight values in \a d_vector. + * + * @param[out] d_vector The d_vector whose values will have the uniform added + * @param[in] d_uniforms The array of uniform values (one per CTA) + * @param[in] d_minIndices The array of minimum indices (one per CTA). 
The + * uniform is added to the right of this index (that is, to every index + * that is greater than this index). If this is 0, the uniform is + * added to all elements of the CTA. This index is 1-based to + * prevent overloading of what 0 means. In our case it means + * absence of a flag. But if the first element of a CTA has + * flag the index will also be 0. Hence we use 1-based indices + * so the index is 1 in the latter case. + * @param[in] numElements The number of elements in \a d_vector to process + * @param[in] blockOffset an optional offset to the beginning of this block's + * data. + * @param[in] baseIndex an optional offset to the beginning of the array + * within \a d_vector. + * + */ +template +__global__ void vectorSegmentedAddUniformToRight4(T *d_vector, + const T *d_uniforms, + const unsigned int *d_minIndices, + unsigned int numElements, + int blockOffset, + int baseIndex) +{ + __shared__ T uni[2]; + + unsigned int blockAddress = + blockIdx.x + __mul24(gridDim.x, blockIdx.y) + blockOffset; + + // Get this block's uniform value from the uniform array in device memory + // We store it in shared memory so that the hardware's shared memory + // broadcast capability can be used to share among all threads in each warp + // in a single cycle + + if (threadIdx.x == 0) + { + // FIXME - blockAddress test here is incompatible with how it is calculated + // above + if (blockAddress < (gridDim.x-1)) + uni[0] = d_uniforms[blockAddress+1]; + else + uni[0] = Operator::identity(); + + // Tacit assumption that T is four-byte wide + uni[1] = (T)(d_minIndices[blockAddress]); + } + + // Compute this thread's output address + int width = __mul24(gridDim.x,(blockDim.x << 1)); + + unsigned int address = baseIndex + __mul24(width, blockIdx.y) + + threadIdx.x + __mul24(blockIdx.x, (blockDim.x << 3)); + + __syncthreads(); + + unsigned int minIndex = (unsigned int)(uni[1]); + + bool isLastBlock = (blockIdx.x == (gridDim.x-1)); + + if (minIndex > 0) + { + // Since minIndex is a 1 based index + --minIndex; + bool leftInRange = address > minIndex; + bool rightInRange = (address + 7 * blockDim.x) > minIndex; + + if (rightInRange) + { + if (leftInRange) + { + for (unsigned int i = 0; i < 8; ++i) + d_vector[address + i * blockDim.x] = + Operator::op(d_vector[address + i * blockDim.x], uni[0]); + } + else + { + for (unsigned int i=0; i < 8; ++i) + { + if (address > minIndex) + d_vector[address] = + Operator::op(d_vector[address], uni[0]); + + address += blockDim.x; + } + } + } + } + else + { + if (!isLastBlockFull && isLastBlock) + { + for (unsigned int i = 0; i < 8; ++i) + { + if (address < numElements) + d_vector[address] = + Operator::op(d_vector[address], uni[0]); + + address += blockDim.x; + } + } + else + { + for (unsigned int i=0; i<8; ++i) + { + d_vector[address] = + Operator::op(d_vector[address], uni[0]); + + address += blockDim.x; + } + } + } +} + +/** @} */ // end d_vector functions +/** @} */ // end cudpp_kernel diff --git a/lib/gpu/cudpp_mini/license.txt b/lib/gpu/cudpp_mini/license.txt new file mode 100644 index 0000000000..4a14588e1d --- /dev/null +++ b/lib/gpu/cudpp_mini/license.txt @@ -0,0 +1,25 @@ +Copyright (c) 2007-2010 The Regents of the University of California, Davis +campus ("The Regents") and NVIDIA Corporation ("NVIDIA"). All rights reserved. 
+ +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + * Neither the name of the The Regents, nor NVIDIA, nor the names of its + contributors may be used to endorse or promote products derived from this + software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. +IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, +INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE +OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF +ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/lib/gpu/cudpp_mini/radixsort_app.cu b/lib/gpu/cudpp_mini/radixsort_app.cu new file mode 100644 index 0000000000..966fe609e7 --- /dev/null +++ b/lib/gpu/cudpp_mini/radixsort_app.cu @@ -0,0 +1,993 @@ +// ------------------------------------------------------------- +// CUDPP -- CUDA Data Parallel Primitives library +// ------------------------------------------------------------- +// $Revision$ +// $Date$ +// ------------------------------------------------------------- +// This source code is distributed under the terms of license.txt +// in the root directory of this source distribution. +// ------------------------------------------------------------- + +/** + * @file + * radixsort_app.cu + * + * @brief CUDPP application-level radix sorting routines + */ + +/** @addtogroup cudpp_app + * @{ + */ + +/** @name RadixSort Functions + * @{ + */ + + +#include "cudpp.h" +#include "cudpp_util.h" +#include "cudpp_radixsort.h" +#include "cudpp_scan.h" +#include "kernel/radixsort_kernel.cu" + +#include +#include +#include +#include + +typedef unsigned int uint; + +/** @brief Perform one step of the radix sort. Sorts by nbits key bits per step, +* starting at startbit. +* +* Uses cudppScanDispatch() for the prefix sum of radix counters. +* +* @param[in,out] keys Keys to be sorted. +* @param[in,out] values Associated values to be sorted (through keys). +* @param[in] plan Configuration information for RadixSort. +* @param[in] numElements Number of elements in the sort. +**/ +template +void radixSortStep(uint *keys, + uint *values, + const CUDPPRadixSortPlan *plan, + uint numElements) +{ + const uint eltsPerBlock = SORT_CTA_SIZE * 4; + const uint eltsPerBlock2 = SORT_CTA_SIZE * 2; + + bool fullBlocks = ((numElements % eltsPerBlock) == 0); + uint numBlocks = (fullBlocks) ? + (numElements / eltsPerBlock) : + (numElements / eltsPerBlock + 1); + uint numBlocks2 = ((numElements % eltsPerBlock2) == 0) ? + (numElements / eltsPerBlock2) : + (numElements / eltsPerBlock2 + 1); + + bool loop = numBlocks > 65535; + uint blocks = loop ? 
65535 : numBlocks; + uint blocksFind = loop ? 65535 : numBlocks2; + uint blocksReorder = loop ? 65535 : numBlocks2; + + uint threshold = fullBlocks ? plan->m_persistentCTAThresholdFullBlocks[0] : plan->m_persistentCTAThreshold[0]; + + bool persist = plan->m_bUsePersistentCTAs && (numElements >= threshold); + + if (persist) + { + loop = (numElements > 262144) || (numElements >= 32768 && numElements < 65536); + + blocks = numBlocks; + blocksFind = numBlocks2; + blocksReorder = numBlocks2; + + // Run an empty kernel -- this seems to reset some of the CTA scheduling hardware + // on GT200, resulting in better scheduling and lower run times + if (startbit > 0) + { + emptyKernel<<>>(); + } + } + + if (fullBlocks) + { + if (loop) + { + if (persist) + { + blocks = flip? numCTAs(radixSortBlocks<4, 0, true, true, true>) : + numCTAs(radixSortBlocks<4, 0, true, false, true>); + } + + radixSortBlocks + <<>> + ((uint4*)plan->m_tempKeys, (uint4*)plan->m_tempValues, (uint4*)keys, (uint4*)values, numElements, numBlocks); + } + else + { + radixSortBlocks + <<>> + ((uint4*)plan->m_tempKeys, (uint4*)plan->m_tempValues, (uint4*)keys, (uint4*)values, numElements, numBlocks); + } + } + else + { + if (loop) + { + if (persist) + { + blocks = flip ? numCTAs(radixSortBlocks<4, 0, false, true, true>) : + numCTAs(radixSortBlocks<4, 0, false, false, true>); + } + + radixSortBlocks + <<>> + ((uint4*)plan->m_tempKeys, (uint4*)plan->m_tempValues, (uint4*)keys, (uint4*)values, numElements, numBlocks); + } + else + { + radixSortBlocks + <<>> + ((uint4*)plan->m_tempKeys, (uint4*)plan->m_tempValues, (uint4*)keys, (uint4*)values, numElements, numBlocks); + } + } + + CUT_CHECK_ERROR("radixSortBlocks"); + + if (fullBlocks) + { + if (loop) + { + if (persist) + { + blocksFind = numCTAs(findRadixOffsets<0, true, true>); + } + findRadixOffsets + <<>> + ((uint2*)plan->m_tempKeys, plan->m_counters, plan->m_blockOffsets, numElements, numBlocks2); + } + else + { + findRadixOffsets + <<>> + ((uint2*)plan->m_tempKeys, plan->m_counters, plan->m_blockOffsets, numElements, numBlocks2); + } + } + else + { + if (loop) + { + if (persist) + { + blocksFind = numCTAs(findRadixOffsets<0, false, true>); + } + findRadixOffsets + <<>> + ((uint2*)plan->m_tempKeys, plan->m_counters, plan->m_blockOffsets, numElements, numBlocks2); + } + else + { + findRadixOffsets + <<>> + ((uint2*)plan->m_tempKeys, plan->m_counters, plan->m_blockOffsets, numElements, numBlocks2); + } + } + + CUT_CHECK_ERROR("findRadixOffsets"); + + cudppScanDispatch(plan->m_countersSum, plan->m_counters, 16*numBlocks2, 1, plan->m_scanPlan); + + if (fullBlocks) + { + if (plan->m_bManualCoalesce) + { + if (loop) + { + if (persist) + { + blocksReorder = unflip ? numCTAs(reorderData<0, true, true, true, true>) : + numCTAs(reorderData<0, true, true, false, true>); + } + reorderData + <<>> + (keys, values, (uint2*)plan->m_tempKeys, (uint2*)plan->m_tempValues, + plan->m_blockOffsets, plan->m_countersSum, plan->m_counters, numElements, numBlocks2); + } + else + { + reorderData + <<>> + (keys, values, (uint2*)plan->m_tempKeys, (uint2*)plan->m_tempValues, + plan->m_blockOffsets, plan->m_countersSum, plan->m_counters, numElements, numBlocks2); + } + } + else + { + if (loop) + { + if (persist) + { + blocksReorder = unflip ? 
numCTAs(reorderData<0, true, false, true, true>) : + numCTAs(reorderData<0, true, false, false, true>); + } + reorderData + <<>> + (keys, values, (uint2*)plan->m_tempKeys, (uint2*)plan->m_tempValues, + plan->m_blockOffsets, plan->m_countersSum, plan->m_counters, numElements, numBlocks2); + } + else + { + reorderData + <<>> + (keys, values, (uint2*)plan->m_tempKeys, (uint2*)plan->m_tempValues, + plan->m_blockOffsets, plan->m_countersSum, plan->m_counters, numElements, numBlocks2); + } + } + } + else + { + if (plan->m_bManualCoalesce) + { + if (loop) + { + if (persist) + { + blocksReorder = unflip ? + numCTAs(reorderData<0, false, true, true, true>) : + numCTAs(reorderData<0, false, true, false, true>); + } + reorderData + <<>> + (keys, values, (uint2*)plan->m_tempKeys, (uint2*)plan->m_tempValues, + plan->m_blockOffsets, plan->m_countersSum, plan->m_counters, numElements, numBlocks2); + } + else + { + reorderData + <<>> + (keys, values, (uint2*)plan->m_tempKeys, (uint2*)plan->m_tempValues, + plan->m_blockOffsets, plan->m_countersSum, plan->m_counters, numElements, numBlocks2); + } + } + else + { + if (loop) + { + if (persist) + { + blocksReorder = unflip ? + numCTAs(reorderData<0, false, false, true, true>) : + numCTAs(reorderData<0, false, false, false, true>); + } + reorderData + <<>> + (keys, values, (uint2*)plan->m_tempKeys, (uint2*)plan->m_tempValues, + plan->m_blockOffsets, plan->m_countersSum, plan->m_counters, numElements, numBlocks2); + } + else + { + reorderData + <<>> + (keys, values, (uint2*)plan->m_tempKeys, (uint2*)plan->m_tempValues, + plan->m_blockOffsets, plan->m_countersSum, plan->m_counters, numElements, numBlocks2); + } + } + } + + CUT_CHECK_ERROR("radixSortStep"); +} + +/** + * @brief Single-block optimization for sorts of fewer than 4 * CTA_SIZE elements + * + * @param[in,out] keys Keys to be sorted. + * @param[in,out] values Associated values to be sorted (through keys). + * @param numElements Number of elements in the sort. +**/ +template +void radixSortSingleBlock(uint *keys, + uint *values, + uint numElements) +{ + bool fullBlocks = (numElements % (SORT_CTA_SIZE * 4) == 0); + if (fullBlocks) + { + radixSortBlocks<32, 0, true, flip, false> + <<<1, SORT_CTA_SIZE, 4 * SORT_CTA_SIZE * sizeof(uint)>>> + ((uint4*)keys, (uint4*)values, + (uint4*)keys, (uint4*)values, + numElements, 0); + } + else + { + radixSortBlocks<32, 0, false, flip, false> + <<<1, SORT_CTA_SIZE, 4 * SORT_CTA_SIZE * sizeof(uint)>>> + ((uint4*)keys, (uint4*)values, + (uint4*)keys, (uint4*)values, + numElements, 0); + } + + if (flip) unflipFloats<<<1, SORT_CTA_SIZE>>>(keys, numElements); + + CUT_CHECK_ERROR("radixSortSingleBlock"); +} + +/** + * @brief Main radix sort function + * + * Main radix sort function. Sorts in place in the keys and values arrays, + * but uses the other device arrays as temporary storage. All pointer + * parameters are device pointers. Uses cudppScan() for the prefix sum of + * radix counters. + * + * @param[in,out] keys Keys to be sorted. + * @param[in,out] values Associated values to be sorted (through keys). + * @param[in] plan Configuration information for RadixSort. + * @param[in] numElements Number of elements in the sort. + * @param[in] flipBits Is set true if key datatype is a float + * (neg. numbers) for special float sorting operations. 
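The flip/unflip template flags above support sorting IEEE-754 float keys as unsigned integers. As a hedged host-side sketch of the underlying remapping (illustrative only, not taken from this patch; the device code applies the equivalent transformation before the first pass and undoes it, e.g. via unflipFloats, after the last): positive floats have their sign bit set and negative floats are bitwise inverted, so that unsigned integer order matches float order.

#include <cstdint>
#include <cstring>

// Map a float to a uint32_t whose unsigned ordering matches the float ordering.
static inline uint32_t floatFlip(float f) {
    uint32_t u;
    std::memcpy(&u, &f, sizeof(u));
    return u ^ ((u & 0x80000000u) ? 0xffffffffu : 0x80000000u);
}

// Inverse remapping, applied once the final sorting pass has run.
static inline float floatUnflip(uint32_t u) {
    u ^= ((u & 0x80000000u) ? 0x80000000u : 0xffffffffu);
    float f;
    std::memcpy(&f, &u, sizeof(f));
    return f;
}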
+ * @param[in] keyBits Number of interesting bits in the key + **/ +void radixSort(uint *keys, + uint* values, + const CUDPPRadixSortPlan *plan, + size_t numElements, + bool flipBits, + int keyBits) +{ + if(numElements <= WARP_SIZE) + { + if (flipBits) + radixSortSingleWarp<<<1, numElements>>> + (keys, values, numElements); + else + radixSortSingleWarp<<<1, numElements>>> + (keys, values, numElements); + + CUT_CHECK_ERROR("radixSortSingleWarp"); + return; + } +#ifdef __DEVICE_EMULATION__ + printf("bits: %d\n", keyBits); +#endif + + if(numElements <= SORT_CTA_SIZE * 4) + { + if (flipBits) + radixSortSingleBlock(keys, values, numElements); + else + radixSortSingleBlock(keys, values, numElements); + return; + } + + // flip float bits on the first pass, unflip on the last pass + if (flipBits) + { + radixSortStep<4, 0, true, false> + (keys, values, plan, numElements); + } + else + { + radixSortStep<4, 0, false, false> + (keys, values, plan, numElements); + } + + if (keyBits > 4) + { + radixSortStep<4, 4, false, false> + (keys, values, plan, numElements); + } + if (keyBits > 8) + { + radixSortStep<4, 8, false, false> + (keys, values, plan, numElements); + } + if (keyBits > 12) + { + radixSortStep<4, 12, false, false> + (keys, values, plan, numElements); + } + if (keyBits > 16) + { + radixSortStep<4, 16, false, false> + (keys, values, plan, numElements); + } + if (keyBits > 20) + { + radixSortStep<4, 20, false, false> + (keys, values, plan, numElements); + } + if (keyBits > 24) + { + radixSortStep<4, 24, false, false> + (keys, values, plan, numElements); + } + if (keyBits > 28) + { + if (flipBits) // last pass + { + radixSortStep<4, 28, false, true> + (keys, values, plan, numElements); + } + else + { + radixSortStep<4, 28, false, false> + (keys, values, plan, numElements); + } + } +} + +/** + * @brief Wrapper to call main radix sort function. For float configuration. + * + * Calls the main radix sort function. For float configuration. + * + * @param[in,out] keys Keys to be sorted. + * @param[in,out] values Associated values to be sorted (through keys). + * @param[in] plan Configuration information for RadixSort. + * @param[in] numElements Number of elements in the sort. + * @param[in] negativeKeys Is set true if key datatype has neg. numbers. + * @param[in] keyBits Number of interesting bits in the key + **/ +extern "C" +void radixSortFloatKeys(float* keys, + uint* values, + const CUDPPRadixSortPlan *plan, + size_t numElements, + bool negativeKeys, + int keyBits) +{ + + radixSort((uint*)keys, (uint*)values, plan, + numElements, negativeKeys, keyBits); +} + +/** @brief Perform one step of the radix sort. Sorts by nbits key bits per step, + * starting at startbit. + * + * @param[in,out] keys Keys to be sorted. + * @param[in] plan Configuration information for RadixSort. + * @param[in] numElements Number of elements in the sort. +**/ +template +void radixSortStepKeysOnly(uint *keys, + const CUDPPRadixSortPlan *plan, + uint numElements) +{ + const uint eltsPerBlock = SORT_CTA_SIZE * 4; + const uint eltsPerBlock2 = SORT_CTA_SIZE * 2; + + bool fullBlocks = ((numElements % eltsPerBlock) == 0); + uint numBlocks = (fullBlocks) ? + (numElements / eltsPerBlock) : + (numElements / eltsPerBlock + 1); + uint numBlocks2 = ((numElements % eltsPerBlock2) == 0) ? + (numElements / eltsPerBlock2) : + (numElements / eltsPerBlock2 + 1); + + bool loop = numBlocks > 65535; + + uint blocks = loop ? 65535 : numBlocks; + uint blocksFind = loop ? 65535 : numBlocks2; + uint blocksReorder = loop ? 
65535 : numBlocks2; + + uint threshold = fullBlocks ? plan->m_persistentCTAThresholdFullBlocks[1] : plan->m_persistentCTAThreshold[1]; + + bool persist = plan->m_bUsePersistentCTAs && (numElements >= threshold); + + if (persist) + { + loop = (numElements > 262144) || (numElements >= 32768 && numElements < 65536); + + blocks = numBlocks; + blocksFind = numBlocks2; + blocksReorder = numBlocks2; + } + + if (fullBlocks) + { + if (loop) + { + if (persist) + { + blocks = flip ? numCTAs(radixSortBlocksKeysOnly<4, 0, true, true, true>) : + numCTAs(radixSortBlocksKeysOnly<4, 0, true, false, true>); + } + + radixSortBlocksKeysOnly + <<>> + ((uint4*)plan->m_tempKeys, (uint4*)keys, numElements, numBlocks); + } + else + radixSortBlocksKeysOnly + <<>> + ((uint4*)plan->m_tempKeys, (uint4*)keys, numElements, numBlocks); + } + else + { + if (loop) + { + if (persist) + { + blocks = flip ? numCTAs(radixSortBlocksKeysOnly<4, 0, false, true, true>) : + numCTAs(radixSortBlocksKeysOnly<4, 0, false, false, true>); + } + + radixSortBlocksKeysOnly + <<>> + ((uint4*)plan->m_tempKeys, (uint4*)keys, numElements, numBlocks); + } + else + radixSortBlocksKeysOnly + <<>> + ((uint4*)plan->m_tempKeys, (uint4*)keys, numElements, numBlocks); + + } + + if (fullBlocks) + { + if (loop) + { + if (persist) + { + blocksFind = numCTAs(findRadixOffsets<0, true, true>); + } + findRadixOffsets + <<>> + ((uint2*)plan->m_tempKeys, plan->m_counters, plan->m_blockOffsets, numElements, numBlocks2); + } + else + findRadixOffsets + <<>> + ((uint2*)plan->m_tempKeys, plan->m_counters, plan->m_blockOffsets, numElements, numBlocks2); + } + else + { + if (loop) + { + if (persist) + { + blocksFind = numCTAs(findRadixOffsets<0, false, true>); + } + findRadixOffsets + <<>> + ((uint2*)plan->m_tempKeys, plan->m_counters, plan->m_blockOffsets, numElements, numBlocks2); + } + else + findRadixOffsets + <<>> + ((uint2*)plan->m_tempKeys, plan->m_counters, plan->m_blockOffsets, numElements, numBlocks2); + + } + + cudppScanDispatch(plan->m_countersSum, plan->m_counters, 16*numBlocks2, 1, plan->m_scanPlan); + + if (fullBlocks) + { + if (plan->m_bManualCoalesce) + { + if (loop) + { + if (persist) + { + blocksReorder = unflip ? + numCTAs(reorderDataKeysOnly<0, true, true, true, true>) : + numCTAs(reorderDataKeysOnly<0, true, true, false, true>); + } + reorderDataKeysOnly + <<>> + (keys, (uint2*)plan->m_tempKeys, plan->m_blockOffsets, plan->m_countersSum, plan->m_counters, + numElements, numBlocks2); + } + else + reorderDataKeysOnly + <<>> + (keys, (uint2*)plan->m_tempKeys, plan->m_blockOffsets, plan->m_countersSum, plan->m_counters, + numElements, numBlocks2); + } + else + { + if (loop) + { + if (persist) + { + blocksReorder = unflip ? + numCTAs(reorderDataKeysOnly<0, true, false, true, true>) : + numCTAs(reorderDataKeysOnly<0, true, false, false, true>); + } + reorderDataKeysOnly + <<>> + (keys, (uint2*)plan->m_tempKeys, plan->m_blockOffsets, plan->m_countersSum, plan->m_counters, + numElements, numBlocks2); + } + else + reorderDataKeysOnly + <<>> + (keys, (uint2*)plan->m_tempKeys, plan->m_blockOffsets, plan->m_countersSum, plan->m_counters, + numElements, numBlocks2); + } + } + else + { + if (plan->m_bManualCoalesce) + { + if (loop) + { + if (persist) + { + blocksReorder = unflip ? 
+ numCTAs(reorderDataKeysOnly<0, false, true, true, true>) : + numCTAs(reorderDataKeysOnly<0, false, true, false, true>); + } + reorderDataKeysOnly + <<>> + (keys, (uint2*)plan->m_tempKeys, plan->m_blockOffsets, plan->m_countersSum, plan->m_counters, + numElements, numBlocks2); + } + else + reorderDataKeysOnly + <<>> + (keys, (uint2*)plan->m_tempKeys, plan->m_blockOffsets, plan->m_countersSum, plan->m_counters, + numElements, numBlocks2); + } + else + { + if (loop) + { + if (persist) + { + blocksReorder = unflip ? + numCTAs(reorderDataKeysOnly<0, false, false, true, true>) : + numCTAs(reorderDataKeysOnly<0, false, false, false, true>); + } + reorderDataKeysOnly + <<>> + (keys, (uint2*)plan->m_tempKeys, plan->m_blockOffsets, plan->m_countersSum, plan->m_counters, + numElements, numBlocks2); + } + else + reorderDataKeysOnly + <<>> + (keys, (uint2*)plan->m_tempKeys, plan->m_blockOffsets, plan->m_countersSum, plan->m_counters, + numElements, numBlocks2); + } + } + + CUT_CHECK_ERROR("radixSortStepKeysOnly"); +} + +/** + * @brief Optimization for sorts of fewer than 4 * CTA_SIZE elements (keys only). + * + * @param[in,out] keys Keys to be sorted. + * @param numElements Number of elements in the sort. +**/ +template +void radixSortSingleBlockKeysOnly(uint *keys, + uint numElements) +{ + bool fullBlocks = (numElements % (SORT_CTA_SIZE * 4) == 0); + if (fullBlocks) + { + radixSortBlocksKeysOnly<32, 0, true, flip, false> + <<<1, SORT_CTA_SIZE, 4 * SORT_CTA_SIZE * sizeof(uint)>>> + ((uint4*)keys, (uint4*)keys, numElements, 1 ); + } + else + { + radixSortBlocksKeysOnly<32, 0, false, flip, false> + <<<1, SORT_CTA_SIZE, 4 * SORT_CTA_SIZE * sizeof(uint)>>> + ((uint4*)keys, (uint4*)keys, numElements, 1 ); + } + + if (flip) + unflipFloats<<<1, SORT_CTA_SIZE>>>(keys, numElements); + + + CUT_CHECK_ERROR("radixSortSingleBlock"); +} + +/** + * @brief Main radix sort function. For keys only configuration. + * + * Main radix sort function. Sorts in place in the keys array, + * but uses the other device arrays as temporary storage. All pointer + * parameters are device pointers. Uses scan for the prefix sum of + * radix counters. + * + * @param[in,out] keys Keys to be sorted. + * @param[in] plan Configuration information for RadixSort. + * @param[in] flipBits Is set true if key datatype is a float (neg. numbers) + * for special float sorting operations. + * @param[in] numElements Number of elements in the sort. 
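For orientation, the chain of startbit-stepped radixSortStep/radixSortStepKeysOnly instantiations used by the radixSort and radixSortKeysOnly drivers amounts to a least-significant-digit radix sort over keyBits bits in 4-bit digits. A plain CPU reference of that ordering (illustrative only, not part of the patch; the GPU path reaches the same result through per-block sorts, radix-offset counting, a prefix sum, and a scatter):

#include <cstddef>
#include <cstdint>
#include <vector>

// Reference LSD radix sort: one stable counting-sort pass per 4-bit digit,
// least-significant digit first, covering only the low keyBits bits.
void lsdRadixSortReference(std::vector<uint32_t> &keys, int keyBits) {
    std::vector<uint32_t> tmp(keys.size());
    for (int startbit = 0; startbit < keyBits; startbit += 4) {
        std::size_t count[16] = {0};
        for (uint32_t k : keys)
            ++count[(k >> startbit) & 0xF];
        std::size_t offset[16];
        std::size_t running = 0;
        for (int d = 0; d < 16; ++d) {      // exclusive prefix sum of the digit counts
            offset[d] = running;
            running += count[d];
        }
        for (uint32_t k : keys)             // stable scatter by the current digit
            tmp[offset[(k >> startbit) & 0xF]++] = k;
        keys.swap(tmp);
    }
}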
+ * @param[in] keyBits Number of interesting bits in the key +**/ +extern "C" +void radixSortKeysOnly(uint *keys, + const CUDPPRadixSortPlan *plan, + bool flipBits, + size_t numElements, + int keyBits) +{ + + if(numElements <= WARP_SIZE) + { + if (flipBits) + radixSortSingleWarpKeysOnly<<<1, numElements>>>(keys, numElements); + else + radixSortSingleWarpKeysOnly<<<1, numElements>>>(keys, numElements); + return; + } + if(numElements <= SORT_CTA_SIZE * 4) + { + if (flipBits) + radixSortSingleBlockKeysOnly(keys, numElements); + else + radixSortSingleBlockKeysOnly(keys, numElements); + return; + } + + // flip float bits on the first pass, unflip on the last pass + if (flipBits) + { + radixSortStepKeysOnly<4, 0, true, false>(keys, plan, numElements); + } + else + { + radixSortStepKeysOnly<4, 0, false, false>(keys, plan, numElements); + } + + if (keyBits > 4) + { + radixSortStepKeysOnly<4, 4, false, false>(keys, plan, numElements); + } + if (keyBits > 8) + { + radixSortStepKeysOnly<4, 8, false, false>(keys, plan, numElements); + } + if (keyBits > 12) + { + radixSortStepKeysOnly<4, 12, false, false>(keys, plan, numElements); + } + if (keyBits > 16) + { + radixSortStepKeysOnly<4, 16, false, false>(keys, plan, numElements); + } + if (keyBits > 20) + { + radixSortStepKeysOnly<4, 20, false, false>(keys, plan, numElements); + } + if (keyBits > 24) + { + radixSortStepKeysOnly<4, 24, false, false>(keys, plan, numElements); + } + if (keyBits > 28) + { + if (flipBits) // last pass + { + radixSortStepKeysOnly<4, 28, false, true>(keys, plan, numElements); + } + else + { + radixSortStepKeysOnly<4, 28, false, false>(keys, plan, numElements); + } + } +} + +/** + * @brief Wrapper to call main radix sort function. For floats and keys only. + * + * Calls the radixSortKeysOnly function setting parameters for floats. + * + * @param[in,out] keys Keys to be sorted. + * @param[in] plan Configuration information for RadixSort. + * @param[in] negativeKeys Is set true if key flipBits is to be true in + * radixSortKeysOnly(). + * @param[in] numElements Number of elements in the sort. + * @param[in] keyBits Number of interesting bits in the key +**/ +extern "C" +void radixSortFloatKeysOnly(float *keys, + const CUDPPRadixSortPlan *plan, + bool negativeKeys, + size_t numElements, + int keyBits) +{ + radixSortKeysOnly((uint*)keys, plan, negativeKeys, numElements, keyBits); +} + +extern "C" +void initDeviceParameters(CUDPPRadixSortPlan *plan) +{ + int deviceID = -1; + if (cudaSuccess == cudaGetDevice(&deviceID)) + { + cudaDeviceProp devprop; + cudaGetDeviceProperties(&devprop, deviceID); + + int smVersion = devprop.major * 10 + devprop.minor; + + // sm_12 and later devices don't need help with coalesce in reorderData kernel + plan->m_bManualCoalesce = (smVersion < 12); + + // sm_20 and later devices are better off not using persistent CTAs + plan->m_bUsePersistentCTAs = (smVersion < 20); + + if (plan->m_bUsePersistentCTAs) + { + // The following is only true on pre-sm_20 devices (pre-Fermi): + // Empirically we have found that for some (usually larger) sort + // sizes it is better to use exactly as many "persistent" CTAs + // as can fill the GPU, which loop over the "blocks" of work. For smaller + // arrays it is better to use the typical CUDA approach of launching one CTA + // per block of work. + // 0-element of these two-element arrays is for key-value sorts + // 1-element is for key-only sorts + plan->m_persistentCTAThreshold[0] = plan->m_bManualCoalesce ? 
16777216 : 524288; + plan->m_persistentCTAThresholdFullBlocks[0] = plan->m_bManualCoalesce ? 2097152: 524288; + plan->m_persistentCTAThreshold[1] = plan->m_bManualCoalesce ? 16777216 : 8388608; + plan->m_persistentCTAThresholdFullBlocks[1] = plan->m_bManualCoalesce ? 2097152: 0; + + // create a map of function pointers to register counts for more accurate occupancy calculation + // Must pass in the dynamic shared memory used by each kernel, since the runtime doesn't know it + // Note we only insert the "loop" version of the kernels (the one with the last template param = true) + // Because those are the only ones that require persistent CTAs that maximally fill the device. + computeNumCTAs(radixSortBlocks<4, 0, false, false, true>, 4 * SORT_CTA_SIZE * sizeof(uint), SORT_CTA_SIZE); + computeNumCTAs(radixSortBlocks<4, 0, false, true, true>, 4 * SORT_CTA_SIZE * sizeof(uint), SORT_CTA_SIZE); + computeNumCTAs(radixSortBlocks<4, 0, true, false, true>, 4 * SORT_CTA_SIZE * sizeof(uint), SORT_CTA_SIZE); + computeNumCTAs(radixSortBlocks<4, 0, true, true, true>, 4 * SORT_CTA_SIZE * sizeof(uint), SORT_CTA_SIZE); + + computeNumCTAs(radixSortBlocksKeysOnly<4, 0, false, false, true>, 4 * SORT_CTA_SIZE * sizeof(uint), SORT_CTA_SIZE); + computeNumCTAs(radixSortBlocksKeysOnly<4, 0, false, true, true>, 4 * SORT_CTA_SIZE * sizeof(uint), SORT_CTA_SIZE); + computeNumCTAs(radixSortBlocksKeysOnly<4, 0, true, false, true>, 4 * SORT_CTA_SIZE * sizeof(uint), SORT_CTA_SIZE); + computeNumCTAs(radixSortBlocksKeysOnly<4, 0, true, true, true>, 4 * SORT_CTA_SIZE * sizeof(uint), SORT_CTA_SIZE); + + computeNumCTAs(findRadixOffsets<0, false, true>, 3 * SORT_CTA_SIZE * sizeof(uint), SORT_CTA_SIZE); + computeNumCTAs(findRadixOffsets<0, true, true>, 3 * SORT_CTA_SIZE * sizeof(uint), SORT_CTA_SIZE); + + computeNumCTAs(reorderData<0, false, false, false, true>, 0, SORT_CTA_SIZE); + computeNumCTAs(reorderData<0, false, false, true, true>, 0, SORT_CTA_SIZE); + computeNumCTAs(reorderData<0, false, true, false, true>, 0, SORT_CTA_SIZE); + computeNumCTAs(reorderData<0, false, true, true, true>, 0, SORT_CTA_SIZE); + computeNumCTAs(reorderData<0, true, false, false, true>, 0, SORT_CTA_SIZE); + computeNumCTAs(reorderData<0, true, false, true, true>, 0, SORT_CTA_SIZE); + computeNumCTAs(reorderData<0, true, true, false, true>, 0, SORT_CTA_SIZE); + computeNumCTAs(reorderData<0, true, true, true, true>, 0, SORT_CTA_SIZE); + + computeNumCTAs(reorderDataKeysOnly<0, false, false, false, true>, 0, SORT_CTA_SIZE); + computeNumCTAs(reorderDataKeysOnly<0, false, false, true, true>, 0, SORT_CTA_SIZE); + computeNumCTAs(reorderDataKeysOnly<0, false, true, false, true>, 0, SORT_CTA_SIZE); + computeNumCTAs(reorderDataKeysOnly<0, false, true, true, true>, 0, SORT_CTA_SIZE); + computeNumCTAs(reorderDataKeysOnly<0, true, false, false, true>, 0, SORT_CTA_SIZE); + computeNumCTAs(reorderDataKeysOnly<0, true, false, true, true>, 0, SORT_CTA_SIZE); + computeNumCTAs(reorderDataKeysOnly<0, true, true, false, true>, 0, SORT_CTA_SIZE); + computeNumCTAs(reorderDataKeysOnly<0, true, true, true, true>, 0, SORT_CTA_SIZE); + + computeNumCTAs(emptyKernel, 0, SORT_CTA_SIZE); + } + } +} + +/** + * @brief From the programmer-specified sort configuration, + * creates internal memory for performing the sort. + * + * @param[in] plan Pointer to CUDPPRadixSortPlan object +**/ +extern "C" +void allocRadixSortStorage(CUDPPRadixSortPlan *plan) +{ + + unsigned int numElements = plan->m_numElements; + + unsigned int numBlocks = + ((numElements % (SORT_CTA_SIZE * 4)) == 0) ? 
+ (numElements / (SORT_CTA_SIZE * 4)) : + (numElements / (SORT_CTA_SIZE * 4) + 1); + + switch(plan->m_config.datatype) + { + case CUDPP_UINT: + CUDA_SAFE_CALL(cudaMalloc((void **)&plan->m_tempKeys, + numElements * sizeof(unsigned int))); + + if (!plan->m_bKeysOnly) + CUDA_SAFE_CALL(cudaMalloc((void **)&plan->m_tempValues, + numElements * sizeof(unsigned int))); + + CUDA_SAFE_CALL(cudaMalloc((void **)&plan->m_counters, + WARP_SIZE * numBlocks * sizeof(unsigned int))); + + CUDA_SAFE_CALL(cudaMalloc((void **)&plan->m_countersSum, + WARP_SIZE * numBlocks * sizeof(unsigned int))); + + CUDA_SAFE_CALL(cudaMalloc((void **)&plan->m_blockOffsets, + WARP_SIZE * numBlocks * sizeof(unsigned int))); + break; + + case CUDPP_FLOAT: + CUDA_SAFE_CALL(cudaMalloc((void **)&plan->m_tempKeys, + numElements * sizeof(float))); + + if (!plan->m_bKeysOnly) + CUDA_SAFE_CALL(cudaMalloc((void **)&plan->m_tempValues, + numElements * sizeof(float))); + + CUDA_SAFE_CALL(cudaMalloc((void **)&plan->m_counters, + WARP_SIZE * numBlocks * sizeof(float))); + + CUDA_SAFE_CALL(cudaMalloc((void **)&plan->m_countersSum, + WARP_SIZE * numBlocks * sizeof(float))); + + CUDA_SAFE_CALL(cudaMalloc((void **)&plan->m_blockOffsets, + WARP_SIZE * numBlocks * sizeof(float))); + break; + } + + initDeviceParameters(plan); +} + +/** @brief Deallocates intermediate memory from allocRadixSortStorage. + * + * + * @param[in] plan Pointer to CUDPPRadixSortPlan object +**/ +extern "C" +void freeRadixSortStorage(CUDPPRadixSortPlan* plan) +{ + CUDA_SAFE_CALL( cudaFree(plan->m_tempKeys)); + CUDA_SAFE_CALL( cudaFree(plan->m_tempValues)); + CUDA_SAFE_CALL( cudaFree(plan->m_counters)); + CUDA_SAFE_CALL( cudaFree(plan->m_countersSum)); + CUDA_SAFE_CALL( cudaFree(plan->m_blockOffsets)); +} + +/** @brief Dispatch function to perform a sort on an array with + * a specified configuration. + * + * This is the dispatch routine which calls radixSort...() with + * appropriate template parameters and arguments as specified by + * the plan. + * @param[in,out] keys Keys to be sorted. + * @param[in,out] values Associated values to be sorted (through keys). + * @param[in] numElements Number of elements in the sort. + * @param[in] keyBits Number of interesting bits in the key* + * @param[in] plan Configuration information for RadixSort. 
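The temporary-storage sizing performed by allocRadixSortStorage above can be summarized with a small host-side helper. This is an illustrative sketch, not part of the patch; the SORT_CTA_SIZE and WARP_SIZE defaults below (256 and 32, CUDPP's usual values) are assumptions rather than values taken from this diff.

#include <cstddef>

// Element counts (not bytes) of the per-plan scratch arrays.
struct RadixSortScratch {
    std::size_t tempKeys, tempValues, counters, countersSum, blockOffsets;
};

RadixSortScratch radixSortScratch(std::size_t numElements, bool keysOnly,
                                  std::size_t ctaSize = 256,
                                  std::size_t warpSize = 32) {
    const std::size_t eltsPerBlock = ctaSize * 4;   // each CTA sorts 4 keys per thread
    const std::size_t numBlocks = (numElements + eltsPerBlock - 1) / eltsPerBlock;
    RadixSortScratch s;
    s.tempKeys     = numElements;
    s.tempValues   = keysOnly ? 0 : numElements;    // values buffer only for key-value sorts
    s.counters     = warpSize * numBlocks;          // 16 radix counters per half-sized block
    s.countersSum  = warpSize * numBlocks;
    s.blockOffsets = warpSize * numBlocks;
    return s;
}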
+**/ +extern "C" +void cudppRadixSortDispatch(void *keys, + void *values, + size_t numElements, + int keyBits, + const CUDPPRadixSortPlan *plan) +{ + if(plan->m_bKeysOnly) + { + switch(plan->m_config.datatype) + { + case CUDPP_UINT: + radixSortKeysOnly((uint*)keys, plan, false, + numElements, keyBits); + break; + case CUDPP_FLOAT: + radixSortFloatKeysOnly((float*)keys, plan, true, + numElements, keyBits); + } + } + else + { + switch(plan->m_config.datatype) + { + case CUDPP_UINT: + radixSort((uint*)keys, (uint*) values, plan, + numElements, false, keyBits); + break; + case CUDPP_FLOAT: + radixSortFloatKeys((float*)keys, (uint*) values, plan, + numElements, true, keyBits); + } + } +} + +/** @} */ // end radixsort functions +/** @} */ // end cudpp_app diff --git a/lib/gpu/cudpp_mini/scan_app.cu b/lib/gpu/cudpp_mini/scan_app.cu new file mode 100644 index 0000000000..1718de3780 --- /dev/null +++ b/lib/gpu/cudpp_mini/scan_app.cu @@ -0,0 +1,771 @@ +// ------------------------------------------------------------- +// CUDPP -- CUDA Data Parallel Primitives library +// ------------------------------------------------------------- +// $Revision: 5633 $ +// $Date: 2009-07-01 15:02:51 +1000 (Wed, 01 Jul 2009) $ +// ------------------------------------------------------------- +// This source code is distributed under the terms of license.txt +// in the root directory of this source distribution. +// ------------------------------------------------------------- + +/** + * @file + * scan_app.cu + * + * @brief CUDPP application-level scan routines + */ + +/** \defgroup cudpp_app CUDPP Application-Level API + * The CUDPP Application-Level API contains functions + * that run on the host CPU and invoke GPU routines in + * the CUDPP \link cudpp_kernel Kernel-Level API\endlink. + * Application-Level API functions are used by + * CUDPP \link publicInterface Public Interface\endlink + * functions to implement CUDPP's core functionality. + * @{ + */ + +/** @name Scan Functions + * @{ + */ + +#include "cudpp.h" +#include "cudpp_util.h" +#include "cudpp_plan.h" +#include "kernel/scan_kernel.cu" +#include "kernel/vector_kernel.cu" + + +#include +#include +#include +#include + +/** @brief Perform recursive scan on arbitrary size arrays + * + * This is the CPU-side workhorse function of the scan engine. This function + * invokes the CUDA kernels which perform the scan on individual blocks. + * + * Scans of large arrays must be split (possibly recursively) into a hierarchy of block scans, + * where each block is scanned by a single CUDA thread block. At each recursive level of the + * scanArrayRecursive first invokes a kernel to scan all blocks of that level, and if the level + * has more than one block, it calls itself recursively. On returning from each recursive level, + * the total sum of each block from the level below is added to all elements of the corresponding + * block in this level. See "Parallel Prefix Sum (Scan) in CUDA" for more information (see + * \ref references ). + * + * Template parameter \a T is the datatype; \a isBackward specifies backward or forward scan; + * \a isExclusive specifies exclusive or inclusive scan, and \a op specifies the binary associative + * operator to be used. 
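The recursion described above can be pictured with a host-side sketch of an exclusive add-scan (illustrative only, not part of the patch); scan4 and vectorAddUniform4 carry out the same three phases on the GPU.

#include <algorithm>
#include <cstddef>
#include <vector>

// Exclusive prefix sum built from independent block scans plus a recursive
// scan of the per-block totals, mirroring the structure of scanArrayRecursive.
void exclusiveScanRecursive(std::vector<int> &data, std::size_t blockSize = 1024) {
    const std::size_t n = data.size();
    const std::size_t numBlocks = (n + blockSize - 1) / blockSize;
    std::vector<int> blockSums(numBlocks);

    for (std::size_t b = 0; b < numBlocks; ++b) {        // phase 1: scan each block
        int running = 0;
        const std::size_t end = std::min(n, (b + 1) * blockSize);
        for (std::size_t i = b * blockSize; i < end; ++i) {
            const int v = data[i];
            data[i] = running;
            running += v;
        }
        blockSums[b] = running;                          // total of block b
    }

    if (numBlocks > 1) {
        exclusiveScanRecursive(blockSums, blockSize);    // phase 2: scan the block totals
        for (std::size_t b = 0; b < numBlocks; ++b) {    // phase 3: add each total back
            const std::size_t end = std::min(n, (b + 1) * blockSize);
            for (std::size_t i = b * blockSize; i < end; ++i)
                data[i] += blockSums[b];
        }
    }
}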
+ * + * @param[out] d_out The output array for the scan results + * @param[in] d_in The input array to be scanned + * @param[out] d_blockSums Array of arrays of per-block sums (one array per recursive level, allocated + * by allocScanStorage()) + * @param[in] numElements The number of elements in the array to scan + * @param[in] numRows The number of rows in the array to scan + * @param[in] rowPitches Array of row pitches (one array per recursive level, allocated by + * allocScanStorage()) + * @param[in] level The current recursive level of the scan + */ +template +void scanArrayRecursive(T *d_out, + const T *d_in, + T **d_blockSums, + size_t numElements, + size_t numRows, + const size_t *rowPitches, + int level) +{ + unsigned int numBlocks = + max(1, (unsigned int)ceil((double)numElements / ((double)SCAN_ELTS_PER_THREAD * CTA_SIZE))); + + unsigned int sharedEltsPerBlock = CTA_SIZE * 2; + + unsigned int sharedMemSize = sizeof(T) * sharedEltsPerBlock; + + // divide pitch by four since scan's load/store addresses are for vec4 elements + unsigned int rowPitch = 1; + unsigned int blockSumRowPitch = 1; + + if (numRows > 1) + { + rowPitch = rowPitches[level] / 4; + blockSumRowPitch = (numBlocks > 1) ? rowPitches[level+1] / 4 : 0; + } + + bool fullBlock = (numElements == numBlocks * SCAN_ELTS_PER_THREAD * CTA_SIZE); + + // setup execution parameters + dim3 grid(numBlocks, numRows, 1); + dim3 threads(CTA_SIZE, 1, 1); + + // make sure there are no CUDA errors before we start + CUT_CHECK_ERROR("scanArray before kernels"); + + unsigned int traitsCode = 0; + if (numBlocks > 1) traitsCode |= 1; + if (numRows > 1) traitsCode |= 2; + if (fullBlock) traitsCode |= 4; + + switch (traitsCode) + { + case 0: // single block, single row, non-full block + scan4 > + <<< grid, threads, sharedMemSize >>> + (d_out, d_in, 0, numElements, rowPitch, blockSumRowPitch); + break; + case 1: // multiblock, single row, non-full block + scan4< T, ScanTraits > + <<< grid, threads, sharedMemSize >>> + (d_out, d_in, d_blockSums[level], numElements, rowPitch, blockSumRowPitch); + break; + case 2: // single block, multirow, non-full block + scan4 > + <<< grid, threads, sharedMemSize >>> + (d_out, d_in, 0, numElements, rowPitch, blockSumRowPitch); + break; + case 3: // multiblock, multirow, non-full block + scan4 > + <<< grid, threads, sharedMemSize >>> + (d_out, d_in, d_blockSums[level], numElements, rowPitch, blockSumRowPitch); + break; + case 4: // single block, single row, full block + scan4 > + <<< grid, threads, sharedMemSize >>> + (d_out, d_in, 0, numElements, rowPitch, blockSumRowPitch); + break; + case 5: // multiblock, single row, full block + scan4< T, ScanTraits > + <<< grid, threads, sharedMemSize >>> + (d_out, d_in, d_blockSums[level], numElements, rowPitch, blockSumRowPitch); + break; + case 6: // single block, multirow, full block + scan4 > + <<< grid, threads, sharedMemSize >>> + (d_out, d_in, 0, numElements, rowPitch, blockSumRowPitch); + break; + case 7: // multiblock, multirow, full block + scan4 > + <<< grid, threads, sharedMemSize >>> + (d_out, d_in, d_blockSums[level], numElements, rowPitch, blockSumRowPitch); + break; + } + + CUT_CHECK_ERROR("prescan"); + + if (numBlocks > 1) + { + // After scanning all the sub-blocks, we are mostly done. But + // now we need to take all of the last values of the + // sub-blocks and scan those. This will give us a new value + // that must be sdded to each block to get the final results. 
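// Illustrative aside (not from the CUDPP sources): a small worked example of the
// add-back step described above, for an exclusive add-scan with a block size of 4:
//   input:          3  1  7  0 |  4  1  6  3
//   block scans:    0  3  4 11 |  0  4  5 11     (each block scanned independently)
//   block totals:  11 14  ->  scanned  ->  0 11
//   add back:       0  3  4 11 | 11 15 16 22     (11 added to every element of block 1)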
+ + scanArrayRecursive + ((T*)d_blockSums[level], (const T*)d_blockSums[level], + (T**)d_blockSums, numBlocks, numRows, rowPitches, level + 1); // recursive (CPU) call + + vectorAddUniform4 + <<< grid, threads >>>(d_out, + (T*)d_blockSums[level], + numElements, + rowPitch*4, + blockSumRowPitch*4, + 0, 0); + CUT_CHECK_ERROR("vectorAddUniform"); + } +} + +// global + +#ifdef __cplusplus +extern "C" +{ +#endif + +/** @brief Allocate intermediate arrays used by scan. + * + * Scans of large arrays must be split (possibly recursively) into a hierarchy + * of block scans, where each block is scanned by a single CUDA thread block. + * At each recursive level of the scan, we need an array in which to store the + * total sums of all blocks in that level. This function computes the amount + * of storage needed and allocates it. + * + * @param plan Pointer to CUDPPScanPlan object containing options and number + * of elements, which is used to compute storage requirements, and + * within which intermediate storage is allocated. + */ +void allocScanStorage(CUDPPScanPlan *plan) +{ + //assert(config->_numEltsAllocated == 0); // shouldn't be called + + plan->m_numEltsAllocated = plan->m_numElements; + + size_t numElts = plan->m_numElements; + + size_t level = 0; + + do + { + size_t numBlocks = + max(1, (unsigned int)ceil((double)numElts / ((double)SCAN_ELTS_PER_THREAD * CTA_SIZE))); + if (numBlocks > 1) + { + level++; + } + numElts = numBlocks; + } while (numElts > 1); + + size_t elementSize = 0; + + switch(plan->m_config.datatype) + { + case CUDPP_INT: + plan->m_blockSums = (void**) malloc(level * sizeof(int*)); + elementSize = sizeof(int); + break; + case CUDPP_UINT: + plan->m_blockSums = (void**) malloc(level * sizeof(unsigned int*)); + elementSize = sizeof(unsigned int); + break; + case CUDPP_FLOAT: + plan->m_blockSums = (void**) malloc(level * sizeof(float*)); + elementSize = sizeof(float); + break; + default: + break; + } + + plan->m_numLevelsAllocated = level; + numElts = plan->m_numElements; + size_t numRows = plan->m_numRows; + plan->m_numRowsAllocated = numRows; + plan->m_rowPitches = 0; + + if (numRows > 1) + { + plan->m_rowPitches = (size_t*) malloc((level + 1) * sizeof(size_t)); + plan->m_rowPitches[0] = plan->m_rowPitch; + } + + level = 0; + + do + { + size_t numBlocks = + max(1, (unsigned int)ceil((double)numElts / ((double)SCAN_ELTS_PER_THREAD * CTA_SIZE))); + if (numBlocks > 1) + { + // Use cudaMallocPitch for multi-row block sums to ensure alignment + if (numRows > 1) + { + size_t dpitch; + CUDA_SAFE_CALL( cudaMallocPitch((void**) &(plan->m_blockSums[level]), + &dpitch, + numBlocks * elementSize, + numRows)); + plan->m_rowPitches[level+1] = dpitch / elementSize; + level++; + } + else + { + CUDA_SAFE_CALL(cudaMalloc((void**) &(plan->m_blockSums[level++]), + numBlocks * elementSize)); + } + } + numElts = numBlocks; + } while (numElts > 1); + + CUT_CHECK_ERROR("allocScanStorage"); +} + +/** @brief Deallocate intermediate block sums arrays in a CUDPPScanPlan object. + * + * These arrays must have been allocated by allocScanStorage(), which is called + * by the constructor of cudppScanPlan(). + * + * @param plan Pointer to CUDPPScanPlan object initialized by allocScanStorage(). 
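The level-counting pass in allocScanStorage above decides how many intermediate block-sums arrays the recursion needs. A compact host-side sketch of that computation (illustrative only, not part of the patch; the 1024 elements-per-block default assumes CUDPP's SCAN_ELTS_PER_THREAD * CTA_SIZE = 8 * 128):

#include <cstddef>
#include <vector>

// Returns the number of block sums required at each recursion level; the vector's
// length is the number of levels that need an intermediate array.
std::vector<std::size_t> scanBlockSumCounts(std::size_t numElements,
                                            std::size_t eltsPerBlock = 1024) {
    std::vector<std::size_t> counts;
    std::size_t numElts = numElements;
    do {
        const std::size_t numBlocks = (numElts + eltsPerBlock - 1) / eltsPerBlock;
        if (numBlocks > 1)
            counts.push_back(numBlocks);    // this level needs a block-sums array
        numElts = numBlocks;
    } while (numElts > 1);
    return counts;
}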
+ */ +void freeScanStorage(CUDPPScanPlan *plan) +{ + for (unsigned int i = 0; i < plan->m_numLevelsAllocated; i++) + { + cudaFree(plan->m_blockSums[i]); + } + + CUT_CHECK_ERROR("freeScanStorage"); + + free((void**)plan->m_blockSums); + if (plan->m_numRows > 1) + free((void*)plan->m_rowPitches); + + plan->m_blockSums = 0; + plan->m_numEltsAllocated = 0; + plan->m_numLevelsAllocated = 0; +} + + +/** @brief Dispatch function to perform a scan (prefix sum) on an + * array with the specified configuration. + * + * This is the dispatch routine which calls scanArrayRecursive() with + * appropriate template parameters and arguments to achieve the scan as + * specified in \a plan. + * + * @param[out] d_out The output array of scan results + * @param[in] d_in The input array + * @param[in] numElements The number of elements to scan + * @param[in] numRows The number of rows to scan in parallel + * @param[in] plan Pointer to CUDPPScanPlan object containing scan options + * and intermediate storage + */ +void cudppScanDispatch(void *d_out, + const void *d_in, + size_t numElements, + size_t numRows, + const CUDPPScanPlan *plan) +{ + if (CUDPP_OPTION_EXCLUSIVE & plan->m_config.options) + { + if (CUDPP_OPTION_BACKWARD & plan->m_config.options) + { + switch (plan->m_config.datatype) + { + case CUDPP_INT: + + switch(plan->m_config.op) + { + case CUDPP_ADD: + scanArrayRecursive + ((int*)d_out, (const int*)d_in, + (int**)plan->m_blockSums, + numElements, numRows, plan->m_rowPitches, 0); + break; + case CUDPP_MULTIPLY: + scanArrayRecursive + ((int*)d_out, (const int*)d_in, + (int**)plan->m_blockSums, + numElements, numRows, plan->m_rowPitches, 0); + break; + case CUDPP_MAX: + scanArrayRecursive + ((int*)d_out, (const int*)d_in, + (int**)plan->m_blockSums, + numElements, numRows, plan->m_rowPitches, 0); + break; + case CUDPP_MIN: + scanArrayRecursive + ((int*)d_out, (const int*)d_in, + (int**)plan->m_blockSums, + numElements, numRows, plan->m_rowPitches, 0); + break; + default: + break; + } + + break; + + case CUDPP_UINT: + switch(plan->m_config.op) + { + case CUDPP_ADD: + scanArrayRecursive + ((unsigned int*)d_out, (const unsigned int*)d_in, + (unsigned int**)plan->m_blockSums, + numElements, numRows, plan->m_rowPitches, 0); + break; + case CUDPP_MULTIPLY: + scanArrayRecursive + ((unsigned int*)d_out, (const unsigned int*)d_in, + (unsigned int**)plan->m_blockSums, + numElements, numRows, plan->m_rowPitches, 0); + break; + case CUDPP_MAX: + scanArrayRecursive + ((unsigned int*)d_out, (const unsigned int*)d_in, + (unsigned int**)plan->m_blockSums, + numElements, numRows, plan->m_rowPitches, 0); + break; + case CUDPP_MIN: + scanArrayRecursive + ((unsigned int*)d_out, (const unsigned int*)d_in, + (unsigned int**)plan->m_blockSums, + numElements, numRows, plan->m_rowPitches, 0); + break; + default: + break; + } + + break; + + case CUDPP_FLOAT: + switch(plan->m_config.op) + { + case CUDPP_ADD: + scanArrayRecursive + ((float*)d_out, (const float*)d_in, + (float**)plan->m_blockSums, + numElements, numRows, plan->m_rowPitches, 0); + break; + case CUDPP_MULTIPLY: + scanArrayRecursive + ((float*)d_out, (const float*)d_in, + (float**)plan->m_blockSums, + numElements, numRows, plan->m_rowPitches, 0); + break; + case CUDPP_MAX: + scanArrayRecursive + ((float*)d_out, (const float*)d_in, + (float**)plan->m_blockSums, + numElements, numRows, plan->m_rowPitches, 0); + break; + case CUDPP_MIN: + scanArrayRecursive + ((float*)d_out, (const float*)d_in, + (float**)plan->m_blockSums, + numElements, numRows, plan->m_rowPitches, 0); 
+ break; + default: + break; + } + break; + + default: + break; + } + } + else + { + switch (plan->m_config.datatype) + { + case CUDPP_INT: + + switch(plan->m_config.op) + { + case CUDPP_ADD: + scanArrayRecursive + ((int*)d_out, (const int*)d_in, + (int**)plan->m_blockSums, + numElements, numRows, plan->m_rowPitches, 0); + break; + case CUDPP_MULTIPLY: + scanArrayRecursive + ((int*)d_out, (const int*)d_in, + (int**)plan->m_blockSums, + numElements, numRows, plan->m_rowPitches, 0); + break; + case CUDPP_MAX: + scanArrayRecursive + ((int*)d_out, (const int*)d_in, + (int**)plan->m_blockSums, + numElements, numRows, plan->m_rowPitches, 0); + break; + case CUDPP_MIN: + scanArrayRecursive + ((int*)d_out, (const int*)d_in, + (int**)plan->m_blockSums, + numElements, numRows, plan->m_rowPitches, 0); + break; + default: + break; + } + + break; + + case CUDPP_UINT: + switch(plan->m_config.op) + { + case CUDPP_ADD: + scanArrayRecursive + ((unsigned int*)d_out, (const unsigned int*)d_in, + (unsigned int**)plan->m_blockSums, + numElements, numRows, plan->m_rowPitches, 0); + break; + case CUDPP_MULTIPLY: + scanArrayRecursive + ((unsigned int*)d_out, (const unsigned int*)d_in, + (unsigned int**)plan->m_blockSums, + numElements, numRows, plan->m_rowPitches, 0); + break; + case CUDPP_MAX: + scanArrayRecursive + ((unsigned int*)d_out, (const unsigned int*)d_in, + (unsigned int**)plan->m_blockSums, + numElements, numRows, plan->m_rowPitches, 0); + break; + case CUDPP_MIN: + scanArrayRecursive + ((unsigned int*)d_out, (const unsigned int*)d_in, + (unsigned int**)plan->m_blockSums, + numElements, numRows, plan->m_rowPitches, 0); + break; + default: + break; + + } + + break; + + case CUDPP_FLOAT: + switch(plan->m_config.op) + { + case CUDPP_ADD: + scanArrayRecursive + ((float*)d_out, (const float*)d_in, + (float**)plan->m_blockSums, + numElements, numRows, plan->m_rowPitches, 0); + break; + case CUDPP_MULTIPLY: + scanArrayRecursive + ((float*)d_out, (const float*)d_in, + (float**)plan->m_blockSums, + numElements, numRows, plan->m_rowPitches, 0); + break; + case CUDPP_MAX: + scanArrayRecursive + ((float*)d_out, (const float*)d_in, + (float**)plan->m_blockSums, + numElements, numRows, plan->m_rowPitches, 0); + break; + case CUDPP_MIN: + scanArrayRecursive + ((float*)d_out, (const float*)d_in, + (float**)plan->m_blockSums, + numElements, numRows, plan->m_rowPitches, 0); + break; + default: + break; + } + break; + + default: + break; + } + } + } + else + { + if (CUDPP_OPTION_BACKWARD & plan->m_config.options) + { + switch (plan->m_config.datatype) + { + case CUDPP_INT: + + switch(plan->m_config.op) + { + case CUDPP_ADD: + scanArrayRecursive + ((int*)d_out, (const int*)d_in, + (int**)plan->m_blockSums, + numElements, numRows, plan->m_rowPitches, 0); + break; + case CUDPP_MULTIPLY: + scanArrayRecursive + ((int*)d_out, (const int*)d_in, + (int**)plan->m_blockSums, + numElements, numRows, plan->m_rowPitches, 0); + break; + case CUDPP_MAX: + scanArrayRecursive + ((int*)d_out, (const int*)d_in, + (int**)plan->m_blockSums, + numElements, numRows, plan->m_rowPitches, 0); + break; + case CUDPP_MIN: + scanArrayRecursive + ((int*)d_out, (const int*)d_in, + (int**)plan->m_blockSums, + numElements, numRows, plan->m_rowPitches, 0); + break; + default: + break; + } + + break; + + case CUDPP_UINT: + switch(plan->m_config.op) + { + case CUDPP_ADD: + scanArrayRecursive + ((unsigned int*)d_out, (const unsigned int*)d_in, + (unsigned int**)plan->m_blockSums, + numElements, numRows, plan->m_rowPitches, 0); + break; + case 
CUDPP_MULTIPLY: + scanArrayRecursive + ((unsigned int*)d_out, (const unsigned int*)d_in, + (unsigned int**)plan->m_blockSums, + numElements, numRows, plan->m_rowPitches, 0); + break; + case CUDPP_MAX: + scanArrayRecursive + ((unsigned int*)d_out, (const unsigned int*)d_in, + (unsigned int**)plan->m_blockSums, + numElements, numRows, plan->m_rowPitches, 0); + break; + case CUDPP_MIN: + scanArrayRecursive + ((unsigned int*)d_out, (const unsigned int*)d_in, + (unsigned int**)plan->m_blockSums, + numElements, numRows, plan->m_rowPitches, 0); + break; + default: + break; + } + + break; + + case CUDPP_FLOAT: + switch(plan->m_config.op) + { + case CUDPP_ADD: + scanArrayRecursive + ((float*)d_out, (const float*)d_in, + (float**)plan->m_blockSums, + numElements, numRows, plan->m_rowPitches, 0); + break; + case CUDPP_MULTIPLY: + scanArrayRecursive + ((float*)d_out, (const float*)d_in, + (float**)plan->m_blockSums, + numElements, numRows, plan->m_rowPitches, 0); + break; + case CUDPP_MAX: + scanArrayRecursive + ((float*)d_out, (const float*)d_in, + (float**)plan->m_blockSums, + numElements, numRows, plan->m_rowPitches, 0); + break; + case CUDPP_MIN: + scanArrayRecursive + ((float*)d_out, (const float*)d_in, + (float**)plan->m_blockSums, + numElements, numRows, plan->m_rowPitches, 0); + break; + default: + break; + } + break; + + default: + break; + } + } + else + { + switch (plan->m_config.datatype) + { + case CUDPP_INT: + + switch(plan->m_config.op) + { + case CUDPP_ADD: + scanArrayRecursive + ((int*)d_out, (const int*)d_in, + (int**)plan->m_blockSums, + numElements, numRows, plan->m_rowPitches, 0); + break; + case CUDPP_MULTIPLY: + scanArrayRecursive + ((int*)d_out, (const int*)d_in, + (int**)plan->m_blockSums, + numElements, numRows, plan->m_rowPitches, 0); + break; + case CUDPP_MAX: + scanArrayRecursive + ((int*)d_out, (const int*)d_in, + (int**)plan->m_blockSums, + numElements, numRows, plan->m_rowPitches, 0); + break; + case CUDPP_MIN: + scanArrayRecursive + ((int*)d_out, (const int*)d_in, + (int**)plan->m_blockSums, + numElements, numRows, plan->m_rowPitches, 0); + break; + default: + break; + } + + break; + + case CUDPP_UINT: + switch(plan->m_config.op) + { + case CUDPP_ADD: + scanArrayRecursive + ((unsigned int*)d_out, (const unsigned int*)d_in, + (unsigned int**)plan->m_blockSums, + numElements, numRows, plan->m_rowPitches, 0); + break; + case CUDPP_MULTIPLY: + scanArrayRecursive + ((unsigned int*)d_out, (const unsigned int*)d_in, + (unsigned int**)plan->m_blockSums, + numElements, numRows, plan->m_rowPitches, 0); + break; + case CUDPP_MAX: + scanArrayRecursive + ((unsigned int*)d_out, (const unsigned int*)d_in, + (unsigned int**)plan->m_blockSums, + numElements, numRows, plan->m_rowPitches, 0); + break; + case CUDPP_MIN: + scanArrayRecursive + ((unsigned int*)d_out, (const unsigned int*)d_in, + (unsigned int**)plan->m_blockSums, + numElements, numRows, plan->m_rowPitches, 0); + break; + default: + break; + + } + + break; + + case CUDPP_FLOAT: + switch(plan->m_config.op) + { + case CUDPP_ADD: + scanArrayRecursive + ((float*)d_out, (const float*)d_in, + (float**)plan->m_blockSums, + numElements, numRows, plan->m_rowPitches, 0); + break; + case CUDPP_MULTIPLY: + scanArrayRecursive + ((float*)d_out, (const float*)d_in, + (float**)plan->m_blockSums, + numElements, numRows, plan->m_rowPitches, 0); + break; + case CUDPP_MAX: + scanArrayRecursive + ((float*)d_out, (const float*)d_in, + (float**)plan->m_blockSums, + numElements, numRows, plan->m_rowPitches, 0); + break; + case CUDPP_MIN: + 
scanArrayRecursive + ((float*)d_out, (const float*)d_in, + (float**)plan->m_blockSums, + numElements, numRows, plan->m_rowPitches, 0); + break; + default: + break; + } + break; + + default: + break; + } + } + } +} + +#ifdef __cplusplus +} +#endif + +/** @} */ // end scan functions +/** @} */ // end cudpp_app diff --git a/lib/gpu/cudpp_mini/sharedmem.h b/lib/gpu/cudpp_mini/sharedmem.h new file mode 100644 index 0000000000..77f92adfed --- /dev/null +++ b/lib/gpu/cudpp_mini/sharedmem.h @@ -0,0 +1,166 @@ +// ------------------------------------------------------------- +// cuDPP -- CUDA Data Parallel Primitives library +// ------------------------------------------------------------- +// $Revision$ +// $Date$ +// ------------------------------------------------------------- +// This source code is distributed under the terms of license.txt +// in the root directory of this source distribution. +// ------------------------------------------------------------- + +/** + * @file + * sharedmem.h + * + * @brief Shared memory declaration struct for templatized types. + * + * Because dynamically sized shared memory arrays are declared "extern" in CUDA, + * we can't templatize their types directly. To get around this, we declare a + * simple wrapper struct that will declare the extern array with a different + * name depending on the type. This avoids linker errors about multiple + * definitions. + * + * To use dynamically allocated shared memory in a templatized __global__ or + * __device__ function, just replace code like this: + * + *
+ *  template<class T>
+ *  __global__ void
+ *  foo( T* d_out, T* d_in) 
+ *  {
+ *      // Shared mem size is determined by the host app at run time
+ *      extern __shared__  T sdata[];
+ *      ...
+ *      doStuff(sdata);
+ *      ...
+ *  }
+ * 
+ *
+ * With this
+ *
+ *  template<class T>
+ *  __global__ void
+ *  foo( T* d_out, T* d_in) 
+ *  {
+ *      // Shared mem size is determined by the host app at run time
+ *      SharedMemory<T> smem;
+ *      T* sdata = smem.getPointer();
+ *      ...
+ *      doStuff(sdata);
+ *      ...
+ *  }
+ * 
+ */ + +#ifndef _SHAREDMEM_H_ +#define _SHAREDMEM_H_ + + +/** @brief Wrapper class for templatized dynamic shared memory arrays. + * + * This struct uses template specialization on the type \a T to declare + * a differently named dynamic shared memory array for each type + * (\code extern __shared__ T s_type[] \endcode). + * + * Currently there are specializations for the following types: + * \c int, \c uint, \c char, \c uchar, \c short, \c ushort, \c long, + * \c unsigned long, \c bool, \c float, and \c double. One can also specialize it + * for user defined types. + */ +template +struct SharedMemory +{ + /** Return a pointer to the runtime-sized shared memory array. **/ + __device__ T* getPointer() + { + extern __device__ void Error_UnsupportedType(); // Ensure that we won't compile any un-specialized types + Error_UnsupportedType(); + return (T*)0; + } + // TODO: Use operator overloading to make this class look like a regular array +}; + +// Following are the specializations for the following types. +// int, uint, char, uchar, short, ushort, long, ulong, bool, float, and double +// One could also specialize it for user-defined types. + +template <> +struct SharedMemory +{ + __device__ int* getPointer() { extern __shared__ int s_int[]; return s_int; } +}; + +template <> +struct SharedMemory +{ + __device__ unsigned int* getPointer() { extern __shared__ unsigned int s_uint[]; return s_uint; } +}; + +template <> +struct SharedMemory +{ + __device__ char* getPointer() { extern __shared__ char s_char[]; return s_char; } +}; + +template <> +struct SharedMemory +{ + __device__ unsigned char* getPointer() { extern __shared__ unsigned char s_uchar[]; return s_uchar; } +}; + +template <> +struct SharedMemory +{ + __device__ short* getPointer() { extern __shared__ short s_short[]; return s_short; } +}; + +template <> +struct SharedMemory +{ + __device__ unsigned short* getPointer() { extern __shared__ unsigned short s_ushort[]; return s_ushort; } +}; + +template <> +struct SharedMemory +{ + __device__ long* getPointer() { extern __shared__ long s_long[]; return s_long; } +}; + +template <> +struct SharedMemory +{ + __device__ unsigned long* getPointer() { extern __shared__ unsigned long s_ulong[]; return s_ulong; } +}; + +template <> +struct SharedMemory +{ + __device__ bool* getPointer() { extern __shared__ bool s_bool[]; return s_bool; } +}; + +template <> +struct SharedMemory +{ + __device__ float* getPointer() { extern __shared__ float s_float[]; return s_float; } +}; + +template <> +struct SharedMemory +{ + __device__ double* getPointer() { extern __shared__ double s_double[]; return s_double; } +}; + +template <> +struct SharedMemory +{ + __device__ uchar4* getPointer() { extern __shared__ uchar4 s_uchar4[]; return s_uchar4; } +}; + + +#endif //_SHAREDMEM_H_ + +// Leave this at the end of the file +// Local Variables: +// mode:c++ +// c-file-style: "NVIDIA" +// End: diff --git a/lib/gpu/gb_gpu.cpp b/lib/gpu/gb_gpu.cpp new file mode 100644 index 0000000000..444f4e80bd --- /dev/null +++ b/lib/gpu/gb_gpu.cpp @@ -0,0 +1,449 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. 
This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing authors: Mike Brown (ORNL), brownw@ornl.gov +------------------------------------------------------------------------- */ + +#include +#include +#include + +#include "gb_gpu_memory.h" + +using namespace std; + +static GB_GPU_Memory GBMF; +#define GBMT GB_GPU_Memory + +template +void gb_gpu_pack_nbors(GBMT &gbm, const int GX, const int BX, const int start, + const int inum, const int form_low, const int form_high) { + int stride=gbm.nbor->nbor_pitch(); + int anall=gbm.atom->nall(); + if (gbm.shared_types) { + GBMF.k_gb_nbor_fast.set_size(GX,BX); + GBMF.k_gb_nbor_fast.run(&gbm.atom->dev_x.begin(), + &gbm.cut_form.begin(), &gbm.nbor->dev_nbor.begin(), &stride, + &start, &inum, &gbm.nbor->dev_packed.begin(), &form_low, + &form_high, &anall); + } else { + GBMF.k_gb_nbor.set_size(GX,BX); + GBMF.k_gb_nbor.run(&gbm.atom->dev_x.begin(), &gbm.cut_form.begin(), + &gbm._lj_types, &gbm.nbor->dev_nbor.begin(), &stride, + &start, &inum, &gbm.nbor->dev_packed.begin(), &form_low, + &form_high, &anall); + } +} + +// --------------------------------------------------------------------------- +// Allocate memory on host and device and copy constants to device +// --------------------------------------------------------------------------- +bool gb_gpu_init(const int ntypes, const double gamma, + const double upsilon, const double mu, double **shape, + double **well, double **cutsq, double **sigma, + double **epsilon, double *host_lshape, int **form, + double **host_lj1, double **host_lj2, double **host_lj3, + double **host_lj4, double **offset, double *special_lj, + const int inum, const int nall, const int max_nbors, + const double cell_size, int &gpu_mode, FILE *screen) { + GBMF.clear(); + gpu_mode=GBMF.device->gpu_mode(); + double gpu_split=GBMF.device->particle_split(); + int first_gpu=GBMF.device->first_device(); + int last_gpu=GBMF.device->last_device(); + int world_me=GBMF.device->world_me(); + int gpu_rank=GBMF.device->gpu_rank(); + int procs_per_gpu=GBMF.device->procs_per_gpu(); + + GBMF.device->init_message(screen,"gayberne",first_gpu,last_gpu); + + bool message=false; + if (world_me==0 && screen) + message=true; + + if (message) { + fprintf(screen,"Initializing GPU and compiling on process 0..."); + fflush(screen); + } + + if (world_me==0) { + bool init_ok=GBMF.init(ntypes, gamma, upsilon, mu, shape, well, cutsq, + sigma, epsilon, host_lshape, form, host_lj1, + host_lj2, host_lj3, host_lj4, offset, special_lj, + inum, nall, max_nbors, cell_size, gpu_split, screen); + if (!init_ok) + return false; + } + + MPI_Barrier(MPI_COMM_WORLD); + if (message) + fprintf(screen,"Done.\n"); + + for (int i=0; igpu_comm); + if (message) + fprintf(screen,"Done.\n"); + } + if (message) + fprintf(screen,"\n"); + return true; +} + +// --------------------------------------------------------------------------- +// Clear memory on host and device +// --------------------------------------------------------------------------- +void gb_gpu_clear() { + GBMF.clear(); +} + +// --------------------------------------------------------------------------- +// Build neighbor list on device +// --------------------------------------------------------------------------- +template +inline void _gb_gpu_build_nbor_list(gbmtyp &gbm, const int 
inum, + const int host_inum, const int nall, + double **host_x, double **host_quat, + int *host_type, double *boxlo, + double *boxhi, bool &success) { + gbm.nbor_time_avail=true; + + success=true; + gbm.resize_atom(inum,nall,success); + gbm.resize_local(inum,host_inum,gbm.nbor->max_nbors(),0,success); + if (!success) + return; + + gbm.atom->cast_copy_x(host_x,host_type); + int mn; + gbm.nbor->build_nbor_list(inum, host_inum, nall, *gbm.atom, + boxlo, boxhi, NULL, NULL, NULL, success, mn); + gbm.nbor->copy_unpacked(inum,mn); + gbm.last_ellipse=inum; + gbm.max_last_ellipse=inum; +} + +// --------------------------------------------------------------------------- +// Copy neighbor list from host and (if spheres) reorder so ellipses first +// --------------------------------------------------------------------------- +template +void _gb_gpu_reset_nbors(gbmtyp &gbm, const int nall, + const int inum, const int osize, + int *ilist, int *numj, + int *type, int **firstneigh, + bool &success) { + success=true; + + gbm.nbor_time_avail=true; + + int mn=gbm.nbor->max_nbor_loop(inum,numj); + gbm.resize_atom(inum,nall,success); + gbm.resize_local(inum,0,mn,osize,success); + if (!success) + return; + + if (gbm.multiple_forms) { + int p=0; + for (int i=0; iget_host(inum,gbm.host_olist.begin(),numj,firstneigh, + gbm.block_size()); + gbm.nbor->copy_unpacked(inum,mn); + return; + } + gbm.last_ellipse=inum; + gbm.max_last_ellipse=inum; + gbm.nbor->get_host(inum,ilist,numj,firstneigh,gbm.block_size()); + gbm.nbor->copy_unpacked(inum,mn); +} + +// --------------------------------------------------------------------------- +// Calculate energies, forces, and torques +// --------------------------------------------------------------------------- +template +void _gb_gpu_gayberne(GBMT &gbm, const bool _eflag, const bool _vflag) { + // Compute the block size and grid size to keep all cores busy + const int BX=gbm.block_size(); + int eflag, vflag; + if (_eflag) + eflag=1; + else + eflag=0; + + if (_vflag) + vflag=1; + else + vflag=0; + + int GX=static_cast(ceil(static_cast(gbm.atom->inum())/BX)); + int stride=gbm.nbor->nbor_pitch(); + int ainum=gbm.atom->inum(); + int anall=gbm.atom->nall(); + + if (gbm.multiple_forms) { + gbm.time_kernel.start(); + if (gbm.last_ellipse>0) { + // ------------ ELLIPSE_ELLIPSE and ELLIPSE_SPHERE --------------- + GX=static_cast(ceil(static_cast(gbm.last_ellipse)/ + static_cast(BX))); + gb_gpu_pack_nbors(gbm,GX,BX, 0, gbm.last_ellipse,ELLIPSE_SPHERE, + ELLIPSE_ELLIPSE); + gbm.time_kernel.stop(); + + gbm.time_gayberne.start(); + GBMF.k_gayberne.set_size(GX,BX); + GBMF.k_gayberne.run(&gbm.atom->dev_x.begin(), + &gbm.atom->dev_quat.begin(), &gbm.shape.begin(), &gbm.well.begin(), + &gbm.gamma_upsilon_mu.begin(), &gbm.sigma_epsilon.begin(), + &gbm._lj_types, &gbm.lshape.begin(), &gbm.nbor->dev_nbor.begin(), + &stride, &gbm.atom->dev_ans.begin(),&ainum,&gbm.atom->dev_engv.begin(), + &gbm.dev_error.begin(), &eflag, &vflag, &gbm.last_ellipse, &anall); + gbm.time_gayberne.stop(); + + if (gbm.last_ellipse==gbm.atom->inum()) { + gbm.time_kernel2.start(); + gbm.time_kernel2.stop(); + gbm.time_gayberne2.start(); + gbm.time_gayberne2.stop(); + gbm.time_pair.start(); + gbm.time_pair.stop(); + return; + } + + // ------------ SPHERE_ELLIPSE --------------- + + gbm.time_kernel2.start(); + GX=static_cast(ceil(static_cast(gbm.atom->inum()- + gbm.last_ellipse)/BX)); + gb_gpu_pack_nbors(gbm,GX,BX,gbm.last_ellipse,gbm.atom->inum(), + SPHERE_ELLIPSE,SPHERE_ELLIPSE); + gbm.time_kernel2.stop(); + + 
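// The launches above and below size their grids as ceil(n/BX): one thread per
// particle, BX threads per block, with the block count rounded up so a final
// partial block is still launched.  A minimal helper expressing that pattern
// (grid_size is just for illustration, not part of this code):
#include <cmath>   // ceil
inline int grid_size(const int n_particles, const int BX) {
  return static_cast<int>(ceil(static_cast<double>(n_particles) /
                               static_cast<double>(BX)));
}
// e.g. GX = grid_size(inum, BX) for the full pass, or
//      GX = grid_size(inum - last_ellipse, BX) for the sphere-ellipse pass.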
gbm.time_gayberne2.start(); + GBMF.k_sphere_gb.set_size(GX,BX); + GBMF.k_sphere_gb.run(&gbm.atom->dev_x.begin(),&gbm.atom->dev_quat.begin(), + &gbm.shape.begin(), &gbm.well.begin(), + &gbm.gamma_upsilon_mu.begin(), &gbm.sigma_epsilon.begin(), + &gbm._lj_types, &gbm.lshape.begin(), + &gbm.nbor->dev_nbor.begin(), &stride, &gbm.atom->dev_ans.begin(), + &gbm.atom->dev_engv.begin(), &gbm.dev_error.begin(), &eflag, + &vflag, &gbm.last_ellipse, &ainum, &anall); + gbm.time_gayberne2.stop(); + } else { + gbm.atom->dev_ans.zero(); + gbm.atom->dev_engv.zero(); + gbm.time_kernel.stop(); + gbm.time_gayberne.start(); + gbm.time_gayberne.stop(); + gbm.time_kernel2.start(); + gbm.time_kernel2.stop(); + gbm.time_gayberne2.start(); + gbm.time_gayberne2.stop(); + } + + // ------------ LJ --------------- + gbm.time_pair.start(); + if (gbm.last_ellipseinum()) { + if (gbm.shared_types) { + GBMF.k_lj_fast.set_size(GX,BX); + GBMF.k_lj_fast.run(&gbm.atom->dev_x.begin(), &gbm.lj1.begin(), + &gbm.lj3.begin(), &gbm.gamma_upsilon_mu.begin(), + &stride, &gbm.nbor->dev_packed.begin(), + &gbm.atom->dev_ans.begin(), + &gbm.atom->dev_engv.begin(), &gbm.dev_error.begin(), + &eflag, &vflag, &gbm.last_ellipse, &ainum, &anall); + } else { + GBMF.k_lj.set_size(GX,BX); + GBMF.k_lj.run(&gbm.atom->dev_x.begin(), &gbm.lj1.begin(), + &gbm.lj3.begin(), &gbm._lj_types, + &gbm.gamma_upsilon_mu.begin(), &stride, + &gbm.nbor->dev_packed.begin(), &gbm.atom->dev_ans.begin(), + &gbm.atom->dev_engv.begin(), &gbm.dev_error.begin(), + &eflag, &vflag, &gbm.last_ellipse, &ainum, &anall); + } + } + gbm.time_pair.stop(); + } else { + gbm.time_kernel.start(); + gb_gpu_pack_nbors(gbm, GX, BX, 0, gbm.atom->inum(),SPHERE_SPHERE, + ELLIPSE_ELLIPSE); + gbm.time_kernel.stop(); + gbm.time_gayberne.start(); + GBMF.k_gayberne.set_size(GX,BX); + GBMF.k_gayberne.run(&gbm.atom->dev_x.begin(), &gbm.atom->dev_quat.begin(), + &gbm.shape.begin(), &gbm.well.begin(), + &gbm.gamma_upsilon_mu.begin(), &gbm.sigma_epsilon.begin(), + &gbm._lj_types, &gbm.lshape.begin(), &gbm.nbor->dev_nbor.begin(), + &stride, &gbm.atom->dev_ans.begin(), &ainum, + &gbm.atom->dev_engv.begin(), &gbm.dev_error.begin(), + &eflag, &vflag, &ainum, &anall); + gbm.time_gayberne.stop(); + } +} + +// --------------------------------------------------------------------------- +// Reneighbor on GPU if necessary and then compute forces, torques, energies +// --------------------------------------------------------------------------- +template +inline int * _gb_gpu_compute_n(gbmtyp &gbm, const int timestep, const int ago, + const int inum_full, const int nall, + double **host_x, int *host_type, + double *boxlo, double *boxhi, const bool eflag, + const bool vflag, const bool eatom, + const bool vatom, int &host_start, + const double cpu_time, bool &success, + double **host_quat) { + gbm.acc_timers(); + if (inum_full==0) { + gbm.zero_timers(); + return NULL; + } + + gbm.hd_balancer.balance(cpu_time,gbm.nbor->gpu_nbor()); + int inum=gbm.hd_balancer.get_gpu_count(timestep,ago,inum_full); + gbm.atom->inum(inum); + gbm.last_ellipse=std::min(inum,gbm.max_last_ellipse); + host_start=inum; + + // Build neighbor list on GPU if necessary + if (ago==0) { + _gb_gpu_build_nbor_list(gbm, inum, inum_full-inum, nall, host_x, + host_quat, host_type, boxlo, boxhi, success); + if (!success) + return NULL; + gbm.atom->cast_quat_data(host_quat[0]); + gbm.hd_balancer.start_timer(); + } else { + gbm.atom->cast_x_data(host_x,host_type); + gbm.atom->cast_quat_data(host_quat[0]); + gbm.hd_balancer.start_timer(); + 
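// How the host/device split requested above is applied, as a rough sketch:
// hd_balancer.balance()/get_gpu_count() decide how many of the inum_full local
// particles run on the device this step, and the remainder stays on the host.
// With a fixed split fraction this reduces to a rounded product; the real
// balancer may also adapt the fraction from measured CPU and GPU times.
// gpu_particle_count is an illustrative name, not part of the library.
#include <cmath>   // ceil
inline int gpu_particle_count(const double split, const int inum_full) {
  // 'split' is the fraction of local particles offloaded to the device
  return static_cast<int>(ceil(split * static_cast<double>(inum_full)));
}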
gbm.atom->add_x_data(host_x,host_type); + } + + gbm.atom->add_other_data(); + + _gb_gpu_gayberne(gbm,eflag,vflag); + gbm.atom->copy_answers(eflag,vflag,eatom,vatom); + gbm.hd_balancer.stop_timer(); + return gbm.device->nbor.host_nbor.begin(); +} + +int * gb_gpu_compute_n(const int timestep, const int ago, const int inum_full, + const int nall, double **host_x, int *host_type, + double *boxlo, double *boxhi, const bool eflag, + const bool vflag, const bool eatom, const bool vatom, + int &host_start, const double cpu_time, bool &success, + double **host_quat) { + return _gb_gpu_compute_n(GBMF, timestep, ago, inum_full, nall, host_x, + host_type, boxlo, boxhi, eflag, vflag, eatom, vatom, + host_start, cpu_time, success, host_quat); +} + +// --------------------------------------------------------------------------- +// Copy nbor list from host if necessary and then calculate forces, torques,.. +// --------------------------------------------------------------------------- +template +inline int * _gb_gpu_compute(gbmtyp &gbm, const int timestep, const int f_ago, + const int inum_full,const int nall,double **host_x, + int *host_type, int *ilist, int *numj, + int **firstneigh, const bool eflag, + const bool vflag, const bool eatom, + const bool vatom, int &host_start, + const double cpu_time, bool &success, + double **host_quat) { + gbm.acc_timers(); + if (inum_full==0) { + gbm.zero_timers(); + return NULL; + } + + int ago=gbm.hd_balancer.ago_first(f_ago); + int inum=gbm.hd_balancer.balance(timestep,ago,inum_full,cpu_time, + gbm.nbor->gpu_nbor()); + gbm.atom->inum(inum); + gbm.last_ellipse=std::min(inum,gbm.max_last_ellipse); + host_start=inum; + + if (ago==0) { + _gb_gpu_reset_nbors(gbm, nall, inum, inum_full, ilist, numj, host_type, + firstneigh, success); + if (!success) + return NULL; + } + int *list; + if (gbm.multiple_forms) + list=gbm.host_olist.begin(); + else + list=ilist; + + gbm.atom->cast_x_data(host_x,host_type); + gbm.atom->cast_quat_data(host_quat[0]); + gbm.hd_balancer.start_timer(); + gbm.atom->add_x_data(host_x,host_type); + gbm.atom->add_other_data(); + + _gb_gpu_gayberne(gbm,eflag,vflag); + gbm.atom->copy_answers(eflag,vflag,eatom,vatom,list); + gbm.hd_balancer.stop_timer(); + return list; +} + +int * gb_gpu_compute(const int timestep, const int ago, const int inum_full, + const int nall, double **host_x, int *host_type, + int *ilist, int *numj, int **firstneigh, + const bool eflag, const bool vflag, const bool eatom, + const bool vatom, int &host_start, const double cpu_time, + bool &success, double **host_quat) { + return _gb_gpu_compute(GBMF, timestep, ago, inum_full, nall, host_x, + host_type, ilist, numj, firstneigh, eflag, vflag, + eatom, vatom, host_start, cpu_time, success, + host_quat); +} + +// --------------------------------------------------------------------------- +// Return memory usage +// --------------------------------------------------------------------------- +double gb_gpu_bytes() { + return GBMF.host_memory_usage(); +} diff --git a/lib/gpu/gb_gpu.cu b/lib/gpu/gb_gpu.cu deleted file mode 100644 index f95e1cbd77..0000000000 --- a/lib/gpu/gb_gpu.cu +++ /dev/null @@ -1,595 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - Copyright (2003) Sandia Corporation. Under the terms of Contract - DE-AC04-94AL85000 with Sandia Corporation, the U.S. 
Government retains - certain rights in this software. This software is distributed under - the GNU General Public License. - - See the README file in the top-level LAMMPS directory. -------------------------------------------------------------------------- */ - -/* ---------------------------------------------------------------------- - Contributing authors: Mike Brown (SNL), wmbrown@sandia.gov - Peng Wang (Nvidia), penwang@nvidia.com - Paul Crozier (SNL), pscrozi@sandia.gov -------------------------------------------------------------------------- */ - -#include -#include -#include "nvc_macros.h" -#include "nvc_timer.h" -#include "nvc_device.h" -#include "gb_gpu_memory.cu" -#include "gb_gpu_kernel.h" - -using namespace std; - -static GB_GPU_Memory GBMF[MAX_GPU_THREADS]; -#define GBMT GB_GPU_Memory - -// --------------------------------------------------------------------------- -// Pack neighbors from dev_ij array into dev_nbor matrix for coalesced access -// -- Only pack neighbors matching the specified inclusive range of forms -// -- Only pack neighbors within cutoff -// --------------------------------------------------------------------------- -template -__global__ void kernel_pack_nbor(const vec4 *x_, int *dev_nbor, const int nbor_pitch, - const int start, const int inum, - const int *dev_ij, const int form_low, - const int form_high, const int nall) { - - // ii indexes the two interacting particles in gi - int ii=threadIdx.x+INT_MUL(blockIdx.x,blockDim.x)+start; - - if (ii=nall) - j%=nall; - vec4 jx=x_[j]; - int jtype=jx.w; - - if (_form_(itype,jtype)>=form_low && _form_(itype,jtype)<=form_high) { - // Compute r12; - numtyp rsq=jx.x-ix.x; - rsq*=rsq; - numtyp t=jx.y-ix.y; - rsq+=t*t; - t=jx.z-ix.z; - rsq+=t*t; - - if (rsq< _cutsq_(itype,jtype)) { - *nbor=j; - nbor+=nbor_pitch; - newj++; - } - } - } - *nbor_newj=newj; - } -} - -// --------------------------------------------------------------------------- -// Pack neighbors from dev_ij array into dev_nbor matrix for coalesced access -// -- Only pack neighbors matching the specified inclusive range of forms -// -- Only pack neighbors within cutoff -// -- Fast version of routine that uses shared memory for LJ constants -// --------------------------------------------------------------------------- -template -__global__ void kernel_pack_nbor_fast(const vec4 *x_, int *dev_nbor, const int nbor_pitch, - const int start, const int inum, - const int *dev_ij, const int form_low, - const int form_high, const int nall) { - - int ii=threadIdx.x; - __shared__ int form[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; - __shared__ numtyp cutsq[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; - if (ii(itype,jtype); - form[ii]=_form_(itype,jtype); - } - ii+=INT_MUL(blockIdx.x,blockDim.x)+start; - __syncthreads(); - - if (ii=nall) - j%=nall; - vec4 jx=x_[j]; - int jtype=jx.w; - int mtype=itype+jtype; - - if (form[mtype]>=form_low && form[mtype]<=form_high) { - // Compute r12; - numtyp rsq=jx.x-ix.x; - rsq*=rsq; - numtyp t=jx.y-ix.y; - rsq+=t*t; - t=jx.z-ix.z; - rsq+=t*t; - - if (rsq -void pack_nbors(GBMT &gbm, const int GX, const int BX, const int start, - const int inum, const int form_low, const int form_high) { - if (gbm.shared_types) { - kernel_pack_nbor_fast<<>> - ((vec4 *)gbm.atom.dev_x.begin(),gbm.nbor.dev_nbor.begin(), - gbm.atom.inum(), start, inum, - gbm.nbor.ij.begin(),form_low,form_high,gbm.atom.nall()); - } else - kernel_pack_nbor<<>> - ((vec4 *)gbm.atom.dev_x.begin(),gbm.nbor.dev_nbor.begin(), - gbm.atom.inum(), start, inum, - 
gbm.nbor.ij.begin(),form_low,form_high,gbm.atom.nall()); -} - -// --------------------------------------------------------------------------- -// Convert something to a string -// --------------------------------------------------------------------------- -#include -template -inline string gb_gpu_toa(const t& in) { - ostringstream o; - o.precision(2); - o << in; - return o.str(); -} - -// --------------------------------------------------------------------------- -// Return string with GPU info -// --------------------------------------------------------------------------- -EXTERN void gb_gpu_name(const int id, const int max_nbors, char * name) { - string sname=GBMF[0].gpu.name(id)+", "+ - gb_gpu_toa(GBMF[0].gpu.cores(id))+" cores, "+ - gb_gpu_toa(GBMF[0].gpu.gigabytes(id))+" GB, "+ - gb_gpu_toa(GBMF[0].gpu.clock_rate(id))+" GHZ"; - strcpy(name,sname.c_str()); -} - -// --------------------------------------------------------------------------- -// Allocate memory on host and device and copy constants to device -// --------------------------------------------------------------------------- -EXTERN bool gb_gpu_init(int &ij_size, const int ntypes, const double gamma, - const double upsilon, const double mu, double **shape, - double **well, double **cutsq, double **sigma, - double **epsilon, double *host_lshape, int **form, - double **host_lj1, double **host_lj2, double **host_lj3, - double **host_lj4, double **offset, double *special_lj, - const int nlocal, const int nall, - const int max_nbors, const int thread, const int gpu_id) { - assert(thread -inline void _gb_gpu_atom(PairGPUAtom &atom, double **host_x, - double **host_quat, const int *host_type, - const bool rebuild, cudaStream_t &stream) { - atom.time_atom.start(); - atom.reset_write_buffer(); - - // Rows 1-3 of dev_x are position; rows 4-7 are quaternion - atom.add_x_data(host_x,host_type); - atom.add_q_data(host_quat[0]); - - atom.copy_x_data(stream); - atom.copy_q_data(stream); - atom.time_atom.stop(); -} - -EXTERN void gb_gpu_atom(double **host_x, double **host_quat, - const int *host_type, const bool rebuild, - const int thread) { - _gb_gpu_atom(GBMF[thread].atom, host_x, host_quat, host_type, rebuild, - GBMF[thread].pair_stream); -} - -// --------------------------------------------------------------------------- -// Signal that we need to transfer a new neighbor list -// --------------------------------------------------------------------------- -template -int * _gb_gpu_reset_nbors(gbmtyp &gbm, const int nall, const int nlocal, - const int inum, int *ilist, const int *numj, - const int *type, bool &success) { - success=true; - - gbm.nbor.time_nbor.start(); - - int mn=0; - for (int i=0; igbm.max_atoms) - gbm.resize_atom(nall,success); - if (nlocal>gbm.max_local || mn>gbm._max_nbors) - gbm.resize_local(nlocal,mn,success); - if (!success) - return false; - - gbm.atom.nall(nall); - gbm.atom.inum(inum); - - if (gbm.multiple_forms) { - int ij_size=gbm.nbor.host_ij.numel(); - if (inum*20) { - gbm.nbor.host_ij.copy_to_device(gbm.nbor.dev_nbor.begin()+inum+offset, - hi,gbm.pair_stream); - gbm.nbor.host_ij.copy_to_device(half,gbm.nbor.dev_nbor.begin()+ - inum*2+offset, - hi,gbm.pair_stream); - } - gbm.nbor.ij_total=0; - } - } else { - gbm.nbor.reset(inum,ilist,numj,gbm.pair_stream); - gbm.last_ellipse=inum; - } - - gbm.nbor.time_nbor.stop(); - - if (gbm.multiple_forms) - return gbm.host_olist.begin(); - return ilist; -} - -EXTERN int * gb_gpu_reset_nbors(const int nall, const int nlocal, const int inum, - int *ilist, const int *numj, 
const int *type, - const int thread, bool &success) { - return _gb_gpu_reset_nbors(GBMF[thread],nall,nlocal,inum,ilist,numj,type, - success); -} - -// --------------------------------------------------------------------------- -// Copy a set of ij_size ij interactions to device and compute energies, -// forces, and torques for those interactions -// --------------------------------------------------------------------------- -template -void _gb_gpu_nbors(gbmtyp &gbm, const int *ij, const int num_ij, - const bool eflag) { - gbm.nbor.time_nbor.add_to_total(); - // CUDA_SAFE_CALL(cudaStreamSynchronize(gbm.pair_stream)); // Not if timed - - memcpy(gbm.nbor.host_ij.begin(),ij,num_ij*sizeof(int)); - gbm.nbor.time_nbor.start(); - gbm.nbor.add(num_ij,gbm.pair_stream); - gbm.nbor.time_nbor.stop(); -} - -EXTERN void gb_gpu_nbors(const int *ij, const int num_ij, const bool eflag, - const int thread) { - _gb_gpu_nbors(GBMF[thread],ij,num_ij,eflag); -} - - -template -void _gb_gpu_enqueue(GBMT &gbm, const bool eflag, const bool vflag) { - gbm.atom.time_answer.start(); - gbm.atom.copy_answers(eflag,vflag,gbm.pair_stream); - gbm.atom.time_answer.stop(); -} - -// --------------------------------------------------------------------------- -// Calculate energies, forces, and torques for all ij interactions -// --------------------------------------------------------------------------- -template -void _gb_gpu_gayberne(GBMT &gbm, const bool eflag, const bool vflag, - const bool rebuild) { - // Compute the block size and grid size to keep all cores busy - const int BX=BLOCK_1D; - int ans_pitch=6; - if (eflag) - ans_pitch++; - if (vflag) - ans_pitch+=6; - - int GX=static_cast(ceil(static_cast(gbm.atom.inum())/BX)); - - if (gbm.multiple_forms) { - gbm.time_kernel.start(); - if (gbm.last_ellipse>0) { - // ------------ ELLIPSE_ELLIPSE and ELLIPSE_SPHERE --------------- - GX=static_cast(ceil(static_cast(gbm.last_ellipse)/ - static_cast(BX))); - pack_nbors(gbm,GX,BX, 0, gbm.last_ellipse,SPHERE_ELLIPSE,ELLIPSE_ELLIPSE); - gbm.time_kernel.stop(); - - gbm.time_gayberne.start(); - kernel_gayberne<<>> - ((vec4*)gbm.atom.dev_x.begin(), (vec4*)gbm.atom.dev_q.begin(), - gbm.gamma_upsilon_mu.begin(), gbm.special_lj.begin(), - gbm.nbor.dev_nbor.begin(), gbm.atom.inum(), - gbm.atom.ans.begin(), ans_pitch,gbm.dev_error.begin(), - eflag, vflag, gbm.last_ellipse, gbm.atom.nall()); - gbm.time_gayberne.stop(); - - if (gbm.last_ellipse==gbm.atom.inum()) { - gbm.time_kernel2.start(); - gbm.time_kernel2.stop(); - gbm.time_gayberne2.start(); - gbm.time_gayberne2.stop(); - gbm.time_pair.start(); - gbm.time_pair.stop(); - return; - } - - // ------------ SPHERE_ELLIPSE --------------- - - gbm.time_kernel2.start(); - GX=static_cast(ceil(static_cast(gbm.atom.inum()- - gbm.last_ellipse)/BX)); - pack_nbors(gbm,GX,BX,gbm.last_ellipse,gbm.atom.inum(),ELLIPSE_SPHERE, - ELLIPSE_SPHERE); - gbm.time_kernel2.stop(); - - gbm.time_gayberne2.start(); - kernel_sphere_gb<<>> - ((vec4*)gbm.atom.dev_x.begin(), (vec4*)gbm.atom.dev_q.begin(), - gbm.gamma_upsilon_mu.begin(), gbm.special_lj.begin(), - gbm.nbor.dev_nbor.begin(), gbm.atom.inum(), - gbm.atom.ans.begin(), ans_pitch,gbm.dev_error.begin(), - eflag, vflag, gbm.last_ellipse, gbm.atom.inum(), gbm.atom.nall()); - gbm.time_gayberne2.stop(); - } else { - gbm.atom.ans.zero(); - gbm.time_kernel.stop(); - gbm.time_gayberne.start(); - gbm.time_gayberne.stop(); - gbm.time_kernel2.start(); - gbm.time_kernel2.stop(); - gbm.time_gayberne2.start(); - gbm.time_gayberne2.stop(); - } - - // ------------ LJ 
--------------- - gbm.time_pair.start(); - if (gbm.last_ellipse<<>> - ((vec4*)gbm.atom.dev_x.begin(), gbm.special_lj.begin(), - gbm.nbor.dev_nbor.begin(), gbm.atom.inum(), gbm.nbor.ij.begin(), - gbm.atom.ans.begin(), ans_pitch, gbm.dev_error.begin(), eflag, - vflag, gbm.last_ellipse, gbm.atom.inum(), gbm.atom.nall()); - else - kernel_lj<<>> - ((vec4*)gbm.atom.dev_x.begin(), gbm.special_lj.begin(), - gbm.nbor.dev_nbor.begin(), gbm.atom.inum(), gbm.nbor.ij.begin(), - gbm.atom.ans.begin(), ans_pitch,gbm.dev_error.begin(), - eflag, vflag, gbm.last_ellipse, gbm.atom.inum(), gbm.atom.nall()); - } - gbm.time_pair.stop(); - } else { - gbm.time_kernel.start(); - pack_nbors(gbm, GX, BX, 0, gbm.atom.inum(),SPHERE_SPHERE,ELLIPSE_ELLIPSE); - gbm.time_kernel.stop(); - - gbm.time_gayberne.start(); - kernel_gayberne<<>> - ((vec4*)gbm.atom.dev_x.begin(), (vec4*)gbm.atom.dev_q.begin(), - gbm.gamma_upsilon_mu.begin(), gbm.special_lj.begin(), - gbm.nbor.dev_nbor.begin(), gbm.atom.inum(), - gbm.atom.ans.begin(), ans_pitch, gbm.dev_error.begin(), - eflag, vflag, gbm.atom.inum(), gbm.atom.nall()); - gbm.time_gayberne.stop(); - } -} - -EXTERN void gb_gpu_gayberne(const bool eflag, const bool vflag, const bool rebuild, - const int thread) { - _gb_gpu_gayberne(GBMF[thread],eflag,vflag,rebuild); - _gb_gpu_enqueue(GBMF[thread],eflag,vflag); -} - -// --------------------------------------------------------------------------- -// Get energies, forces, and torques to host -// --------------------------------------------------------------------------- -template -double _gb_gpu_forces(GBMT &gbm, double **f, double **tor, const int *ilist, - const bool eflag, const bool vflag, const bool eflag_atom, - const bool vflag_atom, double *eatom, double **vatom, - double *virial) { - double evdw; - - gbm.atom.time_atom.add_to_total(); - gbm.nbor.time_nbor.add_to_total(); - gbm.time_kernel.add_to_total(); - gbm.time_gayberne.add_to_total(); - if (gbm.multiple_forms) { - gbm.time_kernel2.add_to_total(); - gbm.time_gayberne2.add_to_total(); - gbm.time_pair.add_to_total(); - } - CUDA_SAFE_CALL(cudaStreamSynchronize(gbm.pair_stream)); - if (gbm.last_ellipse>gbm.atom.inum()) { - if (eflag || vflag) - evdw=gbm.atom.energy_virial(ilist,eflag_atom,vflag_atom,eatom,vatom,virial, - f,tor,gbm.atom.inum()); - else - gbm.atom.copy_asphere(ilist,f,tor,gbm.atom.inum()); - } else { - if (eflag || vflag) - evdw=gbm.atom.energy_virial(ilist,eflag_atom,vflag_atom,eatom,vatom,virial, - f,tor,gbm.last_ellipse); - else - gbm.atom.copy_asphere(ilist,f,tor,gbm.last_ellipse); - } - gbm.atom.time_answer.add_to_total(); - return evdw; -} - -EXTERN double gb_gpu_forces(double **f, double **tor, const int *ilist, - const bool eflag, const bool vflag, const bool eflag_atom, - const bool vflag_atom, double *eatom, double **vatom, - double *virial, const int thread) { - return _gb_gpu_forces - (GBMF[thread],f,tor,ilist,eflag,vflag,eflag_atom, - vflag_atom,eatom,vatom,virial); -} - -EXTERN void gb_gpu_time(const int i) { - cout.precision(4); - cout << "Atom copy: " << GBMF[i].atom.time_atom.total_seconds() - << " s.\n" - << "Neighbor copy: " << GBMF[i].nbor.time_nbor.total_seconds() - << " s.\n" - << "Neighbor pack: " << GBMF[i].time_kernel.total_seconds()+ - GBMF[i].time_kernel2.total_seconds() << " s.\n" - << "Force calc: " << GBMF[i].time_gayberne.total_seconds()+ - GBMF[i].time_gayberne2.total_seconds()<< " s.\n"; - if (GBMF[i].multiple_forms) - cout << "LJ calc: " << GBMF[i].time_pair.total_seconds() << " s.\n"; - cout << "Answer copy: " << 
GBMF[i].atom.time_answer.total_seconds() - << " s.\n"; -} - -EXTERN int gb_gpu_num_devices() { - return GBMF[0].gpu.num_devices(); -} - -EXTERN double gb_gpu_bytes() { - return GBMF[0].host_memory_usage(); -} - diff --git a/lib/gpu/gb_gpu_extra.h b/lib/gpu/gb_gpu_extra.h index 1e51389197..6ac390437a 100644 --- a/lib/gpu/gb_gpu_extra.h +++ b/lib/gpu/gb_gpu_extra.h @@ -12,44 +12,60 @@ ------------------------------------------------------------------------- */ /* ---------------------------------------------------------------------- - Contributing authors: Mike Brown (SNL), wmbrown@sandia.gov - Peng Wang (Nvidia), penwang@nvidia.com - Paul Crozier (SNL), pscrozi@sandia.gov + Contributing authors: Mike Brown (ORNL), brownw@ornl.gov ------------------------------------------------------------------------- */ #ifndef GB_GPU_EXTRA_H #define GB_GPU_EXTRA_H -#include "math.h" -#include "stdio.h" -#include "string.h" +#define MAX_SHARED_TYPES 8 +enum{SPHERE_SPHERE,SPHERE_ELLIPSE,ELLIPSE_SPHERE,ELLIPSE_ELLIPSE}; -/* ---------------------------------------------------------------------- - Atomic update of global memory -------------------------------------------------------------------------- */ -/* -template __device__ -inline void atomicAdd(numtyp *address, numtyp val); +#ifdef _DOUBLE_DOUBLE +#define numtyp double +#define numtyp2 double2 +#define numtyp4 double4 +#define acctyp double +#define acctyp4 double4 +#endif -template <> -__device__ inline void atomicAdd(float *address, float val) -{ - int i_val = __float_as_int(val); - int tmp0 = 0; - int tmp1; +#ifdef _SINGLE_DOUBLE +#define numtyp float +#define numtyp2 float2 +#define numtyp4 float4 +#define acctyp double +#define acctyp4 double4 +#endif - while( (tmp1 = atomicCAS((int *)address, tmp0, i_val)) != tmp0) { - tmp0 = tmp1; - i_val = __float_as_int(val + __int_as_float(tmp1)); - } -}*/ +#ifndef numtyp +#define numtyp float +#define numtyp2 float2 +#define numtyp4 float4 +#define acctyp float +#define acctyp4 float4 +#endif + +#ifdef NV_KERNEL + +#include "geryon/ucl_nv_kernel.h" + +#else + +#pragma OPENCL EXTENSION cl_khr_fp64: enable +#define GLOBAL_ID_X get_global_id(0) +#define THREAD_ID_X get_local_id(0) +#define BLOCK_ID_X get_group_id(0) +#define BLOCK_SIZE_X get_local_size(0) +#define __syncthreads() barrier(CLK_LOCAL_MEM_FENCE) +#define __inline inline + +#endif /* ---------------------------------------------------------------------- dot product of 2 vectors ------------------------------------------------------------------------- */ -template -static __inline__ __device__ numtyp gpu_dot3(const numtyp *v1, const numtyp *v2) +__inline numtyp gpu_dot3(const numtyp *v1, const numtyp *v2) { return v1[0]*v2[0]+v1[1]*v2[1]+v1[2]*v2[2]; } @@ -58,9 +74,7 @@ static __inline__ __device__ numtyp gpu_dot3(const numtyp *v1, const numtyp *v2) cross product of 2 vectors ------------------------------------------------------------------------- */ -template -static __inline__ __device__ void gpu_cross3(const numtyp *v1, - const numtyp *v2, numtyp *ans) +__inline void gpu_cross3(const numtyp *v1, const numtyp *v2, numtyp *ans) { ans[0] = v1[1]*v2[2]-v1[2]*v2[1]; ans[1] = v1[2]*v2[0]-v1[0]*v2[2]; @@ -71,8 +85,7 @@ static __inline__ __device__ void gpu_cross3(const numtyp *v1, determinant of a matrix ------------------------------------------------------------------------- */ -template -static __inline__ __device__ numtyp gpu_det3(const numtyp m[9]) +__inline numtyp gpu_det3(const numtyp m[9]) { numtyp ans = m[0]*m[4]*m[8] - m[0]*m[5]*m[7] - 
m[3]*m[1]*m[8] + m[3]*m[2]*m[7] + @@ -84,47 +97,25 @@ static __inline__ __device__ numtyp gpu_det3(const numtyp m[9]) diagonal matrix times a full matrix ------------------------------------------------------------------------- */ -template -static __inline__ __device__ void gpu_well_times3(const int i, const numtyp m[9], - numtyp ans[9]) +__inline void gpu_times3(const numtyp4 shape, const numtyp m[9], + numtyp ans[9]) { - ans[0] = _well_(i,0)*m[0]; - ans[1] = _well_(i,0)*m[1]; - ans[2] = _well_(i,0)*m[2]; - ans[3] = _well_(i,1)*m[3]; - ans[4] = _well_(i,1)*m[4]; - ans[5] = _well_(i,1)*m[5]; - ans[6] = _well_(i,2)*m[6]; - ans[7] = _well_(i,2)*m[7]; - ans[8] = _well_(i,2)*m[8]; -} - -/* ---------------------------------------------------------------------- - diagonal matrix times a full matrix -------------------------------------------------------------------------- */ - -template -static __inline__ __device__ void gpu_shape_times3(const int i, const numtyp m[9], - numtyp ans[9]) -{ - ans[0] = _shape_(i,0)*m[0]; - ans[1] = _shape_(i,0)*m[1]; - ans[2] = _shape_(i,0)*m[2]; - ans[3] = _shape_(i,1)*m[3]; - ans[4] = _shape_(i,1)*m[4]; - ans[5] = _shape_(i,1)*m[5]; - ans[6] = _shape_(i,2)*m[6]; - ans[7] = _shape_(i,2)*m[7]; - ans[8] = _shape_(i,2)*m[8]; + ans[0] = shape.x*m[0]; + ans[1] = shape.x*m[1]; + ans[2] = shape.x*m[2]; + ans[3] = shape.y*m[3]; + ans[4] = shape.y*m[4]; + ans[5] = shape.y*m[5]; + ans[6] = shape.z*m[6]; + ans[7] = shape.z*m[7]; + ans[8] = shape.z*m[8]; } /* ---------------------------------------------------------------------- add two matrices ------------------------------------------------------------------------- */ -template -static __inline__ __device__ void gpu_plus3(const numtyp m[9], - const numtyp m2[9], numtyp ans[9]) +__inline void gpu_plus3(const numtyp m[9], const numtyp m2[9], numtyp ans[9]) { ans[0] = m[0]+m2[0]; ans[1] = m[1]+m2[1]; @@ -141,10 +132,8 @@ static __inline__ __device__ void gpu_plus3(const numtyp m[9], multiply the transpose of mat1 times mat2 ------------------------------------------------------------------------- */ -template -static __inline__ __device__ void gpu_transpose_times3(const numtyp m[9], - const numtyp m2[9], - numtyp ans[9]) +__inline void gpu_transpose_times3(const numtyp m[9], const numtyp m2[9], + numtyp ans[9]) { ans[0] = m[0]*m2[0]+m[3]*m2[3]+m[6]*m2[6]; ans[1] = m[0]*m2[1]+m[3]*m2[4]+m[6]*m2[7]; @@ -161,9 +150,7 @@ static __inline__ __device__ void gpu_transpose_times3(const numtyp m[9], row vector times matrix ------------------------------------------------------------------------- */ -template -static __inline__ __device__ void gpu_row_times3(const numtyp *v, - const numtyp m[9], numtyp *ans) +__inline void gpu_row_times3(const numtyp *v, const numtyp m[9], numtyp *ans) { ans[0] = m[0]*v[0]+v[1]*m[3]+v[2]*m[6]; ans[1] = v[0]*m[1]+m[4]*v[1]+v[2]*m[7]; @@ -176,10 +163,8 @@ static __inline__ __device__ void gpu_row_times3(const numtyp *v, error_flag set to 2 if bad matrix inversion attempted ------------------------------------------------------------------------- */ -template -static __inline__ __device__ void gpu_mldivide3(const numtyp m[9], - const numtyp *v, numtyp *ans, - int *error_flag) +__inline void gpu_mldivide3(const numtyp m[9], const numtyp *v, numtyp *ans, + __global int *error_flag) { // create augmented matrix for pivoting @@ -297,12 +282,10 @@ static __inline__ __device__ void gpu_mldivide3(const numtyp m[9], quat = [w i j k] ------------------------------------------------------------------------- */ 
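/* For reference: with a unit quaternion q = [w i j k], the standard rotation
   matrix is

       A = [ w^2+i^2-j^2-k^2    2(ij - wk)         2(ik + wj)
             2(ij + wk)         w^2-i^2+j^2-k^2    2(jk - wi)
             2(ik - wj)         2(jk + wi)         w^2-i^2-j^2+k^2 ]

   and gpu_quat_to_mat_trans() below stores its transpose (hence the _trans
   suffix) in the row-major array mat[9], i.e. mat[0..2] holds the first row
   of A^T. */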
-template -static __inline__ __device__ void gpu_quat_to_mat_trans(const vec4 *qif, - const int qi, - numtyp mat[9]) +__inline void gpu_quat_to_mat_trans(__global const numtyp4 *qif, const int qi, + numtyp mat[9]) { - vec4 q=qif[qi]; + numtyp4 q=qif[qi]; numtyp w2 = q.x*q.x; numtyp i2 = q.y*q.y; diff --git a/lib/gpu/gb_gpu_kernel.cu b/lib/gpu/gb_gpu_kernel.cu new file mode 100644 index 0000000000..41174660d3 --- /dev/null +++ b/lib/gpu/gb_gpu_kernel.cu @@ -0,0 +1,383 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing authors: Mike Brown (ORNL), brownw@ornl.gov +------------------------------------------------------------------------- */ + +#ifndef GB_GPU_KERNEL +#define GB_GPU_KERNEL + +#ifdef NV_KERNEL +#include "gb_gpu_extra.h" +#endif + +__inline void compute_eta_torque(numtyp m[9],numtyp m2[9], const numtyp4 shape, + numtyp ans[9]) +{ + numtyp den = m[3]*m[2]*m[7]-m[0]*m[5]*m[7]- + m[2]*m[6]*m[4]+m[1]*m[6]*m[5]- + m[3]*m[1]*m[8]+m[0]*m[4]*m[8]; + den = (numtyp)1.0/den; + + ans[0] = shape.x*(m[5]*m[1]*m2[2]+(numtyp)2.0*m[4]*m[8]*m2[0]- + m[4]*m2[2]*m[2]-(numtyp)2.0*m[5]*m2[0]*m[7]+ + m2[1]*m[2]*m[7]-m2[1]*m[1]*m[8]- + m[3]*m[8]*m2[1]+m[6]*m[5]*m2[1]+ + m[3]*m2[2]*m[7]-m2[2]*m[6]*m[4])*den; + + ans[1] = shape.x*(m[2]*m2[0]*m[7]-m[8]*m2[0]*m[1]+ + (numtyp)2.0*m[0]*m[8]*m2[1]-m[0]*m2[2]*m[5]- + (numtyp)2.0*m[6]*m[2]*m2[1]+m2[2]*m[3]*m[2]- + m[8]*m[3]*m2[0]+m[6]*m2[0]*m[5]+ + m[6]*m2[2]*m[1]-m2[2]*m[0]*m[7])*den; + + ans[2] = shape.x*(m[1]*m[5]*m2[0]-m[2]*m2[0]*m[4]- + m[0]*m[5]*m2[1]+m[3]*m[2]*m2[1]- + m2[1]*m[0]*m[7]-m[6]*m[4]*m2[0]+ + (numtyp)2.0*m[4]*m[0]*m2[2]-(numtyp)2.0*m[3]*m2[2]*m[1]+ + m[3]*m[7]*m2[0]+m[6]*m2[1]*m[1])*den; + + ans[3] = shape.y*(-m[4]*m2[5]*m[2]+(numtyp)2.0*m[4]*m[8]*m2[3]+ + m[5]*m[1]*m2[5]-(numtyp)2.0*m[5]*m2[3]*m[7]+ + m2[4]*m[2]*m[7]-m2[4]*m[1]*m[8]- + m[3]*m[8]*m2[4]+m[6]*m[5]*m2[4]- + m2[5]*m[6]*m[4]+m[3]*m2[5]*m[7])*den; + + ans[4] = shape.y*(m[2]*m2[3]*m[7]-m[1]*m[8]*m2[3]+ + (numtyp)2.0*m[8]*m[0]*m2[4]-m2[5]*m[0]*m[5]- + (numtyp)2.0*m[6]*m2[4]*m[2]-m[3]*m[8]*m2[3]+ + m[6]*m[5]*m2[3]+m[3]*m2[5]*m[2]- + m[0]*m2[5]*m[7]+m2[5]*m[1]*m[6])*den; + + ans[5] = shape.y*(m[1]*m[5]*m2[3]-m[2]*m2[3]*m[4]- + m[0]*m[5]*m2[4]+m[3]*m[2]*m2[4]+ + (numtyp)2.0*m[4]*m[0]*m2[5]-m[0]*m2[4]*m[7]+ + m[1]*m[6]*m2[4]-m2[3]*m[6]*m[4]- + (numtyp)2.0*m[3]*m[1]*m2[5]+m[3]*m2[3]*m[7])*den; + + ans[6] = shape.z*(-m[4]*m[2]*m2[8]+m[1]*m[5]*m2[8]+ + (numtyp)2.0*m[4]*m2[6]*m[8]-m[1]*m2[7]*m[8]+ + m[2]*m[7]*m2[7]-(numtyp)2.0*m2[6]*m[7]*m[5]- + m[3]*m2[7]*m[8]+m[5]*m[6]*m2[7]- + m[4]*m[6]*m2[8]+m[7]*m[3]*m2[8])*den; + + ans[7] = shape.z*-(m[1]*m[8]*m2[6]-m[2]*m2[6]*m[7]- + (numtyp)2.0*m2[7]*m[0]*m[8]+m[5]*m2[8]*m[0]+ + (numtyp)2.0*m2[7]*m[2]*m[6]+m[3]*m2[6]*m[8]- + m[3]*m[2]*m2[8]-m[5]*m[6]*m2[6]+ + m[0]*m2[8]*m[7]-m2[8]*m[1]*m[6])*den; + + ans[8] = shape.z*(m[1]*m[5]*m2[6]-m[2]*m2[6]*m[4]- + m[0]*m[5]*m2[7]+m[3]*m[2]*m2[7]- + 
m[4]*m[6]*m2[6]-m[7]*m2[7]*m[0]+ + (numtyp)2.0*m[4]*m2[8]*m[0]+m[7]*m[3]*m2[6]+ + m[6]*m[1]*m2[7]-(numtyp)2.0*m2[8]*m[3]*m[1])*den; +} + +__kernel void kernel_gayberne(__global numtyp4* x_,__global numtyp4 *q, + __global numtyp4* shape, __global numtyp4* well, + __global numtyp *gum, __global numtyp2* sig_eps, + const int ntypes, __global numtyp *lshape, + __global int *dev_nbor, const int stride, + __global acctyp4 *ans, const int astride, + __global acctyp *engv, __global int *err_flag, + const int eflag, const int vflag, const int inum, + const int nall) { + __local numtyp sp_lj[4]; + + // ii indexes the two interacting particles in gi + int ii=THREAD_ID_X; + if (ii<4) + sp_lj[ii]=gum[ii+3]; + ii+=mul24((int)BLOCK_ID_X,(int)BLOCK_SIZE_X); + __syncthreads(); + + if (ii0) + energy+=u_r*temp2; + numtyp temp1 = -eta*u_r*factor_lj; + if (vflag>0) { + r12[0]*=-r; + r12[1]*=-r; + r12[2]*=-r; + numtyp ft=temp1*dchi[0]-temp2*dUr[0]; + f.x+=ft; + virial[0]+=r12[0]*ft; + ft=temp1*dchi[1]-temp2*dUr[1]; + f.y+=ft; + virial[1]+=r12[1]*ft; + virial[3]+=r12[0]*ft; + ft=temp1*dchi[2]-temp2*dUr[2]; + f.z+=ft; + virial[2]+=r12[2]*ft; + virial[4]+=r12[0]*ft; + virial[5]+=r12[1]*ft; + } else { + f.x+=temp1*dchi[0]-temp2*dUr[0]; + f.y+=temp1*dchi[1]-temp2*dUr[1]; + f.z+=temp1*dchi[2]-temp2*dUr[2]; + } + + // Torque on 1 + temp1 = -u_r*eta*factor_lj; + temp2 = -u_r*chi*factor_lj; + numtyp temp3 = -chi*eta*factor_lj; + tor.x+=temp1*tchi[0]+temp2*teta[0]+temp3*tUr[0]; + tor.y+=temp1*tchi[1]+temp2*teta[1]+temp3*tUr[1]; + tor.z+=temp1*tchi[2]+temp2*teta[2]+temp3*tUr[2]; + + } // for nbor + + // Store answers + __global acctyp *ap1=engv+ii; + if (eflag>0) { + *ap1=energy; + ap1+=astride; + } + if (vflag>0) { + for (int i=0; i<6; i++) { + *ap1=virial[i]; + ap1+=astride; + } + } + ans[ii]=f; + ans[ii+astride]=tor; + } // if ii +} + +#endif + diff --git a/lib/gpu/gb_gpu_kernel.h b/lib/gpu/gb_gpu_kernel.h deleted file mode 100644 index b3f8b3ccc0..0000000000 --- a/lib/gpu/gb_gpu_kernel.h +++ /dev/null @@ -1,863 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - Copyright (2003) Sandia Corporation. Under the terms of Contract - DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains - certain rights in this software. This software is distributed under - the GNU General Public License. - - See the README file in the top-level LAMMPS directory. 
-------------------------------------------------------------------------- */ - -/* ---------------------------------------------------------------------- - Contributing authors: Mike Brown (SNL), wmbrown@sandia.gov - Peng Wang (Nvidia), penwang@nvidia.com - Paul Crozier (SNL), pscrozi@sandia.gov -------------------------------------------------------------------------- */ - -#ifndef GB_GPU_KERNEL -#define GB_GPU_KERNEL - -#include "gb_gpu_extra.h" - -template -static __inline__ __device__ void compute_eta_torque(numtyp m[9], - numtyp m2[9], - const int i, - numtyp ans[9]) -{ - numtyp den = m[3]*m[2]*m[7]-m[0]*m[5]*m[7]- - m[2]*m[6]*m[4]+m[1]*m[6]*m[5]- - m[3]*m[1]*m[8]+m[0]*m[4]*m[8]; - den = (numtyp)1.0/den; - - numtyp shapex=_shape_(i,0); - numtyp shapey=_shape_(i,1); - numtyp shapez=_shape_(i,2); - - ans[0] = shapex*(m[5]*m[1]*m2[2]+(numtyp)2.0*m[4]*m[8]*m2[0]- - m[4]*m2[2]*m[2]-(numtyp)2.0*m[5]*m2[0]*m[7]+ - m2[1]*m[2]*m[7]-m2[1]*m[1]*m[8]- - m[3]*m[8]*m2[1]+m[6]*m[5]*m2[1]+ - m[3]*m2[2]*m[7]-m2[2]*m[6]*m[4])*den; - - ans[1] = shapex*(m[2]*m2[0]*m[7]-m[8]*m2[0]*m[1]+ - (numtyp)2.0*m[0]*m[8]*m2[1]-m[0]*m2[2]*m[5]- - (numtyp)2.0*m[6]*m[2]*m2[1]+m2[2]*m[3]*m[2]- - m[8]*m[3]*m2[0]+m[6]*m2[0]*m[5]+ - m[6]*m2[2]*m[1]-m2[2]*m[0]*m[7])*den; - - ans[2] = shapex*(m[1]*m[5]*m2[0]-m[2]*m2[0]*m[4]- - m[0]*m[5]*m2[1]+m[3]*m[2]*m2[1]- - m2[1]*m[0]*m[7]-m[6]*m[4]*m2[0]+ - (numtyp)2.0*m[4]*m[0]*m2[2]-(numtyp)2.0*m[3]*m2[2]*m[1]+ - m[3]*m[7]*m2[0]+m[6]*m2[1]*m[1])*den; - - ans[3] = shapey*(-m[4]*m2[5]*m[2]+(numtyp)2.0*m[4]*m[8]*m2[3]+ - m[5]*m[1]*m2[5]-(numtyp)2.0*m[5]*m2[3]*m[7]+ - m2[4]*m[2]*m[7]-m2[4]*m[1]*m[8]- - m[3]*m[8]*m2[4]+m[6]*m[5]*m2[4]- - m2[5]*m[6]*m[4]+m[3]*m2[5]*m[7])*den; - - ans[4] = shapey*(m[2]*m2[3]*m[7]-m[1]*m[8]*m2[3]+ - (numtyp)2.0*m[8]*m[0]*m2[4]-m2[5]*m[0]*m[5]- - (numtyp)2.0*m[6]*m2[4]*m[2]-m[3]*m[8]*m2[3]+ - m[6]*m[5]*m2[3]+m[3]*m2[5]*m[2]- - m[0]*m2[5]*m[7]+m2[5]*m[1]*m[6])*den; - - ans[5] = shapey*(m[1]*m[5]*m2[3]-m[2]*m2[3]*m[4]- - m[0]*m[5]*m2[4]+m[3]*m[2]*m2[4]+ - (numtyp)2.0*m[4]*m[0]*m2[5]-m[0]*m2[4]*m[7]+ - m[1]*m[6]*m2[4]-m2[3]*m[6]*m[4]- - (numtyp)2.0*m[3]*m[1]*m2[5]+m[3]*m2[3]*m[7])*den; - - ans[6] = shapez*(-m[4]*m[2]*m2[8]+m[1]*m[5]*m2[8]+ - (numtyp)2.0*m[4]*m2[6]*m[8]-m[1]*m2[7]*m[8]+ - m[2]*m[7]*m2[7]-(numtyp)2.0*m2[6]*m[7]*m[5]- - m[3]*m2[7]*m[8]+m[5]*m[6]*m2[7]- - m[4]*m[6]*m2[8]+m[7]*m[3]*m2[8])*den; - - ans[7] = shapez*-(m[1]*m[8]*m2[6]-m[2]*m2[6]*m[7]- - (numtyp)2.0*m2[7]*m[0]*m[8]+m[5]*m2[8]*m[0]+ - (numtyp)2.0*m2[7]*m[2]*m[6]+m[3]*m2[6]*m[8]- - m[3]*m[2]*m2[8]-m[5]*m[6]*m2[6]+ - m[0]*m2[8]*m[7]-m2[8]*m[1]*m[6])*den; - - ans[8] = shapez*(m[1]*m[5]*m2[6]-m[2]*m2[6]*m[4]- - m[0]*m[5]*m2[7]+m[3]*m[2]*m2[7]- - m[4]*m[6]*m2[6]-m[7]*m2[7]*m[0]+ - (numtyp)2.0*m[4]*m2[8]*m[0]+m[7]*m[3]*m2[6]+ - m[6]*m[1]*m2[7]-(numtyp)2.0*m2[8]*m[3]*m[1])*den; -} - -#include "gb_gpu_kernel.h" - -template -__global__ void kernel_gayberne(const vec4* x_, const vec4 *q, - const numtyp *gum, const numtyp *special_lj, - const int *dev_nbor, const size_t nbor_pitch, - acctyp *ans, size_t ans_pitch, int *err_flag, - const bool eflag, const bool vflag, - const int inum, const int nall) { - - __shared__ numtyp sp_lj[4]; - - // ii indexes the two interacting particles in gi - int ii=threadIdx.x; - if (ii<4) - sp_lj[ii]=special_lj[ii]; - ii+=INT_MUL(blockIdx.x,blockDim.x); - __syncthreads(); - - if (ii(itype,jtype); - numtyp epsilon = _epsilon_(itype,jtype); - numtyp varrho = sigma/(h12+gum[0]*sigma); - numtyp varrho6 = varrho*varrho*varrho; - varrho6*=varrho6; - numtyp varrho12 = 
varrho6*varrho6; - u_r = (numtyp)4.0*epsilon*(varrho12-varrho6); - - numtyp temp1 = ((numtyp)2.0*varrho12*varrho-varrho6*varrho)/sigma; - temp1 = temp1*(numtyp)24.0*epsilon; - uslj_rsq = temp1*sigma12*sigma12*sigma12*(numtyp)0.5; - numtyp temp2 = gpu_dot3(kappa,r12); - uslj_rsq = uslj_rsq*ir*ir; - - dUr[0] = temp1*r12[0]+uslj_rsq*(kappa[0]-temp2*r12[0]); - dUr[1] = temp1*r12[1]+uslj_rsq*(kappa[1]-temp2*r12[1]); - dUr[2] = temp1*r12[2]+uslj_rsq*(kappa[2]-temp2*r12[2]); - } - - // torque for particle 1 - { - numtyp tempv[3], tempv2[3]; - tempv[0] = -uslj_rsq*kappa[0]; - tempv[1] = -uslj_rsq*kappa[1]; - tempv[2] = -uslj_rsq*kappa[2]; - gpu_row_times3(kappa,g1,tempv2); - gpu_cross3(tempv,tempv2,tUr); - } - } - } - - // Compute eta - { - eta = (numtyp)2.0*_lshape_(itype)*_lshape_(jtype); - numtyp det_g12 = gpu_det3(g12); - eta = pow(eta/det_g12,gum[1]); - } - - // Compute teta - numtyp temp[9], tempv[3], tempv2[3]; - compute_eta_torque(g12,a1,itype,temp); - numtyp temp1 = -eta*gum[1]; - - tempv[0] = temp1*temp[0]; - tempv[1] = temp1*temp[1]; - tempv[2] = temp1*temp[2]; - gpu_cross3(a1,tempv,tempv2); - teta[0] = tempv2[0]; - teta[1] = tempv2[1]; - teta[2] = tempv2[2]; - - tempv[0] = temp1*temp[3]; - tempv[1] = temp1*temp[4]; - tempv[2] = temp1*temp[5]; - gpu_cross3(a1+3,tempv,tempv2); - teta[0] += tempv2[0]; - teta[1] += tempv2[1]; - teta[2] += tempv2[2]; - - tempv[0] = temp1*temp[6]; - tempv[1] = temp1*temp[7]; - tempv[2] = temp1*temp[8]; - gpu_cross3(a1+6,tempv,tempv2); - teta[0] += tempv2[0]; - teta[1] += tempv2[1]; - teta[2] += tempv2[2]; - } - - numtyp chi, dchi[3], tchi[3]; - { // Compute chi and dchi - - // Compute b12 - numtyp b2[9], b12[9]; - { - gpu_well_times3(jtype,a2,b12); - gpu_transpose_times3(a2,b12,b2); - gpu_plus3(b1,b2,b12); - } - - // compute chi_12 - r12[0]*=r; - r12[1]*=r; - r12[2]*=r; - numtyp iota[3]; - gpu_mldivide3(b12,r12,iota,err_flag); - // -- iota is now iota/r - iota[0]*=ir; - iota[1]*=ir; - iota[2]*=ir; - r12[0]*=ir; - r12[1]*=ir; - r12[2]*=ir; - chi = gpu_dot3(r12,iota); - chi = pow(chi*(numtyp)2.0,gum[2]); - - // -- iota is now ok - iota[0]*=r; - iota[1]*=r; - iota[2]*=r; - - numtyp temp1 = gpu_dot3(iota,r12); - numtyp temp2 = (numtyp)-4.0*ir*ir*gum[2]*pow(chi,(gum[2]-(numtyp)1.0)/gum[2]); - dchi[0] = temp2*(iota[0]-temp1*r12[0]); - dchi[1] = temp2*(iota[1]-temp1*r12[1]); - dchi[2] = temp2*(iota[2]-temp1*r12[2]); - - // compute t_chi - numtyp tempv[3]; - gpu_row_times3(iota,b1,tempv); - gpu_cross3(tempv,iota,tchi); - temp1 = (numtyp)-4.0*ir*ir; - tchi[0] *= temp1; - tchi[1] *= temp1; - tchi[2] *= temp1; - } - - numtyp temp2 = factor_lj*eta*chi; - if (eflag) - energy+=u_r*temp2; - numtyp temp1 = -eta*u_r*factor_lj; - if (vflag) { - r12[0]*=-r; - r12[1]*=-r; - r12[2]*=-r; - numtyp ft=temp1*dchi[0]-temp2*dUr[0]; - fx+=ft; - virial[0]+=r12[0]*ft; - ft=temp1*dchi[1]-temp2*dUr[1]; - fy+=ft; - virial[1]+=r12[1]*ft; - virial[3]+=r12[0]*ft; - ft=temp1*dchi[2]-temp2*dUr[2]; - fz+=ft; - virial[2]+=r12[2]*ft; - virial[4]+=r12[0]*ft; - virial[5]+=r12[1]*ft; - } else { - fx+=temp1*dchi[0]-temp2*dUr[0]; - fy+=temp1*dchi[1]-temp2*dUr[1]; - fz+=temp1*dchi[2]-temp2*dUr[2]; - } - - // Torque on 1 - temp1 = -u_r*eta*factor_lj; - temp2 = -u_r*chi*factor_lj; - numtyp temp3 = -chi*eta*factor_lj; - torx+=temp1*tchi[0]+temp2*teta[0]+temp3*tUr[0]; - tory+=temp1*tchi[1]+temp2*teta[1]+temp3*tUr[1]; - torz+=temp1*tchi[2]+temp2*teta[2]+temp3*tUr[2]; - - } // for nbor - - // Store answers - acctyp *ap1=ans+ii*ans_pitch; - if (eflag) { - *ap1=energy; - ap1++; - } - if (vflag) { - for (int i=0; 
i<6; i++) { - *ap1=virial[i]; - ap1++; - } - } - *ap1=fx; - ap1++; - *ap1=fy; - ap1++; - *ap1=fz; - ap1++; - *ap1=torx; - ap1++; - *ap1=tory; - ap1++; - *ap1=torz; - - } // if ii - -} - -template -__global__ void kernel_sphere_gb(const vec4 *x_, const vec4 *q, - const numtyp *gum, const numtyp *special_lj, - const int *dev_nbor, const size_t nbor_pitch, - acctyp *ans, size_t ans_pitch, int *err_flag, - const bool eflag, const bool vflag, - const int start, const int inum, - const int nall) { - __shared__ numtyp sp_lj[4]; - - // ii indexes the two interacting particles in gi - int ii=threadIdx.x; - if (ii<4) - sp_lj[ii]=special_lj[ii]; - ii+=INT_MUL(blockIdx.x,blockDim.x)+start; - __syncthreads(); - - if (ii(itype,0); - numtyp one_well=_well_(itype,0); - - numtyp factor_lj; - for ( ; nbor(itype,jtype); - numtyp epsilon = _epsilon_(itype,jtype); - numtyp varrho = sigma/(h12+gum[0]*sigma); - numtyp varrho6 = varrho*varrho*varrho; - varrho6*=varrho6; - numtyp varrho12 = varrho6*varrho6; - u_r = (numtyp)4.0*epsilon*(varrho12-varrho6); - - numtyp temp1 = ((numtyp)2.0*varrho12*varrho-varrho6*varrho)/sigma; - temp1 = temp1*(numtyp)24.0*epsilon; - uslj_rsq = temp1*sigma12*sigma12*sigma12*(numtyp)0.5; - numtyp temp2 = gpu_dot3(kappa,r12hat); - uslj_rsq = uslj_rsq*ir*ir; - - dUr[0] = temp1*r12hat[0]+uslj_rsq*(kappa[0]-temp2*r12hat[0]); - dUr[1] = temp1*r12hat[1]+uslj_rsq*(kappa[1]-temp2*r12hat[1]); - dUr[2] = temp1*r12hat[2]+uslj_rsq*(kappa[2]-temp2*r12hat[2]); - } - } - } - - // Compute eta - { - eta = (numtyp)2.0*_lshape_(itype)*_lshape_(jtype); - numtyp det_g12 = gpu_det3(g12); - eta = pow(eta/det_g12,gum[1]); - } - } - - numtyp chi, dchi[3]; - { // Compute chi and dchi - - // Compute b12 - numtyp b12[9]; - { - numtyp b2[9]; - gpu_well_times3(jtype,a2,b12); - gpu_transpose_times3(a2,b12,b2); - b12[0]=b2[0]+one_well; - b12[4]=b2[4]+one_well; - b12[8]=b2[8]+one_well; - b12[1]=b2[1]; - b12[2]=b2[2]; - b12[3]=b2[3]; - b12[5]=b2[5]; - b12[6]=b2[6]; - b12[7]=b2[7]; - } - - // compute chi_12 - numtyp iota[3]; - gpu_mldivide3(b12,r12,iota,err_flag); - // -- iota is now iota/r - iota[0]*=ir; - iota[1]*=ir; - iota[2]*=ir; - chi = gpu_dot3(r12hat,iota); - chi = pow(chi*(numtyp)2.0,gum[2]); - - // -- iota is now ok - iota[0]*=r; - iota[1]*=r; - iota[2]*=r; - - numtyp temp1 = gpu_dot3(iota,r12hat); - numtyp temp2 = (numtyp)-4.0*ir*ir*gum[2]*pow(chi,(gum[2]-(numtyp)1.0)/gum[2]); - dchi[0] = temp2*(iota[0]-temp1*r12hat[0]); - dchi[1] = temp2*(iota[1]-temp1*r12hat[1]); - dchi[2] = temp2*(iota[2]-temp1*r12hat[2]); - } - - numtyp temp2 = factor_lj*eta*chi; - if (eflag) - energy+=u_r*temp2; - numtyp temp1 = -eta*u_r*factor_lj; - if (vflag) { - r12[0]*=-1; - r12[1]*=-1; - r12[2]*=-1; - numtyp ft=temp1*dchi[0]-temp2*dUr[0]; - fx+=ft; - virial[0]+=r12[0]*ft; - ft=temp1*dchi[1]-temp2*dUr[1]; - fy+=ft; - virial[1]+=r12[1]*ft; - virial[3]+=r12[0]*ft; - ft=temp1*dchi[2]-temp2*dUr[2]; - fz+=ft; - virial[2]+=r12[2]*ft; - virial[4]+=r12[0]*ft; - virial[5]+=r12[1]*ft; - } else { - fx+=temp1*dchi[0]-temp2*dUr[0]; - fy+=temp1*dchi[1]-temp2*dUr[1]; - fz+=temp1*dchi[2]-temp2*dUr[2]; - } - } // for nbor - - // Store answers - acctyp *ap1=ans+ii*ans_pitch; - if (eflag) { - *ap1=energy; - ap1++; - } - if (vflag) { - for (int i=0; i<6; i++) { - *ap1=virial[i]; - ap1++; - } - } - *ap1=fx; - ap1++; - *ap1=fy; - ap1++; - *ap1=fz; - } // if ii -} - -template -__global__ void kernel_lj(const vec4 *x_, - const numtyp *special_lj, const int *dev_nbor, - const size_t nbor_pitch, const int *dev_ij, acctyp *ans, - size_t ans_pitch, int 
*err_flag, const bool eflag, - const bool vflag, const int start, const int inum, - const int nall) { - __shared__ numtyp sp_lj[4]; - - // ii indexes the two interacting particles in gi - int ii=threadIdx.x; - if (ii<4) - sp_lj[ii]=special_lj[ii]; - ii+=INT_MUL(blockIdx.x,blockDim.x)+start; - __syncthreads(); - - if (ii(itype,jtype) && - _form_(itype,jtype)==SPHERE_SPHERE) { - r2inv=(numtyp)1.0/r2inv; - numtyp r6inv = r2inv*r2inv*r2inv; - numtyp force = r2inv*r6inv*(_lj1_(itype,jtype).x*r6inv- - _lj1_(itype,jtype).y); - force*=factor_lj; - - fx+=delx*force; - fy+=dely*force; - fz+=delz*force; - - if (eflag) { - numtyp e=r6inv*(_lj3_(itype,jtype).x*r6inv- - _lj3_(itype,jtype).y); - energy+=factor_lj*(e-_offset_(1,1)); - } - if (vflag) { - virial[0] += delx*delx*force; - virial[1] += dely*dely*force; - virial[2] += delz*delz*force; - virial[3] += delx*dely*force; - virial[4] += delx*delz*force; - virial[5] += dely*delz*force; - } - } - - } // for nbor - - // Store answers - acctyp *ap1=ans+ii*ans_pitch; - if (eflag) { - *ap1+=energy; - ap1++; - } - if (vflag) { - for (int i=0; i<6; i++) { - *ap1+=virial[i]; - ap1++; - } - } - *ap1+=fx; - ap1++; - *ap1+=fy; - ap1++; - *ap1+=fz; - - } // if ii -} - -template -__global__ void kernel_lj_fast(const vec4 *x_, - const numtyp *special_lj, const int *dev_nbor, - const size_t nbor_pitch, const int *dev_ij, - acctyp *ans, size_t ans_pitch,int *err_flag, - const bool eflag, const bool vflag, - const int start, const int inum, const int nall){ - // ii indexes the two interacting particles in gi - int ii=threadIdx.x; - __shared__ numtyp sp_lj[4]; - __shared__ int form[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; - __shared__ numtyp cutsq[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; - __shared__ numtyp lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; - __shared__ numtyp lj2[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; - __shared__ numtyp lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; - __shared__ numtyp lj4[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; - __shared__ numtyp offset[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; - if (ii<4) - sp_lj[ii]=special_lj[ii]; - if (ii(itype,jtype); - form[ii]=_form_(itype,jtype); - lj1[ii]=_lj1_(itype,jtype).x; - lj2[ii]=_lj1_(itype,jtype).y; - if (eflag) { - lj3[ii]=_lj3_(itype,jtype).x; - lj4[ii]=_lj3_(itype,jtype).y; - offset[ii]=_offset_(itype,jtype); - } - } - ii+=INT_MUL(blockIdx.x,blockDim.x)+start; - __syncthreads(); - - if (ii0) + energy+=u_r*temp2; + numtyp temp1 = -eta*u_r*factor_lj; + if (vflag>0) { + r12[0]*=-1; + r12[1]*=-1; + r12[2]*=-1; + numtyp ft=temp1*dchi[0]-temp2*dUr[0]; + f.x+=ft; + virial[0]+=r12[0]*ft; + ft=temp1*dchi[1]-temp2*dUr[1]; + f.y+=ft; + virial[1]+=r12[1]*ft; + virial[3]+=r12[0]*ft; + ft=temp1*dchi[2]-temp2*dUr[2]; + f.z+=ft; + virial[2]+=r12[2]*ft; + virial[4]+=r12[0]*ft; + virial[5]+=r12[1]*ft; + } else { + f.x+=temp1*dchi[0]-temp2*dUr[0]; + f.y+=temp1*dchi[1]-temp2*dUr[1]; + f.z+=temp1*dchi[2]-temp2*dUr[2]; + } + } // for nbor + + // Store answers + __global acctyp *ap1=engv+ii; + if (eflag>0) { + *ap1=energy; + ap1+=inum; + } + if (vflag>0) { + for (int i=0; i<6; i++) { + *ap1=virial[i]; + ap1+=inum; + } + } + ans[ii]=f; + } // if ii +} + +__kernel void kernel_lj(__global numtyp4 *x_, __global numtyp4 *lj1, + __global numtyp4* lj3, const int lj_types, + __global numtyp *gum, + const int stride, __global int *dev_ij, + __global acctyp4 *ans, __global acctyp *engv, + __global int *err_flag, const int eflag, + const int vflag, const int start, const int inum, + const int nall) { + __local numtyp sp_lj[4]; + + // ii indexes the two interacting 
particles in gi + int ii=THREAD_ID_X; + if (ii<4) + sp_lj[ii]=gum[ii+3]; + ii+=mul24((int)BLOCK_ID_X,(int)BLOCK_SIZE_X)+start; + __syncthreads(); + + if (ii0) { + numtyp e=r6inv*(lj3[ii].x*r6inv-lj3[ii].y); + energy+=factor_lj*(e-lj3[ii].z); + } + if (vflag>0) { + virial[0] += delx*delx*force; + virial[1] += dely*dely*force; + virial[2] += delz*delz*force; + virial[3] += delx*dely*force; + virial[4] += delx*delz*force; + virial[5] += dely*delz*force; + } + } + + } // for nbor + + // Store answers + __global acctyp *ap1=engv+ii; + if (eflag>0) { + *ap1+=energy; + ap1+=inum; + } + if (vflag>0) { + for (int i=0; i<6; i++) { + *ap1+=virial[i]; + ap1+=inum; + } + } + acctyp4 old=ans[ii]; + old.x+=f.x; + old.y+=f.y; + old.z+=f.z; + ans[ii]=old; + } // if ii +} + +__kernel void kernel_lj_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in, + __global numtyp4* lj3_in, __global numtyp *gum, + const int stride, + __global int *dev_ij, __global acctyp4 *ans, + __global acctyp *engv, __global int *err_flag, + const int eflag,const int vflag, const int start, + const int inum, const int nall) { + // ii indexes the two interacting particles in gi + int ii=THREAD_ID_X; + __local numtyp sp_lj[4]; + __local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; + __local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; + if (ii<4) + sp_lj[ii]=gum[ii+3]; + if (ii0) + lj3[ii]=lj3_in[ii]; + } + ii+=mul24((int)BLOCK_ID_X,(int)BLOCK_SIZE_X)+start; + __syncthreads(); + + if (ii0) { + numtyp e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y); + energy+=factor_lj*(e-lj3[mtype].z); + } + if (vflag>0) { + virial[0] += delx*delx*force; + virial[1] += dely*dely*force; + virial[2] += delz*delz*force; + virial[3] += delx*dely*force; + virial[4] += delx*delz*force; + virial[5] += dely*delz*force; + } + } + + } // for nbor + + // Store answers + __global acctyp *ap1=engv+ii; + if (eflag>0) { + *ap1+=energy; + ap1+=inum; + } + if (vflag>0) { + for (int i=0; i<6; i++) { + *ap1+=virial[i]; + ap1+=inum; + } + } + acctyp4 old=ans[ii]; + old.x+=f.x; + old.y+=f.y; + old.z+=f.z; + ans[ii]=old; + } // if ii +} + +#endif diff --git a/lib/gpu/gb_gpu_kernel_nbor.cu b/lib/gpu/gb_gpu_kernel_nbor.cu new file mode 100644 index 0000000000..80da8b8d9d --- /dev/null +++ b/lib/gpu/gb_gpu_kernel_nbor.cu @@ -0,0 +1,170 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. 
+------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing authors: Mike Brown (ORNL), brownw@ornl.gov +------------------------------------------------------------------------- */ + +#ifndef PAIR_GPU_KERNEL_H +#define PAIR_GPU_KERNEL_H + +#define MAX_SHARED_TYPES 8 + +#ifdef _DOUBLE_DOUBLE +#define numtyp double +#define numtyp2 double2 +#define numtyp4 double4 +#else +#define numtyp float +#define numtyp2 float2 +#define numtyp4 float4 +#endif + +#ifdef NV_KERNEL + +#include "geryon/ucl_nv_kernel.h" + +#else + +#pragma OPENCL EXTENSION cl_khr_fp64: enable +#define GLOBAL_ID_X get_global_id(0) +#define THREAD_ID_X get_local_id(0) +#define BLOCK_ID_X get_group_id(0) +#define BLOCK_SIZE_X get_local_size(0) +#define __syncthreads() barrier(CLK_LOCAL_MEM_FENCE) + +#endif + +// --------------------------------------------------------------------------- +// Unpack neighbors from dev_ij array into dev_nbor matrix for coalesced access +// -- Only unpack neighbors matching the specified inclusive range of forms +// -- Only unpack neighbors within cutoff +// --------------------------------------------------------------------------- +__kernel void kernel_gb_nbor(__global numtyp4 *x_, __global numtyp2 *cut_form, + const int ntypes, __global int *dev_nbor, + const int nbor_pitch, + const int start, const int inum, + __global int *dev_ij, const int form_low, + const int form_high, const int nall) { + + // ii indexes the two interacting particles in gi + int ii=GLOBAL_ID_X+start; + + if (ii=nall) + j%=nall; + numtyp4 jx=x_[j]; + int jtype=jx.w; + int mtype=itype+jtype; + numtyp2 cf=cut_form[mtype]; + if (cf.y>=form_low && cf.y<=form_high) { + // Compute r12; + numtyp rsq=jx.x-ix.x; + rsq*=rsq; + numtyp t=jx.y-ix.y; + rsq+=t*t; + t=jx.z-ix.z; + rsq+=t*t; + + if (rsq=nall) + j%=nall; + numtyp4 jx=x_[j]; + int jtype=jx.w; + int mtype=itype+jtype; + + if (form[mtype]>=form_low && form[mtype]<=form_high) { + // Compute r12; + numtyp rsq=jx.x-ix.x; + rsq*=rsq; + numtyp t=jx.y-ix.y; + rsq+=t*t; + t=jx.z-ix.z; + rsq+=t*t; + + if (rsq +#define GB_GPU_MemoryT GB_GPU_Memory + +extern PairGPUDevice pair_gpu_device; + +template +GB_GPU_MemoryT::GB_GPU_Memory() : _allocated(false), _compiled(false), + _max_bytes(0.0) { + device=&pair_gpu_device; +} + +template +GB_GPU_MemoryT::~GB_GPU_Memory() { + clear(); +} + +template +int GB_GPU_MemoryT::bytes_per_atom(const int max_nbors) const { + return device->atom.bytes_per_atom()+device->nbor.bytes_per_atom(max_nbors); +} + +template +bool GB_GPU_MemoryT::init(const int ntypes, const double gamma, + const double upsilon, const double mu, + double **host_shape, double **host_well, + double **host_cutsq, double **host_sigma, + double **host_epsilon, double *host_lshape, + int **h_form, double **host_lj1, double **host_lj2, + double **host_lj3, double **host_lj4, + double **host_offset, const double *host_special_lj, + const int nlocal, const int nall, + const int max_nbors, const double cell_size, + const double gpu_split, FILE *_screen) { + nbor_time_avail=false; + screen=_screen; + + bool gpu_nbor=false; + if (device->gpu_mode()==PairGPUDevice::GPU_NEIGH) + gpu_nbor=true; + + int _gpu_host=0; + int host_nlocal=hd_balancer.first_host_count(nlocal,gpu_nbor,gpu_split); + if (host_nlocal>0) + _gpu_host=1; + + if (!device->init(false,true,nlocal,host_nlocal,nall,0,gpu_nbor,_gpu_host, + max_nbors,cell_size,true)) + return false; + ucl_device=device->gpu; + 
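// The neighbor-unpacking kernels above keep a candidate j only when its type
// pair lies in the requested form range and the squared separation is inside
// the pair cutoff; comparing rsq against cutsq avoids a sqrt per candidate.
// Distilled sketch (cutsq_ij stands in for the cutoff looked up for this
// type pair):
numtyp dx = jx.x - ix.x, dy = jx.y - ix.y, dz = jx.z - ix.z;
numtyp rsq = dx*dx + dy*dy + dz*dz;
if (rsq < cutsq_ij) {
  *nbor = j;            // store into the coalesced dev_nbor column
  nbor += nbor_pitch;   // advance to the next slot for this particle
  ++newj;               // running count of kept neighbors
}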
atom=&device->atom; + nbor=&device->nbor; + + _block_size=BLOCK_1D; + if (static_cast(_block_size)>ucl_device->group_size()) + _block_size=ucl_device->group_size(); + compile_kernels(*ucl_device); + + // Initialize host-device load balancer + hd_balancer.init(device,gpu_split); + + // Initialize timers for the selected GPU + time_pair.init(*ucl_device); + time_pair.zero(); + + // If atom type constants fit in shared memory use fast kernel + int lj_types=ntypes; + shared_types=false; + if (lj_types<=MAX_SHARED_TYPES && _block_size>=MAX_SHARED_TYPES) { + lj_types=MAX_SHARED_TYPES; + shared_types=true; + } + _lj_types=lj_types; + + // Allocate a host write buffer for copying type data + UCL_H_Vec host_write(lj_types*lj_types*32,*ucl_device, + UCL_WRITE_OPTIMIZED); + + for (int i=0; iatom->type_pack2(ntypes,lj_types,sigma_epsilon,host_write, + host_sigma,host_epsilon); + + cut_form.alloc(lj_types*lj_types,*ucl_device,UCL_READ_ONLY); + this->atom->type_pack2(ntypes,lj_types,cut_form,host_write, + host_cutsq,h_form); + + lj1.alloc(lj_types*lj_types,*ucl_device,UCL_READ_ONLY); + this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_lj1,host_lj2, + host_cutsq,h_form); + + lj3.alloc(lj_types*lj_types,*ucl_device,UCL_READ_ONLY); + this->atom->type_pack4(ntypes,lj_types,lj3,host_write,host_lj3,host_lj4, + host_offset); + + dev_error.alloc(1,*ucl_device); + dev_error.zero(); + + _allocated=true; + + host_form=h_form; + + // Initialize timers for the selected GPU + time_kernel.init(*ucl_device); + time_gayberne.init(*ucl_device); + time_kernel2.init(*ucl_device); + time_gayberne2.init(*ucl_device); + time_kernel.zero(); + time_gayberne.zero(); + time_kernel2.zero(); + time_gayberne2.zero(); + + // Allocate, cast and asynchronous memcpy of constant data + // Copy data for bonded interactions + gamma_upsilon_mu.alloc(7,*ucl_device,UCL_READ_ONLY); + host_write[0]=static_cast(gamma); + host_write[1]=static_cast(upsilon); + host_write[2]=static_cast(mu); + host_write[3]=static_cast(host_special_lj[0]); + host_write[4]=static_cast(host_special_lj[1]); + host_write[5]=static_cast(host_special_lj[2]); + host_write[6]=static_cast(host_special_lj[3]); + ucl_copy(gamma_upsilon_mu,host_write,7,false); + + lshape.alloc(ntypes,*ucl_device,UCL_READ_ONLY); + UCL_H_Vec d_view; + d_view.view(host_lshape,lshape.numel(),*ucl_device); + ucl_copy(lshape,d_view,false); + + // Copy shape, well, sigma, epsilon, and cutsq onto GPU + // - cast if necessary + shape.alloc(ntypes,*ucl_device,UCL_READ_ONLY); + for (int i=0; i view4; + view4.view((numtyp4*)host_write.begin(),shape.numel(),*ucl_device); + ucl_copy(shape,view4,false); + + well.alloc(ntypes,*ucl_device,UCL_READ_ONLY); + for (int i=0; i0) { + std::cerr << "Cannot use Gayberne with multiple forms and GPU neighbor.\n"; + exit(1); + } + + if (multiple_forms) + atom->dev_ans.zero(); + + _max_bytes=atom->gpu_bytes()+nbor->gpu_bytes(); + + // Memory for ilist ordered by particle type + return (host_olist.alloc(nbor->max_atoms(),*ucl_device)==UCL_SUCCESS); +} + +template +void GB_GPU_MemoryT::clear() { + if (!_allocated) + return; + + UCL_H_Vec err_flag(1,*ucl_device); + ucl_copy(err_flag,dev_error,false); + if (err_flag[0] == 2) + std::cerr << "BAD MATRIX INVERSION IN FORCE COMPUTATION.\n"; + err_flag.clear(); + + _allocated=false; + + // Output any timing information + acc_timers(); + double single[6], times[6]; + + single[0]=atom->transfer_time(); + single[1]=nbor->time_nbor.total_seconds(); + single[2]=time_kernel.total_seconds()+time_kernel2.total_seconds()+ + 
nbor->time_kernel.total_seconds(); + single[3]=time_gayberne.total_seconds()+time_gayberne2.total_seconds(); + if (multiple_forms) + single[4]=time_pair.total_seconds(); + else + single[4]=0; + single[5]=atom->cast_time(); + + MPI_Reduce(single,times,6,MPI_DOUBLE,MPI_SUM,0,MPI_COMM_WORLD); + double avg_split=hd_balancer.all_avg_split(); + + _max_bytes+=dev_error.row_bytes()+lj1.row_bytes()+lj3.row_bytes()+ + sigma_epsilon.row_bytes()+cut_form.row_bytes()+ + shape.row_bytes()+well.row_bytes()+lshape.row_bytes()+ + gamma_upsilon_mu.row_bytes(); + double mpi_max_bytes; + MPI_Reduce(&_max_bytes,&mpi_max_bytes,1,MPI_DOUBLE,MPI_MAX,0,MPI_COMM_WORLD); + double max_mb=mpi_max_bytes/(1024*1024); + + if (device->world_me()==0) + if (screen && times[3]>0.0) { + int world_size=device->world_size(); + + fprintf(screen,"\n\n-------------------------------------"); + fprintf(screen,"--------------------------------\n"); + fprintf(screen," GPU Time Info (average): "); + fprintf(screen,"\n-------------------------------------"); + fprintf(screen,"--------------------------------\n"); + + if (device->procs_per_gpu()==1) { + fprintf(screen,"Data Transfer: %.4f s.\n",times[0]/world_size); + fprintf(screen,"Data Cast/Pack: %.4f s.\n",times[5]/world_size); + fprintf(screen,"Neighbor copy: %.4f s.\n",times[1]/world_size); + if (nbor->gpu_nbor()) + fprintf(screen,"Neighbor build: %.4f s.\n",times[2]/world_size); + else + fprintf(screen,"Neighbor unpack: %.4f s.\n",times[2]/world_size); + fprintf(screen,"Force calc: %.4f s.\n",times[3]/world_size); + fprintf(screen,"LJ calc: %.4f s.\n",times[4]/world_size); + } + fprintf(screen,"Average split: %.4f.\n",avg_split); + fprintf(screen,"Max Mem / Proc: %.2f MB.\n",max_mb); + fprintf(screen,"-------------------------------------"); + fprintf(screen,"--------------------------------\n\n"); + } + _max_bytes=0.0; + + dev_error.clear(); + lj1.clear(); + lj3.clear(); + sigma_epsilon.clear(); + cut_form.clear(); + + shape.clear(); + well.clear(); + lshape.clear(); + gamma_upsilon_mu.clear(); + host_olist.clear(); + + time_kernel.clear(); + time_gayberne.clear(); + time_kernel2.clear(); + time_gayberne2.clear(); + time_pair.clear(); + hd_balancer.clear(); + + if (_compiled) { + k_gb_nbor_fast.clear(); + k_gb_nbor.clear(); + k_gayberne.clear(); + k_sphere_gb.clear(); + k_lj_fast.clear(); + k_lj.clear(); + delete pair_program; + delete gb_program; + delete gb_lj_program; + _compiled=false; + } + + device->clear(); +} + +template +double GB_GPU_MemoryT::host_memory_usage() const { + return device->atom.host_memory_usage()+ + device->nbor.host_memory_usage()+4*sizeof(numtyp)+ + sizeof(GB_GPU_Memory)+ + device->nbor.max_atoms()*sizeof(int); +} + +template +void GB_GPU_MemoryT::compile_kernels(UCL_Device &dev) { + if (_compiled) + return; + + std::string flags="-cl-fast-relaxed-math -cl-mad-enable "+ + std::string(OCL_PRECISION_COMPILE); + + pair_program=new UCL_Program(dev); + pair_program->load_string(gb_gpu_kernel_nbor,flags.c_str()); + k_gb_nbor_fast.set_function(*pair_program,"kernel_gb_nbor_fast"); + k_gb_nbor.set_function(*pair_program,"kernel_gb_nbor"); + + gb_program=new UCL_Program(dev); + gb_program->load_string(gb_gpu_kernel,flags.c_str()); + k_gayberne.set_function(*gb_program,"kernel_gayberne"); + + gb_lj_program=new UCL_Program(dev); + gb_lj_program->load_string(gb_gpu_kernel_lj,flags.c_str()); + k_sphere_gb.set_function(*gb_lj_program,"kernel_sphere_gb"); + k_lj_fast.set_function(*gb_lj_program,"kernel_lj_fast"); + k_lj.set_function(*gb_lj_program,"kernel_lj"); + + 
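  // The three programs above are built from the C-string versions of the
  // kernel sources (see geryon/file_to_cstr.sh) and the individual kernels
  // are then looked up by name.  The flags enable relaxed fast math and
  // multiply-add contraction and select the numeric precision through
  // OCL_PRECISION_COMPILE; with the CUDA driver back end the same
  // load_string() call JIT-compiles the PTX via cuModuleLoadDataEx
  // (geryon/nvd_kernel.h).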
_compiled=true; +} + +template class GB_GPU_Memory; + diff --git a/lib/gpu/gb_gpu_memory.cu b/lib/gpu/gb_gpu_memory.cu deleted file mode 100644 index c313f8be01..0000000000 --- a/lib/gpu/gb_gpu_memory.cu +++ /dev/null @@ -1,156 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - Copyright (2003) Sandia Corporation. Under the terms of Contract - DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains - certain rights in this software. This software is distributed under - the GNU General Public License. - - See the README file in the top-level LAMMPS directory. -------------------------------------------------------------------------- */ - -/* ---------------------------------------------------------------------- - Contributing authors: Mike Brown (SNL), wmbrown@sandia.gov - Peng Wang (Nvidia), penwang@nvidia.com - Paul Crozier (SNL), pscrozi@sandia.gov -------------------------------------------------------------------------- */ - -#include "gb_gpu_memory.h" -#define GB_GPU_MemoryT GB_GPU_Memory - -template -GB_GPU_MemoryT::GB_GPU_Memory() : LJ_GPU_MemoryT() { - this->atom.atom_fields(8); - this->atom.ans_fields(13); - this->nbor.packing(true); -} - -template -GB_GPU_MemoryT::~GB_GPU_Memory() { - clear(); -} - -template -bool GB_GPU_MemoryT::init(const int ij_size, const int ntypes, - const double gamma, const double upsilon, - const double mu, double **host_shape, - double **host_well, double **host_cutsq, - double **host_sigma, double **host_epsilon, - double *host_lshape, int **h_form, double **host_lj1, - double **host_lj2, double **host_lj3, - double **host_lj4, double **host_offset, - double *host_special_lj, const int nlocal, - const int nall, const int max_nbors, - const bool force_d, const int me) { - _max_nbors=max_nbors; - if (this->allocated) - clear(); - - bool p=LJ_GPU_MemoryT::init(ij_size,ntypes,host_cutsq,host_sigma,host_epsilon, - host_lj1, host_lj2, host_lj3, host_lj4, - host_offset, host_special_lj, max_nbors, me, - nlocal, nall); - if (!p) - return false; - - host_form=h_form; - - // Initialize timers for the selected GPU - time_kernel.init(); - time_gayberne.init(); - time_kernel2.init(); - time_gayberne2.init(); - - // Use the write buffer from atom for data initialization - NVC_HostT &host_write=this->atom.host_write; - assert(host_write.numel()>4 && host_write.numel()>ntypes*ntypes*2); - - // Allocate, cast and asynchronous memcpy of constant data - gamma_upsilon_mu.safe_alloc(3); - host_write[0]=static_cast(gamma); - host_write[1]=static_cast(upsilon); - host_write[2]=static_cast(mu); - gamma_upsilon_mu.copy_from_host(host_write.begin()); - - lshape.safe_alloc(ntypes,lshape_get_texture()); - lshape.cast_copy(host_lshape,host_write); - lshape.copy_from_host(host_write.begin()); - - // Copy shape, well, sigma, epsilon, and cutsq onto GPU - shape.safe_alloc(ntypes,3,shape_get_texture()); - shape.cast_copy(host_shape[0],host_write); - well.safe_alloc(ntypes,3,well_get_texture()); - well.cast_copy(host_well[0],host_write); - - // Copy LJ data onto GPU - int lj_types=ntypes; - if (lj_types<=MAX_SHARED_TYPES) - lj_types=MAX_SHARED_TYPES; - form.safe_alloc(lj_types,lj_types,form_get_texture()); - form.copy_2Dfrom_host(host_form[0],ntypes,ntypes); - - // See if we want fast GB-sphere or sphere-sphere calculations - multiple_forms=false; - for (int i=1; imax_local); 
-} - -template -void GB_GPU_MemoryT::resize_atom(const int nall, bool &success) { - this->max_atoms=static_cast(static_cast(nall)*1.10); - this->atom.resize(this->max_atoms, success); -} - -template -void GB_GPU_MemoryT::resize_local(const int nlocal, const int max_nbors, - bool &success) { - if (nlocal>this->max_local) { - this->max_local=static_cast(static_cast(nlocal)*1.10); - host_olist.clear(); - success=success && host_olist.alloc_rw(this->max_local); - } - if (max_nbors>_max_nbors) - _max_nbors=static_cast(static_cast(max_nbors)*1.10); - this->nbor.resize(this->max_local,_max_nbors,success); -} - -template -void GB_GPU_MemoryT::clear() { - if (!this->allocated) - return; - - int err_flag; - this->dev_error.copy_to_host(&err_flag); - if (err_flag == 1) - std::cerr << "COLLISION BUFFER OVERFLOW OCCURED. INCREASE COLLISION_N " - << "and RECOMPILE.\n"; - else if (err_flag == 2) - std::cerr << "BAD MATRIX INVERSION IN FORCE COMPUTATION.\n"; - - LJ_GPU_MemoryT::clear(); - - lshape.unbind(); - - shape.clear(); - well.clear(); - form.clear(); - lshape.clear(); - gamma_upsilon_mu.clear(); - host_olist.clear(); -} - -template -double GB_GPU_MemoryT::host_memory_usage() { - return this->atom.host_memory_usage(this->max_atoms)+ - this->nbor.host_memory_usage()+4*sizeof(numtyp)+ - sizeof(GB_GPU_Memory)+this->max_atoms*sizeof(int); -} - -template class GB_GPU_Memory; diff --git a/lib/gpu/gb_gpu_memory.h b/lib/gpu/gb_gpu_memory.h index 63627065bf..2cfc805cd8 100644 --- a/lib/gpu/gb_gpu_memory.h +++ b/lib/gpu/gb_gpu_memory.h @@ -12,61 +12,183 @@ ------------------------------------------------------------------------- */ /* ---------------------------------------------------------------------- - Contributing authors: Mike Brown (SNL), wmbrown@sandia.gov - Peng Wang (Nvidia), penwang@nvidia.com - Paul Crozier (SNL), pscrozi@sandia.gov + Contributing authors: Mike Brown (ORNL), brownw@ornl.gov ------------------------------------------------------------------------- */ #ifndef GB_GPU_MEMORY_H #define GB_GPU_MEMORY_H -#define MAX_GPU_THREADS 4 -#include "lj_gpu_memory.h" +#define BLOCK_1D 64 -enum{SPHERE_SPHERE,SPHERE_ELLIPSE,ELLIPSE_SPHERE,ELLIPSE_ELLIPSE}; +#include "pair_gpu_device.h" +#include "pair_gpu_balance.h" +#include "mpi.h" template -class GB_GPU_Memory : public LJ_GPU_Memory { +class GB_GPU_Memory { public: GB_GPU_Memory(); ~GB_GPU_Memory(); - - bool init(const int ij_size, const int ntypes, const double gamma, + + /// Clear any previous data and set up for a new LAMMPS run + /** \param gpu_nbor true if neighboring performed on device + * \param max_nbors initial number of rows in the neighbor matrix + * \param cell_size cutoff + skin + * \param gpu_split fraction of particles handled by device + * \return false if there is not sufficient memory or device init prob **/ + bool init(const int ntypes, const double gamma, const double upsilon, const double mu, double **host_shape, double **host_well, double **host_cutsq, double **host_sigma, double **host_epsilon, double *host_lshape, int **h_form, double **host_lj1, double **host_lj2, double **host_lj3, - double **host_lj4, double **host_offset, double *host_special_lj, - const int max_nbors, const int nlocal, const int nall, - const bool force_d, const int me); + double **host_lj4, double **host_offset, + const double *host_special_lj, const int nlocal, const int nall, + const int max_nbors, const double cell_size, + const double gpu_split, FILE *screen); - void resize_atom(const int nall, bool &success); - void resize_local(const int 
nlocal, const int max_nbors, bool &success); + /// Check if there is enough storage for atom arrays and realloc if not + /** \param success set to false if insufficient memory **/ + inline void resize_atom(const int inum, const int nall, bool &success) { + atom->resize(inum, nall, success); + if (multiple_forms) atom->dev_ans.zero(); + double bytes=atom->gpu_bytes()+nbor->gpu_bytes(); + if (bytes>_max_bytes) + _max_bytes=bytes; + } + + /// Check if there is enough storage for neighbors and realloc if not + /** \param nlocal number of particles whose nbors must be stored on device + * \param host_inum number of particles whose nbors need to copied to host + * \param current maximum number of neighbors + * \param olist_size size of list of particles from CPU neighboring + * \note host_inum is 0 if the host is performing neighboring + * \note if GPU is neighboring nlocal+host_inum=total number local particles + * \note if CPU is neighboring olist_size=total number of local particles + * \note if GPU is neighboring olist_size=0 **/ + inline void resize_local(const int nlocal, const int host_inum, + const int max_nbors, const int olist_size, + bool &success) { + if (olist_size>static_cast(host_olist.numel())) { + host_olist.clear(); + int new_size=static_cast(static_cast(olist_size)*1.10); + success=success && (host_olist.alloc(new_size,*ucl_device)==UCL_SUCCESS); + } + nbor->resize(nlocal,host_inum,max_nbors,success); + double bytes=atom->gpu_bytes()+nbor->gpu_bytes(); + if (bytes>_max_bytes) + _max_bytes=bytes; + } + + /// Clear all host and device data + /** \note This is called at the beginning of the init() routine **/ void clear(); - double host_memory_usage(); + /// Returns memory usage on device per atom + int bytes_per_atom(const int max_nbors) const; + + /// Total host memory used by library for pair style + double host_memory_usage() const; + + /// Accumulate timers + inline void acc_timers() { + if (nbor_time_avail) { + nbor->time_nbor.add_to_total(); + nbor->time_kernel.add_to_total(); + nbor_time_avail=false; + } + time_kernel.add_to_total(); + time_gayberne.add_to_total(); + if (multiple_forms) { + time_kernel2.add_to_total(); + time_gayberne2.add_to_total(); + time_pair.add_to_total(); + } + atom->acc_timers(); + } - // ---------------------------- DATA ---------------------------- + /// Accumulate timers + inline void zero_timers() { + nbor_time_avail=false; + time_kernel.zero(); + time_gayberne.zero(); + if (multiple_forms) { + time_kernel2.zero(); + time_gayberne2.zero(); + time_pair.zero(); + } + atom->zero_timers(); + } - // ilist with particles sorted by type - NVC_HostI host_olist; - - // --------------- Const Data for Atoms - NVC_ConstMatT shape, well; - NVC_ConstMatI form; - NVC_VecT lshape, gamma_upsilon_mu; + // -------------------------- DEVICE DATA ------------------------- + /// Device Properties and Atom and Neighbor storage + PairGPUDevice *device; + /// Geryon device + UCL_Device *ucl_device; + + /// Device Error Flag - Set if a bad matrix inversion occurs + UCL_D_Vec dev_error; + /// Device timers + UCL_Timer time_kernel, time_gayberne, time_kernel2, time_gayberne2, time_pair; + /// Host device load balancer + PairGPUBalance hd_balancer; + /// LAMMPS pointer for screen output + FILE *screen; + + // --------------------------- TYPE DATA -------------------------- - // --------------- Timing Stuff - NVCTimer time_kernel, time_gayberne, time_kernel2, time_gayberne2; + /// lj1.x = lj1, lj1.y = lj2, lj1.z = cutsq, lj1.w = form + UCL_D_Vec lj1; + /// lj3.x = lj3, 
lj3.y = lj4, lj3.z = offset + UCL_D_Vec lj3; + /// sigma_epsilon.x = sigma, sigma_epsilon.y = epsilon + UCL_D_Vec sigma_epsilon; + /// cut_form.x = cutsq, cut_form.y = form + UCL_D_Vec cut_form; + // 0 - gamma, 1-upsilon, 2-mu, 3-special_lj[0], 4-special_lj[1], ... + UCL_D_Vec gamma_upsilon_mu; // True if we want to use fast GB-sphere or sphere-sphere calculations bool multiple_forms; int **host_form; - int last_ellipse; - int _max_nbors; + + /// If atom type constants fit in shared memory, use fast kernels + bool shared_types; + int _lj_types; + // --------------------------- ATOM DATA -------------------------- + + /// Atom Data + PairGPUAtom *atom; + + /// Aspherical Const Data for Atoms + UCL_D_Vec shape, well; + /// Aspherical Const Data for Atoms + UCL_D_Vec lshape; + + int last_ellipse, max_last_ellipse; + + // --------------------------- NBOR DATA ---------------------------- + + /// Neighbor data + PairGPUNbor *nbor; + /// ilist with particles sorted by type + UCL_H_Vec host_olist; + /// True if we should accumulate the neighbor timer + bool nbor_time_avail; + + // ------------------------- DEVICE KERNELS ------------------------- + UCL_Program *pair_program, *gb_program, *gb_lj_program; + UCL_Kernel k_gb_nbor_fast, k_gb_nbor; + UCL_Kernel k_gayberne, k_sphere_gb, k_lj_fast, k_lj; + inline int block_size() { return _block_size; } + private: + bool _allocated, _compiled; + int _block_size; + double _max_bytes; + + void compile_kernels(UCL_Device &dev); }; #endif diff --git a/lib/gpu/geryon/README b/lib/gpu/geryon/README new file mode 100644 index 0000000000..601c19dc3c --- /dev/null +++ b/lib/gpu/geryon/README @@ -0,0 +1,27 @@ +Geryon + + Copyright (2010) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the Simplified BSD License. + +Geryon is intended to be a simple library for managing the CUDA Runtime, +CUDA Driver, and OpenCL APIs with a consistent interface: + + * Change from one API to another by simply changing the namespace + * Use multiple APIs in the same code + * Lightweight (only include files - no build required) + * Manage device query and selection + * Simple vector and matrix containers + * Simple routines for data copy and type casting + * Simple routines for data I/O + * Simple classes for managing device timing + * Simple classes for managing kernel compilation and execution + +Geryon does not require building (although a Makefile is provided for testing +purposes). The library is a set of header files that can be included with +your code. + +Documentation and examples are provided at + +http://users.nccs.gov/~wb8/geryon/index.htm diff --git a/lib/gpu/geryon/VERSION.txt b/lib/gpu/geryon/VERSION.txt new file mode 100644 index 0000000000..f01f03ca0a --- /dev/null +++ b/lib/gpu/geryon/VERSION.txt @@ -0,0 +1 @@ +Geryon Version 10.280 diff --git a/lib/gpu/geryon/file_to_cstr.sh b/lib/gpu/geryon/file_to_cstr.sh new file mode 100755 index 0000000000..e8264dfad8 --- /dev/null +++ b/lib/gpu/geryon/file_to_cstr.sh @@ -0,0 +1,47 @@ +#!/bin/sh + +# convert ptx assembly output into +# a c-style string constant written +# in portable posix shell script. +# requires: sed, rm, mv +# +# Author: Axel Kohlmeyer, Temple University + +num_args=$# + +# we write to a scratch file, since +# we know the real file name only at +# the very end. +output=geryon.tmp.$$ +: > $output + +# remove temporary file in case we're interrupted. 
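# Each kernel source named on the command line becomes one string constant
# in the output, roughly:
#   const char * gb_gpu_kernel_nbor =
#   "...first source line...\n"
#   "...second source line...\n"
#   ;
# which the host code hands directly to UCL_Program::load_string().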
+cleanup () { + rm -f geryon.tmp.$$ +} +trap cleanup INT QUIT TERM + +# loop over arguments and convert to +# string constants. +i=1 +while [ $i -lt $num_args ] +do \ + src=$1 + krn=${src##*/} + krn=${krn%.*} + echo "Converting kernel $krn from $src to a c-style string" + echo "const char * $krn = " >> $output + sed -e 's/\\/\\\\/g' \ + -e 's/"/\\"/g' \ + -e 's/ *\/\/.*$//' \ + -e '/\.file/D' \ + -e '/^[ ]*$/D' \ + -e 's/^\(.*\)$/"\1\\n"/' $src >> $output + echo ';' >> $output + shift + i=`expr $i + 1` +done + +# $1 holds now the real output file name +mv $output $1 + diff --git a/lib/gpu/geryon/nvc_device.h b/lib/gpu/geryon/nvc_device.h new file mode 100644 index 0000000000..2187385077 --- /dev/null +++ b/lib/gpu/geryon/nvc_device.h @@ -0,0 +1,311 @@ +/*************************************************************************** + nvc_device.h + ------------------- + W. Michael Brown + + Utilities for dealing with cuda devices + + __________________________________________________________________________ + This file is part of the Geryon Unified Coprocessor Library (UCL) + __________________________________________________________________________ + + begin : Wed Jan 28 2009 + copyright : (C) 2009 by W. Michael Brown + email : brownw@ornl.gov + ***************************************************************************/ + +/* ----------------------------------------------------------------------- + Copyright (2009) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the Simplified BSD License. + ----------------------------------------------------------------------- */ + +#ifndef NVC_DEVICE +#define NVC_DEVICE + +#include +#include +#include +#include +#include "nvc_macros.h" +#include "ucl_types.h" + +namespace ucl_cudart { + +// -------------------------------------------------------------------------- +// - COMMAND QUEUE STUFF +// -------------------------------------------------------------------------- +typedef cudaStream_t command_queue; + +inline void ucl_sync(cudaStream_t &stream) { + CUDA_SAFE_CALL(cudaStreamSynchronize(stream)); +} + +/// Class for looking at device properties +/** \note Calls to change the device outside of the class results in incorrect + * behavior + * \note There is no error checking for indexing past the number of devices **/ +class UCL_Device { + public: + /// Collect properties for every GPU on the node + /** \note You must set the active GPU with set() before using the device **/ + UCL_Device(); + + ~UCL_Device(); + + /// Returns 1 (For compatibility with OpenCL) + inline int num_platforms() { return 1; } + + /// Return a string with name and info of the current platform + std::string platform_name() { return "NVIDIA Corporation NVIDIA CUDA"; } + + /// Return the number of devices that support CUDA + inline int num_devices() { return _properties.size(); } + + /// Set the CUDA device to the specified device number + void set(int num); + + /// Get the current device number + inline int device_num() { return _device; } + + /// Returns the default stream for the current device + inline command_queue & cq() { return cq(0); } + + /// Returns the stream indexed by i + inline command_queue & cq(const int i) { return _cq[i]; } + + /// Block until all commands in the default stream have completed + inline void sync() { sync(0); } + + /// Block until all commands in the specified stream have completed + inline void sync(const int 
i) { ucl_sync(cq(i)); } + + /// Get the number of command queues currently available on device + inline int num_queues() + { if (_device==-1) return 0; else return _cq.size(); } + + /// Add a stream for device computations + inline void push_command_queue() { + _cq.push_back(cudaStream_t()); + CUDA_SAFE_CALL_NS(cudaStreamCreate(&_cq.back())); + } + + /// Remove a stream for device computations + /** \note You cannot delete the default stream **/ + inline void pop_command_queue() { + if (_cq.size()<2) return; + CUDA_SAFE_CALL_NS(cudaStreamDestroy(_cq.back())); + _cq.pop_back(); + } + + /// Get the current CUDA device name + inline std::string name() { return name(_device); } + /// Get the CUDA device name + inline std::string name(const int i) + { return std::string(_properties[i].name); } + + /// Get a string telling the type of the current device + inline std::string device_type_name() { return device_type_name(_device); } + /// Get a string telling the type of the device + inline std::string device_type_name(const int i) { return "GPU"; } + + /// Get current device type (UCL_CPU, UCL_GPU, UCL_ACCELERATOR, UCL_DEFAULT) + inline int device_type() { return device_type(_device); } + /// Get device type (UCL_CPU, UCL_GPU, UCL_ACCELERATOR, UCL_DEFAULT) + inline int device_type(const int i) { return UCL_GPU; } + + /// Returns true if double precision is support for the current device + bool double_precision() { return double_precision(_device); } + /// Returns true if double precision is support for the device + bool double_precision(const int i) {return arch(i)>=1.3;} + + /// Get the number of cores in the current device + inline unsigned cores() { return cores(_device); } + /// Get the number of cores + inline unsigned cores(const int i) + { if (arch(i)<2.0) return _properties[i].multiProcessorCount*8; + else return _properties[i].multiProcessorCount*32; } + + /// Get the gigabytes of global memory in the current device + inline double gigabytes() { return gigabytes(_device); } + /// Get the gigabytes of global memory + inline double gigabytes(const int i) + { return static_cast(_properties[i].totalGlobalMem)/1073741824; } + + /// Get the bytes of global memory in the current device + inline size_t bytes() { return bytes(_device); } + /// Get the bytes of global memory + inline size_t bytes(const int i) { return _properties[i].totalGlobalMem; } + + /// Return the GPGPU compute capability for current device + inline double arch() { return arch(_device); } + /// Return the GPGPU compute capability + inline double arch(const int i) + { return static_cast(_properties[i].minor)/10+_properties[i].major;} + + /// Clock rate in GHz for current device + inline double clock_rate() { return clock_rate(_device); } + /// Clock rate in GHz + inline double clock_rate(const int i) { return _properties[i].clockRate*1e-6;} + + /// Get the maximum number of threads per block + inline size_t group_size() { return group_size(_device); } + /// Get the maximum number of threads per block + inline size_t group_size(const int i) + { return _properties[i].maxThreadsPerBlock; } + + /// Return the maximum memory pitch in bytes for current device + inline size_t max_pitch() { return max_pitch(_device); } + /// Return the maximum memory pitch in bytes + inline size_t max_pitch(const int i) { return _properties[i].memPitch; } + + /// List all devices along with all properties + void print_all(std::ostream &out); + + private: + int _device, _num_devices; + std::vector _properties; + std::vector _cq; +}; + +// Grabs the 
properties for all devices +inline UCL_Device::UCL_Device() { + CUDA_SAFE_CALL_NS(cudaGetDeviceCount(&_num_devices)); + for (int dev=0; dev<_num_devices; ++dev) { + cudaDeviceProp deviceProp; + CUDA_SAFE_CALL_NS(cudaGetDeviceProperties(&deviceProp, dev)); + if (deviceProp.major == 9999 && deviceProp.minor == 9999) + break; + _properties.push_back(deviceProp); + } + _device=-1; + _cq.push_back(cudaStream_t()); + _cq.back()=0; +} + +inline UCL_Device::~UCL_Device() { + for (int i=1; i= 2020 + int driver_version, runtime_version; + cudaDriverGetVersion(&driver_version); + out << "CUDA Driver Version: " + << driver_version/1000 << "." << driver_version%100 + << std::endl; + cudaRuntimeGetVersion(&runtime_version); + out << "CUDA Runtime Version: " + << runtime_version/1000 << "." << runtime_version%100 + << std::endl; + #endif + + if (num_devices() == 0) + out << "There is no device supporting CUDA\n"; + for (int i=0; i= 2000 + out << " Number of compute units/multiprocessors: " + << _properties[i].multiProcessorCount << std::endl; + out << " Number of cores: " + << cores(i) << std::endl; + #endif + out << " Total amount of constant memory: " + << _properties[i].totalConstMem << " bytes\n"; + out << " Total amount of local/shared memory per block: " + << _properties[i].sharedMemPerBlock << " bytes\n"; + out << " Total number of registers available per block: " + << _properties[i].regsPerBlock << std::endl; + out << " Warp size: " + << _properties[i].warpSize << std::endl; + out << " Maximum number of threads per block: " + << _properties[i].maxThreadsPerBlock << std::endl; + out << " Maximum group size (# of threads per block) " + << _properties[i].maxThreadsDim[0] << " x " + << _properties[i].maxThreadsDim[1] << " x " + << _properties[i].maxThreadsDim[2] << std::endl; + out << " Maximum item sizes (# threads for each dim) " + << _properties[i].maxGridSize[0] << " x " + << _properties[i].maxGridSize[1] << " x " + << _properties[i].maxGridSize[2] << std::endl; + out << " Maximum memory pitch: " + << max_pitch(i) << " bytes\n"; + out << " Texture alignment: " + << _properties[i].textureAlignment << " bytes\n"; + out << " Clock rate: " + << clock_rate(i) << " GHz\n"; + #if CUDART_VERSION >= 2000 + out << " Concurrent copy and execution: "; + if (_properties[i].deviceOverlap) + out << "Yes\n"; + else + out << "No\n"; + #endif + #if CUDART_VERSION >= 2020 + out << " Run time limit on kernels: "; + if (_properties[i].kernelExecTimeoutEnabled) + out << "Yes\n"; + else + out << "No\n"; + out << " Integrated: "; + if (_properties[i].integrated) + out << "Yes\n"; + else + out << "No\n"; + out << " Support host page-locked memory mapping: "; + if (_properties[i].canMapHostMemory) + out << "Yes\n"; + else + out << "No\n"; + out << " Compute mode: "; + if (_properties[i].computeMode == cudaComputeModeDefault) + out << "Default\n"; // multiple threads can use device + else if (_properties[i].computeMode == cudaComputeModeExclusive) + out << "Exclusive\n"; // only thread can use device + else if (_properties[i].computeMode == cudaComputeModeProhibited) + out << "Prohibited\n"; // no thread can use device + else + out << "Unknown\n"; + #endif + #if CUDART_VERSION >= 3000 + out << " Concurrent kernel execution: "; + if (_properties[i].concurrentKernels) + out << "Yes\n"; + else + out << "No\n"; + out << " Device has ECC support enabled: "; + if (_properties[i].ECCEnabled) + out << "Yes\n"; + else + out << "No\n"; + #endif + } +} + +} + +#endif + diff --git a/lib/gpu/geryon/nvc_macros.h 
b/lib/gpu/geryon/nvc_macros.h new file mode 100644 index 0000000000..3fb488072c --- /dev/null +++ b/lib/gpu/geryon/nvc_macros.h @@ -0,0 +1,57 @@ +#ifndef NVC_MACROS_H +#define NVC_MACROS_H + +#if defined(__APPLE__) +#if _GLIBCXX_ATOMIC_BUILTINS == 1 +#undef _GLIBCXX_ATOMIC_BUILTINS +#endif // _GLIBCXX_ATOMIC_BUILTINS +#endif // __APPLE__ + +#include +#include +#include + +#ifdef MPI_GERYON +#include "mpi.h" +#define NVC_GERYON_EXIT MPI_Abort(MPI_COMM_WORLD,-1) +#else +#define NVC_GERYON_EXIT assert(0==1) +#endif + +#ifndef UCL_NO_API_CHECK + +#define CUDA_SAFE_CALL_NS( call) do { \ + cudaError err = call; \ + if( cudaSuccess != err) { \ + fprintf(stderr, "Cuda error in call at file '%s' in line %i : %s.\n", \ + __FILE__, __LINE__, cudaGetErrorString( err) ); \ + NVC_GERYON_EXIT; \ + } } while (0) + +#ifdef UCL_SYNC_DEBUG + +#define CUDA_SAFE_CALL( call) do { \ + CUDA_SAFE_CALL_NS( call); \ + cudaError err=cudaThreadSynchronize(); \ + if( cudaSuccess != err) { \ + fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n", \ + __FILE__, __LINE__, cudaGetErrorString( err) ); \ + NVC_GERYON_EXIT; \ + } } while (0) + +#else + +#define CUDA_SAFE_CALL( call) CUDA_SAFE_CALL_NS( call) + +#endif + +#else // not DEBUG + +// void macros for performance reasons +#define CUDA_SAFE_CALL( call) call +#define CUDA_SAFE_CALL_NS( call) call + +#endif + +#endif + diff --git a/lib/gpu/geryon/nvc_texture.h b/lib/gpu/geryon/nvc_texture.h new file mode 100644 index 0000000000..939f385340 --- /dev/null +++ b/lib/gpu/geryon/nvc_texture.h @@ -0,0 +1,69 @@ +/*************************************************************************** + nvc_texture.h + ------------------- + W. Michael Brown + + Utilities for dealing with CUDA Runtime textures + + __________________________________________________________________________ + This file is part of the Geryon Unified Coprocessor Library (UCL) + __________________________________________________________________________ + + begin : Fri Jul 2 2010 + copyright : (C) 2010 by W. Michael Brown + email : brownw@ornl.gov + ***************************************************************************/ + +/* ----------------------------------------------------------------------- + Copyright (2010) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the Simplified BSD License. 
+ ----------------------------------------------------------------------- */ + +#ifndef NVC_TEXTURE +#define NVC_TEXTURE + +#include "nvc_mat.h" + +namespace ucl_cudart { + +/// Class storing a texture reference +class UCL_Texture { + public: + UCL_Texture() {} + ~UCL_Texture() {} + /// Construct with a specified texture reference + inline UCL_Texture(textureReference *t) { get_texture(t); } + /// Set the texture reference for this object + inline void get_texture(textureReference *t) { _tex_ptr=t; } + + /// Bind a float array where each fetch grabs a vector of length numel + template + inline void bind_float(mat_typ &vec, const unsigned numel) { + #ifdef UCL_DEBUG + assert(numel!=0 && numel<5); + #endif + int bits[4]={0,0,0,0}; + for (int i=0; i +#include +#include +#include "nvd_macros.h" +#include "ucl_types.h" + +namespace ucl_cudadr { + +// -------------------------------------------------------------------------- +// - COMMAND QUEUE STUFF +// -------------------------------------------------------------------------- +typedef CUstream command_queue; + +inline void ucl_sync(CUstream &stream) { + CU_SAFE_CALL(cuStreamSynchronize(stream)); +} + +struct NVDProperties { + std::string name; + int major; + int minor; + CUDA_INT_TYPE totalGlobalMem; + int multiProcessorCount; + CUdevprop_st p; + int kernelExecTimeoutEnabled; + int integrated; + int canMapHostMemory; + int concurrentKernels; + int ECCEnabled; +}; + +/// Class for looking at device properties +/** \note Calls to change the device outside of the class results in incorrect + * behavior + * \note There is no error checking for indexing past the number of devices **/ +class UCL_Device { + public: + /// Collect properties for every GPU on the node + /** \note You must set the active GPU with set() before using the device **/ + UCL_Device(); + + ~UCL_Device(); + + /// Returns 1 (For compatibility with OpenCL) + inline int num_platforms() { return 1; } + + /// Return a string with name and info of the current platform + std::string platform_name() { return "NVIDIA Corporation NVIDIA CUDA Driver"; } + + /// Return the number of devices that support CUDA + inline int num_devices() { return _properties.size(); } + + /// Set the CUDA device to the specified device number + /** A context and default command queue will be created for the device **/ + void set(int num); + + /// Get the current device number + inline int device_num() { return _device; } + + /// Returns the default stream for the current device + inline command_queue & cq() { return cq(0); } + + /// Returns the stream indexed by i + inline command_queue & cq(const int i) { return _cq[i]; } + + /// Block until all commands in the default stream have completed + inline void sync() { sync(0); } + + /// Block until all commands in the specified stream have completed + inline void sync(const int i) { ucl_sync(cq(i)); } + + /// Get the number of command queues currently available on device + inline int num_queues() + { return _cq.size(); } + + /// Add a stream for device computations + inline void push_command_queue() { + _cq.push_back(CUstream()); + CU_SAFE_CALL(cuStreamCreate(&_cq.back(),0)); + } + + /// Remove a stream for device computations + /** \note You cannot delete the default stream **/ + inline void pop_command_queue() { + if (_cq.size()<2) return; + CU_SAFE_CALL_NS(cuStreamDestroy(_cq.back())); + _cq.pop_back(); + } + + /// Get the current CUDA device name + inline std::string name() { return name(_device); } + /// Get the CUDA device name + inline std::string 
name(const int i) + { return std::string(_properties[i].name); } + + /// Get a string telling the type of the current device + inline std::string device_type_name() { return device_type_name(_device); } + /// Get a string telling the type of the device + inline std::string device_type_name(const int i) { return "GPU"; } + + /// Get current device type (UCL_CPU, UCL_GPU, UCL_ACCELERATOR, UCL_DEFAULT) + inline int device_type() { return device_type(_device); } + /// Get device type (UCL_CPU, UCL_GPU, UCL_ACCELERATOR, UCL_DEFAULT) + inline int device_type(const int i) { return UCL_GPU; } + + /// Returns true if double precision is support for the current device + bool double_precision() { return double_precision(_device); } + /// Returns true if double precision is support for the device + bool double_precision(const int i) {return arch(i)>=1.3;} + + /// Get the number of cores in the current device + inline unsigned cores() { return cores(_device); } + /// Get the number of cores + inline unsigned cores(const int i) + { if (arch(i)<2.0) return _properties[i].multiProcessorCount*8; + else return _properties[i].multiProcessorCount*32; } + + /// Get the gigabytes of global memory in the current device + inline double gigabytes() { return gigabytes(_device); } + /// Get the gigabytes of global memory + inline double gigabytes(const int i) + { return static_cast(_properties[i].totalGlobalMem)/1073741824; } + + /// Get the bytes of global memory in the current device + inline size_t bytes() { return bytes(_device); } + /// Get the bytes of global memory + inline size_t bytes(const int i) { return _properties[i].totalGlobalMem; } + + // Get the gigabytes of free memory in the current device + inline double free_gigabytes() { return free_gigabytes(_device); } + // Get the gigabytes of free memory + inline double free_gigabytes(const int i) + { return static_cast(free_bytes(i))/1073741824; } + + // Get the bytes of free memory in the current device + inline size_t free_bytes() { return free_bytes(_device); } + // Get the bytes of free memory + inline size_t free_bytes(const int i) { + CUDA_INT_TYPE dfree, dtotal; + CU_SAFE_CALL_NS(cuMemGetInfo(&dfree, &dtotal)); + return static_cast(dfree); + } + + /// Return the GPGPU compute capability for current device + inline double arch() { return arch(_device); } + /// Return the GPGPU compute capability + inline double arch(const int i) + { return static_cast(_properties[i].minor)/10+_properties[i].major;} + + /// Clock rate in GHz for current device + inline double clock_rate() { return clock_rate(_device); } + /// Clock rate in GHz + inline double clock_rate(const int i) + { return _properties[i].p.clockRate*1e-6;} + + /// Get the maximum number of threads per block + inline size_t group_size() { return group_size(_device); } + /// Get the maximum number of threads per block + inline size_t group_size(const int i) + { return _properties[i].p.maxThreadsPerBlock; } + + /// Return the maximum memory pitch in bytes for current device + inline size_t max_pitch() { return max_pitch(_device); } + /// Return the maximum memory pitch in bytes + inline size_t max_pitch(const int i) { return _properties[i].p.memPitch; } + + /// List all devices along with all properties + void print_all(std::ostream &out); + + private: + int _device, _num_devices; + std::vector _properties; + std::vector _cq; + CUdevice _cu_device; + CUcontext _context; +}; + +// Grabs the properties for all devices +inline UCL_Device::UCL_Device() { + CU_SAFE_CALL_NS(cuInit(0)); + 
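  // cuInit must precede every other driver API call.  The constructor only
  // gathers per-device properties; the CUDA context for a device is not
  // created until set() is called.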
CU_SAFE_CALL_NS(cuDeviceGetCount(&_num_devices)); + for (int dev=0; dev<_num_devices; ++dev) { + CUdevice m; + CU_SAFE_CALL_NS(cuDeviceGet(&m,dev)); + _properties.push_back(NVDProperties()); + + char namecstr[1024]; + CU_SAFE_CALL_NS(cuDeviceGetName(namecstr,1024,m)); + _properties.back().name=namecstr; + + CU_SAFE_CALL_NS(cuDeviceComputeCapability(&_properties.back().major, + &_properties.back().minor,m)); + + CU_SAFE_CALL_NS(cuDeviceTotalMem(&_properties.back().totalGlobalMem,m)); + CU_SAFE_CALL_NS(cuDeviceGetAttribute(&_properties.back().multiProcessorCount, + CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, + m)); + CU_SAFE_CALL_NS(cuDeviceGetProperties(&_properties.back().p,m)); + #if CUDA_VERSION >= 2020 + CU_SAFE_CALL_NS(cuDeviceGetAttribute( + &_properties.back().kernelExecTimeoutEnabled, + CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT,dev)); + CU_SAFE_CALL_NS(cuDeviceGetAttribute( + &_properties.back().integrated, + CU_DEVICE_ATTRIBUTE_INTEGRATED, dev)); + CU_SAFE_CALL_NS(cuDeviceGetAttribute( + &_properties.back().canMapHostMemory, + CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev)); + #endif + #if CUDA_VERSION >= 3000 + CU_SAFE_CALL_NS(cuDeviceGetAttribute( + &_properties.back().concurrentKernels, + CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, dev)); + CU_SAFE_CALL_NS(cuDeviceGetAttribute( + &_properties.back().ECCEnabled, + CU_DEVICE_ATTRIBUTE_ECC_ENABLED, dev)); + #endif + } + _device=-1; + _cq.push_back(CUstream()); + _cq.back()=0; +} + +inline UCL_Device::~UCL_Device() { + if (_device>-1) { + for (int i=1; i-1) { + CU_SAFE_CALL_NS(cuCtxDestroy(_context)); + for (int i=1; i= 2020 + int driver_version; + cuDriverGetVersion(&driver_version); + out << "CUDA Driver Version: " + << driver_version/1000 << "." << driver_version%100 + << std::endl; + #endif + + if (num_devices() == 0) + out << "There is no device supporting CUDA\n"; + for (int i=0; i= 2000 + out << " Number of compute units/multiprocessors: " + << _properties[i].multiProcessorCount << std::endl; + out << " Number of cores: " + << cores(i) << std::endl; + #endif + out << " Total amount of constant memory: " + << _properties[i].p.totalConstantMemory << " bytes\n"; + out << " Total amount of local/shared memory per block: " + << _properties[i].p.sharedMemPerBlock << " bytes\n"; + out << " Total number of registers available per block: " + << _properties[i].p.regsPerBlock << std::endl; + out << " Warp size: " + << _properties[i].p.SIMDWidth << std::endl; + out << " Maximum number of threads per block: " + << _properties[i].p.maxThreadsPerBlock << std::endl; + out << " Maximum group size (# of threads per block) " + << _properties[i].p.maxThreadsDim[0] << " x " + << _properties[i].p.maxThreadsDim[1] << " x " + << _properties[i].p.maxThreadsDim[2] << std::endl; + out << " Maximum item sizes (# threads for each dim) " + << _properties[i].p.maxGridSize[0] << " x " + << _properties[i].p.maxGridSize[1] << " x " + << _properties[i].p.maxGridSize[2] << std::endl; + out << " Maximum memory pitch: " + << max_pitch(i) << " bytes\n"; + out << " Texture alignment: " + << _properties[i].p.textureAlign << " bytes\n"; + out << " Clock rate: " + << clock_rate(i) << " GHz\n"; + #if CUDA_VERSION >= 2020 + out << " Run time limit on kernels: "; + if (_properties[i].kernelExecTimeoutEnabled) + out << "Yes\n"; + else + out << "No\n"; + out << " Integrated: "; + if (_properties[i].integrated) + out << "Yes\n"; + else + out << "No\n"; + out << " Support host page-locked memory mapping: "; + if (_properties[i].canMapHostMemory) + out << "Yes\n"; + else + out << 
"No\n"; + #endif + #if CUDA_VERSION >= 3000 + out << " Concurrent kernel execution: "; + if (_properties[i].concurrentKernels) + out << "Yes\n"; + else + out << "No\n"; + out << " Device has ECC support enabled: "; + if (_properties[i].ECCEnabled) + out << "Yes\n"; + else + out << "No\n"; + #endif + } +} + +} + +#endif diff --git a/lib/gpu/geryon/nvd_kernel.h b/lib/gpu/geryon/nvd_kernel.h new file mode 100644 index 0000000000..1f53cfaadb --- /dev/null +++ b/lib/gpu/geryon/nvd_kernel.h @@ -0,0 +1,259 @@ +/*************************************************************************** + nvd_kernel.h + ------------------- + W. Michael Brown + + Utilities for dealing with CUDA Driver kernels + + __________________________________________________________________________ + This file is part of the Geryon Unified Coprocessor Library (UCL) + __________________________________________________________________________ + + begin : Tue Feb 9 2010 + copyright : (C) 2010 by W. Michael Brown + email : brownw@ornl.gov + ***************************************************************************/ + +/* ----------------------------------------------------------------------- + Copyright (2010) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the Simplified BSD License. + ----------------------------------------------------------------------- */ + +#ifndef NVD_KERNEL +#define NVD_KERNEL + +#include "nvd_device.h" +#include + +namespace ucl_cudadr { + +class UCL_Texture; + +/// Class storing 1 or more kernel functions from a single string or file +class UCL_Program { + public: + inline UCL_Program(UCL_Device &device) {} + inline ~UCL_Program() {} + + /// Initialize the program with a device + inline void init(UCL_Device &device) { } + + /// Clear any data associated with program + /** \note Must call init() after each clear **/ + inline void clear() { } + + /// Load a program from a file and compile with flags + inline int load(const char *filename, const char *flags="", + std::string *log=NULL) { + std::ifstream in(filename); + if (!in || in.is_open()==false) { + #ifndef UCL_NO_EXIT + std::cerr << "UCL Error: Could not open kernel file: " + << filename << std::endl; + exit(1); + #endif + return UCL_FILE_NOT_FOUND; + } + + std::string program((std::istreambuf_iterator(in)), + std::istreambuf_iterator()); + in.close(); + return load_string(program.c_str(),flags,log); + } + + /// Load a program from a string and compile with flags + inline int load_string(const char *program, const char *flags="", + std::string *log=NULL) { + if (std::string(flags)=="BINARY") + return load_binary(program); + const unsigned int num_opts=2; + CUjit_option options[num_opts]; + void *values[num_opts]; + + // set up size of compilation log buffer + options[0] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES; + values[0] = (void *)(int)10240; + // set up pointer to the compilation log buffer + options[1] = CU_JIT_INFO_LOG_BUFFER; + char clog[10240]; + values[1] = clog; + + CUresult err=cuModuleLoadDataEx(&_module,program,num_opts, + options,(void **)values); + + if (log!=NULL) + *log=std::string(clog); + + if (err != CUDA_SUCCESS) { + #ifndef UCL_NO_EXIT + std::cerr << std::endl + << "----------------------------------------------------------\n" + << " UCL Error: Error compiling PTX Program...\n" + << "----------------------------------------------------------\n"; + std::cerr << log << std::endl; + #endif + 
return UCL_COMPILE_ERROR; + } + + return UCL_SUCCESS; + } + + /// Load a precompiled program from a file + inline int load_binary(const char *filename) { + CUmodule _module; + CUresult err = cuModuleLoad(&_module,filename); + if (err==301) { + #ifndef UCL_NO_EXIT + std::cerr << "UCL Error: Could not open binary kernel file: " + << filename << std::endl; + exit(1); + #endif + return UCL_FILE_NOT_FOUND; + } else if (err!=CUDA_SUCCESS) { + #ifndef UCL_NO_EXIT + std::cerr << "UCL Error: Error loading binary kernel file: " + << filename << std::endl; + exit(1); + #endif + return UCL_FILE_NOT_FOUND; + } + //int ucl_error=UCL_SUCCESS; + //if (err==301) + // return UCL_FILE_NOT_FOUND; + //else if (err!=CUDA_SUCCESS) + // return UCL_ERROR; + return UCL_SUCCESS; + } + + friend class UCL_Kernel; + private: + CUmodule _module; + friend class UCL_Texture; +}; + +/// Class for dealing with OpenCL kernels +class UCL_Kernel { + public: + UCL_Kernel() : _dimensions(1), _num_args(0), _param_size(0) + { _num_blocks[0]=0; } + + UCL_Kernel(UCL_Program &program, const char *function) : + _dimensions(1), _num_args(0), _param_size(0) + { _num_blocks[0]=0; set_function(program,function); } + + ~UCL_Kernel() {} + + /// Clear any function associated with the kernel + inline void clear() { } + + /// Get the kernel function from a program + /** \ret UCL_ERROR_FLAG (UCL_SUCCESS, UCL_FILE_NOT_FOUND, UCL_ERROR) **/ + inline int set_function(UCL_Program &program, const char *function) { + CUresult err=cuModuleGetFunction(&_kernel,program._module,function); + if (err!=CUDA_SUCCESS) { + #ifndef UCL_NO_EXIT + std::cerr << "UCL Error: Could not find function: " << function + << " in program.\n"; + exit(1); + #endif + return UCL_FUNCTION_NOT_FOUND; + } + return UCL_SUCCESS; + } + + /// Set the kernel argument. + /** If not a device pointer, this must be repeated each time the argument + * changes + * \note To set kernel parameter i (i>0), parameter i-1 must be set **/ + template + inline void set_arg(const unsigned index, dtype *arg) { + if (index==_num_args) + add_arg(arg); + else if (index<_num_args) + CU_SAFE_CALL(cuParamSetv(_kernel, _offsets[index], arg, sizeof(dtype))); + else + assert(0==1); // Must add kernel parameters in sequential order + } + + /// Add a kernel argument. + inline void add_arg(const CUdeviceptr* const arg) { + void* ptr = (void*)(size_t)(*arg); + _param_size = (_param_size + __alignof(ptr) - 1) & ~(__alignof(ptr) - 1); + CU_SAFE_CALL(cuParamSetv(_kernel, _param_size, &ptr, sizeof(ptr))); + _offsets.push_back(_param_size); + _param_size+=sizeof(ptr); + _num_args++; + } + + /// Add a kernel argument. 
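  /// (The argument value is copied into the launch parameter buffer right
  /// away by cuParamSetv, at an offset aligned to __alignof(dtype); device
  /// pointers go through the CUdeviceptr overload above instead.)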
+ template + inline void add_arg(const dtype* const arg) { + _param_size = (_param_size+__alignof(dtype)-1) & ~(__alignof(dtype)-1); + CU_SAFE_CALL(cuParamSetv(_kernel,_param_size,(void*)arg,sizeof(dtype))); + _offsets.push_back(_param_size); + _param_size+=sizeof(dtype); + _num_args++; + } + + /// Set the number of thread blocks and the number of threads in each block + /** \note This should be called after all arguments have been added **/ + inline void set_size(const size_t num_blocks, const size_t block_size) { + _dimensions=1; + _num_blocks[0]=num_blocks; + _num_blocks[1]=1; + CU_SAFE_CALL(cuFuncSetBlockShape(_kernel,block_size,1,1)); + } + + /// Set the number of thread blocks and the number of threads in each block + inline void set_size(const size_t num_blocks_x, const size_t num_blocks_y, + const size_t block_size_x, const size_t block_size_y) { + _dimensions=2; + _num_blocks[0]=num_blocks_x; + _num_blocks[1]=num_blocks_y; + CU_SAFE_CALL(cuFuncSetBlockShape(_kernel,block_size_x,block_size_y,1)); + } + + /// Set the number of thread blocks and the number of threads in each block + inline void set_size(const size_t num_blocks_x, const size_t num_blocks_y, + const size_t block_size_x, + const size_t block_size_y, const size_t block_size_z) { + _dimensions=2; + _num_blocks[0]=num_blocks_x; + _num_blocks[1]=num_blocks_y; + CU_SAFE_CALL(cuFuncSetBlockShape(_kernel,block_size_x,block_size_y, + block_size_z)); + } + + /// Run the kernel in the default command queue + inline void run() { + CU_SAFE_CALL(cuParamSetSize(_kernel,_param_size)); + CU_SAFE_CALL(cuLaunchGridAsync(_kernel,_num_blocks[0],_num_blocks[1],0)); + } + + /// Run the kernel in the specified command queue + inline void run(command_queue &cq) { + CU_SAFE_CALL(cuParamSetSize(_kernel,_param_size)); + CU_SAFE_CALL(cuLaunchGridAsync(_kernel,_num_blocks[0],_num_blocks[1],cq)); + } + + /// Clear any arguments associated with the kernel + inline void clear_args() { _num_args=0; _offsets.clear(); _param_size=0; } + + #include "ucl_arg_kludge.h" + + private: + CUfunction _kernel; + unsigned _dimensions; + unsigned _num_blocks[2]; + unsigned _num_args; + std::vector _offsets; + unsigned _param_size; + friend class UCL_Texture; +}; + +} // namespace + +#endif + diff --git a/lib/gpu/geryon/nvd_macros.h b/lib/gpu/geryon/nvd_macros.h new file mode 100644 index 0000000000..4e88fd3201 --- /dev/null +++ b/lib/gpu/geryon/nvd_macros.h @@ -0,0 +1,57 @@ +#ifndef NVD_MACROS_H +#define NVD_MACROS_H + +#include +#include +#include + +#if CUDA_VERSION >= 3020 +#define CUDA_INT_TYPE size_t +#else +#define CUDA_INT_TYPE unsigned +#endif + +#ifdef MPI_GERYON +#include "mpi.h" +#define NVD_GERYON_EXIT MPI_Abort(MPI_COMM_WORLD,-1) +#else +#define NVD_GERYON_EXIT assert(0==1) +#endif + +#ifndef UCL_NO_API_CHECK + +#define CU_SAFE_CALL_NS( call ) do { \ + CUresult err = call; \ + if( CUDA_SUCCESS != err) { \ + fprintf(stderr, "Cuda driver error %d in call at file '%s' in line %i.\n", \ + err, __FILE__, __LINE__ ); \ + NVD_GERYON_EXIT; \ + } } while (0) + +#ifdef UCL_SYNC_DEBUG + +#define CU_SAFE_CALL( call ) do { \ + CU_SAFE_CALL_NS( call ); \ + CUresult err=cuCtxSynchronize(); \ + if( CUDA_SUCCESS != err) { \ + fprintf(stderr, "Cuda driver error %d in file '%s' in line %i.\n", \ + err, __FILE__, __LINE__ ); \ + NVD_GERYON_EXIT; \ + } } while (0) + +#else + +#define CU_SAFE_CALL( call ) CU_SAFE_CALL_NS( call ) + +#endif + +#else // not DEBUG + +// void macros for performance reasons +#define CU_SAFE_CALL_NS( call ) call +#define CU_SAFE_CALL( call) call + 
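Taken together, nvd_device.h, nvd_kernel.h and the error-checking macros defined above cover the whole host-side path from device selection to kernel launch. The sketch below is illustrative only and is not part of the patch: it strings those pieces together assuming a hypothetical PTX string my_kernel_ptx (as file_to_cstr.sh would generate) that contains a kernel named my_kernel taking a single int argument.

// Illustrative sketch of the Geryon driver-API workflow (hypothetical kernel).
#include "nvd_device.h"
#include "nvd_kernel.h"
using namespace ucl_cudadr;

extern const char *my_kernel_ptx;     // hypothetical, from file_to_cstr.sh

int launch_once(const int n) {
  UCL_Device dev;                      // cuInit + property query for every GPU
  if (dev.num_devices()==0)
    return -1;
  dev.set(0);                          // creates the context for device 0

  UCL_Program prog(dev);
  if (prog.load_string(my_kernel_ptx)!=UCL_SUCCESS)   // JIT-compile the PTX
    return -2;

  UCL_Kernel k(prog,"my_kernel");      // cuModuleGetFunction by name
  k.add_arg(&n);                       // arguments are added in order
  k.set_size(n/64,64);                 // blocks x threads (assumes n%64==0),
                                       // called after all arguments are added
  k.run(dev.cq());                     // asynchronous launch, default stream
  dev.sync();                          // block until the launch has completed
  return 0;
}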
+#endif + +#endif + diff --git a/lib/gpu/geryon/nvd_mat.h b/lib/gpu/geryon/nvd_mat.h new file mode 100644 index 0000000000..ed42305a70 --- /dev/null +++ b/lib/gpu/geryon/nvd_mat.h @@ -0,0 +1,54 @@ +/*************************************************************************** + nvd_mat.h + ------------------- + W. Michael Brown + + CUDA Driver Specific Vector/Matrix Containers, Memory Management, and I/O + + __________________________________________________________________________ + This file is part of the Geryon Unified Coprocessor Library (UCL) + __________________________________________________________________________ + + begin : Thu Jan 21 2010 + copyright : (C) 2010 by W. Michael Brown + email : brownw@ornl.gov + ***************************************************************************/ + +/* ----------------------------------------------------------------------- + Copyright (2010) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the Simplified BSD License. + ----------------------------------------------------------------------- */ + +/*! \file */ + +#ifndef NVD_MAT_H +#define NVD_MAT_H + +#include "nvd_memory.h" + +/// Namespace for CUDA Driver routines +namespace ucl_cudadr { + +#define _UCL_MAT_ALLOW +#define _UCL_DEVICE_PTR_MAT +#include "ucl_basemat.h" +#include "ucl_h_vec.h" +#include "ucl_h_mat.h" +#include "ucl_d_vec.h" +#include "ucl_d_mat.h" +#undef _UCL_DEVICE_PTR_MAT +#undef _UCL_MAT_ALLOW + +#define UCL_COPY_ALLOW +#include "ucl_copy.h" +#undef UCL_COPY_ALLOW + +#define UCL_PRINT_ALLOW +#include "ucl_print.h" +#undef UCL_PRINT_ALLOW + +} // namespace ucl_cudadr + +#endif diff --git a/lib/gpu/geryon/nvd_memory.h b/lib/gpu/geryon/nvd_memory.h new file mode 100644 index 0000000000..2bb6762370 --- /dev/null +++ b/lib/gpu/geryon/nvd_memory.h @@ -0,0 +1,610 @@ +/*************************************************************************** + nvd_memory.h + ------------------- + W. Michael Brown + + CUDA Driver Specific Memory Management and Vector/Matrix Containers + + __________________________________________________________________________ + This file is part of the Geryon Unified Coprocessor Library (UCL) + __________________________________________________________________________ + + begin : Thu Jan 21 2010 + copyright : (C) 2010 by W. Michael Brown + email : brownw@ornl.gov + ***************************************************************************/ + +/* ----------------------------------------------------------------------- + Copyright (2010) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the Simplified BSD License. 
+ ----------------------------------------------------------------------- */ + +#ifndef NVD_MEMORY_H +#define NVD_MEMORY_H + +#include +#include +#include +#include "nvd_macros.h" +#include "ucl_types.h" + +namespace ucl_cudadr { + +// -------------------------------------------------------------------------- +// - API Specific Types +// -------------------------------------------------------------------------- +//typedef dim3 ucl_kernel_dim; + +// -------------------------------------------------------------------------- +// - API SPECIFIC DEVICE POINTERS +// -------------------------------------------------------------------------- +typedef CUdeviceptr device_ptr; + +// -------------------------------------------------------------------------- +// - HOST MEMORY ALLOCATION ROUTINES +// -------------------------------------------------------------------------- +template +inline int _host_alloc(mat_type &mat, copy_type &cm, const size_t n, + const enum UCL_MEMOPT kind) { + CUresult err=CUDA_SUCCESS; + if (kind==UCL_RW_OPTIMIZED) + err=cuMemAllocHost((void **)mat.host_ptr(),n); + else if (kind==UCL_WRITE_OPTIMIZED) + err=cuMemHostAlloc((void **)mat.host_ptr(),n,CU_MEMHOSTALLOC_WRITECOMBINED); + else + *(mat.host_ptr())=(typename mat_type::data_type*)malloc(n); + if (err!=CUDA_SUCCESS || *(mat.host_ptr())==NULL) + return UCL_MEMORY_ERROR; + return UCL_SUCCESS; +} + +template +inline int _host_alloc(mat_type &mat, UCL_Device &dev, const size_t n, + const enum UCL_MEMOPT kind) { + CUresult err=CUDA_SUCCESS; + if (kind==UCL_RW_OPTIMIZED) + err=cuMemAllocHost((void **)mat.host_ptr(),n); + else if (kind==UCL_WRITE_OPTIMIZED) + err=cuMemHostAlloc((void **)mat.host_ptr(),n,CU_MEMHOSTALLOC_WRITECOMBINED); + else + *(mat.host_ptr())=(typename mat_type::data_type*)malloc(n); + if (err!=CUDA_SUCCESS || *(mat.host_ptr())==NULL) + return UCL_MEMORY_ERROR; + return UCL_SUCCESS; +} + +template +inline void _host_free(mat_type &mat, const enum UCL_MEMOPT kind) { + if (kind!=UCL_NOT_PINNED) + CU_SAFE_CALL(cuMemFreeHost(mat.begin())); + else + free(mat.begin()); +} + +// -------------------------------------------------------------------------- +// - DEVICE MEMORY ALLOCATION ROUTINES +// -------------------------------------------------------------------------- +template +inline int _device_alloc(mat_type &mat, copy_type &cm, const size_t n, + const enum UCL_MEMOPT kind) { + CUresult err=cuMemAlloc(&mat.cbegin(),n); + if (err!=CUDA_SUCCESS) + return UCL_MEMORY_ERROR; + return UCL_SUCCESS; +} + +template +inline int _device_alloc(mat_type &mat, UCL_Device &dev, const size_t n, + const enum UCL_MEMOPT kind) { + CUresult err=cuMemAlloc(&mat.cbegin(),n); + if (err!=CUDA_SUCCESS) + return UCL_MEMORY_ERROR; + return UCL_SUCCESS; +} + +template +inline int _device_alloc(mat_type &mat, copy_type &cm, const size_t rows, + const size_t cols, size_t &pitch, + const enum UCL_MEMOPT kind) { + CUresult err; + CUDA_INT_TYPE upitch; + err=cuMemAllocPitch(&mat.cbegin(),&upitch, + cols*sizeof(typename mat_type::data_type),rows,16); + pitch=static_cast(upitch); + if (err!=CUDA_SUCCESS) + return UCL_MEMORY_ERROR; + return UCL_SUCCESS; +} + +template +inline int _device_alloc(mat_type &mat, UCL_Device &d, const size_t rows, + const size_t cols, size_t &pitch, + const enum UCL_MEMOPT kind) { + CUresult err; + unsigned upitch; + err=cuMemAllocPitch(&mat.cbegin(),&upitch, + cols*sizeof(typename mat_type::data_type),rows,16); + pitch=static_cast(upitch); + if (err!=CUDA_SUCCESS) + return UCL_MEMORY_ERROR; + return UCL_SUCCESS; +} + 
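// Illustrative sketch (not part of the patch): how the pitch returned by
// cuMemAllocPitch above is used to address element (row,col) of a padded 2D
// allocation. The helper name is hypothetical, not Geryon API.

#include <cuda.h>
#include <cstddef>

template <class numtyp>
inline CUdeviceptr pitched_element(const CUdeviceptr base,
                                   const size_t pitch,   // row stride in bytes
                                   const size_t row, const size_t col) {
  // Rows begin every `pitch` bytes even though only cols*sizeof(numtyp)
  // bytes of each row hold data; the padding keeps rows aligned.
  return base + row*pitch + col*sizeof(numtyp);
}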
+template +inline void _device_free(mat_type &mat) { + CU_SAFE_CALL(cuMemFree(mat.cbegin())); +} + +inline void _device_view(CUdeviceptr *ptr, CUdeviceptr &in) { + *ptr=in; +} + +template +inline void _device_view(CUdeviceptr *ptr, numtyp *in) { + *ptr=0; +} + +inline void _device_view(CUdeviceptr *ptr, CUdeviceptr &in, + const size_t offset, const size_t numsize) { + *ptr=in+offset*numsize; +} + +template +inline void _device_view(CUdeviceptr *ptr, numtyp *in, + const size_t offset, const size_t numsize) { + *ptr=0; +} + +// -------------------------------------------------------------------------- +// - DEVICE IMAGE ALLOCATION ROUTINES +// -------------------------------------------------------------------------- +template +inline void _device_image_alloc(mat_type &mat, copy_type &cm, const size_t rows, + const size_t cols) { + assert(0==1); +} + +template +inline void _device_image_alloc(mat_type &mat, UCL_Device &d, const size_t rows, + const size_t cols) { + assert(0==1); +} + +template +inline void _device_image_free(mat_type &mat) { + assert(0==1); +} + +// -------------------------------------------------------------------------- +// - ZERO ROUTINES +// -------------------------------------------------------------------------- +inline void _host_zero(void *ptr, const size_t n) { + memset(ptr,0,n); +} + +template +inline void _device_zero(mat_type &mat, const size_t n) { + if (n%32==0) + CU_SAFE_CALL(cuMemsetD32(mat.cbegin(),0,n/4)); + else if (n%16==0) + CU_SAFE_CALL(cuMemsetD16(mat.cbegin(),0,n/2)); + else + CU_SAFE_CALL(cuMemsetD8(mat.cbegin(),0,n)); +} + +// -------------------------------------------------------------------------- +// - HELPER FUNCTIONS FOR MEMCPY ROUTINES +// -------------------------------------------------------------------------- + +inline void _nvd_set_2D_loc(CUDA_MEMCPY2D &ins, const size_t dpitch, + const size_t spitch, const size_t cols, + const size_t rows) { + ins.srcXInBytes=0; + ins.srcY=0; + ins.srcPitch=spitch; + ins.dstXInBytes=0; + ins.dstY=0; + ins.dstPitch=dpitch; + ins.WidthInBytes=cols; + ins.Height=rows; +} + +template struct _nvd_set_2D_mem; +template <> struct _nvd_set_2D_mem<1> + { static CUmemorytype a() { return CU_MEMORYTYPE_HOST; } }; +template <> struct _nvd_set_2D_mem<2> + { static CUmemorytype a() { return CU_MEMORYTYPE_ARRAY; } }; +template struct _nvd_set_2D_mem + { static CUmemorytype a() { return CU_MEMORYTYPE_DEVICE; } }; + + +// -------------------------------------------------------------------------- +// - MEMCPY ROUTINES +// -------------------------------------------------------------------------- + +template struct _ucl_memcpy; + +// Both are images +template<> struct _ucl_memcpy<2,2> { + template + static inline void mc(p1 &dst, const p2 &src, const size_t n) { + assert(0==1); + } + template + static inline void mc(p1 &dst, const p2 &src, const size_t n, + CUstream &cq) { + assert(0==1); + } + template + static inline void mc(p1 &dst, const size_t dpitch, const p2 &src, + const size_t spitch, const size_t cols, + const size_t rows) { + CUDA_MEMCPY2D ins; + _nvd_set_2D_loc(ins,dpitch,spitch,cols,rows); + ins.dstMemoryType=_nvd_set_2D_mem::a(); + ins.srcMemoryType=_nvd_set_2D_mem::a(); + ins.dstArray=dst.cbegin(); + ins.srcArray=src.cbegin(); + CU_SAFE_CALL(cuMemcpy2D(&ins)); + } + template + static inline void mc(p1 &dst, const size_t dpitch, const p2 &src, + const size_t spitch, const size_t cols, + const size_t rows, CUstream &cq) { + CUDA_MEMCPY2D ins; + _nvd_set_2D_loc(ins,dpitch,spitch,cols,rows); + 
ins.dstMemoryType=_nvd_set_2D_mem::a(); + ins.srcMemoryType=_nvd_set_2D_mem::a(); + ins.dstArray=dst.cbegin(); + ins.srcArray=src.cbegin(); + CU_SAFE_CALL(cuMemcpy2DAsync(&ins,cq)); + } +}; + +// Destination is texture, source on device +template<> struct _ucl_memcpy<2,0> { + template + static inline void mc(p1 &dst, const p2 &src, const size_t n) { + assert(0==1); + } + template + static inline void mc(p1 &dst, const p2 &src, const size_t n, + CUstream &cq) { + assert(0==1); + } + template + static inline void mc(p1 &dst, const size_t dpitch, const p2 &src, + const size_t spitch, const size_t cols, + const size_t rows) { + CUDA_MEMCPY2D ins; + _nvd_set_2D_loc(ins,dpitch,spitch,cols,rows); + ins.dstMemoryType=_nvd_set_2D_mem::a(); + ins.srcMemoryType=_nvd_set_2D_mem::a(); + ins.dstArray=dst.cbegin(); + ins.srcDevice=src.cbegin(); + CU_SAFE_CALL(cuMemcpy2D(&ins)); + } + template + static inline void mc(p1 &dst, const size_t dpitch, const p2 &src, + const size_t spitch, const size_t cols, + const size_t rows, CUstream &cq) { + CUDA_MEMCPY2D ins; + _nvd_set_2D_loc(ins,dpitch,spitch,cols,rows); + ins.dstMemoryType=_nvd_set_2D_mem::a(); + ins.srcMemoryType=_nvd_set_2D_mem::a(); + ins.dstArray=dst.cbegin(); + ins.srcDevice=src.cbegin(); + CU_SAFE_CALL(cuMemcpy2DAsync(&ins,cq)); + } +}; + +// Destination is texture, source on host +template<> struct _ucl_memcpy<2,1> { + template + static inline void mc(p1 &dst, const p2 &src, const size_t n) { + assert(0==1); + } + template + static inline void mc(p1 &dst, const p2 &src, const size_t n, + CUstream &cq) { + assert(0==1); + } + template + static inline void mc(p1 &dst, const size_t dpitch, const p2 &src, + const size_t spitch, const size_t cols, + const size_t rows) { + CUDA_MEMCPY2D ins; + _nvd_set_2D_loc(ins,dpitch,spitch,cols,rows); + ins.dstMemoryType=_nvd_set_2D_mem::a(); + ins.srcMemoryType=_nvd_set_2D_mem::a(); + ins.dstArray=dst.cbegin(); + ins.srcHost=src.begin(); + CU_SAFE_CALL(cuMemcpy2D(&ins)); + } + template + static inline void mc(p1 &dst, const size_t dpitch, const p2 &src, + const size_t spitch, const size_t cols, + const size_t rows, CUstream &cq) { + CUDA_MEMCPY2D ins; + _nvd_set_2D_loc(ins,dpitch,spitch,cols,rows); + ins.dstMemoryType=_nvd_set_2D_mem::a(); + ins.srcMemoryType=_nvd_set_2D_mem::a(); + ins.dstArray=dst.cbegin(); + ins.srcHost=src.begin(); + CU_SAFE_CALL(cuMemcpy2DAsync(&ins,cq)); + } +}; + +// Source is texture, dest on device +template<> struct _ucl_memcpy<0,2> { + template + static inline void mc(p1 &dst, const p2 &src, const size_t n) { + assert(0==1); + } + template + static inline void mc(p1 &dst, const p2 &src, const size_t n, + CUstream &cq) { + assert(0==1); + } + template + static inline void mc(p1 &dst, const size_t dpitch, const p2 &src, + const size_t spitch, const size_t cols, + const size_t rows) { + CUDA_MEMCPY2D ins; + _nvd_set_2D_loc(ins,dpitch,spitch,cols,rows); + ins.dstMemoryType=_nvd_set_2D_mem::a(); + ins.srcMemoryType=_nvd_set_2D_mem::a(); + ins.dstDevice=dst.cbegin(); + ins.srcArray=src.cbegin(); + CU_SAFE_CALL(cuMemcpy2D(&ins)); + } + template + static inline void mc(p1 &dst, const size_t dpitch, const p2 &src, + const size_t spitch, const size_t cols, + const size_t rows, CUstream &cq) { + CUDA_MEMCPY2D ins; + _nvd_set_2D_loc(ins,dpitch,spitch,cols,rows); + ins.dstMemoryType=_nvd_set_2D_mem::a(); + ins.srcMemoryType=_nvd_set_2D_mem::a(); + ins.dstDevice=dst.cbegin(); + ins.srcArray=src.cbegin(); + CU_SAFE_CALL(cuMemcpy2DAsync(&ins,cq)); + } +}; + +// Source is texture, dest on host 
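// (Descriptive note, added for clarity: in each _ucl_memcpy<dst,src>
//  specialization the first integer encodes the destination memory type and
//  the second the source -- 1 for host, 2 for image/texture, anything else
//  for device -- so the matching cuMemcpy* driver routine is selected at
//  compile time from the container types passed to ucl_mv_cpy.)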
+template<> struct _ucl_memcpy<1,2> { + template + static inline void mc(p1 &dst, const p2 &src, const size_t n) { + assert(0==1); + } + template + static inline void mc(p1 &dst, const p2 &src, const size_t n, + CUstream &cq) { + assert(0==1); + } + template + static inline void mc(p1 &dst, const size_t dpitch, const p2 &src, + const size_t spitch, const size_t cols, + const size_t rows) { + CUDA_MEMCPY2D ins; + _nvd_set_2D_loc(ins,dpitch,spitch,cols,rows); + ins.dstMemoryType=_nvd_set_2D_mem::a(); + ins.srcMemoryType=_nvd_set_2D_mem::a(); + ins.dstHost=dst.begin(); + ins.srcArray=src.cbegin(); + CU_SAFE_CALL(cuMemcpy2D(&ins)); + } + template + static inline void mc(p1 &dst, const size_t dpitch, const p2 &src, + const size_t spitch, const size_t cols, + const size_t rows, CUstream &cq) { + CUDA_MEMCPY2D ins; + _nvd_set_2D_loc(ins,dpitch,spitch,cols,rows); + ins.dstMemoryType=_nvd_set_2D_mem::a(); + ins.srcMemoryType=_nvd_set_2D_mem::a(); + ins.dstHost=dst.begin(); + ins.srcArray=src.cbegin(); + CU_SAFE_CALL(cuMemcpy2DAsync(&ins,cq)); + } +}; + +// Neither are textures, destination on host +template <> struct _ucl_memcpy<1,0> { + template + static inline void mc(p1 &dst, const p2 &src, const size_t n) { + CU_SAFE_CALL(cuMemcpyDtoH(dst.begin(),src.cbegin(),n)); + } + template + static inline void mc(p1 &dst, const p2 &src, const size_t n, + CUstream &cq) { + CU_SAFE_CALL(cuMemcpyDtoHAsync(dst.begin(),src.cbegin(),n,cq)); + } + template + static inline void mc(p1 &dst, const size_t dpitch, const p2 &src, + const size_t spitch, const size_t cols, + const size_t rows) { + CUDA_MEMCPY2D ins; + _nvd_set_2D_loc(ins,dpitch,spitch,cols,rows); + ins.dstMemoryType=_nvd_set_2D_mem::a(); + ins.srcMemoryType=_nvd_set_2D_mem::a(); + ins.dstHost=dst.begin(); + ins.srcDevice=src.cbegin(); + CU_SAFE_CALL(cuMemcpy2D(&ins)); + } + template + static inline void mc(p1 &dst, const size_t dpitch, const p2 &src, + const size_t spitch, const size_t cols, + const size_t rows, CUstream &cq) { + CUDA_MEMCPY2D ins; + _nvd_set_2D_loc(ins,dpitch,spitch,cols,rows); + ins.dstMemoryType=_nvd_set_2D_mem::a(); + ins.srcMemoryType=_nvd_set_2D_mem::a(); + ins.dstHost=dst.begin(); + ins.srcDevice=src.cbegin(); + CU_SAFE_CALL(cuMemcpy2DAsync(&ins,cq)); + } +}; + +// Neither are textures, source on host +template <> struct _ucl_memcpy<0,1> { + template + static inline void mc(p1 &dst, const p2 &src, const size_t n) { + CU_SAFE_CALL(cuMemcpyHtoD(dst.cbegin(),src.begin(),n)); + } + template + static inline void mc(p1 &dst, const p2 &src, const size_t n, + CUstream &cq) { + CU_SAFE_CALL(cuMemcpyHtoDAsync(dst.cbegin(),src.begin(),n,cq)); + } + template + static inline void mc(p1 &dst, const size_t dpitch, const p2 &src, + const size_t spitch, const size_t cols, + const size_t rows) { + CUDA_MEMCPY2D ins; + _nvd_set_2D_loc(ins,dpitch,spitch,cols,rows); + ins.dstMemoryType=_nvd_set_2D_mem::a(); + ins.srcMemoryType=_nvd_set_2D_mem::a(); + ins.dstDevice=dst.cbegin(); + ins.srcHost=src.begin(); + CU_SAFE_CALL(cuMemcpy2D(&ins)); + } + template + static inline void mc(p1 &dst, const size_t dpitch, const p2 &src, + const size_t spitch, const size_t cols, + const size_t rows, CUstream &cq) { + CUDA_MEMCPY2D ins; + _nvd_set_2D_loc(ins,dpitch,spitch,cols,rows); + ins.dstMemoryType=_nvd_set_2D_mem::a(); + ins.srcMemoryType=_nvd_set_2D_mem::a(); + ins.dstDevice=dst.cbegin(); + ins.srcHost=src.begin(); + CU_SAFE_CALL(cuMemcpy2DAsync(&ins,cq)); + } +}; + +// Neither are textures, both on host +template <> struct _ucl_memcpy<1,1> { + template + static 
inline void mc(p1 &dst, const p2 &src, const size_t n) + { memcpy(dst.begin(),src.begin(),n); } + template + static inline void mc(p1 &dst, const p2 &src, const size_t n, + CUstream &cq) + { memcpy(dst.begin(),src.begin(),n); } + template + static inline void mc(p1 &dst, const size_t dpitch, const p2 &src, + const size_t spitch, const size_t cols, + const size_t rows) { + CUDA_MEMCPY2D ins; + _nvd_set_2D_loc(ins,dpitch,spitch,cols,rows); + ins.dstMemoryType=_nvd_set_2D_mem::a(); + ins.srcMemoryType=_nvd_set_2D_mem::a(); + ins.dstHost=dst.begin(); + ins.srcHost=src.begin(); + CU_SAFE_CALL(cuMemcpy2D(&ins)); + } + template + static inline void mc(p1 &dst, const size_t dpitch, const p2 &src, + const size_t spitch, const size_t cols, + const size_t rows, CUstream &cq) { + CUDA_MEMCPY2D ins; + _nvd_set_2D_loc(ins,dpitch,spitch,cols,rows); + ins.dstMemoryType=_nvd_set_2D_mem::a(); + ins.srcMemoryType=_nvd_set_2D_mem::a(); + ins.dstHost=dst.begin(); + ins.srcHost=src.begin(); + CU_SAFE_CALL(cuMemcpy2DAsync(&ins,cq)); + } +}; + +// Neither are textures, both on device +template struct _ucl_memcpy { + template + static inline void mc(p1 &dst, const p2 &src, const size_t n) { + CU_SAFE_CALL(cuMemcpyDtoD(dst.cbegin(),src.cbegin(),n)); + } + template + static inline void mc(p1 &dst, const p2 &src, const size_t n, + CUstream &cq) { + CU_SAFE_CALL(cuMemcpyDtoD(dst.cbegin(),src.cbegin(),n)); + } + template + static inline void mc(p1 &dst, const size_t dpitch, const p2 &src, + const size_t spitch, const size_t cols, + const size_t rows) { + if (p1::PADDED==0 || p2::PADDED==0) { + size_t src_offset=0, dst_offset=0; + for (size_t i=0; i::a(); + ins.srcMemoryType=_nvd_set_2D_mem::a(); + ins.dstDevice=dst.cbegin(); + ins.srcDevice=src.cbegin(); + CU_SAFE_CALL(cuMemcpy2D(&ins)); + } + } + template + static inline void mc(p1 &dst, const size_t dpitch, const p2 &src, + const size_t spitch, const size_t cols, + const size_t rows, CUstream &cq) { + if (p1::PADDED==0 || p2::PADDED==0) { + size_t src_offset=0, dst_offset=0; + for (size_t i=0; i::a(); + ins.srcMemoryType=_nvd_set_2D_mem::a(); + ins.dstDevice=dst.cbegin(); + ins.srcDevice=src.cbegin(); + CU_SAFE_CALL(cuMemcpy2DAsync(&ins,cq)); + } + } +}; + +template +inline void ucl_mv_cpy(mat1 &dst, const mat2 &src, const size_t n) { + _ucl_memcpy::mc(dst,src,n); +} + +template +inline void ucl_mv_cpy(mat1 &dst, const mat2 &src, const size_t n, + CUstream &cq) { + _ucl_memcpy::mc(dst,src,n,cq); +} + +template +inline void ucl_mv_cpy(mat1 &dst, const size_t dpitch, const mat2 &src, + const size_t spitch, const size_t cols, + const size_t rows) { + _ucl_memcpy::mc(dst,dpitch,src,spitch,cols, + rows); +} + +template +inline void ucl_mv_cpy(mat1 &dst, const size_t dpitch, const mat2 &src, + const size_t spitch, const size_t cols, + const size_t rows,CUstream &cq) { + _ucl_memcpy::mc(dst,dpitch,src,spitch,cols, + rows,cq); +} + +} // namespace ucl_cudart + +#endif + diff --git a/lib/gpu/geryon/nvd_texture.h b/lib/gpu/geryon/nvd_texture.h new file mode 100644 index 0000000000..3fbf80180b --- /dev/null +++ b/lib/gpu/geryon/nvd_texture.h @@ -0,0 +1,71 @@ +/*************************************************************************** + nvd_texture.h + ------------------- + W. 
Michael Brown + + Utilities for dealing with CUDA Driver textures + + __________________________________________________________________________ + This file is part of the Geryon Unified Coprocessor Library (UCL) + __________________________________________________________________________ + + begin : Fri Jul 2 2010 + copyright : (C) 2010 by W. Michael Brown + email : brownw@ornl.gov + ***************************************************************************/ + +/* ----------------------------------------------------------------------- + Copyright (2010) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the Simplified BSD License. + ----------------------------------------------------------------------- */ + +#ifndef NVD_TEXTURE +#define NVD_TEXTURE + +#include "nvd_kernel.h" +#include "nvd_mat.h" + +namespace ucl_cudadr { + +/// Class storing a texture reference +class UCL_Texture { + public: + UCL_Texture() {} + ~UCL_Texture() {} + /// Construct with a specified texture reference + inline UCL_Texture(UCL_Program &prog, const char *texture_name) + { get_texture(prog,texture_name); } + /// Set the texture reference for this object + inline void get_texture(UCL_Program &prog, const char *texture_name) + { CU_SAFE_CALL(cuModuleGetTexRef(&_tex, prog._module, texture_name)); } + + /// Bind a float array where each fetch grabs a vector of length numel + template + inline void bind_float(mat_typ &vec, const unsigned numel) { + #ifdef UCL_DEBUG + assert(numel!=0 && numel<5); + #endif + CU_SAFE_CALL(cuTexRefSetAddress(NULL, _tex, vec.cbegin(), + vec.numel()*vec.element_size())); + CU_SAFE_CALL(cuTexRefSetFormat(_tex, CU_AD_FORMAT_FLOAT, numel)); + } + + /// Unbind the texture reference from the memory allocation + inline void unbind() { } + + /// Make a texture reference available to kernel + inline void allow(UCL_Kernel &kernel) { + CU_SAFE_CALL(cuParamSetTexRef(kernel._kernel, CU_PARAM_TR_DEFAULT, _tex)); + } + + private: + CUtexref _tex; + friend class UCL_Kernel; +}; + +} // namespace + +#endif + diff --git a/lib/gpu/geryon/nvd_timer.h b/lib/gpu/geryon/nvd_timer.h new file mode 100644 index 0000000000..cf7cf6c572 --- /dev/null +++ b/lib/gpu/geryon/nvd_timer.h @@ -0,0 +1,106 @@ +/*************************************************************************** + nvd_timer.h + ------------------- + W. Michael Brown + + Class for timing CUDA Driver routines + + __________________________________________________________________________ + This file is part of the Geryon Unified Coprocessor Library (UCL) + __________________________________________________________________________ + + begin : Fri Jan 22 2010 + copyright : (C) 2010 by W. Michael Brown + email : brownw@ornl.gov + ***************************************************************************/ + +/* ----------------------------------------------------------------------- + Copyright (2010) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the Simplified BSD License. 
+ ----------------------------------------------------------------------- */ + +#ifndef NVD_TIMER_H +#define NVD_TIMER_H + +#include "nvd_macros.h" + +namespace ucl_cudadr { + +/// Class for timing CUDA Driver events +class UCL_Timer { + public: + inline UCL_Timer() : _total_time(0.0f), _initialized(false) { } + inline UCL_Timer(UCL_Device &dev) : _total_time(0.0f), _initialized(false) + { init(dev); } + + inline ~UCL_Timer() { clear(); } + + /// Clear any data associated with timer + /** \note init() must be called to reuse timer after a clear() **/ + inline void clear() { + if (_initialized) { + CU_SAFE_CALL(cuEventDestroy(start_event)); + CU_SAFE_CALL(cuEventDestroy(stop_event)); + _initialized=false; + _total_time=0.0; + } + } + + /// Initialize default command queue for timing + inline void init(UCL_Device &dev) { init(dev, dev.cq()); } + + /// Initialize command queue for timing + inline void init(UCL_Device &dev, command_queue &cq) { + clear(); + _cq=cq; + _initialized=true; + CU_SAFE_CALL( cuEventCreate(&start_event,0) ); + CU_SAFE_CALL( cuEventCreate(&stop_event,0) ); + } + + /// Start timing on command queue + inline void start() { CU_SAFE_CALL(cuEventRecord(start_event,_cq)); } + + /// Stop timing on command queue + inline void stop() { CU_SAFE_CALL(cuEventRecord(stop_event,_cq)); } + + /// Set the time elapsed to zero (not the total_time) + inline void zero() { + CU_SAFE_CALL(cuEventRecord(start_event,_cq)); + CU_SAFE_CALL(cuEventRecord(stop_event,_cq)); + } + + /// Add time from previous start and stop to total + /** Forces synchronization **/ + inline double add_to_total() + { double t=time(); _total_time+=t; return t/1000.0; } + + /// Return the time (ms) of last start to stop - Forces synchronization + inline double time() { + float timer; + CU_SAFE_CALL(cuEventSynchronize(stop_event)); + CU_SAFE_CALL( cuEventElapsedTime(&timer,start_event,stop_event) ); + return timer; + } + + /// Return the time (s) of last start to stop - Forces synchronization + inline double seconds() { return time()/1000.0; } + + /// Return the total time in ms + inline double total_time() { return _total_time; } + + /// Return the total time in seconds + inline double total_seconds() { return _total_time/1000.0; } + + private: + CUevent start_event, stop_event; + CUstream _cq; + double _total_time; + bool _initialized; +}; + +} // namespace + +#endif diff --git a/lib/gpu/geryon/ocl_device.h b/lib/gpu/geryon/ocl_device.h new file mode 100644 index 0000000000..8ef5f32454 --- /dev/null +++ b/lib/gpu/geryon/ocl_device.h @@ -0,0 +1,449 @@ +/*************************************************************************** + ocl_device.h + ------------------- + W. Michael Brown + + Utilities for dealing with OpenCL devices + + __________________________________________________________________________ + This file is part of the Geryon Unified Coprocessor Library (UCL) + __________________________________________________________________________ + + begin : Mon Dec 23 2009 + copyright : (C) 2009 by W. Michael Brown + email : brownw@ornl.gov + ***************************************************************************/ + +/* ----------------------------------------------------------------------- + Copyright (2009) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the Simplified BSD License. 
+ ----------------------------------------------------------------------- */ + +#ifndef OCL_DEVICE +#define OCL_DEVICE + +#include +#include +#include + +#include "CL/cl.h" +#include "CL/cl_platform.h" +#include "ocl_macros.h" +#include "ucl_types.h" + +namespace ucl_opencl { + +// -------------------------------------------------------------------------- +// - COMMAND QUEUE STUFF +// -------------------------------------------------------------------------- +typedef cl_command_queue command_queue; +typedef cl_context context_type; + +inline void ucl_sync(cl_command_queue &cq) { + CL_SAFE_CALL(clFinish(cq)); +} + +struct OCLProperties { + std::string name; + cl_device_type device_type; + cl_ulong global_mem; + cl_ulong shared_mem; + cl_ulong const_mem; + cl_uint compute_units; + cl_uint clock; + size_t work_group_size; + size_t work_item_size[3]; + bool double_precision; + int alignment; + size_t timer_resolution; +}; + +/// Class for looking at data parallel device properties +/** \note Calls to change the device outside of the class results in incorrect + * behavior + * \note There is no error checking for indexing past the number of devices **/ +class UCL_Device { + public: + /// Collect properties for every device on the node + /** \note You must set the active GPU with set() before using the device **/ + UCL_Device(); + + ~UCL_Device(); + + /// Return the number of platforms (0 if error or no platforms) + inline int num_platforms() { return _num_platforms; } + + /// Return a string with name and info of the current platform + std::string platform_name(); + + /// Return the number of devices that support OpenCL + inline int num_devices() { return _num_devices; } + + /// Set the OpenCL device to the specified device number + /** A context and default command queue will be created for the device **/ + void set(int num); + + /// Get the current device number + inline int device_num() { return _device; } + + /// Returns the context for the current device + inline cl_context & context() { return _context; } + + /// Returns the default stream for the current device + inline command_queue & cq() { return cq(0); } + + /// Returns the stream indexed by i + inline command_queue & cq(const int i) { return _cq[i]; } + + /// Block until all commands in the default stream have completed + inline void sync() { sync(0); } + + /// Block until all commands in the specified stream have completed + inline void sync(const int i) { ucl_sync(cq(i)); } + + /// Get the number of command queues currently available on device + inline int num_queues() + { return _cq.size(); } + + /// Add a command queue for device computations (with profiling enabled) + inline void push_command_queue() { + cl_int errorv; + _cq.push_back(cl_command_queue()); + _cq.back()=clCreateCommandQueue(_context,_cl_device, + CL_QUEUE_PROFILING_ENABLE,&errorv); + if (errorv!=CL_SUCCESS) { + std::cerr << "Could not create command queue on device: " << name() + << std::endl; + exit(1); + } + } + + /// Remove a stream for device computations + /** \note You cannot delete the default stream **/ + inline void pop_command_queue() { + if (_cq.size()<2) return; + CL_SAFE_CALL(clReleaseCommandQueue(_cq.back())); + _cq.pop_back(); + } + + /// Get the current OpenCL device name + inline std::string name() { return name(_device); } + /// Get the OpenCL device name + inline std::string name(const int i) + { return std::string(_properties[i].name); } + + /// Get a string telling the type of the current device + inline std::string device_type_name() { 
return device_type_name(_device); } + /// Get a string telling the type of the device + inline std::string device_type_name(const int i); + + /// Get current device type (UCL_CPU, UCL_GPU, UCL_ACCELERATOR, UCL_DEFAULT) + inline int device_type() { return device_type(_device); } + /// Get device type (UCL_CPU, UCL_GPU, UCL_ACCELERATOR, UCL_DEFAULT) + inline int device_type(const int i); + + /// Returns true if double precision is support for the current device + bool double_precision() { return double_precision(_device); } + /// Returns true if double precision is support for the device + bool double_precision(const int i) {return _properties[i].double_precision;} + + /// Get the number of cores in the current device + inline unsigned cores() { return cores(_device); } + /// Get the number of cores + inline unsigned cores(const int i) + { if (device_type(i)==UCL_CPU) return _properties[i].compute_units; + else return _properties[i].compute_units*8; } + + /// Get the gigabytes of global memory in the current device + inline double gigabytes() { return gigabytes(_device); } + /// Get the gigabytes of global memory + inline double gigabytes(const int i) + { return static_cast(_properties[i].global_mem)/1073741824; } + + /// Get the bytes of global memory in the current device + inline size_t bytes() { return bytes(_device); } + /// Get the bytes of global memory + inline size_t bytes(const int i) { return _properties[i].global_mem; } + + /// Return the GPGPU revision number for current device + //inline double revision() { return revision(_device); } + /// Return the GPGPU revision number + //inline double revision(const int i) + // { return //static_cast(_properties[i].minor)/10+_properties[i].major;} + + /// Clock rate in GHz for current device + inline double clock_rate() { return clock_rate(_device); } + /// Clock rate in GHz + inline double clock_rate(const int i) { return _properties[i].clock*1e-3;} + + /// Return the address alignment in bytes + inline int alignment() { return alignment(_device); } + /// Return the address alignment in bytes + inline int alignment(const int i) { return _properties[i].alignment; } + + /// Return the timer resolution + inline size_t timer_resolution() { return timer_resolution(_device); } + /// Return the timer resolution + inline size_t timer_resolution(const int i) + { return _properties[i].timer_resolution; } + + /// Get the maximum number of threads per block + inline size_t group_size() { return group_size(_device); } + /// Get the maximum number of threads per block + inline size_t group_size(const int i) + { return _properties[i].work_group_size; } + + /// Return the maximum memory pitch in bytes for current device + inline size_t max_pitch() { return max_pitch(_device); } + /// Return the maximum memory pitch in bytes + inline size_t max_pitch(const int i) { return 0; } + + /// List all devices along with all properties + void print_all(std::ostream &out); + + /// Return the OpenCL type for the device + inline cl_device_id & cl_device() { return _cl_device; } + + private: + int _num_platforms; // Number of platforms + int _platform; // UCL_Device ID for current platform + cl_platform_id _cl_platform; // OpenCL ID for current platform + cl_context _context; // Context used for accessing the device + std::vector _cq;// The default command queue for this device + int _device; // UCL_Device ID for current device + cl_device_id _cl_device; // OpenCL ID for current device + std::vector _cl_devices; // OpenCL IDs for all devices + int _num_devices; // 
Number of devices + std::vector _properties; // Properties for each device + + void add_properties(cl_device_id); + void create_context(); + +}; + +// Grabs the properties for all devices +inline UCL_Device::UCL_Device() { + cl_int errorv; + cl_uint nplatforms; + + _cl_device=0; + _device=-1; + _num_devices=0; + _platform=0; + + // --- Get Number of Platforms + errorv=clGetPlatformIDs(1,&_cl_platform,&nplatforms); + + if (errorv!=CL_SUCCESS) { + _num_platforms=0; + return; + } else + _num_platforms=static_cast(nplatforms); + + + // --- Get Number of Devices + cl_uint n; + errorv=clGetDeviceIDs(_cl_platform,CL_DEVICE_TYPE_ALL,0,NULL,&n); + _num_devices=n; + if (errorv!=CL_SUCCESS || _num_devices==0) { + _num_devices=0; + return; + } + cl_device_id device_list[_num_devices]; + CL_SAFE_CALL(clGetDeviceIDs(_cl_platform,CL_DEVICE_TYPE_ALL,n,device_list, + &n)); + + // --- Store properties for each device + for (int i=0; i<_num_devices; i++) { + _cl_devices.push_back(device_list[i]); + add_properties(device_list[i]); + } +} + +inline UCL_Device::~UCL_Device() { + if (_device>-1) { + for (size_t i=0; i<_cq.size(); i++) { + CL_SAFE_CALL(clReleaseCommandQueue(_cq.back())); + _cq.pop_back(); + } + CL_SAFE_CALL(clReleaseContext(_context)); + } +} + +inline void UCL_Device::create_context() { + cl_int errorv; + cl_context_properties props[3]; + props[0]=CL_CONTEXT_PLATFORM; + props[1]=_platform; + props[2]=0; + _context=clCreateContext(0,1,&_cl_device,NULL,NULL,&errorv); + if (errorv!=CL_SUCCESS) { + std::cerr << "Could not create context on device: " << name() << std::endl; + exit(1); + } + push_command_queue(); +} + +inline void UCL_Device::add_properties(cl_device_id device_list) { + OCLProperties op; + char buffer[1024]; + + CL_SAFE_CALL(clGetDeviceInfo(device_list,CL_DEVICE_NAME,1024,buffer,NULL)); + op.name=buffer; + CL_SAFE_CALL(clGetDeviceInfo(device_list,CL_DEVICE_GLOBAL_MEM_SIZE, + sizeof(op.global_mem),&op.global_mem,NULL)); + CL_SAFE_CALL(clGetDeviceInfo(device_list,CL_DEVICE_LOCAL_MEM_SIZE, + sizeof(op.shared_mem),&op.shared_mem,NULL)); + CL_SAFE_CALL(clGetDeviceInfo(device_list,CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE, + sizeof(op.const_mem),&op.const_mem,NULL)); + CL_SAFE_CALL(clGetDeviceInfo(device_list,CL_DEVICE_TYPE, + sizeof(op.device_type),&op.device_type,NULL)); + CL_SAFE_CALL(clGetDeviceInfo(device_list,CL_DEVICE_MAX_COMPUTE_UNITS, + sizeof(op.compute_units),&op.compute_units, + NULL)); + CL_SAFE_CALL(clGetDeviceInfo(device_list,CL_DEVICE_MAX_CLOCK_FREQUENCY, + sizeof(op.clock),&op.clock,NULL)); + CL_SAFE_CALL(clGetDeviceInfo(device_list,CL_DEVICE_MAX_WORK_GROUP_SIZE, + sizeof(op.work_group_size),&op.work_group_size, + NULL)); + CL_SAFE_CALL(clGetDeviceInfo(device_list,CL_DEVICE_MAX_WORK_ITEM_SIZES, + 3*sizeof(op.work_item_size[0]),op.work_item_size, + NULL)); + CL_SAFE_CALL(clGetDeviceInfo(device_list,CL_DEVICE_MEM_BASE_ADDR_ALIGN, + sizeof(cl_uint),&op.alignment,NULL)); + op.alignment/=8; + + // Determine if double precision is supported + cl_uint double_width; + CL_SAFE_CALL(clGetDeviceInfo(device_list, + CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE, + sizeof(double_width),&double_width,NULL)); + if (double_width==0) + op.double_precision=false; + else + op.double_precision=true; + + CL_SAFE_CALL(clGetDeviceInfo(device_list, + CL_DEVICE_PROFILING_TIMER_RESOLUTION, + sizeof(size_t),&op.timer_resolution,NULL)); + + _properties.push_back(op); +} + +inline std::string UCL_Device::platform_name() { + char info[1024]; + + 
CL_SAFE_CALL(clGetPlatformInfo(_cl_platform,CL_PLATFORM_VENDOR,1024,info, + NULL)); + std::string ans=std::string(info)+' '; + + CL_SAFE_CALL(clGetPlatformInfo(_cl_platform,CL_PLATFORM_NAME,1024,info, + NULL)); + ans+=std::string(info)+' '; + + CL_SAFE_CALL(clGetPlatformInfo(_cl_platform,CL_PLATFORM_VERSION,1024,info, + NULL)); + ans+=std::string(info); + + return ans; +} + +// Get a string telling the type of the device +inline std::string UCL_Device::device_type_name(const int i) { + if (_properties[i].device_type==CL_DEVICE_TYPE_CPU) + return "CPU"; + else if (_properties[i].device_type==CL_DEVICE_TYPE_GPU) + return "GPU"; + else if (_properties[i].device_type==CL_DEVICE_TYPE_ACCELERATOR) + return "ACCELERATOR"; + else + return "DEFAULT"; +} + +// Get a string telling the type of the device +inline int UCL_Device::device_type(const int i) { + if (_properties[i].device_type==CL_DEVICE_TYPE_CPU) + return UCL_CPU; + else if (_properties[i].device_type==CL_DEVICE_TYPE_GPU) + return UCL_GPU; + else if (_properties[i].device_type==CL_DEVICE_TYPE_ACCELERATOR) + return UCL_ACCELERATOR; + else + return UCL_DEFAULT; +} + +// Set the CUDA device to the specified device number +inline void UCL_Device::set(int num) { + if (_device==num) + return; + + if (_device>-1) { + for (size_t i=0; i<_cq.size(); i++) { + CL_SAFE_CALL(clReleaseCommandQueue(_cq.back())); + _cq.pop_back(); + } + CL_SAFE_CALL(clReleaseContext(_context)); + } + + cl_device_id device_list[_num_devices]; + cl_uint n; + CL_SAFE_CALL(clGetDeviceIDs(_cl_platform,CL_DEVICE_TYPE_ALL,_num_devices, + device_list,&n)); + + _device=num; + _cl_device=device_list[_device]; + create_context(); +} + +// List all devices along with all properties +inline void UCL_Device::print_all(std::ostream &out) { + if (num_devices() == 0) + out << "There is no device supporting OpenCL\n"; + for (int i=0; i + +namespace ucl_opencl { + +/// Class storing 1 or more kernel functions from a single string or file +class UCL_Program { + public: + inline UCL_Program() : _init_done(false) {} + inline UCL_Program(UCL_Device &device) : _init_done(false) { init(device); } + inline ~UCL_Program() { clear(); } + + /// Initialize the program with a device + inline void init(UCL_Device &device) { + clear(); + _device=device.cl_device(); + _context=device.context(); + _cq=device.cq(); + CL_SAFE_CALL(clRetainContext(_context)); + CL_SAFE_CALL(clRetainCommandQueue(_cq)); + _init_done=true; + } + + /// Clear any data associated with program + /** \note Must call init() after each clear **/ + inline void clear() { + if (_init_done) { + CL_SAFE_CALL(clReleaseProgram(_program)); + CL_SAFE_CALL(clReleaseContext(_context)); + CL_SAFE_CALL(clReleaseCommandQueue(_cq)); + _init_done=false; + } + } + + /// Load a program from a file and compile with flags + inline int load(const char *filename, const char *flags="", + std::string *log=NULL) { + std::ifstream in(filename); + if (!in || in.is_open()==false) { + #ifndef UCL_NO_EXIT + std::cerr << "UCL Error: Could not open kernel file: " + << filename << std::endl; + exit(1); + #endif + return UCL_FILE_NOT_FOUND; + } + + std::string program((std::istreambuf_iterator(in)), + std::istreambuf_iterator()); + in.close(); + return load_string(program.c_str(),flags,log); + } + + /// Load a program from a string and compile with flags + inline int load_string(const char *program, const char *flags="", + std::string *log=NULL) { + cl_int error_flag; + const char *prog=program; + _program=clCreateProgramWithSource(_context,1,&prog,NULL,&error_flag); + 
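      // (Descriptive comment: clCreateProgramWithSource only attaches the
      // source text to the context; the actual compile happens in
      // clBuildProgram below, after which the build status and, on request
      // or failure, the build log are queried so callers see the compiler
      // diagnostics.)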
CL_CHECK_ERR(error_flag); + error_flag = clBuildProgram(_program,1,&_device,flags,NULL,NULL); + cl_build_status build_status; + CL_SAFE_CALL(clGetProgramBuildInfo(_program,_device, + CL_PROGRAM_BUILD_STATUS, + sizeof(cl_build_status),&build_status, + NULL)); + + if (build_status != CL_SUCCESS || log!=NULL) { + size_t ms; + CL_SAFE_CALL(clGetProgramBuildInfo(_program,_device,CL_PROGRAM_BUILD_LOG,0, + NULL, &ms)); + char build_log[ms]; + CL_SAFE_CALL(clGetProgramBuildInfo(_program,_device,CL_PROGRAM_BUILD_LOG,ms, + build_log, NULL)); + + if (log!=NULL) + *log=std::string(build_log); + + if (build_status != CL_SUCCESS) { + #ifndef UCL_NO_EXIT + std::cerr << std::endl + << "----------------------------------------------------------\n" + << " UCL Error: Error compiling OpenCL Program...\n" + << "----------------------------------------------------------\n"; + std::cerr << build_log << std::endl; + #endif + return UCL_COMPILE_ERROR; + } + } + + return UCL_SUCCESS; + } + + friend class UCL_Kernel; + private: + bool _init_done; + cl_program _program; + cl_device_id _device; + cl_context _context; + cl_command_queue _cq; +}; + +/// Class for dealing with OpenCL kernels +class UCL_Kernel { + public: + UCL_Kernel() : _dimensions(1), _function_set(false), _num_args(0) + { _block_size[0]=0; _num_blocks[0]=0; } + + inline UCL_Kernel(UCL_Program &program, const char *function) : + _dimensions(1), _function_set(false), _num_args(0) + { _block_size[0]=0; _num_blocks[0]=0; set_function(program,function); } + + inline ~UCL_Kernel() { clear(); } + + /// Clear any function associated with the kernel + inline void clear() { + if (_function_set) { + clReleaseKernel(_kernel); + clReleaseProgram(_program); + clReleaseCommandQueue(_cq); + _function_set=false; + } + } + + /// Get the kernel function from a program + /** \return UCL_ERROR_FLAG (UCL_SUCCESS, UCL_FILE_NOT_FOUND, UCL_ERROR) **/ + inline int set_function(UCL_Program &program, const char *function); + + /// Set the kernel argument. + /** If not a device pointer, this must be repeated each time the argument + * changes **/ + template + inline void set_arg(const cl_uint index, dtype *arg) { + CL_SAFE_CALL(clSetKernelArg(_kernel,index,sizeof(dtype),arg)); + if (index>_num_args) _num_args=index; + } + + /// Add a kernel argument. 
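  /** \note The argument is forwarded directly to clSetKernelArg, so pass the
    * address of the value: the address of a cl_mem handle for buffer
    * arguments, or the address of the host scalar for by-value arguments. **/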
+ template + inline void add_arg(dtype *arg) { + CL_SAFE_CALL(clSetKernelArg(_kernel,_num_args,sizeof(dtype),arg)); + _num_args++; + } + + /// Set the number of thread blocks and the number of threads in each block + inline void set_size(const size_t num_blocks, const size_t block_size) { + _dimensions=1; + _num_blocks[0]=num_blocks*block_size; + _block_size[0]=block_size; + } + + /// Set the number of thread blocks and the number of threads in each block + inline void set_size(const size_t num_blocks_x, const size_t num_blocks_y, + const size_t block_size_x, const size_t block_size_y) { + _dimensions=2; + _num_blocks[0]=num_blocks_x*block_size_x; + _block_size[0]=block_size_x; + _num_blocks[1]=num_blocks_y*block_size_y; + _block_size[1]=block_size_y; + } + + /// Set the number of thread blocks and the number of threads in each block + inline void set_size(const size_t num_blocks_x, const size_t num_blocks_y, + const size_t block_size_x, + const size_t block_size_y, const size_t block_size_z) { + _dimensions=3; + const size_t num_blocks_z=1; + _num_blocks[0]=num_blocks_x*block_size_x; + _block_size[0]=block_size_x; + _num_blocks[1]=num_blocks_y*block_size_y; + _block_size[1]=block_size_y; + _num_blocks[2]=num_blocks_z*block_size_z; + _block_size[2]=block_size_z; + } + + /// Run the kernel in the default command queue + inline void run() { + run(_cq); + } + + /// Run the kernel in the specified command queue + inline void run(command_queue &cq) { + CL_SAFE_CALL(clEnqueueNDRangeKernel(cq,_kernel,_dimensions,NULL, + _num_blocks,_block_size,0,NULL,NULL)); + } + + /// Clear any arguments associated with the kernel + inline void clear_args() { _num_args=0; } + + #include "ucl_arg_kludge.h" + + private: + cl_kernel _kernel; + cl_program _program; + cl_uint _dimensions; + size_t _block_size[3]; + size_t _num_blocks[3]; + bool _function_set; + + cl_command_queue _cq; // The default command queue for this kernel + unsigned _num_args; +}; + +inline int UCL_Kernel::set_function(UCL_Program &program, const char *function) { + clear(); + _function_set=true; + _cq=program._cq; + CL_SAFE_CALL(clRetainCommandQueue(_cq)); + _program=program._program; + CL_SAFE_CALL(clRetainProgram(_program)); + cl_int error_flag; + _kernel=clCreateKernel(program._program,function,&error_flag); + + if (error_flag!=CL_SUCCESS) { + #ifndef UCL_NO_EXIT + std::cerr << "UCL Error: Could not find function: " << function + << " in program.\n"; + exit(1); + #endif + return UCL_FUNCTION_NOT_FOUND; + } + return UCL_SUCCESS; +} + +} // namespace + +#endif + diff --git a/lib/gpu/geryon/ocl_mat.h b/lib/gpu/geryon/ocl_mat.h new file mode 100644 index 0000000000..180b292d3b --- /dev/null +++ b/lib/gpu/geryon/ocl_mat.h @@ -0,0 +1,56 @@ +/*************************************************************************** + ocl_mat.h + ------------------- + W. Michael Brown + + OpenCL Specific Vector/Matrix Containers, Memory Management, and I/O + + __________________________________________________________________________ + This file is part of the Geryon Unified Coprocessor Library (UCL) + __________________________________________________________________________ + + begin : Wed Jan 13 2010 + copyright : (C) 2010 by W. Michael Brown + email : brownw@ornl.gov + ***************************************************************************/ + +/* ----------------------------------------------------------------------- + Copyright (2010) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. 
Government retains + certain rights in this software. This software is distributed under + the Simplified BSD License. + ----------------------------------------------------------------------- */ + +/*! \file */ + +#ifndef OCL_MAT_H +#define OCL_MAT_H + +#include "ocl_memory.h" + +/// Namespace for OpenCL routines +namespace ucl_opencl { + +#define _UCL_MAT_ALLOW +#define _UCL_DEVICE_PTR_MAT +#define _OCL_MAT +#include "ucl_basemat.h" +#include "ucl_h_vec.h" +#include "ucl_h_mat.h" +#include "ucl_d_vec.h" +#include "ucl_d_mat.h" +#undef _UCL_DEVICE_PTR_MAT +#undef _OCL_MAT +#undef _UCL_MAT_ALLOW + +#define UCL_COPY_ALLOW +#include "ucl_copy.h" +#undef UCL_COPY_ALLOW + +#define UCL_PRINT_ALLOW +#include "ucl_print.h" +#undef UCL_PRINT_ALLOW + +} // namespace ucl_cudart + +#endif diff --git a/lib/gpu/geryon/ocl_texture.h b/lib/gpu/geryon/ocl_texture.h new file mode 100644 index 0000000000..8e72c51730 --- /dev/null +++ b/lib/gpu/geryon/ocl_texture.h @@ -0,0 +1,59 @@ +/*************************************************************************** + ocl_texture.h + ------------------- + W. Michael Brown + + Utilities for dealing with OpenCL textures + + __________________________________________________________________________ + This file is part of the Geryon Unified Coprocessor Library (UCL) + __________________________________________________________________________ + + begin : Fri Jul 2 2010 + copyright : (C) 2010 by W. Michael Brown + email : brownw@ornl.gov + ***************************************************************************/ + +/* ----------------------------------------------------------------------- + Copyright (2010) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the Simplified BSD License. + ----------------------------------------------------------------------- */ + +#ifndef OCL_TEXTURE +#define OCL_TEXTURE + +#include "ocl_kernel.h" +#include "ocl_mat.h" + +namespace ucl_opencl { + +/// Class storing a texture reference +class UCL_Texture { + public: + UCL_Texture() {} + ~UCL_Texture() {} + /// Construct with a specified texture reference + inline UCL_Texture(UCL_Program &prog, const char *texture_name) { } + /// Set the texture reference for this object + inline void get_texture(UCL_Program &prog, const char *texture_name) { } + + /// Bind a float array where each fetch grabs a vector of length numel + template + inline void bind_float(mat_typ &vec, const unsigned numel) { } + + /// Unbind the texture reference from the memory allocation + inline void unbind() { } + + /// Make a texture reference available to kernel + inline void allow(UCL_Kernel &kernel) { } + + private: + friend class UCL_Kernel; +}; + +} // namespace + +#endif + diff --git a/lib/gpu/geryon/ocl_timer.h b/lib/gpu/geryon/ocl_timer.h new file mode 100644 index 0000000000..649076c1e9 --- /dev/null +++ b/lib/gpu/geryon/ocl_timer.h @@ -0,0 +1,111 @@ +/*************************************************************************** + ocl_timer.h + ------------------- + W. Michael Brown + + Class for timing OpenCL routines + + __________________________________________________________________________ + This file is part of the Geryon Unified Coprocessor Library (UCL) + __________________________________________________________________________ + + begin : Jan Fri 22 2010 + copyright : (C) 2010 by W. 
Michael Brown + email : brownw@ornl.gov + ***************************************************************************/ + +/* ----------------------------------------------------------------------- + Copyright (2010) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the Simplified BSD License. + ----------------------------------------------------------------------- */ + +#ifndef OCL_TIMER_H +#define OCL_TIMER_H + +#include "ocl_macros.h" + +namespace ucl_opencl { + +/// Class for timing OpenCL events +class UCL_Timer { + public: + inline UCL_Timer() : _total_time(0.0f), _initialized(false) { } + inline UCL_Timer(UCL_Device &dev) : _total_time(0.0f), _initialized(false) + { init(dev); } + + inline ~UCL_Timer() { clear(); } + + /// Clear any data associated with timer + /** \note init() must be called to reuse timer after a clear() **/ + inline void clear() { + if (_initialized) { + CL_SAFE_CALL(clReleaseCommandQueue(_cq)); + clReleaseEvent(start_event); + clReleaseEvent(stop_event); + _initialized=false; + _total_time=0.0; + } + } + + /// Initialize default command queue for timing + inline void init(UCL_Device &dev) { init(dev,dev.cq()); } + + /// Initialize command queue for timing + inline void init(UCL_Device &dev, command_queue &cq) { + clear(); + t_factor=dev.timer_resolution()/1000000000.0; + _cq=cq; + clRetainCommandQueue(_cq); + _initialized=true; + } + + /// Start timing on default command queue + inline void start() { clEnqueueMarker(_cq,&start_event); } + + /// Stop timing on default command queue + inline void stop() { clEnqueueMarker(_cq,&stop_event); } + + /// Set the time elapsed to zero (not the total_time) + inline void zero() + { clEnqueueMarker(_cq,&start_event); clEnqueueMarker(_cq,&stop_event); } + + /// Add time from previous start and stop to total + /** Forces synchronization **/ + inline double add_to_total() + { double t=time(); _total_time+=t; return t/1000.0; } + + /// Return the time (ms) of last start to stop - Forces synchronization + inline double time() { + cl_ulong tstart,tend; + CL_SAFE_CALL(clWaitForEvents(1,&stop_event)); + CL_SAFE_CALL(clGetEventProfilingInfo(stop_event, + CL_PROFILING_COMMAND_START, + sizeof(cl_ulong), &tend, NULL)); + CL_SAFE_CALL(clGetEventProfilingInfo(start_event, + CL_PROFILING_COMMAND_END, + sizeof(cl_ulong), &tstart, NULL)); + return (tend-tstart)*t_factor; + } + + /// Return the time (s) of last start to stop - Forces synchronization + inline double seconds() { return time()/1000.0; } + + /// Return the total time in ms + inline double total_time() { return _total_time; } + + /// Return the total time in seconds + inline double total_seconds() { return _total_time/1000.0; } + + private: + cl_event start_event, stop_event; + cl_command_queue _cq; + double _total_time; + bool _initialized; + double t_factor; +}; + +} // namespace + +#endif diff --git a/lib/gpu/geryon/ucl_arg_kludge.h b/lib/gpu/geryon/ucl_arg_kludge.h new file mode 100644 index 0000000000..78ec66ddc9 --- /dev/null +++ b/lib/gpu/geryon/ucl_arg_kludge.h @@ -0,0 +1,673 @@ +/*************************************************************************** + ucl_arg_kludge.h + ------------------- + W. 
Michael Brown + + Allow multiple arguments to be added for a kernel call at a single time + + __________________________________________________________________________ + This file is part of the Geryon Unified Coprocessor Library (UCL) + __________________________________________________________________________ + + begin : Sun Feb 7 2010 + copyright : (C) 2010 by W. Michael Brown + email : brownw@ornl.gov + ***************************************************************************/ + +/* ----------------------------------------------------------------------- + Copyright (2010) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the Simplified BSD License. + ----------------------------------------------------------------------- */ + + template + inline void add_args(t1 *a1, t2 *a2) { + add_arg(a1); add_arg(a2); + } + + template + inline void add_args(t1 *a1, t2 *a2, t3 *a3) { + add_arg(a1); add_arg(a2); add_arg(a3); + } + + template + inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4) { + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); + } + + template + inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5) { + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + } + + template + inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, + t6 *a6) { + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); + } + + template + inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, + t6 *a6, t7 *a7) { + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); add_arg(a7); + } + + template + inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, + t6 *a6, t7 *a7, t8 *a8) { + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); add_arg(a7); add_arg(a8); + } + + template + inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, + t6 *a6, t7 *a7, t8 *a8, t9 *a9) { + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); + } + + template + inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, + t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10) { + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); + } + + template + inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, + t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10, + t11 *a11) { + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); + add_arg(a11); + } + + template + inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, + t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10, + t11 *a11, t12 *a12) { + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); + add_arg(a11); add_arg(a12); + } + + template + inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, + t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10, + t11 *a11, t12 *a12, t13 *a13) { + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); + add_arg(a11); add_arg(a12); add_arg(a13); + } + + template + inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, + t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10, + t11 *a11, t12 *a12, t13 *a13, t14 *a14) { + add_arg(a1); 
add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); + add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); + } + + template + inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, + t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10, + t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15) { + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); + add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); + } + + template + inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, + t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10, + t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15, + t16 *a16) { + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); + add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); + add_arg(a16); + } + + template + inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, + t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10, + t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15, + t16 *a16, t17 *a17) { + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); + add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); + add_arg(a16); add_arg(a17); + } + + template + inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, + t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10, + t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15, + t16 *a16, t17 *a17, t18 *a18) { + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); + add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); + add_arg(a16); add_arg(a17); add_arg(a18); + } + + template + inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, + t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10, + t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15, + t16 *a16, t17 *a17, t18 *a18, t19 *a19) { + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); + add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); + add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); + } + + template + inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, + t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10, + t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15, + t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20) { + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); + add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); + add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20); + } + + +// --------------------------------------------------------------------------- + + template + inline void run(t1 *a1) { + clear_args(); + add_arg(a1); + run(); + } + + template + inline void run(t1 *a1, t2 *a2) { + clear_args(); + add_arg(a1); add_arg(a2); + run(); + } + + template + inline void run(t1 *a1, t2 *a2, t3 *a3) { + clear_args(); + add_arg(a1); add_arg(a2); add_arg(a3); + run(); + } + + template + inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4) { + clear_args(); + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); + run(); + } + + template + inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5) { + clear_args(); + add_arg(a1); add_arg(a2); add_arg(a3); 
add_arg(a4); add_arg(a5); + run(); + } + + template + inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, + t6 *a6) { + clear_args(); + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); + run(); + } + + template + inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, + t6 *a6, t7 *a7) { + clear_args(); + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); add_arg(a7); + run(); + } + + template + inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, + t6 *a6, t7 *a7, t8 *a8) { + clear_args(); + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); add_arg(a7); add_arg(a8); + run(); + } + + template + inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, + t6 *a6, t7 *a7, t8 *a8, t9 *a9) { + clear_args(); + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); + run(); + } + + template + inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, + t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10) { + clear_args(); + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); + run(); + } + + template + inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, + t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10, + t11 *a11) { + clear_args(); + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); + add_arg(a11); + run(); + } + + template + inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, + t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10, + t11 *a11, t12 *a12) { + clear_args(); + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); + add_arg(a11); add_arg(a12); + run(); + } + + template + inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, + t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10, + t11 *a11, t12 *a12, t13 *a13) { + clear_args(); + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); + add_arg(a11); add_arg(a12); add_arg(a13); + run(); + } + + template + inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, + t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10, + t11 *a11, t12 *a12, t13 *a13, t14 *a14) { + clear_args(); + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); + add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); + run(); + } + + template + inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, + t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10, + t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15) { + clear_args(); + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); + add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); + run(); + } + + template + inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, + t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10, + t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15, + t16 *a16) { + clear_args(); + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); + add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); + add_arg(a16); + run(); + } + + template + inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, + t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10, 
+ t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15, + t16 *a16, t17 *a17) { + clear_args(); + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); + add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); + add_arg(a16); add_arg(a17); + run(); + } + + template + inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, + t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10, + t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15, + t16 *a16, t17 *a17, t18 *a18) { + clear_args(); + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); + add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); + add_arg(a16); add_arg(a17); add_arg(a18); + run(); + } + + template + inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, + t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10, + t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15, + t16 *a16, t17 *a17, t18 *a18, t19 *a19) { + clear_args(); + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); + add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); + add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); + run(); + } + + template + inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, + t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10, + t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15, + t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20) { + clear_args(); + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); + add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); + add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20); + run(); + } + +// --------------------------------------------------------------------------- + + template + inline void run_cq(command_queue &cq, t1 *a1) { + clear_args(); + add_arg(a1); + run(cq); + } + + template + inline void run_cq(command_queue &cq, t1 *a1, t2 *a2) { + clear_args(); + add_arg(a1); add_arg(a2); + run(cq); + } + + template + inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3) { + clear_args(); + add_arg(a1); add_arg(a2); add_arg(a3); + run(cq); + } + + template + inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4) { + clear_args(); + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); + run(cq); + } + + template + inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5) { + clear_args(); + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + run(cq); + } + + template + inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, + t6 *a6) { + clear_args(); + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); + run(cq); + } + + template + inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, + t6 *a6, t7 *a7) { + clear_args(); + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); add_arg(a7); + run(cq); + } + + template + inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, + t6 *a6, t7 *a7, t8 *a8) { + clear_args(); + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); add_arg(a7); add_arg(a8); + run(cq); + } + + template + inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, + t6 *a6, t7 *a7, t8 *a8, t9 *a9) { + 
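+    // Each run_cq overload follows the same pattern: clear any previously
+    // stored kernel arguments, push the new arguments in order, and then
+    // launch the kernel in the caller-supplied command queue rather than
+    // the kernel's default queue.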
clear_args(); + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); + run(cq); + } + + template + inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, + t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10) { + clear_args(); + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); + run(cq); + } + + template + inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, + t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10, + t11 *a11) { + clear_args(); + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); + add_arg(a11); + run(cq); + } + + template + inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, + t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10, + t11 *a11, t12 *a12) { + clear_args(); + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); + add_arg(a11); add_arg(a12); + run(cq); + } + + template + inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, + t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10, + t11 *a11, t12 *a12, t13 *a13) { + clear_args(); + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); + add_arg(a11); add_arg(a12); add_arg(a13); + run(cq); + } + + template + inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, + t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10, + t11 *a11, t12 *a12, t13 *a13, t14 *a14) { + clear_args(); + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); + add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); + run(cq); + } + + template + inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, + t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10, + t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15) { + clear_args(); + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); + add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); + run(cq); + } + + template + inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, + t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10, + t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15, + t16 *a16) { + clear_args(); + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); + add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); + add_arg(a16); + run(cq); + } + + template + inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, + t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10, + t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15, + t16 *a16, t17 *a17) { + clear_args(); + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); + add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); + add_arg(a16); add_arg(a17); + run(cq); + } + + template + inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, + t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10, + t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15, + t16 *a16, t17 *a17, t18 *a18) { + clear_args(); + add_arg(a1); 
add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); + add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); + add_arg(a16); add_arg(a17); add_arg(a18); + run(cq); + } + + template + inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, + t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10, + t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15, + t16 *a16, t17 *a17, t18 *a18, t19 *a19) { + clear_args(); + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); + add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); + add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); + run(cq); + } + + template + inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, + t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10, + t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15, + t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20) { + clear_args(); + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); + add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); + add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20); + run(cq); + } + diff --git a/lib/gpu/geryon/ucl_basemat.h b/lib/gpu/geryon/ucl_basemat.h new file mode 100644 index 0000000000..844071c9b5 --- /dev/null +++ b/lib/gpu/geryon/ucl_basemat.h @@ -0,0 +1,77 @@ +/*************************************************************************** + ucl_basemat.h + ------------------- + W. Michael Brown + + Vector/Matrix Base Container + + __________________________________________________________________________ + This file is part of the Geryon Unified Coprocessor Library (UCL) + __________________________________________________________________________ + + begin : Thu Jun 25 2009 + copyright : (C) 2009 by W. Michael Brown + email : brownw@ornl.gov + ***************************************************************************/ + +/* ----------------------------------------------------------------------- + Copyright (2009) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the Simplified BSD License. + ----------------------------------------------------------------------- */ + +// Only allow this file to be included by CUDA and OpenCL specific headers +#ifdef _UCL_MAT_ALLOW + +#include "ucl_types.h" + +#define UCL_H_VecT UCL_H_Vec +#define UCL_H_VecD UCL_H_Vec +#define UCL_H_VecS UCL_H_Vec +#define UCL_H_VecI UCL_H_Vec + +#define UCL_D_VecT UCL_D_Vec +#define UCL_D_VecD UCL_D_Vec +#define UCL_D_VecS UCL_D_Vec +#define UCL_D_VecI UCL_D_Vec +#define UCL_D_VecI2 UCL_D_Vec +#define UCL_D_VecU2 UCL_D_Vec + +#define UCL_D_MatT UCL_D_Mat +#define UCL_D_MatD UCL_D_Mat +#define UCL_D_MatS UCL_D_Mat +#define UCL_D_MatI UCL_D_Mat + +#define UCL_ConstMatT UCL_ConstMat +#define UCL_ConstMatD UCL_ConstMat +#define UCL_ConstMatS UCL_ConstMat +#define UCL_ConstMatI UCL_ConstMat +#define UCL_ConstMatD2 UCL_ConstMat + +/// Base class for vector/matrix containers +/** All containers are associated with a default command queue. + * For CUDA, this is the default stream. + * + * The default queue is used for asynchonrous operations on the container + * that do not specify a queue. 
For OpenCL, this queue is also used in + * calls for reserving and copying memory **/ +class UCL_BaseMat { + public: + UCL_BaseMat() : _cq(0) { } + virtual ~UCL_BaseMat() { } + /// Return the default command queue/stream associated with this data + inline command_queue & cq() { return _cq; } + /// Block until command_queue associated with matrix is complete + inline void sync() { ucl_sync(_cq); } + + #ifdef UCL_DEBUG + // Returns the type of host allocation + virtual inline enum UCL_MEMOPT kind() const { return UCL_NOT_PINNED; } + #endif + protected: + command_queue _cq; +}; + +#endif + diff --git a/lib/gpu/geryon/ucl_copy.h b/lib/gpu/geryon/ucl_copy.h new file mode 100644 index 0000000000..c201cc0b12 --- /dev/null +++ b/lib/gpu/geryon/ucl_copy.h @@ -0,0 +1,826 @@ +/*************************************************************************** + ucl_copy.h + ------------------- + W. Michael Brown + + Routines for copying matrix/vector data onto and off coprocessor device + + __________________________________________________________________________ + This file is part of the Geryon Unified Coprocessor Library (UCL) + __________________________________________________________________________ + + begin : Mon Jan 4 2010 + copyright : (C) 2010 by W. Michael Brown + email : brownw@ornl.gov + ***************************************************************************/ + +/* ----------------------------------------------------------------------- + Copyright (2010) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the Simplified BSD License. + ----------------------------------------------------------------------- */ + +/*************************************************************************** + The ucl_copy and ucl_cast_copy routines provide a general prototype for + copying data between host and device memory (including texture memory) + for the matrix and vector types in nvc_memory. + + For host/host and host/device transfers, typecasting is performed + automatically as necessary. + + The routines are written so that all branches can be removed by the + compiler during template instantiation. + + The routines currently assume row-major ordering for all types. + + For asynchronous copy in the default command queue, async is boolean true; + For asynchronous copy in a specified command queue, async is command queue + Otherwise, set async to boolean false; + + When performing frequent data copies that require casting, it is more + efficient to allocate a casting buffer once and then pass that buffer + to the copy routine. This can be accomplished with the ucl_cast_copy + routines. 
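+   For example (a hypothetical sketch; "hvec", "dvec", "cbuf" and "n" are not
+   part of this header and are assumed to be already allocated host/device
+   containers and an element count of matching size):
+ 
+     ucl_copy(dvec,hvec,false);            // blocking host-to-device copy,
+                                           //  casting automatically if needed
+     ucl_copy(hvec,dvec,true);             // asynchronous copy in the default
+                                           //  command queue of hvec
+     ucl_cast_copy(dvec,hvec,n,cbuf,true); // asynchronous copy of n elements,
+                                           //  reusing the casting buffer cbuf
+ 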
+ + Examples + (x's represent alignment padding - to maintain alignment) + (o's represent a larger matrix in memory) + (vectors represented as single row) + ---------------------------------------------------------------- + dst src command + ---------------------------------------------------------------- + 0 1 2 3 4 <-- 0 1 2 3 4 ucl_copy(dst,src,async) + + 0 1 2 3 <-- 0 1 2 3 4 ucl_copy(dst,src,4,async) + + 0 1 2 <-- 0 1 2 3 4 5 ucl_copy(dst,src,async) + 3 4 5 + + 0 1 2 3 4 5 <-- 0 1 2 ucl_copy(dst,src,async) + 3 4 5 + + 0 1 2 <-- 0 1 2 ucl_copy(dst,src,async) + 3 4 5 3 4 5 + + 0 1 2 <-- 0 1 2 ucl_copy(dst,src,6,async) + 3 4 5 3 4 5 + 5 6 7 + + 0 1 2 <-- 0 1 2 3 ucl_copy(dst,src,2,3,async) + 4 5 6 4 5 6 7 + 8 9 10 11 + + 0 1 2 x x <-- 0 1 2 ucl_copy(dst,src,async) + 3 4 5 x x 3 4 5 + + 0 1 2 <-- 0 1 2 x x ucl_copy(dst,src,async) + 3 4 5 3 4 5 x x + + 0 1 2 o o <-- 0 1 2 ucl_copy(dst,src,2,3,async) + 3 4 5 o o 3 4 5 + o o o o o + + 0 1 2 o o <-- 0 1 2 3 4 5 ucl_copy(dst,src,2,3,async) + 3 4 5 o o + o o o o o + + 0 1 o o o <-- 0 1 2 3 4 5 ucl_copy(dst,src,2,2,async) + 2 3 o o o + o o o o o + + 0 1 2 o o <-- 0 1 2 3 4 ucl_copy(dst,src,2,3,async) + 5 6 7 o o 5 6 7 8 9 + o o o o o 10 11 12 13 14 + + 0 1 2 5 6 7 <-- 0 1 2 3 4 ucl_copy(dst,src,2,3,async) + 5 6 7 8 9 + 10 11 12 13 14 + + ***************************************************************************/ + +// Only allow this file to be included by nvc_memory.h and ocl_memory.h +#ifdef UCL_COPY_ALLOW + +// -------------------------------------------------------------------------- +// - HOST-HOST COPY ROUTINES +// -------------------------------------------------------------------------- + +// Have to use specialization because some types don't have operator[] +template struct _host_host_copy; + +// Both on host +template <> struct _host_host_copy<1,1> { + template + static inline void hhc(mat1 &dst, const mat2 &src, const size_t numel) { + #ifdef UCL_DEBUG + assert(mat1::PADDED==0 && mat2::PADDED==0); + assert(mat1::ROW_MAJOR==1 && mat2::ROW_MAJOR==1); + #endif + if ((int)mat1::DATA_TYPE==(int)mat2::DATA_TYPE && mat1::DATA_TYPE!=0) + memcpy(dst.begin(),src.begin(),numel*sizeof(typename mat1::data_type)); + else + for (size_t i=0; i(src[i]); + } + template + static inline void hhc(mat1 &dst, const mat2 &src, const size_t rows, + const size_t cols) { + #ifdef UCL_DEBUG + assert(mat1::ROW_MAJOR==1 && mat2::ROW_MAJOR==1); + #endif + size_t dst_row_size, src_row_size; + if (mat1::VECTOR) + dst_row_size=cols; + else + dst_row_size=dst.row_size(); + if (mat2::VECTOR) + src_row_size=cols; + else + src_row_size=src.row_size(); + if ((int)mat1::DATA_TYPE==(int)mat2::DATA_TYPE && mat1::DATA_TYPE!=0) + for (size_t i=0; i(src[src_i]); + src_i++; + } + } + } +}; + +// Should never be here +template struct _host_host_copy { + template + static inline void hhc(mat1 &dst, const mat2 &src, const size_t numel) { + assert(0==1); + } + template + static inline void hhc(mat1 &dst, const mat2 &src, const size_t rows, + const size_t cols) { + assert(0==1); + } +}; + +// -------------------------------------------------------------------------- +// - TEMPLATE HELPER FUNCTIONS FOR SPECIALIZED CASTING +// -------------------------------------------------------------------------- + +// Helper functions for ucl_cast_copy +template struct _ucl_cast_copy; + +// Destination is on host +template struct _ucl_cast_copy<1,host_type2> { + template + static inline void cc(mat1 &dst, const mat2 &src, const size_t numel, + mat3 &cast_buffer) { + 
ucl_mv_cpy(cast_buffer,src,numel*sizeof(typename mat2::data_type)); + for (size_t i=0; i(cast_buffer[i]); + } + template + static inline void cc(mat1 &dst, const mat2 &src, const size_t numel, + mat3 &cast_buffer,command_queue &cq) { + ucl_mv_cpy(cast_buffer,src,numel*sizeof(typename mat2::data_type),cq); + cast_buffer.sync(); + for (size_t i=0; i(cast_buffer[i]); + } + template + static inline void cc(mat1 &dst, const mat2 &src, const size_t rows, + const size_t cols, mat3 &cast_buffer) { + // Asynchronous currently pointless here + #ifdef UCL_DEBUG + assert(mat1::ROW_MAJOR==1 && mat2::ROW_MAJOR==1); + assert(dst.numel()>=rows*cols && cast_buffer.numel()>=rows*cols); + if (mat1::VECTOR==0) assert(dst.rows()>=rows && dst.cols()>=cols); + if (mat2::VECTOR==0) assert(src.rows()>=rows && src.cols()>=cols); + #endif + if (mat1::VECTOR) { + ucl_mv_cpy(cast_buffer,cols*sizeof(typename mat2::data_type),src, + src.row_bytes(),cols*sizeof(typename mat2::data_type),rows); + for (size_t i=0; i(cast_buffer[i]); + } else { + if (mat2::VECTOR) + ucl_mv_cpy(cast_buffer,cols*sizeof(typename mat2::data_type),src, + cols*sizeof(typename mat2::data_type), + cols*sizeof(typename mat2::data_type),rows); + else + ucl_mv_cpy(cast_buffer,cols*sizeof(typename mat2::data_type),src, + src.row_bytes(),cols*sizeof(typename mat2::data_type), + rows); + int dst_i=0; + int buff_i=0; + for (size_t i=0; i(cast_buffer[buff_i]); + buff_i++; + dst_i++; + } + dst_i+=dst.cols()-cols; + } + } + } + template + static inline void cc(mat1 &dst, const mat2 &src, const size_t rows, + const size_t cols, mat3 &cast_buffer, + command_queue &cq) { + // Asynchronous currently pointless here + #ifdef UCL_DEBUG + assert(mat1::ROW_MAJOR==1 && mat2::ROW_MAJOR==1); + assert(dst.numel()>=rows*cols && cast_buffer.numel()>=rows*cols); + if (mat1::VECTOR==0) assert(dst.rows()>=rows && dst.cols()>=cols); + if (mat2::VECTOR==0) assert(src.rows()>=rows && src.cols()>=cols); + #endif + if (mat1::VECTOR) { + ucl_mv_cpy(cast_buffer,cols*sizeof(typename mat2::data_type),src, + src.row_bytes(),cols*sizeof(typename mat2::data_type),rows,cq); + cast_buffer.sync(); + for (size_t i=0; i(cast_buffer[i]); + } else { + if (mat2::VECTOR) + ucl_mv_cpy(cast_buffer,cols*sizeof(typename mat2::data_type),src, + cols*sizeof(typename mat2::data_type), + cols*sizeof(typename mat2::data_type),rows,cq); + else + ucl_mv_cpy(cast_buffer,cols*sizeof(typename mat2::data_type),src, + src.row_bytes(),cols*sizeof(typename mat2::data_type), + rows,cq); + cast_buffer.sync(); + int dst_i=0; + int buff_i=0; + for (size_t i=0; i(cast_buffer[buff_i]); + buff_i++; + dst_i++; + } + dst_i+=dst.cols()-cols; + } + } + } +}; + +// Source is on host +template struct _ucl_cast_copy { + template + static inline void cc(mat1 &dst, const mat2 &src, const size_t numel, + mat3 &cast_buffer) { + for (size_t i=0; i(src[i]); + ucl_mv_cpy(dst,cast_buffer,numel*sizeof(typename mat1::data_type)); + } + template + static inline void cc(mat1 &dst, const mat2 &src, const size_t numel, + mat3 &cast_buffer, command_queue &cq) { + for (size_t i=0; i(src[i]); + ucl_mv_cpy(dst,cast_buffer,numel*sizeof(typename mat1::data_type),cq); + } + template + static inline void cc(mat1 &dst, const mat2 &src, const size_t rows, + const size_t cols, mat3 &cast_buffer) { + #ifdef UCL_DEBUG + assert(mat1::ROW_MAJOR==1 && mat2::ROW_MAJOR==1); + assert(src.numel()>=rows*cols && cast_buffer.numel()>=rows*cols); + if (mat1::VECTOR==0) assert(dst.rows()>=rows && dst.cols()>=cols); + if (mat2::VECTOR==0) assert(src.rows()>=rows && 
src.cols()>=cols); + #endif + if (mat2::VECTOR) { + for (size_t i=0; i(src[i]); + ucl_mv_cpy(dst,dst.row_bytes(),cast_buffer, + cols*sizeof(typename mat1::data_type), + cols*sizeof(typename mat1::data_type),rows); + } else if (mat1::VECTOR) { + int src_i=0; + int buf_i=0; + for (size_t i=0; i(src[src_i]); + buf_i++; + src_i++; + } + src_i+=src.cols()-cols; + } + ucl_mv_cpy(dst,cast_buffer,cols*sizeof(typename mat1::data_type)*rows); + } else { + int src_i=0; + int buf_i=0; + for (size_t i=0; i(src[src_i]); + buf_i++; + src_i++; + } + src_i+=src.cols()-cols; + } + ucl_mv_cpy(dst,dst.row_bytes(),cast_buffer, + cols*sizeof(typename mat1::data_type), + cols*sizeof(typename mat1::data_type),rows); + } + } + template + static inline void cc(mat1 &dst, const mat2 &src, const size_t rows, + const size_t cols, mat3 &cast_buffer, + command_queue &cq) { + #ifdef UCL_DEBUG + assert(mat1::ROW_MAJOR==1 && mat2::ROW_MAJOR==1); + assert(src.numel()>=rows*cols && cast_buffer.numel()>=rows*cols); + if (mat1::VECTOR==0) assert(dst.rows()>=rows && dst.cols()>=cols); + if (mat2::VECTOR==0) assert(src.rows()>=rows && src.cols()>=cols); + #endif + if (mat2::VECTOR) { + for (size_t i=0; i(src[i]); + ucl_mv_cpy(dst,dst.row_bytes(), + cast_buffer,cols*sizeof(typename mat1::data_type), + cols*sizeof(typename mat1::data_type),rows,cq); + } else if (mat1::VECTOR) { + int src_i=0; + int buf_i=0; + for (size_t i=0; i(src[src_i]); + buf_i++; + src_i++; + } + src_i+=src.cols()-cols; + } + ucl_mv_cpy(dst,cast_buffer,cols*sizeof(typename mat1::data_type)*rows,cq); + } else { + int src_i=0; + int buf_i=0; + for (size_t i=0; i(src[src_i]); + buf_i++; + src_i++; + } + src_i+=src.cols()-cols; + } + ucl_mv_cpy(dst,dst.row_bytes(),cast_buffer, + cols*sizeof(typename mat1::data_type), + cols*sizeof(typename mat1::data_type),rows,cq); + } + } +}; + +// Neither on host or both on host +template <> struct _ucl_cast_copy<1,1> { + template + static inline void cc(mat1 &dst, const mat2 &src, const size_t numel, + mat3 &cast_buffer, command_queue &cq) { + assert(0==1); + } + template + static inline void cc(mat1 &dst, const mat2 &src, const size_t numel, + mat3 &cast_buffer) { + assert(0==1); + } + template + static inline void cc(mat1 &dst, const mat2 &src, const size_t rows, + const size_t cols, mat3 &cast_buffer) { + assert(0==1); + } + template + static inline void cc(mat1 &dst, const mat2 &src, const size_t rows, + const size_t cols, mat3 &cast_buffer, + command_queue &cq) { + assert(0==1); + } +}; + +// Neither on host or both on host +template <> struct _ucl_cast_copy<0,0> { + template + static inline void cc(mat1 &dst, const mat2 &src, const size_t numel, + mat3 &cast_buffer, command_queue &cq) { + assert(0==1); + } + template + static inline void cc(mat1 &dst, const mat2 &src, const size_t numel, + mat3 &cast_buffer) { + assert(0==1); + } + template + static inline void cc(mat1 &dst, const mat2 &src, const size_t rows, + const size_t cols, mat3 &cast_buffer) { + assert(0==1); + } + template + static inline void cc(mat1 &dst, const mat2 &src, const size_t rows, + const size_t cols, mat3 &cast_buffer, + command_queue &cq) { + assert(0==1); + } +}; + +// -------------------------------------------------------------------------- +// - 1D COPY - SPECIFIED NUMBER OF BYTES +// -------------------------------------------------------------------------- + +/// Asynchronous copy of matrix/vector with cast (Device/Host transfer) +/** \param numel Number of elements (not bytes) to copy + * \param cast_buffer Buffer on host with enough 
storage for casting + * - If the data types for the two matrices are same, no cast performed + * - Padding for 2D matrices is not considered in this routine. + * - Currently does not handle textures **/ +template +inline void ucl_cast_copy(mat1 &dst, const mat2 &src, const size_t numel, + mat3 &cast_buffer, command_queue &cq) { + #ifdef UCL_DEBUG + assert(dst.numel()>=numel && src.numel()>=numel); + assert(cast_buffer.numel()>=numel); + assert(mat1::ROW_MAJOR==1 && mat2::ROW_MAJOR==1); + #endif + if ((int)mat1::DATA_TYPE==(int)mat2::DATA_TYPE) + ucl_copy(dst,src,numel,cq); + else + _ucl_cast_copy::cc(dst,src,numel, + cast_buffer,cq); +} + +/// Asynchronous copy of matrix/vector with cast (Device/Host transfer) +/** \param numel Number of elements (not bytes) to copy + * \param async Perform non-blocking copy on default stream + * \param cast_buffer Buffer on host with enough storage for casting + * - If the data types for the two matrices are same, no cast performed + * - Padding for 2D matrices is not considered in this routine. + * - Currently does not handle textures **/ +template +inline void ucl_cast_copy(mat1 &dst, const mat2 &src, const size_t numel, + mat3 &cast_buffer, const bool async) { + #ifdef UCL_DEBUG + assert(dst.numel()>=numel && src.numel()>=numel); + assert(cast_buffer.numel()>=numel); + assert(mat1::ROW_MAJOR==1 && mat2::ROW_MAJOR==1); + #endif + if ((int)mat1::DATA_TYPE==(int)mat2::DATA_TYPE) + ucl_copy(dst,src,numel,async); + else if (async) + _ucl_cast_copy::cc(dst,src,numel, + cast_buffer,dst.cq()); + else + _ucl_cast_copy::cc(dst,src,numel, + cast_buffer); +} + +/// Asynchronous copy of matrix/vector (memory already allocated) +/** \param numel Number of elements (not bytes) to copy + * - If the data types of the two matrices are not the same, + * casting will be performed automatically as long as the copy is + * not device to device. For host/device transfers, a temporary + * buffer is created for copy. When multiple casts occur, it is + * more efficient to create a permanent casting buffer that can + * be passed to an alternative copy routine. + * - Padding for 2D matrices is not considered in this routine. + * - Currently does not handle textures **/ +template +inline void ucl_copy(mat1 &dst, const mat2 &src, const size_t numel, + command_queue &cq) { + #ifdef UCL_DEBUG + assert(dst.row_size()*dst.rows()>=numel && src.row_size()*src.rows()>=numel); + assert(mat1::ROW_MAJOR==1 && mat2::ROW_MAJOR==1); + assert(mat1::ROW_MAJOR==1 && mat2::ROW_MAJOR==1); + #endif + if (mat1::MEM_TYPE==1 && mat2::MEM_TYPE==1) + _host_host_copy::hhc(dst,src,numel); + else if ((int)mat1::DATA_TYPE!=(int)mat2::DATA_TYPE && + (mat1::MEM_TYPE==1 || mat2::MEM_TYPE==1)) { + if (mat1::MEM_TYPE==1) { + UCL_H_Vec cast_buffer; + cast_buffer.alloc(numel,dst,UCL_RW_OPTIMIZED); + _ucl_cast_copy::cc(dst,src,numel, + cast_buffer,cq); + } else { + UCL_H_Vec cast_buffer; + cast_buffer.alloc(numel,dst,UCL_WRITE_OPTIMIZED); + _ucl_cast_copy::cc(dst,src,numel, + cast_buffer,cq); + } + } else + ucl_mv_cpy(dst,src,numel*sizeof(typename mat2::data_type),cq); +} + +/// Copy matrix/vector (memory already allocated) +/** \param numel Number of elements (not bytes) to copy + * \param async Perform non-blocking copy (ignored for host to host copy) + * - If the data types of the two matrices are not the same, + * casting will be performed automatically as long as the copy is + * not device to device. For host/device transfers, a temporary + * buffer is created for copy. 
When multiple casts occur, it is + * more efficient to create a permanent casting buffer that can + * be passed to an alternative copy routine. + * - Padding for 2D matrices is not considered in this routine. + * - The default stream is used for asynchronous copy + * - Currently does not handle textures **/ +template +inline void ucl_copy(mat1 &dst, const mat2 &src, const size_t numel, + const bool async) { + #ifdef UCL_DEBUG + assert(dst.row_size()*dst.rows()>=numel && src.row_size()*src.rows()>=numel); + assert(mat1::ROW_MAJOR==1 && mat2::ROW_MAJOR==1); + #endif + if (mat1::MEM_TYPE==1 && mat2::MEM_TYPE==1) + _host_host_copy::hhc(dst,src,numel); + else if (async) + ucl_copy(dst,src,numel,dst.cq()); + else if ((int)mat1::DATA_TYPE!=(int)mat2::DATA_TYPE && + (mat1::MEM_TYPE==1 || mat2::MEM_TYPE==1)) { + if (mat1::MEM_TYPE==1) { + UCL_H_Vec cast_buffer; + cast_buffer.alloc(numel,dst,UCL_RW_OPTIMIZED); + _ucl_cast_copy::cc(dst,src,numel, + cast_buffer); + } else { + UCL_H_Vec cast_buffer; + cast_buffer.alloc(numel,dst,UCL_WRITE_OPTIMIZED); + _ucl_cast_copy::cc(dst,src,numel, + cast_buffer); + } + } else + ucl_mv_cpy(dst,src,numel*sizeof(typename mat2::data_type)); +} + +// -------------------------------------------------------------------------- +// - 2D COPY - SPECIFIED NUMBER OF ROWS/COLS +// -------------------------------------------------------------------------- + +/// Asynchronous copy subset matrix rows/cols with cast (Device/Host transfer) +/** \param async Perform non-blocking copy on default stream + * \param cast_buffer Buffer on host with enough storage for casting + * - If src is a vector, routine assumes row-major rows by cols copy + * - If src is a matrix, routine will copy upper left tile of matrix + * - If dst is a vector, routine assumes row-major rows by cols copy + * - If dst is a matrix, routine will copy into left tile of matrix + * - If the data types for the two matrices are same, no cast performed + * - Padding for 2D matrices is not considered in this routine. + * - Copy from vector to matrix and vice versa allowed + * - Currently does not handle textures **/ +template +inline void ucl_cast_copy(mat1 &dst, const mat2 &src, const size_t rows, + const size_t cols, mat3 &cast_buffer, + const bool async) { + if ((int)mat1::DATA_TYPE==(int)mat2::DATA_TYPE) + ucl_copy(dst,src,rows,cols,async); + else if (async) + ucl_copy(dst,src,rows,cols,dst.cq()); + else + _ucl_cast_copy::cc(dst,src,rows,cols, + cast_buffer); +} + +/// Asynchronous copy subset matrix rows,cols with cast (Device/Host transfer) +/** \param cast_buffer Buffer on host with enough storage for casting + * - If src is a vector, routine assumes row-major rows by cols copy + * - If src is a matrix, routine will copy upper left tile of matrix + * - If dst is a vector, routine assumes row-major rows by cols copy + * - If dst is a matrix, routine will copy into upper left tile of matrix + * - If the data types for the two matrices are same, no cast performed + * - Padding for 2D matrices is not considered in this routine. 
+ * - Copy from vector to matrix and vice versa allowed + * - Currently does not handle textures **/ +template +inline void ucl_cast_copy(mat1 &dst, const mat2 &src, const size_t rows, + const size_t cols, mat3 &cast_buffer, + command_queue &cq) { + if ((int)mat1::DATA_TYPE==(int)mat2::DATA_TYPE) + ucl_copy(dst,src,rows,cols,cq); + else + _ucl_cast_copy::cc(dst,src,rows,cols, + cast_buffer,cq); +} + +/// Asynchronous copy of subset matrix rows,cols (memory already allocated) +/** - If src is a vector, routine assumes row-major rows by cols copy + * - If src is a matrix, routine will copy upper left tile of matrix + * - If dst is a vector, routine assumes row-major rows by cols copy + * - If dst is a matrix, routine will copy into left tile of matrix + * - If the data types of the two matrices are not the same, + * casting will be performed automatically as long as the copy is + * not device to device. For host/device transfers, a temporary + * buffer is created for copy. When multiple casts occur, it is + * more efficient to create a permanent casting buffer that can + * be passed to an alternative copy routine. + * - The copy should handle padding for 2D alignment correctly + * - Copy from vector to matrix and vice versa allowed + * - Currently does not handle textures **/ +template +inline void ucl_copy(mat1 &dst, const mat2 &src, const size_t rows, + const size_t cols, command_queue &cq) { + if (mat1::MEM_TYPE==1 && mat2::MEM_TYPE==1) + _host_host_copy::hhc(dst,src,rows,cols); + else if ((int)mat1::DATA_TYPE!=(int)mat2::DATA_TYPE && + (mat1::MEM_TYPE==1 || mat2::MEM_TYPE==1)) { + if (mat1::MEM_TYPE==1) { + UCL_H_Vec cast_buffer; + cast_buffer.alloc(rows*cols,dst,UCL_RW_OPTIMIZED); + _ucl_cast_copy::cc(dst,src,rows,cols, + cast_buffer,cq); + } else { + UCL_H_Vec cast_buffer; + cast_buffer.alloc(rows*cols,dst,UCL_WRITE_OPTIMIZED); + _ucl_cast_copy::cc(dst,src,rows,cols, + cast_buffer,cq); + } + // If we are here, at least one of the matrices must have VECTOR=0 + } else if (mat1::VECTOR) { + #ifdef UCL_DEBUG + assert(dst.numel()>=rows*cols && src.rows()>=rows && src.cols()>=cols); + assert(mat1::ROW_MAJOR==1 && mat2::ROW_MAJOR==1); + #endif + ucl_mv_cpy(dst,cols*sizeof(typename mat1::data_type),src,src.row_bytes(), + cols*sizeof(typename mat1::data_type),rows, + cq); + } else if (mat2::VECTOR) { + #ifdef UCL_DEBUG + assert(src.numel()>=rows*cols && dst.rows()>=rows && dst.cols()>=cols); + assert(mat1::ROW_MAJOR==1 && mat2::ROW_MAJOR==1); + #endif + ucl_mv_cpy(dst,dst.row_bytes(),src,cols*sizeof(typename mat1::data_type), + cols*sizeof(typename mat1::data_type),rows,cq); + } else { + #ifdef UCL_DEBUG + assert(src.rows()>=rows && src.cols()>=cols); + assert(dst.rows()>=rows && dst.cols()>=cols); + assert(mat1::ROW_MAJOR==1 && mat2::ROW_MAJOR==1); + #endif + ucl_mv_cpy(dst,dst.row_bytes(),src,src.row_bytes(), + cols*sizeof(typename mat1::data_type),rows,cq); + } +} + +/// Copy subset of matrix rows,cols (memory already allocated) +/** \param async Perform non-blocking copy (ignored for host to host copy) + * - If src is a vector, routine assumes row-major rows by cols copy + * - If src is a matrix, routine will copy upper left tile of matrix + * - If dst is a vector, routine assumes row-major rows by cols copy + * - If dst is a matrix, routine will copy into left tile of matrix + * - If the data types of the two matrices are not the same, + * casting will be performed automatically as long as the copy is + * not device to device. 
For host/device transfers, a temporary + * buffer is created for copy. When multiple casts occur, it is + * more efficient to create a permanent casting buffer that can + * be passed to an alternative copy routine. + * - The copy should handle padding for 2D alignment correctly + * - Copy from vector to matrix and vice versa allowed + * - The default stream is used for asynchronous copy + * - Currently does not handle textures **/ +template +inline void ucl_copy(mat1 &dst, const mat2 &src, const size_t rows, + const size_t cols, const bool async) { + if (async) + ucl_copy(dst,src,rows,cols,dst.cq()); + else if (mat1::MEM_TYPE==1 && mat2::MEM_TYPE==1) + _host_host_copy::hhc(dst,src,rows,cols); + else if ((int)mat1::DATA_TYPE!=(int)mat2::DATA_TYPE && + (mat1::MEM_TYPE==1 || mat2::MEM_TYPE==1)) { + if (mat1::MEM_TYPE==1) { + UCL_H_Vec cast_buffer; + cast_buffer.alloc(rows*cols,dst,UCL_RW_OPTIMIZED); + _ucl_cast_copy::cc(dst,src,rows,cols, + cast_buffer); + } else { + UCL_H_Vec cast_buffer; + cast_buffer.alloc(rows*cols,dst,UCL_WRITE_OPTIMIZED); + _ucl_cast_copy::cc(dst,src,rows,cols, + cast_buffer); + } + // If we are here, at least one of the matrices must have VECTOR=0 + } else if (mat1::VECTOR) { + #ifdef UCL_DEBUG + assert(dst.numel()>=rows*cols && src.rows()>=rows && src.cols()>=cols); + assert(mat1::ROW_MAJOR==1 && mat2::ROW_MAJOR==1); + assert(mat2::VECTOR==0); + #endif + ucl_mv_cpy(dst,cols*sizeof(typename mat1::data_type),src,src.row_bytes(), + cols*sizeof(typename mat1::data_type),rows); + } else if (mat2::VECTOR) { + #ifdef UCL_DEBUG + assert(src.numel()>=rows*cols && dst.rows()>=rows && dst.cols()>=cols); + assert(mat1::ROW_MAJOR==1 && mat2::ROW_MAJOR==1); + assert(mat1::VECTOR==0); + #endif + ucl_mv_cpy(dst,dst.row_bytes(),src,cols*sizeof(typename mat1::data_type), + cols*sizeof(typename mat1::data_type),rows); + } else { + #ifdef UCL_DEBUG + assert(src.rows()>=rows && src.cols()>=cols); + assert(dst.rows()>=rows && dst.cols()>=cols); + assert(mat1::ROW_MAJOR==1 && mat2::ROW_MAJOR==1); + #endif + ucl_mv_cpy(dst,dst.row_bytes(),src,src.row_bytes(), + cols*sizeof(typename mat1::data_type),rows); + } +} + +// -------------------------------------------------------------------------- +// - 1D/2D COPY +// -------------------------------------------------------------------------- + +/// Asynchronous copy of matrix/vector with cast (Device/Host transfer) +/** \param async Perform non-blocking copy on default stream + * \param cast_buffer Buffer on host with enough storage for casting + * - If the data types for the two matrices are same, no cast performed + * - The number of bytes copied is determined by entire src data + * - Padding for 2D matrices is not considered in this routine. 
+ * - Copy from vector to matrix and vice versa allowed + * - Currently does not handle textures **/ +template +inline void ucl_cast_copy(mat1 &dst, const mat2 &src, + mat3 &cast_buffer, const bool async) { + if ((int)mat1::DATA_TYPE==(int)mat2::DATA_TYPE) + ucl_copy(dst,src,async); + else if (mat2::PADDED==1 || (mat1::PADDED==1 && mat2::VECTOR==0) ) + ucl_cast_copy(dst,src,src.rows(),src.cols(),cast_buffer,async); + else if (mat1::PADDED==1) + ucl_cast_copy(dst,src,dst.rows(),dst.cols(),cast_buffer,async); + else + ucl_cast_copy(dst,src,src.numel(),cast_buffer,async); +} + +/// Asynchronous copy of matrix/vector with cast (Device/Host transfer) +/** \param cast_buffer Buffer on host with enough storage for casting + * - If the data types for the two matrices are same, no cast performed + * - The number of bytes copied is determined by entire src data + * - Padding for 2D matrices is not considered in this routine. + * - Copy from vector to matrix and vice versa allowed + * - Currently does not handle textures **/ +template +inline void ucl_cast_copy(mat1 &dst, const mat2 &src, + mat3 &cast_buffer, command_queue &cq) { + if ((int)mat1::DATA_TYPE==(int)mat2::DATA_TYPE) + ucl_copy(dst,src,cq); + else if (mat2::PADDED==1 || (mat1::PADDED==1 && mat2::VECTOR==0) ) + ucl_copy(dst,src,src.rows(),src.cols(),cast_buffer,cq); + else if (mat1::PADDED==1) + ucl_copy(dst,src,dst.rows(),dst.cols(),cast_buffer,cq); + else + ucl_copy(dst,src,src.numel(),cast_buffer,cq); +} + +/// Asynchronous copy of matrix/vector (memory already allocated) +/** - The number of bytes copied is determined by entire src data + * - If the data types of the two matrices are not the same, + * casting will be performed automatically as long as the copy is + * not device to device. For host/device transfers, a temporary + * buffer is created for copy. When multiple casts occur, it is + * more efficient to create a permanent casting buffer that can + * be passed to an alternative copy routine. + * - The copy should handle padding for 2D alignment correctly + * - Copy from vector to matrix and vice versa allowed + * - Currently does not handle textures **/ +template +inline void ucl_copy(mat1 &dst, const mat2 &src, command_queue &cq) { + if (dst.row_bytes()==src.row_bytes() && + src.kind()!=UCL_VIEW && dst.kind()!=UCL_VIEW && + (int)mat1::DATA_TYPE==(int)mat2::DATA_TYPE) + ucl_copy(dst,src,src.row_size()*src.rows(),cq); + else if (mat2::PADDED==1 || (mat1::PADDED==1 && mat2::VECTOR==0) ) + ucl_copy(dst,src,src.rows(),src.cols(),cq); + else if (mat1::PADDED==1) + ucl_copy(dst,src,dst.rows(),dst.cols(),cq); + else + ucl_copy(dst,src,src.numel(),cq); +} + +/// Copy matrix/vector (memory already allocated) +/** \param async Perform non-blocking copy (ignored for host to host copy) + * - The number of bytes copied is determined by entire src data + * - If the data types of the two matrices are not the same, + * casting will be performed automatically as long as the copy is + * not device to device. For host/device transfers, a temporary + * buffer is created for copy. When multiple casts occur, it is + * more efficient to create a permanent casting buffer that can + * be passed to an alternative copy routine. 
+ * - The copy should handle padding for 2D alignment correctly + * - Copy from vector to matrix and vice versa allowed + * - The default stream is used for asynchronous copy + * - Currently does not handle textures **/ +template +inline void ucl_copy(mat1 &dst, const mat2 &src, const bool async) { + if (async) + ucl_copy(dst,src,dst.cq()); + else if (dst.row_bytes()==src.row_bytes() && + src.kind()!=UCL_VIEW && dst.kind()!=UCL_VIEW && + (int)mat1::DATA_TYPE==(int)mat2::DATA_TYPE) + ucl_copy(dst,src,src.row_size()*src.rows(),async); + else if (mat2::PADDED==1 || (mat1::PADDED==1 && mat2::VECTOR==0) ) + ucl_copy(dst,src,src.rows(),src.cols(),async); + else if (mat1::PADDED==1) + ucl_copy(dst,src,dst.rows(),dst.cols(),async); + else + ucl_copy(dst,src,src.numel(),async); +} + +#endif + diff --git a/lib/gpu/geryon/ucl_d_mat.h b/lib/gpu/geryon/ucl_d_mat.h new file mode 100644 index 0000000000..115d7a6dd6 --- /dev/null +++ b/lib/gpu/geryon/ucl_d_mat.h @@ -0,0 +1,430 @@ +/*************************************************************************** + ucl_d_mat.h + ------------------- + W. Michael Brown + + Matrix Container on Device + + __________________________________________________________________________ + This file is part of the Geryon Unified Coprocessor Library (UCL) + __________________________________________________________________________ + + begin : Thu Jun 25 2009 + copyright : (C) 2009 by W. Michael Brown + email : brownw@ornl.gov + ***************************************************************************/ + +/* ----------------------------------------------------------------------- + Copyright (2009) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the Simplified BSD License. 
+ ----------------------------------------------------------------------- */ + +// Only allow this file to be included by CUDA and OpenCL specific headers +#ifdef _UCL_MAT_ALLOW + +/// 2D Matrix on device (can have extra column storage to get correct alignment) +template +class UCL_D_Mat : public UCL_BaseMat { + public: + // Traits for copying data + // MEM_TYPE is 0 for device, 1 for host, and 2 for image + enum traits { + DATA_TYPE = _UCL_DATA_ID::id, + MEM_TYPE = 0, + PADDED = 1, + ROW_MAJOR = 1, + VECTOR = 0 + }; + typedef numtyp data_type; + + UCL_D_Mat() : _rows(0), _kind(UCL_VIEW) {} + ~UCL_D_Mat() { if (_kind!=UCL_VIEW) _device_free(*this); } + + /// Construct with specified rows and cols + /** \sa alloc() **/ + UCL_D_Mat(const size_t rows, const size_t cols, UCL_Device &device, + const enum UCL_MEMOPT kind=UCL_READ_WRITE) : + _rows(0), _kind(UCL_VIEW) { alloc(rows,cols,device,kind); } + + /// Row major matrix on device + /** The kind parameter controls memory optimizations as follows: + * - UCL_READ_WRITE - Specify that you will read and write in kernels + * - UCL_WRITE_ONLY - Specify that you will only write in kernels + * - UCL_READ_ONLY - Specify that you will only read in kernels + * \param cq Default command queue for operations copied from another mat + * \note - Coalesced access using adjacent cols on same row + * UCL_D_Mat(row,col) given by array[row*row_size()+col] + * \return UCL_SUCCESS if the memory allocation is successful **/ + template + inline int alloc(const size_t rows, const size_t cols, mat_type &cq, + const enum UCL_MEMOPT kind=UCL_READ_WRITE) { + clear(); + _kind=kind; + _rows=rows; + _cols=cols; + int err=_device_alloc(*this,cq,rows,cols,_pitch,kind); + _row_size=_pitch/sizeof(numtyp); + #ifndef _UCL_DEVICE_PTR_MAT + _end=_array+_row_size*cols; + #endif + #ifndef UCL_NO_EXIT + if (err!=UCL_SUCCESS) { + std::cerr << "UCL Error: Could not allocate " + << rows*cols*sizeof(numtyp) << " bytes on device.\n"; + exit(1); + } + #endif + #ifdef _OCL_MAT + _offset=0; + #endif + return err; + } + + /// Row major matrix on device + /** The kind parameter controls memory optimizations as follows: + * - UCL_READ_WRITE - Specify that you will read and write in kernels + * - UCL_WRITE_ONLY - Specify that you will only write in kernels + * - UCL_READ_ONLY - Specify that you will only read in kernels + * \param device Used to get the default command queue for operations + * \note - Coalesced access using adjacent cols on same row + * UCL_D_Mat(row,col) given by array[row*row_size()+col] + * \return UCL_SUCCESS if the memory allocation is successful **/ + inline int alloc(const size_t rows, const size_t cols, UCL_Device &device, + const enum UCL_MEMOPT kind=UCL_READ_WRITE) { + clear(); + _kind=kind; + _rows=rows; + _cols=cols; + int err=_device_alloc(*this,device,rows,cols,_pitch,kind); + _row_size=_pitch/sizeof(numtyp); + #ifndef _UCL_DEVICE_PTR_MAT + _end=_array+_row_size*cols; + #endif + #ifndef UCL_NO_EXIT + if (err!=UCL_SUCCESS) { + std::cerr << "UCL Error: Could not allocate " + << rows*cols*sizeof(numtyp) << " bytes on device.\n"; + exit(1); + } + #endif + #ifdef _OCL_MAT + _offset=0; + #endif + return err; + } + + /// Return the type of memory allocation + /** Returns UCL_READ_WRITE, UCL_WRITE_ONLY, UCL_READ_ONLY, or UCL_VIEW **/ + inline enum UCL_MEMOPT kind() const { return _kind; } + + /// Do not allocate memory, instead use an existing allocation from Geryon + /** This function must be passed a Geryon vector or matrix container. 
+ * No memory is freed when the object is destructed. + * - The view does not prevent the memory from being freed by the + * allocating container + * \param stride Number of _elements_ between the start of each row **/ + template + inline void view(ucl_type &input, const size_t rows, const size_t cols, + const size_t stride) { + clear(); + _kind=UCL_VIEW; + _rows=rows; + _cols=cols; + _pitch=stride*sizeof(numtyp); + _row_size=stride; + this->_cq=input.cq(); + #ifdef _OCL_MAT + _offset=0; + _array=input.cbegin(); + #else + _device_view(&_array,input.begin()); + #endif + + #ifndef _UCL_DEVICE_PTR_MAT + _end=_array+_cols; + #endif + } + + /// Do not allocate memory, instead use an existing allocation from Geryon + /** This function must be passed a Geryon vector or matrix container. + * No memory is freed when the object is destructed. + * - The view does not prevent the memory from being freed by the + * allocating container **/ + template + inline void view(ucl_type &input, const size_t rows, const size_t cols) + { view(input,rows,cols,input.row_size()); } + + /// Do not allocate memory, instead use an existing allocation from Geryon + /** This function must be passed a Geryon vector or matrix container. + * No memory is freed when the object is destructed. + * - The view does not prevent the memory from being freed by the + * allocating container + * - If a matrix is used a input, all elements (including padding) + * will be used for view **/ + template + inline void view(ucl_type &input, const size_t cols) + { view(input,1,cols); } + + /// Do not allocate memory, instead use an existing allocation from Geryon + /** This function must be passed a Geryon vector or matrix container. + * No memory is freed when the object is destructed. + * - The view does not prevent the memory from being freed by the + * allocating container + * - If a matrix is used a input, all elements (including padding) + * will be used for view **/ + template + inline void view(ucl_type &input) + { view(input,input.rows(),input.cols()); } + + /// Do not allocate memory, instead use an existing allocation + /** - No memory is freed when the object is destructed. + * - The view does not prevent the memory from being freed by the + * allocating container + * \param stride Number of _elements_ between the start of each row **/ + template + inline void view(ptr_type input, const size_t rows, const size_t cols, + const size_t stride, UCL_Device &dev) { + clear(); + _kind=UCL_VIEW; + _cols=cols; + _rows=rows; + _pitch=stride*sizeof(numtyp); + _row_size=stride; + this->_cq=dev.cq(); + _array=input; + #ifndef _UCL_DEVICE_PTR_MAT + _end=_array+_cols; + #endif + #ifdef _OCL_MAT + _offset=0; + #endif + } + + /// Do not allocate memory, instead use an existing allocation + /** - No memory is freed when the object is destructed. + * - The view does not prevent the memory from being freed by the + * allocating container **/ + template + inline void view(ptr_type input, const size_t rows, const size_t cols, + UCL_Device &dev) { view(input,rows,cols,cols,dev); } + + /// Do not allocate memory, instead use an existing allocation + /** - No memory is freed when the object is destructed. 
+ * - The view does not prevent the memory from being freed by the + * allocating container **/ + template + inline void view(ptr_type input, const size_t cols, UCL_Device &dev) + { view(input,1,cols,dev); } + + /// Do not allocate memory, instead use an existing allocation from Geryon + /** This function must be passed a Geryon vector or matrix container. + * No memory is freed when the object is destructed. + * - The view does not prevent the memory from being freed by the + * allocating container + * \param stride Number of _elements_ between the start of each row **/ + template + inline void view_offset(const size_t offset,ucl_type &input,const size_t rows, + const size_t cols, const size_t stride) { + clear(); + _kind=UCL_VIEW; + _cols=cols; + _rows=rows; + _pitch=stride*sizeof(numtyp); + _row_size=stride; + this->_cq=input.cq(); + #ifdef _OCL_MAT + _array=input.begin(); + _offset=offset; + #else + _device_view(&_array,input.begin(),offset,sizeof(numtyp)); + #endif + + #ifndef _UCL_DEVICE_PTR_MAT + _end=_array+_cols; + #endif + } + + /// Do not allocate memory, instead use an existing allocation from Geryon + /** This function must be passed a Geryon vector or matrix container. + * No memory is freed when the object is destructed. + * - The view does not prevent the memory from being freed by the + * allocating container **/ + template + inline void view_offset(const size_t offset,ucl_type &input,const size_t rows, + const size_t cols) + { view_offset(offset,input,rows,cols,input.row_size()); } + + /// Do not allocate memory, instead use an existing allocation from Geryon + /** This function must be passed a Geryon vector or matrix container. + * No memory is freed when the object is destructed. + * - The view does not prevent the memory from being freed by the + * allocating container + * - If a matrix is used a input, all elements (including padding) + * will be used for view **/ + template + inline void view_offset(const size_t offset,ucl_type &input,const size_t cols) + { view_offset(offset,input,1,cols); } + + /// Do not allocate memory, instead use an existing allocation from Geryon + /** This function must be passed a Geryon vector or matrix container. + * No memory is freed when the object is destructed. + * - The view does not prevent the memory from being freed by the + * allocating container + * - If a matrix is used a input, all elements (including padding) + * will be used for view **/ + template + inline void view_offset(const size_t offset, ucl_type &input) { + if (input.rows()==1) + view_offset(offset,input,1,input.cols()-offset); + else + view_offset(offset,input,input.rows()-offset/input.row_size(), + input.cols()); + } + + /// Do not allocate memory, instead use an existing allocation + /** - No memory is freed when the object is destructed. 
+ * - The view does not prevent the memory from being freed by the + * allocating container + * \param stride Number of _elements_ between the start of each row **/ + template + inline void view_offset(const size_t offset,ptr_type input,const size_t rows, + const size_t cols,const size_t stride, + UCL_Device &dev) { + clear(); + _kind=UCL_VIEW; + _cols=cols; + _rows=rows; + _pitch=stride*sizeof(numtyp); + _row_size=stride; + this->_cq=dev.cq(); + + #ifdef _OCL_MAT + _array=input; + _offset=offset; + #else + #ifdef _UCL_DEVICE_PTR_MAT + _array=input+offset*sizeof(numtyp); + #else + _array=input+offset; + #endif + #endif + + #ifndef _UCL_DEVICE_PTR_MAT + _end=_array+_cols; + #endif + } + + /// Do not allocate memory, instead use an existing allocation + /** - No memory is freed when the object is destructed. + * - The view does not prevent the memory from being freed by the + * allocating container **/ + template + inline void view_offset(const size_t offset,ptr_type input,const size_t rows, + const size_t cols, UCL_Device &dev) + { view_offset(offset,input,rows,cols,cols,dev); } + + /// Do not allocate memory, instead use an existing allocation + /** - No memory is freed when the object is destructed. + * - The view does not prevent the memory from being freed by the + * allocating container **/ + template + inline void view_offset(const size_t offset, ptr_type input, + const size_t cols, UCL_Device &dev) + { view_offset(offset,input,1,cols,dev); } + + /// Free memory and set size to 0 + inline void clear() + { _rows=0; if (_kind!=UCL_VIEW) { _kind=UCL_VIEW; _device_free(*this); } } + + /// Set each element to zero + inline void zero() { _device_zero(*this,row_bytes()*_rows); } + + /// Set first n elements to zero + inline void zero(const int n) { _device_zero(*this,n*sizeof(numtyp)); } + + #ifdef _UCL_DEVICE_PTR_MAT + /// For OpenCL, returns a (void *) device pointer to memory allocation + inline device_ptr & begin() { return _array; } + /// For OpenCL, returns a (void *) device pointer to memory allocation + inline const device_ptr & begin() const { return _array; } + #else + /// For CUDA-RT, get device pointer to first element + inline numtyp * begin() { return _array; } + /// For CUDA-RT, get device pointer to first element + inline const numtyp * begin() const { return _array; } + /// For CUDA-RT, get device pointer to one past last element + inline numtyp * end() { return _end; } + /// For CUDA-RT, get device pointer to one past last element + inline const numtyp * end() const { return _end; } + #endif + + #ifdef _UCL_DEVICE_PTR_MAT + /// Returns an API specific device pointer + /** - For OpenCL, returns a &cl_mem object + * - For CUDA Driver, returns a &CUdeviceptr + * - For CUDA-RT, returns void** **/ + inline device_ptr & cbegin() { return _array; } + /// Returns an API specific device pointer + /** - For OpenCL, returns a &cl_mem object + * - For CUDA Driver, returns a &CUdeviceptr + * - For CUDA-RT, returns void** **/ + inline const device_ptr & cbegin() const { return _array; } + #else + /// Returns an API specific device pointer + /** - For OpenCL, returns a &cl_mem object + * - For CUDA Driver, returns a &CUdeviceptr + * - For CUDA-RT, returns numtyp** **/ + inline numtyp ** cbegin() { return &_array; } + /// Returns an API specific device pointer + /** - For OpenCL, returns a &cl_mem object + * - For CUDA Driver, returns a &CUdeviceptr + * - For CUDA-RT, returns numtyp** **/ + inline const numtyp ** cbegin() const { return &_array; } + #endif + + /// Get the number of 
elements + inline size_t numel() const { return _cols*_rows; } + /// Get the number of rows + inline size_t rows() const { return _rows; } + /// Get the number of columns + inline size_t cols() const { return _cols; } + ///Get the size of a row (including any padding) in elements + inline size_t row_size() const { return _row_size; } + /// Get the size of a row (including any padding) in bytes + inline size_t row_bytes() const { return _pitch; } + /// Get the size in bytes of 1 element + inline int element_size() const { return sizeof(numtyp); } + + #ifdef _OCL_MAT + /// Return the offset (in elements) from begin() pointer where data starts + /** \note Always 0 for host matrices and CUDA APIs **/ + inline size_t offset() const { return _offset; } + #else + /// Return the offset (in elements) from begin() pointer where data starts + /** \note Always 0 for host matrices and CUDA APIs **/ + inline size_t offset() const { return 0; } + #endif + + /// Return the offset (in bytes) from begin() pointer where data starts + /** \note Always 0 for host matrices and CUDA APIs **/ + inline size_t byteoff() const { return offset()*sizeof(numtyp); } + + private: + size_t _pitch, _row_size, _rows, _cols; + enum UCL_MEMOPT _kind; + + #ifdef _UCL_DEVICE_PTR_MAT + device_ptr _array; + #else + numtyp *_array,*_end; + #endif + + #ifdef _OCL_MAT + size_t _offset; + #endif +}; + +#endif + diff --git a/lib/gpu/geryon/ucl_d_vec.h b/lib/gpu/geryon/ucl_d_vec.h new file mode 100644 index 0000000000..1873642181 --- /dev/null +++ b/lib/gpu/geryon/ucl_d_vec.h @@ -0,0 +1,442 @@ +/*************************************************************************** + ucl_d_vec.h + ------------------- + W. Michael Brown + + Vector Container on Device + + __________________________________________________________________________ + This file is part of the Geryon Unified Coprocessor Library (UCL) + __________________________________________________________________________ + + begin : Thu Jun 25 2009 + copyright : (C) 2009 by W. Michael Brown + email : brownw@ornl.gov + ***************************************************************************/ + +/* ----------------------------------------------------------------------- + Copyright (2009) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the Simplified BSD License. 
+ ----------------------------------------------------------------------- */ + +// Only allow this file to be included by CUDA and OpenCL specific headers +#ifdef _UCL_MAT_ALLOW + +/// Row vector on device +template +class UCL_D_Vec : public UCL_BaseMat { + public: + // Traits for copying data + // MEM_TYPE is 0 for device, 1 for host, and 2 for image + enum traits { + DATA_TYPE = _UCL_DATA_ID::id, + MEM_TYPE = 0, + PADDED = 0, + ROW_MAJOR = 1, + VECTOR = 1 + }; + typedef numtyp data_type; + + UCL_D_Vec() : _cols(0), _kind(UCL_VIEW) {} + ~UCL_D_Vec() { if (_kind!=UCL_VIEW) _device_free(*this); } + + /// Construct with n columns + /** \sa alloc() **/ + UCL_D_Vec(const size_t n, UCL_Device &device, + const enum UCL_MEMOPT kind=UCL_READ_WRITE) : + _cols(0), _kind(UCL_VIEW) { alloc(n,device,kind); } + + /// Set up host vector with 'cols' columns and reserve memory + /** The kind parameter controls memory optimizations as follows: + * - UCL_READ_WRITE - Specify that you will read and write in kernels + * - UCL_WRITE_ONLY - Specify that you will only write in kernels + * - UCL_READ_ONLY - Specify that you will only read in kernels + * \param cq Default command queue for operations copied from another mat + * \return UCL_SUCCESS if the memory allocation is successful **/ + template + inline int alloc(const size_t cols, mat_type &cq, + const enum UCL_MEMOPT kind=UCL_READ_WRITE) { + + clear(); + _kind=kind; + _cols=cols; + _row_bytes=cols*sizeof(numtyp); + int err=_device_alloc(*this,cq,_row_bytes,kind); + #ifndef _UCL_DEVICE_PTR_MAT + _end=_array+cols; + #endif + #ifndef UCL_NO_EXIT + if (err!=UCL_SUCCESS) { + std::cerr << "UCL Error: Could not allocate " << _row_bytes + << " bytes on device.\n"; + exit(1); + } + #endif + #ifdef _OCL_MAT + _offset=0; + #endif + return err; + } + + /// Set up host vector with 'cols' columns and reserve memory + /** The kind parameter controls memory optimizations as follows: + * - UCL_READ_WRITE - Specify that you will read and write in kernels + * - UCL_WRITE_ONLY - Specify that you will only write in kernels + * - UCL_READ_ONLY - Specify that you will only read in kernels + * \param device Used to get the default command queue for operations + * \return UCL_SUCCESS if the memory allocation is successful **/ + inline int alloc(const size_t cols, UCL_Device &device, + const enum UCL_MEMOPT kind=UCL_READ_WRITE) { + clear(); + _kind=kind; + _cols=cols; + _row_bytes=cols*sizeof(numtyp); + int err=_device_alloc(*this,device,_row_bytes,kind); + #ifndef _UCL_DEVICE_PTR_MAT + _end=_array+cols; + #endif + #ifndef UCL_NO_EXIT + if (err!=UCL_SUCCESS) { + std::cerr << "UCL Error: Could not allocate " << _row_bytes + << " bytes on device.\n"; + exit(1); + } + #endif + #ifdef _OCL_MAT + _offset=0; + #endif + return err; + } + + /// Return the type of memory allocation + /** Returns UCL_READ_WRITE, UCL_WRITE_ONLY, UCL_READ_ONLY, or UCL_VIEW **/ + inline enum UCL_MEMOPT kind() const { return _kind; } + + /// Do not allocate memory, instead use an existing allocation from Geryon + /** This function must be passed a Geryon vector or matrix container. + * No memory is freed when the object is destructed. 
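A minimal allocation sketch for the device vector described above (not part of the patch; the device handle and the <float> specialization are assumptions, and the container is presumed to be pulled in through an API-specific header such as nvc_memory.h or ocl_memory.h, per the include-guard comments in these files):

    UCL_Device gpu;                              // default-constructed, as in ucl_get_devices.cpp below
    UCL_D_Vec<float> f;
    int err = f.alloc(1000, gpu, UCL_READ_WRITE);
    if (err != UCL_SUCCESS) {
      // only reachable when UCL_NO_EXIT is defined; otherwise alloc() prints an error and exits
    }
    f.zero();                                    // zero all 1000 elements
    f.zero(10);                                  // zero only the first 10 elements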
+ * - The view does not prevent the memory from being freed by the + * allocating container **/ + template + inline void view(ucl_type &input, const size_t rows, const size_t cols) { + #ifdef UCL_DEBUG + assert(rows==1); + #endif + clear(); + _kind=UCL_VIEW; + _cols=cols; + _row_bytes=_cols*sizeof(numtyp); + this->_cq=input.cq(); + #ifdef _OCL_MAT + _offset=0; + _array=input.cbegin(); + #else + _device_view(&_array,input.begin()); + #endif + + #ifndef _UCL_DEVICE_PTR_MAT + _end=_array+_cols; + #endif + } + + /// Do not allocate memory, instead use an existing allocation from Geryon + /** This function must be passed a Geryon vector or matrix container. + * No memory is freed when the object is destructed. + * - The view does not prevent the memory from being freed by the + * allocating container + * \param stride Number of _elements_ between the start of each row **/ + template + inline void view(ucl_type &input, const size_t rows, const size_t cols, + const size_t stride) { view(input,rows,cols); } + + /// Do not allocate memory, instead use an existing allocation from Geryon + /** This function must be passed a Geryon vector or matrix container. + * No memory is freed when the object is destructed. + * - The view does not prevent the memory from being freed by the + * allocating container + * - If a matrix is used a input, all elements (including padding) + * will be used for view **/ + template + inline void view(ucl_type &input, const size_t cols) + { view(input,1,cols); } + + /// Do not allocate memory, instead use an existing allocation from Geryon + /** This function must be passed a Geryon vector or matrix container. + * No memory is freed when the object is destructed. + * - The view does not prevent the memory from being freed by the + * allocating container + * - If a matrix is used a input, all elements (including padding) + * will be used for view **/ + template + inline void view(ucl_type &input) + { view(input,input.rows()*input.row_size()); } + + /// Do not allocate memory, instead use an existing allocation + /** - No memory is freed when the object is destructed. + * - The view does not prevent the memory from being freed by the + * allocating container **/ + template + inline void view(ptr_type input, const size_t rows, const size_t cols, + UCL_Device &dev) { + #ifdef UCL_DEBUG + assert(rows==1); + #endif + clear(); + _kind=UCL_VIEW; + _cols=cols; + _row_bytes=_cols*sizeof(numtyp); + this->_cq=dev.cq(); + _array=input; + #ifndef _UCL_DEVICE_PTR_MAT + _end=_array+_cols; + #endif + #ifdef _OCL_MAT + _offset=0; + #endif + } + + /// Do not allocate memory, instead use an existing allocation + /** - No memory is freed when the object is destructed. + * - The view does not prevent the memory from being freed by the + * allocating container + * \param stride Number of _elements_ between the start of each row **/ + template + inline void view(ptr_type input, const size_t rows, const size_t cols, + const size_t stride, UCL_Device &dev) + { view(input,rows,cols,stride); } + + /// Do not allocate memory, instead use an existing allocation + /** - No memory is freed when the object is destructed. + * - The view does not prevent the memory from being freed by the + * allocating container **/ + template + inline void view(ptr_type input, const size_t cols, UCL_Device &dev) + { view(input,1,cols,dev); } + + /// Do not allocate memory, instead use an existing allocation from Geryon + /** This function must be passed a Geryon vector or matrix container. 
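As an example of the container view() overloads above, a device matrix can be aliased as a single flat row vector; the single-argument form covers any row padding as well. A sketch only ('m' and the <float> argument are assumptions, with 'm' presumed allocated earlier):

    UCL_D_Mat<float> m;                          // assumed allocated elsewhere
    UCL_D_Vec<float> flat;
    flat.view(m);                                // one row over m.rows()*m.row_size() elements
    flat.zero();                                 // clears the whole allocation, padding included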
+ * No memory is freed when the object is destructed. + * - The view does not prevent the memory from being freed by the + * allocating container **/ + template + inline void view_offset(const size_t offset,ucl_type &input,const size_t rows, + const size_t cols) { + #ifdef UCL_DEBUG + assert(rows==1); + #endif + clear(); + _kind=UCL_VIEW; + _cols=cols; + _row_bytes=_cols*sizeof(numtyp); + this->_cq=input.cq(); + #ifdef _OCL_MAT + _array=input.begin(); + _offset=offset; + #else + _device_view(&_array,input.begin(),offset,sizeof(numtyp)); + #endif + + #ifndef _UCL_DEVICE_PTR_MAT + _end=_array+_cols; + #endif + } + + /// Do not allocate memory, instead use an existing allocation from Geryon + /** This function must be passed a Geryon vector or matrix container. + * No memory is freed when the object is destructed. + * - The view does not prevent the memory from being freed by the + * allocating container + * \param stride Number of _elements_ between the start of each row **/ + template + inline void view_offset(const size_t offset,ucl_type &input,const size_t rows, + const size_t cols, const size_t stride) + { view_offset(offset,input,rows,cols); } + + /// Do not allocate memory, instead use an existing allocation from Geryon + /** This function must be passed a Geryon vector or matrix container. + * No memory is freed when the object is destructed. + * - The view does not prevent the memory from being freed by the + * allocating container + * - If a matrix is used a input, all elements (including padding) + * will be used for view **/ + template + inline void view_offset(const size_t offset,ucl_type &input,const size_t cols) + { view_offset(offset,input,1,cols); } + + /// Do not allocate memory, instead use an existing allocation from Geryon + /** This function must be passed a Geryon vector or matrix container. + * No memory is freed when the object is destructed. + * - The view does not prevent the memory from being freed by the + * allocating container + * - If a matrix is used a input, all elements (including padding) + * will be used for view **/ + template + inline void view_offset(const size_t offset, ucl_type &input) + { view_offset(offset,input,input.rows()*input.row_size()-offset); } + + /// Do not allocate memory, instead use an existing allocation + /** - No memory is freed when the object is destructed. + * - The view does not prevent the memory from being freed by the + * allocating container **/ + template + inline void view_offset(const size_t offset,ptr_type input,const size_t rows, + const size_t cols, UCL_Device &dev) { + #ifdef UCL_DEBUG + assert(rows==1); + #endif + clear(); + _kind=UCL_VIEW; + _cols=cols; + _row_bytes=_cols*sizeof(numtyp); + this->_cq=dev.cq(); + + #ifdef _OCL_MAT + _array=input; + _offset=offset; + #else + #ifdef _UCL_DEVICE_PTR_MAT + _array=input+offset*sizeof(numtyp); + #else + _array=input+offset; + #endif + #endif + + #ifndef _UCL_DEVICE_PTR_MAT + _end=_array+_cols; + #endif + } + + /// Do not allocate memory, instead use an existing allocation + /** - No memory is freed when the object is destructed. 
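A hedged sketch of view_offset() on a device vector (not from the patch; the names are illustrative):

    UCL_Device gpu;
    UCL_D_Vec<float> whole, tail;
    whole.alloc(1000, gpu);
    tail.view_offset(500, whole, 500);           // alias elements 500..999, no new allocation
    tail.zero();                                 // clears only the aliased half
    whole.clear();                               // frees the storage; 'tail' is now dangling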
+ * - The view does not prevent the memory from being freed by the + * allocating container + * \param stride Number of _elements_ between the start of each row **/ + template + inline void view_offset(const size_t offset,ptr_type input,const size_t rows, + const size_t cols,const size_t stride,UCL_Device &dev) + { view_offset(offset,input,rows,cols,stride); } + + /// Do not allocate memory, instead use an existing allocation + /** - No memory is freed when the object is destructed. + * - The view does not prevent the memory from being freed by the + * allocating container **/ + template + inline void view_offset(const size_t offset, ptr_type input, + const size_t cols, UCL_Device &dev) + { view_offset(offset,input,1,cols,dev); } + + /// Free memory and set size to 0 + inline void clear() + { if (_kind!=UCL_VIEW) { _cols=0; _kind=UCL_VIEW; _device_free(*this); } } + + /// Set each element to zero + inline void zero() { _device_zero(*this,row_bytes()); } + + /// Set first n elements to zero + inline void zero(const int n) { _device_zero(*this,n*sizeof(numtyp)); } + + #ifdef _UCL_DEVICE_PTR_MAT + /// For OpenCL, returns a (void *) device pointer to memory allocation + inline device_ptr & begin() { return _array; } + /// For OpenCL, returns a (void *) device pointer to memory allocation + inline const device_ptr & begin() const { return _array; } + #else + /// For CUDA-RT, get device pointer to first element + inline numtyp * begin() { return _array; } + /// For CUDA-RT, get device pointer to first element + inline const numtyp * begin() const { return _array; } + /// For CUDA-RT, get device pointer to one past last element + inline numtyp * end() { return _end; } + /// For CUDA-RT, get device pointer to one past last element + inline const numtyp * end() const { return _end; } + #endif + + #ifdef _UCL_DEVICE_PTR_MAT + /// Returns an API specific device pointer + /** - For OpenCL, returns a &cl_mem object + * - For CUDA Driver, returns a &CUdeviceptr + * - For CUDA-RT, returns void** **/ + inline device_ptr & cbegin() { return _array; } + /// Returns an API specific device pointer + /** - For OpenCL, returns a &cl_mem object + * - For CUDA Driver, returns a &CUdeviceptr + * - For CUDA-RT, returns void** **/ + inline const device_ptr & cbegin() const { return _array; } + #else + /// Returns an API specific device pointer + /** - For OpenCL, returns a &cl_mem object + * - For CUDA Driver, returns a &CUdeviceptr + * - For CUDA-RT, returns numtyp** **/ + inline numtyp ** cbegin() { return &_array; } + /// Returns an API specific device pointer + /** - For OpenCL, returns a &cl_mem object + * - For CUDA Driver, returns a &CUdeviceptr + * - For CUDA-RT, returns numtyp** **/ + inline const numtyp ** cbegin() const { return &_array; } + /// For CUDA-RT, allocate row vector and bind texture + inline void safe_alloc(const size_t cols, UCL_Device &dev, + textureReference *t) + { alloc(cols,dev); assign_texture(t); bind(); } + /// For CUDA-RT, assign a texture to matrix + inline void assign_texture(textureReference *t) { _tex_ptr=t; } + /// For CUDA-RT, bind to texture + inline void bind() { + cuda_gb_get_channel(_channel); + (*_tex_ptr).addressMode[0] = cudaAddressModeClamp; + (*_tex_ptr).addressMode[1] = cudaAddressModeClamp; + (*_tex_ptr).filterMode = cudaFilterModePoint; + (*_tex_ptr).normalized = false; + CUDA_SAFE_CALL(cudaBindTexture(NULL,_tex_ptr,_array,&_channel)); + } + /// For CUDA-RT, unbind texture + inline void unbind() { CUDA_SAFE_CALL(cudaUnbindTexture(_tex_ptr)); } + #endif + + /// Get 
the number of elements + inline size_t numel() const { return _cols; } + /// Get the number of rows + inline size_t rows() const { return 1; } + /// Get the number of columns + inline size_t cols() const { return _cols; } + ///Get the size of a row (including any padding) in elements + inline size_t row_size() const { return _cols; } + /// Get the size of a row (including any padding) in bytes + inline size_t row_bytes() const { return _row_bytes; } + /// Get the size in bytes of 1 element + inline int element_size() const { return sizeof(numtyp); } + + #ifdef _OCL_MAT + /// Return the offset (in elements) from begin() pointer where data starts + /** \note Always 0 for host matrices and CUDA APIs **/ + inline size_t offset() const { return _offset; } + #else + /// Return the offset (in elements) from begin() pointer where data starts + /** \note Always 0 for host matrices and CUDA APIs **/ + inline size_t offset() const { return 0; } + #endif + + /// Return the offset (in bytes) from begin() pointer where data starts + /** \note Always 0 for host matrices and CUDA APIs **/ + inline size_t byteoff() const { return offset()*sizeof(numtyp); } + + private: + size_t _row_bytes, _row_size, _rows, _cols; + enum UCL_MEMOPT _kind; + + #ifdef _UCL_DEVICE_PTR_MAT + device_ptr _array; + #else + numtyp *_array,*_end; + cudaChannelFormatDesc _channel; + textureReference *_tex_ptr; + #endif + + #ifdef _OCL_MAT + size_t _offset; + #endif +}; + +#endif + diff --git a/lib/gpu/geryon/ucl_get_devices.cpp b/lib/gpu/geryon/ucl_get_devices.cpp new file mode 100644 index 0000000000..1fa758fb46 --- /dev/null +++ b/lib/gpu/geryon/ucl_get_devices.cpp @@ -0,0 +1,48 @@ +/*************************************************************************** + nvc_get_devices.h + ------------------- + W. Michael Brown + + List properties of cuda devices + + __________________________________________________________________________ + This file is part of the Geryon Unified Coprocessor Library (UCL) + __________________________________________________________________________ + + begin : Wed Jan 28 2009 + copyright : (C) 2009 by W. Michael Brown + email : brownw@ornl.gov + ***************************************************************************/ + +/* ----------------------------------------------------------------------- + Copyright (2009) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the Simplified BSD License. + ----------------------------------------------------------------------- */ + +#ifdef UCL_OPENCL +#include "ocl_device.h" +using namespace ucl_opencl; +#endif + +#ifdef UCL_CUDADR +#include "nvd_device.h" +using namespace ucl_cudadr; +#endif + +#ifdef UCL_CUDART +#include "nvc_device.h" +using namespace ucl_cudart; +#endif + +int main(int argc, char** argv) { + UCL_Device cop; + std::cout << "Found " << cop.num_platforms() << " platform(s).\n"; + if (cop.num_platforms()>0) { + std::cout << "Using platform: " << cop.platform_name() << std::endl; + cop.print_all(std::cout); + } + return 0; +} + diff --git a/lib/gpu/geryon/ucl_h_mat.h b/lib/gpu/geryon/ucl_h_mat.h new file mode 100644 index 0000000000..bfd4a6ce99 --- /dev/null +++ b/lib/gpu/geryon/ucl_h_mat.h @@ -0,0 +1,378 @@ +/*************************************************************************** + ucl_h_mat.h + ------------------- + W. 
Michael Brown + + Matrix Container on Host + + __________________________________________________________________________ + This file is part of the Geryon Unified Coprocessor Library (UCL) + __________________________________________________________________________ + + begin : Thu Jun 25 2009 + copyright : (C) 2009 by W. Michael Brown + email : brownw@ornl.gov + ***************************************************************************/ + +/* ----------------------------------------------------------------------- + Copyright (2009) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the Simplified BSD License. + ----------------------------------------------------------------------- */ + +// Only allow this file to be included by CUDA and OpenCL specific headers +#ifdef _UCL_MAT_ALLOW + +/// Matrix on Host with options for pinning (page locked) +template +class UCL_H_Mat : public UCL_BaseMat { + public: + // Traits for copying data + // MEM_TYPE is 0 for device, 1 for host, and 2 for image + enum traits { + DATA_TYPE = _UCL_DATA_ID::id, + MEM_TYPE = 1, + PADDED = 0, + ROW_MAJOR = 1, + VECTOR = 0 + }; + typedef numtyp data_type; + + UCL_H_Mat() : _kind(UCL_VIEW), _rows(0) { } + ~UCL_H_Mat() { if (_kind!=UCL_VIEW) _host_free(*this,_kind); } + + /// Construct with specied number of rows and columns + /** \sa alloc() **/ + UCL_H_Mat(const size_t rows, const size_t cols, UCL_Device &device, + const enum UCL_MEMOPT kind=UCL_RW_OPTIMIZED) + { _rows=0; _kind=UCL_VIEW; alloc(rows,cols,device,kind); } + + /// Set up host matrix with specied # of rows/cols and reserve memory + /** The kind parameter controls memory pinning as follows: + * - UCL_NOT_PINNED - Memory is not pinned + * - UCL_WRITE_OPTIMIZED - Memory can be pinned (write-combined) + * - UCL_RW_OPTIMIZED - Memory can be pinned + * \param cq Default command queue for operations copied from another mat + * \return UCL_SUCCESS if the memory allocation is successful **/ + template + inline int alloc(const size_t rows, const size_t cols, mat_type &cq, + const enum UCL_MEMOPT kind=UCL_RW_OPTIMIZED) { + clear(); + _cols=cols; + _rows=rows; + _row_bytes=cols*sizeof(numtyp); + _kind=kind; + int err=_host_alloc(*this,cq,_row_bytes*_rows,kind); + #ifndef UCL_NO_EXIT + if (err!=UCL_SUCCESS) { + std::cerr << "UCL Error: Could not allocate " << _row_bytes*_rows + << " bytes on host.\n"; + exit(1); + } + #endif + _end=_array+rows*cols; + return err; + } + + /// Set up host matrix with specied # of rows/cols and reserve memory + /** The kind parameter controls memory pinning as follows: + * - UCL_NOT_PINNED - Memory is not pinned + * - UCL_WRITE_OPTIMIZED - Memory can be pinned (write-combined) + * - UCL_RW_OPTIMIZED - Memory can be pinned + * \param device Used to get the default command queue for operations + * \return UCL_SUCCESS if the memory allocation is successful **/ + inline int alloc(const size_t rows, const size_t cols, UCL_Device &device, + const enum UCL_MEMOPT kind=UCL_RW_OPTIMIZED) { + clear(); + _cols=cols; + _rows=rows; + _row_bytes=cols*sizeof(numtyp); + _kind=kind; + int err=_host_alloc(*this,device,_row_bytes*_rows,kind); + _end=_array+rows*cols; + #ifndef UCL_NO_EXIT + if (err!=UCL_SUCCESS) { + std::cerr << "UCL Error: Could not allocate " << _row_bytes*_rows + << " bytes on host.\n"; + exit(1); + } + #endif + return err; + } + + /// Return the type of memory allocation + /** Returns 
UCL_READ_WRITE, UCL_WRITE_ONLY, UCL_READ_ONLY, or UCL_VIEW **/ + inline enum UCL_MEMOPT kind() const { return _kind; } + + /// Do not allocate memory, instead use an existing allocation from Geryon + /** This function must be passed a Geryon vector or matrix container. + * No memory is freed when the object is destructed. + * - The view does not prevent the memory from being freed by the + * allocating container + * - Viewing a device container on the host is not supported + * \param stride Number of _elements_ between the start of each row **/ + template + inline void view(ucl_type &input, const size_t rows, const size_t cols, + const size_t stride) { + assert(rows==1 || stride==cols); + clear(); + _kind=UCL_VIEW; + _cols=cols; + _rows=rows; + _row_bytes=stride*sizeof(numtyp); + this->_cq=input.cq(); + _array=input.begin(); + _end=_array+_cols; + #ifdef _OCL_MAT + _carray=input.cbegin(); + #endif + } + + /// Do not allocate memory, instead use an existing allocation from Geryon + /** This function must be passed a Geryon vector or matrix container. + * No memory is freed when the object is destructed. + * - The view does not prevent the memory from being freed by the + * allocating container + * - Viewing a device container on the host is not supported **/ + template + inline void view(ucl_type &input, const size_t rows, const size_t cols) + { view(input,rows,cols,input.row_size()); } + + /// Do not allocate memory, instead use an existing allocation from Geryon + /** This function must be passed a Geryon vector or matrix container. + * No memory is freed when the object is destructed. + * - The view does not prevent the memory from being freed by the + * allocating container + * - If a matrix is used a input, all elements (including padding) + * will be used for view + * - Viewing a device container on the host is not supported **/ + template + inline void view(ucl_type &input, const size_t cols) + { view(input,1,cols); } + + /// Do not allocate memory, instead use an existing allocation from Geryon + /** This function must be passed a Geryon vector or matrix container. + * No memory is freed when the object is destructed. + * - The view does not prevent the memory from being freed by the + * allocating container + * - If a matrix is used a input, all elements (including padding) + * will be used for view + * - Viewing a device container on the host is not supported **/ + template + inline void view(ucl_type &input) + { view(input,input.rows(),input.cols()); } + + /// Do not allocate memory, instead use an existing allocation + /** - No memory is freed when the object is destructed. + * - The view does not prevent the memory from being freed by the + * allocating container + * - Viewing a device pointer on the host is not supported + * \param stride Number of _elements_ between the start of each row **/ + template + inline void view(ptr_type *input, const size_t rows, const size_t cols, + const size_t stride, UCL_Device &dev) { + assert(rows==1 || stride==cols); + clear(); + _kind=UCL_VIEW; + _cols=cols; + _rows=rows; + _row_bytes=stride*sizeof(numtyp); + this->_cq=dev.cq(); + _array=input; + _end=_array+_cols; + + #ifdef _OCL_MAT + _host_alloc(*this,dev,_row_bytes,UCL_VIEW); + #endif + } + + /// Do not allocate memory, instead use an existing allocation + /** - No memory is freed when the object is destructed. 
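The container view() above can reinterpret an existing host allocation with a different shape; when rows>1 the stride must equal cols (see the assert), so the four-argument form is used here. An illustrative sketch, assuming the <double> specialization:

    UCL_Device gpu;
    UCL_H_Vec<double> v;
    UCL_H_Mat<double> as2d;
    v.alloc(12, gpu);
    as2d.view(v, 3, 4, 4);                       // 12 host elements seen as a 3x4 matrix
    as2d(2, 3) = 7.0;                            // writes v[2*4+3]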
+ * - The view does not prevent the memory from being freed by the + * allocating container + * - Viewing a device pointer on the host is not supported **/ + template + inline void view(ptr_type *input, const size_t rows, const size_t cols, + UCL_Device &dev) { view(input,rows,cols,cols,dev); } + + /// Do not allocate memory, instead use an existing allocation + /** - No memory is freed when the object is destructed. + * - The view does not prevent the memory from being freed by the + * allocating container + * - Viewing a device pointer on the host is not supported **/ + template + inline void view(ptr_type *input, const size_t cols, UCL_Device &dev) + { view(input,1,cols,dev); } + + /// Do not allocate memory, instead use an existing allocation from Geryon + /** This function must be passed a Geryon vector or matrix container. + * No memory is freed when the object is destructed. + * - The view does not prevent the memory from being freed by the + * allocating container + * - Viewing a device container on the host is not supported + * \param stride Number of _elements_ between the start of each row **/ + template + inline void view_offset(const size_t offset,ucl_type &input,const size_t rows, + const size_t cols, const size_t stride) { + assert(rows==1 || stride==cols); + clear(); + _kind=UCL_VIEW; + _cols=cols; + _rows=rows; + _row_bytes=stride*sizeof(numtyp); + this->_cq=input.cq(); + _array=input.begin()+offset; + _end=_array+_cols; + #ifdef _OCL_MAT + _host_alloc(*this,input,_row_bytes,UCL_VIEW); + #endif + } + + /// Do not allocate memory, instead use an existing allocation from Geryon + /** This function must be passed a Geryon vector or matrix container. + * No memory is freed when the object is destructed. + * - The view does not prevent the memory from being freed by the + * allocating container + * - Viewing a device container on the host is not supported **/ + template + inline void view_offset(const size_t offset,ucl_type &input,const size_t rows, + const size_t cols) + { view_offset(offset,input,rows,cols,input.row_size()); } + + /// Do not allocate memory, instead use an existing allocation from Geryon + /** This function must be passed a Geryon vector or matrix container. + * No memory is freed when the object is destructed. + * - The view does not prevent the memory from being freed by the + * allocating container + * - If a matrix is used a input, all elements (including padding) + * will be used for view + * - Viewing a device container on the host is not supported **/ + template + inline void view_offset(const size_t offset,ucl_type &input,const size_t cols) + { view_offset(offset,input,1,cols); } + + /// Do not allocate memory, instead use an existing allocation from Geryon + /** This function must be passed a Geryon vector or matrix container. + * No memory is freed when the object is destructed. + * - The view does not prevent the memory from being freed by the + * allocating container + * - If a matrix is used a input, all elements (including padding) + * will be used for view + * - Viewing a device container on the host is not supported **/ + template + inline void view_offset(const size_t offset, ucl_type &input) { + if (input.rows()==1) + view_offset(offset,input,1,input.cols()-offset); + else + view_offset(offset,input,input.rows()-offset/input.row_size(), + input.cols()); + } + + /// Do not allocate memory, instead use an existing allocation + /** - No memory is freed when the object is destructed. 
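A minimal sketch of viewing an existing host buffer (not from the patch; 'buf' is hypothetical). Nothing is allocated or freed by the container, so the buffer must outlive the view:

    UCL_Device gpu;
    double buf[6*8];                             // host storage managed by the caller
    UCL_H_Mat<double> m;
    m.view(buf, 6, 8, gpu);                      // 6x8 view with stride == cols
    m(2, 3) = 1.5;                               // writes buf[2*8+3]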
+ * - The view does not prevent the memory from being freed by the + * allocating container + * - Viewing a device pointer on the host is not supported **/ + template + inline void view_offset(const size_t offset,ptr_type *input,const size_t rows, + const size_t cols, UCL_Device &dev) + { view(input+offset,rows,cols,dev); } + + /// Do not allocate memory, instead use an existing allocation + /** - No memory is freed when the object is destructed. + * - The view does not prevent the memory from being freed by the + * allocating container + * - Viewing a device pointer on the host is not supported + * \param stride Number of _elements_ between the start of each row **/ + template + inline void view_offset(const size_t offset,ptr_type *input,const size_t rows, + const size_t cols,const size_t stride,UCL_Device &dev) + { view(input+offset,rows,cols,stride,dev); } + + /// Do not allocate memory, instead use an existing allocation + /** - No memory is freed when the object is destructed. + * - The view does not prevent the memory from being freed by the + * allocating container + * - Viewing a device pointer on the host is not supported **/ + template + inline void view_offset(const size_t offset, ptr_type *input, + const size_t cols, UCL_Device &dev) + { view(input+offset,1,cols,dev); } + + /// Free memory and set size to 0 + inline void clear() + { if (_kind!=UCL_VIEW) {_rows=0; _kind=UCL_VIEW; _host_free(*this,_kind); }} + + /// Set each element to zero + inline void zero() { _host_zero(_array,_rows*row_bytes()); } + /// Set first n elements to zero + inline void zero(const int n) { _host_zero(_array,n*sizeof(numtyp)); } + + /// Get host pointer to first element + inline numtyp * begin() { return _array; } + /// Get host pointer to first element + inline const numtyp * begin() const { return _array; } + /// Get host pointer to one past last element + inline numtyp * end() { return _end; } + /// Get host pointer to one past last element + inline const numtyp * end() const { return _end; } + + /// Get the number of elements + inline size_t numel() const { return _rows*_cols; } + /// Get the number of rows + inline size_t rows() const { return _rows; } + /// Get the number of columns + inline size_t cols() const { return _cols; } + ///Get the size of a row (including any padding) in elements + inline size_t row_size() const { return _cols; } + /// Get the size of a row (including any padding) in bytes + inline size_t row_bytes() const { return _row_bytes; } + /// Get the size in bytes of 1 element + inline int element_size() const { return sizeof(numtyp); } + + /// Get element at index i + inline numtyp & operator[](const int i) { return _array[i]; } + /// Get element at index i + inline const numtyp & operator[](const int i) const { return _array[i]; } + /// 2D access (row should always be 0) + inline numtyp & operator()(const int row, const int col) + { return _array[row*_cols+col]; } + /// 2D access (row should always be 0) + inline const numtyp & operator()(const int row, const int col) const + { return _array[row*_cols+col]; } + + /// Returns pointer to memory pointer for allocation on host + inline numtyp ** host_ptr() { return &_array; } + + /// Return the offset (in elements) from begin() pointer where data starts + /** \note Always 0 for host matrices and CUDA APIs **/ + inline size_t offset() const { return 0; } + /// Return the offset (in bytes) from begin() pointer where data starts + /** \note Always 0 for host matrices and CUDA APIs **/ + inline size_t byteoff() const { return 0; } 
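Putting the host matrix together with a device container, a typical staging pattern might look as follows. This is a sketch only: the blocking ucl_copy() form is the one used by ucl_print.h later in this patch for device-to-host transfers, and it is assumed here to work in the host-to-device direction as well; all names are illustrative.

    UCL_Device gpu;
    UCL_H_Mat<float> h;
    UCL_D_Vec<float> d;
    h.alloc(4, 4, gpu, UCL_RW_OPTIMIZED);        // pinned host storage
    d.alloc(h.numel(), gpu);
    for (size_t i = 0; i < h.numel(); i++)
      h[i] = static_cast<float>(i);              // 1D host access via operator[]
    ucl_copy(d, h, h.numel(), false);            // blocking copy of 16 elements to the device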
+ + #ifdef _OCL_MAT + /// Returns an API specific device pointer (cl_mem& for OpenCL, void ** for CUDA) + inline device_ptr & cbegin() { return _carray; } + /// Returns an API specific device pointer (cl_mem& for OpenCL, void ** for CUDA) + inline const device_ptr & cbegin() const { return _carray; } + #else + /// Returns an API specific device pointer (cl_mem& for OpenCL, void ** for CUDA) + inline void ** cbegin() { return (void **)&_array; } + /// Returns an API specific device pointer (cl_mem& for OpenCL, void ** for CUDA) + inline const void ** cbegin() const { return (const void **)&_array; } + #endif + + private: + enum UCL_MEMOPT _kind; + numtyp *_array, *_end; + size_t _row_bytes, _rows, _cols; + + #ifdef _OCL_MAT + device_ptr _carray; + #endif +}; + +#endif + diff --git a/lib/gpu/geryon/ucl_h_vec.h b/lib/gpu/geryon/ucl_h_vec.h new file mode 100644 index 0000000000..fb60e8cf17 --- /dev/null +++ b/lib/gpu/geryon/ucl_h_vec.h @@ -0,0 +1,370 @@ +/*************************************************************************** + ucl_h_vec.h + ------------------- + W. Michael Brown + + Vector Container on Host + + __________________________________________________________________________ + This file is part of the Geryon Unified Coprocessor Library (UCL) + __________________________________________________________________________ + + begin : Thu Jun 25 2009 + copyright : (C) 2009 by W. Michael Brown + email : brownw@ornl.gov + ***************************************************************************/ + +/* ----------------------------------------------------------------------- + Copyright (2009) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the Simplified BSD License. 
+ ----------------------------------------------------------------------- */ + +// Only allow this file to be included by CUDA and OpenCL specific headers +#ifdef _UCL_MAT_ALLOW + +/// Row Vector on Host with options for pinning (page locked) +template +class UCL_H_Vec : public UCL_BaseMat { + public: + // Traits for copying data + // MEM_TYPE is 0 for device, 1 for host, and 2 for image + enum traits { + DATA_TYPE = _UCL_DATA_ID::id, + MEM_TYPE = 1, + PADDED = 0, + ROW_MAJOR = 1, + VECTOR = 1 + }; + typedef numtyp data_type; + + UCL_H_Vec() : _kind(UCL_VIEW), _cols(0) { } + ~UCL_H_Vec() { if (_kind!=UCL_VIEW) _host_free(*this,_kind); } + + /// Construct with n columns + /** \sa alloc() **/ + UCL_H_Vec(const size_t n, UCL_Device &device, + const enum UCL_MEMOPT kind=UCL_RW_OPTIMIZED) + { _cols=0; _kind=UCL_VIEW; alloc(n,device,kind); } + + /// Set up host vector with 'cols' columns and reserve memory + /** The kind parameter controls memory pinning as follows: + * - UCL_NOT_PINNED - Memory is not pinned + * - UCL_WRITE_OPTIMIZED - Memory can be pinned (write-combined) + * - UCL_RW_OPTIMIZED - Memory can be pinned + * \param cq Default command queue for operations copied from another mat + * \return UCL_SUCCESS if the memory allocation is successful **/ + template + inline int alloc(const size_t cols, mat_type &cq, + const enum UCL_MEMOPT kind=UCL_RW_OPTIMIZED) { + clear(); + _cols=cols; + _row_bytes=cols*sizeof(numtyp); + _kind=kind; + int err=_host_alloc(*this,cq,_row_bytes,kind); + _end=_array+cols; + #ifndef UCL_NO_EXIT + if (err!=UCL_SUCCESS) { + std::cerr << "UCL Error: Could not allocate " << _row_bytes + << " bytes on host.\n"; + exit(1); + } + #endif + return err; + } + + /// Set up host vector with 'cols' columns and reserve memory + /** The kind parameter controls memory pinning as follows: + * - UCL_NOT_PINNED - Memory is not pinned + * - UCL_WRITE_OPTIMIZED - Memory can be pinned (write-combined) + * - UCL_RW_OPTIMIZED - Memory can be pinned + * \param device Used to get the default command queue for operations + * \return UCL_SUCCESS if the memory allocation is successful **/ + inline int alloc(const size_t cols, UCL_Device &device, + const enum UCL_MEMOPT kind=UCL_RW_OPTIMIZED) { + clear(); + _cols=cols; + _row_bytes=cols*sizeof(numtyp); + _kind=kind; + int err=_host_alloc(*this,device,_row_bytes,kind); + _end=_array+cols; + #ifndef UCL_NO_EXIT + if (err!=UCL_SUCCESS) { + std::cerr << "UCL Error: Could not allocate " << _row_bytes + << " bytes on host.\n"; + exit(1); + } + #endif + return err; + } + + /// Return the type of memory allocation + /** Returns UCL_READ_WRITE, UCL_WRITE_ONLY, UCL_READ_ONLY, or UCL_VIEW **/ + inline enum UCL_MEMOPT kind() const { return _kind; } + + /// Do not allocate memory, instead use an existing allocation from Geryon + /** This function must be passed a Geryon vector or matrix container. + * No memory is freed when the object is destructed. 
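A short sketch of the pinning options for the host vector (illustrative only; the device handle is an assumption):

    UCL_Device gpu;
    UCL_H_Vec<float> staging;
    // UCL_NOT_PINNED, UCL_WRITE_OPTIMIZED, or UCL_RW_OPTIMIZED select the pinning behaviour
    staging.alloc(4096, gpu, UCL_WRITE_OPTIMIZED);
    staging[0] = 1.0f;                           // ordinary host-side access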
+ * - The view does not prevent the memory from being freed by the + * allocating container + * - Viewing a device container on the host is not supported **/ + template + inline void view(ucl_type &input, const size_t rows, const size_t cols) { + #ifdef UCL_DEBUG + assert(rows==1); + #endif + clear(); + _kind=UCL_VIEW; + _cols=cols; + _row_bytes=_cols*sizeof(numtyp); + this->_cq=input.cq(); + _array=input.begin(); + _end=_array+_cols; + #ifdef _OCL_MAT + _carray=input.cbegin(); + #endif + } + + /// Do not allocate memory, instead use an existing allocation from Geryon + /** This function must be passed a Geryon vector or matrix container. + * No memory is freed when the object is destructed. + * - The view does not prevent the memory from being freed by the + * allocating container + * - Viewing a device container on the host is not supported + * \param stride Number of _elements_ between the start of each row **/ + template + inline void view(ucl_type &input, const size_t rows, const size_t cols, + const size_t stride) { view(input,rows,cols); } + + /// Do not allocate memory, instead use an existing allocation from Geryon + /** This function must be passed a Geryon vector or matrix container. + * No memory is freed when the object is destructed. + * - The view does not prevent the memory from being freed by the + * allocating container + * - If a matrix is used a input, all elements (including padding) + * will be used for view + * - Viewing a device container on the host is not supported **/ + template + inline void view(ucl_type &input, const size_t cols) + { view(input,1,cols); } + + /// Do not allocate memory, instead use an existing allocation from Geryon + /** This function must be passed a Geryon vector or matrix container. + * No memory is freed when the object is destructed. + * - The view does not prevent the memory from being freed by the + * allocating container + * - If a matrix is used a input, all elements (including padding) + * will be used for view + * - Viewing a device container on the host is not supported **/ + template + inline void view(ucl_type &input) + { view(input,input.rows()*input.row_size()); } + + /// Do not allocate memory, instead use an existing allocation + /** - No memory is freed when the object is destructed. + * - The view does not prevent the memory from being freed by the + * allocating container + * - Viewing a device pointer on the host is not supported **/ + template + inline void view(ptr_type *input, const size_t rows, const size_t cols, + UCL_Device &dev) { + #ifdef UCL_DEBUG + assert(rows==1); + #endif + clear(); + _kind=UCL_VIEW; + _cols=cols; + _row_bytes=_cols*sizeof(numtyp); + this->_cq=dev.cq(); + _array=input; + _end=_array+_cols; + + #ifdef _OCL_MAT + _host_alloc(*this,dev,_row_bytes,UCL_VIEW); + #endif + } + + /// Do not allocate memory, instead use an existing allocation + /** - No memory is freed when the object is destructed. + * - The view does not prevent the memory from being freed by the + * allocating container + * - Viewing a device pointer on the host is not supported + * \param stride Number of _elements_ between the start of each row **/ + template + inline void view(ptr_type *input, const size_t rows, const size_t cols, + const size_t stride, UCL_Device &dev) + { view(input,rows,cols,stride); } + + /// Do not allocate memory, instead use an existing allocation + /** - No memory is freed when the object is destructed. 
+ * - The view does not prevent the memory from being freed by the + * allocating container + * - Viewing a device pointer on the host is not supported **/ + template + inline void view(ptr_type *input, const size_t cols, UCL_Device &dev) + { view(input,1,cols,dev); } + + /// Do not allocate memory, instead use an existing allocation from Geryon + /** This function must be passed a Geryon vector or matrix container. + * No memory is freed when the object is destructed. + * - The view does not prevent the memory from being freed by the + * allocating container + * - Viewing a device container on the host is not supported **/ + template + inline void view_offset(const size_t offset,ucl_type &input,const size_t rows, + const size_t cols) { + #ifdef UCL_DEBUG + assert(rows==1); + #endif + clear(); + _kind=UCL_VIEW; + _cols=cols; + _row_bytes=_cols*sizeof(numtyp); + this->_cq=input.cq(); + _array=input.begin()+offset; + _end=_array+_cols; + #ifdef _OCL_MAT + _host_alloc(*this,input,_row_bytes,UCL_VIEW); + #endif + } + + /// Do not allocate memory, instead use an existing allocation from Geryon + /** This function must be passed a Geryon vector or matrix container. + * No memory is freed when the object is destructed. + * - The view does not prevent the memory from being freed by the + * allocating container + * - Viewing a device container on the host is not supported + * \param stride Number of _elements_ between the start of each row **/ + template + inline void view_offset(const size_t offset,ucl_type &input,const size_t rows, + const size_t cols, const size_t stride) + { view_offset(offset,input,rows,cols); } + + /// Do not allocate memory, instead use an existing allocation from Geryon + /** This function must be passed a Geryon vector or matrix container. + * No memory is freed when the object is destructed. + * - The view does not prevent the memory from being freed by the + * allocating container + * - If a matrix is used a input, all elements (including padding) + * will be used for view + * - Viewing a device container on the host is not supported **/ + template + inline void view_offset(const size_t offset,ucl_type &input,const size_t cols) + { view_offset(offset,input,1,cols); } + + /// Do not allocate memory, instead use an existing allocation from Geryon + /** This function must be passed a Geryon vector or matrix container. + * No memory is freed when the object is destructed. + * - The view does not prevent the memory from being freed by the + * allocating container + * - If a matrix is used a input, all elements (including padding) + * will be used for view + * - Viewing a device container on the host is not supported **/ + template + inline void view_offset(const size_t offset, ucl_type &input) + { view_offset(offset,input,input.rows()*input.row_size()-offset); } + + /// Do not allocate memory, instead use an existing allocation + /** - No memory is freed when the object is destructed. + * - The view does not prevent the memory from being freed by the + * allocating container + * - Viewing a device pointer on the host is not supported **/ + template + inline void view_offset(const size_t offset,ptr_type *input,const size_t rows, + const size_t cols, UCL_Device &dev) + { view(input+offset,rows,cols,dev); } + + /// Do not allocate memory, instead use an existing allocation + /** - No memory is freed when the object is destructed. 
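A sketch of wrapping caller-owned host memory with the pointer view()/view_offset() overloads (names are illustrative; the array must outlive the views):

    UCL_Device gpu;
    double data[256];
    UCL_H_Vec<double> head, rest;
    head.view(data, 16, gpu);                    // aliases data[0..15]
    rest.view_offset(16, data, 1, 240, gpu);     // aliases data[16..255]; rows must be 1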
+ * - The view does not prevent the memory from being freed by the + * allocating container + * - Viewing a device pointer on the host is not supported + * \param stride Number of _elements_ between the start of each row **/ + template + inline void view_offset(const size_t offset,ptr_type *input,const size_t rows, + const size_t cols,const size_t stride,UCL_Device &dev) + { view(input+offset,rows,cols,stride,dev); } + + /// Do not allocate memory, instead use an existing allocation + /** - No memory is freed when the object is destructed. + * - The view does not prevent the memory from being freed by the + * allocating container + * - Viewing a device pointer on the host is not supported **/ + template + inline void view_offset(const size_t offset, ptr_type *input, + const size_t cols, UCL_Device &dev) + { view(input+offset,1,cols,dev); } + + /// Free memory and set size to 0 + inline void clear() + { if (_kind!=UCL_VIEW) {_kind=UCL_VIEW; _cols=0; _host_free(*this,_kind);}} + + /// Set each element to zero + inline void zero() { _host_zero(_array,row_bytes()); } + + /// Set first n elements to zero + inline void zero(const int n) { _host_zero(_array,n*sizeof(numtyp)); } + + /// Get host pointer to first element + inline numtyp * begin() { return _array; } + /// Get host pointer to first element + inline const numtyp * begin() const { return _array; } + /// Get host pointer to one past last element + inline numtyp * end() { return _end; } + /// Get host pointer to one past last element + inline const numtyp * end() const { return _end; } + + /// Get the number of elements + inline size_t numel() const { return _cols; } + /// Get the number of rows + inline size_t rows() const { return 1; } + /// Get the number of columns + inline size_t cols() const { return _cols; } + ///Get the size of a row (including any padding) in elements + inline size_t row_size() const { return _cols; } + /// Get the size of a row (including any padding) in bytes + inline size_t row_bytes() const { return _row_bytes; } + /// Get the size in bytes of 1 element + inline int element_size() const { return sizeof(numtyp); } + + /// Get element at index i + inline numtyp & operator[](const int i) { return _array[i]; } + /// Get element at index i + inline const numtyp & operator[](const int i) const { return _array[i]; } + /// 2D access (row should always be 0) + inline numtyp & operator()(const int row, const int col) + { return _array[col]; } + /// 2D access (row should always be 0) + inline const numtyp & operator()(const int row, const int col) const + { return _array[col]; } + + /// Returns pointer to memory pointer for allocation on host + inline numtyp ** host_ptr() { return &_array; } + + /// Return the offset (in elements) from begin() pointer where data starts + /** \note Always 0 for host matrices and CUDA APIs **/ + inline size_t offset() const { return 0; } + /// Return the offset (in bytes) from begin() pointer where data starts + /** \note Always 0 for host matrices and CUDA APIs **/ + inline size_t byteoff() const { return 0; } + + #ifdef _OCL_MAT + /// For OpenCL, returns a reference to the cl_mem object + inline device_ptr & cbegin() { return _carray; } + /// For OpenCL, returns a reference to the cl_mem object + inline const device_ptr & cbegin() const { return _carray; } + #endif + + private: + enum UCL_MEMOPT _kind; + numtyp *_array, *_end; + size_t _row_bytes, _cols; + + #ifdef _OCL_MAT + device_ptr _carray; + #endif +}; + +#endif + diff --git a/lib/gpu/geryon/ucl_nv_kernel.h 
b/lib/gpu/geryon/ucl_nv_kernel.h new file mode 100644 index 0000000000..a162b32c2a --- /dev/null +++ b/lib/gpu/geryon/ucl_nv_kernel.h @@ -0,0 +1,42 @@ +/*************************************************************************** + ucl_nv_kernel.h + ------------------- + W. Michael Brown + + Preprocessor macros for OpenCL/CUDA compatibility + + __________________________________________________________________________ + This file is part of the Geryon Unified Coprocessor Library (UCL) + __________________________________________________________________________ + + begin : Mon May 3 2010 + copyright : (C) 2010 by W. Michael Brown + email : brownw@ornl.gov + ***************************************************************************/ + +/* ----------------------------------------------------------------------- + Copyright (2010) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the Simplified BSD License. + ----------------------------------------------------------------------- */ + +// Only allow this file to be included by CUDA and OpenCL specific headers +#ifndef UCL_NV_KERNEL_H +#define UCL_NV_KERNEL_H + +#define GLOBAL_ID_X threadIdx.x+__mul24(blockIdx.x,blockDim.x) +#define GLOBAL_ID_Y threadIdx.y+__mul24(blockIdx.y,blockDim.y) +#define THREAD_ID_X threadIdx.x +#define THREAD_ID_Y threadIdx.y +#define BLOCK_ID_X blockIdx.x +#define BLOCK_ID_Y blockIdx.y +#define BLOCK_SIZE_X blockDim.x +#define BLOCK_SIZE_Y blockDim.y +#define __kernel extern "C" __global__ +#define __local __shared__ +#define mul24 __mul24 +#define __global +#define __inline static __inline__ __device__ + +#endif diff --git a/lib/gpu/geryon/ucl_print.h b/lib/gpu/geryon/ucl_print.h new file mode 100644 index 0000000000..0152764225 --- /dev/null +++ b/lib/gpu/geryon/ucl_print.h @@ -0,0 +1,273 @@ +/*************************************************************************** + ucl_print.h + ------------------- + W. Michael Brown + + Routines for printing debugging output for matrix/vector data + + __________________________________________________________________________ + This file is part of the Geryon Unified Coprocessor Library (UCL) + __________________________________________________________________________ + + begin : Mon Jan 11 2010 + copyright : (C) 2010 by W. Michael Brown + email : brownw@ornl.gov + ***************************************************************************/ + +/* ----------------------------------------------------------------------- + Copyright (2010) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the Simplified BSD License. 
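To illustrate the compatibility macros in ucl_nv_kernel.h above, a kernel body written against them compiles as CUDA once that header is included; under OpenCL the __kernel/__global qualifiers are native, and an OpenCL-side counterpart is assumed to define GLOBAL_ID_X (for example as get_global_id(0)), which is not part of this hunk. The kernel name and arguments are hypothetical.

    #include "ucl_nv_kernel.h"                   // CUDA build; the qualifiers become no-ops or CUDA keywords
    __kernel void axpy(__global float *y, __global const float *x,
                       const float alpha, const int n) {
      int i = GLOBAL_ID_X;                       // threadIdx.x + __mul24(blockIdx.x, blockDim.x)
      if (i < n)
        y[i] += alpha * x[i];
    }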
+ ----------------------------------------------------------------------- */ + +// Only allow this file to be included by nvc_memory.h and ocl_memory.h +#ifdef UCL_PRINT_ALLOW + +template struct _ucl_print; +template <> struct _ucl_print<1> { + template + static inline void p(mat_type &mat, const size_t n, std::ostream &out, + const std::string delim) { + for (size_t i=0; i + static inline void p(const mat_type &mat, const size_t n, std::ostream &out, + const std::string delim, UCL_Device &dev) { + p(mat,n,out,delim); + } + template + static inline void p(mat_type &mat, const size_t rows, const size_t cols, + std::ostream &out, const std::string delim, + const std::string row_delim) { + int offset=0; + int row_size=cols; + if (mat_type::VECTOR==0) + row_size=mat.row_size(); + for (size_t j=0; j + static inline void p(const mat_type &mat,const size_t rows,const size_t cols, + std::ostream &out,const std::string delim, + const std::string row_delim, UCL_Device &dev) { + p(mat,rows,cols,out,delim,row_delim); + } +}; + +template struct _ucl_print { + template + static inline void p(mat_type &mat, const size_t n, std::ostream &out, + const std::string delim) { + UCL_H_Vec temp; + temp.alloc(n,mat); + ucl_copy(temp,mat,n,false); + _ucl_print<1>::p(temp,n,out,delim); + } + template + static inline void p(const mat_type &mat, const size_t n, std::ostream &out, + const std::string delim, UCL_Device &dev) { + UCL_H_Vec temp; + temp.alloc(n,dev); + ucl_copy(temp,mat,n,false); + _ucl_print<1>::p(temp,n,out,delim); + } + template + static inline void p(mat_type &mat, const size_t rows, const size_t cols, + std::ostream &out, const std::string delim, + const std::string row_delim) { + UCL_H_Vec temp; + temp.alloc(mat.rows()*mat.cols(),mat); + if (mat_type::VECTOR==1) + ucl_copy(temp,mat,rows*cols,false); + else + ucl_copy(temp,mat,rows,cols,false); + _ucl_print<1>::p(temp,rows,cols,out,delim,row_delim); + } + template + static inline void p(const mat_type &mat, const size_t rows, + const size_t cols,std::ostream &out, + const std::string delim, + const std::string row_delim, UCL_Device &dev) { + UCL_H_Vec temp; + temp.alloc(mat.rows()*mat.cols(),dev); + if (mat_type::VECTOR==1) + ucl_copy(temp,mat,rows*cols,false); + else + ucl_copy(temp,mat,rows,cols,false); + _ucl_print<1>::p(temp,rows,cols,out,delim,row_delim); + } +}; + +// ------------------------------------------------------------------------- +// - Non-const routines that do not require a device object +// ------------------------------------------------------------------------- + +/// Outputs n elements of mat delimited by the string delim +template +inline void ucl_print(mat_type &mat, const size_t n, std::ostream &out, + const std::string delim) { + if (n>mat.numel()) { + std::cerr << "Attempted to ucl_print " << n << " elements of matrix " + << "that only has " << mat.numel() << " elements."; + exit(1); + } + _ucl_print::p(mat,n,out,delim); +} + +/// Outputs n elements of mat delimited by a space +template +inline void ucl_print(mat_type &mat, const size_t n, std::ostream &out) { + ucl_print(mat,n,out," "); +} + +/// Outputs n elements of mat delimited by a space to standard out +template +inline void ucl_print(mat_type &mat, const size_t n) { + ucl_print(mat,n,std::cout," "); +} + +/// Outputs upper left rows and cols of mat delimited by the string delim +template +inline void ucl_print(mat_type &mat, const size_t rows, const size_t cols, + std::ostream &out, const std::string delim, + const std::string row_delim) { + if 
(rows*cols>mat.numel()) { + std::cerr << "Attempted to ucl_print " << rows*cols << " elements of matrix " + << "that only has " << mat.numel() << " elements."; + exit(1); + } + _ucl_print::p(mat,rows,cols,out,delim,row_delim); +} + +/// Outputs upper left rows and cols of mat delimited by a space +template +inline void ucl_print(mat_type &mat, const size_t rows, const size_t cols, + std::ostream &out) { + ucl_print(mat,rows,cols,out," ","\n"); +} + +/// Outputs upper left rows and cols of mat delimited by a space to std out +template +inline void ucl_print(mat_type &mat, const size_t rows, + const size_t cols) { + ucl_print(mat,rows,cols,std::cout," ","\n"); +} + +/// Outputs mat delimited by a space to standard out +template +inline void ucl_print(mat_type &mat) { + ucl_print(mat,std::cout); +} + +/// Outputs mat delimited by a space +template +inline void ucl_print(mat_type &mat, std::ostream &out) { + if (mat_type::VECTOR==1) + ucl_print(mat,mat.cols(),out," "); + else + ucl_print(mat,mat.rows(),mat.cols(),out," ","\n"); +} + +// ------------------------------------------------------------------------- +// - Const routines that do not require a device object +// ------------------------------------------------------------------------- + +/// Outputs n elements of mat delimited by the string delim +template +inline void ucl_print(const mat_type &mat, const size_t n, std::ostream &out, + const std::string delim, UCL_Device &dev) { + if (n>mat.numel()) { + std::cerr << "Attempted to ucl_print " << n << " elements of matrix " + << "that only has " << mat.numel() << " elements."; + exit(1); + } + _ucl_print::p(mat,n,out,delim,dev); +} + +/// Outputs n elements of mat delimited by a space +template +inline void ucl_print(const mat_type &mat, const size_t n, std::ostream &out, + UCL_Device &dev) { + ucl_print(mat,n,out," ",dev); +} + +/// Outputs n elements of mat delimited by a space to standard out +template +inline void ucl_print(const mat_type &mat, const size_t n, + UCL_Device &dev) { + ucl_print(mat,n,std::cout," ",dev); +} + +/// Outputs upper left rows and cols of mat delimited by the string delim +template +inline void ucl_print(const mat_type &mat,const size_t rows,const size_t cols, + std::ostream &out, const std::string delim, + const std::string row_delim, UCL_Device &dev) { + if (rows*cols>mat.numel()) { + std::cerr << "Attempted to ucl_print " << rows*cols << " elements of matrix " + << "that only has " << mat.numel() << " elements."; + exit(1); + } + _ucl_print::p(mat,rows,cols,out,delim,row_delim,dev); +} + +/// Outputs upper left rows and cols of mat delimited by a space +template +inline void ucl_print(const mat_type &mat,const size_t rows,const size_t cols, + std::ostream &out, UCL_Device &dev) { + ucl_print(mat,rows,cols,out," ","\n",dev); +} + +/// Outputs upper left rows and cols of mat delimited by a space to std out +template +inline void ucl_print(const mat_type &mat, const size_t rows, + const size_t cols, UCL_Device &dev) { + ucl_print(mat,rows,cols,std::cout," ","\n",dev); +} + +/// Outputs mat delimited by a space to standard out +template +inline void ucl_print(const mat_type &mat, UCL_Device &dev) { + ucl_print(mat,std::cout,dev); +} + +/// Outputs mat delimited by a space +template +inline void ucl_print(const mat_type &mat, std::ostream &out, UCL_Device &dev) { + if (mat_type::VECTOR==1) + ucl_print(mat,mat.cols(),out," ",dev); + else + ucl_print(mat,mat.rows(),mat.cols(),out," ","\n",dev); +} + +// 
------------------------------------------------------------------------- +// - Operator << Overloading +// ------------------------------------------------------------------------- + +template +inline std::ostream & operator << (std::ostream &out, UCL_H_Vec &mat) + { ucl_print(mat,out); return out; } + +template +inline std::ostream & operator << (std::ostream &out, UCL_H_Mat &mat) + { ucl_print(mat,out); return out; } + +template +inline std::ostream & operator << (std::ostream &out, UCL_D_Vec &mat) + { ucl_print(mat,out); return out; } + +template +inline std::ostream & operator << (std::ostream &out, UCL_D_Mat &mat) + { ucl_print(mat,out); return out; } + +#endif diff --git a/lib/gpu/geryon/ucl_types.h b/lib/gpu/geryon/ucl_types.h new file mode 100644 index 0000000000..9dabf16687 --- /dev/null +++ b/lib/gpu/geryon/ucl_types.h @@ -0,0 +1,121 @@ +/*************************************************************************** + ucl_types.h + ------------------- + W. Michael Brown + + Data type definitions for Coprocessor library + + __________________________________________________________________________ + This file is part of the Geryon Unified Coprocessor Library (UCL) + __________________________________________________________________________ + + begin : Mon Jan 4 2010 + copyright : (C) 2010 by W. Michael Brown + email : brownw@ornl.gov + ***************************************************************************/ + +/* ----------------------------------------------------------------------- + Copyright (2010) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the Simplified BSD License. + ----------------------------------------------------------------------- */ + +#ifndef UCL_TYPES_H +#define UCL_TYPES_H + +// Assign an integer id based on the data type: (int, float, double, etc) +template struct _UCL_DATA_ID; +template <> struct _UCL_DATA_ID { + enum { id=1 }; + static inline const char * name() { return "double"; } + static inline const char * numtyp_flag() { return "-D NUMTYP=double"; } +}; +template <> struct _UCL_DATA_ID { + enum { id=2 }; + static inline const char * name() { return "float"; } + static inline const char * numtyp_flag() { return "-D NUMTYP=float"; } +}; +template <> struct _UCL_DATA_ID { + enum { id=3 }; + static inline const char * name() { return "unsigned"; } + static inline const char * numtyp_flag() { return "-D NUMTYP=unsigned"; } +}; +template <> struct _UCL_DATA_ID { + enum { id=4 }; + static inline const char * name() { return "int"; } + static inline const char * numtyp_flag() { return "-D NUMTYP=int"; } +}; +template <> struct _UCL_DATA_ID { + enum { id=5 }; + static inline const char * name() { return "char"; } + static inline const char * numtyp_flag() { return "-D NUMTYP=char"; } +}; +template <> struct _UCL_DATA_ID { + enum { id=6 }; + static inline const char * name() { return "unsigned char"; } + static inline const char * numtyp_flag() { return "-D NUMTYP=unsigned char"; } +}; +template <> struct _UCL_DATA_ID { + enum { id=7 }; + static inline const char * name() { return "short"; } + static inline const char * numtyp_flag() { return "-D NUMTYP=short"; } +}; +template <> struct _UCL_DATA_ID { + enum { id=8 }; + static inline const char * name() { return "unsigned short"; } + static inline const char * numtyp_flag() { return "-D NUMTYP=unsigned short"; } +}; +template <> struct _UCL_DATA_ID { + enum { id=9 
}; + static inline const char * name() { return "long"; } + static inline const char * numtyp_flag() { return "-D NUMTYP=long"; } +}; +template <> struct _UCL_DATA_ID { + enum { id=10 }; + static inline const char * name() { return "unsigned long"; } + static inline const char * numtyp_flag() { return "-D NUMTYP=unsigned long"; } +}; +template <> struct _UCL_DATA_ID { + enum { id=11 }; + static inline const char * name() { return "long double"; } + static inline const char * numtyp_flag() { return "-D NUMTYP=long double"; } +}; +template struct _UCL_DATA_ID { + enum { id=0 }; + static inline const char * name() { return "error_type"; } + static inline const char * numtyp_flag() { return "-D NUMTYP=error_type"; } +}; + +// Host memory allocation types +enum UCL_MEMOPT { + UCL_WRITE_ONLY, ///< Allow any optimizations for memory that is write only + UCL_READ_ONLY, ///< Allow any optimizations for memory that is read only + UCL_READ_WRITE, ///< Allow read and write + UCL_WRITE_OPTIMIZED,///< Allow host memory to be pinned (write combined) + UCL_RW_OPTIMIZED, ///< Allow host memory to be pinned + UCL_NOT_PINNED, ///< Host memory is not to be pinned + UCL_VIEW ///< View of another memory allocation +}; + +enum UCL_DEVICE_TYPE { + UCL_DEFAULT, ///< Unknown device type + UCL_CPU, ///< Device is a CPU + UCL_GPU, ///< Device is a GPU + UCL_ACCELERATOR ///< Device is an Accelerator +}; + +enum UCL_ERROR_FLAG { + UCL_SUCCESS, ///< No error + UCL_ERROR, ///< Unqualified error + UCL_FILE_NOT_FOUND, ///< File not found + UCL_FUNCTION_NOT_FOUND, ///< Kernel function not found + UCL_COMPILE_ERROR, ///< Error compiling kernel + UCL_MEMORY_ERROR +}; + +template +const char * ucl_template_name() { return _UCL_DATA_ID::name(); } + +#endif + diff --git a/lib/gpu/lj96_cut_gpu.cpp b/lib/gpu/lj96_cut_gpu.cpp new file mode 100644 index 0000000000..eba26018e4 --- /dev/null +++ b/lib/gpu/lj96_cut_gpu.cpp @@ -0,0 +1,123 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. 
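The ucl_types.h additions above use a compile-time trait to map a C++ type to a numeric id and a printable name, with an error_type fallback for anything unlisted (the real header additionally emits a "-D NUMTYP=..." flag for run-time kernel compilation). A small self-contained sketch of the same idiom; data_id and template_name are illustrative names mirroring _UCL_DATA_ID and ucl_template_name.

    #include <cstdio>

    // Fallback for unsupported types, then one specialization per supported type.
    template <class T> struct data_id {
      enum { id = 0 };
      static const char *name() { return "error_type"; }
    };
    template <> struct data_id<double> {
      enum { id = 1 };
      static const char *name() { return "double"; }
    };
    template <> struct data_id<float> {
      enum { id = 2 };
      static const char *name() { return "float"; }
    };

    template <class T>
    const char *template_name() { return data_id<T>::name(); }

    int main() {
      std::printf("%s %d\n", template_name<float>(), int(data_id<float>::id));   // float 2
      std::printf("%s %d\n", template_name<char *>(), int(data_id<char *>::id)); // error_type 0
      return 0;
    }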
+------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing authors: Mike Brown (ORNL), brownw@ornl.gov +------------------------------------------------------------------------- */ + +#include +#include +#include + +#include "lj96_cut_gpu_memory.h" + +using namespace std; + +static LJ96_GPU_Memory LJ96MF; + +// --------------------------------------------------------------------------- +// Allocate memory on host and device and copy constants to device +// --------------------------------------------------------------------------- +bool lj96_gpu_init(const int ntypes, double **cutsq, double **host_lj1, + double **host_lj2, double **host_lj3, double **host_lj4, + double **offset, double *special_lj, const int inum, + const int nall, const int max_nbors, const int maxspecial, + const double cell_size, int &gpu_mode, FILE *screen) { + LJ96MF.clear(); + gpu_mode=LJ96MF.device->gpu_mode(); + double gpu_split=LJ96MF.device->particle_split(); + int first_gpu=LJ96MF.device->first_device(); + int last_gpu=LJ96MF.device->last_device(); + int world_me=LJ96MF.device->world_me(); + int gpu_rank=LJ96MF.device->gpu_rank(); + int procs_per_gpu=LJ96MF.device->procs_per_gpu(); + + LJ96MF.device->init_message(screen,"lj96/cut",first_gpu,last_gpu); + + bool message=false; + if (world_me==0 && screen) + message=true; + + if (message) { + fprintf(screen,"Initializing GPU and compiling on process 0..."); + fflush(screen); + } + + if (world_me==0) { + bool init_ok=LJ96MF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, + host_lj4, offset, special_lj, inum, nall, 300, + maxspecial, cell_size, gpu_split, screen); + if (!init_ok) + return false; + } + + MPI_Barrier(MPI_COMM_WORLD); + if (message) + fprintf(screen,"Done.\n"); + + for (int i=0; igpu_comm); + if (message) + fprintf(screen,"Done.\n"); + } + if (message) + fprintf(screen,"\n"); + return true; +} + +void lj96_gpu_clear() { + LJ96MF.clear(); +} + +int * lj96_gpu_compute_n(const int timestep, const int ago, const int inum_full, + const int nall, double **host_x, int *host_type, + double *boxlo, double *boxhi, int *tag, int **nspecial, + int **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success) { + return LJ96MF.compute(timestep, ago, inum_full, nall, host_x, host_type, boxlo, + boxhi, tag, nspecial, special, eflag, vflag, eatom, + vatom, host_start, cpu_time, success); +} + +void lj96_gpu_compute(const int timestep, const int ago, const int inum_full, + const int nall, double **host_x, int *host_type, + int *ilist, int *numj, int **firstneigh, + const bool eflag, const bool vflag, const bool eatom, + const bool vatom, int &host_start, const double cpu_time, + bool &success) { + LJ96MF.compute(timestep,ago,inum_full,nall,host_x,host_type,ilist,numj, + firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success); +} + +double lj96_gpu_bytes() { + return LJ96MF.host_memory_usage(); +} + + diff --git a/lib/gpu/lj96_cut_gpu_kernel.cu b/lib/gpu/lj96_cut_gpu_kernel.cu new file mode 100644 index 0000000000..a1faec8f24 --- /dev/null +++ b/lib/gpu/lj96_cut_gpu_kernel.cu @@ -0,0 +1,281 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. 
Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing authors: Mike Brown (ORNL), brownw@ornl.gov +------------------------------------------------------------------------- */ + +#ifndef LJ96_GPU_KERNEL +#define LJ96_GPU_KERNEL + +#define MAX_SHARED_TYPES 8 + +#ifdef _DOUBLE_DOUBLE +#define numtyp double +#define numtyp2 double2 +#define numtyp4 double4 +#define acctyp double +#define acctyp4 double4 +#endif + +#ifdef _SINGLE_DOUBLE +#define numtyp float +#define numtyp2 float2 +#define numtyp4 float4 +#define acctyp double +#define acctyp4 double4 +#endif + +#ifndef numtyp +#define numtyp float +#define numtyp2 float2 +#define numtyp4 float4 +#define acctyp float +#define acctyp4 float4 +#endif + +#ifdef NV_KERNEL + +#include "geryon/ucl_nv_kernel.h" +texture pos_tex; + +#ifdef _DOUBLE_DOUBLE +__inline double4 fetch_pos(const int& i, const double4 *pos) +{ + return pos[i]; +} +#else +__inline float4 fetch_pos(const int& i, const float4 *pos) +{ + return tex1Dfetch(pos_tex, i); +} +#endif + +#else + +#pragma OPENCL EXTENSION cl_khr_fp64: enable +#define GLOBAL_ID_X get_global_id(0) +#define THREAD_ID_X get_local_id(0) +#define BLOCK_ID_X get_group_id(0) +#define BLOCK_SIZE_X get_local_size(0) +#define __syncthreads() barrier(CLK_LOCAL_MEM_FENCE) +#define __inline inline + +#define fetch_pos(i,y) x_[i] + +#endif + +__kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1, + __global numtyp4* lj3, const int lj_types, + __global numtyp *sp_lj_in, __global int *dev_nbor, + __global acctyp4 *ans, __global acctyp *engv, + const int eflag, const int vflag, const int inum, + const int nall, const int nbor_pitch) { + // ii indexes the two interacting particles in gi + int ii=GLOBAL_ID_X; + __local numtyp sp_lj[4]; + sp_lj[0]=sp_lj_in[0]; + sp_lj[1]=sp_lj_in[1]; + sp_lj[2]=sp_lj_in[2]; + sp_lj[3]=sp_lj_in[3]; + + if (ii0) { + numtyp e=r6inv*(lj3[mtype].x*r3inv-lj3[mtype].y); + energy+=factor_lj*(e-lj3[mtype].z); + } + if (vflag>0) { + virial[0] += delx*delx*force; + virial[1] += dely*dely*force; + virial[2] += delz*delz*force; + virial[3] += delx*dely*force; + virial[4] += delx*delz*force; + virial[5] += dely*delz*force; + } + } + + } // for nbor + + // Store answers + __global acctyp *ap1=engv+ii; + if (eflag>0) { + *ap1=energy; + ap1+=inum; + } + if (vflag>0) { + for (int i=0; i<6; i++) { + *ap1=virial[i]; + ap1+=inum; + } + } + ans[ii]=f; + } // if ii +} + +__kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in, + __global numtyp4* lj3_in, + __global numtyp* sp_lj_in, __global int *dev_nbor, + __global acctyp4 *ans, __global acctyp *engv, + const int eflag, const int vflag, const int inum, + const int nall, const int nbor_pitch) { + // ii indexes the two interacting particles in gi + int ii=THREAD_ID_X; + __local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; + __local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; + __local numtyp sp_lj[4]; + if (ii<4) + sp_lj[ii]=sp_lj_in[ii]; + if (ii0) + lj3[ii]=lj3_in[ii]; + } + ii+=mul24((int)BLOCK_ID_X,(int)BLOCK_SIZE_X); + __syncthreads(); + + if (ii0) { + numtyp e=r6inv*(lj3[mtype].x*r3inv-lj3[mtype].y); + energy+=factor_lj*(e-lj3[mtype].z); + 
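In the "Store answers" blocks of these kernels, the per-atom energy and the six virial components all go into one engv buffer with a stride of inum: the energy block (when eflag is set) comes first, then one block per virial component (when vflag is set). A host-side sketch of unpacking that layout is below; reduce_engv is a name invented for this note, and the one-half factor is an assumption carried over from the full-neighbor-list host code later in this patch, where every pair is visited from both of its atoms.

    // Reduce a packed engv buffer laid out as
    //   [energy x inum][virial0 x inum]...[virial5 x inum]
    // where the energy block exists only if eflag and the virial blocks only if vflag.
    void reduce_engv(const double *engv, int inum, bool eflag, bool vflag,
                     double &evdwl, double virial[6]) {
      const double *p = engv;
      if (eflag) {
        evdwl = 0.0;
        for (int i = 0; i < inum; i++) evdwl += p[i];
        evdwl *= 0.5;                      // assumes pairs counted from both atoms
        p += inum;
      }
      if (vflag) {
        for (int k = 0; k < 6; k++) {
          virial[k] = 0.0;
          for (int i = 0; i < inum; i++) virial[k] += p[i];
          virial[k] *= 0.5;                // same full-list assumption
          p += inum;
        }
      }
    }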
} + if (vflag>0) { + virial[0] += delx*delx*force; + virial[1] += dely*dely*force; + virial[2] += delz*delz*force; + virial[3] += delx*dely*force; + virial[4] += delx*delz*force; + virial[5] += dely*delz*force; + } + } + + } // for nbor + + // Store answers + __global acctyp *ap1=engv+ii; + if (eflag>0) { + *ap1=energy; + ap1+=inum; + } + if (vflag>0) { + for (int i=0; i<6; i++) { + *ap1=virial[i]; + ap1+=inum; + } + } + ans[ii]=f; + } // if ii*/ +} + +#endif + diff --git a/lib/gpu/lj96_cut_gpu_memory.cpp b/lib/gpu/lj96_cut_gpu_memory.cpp new file mode 100644 index 0000000000..0078e1ecf3 --- /dev/null +++ b/lib/gpu/lj96_cut_gpu_memory.cpp @@ -0,0 +1,150 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing authors: Mike Brown (ORNL), brownw@ornl.gov +------------------------------------------------------------------------- */ + +#ifdef USE_OPENCL +#include "lj96_cut_gpu_cl.h" +#else +#include "lj96_cut_gpu_ptx.h" +#endif + +#include "lj96_cut_gpu_memory.h" +#include +#define LJ96_GPU_MemoryT LJ96_GPU_Memory + +extern PairGPUDevice pair_gpu_device; + +template +LJ96_GPU_MemoryT::LJ96_GPU_Memory() : AtomicGPUMemory(), _allocated(false) { +} + +template +LJ96_GPU_MemoryT::~LJ96_GPU_Memory() { + clear(); +} + +template +int LJ96_GPU_MemoryT::bytes_per_atom(const int max_nbors) const { + return this->bytes_per_atom_atomic(max_nbors); +} + +template +bool LJ96_GPU_MemoryT::init(const int ntypes, + double **host_cutsq, double **host_lj1, + double **host_lj2, double **host_lj3, + double **host_lj4, double **host_offset, + double *host_special_lj, const int nlocal, + const int nall, const int max_nbors, + const int maxspecial, const double cell_size, + const double gpu_split, FILE *_screen) { + this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split, + _screen,lj96_cut_gpu_kernel); + + // If atom type constants fit in shared memory use fast kernel + int lj_types=ntypes; + shared_types=false; + if (lj_types<=MAX_SHARED_TYPES && this->_block_size>=MAX_SHARED_TYPES) { + lj_types=MAX_SHARED_TYPES; + shared_types=true; + } + _lj_types=lj_types; + + // Allocate a host write buffer for data initialization + UCL_H_Vec host_write(lj_types*lj_types*32,*(this->ucl_device), + UCL_WRITE_OPTIMIZED); + + for (int i=0; iucl_device),UCL_READ_ONLY); + this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_lj1,host_lj2, + host_cutsq); + + lj3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); + this->atom->type_pack4(ntypes,lj_types,lj3,host_write,host_lj3,host_lj4, + host_offset); + + UCL_H_Vec dview; + sp_lj.alloc(4,*(this->ucl_device),UCL_READ_ONLY); + dview.view(host_special_lj,4,*(this->ucl_device)); + ucl_copy(sp_lj,dview,false); + + _allocated=true; + this->_max_bytes=lj1.row_bytes()+lj3.row_bytes()+sp_lj.row_bytes(); + return true; +} + +template +void LJ96_GPU_MemoryT::clear() { + if (!_allocated) + return; + _allocated=false; + + 
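The init() routine in lj96_cut_gpu_memory.cpp promotes the type count to MAX_SHARED_TYPES whenever the real number of atom types fits and the thread block is large enough, so the fast kernel can keep the whole coefficient table in shared memory and index it with a fixed stride. A rough host-side sketch of that decision and the padded flat indexing it enables; pack_types and TypeTables are made-up names, whereas the patch itself goes through type_pack4 and UCL containers.

    #include <vector>

    const int MAX_SHARED_TYPES = 8;     // matches the kernel-side constant in this patch

    struct TypeTables {
      int stride;                       // padded row length used for flat indexing
      bool shared_types;                // true -> the fast shared-memory kernel is usable
      std::vector<float> cutsq;         // stride*stride entries, row-major
    };

    TypeTables pack_types(int ntypes, double **host_cutsq, int block_size) {
      TypeTables t;
      t.stride = ntypes;
      t.shared_types = false;
      // Pad only if the table fits and the block has enough threads to load it.
      if (ntypes <= MAX_SHARED_TYPES && block_size >= MAX_SHARED_TYPES) {
        t.stride = MAX_SHARED_TYPES;
        t.shared_types = true;
      }
      t.cutsq.assign(t.stride * t.stride, 0.0f);
      for (int i = 0; i < ntypes; i++)
        for (int j = 0; j < ntypes; j++)
          t.cutsq[i * t.stride + j] = static_cast<float>(host_cutsq[i][j]);
      return t;
    }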
lj1.clear(); + lj3.clear(); + sp_lj.clear(); + this->clear_atomic(); +} + +template +double LJ96_GPU_MemoryT::host_memory_usage() const { + return this->host_memory_usage_atomic()+sizeof(LJ96_GPU_Memory); +} + +// --------------------------------------------------------------------------- +// Calculate energies, forces, and torques +// --------------------------------------------------------------------------- +template +void LJ96_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) { + // Compute the block size and grid size to keep all cores busy + const int BX=this->block_size(); + int eflag, vflag; + if (_eflag) + eflag=1; + else + eflag=0; + + if (_vflag) + vflag=1; + else + vflag=0; + + int GX=static_cast(ceil(static_cast(this->atom->inum())/BX)); + + int ainum=this->atom->inum(); + int anall=this->atom->nall(); + int nbor_pitch=this->nbor->nbor_pitch(); + this->time_pair.start(); + if (shared_types) { + this->k_pair_fast.set_size(GX,BX); + this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(), + &lj3.begin(), &sp_lj.begin(), + &this->nbor->dev_nbor.begin(), + &this->atom->dev_ans.begin(), + &this->atom->dev_engv.begin(), &eflag, &vflag, + &ainum, &anall, &nbor_pitch); + } else { + this->k_pair.set_size(GX,BX); + this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(), + &_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(), + &this->atom->dev_ans.begin(), + &this->atom->dev_engv.begin(), &eflag, &vflag, &ainum, + &anall, &nbor_pitch); + } + this->time_pair.stop(); +} + +template class LJ96_GPU_Memory; diff --git a/lib/gpu/lj96_cut_gpu_memory.h b/lib/gpu/lj96_cut_gpu_memory.h new file mode 100644 index 0000000000..214a951c76 --- /dev/null +++ b/lib/gpu/lj96_cut_gpu_memory.h @@ -0,0 +1,71 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. 
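The loop() routine above sizes the launch grid so each local atom gets one thread, GX = ceil(inum / BX), via a float cast; the same ceiling can be done in integer arithmetic, with the kernels' "if (ii < inum)" guard absorbing the padded tail. A one-line sketch, where grid_size is a hypothetical helper rather than anything in the patch:

    // One thread per local atom: grid size is the ceiling of inum / block_size.
    inline int grid_size(int inum, int block_size) {
      return (inum + block_size - 1) / block_size;   // integer ceiling, no FP round-off
    }
    // e.g. grid_size(1000, 64) == 16 -> 1024 threads cover 1000 atoms.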
+------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing authors: Mike Brown (ORNL), brownw@ornl.gov +------------------------------------------------------------------------- */ + +#ifndef LJ96_GPU_MEMORY_H +#define LJ96_GPU_MEMORY_H + +#include "atomic_gpu_memory.h" + +template +class LJ96_GPU_Memory : public AtomicGPUMemory { + public: + LJ96_GPU_Memory(); + ~LJ96_GPU_Memory(); + + /// Clear any previous data and set up for a new LAMMPS run + /** \param max_nbors initial number of rows in the neighbor matrix + * \param cell_size cutoff + skin + * \param gpu_split fraction of particles handled by device **/ + bool init(const int ntypes, double **host_cutsq, double **host_lj1, + double **host_lj2, double **host_lj3, double **host_lj4, + double **host_offset, double *host_special_lj, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, + const double gpu_split, FILE *screen); + + /// Clear all host and device data + /** \note This is called at the beginning of the init() routine **/ + void clear(); + + /// Returns memory usage on device per atom + int bytes_per_atom(const int max_nbors) const; + + /// Total host memory used by library for pair style + double host_memory_usage() const; + + // --------------------------- TYPE DATA -------------------------- + + /// lj1.x = lj1, lj1.y = lj2, lj1.z = cutsq + UCL_D_Vec lj1; + /// lj3.x = lj3, lj3.y = lj4, lj3.z = offset + UCL_D_Vec lj3; + /// Special LJ values + UCL_D_Vec sp_lj; + + /// If atom type constants fit in shared memory, use fast kernels + bool shared_types; + + /// Number of atom types + int _lj_types; + + private: + bool _allocated; + void loop(const bool _eflag, const bool _vflag); +}; + +#endif + diff --git a/lib/gpu/lj_cut_gpu.cpp b/lib/gpu/lj_cut_gpu.cpp new file mode 100644 index 0000000000..55454022f7 --- /dev/null +++ b/lib/gpu/lj_cut_gpu.cpp @@ -0,0 +1,124 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. 
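As the header comments in lj96_cut_gpu_memory.h state, three per-type-pair tables share each float4 (lj1/lj2/cutsq in lj1, and lj3/lj4/offset in lj3), so the cutoff test and force evaluation cost the kernel one vector load per pair type instead of three scalar loads. A host-side sketch of that packing, standing in for the atom->type_pack4 call; float4h and pack4 are illustrative names only.

    #include <vector>

    struct float4h { float x, y, z, w; };   // host stand-in for the device float4

    // Pack lj1, lj2 and cutsq for every (i,j) type pair into one vector each.
    std::vector<float4h> pack4(int ntypes, int stride,
                               double **lj1, double **lj2, double **cutsq) {
      std::vector<float4h> out(stride * stride, float4h{0.f, 0.f, 0.f, 0.f});
      for (int i = 0; i < ntypes; i++)
        for (int j = 0; j < ntypes; j++) {
          float4h &v = out[i * stride + j];
          v.x = static_cast<float>(lj1[i][j]);
          v.y = static_cast<float>(lj2[i][j]);
          v.z = static_cast<float>(cutsq[i][j]);
        }
      return out;
    }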
+------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing authors: Mike Brown (ORNL), brownw@ornl.gov +------------------------------------------------------------------------- */ + +#include +#include +#include + +#include "lj_cut_gpu_memory.h" + +using namespace std; + +static LJL_GPU_Memory LJLMF; + +// --------------------------------------------------------------------------- +// Allocate memory on host and device and copy constants to device +// --------------------------------------------------------------------------- +bool ljl_gpu_init(const int ntypes, double **cutsq, + double **host_lj1, double **host_lj2, double **host_lj3, + double **host_lj4, double **offset, double *special_lj, + const int inum, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, int &gpu_mode, + FILE *screen) { + LJLMF.clear(); + gpu_mode=LJLMF.device->gpu_mode(); + double gpu_split=LJLMF.device->particle_split(); + int first_gpu=LJLMF.device->first_device(); + int last_gpu=LJLMF.device->last_device(); + int world_me=LJLMF.device->world_me(); + int gpu_rank=LJLMF.device->gpu_rank(); + int procs_per_gpu=LJLMF.device->procs_per_gpu(); + + LJLMF.device->init_message(screen,"lj/cut",first_gpu,last_gpu); + + bool message=false; + if (world_me==0 && screen) + message=true; + + if (message) { + fprintf(screen,"Initializing GPU and compiling on process 0..."); + fflush(screen); + } + + if (world_me==0) { + bool init_ok=LJLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, + host_lj4, offset, special_lj, inum, nall, 300, + maxspecial, cell_size, gpu_split, screen); + if (!init_ok) + return false; + } + + MPI_Barrier(MPI_COMM_WORLD); + if (message) + fprintf(screen,"Done.\n"); + + for (int i=0; igpu_comm); + if (message) + fprintf(screen,"Done.\n"); + } + if (message) + fprintf(screen,"\n"); + return true; +} + +void ljl_gpu_clear() { + LJLMF.clear(); +} + +int * ljl_gpu_compute_n(const int timestep, const int ago, const int inum_full, + const int nall, double **host_x, int *host_type, + double *boxlo, double *boxhi, int *tag, int **nspecial, + int **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success) { + return LJLMF.compute(timestep, ago, inum_full, nall, host_x, host_type, boxlo, + boxhi, tag, nspecial, special, eflag, vflag, eatom, + vatom, host_start, cpu_time, success); +} + +void ljl_gpu_compute(const int timestep, const int ago, const int inum_full, + const int nall, double **host_x, int *host_type, + int *ilist, int *numj, int **firstneigh, + const bool eflag, const bool vflag, const bool eatom, + const bool vatom, int &host_start, const double cpu_time, + bool &success) { + LJLMF.compute(timestep,ago,inum_full,nall,host_x,host_type,ilist,numj, + firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success); +} + +double ljl_gpu_bytes() { + return LJLMF.host_memory_usage(); +} + + diff --git a/lib/gpu/lj_cut_gpu_kernel.cu b/lib/gpu/lj_cut_gpu_kernel.cu new file mode 100644 index 0000000000..5c784084c4 --- /dev/null +++ b/lib/gpu/lj_cut_gpu_kernel.cu @@ -0,0 +1,279 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. 
Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing authors: Mike Brown (ORNL), brownw@ornl.gov +------------------------------------------------------------------------- */ + +#ifndef LJ_GPU_KERNEL +#define LJ_GPU_KERNEL + +#define MAX_SHARED_TYPES 8 + +#ifdef _DOUBLE_DOUBLE +#define numtyp double +#define numtyp2 double2 +#define numtyp4 double4 +#define acctyp double +#define acctyp4 double4 +#endif + +#ifdef _SINGLE_DOUBLE +#define numtyp float +#define numtyp2 float2 +#define numtyp4 float4 +#define acctyp double +#define acctyp4 double4 +#endif + +#ifndef numtyp +#define numtyp float +#define numtyp2 float2 +#define numtyp4 float4 +#define acctyp float +#define acctyp4 float4 +#endif + +#ifdef NV_KERNEL + +#include "geryon/ucl_nv_kernel.h" +texture pos_tex; + +#ifdef _DOUBLE_DOUBLE +__inline double4 fetch_pos(const int& i, const double4 *pos) +{ + return pos[i]; +} +#else +__inline float4 fetch_pos(const int& i, const float4 *pos) +{ + return tex1Dfetch(pos_tex, i); +} +#endif + +#else + +#pragma OPENCL EXTENSION cl_khr_fp64: enable +#define GLOBAL_ID_X get_global_id(0) +#define THREAD_ID_X get_local_id(0) +#define BLOCK_ID_X get_group_id(0) +#define BLOCK_SIZE_X get_local_size(0) +#define __syncthreads() barrier(CLK_LOCAL_MEM_FENCE) +#define __inline inline + +#define fetch_pos(i,y) x_[i] + +#endif + +__kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1, + __global numtyp4* lj3, const int lj_types, + __global numtyp *sp_lj_in, __global int *dev_nbor, + __global acctyp4 *ans, __global acctyp *engv, + const int eflag, const int vflag, const int inum, + const int nall, const int nbor_pitch) { + // ii indexes the two interacting particles in gi + int ii=GLOBAL_ID_X; + __local numtyp sp_lj[4]; + sp_lj[0]=sp_lj_in[0]; + sp_lj[1]=sp_lj_in[1]; + sp_lj[2]=sp_lj_in[2]; + sp_lj[3]=sp_lj_in[3]; + + if (ii0) { + numtyp e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y); + energy+=factor_lj*(e-lj3[mtype].z); + } + if (vflag>0) { + virial[0] += delx*delx*force; + virial[1] += dely*dely*force; + virial[2] += delz*delz*force; + virial[3] += delx*dely*force; + virial[4] += delx*delz*force; + virial[5] += dely*delz*force; + } + } + + } // for nbor + + // Store answers + __global acctyp *ap1=engv+ii; + if (eflag>0) { + *ap1=energy; + ap1+=inum; + } + if (vflag>0) { + for (int i=0; i<6; i++) { + *ap1=virial[i]; + ap1+=inum; + } + } + ans[ii]=f; + } // if ii +} + +__kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in, + __global numtyp4* lj3_in, + __global numtyp* sp_lj_in, __global int *dev_nbor, + __global acctyp4 *ans, __global acctyp *engv, + const int eflag, const int vflag, const int inum, + const int nall, const int nbor_pitch) { + // ii indexes the two interacting particles in gi + int ii=THREAD_ID_X; + __local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; + __local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; + __local numtyp sp_lj[4]; + if (ii<4) + sp_lj[ii]=sp_lj_in[ii]; + if (ii0) + lj3[ii]=lj3_in[ii]; + } + ii+=mul24((int)BLOCK_ID_X,(int)BLOCK_SIZE_X); + __syncthreads(); + + if (ii0) { + numtyp e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y); + energy+=factor_lj*(e-lj3[mtype].z); + } + 
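In these kernels the scalar prefactor named force is F/r, so each component of the pair force is the separation vector scaled by it, and the six independent components of the symmetric per-atom virial accumulate as products of the separation components with that prefactor. The same arithmetic written as a scalar reference routine; accumulate_virial is a name invented for this note.

    // fpair is force/r; the virial is stored as xx, yy, zz, xy, xz, yz.
    inline void accumulate_virial(double delx, double dely, double delz,
                                  double fpair, double virial[6]) {
      virial[0] += delx * delx * fpair;   // xx
      virial[1] += dely * dely * fpair;   // yy
      virial[2] += delz * delz * fpair;   // zz
      virial[3] += delx * dely * fpair;   // xy
      virial[4] += delx * delz * fpair;   // xz
      virial[5] += dely * delz * fpair;   // yz
    }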
if (vflag>0) { + virial[0] += delx*delx*force; + virial[1] += dely*dely*force; + virial[2] += delz*delz*force; + virial[3] += delx*dely*force; + virial[4] += delx*delz*force; + virial[5] += dely*delz*force; + } + } + + } // for nbor + + // Store answers + __global acctyp *ap1=engv+ii; + if (eflag>0) { + *ap1=energy; + ap1+=inum; + } + if (vflag>0) { + for (int i=0; i<6; i++) { + *ap1=virial[i]; + ap1+=inum; + } + } + ans[ii]=f; + } // if ii*/ +} + +#endif + diff --git a/lib/gpu/lj_cut_gpu_memory.cpp b/lib/gpu/lj_cut_gpu_memory.cpp new file mode 100644 index 0000000000..deb8b264c0 --- /dev/null +++ b/lib/gpu/lj_cut_gpu_memory.cpp @@ -0,0 +1,150 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing authors: Mike Brown (ORNL), brownw@ornl.gov +------------------------------------------------------------------------- */ + +#ifdef USE_OPENCL +#include "lj_cut_gpu_cl.h" +#else +#include "lj_cut_gpu_ptx.h" +#endif + +#include "lj_cut_gpu_memory.h" +#include +#define LJL_GPU_MemoryT LJL_GPU_Memory + +extern PairGPUDevice pair_gpu_device; + +template +LJL_GPU_MemoryT::LJL_GPU_Memory() : AtomicGPUMemory(), _allocated(false) { +} + +template +LJL_GPU_MemoryT::~LJL_GPU_Memory() { + clear(); +} + +template +int LJL_GPU_MemoryT::bytes_per_atom(const int max_nbors) const { + return this->bytes_per_atom_atomic(max_nbors); +} + +template +bool LJL_GPU_MemoryT::init(const int ntypes, + double **host_cutsq, double **host_lj1, + double **host_lj2, double **host_lj3, + double **host_lj4, double **host_offset, + double *host_special_lj, const int nlocal, + const int nall, const int max_nbors, + const int maxspecial, const double cell_size, + const double gpu_split, FILE *_screen) { + this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split, + _screen,lj_cut_gpu_kernel); + + // If atom type constants fit in shared memory use fast kernel + int lj_types=ntypes; + shared_types=false; + if (lj_types<=MAX_SHARED_TYPES && this->_block_size>=MAX_SHARED_TYPES) { + lj_types=MAX_SHARED_TYPES; + shared_types=true; + } + _lj_types=lj_types; + + // Allocate a host write buffer for data initialization + UCL_H_Vec host_write(lj_types*lj_types*32,*(this->ucl_device), + UCL_WRITE_OPTIMIZED); + + for (int i=0; iucl_device),UCL_READ_ONLY); + this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_lj1,host_lj2, + host_cutsq); + + lj3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); + this->atom->type_pack4(ntypes,lj_types,lj3,host_write,host_lj3,host_lj4, + host_offset); + + UCL_H_Vec dview; + sp_lj.alloc(4,*(this->ucl_device),UCL_READ_ONLY); + dview.view(host_special_lj,4,*(this->ucl_device)); + ucl_copy(sp_lj,dview,false); + + _allocated=true; + this->_max_bytes=lj1.row_bytes()+lj3.row_bytes()+sp_lj.row_bytes(); + return true; +} + +template +void LJL_GPU_MemoryT::clear() { + if (!_allocated) + return; + _allocated=false; + + lj1.clear(); + lj3.clear(); 
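The clear() methods of these memory classes are deliberately safe to call at any time: the pair wrappers call clear() before init(), init() itself may be re-entered, and the destructor calls clear() again, with the _allocated flag making the teardown idempotent. A minimal sketch of that guard pattern; PairData is illustrative only.

    struct PairData {
      bool allocated = false;
      void init()  { clear(); /* (re)allocate device buffers here */ allocated = true; }
      void clear() {
        if (!allocated) return;   // harmless before init() or when called twice
        allocated = false;
        // free device buffers here
      }
      ~PairData() { clear(); }
    };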
+ sp_lj.clear(); + this->clear_atomic(); +} + +template +double LJL_GPU_MemoryT::host_memory_usage() const { + return this->host_memory_usage_atomic()+sizeof(LJL_GPU_Memory); +} + +// --------------------------------------------------------------------------- +// Calculate energies, forces, and torques +// --------------------------------------------------------------------------- +template +void LJL_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) { + // Compute the block size and grid size to keep all cores busy + const int BX=this->block_size(); + int eflag, vflag; + if (_eflag) + eflag=1; + else + eflag=0; + + if (_vflag) + vflag=1; + else + vflag=0; + + int GX=static_cast(ceil(static_cast(this->atom->inum())/BX)); + + int ainum=this->atom->inum(); + int anall=this->atom->nall(); + int nbor_pitch=this->nbor->nbor_pitch(); + this->time_pair.start(); + if (shared_types) { + this->k_pair_fast.set_size(GX,BX); + this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(), + &lj3.begin(), &sp_lj.begin(), + &this->nbor->dev_nbor.begin(), + &this->atom->dev_ans.begin(), + &this->atom->dev_engv.begin(), &eflag, &vflag, + &ainum, &anall, &nbor_pitch); + } else { + this->k_pair.set_size(GX,BX); + this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(), + &_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(), + &this->atom->dev_ans.begin(), + &this->atom->dev_engv.begin(), &eflag, &vflag, &ainum, + &anall, &nbor_pitch); + } + this->time_pair.stop(); +} + +template class LJL_GPU_Memory; diff --git a/lib/gpu/lj_cut_gpu_memory.h b/lib/gpu/lj_cut_gpu_memory.h new file mode 100644 index 0000000000..b03486bda2 --- /dev/null +++ b/lib/gpu/lj_cut_gpu_memory.h @@ -0,0 +1,71 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. 
+------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing authors: Mike Brown (ORNL), brownw@ornl.gov +------------------------------------------------------------------------- */ + +#ifndef LJL_GPU_MEMORY_H +#define LJL_GPU_MEMORY_H + +#include "atomic_gpu_memory.h" + +template +class LJL_GPU_Memory : public AtomicGPUMemory { + public: + LJL_GPU_Memory(); + ~LJL_GPU_Memory(); + + /// Clear any previous data and set up for a new LAMMPS run + /** \param max_nbors initial number of rows in the neighbor matrix + * \param cell_size cutoff + skin + * \param gpu_split fraction of particles handled by device **/ + bool init(const int ntypes, double **host_cutsq, + double **host_lj1, double **host_lj2, double **host_lj3, + double **host_lj4, double **host_offset, double *host_special_lj, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, + const double gpu_split, FILE *screen); + + /// Clear all host and device data + /** \note This is called at the beginning of the init() routine **/ + void clear(); + + /// Returns memory usage on device per atom + int bytes_per_atom(const int max_nbors) const; + + /// Total host memory used by library for pair style + double host_memory_usage() const; + + // --------------------------- TYPE DATA -------------------------- + + /// lj1.x = lj1, lj1.y = lj2, lj1.z = cutsq + UCL_D_Vec lj1; + /// lj3.x = lj3, lj3.y = lj4, lj3.z = offset + UCL_D_Vec lj3; + /// Special LJ values + UCL_D_Vec sp_lj; + + /// If atom type constants fit in shared memory, use fast kernels + bool shared_types; + + /// Number of atom types + int _lj_types; + + private: + bool _allocated; + void loop(const bool _eflag, const bool _vflag); +}; + +#endif + diff --git a/lib/gpu/lj_gpu.cu b/lib/gpu/lj_gpu.cu index 75fb6ad010..6ba89ba842 100644 --- a/lib/gpu/lj_gpu.cu +++ b/lib/gpu/lj_gpu.cu @@ -16,206 +16,270 @@ Peng Wang (Nvidia), penwang@nvidia.com Paul Crozier (SNL), pscrozi@sandia.gov ------------------------------------------------------------------------- */ - #include #include -#include "nvc_macros.h" -#include "nvc_timer.h" -#include "nvc_device.h" -#include "pair_gpu_texture.h" -#include "pair_gpu_cell.h" -#include "lj_gpu_memory.cu" +#include +#include "cudatimer.h" +#include "lj_tex.h" +#include "neigh.h" +#include "cell.h" #include "lj_gpu_kernel.h" +#ifdef WINDLL +#define EXTERN extern "C" __declspec(dllexport) +#else +#define EXTERN +#endif +static float h_boxlo[3], h_boxhi[3]; +static float cell_size; +static float *energy = NULL, *d_energy = NULL; +static float3 *d_force = NULL, *f_temp = NULL, *v_temp = NULL, *d_virial = NULL; +static float4 *d_pos = NULL, *temp_pos = NULL; +static int *d_type = NULL; +static int ncellx, ncelly, ncellz; -static LJ_GPU_Memory LJMF; -#define LJMT LJ_GPU_Memory +static neigh_list_gpu d_neigh_list; +static cell_list_gpu d_cell_list; - - -// --------------------------------------------------------------------------- -// Convert something to a string -// --------------------------------------------------------------------------- -#include - -template -inline string lj_gpu_toa(const t& in) { - ostringstream o; - o.precision(2); - o << in; - return o.str(); -} +#define TIMING(x) // --------------------------------------------------------------------------- // Return string with GPU info // --------------------------------------------------------------------------- -EXTERN void 
lj_gpu_name(const int id, const int max_nbors, char * name) { - string sname=LJMF.gpu.name(id)+", "+ - lj_gpu_toa(LJMF.gpu.cores(id))+" cores, "+ - lj_gpu_toa(LJMF.gpu.gigabytes(id))+" GB, "+ - lj_gpu_toa(LJMF.gpu.clock_rate(id))+" GHZ"; - strcpy(name,sname.c_str()); +EXTERN void lj_gpu_name(const int id, const int max_nbors, char * name) +{ + struct cudaDeviceProp prop; + CUDA_SAFE_CALL( cudaGetDeviceProperties(&prop, id) ); +#ifdef _WIN32 + strcpy_s(name, strlen(prop.name)+1, prop.name); +#else + strncpy(name, prop.name, strlen(prop.name)+1); +#endif } -static bool _pc_cell_alloc; +// --------------------------------------------------------------------------- +// Allocate memory on host and device and copy constants to device +// --------------------------------------------------------------------------- +EXTERN bool lj_gpu_init(int &ij_size, const int ntypes, + double **cutsq,double **sigma, + double **epsilon, double **host_lj1, double **host_lj2, + double **host_lj3, double **host_lj4, double **offset, + double *special_lj, double *boxlo, double *boxhi, + double cellsize, double skin, + const int max_nbors, const int gpu_id) +{ + int num_devices; -inline void _lj_gpu_clear() { - if (_pc_cell_alloc) { - free(energy); - free(v_temp); - cudaFreeHost(f_temp); - cudaFree(d_force); - cudaFree(d_energy); - cudaFree(d_virial); - clear_cell_list(cell_list_gpu); - _pc_cell_alloc=false; + /* get device count */ + CUDA_SAFE_CALL( cudaGetDeviceCount(&num_devices) ); + if (num_devices == 0) { + printf("NO CUDA-capable GPU detected.\n"); + exit(1); } + + if (gpu_id > num_devices) { + printf("gpu_id %d is larger than the number of GPUs %d\n", + gpu_id, num_devices); + exit(1); + } + + /* set CUDA device to the specified GPU */ + cudaThreadExit(); + CUDA_SAFE_CALL( cudaSetDevice(gpu_id) ); + + ij_size=0; + + cell_size = cellsize; + ncellx = ceil(((boxhi[0] - boxlo[0]) + 2.0*cell_size) / cell_size); + ncelly = ceil(((boxhi[1] - boxlo[1]) + 2.0*cell_size) / cell_size); + ncellz = ceil(((boxhi[2] - boxlo[2]) + 2.0*cell_size) / cell_size); + + for (int i = 0; i < 3; i++) { + h_boxhi[i] = boxhi[i]; + h_boxlo[i] = boxlo[i]; + } + + init_force_const(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, offset); + + init_cell_list_const(cellsize, skin, boxlo, boxhi); + + return true; } // --------------------------------------------------------------------------- // Clear memory on host and device // --------------------------------------------------------------------------- EXTERN void lj_gpu_clear() { - _lj_gpu_clear(); - LJMF.clear(); + + free(energy); + free(v_temp); + CUDA_SAFE_CALL( cudaFreeHost(f_temp) ); + if (d_force) CUDA_SAFE_CALL( cudaFree(d_force) ); + if (d_energy) CUDA_SAFE_CALL( cudaFree(d_energy) ); + if (d_virial) CUDA_SAFE_CALL( cudaFree(d_virial) ); + if (d_pos) CUDA_SAFE_CALL( cudaFree(d_pos) ); + if (d_type) CUDA_SAFE_CALL( cudaFree(d_type) ); + if (temp_pos) CUDA_SAFE_CALL( cudaFreeHost(temp_pos) ); + clear_neigh_list_gpu(d_neigh_list); + clear_cell_list_gpu(d_cell_list); + + if (useCache) { + unbind_pos(); + unbind_type(); + } + + + //LJMF.clear(); } -// --------------------------------------------------------------------------- -// Allocate memory on host and device and copy constants to device -// --------------------------------------------------------------------------- -EXTERN bool lj_gpu_init(int &ij_size, const int ntypes, double **cutsq,double **sigma, - double **epsilon, double **host_lj1, double **host_lj2, - double **host_lj3, double **host_lj4, double **offset, - double 
*special_lj, double *boxlo, double *boxhi, - double cell_size, double skin, - const int max_nbors, const int gpu_id) { - if (LJMF.is_allocated()) - lj_gpu_clear(); - else - _pc_cell_alloc=false; - - LJMF.gpu.init(); - if (LJMF.gpu.num_devices()==0) - return false; - - ij_size=IJ_SIZE; - - bool ret = LJMF.init(ij_size, ntypes, cutsq, sigma, epsilon, host_lj1, host_lj2, - host_lj3, host_lj4, offset, special_lj, max_nbors, gpu_id, - 0,0); - - ncellx = ceil(((boxhi[0] - boxlo[0]) + 2.0*cell_size) / cell_size); - ncelly = ceil(((boxhi[1] - boxlo[1]) + 2.0*cell_size) / cell_size); - ncellz = ceil(((boxhi[2] - boxlo[2]) + 2.0*cell_size) / cell_size); - - init_cell_list_const(cell_size, skin, boxlo, boxhi); - - return ret; -} template -double _lj_gpu_cell(LJMT &ljm, double **force, double *virial, - double **host_x, int *host_type, const int inum, - const int nall, const int ago, const bool eflag, const bool vflag, - const double *boxlo, const double *boxhi) +double _lj_gpu_neigh(double **force, double *virial, + double **host_x, int *host_type, const int inum, + const int nall, const int ago, const bool eflag, const bool vflag, + const double *boxlo, const double *boxhi) { - cudaError_t err; - - ljm.atom.nall(nall); - ljm.atom.inum(inum); - - ljm.nbor.time_nbor.start(); - ljm.nbor.time_nbor.stop(); double evdwl=0.0; - static int blockSize = BLOCK_1D; - static int ncell = ncellx*ncelly*ncellz; - static int first_call = 1; + + TIMING( static CUDATimer cuTimer ); + TIMING( static CTimer cTimer ); + TIMING( static CTimer cTimer2 ); + + double *atom_pos = host_x[0]; + + static int szTailList = inum*32; + + TIMING( cTimer.Start() ); + TIMING( cTimer2.Start() ); + + /* MPI communication just happened, reallocate space using new inum & nall + FIXME: this is costly: ~ total kernel time! 
Use a DIY GPU memory allocator.*/ if (first_call || ago == 0) { - first_call = 0; - _lj_gpu_clear(); + + if (!first_call) { + if (useCache) { + unbind_pos(); + unbind_type(); + } + + CUDA_SAFE_CALL( cudaFree(d_force) ); + CUDA_SAFE_CALL( cudaFree(d_energy) ); + CUDA_SAFE_CALL( cudaFree(d_virial) ); + CUDA_SAFE_CALL( cudaFree(d_pos) ); + CUDA_SAFE_CALL( cudaFree(d_type) ); + + clear_neigh_list_gpu(d_neigh_list); + + CUDA_SAFE_CALL( cudaFreeHost(f_temp) ); + CUDA_SAFE_CALL( cudaFreeHost(temp_pos) ); + + free(energy); + free(v_temp); + } + + CUDA_SAFE_CALL( cudaMalloc((void**)&d_force, inum*sizeof(float3)) ); + CUDA_SAFE_CALL( cudaMalloc((void**)&d_energy, inum*sizeof(float)) ); + CUDA_SAFE_CALL( cudaMalloc((void**)&d_virial, inum*3*sizeof(float3)) ); + CUDA_SAFE_CALL( cudaMalloc((void**)&d_pos, nall*sizeof(float4)) ); + CUDA_SAFE_CALL( cudaMalloc((void**)&d_type, nall*sizeof(int)) ); + init_neigh_list_gpu(d_neigh_list, inum, NEIGH_BIN_SIZE, szTailList); + + CUDA_SAFE_CALL( cudaMallocHost((void**)&temp_pos, nall*sizeof(float4)) ); + CUDA_SAFE_CALL( cudaMallocHost((void**)&f_temp, inum*sizeof(float3)) ); + energy = (float*) malloc(inum*sizeof(float)); v_temp = (float3*)malloc(inum*2*sizeof(float3)); - cudaMallocHost((void**)&f_temp, inum*sizeof(float3)); - cudaMalloc((void**)&d_force, inum*sizeof(float3)); - cudaMalloc((void**)&d_energy, inum*sizeof(float)); - cudaMalloc((void**)&d_virial, inum*3*sizeof(float3)); + if (useCache) { + bind_pos(d_pos, nall); + bind_type(d_type, nall); + } + + first_call = 0; + CUDA_SAFE_CALL( cudaThreadSynchronize() ); + CUDA_SAFE_CALL( cudaGetLastError() ); + CUDA_SAFE_CALL( cudaMemcpy(d_type, host_type, nall*sizeof(int), + cudaMemcpyHostToDevice) ); - init_cell_list(cell_list_gpu, nall, ncell, blockSize); - _pc_cell_alloc=true; } - // build cell-list on GPU - ljm.atom.time_atom.start(); - build_cell_list(host_x[0], host_type, cell_list_gpu, - ncell, ncellx, ncelly, ncellz, blockSize, inum, nall, ago); - ljm.atom.time_atom.stop(); + TIMING( static double mallocTime = 0. ); + TIMING( mallocTime += cTimer2.GetET() ); + TIMING( printf("malloc time = %f ms\n", mallocTime*1e3) ); - ljm.time_pair.start(); + TIMING( cTimer2.Start() ); + for (int i = 0; i < 3*nall; i+=3) { + temp_pos[i/3] = make_float4(atom_pos[i], atom_pos[i+1], atom_pos[i+2], 0.f); + } -#ifdef TIMING - cudaEvent_t start, stop; - cudaEventCreate(&start); - cudaEventCreate(&stop); - cudaEventRecord(start, 0); -#endif + TIMING( static double copyTime = 0. 
); + TIMING( copyTime += cTimer2.GetET() ); + TIMING( printf("position copy time = %f ms\n", copyTime*1e3) ); -#define KERNEL_LJ_CELL(e, v, b, s) kernel_lj_cell<<>> \ - (d_force, d_energy, d_virial, \ - cell_list_gpu.pos, \ - cell_list_gpu.idx, \ - cell_list_gpu.type, \ - cell_list_gpu.natom, \ - inum, nall, ncell, ncellx, ncelly, ncellz); - - // call the cell-list force kernel - const int BX=blockSize; - dim3 GX(ncellx, ncelly*ncellz); - if (eflag == 0 && vflag == 0) { - if (blockSize == 64 ) KERNEL_LJ_CELL(false, false, 64, 0); - if (blockSize == 128) KERNEL_LJ_CELL(false, false, 128, 0); - if (blockSize == 256) KERNEL_LJ_CELL(false, false, 256, 0); - } else { - if (blockSize == 64) KERNEL_LJ_CELL(true, true, 64, 3*sizeof(float)*MAX_SHARED_TYPES*MAX_SHARED_TYPES); - if (blockSize == 128) KERNEL_LJ_CELL(true, true, 128, 3*sizeof(float)*MAX_SHARED_TYPES*MAX_SHARED_TYPES); - if (blockSize == 256) KERNEL_LJ_CELL(true, true, 256, 3*sizeof(float)*MAX_SHARED_TYPES*MAX_SHARED_TYPES); + TIMING( cTimer2.Start() ); + CUDA_SAFE_CALL( cudaMemcpy(d_pos, temp_pos, nall*sizeof(float4), cudaMemcpyHostToDevice) ); + + TIMING( static double h2dTime = 0. ); + TIMING( h2dTime += cTimer2.GetET() ); + TIMING( printf("h2d copy time = %f ms\n", h2dTime*1e3) ); + + TIMING( cTimer2.Start() ); + if (ago == 0) { + build_neigh_list_gpu(d_pos, + d_neigh_list, + h_boxlo, h_boxhi, cell_size, + inum, nall); } - - err = cudaGetLastError(); - if (err != cudaSuccess) { - printf("LJ force kernel launch error: %d\n", err); - exit(1); + TIMING( static double neighTime = 0. ); + TIMING( neighTime += cTimer2.GetET() ); + TIMING( printf("Neigh List time = %f ms\n", neighTime*1e3) ); + + TIMING( cTimer2.Start() ); + calc_lj_neigh_gpu(d_force, d_energy, d_virial, + d_pos, d_type, + d_neigh_list, + inum, nall, + eflag, vflag); + TIMING( static double forceTime = 0. ); + TIMING( forceTime += cTimer2.GetET() ); + TIMING( printf("Force time = %f ms\n", forceTime*1e3) ); + TIMING( printf("GPU kernel time = %f ms\n", (forceTime + neighTime)*1e3) ); + + + TIMING( cTimer2.Start() ); + CUDA_SAFE_CALL( cudaMemcpy(f_temp, d_force, inum*sizeof(float3), cudaMemcpyDeviceToHost) ); + TIMING( static double d2hTime = 0. 
); + TIMING( d2hTime += cTimer2.GetET() ); + TIMING( printf("d2h copy time = %f ms\n", d2hTime*1e3) ); + TIMING( printf("GPU-CPU data transfer time = %f ms\n", (h2dTime+d2hTime)*1e3) ); + + TIMING( cTimer2.Start() ); + + for (int i = 0; i < inum; i++) { + force[i][0] += f_temp[i].x; + force[i][1] += f_temp[i].y; + force[i][2] += f_temp[i].z; } -#ifdef TIMING - cudaEventRecord(stop, 0); - cudaEventSynchronize(stop); - float kTime; - cudaEventElapsedTime(&kTime, start, stop); - kernelTime += kTime; - printf("kernelTime = %f, eflag=%d, vflag=%d\n", kTime, eflag, vflag); - cudaEventDestroy(start); - cudaEventDestroy(stop); -#endif - - // copy results from GPU to CPU - cudaMemcpy(f_temp, d_force, inum*sizeof(float3), cudaMemcpyDeviceToHost); if (eflag) { - cudaMemcpy(energy, d_energy, inum*sizeof(float), cudaMemcpyDeviceToHost); + CUDA_SAFE_CALL( cudaMemcpy(energy, d_energy, + inum*sizeof(float), cudaMemcpyDeviceToHost) ); for (int i = 0; i < inum; i++) { evdwl += energy[i]; } evdwl *= 0.5f; } + if (vflag) { - cudaMemcpy(v_temp, d_virial, inum*2*sizeof(float3), cudaMemcpyDeviceToHost); + CUDA_SAFE_CALL( cudaMemcpy(v_temp, d_virial, inum*2*sizeof(float3), + cudaMemcpyDeviceToHost) ); for (int i = 0; i < inum; i++) { virial[0] += v_temp[2*i].x; virial[1] += v_temp[2*i].y; @@ -228,43 +292,175 @@ double _lj_gpu_cell(LJMT &ljm, double **force, double *virial, virial[i] *= 0.5f; } - for (int i = 0; i < inum; i++) { - force[i][0] += f_temp[i].x; - force[i][1] += f_temp[i].y; - force[i][2] += f_temp[i].z; - } - ljm.time_pair.stop(); - - ljm.atom.time_atom.add_to_total(); - ljm.nbor.time_nbor.add_to_total(); - ljm.time_pair.add_to_total(); + TIMING( static double postTime = 0. ); + TIMING( postTime += cTimer2.GetET() ); + TIMING( printf("postprocess Time = %f ms\n", postTime*1e3) ); + TIMING( printf("Data process time = %f ms\n", (postTime+copyTime)*1e3) ); + TIMING( static double totalTime = 0. 
); + TIMING( totalTime += cTimer.GetET() ); + TIMING( printf("lj_gpu time = %f ms\n", totalTime*1e3) ); return evdwl; } -EXTERN double lj_gpu_cell(double **force, double *virial, double **host_x, int *host_type, const int inum, const int nall, - const int ago, const bool eflag, const bool vflag, - const double *boxlo, const double *boxhi) +EXTERN double lj_gpu_neigh(double **force, double *virial, + double **host_x, int *host_type, + const int inum, const int nall, + const int ago, const bool eflag, const bool vflag, + const double *boxlo, const double *boxhi) { - return _lj_gpu_cell(LJMF, force, virial, host_x, host_type, inum, nall, - ago, eflag, vflag, boxlo, boxhi); + return _lj_gpu_neigh(force, virial, + host_x, host_type, inum, nall, + ago, eflag, vflag, boxlo, boxhi); +} + + +template +double _lj_gpu_cell(double **force, double *virial, + double **host_x, int *host_type, const int inum, + const int nall, const int ago, + const bool eflag, const bool vflag, + const double *boxlo, const double *boxhi) +{ + + double evdwl=0.0; + + static int ncell = ncellx*ncelly*ncellz; + + static int first_call = 1; + + // allocate memory on CPU and GPU + if (first_call || ago == 0) { + if (!first_call) { + if (useCache) { + unbind_pos(); + unbind_type(); + } + + free(energy); + free(v_temp); + + CUDA_SAFE_CALL( cudaFree(d_force) ); + CUDA_SAFE_CALL( cudaFree(d_energy) ); + CUDA_SAFE_CALL( cudaFree(d_virial) ); + + CUDA_SAFE_CALL( cudaFree(d_pos) ); + CUDA_SAFE_CALL( cudaFree(d_type) ); + CUDA_SAFE_CALL( cudaFreeHost(f_temp) ); + CUDA_SAFE_CALL( cudaFreeHost(temp_pos) ); + + clear_cell_list_gpu(d_cell_list); + } + + energy = (float*) malloc(inum*sizeof(float)); + v_temp = (float3*)malloc(inum*2*sizeof(float3)); + + + cudaMalloc((void**)&d_force, inum*sizeof(float3)); + cudaMalloc((void**)&d_energy, inum*sizeof(float)); + cudaMalloc((void**)&d_virial, inum*3*sizeof(float3)); + + CUDA_SAFE_CALL( cudaMalloc((void**)&d_pos, nall*sizeof(float4)) ); + CUDA_SAFE_CALL( cudaMalloc((void**)&d_type, nall*sizeof(int)) ); + + CUDA_SAFE_CALL( cudaMallocHost((void**)&f_temp, inum*sizeof(float3)) ); + CUDA_SAFE_CALL( cudaMallocHost((void**)&temp_pos, nall*sizeof(float4)) ); + + init_cell_list_gpu(d_cell_list, nall, ncell); + + CUDA_SAFE_CALL( cudaMemcpy(d_type, host_type, nall*sizeof(int), + cudaMemcpyHostToDevice) ); + + if (useCache) { + bind_pos(d_pos, nall); + bind_type(d_type, nall); + } + + first_call = 0; + } + + /* build cell-list on GPU */ + double *atom_pos = host_x[0]; + for (int i = 0; i < 3*nall; i+=3) { + temp_pos[i/3] = make_float4(atom_pos[i], atom_pos[i+1], atom_pos[i+2], 0.f); + } + CUDA_SAFE_CALL( cudaMemcpy(d_pos, temp_pos, nall*sizeof(float4), + cudaMemcpyHostToDevice) ); + if (ago == 0) { + build_cell_list_gpu(d_pos, d_cell_list, h_boxlo, h_boxhi, + cell_size, inum, nall); + } + + calc_lj_cell_gpu(d_force, d_energy, d_virial, + d_pos, d_type, d_cell_list, + inum, nall, ncellx, + ncelly, ncellz, cell_size, + eflag, vflag); + + CUDA_SAFE_CALL( cudaMemcpy(f_temp, d_force, inum*sizeof(float3), + cudaMemcpyDeviceToHost) ); + + for (int i = 0; i < inum; i++) { + force[i][0] += f_temp[i].x; + force[i][1] += f_temp[i].y; + force[i][2] += f_temp[i].z; + } + + if (eflag) { + CUDA_SAFE_CALL( cudaMemcpy(energy, d_energy, + inum*sizeof(float), cudaMemcpyDeviceToHost) ); + for (int i = 0; i < inum; i++) { + evdwl += energy[i]; + } + evdwl *= 0.5f; + } + + if (vflag) { + CUDA_SAFE_CALL( cudaMemcpy(v_temp, d_virial, inum*2*sizeof(float3), + cudaMemcpyDeviceToHost) ); + for (int i = 0; i < inum; i++) { + 
virial[0] += v_temp[2*i].x; + virial[1] += v_temp[2*i].y; + virial[2] += v_temp[2*i].z; + virial[3] += v_temp[2*i+1].x; + virial[4] += v_temp[2*i+1].y; + virial[5] += v_temp[2*i+1].z; + } + for (int i = 0; i < 6; i++) + virial[i] *= 0.5f; + } + + return evdwl; +} + +EXTERN double lj_gpu_cell(double **force, double *virial, + double **host_x, int *host_type, + const int inum, const int nall, + const int ago, const bool eflag, const bool vflag, + const double *boxlo, const double *boxhi) +{ + return _lj_gpu_cell(force, virial, + host_x, host_type, inum, nall, + ago, eflag, vflag, boxlo, boxhi); } EXTERN void lj_gpu_time() { - cout.precision(4); - cout << "Atom copy: " << LJMF.atom.time_atom.total_seconds() << " s.\n"; - cout << "Neighbor copy: " << LJMF.nbor.time_nbor.total_seconds() << " s.\n"; - cout << "LJ calc: " << LJMF.time_pair.total_seconds() << " s.\n"; - cout << "Answer copy: " << LJMF.atom.time_answer.total_seconds() << " s.\n"; + /* cout.precision(4); + cout << "Atom copy: " << LJMF.time_atom.total_seconds() << " s.\n"; + cout << "Neighbor copy: " << LJMF.time_nbor.total_seconds() << " s.\n"; + cout << "LJ calc: " << LJMF.time_pair.total_seconds() << " s.\n";*/ + //cout << "Answer copy: " << LJMF.time_answer.total_seconds() << " s.\n"; } EXTERN int lj_gpu_num_devices() { - return LJMF.gpu.num_devices(); + int num_devices; + CUDA_SAFE_CALL( cudaGetDeviceCount(&num_devices) ); + return num_devices; } EXTERN double lj_gpu_bytes() { - return LJMF.host_memory_usage(); + return 0.0; } diff --git a/lib/gpu/lj_gpu_kernel.h b/lib/gpu/lj_gpu_kernel.h deleted file mode 100644 index b2f03cde2b..0000000000 --- a/lib/gpu/lj_gpu_kernel.h +++ /dev/null @@ -1,220 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - Copyright (2003) Sandia Corporation. Under the terms of Contract - DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains - certain rights in this software. This software is distributed under - the GNU General Public License. - - See the README file in the top-level LAMMPS directory. 
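The rewritten cell-list path in lj_gpu.cu sizes the grid by padding the box with 2*cell_size before dividing, which appears to leave one ghost-cell layer on each side, and it repacks the flat x/y/z array into float4 before the host-to-device copy so each atom is a single aligned 16-byte load (the .w slot is padding). A host-side sketch of both steps; cells_along, pack_positions and float4h are names for this note, not functions in the patch.

    #include <cmath>
    #include <vector>

    struct float4h { float x, y, z, w; };

    // One padded cell layer on each side of the local box.
    inline int cells_along(double lo, double hi, double cell_size) {
      return static_cast<int>(std::ceil((hi - lo + 2.0 * cell_size) / cell_size));
    }

    // Repack x/y/z triples into float4 for coalesced device access.
    std::vector<float4h> pack_positions(const double *atom_pos, int nall) {
      std::vector<float4h> out(nall);
      for (int i = 0; i < nall; i++)
        out[i] = float4h{static_cast<float>(atom_pos[3 * i]),
                         static_cast<float>(atom_pos[3 * i + 1]),
                         static_cast<float>(atom_pos[3 * i + 2]), 0.f};
      return out;
    }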
-------------------------------------------------------------------------- */ - -/* ---------------------------------------------------------------------- - Contributing authors: Mike Brown (SNL), wmbrown@sandia.gov - Peng Wang (Nvidia), penwang@nvidia.com - Paul Crozier (SNL), pscrozi@sandia.gov -------------------------------------------------------------------------- */ - -#ifndef LJ_GPU_KERNEL -#define LJ_GPU_KERNEL - -/* Cell list version of LJ kernel */ -template -__global__ void kernel_lj_cell(float3 *force3, - float *energy, float3 *virial, - float3 *cell_list, unsigned int *cell_idx, - int *cell_type, int *cell_atom, - const int inum, const int nall, const int ncell, - const int ncellx, const int ncelly, const int ncellz) -{ - - - - // calculate 3D block idx from 2d block - int bx = blockIdx.x; - int by = blockIdx.y % ncelly; - int bz = blockIdx.y / ncelly; - - int tid = threadIdx.x; - - // compute cell idx from 3D block idx - int cid = bx + INT_MUL(by, ncellx) + INT_MUL(bz, INT_MUL(ncellx,ncelly)); - - __shared__ int typeSh[blockSize]; - __shared__ float posSh[blockSize*3]; - __shared__ float cutsqSh[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; - __shared__ float lj1Sh[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; - __shared__ float lj2Sh[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; - - extern __shared__ float smem[]; - - __shared__ float *lj3Sh; - __shared__ float *lj4Sh; - __shared__ float *offsetSh; - - // load force parameters into shared memory - for (int i = tid; i < MAX_SHARED_TYPES*MAX_SHARED_TYPES; i += blockSize) { - int itype = i/MAX_SHARED_TYPES; - int jtype = i%MAX_SHARED_TYPES; - cutsqSh[i] = _cutsq_(itype,jtype); - lj1Sh[i] = _lj1_(itype,jtype).x; - lj2Sh[i] = _lj1_(itype,jtype).y; - } - - // Only allocate shared memory when needed, - // this reduces shared memory limitation on occupancy - if (eflag || vflag) { - lj3Sh = smem; - lj4Sh = lj3Sh + MAX_SHARED_TYPES*MAX_SHARED_TYPES; - offsetSh = lj4Sh + MAX_SHARED_TYPES*MAX_SHARED_TYPES; - for (int i = tid; i < MAX_SHARED_TYPES*MAX_SHARED_TYPES; i += blockSize) { - int itype = i/MAX_SHARED_TYPES; - int jtype = i%MAX_SHARED_TYPES; - lj3Sh[i] = _lj3_(itype,jtype).x+0.01; - lj4Sh[i] = _lj3_(itype,jtype).y; - offsetSh[i]= _offset_(itype,jtype); - } - } - - __syncthreads(); - - int nborz0 = max(bz-1,0), nborz1 = min(bz+1, ncellz-1), - nbory0 = max(by-1,0), nbory1 = min(by+1, ncelly-1), - nborx0 = max(bx-1,0), nborx1 = min(bx+1, ncellx-1); - - for (int ii = 0; ii < ceil((float)(cell_atom[cid])/blockSize); ii++) { - float3 f = {0.0f, 0.0f, 0.0f}; - float ener = 0.0f; - float3 v0 = {0.0f, 0.0f, 0.0f}, v1 = {0.0f, 0.0f, 0.0f}; - int itype; - float ix, iy, iz; - int i = tid + ii*blockSize; - unsigned int answer_pos = cell_idx[cid*blockSize+i]; - - // load current cell atom position and type into sMem - for (int j = tid; j < cell_atom[cid]; j += blockSize) { - int pid = cid*blockSize + j; - float3 pos = cell_list[pid]; - posSh[j ] = pos.x; - posSh[j+ blockSize] = pos.y; - posSh[j+2*blockSize] = pos.z; - typeSh[j] = cell_type[pid]; - } - __syncthreads(); - if (answer_pos < inum) { - itype = typeSh[i]; - ix = posSh[i ]; - iy = posSh[i+ blockSize]; - iz = posSh[i+2*blockSize]; - - // compute force from current cell - for (int j = 0; j < cell_atom[cid]; j++) { - if (j == i) continue; - float delx = ix - posSh[j ]; - float dely = iy - posSh[j+ blockSize]; - float delz = iz - posSh[j+2*blockSize]; - int jtype = typeSh[j]; - int mtype = itype + jtype*MAX_SHARED_TYPES; - float r2inv = delx*delx + dely*dely + delz*delz; - - if (r2inv < cutsqSh[mtype]) { - r2inv = 
1.0f/r2inv; - float r6inv = r2inv * r2inv * r2inv; - float force = r2inv*r6inv*(lj1Sh[mtype]*r6inv - lj2Sh[mtype]); - f.x += delx * force; - f.y += dely * force; - f.z += delz * force; - - if (eflag) { - float e = r6inv*(lj3Sh[mtype]*r6inv - lj4Sh[mtype]); - ener += (e - offsetSh[mtype]); - } - - if (vflag) { - v0.x += delx*delx*force; - v0.y += dely*dely*force; - v0.z += delz*delz*force; - v1.x += delx*dely*force; - v1.y += delx*delz*force; - v1.z += dely*delz*force; - } - - } - } - } - __syncthreads(); - - // compute force from neigboring cells - for (int nborz = nborz0; nborz <= nborz1; nborz++) { - for (int nbory = nbory0; nbory <= nbory1; nbory++) { - for (int nborx = nborx0; nborx <= nborx1; nborx++) { - if (nborz == bz && nbory == by && nborx == bx) continue; - - // compute cell id - int cid_nbor = nborx + INT_MUL(nbory,ncellx) + - INT_MUL(nborz,INT_MUL(ncellx,ncelly)); - - // load neighbor cell position and type into smem - for (int j = tid; j < cell_atom[cid_nbor]; j += blockSize) { - int pid = INT_MUL(cid_nbor,blockSize) + j; - float3 pos = cell_list[pid]; - posSh[j ] = pos.x; - posSh[j+ blockSize] = pos.y; - posSh[j+2*blockSize] = pos.z; - typeSh[j] = cell_type[pid]; - } - __syncthreads(); - // compute force - if (answer_pos < inum) { - for (int j = 0; j < cell_atom[cid_nbor]; j++) { - float delx = ix - posSh[j ]; - float dely = iy - posSh[j+ blockSize]; - float delz = iz - posSh[j+2*blockSize]; - int jtype = typeSh[j]; - int mtype = itype + jtype*MAX_SHARED_TYPES; - float r2inv = delx*delx + dely*dely + delz*delz; - - if (r2inv < cutsqSh[mtype]) { - r2inv = 1.0f/r2inv; - float r6inv = r2inv * r2inv * r2inv; - float force = r2inv*r6inv*(lj1Sh[mtype]*r6inv - lj2Sh[mtype]); - f.x += delx * force; - f.y += dely * force; - f.z += delz * force; - - if (eflag) { - float e=r6inv*(lj3Sh[mtype]*r6inv - lj4Sh[mtype]); - ener += (e-offsetSh[mtype]); - } - if (vflag) { - v0.x += delx*delx*force; - v0.y += dely*dely*force; - v0.z += delz*delz*force; - v1.x += delx*dely*force; - v1.y += delx*delz*force; - v1.z += dely*delz*force; - } - } - } - } - __syncthreads(); - } - } - } - - if (answer_pos < inum) { - force3[answer_pos] = f; - if (eflag) - energy[answer_pos] = ener; - if (vflag) { - virial[2*answer_pos] = v0; - virial[2*answer_pos+1] = v1; - } - } - } - -} - -#endif diff --git a/lib/gpu/lj_gpu_memory.cu b/lib/gpu/lj_gpu_memory.cu deleted file mode 100644 index 5e7e9207ae..0000000000 --- a/lib/gpu/lj_gpu_memory.cu +++ /dev/null @@ -1,147 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - Copyright (2003) Sandia Corporation. Under the terms of Contract - DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains - certain rights in this software. This software is distributed under - the GNU General Public License. - - See the README file in the top-level LAMMPS directory. 
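[Editor's note] The deleted kernel_lj_cell above maps each thread block to one cell of a 3-D binning of the box and loops over the clamped 27-cell neighborhood. A host-side C++ sketch of the same index arithmetic, with names mirroring the kernel's bx/by/bz and ncellx/ncelly/ncellz (offered as an illustration of the layout, not code from the patch):

#include <algorithm>

// Flatten a 3-D cell coordinate into the 1-D cell id used to address
// cell_list/cell_type/cell_atom (same layout as kernel_lj_cell above).
inline int cell_id(int bx, int by, int bz, int ncellx, int ncelly) {
  return bx + by * ncellx + bz * ncellx * ncelly;
}

// Clamped neighbor range in one dimension: a cell interacts with itself and
// its immediate neighbors, truncated at the box edge.
inline void neighbor_range(int b, int ncell, int &lo, int &hi) {
  lo = std::max(b - 1, 0);
  hi = std::min(b + 1, ncell - 1);
}

// Enumerate all neighbor cells of (bx,by,bz), skipping the home cell,
// exactly as the triple loop in the kernel does.
template <class Visitor>
void for_each_neighbor_cell(int bx, int by, int bz,
                            int ncellx, int ncelly, int ncellz, Visitor visit) {
  int x0, x1, y0, y1, z0, z1;
  neighbor_range(bx, ncellx, x0, x1);
  neighbor_range(by, ncelly, y0, y1);
  neighbor_range(bz, ncellz, z0, z1);
  for (int nz = z0; nz <= z1; nz++)
    for (int ny = y0; ny <= y1; ny++)
      for (int nx = x0; nx <= x1; nx++)
        if (nx != bx || ny != by || nz != bz)
          visit(cell_id(nx, ny, nz, ncellx, ncelly));
}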
-------------------------------------------------------------------------- */ - -/* ---------------------------------------------------------------------- - Contributing authors: Mike Brown (SNL), wmbrown@sandia.gov - Peng Wang (Nvidia), penwang@nvidia.com - Paul Crozier (SNL), pscrozi@sandia.gov -------------------------------------------------------------------------- */ - -#include "lj_gpu_memory.h" -#define LJ_GPU_MemoryT LJ_GPU_Memory - -template -int LJ_GPU_MemoryT::bytes_per_atom(const int max_nbors) const { - return atom.bytes_per_atom()+nbor.bytes_per_atom(max_nbors); -} - -template -bool LJ_GPU_MemoryT::init(const int ij_size, const int ntypes, - double **host_cutsq, double **host_sigma, - double **host_epsilon, double **host_lj1, - double **host_lj2, double **host_lj3, - double **host_lj4, double **host_offset, - double *host_special_lj, const int max_nbors, - const int me, const int nlocal, const int nall) { - if (allocated) - clear(); - - if (me>=gpu.num_devices()) - return false; - gpu.set(me); - if (gpu.revision()<1.0) - return false; - - // Initialize timers for the selected GPU - time_pair.init(); - - // Initialize atom and nbor data - max_local=static_cast(static_cast(nlocal)*1.10); - if (max_local==0) - max_local=1000; - if (nall<=nlocal) - max_atoms=max_local*2; - else - max_atoms=static_cast(static_cast(nall)*1.10); - - if (!atom.init(max_atoms)) - return false; - if (!nbor.init(ij_size,max_local,max_nbors)) - return false; - - // Get a stream for computing pair potentials - CUDA_SAFE_CALL(cudaStreamCreate(&pair_stream)); - - // Use the write buffer from atom for data initialization - NVC_HostT &host_write=atom.host_write; - assert(host_write.numel()>4 && host_write.numel()>ntypes*ntypes*2); - - // Copy data for bonded interactions - special_lj.safe_alloc(4); - special_lj.cast_copy(host_special_lj,host_write); - - // Copy sigma, epsilon, and cutsq onto GPU - sigma.safe_alloc(ntypes,ntypes,sigma_get_texture()); - sigma.cast_copy(host_sigma[0],host_write); - epsilon.safe_alloc(ntypes,ntypes,epsilon_get_texture()); - epsilon.cast_copy(host_epsilon[0],host_write); - cutsq.safe_alloc(ntypes,ntypes,cutsq_get_texture()); - cutsq.cast_copy(host_cutsq[0],host_write); - - // If atom type constants fit in shared memory use fast kernel - int lj_types=ntypes; - shared_types=false; - if (lj_types<=MAX_SHARED_TYPES) { - lj_types=MAX_SHARED_TYPES; - shared_types=true; - } - offset.safe_alloc(lj_types,lj_types,offset_get_texture()); - offset.cast_copy2D(host_offset[0],host_write,ntypes,ntypes); - double *t1=host_lj1[0]; - double *t2=host_lj2[0]; - for (int i=0; i()); - lj1.copy_2Dfrom_host(reinterpret_cast::vec2 *> (host_write.begin()), - ntypes,ntypes); - t1=host_lj3[0]; - t2=host_lj4[0]; - for (int i=0; i()); - lj3.copy_2Dfrom_host(reinterpret_cast::vec2 *> (host_write.begin()), - ntypes,ntypes); - - dev_error.safe_alloc(1); - dev_error.zero(); - - allocated=true; - return true; -} - -template -void LJ_GPU_MemoryT::clear() { - if (!allocated) - return; - allocated=false; - - // Check for any pair style specific errors here - int err_flag; - dev_error.copy_to_host(&err_flag); - - atom.clear(); - nbor.clear(); - - CUDA_SAFE_CALL(cudaStreamDestroy(pair_stream)); - - dev_error.clear(); - sigma.clear(); - epsilon.clear(); - special_lj.clear(); - cutsq.clear(); - offset.clear(); - lj1.clear(); - lj3.clear(); -} - -template -double LJ_GPU_MemoryT::host_memory_usage() const { - return atom.host_memory_usage(max_atoms)+nbor.host_memory_usage()+ - sizeof(LJ_GPU_Memory); -} - -template class 
LJ_GPU_Memory; diff --git a/lib/gpu/lj_gpu_memory.h b/lib/gpu/lj_gpu_memory.h deleted file mode 100644 index 97ab18acd8..0000000000 --- a/lib/gpu/lj_gpu_memory.h +++ /dev/null @@ -1,87 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - Copyright (2003) Sandia Corporation. Under the terms of Contract - DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains - certain rights in this software. This software is distributed under - the GNU General Public License. - - See the README file in the top-level LAMMPS directory. -------------------------------------------------------------------------- */ - -/* ---------------------------------------------------------------------- - Contributing authors: Mike Brown (SNL), wmbrown@sandia.gov - Peng Wang (Nvidia), penwang@nvidia.com - Paul Crozier (SNL), pscrozi@sandia.gov -------------------------------------------------------------------------- */ - -#ifndef LJ_GPU_MEMORY_H -#define LJ_GPU_MEMORY_H - -#include "nvc_device.h" -#include "nvc_traits.h" -#include "pair_gpu_atom.h" -#include "pair_gpu_nbor.h" - -#define BLOCK_1D 64 // max value = 256 -#define CELL_SIZE BLOCK_1D -#define MAX_SHARED_TYPES 8 -#define BIG_NUMBER 100000000 - -template -class LJ_GPU_Memory { - public: - LJ_GPU_Memory() : allocated(false) {} - ~LJ_GPU_Memory() { clear(); } - - inline bool is_allocated() { return allocated; } - - /// Allocate memory on host and device - bool init(const int ij_size, const int ntypes, double **host_cutsq, - double **host_sigma, double **host_epsilon, - double **host_lj1, double **host_lj2, double **host_lj3, - double **host_lj4, double **host_offset, double *host_special_lj, - const int max_nbors, const int me, const int nlocal, - const int nall); - /// Free any memory on host and device - void clear(); - - /// Returns memory usage on GPU per atom - int bytes_per_atom(const int max_nbors) const; - /// Total host memory used by library - double host_memory_usage() const; - - // ------------------------- DATA ----------------------------- - - // Device Properties - NVCDevice gpu; - // Device Error Flag - NVC_VecI dev_error; - // Stream for asynchronous work - cudaStream_t pair_stream; - - // Atom Data - PairGPUAtom atom; - // Neighbor Data - PairGPUNbor nbor; - - // --------------- Const Data for Atoms - NVC_ConstMatT sigma, epsilon, cutsq, offset; - NVC_ConstMat< typename nvc_vec_traits::vec2 > lj1, lj3; - NVC_VecT special_lj; - - size_t max_atoms, max_local; - - // Timing for pair calculation - NVCTimer time_pair; - - // If atom type constants fit in shared memory, use fast kernels - bool shared_types; - - protected: - bool allocated; -}; - -#endif diff --git a/lib/gpu/ljc_cut_gpu.cpp b/lib/gpu/ljc_cut_gpu.cpp new file mode 100644 index 0000000000..784ab38633 --- /dev/null +++ b/lib/gpu/ljc_cut_gpu.cpp @@ -0,0 +1,129 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. 
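[Editor's note] The deleted lj_gpu_memory.cu init() above stages the per-type-pair coefficient tables (lj1/lj2 and lj3/lj4) through a host write buffer, interleaving the two double** tables element by element so the device can fetch both coefficients of a type pair in a single vec2 texture read. A hedged host-side sketch of that interleaving step; the function name and the float target type are illustrative (the original casts to the configured precision type):

#include <vector>

// Interleave two ntypes x ntypes coefficient tables (e.g. lj1 and lj2) into
// one flat buffer laid out as {a[0],b[0],a[1],b[1],...} so a single
// float2/double2 fetch returns both coefficients of a type pair.
std::vector<float> interleave_pair_coeffs(double **host_a, double **host_b,
                                          int ntypes) {
  std::vector<float> buf(2 * static_cast<size_t>(ntypes) * ntypes);
  const double *a = host_a[0];   // the 2-D host tables are contiguous,
  const double *b = host_b[0];   // so row 0 doubles as a flat pointer
  for (int i = 0; i < ntypes * ntypes; i++) {
    buf[2 * i]     = static_cast<float>(a[i]);
    buf[2 * i + 1] = static_cast<float>(b[i]);
  }
  return buf;   // uploaded with a 2-D copy, as lj1.copy_2Dfrom_host() did
}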
+ + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing authors: Mike Brown (ORNL), brownw@ornl.gov +------------------------------------------------------------------------- */ + +#include +#include +#include + +#include "ljc_cut_gpu_memory.h" + +using namespace std; + +static LJC_GPU_Memory LJCMF; + +// --------------------------------------------------------------------------- +// Allocate memory on host and device and copy constants to device +// --------------------------------------------------------------------------- +bool ljc_gpu_init(const int ntypes, double **cutsq, double **host_lj1, + double **host_lj2, double **host_lj3, double **host_lj4, + double **offset, double *special_lj, const int inum, + const int nall, const int max_nbors, const int maxspecial, + const double cell_size, int &gpu_mode, FILE *screen, + double **host_cut_ljsq, double **host_cut_coulsq, + double *host_special_coul, const double qqrd2e) { + LJCMF.clear(); + gpu_mode=LJCMF.device->gpu_mode(); + double gpu_split=LJCMF.device->particle_split(); + int first_gpu=LJCMF.device->first_device(); + int last_gpu=LJCMF.device->last_device(); + int world_me=LJCMF.device->world_me(); + int gpu_rank=LJCMF.device->gpu_rank(); + int procs_per_gpu=LJCMF.device->procs_per_gpu(); + + LJCMF.device->init_message(screen,"lj/cut/coul/cut",first_gpu,last_gpu); + + bool message=false; + if (world_me==0 && screen) + message=true; + + if (message) { + fprintf(screen,"Initializing GPU and compiling on process 0..."); + fflush(screen); + } + + if (world_me==0) { + bool init_ok=LJCMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, + host_lj4, offset, special_lj, inum, nall, 300, + maxspecial, cell_size, gpu_split, screen, + host_cut_ljsq, host_cut_coulsq, host_special_coul, + qqrd2e); + if (!init_ok) + return false; + } + + MPI_Barrier(MPI_COMM_WORLD); + if (message) + fprintf(screen,"Done.\n"); + + for (int i=0; igpu_comm); + if (message) + fprintf(screen,"Done.\n"); + } + if (message) + fprintf(screen,"\n"); + return true; +} + +void ljc_gpu_clear() { + LJCMF.clear(); +} + +int * ljc_gpu_compute_n(const int timestep, const int ago, const int inum_full, + const int nall, double **host_x, int *host_type, + double *boxlo, double *boxhi, int *tag, int **nspecial, + int **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success, double *host_q) { + return LJCMF.compute(timestep, ago, inum_full, nall, host_x, host_type, boxlo, + boxhi, tag, nspecial, special, eflag, vflag, eatom, + vatom, host_start, cpu_time, success, host_q); +} + +void ljc_gpu_compute(const int timestep, const int ago, const int inum_full, + const int nall, double **host_x, int *host_type, + int *ilist, int *numj, int **firstneigh, + const bool eflag, const bool vflag, const bool eatom, + const bool vatom, int &host_start, const double cpu_time, + bool &success, double *host_q) { + LJCMF.compute(timestep,ago,inum_full,nall,host_x,host_type,ilist,numj, + firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success, + host_q); +} + +double ljc_gpu_bytes() { + return LJCMF.host_memory_usage(); +} + + diff --git a/lib/gpu/ljc_cut_gpu_kernel.cu b/lib/gpu/ljc_cut_gpu_kernel.cu new file mode 100644 index 0000000000..b6d9610f0d --- /dev/null +++ b/lib/gpu/ljc_cut_gpu_kernel.cu @@ -0,0 +1,339 @@ +/* 
---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing authors: Mike Brown (ORNL), brownw@ornl.gov +------------------------------------------------------------------------- */ + +#ifndef LJC_GPU_KERNEL +#define LJC_GPU_KERNEL + +#define MAX_SHARED_TYPES 8 + +#ifdef _DOUBLE_DOUBLE +#define numtyp double +#define numtyp2 double2 +#define numtyp4 double4 +#define acctyp double +#define acctyp4 double4 +#endif + +#ifdef _SINGLE_DOUBLE +#define numtyp float +#define numtyp2 float2 +#define numtyp4 float4 +#define acctyp double +#define acctyp4 double4 +#endif + +#ifndef numtyp +#define numtyp float +#define numtyp2 float2 +#define numtyp4 float4 +#define acctyp float +#define acctyp4 float4 +#endif + +#ifdef NV_KERNEL + +#include "geryon/ucl_nv_kernel.h" +texture pos_tex; +texture q_tex; + +#ifdef _DOUBLE_DOUBLE +__inline double4 fetch_pos(const int& i, const double4 *pos) +{ + return pos[i]; +} +__inline double fetch_q(const int& i, const double *q) +{ + return q[i]; +} +#else +__inline float4 fetch_pos(const int& i, const float4 *pos) +{ + return tex1Dfetch(pos_tex, i); +} +__inline float fetch_q(const int& i, const float *q) +{ + return tex1Dfetch(q_tex, i); +} +#endif + +#else + +#pragma OPENCL EXTENSION cl_khr_fp64: enable +#define GLOBAL_ID_X get_global_id(0) +#define THREAD_ID_X get_local_id(0) +#define BLOCK_ID_X get_group_id(0) +#define BLOCK_SIZE_X get_local_size(0) +#define __syncthreads() barrier(CLK_LOCAL_MEM_FENCE) +#define __inline inline + +#define fetch_pos(i,y) x_[i] +#define fetch_q(i,y) q_[i] + +#endif + +__kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1, + __global numtyp4* lj3, const int lj_types, + __global numtyp *sp_lj_in, __global int *dev_nbor, + __global acctyp4 *ans, __global acctyp *engv, + const int eflag, const int vflag, const int inum, + const int nall, const int nbor_pitch, + __global numtyp *q_ , __global numtyp *cutsq, + const numtyp qqrd2e) { + // ii indexes the two interacting particles in gi + int ii=GLOBAL_ID_X; + __local numtyp sp_lj[8]; + sp_lj[0]=sp_lj_in[0]; + sp_lj[1]=sp_lj_in[1]; + sp_lj[2]=sp_lj_in[2]; + sp_lj[3]=sp_lj_in[3]; + sp_lj[4]=sp_lj_in[4]; + sp_lj[5]=sp_lj_in[5]; + sp_lj[6]=sp_lj_in[6]; + sp_lj[7]=sp_lj_in[7]; + + if (ii0) { + e_coul += forcecoul; + if (rsq < lj1[mtype].z) { + numtyp e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y); + energy+=factor_lj*(e-lj3[mtype].z); + } + } + if (vflag>0) { + virial[0] += delx*delx*force; + virial[1] += dely*dely*force; + virial[2] += delz*delz*force; + virial[3] += delx*dely*force; + virial[4] += delx*delz*force; + virial[5] += dely*delz*force; + } + } + + } // for nbor + + // Store answers + __global acctyp *ap1=engv+ii; + if (eflag>0) { + *ap1=energy; + ap1+=inum; + *ap1=e_coul; + ap1+=inum; + } + if (vflag>0) { + for (int i=0; i<6; i++) { + *ap1=virial[i]; + ap1+=inum; + } + } + ans[ii]=f; + } // if ii +} + +__kernel void 
kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in, + __global numtyp4* lj3_in, + __global numtyp* sp_lj_in, __global int *dev_nbor, + __global acctyp4 *ans, __global acctyp *engv, + const int eflag, const int vflag, const int inum, + const int nall, const int nbor_pitch, + __global numtyp *q_ , __global numtyp *_cutsq, + const numtyp qqrd2e) { + // ii indexes the two interacting particles in gi + int ii=THREAD_ID_X; + __local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; + __local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; + __local numtyp cutsq[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; + __local numtyp sp_lj[8]; + if (ii<8) + sp_lj[ii]=sp_lj_in[ii]; + if (ii0) + lj3[ii]=lj3_in[ii]; + } + ii+=mul24((int)BLOCK_ID_X,(int)BLOCK_SIZE_X); + __syncthreads(); + + if (ii0) { + e_coul += forcecoul; + if (rsq < lj1[mtype].z) { + numtyp e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y); + energy+=factor_lj*(e-lj3[mtype].z); + } + } + if (vflag>0) { + virial[0] += delx*delx*force; + virial[1] += dely*dely*force; + virial[2] += delz*delz*force; + virial[3] += delx*dely*force; + virial[4] += delx*delz*force; + virial[5] += dely*delz*force; + } + } + + } // for nbor + + // Store answers + __global acctyp *ap1=engv+ii; + if (eflag>0) { + *ap1=energy; + ap1+=inum; + *ap1=e_coul; + ap1+=inum; + } + if (vflag>0) { + for (int i=0; i<6; i++) { + *ap1=virial[i]; + ap1+=inum; + } + } + ans[ii]=f; + } // if ii*/ +} + +#endif + diff --git a/lib/gpu/ljc_cut_gpu_memory.cpp b/lib/gpu/ljc_cut_gpu_memory.cpp new file mode 100644 index 0000000000..ec2001aa4a --- /dev/null +++ b/lib/gpu/ljc_cut_gpu_memory.cpp @@ -0,0 +1,165 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. 
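[Editor's note] The two kernels above differ only in how they reach the coefficient tables: kernel_pair reads lj1/lj3/cutsq from global memory for any number of atom types, while kernel_pair_fast first stages them into __local (shared) memory when the type count fits within MAX_SHARED_TYPES (8); the host init() that follows chooses between them. A small sketch of that decision and of the flattened type-pair index, offered with caveats: the block-size test mirrors the condition visible in init() below, and the index convention follows the older deleted kernel (the tables are symmetric in itype/jtype, so the transposed convention seen elsewhere is numerically equivalent):

static const int MAX_SHARED_TYPES = 8;   // same constant as in the kernels

// Decide whether the shared-memory "fast" kernel can be used and return the
// (possibly padded) number of types the flat coefficient tables are sized to.
inline int pick_lj_types(int ntypes, int block_size, bool &shared_types) {
  shared_types = (ntypes <= MAX_SHARED_TYPES && block_size >= MAX_SHARED_TYPES);
  return shared_types ? MAX_SHARED_TYPES : ntypes;
}

// Flat index of the (itype,jtype) entry in an lj_types x lj_types table;
// lj1[mtype], lj3[mtype] and cutsq[mtype] all use this layout.
inline int type_pair_index(int itype, int jtype, int lj_types) {
  return itype + jtype * lj_types;
}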
+------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing authors: Mike Brown (ORNL), brownw@ornl.gov +------------------------------------------------------------------------- */ + +#ifdef USE_OPENCL +#include "ljc_cut_gpu_cl.h" +#else +#include "ljc_cut_gpu_ptx.h" +#endif + +#include "ljc_cut_gpu_memory.h" +#include +#define LJC_GPU_MemoryT LJC_GPU_Memory + +extern PairGPUDevice pair_gpu_device; + +template +LJC_GPU_MemoryT::LJC_GPU_Memory() : ChargeGPUMemory(), + _allocated(false) { +} + +template +LJC_GPU_MemoryT::~LJC_GPU_Memory() { + clear(); +} + +template +int LJC_GPU_MemoryT::bytes_per_atom(const int max_nbors) const { + return this->bytes_per_atom_atomic(max_nbors); +} + +template +bool LJC_GPU_MemoryT::init(const int ntypes, + double **host_cutsq, double **host_lj1, + double **host_lj2, double **host_lj3, + double **host_lj4, double **host_offset, + double *host_special_lj, const int nlocal, + const int nall, const int max_nbors, + const int maxspecial, const double cell_size, + const double gpu_split, FILE *_screen, + double **host_cut_ljsq, double **host_cut_coulsq, + double *host_special_coul, const double qqrd2e) { + this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split, + _screen,ljc_cut_gpu_kernel); + + // If atom type constants fit in shared memory use fast kernel + int lj_types=ntypes; + shared_types=false; + if (lj_types<=MAX_SHARED_TYPES && this->_block_size>=MAX_SHARED_TYPES) { + lj_types=MAX_SHARED_TYPES; + shared_types=true; + } + _lj_types=lj_types; + + // Allocate a host write buffer for data initialization + UCL_H_Vec host_write(lj_types*lj_types*32,*(this->ucl_device), + UCL_WRITE_OPTIMIZED); + + for (int i=0; iucl_device),UCL_READ_ONLY); + this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_lj1,host_lj2, + host_cut_ljsq, host_cut_coulsq); + + lj3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); + this->atom->type_pack4(ntypes,lj_types,lj3,host_write,host_lj3,host_lj4, + host_offset); + + cutsq.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); + this->atom->type_pack1(ntypes,lj_types,cutsq,host_write,host_cutsq); + + sp_lj.alloc(8,*(this->ucl_device),UCL_READ_ONLY); + for (int i=0; i<4; i++) { + host_write[i]=host_special_lj[i]; + host_write[i+4]=host_special_coul[i]; + } + ucl_copy(sp_lj,host_write,8,false); + + _qqrd2e=qqrd2e; + + _allocated=true; + this->_max_bytes=lj1.row_bytes()+lj3.row_bytes()+cutsq.row_bytes()+ + sp_lj.row_bytes(); + return true; +} + +template +void LJC_GPU_MemoryT::clear() { + if (!_allocated) + return; + _allocated=false; + + lj1.clear(); + lj3.clear(); + cutsq.clear(); + sp_lj.clear(); + this->clear_atomic(); +} + +template +double LJC_GPU_MemoryT::host_memory_usage() const { + return this->host_memory_usage_atomic()+sizeof(LJC_GPU_Memory); +} + +// --------------------------------------------------------------------------- +// Calculate energies, forces, and torques +// --------------------------------------------------------------------------- +template +void LJC_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) { + // Compute the block size and grid size to keep all cores busy + const int BX=this->block_size(); + int eflag, vflag; + if (_eflag) + eflag=1; + else + eflag=0; + + if (_vflag) + vflag=1; + else + vflag=0; + + int GX=static_cast(ceil(static_cast(this->atom->inum())/BX)); + + int ainum=this->atom->inum(); + int anall=this->atom->nall(); + int 
nbor_pitch=this->nbor->nbor_pitch(); + this->time_pair.start(); + if (shared_types) { + this->k_pair_fast.set_size(GX,BX); + this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(), + &lj3.begin(), &sp_lj.begin(), + &this->nbor->dev_nbor.begin(), + &this->atom->dev_ans.begin(), + &this->atom->dev_engv.begin(), &eflag, &vflag, + &ainum, &anall, &nbor_pitch, + &this->atom->dev_q.begin(), &cutsq.begin(), + &_qqrd2e); + } else { + this->k_pair.set_size(GX,BX); + this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(), + &_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(), + &this->atom->dev_ans.begin(), + &this->atom->dev_engv.begin(), &eflag, &vflag, &ainum, + &anall, &nbor_pitch, &this->atom->dev_q.begin(), + &cutsq.begin(), &_qqrd2e); + } + this->time_pair.stop(); +} + +template class LJC_GPU_Memory; diff --git a/lib/gpu/ljc_cut_gpu_memory.h b/lib/gpu/ljc_cut_gpu_memory.h new file mode 100644 index 0000000000..2d50bd6d16 --- /dev/null +++ b/lib/gpu/ljc_cut_gpu_memory.h @@ -0,0 +1,77 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing authors: Mike Brown (ORNL), brownw@ornl.gov +------------------------------------------------------------------------- */ + +#ifndef LJC_GPU_MEMORY_H +#define LJC_GPU_MEMORY_H + +#include "charge_gpu_memory.h" + +template +class LJC_GPU_Memory : public ChargeGPUMemory { + public: + LJC_GPU_Memory(); + ~LJC_GPU_Memory(); + + /// Clear any previous data and set up for a new LAMMPS run + /** \param max_nbors initial number of rows in the neighbor matrix + * \param cell_size cutoff + skin + * \param gpu_split fraction of particles handled by device **/ + bool init(const int ntypes, double **host_cutsq, double **host_lj1, + double **host_lj2, double **host_lj3, double **host_lj4, + double **host_offset, double *host_special_lj, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, + const double gpu_split, FILE *screen, double **host_cut_ljsq, + double **host_cut_coulsq, double *host_special_coul, + const double qqrd2e); + + /// Clear all host and device data + /** \note This is called at the beginning of the init() routine **/ + void clear(); + + /// Returns memory usage on device per atom + int bytes_per_atom(const int max_nbors) const; + + /// Total host memory used by library for pair style + double host_memory_usage() const; + + // --------------------------- TYPE DATA -------------------------- + + /// lj1.x = lj1, lj1.y = lj2, lj1.z = cutsq_vdw, lj1.w = cutsq_coul + UCL_D_Vec lj1; + /// lj3.x = lj3, lj3.y = lj4, lj3.z = offset + UCL_D_Vec lj3; + /// cutsq + UCL_D_Vec cutsq; + /// Special LJ values [0-3] and Special Coul values [4-7] + UCL_D_Vec sp_lj; + + /// If atom type constants fit in shared memory, use fast kernels + bool shared_types; + + /// Number of atom types + int _lj_types; + + numtyp _qqrd2e; + + private: + bool 
_allocated; + void loop(const bool _eflag, const bool _vflag); +}; + +#endif + diff --git a/lib/gpu/ljcl_cut_gpu.cpp b/lib/gpu/ljcl_cut_gpu.cpp new file mode 100644 index 0000000000..1861350596 --- /dev/null +++ b/lib/gpu/ljcl_cut_gpu.cpp @@ -0,0 +1,130 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing authors: Mike Brown (ORNL), brownw@ornl.gov +------------------------------------------------------------------------- */ + +#include +#include +#include + +#include "ljcl_cut_gpu_memory.h" + +using namespace std; + +static LJCL_GPU_Memory LJCLMF; + +// --------------------------------------------------------------------------- +// Allocate memory on host and device and copy constants to device +// --------------------------------------------------------------------------- +bool ljcl_gpu_init(const int ntypes, double **cutsq, double **host_lj1, + double **host_lj2, double **host_lj3, double **host_lj4, + double **offset, double *special_lj, const int inum, + const int nall, const int max_nbors, const int maxspecial, + const double cell_size, int &gpu_mode, FILE *screen, + double **host_cut_ljsq, double host_cut_coulsq, + double *host_special_coul, const double qqrd2e, + const double g_ewald) { + LJCLMF.clear(); + gpu_mode=LJCLMF.device->gpu_mode(); + double gpu_split=LJCLMF.device->particle_split(); + int first_gpu=LJCLMF.device->first_device(); + int last_gpu=LJCLMF.device->last_device(); + int world_me=LJCLMF.device->world_me(); + int gpu_rank=LJCLMF.device->gpu_rank(); + int procs_per_gpu=LJCLMF.device->procs_per_gpu(); + + LJCLMF.device->init_message(screen,"lj/cut/coul/long",first_gpu,last_gpu); + + bool message=false; + if (world_me==0 && screen) + message=true; + + if (message) { + fprintf(screen,"Initializing GPU and compiling on process 0..."); + fflush(screen); + } + + if (world_me==0) { + bool init_ok=LJCLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, + host_lj4, offset, special_lj, inum, nall, 300, + maxspecial, cell_size, gpu_split, screen, + host_cut_ljsq, host_cut_coulsq, host_special_coul, + qqrd2e,g_ewald); + if (!init_ok) + return false; + } + + MPI_Barrier(MPI_COMM_WORLD); + if (message) + fprintf(screen,"Done.\n"); + + for (int i=0; igpu_comm); + if (message) + fprintf(screen,"Done.\n"); + } + if (message) + fprintf(screen,"\n"); + return true; +} + +void ljcl_gpu_clear() { + LJCLMF.clear(); +} + +int * ljcl_gpu_compute_n(const int timestep, const int ago, const int inum_full, + const int nall, double **host_x, int *host_type, + double *boxlo, double *boxhi, int *tag, int **nspecial, + int **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success, double *host_q) { + return LJCLMF.compute(timestep, ago, inum_full, nall, host_x, host_type, boxlo, + boxhi, tag, nspecial, special, eflag, vflag, eatom, + vatom, host_start, cpu_time, 
success, host_q); +} + +void ljcl_gpu_compute(const int timestep, const int ago, const int inum_full, + const int nall, double **host_x, int *host_type, + int *ilist, int *numj, int **firstneigh, + const bool eflag, const bool vflag, const bool eatom, + const bool vatom, int &host_start, const double cpu_time, + bool &success, double *host_q) { + LJCLMF.compute(timestep,ago,inum_full,nall,host_x,host_type,ilist,numj, + firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success, + host_q); +} + +double ljcl_gpu_bytes() { + return LJCLMF.host_memory_usage(); +} + + diff --git a/lib/gpu/ljcl_cut_gpu_kernel.cu b/lib/gpu/ljcl_cut_gpu_kernel.cu new file mode 100644 index 0000000000..9e2fb3c230 --- /dev/null +++ b/lib/gpu/ljcl_cut_gpu_kernel.cu @@ -0,0 +1,361 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing authors: Mike Brown (ORNL), brownw@ornl.gov +------------------------------------------------------------------------- */ + +#ifndef LJCL_GPU_KERNEL +#define LJCL_GPU_KERNEL + +#define MAX_SHARED_TYPES 8 + +#ifdef _DOUBLE_DOUBLE +#define numtyp double +#define numtyp2 double2 +#define numtyp4 double4 +#define acctyp double +#define acctyp4 double4 +#endif + +#ifdef _SINGLE_DOUBLE +#define numtyp float +#define numtyp2 float2 +#define numtyp4 float4 +#define acctyp double +#define acctyp4 double4 +#endif + +#ifndef numtyp +#define numtyp float +#define numtyp2 float2 +#define numtyp4 float4 +#define acctyp float +#define acctyp4 float4 +#endif + +#define EWALD_F (numtyp)1.12837917 +#define EWALD_P (numtyp)0.3275911 +#define A1 (numtyp)0.254829592 +#define A2 (numtyp)-0.284496736 +#define A3 (numtyp)1.421413741 +#define A4 (numtyp)-1.453152027 +#define A5 (numtyp)1.061405429 + +#ifdef NV_KERNEL + +#include "geryon/ucl_nv_kernel.h" +texture pos_tex; +texture q_tex; + +#ifdef _DOUBLE_DOUBLE +__inline double4 fetch_pos(const int& i, const double4 *pos) +{ + return pos[i]; +} +__inline double fetch_q(const int& i, const double *q) +{ + return q[i]; +} +#else +__inline float4 fetch_pos(const int& i, const float4 *pos) +{ + return tex1Dfetch(pos_tex, i); +} +__inline float fetch_q(const int& i, const float *q) +{ + return tex1Dfetch(q_tex, i); +} +#endif + +#else + +#pragma OPENCL EXTENSION cl_khr_fp64: enable +#define GLOBAL_ID_X get_global_id(0) +#define THREAD_ID_X get_local_id(0) +#define BLOCK_ID_X get_group_id(0) +#define BLOCK_SIZE_X get_local_size(0) +#define __syncthreads() barrier(CLK_LOCAL_MEM_FENCE) +#define __inline inline + +#define fetch_pos(i,y) x_[i] +#define fetch_q(i,y) q_[i] + +#endif + +__kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1, + __global numtyp4* lj3, const int lj_types, + __global numtyp *sp_lj_in, __global int *dev_nbor, + __global acctyp4 *ans, __global acctyp *engv, + const int eflag, const int vflag, const int inum, + const int nall, const int nbor_pitch, + __global numtyp *q_ , const 
numtyp cut_coulsq, + const numtyp qqrd2e, const numtyp g_ewald) { + // ii indexes the two interacting particles in gi + int ii=GLOBAL_ID_X; + __local numtyp sp_lj[8]; + sp_lj[0]=sp_lj_in[0]; + sp_lj[1]=sp_lj_in[1]; + sp_lj[2]=sp_lj_in[2]; + sp_lj[3]=sp_lj_in[3]; + sp_lj[4]=sp_lj_in[4]; + sp_lj[5]=sp_lj_in[5]; + sp_lj[6]=sp_lj_in[6]; + sp_lj[7]=sp_lj_in[7]; + + if (ii0) { + e_coul += prefactor*(_erfc-factor_coul); + if (rsq < lj1[mtype].w) { + numtyp e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y); + energy+=factor_lj*(e-lj3[mtype].z); + } + } + if (vflag>0) { + virial[0] += delx*delx*force; + virial[1] += dely*dely*force; + virial[2] += delz*delz*force; + virial[3] += delx*dely*force; + virial[4] += delx*delz*force; + virial[5] += dely*delz*force; + } + } + + } // for nbor + + // Store answers + __global acctyp *ap1=engv+ii; + if (eflag>0) { + *ap1=energy; + ap1+=inum; + *ap1=e_coul; + ap1+=inum; + } + if (vflag>0) { + for (int i=0; i<6; i++) { + *ap1=virial[i]; + ap1+=inum; + } + } + ans[ii]=f; + } // if ii +} + +__kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in, + __global numtyp4* lj3_in, + __global numtyp* sp_lj_in, __global int *dev_nbor, + __global acctyp4 *ans, __global acctyp *engv, + const int eflag, const int vflag, const int inum, + const int nall, const int nbor_pitch, + __global numtyp *q_ , const numtyp cut_coulsq, + const numtyp qqrd2e, const numtyp g_ewald) { + // ii indexes the two interacting particles in gi + int ii=THREAD_ID_X; + __local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; + __local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; + __local numtyp sp_lj[8]; + if (ii<8) + sp_lj[ii]=sp_lj_in[ii]; + if (ii0) + lj3[ii]=lj3_in[ii]; + } + ii+=mul24((int)BLOCK_ID_X,(int)BLOCK_SIZE_X); + __syncthreads(); + + if (ii0) { + e_coul += prefactor*(_erfc-factor_coul); + if (rsq < lj1[mtype].w) { + numtyp e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y); + energy+=factor_lj*(e-lj3[mtype].z); + } + } + if (vflag>0) { + virial[0] += delx*delx*force; + virial[1] += dely*dely*force; + virial[2] += delz*delz*force; + virial[3] += delx*dely*force; + virial[4] += delx*delz*force; + virial[5] += dely*delz*force; + } + } + + } // for nbor + + // Store answers + __global acctyp *ap1=engv+ii; + if (eflag>0) { + *ap1=energy; + ap1+=inum; + *ap1=e_coul; + ap1+=inum; + } + if (vflag>0) { + for (int i=0; i<6; i++) { + *ap1=virial[i]; + ap1+=inum; + } + } + ans[ii]=f; + } // if ii*/ +} + +#endif + diff --git a/lib/gpu/ljcl_cut_gpu_memory.cpp b/lib/gpu/ljcl_cut_gpu_memory.cpp new file mode 100644 index 0000000000..21716c183d --- /dev/null +++ b/lib/gpu/ljcl_cut_gpu_memory.cpp @@ -0,0 +1,163 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. 
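[Editor's note] The lj/cut/coul/long kernel above evaluates the real-space Ewald term with the standard polynomial approximation of erfc(); EWALD_F, EWALD_P and A1..A5 defined at the top of the file are the Abramowitz & Stegun 7.1.26 coefficients. A host-side sketch of the per-pair Coulomb factors consistent with those constants; variable names follow the kernel, special-bond scaling is omitted, and the function is illustrative rather than a drop-in for the kernel body:

#include <cmath>

static const float EWALD_F = 1.12837917f;   // 2/sqrt(pi)
static const float EWALD_P = 0.3275911f;
static const float A1 =  0.254829592f, A2 = -0.284496736f, A3 = 1.421413741f;
static const float A4 = -1.453152027f, A5 =  1.061405429f;

// Real-space Ewald (coul/long) contribution for one pair at squared distance
// rsq with charges qi, qj. Returns the Coulomb contribution to F/r (to be
// multiplied by delx/dely/delz); e_coul receives the pairwise energy term.
float coul_long_pair(float rsq, float qi, float qj, float qqrd2e,
                     float g_ewald, float &e_coul) {
  float r = sqrtf(rsq);
  float grij = g_ewald * r;
  float expm2 = expf(-grij * grij);
  float t = 1.0f / (1.0f + EWALD_P * grij);
  float erfc = t * (A1 + t * (A2 + t * (A3 + t * (A4 + t * A5)))) * expm2;
  float prefactor = qqrd2e * qi * qj / r;
  float forcecoul = prefactor * (erfc + EWALD_F * grij * expm2);
  e_coul = prefactor * erfc;          // special-bond correction not shown
  return forcecoul / rsq;             // combined with r2inv as in the kernel
}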
+------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing authors: Mike Brown (ORNL), brownw@ornl.gov +------------------------------------------------------------------------- */ + +#ifdef USE_OPENCL +#include "ljcl_cut_gpu_cl.h" +#else +#include "ljcl_cut_gpu_ptx.h" +#endif + +#include "ljcl_cut_gpu_memory.h" +#include +#define LJCL_GPU_MemoryT LJCL_GPU_Memory + +extern PairGPUDevice pair_gpu_device; + +template +LJCL_GPU_MemoryT::LJCL_GPU_Memory() : ChargeGPUMemory(), + _allocated(false) { +} + +template +LJCL_GPU_MemoryT::~LJCL_GPU_Memory() { + clear(); +} + +template +int LJCL_GPU_MemoryT::bytes_per_atom(const int max_nbors) const { + return this->bytes_per_atom_atomic(max_nbors); +} + +template +bool LJCL_GPU_MemoryT::init(const int ntypes, + double **host_cutsq, double **host_lj1, + double **host_lj2, double **host_lj3, + double **host_lj4, double **host_offset, + double *host_special_lj, const int nlocal, + const int nall, const int max_nbors, + const int maxspecial, const double cell_size, + const double gpu_split, FILE *_screen, + double **host_cut_ljsq, const double host_cut_coulsq, + double *host_special_coul, const double qqrd2e, + const double g_ewald) { + this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split, + _screen,ljcl_cut_gpu_kernel); + + // If atom type constants fit in shared memory use fast kernel + int lj_types=ntypes; + shared_types=false; + if (lj_types<=MAX_SHARED_TYPES && this->_block_size>=MAX_SHARED_TYPES) { + lj_types=MAX_SHARED_TYPES; + shared_types=true; + } + _lj_types=lj_types; + + // Allocate a host write buffer for data initialization + UCL_H_Vec host_write(lj_types*lj_types*32,*(this->ucl_device), + UCL_WRITE_OPTIMIZED); + + for (int i=0; iucl_device),UCL_READ_ONLY); + this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_lj1,host_lj2, + host_cutsq, host_cut_ljsq); + + lj3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); + this->atom->type_pack4(ntypes,lj_types,lj3,host_write,host_lj3,host_lj4, + host_offset); + + sp_lj.alloc(8,*(this->ucl_device),UCL_READ_ONLY); + for (int i=0; i<4; i++) { + host_write[i]=host_special_lj[i]; + host_write[i+4]=host_special_coul[i]; + } + ucl_copy(sp_lj,host_write,8,false); + + _cut_coulsq=host_cut_coulsq; + _qqrd2e=qqrd2e; + _g_ewald=g_ewald; + + _allocated=true; + this->_max_bytes=lj1.row_bytes()+lj3.row_bytes()+sp_lj.row_bytes(); + return true; +} + +template +void LJCL_GPU_MemoryT::clear() { + if (!_allocated) + return; + _allocated=false; + + lj1.clear(); + lj3.clear(); + sp_lj.clear(); + this->clear_atomic(); +} + +template +double LJCL_GPU_MemoryT::host_memory_usage() const { + return this->host_memory_usage_atomic()+sizeof(LJCL_GPU_Memory); +} + +// --------------------------------------------------------------------------- +// Calculate energies, forces, and torques +// --------------------------------------------------------------------------- +template +void LJCL_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) { + // Compute the block size and grid size to keep all cores busy + const int BX=this->block_size(); + int eflag, vflag; + if (_eflag) + eflag=1; + else + eflag=0; + + if (_vflag) + vflag=1; + else + vflag=0; + + int GX=static_cast(ceil(static_cast(this->atom->inum())/BX)); + + int ainum=this->atom->inum(); + int anall=this->atom->nall(); + int nbor_pitch=this->nbor->nbor_pitch(); + this->time_pair.start(); + if (shared_types) { + 
this->k_pair_fast.set_size(GX,BX); + this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(), + &lj3.begin(), &sp_lj.begin(), + &this->nbor->dev_nbor.begin(), + &this->atom->dev_ans.begin(), + &this->atom->dev_engv.begin(), &eflag, &vflag, + &ainum, &anall, &nbor_pitch, + &this->atom->dev_q.begin(), &_cut_coulsq, + &_qqrd2e, &_g_ewald); + } else { + this->k_pair.set_size(GX,BX); + this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(), + &_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(), + &this->atom->dev_ans.begin(), + &this->atom->dev_engv.begin(), &eflag, &vflag, &ainum, + &anall, &nbor_pitch, &this->atom->dev_q.begin(), + &_cut_coulsq, &_qqrd2e, &_g_ewald); + } + this->time_pair.stop(); +} + +template class LJCL_GPU_Memory; diff --git a/lib/gpu/ljcl_cut_gpu_memory.h b/lib/gpu/ljcl_cut_gpu_memory.h new file mode 100644 index 0000000000..59379cb4c8 --- /dev/null +++ b/lib/gpu/ljcl_cut_gpu_memory.h @@ -0,0 +1,75 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing authors: Mike Brown (ORNL), brownw@ornl.gov +------------------------------------------------------------------------- */ + +#ifndef LJCL_GPU_MEMORY_H +#define LJCL_GPU_MEMORY_H + +#include "charge_gpu_memory.h" + +template +class LJCL_GPU_Memory : public ChargeGPUMemory { + public: + LJCL_GPU_Memory(); + ~LJCL_GPU_Memory(); + + /// Clear any previous data and set up for a new LAMMPS run + /** \param max_nbors initial number of rows in the neighbor matrix + * \param cell_size cutoff + skin + * \param gpu_split fraction of particles handled by device **/ + bool init(const int ntypes, double **host_cutsq, + double **host_lj1, double **host_lj2, double **host_lj3, + double **host_lj4, double **host_offset, double *host_special_lj, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, + const double gpu_split, FILE *screen, double **host_cut_ljsq, + const double host_cut_coulsq, double *host_special_coul, + const double qqrd2e, const double g_ewald); + + /// Clear all host and device data + /** \note This is called at the beginning of the init() routine **/ + void clear(); + + /// Returns memory usage on device per atom + int bytes_per_atom(const int max_nbors) const; + + /// Total host memory used by library for pair style + double host_memory_usage() const; + + // --------------------------- TYPE DATA -------------------------- + + /// lj1.x = lj1, lj1.y = lj2, lj1.z = cutsq, lj1.w = cutsq_vdw + UCL_D_Vec lj1; + /// lj3.x = lj3, lj3.y = lj4, lj3.z = offset + UCL_D_Vec lj3; + /// Special LJ values [0-3] and Special Coul values [4-7] + UCL_D_Vec sp_lj; + + /// If atom type constants fit in shared memory, use fast kernels + bool shared_types; + + /// Number of atom types + int _lj_types; + + numtyp _cut_coulsq, _qqrd2e, _g_ewald; + + private: + bool _allocated; + void loop(const bool _eflag, const 
bool _vflag); +}; + +#endif + diff --git a/lib/gpu/nvc_device.cu b/lib/gpu/nvc_device.cu deleted file mode 100644 index 3f2d81228a..0000000000 --- a/lib/gpu/nvc_device.cu +++ /dev/null @@ -1,94 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - Copyright (2003) Sandia Corporation. Under the terms of Contract - DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains - certain rights in this software. This software is distributed under - the GNU General Public License. - - See the README file in the top-level LAMMPS directory. -------------------------------------------------------------------------- */ - -/* ---------------------------------------------------------------------- - Contributing authors: Mike Brown (SNL), wmbrown@sandia.gov - Peng Wang (Nvidia), penwang@nvidia.com - Paul Crozier (SNL), pscrozi@sandia.gov -------------------------------------------------------------------------- */ - -#include -#include -#include -#include "nvc_macros.h" -#include "nvc_device.h" - -// Grabs the properties for all devices -void NVCDevice::init() { - _properties.clear(); - - CUDA_SAFE_CALL(cudaGetDeviceCount(&_num_devices)); - for (int dev=0; dev<_num_devices; ++dev) { - cudaDeviceProp deviceProp; - CUDA_SAFE_CALL(cudaGetDeviceProperties(&deviceProp, dev)); - if (deviceProp.major == 9999 && deviceProp.minor == 9999) - break; - _properties.push_back(deviceProp); - } - _device=0; -} - -// Set the CUDA device to the specified device number -void NVCDevice::set(int num) { - if (_device==num) - return; - cudaThreadExit(); - CUDA_SAFE_CALL(cudaSetDevice(num)); - _device=num; -} - -// List all devices along with all properties -void NVCDevice::print_all(ostream &out) { - if (num_devices() == 0) - printf("There is no device supporting CUDA\n"); - for (int i=0; i= 2000 - printf(" Number of multiprocessors: %d\n", - _properties[i].multiProcessorCount); - printf(" Number of cores: %d\n",cores(i)); - #endif - printf(" Total amount of constant memory: %u bytes\n", - _properties[i].totalConstMem); - printf(" Total amount of shared memory per block: %u bytes\n", - _properties[i].sharedMemPerBlock); - printf(" Total number of registers available per block: %d\n", - _properties[i].regsPerBlock); - printf(" Warp size: %d\n", - _properties[i].warpSize); - printf(" Maximum number of threads per block: %d\n", - _properties[i].maxThreadsPerBlock); - printf(" Maximum sizes of each dimension of a block: %d x %d x %d\n", - _properties[i].maxThreadsDim[0], - _properties[i].maxThreadsDim[1], - _properties[i].maxThreadsDim[2]); - printf(" Maximum sizes of each dimension of a grid: %d x %d x %d\n", - _properties[i].maxGridSize[0], - _properties[i].maxGridSize[1], - _properties[i].maxGridSize[2]); - printf(" Maximum memory pitch: %u bytes\n", - _properties[i].memPitch); - printf(" Texture alignment: %u bytes\n", - _properties[i].textureAlignment); - printf(" Clock rate: %.2f GHz\n", - clock_rate(i)); - #if CUDART_VERSION >= 2000 - printf(" Concurrent copy and execution: %s\n", - _properties[i].deviceOverlap ? 
"Yes" : "No"); - #endif - } -} - diff --git a/lib/gpu/nvc_device.h b/lib/gpu/nvc_device.h deleted file mode 100644 index 61a4bb003a..0000000000 --- a/lib/gpu/nvc_device.h +++ /dev/null @@ -1,93 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - Copyright (2003) Sandia Corporation. Under the terms of Contract - DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains - certain rights in this software. This software is distributed under - the GNU General Public License. - - See the README file in the top-level LAMMPS directory. -------------------------------------------------------------------------- */ - -/* ---------------------------------------------------------------------- - Contributing authors: Mike Brown (SNL), wmbrown@sandia.gov - Peng Wang (Nvidia), penwang@nvidia.com - Paul Crozier (SNL), pscrozi@sandia.gov -------------------------------------------------------------------------- */ - -#ifndef NVC_DEVICE -#define NVC_DEVICE - -#include -#include -#include - -using namespace std; - -/// Class for looking at device properties -/** \note Calls to change the device outside of the class results in incorrect - * behavior - * \note There is no error checking for indexing past the number of devices - * \note init() at least once before using any of the routines **/ -class NVCDevice { - public: - /// Grabs the properties for all devices - /** \note init() must be called following construction before any routines **/ - NVCDevice() {} - - /// Collect properties for every GPU on the node and set active GPU to ID 0 - void init(); - - /// Return the number of devices that support CUDA - inline int num_devices() { return _properties.size(); } - - /// Set the CUDA device to the specified device number - void set(int num); - - /// Get the current device number - inline int device_num() { return _device; } - - /// Get the current CUDA device name - inline string name() { return name(_device); } - /// Get the CUDA device name - inline string name(const int i) { return string(_properties[i].name); } - - /// Get the number of cores in the current device - inline unsigned cores() { return cores(_device); } - /// Get the number of cores - inline unsigned cores(const int i) - { return _properties[i].multiProcessorCount*8; } - - /// Get the gigabytes of global memory in the current device - inline double gigabytes() { return gigabytes(_device); } - /// Get the gigabytes of global memory - inline double gigabytes(const int i) - { return static_cast(_properties[i].totalGlobalMem)/1073741824; } - - /// Get the bytes of global memory in the current device - inline size_t bytes() { return bytes(_device); } - /// Get the bytes of global memory - inline size_t bytes(const int i) { return _properties[i].totalGlobalMem; } - - /// Return the GPGPU revision number for current device - inline double revision() { return revision(_device); } - /// Return the GPGPU revision number - inline double revision(const int i) - { return static_cast(_properties[i].minor)/10+_properties[i].major;} - - /// Clock rate in GHz for current device - inline double clock_rate() { return clock_rate(_device); } - /// Clock rate in GHz - inline double clock_rate(const int i) { return _properties[i].clockRate*1e-6;} - - /// List all devices along with all properties - void print_all(ostream &out); - - private: - int _device, 
_num_devices; - vector _properties; -}; - -#endif diff --git a/lib/gpu/nvc_get_devices.cu b/lib/gpu/nvc_get_devices.cu deleted file mode 100644 index 6b54f10f41..0000000000 --- a/lib/gpu/nvc_get_devices.cu +++ /dev/null @@ -1,33 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - Copyright (2003) Sandia Corporation. Under the terms of Contract - DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains - certain rights in this software. This software is distributed under - the GNU General Public License. - -------------------------------------------------------------------------- */ - -/* ---------------------------------------------------------------------- - Contributing authors: Mike Brown (SNL), wmbrown@sandia.gov - Peng Wang (Nvidia), penwang@nvidia.com - Paul Crozier (SNL), pscrozi@sandia.gov -------------------------------------------------------------------------- */ - -#if defined(__APPLE__) -#if _GLIBCXX_ATOMIC_BUILTINS == 1 -#undef _GLIBCXX_ATOMIC_BUILTINS -#endif // _GLIBCXX_ATOMIC_BUILTINS -#endif // __APPLE__ - -#include "nvc_device.h" - -int main(int argc, char** argv) { - NVCDevice gpu; - gpu.init(); - gpu.print_all(cout); - return 0; -} - diff --git a/lib/gpu/nvc_macros.h b/lib/gpu/nvc_macros.h deleted file mode 100644 index ef61684c46..0000000000 --- a/lib/gpu/nvc_macros.h +++ /dev/null @@ -1,146 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - Copyright (2003) Sandia Corporation. Under the terms of Contract - DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains - certain rights in this software. This software is distributed under - the GNU General Public License. - - See the README file in the top-level LAMMPS directory. 
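[Editor's note] The deleted nvc_device.cu/nvc_device.h and nvc_get_devices.cu implemented a small device-query utility built on cudaGetDeviceCount() and cudaGetDeviceProperties(). A minimal standalone sketch of the same idea, reporting a subset of the properties the removed print_all() listed (host-side CUDA runtime code; the formatting is illustrative):

#include <cstdio>
#include <cuda_runtime.h>

// Minimal stand-in for the removed nvc_get_devices utility: list every CUDA
// device along with a few of the properties NVCDevice::print_all() reported.
int main() {
  int n = 0;
  if (cudaGetDeviceCount(&n) != cudaSuccess || n == 0) {
    printf("There is no device supporting CUDA\n");
    return 1;
  }
  for (int i = 0; i < n; i++) {
    cudaDeviceProp p;
    cudaGetDeviceProperties(&p, i);
    printf("Device %d: \"%s\"\n", i, p.name);
    printf("  Compute capability (revision): %d.%d\n", p.major, p.minor);
    printf("  Global memory: %.2f GB\n",
           static_cast<double>(p.totalGlobalMem) / 1073741824.0);
    printf("  Multiprocessors: %d\n", p.multiProcessorCount);
    printf("  Clock rate: %.2f GHz\n", p.clockRate * 1e-6);
  }
  return 0;
}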
-------------------------------------------------------------------------- */ - -/* ---------------------------------------------------------------------- - Contributing authors: Mike Brown (SNL), wmbrown@sandia.gov - Peng Wang (Nvidia), penwang@nvidia.com - Paul Crozier (SNL), pscrozi@sandia.gov -------------------------------------------------------------------------- */ - -#ifndef NVC_MACROS_H -#define NVC_MACROS_H - -#if defined(__APPLE__) -#if _GLIBCXX_ATOMIC_BUILTINS == 1 -#undef _GLIBCXX_ATOMIC_BUILTINS -#endif // _GLIBCXX_ATOMIC_BUILTINS -#endif // __APPLE__ - -#include -#include "math_constants.h" -#define INT_MUL(x,y) (__mul24(x,y)) -//#define INT_MUL(x,y) ((x)*(y)) - -template -static __inline__ __device__ numbr cuda_inf() { return CUDART_INF_F; } - -#ifdef CUDA_DOUBLE -template <> -static __inline__ __device__ double cuda_inf() { return CUDART_INF; } -#endif - -template -static __inline__ __device__ numbr cuda_zero() { return 0.0; } - -template <> -static __inline__ __device__ float cuda_zero() { return 0.0f; } - -#ifndef NO_DEBUG - -# define CU_SAFE_CALL_NO_SYNC( call ) do { \ - CUresult err = call; \ - if( CUDA_SUCCESS != err) { \ - fprintf(stderr, "Cuda driver error %x in file '%s' in line %i.\n", \ - err, __FILE__, __LINE__ ); \ - exit(EXIT_FAILURE); \ - } } while (0) - -# define CU_SAFE_CALL( call ) do { \ - CU_SAFE_CALL_NO_SYNC(call); \ - CUresult err = cuCtxSynchronize(); \ - if( CUDA_SUCCESS != err) { \ - fprintf(stderr, "Cuda driver error %x in file '%s' in line %i.\n", \ - err, __FILE__, __LINE__ ); \ - exit(EXIT_FAILURE); \ - } } while (0) - -# define CUDA_SAFE_CALL_NO_SYNC( call) do { \ - cudaError err = call; \ - if( cudaSuccess != err) { \ - fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n", \ - __FILE__, __LINE__, cudaGetErrorString( err) ); \ - exit(EXIT_FAILURE); \ - } } while (0) - -# define CUDA_SAFE_CALL( call) do { \ - CUDA_SAFE_CALL_NO_SYNC(call); \ - cudaError err = cudaThreadSynchronize(); \ - if( cudaSuccess != err) { \ - fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n", \ - __FILE__, __LINE__, cudaGetErrorString( err) ); \ - exit(EXIT_FAILURE); \ - } } while (0) - -# define CUFFT_SAFE_CALL( call) do { \ - cufftResult err = call; \ - if( CUFFT_SUCCESS != err) { \ - fprintf(stderr, "CUFFT error in file '%s' in line %i.\n", \ - __FILE__, __LINE__); \ - exit(EXIT_FAILURE); \ - } } while (0) - -# define CUT_SAFE_CALL( call) \ - if( CUTTrue != call) { \ - fprintf(stderr, "Cut error in file '%s' in line %i.\n", \ - __FILE__, __LINE__); \ - exit(EXIT_FAILURE); \ - } - - //! Check for CUDA error -# define CUT_CHECK_ERROR(errorMessage) do { \ - cudaError_t err = cudaGetLastError(); \ - if( cudaSuccess != err) { \ - fprintf(stderr, "Cuda error: %s in file '%s' in line %i : %s.\n", \ - errorMessage, __FILE__, __LINE__, cudaGetErrorString( err) );\ - exit(EXIT_FAILURE); \ - } \ - err = cudaThreadSynchronize(); \ - if( cudaSuccess != err) { \ - fprintf(stderr, "Cuda error: %s in file '%s' in line %i : %s.\n", \ - errorMessage, __FILE__, __LINE__, cudaGetErrorString( err) );\ - exit(EXIT_FAILURE); \ - } } while (0) - - //! Check for malloc error -# define CUT_SAFE_MALLOC( mallocCall ) do{ \ - if( !(mallocCall)) { \ - fprintf(stderr, "Host malloc failure in file '%s' in line %i\n", \ - __FILE__, __LINE__); \ - exit(EXIT_FAILURE); \ - } } while(0); - - //! 
Check if conditon is true (flexible assert) -# define CUT_CONDITION( val) \ - if( CUTFalse == cutCheckCondition( val, __FILE__, __LINE__)) { \ - exit(EXIT_FAILURE); \ - } - -#else // not DEBUG - -#define CUT_BANK_CHECKER( array, index) array[index] - - // void macros for performance reasons -# define CUT_CHECK_ERROR(errorMessage) -# define CUT_CHECK_ERROR_GL() -# define CUT_CONDITION( val) -# define CU_SAFE_CALL_NO_SYNC( call) call -# define CU_SAFE_CALL( call) call -# define CUDA_SAFE_CALL_NO_SYNC( call) call -# define CUDA_SAFE_CALL( call) call -# define CUT_SAFE_CALL( call) call -# define CUFFT_SAFE_CALL( call) call -# define CUT_SAFE_MALLOC( mallocCall ) mallocCall - -#endif - -#endif diff --git a/lib/gpu/nvc_memory.h b/lib/gpu/nvc_memory.h deleted file mode 100644 index 821ff7b568..0000000000 --- a/lib/gpu/nvc_memory.h +++ /dev/null @@ -1,522 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - Copyright (2003) Sandia Corporation. Under the terms of Contract - DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains - certain rights in this software. This software is distributed under - the GNU General Public License. - - See the README file in the top-level LAMMPS directory. -------------------------------------------------------------------------- */ - -/* ---------------------------------------------------------------------- - Contributing authors: Mike Brown (SNL), wmbrown@sandia.gov - Peng Wang (Nvidia), penwang@nvidia.com - Paul Crozier (SNL), pscrozi@sandia.gov -------------------------------------------------------------------------- */ - -#ifndef NVC_MEMORY_H -#define NVC_MEMORY_H - -#include -#include "nvc_macros.h" - -#define NVC_HostT NVC_Host -#define NVC_HostD NVC_Host -#define NVC_HostS NVC_Host -#define NVC_HostI NVC_Host - -#define NVC_VecT NVC_Vec -#define NVC_VecD NVC_Vec -#define NVC_VecS NVC_Vec -#define NVC_VecI NVC_Vec -#define NVC_VecI2 NVC_Vec -#define NVC_VecU2 NVC_Vec - -#define NVC_MatT NVC_Mat -#define NVC_MatD NVC_Mat -#define NVC_MatS NVC_Mat -#define NVC_MatI NVC_Mat - -#define NVC_ConstMatT NVC_ConstMat -#define NVC_ConstMatD NVC_ConstMat -#define NVC_ConstMatS NVC_ConstMat -#define NVC_ConstMatI NVC_ConstMat -#define NVC_ConstMatD2 NVC_ConstMat - -namespace NVC { - -// Get a channel for float array -template -inline void cuda_gb_get_channel(cudaChannelFormatDesc &channel) { - channel = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat); -} - -// Get a channel for float2 array -template <> -inline void cuda_gb_get_channel(cudaChannelFormatDesc &channel) { - channel = cudaCreateChannelDesc(32, 32, 0, 0, cudaChannelFormatKindFloat); -} - -// Get a channel for double array -template <> -inline void cuda_gb_get_channel(cudaChannelFormatDesc &channel) { - channel = cudaCreateChannelDesc(32, 32, 0, 0, cudaChannelFormatKindSigned); -} - -// Get a channel for double array -template <> -inline void cuda_gb_get_channel(cudaChannelFormatDesc &channel) { - channel = cudaCreateChannelDesc(32, 32, 32, 32, cudaChannelFormatKindSigned); -} - -// Get a channel for int array -template <> -inline void cuda_gb_get_channel(cudaChannelFormatDesc &channel) { - channel = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindSigned); -} - -} - -/// Page-locked Row Vector on Host -template -class NVC_Host { - public: - NVC_Host() { _cols=0; } - ~NVC_Host() 
{ if (_cols>0) CUDA_SAFE_CALL(cudaFreeHost(_array)); } - - // Allocate page-locked memory with fast write/slow read on host - inline void safe_alloc_w(const size_t cols) { - _cols=cols; - _row_bytes=cols*sizeof(numtyp); - CUDA_SAFE_CALL(cudaHostAlloc((void **)&_array,_row_bytes, - cudaHostAllocWriteCombined)); - _end=_array+cols; - } - - // Allocate page-locked memory with fast write/slow read on host - inline bool alloc_w(const size_t cols) { - _cols=cols; - _row_bytes=cols*sizeof(numtyp); - if (cudaHostAlloc((void **)&_array,_row_bytes,cudaHostAllocWriteCombined)!= - cudaSuccess) - return false; - _end=_array+cols; - return true; - } - - // Allocate page-locked memory with fast read/write on host - inline void safe_alloc_rw(const size_t cols) { - _cols=cols; - _row_bytes=cols*sizeof(numtyp); - CUDA_SAFE_CALL(cudaMallocHost((void **)&_array,_row_bytes)); - _end=_array+cols; - } - - // Allocate page-locked memory with fast read/write on host - inline bool alloc_rw(const size_t cols) { - _cols=cols; - _row_bytes=cols*sizeof(numtyp); - if (cudaMallocHost((void **)&_array,_row_bytes)!=cudaSuccess) - return false; - _end=_array+cols; - return true; - } - - /// Free any memory associated with device - inline void clear() - { if (_cols>0) { _cols=0; CUDA_SAFE_CALL(cudaFreeHost(_array)); } } - - /// Set each element to zero - inline void zero() { memset(_array,0,row_bytes()); } - - /// Set first n elements to zero - inline void zero(const int n) { memset(_array,0,n*sizeof(numtyp)); } - - inline numtyp * begin() { return _array; } - inline const numtyp * begin() const { return _array; } - inline numtyp * end() { return _end; } - inline const numtyp * end() const { return _end; } - - inline size_t numel() const { return _cols; } - inline size_t rows() const { return 1; } - inline size_t cols() const { return _cols; } - inline size_t row_size() const { return _cols; } - inline size_t row_bytes() const { return _row_bytes; } - - inline numtyp & operator[](const int i) { return _array[i]; } - inline const numtyp & operator[](const int i) const { return _array[i]; } - - /// Copy from device (numel is not bytes) - inline void copy_from_device(const numtyp *device_p, size_t numel) { - CUDA_SAFE_CALL(cudaMemcpy(_array,device_p,numel*sizeof(numtyp), - cudaMemcpyDeviceToHost)); - } - - /// Copy to device (numel is not bytes) - inline void copy_to_device(numtyp *device_p, size_t numel) { - CUDA_SAFE_CALL(cudaMemcpy(device_p,_array,numel*sizeof(numtyp), - cudaMemcpyHostToDevice)); - } - - /// Copy to 2D matrix on device (numel is not bytes) - inline void copy_to_2Ddevice(numtyp *device_p, const size_t dev_row_size, - const size_t rows, const size_t cols) { - CUDA_SAFE_CALL(cudaMemcpy2D(device_p,dev_row_size*sizeof(numtyp), - _array,cols*sizeof(numtyp), - cols*sizeof(numtyp),rows, - cudaMemcpyHostToDevice)); - } - - /// Asynchronous copy from device (numel is not bytes) - inline void copy_from_device(const numtyp *device_p, size_t numel, - cudaStream_t &stream) { - CUDA_SAFE_CALL_NO_SYNC(cudaMemcpyAsync(_array,device_p,numel*sizeof(numtyp), - cudaMemcpyDeviceToHost,stream)); - } - - /// Asynchronous copy to device (numel is not bytes) - inline void copy_to_device(numtyp *device_p, size_t numel, - cudaStream_t &stream) { - CUDA_SAFE_CALL_NO_SYNC(cudaMemcpyAsync(device_p,_array,numel*sizeof(numtyp), - cudaMemcpyHostToDevice,stream)); - } - - /// Asynchronous copy to device (numel is not bytes) - inline void copy_to_device(size_t offset, numtyp *device_p, size_t numel, - cudaStream_t &stream) { - 
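/* [Editor's note] NVC_Host manages page-locked (pinned) host memory, which
   is what lets the cudaMemcpyAsync below actually overlap with kernel
   execution; on ordinary pageable memory the copy falls back to a staged,
   effectively synchronous transfer.  A minimal usage sketch, with buffer
   name, size and stream purely illustrative:

     NVC_HostD buf;                        // NVC_Host of double, see macros above
     buf.safe_alloc_w(n);                  // write-combined: fast host writes
     for (int i = 0; i < n; i++) buf[i] = host_data[i];
     buf.copy_to_device(dev_ptr, n, stream);   // asynchronous H2D copy
     cudaStreamSynchronize(stream);        // sync before touching buf again
*/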
CUDA_SAFE_CALL_NO_SYNC(cudaMemcpyAsync(device_p,_array+offset,numel*sizeof(numtyp), - cudaMemcpyHostToDevice,stream)); - } - - /// Asynchronous copy to 2D matrix on device (numel is not bytes) - inline void copy_to_2Ddevice(numtyp *device_p, const size_t dev_row_size, - const size_t rows, const size_t cols, - cudaStream_t &stream) { - CUDA_SAFE_CALL_NO_SYNC(cudaMemcpy2DAsync(device_p, - dev_row_size*sizeof(numtyp), - _array,cols*sizeof(numtyp), - cols*sizeof(numtyp),rows, - cudaMemcpyHostToDevice,stream)); - } - - private: - numtyp *_array, *_end; - size_t _row_bytes, _row_size, _rows, _cols; -}; - -/// Row vector on device -template -class NVC_Vec { - public: - NVC_Vec() { _cols=0; } - ~NVC_Vec() { if (_cols>0) CUDA_SAFE_CALL(cudaFree(_array)); } - - // Row vector on device - inline void safe_alloc(const size_t cols) { - _cols=cols; - _row_bytes=cols*sizeof(numtyp); - CUDA_SAFE_CALL(cudaMalloc((void **)&_array,_row_bytes)); - _end=_array+cols; - } - - // Row vector on device - inline bool alloc(const size_t cols) { - _cols=cols; - _row_bytes=cols*sizeof(numtyp); - if (cudaMalloc((void **)&_array,_row_bytes)!=cudaSuccess) - return false; - _end=_array+cols; - return true; - } - - // Row vector on device (allocate and assign texture and bind) - inline void safe_alloc(const size_t cols, textureReference *t) - { safe_alloc(cols); assign_texture(t); bind(); } - - /// Free any memory associated with device - inline void clear() - { if (_cols>0) { _cols=0; CUDA_SAFE_CALL(cudaFree(_array)); } } - - /// Set each element to zero - inline void zero() { CUDA_SAFE_CALL(cudaMemset(_array,0,row_bytes())); } - - inline numtyp * begin() { return _array; } - inline const numtyp * begin() const { return _array; } - inline numtyp * end() { return _end; } - inline const numtyp * end() const { return _end; } - - inline size_t numel() const { return _cols; } - inline size_t rows() const { return 1; } - inline size_t cols() const { return _cols; } - inline size_t row_size() const { return _cols; } - inline size_t row_bytes() const { return _row_bytes; } - - /// Copy from host - inline void copy_from_host(const numtyp *host_p) - { CUDA_SAFE_CALL(cudaMemcpy(_array,host_p,row_bytes(), - cudaMemcpyHostToDevice)); } - - /// Copy from host (n elements) - inline void copy_from_host(const numtyp *host_p, const size_t n) - { CUDA_SAFE_CALL(cudaMemcpy(_array,host_p,n*sizeof(numtyp), - cudaMemcpyHostToDevice)); } - - /// Asynchronous copy from host - inline void copy_from_host(const numtyp *host_p, cudaStream_t &stream) - { CUDA_SAFE_CALL_NO_SYNC(cudaMemcpyAsync(_array,host_p,row_bytes(), - cudaMemcpyHostToDevice, stream)); } - - /// Asynchronous copy from host (n elements) - inline void copy_from_host(const numtyp *host_p, const size_t n, - cudaStream_t &stream) - { CUDA_SAFE_CALL_NO_SYNC(cudaMemcpyAsync(_array,host_p,n*sizeof(numtyp), - cudaMemcpyHostToDevice, stream)); } - - /// Copy to host - inline void copy_to_host(numtyp *host_p) - { CUDA_SAFE_CALL(cudaMemcpy(host_p,_array,row_bytes(), - cudaMemcpyDeviceToHost)); } - - /// Copy n elements to host - inline void copy_to_host(numtyp *host_p, const int n) - { CUDA_SAFE_CALL(cudaMemcpy(host_p,_array,n*sizeof(numtyp), - cudaMemcpyDeviceToHost)); } - - /// Cast and then copy to device - template - inline void cast_copy(const numtyp2 *buffer, NVC_HostT &host_write) { - for (int i=0; i(buffer[i]); - copy_from_host(host_write.begin()); - } - - /// Assign a texture to matrix - inline void assign_texture(textureReference *t) { _tex_ptr=t; } - - /// Bind to texture - inline void 
bind() { - NVC::cuda_gb_get_channel(_channel); - (*_tex_ptr).addressMode[0] = cudaAddressModeClamp; - (*_tex_ptr).addressMode[1] = cudaAddressModeClamp; - (*_tex_ptr).filterMode = cudaFilterModePoint; - (*_tex_ptr).normalized = false; - CUDA_SAFE_CALL(cudaBindTexture(NULL,_tex_ptr,_array,&_channel)); - } - - /// Unbind texture - inline void unbind() { CUDA_SAFE_CALL(cudaUnbindTexture(_tex_ptr)); } - - /// Output the vector (debugging) - inline void print(std::ostream &out) { print (out, numel()); } - - // Output first n elements of vector - inline void print(std::ostream &out, const int n) { - numtyp *t=new numtyp[n]; - copy_to_host(t,n); - for (int i=0; i -class NVC_Mat { - public: - NVC_Mat() { _rows=0; } - ~NVC_Mat() { if (_rows>0) CUDA_SAFE_CALL(cudaFree(_array)); } - - // Row major matrix on device - // - Coalesced access using adjacent cols on same row - // - NVC_Mat(row,col) given by array[row*row_size()+col] - inline void safe_alloc(const size_t rows, const size_t cols) { - _rows=rows; - _cols=cols; - CUDA_SAFE_CALL(cudaMallocPitch((void **)&_array,&_pitch, - cols*sizeof(numtyp),rows)); - _row_size=_pitch/sizeof(numtyp); - _end=_array+_row_size*cols; - } - - /// Free any memory associated with device - inline void clear() - { if (_rows>0) { _rows=0; CUDA_SAFE_CALL(cudaFree(_array)); } } - - /// Set each element to zero - inline void zero() { CUDA_SAFE_CALL(cudaMemset(_array,0, _pitch*_rows)); } - - inline numtyp * begin() { return _array; } - inline const numtyp * begin() const { return _array; } - inline numtyp * end() { return _end; } - inline const numtyp * end() const { return _end; } - - - inline size_t numel() const { return _cols*_rows; } - inline size_t rows() const { return _rows; } - inline size_t cols() const { return _cols; } - inline size_t row_size() const { return _row_size; } - inline size_t row_bytes() const { return _pitch; } - - /// Copy from host (elements not bytes) - inline void copy_from_host(const numtyp *host_p, const size_t numel) - { CUDA_SAFE_CALL(cudaMemcpy(_array,host_p,numel*sizeof(numtyp), - cudaMemcpyHostToDevice)); } - - /// Asynchronous copy from host (elements not bytes) - inline void copy_from_host(const numtyp *host_p, const size_t numel, - cudaStream_t &stream) - { CUDA_SAFE_CALL_NO_SYNC(cudaMemcpyAsync(_array,host_p,numel*sizeof(numtyp), - cudaMemcpyHostToDevice, stream)); } - - /// Asynchronous Copy from Host - /** \note Used when the number of columns/rows allocated on host smaller than - * on device **/ - inline void copy_2Dfrom_host(const numtyp *host_p, const size_t rows, - const size_t cols, cudaStream_t &stream) { - CUDA_SAFE_CALL_NO_SYNC(cudaMemcpy2DAsync(_array, _pitch, host_p, - cols*sizeof(numtyp), cols*sizeof(numtyp), rows, - cudaMemcpyHostToDevice,stream)); - } - - private: - numtyp *_array, *_end; - size_t _pitch, _row_size, _rows, _cols; -}; - -/// Const 2D Matrix on device (requires texture binding) -template -class NVC_ConstMat { - public: - NVC_ConstMat() { _rows=0; } - ~NVC_ConstMat() { if (_rows>0) CUDA_SAFE_CALL(cudaFreeArray(_array)); } - - /// Assign a texture to matrix - inline void assign_texture(textureReference *t) { _tex_ptr=t; } - - /// Row major matrix on device - inline void safe_alloc(const size_t rows, const size_t cols) { - _rows=rows; - _cols=cols; - - NVC::cuda_gb_get_channel(_channel); - CUDA_SAFE_CALL(cudaMallocArray(&_array, &_channel, cols, rows)); - } - - /// Row major matrix on device (Allocate and bind texture) - inline void safe_alloc(const size_t rows, const size_t cols, - textureReference *t) - { 
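/* [Editor's note] Allocating and immediately binding to a texture reference
   is done so kernels can read this data through the texture cache
   (tex1Dfetch), which mattered on pre-Fermi GPUs where plain global loads
   were uncached; cuda_gb_get_channel() picks a channel descriptor whose
   width matches the byte size of numtyp so the fetched values are unpacked
   correctly. */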
safe_alloc(rows,cols); assign_texture(t); bind(); } - - /// Bind to texture - inline void bind() { - (*_tex_ptr).addressMode[0] = cudaAddressModeClamp; - (*_tex_ptr).addressMode[1] = cudaAddressModeClamp; - (*_tex_ptr).filterMode = cudaFilterModePoint; - (*_tex_ptr).normalized = false; - CUDA_SAFE_CALL(cudaBindTextureToArray(_tex_ptr,_array,&_channel)); - } - - /// Unbind texture - inline void unbind() { CUDA_SAFE_CALL(cudaUnbindTexture(_tex_ptr)); } - - /// Free any memory associated with device and unbind - inline void clear() { - if (_rows>0) { - _rows=0; - CUDA_SAFE_CALL(cudaUnbindTexture(_tex_ptr)); - CUDA_SAFE_CALL(cudaFreeArray(_array)); - } - } - - inline size_t numel() const { return _cols*_rows; } - inline size_t rows() const { return _rows; } - inline size_t cols() const { return _cols; } - inline size_t row_size() const { return _cols; } - inline size_t row_bytes() const { return _cols*sizeof(numtyp); } - - /// Copy from Host - inline void copy_from_host(const numtyp *host_p) { - CUDA_SAFE_CALL(cudaMemcpyToArray(_array, 0, 0, host_p, - numel()*sizeof(numtyp), - cudaMemcpyHostToDevice)); - } - - /// Copy from Host - /** \note Used when the number of columns/rows allocated on host smaller than - * on device **/ - inline void copy_2Dfrom_host(const numtyp *host_p, const size_t rows, - const size_t cols) { - CUDA_SAFE_CALL(cudaMemcpy2DToArray(_array, 0, 0, host_p, - cols*sizeof(numtyp), cols*sizeof(numtyp), rows, - cudaMemcpyHostToDevice)); - } - - /// Asynchronous Copy from Host - inline void copy_from_host(const numtyp *host_p, cudaStream_t &stream) { - CUDA_SAFE_CALL_NO_SYNC(cudaMemcpyToArrayAsync(_array, 0, 0, host_p, - numel()*sizeof(numtyp), - cudaMemcpyHostToDevice, - stream)); - } - - /// Asynchronous Copy from Host - /** \note Used when the number of columns/rows allocated on host smaller than - * on device **/ - inline void copy_2Dfrom_host(const numtyp *host_p, const size_t rows, - const size_t cols, cudaStream_t &stream) { - CUDA_SAFE_CALL_NO_SYNC(cudaMemcpy2DToArrayAsync(_array, 0, 0, host_p, - cols*sizeof(numtyp), cols*sizeof(numtyp), rows, - cudaMemcpyHostToDevice,stream)); - } - - /// Cast buffer to numtyp in host_write and copy to array - template - inline void cast_copy(const numtyp2 *buffer, NVC_HostT &host_write) { - int n=numel(); - for (int i=0; i(*buffer); buffer++; - } - copy_from_host(host_write.begin()); - } - - /// Cast buffer to numtyp in host_write and copy to array - /** \note Used when the number of columns/rows allocated on host smaller than - * on device **/ - template - inline void cast_copy2D(const numtyp2 *buffer, NVC_HostT &host_write, - const size_t rows, const size_t cols) { - int n=rows*cols; - for (int i=0; i(*buffer); buffer++; - } - copy_2Dfrom_host(host_write.begin(),rows,cols); - } - - /// Cast buffer to numtyp in host_write and copy to array asynchronously - template - inline void cast_copy(const numtyp2 *buffer, NVC_HostT &host_write, - cudaStream_t &stream) { - int n=numel(); - for (int i=0; i(*buffer); buffer++; - } - copy_from_host(host_write.begin(),stream); - } - - private: - size_t _rows, _cols; - cudaArray *_array; - cudaChannelFormatDesc _channel; - textureReference *_tex_ptr; -}; - -#endif diff --git a/lib/gpu/nvc_timer.h b/lib/gpu/nvc_timer.h deleted file mode 100644 index 9cab7302d5..0000000000 --- a/lib/gpu/nvc_timer.h +++ /dev/null @@ -1,79 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - http://lammps.sandia.gov, 
Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - Copyright (2003) Sandia Corporation. Under the terms of Contract - DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains - certain rights in this software. This software is distributed under - the GNU General Public License. - - See the README file in the top-level LAMMPS directory. -------------------------------------------------------------------------- */ - -/* ---------------------------------------------------------------------- - Contributing authors: Mike Brown (SNL), wmbrown@sandia.gov -------------------------------------------------------------------------- */ - -#ifndef NVC_TIMER_H -#define NVC_TIMER_H - -#include "nvc_macros.h" - -#define cudaEventDestroy(a) - -/// Class for timing CUDA events -class NVCTimer { - public: - NVCTimer() : _total_time(0.0f), initialized(false) { } - - ~NVCTimer() { - if (initialized) - { cudaEventDestroy(start_event); cudaEventDestroy(stop_event); } - } - - inline void init() { - if (initialized) { - cudaEventDestroy(start_event); - cudaEventDestroy(stop_event); - } - initialized=true; - CUDA_SAFE_CALL( cudaEventCreate(&start_event) ); - CUDA_SAFE_CALL( cudaEventCreate(&stop_event) ); - } - - /// Start timing - inline void start() { cudaEventRecord(start_event,0); } - - /// Stop timing and store event time - inline void stop() { cudaEventRecord(stop_event,0); } - - /// Set the time elapsed to zero (not the total_time) - inline void zero() - { cudaEventRecord(start_event,0); cudaEventRecord(stop_event,0); } - - /// Add time from previous start and stop to total - /** Forces synchronization **/ - inline void add_to_total() { _total_time+=time(); } - - /// Return the time (ms) of last start to stop - Forces synchronization - inline double time() { - float timer; - cudaEventSynchronize(stop_event); - CUDA_SAFE_CALL( cudaEventElapsedTime(&timer,start_event,stop_event) ); - return timer; - } - - /// Return the total time in ms - inline double total_time() { return _total_time; } - - /// Return the total time in seconds - inline double total_seconds() { return _total_time/1000.0; } - - private: - cudaEvent_t start_event, stop_event; - double _total_time; - bool initialized; -}; - -#endif diff --git a/lib/gpu/pair_gpu_atom.cpp b/lib/gpu/pair_gpu_atom.cpp new file mode 100644 index 0000000000..6e86a9b64e --- /dev/null +++ b/lib/gpu/pair_gpu_atom.cpp @@ -0,0 +1,571 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. 
+------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing authors: Mike Brown (ORNL), brownw@ornl.gov +------------------------------------------------------------------------- */ + +#include "pair_gpu_atom.h" + +#define PairGPUAtomT PairGPUAtom + +#ifdef WINDLL +#include +typedef bool (*__win_sort_alloc)(const int max_atoms); +typedef void (*__win_sort)(const int max_atoms, unsigned *cell_begin, + int *particle_begin); +__win_sort_alloc _win_sort_alloc; +__win_sort _win_sort; +#endif + +template +PairGPUAtomT::PairGPUAtom() : _compiled(false),_allocated(false),_eflag(false), + _vflag(false),_inum(0),_ilist(NULL) { + #ifndef USE_OPENCL + sort_config.op = CUDPP_ADD; + sort_config.datatype = CUDPP_UINT; + sort_config.algorithm = CUDPP_SORT_RADIX; + sort_config.options = CUDPP_OPTION_KEY_VALUE_PAIRS; + + #ifdef WINDLL + HINSTANCE hinstLib = LoadLibrary(TEXT("gpu.dll")); + if (hinstLib == NULL) { + printf("\nUnable to load gpu.dll\n"); + exit(1); + } + _win_sort_alloc=(__win_sort_alloc)GetProcAddress(hinstLib,"_win_sort_alloc"); + _win_sort=(__win_sort)GetProcAddress(hinstLib,"_win_sort"); + #endif + + #endif +} + +template +int PairGPUAtomT::bytes_per_atom() const { + int id_space=0; + if (_gpu_nbor) + id_space=2; + int bytes=4*sizeof(numtyp)+11*sizeof(acctyp)+id_space; + if (_rot) + bytes+=4*sizeof(numtyp)+4*sizeof(acctyp); + if (_charge) + bytes+=sizeof(numtyp); + return bytes; +} + +template +bool PairGPUAtomT::alloc(const int max_atoms) { + bool success=true; + + int ans_elements=4; + if (_rot) + ans_elements+=4; + + // Ignore host/device transfers? + bool cpuview=false; + if (dev->device_type()==UCL_CPU) + cpuview=true; + + // Allocate storage for CUDPP sort + #ifndef USE_OPENCL + #ifdef WINDLL + _win_sort_alloc(max_atoms); + #else + if (_gpu_nbor) { + CUDPPResult result = cudppPlan(&sort_plan, sort_config, max_atoms, 1, 0); + if (CUDPP_SUCCESS != result) + return false; + } + #endif + #endif + + // -------------------------- Host allocations + // Get a host write only buffer + #ifdef GPU_CAST + success=success && (host_x_cast.alloc(max_atoms*3,*dev, + UCL_WRITE_OPTIMIZED)==UCL_SUCCESS); + success=success && (host_type_cast.alloc(max_atoms,*dev, + UCL_WRITE_OPTIMIZED)==UCL_SUCCESS); + #else + success=success && (host_x.alloc(max_atoms*4,*dev, + UCL_WRITE_OPTIMIZED)==UCL_SUCCESS); + #endif + success=success && (host_ans.alloc(ans_elements*max_atoms,*dev)==UCL_SUCCESS); + success=success && (host_engv.alloc(_ev_fields*max_atoms,*dev)==UCL_SUCCESS); + // Buffer for casting only if different precisions + if (_charge) + success=success && (host_q.alloc(max_atoms,*dev, + UCL_WRITE_OPTIMIZED)==UCL_SUCCESS); + // Buffer for casting only if different precisions + if (_rot) + success=success && (host_quat.alloc(max_atoms*4,*dev, + UCL_WRITE_OPTIMIZED)==UCL_SUCCESS); + + + // --------------------------- Device allocations + _gpu_bytes=0; + if (cpuview) { + #ifdef GPU_CAST + assert(0==1); + #else + dev_x.view(host_x); + #endif + dev_engv.view(host_engv); + dev_ans.view(host_ans); + if (_rot) + dev_quat.view(host_quat); + if (_charge) + dev_q.view(host_q); + } else { + #ifdef GPU_CAST + success=success && (UCL_SUCCESS==dev_x.alloc(max_atoms*4,*dev)); + success=success && (UCL_SUCCESS== + dev_x_cast.alloc(max_atoms*3,*dev,UCL_READ_ONLY)); + success=success && (UCL_SUCCESS== + dev_type_cast.alloc(max_atoms,*dev,UCL_READ_ONLY)); + _gpu_bytes+=dev_x_cast.row_bytes()+dev_type_cast.row_bytes(); + 
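/* [Editor's note] In the GPU_CAST branch above, positions stay as raw
   doubles on the host (dev_x_cast) with a separate int type array
   (dev_type_cast); the kernel_cast_x kernel added later in this patch
   packs them into the numtyp4 dev_x array on the device.  That trades a
   little extra device memory for a cheaper host-side cast.  In the
   non-GPU_CAST branch that follows, the packing into {x,y,z,type} is done
   on the host in cast_x_data() instead. */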
#else + success=success && (UCL_SUCCESS== + dev_x.alloc(max_atoms*4,*dev,UCL_READ_ONLY)); + #endif + success=success && (dev_engv.alloc(_ev_fields*max_atoms,*dev, + UCL_WRITE_ONLY)==UCL_SUCCESS); + success=success && (dev_ans.alloc(ans_elements*max_atoms, + *dev,UCL_WRITE_ONLY)==UCL_SUCCESS); + if (_charge) { + success=success && (dev_q.alloc(max_atoms,*dev, + UCL_READ_ONLY)==UCL_SUCCESS); + _gpu_bytes+=dev_q.row_bytes(); + } + if (_rot) { + success=success && (dev_quat.alloc(max_atoms*4,*dev, + UCL_READ_ONLY)==UCL_SUCCESS); + _gpu_bytes+=dev_quat.row_bytes(); + } + } + if (_gpu_nbor) { + success=success && (dev_cell_id.alloc(max_atoms,*dev)==UCL_SUCCESS); + success=success && (dev_particle_id.alloc(max_atoms,*dev)==UCL_SUCCESS); + _gpu_bytes+=dev_cell_id.row_bytes()+dev_particle_id.row_bytes(); + if (_bonds) { + success=success && (dev_tag.alloc(max_atoms,*dev)==UCL_SUCCESS); + _gpu_bytes+=dev_tag.row_bytes(); + } + } + + _gpu_bytes+=dev_x.row_bytes()+dev_engv.row_bytes()+dev_ans.row_bytes(); + + return success; +} + +template +bool PairGPUAtomT::init(const int inum, const int nall, const bool charge, + const bool rot, UCL_Device &devi, const bool gpu_nbor, + const bool bonds) { + clear(); + + bool success=true; + _gpu_nbor=gpu_nbor; + _bonds=bonds; + _charge=charge; + _rot=rot; + _other=_charge || _rot; + dev=&devi; + + _e_fields=1; + if (_charge) + _e_fields++; + _ev_fields=6+_e_fields; + + // Initialize atom and nbor data + int max_local=static_cast(static_cast(inum)*1.10); + if (max_local==0) + max_local=1000; + if (nall<=inum) + _max_atoms=max_local*2; + else + _max_atoms=static_cast(static_cast(nall)*1.10); + + // Initialize timers for the selected device + time_pos.init(*dev); + time_other.init(*dev); + time_answer.init(*dev); + time_pos.zero(); + time_other.zero(); + time_answer.zero(); + _time_cast=0.0; + + #ifdef GPU_CAST + compile_kernels(*dev); + #endif + + _allocated=true; + return success && alloc(_max_atoms); +} + +template +void PairGPUAtomT::clear_resize() { + if (!_allocated) + return; + _allocated=false; + + dev_x.clear(); + if (_charge) { + dev_q.clear(); + host_q.clear(); + } + if (_rot) { + dev_quat.clear(); + host_quat.clear(); + } + dev_ans.clear(); + dev_engv.clear(); + #ifndef GPU_CAST + host_x.clear(); + #else + host_x_cast.clear(); + host_type_cast.clear(); + #endif + host_ans.clear(); + host_engv.clear(); + dev_cell_id.clear(); + dev_particle_id.clear(); + dev_tag.clear(); + #ifdef GPU_CAST + dev_x_cast.clear(); + dev_type_cast.clear(); + #endif + + #ifndef USE_OPENCL + #ifndef WINDLL + if (_gpu_nbor) cudppDestroyPlan(sort_plan); + #endif + #endif +} + +template +void PairGPUAtomT::clear() { + _gpu_bytes=0; + if (!_allocated) + return; + + time_pos.clear(); + time_other.clear(); + time_answer.clear(); + clear_resize(); + _inum=0; + _eflag=false; + _vflag=false; + + #ifdef GPU_CAST + if (_compiled) { + k_cast_x.clear(); + delete atom_program; + _compiled=false; + } + #endif +} + +template +double PairGPUAtomT::host_memory_usage() const { + int atom_bytes=4; + if (_charge) + atom_bytes+=1; + if (_rot) + atom_bytes+=4; + int ans_bytes=atom_bytes+_ev_fields; + return _max_atoms*atom_bytes*sizeof(numtyp)+ + ans_bytes*(_max_atoms)*sizeof(acctyp)+ + sizeof(PairGPUAtom); +} + +template +void PairGPUAtomT::copy_answers(const bool eflag, const bool vflag, + const bool ef_atom, const bool vf_atom) { + time_answer.start(); + _eflag=eflag; + _vflag=vflag; + _ef_atom=ef_atom; + _vf_atom=vf_atom; + + int csize=_ev_fields; + if (!eflag) + csize-=_e_fields; + if (!vflag) + 
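/* [Editor's note] dev_engv/host_engv are laid out field-major: _e_fields
   rows of per-atom energies followed by 6 rows of per-atom virial terms,
   each row _inum entries wide, so field f of atom i sits at
   engv[f*_inum + i].  The csize adjustments here (and the one on the next
   line for vflag) shrink the copy so that only fields that were actually
   computed come back from the device. */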
csize-=6; + + if (csize>0) + ucl_copy(host_engv,dev_engv,_inum*csize,true); + if (_rot) + ucl_copy(host_ans,dev_ans,_inum*4*2,true); + else + ucl_copy(host_ans,dev_ans,_inum*4,true); + time_answer.stop(); +} + +template +void PairGPUAtomT::copy_answers(const bool eflag, const bool vflag, + const bool ef_atom, const bool vf_atom, + int *ilist) { + _ilist=ilist; + copy_answers(eflag,vflag,ef_atom,vf_atom); +} + +template +double PairGPUAtomT::energy_virial(double *eatom, double **vatom, + double *virial) { + if (_eflag==false && _vflag==false) + return 0.0; + + double evdwl=0.0; + if (_gpu_nbor) { + for (int i=0; i<_inum; i++) { + acctyp *ap=host_engv.begin()+i; + if (_eflag) { + if (_ef_atom) { + evdwl+=*ap; + eatom[i]+=*ap*0.5; + ap+=_inum; + } else { + evdwl+=*ap; + ap+=_inum; + } + } + if (_vflag) { + if (_vf_atom) { + for (int j=0; j<6; j++) { + vatom[i][j]+=*ap*0.5; + virial[j]+=*ap; + ap+=_inum; + } + } else { + for (int j=0; j<6; j++) { + virial[j]+=*ap; + ap+=_inum; + } + } + } + } + for (int j=0; j<6; j++) + virial[j]*=0.5; + } else { + for (int i=0; i<_inum; i++) { + acctyp *ap=host_engv.begin()+i; + int ii=_ilist[i]; + if (_eflag) { + if (_ef_atom) { + evdwl+=*ap; + eatom[ii]+=*ap*0.5; + ap+=_inum; + } else { + evdwl+=*ap; + ap+=_inum; + } + } + if (_vflag) { + if (_vf_atom) { + for (int j=0; j<6; j++) { + vatom[ii][j]+=*ap*0.5; + virial[j]+=*ap; + ap+=_inum; + } + } else { + for (int j=0; j<6; j++) { + virial[j]+=*ap; + ap+=_inum; + } + } + } + } + for (int j=0; j<6; j++) + virial[j]*=0.5; + } + + evdwl*=0.5; + return evdwl; +} + +template +double PairGPUAtomT::energy_virial(double *eatom, double **vatom, + double *virial, double &ecoul) { + if (_eflag==false && _vflag==false) { + ecoul=0.0; + return 0.0; + } + + if (_charge==false) + return energy_virial(eatom,vatom,virial); + + double evdwl=0.0; + double _ecoul=0.0; + if (_gpu_nbor) { + for (int i=0; i<_inum; i++) { + acctyp *ap=host_engv.begin()+i; + if (_eflag) { + if (_ef_atom) { + evdwl+=*ap; + eatom[i]+=*ap*0.5; + ap+=_inum; + _ecoul+=*ap; + eatom[i]+=*ap*0.5; + ap+=_inum; + } else { + evdwl+=*ap; + ap+=_inum; + _ecoul+=*ap; + ap+=_inum; + } + } + if (_vflag) { + if (_vf_atom) { + for (int j=0; j<6; j++) { + vatom[i][j]+=*ap*0.5; + virial[j]+=*ap; + ap+=_inum; + } + } else { + for (int j=0; j<6; j++) { + virial[j]+=*ap; + ap+=_inum; + } + } + } + } + for (int j=0; j<6; j++) + virial[j]*=0.5; + } else { + for (int i=0; i<_inum; i++) { + acctyp *ap=host_engv.begin()+i; + int ii=_ilist[i]; + if (_eflag) { + if (_ef_atom) { + evdwl+=*ap; + eatom[ii]+=*ap*0.5; + ap+=_inum; + _ecoul+=*ap; + eatom[ii]+=*ap*0.5; + ap+=_inum; + } else { + evdwl+=*ap; + ap+=_inum; + _ecoul+=*ap; + ap+=_inum; + } + } + if (_vflag) { + if (_vf_atom) { + for (int j=0; j<6; j++) { + vatom[ii][j]+=*ap*0.5; + virial[j]+=*ap; + ap+=_inum; + } + } else { + for (int j=0; j<6; j++) { + virial[j]+=*ap; + ap+=_inum; + } + } + } + } + for (int j=0; j<6; j++) + virial[j]*=0.5; + } + + evdwl*=0.5; + ecoul+=_ecoul*0.5; + return evdwl; +} + +template +void PairGPUAtomT::get_answers(double **f, double **tor) { + acctyp *ap=host_ans.begin(); + if (_gpu_nbor) { + for (int i=0; i<_inum; i++) { + f[i][0]+=*ap; + ap++; + f[i][1]+=*ap; + ap++; + f[i][2]+=*ap; + ap+=2; + } + if (_rot) { + for (int i=0; i<_inum; i++) { + tor[i][0]+=*ap; + ap++; + tor[i][1]+=*ap; + ap++; + tor[i][2]+=*ap; + ap+=2; + } + } + } else { + for (int i=0; i<_inum; i++) { + int ii=_ilist[i]; + f[ii][0]+=*ap; + ap++; + f[ii][1]+=*ap; + ap++; + f[ii][2]+=*ap; + ap+=2; + } + if (_rot) { + for (int i=0; 
i<_inum; i++) { + int ii=_ilist[i]; + tor[ii][0]+=*ap; + ap++; + tor[ii][1]+=*ap; + ap++; + tor[ii][2]+=*ap; + ap+=2; + } + } + } +} + +// Sort arrays for neighbor list calculation +template +void PairGPUAtomT::sort_neighbor(const int num_atoms) { + #ifndef USE_OPENCL + #ifdef WINDLL + _win_sort(num_atoms,(unsigned *)dev_cell_id.begin(), + (int *)dev_particle_id.begin()); + #else + CUDPPResult result = cudppSort(sort_plan, (unsigned *)dev_cell_id.begin(), + (int *)dev_particle_id.begin(), + 8*sizeof(unsigned), num_atoms); + if (CUDPP_SUCCESS != result) { + printf("Error in cudppSort\n"); + NVD_GERYON_EXIT; + } + #endif + #endif +} + +#ifdef GPU_CAST +#ifdef USE_OPENCL +#include "pair_gpu_atom_cl.h" +#else +#include "pair_gpu_atom_ptx.h" +#endif + +template +void PairGPUAtomT::compile_kernels(UCL_Device &dev) { + atom_program=new UCL_Program(dev); + atom_program->load_string(pair_gpu_atom_kernel,""); + k_cast_x.set_function(*atom_program,"kernel_cast_x"); + _compiled=true; +} + +#endif + +template class PairGPUAtom; diff --git a/lib/gpu/pair_gpu_atom.cu b/lib/gpu/pair_gpu_atom.cu deleted file mode 100644 index 7f2f13f3b4..0000000000 --- a/lib/gpu/pair_gpu_atom.cu +++ /dev/null @@ -1,192 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - Copyright (2003) Sandia Corporation. Under the terms of Contract - DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains - certain rights in this software. This software is distributed under - the GNU General Public License. - - See the README file in the top-level LAMMPS directory. -------------------------------------------------------------------------- */ - -/* ---------------------------------------------------------------------- - Contributing authors: Mike Brown (SNL), wmbrown@sandia.gov - Peng Wang (Nvidia), penwang@nvidia.com - Paul Crozier (SNL), pscrozi@sandia.gov -------------------------------------------------------------------------- */ - -#include "pair_gpu_texture.h" -#include "pair_gpu_atom.h" - -#define PairGPUAtomT PairGPUAtom - -template -int PairGPUAtomT::bytes_per_atom() const { - return atom_fields()*sizeof(numtyp)+ans_fields()*sizeof(acctyp); -} - -template -bool PairGPUAtomT::init(const int max_atoms) { - bool success=true; - - if (allocated) - clear(); - - _max_atoms=max_atoms; - - // Initialize timers for the selected GPU - time_atom.init(); - time_answer.init(); - - // Device matrices for atom and force data - success=success && dev_x.alloc(max_atoms*sizeof(vec4)); - success=success && dev_q.alloc(max_atoms*sizeof(vec4)); - success=success && ans.alloc(ans_fields()*max_atoms); - // Get a host read/write buffer - success=success && host_read.alloc_rw(max_atoms*ans_fields()); - - // Get a host write only buffer - success=success && host_write.alloc_w(max_atoms*atom_fields()); - - allocated=true; - - return success; -} - -template -void PairGPUAtomT::resize(const int max_atoms, bool &success) { - ans.clear(); - dev_x.clear(); - dev_q.clear(); - host_write.clear(); - host_read.clear(); - - _max_atoms=max_atoms; - - success = success && dev_x.alloc(_max_atoms*sizeof(vec4)); - success = success && dev_q.alloc(_max_atoms*sizeof(vec4)); - success = success && ans.alloc(ans_fields()*_max_atoms); - success = success && host_read.alloc_rw(_max_atoms*ans_fields()); - success = success && 
host_write.alloc_w(_max_atoms*atom_fields()); -} - -template -void PairGPUAtomT::clear() { - if (!allocated) - return; - allocated=false; - - ans.clear(); - dev_x.clear(); - dev_q.clear(); - host_write.clear(); - host_read.clear(); -} - -template -double PairGPUAtomT::host_memory_usage(const int max_atoms) const { - return max_atoms*atom_fields()*sizeof(numtyp)+ - ans_fields()*(max_atoms)*sizeof(acctyp)+ - sizeof(PairGPUAtom); -} - -template -void PairGPUAtomT::copy_answers(const bool eflag, const bool vflag, - cudaStream_t &s) { - _eflag=eflag; - _vflag=vflag; - - int csize=ans_fields(); - if (!eflag) - csize--; - if (!vflag) - csize-=6; - - host_read.copy_from_device(ans.begin(),_inum*csize,s); -} - -template -double PairGPUAtomT::energy_virial(const int *ilist, const bool eflag_atom, - const bool vflag_atom, double *eatom, - double **vatom, double *virial, - double **f, double **tor, const int n) { - double evdwl=0.0; - - acctyp *ap=host_read.begin(); - for (int i=0; i<_inum; i++) { - int ii=ilist[i]; - if (_eflag) { - if (eflag_atom) { - evdwl+=*ap; - eatom[ii]+=*ap*0.5; - ap++; - } else { - evdwl+=*ap; - ap++; - } - } - if (_vflag) { - if (vflag_atom) { - for (int j=0; j<6; j++) { - vatom[ii][j]+=*ap*0.5; - virial[j]+=*ap; - ap++; - } - } else { - for (int j=0; j<6; j++) { - virial[j]+=*ap; - ap++; - } - } - } - f[ii][0]+=*ap; - ap++; - f[ii][1]+=*ap; - ap++; - f[ii][2]+=*ap; - ap++; - if (i -void PairGPUAtomT::copy_asphere(const int *ilist, double **f, double **tor, - const int n) { - acctyp *ap=host_read.begin(); - for (int i=0; i<_inum; i++) { - int ii=ilist[i]; - f[ii][0]+=*ap; - ap++; - f[ii][1]+=*ap; - ap++; - f[ii][2]+=*ap; - ap++; - if (i; diff --git a/lib/gpu/pair_gpu_atom.h b/lib/gpu/pair_gpu_atom.h index a129cd5850..7cec73f98c 100644 --- a/lib/gpu/pair_gpu_atom.h +++ b/lib/gpu/pair_gpu_atom.h @@ -12,100 +12,207 @@ ------------------------------------------------------------------------- */ /* ---------------------------------------------------------------------- - Contributing authors: Mike Brown (SNL), wmbrown@sandia.gov - Peng Wang (Nvidia), penwang@nvidia.com - Paul Crozier (SNL), pscrozi@sandia.gov + Contributing authors: Mike Brown (ORNL), brownw@ornl.gov ------------------------------------------------------------------------- */ #ifndef PAIR_GPU_ATOM_H #define PAIR_GPU_ATOM_H -// PRECISION - Precision for rsq, energy, force, and torque calculation -// ACC_PRECISION - Precision for accumulation of energies, forces, and torques -#ifdef _SINGLE_DOUBLE -#define PRECISION float -#define ACC_PRECISION double -#define MAX_ATOMS 65536 -#define vec4 float4 +#include +#include "mpi.h" + +#ifdef USE_OPENCL + +#include "geryon/ocl_device.h" +#include "geryon/ocl_timer.h" +#include "geryon/ocl_mat.h" +#include "geryon/ocl_kernel.h" +using namespace ucl_opencl; + +#else + +#include "cudpp.h" +#include "geryon/nvd_device.h" +#include "geryon/nvd_timer.h" +#include "geryon/nvd_mat.h" +#include "geryon/nvd_kernel.h" +using namespace ucl_cudadr; + #endif -#ifdef _DOUBLE_DOUBLE -#define PRECISION double -#define ACC_PRECISION double -#define MAX_ATOMS 32768 -struct vec4 { double x; double y; double z; double w; }; +#ifndef int2 +struct int2 { int x; int y; }; #endif -#ifndef PRECISION -#define PRECISION float -#define ACC_PRECISION float -#define MAX_ATOMS 65536 -#define vec4 float4 -#endif - -#include "nvc_timer.h" -#include "nvc_memory.h" +#include "pair_gpu_precision.h" template class PairGPUAtom { public: - PairGPUAtom() : _atom_fields(4), _ans_fields(10), allocated(false) {} + 
PairGPUAtom(); ~PairGPUAtom() { clear(); } - // Accessors - inline int atom_fields() const { return _atom_fields; } - inline int ans_fields() const { return _ans_fields; } + /// Maximum number of atoms that can be stored with current allocation inline int max_atoms() const { return _max_atoms; } + /// Current number of local+ghost atoms stored inline int nall() const { return _nall; } + /// Current number of local atoms stored inline int inum() const { return _inum; } - /// Set number of atoms for future copy operations + /// Set number of local+ghost atoms for future copy operations inline void nall(const int n) { _nall=n; } - /// Set number of inum for future copy operations + /// Set number of local atoms for future copy operations inline void inum(const int n) { _inum=n; } - /// Set the number of atom fields (x, y, z, type, etc) - inline void atom_fields(const int n) { _atom_fields=n; } - /// Set the number of answer fields (energy, virial, force, etc.) - inline void ans_fields(const int n) { _ans_fields=n; } /// Memory usage per atom in this class - /** \note atom_fields and ans_fields should be set for correct answer **/ int bytes_per_atom() const; - /// Must be called once to allocate host and device memory - /** \note atom_fields and ans_fields should be set first if not default **/ - bool init(const int max_atoms); - void resize(const int max_atoms, bool &success); + /// Clear any previous data and set up for a new LAMMPS run + /** \param rot True if atom storage needs quaternions + * \param gpu_nbor True if neighboring will be performed on device **/ + bool init(const int inum, const int nall, const bool charge, const bool rot, + UCL_Device &dev, const bool gpu_nbor=false, const bool bonds=false); + + /// Check if we have enough device storage and realloc if not + inline bool resize(const int inum, const int nall, bool &success) { + _inum=inum; + _nall=nall; + if (nall>_max_atoms) { + clear_resize(); + _max_atoms=static_cast(static_cast(nall)*1.10); + _allocated=true; + success = success && alloc(_max_atoms); + return true; + } + return false; + } + + /// Only free matrices of length inum or nall for resizing + void clear_resize(); /// Free all memory on host and device void clear(); - /// Return the total amount of host memory used by class - double host_memory_usage(const int max_atoms) const; + /// Return the total amount of host memory used by class in bytes + double host_memory_usage() const; + /// Sort arrays for neighbor list calculation on device + void sort_neighbor(const int num_atoms); - // -------------------------COPY TO GPU ---------------------------------- + /// Add copy times to timers + inline void acc_timers() { + time_pos.add_to_total(); + time_answer.add_to_total(); + if (_other) + time_other.add_to_total(); + } - /// Reset the write buffer pointer (Start copying new atom data) - inline void reset_write_buffer() { _write_loc=host_write.begin(); } - - /// Add a row to write buffer with unit stride - /** Copies nall() elements **/ - template - inline void add_atom_data(const cpytyp *host_ptr) - { for (int i=0; i<_nall; i++) { *_write_loc=host_ptr[i]; _write_loc++; } } - - /// Add a row to write buffer with non-unit stride - /** Copies nall() elements **/ - template - inline void add_atom_data(const cpytyp *hostptr, const int stride) { - int t=_nall*stride; - for (int i=0; i + inline void type_pack1(const int n, const int m_size, + UCL_D_Vec &dev_v, UCL_H_Vec &buffer, + t1 **one) { + int ii=0; + for (int i=0; i(one[i][j]); + ii++; + } + ii+=m_size-n; + } + 
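/* [Editor's note] The loop above packed the per-type coefficients into a
   flat m_size*m_size host buffer; the next three statements reinterpret
   that buffer as dev_typ (e.g. float or float4) through a zero-copy
   UCL_H_Vec view and push it to the device with a single blocking
   ucl_copy, so one transfer replaces m_size*m_size tiny copies. */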
UCL_H_Vec view; + view.view((dev_typ*)buffer.begin(),m_size*m_size,*dev); + ucl_copy(dev_v,view,false); + } + + /// Pack LAMMPS atom type constants into 2 vectors and copy to device + template + inline void type_pack2(const int n, const int m_size, + UCL_D_Vec &dev_v, UCL_H_Vec &buffer, + t1 **one, t2 **two) { + int ii=0; + for (int i=0; i(one[i][j]); + buffer[ii*2+1]=static_cast(two[i][j]); + ii++; + } + ii+=m_size-n; + } + UCL_H_Vec view; + view.view((dev_typ*)buffer.begin(),m_size*m_size,*dev); + ucl_copy(dev_v,view,false); + } + + /// Pack LAMMPS atom type constants (3) into 4 vectors and copy to device + template + inline void type_pack4(const int n, const int m_size, + UCL_D_Vec &dev_v, UCL_H_Vec &buffer, + t1 **one, t2 **two, t3 **three) { + int ii=0; + for (int i=0; i(one[i][j]); + buffer[ii*4+1]=static_cast(two[i][j]); + buffer[ii*4+2]=static_cast(three[i][j]); + ii++; + } + ii+=m_size-n; + } + UCL_H_Vec view; + view.view((dev_typ*)buffer.begin(),m_size*m_size,*dev); + ucl_copy(dev_v,view,false); + } + + /// Pack LAMMPS atom type constants (4) into 4 vectors and copy to device + template + inline void type_pack4(const int n, const int m_size, + UCL_D_Vec &dev_v, UCL_H_Vec &buffer, + t1 **one, t2 **two, t3 **three, t4 **four) { + int ii=0; + for (int i=0; i(one[i][j]); + buffer[ii*4+1]=static_cast(two[i][j]); + buffer[ii*4+2]=static_cast(three[i][j]); + buffer[ii*4+3]=static_cast(four[i][j]); + ii++; + } + ii+=m_size-n; + } + UCL_H_Vec view; + view.view((dev_typ*)buffer.begin(),m_size*m_size,*dev); + ucl_copy(dev_v,view,false); + } + + // -------------------------COPY TO GPU ---------------------------------- + + /// Cast positions and types to write buffer + inline void cast_x_data(double **host_ptr, const int *host_type) { + double t=MPI_Wtime(); + #ifdef GPU_CAST + memcpy(host_x_cast.begin(),host_ptr[0],_nall*3*sizeof(double)); + memcpy(host_type_cast.begin(),host_type,_nall*sizeof(int)); + #else + numtyp *_write_loc=host_x.begin(); for (int i=0; i<_nall; i++) { *_write_loc=host_ptr[i][0]; _write_loc++; @@ -116,59 +223,184 @@ class PairGPUAtom { *_write_loc=host_type[i]; _write_loc++; } + #endif + _time_cast+=MPI_Wtime()-t; } - /// Add quaternions to write buffer + /// Copy positions and types to device asynchronously /** Copies nall() elements **/ - template - inline void add_q_data(const cpytyp *host_ptr) { - const int end=_nall*4; - for (int i=0; i(ceil(static_cast(_nall)/block_size)); + k_cast_x.set_size(GX,block_size); + k_cast_x.run(&dev_x.begin(), &dev_x_cast.begin(), &dev_type_cast.begin(), + &_nall); + #else + ucl_copy(dev_x,host_x,_nall*4,true); + #endif + time_pos.stop(); + } + + /// Calls cast_x_data and add_x_data and times the routines + inline void cast_copy_x(double **host_ptr, int *host_type) { + cast_x_data(host_ptr,host_type); + add_x_data(host_ptr,host_type); + } + + /// Cast charges to write buffer + template + inline void cast_q_data(cpytyp *host_ptr) { + double t=MPI_Wtime(); + if (dev->device_type()==UCL_CPU) { + if (sizeof(numtyp)==sizeof(double)) { + host_q.view((numtyp*)host_ptr,_nall,*dev); + dev_q.view(host_q); + } else + for (int i=0; i<_nall; i++) host_q[i]=host_ptr[i]; + } else { + if (sizeof(numtyp)==sizeof(double)) + memcpy(host_q.begin(),host_ptr,_nall*sizeof(numtyp)); + else + for (int i=0; i<_nall; i++) host_q[i]=host_ptr[i]; + } + _time_cast+=MPI_Wtime()-t; + } + + /// Copy charges to device asynchronously + inline void add_q_data() { + ucl_copy(dev_q,host_q,_nall,true); + } + + /// Cast quaternions to write buffer + template + inline void 
cast_quat_data(cpytyp *host_ptr) { + double t=MPI_Wtime(); + if (dev->device_type()==UCL_CPU) { + if (sizeof(numtyp)==sizeof(double)) { + host_quat.view((numtyp*)host_ptr,_nall*4,*dev); + dev_quat.view(host_quat); + } else + for (int i=0; i<_nall*4; i++) host_quat[i]=host_ptr[i]; + } else { + if (sizeof(numtyp)==sizeof(double)) + memcpy(host_quat.begin(),host_ptr,_nall*4*sizeof(numtyp)); + else + for (int i=0; i<_nall*4; i++) host_quat[i]=host_ptr[i]; + } + _time_cast+=MPI_Wtime()-t; + } + + /// Copy quaternions to device + /** Copies nall()*4 elements **/ + inline void add_quat_data() { + ucl_copy(dev_quat,host_quat,_nall*4,true); + } + + /// Copy data other than pos and data to device + inline void add_other_data() { + time_other.start(); + if (_charge) + add_q_data(); + if (_rot) + add_quat_data(); + time_other.stop(); + } + + /// Return number of bytes used on device + inline double gpu_bytes() { return _gpu_bytes; } - /// Copy num_rows positions+type to x in GPU - /** num_rows<=atom_fields() **/ - inline void copy_x_data(cudaStream_t &stream) - { dev_x.copy_from_host(host_write.begin(),_nall*4,stream); } - inline void copy_q_data(cudaStream_t &stream) - { dev_q.copy_from_host(host_write.begin()+_nall*4,_nall*4,stream); } - // -------------------------COPY FROM GPU ------------------------------- - /// Copy answers from GPU into read buffer - void copy_answers(const bool eflag, const bool vflag, cudaStream_t &s); + /// Copy answers from device into read buffer asynchronously + void copy_answers(const bool eflag, const bool vflag, + const bool ef_atom, const bool vf_atom); + + /// Copy answers from device into read buffer asynchronously + void copy_answers(const bool eflag, const bool vflag, + const bool ef_atom, const bool vf_atom, int *ilist); /// Copy energy and virial data into LAMMPS memory - double energy_virial(const int *ilist, const bool eflag_atom, - const bool vflag_atom, double *eatom, double **vatom, - double *virial, double **f, double **tor, const int); - + double energy_virial(double *eatom, double **vatom, double *virial); + + /// Copy energy and virial data into LAMMPS memory + double energy_virial(double *eatom, double **vatom, double *virial, + double &ecoul); + /// Add forces and torques from the GPU into a LAMMPS pointer - void copy_asphere(const int *ilist, double **f, double **tor, const int n); + void get_answers(double **f, double **tor); + // ------------------------------ DATA ---------------------------------- - // atom coordinates - NVC_Vec dev_x; - // quaterions - NVC_Vec dev_q; - // ans_fields() - // example: if (eflag and vflag) 1 is energy, 2-7 is virial - NVC_Vec ans; + /// Atom coordinates and types ([0] is x, [1] is y, [2] is z, [3] is type + UCL_D_Vec dev_x; + /// Charges + UCL_D_Vec dev_q; + /// Quaterions + UCL_D_Vec dev_quat; + /// Force and possibly torque + UCL_D_Vec dev_ans; + /// Energy and virial per-atom storage + UCL_D_Vec dev_engv; + + #ifdef GPU_CAST + UCL_D_Vec dev_x_cast; + UCL_D_Vec dev_type_cast; + UCL_H_Vec host_x_cast; + UCL_H_Vec host_type_cast; + #endif - // Buffer for moving floating point data to GPU - NVC_HostT host_write; - // Buffer for moving floating point data to CPU - NVC_Host host_read; + /// Buffer for moving positions to device + UCL_H_Vec host_x; + /// Buffer for moving charge data to GPU + UCL_H_Vec host_q; + /// Buffer for moving quat data to GPU + UCL_H_Vec host_quat; + /// Force and possibly torque data on host + UCL_H_Vec host_ans; + /// Energy/virial data on host + UCL_H_Vec host_engv; - // Timing Stuff - 
NVCTimer time_atom, time_answer; + /// Cell list identifiers for device nbor builds + UCL_D_Vec dev_cell_id; + /// Cell list identifiers for device nbor builds + UCL_D_Vec dev_particle_id; + /// Atom tag information for device nbor builds + UCL_D_Vec dev_tag; + + /// Device timers + UCL_Timer time_pos, time_other, time_answer; + /// Geryon device + UCL_Device *dev; + private: - bool allocated, _eflag, _vflag; - int _atom_fields, _ans_fields; - int _max_atoms, _nall, _inum; - numtyp * _write_loc; - acctyp * _read_loc; + #ifdef GPU_CAST + UCL_Program *atom_program; + UCL_Kernel k_cast_x; + void compile_kernels(UCL_Device &dev); + #endif + + bool _compiled; + + bool alloc(const int max_atoms); + + bool _allocated, _eflag, _vflag, _ef_atom, _vf_atom, _rot, _charge, _other; + int _max_atoms, _nall, _inum, _e_fields, _ev_fields; + bool _gpu_nbor, _bonds; + int *_ilist; + double _time_cast; + + double _gpu_bytes; + + #ifndef USE_OPENCL + CUDPPConfiguration sort_config; + CUDPPHandle sort_plan; + #endif }; #endif + diff --git a/lib/gpu/pair_tex_tar.cu b/lib/gpu/pair_gpu_atom_kernel.cu similarity index 54% rename from lib/gpu/pair_tex_tar.cu rename to lib/gpu/pair_gpu_atom_kernel.cu index 426d2ff059..2d1a6ba85f 100644 --- a/lib/gpu/pair_tex_tar.cu +++ b/lib/gpu/pair_gpu_atom_kernel.cu @@ -12,18 +12,35 @@ ------------------------------------------------------------------------- */ /* ---------------------------------------------------------------------- - Contributing authors: Mike Brown (SNL), wmbrown@sandia.gov - Peng Wang (Nvidia), penwang@nvidia.com - Paul Crozier (SNL), pscrozi@sandia.gov + Contributing authors: Mike Brown (ORNL), brownw@ornl.gov ------------------------------------------------------------------------- */ -#if defined(__APPLE__) -#if _GLIBCXX_ATOMIC_BUILTINS == 1 -#undef _GLIBCXX_ATOMIC_BUILTINS -#endif // _GLIBCXX_ATOMIC_BUILTINS -#endif // __APPLE__ +#ifdef _DOUBLE_DOUBLE +#define numtyp double +#define numtyp4 double4 +#else +#define numtyp float +#define numtyp4 float4 +#endif -#include "pair_gpu_atom.cu" -#include "lj_gpu.cu" -#include "gb_gpu.cu" +#ifdef NV_KERNEL +#include "geryon/ucl_nv_kernel.h" +#else +#pragma OPENCL EXTENSION cl_khr_fp64: enable +#define GLOBAL_ID_X get_global_id(0) +#endif +__kernel void kernel_cast_x(__global numtyp4 *x_type, __global double *x, + __global int *type, const int nall) { + int ii=GLOBAL_ID_X; + + if (ii + +#define _HD_BALANCE_EVERY 25 +#define _HD_BALANCE_WEIGHT 0.5 +#define _HD_BALANCE_GAP 1.05 + +/// Host/device load balancer +template +class PairGPUBalance { + public: + inline PairGPUBalance() : _init_done(false), _measure_this_step(false) {} + inline ~PairGPUBalance() { clear(); } + + /// Clear any old data and setup for new LAMMPS run + inline void init(PairGPUDevice *gpu, const double split); + + /// Clear all host and device data + inline void clear() { + if (_init_done) { + _device_time.clear(); + _measure_this_step=false; + _init_done=false; + } + } + + /// Get a count of the number of particles host will handle for initial alloc + inline int first_host_count(const int nlocal,const bool gpu_nbor, + const double gpu_split) const { + int host_nlocal=0; + if (gpu_nbor && gpu_split!=1.0) { + if (gpu_split>0) + host_nlocal=static_cast(ceil((1.0-gpu_split)*nlocal)); + else + host_nlocal=static_cast(ceil(0.1*nlocal)); + } + return host_nlocal; + } + + /// Return the number of particles the device will handle this timestep + inline int get_gpu_count(const int timestep, const int ago, + const int inum_full); + + /// Return the average 
fraction of particles handled by device on all procs + inline double all_avg_split() { + if (_load_balance) { + double _all_avg_split=0.0; + int nprocs; + MPI_Comm_size(MPI_COMM_WORLD,&nprocs); + MPI_Reduce(&_avg_split,&_all_avg_split,1,MPI_DOUBLE,MPI_SUM,0, + MPI_COMM_WORLD); + _all_avg_split/=nprocs; + return _all_avg_split/_avg_count; + } else + return _actual_split; + } + + /// If CPU neighboring, allow the device fraction to increase on 2nd timestep + inline int ago_first(int ago) const + { if (_avg_count==1 && _actual_split<_desired_split) ago=0; return ago; } + + /// Start the timer for asynchronous device execution + inline void start_timer() { + if (_measure_this_step) { + _device->gpu->sync(); + MPI_Barrier(_device->gpu_comm); + _device_time.start(); + _device->gpu->sync(); + MPI_Barrier(_device->gpu_comm); + _device->start_host_timer(); + } + } + + /// Stop the timer for asynchronous device execution + inline void stop_timer() { if (_measure_this_step) { _device_time.stop(); } } + + /// Calculate the new host/device split based on the cpu and device times + /** \note Only does calculation every _HD_BALANCE_EVERY timesteps + (and first 10) **/ + inline void balance(const double cpu_time, const bool gpu_nbor); + + /// Calls balance() and then get_gpu_count() + inline int balance(const int timestep, const int ago, const int inum_full, + const double cpu_time, const bool gpu_nbor) { + balance(cpu_time,gpu_nbor); + return get_gpu_count(timestep,ago,inum_full); + } + + private: + PairGPUDevice *_device; + UCL_Timer _device_time; + bool _init_done; + + bool _load_balance; + double _actual_split, _avg_split, _desired_split, _max_split; + int _avg_count; + + bool _measure_this_step; + int _inum, _inum_full; +}; + +#define PairGPUBalanceT PairGPUBalance + +template +void PairGPUBalanceT::init(PairGPUDevice *gpu, + const double split) { + clear(); + _init_done=true; + + _device=gpu; + _device_time.init(*gpu->gpu); + + if (split<0.0) { + _load_balance=true; + _desired_split=0.9; + } else { + _load_balance=false; + _desired_split=split; + } + _actual_split=_desired_split; + _avg_split=0.0; + _avg_count=0; +} + +template +int PairGPUBalanceT::get_gpu_count(const int timestep, const int ago, + const int inum_full) { + _measure_this_step=false; + if (_load_balance) { + if (_avg_count<11 || timestep%_HD_BALANCE_EVERY==0) { + _measure_this_step=true; + _inum_full=inum_full; + } + if (ago==0) { + _actual_split=_desired_split; + _max_split=_desired_split; + } + } + _inum=static_cast(floor(_actual_split*inum_full)); + if (_inum==0) _inum++; + return _inum; +} + +template +void PairGPUBalanceT::balance(const double cpu_time, const bool gpu_nbor) { + if (_measure_this_step) { + if (_inum_full==_inum) { + _desired_split=1.0; + return; + } + + _measure_this_step=false; + double gpu_time=_device_time.seconds(); + + double cpu_gpu_time[3], max_times[3]; + cpu_gpu_time[0]=cpu_time/(_inum_full-_inum); + cpu_gpu_time[1]=gpu_time/_inum; + cpu_gpu_time[2]=(_device->host_time()-cpu_time)/_inum_full; + + MPI_Allreduce(cpu_gpu_time,max_times,3,MPI_DOUBLE,MPI_MAX, + _device->gpu_comm); + double split=(max_times[0]+max_times[2])/(max_times[0]+max_times[1]); + split*=_HD_BALANCE_GAP; + + if (split>1.0) + split=1.0; + if (_avg_count<10) + _desired_split=(_desired_split*_avg_count+split)/(_avg_count+1); + else + _desired_split=_desired_split*(1.0-_HD_BALANCE_WEIGHT)+ + _HD_BALANCE_WEIGHT*split; + + if (!gpu_nbor) { + if (_desired_split<_max_split) + _actual_split=_desired_split; + else + _actual_split=_max_split; + } 
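/* [Editor's note] The new split computed above is, per atom and taken over
   the slowest MPI process,

     split = (t_cpu + t_other) / (t_cpu + t_gpu) * _HD_BALANCE_GAP

   For example, with t_cpu = 2.0 us/atom, t_gpu = 0.5 us/atom and shared
   overhead t_other = 0.1 us/atom, split = 2.1/2.5 * 1.05 = 0.88, i.e.
   about 88% of the local particles are assigned to the device; the running
   _desired_split is then averaged toward that value (simple mean for the
   first 10 samples, exponential blend with _HD_BALANCE_WEIGHT afterward). */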
+ } + _avg_split+=_desired_split; + _avg_count++; +} + +#endif + diff --git a/lib/gpu/pair_gpu_build_kernel.cu b/lib/gpu/pair_gpu_build_kernel.cu new file mode 100644 index 0000000000..a08a26800a --- /dev/null +++ b/lib/gpu/pair_gpu_build_kernel.cu @@ -0,0 +1,295 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing authors: Peng Wang (Nvidia), penwang@nvidia.com + Mike Brown (ORNL), brownw@ornl.gov +------------------------------------------------------------------------- */ + +#ifdef NV_KERNEL + +#include "geryon/ucl_nv_kernel.h" +texture neigh_tex; + +#ifdef _DOUBLE_DOUBLE +__inline double4 fetch_pos(const int i, const double4 *pos) +{ + return pos[i]; +} +#else +__inline float4 fetch_pos(const int& i, const float4 *pos) +{ + return tex1Dfetch(neigh_tex, i); +} +#endif + +#else + +#define fetch_pos(i,y) x_[i] + +#endif + +#ifdef _DOUBLE_DOUBLE +#define numtyp double +#define numtyp4 double4 +#endif + +#ifdef _SINGLE_DOUBLE +#define numtyp float +#define numtyp4 float4 +#endif + +#ifndef numtyp +#define numtyp float +#define numtyp4 float4 +#endif + +#define CELL_BLOCK_SIZE 64 +#define BLOCK_2D 8 + +__kernel void transpose(int *out, int *in, int columns_in, int rows_in) +{ + __local float block[BLOCK_2D][BLOCK_2D+1]; + + unsigned ti=THREAD_ID_X; + unsigned tj=THREAD_ID_Y; + unsigned bi=BLOCK_ID_X; + unsigned bj=BLOCK_ID_Y; + + unsigned i=bi*BLOCK_2D+ti; + unsigned j=bj*BLOCK_2D+tj; + if ((i 0 && idx < nall) { + int id_l = cell_id[idx-1]; + if (id != id_l) { + for (int i = id_l+1; i <= id; i++) + cell_counts[i] = idx; + } + } + } +} + +__kernel void calc_neigh_list_cell(numtyp4 *pos, + int *cell_particle_id, + int *cell_counts, + int *nbor_list, + int *host_nbor_list, + int neigh_bin_size, + numtyp cell_size, + int ncellx, int ncelly, int ncellz, + int inum, int nt, int nall) +{ + int tid = threadIdx.x; + int ix = blockIdx.x; + int iy = blockIdx.y % ncelly; + int iz = blockIdx.y / ncelly; + + int icell = ix + iy*ncellx + iz*ncellx*ncelly; + + __shared__ int cell_list_sh[CELL_BLOCK_SIZE]; + __shared__ numtyp4 pos_sh[CELL_BLOCK_SIZE]; + + int icell_begin = cell_counts[icell]; + int icell_end = cell_counts[icell+1]; + + int nborz0 = max(iz-1,0), nborz1 = min(iz+1, ncellz-1), + nbory0 = max(iy-1,0), nbory1 = min(iy+1, ncelly-1), + nborx0 = max(ix-1,0), nborx1 = min(ix+1, ncellx-1); + + numtyp4 diff; + numtyp r2; + for (int ii = 0; ii < ceil((numtyp)(icell_end - icell_begin)/blockDim.x); ii++) { + int i = icell_begin + tid + ii*blockDim.x; + int pid_i = nall, pid_j, stride; + numtyp4 atom_i, atom_j; + int cnt = 0; + int *neigh_counts, *neigh_list; + + if (i < icell_end) + pid_i = cell_particle_id[i]; + + if (pid_i < nt) { + atom_i = fetch_pos(pid_i,pos); //pos[pid_i]; + } + if (pid_i < inum) { + stride=inum; + neigh_counts=nbor_list+stride+pid_i; + neigh_list=neigh_counts+stride; + nbor_list[pid_i]=pid_i; + } else { + stride=nt-inum; + 
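/* [Editor's note] Both branches write a column-major neighbor list: row 0
   of the device nbor_list holds the particle id, row 1 the neighbor count,
   and rows 2..neigh_bin_size+1 the neighbor ids, every row 'stride'
   entries wide, so neighbor k of atom i lands at
   nbor_list[(k+2)*stride + i].  Atoms with inum <= pid_i < nt take this
   host_nbor_list branch instead; its rows are only nt-inum wide and it
   omits the id row, so row 0 is already the count. */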
neigh_counts=host_nbor_list+pid_i-inum; + neigh_list=neigh_counts+stride; + } + + // loop through neighbors + + for (int nborz = nborz0; nborz <= nborz1; nborz++) { + for (int nbory = nbory0; nbory <= nbory1; nbory++) { + for (int nborx = nborx0; nborx <= nborx1; nborx++) { + + int jcell = nborx + nbory*ncellx + nborz*ncellx*ncelly; + + int jcell_begin = cell_counts[jcell]; + int jcell_end = cell_counts[jcell+1]; + int num_atom_cell = jcell_end - jcell_begin; + + // load jcell to shared memory + int num_iter = (int)ceil((numtyp)num_atom_cell/CELL_BLOCK_SIZE); + + for (int k = 0; k < num_iter; k++) { + int end_idx = min(CELL_BLOCK_SIZE, num_atom_cell-k*CELL_BLOCK_SIZE); + + if (tid < end_idx) { + pid_j = cell_particle_id[tid+k*CELL_BLOCK_SIZE+jcell_begin]; + cell_list_sh[tid] = pid_j; + atom_j = fetch_pos(pid_j,pos); //[pid_j]; + pos_sh[tid].x = atom_j.x; + pos_sh[tid].y = atom_j.y; + pos_sh[tid].z = atom_j.z; + } + __syncthreads(); + + if (pid_i < nt) { + + for (int j = 0; j < end_idx; j++) { + int pid_j = cell_list_sh[j]; // gather from shared memory + if (pid_ipid_i) { + diff.x = atom_i.x - pos_sh[j].x; + diff.y = atom_i.y - pos_sh[j].y; + diff.z = atom_i.z - pos_sh[j].z; + + r2 = diff.x*diff.x + diff.y*diff.y + diff.z*diff.z; + if (r2 < cell_size*cell_size && r2 > 1e-5) { + if (cnt < neigh_bin_size) { + *neigh_list = pid_j; + neigh_list+=stride; + } + cnt++; + } + } + } + } + __syncthreads(); + } // for (k) + } + } + } + if (pid_i < nt) + *neigh_counts = cnt; + } // for (i) +} + +__kernel void kernel_special(__global int *dev_nbor, + __global int *host_nbor_list, __global int *tag, + __global int *nspecial, __global int *special, + int inum, int nt, int nall) { + // ii indexes the two interacting particles in gi + int ii=GLOBAL_ID_X; + + if (ii=n1) + nbor+=nall; + if (i>=n2) + nbor+=nall; + } + offset+=nt; + } + if (nbor>=nall) + *list=nbor; + } + } // if ii +} + diff --git a/lib/gpu/pair_gpu_cell.cu b/lib/gpu/pair_gpu_cell.cu deleted file mode 100644 index f933537fec..0000000000 --- a/lib/gpu/pair_gpu_cell.cu +++ /dev/null @@ -1,489 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - Copyright (2003) Sandia Corporation. Under the terms of Contract - DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains - certain rights in this software. This software is distributed under - the GNU General Public License. - - See the README file in the top-level LAMMPS directory. 
-------------------------------------------------------------------------- */ - -/* ---------------------------------------------------------------------- - Contributing authors: Mike Brown (SNL), wmbrown@sandia.gov - Peng Wang (Nvidia), penwang@nvidia.com - Paul Crozier (SNL), pscrozi@sandia.gov -------------------------------------------------------------------------- */ - -#if defined(__APPLE__) -#if _GLIBCXX_ATOMIC_BUILTINS == 1 -#undef _GLIBCXX_ATOMIC_BUILTINS -#endif // _GLIBCXX_ATOMIC_BUILTINS -#endif // __APPLE__ - -#include -#include "lj_gpu_memory.h" -#include "pair_gpu_cell.h" - -static __constant__ float d_boxlo[3]; -static __constant__ float d_boxhi[3]; -static __constant__ float d_cell_size[1]; -static __constant__ float d_skin[1]; - -void init_cell_list_const(double cell_size, double skin, - double *boxlo, double *boxhi) -{ - float cell_size1 = cell_size; - float skin1 = skin; - float boxlo1[3], boxhi1[3]; - for (int i = 0; i < 3; i++) { - boxlo1[i] = boxlo[i]; - boxhi1[i] = boxhi[i]; - } - - cudaMemcpyToSymbol(d_cell_size, &cell_size1, sizeof(float), - 0, cudaMemcpyHostToDevice); - cudaMemcpyToSymbol(d_boxlo, boxlo1, 3*sizeof(float), - 0, cudaMemcpyHostToDevice); - cudaMemcpyToSymbol(d_boxhi, boxhi1, 3*sizeof(float), - 0, cudaMemcpyHostToDevice); - cudaMemcpyToSymbol(d_skin, &skin1, sizeof(float), - 0, cudaMemcpyHostToDevice); -} - -__global__ void kernel_set_cell_list(unsigned int *cell_idx) -{ - unsigned int gid = threadIdx.x + blockIdx.x*blockDim.x; - cell_idx[gid] = BIG_NUMBER; -} - -// build the cell list -__global__ void kernel_build_cell_list(float3 *cell_list, - unsigned int *cell_idx, - int *cell_type, - int *cell_atom, - float3 *pos, - int *type, - const int inum, - const int nall, - const int cell_size) -{ - unsigned int gid = threadIdx.x + blockIdx.x*blockDim.x; - float cSize = d_cell_size[0]; - int ncellx = ceil(((d_boxhi[0] - d_boxlo[0]) + 2.0f*cSize) / cSize); - int ncelly = ceil(((d_boxhi[1] - d_boxlo[1]) + 2.0f*cSize) / cSize); - int ncellz = ceil(((d_boxhi[2] - d_boxlo[2]) + 2.0f*cSize) / cSize); - - if (gid < nall) { - float3 p = pos[gid]; - p.x = fmaxf(p.x, d_boxlo[0]-cSize); - p.x = fminf(p.x, d_boxhi[0]+cSize); - p.y = fmaxf(p.y, d_boxlo[1]-cSize); - p.y = fminf(p.y, d_boxhi[1]+cSize); - p.z = fmaxf(p.z, d_boxlo[2]-cSize); - p.z = fminf(p.z, d_boxhi[2]+cSize); - - int cell_id = (int)(p.x/cSize + 1.0) + (int)(p.y/cSize + 1.0) * ncellx - + (int)(p.z/cSize + 1.0) * ncellx * ncelly; - - int atom_pos = atomicAdd(&cell_atom[cell_id], 1); - int pid = cell_id*cell_size + atom_pos; - - cell_list[pid] = pos[gid]; - cell_type[pid] = type[gid]; - cell_idx [pid] = gid; - - } -} - -__global__ void kernel_test_rebuild(float3 *cell_list, int *cell_atom, int *rebuild) -{ - - float cSize = d_cell_size[0]; - int ncellx = ceil(((d_boxhi[0] - d_boxlo[0]) + 2.0f*cSize) / cSize); - int ncelly = ceil(((d_boxhi[1] - d_boxlo[1]) + 2.0f*cSize) / cSize); - int ncellz = ceil(((d_boxhi[2] - d_boxlo[2]) + 2.0f*cSize) / cSize); - - // calculate 3D block idx from 2d block - int bx = blockIdx.x; - int by = blockIdx.y % ncelly; - int bz = blockIdx.y / ncelly; - - int tid = threadIdx.x; - - // compute cell idx from 3D block idx - int cid = bx + INT_MUL(by, ncellx) + INT_MUL(bz, INT_MUL(ncellx,ncelly)); - int pbase = INT_MUL(cid,blockDim.x); // atom position id in cell list - - float skin = d_skin[0]; - float lowx = d_boxlo[0] + (bx-1)*cSize - 0.5*skin; - float hix = lowx + cSize + skin; - float lowy = d_boxlo[1] + (by-1)*cSize - 0.5*skin; - float hiy = lowy + cSize + skin; - float lowz = 
d_boxlo[2] + (bz-1)*cSize - 0.5*skin; - float hiz = lowz + cSize + skin; - - for (int i = tid; i < cell_atom[cid]; i += blockDim.x) { - int pid = pbase + i; - float3 p = cell_list[pid]; - p.x = fmaxf(p.x, d_boxlo[0]-cSize); - p.x = fminf(p.x, d_boxhi[0]+cSize); - p.y = fmaxf(p.y, d_boxlo[1]-cSize); - p.y = fminf(p.y, d_boxhi[1]+cSize); - p.z = fmaxf(p.z, d_boxlo[2]-cSize); - p.z = fminf(p.z, d_boxhi[2]+cSize); - - if (p.x < lowx || p.x > hix || p.y < lowy || p.y > hiy || p.z < lowz || p.z > hiz) { - *rebuild = 1; - } - } - -} - - -__global__ void kernel_test_overflow(int *cell_atom, int *overflow, const int ncell) -{ - unsigned int gid = threadIdx.x + blockIdx.x*blockDim.x; - - if (gid < ncell) { - if (cell_atom[gid] > blockDim.x) - *overflow = 1; - } -} - -__global__ void kernel_copy_list(float3 *cell_list, unsigned int *cell_idx, int *cell_atom, float3 *pos) -{ - float cSize = d_cell_size[0]; - int ncellx = ceil(((d_boxhi[0] - d_boxlo[0]) + 2.0f*cSize) / cSize); - int ncelly = ceil(((d_boxhi[1] - d_boxlo[1]) + 2.0f*cSize) / cSize); - int ncellz = ceil(((d_boxhi[2] - d_boxlo[2]) + 2.0f*cSize) / cSize); - - // calculate 3D block idx from 2d block - int bx = blockIdx.x; - int by = blockIdx.y % ncelly; - int bz = blockIdx.y / ncelly; - - int tid = threadIdx.x; - - // compute cell idx from 3D block idx - int cid = bx + INT_MUL(by, ncellx) + INT_MUL(bz, INT_MUL(ncellx,ncelly)); - int pbase = INT_MUL(cid,blockDim.x); // atom position id in cell list - - for (int i = tid; i < cell_atom[cid]; i += blockDim.x) { - int pid = pbase + i; - cell_list[pid] = pos[cell_idx[pid]]; - } - -} - - -__global__ void radixSortBlocks(unsigned int *keys, float3 *values1, int *values2, unsigned int nbits, unsigned int startbit); - - - -#ifdef __DEVICE_EMULATION__ -#define __SYNC __syncthreads(); -#else -#define __SYNC -#endif - - -#define WARP_SIZE 32 - -template -__device__ T scanwarp(T val, T* sData) -{ - // The following is the same as 2 * RadixSort::WARP_SIZE * warpId + threadInWarp = - // 64*(threadIdx.x >> 5) + (threadIdx.x & (RadixSort::WARP_SIZE - 1)) - int idx = 2 * threadIdx.x - (threadIdx.x & (WARP_SIZE - 1)); - sData[idx] = 0; - idx += WARP_SIZE; - sData[idx] = val; __SYNC - -#ifdef __DEVICE_EMULATION__ - T t = sData[idx - 1]; __SYNC - sData[idx] += t; __SYNC - t = sData[idx - 2]; __SYNC - sData[idx] += t; __SYNC - t = sData[idx - 4]; __SYNC - sData[idx] += t; __SYNC - t = sData[idx - 8]; __SYNC - sData[idx] += t; __SYNC - t = sData[idx - 16]; __SYNC - sData[idx] += t; __SYNC -#else - if (0 <= maxlevel) { sData[idx] += sData[idx - 1]; } __SYNC - if (1 <= maxlevel) { sData[idx] += sData[idx - 2]; } __SYNC - if (2 <= maxlevel) { sData[idx] += sData[idx - 4]; } __SYNC - if (3 <= maxlevel) { sData[idx] += sData[idx - 8]; } __SYNC - if (4 <= maxlevel) { sData[idx] += sData[idx -16]; } __SYNC -#endif - - return sData[idx] - val; // convert inclusive -> exclusive -} - -__device__ unsigned int scan(unsigned int idata) -{ - extern __shared__ unsigned int ptr[]; - - unsigned int idx = threadIdx.x; - - unsigned int val = idata; - - val = scanwarp(val, ptr); - __syncthreads(); - - if ((idx & (WARP_SIZE - 1)) == WARP_SIZE - 1) - { - ptr[idx >> 5] = val + idata; - } - __syncthreads(); - -#ifndef __DEVICE_EMULATION__ - if (idx < WARP_SIZE) -#endif - { - ptr[idx] = scanwarp(ptr[idx], ptr); - } - __syncthreads(); - - val += ptr[idx >> 5]; - - return val; -} - - -__device__ unsigned int rank(unsigned int preds) -{ - unsigned int address = scan(preds); - - __shared__ unsigned int numtrue; - if (threadIdx.x == blockDim.x - 
1) - { - numtrue = address + preds; - } - __syncthreads(); - - unsigned int rank; - unsigned int idx = threadIdx.x; - rank = (preds) ? address : numtrue + idx - address; - - return rank; -} - -template -__device__ void radixSortBlock(unsigned int *key, float3 *value1, int *value2, unsigned int nbits, unsigned int startbit) -{ - extern __shared__ unsigned int sMem1[]; - __shared__ float sMem2[blockSize]; - __shared__ int sMem3[blockSize]; - - int tid = threadIdx.x; - - for(unsigned int shift = startbit; shift < (startbit + nbits); ++shift) { - unsigned int lsb; - lsb = !(((*key) >> shift) & 0x1); - - unsigned int r; - - r = rank(lsb); - - // This arithmetic strides the ranks across 4 CTA_SIZE regions - sMem1[r] = *key; - __syncthreads(); - - // The above allows us to read without 4-way bank conflicts: - *key = sMem1[tid]; - __syncthreads(); - - sMem2[r] = (*value1).x; - __syncthreads(); - (*value1).x = sMem2[tid]; - __syncthreads(); - - sMem2[r] = (*value1).y; - __syncthreads(); - (*value1).y = sMem2[tid]; - __syncthreads(); - - sMem2[r] = (*value1).z; - __syncthreads(); - (*value1).z = sMem2[tid]; - __syncthreads(); - - sMem3[r] = *value2; - __syncthreads(); - *value2 = sMem3[tid]; - __syncthreads(); - - } - -} - -__global__ void radixSortBlocks(unsigned int *keys, - float3 *values1, - int *values2, - unsigned int nbits, - unsigned int startbit) -{ - - extern __shared__ unsigned int sMem[]; - - int gid = threadIdx.x + blockIdx.x * blockDim.x; - unsigned int key; - float3 value1; - int value2; - key = keys[gid]; - value1 = values1[gid]; - value2 = values2[gid]; - __syncthreads(); - - if (blockDim.x == 64) - radixSortBlock<64>(&key, &value1, &value2, nbits, startbit); - else if (blockDim.x == 128) - radixSortBlock<128>(&key, &value1, &value2, nbits, startbit); - else if (blockDim.x == 256) - radixSortBlock<256>(&key, &value1, &value2, nbits, startbit); - - keys[gid] = key; - values1[gid] = value1; - values2[gid] = value2; -} - -void sortBlocks(unsigned int *keys, float3 *values1, int *values2, const int size, int cell_size) -{ - int i = 0; - const unsigned int bitSize = sizeof(unsigned int)*8; - const unsigned int bitStep = 4; - const int gSize = size/cell_size; - while (bitSize > i*bitStep) { - radixSortBlocks<<>>(keys, values1, values2, bitStep, i*bitStep); - i++; - } -} - -static float3 *d_pos, *pos_temp; -static int *d_type; -static int *d_overflow, *d_rebuild; - -void init_cell_list(cell_list &cell_list_gpu, - const int nall, - const int ncell, - const int buffer) -{ - cudaMalloc((void**)&(cell_list_gpu.pos), ncell*buffer*sizeof(float3)); - cudaMalloc((void**)&(cell_list_gpu.idx), ncell*buffer*sizeof(unsigned int)); - cudaMalloc((void**)&(cell_list_gpu.type), ncell*buffer*sizeof(int)); - cudaMalloc((void**)&(cell_list_gpu.natom), ncell*sizeof(int)); - - cudaMallocHost((void**)&pos_temp, nall*sizeof(float3)); - cudaMalloc((void**)&d_pos, nall*sizeof(float3)); - cudaMalloc((void**)&d_type, nall*sizeof(int)); - cudaMalloc((void**)&d_overflow, sizeof(int)); - cudaMalloc((void**)&d_rebuild, sizeof(int)); - - cudaMemset(cell_list_gpu.natom, 0, ncell*sizeof(int)); - cudaMemset(cell_list_gpu.pos, 0, ncell*buffer*sizeof(float3)); -} - -void clear_cell_list(cell_list &cell_list_gpu) -{ - cudaFree(cell_list_gpu.pos); - cudaFree(cell_list_gpu.idx); - cudaFree(cell_list_gpu.natom); - cudaFree(cell_list_gpu.type); - - cudaFreeHost(pos_temp); - cudaFree(d_pos); - cudaFree(d_type); - cudaFree(d_overflow); - cudaFree(d_rebuild); -} - - -void build_cell_list(double *atom_pos, int *atom_type, - cell_list 
&cell_list_gpu, - const int ncell, const int ncellx, const int ncelly, const int ncellz, - const int buffer, const int inum, const int nall, const int ago) -{ - - cudaError_t err; - - cudaMemset(d_overflow, 0, sizeof(int)); - cudaMemset(d_rebuild, 0, sizeof(int)); - - // copy position and type to GPU - for (int i = 0; i < 3*nall; i+=3) { - pos_temp[i/3] = make_float3(atom_pos[i], atom_pos[i+1], atom_pos[i+2]); - } - cudaMemcpy(d_pos, pos_temp, nall*sizeof(float3), cudaMemcpyHostToDevice); - cudaMemcpy(d_type, atom_type, nall*sizeof(int), cudaMemcpyHostToDevice); - - static int first_build = 1; - int rebuild = 0; - - // copy the last built cell-list and test whether it needs to be rebuilt - if (!first_build) { - - dim3 grid(ncellx, ncelly*ncellz); - kernel_copy_list<<>>(cell_list_gpu.pos, - cell_list_gpu.idx, - cell_list_gpu.natom, d_pos); - cudaMemset(d_rebuild, 0, sizeof(int)); - kernel_test_rebuild<<>>(cell_list_gpu.pos, - cell_list_gpu.natom, - d_rebuild); - cudaMemcpy(&rebuild, d_rebuild, sizeof(int), cudaMemcpyDeviceToHost); - - err = cudaGetLastError(); - assert(err == cudaSuccess); - } - - if (ago == 0) rebuild = 1; - - // build cell-list for the first time - if (first_build || rebuild) { - first_build = 0; - // cout << "Building cell list..." << endl; - cudaMemset(cell_list_gpu.natom, 0, ncell*sizeof(int)); - // initialize d_cell_idx for the sorting routine - kernel_set_cell_list<<>>(cell_list_gpu.idx); - - // build cell list - dim3 blockDim(128); - dim3 gridDim(static_cast(ceil(static_cast(nall)/blockDim.x))); - kernel_build_cell_list<<>>(cell_list_gpu.pos, - cell_list_gpu.idx, - cell_list_gpu.type, - cell_list_gpu.natom, - d_pos, d_type, inum, nall, buffer); - err = cudaGetLastError(); - assert(err == cudaSuccess); - // check cell list overflow - int overflow = 0; - int gDimCell = static_cast(ceil(static_cast(ncell)/buffer)); - kernel_test_overflow<<>>(cell_list_gpu.natom, - d_overflow, ncell); - cudaMemcpy(&overflow, d_overflow, sizeof(int), cudaMemcpyDeviceToHost); - - if (overflow > 0) { - printf("\n BLOCK_1D too small for cell list, please increase it!"); - printf("\n BLOCK_1D = %d",BLOCK_1D); - printf("\n ncell = %d",ncell); - printf("\n gDimCell = %d",gDimCell); - printf("\n overflow = %d \n",overflow); - exit(0); - } - - // sort atoms in every cell by atom index to avoid floating point associativity problem. - sortBlocks(cell_list_gpu.idx, cell_list_gpu.pos, - cell_list_gpu.type, ncell*buffer, buffer); - - cudaThreadSynchronize(); - err = cudaGetLastError(); - assert(err == cudaSuccess); - } - -} diff --git a/lib/gpu/pair_gpu_cell.h b/lib/gpu/pair_gpu_cell.h deleted file mode 100644 index 48dab9adb0..0000000000 --- a/lib/gpu/pair_gpu_cell.h +++ /dev/null @@ -1,80 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - Copyright (2003) Sandia Corporation. Under the terms of Contract - DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains - certain rights in this software. This software is distributed under - the GNU General Public License. - - See the README file in the top-level LAMMPS directory. 
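
build_cell_list above finishes by sorting the atoms inside each cell by their original index (sortBlocks drives radixSortBlocks over 4-bit key digits), so that after the non-deterministic atomicAdd binning the force loops visit neighbors in a reproducible order; the source comment calls this out as avoiding the floating-point associativity problem. A host-side sketch of the same least-significant-digit radix pass, with illustrative names; the patch runs one such sort per cell block on the GPU.

    #include <stdint.h>
    #include <vector>

    // Host-side sketch of the LSD radix sort that sortBlocks performs:
    // sort 32-bit keys 4 bits at a time, carrying a value array along so
    // that entries with equal keys keep a stable, index-based order.
    void radix_sort_pairs(std::vector<uint32_t> &keys, std::vector<int> &vals) {
      const int bits_per_pass = 4, buckets = 1 << bits_per_pass;
      std::vector<uint32_t> k2(keys.size());
      std::vector<int> v2(vals.size());
      for (int shift = 0; shift < 32; shift += bits_per_pass) {
        int count[16] = {0};
        for (size_t i = 0; i < keys.size(); i++)
          count[(keys[i] >> shift) & (buckets - 1)]++;
        int offset[16], sum = 0;                    // exclusive prefix sum,
        for (int b = 0; b < buckets; b++) {         // done by scan() on the GPU
          offset[b] = sum;
          sum += count[b];
        }
        for (size_t i = 0; i < keys.size(); i++) {  // stable scatter
          int b = (keys[i] >> shift) & (buckets - 1);
          k2[offset[b]] = keys[i];
          v2[offset[b]] = vals[i];
          offset[b]++;
        }
        keys.swap(k2);
        vals.swap(v2);
      }
    }
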
-------------------------------------------------------------------------- */ - -/* ---------------------------------------------------------------------- - Contributing authors: Mike Brown (SNL), wmbrown@sandia.gov - Peng Wang (Nvidia), penwang@nvidia.com - Paul Crozier (SNL), pscrozi@sandia.gov -------------------------------------------------------------------------- */ - -#ifndef PAIR_GPU_CELL_H -#define PAIR_GPU_CELL_H - -#ifdef WINDLL -#include -#endif - -#ifdef WINDLL -#define EXTERN extern "C" __declspec(dllexport) -#else -#define EXTERN -#endif -using namespace std; - -static float kernelTime = 0.0; -static int ncellx, ncelly, ncellz; -static float *energy, *d_energy; -static float3 *d_force, *f_temp, *v_temp, *d_virial; - - -typedef struct { - float3 *pos; - unsigned int *idx; - int *type; - int *natom; -} cell_list; - -static cell_list cell_list_gpu; - -__global__ void kernel_set_cell_list(unsigned int *cell_idx); -__global__ void kernel_build_cell_list(float3 *cell_list, - unsigned int *cell_idx, - int *cell_type, - int *cell_atom, - float3 *pos, - int *type, - const int inum, - const int nall); -__global__ void kernel_test_rebuild(float3 *cell_list, int *cell_atom, int *rebuild); -__global__ void kernel_copy_list(float3 *cell_list, - unsigned int *cell_idx, - int *cell_atom, - float3 *pos); -__global__ void kernel_test_overflow(int *cell_atom, int *overflow, const int ncell); -void sortBlocks(unsigned int *keys, float3 *values1, int *values2, const int size); - -void init_cell_list_const(double cell_size, double skin, - double *boxlo, double *boxhi); -void init_cell_list(cell_list &cell_list_gpu, - const int nall, - const int ncell, - const int buffer); - -void build_cell_list(double *atom_pos, int *atom_type, - cell_list &cell_list_gpu, - const int ncell, const int ncellx, const int ncelly, const int ncellz, - const int buffer, const int inum, const int nall, const int ago); - -void clear_cell_list(cell_list &cell_list_gpu); - -#endif diff --git a/lib/gpu/pair_gpu_device.cpp b/lib/gpu/pair_gpu_device.cpp new file mode 100644 index 0000000000..0262b3dcd6 --- /dev/null +++ b/lib/gpu/pair_gpu_device.cpp @@ -0,0 +1,263 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. 
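
The deleted pair_gpu_cell.h interface above had a plain init/build/clear lifecycle on the host. Its call sites are not part of this diff, so the driver below is only a sketch of how a caller might have used it; every value, the step loop and the rebuild cadence are assumptions, and only the five function signatures come from the header.

    #include "pair_gpu_cell.h"   // the (now removed) header declaring these calls

    // Assumed driver for the old host-side cell-list API.
    void old_cell_list_driver(double *pos, int *type,
                              double *boxlo, double *boxhi,
                              double cell_size, double skin,
                              int ncell, int ncellx, int ncelly, int ncellz,
                              int buffer, int inum, int nall, int nsteps) {
      cell_list cl;                                     // struct from the header
      init_cell_list_const(cell_size, skin, boxlo, boxhi);
      init_cell_list(cl, nall, ncell, buffer);

      for (int step = 0; step < nsteps; step++) {
        int ago = step % 10;     // pretend the host reneighbors every 10 steps
        build_cell_list(pos, type, cl, ncell, ncellx, ncelly, ncellz,
                        buffer, inum, nall, ago);
        // ago == 0 forces a rebuild; otherwise build_cell_list's internal
        // displacement test decides whether the old binning is still valid.
        // ... launch force kernels that read cl ...
      }
      clear_cell_list(cl);
    }
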
+------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing authors: Mike Brown (ORNL), brownw@ornl.gov +------------------------------------------------------------------------- */ + +#include "pair_gpu_device.h" +#include "pair_gpu_precision.h" +#include +#include + +#define PairGPUDeviceT PairGPUDevice + +template +PairGPUDeviceT::PairGPUDevice() : _init_count(0), _device_init(false), + _gpu_mode(GPU_FORCE), _first_device(0), + _last_device(0) { +} + +template +PairGPUDeviceT::~PairGPUDevice() { + clear_device(); +} + +template +bool PairGPUDeviceT::init_device(const int first_gpu, const int last_gpu, + const int gpu_mode, const double p_split) { + if (_device_init) + return true; + _device_init=true; + _first_device=first_gpu; + _last_device=last_gpu; + _gpu_mode=gpu_mode; + _particle_split=p_split; + + // Get the rank within the world + MPI_Comm_rank(MPI_COMM_WORLD,&_world_me); + MPI_Comm_size(MPI_COMM_WORLD,&_world_size); + + // Get the names of all nodes + int name_length; + char node_name[MPI_MAX_PROCESSOR_NAME]; + char node_names[MPI_MAX_PROCESSOR_NAME*_world_size]; + MPI_Get_processor_name(node_name,&name_length); + MPI_Allgather(&node_name,MPI_MAX_PROCESSOR_NAME,MPI_CHAR,&node_names, + MPI_MAX_PROCESSOR_NAME,MPI_CHAR,MPI_COMM_WORLD); + std::string node_string=std::string(node_name); + + // Get the number of procs per node + std::map name_map; + std::map::iterator np; + for (int i=0; i<_world_size; i++) { + std::string i_string=std::string(&node_names[i*MPI_MAX_PROCESSOR_NAME]); + np=name_map.find(i_string); + if (np==name_map.end()) + name_map[i_string]=1; + else + np->second++; + } + int procs_per_node=name_map.begin()->second; + + // Assign a unique id to each node + int split_num=0, split_id=0; + for (np=name_map.begin(); np!=name_map.end(); ++np) { + if (np->first==node_string) + split_id=split_num; + split_num++; + } + + // Set up a per node communicator and find rank within + MPI_Comm node_comm; + MPI_Comm_split(MPI_COMM_WORLD, split_id, 0, &node_comm); + int node_rank; + MPI_Comm_rank(node_comm,&node_rank); + + // set the device ID + _procs_per_gpu=static_cast(ceil(static_cast(procs_per_node)/ + (last_gpu-first_gpu+1))); + int my_gpu=node_rank/_procs_per_gpu; + + // Set up a per device communicator + MPI_Comm_split(node_comm,my_gpu,0,&gpu_comm); + MPI_Comm_rank(gpu_comm,&_gpu_rank); + + gpu=new UCL_Device(); + if (my_gpu>=gpu->num_devices()) + return false; + + gpu->set(my_gpu); + return true; +} + +template +bool PairGPUDeviceT::init(const bool charge, const bool rot, const int nlocal, + const int host_nlocal, const int nall, + const int maxspecial, const bool gpu_nbor, + const int gpu_host, const int max_nbors, + const double cell_size, const bool pre_cut) { + if (!_device_init) + return false; + if (_init_count==0) { + // Initialize atom and nbor data + if (!atom.init(nlocal,nall,charge,rot,*gpu,gpu_nbor, + gpu_nbor && maxspecial>0)) + return false; + if (!nbor.init(nlocal,host_nlocal,max_nbors,maxspecial,*gpu,gpu_nbor, + gpu_host,pre_cut)) + return false; + nbor.cell_size(cell_size); + } else { + if (cell_size>nbor.cell_size()) + nbor.cell_size(cell_size); + } + + _init_count++; + return true; +} + +template +void PairGPUDeviceT::init_message(FILE *screen, const char *name, + const int first_gpu, const int last_gpu) { + #ifdef USE_OPENCL + std::string fs=""; + #else + std::string fs=toa(gpu->free_gigabytes())+"/"; + #endif + + if (_world_me == 0 && screen) { + 
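
init_device above works out which GPU each rank should use from MPI alone: processor names are gathered, MPI_COMM_WORLD is split into one communicator per node, the node's ranks are spread evenly across the requested devices, and a second split yields a communicator per shared device that is kept for later load balancing. A condensed, standalone sketch of that pattern; the names are illustrative and error handling is omitted.

    #include <mpi.h>
    #include <cmath>
    #include <map>
    #include <string>
    #include <vector>

    // Sketch of the rank-to-GPU mapping in init_device():
    //   1) gather processor names, 2) split MPI_COMM_WORLD per node,
    //   3) split the node communicator again per shared device.
    // Returns the device index this rank should select.
    int assign_device(int first_gpu, int last_gpu, MPI_Comm &gpu_comm) {
      int me, size, len;
      MPI_Comm_rank(MPI_COMM_WORLD, &me);
      MPI_Comm_size(MPI_COMM_WORLD, &size);

      char name[MPI_MAX_PROCESSOR_NAME];
      std::vector<char> names(MPI_MAX_PROCESSOR_NAME * size);
      MPI_Get_processor_name(name, &len);
      MPI_Allgather(name, MPI_MAX_PROCESSOR_NAME, MPI_CHAR,
                    &names[0], MPI_MAX_PROCESSOR_NAME, MPI_CHAR,
                    MPI_COMM_WORLD);

      // count ranks per node and give each distinct node name a color
      std::map<std::string, int> per_node;
      for (int i = 0; i < size; i++)
        per_node[std::string(&names[i * MPI_MAX_PROCESSOR_NAME])]++;
      int color = 0, c = 0;
      for (std::map<std::string, int>::iterator it = per_node.begin();
           it != per_node.end(); ++it, ++c)
        if (it->first == std::string(name)) color = c;

      MPI_Comm node_comm;
      MPI_Comm_split(MPI_COMM_WORLD, color, 0, &node_comm);
      int node_rank;
      MPI_Comm_rank(node_comm, &node_rank);

      // spread the node's ranks evenly over the requested devices
      int procs_per_node = per_node.begin()->second;   // assumes identical nodes
      int ngpu = last_gpu - first_gpu + 1;
      int procs_per_gpu = (int)ceil((double)procs_per_node / ngpu);
      int my_gpu = node_rank / procs_per_gpu;

      MPI_Comm_split(node_comm, my_gpu, 0, &gpu_comm);
      return my_gpu;   // init_device() passes this index to gpu->set()
    }

Like the patch, this assumes every node runs the same number of ranks, since only the first map entry is consulted for procs_per_node.
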
fprintf(screen,"\n-------------------------------------"); + fprintf(screen,"-------------------------------------\n"); + fprintf(screen,"- Using GPGPU acceleration for %s:\n",name); + fprintf(screen,"- with %d procs per device.\n",_procs_per_gpu); + fprintf(screen,"-------------------------------------"); + fprintf(screen,"-------------------------------------\n"); + + for (int i=first_gpu; i<=last_gpu; i++) { + std::string sname=gpu->name(i)+", "+toa(gpu->cores(i))+" cores, "+fs+ + toa(gpu->gigabytes(i))+" GB, "+toa(gpu->clock_rate(i))+ + " GHZ ("; + if (sizeof(PRECISION)==4) { + if (sizeof(ACC_PRECISION)==4) + sname+="Single Precision)"; + else + sname+="Mixed Precision)"; + } else + sname+="Double Precision)"; + + fprintf(screen,"GPU %d: %s\n",i,sname.c_str()); + } + + fprintf(screen,"-------------------------------------"); + fprintf(screen,"-------------------------------------\n\n"); + } +} + +template +void PairGPUDeviceT::output_times(UCL_Timer &time_pair, const double avg_split, + const double max_bytes, FILE *screen) { + double single[5], times[5]; + + single[0]=atom.transfer_time(); + single[1]=nbor.time_nbor.total_seconds(); + single[2]=nbor.time_kernel.total_seconds(); + single[3]=time_pair.total_seconds(); + single[4]=atom.cast_time(); + + MPI_Reduce(single,times,5,MPI_DOUBLE,MPI_SUM,0,MPI_COMM_WORLD); + + double my_max_bytes=max_bytes; + double mpi_max_bytes; + MPI_Reduce(&my_max_bytes,&mpi_max_bytes,1,MPI_DOUBLE,MPI_MAX,0,MPI_COMM_WORLD); + double max_mb=mpi_max_bytes/(1024.0*1024.0); + + if (world_me()==0) + if (screen && times[3]>0.0) { + fprintf(screen,"\n\n-------------------------------------"); + fprintf(screen,"--------------------------------\n"); + fprintf(screen," GPU Time Info (average): "); + fprintf(screen,"\n-------------------------------------"); + fprintf(screen,"--------------------------------\n"); + + if (procs_per_gpu()==1) { + fprintf(screen,"Data Transfer: %.4f s.\n",times[0]/_world_size); + fprintf(screen,"Data Cast/Pack: %.4f s.\n",times[4]/_world_size); + fprintf(screen,"Neighbor copy: %.4f s.\n",times[1]/_world_size); + if (nbor.gpu_nbor()) + fprintf(screen,"Neighbor build: %.4f s.\n",times[2]/_world_size); + else + fprintf(screen,"Neighbor unpack: %.4f s.\n",times[2]/_world_size); + fprintf(screen,"Force calc: %.4f s.\n",times[3]/_world_size); + } + fprintf(screen,"Average split: %.4f.\n",avg_split); + fprintf(screen,"Max Mem / Proc: %.2f MB.\n",max_mb); + + fprintf(screen,"-------------------------------------"); + fprintf(screen,"--------------------------------\n\n"); + } +} + +template +void PairGPUDeviceT::clear() { + if (_init_count>0) { + _init_count--; + if (_init_count==0) { + atom.clear(); + nbor.clear(); + } + } +} + +template +void PairGPUDeviceT::clear_device() { + while (_init_count>0) + clear(); + if (_device_init) { + delete gpu; + _device_init=false; + } +} + +template +double PairGPUDeviceT::host_memory_usage() const { + return atom.host_memory_usage()+ + nbor.host_memory_usage()+4*sizeof(numtyp)+ + sizeof(PairGPUDevice); +} + +template class PairGPUDevice; +PairGPUDevice pair_gpu_device; + +bool lmp_init_device(const int first_gpu, const int last_gpu, + const int gpu_mode, const double particle_split) { + return pair_gpu_device.init_device(first_gpu,last_gpu,gpu_mode, + particle_split); +} + +void lmp_clear_device() { + pair_gpu_device.clear_device(); +} + +double lmp_gpu_forces(double **f, double **tor, double *eatom, + double **vatom, double *virial, double &ecoul) { + if (pair_gpu_device.init_count()) { + 
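
output_times above follows a common reporting pattern: per-rank timer totals are summed onto rank 0 with MPI_Reduce and divided by the number of ranks to print averages, while the per-process memory high-water mark is reduced with MPI_MAX. A small standalone sketch of that pattern; the two inputs stand in for the real timer and memory values.

    #include <mpi.h>
    #include <cstdio>

    // Sketch of the reduction pattern used by output_times(): sum the
    // per-rank timings and report the average, take the maximum of the
    // per-process memory high-water marks.
    void report_sketch(double my_force_seconds, double my_max_bytes) {
      int me, size;
      MPI_Comm_rank(MPI_COMM_WORLD, &me);
      MPI_Comm_size(MPI_COMM_WORLD, &size);

      double sum_seconds = 0.0, max_bytes = 0.0;
      MPI_Reduce(&my_force_seconds, &sum_seconds, 1, MPI_DOUBLE, MPI_SUM, 0,
                 MPI_COMM_WORLD);
      MPI_Reduce(&my_max_bytes, &max_bytes, 1, MPI_DOUBLE, MPI_MAX, 0,
                 MPI_COMM_WORLD);

      if (me == 0)
        std::printf("Force calc: %.4f s (avg), Max Mem / Proc: %.2f MB\n",
                    sum_seconds / size, max_bytes / (1024.0 * 1024.0));
    }
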
pair_gpu_device.stop_host_timer(); + pair_gpu_device.gpu->sync(); + double evdw=pair_gpu_device.atom.energy_virial(eatom,vatom,virial,ecoul); + pair_gpu_device.atom.get_answers(f,tor); + + return evdw; + } + return 0.0; +} diff --git a/lib/gpu/pair_gpu_device.h b/lib/gpu/pair_gpu_device.h new file mode 100644 index 0000000000..e7a78328d9 --- /dev/null +++ b/lib/gpu/pair_gpu_device.h @@ -0,0 +1,140 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing authors: Mike Brown (ORNL), brownw@ornl.gov +------------------------------------------------------------------------- */ + +#ifndef PAIR_GPU_DEVICE_H +#define PAIR_GPU_DEVICE_H + +#include "pair_gpu_atom.h" +#include "pair_gpu_nbor.h" +#include "mpi.h" +#include +#include "stdio.h" +#include + +template +class PairGPUDevice { + public: + PairGPUDevice(); + ~PairGPUDevice(); + + /// Initialize the device for use by this process + /** Sets up a per-device MPI communicator for load balancing and initializes + * the device (>=first_gpu and <=last_gpu) that this proc will be using **/ + bool init_device(const int first_gpu, const int last_gpu, + const int gpu_mode, const double particle_split); + + /// Initialize the device for Atom and Neighbor storage + /** \param rot True if quaternions need to be stored + * \param nlocal Total number of local particles to allocate memory for + * \param host_nlocal Initial number of host particles to allocate memory for + * \param nall Total number of local+ghost particles + * \param gpu_nbor True if neighboring is performed on device + * \param gpu_host 0 if host will not perform force calculations, + * 1 if gpu_nbor is true, and host needs a half nbor list, + * 2 if gpu_nbor is true, and host needs a full nbor list + * \param max_nbors Initial number of rows in the neighbor matrix + * \param cell_size cutoff+skin + * \param pre_cut True if cutoff test will be performed in separate kernel + * than the force kernel **/ + bool init(const bool charge, const bool rot, const int nlocal, + const int host_nlocal, const int nall, const int maxspecial, + const bool gpu_nbor, const int gpu_host, const int max_nbors, + const double cell_size, const bool pre_cut); + + /// Output a message for pair_style acceleration with device stats + void init_message(FILE *screen, const char *name, + const int first_gpu, const int last_gpu); + + /// Output a message with timing information + void output_times(UCL_Timer &time_pair, const double avg_split, + const double max_bytes, FILE *screen); + + /// Clear all memory on host and device associated with atom and nbor data + void clear(); + + /// Clear all memory on host and device + void clear_device(); + + /// Start timer on host + inline void start_host_timer() { _cpu_full=MPI_Wtime(); } + + /// Stop timer on host + inline void stop_host_timer() { _cpu_full=MPI_Wtime()-_cpu_full; } + + /// Return host time + inline double host_time() 
{ return _cpu_full; } + + /// Return host memory usage in bytes + double host_memory_usage() const; + + /// Return the number of procs sharing a device (size of device commincator) + inline int procs_per_gpu() const { return _procs_per_gpu; } + /// Return my rank in the device communicator + inline int gpu_rank() const { return _gpu_rank; } + /// My rank within all processes + inline int world_me() const { return _world_me; } + /// Total number of processes + inline int world_size() const { return _world_size; } + /// Return the 'mode' for acceleration: GPU_FORCE or GPU_NEIGH + inline int gpu_mode() const { return _gpu_mode; } + /// Index of first device used by a node + inline int first_device() const { return _first_device; } + /// Index of last device used by a node + inline int last_device() const { return _last_device; } + /// Particle split defined in fix + inline double particle_split() const { return _particle_split; } + /// Return the initialization count for the device + inline int init_count() const { return _init_count; } + + // -------------------------- DEVICE DATA ------------------------- + + /// Geryon Device + UCL_Device *gpu; + /// Device communicator + MPI_Comm gpu_comm; + + enum{GPU_FORCE, GPU_NEIGH}; + + // --------------------------- ATOM DATA -------------------------- + + /// Atom Data + PairGPUAtom atom; + + // --------------------------- NBOR DATA ---------------------------- + + /// Neighbor Data + PairGPUNbor nbor; + + private: + int _init_count; + bool _device_init; + int _procs_per_gpu, _gpu_rank, _world_me, _world_size; + int _gpu_mode, _first_device, _last_device; + double _particle_split; + double _cpu_full; + + template + inline std::string toa(const t& in) { + std::ostringstream o; + o.precision(2); + o << in; + return o.str(); + } + +}; + +#endif diff --git a/lib/gpu/pair_gpu_nbor.cpp b/lib/gpu/pair_gpu_nbor.cpp new file mode 100644 index 0000000000..63048b7560 --- /dev/null +++ b/lib/gpu/pair_gpu_nbor.cpp @@ -0,0 +1,412 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. 
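
The documentation in pair_gpu_device.h above pins down what init()'s arguments mean: gpu_host of 0, 1 or 2, cell_size equal to cutoff plus skin, pre_cut for a separate cutoff kernel, and so on. The sketch below shows how a pair style might fill them in for a simple setup; the precision template arguments, the device reference and every chosen value are assumptions, and only the parameter order and meanings come from the header.

    #include "pair_gpu_device.h"

    // Assumed setup call against PairGPUDevice::init(); the values are only
    // meant to show how the documented parameters map onto a simple case.
    bool setup_sketch(PairGPUDevice<float,double> &device,
                      int nlocal, int nall, double cutoff, double skin) {
      const bool charge      = false;  // no per-atom charges stored on the device
      const bool rot         = false;  // no quaternions (not an aspherical style)
      const int  host_nlocal = 0;      // no particles kept back for host forces
      const int  maxspecial  = 0;      // no special-bond entries handled on device
      const bool gpu_nbor    = true;   // build neighbor lists on the device
      const int  gpu_host    = 0;      // host will not perform force calculations
      const int  max_nbors   = 300;    // initial rows in the neighbor matrix
      const bool pre_cut     = false;  // cutoff test stays in the force kernel
      return device.init(charge, rot, nlocal, host_nlocal, nall, maxspecial,
                         gpu_nbor, gpu_host, max_nbors,
                         cutoff + skin,   // cell_size, as documented above
                         pre_cut);
    }
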
+------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing authors: Mike Brown (ORNL), brownw@ornl.gov + Peng Wang (Nvidia), penwang@nvidia.com +------------------------------------------------------------------------- */ + +#include "pair_gpu_precision.h" +#include "pair_gpu_nbor.h" +#include "math.h" + +#ifdef USE_OPENCL +#include "pair_gpu_nbor_cl.h" +#else +#include "pair_gpu_nbor_ptx.h" +#include "pair_gpu_build_ptx.h" +#endif + +int PairGPUNbor::bytes_per_atom(const int max_nbors) const { + if (_gpu_nbor) + return (max_nbors+2)*sizeof(int); + else if (_use_packing) + return ((max_nbors+2)*2)*sizeof(int); + else + return (max_nbors+3)*sizeof(int); +} + +bool PairGPUNbor::init(const int inum, const int host_inum, const int max_nbors, + const int maxspecial, UCL_Device &devi, + const bool gpu_nbor, const int gpu_host, + const bool pre_cut) { + clear(); + + dev=&devi; + _gpu_nbor=gpu_nbor; + if (gpu_host==0) + _gpu_host=false; + else if (gpu_host==1) + _gpu_host=true; + else + // Not yet implemented + assert(0==1); + + if (pre_cut || gpu_nbor==false) + _alloc_packed=true; + else + _alloc_packed=false; + + bool success=true; + + // Initialize timers for the selected GPU + time_nbor.init(*dev); + time_kernel.init(*dev); + time_nbor.zero(); + time_kernel.zero(); + + _max_atoms=static_cast(static_cast(inum)*1.10); + if (_max_atoms==0) + _max_atoms=1000; + + _max_host=static_cast(static_cast(host_inum)*1.10); + _max_nbors=max_nbors; + + _maxspecial=maxspecial; + if (gpu_nbor==false) + _maxspecial=0; + + if (gpu_nbor==false) + success=success && (host_packed.alloc(2*IJ_SIZE,*dev, + UCL_WRITE_OPTIMIZED)==UCL_SUCCESS); + alloc(success); + if (_use_packing==false) + compile_kernels(devi); + + return success; +} + +void PairGPUNbor::alloc(bool &success) { + dev_nbor.clear(); + host_acc.clear(); + if (_use_packing==false || _gpu_nbor) + success=success && (dev_nbor.alloc((_max_nbors+2)*_max_atoms,*dev, + UCL_READ_ONLY)==UCL_SUCCESS); + else + success=success && (dev_nbor.alloc(3*_max_atoms,*dev, + UCL_READ_ONLY)==UCL_SUCCESS); + success=success && (host_acc.alloc((_max_atoms+_max_host)*2,*dev, + UCL_WRITE_OPTIMIZED)==UCL_SUCCESS); + + _c_bytes=dev_nbor.row_bytes(); + if (_alloc_packed) { + dev_packed.clear(); + success=success && (dev_packed.alloc((_max_nbors+2)*_max_atoms,*dev, + UCL_READ_ONLY)==UCL_SUCCESS); + _c_bytes+=dev_packed.row_bytes(); + } + if (_max_host>0) { + host_nbor.clear(); + dev_host_nbor.clear(); + success=success && (host_nbor.alloc((_max_nbors+1)*_max_host,*dev, + UCL_RW_OPTIMIZED)==UCL_SUCCESS); + success=success && (dev_host_nbor.alloc((_max_nbors+1)*_max_host, + *dev,UCL_WRITE_ONLY)==UCL_SUCCESS); + _c_bytes+=dev_host_nbor.row_bytes(); + } + if (_maxspecial>0) { + dev_nspecial.clear(); + dev_special.clear(); + dev_special_t.clear(); + int at=_max_atoms+_max_host; + success=success && (dev_nspecial.alloc(3*at,*dev, + UCL_READ_ONLY)==UCL_SUCCESS); + success=success && (dev_special.alloc(_maxspecial*at,*dev, + UCL_READ_ONLY)==UCL_SUCCESS); + success=success && (dev_special_t.alloc(_maxspecial*at,*dev, + UCL_READ_ONLY)==UCL_SUCCESS); + _gpu_bytes+=dev_nspecial.row_bytes()+dev_special.row_bytes()+ + dev_special_t.row_bytes(); + } + + _allocated=true; +} + +void PairGPUNbor::clear() { + _gpu_bytes=0.0; + _cell_bytes=0.0; + _c_bytes=0.0; + if (_allocated) { + _allocated=false; + + host_packed.clear(); + host_acc.clear(); + dev_nbor.clear(); + dev_host_nbor.clear(); + 
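
bytes_per_atom() above distinguishes three storage modes; a quick numeric check for a representative max_nbors of 300 with 4-byte ints makes the cost difference concrete (the value 300 is just an assumed example).

    #include <cstdio>

    // Worked example for PairGPUNbor::bytes_per_atom() with max_nbors = 300:
    //   device neighboring (_gpu_nbor):   (300 + 2) * 4     = 1208 bytes/atom
    //   packed host lists (_use_packing): (300 + 2) * 2 * 4 = 2416 bytes/atom
    //   unpacked host lists (default):    (300 + 3) * 4     = 1212 bytes/atom
    int main() {
      const int max_nbors = 300;
      std::printf("gpu_nbor : %lu\n",
                  (unsigned long)((max_nbors + 2) * sizeof(int)));
      std::printf("packing  : %lu\n",
                  (unsigned long)(((max_nbors + 2) * 2) * sizeof(int)));
      std::printf("default  : %lu\n",
                  (unsigned long)((max_nbors + 3) * sizeof(int)));
      return 0;
    }
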
dev_packed.clear(); + host_nbor.clear(); + dev_nspecial.clear(); + dev_special.clear(); + dev_special_t.clear(); + + time_kernel.clear(); + time_nbor.clear(); + } + + if (_compiled) { + if (_gpu_nbor) { + k_cell_id.clear(); + k_cell_counts.clear(); + k_build_nbor.clear(); + k_transpose.clear(); + k_special.clear(); + delete build_program; + } else { + k_nbor.clear(); + delete nbor_program; + } + _compiled=false; + } +} + +double PairGPUNbor::host_memory_usage() const { + if (_gpu_nbor) { + if (_gpu_host) + return host_nbor.row_bytes()*host_nbor.rows(); + else + return 0; + } else + return host_packed.row_bytes()*host_packed.rows()+host_acc.row_bytes()+ + sizeof(PairGPUNbor); +} + +void PairGPUNbor::get_host(const int inum, int *ilist, int *numj, + int **firstneigh, const int block_size) { + time_nbor.start(); + + UCL_H_Vec ilist_view; + ilist_view.view(ilist,inum,*dev); + ucl_copy(dev_nbor,ilist_view,true); + + UCL_D_Vec nbor_offset; + UCL_H_Vec host_offset; + + int copy_count=0; + int ij_count=0; + int acc_count=0; + int dev_count=0; + int *h_ptr=host_packed.begin(); + _nbor_pitch=inum; + + for (int ii=0; ii acc_view; + acc_view.view_offset(inum,dev_nbor,inum*2); + ucl_copy(acc_view,host_acc,true); + time_nbor.stop(); + + if (_use_packing==false) { + time_kernel.start(); + int GX=static_cast(ceil(static_cast(inum)/block_size)); + k_nbor.set_size(GX,block_size); + k_nbor.run(&dev_nbor.begin(), &dev_packed.begin(), &inum); + time_kernel.stop(); + } +} + +void PairGPUNbor::compile_kernels(UCL_Device &dev) { + std::string flags="-cl-fast-relaxed-math -cl-mad-enable"; + + if (_gpu_nbor==false) { + nbor_program=new UCL_Program(dev); + nbor_program->load_string(pair_gpu_nbor_kernel,flags.c_str()); + k_nbor.set_function(*nbor_program,"kernel_unpack"); + } else { + build_program=new UCL_Program(dev); + #ifdef USE_OPENCL + std::cerr << "CANNOT CURRENTLY USE GPU NEIGHBORING WITH OPENCL\n"; + exit(1); + #else + build_program->load_string(pair_gpu_build_kernel,flags.c_str()); + #endif + k_cell_id.set_function(*build_program,"calc_cell_id"); + k_cell_counts.set_function(*build_program,"kernel_calc_cell_counts"); + k_build_nbor.set_function(*build_program,"calc_neigh_list_cell"); + k_transpose.set_function(*build_program,"transpose"); + k_special.set_function(*build_program,"kernel_special"); + neigh_tex.get_texture(*build_program,"neigh_tex"); + } + _compiled=true; +} + +template +void PairGPUNbor::build_nbor_list(const int inum, const int host_inum, + const int nall, + PairGPUAtom &atom, + double *boxlo, double *boxhi, int *tag, + int **nspecial, int **special, bool &success, + int &mn) { + const int nt=inum+host_inum; + + if (_maxspecial>0) { + time_nbor.start(); + UCL_H_Vec view_nspecial, view_special, view_tag; + view_nspecial.view(nspecial[0],nt*3,*dev); + view_special.view(special[0],nt*_maxspecial,*dev); + view_tag.view(tag,nall,*dev); + ucl_copy(dev_nspecial,view_nspecial,nt*3,false); + ucl_copy(dev_special_t,view_special,nt*_maxspecial,false); + ucl_copy(atom.dev_tag,view_tag,nall,false); + time_nbor.stop(); + time_nbor.add_to_total(); + time_kernel.start(); + const int b2x=8; + const int b2y=8; + const int g2x=static_cast(ceil(static_cast(_maxspecial)/b2x)); + const int g2y=static_cast(ceil(static_cast(nt)/b2y)); + k_transpose.set_size(g2x,g2y,b2x,b2y); + k_transpose.run(&dev_special.begin(),&dev_special_t.begin(),&_maxspecial, + &nt); + } else + time_kernel.start(); + + _nbor_pitch=inum; + neigh_tex.bind_float(atom.dev_x,4); + + int ncellx, ncelly, ncellz, ncell_3d; + ncellx = 
static_cast(ceil(((boxhi[0] - boxlo[0]) + + 2.0*_cell_size)/_cell_size)); + ncelly = static_cast(ceil(((boxhi[1] - boxlo[1]) + + 2.0*_cell_size)/_cell_size)); + ncellz = static_cast(ceil(((boxhi[2] - boxlo[2]) + + 2.0*_cell_size)/_cell_size)); + ncell_3d = ncellx * ncelly * ncellz; + UCL_D_Vec cell_counts; + cell_counts.alloc(ncell_3d+1,dev_nbor); + _cell_bytes=cell_counts.row_bytes(); + + /* build cell list on GPU */ + const int neigh_block=128; + const int GX=(int)ceil((float)nall/neigh_block); + const numtyp boxlo0=static_cast(boxlo[0]); + const numtyp boxlo1=static_cast(boxlo[1]); + const numtyp boxlo2=static_cast(boxlo[2]); + const numtyp boxhi0=static_cast(boxhi[0]); + const numtyp boxhi1=static_cast(boxhi[1]); + const numtyp boxhi2=static_cast(boxhi[2]); + const numtyp cell_size_cast=static_cast(_cell_size); + k_cell_id.set_size(GX,neigh_block); + k_cell_id.run(&atom.dev_x.begin(), &atom.dev_cell_id.begin(), + &atom.dev_particle_id.begin(), + &boxlo0, &boxlo1, &boxlo2, &boxhi0, &boxhi1, + &boxhi2, &cell_size_cast, &ncellx, &ncelly, &nall); + + atom.sort_neighbor(nall); + + /* calculate cell count */ + k_cell_counts.set_size(GX,neigh_block); + k_cell_counts.run(&atom.dev_cell_id.begin(), &cell_counts.begin(), &nall, + &ncell_3d); + + /* build the neighbor list */ + const int cell_block=64; + k_build_nbor.set_size(ncellx, ncelly*ncellz, cell_block, 1); + k_build_nbor.run(&atom.dev_x.begin(), &atom.dev_particle_id.begin(), + &cell_counts.begin(), &dev_nbor.begin(), + &dev_host_nbor.begin(), &_max_nbors, &cell_size_cast, + &ncellx, &ncelly, &ncellz, &inum, &nt, &nall); + + /* Get the maximum number of nbors and realloc if necessary */ + UCL_D_Vec numj; + numj.view_offset(inum,dev_nbor,inum); + ucl_copy(host_acc,numj,inum,false); + if (nt>inum) { + UCL_H_Vec host_offset; + host_offset.view_offset(inum,host_acc,nt-inum); + ucl_copy(host_offset,dev_host_nbor,nt-inum,false); + } + mn=host_acc[0]; + for (int i=1; i_max_nbors) { + mn=static_cast(static_cast(mn)*1.10); + dev_nbor.clear(); + success=success && (dev_nbor.alloc((mn+1)*_max_atoms,atom.dev_cell_id, + UCL_READ_ONLY)==UCL_SUCCESS); + _gpu_bytes=dev_nbor.row_bytes(); + if (_max_host>0) { + host_nbor.clear(); + dev_host_nbor.clear(); + success=success && (host_nbor.alloc((mn+1)*_max_host,dev_nbor, + UCL_RW_OPTIMIZED)==UCL_SUCCESS); + success=success && (dev_host_nbor.alloc((mn+1)*_max_host, + dev_nbor,UCL_WRITE_ONLY)==UCL_SUCCESS); + _gpu_bytes+=dev_host_nbor.row_bytes(); + } + if (_alloc_packed) { + dev_packed.clear(); + success=success && (dev_packed.alloc((mn+2)*_max_atoms,*dev, + UCL_READ_ONLY)==UCL_SUCCESS); + _gpu_bytes+=dev_packed.row_bytes(); + } + if (!success) + return; + _max_nbors=mn; + time_kernel.stop(); + time_kernel.add_to_total(); + build_nbor_list(inum, host_inum, nall, atom, boxlo, boxhi, tag, nspecial, + special, success, mn); + return; + } + + if (_maxspecial>0) { + const int GX2=static_cast(ceil(static_cast(nt)/cell_block)); + k_special.set_size(GX2,cell_block); + k_special.run(&dev_nbor.begin(), &dev_host_nbor.begin(), + &atom.dev_tag.begin(), &dev_nspecial.begin(), + &dev_special.begin(), &inum, &nt, &nall); + } + time_kernel.stop(); + + time_nbor.start(); + if (_gpu_host) + ucl_copy(host_nbor,dev_host_nbor,host_inum*(mn+1),false); + time_nbor.stop(); +} + +template void PairGPUNbor::build_nbor_list + (const int inum, const int host_inum, const int nall, + PairGPUAtom &atom, double *boxlo, double *boxhi, + int *, int **, int **, bool &success, int &mn); + diff --git a/lib/gpu/pair_gpu_nbor.cu 
b/lib/gpu/pair_gpu_nbor.cu deleted file mode 100644 index 20fba00691..0000000000 --- a/lib/gpu/pair_gpu_nbor.cu +++ /dev/null @@ -1,113 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - Copyright (2003) Sandia Corporation. Under the terms of Contract - DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains - certain rights in this software. This software is distributed under - the GNU General Public License. - - See the README file in the top-level LAMMPS directory. -------------------------------------------------------------------------- */ - -/* ---------------------------------------------------------------------- - Contributing authors: Mike Brown (SNL), wmbrown@sandia.gov - Peng Wang (Nvidia), penwang@nvidia.com - Paul Crozier (SNL), pscrozi@sandia.gov -------------------------------------------------------------------------- */ - -#include "pair_gpu_nbor.h" - -int PairGPUNbor::bytes_per_atom(const int max_nbors) const { - if (_use_packing) - return (max_nbors*2+4)*sizeof(int); - else - return (max_nbors+3)*sizeof(int); -} - -bool PairGPUNbor::init(const int ij_size, const int max_atoms, - const int max_nbors) { - bool success=true; - if (allocated) - clear(); - - // Initialize timers for the selected GPU - time_nbor.init(); - - if (_use_packing) - success=success && dev_nbor.alloc((max_nbors+4)*max_atoms); - else - success=success && dev_nbor.alloc(3*max_atoms); - - success=success && ij.alloc(max_nbors*max_atoms); - success=success && host_ij.alloc_w(ij_size); - - allocated=true; - - return success; -} - -void PairGPUNbor::resize(const int nlocal, const int max_nbor, bool &success) { - dev_nbor.clear(); - ij.clear(); - if (_use_packing) - success=success && dev_nbor.alloc((max_nbor+4)*nlocal); - else - success=success && dev_nbor.alloc(3*nlocal); - success=success && ij.alloc(max_nbor*nlocal); - allocated=true; -} - -void PairGPUNbor::clear() { - if (!allocated) - return; - allocated=false; - - ij.clear(); - host_ij.clear(); - dev_nbor.clear(); -} - -double PairGPUNbor::host_memory_usage() const { - return IJ_SIZE*sizeof(int)+sizeof(PairGPUNbor); -} - -void PairGPUNbor::reset(const int inum, int *ilist, const int *numj, - cudaStream_t &s) { - ij_total=0; - - dev_nbor.copy_from_host(ilist,inum); - int acc=0; - - int ij_size=host_ij.numel(); - if (inum*20) { - host_ij.copy_to_device(dev_nbor.begin()+inum+offset,hi,s); - host_ij.copy_to_device(half,dev_nbor.begin()+2*inum+offset,hi,s); - } - } -} diff --git a/lib/gpu/pair_gpu_nbor.h b/lib/gpu/pair_gpu_nbor.h index c505120785..403bd7aed4 100644 --- a/lib/gpu/pair_gpu_nbor.h +++ b/lib/gpu/pair_gpu_nbor.h @@ -12,77 +12,196 @@ ------------------------------------------------------------------------- */ /* ---------------------------------------------------------------------- - Contributing authors: Mike Brown (SNL), wmbrown@sandia.gov - Peng Wang (Nvidia), penwang@nvidia.com - Paul Crozier (SNL), pscrozi@sandia.gov + Contributing authors: Mike Brown (ORNL), brownw@ornl.gov ------------------------------------------------------------------------- */ #ifndef PAIR_GPU_NBOR_H #define PAIR_GPU_NBOR_H -#include "nvc_macros.h" -#include "nvc_timer.h" -#include "nvc_memory.h" +#include "pair_gpu_atom.h" #define IJ_SIZE 131072 +#ifdef USE_OPENCL + +#include "geryon/ocl_device.h" +#include "geryon/ocl_timer.h" +#include 
"geryon/ocl_mat.h" +#include "geryon/ocl_kernel.h" +#include "geryon/ocl_texture.h" +using namespace ucl_opencl; + +#else + +#include "geryon/nvd_device.h" +#include "geryon/nvd_timer.h" +#include "geryon/nvd_mat.h" +#include "geryon/nvd_kernel.h" +#include "geryon/nvd_texture.h" +using namespace ucl_cudadr; + +#endif + class PairGPUNbor { public: - PairGPUNbor() : _use_packing(false), allocated(false) {} + PairGPUNbor() : _allocated(false), _use_packing(false), _compiled(false) {} ~PairGPUNbor() { clear(); } - /// Determine whether neighbor packing should be used - /** If true, twice as much memory is reserved to allow packing neighbors by - * atom for coalesced access after cutoff evaluation. This can be used - * for expensive potentials where it is more efficient to evaluate the - * cutoff separately from the potential in order to reduce thread divergence - * for expensive routines **/ + /// Determine whether neighbor unpacking should be used + /** If false, twice as much memory is reserved to allow unpacking neighbors by + * atom for coalesced access. **/ void packing(const bool use_packing) { _use_packing=use_packing; } - /// Called once to allocate memory - bool init(const int ij_size, const int max_atoms, const int max_nbors); + /// Clear any old data and setup for new LAMMPS run + /** \param inum Initial number of particles whose neighbors stored on device + * \param host_inum Initial number of particles whose nbors copied to host + * \param max_nbors Initial number of rows in the neighbor matrix + * \param gpu_nbor True if device will perform neighboring + * \param gpu_host 0 if host will not perform force calculations, + * 1 if gpu_nbor is true, and host needs a half nbor list, + * 2 if gpu_nbor is true, and host needs a full nbor list + * \param pre_cut True if cutoff test will be performed in separate kernel + * than the force kernel **/ + bool init(const int inum, const int host_inum, const int max_nbors, + const int maxspecial, UCL_Device &dev, const bool gpu_nbor, + const int gpu_host, const bool pre_cut); + + /// Set the size of the cutoff+skin + inline void cell_size(const double size) { _cell_size=size; } - void resize(const int nlocal, const int max_nbor, bool &success); + /// Get the size of the cutoff+skin + inline double cell_size() const { return _cell_size; } + + /// Check if there is enough memory for neighbor data and realloc if not + /** \param inum Number of particles whose nbors will be stored on device + * \param max_nbor Current max number of neighbors for a particle + * \param success False if insufficient memory **/ + inline void resize(const int inum, const int max_nbor, bool &success) { + if (inum>_max_atoms || max_nbor>_max_nbors) { + _max_atoms=static_cast(static_cast(inum)*1.10); + if (max_nbor>_max_nbors) + _max_nbors=static_cast(static_cast(max_nbor)*1.10); + alloc(success); + } + } + + /// Check if there is enough memory for neighbor data and realloc if not + /** \param inum Number of particles whose nbors will be stored on device + * \param host_inum Number of particles whose nbors will be copied to host + * \param max_nbor Current max number of neighbors for a particle + * \param success False if insufficient memory **/ + inline void resize(const int inum, const int host_inum, const int max_nbor, + bool &success) { + if (inum>_max_atoms || max_nbor>_max_nbors || host_inum>_max_host) { + _max_atoms=static_cast(static_cast(inum)*1.10); + _max_host=static_cast(static_cast(host_inum)*1.10); + if (max_nbor>_max_nbors) + 
_max_nbors=static_cast(static_cast(max_nbor)*1.10); + alloc(success); + } + } /// Free all memory on host and device void clear(); /// Bytes per atom used on device int bytes_per_atom(const int max_nbors) const; + /// Total host memory used by class double host_memory_usage() const; + + /// True if neighboring performed on GPU + inline bool gpu_nbor() const { return _gpu_nbor; } + + /// Make a copy of unpacked nbor lists in the packed storage area (for gb) + inline void copy_unpacked(const int inum, const int maxj) + { ucl_copy(dev_packed,dev_nbor,inum*(maxj+2),true); } - /// Reset neighbor data (first time or from a rebuild) - void reset(const int inum, int *ilist, const int *numj, cudaStream_t &s); - /// Add neighbor data from host - inline void add(const int num_ij, cudaStream_t &s) - { host_ij.copy_to_device(ij.begin()+ij_total,num_ij,s); ij_total+=num_ij; } + /// Copy neighbor list from host (first time or from a rebuild) + void get_host(const int inum, int *ilist, int *numj, + int **firstneigh, const int block_size); + + /// Return the stride in elements for each nbor row + inline int nbor_pitch() const { return _nbor_pitch; } + + /// Return the maximum number of atoms that can currently be stored + inline int max_atoms() const { return _max_atoms; } - /// Pack neighbors satisfying cutoff by atom for coalesced access - void pack_nbors(const int GX, const int BX, const int start, - const int inum, const int form_low, const int form_high); + /// Return the maximum number of nbors for a particle based on current alloc + inline int max_nbors() const { return _max_nbors; } - + /// Loop through neighbor count array and return maximum nbors for a particle + inline int max_nbor_loop(const int inum, int *numj) const { + int mn=0; + for (int i=0; i + void build_nbor_list(const int inum, const int host_inum, const int nall, + PairGPUAtom &atom, double *boxlo, + double *boxhi, int *tag, int **nspecial, int **special, + bool &success, int &max_nbors); + + /// Return the number of bytes used on device + inline double gpu_bytes() { + double res = _gpu_bytes + _c_bytes + _cell_bytes; + if (_gpu_nbor==false) + res += 2*IJ_SIZE*sizeof(int); + + return res; + } + // ------------------------------- Data ------------------------------- - // Store IJ interactions on device - NVC_VecI ij; - // Buffer for moving ij data to GPU - NVC_HostI host_ij; + /// Device neighbor matrix + /** - 1st row is i (index into atom data) + * - 2nd row is numj (number of neighbors) + * - 3rd row is starting location in packed nbors + * - Remaining rows are the neighbors arranged for coalesced access **/ + UCL_D_Vec dev_nbor; + /// Packed storage for neighbor lists copied from host + UCL_D_Vec dev_packed; + /// Host buffer for copying neighbor lists + UCL_H_Vec host_packed; + /// Host storage for nbor counts (row 1) & accumulated neighbor counts (row2) + UCL_H_Vec host_acc; - // --------------- Atom neighbors - // 3 x n - // - 1st row is i - // - 2nd row is numj (number of neighbors) - // - 3rd row is starting address in host_ij of neighbors - NVC_VecI dev_nbor; + // ----------------- Data for GPU Neighbor Calculation --------------- - // --------------- Timing Stuff - NVCTimer time_nbor; + /// Host storage for device calculated neighbor lists + /** Same storage format as device matrix **/ + UCL_H_Vec host_nbor; + /// Device storage for neighbor list matrix that will be copied to host + /** - 1st row is numj + * - Remaining rows are nbors **/ + UCL_D_Vec dev_host_nbor; + /// Device storage for special neighbor counts + 
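
The comment block above fixes the dev_nbor layout: a strided matrix with row pitch equal to the number of particles, whose first three rows hold the particle index, the neighbor count and the starting offset of that particle's neighbors in the packed storage, so that consecutive work-items touch consecutive addresses whenever they read the same row. A small indexing sketch of that layout; the struct and its member names are hypothetical, not part of the patch.

    // Hypothetical host-side view of the strided neighbor matrix described
    // above: element (row, ii) lives at dev_nbor[row * inum + ii], which is
    // what makes same-row reads by consecutive threads coalesced.
    struct NborView {
      const int *dev_nbor;    // strided header matrix (pitch = inum)
      const int *dev_packed;  // packed neighbor indices
      int inum;               // row pitch

      int particle(int ii) const { return dev_nbor[ii]; }         // row 0: i
      int count(int ii)    const { return dev_nbor[inum + ii]; }  // row 1: numj
      const int *first(int ii) const {                            // row 2: offset
        return dev_packed + dev_nbor[2 * inum + ii];
      }
    };
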
UCL_D_Vec dev_nspecial; + /// Device storage for special neighbors + UCL_D_Vec dev_special, dev_special_t; + /// Texture for cached position/type access with CUDA + UCL_Texture neigh_tex; + + /// Device timers + UCL_Timer time_nbor, time_kernel; - int ij_total; private: - bool allocated, _use_packing; + UCL_Device *dev; + UCL_Program *nbor_program, *build_program; + UCL_Kernel k_nbor, k_cell_id, k_cell_counts, k_build_nbor; + UCL_Kernel k_transpose, k_special; + bool _allocated, _use_packing, _compiled; + void compile_kernels(UCL_Device &dev); + int _max_atoms, _max_nbors, _max_host, _nbor_pitch, _maxspecial; + bool _gpu_nbor, _gpu_host, _alloc_packed; + double _cell_size; + + double _gpu_bytes, _c_bytes, _cell_bytes; + void alloc(bool &success); }; #endif + diff --git a/lib/gpu/nvc_traits.h b/lib/gpu/pair_gpu_nbor_kernel.cu similarity index 56% rename from lib/gpu/nvc_traits.h rename to lib/gpu/pair_gpu_nbor_kernel.cu index 41cb9487ec..238023b429 100644 --- a/lib/gpu/nvc_traits.h +++ b/lib/gpu/pair_gpu_nbor_kernel.cu @@ -12,16 +12,35 @@ ------------------------------------------------------------------------- */ /* ---------------------------------------------------------------------- - Contributing authors: Mike Brown (SNL), wmbrown@sandia.gov - Peng Wang (Nvidia), penwang@nvidia.com - Paul Crozier (SNL), pscrozi@sandia.gov + Contributing authors: Mike Brown (ORNL), brownw@ornl.gov ------------------------------------------------------------------------- */ -#ifndef NVC_TEXTURE_TRAITS_H -#define NVC_TEXTURE_TRAITS_H +#ifdef NV_KERNEL -template class nvc_vec_traits; -template <> class nvc_vec_traits { public: typedef float2 vec2; }; -template <> class nvc_vec_traits { public: typedef double2 vec2; }; +#include "geryon/ucl_nv_kernel.h" + +#else + +#define GLOBAL_ID_X get_global_id(0) #endif + +__kernel void kernel_unpack(__global int *dev_nbor, __global int *dev_ij, + const int inum) { + // ii indexes the two interacting particles in gi + int ii=GLOBAL_ID_X; + + if (ii +inline std::ostream & operator<<(std::ostream &out, const _lgpu_float2 &v) { + out << v.x << " " << v.y; + return out; +} + +inline std::ostream & operator<<(std::ostream &out, const _lgpu_float4 &v) { + out << v.x << " " << v.y << " " << v.z; + return out; +} + +inline std::ostream & operator<<(std::ostream &out, const _lgpu_double2 &v) { + out << v.x << " " << v.y; + return out; +} + +inline std::ostream & operator<<(std::ostream &out, const _lgpu_double4 &v) { + out << v.x << " " << v.y << " " << v.z; + return out; +} + +// PRECISION - Precision for rsq, energy, force, and torque calculation +// ACC_PRECISION - Precision for accumulation of energies, forces, and torques +#ifdef _SINGLE_DOUBLE +#define OCL_PRECISION_COMPILE "-D_SINGLE_DOUBLE" +#define PRECISION float +#define ACC_PRECISION double +#define numtyp2 _lgpu_float2 +#define numtyp4 _lgpu_float4 +#define acctyp4 _lgpu_double4 +#endif + +#ifdef _DOUBLE_DOUBLE +#define OCL_PRECISION_COMPILE "-D_DOUBLE_DOUBLE" +#define PRECISION double +#define ACC_PRECISION double +#define numtyp2 _lgpu_double2 +#define numtyp4 _lgpu_double4 +#define acctyp4 _lgpu_double4 +#endif + +#ifndef PRECISION +#define OCL_PRECISION_COMPILE "-D_SINGLE_SINGLE" +#define PRECISION float +#define ACC_PRECISION float +#define numtyp2 _lgpu_float2 +#define numtyp4 _lgpu_float4 +#define acctyp4 _lgpu_float4 +#endif + +#define MAX_SHARED_TYPES 8 +enum{SPHERE_SPHERE,SPHERE_ELLIPSE,ELLIPSE_SPHERE,ELLIPSE_ELLIPSE}; + +#endif + diff --git a/lib/gpu/pair_gpu_texture.h b/lib/gpu/pair_gpu_texture.h 
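
pair_gpu_precision.h above is what makes the _SINGLE_DOUBLE build the "Mixed Precision" case reported by init_message: per-pair arithmetic runs in PRECISION (float) while energies, forces and torques accumulate in ACC_PRECISION (double). A standalone sketch of why the split matters; the pair count and contribution below are arbitrary illustrative numbers.

    #include <cstdio>

    // Sketch of the PRECISION / ACC_PRECISION split defined above: per-pair
    // terms are cheap float math, but the running sums are kept in double so
    // that adding many small contributions does not lose them.
    typedef float  numtyp;   // PRECISION     (per-pair arithmetic)
    typedef double acctyp;   // ACC_PRECISION (accumulation)

    int main() {
      const int npairs = 10000000;
      const numtyp contribution = 1.0e-4f;   // a typical small pair energy

      numtyp sum_single = 0.0f;
      acctyp sum_mixed  = 0.0;
      for (int i = 0; i < npairs; i++) {
        numtyp e = contribution;             // "force kernel" result in float
        sum_single += e;                     // float accumulator drifts
        sum_mixed  += e;                     // double accumulator stays accurate
      }
      std::printf("single accumulation: %.6f\n", (double)sum_single);
      std::printf("mixed  accumulation: %.6f\n", sum_mixed);
      return 0;
    }

Once the float running sum grows large enough that each new term is only a couple of ulps, the single-precision total drifts visibly from the expected value, while the double accumulator reproduces it to the printed precision.
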
deleted file mode 100644
index 6c26b47952..0000000000
--- a/lib/gpu/pair_gpu_texture.h
+++ /dev/null
@@ -1,286 +0,0 @@
-/* ----------------------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   Copyright (2003) Sandia Corporation. Under the terms of Contract
-   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
-   certain rights in this software. This software is distributed under
-   the GNU General Public License.
-
-   See the README file in the top-level LAMMPS directory.
-------------------------------------------------------------------------- */
-
-/* ----------------------------------------------------------------------
-   Contributing authors: Mike Brown (SNL), wmbrown@sandia.gov
-                         Peng Wang (Nvidia), penwang@nvidia.com
-                         Paul Crozier (SNL), pscrozi@sandia.gov
-------------------------------------------------------------------------- */
-
-#include "nvc_traits.h"
-#include "nvc_memory.h"
-
-#ifndef PAIR_GPU_TEXTURE_H
-#define PAIR_GPU_TEXTURE_H
-
-#ifdef _SINGLE_DOUBLE
-#define GB_GPU_DOUBLE
-#endif
-
-#ifdef _DOUBLE_DOUBLE
-#define GB_GPU_DOUBLE
-#endif
-
-// ------------------------------- form ------------------------------------
-
-static texture<int, 2, cudaReadModeElementType> form_tex;
-inline textureReference * form_get_texture() {
-  const textureReference *ptr;
-  cudaGetTextureReference(&ptr,"form_tex");
-  return const_cast<textureReference *>(ptr);
-}
-static __inline__ __device__ int _form_(const int i, const int j) {
-  return tex2D(form_tex,i,j);
-}
-
-// ------------------------------- lshape ------------------------------------
-
-static texture<float, 1, cudaReadModeElementType> lshape_float_tex;
-static texture<int2, 1, cudaReadModeElementType> lshape_double_tex;
-template <class numtyp> inline textureReference * lshape_get_texture() {
-  const textureReference *ptr;
-  cudaGetTextureReference(&ptr,"lshape_float_tex");
-  return const_cast<textureReference *>(ptr);
-}
-template <> inline textureReference * lshape_get_texture<double>() {
-  const textureReference *ptr;
-  cudaGetTextureReference(&ptr,"lshape_double_tex");
-  return const_cast<textureReference *>(ptr);
-}
-template <class numtyp>
-static __inline__ __device__ numtyp _lshape_(const int i)
-  { return tex1Dfetch(lshape_float_tex,i); }
-#ifdef GB_GPU_DOUBLE
-template <>
-static __inline__ __device__ double _lshape_<double>(const int i) {
-  int2 t=tex1Dfetch(lshape_double_tex,i);
-  return __hiloint2double(t.y, t.x);
-}
-#endif
-
-// ------------------------------- shape ------------------------------------
-
-static texture<float, 2, cudaReadModeElementType> shape_float_tex;
-static texture<int2, 2, cudaReadModeElementType> shape_double_tex;
-template <class numtyp> inline textureReference * shape_get_texture() {
-  const textureReference *ptr;
-  cudaGetTextureReference(&ptr,"shape_float_tex");
-  return const_cast<textureReference *>(ptr);
-}
-template <> inline textureReference * shape_get_texture<double>() {
-  const textureReference *ptr;
-  cudaGetTextureReference(&ptr,"shape_double_tex");
-  return const_cast<textureReference *>(ptr);
-}
-template <class numtyp>
-static __inline__ __device__ numtyp _shape_(const int i, const int j) {
-  return tex2D(shape_float_tex,j,i);
-}
-#ifdef GB_GPU_DOUBLE
-template <>
-static __inline__ __device__ double _shape_<double>(const int i, const int j) {
-  int2 t=tex2D(shape_double_tex,j,i);
-  return __hiloint2double(t.y, t.x);
-}
-#endif
-
-// ------------------------------- well ------------------------------------
-
-static texture<float, 2, cudaReadModeElementType> well_float_tex;
-static texture<int2, 2, cudaReadModeElementType> well_double_tex;
-template <class numtyp> inline textureReference * well_get_texture() {
-  const textureReference *ptr;
-  cudaGetTextureReference(&ptr,"well_float_tex");
-  return const_cast<textureReference *>(ptr);
-}
-template <> inline textureReference * well_get_texture<double>() {
-  const textureReference *ptr;
-  cudaGetTextureReference(&ptr,"well_double_tex");
-  return const_cast<textureReference *>(ptr);
-}
-template <class numtyp>
-static __inline__ __device__ numtyp _well_(const int i, const int j)
-  { return tex2D(well_float_tex,j,i); }
-#ifdef GB_GPU_DOUBLE
-template <>
-static __inline__ __device__ double _well_<double>(const int i,const int j) {
-  int2 t=tex2D(well_double_tex,j,i);
-  return __hiloint2double(t.y, t.x);
-}
-#endif
-
-// ------------------------------- sigma ------------------------------------
-
-static texture<float, 2, cudaReadModeElementType> sigma_float_tex;
-static texture<int2, 2, cudaReadModeElementType> sigma_double_tex;
-template <class numtyp> inline textureReference * sigma_get_texture() {
-  const textureReference *ptr;
-  cudaGetTextureReference(&ptr,"sigma_float_tex");
-  return const_cast<textureReference *>(ptr);
-}
-template <> inline textureReference * sigma_get_texture<double>() {
-  const textureReference *ptr;
-  cudaGetTextureReference(&ptr,"sigma_double_tex");
-  return const_cast<textureReference *>(ptr);
-}
-template <class numtyp>
-static __inline__ __device__ numtyp _sigma_(const int i, const int j) {
-  return tex2D(sigma_float_tex,j,i);
-}
-#ifdef GB_GPU_DOUBLE
-template <>
-static __inline__ __device__ double _sigma_<double>(const int i,const int j) {
-  int2 t=tex2D(sigma_double_tex,j,i);
-  return __hiloint2double(t.y, t.x);
-}
-#endif
-
-// ------------------------------- epsilon ------------------------------------
-
-static texture<float, 2, cudaReadModeElementType> epsilon_float_tex;
-static texture<int2, 2, cudaReadModeElementType> epsilon_double_tex;
-template <class numtyp> inline textureReference * epsilon_get_texture() {
-  const textureReference *ptr;
-  cudaGetTextureReference(&ptr,"epsilon_float_tex");
-  return const_cast<textureReference *>(ptr);
-}
-template <> inline textureReference * epsilon_get_texture<double>() {
-  const textureReference *ptr;
-  cudaGetTextureReference(&ptr,"epsilon_double_tex");
-  return const_cast<textureReference *>(ptr);
-}
-template <class numtyp>
-static __inline__ __device__ numtyp _epsilon_(const int i, const int j) {
-  return tex2D(epsilon_float_tex,j,i);
-}
-#ifdef GB_GPU_DOUBLE
-template <>
-static __inline__ __device__ double _epsilon_<double>(const int i,const int j) {
-  int2 t=tex2D(epsilon_double_tex,j,i);
-  return __hiloint2double(t.y, t.x);
-}
-#endif
-
-// ------------------------------- cutsq ------------------------------------
-
-static texture<float, 2, cudaReadModeElementType> cutsq_float_tex;
-static texture<int2, 2, cudaReadModeElementType> cutsq_double_tex;
-template <class numtyp> inline textureReference * cutsq_get_texture() {
-  const textureReference *ptr;
-  cudaGetTextureReference(&ptr,"cutsq_float_tex");
-  return const_cast<textureReference *>(ptr);
-}
-template <> inline textureReference * cutsq_get_texture<double>() {
-  const textureReference *ptr;
-  cudaGetTextureReference(&ptr,"cutsq_double_tex");
-  return const_cast<textureReference *>(ptr);
-}
-template <class numtyp>
-static __inline__ __device__ numtyp _cutsq_(const int i, const int j) {
-  return tex2D(cutsq_float_tex,j,i);
-}
-#ifdef GB_GPU_DOUBLE
-template <>
-static __inline__ __device__ double _cutsq_<double>(const int i,const int j) {
-  int2 t=tex2D(cutsq_double_tex,j,i);
-  return __hiloint2double(t.y, t.x);
-}
-#endif
-
-// ------------------------------- lj1 ------------------------------------
-
-static texture<float2, 2, cudaReadModeElementType> lj1_float_tex;
-static texture<int4, 2, cudaReadModeElementType> lj1_double_tex;
-template <class numtyp> inline textureReference * lj1_get_texture() {
-  const textureReference *ptr;
-  cudaGetTextureReference(&ptr,"lj1_float_tex");
-  return const_cast<textureReference *>(ptr);
-}
-template <> inline textureReference * lj1_get_texture<double>() {
-  const textureReference *ptr;
-  cudaGetTextureReference(&ptr,"lj1_double_tex");
-  return const_cast<textureReference *>(ptr);
-}
-template <class numtyp>
-static __inline__ __device__
-typename nvc_vec_traits<numtyp>::vec2 _lj1_(const int i, const int j) {
-  return tex2D(lj1_float_tex,j,i);
-}
-#ifdef GB_GPU_DOUBLE
-template <>
-static __inline__ __device__ double2 _lj1_<double>(const int i,const int j) {
-  int4 t=tex2D(lj1_double_tex,j,i);
-  double2 ans;
-  ans.x=__hiloint2double(t.y, t.x);
-  ans.y=__hiloint2double(t.w, t.z);
-  return ans;
-}
-#endif
-
-// ------------------------------- lj3 ------------------------------------
-
-static texture<float2, 2, cudaReadModeElementType> lj3_float_tex;
-static texture<int4, 2, cudaReadModeElementType> lj3_double_tex;
-template <class numtyp> inline textureReference * lj3_get_texture() {
-  const textureReference *ptr;
-  cudaGetTextureReference(&ptr,"lj3_float_tex");
-  return const_cast<textureReference *>(ptr);
-}
-template <> inline textureReference * lj3_get_texture<double>() {
-  const textureReference *ptr;
-  cudaGetTextureReference(&ptr,"lj3_double_tex");
-  return const_cast<textureReference *>(ptr);
-}
-template <class numtyp>
-static __inline__ __device__
-typename nvc_vec_traits<numtyp>::vec2 _lj3_(const int i, const int j) {
-  return tex2D(lj3_float_tex,j,i);
-}
-#ifdef GB_GPU_DOUBLE
-template <>
-static __inline__ __device__ double2 _lj3_<double>(const int i,const int j) {
-  int4 t=tex2D(lj3_double_tex,j,i);
-  double2 ans;
-  ans.x=__hiloint2double(t.y, t.x);
-  ans.y=__hiloint2double(t.w, t.z);
-  return ans;
-}
-#endif
-
-// ------------------------------- offset ------------------------------------
-
-static texture<float, 2, cudaReadModeElementType> offset_float_tex;
-static texture<int2, 2, cudaReadModeElementType> offset_double_tex;
-template <class numtyp> inline textureReference * offset_get_texture() {
-  const textureReference *ptr;
-  cudaGetTextureReference(&ptr,"offset_float_tex");
-  return const_cast<textureReference *>(ptr);
-}
-template <> inline textureReference * offset_get_texture<double>() {
-  const textureReference *ptr;
-  cudaGetTextureReference(&ptr,"offset_double_tex");
-  return const_cast<textureReference *>(ptr);
-}
-template <class numtyp>
-static __inline__ __device__ numtyp _offset_(const int i, const int j) {
-  return tex2D(offset_float_tex,j,i);
-}
-#ifdef GB_GPU_DOUBLE
-template <>
-static __inline__ __device__ double _offset_<double>(const int i,const int j) {
-  int2 t=tex2D(offset_double_tex,j,i);
-  return __hiloint2double(t.y, t.x);
-}
-#endif
-
-#endif
diff --git a/lib/gpu/pair_win_sort.cpp b/lib/gpu/pair_win_sort.cpp
new file mode 100644
index 0000000000..9e4e46cc95
--- /dev/null
+++ b/lib/gpu/pair_win_sort.cpp
@@ -0,0 +1,82 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation. Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software. This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
+------------------------------------------------------------------------- */
+
+#include <cstdio>
+#include <cassert>
+
+#ifndef USE_OPENCL
+#include "cudpp.h"
+#endif
+
+class PairWinSort {
+ public:
+  inline PairWinSort() : _allocated(false) {
+    #ifndef USE_OPENCL
+    sort_config.op = CUDPP_ADD;
+    sort_config.datatype = CUDPP_UINT;
+    sort_config.algorithm = CUDPP_SORT_RADIX;
+    sort_config.options = CUDPP_OPTION_KEY_VALUE_PAIRS;
+    #endif
+  }
+  inline ~PairWinSort() { clear(); }
+
+  /// Free all memory on host and device
+  inline void clear() {
+    #ifndef USE_OPENCL
+    if (_allocated) { cudppDestroyPlan(sort_plan); _allocated=false; }
+    #endif
+  }
+
+  /// (Re)allocate a radix-sort plan large enough for max_atoms key/value pairs
+  inline bool alloc(const int max_atoms) {
+    #ifndef USE_OPENCL
+    clear();
+    CUDPPResult result = cudppPlan(&sort_plan, sort_config, max_atoms, 1, 0);
+    if (CUDPP_SUCCESS != result)
+      return false;
+    // Mark the plan as live so clear() releases it later
+    _allocated=true;
+    #endif
+    return true;
+  }
+
+  /// Sort arrays for neighbor list calculation
+  void sort_neighbor(const int num_atoms, unsigned *cell_begin, int *particle_begin) {
+    #ifndef USE_OPENCL
+    CUDPPResult result = cudppSort(sort_plan, cell_begin, particle_begin,
+                                   8*sizeof(unsigned), num_atoms);
+    if (CUDPP_SUCCESS != result) {
+      printf("Error in cudppSort\n");
+      assert(1==0);
+    }
+    #endif
+  }
+
+ private:
+
+  bool _allocated;
+
+  #ifndef USE_OPENCL
+  CUDPPConfiguration sort_config;
+  CUDPPHandle sort_plan;
+  #endif
+};
+
+static PairWinSort win_sort;
+
+// C wrappers exported from the Windows DLL
+extern "C" __declspec(dllexport) bool _win_sort_alloc(const int max_atoms) {
+  return win_sort.alloc(max_atoms);
+}
+
+extern "C" __declspec(dllexport) bool _win_sort(const int num_atoms, unsigned *cell_begin,
+                                                int *particle_begin) {
+  win_sort.sort_neighbor(num_atoms,cell_begin,particle_begin);
+  return true;
+}