diff --git a/lib/cuda/Makefile b/lib/cuda/Makefile new file mode 100644 index 0000000000..844906ba89 --- /dev/null +++ b/lib/cuda/Makefile @@ -0,0 +1,4 @@ +#Makefile for liblammpscuda.a +#No need to modify anything here! The CUDA path is inserted into Makefile.common + +include Makefile.cudalib \ No newline at end of file diff --git a/lib/cuda/Makefile.common b/lib/cuda/Makefile.common new file mode 100644 index 0000000000..b4018cc5ed --- /dev/null +++ b/lib/cuda/Makefile.common @@ -0,0 +1,108 @@ +#Common commandline argument interpreter for compilation with lammpscuda (USER-CUDA) installed + +# make options: +# emu=1 switch to cuda emulation mode (otherwise: use gpu) +# dbg=1 print a lot of debugging output during runtime +# verbose=1 output nvcc command line during compilation +# keep=1 do not delete temporary compilation files (.ii, .cubin, ...) +# cufft=1 use cuda's fast fourier transformation lib "cufft" where possible (otherwise: use cpu fftw) +# binning=1 create virtual particle grid (neighbor-lists otherwise); currently this is not supported +# precision=1 single precision (global setting) +# precision=2 double precision (global setting) + +SHELL = /bin/sh + +# System-specific settings + +CUDA_INSTALL_PATH = /usr/local/cuda +# e.g. in Gentoo +# CUDA_INSTALL_PATH = /opt/cuda + + +#////////////////////////////////////////////////////////////////////////////////////////////// +# no need to change anything below this line +#////////////////////////////////////////////////////////////////////////////////////////////// + +#use CPU FFT if cufft=0 is requested. 
+FALLBACK_FFT = 1 + +#default settings for compiler switches +#ifdef COMPILELIB +#include Makefile.defaults +#else +include ../../lib/cuda/Makefile.defaults +#endif + +#shell echo "Compiling with precision = " ${precision} ", arch = " ${arch} ", cufft = " ${cufft} ", dbg = " ${dbg} ", prec_timer = " ${prec_timer} + +CUDA_FLAGS := -DUNIX +CUDA_USRLIB_CONDITIONAL := + +# debug setting +ifeq ($(dbg), 1) + CUDA_FLAGS += -D_DEBUG -g + NVCC_FLAGS += -g -G +else + NVCC_FLAGS += --compiler-options -fno-strict-aliasing -O2 +endif + +# skip timing on Mac and Windows manually +ifeq ($(prec_timer), 0) + CUDA_FLAGS += -DNO_PREC_TIMING +endif + +# set fft routine +ifeq ($(cufft), 0) + ifneq ($(FALLBACK_FFT), 1) + FFT_INC = -DFFT_NONE + FFT_PATH = + FFT_LIB = + CUDA_FLAGS += -DFFT_NONE + endif +else + CUDA_FLAGS += -DFFT_CUFFT + CUDA_USRLIB_CONDITIONAL += -lcufft +endif + +# make global precision setting +ifeq ($(precision), 1) + CUDA_FLAGS += -DCUDA_PRECISION=1 +else + ifeq ($(precision), 3) + CUDA_FLAGS += -DCUDA_PRECISION=1 -DX_PRECISION=2 + else + ifeq ($(precision), 4) + CUDA_FLAGS += -DCUDA_PRECISION=1 -DX_PRECISION=2 -DV_PRECISION=2 + else + CUDA_FLAGS += -DCUDA_PRECISION=2 + endif + endif +endif + +# make architecture settings +ifeq ($(arch), 13) + CUDA_FLAGS += -DCUDA_ARCH=13 + SMVERSIONFLAGS := -arch sm_13 +else + ifeq ($(arch), 20) + CUDA_FLAGS += -DCUDA_ARCH=20 + #NVCC_FLAGS += -ftz=false -prec-div=true -prec-sqrt=true + NVCC_FLAGS += -ftz=true -prec-div=false -prec-sqrt=false + SMVERSIONFLAGS := -arch sm_20 + else + ifeq ($(arch), 21) + CUDA_FLAGS += -DCUDA_ARCH=20 + #NVCC_FLAGS += -ftz=false -prec-div=true -prec-sqrt=true + NVCC_FLAGS += -ftz=true -prec-div=false -prec-sqrt=false + SMVERSIONFLAGS := -arch sm_21 + else + CUDA_FLAGS += -DCUDA_ARCH=99 + SMVERSIONFLAGS := -arch sm_13 + endif + endif +endif + + + +CCFLAGS := $(CCFLAGS) $(CUDA_FLAGS) \ + -I$(CUDA_INSTALL_PATH)/include diff --git a/lib/cuda/Makefile.cudalib b/lib/cuda/Makefile.cudalib new file mode 100644 
index 0000000000..e60ac38f18 --- /dev/null +++ b/lib/cuda/Makefile.cudalib @@ -0,0 +1,82 @@ +#Makefile for liblammpscuda.a +#No need to modify anything here! The CUDA path is inserted into Makefile.common + +.DEFAULT: lib + +COMPILELIB := 1 + +SHELL = /bin/sh + +CUDA_SRC_DIR = ../cuda +CUDA_TEMP = $(CUDA_SRC_DIR)/.lastmake +CUDA_TEMP_DUMMY := $(shell touch $(CUDA_TEMP) ) +include $(CUDA_TEMP) +CUDA_CU = $(wildcard $(CUDA_SRC_DIR)/*_kernel.cu) +CUDA_CUO = $(CUDA_CU:_kernel.cu=_cu.o) +CUDA_OBJ = $(subst $(CUDA_SRC_DIR)/,,$(CUDA_CUO)) +CUDA_DEP = $(CUDA_OBJ:.o=.d) + +NVCC_FLAGS := + +VPATH = $(CUDA_SRC_DIR) + +#rewriting default settings if new ones are specified + + +ifdef precision +tmp := $(shell sed -i 's|precision ?= [0-9]|precision ?= '${precision}'|g' Makefile.defaults) +endif + +ifdef arch +tmp := $(shell sed -i 's|arch ?= [0-9][0-9]|arch ?= '${arch}'|g' Makefile.defaults) +endif + +ifdef cufft +tmp := $(shell sed -i 's|cufft ?= [0-9]|cufft ?= '${cufft}'|g' Makefile.defaults) +endif + +ifdef dbg +tmp := $(shell sed -i 's|dbg ?= [0-9]|dbg ?= '${dbg}'|g' Makefile.defaults) +endif + +ifdef prec_timer +tmp := $(shell sed -i 's|prec_timer ?= [0-9]|prec_timer ?= '${prec_timer}'|g' Makefile.defaults) +endif + +include Makefile.common + +# verbose nvcc output during compilation +ifeq ($(verbose), 1) + VERBOSE := + NVCC_FLAGS += --ptxas-options=-v +else + VERBOSE := @ +endif + +# keep temporary compilation files of nvcc +ifeq ($(keep), 1) + NVCC_FLAGS += -keep -Xptxas="--verbose" +endif + + +NVCC := $(CUDA_INSTALL_PATH)/bin/nvcc +CUDA_INCLUDES = -I$(CUDA_INSTALL_PATH)/include -I../../src/USER-CUDA +CUDA_USRLIB = + +# Link target + +lib: $(CUDA_OBJ) + $(NVCC) -lib $(CUDA_OBJ) $(CUDA_FLAGS) $(CUDA_USRLIB_CONDITIONAL) -o liblammpscuda.a + +clean: + rm $(CUDA_SRC_DIR)/*.o + rm liblammpscuda.a + +# Library target + + +# Cuda compilation rules + +%_cu.o: %.cu %_kernel.cu %_cu.h cuda_shared.h + $(VERBOSE)$(NVCC) $(NVCC_FLAGS) $(CUDA_FLAGS) $(CUDA_INCLUDES) $(CUDA_USRLIB) 
$(SMVERSIONFLAGS) -o $@ -c $< + diff --git a/lib/cuda/Makefile.defaults b/lib/cuda/Makefile.defaults new file mode 100644 index 0000000000..d006a02d81 --- /dev/null +++ b/lib/cuda/Makefile.defaults @@ -0,0 +1,16 @@ + +#precision setting: 1 single, 2 double, 4 mixed +precision ?= 2 + +#GPU architecture (compute capability): 13, 20, 21 +arch ?= 20 + +#Using cufft (should not be changed) +cufft ?= 1 + +#Using dbg mode +dbg ?= 0 + +#On mac machines set this to 0 in order to avoid usage of linux specific precision timer +prec_timer ?= 1 + diff --git a/lib/cuda/atom_vec_angle_cuda.cu b/lib/cuda/atom_vec_angle_cuda.cu new file mode 100644 index 0000000000..a11d9adbe4 --- /dev/null +++ b/lib/cuda/atom_vec_angle_cuda.cu @@ -0,0 +1,85 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ + +const unsigned int ANGLE_DATA_MASK=X_MASK|V_MASK|F_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|IMAGE_MASK|MOLECULE_MASK; + +#include "atom_vec_angle_cuda_cu.h" + +void Cuda_AtomVecAngleCuda_Init(cuda_shared_data* sdata) +{ + return Cuda_AtomVecCuda_Init(sdata); +} + +int Cuda_AtomVecAngleCuda_PackExchangeList(cuda_shared_data* sdata,int n,int dim,void* buf_send) +{ + const unsigned int data_mask=X_MASK|V_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|IMAGE_MASK|MOLECULE_MASK; + return Cuda_AtomVecCuda_PackExchangeList(sdata,n,dim,buf_send); +} + +int Cuda_AtomVecAngleCuda_PackExchange(cuda_shared_data* sdata,int nsend,void* buf_send,void* copylist) +{ + const unsigned int data_mask=X_MASK|V_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|IMAGE_MASK|MOLECULE_MASK; + return Cuda_AtomVecCuda_PackExchange(sdata,nsend,buf_send,copylist); +} + +int Cuda_AtomVecAngleCuda_UnpackExchange(cuda_shared_data* sdata,int nsend,void* buf_send,void* copylist) +{ + const unsigned int data_mask=X_MASK|V_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|IMAGE_MASK|MOLECULE_MASK; + return Cuda_AtomVecCuda_UnpackExchange(sdata,nsend,buf_send,copylist); +} + +int Cuda_AtomVecAngleCuda_PackBorder(cuda_shared_data* sdata,int nsend,int iswap,void* buf_send,int* pbc,int pbc_flag) +{ + const unsigned int data_mask=X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|MOLECULE_MASK; + return Cuda_AtomVecCuda_PackBorder(sdata,nsend,iswap,buf_send,pbc,pbc_flag); +} + +int Cuda_AtomVecAngleCuda_PackBorderVel(cuda_shared_data* sdata,int nsend,int iswap,void* buf_send,int* pbc,int pbc_flag) +{ + const unsigned int data_mask=X_MASK|V_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|MOLECULE_MASK; + return Cuda_AtomVecCuda_PackBorder(sdata,nsend,iswap,buf_send,pbc,pbc_flag); +} + +int Cuda_AtomVecAngleCuda_PackBorder_Self(cuda_shared_data* sdata,int n,int iswap,int first,int* pbc,int pbc_flag) +{ + const unsigned int data_mask=X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|MOLECULE_MASK; + return 
Cuda_AtomVecCuda_PackBorder_Self(sdata,n,iswap,first,pbc,pbc_flag); +} + +int Cuda_AtomVecAngleCuda_PackBorderVel_Self(cuda_shared_data* sdata,int n,int iswap,int first,int* pbc,int pbc_flag) +{ + const unsigned int data_mask=X_MASK|V_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|MOLECULE_MASK; + return Cuda_AtomVecCuda_PackBorder_Self(sdata,n,iswap,first,pbc,pbc_flag); +} + +int Cuda_AtomVecAngleCuda_UnpackBorder(cuda_shared_data* sdata,int n,int first,void* buf_recv) +{ + const unsigned int data_mask=X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|MOLECULE_MASK; + return Cuda_AtomVecCuda_UnpackBorder(sdata,n,first,buf_recv); +} + +int Cuda_AtomVecAngleCuda_UnpackBorderVel(cuda_shared_data* sdata,int n,int first,void* buf_recv) +{ + const unsigned int data_mask=X_MASK|V_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|MOLECULE_MASK; + return Cuda_AtomVecCuda_UnpackBorder(sdata,n,first,buf_recv); +} diff --git a/lib/cuda/atom_vec_angle_cuda_cu.h b/lib/cuda/atom_vec_angle_cuda_cu.h new file mode 100644 index 0000000000..d8f5a2b9a4 --- /dev/null +++ b/lib/cuda/atom_vec_angle_cuda_cu.h @@ -0,0 +1,15 @@ +#ifndef ATOM_VEC_ANGLE_CUDA_CU_H_ +#define ATOM_VEC_ANGLE_CUDA_CU_H_ + +extern "C" void Cuda_AtomVecAngleCuda_Init(cuda_shared_data* sdata); +extern "C" int Cuda_AtomVecAngleCuda_PackExchangeList(cuda_shared_data* sdata,int n,int dim,void* buf_send); +extern "C" int Cuda_AtomVecAngleCuda_PackExchange(cuda_shared_data* sdata,int nsend,void* buf_send,void* copylist); +extern "C" int Cuda_AtomVecAngleCuda_UnpackExchange(cuda_shared_data* sdata,int nsend,void* buf_send,void* copylist); +extern "C" int Cuda_AtomVecAngleCuda_PackBorder(cuda_shared_data* sdata,int nsend,int iswap,void* buf_send,int* pbc,int pbc_flag); +extern "C" int Cuda_AtomVecAngleCuda_PackBorderVel(cuda_shared_data* sdata,int n,int iswap,void* buf_send,int* pbc,int pbc_flag); +extern "C" int Cuda_AtomVecAngleCuda_PackBorder_Self(cuda_shared_data* sdata,int n,int iswap,int first,int* pbc,int pbc_flag); +extern "C" int 
Cuda_AtomVecAngleCuda_PackBorderVel_Self(cuda_shared_data* sdata,int n,int iswap,int first,int* pbc,int pbc_flag); +extern "C" int Cuda_AtomVecAngleCuda_UnpackBorder(cuda_shared_data* sdata,int n,int first,void* buf_recv); +extern "C" int Cuda_AtomVecAngleCuda_UnpackBorderVel(cuda_shared_data* sdata,int n,int first,void* buf_recv); + +#endif /*ATOM_VEC_ANGLE2_CUDA_CU_H_*/ diff --git a/lib/cuda/atom_vec_atomic_cuda.cu b/lib/cuda/atom_vec_atomic_cuda.cu new file mode 100644 index 0000000000..0a75de2754 --- /dev/null +++ b/lib/cuda/atom_vec_atomic_cuda.cu @@ -0,0 +1,85 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ + +const unsigned int ATOMIC_DATA_MASK=X_MASK|V_MASK|F_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|IMAGE_MASK; + +#include "atom_vec_atomic_cuda_cu.h" + +void Cuda_AtomVecAtomicCuda_Init(cuda_shared_data* sdata) +{ + return Cuda_AtomVecCuda_Init(sdata); +} + +int Cuda_AtomVecAtomicCuda_PackExchangeList(cuda_shared_data* sdata,int n,int dim,void* buf_send) +{ + const unsigned int data_mask=X_MASK|V_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|IMAGE_MASK; + return Cuda_AtomVecCuda_PackExchangeList(sdata,n,dim,buf_send); +} + +int Cuda_AtomVecAtomicCuda_PackExchange(cuda_shared_data* sdata,int nsend,void* buf_send,void* copylist) +{ + const unsigned int data_mask=X_MASK|V_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|IMAGE_MASK; + return Cuda_AtomVecCuda_PackExchange(sdata,nsend,buf_send,copylist); +} + +int Cuda_AtomVecAtomicCuda_UnpackExchange(cuda_shared_data* sdata,int nsend,void* buf_send,void* copylist) +{ + const unsigned int data_mask=X_MASK|V_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|IMAGE_MASK; + return Cuda_AtomVecCuda_UnpackExchange(sdata,nsend,buf_send,copylist); +} + +int Cuda_AtomVecAtomicCuda_PackBorder(cuda_shared_data* sdata,int nsend,int iswap,void* buf_send,int* pbc,int pbc_flag) +{ + const unsigned int data_mask=X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK; + return Cuda_AtomVecCuda_PackBorder(sdata,nsend,iswap,buf_send,pbc,pbc_flag); +} + +int Cuda_AtomVecAtomicCuda_PackBorderVel(cuda_shared_data* sdata,int nsend,int iswap,void* buf_send,int* pbc,int pbc_flag) +{ + const unsigned int data_mask=X_MASK|V_MASK|TAG_MASK|TYPE_MASK|MASK_MASK; + return Cuda_AtomVecCuda_PackBorder(sdata,nsend,iswap,buf_send,pbc,pbc_flag); +} + +int Cuda_AtomVecAtomicCuda_PackBorder_Self(cuda_shared_data* sdata,int n,int iswap,int first,int* pbc,int pbc_flag) +{ + const unsigned int data_mask=X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK; + return Cuda_AtomVecCuda_PackBorder_Self(sdata,n,iswap,first,pbc,pbc_flag); +} + +int 
Cuda_AtomVecAtomicCuda_PackBorderVel_Self(cuda_shared_data* sdata,int n,int iswap,int first,int* pbc,int pbc_flag) +{ + const unsigned int data_mask=X_MASK|V_MASK|TAG_MASK|TYPE_MASK|MASK_MASK; + return Cuda_AtomVecCuda_PackBorder_Self(sdata,n,iswap,first,pbc,pbc_flag); +} + +int Cuda_AtomVecAtomicCuda_UnpackBorder(cuda_shared_data* sdata,int n,int first,void* buf_recv) +{ + const unsigned int data_mask=X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK; + return Cuda_AtomVecCuda_UnpackBorder(sdata,n,first,buf_recv); +} + +int Cuda_AtomVecAtomicCuda_UnpackBorderVel(cuda_shared_data* sdata,int n,int first,void* buf_recv) +{ + const unsigned int data_mask=X_MASK|V_MASK|TAG_MASK|TYPE_MASK|MASK_MASK; + return Cuda_AtomVecCuda_UnpackBorder(sdata,n,first,buf_recv); +} diff --git a/lib/cuda/atom_vec_atomic_cuda_cu.h b/lib/cuda/atom_vec_atomic_cuda_cu.h new file mode 100644 index 0000000000..8e776308e0 --- /dev/null +++ b/lib/cuda/atom_vec_atomic_cuda_cu.h @@ -0,0 +1,15 @@ +#ifndef ATOM_VEC_ATOMIC_CUDA_CU_H_ +#define ATOM_VEC_ATOMIC_CUDA_CU_H_ + +extern "C" void Cuda_AtomVecAtomicCuda_Init(cuda_shared_data* sdata); +extern "C" int Cuda_AtomVecAtomicCuda_PackExchangeList(cuda_shared_data* sdata,int n,int dim,void* buf_send); +extern "C" int Cuda_AtomVecAtomicCuda_PackExchange(cuda_shared_data* sdata,int nsend,void* buf_send,void* copylist); +extern "C" int Cuda_AtomVecAtomicCuda_UnpackExchange(cuda_shared_data* sdata,int nsend,void* buf_send,void* copylist); +extern "C" int Cuda_AtomVecAtomicCuda_PackBorder(cuda_shared_data* sdata,int nsend,int iswap,void* buf_send,int* pbc,int pbc_flag); +extern "C" int Cuda_AtomVecAtomicCuda_PackBorderVel(cuda_shared_data* sdata,int n,int iswap,void* buf_send,int* pbc,int pbc_flag); +extern "C" int Cuda_AtomVecAtomicCuda_PackBorder_Self(cuda_shared_data* sdata,int n,int iswap,int first,int* pbc,int pbc_flag); +extern "C" int Cuda_AtomVecAtomicCuda_PackBorderVel_Self(cuda_shared_data* sdata,int n,int iswap,int first,int* pbc,int pbc_flag); +extern "C" int 
Cuda_AtomVecAtomicCuda_UnpackBorder(cuda_shared_data* sdata,int n,int first,void* buf_recv); +extern "C" int Cuda_AtomVecAtomicCuda_UnpackBorderVel(cuda_shared_data* sdata,int n,int first,void* buf_recv); + +#endif /*ATOM_VEC_ATOMIC2_CUDA_CU_H_*/ diff --git a/lib/cuda/atom_vec_charge_cuda.cu b/lib/cuda/atom_vec_charge_cuda.cu new file mode 100644 index 0000000000..a78ffb9de0 --- /dev/null +++ b/lib/cuda/atom_vec_charge_cuda.cu @@ -0,0 +1,85 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ + +const unsigned int CHARGE_DATA_MASK=X_MASK|V_MASK|F_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|IMAGE_MASK|Q_MASK; + +#include "atom_vec_charge_cuda_cu.h" + +void Cuda_AtomVecChargeCuda_Init(cuda_shared_data* sdata) +{ + return Cuda_AtomVecCuda_Init(sdata); +} + +int Cuda_AtomVecChargeCuda_PackExchangeList(cuda_shared_data* sdata,int n,int dim,void* buf_send) +{ + const unsigned int data_mask=X_MASK|V_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|IMAGE_MASK|Q_MASK; + return Cuda_AtomVecCuda_PackExchangeList(sdata,n,dim,buf_send); +} + +int Cuda_AtomVecChargeCuda_PackExchange(cuda_shared_data* sdata,int nsend,void* buf_send,void* copylist) +{ + const unsigned int data_mask=X_MASK|V_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|IMAGE_MASK|Q_MASK; + return Cuda_AtomVecCuda_PackExchange(sdata,nsend,buf_send,copylist); +} + +int Cuda_AtomVecChargeCuda_UnpackExchange(cuda_shared_data* sdata,int nsend,void* buf_send,void* copylist) +{ + const unsigned int data_mask=X_MASK|V_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|IMAGE_MASK|Q_MASK; + return Cuda_AtomVecCuda_UnpackExchange(sdata,nsend,buf_send,copylist); +} + +int Cuda_AtomVecChargeCuda_PackBorder(cuda_shared_data* sdata,int nsend,int iswap,void* buf_send,int* pbc,int pbc_flag) +{ + const unsigned int data_mask=X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|Q_MASK; + return Cuda_AtomVecCuda_PackBorder(sdata,nsend,iswap,buf_send,pbc,pbc_flag); +} + +int Cuda_AtomVecChargeCuda_PackBorderVel(cuda_shared_data* sdata,int nsend,int iswap,void* buf_send,int* pbc,int pbc_flag) +{ + const unsigned int data_mask=X_MASK|V_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|Q_MASK; + return Cuda_AtomVecCuda_PackBorder(sdata,nsend,iswap,buf_send,pbc,pbc_flag); +} + +int Cuda_AtomVecChargeCuda_PackBorder_Self(cuda_shared_data* sdata,int n,int iswap,int first,int* pbc,int pbc_flag) +{ + const unsigned int data_mask=X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|Q_MASK; + return 
Cuda_AtomVecCuda_PackBorder_Self(sdata,n,iswap,first,pbc,pbc_flag); +} + +int Cuda_AtomVecChargeCuda_PackBorderVel_Self(cuda_shared_data* sdata,int n,int iswap,int first,int* pbc,int pbc_flag) +{ + const unsigned int data_mask=X_MASK|V_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|Q_MASK; + return Cuda_AtomVecCuda_PackBorder_Self(sdata,n,iswap,first,pbc,pbc_flag); +} + +int Cuda_AtomVecChargeCuda_UnpackBorder(cuda_shared_data* sdata,int n,int first,void* buf_recv) +{ + const unsigned int data_mask=X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|Q_MASK; + return Cuda_AtomVecCuda_UnpackBorder(sdata,n,first,buf_recv); +} + +int Cuda_AtomVecChargeCuda_UnpackBorderVel(cuda_shared_data* sdata,int n,int first,void* buf_recv) +{ + const unsigned int data_mask=X_MASK|V_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|Q_MASK; + return Cuda_AtomVecCuda_UnpackBorder(sdata,n,first,buf_recv); +} diff --git a/lib/cuda/atom_vec_charge_cuda_cu.h b/lib/cuda/atom_vec_charge_cuda_cu.h new file mode 100644 index 0000000000..137b001847 --- /dev/null +++ b/lib/cuda/atom_vec_charge_cuda_cu.h @@ -0,0 +1,15 @@ +#ifndef ATOM_VEC_CHARGE_CUDA_CU_H_ +#define ATOM_VEC_CHARGE_CUDA_CU_H_ + +extern "C" void Cuda_AtomVecChargeCuda_Init(cuda_shared_data* sdata); +extern "C" int Cuda_AtomVecChargeCuda_PackExchangeList(cuda_shared_data* sdata,int n,int dim,void* buf_send); +extern "C" int Cuda_AtomVecChargeCuda_PackExchange(cuda_shared_data* sdata,int nsend,void* buf_send,void* copylist); +extern "C" int Cuda_AtomVecChargeCuda_UnpackExchange(cuda_shared_data* sdata,int nsend,void* buf_send,void* copylist); +extern "C" int Cuda_AtomVecChargeCuda_PackBorder(cuda_shared_data* sdata,int nsend,int iswap,void* buf_send,int* pbc,int pbc_flag); +extern "C" int Cuda_AtomVecChargeCuda_PackBorderVel(cuda_shared_data* sdata,int n,int iswap,void* buf_send,int* pbc,int pbc_flag); +extern "C" int Cuda_AtomVecChargeCuda_PackBorder_Self(cuda_shared_data* sdata,int n,int iswap,int first,int* pbc,int pbc_flag); +extern "C" int 
Cuda_AtomVecChargeCuda_PackBorderVel_Self(cuda_shared_data* sdata,int n,int iswap,int first,int* pbc,int pbc_flag); +extern "C" int Cuda_AtomVecChargeCuda_UnpackBorder(cuda_shared_data* sdata,int n,int first,void* buf_recv); +extern "C" int Cuda_AtomVecChargeCuda_UnpackBorderVel(cuda_shared_data* sdata,int n,int first,void* buf_recv); + +#endif /*ATOM_VEC_CHARGE2_CUDA_CU_H_*/ diff --git a/lib/cuda/atom_vec_cuda.cu b/lib/cuda/atom_vec_cuda.cu new file mode 100644 index 0000000000..187718dc36 --- /dev/null +++ b/lib/cuda/atom_vec_cuda.cu @@ -0,0 +1,553 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ + +#include +#define MY_PREFIX atom_vec_cuda +#include "cuda_shared.h" +#include "cuda_common.h" +#include "cuda_wrapper_cu.h" +#include "crm_cuda_utils.cu" + +#include "atom_vec_cuda_kernel.cu" + +int AtomVecCuda_CountDataItems(unsigned int data_mask) +{ + int n=0; + if(data_mask & X_MASK) n+=3; + if(data_mask & V_MASK) n+=3; + if(data_mask & F_MASK) n+=3; + if(data_mask & TAG_MASK) n++; + if(data_mask & TYPE_MASK) n++; + if(data_mask & MASK_MASK) n++; + if(data_mask & IMAGE_MASK) n++; + if(data_mask & Q_MASK) n++; + if(data_mask & MOLECULE_MASK) n++; + if(data_mask & RMASS_MASK) n++; + if(data_mask & RADIUS_MASK) n++; + if(data_mask & DENSITY_MASK) n++; + if(data_mask & OMEGA_MASK) n+=3; + if(data_mask & TORQUE_MASK) n++; + + //if(data_mask & NSPECIAL_MASK) n+=3; + return n; +} + +void Cuda_AtomVecCuda_UpdateBuffer(cuda_shared_data* sdata,int size) +{ + if(sdata->buffersizebuffer,sdata->buffersize);) + CudaWrapper_FreeCudaData(sdata->buffer,sdata->buffersize); + sdata->buffer = CudaWrapper_AllocCudaData(size); + sdata->buffersize=size; + sdata->buffer_new++; + MYDBG(printf("New buffer at %p with %i kB\n",sdata->buffer,sdata->buffersize);) + } + cudaMemcpyToSymbol(MY_CONST(buffer), & sdata->buffer, sizeof(int*) ); +} + +template +void Cuda_AtomVecCuda_UpdateNmax(cuda_shared_data* sdata) +{ + cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); + cudaMemcpyToSymbol(MY_CONST(nmax) , & sdata->atom.nmax , sizeof(int) ); + cudaMemcpyToSymbol(MY_CONST(x) , & sdata->atom.x .dev_data, sizeof(X_FLOAT*) ); + cudaMemcpyToSymbol(MY_CONST(v) , & sdata->atom.v .dev_data, sizeof(V_FLOAT*) ); + cudaMemcpyToSymbol(MY_CONST(f) , & sdata->atom.f .dev_data, sizeof(F_FLOAT*) ); + cudaMemcpyToSymbol(MY_CONST(tag) , & sdata->atom.tag .dev_data, sizeof(int*) ); + cudaMemcpyToSymbol(MY_CONST(type) , & sdata->atom.type .dev_data, sizeof(int*) ); + cudaMemcpyToSymbol(MY_CONST(mask) , & 
sdata->atom.mask .dev_data, sizeof(int*) ); + cudaMemcpyToSymbol(MY_CONST(image) , & sdata->atom.image.dev_data, sizeof(int*) ); + if(data_mask & Q_MASK) cudaMemcpyToSymbol(MY_CONST(q) , & sdata->atom.q .dev_data, sizeof(F_FLOAT*) ); + if(data_mask & MOLECULE_MASK) cudaMemcpyToSymbol(MY_CONST(molecule) , & sdata->atom.molecule.dev_data, sizeof(int*) ); + if(data_mask & RADIUS_MASK) cudaMemcpyToSymbol(MY_CONST(radius) , & sdata->atom.radius.dev_data, sizeof(int*) ); + if(data_mask & DENSITY_MASK) cudaMemcpyToSymbol(MY_CONST(density) , & sdata->atom.density.dev_data, sizeof(int*) ); + if(data_mask & RMASS_MASK) cudaMemcpyToSymbol(MY_CONST(rmass) , & sdata->atom.rmass.dev_data, sizeof(int*) ); + if(data_mask & OMEGA_MASK) cudaMemcpyToSymbol(MY_CONST(omega) , & sdata->atom.omega.dev_data, sizeof(int*) ); + //if(data_mask & NSPECIAL_MASK) cudaMemcpyToSymbol(MY_CONST(nspecial) , & sdata->atom.nspecial.dev_data, sizeof(int*) ); + cudaMemcpyToSymbol(MY_CONST(flag) , & sdata->flag, sizeof(int*) ); +} + +template +void Cuda_AtomVecCuda_Init(cuda_shared_data* sdata) +{ + MYDBG( printf("# CUDA: Cuda_AtomVecCuda_Init ... start\n"); ) + Cuda_AtomVecCuda_UpdateNmax(sdata); + MYDBG( printf("# CUDA: Cuda_AtomVecCuda_Init ... post Nmax\n"); ) + cudaMemcpyToSymbol(MY_CONST(prd) , sdata->domain.prd, 3*sizeof(X_FLOAT)); + cudaMemcpyToSymbol(MY_CONST(sublo) , & sdata->domain.sublo, 3*sizeof(X_FLOAT) ); + cudaMemcpyToSymbol(MY_CONST(subhi) , & sdata->domain.subhi, 3*sizeof(X_FLOAT) ); + cudaMemcpyToSymbol(MY_CONST(flag) , & sdata->flag, sizeof(int*) ); + MYDBG( printf("# CUDA: Cuda_AtomVecCuda_Init ... 
end\n"); ) +} + + +template +int Cuda_AtomVecCuda_PackComm(cuda_shared_data* sdata,int n,int iswap,void* buf_send,int* pbc,int pbc_flag) +{ + + timespec time1,time2; + if(sdata->atom.update_nmax) + Cuda_AtomVecCuda_UpdateNmax(sdata); + if(sdata->atom.update_nlocal) + cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); + int n_data_items=AtomVecCuda_CountDataItems(data_mask); + int size=(n*n_data_items)*sizeof(X_FLOAT); + if(sdata->buffer_new or (size>sdata->buffersize)) + Cuda_AtomVecCuda_UpdateBuffer(sdata,size); + + X_FLOAT dx=0.0; + X_FLOAT dy=0.0; + X_FLOAT dz=0.0; + if (pbc_flag != 0) { + if (sdata->domain.triclinic == 0) { + dx = pbc[0]*sdata->domain.prd[0]; + dy = pbc[1]*sdata->domain.prd[1]; + dz = pbc[2]*sdata->domain.prd[2]; + } else { + dx = pbc[0]*sdata->domain.prd[0] + pbc[5]*sdata->domain.xy + pbc[4]*sdata->domain.xz; + dy = pbc[1]*sdata->domain.prd[1] + pbc[3]*sdata->domain.yz; + dz = pbc[2]*sdata->domain.prd[2]; + }} + + int3 layout=getgrid(n); + dim3 threads(layout.z, 1, 1); + dim3 grid(layout.x, layout.y, 1); + + if(sdata->atom.nlocal>0) + { + cudaMemset( sdata->flag,0,sizeof(int)); + +clock_gettime(CLOCK_REALTIME,&time1); + + void* buf=sdata->overlap_comm?sdata->comm.buf_send_dev[iswap]:sdata->buffer; + Cuda_AtomVecCuda_PackComm_Kernel<<>>((int*) sdata->comm.sendlist.dev_data,n + ,sdata->comm.maxlistlength,iswap,dx,dy,dz,buf); + cudaThreadSynchronize(); + +clock_gettime(CLOCK_REALTIME,&time2); +sdata->cuda_timings.comm_forward_kernel_pack+= + time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000; + + CUT_CHECK_ERROR("Cuda_AtomVecCuda_PackComm: Kernel execution failed"); + if(not sdata->overlap_comm) + cudaMemcpy(buf_send, sdata->buffer, n*n_data_items*sizeof(X_FLOAT), cudaMemcpyDeviceToHost); + //cudaMemcpy(buf_send, sdata->comm.buf_send_dev[iswap], n*3*sizeof(X_FLOAT), cudaMemcpyDeviceToHost); + +clock_gettime(CLOCK_REALTIME,&time1); +sdata->cuda_timings.comm_forward_download+= + 
time1.tv_sec-time2.tv_sec+1.0*(time1.tv_nsec-time2.tv_nsec)/1000000000; + + int aflag; + cudaMemcpy(&aflag, sdata->flag, sizeof(int), cudaMemcpyDeviceToHost); + if(aflag!=0) printf("aflag PackComm: %i\n",aflag); + CUT_CHECK_ERROR("Cuda_AtomVecCuda_PackComm: Kernel execution failed"); + + } + return n_data_items*n; +} + + +template +int Cuda_AtomVecCuda_PackComm_Self(cuda_shared_data* sdata,int n,int iswap,int first,int* pbc,int pbc_flag) +{ + MYDBG(printf(" # CUDA: AtomVecCuda_PackComm_Self\n");) + timespec time1,time2; + if(sdata->atom.update_nmax) + Cuda_AtomVecCuda_UpdateNmax(sdata); + if(sdata->atom.update_nlocal) + cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); + int n_data_items=AtomVecCuda_CountDataItems(data_mask); + int size=(n*n_data_items)*sizeof(X_FLOAT); + if(sdata->buffer_new or (size>sdata->buffersize)) + Cuda_AtomVecCuda_UpdateBuffer(sdata,size); + static int count=-1; + count++; + X_FLOAT dx=0.0; + X_FLOAT dy=0.0; + X_FLOAT dz=0.0; + if (pbc_flag != 0) { + if (sdata->domain.triclinic == 0) { + dx = pbc[0]*sdata->domain.prd[0]; + dy = pbc[1]*sdata->domain.prd[1]; + dz = pbc[2]*sdata->domain.prd[2]; + } else { + dx = pbc[0]*sdata->domain.prd[0] + pbc[5]*sdata->domain.xy + pbc[4]*sdata->domain.xz; + dy = pbc[1]*sdata->domain.prd[1] + pbc[3]*sdata->domain.yz; + dz = pbc[2]*sdata->domain.prd[2]; + }} + + + + int3 layout=getgrid(n); + dim3 threads(layout.z, 1, 1); + dim3 grid(layout.x, layout.y, 1); + if(sdata->atom.nlocal>0) + { + +clock_gettime(CLOCK_REALTIME,&time1); + CUT_CHECK_ERROR("Cuda_AtomVecCuda_PackComm_Self:Pre Kernel execution failed"); + + Cuda_AtomVecCuda_PackComm_Self_Kernel<<>>((int*) sdata->comm.sendlist.dev_data,n,sdata->comm.maxlistlength,iswap,dx,dy,dz,first); + cudaThreadSynchronize(); + +clock_gettime(CLOCK_REALTIME,&time2); +sdata->cuda_timings.comm_forward_kernel_self+= + time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000; + + CUT_CHECK_ERROR("Cuda_AtomVecCuda_PackComm_Self: Kernel 
execution failed"); + } + + return n_data_items*n; +} + + +template +void Cuda_AtomVecCuda_UnpackComm(cuda_shared_data* sdata,int n,int first,void* buf_recv,int iswap) +{ + timespec time1,time2; + + if(sdata->atom.update_nmax) + Cuda_AtomVecCuda_UpdateNmax(sdata); + if(sdata->atom.update_nlocal) + cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); + int n_data_items=AtomVecCuda_CountDataItems(data_mask); + int size=(n*n_data_items)*sizeof(X_FLOAT); + if(sdata->buffer_new or (size>sdata->buffersize)) + Cuda_AtomVecCuda_UpdateBuffer(sdata,size); + + int3 layout=getgrid(n); + dim3 threads(layout.z, 1, 1); + dim3 grid(layout.x, layout.y, 1); + if(sdata->atom.nlocal>0) + { +clock_gettime(CLOCK_REALTIME,&time1); + if(not sdata->overlap_comm||iswap<0) + cudaMemcpy(sdata->buffer,(void*)buf_recv, n_data_items*n*sizeof(X_FLOAT), cudaMemcpyHostToDevice); + +clock_gettime(CLOCK_REALTIME,&time2); +sdata->cuda_timings.comm_forward_upload+= + time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000; + void* buf=(sdata->overlap_comm&&iswap>=0)?sdata->comm.buf_recv_dev[iswap]:sdata->buffer; + Cuda_AtomVecCuda_UnpackComm_Kernel<<>>(n,first,buf); + cudaThreadSynchronize(); + +clock_gettime(CLOCK_REALTIME,&time1); +sdata->cuda_timings.comm_forward_kernel_unpack+= + time1.tv_sec-time2.tv_sec+1.0*(time1.tv_nsec-time2.tv_nsec)/1000000000; + + CUT_CHECK_ERROR("Cuda_AtomVecCuda_UnpackComm: Kernel execution failed"); + + } +} + +template +int Cuda_AtomVecCuda_PackExchangeList(cuda_shared_data* sdata,int n,int dim,void* buf_send) +{ + MYDBG( printf("# CUDA: Cuda_AtomVecCuda_PackExchangeList ... 
start dim %i \n",dim); ) + cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); + Cuda_AtomVecCuda_Init(sdata); + int size=n*sizeof(double); + if(sdata->buffer_new or (size>sdata->buffersize)) + Cuda_AtomVecCuda_UpdateBuffer(sdata,size); + + cudaMemset((int*) (sdata->buffer),0,sizeof(int)); + + int3 layout=getgrid(sdata->atom.nlocal,sizeof(int),256,true); + dim3 threads(layout.z, 1, 1); + dim3 grid(layout.x, layout.y, 1); + + timespec time1,time2; + clock_gettime(CLOCK_REALTIME,&time1); + + Cuda_AtomVecCuda_PackExchangeList_Kernel<<>>(n-1,dim); + cudaThreadSynchronize(); + CUT_CHECK_ERROR("Cuda_AtomVecCuda_PackExchangeList: Kernel execution failed"); + + clock_gettime(CLOCK_REALTIME,&time2); + sdata->cuda_timings.comm_exchange_kernel_pack+= + time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000; + + cudaMemcpy(buf_send, sdata->buffer, sizeof(double), cudaMemcpyDeviceToHost); + int return_value = ((int*) buf_send)[0]; + cudaMemcpy(buf_send, sdata->buffer, (1+return_value)*sizeof(double), cudaMemcpyDeviceToHost); + +clock_gettime(CLOCK_REALTIME,&time1); +sdata->cuda_timings.comm_exchange_download+= + time1.tv_sec-time2.tv_sec+1.0*(time1.tv_nsec-time2.tv_nsec)/1000000000; + + MYDBG( printf("# CUDA: Cuda_AtomVecCuda_PackExchangeList ... done\n"); ) + return return_value; +} + +template +int Cuda_AtomVecCuda_PackExchange(cuda_shared_data* sdata,int nsend,void* buf_send,void* copylist) +{ + MYDBG( printf("# CUDA: Cuda_AtomVecCuda_PackExchange ... 
start \n"); ) + Cuda_AtomVecCuda_UpdateNmax(sdata); + cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); + + int n_data_items=AtomVecCuda_CountDataItems(data_mask)+1; + int size=(nsend*n_data_items+1)*sizeof(double); + if(sdata->buffer_new or (size>sdata->buffersize)) + Cuda_AtomVecCuda_UpdateBuffer(sdata,size); + + cudaMemset((int*) (sdata->buffer),0,sizeof(int)); + + int3 layout=getgrid(nsend,0); + dim3 threads(layout.z, 1, 1); + dim3 grid(layout.x, layout.y, 1); + + timespec time1,time2; + clock_gettime(CLOCK_REALTIME,&time1); + + Cuda_AtomVecCuda_PackExchange_Kernel<<>>(nsend,(int*) copylist); + cudaThreadSynchronize(); + CUT_CHECK_ERROR("Cuda_AtomVecCuda_PackExchangeList: Kernel execution failed"); + + clock_gettime(CLOCK_REALTIME,&time2); + sdata->cuda_timings.comm_exchange_kernel_pack+= + time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000; + + cudaMemcpy(buf_send, sdata->buffer, size, cudaMemcpyDeviceToHost); + + clock_gettime(CLOCK_REALTIME,&time1); + sdata->cuda_timings.comm_exchange_download+= + time1.tv_sec-time2.tv_sec+1.0*(time1.tv_nsec-time2.tv_nsec)/1000000000; + + MYDBG( printf("# CUDA: Cuda_AtomVecCuda_PackExchangeList ... 
done\n"); ) + return nsend*n_data_items+1; +} + + +template +int Cuda_AtomVecCuda_UnpackExchange(cuda_shared_data* sdata,int nsend,void* buf_send,void* copylist) +{ + Cuda_AtomVecCuda_UpdateNmax(sdata); + cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); + int n_data_items=AtomVecCuda_CountDataItems(data_mask)+1; + + int size=(nsend*n_data_items+1)*sizeof(double); + if(sdata->buffer_new or (size>sdata->buffersize)) + Cuda_AtomVecCuda_UpdateBuffer(sdata,size); + cudaMemcpyToSymbol(MY_CONST(flag) , & sdata->flag, sizeof(int*) ); + + cudaMemset((int*) (sdata->flag),0,sizeof(int)); + if(nsend) + { + int3 layout=getgrid(nsend,0); + dim3 threads(layout.z, 1, 1); + dim3 grid(layout.x, layout.y, 1); + if(sdata->atom.nlocal>0) + { + timespec time1,time2; + clock_gettime(CLOCK_REALTIME,&time1); + + cudaMemcpy(sdata->buffer,buf_send , size, cudaMemcpyHostToDevice); + + clock_gettime(CLOCK_REALTIME,&time2); + sdata->cuda_timings.comm_exchange_upload+= + time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000; + + Cuda_AtomVecCuda_UnpackExchange_Kernel<<>>(sdata->exchange_dim,nsend,(int*) copylist); + cudaThreadSynchronize(); + + clock_gettime(CLOCK_REALTIME,&time1); + sdata->cuda_timings.comm_exchange_kernel_unpack+= + time1.tv_sec-time2.tv_sec+1.0*(time1.tv_nsec-time2.tv_nsec)/1000000000; + + CUT_CHECK_ERROR("Cuda_AtomVecCuda_UnpackExchange: Kernel execution failed"); + } + } + int naccept; + cudaMemcpy((void*)&naccept, sdata->flag, sizeof(int), cudaMemcpyDeviceToHost); + + return naccept; +} + +template +int Cuda_AtomVecCuda_PackBorder(cuda_shared_data* sdata,int nsend,int iswap,void* buf_send,int* pbc,int pbc_flag) +{ + timespec atime1,atime2; + clock_gettime(CLOCK_REALTIME,&atime1); + if(sdata->atom.update_nmax) + Cuda_AtomVecCuda_UpdateNmax(sdata); + + cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); + clock_gettime(CLOCK_REALTIME,&atime2); + sdata->cuda_timings.test1+= + 
atime2.tv_sec-atime1.tv_sec+1.0*(atime2.tv_nsec-atime1.tv_nsec)/1000000000; + + int n_data_items=AtomVecCuda_CountDataItems(data_mask); + + int size=nsend*n_data_items*sizeof(X_FLOAT); + if(sdata->buffer_new or (size>sdata->buffersize)) + Cuda_AtomVecCuda_UpdateBuffer(sdata,size); + + X_FLOAT dx=0.0; + X_FLOAT dy=0.0; + X_FLOAT dz=0.0; + if (pbc_flag != 0) { + if (sdata->domain.triclinic == 0) { + dx = pbc[0]*sdata->domain.prd[0]; + dy = pbc[1]*sdata->domain.prd[1]; + dz = pbc[2]*sdata->domain.prd[2]; + } else { + dx = pbc[0]; + dy = pbc[1]; + dz = pbc[2]; + }} + + int3 layout=getgrid(nsend); + dim3 threads(layout.z, 1, 1); + dim3 grid(layout.x, layout.y, 1); + + if(sdata->atom.nlocal>0) + { + timespec time1,time2; + clock_gettime(CLOCK_REALTIME,&time1); + + Cuda_AtomVecCuda_PackBorder_Kernel<<>>((int*) sdata->comm.sendlist.dev_data,nsend,sdata->comm.maxlistlength,iswap,dx,dy,dz); + cudaThreadSynchronize(); + + clock_gettime(CLOCK_REALTIME,&time2); + sdata->cuda_timings.comm_border_kernel_pack+= + time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000; + + cudaMemcpy(buf_send, sdata->buffer, size, cudaMemcpyDeviceToHost); + CUT_CHECK_ERROR("Cuda_AtomVecCuda_PackBorder: Kernel execution failed"); + + clock_gettime(CLOCK_REALTIME,&time1); + sdata->cuda_timings.comm_border_download+= + time1.tv_sec-time2.tv_sec+1.0*(time1.tv_nsec-time2.tv_nsec)/1000000000; + + } + return nsend*n_data_items; +} + +template +int Cuda_AtomVecCuda_PackBorder_Self(cuda_shared_data* sdata,int n,int iswap,int first,int* pbc,int pbc_flag) +{ + if(sdata->atom.update_nmax) + Cuda_AtomVecCuda_UpdateNmax(sdata); + + cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); + + int n_data_items=AtomVecCuda_CountDataItems(data_mask); + + int size=n*n_data_items*sizeof(X_FLOAT); + if(sdata->buffer_new or (size>sdata->buffersize)) + Cuda_AtomVecCuda_UpdateBuffer(sdata,size); + + X_FLOAT dx=0.0; + X_FLOAT dy=0.0; + X_FLOAT dz=0.0; + if (pbc_flag != 0) { + if 
(sdata->domain.triclinic == 0) { + dx = pbc[0]*sdata->domain.prd[0]; + dy = pbc[1]*sdata->domain.prd[1]; + dz = pbc[2]*sdata->domain.prd[2]; + } else { + dx = pbc[0]; + dy = pbc[1]; + dz = pbc[2]; + }} + + int3 layout=getgrid(n); + dim3 threads(layout.z, 1, 1); + dim3 grid(layout.x, layout.y, 1); + if(sdata->atom.nlocal>0) + { + timespec time1,time2; + clock_gettime(CLOCK_REALTIME,&time1); + + Cuda_AtomVecCuda_PackBorder_Self_Kernel<<>>((int*) sdata->comm.sendlist.dev_data,n,sdata->comm.maxlistlength,iswap,dx,dy,dz,first); + cudaThreadSynchronize(); + + clock_gettime(CLOCK_REALTIME,&time2); + sdata->cuda_timings.comm_border_kernel_self+= + time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000; + + CUT_CHECK_ERROR("Cuda_AtomVecCuda_PackBorder_Self: Kernel execution failed"); + + } + return n*n_data_items; +} + + +template +int Cuda_AtomVecCuda_UnpackBorder(cuda_shared_data* sdata,int n,int first,void* buf_recv) +{ + timespec atime1,atime2; + clock_gettime(CLOCK_REALTIME,&atime1); + if(sdata->atom.update_nmax) + Cuda_AtomVecCuda_UpdateNmax(sdata); + + //if(sdata->atom.update_nlocal) + cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); + clock_gettime(CLOCK_REALTIME,&atime2); + sdata->cuda_timings.test1+= + atime2.tv_sec-atime1.tv_sec+1.0*(atime2.tv_nsec-atime1.tv_nsec)/1000000000; + + int n_data_items=AtomVecCuda_CountDataItems(data_mask); + + int size=n*n_data_items*sizeof(X_FLOAT); + if(sdata->buffer_new or (size>sdata->buffersize)) + Cuda_AtomVecCuda_UpdateBuffer(sdata,size); + + int3 layout=getgrid(n); + dim3 threads(layout.z, 1, 1); + dim3 grid(layout.x, layout.y, 1); + if(sdata->atom.nlocal>0) + { + timespec time1,time2; + clock_gettime(CLOCK_REALTIME,&time1); + + cudaMemset((int*) (sdata->flag),0,sizeof(int)); + cudaMemcpy(sdata->buffer,(void*)buf_recv, size, cudaMemcpyHostToDevice); + + clock_gettime(CLOCK_REALTIME,&time2); + sdata->cuda_timings.comm_border_upload+= + 
time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000; + + Cuda_AtomVecCuda_UnpackBorder_Kernel<<>>(n,first); + cudaThreadSynchronize(); + + clock_gettime(CLOCK_REALTIME,&time1); + sdata->cuda_timings.comm_border_kernel_unpack+= + time1.tv_sec-time2.tv_sec+1.0*(time1.tv_nsec-time2.tv_nsec)/1000000000; + + cudaMemcpy(&sdata->comm.grow_flag,sdata->flag, sizeof(int), cudaMemcpyDeviceToHost); + + CUT_CHECK_ERROR("Cuda_AtomVecCuda_UnpackBorder: Kernel execution failed"); + + } + return sdata->comm.grow_flag; +} + + +#include "atom_vec_angle_cuda.cu" +#include "atom_vec_atomic_cuda.cu" +#include "atom_vec_charge_cuda.cu" +#include "atom_vec_full_cuda.cu" +//#include "atom_vec_granular_cuda.cu" diff --git a/lib/cuda/atom_vec_cuda_cu.h b/lib/cuda/atom_vec_cuda_cu.h new file mode 100644 index 0000000000..e69de29bb2 diff --git a/lib/cuda/atom_vec_cuda_kernel.cu b/lib/cuda/atom_vec_cuda_kernel.cu new file mode 100644 index 0000000000..0ec079d45b --- /dev/null +++ b/lib/cuda/atom_vec_cuda_kernel.cu @@ -0,0 +1,371 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ +#define RIMLARGER 1.000001 +#define RIMSMALLER 0.999999 +#define SMALL 1e-5 + +extern __shared__ int shared[]; + +template +__global__ void Cuda_AtomVecCuda_PackComm_Kernel(int* sendlist,int n,int maxlistlength,int iswap,X_FLOAT dx,X_FLOAT dy,X_FLOAT dz,void* buffer) +{ + int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x; + int* list=sendlist+iswap*maxlistlength; + if(i_nmax) _flag[0]=1; + int k=0; + if(data_mask & X_MASK){ + ((X_FLOAT*) buffer)[i+k*n]=_x[j] + dx; k++; + ((X_FLOAT*) buffer)[i+k*n] = _x[j+_nmax] + dy; k++; + ((X_FLOAT*) buffer)[i+k*n] = _x[j+2*_nmax] + dz; k++;} + if(data_mask & V_MASK){ + ((X_FLOAT*) buffer)[i+k*n]=_v[j]; k++; + ((X_FLOAT*) buffer)[i+k*n] = _v[j+_nmax]; k++; + ((X_FLOAT*) buffer)[i+k*n] = _v[j+2*_nmax]; k++;} + if(data_mask & OMEGA_MASK){ + ((X_FLOAT*) buffer)[i+k*n]=_omega[j]; k++; + ((X_FLOAT*) buffer)[i+k*n] = _omega[j+_nmax]; k++; + ((X_FLOAT*) buffer)[i+k*n] = _omega[j+2*_nmax]; k++;} + if(data_mask & RADIUS_MASK) ((X_FLOAT*) buffer)[i+k*n]=_radius[j]; k++; + if(data_mask & RMASS_MASK) ((X_FLOAT*) buffer)[i+k*n]=_rmass[j]; k++; + } +} + +template +__global__ void Cuda_AtomVecCuda_PackComm_Self_Kernel(int* sendlist,int n,int maxlistlength,int iswap,X_FLOAT dx,X_FLOAT dy,X_FLOAT dz,int first) +{ + int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x; + + int* list=sendlist+iswap*maxlistlength; + if(i +__global__ void Cuda_AtomVecCuda_UnpackComm_Kernel(int n,int first,void* buffer) +{ + int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x; + if(i (_x[i+dim*_nmax]); + if (xdim_tmp < _sublo[dim] || xdim_tmp >= _subhi[dim]) + { + add=true; + } + } + shared[threadIdx.x]=add?1:0; + __syncthreads(); + int nsend=0; + if(threadIdx.x==0) + { + for(int k=0;k +__global__ void Cuda_AtomVecCuda_PackExchange_Kernel(int nsend, int* copylist) +{ + double* buf=(double*) _buffer; + int 
k=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x; + if(k>=nsend) return; + buf=&buf[1+k]; + + int i=static_cast (buf[0]); + int j=copylist[k]; + + int m=1; + if(data_mask & X_MASK){ + buf[(m++)*nsend] = static_cast (_x[i]); + buf[(m++)*nsend] = static_cast (_x[i+_nmax]); + buf[(m++)*nsend] = static_cast (_x[i+2*_nmax]);} + if(data_mask & V_MASK){ + buf[(m++)*nsend] = _v[i]; + buf[(m++)*nsend] = _v[i+_nmax]; + buf[(m++)*nsend] = _v[i+2*_nmax];} + if(data_mask & TAG_MASK) buf[(m++)*nsend] = _tag[i]; + if(data_mask & TYPE_MASK) buf[(m++)*nsend] = _type[i]; + if(data_mask & MASK_MASK) buf[(m++)*nsend] = _mask[i]; + if(data_mask & IMAGE_MASK) buf[(m++)*nsend] = _image[i]; + if(data_mask & Q_MASK) buf[(m++)*nsend] = _q[i]; + if(data_mask & MOLECULE_MASK) buf[(m++)*nsend] = _molecule[i]; + if(data_mask & RADIUS_MASK) buf[(m++)*nsend] = _radius[i]; + if(data_mask & DENSITY_MASK) buf[(m++)*nsend] = _density[i]; + if(data_mask & RMASS_MASK) buf[(m++)*nsend] = _rmass[i]; + if(data_mask & OMEGA_MASK) { + buf[(m++)*nsend] = _omega[i]; + buf[(m++)*nsend] = _omega[i+_nmax]; + buf[(m++)*nsend] = _omega[i+2*_nmax];} + +/* if(data_mask & NSPECIAL_MASK) + { + buf[(m++)*nsend] = _nspecial[i]; + buf[(m++)*nsend] = _nspecial[i+_nmax]; + buf[(m++)*nsend] = _nspecial[i+2* _nmax]; + }*/ + + if(i>=_nlocal) return; + if(data_mask & X_MASK){ + _x[i] = _x[j]; + _x[i+_nmax] = _x[j+_nmax]; + _x[i+2*_nmax] = _x[j+2*_nmax];} + if(data_mask & V_MASK){ + _v[i] = _v[j]; + _v[i+_nmax] = _v[j+_nmax]; + _v[i+2*_nmax] = _v[j+2*_nmax];} + if(data_mask & TAG_MASK) _tag[i] = _tag[j]; + if(data_mask & TYPE_MASK) _type[i] = _type[j]; + if(data_mask & MASK_MASK) _mask[i] = _mask[j]; + if(data_mask & IMAGE_MASK) _image[i] = _image[j]; + + if(data_mask & Q_MASK) _q[i] = _q[j]; + if(data_mask & MOLECULE_MASK) _molecule[i]= _molecule[j]; + if(data_mask & RADIUS_MASK) _radius[i] = _radius[j]; + if(data_mask & DENSITY_MASK) _density[i] = _density[j]; + if(data_mask & RMASS_MASK) _rmass[i] = _rmass[j]; + 
if(data_mask & OMEGA_MASK) + { + _omega[i] = _omega[j]; + _omega[i+_nmax] = _omega[j+_nmax]; + _omega[i+2*_nmax] = _omega[j+2*_nmax]; + } + /* if(data_mask & NSPECIAL_MASK) + { + _nspecial[i] = _nspecial[j]; + _nspecial[i+_nmax] = _nspecial[j+_nmax]; + _nspecial[i+2* _nmax] = _nspecial[j+2* _nmax]; + }*/ +} + +template +__global__ void Cuda_AtomVecCuda_UnpackExchange_Kernel(int dim,int nsend,int* copylist) +{ + double* buf=(double*) _buffer; + int k=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x; + if(k>=nsend) return; + buf=&buf[1+k]; + int i=-1; + double xdim_tmp = buf[(1+dim)*nsend]; + if(xdim_tmp>=_sublo[dim]-SMALL && xdim_tmp<_subhi[dim]+SMALL) + { + i=atomicAdd(_flag,1)+_nlocal; + + int m=1; + if(data_mask & X_MASK){ + _x[i] = buf[(m++)*nsend]; + _x[i+_nmax] = buf[(m++)*nsend]; + _x[i+2*_nmax] = buf[(m++)*nsend];} + if(data_mask & V_MASK){ + _v[i] = buf[(m++)*nsend]; + _v[i+_nmax] = buf[(m++)*nsend]; + _v[i+2*_nmax] = buf[(m++)*nsend];} + if(data_mask & TAG_MASK) _tag[i] = buf[(m++)*nsend]; + if(data_mask & TYPE_MASK) _type[i] = buf[(m++)*nsend]; + if(data_mask & MASK_MASK) _mask[i] = buf[(m++)*nsend]; + if(data_mask & IMAGE_MASK) _image[i] = buf[(m++)*nsend]; + + if(data_mask & Q_MASK) _q[i] = buf[(m++)*nsend]; + if(data_mask & MOLECULE_MASK) _molecule[i] = buf[(m++)*nsend]; + if(data_mask & RADIUS_MASK) _radius[i] = buf[(m++)*nsend]; + if(data_mask & DENSITY_MASK) _density[i] = buf[(m++)*nsend]; + if(data_mask & RMASS_MASK) _rmass[i] = buf[(m++)*nsend]; + if(data_mask & OMEGA_MASK) + { + _omega[i] = buf[(m++)*nsend]; + _omega[i+_nmax] = buf[(m++)*nsend]; + _omega[i+2*_nmax] = buf[(m++)*nsend]; + } + /* if(data_mask & NSPECIAL_MASK) + { + _nspecial[i] = buf[(m++)*nsend]; + _nspecial[i+_nmax] = buf[(m++)*nsend]; + _nspecial[i+2*_nmax] = buf[(m++)*nsend]; + }*/ + } + copylist[k]=i; +} + +template +__global__ void Cuda_AtomVecCuda_PackBorder_Kernel(int* sendlist,int n,int maxlistlength,int iswap,X_FLOAT dx,X_FLOAT dy,X_FLOAT dz) +{ + int 
i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x; + int* list=sendlist+iswap*maxlistlength; + if(i +__global__ void Cuda_AtomVecCuda_PackBorder_Self_Kernel(int* sendlist,int n,int maxlistlength,int iswap,X_FLOAT dx,X_FLOAT dy,X_FLOAT dz,int first) +{ + int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x; + int* list=sendlist+iswap*maxlistlength; + if(i +__global__ void Cuda_AtomVecCuda_UnpackBorder_Kernel(int n,int first) +{ + int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x; + if(i (((X_FLOAT*) _buffer)[i+(m++)*n]); + if(data_mask & TYPE_MASK) _type[i+first] = static_cast (((X_FLOAT*) _buffer)[i+(m++)*n]); + if(data_mask & MASK_MASK) _mask[i+first] = static_cast (((X_FLOAT*) _buffer)[i+(m++)*n]); + if(data_mask & Q_MASK) _q[i+first]=((X_FLOAT*) _buffer)[i+(m++)*n]; + if(data_mask & MOLECULE_MASK) _molecule[i+first] = static_cast (((X_FLOAT*) _buffer)[i+(m++)*n]); + if(data_mask & RADIUS_MASK) _radius[i+first]=((X_FLOAT*) _buffer)[i+(m++)*n]; + if(data_mask & DENSITY_MASK) _density[i+first]=((X_FLOAT*) _buffer)[i+(m++)*n]; + if(data_mask & RMASS_MASK) _rmass[i+first]=((X_FLOAT*) _buffer)[i+(m++)*n]; + if(data_mask & OMEGA_MASK) { + _omega[i+first]=((X_FLOAT*) _buffer)[i+(m++)*n]; + _omega[i+first+_nmax]=((X_FLOAT*) _buffer)[i+(m++)*n]; + _omega[i+first+2*_nmax]=((X_FLOAT*) _buffer)[i+(m++)*n];} + } + else + { + _flag[0]=1; + } + } +} + + diff --git a/lib/cuda/atom_vec_full_cuda.cu b/lib/cuda/atom_vec_full_cuda.cu new file mode 100644 index 0000000000..a5aae11824 --- /dev/null +++ b/lib/cuda/atom_vec_full_cuda.cu @@ -0,0 +1,85 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. 
+ + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +const unsigned int FULL_DATA_MASK=X_MASK|V_MASK|F_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|IMAGE_MASK|Q_MASK|MOLECULE_MASK; + +#include "atom_vec_full_cuda_cu.h" + +void Cuda_AtomVecFullCuda_Init(cuda_shared_data* sdata) +{ + return Cuda_AtomVecCuda_Init(sdata); +} + +int Cuda_AtomVecFullCuda_PackExchangeList(cuda_shared_data* sdata,int n,int dim,void* buf_send) +{ + const unsigned int data_mask=X_MASK|V_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|IMAGE_MASK|Q_MASK|MOLECULE_MASK; + return Cuda_AtomVecCuda_PackExchangeList(sdata,n,dim,buf_send); +} + +int Cuda_AtomVecFullCuda_PackExchange(cuda_shared_data* sdata,int nsend,void* buf_send,void* copylist) +{ + const unsigned int data_mask=X_MASK|V_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|IMAGE_MASK|Q_MASK|MOLECULE_MASK; + return Cuda_AtomVecCuda_PackExchange(sdata,nsend,buf_send,copylist); +} + +int Cuda_AtomVecFullCuda_UnpackExchange(cuda_shared_data* sdata,int nsend,void* buf_send,void* copylist) +{ + const unsigned int data_mask=X_MASK|V_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|IMAGE_MASK|Q_MASK|MOLECULE_MASK; + return Cuda_AtomVecCuda_UnpackExchange(sdata,nsend,buf_send,copylist); +} + +int Cuda_AtomVecFullCuda_PackBorder(cuda_shared_data* sdata,int nsend,int iswap,void* buf_send,int* pbc,int pbc_flag) +{ + const unsigned int data_mask=X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|Q_MASK|MOLECULE_MASK; + return Cuda_AtomVecCuda_PackBorder(sdata,nsend,iswap,buf_send,pbc,pbc_flag); +} + +int 
Cuda_AtomVecFullCuda_PackBorderVel(cuda_shared_data* sdata,int nsend,int iswap,void* buf_send,int* pbc,int pbc_flag) +{ + const unsigned int data_mask=X_MASK|V_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|Q_MASK|MOLECULE_MASK; + return Cuda_AtomVecCuda_PackBorder(sdata,nsend,iswap,buf_send,pbc,pbc_flag); +} + +int Cuda_AtomVecFullCuda_PackBorder_Self(cuda_shared_data* sdata,int n,int iswap,int first,int* pbc,int pbc_flag) +{ + const unsigned int data_mask=X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|Q_MASK|MOLECULE_MASK; + return Cuda_AtomVecCuda_PackBorder_Self(sdata,n,iswap,first,pbc,pbc_flag); +} + +int Cuda_AtomVecFullCuda_PackBorderVel_Self(cuda_shared_data* sdata,int n,int iswap,int first,int* pbc,int pbc_flag) +{ + const unsigned int data_mask=X_MASK|V_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|Q_MASK|MOLECULE_MASK; + return Cuda_AtomVecCuda_PackBorder_Self(sdata,n,iswap,first,pbc,pbc_flag); +} + +int Cuda_AtomVecFullCuda_UnpackBorder(cuda_shared_data* sdata,int n,int first,void* buf_recv) +{ + const unsigned int data_mask=X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|Q_MASK|MOLECULE_MASK; + return Cuda_AtomVecCuda_UnpackBorder(sdata,n,first,buf_recv); +} + +int Cuda_AtomVecFullCuda_UnpackBorderVel(cuda_shared_data* sdata,int n,int first,void* buf_recv) +{ + const unsigned int data_mask=X_MASK|V_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|Q_MASK|MOLECULE_MASK; + return Cuda_AtomVecCuda_UnpackBorder(sdata,n,first,buf_recv); +} diff --git a/lib/cuda/atom_vec_full_cuda_cu.h b/lib/cuda/atom_vec_full_cuda_cu.h new file mode 100644 index 0000000000..6cf163ab71 --- /dev/null +++ b/lib/cuda/atom_vec_full_cuda_cu.h @@ -0,0 +1,15 @@ +#ifndef ATOM_VEC_FULL_CUDA_CU_H_ +#define ATOM_VEC_FULL_CUDA_CU_H_ + +extern "C" void Cuda_AtomVecFullCuda_Init(cuda_shared_data* sdata); +extern "C" int Cuda_AtomVecFullCuda_PackExchangeList(cuda_shared_data* sdata,int n,int dim,void* buf_send); +extern "C" int Cuda_AtomVecFullCuda_PackExchange(cuda_shared_data* sdata,int nsend,void* buf_send,void* copylist); +extern "C" int 
Cuda_AtomVecFullCuda_UnpackExchange(cuda_shared_data* sdata,int nsend,void* buf_send,void* copylist); +extern "C" int Cuda_AtomVecFullCuda_PackBorder(cuda_shared_data* sdata,int nsend,int iswap,void* buf_send,int* pbc,int pbc_flag); +extern "C" int Cuda_AtomVecFullCuda_PackBorderVel(cuda_shared_data* sdata,int n,int iswap,void* buf_send,int* pbc,int pbc_flag); +extern "C" int Cuda_AtomVecFullCuda_PackBorder_Self(cuda_shared_data* sdata,int n,int iswap,int first,int* pbc,int pbc_flag); +extern "C" int Cuda_AtomVecFullCuda_PackBorderVel_Self(cuda_shared_data* sdata,int n,int iswap,int first,int* pbc,int pbc_flag); +extern "C" int Cuda_AtomVecFullCuda_UnpackBorder(cuda_shared_data* sdata,int n,int first,void* buf_recv); +extern "C" int Cuda_AtomVecFullCuda_UnpackBorderVel(cuda_shared_data* sdata,int n,int first,void* buf_recv); + +#endif /*ATOM_VEC_FULL2_CUDA_CU_H_*/ diff --git a/lib/cuda/binning.cu b/lib/cuda/binning.cu new file mode 100644 index 0000000000..823015ff55 --- /dev/null +++ b/lib/cuda/binning.cu @@ -0,0 +1,196 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ + +#ifdef CUDA_USE_BINNING +#include +#define MY_PREFIX binning +#include "cuda_shared.h" +#include "cuda_common.h" +#include "crm_cuda_utils.cu" +#include "binning_cu.h" +#include "binning_kernel.cu" + +void Cuda_PreBinning(cuda_shared_data* sdata) +{ + // initialize only on first call + short init = 0; + if(! init) + { + init = 1; + int cuda_dummy_type = sdata->atom.ntypes + 1; + X_FLOAT outside[3] = + { + (sdata->domain.subhi[0] - sdata->domain.sublo[0])/1000.0, + (sdata->domain.subhi[1] - sdata->domain.sublo[1])/1000.0, + (sdata->domain.subhi[2] - sdata->domain.sublo[2])/1000.0 + }; + cudaMemcpyToSymbol("binned_size_all" , & sdata->atom.binned_type.dim[0] , sizeof(unsigned) ); + cudaMemcpyToSymbol("cuda_dummy_type" , & cuda_dummy_type , sizeof(int) ); + cudaMemcpyToSymbol("outside" , & outside , sizeof(X_FLOAT)*3); + cudaMemcpyToSymbol(MY_CONST(binned_type), & sdata->atom.binned_type.dev_data, sizeof(int*) ); + cudaMemcpyToSymbol(MY_CONST(binned_x) , & sdata->atom.binned_x .dev_data, sizeof(X_FLOAT*) ); + cudaMemcpyToSymbol(MY_CONST(binned_tag) , & sdata->atom.binned_tag .dev_data, sizeof(int*) ); + cudaMemcpyToSymbol(MY_CONST(subhi) , sdata->domain.subhi , sizeof(X_FLOAT)*3); + // bin_nmax == blockDim.x + + // printf("# CUDA: MY_CONST(binned_type) = %s\n", MY_CONST(binned_type)); + // int* p = pre_binning_binned_type; // pre_binning_binned_type is defined here!! 
+ } + + dim3 grid(sdata->domain.bin_dim[0], sdata->domain.bin_dim[1] * sdata->domain.bin_dim[2], 1); + dim3 threads(sdata->domain.bin_nmax, 1, 1); + + MYDBG(printf("# CUDA: Cuda_PreBinning: pre binning grid = (%u, %u, %u)\n", grid.x, grid.y, grid.z);) + MYDBG(printf("# CUDA: Cuda_PreBinning: pre binning threads = (%u, %u, %u)\n", threads.x, threads.y, threads.z); ) + PreBinning_Kernel<<>> (); + cudaThreadSynchronize(); + MYDBG(printf("ERROR-CUDA pre_binning: %s\n",cudaGetErrorString(cudaGetLastError()))); + CUT_CHECK_ERROR("Cuda_PreBinning: binning Kernel execution failed"); +} + +void Cuda_Binning(cuda_shared_data* sdata) +{ + MYDBG( // check assumption in debug mode + if(sdata->atom.x.dim[1] != 3) + { + printf("# CUDA: Cuda_Binning: binning error: atom array dimensions not Nx3\n"); + return; + } + ) + + // initialize only on first call + short init = 0; + if(! init) + { + init = 0; + X_FLOAT const_rez_bin_size[3] = + { + (1.0 * sdata->domain.bin_dim[0]-4.0) / (sdata->domain.subhi[0] - sdata->domain.sublo[0]), + (1.0 * sdata->domain.bin_dim[1]-4.0) / (sdata->domain.subhi[1] - sdata->domain.sublo[1]), + (1.0 * sdata->domain.bin_dim[2]-4.0) / (sdata->domain.subhi[2] - sdata->domain.sublo[2]) + }; + cudaMemcpyToSymbol("bin_error_count" , & sdata->atom.bin_error_count.dev_data, sizeof(X_FLOAT)*1); + cudaMemcpyToSymbol("rez_bin_size" , & const_rez_bin_size , sizeof(X_FLOAT)*3); + cudaMemcpyToSymbol(MY_CONST(bin_count_all) , & sdata->atom.bin_count_all .dev_data, sizeof(unsigned*)); + cudaMemcpyToSymbol(MY_CONST(bin_count_local), & sdata->atom.bin_count_local.dev_data, sizeof(unsigned*)); + cudaMemcpyToSymbol(MY_CONST(bin_dim) , sdata->domain.bin_dim , sizeof(int3) ); + cudaMemcpyToSymbol(MY_CONST(bin_nmax) , & sdata->domain.bin_nmax , sizeof(unsigned) ); + cudaMemcpyToSymbol(MY_CONST(binned_f) , & sdata->atom.binned_f .dev_data, sizeof(F_FLOAT*) ); + cudaMemcpyToSymbol(MY_CONST(binned_q) , & sdata->atom.binned_q .dev_data, sizeof(F_FLOAT*) ); + 
cudaMemcpyToSymbol(MY_CONST(binned_rmass) , & sdata->atom.binned_rmass .dev_data, sizeof(V_FLOAT*) ); + cudaMemcpyToSymbol(MY_CONST(binned_tag) , & sdata->atom.binned_tag .dev_data, sizeof(int*) ); + cudaMemcpyToSymbol(MY_CONST(binned_type) , & sdata->atom.binned_type .dev_data, sizeof(int*) ); + cudaMemcpyToSymbol(MY_CONST(binned_v) , & sdata->atom.binned_v .dev_data, sizeof(V_FLOAT*) ); + cudaMemcpyToSymbol(MY_CONST(binpos) , & sdata->atom.binpos .dev_data, sizeof(int*)); + cudaMemcpyToSymbol(MY_CONST(f) , & sdata->atom.f .dev_data, sizeof(F_FLOAT*) ); + cudaMemcpyToSymbol(MY_CONST(natoms) , & sdata->atom.nall , sizeof(unsigned) ); + cudaMemcpyToSymbol(MY_CONST(nghost) , & sdata->atom.nghost , sizeof(unsigned) ); + cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(unsigned) ); + cudaMemcpyToSymbol(MY_CONST(nmax) , & sdata->atom.nmax , sizeof(unsigned) ); + cudaMemcpyToSymbol(MY_CONST(q) , & sdata->atom.q .dev_data, sizeof(F_FLOAT*) ); + cudaMemcpyToSymbol(MY_CONST(rmass) , & sdata->atom.rmass .dev_data, sizeof(V_FLOAT*) ); + cudaMemcpyToSymbol(MY_CONST(sublo) , sdata->domain.sublo , sizeof(X_FLOAT)*3); + cudaMemcpyToSymbol(MY_CONST(tag) , & sdata->atom.tag .dev_data, sizeof(int*) ); + cudaMemcpyToSymbol(MY_CONST(type) , & sdata->atom.type .dev_data, sizeof(int*) ); + cudaMemcpyToSymbol(MY_CONST(v) , & sdata->atom.v .dev_data, sizeof(V_FLOAT*) ); + } + + dim3 grid((unsigned)(1 + sdata->atom.nlocal/64.0), 1, 1); + MYDBG( printf("# CUDA: Cuda_Binning: grid dim.x = %u (nlocal: %i)\n", grid.x,sdata->atom.nlocal); ) + dim3 threads(64, 1, 1); + + cudaMemset((int*) (sdata->atom.bin_count_all.dev_data),0,sizeof(int)*(sdata->domain.bin_dim[0])*(sdata->domain.bin_dim[1])*(sdata->domain.bin_dim[2])); + cudaMemset((int*) (sdata->atom.bin_count_local.dev_data),0,sizeof(int)*(sdata->domain.bin_dim[0])*(sdata->domain.bin_dim[1])*(sdata->domain.bin_dim[2])); + cudaMemset(sdata->atom.bin_error_count.dev_data,0,sizeof(int)*1); + int binning_error_l[1]; + + + 
Binning_Kernel<<>> ( + (X_FLOAT*) (sdata->atom. x.dev_data), + (X_FLOAT*) (sdata->atom.binned_x.dev_data), + sdata->atom.q_flag, + 0, + sdata->atom.rmass_flag + ); + cudaThreadSynchronize(); + cudaMemcpy((void*) binning_error_l,(void*) sdata->atom.bin_error_count.dev_data,1*sizeof(int),cudaMemcpyDeviceToHost); + if(binning_error_l[0]!=0) + { + printf("CUDA-ERROR: binning local: could not bin %i atoms\n",binning_error_l[0]); + } + CUT_CHECK_ERROR("Cuda_Binning: binning Kernel execution failed"); + + grid.x=(unsigned)(1 + (sdata->atom.nall-sdata->atom.nlocal)/32.0); + MYDBG( printf("# CUDA: Cuda_Binning Ghost: grid dim.x = %u\n", grid.x); ) + + + Binning_Kernel<<>> ( + (X_FLOAT*) (sdata->atom. x.dev_data), + (X_FLOAT*) (sdata->atom.binned_x.dev_data), + sdata->atom.q_flag, + sdata->atom.nlocal, + sdata->atom.rmass_flag + ); + cudaThreadSynchronize(); + cudaMemcpy((void*) binning_error_l,(void*) sdata->atom.bin_error_count.dev_data,1*sizeof(int),cudaMemcpyDeviceToHost); + if(binning_error_l[0]!=0) printf("CUDA-ERROR: binning ghost: could not bin %i atoms\n",binning_error_l[0]); +} + +void Cuda_ReverseBinning(cuda_shared_data* sdata) +{ + // initialize only on first call + short init = 0; + if(! 
init) + { + init = 0; + cudaMemcpyToSymbol(MY_CONST(bin_count_all) , & sdata->atom.bin_count_all .dev_data, sizeof(unsigned*)); + cudaMemcpyToSymbol(MY_CONST(bin_count_local), & sdata->atom.bin_count_local.dev_data, sizeof(unsigned*)); + cudaMemcpyToSymbol(MY_CONST(bin_dim) , sdata->domain.bin_dim , sizeof(int3) ); + cudaMemcpyToSymbol(MY_CONST(binned_f) , & sdata->atom.binned_f .dev_data, sizeof(F_FLOAT*) ); + cudaMemcpyToSymbol(MY_CONST(binned_q) , & sdata->atom.binned_q .dev_data, sizeof(F_FLOAT*) ); + cudaMemcpyToSymbol(MY_CONST(binned_tag) , & sdata->atom.binned_tag .dev_data, sizeof(int*) ); + cudaMemcpyToSymbol(MY_CONST(binned_type) , & sdata->atom.binned_type .dev_data, sizeof(int*) ); + cudaMemcpyToSymbol(MY_CONST(binned_v) , & sdata->atom.binned_v .dev_data, sizeof(V_FLOAT*) ); + cudaMemcpyToSymbol(MY_CONST(f) , & sdata->atom.f .dev_data, sizeof(F_FLOAT*) ); + cudaMemcpyToSymbol(MY_CONST(natoms) , & sdata->atom.nall , sizeof(unsigned) ); + cudaMemcpyToSymbol(MY_CONST(nmax) , & sdata->atom.nmax , sizeof(unsigned) ); + cudaMemcpyToSymbol(MY_CONST(q) , & sdata->atom.q .dev_data, sizeof(F_FLOAT*) ); + cudaMemcpyToSymbol(MY_CONST(tag) , & sdata->atom.tag .dev_data, sizeof(int*) ); + cudaMemcpyToSymbol(MY_CONST(type) , & sdata->atom.type .dev_data, sizeof(int*) ); + cudaMemcpyToSymbol(MY_CONST(v) , & sdata->atom.v .dev_data, sizeof(V_FLOAT*) ); + } + + dim3 grid((unsigned)(1 + sdata->atom.nlocal/32.0), 1, 1); + MYDBG( printf("# CUDA: Cuda_ReverseBinning: grid dim.x = %u (nlocal: %i)\n", grid.x,sdata->atom.nlocal); ) + dim3 threads(32, 1, 1); + + ReverseBinning_Kernel<<>> ( + (X_FLOAT*) (sdata->atom. 
x.dev_data), + (X_FLOAT*) (sdata->atom.binned_x.dev_data), + sdata->atom.q_flag + ); + cudaThreadSynchronize(); + CUT_CHECK_ERROR("Cuda_Binning: reverse binning Kernel execution failed"); +} + +#endif diff --git a/lib/cuda/binning_cu.h b/lib/cuda/binning_cu.h new file mode 100644 index 0000000000..4f932c392f --- /dev/null +++ b/lib/cuda/binning_cu.h @@ -0,0 +1,28 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#include "cuda_shared.h" + +extern "C" void Cuda_PreBinning(cuda_shared_data* sdata); +extern "C" void Cuda_Binning(cuda_shared_data* sdata); +extern "C" void Cuda_ReverseBinning(cuda_shared_data* sdata); diff --git a/lib/cuda/binning_kernel.cu b/lib/cuda/binning_kernel.cu new file mode 100644 index 0000000000..f5677d475f --- /dev/null +++ b/lib/cuda/binning_kernel.cu @@ -0,0 +1,149 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. 
+ + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +// load some variables from shared cuda data into device's constant memory: +__device__ __constant__ X_FLOAT rez_bin_size[3]; +__device__ __constant__ unsigned* bin_error_count; + +__device__ __constant__ int cuda_dummy_type; +__device__ __constant__ unsigned binned_size_all; +__device__ __constant__ X_FLOAT outside[3]; + +__global__ void PreBinning_Kernel() +{ + const unsigned bin = gridDim.y * blockIdx.x + blockIdx.y; + + if(bin < gridDim.x * gridDim.y) // TODO: suspected always to be true + { + _binned_type[blockDim.x * bin + threadIdx.x] = cuda_dummy_type; + + const int i = 3*blockDim.x * bin + threadIdx.x; + X_FLOAT* binned_x = _binned_x + i; *binned_x = _subhi[0] + outside[0] * (1+i); + binned_x += blockDim.x; *binned_x = _subhi[1] + outside[1] * (1+i); + binned_x += blockDim.x; *binned_x = _subhi[2] + outside[2] * (1+i); + _binned_tag[i]=-1; + } +} + +__global__ void Binning_Kernel(X_FLOAT* x, X_FLOAT* binned_x, int q_flag, int offset, int rmass_flag) +{ + const unsigned i = blockDim.x * blockIdx.x + threadIdx.x+offset; + + int binatoms=_natoms; + if(offset==0) binatoms=_nlocal ; + + if(i < binatoms) + { + // copy atom position from global device memory to local register + // in this 3 steps to get as much coalesced access as possible + X_FLOAT my_xX, my_xY, my_xZ; + x += i; my_xX = *x; + x += _nmax; my_xY = *x; + x += _nmax; my_xZ = *x; + //my_xX=x[i]; + //my_xY=x[i+_nmax]; + //my_xZ=x[i+2*_nmax]; + + + // calculate flat 
bin index + int bx=__float2int_rd(rez_bin_size[0] * (my_xX - _sublo[0]))+2; + int by=__float2int_rd(rez_bin_size[1] * (my_xY - _sublo[1]))+2; + int bz=__float2int_rd(rez_bin_size[2] * (my_xZ - _sublo[2]))+2; + + bx-=bx*negativCUDA(1.0f*bx); + bx-=(bx-_bin_dim.x+1)*negativCUDA(1.0f*_bin_dim.x-1.0f-1.0f*bx); + by-=by*negativCUDA(1.0f*by); + by-=(by-_bin_dim.y+1)*negativCUDA(1.0f*_bin_dim.y-1.0f-1.0f*by); + bz-=bz*negativCUDA(1.0f*bz); + bz-=(bz-_bin_dim.z+1)*negativCUDA(1.0f*_bin_dim.z-1.0f-1.0f*bz); + + + const unsigned j = _bin_dim.z * ( _bin_dim.y *bx+by)+bz; + + // add new atom to bin, get bin-array position + const unsigned k = atomicAdd(& _bin_count_all[j], 1); + if(offset==0) atomicAdd(& _bin_count_local[j], 1); + if(k < _bin_nmax) + { + // copy register values back to global device memory + unsigned pos = 3*_bin_nmax * j + k; + _binpos[i]=pos; + binned_x += pos; *binned_x = my_xX; + binned_x += _bin_nmax; *binned_x = my_xY; + binned_x += _bin_nmax; *binned_x = my_xZ; + + // also copy velocity and force accordingly + + binned_x = _binned_v + pos; x = _v + i; *binned_x = *x; + binned_x += _bin_nmax; x += _nmax; *binned_x = *x; + binned_x += _bin_nmax; x += _nmax; *binned_x = *x; + + binned_x = _binned_f + pos; x = _f + i; *binned_x = *x; + binned_x += _bin_nmax; x += _nmax; *binned_x = *x; + binned_x += _bin_nmax; x += _nmax; *binned_x = *x; + + pos = _bin_nmax * j + k; + _binned_type [pos] = _type[i]; + _binned_tag [pos] = _tag[i]; + if(rmass_flag) + _binned_rmass[pos] = _rmass[i]; + if(q_flag) + _binned_q [pos] = _q[i]; + } + else + { // normally, this should not happen: + int errorn=atomicAdd(bin_error_count, 1); + MYEMUDBG( printf("# CUDA: Binning_Kernel: WARNING: atom %i ignored, no place left in bin %u\n", i, j); ) + } + } +} + +__global__ void ReverseBinning_Kernel(X_FLOAT* x, X_FLOAT* binned_x,int q_flag) +{ + const unsigned i = blockDim.x * blockIdx.x + threadIdx.x; + + if(i < _nlocal) + { + unsigned bin_pos3 = _binpos[i]; + unsigned 
bin_pos=bin_pos3/(3*_bin_nmax); + bin_pos*=_bin_nmax; + bin_pos+=bin_pos3-bin_pos*3; + + binned_x = _binned_x + bin_pos3; x = x + i; *x = *binned_x; + binned_x += _bin_nmax; x += _nmax; *x = *binned_x; + binned_x += _bin_nmax; x += _nmax; *x = *binned_x; + + binned_x = _binned_v + bin_pos3; x = _v + i; *x = *binned_x; + binned_x += _bin_nmax; x += _nmax; *x = *binned_x; + binned_x += _bin_nmax; x += _nmax; *x = *binned_x; + + binned_x = _binned_f + bin_pos3; x = _f + i; *x = *binned_x; + binned_x += _bin_nmax; x += _nmax; *x = *binned_x; + binned_x += _bin_nmax; x += _nmax; *x = *binned_x; + + + _type[i] = _binned_type[bin_pos]; + _tag[i] = _binned_tag[bin_pos]; + if(q_flag) _q[i] = _binned_q[bin_pos]; + } +} diff --git a/lib/cuda/comm_cuda.cu b/lib/cuda/comm_cuda.cu new file mode 100644 index 0000000000..0233f3ee13 --- /dev/null +++ b/lib/cuda/comm_cuda.cu @@ -0,0 +1,483 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ + +#include +#define MY_PREFIX comm_cuda +#include "cuda_shared.h" +#include "cuda_common.h" + +#include "crm_cuda_utils.cu" + +#include "comm_cuda_cu.h" +#include "comm_cuda_kernel.cu" +#include + +void Cuda_CommCuda_UpdateBuffer(cuda_shared_data* sdata,int n) +{ + int size=n*3*sizeof(X_FLOAT); + if(sdata->buffersizebuffer,sdata->buffersize);) + CudaWrapper_FreeCudaData(sdata->buffer,sdata->buffersize); + sdata->buffer = CudaWrapper_AllocCudaData(size); + sdata->buffersize=size; + sdata->buffer_new++; + MYDBG(printf("New buffer at %p with %i kB\n",sdata->buffer,sdata->buffersize);) + } + cudaMemcpyToSymbol(MY_CONST(buffer), & sdata->buffer, sizeof(int*) ); +} + + +void Cuda_CommCuda_UpdateNmax(cuda_shared_data* sdata) +{ + cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); + cudaMemcpyToSymbol(MY_CONST(nmax) , & sdata->atom.nmax , sizeof(int) ); + cudaMemcpyToSymbol(MY_CONST(x) , & sdata->atom.x .dev_data, sizeof(X_FLOAT*) ); + cudaMemcpyToSymbol(MY_CONST(v) , & sdata->atom.v .dev_data, sizeof(X_FLOAT*) ); + cudaMemcpyToSymbol(MY_CONST(f) , & sdata->atom.f .dev_data, sizeof(F_FLOAT*) ); + cudaMemcpyToSymbol(MY_CONST(type) , & sdata->atom.type .dev_data, sizeof(int*) ); +} + + +void Cuda_CommCuda_Init(cuda_shared_data* sdata) +{ + Cuda_CommCuda_UpdateNmax(sdata); + int ntypesp=sdata->atom.ntypes+1; + cudaMemcpyToSymbol(MY_CONST(cuda_ntypes) , &ntypesp, sizeof(int)); + cudaMemcpyToSymbol(MY_CONST(prd) , sdata->domain.prd, 3*sizeof(X_FLOAT)); + cudaMemcpyToSymbol(MY_CONST(flag) , &sdata->flag, sizeof(int*)); + cudaMemcpyToSymbol(MY_CONST(debugdata) , &sdata->debugdata, sizeof(int*)); +} + +int Cuda_CommCuda_PackComm(cuda_shared_data* sdata,int n,int iswap,void* buf_send,int* pbc,int pbc_flag) +{ + + timespec time1,time2; + if(sdata->atom.update_nmax) + Cuda_CommCuda_UpdateNmax(sdata); + if(sdata->atom.update_nlocal) + cudaMemcpyToSymbol(MY_CONST(nlocal) , & 
sdata->atom.nlocal , sizeof(int) ); + int size=n*3*sizeof(X_FLOAT); + if(sdata->buffer_new or (size>sdata->buffersize)) + Cuda_CommCuda_UpdateBuffer(sdata,n); + + X_FLOAT dx=0.0; + X_FLOAT dy=0.0; + X_FLOAT dz=0.0; + if (pbc_flag != 0) { + if (sdata->domain.triclinic == 0) { + dx = pbc[0]*sdata->domain.prd[0]; + dy = pbc[1]*sdata->domain.prd[1]; + dz = pbc[2]*sdata->domain.prd[2]; + } else { + dx = pbc[0]*sdata->domain.prd[0] + pbc[5]*sdata->domain.xy + pbc[4]*sdata->domain.xz; + dy = pbc[1]*sdata->domain.prd[1] + pbc[3]*sdata->domain.yz; + dz = pbc[2]*sdata->domain.prd[2]; + }} + + int3 layout=getgrid(n); + dim3 threads(layout.z, 1, 1); + dim3 grid(layout.x, layout.y, 1); + + if(sdata->atom.nlocal>0) + { + cudaMemset( sdata->flag,0,sizeof(int)); + +clock_gettime(CLOCK_REALTIME,&time1); + + void* buf=sdata->overlap_comm?sdata->comm.buf_send_dev[iswap]:sdata->buffer; + Cuda_CommCuda_PackComm_Kernel<<>>((int*) sdata->comm.sendlist.dev_data,n + ,sdata->comm.maxlistlength,iswap,dx,dy,dz,buf); + cudaThreadSynchronize(); + +clock_gettime(CLOCK_REALTIME,&time2); +sdata->cuda_timings.comm_forward_kernel_pack+= + time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000; + + CUT_CHECK_ERROR("Cuda_CommCuda_PackComm: Kernel execution failed"); + if(not sdata->overlap_comm) + cudaMemcpy(buf_send, sdata->buffer, n*3*sizeof(X_FLOAT), cudaMemcpyDeviceToHost); + //cudaMemcpy(buf_send, sdata->comm.buf_send_dev[iswap], n*3*sizeof(X_FLOAT), cudaMemcpyDeviceToHost); + +clock_gettime(CLOCK_REALTIME,&time1); +sdata->cuda_timings.comm_forward_download+= + time1.tv_sec-time2.tv_sec+1.0*(time1.tv_nsec-time2.tv_nsec)/1000000000; + + int aflag; + cudaMemcpy(&aflag, sdata->flag, sizeof(int), cudaMemcpyDeviceToHost); + if(aflag!=0) printf("aflag PackComm: %i\n",aflag); + CUT_CHECK_ERROR("Cuda_CommCuda_PackComm: Kernel execution failed"); + + } + return 3*n; +} + +int Cuda_CommCuda_PackCommVel(cuda_shared_data* sdata,int n,int iswap,void* buf_send,int* pbc,int pbc_flag) +{ + + 
timespec time1,time2; + if(sdata->atom.update_nmax) + Cuda_CommCuda_UpdateNmax(sdata); + if(sdata->atom.update_nlocal) + cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); + int size=n*6*sizeof(X_FLOAT); + if(sdata->buffer_new or (size>sdata->buffersize)) + Cuda_CommCuda_UpdateBuffer(sdata,n); + + X_FLOAT dx=0.0; + X_FLOAT dy=0.0; + X_FLOAT dz=0.0; + if (pbc_flag != 0) { + if (sdata->domain.triclinic == 0) { + dx = pbc[0]*sdata->domain.prd[0]; + dy = pbc[1]*sdata->domain.prd[1]; + dz = pbc[2]*sdata->domain.prd[2]; + } else { + dx = pbc[0]*sdata->domain.prd[0] + pbc[5]*sdata->domain.xy + pbc[4]*sdata->domain.xz; + dy = pbc[1]*sdata->domain.prd[1] + pbc[3]*sdata->domain.yz; + dz = pbc[2]*sdata->domain.prd[2]; + }} + + int3 layout=getgrid(n); + dim3 threads(layout.z, 1, 1); + dim3 grid(layout.x, layout.y, 1); + + if(sdata->atom.nlocal>0) + { + cudaMemset( sdata->flag,0,sizeof(int)); + +clock_gettime(CLOCK_REALTIME,&time1); + + void* buf=sdata->overlap_comm?sdata->comm.buf_send_dev[iswap]:sdata->buffer; + Cuda_CommCuda_PackComm_Kernel<<>>((int*) sdata->comm.sendlist.dev_data,n + ,sdata->comm.maxlistlength,iswap,dx,dy,dz,buf); + cudaThreadSynchronize(); + +clock_gettime(CLOCK_REALTIME,&time2); +sdata->cuda_timings.comm_forward_kernel_pack+= + time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000; + + CUT_CHECK_ERROR("Cuda_CommCuda_PackComm: Kernel execution failed"); + if(not sdata->overlap_comm) + cudaMemcpy(buf_send, sdata->buffer, n*6*sizeof(X_FLOAT), cudaMemcpyDeviceToHost); + //cudaMemcpy(buf_send, sdata->comm.buf_send_dev[iswap], n*3*sizeof(X_FLOAT), cudaMemcpyDeviceToHost); + +clock_gettime(CLOCK_REALTIME,&time1); +sdata->cuda_timings.comm_forward_download+= + time1.tv_sec-time2.tv_sec+1.0*(time1.tv_nsec-time2.tv_nsec)/1000000000; + + int aflag; + cudaMemcpy(&aflag, sdata->flag, sizeof(int), cudaMemcpyDeviceToHost); + if(aflag!=0) printf("aflag PackComm: %i\n",aflag); + CUT_CHECK_ERROR("Cuda_CommCuda_PackComm: Kernel 
execution failed"); + + } + return 6*n; +} + +int Cuda_CommCuda_PackComm_Self(cuda_shared_data* sdata,int n,int iswap,int first,int* pbc,int pbc_flag) +{ + MYDBG(printf(" # CUDA: CommCuda_PackComm_Self\n");) + timespec time1,time2; + if(sdata->atom.update_nmax) + Cuda_CommCuda_UpdateNmax(sdata); + if(sdata->atom.update_nlocal) + cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); + int size=n*3*sizeof(X_FLOAT); + if(sdata->buffer_new or (size>sdata->buffersize)) + Cuda_CommCuda_UpdateBuffer(sdata,n); + static int count=-1; + count++; + X_FLOAT dx=0.0; + X_FLOAT dy=0.0; + X_FLOAT dz=0.0; + if (pbc_flag != 0) { + if (sdata->domain.triclinic == 0) { + dx = pbc[0]*sdata->domain.prd[0]; + dy = pbc[1]*sdata->domain.prd[1]; + dz = pbc[2]*sdata->domain.prd[2]; + } else { + dx = pbc[0]*sdata->domain.prd[0] + pbc[5]*sdata->domain.xy + pbc[4]*sdata->domain.xz; + dy = pbc[1]*sdata->domain.prd[1] + pbc[3]*sdata->domain.yz; + dz = pbc[2]*sdata->domain.prd[2]; + }} + + + + int3 layout=getgrid(n); + dim3 threads(layout.z, 1, 1); + dim3 grid(layout.x, layout.y, 1); + if(sdata->atom.nlocal>0) + { + +clock_gettime(CLOCK_REALTIME,&time1); + + Cuda_CommCuda_PackComm_Self_Kernel<<>>((int*) sdata->comm.sendlist.dev_data,n,sdata->comm.maxlistlength,iswap,dx,dy,dz,first); + cudaThreadSynchronize(); + +clock_gettime(CLOCK_REALTIME,&time2); +sdata->cuda_timings.comm_forward_kernel_self+= + time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000; + + CUT_CHECK_ERROR("Cuda_CommCuda_PackComm_Self: Kernel execution failed"); + } + + return 3*n; +} + +int Cuda_CommCuda_PackCommVel_Self(cuda_shared_data* sdata,int n,int iswap,int first,int* pbc,int pbc_flag) +{ + MYDBG(printf(" # CUDA: CommCuda_PackComm_Self\n");) + timespec time1,time2; + if(sdata->atom.update_nmax) + Cuda_CommCuda_UpdateNmax(sdata); + if(sdata->atom.update_nlocal) + cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); + int size=n*6*sizeof(X_FLOAT); + 
if(sdata->buffer_new or (size>sdata->buffersize)) + Cuda_CommCuda_UpdateBuffer(sdata,n); + static int count=-1; + count++; + X_FLOAT dx=0.0; + X_FLOAT dy=0.0; + X_FLOAT dz=0.0; + if (pbc_flag != 0) { + if (sdata->domain.triclinic == 0) { + dx = pbc[0]*sdata->domain.prd[0]; + dy = pbc[1]*sdata->domain.prd[1]; + dz = pbc[2]*sdata->domain.prd[2]; + } else { + dx = pbc[0]*sdata->domain.prd[0] + pbc[5]*sdata->domain.xy + pbc[4]*sdata->domain.xz; + dy = pbc[1]*sdata->domain.prd[1] + pbc[3]*sdata->domain.yz; + dz = pbc[2]*sdata->domain.prd[2]; + }} + + + + int3 layout=getgrid(n); + dim3 threads(layout.z, 1, 1); + dim3 grid(layout.x, layout.y, 1); + if(sdata->atom.nlocal>0) + { + +clock_gettime(CLOCK_REALTIME,&time1); + + Cuda_CommCuda_PackComm_Self_Kernel<<>>((int*) sdata->comm.sendlist.dev_data,n,sdata->comm.maxlistlength,iswap,dx,dy,dz,first); + cudaThreadSynchronize(); + +clock_gettime(CLOCK_REALTIME,&time2); +sdata->cuda_timings.comm_forward_kernel_self+= + time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000; + + CUT_CHECK_ERROR("Cuda_CommCuda_PackComm_Self: Kernel execution failed"); + } + + return 6*n; +} + +void Cuda_CommCuda_UnpackComm(cuda_shared_data* sdata,int n,int first,void* buf_recv,int iswap) +{ + timespec time1,time2; + + if(sdata->atom.update_nmax) + Cuda_CommCuda_UpdateNmax(sdata); + if(sdata->atom.update_nlocal) + cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); + int size=n*3*sizeof(X_FLOAT); + if(sdata->buffer_new or (size>sdata->buffersize)) + Cuda_CommCuda_UpdateBuffer(sdata,n); + + int3 layout=getgrid(n); + dim3 threads(layout.z, 1, 1); + dim3 grid(layout.x, layout.y, 1); + if(sdata->atom.nlocal>0) + { +clock_gettime(CLOCK_REALTIME,&time1); + if(not sdata->overlap_comm||iswap<0) + cudaMemcpy(sdata->buffer,(void*)buf_recv, n*3*sizeof(X_FLOAT), cudaMemcpyHostToDevice); + +clock_gettime(CLOCK_REALTIME,&time2); +sdata->cuda_timings.comm_forward_upload+= + 
time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000; + void* buf=(sdata->overlap_comm&&iswap>=0)?sdata->comm.buf_recv_dev[iswap]:sdata->buffer; + Cuda_CommCuda_UnpackComm_Kernel<<>>(n,first,buf); + cudaThreadSynchronize(); + +clock_gettime(CLOCK_REALTIME,&time1); +sdata->cuda_timings.comm_forward_kernel_unpack+= + time1.tv_sec-time2.tv_sec+1.0*(time1.tv_nsec-time2.tv_nsec)/1000000000; + + CUT_CHECK_ERROR("Cuda_CommCuda_UnpackComm: Kernel execution failed"); + + } +} + +void Cuda_CommCuda_UnpackCommVel(cuda_shared_data* sdata,int n,int first,void* buf_recv,int iswap) +{ + timespec time1,time2; + + if(sdata->atom.update_nmax) + Cuda_CommCuda_UpdateNmax(sdata); + if(sdata->atom.update_nlocal) + cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); + int size=n*6*sizeof(X_FLOAT); + if(sdata->buffer_new or (size>sdata->buffersize)) + Cuda_CommCuda_UpdateBuffer(sdata,n); + + int3 layout=getgrid(n); + dim3 threads(layout.z, 1, 1); + dim3 grid(layout.x, layout.y, 1); + if(sdata->atom.nlocal>0) + { +clock_gettime(CLOCK_REALTIME,&time1); + + if(not sdata->overlap_comm||iswap<0) + cudaMemcpy(sdata->buffer,(void*)buf_recv, n*6*sizeof(X_FLOAT), cudaMemcpyHostToDevice); + +clock_gettime(CLOCK_REALTIME,&time2); +sdata->cuda_timings.comm_forward_upload+= + time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000; + void* buf=(sdata->overlap_comm&&iswap>=0)?sdata->comm.buf_recv_dev[iswap]:sdata->buffer; + Cuda_CommCuda_UnpackComm_Kernel<<>>(n,first,buf); + cudaThreadSynchronize(); + +clock_gettime(CLOCK_REALTIME,&time1); +sdata->cuda_timings.comm_forward_kernel_unpack+= + time1.tv_sec-time2.tv_sec+1.0*(time1.tv_nsec-time2.tv_nsec)/1000000000; + + CUT_CHECK_ERROR("Cuda_CommCuda_UnpackComm: Kernel execution failed"); + + } +} + +int Cuda_CommCuda_PackReverse(cuda_shared_data* sdata,int n,int first,void* buf_send) +{ + if(sdata->atom.update_nmax) + Cuda_CommCuda_UpdateNmax(sdata); + if(sdata->atom.update_nlocal) + 
cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); + int size=n*3*sizeof(F_FLOAT); + if(sdata->buffer_new or (size>sdata->buffersize)) + Cuda_CommCuda_UpdateBuffer(sdata,n); + + + F_FLOAT* buf=(F_FLOAT*)buf_send; + F_FLOAT* f_dev=(F_FLOAT*)sdata->atom.f.dev_data; + f_dev+=first; + cudaMemcpy(buf, f_dev, n*sizeof(F_FLOAT), cudaMemcpyDeviceToHost); + buf+=n; f_dev+=sdata->atom.nmax; + cudaMemcpy(buf, f_dev, n*sizeof(F_FLOAT), cudaMemcpyDeviceToHost); + buf+=n; f_dev+=sdata->atom.nmax; + cudaMemcpy(buf, f_dev, n*sizeof(F_FLOAT), cudaMemcpyDeviceToHost); + return n*3; +} + + +void Cuda_CommCuda_UnpackReverse(cuda_shared_data* sdata,int n,int iswap,void* buf_recv) +{ + if(sdata->atom.update_nmax) + Cuda_CommCuda_UpdateNmax(sdata); + if(sdata->atom.update_nlocal) + cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); + int size=n*3*sizeof(F_FLOAT); + if(sdata->buffer_new or (size>sdata->buffersize)) + Cuda_CommCuda_UpdateBuffer(sdata,n); + + + int3 layout=getgrid(n); + dim3 threads(layout.z, 1, 1); + dim3 grid(layout.x, layout.y, 1); + if(sdata->atom.nlocal>0) + { + cudaMemcpy(sdata->buffer,buf_recv, size, cudaMemcpyHostToDevice); + Cuda_CommCuda_UnpackReverse_Kernel<<>>((int*) sdata->comm.sendlist.dev_data,n,sdata->comm.maxlistlength,iswap); + cudaThreadSynchronize(); + CUT_CHECK_ERROR("Cuda_CommCuda_UnpackReverse: Kernel execution failed"); + } +} + +void Cuda_CommCuda_UnpackReverse_Self(cuda_shared_data* sdata,int n,int iswap,int first) +{ + if(sdata->atom.update_nmax) + Cuda_CommCuda_UpdateNmax(sdata); + if(sdata->atom.update_nlocal) + cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); + int size=n*3*sizeof(X_FLOAT); + if(sdata->buffer_new or (size>sdata->buffersize)) + Cuda_CommCuda_UpdateBuffer(sdata,n); + + int3 layout=getgrid(n); + dim3 threads(layout.z, 1, 1); + dim3 grid(layout.x, layout.y, 1); + + if(sdata->atom.nlocal>0) + { + Cuda_CommCuda_UnpackReverse_Self_Kernel<<>>((int*) 
sdata->comm.sendlist.dev_data,n,sdata->comm.maxlistlength,iswap,first); + cudaThreadSynchronize(); + CUT_CHECK_ERROR("Cuda_CommCuda_PackReverse_Self: Kernel execution failed"); + + } +} + + +int Cuda_CommCuda_BuildSendlist(cuda_shared_data* sdata,int bordergroup,int ineed,int style,int atom_nfirst,int nfirst,int nlast,int dim,int iswap) +{ + MYDBG(printf(" # CUDA: CommCuda_BuildSendlist\n");) + timespec time1,time2; + Cuda_CommCuda_UpdateNmax(sdata); + cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); + if(sdata->buffer_new or (80>sdata->buffersize)) + Cuda_CommCuda_UpdateBuffer(sdata,10); + int n; + if (!bordergroup || ineed >= 2) + n=nlast-nfirst+1; + else + { + n=atom_nfirst; + if(nlast-sdata->atom.nlocal+1>n) n=nlast-sdata->atom.nlocal+1; + } + int3 layout=getgrid(n,0,512,true); + dim3 threads(layout.z, 1, 1); + dim3 grid(layout.x+1, layout.y, 1); + + + cudaMemset((int*) (sdata->buffer),0,sizeof(int)); + +clock_gettime(CLOCK_REALTIME,&time1); + if(style==1) + Cuda_CommCuda_BuildSendlist_Single<<>>(bordergroup, ineed, atom_nfirst, nfirst, nlast, dim, iswap,(X_FLOAT*) sdata->comm.slablo.dev_data,(X_FLOAT*) sdata->comm.slabhi.dev_data,(int*) sdata->comm.sendlist.dev_data,sdata->comm.maxlistlength); + else + Cuda_CommCuda_BuildSendlist_Multi<<>>(bordergroup, ineed, atom_nfirst, nfirst, nlast, dim, iswap,(X_FLOAT*) sdata->comm.multilo.dev_data,(X_FLOAT*) sdata->comm.multihi.dev_data,(int*) sdata->comm.sendlist.dev_data,sdata->comm.maxlistlength); + cudaThreadSynchronize(); +clock_gettime(CLOCK_REALTIME,&time2); +sdata->cuda_timings.comm_border_kernel_buildlist+= + time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000; + + CUT_CHECK_ERROR("Cuda_CommCuda_BuildSendlist: Kernel execution failed"); + int nsend; + cudaMemcpy(&nsend, sdata->buffer, sizeof(int), cudaMemcpyDeviceToHost); + return nsend; + + +} + diff --git a/lib/cuda/comm_cuda_cu.h b/lib/cuda/comm_cuda_cu.h new file mode 100644 index 0000000000..b5b2d192ba --- 
/dev/null +++ b/lib/cuda/comm_cuda_cu.h @@ -0,0 +1,35 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#include "cuda_shared.h" + +extern "C" int Cuda_CommCuda_PackComm(cuda_shared_data* sdata,int n,int iswap,void* buf_send,int* pbc,int pbcflag); +extern "C" int Cuda_CommCuda_PackCommVel(cuda_shared_data* sdata,int n,int iswap,void* buf_send,int* pbc,int pbcflag); +extern "C" int Cuda_CommCuda_PackComm_Self(cuda_shared_data* sdata,int n,int iswap,int first,int* pbc,int pbcflag); +extern "C" int Cuda_CommCuda_PackCommVel_Self(cuda_shared_data* sdata,int n,int iswap,int first,int* pbc,int pbcflag); +extern "C" void Cuda_CommCuda_UnpackComm(cuda_shared_data* sdata,int n,int first,void* buf_recv,int iswap=-1); +extern "C" void Cuda_CommCuda_UnpackCommVel(cuda_shared_data* sdata,int n,int first,void* buf_recv,int iswap=-1); +extern "C" int Cuda_CommCuda_PackReverse(cuda_shared_data* sdata,int n,int first,void* buf_send); +extern "C" void Cuda_CommCuda_UnpackReverse(cuda_shared_data* sdata,int n,int iswap,void* buf_recv); +extern "C" void Cuda_CommCuda_UnpackReverse_Self(cuda_shared_data* sdata,int n,int iswap,int first); +extern "C" int 
Cuda_CommCuda_BuildSendlist(cuda_shared_data* sdata,int bordergroup,int ineed,int style,int atom_nfirst,int nfirst,int nlast,int dim,int iswap); diff --git a/lib/cuda/comm_cuda_kernel.cu b/lib/cuda/comm_cuda_kernel.cu new file mode 100644 index 0000000000..c171a721a4 --- /dev/null +++ b/lib/cuda/comm_cuda_kernel.cu @@ -0,0 +1,353 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ + +__global__ void Cuda_CommCuda_PackComm_Kernel(int* sendlist,int n,int maxlistlength,int iswap,X_FLOAT dx,X_FLOAT dy,X_FLOAT dz,void* buffer) +{ + int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x; + int* list=sendlist+iswap*maxlistlength; + if(i_nmax) _flag[0]=1; + ((X_FLOAT*) buffer)[i]=_x[j] + dx; + ((X_FLOAT*) buffer)[i+1*n] = _x[j+_nmax] + dy; + ((X_FLOAT*) buffer)[i+2*n] = _x[j+2*_nmax] + dz; + } +} + +__global__ void Cuda_CommCuda_PackCommVel_Kernel(int* sendlist,int n,int maxlistlength,int iswap,X_FLOAT dx,X_FLOAT dy,X_FLOAT dz,void* buffer) +{ + int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x; + int* list=sendlist+iswap*maxlistlength; + if(i_nmax) _flag[0]=1; + ((X_FLOAT*) buffer)[i]=_x[j] + dx; + ((X_FLOAT*) buffer)[i+1*n] = _x[j+_nmax] + dy; + ((X_FLOAT*) buffer)[i+2*n] = _x[j+2*_nmax] + dz; + ((X_FLOAT*) buffer)[i+3*n]=_v[j]; + ((X_FLOAT*) buffer)[i+4*n] = _v[j+_nmax]; + ((X_FLOAT*) buffer)[i+5*n] = _v[j+2*_nmax]; + } +} + +__global__ void Cuda_CommCuda_PackComm_Self_Kernel(int* sendlist,int n,int maxlistlength,int iswap,X_FLOAT dx,X_FLOAT dy,X_FLOAT dz,int first) +{ + int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x; + + int* list=sendlist+iswap*maxlistlength; + if(i= 2) { + int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x+nfirst; + if(i= lo && _x[i+dim*_nmax] <= hi) { + add=true; + } + shared[threadIdx.x]=add?1:0; + + __syncthreads(); + + int nsend=0; + if(threadIdx.x==0) + { + for(int k=0;k= lo && _x[i+dim*_nmax] <= hi) { + add=true; + } + + shared[threadIdx.x]=add?1:0; + + __syncthreads(); + + int nsend=0; + if(threadIdx.x==0) + { + for(int k=0;k= lo && _x[i+dim*_nmax] <= hi) { + add=true; + } + shared[threadIdx.x]=add?1:0; + + __syncthreads(); + + nsend=0; + if(threadIdx.x==0) + { + for(int k=0;k= 2) { + int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x+nfirst; + if(i= mlo[itype] && _x[i+dim*_nmax] <= 
mhi[itype]) { + add=true; + } + } + shared[threadIdx.x]=add?1:0; + + __syncthreads(); + + int nsend=0; + if(threadIdx.x==0) + { + for(int k=0;k= mlo[itype] && _x[i+dim*_nmax] <= mhi[itype]) { + add=true; + } + } + shared[threadIdx.x]=add?1:0; + + __syncthreads(); + + int nsend=0; + if(threadIdx.x==0) + { + for(int k=0;k= mlo[itype] && _x[i+dim*_nmax] <= mhi[itype]) { + add=true; + } + } + shared[threadIdx.x]=add?1:0; + + __syncthreads(); + + nsend=0; + if(threadIdx.x==0) + { + for(int k=0;k +#define MY_PREFIX compute_temp_cuda +#include "cuda_shared.h" +#include "cuda_common.h" + +#include "crm_cuda_utils.cu" + +#include "compute_temp_cuda_cu.h" +#include "compute_temp_cuda_kernel.cu" + +void Cuda_ComputeTempCuda_UpdateBuffer(cuda_shared_data* sdata) +{ + int size=(unsigned)((sdata->atom.nlocal+63)/64.0)*6*sizeof(ENERGY_FLOAT); + if(sdata->buffersizebuffer,sdata->buffersize);) + CudaWrapper_FreeCudaData(sdata->buffer,sdata->buffersize); + sdata->buffer = CudaWrapper_AllocCudaData(size); + sdata->buffersize=size; + sdata->buffer_new++; + MYDBG(printf("New buffer at %p with %i kB\n",sdata->buffer,sdata->buffersize);) + } + cudaMemcpyToSymbol(MY_CONST(buffer), & sdata->buffer, sizeof(int*) ); +} + +void Cuda_ComputeTempCuda_UpdateNmax(cuda_shared_data* sdata) +{ + cudaMemcpyToSymbol(MY_CONST(mask) , & sdata->atom.mask .dev_data, sizeof(int*) ); + cudaMemcpyToSymbol(MY_CONST(mass) , & sdata->atom.mass .dev_data, sizeof(V_FLOAT*) ); + if(sdata->atom.rmass_flag) + cudaMemcpyToSymbol(MY_CONST(rmass) , & sdata->atom.rmass.dev_data, sizeof(V_FLOAT*) ); + cudaMemcpyToSymbol(MY_CONST(rmass_flag) , & sdata->atom.rmass_flag, sizeof(int) ); + cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); + cudaMemcpyToSymbol(MY_CONST(nmax) , & sdata->atom.nmax , sizeof(int) ); + cudaMemcpyToSymbol(MY_CONST(v) , & sdata->atom.v .dev_data, sizeof(V_FLOAT*) ); + cudaMemcpyToSymbol(MY_CONST(type) , & sdata->atom.type .dev_data, sizeof(int*) ); +} + +void 
Cuda_ComputeTempCuda_Init(cuda_shared_data* sdata) +{ + Cuda_ComputeTempCuda_UpdateNmax(sdata); +} + + +void Cuda_ComputeTempCuda_Vector(cuda_shared_data* sdata, int groupbit,ENERGY_FLOAT* t) +{ + //if(sdata->atom.update_nmax) //is most likely not called every timestep, therefore update of constants is necessary + Cuda_ComputeTempCuda_UpdateNmax(sdata); + //if(sdata->atom.update_nlocal) + cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); + //if(sdata->buffer_new) + Cuda_ComputeTempCuda_UpdateBuffer(sdata); + + int3 layout=getgrid(sdata->atom.nlocal); + dim3 threads(layout.z, 1, 1); + dim3 grid(layout.x, layout.y, 1); + if(sdata->atom.nlocal>0) + { + Cuda_ComputeTempCuda_Vector_Kernel<<>> (groupbit); + cudaThreadSynchronize(); + CUT_CHECK_ERROR("Cuda_ComputeTempCuda_Vector: compute_vector Kernel execution failed"); + + int oldgrid=grid.x; + grid.x=6; + threads.x=512; + Cuda_ComputeTempCuda_Reduce_Kernel<<>> (oldgrid,t); + cudaThreadSynchronize(); + CUT_CHECK_ERROR("Cuda_ComputeTempCuda_Vector: reduce_vector Kernel execution failed"); + } +} + +void Cuda_ComputeTempCuda_Scalar(cuda_shared_data* sdata, int groupbit,ENERGY_FLOAT* t) +{ + //if(sdata->atom.update_nmax) //is most likely not called every timestep, therefore update of constants is necessary + Cuda_ComputeTempCuda_UpdateNmax(sdata); + //if(sdata->atom.update_nlocal) + cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); + //if(sdata->buffer_new) + Cuda_ComputeTempCuda_UpdateBuffer(sdata); + MYDBG(printf("#CUDA ComputeTempCuda_Scalar: %i\n",sdata->atom.nlocal);) + int3 layout=getgrid(sdata->atom.nlocal); + dim3 threads(layout.z, 1, 1); + dim3 grid(layout.x, layout.y, 1); + if(sdata->atom.nlocal>0) + { + CUT_CHECK_ERROR("Cuda_ComputeTempCuda_Scalar: pre compute_scalar Kernel"); + Cuda_ComputeTempCuda_Scalar_Kernel<<>> (groupbit); + cudaThreadSynchronize(); + CUT_CHECK_ERROR("Cuda_ComputeTempCuda_Scalar: compute_scalar Kernel execution failed"); + + int 
oldgrid=grid.x; + grid.x=1; + threads.x=512; + Cuda_ComputeTempCuda_Reduce_Kernel<<>> (oldgrid,t); + cudaThreadSynchronize(); + CUT_CHECK_ERROR("Cuda_ComputeTempCuda_Scalar: reduce_scalar Kernel execution failed"); + } +} diff --git a/lib/cuda/compute_temp_cuda_cu.h b/lib/cuda/compute_temp_cuda_cu.h new file mode 100644 index 0000000000..0793be77cb --- /dev/null +++ b/lib/cuda/compute_temp_cuda_cu.h @@ -0,0 +1,28 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ + +#include "cuda_shared.h" + +extern "C" void Cuda_ComputeTempCuda_Init(cuda_shared_data* sdata); +extern "C" void Cuda_ComputeTempCuda_Vector(cuda_shared_data* sdata, int groupbit,ENERGY_FLOAT* t); +extern "C" void Cuda_ComputeTempCuda_Scalar(cuda_shared_data* sdata, int groupbit,ENERGY_FLOAT* t); diff --git a/lib/cuda/compute_temp_cuda_kernel.cu b/lib/cuda/compute_temp_cuda_kernel.cu new file mode 100644 index 0000000000..3e97148f6b --- /dev/null +++ b/lib/cuda/compute_temp_cuda_kernel.cu @@ -0,0 +1,109 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ + +extern __shared__ ENERGY_FLOAT sharedmem[]; + + +__global__ void Cuda_ComputeTempCuda_Scalar_Kernel(int groupbit) +{ + int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x; + sharedmem[threadIdx.x]=0; + if(i < _nlocal) + { + if (_rmass_flag) { + if (_mask[i] & groupbit) + sharedmem[threadIdx.x] = (_v[i]*_v[i] + _v[i+_nmax]*_v[i+_nmax] + _v[i+2*_nmax]*_v[i+2*_nmax]) * _rmass[i]; + } else { + if (_mask[i] & groupbit) + sharedmem[threadIdx.x] = (_v[i]*_v[i] + _v[i+_nmax]*_v[i+_nmax] + _v[i+2*_nmax]*_v[i+2*_nmax]) * (_mass[_type[i]]); + } + } + reduceBlock(sharedmem); + ENERGY_FLOAT* buffer=(ENERGY_FLOAT*) _buffer; + if(threadIdx.x==0) + { + buffer[blockIdx.x]=sharedmem[0]; + } +} + +__global__ void Cuda_ComputeTempCuda_Vector_Kernel(int groupbit) +{ + int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x; + sharedmem[threadIdx.x]=0; + sharedmem[threadIdx.x+blockDim.x]=0; + sharedmem[threadIdx.x+2*blockDim.x]=0; + sharedmem[threadIdx.x+3*blockDim.x]=0; + sharedmem[threadIdx.x+4*blockDim.x]=0; + sharedmem[threadIdx.x+5*blockDim.x]=0; + if(i < _nlocal) + if (_mask[i] & groupbit) { + V_FLOAT massone; + if (_rmass_flag) massone = _rmass[i]; + else massone = _mass[_type[i]]; + sharedmem[threadIdx.x] = massone * _v[i]*_v[i]; + sharedmem[threadIdx.x+blockDim.x] = massone * _v[i+_nmax]*_v[i+_nmax]; + sharedmem[threadIdx.x+2*blockDim.x] = massone * _v[i+2*_nmax]*_v[i+2*_nmax]; + sharedmem[threadIdx.x+3*blockDim.x] = massone * _v[i]*_v[i+_nmax]; + sharedmem[threadIdx.x+4*blockDim.x] = massone * _v[i]*_v[i+2*_nmax]; + sharedmem[threadIdx.x+5*blockDim.x] = massone * _v[i+_nmax]*_v[i+2*_nmax]; + } + reduceBlock(sharedmem); + reduceBlock(&sharedmem[blockDim.x]); + reduceBlock(&sharedmem[2*blockDim.x]); + reduceBlock(&sharedmem[3*blockDim.x]); + reduceBlock(&sharedmem[4*blockDim.x]); + reduceBlock(&sharedmem[5*blockDim.x]); + ENERGY_FLOAT* buffer=(ENERGY_FLOAT*) _buffer; + 
if(threadIdx.x==0) + { + buffer[blockIdx.x]=sharedmem[0]; + buffer[blockIdx.x+gridDim.x]=sharedmem[blockDim.x]; + buffer[blockIdx.x+2*gridDim.x]=sharedmem[2*blockDim.x]; + buffer[blockIdx.x+3*gridDim.x]=sharedmem[3*blockDim.x]; + buffer[blockIdx.x+4*gridDim.x]=sharedmem[4*blockDim.x]; + buffer[blockIdx.x+5*gridDim.x]=sharedmem[5*blockDim.x]; + } +} + + +__global__ void Cuda_ComputeTempCuda_Reduce_Kernel(int n,ENERGY_FLOAT* t) +{ + int i=0; + sharedmem[threadIdx.x]=0; + ENERGY_FLOAT myforig=0.0; + ENERGY_FLOAT* buf=(ENERGY_FLOAT*) _buffer; + buf=&buf[blockIdx.x*n]; + while(i +#define MY_PREFIX compute_temp_partial_cuda +#include "cuda_shared.h" +#include "cuda_common.h" + +#include "crm_cuda_utils.cu" + +#include "compute_temp_partial_cuda_cu.h" +#include "compute_temp_partial_cuda_kernel.cu" + +void Cuda_ComputeTempPartialCuda_UpdateBuffer(cuda_shared_data* sdata) +{ + int size=(unsigned)((sdata->atom.nlocal+63)/64.0)*6*sizeof(ENERGY_FLOAT); + if(sdata->buffersizebuffer,sdata->buffersize);) + CudaWrapper_FreeCudaData(sdata->buffer,sdata->buffersize); + sdata->buffer = CudaWrapper_AllocCudaData(size); + sdata->buffersize=size; + sdata->buffer_new++; + MYDBG(printf("New buffer at %p with %i kB\n",sdata->buffer,sdata->buffersize);) + } + cudaMemcpyToSymbol(MY_CONST(buffer), & sdata->buffer, sizeof(int*) ); +} + +void Cuda_ComputeTempPartialCuda_UpdateNmax(cuda_shared_data* sdata) +{ + cudaMemcpyToSymbol(MY_CONST(mask) , & sdata->atom.mask .dev_data, sizeof(int*) ); + cudaMemcpyToSymbol(MY_CONST(mass) , & sdata->atom.mass .dev_data, sizeof(V_FLOAT*) ); + if(sdata->atom.rmass_flag) + cudaMemcpyToSymbol(MY_CONST(rmass) , & sdata->atom.rmass.dev_data, sizeof(V_FLOAT*) ); + cudaMemcpyToSymbol(MY_CONST(rmass_flag) , & sdata->atom.rmass_flag, sizeof(int) ); + cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); + cudaMemcpyToSymbol(MY_CONST(nmax) , & sdata->atom.nmax , sizeof(int) ); + cudaMemcpyToSymbol(MY_CONST(v) , & sdata->atom.v .dev_data, 
sizeof(V_FLOAT*) ); + cudaMemcpyToSymbol(MY_CONST(type) , & sdata->atom.type .dev_data, sizeof(int*) ); +} + +void Cuda_ComputeTempPartialCuda_Init(cuda_shared_data* sdata) +{ + Cuda_ComputeTempPartialCuda_UpdateNmax(sdata); +} + + +void Cuda_ComputeTempPartialCuda_Vector(cuda_shared_data* sdata, int groupbit,ENERGY_FLOAT* t,int xflag,int yflag,int zflag) +{ + //if(sdata->atom.update_nmax) //is most likely not called every timestep, therefore update of constants is necessary + Cuda_ComputeTempPartialCuda_UpdateNmax(sdata); + //if(sdata->atom.update_nlocal) + cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); + //if(sdata->buffer_new) + Cuda_ComputeTempPartialCuda_UpdateBuffer(sdata); + + int3 layout=getgrid(sdata->atom.nlocal); + dim3 threads(layout.z, 1, 1); + dim3 grid(layout.x, layout.y, 1); + if(sdata->atom.nlocal>0) + { + Cuda_ComputeTempPartialCuda_Vector_Kernel<<>> (groupbit,xflag,yflag,zflag); + cudaThreadSynchronize(); + CUT_CHECK_ERROR("Cuda_ComputeTempPartialCuda_Vector: compute_vector Kernel execution failed"); + + int oldgrid=grid.x; + grid.x=6; + threads.x=512; + Cuda_ComputeTempPartialCuda_Reduce_Kernel<<>> (oldgrid,t); + cudaThreadSynchronize(); + CUT_CHECK_ERROR("Cuda_ComputeTempPartialCuda_Vector: reduce_vector Kernel execution failed"); + } +} + +void Cuda_ComputeTempPartialCuda_Scalar(cuda_shared_data* sdata, int groupbit,ENERGY_FLOAT* t,int xflag,int yflag,int zflag) +{ + //if(sdata->atom.update_nmax) //is most likely not called every timestep, therefore update of constants is necessary + Cuda_ComputeTempPartialCuda_UpdateNmax(sdata); + //if(sdata->atom.update_nlocal) + cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); + //if(sdata->buffer_new) + Cuda_ComputeTempPartialCuda_UpdateBuffer(sdata); + MYDBG(printf("#CUDA ComputeTempPartialCuda_Scalar: %i\n",sdata->atom.nlocal);) + int3 layout=getgrid(sdata->atom.nlocal); + dim3 threads(layout.z, 1, 1); + dim3 grid(layout.x, layout.y, 1); + 
if(sdata->atom.nlocal>0) + { + CUT_CHECK_ERROR("Cuda_ComputeTempPartialCuda_Scalar: pre compute_scalar Kernel"); + Cuda_ComputeTempPartialCuda_Scalar_Kernel<<>> (groupbit,xflag,yflag,zflag); + cudaThreadSynchronize(); + CUT_CHECK_ERROR("Cuda_ComputeTempPartialCuda_Scalar: compute_scalar Kernel execution failed"); + + int oldgrid=grid.x; + grid.x=1; + threads.x=512; + Cuda_ComputeTempPartialCuda_Reduce_Kernel<<>> (oldgrid,t); + cudaThreadSynchronize(); + CUT_CHECK_ERROR("Cuda_ComputeTempPartialCuda_Scalar: reduce_scalar Kernel execution failed"); + } +} + +void Cuda_ComputeTempPartialCuda_RemoveBiasAll(cuda_shared_data* sdata, int groupbit,int xflag,int yflag,int zflag,void* vbiasall) +{ + //if(sdata->atom.update_nmax) //is most likely not called every timestep, therefore update of constants is necessary + Cuda_ComputeTempPartialCuda_UpdateNmax(sdata); + //if(sdata->atom.update_nlocal) + cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); + //if(sdata->buffer_new) + Cuda_ComputeTempPartialCuda_UpdateBuffer(sdata); + + int3 layout=getgrid(sdata->atom.nlocal); + dim3 threads(layout.z, 1, 1); + dim3 grid(layout.x, layout.y, 1); + if(sdata->atom.nlocal>0) + { + Cuda_ComputeTempPartialCuda_RemoveBiasAll_Kernel<<>> (groupbit,xflag,yflag,zflag,(V_FLOAT*) vbiasall); + cudaThreadSynchronize(); + CUT_CHECK_ERROR("Cuda_ComputeTempPartialCuda_RemoveBiasAll: compute_vector Kernel execution failed"); + } +} + +void Cuda_ComputeTempPartialCuda_RestoreBiasAll(cuda_shared_data* sdata, int groupbit,int xflag,int yflag,int zflag,void* vbiasall) +{ + //if(sdata->atom.update_nmax) //is most likely not called every timestep, therefore update of constants is necessary + Cuda_ComputeTempPartialCuda_UpdateNmax(sdata); + //if(sdata->atom.update_nlocal) + cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); + //if(sdata->buffer_new) + Cuda_ComputeTempPartialCuda_UpdateBuffer(sdata); + + int3 layout=getgrid(sdata->atom.nlocal); + dim3 
threads(layout.z, 1, 1); + dim3 grid(layout.x, layout.y, 1); + if(sdata->atom.nlocal>0) + { + Cuda_ComputeTempPartialCuda_RestoreBiasAll_Kernel<<>> (groupbit,xflag,yflag,zflag,(V_FLOAT*) vbiasall); + cudaThreadSynchronize(); + CUT_CHECK_ERROR("Cuda_ComputeTempPartialCuda_RemoveBiasAll: compute_vector Kernel execution failed"); + } +} diff --git a/lib/cuda/compute_temp_partial_cuda_cu.h b/lib/cuda/compute_temp_partial_cuda_cu.h new file mode 100644 index 0000000000..82fe86fa71 --- /dev/null +++ b/lib/cuda/compute_temp_partial_cuda_cu.h @@ -0,0 +1,30 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ + +#include "cuda_shared.h" + +extern "C" void Cuda_ComputeTempPartialCuda_Init(cuda_shared_data* sdata); +extern "C" void Cuda_ComputeTempPartialCuda_Vector(cuda_shared_data* sdata, int groupbit,ENERGY_FLOAT* t,int xflag,int yflag,int zflag); +extern "C" void Cuda_ComputeTempPartialCuda_Scalar(cuda_shared_data* sdata, int groupbit,ENERGY_FLOAT* t,int xflag,int yflag,int zflag); +extern "C" void Cuda_ComputeTempPartialCuda_RemoveBiasAll(cuda_shared_data* sdata, int groupbit,int xflag,int yflag,int zflag,void* vbiasall); +extern "C" void Cuda_ComputeTempPartialCuda_RestoreBiasAll(cuda_shared_data* sdata, int groupbit,int xflag,int yflag,int zflag,void* vbiasall); diff --git a/lib/cuda/compute_temp_partial_cuda_kernel.cu b/lib/cuda/compute_temp_partial_cuda_kernel.cu new file mode 100644 index 0000000000..c14c3a06a2 --- /dev/null +++ b/lib/cuda/compute_temp_partial_cuda_kernel.cu @@ -0,0 +1,152 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ + +extern __shared__ ENERGY_FLOAT sharedmem[]; + + +__global__ void Cuda_ComputeTempPartialCuda_Scalar_Kernel(int groupbit,int xflag,int yflag,int zflag) +{ + int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x; + sharedmem[threadIdx.x]=0; + if(i < _nlocal) + { + if (_rmass_flag) { + if (_mask[i] & groupbit) + sharedmem[threadIdx.x] = (_v[i]*_v[i]*xflag + _v[i+_nmax]*_v[i+_nmax]*yflag + _v[i+2*_nmax]*_v[i+2*_nmax]*zflag) * _rmass[i]; + } else { + if (_mask[i] & groupbit) + sharedmem[threadIdx.x] = (_v[i]*_v[i]*xflag + _v[i+_nmax]*_v[i+_nmax]*yflag + _v[i+2*_nmax]*_v[i+2*_nmax]*zflag) * (_mass[_type[i]]); + } + } + reduceBlock(sharedmem); + ENERGY_FLOAT* buffer=(ENERGY_FLOAT*) _buffer; + if(threadIdx.x==0) + { + buffer[blockIdx.x]=sharedmem[0]; + } +} + +__global__ void Cuda_ComputeTempPartialCuda_Vector_Kernel(int groupbit,int xflag,int yflag,int zflag) +{ + int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x; + sharedmem[threadIdx.x]=0; + sharedmem[threadIdx.x+blockDim.x]=0; + sharedmem[threadIdx.x+2*blockDim.x]=0; + sharedmem[threadIdx.x+3*blockDim.x]=0; + sharedmem[threadIdx.x+4*blockDim.x]=0; + sharedmem[threadIdx.x+5*blockDim.x]=0; + if(i < _nlocal) + if (_mask[i] & groupbit) { + V_FLOAT massone; + if (_rmass_flag) massone = _rmass[i]; + else massone = _mass[_type[i]]; + sharedmem[threadIdx.x] = massone * _v[i]*_v[i]*xflag; + sharedmem[threadIdx.x+blockDim.x] = massone * _v[i+_nmax]*_v[i+_nmax]*yflag; + sharedmem[threadIdx.x+2*blockDim.x] = massone * _v[i+2*_nmax]*_v[i+2*_nmax]*zflag; + sharedmem[threadIdx.x+3*blockDim.x] = massone * _v[i]*_v[i+_nmax]*xflag*yflag; + sharedmem[threadIdx.x+4*blockDim.x] = massone * _v[i]*_v[i+2*_nmax]*xflag*zflag; + sharedmem[threadIdx.x+5*blockDim.x] = massone * _v[i+_nmax]*_v[i+2*_nmax]*yflag*zflag; + } + reduceBlock(sharedmem); + reduceBlock(&sharedmem[blockDim.x]); + reduceBlock(&sharedmem[2*blockDim.x]); + 
reduceBlock(&sharedmem[3*blockDim.x]); + reduceBlock(&sharedmem[4*blockDim.x]); + reduceBlock(&sharedmem[5*blockDim.x]); + ENERGY_FLOAT* buffer=(ENERGY_FLOAT*) _buffer; + if(threadIdx.x==0) + { + buffer[blockIdx.x]=sharedmem[0]; + buffer[blockIdx.x+gridDim.x]=sharedmem[blockDim.x]; + buffer[blockIdx.x+2*gridDim.x]=sharedmem[2*blockDim.x]; + buffer[blockIdx.x+3*gridDim.x]=sharedmem[3*blockDim.x]; + buffer[blockIdx.x+4*gridDim.x]=sharedmem[4*blockDim.x]; + buffer[blockIdx.x+5*gridDim.x]=sharedmem[5*blockDim.x]; + } +} + + +__global__ void Cuda_ComputeTempPartialCuda_Reduce_Kernel(int n,ENERGY_FLOAT* t) +{ + int i=0; + sharedmem[threadIdx.x]=0; + ENERGY_FLOAT myforig=0.0; + ENERGY_FLOAT* buf=(ENERGY_FLOAT*) _buffer; + buf=&buf[blockIdx.x*n]; + while(i (b) ? (a) : (b)) + +inline int3 getgrid(int n,int shared_per_thread=0,int threadsmax=256, bool p2=false) +{ + int3 gridparams; + int sharedsize=16000; + if(shared_per_thread>0) threadsmax= sharedsize/shared_per_thread10000) + gridparams.x=gridparams.y=int(sqrt(blocks)); + else + {gridparams.x=blocks; gridparams.y=1;} + while(gridparams.x*gridparams.y*gridparams.z>31; +} + +//return value: -1 if f<0; else +1 +static inline __device__ float fsignCUDA(float f) +{ + return f<0.0f?-1.0f:1.0f; +} + +//functions to copy data between global and shared memory (indeed you can copy data between two arbitrary memory regims on device - as long as you have read respectively write rights) +//blockDim.y and blockDim.z are assumed to be 1 +static inline __device__ void copySharedToGlob(int* shared, int* glob,const int& n) +{ + int i,k; + k=n-blockDim.x; + for(i=0;i t, int i) +{ + int2 v = tex1Dfetch(t,i); + return __hiloint2double(v.y, v.x); +} + +static __device__ inline X_FLOAT4 tex1Dfetch_double(texture t, int i) +{ + int4 v = tex1Dfetch(t,2*i); + int4 u = tex1Dfetch(t,2*i+1); + X_FLOAT4 w; + + w.x= __hiloint2double(v.y, v.x); + w.y= __hiloint2double(v.w, v.z); + w.z= __hiloint2double(u.y, u.x); + w.w= __hiloint2double(u.w, u.z); + 
return w; +} +#endif + +inline void BindXTypeTexture(cuda_shared_data* sdata) +{ + #ifdef CUDA_USE_TEXTURE + _x_type_tex.normalized = false; // access with normalized texture coordinates + _x_type_tex.filterMode = cudaFilterModePoint; // Point mode, so no + _x_type_tex.addressMode[0] = cudaAddressModeWrap; // wrap texture coordinates + const textureReference* x_type_texture_ptr; + cudaGetTextureReference(&x_type_texture_ptr, MY_CONST(x_type_tex)); + + #if X_PRECISION == 1 + cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc(); + cudaBindTexture(0,x_type_texture_ptr, sdata->atom.x_type.dev_data, &channelDescXType, sdata->atom.nmax*sizeof(X_FLOAT4)); + #else + cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc(); + cudaBindTexture(0,x_type_texture_ptr, sdata->atom.x_type.dev_data, &channelDescXType, sdata->atom.nmax*2*sizeof(int4)); + #endif + #endif +} + +static __device__ inline X_FLOAT4 fetchXType(int i) +{ + #ifdef CUDA_USE_TEXTURE + #if X_PRECISION == 1 + return tex1Dfetch(_x_type_tex,i); + #else + return tex1Dfetch_double(_x_type_tex,i); + #endif + #else + return _x_type[i]; + #endif +} + +#if V_PRECISION == 2 +static __device__ inline double tex1Dfetch_double_v(texture t, int i) +{ + int2 v = tex1Dfetch(t,i); + return __hiloint2double(v.y, v.x); +} + +static __device__ inline V_FLOAT4 tex1Dfetch_double_v(texture t, int i) +{ + int4 v = tex1Dfetch(t,2*i); + int4 u = tex1Dfetch(t,2*i+1); + V_FLOAT4 w; + + w.x= __hiloint2double(v.y, v.x); + w.y= __hiloint2double(v.w, v.z); + w.z= __hiloint2double(u.y, u.x); + w.w= __hiloint2double(u.w, u.z); + return w; +} +#endif + +inline void BindVRadiusTexture(cuda_shared_data* sdata) +{ + #ifdef CUDA_USE_TEXTURE + _v_radius_tex.normalized = false; // access with normalized texture coordinates + _v_radius_tex.filterMode = cudaFilterModePoint; // Point mode, so no + _v_radius_tex.addressMode[0] = cudaAddressModeWrap; // wrap texture coordinates + const textureReference* v_radius_texture_ptr; + 
cudaGetTextureReference(&v_radius_texture_ptr, MY_CONST(v_radius_tex)); + + #if V_PRECISION == 1 + cudaChannelFormatDesc channelDescVRadius = cudaCreateChannelDesc(); + cudaBindTexture(0,v_radius_texture_ptr, sdata->atom.v_radius.dev_data, &channelDescVRadius, sdata->atom.nmax*sizeof(X_FLOAT4)); + #else + cudaChannelFormatDesc channelDescVRadius = cudaCreateChannelDesc(); + cudaBindTexture(0,v_radius_texture_ptr, sdata->atom.v_radius.dev_data, &channelDescVRadius, sdata->atom.nmax*2*sizeof(int4)); + #endif + #endif +} + +static __device__ inline V_FLOAT4 fetchVRadius(int i) +{ + #ifdef CUDA_USE_TEXTURE + #if V_PRECISION == 1 + return tex1Dfetch(_v_radius_tex,i); + #else + return tex1Dfetch_double_v(_v_radius_tex,i); + #endif + #else + return _v_radius[i]; + #endif +} + +inline void BindOmegaRmassTexture(cuda_shared_data* sdata) +{ + #ifdef CUDA_USE_TEXTURE + _omega_rmass_tex.normalized = false; // access with normalized texture coordinates + _omega_rmass_tex.filterMode = cudaFilterModePoint; // Point mode, so no + _omega_rmass_tex.addressMode[0] = cudaAddressModeWrap; // wrap texture coordinates + const textureReference* omega_rmass_texture_ptr; + cudaGetTextureReference(&omega_rmass_texture_ptr, MY_CONST(omega_rmass_tex)); + + #if V_PRECISION == 1 + cudaChannelFormatDesc channelDescOmegaRmass = cudaCreateChannelDesc(); + cudaBindTexture(0,omega_rmass_texture_ptr, sdata->atom.omega_rmass.dev_data, &channelDescOmegaRmass, sdata->atom.nmax*sizeof(X_FLOAT4)); + #else + cudaChannelFormatDesc channelDescOmegaRmass = cudaCreateChannelDesc(); + cudaBindTexture(0,omega_rmass_texture_ptr, sdata->atom.omega_rmass.dev_data, &channelDescOmegaRmass, sdata->atom.nmax*2*sizeof(int4)); + #endif + #endif +} + +static __device__ inline V_FLOAT4 fetchOmegaRmass(int i) +{ + #ifdef CUDA_USE_TEXTURE + #if V_PRECISION == 1 + return tex1Dfetch(_omega_rmass_tex,i); + #else + return tex1Dfetch_double_v(_omega_rmass_tex,i); + #endif + #else + return _omega_rmass[i]; + #endif +} + +#if 
F_PRECISION == 2 +static __device__ inline double tex1Dfetch_double_f(texture t, int i) +{ + int2 v = tex1Dfetch(t,i); + return __hiloint2double(v.y, v.x); +} + +static __device__ inline F_FLOAT4 tex1Dfetch_double_f(texture t, int i) +{ + int4 v = tex1Dfetch(t,2*i); + int4 u = tex1Dfetch(t,2*i+1); + F_FLOAT4 w; + + w.x= __hiloint2double(v.y, v.x); + w.y= __hiloint2double(v.w, v.z); + w.z= __hiloint2double(u.y, u.x); + w.w= __hiloint2double(u.w, u.z); + return w; +} +#endif + +inline void BindQTexture(cuda_shared_data* sdata) +{ + #ifdef CUDA_USE_TEXTURE + _q_tex.normalized = false; // access with normalized texture coordinates + _q_tex.filterMode = cudaFilterModePoint; // Point mode, so no + _q_tex.addressMode[0] = cudaAddressModeWrap; // wrap texture coordinates + const textureReference* q_texture_ptr; + cudaGetTextureReference(&q_texture_ptr, MY_CONST(q_tex)); + + #if F_PRECISION == 1 + cudaChannelFormatDesc channelDescQ = cudaCreateChannelDesc(); + cudaBindTexture(0,q_texture_ptr, sdata->atom.q.dev_data, &channelDescQ, sdata->atom.nmax*sizeof(F_FLOAT)); + #else + cudaChannelFormatDesc channelDescQ = cudaCreateChannelDesc(); + cudaBindTexture(0,q_texture_ptr, sdata->atom.q.dev_data, &channelDescQ, sdata->atom.nmax*sizeof(int2)); + #endif + #endif +} + +static __device__ inline F_FLOAT fetchQ(int i) +{ + #ifdef CUDA_USE_TEXTURE + #if F_PRECISION == 1 + return tex1Dfetch(_q_tex,i); + #else + return tex1Dfetch_double_f(_q_tex,i); + #endif + #else + return _q[i]; + #endif +} + +#endif + +/* + +inline void BindPairCoeffTypeTexture(cuda_shared_data* sdata,coeff_tex) +{ + #ifdef CUDA_USE_TEXTURE + _coeff_tex.normalized = false; // access with normalized texture coordinates + _coeff_tex.filterMode = cudaFilterModePoint; // Point mode, so no + _coeff_tex.addressMode[0] = cudaAddressModeWrap; // wrap texture coordinates + const textureReference* coeff_texture_ptr; + cudaGetTextureReference(&coeff_texture_ptr, MY_CONST(coeff_tex)); + + #if F_PRECISION == 1 + 
cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc(); + cudaBindTexture(0,x_type_texture_ptr, sdata->atom.x_type.dev_data, &channelDescXType, sdata->atom.nmax*sizeof(X_FLOAT4)); + #else + cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc(); + cudaBindTexture(0,x_type_texture_ptr, sdata->atom.x_type.dev_data, &channelDescXType, sdata->atom.nmax*2*sizeof(int4)); + #endif + #endif +} + +static __device__ inline X_FLOAT4 fetchXType(int i) +{ + #ifdef CUDA_USE_TEXTURE + #if X_PRECISION == 1 + return tex1Dfetch(_x_type_tex,i); + #else + return tex1Dfetch_double(_x_type_tex,i); + #endif + #else + return _x_type[i]; + #endif +} +*/ diff --git a/lib/cuda/cuda.cu b/lib/cuda/cuda.cu new file mode 100644 index 0000000000..1fc4dc4a41 --- /dev/null +++ b/lib/cuda/cuda.cu @@ -0,0 +1,22 @@ +#include "cuda_precision.h" +#include "cuda_shared.h" +#include "cuda_cu.h" + +void Cuda_Cuda_GetCompileSettings(cuda_shared_data* sdata) +{ + sdata->compile_settings.prec_glob=sizeof(CUDA_FLOAT)/4; + sdata->compile_settings.prec_x=sizeof(X_FLOAT)/4; + sdata->compile_settings.prec_v=sizeof(V_FLOAT)/4; + sdata->compile_settings.prec_f=sizeof(F_FLOAT)/4; + sdata->compile_settings.prec_pppm=sizeof(PPPM_FLOAT)/4; + sdata->compile_settings.prec_fft=sizeof(FFT_FLOAT)/4; + + #ifdef FFT_CUFFT + sdata->compile_settings.cufft=1; + #else + sdata->compile_settings.cufft=0; + #endif + + sdata->compile_settings.arch=CUDA_ARCH; + +} diff --git a/lib/cuda/cuda_cu.h b/lib/cuda/cuda_cu.h new file mode 100644 index 0000000000..48498b8d0f --- /dev/null +++ b/lib/cuda/cuda_cu.h @@ -0,0 +1 @@ +extern "C" void Cuda_Cuda_GetCompileSettings(cuda_shared_data* sdata); diff --git a/lib/cuda/cuda_data.cu b/lib/cuda/cuda_data.cu new file mode 100644 index 0000000000..327cbd9014 --- /dev/null +++ b/lib/cuda/cuda_data.cu @@ -0,0 +1,168 @@ +enum copy_mode {x, xx, xy, yx, xyz, xzy}; // yxz, yzx, zxy, zyx not yet implemented since they were not needed yet + +#include "cuda_data_cu.h" +#include 
"cuda_wrapper_cu.h" +#include "cuda_data_kernel.cu" +#include + +void CudaData_Upload_DoubleFloat(void* host_data,void* dev_data, unsigned* n,copy_mode mode,void* buffer) +{ + int size=n[0]; + if(n[1]>0) size*=n[1]; + if(n[2]>0) size*=n[2]; + + dim3 threads; threads.x=1; threads.y=1; threads.z=1; + dim3 grid; grid.x=1; grid.y=1; grid.z=1; + + if(size<=128*30) + threads.x=32; + else if(size<=256*30) + threads.x=64; + else if(size<=512*30) + threads.x=128; + else + threads.x=256; + + grid.x=((size-1)+threads.x)/threads.x; + if(grid.x>32000) + grid.x=32000; + while(grid.x*grid.y*threads.x>>((double*)buffer,(float*)dev_data,n[0],n[1],n[2],mode); + cudaThreadSynchronize(); + CudaWrapper_DownloadCudaData(debugdata, dev_data, size/2); + double sum=0; + printf("debugdata: "); + for(int i=0;i0) size*=n[1]; + if(n[2]>0) size*=n[2]; + + dim3 threads; threads.x=1; threads.y=1; threads.z=1; + dim3 grid; grid.x=1; grid.y=1; grid.z=1; + + if(size<=128*30) + threads.x=32; + else if(size<=256*30) + threads.x=64; + else if(size<=512*30) + threads.x=128; + else + threads.x=256; + + grid.x=((size-1)+threads.x)/threads.x; + if(grid.x>32000) + grid.x=32000; + while(grid.x*grid.y*threads.x>>((double*)buffer,(double*)dev_data,n[0],n[1],n[2],mode); + cudaThreadSynchronize(); +} + +void CudaData_Upload_FloatDouble(void* host_data,void* dev_data, unsigned* n,copy_mode mode,void* buffer) +{ + int size=n[0]; + if(n[1]>0) size*=n[1]; + if(n[2]>0) size*=n[2]; + + dim3 threads; threads.x=1; threads.y=1; threads.z=1; + dim3 grid; grid.x=1; grid.y=1; grid.z=1; + + if(size<=128*30) + threads.x=32; + else if(size<=256*30) + threads.x=64; + else if(size<=512*30) + threads.x=128; + else + threads.x=256; + + grid.x=((size-1)+threads.x)/threads.x; + if(grid.x>32000) + grid.x=32000; + while(grid.x*grid.y*threads.x>>((float*)buffer,(double*)dev_data,n[0],n[1],n[2],mode); + cudaThreadSynchronize(); +} + +void CudaData_Upload_FloatFloat(void* host_data,void* dev_data, unsigned* n,copy_mode mode,void* buffer) 
+{ + int size=n[0]; + if(n[1]>0) size*=n[1]; + if(n[2]>0) size*=n[2]; + + dim3 threads; threads.x=1; threads.y=1; threads.z=1; + dim3 grid; grid.x=1; grid.y=1; grid.z=1; + + if(size<=128*30) + threads.x=32; + else if(size<=256*30) + threads.x=64; + else if(size<=512*30) + threads.x=128; + else + threads.x=256; + + grid.x=((size-1)+threads.x)/threads.x; + if(grid.x>32000) + grid.x=32000; + while(grid.x*grid.y*threads.x>>((float*)buffer,(float*)dev_data,n[0],n[1],n[2],mode); + cudaThreadSynchronize(); +} + +void CudaData_Upload_IntInt(void* host_data,void* dev_data, unsigned* n,copy_mode mode,void* buffer) +{ + int size=n[0]; + if(n[1]>0) size*=n[1]; + if(n[2]>0) size*=n[2]; + + dim3 threads; threads.x=1; threads.y=1; threads.z=1; + dim3 grid; grid.x=1; grid.y=1; grid.z=1; + + if(size<=128*30) + threads.x=32; + else if(size<=256*30) + threads.x=64; + else if(size<=512*30) + threads.x=128; + else + threads.x=256; + + grid.x=((size-1)+threads.x)/threads.x; + if(grid.x>32000) + grid.x=32000; + while(grid.x*grid.y*threads.x>>((int*)buffer,(int*)dev_data,n[0],n[1],n[2],mode); + cudaThreadSynchronize(); +} + +void CudaData_Download(void* host_data,void* dev_data,int host_size, int dev_size, unsigned* n,copy_mode mode,void* buffer) +{ +} diff --git a/lib/cuda/cuda_data_cu.h b/lib/cuda/cuda_data_cu.h new file mode 100644 index 0000000000..e323b30429 --- /dev/null +++ b/lib/cuda/cuda_data_cu.h @@ -0,0 +1,13 @@ +#ifndef CUDA_DATA_CU_H_ +#define CUDA_DATA_CU_H_ + +extern "C" void CudaData_Upload_DoubleFloat(void* host_data,void* dev_data, unsigned* n,copy_mode mode,void* buffer); +extern "C" void CudaData_Upload_DoubleDouble(void* host_data,void* dev_data, unsigned* n,copy_mode mode,void* buffer); +extern "C" void CudaData_Upload_FloatDouble(void* host_data,void* dev_data, unsigned* n,copy_mode mode,void* buffer); +extern "C" void CudaData_Upload_FloatFloat(void* host_data,void* dev_data, unsigned* n,copy_mode mode,void* buffer); +extern "C" void CudaData_Upload_IntInt(void* 
host_data,void* dev_data, unsigned* n,copy_mode mode,void* buffer);

extern "C" void CudaData_Download(void* host_data,void* dev_data,int host_size, int dev_size, unsigned* n,copy_mode mode,void* buffer);


#endif /*CUDA_DATA_CU_H_*/
diff --git a/lib/cuda/cuda_data_kernel.cu b/lib/cuda/cuda_data_kernel.cu
new file mode 100644
index 0000000000..831b7b08bb
--- /dev/null
+++ b/lib/cuda/cuda_data_kernel.cu
@@ -0,0 +1,156 @@
// Reorder/convert a staged host array (double) into the device array (float).
// One thread per element. 'mode' selects the layout transform:
//   xx/xy/xyz : plain element-wise copy (i -> i)
//   yx        : transpose of a 2D nx*ny array
//   xzy       : swap of the two minor dimensions of a 3D nx*ny*nz array
// Launch layout: (grid.x*grid.y) blocks of blockDim.x threads (see cuda_data.cu).
// FIX(review): every case below was missing 'break;', so e.g. mode==xx fell
// through into the yx transpose (wrong writes, and a device division by ny
// which may be 0). The cases are mutually exclusive; breaks added to all
// five kernels in this file.
__global__ void CudaData_Upload_Kernel_DoubleFloat(double* buffer,float* dev_data,
                                                   unsigned nx,unsigned ny,unsigned nz,copy_mode mode)
{
  if(mode==x) mode=xx;
  unsigned length=nx;
  if(ny>0) length*=ny;
  if(nz>0) length*=nz;
  unsigned i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x,j,k,l;

  if(i>=length) return;
  switch(mode)
  {
    case xx:
      dev_data[i]=buffer[i];
      break;
    case xy:
      dev_data[i]=buffer[i];
      break;
    case yx:
      j=i/ny;
      k=i%ny;
      dev_data[k*nx+j]=buffer[j*ny+k];
      break;
    case xyz:
      dev_data[i]=buffer[i];
      break;
    case xzy:
      j=i/(ny*nz);
      k=(i%(ny*nz))/nz;
      l=i%nz;
      dev_data[j*ny*nz+l*ny+k]=buffer[j*ny*nz+k*nz+l];
      break;
  }
}

// Same layout transform as the DoubleFloat kernel, double -> double.
__global__ void CudaData_Upload_Kernel_DoubleDouble(double* buffer,double* dev_data,
                                                    unsigned nx,unsigned ny,unsigned nz,copy_mode mode)
{
  if(mode==x) mode=xx;
  unsigned length=nx;
  if(ny>0) length*=ny;
  if(nz>0) length*=nz;
  unsigned i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x,j,k,l;
  if(i>=length) return;
  switch(mode)
  {
    case xx:
      dev_data[i]=buffer[i];
      break;
    case xy:
      dev_data[i]=buffer[i];
      break;
    case yx:
      j=i/ny;
      k=i%ny;
      dev_data[k*nx+j]=buffer[j*ny+k];
      break;
    case xyz:
      dev_data[i]=buffer[i];
      break;
    case xzy:
      j=i/(ny*nz);
      k=(i%(ny*nz))/nz;
      l=i%nz;
      dev_data[j*ny*nz+l*ny+k]=buffer[j*ny*nz+k*nz+l];
      break;
  }
}

// Same layout transform as the DoubleFloat kernel, float -> double.
__global__ void CudaData_Upload_Kernel_FloatDouble(float* buffer,double* dev_data,
                                                   unsigned nx,unsigned ny,unsigned nz,copy_mode mode)
{
  if(mode==x) mode=xx;
  unsigned length=nx;
  if(ny>0) length*=ny;
  if(nz>0) length*=nz;
  unsigned i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x,j,k,l;
  if(i>=length) return;
  switch(mode)
  {
    case xx:
      dev_data[i]=buffer[i];
      break;
    case xy:
      dev_data[i]=buffer[i];
      break;
    case yx:
      j=i/ny;
      k=i%ny;
      dev_data[k*nx+j]=buffer[j*ny+k];
      break;
    case xyz:
      dev_data[i]=buffer[i];
      break;
    case xzy:
      j=i/(ny*nz);
      k=(i%(ny*nz))/nz;
      l=i%nz;
      dev_data[j*ny*nz+l*ny+k]=buffer[j*ny*nz+k*nz+l];
      break;
  }
}

// Same layout transform as the DoubleFloat kernel, float -> float.
__global__ void CudaData_Upload_Kernel_FloatFloat(float* buffer,float* dev_data,
                                                  unsigned nx,unsigned ny,unsigned nz,copy_mode mode)
{
  if(mode==x) mode=xx;
  unsigned length=nx;
  if(ny>0) length*=ny;
  if(nz>0) length*=nz;
  unsigned i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x,j,k,l;
  if(i>=length) return;
  switch(mode)
  {
    case xx:
      dev_data[i]=buffer[i];
      break;
    case xy:
      dev_data[i]=buffer[i];
      break;
    case yx:
      j=i/ny;
      k=i%ny;
      dev_data[k*nx+j]=buffer[j*ny+k];
      break;
    case xyz:
      dev_data[i]=buffer[i];
      break;
    case xzy:
      j=i/(ny*nz);
      k=(i%(ny*nz))/nz;
      l=i%nz;
      dev_data[j*ny*nz+l*ny+k]=buffer[j*ny*nz+k*nz+l];
      break;
  }
}

// Same layout transform as the DoubleFloat kernel, int -> int.
__global__ void CudaData_Upload_Kernel_IntInt(int* buffer,int* dev_data,
                                              unsigned nx,unsigned ny,unsigned nz,copy_mode mode)
{
  if(mode==x) mode=xx;
  unsigned length=nx;
  if(ny>0) length*=ny;
  if(nz>0) length*=nz;
  unsigned i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x,j,k,l;
  if(i>=length) return;
  switch(mode)
  {
    case xx:
      dev_data[i]=buffer[i];
      break;
    case xy:
      dev_data[i]=buffer[i];
      break;
    case yx:
      j=i/ny;
      k=i%ny;
      dev_data[k*nx+j]=buffer[j*ny+k];
      break;
    case xyz:
      dev_data[i]=buffer[i];
      break;
    case xzy:
      j=i/(ny*nz);
      k=(i%(ny*nz))/nz;
      l=i%nz;
      dev_data[j*ny*nz+l*ny+k]=buffer[j*ny*nz+k*nz+l];
      break;
  }
}
diff --git a/lib/cuda/cuda_kernel.cu b/lib/cuda/cuda_kernel.cu
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/lib/cuda/cuda_pair.cu b/lib/cuda/cuda_pair.cu
new file mode 100644
index 0000000000..531db7e2b3
--- /dev/null
+++ b/lib/cuda/cuda_pair.cu
@@ -0,0 +1,1000 @@
/*
---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +enum PAIR_FORCES {PAIR_NONE,PAIR_BORN,PAIR_BUCK,PAIR_CG_CMM,PAIR_LJ_CHARMM,PAIR_LJ_CLASS2,PAIR_LJ_CUT, PAIR_LJ_EXPAND, PAIR_LJ_GROMACS, PAIR_LJ_SMOOTH, PAIR_LJ96_CUT, PAIR_MORSE, PAIR_MORSE_R6}; +enum COUL_FORCES {COUL_NONE,COUL_CHARMM,COUL_CHARMM_IMPLICIT,COUL_CUT,COUL_LONG, COUL_DEBYE, COUL_GROMACS,COUL_SPECIAL}; +#define DATA_NONE 0 +#define DATA_V 1 +#define DATA_TAG 2 +#define DATA_RMASS 4 +#define DATA_MASS 8 +#define DATA_TORQUE 16 +#define DATA_OMEGA 32 +#define DATA_RADIUS 64 +#define DATA_DENSITY 128 +#define DATA_MASK 256 +#define DATA_V_RADIUS 512 +#define DATA_OMEGA_RMASS 1024 + +#define MY_PREFIX cuda_pair +#define IncludeCommonNeigh +#include "cuda_shared.h" +#include "cuda_common.h" +#include "cuda_wrapper_cu.h" +#include "crm_cuda_utils.cu" + +//constants used by multiple forces + +//general +#define _cutsq MY_AP(cutsq) +#define _offset MY_AP(offset) +#define _special_lj MY_AP(special_lj) +#define _special_coul MY_AP(special_coul) +#define _cutsq_global MY_AP(cutsq_global) +#define _collect_forces_later MY_AP(collect_forces_later) + +__device__ __constant__ X_FLOAT _cutsq[CUDA_MAX_TYPES2]; +__device__ 
__constant__ ENERGY_FLOAT _offset[CUDA_MAX_TYPES2]; +__device__ __constant__ F_FLOAT _special_lj[4]; +__device__ __constant__ F_FLOAT _special_coul[4]; +__device__ __constant__ X_FLOAT _cutsq_global; +__device__ __constant__ int _collect_forces_later; + +__device__ __constant__ F_FLOAT MY_AP(coeff1)[CUDA_MAX_TYPES2]; //pair force coefficients in case ntypes < CUDA_MAX_TYPES (coeffs fit into constant space) +__device__ __constant__ F_FLOAT MY_AP(coeff2)[CUDA_MAX_TYPES2]; +__device__ __constant__ F_FLOAT MY_AP(coeff3)[CUDA_MAX_TYPES2]; +__device__ __constant__ F_FLOAT MY_AP(coeff4)[CUDA_MAX_TYPES2]; +__device__ __constant__ F_FLOAT MY_AP(coeff5)[CUDA_MAX_TYPES2]; + + +__device__ __constant__ F_FLOAT* MY_AP(coeff1_gm); //pair force coefficients in case ntypes > CUDA_MAX_TYPES (coeffs do not fit into constant space) +__device__ __constant__ F_FLOAT* MY_AP(coeff2_gm); +__device__ __constant__ F_FLOAT* MY_AP(coeff3_gm); +__device__ __constant__ F_FLOAT* MY_AP(coeff4_gm); +__device__ __constant__ F_FLOAT* MY_AP(coeff5_gm); +__device__ __constant__ F_FLOAT* MY_AP(coeff6_gm); +__device__ __constant__ F_FLOAT* MY_AP(coeff7_gm); +__device__ __constant__ F_FLOAT* MY_AP(coeff8_gm); +__device__ __constant__ F_FLOAT* MY_AP(coeff9_gm); +__device__ __constant__ F_FLOAT* MY_AP(coeff10_gm); + + #define _coeff1_gm_tex MY_AP(coeff1_gm_tex) + #if F_PRECISION == 1 + texture _coeff1_gm_tex; + #else + texture _coeff1_gm_tex; + #endif + + #define _coeff2_gm_tex MY_AP(coeff2_gm_tex) + #if F_PRECISION == 1 + texture _coeff2_gm_tex; + #else + texture _coeff2_gm_tex; + #endif + + #define _coeff3_gm_tex MY_AP(coeff3_gm_tex) + #if F_PRECISION == 1 + texture _coeff3_gm_tex; + #else + texture _coeff3_gm_tex; + #endif + + #define _coeff4_gm_tex MY_AP(coeff4_gm_tex) + #if F_PRECISION == 1 + texture _coeff4_gm_tex; + #else + texture _coeff4_gm_tex; + #endif + + #define _coeff5_gm_tex MY_AP(coeff5_gm_tex) + #if F_PRECISION == 1 + texture _coeff5_gm_tex; + #else + texture _coeff5_gm_tex; + #endif + + 
#define _coeff6_gm_tex MY_AP(coeff6_gm_tex) + #if F_PRECISION == 1 + texture _coeff6_gm_tex; + #else + texture _coeff6_gm_tex; + #endif + + #define _coeff7_gm_tex MY_AP(coeff7_gm_tex) + #if F_PRECISION == 1 + texture _coeff7_gm_tex; + #else + texture _coeff7_gm_tex; + #endif + + #define _coeff8_gm_tex MY_AP(coeff8_gm_tex) + #if F_PRECISION == 1 + texture _coeff8_gm_tex; + #else + texture _coeff8_gm_tex; + #endif + + #define _coeff9_gm_tex MY_AP(coeff9_gm_tex) + #if F_PRECISION == 1 + texture _coeff9_gm_tex; + #else + texture _coeff9_gm_tex; + #endif + + #define _coeff10_gm_tex MY_AP(coeff10_gm_tex) + #if F_PRECISION == 1 + texture _coeff10_gm_tex; + #else + texture _coeff10_gm_tex; + #endif + +//if more than 5 coefficients are needed for a pair potential add them here + + +//coulomb +#define _cut_coulsq MY_AP(cut_coulsq) +#define _cut_coulsq_global MY_AP(cut_coulsq_global) +#define _g_ewald MY_AP(g_ewald) +#define _qqrd2e MY_AP(qqrd2e) +#define _kappa MY_AP(kappa) +__device__ __constant__ X_FLOAT _cut_coulsq[CUDA_MAX_TYPES2]; +__device__ __constant__ X_FLOAT _cut_coulsq_global; +__device__ __constant__ F_FLOAT _g_ewald; +__device__ __constant__ F_FLOAT _qqrd2e; +__device__ __constant__ F_FLOAT _kappa; + +//inner cutoff +#define _cut_innersq MY_AP(cut_innersq) +#define _cut_innersq_global MY_AP(cut_innersq_global) +__device__ __constant__ X_FLOAT _cut_innersq[CUDA_MAX_TYPES2]; +__device__ __constant__ X_FLOAT _cut_innersq_global; + + +template +__global__ void Pair_Kernel_TpA(int eflag, int vflag,int eflag_atom,int vflag_atom); + +template +__global__ void Pair_Kernel_BpA(int eflag, int vflag,int eflag_atom,int vflag_atom); + +template +__global__ void Pair_Kernel_TpA_opt(int eflag, int vflag,int eflag_atom,int vflag_atom, int comm_phase); + +template +__global__ void Pair_Kernel_BpA_opt(int eflag, int vflag,int eflag_atom,int vflag_atom, int comm_phase); + +#include +#include "cuda_pair_cu.h" +#include "cuda_pair_virial_kernel_nc.cu" + +//Functions which are shared 
// by pair styles

//Update Buffersize
// Grow the shared device scratch buffer (used to collect per-block energy/virial
// partials) to at least 'size' bytes; only reallocates when too small, and then
// publishes the new pointer to the device constant MY_CONST(buffer).
void Cuda_UpdateBuffer(cuda_shared_data* sdata,int size)
{
  CUT_CHECK_ERROR("Cuda_Pair_UpdateBuffer_AllStyles: before updateBuffer failed");
  // NOTE(review): the comparison and the first MYDBG printf were reconstructed --
  // the patch extraction dropped the '<...>' span of this line. Confirm against
  // the original lib/cuda/cuda_pair.cu.
  if(sdata->buffersize<size)
  {
    MYDBG(printf("Resize buffer at %p with %i kB\n",sdata->buffer,sdata->buffersize);)
    CudaWrapper_FreeCudaData(sdata->buffer,sdata->buffersize);
    sdata->buffer = CudaWrapper_AllocCudaData(size);
    sdata->buffersize=size;
    sdata->buffer_new++;
    MYDBG(printf("New buffer at %p with %i kB\n",sdata->buffer,sdata->buffersize);)
  }
  cudaMemcpyToSymbol(MY_CONST(buffer), & sdata->buffer, sizeof(int*) );
  CUT_CHECK_ERROR("Cuda_Pair_UpdateBuffer_AllStyles failed");
}

//Update constants after nmax change which are generally needed by all pair styles
// Pushes the current neighbor-list and per-atom device pointers into the
// module's __constant__ symbols (they become stale whenever nmax changes).
void Cuda_Pair_UpdateNmax_AllStyles(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist)
{
  CUT_CHECK_ERROR("Cuda_Pair_UpdateNmax_AllStyles: Begin");
  //Neighbor
  cudaMemcpyToSymbol(MY_CONST(neighbor_maxlocal) , & sneighlist->firstneigh.dim[0] , sizeof(unsigned) );
  cudaMemcpyToSymbol(MY_CONST(firstneigh)        , & sneighlist->firstneigh.dev_data, sizeof(int*)    );
  cudaMemcpyToSymbol(MY_CONST(ilist)             , & sneighlist->ilist     .dev_data, sizeof(int*)    );
  cudaMemcpyToSymbol(MY_CONST(inum)              , & sneighlist->inum               , sizeof(int)     );
  cudaMemcpyToSymbol(MY_CONST(numneigh)          , & sneighlist->numneigh  .dev_data, sizeof(int*)    );
  cudaMemcpyToSymbol(MY_CONST(neighbors)         , & sneighlist->neighbors .dev_data, sizeof(int*)    );
  cudaMemcpyToSymbol(MY_CONST(maxneighbors)      , & sneighlist->maxneighbors       , sizeof(int)     );
  cudaMemcpyToSymbol(MY_CONST(overlap_comm)      , & sdata->overlap_comm            , sizeof(int)     );

  // extra border/inner lists only exist when communication is overlapped with compute
  if(sdata->overlap_comm)
  {
    cudaMemcpyToSymbol(MY_CONST(numneigh_border)  , & sneighlist->numneigh_border .dev_data, sizeof(int*));
    cudaMemcpyToSymbol(MY_CONST(numneigh_inner)   , & sneighlist->numneigh_inner  .dev_data, sizeof(int*));
    cudaMemcpyToSymbol(MY_CONST(neighbors_border) , & sneighlist->neighbors_border.dev_data, sizeof(int*));
    cudaMemcpyToSymbol(MY_CONST(neighbors_inner)  , &
sneighlist->neighbors_inner .dev_data, sizeof(int*)); + cudaMemcpyToSymbol(MY_CONST(ilist_border) , & sneighlist->ilist_border .dev_data, sizeof(int*)); + cudaMemcpyToSymbol(MY_CONST(inum_border) , & sneighlist->inum_border .dev_data, sizeof(int*) ); + } + + //System + cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); + cudaMemcpyToSymbol(MY_CONST(nall) , & sdata->atom.nall , sizeof(int) ); + cudaMemcpyToSymbol(MY_CONST(nmax) , & sdata->atom.nmax , sizeof(int) ); + + //Atom + cudaMemcpyToSymbol(MY_CONST(x) , & sdata->atom.x .dev_data, sizeof(X_FLOAT*) ); + cudaMemcpyToSymbol(MY_CONST(x_type) , & sdata->atom.x_type .dev_data, sizeof(X_FLOAT4*) ); + cudaMemcpyToSymbol(MY_CONST(f) , & sdata->atom.f .dev_data, sizeof(F_FLOAT*) ); + cudaMemcpyToSymbol(MY_CONST(type) , & sdata->atom.type .dev_data, sizeof(int*) ); + cudaMemcpyToSymbol(MY_CONST(q) , & sdata->atom.q .dev_data, sizeof(F_FLOAT*) ); + cudaMemcpyToSymbol(MY_CONST(tag) , & sdata->atom.tag .dev_data, sizeof(int*) ); + cudaMemcpyToSymbol(MY_CONST(eatom) , & sdata->atom.eatom .dev_data, sizeof(ENERGY_FLOAT*) ); + cudaMemcpyToSymbol(MY_CONST(vatom) , & sdata->atom.vatom .dev_data, sizeof(ENERGY_FLOAT*) ); + + + //Other + cudaMemcpyToSymbol(MY_CONST(debugdata) , & sdata->debugdata , sizeof(int*) ); + CUT_CHECK_ERROR("Cuda_Pair_UpdateNmax_AllStyles: End"); +} + +//Initialisation of GPU Constants which rarely change +void Cuda_Pair_Init_AllStyles(cuda_shared_data* sdata, int ncoeff, bool need_q=false, bool use_global_params=false, bool need_innercut=false, bool need_cut=true ) +{ + unsigned cuda_ntypes = sdata->atom.ntypes + 1; + unsigned cuda_ntypes2 = cuda_ntypes * cuda_ntypes; + unsigned n = sizeof(F_FLOAT) * cuda_ntypes2; + unsigned nx = sizeof(X_FLOAT) * cuda_ntypes2; + + //check if enough constant memory is available + if((cuda_ntypes2 > CUDA_MAX_TYPES2 )&& !use_global_params) + printf("# CUDA: Cuda_Pair_Init: you need %u types. this is more than %u " + "(assumed at compile time). 
re-compile with -DCUDA_MAX_TYPES_PLUS_ONE=32 " + "or ajust this in cuda_common.h\n", cuda_ntypes, CUDA_MAX_TYPES_PLUS_ONE-1); + if((cuda_ntypes2 > CUDA_MAX_TYPES2 )&& !use_global_params) + exit(0); + //type conversion of cutoffs and parameters + if(need_cut) + { + X_FLOAT cutsq[cuda_ntypes2]; + for(int i=1; i<=sdata->atom.ntypes; ++i) + { + for(int j=1; j<=sdata->atom.ntypes; ++j) + { + cutsq[i * cuda_ntypes + j] = (X_FLOAT) (sdata->pair.cut_global * sdata->pair.cut_global); + } + } + + int cutsqdiffer=0; + X_FLOAT cutsq_global; + cutsq_global = (X_FLOAT) (sdata->pair.cut_global * sdata->pair.cut_global); + if(sdata->pair.cut) + { + for(int i=1; i<=sdata->atom.ntypes; ++i) + { + for(int j=1; j<=sdata->atom.ntypes; ++j) + { + if(sdata->pair.cut[i][j]>1e-6) + cutsq[i * cuda_ntypes + j] = (X_FLOAT) (sdata->pair.cut[i][j] * sdata->pair.cut[i][j]); + else + if(sdata->pair.cut[j][i]>1e-6) + cutsq[i * cuda_ntypes + j] = (X_FLOAT) (sdata->pair.cut[j][i] * sdata->pair.cut[j][i]); + if(i==1&&j==1) cutsq_global = cutsq[i * cuda_ntypes + j]; + if((cutsq_global - cutsq[i * cuda_ntypes + j])*(cutsq_global - cutsq[i * cuda_ntypes + j]) > 1e-6) + cutsqdiffer++; + } + } + } + + if(sdata->pair.cutsq) + { + for(int i=1; i<=sdata->atom.ntypes; ++i) + { + for(int j=1; j<=sdata->atom.ntypes; ++j) + { + if(sdata->pair.cut[i][j]>1e-6) + cutsq[i * cuda_ntypes + j] = (X_FLOAT) (sdata->pair.cutsq[i][j]); + else + if(sdata->pair.cut[j][i]>1e-6) + cutsq[i * cuda_ntypes + j] = (X_FLOAT) (sdata->pair.cutsq[j][i]); + if(i==1&&j==1) cutsq_global = cutsq[i * cuda_ntypes + j]; + if((cutsq_global - cutsq[i * cuda_ntypes + j])*(cutsq_global - cutsq[i * cuda_ntypes + j]) > 1e-6) + cutsqdiffer++; + } + } + } + + if(cutsqdiffer) + { + cutsq_global = -1.0; + cudaMemcpyToSymbol(MY_CONST(cutsq) , cutsq , nx ); + } + cudaMemcpyToSymbol(MY_CONST(cutsq_global) ,&cutsq_global , sizeof(X_FLOAT) ); + } + + if(need_innercut) + { + X_FLOAT cut_innersq[cuda_ntypes2]; + for(int i=1; i<=sdata->atom.ntypes; ++i) + { + 
for(int j=1; j<=sdata->atom.ntypes; ++j) + { + cut_innersq[i * cuda_ntypes + j] = (X_FLOAT) (sdata->pair.cut_inner_global * sdata->pair.cut_inner_global); + } + } + + int cutsqdiffer=0; + X_FLOAT cut_innersq_global; + cut_innersq_global = (X_FLOAT) (sdata->pair.cut_inner_global * sdata->pair.cut_inner_global); + if(sdata->pair.cut_inner) + { + for(int i=1; i<=sdata->atom.ntypes; ++i) + { + for(int j=1; j<=sdata->atom.ntypes; ++j) + { + if(sdata->pair.cut_inner[i][j]>1e-6) + cut_innersq[i * cuda_ntypes + j] = (X_FLOAT) (sdata->pair.cut_inner[i][j] * sdata->pair.cut_inner[i][j]); + else + if(sdata->pair.cut_inner[j][i]>1e-6) + cut_innersq[i * cuda_ntypes + j] = (X_FLOAT) (sdata->pair.cut_inner[j][i] * sdata->pair.cut_inner[j][i]); + if(i==1&&j==1) cut_innersq_global = cut_innersq[i * cuda_ntypes + j]; + if((cut_innersq_global - cut_innersq[i * cuda_ntypes + j])*(cut_innersq_global - cut_innersq[i * cuda_ntypes + j]) > 1e-6) + cutsqdiffer++; + } + } + } + if(cutsqdiffer) + { + cut_innersq_global = -1.0; + cudaMemcpyToSymbol(MY_CONST(cut_innersq) , cut_innersq , nx ); + } + cudaMemcpyToSymbol(MY_CONST(cut_innersq_global) ,&cut_innersq_global , sizeof(X_FLOAT) ); + } + + if(need_q) + { + X_FLOAT cut_coulsq[cuda_ntypes2]; + for(int i=1; i<=sdata->atom.ntypes; ++i) + { + for(int j=1; j<=sdata->atom.ntypes; ++j) + { + cut_coulsq[i * cuda_ntypes + j] = (X_FLOAT) (sdata->pair.cut_coul_global * sdata->pair.cut_coul_global); + } + } + + int cutsqdiffer=0; + X_FLOAT cut_coulsq_global; + cut_coulsq_global = (X_FLOAT) (sdata->pair.cut_coul_global * sdata->pair.cut_coul_global); + if(sdata->pair.cut_coulsq_global> cut_coulsq_global) cut_coulsq_global = (X_FLOAT) sdata->pair.cut_coulsq_global; + if(sdata->pair.cut_coul) + { + for(int i=1; i<=sdata->atom.ntypes; ++i) + { + for(int j=1; j<=sdata->atom.ntypes; ++j) + { + if(sdata->pair.cut_coul[i][j]>1e-6) + cut_coulsq[i * cuda_ntypes + j] = (X_FLOAT) (sdata->pair.cut_coul[i][j] * sdata->pair.cut_coul[i][j]); + else + 
if(sdata->pair.cut_coul[j][i]>1e-6) + cut_coulsq[i * cuda_ntypes + j] = (X_FLOAT) (sdata->pair.cut_coul[j][i] * sdata->pair.cut_coul[j][i]); + if(i==1&&j==1) cut_coulsq_global = cut_coulsq[i * cuda_ntypes + j]; + if((cut_coulsq_global - cut_coulsq[i * cuda_ntypes + j])*(cut_coulsq_global - cut_coulsq[i * cuda_ntypes + j]) > 1e-6) + cutsqdiffer++; + } + } + } + if(cutsqdiffer) + { + cut_coulsq_global = -1.0; + cudaMemcpyToSymbol(MY_CONST(cut_coulsq) , cut_coulsq , nx ); + } + cudaMemcpyToSymbol(MY_CONST(cut_coulsq_global),&cut_coulsq_global , sizeof(X_FLOAT) ); + } + CUT_CHECK_ERROR("Cuda_Pair: init pre Coeff failed"); + + if(ncoeff>0) + { + F_FLOAT coeff1[cuda_ntypes2]; + for(int i=1; i<=sdata->atom.ntypes; ++i) + { + for(int j=1; j<=sdata->atom.ntypes; ++j) + { + coeff1[i * cuda_ntypes + j] = (F_FLOAT) sdata->pair.coeff1[i][j]; + } + } + + if(use_global_params) + { + cudaMemcpyToSymbol(MY_CONST(coeff1_gm) , &sdata->pair.coeff1_gm.dev_data , sizeof(F_FLOAT*) ); + cudaMemcpy((sdata->pair.coeff1_gm.dev_data),coeff1, n,cudaMemcpyHostToDevice); + + _coeff1_gm_tex.normalized = false; // access with normalized texture coordinates + _coeff1_gm_tex.filterMode = cudaFilterModePoint; // Point mode, so no + _coeff1_gm_tex.addressMode[0] = cudaAddressModeWrap; // wrap texture coordinates + const textureReference* coeff1_gm_texture_ptr; + cudaGetTextureReference(&coeff1_gm_texture_ptr, MY_CONST(coeff1_gm_tex)); + CUT_CHECK_ERROR("Cuda_Pair: init Coeff0 a failed"); + + #if F_PRECISION == 1 + cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc(); + CUT_CHECK_ERROR("Cuda_Pair: init Coeff0 b failed"); + cudaBindTexture(0,coeff1_gm_texture_ptr, sdata->pair.coeff1_gm.dev_data, &channelDescXType, sdata->atom.nmax*sizeof(F_FLOAT)); + CUT_CHECK_ERROR("Cuda_Pair: init Coeff0 c failed"); + #else + cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc(); + CUT_CHECK_ERROR("Cuda_Pair: init Coeff0 b-d failed"); + cudaBindTexture(0,coeff1_gm_texture_ptr, 
sdata->pair.coeff1_gm.dev_data, &channelDescXType, sdata->atom.nmax*2*sizeof(int2)); + CUT_CHECK_ERROR("Cuda_Pair: init Coeff0 c-d failed"); + #endif + + } + else + cudaMemcpyToSymbol(MY_AP(coeff1), coeff1 , n); + } + CUT_CHECK_ERROR("Cuda_Pair: init Coeff0 failed"); + + if(ncoeff>1) + { + F_FLOAT coeff2[cuda_ntypes2]; + for(int i=1; i<=sdata->atom.ntypes; ++i) + { + for(int j=1; j<=sdata->atom.ntypes; ++j) + { + coeff2[i * cuda_ntypes + j] = (F_FLOAT) sdata->pair.coeff2[i][j]; + } + } + + if(use_global_params) + { + cudaMemcpyToSymbol(MY_CONST(coeff2_gm) , &sdata->pair.coeff2_gm.dev_data , sizeof(F_FLOAT*) ); + cudaMemcpy(sdata->pair.coeff2_gm.dev_data, coeff2, n,cudaMemcpyHostToDevice); + + _coeff2_gm_tex.normalized = false; // access with normalized texture coordinates + _coeff2_gm_tex.filterMode = cudaFilterModePoint; // Point mode, so no + _coeff2_gm_tex.addressMode[0] = cudaAddressModeWrap; // wrap texture coordinates + const textureReference* coeff2_gm_texture_ptr; + cudaGetTextureReference(&coeff2_gm_texture_ptr, MY_CONST(coeff2_gm_tex)); + + #if F_PRECISION == 1 + cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc(); + cudaBindTexture(0,coeff2_gm_texture_ptr, sdata->pair.coeff2_gm.dev_data, &channelDescXType, sdata->atom.nmax*sizeof(F_FLOAT)); + #else + cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc(); + cudaBindTexture(0,coeff2_gm_texture_ptr, sdata->pair.coeff2_gm.dev_data, &channelDescXType, sdata->atom.nmax*2*sizeof(int2)); + #endif + + } + else + cudaMemcpyToSymbol(MY_AP(coeff2), coeff2 , n); + } + CUT_CHECK_ERROR("Cuda_Pair: init Coeff1 failed"); + + if(ncoeff>2) + { + F_FLOAT coeff3[cuda_ntypes2]; + for(int i=1; i<=sdata->atom.ntypes; ++i) + { + for(int j=1; j<=sdata->atom.ntypes; ++j) + { + coeff3[i * cuda_ntypes + j] = (F_FLOAT) sdata->pair.coeff3[i][j]; + } + } + + if(use_global_params) + { + cudaMemcpyToSymbol(MY_CONST(coeff3_gm) , &sdata->pair.coeff3_gm.dev_data , sizeof(F_FLOAT*) ); + 
cudaMemcpy(sdata->pair.coeff3_gm.dev_data, coeff3, n,cudaMemcpyHostToDevice); + _coeff3_gm_tex.normalized = false; // access with normalized texture coordinates + _coeff3_gm_tex.filterMode = cudaFilterModePoint; // Point mode, so no + _coeff3_gm_tex.addressMode[0] = cudaAddressModeWrap; // wrap texture coordinates + const textureReference* coeff3_gm_texture_ptr; + cudaGetTextureReference(&coeff3_gm_texture_ptr, MY_CONST(coeff3_gm_tex)); + + #if F_PRECISION == 1 + cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc(); + cudaBindTexture(0,coeff3_gm_texture_ptr, sdata->pair.coeff3_gm.dev_data, &channelDescXType, sdata->atom.nmax*sizeof(F_FLOAT)); + #else + cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc(); + cudaBindTexture(0,coeff3_gm_texture_ptr, sdata->pair.coeff3_gm.dev_data, &channelDescXType, sdata->atom.nmax*2*sizeof(int2)); + #endif + } + else + cudaMemcpyToSymbol(MY_AP(coeff3), coeff3 , n); + } + CUT_CHECK_ERROR("Cuda_Pair: init Coeff3 failed"); + + if(ncoeff>3) + { + F_FLOAT coeff4[cuda_ntypes2]; + for(int i=1; i<=sdata->atom.ntypes; ++i) + { + for(int j=1; j<=sdata->atom.ntypes; ++j) + { + coeff4[i * cuda_ntypes + j] = (F_FLOAT) sdata->pair.coeff4[i][j]; + } + } + + if(use_global_params) + { + cudaMemcpyToSymbol(MY_CONST(coeff4_gm) , &sdata->pair.coeff4_gm.dev_data , sizeof(F_FLOAT*) ); + cudaMemcpy(sdata->pair.coeff4_gm.dev_data, coeff4, n,cudaMemcpyHostToDevice); + _coeff4_gm_tex.normalized = false; // access with normalized texture coordinates + _coeff4_gm_tex.filterMode = cudaFilterModePoint; // Point mode, so no + _coeff4_gm_tex.addressMode[0] = cudaAddressModeWrap; // wrap texture coordinates + const textureReference* coeff4_gm_texture_ptr; + cudaGetTextureReference(&coeff4_gm_texture_ptr, MY_CONST(coeff4_gm_tex)); + + #if F_PRECISION == 1 + cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc(); + cudaBindTexture(0,coeff4_gm_texture_ptr, sdata->pair.coeff4_gm.dev_data, &channelDescXType, 
sdata->atom.nmax*sizeof(F_FLOAT)); + #else + cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc(); + cudaBindTexture(0,coeff4_gm_texture_ptr, sdata->pair.coeff4_gm.dev_data, &channelDescXType, sdata->atom.nmax*2*sizeof(int2)); + #endif + } + else + cudaMemcpyToSymbol(MY_AP(coeff4), coeff4 , n); + } + CUT_CHECK_ERROR("Cuda_Pair: init Coeff4 failed"); + + if(ncoeff>4) + { + F_FLOAT coeff5[cuda_ntypes2]; + for(int i=1; i<=sdata->atom.ntypes; ++i) + { + for(int j=1; j<=sdata->atom.ntypes; ++j) + { + coeff5[i * cuda_ntypes + j] = (F_FLOAT) sdata->pair.coeff5[i][j]; + } + } + + if(use_global_params) + { + cudaMemcpyToSymbol(MY_CONST(coeff5_gm) , &sdata->pair.coeff5_gm.dev_data , sizeof(F_FLOAT*) ); + cudaMemcpy(sdata->pair.coeff5_gm.dev_data, coeff5, n,cudaMemcpyHostToDevice); + _coeff5_gm_tex.normalized = false; // access with normalized texture coordinates + _coeff5_gm_tex.filterMode = cudaFilterModePoint; // Point mode, so no + _coeff5_gm_tex.addressMode[0] = cudaAddressModeWrap; // wrap texture coordinates + const textureReference* coeff5_gm_texture_ptr; + cudaGetTextureReference(&coeff5_gm_texture_ptr, MY_CONST(coeff5_gm_tex)); + + #if F_PRECISION == 1 + cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc(); + cudaBindTexture(0,coeff5_gm_texture_ptr, sdata->pair.coeff5_gm.dev_data, &channelDescXType, sdata->atom.nmax*sizeof(F_FLOAT)); + #else + cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc(); + cudaBindTexture(0,coeff5_gm_texture_ptr, sdata->pair.coeff5_gm.dev_data, &channelDescXType, sdata->atom.nmax*2*sizeof(int2)); + #endif + } + else + cudaMemcpyToSymbol(MY_AP(coeff5), coeff5 , n); + } + + CUT_CHECK_ERROR("Cuda_Pair: init Coeff5 failed"); + if(ncoeff>5) + { + F_FLOAT coeff6[cuda_ntypes2]; + for(int i=1; i<=sdata->atom.ntypes; ++i) + { + for(int j=1; j<=sdata->atom.ntypes; ++j) + { + coeff6[i * cuda_ntypes + j] = (F_FLOAT) sdata->pair.coeff6[i][j]; + } + } + + if(use_global_params) + { + 
cudaMemcpyToSymbol(MY_CONST(coeff6_gm) , &sdata->pair.coeff6_gm.dev_data , sizeof(F_FLOAT*) ); + cudaMemcpy(sdata->pair.coeff6_gm.dev_data, coeff6, n,cudaMemcpyHostToDevice); + _coeff6_gm_tex.normalized = false; // access with normalized texture coordinates + _coeff6_gm_tex.filterMode = cudaFilterModePoint; // Point mode, so no + _coeff6_gm_tex.addressMode[0] = cudaAddressModeWrap; // wrap texture coordinates + const textureReference* coeff6_gm_texture_ptr; + cudaGetTextureReference(&coeff6_gm_texture_ptr, MY_CONST(coeff6_gm_tex)); + + #if F_PRECISION == 1 + cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc(); + cudaBindTexture(0,coeff6_gm_texture_ptr, sdata->pair.coeff6_gm.dev_data, &channelDescXType, sdata->atom.nmax*sizeof(F_FLOAT)); + #else + cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc(); + cudaBindTexture(0,coeff6_gm_texture_ptr, sdata->pair.coeff6_gm.dev_data, &channelDescXType, sdata->atom.nmax*2*sizeof(int2)); + #endif + } + } + CUT_CHECK_ERROR("Cuda_Pair: init Coeff6 failed"); + + if(ncoeff>6) + { + F_FLOAT coeff7[cuda_ntypes2]; + for(int i=1; i<=sdata->atom.ntypes; ++i) + { + for(int j=1; j<=sdata->atom.ntypes; ++j) + { + coeff7[i * cuda_ntypes + j] = (F_FLOAT) sdata->pair.coeff7[i][j]; + } + } + + if(use_global_params) + { + cudaMemcpyToSymbol(MY_CONST(coeff7_gm) , &sdata->pair.coeff7_gm.dev_data , sizeof(F_FLOAT*) ); + cudaMemcpy(sdata->pair.coeff7_gm.dev_data, coeff7, n,cudaMemcpyHostToDevice); + _coeff7_gm_tex.normalized = false; // access with normalized texture coordinates + _coeff7_gm_tex.filterMode = cudaFilterModePoint; // Point mode, so no + _coeff7_gm_tex.addressMode[0] = cudaAddressModeWrap; // wrap texture coordinates + const textureReference* coeff7_gm_texture_ptr; + cudaGetTextureReference(&coeff7_gm_texture_ptr, MY_CONST(coeff7_gm_tex)); + + #if F_PRECISION == 1 + cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc(); + cudaBindTexture(0,coeff7_gm_texture_ptr, sdata->pair.coeff7_gm.dev_data, 
&channelDescXType, sdata->atom.nmax*sizeof(F_FLOAT)); + #else + cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc(); + cudaBindTexture(0,coeff7_gm_texture_ptr, sdata->pair.coeff7_gm.dev_data, &channelDescXType, sdata->atom.nmax*2*sizeof(int2)); + #endif + } + } + CUT_CHECK_ERROR("Cuda_Pair: init Coeff7 failed"); + + if(ncoeff>7) + { + F_FLOAT coeff8[cuda_ntypes2]; + for(int i=1; i<=sdata->atom.ntypes; ++i) + { + for(int j=1; j<=sdata->atom.ntypes; ++j) + { + coeff8[i * cuda_ntypes + j] = (F_FLOAT) sdata->pair.coeff8[i][j]; + } + } + + if(use_global_params) + { + cudaMemcpyToSymbol(MY_CONST(coeff8_gm) , &sdata->pair.coeff8_gm.dev_data , sizeof(F_FLOAT*) ); + cudaMemcpy(sdata->pair.coeff8_gm.dev_data, coeff8, n,cudaMemcpyHostToDevice); + _coeff8_gm_tex.normalized = false; // access with normalized texture coordinates + _coeff8_gm_tex.filterMode = cudaFilterModePoint; // Point mode, so no + _coeff8_gm_tex.addressMode[0] = cudaAddressModeWrap; // wrap texture coordinates + const textureReference* coeff8_gm_texture_ptr; + cudaGetTextureReference(&coeff8_gm_texture_ptr, MY_CONST(coeff8_gm_tex)); + + #if F_PRECISION == 1 + cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc(); + cudaBindTexture(0,coeff8_gm_texture_ptr, sdata->pair.coeff8_gm.dev_data, &channelDescXType, sdata->atom.nmax*sizeof(F_FLOAT)); + #else + cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc(); + cudaBindTexture(0,coeff8_gm_texture_ptr, sdata->pair.coeff8_gm.dev_data, &channelDescXType, sdata->atom.nmax*2*sizeof(int2)); + #endif + } + } + CUT_CHECK_ERROR("Cuda_Pair: init Coeff8 failed"); + + if(ncoeff>8) + { + F_FLOAT coeff9[cuda_ntypes2]; + for(int i=1; i<=sdata->atom.ntypes; ++i) + { + for(int j=1; j<=sdata->atom.ntypes; ++j) + { + coeff9[i * cuda_ntypes + j] = (F_FLOAT) sdata->pair.coeff9[i][j]; + } + } + + if(use_global_params) + { + cudaMemcpyToSymbol(MY_CONST(coeff9_gm) , &sdata->pair.coeff9_gm.dev_data , sizeof(F_FLOAT*) ); + 
cudaMemcpy(sdata->pair.coeff9_gm.dev_data, coeff9, n,cudaMemcpyHostToDevice); + _coeff9_gm_tex.normalized = false; // access with normalized texture coordinates + _coeff9_gm_tex.filterMode = cudaFilterModePoint; // Point mode, so no + _coeff9_gm_tex.addressMode[0] = cudaAddressModeWrap; // wrap texture coordinates + const textureReference* coeff9_gm_texture_ptr; + cudaGetTextureReference(&coeff9_gm_texture_ptr, MY_CONST(coeff9_gm_tex)); + + #if F_PRECISION == 1 + cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc(); + cudaBindTexture(0,coeff9_gm_texture_ptr, sdata->pair.coeff9_gm.dev_data, &channelDescXType, sdata->atom.nmax*sizeof(F_FLOAT)); + #else + cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc(); + cudaBindTexture(0,coeff9_gm_texture_ptr, sdata->pair.coeff9_gm.dev_data, &channelDescXType, sdata->atom.nmax*2*sizeof(int2)); + #endif + } + } + CUT_CHECK_ERROR("Cuda_Pair: init Coeff9 failed"); + + F_FLOAT special_lj[4]; + special_lj[0]=sdata->pair.special_lj[0]; + special_lj[1]=sdata->pair.special_lj[1]; + special_lj[2]=sdata->pair.special_lj[2]; + special_lj[3]=sdata->pair.special_lj[3]; + + + X_FLOAT box_size[3] = + { + sdata->domain.subhi[0] - sdata->domain.sublo[0], + sdata->domain.subhi[1] - sdata->domain.sublo[1], + sdata->domain.subhi[2] - sdata->domain.sublo[2] + }; + + cudaMemcpyToSymbol(MY_CONST(box_size) , box_size , sizeof(X_FLOAT)*3); + cudaMemcpyToSymbol(MY_CONST(cuda_ntypes) ,&cuda_ntypes , sizeof(unsigned) ); + cudaMemcpyToSymbol(MY_CONST(special_lj) , special_lj , sizeof(F_FLOAT)*4); + cudaMemcpyToSymbol(MY_CONST(virial) ,&sdata->pair.virial.dev_data , sizeof(ENERGY_FLOAT*) ); + cudaMemcpyToSymbol(MY_CONST(eng_vdwl) ,&sdata->pair.eng_vdwl.dev_data , sizeof(ENERGY_FLOAT*) ); + cudaMemcpyToSymbol(MY_CONST(periodicity) , sdata->domain.periodicity , sizeof(int)*3 ); + cudaMemcpyToSymbol(MY_CONST(collect_forces_later), &sdata->pair.collect_forces_later , sizeof(int) ); + + if(need_q) + { + F_FLOAT 
qqrd2e_tmp=sdata->pppm.qqrd2e; + F_FLOAT special_coul[4]; + special_coul[0]=sdata->pair.special_coul[0]; + special_coul[1]=sdata->pair.special_coul[1]; + special_coul[2]=sdata->pair.special_coul[2]; + special_coul[3]=sdata->pair.special_coul[3]; + + cudaMemcpyToSymbol(MY_CONST(special_coul) , special_coul , sizeof(F_FLOAT)*4); + cudaMemcpyToSymbol(MY_CONST(g_ewald) ,&sdata->pair.g_ewald , sizeof(F_FLOAT) ); + cudaMemcpyToSymbol(MY_CONST(qqrd2e) ,&qqrd2e_tmp , sizeof(F_FLOAT) ); + cudaMemcpyToSymbol(MY_CONST(kappa) ,&sdata->pair.kappa , sizeof(F_FLOAT) ); + cudaMemcpyToSymbol(MY_CONST(eng_coul) ,&sdata->pair.eng_coul.dev_data , sizeof(ENERGY_FLOAT*) ); + } + CUT_CHECK_ERROR("Cuda_Pair: init failed"); +} +timespec startpairtime, endpairtime; +//Function which is called prior to kernel invocation, determins grid, Binds Textures, updates constant memory if necessary +void Cuda_Pair_PreKernel_AllStyles(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist,int eflag, int vflag, dim3& grid, dim3& threads, int& sharedperproc,bool need_q=false,int maxthreads=256) +{ + if(sdata->atom.update_nmax) + Cuda_Pair_UpdateNmax_AllStyles(sdata,sneighlist); + if(sdata->atom.update_nlocal) + { + cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); + cudaMemcpyToSymbol(MY_CONST(nall) , & sdata->atom.nall , sizeof(int) ); + } + + + + BindXTypeTexture(sdata); + if(need_q) BindQTexture(sdata); + + + sharedperproc=0; + if(sdata->pair.use_block_per_atom) sharedperproc+=3; + if(eflag) sharedperproc+=1; + if(need_q && eflag) sharedperproc+=1; + if(vflag) sharedperproc+=6; + + int threadnum = sneighlist->inum; + if (sdata->comm.comm_phase==2)threadnum=sneighlist->inum_border2; + if(sdata->pair.use_block_per_atom) {threadnum*=64; maxthreads=64;} + int3 layout=getgrid(threadnum,sharedperproc*sizeof(ENERGY_FLOAT),maxthreads,true); //need to limit to 192 threads due to register limit + threads.x = layout.z; threads.y = 1; threads.z = 1; + grid.x = layout.x; grid.y = 
layout.y; grid.z = 1;

  int size=(unsigned)(layout.y*layout.x)*sharedperproc*sizeof(ENERGY_FLOAT);
  Cuda_UpdateBuffer(sdata,size);

  // block-per-atom mode accumulates into the buffer, so it must start zeroed
  if(sdata->pair.use_block_per_atom)
    cudaMemset(sdata->buffer, 0, size);

  sdata->pair.lastgridsize=grid.x*grid.y;
  sdata->pair.n_energy_virial=sharedperproc;
  if(sdata->pair.use_block_per_atom) sdata->pair.n_energy_virial-=3;

  clock_gettime(CLOCK_REALTIME,&startpairtime);

  MYDBG( printf("# CUDA: Cuda_Pair: kernel start eflag: %i vflag: %i config: %i %i %i %i\n",eflag,vflag,grid.x,grid.y, threads.x,sharedperproc*sizeof(ENERGY_FLOAT)*threads.x); )
}

//Function which is called after the kernel invocation, collects energy and virial
// Unless force collection is deferred, waits for the pair kernel, accounts its
// runtime, and reduces the per-block energy/virial partials left in the buffer.
void Cuda_Pair_PostKernel_AllStyles(cuda_shared_data* sdata, dim3& grid, int& sharedperproc,int eflag, int vflag)
{
  if((not sdata->pair.collect_forces_later) && (eflag||vflag))//not sdata->comm.comm_phase==2))
  {
    cudaThreadSynchronize();
    clock_gettime(CLOCK_REALTIME,&endpairtime);
    sdata->cuda_timings.pair_kernel+=
      endpairtime.tv_sec-startpairtime.tv_sec+1.0*(endpairtime.tv_nsec-startpairtime.tv_nsec)/1000000000;
    CUT_CHECK_ERROR("Cuda_Pair: Kernel execution failed");

    if(eflag||vflag)
    {
      int n=grid.x*grid.y;
      if(sdata->pair.use_block_per_atom)
        grid.x=sharedperproc-3;
      else
        grid.x=sharedperproc;
      grid.y=1;
      dim3 threads(128,1,1);
      MYDBG( printf("# CUDA: Cuda_Pair: virial compute kernel start eflag: %i vflag: %i config: %i %i %i %i\n",eflag,vflag,grid.x,grid.y, threads.x,sharedperproc*sizeof(ENERGY_FLOAT)*threads.x); )
      // NOTE(review): launch configuration reconstructed -- the patch extraction
      // destroyed the <<<...>>> syntax here; the shared-memory size follows the
      // reduction kernel's one-ENERGY_FLOAT-per-thread convention. Confirm against
      // the original lib/cuda/cuda_pair.cu.
      MY_AP(PairVirialCompute_reduce)<<<grid,threads,threads.x*sizeof(ENERGY_FLOAT)>>>(n);
      cudaThreadSynchronize();
      CUT_CHECK_ERROR("Cuda_Pair: virial compute Kernel execution failed");
    }

    MYDBG( printf("# CUDA: Cuda_Pair: kernel done\n"); )
  }
}


#include "pair_born_coul_long_cuda.cu"
#include "pair_buck_coul_cut_cuda.cu"
#include "pair_buck_coul_long_cuda.cu"
#include "pair_buck_cuda.cu"
#include "pair_cg_cmm_cuda.cu"
#include "pair_cg_cmm_coul_cut_cuda.cu"
#include
"pair_cg_cmm_coul_debye_cuda.cu"
#include "pair_cg_cmm_coul_long_cuda.cu"
#include "pair_gran_hooke_cuda.cu"
#include "pair_lj_charmm_coul_charmm_implicit_cuda.cu"
#include "pair_lj_charmm_coul_charmm_cuda.cu"
#include "pair_lj_charmm_coul_long_cuda.cu"
#include "pair_lj_class2_coul_cut_cuda.cu"
#include "pair_lj_class2_coul_long_cuda.cu"
#include "pair_lj_class2_cuda.cu"
#include "pair_lj_cut_coul_cut_cuda.cu"
#include "pair_lj_cut_coul_debye_cuda.cu"
#include "pair_lj_cut_coul_long_cuda.cu"
#include "pair_lj_cut_cuda.cu"
#include "pair_lj_cut_experimental_cuda.cu"
#include "pair_lj_expand_cuda.cu"
#include "pair_lj_gromacs_cuda.cu"
#include "pair_lj_gromacs_coul_gromacs_cuda.cu"
#include "pair_lj_smooth_cuda.cu"
#include "pair_lj96_cut_cuda.cu"
#include "pair_morse_coul_long_cuda.cu"
#include "pair_morse_cuda.cu"
#include "pair_eam_cuda.cu"

#include "cuda_pair_kernel.cu"


// Refreshes the constant-memory copies of the atom counts and of every
// per-atom device array pointer. Must be called whenever nmax changed,
// because the device arrays may have been reallocated (stale pointers).
void Cuda_Pair_UpdateNmax(cuda_shared_data* sdata)
{
  CUT_CHECK_ERROR("Cuda_Pair: before updateNmax failed");
  cudaMemcpyToSymbol(MY_CONST(nlocal), &sdata->atom.nlocal, sizeof(int));
  cudaMemcpyToSymbol(MY_CONST(nall), &sdata->atom.nall, sizeof(int));
  cudaMemcpyToSymbol(MY_CONST(nmax), &sdata->atom.nmax, sizeof(int));
  cudaMemcpyToSymbol(MY_CONST(type), &sdata->atom.type.dev_data, sizeof(int*));
  cudaMemcpyToSymbol(MY_CONST(x), &sdata->atom.x.dev_data, sizeof(X_FLOAT*));
  cudaMemcpyToSymbol(MY_CONST(x_type), &sdata->atom.x_type.dev_data, sizeof(X_FLOAT4*));
  cudaMemcpyToSymbol(MY_CONST(xhold), &sdata->atom.xhold.dev_data, sizeof(X_FLOAT*));
  cudaMemcpyToSymbol(MY_CONST(v), &sdata->atom.v.dev_data, sizeof(V_FLOAT*));
  cudaMemcpyToSymbol(MY_CONST(radius), &sdata->atom.radius.dev_data, sizeof(X_FLOAT*));
  cudaMemcpyToSymbol(MY_CONST(v_radius), &sdata->atom.v_radius.dev_data, sizeof(V_FLOAT4*));
  cudaMemcpyToSymbol(MY_CONST(omega), &sdata->atom.omega.dev_data, sizeof(V_FLOAT*));
  cudaMemcpyToSymbol(MY_CONST(rmass), &sdata->atom.rmass.dev_data, sizeof(V_FLOAT*));
  cudaMemcpyToSymbol(MY_CONST(omega_rmass), &sdata->atom.omega_rmass.dev_data, sizeof(V_FLOAT4*));
  CUT_CHECK_ERROR("Cuda_Pair: updateNmax failed");
}


// Packs x[] and type[] into the interleaved x_type[] array on the device
// (one X_FLOAT4 per atom) so the pair kernels can fetch both in one load.
void Cuda_Pair_GenerateXType(cuda_shared_data* sdata)
{
  MYDBG(printf(" # CUDA: GenerateXType ... start %i %i %i %p %p %p %p\n", sdata->atom.nlocal, sdata->atom.nall, sdata->atom.nmax, sdata->atom.x.dev_data, sdata->atom.x_type.dev_data, sdata->atom.xhold.dev_data, sdata->atom.type.dev_data);)
  Cuda_Pair_UpdateNmax(sdata);
  cudaMemcpyToSymbol(MY_CONST(nlocal), &sdata->atom.nlocal, sizeof(int));
  cudaMemcpyToSymbol(MY_CONST(nall), &sdata->atom.nall, sizeof(int));
  MYDBG(printf(" # CUDA: GenerateXType ... getgrid\n"); fflush(stdout);)

  int3 layout = getgrid(sdata->atom.nall);
  dim3 threads(layout.z, 1, 1);
  dim3 grid(layout.x, layout.y, 1);

  MYDBG(printf(" # CUDA: GenerateXType ... kernel start test\n"); fflush(stdout);)
  // NOTE(review): launch configuration restored (<<<...>>> lost in extraction)
  Pair_GenerateXType_Kernel <<< grid, threads, 0>>>();
  cudaThreadSynchronize();
  CUT_CHECK_ERROR("Cuda_Pair GenerateXType: Kernel failed");
  MYDBG(printf(" # CUDA: GenerateXType ... end\n"); fflush(stdout);)
}

// Inverse of Cuda_Pair_GenerateXType: copies positions back from x_type[]
// into the plain x[] array on the device.
void Cuda_Pair_RevertXType(cuda_shared_data* sdata)
{
  MYDBG(printf(" # CUDA: RevertXType ... start\n");)
  Cuda_Pair_UpdateNmax(sdata);
  cudaMemcpyToSymbol(MY_CONST(nlocal), &sdata->atom.nlocal, sizeof(int));
  cudaMemcpyToSymbol(MY_CONST(nall), &sdata->atom.nall, sizeof(int));

  int3 layout = getgrid(sdata->atom.nall);
  dim3 threads(layout.z, 1, 1);
  dim3 grid(layout.x, layout.y, 1);

  Pair_RevertXType_Kernel <<< grid, threads, 0>>>();
  cudaThreadSynchronize();
  // bugfix: error message previously said "GenerateXType" (copy/paste)
  CUT_CHECK_ERROR("Cuda_Pair RevertXType: Kernel failed");
  MYDBG(printf(" # CUDA: RevertXType ... end\n");)
}

// Packs v[] and radius[] into the interleaved v_radius[] array on the device
// (one V_FLOAT4 per atom), used by the granular pair styles.
void Cuda_Pair_GenerateVRadius(cuda_shared_data* sdata)
{
  MYDBG(printf(" # CUDA: GenerateVRadius ... start %i %i %i %p %p %p %p\n", sdata->atom.nlocal, sdata->atom.nall, sdata->atom.nmax, sdata->atom.x.dev_data, sdata->atom.x_type.dev_data, sdata->atom.xhold.dev_data, sdata->atom.type.dev_data);)
  Cuda_Pair_UpdateNmax(sdata);
  cudaMemcpyToSymbol(MY_CONST(nlocal), &sdata->atom.nlocal, sizeof(int));
  cudaMemcpyToSymbol(MY_CONST(nall), &sdata->atom.nall, sizeof(int));
  MYDBG(printf(" # CUDA: GenerateVRadius ... getgrid\n"); fflush(stdout);)

  int3 layout = getgrid(sdata->atom.nall);
  dim3 threads(layout.z, 1, 1);
  dim3 grid(layout.x, layout.y, 1);

  MYDBG(printf(" # CUDA: GenerateVRadius ... kernel start test\n"); fflush(stdout);)
  Pair_GenerateVRadius_Kernel <<< grid, threads, 0>>>();
  cudaThreadSynchronize();
  CUT_CHECK_ERROR("Cuda_Pair GenerateVRadius: Kernel failed");
  MYDBG(printf(" # CUDA: GenerateVRadius ... end\n"); fflush(stdout);)
}

// Packs omega[] and rmass[] into the interleaved omega_rmass[] array on the
// device (one V_FLOAT4 per atom), used by the granular pair styles.
void Cuda_Pair_GenerateOmegaRmass(cuda_shared_data* sdata)
{
  MYDBG(printf(" # CUDA: GenerateOmegaRmass ... start %i %i %i %p %p %p %p\n", sdata->atom.nlocal, sdata->atom.nall, sdata->atom.nmax, sdata->atom.x.dev_data, sdata->atom.x_type.dev_data, sdata->atom.xhold.dev_data, sdata->atom.type.dev_data);)
  Cuda_Pair_UpdateNmax(sdata);
  cudaMemcpyToSymbol(MY_CONST(nlocal), &sdata->atom.nlocal, sizeof(int));
  cudaMemcpyToSymbol(MY_CONST(nall), &sdata->atom.nall, sizeof(int));
  MYDBG(printf(" # CUDA: GenerateOmegaRmass ... getgrid\n"); fflush(stdout);)

  int3 layout = getgrid(sdata->atom.nall);
  dim3 threads(layout.z, 1, 1);
  dim3 grid(layout.x, layout.y, 1);

  MYDBG(printf(" # CUDA: GenerateOmegaRmass ... kernel start test\n"); fflush(stdout);)
  Pair_GenerateOmegaRmass_Kernel <<< grid, threads, 0>>>();
  cudaThreadSynchronize();
  CUT_CHECK_ERROR("Cuda_Pair GenerateOmegaRmass: Kernel failed");
  MYDBG(printf(" # CUDA: GenerateOmegaRmass ... end\n"); fflush(stdout);)
}

// Snapshots the current positions into xhold[] on the device (used to decide
// later whether atoms moved far enough to trigger a neighbor-list rebuild).
void Cuda_Pair_BuildXHold(cuda_shared_data* sdata)
{
  Cuda_Pair_UpdateNmax(sdata);
  cudaMemcpyToSymbol(MY_CONST(nlocal), &sdata->atom.nlocal, sizeof(int));
  cudaMemcpyToSymbol(MY_CONST(nall), &sdata->atom.nall, sizeof(int));

  int3 layout = getgrid(sdata->atom.nall);
  dim3 threads(layout.z, 1, 1);
  dim3 grid(layout.x, layout.y, 1);

  Pair_BuildXHold_Kernel <<< grid, threads, 0>>>();
  cudaThreadSynchronize();
  // bugfix: error message previously said "GenerateXType" (copy/paste)
  CUT_CHECK_ERROR("Cuda_Pair BuildXHold: Kernel failed");
}

// Deferred post-processing for the collect_forces_later path: accumulates
// pair-kernel timing, reduces the buffered per-block energy/virial partial
// sums, and sums the buffered per-atom forces into f[].
void Cuda_Pair_CollectForces(cuda_shared_data* sdata, int eflag, int vflag)
{
  cudaThreadSynchronize();
  clock_gettime(CLOCK_REALTIME, &endpairtime);
  sdata->cuda_timings.pair_kernel +=
    endpairtime.tv_sec - startpairtime.tv_sec + 1.0 * (endpairtime.tv_nsec - startpairtime.tv_nsec) / 1000000000;
  CUT_CHECK_ERROR("Cuda_Pair: Kernel execution failed");
  dim3 threads;
  dim3 grid;

  if(eflag || vflag) {
    int n = sdata->pair.lastgridsize;   // partial sums per quantity = blocks of the last pair launch
    grid.x = sdata->pair.n_energy_virial;
    grid.y = 1;
    threads.x = 128;
    // NOTE(review): launch configuration restored (<<<...>>> lost in
    // extraction); the reduction uses one ENERGY_FLOAT of shared memory
    // per thread -- verify against the upstream source.
    MY_AP(PairVirialCompute_reduce) <<< grid, threads, threads.x* sizeof(ENERGY_FLOAT)>>>(n);
    cudaThreadSynchronize();
    CUT_CHECK_ERROR("Cuda_Pair_CollectForces: virial compute Kernel execution failed");
  }

  int3 layout = getgrid(sdata->atom.nlocal);
  threads.x = layout.z;
  grid.x = layout.x;
  grid.y = layout.y;
  Pair_CollectForces_Kernel <<< grid, threads, 0>>>(sdata->pair.n_energy_virial, sdata->pair.lastgridsize);
  cudaThreadSynchronize();
  CUT_CHECK_ERROR("Cuda_Pair_CollectForces: Force Summation Kernel execution failed");

}
diff --git a/lib/cuda/cuda_pair_cu.h b/lib/cuda/cuda_pair_cu.h
new file mode 100644
index 0000000000..1844735a16
--- /dev/null
+++ b/lib/cuda/cuda_pair_cu.h
@@ -0,0 +1,30 @@
/* ----------------------------------------------------------------------
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator

   Original Version:
   http://lammps.sandia.gov, Sandia National Laboratories
   Steve Plimpton, sjplimp@sandia.gov

   See the README file in the top-level
LAMMPS directory.

   -----------------------------------------------------------------------

   USER-CUDA Package and associated modifications:
   https://sourceforge.net/projects/lammpscuda/

   Christian Trott, christian.trott@tu-ilmenau.de
   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
   Theoretical Physics II, University of Technology Ilmenau, Germany

   See the README file in the USER-CUDA directory.

   This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include "cuda_shared.h"

// C-linkage entry points of the pair-force CUDA driver layer, callable from
// the host-side C++ LAMMPS classes.
extern "C" void Cuda_Pair_GenerateXType(cuda_shared_data* sdata);
extern "C" void Cuda_Pair_RevertXType(cuda_shared_data* sdata);
extern "C" void Cuda_Pair_GenerateVRadius(cuda_shared_data* sdata);
extern "C" void Cuda_Pair_GenerateOmegaRmass(cuda_shared_data* sdata);
extern "C" void Cuda_Pair_BuildXHold(cuda_shared_data* sdata);
extern "C" void Cuda_Pair_CollectForces(cuda_shared_data* sdata, int eflag, int vflag);
diff --git a/lib/cuda/cuda_pair_kernel.cu b/lib/cuda/cuda_pair_kernel.cu
new file mode 100644
index 0000000000..fe7a38a782
--- /dev/null
+++ b/lib/cuda/cuda_pair_kernel.cu
@@ -0,0 +1,1350 @@
/* ----------------------------------------------------------------------
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator

   Original Version:
   http://lammps.sandia.gov, Sandia National Laboratories
   Steve Plimpton, sjplimp@sandia.gov

   See the README file in the top-level LAMMPS directory.

   -----------------------------------------------------------------------

   USER-CUDA Package and associated modifications:
   https://sourceforge.net/projects/lammpscuda/

   Christian Trott, christian.trott@tu-ilmenau.de
   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
   Theoretical Physics II, University of Technology Ilmenau, Germany

   See the README file in the USER-CUDA directory.

   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */ + +#define EWALD_F 1.12837917 +#define EWALD_P 0.3275911 +#define A1 0.254829592 +#define A2 -0.284496736 +#define A3 1.421413741 +#define A4 -1.453152027 +#define A5 1.061405429 + +template +__global__ void Pair_Kernel_TpA(int eflag, int vflag,int eflag_atom,int vflag_atom) +{ + ENERGY_FLOAT evdwl = ENERGY_F(0.0); + ENERGY_FLOAT ecoul = ENERGY_F(0.0); + + ENERGY_FLOAT* sharedE; + ENERGY_FLOAT* sharedECoul; + ENERGY_FLOAT* sharedV = &sharedmem[threadIdx.x]; + + if(eflag||eflag_atom) + { + sharedE = &sharedmem[threadIdx.x]; + sharedE[0] = ENERGY_F(0.0); + sharedV += blockDim.x; + if(coul_type!=COUL_NONE) + { + sharedECoul = sharedE + blockDim.x; + sharedECoul[0] = ENERGY_F(0.0); + sharedV += blockDim.x; + } + } + if(vflag||vflag_atom) + { + sharedV[0*blockDim.x] = ENERGY_F(0.0); + sharedV[1*blockDim.x] = ENERGY_F(0.0); + sharedV[2*blockDim.x] = ENERGY_F(0.0); + sharedV[3*blockDim.x] = ENERGY_F(0.0); + sharedV[4*blockDim.x] = ENERGY_F(0.0); + sharedV[5*blockDim.x] = ENERGY_F(0.0); + } + + int ii = (blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x; + + X_FLOAT xtmp,ytmp,ztmp; + X_FLOAT4 myxtype; + F_FLOAT fxtmp,fytmp,fztmp,fpair; + F_FLOAT delx,dely,delz; + F_FLOAT factor_lj,factor_coul; + F_FLOAT qtmp; + int itype,i,j; + int jnum=0; + int* jlist; + + if(ii < _inum) + { + i = _ilist[ii]; + + myxtype=fetchXType(i); + xtmp=myxtype.x; + ytmp=myxtype.y; + ztmp=myxtype.z; + itype=static_cast (myxtype.w); + + + fxtmp = F_F(0.0); + fytmp = F_F(0.0); + fztmp = F_F(0.0); + + if(coul_type!=COUL_NONE) + qtmp = fetchQ(i); + + jnum = _numneigh[i]; + jlist = &_neighbors[i]; + } + __syncthreads(); + + for (int jj = 0; jj < jnum; jj++) + { + if(ii < _inum) + if(jj (myxtype.w); + + + const F_FLOAT rsq = delx*delx + dely*dely + delz*delz; + + bool in_cutoff = rsq < (_cutsq_global > X_F(0.0)? 
_cutsq_global : _cutsq[itype * _cuda_ntypes + jtype]); + if (in_cutoff) + { + switch(pair_type) + { + case PAIR_BORN: + fpair += PairBornCuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl); + break; + case PAIR_BUCK: + fpair += PairBuckCuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl); + break; + case PAIR_CG_CMM: + fpair += PairCGCMMCuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl); + break; + case PAIR_LJ_CHARMM: + fpair += PairLJCharmmCuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl); + break; + case PAIR_LJ_CLASS2: + fpair += PairLJClass2Cuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl); + break; + case PAIR_LJ_CUT: + fpair += PairLJCutCuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl); + break; + case PAIR_LJ_EXPAND: + fpair += PairLJExpandCuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl); + break; + case PAIR_LJ_GROMACS: + fpair += PairLJGromacsCuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl); + break; + case PAIR_LJ_SMOOTH: + fpair += PairLJSmoothCuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl); + break; + case PAIR_LJ96_CUT: + fpair += PairLJ96CutCuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl); + break; + case PAIR_MORSE_R6: + fpair += PairMorseR6Cuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl); + break; + case PAIR_MORSE: + fpair += PairMorseCuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl); + break; + } + } + + if(coul_type!=COUL_NONE) + { + const F_FLOAT qiqj=qtmp*fetchQ(j); + if(qiqj*qiqj>1e-8) + { + const bool in_coul_cutoff = + rsq < (_cut_coulsq_global > X_F(0.0)? 
_cut_coulsq_global : _cut_coulsq[itype * _cuda_ntypes + jtype]); + if (in_coul_cutoff) + { + switch(coul_type) + { + case COUL_CHARMM: + fpair += CoulCharmmCuda_Eval(rsq,factor_coul,eflag,ecoul,qiqj); + break; + + case COUL_CHARMM_IMPLICIT: + fpair += CoulCharmmImplicitCuda_Eval(rsq,factor_coul,eflag,ecoul,qiqj); + break; + + case COUL_CUT: + { + const F_FLOAT forcecoul = factor_coul*_qqrd2e* qiqj*_RSQRT_(rsq); + if(eflag) + { + ecoul += forcecoul; + } + fpair += forcecoul*(F_F(1.0)/rsq); + } + break; + + case COUL_DEBYE: + { + const F_FLOAT r2inv = F_F(1.0)/rsq; + const X_FLOAT r = _RSQRT_(r2inv); + const X_FLOAT rinv = F_F(1.0)/r; + const F_FLOAT screening = _EXP_(-_kappa*r); + F_FLOAT forcecoul = factor_coul * _qqrd2e * qiqj * screening ; + if(eflag) + { + ecoul += forcecoul*rinv; + } + forcecoul *= (_kappa + rinv); + fpair += forcecoul*r2inv; + } + break; + + case COUL_GROMACS: + fpair += CoulGromacsCuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_coul,eflag,ecoul,qiqj); + break; + + case COUL_LONG: + { + const F_FLOAT r2inv = F_F(1.0)/rsq; + const F_FLOAT r = _RSQRT_(r2inv); + const F_FLOAT grij = _g_ewald * r; + const F_FLOAT expm2 = _EXP_(-grij*grij); + const F_FLOAT t = F_F(1.0) / (F_F(1.0) + EWALD_P*grij); + const F_FLOAT erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2; + const F_FLOAT prefactor = _qqrd2e* qiqj*(F_F(1.0)/r); + F_FLOAT forcecoul = prefactor * (erfc + EWALD_F*grij*expm2); + if (factor_coul < 1.0) forcecoul -= (1.0-factor_coul)*prefactor; + if(eflag) + { + ecoul += prefactor*erfc; + if (factor_coul < 1.0) ecoul -= (1.0-factor_coul)*prefactor; + } + fpair += forcecoul*r2inv; + } + break; + + } + } + in_cutoff=in_cutoff || in_coul_cutoff; + } + } + + + if (in_cutoff) + { + F_FLOAT dxfp,dyfp,dzfp; + fxtmp += dxfp = delx*fpair; + fytmp += dyfp = dely*fpair; + fztmp += dzfp = delz*fpair; + if(vflag) + { + sharedV[0 * blockDim.x]+= delx*dxfp; + sharedV[1 * blockDim.x]+= dely*dyfp; + sharedV[2 * blockDim.x]+= delz*dzfp; + sharedV[3 * 
blockDim.x]+= delx*dyfp; + sharedV[4 * blockDim.x]+= delx*dzfp; + sharedV[5 * blockDim.x]+= dely*dzfp; + } + } + } + } + __syncthreads(); + if(ii < _inum) + { + F_FLOAT* my_f; + if(_collect_forces_later) + { + ENERGY_FLOAT* buffer = (ENERGY_FLOAT*) _buffer; + if(eflag) + { + buffer=&buffer[1 * gridDim.x * gridDim.y]; + if(coul_type!=COUL_NONE) + buffer=&buffer[1 * gridDim.x * gridDim.y]; + } + if(vflag) + { + buffer=&buffer[6 * gridDim.x * gridDim.y]; + } + my_f = (F_FLOAT*) buffer; + my_f += i; + *my_f = fxtmp; my_f += _nmax; + *my_f = fytmp; my_f += _nmax; + *my_f = fztmp; + } + else + { + my_f = _f + i; + *my_f += fxtmp; my_f += _nmax; + *my_f += fytmp; my_f += _nmax; + *my_f += fztmp; + } + } + __syncthreads(); + + if(eflag) + { + sharedE[0] = evdwl; + if(coul_type!=COUL_NONE) + sharedECoul[0] = ecoul; + } + if(eflag_atom && i<_nlocal) + { + if(coul_type!=COUL_NONE) + _eatom[i] += evdwl + ecoul; + else + _eatom[i] += evdwl; + } + + if(vflag_atom && i<_nlocal) + { + _vatom[i] += ENERGY_F(0.5) * sharedV[0 * blockDim.x]; + _vatom[i+_nmax] += ENERGY_F(0.5) * sharedV[1 * blockDim.x]; + _vatom[i+2*_nmax] += ENERGY_F(0.5) * sharedV[2 * blockDim.x]; + _vatom[i+3*_nmax] += ENERGY_F(0.5) * sharedV[3 * blockDim.x]; + _vatom[i+4*_nmax] += ENERGY_F(0.5) * sharedV[4 * blockDim.x]; + _vatom[i+5*_nmax] += ENERGY_F(0.5) * sharedV[5 * blockDim.x]; + } + if(vflag||eflag) PairVirialCompute_A_Kernel(eflag,vflag,coul_type!=COUL_NONE?1:0); + } + +template + __global__ void Pair_Kernel_BpA(int eflag, int vflag,int eflag_atom,int vflag_atom) +{ + int ii = (blockIdx.x*gridDim.y+blockIdx.y); + if( ii >= _inum ) + return; + + ENERGY_FLOAT evdwl = ENERGY_F(0.0); + ENERGY_FLOAT ecoul = ENERGY_F(0.0); + F_FLOAT3* sharedVirial1; + F_FLOAT3* sharedVirial2; + F_FLOAT* sharedEnergy; + F_FLOAT* sharedEnergyCoul; + + F_FLOAT3* sharedForce = (F_FLOAT3*) &sharedmem[0]; + if(vflag) + { + sharedVirial1 = &sharedForce[64]; + sharedVirial2 = &sharedVirial1[64]; + } + else + { + sharedVirial1 = 
&sharedForce[0]; + sharedVirial2 = &sharedVirial1[0]; + } + + if(eflag) + { + if(vflag||vflag_atom) + sharedEnergy = (F_FLOAT*) &sharedVirial2[64]; + else + sharedEnergy = (F_FLOAT*) &sharedForce[64]; + + if(coul_type!=COUL_NONE) + sharedEnergyCoul = (F_FLOAT*) &sharedEnergy[64]; + + } + + F_FLOAT3 partialForce = { F_F(0.0), F_F(0.0), F_F(0.0) }; + F_FLOAT3 partialVirial1 = { F_F(0.0), F_F(0.0), F_F(0.0) }; + F_FLOAT3 partialVirial2 = { F_F(0.0), F_F(0.0), F_F(0.0) }; + + X_FLOAT xtmp,ytmp,ztmp; + X_FLOAT4 myxtype; + F_FLOAT delx,dely,delz; + F_FLOAT factor_lj,factor_coul; + F_FLOAT fpair; + F_FLOAT qtmp; + int itype,jnum,i,j; + int* jlist; + + i = _ilist[ii]; + + myxtype = fetchXType(i); + + xtmp=myxtype.x; + ytmp=myxtype.y; + ztmp=myxtype.z; + itype=static_cast (myxtype.w); + + if(coul_type!=COUL_NONE) + qtmp = fetchQ(i); + + jnum = _numneigh[i]; + + jlist = &_neighbors[i*_maxneighbors]; + __syncthreads(); + for (int jj = threadIdx.x; jj < jnum+blockDim.x; jj+=blockDim.x) + { + if(jj (myxtype.w); + + const F_FLOAT rsq = delx*delx + dely*dely + delz*delz; + + bool in_cutoff = rsq < (_cutsq_global > X_F(0.0)? 
_cutsq_global : _cutsq[itype * _cuda_ntypes + jtype]); + bool in_coul_cutoff; + if (in_cutoff) + { + switch(pair_type) + { + case PAIR_BORN: + fpair += PairBornCuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl); + break; + case PAIR_BUCK: + fpair += PairBuckCuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl); + break; + case PAIR_CG_CMM: + fpair += PairCGCMMCuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl); + break; + case PAIR_LJ_CHARMM: + fpair += PairLJCharmmCuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl); + break; + case PAIR_LJ_CLASS2: + fpair += PairLJClass2Cuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl); + break; + case PAIR_LJ_CUT: + fpair += PairLJCutCuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl); + break; + case PAIR_LJ_EXPAND: + fpair += PairLJExpandCuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl); + break; + case PAIR_LJ_GROMACS: + fpair += PairLJGromacsCuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl); + break; + case PAIR_LJ_SMOOTH: + fpair += PairLJSmoothCuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl); + break; + case PAIR_LJ96_CUT: + fpair += PairLJ96CutCuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl); + break; + case PAIR_MORSE_R6: + fpair += PairMorseR6Cuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl); + break; + case PAIR_MORSE: + fpair += PairMorseCuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl); + break; + } + } + + if(coul_type!=COUL_NONE) + { + const F_FLOAT qiqj=qtmp*fetchQ(j); + if(qiqj*qiqj>(1e-8f)) + { + in_coul_cutoff = + rsq < (_cut_coulsq_global > X_F(0.0)? 
_cut_coulsq_global : _cut_coulsq[itype * _cuda_ntypes + jtype]); + if (in_coul_cutoff) + { + switch(coul_type) + { + case COUL_CHARMM: + fpair += CoulCharmmCuda_Eval(rsq,factor_coul,eflag,ecoul,qiqj); + break; + + case COUL_CHARMM_IMPLICIT: + fpair += CoulCharmmImplicitCuda_Eval(rsq,factor_coul,eflag,ecoul,qiqj); + break; + + case COUL_GROMACS: + fpair += CoulGromacsCuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_coul,eflag,ecoul,qiqj); + break; + + case COUL_LONG: + { + const F_FLOAT r2inv = F_F(1.0)/rsq; + const F_FLOAT r = _RSQRT_(r2inv); + const F_FLOAT grij = _g_ewald * r; + const F_FLOAT expm2 = _EXP_(-grij*grij); + const F_FLOAT t = F_F(1.0) / (F_F(1.0) + EWALD_P*grij); + const F_FLOAT erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2; + const F_FLOAT prefactor = _qqrd2e* qiqj*(F_F(1.0)/r); + F_FLOAT forcecoul = prefactor * (erfc + EWALD_F*grij*expm2); + if (factor_coul < 1.0) forcecoul -= (1.0-factor_coul)*prefactor; + if(eflag) + { + ecoul += prefactor*erfc; + if (factor_coul < 1.0) ecoul -= (1.0-factor_coul)*prefactor; + } + fpair += forcecoul*r2inv; + } + break; + + case COUL_DEBYE: + { + const F_FLOAT r2inv = F_F(1.0)/rsq; + const X_FLOAT r = _RSQRT_(r2inv); + const X_FLOAT rinv = F_F(1.0)/r; + const F_FLOAT screening = _EXP_(-_kappa*r); + F_FLOAT forcecoul = factor_coul * _qqrd2e * qiqj * screening ; + if(eflag) + { + ecoul += forcecoul*rinv; + } + forcecoul *= (_kappa + rinv); + fpair += forcecoul*r2inv; + } + break; + + case COUL_CUT: + { + const F_FLOAT forcecoul = factor_coul*_qqrd2e* qiqj*_RSQRT_(rsq); + if(eflag) + { + ecoul += forcecoul; + } + fpair += forcecoul*(F_F(1.0)/rsq); + } + break; + + + } + } + } + } + + + + if (in_cutoff||in_coul_cutoff) + { + F_FLOAT dxfp,dyfp,dzfp; + partialForce.x += dxfp = delx*fpair; + partialForce.y += dyfp = dely*fpair; + partialForce.z += dzfp = delz*fpair; + if(vflag) + { + partialVirial1.x+= delx*dxfp; + partialVirial1.y+= dely*dyfp; + partialVirial1.z+= delz*dzfp; + partialVirial2.x+= delx*dyfp; + 
partialVirial2.y+= delx*dzfp; + partialVirial2.z+= dely*dzfp; + } + } + } + } + + if(eflag) + { + sharedEnergy[threadIdx.x]= evdwl; + if(coul_type!=COUL_NONE) + sharedEnergyCoul[threadIdx.x]= ecoul; + } + sharedForce[threadIdx.x]=partialForce; + if(vflag) + { + sharedVirial1[threadIdx.x]=partialVirial1; + sharedVirial2[threadIdx.x]=partialVirial2; + } + + __syncthreads(); + + + for( unsigned int s = blockDim.x >> 1; s > 0; s >>= 1 ) + { + + if( threadIdx.x < s ) + { + sharedForce[ threadIdx.x ].x += sharedForce[ threadIdx.x + s ].x; + sharedForce[ threadIdx.x ].y += sharedForce[ threadIdx.x + s ].y; + sharedForce[ threadIdx.x ].z += sharedForce[ threadIdx.x + s ].z; + + if(vflag) + { + sharedVirial1[ threadIdx.x ].x += sharedVirial1[ threadIdx.x + s ].x; + sharedVirial1[ threadIdx.x ].y += sharedVirial1[ threadIdx.x + s ].y; + sharedVirial1[ threadIdx.x ].z += sharedVirial1[ threadIdx.x + s ].z; + + sharedVirial2[ threadIdx.x ].x += sharedVirial2[ threadIdx.x + s ].x; + sharedVirial2[ threadIdx.x ].y += sharedVirial2[ threadIdx.x + s ].y; + sharedVirial2[ threadIdx.x ].z += sharedVirial2[ threadIdx.x + s ].z; + } + + if(eflag) + { + sharedEnergy[ threadIdx.x ] += sharedEnergy[ threadIdx.x + s ]; + if(coul_type!=COUL_NONE) + sharedEnergyCoul[ threadIdx.x ] += sharedEnergyCoul[ threadIdx.x + s ]; + } + } + __syncthreads(); + } + + if(threadIdx.x == 0) + { + + ENERGY_FLOAT* buffer = (ENERGY_FLOAT*) _buffer; + if(eflag) + { + ENERGY_FLOAT tmp_evdwl; + buffer[blockIdx.x * gridDim.y + blockIdx.y + 0 * gridDim.x * gridDim.y]=tmp_evdwl=ENERGY_F(0.5) * sharedEnergy[0]; + if(eflag_atom) + _eatom[i] = tmp_evdwl; + buffer=&buffer[gridDim.x * gridDim.y]; + if(coul_type!=COUL_NONE) + { + buffer[blockIdx.x * gridDim.y + blockIdx.y + 0 * gridDim.x * gridDim.y]=tmp_evdwl=ENERGY_F(0.5) * sharedEnergyCoul[0]; + if(eflag_atom) + _eatom[i] += tmp_evdwl; + buffer=&buffer[gridDim.x * gridDim.y]; + } + } + if(vflag) + { + ENERGY_FLOAT tmp; + buffer[blockIdx.x * gridDim.y + blockIdx.y + 0 
* gridDim.x * gridDim.y]= tmp = ENERGY_F(0.5) * sharedVirial1[0].x; + if(vflag_atom) _vatom[i+0*_nmax] = tmp; + buffer[blockIdx.x * gridDim.y + blockIdx.y + 1 * gridDim.x * gridDim.y]= tmp = ENERGY_F(0.5) * sharedVirial1[0].y; + if(vflag_atom) _vatom[i+1*_nmax] = tmp; + buffer[blockIdx.x * gridDim.y + blockIdx.y + 2 * gridDim.x * gridDim.y]= tmp = ENERGY_F(0.5) * sharedVirial1[0].z; + if(vflag_atom) _vatom[i+2*_nmax] = tmp; + buffer[blockIdx.x * gridDim.y + blockIdx.y + 3 * gridDim.x * gridDim.y]= tmp = ENERGY_F(0.5) * sharedVirial2[0].x; + if(vflag_atom) _vatom[i+3*_nmax] = tmp; + buffer[blockIdx.x * gridDim.y + blockIdx.y + 4 * gridDim.x * gridDim.y]= tmp = ENERGY_F(0.5) * sharedVirial2[0].y; + if(vflag_atom) _vatom[i+4*_nmax] = tmp; + buffer[blockIdx.x * gridDim.y + blockIdx.y + 5 * gridDim.x * gridDim.y]= tmp = ENERGY_F(0.5) * sharedVirial2[0].z; + if(vflag_atom) _vatom[i+5*_nmax] = tmp; + buffer=&buffer[6 * gridDim.x * gridDim.y]; + } + F_FLOAT* my_f; + if(_collect_forces_later) + { + my_f = (F_FLOAT*) buffer; + my_f += i; + *my_f = sharedForce[0].x; my_f += _nmax; + *my_f = sharedForce[0].y; my_f += _nmax; + *my_f = sharedForce[0].z; + } + else + { + my_f = _f + i; + *my_f += sharedForce[0].x; my_f += _nmax; + *my_f += sharedForce[0].y; my_f += _nmax; + *my_f += sharedForce[0].z; + } + } +} + + +template +__global__ void Pair_Kernel_TpA_opt(int eflag, int vflag,int eflag_atom,int vflag_atom, int comm_phase) +{ + ENERGY_FLOAT evdwl = ENERGY_F(0.0); + ENERGY_FLOAT ecoul = ENERGY_F(0.0); + + ENERGY_FLOAT* sharedE; + ENERGY_FLOAT* sharedECoul; + ENERGY_FLOAT* sharedV = &sharedmem[threadIdx.x]; + + if(eflag||eflag_atom) + { + sharedE = &sharedmem[threadIdx.x]; + sharedE[0] = ENERGY_F(0.0); + sharedV += blockDim.x; + if(coul_type!=COUL_NONE) + { + sharedECoul = sharedE + blockDim.x; + sharedECoul[0] = ENERGY_F(0.0); + sharedV += blockDim.x; + } + } + if(vflag||vflag_atom) + { + sharedV[0*blockDim.x] = ENERGY_F(0.0); + sharedV[1*blockDim.x] = ENERGY_F(0.0); + 
sharedV[2*blockDim.x] = ENERGY_F(0.0); + sharedV[3*blockDim.x] = ENERGY_F(0.0); + sharedV[4*blockDim.x] = ENERGY_F(0.0); + sharedV[5*blockDim.x] = ENERGY_F(0.0); + } + + int ii = (blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x; + + X_FLOAT xtmp,ytmp,ztmp; + X_FLOAT4 myxtype; + F_FLOAT fxtmp,fytmp,fztmp,fpair; + F_FLOAT delx,dely,delz; + F_FLOAT factor_lj,factor_coul; + F_FLOAT qtmp; + int itype,i,j; + int jnum=0; + int* jlist; + + if(ii < (comm_phase<2?_inum:_inum_border[0])) + { + i = comm_phase<2? _ilist[ii] : _ilist_border[ii] ; + + myxtype=fetchXType(i); + myxtype=_x_type[i]; + xtmp=myxtype.x; + ytmp=myxtype.y; + ztmp=myxtype.z; + itype=static_cast (myxtype.w); + + + fxtmp = F_F(0.0); + fytmp = F_F(0.0); + fztmp = F_F(0.0); + + if(coul_type!=COUL_NONE) + qtmp = fetchQ(i); + jnum = comm_phase==0? _numneigh[i]: (comm_phase==1?_numneigh_inner[i]:_numneigh_border[ii]); + + + jlist = comm_phase==0? &_neighbors[i]: (comm_phase==1?&_neighbors_inner[i]:&_neighbors_border[ii]); + } + __syncthreads(); + + for (int jj = 0; jj < jnum; jj++) + { + if(ii < (comm_phase<2?_inum:_inum_border[0])) + if(jj (myxtype.w); + + + const F_FLOAT rsq = delx*delx + dely*dely + delz*delz; + + bool in_cutoff = rsq < (_cutsq_global > X_F(0.0)? 
_cutsq_global : _cutsq[itype * _cuda_ntypes + jtype]); + if (in_cutoff) + { + switch(pair_type) + { + case PAIR_BORN: + fpair += PairBornCuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl); + break; + case PAIR_BUCK: + fpair += PairBuckCuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl); + break; + case PAIR_CG_CMM: + fpair += PairCGCMMCuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl); + break; + case PAIR_LJ_CHARMM: + fpair += PairLJCharmmCuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl); + break; + case PAIR_LJ_CLASS2: + fpair += PairLJClass2Cuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl); + break; + case PAIR_LJ_CUT: + fpair += PairLJCutCuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl); + break; + case PAIR_LJ_EXPAND: + fpair += PairLJExpandCuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl); + break; + case PAIR_LJ_GROMACS: + fpair += PairLJGromacsCuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl); + break; + case PAIR_LJ_SMOOTH: + fpair += PairLJSmoothCuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl); + break; + case PAIR_LJ96_CUT: + fpair += PairLJ96CutCuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl); + break; + case PAIR_MORSE_R6: + fpair += PairMorseR6Cuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl); + break; + case PAIR_MORSE: + fpair += PairMorseCuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl); + break; + } + } + + if(coul_type!=COUL_NONE) + { + const F_FLOAT qiqj=qtmp*fetchQ(j); + if(qiqj*qiqj>1e-8) + { + const bool in_coul_cutoff = + rsq < (_cut_coulsq_global > X_F(0.0)? 
_cut_coulsq_global : _cut_coulsq[itype * _cuda_ntypes + jtype]); + if (in_coul_cutoff) + { + switch(coul_type) + { + case COUL_CHARMM: + fpair += CoulCharmmCuda_Eval(rsq,factor_coul,eflag,ecoul,qiqj); + break; + + case COUL_CHARMM_IMPLICIT: + fpair += CoulCharmmImplicitCuda_Eval(rsq,factor_coul,eflag,ecoul,qiqj); + break; + + case COUL_CUT: + { + const F_FLOAT forcecoul = factor_coul*_qqrd2e* qiqj*_RSQRT_(rsq); + if(eflag) + { + ecoul += forcecoul; + } + fpair += forcecoul*(F_F(1.0)/rsq); + } + break; + + case COUL_DEBYE: + { + const F_FLOAT r2inv = F_F(1.0)/rsq; + const X_FLOAT r = _RSQRT_(r2inv); + const X_FLOAT rinv = F_F(1.0)/r; + const F_FLOAT screening = _EXP_(-_kappa*r); + F_FLOAT forcecoul = factor_coul * _qqrd2e * qiqj * screening ; + if(eflag) + { + ecoul += forcecoul*rinv; + } + forcecoul *= (_kappa + rinv); + fpair += forcecoul*r2inv; + } + break; + + case COUL_GROMACS: + fpair += CoulGromacsCuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_coul,eflag,ecoul,qiqj); + break; + + case COUL_LONG: + { + const F_FLOAT r2inv = F_F(1.0)/rsq; + const F_FLOAT r = _RSQRT_(r2inv); + const F_FLOAT grij = _g_ewald * r; + const F_FLOAT expm2 = _EXP_(-grij*grij); + const F_FLOAT t = F_F(1.0) / (F_F(1.0) + EWALD_P*grij); + const F_FLOAT erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2; + const F_FLOAT prefactor = _qqrd2e* qiqj*(F_F(1.0)/r); + F_FLOAT forcecoul = prefactor * (erfc + EWALD_F*grij*expm2); + if (factor_coul < 1.0) forcecoul -= (1.0-factor_coul)*prefactor; + if(eflag) + { + ecoul += prefactor*erfc; + if (factor_coul < 1.0) ecoul -= (1.0-factor_coul)*prefactor; + } + fpair += forcecoul*r2inv; + } + break; + + } + } + in_cutoff=in_cutoff || in_coul_cutoff; + } + } + + + if (in_cutoff) + { + F_FLOAT dxfp,dyfp,dzfp; + fxtmp += dxfp = delx*fpair; + fytmp += dyfp = dely*fpair; + fztmp += dzfp = delz*fpair; + if(vflag) + { + sharedV[0 * blockDim.x]+= delx*dxfp; + sharedV[1 * blockDim.x]+= dely*dyfp; + sharedV[2 * blockDim.x]+= delz*dzfp; + sharedV[3 * 
blockDim.x]+= delx*dyfp; + sharedV[4 * blockDim.x]+= delx*dzfp; + sharedV[5 * blockDim.x]+= dely*dzfp; + } + } + } + } + __syncthreads(); + if(ii < (comm_phase<2?_inum:_inum_border[0])) + { + F_FLOAT* my_f; + if(_collect_forces_later) + { + ENERGY_FLOAT* buffer = (ENERGY_FLOAT*) _buffer; + if(eflag) + { + buffer=&buffer[1 * gridDim.x * gridDim.y]; + if(coul_type!=COUL_NONE) + buffer=&buffer[1 * gridDim.x * gridDim.y]; + } + if(vflag) + { + buffer=&buffer[6 * gridDim.x * gridDim.y]; + } + my_f = (F_FLOAT*) buffer; + my_f += i; + *my_f = fxtmp; my_f += _nmax; + *my_f = fytmp; my_f += _nmax; + *my_f = fztmp; + } + else + { + my_f = _f + i; + *my_f += fxtmp; my_f += _nmax; + *my_f += fytmp; my_f += _nmax; + *my_f += fztmp; + } + } + __syncthreads(); + + if(eflag) + { + sharedE[0] = evdwl; + if(coul_type!=COUL_NONE) + sharedECoul[0] = ecoul; + } + if(eflag_atom && i<_nlocal) + { + if(coul_type!=COUL_NONE) + _eatom[i] += evdwl + ecoul; + else + _eatom[i] += evdwl; + } + + if(vflag_atom && i<_nlocal) + { + _vatom[i] += ENERGY_F(0.5) * sharedV[0 * blockDim.x]; + _vatom[i+_nmax] += ENERGY_F(0.5) * sharedV[1 * blockDim.x]; + _vatom[i+2*_nmax] += ENERGY_F(0.5) * sharedV[2 * blockDim.x]; + _vatom[i+3*_nmax] += ENERGY_F(0.5) * sharedV[3 * blockDim.x]; + _vatom[i+4*_nmax] += ENERGY_F(0.5) * sharedV[4 * blockDim.x]; + _vatom[i+5*_nmax] += ENERGY_F(0.5) * sharedV[5 * blockDim.x]; + } + if(vflag||eflag) PairVirialCompute_A_Kernel(eflag,vflag,coul_type!=COUL_NONE?1:0); + } + +template + __global__ void Pair_Kernel_BpA_opt(int eflag, int vflag,int eflag_atom,int vflag_atom, int comm_phase) +{ + int ii = (blockIdx.x*gridDim.y+blockIdx.y); + if( ii >= (comm_phase<2?_inum:_inum_border[0])) + return; + + ENERGY_FLOAT evdwl = ENERGY_F(0.0); + ENERGY_FLOAT ecoul = ENERGY_F(0.0); + F_FLOAT3* sharedVirial1; + F_FLOAT3* sharedVirial2; + F_FLOAT* sharedEnergy; + F_FLOAT* sharedEnergyCoul; + + F_FLOAT3* sharedForce = (F_FLOAT3*) &sharedmem[0]; + if(vflag) + { + sharedVirial1 = &sharedForce[64]; 
+ sharedVirial2 = &sharedVirial1[64]; + } + else + { + sharedVirial1 = &sharedForce[0]; + sharedVirial2 = &sharedVirial1[0]; + } + + if(eflag) + { + if(vflag||vflag_atom) + sharedEnergy = (F_FLOAT*) &sharedVirial2[64]; + else + sharedEnergy = (F_FLOAT*) &sharedForce[64]; + + if(coul_type!=COUL_NONE) + sharedEnergyCoul = (F_FLOAT*) &sharedEnergy[64]; + + } + + F_FLOAT3 partialForce = { F_F(0.0), F_F(0.0), F_F(0.0) }; + F_FLOAT3 partialVirial1 = { F_F(0.0), F_F(0.0), F_F(0.0) }; + F_FLOAT3 partialVirial2 = { F_F(0.0), F_F(0.0), F_F(0.0) }; + + X_FLOAT xtmp,ytmp,ztmp; + X_FLOAT4 myxtype; + F_FLOAT delx,dely,delz; + F_FLOAT factor_lj,factor_coul; + F_FLOAT fpair; + F_FLOAT qtmp; + int itype,jnum,i,j; + int* jlist; + + i = comm_phase<2? _ilist[ii] : _ilist_border[ii]; + + myxtype = fetchXType(i); + + xtmp=myxtype.x; + ytmp=myxtype.y; + ztmp=myxtype.z; + itype=static_cast (myxtype.w); + + if(coul_type!=COUL_NONE) + qtmp = fetchQ(i); + + jnum = comm_phase==0? _numneigh[i]: (comm_phase==1?_numneigh_inner[i]:_numneigh_border[ii]); + + jlist = comm_phase==0? &_neighbors[i*_maxneighbors]: (comm_phase==1?&_neighbors_inner[i*_maxneighbors]:&_neighbors_border[ii*_maxneighbors]); + __syncthreads(); + for (int jj = threadIdx.x; jj < jnum+blockDim.x; jj+=blockDim.x) + { + if(jj (myxtype.w); + + const F_FLOAT rsq = delx*delx + dely*dely + delz*delz; + + bool in_cutoff = rsq < (_cutsq_global > X_F(0.0)? 
_cutsq_global : _cutsq[itype * _cuda_ntypes + jtype]); + bool in_coul_cutoff; + if (in_cutoff) + { + switch(pair_type) + { + case PAIR_BORN: + fpair += PairBornCuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl); + break; + case PAIR_BUCK: + fpair += PairBuckCuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl); + break; + case PAIR_CG_CMM: + fpair += PairCGCMMCuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl); + break; + case PAIR_LJ_CHARMM: + fpair += PairLJCharmmCuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl); + break; + case PAIR_LJ_CLASS2: + fpair += PairLJClass2Cuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl); + break; + case PAIR_LJ_CUT: + fpair += PairLJCutCuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl); + break; + case PAIR_LJ_EXPAND: + fpair += PairLJExpandCuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl); + break; + case PAIR_LJ_GROMACS: + fpair += PairLJGromacsCuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl); + break; + case PAIR_LJ_SMOOTH: + fpair += PairLJSmoothCuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl); + break; + case PAIR_LJ96_CUT: + fpair += PairLJ96CutCuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl); + break; + case PAIR_MORSE_R6: + fpair += PairMorseR6Cuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl); + break; + case PAIR_MORSE: + fpair += PairMorseCuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl); + break; + } + } + + if(coul_type!=COUL_NONE) + { + const F_FLOAT qiqj=qtmp*fetchQ(j); + if(qiqj*qiqj>(1e-8f)) + { + in_coul_cutoff = + rsq < (_cut_coulsq_global > X_F(0.0)? 
_cut_coulsq_global : _cut_coulsq[itype * _cuda_ntypes + jtype]); + if (in_coul_cutoff) + { + switch(coul_type) + { + case COUL_CHARMM: + fpair += CoulCharmmCuda_Eval(rsq,factor_coul,eflag,ecoul,qiqj); + break; + + case COUL_CHARMM_IMPLICIT: + fpair += CoulCharmmImplicitCuda_Eval(rsq,factor_coul,eflag,ecoul,qiqj); + break; + + case COUL_GROMACS: + fpair += CoulGromacsCuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_coul,eflag,ecoul,qiqj); + break; + + case COUL_LONG: + { + const F_FLOAT r2inv = F_F(1.0)/rsq; + const F_FLOAT r = _RSQRT_(r2inv); + const F_FLOAT grij = _g_ewald * r; + const F_FLOAT expm2 = _EXP_(-grij*grij); + const F_FLOAT t = F_F(1.0) / (F_F(1.0) + EWALD_P*grij); + const F_FLOAT erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2; + const F_FLOAT prefactor = _qqrd2e* qiqj*(F_F(1.0)/r); + F_FLOAT forcecoul = prefactor * (erfc + EWALD_F*grij*expm2); + if (factor_coul < 1.0) forcecoul -= (1.0-factor_coul)*prefactor; + if(eflag) + { + ecoul += prefactor*erfc; + if (factor_coul < 1.0) ecoul -= (1.0-factor_coul)*prefactor; + } + fpair += forcecoul*r2inv; + } + break; + + case COUL_DEBYE: + { + const F_FLOAT r2inv = F_F(1.0)/rsq; + const X_FLOAT r = _RSQRT_(r2inv); + const X_FLOAT rinv = F_F(1.0)/r; + const F_FLOAT screening = _EXP_(-_kappa*r); + F_FLOAT forcecoul = factor_coul * _qqrd2e * qiqj * screening ; + if(eflag) + { + ecoul += forcecoul*rinv; + } + forcecoul *= (_kappa + rinv); + fpair += forcecoul*r2inv; + } + break; + + case COUL_CUT: + { + const F_FLOAT forcecoul = factor_coul*_qqrd2e* qiqj*_RSQRT_(rsq); + if(eflag) + { + ecoul += forcecoul; + } + fpair += forcecoul*(F_F(1.0)/rsq); + } + break; + + + } + } + } + } + + + + if (in_cutoff||in_coul_cutoff) + { + F_FLOAT dxfp,dyfp,dzfp; + partialForce.x += dxfp = delx*fpair; + partialForce.y += dyfp = dely*fpair; + partialForce.z += dzfp = delz*fpair; + if(vflag) + { + partialVirial1.x+= delx*dxfp; + partialVirial1.y+= dely*dyfp; + partialVirial1.z+= delz*dzfp; + partialVirial2.x+= delx*dyfp; + 
partialVirial2.y+= delx*dzfp; + partialVirial2.z+= dely*dzfp; + } + } + } + } + + if(eflag) + { + sharedEnergy[threadIdx.x]= evdwl; + if(coul_type!=COUL_NONE) + sharedEnergyCoul[threadIdx.x]= ecoul; + } + sharedForce[threadIdx.x]=partialForce; + if(vflag) + { + sharedVirial1[threadIdx.x]=partialVirial1; + sharedVirial2[threadIdx.x]=partialVirial2; + } + + __syncthreads(); + + + for( unsigned int s = blockDim.x >> 1; s > 0; s >>= 1 ) + { + + if( threadIdx.x < s ) + { + sharedForce[ threadIdx.x ].x += sharedForce[ threadIdx.x + s ].x; + sharedForce[ threadIdx.x ].y += sharedForce[ threadIdx.x + s ].y; + sharedForce[ threadIdx.x ].z += sharedForce[ threadIdx.x + s ].z; + + if(vflag) + { + sharedVirial1[ threadIdx.x ].x += sharedVirial1[ threadIdx.x + s ].x; + sharedVirial1[ threadIdx.x ].y += sharedVirial1[ threadIdx.x + s ].y; + sharedVirial1[ threadIdx.x ].z += sharedVirial1[ threadIdx.x + s ].z; + + sharedVirial2[ threadIdx.x ].x += sharedVirial2[ threadIdx.x + s ].x; + sharedVirial2[ threadIdx.x ].y += sharedVirial2[ threadIdx.x + s ].y; + sharedVirial2[ threadIdx.x ].z += sharedVirial2[ threadIdx.x + s ].z; + } + + if(eflag) + { + sharedEnergy[ threadIdx.x ] += sharedEnergy[ threadIdx.x + s ]; + if(coul_type!=COUL_NONE) + sharedEnergyCoul[ threadIdx.x ] += sharedEnergyCoul[ threadIdx.x + s ]; + } + } + __syncthreads(); + } + + if(threadIdx.x == 0) + { + + ENERGY_FLOAT* buffer = (ENERGY_FLOAT*) _buffer; + if(eflag) + { + ENERGY_FLOAT tmp_evdwl; + buffer[blockIdx.x * gridDim.y + blockIdx.y + 0 * gridDim.x * gridDim.y]=tmp_evdwl=ENERGY_F(0.5) * sharedEnergy[0]; + if(eflag_atom) + _eatom[i] = tmp_evdwl; + buffer=&buffer[gridDim.x * gridDim.y]; + if(coul_type!=COUL_NONE) + { + buffer[blockIdx.x * gridDim.y + blockIdx.y + 0 * gridDim.x * gridDim.y]=tmp_evdwl=ENERGY_F(0.5) * sharedEnergyCoul[0]; + if(eflag_atom) + _eatom[i] += tmp_evdwl; + buffer=&buffer[gridDim.x * gridDim.y]; + } + } + if(vflag) + { + ENERGY_FLOAT tmp; + buffer[blockIdx.x * gridDim.y + blockIdx.y + 0 
* gridDim.x * gridDim.y]= tmp = ENERGY_F(0.5) * sharedVirial1[0].x; + if(vflag_atom) _vatom[i+0*_nmax] = tmp; + buffer[blockIdx.x * gridDim.y + blockIdx.y + 1 * gridDim.x * gridDim.y]= tmp = ENERGY_F(0.5) * sharedVirial1[0].y; + if(vflag_atom) _vatom[i+1*_nmax] = tmp; + buffer[blockIdx.x * gridDim.y + blockIdx.y + 2 * gridDim.x * gridDim.y]= tmp = ENERGY_F(0.5) * sharedVirial1[0].z; + if(vflag_atom) _vatom[i+2*_nmax] = tmp; + buffer[blockIdx.x * gridDim.y + blockIdx.y + 3 * gridDim.x * gridDim.y]= tmp = ENERGY_F(0.5) * sharedVirial2[0].x; + if(vflag_atom) _vatom[i+3*_nmax] = tmp; + buffer[blockIdx.x * gridDim.y + blockIdx.y + 4 * gridDim.x * gridDim.y]= tmp = ENERGY_F(0.5) * sharedVirial2[0].y; + if(vflag_atom) _vatom[i+4*_nmax] = tmp; + buffer[blockIdx.x * gridDim.y + blockIdx.y + 5 * gridDim.x * gridDim.y]= tmp = ENERGY_F(0.5) * sharedVirial2[0].z; + if(vflag_atom) _vatom[i+5*_nmax] = tmp; + buffer=&buffer[6 * gridDim.x * gridDim.y]; + } + F_FLOAT* my_f; + if(_collect_forces_later) + { + my_f = (F_FLOAT*) buffer; + my_f += i; + *my_f = sharedForce[0].x; my_f += _nmax; + *my_f = sharedForce[0].y; my_f += _nmax; + *my_f = sharedForce[0].z; + } + else + { + my_f = _f + i; + *my_f += sharedForce[0].x; my_f += _nmax; + *my_f += sharedForce[0].y; my_f += _nmax; + *my_f += sharedForce[0].z; + } + } +} + +__global__ void Pair_GenerateXType_Kernel() +{ + int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x; + if(i < _nall) + { + X_FLOAT4 xtype; + xtype.x=_x[i]; + xtype.y=_x[i+_nmax]; + xtype.z=_x[i+2*_nmax]; + xtype.w=_type[i]; + _x_type[i]=xtype; + } + +} + +__global__ void Pair_GenerateVRadius_Kernel() +{ + int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x; + if(i < _nall) + { + V_FLOAT4 vradius; + vradius.x=_v[i]; + vradius.y=_v[i+_nmax]; + vradius.z=_v[i+2*_nmax]; + vradius.w=_radius[i]; + _v_radius[i]=vradius; + } +} + +__global__ void Pair_GenerateOmegaRmass_Kernel() +{ + int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x; + if(i < 
_nall) + { + V_FLOAT4 omegarmass; + omegarmass.x=_omega[i]; + omegarmass.y=_omega[i+_nmax]; + omegarmass.z=_omega[i+2*_nmax]; + omegarmass.w=_rmass[i]; + _omega_rmass[i]=omegarmass; + } +} + +__global__ void Pair_RevertXType_Kernel() +{ + int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x; + if(i < _nall) + { + X_FLOAT4 xtype=_x_type[i]; + _x[i]=xtype.x; + _x[i+_nmax]=xtype.y; + _x[i+2*_nmax]=xtype.z; + _type[i]=static_cast (xtype.w); + } + +} + +__global__ void Pair_BuildXHold_Kernel() +{ + int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x; + if(i < _nall) + { + X_FLOAT4 xtype=_x_type[i]; + _xhold[i]=xtype.x; + _xhold[i+_nmax]=xtype.y; + _xhold[i+2*_nmax]=xtype.z; + } + +} + +__global__ void Pair_CollectForces_Kernel(int nperblock,int n) +{ + int i = (blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x; + if(i>=_nlocal) return; + ENERGY_FLOAT* buf = (ENERGY_FLOAT*) _buffer; + + F_FLOAT* buf_f = (F_FLOAT*) &buf[nperblock * n]; + F_FLOAT* my_f = _f + i; + buf_f += i; + *my_f += * buf_f; my_f+=_nmax; buf_f+=_nmax; + *my_f += * buf_f; my_f+=_nmax; buf_f+=_nmax; + *my_f += * buf_f; my_f+=_nmax; +} diff --git a/lib/cuda/cuda_pair_virial_kernel_nc.cu b/lib/cuda/cuda_pair_virial_kernel_nc.cu new file mode 100644 index 0000000000..8ea06604c9 --- /dev/null +++ b/lib/cuda/cuda_pair_virial_kernel_nc.cu @@ -0,0 +1,126 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. 
+ + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +extern __shared__ ENERGY_FLOAT sharedmem[]; + +static inline __device__ void PairVirialCompute_A_Kernel(int &eflag,int &vflag,int coulflag=0) +{ + __syncthreads(); + ENERGY_FLOAT* shared=sharedmem; + + if(eflag) + { + reduceBlock(shared); + shared+=blockDim.x; + if(coulflag) + { + reduceBlock(shared); + shared+=blockDim.x; + } + } + if(vflag) + { + reduceBlock(shared + 0 * blockDim.x); + reduceBlock(shared + 1 * blockDim.x); + reduceBlock(shared + 2 * blockDim.x); + reduceBlock(shared + 3 * blockDim.x); + reduceBlock(shared + 4 * blockDim.x); + reduceBlock(shared + 5 * blockDim.x); + } + if(threadIdx.x == 0) + { + shared=sharedmem; + ENERGY_FLOAT* buffer = (ENERGY_FLOAT*) _buffer; + if(eflag) + { + buffer[blockIdx.x * gridDim.y + blockIdx.y] = ENERGY_F(0.5)*shared[0]; + shared+=blockDim.x; buffer+=gridDim.x * gridDim.y; + if(coulflag) + { + buffer[blockIdx.x * gridDim.y + blockIdx.y] = ENERGY_F(0.5)*shared[0]; + shared+=blockDim.x; buffer+=gridDim.x * gridDim.y; + } + } + if(vflag) + { + buffer[blockIdx.x * gridDim.y + blockIdx.y + 0 * gridDim.x * gridDim.y] = ENERGY_F(0.5)*shared[0 * blockDim.x]; + buffer[blockIdx.x * gridDim.y + blockIdx.y + 1 * gridDim.x * gridDim.y] = ENERGY_F(0.5)*shared[1 * blockDim.x]; + buffer[blockIdx.x * gridDim.y + blockIdx.y + 2 * gridDim.x * gridDim.y] = ENERGY_F(0.5)*shared[2 * blockDim.x]; + buffer[blockIdx.x * gridDim.y + blockIdx.y + 3 * gridDim.x * gridDim.y] = ENERGY_F(0.5)*shared[3 * 
blockDim.x]; + buffer[blockIdx.x * gridDim.y + blockIdx.y + 4 * gridDim.x * gridDim.y] = ENERGY_F(0.5)*shared[4 * blockDim.x]; + buffer[blockIdx.x * gridDim.y + blockIdx.y + 5 * gridDim.x * gridDim.y] = ENERGY_F(0.5)*shared[5 * blockDim.x]; + } + } + __syncthreads(); +} + +__global__ void MY_AP(PairVirialCompute_reduce)(int n) +{ + sharedmem[threadIdx.x] = ENERGY_F(0.0); + ENERGY_FLOAT sum = ENERGY_F(0.0); + ENERGY_FLOAT* buf = (ENERGY_FLOAT*) _buffer; + buf = &buf[blockIdx.x * n]; + //if(blockIdx.x==2) buf=&buf[n]; + + for(int i = 0; i < n; i += blockDim.x) + { + sharedmem[threadIdx.x] = (i + threadIdx.x < n) ? buf[i + threadIdx.x] : ENERGY_F(0.0); + __syncthreads(); + reduceBlock(sharedmem); + if(threadIdx.x == 0) sum += sharedmem[0]; + } + if(threadIdx.x==0) + { + if(gridDim.x == 1) //evdwl + { + _eng_vdwl[0]+=sum; + } + if(gridDim.x == 2) //evdwl + ecoul only + { + if(blockIdx.x==0) + _eng_vdwl[0]+=sum; + else + _eng_coul[0]+=sum; + } + if(gridDim.x == 6) //virial + { + _virial[blockIdx.x] += sum; + } + if(gridDim.x == 7) //evdwl+virial + { + if(blockIdx.x==0) + _eng_vdwl[0]+=sum; + else _virial[blockIdx.x-1] += sum; + } + if(gridDim.x == 8) //evdwl+ecoul+virial + { + if(blockIdx.x==0) + _eng_vdwl[0]+=sum; + else + if(blockIdx.x==1) + _eng_coul[0]+=sum; + else + _virial[blockIdx.x-2] += sum; + } + } +} diff --git a/lib/cuda/cuda_precision.h b/lib/cuda/cuda_precision.h new file mode 100644 index 0000000000..5b7d6a6843 --- /dev/null +++ b/lib/cuda/cuda_precision.h @@ -0,0 +1,269 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. 
+ + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#ifndef CUDA_PRECISION_H_ +#define CUDA_PRECISION_H_ +/* This File gives Type definitions for mixed precision calculation in the cuda part of LAMMPS-CUDA. + * Predefined behaviour is given by global CUDA_PRECISION (can be overwritten during compilation). + * ***_FLOAT: type definition of given property + * ***_F: constant extension in code (1.0 is interpreted as double while 1.0f is interpreted as float, now use: 1.0CUDA_F) + */ + +#ifdef CUDA_USE_BINNING +#define CUDA_IF_BINNING(a) a +#else +#define CUDA_IF_BINNING(a) +#endif + +//GLOBAL + +#ifdef CUDA_PRECISION + #if CUDA_PRECISION == 1 + #define CUDA_FLOAT float + #define CUDA_F(x) x##f + #endif + #if CUDA_PRECISION == 2 + #define CUDA_FLOAT double + #define CUDA_F(x) x + #endif +#endif + +#ifndef CUDA_PRECISION + #define CUDA_FLOAT double + #define CUDA_F(x) x + #define CUDA_PRECISION 2 +#endif +//-------------------------------- +//-----------FFT----------------- +//-------------------------------- + +#ifdef FFT_PRECISION_CU + #if FFT_PRECISION_CU == 1 + #define FFT_FLOAT float + #define FFT_F(x) x##f + #endif + #if FFT_PRECISION_CU == 2 + #define FFT_FLOAT double + #define FFT_F(x) x + #endif +#endif + +#ifndef FFT_PRECISION_CU + #define FFT_FLOAT CUDA_FLOAT + #define FFT_F(x) CUDA_F(x) + #define FFT_PRECISION_CU CUDA_PRECISION +#endif + +//-------------------------------- +//-----------PPPM----------------- +//-------------------------------- + +#ifdef PPPM_PRECISION + 
#if PPPM_PRECISION == 1 + #define PPPM_FLOAT float + #define PPPM_F(x) x##f + #endif + #if PPPM_PRECISION == 2 + #define PPPM_FLOAT double + #define PPPM_F(x) x + #endif +#endif + +#ifndef PPPM_PRECISION + #define PPPM_FLOAT CUDA_FLOAT + #define PPPM_F(x) CUDA_F(x) + #define PPPM_PRECISION CUDA_PRECISION +#endif + +//-------------------------------- +//-----------FORCE----------------- +//-------------------------------- + + +#ifdef F_PRECISION + #if F_PRECISION == 1 + #define F_FLOAT float + #define F_F(x) x##f + #endif + #if F_PRECISION == 2 + #define F_FLOAT double + #define F_F(x) x + #endif +#endif + +#ifndef F_PRECISION + #define F_FLOAT CUDA_FLOAT + #define F_F(x) CUDA_F(x) + #define F_PRECISION CUDA_PRECISION +#endif + +#if F_PRECISION == 1 +#define _SQRT_ sqrtf +#define _RSQRT_ rsqrtf +#define _EXP_ expf +#else +#define _SQRT_ sqrt +#define _RSQRT_ rsqrt +#define _EXP_ exp +#endif + +#if F_PRECISION == 2 +struct F_FLOAT2 +{ + F_FLOAT x; + F_FLOAT y; +}; +struct F_FLOAT3 +{ + F_FLOAT x; + F_FLOAT y; + F_FLOAT z; +}; +struct F_FLOAT4 +{ + F_FLOAT x; + F_FLOAT y; + F_FLOAT z; + F_FLOAT w; +}; +#else +#define F_FLOAT2 float2 +#define F_FLOAT3 float3 +#define F_FLOAT4 float4 +#endif +//-------------------------------- +//-----------ENERGY----------------- +//-------------------------------- + +#ifndef ENERGY_PRECISION + #define ENERGY_FLOAT CUDA_FLOAT + #define ENERGY_F(x) CUDA_F(x) +#endif + +#ifdef ENERGY_PRECISION + #if ENERGY_PRECISION == 1 + #define ENERGY_FLOAT float + #define ENERGY_F(x) x##f + #endif + #if ENERGY_PRECISION == 2 + #define ENERGY_FLOAT double + #define ENERGY_F(x) x + #endif +#endif + +#ifndef ENERGY_PRECISION + #define ENERGY_FLOAT CUDA_FLOAT + #define ENERGY_F(x) CUDA_F(x) + #define ENERGY_PRECISION CUDA_PRECISION +#endif + +//-------------------------------- +//-----------POSITIONS------------ +//-------------------------------- + +#ifdef X_PRECISION + #if X_PRECISION == 1 + #define X_FLOAT float + #define X_F(x) x##f + #endif + #if 
X_PRECISION == 2 + #define X_FLOAT double + #define X_F(x) x + #endif +#endif + +#ifndef X_PRECISION + #define X_FLOAT CUDA_FLOAT + #define X_F(x) CUDA_F(x) + #define X_PRECISION CUDA_PRECISION +#endif + +#if X_PRECISION == 2 +struct X_FLOAT2 +{ + X_FLOAT x; + X_FLOAT y; +}; +struct X_FLOAT3 +{ + X_FLOAT x; + X_FLOAT y; + X_FLOAT z; +}; +struct X_FLOAT4 +{ + X_FLOAT x; + X_FLOAT y; + X_FLOAT z; + X_FLOAT w; +}; +#else +#define X_FLOAT2 float2 +#define X_FLOAT3 float3 +#define X_FLOAT4 float4 +#endif + +//-------------------------------- +//-----------velocities----------- +//-------------------------------- + +#ifdef V_PRECISION + #if V_PRECISION == 1 + #define V_FLOAT float + #define V_F(x) x##f + #endif + #if V_PRECISION == 2 + #define V_FLOAT double + #define V_F(x) x + #endif +#endif + +#ifndef V_PRECISION + #define V_FLOAT CUDA_FLOAT + #define V_F(x) CUDA_F(x) + #define V_PRECISION CUDA_PRECISION +#endif + +#if V_PRECISION == 2 +struct V_FLOAT4 +{ + V_FLOAT x; + V_FLOAT y; + V_FLOAT z; + V_FLOAT w; +}; +#else +#define V_FLOAT4 float4 +#endif + +#ifdef NO_PREC_TIMING +struct timespec_2 +{ + unsigned int tv_sec; + unsigned int tv_nsec; +}; + +#define timespec timespec_2 +#define clock_gettime(a,b) +#endif +#endif /*CUDA_PRECISION_H_*/ diff --git a/lib/cuda/cuda_shared.h b/lib/cuda/cuda_shared.h new file mode 100644 index 0000000000..f7983fff05 --- /dev/null +++ b/lib/cuda/cuda_shared.h @@ -0,0 +1,378 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. 
+ + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#ifndef _CUDA_SHARED_H_ +#define _CUDA_SHARED_H_ +#include "cuda_precision.h" + +#define CUDA_MAX_DEBUG_SIZE 1000 //size of debugdata array (allows for so many doubles or twice as many int) + +struct dev_array +{ + void* dev_data; // pointer to memory address on cuda device + unsigned dim[3]; // array dimensions +}; + +struct cuda_shared_atom // relevant data from atom class +{ + dev_array dx; // cumulated distance for binning settings + dev_array x; // position + dev_array v; // velocity + dev_array f; // force + dev_array tag; + dev_array type; // global ID number, there are ghosttype = ntypes (ntypescuda=ntypes+1) + dev_array mask; + dev_array image; + dev_array q; // charges + dev_array mass; // per-type masses + dev_array rmass; // per-atom masses + dev_array radius; // per-atom radius + dev_array density; + dev_array omega; + dev_array torque; + dev_array molecule; + + dev_array special; + int maxspecial; + dev_array nspecial; + int* special_flag; + int molecular; + + dev_array eatom; // per-atom energy + dev_array vatom; // per-atom virial + int need_eatom; + int need_vatom; + + dev_array x_type; // position + type in X_FLOAT4 struct + dev_array v_radius; // velocity + radius in V_FLOAT4 struct currently only used for granular atom_style + dev_array omega_rmass; // omega + rmass in V_FLOAT4 struct currently only used for granular atom_style + + double* mass_host; // remember per-type host pointer to masses + //int 
natoms; // total # of atoms in system, could be 0 + int nghost; // and ghost atoms on this proc + int nlocal; // # of owned + int nall; // total # of atoms in this proc + int nmax; // max # of owned+ghost in arrays on this proc + int ntypes; + int q_flag; // do we have charges? + int rmass_flag; // do we have per-atom masses? + int firstgroup; + int nfirst; + + int update_nlocal; + int update_nmax; + + dev_array xhold; // position at last neighboring + X_FLOAT triggerneighsq; // maximum square movement before reneighboring + int reneigh_flag; // is reneighboring necessary + int maxhold; // size of xhold + int dist_check; //perform distance check for reneighboring + dev_array binned_id; //id of each binned atom (not tag!!) + dev_array binned_idnew; //new id of each binned atom for sorting basically setting atom[binned_id[k]] at atom[binned_newid[k]] + float bin_extraspace; + int bin_dim[3]; + int bin_nmax; + dev_array map_array; +}; + +struct cuda_shared_pair // relevant data from pair class +{ + char cudable_force; // check for (cudable_force!=0) + X_FLOAT cut_global; + X_FLOAT cut_inner_global; + X_FLOAT cut_coul_global; + double** cut; // type-type cutoff + double** cutsq; // type-type cutoff + double** cut_inner; // type-type cutoff for coul + double** cut_coul; // type-type cutoff for coul + double** coeff1; // type-type pair parameters + double** coeff2; + double** coeff3; + double** coeff4; + double** coeff5; + double** coeff6; + double** coeff7; + double** coeff8; + double** coeff9; + double** coeff10; + double** offset; + double* special_lj; + double* special_coul; + dev_array virial; // ENERGY_FLOAT + dev_array eng_vdwl; // ENERGY_FLOAT + dev_array eng_coul; // ENERGY_FLOAT + X_FLOAT cut_coulsq_global; + F_FLOAT g_ewald,kappa; + int freeze_group_bit; + + dev_array coeff1_gm; + dev_array coeff2_gm; + dev_array coeff3_gm; + dev_array coeff4_gm; + dev_array coeff5_gm; + dev_array coeff6_gm; + dev_array coeff7_gm; + dev_array coeff8_gm; + dev_array coeff9_gm; 
+ dev_array coeff10_gm; + + int lastgridsize; + int n_energy_virial; + int collect_forces_later; + int use_block_per_atom; + int override_block_per_atom; + +}; + +struct cuda_shared_domain // relevent data from domain class +{ + X_FLOAT sublo[3]; // orthogonal box -> sub-box bounds on this proc + X_FLOAT subhi[3]; + X_FLOAT boxlo[3]; + X_FLOAT boxhi[3]; + X_FLOAT prd[3]; + int periodicity[3]; // xyz periodicity as array + + int triclinic; + X_FLOAT xy; + X_FLOAT xz; + X_FLOAT yz; + X_FLOAT boxlo_lamda[3]; + X_FLOAT boxhi_lamda[3]; + X_FLOAT prd_lamda[3]; + X_FLOAT h[6]; + X_FLOAT h_inv[6]; + V_FLOAT h_rate[6]; + int update; +}; + +struct cuda_shared_pppm +{ + char cudable_force; +#ifdef FFT_CUFFT + FFT_FLOAT* work1; + FFT_FLOAT* work2; + FFT_FLOAT* work3; + PPPM_FLOAT* greensfn; + PPPM_FLOAT* fkx; + PPPM_FLOAT* fky; + PPPM_FLOAT* fkz; + PPPM_FLOAT* vg; +#endif + int* part2grid; + PPPM_FLOAT* density_brick; + int* density_brick_int; + PPPM_FLOAT density_intScale; + PPPM_FLOAT* vdx_brick; + PPPM_FLOAT* vdy_brick; + PPPM_FLOAT* vdz_brick; + PPPM_FLOAT* density_fft; + ENERGY_FLOAT* energy; + ENERGY_FLOAT* virial; + int nxlo_in; + int nxhi_in; + int nxlo_out; + int nxhi_out; + int nylo_in; + int nyhi_in; + int nylo_out; + int nyhi_out; + int nzlo_in; + int nzhi_in; + int nzlo_out; + int nzhi_out; + int nx_pppm; + int ny_pppm; + int nz_pppm; + PPPM_FLOAT qqrd2e; + int order; + // float3 sublo; + PPPM_FLOAT* rho_coeff; + int nmax; + int nlocal; + PPPM_FLOAT* debugdata; + PPPM_FLOAT delxinv; + PPPM_FLOAT delyinv; + PPPM_FLOAT delzinv; + int nlower; + int nupper; + PPPM_FLOAT shiftone; + +}; + +struct cuda_shared_comm +{ + int maxswap; + int maxlistlength; + dev_array pbc; + dev_array slablo; + dev_array slabhi; + dev_array multilo; + dev_array multihi; + dev_array sendlist; + int grow_flag; + int comm_phase; + + int nsend; + int* nsend_swap; + int* send_size; + int* recv_size; + double** buf_send; + void** buf_send_dev; + double** buf_recv; + void** buf_recv_dev; + void* 
buffer; + int buffer_size; + double overlap_split_ratio; +}; + +struct cuda_shared_neighlist // member of CudaNeighList, has no instance in cuda_shared_data +{ + int maxlocal; + int inum; // # of I atoms neighbors are stored for local indices of I atoms + int inum_border2; + dev_array inum_border; // # of atoms which interact with border atoms + dev_array ilist; + dev_array ilist_border; + dev_array numneigh; + dev_array numneigh_inner; + dev_array numneigh_border; + dev_array firstneigh; + dev_array neighbors; + dev_array neighbors_border; + dev_array neighbors_inner; + int maxpage; + dev_array page_pointers; + dev_array* pages; + int maxneighbors; + int neigh_lists_per_page; + double** cutneighsq; + CUDA_FLOAT* cu_cutneighsq; + int* binned_id; + int* bin_dim; + int bin_nmax; + float bin_extraspace; + double maxcut; + dev_array ex_type; + int nex_type; + dev_array ex1_bit; + dev_array ex2_bit; + int nex_group; + dev_array ex_mol_bit; + int nex_mol; + +}; + +struct cuda_compile_settings // this is used to compare compile settings (i.e. 
precision) of the cu files, and the cpp files +{ + int prec_glob; + int prec_x; + int prec_v; + int prec_f; + int prec_pppm; + int prec_fft; + int cufft; + int arch; +}; + +struct cuda_timings_struct +{ + //Debug: + double test1; + double test2; + //transfers + double transfer_upload_tmp_constr; + double transfer_download_tmp_deconstr; + + //communication + double comm_forward_total; + double comm_forward_mpi_upper; + double comm_forward_mpi_lower; + double comm_forward_kernel_pack; + double comm_forward_kernel_unpack; + double comm_forward_kernel_self; + double comm_forward_upload; + double comm_forward_download; + + double comm_exchange_total; + double comm_exchange_mpi; + double comm_exchange_kernel_pack; + double comm_exchange_kernel_unpack; + double comm_exchange_kernel_fill; + double comm_exchange_cpu_pack; + double comm_exchange_upload; + double comm_exchange_download; + + double comm_border_total; + double comm_border_mpi; + double comm_border_kernel_pack; + double comm_border_kernel_unpack; + double comm_border_kernel_self; + double comm_border_kernel_buildlist; + double comm_border_upload; + double comm_border_download; + + //pair forces + double pair_xtype_conversion; + double pair_kernel; + double pair_virial; + double pair_force_collection; + + //neighbor + double neigh_bin; + double neigh_build; + double neigh_special; + + //PPPM + double pppm_particle_map; + double pppm_make_rho; + double pppm_brick2fft; + double pppm_poisson; + double pppm_fillbrick; + double pppm_fieldforce; + double pppm_compute; + +}; + +struct cuda_shared_data // holds space for all relevant data from the different classes +{ + void* buffer; //holds temporary GPU data [data used in subroutines, which does not have to be consistent outside of that routine] + int buffersize; //maxsize of buffer + int buffer_new; //should be 1 if the pointer to buffer has changed + void* flag; + void* debugdata; //array for easily collecting debugdata from device class cuda contains the corresponding 
cu_debugdata and host array + cuda_shared_atom atom; + cuda_shared_pair pair; + cuda_shared_domain domain; + cuda_shared_pppm pppm; + cuda_shared_comm comm; + cuda_compile_settings compile_settings; + cuda_timings_struct cuda_timings; + int exchange_dim; + int me; //mpi rank + unsigned int datamask; + int overlap_comm; +}; + + +#endif // #ifndef _CUDA_SHARED_H_ diff --git a/lib/cuda/cuda_wrapper.cu b/lib/cuda/cuda_wrapper.cu new file mode 100644 index 0000000000..d74f731da0 --- /dev/null +++ b/lib/cuda/cuda_wrapper.cu @@ -0,0 +1,315 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ + +#include +#include "cuda_shared.h" +#include "cuda_common.h" +#include "cuda_wrapper_cu.h" +#include "cuda_wrapper_kernel.cu" + +static int CudaWrapper_total_gpu_mem=0; +static double CudaWrapper_total_upload_time=0; +static double CudaWrapper_total_download_time=0; +static double CudaWrapper_cpubuffer_upload_time=0; +static double CudaWrapper_cpubuffer_download_time=0; +static cudaStream_t* streams; +static int nstreams=0; + +void CudaWrapper_Init(int argc, char** argv,int me,int ppn,int* devicelist) +{ + MYDBG( printf("# CUDA: debug mode on\n"); ) + + #if __DEVICE_EMULATION__ + + printf("# CUDA: emulation mode on\n"); + + #else + + // modified from cutil.h + static int deviceCount=0; + static bool sharedmode=false; + if(deviceCount && !sharedmode) return; + if(deviceCount && sharedmode) cudaThreadExit(); + + CUDA_SAFE_CALL_NO_SYNC( cudaGetDeviceCount(&deviceCount) ); + if (deviceCount == 0) + { + fprintf(stderr, "cutil error: no devices supporting CUDA.\n"); + exit(EXIT_FAILURE); + } + MYDBG( printf("# CUDA There are %i devices supporting CUDA in this system.\n",deviceCount);) + + cudaDeviceProp deviceProp[deviceCount]; + for(int i=0;ideviceCount) {printf("Asking for more GPUs per node when there are. Reduce gpu/node setting.\n"); exit(0);} + int devicea=me%ppn; + if(devicelist) devicea=devicelist[devicea]; + else + devicea=dev_list[devicea]; + if(devicea>=deviceCount) {printf("Asking for non existent GPU %i. 
Found only %i GPUs.\n",devicea,deviceCount); exit(0);} + MYDBG( + printf(" # CUDA myid: %i take device: %i\n",me,devicea); + ) + CUDA_SAFE_CALL( cudaSetDevice(devicea) ); + } + else + { + CUDA_SAFE_CALL( cudaSetValidDevices(dev_list,deviceCount) ); + } + cudaSetDeviceFlags(cudaDeviceMapHost); + cudaThreadSynchronize(); + + int dev; + CUDA_SAFE_CALL( cudaGetDevice(&dev)); + + if (deviceProp[dev].major < 1) + { + fprintf(stderr, "CUDA error: device does not support CUDA.\n"); + exit(EXIT_FAILURE); + } + else + if ((deviceProp[dev].major == 1)&&(deviceProp[dev].minor != 3)) + { + fprintf(stderr, "CUDA error: You need a device with compute capability 1.3 or higher (Device %i is a %s with CC %i.%i)\n",dev,deviceProp[dev].name,deviceProp[dev].major,deviceProp[dev].minor); + exit(EXIT_FAILURE); + } + if ((deviceProp[dev].major == 2)&&(CUDA_ARCH<20)) + { + fprintf(stderr, "CUDA warning: You are using a compute %i.%i or higher GPU while LAMMPScuda has been compiled for architecture 1.3\n",deviceProp[dev].major,deviceProp[dev].minor); + } + if ((deviceProp[dev].major == 1)&&(CUDA_ARCH>=20)) + { + fprintf(stderr, "CUDA error: You are using a compute 1.3 GPU while LAMMPScuda has been compiled for architecture %i\n",CUDA_ARCH); + exit(EXIT_FAILURE); + } + + +fprintf(stderr, "# Using device %d: %s\n", dev, deviceProp[dev].name); + MYDBG( fprintf(stderr, "# Using device %d: %s\n", dev, deviceProp[dev].name);) + + MYDBG + ( + printf("name = %s\n", deviceProp[dev].name); + printf("totalGlobalMem = %u\n", deviceProp[dev].totalGlobalMem); + printf("sharedMemPerBlock = %i\n", deviceProp[dev].sharedMemPerBlock); + printf("regsPerBlock = %i\n", deviceProp[dev].regsPerBlock); + printf("warpSize = %i\n", deviceProp[dev].warpSize); + printf("memPitch = %i\n", deviceProp[dev].memPitch); + printf("maxThreadsPerBlock = %i\n", deviceProp[dev].maxThreadsPerBlock); + printf("maxThreadsDim = [%i, %i, %i]\n", deviceProp[dev].maxThreadsDim[0], deviceProp[dev].maxThreadsDim[1], 
deviceProp[dev].maxThreadsDim[2]); + printf("maxGridSize = [%i, %i, %i]\n", deviceProp[dev].maxGridSize[0], deviceProp[dev].maxGridSize[1], deviceProp[dev].maxGridSize[2]); + printf("totalConstMem = %i\n", deviceProp[dev].totalConstMem); + printf("major . minor = %i . %i\n", deviceProp[dev].major, deviceProp[dev].minor); + printf("clockRate = %i\n", deviceProp[dev].clockRate); + printf("textureAlignment = %i\n", deviceProp[dev].textureAlignment); + printf("deviceOverlap = %i\n", deviceProp[dev].deviceOverlap); + printf("multiProcessorCount = %i\n", deviceProp[dev].multiProcessorCount); + printf("computeMode = %i\n", deviceProp[dev].computeMode); + ) + + #endif + } + +void* CudaWrapper_AllocCudaData(unsigned nbytes) +{ + void* dev_data; + CUDA_SAFE_CALL( cudaMalloc((void**)&dev_data, nbytes) ); + MYDBG( printf("# CUDA: allocated %u bytes on device at dev%p\n", nbytes, dev_data); ) + CudaWrapper_total_gpu_mem+=nbytes; + return dev_data; +} + +void CudaWrapper_UploadCudaData(void* host_data, void* dev_data, unsigned nbytes) +{ + MYDBG( printf("# CUDA: uploading %u bytes to device at dev%p from %p\n", nbytes, dev_data,host_data); ) + cudaThreadSynchronize(); + timespec time1,time2; + clock_gettime(CLOCK_REALTIME,&time1); + CUDA_SAFE_CALL( cudaMemcpy(dev_data, host_data, nbytes, cudaMemcpyHostToDevice) ); + clock_gettime(CLOCK_REALTIME,&time2); + CudaWrapper_total_upload_time+= + time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000; +} + +void CudaWrapper_UploadCudaDataAsync(void* host_data, void* dev_data, unsigned nbytes,int stream) +{ + MYDBG( printf("# CUDA: downloading %u bytes from device at dev%p\n", nbytes, dev_data); ) + cudaMemcpyAsync(dev_data, host_data, nbytes, cudaMemcpyHostToDevice,streams[stream]); +} + +void CudaWrapper_DownloadCudaData(void* host_data, void* dev_data, unsigned nbytes) +{ + MYDBG( printf("# CUDA: downloading %u bytes from device at dev%p\n", nbytes, dev_data); ) + cudaThreadSynchronize(); + timespec time1,time2; + 
clock_gettime(CLOCK_REALTIME,&time1); + CUDA_SAFE_CALL( cudaMemcpy(host_data, dev_data, nbytes, cudaMemcpyDeviceToHost) ); + clock_gettime(CLOCK_REALTIME,&time2); + CudaWrapper_total_download_time+= + time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000; +} + +void CudaWrapper_DownloadCudaDataAsync(void* host_data, void* dev_data, unsigned nbytes,int stream) +{ + MYDBG( printf("# CUDA: downloading %u bytes from device at dev%p\n", nbytes, dev_data); ) + cudaMemcpyAsync(host_data, dev_data, nbytes, cudaMemcpyDeviceToHost,streams[stream]); +} + +void CudaWrapper_FreeCudaData(void* dev_data,unsigned nbytes) +{ + MYDBG( printf("# CUDA: freeing memory at dev%p with %i bytes (last adress: %p)\n", dev_data,nbytes,(char*)dev_data+nbytes); ) + CUDA_SAFE_CALL( cudaFree(dev_data) ); + CudaWrapper_total_gpu_mem-=nbytes; +} + +void CudaWrapper_Memset(void* dev_data, int value, unsigned nbytes) +{ + MYDBG( printf("# CUDA: setting %u bytes to %i at dev%p\n", nbytes, value, dev_data); ) + CUDA_SAFE_CALL( cudaMemset(dev_data, value, nbytes) ); +} + +void CudaWrapper_CopyData(void* dev_dest, void* dev_source, unsigned nbytes) +{ + MYDBG( printf("# CUDA: copy %u bytes from dev%p to dev%p\n", nbytes, dev_source,dev_dest); ) + CUDA_SAFE_CALL( cudaMemcpy(dev_dest, dev_source, nbytes, cudaMemcpyDeviceToDevice) ); +} + +void* CudaWrapper_AllocPinnedHostData(unsigned nbytes,bool mapped,bool writeCombined) +{ + void* host_data; + int flags=0; + if(mapped) flags=flags | cudaHostAllocMapped; + if(writeCombined) flags=flags | cudaHostAllocWriteCombined; + + CUDA_SAFE_CALL( cudaHostAlloc((void**)&host_data, nbytes,flags) ); +// CUDA_SAFE_CALL( cudaMallocHost((void**)&host_data, nbytes) ); + MYDBG( printf("# CUDA: allocated %u bytes pinned memory on host at %p\n", nbytes, host_data); ) + return host_data; +} + +void CudaWrapper_FreePinnedHostData(void* host_data) +{ + MYDBG( printf("# CUDA: freeing pinned host memory at %p \n",host_data); ) + if(host_data) + CUDA_SAFE_CALL( 
cudaFreeHost(host_data) ); +} + +void cuda_check_error(char* comment) +{ + printf("ERROR-CUDA %s %s\n",comment,cudaGetErrorString(cudaGetLastError())); +} + +int CudaWrapper_CheckMemUseage() +{ + size_t free,total; + cudaMemGetInfo(&free,&total); + return total-free; //possible with cuda 3.0 ??? + //return CudaWrapper_total_gpu_mem; +} + +double CudaWrapper_CheckUploadTime(bool reset) +{ + if(reset) CudaWrapper_total_upload_time=0.0; + return CudaWrapper_total_upload_time; +} + +double CudaWrapper_CheckDownloadTime(bool reset) +{ + if(reset) CudaWrapper_total_download_time=0.0; + return CudaWrapper_total_download_time; +} + +double CudaWrapper_CheckCPUBufUploadTime(bool reset) +{ + if(reset) CudaWrapper_cpubuffer_upload_time=0.0; + return CudaWrapper_cpubuffer_upload_time; +} + +double CudaWrapper_CheckCPUBufDownloadTime(bool reset) +{ + if(reset) CudaWrapper_cpubuffer_download_time=0.0; + return CudaWrapper_cpubuffer_download_time; +} + +void CudaWrapper_AddCPUBufUploadTime(double dt) +{ + CudaWrapper_cpubuffer_upload_time+=dt; +} + +void CudaWrapper_AddCPUBufDownloadTime(double dt) +{ + CudaWrapper_cpubuffer_download_time+=dt; +} + +void CudaWrapper_Sync() +{ + cudaThreadSynchronize(); +} + +void CudaWrapper_SyncStream(int stream) +{ + cudaStreamSynchronize(streams[stream]); +} + +void CudaWrapper_AddStreams(int n) +{ + cudaStream_t* new_streams=new cudaStream_t[nstreams+n]; + for(int i=0;i0) + delete [] streams; + streams=new_streams; + nstreams+=n; +} + +void* CudaWrapper_returnStreams() +{ + return (void*) streams; +} + +int CudaWrapper_returnNStreams() +{ + return nstreams; +} + diff --git a/lib/cuda/cuda_wrapper_cu.h b/lib/cuda/cuda_wrapper_cu.h new file mode 100644 index 0000000000..85d51a8586 --- /dev/null +++ b/lib/cuda/cuda_wrapper_cu.h @@ -0,0 +1,52 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia 
National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#ifndef _CUDA_DATA_WRAPPER_H_ +#define _CUDA_DATA_WRAPPER_H_ + +extern "C" void CudaWrapper_Init(int argc, char** argv,int me=0,int ppn=2,int* devicelist=NULL); +extern "C" void* CudaWrapper_AllocCudaData(unsigned nbytes); +extern "C" void CudaWrapper_UploadCudaData(void* host_data, void* dev_data, unsigned nbytes); +extern "C" void CudaWrapper_UploadCudaDataAsync(void* host_data, void* dev_data, unsigned nbytes, int stream_id); +extern "C" void CudaWrapper_DownloadCudaData(void* host_data, void* dev_data, unsigned nbytes); +extern "C" void CudaWrapper_DownloadCudaDataAsync(void* host_data, void* dev_data, unsigned nbytes, int stream_id); +extern "C" void CudaWrapper_FreeCudaData(void* dev_data, unsigned nbytes=0); +extern "C" void CudaWrapper_Memset(void* dev_data, int value, unsigned nbytes); +extern "C" void CudaWrapper_CopyData(void* dev_dest, void* dev_source, unsigned nbytes); +extern "C" void* CudaWrapper_AllocPinnedHostData(unsigned nbytes, bool mapped=false, bool writeCombind=false); +extern "C" void CudaWrapper_FreePinnedHostData(void* dev_data); +extern "C" void cuda_check_error(char* comment); +extern "C" int CudaWrapper_CheckMemUseage(); +extern "C" double CudaWrapper_CheckUploadTime(bool reset=false); +extern "C" double CudaWrapper_CheckDownloadTime(bool reset=false); +extern "C" double 
CudaWrapper_CheckCPUBufUploadTime(bool reset=false); +extern "C" double CudaWrapper_CheckCPUBufDownloadTime(bool reset=false); +extern "C" void CudaWrapper_AddCPUBufUploadTime(double dt); +extern "C" void CudaWrapper_AddCPUBufDownloadTime(double dt); +extern "C" void CudaWrapper_Sync(); +extern "C" void CudaWrapper_SyncStream(int n); +extern "C" void CudaWrapper_AddStreams(int n); +extern "C" void* CudaWrapper_returnStreams(); +extern "C" int CudaWrapper_returnNStreams(); + +#endif // _CUDA_DATA_WRAPPER_H_ diff --git a/lib/cuda/cuda_wrapper_kernel.cu b/lib/cuda/cuda_wrapper_kernel.cu new file mode 100644 index 0000000000..951563b67b --- /dev/null +++ b/lib/cuda/cuda_wrapper_kernel.cu @@ -0,0 +1,24 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ + +// empty file to obey common make rule diff --git a/lib/cuda/domain.cu b/lib/cuda/domain.cu new file mode 100644 index 0000000000..0f1583dda1 --- /dev/null +++ b/lib/cuda/domain.cu @@ -0,0 +1,194 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ + +#include +#define MY_PREFIX domain +#include "cuda_shared.h" +#include "cuda_common.h" + +#include "crm_cuda_utils.cu" + +#include "domain_cu.h" +#include "domain_kernel.cu" + +void Cuda_Domain_UpdateBuffer(cuda_shared_data* sdata,int size) +{ + if(sdata->buffersizebuffer,sdata->buffersize);) + CudaWrapper_FreeCudaData(sdata->buffer,sdata->buffersize); + sdata->buffer = CudaWrapper_AllocCudaData(size); + sdata->buffersize=size; + sdata->buffer_new++; + MYDBG(printf("New buffer at %p with %i kB\n",sdata->buffer,sdata->buffersize);) + } + cudaMemcpyToSymbol(MY_CONST(buffer), & sdata->buffer, sizeof(int*) ); +} + +void Cuda_Domain_UpdateNmax(cuda_shared_data* sdata) +{ + cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); + cudaMemcpyToSymbol(MY_CONST(nmax) , & sdata->atom.nmax , sizeof(int) ); + cudaMemcpyToSymbol(MY_CONST(x) , & sdata->atom.x .dev_data, sizeof(X_FLOAT*) ); + cudaMemcpyToSymbol(MY_CONST(v) , & sdata->atom.v .dev_data, sizeof(V_FLOAT*) ); + cudaMemcpyToSymbol(MY_CONST(mask) , & sdata->atom.mask .dev_data, sizeof(int*) ); + cudaMemcpyToSymbol(MY_CONST(tag) , & sdata->atom.tag .dev_data, sizeof(int*) ); + cudaMemcpyToSymbol(MY_CONST(image) , & sdata->atom.image.dev_data, sizeof(int*) ); +} + +void Cuda_Domain_UpdateDomain(cuda_shared_data* sdata) +{ + cudaMemcpyToSymbol(MY_CONST(boxlo) , sdata->domain.boxlo , 3*sizeof(X_FLOAT)); + cudaMemcpyToSymbol(MY_CONST(boxhi) , sdata->domain.boxhi , 3*sizeof(X_FLOAT)); + cudaMemcpyToSymbol(MY_CONST(sublo) , sdata->domain.sublo , 3*sizeof(X_FLOAT)); + cudaMemcpyToSymbol(MY_CONST(subhi) , sdata->domain.subhi , 3*sizeof(X_FLOAT)); + cudaMemcpyToSymbol(MY_CONST(prd) , sdata->domain.prd , 3*sizeof(X_FLOAT)); + cudaMemcpyToSymbol(MY_CONST(periodicity) , sdata->domain.periodicity , 3*sizeof(int)); + cudaMemcpyToSymbol(MY_CONST(triclinic) , & sdata->domain.triclinic , sizeof(int) ); + 
cudaMemcpyToSymbol(MY_CONST(boxlo_lamda) , sdata->domain.boxlo_lamda , 3*sizeof(X_FLOAT)); + cudaMemcpyToSymbol(MY_CONST(boxhi_lamda) , sdata->domain.boxhi_lamda , 3*sizeof(X_FLOAT)); + cudaMemcpyToSymbol(MY_CONST(prd_lamda) , sdata->domain.prd_lamda , 3*sizeof(X_FLOAT)); + cudaMemcpyToSymbol(MY_CONST(h) , sdata->domain.h , 6*sizeof(X_FLOAT)); + cudaMemcpyToSymbol(MY_CONST(h_inv) , sdata->domain.h_inv , 6*sizeof(X_FLOAT)); + cudaMemcpyToSymbol(MY_CONST(h_rate) , sdata->domain.h_rate , 6*sizeof(V_FLOAT)); + cudaMemcpyToSymbol(MY_CONST(flag) , &sdata->flag , sizeof(int*)); + cudaMemcpyToSymbol(MY_CONST(debugdata) , &sdata->debugdata , sizeof(int*)); +} + +void Cuda_Domain_Init(cuda_shared_data* sdata) +{ + Cuda_Domain_UpdateNmax(sdata); + Cuda_Domain_UpdateDomain(sdata); +} + +void Cuda_Domain_PBC(cuda_shared_data* sdata,int deform_remap,int deform_groupbit,double* extent) +{ + Cuda_Domain_UpdateNmax(sdata); + //if(sdata->domain.update) + Cuda_Domain_UpdateDomain(sdata); + cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); + + int box_change=0; + if(extent) box_change=1; + + int sharedmem=0; + if(box_change) sharedmem=6*sizeof(X_FLOAT); + + int3 layout=getgrid(sdata->atom.nlocal,sharedmem); + dim3 threads(layout.z, 1, 1); + dim3 grid(layout.x, layout.y, 1); + sharedmem*=threads.x; + + if((box_change)&&(sdata->buffer_new or (6*sizeof(X_FLOAT)*grid.x*grid.y>sdata->buffersize))) + Cuda_Domain_UpdateBuffer(sdata,layout.x*layout.y*6*sizeof(X_FLOAT)); + + + Domain_PBC_Kernel<<>>(deform_remap,deform_groupbit,box_change); + cudaThreadSynchronize(); + + CUT_CHECK_ERROR("Cuda_Domain_PBC: Kernel execution failed"); + if(box_change) + { + X_FLOAT buf2[6*layout.x*layout.y]; + X_FLOAT* buf=buf2; + int flag; + cudaMemcpy(buf, sdata->buffer, 6*layout.x*layout.y*sizeof(X_FLOAT), cudaMemcpyDeviceToHost); + cudaMemcpy(&flag, sdata->flag, sizeof(int), cudaMemcpyDeviceToHost); + //printf("Flag: %i\n",flag); + X_FLOAT min,max; + min=1.0*BIG; + max=-1.0*BIG; + 
for(int i=0;imax) max=buf[i+layout.x*layout.y]; + } + extent[0]=min; + extent[1]=max; + + buf+=2*layout.x*layout.y; + min=1.0*BIG; + max=-1.0*BIG; + for(int i=0;imax) max=buf[i+layout.x*layout.y]; + } + extent[2]=min; + extent[3]=max; + + buf+=2*layout.x*layout.y; + min=1.0*BIG; + max=-1.0*BIG; + for(int i=0;imax) max=buf[i+layout.x*layout.y]; + } + extent[4]=min; + extent[5]=max; + //printf("Extent: %lf %lf %lf %lf %lf %lf\n",extent[0],extent[1],extent[2],extent[3],extent[4],extent[5]); +/* int n=grid.x*grid.y; + if(n<128) threads.x=32; + else if(n<256) threads.x=64; + else threads.x=128; + sharedmem=n*sizeof(X_FLOAT); + grid.x=6; + grid.y=1; + Domain_reduceBoxExtent<<>>(extent,n); + cudaThreadSynchronize(); + CUT_CHECK_ERROR("Cuda_Domain_reduceBoxExtent: Kernel execution failed");*/ + } +} + +void Cuda_Domain_lamda2x(cuda_shared_data* sdata,int n) +{ + Cuda_Domain_UpdateNmax(sdata); + //if(sdata->domain.update) + Cuda_Domain_UpdateDomain(sdata); + cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); + + int3 layout=getgrid(n); + dim3 threads(layout.z, 1, 1); + dim3 grid(layout.x, layout.y, 1); + + Domain_lamda2x_Kernel<<>>(n); + cudaThreadSynchronize(); + CUT_CHECK_ERROR("Cuda_Domain_lamda2x: Kernel execution failed"); +} + +void Cuda_Domain_x2lamda(cuda_shared_data* sdata,int n) +{ + Cuda_Domain_UpdateNmax(sdata); + //if(sdata->domain.update) + Cuda_Domain_UpdateDomain(sdata); + cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); + + int3 layout=getgrid(n); + dim3 threads(layout.z, 1, 1); + dim3 grid(layout.x, layout.y, 1); + + Domain_x2lamda_Kernel<<>>(n); + cudaThreadSynchronize(); + CUT_CHECK_ERROR("Cuda_Domain_x2lamda: Kernel execution failed"); +} diff --git a/lib/cuda/domain_cu.h b/lib/cuda/domain_cu.h new file mode 100644 index 0000000000..f04e5610c2 --- /dev/null +++ b/lib/cuda/domain_cu.h @@ -0,0 +1,29 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale 
Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#include "cuda_shared.h" + +extern "C" void Cuda_Domain_Init(cuda_shared_data* sdata); +extern "C" void Cuda_Domain_PBC(cuda_shared_data* sdata,int deform_remap,int deform_groupbit,double* extent=NULL); +extern "C" void Cuda_Domain_lamda2x(cuda_shared_data* sdata,int n); +extern "C" void Cuda_Domain_x2lamda(cuda_shared_data* sdata,int n); diff --git a/lib/cuda/domain_kernel.cu b/lib/cuda/domain_kernel.cu new file mode 100644 index 0000000000..ec5ef897c1 --- /dev/null +++ b/lib/cuda/domain_kernel.cu @@ -0,0 +1,269 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. 
+ + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +extern __shared__ X_FLOAT sharedmem[]; + +#define BIG 1e10 +__global__ void Domain_PBC_Kernel(int deform_remap,int deform_groupbit,int box_change) +{ + int idim,otherdims; + int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x; + + X_FLOAT lo[3]; + X_FLOAT hi[3]; + X_FLOAT* period; + + if (_triclinic == 0) { + lo[0] = _boxlo[0]; + lo[1] = _boxlo[1]; + lo[2] = _boxlo[2]; + + hi[0] = _boxhi[0]; + hi[1] = _boxhi[1]; + hi[2] = _boxhi[2]; + period = _prd; + } else { + lo[0] = _boxlo_lamda[0]; + lo[1] = _boxlo_lamda[1]; + lo[2] = _boxlo_lamda[2]; + + hi[0] = _boxhi_lamda[0]; + hi[1] = _boxhi_lamda[1]; + hi[2] = _boxhi_lamda[2]; + period = _prd_lamda; + } + + + X_FLOAT tmpx=X_F(0.5)*(hi[0]+lo[0]); + X_FLOAT tmpy=X_F(0.5)*(hi[1]+lo[1]); + X_FLOAT tmpz=X_F(0.5)*(hi[2]+lo[2]); + + X_FLOAT* buf=(X_FLOAT*) _buffer; + buf+=blockIdx.x*gridDim.y+blockIdx.y; + buf[0]=tmpx; + buf+=gridDim.x*gridDim.y; + buf[0]=tmpx; + buf+=gridDim.x*gridDim.y; + buf[0]=tmpy; + buf+=gridDim.x*gridDim.y; + buf[0]=tmpy; + buf+=gridDim.x*gridDim.y; + buf[0]=tmpz; + buf+=gridDim.x*gridDim.y; + buf[0]=tmpz; + + if(i<_nlocal) + { + + if (_periodicity[0]) { + if (_x[i] < lo[0]) { + _x[i] += period[0]; + if (deform_remap && _mask[i] & deform_groupbit) _v[i] += _h_rate[0]; + idim = _image[i] & 1023; + otherdims = _image[i] ^ idim; + idim--; + idim &= 1023; + _image[i] = otherdims | idim; + } + if (_x[i] >= hi[0]) { + _x[i] -= period[0]; + _x[i] = 
MAX(_x[i],lo[0]); + if (deform_remap && _mask[i] & deform_groupbit) _v[i] -= _h_rate[0]; + idim = _image[i] & 1023; + otherdims = _image[i] ^ idim; + idim++; + idim &= 1023; + _image[i] = otherdims | idim; + } + } + + if (_periodicity[1]) { + if (_x[i+_nmax] < lo[1]) { + _x[i+_nmax] += period[1]; + if (deform_remap && _mask[i] & deform_groupbit) { + _v[i] += _h_rate[5]; + _v[i+_nmax] += _h_rate[1]; + } + idim = (_image[i] >> 10) & 1023; + otherdims = _image[i] ^ (idim << 10); + idim--; + idim &= 1023; + _image[i] = otherdims | (idim << 10); + } + if (_x[i+_nmax] >= hi[1]) { + _x[i+_nmax] -= period[1]; + _x[i+_nmax] = MAX(_x[i+_nmax],lo[1]); + if (deform_remap && _mask[i] & deform_groupbit) { + _v[i] -= _h_rate[5]; + _v[i+_nmax] -= _h_rate[1]; + } + idim = (_image[i] >> 10) & 1023; + otherdims = _image[i] ^ (idim << 10); + idim++; + idim &= 1023; + _image[i] = otherdims | (idim << 10); + } + } + + if (_periodicity[2]) { + if (_x[i+2*_nmax] < lo[2]) { + _x[i+2*_nmax] += period[2]; + if (deform_remap && _mask[i] & deform_groupbit) { + _v[i] += _h_rate[4]; + _v[i+_nmax] += _h_rate[3]; + _v[i+2*_nmax] += _h_rate[2]; + } + idim = _image[i] >> 20; + otherdims = _image[i] ^ (idim << 20); + idim--; + idim &= 1023; + _image[i] = otherdims | (idim << 20); + } + if (_x[i+2*_nmax] >= hi[2]) { + _x[i+2*_nmax] -= period[2]; + _x[i+2*_nmax] = MAX(_x[i+2*_nmax],lo[2]); + if (deform_remap && _mask[i] & deform_groupbit) { + _v[i] -= _h_rate[4]; + _v[i+_nmax] -= _h_rate[3]; + _v[i+2*_nmax] -= _h_rate[2]; + } + idim = _image[i] >> 20; + otherdims = _image[i] ^ (idim << 20); + idim++; + idim &= 1023; + _image[i] = otherdims | (idim << 20); + } + } + if(box_change) + { + tmpx=_x[i]; + tmpy=_x[i+_nmax]; + tmpz=_x[i+2*_nmax]; + + + } + } + __syncthreads(); + if(box_change) + { + X_FLOAT minx=BIG; + X_FLOAT maxx=-BIG; + X_FLOAT miny=BIG; + X_FLOAT maxy=-BIG; + X_FLOAT minz=BIG; + X_FLOAT maxz=-BIG; + + if (not _periodicity[0]) { + sharedmem[threadIdx.x]=tmpx; + minOfBlock(sharedmem); + 
minx=sharedmem[0]; + __syncthreads(); + sharedmem[threadIdx.x]=tmpx; + maxOfBlock(sharedmem); + maxx=sharedmem[0]; + __syncthreads(); + } + else {minx=lo[0];maxx=hi[0];} + if (not _periodicity[1]) { + sharedmem[threadIdx.x]=tmpy; + minOfBlock(sharedmem); + miny=sharedmem[0]; + __syncthreads(); + sharedmem[threadIdx.x]=tmpy; + maxOfBlock(sharedmem); + maxy=sharedmem[0]; + __syncthreads(); + } + else {miny=lo[1];maxy=hi[1];} + if (not _periodicity[2]) { + sharedmem[threadIdx.x]=tmpz; + minOfBlock(sharedmem); + minz=sharedmem[0]; + __syncthreads(); + sharedmem[threadIdx.x]=tmpz; + maxOfBlock(sharedmem); + maxz=sharedmem[0]; + __syncthreads(); + } + else {minz=lo[2];maxz=hi[2];} + if(threadIdx.x==0) + { + buf=(X_FLOAT*) _buffer; + buf+=blockIdx.x*gridDim.y+blockIdx.y; + buf[0]=minx; + buf+=gridDim.x*gridDim.y; + buf[0]=maxx; + buf+=gridDim.x*gridDim.y; + buf[0]=miny; + buf+=gridDim.x*gridDim.y; + buf[0]=maxy; + buf+=gridDim.x*gridDim.y; + buf[0]=minz; + buf+=gridDim.x*gridDim.y; + buf[0]=maxz; + } + } +} + +__global__ void Domain_reduceBoxExtent(double* extent,int n) +{ + X_FLOAT* buf=(X_FLOAT*) _buffer; + buf+=blockIdx.x*n; + copyGlobToShared(buf,sharedmem,n); + if(blockIdx.x%2==0) + minOfData(sharedmem,n); + else + maxOfData(sharedmem,n); + extent[blockIdx.x]=sharedmem[0]; +} + +__global__ void Domain_lamda2x_Kernel(int n) +{ + int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x; + if(i + +void initfftdata(double* in,FFT_FLOAT* out,int nfast,int nmid,int nslow) +{ + + dim3 grid; + grid.x=nslow; + grid.y=nmid; + grid.z=1; + dim3 threads; + threads.x=nfast; + threads.y=1; + threads.z=1; + cudaThreadSynchronize(); + initfftdata_kernel<<>>(in,out); + cudaThreadSynchronize(); + MYDBG(printf("ERROR-CUDA initfftdata_kernel: %s\n",cudaGetErrorString(cudaGetLastError()))); +} + + +void permute(FFT_DATA* in,FFT_DATA* out,int nfast,int nmid,int nslow) +{ + + dim3 grid; + grid.x=nslow; + grid.y=nmid; + grid.z=1; + dim3 threads; + threads.x=nfast*2; + threads.y=1; + 
threads.z=1; + permute_kernel<<>>((FFT_FLOAT*)in,(FFT_FLOAT*)out); + cudaThreadSynchronize(); + MYDBG(printf("ERROR-CUDA permute_kernel: %s\n",cudaGetErrorString(cudaGetLastError()))); +} + +void permute_scale(FFT_DATA* in,FFT_DATA* out,int nfast,int nmid,int nslow) +{ + + dim3 grid; + grid.x=nslow; + grid.y=nmid; + grid.z=1; + dim3 threads; + threads.x=nfast*2; + threads.y=1; + threads.z=1; + permute_kernel<<>>((FFT_FLOAT*)in,(FFT_FLOAT*)out); + cudaThreadSynchronize(); +} +void permute_part(FFT_DATA* in,FFT_DATA* out,int nfast,int nmid,int nslow,int ihi,int ilo,int jhi,int jlo,int khi,int klo) +{ + + dim3 grid; + grid.x=(ihi-ilo+1); + grid.y=(jhi-jlo+1); + grid.z=1; + dim3 threads; + threads.x=(khi-klo+1)*2; + threads.y=1; + threads.z=1; + permute_part_kernel<<>>((FFT_FLOAT*)in,(FFT_FLOAT*)out,nfast,nmid,nslow,ihi,ilo,jhi,jlo,khi,klo); + cudaThreadSynchronize(); + } + + void FFTsyncthreads() + { + cudaThreadSynchronize(); + } + diff --git a/lib/cuda/fft3d_cuda_cu.h b/lib/cuda/fft3d_cuda_cu.h new file mode 100644 index 0000000000..426b61d40c --- /dev/null +++ b/lib/cuda/fft3d_cuda_cu.h @@ -0,0 +1,30 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ + +#include "cuda_shared.h" + +extern "C" void initfftdata(double* in,FFT_FLOAT* out,int nfast,int nmid,int nslow); +extern "C" void permute(FFT_DATA* in,FFT_DATA* out,int nfast,int nmid,int nslow); +extern "C" void permute_scale(FFT_DATA* in,FFT_DATA* out,int nfast,int nmid,int nslow); +extern "C" void permute_part(FFT_DATA* in,FFT_DATA* out,int nfast,int nmid,int nslow,int ihi,int ilo,int jhi,int jlo,int khi,int klo); +extern "C" void FFTsyncthreads(); diff --git a/lib/cuda/fft3d_cuda_kernel.cu b/lib/cuda/fft3d_cuda_kernel.cu new file mode 100644 index 0000000000..0ee414998f --- /dev/null +++ b/lib/cuda/fft3d_cuda_kernel.cu @@ -0,0 +1,44 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ + +__global__ void initfftdata_kernel(double* in,FFT_FLOAT* out) +{ + out[2*(((blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x)+threadIdx.x)]=in[((blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x)+threadIdx.x]; + out[2*(((blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x)+threadIdx.x)+1]=0; +} + + +__global__ void permute_kernel(FFT_FLOAT* in,FFT_FLOAT* out) +{ + out[2*(((threadIdx.x/2)*gridDim.x+blockIdx.x)*gridDim.y+blockIdx.y)+threadIdx.x-2*(threadIdx.x/2)]=in[((blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x)+threadIdx.x]; +} + +__global__ void permute_scale_kernel(FFT_FLOAT* in,FFT_FLOAT* out) +{ + out[2*(((threadIdx.x/2)*gridDim.x+blockIdx.x)*gridDim.y+blockIdx.y)+threadIdx.x-2*(threadIdx.x/2)]=in[((blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x)+threadIdx.x]*gridDim.x*gridDim.y*blockDim.x*0.5; +} + +__global__ void permute_part_kernel(FFT_FLOAT* in,FFT_FLOAT* out,int nfast,int nmid,int nslow,int ihi,int ilo,int jhi,int jlo,int khi,int klo) +{ + {out[2*((threadIdx.x/2)*(ihi-ilo+1)*(jhi-jlo+1)+(blockIdx.x)*(jhi-jlo+1)+blockIdx.y-jlo)+threadIdx.x-2*(threadIdx.x/2)]=in[2*(blockIdx.x+ilo)*nmid*nslow+2*(blockIdx.y+jlo)*nmid+threadIdx.x+2*klo]; } +} diff --git a/lib/cuda/fix_addforce_cuda.cu b/lib/cuda/fix_addforce_cuda.cu new file mode 100644 index 0000000000..33700b44b6 --- /dev/null +++ b/lib/cuda/fix_addforce_cuda.cu @@ -0,0 +1,89 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. 
+ + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#include +#define MY_PREFIX fix_add_force_cuda +#include "cuda_shared.h" +#include "cuda_common.h" + +#include "crm_cuda_utils.cu" + +#include "fix_addforce_cuda_cu.h" +#include "fix_addforce_cuda_kernel.cu" + +void Cuda_FixAddForceCuda_UpdateBuffer(cuda_shared_data* sdata) +{ + int3 layout=getgrid(sdata->atom.nlocal,4*sizeof(F_FLOAT)); + dim3 threads(layout.z, 1, 1); + dim3 grid(layout.x, layout.y, 1); + int size=(unsigned)(layout.z*layout.y*layout.x)*4*sizeof(F_FLOAT); + if(sdata->buffersizebuffer,sdata->buffersize);) + CudaWrapper_FreeCudaData(sdata->buffer,sdata->buffersize); + sdata->buffer = CudaWrapper_AllocCudaData(size); + sdata->buffersize=size; + sdata->buffer_new++; + MYDBG(printf("New buffer at %p with %i kB\n",sdata->buffer,sdata->buffersize);) + } + cudaMemcpyToSymbol(MY_CONST(buffer), & sdata->buffer, sizeof(int*) ); +} + +void Cuda_FixAddForceCuda_UpdateNmax(cuda_shared_data* sdata) +{ + cudaMemcpyToSymbol(MY_CONST(mask) , & sdata->atom.mask .dev_data, sizeof(int*) ); + cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); + cudaMemcpyToSymbol(MY_CONST(nmax) , & sdata->atom.nmax , sizeof(int) ); + cudaMemcpyToSymbol(MY_CONST(x) , & sdata->atom.x .dev_data, sizeof(X_FLOAT*) ); + cudaMemcpyToSymbol(MY_CONST(f) , & sdata->atom.f .dev_data, sizeof(F_FLOAT*) ); +} + +void Cuda_FixAddForceCuda_Init(cuda_shared_data* sdata) +{ + Cuda_FixAddForceCuda_UpdateNmax(sdata); +} + +void 
Cuda_FixAddForceCuda_PostForce(cuda_shared_data* sdata, int groupbit,F_FLOAT axvalue,F_FLOAT ayvalue,F_FLOAT azvalue,F_FLOAT* aforiginal) +{ + if(sdata->atom.update_nmax) + Cuda_FixAddForceCuda_UpdateNmax(sdata); + if(sdata->atom.update_nlocal) + cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); + if(sdata->buffer_new) + Cuda_FixAddForceCuda_UpdateBuffer(sdata); + int3 layout=getgrid(sdata->atom.nlocal,4*sizeof(F_FLOAT)); + dim3 threads(layout.z, 1, 1); + dim3 grid(layout.x, layout.y, 1); + + Cuda_FixAddForceCuda_PostForce_Kernel<<>> (groupbit,axvalue,ayvalue,azvalue); + cudaThreadSynchronize(); + CUT_CHECK_ERROR("Cuda_FixAddForceCuda_PostForce: fix add_force post_force Compute Kernel execution failed"); + + int oldgrid=grid.x; + grid.x=4; + threads.x=512; + reduce_foriginal<<>> (oldgrid,aforiginal); + cudaThreadSynchronize(); + CUT_CHECK_ERROR("Cuda_FixAddForceCuda_PostForce: fix add_force post_force Reduce Kernel execution failed"); + +} diff --git a/lib/cuda/fix_addforce_cuda_cu.h b/lib/cuda/fix_addforce_cuda_cu.h new file mode 100644 index 0000000000..8aff462666 --- /dev/null +++ b/lib/cuda/fix_addforce_cuda_cu.h @@ -0,0 +1,27 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ + +#include "cuda_shared.h" + +extern "C" void Cuda_FixAddForceCuda_Init(cuda_shared_data* sdata); +extern "C" void Cuda_FixAddForceCuda_PostForce(cuda_shared_data* sdata, int groupbit,F_FLOAT axvalue,F_FLOAT ayvalue,F_FLOAT azvalue,F_FLOAT* aforiginal); diff --git a/lib/cuda/fix_addforce_cuda_kernel.cu b/lib/cuda/fix_addforce_cuda_kernel.cu new file mode 100644 index 0000000000..bbfbdbe35a --- /dev/null +++ b/lib/cuda/fix_addforce_cuda_kernel.cu @@ -0,0 +1,86 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */
+
+// Dynamic shared memory staging area for the per-block reductions below.
+extern __shared__ F_FLOAT sharedmem[];
+
+
+// Post-force kernel for fix addforce: adds the constant force
+// (xvalue,yvalue,zvalue) to every owned atom whose mask matches groupbit.
+// Atom arrays are SoA with stride _nmax (components at i, i+_nmax, i+2*_nmax).
+// Per-block partials are staged in four shared-memory slots of blockDim.x
+// values each, so the launch must supply >= 4*blockDim.x*sizeof(F_FLOAT)
+// of dynamic shared memory:
+//   slot 0:   -(xvalue*x + yvalue*y + zvalue*z)  (energy-style accumulator)
+//   slots 1-3: the force components *before* the addition ("foriginal")
+// Thread 0 writes the block's four sums into _buffer as four planes of
+// gridDim.x*gridDim.y values, summed later by reduce_foriginal.
+// NOTE(review): no __syncthreads() between the shared-memory writes and
+// reduceBlock() -- presumably reduceBlock() synchronizes internally; confirm.
+__global__ void Cuda_FixAddForceCuda_PostForce_Kernel(int groupbit,F_FLOAT xvalue,F_FLOAT yvalue,F_FLOAT zvalue)
+{
+ int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x;
+
+ sharedmem[threadIdx.x]=0;
+ sharedmem[threadIdx.x+blockDim.x]=0;
+ sharedmem[threadIdx.x+2*blockDim.x]=0;
+ sharedmem[threadIdx.x+3*blockDim.x]=0;
+
+ if(i < _nlocal)
+ if (_mask[i] & groupbit)
+ //if (iregion >= 0 &&
+ //match(x[i][0],x[i][1],x[i][2],iregion)) //currently not supported
+ {
+ sharedmem[threadIdx.x]=-xvalue*_x[i] - yvalue*_x[i+1*_nmax] - zvalue*_x[i+2*_nmax];
+ sharedmem[threadIdx.x+blockDim.x]=_f[i];
+ sharedmem[threadIdx.x+2*blockDim.x]=_f[i+1*_nmax];
+ sharedmem[threadIdx.x+3*blockDim.x]=_f[i+2*_nmax];
+ _f[i] += xvalue;
+ _f[i+1*_nmax] += yvalue;
+ _f[i+2*_nmax] += zvalue;
+ }
+
+ reduceBlock(sharedmem);
+ reduceBlock(&sharedmem[blockDim.x]);
+ reduceBlock(&sharedmem[2*blockDim.x]);
+ reduceBlock(&sharedmem[3*blockDim.x]);
+ F_FLOAT* buffer=(F_FLOAT*) _buffer;
+ if(threadIdx.x==0)
+ {
+ buffer[blockIdx.x*gridDim.y+blockIdx.y]=sharedmem[0];
+ buffer[blockIdx.x*gridDim.y+blockIdx.y+gridDim.x*gridDim.y]=sharedmem[blockDim.x];
+ buffer[blockIdx.x*gridDim.y+blockIdx.y+2*gridDim.x*gridDim.y]=sharedmem[2*blockDim.x];
+ buffer[blockIdx.x*gridDim.y+blockIdx.y+3*gridDim.x*gridDim.y]=sharedmem[3*blockDim.x];
+ }
+
+}
+
+
+// Second-stage reduction: sums the per-block partials left in _buffer
+// (blockIdx.x selects one of the four planes of n values).
+// NOTE(review): this definition is TRUNCATED in this chunk -- everything from
+// "while(i" through the next file's diff header appears to have been destroyed
+// by angle-bracket stripping during extraction (the same corruption that turned
+// "<<<grid,threads,...>>>" launches into "<<>>" elsewhere).  Restore the loop
+// body and the fix_aveforce_cuda.cu header from the original patch.
+__global__ void reduce_foriginal(int n,F_FLOAT* foriginal)
+{
+ int i=0;
+ sharedmem[threadIdx.x]=0;
+ F_FLOAT myforig=0.0;
+ F_FLOAT* buf=(F_FLOAT*) _buffer;
+ buf=&buf[blockIdx.x*n];
+ while(i
+#define MY_PREFIX fix_ave_force_cuda
+#include "cuda_shared.h"
+#include "cuda_common.h"
+
+#include "crm_cuda_utils.cu"
+
+#include "fix_aveforce_cuda_cu.h"
+#include "fix_aveforce_cuda_kernel.cu"
+
+void Cuda_FixAveForceCuda_UpdateBuffer(cuda_shared_data* sdata)
+{
+ int3 layout=getgrid(sdata->atom.nlocal,4*sizeof(F_FLOAT));
+ dim3
threads(layout.z, 1, 1); + dim3 grid(layout.x, layout.y, 1); + int size=(unsigned)(layout.z*layout.y*layout.x)*4*sizeof(F_FLOAT); + if(sdata->buffersizebuffer,sdata->buffersize);) + CudaWrapper_FreeCudaData(sdata->buffer,sdata->buffersize); + sdata->buffer = CudaWrapper_AllocCudaData(size); + sdata->buffersize=size; + sdata->buffer_new++; + MYDBG(printf("New buffer at %p with %i kB\n",sdata->buffer,sdata->buffersize);) + } + cudaMemcpyToSymbol(MY_CONST(buffer), & sdata->buffer, sizeof(int*) ); +} + +void Cuda_FixAveForceCuda_UpdateNmax(cuda_shared_data* sdata) +{ + cudaMemcpyToSymbol(MY_CONST(mask) , & sdata->atom.mask .dev_data, sizeof(int*) ); + cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); + cudaMemcpyToSymbol(MY_CONST(nmax) , & sdata->atom.nmax , sizeof(int) ); + cudaMemcpyToSymbol(MY_CONST(x) , & sdata->atom.x .dev_data, sizeof(X_FLOAT*) ); + cudaMemcpyToSymbol(MY_CONST(f) , & sdata->atom.f .dev_data, sizeof(F_FLOAT*) ); +} + +void Cuda_FixAveForceCuda_Init(cuda_shared_data* sdata) +{ + Cuda_FixAveForceCuda_UpdateNmax(sdata); +} + +void Cuda_FixAveForceCuda_PostForce_FOrg(cuda_shared_data* sdata, int groupbit,F_FLOAT* aforiginal) +{ + if(sdata->atom.update_nmax) + Cuda_FixAveForceCuda_UpdateNmax(sdata); + if(sdata->atom.update_nlocal) + cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); + if(sdata->buffer_new) + Cuda_FixAveForceCuda_UpdateBuffer(sdata); + + int3 layout=getgrid(sdata->atom.nlocal,4*sizeof(F_FLOAT)); + dim3 threads(layout.z, 1, 1); + dim3 grid(layout.x, layout.y, 1); + + + Cuda_FixAveForceCuda_PostForce_FOrg_Kernel<<>> (groupbit); + cudaThreadSynchronize(); + CUT_CHECK_ERROR("Cuda_FixAveForceCuda_PostForce: fix ave_force post_force Compute Kernel execution failed"); + + int oldgrid=grid.x; + grid.x=4; + threads.x=512; + Cuda_FixAveForceCuda_reduce_foriginal<<>> (oldgrid,aforiginal); + cudaThreadSynchronize(); + CUT_CHECK_ERROR("Cuda_FixAveForceCuda_PostForce: fix ave_force post_force Reduce 
Kernel execution failed"); + +} + +void Cuda_FixAveForceCuda_PostForce_Set(cuda_shared_data* sdata, int groupbit,int xflag,int yflag,int zflag,F_FLOAT axvalue,F_FLOAT ayvalue,F_FLOAT azvalue) +{ + int3 layout=getgrid(sdata->atom.nlocal); + dim3 threads(layout.z, 1, 1); + dim3 grid(layout.x, layout.y, 1); + + + Cuda_FixAveForceCuda_PostForce_Set_Kernel<<>> (groupbit,xflag,yflag,zflag,axvalue,ayvalue,azvalue); + cudaThreadSynchronize(); + CUT_CHECK_ERROR("Cuda_FixAveForceCuda_PostForce_Set: fix ave_force post_force Compute Kernel execution failed"); + +} diff --git a/lib/cuda/fix_aveforce_cuda_cu.h b/lib/cuda/fix_aveforce_cuda_cu.h new file mode 100644 index 0000000000..dd9992d866 --- /dev/null +++ b/lib/cuda/fix_aveforce_cuda_cu.h @@ -0,0 +1,28 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ + +#include "cuda_shared.h" + +extern "C" void Cuda_FixAveForceCuda_Init(cuda_shared_data* sdata); +extern "C" void Cuda_FixAveForceCuda_PostForce_FOrg(cuda_shared_data* sdata, int groupbit,F_FLOAT* aforiginal); +extern "C" void Cuda_FixAveForceCuda_PostForce_Set(cuda_shared_data* sdata, int groupbit,int xflag,int yflag,int zflag,F_FLOAT axvalue,F_FLOAT ayvalue,F_FLOAT azvalue); diff --git a/lib/cuda/fix_aveforce_cuda_kernel.cu b/lib/cuda/fix_aveforce_cuda_kernel.cu new file mode 100644 index 0000000000..edccee8c4d --- /dev/null +++ b/lib/cuda/fix_aveforce_cuda_kernel.cu @@ -0,0 +1,87 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */
+
+// Dynamic shared memory staging area for the per-block reductions below.
+extern __shared__ F_FLOAT sharedmem[];
+
+
+// Post-force kernel for fix aveforce, stage 1: per-block sums of the current
+// force components (shared-memory slots 0-2) and a count of participating
+// atoms (slot 3 -- each group atom contributes 1).  SoA force layout with
+// stride _nmax.  Needs >= 4*blockDim.x*sizeof(F_FLOAT) dynamic shared memory;
+// thread 0 writes the four block sums to _buffer as four planes of
+// gridDim.x*gridDim.y values for the second-stage reduction.
+// NOTE(review): relies on reduceBlock() synchronizing internally; confirm.
+__global__ void Cuda_FixAveForceCuda_PostForce_FOrg_Kernel(int groupbit)
+{
+ int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x;
+ sharedmem[threadIdx.x]=0;
+ sharedmem[threadIdx.x+blockDim.x]=0;
+ sharedmem[threadIdx.x+2*blockDim.x]=0;
+ sharedmem[threadIdx.x+3*blockDim.x]=0;
+ if(i < _nlocal)
+ if (_mask[i] & groupbit) {
+ sharedmem[threadIdx.x]=_f[i];
+ sharedmem[threadIdx.x+blockDim.x]=_f[i+1*_nmax];
+ sharedmem[threadIdx.x+2*blockDim.x]=_f[i+2*_nmax];
+ sharedmem[threadIdx.x+3*blockDim.x]=1;
+ }
+ reduceBlock(sharedmem);
+ reduceBlock(&sharedmem[blockDim.x]);
+ reduceBlock(&sharedmem[2*blockDim.x]);
+ reduceBlock(&sharedmem[3*blockDim.x]);
+ F_FLOAT* buffer=(F_FLOAT*) _buffer;
+ if(threadIdx.x==0)
+ {
+ buffer[blockIdx.x*gridDim.y+blockIdx.y]=sharedmem[0];
+ buffer[blockIdx.x*gridDim.y+blockIdx.y+gridDim.x*gridDim.y]=sharedmem[blockDim.x];
+ buffer[blockIdx.x*gridDim.y+blockIdx.y+2*gridDim.x*gridDim.y]=sharedmem[2*blockDim.x];
+ buffer[blockIdx.x*gridDim.y+blockIdx.y+3*gridDim.x*gridDim.y]=sharedmem[3*blockDim.x];
+ }
+}
+
+
+// Second-stage reduction over the per-block partials in _buffer.
+// NOTE(review): TRUNCATED in this chunk from "while(i" onward (angle-bracket
+// stripping ate the loop body plus the fix_enforce2d_cuda.cu diff header and
+// license).  Restore from the original patch before use.
+__global__ void Cuda_FixAveForceCuda_reduce_foriginal(int n,F_FLOAT* foriginal)
+{
+ int i=0;
+ sharedmem[threadIdx.x]=0;
+ F_FLOAT myforig=0.0;
+ F_FLOAT* buf=(F_FLOAT*) _buffer;
+ buf=&buf[blockIdx.x*n];
+ while(i
+#define MY_PREFIX fix_enforce2d_cuda
+#include "cuda_shared.h"
+#include "cuda_common.h"
+#include "crm_cuda_utils.cu"
+#include "fix_enforce2d_cuda_cu.h"
+#include "fix_enforce2d_cuda_kernel.cu"
+
+void Cuda_FixEnforce2dCuda_Init(cuda_shared_data* sdata)
+{
+ cudaMemcpyToSymbol(MY_CONST(mask) , & sdata->atom.mask .dev_data, sizeof(int*) );
+ cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) );
+ cudaMemcpyToSymbol(MY_CONST(nmax) , & sdata->atom.nmax , sizeof(int) );
+ cudaMemcpyToSymbol(MY_CONST(v) , & sdata->atom.v .dev_data, sizeof(V_FLOAT*) );
+
cudaMemcpyToSymbol(MY_CONST(f) , & sdata->atom.f .dev_data, sizeof(F_FLOAT*) ); +} + +void Cuda_FixEnforce2dCuda_PostForce(cuda_shared_data* sdata, int groupbit) +{ + if(sdata->atom.update_nmax) + Cuda_FixEnforce2dCuda_Init(sdata); + if(sdata->atom.update_nlocal) + cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); + + int3 layout=getgrid(sdata->atom.nlocal); + dim3 threads(layout.z, 1, 1); + dim3 grid(layout.x, layout.y, 1); + Cuda_FixEnforce2dCuda_PostForce_Kernel<<>> (groupbit); + cudaThreadSynchronize(); + CUT_CHECK_ERROR("Cuda_Cuda_FixEnforce2dCuda_PostForce: fix enforce2d post_force Kernel execution failed"); +} diff --git a/lib/cuda/fix_enforce2d_cuda_cu.h b/lib/cuda/fix_enforce2d_cuda_cu.h new file mode 100644 index 0000000000..a35fadf806 --- /dev/null +++ b/lib/cuda/fix_enforce2d_cuda_cu.h @@ -0,0 +1,27 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ + +#include "cuda_shared.h" + +extern "C" void Cuda_FixEnforce2dCuda_Init(cuda_shared_data* sdata); +extern "C" void Cuda_FixEnforce2dCuda_PostForce(cuda_shared_data* sdata, int groupbit); diff --git a/lib/cuda/fix_enforce2d_cuda_kernel.cu b/lib/cuda/fix_enforce2d_cuda_kernel.cu new file mode 100644 index 0000000000..c07f944901 --- /dev/null +++ b/lib/cuda/fix_enforce2d_cuda_kernel.cu @@ -0,0 +1,33 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */
+
+
+// Post-force kernel for fix enforce2d: zeroes the z components of velocity
+// and force for every owned atom in the group, confining the dynamics to the
+// xy plane.  Atom arrays are SoA with stride _nmax, so the z component of
+// atom i lives at index i+2*_nmax.  One thread per atom on a 2D grid;
+// guarded by i < _nlocal so partial tail blocks are safe.
+__global__ void Cuda_FixEnforce2dCuda_PostForce_Kernel(int groupbit)
+{
+ int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x;
+ if(i < _nlocal)
+ if (_mask[i] & groupbit) {
+ _v[i+2*_nmax] = V_F(0.0);
+ _f[i+2*_nmax] = F_F(0.0);
+ }
+}
diff --git a/lib/cuda/fix_freeze_cuda.cu b/lib/cuda/fix_freeze_cuda.cu
new file mode 100644
index 0000000000..ba6fe117ce
--- /dev/null
+++ b/lib/cuda/fix_freeze_cuda.cu
@@ -0,0 +1,95 @@
+/* ----------------------------------------------------------------------
+ LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+ Original Version:
+ http://lammps.sandia.gov, Sandia National Laboratories
+ Steve Plimpton, sjplimp@sandia.gov
+
+ See the README file in the top-level LAMMPS directory.
+
+ -----------------------------------------------------------------------
+
+ USER-CUDA Package and associated modifications:
+ https://sourceforge.net/projects/lammpscuda/
+
+ Christian Trott, christian.trott@tu-ilmenau.de
+ Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+ Theoretical Physics II, University of Technology Ilmenau, Germany
+
+ See the README file in the USER-CUDA directory.
+
+ This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */ + +#include +#define MY_PREFIX fix_freeze_cuda +#include "cuda_shared.h" +#include "cuda_common.h" +#include "crm_cuda_utils.cu" + +#include "fix_freeze_cuda_cu.h" +#include "fix_freeze_cuda_kernel.cu" + +void Cuda_FixFreezeCuda_UpdateBuffer(cuda_shared_data* sdata) +{ + int3 layout=getgrid(sdata->atom.nlocal,3*sizeof(F_FLOAT)); + dim3 threads(layout.z, 1, 1); + dim3 grid(layout.x, layout.y, 1); + int size=(unsigned)(layout.z*layout.y*layout.x)*3*sizeof(F_FLOAT); + if(sdata->buffersizebuffer,sdata->buffersize);) + CudaWrapper_FreeCudaData(sdata->buffer,sdata->buffersize); + sdata->buffer = CudaWrapper_AllocCudaData(size); + sdata->buffersize=size; + sdata->buffer_new++; + MYDBG(printf("New buffer at %p with %i kB\n",sdata->buffer,sdata->buffersize);) + + } + cudaMemcpyToSymbol(MY_CONST(buffer) , & sdata->buffer, sizeof(int*) ); +} + +void Cuda_FixFreezeCuda_UpdateNmax(cuda_shared_data* sdata) +{ + cudaMemcpyToSymbol(MY_CONST(mask) , & sdata->atom.mask .dev_data, sizeof(int*) ); + cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); + cudaMemcpyToSymbol(MY_CONST(nmax) , & sdata->atom.nmax , sizeof(int) ); + cudaMemcpyToSymbol(MY_CONST(x) , & sdata->atom.x .dev_data, sizeof(X_FLOAT*) ); + cudaMemcpyToSymbol(MY_CONST(f) , & sdata->atom.f .dev_data, sizeof(F_FLOAT*) ); + cudaMemcpyToSymbol(MY_CONST(torque) , & sdata->atom.torque .dev_data, sizeof(F_FLOAT*) ); +} + + +void Cuda_FixFreezeCuda_Init(cuda_shared_data* sdata) +{ + Cuda_FixFreezeCuda_UpdateNmax(sdata); + +} + + +void Cuda_FixFreezeCuda_PostForce(cuda_shared_data* sdata, int groupbit,F_FLOAT* foriginal) +{ + if(sdata->atom.update_nmax) + Cuda_FixFreezeCuda_UpdateNmax(sdata); + if(sdata->atom.update_nlocal) + cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); + if(sdata->buffer_new) + Cuda_FixFreezeCuda_UpdateBuffer(sdata); + + + int3 layout=getgrid(sdata->atom.nlocal,3*sizeof(F_FLOAT)); 
+ dim3 threads(layout.z, 1, 1); + dim3 grid(layout.x, layout.y, 1); + + Cuda_FixFreezeCuda_PostForce_Kernel<<>> (groupbit); + cudaThreadSynchronize(); + CUT_CHECK_ERROR("Cuda_Cuda_FixFreezeCuda_PostForce: fix add_force post_force compute Kernel execution failed"); + + int oldgrid=grid.x; + grid.x=3; + threads.x=512; + Cuda_FixFreezeCuda_Reduce_FOriginal<<>> (oldgrid,foriginal); + cudaThreadSynchronize(); + CUT_CHECK_ERROR("Cuda_Cuda_FixFreezeCuda_PostForce: fix add_force post_force reduce Kernel execution failed"); + +} diff --git a/lib/cuda/fix_freeze_cuda_cu.h b/lib/cuda/fix_freeze_cuda_cu.h new file mode 100644 index 0000000000..2df8743a6a --- /dev/null +++ b/lib/cuda/fix_freeze_cuda_cu.h @@ -0,0 +1,27 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ + +#include "cuda_shared.h" + +extern "C" void Cuda_FixFreezeCuda_Init(cuda_shared_data* sdata); +extern "C" void Cuda_FixFreezeCuda_PostForce(cuda_shared_data* sdata, int groupbit,F_FLOAT* foriginal); diff --git a/lib/cuda/fix_freeze_cuda_kernel.cu b/lib/cuda/fix_freeze_cuda_kernel.cu new file mode 100644 index 0000000000..d6721311b6 --- /dev/null +++ b/lib/cuda/fix_freeze_cuda_kernel.cu @@ -0,0 +1,82 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */
+
+// Dynamic shared memory staging area for the per-block reductions below.
+extern __shared__ F_FLOAT sharedmem[];
+
+
+// Post-force kernel for fix freeze: records the current force of each owned
+// group atom into three shared-memory slots (for the "foriginal" total), then
+// zeroes both force and torque for that atom.  SoA layout with stride _nmax.
+// Needs >= 3*blockDim.x*sizeof(F_FLOAT) dynamic shared memory; thread 0
+// writes the block sums to _buffer as three planes of gridDim.x*gridDim.y
+// values for Cuda_FixFreezeCuda_Reduce_FOriginal.
+// NOTE(review): _torque is written unconditionally for group atoms -- assumes
+// the torque array is allocated whenever this fix is active; confirm.
+// NOTE(review): relies on reduceBlock() synchronizing internally; confirm.
+__global__ void Cuda_FixFreezeCuda_PostForce_Kernel(int groupbit)
+{
+ int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x;
+ sharedmem[threadIdx.x]=0;
+ sharedmem[threadIdx.x+blockDim.x]=0;
+ sharedmem[threadIdx.x+2*blockDim.x]=0;
+ if(i < _nlocal)
+ if (_mask[i] & groupbit) {
+ sharedmem[threadIdx.x]=_f[i];
+ sharedmem[threadIdx.x+blockDim.x]=_f[i+1*_nmax];
+ sharedmem[threadIdx.x+2*blockDim.x]=_f[i+2*_nmax];
+
+ _f[i] = F_F(0.0);
+ _f[i+1*_nmax] = F_F(0.0);
+ _f[i+2*_nmax] = F_F(0.0);
+ _torque[i] = F_F(0.0);
+ _torque[i+1*_nmax] = F_F(0.0);
+ _torque[i+2*_nmax] = F_F(0.0);
+ }
+
+
+ reduceBlock(sharedmem);
+ reduceBlock(&sharedmem[blockDim.x]);
+ reduceBlock(&sharedmem[2*blockDim.x]);
+ F_FLOAT* buffer=(F_FLOAT*)_buffer;
+ if(threadIdx.x==0)
+ {
+ buffer[blockIdx.x*gridDim.y+blockIdx.y]=sharedmem[0];
+ buffer[blockIdx.x*gridDim.y+blockIdx.y+gridDim.x*gridDim.y]=sharedmem[blockDim.x];
+ buffer[blockIdx.x*gridDim.y+blockIdx.y+2*gridDim.x*gridDim.y]=sharedmem[2*blockDim.x];
+ }
+}
+
+
+// Second-stage reduction over the per-block partials in _buffer.
+// NOTE(review): TRUNCATED in this chunk from "while(i" onward (angle-bracket
+// stripping also ate the fix_gravity_cuda.cu diff header/license and, below,
+// the "buffersize<size" comparison in Cuda_FixGravityCuda_UpdateBuffer --
+// "if(sdata->buffersizebuffer,...)" is corrupted text, not real code).
+// Restore from the original patch before use.
+__global__ void Cuda_FixFreezeCuda_Reduce_FOriginal(int n,F_FLOAT* foriginal)
+{
+ int i=0;
+ sharedmem[threadIdx.x]=0;
+ F_FLOAT myforig=0.0;
+ F_FLOAT* buf=(F_FLOAT*)_buffer;
+ buf=&buf[blockIdx.x*n];
+ while(i
+#define MY_PREFIX fix_gravity_cuda
+#include "cuda_shared.h"
+#include "cuda_common.h"
+#include "crm_cuda_utils.cu"
+
+#include "fix_gravity_cuda_cu.h"
+#include "fix_gravity_cuda_kernel.cu"
+
+void Cuda_FixGravityCuda_UpdateBuffer(cuda_shared_data* sdata)
+{
+ int3 layout=getgrid(sdata->atom.nlocal,3*sizeof(F_FLOAT));
+ dim3 threads(layout.z, 1, 1);
+ dim3 grid(layout.x, layout.y, 1);
+ int size=(unsigned)(layout.z*layout.y*layout.x)*3*sizeof(F_FLOAT);
+ if(sdata->buffersizebuffer,sdata->buffersize);)
+ CudaWrapper_FreeCudaData(sdata->buffer,sdata->buffersize);
+ sdata->buffer = CudaWrapper_AllocCudaData(size);
+ sdata->buffersize=size;
+ sdata->buffer_new++; + MYDBG(printf("New buffer at %p with %i kB\n",sdata->buffer,sdata->buffersize);) + + } + cudaMemcpyToSymbol(MY_CONST(buffer) , & sdata->buffer, sizeof(int*) ); +} + +void Cuda_FixGravityCuda_UpdateNmax(cuda_shared_data* sdata) +{ + cudaMemcpyToSymbol(MY_CONST(mask) , & sdata->atom.mask .dev_data, sizeof(int*) ); + cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); + cudaMemcpyToSymbol(MY_CONST(nmax) , & sdata->atom.nmax , sizeof(int) ); + cudaMemcpyToSymbol(MY_CONST(x) , & sdata->atom.x .dev_data, sizeof(X_FLOAT*) ); + cudaMemcpyToSymbol(MY_CONST(f) , & sdata->atom.f .dev_data, sizeof(F_FLOAT*) ); + cudaMemcpyToSymbol(MY_CONST(type) , & sdata->atom.type .dev_data, sizeof(int*) ); + cudaMemcpyToSymbol(MY_CONST(rmass_flag) , & sdata->atom.rmass_flag, sizeof(int) ); + cudaMemcpyToSymbol(MY_CONST(rmass) , & sdata->atom.rmass .dev_data, sizeof(V_FLOAT*) ); + cudaMemcpyToSymbol(MY_CONST(mass) , & sdata->atom.mass .dev_data, sizeof(V_FLOAT*) ); +} + +void Cuda_FixGravityCuda_Init(cuda_shared_data* sdata) +{ + Cuda_FixGravityCuda_UpdateNmax(sdata); + +} + + +void Cuda_FixGravityCuda_PostForce(cuda_shared_data* sdata, int groupbit,F_FLOAT xacc,F_FLOAT yacc,F_FLOAT zacc) +{ + if(sdata->atom.update_nmax) + Cuda_FixGravityCuda_UpdateNmax(sdata); + if(sdata->atom.update_nlocal) + cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); + if(sdata->buffer_new) + Cuda_FixGravityCuda_UpdateBuffer(sdata); + + + int3 layout=getgrid(sdata->atom.nlocal); + dim3 threads(layout.z, 1, 1); + dim3 grid(layout.x, layout.y, 1); + + Cuda_FixGravityCuda_PostForce_Kernel<<>> (groupbit,xacc,yacc,zacc); + cudaThreadSynchronize(); + CUT_CHECK_ERROR("Cuda_Cuda_FixGravityCuda_PostForce: fix add_force post_force compute Kernel execution failed"); +} diff --git a/lib/cuda/fix_gravity_cuda_cu.h b/lib/cuda/fix_gravity_cuda_cu.h new file mode 100644 index 0000000000..d69816bb67 --- /dev/null +++ b/lib/cuda/fix_gravity_cuda_cu.h @@ -0,0 
+1,27 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#include "cuda_shared.h" + +extern "C" void Cuda_FixGravityCuda_Init(cuda_shared_data* sdata); +extern "C" void Cuda_FixGravityCuda_PostForce(cuda_shared_data* sdata, int groupbit,F_FLOAT xacc,F_FLOAT yacc,F_FLOAT zacc); diff --git a/lib/cuda/fix_gravity_cuda_kernel.cu b/lib/cuda/fix_gravity_cuda_kernel.cu new file mode 100644 index 0000000000..6a77933acb --- /dev/null +++ b/lib/cuda/fix_gravity_cuda_kernel.cu @@ -0,0 +1,36 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. 
+
+ -----------------------------------------------------------------------
+
+ USER-CUDA Package and associated modifications:
+ https://sourceforge.net/projects/lammpscuda/
+
+ Christian Trott, christian.trott@tu-ilmenau.de
+ Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+ Theoretical Physics II, University of Technology Ilmenau, Germany
+
+ See the README file in the USER-CUDA directory.
+
+ This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+// Post-force kernel for fix gravity: f += m * (xacc,yacc,zacc) for every
+// owned atom whose mask matches groupbit.  The mass is the per-atom value
+// _rmass[i] when _rmass_flag is set, otherwise the per-type table entry
+// _mass[_type[i]].  Force array is SoA with stride _nmax; one thread per
+// atom on a 2D grid, guarded by i < _nlocal.
+__global__ void Cuda_FixGravityCuda_PostForce_Kernel(int groupbit,F_FLOAT xacc,F_FLOAT yacc,F_FLOAT zacc)
+{
+ int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x;
+
+ if(i < _nlocal)
+ if (_mask[i] & groupbit) {
+ F_FLOAT mass = _rmass_flag?_rmass[i]:_mass[_type[i]];
+ _f[i] += mass*xacc;
+ _f[i+1*_nmax] += mass*yacc;
+ _f[i+2*_nmax] += mass*zacc;
+ }
+}
+
diff --git a/lib/cuda/fix_nh_cuda.cu b/lib/cuda/fix_nh_cuda.cu
new file mode 100644
index 0000000000..ee91e473e2
--- /dev/null
+++ b/lib/cuda/fix_nh_cuda.cu
@@ -0,0 +1,219 @@
+/* ----------------------------------------------------------------------
+ LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+ Original Version:
+ http://lammps.sandia.gov, Sandia National Laboratories
+ Steve Plimpton, sjplimp@sandia.gov
+
+ See the README file in the top-level LAMMPS directory.
+
+ -----------------------------------------------------------------------
+
+ USER-CUDA Package and associated modifications:
+ https://sourceforge.net/projects/lammpscuda/
+
+ Christian Trott, christian.trott@tu-ilmenau.de
+ Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+ Theoretical Physics II, University of Technology Ilmenau, Germany
+
+ See the README file in the USER-CUDA directory.
+
+ This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */

#include <stdio.h>
#define MY_PREFIX fix_nh_cuda
#define IncludeCommonNeigh
#include "cuda_shared.h"
#include "cuda_common.h"
#include "crm_cuda_utils.cu"
#include "fix_nh_cuda_cu.h"
#include "fix_nh_cuda_kernel.cu"

// Refresh the per-kernel __constant__ copies of the per-atom device-array
// pointers and sizes. Must be called whenever LAMMPS reallocates its per-atom
// arrays (sdata->atom.update_nmax is set).
void Cuda_FixNHCuda_UpdateNmax(cuda_shared_data* sdata)
{
  cudaMemcpyToSymbol(MY_CONST(f)         , & sdata->atom.f    .dev_data, sizeof(F_FLOAT*) );
  cudaMemcpyToSymbol(MY_CONST(mask)      , & sdata->atom.mask .dev_data, sizeof(int*)     );
  cudaMemcpyToSymbol(MY_CONST(tag)       , & sdata->atom.tag  .dev_data, sizeof(int*)     );
  cudaMemcpyToSymbol(MY_CONST(debugdata) , & sdata->debugdata           , sizeof(int*)    );
  cudaMemcpyToSymbol(MY_CONST(nlocal)    , & sdata->atom.nlocal         , sizeof(int)     );
  cudaMemcpyToSymbol(MY_CONST(nmax)      , & sdata->atom.nmax           , sizeof(int)     );
  cudaMemcpyToSymbol(MY_CONST(rmass)     , & sdata->atom.rmass.dev_data, sizeof(V_FLOAT*) );
  cudaMemcpyToSymbol(MY_CONST(type)      , & sdata->atom.type .dev_data, sizeof(int*)     );
  cudaMemcpyToSymbol(MY_CONST(v)         , & sdata->atom.v    .dev_data, sizeof(V_FLOAT*) );
  cudaMemcpyToSymbol(MY_CONST(x)         , & sdata->atom.x    .dev_data, sizeof(X_FLOAT*) );
  cudaMemcpyToSymbol(MY_CONST(xhold)     , & sdata->atom.xhold.dev_data, sizeof(X_FLOAT*) ); //might be moved to a neighbor record in sdata
  cudaMemcpyToSymbol(MY_CONST(maxhold)   , & sdata->atom.maxhold        , sizeof(int)     ); //might be moved to a neighbor record in sdata
  cudaMemcpyToSymbol(MY_CONST(reneigh_flag), & sdata->buffer, sizeof(int*) );                //might be moved to a neighbor record in sdata
  cudaMemcpyToSymbol(MY_CONST(triggerneighsq), & sdata->atom.triggerneighsq, sizeof(X_FLOAT)); //might be moved to a neighbor record in sdata
}

// (Re)allocate the small device communication buffer (holds the
// reneighboring flag returned by the nve_x kernel) and publish its address.
// NOTE(review): the size comparison and the first MYDBG message were
// destroyed in the patch (HTML-stripped); reconstructed from the identical
// pattern in fix_nve_cuda.cu / fix_set_force_cuda.cu — confirm upstream.
void Cuda_FixNHCuda_UpdateBuffer(cuda_shared_data* sdata)
{
  int size = (unsigned)10 * sizeof(int);
  if(sdata->buffersize < size)
  {
    MYDBG(printf("Resize buffer at %p with %i kB\n", sdata->buffer, sdata->buffersize);)
    CudaWrapper_FreeCudaData(sdata->buffer, sdata->buffersize);
    sdata->buffer = CudaWrapper_AllocCudaData(size);
    sdata->buffersize = size;
    sdata->buffer_new++;
    MYDBG(printf("New buffer at %p with %i kB\n", sdata->buffer, sdata->buffersize);)
  }
  cudaMemcpyToSymbol(MY_CONST(buffer)      , & sdata->buffer, sizeof(int*) );
  cudaMemcpyToSymbol(MY_CONST(reneigh_flag), & sdata->buffer, sizeof(int*) ); //might be moved to a neighbor record in sdata
}

// One-time initialization: upload timestep constants (dtv, dtf), the
// reneighboring trigger distance and the mass-mode flags to device memory.
void Cuda_FixNHCuda_Init(cuda_shared_data* sdata, X_FLOAT dtv, V_FLOAT dtf)
{
  if(sdata->atom.mass_host)
    cudaMemcpyToSymbol(MY_CONST(mass)    , & sdata->atom.mass.dev_data  , sizeof(V_FLOAT*) );
  cudaMemcpyToSymbol(MY_CONST(dtf)       , & dtf                        , sizeof(V_FLOAT)  );
  cudaMemcpyToSymbol(MY_CONST(dtv)       , & dtv                        , sizeof(X_FLOAT)  );
  cudaMemcpyToSymbol(MY_CONST(triggerneighsq), & sdata->atom.triggerneighsq, sizeof(X_FLOAT));
  cudaMemcpyToSymbol(MY_CONST(dist_check), & sdata->atom.dist_check     , sizeof(int)      );
  cudaMemcpyToSymbol(MY_CONST(rmass_flag), & sdata->atom.rmass_flag     , sizeof(int)      );
  Cuda_FixNHCuda_UpdateNmax(sdata);
}

// Barostat half-step velocity scaling (FixNH::nh_v_press on the GPU).
// mynlocal can be nfirst if firstgroup==igroup, see the cpp fix.
// NOTE(review): launch configurations below were stripped to "<<>>" in the
// patch; restored to the <<<grid, threads>>> form used throughout this library.
void Cuda_FixNHCuda_nh_v_press(cuda_shared_data* sdata, int groupbit, double* factor_h, int mynlocal, int p_triclinic)
{
  timespec atime1, atime2;
  clock_gettime(CLOCK_REALTIME, &atime1);
  if(sdata->atom.update_nmax)
    Cuda_FixNHCuda_UpdateNmax(sdata);
  if(sdata->atom.update_nlocal)
    cudaMemcpyToSymbol(MY_CONST(nlocal), & sdata->atom.nlocal, sizeof(int));
  clock_gettime(CLOCK_REALTIME, &atime2);
  sdata->cuda_timings.test1 +=
    atime2.tv_sec - atime1.tv_sec + 1.0 * (atime2.tv_nsec - atime1.tv_nsec) / 1000000000;
  if(sdata->buffer_new)
    Cuda_FixNHCuda_UpdateBuffer(sdata);

  F_FLOAT3 factor = {factor_h[0], factor_h[1], factor_h[2]};
  F_FLOAT3 factor2;  // off-diagonal factors, only used for triclinic boxes
  if(p_triclinic) { factor2.x = factor_h[3]; factor2.y = factor_h[4]; factor2.z = factor_h[5]; }

  int3 layout = getgrid(mynlocal);
  dim3 threads(layout.z, 1, 1);
  dim3 grid(layout.x, layout.y, 1);
  FixNHCuda_nh_v_press_Kernel<<<grid, threads>>>(groupbit, factor, p_triclinic, factor2);
  cudaThreadSynchronize();
  CUT_CHECK_ERROR("FixNHCuda: fix nh v_press Kernel execution failed");
}

// Fused barostat scaling + NVE velocity half-step (no temperature bias).
// mynlocal can be nfirst if firstgroup==igroup, see the cpp fix.
void Cuda_FixNHCuda_nh_v_press_and_nve_v_NoBias(cuda_shared_data* sdata, int groupbit, double* factor_h, int mynlocal, int p_triclinic)
{
  if(sdata->atom.update_nmax)
    Cuda_FixNHCuda_UpdateNmax(sdata);
  if(sdata->atom.update_nlocal)
    cudaMemcpyToSymbol(MY_CONST(nlocal), & sdata->atom.nlocal, sizeof(int));
  if(sdata->buffer_new)
    Cuda_FixNHCuda_UpdateBuffer(sdata);

  F_FLOAT3 factor = {factor_h[0], factor_h[1], factor_h[2]};
  F_FLOAT3 factor2;
  if(p_triclinic) { factor2.x = factor_h[3]; factor2.y = factor_h[4]; factor2.z = factor_h[5]; }

  int3 layout = getgrid(mynlocal);
  dim3 threads(layout.z, 1, 1);
  dim3 grid(layout.x, layout.y, 1);
  FixNHCuda_nh_v_press_and_nve_v_NoBias_Kernel<<<grid, threads>>>(groupbit, factor, p_triclinic, factor2);
  cudaThreadSynchronize();
  CUT_CHECK_ERROR("FixNHCuda: fix nh v_press Kernel execution failed");
}

// Thermostat velocity rescaling by factor_eta (FixNH::nh_v_temp on the GPU).
// mynlocal can be nfirst if firstgroup==igroup, see the cpp fix.
void Cuda_FixNHCuda_nh_v_temp(cuda_shared_data* sdata, int groupbit, F_FLOAT factor_eta, int mynlocal)
{
  timespec atime1, atime2;
  clock_gettime(CLOCK_REALTIME, &atime1);
  if(sdata->atom.update_nmax)
    Cuda_FixNHCuda_UpdateNmax(sdata);
  if(sdata->atom.update_nlocal)
    cudaMemcpyToSymbol(MY_CONST(nlocal), & sdata->atom.nlocal, sizeof(int));
  clock_gettime(CLOCK_REALTIME, &atime2);
  sdata->cuda_timings.test1 +=
    atime2.tv_sec - atime1.tv_sec + 1.0 * (atime2.tv_nsec - atime1.tv_nsec) / 1000000000;
  if(sdata->buffer_new)
    Cuda_FixNHCuda_UpdateBuffer(sdata);

  int3 layout = getgrid(mynlocal);
  dim3 threads(layout.z, 1, 1);
  dim3 grid(layout.x, layout.y, 1);
  FixNHCuda_nh_v_temp_Kernel<<<grid, threads>>>(groupbit, factor_eta);
  cudaThreadSynchronize();
  CUT_CHECK_ERROR("FixNHCuda: fix nh v_temp Kernel execution failed");
}

// NVE velocity half-step: v += dtf/m * f.
// mynlocal can be nfirst if firstgroup==igroup, see the cpp fix.
void Cuda_FixNHCuda_nve_v(cuda_shared_data* sdata, int groupbit, int mynlocal)
{
  timespec atime1, atime2;
  clock_gettime(CLOCK_REALTIME, &atime1);
  if(sdata->atom.update_nmax)
    Cuda_FixNHCuda_UpdateNmax(sdata);
  if(sdata->atom.update_nlocal)
    cudaMemcpyToSymbol(MY_CONST(nlocal), & sdata->atom.nlocal, sizeof(int));
  clock_gettime(CLOCK_REALTIME, &atime2);
  sdata->cuda_timings.test1 +=
    atime2.tv_sec - atime1.tv_sec + 1.0 * (atime2.tv_nsec - atime1.tv_nsec) / 1000000000;
  if(sdata->buffer_new)
    Cuda_FixNHCuda_UpdateBuffer(sdata);

  int3 layout = getgrid(mynlocal);
  dim3 threads(layout.z, 1, 1);
  dim3 grid(layout.x, layout.y, 1);
  FixNHCuda_nve_v_Kernel<<<grid, threads>>>(groupbit);
  cudaThreadSynchronize();
  CUT_CHECK_ERROR("FixNHCuda: nve_v Kernel execution failed");
}

// NVE position full-step: x += dtv * v. Also runs the displacement check on
// the device; the kernel writes a flag into sdata->buffer which is copied
// back and accumulated into atom.reneigh_flag.
// mynlocal can be nfirst if firstgroup==igroup, see the cpp fix.
void Cuda_FixNHCuda_nve_x(cuda_shared_data* sdata, int groupbit, int mynlocal)
{
  timespec atime1, atime2;
  clock_gettime(CLOCK_REALTIME, &atime1);
  if(sdata->atom.update_nmax)
    Cuda_FixNHCuda_UpdateNmax(sdata);
  if(sdata->atom.update_nlocal)
    cudaMemcpyToSymbol(MY_CONST(nlocal), & sdata->atom.nlocal, sizeof(int));
  clock_gettime(CLOCK_REALTIME, &atime2);
  sdata->cuda_timings.test1 +=
    atime2.tv_sec - atime1.tv_sec + 1.0 * (atime2.tv_nsec - atime1.tv_nsec) / 1000000000;
  if(sdata->buffer_new)
    Cuda_FixNHCuda_UpdateBuffer(sdata);

  int3 layout = getgrid(mynlocal);
  dim3 threads(layout.z, 1, 1);
  dim3 grid(layout.x, layout.y, 1);
  cudaMemset(sdata->buffer, 0, sizeof(int));  // clear the reneighboring flag
  FixNHCuda_nve_x_Kernel<<<grid, threads>>>(groupbit);
  cudaThreadSynchronize();
  int reneigh_flag;
  cudaMemcpy((void*)(&reneigh_flag), sdata->buffer, sizeof(int), cudaMemcpyDeviceToHost);
  sdata->atom.reneigh_flag += reneigh_flag;
  CUT_CHECK_ERROR("FixNHCuda: nve_x Kernel execution failed");
}

// Fused NVE velocity half-step + barostat scaling (no temperature bias).
// mynlocal can be nfirst if firstgroup==igroup, see the cpp fix.
void Cuda_FixNHCuda_nve_v_and_nh_v_press_NoBias(cuda_shared_data* sdata, int groupbit, double* factor_h, int mynlocal, int p_triclinic)
{
  if(sdata->atom.update_nmax)
    Cuda_FixNHCuda_UpdateNmax(sdata);
  if(sdata->atom.update_nlocal)
    cudaMemcpyToSymbol(MY_CONST(nlocal), & sdata->atom.nlocal, sizeof(int));
  if(sdata->buffer_new)
    Cuda_FixNHCuda_UpdateBuffer(sdata);

  F_FLOAT3 factor = {factor_h[0], factor_h[1], factor_h[2]};
  F_FLOAT3 factor2;
  if(p_triclinic) { factor2.x = factor_h[3]; factor2.y = factor_h[4]; factor2.z = factor_h[5]; }

  int3 layout = getgrid(mynlocal);
  dim3 threads(layout.z, 1, 1);
  dim3 grid(layout.x, layout.y, 1);
  FixNHCuda_nve_v_and_nh_v_press_NoBias_Kernel<<<grid, threads>>>(groupbit, factor, p_triclinic, factor2);
  cudaThreadSynchronize();
  CUT_CHECK_ERROR("FixNHCuda__nve_v_and_nh_v_press_NoBias: Kernel execution failed");
}

diff --git a/lib/cuda/fix_nh_cuda_cu.h b/lib/cuda/fix_nh_cuda_cu.h
new file mode 100644
index 0000000000..e6ba4e08bd
--- /dev/null
+++ b/lib/cuda/fix_nh_cuda_cu.h
@@ -0,0 +1,32 @@
/* ----------------------------------------------------------------------
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator

   Original Version:
   http://lammps.sandia.gov, Sandia National Laboratories
   Steve Plimpton, sjplimp@sandia.gov

   See the README file in the top-level LAMMPS directory.

   -----------------------------------------------------------------------

   USER-CUDA Package and associated modifications:
   https://sourceforge.net/projects/lammpscuda/

   Christian Trott, christian.trott@tu-ilmenau.de
   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
   Theoretical Physics II, University of Technology Ilmenau, Germany

   See the README file in the USER-CUDA directory.

   This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */

#include "cuda_shared.h"

// Host-side entry points of the fix nh (Nose-Hoover) CUDA wrappers.
// In all of these, mynlocal can be nfirst if firstgroup==igroup, see the cpp fix.
extern "C" void Cuda_FixNHCuda_Init(cuda_shared_data* sdata, X_FLOAT dtv, V_FLOAT dtf);
extern "C" void Cuda_FixNHCuda_nh_v_press(cuda_shared_data* sdata, int groupbit, double* factor_h, int mynlocal, int p_triclinic);
extern "C" void Cuda_FixNHCuda_nh_v_temp(cuda_shared_data* sdata, int groupbit, F_FLOAT factor_eta, int mynlocal);
extern "C" void Cuda_FixNHCuda_nh_v_press_and_nve_v_NoBias(cuda_shared_data* sdata, int groupbit, double* factor_h, int mynlocal, int p_triclinic);
extern "C" void Cuda_FixNHCuda_nve_v(cuda_shared_data* sdata, int groupbit, int mynlocal);
extern "C" void Cuda_FixNHCuda_nve_x(cuda_shared_data* sdata, int groupbit, int mynlocal);
extern "C" void Cuda_FixNHCuda_nve_v_and_nh_v_press_NoBias(cuda_shared_data* sdata, int groupbit, double* factor_h, int mynlocal, int p_triclinic);
diff --git a/lib/cuda/fix_nh_cuda_kernel.cu b/lib/cuda/fix_nh_cuda_kernel.cu
new file mode 100644
index 0000000000..a6a3a52a87
--- /dev/null
+++ b/lib/cuda/fix_nh_cuda_kernel.cu
@@ -0,0 +1,187 @@
/* ----------------------------------------------------------------------
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator

   Original Version:
   http://lammps.sandia.gov, Sandia National Laboratories
   Steve Plimpton, sjplimp@sandia.gov

   See the README file in the top-level LAMMPS directory.
+ + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +static inline __device__ void check_distance(X_FLOAT &xtmp,X_FLOAT &ytmp,X_FLOAT &ztmp,int &i,int groupbit) +{ + if(_dist_check) + { + + X_FLOAT d=X_F(0.0); + if(i<_nlocal) + { + X_FLOAT tmp=xtmp-_xhold[i]; + d=tmp*tmp; + tmp=ytmp-_xhold[i+_maxhold]; + d+=tmp*tmp; + tmp=ztmp-_xhold[i+2*_maxhold]; + d+=tmp*tmp; + + d=((_mask[i] & groupbit))?d:X_F(0.0); + } + if(not __all(d<=_triggerneighsq)) + _reneigh_flag[0]=1; + } +} + +__global__ void FixNHCuda_nh_v_press_Kernel(int groupbit, F_FLOAT3 factor,int p_triclinic,F_FLOAT3 factor2) +{ + int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x; + if(i < _nlocal && _mask[i] & groupbit) + { + V_FLOAT* my_v = _v + i; + V_FLOAT vx=my_v[0]; + V_FLOAT vy=my_v[_nmax]; + V_FLOAT vz=my_v[2*_nmax]; + vx*=factor.x; + vy*=factor.y; + vz*=factor.z; + if(p_triclinic) { + vx += vy*factor2.z + vz*factor2.y; + vy += vz*factor2.x; + } + vx*=factor.x; + vy*=factor.y; + vz*=factor.z; + my_v[0] = vx; + my_v[_nmax] = vy; + my_v[2*_nmax] = vz; + } + +} + +__global__ void FixNHCuda_nh_v_temp_Kernel(int groupbit, F_FLOAT factor_eta) +{ + int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x; + if(i < _nlocal && _mask[i] & groupbit) + { + V_FLOAT* my_v = _v + i; + my_v[0]*=factor_eta; + my_v[_nmax]*=factor_eta; + my_v[2*_nmax]*=factor_eta; + } + +} + +__global__ void FixNHCuda_nh_v_press_and_nve_v_NoBias_Kernel(int groupbit, F_FLOAT3 factor,int p_triclinic,F_FLOAT3 factor2) +{ + int 
i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x; + if(i < _nlocal && _mask[i] & groupbit) + { + F_FLOAT* my_f = _f + i; + V_FLOAT* my_v = _v + i; + + V_FLOAT dtfm = _dtf; + if(_rmass_flag) dtfm*= V_F(1.0) / _rmass[i]; + else dtfm*= V_F(1.0) / _mass[_type[i]]; + + V_FLOAT vx=my_v[0]; + V_FLOAT vy=my_v[_nmax]; + V_FLOAT vz=my_v[2*_nmax]; + vx*=factor.x; + vy*=factor.y; + vz*=factor.z; + if(p_triclinic) { + vx += vy*factor2.z + vz*factor2.y; + vy += vz*factor2.x; + } + vx*=factor.x; + vy*=factor.y; + vz*=factor.z; + my_v[0] = vx + dtfm * my_f[0]; + my_v[_nmax] = vy + dtfm * my_f[_nmax]; + my_v[2*_nmax] = vz + dtfm * my_f[_nmax*2]; + } + +} + +__global__ void FixNHCuda_nve_v_Kernel(int groupbit) +{ + + int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x; + if(i < _nlocal && _mask[i] & groupbit) + { + F_FLOAT* my_f = _f + i; + V_FLOAT* my_v = _v + i; + + V_FLOAT dtfm = _dtf; + if(_rmass_flag) dtfm*=V_F(1.0) / _rmass[i]; + else dtfm*=V_F(1.0) / _mass[_type[i]]; + + *my_v = (*my_v + dtfm*(*my_f)); my_f += _nmax; my_v += _nmax; + *my_v = (*my_v + dtfm*(*my_f)); my_f += _nmax; my_v += _nmax; + *my_v = (*my_v + dtfm*(*my_f)); + } +} + +__global__ void FixNHCuda_nve_x_Kernel(int groupbit) +{ + X_FLOAT xtmp,ytmp,ztmp; + + int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x; + if(i < _nlocal && _mask[i] & groupbit) + { + V_FLOAT* my_v = _v + i; + X_FLOAT* my_x = _x + i; + + xtmp = *my_x += _dtv * *my_v; my_v += _nmax; my_x += _nmax; + ytmp = *my_x += _dtv * *my_v; my_v += _nmax; my_x += _nmax; + ztmp = *my_x += _dtv * *my_v; + } + check_distance(xtmp,ytmp,ztmp,i,groupbit); +} + + +__global__ void FixNHCuda_nve_v_and_nh_v_press_NoBias_Kernel(int groupbit, F_FLOAT3 factor,int p_triclinic,F_FLOAT3 factor2) +{ + + int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x; + if(i < _nlocal && _mask[i] & groupbit) + { + F_FLOAT* my_f = _f + i; + V_FLOAT* my_v = _v + i; + + V_FLOAT dtfm = _dtf; + if(_rmass_flag) dtfm*=V_F(1.0) / _rmass[i]; + else 
dtfm*=V_F(1.0) / _mass[_type[i]]; + + V_FLOAT vx = my_v[0] + dtfm*my_f[0]; + V_FLOAT vy = my_v[_nmax] + dtfm*my_f[_nmax]; + V_FLOAT vz = my_v[2*_nmax] + dtfm*my_f[2*_nmax]; + + vx*=factor.x; + vy*=factor.y; + vz*=factor.z; + if(p_triclinic) { + vx += vy*factor2.z + vz*factor2.y; + vy += vz*factor2.x; + } + vx*=factor.x; + vy*=factor.y; + vz*=factor.z; + my_v[0] = vx; + my_v[_nmax] = vy; + my_v[2*_nmax] = vz; + + } +} + diff --git a/lib/cuda/fix_nve_cuda.cu b/lib/cuda/fix_nve_cuda.cu new file mode 100644 index 0000000000..624292431d --- /dev/null +++ b/lib/cuda/fix_nve_cuda.cu @@ -0,0 +1,162 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
------------------------------------------------------------------------- */

#include <stdio.h>
#define MY_PREFIX fix_nve_cuda
#define IncludeCommonNeigh
#include "cuda_shared.h"
#include "cuda_common.h"
#include "crm_cuda_utils.cu"
#include "fix_nve_cuda_cu.h"
#include "fix_nve_cuda_kernel.cu"

// Refresh the __constant__ copies of the per-atom device-array pointers and
// sizes after LAMMPS reallocated its per-atom data (atom.nmax changed).
void Cuda_FixNVECuda_UpdateNmax(cuda_shared_data* sdata)
{
  #ifdef CUDA_USE_BINNING

  // NOTE(review): this branch arrived with an unbalanced '}' (the patch lost
  // text here); the stray brace was removed to rebalance the function —
  // confirm against upstream. Branch is only compiled with CUDA_USE_BINNING.
  cudaMemcpyToSymbol(MY_CONST(bin_count_all)  , & sdata->atom.bin_count_all  .dev_data, sizeof(unsigned*));
  cudaMemcpyToSymbol(MY_CONST(bin_count_local), & sdata->atom.bin_count_local.dev_data, sizeof(unsigned*));
  cudaMemcpyToSymbol(MY_CONST(bin_dim)        ,   sdata->domain.bin_dim                , sizeof(int) * 3  );
  cudaMemcpyToSymbol(MY_CONST(binned_f)       , & sdata->atom.binned_f    .dev_data, sizeof(F_FLOAT*));
  cudaMemcpyToSymbol(MY_CONST(binned_type)    , & sdata->atom.binned_type .dev_data, sizeof(int*)    );
  cudaMemcpyToSymbol(MY_CONST(binned_v)       , & sdata->atom.binned_v    .dev_data, sizeof(V_FLOAT*));
  cudaMemcpyToSymbol(MY_CONST(binned_x)       , & sdata->atom.binned_x    .dev_data, sizeof(X_FLOAT*));
  cudaMemcpyToSymbol(MY_CONST(binned_rmass)   , & sdata->atom.binned_rmass.dev_data, sizeof(V_FLOAT*));
  cudaMemcpyToSymbol(MY_CONST(mask)           , & sdata->atom.mask .dev_data, sizeof(int*)    );
  cudaMemcpyToSymbol(MY_CONST(rmass)          , & sdata->atom.rmass.dev_data, sizeof(V_FLOAT*));

  #else

  cudaMemcpyToSymbol(MY_CONST(f)      , & sdata->atom.f    .dev_data, sizeof(F_FLOAT*));
  cudaMemcpyToSymbol(MY_CONST(mask)   , & sdata->atom.mask .dev_data, sizeof(int*)    );
  cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal         , sizeof(int)    );
  cudaMemcpyToSymbol(MY_CONST(nmax)   , & sdata->atom.nmax           , sizeof(int)    );
  cudaMemcpyToSymbol(MY_CONST(rmass)  , & sdata->atom.rmass.dev_data, sizeof(V_FLOAT*));
  cudaMemcpyToSymbol(MY_CONST(type)   , & sdata->atom.type .dev_data, sizeof(int*)    );
  cudaMemcpyToSymbol(MY_CONST(v)      , & sdata->atom.v    .dev_data, sizeof(V_FLOAT*));
  cudaMemcpyToSymbol(MY_CONST(x)      , & sdata->atom.x    .dev_data, sizeof(X_FLOAT*));
  cudaMemcpyToSymbol(MY_CONST(xhold)  , & sdata->atom.xhold.dev_data, sizeof(X_FLOAT*)); //might be moved to a neighbor record in sdata
  cudaMemcpyToSymbol(MY_CONST(maxhold), & sdata->atom.maxhold        , sizeof(int)    ); //might be moved to a neighbor record in sdata
  cudaMemcpyToSymbol(MY_CONST(reneigh_flag), & sdata->buffer, sizeof(int*));             //might be moved to a neighbor record in sdata
  cudaMemcpyToSymbol(MY_CONST(triggerneighsq), & sdata->atom.triggerneighsq, sizeof(X_FLOAT)); //might be moved to a neighbor record in sdata

  #endif
}

// (Re)allocate the small device communication buffer (reneighboring flag)
// and publish its address to the kernels.
// NOTE(review): size comparison and first MYDBG message reconstructed after
// patch corruption — confirm against upstream.
void Cuda_FixNVECuda_UpdateBuffer(cuda_shared_data* sdata)
{
  int size = (unsigned)10 * sizeof(int);
  if(sdata->buffersize < size)
  {
    MYDBG(printf("Resize buffer at %p with %i kB\n", sdata->buffer, sdata->buffersize);)
    CudaWrapper_FreeCudaData(sdata->buffer, sdata->buffersize);
    sdata->buffer = CudaWrapper_AllocCudaData(size);
    sdata->buffersize = size;
    sdata->buffer_new++;
    MYDBG(printf("New buffer at %p with %i kB\n", sdata->buffer, sdata->buffersize);)
  }
  cudaMemcpyToSymbol(MY_CONST(buffer)      , & sdata->buffer, sizeof(int*));
  cudaMemcpyToSymbol(MY_CONST(reneigh_flag), & sdata->buffer, sizeof(int*)); //might be moved to a neighbor record in sdata
}

// One-time initialization: upload timestep constants and flags.
void Cuda_FixNVECuda_Init(cuda_shared_data* sdata, X_FLOAT dtv, V_FLOAT dtf)
{
  if(sdata->atom.mass_host)
    cudaMemcpyToSymbol(MY_CONST(mass)    , & sdata->atom.mass.dev_data  , sizeof(V_FLOAT*));
  cudaMemcpyToSymbol(MY_CONST(dtf)       , & dtf                        , sizeof(V_FLOAT) );
  cudaMemcpyToSymbol(MY_CONST(dtv)       , & dtv                        , sizeof(X_FLOAT) );
  cudaMemcpyToSymbol(MY_CONST(triggerneighsq), & sdata->atom.triggerneighsq, sizeof(X_FLOAT));
  cudaMemcpyToSymbol(MY_CONST(dist_check), & sdata->atom.dist_check     , sizeof(int)     );
  cudaMemcpyToSymbol(MY_CONST(rmass_flag), & sdata->atom.rmass_flag     , sizeof(int)     );
  Cuda_FixNVECuda_UpdateNmax(sdata);
}

// First half of velocity-Verlet: v += dtf/m * f, then x += dtv * v.
// Also retrieves the device-side reneighboring flag.
// mynlocal can be nfirst if firstgroup==igroup, see the cpp fix.
void Cuda_FixNVECuda_InitialIntegrate(cuda_shared_data* sdata, int groupbit, int mynlocal)
{
  if(sdata->atom.update_nmax)
    Cuda_FixNVECuda_UpdateNmax(sdata);
  if(sdata->atom.update_nlocal)
    cudaMemcpyToSymbol(MY_CONST(nlocal), & sdata->atom.nlocal, sizeof(int));
  if(sdata->buffer_new)
    Cuda_FixNVECuda_UpdateBuffer(sdata);

  #ifdef CUDA_USE_BINNING

  dim3 grid(sdata->domain.bin_dim[0], sdata->domain.bin_dim[1] * sdata->domain.bin_neighbors[2], 1);
  dim3 threads(sdata->domain.bin_nmax, 1, 1);
  // NOTE(review): the original launched "FixNVECuda_InitialIntegrate_N_Kernel",
  // which is not defined in fix_nve_cuda_kernel.cu; renamed to the defined
  // kernel (branch is dead unless CUDA_USE_BINNING is set) — confirm upstream.
  FixNVECuda_InitialIntegrate_Kernel<<<grid, threads>>>(groupbit);
  cudaThreadSynchronize();
  CUT_CHECK_ERROR("Cuda_FixNVECuda_InitialIntegrate_N: fix nve initial integrate (binning) Kernel execution failed");

  #else

  int3 layout = getgrid(mynlocal);
  dim3 threads(layout.z, 1, 1);
  dim3 grid(layout.x, layout.y, 1);
  cudaMemset(sdata->buffer, 0, sizeof(int));  // clear the reneighboring flag
  FixNVECuda_InitialIntegrate_Kernel<<<grid, threads>>>(groupbit);
  cudaThreadSynchronize();
  int reneigh_flag;
  cudaMemcpy((void*)(&reneigh_flag), sdata->buffer, sizeof(int), cudaMemcpyDeviceToHost);
  sdata->atom.reneigh_flag += reneigh_flag;
  CUT_CHECK_ERROR("Cuda_FixNVECuda_InitialIntegrate_N: fix nve initial integrate Kernel execution failed");

  #endif
}

// Second half of velocity-Verlet: v += dtf/m * f.
// mynlocal can be nfirst if firstgroup==igroup, see the cpp fix.
void Cuda_FixNVECuda_FinalIntegrate(cuda_shared_data* sdata, int groupbit, int mynlocal)
{
  if(sdata->atom.update_nmax)
    Cuda_FixNVECuda_UpdateNmax(sdata);
  if(sdata->atom.update_nlocal)
    cudaMemcpyToSymbol(MY_CONST(nlocal), & sdata->atom.nlocal, sizeof(int));
  if(sdata->buffer_new)
    Cuda_FixNVECuda_UpdateBuffer(sdata);

  #ifdef CUDA_USE_BINNING

  dim3 grid(sdata->domain.bin_dim[0], sdata->domain.bin_dim[1] * sdata->domain.bin_neighbors[2], 1);
  dim3 threads(sdata->domain.bin_nmax, 1, 1);
  FixNVECuda_FinalIntegrate_Kernel<<<grid, threads>>>(groupbit);
  cudaThreadSynchronize();
  CUT_CHECK_ERROR("Cuda_FixNVECuda_FinalIntegrate: fix nve final integrate (binning) Kernel execution failed");

  #else

  int3 layout = getgrid(mynlocal);
  dim3 threads(layout.z, 1, 1);
  dim3 grid(layout.x, layout.y, 1);
  FixNVECuda_FinalIntegrate_Kernel<<<grid, threads>>>(groupbit);
  cudaThreadSynchronize();
  CUT_CHECK_ERROR("Cuda_FixNVECuda_FinalIntegrate: fix nve final integrate Kernel execution failed");

  #endif
}

diff --git a/lib/cuda/fix_nve_cuda_cu.h b/lib/cuda/fix_nve_cuda_cu.h
new file mode 100644
index 0000000000..93cabe8d8b
--- /dev/null
+++ b/lib/cuda/fix_nve_cuda_cu.h
@@ -0,0 +1,28 @@
/* ----------------------------------------------------------------------
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator

   Original Version:
   http://lammps.sandia.gov, Sandia National Laboratories
   Steve Plimpton, sjplimp@sandia.gov

   See the README file in the top-level LAMMPS directory.

   -----------------------------------------------------------------------

   USER-CUDA Package and associated modifications:
   https://sourceforge.net/projects/lammpscuda/

   Christian Trott, christian.trott@tu-ilmenau.de
   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
   Theoretical Physics II, University of Technology Ilmenau, Germany

   See the README file in the USER-CUDA directory.

   This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */

#include "cuda_shared.h"

// Host-side entry points of the fix nve CUDA wrappers.
// mynlocal can be nfirst if firstgroup==igroup, see the cpp fix.
extern "C" void Cuda_FixNVECuda_Init(cuda_shared_data* sdata, X_FLOAT dtv, V_FLOAT dtf);
extern "C" void Cuda_FixNVECuda_InitialIntegrate(cuda_shared_data* sdata, int groupbit, int mynlocal);
extern "C" void Cuda_FixNVECuda_FinalIntegrate(cuda_shared_data* sdata, int groupbit, int mynlocal);
diff --git a/lib/cuda/fix_nve_cuda_kernel.cu b/lib/cuda/fix_nve_cuda_kernel.cu
new file mode 100644
index 0000000000..84f59fb307
--- /dev/null
+++ b/lib/cuda/fix_nve_cuda_kernel.cu
@@ -0,0 +1,137 @@
/* ----------------------------------------------------------------------
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator

   Original Version:
   http://lammps.sandia.gov, Sandia National Laboratories
   Steve Plimpton, sjplimp@sandia.gov

   See the README file in the top-level LAMMPS directory.

   -----------------------------------------------------------------------

   USER-CUDA Package and associated modifications:
   https://sourceforge.net/projects/lammpscuda/

   Christian Trott, christian.trott@tu-ilmenau.de
   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
   Theoretical Physics II, University of Technology Ilmenau, Germany

   See the README file in the USER-CUDA directory.

   This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */

// Per-thread displacement check since the last neighbor-list build.
// FIX: the original read _xhold[i] unconditionally, i.e. also for threads
// with i >= _nlocal (the padding threads of the last block), which is an
// out-of-range read; the reads are now guarded exactly as in the fix_nh
// variant of this helper — masked-out threads contribute d = 0 as before.
// NOTE(review): legacy mask-less __all() warp vote, valid only on the
// pre-Volta targets of this package (sm_13/sm_20/sm_21).
static inline __device__ void check_distance(X_FLOAT &xtmp, X_FLOAT &ytmp, X_FLOAT &ztmp, int &i, int groupbit)
{
  if(_dist_check)
  {
    X_FLOAT d = X_F(0.0);
    if((i < _nlocal) && (_mask[i] & groupbit))
    {
      X_FLOAT tmp = xtmp - _xhold[i];
      d = tmp * tmp;
      tmp = ytmp - _xhold[i + _maxhold];
      d += tmp * tmp;
      tmp = ztmp - _xhold[i + 2 * _maxhold];
      d += tmp * tmp;
    }

    if(not __all(d <= _triggerneighsq))
      _reneigh_flag[0] = 1;
  }
}

// First half of velocity-Verlet: v += dtf/m * f, then x += dtv * v,
// followed by the warp-wide displacement vote.
__global__ void FixNVECuda_InitialIntegrate_Kernel(int groupbit)
{
  X_FLOAT xtmp, ytmp, ztmp;
  #ifdef CUDA_USE_BINNING

  const unsigned bin = gridDim.y * blockIdx.x + blockIdx.y;
  if(threadIdx.x < _bin_count_local[bin])
  {
    const int i = 3 * blockDim.x * bin + threadIdx.x;
    if(_mask[i] & groupbit)
    {
      F_FLOAT* my_f = _binned_f + i;
      V_FLOAT* my_v = _binned_v + i;
      X_FLOAT* my_x = _binned_x + i;

      V_FLOAT dtfm = _dtf;  // FIX: missing ';' in original (syntax error in this branch)
      if(_rmass_flag) dtfm *= V_F(1.0) / _binned_rmass[i];
      else            dtfm *= V_F(1.0) / _mass[_binned_type[blockDim.x * bin + threadIdx.x]];

      V_FLOAT v_mem;
      v_mem = *my_v += dtfm * (*my_f); xtmp = *my_x += _dtv * v_mem; my_f += blockDim.x; my_v += blockDim.x; my_x += blockDim.x;
      v_mem = *my_v += dtfm * (*my_f); ytmp = *my_x += _dtv * v_mem; my_f += blockDim.x; my_v += blockDim.x; my_x += blockDim.x;
      v_mem = *my_v += dtfm * (*my_f); ztmp = *my_x += _dtv * v_mem;
    }
  }

  #else

  int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
  if(i < _nlocal && _mask[i] & groupbit)
  {
    F_FLOAT* my_f = _f + i;  // SoA: components at offsets 0, _nmax, 2*_nmax
    V_FLOAT* my_v = _v + i;
    X_FLOAT* my_x = _x + i;

    V_FLOAT dtfm = _dtf;
    if(_rmass_flag) dtfm *= V_F(1.0) / _rmass[i];
    else            dtfm *= V_F(1.0) / _mass[_type[i]];

    V_FLOAT v_mem;
    v_mem = *my_v += dtfm * (*my_f); xtmp = *my_x += _dtv * v_mem; my_f += _nmax; my_v += _nmax; my_x += _nmax;
    v_mem = *my_v += dtfm * (*my_f); ytmp = *my_x += _dtv * v_mem; my_f += _nmax; my_v += _nmax; my_x += _nmax;
    v_mem = *my_v += dtfm * (*my_f); ztmp = *my_x += _dtv * v_mem;
  }

  #endif

  // xtmp/ytmp/ztmp stay uninitialized for masked-out threads; check_distance
  // ignores them for those threads (their d is forced to 0).
  check_distance(xtmp, ytmp, ztmp, i, groupbit);
}

// Second half of velocity-Verlet: v += dtf/m * f.
__global__ void FixNVECuda_FinalIntegrate_Kernel(int groupbit)
{
  #ifdef CUDA_USE_BINNING

  const unsigned bin = gridDim.y * blockIdx.x + blockIdx.y;
  if(threadIdx.x < _bin_count_local[bin])
  {
    const int i = 3 * blockDim.x * bin + threadIdx.x;
    if(_mask[i] & groupbit)
    {
      F_FLOAT* my_f = _binned_f + i;
      V_FLOAT* my_v = _binned_v + i;

      V_FLOAT dtfm = _dtf;  // FIX: missing ';' in original (syntax error in this branch)
      if(_rmass_flag) dtfm *= V_F(1.0) / _binned_rmass[i];
      else            dtfm *= V_F(1.0) / _mass[_binned_type[blockDim.x * bin + threadIdx.x]];

      *my_v += dtfm * (*my_f); my_f += blockDim.x; my_v += blockDim.x;
      *my_v += dtfm * (*my_f); my_f += blockDim.x; my_v += blockDim.x;
      *my_v += dtfm * (*my_f);
    }
  }

  #else

  int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
  if(i < _nlocal && _mask[i] & groupbit)
  {
    F_FLOAT* my_f = _f + i;
    V_FLOAT* my_v = _v + i;

    V_FLOAT dtfm = _dtf;
    if(_rmass_flag) dtfm *= V_F(1.0) / _rmass[i];
    else            dtfm *= V_F(1.0) / _mass[_type[i]];

    *my_v += dtfm * (*my_f); my_f += _nmax; my_v += _nmax;
    *my_v += dtfm * (*my_f); my_f += _nmax; my_v += _nmax;
    *my_v += dtfm * (*my_f);
  }

  #endif
}

diff --git a/lib/cuda/fix_set_force_cuda.cu b/lib/cuda/fix_set_force_cuda.cu
new file mode 100644
index 0000000000..6d0f2fde66
--- /dev/null
+++ b/lib/cuda/fix_set_force_cuda.cu
@@ -0,0 +1,93 @@
/* ----------------------------------------------------------------------
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator

   Original Version:
   http://lammps.sandia.gov, Sandia National Laboratories
   Steve Plimpton, sjplimp@sandia.gov

   See the README file in the top-level LAMMPS directory.
+ + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#include +#define MY_PREFIX fix_set_force_cuda +#include "cuda_shared.h" +#include "cuda_common.h" +#include "crm_cuda_utils.cu" + +#include "fix_set_force_cuda_cu.h" +#include "fix_set_force_cuda_kernel.cu" + +void Cuda_FixSetForceCuda_UpdateBuffer(cuda_shared_data* sdata) +{ + int3 layout=getgrid(sdata->atom.nlocal,3*sizeof(F_FLOAT)); + dim3 threads(layout.z, 1, 1); + dim3 grid(layout.x, layout.y, 1); + int size=(unsigned)(layout.z*layout.y*layout.x)*3*sizeof(F_FLOAT); + if(sdata->buffersizebuffer,sdata->buffersize);) + CudaWrapper_FreeCudaData(sdata->buffer,sdata->buffersize); + sdata->buffer = CudaWrapper_AllocCudaData(size); + sdata->buffersize=size; + sdata->buffer_new++; + MYDBG(printf("New buffer at %p with %i kB\n",sdata->buffer,sdata->buffersize);) + + } + cudaMemcpyToSymbol(MY_CONST(buffer) , & sdata->buffer, sizeof(int*) ); +} + +void Cuda_FixSetForceCuda_UpdateNmax(cuda_shared_data* sdata) +{ + cudaMemcpyToSymbol(MY_CONST(mask) , & sdata->atom.mask .dev_data, sizeof(int*) ); + cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); + cudaMemcpyToSymbol(MY_CONST(nmax) , & sdata->atom.nmax , sizeof(int) ); + cudaMemcpyToSymbol(MY_CONST(x) , & sdata->atom.x .dev_data, sizeof(X_FLOAT*) ); + cudaMemcpyToSymbol(MY_CONST(f) , & sdata->atom.f .dev_data, sizeof(F_FLOAT*) ); +} + +void Cuda_FixSetForceCuda_Init(cuda_shared_data* sdata) +{ + Cuda_FixSetForceCuda_UpdateNmax(sdata); + +} + + +void 
Cuda_FixSetForceCuda_PostForce(cuda_shared_data* sdata, int groupbit,F_FLOAT xvalue,F_FLOAT yvalue,F_FLOAT zvalue,F_FLOAT* foriginal,int flagx,int flagy,int flagz) +{ + if(sdata->atom.update_nmax) + Cuda_FixSetForceCuda_UpdateNmax(sdata); + if(sdata->atom.update_nlocal) + cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); + if(sdata->buffer_new) + Cuda_FixSetForceCuda_UpdateBuffer(sdata); + + + int3 layout=getgrid(sdata->atom.nlocal,3*sizeof(F_FLOAT)); + dim3 threads(layout.z, 1, 1); + dim3 grid(layout.x, layout.y, 1); + + Cuda_FixSetForceCuda_PostForce_Kernel<<>> (groupbit,xvalue,yvalue,zvalue,flagx,flagy,flagz); + cudaThreadSynchronize(); + CUT_CHECK_ERROR("Cuda_Cuda_FixSetForceCuda_PostForce: fix add_force post_force compute Kernel execution failed"); + + int oldgrid=grid.x; + grid.x=3; + threads.x=512; + Cuda_FixSetForceCuda_Reduce_FOriginal<<>> (oldgrid,foriginal); + cudaThreadSynchronize(); + CUT_CHECK_ERROR("Cuda_Cuda_FixSetForceCuda_PostForce: fix add_force post_force reduce Kernel execution failed"); + +} diff --git a/lib/cuda/fix_set_force_cuda_cu.h b/lib/cuda/fix_set_force_cuda_cu.h new file mode 100644 index 0000000000..3121a684ad --- /dev/null +++ b/lib/cuda/fix_set_force_cuda_cu.h @@ -0,0 +1,27 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. 
+ + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#include "cuda_shared.h" + +extern "C" void Cuda_FixSetForceCuda_Init(cuda_shared_data* sdata); +extern "C" void Cuda_FixSetForceCuda_PostForce(cuda_shared_data* sdata, int groupbit,F_FLOAT xvalue,F_FLOAT yvalue,F_FLOAT zvalue,F_FLOAT* foriginal,int flagx,int flagy,int flagz); diff --git a/lib/cuda/fix_set_force_cuda_kernel.cu b/lib/cuda/fix_set_force_cuda_kernel.cu new file mode 100644 index 0000000000..f5836dee5f --- /dev/null +++ b/lib/cuda/fix_set_force_cuda_kernel.cu @@ -0,0 +1,79 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ + +extern __shared__ F_FLOAT sharedmem[]; + + +__global__ void Cuda_FixSetForceCuda_PostForce_Kernel(int groupbit,F_FLOAT xvalue,F_FLOAT yvalue,F_FLOAT zvalue,int flagx,int flagy,int flagz) +{ + int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x; + sharedmem[threadIdx.x]=0; + sharedmem[threadIdx.x+blockDim.x]=0; + sharedmem[threadIdx.x+2*blockDim.x]=0; + if(i < _nlocal) + if (_mask[i] & groupbit) { + sharedmem[threadIdx.x]=_f[i]; + sharedmem[threadIdx.x+blockDim.x]=_f[i+1*_nmax]; + sharedmem[threadIdx.x+2*blockDim.x]=_f[i+2*_nmax]; + + if(flagx) _f[i] = xvalue; + if(flagy) _f[i+1*_nmax] = yvalue; + if(flagz) _f[i+2*_nmax] = zvalue; + } + + + reduceBlock(sharedmem); + reduceBlock(&sharedmem[blockDim.x]); + reduceBlock(&sharedmem[2*blockDim.x]); + F_FLOAT* buffer=(F_FLOAT*)_buffer; + if(threadIdx.x==0) + { + buffer[blockIdx.x*gridDim.y+blockIdx.y]=sharedmem[0]; + buffer[blockIdx.x*gridDim.y+blockIdx.y+gridDim.x*gridDim.y]=sharedmem[blockDim.x]; + buffer[blockIdx.x*gridDim.y+blockIdx.y+2*gridDim.x*gridDim.y]=sharedmem[2*blockDim.x]; + } +} + + +__global__ void Cuda_FixSetForceCuda_Reduce_FOriginal(int n,F_FLOAT* foriginal) +{ + int i=0; + sharedmem[threadIdx.x]=0; + F_FLOAT myforig=0.0; + F_FLOAT* buf=(F_FLOAT*)_buffer; + buf=&buf[blockIdx.x*n]; + while(i +#define MY_PREFIX fix_shake_cuda +#include "cuda_shared.h" +#include "cuda_common.h" +#include "crm_cuda_utils.cu" +#include "fix_shake_cuda_cu.h" +#include "cuda_pair_virial_kernel_nc.cu" + +#define _shake_atom MY_AP(shake_atom) +#define _shake_type MY_AP(shake_type) +#define _shake_flag MY_AP(shake_flag) +#define _xshake MY_AP(xshake) +#define _dtfsq MY_AP(dtfsq) +#define _bond_distance MY_AP(bond_distance) +#define _angle_distance MY_AP(angle_distance) +#define _max_iter MY_AP(max_iter) +#define _tolerance MY_AP(tolerance) +__device__ __constant__ int* _shake_atom; +__device__ __constant__ int* _shake_type; +__device__ 
// Remaining constant-memory declarations for the SHAKE kernels, then the host
// wrappers of lib/cuda/fix_shake_cuda.cu. The Update* helpers re-push device
// pointers / scalars into constant memory after host-side reallocation.
__constant__ int* _shake_flag; +__device__ __constant__ X_FLOAT3* _xshake; +__device__ __constant__ F_FLOAT _dtfsq; +__device__ __constant__ X_FLOAT* _bond_distance; +__device__ __constant__ X_FLOAT* _angle_distance; +__device__ __constant__ int _max_iter; +__device__ __constant__ X_FLOAT _tolerance; + +#include "fix_shake_cuda_kernel.cu" + +void Cuda_FixShakeCuda_UpdateNmax(cuda_shared_data* sdata) +{ + cudaMemcpyToSymbol(MY_CONST(x) , & sdata->atom.x .dev_data, sizeof(X_FLOAT*) ); + cudaMemcpyToSymbol(MY_CONST(v) , & sdata->atom.v .dev_data, sizeof(V_FLOAT*) ); + cudaMemcpyToSymbol(MY_CONST(f) , & sdata->atom.f .dev_data, sizeof(F_FLOAT*) ); + cudaMemcpyToSymbol(MY_CONST(mask) , & sdata->atom.mask .dev_data, sizeof(int*) ); + cudaMemcpyToSymbol(MY_CONST(tag) , & sdata->atom.tag .dev_data, sizeof(int*) ); + cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); + cudaMemcpyToSymbol(MY_CONST(nmax) , & sdata->atom.nmax , sizeof(int) ); + cudaMemcpyToSymbol(MY_CONST(rmass) , & sdata->atom.rmass.dev_data, sizeof(V_FLOAT*) ); + cudaMemcpyToSymbol(MY_CONST(type) , & sdata->atom.type .dev_data, sizeof(int*) ); + cudaMemcpyToSymbol(MY_CONST(map_array), & sdata->atom.map_array .dev_data, sizeof(int*) ); + cudaMemcpyToSymbol(MY_CONST(vatom) , & sdata->atom.vatom.dev_data, sizeof(ENERGY_FLOAT*) ); + cudaMemcpyToSymbol(MY_CONST(debugdata), & sdata->debugdata , sizeof(int*) ); +} + +void Cuda_FixShakeCuda_UpdateDomain(cuda_shared_data* sdata) +{ + cudaMemcpyToSymbol(MY_CONST(periodicity), sdata->domain.periodicity , sizeof(int)*3 ); + cudaMemcpyToSymbol(MY_CONST(prd) , sdata->domain.prd , sizeof(X_FLOAT)*3 ); + cudaMemcpyToSymbol(MY_CONST(triclinic) , &sdata->domain.triclinic , sizeof(int) ); + cudaMemcpyToSymbol(MY_CONST(h) , sdata->domain.h , sizeof(X_FLOAT)*6 ); +} + +void Cuda_FixShakeCuda_UpdateBuffer(cuda_shared_data* sdata,int size) +{ + if(sdata->buffersizebuffer,sdata->buffersize);) + CudaWrapper_FreeCudaData(sdata->buffer,sdata->buffersize); + 
// NOTE(review): the UpdateBuffer condition above is garbled by extraction --
// "if(sdata->buffersizebuffer,sdata->buffersize);)" was presumably
// "if(sdata->buffersize<size) { MYDBG(printf(...));" with the text after '<'
// eaten. Grow-only realloc of the shared host<->device staging buffer follows.
// Cuda_FixShakeCuda_Init copies all SHAKE parameters (cluster tables, timestep
// factors, constraint distances, iteration limit/tolerance) into constant memory.
// NOTE(review): MY_CONST(flag) is copied twice in Init (second copy redundant).
// CAUTION: this original text embeds "//" markers mid-line ("; // + +" below);
// left byte-identical.
sdata->buffer = CudaWrapper_AllocCudaData(size); + sdata->buffersize=size; + sdata->buffer_new++; + MYDBG(printf("New buffer at %p with %i kB\n",sdata->buffer,sdata->buffersize);) + + } + cudaMemcpyToSymbol(MY_CONST(buffer) , & sdata->buffer, sizeof(int*) ); +} + +void Cuda_FixShakeCuda_Init(cuda_shared_data* sdata,X_FLOAT dtv, F_FLOAT dtfsq, + void* shake_flag,void* shake_atom,void* shake_type, void* xshake, + void* bond_distance,void* angle_distance,void* virial, + int max_iter,X_FLOAT tolerance) +{ + Cuda_FixShakeCuda_UpdateNmax(sdata); + Cuda_FixShakeCuda_UpdateDomain(sdata); + cudaMemcpyToSymbol(MY_CONST(shake_atom) , & shake_atom , sizeof(void*) ); + cudaMemcpyToSymbol(MY_CONST(shake_type) , & shake_type , sizeof(void*) ); + cudaMemcpyToSymbol(MY_CONST(shake_flag) , & shake_flag , sizeof(void*) ); + cudaMemcpyToSymbol(MY_CONST(xshake) , & xshake , sizeof(void*) ); + cudaMemcpyToSymbol(MY_CONST(dtv) , & dtv , sizeof(X_FLOAT)); + cudaMemcpyToSymbol(MY_CONST(dtfsq) , & dtfsq , sizeof(F_FLOAT)); + cudaMemcpyToSymbol(MY_CONST(bond_distance) , & bond_distance , sizeof(void*) ); + cudaMemcpyToSymbol(MY_CONST(angle_distance) , & angle_distance , sizeof(void*) ); + cudaMemcpyToSymbol(MY_CONST(virial) , & virial , sizeof(void*) ); + cudaMemcpyToSymbol(MY_CONST(flag) , &sdata->flag , sizeof(int*) ); + cudaMemcpyToSymbol(MY_CONST(max_iter) , &max_iter , sizeof(int) ); + cudaMemcpyToSymbol(MY_CONST(tolerance) , &tolerance , sizeof(X_FLOAT)); + + if(sdata->atom.mass_host) + cudaMemcpyToSymbol(MY_CONST(mass),& sdata->atom.mass.dev_data , sizeof(V_FLOAT*) ); + cudaMemcpyToSymbol(MY_CONST(rmass_flag), & sdata->atom.rmass_flag , sizeof(int) ); // + + cudaMemcpyToSymbol(MY_CONST(flag) , &sdata->flag, sizeof(int*)); + +} + +void Cuda_FixShakeCuda_UnconstrainedUpdate(cuda_shared_data* sdata) +{ + if(sdata->atom.update_nmax) + Cuda_FixShakeCuda_UpdateNmax(sdata); + if(sdata->atom.update_nlocal) + cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int)); + 
// NOTE(review): every kernel launch in this patch appears as "<<>>" -- the
// launch configuration (e.g. "<<<grid, threads>>>", plus a dynamic shared-memory
// size for the Shake kernel) was eaten by the '<'-swallowing extraction bug.
// Also: cudaThreadSynchronize() is deprecated in modern CUDA in favor of
// cudaDeviceSynchronize() -- kept as-is since the patch targets old toolkits.
// Cuda_FixShakeCuda_Shake launches the constraint kernel and, when vflag is set,
// reduces the per-block virial partials with PairVirialCompute_reduce.
if(sdata->buffer_new) + Cuda_FixShakeCuda_UpdateBuffer(sdata,10*sizeof(double)); + int3 layout=getgrid(sdata->atom.nlocal); + dim3 threads(layout.z, 1, 1); + dim3 grid(layout.x, layout.y, 1); + + FixShakeCuda_UnconstrainedUpdate_Kernel<<>> (); + cudaThreadSynchronize(); + CUT_CHECK_ERROR("FixShakeCuda_UnconstrainedUpdate: Kernel execution failed"); +} + +void Cuda_FixShakeCuda_Shake(cuda_shared_data* sdata,int vflag,int vflag_atom,int* list,int nlist) +{ + if(sdata->atom.update_nmax) + Cuda_FixShakeCuda_UpdateNmax(sdata); + if(sdata->domain.update) + Cuda_FixShakeCuda_UpdateDomain(sdata); + if(sdata->atom.update_nlocal) + cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int)); + int3 layout=getgrid(sdata->atom.nlocal,6*sizeof(ENERGY_FLOAT),64); + dim3 threads(layout.z, 1, 1); + dim3 grid(layout.x, layout.y, 1); + if(sdata->buffer_new) + Cuda_FixShakeCuda_UpdateBuffer(sdata,grid.x*grid.y*6*sizeof(ENERGY_FLOAT)); + + BindXTypeTexture(sdata); + + FixShakeCuda_Shake_Kernel<<>> (vflag,vflag_atom,list,nlist); + cudaThreadSynchronize(); + + CUT_CHECK_ERROR("FixShakeCuda_Shake: Kernel execution failed"); + + if(vflag) + { + int n=grid.x*grid.y; + grid.x=6; + grid.y=1; + threads.x=256; + MY_AP(PairVirialCompute_reduce)<<>>(n); + cudaThreadSynchronize(); + CUT_CHECK_ERROR("Cuda_FixShakeCuda: (no binning) virial compute Kernel execution failed"); + } + +} + +int Cuda_FixShakeCuda_PackComm(cuda_shared_data* sdata,int n,int iswap,void* buf_send,int* pbc,int pbc_flag) +{ + if(sdata->atom.update_nmax) + Cuda_FixShakeCuda_UpdateNmax(sdata); + if(sdata->atom.update_nlocal) + cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); + int size=n*3*sizeof(X_FLOAT); + if(sdata->buffer_new or (size>sdata->buffersize)) + Cuda_FixShakeCuda_UpdateBuffer(sdata,size); + + X_FLOAT dx=0.0; + X_FLOAT dy=0.0; + X_FLOAT dz=0.0; + if (pbc_flag != 0) { + if (sdata->domain.triclinic == 0) { + dx = pbc[0]*sdata->domain.prd[0]; + dy = pbc[1]*sdata->domain.prd[1]; 
// PackComm: packs the predicted coordinates (xshake) of the atoms in the swap's
// send list into the staging buffer, applying the periodic-image shift
// (dx,dy,dz), then copies them to the host send buffer. Returns the number of
// X_FLOAT values packed (3 per atom). The aflag readback reports kernel-side
// error conditions written into sdata->flag.
+ dz = pbc[2]*sdata->domain.prd[2]; + } else { + dx = pbc[0]*sdata->domain.prd[0] + pbc[5]*sdata->domain.xy + pbc[4]*sdata->domain.xz; + dy = pbc[1]*sdata->domain.prd[1] + pbc[3]*sdata->domain.yz; + dz = pbc[2]*sdata->domain.prd[2]; + }} + + int3 layout=getgrid(n); + dim3 threads(layout.z, 1, 1); + dim3 grid(layout.x, layout.y, 1); + + if(sdata->atom.nlocal>0) + { + cudaMemset( sdata->flag,0,sizeof(int)); + FixShakeCuda_PackComm_Kernel<<>>((int*) sdata->comm.sendlist.dev_data,n,sdata->comm.maxlistlength,iswap,dx,dy,dz); + cudaThreadSynchronize(); + cudaMemcpy(buf_send, sdata->buffer, n*3*sizeof(X_FLOAT), cudaMemcpyDeviceToHost); + int aflag; + cudaMemcpy(&aflag, sdata->flag, sizeof(int), cudaMemcpyDeviceToHost); + if(aflag!=0) printf("aflag PackComm: %i\n",aflag); + CUT_CHECK_ERROR("Cuda_FixShakeCuda_PackComm: Kernel execution failed"); + + } + return 3*n; +} + +int Cuda_FixShakeCuda_PackComm_Self(cuda_shared_data* sdata,int n,int iswap,int first,int* pbc,int pbc_flag) +{ + if(sdata->atom.update_nmax) + Cuda_FixShakeCuda_UpdateNmax(sdata); + if(sdata->atom.update_nlocal) + cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); + int size=n*3*sizeof(X_FLOAT); + if(sdata->buffer_new or (size>sdata->buffersize)) + Cuda_FixShakeCuda_UpdateBuffer(sdata,size); + static int count=-1; + count++; + X_FLOAT dx=0.0; + X_FLOAT dy=0.0; + X_FLOAT dz=0.0; + if (pbc_flag != 0) { + if (sdata->domain.triclinic == 0) { + dx = pbc[0]*sdata->domain.prd[0]; + dy = pbc[1]*sdata->domain.prd[1]; + dz = pbc[2]*sdata->domain.prd[2]; + } else { + dx = pbc[0]*sdata->domain.prd[0] + pbc[5]*sdata->domain.xy + pbc[4]*sdata->domain.xz; + dy = pbc[1]*sdata->domain.prd[1] + pbc[3]*sdata->domain.yz; + dz = pbc[2]*sdata->domain.prd[2]; + }} + + + + int3 layout=getgrid(n); + dim3 threads(layout.z, 1, 1); + dim3 grid(layout.x, layout.y, 1); + if(sdata->atom.nlocal>0) + { + FixShakeCuda_PackComm_Self_Kernel<<>>((int*) 
// PackComm_Self: same packing as PackComm but for a processor exchanging with
// itself -- the kernel writes directly into the ghost slots starting at index
// 'first', so no host round-trip is needed.
// NOTE(review): the "static int count" above is incremented but never read --
// dead debug code. Note also the error-message string below says
// "Cuda_CommCuda_PackComm_Self" (copy/paste from commcuda), not FixShake.
// UnpackComm: copies received ghost coordinates host->device and scatters them
// into xshake starting at 'first'.
sdata->comm.sendlist.dev_data,n,sdata->comm.maxlistlength,iswap,dx,dy,dz,first); + cudaThreadSynchronize(); + CUT_CHECK_ERROR("Cuda_CommCuda_PackComm_Self: Kernel execution failed"); + } + + return 3*n; +} + +void Cuda_FixShakeCuda_UnpackComm(cuda_shared_data* sdata,int n,int first,void* buf_recv) +{ + if(sdata->atom.update_nmax) + Cuda_FixShakeCuda_UpdateNmax(sdata); + if(sdata->atom.update_nlocal) + cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); + int size=n*3*sizeof(X_FLOAT); + if(sdata->buffer_new or (size>sdata->buffersize)) + Cuda_FixShakeCuda_UpdateBuffer(sdata,size); + + int3 layout=getgrid(n); + dim3 threads(layout.z, 1, 1); + dim3 grid(layout.x, layout.y, 1); + if(sdata->atom.nlocal>0) + { + cudaMemcpy(sdata->buffer,(void*)buf_recv, n*3*sizeof(X_FLOAT), cudaMemcpyHostToDevice); + FixShakeCuda_UnpackComm_Kernel<<>>(n,first); + cudaThreadSynchronize(); + CUT_CHECK_ERROR("Cuda_FixShakeCuda_UnpackComm: Kernel execution failed"); + + } +} diff --git a/lib/cuda/fix_shake_cuda_cu.h b/lib/cuda/fix_shake_cuda_cu.h new file mode 100644 index 0000000000..b4276b741a --- /dev/null +++ b/lib/cuda/fix_shake_cuda_cu.h @@ -0,0 +1,34 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
// lib/cuda/fix_shake_cuda_cu.h: extern "C" declarations mirroring the host
// wrappers above, followed by the diff header and license banner of
// lib/cuda/fix_shake_cuda_kernel.cu (the device-side SHAKE kernels).
+------------------------------------------------------------------------- */ +#include "cuda_shared.h" + +extern "C" void Cuda_FixShakeCuda_Init(cuda_shared_data* sdata,X_FLOAT dtv, F_FLOAT dtfsq, + void* shake_flag,void* shake_atom,void* shake_type, void* xshake, + void* bond_distance,void* angle_distance,void* virial, + int max_iter,X_FLOAT tolerance); +extern "C" void Cuda_FixShakeCuda_UnconstrainedUpdate(cuda_shared_data* sdata); +extern "C" void Cuda_FixShakeCuda_Shake(cuda_shared_data* sdata,int vflag,int vflag_atom,int* list,int nlist); +extern "C" int Cuda_FixShakeCuda_PackComm(cuda_shared_data* sdata,int n,int iswap,void* buf_send,int* pbc,int pbc_flag); +extern "C" int Cuda_FixShakeCuda_PackComm_Self(cuda_shared_data* sdata,int n,int iswap,int first,int* pbc,int pbc_flag); +extern "C" void Cuda_FixShakeCuda_UnpackComm(cuda_shared_data* sdata,int n,int first,void* buf_recv); + diff --git a/lib/cuda/fix_shake_cuda_kernel.cu b/lib/cuda/fix_shake_cuda_kernel.cu new file mode 100644 index 0000000000..e4ca822a77 --- /dev/null +++ b/lib/cuda/fix_shake_cuda_kernel.cu @@ -0,0 +1,971 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
// v_tally: distributes a constraint cluster's virial v[0..5] equally over the
// cluster's locally-owned atoms (list[0..n-1]) into the per-atom virial array
// _vatom (stride _nmax per component). The global-virial branch is commented
// out in the original. 'total' is the cluster size used as the divisor.
// minimum_image: wraps a displacement vector into the primary periodic image,
// branching on orthogonal vs. triclinic boxes (_h holds the tilt factors).
// BUG(review, suspected): in the triclinic branch below, the FIRST sub-block
// performs the z-wrap (uses _prd[2], _h[3], _h[4]) but is guarded by
// "_periodicity[1]" -- presumably it should test _periodicity[2] (cf. the CPU
// Domain::minimum_image, which checks zperiodic/yperiodic/xperiodic in that
// order). As written, z is wrapped only when y is periodic. Cannot be fixed in
// isolation here because the surrounding patch text is truncated -- confirm
// against upstream and fix _periodicity[1] -> _periodicity[2] in the first
// triclinic sub-block.
// FixShakeCuda_UnconstrainedUpdate_Kernel: one thread per local atom; for atoms
// in a SHAKE cluster it predicts the post-integration position
// x + dtv*v + dtfmsq*f (per component, SoA stride _nmax) into _xshake.
// FixShakeCuda_Shake2: solves the single-bond (2-atom) constraint analytically
// via the quadratic in lamda; clamps a negative discriminant to 0 and counts
// the event in _flag[0].
+------------------------------------------------------------------------- */ + +__device__ void v_tally(int& vflag_global,int& vflag_atom,int& n, int *list, ENERGY_FLOAT total, ENERGY_FLOAT *v) +{ + /*if(vflag_global) + { + ENERGY_FLOAT fraction = n/total; + ENERGY_FLOAT* shared = &sharedmem[threadIdx.x]; + *shared += fraction*v[0]; shared+=blockDim.x; + *shared += fraction*v[1]; shared+=blockDim.x; + *shared += fraction*v[2]; shared+=blockDim.x; + *shared += fraction*v[3]; shared+=blockDim.x; + *shared += fraction*v[4]; shared+=blockDim.x; + *shared += fraction*v[5]; + }*/ + if (vflag_atom) { + ENERGY_FLOAT fraction = ENERGY_F(1.0)/total; + for (int i = 0; i < n; i++) { + int m = list[i]; + ENERGY_FLOAT* myvatom=&_vatom[m]; + + *myvatom += fraction*v[0]; myvatom+=_nmax; + *myvatom += fraction*v[1]; myvatom+=_nmax; + *myvatom += fraction*v[2]; myvatom+=_nmax; + *myvatom += fraction*v[3]; myvatom+=_nmax; + *myvatom += fraction*v[4]; myvatom+=_nmax; + *myvatom += fraction*v[5]; + } + } +} + +inline __device__ void minimum_image(X_FLOAT3& delta) +{ + if (_triclinic == 0) { + if (_periodicity[0]) { + delta.x += delta.x < -X_F(0.5)*_prd[0] ? _prd[0] : + (delta.x > X_F(0.5)*_prd[0] ?-_prd[0] : X_F(0.0)); + } + if (_periodicity[1]) { + delta.y += delta.y < -X_F(0.5)*_prd[1] ? _prd[1] : + (delta.y > X_F(0.5)*_prd[1] ?-_prd[1] : X_F(0.0)); + } + if (_periodicity[2]) { + delta.z += delta.z < -X_F(0.5)*_prd[2] ? _prd[2] : + (delta.z > X_F(0.5)*_prd[2] ?-_prd[2] : X_F(0.0)); + } + + } else { + if (_periodicity[1]) { + delta.z += delta.z < -X_F(0.5)*_prd[2] ? _prd[2] : + (delta.z > X_F(0.5)*_prd[2] ?-_prd[2] : X_F(0.0)); + delta.y += delta.z < -X_F(0.5)*_prd[2] ? _h[3] : + (delta.z > X_F(0.5)*_prd[2] ?-_h[3] : X_F(0.0)); + delta.x += delta.z < -X_F(0.5)*_prd[2] ? _h[4] : + (delta.z > X_F(0.5)*_prd[2] ?-_h[4] : X_F(0.0)); + + } + if (_periodicity[1]) { + delta.y += delta.y < -X_F(0.5)*_prd[1] ? 
_prd[1] : + (delta.y > X_F(0.5)*_prd[1] ?-_prd[1] : X_F(0.0)); + delta.x += delta.y < -X_F(0.5)*_prd[1] ? _h[5] : + (delta.y > X_F(0.5)*_prd[1] ?-_h[5] : X_F(0.0)); + + } + if (_periodicity[0]) { + delta.x += delta.x < -X_F(0.5)*_prd[0] ? _prd[0] : + (delta.x > X_F(0.5)*_prd[0] ?-_prd[0] : X_F(0.0)); + } + } +} + +__global__ void FixShakeCuda_UnconstrainedUpdate_Kernel() +{ + int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x; + if(i>=_nlocal) return; + + X_FLOAT3 my_xshake = {X_F(0.0),X_F(0.0),X_F(0.0)}; + if(_shake_flag[i]) + { + F_FLOAT* my_f = _f + i; + V_FLOAT* my_v = _v + i; + X_FLOAT* my_x = _x + i; + + V_FLOAT dtfmsq = _dtfsq; + if(_rmass_flag) dtfmsq*= V_F(1.0) / _rmass[i]; + else dtfmsq*= V_F(1.0) / _mass[_type[i]]; + + my_xshake.x = *my_x + _dtv* *my_v + dtfmsq* *my_f; my_f += _nmax; my_v += _nmax; my_x += _nmax; + my_xshake.y = *my_x + _dtv* *my_v + dtfmsq* *my_f; my_f += _nmax; my_v += _nmax; my_x += _nmax; + my_xshake.z = *my_x + _dtv* *my_v + dtfmsq* *my_f; + } + _xshake[i]=my_xshake; +} + + + + +__device__ void FixShakeCuda_Shake2(int& vflag,int& vflag_atom,int& m) +{ + int nlist,list[2]; + ENERGY_FLOAT v[6]; + X_FLOAT invmass0,invmass1; + + // local atom IDs and constraint distances + + int i0 = _map_array[_shake_atom[m]]; + int i1 = _map_array[_shake_atom[m+_nmax]]; + X_FLOAT bond1 = _bond_distance[_shake_type[m]]; + + // r01 = distance vec between atoms, with PBC + + X_FLOAT3 r01; + + X_FLOAT4 x_i0,x_i1; + x_i0=fetchXType(i0); + x_i1=fetchXType(i1); + + r01.x = x_i0.x - x_i1.x; + r01.y = x_i0.y - x_i1.y; + r01.z = x_i0.z - x_i1.z; + minimum_image(r01); + + // s01 = distance vec after unconstrained update, with PBC + + X_FLOAT3 s01; + X_FLOAT3 xs_i0=_xshake[i0]; + X_FLOAT3 xs_i1=_xshake[i1]; + + s01.x = xs_i0.x - xs_i1.x; + s01.y = xs_i0.y - xs_i1.y; + s01.z = xs_i0.z - xs_i1.z; + minimum_image(s01); + + // scalar distances between atoms + + X_FLOAT r01sq = r01.x*r01.x + r01.y*r01.y + r01.z*r01.z; + X_FLOAT s01sq = s01.x*s01.x + 
s01.y*s01.y + s01.z*s01.z; + + // a,b,c = coeffs in quadratic equation for lamda + + if (_rmass_flag) { + invmass0 = X_F(1.0)/_rmass[i0]; + invmass1 = X_F(1.0)/_rmass[i1]; + } else { + invmass0 = X_F(1.0)/_mass[static_cast (x_i0.w)]; + invmass1 = X_F(1.0)/_mass[static_cast (x_i1.w)]; + } + + X_FLOAT a = (invmass0+invmass1)*(invmass0+invmass1) * r01sq; + X_FLOAT b = X_F(2.0) * (invmass0+invmass1) * + (s01.x*r01.x + s01.y*r01.y + s01.z*r01.z); + X_FLOAT c = s01sq - bond1*bond1; + + // error check + + X_FLOAT determ = b*b - X_F(4.0)*a*c; + if (determ < X_F(0.0)) { + _flag[0]++; + determ = X_F(0.0); + } + + // exact quadratic solution for lamda + + X_FLOAT lamda,lamda1,lamda2; + lamda1 = -b+_SQRT_(determ); + lamda2 = -lamda1 - X_F(2.0)*b; + lamda1 *= X_F(1.0) / (X_F(2.0)*a); + lamda2 *= X_F(1.0) / (X_F(2.0)*a); + + lamda = (fabs(lamda1) <= fabs(lamda2))? lamda1 : lamda2; + + // update forces if atom is owned by this processor + + lamda*= X_F(1.0)/_dtfsq; + + + //attenion: are shake clusters <-> atom unique? 
// Shake2 tail: apply the constraint force +/- lamda*r01 to whichever of the two
// atoms are locally owned, then (if requested) stage the cluster virial in
// shared memory for the block reduction and tally per-atom contributions.
// The factor compensates for the 0.5 applied later by the shared reduction
// (which is reused from the pair-force path).
+ nlist = 0; + if (i0 < _nlocal) { + _f[i0] += lamda*r01.x; + _f[i0+_nmax] += lamda*r01.y; + _f[i0+2*_nmax] += lamda*r01.z; + list[nlist++] = i0; + } + + if (i1 < _nlocal) { + _f[i1] -= lamda*r01.x; + _f[i1+_nmax] -= lamda*r01.y; + _f[i1+2*_nmax] -= lamda*r01.z; + list[nlist++] = i1; + } + + if (vflag||vflag_atom) { + ENERGY_FLOAT* shared = &sharedmem[threadIdx.x]; + X_FLOAT factor=nlist; + v[0] = lamda*r01.x*r01.x; *shared = factor*v[0]; shared+=blockDim.x; //times 2.0 since the reducing function is the same as in force calculations, which adds a factor 0.5 + v[1] = lamda*r01.y*r01.y; *shared = factor*v[1]; shared+=blockDim.x; + v[2] = lamda*r01.z*r01.z; *shared = factor*v[2]; shared+=blockDim.x; + v[3] = lamda*r01.x*r01.y; *shared = factor*v[3]; shared+=blockDim.x; + v[4] = lamda*r01.x*r01.z; *shared = factor*v[4]; shared+=blockDim.x; + v[5] = lamda*r01.y*r01.z; *shared = factor*v[5]; shared+=blockDim.x; + + v_tally(vflag,vflag_atom,nlist,list,2.0,v); + } +} + + +__device__ void FixShakeCuda_Shake3(int& vflag,int& vflag_atom,int& m) +{ + int nlist,list[3]; + ENERGY_FLOAT v[6]; + X_FLOAT invmass0,invmass1,invmass2; + + // local atom IDs and constraint distances + + int i0 = _map_array[_shake_atom[m]]; + int i1 = _map_array[_shake_atom[m+_nmax]]; + int i2 = _map_array[_shake_atom[m+2*_nmax]]; + X_FLOAT bond1 = _bond_distance[_shake_type[m]]; + X_FLOAT bond2 = _bond_distance[_shake_type[m+_nmax]]; + + // r01 = distance vec between atoms, with PBC + + X_FLOAT3 r01,r02; + + X_FLOAT4 x_i0,x_i1,x_i2; + x_i0=fetchXType(i0); + x_i1=fetchXType(i1); + x_i2=fetchXType(i2); + + r01.x = x_i0.x - x_i1.x; + r01.y = x_i0.y - x_i1.y; + r01.z = x_i0.z - x_i1.z; + minimum_image(r01); + + r02.x = x_i0.x - x_i2.x; + r02.y = x_i0.y - x_i2.y; + r02.z = x_i0.z - x_i2.z; + minimum_image(r02); + + // s01 = distance vec after unconstrained update, with PBC + + X_FLOAT3 s01,s02; + X_FLOAT3 xs_i0=_xshake[i0]; + X_FLOAT3 xs_i1=_xshake[i1]; + X_FLOAT3 xs_i2=_xshake[i2]; + + s01.x = xs_i0.x - 
// Shake3: central atom i0 bonded to i1 and i2 (two bond constraints). Builds
// the 2x2 linear system in (lamda01, lamda02), inverts it analytically, then
// iterates the quadratic correction to convergence (tolerance) or _max_iter.
xs_i1.x; + s01.y = xs_i0.y - xs_i1.y; + s01.z = xs_i0.z - xs_i1.z; + minimum_image(s01); + + s02.x = xs_i0.x - xs_i2.x; + s02.y = xs_i0.y - xs_i2.y; + s02.z = xs_i0.z - xs_i2.z; + minimum_image(s02); + + // scalar distances between atoms + + X_FLOAT r01sq = r01.x*r01.x + r01.y*r01.y + r01.z*r01.z; + X_FLOAT r02sq = r02.x*r02.x + r02.y*r02.y + r02.z*r02.z; + X_FLOAT s01sq = s01.x*s01.x + s01.y*s01.y + s01.z*s01.z; + X_FLOAT s02sq = s02.x*s02.x + s02.y*s02.y + s02.z*s02.z; + + // a,b,c = coeffs in quadratic equation for lamda + + if (_rmass_flag) { + invmass0 = X_F(1.0)/_rmass[i0]; + invmass1 = X_F(1.0)/_rmass[i1]; + invmass2 = X_F(1.0)/_rmass[i2]; + } else { + invmass0 = X_F(1.0)/_mass[static_cast (x_i0.w)]; + invmass1 = X_F(1.0)/_mass[static_cast (x_i1.w)]; + invmass2 = X_F(1.0)/_mass[static_cast (x_i2.w)]; + } + + X_FLOAT a11 = X_F(2.0) * (invmass0+invmass1) * + (s01.x*r01.x + s01.y*r01.y + s01.z*r01.z); + X_FLOAT a12 = X_F(2.0) * invmass0 * + (s01.x*r02.x + s01.y*r02.y + s01.z*r02.z); + X_FLOAT a21 = X_F(2.0) * invmass0 * + (s02.x*r01.x + s02.y*r01.y + s02.z*r01.z); + X_FLOAT a22 = X_F(2.0) * (invmass0+invmass2) * + (s02.x*r02.x + s02.y*r02.y + s02.z*r02.z); + + // error check + + X_FLOAT determ = a11*a22 - a12*a21; + if (determ == X_F(0.0)) _flag[0]++; + X_FLOAT determinv = X_F(1.0)/determ; + + X_FLOAT a11inv = a22*determinv; + X_FLOAT a12inv = -a12*determinv; + X_FLOAT a21inv = -a21*determinv; + X_FLOAT a22inv = a11*determinv; + + // quadratic correction coeffs + + X_FLOAT r0102 = (r01.x*r02.x + r01.y*r02.y + r01.z*r02.z); + + X_FLOAT quad1_0101 = (invmass0+invmass1)*(invmass0+invmass1) * r01sq; + X_FLOAT quad1_0202 = invmass0*invmass0 * r02sq; + X_FLOAT quad1_0102 = X_F(2.0) * (invmass0+invmass1)*invmass0 * r0102; + + X_FLOAT quad2_0202 = (invmass0+invmass2)*(invmass0+invmass2) * r02sq; + X_FLOAT quad2_0101 = invmass0*invmass0 * r01sq; + X_FLOAT quad2_0102 = X_F(2.0) * (invmass0+invmass2)*invmass0 * r0102; + + // iterate until converged + + X_FLOAT lamda01 = 
// NOTE(review): "__any(!done)" below is the legacy mask-less warp vote,
// removed for compute capability 7.0+ (Volta); modern CUDA requires
// __any_sync(mask, ...). Kept as-is -- this patch targets sm_13/sm_2x (see
// Makefile.common in this diff). Also note 'done' is reset to 0 on any
// unconverged lamda and is otherwise 1, so "done<2" is always true and the
// lamda updates below are effectively unconditional -- presumably intentional
// (all lanes run the full loop together), but worth confirming upstream.
X_F(0.0); + X_FLOAT lamda02 = X_F(0.0); + int niter = 0; + int done = 0; + + X_FLOAT quad1,quad2,b1,b2,lamda01_new,lamda02_new; + +//maybe all running full loop? + while (__any(!done) && niter < _max_iter) { + quad1 = quad1_0101 * lamda01*lamda01 + quad1_0202 * lamda02*lamda02 + + quad1_0102 * lamda01*lamda02; + quad2 = quad2_0101 * lamda01*lamda01 + quad2_0202 * lamda02*lamda02 + + quad2_0102 * lamda01*lamda02; + + b1 = bond1*bond1 - s01sq - quad1; + b2 = bond2*bond2 - s02sq - quad2; + + lamda01_new = a11inv*b1 + a12inv*b2; + lamda02_new = a21inv*b1 + a22inv*b2; + + done++; + done = (fabs(lamda01_new-lamda01) > _tolerance)?0: done; + done = (fabs(lamda02_new-lamda02) > _tolerance)?0: done; + + + lamda01 = done<2?lamda01_new:lamda01; + lamda02 = done<2?lamda02_new:lamda02; + niter++; + } + // update forces if atom is owned by this processor + + lamda01 *= X_F(1.0)/_dtfsq; + lamda02 *= X_F(1.0)/_dtfsq; + + + //attenion: are shake clusters <-> atom unique? + nlist = 0; + if (i0 < _nlocal) { + _f[i0] += lamda01*r01.x + lamda02*r02.x; + _f[i0+_nmax] += lamda01*r01.y + lamda02*r02.y; + _f[i0+2*_nmax] += lamda01*r01.z + lamda02*r02.z; + list[nlist++] = i0; + } + + if (i1 < _nlocal) { + _f[i1] -= lamda01*r01.x; + _f[i1+_nmax] -= lamda01*r01.y; + _f[i1+2*_nmax] -= lamda01*r01.z; + list[nlist++] = i1; + } + + if (i2 < _nlocal) { + _f[i2] -= lamda02*r02.x; + _f[i2+_nmax] -= lamda02*r02.y; + _f[i2+2*_nmax] -= lamda02*r02.z; + list[nlist++] = i2; + } + + if (vflag||vflag_atom) { + ENERGY_FLOAT* shared = &sharedmem[threadIdx.x]; + X_FLOAT factor=X_F(2.0)/X_F(3.0)*nlist; + v[0] = lamda01*r01.x*r01.x + lamda02*r02.x*r02.x; *shared = factor*v[0]; shared+=blockDim.x; //times 2.0 since the reducing function is the same as in force calculations, which adds a factor 0.5 + v[1] = lamda01*r01.y*r01.y + lamda02*r02.y*r02.y; *shared = factor*v[1]; shared+=blockDim.x; + v[2] = lamda01*r01.z*r01.z + lamda02*r02.z*r02.z; *shared = factor*v[2]; shared+=blockDim.x; + v[3] = lamda01*r01.x*r01.y 
r01.y*r01.y + r01.z*r01.z; + X_FLOAT r02sq = r02.x*r02.x + r02.y*r02.y + r02.z*r02.z; + X_FLOAT r03sq = r03.x*r03.x + r03.y*r03.y + r03.z*r03.z; + X_FLOAT s01sq = s01.x*s01.x + s01.y*s01.y + s01.z*s01.z; + X_FLOAT s02sq = s02.x*s02.x + s02.y*s02.y + s02.z*s02.z; + X_FLOAT s03sq = s03.x*s03.x + s03.y*s03.y + s03.z*s03.z; + + // a,b,c = coeffs in quadratic equation for lamda + + if (_rmass_flag) { + invmass0 = X_F(1.0)/_rmass[i0]; + invmass1 = X_F(1.0)/_rmass[i1]; + invmass2 = X_F(1.0)/_rmass[i2]; + invmass3 = X_F(1.0)/_rmass[i3]; + } else { + invmass0 = X_F(1.0)/_mass[static_cast (x_i0.w)]; + invmass1 = X_F(1.0)/_mass[static_cast (x_i1.w)]; + invmass2 = X_F(1.0)/_mass[static_cast (x_i2.w)]; + invmass3 = X_F(1.0)/_mass[static_cast (x_i3.w)]; + } + + X_FLOAT a11 = X_F(2.0) * (invmass0+invmass1) * + (s01.x*r01.x + s01.y*r01.y + s01.z*r01.z); + X_FLOAT a12 = X_F(2.0) * invmass0 * + (s01.x*r02.x + s01.y*r02.y + s01.z*r02.z); + X_FLOAT a13 = X_F(2.0) * invmass0 * + (s01.x*r03.x + s01.y*r03.y + s01.z*r03.z); + X_FLOAT a21 = X_F(2.0) * invmass0 * + (s02.x*r01.x + s02.y*r01.y + s02.z*r01.z); + X_FLOAT a22 = X_F(2.0) * (invmass0+invmass2) * + (s02.x*r02.x + s02.y*r02.y + s02.z*r02.z); + X_FLOAT a23 = X_F(2.0) * (invmass0) * + (s02.x*r03.x + s02.y*r03.y + s02.z*r03.z); + X_FLOAT a31 = X_F(2.0) * (invmass0) * + (s03.x*r01.x + s03.y*r01.y + s03.z*r01.z); + X_FLOAT a32 = X_F(2.0) * (invmass0) * + (s03.x*r02.x + s03.y*r02.y + s03.z*r02.z); + X_FLOAT a33 = X_F(2.0) * (invmass0+invmass3) * + (s03.x*r03.x + s03.y*r03.y + s03.z*r03.z); + + // error check + + X_FLOAT determ = a11*a22*a33 + a12*a23*a31 + a13*a21*a32 - + a11*a23*a32 - a12*a21*a33 - a13*a22*a31; + if (determ == X_F(0.0)) _flag[0]++; + X_FLOAT determinv = X_F(1.0)/determ; + + X_FLOAT a11inv = determinv * (a22*a33 - a23*a32); + X_FLOAT a12inv = -determinv * (a12*a33 - a13*a32); + X_FLOAT a13inv = determinv * (a12*a23 - a13*a22); + X_FLOAT a21inv = -determinv * (a21*a33 - a23*a31); + X_FLOAT a22inv = determinv * (a11*a33 - 
a13*a31); + X_FLOAT a23inv = -determinv * (a11*a23 - a13*a21); + X_FLOAT a31inv = determinv * (a21*a32 - a22*a31); + X_FLOAT a32inv = -determinv * (a11*a32 - a12*a31); + X_FLOAT a33inv = determinv * (a11*a22 - a12*a21); + + // quadratic correction coeffs + + X_FLOAT r0102 = (r01.x*r02.x + r01.y*r02.y + r01.z*r02.z); + X_FLOAT r0103 = (r01.x*r03.x + r01.y*r03.y + r01.z*r03.z); + X_FLOAT r0203 = (r02.x*r03.x + r02.y*r03.y + r02.z*r03.z); + + X_FLOAT quad1_0101 = (invmass0+invmass1)*(invmass0+invmass1) * r01sq; + X_FLOAT quad1_0202 = invmass0*invmass0 * r02sq; + X_FLOAT quad1_0303 = invmass0*invmass0 * r03sq; + X_FLOAT quad1_0102 = X_F(2.0) * (invmass0+invmass1)*invmass0 * r0102; + X_FLOAT quad1_0103 = X_F(2.0) * (invmass0+invmass1)*invmass0 * r0103; + X_FLOAT quad1_0203 = X_F(2.0) * invmass0*invmass0 * r0203; + + X_FLOAT quad2_0101 = invmass0*invmass0 * r01sq; + X_FLOAT quad2_0202 = (invmass0+invmass2)*(invmass0+invmass2) * r02sq; + X_FLOAT quad2_0303 = invmass0*invmass0 * r03sq; + X_FLOAT quad2_0102 = X_F(2.0) * (invmass0+invmass2)*invmass0 * r0102; + X_FLOAT quad2_0103 = X_F(2.0) * invmass0*invmass0 * r0103; + X_FLOAT quad2_0203 = X_F(2.0) * (invmass0+invmass2)*invmass0 * r0203; + + X_FLOAT quad3_0101 = invmass0*invmass0 * r01sq; + X_FLOAT quad3_0202 = invmass0*invmass0 * r02sq; + X_FLOAT quad3_0303 = (invmass0+invmass3)*(invmass0+invmass3) * r03sq; + X_FLOAT quad3_0102 = X_F(2.0) * invmass0*invmass0 * r0102; + X_FLOAT quad3_0103 = X_F(2.0) * (invmass0+invmass3)*invmass0 * r0103; + X_FLOAT quad3_0203 = X_F(2.0) * (invmass0+invmass3)*invmass0 * r0203; + // iterate until converged + + X_FLOAT lamda01 = X_F(0.0); + X_FLOAT lamda02 = X_F(0.0); + X_FLOAT lamda03 = X_F(0.0); + int niter = 0; + int done = 0; + + X_FLOAT quad1,quad2,quad3,b1,b2,b3,lamda01_new,lamda02_new,lamda03_new; + +//maybe all running full loop? 
+ while (__any(!done) && niter < _max_iter) { + quad1 = quad1_0101 * lamda01*lamda01 + + quad1_0202 * lamda02*lamda02 + + quad1_0303 * lamda03*lamda03 + + quad1_0102 * lamda01*lamda02 + + quad1_0103 * lamda01*lamda03 + + quad1_0203 * lamda02*lamda03; + + quad2 = quad2_0101 * lamda01*lamda01 + + quad2_0202 * lamda02*lamda02 + + quad2_0303 * lamda03*lamda03 + + quad2_0102 * lamda01*lamda02 + + quad2_0103 * lamda01*lamda03 + + quad2_0203 * lamda02*lamda03; + + quad3 = quad3_0101 * lamda01*lamda01 + + quad3_0202 * lamda02*lamda02 + + quad3_0303 * lamda03*lamda03 + + quad3_0102 * lamda01*lamda02 + + quad3_0103 * lamda01*lamda03 + + quad3_0203 * lamda02*lamda03; + + b1 = bond1*bond1 - s01sq - quad1; + b2 = bond2*bond2 - s02sq - quad2; + b3 = bond3*bond3 - s03sq - quad3; + + lamda01_new = a11inv*b1 + a12inv*b2 + a13inv*b3; + lamda02_new = a21inv*b1 + a22inv*b2 + a23inv*b3; + lamda03_new = a31inv*b1 + a32inv*b2 + a33inv*b3; + + done++; + done = (fabs(lamda01_new-lamda01) > _tolerance)? 0:done; + done = (fabs(lamda02_new-lamda02) > _tolerance)? 0:done; + done = (fabs(lamda03_new-lamda03) > _tolerance)? 0:done; + + lamda01 = done<2?lamda01_new:lamda01; + lamda02 = done<2?lamda02_new:lamda02; + lamda03 = done<2?lamda03_new:lamda03; + niter++; + } + // update forces if atom is owned by this processor + + lamda01 *= X_F(1.0)/_dtfsq; + lamda02 *= X_F(1.0)/_dtfsq; + lamda03 *= X_F(1.0)/_dtfsq; + + + //attenion: are shake clusters <-> atom unique? 
+ nlist = 0; + if (i0 < _nlocal) { + _f[i0] += lamda01*r01.x + lamda02*r02.x + lamda03*r03.x; + _f[i0+_nmax] += lamda01*r01.y + lamda02*r02.y + lamda03*r03.y; + _f[i0+2*_nmax] += lamda01*r01.z + lamda02*r02.z + lamda03*r03.z; + list[nlist++] = i0; + } + + if (i1 < _nlocal) { + _f[i1] -= lamda01*r01.x; + _f[i1+_nmax] -= lamda01*r01.y; + _f[i1+2*_nmax] -= lamda01*r01.z; + list[nlist++] = i1; + } + + if (i2 < _nlocal) { + _f[i2] -= lamda02*r02.x; + _f[i2+_nmax] -= lamda02*r02.y; + _f[i2+2*_nmax] -= lamda02*r02.z; + list[nlist++] = i2; + } + + if (i3 < _nlocal) { + _f[i3] -= lamda03*r03.x; + _f[i3+_nmax] -= lamda03*r03.y; + _f[i3+2*_nmax] -= lamda03*r03.z; + list[nlist++] = i3; + } + + if (vflag||vflag_atom) { + ENERGY_FLOAT* shared = &sharedmem[threadIdx.x]; + X_FLOAT factor=X_F(2.0)/X_F(4.0)*nlist; + v[0] = lamda01*r01.x*r01.x + lamda02*r02.x*r02.x + lamda03*r03.x*r03.x; *shared = factor*v[0]; shared+=blockDim.x; //times 2.0 since the reducing function is the same as in force calculations, which adds a factor 0.5 + v[1] = lamda01*r01.y*r01.y + lamda02*r02.y*r02.y + lamda03*r03.y*r03.y; *shared = factor*v[1]; shared+=blockDim.x; + v[2] = lamda01*r01.z*r01.z + lamda02*r02.z*r02.z + lamda03*r03.z*r03.z; *shared = factor*v[2]; shared+=blockDim.x; + v[3] = lamda01*r01.x*r01.y + lamda02*r02.x*r02.y + lamda03*r03.x*r03.y; *shared = factor*v[3]; shared+=blockDim.x; + v[4] = lamda01*r01.x*r01.z + lamda02*r02.x*r02.z + lamda03*r03.x*r03.z; *shared = factor*v[4]; shared+=blockDim.x; + v[5] = lamda01*r01.y*r01.z + lamda02*r02.y*r02.z + lamda03*r03.y*r03.z; *shared = factor*v[5]; shared+=blockDim.x; + + v_tally(vflag,vflag_atom,nlist,list,4.0,v); + } +} + +__device__ void FixShakeCuda_Shake3Angle(int& vflag,int& vflag_atom,int& m) +{ + int nlist,list[3]; + ENERGY_FLOAT v[6]; + X_FLOAT invmass0,invmass1,invmass2; + + // local atom IDs and constraint distances + + int i0 = _map_array[_shake_atom[m]]; + int i1 = _map_array[_shake_atom[m+_nmax]]; + int i2 = 
// Shake3Angle: a 3-atom cluster with two bond constraints (0-1, 0-2) plus an
// angle constraint expressed as the fixed 1-2 distance bond12 taken from
// _angle_distance. Builds the full 3x3 system in (lamda01, lamda02, lamda12)
// with its analytic inverse and quadratic correction coefficients.
// NOTE(review): this definition continues past the end of this chunk -- the
// iteration loop, force update, and virial tally are not visible here.
_map_array[_shake_atom[m+2*_nmax]]; + X_FLOAT bond1 = _bond_distance[_shake_type[m]]; + X_FLOAT bond2 = _bond_distance[_shake_type[m+_nmax]]; + X_FLOAT bond12 = _angle_distance[_shake_type[m+2*_nmax]]; + + // r01 = distance vec between atoms, with PBC + + X_FLOAT3 r01,r02,r12; + + X_FLOAT4 x_i0,x_i1,x_i2; + x_i0=fetchXType(i0); + x_i1=fetchXType(i1); + x_i2=fetchXType(i2); + + r01.x = x_i0.x - x_i1.x; + r01.y = x_i0.y - x_i1.y; + r01.z = x_i0.z - x_i1.z; + minimum_image(r01); + + r02.x = x_i0.x - x_i2.x; + r02.y = x_i0.y - x_i2.y; + r02.z = x_i0.z - x_i2.z; + minimum_image(r02); + + r12.x = x_i1.x - x_i2.x; + r12.y = x_i1.y - x_i2.y; + r12.z = x_i1.z - x_i2.z; + minimum_image(r12); + + // s01 = distance vec after unconstrained update, with PBC + + X_FLOAT3 s01,s02,s12; + X_FLOAT3 xs_i0=_xshake[i0]; + X_FLOAT3 xs_i1=_xshake[i1]; + X_FLOAT3 xs_i2=_xshake[i2]; + + s01.x = xs_i0.x - xs_i1.x; + s01.y = xs_i0.y - xs_i1.y; + s01.z = xs_i0.z - xs_i1.z; + minimum_image(s01); + + s02.x = xs_i0.x - xs_i2.x; + s02.y = xs_i0.y - xs_i2.y; + s02.z = xs_i0.z - xs_i2.z; + minimum_image(s02); + + s12.x = xs_i1.x - xs_i2.x; + s12.y = xs_i1.y - xs_i2.y; + s12.z = xs_i1.z - xs_i2.z; + minimum_image(s12); + + // scalar distances between atoms + + X_FLOAT r01sq = r01.x*r01.x + r01.y*r01.y + r01.z*r01.z; + X_FLOAT r02sq = r02.x*r02.x + r02.y*r02.y + r02.z*r02.z; + X_FLOAT r12sq = r12.x*r12.x + r12.y*r12.y + r12.z*r12.z; + X_FLOAT s01sq = s01.x*s01.x + s01.y*s01.y + s01.z*s01.z; + X_FLOAT s02sq = s02.x*s02.x + s02.y*s02.y + s02.z*s02.z; + X_FLOAT s12sq = s12.x*s12.x + s12.y*s12.y + s12.z*s12.z; + + // a,b,c = coeffs in quadratic equation for lamda + + if (_rmass_flag) { + invmass0 = X_F(1.0)/_rmass[i0]; + invmass1 = X_F(1.0)/_rmass[i1]; + invmass2 = X_F(1.0)/_rmass[i2]; + } else { + invmass0 = X_F(1.0)/_mass[static_cast (x_i0.w)]; + invmass1 = X_F(1.0)/_mass[static_cast (x_i1.w)]; + invmass2 = X_F(1.0)/_mass[static_cast (x_i2.w)]; + } + + X_FLOAT a11 = X_F(2.0) * (invmass0+invmass1) * + 
(s01.x*r01.x + s01.y*r01.y + s01.z*r01.z); + X_FLOAT a12 = X_F(2.0) * invmass0 * + (s01.x*r02.x + s01.y*r02.y + s01.z*r02.z); + X_FLOAT a13 = - X_F(2.0) * invmass1 * + (s01.x*r12.x + s01.y*r12.y + s01.z*r12.z); + X_FLOAT a21 = X_F(2.0) * invmass0 * + (s02.x*r01.x + s02.y*r01.y + s02.z*r01.z); + X_FLOAT a22 = X_F(2.0) * (invmass0+invmass2) * + (s02.x*r02.x + s02.y*r02.y + s02.z*r02.z); + X_FLOAT a23 = X_F(2.0) * invmass2 * + (s02.x*r12.x + s02.y*r12.y + s02.z*r12.z); + X_FLOAT a31 = - X_F(2.0) * invmass1 * + (s12.x*r01.x + s12.y*r01.y + s12.z*r01.z); + X_FLOAT a32 = X_F(2.0) * invmass2 * + (s12.x*r02.x + s12.y*r02.y + s12.z*r02.z); + X_FLOAT a33 = X_F(2.0) * (invmass1+invmass2) * + (s12.x*r12.x + s12.y*r12.y + s12.z*r12.z); + + // inverse of matrix + + X_FLOAT determ = a11*a22*a33 + a12*a23*a31 + a13*a21*a32 - + a11*a23*a32 - a12*a21*a33 - a13*a22*a31; + if (determ == X_F(0.0)) _flag[0]++; + X_FLOAT determinv = X_F(1.0)/determ; + + X_FLOAT a11inv = determinv * (a22*a33 - a23*a32); + X_FLOAT a12inv = -determinv * (a12*a33 - a13*a32); + X_FLOAT a13inv = determinv * (a12*a23 - a13*a22); + X_FLOAT a21inv = -determinv * (a21*a33 - a23*a31); + X_FLOAT a22inv = determinv * (a11*a33 - a13*a31); + X_FLOAT a23inv = -determinv * (a11*a23 - a13*a21); + X_FLOAT a31inv = determinv * (a21*a32 - a22*a31); + X_FLOAT a32inv = -determinv * (a11*a32 - a12*a31); + X_FLOAT a33inv = determinv * (a11*a22 - a12*a21); + + // quadratic correction coeffs + + X_FLOAT r0102 = (r01.x*r02.x + r01.y*r02.y + r01.z*r02.z); + X_FLOAT r0112 = (r01.x*r12.x + r01.y*r12.y + r01.z*r12.z); + X_FLOAT r0212 = (r02.x*r12.x + r02.y*r12.y + r02.z*r12.z); + + X_FLOAT quad1_0101 = (invmass0+invmass1)*(invmass0+invmass1) * r01sq; + X_FLOAT quad1_0202 = invmass0*invmass0 * r02sq; + X_FLOAT quad1_1212 = invmass1*invmass1 * r12sq; + X_FLOAT quad1_0102 = X_F(2.0) * (invmass0+invmass1)*invmass0 * r0102; + X_FLOAT quad1_0112 = - X_F(2.0) * (invmass0+invmass1)*invmass1 * r0112; + X_FLOAT quad1_0212 = - X_F(2.0) * 
invmass0*invmass1 * r0212; + + X_FLOAT quad2_0101 = invmass0*invmass0 * r01sq; + X_FLOAT quad2_0202 = (invmass0+invmass2)*(invmass0+invmass2) * r02sq; + X_FLOAT quad2_1212 = invmass2*invmass2 * r12sq; + X_FLOAT quad2_0102 = X_F(2.0) * (invmass0+invmass2)*invmass0 * r0102; + X_FLOAT quad2_0112 = X_F(2.0) * invmass0*invmass2 * r0112; + X_FLOAT quad2_0212 = X_F(2.0) * (invmass0+invmass2)*invmass2 * r0212; + + X_FLOAT quad3_0101 = invmass1*invmass1 * r01sq; + X_FLOAT quad3_0202 = invmass2*invmass2 * r02sq; + X_FLOAT quad3_1212 = (invmass1+invmass2)*(invmass1+invmass2) * r12sq; + X_FLOAT quad3_0102 = - X_F(2.0) * invmass1*invmass2 * r0102; + X_FLOAT quad3_0112 = - X_F(2.0) * (invmass1+invmass2)*invmass1 * r0112; + X_FLOAT quad3_0212 = X_F(2.0) * (invmass1+invmass2)*invmass2 * r0212; + // iterate until converged + + X_FLOAT lamda01 = X_F(0.0); + X_FLOAT lamda02 = X_F(0.0); + X_FLOAT lamda12 = X_F(0.0); + int niter = 0; + int done = 0; + + X_FLOAT quad1,quad2,quad3,b1,b2,b3,lamda01_new,lamda02_new,lamda12_new; + +//maybe all running full loop? 
+ while (__any(!done) && niter < _max_iter) { + quad1 = quad1_0101 * lamda01*lamda01 + + quad1_0202 * lamda02*lamda02 + + quad1_1212 * lamda12*lamda12 + + quad1_0102 * lamda01*lamda02 + + quad1_0112 * lamda01*lamda12 + + quad1_0212 * lamda02*lamda12; + + quad2 = quad2_0101 * lamda01*lamda01 + + quad2_0202 * lamda02*lamda02 + + quad2_1212 * lamda12*lamda12 + + quad2_0102 * lamda01*lamda02 + + quad2_0112 * lamda01*lamda12 + + quad2_0212 * lamda02*lamda12; + + quad3 = quad3_0101 * lamda01*lamda01 + + quad3_0202 * lamda02*lamda02 + + quad3_1212 * lamda12*lamda12 + + quad3_0102 * lamda01*lamda02 + + quad3_0112 * lamda01*lamda12 + + quad3_0212 * lamda02*lamda12; + + b1 = bond1*bond1 - s01sq - quad1; + b2 = bond2*bond2 - s02sq - quad2; + b3 = bond12*bond12 - s12sq - quad3; + + lamda01_new = a11inv*b1 + a12inv*b2 + a13inv*b3; + lamda02_new = a21inv*b1 + a22inv*b2 + a23inv*b3; + lamda12_new = a31inv*b1 + a32inv*b2 + a33inv*b3; + + done++; + done = (fabs(lamda01_new-lamda01) > _tolerance)?0: done; + done = (fabs(lamda02_new-lamda02) > _tolerance)?0: done; + done = (fabs(lamda12_new-lamda12) > _tolerance)?0: done; + + lamda01 = done<2?lamda01_new:lamda01; + lamda02 = done<2?lamda02_new:lamda02; + lamda12 = done<2?lamda12_new:lamda12; + niter++; + } + // update forces if atom is owned by this processor + + lamda01 *= X_F(1.0)/_dtfsq; + lamda02 *= X_F(1.0)/_dtfsq; + lamda12 *= X_F(1.0)/_dtfsq; + + + //attenion: are shake clusters <-> atom unique? 
+ nlist = 0; + if (i0 < _nlocal) { + _f[i0] += lamda01*r01.x + lamda02*r02.x; + _f[i0+_nmax] += lamda01*r01.y + lamda02*r02.y; + _f[i0+2*_nmax] += lamda01*r01.z + lamda02*r02.z; + list[nlist++] = i0; + } + + if (i1 < _nlocal) { + _f[i1] -= lamda01*r01.x - lamda12*r12.x; + _f[i1+_nmax] -= lamda01*r01.y - lamda12*r12.y; + _f[i1+2*_nmax] -= lamda01*r01.z - lamda12*r12.z; + list[nlist++] = i1; + } + + if (i2 < _nlocal) { + _f[i2] -= lamda02*r02.x + lamda12*r12.x; + _f[i2+_nmax] -= lamda02*r02.y + lamda12*r12.y; + _f[i2+2*_nmax] -= lamda02*r02.z + lamda12*r12.z; + list[nlist++] = i2; + } + + if (vflag||vflag_atom) { + ENERGY_FLOAT* shared = &sharedmem[threadIdx.x]; + X_FLOAT factor=X_F(2.0)/X_F(3.0)*nlist; + v[0] = lamda01*r01.x*r01.x + lamda02*r02.x*r02.x + lamda12*r12.x*r12.x; *shared = factor*v[0]; shared+=blockDim.x; //times 2.0 since the reducing function is the same as in force calculations, which adds a factor 0.5 + v[1] = lamda01*r01.y*r01.y + lamda02*r02.y*r02.y + lamda12*r12.y*r12.y; *shared = factor*v[1]; shared+=blockDim.x; + v[2] = lamda01*r01.z*r01.z + lamda02*r02.z*r02.z + lamda12*r12.z*r12.z; *shared = factor*v[2]; shared+=blockDim.x; + v[3] = lamda01*r01.x*r01.y + lamda02*r02.x*r02.y + lamda12*r12.x*r12.y; *shared = factor*v[3]; shared+=blockDim.x; + v[4] = lamda01*r01.x*r01.z + lamda02*r02.x*r02.z + lamda12*r12.x*r12.z; *shared = factor*v[4]; shared+=blockDim.x; + v[5] = lamda01*r01.y*r01.z + lamda02*r02.y*r02.z + lamda12*r12.y*r12.z; *shared = factor*v[5]; shared+=blockDim.x; + + v_tally(vflag,vflag_atom,nlist,list,3.0,v); + } +} + +__global__ void FixShakeCuda_Shake_Kernel(int vflag,int vflag_atom,int* list,int nlist) +{ + int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x; + if(i_nmax) _flag[0]=1; + X_FLOAT3 xs=_xshake[j]; + ((X_FLOAT*) _buffer)[i]=xs.x + dx; + ((X_FLOAT*) _buffer)[i+1*n] = xs.y + dy; + ((X_FLOAT*) _buffer)[i+2*n] = xs.z + dz; + } + +} + +__global__ void FixShakeCuda_PackComm_Self_Kernel(int* sendlist,int n,int 
maxlistlength,int iswap,X_FLOAT dx,X_FLOAT dy,X_FLOAT dz,int first) +{ + int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x; + int* list=sendlist+iswap*maxlistlength; + if(i_nmax) _flag[0]=1; + X_FLOAT3 xs=_xshake[j]; + xs.x += dx; + xs.y += dy; + xs.z += dz; + _xshake[i+first]=xs; + } + +} + +__global__ void FixShakeCuda_UnpackComm_Kernel(int n,int first) +{ + int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x; + if(i +#define MY_PREFIX fix_temp_berendsen_cuda +#include "cuda_shared.h" +#include "cuda_common.h" +#include "crm_cuda_utils.cu" + +#include "fix_temp_berendsen_cuda_cu.h" +#include "fix_temp_berendsen_cuda_kernel.cu" + + +void Cuda_FixTempBerendsenCuda_UpdateNmax(cuda_shared_data* sdata) +{ + cudaMemcpyToSymbol(MY_CONST(mask) , & sdata->atom.mask .dev_data, sizeof(int*) ); + cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); + cudaMemcpyToSymbol(MY_CONST(nmax) , & sdata->atom.nmax , sizeof(int) ); + cudaMemcpyToSymbol(MY_CONST(v) , & sdata->atom.v .dev_data, sizeof(X_FLOAT*) ); +} + +void Cuda_FixTempBerendsenCuda_Init(cuda_shared_data* sdata) +{ + Cuda_FixTempBerendsenCuda_UpdateNmax(sdata); + +} + + +void Cuda_FixTempBerendsenCuda_EndOfStep(cuda_shared_data* sdata, int groupbit,double afactor) +{ + V_FLOAT factor=afactor; + if(sdata->atom.update_nmax) + Cuda_FixTempBerendsenCuda_UpdateNmax(sdata); + if(sdata->atom.update_nlocal) + cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); + + int3 layout=getgrid(sdata->atom.nlocal); + dim3 threads(layout.z, 1, 1); + dim3 grid(layout.x, layout.y, 1); + + Cuda_FixTempBerendsenCuda_EndOfStep_Kernel<<>> (groupbit,factor); + cudaThreadSynchronize(); + CUT_CHECK_ERROR("Cuda_Cuda_FixTempBerendsenCuda_PostForce: fix add_force post_force compute Kernel execution failed"); +} diff --git a/lib/cuda/fix_temp_berendsen_cuda_cu.h b/lib/cuda/fix_temp_berendsen_cuda_cu.h new file mode 100644 index 0000000000..fd64f98e42 --- /dev/null +++ 
b/lib/cuda/fix_temp_berendsen_cuda_cu.h @@ -0,0 +1,27 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#include "cuda_shared.h" + +extern "C" void Cuda_FixTempBerendsenCuda_Init(cuda_shared_data* sdata); +extern "C" void Cuda_FixTempBerendsenCuda_EndOfStep(cuda_shared_data* sdata, int groupbit,double afactor); diff --git a/lib/cuda/fix_temp_berendsen_cuda_kernel.cu b/lib/cuda/fix_temp_berendsen_cuda_kernel.cu new file mode 100644 index 0000000000..716cbeac1e --- /dev/null +++ b/lib/cuda/fix_temp_berendsen_cuda_kernel.cu @@ -0,0 +1,36 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. 
+ + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + + + +__global__ void Cuda_FixTempBerendsenCuda_EndOfStep_Kernel(int groupbit,V_FLOAT factor) +{ + int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x; + if(i < _nlocal) + if (_mask[i] & groupbit) { + _v[i]*=factor; + _v[i+_nmax]*=factor; + _v[i+2*_nmax]*=factor; + } +} + diff --git a/lib/cuda/fix_temp_rescale_cuda.cu b/lib/cuda/fix_temp_rescale_cuda.cu new file mode 100644 index 0000000000..6ca0942970 --- /dev/null +++ b/lib/cuda/fix_temp_rescale_cuda.cu @@ -0,0 +1,64 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ + +#include +#define MY_PREFIX fix_temp_rescale_cuda +#include "cuda_shared.h" +#include "cuda_common.h" +#include "crm_cuda_utils.cu" + +#include "fix_temp_rescale_cuda_cu.h" +#include "fix_temp_rescale_cuda_kernel.cu" + + +void Cuda_FixTempRescaleCuda_UpdateNmax(cuda_shared_data* sdata) +{ + cudaMemcpyToSymbol(MY_CONST(mask) , & sdata->atom.mask .dev_data, sizeof(int*) ); + cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); + cudaMemcpyToSymbol(MY_CONST(nmax) , & sdata->atom.nmax , sizeof(int) ); + cudaMemcpyToSymbol(MY_CONST(v) , & sdata->atom.v .dev_data, sizeof(X_FLOAT*) ); +} + +void Cuda_FixTempRescaleCuda_Init(cuda_shared_data* sdata) +{ + Cuda_FixTempRescaleCuda_UpdateNmax(sdata); + +} + + +void Cuda_FixTempRescaleCuda_EndOfStep(cuda_shared_data* sdata, int groupbit,double afactor) +{ + V_FLOAT factor=afactor; + //if(sdata->atom.update_nmax) //fix temp rescale is usually not called every timestep so it might miss an update step + Cuda_FixTempRescaleCuda_UpdateNmax(sdata); + //if(sdata->atom.update_nlocal) + //cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); + + int3 layout=getgrid(sdata->atom.nlocal); + dim3 threads(layout.z, 1, 1); + dim3 grid(layout.x, layout.y, 1); + + Cuda_FixTempRescaleCuda_EndOfStep_Kernel<<>> (groupbit,factor); + cudaThreadSynchronize(); + CUT_CHECK_ERROR("Cuda_Cuda_FixTempRescaleCuda_PostForce: fix add_force post_force compute Kernel execution failed"); +} diff --git a/lib/cuda/fix_temp_rescale_cuda_cu.h b/lib/cuda/fix_temp_rescale_cuda_cu.h new file mode 100644 index 0000000000..689b51a603 --- /dev/null +++ b/lib/cuda/fix_temp_rescale_cuda_cu.h @@ -0,0 +1,27 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, 
sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#include "cuda_shared.h" + +extern "C" void Cuda_FixTempRescaleCuda_Init(cuda_shared_data* sdata); +extern "C" void Cuda_FixTempRescaleCuda_EndOfStep(cuda_shared_data* sdata, int groupbit,double afactor); diff --git a/lib/cuda/fix_temp_rescale_cuda_kernel.cu b/lib/cuda/fix_temp_rescale_cuda_kernel.cu new file mode 100644 index 0000000000..19d04a5156 --- /dev/null +++ b/lib/cuda/fix_temp_rescale_cuda_kernel.cu @@ -0,0 +1,36 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ + + + +__global__ void Cuda_FixTempRescaleCuda_EndOfStep_Kernel(int groupbit,V_FLOAT factor) +{ + int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x; + if(i < _nlocal) + if (_mask[i] & groupbit) { + _v[i]*=factor; + _v[i+_nmax]*=factor; + _v[i+2*_nmax]*=factor; + } +} + diff --git a/lib/cuda/fix_temp_rescale_limit_cuda.cu b/lib/cuda/fix_temp_rescale_limit_cuda.cu new file mode 100644 index 0000000000..5e2c43e932 --- /dev/null +++ b/lib/cuda/fix_temp_rescale_limit_cuda.cu @@ -0,0 +1,64 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ + +#include +#define MY_PREFIX fix_temp_rescale_limit_cuda +#include "cuda_shared.h" +#include "cuda_common.h" +#include "crm_cuda_utils.cu" + +#include "fix_temp_rescale_limit_cuda_cu.h" +#include "fix_temp_rescale_limit_cuda_kernel.cu" + + +void Cuda_FixTempRescaleLimitCuda_UpdateNmax(cuda_shared_data* sdata) +{ + cudaMemcpyToSymbol(MY_CONST(mask) , & sdata->atom.mask .dev_data, sizeof(int*) ); + cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); + cudaMemcpyToSymbol(MY_CONST(nmax) , & sdata->atom.nmax , sizeof(int) ); + cudaMemcpyToSymbol(MY_CONST(v) , & sdata->atom.v .dev_data, sizeof(X_FLOAT*) ); +} + +void Cuda_FixTempRescaleLimitCuda_Init(cuda_shared_data* sdata) +{ + Cuda_FixTempRescaleLimitCuda_UpdateNmax(sdata); + +} + + +void Cuda_FixTempRescaleLimitCuda_EndOfStep(cuda_shared_data* sdata, int groupbit,double afactor,double limit) +{ + V_FLOAT factor=afactor; + //if(sdata->atom.update_nmax) //fix temp rescale is usually not called every timestep so it might miss an update step + Cuda_FixTempRescaleLimitCuda_UpdateNmax(sdata); + //if(sdata->atom.update_nlocal) + //cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); + + int3 layout=getgrid(sdata->atom.nlocal); + dim3 threads(layout.z, 1, 1); + dim3 grid(layout.x, layout.y, 1); + + Cuda_FixTempRescaleLimitCuda_EndOfStep_Kernel<<>> (groupbit,factor,limit); + cudaThreadSynchronize(); + CUT_CHECK_ERROR("Cuda_Cuda_FixTempRescaleLimitCuda_PostForce: fix add_force post_force compute Kernel execution failed"); +} diff --git a/lib/cuda/fix_temp_rescale_limit_cuda_cu.h b/lib/cuda/fix_temp_rescale_limit_cuda_cu.h new file mode 100644 index 0000000000..117bca28d8 --- /dev/null +++ b/lib/cuda/fix_temp_rescale_limit_cuda_cu.h @@ -0,0 +1,27 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original 
Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#include "cuda_shared.h" + +extern "C" void Cuda_FixTempRescaleLimitCuda_Init(cuda_shared_data* sdata); +extern "C" void Cuda_FixTempRescaleLimitCuda_EndOfStep(cuda_shared_data* sdata, int groupbit,double afactor,double limit); diff --git a/lib/cuda/fix_temp_rescale_limit_cuda_kernel.cu b/lib/cuda/fix_temp_rescale_limit_cuda_kernel.cu new file mode 100644 index 0000000000..a6cf446677 --- /dev/null +++ b/lib/cuda/fix_temp_rescale_limit_cuda_kernel.cu @@ -0,0 +1,43 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ + + + +__global__ void Cuda_FixTempRescaleLimitCuda_EndOfStep_Kernel(int groupbit,V_FLOAT factor,V_FLOAT limit) +{ + int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x; + if(i < _nlocal) + if (_mask[i] & groupbit) { + V_FLOAT vx = _v[i]; + V_FLOAT vy = _v[i+_nmax]; + V_FLOAT vz = _v[i+2*_nmax]; + vx*=factor; + vy*=factor; + vz*=factor; + + _v[i]=vx>0?min(vx,limit):max(vx,-limit); + _v[i+_nmax]=vy>0?min(vy,limit):max(vy,-limit); + _v[i+2*_nmax]=vz>0?min(vz,limit):max(vz,-limit); + } +} + diff --git a/lib/cuda/fix_viscous_cuda.cu b/lib/cuda/fix_viscous_cuda.cu new file mode 100644 index 0000000000..3406115e58 --- /dev/null +++ b/lib/cuda/fix_viscous_cuda.cu @@ -0,0 +1,66 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ + +#include +#define MY_PREFIX fix_viscous_cuda +#include "cuda_shared.h" +#include "cuda_common.h" +#include "crm_cuda_utils.cu" + +#include "fix_viscous_cuda_cu.h" +#include "fix_viscous_cuda_kernel.cu" + +void Cuda_FixViscousCuda_UpdateNmax(cuda_shared_data* sdata) +{ + cudaMemcpyToSymbol(MY_CONST(mask) , & sdata->atom.mask .dev_data, sizeof(int*) ); + cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); + cudaMemcpyToSymbol(MY_CONST(nmax) , & sdata->atom.nmax , sizeof(int) ); + cudaMemcpyToSymbol(MY_CONST(v) , & sdata->atom.x .dev_data, sizeof(X_FLOAT*) ); + cudaMemcpyToSymbol(MY_CONST(f) , & sdata->atom.f .dev_data, sizeof(F_FLOAT*) ); + cudaMemcpyToSymbol(MY_CONST(type) , & sdata->atom.type .dev_data, sizeof(int*) ); +} + +void Cuda_FixViscousCuda_Init(cuda_shared_data* sdata) +{ + Cuda_FixViscousCuda_UpdateNmax(sdata); + +} + + +void Cuda_FixViscousCuda_PostForce(cuda_shared_data* sdata, int groupbit,void* gamma) +{ + if(sdata->atom.update_nmax) + Cuda_FixViscousCuda_UpdateNmax(sdata); + if(sdata->atom.update_nlocal) + cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); + + + int3 layout=getgrid(sdata->atom.nlocal,0); + dim3 threads(layout.z, 1, 1); + dim3 grid(layout.x, layout.y, 1); + + Cuda_FixViscousCuda_PostForce_Kernel<<>> (groupbit,(F_FLOAT*) gamma); + cudaThreadSynchronize(); + CUT_CHECK_ERROR("Cuda_Cuda_FixViscousCuda_PostForce: Kernel execution failed"); + +} diff --git a/lib/cuda/fix_viscous_cuda_cu.h b/lib/cuda/fix_viscous_cuda_cu.h new file mode 100644 index 0000000000..b785a598a8 --- /dev/null +++ b/lib/cuda/fix_viscous_cuda_cu.h @@ -0,0 +1,27 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README 
file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#include "cuda_shared.h" + +extern "C" void Cuda_FixViscousCuda_Init(cuda_shared_data* sdata); +extern "C" void Cuda_FixViscousCuda_PostForce(cuda_shared_data* sdata, int groupbit,void* gamma); diff --git a/lib/cuda/fix_viscous_cuda_kernel.cu b/lib/cuda/fix_viscous_cuda_kernel.cu new file mode 100644 index 0000000000..2cd225bbd1 --- /dev/null +++ b/lib/cuda/fix_viscous_cuda_kernel.cu @@ -0,0 +1,35 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ + +__global__ void Cuda_FixViscousCuda_PostForce_Kernel(int groupbit,F_FLOAT* gamma) +{ + int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x; + + if(i < _nlocal) + if (_mask[i] & groupbit) { + F_FLOAT drag = gamma[_type[i]]; + _f[i] -= drag*_v[i]; + _f[i+1*_nmax] -= drag*_v[i+1*_nmax]; + _f[i+2*_nmax] -= drag*_v[i+2*_nmax]; + } +} diff --git a/lib/cuda/neighbor.cu b/lib/cuda/neighbor.cu new file mode 100644 index 0000000000..a01d5b6ba9 --- /dev/null +++ b/lib/cuda/neighbor.cu @@ -0,0 +1,367 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ + +#include +#include +#define MY_PREFIX neighbor +#define IncludeCommonNeigh +#include "cuda_shared.h" +#include "cuda_common.h" +#include "crm_cuda_utils.cu" +#include "cuda_wrapper_cu.h" + +#define _cutneighsq MY_AP(cutneighsq) +#define _ex_type MY_AP(ex_type) +#define _nex_type MY_AP(nex_type) +#define _ex1_bit MY_AP(ex1_bit) +#define _ex2_bit MY_AP(ex2_bit) +#define _nex_group MY_AP(nex_group) +#define _ex_mol_bit MY_AP(ex_mol_bit) +#define _nex_mol MY_AP(nex_mol) +__device__ __constant__ CUDA_FLOAT* _cutneighsq; +__device__ __constant__ int* _ex_type; +__device__ __constant__ int _nex_type; +__device__ __constant__ int* _ex1_bit; +__device__ __constant__ int* _ex2_bit; +__device__ __constant__ int _nex_group; +__device__ __constant__ int* _ex_mol_bit; +__device__ __constant__ int _nex_mol; + +#include "neighbor_cu.h" +#include "neighbor_kernel.cu" + +void Cuda_Neighbor_UpdateBuffer(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist) +{ + CUT_CHECK_ERROR("Cuda_PairLJCutCuda: before updateBuffer failed"); + + int size=(unsigned)(sizeof(int)*20+sneighlist->bin_dim[0]*sneighlist->bin_dim[1]*sneighlist->bin_dim[2]*(sizeof(int)+sneighlist->bin_nmax*3*sizeof(CUDA_FLOAT))); + if(sdata->buffersizebuffer,sdata->buffersize);) + if(sdata->buffer!=NULL) CudaWrapper_FreeCudaData(sdata->buffer,sdata->buffersize); + sdata->buffer=CudaWrapper_AllocCudaData(size); + sdata->buffersize=size; + sdata->buffer_new++; + MYDBG(printf("New buffer at %p with %i kB\n",sdata->buffer,sdata->buffersize);) + } + cudaMemcpyToSymbol(MY_CONST(buffer), & sdata->buffer, sizeof(int*) ); + CUT_CHECK_ERROR("Cuda_PairLJCutCuda: updateBuffer failed"); +} + +int Cuda_BinAtoms(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist) +{ + if(sdata->buffer_new) + Cuda_Neighbor_UpdateBuffer(sdata,sneighlist); + + // initialize only on first call + CUDA_FLOAT rez_bin_size[3] = + { + (1.0 * sneighlist->bin_dim[0]-4.0) / 
(sdata->domain.subhi[0] - sdata->domain.sublo[0]), + (1.0 * sneighlist->bin_dim[1]-4.0) / (sdata->domain.subhi[1] - sdata->domain.sublo[1]), + (1.0 * sneighlist->bin_dim[2]-4.0) / (sdata->domain.subhi[2] - sdata->domain.sublo[2]) + }; + + short init = 0; + if(! init) + { + init = 0; + cudaMemcpyToSymbol(MY_CONST(x) , & sdata->atom.x .dev_data, sizeof(X_FLOAT*) ); + cudaMemcpyToSymbol(MY_CONST(nall) , & sdata->atom.nall , sizeof(unsigned) ); + cudaMemcpyToSymbol(MY_CONST(nmax) , & sdata->atom.nmax , sizeof(unsigned) ); + cudaMemcpyToSymbol(MY_CONST(sublo) , sdata->domain.sublo , sizeof(X_FLOAT)*3); + } + + + int3 layout = getgrid(sdata->atom.nall); // sneighlist->inum + dim3 threads(layout.z, 1, 1); + dim3 grid(layout.x, layout.y, 1); + + timespec starttime,endtime; + clock_gettime(CLOCK_REALTIME,&starttime); + + cudaMemset((int*) (sdata->buffer),0,sizeof(int)*(20+(sneighlist->bin_dim[0])*(sneighlist->bin_dim[1])*(sneighlist->bin_dim[2]))+3*sizeof(CUDA_FLOAT)*(sneighlist->bin_dim[0])*(sneighlist->bin_dim[1])*(sneighlist->bin_dim[2])*(sneighlist->bin_nmax)); + + Binning_Kernel<<>> (sneighlist->binned_id,sneighlist->bin_nmax,sneighlist->bin_dim[0],sneighlist->bin_dim[1],sneighlist->bin_dim[2],rez_bin_size[0],rez_bin_size[1],rez_bin_size[2]); + cudaThreadSynchronize(); + + clock_gettime(CLOCK_REALTIME,&endtime); + sdata->cuda_timings.neigh_bin+= + endtime.tv_sec-starttime.tv_sec+1.0*(endtime.tv_nsec-starttime.tv_nsec)/1000000000; + + + int binning_error; + cudaMemcpy((void*) &binning_error,(void*) sdata->buffer,1*sizeof(int),cudaMemcpyDeviceToHost); + if(binning_error) + { + sneighlist->bin_extraspace+=0.05; + } + else + { + MYDBG(printf("CUDA: binning successful\n");) + } + CUT_CHECK_ERROR("Cuda_Binning: binning Kernel execution failed"); + return binning_error; +} + +int Cuda_NeighborBuildFullBin(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist) +{ + //Cuda_Neighbor_UpdateBuffer(sdata,sneighlist); + CUDA_FLOAT globcutoff=-1.0; + + short init=0; + if(! 
init) + { + init = 1; + + // !! LAMMPS indexes atom types starting with 1 !! + + unsigned cuda_ntypes = sdata->atom.ntypes + 1; + + unsigned nx = sizeof(CUDA_FLOAT) * cuda_ntypes * cuda_ntypes; + + CUDA_FLOAT* acutneighsq = (CUDA_FLOAT*) malloc(nx); + //printf("Allocate: %i\n",nx); + sneighlist->cu_cutneighsq = (CUDA_FLOAT*) CudaWrapper_AllocCudaData(nx); + + if(sneighlist->cutneighsq) + { + int cutoffsdiffer=0; + double cutoff0 = sneighlist->cutneighsq[1][1]; + for(int i=1; i<=sdata->atom.ntypes; ++i) + { + for(int j=1; j<=sdata->atom.ntypes; ++j) + { + acutneighsq[i * cuda_ntypes + j] = (CUDA_FLOAT) (sneighlist->cutneighsq[i][j]); + if((sneighlist->cutneighsq[i][j]-cutoff0)*(sneighlist->cutneighsq[i][j]-cutoff0)>1e-6) cutoffsdiffer++; + } + } + if(not cutoffsdiffer) globcutoff=(CUDA_FLOAT) cutoff0; + } + else + { + MYEMUDBG( printf("# CUDA: Cuda_NeighborBuild: cutneighsq == NULL\n"); ) + return 0; + } + + int size = 100; + if(sdata->buffersize < size) + { + MYDBG( printf("Cuda_NeighborBuild Resizing Buffer at %p with %i kB to\n", sdata->buffer, sdata->buffersize); ) + CudaWrapper_FreeCudaData(sdata->buffer,sdata->buffersize); + sdata->buffer = CudaWrapper_AllocCudaData(size); + sdata->buffersize = size; + sdata->buffer_new++; + MYDBG( printf("New buffer at %p with %i kB\n", sdata->buffer, sdata->buffersize); ) + } + + CudaWrapper_UploadCudaData(acutneighsq,sneighlist->cu_cutneighsq,nx); + cudaMemcpyToSymbol(MY_CONST(cutneighsq) , &sneighlist->cu_cutneighsq , sizeof(CUDA_FLOAT*) ); + + cudaMemcpyToSymbol(MY_CONST(cuda_ntypes) , & cuda_ntypes , sizeof(unsigned) ); + cudaMemcpyToSymbol(MY_CONST(special_flag) , sdata->atom.special_flag , 4*sizeof(int) ); + cudaMemcpyToSymbol(MY_CONST(molecular) , & sdata->atom.molecular , sizeof(int) ); + } + + cudaMemcpyToSymbol(MY_CONST(neighbor_maxlocal), & sneighlist->firstneigh.dim[0] , sizeof(unsigned) ); + //cudaMemcpyToSymbol(MY_CONST(firstneigh) , & sneighlist->firstneigh.dev_data, sizeof(int*) ); + 
cudaMemcpyToSymbol(MY_CONST(ilist) , & sneighlist->ilist .dev_data, sizeof(int*) ); + cudaMemcpyToSymbol(MY_CONST(inum) , & sneighlist->inum , sizeof(int) ); + cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); + cudaMemcpyToSymbol(MY_CONST(nall) , & sdata->atom.nall , sizeof(int) ); + cudaMemcpyToSymbol(MY_CONST(numneigh) , & sneighlist->numneigh .dev_data, sizeof(int*) ); + cudaMemcpyToSymbol(MY_CONST(type) , & sdata->atom.type .dev_data, sizeof(int*) ); + cudaMemcpyToSymbol(MY_CONST(mask) , & sdata->atom.mask .dev_data, sizeof(int*) ); + cudaMemcpyToSymbol(MY_CONST(tag) , & sdata->atom.tag .dev_data, sizeof(int*) ); + cudaMemcpyToSymbol(MY_CONST(special) , & sdata->atom.special .dev_data, sizeof(int*) ); + cudaMemcpyToSymbol(MY_CONST(maxspecial) , & sdata->atom.maxspecial , sizeof(int) ); + cudaMemcpyToSymbol(MY_CONST(nspecial) , & sdata->atom.nspecial .dev_data, sizeof(int*) ); + cudaMemcpyToSymbol(MY_CONST(maxneighbors) , & sneighlist->maxneighbors , sizeof(int) ); + cudaMemcpyToSymbol(MY_CONST(debugdata) , & sdata->debugdata , sizeof(int*) ); + cudaMemcpyToSymbol(MY_CONST(overlap_comm) , & sdata->overlap_comm, sizeof(int) ); + cudaMemcpyToSymbol(MY_CONST(neighbors) , & sneighlist->neighbors.dev_data, sizeof(int*)); + cudaMemcpyToSymbol(MY_CONST(ex_type) , & sneighlist->ex_type.dev_data, sizeof(int*)); + cudaMemcpyToSymbol(MY_CONST(ex1_bit) , & sneighlist->ex1_bit.dev_data, sizeof(int*)); + cudaMemcpyToSymbol(MY_CONST(ex2_bit) , & sneighlist->ex2_bit.dev_data, sizeof(int*)); + cudaMemcpyToSymbol(MY_CONST(ex_mol_bit) , & sneighlist->ex_mol_bit.dev_data, sizeof(int*)); + cudaMemcpyToSymbol(MY_CONST(nex_type) , & sneighlist->nex_type, sizeof(int) ); + cudaMemcpyToSymbol(MY_CONST(nex_group) , & sneighlist->nex_group, sizeof(int) ); + cudaMemcpyToSymbol(MY_CONST(nex_mol) , & sneighlist->nex_mol, sizeof(int) ); + + if(sdata->overlap_comm) + { + cudaMemcpyToSymbol(MY_CONST(numneigh_border) , & sneighlist->numneigh_border .dev_data, 
sizeof(int*)); + cudaMemcpyToSymbol(MY_CONST(numneigh_inner) , & sneighlist->numneigh_inner .dev_data, sizeof(int*)); + cudaMemcpyToSymbol(MY_CONST(neighbors_border) , & sneighlist->neighbors_border.dev_data, sizeof(int*)); + cudaMemcpyToSymbol(MY_CONST(neighbors_inner) , & sneighlist->neighbors_inner .dev_data, sizeof(int*)); + cudaMemcpyToSymbol(MY_CONST(ilist_border) , & sneighlist->ilist_border .dev_data, sizeof(int*)); + cudaMemcpyToSymbol(MY_CONST(inum_border) , & sneighlist->inum_border .dev_data, sizeof(int*) ); + } + + //dim3 threads(sneighlist->bin_nmax,1,1); + dim3 threads(MIN(128,sneighlist->bin_nmax),1,1); + dim3 grid(sneighlist->bin_dim[0]*sneighlist->bin_dim[1],sneighlist->bin_dim[2],1); + + //printf("Configuration: %i %i %i %i %i\n",grid.x,grid.y,threads.x,(sizeof(int)+3*sizeof(X_FLOAT))*threads.x,sneighlist->bin_nmax); + int buffer[20]; + buffer[0]=1; + buffer[1]=0; + CudaWrapper_UploadCudaData( buffer, sdata->buffer, 2*sizeof(int)); + CUT_CHECK_ERROR("Cuda_NeighborBuild: pre neighbor build kernel error"); + //cudaMemset(sdata->debugdata,0,100*sizeof(int)); + unsigned int shared_size=(sizeof(int)+3*sizeof(CUDA_FLOAT))*threads.x; + MYDBG(printf("Configuration: %i %i %i %u %i\n",grid.x,grid.y,threads.x,shared_size,sneighlist->bin_nmax);) + //shared_size=2056; + timespec starttime,endtime; + clock_gettime(CLOCK_REALTIME,&starttime); + //for(int i=0;i<100;i++) + { + if(sdata->overlap_comm) + NeighborBuildFullBin_OverlapComm_Kernel<<>> + (sneighlist->binned_id,sneighlist->bin_nmax,sneighlist->bin_dim[0],sneighlist->bin_dim[1],globcutoff,sdata->pair.use_block_per_atom); + else + { + int exclude=sneighlist->nex_mol|sneighlist->nex_group|sneighlist->nex_type; + if(exclude) + NeighborBuildFullBin_Kernel<1><<>> + (sneighlist->binned_id,sneighlist->bin_nmax,sneighlist->bin_dim[0],sneighlist->bin_dim[1],globcutoff,sdata->pair.use_block_per_atom); + else + NeighborBuildFullBin_Kernel<0><<>> + 
(sneighlist->binned_id,sneighlist->bin_nmax,sneighlist->bin_dim[0],sneighlist->bin_dim[1],globcutoff,sdata->pair.use_block_per_atom); + } + //NeighborBuildFullBin_Kernel_Restrict<<>> + // (sneighlist->binned_id,sneighlist->bin_nmax,sneighlist->bin_dim[0],sneighlist->bin_dim[1],globcutoff); + + cudaThreadSynchronize(); + CUT_CHECK_ERROR("Cuda_NeighborBuild: neighbor build kernel execution failed"); + clock_gettime(CLOCK_REALTIME,&endtime); + sdata->cuda_timings.neigh_build+= + endtime.tv_sec-starttime.tv_sec+1.0*(endtime.tv_nsec-starttime.tv_nsec)/1000000000; + //dim3 threads,grid; + CudaWrapper_DownloadCudaData(buffer, sdata->buffer, sizeof(int)); + if(buffer[0]>=0&&true&&sdata->atom.molecular) + { + //printf("Find Special: %i %i\n",sneighlist->inum,sdata->atom.nall); + clock_gettime(CLOCK_REALTIME,&starttime); + int3 layout=getgrid(sdata->atom.nlocal,0,512); + threads.x = layout.z; threads.y = 1; threads.z = 1; + grid.x = layout.x; grid.y = layout.y; grid.z = 1; + FindSpecial<<>>(sdata->pair.use_block_per_atom); + cudaThreadSynchronize(); + CUT_CHECK_ERROR("Cuda_NeighborBuild: FindSpecial kernel execution failed"); + clock_gettime(CLOCK_REALTIME,&endtime); + sdata->cuda_timings.neigh_special+= + endtime.tv_sec-starttime.tv_sec+1.0*(endtime.tv_nsec-starttime.tv_nsec)/1000000000; + } + } + //printf("Neightime: %lf\n",sdata->cuda_timings.test1); + CUT_CHECK_ERROR("Cuda_NeighborBuild: neighbor build kernel execution failed"); + + //CudaWrapper_DownloadCudaData(buffer, sneighlist->numneigh_border .dev_data, sizeof(int)); + + MYDBG(printf("Cuda_NeighborBuildFullBin build neighbor list ... end\n");) + return buffer[0]; +} + +int Cuda_NeighborBuildFullNsq(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist) +{ + MYDBG(printf("Cuda_NeighborBuildFullNsq build neighbor list ... start\n");) + // initialize only on first call + /*static*/ short init=0; + if(! init) + { + init = 1; + + // !! LAMMPS indexes atom types starting with 1 !! 
+ + unsigned cuda_ntypes = sdata->atom.ntypes + 1; + if(cuda_ntypes*cuda_ntypes > CUDA_MAX_TYPES2) + printf("# CUDA: Cuda_PairLJCutCuda_Init: you need %u types. this is more than %u " + "(assumed at compile time). re-compile with -DCUDA_MAX_TYPES_PLUS_ONE=32 " + "or ajust this in cuda_common.h\n", cuda_ntypes, CUDA_MAX_TYPES2); + + unsigned nx = sizeof(CUDA_FLOAT) * cuda_ntypes * cuda_ntypes; + CUDA_FLOAT* acutneighsq = (CUDA_FLOAT*) malloc(nx); + + if(sneighlist->cutneighsq) + { + for(int i=1; i<=sdata->atom.ntypes; ++i) + { + for(int j=1; j<=sdata->atom.ntypes; ++j) + { + acutneighsq[i * cuda_ntypes + j] = (CUDA_FLOAT) (sneighlist->cutneighsq[i][j]); + //printf("CUTOFFS: %i %i %i %e\n",i,j,cuda_ntypes,acutneighsq[i * cuda_ntypes + j]); + } + } + } + else + { + MYEMUDBG( printf("# CUDA: Cuda_NeighborBuild: cutneighsq == NULL\n"); ) + return 0; + } + + int size = 100; + if(sdata->buffersize < size) + { + MYDBG( printf("Cuda_NeighborBuild Resizing Buffer at %p with %i kB to\n", sdata->buffer, sdata->buffersize); ) + CudaWrapper_FreeCudaData(sdata->buffer,sdata->buffersize); + sdata->buffer = CudaWrapper_AllocCudaData(size); + sdata->buffersize = size; + sdata->buffer_new++; + MYDBG( printf("New buffer at %p with %i kB\n", sdata->buffer, sdata->buffersize); ) + } + + cudaMemcpyToSymbol(MY_CONST(buffer) , & sdata->buffer , sizeof(int*) ); + cudaMemcpyToSymbol(MY_CONST(cuda_ntypes) , & cuda_ntypes , sizeof(unsigned) ); + cudaMemcpyToSymbol(MY_CONST(cutneighsq) , acutneighsq , nx ); + cudaMemcpyToSymbol(MY_CONST(neighbor_maxlocal), & sneighlist->firstneigh.dim[0] , sizeof(unsigned) ); + cudaMemcpyToSymbol(MY_CONST(firstneigh) , & sneighlist->firstneigh.dev_data, sizeof(int*) ); + cudaMemcpyToSymbol(MY_CONST(ilist) , & sneighlist->ilist .dev_data, sizeof(int*) ); + cudaMemcpyToSymbol(MY_CONST(inum) , & sneighlist->inum , sizeof(int) ); + cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); + cudaMemcpyToSymbol(MY_CONST(nall) , & sdata->atom.nall , 
sizeof(int) ); + cudaMemcpyToSymbol(MY_CONST(nmax) , & sdata->atom.nmax , sizeof(int) ); + cudaMemcpyToSymbol(MY_CONST(numneigh) , & sneighlist->numneigh .dev_data, sizeof(int*) ); + cudaMemcpyToSymbol(MY_CONST(type) , & sdata->atom.type .dev_data, sizeof(int*) ); + cudaMemcpyToSymbol(MY_CONST(x) , & sdata->atom.x .dev_data, sizeof(X_FLOAT*) ); + cudaMemcpyToSymbol(MY_CONST(maxneighbors) , & sneighlist->maxneighbors , sizeof(int) ); + + free(acutneighsq); + } + + int3 layout = getgrid(sdata->atom.nlocal); // sneighlist->inum + dim3 threads(layout.z, 1, 1); + dim3 grid(layout.x, layout.y, 1); + + int return_value = 1; + CudaWrapper_UploadCudaData(& return_value, sdata->buffer, sizeof(int)); + + CUT_CHECK_ERROR("Cuda_NeighborBuild: pre neighbor build kernel execution failed"); + NeighborBuildFullNsq_Kernel<<>> (); + cudaThreadSynchronize(); + CUT_CHECK_ERROR("Cuda_NeighborBuild: neighbor build kernel execution failed"); + + int buffer[20]; + CudaWrapper_DownloadCudaData(buffer, sdata->buffer, sizeof(int)*20); + MYDBG(printf("Cuda_NeighborBuildFullNSQ build neighbor list ... end\n");) + return return_value=buffer[0]; +} diff --git a/lib/cuda/neighbor_cu.h b/lib/cuda/neighbor_cu.h new file mode 100644 index 0000000000..6ca1440de0 --- /dev/null +++ b/lib/cuda/neighbor_cu.h @@ -0,0 +1,32 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. 
+ + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#ifndef NEIGHBOR_CU_H_ +#define NEIGHBOR_CU_H_ +#include "cuda_shared.h" + +extern "C" int Cuda_BinAtoms(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist); +extern "C" int Cuda_NeighborBuildFullBin(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist); +extern "C" int Cuda_NeighborBuildFullNsq(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist); + +#endif /*NEIGHBOR_CU_H_*/ diff --git a/lib/cuda/neighbor_kernel.cu b/lib/cuda/neighbor_kernel.cu new file mode 100644 index 0000000000..ad1a6a8fe7 --- /dev/null +++ b/lib/cuda/neighbor_kernel.cu @@ -0,0 +1,623 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ + +__global__ void Binning_Kernel(int* binned_id,int bin_nmax,int bin_dim_x,int bin_dim_y,int bin_dim_z, + CUDA_FLOAT rez_bin_size_x,CUDA_FLOAT rez_bin_size_y,CUDA_FLOAT rez_bin_size_z) +{ + int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; + + /*int* bin_count=(int*) _buffer; + bin_count=bin_count+20; + CUDA_FLOAT* binned_x=(CUDA_FLOAT*)(bin_count+bin_dim_x*bin_dim_y*bin_dim_z);*/ + CUDA_FLOAT* binned_x=(CUDA_FLOAT*) _buffer; + binned_x = &binned_x[2]; + int* bin_count=(int*) &binned_x[3*bin_dim_x*bin_dim_y*bin_dim_z*bin_nmax]; + if(i < _nall) + { + // copy atom position from global device memory to local register + // in this 3 steps to get as much coalesced access as possible + X_FLOAT* my_x = _x + i; + CUDA_FLOAT x_i = *my_x; my_x += _nmax; + CUDA_FLOAT y_i = *my_x; my_x += _nmax; + CUDA_FLOAT z_i = *my_x; + + + // calculate flat bin index + int bx=__float2int_rd(rez_bin_size_x * (x_i - _sublo[0]))+2; + int by=__float2int_rd(rez_bin_size_y * (y_i - _sublo[1]))+2; + int bz=__float2int_rd(rez_bin_size_z * (z_i - _sublo[2]))+2; + + bx-=bx*negativCUDA(1.0f*bx); + bx-=(bx-bin_dim_x+1)*negativCUDA(1.0f*bin_dim_x-1.0f-1.0f*bx); + by-=by*negativCUDA(1.0f*by); + by-=(by-bin_dim_y+1)*negativCUDA(1.0f*bin_dim_y-1.0f-1.0f*by); + bz-=bz*negativCUDA(1.0f*bz); + bz-=(bz-bin_dim_z+1)*negativCUDA(1.0f*bin_dim_z-1.0f-1.0f*bz); + + + const unsigned j = bin_dim_z * ( bin_dim_y *bx+by)+bz; + + // add new atom to bin, get bin-array position + const unsigned k = atomicAdd(& bin_count[j], 1); + if(k < bin_nmax) + { + binned_id [bin_nmax * j + k] = i; + binned_x [3 * bin_nmax * j + k] = x_i; + binned_x [3 * bin_nmax * j + k + bin_nmax] = y_i; + binned_x [3 * bin_nmax * j + k + 2*bin_nmax] = z_i; + } + else + { // normally, this should not happen: + int errorn=atomicAdd((int*) _buffer, 1); + MYEMUDBG( printf("# CUDA: Binning_Kernel: WARNING: atom %i ignored, no place left in bin %u\n", 
i, j); ) + } + } +} + + +__device__ inline int exclusion(int &i, int &j, int &itype, int &jtype) +{ + int m; + + if (_nex_type) + if( _ex_type[itype * _cuda_ntypes + jtype]) return 1; + + if (_nex_group) { + for (m = 0; m < _nex_group; m++) { + if (_mask[i] & _ex1_bit[m] && _mask[j] & _ex2_bit[m]) return 1; + if (_mask[i] & _ex2_bit[m] && _mask[j] & _ex1_bit[m]) return 1; + } + } + + if (_nex_mol) { + if(_molecule[i] == _molecule[j]) + for (m = 0; m < _nex_mol; m++) + if (_mask[i] & _ex_mol_bit[m] && _mask[j] & _ex_mol_bit[m] ) return 1; + } + + return 0; +} + +extern __shared__ CUDA_FLOAT shared[]; + +__device__ inline int find_special(int3 &n, int* list,int & tag,int3 flag) +{ + int k=n.z; + for (int l = 0; l < n.z; l++) k = ((list[l] == tag)?l:k); + + return k +__global__ void NeighborBuildFullBin_Kernel(int* binned_id,int bin_nmax,int bin_dim_x,int bin_dim_y,CUDA_FLOAT globcutoff,int block_style) +{ + //const bool domol=false; + int bin_dim_z=gridDim.y; + CUDA_FLOAT* binned_x=(CUDA_FLOAT*) _buffer; + binned_x = &binned_x[2]; + int* bin_count=(int*) &binned_x[3*bin_dim_x*bin_dim_y*bin_dim_z*bin_nmax]; + int bin = __mul24(gridDim.y,blockIdx.x)+blockIdx.y; + int bin_x = blockIdx.x/bin_dim_y; + int bin_y = blockIdx.x-bin_x*bin_dim_y; + int bin_z = blockIdx.y; + int bin_c = bin_count[bin]; + + + CUDA_FLOAT cut; + if(globcutoff>0) + cut = globcutoff; + + int i=_nall; + CUDA_FLOAT* my_x; + CUDA_FLOAT x_i,y_i,z_i; + + for(int actOffset=0; actOffset=bin_dim_x||obin_y>=bin_dim_y||obin_z>=bin_dim_z) continue; + int other_bin=bin_dim_z * ( bin_dim_y * obin_x + obin_y) + obin_z; + if(other_bin==bin) continue; + + int obin_c=bin_count[other_bin]; + + for(int otherActOffset=0; otherActOffset _maxneighbors) ((int*)_buffer)[0] = -jnum; + + if(i<_nlocal) + _numneigh[i] = jnum; + } +} + + +__global__ void FindSpecial(int block_style) +{ + int ii = (blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x; + int which; + int tag_mask=0; + int3 spec_flag; + + int3 mynspecial = 
{0,0,1}; + if(ii>=_nlocal) return; + int special_id[CUDA_MAX_NSPECIAL]; + + int i = _ilist[ii]; + if(i>=_nlocal) return; + int jnum = _numneigh[i]; + if (_special_flag[1] == 0) spec_flag.x = -1; + else if (_special_flag[1] == 1) spec_flag.x = 0; + else spec_flag.x = 1; + + if (_special_flag[2] == 0) spec_flag.y = -1; + else if (_special_flag[2] == 1) spec_flag.y = 0; + else spec_flag.y = 2; + + if (_special_flag[3] == 0) spec_flag.z = -1; + else if (_special_flag[3] == 1) spec_flag.z = 0; + else spec_flag.z = 3; + + mynspecial.x=_nspecial[i]; + mynspecial.y=_nspecial[i+_nmax]; + mynspecial.z=_nspecial[i+2*_nmax]; + + if(i<_nlocal) + { + int* list = &_special[i]; + for(int k=0;k0) + { + if(block_style) + _neighbors[i*_maxneighbors+k]=j+which*_nall; + else + _neighbors[i+k*_nlocal]=j+which*_nall; + } + else if(which<0) + { + if(block_style) + _neighbors[i*_maxneighbors+k]=_neighbors[i*_maxneighbors+jnum-1]; + else + _neighbors[i+k*_nlocal]=_neighbors[i+(jnum-1)*_nlocal]; + jnum--; + k--; + } + } + } + _numneigh[i]=jnum; +} + +__global__ void NeighborBuildFullBin_OverlapComm_Kernel(int* binned_id,int bin_nmax,int bin_dim_x,int bin_dim_y,CUDA_FLOAT globcutoff,int block_style) +{ + int bin_dim_z=gridDim.y; + CUDA_FLOAT* binned_x=(CUDA_FLOAT*) _buffer; + binned_x = &binned_x[2]; + int* bin_count=(int*) &binned_x[3*bin_dim_x*bin_dim_y*bin_dim_z*bin_nmax]; + int bin = __mul24(gridDim.y,blockIdx.x)+blockIdx.y; + int bin_x = blockIdx.x/bin_dim_y; + int bin_y = blockIdx.x-bin_x*bin_dim_y; + int bin_z = blockIdx.y; + int bin_c = bin_count[bin]; + + + CUDA_FLOAT cut; + if(globcutoff>0) + cut = globcutoff; + + int i=_nall; + CUDA_FLOAT* my_x; + CUDA_FLOAT x_i,y_i,z_i; + + for(int actOffset=0; actOffset=_nlocal)&&(i_border<0)) + i_border=atomicAdd(_inum_border,1); + + if(jnum<_maxneighbors) + { + if(block_style) + { + _neighbors[i*_maxneighbors+jnum]= j; + if(j>=_nlocal) + {_neighbors_border[i_border*_maxneighbors+jnum_border]=j;} + else + 
{_neighbors_inner[i*_maxneighbors+jnum_inner]=j;} + } + else + { + _neighbors[i+jnum*_nlocal]=j; + if(j>=_nlocal) + {_neighbors_border[i_border+jnum_border*_nlocal]=j;} + else + {_neighbors_inner[i+jnum_inner*_nlocal]=j;} + } + } + ++jnum; + if(j>=_nlocal) + jnum_border++; + else + jnum_inner++; + } + } + } + __syncthreads(); + } + for(int obin_x=bin_x-1;obin_x=bin_dim_x||obin_y>=bin_dim_y||obin_z>=bin_dim_z) continue; + int other_bin=bin_dim_z * ( bin_dim_y * obin_x + obin_y) + obin_z; + if(other_bin==bin) continue; + + int obin_c=bin_count[other_bin]; + + for(int otherActOffset=0; otherActOffset=_nlocal)&&(i_border<0)) + i_border=atomicAdd(_inum_border,1); + if(jnum<_maxneighbors) + { + if(block_style) + { + _neighbors[i*_maxneighbors+jnum]= j; + if(j>=_nlocal) + {_neighbors_border[i_border*_maxneighbors+jnum_border]=j;} + else + {_neighbors_inner[i*_maxneighbors+jnum_inner]=j;} + } + else + { + _neighbors[i+jnum*_nlocal]=j; + if(j>=_nlocal) + {_neighbors_border[i_border+jnum_border*_nlocal]=j;} + else + {_neighbors_inner[i+jnum_inner*_nlocal]=j;} + } + } + ++jnum; + if(j>=_nlocal) + jnum_border++; + else + jnum_inner++; + } + } + } + __syncthreads(); + } + } + + if(jnum > _maxneighbors) ((int*)_buffer)[0] = -jnum; + + if(i<_nlocal) + { + _numneigh[i] = jnum; + _numneigh_inner[i] = jnum_inner; + if(i_border>=0) _numneigh_border[i_border] = jnum_border; + if(i_border>=0) _ilist_border[i_border] = i; + + } + } +} + +__global__ void NeighborBuildFullNsq_Kernel() +{ + int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; + int* buffer = (int*) _buffer; + + if(i < _nlocal) + { + X_FLOAT* my_x = _x + i; + CUDA_FLOAT x_i = *my_x; my_x += _nmax; + CUDA_FLOAT y_i = *my_x; my_x += _nmax; + CUDA_FLOAT z_i = *my_x; + int jnum = 0; + int* jlist = _firstneigh[i]; + _ilist[i]=i; + + int itype = _type[i]; + __syncthreads(); + for(int j = 0; j < _nall; ++j) + { + my_x = _x + j; + CUDA_FLOAT x_j = *my_x; my_x += _nmax; + CUDA_FLOAT y_j = *my_x; my_x += _nmax; + 
CUDA_FLOAT z_j = *my_x; + CUDA_FLOAT delx = x_i - x_j; + CUDA_FLOAT dely = y_i - y_j; + CUDA_FLOAT delz = z_i - z_j; + CUDA_FLOAT rsq = delx*delx + dely*dely + delz*delz; + int jtype = _type[j]; + if(rsq <= _cutneighsq[itype * _cuda_ntypes + jtype] && i != j) + { + if(jnum<_maxneighbors) + jlist[jnum] = j; + if(i==151) ((int*)_buffer)[jnum+2]=j; + ++jnum; + } + __syncthreads(); + } + if(jnum > _maxneighbors) buffer[0] = 0; + _numneigh[i] = jnum; + if(i==151) ((int*)_buffer)[1]=jnum; + } +} + diff --git a/lib/cuda/pair_born_coul_long_cuda.cu b/lib/cuda/pair_born_coul_long_cuda.cu new file mode 100644 index 0000000000..913d5eb2c5 --- /dev/null +++ b/lib/cuda/pair_born_coul_long_cuda.cu @@ -0,0 +1,78 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ + +#include + +#define _rhoinv MY_AP(coeff1) +#define _sigma MY_AP(coeff2) +#define _a MY_AP(coeff3) +#define _c MY_AP(coeff4) +#define _d MY_AP(coeff5) + +#include "pair_born_coul_long_cuda_cu.h" +#include "pair_born_coul_long_cuda_kernel_nc.cu" + +#include + +void Cuda_PairBornCoulLongCuda_Init(cuda_shared_data* sdata) +{ + Cuda_Pair_Init_AllStyles(sdata, 5,true); +} + +void Cuda_PairBornCoulLongCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,int eflag_atom,int vflag_atom) +{ + + + static short init=0; + if(! init) + { + init = 1; + Cuda_PairBornCoulLongCuda_Init(sdata); + } + + dim3 grid,threads; + int sharedperproc; + + + Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc,true,192); + + cudaStream_t* streams = (cudaStream_t*) CudaWrapper_returnStreams(); + + if(sdata->pair.use_block_per_atom) + Pair_Kernel_BpA + <<>> (eflag, vflag,eflag_atom,vflag_atom); + else + Pair_Kernel_TpA + <<>> (eflag, vflag,eflag_atom,vflag_atom); + + Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag); +} + + + +#undef _rhoinv +#undef _sigma +#undef _a +#undef _c +#undef _d + diff --git a/lib/cuda/pair_born_coul_long_cuda_cu.h b/lib/cuda/pair_born_coul_long_cuda_cu.h new file mode 100644 index 0000000000..e47968d0f9 --- /dev/null +++ b/lib/cuda/pair_born_coul_long_cuda_cu.h @@ -0,0 +1,30 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. 
+ + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#include "cuda_shared.h" + +#ifdef CUDA_USE_BINNING +extern "C" void Cuda_PairBornCoulLongCuda(cuda_shared_data* sdata, int eflag, int vflag); +#else +extern "C" void Cuda_PairBornCoulLongCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,int eflag_atom,int vflag_atom); +#endif diff --git a/lib/cuda/pair_born_coul_long_cuda_kernel_nc.cu b/lib/cuda/pair_born_coul_long_cuda_kernel_nc.cu new file mode 100644 index 0000000000..651326cb60 --- /dev/null +++ b/lib/cuda/pair_born_coul_long_cuda_kernel_nc.cu @@ -0,0 +1,34 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ +__device__ inline F_FLOAT PairBornCuda_Eval(const F_FLOAT& rsq,const int ij_type,F_FLOAT& factor_lj,int& eflag, ENERGY_FLOAT& evdwl) +{ + const F_FLOAT r2inv = F_F(1.0)/rsq; + const F_FLOAT r = _RSQRT_(r2inv); + const F_FLOAT r6inv = r2inv*r2inv*r2inv; + const F_FLOAT rexp = _EXP_((_sigma[ij_type]-r)*_rhoinv[ij_type]); + const F_FLOAT forceborn = _a[ij_type]*_rhoinv[ij_type]*r*rexp - + F_F(6.0)*_c[ij_type]*r6inv + F_F(8.0)*_d[ij_type]*r2inv*r6inv; + if(eflag) evdwl += factor_lj*(_a[ij_type]*rexp - _c[ij_type]*r6inv + +_d[ij_type]*r2inv*r6inv-_offset[ij_type]); + return factor_lj*forceborn*r2inv; +} diff --git a/lib/cuda/pair_buck_coul_cut_cuda.cu b/lib/cuda/pair_buck_coul_cut_cuda.cu new file mode 100644 index 0000000000..b20de75efb --- /dev/null +++ b/lib/cuda/pair_buck_coul_cut_cuda.cu @@ -0,0 +1,74 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ + +#include + +#define _rhoinv MY_AP(coeff1) +#define _buck1 MY_AP(coeff2) +#define _buck2 MY_AP(coeff3) +#define _a MY_AP(coeff4) +#define _c MY_AP(coeff5) + +#include "pair_buck_coul_cut_cuda_cu.h" + +#include +void Cuda_PairBuckCoulCutCuda_Init(cuda_shared_data* sdata) +{ + Cuda_Pair_Init_AllStyles(sdata, 5,true); +} + +void Cuda_PairBuckCoulCutCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,int eflag_atom,int vflag_atom) +{ + + + static short init=0; + if(! init) + { + init = 1; + Cuda_PairBuckCoulCutCuda_Init(sdata); + } + + dim3 grid,threads; + int sharedperproc; + + + Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc,true,192); + + cudaStream_t* streams = (cudaStream_t*) CudaWrapper_returnStreams(); + if(sdata->pair.use_block_per_atom) + Pair_Kernel_BpA + <<>> (eflag, vflag,eflag_atom,vflag_atom); + else + Pair_Kernel_TpA + <<>> (eflag, vflag,eflag_atom,vflag_atom); + + Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag); +} + + +#undef _rhoinv +#undef _buck1 +#undef _buck2 +#undef _a +#undef _c + diff --git a/lib/cuda/pair_buck_coul_cut_cuda_cu.h b/lib/cuda/pair_buck_coul_cut_cuda_cu.h new file mode 100644 index 0000000000..1a2576ccae --- /dev/null +++ b/lib/cuda/pair_buck_coul_cut_cuda_cu.h @@ -0,0 +1,30 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. 
+ + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#include "cuda_shared.h" + +#ifdef CUDA_USE_BINNING +extern "C" void Cuda_PairBuckCoulCutCuda(cuda_shared_data* sdata, int eflag, int vflag); +#else +extern "C" void Cuda_PairBuckCoulCutCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,int eflag_atom,int vflag_atom); +#endif diff --git a/lib/cuda/pair_buck_coul_long_cuda.cu b/lib/cuda/pair_buck_coul_long_cuda.cu new file mode 100644 index 0000000000..70e53edf08 --- /dev/null +++ b/lib/cuda/pair_buck_coul_long_cuda.cu @@ -0,0 +1,77 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ + +#include + +#define _rhoinv MY_AP(coeff1) +#define _buck1 MY_AP(coeff2) +#define _buck2 MY_AP(coeff3) +#define _a MY_AP(coeff4) +#define _c MY_AP(coeff5) + +#include "pair_buck_coul_long_cuda_cu.h" + +#include + +void Cuda_PairBuckCoulLongCuda_Init(cuda_shared_data* sdata) +{ + Cuda_Pair_Init_AllStyles(sdata, 5,true); +} + +void Cuda_PairBuckCoulLongCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,int eflag_atom,int vflag_atom) +{ + + + static short init=0; + if(! init) + { + init = 1; + Cuda_PairBuckCoulLongCuda_Init(sdata); + } + + dim3 grid,threads; + int sharedperproc; + + + Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc,true,192); + + cudaStream_t* streams = (cudaStream_t*) CudaWrapper_returnStreams(); + if(sdata->pair.use_block_per_atom) + Pair_Kernel_BpA + <<>> (eflag, vflag,eflag_atom,vflag_atom); + else + Pair_Kernel_TpA + <<>> (eflag, vflag,eflag_atom,vflag_atom); + + Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag); +} + + + + +#undef _rhoinv +#undef _buck1 +#undef _buck2 +#undef _a +#undef _c + diff --git a/lib/cuda/pair_buck_coul_long_cuda_cu.h b/lib/cuda/pair_buck_coul_long_cuda_cu.h new file mode 100644 index 0000000000..77cbb4c07f --- /dev/null +++ b/lib/cuda/pair_buck_coul_long_cuda_cu.h @@ -0,0 +1,30 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. 
+ + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#include "cuda_shared.h" + +#ifdef CUDA_USE_BINNING +extern "C" void Cuda_PairBuckCoulLongCuda(cuda_shared_data* sdata, int eflag, int vflag); +#else +extern "C" void Cuda_PairBuckCoulLongCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,int eflag_atom,int vflag_atom); +#endif diff --git a/lib/cuda/pair_buck_cuda.cu b/lib/cuda/pair_buck_cuda.cu new file mode 100644 index 0000000000..c14abc0067 --- /dev/null +++ b/lib/cuda/pair_buck_cuda.cu @@ -0,0 +1,76 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ + +#include + +#define _rhoinv MY_AP(coeff1) +#define _buck1 MY_AP(coeff2) +#define _buck2 MY_AP(coeff3) +#define _a MY_AP(coeff4) +#define _c MY_AP(coeff5) + +#include "pair_buck_cuda_cu.h" +#include "pair_buck_cuda_kernel_nc.cu" + +#include + +void Cuda_PairBuckCuda_Init(cuda_shared_data* sdata) +{ + Cuda_Pair_Init_AllStyles(sdata, 5); +} + +void Cuda_PairBuckCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,int eflag_atom,int vflag_atom) +{ + + + static short init=0; + if(! init) + { + init = 1; + Cuda_PairBuckCuda_Init(sdata); + } + + dim3 grid,threads; + int sharedperproc; + + + Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc,false,192); + + cudaStream_t* streams = (cudaStream_t*) CudaWrapper_returnStreams(); + if(sdata->pair.use_block_per_atom) + Pair_Kernel_BpA + <<>> (eflag, vflag,eflag_atom,vflag_atom); + else + Pair_Kernel_TpA + <<>> (eflag, vflag,eflag_atom,vflag_atom); + + Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag); +} + + +#undef _rhoinv +#undef _buck1 +#undef _buck2 +#undef _a +#undef _c + diff --git a/lib/cuda/pair_buck_cuda_cu.h b/lib/cuda/pair_buck_cuda_cu.h new file mode 100644 index 0000000000..92b6350d9f --- /dev/null +++ b/lib/cuda/pair_buck_cuda_cu.h @@ -0,0 +1,30 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. 
+ + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#include "cuda_shared.h" + +#ifdef CUDA_USE_BINNING +extern "C" void Cuda_PairBuckCuda(cuda_shared_data* sdata, int eflag, int vflag); +#else +extern "C" void Cuda_PairBuckCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,int eflag_atom,int vflag_atom); +#endif diff --git a/lib/cuda/pair_buck_cuda_kernel_nc.cu b/lib/cuda/pair_buck_cuda_kernel_nc.cu new file mode 100644 index 0000000000..3ec40a26f8 --- /dev/null +++ b/lib/cuda/pair_buck_cuda_kernel_nc.cu @@ -0,0 +1,33 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ +__device__ inline F_FLOAT PairBuckCuda_Eval(const F_FLOAT& rsq,const int ij_type,F_FLOAT& factor_lj,int& eflag, ENERGY_FLOAT& evdwl) +{ + const F_FLOAT r2inv = F_F(1.0)/rsq; + const F_FLOAT r6inv = r2inv*r2inv*r2inv; + const F_FLOAT r = _RSQRT_(r2inv); + const F_FLOAT rexp = _EXP_(-r*_rhoinv[ij_type]); + const F_FLOAT forcebuck = _buck1[ij_type]*r*rexp - _buck2[ij_type]*r6inv; + if(eflag) evdwl += factor_lj*(_a[ij_type]*rexp - _c[ij_type]*r6inv - + _offset[ij_type]); + return (factor_lj*forcebuck) * r2inv; +} diff --git a/lib/cuda/pair_cg_cmm_coul_cut_cuda.cu b/lib/cuda/pair_cg_cmm_coul_cut_cuda.cu new file mode 100644 index 0000000000..1f780674c1 --- /dev/null +++ b/lib/cuda/pair_cg_cmm_coul_cut_cuda.cu @@ -0,0 +1,80 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ + +#include + +#define _lj1 MY_AP(coeff1) +#define _lj2 MY_AP(coeff2) +#define _lj3 MY_AP(coeff3) +#define _lj4 MY_AP(coeff4) +#define _cg_type MY_AP(coeff5) + + +#include "pair_cg_cmm_coul_cut_cuda_cu.h" +#include + + + + +void Cuda_PairCGCMMCoulCutCuda_Init(cuda_shared_data* sdata) +{ + Cuda_Pair_Init_AllStyles(sdata, 5, true, false ); + +} + + + + +void Cuda_PairCGCMMCoulCutCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,int eflag_atom,int vflag_atom) +{ + + // initialize only on first call + static short init=0; + if(! init) + { + init = 1; + Cuda_PairCGCMMCoulCutCuda_Init(sdata); + } + + dim3 grid,threads; + int sharedperproc; + + Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc,true,128); + + cudaStream_t* streams = (cudaStream_t*) CudaWrapper_returnStreams(); + if(sdata->pair.use_block_per_atom) + Pair_Kernel_BpA + <<>> (eflag, vflag,eflag_atom,vflag_atom); + else + Pair_Kernel_TpA + <<>> (eflag, vflag,eflag_atom,vflag_atom); + + Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag); +} + +#undef _lj1 +#undef _lj2 +#undef _lj3 +#undef _lj4 +#undef _cg_type + diff --git a/lib/cuda/pair_cg_cmm_coul_cut_cuda_cu.h b/lib/cuda/pair_cg_cmm_coul_cut_cuda_cu.h new file mode 100644 index 0000000000..00eb4c983c --- /dev/null +++ b/lib/cuda/pair_cg_cmm_coul_cut_cuda_cu.h @@ -0,0 +1,26 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. 
+ + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#include "cuda_shared.h" + +extern "C" void Cuda_PairCGCMMCoulCutCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,int eflag_atom,int vflag_atom); diff --git a/lib/cuda/pair_cg_cmm_coul_debye_cuda.cu b/lib/cuda/pair_cg_cmm_coul_debye_cuda.cu new file mode 100644 index 0000000000..ead0fc9832 --- /dev/null +++ b/lib/cuda/pair_cg_cmm_coul_debye_cuda.cu @@ -0,0 +1,80 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ + +#include + +#define _lj1 MY_AP(coeff1) +#define _lj2 MY_AP(coeff2) +#define _lj3 MY_AP(coeff3) +#define _lj4 MY_AP(coeff4) +#define _cg_type MY_AP(coeff5) + + +#include "pair_cg_cmm_coul_debye_cuda_cu.h" +#include + + + + +void Cuda_PairCGCMMCoulDebyeCuda_Init(cuda_shared_data* sdata) +{ + Cuda_Pair_Init_AllStyles(sdata, 5, true, false ); + +} + + + + +void Cuda_PairCGCMMCoulDebyeCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,int eflag_atom,int vflag_atom) +{ + + // initialize only on first call + static short init=0; + if(! init) + { + init = 1; + Cuda_PairCGCMMCoulDebyeCuda_Init(sdata); + } + + dim3 grid,threads; + int sharedperproc; + + Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc,true,128); + + cudaStream_t* streams = (cudaStream_t*) CudaWrapper_returnStreams(); + if(sdata->pair.use_block_per_atom) + Pair_Kernel_BpA + <<>> (eflag, vflag,eflag_atom,vflag_atom); + else + Pair_Kernel_TpA + <<>> (eflag, vflag,eflag_atom,vflag_atom); + + Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag); +} + +#undef _lj1 +#undef _lj2 +#undef _lj3 +#undef _lj4 +#undef _cg_type + diff --git a/lib/cuda/pair_cg_cmm_coul_debye_cuda_cu.h b/lib/cuda/pair_cg_cmm_coul_debye_cuda_cu.h new file mode 100644 index 0000000000..5b8bab44c5 --- /dev/null +++ b/lib/cuda/pair_cg_cmm_coul_debye_cuda_cu.h @@ -0,0 +1,26 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. 
+ + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#include "cuda_shared.h" + +extern "C" void Cuda_PairCGCMMCoulDebyeCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,int eflag_atom,int vflag_atom); diff --git a/lib/cuda/pair_cg_cmm_coul_long_cuda.cu b/lib/cuda/pair_cg_cmm_coul_long_cuda.cu new file mode 100644 index 0000000000..dbdc2d2a12 --- /dev/null +++ b/lib/cuda/pair_cg_cmm_coul_long_cuda.cu @@ -0,0 +1,80 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ + +#include + +#define _lj1 MY_AP(coeff1) +#define _lj2 MY_AP(coeff2) +#define _lj3 MY_AP(coeff3) +#define _lj4 MY_AP(coeff4) +#define _cg_type MY_AP(coeff5) + + +#include "pair_cg_cmm_coul_long_cuda_cu.h" +#include + + + + +void Cuda_PairCGCMMCoulLongCuda_Init(cuda_shared_data* sdata) +{ + Cuda_Pair_Init_AllStyles(sdata, 5, true, false ); + +} + + + + +void Cuda_PairCGCMMCoulLongCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,int eflag_atom,int vflag_atom) +{ + + // initialize only on first call + static short init=0; + if(! init) + { + init = 1; + Cuda_PairCGCMMCoulLongCuda_Init(sdata); + } + + dim3 grid,threads; + int sharedperproc; + + Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc,true,128); + + cudaStream_t* streams = (cudaStream_t*) CudaWrapper_returnStreams(); + if(sdata->pair.use_block_per_atom) + Pair_Kernel_BpA + <<>> (eflag, vflag,eflag_atom,vflag_atom); + else + Pair_Kernel_TpA + <<>> (eflag, vflag,eflag_atom,vflag_atom); + + Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag); +} + +#undef _lj1 +#undef _lj2 +#undef _lj3 +#undef _lj4 +#undef _cg_type + diff --git a/lib/cuda/pair_cg_cmm_coul_long_cuda_cu.h b/lib/cuda/pair_cg_cmm_coul_long_cuda_cu.h new file mode 100644 index 0000000000..bed897d5d3 --- /dev/null +++ b/lib/cuda/pair_cg_cmm_coul_long_cuda_cu.h @@ -0,0 +1,26 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. 
+ + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#include "cuda_shared.h" + +extern "C" void Cuda_PairCGCMMCoulLongCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,int eflag_atom,int vflag_atom); diff --git a/lib/cuda/pair_cg_cmm_cuda.cu b/lib/cuda/pair_cg_cmm_cuda.cu new file mode 100644 index 0000000000..b4bb31e094 --- /dev/null +++ b/lib/cuda/pair_cg_cmm_cuda.cu @@ -0,0 +1,85 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ + +#include + +#define _lj1 MY_AP(coeff1) +#define _lj2 MY_AP(coeff2) +#define _lj3 MY_AP(coeff3) +#define _lj4 MY_AP(coeff4) +#define _cg_type MY_AP(coeff5) + + enum {CG_NOT_SET=0, CG_LJ9_6, CG_LJ12_4, CG_LJ12_6, NUM_CG_TYPES, + CG_COUL_NONE, CG_COUL_CUT, CG_COUL_DEBYE, CG_COUL_LONG}; + +#include "pair_cg_cmm_cuda_cu.h" +#include "pair_cg_cmm_cuda_kernel_nc.cu" +#include + + + + +void Cuda_PairCGCMMCuda_Init(cuda_shared_data* sdata) +{ + Cuda_Pair_Init_AllStyles(sdata, 5, false, false ); + +} + + + + +void Cuda_PairCGCMMCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,int eflag_atom,int vflag_atom) +{ + + // initialize only on first call + static short init=0; + if(! init) + { + init = 1; + Cuda_PairCGCMMCuda_Init(sdata); + } + + dim3 grid,threads; + int sharedperproc; + + int maxthreads=128; + + Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc,false,maxthreads); + + cudaStream_t* streams = (cudaStream_t*) CudaWrapper_returnStreams(); + if(sdata->pair.use_block_per_atom) + Pair_Kernel_BpA + <<>> (eflag, vflag,eflag_atom,vflag_atom); + else + Pair_Kernel_TpA + <<>> (eflag, vflag,eflag_atom,vflag_atom); + + Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag); +} + +#undef _lj1 +#undef _lj2 +#undef _lj3 +#undef _lj4 +#undef _cg_type + diff --git a/lib/cuda/pair_cg_cmm_cuda_cu.h b/lib/cuda/pair_cg_cmm_cuda_cu.h new file mode 100644 index 0000000000..da6d6075f0 --- /dev/null +++ b/lib/cuda/pair_cg_cmm_cuda_cu.h @@ -0,0 +1,26 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. 
+ + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#include "cuda_shared.h" + +extern "C" void Cuda_PairCGCMMCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,int eflag_atom,int vflag_atom); diff --git a/lib/cuda/pair_cg_cmm_cuda_kernel_nc.cu b/lib/cuda/pair_cg_cmm_cuda_kernel_nc.cu new file mode 100644 index 0000000000..dcaaab7955 --- /dev/null +++ b/lib/cuda/pair_cg_cmm_cuda_kernel_nc.cu @@ -0,0 +1,48 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ + +__device__ inline F_FLOAT PairCGCMMCuda_Eval(const F_FLOAT& rsq,const int ij_type,F_FLOAT& factor_lj,int& eflag, ENERGY_FLOAT& evdwl) //0.11 of 0.4 +{ + const F_FLOAT r2inv = F_F(1.0)/rsq; + const int cg_type = _cg_type[ij_type]; + const F_FLOAT r4inv = r2inv*r2inv; + const F_FLOAT rNinv_first = cg_type!=CG_LJ9_6?r4inv:_RSQRT_(rsq); + const F_FLOAT rNinv_second = cg_type!=CG_LJ12_4?-r2inv:-F_F(1.0); + const F_FLOAT forcelj = r4inv * (_lj1[ij_type]*r4inv*rNinv_first + _lj2[ij_type]*rNinv_second); + + if(eflag) evdwl += factor_lj*(r4inv*(_lj3[ij_type]*r4inv*rNinv_first+_lj4[ij_type]*rNinv_second) - _offset[ij_type]); + return factor_lj*forcelj*r2inv; +} + +/*__device__ inline F_FLOAT PairCGCMMCuda_Eval(const F_FLOAT& rsq,const int ij_type,F_FLOAT& factor_lj,int& eflag, ENERGY_FLOAT& evdwl) +{ + const int cg_type = tex1Dfetch(_coeff5_gm_tex,ij_type); + const F_FLOAT r2inv = F_F(1.0)/rsq; + const F_FLOAT r4inv = r2inv*r2inv; + const F_FLOAT rNinv_first = cg_type!=CG_LJ9_6?r4inv:_RSQRT_(rsq); + const F_FLOAT rNinv_second = cg_type!=CG_LJ12_4?r2inv:F_F(1.0); + const F_FLOAT forcelj = r4inv * (tex1Dfetch(_coeff1_gm_tex,ij_type)*r4inv*rNinv_first - tex1Dfetch(_coeff2_gm_tex,ij_type)*rNinv_second); + + if(eflag) evdwl += factor_lj*(r4inv*(tex1Dfetch(_coeff3_gm_tex,ij_type)*r4inv*rNinv_first-tex1Dfetch(_coeff4_gm_tex,ij_type)*rNinv_second)); + return factor_lj*forcelj*r2inv; +}*/ diff --git a/lib/cuda/pair_eam_cuda.cu b/lib/cuda/pair_eam_cuda.cu new file mode 100644 index 0000000000..29ad4af271 --- /dev/null +++ b/lib/cuda/pair_eam_cuda.cu @@ -0,0 +1,330 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. 
+ + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#include + +#define _type2frho MY_AP(coeff1) +#define _type2rhor MY_AP(coeff2) +#define _type2z2r MY_AP(coeff3) +#define _rdr MY_AP(rdr) +#define _rdrho MY_AP(rdrho) +#define _nr MY_AP(nr) +#define _nrho MY_AP(nrho) +#define _nfrho MY_AP(nfrho) +#define _nrhor MY_AP(nrhor) +#define _nz2r MY_AP(nz2r) +#define _frho_spline MY_AP(frho_spline) +#define _rhor_spline MY_AP(rhor_spline) +#define _z2r_spline MY_AP(z2r_spline) +#define _rho MY_AP(rho) +#define _fp MY_AP(fp) + +__device__ __constant__ F_FLOAT MY_AP(rdr); +__device__ __constant__ F_FLOAT MY_AP(rdrho); +__device__ __constant__ int MY_AP(nr); +__device__ __constant__ int MY_AP(nrho); +__device__ __constant__ int MY_AP(nfrho); +__device__ __constant__ int MY_AP(nrhor); +__device__ __constant__ int MY_AP(nz2r); +__device__ __constant__ F_FLOAT* MY_AP(frho_spline); +__device__ __constant__ F_FLOAT* MY_AP(rhor_spline); +__device__ __constant__ F_FLOAT* MY_AP(z2r_spline); +__device__ __constant__ F_FLOAT* MY_AP(rho); +__device__ __constant__ F_FLOAT* MY_AP(fp); + +#define _rhor_spline_tex MY_AP(rhor_spline_tex) +#if F_PRECISION == 1 +texture _rhor_spline_tex; +#else +texture _rhor_spline_tex; +#endif + + +#define _z2r_spline_tex MY_AP(z2r_spline_tex) +#if F_PRECISION == 1 +texture _z2r_spline_tex; +#else +texture _z2r_spline_tex; +#endif + + + +#include "pair_eam_cuda_cu.h" +#include "pair_eam_cuda_kernel_nc.cu" +#include + +int eam_buff_offset; +int rhor_spline_size; +void* 
rhor_spline_pointer; +int z2r_spline_size; +void* z2r_spline_pointer; + + +inline void BindEAMTextures(cuda_shared_data* sdata) +{ + _rhor_spline_tex.normalized = false; // access with normalized texture coordinates + _rhor_spline_tex.filterMode = cudaFilterModePoint; // Point mode, so no + _rhor_spline_tex.addressMode[0] = cudaAddressModeWrap; // wrap texture coordinates + + const textureReference* rhor_spline_texture_ptr; + cudaGetTextureReference(&rhor_spline_texture_ptr, MY_CONST(rhor_spline_tex)); + + #if F_PRECISION == 1 + cudaChannelFormatDesc channelDescRhor = cudaCreateChannelDesc(); + cudaBindTexture(0,rhor_spline_texture_ptr, rhor_spline_pointer, &channelDescRhor, rhor_spline_size); + #else + cudaChannelFormatDesc channelDescRhor = cudaCreateChannelDesc(); + cudaBindTexture(0,rhor_spline_texture_ptr, rhor_spline_pointer, &channelDescRhor, rhor_spline_size); + #endif + + _z2r_spline_tex.normalized = false; // access with normalized texture coordinates + _z2r_spline_tex.filterMode = cudaFilterModePoint; // Point mode, so no + _z2r_spline_tex.addressMode[0] = cudaAddressModeWrap; // wrap texture coordinates + + const textureReference* z2r_spline_texture_ptr; + cudaGetTextureReference(&z2r_spline_texture_ptr, MY_CONST(z2r_spline_tex)); + + #if F_PRECISION == 1 + cudaChannelFormatDesc channelDescZ2r = cudaCreateChannelDesc(); + cudaBindTexture(0,z2r_spline_texture_ptr, z2r_spline_pointer, &channelDescZ2r, z2r_spline_size); + #else + cudaChannelFormatDesc channelDescZ2r = cudaCreateChannelDesc(); + cudaBindTexture(0,z2r_spline_texture_ptr, z2r_spline_pointer, &channelDescZ2r, z2r_spline_size); + #endif + +} + +void Cuda_PairEAMCuda_UpdateBuffer(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist) +{ + CUT_CHECK_ERROR("Cuda_PairEAMCuda: before updateBuffer failed"); + int3 layout=getgrid(sneighlist->inum,7*sizeof(F_FLOAT)); + dim3 threads(layout.z, 1, 1); + dim3 grid(layout.x, layout.y, 1); + int size=(unsigned)(layout.y*layout.x)*7*sizeof(F_FLOAT); + 
if(sdata->buffersizebuffer,sdata->buffersize);) + if(sdata->buffer!=NULL) cudaFree(sdata->buffer); + cudaMalloc((void**)&sdata->buffer,size); + sdata->buffersize=size; + sdata->buffer_new++; + MYDBG(printf("New buffer at %p with %i kB\n",sdata->buffer,sdata->buffersize);) + } + cudaMemcpyToSymbol(MY_CONST(buffer), & sdata->buffer, sizeof(int*) ); + CUT_CHECK_ERROR("Cuda_PairEAMCuda: updateBuffer failed"); +} + +void Cuda_PairEAMCuda_UpdateNmax(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist) +{ + CUT_CHECK_ERROR("Cuda_PairEAMCuda: before updateNmax failed"); + cudaMemcpyToSymbol(MY_CONST(neighbor_maxlocal) , & sneighlist->firstneigh.dim[0] , sizeof(unsigned) ); + cudaMemcpyToSymbol(MY_CONST(firstneigh), & sneighlist->firstneigh.dev_data, sizeof(int*) ); + cudaMemcpyToSymbol(MY_CONST(ilist) , & sneighlist->ilist .dev_data, sizeof(int*) ); + cudaMemcpyToSymbol(MY_CONST(inum) , & sneighlist->inum , sizeof(int) ); + cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); + cudaMemcpyToSymbol(MY_CONST(nmax) , & sdata->atom.nmax , sizeof(int) ); + cudaMemcpyToSymbol(MY_CONST(numneigh) , & sneighlist->numneigh .dev_data, sizeof(int*) ); + cudaMemcpyToSymbol(MY_CONST(neighbors) , & sneighlist->neighbors .dev_data, sizeof(int*) ); + cudaMemcpyToSymbol(MY_CONST(maxneighbors) , & sneighlist->maxneighbors , sizeof(int) ); + cudaMemcpyToSymbol(MY_CONST(x) , & sdata->atom.x .dev_data, sizeof(X_FLOAT*) ); + cudaMemcpyToSymbol(MY_CONST(x_type) , & sdata->atom.x_type .dev_data, sizeof(X_FLOAT4*) ); + cudaMemcpyToSymbol(MY_CONST(f) , & sdata->atom.f .dev_data, sizeof(F_FLOAT*) ); + cudaMemcpyToSymbol(MY_CONST(type) , & sdata->atom.type .dev_data, sizeof(int*) ); + cudaMemcpyToSymbol(MY_CONST(tag) , & sdata->atom.tag .dev_data, sizeof(int*) ); + cudaMemcpyToSymbol(MY_CONST(eatom) , & sdata->atom.eatom .dev_data, sizeof(ENERGY_FLOAT*) ); + cudaMemcpyToSymbol(MY_CONST(vatom) , & sdata->atom.vatom .dev_data, sizeof(ENERGY_FLOAT*) ); + 
CUT_CHECK_ERROR("Cuda_PairEAMCuda: updateNmax failed"); +} + + +void Cuda_PairEAMCuda_Init(cuda_shared_data* sdata,double rdr,double rdrho,int nfrho, int nrhor,int nr, int nrho,int nz2r, +void* frho_spline,void* rhor_spline,void* z2r_spline,void* rho,void* fp, +int* type2frho,int** type2z2r,int** type2rhor) +{ + // !! LAMMPS indexes atom types starting with 1 !! + + unsigned cuda_ntypes = sdata->atom.ntypes + 1; + if(cuda_ntypes*cuda_ntypes > CUDA_MAX_TYPES2) + printf("# CUDA: Cuda_PairEAMCuda_Init: you need %u types. this is more than %u " + "(assumed at compile time). re-compile with -DCUDA_MAX_TYPES_PLUS_ONE=99 " + "or ajust this in cuda_common.h\n", cuda_ntypes, CUDA_MAX_TYPES2); + unsigned nI = sizeof(F_FLOAT) * cuda_ntypes * cuda_ntypes; + + X_FLOAT cutsq_global; + cutsq_global = (X_FLOAT) (sdata->pair.cut_global); + cudaMemcpyToSymbol(MY_CONST(cutsq_global) ,&cutsq_global , sizeof(X_FLOAT) ); + + + F_FLOAT* coeff_buf=new F_FLOAT[cuda_ntypes*cuda_ntypes]; + for(int i=0;idomain.subhi[0] - sdata->domain.sublo[0], + sdata->domain.subhi[1] - sdata->domain.sublo[1], + sdata->domain.subhi[2] - sdata->domain.sublo[2] + }; + F_FLOAT rdr_F=rdr; + F_FLOAT rdrho_F=rdrho; + cudaMemcpyToSymbol(MY_CONST(box_size) , box_size , sizeof(X_FLOAT)*3); + cudaMemcpyToSymbol(MY_CONST(cuda_ntypes), & cuda_ntypes , sizeof(unsigned) ); + cudaMemcpyToSymbol(MY_CONST(virial) , &sdata->pair.virial.dev_data , sizeof(ENERGY_FLOAT*) ); + cudaMemcpyToSymbol(MY_CONST(eng_vdwl) , &sdata->pair.eng_vdwl.dev_data , sizeof(ENERGY_FLOAT*) ); + cudaMemcpyToSymbol(MY_CONST(periodicity), sdata->domain.periodicity, sizeof(int)*3 ); + cudaMemcpyToSymbol(MY_CONST(collect_forces_later), &sdata->pair.collect_forces_later , sizeof(int) ); + cudaMemcpyToSymbol(MY_CONST(rdr), &rdr_F, sizeof(F_FLOAT) ); + cudaMemcpyToSymbol(MY_CONST(rdrho), &rdrho_F, sizeof(F_FLOAT) ); + cudaMemcpyToSymbol(MY_CONST(nr), &nr, sizeof(int) ); + cudaMemcpyToSymbol(MY_CONST(nrho), &nrho, sizeof(int) ); + 
cudaMemcpyToSymbol(MY_CONST(nfrho), &nfrho, sizeof(int) ); + cudaMemcpyToSymbol(MY_CONST(nrhor), &nrhor, sizeof(int) ); + cudaMemcpyToSymbol(MY_CONST(rho), &rho, sizeof(F_FLOAT*) ); + cudaMemcpyToSymbol(MY_CONST(fp), &fp, sizeof(F_FLOAT*) ); + cudaMemcpyToSymbol(MY_CONST(frho_spline), &frho_spline, sizeof(F_FLOAT*) ); + cudaMemcpyToSymbol(MY_CONST(rhor_spline), &rhor_spline, sizeof(F_FLOAT*) ); + cudaMemcpyToSymbol(MY_CONST(z2r_spline), &z2r_spline, sizeof(F_FLOAT*) ); + cudaMemcpyToSymbol(MY_CONST(nrhor), &nrhor, sizeof(int) ); + + rhor_spline_size = nrhor*(nr+1)*EAM_COEFF_LENGTH*sizeof(F_FLOAT); + z2r_spline_size = nz2r*(nr+1)*EAM_COEFF_LENGTH*sizeof(F_FLOAT); + rhor_spline_pointer = rhor_spline; + z2r_spline_pointer = z2r_spline; + + CUT_CHECK_ERROR("Cuda_PairEAMCuda: init failed"); + +} + + + +void Cuda_PairEAM1Cuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom) +{ + + if(sdata->atom.update_nmax) + Cuda_PairEAMCuda_UpdateNmax(sdata,sneighlist); + if(sdata->atom.update_nlocal) + cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); + if(sdata->buffer_new) + Cuda_PairEAMCuda_UpdateBuffer(sdata,sneighlist); + cudaMemcpyToSymbol(MY_CONST(eatom) , & sdata->atom.eatom .dev_data, sizeof(ENERGY_FLOAT*) ); + cudaMemcpyToSymbol(MY_CONST(vatom) , & sdata->atom.vatom .dev_data, sizeof(ENERGY_FLOAT*) ); + + int sharedperproc=0; + if(eflag||eflag_atom) sharedperproc=1; + if(vflag||vflag_atom) sharedperproc=7; + + int3 layout=getgrid(sneighlist->inum,sharedperproc*sizeof(ENERGY_FLOAT)); + dim3 threads(layout.z, 1, 1); + dim3 grid(layout.x, layout.y, 1); + + eam_buff_offset=grid.x*grid.y; + + BindXTypeTexture(sdata); + BindEAMTextures( sdata);// initialize only on first call + + + MYDBG( printf("# CUDA: Cuda_PairEAMCuda: kernel start eflag: %i vflag: %i\n",eflag,vflag); ) + CUT_CHECK_ERROR("Cuda_PairEAMCuda: pre pair Kernel 1 problems before kernel invocation"); + PairEAMCuda_Kernel1<<>> 
(eflag, vflag,eflag_atom,vflag_atom); + cudaThreadSynchronize(); + CUT_CHECK_ERROR("Cuda_PairEAMCuda: pair Kernel 1 execution failed"); + + + + MYDBG( printf("# CUDA: Cuda_PairEAMCoulLongCuda: kernel done\n"); ) + +} + +void Cuda_PairEAM2Cuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom) +{ + int sharedperproc=0; + if(eflag||eflag_atom) sharedperproc=1; + if(vflag||vflag_atom) sharedperproc=7; + int3 layout=getgrid(sneighlist->inum,sharedperproc*sizeof(ENERGY_FLOAT)); + dim3 threads(layout.z, 1, 1); + dim3 grid(layout.x, layout.y, 1); + + BindXTypeTexture(sdata); + BindEAMTextures( sdata);// initialize only on first call + // initialize only on first call + sdata->pair.lastgridsize=grid.x*grid.y; + sdata->pair.n_energy_virial=sharedperproc; + + MYDBG( printf("# CUDA: Cuda_PairEAMCuda: kernel start eflag: %i vflag: %i\n",eflag,vflag); ) + CUT_CHECK_ERROR("Cuda_PairEAMCuda: pre pair Kernel 2 problems before kernel invocation"); + PairEAMCuda_Kernel2<<>> (eflag, vflag,eflag_atom,vflag_atom); + CUT_CHECK_ERROR("Cuda_PairEAMCuda: pair Kernel 2 start failed"); + cudaThreadSynchronize(); + CUT_CHECK_ERROR("Cuda_PairEAMCuda: pair Kernel 2 execution failed"); + + if(eflag||vflag) + { + int n=grid.x*grid.y; + grid.x=sharedperproc; + grid.y=1; + threads.x=256; + MY_AP(PairVirialCompute_reduce)<<>>(n); + cudaThreadSynchronize(); + CUT_CHECK_ERROR("Cuda_PairEAMCuda: virial compute Kernel execution failed"); + } + + MYDBG( printf("# CUDA: Cuda_PairEAMCoulLongCuda: kernel done\n"); ) + +} + +void Cuda_PairEAMCuda_PackComm(cuda_shared_data* sdata,int n,int iswap,void* buf_send) +{ + int3 layout=getgrid(n,0); + dim3 threads(layout.z, 1, 1); + dim3 grid(layout.x, layout.y, 1); + F_FLOAT* buf=(F_FLOAT*) (& ((double*)sdata->buffer)[eam_buff_offset]); + + PairEAMCuda_PackComm_Kernel<<>> ((int*) sdata->comm.sendlist.dev_data,n + ,sdata->comm.maxlistlength,iswap,buf); + cudaThreadSynchronize(); + cudaMemcpy(buf_send, 
buf, n*sizeof(F_FLOAT), cudaMemcpyDeviceToHost); + cudaThreadSynchronize(); +} + +void Cuda_PairEAMCuda_UnpackComm(cuda_shared_data* sdata,int n,int first,void* buf_recv,void* fp) +{ + F_FLOAT* fp_first = &(((F_FLOAT*) fp)[first]); + cudaMemcpy(fp_first,buf_recv, n*sizeof(F_FLOAT), cudaMemcpyHostToDevice); +} + +#undef _type2frho +#undef _type2rhor +#undef _type2z2r + + +/* ---------------------------------------------------------------------- + tally eng_vdwl and virial into global and per-atom accumulators + need i < nlocal test since called by bond_quartic and dihedral_charmm +------------------------------------------------------------------------- */ + diff --git a/lib/cuda/pair_eam_cuda_cu.h b/lib/cuda/pair_eam_cuda_cu.h new file mode 100644 index 0000000000..dee4a036e2 --- /dev/null +++ b/lib/cuda/pair_eam_cuda_cu.h @@ -0,0 +1,33 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ + +#include "cuda_shared.h" +extern "C" void Cuda_PairEAMCuda_Init(cuda_shared_data* sdata,double rdr,double rdrho,int nfrho, int nrhor,int nr, int nrho,int nz2r, +void* frho_spline,void* rhor_spline,void* z2r_spline,void* rho,void* fp, +int* type2frho,int** type2z2r,int** type2rhor); +extern "C" void Cuda_PairEAM1Cuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,int eflag_atom,int vflag_atom); +extern "C" void Cuda_PairEAM2Cuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,int eflag_atom,int vflag_atom); +extern "C" void Cuda_PairEAMCuda_PackComm(cuda_shared_data* sdata,int n,int iswap,void* buf_send); +extern "C" void Cuda_PairEAMCuda_UnpackComm(cuda_shared_data* sdata,int n,int first,void* buf_recv,void* fp); + +#define EAM_COEFF_LENGTH 8 diff --git a/lib/cuda/pair_eam_cuda_kernel_nc.cu b/lib/cuda/pair_eam_cuda_kernel_nc.cu new file mode 100644 index 0000000000..a3dc30f397 --- /dev/null +++ b/lib/cuda/pair_eam_cuda_kernel_nc.cu @@ -0,0 +1,340 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ + + + + +static __device__ inline F_FLOAT4 fetchRhor(int i) +{ + #ifdef CUDA_USE_TEXTURE + #if F_PRECISION == 1 + return tex1Dfetch(_rhor_spline_tex,i); + #else + return tex1Dfetch_double_f(_rhor_spline_tex,i); + #endif + #else + return _rhor_spline[i]; + #endif +} + +static __device__ inline F_FLOAT4 fetchZ2r(int i) +{ + #ifdef CUDA_USE_TEXTURE + #if F_PRECISION == 1 + return tex1Dfetch(_z2r_spline_tex,i); + #else + return tex1Dfetch_double_f(_z2r_spline_tex,i); + #endif + #else + return _z2r_spline[i]; + #endif +} + +__global__ void PairEAMCuda_Kernel1(int eflag, int vflag,int eflag_atom,int vflag_atom) +{ + ENERGY_FLOAT* sharedE; + ENERGY_FLOAT* sharedV = &sharedmem[threadIdx.x]; + + + if(eflag||eflag_atom) + { + sharedE = &sharedmem[threadIdx.x]; + sharedE[0] = ENERGY_F(0.0); + sharedV += blockDim.x; + } + if(vflag||vflag_atom) + { + sharedV[0*blockDim.x] = ENERGY_F(0.0); + sharedV[1*blockDim.x] = ENERGY_F(0.0); + sharedV[2*blockDim.x] = ENERGY_F(0.0); + sharedV[3*blockDim.x] = ENERGY_F(0.0); + sharedV[4*blockDim.x] = ENERGY_F(0.0); + sharedV[5*blockDim.x] = ENERGY_F(0.0); + } + + int ii = (blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x; + + X_FLOAT xtmp,ytmp,ztmp; + X_FLOAT4 myxtype; + F_FLOAT delx,dely,delz; + int itype; + int i=_nlocal; + int jnum=0; + int* jlist; + + if(ii < _inum) + { + i = _ilist[ii]; + + myxtype=fetchXType(i); + xtmp=myxtype.x; + ytmp=myxtype.y; + ztmp=myxtype.z; + itype=static_cast (myxtype.w); + + jnum = _numneigh[i]; + + jlist = &_neighbors[i]; + if(i<_nlocal) + _rho[i]=F_F(0.0); + } + __syncthreads(); + + for (int jj = 0; jj < jnum; jj++) + { + if(ii < _inum) + if(jj (myxtype.w); + const F_FLOAT rsq = delx*delx + dely*dely + delz*delz; + if (rsq < _cutsq_global) + { + F_FLOAT p = sqrt(rsq)*_rdr + F_F(1.0); + int m = static_cast (p); + m = MIN(m,_nr-1); + p -= m; + p = MIN(p,F_F(1.0)); + + int k=(static_cast 
(_type2rhor[jtype*_cuda_ntypes+itype])*(_nr+1)+m)*2; + F_FLOAT4 c=fetchRhor(k+1); + _rho[i] += ((c.w*p+c.x)*p+c.y)*p+c.z; + } + } + } + + if(ii < _inum) + { + + F_FLOAT p = _rho[i]*_rdrho + F_F(1.0); + int m = static_cast (p); + m = MAX(1,MIN(m,_nrho-1)); + p -= m; + p = MIN(p,F_F(1.0)); + F_FLOAT* coeff = &_frho_spline[(static_cast (_type2frho[itype])*(_nrho+1)+m)*EAM_COEFF_LENGTH]; + _fp[i] = (coeff[0]*p + coeff[1])*p + coeff[2]; + if (eflag||eflag_atom) { + sharedmem[threadIdx.x] += ((coeff[3]*p + coeff[4])*p + coeff[5])*p + coeff[6]; + } + + } + __syncthreads(); + if(eflag||eflag_atom) + { + if(i<_nlocal&&eflag_atom) + _eatom[i]+=sharedmem[threadIdx.x]; + reduceBlock(sharedmem); + ENERGY_FLOAT* buffer = (ENERGY_FLOAT*) _buffer; + buffer[blockIdx.x * gridDim.y + blockIdx.y] = ENERGY_F(2.0)*sharedmem[0]; + } +} + +__global__ void PairEAMCuda_Kernel2(int eflag, int vflag,int eflag_atom,int vflag_atom) +{ + ENERGY_FLOAT evdwl = ENERGY_F(0.0); + + ENERGY_FLOAT* sharedE; + ENERGY_FLOAT* sharedV = &sharedmem[threadIdx.x]; + + + if(eflag||eflag_atom) + { + sharedE = &sharedmem[threadIdx.x]; + sharedE[0] = ENERGY_F(0.0); + sharedV += blockDim.x; + } + if(vflag||vflag_atom) + { + sharedV[0*blockDim.x] = ENERGY_F(0.0); + sharedV[1*blockDim.x] = ENERGY_F(0.0); + sharedV[2*blockDim.x] = ENERGY_F(0.0); + sharedV[3*blockDim.x] = ENERGY_F(0.0); + sharedV[4*blockDim.x] = ENERGY_F(0.0); + sharedV[5*blockDim.x] = ENERGY_F(0.0); + } + + int ii = (blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x; + + X_FLOAT xtmp,ytmp,ztmp; + X_FLOAT4 myxtype; + F_FLOAT fxtmp,fytmp,fztmp,fpair; + F_FLOAT delx,dely,delz; + int itype,i; + int jnum=0; + int* jlist; + + if(ii < _inum) + { + i = _ilist[ii]; + + myxtype=fetchXType(i); + xtmp=myxtype.x; + ytmp=myxtype.y; + ztmp=myxtype.z; + itype=static_cast (myxtype.w); + fxtmp = F_F(0.0); + fytmp = F_F(0.0); + fztmp = F_F(0.0); + + jnum = _numneigh[i]; + + jlist = &_neighbors[i]; + if(i<_nlocal) + _rho[i]=F_F(0.0); + } + if(ii (myxtype.w); + 
const F_FLOAT rsq = delx*delx + dely*dely + delz*delz; + + if (rsq < _cutsq_global) + { + F_FLOAT r = _SQRT_(rsq); + F_FLOAT p = r*_rdr + F_F(1.0); + int m = static_cast (p); + m = MIN(m,_nr-1); + p -= m; + p = MIN(p,F_F(1.0)); + + int k=(static_cast (_type2rhor[itype*_cuda_ntypes+jtype])*(_nr+1)+m)*2; + F_FLOAT4 c=fetchRhor(k); + F_FLOAT rhoip = (c.x*p + c.y)*p + c.z; + k=(static_cast (_type2rhor[jtype*_cuda_ntypes+itype])*(_nr+1)+m)*2; + c=fetchRhor(k); + F_FLOAT rhojp = (c.x*p + c.y)*p + c.z; + k=(static_cast (_type2z2r[itype*_cuda_ntypes+jtype])*(_nr+1)+m)*2; + c=fetchZ2r(k); + F_FLOAT z2p = (c.x*p + c.y)*p + c.z; + c=fetchZ2r(k+1); + F_FLOAT z2 = ((c.w*p + c.x)*p + c.y)*p+c.z; + + F_FLOAT recip = F_F(1.0)/r; + F_FLOAT phi = z2*recip; + F_FLOAT phip = z2p*recip - phi*recip; + F_FLOAT psip = _fp[i]*rhojp + _fp[j]*rhoip + phip; + fpair = -psip*recip; + + F_FLOAT dxfp,dyfp,dzfp; + fxtmp += dxfp = delx*fpair; + fytmp += dyfp = dely*fpair; + fztmp += dzfp = delz*fpair; + evdwl+=phi; + if(vflag||vflag_atom) + { + sharedV[0 * blockDim.x]+= delx*dxfp; + sharedV[1 * blockDim.x]+= dely*dyfp; + sharedV[2 * blockDim.x]+= delz*dzfp; + sharedV[3 * blockDim.x]+= delx*dyfp; + sharedV[4 * blockDim.x]+= delx*dzfp; + sharedV[5 * blockDim.x]+= dely*dzfp; + } + } + } + } + + __syncthreads(); + if(ii < _inum) + { + F_FLOAT* my_f; + if(_collect_forces_later) + { + ENERGY_FLOAT* buffer = (ENERGY_FLOAT*) _buffer; + if(eflag) + { + buffer=&buffer[1 * gridDim.x * gridDim.y]; + } + if(vflag) + { + buffer=&buffer[6 * gridDim.x * gridDim.y]; + } + my_f = (F_FLOAT*) buffer; + my_f += i; + *my_f = fxtmp; my_f += _nmax; + *my_f = fytmp; my_f += _nmax; + *my_f = fztmp; + } + else + { + my_f = _f + i; + *my_f += fxtmp; my_f += _nmax; + *my_f += fytmp; my_f += _nmax; + *my_f += fztmp; + } + } + __syncthreads(); + + if(eflag) + { + sharedE[0] = evdwl; + } + if(eflag_atom && i<_nlocal) + { + _eatom[i] += evdwl; + } + + if(vflag_atom && i<_nlocal) + { + _vatom[i] += ENERGY_F(0.5) * sharedV[0 * 
blockDim.x]; + _vatom[i+_nmax] += ENERGY_F(0.5) * sharedV[1 * blockDim.x]; + _vatom[i+2*_nmax] += ENERGY_F(0.5) * sharedV[2 * blockDim.x]; + _vatom[i+3*_nmax] += ENERGY_F(0.5) * sharedV[3 * blockDim.x]; + _vatom[i+4*_nmax] += ENERGY_F(0.5) * sharedV[4 * blockDim.x]; + _vatom[i+5*_nmax] += ENERGY_F(0.5) * sharedV[5 * blockDim.x]; + } + if(vflag||eflag) PairVirialCompute_A_Kernel(eflag,vflag,0); +} + +__global__ void PairEAMCuda_PackComm_Kernel(int* sendlist,int n,int maxlistlength,int iswap,F_FLOAT* buffer) +{ + int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x; + int* list=sendlist+iswap*maxlistlength; + if(i + +#define _kn MY_AP(coeff1) //[0] +#define _kt MY_AP(coeff1) //[1] +#define _gamman MY_AP(coeff1) //[2] +#define _gammat MY_AP(coeff3) //[0] +#define _xmu MY_AP(coeff2) //[0] +#define _dampflag MY_AP(coeff2) //[1] + +#include "pair_gran_hooke_cuda_cu.h" +#include "pair_gran_hooke_cuda_kernel_nc.cu" +#include + +void Cuda_PairGranHookeCuda_UpdateBuffer(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist) +{ + CUT_CHECK_ERROR("Cuda_PairGranHookeCuda: before updateBuffer failed"); + int3 layout=getgrid(sneighlist->inum,7*sizeof(ENERGY_FLOAT)); + dim3 threads(layout.z, 1, 1); + dim3 grid(layout.x, layout.y, 1); + int size=(unsigned)(layout.y*layout.x)*7*sizeof(ENERGY_FLOAT); + if(sdata->buffersizebuffer,sdata->buffersize);) + if(sdata->buffer!=NULL) cudaFree(sdata->buffer); + cudaMalloc((void**)&sdata->buffer,size); + sdata->buffersize=size; + sdata->buffer_new++; + MYDBG(printf("New buffer at %p with %i kB\n",sdata->buffer,sdata->buffersize);) + } + cudaMemcpyToSymbol(MY_CONST(buffer), & sdata->buffer, sizeof(int*) ); + CUT_CHECK_ERROR("Cuda_PairGranHookeCuda: updateBuffer failed"); +} + +void Cuda_PairGranHookeCuda_UpdateNmax(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist) +{ + CUT_CHECK_ERROR("Cuda_PairGranHookeCuda: before updateNmax failed"); + cudaMemcpyToSymbol(MY_CONST(neighbor_maxlocal) , & sneighlist->firstneigh.dim[0] , 
sizeof(unsigned) ); + //cudaMemcpyToSymbol(MY_CONST(firstneigh), & sneighlist->firstneigh.dev_data, sizeof(int*) ); + cudaMemcpyToSymbol(MY_CONST(ilist) , & sneighlist->ilist .dev_data, sizeof(int*) ); + cudaMemcpyToSymbol(MY_CONST(inum) , & sneighlist->inum , sizeof(int) ); + cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); + cudaMemcpyToSymbol(MY_CONST(nall) , & sdata->atom.nall , sizeof(int) ); + cudaMemcpyToSymbol(MY_CONST(nmax) , & sdata->atom.nmax , sizeof(int) ); + cudaMemcpyToSymbol(MY_CONST(numneigh) , & sneighlist->numneigh .dev_data, sizeof(int*) ); + cudaMemcpyToSymbol(MY_CONST(neighbors) , & sneighlist->neighbors .dev_data, sizeof(int*) ); + cudaMemcpyToSymbol(MY_CONST(type) , & sdata->atom.type .dev_data, sizeof(int*) ); + cudaMemcpyToSymbol(MY_CONST(tag) , & sdata->atom.tag .dev_data, sizeof(int*) ); + cudaMemcpyToSymbol(MY_CONST(mask) , & sdata->atom.mask .dev_data, sizeof(int*) ); + cudaMemcpyToSymbol(MY_CONST(f) , & sdata->atom.f .dev_data, sizeof(F_FLOAT*) ); + cudaMemcpyToSymbol(MY_CONST(x) , & sdata->atom.x .dev_data, sizeof(X_FLOAT*) ); + cudaMemcpyToSymbol(MY_CONST(x_type) , & sdata->atom.x_type .dev_data, sizeof(X_FLOAT4*) ); + cudaMemcpyToSymbol(MY_CONST(v_radius) , & sdata->atom.v_radius .dev_data, sizeof(V_FLOAT4*) ); + cudaMemcpyToSymbol(MY_CONST(omega_rmass),& sdata->atom.omega_rmass.dev_data,sizeof(V_FLOAT4*) ); + cudaMemcpyToSymbol(MY_CONST(torque) , & sdata->atom.torque .dev_data, sizeof(F_FLOAT*) ); + cudaMemcpyToSymbol(MY_CONST(maxneighbors),&sneighlist->maxneighbors , sizeof(int) ); + cudaMemcpyToSymbol(MY_CONST(eatom) , & sdata->atom.eatom .dev_data, sizeof(ENERGY_FLOAT*) ); + cudaMemcpyToSymbol(MY_CONST(vatom) , & sdata->atom.vatom .dev_data, sizeof(ENERGY_FLOAT*) ); + cudaMemcpyToSymbol(MY_CONST(debugdata) , & sdata->debugdata , sizeof(int*) ); + cudaMemcpyToSymbol(MY_CONST(freeze_group_bit) , & sdata->pair.freeze_group_bit, sizeof(int) ); + + + CUT_CHECK_ERROR("Cuda_PairGranHookeCuda: updateNmax 
failed"); +} + + +void Cuda_PairGranHookeCuda_Init(cuda_shared_data* sdata) +{ + // !! LAMMPS indexes atom types starting with 1 !! + + unsigned cuda_ntypes = sdata->atom.ntypes + 2; + if(cuda_ntypes*cuda_ntypes > CUDA_MAX_TYPES2) + printf("# CUDA: Cuda_PairGranHookeCuda_Init: you need %u types. this is more than %u " + "(assumed at compile time). re-compile with -DCUDA_MAX_TYPES_PLUS_ONE=32 " + "or ajust this in cuda_common.h\n", cuda_ntypes, CUDA_MAX_TYPES_PLUS_ONE-1); + unsigned cuda_ntypes2 = cuda_ntypes * cuda_ntypes; + unsigned n = sizeof(F_FLOAT) * cuda_ntypes2; + + F_FLOAT coeffs1[cuda_ntypes2]; + coeffs1[0]= (F_FLOAT) sdata->pair.coeff1[0][0]; + coeffs1[1]= (F_FLOAT) sdata->pair.coeff1[0][1]; + coeffs1[2]= (F_FLOAT) sdata->pair.coeff1[1][0]; + F_FLOAT coeffs3[cuda_ntypes2]; + coeffs3[0]= (F_FLOAT) sdata->pair.coeff1[1][1]; + F_FLOAT coeffs2[cuda_ntypes2]; + coeffs2[0]= (F_FLOAT) sdata->pair.coeff2[0][0]; + coeffs2[1]= (F_FLOAT) sdata->pair.coeff2[0][1]; + + + X_FLOAT box_size[3] = + { + sdata->domain.subhi[0] - sdata->domain.sublo[0], + sdata->domain.subhi[1] - sdata->domain.sublo[1], + sdata->domain.subhi[2] - sdata->domain.sublo[2] + }; + //printf("n: %i %i\n",n,CUDA_MAX_TYPES2); + cudaMemcpyToSymbol(MY_CONST(box_size) , box_size , sizeof(X_FLOAT)*3); + cudaMemcpyToSymbol(MY_CONST(cuda_ntypes), & cuda_ntypes , sizeof(unsigned) ); + cudaMemcpyToSymbol(MY_CONST(coeff1) , coeffs1 , n ); + cudaMemcpyToSymbol(MY_CONST(coeff2) , coeffs2 , n ); + cudaMemcpyToSymbol(MY_CONST(coeff3) , coeffs3 , n ); + cudaMemcpyToSymbol(MY_CONST(virial) , &sdata->pair.virial.dev_data , sizeof(ENERGY_FLOAT*) ); + cudaMemcpyToSymbol(MY_CONST(eng_vdwl) , &sdata->pair.eng_vdwl.dev_data , sizeof(ENERGY_FLOAT*) ); + cudaMemcpyToSymbol(MY_CONST(periodicity), sdata->domain.periodicity, sizeof(int)*3 ); + CUT_CHECK_ERROR("Cuda_PairGranHookeCuda: init failed"); +} + + + +void Cuda_PairGranHookeCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,int 
eflag_atom,int vflag_atom) +{ + + //if(sdata->atom.update_nmax) + Cuda_PairGranHookeCuda_UpdateNmax(sdata,sneighlist); + //if(sdata->atom.update_nlocal) + { + cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); + cudaMemcpyToSymbol(MY_CONST(nall) , & sdata->atom.nall , sizeof(int) ); + } + //if(sdata->buffer_new) + Cuda_PairGranHookeCuda_UpdateBuffer(sdata,sneighlist); + + BindXTypeTexture(sdata); + BindVRadiusTexture(sdata); + BindOmegaRmassTexture(sdata); + + int sharedperproc=0; + if(eflag) sharedperproc+=1; + if(vflag) sharedperproc+=6; + + int3 layout=getgrid(sneighlist->inum,sharedperproc*sizeof(ENERGY_FLOAT),128); + dim3 threads(layout.z, 1, 1); + dim3 grid(layout.x, layout.y, 1); + + // initialize only on first call + static short init=0; + if(! init) + { + init = 1; + Cuda_PairGranHookeCuda_Init(sdata); + } + + MYDBG( printf("# CUDA: Cuda_PairGranHookeCuda: kernel start eflag: %i vflag: %i config: %i %i %i %i\n",eflag,vflag,grid.x,grid.y, threads.x,sharedperproc*sizeof(ENERGY_FLOAT)*threads.x); ) + + CUT_CHECK_ERROR("Cuda_PairGranHookeCuda: (no binning) pre pair lj cut Kernel problems before kernel invocation"); + PairGranHookeCuda_Kernel<<>> (eflag, vflag,eflag_atom,vflag_atom,(int**)sneighlist->firstneigh.dev_data,sneighlist->binned_id + ,(F_FLOAT) sdata->pair.coeff1[0][0],(F_FLOAT) sdata->pair.coeff1[1][0],(F_FLOAT) sdata->pair.coeff1[1][1],(F_FLOAT) sdata->pair.coeff2[0][0]); + cudaThreadSynchronize(); + CUT_CHECK_ERROR("Cuda_PairGranHookeCuda: (no binning) pair lj cut Kernel execution failed"); + + if(eflag||vflag) + { + int n=grid.x*grid.y; + grid.x=sharedperproc; + grid.y=1; + threads.x=256; + MY_AP(PairVirialCompute_reduce)<<>>(n); + cudaThreadSynchronize(); + CUT_CHECK_ERROR("Cuda_PairGranHookeCuda: (no binning) virial compute Kernel execution failed"); + } + + MYDBG( printf("# CUDA: Cuda_PairGranHookeCoulLongCuda: kernel done\n"); ) + +} + + +#undef _kn +#undef _kt +#undef _gamman +#undef _gammat +#undef _xmu +#undef 
_dampflag + + diff --git a/lib/cuda/pair_gran_hooke_cuda_cu.h b/lib/cuda/pair_gran_hooke_cuda_cu.h new file mode 100644 index 0000000000..03cbd36519 --- /dev/null +++ b/lib/cuda/pair_gran_hooke_cuda_cu.h @@ -0,0 +1,26 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#include "cuda_shared.h" + +extern "C" void Cuda_PairGranHookeCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,int eflag_atom,int vflag_atom); diff --git a/lib/cuda/pair_gran_hooke_cuda_kernel_nc.cu b/lib/cuda/pair_gran_hooke_cuda_kernel_nc.cu new file mode 100644 index 0000000000..f063def443 --- /dev/null +++ b/lib/cuda/pair_gran_hooke_cuda_kernel_nc.cu @@ -0,0 +1,219 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. 
+ + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + + +__global__ void PairGranHookeCuda_Kernel(int eflag, int vflag,int eflag_atom,int vflag_atom,int** firstneight,int* binned_id +,F_FLOAT kn,F_FLOAT gamman,F_FLOAT gammat, F_FLOAT xmu) +{ + ENERGY_FLOAT evdwl = ENERGY_F(0.0); + + ENERGY_FLOAT* sharedE; + ENERGY_FLOAT* sharedV; + + if(eflag||eflag_atom) + { + sharedE = &sharedmem[threadIdx.x]; + sharedV = &sharedmem[0]; + sharedE[0] = ENERGY_F(0.0); sharedV+=blockDim.x; + } + if(vflag||vflag_atom) + { + sharedV += threadIdx.x; + sharedV[0*blockDim.x] = ENERGY_F(0.0); + sharedV[1*blockDim.x] = ENERGY_F(0.0); + sharedV[2*blockDim.x] = ENERGY_F(0.0); + sharedV[3*blockDim.x] = ENERGY_F(0.0); + sharedV[4*blockDim.x] = ENERGY_F(0.0); + sharedV[5*blockDim.x] = ENERGY_F(0.0); + } + + int ii = (blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x; + MYEMUDBG( if(ii==0) printf("# CUDA: PairGranHookeCuda_Kernel: -- no binning --\n"); ) + + X_FLOAT xtmp,ytmp,ztmp; + X_FLOAT4 myxtype; + V_FLOAT4 myvradius, ovradius; + F_FLOAT fxtmp,fytmp,fztmp,torquextmp,torqueytmp,torqueztmp; + F_FLOAT delx,dely,delz; + F_FLOAT radi,radj,radsum,r,rsqinv; + F_FLOAT vr1,vr2,vr3,vnnr,vn1,vn2,vn3,vt1,vt2,vt3; + F_FLOAT wr1,wr2,wr3; + F_FLOAT vtr1,vtr2,vtr3,vrel; + F_FLOAT meff,damp,ccel,tor1,tor2,tor3; + F_FLOAT fn,fs,ft,fs1,fs2,fs3; + + int jnum =0; + int i,j; + int* jlist; + + if(ii < _inum) + { + i = _ilist[ii]; + + myxtype = fetchXType(i); + myvradius = fetchVRadius(i); + + xtmp=myxtype.x; + 
ytmp=myxtype.y; + ztmp=myxtype.z; + radi = myvradius.w; + + fxtmp = F_F(0.0); + fytmp = F_F(0.0); + fztmp = F_F(0.0); + torquextmp = F_F(0.0); + torqueytmp = F_F(0.0); + torqueztmp = F_F(0.0); + + jnum = _numneigh[i]; + + jlist = &_neighbors[i]; + } + __syncthreads(); + + for (int jj = 0; jj < jnum; jj++) + { + if(ii < _inum) + if(jj + +#define _lj1 MY_AP(coeff1) +#define _lj2 MY_AP(coeff2) +#define _lj3 MY_AP(coeff3) +#define _lj4 MY_AP(coeff4) + +#include "pair_lj96_cut_cuda_cu.h" +#include "pair_lj96_cut_cuda_kernel_nc.cu" +#include + + + + +void Cuda_PairLJ96CutCuda_Init(cuda_shared_data* sdata) +{ + Cuda_Pair_Init_AllStyles(sdata, 4, false, false ); +} + + + + +void Cuda_PairLJ96CutCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,int eflag_atom,int vflag_atom) +{ + + // initialize only on first call + static short init=0; + if(! init) + { + init = 1; + Cuda_PairLJ96CutCuda_Init(sdata); + } + + dim3 grid,threads; + int sharedperproc; + + Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc,false,256); + + cudaStream_t* streams = (cudaStream_t*) CudaWrapper_returnStreams(); + if(sdata->pair.use_block_per_atom) + Pair_Kernel_BpA + <<>> (eflag, vflag,eflag_atom,vflag_atom); + else + Pair_Kernel_TpA + <<>> (eflag, vflag,eflag_atom,vflag_atom); + + Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag); +} + +#undef _lj1 +#undef _lj2 +#undef _lj3 +#undef _lj4 + + diff --git a/lib/cuda/pair_lj96_cut_cuda_cu.h b/lib/cuda/pair_lj96_cut_cuda_cu.h new file mode 100644 index 0000000000..24763103a7 --- /dev/null +++ b/lib/cuda/pair_lj96_cut_cuda_cu.h @@ -0,0 +1,26 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. 
+ + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#include "cuda_shared.h" + +extern "C" void Cuda_PairLJ96CutCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,int eflag_atom,int vflag_atom); diff --git a/lib/cuda/pair_lj96_cut_cuda_kernel_nc.cu b/lib/cuda/pair_lj96_cut_cuda_kernel_nc.cu new file mode 100644 index 0000000000..28ccb839ba --- /dev/null +++ b/lib/cuda/pair_lj96_cut_cuda_kernel_nc.cu @@ -0,0 +1,33 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ + +__device__ inline F_FLOAT PairLJ96CutCuda_Eval(const F_FLOAT& rsq,const int ij_type,F_FLOAT& factor_lj,int& eflag, ENERGY_FLOAT& evdwl) +{ + const F_FLOAT r2inv = F_F(1.0)/rsq; + const F_FLOAT r6inv = r2inv*r2inv*r2inv; + const F_FLOAT r3inv = _SQRT_(r6inv); + const F_FLOAT forcelj = r6inv * (_lj1[ij_type]*r3inv - _lj2[ij_type]); + if(eflag) evdwl += factor_lj*(r6inv*(_lj3[ij_type]*r3inv-_lj4[ij_type]) - _offset[ij_type]); + return factor_lj*forcelj*r2inv; +} + diff --git a/lib/cuda/pair_lj_charmm_coul_charmm_cuda.cu b/lib/cuda/pair_lj_charmm_coul_charmm_cuda.cu new file mode 100644 index 0000000000..b5a12755da --- /dev/null +++ b/lib/cuda/pair_lj_charmm_coul_charmm_cuda.cu @@ -0,0 +1,78 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ + +#include + +#define _lj1 MY_AP(coeff1_gm) +#define _lj2 MY_AP(coeff2_gm) +#define _lj3 MY_AP(coeff3_gm) +#define _lj4 MY_AP(coeff4_gm) + +#include "pair_lj_charmm_coul_charmm_cuda_cu.h" +#include "pair_lj_charmm_coul_charmm_cuda_kernel_nc.cu" + +#include + +void Cuda_PairLJCharmmCoulCharmmCuda_Init(cuda_shared_data* sdata,F_FLOAT cut_coul_innersq,F_FLOAT denom_lj_inv,F_FLOAT denom_coul_inv) +{ + Cuda_Pair_Init_AllStyles(sdata, 4,true,true,true); + cudaMemcpyToSymbol(MY_CONST(cut_coul_innersq_global) , &cut_coul_innersq , sizeof(F_FLOAT) ); + cudaMemcpyToSymbol(MY_CONST(denom_lj_inv) , &denom_lj_inv , sizeof(F_FLOAT) ); + cudaMemcpyToSymbol(MY_CONST(denom_coul_inv) , &denom_coul_inv , sizeof(F_FLOAT) ); + + return; +} + + + +void Cuda_PairLJCharmmCoulCharmmCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, + int eflag_atom,int vflag_atom,F_FLOAT denom_lj,F_FLOAT cut_coul_innersq,F_FLOAT denom_coul) +{ + + static short init=0; + if(! 
init) + { + init = 1; + Cuda_PairLJCharmmCoulCharmmCuda_Init(sdata,cut_coul_innersq,1.0/denom_lj,1.0/denom_coul); + } + + dim3 grid,threads; + int sharedperproc; + + Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc,true,192); + + cudaStream_t* streams = (cudaStream_t*) CudaWrapper_returnStreams(); + if(sdata->pair.use_block_per_atom) + Pair_Kernel_BpA + <<>> (eflag, vflag,eflag_atom,vflag_atom); + else + Pair_Kernel_TpA + <<>> (eflag, vflag,eflag_atom,vflag_atom); + + Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag); +} + +#undef _lj1 +#undef _lj2 +#undef _lj3 +#undef _lj4 diff --git a/lib/cuda/pair_lj_charmm_coul_charmm_cuda_cu.h b/lib/cuda/pair_lj_charmm_coul_charmm_cuda_cu.h new file mode 100644 index 0000000000..3b96ab4481 --- /dev/null +++ b/lib/cuda/pair_lj_charmm_coul_charmm_cuda_cu.h @@ -0,0 +1,26 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ + +#include "cuda_shared.h" + +extern "C" void Cuda_PairLJCharmmCoulCharmmCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,int eflag_atom,int vflag_atom,F_FLOAT denom_lj,F_FLOAT cut_coul_innersq,F_FLOAT denom_coul); diff --git a/lib/cuda/pair_lj_charmm_coul_charmm_cuda_kernel_nc.cu b/lib/cuda/pair_lj_charmm_coul_charmm_cuda_kernel_nc.cu new file mode 100644 index 0000000000..baaea5d4e5 --- /dev/null +++ b/lib/cuda/pair_lj_charmm_coul_charmm_cuda_kernel_nc.cu @@ -0,0 +1,72 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ +__device__ inline F_FLOAT PairLJCharmmCuda_Eval(const F_FLOAT& rsq,const int ij_type,F_FLOAT& factor_lj,int& eflag, ENERGY_FLOAT& evdwl) +{ + const F_FLOAT r2inv = F_F(1.0)/rsq; + const F_FLOAT r6inv = r2inv*r2inv*r2inv; + F_FLOAT forcelj = r6inv * (_lj1[ij_type]*r6inv - _lj2[ij_type]); + F_FLOAT philj,switch1; + if(rsq > _cut_innersq_global) + { + switch1 = (_cutsq_global-rsq) * (_cutsq_global-rsq) * + (_cutsq_global + F_F(2.0)*rsq - F_F(3.0)*_cut_innersq_global) * _denom_lj_inv; + const F_FLOAT switch2 = F_F(12.0)*rsq * (_cutsq_global-rsq) * + (rsq-_cut_innersq_global) * _denom_lj_inv; + philj = r6inv * (_lj3[ij_type]*r6inv - _lj4[ij_type]); + forcelj = forcelj*switch1 + philj*switch2; + } + + if (eflag) + { + ENERGY_FLOAT evdwl_tmp = factor_lj; + if (rsq > _cut_innersq_global) + { + evdwl_tmp*=philj*switch1; + } + else + evdwl_tmp*= r6inv * (_lj3[ij_type]*r6inv - _lj4[ij_type]); + evdwl+=evdwl_tmp; + } + + return factor_lj*forcelj*r2inv; +} + +__device__ inline F_FLOAT CoulCharmmCuda_Eval(const F_FLOAT& rsq,F_FLOAT& factor_coul,int& eflag, ENERGY_FLOAT& ecoul, F_FLOAT qij) +{ + F_FLOAT forcecoul; + ENERGY_FLOAT ecoul_tmp = forcecoul = _qqrd2e * qij *_RSQRT_(rsq)*factor_coul; + if (rsq > _cut_coul_innersq_global) { + const F_FLOAT switch1 = (_cut_coulsq_global-rsq) * (_cut_coulsq_global-rsq) * + (_cut_coulsq_global + F_F(2.0)*rsq - F_F(3.0)*_cut_coul_innersq_global) * _denom_coul_inv; + ecoul_tmp *= switch1; + const F_FLOAT switch2 = F_F(12.0)*rsq * (_cut_coulsq_global-rsq) * + (rsq-_cut_coul_innersq_global) * _denom_coul_inv; + forcecoul *= switch1 + switch2; + } + if(eflag) + { + ecoul += ecoul_tmp*factor_coul; + } + return forcecoul*(F_F(1.0)/rsq); +} + diff --git a/lib/cuda/pair_lj_charmm_coul_charmm_implicit_cuda.cu b/lib/cuda/pair_lj_charmm_coul_charmm_implicit_cuda.cu new file mode 100644 index 0000000000..9bfb0bcc0e --- /dev/null +++ 
b/lib/cuda/pair_lj_charmm_coul_charmm_implicit_cuda.cu @@ -0,0 +1,85 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#include + +#define _lj1 MY_AP(coeff1_gm) +#define _lj2 MY_AP(coeff2_gm) +#define _lj3 MY_AP(coeff3_gm) +#define _lj4 MY_AP(coeff4_gm) +#define _cut_coul_innersq_global MY_AP(cut_coul_innersq_global) +#define _denom_lj_inv MY_AP(denom_lj_inv) +#define _denom_coul_inv MY_AP(denom_coul_inv) +__device__ __constant__ F_FLOAT _cut_coul_innersq_global; +__device__ __constant__ F_FLOAT _denom_lj_inv; +__device__ __constant__ F_FLOAT _denom_coul_inv; + + +#include "pair_lj_charmm_coul_charmm_implicit_cuda_cu.h" +#include "pair_lj_charmm_coul_charmm_implicit_cuda_kernel_nc.cu" + +#include + +void Cuda_PairLJCharmmCoulCharmmImplicitCuda_Init(cuda_shared_data* sdata,F_FLOAT cut_coul_innersq,F_FLOAT denom_lj_inv,F_FLOAT denom_coul_inv) +{ + Cuda_Pair_Init_AllStyles(sdata, 4,true,true,true); + cudaMemcpyToSymbol(MY_CONST(cut_coul_innersq_global) , &cut_coul_innersq , sizeof(F_FLOAT) ); + cudaMemcpyToSymbol(MY_CONST(denom_lj_inv) , &denom_lj_inv , sizeof(F_FLOAT) ); + cudaMemcpyToSymbol(MY_CONST(denom_coul_inv) , &denom_coul_inv , sizeof(F_FLOAT) ); + + 
return; +} + + + +void Cuda_PairLJCharmmCoulCharmmImplicitCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, + int eflag_atom,int vflag_atom,F_FLOAT denom_lj,F_FLOAT cut_coul_innersq,F_FLOAT denom_coul) +{ + + static short init=0; + if(! init) + { + init = 1; + Cuda_PairLJCharmmCoulCharmmImplicitCuda_Init(sdata,cut_coul_innersq,1.0/denom_lj,1.0/denom_coul); + } + + dim3 grid,threads; + int sharedperproc; + + Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc,true,192); + + cudaStream_t* streams = (cudaStream_t*) CudaWrapper_returnStreams(); + if(sdata->pair.use_block_per_atom) + Pair_Kernel_BpA + <<>> (eflag, vflag,eflag_atom,vflag_atom); + else + Pair_Kernel_TpA + <<>> (eflag, vflag,eflag_atom,vflag_atom); + + Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag); +} + +#undef _lj1 +#undef _lj2 +#undef _lj3 +#undef _lj4 diff --git a/lib/cuda/pair_lj_charmm_coul_charmm_implicit_cuda_cu.h b/lib/cuda/pair_lj_charmm_coul_charmm_implicit_cuda_cu.h new file mode 100644 index 0000000000..119163b291 --- /dev/null +++ b/lib/cuda/pair_lj_charmm_coul_charmm_implicit_cuda_cu.h @@ -0,0 +1,26 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ + +#include "cuda_shared.h" + +extern "C" void Cuda_PairLJCharmmCoulCharmmImplicitCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,int eflag_atom,int vflag_atom,F_FLOAT denom_lj,F_FLOAT cut_coul_innersq,F_FLOAT denom_coul); diff --git a/lib/cuda/pair_lj_charmm_coul_charmm_implicit_cuda_kernel_nc.cu b/lib/cuda/pair_lj_charmm_coul_charmm_implicit_cuda_kernel_nc.cu new file mode 100644 index 0000000000..c67037b7ce --- /dev/null +++ b/lib/cuda/pair_lj_charmm_coul_charmm_implicit_cuda_kernel_nc.cu @@ -0,0 +1,42 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ + +__device__ inline F_FLOAT CoulCharmmImplicitCuda_Eval(const F_FLOAT& rsq,F_FLOAT& factor_coul,int& eflag, ENERGY_FLOAT& ecoul, F_FLOAT qij) +{ + F_FLOAT forcecoul; + ENERGY_FLOAT ecoul_tmp = forcecoul = _qqrd2e * qij *(F_F(1.0)/rsq)*factor_coul; + if (rsq > _cut_coul_innersq_global) { + const F_FLOAT switch1 = (_cut_coulsq_global-rsq) * (_cut_coulsq_global-rsq) * + (_cut_coulsq_global + F_F(2.0)*rsq - F_F(3.0)*_cut_coul_innersq_global) * _denom_coul_inv; + ecoul_tmp *= switch1; + const F_FLOAT switch2 = F_F(12.0)*rsq * (_cut_coulsq_global-rsq) * + (rsq-_cut_coul_innersq_global) * _denom_coul_inv; + forcecoul *= (switch1 + switch2); + } + if(eflag) + { + ecoul += ecoul_tmp*factor_coul; + } + return F_F(2.0)*forcecoul*(F_F(1.0)/rsq); +} + diff --git a/lib/cuda/pair_lj_charmm_coul_long_cuda.cu b/lib/cuda/pair_lj_charmm_coul_long_cuda.cu new file mode 100644 index 0000000000..7c1a5ac46c --- /dev/null +++ b/lib/cuda/pair_lj_charmm_coul_long_cuda.cu @@ -0,0 +1,75 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ + +#include + +#define _lj1 MY_AP(coeff1_gm) +#define _lj2 MY_AP(coeff2_gm) +#define _lj3 MY_AP(coeff3_gm) +#define _lj4 MY_AP(coeff4_gm) + +#include "pair_lj_charmm_coul_long_cuda_cu.h" + +#include + +void Cuda_PairLJCharmmCoulLongCuda_Init(cuda_shared_data* sdata,F_FLOAT denom_lj_inv) +{ + Cuda_Pair_Init_AllStyles(sdata, 4,true,true,true); + cudaMemcpyToSymbol(MY_CONST(denom_lj_inv) , &denom_lj_inv , sizeof(F_FLOAT) ); + + return; +} + + + +void Cuda_PairLJCharmmCoulLongCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, + int eflag_atom,int vflag_atom,F_FLOAT denom_lj) +{ + + static short init=0; + if(! init) + { + init = 1; + Cuda_PairLJCharmmCoulLongCuda_Init(sdata,1.0/denom_lj); + } + + dim3 grid,threads; + int sharedperproc; + + Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc,true,192); + + cudaStream_t* streams = (cudaStream_t*) CudaWrapper_returnStreams(); + if(sdata->pair.use_block_per_atom) + Pair_Kernel_BpA + <<>> (eflag, vflag,eflag_atom,vflag_atom); + else + Pair_Kernel_TpA + <<>> (eflag, vflag,eflag_atom,vflag_atom); + + Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag); +} + +#undef _lj1 +#undef _lj2 +#undef _lj3 +#undef _lj4 diff --git a/lib/cuda/pair_lj_charmm_coul_long_cuda_cu.h b/lib/cuda/pair_lj_charmm_coul_long_cuda_cu.h new file mode 100644 index 0000000000..0f29e8f97b --- /dev/null +++ b/lib/cuda/pair_lj_charmm_coul_long_cuda_cu.h @@ -0,0 +1,26 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. 
+ + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#include "cuda_shared.h" + +extern "C" void Cuda_PairLJCharmmCoulLongCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,int eflag_atom,int vflag_atom,F_FLOAT denom_lj); diff --git a/lib/cuda/pair_lj_class2_coul_cut_cuda.cu b/lib/cuda/pair_lj_class2_coul_cut_cuda.cu new file mode 100644 index 0000000000..7cd53d31ff --- /dev/null +++ b/lib/cuda/pair_lj_class2_coul_cut_cuda.cu @@ -0,0 +1,72 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ + +#include + +#define _lj1 MY_AP(coeff1) +#define _lj2 MY_AP(coeff2) +#define _lj3 MY_AP(coeff3) +#define _lj4 MY_AP(coeff4) + +#include "pair_lj_class2_coul_cut_cuda_cu.h" + +#include + +void Cuda_PairLJClass2CoulCutCuda_Init(cuda_shared_data* sdata) +{ + Cuda_Pair_Init_AllStyles(sdata, 4,true); +} + +void Cuda_PairLJClass2CoulCutCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,int eflag_atom,int vflag_atom) +{ + + + static short init=0; + if(! init) + { + init = 1; + Cuda_PairLJClass2CoulCutCuda_Init(sdata); + } + + dim3 grid,threads; + int sharedperproc; + + + Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc,true,192); + + cudaStream_t* streams = (cudaStream_t*) CudaWrapper_returnStreams(); + if(sdata->pair.use_block_per_atom) + Pair_Kernel_BpA + <<>> (eflag, vflag,eflag_atom,vflag_atom); + else + Pair_Kernel_TpA + <<>> (eflag, vflag,eflag_atom,vflag_atom); + + Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag); +} + + +#undef _lj1 +#undef _lj2 +#undef _lj3 +#undef _lj4 diff --git a/lib/cuda/pair_lj_class2_coul_cut_cuda_cu.h b/lib/cuda/pair_lj_class2_coul_cut_cuda_cu.h new file mode 100644 index 0000000000..a656ebbd89 --- /dev/null +++ b/lib/cuda/pair_lj_class2_coul_cut_cuda_cu.h @@ -0,0 +1,26 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. 
+ + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#include "cuda_shared.h" + +extern "C" void Cuda_PairLJClass2CoulCutCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,int eflag_atom,int vflag_atom); diff --git a/lib/cuda/pair_lj_class2_coul_long_cuda.cu b/lib/cuda/pair_lj_class2_coul_long_cuda.cu new file mode 100644 index 0000000000..4f15d42936 --- /dev/null +++ b/lib/cuda/pair_lj_class2_coul_long_cuda.cu @@ -0,0 +1,72 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ + +#include + +#define _lj1 MY_AP(coeff1) +#define _lj2 MY_AP(coeff2) +#define _lj3 MY_AP(coeff3) +#define _lj4 MY_AP(coeff4) + +#include "pair_lj_class2_coul_long_cuda_cu.h" + +#include + +void Cuda_PairLJClass2CoulLongCuda_Init(cuda_shared_data* sdata) +{ + Cuda_Pair_Init_AllStyles(sdata, 4,true); +} + +void Cuda_PairLJClass2CoulLongCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,int eflag_atom,int vflag_atom) +{ + + + static short init=0; + if(! init) + { + init = 1; + Cuda_PairLJClass2CoulLongCuda_Init(sdata); + } + + dim3 grid,threads; + int sharedperproc; + + + Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc,true,192); + + cudaStream_t* streams = (cudaStream_t*) CudaWrapper_returnStreams(); + if(sdata->pair.use_block_per_atom) + Pair_Kernel_BpA + <<>> (eflag, vflag,eflag_atom,vflag_atom); + else + Pair_Kernel_TpA + <<>> (eflag, vflag,eflag_atom,vflag_atom); + + Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag); +} + + +#undef _lj1 +#undef _lj2 +#undef _lj3 +#undef _lj4 diff --git a/lib/cuda/pair_lj_class2_coul_long_cuda_cu.h b/lib/cuda/pair_lj_class2_coul_long_cuda_cu.h new file mode 100644 index 0000000000..dea620defe --- /dev/null +++ b/lib/cuda/pair_lj_class2_coul_long_cuda_cu.h @@ -0,0 +1,26 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. 
+ + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#include "cuda_shared.h" + +extern "C" void Cuda_PairLJClass2CoulLongCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,int eflag_atom,int vflag_atom); diff --git a/lib/cuda/pair_lj_class2_cuda.cu b/lib/cuda/pair_lj_class2_cuda.cu new file mode 100644 index 0000000000..1064d12cf6 --- /dev/null +++ b/lib/cuda/pair_lj_class2_cuda.cu @@ -0,0 +1,74 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ + +#include + +#define _lj1 MY_AP(coeff1) +#define _lj2 MY_AP(coeff2) +#define _lj3 MY_AP(coeff3) +#define _lj4 MY_AP(coeff4) + +#include "pair_lj_class2_cuda_cu.h" +#include "pair_lj_class2_cuda_kernel_nc.cu" + +#include + +void Cuda_PairLJClass2Cuda_Init(cuda_shared_data* sdata) +{ + Cuda_Pair_Init_AllStyles(sdata, 4); +} + +void Cuda_PairLJClass2Cuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,int eflag_atom,int vflag_atom) +{ + + + static short init=0; + if(! init) + { + init = 1; + Cuda_PairLJClass2Cuda_Init(sdata); + } + + dim3 grid,threads; + int sharedperproc; + + //int maxthreads=192*sizeof(double)/sizeof(F_FLOAT); + //if(CUDA_ARCH==20) maxthreads*=2; + //cudaFuncSetCacheConfig(Pair_Kernel_TpA_opt,cudaFuncCachePreferL1); + Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc,false,192); + cudaStream_t* streams = (cudaStream_t*) CudaWrapper_returnStreams(); + if(sdata->pair.use_block_per_atom) + Pair_Kernel_BpA + <<>> (eflag, vflag,eflag_atom,vflag_atom); + else + Pair_Kernel_TpA + <<>> (eflag, vflag,eflag_atom,vflag_atom); + + Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag); +} + + +#undef _lj1 +#undef _lj2 +#undef _lj3 +#undef _lj4 diff --git a/lib/cuda/pair_lj_class2_cuda_cu.h b/lib/cuda/pair_lj_class2_cuda_cu.h new file mode 100644 index 0000000000..cc14d9eda4 --- /dev/null +++ b/lib/cuda/pair_lj_class2_cuda_cu.h @@ -0,0 +1,26 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. 
+ + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#include "cuda_shared.h" + +extern "C" void Cuda_PairLJClass2Cuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,int eflag_atom,int vflag_atom); diff --git a/lib/cuda/pair_lj_class2_cuda_kernel_nc.cu b/lib/cuda/pair_lj_class2_cuda_kernel_nc.cu new file mode 100644 index 0000000000..e5674d8b74 --- /dev/null +++ b/lib/cuda/pair_lj_class2_cuda_kernel_nc.cu @@ -0,0 +1,33 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ + +__device__ inline F_FLOAT PairLJClass2Cuda_Eval(const F_FLOAT& rsq,const int ij_type,F_FLOAT& factor_lj,int& eflag, ENERGY_FLOAT& evdwl) +{ + const F_FLOAT r2inv = F_F(1.0)/rsq; + const F_FLOAT r6inv = r2inv*r2inv*r2inv; + const F_FLOAT r3inv = _SQRT_(r6inv); + if (eflag) evdwl += factor_lj*(r6inv*(_lj3[ij_type]*r3inv- + _lj4[ij_type]) - _offset[ij_type]); + return factor_lj*r6inv * (_lj1[ij_type]*r3inv - _lj2[ij_type])*r2inv; +} + diff --git a/lib/cuda/pair_lj_cut_coul_cut_cuda.cu b/lib/cuda/pair_lj_cut_coul_cut_cuda.cu new file mode 100644 index 0000000000..c3b4a40749 --- /dev/null +++ b/lib/cuda/pair_lj_cut_coul_cut_cuda.cu @@ -0,0 +1,72 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ + +#include + +#define _lj1 MY_AP(coeff1) +#define _lj2 MY_AP(coeff2) +#define _lj3 MY_AP(coeff3) +#define _lj4 MY_AP(coeff4) + +#include "pair_lj_cut_coul_cut_cuda_cu.h" + +#include + +void Cuda_PairLJCutCoulCutCuda_Init(cuda_shared_data* sdata) +{ + Cuda_Pair_Init_AllStyles(sdata, 4,true); +} + +void Cuda_PairLJCutCoulCutCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,int eflag_atom,int vflag_atom) +{ + + + static short init=0; + if(! init) + { + init = 1; + Cuda_PairLJCutCoulCutCuda_Init(sdata); + } + + dim3 grid,threads; + int sharedperproc; + + + Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc,true,192); + + cudaStream_t* streams = (cudaStream_t*) CudaWrapper_returnStreams(); + if(sdata->pair.use_block_per_atom) + Pair_Kernel_BpA + <<>> (eflag, vflag,eflag_atom,vflag_atom); + else + Pair_Kernel_TpA + <<>> (eflag, vflag,eflag_atom,vflag_atom); + + Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag); +} + + +#undef _lj1 +#undef _lj2 +#undef _lj3 +#undef _lj4 diff --git a/lib/cuda/pair_lj_cut_coul_cut_cuda_cu.h b/lib/cuda/pair_lj_cut_coul_cut_cuda_cu.h new file mode 100644 index 0000000000..95fadcd39b --- /dev/null +++ b/lib/cuda/pair_lj_cut_coul_cut_cuda_cu.h @@ -0,0 +1,26 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. 
+ + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#include "cuda_shared.h" + +extern "C" void Cuda_PairLJCutCoulCutCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,int eflag_atom,int vflag_atom); diff --git a/lib/cuda/pair_lj_cut_coul_debye_cuda.cu b/lib/cuda/pair_lj_cut_coul_debye_cuda.cu new file mode 100644 index 0000000000..f5e074ba82 --- /dev/null +++ b/lib/cuda/pair_lj_cut_coul_debye_cuda.cu @@ -0,0 +1,71 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ + +#include + +#define _lj1 MY_AP(coeff1) +#define _lj2 MY_AP(coeff2) +#define _lj3 MY_AP(coeff3) +#define _lj4 MY_AP(coeff4) + +#include "pair_lj_cut_coul_debye_cuda_cu.h" + +#include + +void Cuda_PairLJCutCoulDebyeCuda_Init(cuda_shared_data* sdata) +{ + Cuda_Pair_Init_AllStyles(sdata, 4,true); +} + +void Cuda_PairLJCutCoulDebyeCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,int eflag_atom,int vflag_atom) +{ + + + static short init=0; + if(! init) + { + init = 1; + Cuda_PairLJCutCoulDebyeCuda_Init(sdata); + } + + dim3 grid,threads; + int sharedperproc; + + + Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc,true,192); + + cudaStream_t* streams = (cudaStream_t*) CudaWrapper_returnStreams(); + if(sdata->pair.use_block_per_atom) + Pair_Kernel_BpA + <<>> (eflag, vflag,eflag_atom,vflag_atom); + else + Pair_Kernel_TpA + <<>> (eflag, vflag,eflag_atom,vflag_atom); + + Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag); +} + +#undef _lj1 +#undef _lj2 +#undef _lj3 +#undef _lj4 diff --git a/lib/cuda/pair_lj_cut_coul_debye_cuda_cu.h b/lib/cuda/pair_lj_cut_coul_debye_cuda_cu.h new file mode 100644 index 0000000000..b6df066ac1 --- /dev/null +++ b/lib/cuda/pair_lj_cut_coul_debye_cuda_cu.h @@ -0,0 +1,26 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. 
+ + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#include "cuda_shared.h" + +extern "C" void Cuda_PairLJCutCoulDebyeCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,int eflag_atom,int vflag_atom); diff --git a/lib/cuda/pair_lj_cut_coul_long_cuda.cu b/lib/cuda/pair_lj_cut_coul_long_cuda.cu new file mode 100644 index 0000000000..dd3e1df978 --- /dev/null +++ b/lib/cuda/pair_lj_cut_coul_long_cuda.cu @@ -0,0 +1,72 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ + +#include + +#define _lj1 MY_AP(coeff1) +#define _lj2 MY_AP(coeff2) +#define _lj3 MY_AP(coeff3) +#define _lj4 MY_AP(coeff4) + +#include "pair_lj_cut_coul_long_cuda_cu.h" + +#include + +void Cuda_PairLJCutCoulLongCuda_Init(cuda_shared_data* sdata) +{ + Cuda_Pair_Init_AllStyles(sdata, 4,true); +} + +void Cuda_PairLJCutCoulLongCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,int eflag_atom,int vflag_atom) +{ + + + static short init=0; + if(! init) + { + init = 1; + Cuda_PairLJCutCoulLongCuda_Init(sdata); + } + + dim3 grid,threads; + int sharedperproc; + + + Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc,true,192); + + cudaStream_t* streams = (cudaStream_t*) CudaWrapper_returnStreams(); + if(sdata->pair.use_block_per_atom) + Pair_Kernel_BpA + <<>> (eflag, vflag,eflag_atom,vflag_atom); + else + Pair_Kernel_TpA + <<>> (eflag, vflag,eflag_atom,vflag_atom); + + Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag); +} + + +#undef _lj1 +#undef _lj2 +#undef _lj3 +#undef _lj4 diff --git a/lib/cuda/pair_lj_cut_coul_long_cuda_cu.h b/lib/cuda/pair_lj_cut_coul_long_cuda_cu.h new file mode 100644 index 0000000000..9cac5457bd --- /dev/null +++ b/lib/cuda/pair_lj_cut_coul_long_cuda_cu.h @@ -0,0 +1,30 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. 
+ + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#include "cuda_shared.h" + +#ifdef CUDA_USE_BINNING +extern "C" void Cuda_PairLJCutCoulLongCuda(cuda_shared_data* sdata, int eflag, int vflag); +#else +extern "C" void Cuda_PairLJCutCoulLongCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,int eflag_atom,int vflag_atom); +#endif diff --git a/lib/cuda/pair_lj_cut_cuda.cu b/lib/cuda/pair_lj_cut_cuda.cu new file mode 100644 index 0000000000..8f0c862004 --- /dev/null +++ b/lib/cuda/pair_lj_cut_cuda.cu @@ -0,0 +1,74 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ + +#include <stdio.h> + +#define _lj1 MY_AP(coeff1) +#define _lj2 MY_AP(coeff2) +#define _lj3 MY_AP(coeff3) +#define _lj4 MY_AP(coeff4) + +#include "pair_lj_cut_cuda_cu.h" +#include "pair_lj_cut_cuda_kernel_nc.cu" + +#include <time.h> + +void Cuda_PairLJCutCuda_Init(cuda_shared_data* sdata) +{ + Cuda_Pair_Init_AllStyles<PAIR_LJ_CUT,COUL_NONE,DATA_NONE>(sdata, 4); +} + +void Cuda_PairLJCutCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,int eflag_atom,int vflag_atom) +{ + + + static short init=0; + if(! init) + { + init = 1; + Cuda_PairLJCutCuda_Init(sdata); + } + + dim3 grid,threads; + int sharedperproc; + + //int maxthreads=192*sizeof(double)/sizeof(F_FLOAT); + //if(CUDA_ARCH==20) maxthreads*=2; + //cudaFuncSetCacheConfig(Pair_Kernel_TpA_opt,cudaFuncCachePreferL1); + Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc,false,192); + cudaStream_t* streams = (cudaStream_t*) CudaWrapper_returnStreams(); + if(sdata->pair.use_block_per_atom) + Pair_Kernel_BpA<PAIR_LJ_CUT,COUL_NONE,DATA_NONE> + <<<grid, threads,sharedperproc*sizeof(ENERGY_FLOAT)*threads.x,streams[1]>>> (eflag, vflag,eflag_atom,vflag_atom); + else + Pair_Kernel_TpA<PAIR_LJ_CUT,COUL_NONE,DATA_NONE> + <<<grid, threads,sharedperproc*sizeof(ENERGY_FLOAT)*threads.x,streams[1]>>> (eflag, vflag,eflag_atom,vflag_atom); + + Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag); +} + + +#undef _lj1 +#undef _lj2 +#undef _lj3 +#undef _lj4 diff --git a/lib/cuda/pair_lj_cut_cuda_cu.h b/lib/cuda/pair_lj_cut_cuda_cu.h new file mode 100644 index 0000000000..9d9722501f --- /dev/null +++ b/lib/cuda/pair_lj_cut_cuda_cu.h @@ -0,0 +1,30 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. 
+ + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#include "cuda_shared.h" + +#ifdef CUDA_USE_BINNING +extern "C" void Cuda_PairLJCutCuda(cuda_shared_data* sdata, int eflag, int vflag); +#else +extern "C" void Cuda_PairLJCutCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,int eflag_atom,int vflag_atom); +#endif diff --git a/lib/cuda/pair_lj_cut_cuda_kernel_nc.cu b/lib/cuda/pair_lj_cut_cuda_kernel_nc.cu new file mode 100644 index 0000000000..d263e4a5cf --- /dev/null +++ b/lib/cuda/pair_lj_cut_cuda_kernel_nc.cu @@ -0,0 +1,32 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ + +__device__ inline F_FLOAT PairLJCutCuda_Eval(const F_FLOAT& rsq,const int ij_type,F_FLOAT& factor_lj,int& eflag, ENERGY_FLOAT& evdwl) +{ + const F_FLOAT r2inv = F_F(1.0)/rsq; + const F_FLOAT r6inv = r2inv*r2inv*r2inv; + if (eflag) evdwl += factor_lj*(r6inv*(_lj3[ij_type]*r6inv- + _lj4[ij_type]) - _offset[ij_type]); + return factor_lj*r6inv * (_lj1[ij_type]*r6inv - _lj2[ij_type])*r2inv; +} + diff --git a/lib/cuda/pair_lj_cut_experimental_cuda.cu b/lib/cuda/pair_lj_cut_experimental_cuda.cu new file mode 100644 index 0000000000..6996c02236 --- /dev/null +++ b/lib/cuda/pair_lj_cut_experimental_cuda.cu @@ -0,0 +1,75 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ + +#include <stdio.h> + +#define _lj1 MY_AP(coeff1) +#define _lj2 MY_AP(coeff2) +#define _lj3 MY_AP(coeff3) +#define _lj4 MY_AP(coeff4) + +#include "pair_lj_cut_experimental_cuda_cu.h" + +#include <time.h> + +void Cuda_PairLJCutExperimentalCuda_Init(cuda_shared_data* sdata) +{ + Cuda_Pair_Init_AllStyles<PAIR_LJ_CUT,COUL_NONE,DATA_NONE>(sdata, 4); +} + +void Cuda_PairLJCutExperimentalCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,int eflag_atom,int vflag_atom) +{ + + + static short init=0; + if(! init) + { + init = 1; + Cuda_PairLJCutExperimentalCuda_Init(sdata); + } + + dim3 grid,threads; + int sharedperproc; + + //int maxthreads=192*sizeof(double)/sizeof(F_FLOAT); + //if(CUDA_ARCH==20) maxthreads*=2; + //cudaFuncSetCacheConfig(Pair_Kernel_TpA_opt,cudaFuncCachePreferL1); + Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc,false,192); + if (sharedperproc==0) sharedperproc++; + //printf("comm_phase: %i\n",sdata->comm.comm_phase); + + cudaStream_t* streams = (cudaStream_t*) CudaWrapper_returnStreams(); + if(sdata->pair.use_block_per_atom) + Pair_Kernel_BpA<PAIR_LJ_CUT,COUL_NONE,DATA_NONE> + <<<grid, threads,sharedperproc*sizeof(ENERGY_FLOAT)*threads.x,streams[1]>>> (eflag, vflag,eflag_atom,vflag_atom); + else + Pair_Kernel_TpA_opt<PAIR_LJ_CUT,COUL_NONE,DATA_NONE> + <<<grid, threads,sharedperproc*sizeof(ENERGY_FLOAT)*threads.x,streams[1]>>> (eflag, vflag,eflag_atom,vflag_atom,sdata->comm.comm_phase); + Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag); +} + + +#undef _lj1 +#undef _lj2 +#undef _lj3 +#undef _lj4 diff --git a/lib/cuda/pair_lj_cut_experimental_cuda_cu.h b/lib/cuda/pair_lj_cut_experimental_cuda_cu.h new file mode 100644 index 0000000000..4cc1f6de36 --- /dev/null +++ b/lib/cuda/pair_lj_cut_experimental_cuda_cu.h @@ -0,0 +1,26 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS 
directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#include "cuda_shared.h" + +extern "C" void Cuda_PairLJCutExperimentalCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,int eflag_atom,int vflag_atom); diff --git a/lib/cuda/pair_lj_expand_cuda.cu b/lib/cuda/pair_lj_expand_cuda.cu new file mode 100644 index 0000000000..e1fa43d050 --- /dev/null +++ b/lib/cuda/pair_lj_expand_cuda.cu @@ -0,0 +1,77 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ + +#include <stdio.h> + +#define _lj1 MY_AP(coeff1) +#define _lj2 MY_AP(coeff2) +#define _lj3 MY_AP(coeff3) +#define _lj4 MY_AP(coeff4) +#define _shift MY_AP(coeff5) + +#include "pair_lj_expand_cuda_cu.h" +#include "pair_lj_expand_cuda_kernel_nc.cu" +#include <time.h> + + +void Cuda_PairLJExpandCuda_Init(cuda_shared_data* sdata) +{ + Cuda_Pair_Init_AllStyles<PAIR_LJ_EXPAND,COUL_NONE,DATA_NONE>(sdata, 5); +} + + + + +void Cuda_PairLJExpandCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,int eflag_atom,int vflag_atom) +{ + + // initialize only on first call + static short init=0; + if(! init) + { + init = 1; + Cuda_PairLJExpandCuda_Init(sdata); + } + + dim3 grid,threads; + int sharedperproc; + + Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc,false,256); + + cudaStream_t* streams = (cudaStream_t*) CudaWrapper_returnStreams(); + if(sdata->pair.use_block_per_atom) + Pair_Kernel_BpA<PAIR_LJ_EXPAND,COUL_NONE,DATA_NONE> + <<<grid, threads,sharedperproc*sizeof(ENERGY_FLOAT)*threads.x,streams[1]>>> (eflag, vflag,eflag_atom,vflag_atom); + else + Pair_Kernel_TpA<PAIR_LJ_EXPAND,COUL_NONE,DATA_NONE> + <<<grid, threads,sharedperproc*sizeof(ENERGY_FLOAT)*threads.x,streams[1]>>> (eflag, vflag,eflag_atom,vflag_atom); + + Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag); +} + +#undef _lj1 +#undef _lj2 +#undef _lj3 +#undef _lj4 + + diff --git a/lib/cuda/pair_lj_expand_cuda_cu.h b/lib/cuda/pair_lj_expand_cuda_cu.h new file mode 100644 index 0000000000..24164b6fa7 --- /dev/null +++ b/lib/cuda/pair_lj_expand_cuda_cu.h @@ -0,0 +1,26 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. 
+ + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#include "cuda_shared.h" + +extern "C" void Cuda_PairLJExpandCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,int eflag_atom,int vflag_atom); diff --git a/lib/cuda/pair_lj_expand_cuda_kernel_nc.cu b/lib/cuda/pair_lj_expand_cuda_kernel_nc.cu new file mode 100644 index 0000000000..533bd761fc --- /dev/null +++ b/lib/cuda/pair_lj_expand_cuda_kernel_nc.cu @@ -0,0 +1,33 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ +__device__ inline F_FLOAT PairLJExpandCuda_Eval(const F_FLOAT& rsq,const int ij_type,F_FLOAT& factor_lj,int& eflag, ENERGY_FLOAT& evdwl) +{ + const F_FLOAT r = _SQRT_(rsq); + const F_FLOAT rshift = r - _shift[ij_type]; + const F_FLOAT rshiftsq = rshift*rshift; + const F_FLOAT r2inv = F_F(1.0)/rshiftsq; + const F_FLOAT r6inv = r2inv*r2inv*r2inv; + const F_FLOAT forcelj = r6inv * (_lj1[ij_type]*r6inv - _lj2[ij_type]); + if(eflag) evdwl += factor_lj*(r6inv*(_lj3[ij_type]*r6inv-_lj4[ij_type]) - _offset[ij_type]); + return factor_lj*forcelj*(F_F(1.0)/rshift)*(F_F(1.0)/r); +} diff --git a/lib/cuda/pair_lj_gromacs_coul_gromacs_cuda.cu b/lib/cuda/pair_lj_gromacs_coul_gromacs_cuda.cu new file mode 100644 index 0000000000..7532e4b643 --- /dev/null +++ b/lib/cuda/pair_lj_gromacs_coul_gromacs_cuda.cu @@ -0,0 +1,102 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ + +#include <stdio.h> + +#define _lj1 MY_AP(coeff1_gm) +#define _lj2 MY_AP(coeff2_gm) +#define _lj3 MY_AP(coeff3_gm) +#define _lj4 MY_AP(coeff4_gm) +#define _ljsw1 MY_AP(coeff5_gm) +#define _ljsw2 MY_AP(coeff6_gm) +#define _ljsw3 MY_AP(coeff7_gm) +#define _ljsw4 MY_AP(coeff8_gm) +#define _ljsw5 MY_AP(coeff9_gm) + +#define _cut_coul_inner_global MY_AP(cut_coul_inner_global) +#define _coulsw1 MY_AP(coulsw1) +#define _coulsw2 MY_AP(coulsw2) +#define _coulsw5 MY_AP(coulsw5) +__device__ __constant__ F_FLOAT _cut_coul_inner_global; +__device__ __constant__ F_FLOAT _coulsw1; +__device__ __constant__ F_FLOAT _coulsw2; +__device__ __constant__ F_FLOAT _coulsw5; + + +#include "pair_lj_gromacs_coul_gromacs_cuda_cu.h" +#include "pair_lj_gromacs_coul_gromacs_cuda_kernel_nc.cu" + +#include <time.h> + +void Cuda_PairLJGromacsCoulGromacsCuda_Init(cuda_shared_data* sdata,F_FLOAT cut_coul_inner,F_FLOAT coulsw1,F_FLOAT coulsw2,F_FLOAT coulsw5) +{ + Cuda_Pair_Init_AllStyles<PAIR_LJ_GROMACS,COUL_GROMACS,DATA_NONE>(sdata, 9,true,true,true); + cudaMemcpyToSymbol(MY_CONST(cut_coul_inner_global) , &cut_coul_inner , sizeof(F_FLOAT) ); + cudaMemcpyToSymbol(MY_CONST(coulsw1) , &coulsw1 , sizeof(F_FLOAT) ); + cudaMemcpyToSymbol(MY_CONST(coulsw2) , &coulsw2 , sizeof(F_FLOAT) ); + cudaMemcpyToSymbol(MY_CONST(coulsw5) , &coulsw5 , sizeof(F_FLOAT) ); + + return; +} + + + +void Cuda_PairLJGromacsCoulGromacsCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, + int eflag_atom,int vflag_atom,F_FLOAT cut_coul_inner,F_FLOAT coulsw1,F_FLOAT coulsw2,F_FLOAT coulsw5) +{ + static short init=0; + if(! 
init) + { + init = 1; + Cuda_PairLJGromacsCoulGromacsCuda_Init(sdata,cut_coul_inner,coulsw1,coulsw2,coulsw5); + } + + dim3 grid,threads; + int sharedperproc; + + Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc,true,192); + + cudaStream_t* streams = (cudaStream_t*) CudaWrapper_returnStreams(); + if(sdata->pair.use_block_per_atom) + Pair_Kernel_BpA<PAIR_LJ_GROMACS,COUL_GROMACS,DATA_NONE> + <<<grid, threads,sharedperproc*sizeof(ENERGY_FLOAT)*threads.x,streams[1]>>> (eflag, vflag,eflag_atom,vflag_atom); + else + Pair_Kernel_TpA<PAIR_LJ_GROMACS,COUL_GROMACS,DATA_NONE> + <<<grid, threads,sharedperproc*sizeof(ENERGY_FLOAT)*threads.x,streams[1]>>> (eflag, vflag,eflag_atom,vflag_atom); + + Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag); +} + +#undef _lj1 +#undef _lj2 +#undef _lj3 +#undef _lj4 +#undef _ljsw1 +#undef _ljsw2 +#undef _ljsw3 +#undef _ljsw4 +#undef _ljsw5 +#undef _cut_coul_inner_global +#undef _coulsw1 +#undef _coulsw2 +#undef _coulsw5 diff --git a/lib/cuda/pair_lj_gromacs_coul_gromacs_cuda_cu.h b/lib/cuda/pair_lj_gromacs_coul_gromacs_cuda_cu.h new file mode 100644 index 0000000000..8dc5f8fcde --- /dev/null +++ b/lib/cuda/pair_lj_gromacs_coul_gromacs_cuda_cu.h @@ -0,0 +1,26 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ + +#include "cuda_shared.h" + +extern "C" void Cuda_PairLJGromacsCoulGromacsCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,int eflag_atom,int vflag_atom,F_FLOAT cut_coul_inner,F_FLOAT coulsw1,F_FLOAT coulsw2,F_FLOAT coulsw5); diff --git a/lib/cuda/pair_lj_gromacs_coul_gromacs_cuda_kernel_nc.cu b/lib/cuda/pair_lj_gromacs_coul_gromacs_cuda_kernel_nc.cu new file mode 100644 index 0000000000..29e0a63c90 --- /dev/null +++ b/lib/cuda/pair_lj_gromacs_coul_gromacs_cuda_kernel_nc.cu @@ -0,0 +1,46 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ + +__device__ inline F_FLOAT CoulGromacsCuda_Eval(const F_FLOAT& rsq,const int ij_type,F_FLOAT& factor_coul,int& eflag, ENERGY_FLOAT& ecoul, F_FLOAT qij) +{ + if (qij != F_F(0.0)) + { + F_FLOAT ecoul_tmp; + F_FLOAT forcecoul = _RSQRT_(rsq); + if(eflag) ecoul_tmp=forcecoul - _coulsw5; + if (rsq > _cut_coul_inner_global*_cut_coul_inner_global) { + const F_FLOAT r = F_F(1.0)/forcecoul; + const F_FLOAT tc = r - _cut_coul_inner_global; + forcecoul += r*tc*tc*(_coulsw1 + _coulsw2*tc); + if(eflag) ecoul_tmp-=tc*tc*tc*(_coulsw1*(F_F(1.0)/F_F(3.0)) + _coulsw2*tc*(F_F(1.0)/F_F(4.0))); + } + F_FLOAT qprod=_qqrd2e * qij*factor_coul; + forcecoul*=qprod; + if(eflag) + { + ecoul += ecoul_tmp*qprod; + } + return forcecoul*(F_F(1.0)/rsq); + } + return F_F(0.0); +} diff --git a/lib/cuda/pair_lj_gromacs_cuda.cu b/lib/cuda/pair_lj_gromacs_cuda.cu new file mode 100644 index 0000000000..ce0c08f6f0 --- /dev/null +++ b/lib/cuda/pair_lj_gromacs_cuda.cu @@ -0,0 +1,83 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ + +#include <stdio.h> + +#define _lj1 MY_AP(coeff1_gm) +#define _lj2 MY_AP(coeff2_gm) +#define _lj3 MY_AP(coeff3_gm) +#define _lj4 MY_AP(coeff4_gm) +#define _ljsw1 MY_AP(coeff5_gm) +#define _ljsw2 MY_AP(coeff6_gm) +#define _ljsw3 MY_AP(coeff7_gm) +#define _ljsw4 MY_AP(coeff8_gm) +#define _ljsw5 MY_AP(coeff9_gm) + +#include "pair_lj_gromacs_cuda_cu.h" +#include "pair_lj_gromacs_cuda_kernel_nc.cu" + +#include <time.h> + +void Cuda_PairLJGromacsCuda_Init(cuda_shared_data* sdata) +{ + Cuda_Pair_Init_AllStyles<PAIR_LJ_GROMACS,COUL_NONE,DATA_NONE>(sdata, 9,false,true,true); +} + + + +void Cuda_PairLJGromacsCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, + int eflag_atom,int vflag_atom) +{ + static short init=0; + if(! init) + { + init = 1; + Cuda_PairLJGromacsCuda_Init(sdata); + } + + dim3 grid,threads; + int sharedperproc; + + Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc,false,192); + + cudaStream_t* streams = (cudaStream_t*) CudaWrapper_returnStreams(); + if(sdata->pair.use_block_per_atom) + Pair_Kernel_BpA<PAIR_LJ_GROMACS,COUL_NONE,DATA_NONE> + <<<grid, threads,sharedperproc*sizeof(ENERGY_FLOAT)*threads.x,streams[1]>>> (eflag, vflag,eflag_atom,vflag_atom); + else + Pair_Kernel_TpA<PAIR_LJ_GROMACS,COUL_NONE,DATA_NONE> + <<<grid, threads,sharedperproc*sizeof(ENERGY_FLOAT)*threads.x,streams[1]>>> (eflag, vflag,eflag_atom,vflag_atom); + + Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag); + +} + +#undef _lj1 +#undef _lj2 +#undef _lj3 +#undef _lj4 +#undef _ljsw1 +#undef _ljsw2 +#undef _ljsw3 +#undef _ljsw4 +#undef _ljsw5 diff --git a/lib/cuda/pair_lj_gromacs_cuda_cu.h b/lib/cuda/pair_lj_gromacs_cuda_cu.h new file mode 100644 index 0000000000..970eb1f832 --- /dev/null +++ b/lib/cuda/pair_lj_gromacs_cuda_cu.h @@ -0,0 +1,26 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. 
+ + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#include "cuda_shared.h" + +extern "C" void Cuda_PairLJGromacsCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,int eflag_atom,int vflag_atom); diff --git a/lib/cuda/pair_lj_gromacs_cuda_kernel_nc.cu b/lib/cuda/pair_lj_gromacs_cuda_kernel_nc.cu new file mode 100644 index 0000000000..818c9f55fc --- /dev/null +++ b/lib/cuda/pair_lj_gromacs_cuda_kernel_nc.cu @@ -0,0 +1,51 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ + +__device__ inline F_FLOAT PairLJGromacsCuda_Eval(const F_FLOAT& rsq,const int ij_type,F_FLOAT& factor_lj,int& eflag, ENERGY_FLOAT& evdwl) +{ + F_FLOAT tlj; + const F_FLOAT r2inv = F_F(1.0)/rsq; + const F_FLOAT r = _RSQRT_(r2inv); + const F_FLOAT r6inv = r2inv*r2inv*r2inv; + F_FLOAT forcelj = r6inv * (_lj1[ij_type]*r6inv - _lj2[ij_type]); + const X_FLOAT cut_lj_innersq=(_cut_innersq_global > X_F(0.0)? _cut_innersq_global : _cut_innersq[ij_type]); + if (rsq > cut_lj_innersq) + { + tlj = r - _SQRT_(cut_lj_innersq); + forcelj += r*tlj*tlj*(_ljsw1[ij_type] + _ljsw2[ij_type]*tlj); + } + + if (eflag) + { + ENERGY_FLOAT evdwl_tmp = r6inv*(_lj3[ij_type]*r6inv-_lj4[ij_type]); + + if (rsq > cut_lj_innersq) + { + evdwl_tmp += tlj*tlj*tlj* + (_ljsw3[ij_type] + _ljsw4[ij_type]*tlj) + _ljsw5[ij_type];; + } + + evdwl+=evdwl_tmp*factor_lj; + } + return factor_lj*forcelj*r2inv; +} diff --git a/lib/cuda/pair_lj_smooth_cuda.cu b/lib/cuda/pair_lj_smooth_cuda.cu new file mode 100644 index 0000000000..5723ffc94c --- /dev/null +++ b/lib/cuda/pair_lj_smooth_cuda.cu @@ -0,0 +1,83 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ + +#include <stdio.h> + +#define _lj1 MY_AP(coeff1_gm) +#define _lj2 MY_AP(coeff2_gm) +#define _lj3 MY_AP(coeff3_gm) +#define _lj4 MY_AP(coeff4_gm) +#define _ljsw1 MY_AP(coeff5_gm) +#define _ljsw2 MY_AP(coeff6_gm) +#define _ljsw3 MY_AP(coeff7_gm) +#define _ljsw4 MY_AP(coeff8_gm) +#define _ljsw0 MY_AP(coeff9_gm) + +#include "pair_lj_smooth_cuda_cu.h" +#include "pair_lj_smooth_cuda_kernel_nc.cu" + +#include <time.h> + +void Cuda_PairLJSmoothCuda_Init(cuda_shared_data* sdata) +{ + Cuda_Pair_Init_AllStyles<PAIR_LJ_SMOOTH,COUL_NONE,DATA_NONE>(sdata, 9,false,true,true); +} + + + +void Cuda_PairLJSmoothCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, + int eflag_atom,int vflag_atom) +{ + // initialize only on first call + static short init=0; + if(! init) + { + init = 1; + Cuda_PairLJSmoothCuda_Init(sdata); + } + + dim3 grid,threads; + int sharedperproc; + + Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc,false,192); + + cudaStream_t* streams = (cudaStream_t*) CudaWrapper_returnStreams(); + if(sdata->pair.use_block_per_atom) + Pair_Kernel_BpA<PAIR_LJ_SMOOTH,COUL_NONE,DATA_NONE> + <<<grid, threads,sharedperproc*sizeof(ENERGY_FLOAT)*threads.x,streams[1]>>> (eflag, vflag,eflag_atom,vflag_atom); + else + Pair_Kernel_TpA<PAIR_LJ_SMOOTH,COUL_NONE,DATA_NONE> + <<<grid, threads,sharedperproc*sizeof(ENERGY_FLOAT)*threads.x,streams[1]>>> (eflag, vflag,eflag_atom,vflag_atom); + + Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag); +} + +#undef _lj1 +#undef _lj2 +#undef _lj3 +#undef _lj4 +#undef _ljsw1 +#undef _ljsw2 +#undef _ljsw3 +#undef _ljsw4 +#undef _ljsw0 diff --git a/lib/cuda/pair_lj_smooth_cuda_cu.h b/lib/cuda/pair_lj_smooth_cuda_cu.h new file mode 100644 index 0000000000..504cf19f98 --- /dev/null +++ b/lib/cuda/pair_lj_smooth_cuda_cu.h @@ -0,0 +1,26 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS 
directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#include "cuda_shared.h" + +extern "C" void Cuda_PairLJSmoothCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,int eflag_atom,int vflag_atom); diff --git a/lib/cuda/pair_lj_smooth_cuda_kernel_nc.cu b/lib/cuda/pair_lj_smooth_cuda_kernel_nc.cu new file mode 100644 index 0000000000..bcac8bf88a --- /dev/null +++ b/lib/cuda/pair_lj_smooth_cuda_kernel_nc.cu @@ -0,0 +1,66 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ + +__device__ inline F_FLOAT PairLJSmoothCuda_Eval(const F_FLOAT& rsq,const int ij_type,F_FLOAT& factor_lj,int& eflag, ENERGY_FLOAT& evdwl) +{ + F_FLOAT fskin,t,tsq,forcelj; + const F_FLOAT r2inv = F_F(1.0)/rsq; + const F_FLOAT r = _RSQRT_(r2inv); + const F_FLOAT r6inv = r2inv*r2inv*r2inv; + + + X_FLOAT cut_lj_innersq=(_cut_innersq_global > X_F(0.0)? _cut_innersq_global : _cut_innersq[ij_type]); + if (rsq < cut_lj_innersq) + { + forcelj = r6inv * (_lj1[ij_type]*r6inv - _lj2[ij_type]); + } + else + { + t = r - _SQRT_(cut_lj_innersq); + tsq = t*t; + fskin = _ljsw1[ij_type] + _ljsw2[ij_type]*t + + _ljsw3[ij_type]*tsq + _ljsw4[ij_type]*tsq*t; + forcelj = fskin*r; + + } + + if (eflag) + { + ENERGY_FLOAT evdwl_tmp; + + if (rsq < cut_lj_innersq) + { + evdwl_tmp = r6inv*(_lj3[ij_type]*r6inv-_lj4[ij_type]) - + _offset[ij_type]; + } + else + { + evdwl_tmp = _ljsw0[ij_type] - _ljsw1[ij_type]*t - + _ljsw2[ij_type]*tsq*F_F(0.5) - _ljsw3[ij_type]*tsq*t*(F_F(1.0)/F_F(3.0)) - + _ljsw4[ij_type]*tsq*tsq*(F_F(1.0)/F_F(4.0)) - _offset[ij_type]; + } + + evdwl+=evdwl_tmp*factor_lj; + } + return factor_lj*forcelj * r2inv; +} diff --git a/lib/cuda/pair_morse_coul_long_cuda.cu b/lib/cuda/pair_morse_coul_long_cuda.cu new file mode 100644 index 0000000000..cb226b58f4 --- /dev/null +++ b/lib/cuda/pair_morse_coul_long_cuda.cu @@ -0,0 +1,79 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. 
+ + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#include <stdio.h> + +#define _r0 MY_AP(coeff1) +#define _alpha MY_AP(coeff2) +#define _morse1 MY_AP(coeff3) +#define _d0 MY_AP(coeff4) +#define _c0 MY_AP(coeff5) + +#include "pair_morse_coul_long_cuda_cu.h" +#include "pair_morse_coul_long_cuda_kernel_nc.cu" + +#include <time.h> + +void Cuda_PairMorseCoulLongCuda_Init(cuda_shared_data* sdata) +{ + Cuda_Pair_Init_AllStyles<PAIR_MORSE,COUL_LONG,DATA_NONE>(sdata, 5,true); +} + +void Cuda_PairMorseCoulLongCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,int eflag_atom,int vflag_atom) +{ + + + static short init=0; + if(! 
init) + { + init = 1; + Cuda_PairMorseCoulLongCuda_Init(sdata); + } + + dim3 grid,threads; + int sharedperproc; + + + Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc,true,192); + + cudaStream_t* streams = (cudaStream_t*) CudaWrapper_returnStreams(); + + if(sdata->pair.use_block_per_atom) + Pair_Kernel_BpA + <<>> (eflag, vflag,eflag_atom,vflag_atom); + else + Pair_Kernel_TpA + <<>> (eflag, vflag,eflag_atom,vflag_atom); + + Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag); +} + + + +/* Undo the per-style coefficient aliases #defined at the top of this file + (_r0/_alpha/_morse1/_d0/_c0 -> MY_AP(coeff1..5)). The previous list + (_rhoinv/_sigma/_a/_c/_d) was copied from the born/buck style and did not + match the macros actually defined here, leaving them defined past this file. */ +#undef _r0 +#undef _alpha +#undef _morse1 +#undef _d0 +#undef _c0 + diff --git a/lib/cuda/pair_morse_coul_long_cuda_cu.h b/lib/cuda/pair_morse_coul_long_cuda_cu.h new file mode 100644 index 0000000000..63055289f4 --- /dev/null +++ b/lib/cuda/pair_morse_coul_long_cuda_cu.h @@ -0,0 +1,30 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ + +#include "cuda_shared.h" + +#ifdef CUDA_USE_BINNING +extern "C" void Cuda_PairMorseCoulLongCuda(cuda_shared_data* sdata, int eflag, int vflag); +#else +extern "C" void Cuda_PairMorseCoulLongCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,int eflag_atom,int vflag_atom); +#endif diff --git a/lib/cuda/pair_morse_coul_long_cuda_kernel_nc.cu b/lib/cuda/pair_morse_coul_long_cuda_kernel_nc.cu new file mode 100644 index 0000000000..b367914a78 --- /dev/null +++ b/lib/cuda/pair_morse_coul_long_cuda_kernel_nc.cu @@ -0,0 +1,33 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ +__device__ inline F_FLOAT PairMorseR6Cuda_Eval(const F_FLOAT& rsq,const int ij_type,F_FLOAT& factor_lj,int& eflag, ENERGY_FLOAT& evdwl) +{ + const F_FLOAT r2inv = F_F(1.0)/rsq; + const F_FLOAT r = _SQRT_(rsq); + const F_FLOAT r4inv = r2inv*r2inv; + const F_FLOAT dr = r-_r0[ij_type]; + const F_FLOAT dexp = _EXP_(-_alpha[ij_type]*dr); + if(eflag) evdwl += factor_lj*(_d0[ij_type]*(dexp*dexp-F_F(2.0)*dexp) + _c0[ij_type]*r4inv*r4inv*r4inv + - _offset[ij_type]); + return factor_lj*(_morse1[ij_type]*(dexp*dexp-dexp)*(F_F(1.0)/r)- F_F(12.0)*_c0[ij_type]*r4inv*r4inv*r4inv*r2inv); +} diff --git a/lib/cuda/pair_morse_cuda.cu b/lib/cuda/pair_morse_cuda.cu new file mode 100644 index 0000000000..d33ac842d3 --- /dev/null +++ b/lib/cuda/pair_morse_cuda.cu @@ -0,0 +1,77 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ + +#include + +#define _r0 MY_AP(coeff1) +#define _alpha MY_AP(coeff2) +#define _morse1 MY_AP(coeff3) +#define _d0 MY_AP(coeff4) + +#include "pair_morse_cuda_cu.h" +#include "pair_morse_cuda_kernel_nc.cu" +#include + + + +void Cuda_PairMorseCuda_Init(cuda_shared_data* sdata) +{ + Cuda_Pair_Init_AllStyles(sdata, 4); +} + + + + +void Cuda_PairMorseCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,int eflag_atom,int vflag_atom) +{ + + // initialize only on first call + static short init=0; + if(! init) + { + init = 1; + Cuda_PairMorseCuda_Init(sdata); + } + + dim3 grid,threads; + int sharedperproc; + + Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc,false,256); + + cudaStream_t* streams = (cudaStream_t*) CudaWrapper_returnStreams(); + if(sdata->pair.use_block_per_atom) + Pair_Kernel_BpA + <<>> (eflag, vflag,eflag_atom,vflag_atom); + else + Pair_Kernel_TpA + <<>> (eflag, vflag,eflag_atom,vflag_atom); + + Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag); +} + +#undef _r0 +#undef _alpha +#undef _morse1 +#undef _d0 + + diff --git a/lib/cuda/pair_morse_cuda_cu.h b/lib/cuda/pair_morse_cuda_cu.h new file mode 100644 index 0000000000..2cfe350458 --- /dev/null +++ b/lib/cuda/pair_morse_cuda_cu.h @@ -0,0 +1,26 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. 
+ + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#include "cuda_shared.h" + +extern "C" void Cuda_PairMorseCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,int eflag_atom,int vflag_atom); diff --git a/lib/cuda/pair_morse_cuda_kernel_nc.cu b/lib/cuda/pair_morse_cuda_kernel_nc.cu new file mode 100644 index 0000000000..ead1c54fb2 --- /dev/null +++ b/lib/cuda/pair_morse_cuda_kernel_nc.cu @@ -0,0 +1,32 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ +/* Evaluate the Morse pair interaction for a single i-j pair. + rsq: squared pair distance; ij_type: index into the per-type-pair + coefficient arrays (_r0 = coeff1, _alpha = coeff2, _morse1 = coeff3, + _d0 = coeff4 -- aliased in pair_morse_cuda.cu). factor_lj is the + special-bonds scale factor. When eflag is set, the scaled pair energy + is accumulated into evdwl. + NOTE(review): the return value appears to be the scalar force divided + by r (fpair), ready for component-wise dx*fpair accumulation by the + Pair_Kernel_TpA/BpA callers -- confirm against those kernels. */ +__device__ inline F_FLOAT PairMorseCuda_Eval(const F_FLOAT& rsq,const int ij_type,F_FLOAT& factor_lj,int& eflag, ENERGY_FLOAT& evdwl) +{ + const F_FLOAT r = _SQRT_(rsq); /* pair distance |r_ij| */ + const F_FLOAT dr = r-_r0[ij_type]; /* displacement from equilibrium distance r0 */ + const F_FLOAT dexp = _EXP_(-_alpha[ij_type]*dr); /* exp(-alpha*(r-r0)) */ + /* Morse energy: d0*(e^2 - 2e) shifted by the cutoff offset */ + if(eflag) evdwl += factor_lj*(_d0[ij_type]*(dexp*dexp-F_F(2.0)*dexp) + - _offset[ij_type]); + return factor_lj*_morse1[ij_type]*(dexp*dexp-dexp)*(F_F(1.0)/r); +} + diff --git a/lib/cuda/pair_virial_compute_cu.h b/lib/cuda/pair_virial_compute_cu.h new file mode 100644 index 0000000000..fdd2cecb8c --- /dev/null +++ b/lib/cuda/pair_virial_compute_cu.h @@ -0,0 +1,26 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ + +#include "cuda_shared.h" + +extern "C" void Cuda_PairVirialCompute(cuda_shared_data* sdata, int offset, int end); diff --git a/lib/cuda/pppm_cuda.cu b/lib/cuda/pppm_cuda.cu new file mode 100644 index 0000000000..cabea885d3 --- /dev/null +++ b/lib/cuda/pppm_cuda.cu @@ -0,0 +1,579 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#include "cuda_precision.h" +//#define FFT_CUFFT +#define MY_PREFIX pppm +#include "cuda_shared.h" +#include "cuda_common.h" +#include "pppm_cuda_cu.h" +#include "cuda_runtime.h" +#include + +//#include "crm_cuda_utils.cu" +#define MIN(a,b) ((a) < (b) ? (a) : (b)) +#define MAX(a,b) ((a) > (b) ? 
(a) : (b)) + + __device__ __constant__ FFT_FLOAT* work1; + __device__ __constant__ FFT_FLOAT* work2; + __device__ __constant__ FFT_FLOAT* work3; + __device__ __constant__ PPPM_FLOAT* greensfn; + __device__ __constant__ PPPM_FLOAT* gf_b; + __device__ __constant__ PPPM_FLOAT* fkx; + __device__ __constant__ PPPM_FLOAT* fky; + __device__ __constant__ PPPM_FLOAT* fkz; + __device__ __constant__ PPPM_FLOAT* vg; + __device__ __constant__ int* part2grid; + __device__ __constant__ PPPM_FLOAT* density_brick; + __device__ __constant__ int* density_brick_int; + __device__ __constant__ PPPM_FLOAT density_intScale; + __device__ __constant__ PPPM_FLOAT* vdx_brick; + __device__ __constant__ PPPM_FLOAT* vdy_brick; + __device__ __constant__ PPPM_FLOAT* vdz_brick; + __device__ __constant__ PPPM_FLOAT* density_fft; + __device__ __constant__ ENERGY_FLOAT* energy; + __device__ __constant__ ENERGY_FLOAT* virial; + __device__ __constant__ int nxlo_in; + __device__ __constant__ int nxhi_in; + __device__ __constant__ int nxlo_out; + __device__ __constant__ int nxhi_out; + __device__ __constant__ int nylo_in; + __device__ __constant__ int nyhi_in; + __device__ __constant__ int nylo_out; + __device__ __constant__ int nyhi_out; + __device__ __constant__ int nzlo_in; + __device__ __constant__ int nzhi_in; + __device__ __constant__ int nzlo_out; + __device__ __constant__ int nzhi_out; + __device__ __constant__ int nxlo_fft; + __device__ __constant__ int nxhi_fft; + __device__ __constant__ int nylo_fft; + __device__ __constant__ int nyhi_fft; + __device__ __constant__ int nzlo_fft; + __device__ __constant__ int nzhi_fft; + __device__ __constant__ int nx_pppm; + __device__ __constant__ int ny_pppm; + __device__ __constant__ int nz_pppm; + __device__ __constant__ int slabflag; + __device__ __constant__ PPPM_FLOAT qqrd2e; + __device__ __constant__ int order; + //__device__ __constant__ float3 sublo; + __device__ __constant__ PPPM_FLOAT* rho_coeff; + __device__ __constant__ int nmax; + __device__ 
__constant__ int nlocal; + __device__ __constant__ PPPM_FLOAT* debugdata; + __device__ __constant__ PPPM_FLOAT delxinv; + __device__ __constant__ PPPM_FLOAT delyinv; + __device__ __constant__ PPPM_FLOAT delzinv; + __device__ __constant__ int nlower; + __device__ __constant__ int nupper; + __device__ __constant__ PPPM_FLOAT shiftone; + + +#include "pppm_cuda_kernel.cu" +#include "stdio.h" +void pppm_device_init(void* cu_density_brick, void* cu_vdx_brick, void* cu_vdy_brick, void* cu_vdz_brick, void* cu_density_fft, void* cu_energy, void* cu_virial + ,void* cu_work1,void* cu_work2, void* cu_work3,void* cu_greensfn, void* cu_fkx, void* cu_fky, void* cu_fkz, void* cu_vg + ,int cu_nxlo_in,int cu_nxhi_in,int cu_nylo_in,int cu_nyhi_in,int cu_nzlo_in,int cu_nzhi_in,int cu_nxlo_out,int cu_nxhi_out,int cu_nylo_out,int cu_nyhi_out,int cu_nzlo_out,int cu_nzhi_out,int cu_nx_pppm,int cu_ny_pppm,int cu_nz_pppm + ,int cu_nxlo_fft,int cu_nxhi_fft,int cu_nylo_fft,int cu_nyhi_fft,int cu_nzlo_fft,int cu_nzhi_fft,void* cu_gf_b + ,double cu_qqrd2e, int cu_order, void* cu_rho_coeff,void* cu_debugdata,void* cu_density_brick_int,int cu_slabflag + ) +{ + CUT_CHECK_ERROR("ERROR-CUDA poisson_init Start"); + cudaMemcpyToSymbol("density_brick",&cu_density_brick, sizeof(PPPM_FLOAT*)); + cudaMemcpyToSymbol("density_brick_int",&cu_density_brick_int, sizeof(PPPM_FLOAT*)); + cudaMemcpyToSymbol("vdx_brick",&cu_vdx_brick, sizeof(PPPM_FLOAT*)); + cudaMemcpyToSymbol("vdy_brick",&cu_vdy_brick, sizeof(PPPM_FLOAT*)); + cudaMemcpyToSymbol("vdz_brick",&cu_vdz_brick, sizeof(PPPM_FLOAT*)); + cudaMemcpyToSymbol("density_fft",&cu_density_fft, sizeof(PPPM_FLOAT*)); + cudaMemcpyToSymbol("energy",&cu_energy, sizeof(ENERGY_FLOAT*)); + cudaMemcpyToSymbol("virial",&cu_virial, sizeof(ENERGY_FLOAT*)); + cudaMemcpyToSymbol("nxlo_in",&cu_nxlo_in, sizeof(int)); + cudaMemcpyToSymbol("nxhi_in",&cu_nxhi_in, sizeof(int)); + cudaMemcpyToSymbol("nxlo_out",&cu_nxlo_out, sizeof(int)); + cudaMemcpyToSymbol("nxhi_out",&cu_nxhi_out, 
sizeof(int)); + cudaMemcpyToSymbol("nylo_in",&cu_nylo_in, sizeof(int)); + cudaMemcpyToSymbol("nyhi_in",&cu_nyhi_in, sizeof(int)); + cudaMemcpyToSymbol("nylo_out",&cu_nylo_out, sizeof(int)); + cudaMemcpyToSymbol("nyhi_out",&cu_nyhi_out, sizeof(int)); + cudaMemcpyToSymbol("nzlo_in",&cu_nzlo_in, sizeof(int)); + cudaMemcpyToSymbol("nzhi_in",&cu_nzhi_in, sizeof(int)); + cudaMemcpyToSymbol("nzlo_out",&cu_nzlo_out, sizeof(int)); + cudaMemcpyToSymbol("nzhi_out",&cu_nzhi_out, sizeof(int)); + cudaMemcpyToSymbol("nxlo_fft",&cu_nxlo_fft, sizeof(int)); + cudaMemcpyToSymbol("nxhi_fft",&cu_nxhi_fft, sizeof(int)); + cudaMemcpyToSymbol("nylo_fft",&cu_nylo_fft, sizeof(int)); + cudaMemcpyToSymbol("nyhi_fft",&cu_nyhi_fft, sizeof(int)); + cudaMemcpyToSymbol("nzlo_fft",&cu_nzlo_fft, sizeof(int)); + cudaMemcpyToSymbol("nzhi_fft",&cu_nzhi_fft, sizeof(int)); + cudaMemcpyToSymbol("slabflag",&cu_slabflag, sizeof(int)); + cudaMemcpyToSymbol("nx_pppm",&cu_nx_pppm, sizeof(int)); + cudaMemcpyToSymbol("ny_pppm",&cu_ny_pppm, sizeof(int)); + cudaMemcpyToSymbol("nz_pppm",&cu_nz_pppm, sizeof(int)); + cudaMemcpyToSymbol("work1",&cu_work1, sizeof(FFT_FLOAT*)); + cudaMemcpyToSymbol("work2",&cu_work2, sizeof(FFT_FLOAT*)); + cudaMemcpyToSymbol("work3",&cu_work3, sizeof(FFT_FLOAT*)); + cudaMemcpyToSymbol("greensfn",&cu_greensfn, sizeof(PPPM_FLOAT*)); + cudaMemcpyToSymbol("gf_b",&cu_gf_b, sizeof(PPPM_FLOAT*)); + cudaMemcpyToSymbol("fkx",&cu_fkx, sizeof(PPPM_FLOAT*)); + cudaMemcpyToSymbol("fky",&cu_fky, sizeof(PPPM_FLOAT*)); + cudaMemcpyToSymbol("fkz",&cu_fkz, sizeof(PPPM_FLOAT*)); + cudaMemcpyToSymbol("vg",&cu_vg, sizeof(PPPM_FLOAT*)); + + PPPM_FLOAT cu_qqrd2e_a=cu_qqrd2e; + cudaMemcpyToSymbol("qqrd2e",&cu_qqrd2e_a, sizeof(PPPM_FLOAT)); + cudaMemcpyToSymbol("order",&cu_order, sizeof(int)); + cudaMemcpyToSymbol("rho_coeff",&cu_rho_coeff, sizeof(PPPM_FLOAT*)); + cudaMemcpyToSymbol("debugdata",&cu_debugdata, sizeof(PPPM_FLOAT*)); + + CUT_CHECK_ERROR("ERROR-CUDA poisson_init"); + 
+/*if(sizeof(CUDA_FLOAT)==sizeof(float)) printf("PPPMCuda Kernel: Using single precision\n"); + +#ifdef PPPM_PRECISION +if(sizeof(PPPM_FLOAT)==sizeof(float)) printf("PPPMCuda Kernel: Using single precision for pppm core\n"); +if(sizeof(PPPM_FLOAT)==sizeof(double)) printf("PPPMCuda Kernel: Using double precision for pppm core\n"); +#endif +#ifdef ENERGY_PRECISION +if(sizeof(ENERGY_FLOAT)==sizeof(float)) printf("PPPMCuda Kernel: Using single precision for energy\n"); +if(sizeof(ENERGY_FLOAT)==sizeof(double)) printf("PPPMCuda Kernel: Using double precision for energy\n"); +#endif +#ifdef ENERGY_PRECISION +if(sizeof(FFT_FLOAT)==sizeof(float)) printf("PPPMCuda Kernel: Using single precision for fft\n"); +if(sizeof(FFT_FLOAT)==sizeof(double)) printf("PPPMCuda Kernel: Using double precision for fft\n"); +#endif +#ifdef X_PRECISION +if(sizeof(X_FLOAT)==sizeof(float)) printf("PPPMCuda Kernel: Using single precision for positions\n"); +if(sizeof(X_FLOAT)==sizeof(double)) printf("PPPMCuda Kernel: Using double precision for positions\n"); +#endif +#ifdef F_PRECISION +if(sizeof(F_FLOAT)==sizeof(float)) printf("PPPMCuda Kernel: Using single precision for forces\n"); +if(sizeof(F_FLOAT)==sizeof(double)) printf("PPPMCuda Kernel: Using double precision for forces\n"); +#endif*/ +} + +void pppm_device_init_setup(cuda_shared_data* sdata,PPPM_FLOAT cu_shiftone,PPPM_FLOAT cu_delxinv,PPPM_FLOAT cu_delyinv,PPPM_FLOAT cu_delzinv,int cu_nlower,int cu_nupper) +{ + cudaMemcpyToSymbol("delxinv",&cu_delxinv, sizeof(PPPM_FLOAT)); + cudaMemcpyToSymbol("delyinv",&cu_delyinv, sizeof(PPPM_FLOAT)); + cudaMemcpyToSymbol("delzinv",&cu_delzinv, sizeof(PPPM_FLOAT)); + cudaMemcpyToSymbol("shiftone",&cu_shiftone, sizeof(PPPM_FLOAT)); + cudaMemcpyToSymbol("nlower",&cu_nlower, sizeof(int)); + cudaMemcpyToSymbol("nupper",&cu_nupper, sizeof(int)); + cudaMemcpyToSymbol(MY_CONST(sublo) , sdata->domain.sublo, 3*sizeof(X_FLOAT)); + cudaMemcpyToSymbol(MY_CONST(subhi) , sdata->domain.subhi, 3*sizeof(X_FLOAT)); + 
cudaMemcpyToSymbol(MY_CONST(boxlo) , sdata->domain.boxlo, 3*sizeof(X_FLOAT)); + CUT_CHECK_ERROR("ERROR-CUDA pppm_init_setup"); +} + +void pppm_device_update(cuda_shared_data* sdata,void* cu_part2grid, int nlocala,int nmaxa) +{ + cudaMemcpyToSymbol("part2grid",&cu_part2grid, sizeof(int*)); + cudaMemcpyToSymbol(MY_CONST(x) , & sdata->atom.x .dev_data, sizeof(X_FLOAT*)); + cudaMemcpyToSymbol(MY_CONST(f) , & sdata->atom.f .dev_data, sizeof(F_FLOAT*)); + cudaMemcpyToSymbol(MY_CONST(q) , & sdata->atom.q .dev_data, sizeof(F_FLOAT*)); + cudaMemcpyToSymbol(MY_CONST(tag) , & sdata->atom.tag .dev_data, sizeof(int*)); + //cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal .dev_data, sizeof(int)); + cudaMemcpyToSymbol("nlocal" , &nlocala, sizeof(int)); + cudaMemcpyToSymbol("nmax" , &nmaxa, sizeof(int)); + CUT_CHECK_ERROR("ERROR-CUDA pppm_device_update"); + +} + +void pppm_update_nlocal(int nlocala) +{ + cudaMemcpyToSymbol("nlocal" , &nlocala, sizeof(int)); + CUT_CHECK_ERROR("ERROR-CUDA update_nlocal b"); +} + + +void Cuda_PPPM_Setup_fkxyz_vg(int nx_pppma,int ny_pppma,int nz_pppma,PPPM_FLOAT unitkx,PPPM_FLOAT unitky,PPPM_FLOAT unitkz,PPPM_FLOAT g_ewald) +{ + dim3 grid; + dim3 threads; + grid.x=nz_pppma; + grid.y=ny_pppma; + grid.z=1; + threads.x=nx_pppma; + threads.y=1; + threads.z=1; + setup_fkxyz_vg<<>>(unitkx,unitky,unitkz,g_ewald); + cudaThreadSynchronize(); + + CUT_CHECK_ERROR("ERROR-CUDA Cuda_PPPM_Setup_fkxyz_vg "); +} + +void Cuda_PPPM_setup_greensfn(int nx_pppma,int ny_pppma,int nz_pppma,PPPM_FLOAT unitkx,PPPM_FLOAT unitky,PPPM_FLOAT unitkz,PPPM_FLOAT g_ewald, +int nbx,int nby,int nbz,PPPM_FLOAT xprd,PPPM_FLOAT yprd,PPPM_FLOAT zprd_slab) +{ + dim3 grid; + dim3 threads; + grid.x=nz_pppma; + grid.y=ny_pppma; + grid.z=1; + threads.x=nx_pppma; + threads.y=1; + threads.z=1; + setup_greensfn<<>>(unitkx,unitky,unitkz,g_ewald,nbx,nby,nbz,xprd,yprd, zprd_slab); + cudaThreadSynchronize(); + CUT_CHECK_ERROR("ERROR-CUDA Cuda_PPPM_Setup_greensfn "); +} + +void 
poisson_scale(int nx_pppma,int ny_pppma,int nz_pppma) +{ + dim3 grid; + dim3 threads; + grid.x=nz_pppma; + grid.y=ny_pppma; + grid.z=1; + threads.x=nx_pppma; + threads.y=1; + threads.z=1; + poisson_scale_kernel<<>>(); + CUT_CHECK_ERROR("ERROR-CUDA poisson_scale "); + +} + +void poisson_xgrad(int nx_pppma,int ny_pppma,int nz_pppma) +{ + dim3 grid; + dim3 threads; + grid.x=nz_pppma; + grid.y=ny_pppma; + grid.z=1; + threads.x=nx_pppma; + threads.y=1; + threads.z=1; + poisson_xgrad_kernel<<>>(); + CUT_CHECK_ERROR("ERROR-CUDA poisson_xgrad "); +} + +void poisson_ygrad(int nx_pppma,int ny_pppma,int nz_pppma) +{ + dim3 grid; + dim3 threads; + grid.x=nz_pppma; + grid.y=ny_pppma; + grid.z=1; + threads.x=nx_pppma; + threads.y=1; + threads.z=1; + poisson_ygrad_kernel<<>>(); + CUT_CHECK_ERROR("ERROR-CUDA poisson_ygrad "); +} + +void poisson_zgrad(int nx_pppma,int ny_pppma,int nz_pppma) +{ + dim3 grid; + dim3 threads; + grid.x=nz_pppma; + grid.y=ny_pppma; + grid.z=1; + threads.x=nx_pppma; + threads.y=1; + threads.z=1; + poisson_zgrad_kernel<<>>(); + CUT_CHECK_ERROR("ERROR-CUDA poisson_zgrad "); +} + +void poisson_vdx_brick(int ihi,int ilo,int jhi,int jlo,int khi,int klo,int nx_pppma,int ny_pppma,int nz_pppma) +{ + + dim3 grid; + dim3 threads; + grid.x=khi-klo+1; + grid.y=jhi-jlo+1; + grid.z=1; + threads.x=ihi-ilo+1; + threads.y=1; + threads.z=1; + //printf("VDX_BRICK CUDA: %i %i %i\n",grid.x,grid.y,threads.x); + poisson_vdx_brick_kernel<<>>(ilo,jlo,klo); + CUT_CHECK_ERROR("ERROR-CUDA poisson_vdxbrick "); + cudaThreadSynchronize(); +} + +void poisson_vdy_brick(int ihi,int ilo,int jhi,int jlo,int khi,int klo,int nx_pppm,int ny_pppm,int nz_pppm) +{ + dim3 grid; + dim3 threads; + grid.x=khi-klo+1; + grid.y=jhi-jlo+1; + grid.z=1; + threads.x=ihi-ilo+1; + threads.y=1; + threads.z=1; + poisson_vdy_brick_kernel<<>>(ilo,jlo,klo); + CUT_CHECK_ERROR("ERROR-CUDA poisson_vdybrick "); + cudaThreadSynchronize(); +} + +void poisson_vdz_brick(int ihi,int ilo,int jhi,int jlo,int khi,int klo,int 
nx_pppm,int ny_pppm,int nz_pppm) +{ + dim3 grid; + dim3 threads; + grid.x=khi-klo+1; + grid.y=jhi-jlo+1; + grid.z=1; + threads.x=ihi-ilo+1; + threads.y=1; + threads.z=1; + poisson_vdz_brick_kernel<<>>(ilo,jlo,klo); + CUT_CHECK_ERROR("ERROR-CUDA poisson_vdzbrick "); + cudaThreadSynchronize(); +} + + +void poisson_energy(int nxlo_fft,int nxhi_fft,int nylo_fft,int nyhi_fft,int nzlo_fft,int nzhi_fft,int vflag) +{ + //printf("VFLAG_GPU: %i\n",vflag); + CUT_CHECK_ERROR("ERROR-CUDA poisson_energy start "); + dim3 grid; + dim3 threads; + grid.x=nzhi_fft-nzlo_fft+1; + grid.y=nyhi_fft-nylo_fft+1; + grid.z=1; + threads.x=nxhi_fft-nxlo_fft+1; + threads.y=1; + threads.z=1; + poisson_energy_kernel<<>>(nxlo_fft,nylo_fft,nzlo_fft,vflag); + + cudaThreadSynchronize(); + CUT_CHECK_ERROR("ERROR-CUDA poisson_energy end "); +} + +ENERGY_FLOAT sum_energy(void* cu_virial,void* cu_energy,int nx_pppma,int ny_pppma,int nz_pppma,int vflag,ENERGY_FLOAT* cpu_virial) +{ + ENERGY_FLOAT host_energy=0; + dim3 grid; + dim3 threads; + + grid.x=nz_pppma; + grid.y=1; + grid.z=1; + threads.x=ny_pppma; + threads.y=1; + threads.z=1; + sum_energy_kernel1<<>>(vflag); + cudaThreadSynchronize(); + CUT_CHECK_ERROR("ERROR-CUDA sumenergy_kernel1 "); + + grid.x=1; + grid.y=1; + grid.z=1; + threads.x=nz_pppma; + threads.y=1; + threads.z=1; + sum_energy_kernel2<<>>(vflag); + cudaThreadSynchronize(); + CUT_CHECK_ERROR("ERROR-CUDA sumenergy_kernel2 "); + + cudaMemcpy((void*) (&host_energy), cu_energy, sizeof(ENERGY_FLOAT),cudaMemcpyDeviceToHost); + if(vflag) + cudaMemcpy((void*) cpu_virial, (void*) cu_virial, 6*sizeof(ENERGY_FLOAT),cudaMemcpyDeviceToHost); + CUT_CHECK_ERROR("ERROR-CUDA sumenergy_memcopy"); + + return host_energy; +} + +void cuda_make_rho(cuda_shared_data* sdata,void* flag,PPPM_FLOAT* cu_density_intScale,int ihi,int ilo,int jhi,int jlo,int khi,int klo,void* cu_density_brick,void* cu_density_brick_int) +{ + CUT_CHECK_ERROR("cuda_make_rho begin"); + dim3 grid,threads; + int cpu_flag[3]; + 
grid.x=(sdata->atom.nlocal+31)/32; + grid.y=1; + grid.z=1; + threads.x=32; + threads.y=1; + threads.z=1; + int sharedmemsize=(32+32*(sdata->pppm.nupper-sdata->pppm.nlower+1)+sdata->pppm.order*(sdata->pppm.order/2-(1-sdata->pppm.order)/2+1))*sizeof(PPPM_FLOAT); + do + { + cpu_flag[0]=0; + cpu_flag[1]=0; + cpu_flag[2]=0; + /* copy the current fixed-point scale by VALUE: sizeof(PPPM_FLOAT), not + sizeof(PPPM_FLOAT*). The old pointer-size count over-read the 4-byte + host float and over-wrote the 4-byte device symbol on LP64 builds. */ + cudaMemcpyToSymbol("density_intScale",cu_density_intScale,sizeof(PPPM_FLOAT)); + CUT_CHECK_ERROR("ERROR-CUDA make_rho pre Z"); + cudaMemset(flag,0,3*sizeof(int)); + CUT_CHECK_ERROR("ERROR-CUDA make_rho pre A"); + cudaMemset(cu_density_brick,0,(khi-klo+1)*(jhi-jlo+1)*(ihi-ilo+1)*sizeof(PPPM_FLOAT)); + CUT_CHECK_ERROR("ERROR-CUDA make_rho pre B"); + cudaMemset(cu_density_brick_int,0,(khi-klo+1)*(jhi-jlo+1)*(ihi-ilo+1)*sizeof(int)); + CUT_CHECK_ERROR("ERROR-CUDA make_rho pre C"); + make_rho_kernel<<>>((int*) flag,32/(sdata->pppm.nupper-sdata->pppm.nlower+1)); + cudaThreadSynchronize(); + CUT_CHECK_ERROR("ERROR-CUDA make_rho A"); + cudaMemcpy((void*) &cpu_flag, flag, 3*sizeof(int),cudaMemcpyDeviceToHost); + /* flag[0]: fixed-point overflow -> halve the scale and retry; + flag[0]==0 && flag[1]==0: headroom available -> double the scale and retry */ + if(cpu_flag[0]!=0) {(*cu_density_intScale)/=2; MYDBG(printf("PPPM_Cuda::cuda_make_rho: Decrease cu_density_intScale to: %e\n",*cu_density_intScale);)} + if((cpu_flag[0]==0)&&(cpu_flag[1]==0)) {(*cu_density_intScale)*=2; MYDBG(printf("PPPM_Cuda::cuda_make_rho: Increase cu_density_intScale to: %e\n",*cu_density_intScale);)} + /* if((*cu_density_intScale)>0xe0000000) + { + printf("Error Scaling\n"); + cpu_flag[0]=0; + cpu_flag[1]=1; + }*/ + CUT_CHECK_ERROR("ERROR-CUDA make_rho B"); + } while((cpu_flag[0]!=0)||(cpu_flag[1]==0)); + + + grid.x=khi-klo+1; + grid.y=jhi-jlo+1; + threads.x=ihi-ilo+1; + scale_rho_kernel<<>>(); + cudaThreadSynchronize(); + CUT_CHECK_ERROR("ERROR-CUDA make_rho_scale"); +} + + +int cuda_particle_map(cuda_shared_data* sdata,void* flag) +{ + dim3 grid,threads; + int cpu_flag; + grid.x=(sdata->atom.nlocal+31)/32; + grid.y=1; + grid.z=1; + threads.x=32; + threads.y=1; + threads.z=1; + CUT_CHECK_ERROR("ERROR-CUDA particla_map 
..pre"); + particle_map_kernel<<>>((int*) flag); + cudaThreadSynchronize(); + CUT_CHECK_ERROR("ERROR-CUDA particla_map a"); + cudaMemcpy((void*) &cpu_flag, flag, sizeof(int),cudaMemcpyDeviceToHost); + CUT_CHECK_ERROR("ERROR-CUDA particla_map b"); + return cpu_flag; +} + + +void cuda_fieldforce(cuda_shared_data* sdata,void* flag) +{ + dim3 grid,threads; + grid.x=(sdata->atom.nlocal+31)/32; + grid.y=1; + grid.z=1; + threads.x=32; + threads.y=1; + threads.z=1; + int sharedmemsize=(32+3*32*(sdata->pppm.nupper-sdata->pppm.nlower+1)+sdata->pppm.order*(sdata->pppm.order/2-(1-sdata->pppm.order)/2+1))*sizeof(PPPM_FLOAT); + fieldforce_kernel<<>> + (sdata->pppm.nupper-sdata->pppm.nlower+1,32/(sdata->pppm.nupper-sdata->pppm.nlower+1),(int*) flag); + cudaThreadSynchronize(); + CUT_CHECK_ERROR("ERROR-CUDA fieldforce"); +} + +double cuda_slabcorr_energy(cuda_shared_data* sdata, ENERGY_FLOAT* buf, ENERGY_FLOAT* dev_buf) +{ + dim3 grid,threads; + grid.x=(sdata->atom.nlocal+31)/32; + grid.y=1; + grid.z=1; + threads.x=32; + threads.y=1; + threads.z=1; + slabcorr_energy_kernel<<>>(dev_buf); + cudaThreadSynchronize(); + cudaMemcpy((void*) buf, dev_buf, grid.x*sizeof(ENERGY_FLOAT),cudaMemcpyDeviceToHost); + + double dipole_all=0.0; + for(int i=0;iatom.nlocal+31)/32; + grid.y=1; + grid.z=1; + threads.x=32; + threads.y=1; + threads.z=1; + slabcorr_force_kernel<<>>(ffact); + cudaThreadSynchronize(); +} + +void sum_virial(double* host_virial) +{ +} + +void pppm_initfftdata(cuda_shared_data* sdata,PPPM_FLOAT* in,FFT_FLOAT* out) +{ + int nslow=sdata->pppm.nzhi_in-sdata->pppm.nzlo_in; + int nmid=sdata->pppm.nyhi_in-sdata->pppm.nylo_in; + int nfast=sdata->pppm.nxhi_in-sdata->pppm.nxlo_in; + int nrimz=MAX(sdata->pppm.nzlo_in-sdata->pppm.nzlo_out,sdata->pppm.nzhi_out-sdata->pppm.nzhi_in); + int nrimy=MAX(sdata->pppm.nylo_in-sdata->pppm.nylo_out,sdata->pppm.nyhi_out-sdata->pppm.nyhi_in); + int nrimx=MAX(sdata->pppm.nxlo_in-sdata->pppm.nxlo_out,sdata->pppm.nxhi_out-sdata->pppm.nxhi_in); + dim3 
grid; + grid.x=nslow+1; + grid.y=nmid+1; + grid.z=1; + dim3 threads; + threads.x=nfast+1; + threads.y=1; + threads.z=1; + cudaThreadSynchronize(); + initfftdata_core_kernel<<>>(in,out); + cudaThreadSynchronize(); + grid.x=nrimz; + grid.y=nmid+1; + threads.x=nfast+1; + initfftdata_z_kernel<<>>(in,out); + cudaThreadSynchronize(); + grid.x=nslow+1; + grid.y=nrimy; + threads.x=nfast+1; + initfftdata_y_kernel<<>>(in,out); + cudaThreadSynchronize(); + grid.x=nslow+1; + grid.y=nmid+1; + threads.x=nrimx; + initfftdata_x_kernel<<>>(in,out); + cudaThreadSynchronize(); + grid.x=nrimz; + grid.y=nrimy; + threads.x=nfast+1; + initfftdata_yz_kernel<<>>(in,out); + cudaThreadSynchronize(); + grid.x=nrimz; + grid.y=nmid+1; + threads.x=nrimx; + initfftdata_xz_kernel<<>>(in,out); + cudaThreadSynchronize(); + grid.x=nslow+1; + grid.y=nrimy; + threads.x=nrimx; + initfftdata_xy_kernel<<>>(in,out); + cudaThreadSynchronize(); + grid.x=nrimz; + grid.y=nrimy; + threads.x=nrimx; + initfftdata_xyz_kernel<<>>(in,out); + cudaThreadSynchronize(); + CUT_CHECK_ERROR("ERROR-CUDA initfftdata_kernel"); +} + + diff --git a/lib/cuda/pppm_cuda_cu.h b/lib/cuda/pppm_cuda_cu.h new file mode 100644 index 0000000000..b594715b7c --- /dev/null +++ b/lib/cuda/pppm_cuda_cu.h @@ -0,0 +1,55 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. 
+ + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#ifndef PPPM_CUDA_CU_H_ +#define PPPM_CUDA_CU_H_ + +extern "C" void pppm_device_init(void* cu_density_brick, void* cu_vdx_brick, void* cu_vdy_brick, void* cu_vdz_brick, void* cu_density_fft, void* cu_energy, void* cu_virial + ,void* cu_work1,void* cu_work2, void* cu_work3,void* cu_greensfn, void* cu_fkx, void* cu_fky, void* cu_fkz, void* cu_vg + ,int nxlo_in,int nxhi_in,int nylo_in,int nyhi_in,int nzlo_in,int nzhi_in,int nxlo_out,int nxhi_out,int nylo_out,int nyhi_out,int nzlo_out,int nzhi_out, int nx_pppm,int ny_pppm,int nz_pppm + ,int cu_nxlo_fft,int cu_nxhi_fft,int cu_nylo_fft,int cu_nyhi_fft,int cu_nzlo_fft,int cu_nzhi_fft,void* cu_gf_b + ,double cu_qqrd2e, int cu_order,void* cu_rho_coeff,void* cu_debugdata,void* cu_density_brick_lock,int slabflag + ); +extern "C" void pppm_device_init_setup(cuda_shared_data* sdata,PPPM_FLOAT shiftone,PPPM_FLOAT delxinv,PPPM_FLOAT delyinv,PPPM_FLOAT delzinv,int nlower,int nupper); +extern "C" void Cuda_PPPM_Setup_fkxyz_vg(int nx_pppma,int ny_pppma,int nz_pppma,PPPM_FLOAT unitkx,PPPM_FLOAT unitky,PPPM_FLOAT unitkz,PPPM_FLOAT g_ewald); +extern "C" void Cuda_PPPM_setup_greensfn(int nx_pppma,int ny_pppma,int nz_pppma,PPPM_FLOAT unitkx,PPPM_FLOAT unitky,PPPM_FLOAT unitkz,PPPM_FLOAT g_ewald, + int nbx,int nby,int nbz,PPPM_FLOAT xprd,PPPM_FLOAT yprd,PPPM_FLOAT zprd_slab); + +extern "C" void pppm_device_update(cuda_shared_data* sdata,void* cu_part2grid, int nlocala,int nmaxa); +extern "C" void pppm_update_nlocal(int nlocala); +extern "C" void poisson_scale(int nx_pppm,int ny_pppm,int nz_pppm); +extern "C" void poisson_xgrad(int nx_pppm,int ny_pppm,int nz_pppm); +extern "C" void poisson_ygrad(int nx_pppm,int ny_pppm,int nz_pppm); +extern "C" void poisson_zgrad(int nx_pppm,int ny_pppm,int nz_pppm); +extern "C" void poisson_vdx_brick(int ihi,int ilo,int jhi,int jlo,int khi,int klo,int 
nx_pppm,int ny_pppm,int nz_pppm); +extern "C" void poisson_vdy_brick(int ihi,int ilo,int jhi,int jlo,int khi,int klo,int nx_pppm,int ny_pppm,int nz_pppm); +extern "C" void poisson_vdz_brick(int ihi,int ilo,int jhi,int jlo,int khi,int klo,int nx_pppm,int ny_pppm,int nz_pppm); +extern "C" void poisson_energy(int nxlo_fft,int nxhi_fft,int nylo_fft,int nyhi_fft,int nzlo_fft,int nzhi_fft,int vflag); +extern "C" ENERGY_FLOAT sum_energy(void* cu_virial,void* cu_energy,int nx_pppma,int ny_pppma,int nz_pppma,int vflag,ENERGY_FLOAT* cpu_virial); +extern "C" int cuda_particle_map(cuda_shared_data* sdata,void* flag); +extern "C" void cuda_make_rho(cuda_shared_data* sdata,void* flag,PPPM_FLOAT* cu_density_intScale,int ihi,int ilo,int jhi,int jlo,int khi,int klo,void* cu_density_brick,void* cu_density_brick_int); +extern "C" void cuda_fieldforce(cuda_shared_data* sdata,void* flag); +extern "C" double cuda_slabcorr_energy(cuda_shared_data* sdata, ENERGY_FLOAT* buf, ENERGY_FLOAT* dev_buf); +extern "C" void cuda_slabcorr_force(cuda_shared_data* sdata, F_FLOAT ffact); +extern "C" void pppm_initfftdata(cuda_shared_data* sdata,PPPM_FLOAT* in,FFT_FLOAT* out); +#endif /*PPPM_CUDA_CU_H_*/ diff --git a/lib/cuda/pppm_cuda_kernel.cu b/lib/cuda/pppm_cuda_kernel.cu new file mode 100644 index 0000000000..808c98fe39 --- /dev/null +++ b/lib/cuda/pppm_cuda_kernel.cu @@ -0,0 +1,816 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. 
+ + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#define OFFSET 4096 +__device__ int negativCUDA(float f) +{ + return ((unsigned int)1<<31&(__float_as_int(f)))>>31; +} + +__device__ void reduceBlock(float* data) +{ + int p2=1; + while(p2*2= nzlo_fft)&&(blockIdx.x <=nzhi_fft)&& + (blockIdx.y >= nylo_fft)&&(blockIdx.y <=nyhi_fft)&& + (threadIdx.x>= nxlo_fft)&&(threadIdx.x<=nxhi_fft)) + { + int n=((int(blockIdx.x)-nzlo_fft)*(nyhi_fft-nylo_fft+1)+int(blockIdx.y)-nylo_fft)*(nxhi_fft-nxlo_fft+1)+int(threadIdx.x)-nxlo_fft; + PPPM_FLOAT sqk = my_fkx*my_fkx + my_fky*my_fky + my_fkz*my_fkz; + PPPM_FLOAT vterm = (sqk==PPPM_F(0.0)) ? PPPM_F(0.0) : PPPM_F(-2.0) * (PPPM_F(1.0)/sqk + PPPM_F(0.25)/(g_ewald*g_ewald)); + vg[6*n+0] = (sqk == PPPM_F(0.0)) ? PPPM_F(0.0) : PPPM_F(1.0) + vterm * my_fkx*my_fkx; + vg[6*n+1] = (sqk == PPPM_F(0.0)) ? PPPM_F(0.0) : PPPM_F(1.0) + vterm * my_fky*my_fky; + vg[6*n+2] = (sqk == PPPM_F(0.0)) ? PPPM_F(0.0) : PPPM_F(1.0) + vterm * my_fkz*my_fkz; + vg[6*n+3] = (sqk == PPPM_F(0.0)) ? PPPM_F(0.0) : vterm * my_fkx*my_fky; + vg[6*n+4] = (sqk == PPPM_F(0.0)) ? PPPM_F(0.0) : vterm * my_fkx*my_fkz; + vg[6*n+5] = (sqk == PPPM_F(0.0)) ? 
PPPM_F(0.0) : vterm * my_fky*my_fkz; + + } +} + +__device__ PPPM_FLOAT gf_denom(PPPM_FLOAT x, PPPM_FLOAT y, PPPM_FLOAT z) +{ + PPPM_FLOAT sx,sy,sz; + sz = sy = sx = PPPM_F(0.0); + for (int l = order-1; l >= 0; l--) { + sx = gf_b[l] + sx*x; + sy = gf_b[l] + sy*y; + sz = gf_b[l] + sz*z; + } + PPPM_FLOAT s = sx*sy*sz; + return s*s; +} + +__global__ void setup_greensfn(PPPM_FLOAT unitkx,PPPM_FLOAT unitky,PPPM_FLOAT unitkz,PPPM_FLOAT g_ewald, +int nbx,int nby,int nbz, +PPPM_FLOAT xprd,PPPM_FLOAT yprd,PPPM_FLOAT zprd_slab) +{ + PPPM_FLOAT sqk; + int nx,ny,nz,kper,lper,mper,k,l,m; + PPPM_FLOAT snx,sny,snz,snx2,sny2,snz2; + PPPM_FLOAT argx,argy,argz,wx,wy,wz,sx,sy,sz,qx,qy,qz; + PPPM_FLOAT sum1,dot1,dot2; + PPPM_FLOAT numerator,denominator; + + PPPM_FLOAT form=PPPM_F(1.0); + int n=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x; + m=blockIdx.x; + l=blockIdx.y; + k=threadIdx.x; + + mper = m - nz_pppm*(2*m/nz_pppm); + snz = sin(PPPM_F(0.5)*unitkz*mper*zprd_slab/nz_pppm); + snz2 = snz*snz; + + + lper = l - ny_pppm*(2*l/ny_pppm); + sny = sin(PPPM_F(0.5)*unitky*lper*yprd/ny_pppm); + sny2 = sny*sny; + + kper = k - nx_pppm*(2*k/nx_pppm); + snx = sin(PPPM_F(0.5)*unitkx*kper*xprd/nx_pppm); + snx2 = snx*snx; + + sqk = pow(unitkx*kper,PPPM_F(2.0)) + pow(unitky*lper,PPPM_F(2.0)) + + pow(unitkz*mper,PPPM_F(2.0)); + + if (sqk != PPPM_F(0.0)) { + numerator = form*PPPM_F(12.5663706)/sqk; + denominator = gf_denom(snx2,sny2,snz2); + sum1 = PPPM_F(0.0); + for (nx = -nbx; nx <= nbx; nx++) { + qx = unitkx*(kper+nx_pppm*nx); + sx = exp(PPPM_F(-.25)*pow(qx/g_ewald,PPPM_F(2.0))); + wx = PPPM_F(1.0); + argx = PPPM_F(0.5)*qx*xprd/nx_pppm; + if (argx != PPPM_F(0.0)) wx = pow(sin(argx)/argx,order); + for (ny = -nby; ny <= nby; ny++) { + qy = unitky*(lper+ny_pppm*ny); + sy = exp(PPPM_F(-.25)*pow(qy/g_ewald,PPPM_F(2.0))); + wy = PPPM_F(1.0); + argy = PPPM_F(0.5)*qy*yprd/ny_pppm; + if (argy != PPPM_F(0.0)) wy = pow(sin(argy)/argy,order); + for (nz = -nbz; nz <= nbz; nz++) { + qz = 
unitkz*(mper+nz_pppm*nz); + sz = exp(PPPM_F(-.25)*pow(qz/g_ewald,PPPM_F(2.0))); + wz = PPPM_F(1.0); + argz = PPPM_F(0.5)*qz*zprd_slab/nz_pppm; + if (argz != PPPM_F(0.0)) wz = pow(sin(argz)/argz,order); + + dot1 = unitkx*kper*qx + unitky*lper*qy + unitkz*mper*qz; + dot2 = qx*qx+qy*qy+qz*qz; + sum1 += (dot1/dot2) * sx*sy*sz * pow(wx*wy*wz,PPPM_F(2.0)); + } + } + } + greensfn[n] = numerator*sum1/denominator; + } else greensfn[n] = PPPM_F(0.0); +} + +__global__ void poisson_scale_kernel() +{ + int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x; + FFT_FLOAT scaleinv=FFT_F(1.0)/(gridDim.x*gridDim.y*blockDim.x); + work1[2*i] *= scaleinv * greensfn[i]; + work1[2*i+1] *= scaleinv * greensfn[i]; +} + +__global__ void poisson_xgrad_kernel() +{ + int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x; + work2[2*i] = fkx[threadIdx.x] * work1[2*i+1]; + work2[2*i+1] = -fkx[threadIdx.x] * work1[2*i]; +} + +__global__ void poisson_ygrad_kernel() +{ + int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x; + work2[2*i] = fky[blockIdx.y] * work1[2*i+1]; + work2[2*i+1] = -fky[blockIdx.y] * work1[2*i]; +} + +__global__ void poisson_zgrad_kernel() +{ + int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x; + work2[2*i] = fkz[blockIdx.x] * work1[2*i+1]; + work2[2*i+1] = -fkz[blockIdx.x] * work1[2*i]; +} + +__global__ void poisson_vdx_brick_kernel(int ilo,int jlo,int klo) +{ + int k=blockIdx.x+klo; + k+=nz_pppm*negativCUDA(CUDA_F(1.0)*k)-nz_pppm*negativCUDA(CUDA_F(1.0)*(nz_pppm-k-1)); + int j=blockIdx.y+jlo; + j+=ny_pppm*negativCUDA(CUDA_F(1.0)*j)-ny_pppm*negativCUDA(CUDA_F(1.0)*(ny_pppm-j-1)); + int i=threadIdx.x+ilo; + i+=nx_pppm*negativCUDA(CUDA_F(1.0)*i)-nx_pppm*negativCUDA(CUDA_F(1.0)*(nx_pppm-i-1)); + vdx_brick[((blockIdx.x)*(nyhi_out-nylo_out+1)+blockIdx.y)*(nxhi_out-nxlo_out+1)+threadIdx.x] = work3[2*(((k)*ny_pppm+(j))*nx_pppm+i)]; +} + +__global__ void poisson_vdy_brick_kernel(int ilo,int jlo,int klo) +{ + int k=blockIdx.x+klo; + 
k+=nz_pppm*negativCUDA(CUDA_F(1.0)*k)-nz_pppm*negativCUDA(CUDA_F(1.0)*(nz_pppm-k-1)); + int j=blockIdx.y+jlo; + j+=ny_pppm*negativCUDA(CUDA_F(1.0)*j)-ny_pppm*negativCUDA(CUDA_F(1.0)*(ny_pppm-j-1)); + int i=threadIdx.x+ilo; + i+=nx_pppm*negativCUDA(CUDA_F(1.0)*i)-nx_pppm*negativCUDA(CUDA_F(1.0)*(nx_pppm-i-1)); + vdy_brick[((blockIdx.x)*(nyhi_out-nylo_out+1)+blockIdx.y)*(nxhi_out-nxlo_out+1)+threadIdx.x] = work3[2*(((k)*ny_pppm+(j))*nx_pppm+i)]; +} + +__global__ void poisson_vdz_brick_kernel(int ilo,int jlo,int klo) +{ + int k=blockIdx.x+klo; + k+=nz_pppm*negativCUDA(CUDA_F(1.0)*k)-nz_pppm*negativCUDA(CUDA_F(1.0)*(nz_pppm-k-1)); + int j=blockIdx.y+jlo; + j+=ny_pppm*negativCUDA(CUDA_F(1.0)*j)-ny_pppm*negativCUDA(CUDA_F(1.0)*(ny_pppm-j-1)); + int i=threadIdx.x+ilo; + i+=nx_pppm*negativCUDA(CUDA_F(1.0)*i)-nx_pppm*negativCUDA(CUDA_F(1.0)*(nx_pppm-i-1)); + vdz_brick[((blockIdx.x)*(nyhi_out-nylo_out+1)+blockIdx.y)*(nxhi_out-nxlo_out+1)+threadIdx.x] = work3[2*(((k)*ny_pppm+(j))*nx_pppm+i)]; +} + +__global__ void poisson_energy_kernel(int nxlo_fft,int nylo_fft,int nzlo_fft,int vflag) +{ + ENERGY_FLOAT scaleinv=FFT_F(1.0)/(nx_pppm*ny_pppm*nz_pppm); + int i=(blockIdx.x+nzlo_fft)*ny_pppm*nx_pppm+(blockIdx.y+nylo_fft)*nx_pppm+threadIdx.x+nxlo_fft; + ENERGY_FLOAT* s_energy=(ENERGY_FLOAT*) sharedmem; + ENERGY_FLOAT myenergy= scaleinv*scaleinv * greensfn[i] * (work1[2*i]*work1[2*i] + work1[2*i+1]*work1[2*i+1]); + s_energy[threadIdx.x]=myenergy; + + __syncthreads(); + reduceBlock(s_energy); + if(threadIdx.x==0) + energy[blockIdx.x*ny_pppm+blockIdx.y]=s_energy[0]; + if(vflag) + { + __syncthreads(); + for (int j = 0; j < 6; j++) + { + s_energy[threadIdx.x]= myenergy*vg[((blockIdx.x*gridDim.y+blockIdx.y)*(blockDim.x)+threadIdx.x)*6+j]; + __syncthreads(); + reduceBlock(s_energy); + if(threadIdx.x==0) + virial[blockIdx.x*ny_pppm+blockIdx.y+j*nz_pppm*ny_pppm]=s_energy[0]; + } + } +} + + +__global__ void sum_energy_kernel1(int vflag) +{ + ENERGY_FLOAT 
myenergy=energy[(blockIdx.x*ny_pppm+threadIdx.x)]; + ENERGY_FLOAT* s_energy=(ENERGY_FLOAT*) sharedmem; + s_energy[threadIdx.x]=myenergy; + __syncthreads(); + reduceBlock(s_energy); + if(threadIdx.x==0) + energy[blockIdx.x*ny_pppm]=s_energy[0]; + if(vflag) + { + __syncthreads(); + for (int j = 0; j < 6; j++) + { + myenergy=virial[blockIdx.x*ny_pppm+threadIdx.x+j*ny_pppm*nz_pppm]; + s_energy[threadIdx.x]=myenergy; + __syncthreads(); + reduceBlock(s_energy); + if(threadIdx.x==0) + virial[blockIdx.x*ny_pppm+j*ny_pppm*nz_pppm]=s_energy[0]; + } + } + +} + +__global__ void sum_energy_kernel2(int vflag) +{ + ENERGY_FLOAT myenergy=energy[threadIdx.x*ny_pppm]; + ENERGY_FLOAT* s_energy=(ENERGY_FLOAT*) sharedmem; + s_energy[threadIdx.x]=myenergy; + __syncthreads(); + reduceBlock(s_energy); + if(threadIdx.x==0) + energy[0]=s_energy[0]; + if(vflag) + { + __syncthreads(); + for (int j = 0; j < 6; j++) + { + myenergy=virial[threadIdx.x*ny_pppm+j*ny_pppm*nz_pppm]; + s_energy[threadIdx.x]=myenergy; + __syncthreads(); + reduceBlock(s_energy); + if(threadIdx.x==0) + virial[j]=s_energy[0]; + } + } +} + +__device__ PPPM_FLOAT rho1d(int k,PPPM_FLOAT d,PPPM_FLOAT* srho_coeff) +{ + PPPM_FLOAT rho1d_tmp=PPPM_F(0.0); + for (int l = order-1; l >= 0; l--) + rho1d_tmp = srho_coeff[l*order+k-(1-order)/2] + rho1d_tmp*d; + return rho1d_tmp; +} + +__global__ void particle_map_kernel(int* flag) +{ + int i=blockIdx.x*blockDim.x+threadIdx.x; + if(i nxhi_out || + ny+nlower < nylo_out || ny+nupper > nyhi_out || + nz+nlower < nzlo_out || nz+nupper > nzhi_out) + {flag[0]++; + debugdata[0]=i; + debugdata[1]=_boxlo[0]; + debugdata[2]=_boxlo[1]; + debugdata[3]=_boxlo[2]; + debugdata[4]=nx; + debugdata[5]=ny; + debugdata[6]=nz; + debugdata[7]=_x[i]; + debugdata[8]=_x[i+_nmax]; + debugdata[9]=_x[i+2*_nmax]; + debugdata[10]=nlocal; + + } + } +} + +__global__ void make_rho_kernelA() +{ + int i,l,m,n,nx,ny,nz,mx,my,mz; + + // clear 3d density array + + + // loop over my charges, add their contribution to nearby 
grid points + // (nx,ny,nz) = global coords of grid pt to "lower left" of charge + // (dx,dy,dz) = distance to "lower left" grid pt + // (mx,my,mz) = global coords of moving stencil pt + + i=blockIdx.x*blockDim.x+threadIdx.x; + + if(i < nlocal) { + + PPPM_FLOAT dx,dy,dz,x0,y0,z0; + nx = part2grid[i]; + ny = part2grid[i+nmax]; + nz = part2grid[i+2*nmax]; + dx = nx+shiftone - (_x[i]-_boxlo[0])*delxinv; + dy = ny+shiftone - (_x[i+nmax]-_boxlo[1])*delyinv; + dz = nz+shiftone - (_x[i+2*nmax]-_boxlo[2])*delzinv; + + z0 = delxinv*delyinv*delzinv * _q[i]; + for (n = nlower; n <= nupper; n++) + { + mz = n+nz; + y0 = z0*rho1d(n,dz,rho_coeff); + for (m = nlower; m <= nupper; m++) + { + my = m+ny; + x0 = y0*rho1d(m,dy,rho_coeff); + for (l = nlower; l <= nupper; l++) + { + mx = l+nx; + int mzyx=((mz-nzlo_out)*(nyhi_out-nylo_out+1)+my-nylo_out)*(nxhi_out-nxlo_out+1)+mx-nxlo_out; + while(atomicAdd(&density_brick_int[mzyx],1)!=0) atomicAdd(&density_brick_int[mzyx],-1); + density_brick[mzyx]+=x0*rho1d(l,dx,rho_coeff); + __threadfence(); + atomicAdd(&density_brick_int[mzyx],-1); + __syncthreads(); + + } + } + } + } +} + +__global__ void make_rho_kernel(int* flag,int read_threads_at_same_time) +{ + int i,l,m,n,nx,ny,nz,mx,my,mz,a,b; + + // clear 3d density array + + + // loop over my charges, add their contribution to nearby grid points + // (nx,ny,nz) = global coords of grid pt to "lower left" of charge + // (dx,dy,dz) = distance to "lower left" grid pt + // (mx,my,mz) = global coords of moving stencil pt + // int nzxy=blockIdx.x*gridDim.y+blockIdx.y; + + int nelements=nupper-nlower+1; + int* idx=(int*) sharedmem; + int* sdensity_brick_int=&idx[blockDim.x]; + PPPM_FLOAT* srho_coeff=(PPPM_FLOAT*) &sdensity_brick_int[nelements*blockDim.x]; + if(threadIdx.x-1)) + { + a=sdensity_brick_int[ii*nelements+threadIdx.x]; + //if(a*a>1e-100) + b=(atomicAdd(&density_brick_int[idx[ii+kk]+threadIdx.x-kk*nelements],a)|a); + //else + //b=(density_brick_int[idx[ii+kk]+threadIdx.x-kk*nelements]|a); + 
if(((b)&(0x7c000000))&&(not((b)&(0x80000000)))) + { + flag[1]++; + if((b)&(0x60000000)) flag[0]++; + } + } + } + __syncthreads(); //*/ + } + } + + } +} + +__global__ void scale_rho_kernel() +{ + int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x; + density_brick[i]=(1.0/density_intScale)*density_brick_int[i]; +} + +__global__ void fieldforce_kernel(int elements_per_thread,int read_threads_at_same_time,int* flag) //20*x64 0.36 +{ + int i; + + // loop over my charges, interpolate electric field from nearby grid points + // (nx,ny,nz) = global coords of grid pt to "lower left" of charge + // (dx,dy,dz) = distance to "lower left" grid pt + // (mx,my,mz) = global coords of moving stencil pt + // ek = 3 components of E-field on particle + i=blockIdx.x*blockDim.x+threadIdx.x; + int* idx=(int*) sharedmem; + PPPM_FLOAT* tmp_brick=(PPPM_FLOAT*) &idx[blockDim.x]; + PPPM_FLOAT* srho_coeff=(PPPM_FLOAT*) &tmp_brick[3*blockDim.x*elements_per_thread]; + if(threadIdx.x-1)) + { + tmp_brick[ii*elements_per_thread+threadIdx.x]=vdx_brick[idx[ii+kk]+threadIdx.x-kk*elements_per_thread]; + tmp_brick[(ii+blockDim.x)*elements_per_thread+threadIdx.x]=vdy_brick[idx[ii+kk]+threadIdx.x-kk*elements_per_thread]; + tmp_brick[(ii+2*blockDim.x)*elements_per_thread+threadIdx.x]=vdz_brick[idx[ii+kk]+threadIdx.x-kk*elements_per_thread]; + } + } + __syncthreads(); + + if(i