diff --git a/lib/cuda/Makefile b/lib/cuda/Makefile
deleted file mode 100644
index 844906ba89..0000000000
--- a/lib/cuda/Makefile
+++ /dev/null
@@ -1,4 +0,0 @@
-#Makefile for liblammpscuda.a
-#No need to modify anything here! The CUDA path is inserted into Makefile.common
-
-include Makefile.cudalib
\ No newline at end of file
diff --git a/lib/cuda/Makefile.common b/lib/cuda/Makefile.common
deleted file mode 100644
index d98f75f3b5..0000000000
--- a/lib/cuda/Makefile.common
+++ /dev/null
@@ -1,124 +0,0 @@
-#Common commandline argument interpreter for compilation with lammpscuda (USER-CUDA) installed
-
-# make options:
-# emu=1       switch to cuda emulation mode (otherwise: use gpu)
-# dbg=1       print a lot of debugging output during runtime
-# verbose=1   output nvcc command line during compilation
-# keep=1      do not delete temporary compilation files (.ii, .cubin, ...)
-# cufft=1     use cuda's fast fourier transformation lib "cufft" where possible (otherwise: use cpu fftw)
-# binning=1   create virtual particle grid (neighbor-lists otherwise); currently this is not supported
-# precision=1 single precision (global setting)
-# precision=2 double precision (global setting)
-
-SHELL = /bin/sh
-
-# System-specific settings
-
-#CUDA_INSTALL_PATH = /usr/local/cuda
-CUDA_INSTALL_PATH = /home/crtrott/lib/cuda
-# e.g. in Gentoo
-# CUDA_INSTALL_PATH = /opt/cuda
-
-
-#//////////////////////////////////////////////////////////////////////////////////////////////
-# no need to change anything below this line
-#//////////////////////////////////////////////////////////////////////////////////////////////
-
-#use CPU FFT if cufft=0 is requested.
-FALLBACK_FFT = 1
-
-#default settings for compiler switches
-ifdef COMPILELIB
-include Makefile.defaults
-else
-include ../../lib/cuda/Makefile.defaults
-endif
-
-#shell echo "Compiling with precision = " ${precision} ", arch = " ${arch} ", cufft = " ${cufft} ", dbg = " ${dbg} ", prec_timer = " ${prec_timer}
-
-CUDA_FLAGS := -I${CUDA_INSTALL_PATH}/include -DUNIX
-CUDA_USRLIB_CONDITIONAL := -L${CUDA_INSTALL_PATH}/lib -L${CUDA_INSTALL_PATH}/lib64
-
-# debug setting
-ifeq ($(strip $(dbg)), 1)
-  CUDA_FLAGS += -D_DEBUG -g
-  NVCC_FLAGS += -g -G
-else
-  NVCC_FLAGS += --compiler-options -fno-strict-aliasing -O2
-endif
-
-# skip timing on Mac and Windows manually
-ifeq ($(strip $(prec_timer)), 0)
-  CUDA_FLAGS += -DNO_PREC_TIMING
-endif
-
-# set fft routine
-ifeq ($(strip $(cufft)), 0)
-  ifneq ($(FALLBACK_FFT), 1)
-    FFT_INC = -DFFT_NONE
-    FFT_PATH =
-    FFT_LIB =
-    CUDA_FLAGS += -DFFT_NONE
-  endif
-else
-  CUDA_FLAGS += -DFFT_CUFFT
-  CUDA_USRLIB_CONDITIONAL += -lcufft
-endif
-
-# make global precision setting
-
-ifeq ($(strip $(precision)), 1)
-  CUDA_FLAGS += -DCUDA_PRECISION=1
-else
-  ifeq ($(strip $(precision)), 3)
-    CUDA_FLAGS += -DCUDA_PRECISION=1 -DX_PRECISION=2
-  else
-    ifeq ($(strip $(precision)), 4)
-      CUDA_FLAGS += -DCUDA_PRECISION=1 -DX_PRECISION=2 -DV_PRECISION=2
-    else
-      CUDA_FLAGS += -DCUDA_PRECISION=2
-    endif
-  endif
-endif
-
-# make architecture settings
-ifeq ($(strip $(arch)), 13)
-  CUDA_FLAGS += -DCUDA_ARCH=13
-  SMVERSIONFLAGS := -arch sm_13
-else
-  ifeq ($(strip $(arch)), 20)
-    CUDA_FLAGS += -DCUDA_ARCH=20
-    #NVCC_FLAGS += -ftz=false -prec-div=true -prec-sqrt=true
-    NVCC_FLAGS += -ftz=true -prec-div=false -prec-sqrt=false
-    SMVERSIONFLAGS := -arch sm_20
-  else
-    ifeq ($(strip $(arch)), 21)
-      CUDA_FLAGS += -DCUDA_ARCH=20
-      #NVCC_FLAGS += -ftz=false -prec-div=true -prec-sqrt=true
-      NVCC_FLAGS += -ftz=true -prec-div=false -prec-sqrt=false
-      SMVERSIONFLAGS := -arch sm_21
-    else
-      ifeq ($(strip $(arch)), 30)
-        CUDA_FLAGS += -DCUDA_ARCH=20
-        #NVCC_FLAGS += -ftz=false -prec-div=true -prec-sqrt=true
-        NVCC_FLAGS += -ftz=true -prec-div=false -prec-sqrt=false
-        SMVERSIONFLAGS := -arch sm_30
-      else
-        ifeq ($(strip $(arch)), 35)
-          CUDA_FLAGS += -DCUDA_ARCH=20
-          #NVCC_FLAGS += -ftz=false -prec-div=true -prec-sqrt=true
-          NVCC_FLAGS += -ftz=true -prec-div=false -prec-sqrt=false
-          SMVERSIONFLAGS := -arch sm_35
-        else
-          CUDA_FLAGS += -DCUDA_ARCH=99
-          SMVERSIONFLAGS := -arch sm_13
-        endif
-      endif
-    endif
-  endif
-endif
-
-
-
-CCFLAGS := $(CCFLAGS) $(CUDA_FLAGS) \
-          -I$(CUDA_INSTALL_PATH)/include
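The precision block above implements four modes: precision=1/2 select a single global floating-point type, while precision=3 and precision=4 keep the bulk of the data in single precision and promote only positions (-DX_PRECISION=2) and, for mode 4, also velocities (-DV_PRECISION=2) to double. A minimal sketch of how such -D flags are typically resolved into the X_FLOAT/V_FLOAT types used by the kernels later in this diff; the real mapping lives in the package's cuda_precision.h, which is not shown in this section, so treat the exact macro logic below as illustrative:

    /* Illustrative only: resolve build flags into the floating-point
       types that the kernels in this diff operate on. */
    #if CUDA_PRECISION == 2
    typedef double CUDA_FLOAT;      /* precision=2: everything double */
    #else
    typedef float CUDA_FLOAT;      /* precision=1/3/4: single by default */
    #endif

    #if defined(X_PRECISION) && X_PRECISION == 2
    typedef double X_FLOAT;         /* precision=3/4: double coordinates */
    #else
    typedef CUDA_FLOAT X_FLOAT;
    #endif

    #if defined(V_PRECISION) && V_PRECISION == 2
    typedef double V_FLOAT;         /* precision=4: double velocities as well */
    #else
    typedef CUDA_FLOAT V_FLOAT;
    #endif
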
diff --git a/lib/cuda/Makefile.cudalib b/lib/cuda/Makefile.cudalib
deleted file mode 100644
index f21e95e686..0000000000
--- a/lib/cuda/Makefile.cudalib
+++ /dev/null
@@ -1,87 +0,0 @@
-#Makefile for liblammpscuda.a
-#No need to modify anything here! The CUDA path is inserted into Makefile.common
-
-.DEFAULT: lib
-
-COMPILELIB := 1
-
-SHELL = /bin/sh
-
-CUDA_SRC_DIR = ../cuda
-CUDA_TEMP = $(CUDA_SRC_DIR)/.lastmake
-CUDA_TEMP_DUMMY := $(shell touch $(CUDA_TEMP) )
-include $(CUDA_TEMP)
-CUDA_CU = $(wildcard $(CUDA_SRC_DIR)/*_kernel.cu)
-CUDA_CUO = $(CUDA_CU:_kernel.cu=_cu.o)
-CUDA_OBJ = $(subst $(CUDA_SRC_DIR)/,,$(CUDA_CUO))
-CUDA_DEP = $(CUDA_OBJ:.o=.d)
-
-NVCC_FLAGS :=
-
-VPATH = $(CUDA_SRC_DIR)
-
-#rewriting default settings if new ones are specified
-
-
-ifdef precision
-tmp := $(shell sed -i 's|precision ?= [0-9]|precision ?= '${precision}'|g' Makefile.defaults)
-endif
-
-ifdef arch
-tmp := $(shell sed -i 's|arch ?= [0-9][0-9]|arch ?= '${arch}'|g' Makefile.defaults)
-endif
-
-ifdef cufft
-tmp := $(shell sed -i 's|cufft ?= [0-9]|cufft ?= '${cufft}'|g' Makefile.defaults)
-endif
-
-ifdef dbg
-tmp := $(shell sed -i 's|dbg ?= [0-9]|dbg ?= '${dbg}'|g' Makefile.defaults)
-endif
-
-ifdef prec_timer
-tmp := $(shell sed -i 's|prec_timer ?= [0-9]|prec_timer ?= '${prec_timer}'|g' Makefile.defaults)
-endif
-
-include Makefile.common
-
-tmp := $(shell sed -i '2 d' Makefile.lammps)
-tmp := $(shell sed -i '2 d' Makefile.lammps)
-tmp := $(shell sed -i '1a CUDA_FLAGS := ${CUDA_FLAGS}' Makefile.lammps)
-tmp := $(shell sed -i '2a CUDA_USRLIB_CONDITIONAL := ${CUDA_USRLIB_CONDITIONAL}' Makefile.lammps)
-
-# verbose nvcc output during compilation
-ifeq ($(verbose), 1)
-  VERBOSE :=
-  NVCC_FLAGS += --ptxas-options=-v
-else
-  VERBOSE := @
-endif
-
-# keep temporary compilation files of nvcc
-ifeq ($(keep), 1)
-  NVCC_FLAGS += -keep -Xptxas="--verbose"
-endif
-
-
-NVCC := $(CUDA_INSTALL_PATH)/bin/nvcc
-CUDA_INCLUDES = -I./ -I$(CUDA_INSTALL_PATH)/include -I../../src/USER-CUDA
-CUDA_USRLIB =
-
-# Link target
-
-lib: $(CUDA_OBJ)
-	$(NVCC) -lib $(CUDA_OBJ) $(CUDA_FLAGS) $(CUDA_USRLIB_CONDITIONAL) -o liblammpscuda.a
-
-clean:
-	rm $(CUDA_SRC_DIR)/*.o
-	rm liblammpscuda.a
-
-# Library target
-
-
-# Cuda compilation rules
-
-%_cu.o: %.cu %_kernel.cu %_cu.h cuda_shared.h
-	$(VERBOSE)$(NVCC) $(NVCC_FLAGS) $(CUDA_FLAGS) $(CUDA_INCLUDES) $(CUDA_USRLIB) $(SMVERSIONFLAGS) -o $@ -c $<
-
diff --git a/lib/cuda/Makefile.defaults b/lib/cuda/Makefile.defaults
deleted file mode 100644
index 2e208c3649..0000000000
--- a/lib/cuda/Makefile.defaults
+++ /dev/null
@@ -1,19 +0,0 @@
-
-#precision setting: 1 single, 2 double, 3 mixed (double positions), 4 mixed (double positions and velocities)
-precision ?= 1
-
-#verbose setting: 0 no, 1 yes
-verbose ?= 1
-
-#GPU architecture (compute capability): 13, 20, 21, 30, 35
-arch ?= 20
-
-#Using cufft (should not be changed)
-cufft ?= 1
-
-#Using dbg mode
-dbg ?= 0
-
-#On mac machines set this to 0 in order to avoid usage of linux specific precision timer
-prec_timer ?= 1
-
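Makefile.cudalib above is self-modifying build glue: any precision/arch/cufft/dbg/prec_timer value passed on the make command line (e.g. `make precision=2 arch=35`) is written back into Makefile.defaults via sed, and after including Makefile.common it rewrites lines 2 and 3 of Makefile.lammps so that the main LAMMPS build imports the same CUDA_FLAGS and library paths. The committed Makefile.lammps that follows is therefore a generated snapshot of one particular configuration (mixed precision with double positions and velocities, arch 20), not a hand-maintained file.
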
diff --git a/lib/cuda/Makefile.lammps b/lib/cuda/Makefile.lammps
deleted file mode 100644
index 8ff782f5df..0000000000
--- a/lib/cuda/Makefile.lammps
+++ /dev/null
@@ -1,8 +0,0 @@
-# Settings that the LAMMPS build will import when this package library is used
-CUDA_FLAGS := -I/usr/local/cuda/include -DUNIX -DFFT_CUFFT -DCUDA_PRECISION=1 -DX_PRECISION=2 -DV_PRECISION=2 -DCUDA_ARCH=20
-CUDA_USRLIB_CONDITIONAL := -L/usr/local/cuda/lib -L/usr/local/cuda/lib64 -lcufft
-
- user-cuda_SYSINC = ${CUDA_FLAGS}
- user-cuda_SYSLIB = -lcuda -lcudart -lrt
- user-cuda_SYSPATH = $(CUDA_USRLIB_CONDITIONAL)
-
diff --git a/lib/cuda/README b/lib/cuda/README
deleted file mode 100644
index ce0dedcffe..0000000000
--- a/lib/cuda/README
+++ /dev/null
@@ -1,26 +0,0 @@
-This directory has source files to build a library that LAMMPS
-links against when using the USER-CUDA package.
-
-When you are done building this library, two files should
-exist in this directory:
-
-liblammpscuda.a     the library LAMMPS will link against
-Makefile.lammps     settings the LAMMPS Makefile will import
-
-The latter file will have settings like this (can be omitted if blank):
-
-user-cuda_SYSINC = -I$(CUDA_INSTALL_PATH)/include
-user-cuda_SYSLIB = -lcuda -lcudart -lrt
-user-cuda_SYSPATH = -L$(CUDA_INSTALL_PATH)/lib64 -L$(CUDA_INSTALL_PATH)/lib $(CUDA_USRLIB_CONDITIONAL)
-
-SYSINC is for settings needed to compile LAMMPS source files
-SYSLIB is for additional system libraries needed by this package
-SYSPATH is the path(s) to where those libraries are
-
-You must ensure these settings are correct for your system, else
-the LAMMPS build will likely fail.
-
--------------------------------------------------------------------------
-
-Christian - there needs to be additional info here about how
-to build the lammpscuda lib.
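The atom-vector files that follow all share one pattern: each atom style defines a bitmask of the per-atom arrays it communicates, and thin extern "C" wrappers forward to generic routines templated on that mask, so pack/unpack branches for arrays a style does not carry are eliminated at compile time. A minimal sketch of the idea; the mask values here are illustrative, while the real constants (X_MASK, V_MASK, Q_MASK, ...) come from the package's shared headers:

    // Illustrative sketch of the compile-time mask dispatch used below.
    const unsigned int X_MASK = 1u << 0;   // positions (3 values)
    const unsigned int V_MASK = 1u << 1;   // velocities (3 values)
    const unsigned int Q_MASK = 1u << 2;   // charge (1 value)

    template <const unsigned int data_mask>
    int pack_atom(int i, double* buf, const double* x, const double* v, const double* q) {
      int m = 0;
      if (data_mask & X_MASK) { buf[m++] = x[3*i]; buf[m++] = x[3*i+1]; buf[m++] = x[3*i+2]; }
      if (data_mask & V_MASK) { buf[m++] = v[3*i]; buf[m++] = v[3*i+1]; buf[m++] = v[3*i+2]; }
      if (data_mask & Q_MASK) { buf[m++] = q[i]; }  // branch vanishes for styles without charge
      return m;  // number of doubles packed for this atom
    }

    // e.g. an "atomic" style instantiates pack_atom<X_MASK|V_MASK>,
    // a "charge" style instantiates pack_atom<X_MASK|V_MASK|Q_MASK>.
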
diff --git a/lib/cuda/atom_vec_angle_cuda.cu b/lib/cuda/atom_vec_angle_cuda.cu
deleted file mode 100644
index a11d9adbe4..0000000000
--- a/lib/cuda/atom_vec_angle_cuda.cu
+++ /dev/null
@@ -1,85 +0,0 @@
-/* ----------------------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-
-   Original Version:
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   See the README file in the top-level LAMMPS directory.
-
-   -----------------------------------------------------------------------
-
-   USER-CUDA Package and associated modifications:
-   https://sourceforge.net/projects/lammpscuda/
-
-   Christian Trott, christian.trott@tu-ilmenau.de
-   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
-   Theoretical Physics II, University of Technology Ilmenau, Germany
-
-   See the README file in the USER-CUDA directory.
-
-   This software is distributed under the GNU General Public License.
-------------------------------------------------------------------------- */
-
-const unsigned int ANGLE_DATA_MASK=X_MASK|V_MASK|F_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|IMAGE_MASK|MOLECULE_MASK;
-
-#include "atom_vec_angle_cuda_cu.h"
-
-void Cuda_AtomVecAngleCuda_Init(cuda_shared_data* sdata)
-{
-  return Cuda_AtomVecCuda_Init<ANGLE_DATA_MASK>(sdata);
-}
-
-int Cuda_AtomVecAngleCuda_PackExchangeList(cuda_shared_data* sdata,int n,int dim,void* buf_send)
-{
-  const unsigned int data_mask=X_MASK|V_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|IMAGE_MASK|MOLECULE_MASK;
-  return Cuda_AtomVecCuda_PackExchangeList<data_mask>(sdata,n,dim,buf_send);
-}
-
-int Cuda_AtomVecAngleCuda_PackExchange(cuda_shared_data* sdata,int nsend,void* buf_send,void* copylist)
-{
-  const unsigned int data_mask=X_MASK|V_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|IMAGE_MASK|MOLECULE_MASK;
-  return Cuda_AtomVecCuda_PackExchange<data_mask>(sdata,nsend,buf_send,copylist);
-}
-
-int Cuda_AtomVecAngleCuda_UnpackExchange(cuda_shared_data* sdata,int nsend,void* buf_send,void* copylist)
-{
-  const unsigned int data_mask=X_MASK|V_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|IMAGE_MASK|MOLECULE_MASK;
-  return Cuda_AtomVecCuda_UnpackExchange<data_mask>(sdata,nsend,buf_send,copylist);
-}
-
-int Cuda_AtomVecAngleCuda_PackBorder(cuda_shared_data* sdata,int nsend,int iswap,void* buf_send,int* pbc,int pbc_flag)
-{
-  const unsigned int data_mask=X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|MOLECULE_MASK;
-  return Cuda_AtomVecCuda_PackBorder<data_mask>(sdata,nsend,iswap,buf_send,pbc,pbc_flag);
-}
-
-int Cuda_AtomVecAngleCuda_PackBorderVel(cuda_shared_data* sdata,int nsend,int iswap,void* buf_send,int* pbc,int pbc_flag)
-{
-  const unsigned int data_mask=X_MASK|V_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|MOLECULE_MASK;
-  return Cuda_AtomVecCuda_PackBorder<data_mask>(sdata,nsend,iswap,buf_send,pbc,pbc_flag);
-}
-
-int Cuda_AtomVecAngleCuda_PackBorder_Self(cuda_shared_data* sdata,int n,int iswap,int first,int* pbc,int pbc_flag)
-{
-  const unsigned int data_mask=X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|MOLECULE_MASK;
-  return Cuda_AtomVecCuda_PackBorder_Self<data_mask>(sdata,n,iswap,first,pbc,pbc_flag);
-}
-
-int Cuda_AtomVecAngleCuda_PackBorderVel_Self(cuda_shared_data* sdata,int n,int iswap,int first,int* pbc,int pbc_flag)
-{
-  const unsigned int data_mask=X_MASK|V_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|MOLECULE_MASK;
-  return Cuda_AtomVecCuda_PackBorder_Self<data_mask>(sdata,n,iswap,first,pbc,pbc_flag);
-}
-
-int Cuda_AtomVecAngleCuda_UnpackBorder(cuda_shared_data* sdata,int n,int first,void* buf_recv)
-{
-  const unsigned int data_mask=X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|MOLECULE_MASK;
-  return Cuda_AtomVecCuda_UnpackBorder<data_mask>(sdata,n,first,buf_recv);
-}
-
-int Cuda_AtomVecAngleCuda_UnpackBorderVel(cuda_shared_data* sdata,int n,int first,void* buf_recv)
-{
-  const unsigned int data_mask=X_MASK|V_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|MOLECULE_MASK;
-  return Cuda_AtomVecCuda_UnpackBorder<data_mask>(sdata,n,first,buf_recv);
-}
diff --git a/lib/cuda/atom_vec_angle_cuda_cu.h b/lib/cuda/atom_vec_angle_cuda_cu.h
deleted file mode 100644
index d8f5a2b9a4..0000000000
--- a/lib/cuda/atom_vec_angle_cuda_cu.h
+++ /dev/null
@@ -1,15 +0,0 @@
-#ifndef ATOM_VEC_ANGLE_CUDA_CU_H_
-#define ATOM_VEC_ANGLE_CUDA_CU_H_
-
-extern "C" void Cuda_AtomVecAngleCuda_Init(cuda_shared_data* sdata);
-extern "C" int Cuda_AtomVecAngleCuda_PackExchangeList(cuda_shared_data* sdata,int n,int dim,void* buf_send);
-extern "C" int Cuda_AtomVecAngleCuda_PackExchange(cuda_shared_data* sdata,int nsend,void* buf_send,void* copylist);
-extern "C" int Cuda_AtomVecAngleCuda_UnpackExchange(cuda_shared_data* sdata,int nsend,void* buf_send,void* copylist);
-extern "C" int Cuda_AtomVecAngleCuda_PackBorder(cuda_shared_data* sdata,int nsend,int iswap,void* buf_send,int* pbc,int pbc_flag); -extern "C" int Cuda_AtomVecAngleCuda_PackBorderVel(cuda_shared_data* sdata,int n,int iswap,void* buf_send,int* pbc,int pbc_flag); -extern "C" int Cuda_AtomVecAngleCuda_PackBorder_Self(cuda_shared_data* sdata,int n,int iswap,int first,int* pbc,int pbc_flag); -extern "C" int Cuda_AtomVecAngleCuda_PackBorderVel_Self(cuda_shared_data* sdata,int n,int iswap,int first,int* pbc,int pbc_flag); -extern "C" int Cuda_AtomVecAngleCuda_UnpackBorder(cuda_shared_data* sdata,int n,int first,void* buf_recv); -extern "C" int Cuda_AtomVecAngleCuda_UnpackBorderVel(cuda_shared_data* sdata,int n,int first,void* buf_recv); - -#endif /*ATOM_VEC_ANGLE2_CUDA_CU_H_*/ diff --git a/lib/cuda/atom_vec_atomic_cuda.cu b/lib/cuda/atom_vec_atomic_cuda.cu deleted file mode 100644 index 0a75de2754..0000000000 --- a/lib/cuda/atom_vec_atomic_cuda.cu +++ /dev/null @@ -1,85 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - - Original Version: - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - See the README file in the top-level LAMMPS directory. - - ----------------------------------------------------------------------- - - USER-CUDA Package and associated modifications: - https://sourceforge.net/projects/lammpscuda/ - - Christian Trott, christian.trott@tu-ilmenau.de - Lars Winterfeld, lars.winterfeld@tu-ilmenau.de - Theoretical Physics II, University of Technology Ilmenau, Germany - - See the README file in the USER-CUDA directory. - - This software is distributed under the GNU General Public License. -------------------------------------------------------------------------- */ - -const unsigned int ATOMIC_DATA_MASK=X_MASK|V_MASK|F_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|IMAGE_MASK; - -#include "atom_vec_atomic_cuda_cu.h" - -void Cuda_AtomVecAtomicCuda_Init(cuda_shared_data* sdata) -{ - return Cuda_AtomVecCuda_Init(sdata); -} - -int Cuda_AtomVecAtomicCuda_PackExchangeList(cuda_shared_data* sdata,int n,int dim,void* buf_send) -{ - const unsigned int data_mask=X_MASK|V_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|IMAGE_MASK; - return Cuda_AtomVecCuda_PackExchangeList(sdata,n,dim,buf_send); -} - -int Cuda_AtomVecAtomicCuda_PackExchange(cuda_shared_data* sdata,int nsend,void* buf_send,void* copylist) -{ - const unsigned int data_mask=X_MASK|V_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|IMAGE_MASK; - return Cuda_AtomVecCuda_PackExchange(sdata,nsend,buf_send,copylist); -} - -int Cuda_AtomVecAtomicCuda_UnpackExchange(cuda_shared_data* sdata,int nsend,void* buf_send,void* copylist) -{ - const unsigned int data_mask=X_MASK|V_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|IMAGE_MASK; - return Cuda_AtomVecCuda_UnpackExchange(sdata,nsend,buf_send,copylist); -} - -int Cuda_AtomVecAtomicCuda_PackBorder(cuda_shared_data* sdata,int nsend,int iswap,void* buf_send,int* pbc,int pbc_flag) -{ - const unsigned int data_mask=X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK; - return Cuda_AtomVecCuda_PackBorder(sdata,nsend,iswap,buf_send,pbc,pbc_flag); -} - -int Cuda_AtomVecAtomicCuda_PackBorderVel(cuda_shared_data* sdata,int nsend,int iswap,void* buf_send,int* pbc,int pbc_flag) -{ - const unsigned int data_mask=X_MASK|V_MASK|TAG_MASK|TYPE_MASK|MASK_MASK; - return Cuda_AtomVecCuda_PackBorder(sdata,nsend,iswap,buf_send,pbc,pbc_flag); -} - -int Cuda_AtomVecAtomicCuda_PackBorder_Self(cuda_shared_data* sdata,int n,int iswap,int 
first,int* pbc,int pbc_flag) -{ - const unsigned int data_mask=X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK; - return Cuda_AtomVecCuda_PackBorder_Self(sdata,n,iswap,first,pbc,pbc_flag); -} - -int Cuda_AtomVecAtomicCuda_PackBorderVel_Self(cuda_shared_data* sdata,int n,int iswap,int first,int* pbc,int pbc_flag) -{ - const unsigned int data_mask=X_MASK|V_MASK|TAG_MASK|TYPE_MASK|MASK_MASK; - return Cuda_AtomVecCuda_PackBorder_Self(sdata,n,iswap,first,pbc,pbc_flag); -} - -int Cuda_AtomVecAtomicCuda_UnpackBorder(cuda_shared_data* sdata,int n,int first,void* buf_recv) -{ - const unsigned int data_mask=X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK; - return Cuda_AtomVecCuda_UnpackBorder(sdata,n,first,buf_recv); -} - -int Cuda_AtomVecAtomicCuda_UnpackBorderVel(cuda_shared_data* sdata,int n,int first,void* buf_recv) -{ - const unsigned int data_mask=X_MASK|V_MASK|TAG_MASK|TYPE_MASK|MASK_MASK; - return Cuda_AtomVecCuda_UnpackBorder(sdata,n,first,buf_recv); -} diff --git a/lib/cuda/atom_vec_atomic_cuda_cu.h b/lib/cuda/atom_vec_atomic_cuda_cu.h deleted file mode 100644 index 8e776308e0..0000000000 --- a/lib/cuda/atom_vec_atomic_cuda_cu.h +++ /dev/null @@ -1,15 +0,0 @@ -#ifndef ATOM_VEC_ATOMIC_CUDA_CU_H_ -#define ATOM_VEC_ATOMIC_CUDA_CU_H_ - -extern "C" void Cuda_AtomVecAtomicCuda_Init(cuda_shared_data* sdata); -extern "C" int Cuda_AtomVecAtomicCuda_PackExchangeList(cuda_shared_data* sdata,int n,int dim,void* buf_send); -extern "C" int Cuda_AtomVecAtomicCuda_PackExchange(cuda_shared_data* sdata,int nsend,void* buf_send,void* copylist); -extern "C" int Cuda_AtomVecAtomicCuda_UnpackExchange(cuda_shared_data* sdata,int nsend,void* buf_send,void* copylist); -extern "C" int Cuda_AtomVecAtomicCuda_PackBorder(cuda_shared_data* sdata,int nsend,int iswap,void* buf_send,int* pbc,int pbc_flag); -extern "C" int Cuda_AtomVecAtomicCuda_PackBorderVel(cuda_shared_data* sdata,int n,int iswap,void* buf_send,int* pbc,int pbc_flag); -extern "C" int Cuda_AtomVecAtomicCuda_PackBorder_Self(cuda_shared_data* sdata,int n,int iswap,int first,int* pbc,int pbc_flag); -extern "C" int Cuda_AtomVecAtomicCuda_PackBorderVel_Self(cuda_shared_data* sdata,int n,int iswap,int first,int* pbc,int pbc_flag); -extern "C" int Cuda_AtomVecAtomicCuda_UnpackBorder(cuda_shared_data* sdata,int n,int first,void* buf_recv); -extern "C" int Cuda_AtomVecAtomicCuda_UnpackBorderVel(cuda_shared_data* sdata,int n,int first,void* buf_recv); - -#endif /*ATOM_VEC_ATOMIC2_CUDA_CU_H_*/ diff --git a/lib/cuda/atom_vec_charge_cuda.cu b/lib/cuda/atom_vec_charge_cuda.cu deleted file mode 100644 index a78ffb9de0..0000000000 --- a/lib/cuda/atom_vec_charge_cuda.cu +++ /dev/null @@ -1,85 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - - Original Version: - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - See the README file in the top-level LAMMPS directory. - - ----------------------------------------------------------------------- - - USER-CUDA Package and associated modifications: - https://sourceforge.net/projects/lammpscuda/ - - Christian Trott, christian.trott@tu-ilmenau.de - Lars Winterfeld, lars.winterfeld@tu-ilmenau.de - Theoretical Physics II, University of Technology Ilmenau, Germany - - See the README file in the USER-CUDA directory. - - This software is distributed under the GNU General Public License. 
-------------------------------------------------------------------------- */ - -const unsigned int CHARGE_DATA_MASK=X_MASK|V_MASK|F_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|IMAGE_MASK|Q_MASK; - -#include "atom_vec_charge_cuda_cu.h" - -void Cuda_AtomVecChargeCuda_Init(cuda_shared_data* sdata) -{ - return Cuda_AtomVecCuda_Init(sdata); -} - -int Cuda_AtomVecChargeCuda_PackExchangeList(cuda_shared_data* sdata,int n,int dim,void* buf_send) -{ - const unsigned int data_mask=X_MASK|V_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|IMAGE_MASK|Q_MASK; - return Cuda_AtomVecCuda_PackExchangeList(sdata,n,dim,buf_send); -} - -int Cuda_AtomVecChargeCuda_PackExchange(cuda_shared_data* sdata,int nsend,void* buf_send,void* copylist) -{ - const unsigned int data_mask=X_MASK|V_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|IMAGE_MASK|Q_MASK; - return Cuda_AtomVecCuda_PackExchange(sdata,nsend,buf_send,copylist); -} - -int Cuda_AtomVecChargeCuda_UnpackExchange(cuda_shared_data* sdata,int nsend,void* buf_send,void* copylist) -{ - const unsigned int data_mask=X_MASK|V_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|IMAGE_MASK|Q_MASK; - return Cuda_AtomVecCuda_UnpackExchange(sdata,nsend,buf_send,copylist); -} - -int Cuda_AtomVecChargeCuda_PackBorder(cuda_shared_data* sdata,int nsend,int iswap,void* buf_send,int* pbc,int pbc_flag) -{ - const unsigned int data_mask=X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|Q_MASK; - return Cuda_AtomVecCuda_PackBorder(sdata,nsend,iswap,buf_send,pbc,pbc_flag); -} - -int Cuda_AtomVecChargeCuda_PackBorderVel(cuda_shared_data* sdata,int nsend,int iswap,void* buf_send,int* pbc,int pbc_flag) -{ - const unsigned int data_mask=X_MASK|V_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|Q_MASK; - return Cuda_AtomVecCuda_PackBorder(sdata,nsend,iswap,buf_send,pbc,pbc_flag); -} - -int Cuda_AtomVecChargeCuda_PackBorder_Self(cuda_shared_data* sdata,int n,int iswap,int first,int* pbc,int pbc_flag) -{ - const unsigned int data_mask=X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|Q_MASK; - return Cuda_AtomVecCuda_PackBorder_Self(sdata,n,iswap,first,pbc,pbc_flag); -} - -int Cuda_AtomVecChargeCuda_PackBorderVel_Self(cuda_shared_data* sdata,int n,int iswap,int first,int* pbc,int pbc_flag) -{ - const unsigned int data_mask=X_MASK|V_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|Q_MASK; - return Cuda_AtomVecCuda_PackBorder_Self(sdata,n,iswap,first,pbc,pbc_flag); -} - -int Cuda_AtomVecChargeCuda_UnpackBorder(cuda_shared_data* sdata,int n,int first,void* buf_recv) -{ - const unsigned int data_mask=X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|Q_MASK; - return Cuda_AtomVecCuda_UnpackBorder(sdata,n,first,buf_recv); -} - -int Cuda_AtomVecChargeCuda_UnpackBorderVel(cuda_shared_data* sdata,int n,int first,void* buf_recv) -{ - const unsigned int data_mask=X_MASK|V_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|Q_MASK; - return Cuda_AtomVecCuda_UnpackBorder(sdata,n,first,buf_recv); -} diff --git a/lib/cuda/atom_vec_charge_cuda_cu.h b/lib/cuda/atom_vec_charge_cuda_cu.h deleted file mode 100644 index 137b001847..0000000000 --- a/lib/cuda/atom_vec_charge_cuda_cu.h +++ /dev/null @@ -1,15 +0,0 @@ -#ifndef ATOM_VEC_CHARGE_CUDA_CU_H_ -#define ATOM_VEC_CHARGE_CUDA_CU_H_ - -extern "C" void Cuda_AtomVecChargeCuda_Init(cuda_shared_data* sdata); -extern "C" int Cuda_AtomVecChargeCuda_PackExchangeList(cuda_shared_data* sdata,int n,int dim,void* buf_send); -extern "C" int Cuda_AtomVecChargeCuda_PackExchange(cuda_shared_data* sdata,int nsend,void* buf_send,void* copylist); -extern "C" int Cuda_AtomVecChargeCuda_UnpackExchange(cuda_shared_data* sdata,int nsend,void* buf_send,void* copylist); -extern "C" int 
Cuda_AtomVecChargeCuda_PackBorder(cuda_shared_data* sdata,int nsend,int iswap,void* buf_send,int* pbc,int pbc_flag);
-extern "C" int Cuda_AtomVecChargeCuda_PackBorderVel(cuda_shared_data* sdata,int n,int iswap,void* buf_send,int* pbc,int pbc_flag);
-extern "C" int Cuda_AtomVecChargeCuda_PackBorder_Self(cuda_shared_data* sdata,int n,int iswap,int first,int* pbc,int pbc_flag);
-extern "C" int Cuda_AtomVecChargeCuda_PackBorderVel_Self(cuda_shared_data* sdata,int n,int iswap,int first,int* pbc,int pbc_flag);
-extern "C" int Cuda_AtomVecChargeCuda_UnpackBorder(cuda_shared_data* sdata,int n,int first,void* buf_recv);
-extern "C" int Cuda_AtomVecChargeCuda_UnpackBorderVel(cuda_shared_data* sdata,int n,int first,void* buf_recv);
-
-#endif /*ATOM_VEC_CHARGE2_CUDA_CU_H_*/
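The next file, atom_vec_cuda.cu, implements the generic templated routines that all the style wrappers forward to. Its staging buffer is managed with a grow-only strategy: Cuda_AtomVecCuda_UpdateBuffer reallocates the shared device buffer only when a transfer needs more space than is currently allocated, and increments buffer_new so dependent code re-reads the pointer. A stripped-down sketch of that strategy, using plain CUDA runtime calls instead of the package's CudaWrapper_* helpers (the struct and function names here are hypothetical):

    #include <cuda_runtime.h>

    struct StagingBuffer {
      void* ptr = nullptr;
      int size = 0;
      int generation = 0;   // incremented whenever ptr changes
    };

    void update_buffer(StagingBuffer& b, int needed) {
      if (b.size < needed) {        // grow-only: never shrink
        cudaFree(b.ptr);            // old contents are transient staging data
        cudaMalloc(&b.ptr, needed); // grow to the requested size
        b.size = needed;
        b.generation++;             // consumers must refresh their copy of ptr
      }
    }
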
diff --git a/lib/cuda/atom_vec_cuda.cu b/lib/cuda/atom_vec_cuda.cu
deleted file mode 100644
index 3bee50d6ef..0000000000
--- a/lib/cuda/atom_vec_cuda.cu
+++ /dev/null
@@ -1,564 +0,0 @@
-/* ----------------------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-
-   Original Version:
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   See the README file in the top-level LAMMPS directory.
-
-   -----------------------------------------------------------------------
-
-   USER-CUDA Package and associated modifications:
-   https://sourceforge.net/projects/lammpscuda/
-
-   Christian Trott, christian.trott@tu-ilmenau.de
-   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
-   Theoretical Physics II, University of Technology Ilmenau, Germany
-
-   See the README file in the USER-CUDA directory.
-
-   This software is distributed under the GNU General Public License.
-------------------------------------------------------------------------- */
-
-#include <stdio.h>
-#define MY_PREFIX atom_vec_cuda
-#include "cuda_shared.h"
-#include "cuda_common.h"
-#include "cuda_wrapper_cu.h"
-#include "crm_cuda_utils.cu"
-
-#include "atom_vec_cuda_kernel.cu"
-
-int AtomVecCuda_CountDataItems(unsigned int data_mask)
-{
-  int n=0;
-  if(data_mask & X_MASK) n+=3;
-  if(data_mask & V_MASK) n+=3;
-  if(data_mask & F_MASK) n+=3;
-  if(data_mask & TAG_MASK) n++;
-  if(data_mask & TYPE_MASK) n++;
-  if(data_mask & MASK_MASK) n++;
-  if(data_mask & IMAGE_MASK) n++;
-  if(data_mask & Q_MASK) n++;
-  if(data_mask & MOLECULE_MASK) n++;
-  if(data_mask & RMASS_MASK) n++;
-  if(data_mask & RADIUS_MASK) n++;
-  if(data_mask & DENSITY_MASK) n++;
-  if(data_mask & OMEGA_MASK) n+=3;
-  if(data_mask & TORQUE_MASK) n++;
-
-  //if(data_mask & NSPECIAL_MASK) n+=3;
-  return n;
-}
-
-void Cuda_AtomVecCuda_UpdateBuffer(cuda_shared_data* sdata,int size)
-{
-  if(sdata->buffersize<size)
-  {
-    MYDBG(printf("Resize buffer at %p with %i kB\n",sdata->buffer,sdata->buffersize);)
-    CudaWrapper_FreeCudaData(sdata->buffer,sdata->buffersize);
-    sdata->buffer = CudaWrapper_AllocCudaData(size);
-    sdata->buffersize=size;
-    sdata->buffer_new++;
-    MYDBG(printf("New buffer at %p with %i kB\n",sdata->buffer,sdata->buffersize);)
-  }
-  cudaMemcpyToSymbol(MY_CONST(buffer), & sdata->buffer, sizeof(int*) );
-}
-
-template <const unsigned int data_mask>
-void Cuda_AtomVecCuda_UpdateNmax(cuda_shared_data* sdata)
-{
-  cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) );
-  cudaMemcpyToSymbol(MY_CONST(nmax) , & sdata->atom.nmax , sizeof(int) );
-  cudaMemcpyToSymbol(MY_CONST(x) , & sdata->atom.x .dev_data, sizeof(X_FLOAT*) );
-  cudaMemcpyToSymbol(MY_CONST(v) , & sdata->atom.v .dev_data, sizeof(V_FLOAT*) );
-  cudaMemcpyToSymbol(MY_CONST(f) , & sdata->atom.f .dev_data, sizeof(F_FLOAT*) );
-  cudaMemcpyToSymbol(MY_CONST(tag) , & sdata->atom.tag .dev_data, sizeof(int*) );
-  cudaMemcpyToSymbol(MY_CONST(type) , & sdata->atom.type .dev_data, sizeof(int*) );
-  cudaMemcpyToSymbol(MY_CONST(mask) , & sdata->atom.mask .dev_data, sizeof(int*) );
-  cudaMemcpyToSymbol(MY_CONST(image) , & sdata->atom.image.dev_data, sizeof(int*) );
-  if(data_mask & Q_MASK) cudaMemcpyToSymbolAsync(MY_CONST(q) , & sdata->atom.q .dev_data, sizeof(F_FLOAT*) );
-  if(data_mask & MOLECULE_MASK) cudaMemcpyToSymbolAsync(MY_CONST(molecule) , & sdata->atom.molecule.dev_data, sizeof(int*) );
-  if(data_mask & RADIUS_MASK) cudaMemcpyToSymbolAsync(MY_CONST(radius) , & sdata->atom.radius.dev_data, sizeof(int*) );
-  if(data_mask & DENSITY_MASK) cudaMemcpyToSymbolAsync(MY_CONST(density) , & sdata->atom.density.dev_data, sizeof(int*) );
-  if(data_mask & RMASS_MASK) cudaMemcpyToSymbolAsync(MY_CONST(rmass) , & sdata->atom.rmass.dev_data, sizeof(int*) );
-  if(data_mask & OMEGA_MASK) cudaMemcpyToSymbolAsync(MY_CONST(omega) , & sdata->atom.omega.dev_data, sizeof(int*) );
-  //if(data_mask & NSPECIAL_MASK) cudaMemcpyToSymbol(MY_CONST(nspecial) , & sdata->atom.nspecial.dev_data, sizeof(int*) );
-  cudaMemcpyToSymbol(MY_CONST(flag) , & sdata->flag, sizeof(int*) );
-}
-
-template <const unsigned int data_mask>
-void Cuda_AtomVecCuda_Init(cuda_shared_data* sdata)
-{
-  MYDBG( printf("# CUDA: Cuda_AtomVecCuda_Init ... start\n"); )
-  if(sdata->atom.update_nmax)
-    Cuda_AtomVecCuda_UpdateNmax<data_mask>(sdata);
-  if(sdata->atom.update_nlocal)
-    cudaMemcpyToSymbolAsync(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) );
-  MYDBG( printf("# CUDA: Cuda_AtomVecCuda_Init ... post Nmax\n"); )
-  cudaMemcpyToSymbolAsync(MY_CONST(prd) , sdata->domain.prd, 3*sizeof(X_FLOAT));
-  cudaMemcpyToSymbolAsync(MY_CONST(sublo) , & sdata->domain.sublo, 3*sizeof(X_FLOAT) );
-  cudaMemcpyToSymbolAsync(MY_CONST(subhi) , & sdata->domain.subhi, 3*sizeof(X_FLOAT) );
-  cudaMemcpyToSymbolAsync(MY_CONST(flag) , & sdata->flag, sizeof(int*) );
-  cudaThreadSynchronize();
-  MYDBG( printf("# CUDA: Cuda_AtomVecCuda_Init ...
end\n"); ) -} - - -template -int Cuda_AtomVecCuda_PackComm(cuda_shared_data* sdata,int n,int iswap,void* buf_send,int* pbc,int pbc_flag) -{ - - timespec time1,time2; - if(sdata->atom.update_nmax) - Cuda_AtomVecCuda_UpdateNmax(sdata); - if(sdata->atom.update_nlocal) - cudaMemcpyToSymbolAsync(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); - int n_data_items=AtomVecCuda_CountDataItems(data_mask); - int size=(n*n_data_items)*sizeof(X_FLOAT); - if(sdata->buffer_new or (size>sdata->buffersize)) - Cuda_AtomVecCuda_UpdateBuffer(sdata,size); - - X_FLOAT dx=0.0; - X_FLOAT dy=0.0; - X_FLOAT dz=0.0; - if (pbc_flag != 0) { - if (sdata->domain.triclinic == 0) { - dx = pbc[0]*sdata->domain.prd[0]; - dy = pbc[1]*sdata->domain.prd[1]; - dz = pbc[2]*sdata->domain.prd[2]; - } else { - dx = pbc[0]*sdata->domain.prd[0] + pbc[5]*sdata->domain.xy + pbc[4]*sdata->domain.xz; - dy = pbc[1]*sdata->domain.prd[1] + pbc[3]*sdata->domain.yz; - dz = pbc[2]*sdata->domain.prd[2]; - }} - - int3 layout=getgrid(n); - dim3 threads(layout.z, 1, 1); - dim3 grid(layout.x, layout.y, 1); - - if(sdata->atom.nlocal>0) - { - cudaMemset( sdata->flag,0,sizeof(int)); - -clock_gettime(CLOCK_REALTIME,&time1); - - void* buf=sdata->overlap_comm?sdata->comm.buf_send_dev[iswap]:sdata->buffer; - Cuda_AtomVecCuda_PackComm_Kernel<<>>((int*) sdata->comm.sendlist.dev_data,n - ,sdata->comm.maxlistlength,iswap,dx,dy,dz,buf); - cudaThreadSynchronize(); - -clock_gettime(CLOCK_REALTIME,&time2); -sdata->cuda_timings.comm_forward_kernel_pack+= - time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000; - - CUT_CHECK_ERROR("Cuda_AtomVecCuda_PackComm: Kernel execution failed"); - if(not sdata->overlap_comm) - cudaMemcpy(buf_send, sdata->buffer, n*n_data_items*sizeof(X_FLOAT), cudaMemcpyDeviceToHost); - //cudaMemcpy(buf_send, sdata->comm.buf_send_dev[iswap], n*3*sizeof(X_FLOAT), cudaMemcpyDeviceToHost); - -clock_gettime(CLOCK_REALTIME,&time1); -sdata->cuda_timings.comm_forward_download+= - time1.tv_sec-time2.tv_sec+1.0*(time1.tv_nsec-time2.tv_nsec)/1000000000; - - int aflag; - cudaMemcpy(&aflag, sdata->flag, sizeof(int), cudaMemcpyDeviceToHost); - if(aflag!=0) printf("aflag PackComm: %i\n",aflag); - CUT_CHECK_ERROR("Cuda_AtomVecCuda_PackComm: Kernel execution failed"); - - } - return n_data_items*n; -} - - -template -int Cuda_AtomVecCuda_PackComm_Self(cuda_shared_data* sdata,int n,int iswap,int first,int* pbc,int pbc_flag) -{ - MYDBG(printf(" # CUDA: AtomVecCuda_PackComm_Self\n");) - timespec time1,time2; - if(sdata->atom.update_nmax) - Cuda_AtomVecCuda_UpdateNmax(sdata); - if(sdata->atom.update_nlocal) - cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); - int n_data_items=AtomVecCuda_CountDataItems(data_mask); - int size=(n*n_data_items)*sizeof(X_FLOAT); - if(sdata->buffer_new or (size>sdata->buffersize)) - Cuda_AtomVecCuda_UpdateBuffer(sdata,size); - static int count=-1; - count++; - X_FLOAT dx=0.0; - X_FLOAT dy=0.0; - X_FLOAT dz=0.0; - if (pbc_flag != 0) { - if (sdata->domain.triclinic == 0) { - dx = pbc[0]*sdata->domain.prd[0]; - dy = pbc[1]*sdata->domain.prd[1]; - dz = pbc[2]*sdata->domain.prd[2]; - } else { - dx = pbc[0]*sdata->domain.prd[0] + pbc[5]*sdata->domain.xy + pbc[4]*sdata->domain.xz; - dy = pbc[1]*sdata->domain.prd[1] + pbc[3]*sdata->domain.yz; - dz = pbc[2]*sdata->domain.prd[2]; - }} - - - - int3 layout=getgrid(n); - dim3 threads(layout.z, 1, 1); - dim3 grid(layout.x, layout.y, 1); - if(sdata->atom.nlocal>0) - { - -clock_gettime(CLOCK_REALTIME,&time1); - 
CUT_CHECK_ERROR("Cuda_AtomVecCuda_PackComm_Self:Pre Kernel execution failed"); - - Cuda_AtomVecCuda_PackComm_Self_Kernel<<>>((int*) sdata->comm.sendlist.dev_data,n,sdata->comm.maxlistlength,iswap,dx,dy,dz,first); - cudaThreadSynchronize(); - -clock_gettime(CLOCK_REALTIME,&time2); -sdata->cuda_timings.comm_forward_kernel_self+= - time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000; - - CUT_CHECK_ERROR("Cuda_AtomVecCuda_PackComm_Self: Kernel execution failed"); - } - - return n_data_items*n; -} - - -template -void Cuda_AtomVecCuda_UnpackComm(cuda_shared_data* sdata,int n,int first,void* buf_recv,int iswap) -{ - timespec time1,time2; - - if(sdata->atom.update_nmax) - Cuda_AtomVecCuda_UpdateNmax(sdata); - if(sdata->atom.update_nlocal) - cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); - int n_data_items=AtomVecCuda_CountDataItems(data_mask); - int size=(n*n_data_items)*sizeof(X_FLOAT); - if(sdata->buffer_new or (size>sdata->buffersize)) - Cuda_AtomVecCuda_UpdateBuffer(sdata,size); - - int3 layout=getgrid(n); - dim3 threads(layout.z, 1, 1); - dim3 grid(layout.x, layout.y, 1); - if(sdata->atom.nlocal>0) - { -clock_gettime(CLOCK_REALTIME,&time1); - if(not sdata->overlap_comm||iswap<0) - cudaMemcpy(sdata->buffer,(void*)buf_recv, n_data_items*n*sizeof(X_FLOAT), cudaMemcpyHostToDevice); - -clock_gettime(CLOCK_REALTIME,&time2); -sdata->cuda_timings.comm_forward_upload+= - time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000; - void* buf=(sdata->overlap_comm&&iswap>=0)?sdata->comm.buf_recv_dev[iswap]:sdata->buffer; - Cuda_AtomVecCuda_UnpackComm_Kernel<<>>(n,first,buf); - cudaThreadSynchronize(); - -clock_gettime(CLOCK_REALTIME,&time1); -sdata->cuda_timings.comm_forward_kernel_unpack+= - time1.tv_sec-time2.tv_sec+1.0*(time1.tv_nsec-time2.tv_nsec)/1000000000; - - CUT_CHECK_ERROR("Cuda_AtomVecCuda_UnpackComm: Kernel execution failed"); - - } -} - -template -int Cuda_AtomVecCuda_PackExchangeList(cuda_shared_data* sdata,int n,int dim,void* buf_send) -{ - MYDBG( printf("# CUDA: Cuda_AtomVecCuda_PackExchangeList ... 
start dim %i \n",dim); ) - CUT_CHECK_ERROR("Cuda_AtomVecCuda_PackExchangeList: pre Kernel execution failed"); - cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); - Cuda_AtomVecCuda_Init(sdata); - int size=n*sizeof(double); - if(sdata->buffer_new or (size>sdata->buffersize)) - Cuda_AtomVecCuda_UpdateBuffer(sdata,size); - - cudaMemset((int*) (sdata->buffer),0,sizeof(int)); - - int3 layout=getgrid(sdata->atom.nlocal,sizeof(int),256,true); - dim3 threads(layout.z, 1, 1); - dim3 grid(layout.x, layout.y, 1); - - timespec time1,time2; - clock_gettime(CLOCK_REALTIME,&time1); - - Cuda_AtomVecCuda_PackExchangeList_Kernel<<>>(n-1,dim); - cudaThreadSynchronize(); - CUT_CHECK_ERROR("Cuda_AtomVecCuda_PackExchangeList: Kernel execution failed"); - - clock_gettime(CLOCK_REALTIME,&time2); - sdata->cuda_timings.comm_exchange_kernel_pack+= - time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000; - - cudaMemcpy(buf_send, sdata->buffer, sizeof(double), cudaMemcpyDeviceToHost); - int return_value = ((int*) buf_send)[0]; - if(n>1+return_value) - cudaMemcpy(buf_send, sdata->buffer, (1+return_value)*sizeof(double), cudaMemcpyDeviceToHost); - CUT_CHECK_ERROR("Cuda_AtomVecCuda_PackExchangeList: return copy failed"); - -clock_gettime(CLOCK_REALTIME,&time1); -sdata->cuda_timings.comm_exchange_download+= - time1.tv_sec-time2.tv_sec+1.0*(time1.tv_nsec-time2.tv_nsec)/1000000000; - - MYDBG( printf("# CUDA: Cuda_AtomVecCuda_PackExchangeList ... done\n"); ) - return return_value; -} - -template -int Cuda_AtomVecCuda_PackExchange(cuda_shared_data* sdata,int nsend,void* buf_send,void* copylist) -{ - MYDBG( printf("# CUDA: Cuda_AtomVecCuda_PackExchange ... start \n"); ) - if(sdata->atom.update_nmax) - Cuda_AtomVecCuda_UpdateNmax(sdata); - //if(sdata->atom.update_nlocal) - cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); - - int n_data_items=AtomVecCuda_CountDataItems(data_mask)+1; - int size=(nsend*n_data_items+1)*sizeof(double); - if(sdata->buffer_new or (size>sdata->buffersize)) - Cuda_AtomVecCuda_UpdateBuffer(sdata,size); - - cudaMemset((int*) (sdata->buffer),0,sizeof(int)); - - int3 layout=getgrid(nsend,0); - dim3 threads(layout.z, 1, 1); - dim3 grid(layout.x, layout.y, 1); - - timespec time1,time2; - clock_gettime(CLOCK_REALTIME,&time1); - - Cuda_AtomVecCuda_PackExchange_Kernel<<>>(nsend,(int*) copylist); - cudaThreadSynchronize(); - CUT_CHECK_ERROR("Cuda_AtomVecCuda_PackExchange: Kernel execution failed"); - - clock_gettime(CLOCK_REALTIME,&time2); - sdata->cuda_timings.comm_exchange_kernel_pack+= - time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000; - - cudaMemcpy(buf_send, sdata->buffer, size, cudaMemcpyDeviceToHost); - - clock_gettime(CLOCK_REALTIME,&time1); - sdata->cuda_timings.comm_exchange_download+= - time1.tv_sec-time2.tv_sec+1.0*(time1.tv_nsec-time2.tv_nsec)/1000000000; - - MYDBG( printf("# CUDA: Cuda_AtomVecCuda_PackExchange ... 
done\n"); ) - return nsend*n_data_items+1; -} - - -template -int Cuda_AtomVecCuda_UnpackExchange(cuda_shared_data* sdata,int nsend,void* buf_send,void* copylist) -{ - Cuda_AtomVecCuda_UpdateNmax(sdata); - cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); - int n_data_items=AtomVecCuda_CountDataItems(data_mask)+1; - - int size=(nsend*n_data_items+1)*sizeof(double); - if(sdata->buffer_new or (size>sdata->buffersize)) - Cuda_AtomVecCuda_UpdateBuffer(sdata,size); - cudaMemcpyToSymbol(MY_CONST(flag) , & sdata->flag, sizeof(int*) ); - - cudaMemset((int*) (sdata->flag),0,sizeof(int)); - if(nsend) - { - int3 layout=getgrid(nsend,0); - dim3 threads(layout.z, 1, 1); - dim3 grid(layout.x, layout.y, 1); - if(sdata->atom.nlocal>0) - { - timespec time1,time2; - clock_gettime(CLOCK_REALTIME,&time1); - - cudaMemcpy(sdata->buffer,buf_send , size, cudaMemcpyHostToDevice); - - clock_gettime(CLOCK_REALTIME,&time2); - sdata->cuda_timings.comm_exchange_upload+= - time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000; - - Cuda_AtomVecCuda_UnpackExchange_Kernel<<>>(sdata->exchange_dim,nsend,(int*) copylist); - cudaThreadSynchronize(); - - clock_gettime(CLOCK_REALTIME,&time1); - sdata->cuda_timings.comm_exchange_kernel_unpack+= - time1.tv_sec-time2.tv_sec+1.0*(time1.tv_nsec-time2.tv_nsec)/1000000000; - - CUT_CHECK_ERROR("Cuda_AtomVecCuda_UnpackExchange: Kernel execution failed"); - } - } - int naccept; - cudaMemcpy((void*)&naccept, sdata->flag, sizeof(int), cudaMemcpyDeviceToHost); - - return naccept; -} - -template -int Cuda_AtomVecCuda_PackBorder(cuda_shared_data* sdata,int nsend,int iswap,void* buf_send,int* pbc,int pbc_flag) -{ - timespec atime1,atime2; - clock_gettime(CLOCK_REALTIME,&atime1); - if(sdata->atom.update_nmax) - Cuda_AtomVecCuda_UpdateNmax(sdata); - - if(sdata->atom.update_nlocal) - cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); - clock_gettime(CLOCK_REALTIME,&atime2); - sdata->cuda_timings.test1+= - atime2.tv_sec-atime1.tv_sec+1.0*(atime2.tv_nsec-atime1.tv_nsec)/1000000000; - - int n_data_items=AtomVecCuda_CountDataItems(data_mask); - - int size=nsend*n_data_items*sizeof(X_FLOAT); - if(sdata->buffer_new or (size>sdata->buffersize)) - Cuda_AtomVecCuda_UpdateBuffer(sdata,size); - - X_FLOAT dx=0.0; - X_FLOAT dy=0.0; - X_FLOAT dz=0.0; - if (pbc_flag != 0) { - if (sdata->domain.triclinic == 0) { - dx = pbc[0]*sdata->domain.prd[0]; - dy = pbc[1]*sdata->domain.prd[1]; - dz = pbc[2]*sdata->domain.prd[2]; - } else { - dx = pbc[0]; - dy = pbc[1]; - dz = pbc[2]; - }} - - int3 layout=getgrid(nsend); - dim3 threads(layout.z, 1, 1); - dim3 grid(layout.x, layout.y, 1); - - if(sdata->atom.nlocal>0) - { - timespec time1,time2; - clock_gettime(CLOCK_REALTIME,&time1); - - Cuda_AtomVecCuda_PackBorder_Kernel<<>>((int*) sdata->comm.sendlist.dev_data,nsend,sdata->comm.maxlistlength,iswap,dx,dy,dz); - cudaThreadSynchronize(); - - clock_gettime(CLOCK_REALTIME,&time2); - sdata->cuda_timings.comm_border_kernel_pack+= - time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000; - - cudaMemcpy(buf_send, sdata->buffer, size, cudaMemcpyDeviceToHost); - CUT_CHECK_ERROR("Cuda_AtomVecCuda_PackBorder: Kernel execution failed"); - - clock_gettime(CLOCK_REALTIME,&time1); - sdata->cuda_timings.comm_border_download+= - time1.tv_sec-time2.tv_sec+1.0*(time1.tv_nsec-time2.tv_nsec)/1000000000; - - } - return nsend*n_data_items; -} - -template -int Cuda_AtomVecCuda_PackBorder_Self(cuda_shared_data* sdata,int n,int iswap,int first,int* pbc,int pbc_flag) -{ 
- if(sdata->atom.update_nmax) - Cuda_AtomVecCuda_UpdateNmax(sdata); - - if(sdata->atom.update_nlocal) - cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); - - int n_data_items=AtomVecCuda_CountDataItems(data_mask); - - int size=n*n_data_items*sizeof(X_FLOAT); - if(sdata->buffer_new or (size>sdata->buffersize)) - Cuda_AtomVecCuda_UpdateBuffer(sdata,size); - - X_FLOAT dx=0.0; - X_FLOAT dy=0.0; - X_FLOAT dz=0.0; - if (pbc_flag != 0) { - if (sdata->domain.triclinic == 0) { - dx = pbc[0]*sdata->domain.prd[0]; - dy = pbc[1]*sdata->domain.prd[1]; - dz = pbc[2]*sdata->domain.prd[2]; - } else { - dx = pbc[0]; - dy = pbc[1]; - dz = pbc[2]; - }} - - int3 layout=getgrid(n); - dim3 threads(layout.z, 1, 1); - dim3 grid(layout.x, layout.y, 1); - if(sdata->atom.nlocal>0) - { - timespec time1,time2; - clock_gettime(CLOCK_REALTIME,&time1); - - Cuda_AtomVecCuda_PackBorder_Self_Kernel<<>>((int*) sdata->comm.sendlist.dev_data,n,sdata->comm.maxlistlength,iswap,dx,dy,dz,first); - cudaThreadSynchronize(); - - clock_gettime(CLOCK_REALTIME,&time2); - sdata->cuda_timings.comm_border_kernel_self+= - time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000; - - CUT_CHECK_ERROR("Cuda_AtomVecCuda_PackBorder_Self: Kernel execution failed"); - - } - return n*n_data_items; -} - - -template -int Cuda_AtomVecCuda_UnpackBorder(cuda_shared_data* sdata,int n,int first,void* buf_recv) -{ - timespec atime1,atime2; - clock_gettime(CLOCK_REALTIME,&atime1); - if(sdata->atom.update_nmax) - Cuda_AtomVecCuda_UpdateNmax(sdata); - - if(sdata->atom.update_nlocal) - cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); - clock_gettime(CLOCK_REALTIME,&atime2); - sdata->cuda_timings.test1+= - atime2.tv_sec-atime1.tv_sec+1.0*(atime2.tv_nsec-atime1.tv_nsec)/1000000000; - - int n_data_items=AtomVecCuda_CountDataItems(data_mask); - - int size=n*n_data_items*sizeof(X_FLOAT); - if(sdata->buffer_new or (size>sdata->buffersize)) - Cuda_AtomVecCuda_UpdateBuffer(sdata,size); - - int3 layout=getgrid(n); - dim3 threads(layout.z, 1, 1); - dim3 grid(layout.x, layout.y, 1); - if(sdata->atom.nlocal>0) - { - timespec time1,time2; - clock_gettime(CLOCK_REALTIME,&time1); - - cudaMemset((int*) (sdata->flag),0,sizeof(int)); - cudaMemcpy(sdata->buffer,(void*)buf_recv, size, cudaMemcpyHostToDevice); - - clock_gettime(CLOCK_REALTIME,&time2); - sdata->cuda_timings.comm_border_upload+= - time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000; - - Cuda_AtomVecCuda_UnpackBorder_Kernel<<>>(n,first); - cudaThreadSynchronize(); - - clock_gettime(CLOCK_REALTIME,&time1); - sdata->cuda_timings.comm_border_kernel_unpack+= - time1.tv_sec-time2.tv_sec+1.0*(time1.tv_nsec-time2.tv_nsec)/1000000000; - - cudaMemcpy(&sdata->comm.grow_flag,sdata->flag, sizeof(int), cudaMemcpyDeviceToHost); - - CUT_CHECK_ERROR("Cuda_AtomVecCuda_UnpackBorder: Kernel execution failed"); - - } - return sdata->comm.grow_flag; -} - - -#include "atom_vec_angle_cuda.cu" -#include "atom_vec_atomic_cuda.cu" -#include "atom_vec_charge_cuda.cu" -#include "atom_vec_full_cuda.cu" -//#include "atom_vec_granular_cuda.cu" diff --git a/lib/cuda/atom_vec_cuda_cu.h b/lib/cuda/atom_vec_cuda_cu.h deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/lib/cuda/atom_vec_cuda_kernel.cu b/lib/cuda/atom_vec_cuda_kernel.cu deleted file mode 100644 index 0ec079d45b..0000000000 --- a/lib/cuda/atom_vec_cuda_kernel.cu +++ /dev/null @@ -1,371 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - 
Large-scale Atomic/Molecular Massively Parallel Simulator - - Original Version: - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - See the README file in the top-level LAMMPS directory. - - ----------------------------------------------------------------------- - - USER-CUDA Package and associated modifications: - https://sourceforge.net/projects/lammpscuda/ - - Christian Trott, christian.trott@tu-ilmenau.de - Lars Winterfeld, lars.winterfeld@tu-ilmenau.de - Theoretical Physics II, University of Technology Ilmenau, Germany - - See the README file in the USER-CUDA directory. - - This software is distributed under the GNU General Public License. -------------------------------------------------------------------------- */ -#define RIMLARGER 1.000001 -#define RIMSMALLER 0.999999 -#define SMALL 1e-5 - -extern __shared__ int shared[]; - -template -__global__ void Cuda_AtomVecCuda_PackComm_Kernel(int* sendlist,int n,int maxlistlength,int iswap,X_FLOAT dx,X_FLOAT dy,X_FLOAT dz,void* buffer) -{ - int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x; - int* list=sendlist+iswap*maxlistlength; - if(i_nmax) _flag[0]=1; - int k=0; - if(data_mask & X_MASK){ - ((X_FLOAT*) buffer)[i+k*n]=_x[j] + dx; k++; - ((X_FLOAT*) buffer)[i+k*n] = _x[j+_nmax] + dy; k++; - ((X_FLOAT*) buffer)[i+k*n] = _x[j+2*_nmax] + dz; k++;} - if(data_mask & V_MASK){ - ((X_FLOAT*) buffer)[i+k*n]=_v[j]; k++; - ((X_FLOAT*) buffer)[i+k*n] = _v[j+_nmax]; k++; - ((X_FLOAT*) buffer)[i+k*n] = _v[j+2*_nmax]; k++;} - if(data_mask & OMEGA_MASK){ - ((X_FLOAT*) buffer)[i+k*n]=_omega[j]; k++; - ((X_FLOAT*) buffer)[i+k*n] = _omega[j+_nmax]; k++; - ((X_FLOAT*) buffer)[i+k*n] = _omega[j+2*_nmax]; k++;} - if(data_mask & RADIUS_MASK) ((X_FLOAT*) buffer)[i+k*n]=_radius[j]; k++; - if(data_mask & RMASS_MASK) ((X_FLOAT*) buffer)[i+k*n]=_rmass[j]; k++; - } -} - -template -__global__ void Cuda_AtomVecCuda_PackComm_Self_Kernel(int* sendlist,int n,int maxlistlength,int iswap,X_FLOAT dx,X_FLOAT dy,X_FLOAT dz,int first) -{ - int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x; - - int* list=sendlist+iswap*maxlistlength; - if(i -__global__ void Cuda_AtomVecCuda_UnpackComm_Kernel(int n,int first,void* buffer) -{ - int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x; - if(i (_x[i+dim*_nmax]); - if (xdim_tmp < _sublo[dim] || xdim_tmp >= _subhi[dim]) - { - add=true; - } - } - shared[threadIdx.x]=add?1:0; - __syncthreads(); - int nsend=0; - if(threadIdx.x==0) - { - for(int k=0;k -__global__ void Cuda_AtomVecCuda_PackExchange_Kernel(int nsend, int* copylist) -{ - double* buf=(double*) _buffer; - int k=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x; - if(k>=nsend) return; - buf=&buf[1+k]; - - int i=static_cast (buf[0]); - int j=copylist[k]; - - int m=1; - if(data_mask & X_MASK){ - buf[(m++)*nsend] = static_cast (_x[i]); - buf[(m++)*nsend] = static_cast (_x[i+_nmax]); - buf[(m++)*nsend] = static_cast (_x[i+2*_nmax]);} - if(data_mask & V_MASK){ - buf[(m++)*nsend] = _v[i]; - buf[(m++)*nsend] = _v[i+_nmax]; - buf[(m++)*nsend] = _v[i+2*_nmax];} - if(data_mask & TAG_MASK) buf[(m++)*nsend] = _tag[i]; - if(data_mask & TYPE_MASK) buf[(m++)*nsend] = _type[i]; - if(data_mask & MASK_MASK) buf[(m++)*nsend] = _mask[i]; - if(data_mask & IMAGE_MASK) buf[(m++)*nsend] = _image[i]; - if(data_mask & Q_MASK) buf[(m++)*nsend] = _q[i]; - if(data_mask & MOLECULE_MASK) buf[(m++)*nsend] = _molecule[i]; - if(data_mask & RADIUS_MASK) buf[(m++)*nsend] = _radius[i]; - if(data_mask & DENSITY_MASK) 
buf[(m++)*nsend] = _density[i]; - if(data_mask & RMASS_MASK) buf[(m++)*nsend] = _rmass[i]; - if(data_mask & OMEGA_MASK) { - buf[(m++)*nsend] = _omega[i]; - buf[(m++)*nsend] = _omega[i+_nmax]; - buf[(m++)*nsend] = _omega[i+2*_nmax];} - -/* if(data_mask & NSPECIAL_MASK) - { - buf[(m++)*nsend] = _nspecial[i]; - buf[(m++)*nsend] = _nspecial[i+_nmax]; - buf[(m++)*nsend] = _nspecial[i+2* _nmax]; - }*/ - - if(i>=_nlocal) return; - if(data_mask & X_MASK){ - _x[i] = _x[j]; - _x[i+_nmax] = _x[j+_nmax]; - _x[i+2*_nmax] = _x[j+2*_nmax];} - if(data_mask & V_MASK){ - _v[i] = _v[j]; - _v[i+_nmax] = _v[j+_nmax]; - _v[i+2*_nmax] = _v[j+2*_nmax];} - if(data_mask & TAG_MASK) _tag[i] = _tag[j]; - if(data_mask & TYPE_MASK) _type[i] = _type[j]; - if(data_mask & MASK_MASK) _mask[i] = _mask[j]; - if(data_mask & IMAGE_MASK) _image[i] = _image[j]; - - if(data_mask & Q_MASK) _q[i] = _q[j]; - if(data_mask & MOLECULE_MASK) _molecule[i]= _molecule[j]; - if(data_mask & RADIUS_MASK) _radius[i] = _radius[j]; - if(data_mask & DENSITY_MASK) _density[i] = _density[j]; - if(data_mask & RMASS_MASK) _rmass[i] = _rmass[j]; - if(data_mask & OMEGA_MASK) - { - _omega[i] = _omega[j]; - _omega[i+_nmax] = _omega[j+_nmax]; - _omega[i+2*_nmax] = _omega[j+2*_nmax]; - } - /* if(data_mask & NSPECIAL_MASK) - { - _nspecial[i] = _nspecial[j]; - _nspecial[i+_nmax] = _nspecial[j+_nmax]; - _nspecial[i+2* _nmax] = _nspecial[j+2* _nmax]; - }*/ -} - -template -__global__ void Cuda_AtomVecCuda_UnpackExchange_Kernel(int dim,int nsend,int* copylist) -{ - double* buf=(double*) _buffer; - int k=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x; - if(k>=nsend) return; - buf=&buf[1+k]; - int i=-1; - double xdim_tmp = buf[(1+dim)*nsend]; - if(xdim_tmp>=_sublo[dim]-SMALL && xdim_tmp<_subhi[dim]+SMALL) - { - i=atomicAdd(_flag,1)+_nlocal; - - int m=1; - if(data_mask & X_MASK){ - _x[i] = buf[(m++)*nsend]; - _x[i+_nmax] = buf[(m++)*nsend]; - _x[i+2*_nmax] = buf[(m++)*nsend];} - if(data_mask & V_MASK){ - _v[i] = buf[(m++)*nsend]; - _v[i+_nmax] = buf[(m++)*nsend]; - _v[i+2*_nmax] = buf[(m++)*nsend];} - if(data_mask & TAG_MASK) _tag[i] = buf[(m++)*nsend]; - if(data_mask & TYPE_MASK) _type[i] = buf[(m++)*nsend]; - if(data_mask & MASK_MASK) _mask[i] = buf[(m++)*nsend]; - if(data_mask & IMAGE_MASK) _image[i] = buf[(m++)*nsend]; - - if(data_mask & Q_MASK) _q[i] = buf[(m++)*nsend]; - if(data_mask & MOLECULE_MASK) _molecule[i] = buf[(m++)*nsend]; - if(data_mask & RADIUS_MASK) _radius[i] = buf[(m++)*nsend]; - if(data_mask & DENSITY_MASK) _density[i] = buf[(m++)*nsend]; - if(data_mask & RMASS_MASK) _rmass[i] = buf[(m++)*nsend]; - if(data_mask & OMEGA_MASK) - { - _omega[i] = buf[(m++)*nsend]; - _omega[i+_nmax] = buf[(m++)*nsend]; - _omega[i+2*_nmax] = buf[(m++)*nsend]; - } - /* if(data_mask & NSPECIAL_MASK) - { - _nspecial[i] = buf[(m++)*nsend]; - _nspecial[i+_nmax] = buf[(m++)*nsend]; - _nspecial[i+2*_nmax] = buf[(m++)*nsend]; - }*/ - } - copylist[k]=i; -} - -template -__global__ void Cuda_AtomVecCuda_PackBorder_Kernel(int* sendlist,int n,int maxlistlength,int iswap,X_FLOAT dx,X_FLOAT dy,X_FLOAT dz) -{ - int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x; - int* list=sendlist+iswap*maxlistlength; - if(i -__global__ void Cuda_AtomVecCuda_PackBorder_Self_Kernel(int* sendlist,int n,int maxlistlength,int iswap,X_FLOAT dx,X_FLOAT dy,X_FLOAT dz,int first) -{ - int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x; - int* list=sendlist+iswap*maxlistlength; - if(i -__global__ void Cuda_AtomVecCuda_UnpackBorder_Kernel(int n,int first) -{ - int 
i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x; - if(i (((X_FLOAT*) _buffer)[i+(m++)*n]); - if(data_mask & TYPE_MASK) _type[i+first] = static_cast (((X_FLOAT*) _buffer)[i+(m++)*n]); - if(data_mask & MASK_MASK) _mask[i+first] = static_cast (((X_FLOAT*) _buffer)[i+(m++)*n]); - if(data_mask & Q_MASK) _q[i+first]=((X_FLOAT*) _buffer)[i+(m++)*n]; - if(data_mask & MOLECULE_MASK) _molecule[i+first] = static_cast (((X_FLOAT*) _buffer)[i+(m++)*n]); - if(data_mask & RADIUS_MASK) _radius[i+first]=((X_FLOAT*) _buffer)[i+(m++)*n]; - if(data_mask & DENSITY_MASK) _density[i+first]=((X_FLOAT*) _buffer)[i+(m++)*n]; - if(data_mask & RMASS_MASK) _rmass[i+first]=((X_FLOAT*) _buffer)[i+(m++)*n]; - if(data_mask & OMEGA_MASK) { - _omega[i+first]=((X_FLOAT*) _buffer)[i+(m++)*n]; - _omega[i+first+_nmax]=((X_FLOAT*) _buffer)[i+(m++)*n]; - _omega[i+first+2*_nmax]=((X_FLOAT*) _buffer)[i+(m++)*n];} - } - else - { - _flag[0]=1; - } - } -} - - diff --git a/lib/cuda/atom_vec_full_cuda.cu b/lib/cuda/atom_vec_full_cuda.cu deleted file mode 100644 index a5aae11824..0000000000 --- a/lib/cuda/atom_vec_full_cuda.cu +++ /dev/null @@ -1,85 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - - Original Version: - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - See the README file in the top-level LAMMPS directory. - - ----------------------------------------------------------------------- - - USER-CUDA Package and associated modifications: - https://sourceforge.net/projects/lammpscuda/ - - Christian Trott, christian.trott@tu-ilmenau.de - Lars Winterfeld, lars.winterfeld@tu-ilmenau.de - Theoretical Physics II, University of Technology Ilmenau, Germany - - See the README file in the USER-CUDA directory. - - This software is distributed under the GNU General Public License. 
-------------------------------------------------------------------------- */ - -const unsigned int FULL_DATA_MASK=X_MASK|V_MASK|F_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|IMAGE_MASK|Q_MASK|MOLECULE_MASK; - -#include "atom_vec_full_cuda_cu.h" - -void Cuda_AtomVecFullCuda_Init(cuda_shared_data* sdata) -{ - return Cuda_AtomVecCuda_Init(sdata); -} - -int Cuda_AtomVecFullCuda_PackExchangeList(cuda_shared_data* sdata,int n,int dim,void* buf_send) -{ - const unsigned int data_mask=X_MASK|V_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|IMAGE_MASK|Q_MASK|MOLECULE_MASK; - return Cuda_AtomVecCuda_PackExchangeList(sdata,n,dim,buf_send); -} - -int Cuda_AtomVecFullCuda_PackExchange(cuda_shared_data* sdata,int nsend,void* buf_send,void* copylist) -{ - const unsigned int data_mask=X_MASK|V_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|IMAGE_MASK|Q_MASK|MOLECULE_MASK; - return Cuda_AtomVecCuda_PackExchange(sdata,nsend,buf_send,copylist); -} - -int Cuda_AtomVecFullCuda_UnpackExchange(cuda_shared_data* sdata,int nsend,void* buf_send,void* copylist) -{ - const unsigned int data_mask=X_MASK|V_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|IMAGE_MASK|Q_MASK|MOLECULE_MASK; - return Cuda_AtomVecCuda_UnpackExchange(sdata,nsend,buf_send,copylist); -} - -int Cuda_AtomVecFullCuda_PackBorder(cuda_shared_data* sdata,int nsend,int iswap,void* buf_send,int* pbc,int pbc_flag) -{ - const unsigned int data_mask=X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|Q_MASK|MOLECULE_MASK; - return Cuda_AtomVecCuda_PackBorder(sdata,nsend,iswap,buf_send,pbc,pbc_flag); -} - -int Cuda_AtomVecFullCuda_PackBorderVel(cuda_shared_data* sdata,int nsend,int iswap,void* buf_send,int* pbc,int pbc_flag) -{ - const unsigned int data_mask=X_MASK|V_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|Q_MASK|MOLECULE_MASK; - return Cuda_AtomVecCuda_PackBorder(sdata,nsend,iswap,buf_send,pbc,pbc_flag); -} - -int Cuda_AtomVecFullCuda_PackBorder_Self(cuda_shared_data* sdata,int n,int iswap,int first,int* pbc,int pbc_flag) -{ - const unsigned int data_mask=X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|Q_MASK|MOLECULE_MASK; - return Cuda_AtomVecCuda_PackBorder_Self(sdata,n,iswap,first,pbc,pbc_flag); -} - -int Cuda_AtomVecFullCuda_PackBorderVel_Self(cuda_shared_data* sdata,int n,int iswap,int first,int* pbc,int pbc_flag) -{ - const unsigned int data_mask=X_MASK|V_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|Q_MASK|MOLECULE_MASK; - return Cuda_AtomVecCuda_PackBorder_Self(sdata,n,iswap,first,pbc,pbc_flag); -} - -int Cuda_AtomVecFullCuda_UnpackBorder(cuda_shared_data* sdata,int n,int first,void* buf_recv) -{ - const unsigned int data_mask=X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|Q_MASK|MOLECULE_MASK; - return Cuda_AtomVecCuda_UnpackBorder(sdata,n,first,buf_recv); -} - -int Cuda_AtomVecFullCuda_UnpackBorderVel(cuda_shared_data* sdata,int n,int first,void* buf_recv) -{ - const unsigned int data_mask=X_MASK|V_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|Q_MASK|MOLECULE_MASK; - return Cuda_AtomVecCuda_UnpackBorder(sdata,n,first,buf_recv); -} diff --git a/lib/cuda/atom_vec_full_cuda_cu.h b/lib/cuda/atom_vec_full_cuda_cu.h deleted file mode 100644 index 6cf163ab71..0000000000 --- a/lib/cuda/atom_vec_full_cuda_cu.h +++ /dev/null @@ -1,15 +0,0 @@ -#ifndef ATOM_VEC_FULL_CUDA_CU_H_ -#define ATOM_VEC_FULL_CUDA_CU_H_ - -extern "C" void Cuda_AtomVecFullCuda_Init(cuda_shared_data* sdata); -extern "C" int Cuda_AtomVecFullCuda_PackExchangeList(cuda_shared_data* sdata,int n,int dim,void* buf_send); -extern "C" int Cuda_AtomVecFullCuda_PackExchange(cuda_shared_data* sdata,int nsend,void* buf_send,void* copylist); -extern "C" int Cuda_AtomVecFullCuda_UnpackExchange(cuda_shared_data* 
diff --git a/lib/cuda/binning.cu b/lib/cuda/binning.cu
deleted file mode 100644
index 823015ff55..0000000000
--- a/lib/cuda/binning.cu
+++ /dev/null
@@ -1,196 +0,0 @@
-/* ----------------------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-
-   Original Version:
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   See the README file in the top-level LAMMPS directory.
-
-   -----------------------------------------------------------------------
-
-   USER-CUDA Package and associated modifications:
-   https://sourceforge.net/projects/lammpscuda/
-
-   Christian Trott, christian.trott@tu-ilmenau.de
-   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
-   Theoretical Physics II, University of Technology Ilmenau, Germany
-
-   See the README file in the USER-CUDA directory.
-
-   This software is distributed under the GNU General Public License.
-------------------------------------------------------------------------- */
-
-#ifdef CUDA_USE_BINNING
-#include <stdio.h>
-#define MY_PREFIX binning
-#include "cuda_shared.h"
-#include "cuda_common.h"
-#include "crm_cuda_utils.cu"
-#include "binning_cu.h"
-#include "binning_kernel.cu"
-
-void Cuda_PreBinning(cuda_shared_data* sdata)
-{
-  // initialize only on first call
-  short init = 0;
-  if(! init)
-  {
-    init = 1;
-    int cuda_dummy_type = sdata->atom.ntypes + 1;
-    X_FLOAT outside[3] =
-    {
-      (sdata->domain.subhi[0] - sdata->domain.sublo[0])/1000.0,
-      (sdata->domain.subhi[1] - sdata->domain.sublo[1])/1000.0,
-      (sdata->domain.subhi[2] - sdata->domain.sublo[2])/1000.0
-    };
-    cudaMemcpyToSymbol("binned_size_all" , & sdata->atom.binned_type.dim[0] , sizeof(unsigned) );
-    cudaMemcpyToSymbol("cuda_dummy_type" , & cuda_dummy_type , sizeof(int) );
-    cudaMemcpyToSymbol("outside" , & outside , sizeof(X_FLOAT)*3);
-    cudaMemcpyToSymbol(MY_CONST(binned_type), & sdata->atom.binned_type.dev_data, sizeof(int*) );
-    cudaMemcpyToSymbol(MY_CONST(binned_x) , & sdata->atom.binned_x .dev_data, sizeof(X_FLOAT*) );
-    cudaMemcpyToSymbol(MY_CONST(binned_tag) , & sdata->atom.binned_tag .dev_data, sizeof(int*) );
-    cudaMemcpyToSymbol(MY_CONST(subhi) , sdata->domain.subhi , sizeof(X_FLOAT)*3);
-    // bin_nmax == blockDim.x
-
-    // printf("# CUDA: MY_CONST(binned_type) = %s\n", MY_CONST(binned_type));
-    // int* p = pre_binning_binned_type; // pre_binning_binned_type is defined here!!
-  }
-
-  dim3 grid(sdata->domain.bin_dim[0], sdata->domain.bin_dim[1] * sdata->domain.bin_dim[2], 1);
-  dim3 threads(sdata->domain.bin_nmax, 1, 1);
-
-  MYDBG(printf("# CUDA: Cuda_PreBinning: pre binning grid = (%u, %u, %u)\n", grid.x, grid.y, grid.z);)
-  MYDBG(printf("# CUDA: Cuda_PreBinning: pre binning threads = (%u, %u, %u)\n", threads.x, threads.y, threads.z); )
-  PreBinning_Kernel<<<grid, threads>>> ();
-  cudaThreadSynchronize();
-  MYDBG(printf("ERROR-CUDA pre_binning: %s\n",cudaGetErrorString(cudaGetLastError())));
-  CUT_CHECK_ERROR("Cuda_PreBinning: binning Kernel execution failed");
-}
-
-void Cuda_Binning(cuda_shared_data* sdata)
-{
-  MYDBG( // check assumption in debug mode
-    if(sdata->atom.x.dim[1] != 3)
-    {
-      printf("# CUDA: Cuda_Binning: binning error: atom array dimensions not Nx3\n");
-      return;
-    }
-  )
-
-  // initialize only on first call
-  short init = 0;
-  if(! init)
-  {
-    init = 1;
-    X_FLOAT const_rez_bin_size[3] =
-    {
-      (1.0 * sdata->domain.bin_dim[0]-4.0) / (sdata->domain.subhi[0] - sdata->domain.sublo[0]),
-      (1.0 * sdata->domain.bin_dim[1]-4.0) / (sdata->domain.subhi[1] - sdata->domain.sublo[1]),
-      (1.0 * sdata->domain.bin_dim[2]-4.0) / (sdata->domain.subhi[2] - sdata->domain.sublo[2])
-    };
-    cudaMemcpyToSymbol("bin_error_count" , & sdata->atom.bin_error_count.dev_data, sizeof(unsigned*));
-    cudaMemcpyToSymbol("rez_bin_size" , & const_rez_bin_size , sizeof(X_FLOAT)*3);
-    cudaMemcpyToSymbol(MY_CONST(bin_count_all) , & sdata->atom.bin_count_all .dev_data, sizeof(unsigned*));
-    cudaMemcpyToSymbol(MY_CONST(bin_count_local), & sdata->atom.bin_count_local.dev_data, sizeof(unsigned*));
-    cudaMemcpyToSymbol(MY_CONST(bin_dim) , sdata->domain.bin_dim , sizeof(int3) );
-    cudaMemcpyToSymbol(MY_CONST(bin_nmax) , & sdata->domain.bin_nmax , sizeof(unsigned) );
-    cudaMemcpyToSymbol(MY_CONST(binned_f) , & sdata->atom.binned_f .dev_data, sizeof(F_FLOAT*) );
-    cudaMemcpyToSymbol(MY_CONST(binned_q) , & sdata->atom.binned_q .dev_data, sizeof(F_FLOAT*) );
-    cudaMemcpyToSymbol(MY_CONST(binned_rmass) , & sdata->atom.binned_rmass .dev_data, sizeof(V_FLOAT*) );
-    cudaMemcpyToSymbol(MY_CONST(binned_tag) , & sdata->atom.binned_tag .dev_data, sizeof(int*) );
-    cudaMemcpyToSymbol(MY_CONST(binned_type) , & sdata->atom.binned_type .dev_data, sizeof(int*) );
-    cudaMemcpyToSymbol(MY_CONST(binned_v) , & sdata->atom.binned_v .dev_data, sizeof(V_FLOAT*) );
-    cudaMemcpyToSymbol(MY_CONST(binpos) , & sdata->atom.binpos .dev_data, sizeof(int*));
-    cudaMemcpyToSymbol(MY_CONST(f) , & sdata->atom.f .dev_data, sizeof(F_FLOAT*) );
-    cudaMemcpyToSymbol(MY_CONST(natoms) , & sdata->atom.nall , sizeof(unsigned) );
-    cudaMemcpyToSymbol(MY_CONST(nghost) , & sdata->atom.nghost , sizeof(unsigned) );
-    cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(unsigned) );
-    cudaMemcpyToSymbol(MY_CONST(nmax) , & sdata->atom.nmax , sizeof(unsigned) );
-    cudaMemcpyToSymbol(MY_CONST(q) , & sdata->atom.q .dev_data, sizeof(F_FLOAT*) );
-    cudaMemcpyToSymbol(MY_CONST(rmass) , & sdata->atom.rmass .dev_data, sizeof(V_FLOAT*) );
-    cudaMemcpyToSymbol(MY_CONST(sublo) , sdata->domain.sublo , sizeof(X_FLOAT)*3);
-    cudaMemcpyToSymbol(MY_CONST(tag) , & sdata->atom.tag .dev_data, sizeof(int*) );
-    cudaMemcpyToSymbol(MY_CONST(type) , & sdata->atom.type .dev_data, sizeof(int*) );
-    cudaMemcpyToSymbol(MY_CONST(v) , & sdata->atom.v .dev_data, sizeof(V_FLOAT*) );
-  }
-
-  dim3 grid((unsigned)(1 + sdata->atom.nlocal/64.0), 1, 1);
-  MYDBG( printf("# CUDA: Cuda_Binning: grid dim.x = %u (nlocal: %i)\n", grid.x,sdata->atom.nlocal); )
-  dim3 threads(64, 1,
1);
-
-  cudaMemset((int*) (sdata->atom.bin_count_all.dev_data),0,sizeof(int)*(sdata->domain.bin_dim[0])*(sdata->domain.bin_dim[1])*(sdata->domain.bin_dim[2]));
-  cudaMemset((int*) (sdata->atom.bin_count_local.dev_data),0,sizeof(int)*(sdata->domain.bin_dim[0])*(sdata->domain.bin_dim[1])*(sdata->domain.bin_dim[2]));
-  cudaMemset(sdata->atom.bin_error_count.dev_data,0,sizeof(int)*1);
-  int binning_error_l[1];
-
-  Binning_Kernel<<<grid, threads>>> (
-    (X_FLOAT*) (sdata->atom.       x.dev_data),
-    (X_FLOAT*) (sdata->atom.binned_x.dev_data),
-    sdata->atom.q_flag,
-    0,
-    sdata->atom.rmass_flag
-  );
-  cudaThreadSynchronize();
-  cudaMemcpy((void*) binning_error_l,(void*) sdata->atom.bin_error_count.dev_data,1*sizeof(int),cudaMemcpyDeviceToHost);
-  if(binning_error_l[0]!=0)
-  {
-    printf("CUDA-ERROR: binning local: could not bin %i atoms\n",binning_error_l[0]);
-  }
-  CUT_CHECK_ERROR("Cuda_Binning: binning Kernel execution failed");
-
-  grid.x=(unsigned)(1 + (sdata->atom.nall-sdata->atom.nlocal)/32.0);
-  MYDBG( printf("# CUDA: Cuda_Binning Ghost: grid dim.x = %u\n", grid.x); )
-
-  Binning_Kernel<<<grid, threads>>> (
-    (X_FLOAT*) (sdata->atom.       x.dev_data),
-    (X_FLOAT*) (sdata->atom.binned_x.dev_data),
-    sdata->atom.q_flag,
-    sdata->atom.nlocal,
-    sdata->atom.rmass_flag
-  );
-  cudaThreadSynchronize();
-  cudaMemcpy((void*) binning_error_l,(void*) sdata->atom.bin_error_count.dev_data,1*sizeof(int),cudaMemcpyDeviceToHost);
-  if(binning_error_l[0]!=0) printf("CUDA-ERROR: binning ghost: could not bin %i atoms\n",binning_error_l[0]);
-}
-
-void Cuda_ReverseBinning(cuda_shared_data* sdata)
-{
-  // initialize only on first call
-  short init = 0;
-  if(! init)
-  {
-    init = 1;
-    cudaMemcpyToSymbol(MY_CONST(bin_count_all) , & sdata->atom.bin_count_all .dev_data, sizeof(unsigned*));
-    cudaMemcpyToSymbol(MY_CONST(bin_count_local), & sdata->atom.bin_count_local.dev_data, sizeof(unsigned*));
-    cudaMemcpyToSymbol(MY_CONST(bin_dim) , sdata->domain.bin_dim , sizeof(int3) );
-    cudaMemcpyToSymbol(MY_CONST(binned_f) , & sdata->atom.binned_f .dev_data, sizeof(F_FLOAT*) );
-    cudaMemcpyToSymbol(MY_CONST(binned_q) , & sdata->atom.binned_q .dev_data, sizeof(F_FLOAT*) );
-    cudaMemcpyToSymbol(MY_CONST(binned_tag) , & sdata->atom.binned_tag .dev_data, sizeof(int*) );
-    cudaMemcpyToSymbol(MY_CONST(binned_type) , & sdata->atom.binned_type .dev_data, sizeof(int*) );
-    cudaMemcpyToSymbol(MY_CONST(binned_v) , & sdata->atom.binned_v .dev_data, sizeof(V_FLOAT*) );
-    cudaMemcpyToSymbol(MY_CONST(f) , & sdata->atom.f .dev_data, sizeof(F_FLOAT*) );
-    cudaMemcpyToSymbol(MY_CONST(natoms) , & sdata->atom.nall , sizeof(unsigned) );
-    cudaMemcpyToSymbol(MY_CONST(nmax) , & sdata->atom.nmax , sizeof(unsigned) );
-    cudaMemcpyToSymbol(MY_CONST(q) , & sdata->atom.q .dev_data, sizeof(F_FLOAT*) );
-    cudaMemcpyToSymbol(MY_CONST(tag) , & sdata->atom.tag .dev_data, sizeof(int*) );
-    cudaMemcpyToSymbol(MY_CONST(type) , & sdata->atom.type .dev_data, sizeof(int*) );
-    cudaMemcpyToSymbol(MY_CONST(v) , & sdata->atom.v .dev_data, sizeof(V_FLOAT*) );
-  }
-
-  dim3 grid((unsigned)(1 + sdata->atom.nlocal/32.0), 1, 1);
-  MYDBG( printf("# CUDA: Cuda_ReverseBinning: grid dim.x = %u (nlocal: %i)\n", grid.x,sdata->atom.nlocal); )
-  dim3 threads(32, 1, 1);
-
-  ReverseBinning_Kernel<<<grid, threads>>> (
-    (X_FLOAT*) (sdata->atom.
x.dev_data), - (X_FLOAT*) (sdata->atom.binned_x.dev_data), - sdata->atom.q_flag - ); - cudaThreadSynchronize(); - CUT_CHECK_ERROR("Cuda_Binning: reverse binning Kernel execution failed"); -} - -#endif diff --git a/lib/cuda/binning_cu.h b/lib/cuda/binning_cu.h deleted file mode 100644 index 4f932c392f..0000000000 --- a/lib/cuda/binning_cu.h +++ /dev/null @@ -1,28 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - - Original Version: - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - See the README file in the top-level LAMMPS directory. - - ----------------------------------------------------------------------- - - USER-CUDA Package and associated modifications: - https://sourceforge.net/projects/lammpscuda/ - - Christian Trott, christian.trott@tu-ilmenau.de - Lars Winterfeld, lars.winterfeld@tu-ilmenau.de - Theoretical Physics II, University of Technology Ilmenau, Germany - - See the README file in the USER-CUDA directory. - - This software is distributed under the GNU General Public License. -------------------------------------------------------------------------- */ - -#include "cuda_shared.h" - -extern "C" void Cuda_PreBinning(cuda_shared_data* sdata); -extern "C" void Cuda_Binning(cuda_shared_data* sdata); -extern "C" void Cuda_ReverseBinning(cuda_shared_data* sdata); diff --git a/lib/cuda/binning_kernel.cu b/lib/cuda/binning_kernel.cu deleted file mode 100644 index f5677d475f..0000000000 --- a/lib/cuda/binning_kernel.cu +++ /dev/null @@ -1,149 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - - Original Version: - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - See the README file in the top-level LAMMPS directory. - - ----------------------------------------------------------------------- - - USER-CUDA Package and associated modifications: - https://sourceforge.net/projects/lammpscuda/ - - Christian Trott, christian.trott@tu-ilmenau.de - Lars Winterfeld, lars.winterfeld@tu-ilmenau.de - Theoretical Physics II, University of Technology Ilmenau, Germany - - See the README file in the USER-CUDA directory. - - This software is distributed under the GNU General Public License. 
-------------------------------------------------------------------------- */
-
-// load some variables from shared cuda data into device's constant memory:
-__device__ __constant__ X_FLOAT rez_bin_size[3];
-__device__ __constant__ unsigned* bin_error_count;
-
-__device__ __constant__ int cuda_dummy_type;
-__device__ __constant__ unsigned binned_size_all;
-__device__ __constant__ X_FLOAT outside[3];
-
-__global__ void PreBinning_Kernel()
-{
-  const unsigned bin = gridDim.y * blockIdx.x + blockIdx.y;
-
-  if(bin < gridDim.x * gridDim.y) // TODO: suspected always to be true
-  {
-    _binned_type[blockDim.x * bin + threadIdx.x] = cuda_dummy_type;
-
-    const int i = 3*blockDim.x * bin + threadIdx.x;
-    X_FLOAT* binned_x = _binned_x + i; *binned_x = _subhi[0] + outside[0] * (1+i);
-    binned_x += blockDim.x;            *binned_x = _subhi[1] + outside[1] * (1+i);
-    binned_x += blockDim.x;            *binned_x = _subhi[2] + outside[2] * (1+i);
-    _binned_tag[i]=-1;
-  }
-}
-
-__global__ void Binning_Kernel(X_FLOAT* x, X_FLOAT* binned_x, int q_flag, int offset, int rmass_flag)
-{
-  const unsigned i = blockDim.x * blockIdx.x + threadIdx.x+offset;
-
-  int binatoms=_natoms;
-  if(offset==0) binatoms=_nlocal ;
-
-  if(i < binatoms)
-  {
-    // copy atom position from global device memory to local register
-    // in this 3 steps to get as much coalesced access as possible
-    X_FLOAT my_xX, my_xY, my_xZ;
-    x += i;     my_xX = *x;
-    x += _nmax; my_xY = *x;
-    x += _nmax; my_xZ = *x;
-    //my_xX=x[i];
-    //my_xY=x[i+_nmax];
-    //my_xZ=x[i+2*_nmax];
-
-    // calculate flat bin index
-    int bx=__float2int_rd(rez_bin_size[0] * (my_xX - _sublo[0]))+2;
-    int by=__float2int_rd(rez_bin_size[1] * (my_xY - _sublo[1]))+2;
-    int bz=__float2int_rd(rez_bin_size[2] * (my_xZ - _sublo[2]))+2;
-
-    bx-=bx*negativCUDA(1.0f*bx);
-    bx-=(bx-_bin_dim.x+1)*negativCUDA(1.0f*_bin_dim.x-1.0f-1.0f*bx);
-    by-=by*negativCUDA(1.0f*by);
-    by-=(by-_bin_dim.y+1)*negativCUDA(1.0f*_bin_dim.y-1.0f-1.0f*by);
-    bz-=bz*negativCUDA(1.0f*bz);
-    bz-=(bz-_bin_dim.z+1)*negativCUDA(1.0f*_bin_dim.z-1.0f-1.0f*bz);
-
-    const unsigned j = _bin_dim.z * ( _bin_dim.y *bx+by)+bz;
-
-    // add new atom to bin, get bin-array position
-    const unsigned k = atomicAdd(& _bin_count_all[j], 1);
-    if(offset==0) atomicAdd(& _bin_count_local[j], 1);
-    if(k < _bin_nmax)
-    {
-      // copy register values back to global device memory
-      unsigned pos = 3*_bin_nmax * j + k;
-      _binpos[i]=pos;
-      binned_x += pos;       *binned_x = my_xX;
-      binned_x += _bin_nmax; *binned_x = my_xY;
-      binned_x += _bin_nmax; *binned_x = my_xZ;
-
-      // also copy velocity and force accordingly
-
-      binned_x = _binned_v + pos; x = _v + i;  *binned_x = *x;
-      binned_x += _bin_nmax;      x += _nmax;  *binned_x = *x;
-      binned_x += _bin_nmax;      x += _nmax;  *binned_x = *x;
-
-      binned_x = _binned_f + pos; x = _f + i;  *binned_x = *x;
-      binned_x += _bin_nmax;      x += _nmax;  *binned_x = *x;
-      binned_x += _bin_nmax;      x += _nmax;  *binned_x = *x;
-
-      pos = _bin_nmax * j + k;
-      _binned_type [pos] = _type[i];
-      _binned_tag  [pos] = _tag[i];
-      if(rmass_flag)
-        _binned_rmass[pos] = _rmass[i];
-      if(q_flag)
-        _binned_q    [pos] = _q[i];
-    }
-    else
-    { // normally, this should not happen:
-      int errorn=atomicAdd(bin_error_count, 1);
-      MYEMUDBG( printf("# CUDA: Binning_Kernel: WARNING: atom %i ignored, no place left in bin %u\n", i, j); )
-    }
-  }
-}
-
-__global__ void ReverseBinning_Kernel(X_FLOAT* x, X_FLOAT* binned_x,int q_flag)
-{
-  const unsigned i = blockDim.x * blockIdx.x + threadIdx.x;
-
-  if(i < _nlocal)
-  {
-    unsigned bin_pos3 = _binpos[i];
-    unsigned bin_pos=bin_pos3/(3*_bin_nmax);
-    bin_pos*=_bin_nmax;
-    bin_pos+=bin_pos3-bin_pos*3;
-
-    binned_x = _binned_x + bin_pos3; x = x + i;  *x = *binned_x;
-    binned_x += _bin_nmax;           x += _nmax; *x = *binned_x;
-    binned_x += _bin_nmax;           x += _nmax; *x = *binned_x;
-
-    binned_x = _binned_v + bin_pos3; x = _v + i; *x = *binned_x;
-    binned_x += _bin_nmax;           x += _nmax; *x = *binned_x;
-    binned_x += _bin_nmax;           x += _nmax; *x = *binned_x;
-
-    binned_x = _binned_f + bin_pos3; x = _f + i; *x = *binned_x;
-    binned_x += _bin_nmax;           x += _nmax; *x = *binned_x;
-    binned_x += _bin_nmax;           x += _nmax; *x = *binned_x;
-
-    _type[i] = _binned_type[bin_pos];
-    _tag[i] = _binned_tag[bin_pos];
-    if(q_flag) _q[i] = _binned_q[bin_pos];
-  }
-}
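The core of Binning_Kernel is the mapping from a position to a flat bin index: multiply the offset from the subdomain origin by the precomputed reciprocal bin sizes, round down, shift by two ghost bins, clamp, and flatten. A compact sketch of the same computation, written with standard min/max clamping instead of the branch-free negativCUDA trick used above (names here are illustrative):

    // How a position becomes a flat bin index, mirroring the kernel above.
    __device__ int flat_bin_demo(float xrel, float yrel, float zrel,  // position minus sublo
                                 const float* rez, int3 dim)          // reciprocal bin sizes, bin grid
    {
        int bx = __float2int_rd(rez[0] * xrel) + 2;   // +2 leaves a two-bin ghost border
        int by = __float2int_rd(rez[1] * yrel) + 2;
        int bz = __float2int_rd(rez[2] * zrel) + 2;
        bx = max(0, min(dim.x - 1, bx));              // clamp into the grid
        by = max(0, min(dim.y - 1, by));
        bz = max(0, min(dim.z - 1, bz));
        return dim.z * (dim.y * bx + by) + bz;        // flatten x-major, as above
    }

The atomicAdd on the per-bin counter then hands each atom a unique slot inside its bin, which is why overfull bins can only be reported (via bin_error_count), not resolved, inside the kernel.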
diff --git a/lib/cuda/comm_cuda.cu b/lib/cuda/comm_cuda.cu
deleted file mode 100644
index dc7c01005d..0000000000
--- a/lib/cuda/comm_cuda.cu
+++ /dev/null
@@ -1,485 +0,0 @@
-/* ----------------------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-
-   Original Version:
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   See the README file in the top-level LAMMPS directory.
-
-   -----------------------------------------------------------------------
-
-   USER-CUDA Package and associated modifications:
-   https://sourceforge.net/projects/lammpscuda/
-
-   Christian Trott, christian.trott@tu-ilmenau.de
-   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
-   Theoretical Physics II, University of Technology Ilmenau, Germany
-
-   See the README file in the USER-CUDA directory.
-
-   This software is distributed under the GNU General Public License.
-------------------------------------------------------------------------- */
-
-#include <stdio.h>
-#define MY_PREFIX comm_cuda
-#include "cuda_shared.h"
-#include "cuda_common.h"
-
-#include "crm_cuda_utils.cu"
-
-#include "comm_cuda_cu.h"
-#include "comm_cuda_kernel.cu"
-#include <ctime>
-
-void Cuda_CommCuda_UpdateBuffer(cuda_shared_data* sdata,int n)
-{
-  int size=n*3*sizeof(X_FLOAT);
-  if(sdata->buffersize<size)
-  {
-    MYDBG(printf("Cuda_CommCuda Resize Buffer at %p with %i kB\n",sdata->buffer,sdata->buffersize);)
-    CudaWrapper_FreeCudaData(sdata->buffer,sdata->buffersize);
-    sdata->buffer = CudaWrapper_AllocCudaData(size);
-    sdata->buffersize=size;
-    sdata->buffer_new++;
-    MYDBG(printf("New buffer at %p with %i kB\n",sdata->buffer,sdata->buffersize);)
-  }
-  cudaMemcpyToSymbol(MY_CONST(buffer), & sdata->buffer, sizeof(int*) );
-}
-
-void Cuda_CommCuda_UpdateNmax(cuda_shared_data* sdata)
-{
-  cudaMemcpyToSymbolAsync(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) );
-  cudaMemcpyToSymbolAsync(MY_CONST(nmax) , & sdata->atom.nmax , sizeof(int) );
-  cudaMemcpyToSymbolAsync(MY_CONST(x) , & sdata->atom.x .dev_data, sizeof(X_FLOAT*) );
-  cudaMemcpyToSymbolAsync(MY_CONST(v) , & sdata->atom.v .dev_data, sizeof(X_FLOAT*) );
-  cudaMemcpyToSymbolAsync(MY_CONST(f) , & sdata->atom.f .dev_data, sizeof(F_FLOAT*) );
-  cudaMemcpyToSymbolAsync(MY_CONST(type) , & sdata->atom.type .dev_data, sizeof(int*) );
-}
-
-void Cuda_CommCuda_Init(cuda_shared_data* sdata)
-{
-  Cuda_CommCuda_UpdateNmax(sdata);
-  int ntypesp=sdata->atom.ntypes+1;
-  cudaMemcpyToSymbol(MY_CONST(cuda_ntypes) , &ntypesp, sizeof(int));
-  cudaMemcpyToSymbol(MY_CONST(prd) , sdata->domain.prd, 3*sizeof(X_FLOAT));
-  cudaMemcpyToSymbol(MY_CONST(flag) , &sdata->flag, sizeof(int*));
-  cudaMemcpyToSymbol(MY_CONST(debugdata) , &sdata->debugdata, sizeof(int*));
-}
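The UpdateNmax/Init pair above implements a recurring pattern in this package: device pointers and sizes are cached in __constant__ symbols so kernels can use them without extra arguments, and the host refreshes the symbols whenever the underlying arrays are reallocated. A minimal sketch of that idiom, with invented demo_* names:

    #include <cuda_runtime.h>

    __device__ __constant__ float* demo_x;   // device pointer cached in constant memory
    __device__ __constant__ int demo_nmax;   // cached array stride

    // Host side: refresh the cached values after a reallocation,
    // mirroring what Cuda_CommCuda_UpdateNmax does above.
    void demo_update_symbols(float* d_x, int nmax)
    {
        cudaMemcpyToSymbol(demo_x, &d_x, sizeof(float*));
        cudaMemcpyToSymbol(demo_nmax, &nmax, sizeof(int));
    }

The update_nmax/update_nlocal flags checked by every entry point below exist precisely to skip these symbol copies on steps where nothing moved.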
-int Cuda_CommCuda_PackComm(cuda_shared_data* sdata,int n,int iswap,void* buf_send,int* pbc,int pbc_flag)
-{
-  timespec time1,time2;
-  if(sdata->atom.update_nmax)
-    Cuda_CommCuda_UpdateNmax(sdata);
-  if(sdata->atom.update_nlocal)
-    cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) );
-  int size=n*3*sizeof(X_FLOAT);
-  if(sdata->buffer_new or (size>sdata->buffersize))
-    Cuda_CommCuda_UpdateBuffer(sdata,n);
-
-  X_FLOAT dx=0.0;
-  X_FLOAT dy=0.0;
-  X_FLOAT dz=0.0;
-  if (pbc_flag != 0) {
-    if (sdata->domain.triclinic == 0) {
-      dx = pbc[0]*sdata->domain.prd[0];
-      dy = pbc[1]*sdata->domain.prd[1];
-      dz = pbc[2]*sdata->domain.prd[2];
-    } else {
-      dx = pbc[0]*sdata->domain.prd[0] + pbc[5]*sdata->domain.xy + pbc[4]*sdata->domain.xz;
-      dy = pbc[1]*sdata->domain.prd[1] + pbc[3]*sdata->domain.yz;
-      dz = pbc[2]*sdata->domain.prd[2];
-    }}
-
-  int3 layout=getgrid(n);
-  dim3 threads(layout.z, 1, 1);
-  dim3 grid(layout.x, layout.y, 1);
-
-  if(sdata->atom.nlocal>0)
-  {
-    cudaMemset( sdata->flag,0,sizeof(int));
-
-clock_gettime(CLOCK_REALTIME,&time1);
-
-    void* buf=sdata->overlap_comm?sdata->comm.buf_send_dev[iswap]:sdata->buffer;
-    Cuda_CommCuda_PackComm_Kernel<<<grid, threads, 0>>>((int*) sdata->comm.sendlist.dev_data,n
-                                                        ,sdata->comm.maxlistlength,iswap,dx,dy,dz,buf);
-    cudaThreadSynchronize();
-
-clock_gettime(CLOCK_REALTIME,&time2);
-sdata->cuda_timings.comm_forward_kernel_pack+=
-  time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000;
-
-    CUT_CHECK_ERROR("Cuda_CommCuda_PackComm: Kernel execution failed");
-    if(not sdata->overlap_comm)
-      cudaMemcpy(buf_send, sdata->buffer, n*3*sizeof(X_FLOAT), cudaMemcpyDeviceToHost);
-    //cudaMemcpy(buf_send, sdata->comm.buf_send_dev[iswap], n*3*sizeof(X_FLOAT), cudaMemcpyDeviceToHost);
-
-clock_gettime(CLOCK_REALTIME,&time1);
-sdata->cuda_timings.comm_forward_download+=
-  time1.tv_sec-time2.tv_sec+1.0*(time1.tv_nsec-time2.tv_nsec)/1000000000;
-
-    int aflag;
-    cudaMemcpy(&aflag, sdata->flag, sizeof(int), cudaMemcpyDeviceToHost);
-    if(aflag!=0) printf("aflag PackComm: %i\n",aflag);
-    CUT_CHECK_ERROR("Cuda_CommCuda_PackComm: Kernel execution failed");
-
-  }
-  return 3*n;
-}
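The dx/dy/dz block that opens every pack routine converts the integer periodic-image flags in pbc[] into a Cartesian shift. For an orthogonal box the axes are independent; for a triclinic box the tilt factors xy, xz, yz couple them. The same arithmetic in isolation (the function name and argument layout are illustrative):

    // Host-side sketch of the periodic shift used by the pack routines above.
    // prd = box lengths, xy/xz/yz = triclinic tilt factors.
    static void pbc_shift(const int* pbc, const double* prd,
                          double xy, double xz, double yz, int triclinic, double d[3])
    {
        if(triclinic == 0) {            // orthogonal box: shift each axis independently
            d[0] = pbc[0] * prd[0];
            d[1] = pbc[1] * prd[1];
            d[2] = pbc[2] * prd[2];
        } else {                        // triclinic box: tilts feed y and z images into x (and z into y)
            d[0] = pbc[0] * prd[0] + pbc[5] * xy + pbc[4] * xz;
            d[1] = pbc[1] * prd[1] + pbc[3] * yz;
            d[2] = pbc[2] * prd[2];
        }
    }

Baking the shift into the packed coordinates means the unpacking side never needs to know about periodic images at all.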
-int Cuda_CommCuda_PackCommVel(cuda_shared_data* sdata,int n,int iswap,void* buf_send,int* pbc,int pbc_flag)
-{
-  timespec time1,time2;
-  if(sdata->atom.update_nmax)
-    Cuda_CommCuda_UpdateNmax(sdata);
-  if(sdata->atom.update_nlocal)
-    cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) );
-  int size=n*6*sizeof(X_FLOAT);
-  if(sdata->buffer_new or (size>sdata->buffersize))
-    Cuda_CommCuda_UpdateBuffer(sdata,n);
-
-  X_FLOAT dx=0.0;
-  X_FLOAT dy=0.0;
-  X_FLOAT dz=0.0;
-  if (pbc_flag != 0) {
-    if (sdata->domain.triclinic == 0) {
-      dx = pbc[0]*sdata->domain.prd[0];
-      dy = pbc[1]*sdata->domain.prd[1];
-      dz = pbc[2]*sdata->domain.prd[2];
-    } else {
-      dx = pbc[0]*sdata->domain.prd[0] + pbc[5]*sdata->domain.xy + pbc[4]*sdata->domain.xz;
-      dy = pbc[1]*sdata->domain.prd[1] + pbc[3]*sdata->domain.yz;
-      dz = pbc[2]*sdata->domain.prd[2];
-    }}
-
-  int3 layout=getgrid(n);
-  dim3 threads(layout.z, 1, 1);
-  dim3 grid(layout.x, layout.y, 1);
-
-  if(sdata->atom.nlocal>0)
-  {
-    cudaMemset( sdata->flag,0,sizeof(int));
-
-clock_gettime(CLOCK_REALTIME,&time1);
-
-    void* buf=sdata->overlap_comm?sdata->comm.buf_send_dev[iswap]:sdata->buffer;
-    Cuda_CommCuda_PackCommVel_Kernel<<<grid, threads, 0>>>((int*) sdata->comm.sendlist.dev_data,n
-                                                           ,sdata->comm.maxlistlength,iswap,dx,dy,dz,buf);
-    cudaThreadSynchronize();
-
-clock_gettime(CLOCK_REALTIME,&time2);
-sdata->cuda_timings.comm_forward_kernel_pack+=
-  time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000;
-
-    CUT_CHECK_ERROR("Cuda_CommCuda_PackComm: Kernel execution failed");
-    if(not sdata->overlap_comm)
-      cudaMemcpy(buf_send, sdata->buffer, n*6*sizeof(X_FLOAT), cudaMemcpyDeviceToHost);
-
-clock_gettime(CLOCK_REALTIME,&time1);
-sdata->cuda_timings.comm_forward_download+=
-  time1.tv_sec-time2.tv_sec+1.0*(time1.tv_nsec-time2.tv_nsec)/1000000000;
-
-    int aflag;
-    cudaMemcpy(&aflag, sdata->flag, sizeof(int), cudaMemcpyDeviceToHost);
-    if(aflag!=0) printf("aflag PackComm: %i\n",aflag);
-    CUT_CHECK_ERROR("Cuda_CommCuda_PackComm: Kernel execution failed");
-
-  }
-  return 6*n;
-}
-
-int Cuda_CommCuda_PackComm_Self(cuda_shared_data* sdata,int n,int iswap,int first,int* pbc,int pbc_flag)
-{
-  MYDBG(printf(" # CUDA: CommCuda_PackComm_Self\n");)
-  timespec time1,time2;
-  if(sdata->atom.update_nmax)
-    Cuda_CommCuda_UpdateNmax(sdata);
-  if(sdata->atom.update_nlocal)
-    cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) );
-  int size=n*3*sizeof(X_FLOAT);
-  if(sdata->buffer_new or (size>sdata->buffersize))
-    Cuda_CommCuda_UpdateBuffer(sdata,n);
-  static int count=-1;
-  count++;
-  X_FLOAT dx=0.0;
-  X_FLOAT dy=0.0;
-  X_FLOAT dz=0.0;
-  if (pbc_flag != 0) {
-    if (sdata->domain.triclinic == 0) {
-      dx = pbc[0]*sdata->domain.prd[0];
-      dy = pbc[1]*sdata->domain.prd[1];
-      dz = pbc[2]*sdata->domain.prd[2];
-    } else {
-      dx = pbc[0]*sdata->domain.prd[0] + pbc[5]*sdata->domain.xy + pbc[4]*sdata->domain.xz;
-      dy = pbc[1]*sdata->domain.prd[1] + pbc[3]*sdata->domain.yz;
-      dz = pbc[2]*sdata->domain.prd[2];
-    }}
-
-  int3 layout=getgrid(n);
-  dim3 threads(layout.z, 1, 1);
-  dim3 grid(layout.x, layout.y, 1);
-  if(sdata->atom.nlocal>0)
-  {
-
-clock_gettime(CLOCK_REALTIME,&time1);
-
-    Cuda_CommCuda_PackComm_Self_Kernel<<<grid, threads, 0>>>((int*) sdata->comm.sendlist.dev_data,n,sdata->comm.maxlistlength,iswap,dx,dy,dz,first);
-    cudaThreadSynchronize();
-
-clock_gettime(CLOCK_REALTIME,&time2);
-sdata->cuda_timings.comm_forward_kernel_self+=
-  time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000;
-
-    CUT_CHECK_ERROR("Cuda_CommCuda_PackComm_Self: Kernel execution failed");
-  }
-
-  return 3*n;
-}
-
-int Cuda_CommCuda_PackCommVel_Self(cuda_shared_data* sdata,int n,int iswap,int first,int* pbc,int pbc_flag)
-{
-  MYDBG(printf(" # CUDA: CommCuda_PackComm_Self\n");)
-  timespec time1,time2;
-  if(sdata->atom.update_nmax)
-    Cuda_CommCuda_UpdateNmax(sdata);
-  if(sdata->atom.update_nlocal)
-    cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) );
-  int size=n*6*sizeof(X_FLOAT);
-  if(sdata->buffer_new or (size>sdata->buffersize))
-    Cuda_CommCuda_UpdateBuffer(sdata,n);
-  static int count=-1;
-  count++;
-  X_FLOAT dx=0.0;
-  X_FLOAT dy=0.0;
-  X_FLOAT dz=0.0;
-  if (pbc_flag != 0) {
-    if (sdata->domain.triclinic == 0) {
-      dx = pbc[0]*sdata->domain.prd[0];
-      dy = pbc[1]*sdata->domain.prd[1];
-      dz = pbc[2]*sdata->domain.prd[2];
-    } else {
-      dx = pbc[0]*sdata->domain.prd[0] + pbc[5]*sdata->domain.xy + pbc[4]*sdata->domain.xz;
-      dy = pbc[1]*sdata->domain.prd[1] + pbc[3]*sdata->domain.yz;
-      dz = pbc[2]*sdata->domain.prd[2];
-    }}
-
-  int3 layout=getgrid(n);
-  dim3 threads(layout.z, 1, 1);
-  dim3 grid(layout.x, layout.y, 1);
-  if(sdata->atom.nlocal>0)
-  {
-
-clock_gettime(CLOCK_REALTIME,&time1);
-
-    Cuda_CommCuda_PackCommVel_Self_Kernel<<<grid, threads, 0>>>((int*) sdata->comm.sendlist.dev_data,n,sdata->comm.maxlistlength,iswap,dx,dy,dz,first);
-    cudaThreadSynchronize();
-
-clock_gettime(CLOCK_REALTIME,&time2);
-sdata->cuda_timings.comm_forward_kernel_self+=
time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000;
-
-    CUT_CHECK_ERROR("Cuda_CommCuda_PackComm_Self: Kernel execution failed");
-  }
-
-  return 6*n;
-}
-
-void Cuda_CommCuda_UnpackComm(cuda_shared_data* sdata,int n,int first,void* buf_recv,int iswap)
-{
-  timespec time1,time2;
-
-  if(sdata->atom.update_nmax)
-    Cuda_CommCuda_UpdateNmax(sdata);
-  if(sdata->atom.update_nlocal)
-    cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) );
-  int size=n*3*sizeof(X_FLOAT);
-  if(sdata->buffer_new or (size>sdata->buffersize))
-    Cuda_CommCuda_UpdateBuffer(sdata,n);
-
-  int3 layout=getgrid(n);
-  dim3 threads(layout.z, 1, 1);
-  dim3 grid(layout.x, layout.y, 1);
-  if(sdata->atom.nlocal>0)
-  {
-clock_gettime(CLOCK_REALTIME,&time1);
-    if(not sdata->overlap_comm||iswap<0)
-      cudaMemcpy(sdata->buffer,(void*)buf_recv, n*3*sizeof(X_FLOAT), cudaMemcpyHostToDevice);
-
-clock_gettime(CLOCK_REALTIME,&time2);
-sdata->cuda_timings.comm_forward_upload+=
-  time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000;
-    void* buf=(sdata->overlap_comm&&iswap>=0)?sdata->comm.buf_recv_dev[iswap]:sdata->buffer;
-    Cuda_CommCuda_UnpackComm_Kernel<<<grid, threads, 0>>>(n,first,buf);
-    cudaThreadSynchronize();
-
-clock_gettime(CLOCK_REALTIME,&time1);
-sdata->cuda_timings.comm_forward_kernel_unpack+=
-  time1.tv_sec-time2.tv_sec+1.0*(time1.tv_nsec-time2.tv_nsec)/1000000000;
-
-    CUT_CHECK_ERROR("Cuda_CommCuda_UnpackComm: Kernel execution failed");
-
-  }
-}
-
-void Cuda_CommCuda_UnpackCommVel(cuda_shared_data* sdata,int n,int first,void* buf_recv,int iswap)
-{
-  timespec time1,time2;
-
-  if(sdata->atom.update_nmax)
-    Cuda_CommCuda_UpdateNmax(sdata);
-  if(sdata->atom.update_nlocal)
-    cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) );
-  int size=n*6*sizeof(X_FLOAT);
-  if(sdata->buffer_new or (size>sdata->buffersize))
-    Cuda_CommCuda_UpdateBuffer(sdata,n);
-
-  int3 layout=getgrid(n);
-  dim3 threads(layout.z, 1, 1);
-  dim3 grid(layout.x, layout.y, 1);
-  if(sdata->atom.nlocal>0)
-  {
-clock_gettime(CLOCK_REALTIME,&time1);
-
-    if(not sdata->overlap_comm||iswap<0)
-      cudaMemcpy(sdata->buffer,(void*)buf_recv, n*6*sizeof(X_FLOAT), cudaMemcpyHostToDevice);
-
-clock_gettime(CLOCK_REALTIME,&time2);
-sdata->cuda_timings.comm_forward_upload+=
-  time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000;
-    void* buf=(sdata->overlap_comm&&iswap>=0)?sdata->comm.buf_recv_dev[iswap]:sdata->buffer;
-    Cuda_CommCuda_UnpackCommVel_Kernel<<<grid, threads, 0>>>(n,first,buf);
-    cudaThreadSynchronize();
-
-clock_gettime(CLOCK_REALTIME,&time1);
-sdata->cuda_timings.comm_forward_kernel_unpack+=
-  time1.tv_sec-time2.tv_sec+1.0*(time1.tv_nsec-time2.tv_nsec)/1000000000;
-
-    CUT_CHECK_ERROR("Cuda_CommCuda_UnpackComm: Kernel execution failed");
-
-  }
-}
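A pattern worth isolating from the functions above: every pack/unpack path brackets its kernel launch and its host-device copies with clock_gettime(CLOCK_REALTIME, ...) pairs and accumulates the elapsed seconds into the cuda_timings counters, with cudaThreadSynchronize() making the kernel time observable from the host. A minimal sketch of the same idiom (the accum variable is illustrative):

    #include <time.h>

    // Elapsed wall-clock seconds between two timespec samples,
    // exactly the arithmetic used inline above.
    static double elapsed_seconds(const timespec &t1, const timespec &t2)
    {
        return t2.tv_sec - t1.tv_sec + 1.0e-9 * (t2.tv_nsec - t1.tv_nsec);
    }

    // usage:
    //   timespec t1, t2;
    //   clock_gettime(CLOCK_REALTIME, &t1);
    //   my_kernel<<<grid, threads>>>(...);
    //   cudaThreadSynchronize();          // wait so the sample covers the kernel
    //   clock_gettime(CLOCK_REALTIME, &t2);
    //   accum += elapsed_seconds(t1, t2);

Synchronizing after every launch costs some overlap, which is why the package makes it optional elsewhere via the overlap_comm path.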
-int Cuda_CommCuda_PackReverse(cuda_shared_data* sdata,int n,int first,void* buf_send)
-{
-  if(sdata->atom.update_nmax)
-    Cuda_CommCuda_UpdateNmax(sdata);
-  if(sdata->atom.update_nlocal)
-    cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) );
-  int size=n*3*sizeof(F_FLOAT);
-  if(sdata->buffer_new or (size>sdata->buffersize))
-    Cuda_CommCuda_UpdateBuffer(sdata,n);
-
-  F_FLOAT* buf=(F_FLOAT*)buf_send;
-  F_FLOAT* f_dev=(F_FLOAT*)sdata->atom.f.dev_data;
-  f_dev+=first;
-  cudaMemcpy(buf, f_dev, n*sizeof(F_FLOAT), cudaMemcpyDeviceToHost);
-  buf+=n; f_dev+=sdata->atom.nmax;
-  cudaMemcpy(buf, f_dev, n*sizeof(F_FLOAT), cudaMemcpyDeviceToHost);
-  buf+=n; f_dev+=sdata->atom.nmax;
-  cudaMemcpy(buf, f_dev, n*sizeof(F_FLOAT), cudaMemcpyDeviceToHost);
-  return n*3;
-}
-
-void Cuda_CommCuda_UnpackReverse(cuda_shared_data* sdata,int n,int iswap,void* buf_recv)
-{
-  if(sdata->atom.update_nmax)
-    Cuda_CommCuda_UpdateNmax(sdata);
-  if(sdata->atom.update_nlocal)
-    cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) );
-  int size=n*3*sizeof(F_FLOAT);
-  if(sdata->buffer_new or (size>sdata->buffersize))
-    Cuda_CommCuda_UpdateBuffer(sdata,n);
-
-  int3 layout=getgrid(n);
-  dim3 threads(layout.z, 1, 1);
-  dim3 grid(layout.x, layout.y, 1);
-  if(sdata->atom.nlocal>0)
-  {
-    cudaMemcpy(sdata->buffer,buf_recv, size, cudaMemcpyHostToDevice);
-    Cuda_CommCuda_UnpackReverse_Kernel<<<grid, threads, 0>>>((int*) sdata->comm.sendlist.dev_data,n,sdata->comm.maxlistlength,iswap);
-    cudaThreadSynchronize();
-    CUT_CHECK_ERROR("Cuda_CommCuda_UnpackReverse: Kernel execution failed");
-  }
-}
-
-void Cuda_CommCuda_UnpackReverse_Self(cuda_shared_data* sdata,int n,int iswap,int first)
-{
-  if(sdata->atom.update_nmax)
-    Cuda_CommCuda_UpdateNmax(sdata);
-  if(sdata->atom.update_nlocal)
-    cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) );
-  int size=n*3*sizeof(X_FLOAT);
-  if(sdata->buffer_new or (size>sdata->buffersize))
-    Cuda_CommCuda_UpdateBuffer(sdata,n);
-
-  int3 layout=getgrid(n);
-  dim3 threads(layout.z, 1, 1);
-  dim3 grid(layout.x, layout.y, 1);
-
-  if(sdata->atom.nlocal>0)
-  {
-    Cuda_CommCuda_UnpackReverse_Self_Kernel<<<grid, threads, 0>>>((int*) sdata->comm.sendlist.dev_data,n,sdata->comm.maxlistlength,iswap,first);
-    cudaThreadSynchronize();
-    CUT_CHECK_ERROR("Cuda_CommCuda_PackReverse_Self: Kernel execution failed");
-  }
-}
-
-int Cuda_CommCuda_BuildSendlist(cuda_shared_data* sdata,int bordergroup,int ineed,int style,int atom_nfirst,int nfirst,int nlast,int dim,int iswap)
-{
-  MYDBG(printf(" # CUDA: CommCuda_BuildSendlist\n");)
-  timespec time1,time2;
-  if(sdata->atom.update_nmax)
-    Cuda_CommCuda_UpdateNmax(sdata);
-  if(sdata->atom.update_nlocal)
-    cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) );
-  if(sdata->buffer_new or (80>sdata->buffersize))
-    Cuda_CommCuda_UpdateBuffer(sdata,10);
-  int n;
-  if (!bordergroup || ineed >= 2)
-    n=nlast-nfirst+1;
-  else
-  {
-    n=atom_nfirst;
-    if(nlast-sdata->atom.nlocal+1>n) n=nlast-sdata->atom.nlocal+1;
-  }
-  int3 layout=getgrid(n,0,512,true);
-  dim3 threads(layout.z, 1, 1);
-  dim3 grid(layout.x+1, layout.y, 1);
-
-  cudaMemset((int*) (sdata->buffer),0,sizeof(int));
-
-clock_gettime(CLOCK_REALTIME,&time1);
-  if(style==1)
-    Cuda_CommCuda_BuildSendlist_Single<<<grid, threads, (threads.x+1)*sizeof(int)>>>(bordergroup, ineed, atom_nfirst, nfirst, nlast, dim, iswap,(X_FLOAT*) sdata->comm.slablo.dev_data,(X_FLOAT*) sdata->comm.slabhi.dev_data,(int*) sdata->comm.sendlist.dev_data,sdata->comm.maxlistlength);
-  else
-    Cuda_CommCuda_BuildSendlist_Multi<<<grid, threads, (threads.x+1)*sizeof(int)>>>(bordergroup, ineed, atom_nfirst, nfirst, nlast, dim, iswap,(X_FLOAT*) sdata->comm.multilo.dev_data,(X_FLOAT*) sdata->comm.multihi.dev_data,(int*) sdata->comm.sendlist.dev_data,sdata->comm.maxlistlength);
-  cudaThreadSynchronize();
-clock_gettime(CLOCK_REALTIME,&time2);
-sdata->cuda_timings.comm_border_kernel_buildlist+=
-  time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000;
-
-  CUT_CHECK_ERROR("Cuda_CommCuda_BuildSendlist: Kernel execution failed");
-  int nsend;
-  cudaMemcpy(&nsend, sdata->buffer, sizeof(int), cudaMemcpyDeviceToHost);
-  return nsend;
-}
diff --git a/lib/cuda/comm_cuda_cu.h b/lib/cuda/comm_cuda_cu.h
deleted file mode 100644
index b5b2d192ba..0000000000
--- a/lib/cuda/comm_cuda_cu.h
+++ /dev/null
@@ -1,35 +0,0 @@
-/* 
---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - - Original Version: - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - See the README file in the top-level LAMMPS directory. - - ----------------------------------------------------------------------- - - USER-CUDA Package and associated modifications: - https://sourceforge.net/projects/lammpscuda/ - - Christian Trott, christian.trott@tu-ilmenau.de - Lars Winterfeld, lars.winterfeld@tu-ilmenau.de - Theoretical Physics II, University of Technology Ilmenau, Germany - - See the README file in the USER-CUDA directory. - - This software is distributed under the GNU General Public License. -------------------------------------------------------------------------- */ - -#include "cuda_shared.h" - -extern "C" int Cuda_CommCuda_PackComm(cuda_shared_data* sdata,int n,int iswap,void* buf_send,int* pbc,int pbcflag); -extern "C" int Cuda_CommCuda_PackCommVel(cuda_shared_data* sdata,int n,int iswap,void* buf_send,int* pbc,int pbcflag); -extern "C" int Cuda_CommCuda_PackComm_Self(cuda_shared_data* sdata,int n,int iswap,int first,int* pbc,int pbcflag); -extern "C" int Cuda_CommCuda_PackCommVel_Self(cuda_shared_data* sdata,int n,int iswap,int first,int* pbc,int pbcflag); -extern "C" void Cuda_CommCuda_UnpackComm(cuda_shared_data* sdata,int n,int first,void* buf_recv,int iswap=-1); -extern "C" void Cuda_CommCuda_UnpackCommVel(cuda_shared_data* sdata,int n,int first,void* buf_recv,int iswap=-1); -extern "C" int Cuda_CommCuda_PackReverse(cuda_shared_data* sdata,int n,int first,void* buf_send); -extern "C" void Cuda_CommCuda_UnpackReverse(cuda_shared_data* sdata,int n,int iswap,void* buf_recv); -extern "C" void Cuda_CommCuda_UnpackReverse_Self(cuda_shared_data* sdata,int n,int iswap,int first); -extern "C" int Cuda_CommCuda_BuildSendlist(cuda_shared_data* sdata,int bordergroup,int ineed,int style,int atom_nfirst,int nfirst,int nlast,int dim,int iswap); diff --git a/lib/cuda/comm_cuda_kernel.cu b/lib/cuda/comm_cuda_kernel.cu deleted file mode 100644 index c171a721a4..0000000000 --- a/lib/cuda/comm_cuda_kernel.cu +++ /dev/null @@ -1,353 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - - Original Version: - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - See the README file in the top-level LAMMPS directory. - - ----------------------------------------------------------------------- - - USER-CUDA Package and associated modifications: - https://sourceforge.net/projects/lammpscuda/ - - Christian Trott, christian.trott@tu-ilmenau.de - Lars Winterfeld, lars.winterfeld@tu-ilmenau.de - Theoretical Physics II, University of Technology Ilmenau, Germany - - See the README file in the USER-CUDA directory. - - This software is distributed under the GNU General Public License. 
-------------------------------------------------------------------------- */
-
-__global__ void Cuda_CommCuda_PackComm_Kernel(int* sendlist,int n,int maxlistlength,int iswap,X_FLOAT dx,X_FLOAT dy,X_FLOAT dz,void* buffer)
-{
-  int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x;
-  int* list=sendlist+iswap*maxlistlength;
-  if(i<n)
-  {
-    int j=list[i];
-    if(j>_nmax) _flag[0]=1;
-    ((X_FLOAT*) buffer)[i]=_x[j] + dx;
-    ((X_FLOAT*) buffer)[i+1*n] = _x[j+_nmax] + dy;
-    ((X_FLOAT*) buffer)[i+2*n] = _x[j+2*_nmax] + dz;
-  }
-}
-
-__global__ void Cuda_CommCuda_PackCommVel_Kernel(int* sendlist,int n,int maxlistlength,int iswap,X_FLOAT dx,X_FLOAT dy,X_FLOAT dz,void* buffer)
-{
-  int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x;
-  int* list=sendlist+iswap*maxlistlength;
-  if(i<n)
-  {
-    int j=list[i];
-    if(j>_nmax) _flag[0]=1;
-    ((X_FLOAT*) buffer)[i]=_x[j] + dx;
-    ((X_FLOAT*) buffer)[i+1*n] = _x[j+_nmax] + dy;
-    ((X_FLOAT*) buffer)[i+2*n] = _x[j+2*_nmax] + dz;
-    ((X_FLOAT*) buffer)[i+3*n]=_v[j];
-    ((X_FLOAT*) buffer)[i+4*n] = _v[j+_nmax];
-    ((X_FLOAT*) buffer)[i+5*n] = _v[j+2*_nmax];
-  }
-}
-
-__global__ void Cuda_CommCuda_PackComm_Self_Kernel(int* sendlist,int n,int maxlistlength,int iswap,X_FLOAT dx,X_FLOAT dy,X_FLOAT dz,int first)
-{
-  int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x;
-
-  int* list=sendlist+iswap*maxlistlength;
-  if(i<n)
-  {
-    int j=list[i];
-    _x[i+first] = _x[j] + dx;
-    _x[i+first+_nmax] = _x[j+_nmax] + dy;
-    _x[i+first+2*_nmax] = _x[j+2*_nmax] + dz;
-  }
-}
-
-__global__ void Cuda_CommCuda_PackCommVel_Self_Kernel(int* sendlist,int n,int maxlistlength,int iswap,X_FLOAT dx,X_FLOAT dy,X_FLOAT dz,int first)
-{
-  int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x;
-  int* list=sendlist+iswap*maxlistlength;
-  if(i<n)
-  {
-    int j=list[i];
-    _x[i+first] = _x[j] + dx;
-    _x[i+first+_nmax] = _x[j+_nmax] + dy;
-    _x[i+first+2*_nmax] = _x[j+2*_nmax] + dz;
-    _v[i+first] = _v[j];
-    _v[i+first+_nmax] = _v[j+_nmax];
-    _v[i+first+2*_nmax] = _v[j+2*_nmax];
-  }
-}
-
-__global__ void Cuda_CommCuda_UnpackComm_Kernel(int n,int first,void* buffer)
-{
-  int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x;
-  if(i<n)
-  {
-    _x[i+first] = ((X_FLOAT*) buffer)[i];
-    _x[i+first+_nmax] = ((X_FLOAT*) buffer)[i+1*n];
-    _x[i+first+2*_nmax] = ((X_FLOAT*) buffer)[i+2*n];
-  }
-}
-
-__global__ void Cuda_CommCuda_UnpackCommVel_Kernel(int n,int first,void* buffer)
-{
-  int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x;
-  if(i<n)
-  {
-    _x[i+first] = ((X_FLOAT*) buffer)[i];
-    _x[i+first+_nmax] = ((X_FLOAT*) buffer)[i+1*n];
-    _x[i+first+2*_nmax] = ((X_FLOAT*) buffer)[i+2*n];
-    _v[i+first] = ((X_FLOAT*) buffer)[i+3*n];
-    _v[i+first+_nmax] = ((X_FLOAT*) buffer)[i+4*n];
-    _v[i+first+2*_nmax] = ((X_FLOAT*) buffer)[i+5*n];
-  }
-}
-
-__global__ void Cuda_CommCuda_UnpackReverse_Kernel(int* sendlist,int n,int maxlistlength,int iswap)
-{
-  int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x;
-  int* list=sendlist+iswap*maxlistlength;
-  if(i<n)
-  {
-    int j=list[i];
-    _f[j] += ((F_FLOAT*) _buffer)[i];
-    _f[j+_nmax] += ((F_FLOAT*) _buffer)[i+n];
-    _f[j+2*_nmax] += ((F_FLOAT*) _buffer)[i+2*n];
-  }
-}
-
-__global__ void Cuda_CommCuda_UnpackReverse_Self_Kernel(int* sendlist,int n,int maxlistlength,int iswap,int first)
-{
-  int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x;
-  int* list=sendlist+iswap*maxlistlength;
-  if(i<n)
-  {
-    int j=list[i];
-    _f[j] += _f[i+first];
-    _f[j+_nmax] += _f[i+first+_nmax];
-    _f[j+2*_nmax] += _f[i+first+2*_nmax];
-  }
-}
-
-extern __shared__ int shared[];
-
-__global__ void Cuda_CommCuda_BuildSendlist_Single(int bordergroup,int ineed,int atom_nfirst,int nfirst,int nlast,int dim,int iswap,X_FLOAT* slablo,X_FLOAT* slabhi,int* sendlist,int maxlistlength)
-{
-  int* list=sendlist+iswap*maxlistlength;
-  X_FLOAT lo=slablo[iswap];
-  X_FLOAT hi=slabhi[iswap];
-  bool add=false;
-
-  if (!bordergroup || ineed >= 2) {
-    int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x+nfirst;
-    if(i<nlast)
-      if(_x[i+dim*_nmax] >= lo && _x[i+dim*_nmax] <= hi) {
-        add=true;
-      }
-    shared[threadIdx.x]=add?1:0;
-
-    __syncthreads();
-
-    int nsend=0;
-    if(threadIdx.x==0)
-    {
-      for(int k=0;k<blockDim.x;k++)
-      {
-        if(shared[k]) {nsend++; shared[k]=nsend;}
-      }
-      shared[blockDim.x]=atomicAdd((int*) _buffer,nsend);
-    }
-    __syncthreads();
-    if(add)
-    {
-      int pos=shared[blockDim.x]+shared[threadIdx.x]-1;
-      if(pos<maxlistlength) list[pos]=i;
-    }
-  } else {
-    int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x;
-    if(i<atom_nfirst)
-      if(_x[i+dim*_nmax] >= lo && _x[i+dim*_nmax] <= hi) {
-        add=true;
-      }
-
-    shared[threadIdx.x]=add?1:0;
-
-    __syncthreads();
-
-    int nsend=0;
-    if(threadIdx.x==0)
-    {
-      for(int k=0;k<blockDim.x;k++)
-      {
-        if(shared[k]) {nsend++; shared[k]=nsend;}
-      }
-      shared[blockDim.x]=atomicAdd((int*) _buffer,nsend);
-    }
-    __syncthreads();
-    if(add)
-    {
-      int pos=shared[blockDim.x]+shared[threadIdx.x]-1;
-      if(pos<maxlistlength) list[pos]=i;
-    }
-
-    __syncthreads();
-    add=false;
-    i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x+_nlocal;
-    if(i<nlast)
-      if(_x[i+dim*_nmax] >= lo && _x[i+dim*_nmax] <= hi) {
-        add=true;
-      }
-    shared[threadIdx.x]=add?1:0;
-
-    __syncthreads();
-
-    nsend=0;
-    if(threadIdx.x==0)
-    {
-      for(int k=0;k<blockDim.x;k++)
-      {
-        if(shared[k]) {nsend++; shared[k]=nsend;}
-      }
-      shared[blockDim.x]=atomicAdd((int*) _buffer,nsend);
-    }
-    __syncthreads();
-    if(add)
-    {
-      int pos=shared[blockDim.x]+shared[threadIdx.x]-1;
-      if(pos<maxlistlength) list[pos]=i;
-    }
-  }
-}
-
-__global__ void Cuda_CommCuda_BuildSendlist_Multi(int bordergroup,int ineed,int atom_nfirst,int nfirst,int nlast,int dim,int iswap,X_FLOAT* multilo,X_FLOAT* multihi,int* sendlist,int maxlistlength)
-{
-  int* list=sendlist+iswap*maxlistlength;
-  X_FLOAT* mlo=&multilo[iswap*_cuda_ntypes];
-  X_FLOAT* mhi=&multihi[iswap*_cuda_ntypes];
-  bool add=false;
-
-  if (!bordergroup || ineed >= 2) {
-    int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x+nfirst;
-    if(i<nlast)
-    {
-      int itype=_type[i];
-      if(_x[i+dim*_nmax] >= mlo[itype] && _x[i+dim*_nmax] <= mhi[itype]) {
-        add=true;
-      }
-    }
-    shared[threadIdx.x]=add?1:0;
-
-    __syncthreads();
-
-    int nsend=0;
-    if(threadIdx.x==0)
-    {
-      for(int k=0;k<blockDim.x;k++)
-      {
-        if(shared[k]) {nsend++; shared[k]=nsend;}
-      }
-      shared[blockDim.x]=atomicAdd((int*) _buffer,nsend);
-    }
-    __syncthreads();
-    if(add)
-    {
-      int pos=shared[blockDim.x]+shared[threadIdx.x]-1;
-      if(pos<maxlistlength) list[pos]=i;
-    }
-  } else {
-    int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x;
-    if(i<atom_nfirst)
-    {
-      int itype=_type[i];
-      if(_x[i+dim*_nmax] >= mlo[itype] && _x[i+dim*_nmax] <= mhi[itype]) {
-        add=true;
-      }
-    }
-    shared[threadIdx.x]=add?1:0;
-
-    __syncthreads();
-
-    int nsend=0;
-    if(threadIdx.x==0)
-    {
-      for(int k=0;k<blockDim.x;k++)
-      {
-        if(shared[k]) {nsend++; shared[k]=nsend;}
-      }
-      shared[blockDim.x]=atomicAdd((int*) _buffer,nsend);
-    }
-    __syncthreads();
-    if(add)
-    {
-      int pos=shared[blockDim.x]+shared[threadIdx.x]-1;
-      if(pos<maxlistlength) list[pos]=i;
-    }
-
-    __syncthreads();
-    add=false;
-    i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x+_nlocal;
-    if(i<nlast)
-    {
-      int itype=_type[i];
-      if(_x[i+dim*_nmax] >= mlo[itype] && _x[i+dim*_nmax] <= mhi[itype]) {
-        add=true;
-      }
-    }
-    shared[threadIdx.x]=add?1:0;
-
-    __syncthreads();
-
-    nsend=0;
-    if(threadIdx.x==0)
-    {
-      for(int k=0;k<blockDim.x;k++)
-      {
-        if(shared[k]) {nsend++; shared[k]=nsend;}
-      }
-      shared[blockDim.x]=atomicAdd((int*) _buffer,nsend);
-    }
-    __syncthreads();
-    if(add)
-    {
-      int pos=shared[blockDim.x]+shared[threadIdx.x]-1;
-      if(pos<maxlistlength) list[pos]=i;
-    }
-  }
-}
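The BuildSendlist kernels above are a block-level stream compaction: each thread flags whether its atom falls into the swap slab, thread 0 turns the flags into an exclusive prefix inside shared memory and reserves a slot range in the global list with one atomicAdd, and flagged threads then scatter their indices. The same pattern in a self-contained form (all names invented; the original also distinguishes border groups and ghost atoms):

    // Block-level compaction: write the indices i with keep[i] != 0 into out,
    // in block order. Launch with shared size (blockDim.x + 1) * sizeof(int).
    extern __shared__ int sflag[];

    __global__ void compact_demo(const int* keep, int n, int* out, int* global_count)
    {
        int i = blockIdx.x * blockDim.x + threadIdx.x;
        bool add = (i < n) && keep[i];
        sflag[threadIdx.x] = add ? 1 : 0;
        __syncthreads();
        if(threadIdx.x == 0) {
            int total = 0;
            for(int k = 0; k < blockDim.x; k++) {   // serial scan by thread 0, as above
                int f = sflag[k];
                sflag[k] = total;                   // exclusive prefix sum
                total += f;
            }
            sflag[blockDim.x] = atomicAdd(global_count, total);  // reserve slots
        }
        __syncthreads();
        if(add) out[sflag[blockDim.x] + sflag[threadIdx.x]] = i;
    }

The serial scan by thread 0 is simple rather than fast; on the block sizes used here (at most 512 threads) it was evidently considered cheap enough compared with the global atomics it avoids.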
diff --git a/lib/cuda/compute_temp_cuda.cu b/lib/cuda/compute_temp_cuda.cu
deleted file mode 100644
--- a/lib/cuda/compute_temp_cuda.cu
+++ /dev/null
-/* ----------------------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-
-   Original Version:
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   See the README file in the top-level LAMMPS directory.
-
-   -----------------------------------------------------------------------
-
-   USER-CUDA Package and associated modifications:
-   https://sourceforge.net/projects/lammpscuda/
-
-   Christian Trott, christian.trott@tu-ilmenau.de
-   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
-   Theoretical Physics II, University of Technology Ilmenau, Germany
-
-   See the README file in the USER-CUDA directory.
-
-   This software is distributed under the GNU General Public License.
-------------------------------------------------------------------------- */
-
-#include <stdio.h>
-#define MY_PREFIX compute_temp_cuda
-#include "cuda_shared.h"
-#include "cuda_common.h"
-
-#include "crm_cuda_utils.cu"
-
-#include "compute_temp_cuda_cu.h"
-#include "compute_temp_cuda_kernel.cu"
-
-void Cuda_ComputeTempCuda_UpdateBuffer(cuda_shared_data* sdata)
-{
-  int size=(unsigned)((sdata->atom.nlocal+63)/64.0)*6*sizeof(ENERGY_FLOAT);
-  if(sdata->buffersize<size)
-  {
-    MYDBG(printf("Cuda_ComputeTempCuda Resize Buffer at %p with %i kB\n",sdata->buffer,sdata->buffersize);)
-    CudaWrapper_FreeCudaData(sdata->buffer,sdata->buffersize);
-    sdata->buffer = CudaWrapper_AllocCudaData(size);
-    sdata->buffersize=size;
-    sdata->buffer_new++;
-    MYDBG(printf("New buffer at %p with %i kB\n",sdata->buffer,sdata->buffersize);)
-  }
-  cudaMemcpyToSymbol(MY_CONST(buffer), & sdata->buffer, sizeof(int*) );
-}
-
-void Cuda_ComputeTempCuda_UpdateNmax(cuda_shared_data* sdata)
-{
-  cudaMemcpyToSymbol(MY_CONST(mask) , & sdata->atom.mask .dev_data, sizeof(int*) );
-  cudaMemcpyToSymbol(MY_CONST(mass) , & sdata->atom.mass .dev_data, sizeof(V_FLOAT*) );
-  if(sdata->atom.rmass_flag)
-    cudaMemcpyToSymbol(MY_CONST(rmass) , & sdata->atom.rmass.dev_data, sizeof(V_FLOAT*) );
-  cudaMemcpyToSymbol(MY_CONST(rmass_flag) , & sdata->atom.rmass_flag, sizeof(int) );
-  cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) );
-  cudaMemcpyToSymbol(MY_CONST(nmax) , & sdata->atom.nmax , sizeof(int) );
-  cudaMemcpyToSymbol(MY_CONST(v) , & sdata->atom.v .dev_data, sizeof(V_FLOAT*) );
-  cudaMemcpyToSymbol(MY_CONST(type) , & sdata->atom.type .dev_data, sizeof(int*) );
-}
-
-void Cuda_ComputeTempCuda_Init(cuda_shared_data* sdata)
-{
-  Cuda_ComputeTempCuda_UpdateNmax(sdata);
-}
-
-void Cuda_ComputeTempCuda_Vector(cuda_shared_data* sdata, int groupbit,ENERGY_FLOAT* t)
-{
-  //if(sdata->atom.update_nmax) //is most likely not called every timestep, therefore update of constants is necessary
-  Cuda_ComputeTempCuda_UpdateNmax(sdata);
-  //if(sdata->atom.update_nlocal)
-  cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) );
-  //if(sdata->buffer_new)
-  Cuda_ComputeTempCuda_UpdateBuffer(sdata);
-
-  int3 layout=getgrid(sdata->atom.nlocal);
-  dim3 threads(layout.z, 1, 1);
-  dim3 grid(layout.x, layout.y, 1);
-  if(sdata->atom.nlocal>0)
-  {
-    Cuda_ComputeTempCuda_Vector_Kernel<<<grid, threads, threads.x*6*sizeof(ENERGY_FLOAT)>>> (groupbit);
-    cudaThreadSynchronize();
-    CUT_CHECK_ERROR("Cuda_ComputeTempCuda_Vector: compute_vector Kernel execution failed");
-
-    int oldgrid=grid.x*grid.y;
-    grid.x=6;
-    grid.y=1;
-    threads.x=512;
-    Cuda_ComputeTempCuda_Reduce_Kernel<<<grid, threads, threads.x*sizeof(ENERGY_FLOAT)>>> (oldgrid,t);
-    cudaThreadSynchronize();
-    CUT_CHECK_ERROR("Cuda_ComputeTempCuda_Vector: reduce_vector Kernel execution failed");
-  }
-}
-
-void Cuda_ComputeTempCuda_Scalar(cuda_shared_data* sdata, int groupbit,ENERGY_FLOAT* t)
-{
-  //if(sdata->atom.update_nmax) //is most likely not called every timestep, therefore update of constants is necessary
-  Cuda_ComputeTempCuda_UpdateNmax(sdata);
-  //if(sdata->atom.update_nlocal)
-  cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) );
-  //if(sdata->buffer_new)
-  Cuda_ComputeTempCuda_UpdateBuffer(sdata);
-  MYDBG(printf("#CUDA ComputeTempCuda_Scalar: %i\n",sdata->atom.nlocal);)
-  int3 layout=getgrid(sdata->atom.nlocal);
-  dim3 threads(layout.z, 1, 1);
-  dim3 grid(layout.x, layout.y, 1);
-  if(sdata->atom.nlocal>0)
-  {
-    CUT_CHECK_ERROR("Cuda_ComputeTempCuda_Scalar: pre compute_scalar Kernel");
-    Cuda_ComputeTempCuda_Scalar_Kernel<<<grid, threads, threads.x*sizeof(ENERGY_FLOAT)>>> (groupbit);
-    cudaThreadSynchronize();
-    CUT_CHECK_ERROR("Cuda_ComputeTempCuda_Scalar: compute_scalar Kernel execution failed");
-
-    int oldgrid=grid.x*grid.y;
-    grid.x=1;
-    grid.y=1;
-    threads.x=512;
-    Cuda_ComputeTempCuda_Reduce_Kernel<<<grid, threads, threads.x*sizeof(ENERGY_FLOAT)>>> (oldgrid,t);
-    cudaThreadSynchronize();
-    CUT_CHECK_ERROR("Cuda_ComputeTempCuda_Scalar: reduce_scalar Kernel execution failed");
-  }
-}
diff --git a/lib/cuda/compute_temp_cuda_cu.h b/lib/cuda/compute_temp_cuda_cu.h
deleted file mode 100644
index 0793be77cb..0000000000
--- a/lib/cuda/compute_temp_cuda_cu.h
+++ /dev/null
@@ -1,28 +0,0 @@
-/* ----------------------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-
-   Original Version:
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   See the README file in the top-level LAMMPS directory.
- - ----------------------------------------------------------------------- - - USER-CUDA Package and associated modifications: - https://sourceforge.net/projects/lammpscuda/ - - Christian Trott, christian.trott@tu-ilmenau.de - Lars Winterfeld, lars.winterfeld@tu-ilmenau.de - Theoretical Physics II, University of Technology Ilmenau, Germany - - See the README file in the USER-CUDA directory. - - This software is distributed under the GNU General Public License. -------------------------------------------------------------------------- */ - -#include "cuda_shared.h" - -extern "C" void Cuda_ComputeTempCuda_Init(cuda_shared_data* sdata); -extern "C" void Cuda_ComputeTempCuda_Vector(cuda_shared_data* sdata, int groupbit,ENERGY_FLOAT* t); -extern "C" void Cuda_ComputeTempCuda_Scalar(cuda_shared_data* sdata, int groupbit,ENERGY_FLOAT* t); diff --git a/lib/cuda/compute_temp_cuda_kernel.cu b/lib/cuda/compute_temp_cuda_kernel.cu deleted file mode 100644 index c5de884cd1..0000000000 --- a/lib/cuda/compute_temp_cuda_kernel.cu +++ /dev/null @@ -1,109 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - - Original Version: - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - See the README file in the top-level LAMMPS directory. - - ----------------------------------------------------------------------- - - USER-CUDA Package and associated modifications: - https://sourceforge.net/projects/lammpscuda/ - - Christian Trott, christian.trott@tu-ilmenau.de - Lars Winterfeld, lars.winterfeld@tu-ilmenau.de - Theoretical Physics II, University of Technology Ilmenau, Germany - - See the README file in the USER-CUDA directory. - - This software is distributed under the GNU General Public License. 
-------------------------------------------------------------------------- */
-
-extern __shared__ ENERGY_FLOAT sharedmem[];
-
-__global__ void Cuda_ComputeTempCuda_Scalar_Kernel(int groupbit)
-{
-  int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x;
-  sharedmem[threadIdx.x]=0;
-  if(i < _nlocal)
-  {
-    if (_rmass_flag) {
-      if (_mask[i] & groupbit)
-        sharedmem[threadIdx.x] = (_v[i]*_v[i] + _v[i+_nmax]*_v[i+_nmax] + _v[i+2*_nmax]*_v[i+2*_nmax]) * _rmass[i];
-    } else {
-      if (_mask[i] & groupbit)
-        sharedmem[threadIdx.x] = (_v[i]*_v[i] + _v[i+_nmax]*_v[i+_nmax] + _v[i+2*_nmax]*_v[i+2*_nmax]) * (_mass[_type[i]]);
-    }
-  }
-  reduceBlock(sharedmem);
-  ENERGY_FLOAT* buffer=(ENERGY_FLOAT*) _buffer;
-  if(threadIdx.x==0)
-  {
-    buffer[(blockIdx.x*gridDim.y+blockIdx.y)]=sharedmem[0];
-  }
-}
-
-__global__ void Cuda_ComputeTempCuda_Vector_Kernel(int groupbit)
-{
-  int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x;
-  sharedmem[threadIdx.x]=0;
-  sharedmem[threadIdx.x+blockDim.x]=0;
-  sharedmem[threadIdx.x+2*blockDim.x]=0;
-  sharedmem[threadIdx.x+3*blockDim.x]=0;
-  sharedmem[threadIdx.x+4*blockDim.x]=0;
-  sharedmem[threadIdx.x+5*blockDim.x]=0;
-  if(i < _nlocal)
-    if (_mask[i] & groupbit) {
-      V_FLOAT massone;
-      if (_rmass_flag) massone = _rmass[i];
-      else massone = _mass[_type[i]];
-      sharedmem[threadIdx.x] = massone * _v[i]*_v[i];
-      sharedmem[threadIdx.x+blockDim.x] = massone * _v[i+_nmax]*_v[i+_nmax];
-      sharedmem[threadIdx.x+2*blockDim.x] = massone * _v[i+2*_nmax]*_v[i+2*_nmax];
-      sharedmem[threadIdx.x+3*blockDim.x] = massone * _v[i]*_v[i+_nmax];
-      sharedmem[threadIdx.x+4*blockDim.x] = massone * _v[i]*_v[i+2*_nmax];
-      sharedmem[threadIdx.x+5*blockDim.x] = massone * _v[i+_nmax]*_v[i+2*_nmax];
-    }
-  reduceBlock(sharedmem);
-  reduceBlock(&sharedmem[blockDim.x]);
-  reduceBlock(&sharedmem[2*blockDim.x]);
-  reduceBlock(&sharedmem[3*blockDim.x]);
-  reduceBlock(&sharedmem[4*blockDim.x]);
-  reduceBlock(&sharedmem[5*blockDim.x]);
-  ENERGY_FLOAT* buffer=(ENERGY_FLOAT*) _buffer;
-  if(threadIdx.x==0)
-  {
-    buffer[(blockIdx.x*gridDim.y+blockIdx.y)]=sharedmem[0];
-    buffer[(blockIdx.x*gridDim.y+blockIdx.y)+gridDim.x*gridDim.y]=sharedmem[blockDim.x];
-    buffer[(blockIdx.x*gridDim.y+blockIdx.y)+2*gridDim.x*gridDim.y]=sharedmem[2*blockDim.x];
-    buffer[(blockIdx.x*gridDim.y+blockIdx.y)+3*gridDim.x*gridDim.y]=sharedmem[3*blockDim.x];
-    buffer[(blockIdx.x*gridDim.y+blockIdx.y)+4*gridDim.x*gridDim.y]=sharedmem[4*blockDim.x];
-    buffer[(blockIdx.x*gridDim.y+blockIdx.y)+5*gridDim.x*gridDim.y]=sharedmem[5*blockDim.x];
-  }
-}
-
-__global__ void Cuda_ComputeTempCuda_Reduce_Kernel(int n,ENERGY_FLOAT* t)
-{
-  int i=0;
-  sharedmem[threadIdx.x]=0;
-  ENERGY_FLOAT myforig=0.0;
-  ENERGY_FLOAT* buf=(ENERGY_FLOAT*) _buffer;
-  buf=&buf[blockIdx.x*n];
-  while(i<n)
-  {
-    sharedmem[threadIdx.x]=0;
-    if(i+threadIdx.x<n)
-      sharedmem[threadIdx.x]=buf[i+threadIdx.x];
-
-    __syncthreads();
-    reduceBlock(sharedmem);
-    i+=blockDim.x;
-    if(threadIdx.x==0) myforig+=sharedmem[0];
-  }
-
-  if(threadIdx.x==0)
-    t[blockIdx.x]=myforig;
-}
diff --git a/lib/cuda/compute_temp_partial_cuda.cu b/lib/cuda/compute_temp_partial_cuda.cu
deleted file mode 100644
--- a/lib/cuda/compute_temp_partial_cuda.cu
+++ /dev/null
-/* ----------------------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-
-   Original Version:
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   See the README file in the top-level LAMMPS directory.
-
-   -----------------------------------------------------------------------
-
-   USER-CUDA Package and associated modifications:
-   https://sourceforge.net/projects/lammpscuda/
-
-   Christian Trott, christian.trott@tu-ilmenau.de
-   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
-   Theoretical Physics II, University of Technology Ilmenau, Germany
-
-   See the README file in the USER-CUDA directory.
-
-   This software is distributed under the GNU General Public License.
-------------------------------------------------------------------------- */
-
-#include <stdio.h>
-#define MY_PREFIX compute_temp_partial_cuda
-#include "cuda_shared.h"
-#include "cuda_common.h"
-
-#include "crm_cuda_utils.cu"
-
-#include "compute_temp_partial_cuda_cu.h"
-#include "compute_temp_partial_cuda_kernel.cu"
-
-void Cuda_ComputeTempPartialCuda_UpdateBuffer(cuda_shared_data* sdata)
-{
-  int size=(unsigned)((sdata->atom.nlocal+63)/64.0)*6*sizeof(ENERGY_FLOAT);
-  if(sdata->buffersize<size)
-  {
-    MYDBG(printf("Cuda_ComputeTempPartialCuda Resize Buffer at %p with %i kB\n",sdata->buffer,sdata->buffersize);)
-    CudaWrapper_FreeCudaData(sdata->buffer,sdata->buffersize);
-    sdata->buffer = CudaWrapper_AllocCudaData(size);
-    sdata->buffersize=size;
-    sdata->buffer_new++;
-    MYDBG(printf("New buffer at %p with %i kB\n",sdata->buffer,sdata->buffersize);)
-  }
-  cudaMemcpyToSymbol(MY_CONST(buffer), & sdata->buffer, sizeof(int*) );
-}
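The temperature computes above use a two-pass reduction: a first kernel leaves one partial sum per block in the scratch buffer, then a reduce kernel folds those partials with a single block per output component. A self-contained sketch of the scheme, with a simple tree reduction standing in for the package's reduceBlock helper (all names here are invented; blockDim.x is assumed to be a power of two, as with the 512-thread launches above):

    extern __shared__ float smem[];

    // Stand-in for reduceBlock: shared-memory tree reduction into s[0].
    __device__ void reduce_block_demo(float* s)
    {
        for(int stride = blockDim.x / 2; stride > 0; stride >>= 1) {
            __syncthreads();
            if(threadIdx.x < stride) s[threadIdx.x] += s[threadIdx.x + stride];
        }
        __syncthreads();
    }

    // Pass 1: one partial sum per block, written to partial[blockIdx.x].
    // Launch with shared size blockDim.x * sizeof(float); pass 2 runs the
    // same code over the partials with a single block to get the total.
    __global__ void partial_sums_demo(const float* v, int n, float* partial)
    {
        int i = blockIdx.x * blockDim.x + threadIdx.x;
        smem[threadIdx.x] = (i < n) ? v[i] : 0.0f;
        reduce_block_demo(smem);
        if(threadIdx.x == 0) partial[blockIdx.x] = smem[0];
    }

Sizing the scratch buffer to six partials per 64-atom block, as UpdateBuffer does above, lets the vector variant reduce all six kinetic-tensor components out of one pass.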
-void Cuda_ComputeTempPartialCuda_UpdateNmax(cuda_shared_data* sdata)
-{
-  cudaMemcpyToSymbol(MY_CONST(mask) , & sdata->atom.mask .dev_data, sizeof(int*) );
-  cudaMemcpyToSymbol(MY_CONST(mass) , & sdata->atom.mass .dev_data, sizeof(V_FLOAT*) );
-  if(sdata->atom.rmass_flag)
-    cudaMemcpyToSymbol(MY_CONST(rmass) , & sdata->atom.rmass.dev_data, sizeof(V_FLOAT*) );
-  cudaMemcpyToSymbol(MY_CONST(rmass_flag) , & sdata->atom.rmass_flag, sizeof(int) );
-  cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) );
-  cudaMemcpyToSymbol(MY_CONST(nmax) , & sdata->atom.nmax , sizeof(int) );
-  cudaMemcpyToSymbol(MY_CONST(v) , & sdata->atom.v .dev_data, sizeof(V_FLOAT*) );
-  cudaMemcpyToSymbol(MY_CONST(type) , & sdata->atom.type .dev_data, sizeof(int*) );
-}
-
-void Cuda_ComputeTempPartialCuda_Init(cuda_shared_data* sdata)
-{
-  Cuda_ComputeTempPartialCuda_UpdateNmax(sdata);
-}
-
-void Cuda_ComputeTempPartialCuda_Vector(cuda_shared_data* sdata, int groupbit,ENERGY_FLOAT* t,int xflag,int yflag,int zflag)
-{
-  //if(sdata->atom.update_nmax) //is most likely not called every timestep, therefore update of constants is necessary
-  Cuda_ComputeTempPartialCuda_UpdateNmax(sdata);
-  //if(sdata->atom.update_nlocal)
-  cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) );
-  //if(sdata->buffer_new)
-  Cuda_ComputeTempPartialCuda_UpdateBuffer(sdata);
-
-  int3 layout=getgrid(sdata->atom.nlocal);
-  dim3 threads(layout.z, 1, 1);
-  dim3 grid(layout.x, layout.y, 1);
-  if(sdata->atom.nlocal>0)
-  {
-    Cuda_ComputeTempPartialCuda_Vector_Kernel<<<grid, threads, threads.x*6*sizeof(ENERGY_FLOAT)>>> (groupbit,xflag,yflag,zflag);
-    cudaThreadSynchronize();
-    CUT_CHECK_ERROR("Cuda_ComputeTempPartialCuda_Vector: compute_vector Kernel execution failed");
-
-    int oldgrid=grid.x*grid.y;
-    grid.x=6;
-    grid.y=1;
-    threads.x=512;
-    Cuda_ComputeTempPartialCuda_Reduce_Kernel<<<grid, threads, threads.x*sizeof(ENERGY_FLOAT)>>> (oldgrid,t);
-    cudaThreadSynchronize();
-    CUT_CHECK_ERROR("Cuda_ComputeTempPartialCuda_Vector: reduce_vector Kernel execution failed");
-  }
-}
-
-void Cuda_ComputeTempPartialCuda_Scalar(cuda_shared_data* sdata, int groupbit,ENERGY_FLOAT* t,int xflag,int yflag,int zflag)
-{
-  //if(sdata->atom.update_nmax) //is most likely not called every timestep, therefore update of constants is necessary
-  Cuda_ComputeTempPartialCuda_UpdateNmax(sdata);
-  //if(sdata->atom.update_nlocal)
-  cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) );
-  //if(sdata->buffer_new)
-  Cuda_ComputeTempPartialCuda_UpdateBuffer(sdata);
-  MYDBG(printf("#CUDA ComputeTempPartialCuda_Scalar: %i\n",sdata->atom.nlocal);)
-  int3 layout=getgrid(sdata->atom.nlocal);
-  dim3 threads(layout.z, 1, 1);
-  dim3 grid(layout.x, layout.y, 1);
-  if(sdata->atom.nlocal>0)
-  {
-    CUT_CHECK_ERROR("Cuda_ComputeTempPartialCuda_Scalar: pre compute_scalar Kernel");
-    Cuda_ComputeTempPartialCuda_Scalar_Kernel<<<grid, threads, threads.x*sizeof(ENERGY_FLOAT)>>> (groupbit,xflag,yflag,zflag);
-    cudaThreadSynchronize();
-    CUT_CHECK_ERROR("Cuda_ComputeTempPartialCuda_Scalar: compute_scalar Kernel execution failed");
-
-    int oldgrid=grid.x*grid.y;
-    grid.x=1;
-    grid.y=1;
-    threads.x=512;
-    Cuda_ComputeTempPartialCuda_Reduce_Kernel<<<grid, threads, threads.x*sizeof(ENERGY_FLOAT)>>> (oldgrid,t);
-    cudaThreadSynchronize();
-    CUT_CHECK_ERROR("Cuda_ComputeTempPartialCuda_Scalar: reduce_scalar Kernel execution failed");
-  }
-}
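The RemoveBiasAll/RestoreBiasAll pair that follows implements velocity-bias removal for the partial-temperature compute: the component flags select which degrees of freedom count, and excluded components are saved into vbiasall and zeroed so the kinetic-energy kernels above only see the included ones; restoring adds them back. One component of that idea in isolation (demo names; the original strides components by nmax and checks the group mask):

    // Save and hide one velocity component (when its flag excludes it).
    __global__ void remove_bias_demo(float* v, float* vbias, int n, int xflag)
    {
        int i = blockIdx.x * blockDim.x + threadIdx.x;
        if(i < n && !xflag) {
            vbias[i] = v[i];   // remember the excluded component ...
            v[i] = 0.0f;       // ... and hide it from the kinetic-energy sums
        }
    }

    // Put the hidden component back after the temperature has been computed.
    __global__ void restore_bias_demo(float* v, const float* vbias, int n, int xflag)
    {
        int i = blockIdx.x * blockDim.x + threadIdx.x;
        if(i < n && !xflag) v[i] = vbias[i];
    }

Doing this on the device avoids a round trip of the whole velocity array to the host between the remove, compute, and restore steps.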
-void Cuda_ComputeTempPartialCuda_RemoveBiasAll(cuda_shared_data* sdata, int groupbit,int xflag,int yflag,int zflag,void* vbiasall)
-{
-  //if(sdata->atom.update_nmax) //is most likely not called every timestep, therefore update of constants is necessary
-  Cuda_ComputeTempPartialCuda_UpdateNmax(sdata);
-  //if(sdata->atom.update_nlocal)
-  cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) );
-  //if(sdata->buffer_new)
-  Cuda_ComputeTempPartialCuda_UpdateBuffer(sdata);
-
-  int3 layout=getgrid(sdata->atom.nlocal);
-  dim3 threads(layout.z, 1, 1);
-  dim3 grid(layout.x, layout.y, 1);
-  if(sdata->atom.nlocal>0)
-  {
-    Cuda_ComputeTempPartialCuda_RemoveBiasAll_Kernel<<<grid, threads>>> (groupbit,xflag,yflag,zflag,(V_FLOAT*) vbiasall);
-    cudaThreadSynchronize();
-    CUT_CHECK_ERROR("Cuda_ComputeTempPartialCuda_RemoveBiasAll: compute_vector Kernel execution failed");
-  }
-}
-
-void Cuda_ComputeTempPartialCuda_RestoreBiasAll(cuda_shared_data* sdata, int groupbit,int xflag,int yflag,int zflag,void* vbiasall)
-{
-  //if(sdata->atom.update_nmax) //is most likely not called every timestep, therefore update of constants is necessary
-  Cuda_ComputeTempPartialCuda_UpdateNmax(sdata);
-  //if(sdata->atom.update_nlocal)
-  cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) );
-  //if(sdata->buffer_new)
-  Cuda_ComputeTempPartialCuda_UpdateBuffer(sdata);
-
-  int3 layout=getgrid(sdata->atom.nlocal);
-  dim3 threads(layout.z, 1, 1);
-  dim3 grid(layout.x, layout.y, 1);
-  if(sdata->atom.nlocal>0)
-  {
-    Cuda_ComputeTempPartialCuda_RestoreBiasAll_Kernel<<<grid, threads>>> (groupbit,xflag,yflag,zflag,(V_FLOAT*) vbiasall);
-    cudaThreadSynchronize();
-    CUT_CHECK_ERROR("Cuda_ComputeTempPartialCuda_RestoreBiasAll: compute_vector Kernel execution failed");
-  }
-}
diff --git a/lib/cuda/compute_temp_partial_cuda_cu.h b/lib/cuda/compute_temp_partial_cuda_cu.h
deleted file mode 100644
index 82fe86fa71..0000000000
--- a/lib/cuda/compute_temp_partial_cuda_cu.h
+++ /dev/null
@@ -1,30 +0,0 @@
-/* ----------------------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-
-   Original Version:
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   See the README file in the top-level LAMMPS directory.
-
-   -----------------------------------------------------------------------
-
-   USER-CUDA Package and associated modifications:
-   https://sourceforge.net/projects/lammpscuda/
-
-   Christian Trott, christian.trott@tu-ilmenau.de
-   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
-   Theoretical Physics II, University of Technology Ilmenau, Germany
-
-   See the README file in the USER-CUDA directory.
-
-   This software is distributed under the GNU General Public License.
-------------------------------------------------------------------------- */ - -#include "cuda_shared.h" - -extern "C" void Cuda_ComputeTempPartialCuda_Init(cuda_shared_data* sdata); -extern "C" void Cuda_ComputeTempPartialCuda_Vector(cuda_shared_data* sdata, int groupbit,ENERGY_FLOAT* t,int xflag,int yflag,int zflag); -extern "C" void Cuda_ComputeTempPartialCuda_Scalar(cuda_shared_data* sdata, int groupbit,ENERGY_FLOAT* t,int xflag,int yflag,int zflag); -extern "C" void Cuda_ComputeTempPartialCuda_RemoveBiasAll(cuda_shared_data* sdata, int groupbit,int xflag,int yflag,int zflag,void* vbiasall); -extern "C" void Cuda_ComputeTempPartialCuda_RestoreBiasAll(cuda_shared_data* sdata, int groupbit,int xflag,int yflag,int zflag,void* vbiasall); diff --git a/lib/cuda/compute_temp_partial_cuda_kernel.cu b/lib/cuda/compute_temp_partial_cuda_kernel.cu deleted file mode 100644 index 7c7895ca43..0000000000 --- a/lib/cuda/compute_temp_partial_cuda_kernel.cu +++ /dev/null @@ -1,152 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - - Original Version: - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - See the README file in the top-level LAMMPS directory. - - ----------------------------------------------------------------------- - - USER-CUDA Package and associated modifications: - https://sourceforge.net/projects/lammpscuda/ - - Christian Trott, christian.trott@tu-ilmenau.de - Lars Winterfeld, lars.winterfeld@tu-ilmenau.de - Theoretical Physics II, University of Technology Ilmenau, Germany - - See the README file in the USER-CUDA directory. - - This software is distributed under the GNU General Public License. -------------------------------------------------------------------------- */ - -extern __shared__ ENERGY_FLOAT sharedmem[]; - - -__global__ void Cuda_ComputeTempPartialCuda_Scalar_Kernel(int groupbit,int xflag,int yflag,int zflag) -{ - int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x; - sharedmem[threadIdx.x]=0; - if(i < _nlocal) - { - if (_rmass_flag) { - if (_mask[i] & groupbit) - sharedmem[threadIdx.x] = (_v[i]*_v[i]*xflag + _v[i+_nmax]*_v[i+_nmax]*yflag + _v[i+2*_nmax]*_v[i+2*_nmax]*zflag) * _rmass[i]; - } else { - if (_mask[i] & groupbit) - sharedmem[threadIdx.x] = (_v[i]*_v[i]*xflag + _v[i+_nmax]*_v[i+_nmax]*yflag + _v[i+2*_nmax]*_v[i+2*_nmax]*zflag) * (_mass[_type[i]]); - } - } - reduceBlock(sharedmem); - ENERGY_FLOAT* buffer=(ENERGY_FLOAT*) _buffer; - if(threadIdx.x==0) - { - buffer[blockIdx.x*gridDim.y+blockIdx.y]=sharedmem[0]; - } -} - -__global__ void Cuda_ComputeTempPartialCuda_Vector_Kernel(int groupbit,int xflag,int yflag,int zflag) -{ - int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x; - sharedmem[threadIdx.x]=0; - sharedmem[threadIdx.x+blockDim.x]=0; - sharedmem[threadIdx.x+2*blockDim.x]=0; - sharedmem[threadIdx.x+3*blockDim.x]=0; - sharedmem[threadIdx.x+4*blockDim.x]=0; - sharedmem[threadIdx.x+5*blockDim.x]=0; - if(i < _nlocal) - if (_mask[i] & groupbit) { - V_FLOAT massone; - if (_rmass_flag) massone = _rmass[i]; - else massone = _mass[_type[i]]; - sharedmem[threadIdx.x] = massone * _v[i]*_v[i]*xflag; - sharedmem[threadIdx.x+blockDim.x] = massone * _v[i+_nmax]*_v[i+_nmax]*yflag; - sharedmem[threadIdx.x+2*blockDim.x] = massone * _v[i+2*_nmax]*_v[i+2*_nmax]*zflag; - sharedmem[threadIdx.x+3*blockDim.x] = massone * _v[i]*_v[i+_nmax]*xflag*yflag; - sharedmem[threadIdx.x+4*blockDim.x] = massone * 
_v[i]*_v[i+2*_nmax]*xflag*zflag; - sharedmem[threadIdx.x+5*blockDim.x] = massone * _v[i+_nmax]*_v[i+2*_nmax]*yflag*zflag; - } - reduceBlock(sharedmem); - reduceBlock(&sharedmem[blockDim.x]); - reduceBlock(&sharedmem[2*blockDim.x]); - reduceBlock(&sharedmem[3*blockDim.x]); - reduceBlock(&sharedmem[4*blockDim.x]); - reduceBlock(&sharedmem[5*blockDim.x]); - ENERGY_FLOAT* buffer=(ENERGY_FLOAT*) _buffer; - if(threadIdx.x==0) - { - buffer[blockIdx.x*gridDim.y+blockIdx.y]=sharedmem[0]; - buffer[blockIdx.x*gridDim.y+blockIdx.y+gridDim.x*gridDim.y]=sharedmem[blockDim.x]; - buffer[blockIdx.x*gridDim.y+blockIdx.y+2*gridDim.x*gridDim.y]=sharedmem[2*blockDim.x]; - buffer[blockIdx.x*gridDim.y+blockIdx.y+3*gridDim.x*gridDim.y]=sharedmem[3*blockDim.x]; - buffer[blockIdx.x*gridDim.y+blockIdx.y+4*gridDim.x*gridDim.y]=sharedmem[4*blockDim.x]; - buffer[blockIdx.x*gridDim.y+blockIdx.y+5*gridDim.x*gridDim.y]=sharedmem[5*blockDim.x]; - } -} - - -__global__ void Cuda_ComputeTempPartialCuda_Reduce_Kernel(int n,ENERGY_FLOAT* t) -{ - int i=0; - sharedmem[threadIdx.x]=0; - ENERGY_FLOAT myforig=0.0; - ENERGY_FLOAT* buf=(ENERGY_FLOAT*) _buffer; - buf=&buf[blockIdx.x*n]; - while(i (b) ? (a) : (b)) - -inline int3 getgrid(int n, int shared_per_thread = 0, int threadsmax = 256, bool p2 = false) -{ - int3 gridparams; - int sharedsize = 16000; - - if(shared_per_thread > 0) threadsmax = sharedsize / shared_per_thread < threadsmax ? sharedsize / shared_per_thread : threadsmax; - - if((n < 60 * 32) || (threadsmax < 64)) - gridparams.z = 32; - else if((n < 60 * 64) || (threadsmax < 128)) - gridparams.z = 64; - else if((n < 60 * 128) || (threadsmax < 256)) - gridparams.z = 128; - else if((n < 60 * 256) || (threadsmax < 512)) - gridparams.z = 256; - else gridparams.z = 512; - - if(p2) { - gridparams.z = 16; - - while(gridparams.z * 2 <= threadsmax) gridparams.z *= 2; - } - - - int blocks = (n + gridparams.z - 1) / gridparams.z; - - if(blocks > 10000) - gridparams.x = gridparams.y = int(sqrt(blocks)); - else { - gridparams.x = blocks; - gridparams.y = 1; - } - - while(gridparams.x * gridparams.y * gridparams.z < n) gridparams.x++; - - if(gridparams.x == 0) gridparams.x = 1; - - return gridparams; -} - -//return value: 1 if f<0; else: 0 -//take care if working with values as "blockId.x-n" for f: it might be interpreted as a unsigned int -static inline __device__ int negativCUDA(float f) -{ - return ((unsigned int)1 << 31 & (__float_as_int(f))) >> 31; -} - -//return value: -1 if f<0; else +1 -static inline __device__ float fsignCUDA(float f) -{ - return f < 0.0f ? 
-1.0f : 1.0f; -} - -//functions to copy data between global and shared memory (indeed you can copy data between two arbitrary memory regims on device - as long as you have read respectively write rights) -//blockDim.y and blockDim.z are assumed to be 1 -static inline __device__ void copySharedToGlob(int* shared, int* glob, const int &n) -{ - int i, k; - k = n - blockDim.x; - - for(i = 0; i < k; i += blockDim.x) { - glob[i + threadIdx.x] = shared[i + threadIdx.x]; - } - - if(threadIdx.x < n - i) { - glob[i + threadIdx.x] = shared[i + threadIdx.x]; - } - - __syncthreads(); -} - -static inline __device__ void copySharedToGlob(float* shared, float* glob, const int &n) -{ - int i, k; - k = n - blockDim.x; - - for(i = 0; i < k; i += blockDim.x) { - glob[i + threadIdx.x] = shared[i + threadIdx.x]; - } - - if(threadIdx.x < n - i) { - glob[i + threadIdx.x] = shared[i + threadIdx.x]; - } - - __syncthreads(); -} - -static inline __device__ void copySharedToGlob(double* shared, double* glob, const int &n) -{ - int i, k; - k = n - blockDim.x; - - for(i = 0; i < k; i += blockDim.x) { - glob[i + threadIdx.x] = shared[i + threadIdx.x]; - } - - if(threadIdx.x < n - i) { - glob[i + threadIdx.x] = shared[i + threadIdx.x]; - } - - __syncthreads(); -} - -static inline __device__ void copyGlobToShared(int* glob, int* shared, const int &n) -{ - int i, k; - k = n - blockDim.x; - - for(i = 0; i < k; i += blockDim.x) { - shared[i + threadIdx.x] = glob[i + threadIdx.x]; - } - - if(threadIdx.x < n - i) { - shared[i + threadIdx.x] = glob[i + threadIdx.x]; - } - - __syncthreads(); -} - -static __device__ inline void copyGlobToShared(float* glob, float* shared, const int &n) -{ - int i, k; - k = n - blockDim.x; - - for(i = 0; i < k; i += blockDim.x) { - shared[i + threadIdx.x] = glob[i + threadIdx.x]; - } - - if(threadIdx.x < n - i) { - shared[i + threadIdx.x] = glob[i + threadIdx.x]; - } - - __syncthreads(); -} - -static __device__ inline void copyGlobToShared(double* glob, double* shared, const int &n) -{ - int i; - - for(i = 0; i < n - blockDim.x; i += blockDim.x) { - shared[i + threadIdx.x] = glob[i + threadIdx.x]; - } - - if(threadIdx.x < n - i) { - shared[i + threadIdx.x] = glob[i + threadIdx.x]; - } - - __syncthreads(); -} - -//copy data between two memory areas on device, 3d BlockDims are allowed -static __device__ inline void copyData(double* source, double* target, const int &n) -{ - int i; - int offset = threadIdx.x * blockDim.y * blockDim.z + threadIdx.y * blockDim.z + threadIdx.z; - - for(i = 0; i < n - blockDim.x * blockDim.y * blockDim.z; i += blockDim.x * blockDim.y * blockDim.z) { - target[i + offset] = source[i + offset]; - } - - if(offset < n - i) { - target[i + offset] = source[i + offset]; - } - - __syncthreads(); -} - -static __device__ inline void copyData(float* source, float* target, const int &n) -{ - int i; - int offset = threadIdx.x * blockDim.y * blockDim.z + threadIdx.y * blockDim.z + threadIdx.z; - - for(i = 0; i < n - blockDim.x * blockDim.y * blockDim.z; i += blockDim.x * blockDim.y * blockDim.z) { - target[i + offset] = source[i + offset]; - } - - if(offset < n - i) { - target[i + offset] = source[i + offset]; - } - - __syncthreads(); -} - -static __device__ inline void copyData(int* source, int* target, const int &n) -{ - int i; - int offset = threadIdx.x * blockDim.y * blockDim.z + threadIdx.y * blockDim.z + threadIdx.z; - - for(i = 0; i < n - blockDim.x * blockDim.y * blockDim.z; i += blockDim.x * blockDim.y * blockDim.z) { - target[i + offset] = source[i + offset]; - } - - if(offset 
< n - i) { - target[i + offset] = source[i + offset]; - } - - __syncthreads(); -} - -static __device__ inline void copyData(unsigned int* source, unsigned int* target, const int &n) -{ - int i; - int offset = threadIdx.x * blockDim.y * blockDim.z + threadIdx.y * blockDim.z + threadIdx.z; - - for(i = 0; i < n - blockDim.x * blockDim.y * blockDim.z; i += blockDim.x * blockDim.y * blockDim.z) { - target[i + offset] = source[i + offset]; - } - - if(offset < n - i) { - target[i + offset] = source[i + offset]; - } - - __syncthreads(); -} - -//functions in order to sum over values of one block. P2 means blockdim MUST be a power of 2 otherwise the behaviour is not well defined -//in the end in data[0]=sum_i=0^blockDim.x data[i] -//for reduceBlockP2 and reduceBlock blockDim.y=1 and blockDim.z=1 -static __device__ inline void reduceBlockP2(int* data) -{ - __syncthreads(); - - for(int i = 2; i <= blockDim.x; i *= 2) { - if(threadIdx.x < blockDim.x / i) - data[threadIdx.x] += data[threadIdx.x + blockDim.x / i]; - - __syncthreads(); - } -} - -static __device__ inline void reduceBlockP2(unsigned int* data) -{ - __syncthreads(); - - for(int i = 2; i <= blockDim.x; i *= 2) { - if(threadIdx.x < blockDim.x / i) - data[threadIdx.x] += data[threadIdx.x + blockDim.x / i]; - - __syncthreads(); - } -} - -static __device__ inline void reduceBlockP2(float* data) -{ - __syncthreads(); - - for(int i = 2; i <= blockDim.x; i *= 2) { - if(threadIdx.x < blockDim.x / i) - data[threadIdx.x] += data[threadIdx.x + blockDim.x / i]; - - __syncthreads(); - } -} - -static __device__ inline void reduceBlockP2(double* data) -{ - __syncthreads(); - - for(int i = 2; i <= blockDim.x; i *= 2) { - if(threadIdx.x < blockDim.x / i) - data[threadIdx.x] += data[threadIdx.x + blockDim.x / i]; - - __syncthreads(); - } -} - -static __device__ inline void reduceBlock(float* data) -{ - __syncthreads(); - int p2 = 1; - - while(p2 * 2 < blockDim.x) p2 *= 2; - - if(threadIdx.x < blockDim.x - p2) - data[threadIdx.x] += data[threadIdx.x + p2]; - - __syncthreads(); - - for(int i = 2; i <= p2; i *= 2) { - if(threadIdx.x < p2 / i) - data[threadIdx.x] += data[threadIdx.x + p2 / i]; - - __syncthreads(); - } -} - -static __device__ inline void reduceBlock(int* data) -{ - __syncthreads(); - int p2 = 1; - - while(p2 * 2 < blockDim.x) p2 *= 2; - - if(threadIdx.x < blockDim.x - p2) - data[threadIdx.x] += data[threadIdx.x + p2]; - - __syncthreads(); - - for(int i = 2; i <= p2; i *= 2) { - if(threadIdx.x < p2 / i) - data[threadIdx.x] += data[threadIdx.x + p2 / i]; - - __syncthreads(); - } -} - -static __device__ inline void reduceBlock(unsigned int* data) -{ - __syncthreads(); - int p2 = 1; - - while(p2 * 2 < blockDim.x) p2 *= 2; - - if(threadIdx.x < blockDim.x - p2) - data[threadIdx.x] += data[threadIdx.x + p2]; - - __syncthreads(); - - for(int i = 2; i <= p2; i *= 2) { - if(threadIdx.x < p2 / i) - data[threadIdx.x] += data[threadIdx.x + p2 / i]; - - __syncthreads(); - } -} - -static __device__ inline void reduceBlock(double* data) -{ - __syncthreads(); - int p2 = 1; - - while(p2 * 2 < blockDim.x) p2 *= 2; - - if(threadIdx.x < blockDim.x - p2) - data[threadIdx.x] += data[threadIdx.x + p2]; - - __syncthreads(); - - for(int i = 2; i <= p2; i *= 2) { - if(threadIdx.x < p2 / i) - data[threadIdx.x] += data[threadIdx.x + p2 / i]; - - __syncthreads(); - } -} - -static __device__ inline void cudaFillBlockData_int(int* data, const int &n, const int &value) -{ - int i; - - for(i = 0; i < n - blockDim.x; i += blockDim.x) { - data[i + threadIdx.x] = value; - } - - 
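// The strided loop above fills data[] in full tiles of blockDim.x elements
// (thread t writes slot i+t in each round); when it exits, i is the start of
// the last, possibly partial tile, so the guarded write below lets only the
// first n - i threads handle the remainder. The copySharedToGlob /
// copyGlobToShared / copyData helpers above use the same
// stride-and-remainder idiom.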
if(threadIdx.x < n - i) data[i + threadIdx.x] = value; -} - -static __device__ inline void cudaFillBlockData_float(float* data, const int &n, const float &value) -{ - int i; - - for(i = 0; i < n - blockDim.x; i += blockDim.x) { - data[i + threadIdx.x] = value; - } - - if(threadIdx.x < n - i) data[i + threadIdx.x] = value; -} - -static __device__ inline void reduce(float* data, int n) //cautious not sure if working -{ - __syncthreads(); - int p2 = 1; - - while(p2 * 2 < n) p2 *= 2; - - int j = 0; - - while((threadIdx.x + blockDim.x * j) * 2 < n - p2) { - data[threadIdx.x + blockDim.x * j] += data[(threadIdx.x + blockDim.x * j) + p2]; - j++; - } - - __syncthreads(); - - for(int i = 2; i <= p2; i *= 2) { - while((threadIdx.x + blockDim.x * j) < p2 / i) { - data[threadIdx.x + blockDim.x * j] += data[(threadIdx.x + blockDim.x * j) + p2 / i]; - j++; - } - - __syncthreads(); - } -} - -static __device__ inline void reduce(double* data, int n) //cautious not sure if working -{ - __syncthreads(); - int p2 = 1; - - while(p2 * 2 < n) p2 *= 2; - - int j = 0; - - while((threadIdx.x + blockDim.x * j) * 2 < n - p2) { - data[threadIdx.x + blockDim.x * j] += data[(threadIdx.x + blockDim.x * j) + p2]; - j++; - } - - __syncthreads(); - - for(int i = 2; i <= p2; i *= 2) { - while((threadIdx.x + blockDim.x * j) < p2 / i) { - data[threadIdx.x + blockDim.x * j] += data[(threadIdx.x + blockDim.x * j) + p2 / i]; - j++; - } - - __syncthreads(); - } -} - -static __device__ inline void minOfBlock(float* data) -{ - __syncthreads(); - int p2 = 1; - - while(p2 * 2 < blockDim.x) p2 *= 2; - - if(threadIdx.x < blockDim.x - p2) - data[threadIdx.x] = MIN(data[threadIdx.x + p2], data[threadIdx.x]); - - __syncthreads(); - - for(int i = 2; i <= p2; i *= 2) { - if(threadIdx.x < p2 / i) - data[threadIdx.x] = MIN(data[threadIdx.x + p2 / i], data[threadIdx.x]); - - __syncthreads(); - } -} - -static __device__ inline void maxOfBlock(float* data) -{ - __syncthreads(); - int p2 = 1; - - while(p2 * 2 < blockDim.x) p2 *= 2; - - if(threadIdx.x < blockDim.x - p2) - data[threadIdx.x] = MAX(data[threadIdx.x + p2], data[threadIdx.x]); - - __syncthreads(); - - for(int i = 2; i <= p2; i *= 2) { - if(threadIdx.x < p2 / i) - data[threadIdx.x] = MAX(data[threadIdx.x + p2 / i], data[threadIdx.x]); - - __syncthreads(); - } -} - -static __device__ inline void minOfBlock(double* data) -{ - __syncthreads(); - int p2 = 1; - - while(p2 * 2 < blockDim.x) p2 *= 2; - - if(threadIdx.x < blockDim.x - p2) - data[threadIdx.x] = MIN(data[threadIdx.x + p2], data[threadIdx.x]); - - __syncthreads(); - - for(int i = 2; i <= p2; i *= 2) { - if(threadIdx.x < p2 / i) - data[threadIdx.x] = MIN(data[threadIdx.x + p2 / i], data[threadIdx.x]); - - __syncthreads(); - } -} - -static __device__ inline void maxOfBlock(double* data) -{ - __syncthreads(); - int p2 = 1; - - while(p2 * 2 < blockDim.x) p2 *= 2; - - if(threadIdx.x < blockDim.x - p2) - data[threadIdx.x] = MAX(data[threadIdx.x + p2], data[threadIdx.x]); - - __syncthreads(); - - for(int i = 2; i <= p2; i *= 2) { - if(threadIdx.x < p2 / i) - data[threadIdx.x] = MAX(data[threadIdx.x + p2 / i], data[threadIdx.x]); - - __syncthreads(); - } -} - - -static __device__ inline void minOfData(double* data, int n) //cautious not sure if working -{ - __syncthreads(); - int p2 = 1; - - while(p2 * 2 < n) p2 *= 2; - - int j = 0; - - while((threadIdx.x + blockDim.x * j) < n - p2) { - data[threadIdx.x + blockDim.x * j] = MIN(data[threadIdx.x + blockDim.x * j], data[(threadIdx.x + blockDim.x * j) + p2]); - j++; - } - - __syncthreads(); - 
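// Two-phase reduction over a data[] array that may be longer than the block
// (the scheme of reduceBlock above, generalized via j to multiple
// blockDim.x-sized tiles): the while loop just executed folded every element
// beyond the largest power of two p2 onto data[0..n-p2); the tree loop below
// then halves the active prefix log2(p2) times until the minimum sits in
// data[0]. Note that j is not re-zeroed between the two phases, which may be
// why the author marks these n-element variants "not sure if working".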
- for(int i = 2; i <= p2; i *= 2) { - while((threadIdx.x + blockDim.x * j) < p2 / i) { - data[threadIdx.x + blockDim.x * j] = MIN(data[threadIdx.x + blockDim.x * j], data[(threadIdx.x + blockDim.x * j) + p2 / i]); - j++; - } - - __syncthreads(); - } -} - -static __device__ inline void maxOfData(double* data, int n) //cautious not sure if working -{ - __syncthreads(); - int p2 = 1; - - while(p2 * 2 < n) p2 *= 2; - - int j = 0; - - while((threadIdx.x + blockDim.x * j) < n - p2) { - data[threadIdx.x + blockDim.x * j] = MAX(data[threadIdx.x + blockDim.x * j], data[(threadIdx.x + blockDim.x * j) + p2]); - j++; - } - - __syncthreads(); - - for(int i = 2; i <= p2; i *= 2) { - while((threadIdx.x + blockDim.x * j) < p2 / i) { - data[threadIdx.x + blockDim.x * j] = MAX(data[threadIdx.x + blockDim.x * j], data[(threadIdx.x + blockDim.x * j) + p2 / i]); - j++; - } - - __syncthreads(); - } -} - -static __device__ inline void minOfData(float* data, int n) //cautious not sure if working -{ - __syncthreads(); - int p2 = 1; - - while(p2 * 2 < n) p2 *= 2; - - int j = 0; - - while((threadIdx.x + blockDim.x * j) < n - p2) { - data[threadIdx.x + blockDim.x * j] = MIN(data[threadIdx.x + blockDim.x * j], data[(threadIdx.x + blockDim.x * j) + p2]); - j++; - } - - __syncthreads(); - - for(int i = 2; i <= p2; i *= 2) { - while((threadIdx.x + blockDim.x * j) < p2 / i) { - data[threadIdx.x + blockDim.x * j] = MIN(data[threadIdx.x + blockDim.x * j], data[(threadIdx.x + blockDim.x * j) + p2 / i]); - j++; - } - - __syncthreads(); - } -} - -static __device__ inline void maxOfData(float* data, int n) //cautious not sure if working -{ - __syncthreads(); - int p2 = 1; - - while(p2 * 2 < n) p2 *= 2; - - int j = 0; - - while((threadIdx.x + blockDim.x * j) < n - p2) { - data[threadIdx.x + blockDim.x * j] = MAX(data[threadIdx.x + blockDim.x * j], data[(threadIdx.x + blockDim.x * j) + p2]); - j++; - } - - __syncthreads(); - - for(int i = 2; i <= p2; i *= 2) { - while((threadIdx.x + blockDim.x * j) < p2 / i) { - data[threadIdx.x + blockDim.x * j] = MAX(data[threadIdx.x + blockDim.x * j], data[(threadIdx.x + blockDim.x * j) + p2 / i]); - j++; - } - - __syncthreads(); - } -} - -#if X_PRECISION == 2 -static __device__ inline double tex1Dfetch_double(texture t, int i) -{ - int2 v = tex1Dfetch(t, i); - return __hiloint2double(v.y, v.x); -} - -static __device__ inline X_FLOAT4 tex1Dfetch_double(texture t, int i) -{ - int4 v = tex1Dfetch(t, 2 * i); - int4 u = tex1Dfetch(t, 2 * i + 1); - X_FLOAT4 w; - - w.x = __hiloint2double(v.y, v.x); - w.y = __hiloint2double(v.w, v.z); - w.z = __hiloint2double(u.y, u.x); - w.w = __hiloint2double(u.w, u.z); - return w; -} -#endif - -inline void BindXTypeTexture(cuda_shared_data* sdata) -{ -#ifdef CUDA_USE_TEXTURE - _x_type_tex.normalized = false; // access with normalized texture coordinates - _x_type_tex.filterMode = cudaFilterModePoint; // Point mode, so no - _x_type_tex.addressMode[0] = cudaAddressModeWrap; // wrap texture coordinates - const textureReference* x_type_texture_ptr = &MY_AP(x_type_tex); - -#if X_PRECISION == 1 - cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc(); - cudaBindTexture(0, x_type_texture_ptr, sdata->atom.x_type.dev_data, &channelDescXType, sdata->atom.nmax * sizeof(X_FLOAT4)); -#else - cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc(); - cudaBindTexture(0, x_type_texture_ptr, sdata->atom.x_type.dev_data, &channelDescXType, sdata->atom.nmax * 2 * sizeof(int4)); -#endif -#endif -} - -static __device__ inline X_FLOAT4 fetchXType(int i) -{ -#ifdef 
CUDA_USE_TEXTURE -#if X_PRECISION == 1 - return tex1Dfetch(_x_type_tex, i); -#else - return tex1Dfetch_double(_x_type_tex, i); -#endif -#else - return _x_type[i]; -#endif -} - -#if V_PRECISION == 2 -static __device__ inline double tex1Dfetch_double_v(texture t, int i) -{ - int2 v = tex1Dfetch(t, i); - return __hiloint2double(v.y, v.x); -} - -static __device__ inline V_FLOAT4 tex1Dfetch_double_v(texture t, int i) -{ - int4 v = tex1Dfetch(t, 2 * i); - int4 u = tex1Dfetch(t, 2 * i + 1); - V_FLOAT4 w; - - w.x = __hiloint2double(v.y, v.x); - w.y = __hiloint2double(v.w, v.z); - w.z = __hiloint2double(u.y, u.x); - w.w = __hiloint2double(u.w, u.z); - return w; -} -#endif - -inline void BindVRadiusTexture(cuda_shared_data* sdata) -{ -#ifdef CUDA_USE_TEXTURE - _v_radius_tex.normalized = false; // access with normalized texture coordinates - _v_radius_tex.filterMode = cudaFilterModePoint; // Point mode, so no - _v_radius_tex.addressMode[0] = cudaAddressModeWrap; // wrap texture coordinates - const textureReference* v_radius_texture_ptr = &MY_AP(v_radius_tex); - -#if V_PRECISION == 1 - cudaChannelFormatDesc channelDescVRadius = cudaCreateChannelDesc(); - cudaBindTexture(0, v_radius_texture_ptr, sdata->atom.v_radius.dev_data, &channelDescVRadius, sdata->atom.nmax * sizeof(X_FLOAT4)); -#else - cudaChannelFormatDesc channelDescVRadius = cudaCreateChannelDesc(); - cudaBindTexture(0, v_radius_texture_ptr, sdata->atom.v_radius.dev_data, &channelDescVRadius, sdata->atom.nmax * 2 * sizeof(int4)); -#endif -#endif -} - -static __device__ inline V_FLOAT4 fetchVRadius(int i) -{ -#ifdef CUDA_USE_TEXTURE -#if V_PRECISION == 1 - return tex1Dfetch(_v_radius_tex, i); -#else - return tex1Dfetch_double_v(_v_radius_tex, i); -#endif -#else - return _v_radius[i]; -#endif -} - -inline void BindOmegaRmassTexture(cuda_shared_data* sdata) -{ -#ifdef CUDA_USE_TEXTURE - _omega_rmass_tex.normalized = false; // access with normalized texture coordinates - _omega_rmass_tex.filterMode = cudaFilterModePoint; // Point mode, so no - _omega_rmass_tex.addressMode[0] = cudaAddressModeWrap; // wrap texture coordinates - const textureReference* omega_rmass_texture_ptr = &MY_AP(omega_rmass_tex); - -#if V_PRECISION == 1 - cudaChannelFormatDesc channelDescOmegaRmass = cudaCreateChannelDesc(); - cudaBindTexture(0, omega_rmass_texture_ptr, sdata->atom.omega_rmass.dev_data, &channelDescOmegaRmass, sdata->atom.nmax * sizeof(X_FLOAT4)); -#else - cudaChannelFormatDesc channelDescOmegaRmass = cudaCreateChannelDesc(); - cudaBindTexture(0, omega_rmass_texture_ptr, sdata->atom.omega_rmass.dev_data, &channelDescOmegaRmass, sdata->atom.nmax * 2 * sizeof(int4)); -#endif -#endif -} - -static __device__ inline V_FLOAT4 fetchOmegaRmass(int i) -{ -#ifdef CUDA_USE_TEXTURE -#if V_PRECISION == 1 - return tex1Dfetch(_omega_rmass_tex, i); -#else - return tex1Dfetch_double_v(_omega_rmass_tex, i); -#endif -#else - return _omega_rmass[i]; -#endif -} - -#if F_PRECISION == 2 -static __device__ inline double tex1Dfetch_double_f(texture t, int i) -{ - int2 v = tex1Dfetch(t, i); - return __hiloint2double(v.y, v.x); -} - -static __device__ inline F_FLOAT4 tex1Dfetch_double_f(texture t, int i) -{ - int4 v = tex1Dfetch(t, 2 * i); - int4 u = tex1Dfetch(t, 2 * i + 1); - F_FLOAT4 w; - - w.x = __hiloint2double(v.y, v.x); - w.y = __hiloint2double(v.w, v.z); - w.z = __hiloint2double(u.y, u.x); - w.w = __hiloint2double(u.w, u.z); - return w; -} -#endif - -inline void BindQTexture(cuda_shared_data* sdata) -{ -#ifdef CUDA_USE_TEXTURE - _q_tex.normalized = false; // access with 
normalized texture coordinates - _q_tex.filterMode = cudaFilterModePoint; // Point mode, so no - _q_tex.addressMode[0] = cudaAddressModeWrap; // wrap texture coordinates - const textureReference* q_texture_ptr = &MY_AP(q_tex); - -#if F_PRECISION == 1 - cudaChannelFormatDesc channelDescQ = cudaCreateChannelDesc(); - cudaBindTexture(0, q_texture_ptr, sdata->atom.q.dev_data, &channelDescQ, sdata->atom.nmax * sizeof(F_FLOAT)); -#else - cudaChannelFormatDesc channelDescQ = cudaCreateChannelDesc(); - cudaBindTexture(0, q_texture_ptr, sdata->atom.q.dev_data, &channelDescQ, sdata->atom.nmax * sizeof(int2)); -#endif -#endif -} - -static __device__ inline F_FLOAT fetchQ(int i) -{ -#ifdef CUDA_USE_TEXTURE -#if F_PRECISION == 1 - return tex1Dfetch(_q_tex, i); -#else - return tex1Dfetch_double_f(_q_tex, i); -#endif -#else - return _q[i]; -#endif -} - -#endif - -/* - -inline void BindPairCoeffTypeTexture(cuda_shared_data* sdata,coeff_tex) -{ - #ifdef CUDA_USE_TEXTURE - _coeff_tex.normalized = false; // access with normalized texture coordinates - _coeff_tex.filterMode = cudaFilterModePoint; // Point mode, so no - _coeff_tex.addressMode[0] = cudaAddressModeWrap; // wrap texture coordinates - const textureReference* coeff_texture_ptr; - cudaGetTextureReference(&coeff_texture_ptr, &MY_AP(coeff_tex)); - - #if F_PRECISION == 1 - cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc(); - cudaBindTexture(0,x_type_texture_ptr, sdata->atom.x_type.dev_data, &channelDescXType, sdata->atom.nmax*sizeof(X_FLOAT4)); - #else - cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc(); - cudaBindTexture(0,x_type_texture_ptr, sdata->atom.x_type.dev_data, &channelDescXType, sdata->atom.nmax*2*sizeof(int4)); - #endif - #endif -} - -static __device__ inline X_FLOAT4 fetchXType(int i) -{ - #ifdef CUDA_USE_TEXTURE - #if X_PRECISION == 1 - return tex1Dfetch(_x_type_tex,i); - #else - return tex1Dfetch_double(_x_type_tex,i); - #endif - #else - return _x_type[i]; - #endif -} -*/ -#define SBBITS 30 - -static inline __device__ int sbmask(int j) -{ - return j >> SBBITS & 3; -} - -static inline __device__ void minimum_image(X_FLOAT4 &delta) -{ - if(_triclinic == 0) { - if(_periodicity[0]) { - delta.x += delta.x < -X_F(0.5) * _prd[0] ? _prd[0] : - (delta.x > X_F(0.5) * _prd[0] ? -_prd[0] : X_F(0.0)); - } - - if(_periodicity[1]) { - delta.y += delta.y < -X_F(0.5) * _prd[1] ? _prd[1] : - (delta.y > X_F(0.5) * _prd[1] ? -_prd[1] : X_F(0.0)); - } - - if(_periodicity[2]) { - delta.z += delta.z < -X_F(0.5) * _prd[2] ? _prd[2] : - (delta.z > X_F(0.5) * _prd[2] ? -_prd[2] : X_F(0.0)); - } - - } else { - if(_periodicity[1]) { - delta.z += delta.z < -X_F(0.5) * _prd[2] ? _prd[2] : - (delta.z > X_F(0.5) * _prd[2] ? -_prd[2] : X_F(0.0)); - delta.y += delta.z < -X_F(0.5) * _prd[2] ? _h[3] : - (delta.z > X_F(0.5) * _prd[2] ? -_h[3] : X_F(0.0)); - delta.x += delta.z < -X_F(0.5) * _prd[2] ? _h[4] : - (delta.z > X_F(0.5) * _prd[2] ? -_h[4] : X_F(0.0)); - - } - - if(_periodicity[1]) { - delta.y += delta.y < -X_F(0.5) * _prd[1] ? _prd[1] : - (delta.y > X_F(0.5) * _prd[1] ? -_prd[1] : X_F(0.0)); - delta.x += delta.y < -X_F(0.5) * _prd[1] ? _h[5] : - (delta.y > X_F(0.5) * _prd[1] ? -_h[5] : X_F(0.0)); - - } - - if(_periodicity[0]) { - delta.x += delta.x < -X_F(0.5) * _prd[0] ? _prd[0] : - (delta.x > X_F(0.5) * _prd[0] ? 
-_prd[0] : X_F(0.0)); - } - } -} - -static inline __device__ void closest_image(X_FLOAT4 &x1, X_FLOAT4 &x2, X_FLOAT4 &ci) -{ - ci.x = x2.x - x1.x; - ci.y = x2.y - x1.y; - ci.z = x2.z - x1.z; - minimum_image(ci); - ci.x += x1.x; - ci.y += x1.y; - ci.z += x1.z; -} diff --git a/lib/cuda/cuda.cu b/lib/cuda/cuda.cu deleted file mode 100644 index 1fc4dc4a41..0000000000 --- a/lib/cuda/cuda.cu +++ /dev/null @@ -1,22 +0,0 @@ -#include "cuda_precision.h" -#include "cuda_shared.h" -#include "cuda_cu.h" - -void Cuda_Cuda_GetCompileSettings(cuda_shared_data* sdata) -{ - sdata->compile_settings.prec_glob=sizeof(CUDA_FLOAT)/4; - sdata->compile_settings.prec_x=sizeof(X_FLOAT)/4; - sdata->compile_settings.prec_v=sizeof(V_FLOAT)/4; - sdata->compile_settings.prec_f=sizeof(F_FLOAT)/4; - sdata->compile_settings.prec_pppm=sizeof(PPPM_FLOAT)/4; - sdata->compile_settings.prec_fft=sizeof(FFT_FLOAT)/4; - - #ifdef FFT_CUFFT - sdata->compile_settings.cufft=1; - #else - sdata->compile_settings.cufft=0; - #endif - - sdata->compile_settings.arch=CUDA_ARCH; - -} diff --git a/lib/cuda/cuda_common.h b/lib/cuda/cuda_common.h deleted file mode 100644 index d4687ebd06..0000000000 --- a/lib/cuda/cuda_common.h +++ /dev/null @@ -1,344 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - - Original Version: - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - See the README file in the top-level LAMMPS directory. - - ----------------------------------------------------------------------- - - USER-CUDA Package and associated modifications: - https://sourceforge.net/projects/lammpscuda/ - - Christian Trott, christian.trott@tu-ilmenau.de - Lars Winterfeld, lars.winterfeld@tu-ilmenau.de - Theoretical Physics II, University of Technology Ilmenau, Germany - - See the README file in the USER-CUDA directory. - - This software is distributed under the GNU General Public License. -------------------------------------------------------------------------- */ - -#ifndef _CUDA_COMMON_H_ -#define _CUDA_COMMON_H_ - -//#include "cutil.h" -#include "cuda_precision.h" -#include "cuda_wrapper_cu.h" - -#define CUDA_MAX_TYPES_PLUS_ONE 12 //for pair styles which use constant space for parameters, this needs to be one larger than the number of atom types -//this can not be arbitrarly large, since constant space is limited. 
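// Rough budget, for orientation: with CUDA_MAX_TYPES_PLUS_ONE = 12 one
// per-type-pair table has 12*12 = 144 entries, i.e. 576 B of __constant__
// memory in single precision and 1152 B in double; cuda_pair.cu keeps five
// such coefficient tables plus cutoff tables, and everything declared
// __constant__ in a compilation unit must fit the 64 KB constant-memory
// limit of the device.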
-//in principle one could alter potentials to use global memory for parameters, some du that already since the first examples I encountered had a high number (20+) of atom types -//Christian -#define CUDA_MAX_TYPES2 (CUDA_MAX_TYPES_PLUS_ONE * CUDA_MAX_TYPES_PLUS_ONE) -#define CUDA_MAX_NSPECIAL 25 - -// define some easy-to-use debug and emulation macros -#ifdef _DEBUG -#define MYDBG(a) a -#else -#define MYDBG(a) -#endif - -#if __DEVICE_EMULATION__ -#define MYEMU(a) a -#else -#define MYEMU(a) -#endif - -#define MYEMUDBG(a) MYEMU(MYDBG(a)) - -// Add Prefix (needed as workaround, same constant's names in different files causes conflict) -#define MY_ADD_PREFIX(prefix, var) prefix##_##var -#define MY_ADD_PREFIX2(prefix, var) MY_ADD_PREFIX(prefix, var) -#define MY_AP(var) MY_ADD_PREFIX2(MY_PREFIX, var) - -#define MY_VAR_TO_STR(var) #var -#define MY_VAR_TO_STR2(var) MY_VAR_TO_STR(var) -#define MY_CONST(var) (MY_VAR_TO_STR2(MY_PREFIX) "_" MY_VAR_TO_STR2(var)) - -#define CUDA_USE_TEXTURE -#define CUDA_USE_FLOAT4 - -//constants used by many classes - -//domain -#define _boxhi MY_AP(boxhi) -#define _boxlo MY_AP(boxlo) -#define _subhi MY_AP(subhi) -#define _sublo MY_AP(sublo) -#define _box_size MY_AP(box_size) -#define _prd MY_AP(prd) -#define _periodicity MY_AP(periodicity) -#define _triclinic MY_AP(triclinic) -#define _boxhi_lamda MY_AP(boxhi_lamda) -#define _boxlo_lamda MY_AP(boxlo_lamda) -#define _prd_lamda MY_AP(prd_lamda) -#define _h MY_AP(h) -#define _h_inv MY_AP(h_inv) -#define _h_rate MY_AP(h_rate) -__device__ __constant__ X_FLOAT _boxhi[3]; -__device__ __constant__ X_FLOAT _boxlo[3]; -__device__ __constant__ X_FLOAT _subhi[3]; -__device__ __constant__ X_FLOAT _sublo[3]; -__device__ __constant__ X_FLOAT _box_size[3]; -__device__ __constant__ X_FLOAT _prd[3]; -__device__ __constant__ int _periodicity[3]; -__device__ __constant__ int _triclinic; -__device__ __constant__ X_FLOAT _boxhi_lamda[3]; -__device__ __constant__ X_FLOAT _boxlo_lamda[3]; -__device__ __constant__ X_FLOAT _prd_lamda[3]; -__device__ __constant__ X_FLOAT _h[6]; -__device__ __constant__ X_FLOAT _h_inv[6]; -__device__ __constant__ V_FLOAT _h_rate[6]; - - -//atom properties -#define _x MY_AP(x) -#define _v MY_AP(v) -#define _f MY_AP(f) -#define _tag MY_AP(tag) -#define _type MY_AP(type) -#define _mask MY_AP(mask) -#define _image MY_AP(image) -#define _q MY_AP(q) -#define _mass MY_AP(mass) -#define _rmass MY_AP(rmass) -#define _rmass_flag MY_AP(rmass_flag) -#define _eatom MY_AP(eatom) -#define _vatom MY_AP(vatom) -#define _x_type MY_AP(x_type) -#define _radius MY_AP(radius) -#define _density MY_AP(density) -#define _omega MY_AP(omega) -#define _torque MY_AP(torque) -#define _special MY_AP(special) -#define _maxspecial MY_AP(maxspecial) -#define _nspecial MY_AP(nspecial) -#define _special_flag MY_AP(special_flag) -#define _molecule MY_AP(molecule) -#define _v_radius MY_AP(v_radius) -#define _omega_rmass MY_AP(omega_rmass) -#define _freeze_group_bit MY_AP(freeze_group_bit) -#define _map_array MY_AP(map_array) -__device__ __constant__ X_FLOAT* _x; //holds pointer to positions -__device__ __constant__ V_FLOAT* _v; -__device__ __constant__ F_FLOAT* _f; -__device__ __constant__ int* _tag; -__device__ __constant__ int* _type; -__device__ __constant__ int* _mask; -__device__ __constant__ int* _image; -__device__ __constant__ V_FLOAT* _mass; -__device__ __constant__ F_FLOAT* _q; -__device__ __constant__ V_FLOAT* _rmass; -__device__ __constant__ int _rmass_flag; -__device__ __constant__ ENERGY_FLOAT* _eatom; -__device__ __constant__ 
ENERGY_FLOAT* _vatom; -__device__ __constant__ X_FLOAT4* _x_type; //holds pointer to positions -__device__ __constant__ X_FLOAT* _radius; -__device__ __constant__ F_FLOAT* _density; -__device__ __constant__ V_FLOAT* _omega; -__device__ __constant__ F_FLOAT* _torque; -__device__ __constant__ int* _special; -__device__ __constant__ int _maxspecial; -__device__ __constant__ int* _nspecial; -__device__ __constant__ int _special_flag[4]; -__device__ __constant__ int* _molecule; -__device__ __constant__ V_FLOAT4* _v_radius; //holds pointer to positions -__device__ __constant__ V_FLOAT4* _omega_rmass; //holds pointer to positions -__device__ __constant__ int _freeze_group_bit; -__device__ __constant__ int* _map_array; - -#ifdef CUDA_USE_TEXTURE - - #define _x_tex MY_AP(x_tex) - #if X_PRECISION == 1 - texture _x_tex; - #else - texture _x_tex; - #endif - - #define _type_tex MY_AP(type_tex) - texture _type_tex; - - #define _x_type_tex MY_AP(x_type_tex) - #if X_PRECISION == 1 - texture _x_type_tex; - #else - texture _x_type_tex; - #endif - - #define _v_radius_tex MY_AP(v_radius_tex) - #if V_PRECISION == 1 - texture _v_radius_tex; - #else - texture _v_radius_tex; - #endif - - #define _omega_rmass_tex MY_AP(omega_rmass_tex) - #if V_PRECISION == 1 - texture _omega_rmass_tex; - #else - texture _omega_rmass_tex; - #endif - - #define _q_tex MY_AP(q_tex) - #if F_PRECISION == 1 - texture _q_tex; - #else - texture _q_tex; - #endif - -#endif - -//neighbor -#ifdef IncludeCommonNeigh -#define _inum MY_AP(inum) -#define _inum_border MY_AP(inum_border) -#define _ilist MY_AP(ilist) -#define _ilist_border MY_AP(ilist_border) -#define _numneigh MY_AP(numneigh) -#define _numneigh_border MY_AP(numneigh_border) -#define _numneigh_inner MY_AP(numneigh_inner) -#define _firstneigh MY_AP(firstneigh) -#define _neighbors MY_AP(neighbors) -#define _neighbors_border MY_AP(neighbors_border) -#define _neighbors_inner MY_AP(neighbors_inner) -#define _reneigh_flag MY_AP(reneigh_flag) -#define _triggerneighsq MY_AP(triggerneighsq) -#define _xhold MY_AP(xhold) -#define _maxhold MY_AP(maxhold) -#define _dist_check MY_AP(dist_check) -#define _neighbor_maxlocal MY_AP(neighbor_maxlocal) -#define _maxneighbors MY_AP(maxneighbors) -#define _overlap_comm MY_AP(overlap_comm) -__device__ __constant__ int _inum; -__device__ __constant__ int* _inum_border; -__device__ __constant__ int* _ilist; -__device__ __constant__ int* _ilist_border; -__device__ __constant__ int* _numneigh; -__device__ __constant__ int* _numneigh_border; -__device__ __constant__ int* _numneigh_inner; -__device__ __constant__ int** _firstneigh; -__device__ __constant__ int* _neighbors; -__device__ __constant__ int* _neighbors_border; -__device__ __constant__ int* _neighbors_inner; -__device__ __constant__ int* _reneigh_flag; -__device__ __constant__ X_FLOAT _triggerneighsq; -__device__ __constant__ X_FLOAT* _xhold; //holds pointer to positions -__device__ __constant__ int _maxhold; -__device__ __constant__ int _dist_check; -__device__ __constant__ int _neighbor_maxlocal; -__device__ __constant__ int _maxneighbors; -__device__ __constant__ int _overlap_comm; -#endif - -//system properties -#define _nall MY_AP(nall) -#define _nghost MY_AP(nghost) -#define _nlocal MY_AP(nlocal) -#define _nmax MY_AP(nmax) -#define _cuda_ntypes MY_AP(cuda_ntypes) -#define _dtf MY_AP(dtf) -#define _dtv MY_AP(dtv) -#define _factor MY_AP(factor) -#define _virial MY_AP(virial) -#define _eng_vdwl MY_AP(eng_vdwl) -#define _eng_coul MY_AP(eng_coul) -#define _molecular MY_AP(molecular) -__device__ 
__constant__ unsigned _nall; -__device__ __constant__ unsigned _nghost; -__device__ __constant__ unsigned _nlocal; -__device__ __constant__ unsigned _nmax; -__device__ __constant__ unsigned _cuda_ntypes; -__device__ __constant__ V_FLOAT _dtf; -__device__ __constant__ X_FLOAT _dtv; -__device__ __constant__ V_FLOAT _factor; -__device__ __constant__ ENERGY_FLOAT* _virial; -__device__ __constant__ ENERGY_FLOAT* _eng_vdwl; -__device__ __constant__ ENERGY_FLOAT* _eng_coul; -__device__ __constant__ int _molecular; - -//other general constants -#define _buffer MY_AP(buffer) -#define _flag MY_AP(flag) -#define _debugdata MY_AP(debugdata) -__device__ __constant__ void* _buffer; -__device__ __constant__ int* _flag; -__device__ __constant__ int* _debugdata; - -// pointers to data fields on GPU are hold in constant space -// -> reduces register usage and number of parameters for kernelcalls -// will be variables of file scope in cuda files - - - - -// maybe used to output cudaError_t -#define MY_OUTPUT_RESULT(result) \ - switch(result) \ - { \ - case cudaSuccess: printf(" => cudaSuccess\n"); break; \ - case cudaErrorInvalidValue: printf(" => cudaErrorInvalidValue\n"); break; \ - case cudaErrorInvalidSymbol: printf(" => cudaErrorInvalidSymbol\n"); break; \ - case cudaErrorInvalidDevicePointer: printf(" => cudaErrorInvalidDevicePointer\n"); break; \ - case cudaErrorInvalidMemcpyDirection: printf(" => cudaErrorInvalidMemcpyDirection\n"); break; \ - default: printf(" => unknown\n"); break; \ - } - -#ifdef _DEBUG -# define CUT_CHECK_ERROR(errorMessage) { \ - cudaError_t err = cudaGetLastError(); \ - if( cudaSuccess != err) { \ - fprintf(stderr, "Cuda error: %s in file '%s' in line %i : %s.\n", \ - errorMessage, __FILE__, __LINE__, cudaGetErrorString( err) );\ - exit(EXIT_FAILURE); \ - } \ - err = cudaThreadSynchronize(); \ - if( cudaSuccess != err) { \ - fprintf(stderr, "Cuda error: %s in file '%s' in line %i : %s.\n", \ - errorMessage, __FILE__, __LINE__, cudaGetErrorString( err) );\ - exit(EXIT_FAILURE); \ - } \ - } -#else -# define CUT_CHECK_ERROR(errorMessage) { \ - cudaError_t err = cudaGetLastError(); \ - if( cudaSuccess != err) { \ - fprintf(stderr, "Cuda error: %s in file '%s' in line %i : %s.\n", \ - errorMessage, __FILE__, __LINE__, cudaGetErrorString( err) );\ - exit(EXIT_FAILURE); \ - } \ - } -#endif - -# define CUDA_SAFE_CALL_NO_SYNC( call) { \ - cudaError err = call; \ - if( cudaSuccess != err) { \ - fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n", \ - __FILE__, __LINE__, cudaGetErrorString( err) ); \ - exit(EXIT_FAILURE); \ - } } - -# define CUDA_SAFE_CALL( call) CUDA_SAFE_CALL_NO_SYNC(call); - -#define X_MASK 1 -#define V_MASK 2 -#define F_MASK 4 -#define TAG_MASK 8 -#define TYPE_MASK 16 -#define MASK_MASK 32 -#define IMAGE_MASK 64 -#define Q_MASK 128 -#define MOLECULE_MASK 256 -#define RMASS_MASK 512 -#define RADIUS_MASK 1024 -#define DENSITY_MASK 2048 -#define OMEGA_MASK 4096 -#define TORQUE_MASK 8192 - - - -#endif // #ifdef _CUDA_COMMON_H_ diff --git a/lib/cuda/cuda_cu.h b/lib/cuda/cuda_cu.h deleted file mode 100644 index 48498b8d0f..0000000000 --- a/lib/cuda/cuda_cu.h +++ /dev/null @@ -1 +0,0 @@ -extern "C" void Cuda_Cuda_GetCompileSettings(cuda_shared_data* sdata); diff --git a/lib/cuda/cuda_data.cu b/lib/cuda/cuda_data.cu deleted file mode 100644 index 327cbd9014..0000000000 --- a/lib/cuda/cuda_data.cu +++ /dev/null @@ -1,168 +0,0 @@ -enum copy_mode {x, xx, xy, yx, xyz, xzy}; // yxz, yzx, zxy, zyx not yet implemented since they were not needed yet - -#include 
"cuda_data_cu.h" -#include "cuda_wrapper_cu.h" -#include "cuda_data_kernel.cu" -#include - -void CudaData_Upload_DoubleFloat(void* host_data,void* dev_data, unsigned* n,copy_mode mode,void* buffer) -{ - int size=n[0]; - if(n[1]>0) size*=n[1]; - if(n[2]>0) size*=n[2]; - - dim3 threads; threads.x=1; threads.y=1; threads.z=1; - dim3 grid; grid.x=1; grid.y=1; grid.z=1; - - if(size<=128*30) - threads.x=32; - else if(size<=256*30) - threads.x=64; - else if(size<=512*30) - threads.x=128; - else - threads.x=256; - - grid.x=((size-1)+threads.x)/threads.x; - if(grid.x>32000) - grid.x=32000; - while(grid.x*grid.y*threads.x>>((double*)buffer,(float*)dev_data,n[0],n[1],n[2],mode); - cudaThreadSynchronize(); - CudaWrapper_DownloadCudaData(debugdata, dev_data, size/2); - double sum=0; - printf("debugdata: "); - for(int i=0;i0) size*=n[1]; - if(n[2]>0) size*=n[2]; - - dim3 threads; threads.x=1; threads.y=1; threads.z=1; - dim3 grid; grid.x=1; grid.y=1; grid.z=1; - - if(size<=128*30) - threads.x=32; - else if(size<=256*30) - threads.x=64; - else if(size<=512*30) - threads.x=128; - else - threads.x=256; - - grid.x=((size-1)+threads.x)/threads.x; - if(grid.x>32000) - grid.x=32000; - while(grid.x*grid.y*threads.x>>((double*)buffer,(double*)dev_data,n[0],n[1],n[2],mode); - cudaThreadSynchronize(); -} - -void CudaData_Upload_FloatDouble(void* host_data,void* dev_data, unsigned* n,copy_mode mode,void* buffer) -{ - int size=n[0]; - if(n[1]>0) size*=n[1]; - if(n[2]>0) size*=n[2]; - - dim3 threads; threads.x=1; threads.y=1; threads.z=1; - dim3 grid; grid.x=1; grid.y=1; grid.z=1; - - if(size<=128*30) - threads.x=32; - else if(size<=256*30) - threads.x=64; - else if(size<=512*30) - threads.x=128; - else - threads.x=256; - - grid.x=((size-1)+threads.x)/threads.x; - if(grid.x>32000) - grid.x=32000; - while(grid.x*grid.y*threads.x>>((float*)buffer,(double*)dev_data,n[0],n[1],n[2],mode); - cudaThreadSynchronize(); -} - -void CudaData_Upload_FloatFloat(void* host_data,void* dev_data, unsigned* n,copy_mode mode,void* buffer) -{ - int size=n[0]; - if(n[1]>0) size*=n[1]; - if(n[2]>0) size*=n[2]; - - dim3 threads; threads.x=1; threads.y=1; threads.z=1; - dim3 grid; grid.x=1; grid.y=1; grid.z=1; - - if(size<=128*30) - threads.x=32; - else if(size<=256*30) - threads.x=64; - else if(size<=512*30) - threads.x=128; - else - threads.x=256; - - grid.x=((size-1)+threads.x)/threads.x; - if(grid.x>32000) - grid.x=32000; - while(grid.x*grid.y*threads.x>>((float*)buffer,(float*)dev_data,n[0],n[1],n[2],mode); - cudaThreadSynchronize(); -} - -void CudaData_Upload_IntInt(void* host_data,void* dev_data, unsigned* n,copy_mode mode,void* buffer) -{ - int size=n[0]; - if(n[1]>0) size*=n[1]; - if(n[2]>0) size*=n[2]; - - dim3 threads; threads.x=1; threads.y=1; threads.z=1; - dim3 grid; grid.x=1; grid.y=1; grid.z=1; - - if(size<=128*30) - threads.x=32; - else if(size<=256*30) - threads.x=64; - else if(size<=512*30) - threads.x=128; - else - threads.x=256; - - grid.x=((size-1)+threads.x)/threads.x; - if(grid.x>32000) - grid.x=32000; - while(grid.x*grid.y*threads.x>>((int*)buffer,(int*)dev_data,n[0],n[1],n[2],mode); - cudaThreadSynchronize(); -} - -void CudaData_Download(void* host_data,void* dev_data,int host_size, int dev_size, unsigned* n,copy_mode mode,void* buffer) -{ -} diff --git a/lib/cuda/cuda_data_cu.h b/lib/cuda/cuda_data_cu.h deleted file mode 100644 index e323b30429..0000000000 --- a/lib/cuda/cuda_data_cu.h +++ /dev/null @@ -1,13 +0,0 @@ -#ifndef CUDA_DATA_CU_H_ -#define CUDA_DATA_CU_H_ - -extern "C" void 
CudaData_Upload_DoubleFloat(void* host_data,void* dev_data, unsigned* n,copy_mode mode,void* buffer); -extern "C" void CudaData_Upload_DoubleDouble(void* host_data,void* dev_data, unsigned* n,copy_mode mode,void* buffer); -extern "C" void CudaData_Upload_FloatDouble(void* host_data,void* dev_data, unsigned* n,copy_mode mode,void* buffer); -extern "C" void CudaData_Upload_FloatFloat(void* host_data,void* dev_data, unsigned* n,copy_mode mode,void* buffer); -extern "C" void CudaData_Upload_IntInt(void* host_data,void* dev_data, unsigned* n,copy_mode mode,void* buffer); - -extern "C" void CudaData_Download(void* host_data,void* dev_data,int host_size, int dev_size, unsigned* n,copy_mode mode,void* buffer); - - -#endif /*CUDA_DATA_CU_H_*/ diff --git a/lib/cuda/cuda_data_kernel.cu b/lib/cuda/cuda_data_kernel.cu deleted file mode 100644 index 831b7b08bb..0000000000 --- a/lib/cuda/cuda_data_kernel.cu +++ /dev/null @@ -1,156 +0,0 @@ -__global__ void CudaData_Upload_Kernel_DoubleFloat(double* buffer,float* dev_data, - unsigned nx,unsigned ny,unsigned nz,copy_mode mode) -{ - if(mode==x) mode=xx; - unsigned length=nx; - if(ny>0) length*=ny; - if(nz>0) length*=nz; - unsigned i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x,j,k,l; - - - if(i>=length) return; - switch(mode) - { - case xx: - { - dev_data[i]=buffer[i]; - } - case xy: - { - dev_data[i]=buffer[i]; - } - case yx: - { - j=i/ny; - k=i%ny; - dev_data[k*nx+j]=buffer[j*ny+k]; - } - case xyz: - { - dev_data[i]=buffer[i]; - } - case xzy: - { - j=i/(ny*nz); - k=(i%(ny*nz))/nz; - l=i%nz; - dev_data[j*ny*nz+l*ny+k]=buffer[j*ny*nz+k*nz+l]; - } - } -} - -__global__ void CudaData_Upload_Kernel_DoubleDouble(double* buffer,double* dev_data, - unsigned nx,unsigned ny,unsigned nz,copy_mode mode) -{ - if(mode==x) mode=xx; - unsigned length=nx; - if(ny>0) length*=ny; - if(nz>0) length*=nz; - unsigned i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x,j,k,l; - if(i>=length) return; - switch(mode) - { - case xx: - dev_data[i]=buffer[i]; - case xy: - dev_data[i]=buffer[i]; - case yx: - j=i/ny; - k=i%ny; - dev_data[k*nx+j]=buffer[j*ny+k]; - case xyz: - dev_data[i]=buffer[i]; - case xzy: - j=i/(ny*nz); - k=(i%(ny*nz))/nz; - l=i%nz; - dev_data[j*ny*nz+l*ny+k]=buffer[j*ny*nz+k*nz+l]; - } -} - -__global__ void CudaData_Upload_Kernel_FloatDouble(float* buffer,double* dev_data, - unsigned nx,unsigned ny,unsigned nz,copy_mode mode) -{ - if(mode==x) mode=xx; - unsigned length=nx; - if(ny>0) length*=ny; - if(nz>0) length*=nz; - unsigned i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x,j,k,l; - if(i>=length) return; - switch(mode) - { - case xx: - dev_data[i]=buffer[i]; - case xy: - dev_data[i]=buffer[i]; - case yx: - j=i/ny; - k=i%ny; - dev_data[k*nx+j]=buffer[j*ny+k]; - case xyz: - dev_data[i]=buffer[i]; - case xzy: - j=i/(ny*nz); - k=(i%(ny*nz))/nz; - l=i%nz; - dev_data[j*ny*nz+l*ny+k]=buffer[j*ny*nz+k*nz+l]; - } -} - -__global__ void CudaData_Upload_Kernel_FloatFloat(float* buffer,float* dev_data, - unsigned nx,unsigned ny,unsigned nz,copy_mode mode) -{ - if(mode==x) mode=xx; - unsigned length=nx; - if(ny>0) length*=ny; - if(nz>0) length*=nz; - unsigned i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x,j,k,l; - if(i>=length) return; - switch(mode) - { - case xx: - dev_data[i]=buffer[i]; - case xy: - dev_data[i]=buffer[i]; - case yx: - j=i/ny; - k=i%ny; - dev_data[k*nx+j]=buffer[j*ny+k]; - case xyz: - dev_data[i]=buffer[i]; - case xzy: - j=i/(ny*nz); - k=(i%(ny*nz))/nz; - l=i%nz; - dev_data[j*ny*nz+l*ny+k]=buffer[j*ny*nz+k*nz+l]; - } 
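// Index algebra of the copy modes (shared by all five upload kernels):
//   xx / xy / xyz : straight copy, dev_data[i] = buffer[i]
//   yx            : transpose of a row-major nx x ny array,
//                   element (j,k) moves to (k,j): dev_data[k*nx+j] = buffer[j*ny+k]
//   xzy           : swap of the two minor axes of an nx x ny x nz array,
//                   element (j,k,l) moves to (j,l,k)
// The case labels carry no break, so control falls through; the bodies read
// as if each mode were meant to be handled exclusively.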
-} - -__global__ void CudaData_Upload_Kernel_IntInt(int* buffer,int* dev_data, - unsigned nx,unsigned ny,unsigned nz,copy_mode mode) -{ - if(mode==x) mode=xx; - unsigned length=nx; - if(ny>0) length*=ny; - if(nz>0) length*=nz; - unsigned i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x,j,k,l; - if(i>=length) return; - switch(mode) - { - case xx: - dev_data[i]=buffer[i]; - case xy: - dev_data[i]=buffer[i]; - case yx: - j=i/ny; - k=i%ny; - dev_data[k*nx+j]=buffer[j*ny+k]; - case xyz: - dev_data[i]=buffer[i]; - case xzy: - j=i/(ny*nz); - k=(i%(ny*nz))/nz; - l=i%nz; - dev_data[j*ny*nz+l*ny+k]=buffer[j*ny*nz+k*nz+l]; - } -} diff --git a/lib/cuda/cuda_kernel.cu b/lib/cuda/cuda_kernel.cu deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/lib/cuda/cuda_pair.cu b/lib/cuda/cuda_pair.cu deleted file mode 100644 index 9f9900a2d8..0000000000 --- a/lib/cuda/cuda_pair.cu +++ /dev/null @@ -1,1015 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - - Original Version: - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - See the README file in the top-level LAMMPS directory. - - ----------------------------------------------------------------------- - - USER-CUDA Package and associated modifications: - https://sourceforge.net/projects/lammpscuda/ - - Christian Trott, christian.trott@tu-ilmenau.de - Lars Winterfeld, lars.winterfeld@tu-ilmenau.de - Theoretical Physics II, University of Technology Ilmenau, Germany - - See the README file in the USER-CUDA directory. - - This software is distributed under the GNU General Public License. -------------------------------------------------------------------------- */ - -enum PAIR_FORCES {PAIR_NONE, PAIR_BORN, PAIR_BUCK, PAIR_CG_CMM, PAIR_LJ_CHARMM, PAIR_LJ_CLASS2, PAIR_LJ_CUT, PAIR_LJ_EXPAND, PAIR_LJ_GROMACS, PAIR_LJ_SMOOTH, PAIR_LJ96_CUT, PAIR_MORSE, PAIR_MORSE_R6}; -enum COUL_FORCES {COUL_NONE, COUL_CHARMM, COUL_CHARMM_IMPLICIT, COUL_CUT, COUL_LONG, COUL_DEBYE, COUL_GROMACS, COUL_SPECIAL}; -#define DATA_NONE 0 -#define DATA_V 1 -#define DATA_TAG 2 -#define DATA_RMASS 4 -#define DATA_MASS 8 -#define DATA_TORQUE 16 -#define DATA_OMEGA 32 -#define DATA_RADIUS 64 -#define DATA_DENSITY 128 -#define DATA_MASK 256 -#define DATA_V_RADIUS 512 -#define DATA_OMEGA_RMASS 1024 - -#define NEIGHMASK 0x3FFFFFFF - -#define MY_PREFIX cuda_pair -#define IncludeCommonNeigh -#include "cuda_shared.h" -#include "cuda_common.h" -#include "cuda_wrapper_cu.h" -#include "crm_cuda_utils.cu" - -//constants used by multiple forces - -//general -#define _cutsq MY_AP(cutsq) -#define _offset MY_AP(offset) -#define _special_lj MY_AP(special_lj) -#define _special_coul MY_AP(special_coul) -#define _cutsq_global MY_AP(cutsq_global) -#define _collect_forces_later MY_AP(collect_forces_later) - -__device__ __constant__ X_FLOAT _cutsq[CUDA_MAX_TYPES2]; -__device__ __constant__ ENERGY_FLOAT _offset[CUDA_MAX_TYPES2]; -__device__ __constant__ F_FLOAT _special_lj[4]; -__device__ __constant__ F_FLOAT _special_coul[4]; -__device__ __constant__ X_FLOAT _cutsq_global; -__device__ __constant__ int _collect_forces_later; - -__device__ __constant__ F_FLOAT MY_AP(coeff1)[CUDA_MAX_TYPES2]; //pair force coefficients in case ntypes < CUDA_MAX_TYPES (coeffs fit into constant space) -__device__ __constant__ F_FLOAT MY_AP(coeff2)[CUDA_MAX_TYPES2]; -__device__ __constant__ F_FLOAT MY_AP(coeff3)[CUDA_MAX_TYPES2]; -__device__ __constant__ F_FLOAT 
MY_AP(coeff4)[CUDA_MAX_TYPES2]; -__device__ __constant__ F_FLOAT MY_AP(coeff5)[CUDA_MAX_TYPES2]; - - -__device__ __constant__ F_FLOAT* MY_AP(coeff1_gm); //pair force coefficients in case ntypes > CUDA_MAX_TYPES (coeffs do not fit into constant space) -__device__ __constant__ F_FLOAT* MY_AP(coeff2_gm); -__device__ __constant__ F_FLOAT* MY_AP(coeff3_gm); -__device__ __constant__ F_FLOAT* MY_AP(coeff4_gm); -__device__ __constant__ F_FLOAT* MY_AP(coeff5_gm); -__device__ __constant__ F_FLOAT* MY_AP(coeff6_gm); -__device__ __constant__ F_FLOAT* MY_AP(coeff7_gm); -__device__ __constant__ F_FLOAT* MY_AP(coeff8_gm); -__device__ __constant__ F_FLOAT* MY_AP(coeff9_gm); -__device__ __constant__ F_FLOAT* MY_AP(coeff10_gm); - -#define _coeff1_gm_tex MY_AP(coeff1_gm_tex) -#if F_PRECISION == 1 -texture _coeff1_gm_tex; -#else -texture _coeff1_gm_tex; -#endif - -#define _coeff2_gm_tex MY_AP(coeff2_gm_tex) -#if F_PRECISION == 1 -texture _coeff2_gm_tex; -#else -texture _coeff2_gm_tex; -#endif - -#define _coeff3_gm_tex MY_AP(coeff3_gm_tex) -#if F_PRECISION == 1 -texture _coeff3_gm_tex; -#else -texture _coeff3_gm_tex; -#endif - -#define _coeff4_gm_tex MY_AP(coeff4_gm_tex) -#if F_PRECISION == 1 -texture _coeff4_gm_tex; -#else -texture _coeff4_gm_tex; -#endif - -#define _coeff5_gm_tex MY_AP(coeff5_gm_tex) -#if F_PRECISION == 1 -texture _coeff5_gm_tex; -#else -texture _coeff5_gm_tex; -#endif - -#define _coeff6_gm_tex MY_AP(coeff6_gm_tex) -#if F_PRECISION == 1 -texture _coeff6_gm_tex; -#else -texture _coeff6_gm_tex; -#endif - -#define _coeff7_gm_tex MY_AP(coeff7_gm_tex) -#if F_PRECISION == 1 -texture _coeff7_gm_tex; -#else -texture _coeff7_gm_tex; -#endif - -#define _coeff8_gm_tex MY_AP(coeff8_gm_tex) -#if F_PRECISION == 1 -texture _coeff8_gm_tex; -#else -texture _coeff8_gm_tex; -#endif - -#define _coeff9_gm_tex MY_AP(coeff9_gm_tex) -#if F_PRECISION == 1 -texture _coeff9_gm_tex; -#else -texture _coeff9_gm_tex; -#endif - -#define _coeff10_gm_tex MY_AP(coeff10_gm_tex) -#if F_PRECISION == 1 -texture _coeff10_gm_tex; -#else -texture _coeff10_gm_tex; -#endif - -//if more than 5 coefficients are needed for a pair potential add them here - - -//coulomb -#define _cut_coulsq MY_AP(cut_coulsq) -#define _cut_coulsq_global MY_AP(cut_coulsq_global) -#define _g_ewald MY_AP(g_ewald) -#define _qqrd2e MY_AP(qqrd2e) -#define _kappa MY_AP(kappa) -__device__ __constant__ X_FLOAT _cut_coulsq[CUDA_MAX_TYPES2]; -__device__ __constant__ X_FLOAT _cut_coulsq_global; -__device__ __constant__ F_FLOAT _g_ewald; -__device__ __constant__ F_FLOAT _qqrd2e; -__device__ __constant__ F_FLOAT _kappa; - -//inner cutoff -#define _cut_innersq MY_AP(cut_innersq) -#define _cut_innersq_global MY_AP(cut_innersq_global) -__device__ __constant__ X_FLOAT _cut_innersq[CUDA_MAX_TYPES2]; -__device__ __constant__ X_FLOAT _cut_innersq_global; - - -template -__global__ void Pair_Kernel_TpA(int eflag, int vflag, int eflag_atom, int vflag_atom); - -template -__global__ void Pair_Kernel_BpA(int eflag, int vflag, int eflag_atom, int vflag_atom); - -template -__global__ void Pair_Kernel_TpA_opt(int eflag, int vflag, int eflag_atom, int vflag_atom, int comm_phase); - -template -__global__ void Pair_Kernel_BpA_opt(int eflag, int vflag, int eflag_atom, int vflag_atom, int comm_phase); - -#include -#include "cuda_pair_cu.h" -#include "cuda_pair_virial_kernel_nc.cu" - -//Functions which are shared by pair styles - -//Update Buffersize -void Cuda_UpdateBuffer(cuda_shared_data* sdata, int size) -{ - CUT_CHECK_ERROR("Cuda_Pair_UpdateBuffer_AllStyles: before updateBuffer failed"); 
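// Grow-only reallocation: the shared buffer is replaced only when the
// requested size exceeds the current capacity, and buffer_new is bumped so
// dependent code knows the device pointer has changed; either way the
// current pointer is re-published to this file's __constant__ slot via the
// cudaMemcpyToSymbol below.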
- - if(sdata->buffersize < size) { - MYDBG(printf("Resizing Buffer at %p with %i kB to\n", sdata->buffer, sdata->buffersize);) - CudaWrapper_FreeCudaData(sdata->buffer, sdata->buffersize); - sdata->buffer = CudaWrapper_AllocCudaData(size); - sdata->buffersize = size; - sdata->buffer_new++; - MYDBG(printf("New buffer at %p with %i kB\n", sdata->buffer, sdata->buffersize);) - } - - cudaMemcpyToSymbol(MY_AP(buffer), & sdata->buffer, sizeof(int*)); - CUT_CHECK_ERROR("Cuda_Pair_UpdateBuffer_AllStyles failed"); -} - -void Cuda_Pair_UpdateNeighbor_AllStyles(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist) -{ - //Neighbor - cudaMemcpyToSymbol(MY_AP(neighbor_maxlocal) , & sneighlist->firstneigh.dim[0] , sizeof(unsigned)); - cudaMemcpyToSymbol(MY_AP(firstneigh) , & sneighlist->firstneigh.dev_data, sizeof(int*)); - cudaMemcpyToSymbol(MY_AP(ilist) , & sneighlist->ilist .dev_data, sizeof(int*)); - cudaMemcpyToSymbol(MY_AP(inum) , & sneighlist->inum , sizeof(int)); - cudaMemcpyToSymbol(MY_AP(numneigh) , & sneighlist->numneigh .dev_data, sizeof(int*)); - cudaMemcpyToSymbol(MY_AP(neighbors) , & sneighlist->neighbors .dev_data, sizeof(int*)); - cudaMemcpyToSymbol(MY_AP(maxneighbors) , & sneighlist->maxneighbors , sizeof(int)); - cudaMemcpyToSymbol(MY_AP(overlap_comm) , & sdata->overlap_comm, sizeof(int)); - - if(sdata->overlap_comm) { - cudaMemcpyToSymbol(MY_AP(numneigh_border) , & sneighlist->numneigh_border .dev_data, sizeof(int*)); - cudaMemcpyToSymbol(MY_AP(numneigh_inner) , & sneighlist->numneigh_inner .dev_data, sizeof(int*)); - cudaMemcpyToSymbol(MY_AP(neighbors_border) , & sneighlist->neighbors_border.dev_data, sizeof(int*)); - cudaMemcpyToSymbol(MY_AP(neighbors_inner) , & sneighlist->neighbors_inner .dev_data, sizeof(int*)); - cudaMemcpyToSymbol(MY_AP(ilist_border) , & sneighlist->ilist_border .dev_data, sizeof(int*)); - cudaMemcpyToSymbol(MY_AP(inum_border) , & sneighlist->inum_border .dev_data, sizeof(int*)); - } - -} -//Update constants after nmax change which are generally needed by all pair styles -void Cuda_Pair_UpdateNmax_AllStyles(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist) -{ - CUT_CHECK_ERROR("Cuda_Pair_UpdateNmax_AllStyles: Begin"); - - //System - cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int)); - cudaMemcpyToSymbol(MY_AP(nall) , & sdata->atom.nall , sizeof(int)); - cudaMemcpyToSymbol(MY_AP(nmax) , & sdata->atom.nmax , sizeof(int)); - - //Atom - cudaMemcpyToSymbol(MY_AP(x) , & sdata->atom.x .dev_data, sizeof(X_FLOAT*)); - cudaMemcpyToSymbol(MY_AP(x_type) , & sdata->atom.x_type .dev_data, sizeof(X_FLOAT4*)); - cudaMemcpyToSymbol(MY_AP(f) , & sdata->atom.f .dev_data, sizeof(F_FLOAT*)); - cudaMemcpyToSymbol(MY_AP(type) , & sdata->atom.type .dev_data, sizeof(int*)); - cudaMemcpyToSymbol(MY_AP(q) , & sdata->atom.q .dev_data, sizeof(F_FLOAT*)); - cudaMemcpyToSymbol(MY_AP(tag) , & sdata->atom.tag .dev_data, sizeof(int*)); - cudaMemcpyToSymbol(MY_AP(eatom) , & sdata->atom.eatom .dev_data, sizeof(ENERGY_FLOAT*)); - cudaMemcpyToSymbol(MY_AP(vatom) , & sdata->atom.vatom .dev_data, sizeof(ENERGY_FLOAT*)); - - - //Other - cudaMemcpyToSymbol(MY_AP(debugdata) , & sdata->debugdata , sizeof(int*)); - CUT_CHECK_ERROR("Cuda_Pair_UpdateNmax_AllStyles: End"); -} - -//Initialisation of GPU Constants which rarely change -void Cuda_Pair_Init_AllStyles(cuda_shared_data* sdata, int ncoeff, bool need_q = false, bool use_global_params = false, bool need_innercut = false, bool need_cut = true) -{ - unsigned cuda_ntypes = sdata->atom.ntypes + 1; - unsigned 
cuda_ntypes2 = cuda_ntypes * cuda_ntypes; - unsigned n = sizeof(F_FLOAT) * cuda_ntypes2; - unsigned nx = sizeof(X_FLOAT) * cuda_ntypes2; - - //check if enough constant memory is available - if((cuda_ntypes2 > CUDA_MAX_TYPES2) && !use_global_params) - printf("# CUDA: Cuda_Pair_Init: you need %u types. this is more than %u " - "(assumed at compile time). re-compile with -DCUDA_MAX_TYPES_PLUS_ONE=32 " - "or ajust this in cuda_common.h\n", cuda_ntypes, CUDA_MAX_TYPES_PLUS_ONE - 1); - - if((cuda_ntypes2 > CUDA_MAX_TYPES2) && !use_global_params) - exit(0); - - //type conversion of cutoffs and parameters - if(need_cut) { - X_FLOAT cutsq[cuda_ntypes2]; - - for(int i = 1; i <= sdata->atom.ntypes; ++i) { - for(int j = 1; j <= sdata->atom.ntypes; ++j) { - cutsq[i * cuda_ntypes + j] = (X_FLOAT)(sdata->pair.cut_global * sdata->pair.cut_global); - } - } - - int cutsqdiffer = 0; - X_FLOAT cutsq_global; - cutsq_global = (X_FLOAT)(sdata->pair.cut_global * sdata->pair.cut_global); - - if(sdata->pair.cut) { - for(int i = 1; i <= sdata->atom.ntypes; ++i) { - for(int j = i; j <= sdata->atom.ntypes; ++j) { - if(sdata->pair.cut[i][j] > 1e-6) { - cutsq[i * cuda_ntypes + j] = (X_FLOAT)(sdata->pair.cut[i][j] * sdata->pair.cut[i][j]); - cutsq[j * cuda_ntypes + i] = (X_FLOAT)(sdata->pair.cut[i][j] * sdata->pair.cut[i][j]); - } - - if(i == 1 && j == 1) cutsq_global = cutsq[i * cuda_ntypes + j]; - - if((cutsq_global - cutsq[i * cuda_ntypes + j]) * (cutsq_global - cutsq[i * cuda_ntypes + j]) > 1e-6) - cutsqdiffer++; - } - } - } - - if(sdata->pair.cutsq) { - for(int i = 1; i <= sdata->atom.ntypes; ++i) { - for(int j = i; j <= sdata->atom.ntypes; ++j) { - if(sdata->pair.cut[i][j] > 1e-6) { - cutsq[i * cuda_ntypes + j] = (X_FLOAT)(sdata->pair.cutsq[i][j]); - cutsq[j * cuda_ntypes + i] = (X_FLOAT)(sdata->pair.cutsq[i][j]); - } - - if(i == 1 && j == 1) cutsq_global = cutsq[i * cuda_ntypes + j]; - - if((cutsq_global - cutsq[i * cuda_ntypes + j]) * (cutsq_global - cutsq[i * cuda_ntypes + j]) > 1e-6) - cutsqdiffer++; - } - } - } - - //printf("CUTSQGLOB: %i %e\n",cutsqdiffer,cutsq_global); - if(cutsqdiffer) { - - cutsq_global = -1.0; - cudaMemcpyToSymbol(MY_AP(cutsq) , cutsq , nx); - } - - cudaMemcpyToSymbol(MY_AP(cutsq_global) , &cutsq_global , sizeof(X_FLOAT)); - } - - if(need_innercut) { - X_FLOAT cut_innersq[cuda_ntypes2]; - - for(int i = 1; i <= sdata->atom.ntypes; ++i) { - for(int j = 1; j <= sdata->atom.ntypes; ++j) { - cut_innersq[i * cuda_ntypes + j] = (X_FLOAT)(sdata->pair.cut_inner_global * sdata->pair.cut_inner_global); - } - } - - int cutsqdiffer = 0; - X_FLOAT cut_innersq_global; - cut_innersq_global = (X_FLOAT)(sdata->pair.cut_inner_global * sdata->pair.cut_inner_global); - - if(sdata->pair.cut_inner) { - for(int i = 1; i <= sdata->atom.ntypes; ++i) { - for(int j = i; j <= sdata->atom.ntypes; ++j) { - if(sdata->pair.cut_inner[i][j] > 1e-6) { - cut_innersq[i * cuda_ntypes + j] = (X_FLOAT)(sdata->pair.cut_inner[i][j] * sdata->pair.cut_inner[i][j]); - cut_innersq[j * cuda_ntypes + i] = (X_FLOAT)(sdata->pair.cut_inner[i][j] * sdata->pair.cut_inner[i][j]); - } - - if(i == 1 && j == 1) cut_innersq_global = cut_innersq[i * cuda_ntypes + j]; - - if((cut_innersq_global - cut_innersq[i * cuda_ntypes + j]) * (cut_innersq_global - cut_innersq[i * cuda_ntypes + j]) > 1e-6) - cutsqdiffer++; - } - } - } - - if(cutsqdiffer) { - cut_innersq_global = -1.0; - cudaMemcpyToSymbol(MY_AP(cut_innersq) , cut_innersq , nx); - } - - cudaMemcpyToSymbol(MY_AP(cut_innersq_global) , &cut_innersq_global , sizeof(X_FLOAT)); - } - - if(need_q) { - 
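// Coulomb analogue of the need_cut block above: build the squared
// per-type-pair coulomb cutoffs, count pairs that deviate from the global
// value, and if any differ upload the full table and set the global to -1.0
// as a sentinel so the kernels fall back to the per-type array.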
X_FLOAT cut_coulsq[cuda_ntypes2]; - - for(int i = 1; i <= sdata->atom.ntypes; ++i) { - for(int j = 1; j <= sdata->atom.ntypes; ++j) { - cut_coulsq[i * cuda_ntypes + j] = (X_FLOAT)(sdata->pair.cut_coul_global * sdata->pair.cut_coul_global); - } - } - - int cutsqdiffer = 0; - X_FLOAT cut_coulsq_global; - cut_coulsq_global = (X_FLOAT)(sdata->pair.cut_coul_global * sdata->pair.cut_coul_global); - - if(sdata->pair.cut_coulsq_global > cut_coulsq_global) cut_coulsq_global = (X_FLOAT) sdata->pair.cut_coulsq_global; - - if(sdata->pair.cut_coul) { - for(int i = 1; i <= sdata->atom.ntypes; ++i) { - for(int j = i; j <= sdata->atom.ntypes; ++j) { - if(sdata->pair.cut_coul[i][j] > 1e-6) { - cut_coulsq[i * cuda_ntypes + j] = (X_FLOAT)(sdata->pair.cut_coul[i][j] * sdata->pair.cut_coul[i][j]); - cut_coulsq[j * cuda_ntypes + i] = (X_FLOAT)(sdata->pair.cut_coul[i][j] * sdata->pair.cut_coul[i][j]); - } - - if(i == 1 && j == 1) cut_coulsq_global = cut_coulsq[i * cuda_ntypes + j]; - - if((cut_coulsq_global - cut_coulsq[i * cuda_ntypes + j]) * (cut_coulsq_global - cut_coulsq[i * cuda_ntypes + j]) > 1e-6) - cutsqdiffer++; - } - } - } - - if(cutsqdiffer) { - cut_coulsq_global = -1.0; - cudaMemcpyToSymbol(MY_AP(cut_coulsq) , cut_coulsq , nx); - } - - cudaMemcpyToSymbol(MY_AP(cut_coulsq_global), &cut_coulsq_global , sizeof(X_FLOAT)); - } - - CUT_CHECK_ERROR("Cuda_Pair: init pre Coeff failed"); - - if(ncoeff > 0) { - F_FLOAT coeff1[cuda_ntypes2]; - - for(int i = 1; i <= sdata->atom.ntypes; ++i) { - for(int j = 1; j <= sdata->atom.ntypes; ++j) { - coeff1[i * cuda_ntypes + j] = (F_FLOAT) sdata->pair.coeff1[i][j]; - } - } - - if(use_global_params) { - cudaMemcpyToSymbol(MY_AP(coeff1_gm) , &sdata->pair.coeff1_gm.dev_data , sizeof(F_FLOAT*)); - cudaMemcpy((sdata->pair.coeff1_gm.dev_data), coeff1, n, cudaMemcpyHostToDevice); - - _coeff1_gm_tex.normalized = false; // access with normalized texture coordinates - _coeff1_gm_tex.filterMode = cudaFilterModePoint; // Point mode, so no - _coeff1_gm_tex.addressMode[0] = cudaAddressModeWrap; // wrap texture coordinates - const textureReference* coeff1_gm_texture_ptr = &MY_AP(coeff1_gm_tex); - CUT_CHECK_ERROR("Cuda_Pair: init Coeff0 a failed"); - -#if F_PRECISION == 1 - cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc(); - CUT_CHECK_ERROR("Cuda_Pair: init Coeff0 b failed"); - cudaBindTexture(0, coeff1_gm_texture_ptr, sdata->pair.coeff1_gm.dev_data, &channelDescXType, sdata->atom.nmax * sizeof(F_FLOAT)); - CUT_CHECK_ERROR("Cuda_Pair: init Coeff0 c failed"); -#else - cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc(); - CUT_CHECK_ERROR("Cuda_Pair: init Coeff0 b-d failed"); - cudaBindTexture(0, coeff1_gm_texture_ptr, sdata->pair.coeff1_gm.dev_data, &channelDescXType, sdata->atom.nmax * 2 * sizeof(int2)); - CUT_CHECK_ERROR("Cuda_Pair: init Coeff0 c-d failed"); -#endif - - } else - cudaMemcpyToSymbol(MY_AP(coeff1), coeff1 , n); - } - - CUT_CHECK_ERROR("Cuda_Pair: init Coeff0 failed"); - - if(ncoeff > 1) { - F_FLOAT coeff2[cuda_ntypes2]; - - for(int i = 1; i <= sdata->atom.ntypes; ++i) { - for(int j = 1; j <= sdata->atom.ntypes; ++j) { - coeff2[i * cuda_ntypes + j] = (F_FLOAT) sdata->pair.coeff2[i][j]; - } - } - - if(use_global_params) { - cudaMemcpyToSymbol(MY_AP(coeff2_gm) , &sdata->pair.coeff2_gm.dev_data , sizeof(F_FLOAT*)); - cudaMemcpy(sdata->pair.coeff2_gm.dev_data, coeff2, n, cudaMemcpyHostToDevice); - - _coeff2_gm_tex.normalized = false; // access with normalized texture coordinates - _coeff2_gm_tex.filterMode = cudaFilterModePoint; // Point mode, so 
no - _coeff2_gm_tex.addressMode[0] = cudaAddressModeWrap; // wrap texture coordinates - const textureReference* coeff2_gm_texture_ptr = &MY_AP(coeff2_gm_tex); - -#if F_PRECISION == 1 - cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc(); - cudaBindTexture(0, coeff2_gm_texture_ptr, sdata->pair.coeff2_gm.dev_data, &channelDescXType, sdata->atom.nmax * sizeof(F_FLOAT)); -#else - cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc(); - cudaBindTexture(0, coeff2_gm_texture_ptr, sdata->pair.coeff2_gm.dev_data, &channelDescXType, sdata->atom.nmax * 2 * sizeof(int2)); -#endif - - } else - cudaMemcpyToSymbol(MY_AP(coeff2), coeff2 , n); - } - - CUT_CHECK_ERROR("Cuda_Pair: init Coeff1 failed"); - - if(ncoeff > 2) { - F_FLOAT coeff3[cuda_ntypes2]; - - for(int i = 1; i <= sdata->atom.ntypes; ++i) { - for(int j = 1; j <= sdata->atom.ntypes; ++j) { - coeff3[i * cuda_ntypes + j] = (F_FLOAT) sdata->pair.coeff3[i][j]; - } - } - - if(use_global_params) { - cudaMemcpyToSymbol(MY_AP(coeff3_gm) , &sdata->pair.coeff3_gm.dev_data , sizeof(F_FLOAT*)); - cudaMemcpy(sdata->pair.coeff3_gm.dev_data, coeff3, n, cudaMemcpyHostToDevice); - _coeff3_gm_tex.normalized = false; // access with normalized texture coordinates - _coeff3_gm_tex.filterMode = cudaFilterModePoint; // Point mode, so no - _coeff3_gm_tex.addressMode[0] = cudaAddressModeWrap; // wrap texture coordinates - const textureReference* coeff3_gm_texture_ptr = &MY_AP(coeff3_gm_tex); - -#if F_PRECISION == 1 - cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc(); - cudaBindTexture(0, coeff3_gm_texture_ptr, sdata->pair.coeff3_gm.dev_data, &channelDescXType, sdata->atom.nmax * sizeof(F_FLOAT)); -#else - cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc(); - cudaBindTexture(0, coeff3_gm_texture_ptr, sdata->pair.coeff3_gm.dev_data, &channelDescXType, sdata->atom.nmax * 2 * sizeof(int2)); -#endif - } else - cudaMemcpyToSymbol(MY_AP(coeff3), coeff3 , n); - } - - CUT_CHECK_ERROR("Cuda_Pair: init Coeff3 failed"); - - if(ncoeff > 3) { - F_FLOAT coeff4[cuda_ntypes2]; - - for(int i = 1; i <= sdata->atom.ntypes; ++i) { - for(int j = 1; j <= sdata->atom.ntypes; ++j) { - coeff4[i * cuda_ntypes + j] = (F_FLOAT) sdata->pair.coeff4[i][j]; - } - } - - if(use_global_params) { - cudaMemcpyToSymbol(MY_AP(coeff4_gm) , &sdata->pair.coeff4_gm.dev_data , sizeof(F_FLOAT*)); - cudaMemcpy(sdata->pair.coeff4_gm.dev_data, coeff4, n, cudaMemcpyHostToDevice); - _coeff4_gm_tex.normalized = false; // access with normalized texture coordinates - _coeff4_gm_tex.filterMode = cudaFilterModePoint; // Point mode, so no - _coeff4_gm_tex.addressMode[0] = cudaAddressModeWrap; // wrap texture coordinates - const textureReference* coeff4_gm_texture_ptr = &MY_AP(coeff4_gm_tex); - -#if F_PRECISION == 1 - cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc(); - cudaBindTexture(0, coeff4_gm_texture_ptr, sdata->pair.coeff4_gm.dev_data, &channelDescXType, sdata->atom.nmax * sizeof(F_FLOAT)); -#else - cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc(); - cudaBindTexture(0, coeff4_gm_texture_ptr, sdata->pair.coeff4_gm.dev_data, &channelDescXType, sdata->atom.nmax * 2 * sizeof(int2)); -#endif - } else - cudaMemcpyToSymbol(MY_AP(coeff4), coeff4 , n); - } - - CUT_CHECK_ERROR("Cuda_Pair: init Coeff4 failed"); - - if(ncoeff > 4) { - F_FLOAT coeff5[cuda_ntypes2]; - - for(int i = 1; i <= sdata->atom.ntypes; ++i) { - for(int j = 1; j <= sdata->atom.ntypes; ++j) { - coeff5[i * cuda_ntypes + j] = (F_FLOAT) sdata->pair.coeff5[i][j]; - } - } - - 
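/* Note: each coefficient matrix (coeff1..coeff9) is staged in a host array, as with
   coeff5 just above, and then uploaded either to __constant__ memory (fast broadcast
   reads, but capped at CUDA_MAX_TYPES2 entries) or, when use_global_params is set,
   to global memory behind a texture. A condensed sketch of that choice (hypothetical
   helper; the real code repeats the pattern inline for each matrix):

   void upload_pair_matrix(const F_FLOAT* host, F_FLOAT* dev_global,
                           unsigned nbytes, bool use_global)
   {
     if(use_global)
       cudaMemcpy(dev_global, host, nbytes, cudaMemcpyHostToDevice);  // + texture bind, as below
     else
       cudaMemcpyToSymbol(MY_AP(coeff5), host, nbytes);               // constant-memory path
   }
*/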
if(use_global_params) { - cudaMemcpyToSymbol(MY_AP(coeff5_gm) , &sdata->pair.coeff5_gm.dev_data , sizeof(F_FLOAT*)); - cudaMemcpy(sdata->pair.coeff5_gm.dev_data, coeff5, n, cudaMemcpyHostToDevice); - _coeff5_gm_tex.normalized = false; // access with normalized texture coordinates - _coeff5_gm_tex.filterMode = cudaFilterModePoint; // Point mode, so no - _coeff5_gm_tex.addressMode[0] = cudaAddressModeWrap; // wrap texture coordinates - const textureReference* coeff5_gm_texture_ptr = &MY_AP(coeff5_gm_tex); - -#if F_PRECISION == 1 - cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc(); - cudaBindTexture(0, coeff5_gm_texture_ptr, sdata->pair.coeff5_gm.dev_data, &channelDescXType, sdata->atom.nmax * sizeof(F_FLOAT)); -#else - cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc(); - cudaBindTexture(0, coeff5_gm_texture_ptr, sdata->pair.coeff5_gm.dev_data, &channelDescXType, sdata->atom.nmax * 2 * sizeof(int2)); -#endif - } else - cudaMemcpyToSymbol(MY_AP(coeff5), coeff5 , n); - } - - CUT_CHECK_ERROR("Cuda_Pair: init Coeff5 failed"); - - if(ncoeff > 5) { - F_FLOAT coeff6[cuda_ntypes2]; - - for(int i = 1; i <= sdata->atom.ntypes; ++i) { - for(int j = 1; j <= sdata->atom.ntypes; ++j) { - coeff6[i * cuda_ntypes + j] = (F_FLOAT) sdata->pair.coeff6[i][j]; - } - } - - if(use_global_params) { - cudaMemcpyToSymbol(MY_AP(coeff6_gm) , &sdata->pair.coeff6_gm.dev_data , sizeof(F_FLOAT*)); - cudaMemcpy(sdata->pair.coeff6_gm.dev_data, coeff6, n, cudaMemcpyHostToDevice); - _coeff6_gm_tex.normalized = false; // access with normalized texture coordinates - _coeff6_gm_tex.filterMode = cudaFilterModePoint; // Point mode, so no - _coeff6_gm_tex.addressMode[0] = cudaAddressModeWrap; // wrap texture coordinates - const textureReference* coeff6_gm_texture_ptr = &MY_AP(coeff6_gm_tex); - -#if F_PRECISION == 1 - cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc(); - cudaBindTexture(0, coeff6_gm_texture_ptr, sdata->pair.coeff6_gm.dev_data, &channelDescXType, sdata->atom.nmax * sizeof(F_FLOAT)); -#else - cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc(); - cudaBindTexture(0, coeff6_gm_texture_ptr, sdata->pair.coeff6_gm.dev_data, &channelDescXType, sdata->atom.nmax * 2 * sizeof(int2)); -#endif - } - } - - CUT_CHECK_ERROR("Cuda_Pair: init Coeff6 failed"); - - if(ncoeff > 6) { - F_FLOAT coeff7[cuda_ntypes2]; - - for(int i = 1; i <= sdata->atom.ntypes; ++i) { - for(int j = 1; j <= sdata->atom.ntypes; ++j) { - coeff7[i * cuda_ntypes + j] = (F_FLOAT) sdata->pair.coeff7[i][j]; - } - } - - if(use_global_params) { - cudaMemcpyToSymbol(MY_AP(coeff7_gm) , &sdata->pair.coeff7_gm.dev_data , sizeof(F_FLOAT*)); - cudaMemcpy(sdata->pair.coeff7_gm.dev_data, coeff7, n, cudaMemcpyHostToDevice); - _coeff7_gm_tex.normalized = false; // access with normalized texture coordinates - _coeff7_gm_tex.filterMode = cudaFilterModePoint; // Point mode, so no - _coeff7_gm_tex.addressMode[0] = cudaAddressModeWrap; // wrap texture coordinates - const textureReference* coeff7_gm_texture_ptr = &MY_AP(coeff7_gm_tex); - -#if F_PRECISION == 1 - cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc(); - cudaBindTexture(0, coeff7_gm_texture_ptr, sdata->pair.coeff7_gm.dev_data, &channelDescXType, sdata->atom.nmax * sizeof(F_FLOAT)); -#else - cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc(); - cudaBindTexture(0, coeff7_gm_texture_ptr, sdata->pair.coeff7_gm.dev_data, &channelDescXType, sdata->atom.nmax * 2 * sizeof(int2)); -#endif - } - } - - CUT_CHECK_ERROR("Cuda_Pair: init Coeff7 failed"); - 
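/* Note on the #else branches above: the texture hardware targeted here cannot fetch
   64-bit values directly, so in double precision the coefficient arrays are
   presumably bound as int2 textures and reassembled in the kernel from two 32-bit
   halves. A typical fetch helper (hypothetical, shown for illustration):

   texture<int2, 1> _coeff1_gm_tex;   // declared over float in single precision

   __device__ inline double fetch_coeff1(int i)
   {
     const int2 v = tex1Dfetch(_coeff1_gm_tex, i);
     return __hiloint2double(v.y, v.x);   // v.y = high word, v.x = low word
   }
*/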
- if(ncoeff > 7) { - F_FLOAT coeff8[cuda_ntypes2]; - - for(int i = 1; i <= sdata->atom.ntypes; ++i) { - for(int j = 1; j <= sdata->atom.ntypes; ++j) { - coeff8[i * cuda_ntypes + j] = (F_FLOAT) sdata->pair.coeff8[i][j]; - } - } - - if(use_global_params) { - cudaMemcpyToSymbol(MY_AP(coeff8_gm) , &sdata->pair.coeff8_gm.dev_data , sizeof(F_FLOAT*)); - cudaMemcpy(sdata->pair.coeff8_gm.dev_data, coeff8, n, cudaMemcpyHostToDevice); - _coeff8_gm_tex.normalized = false; // access with normalized texture coordinates - _coeff8_gm_tex.filterMode = cudaFilterModePoint; // Point mode, so no - _coeff8_gm_tex.addressMode[0] = cudaAddressModeWrap; // wrap texture coordinates - const textureReference* coeff8_gm_texture_ptr = &MY_AP(coeff8_gm_tex); - -#if F_PRECISION == 1 - cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc(); - cudaBindTexture(0, coeff8_gm_texture_ptr, sdata->pair.coeff8_gm.dev_data, &channelDescXType, sdata->atom.nmax * sizeof(F_FLOAT)); -#else - cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc(); - cudaBindTexture(0, coeff8_gm_texture_ptr, sdata->pair.coeff8_gm.dev_data, &channelDescXType, sdata->atom.nmax * 2 * sizeof(int2)); -#endif - } - } - - CUT_CHECK_ERROR("Cuda_Pair: init Coeff8 failed"); - - if(ncoeff > 8) { - F_FLOAT coeff9[cuda_ntypes2]; - - for(int i = 1; i <= sdata->atom.ntypes; ++i) { - for(int j = 1; j <= sdata->atom.ntypes; ++j) { - coeff9[i * cuda_ntypes + j] = (F_FLOAT) sdata->pair.coeff9[i][j]; - } - } - - if(use_global_params) { - cudaMemcpyToSymbol(MY_AP(coeff9_gm) , &sdata->pair.coeff9_gm.dev_data , sizeof(F_FLOAT*)); - cudaMemcpy(sdata->pair.coeff9_gm.dev_data, coeff9, n, cudaMemcpyHostToDevice); - _coeff9_gm_tex.normalized = false; // access with normalized texture coordinates - _coeff9_gm_tex.filterMode = cudaFilterModePoint; // Point mode, so no - _coeff9_gm_tex.addressMode[0] = cudaAddressModeWrap; // wrap texture coordinates - const textureReference* coeff9_gm_texture_ptr = &MY_AP(coeff9_gm_tex); - -#if F_PRECISION == 1 - cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc(); - cudaBindTexture(0, coeff9_gm_texture_ptr, sdata->pair.coeff9_gm.dev_data, &channelDescXType, sdata->atom.nmax * sizeof(F_FLOAT)); -#else - cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc(); - cudaBindTexture(0, coeff9_gm_texture_ptr, sdata->pair.coeff9_gm.dev_data, &channelDescXType, sdata->atom.nmax * 2 * sizeof(int2)); -#endif - } - } - - CUT_CHECK_ERROR("Cuda_Pair: init Coeff9 failed"); - - F_FLOAT special_lj[4]; - special_lj[0] = sdata->pair.special_lj[0]; - special_lj[1] = sdata->pair.special_lj[1]; - special_lj[2] = sdata->pair.special_lj[2]; - special_lj[3] = sdata->pair.special_lj[3]; - - - X_FLOAT box_size[3] = { - sdata->domain.subhi[0] - sdata->domain.sublo[0], - sdata->domain.subhi[1] - sdata->domain.sublo[1], - sdata->domain.subhi[2] - sdata->domain.sublo[2] - }; - - cudaMemcpyToSymbol(MY_AP(box_size) , box_size , sizeof(X_FLOAT) * 3); - cudaMemcpyToSymbol(MY_AP(cuda_ntypes) , &cuda_ntypes , sizeof(unsigned)); - cudaMemcpyToSymbol(MY_AP(special_lj) , special_lj , sizeof(F_FLOAT) * 4); - cudaMemcpyToSymbol(MY_AP(virial) , &sdata->pair.virial.dev_data , sizeof(ENERGY_FLOAT*)); - cudaMemcpyToSymbol(MY_AP(eng_vdwl) , &sdata->pair.eng_vdwl.dev_data , sizeof(ENERGY_FLOAT*)); - cudaMemcpyToSymbol(MY_AP(periodicity) , sdata->domain.periodicity , sizeof(int) * 3); - cudaMemcpyToSymbol(MY_AP(collect_forces_later), &sdata->pair.collect_forces_later , sizeof(int)); - - if(need_q) { - F_FLOAT qqrd2e_tmp = sdata->pppm.qqrd2e; - F_FLOAT 
special_coul[4]; - special_coul[0] = sdata->pair.special_coul[0]; - special_coul[1] = sdata->pair.special_coul[1]; - special_coul[2] = sdata->pair.special_coul[2]; - special_coul[3] = sdata->pair.special_coul[3]; - - cudaMemcpyToSymbol(MY_AP(special_coul) , special_coul , sizeof(F_FLOAT) * 4); - cudaMemcpyToSymbol(MY_AP(g_ewald) , &sdata->pair.g_ewald , sizeof(F_FLOAT)); - cudaMemcpyToSymbol(MY_AP(qqrd2e) , &qqrd2e_tmp , sizeof(F_FLOAT)); - cudaMemcpyToSymbol(MY_AP(kappa) , &sdata->pair.kappa , sizeof(F_FLOAT)); - cudaMemcpyToSymbol(MY_AP(eng_coul) , &sdata->pair.eng_coul.dev_data , sizeof(ENERGY_FLOAT*)); - } - - CUT_CHECK_ERROR("Cuda_Pair: init failed"); -} -timespec startpairtime, endpairtime; -//Function which is called prior to kernel invocation; determines grid, binds textures, and updates constant memory if necessary -void Cuda_Pair_PreKernel_AllStyles(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, dim3 &grid, dim3 &threads, int &sharedperproc, bool need_q = false, int maxthreads = 256) -{ - if(sdata->atom.nlocal == 0) return; - - if(sdata->atom.update_neigh) - Cuda_Pair_UpdateNeighbor_AllStyles(sdata, sneighlist); - - if(sdata->atom.update_nmax) - Cuda_Pair_UpdateNmax_AllStyles(sdata, sneighlist); - - if(sdata->atom.update_nlocal) { - cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int)); - cudaMemcpyToSymbol(MY_AP(nall) , & sdata->atom.nall , sizeof(int)); - } - - - - BindXTypeTexture(sdata); - - if(need_q) BindQTexture(sdata); - - - sharedperproc = 0; - - if(sdata->pair.use_block_per_atom) sharedperproc += 3; - - if(eflag) sharedperproc += 1; - - if(need_q && eflag) sharedperproc += 1; - - if(vflag) sharedperproc += 6; - - int threadnum = sneighlist->inum; - - if(sdata->comm.comm_phase == 2) threadnum = sneighlist->inum_border2; - - if(sdata->pair.use_block_per_atom) { - threadnum *= 64; - maxthreads = 64; - } - - int3 layout = getgrid(threadnum, sharedperproc * sizeof(ENERGY_FLOAT), maxthreads, true); //need to limit to 192 threads due to register limit - threads.x = layout.z; - threads.y = 1; - threads.z = 1; - grid.x = layout.x; - grid.y = layout.y; - grid.z = 1; - - int size = (unsigned)(layout.y * layout.x) * sharedperproc * sizeof(ENERGY_FLOAT); - - if(sdata->pair.collect_forces_later) size += (unsigned)(sdata->atom.nmax * 3 * sizeof(F_FLOAT)); - - Cuda_UpdateBuffer(sdata, size); - - if(sdata->pair.use_block_per_atom) - cudaMemset(sdata->buffer, 0, size); - - sdata->pair.lastgridsize = grid.x * grid.y; - sdata->pair.n_energy_virial = sharedperproc; - - if(sdata->pair.use_block_per_atom) sdata->pair.n_energy_virial -= 3; - - clock_gettime(CLOCK_REALTIME, &startpairtime); - - MYDBG(printf("# CUDA: Cuda_Pair: kernel start eflag: %i vflag: %i config: %i %i %i %i\n", eflag, vflag, grid.x, grid.y, threads.x, sharedperproc * sizeof(ENERGY_FLOAT)*threads.x);) -} - -//Function which is called after the kernel invocation; collects energy and virial -void Cuda_Pair_PostKernel_AllStyles(cuda_shared_data* sdata, dim3 &grid, int &sharedperproc, int eflag, int vflag) -{ - if((not sdata->pair.collect_forces_later) && (eflag || vflag)) { //not sdata->comm.comm_phase==2)) - cudaThreadSynchronize(); - clock_gettime(CLOCK_REALTIME, &endpairtime); - sdata->cuda_timings.pair_kernel += - endpairtime.tv_sec - startpairtime.tv_sec + 1.0 * (endpairtime.tv_nsec - startpairtime.tv_nsec) / 1000000000; - CUT_CHECK_ERROR("Cuda_Pair: Kernel execution failed"); - - if(eflag || vflag) { - int n = grid.x * grid.y; - - if(sdata->pair.use_block_per_atom) - grid.x = 
sharedperproc - 3; - else - grid.x = sharedperproc; - - grid.y = 1; - dim3 threads(128, 1, 1); - MYDBG(printf("# CUDA: Cuda_Pair: virial compute kernel start eflag: %i vflag: %i config: %i %i %i %i\n", eflag, vflag, grid.x, grid.y, threads.x, sharedperproc * sizeof(ENERGY_FLOAT)*threads.x);) - MY_AP(PairVirialCompute_reduce) <<< grid, threads, threads.x* sizeof(ENERGY_FLOAT)>>>(n); - cudaThreadSynchronize(); - CUT_CHECK_ERROR("Cuda_Pair: virial compute Kernel execution failed"); - } - - MYDBG(printf("# CUDA: Cuda_Pair: kernel done\n");) - } -} - - -#include "pair_born_coul_long_cuda.cu" -#include "pair_buck_coul_cut_cuda.cu" -#include "pair_buck_coul_long_cuda.cu" -#include "pair_buck_cuda.cu" -#include "pair_lj_sdk_cuda.cu" -#include "pair_lj_sdk_coul_cut_cuda.cu" -#include "pair_lj_sdk_coul_debye_cuda.cu" -#include "pair_lj_sdk_coul_long_cuda.cu" -#include "pair_gran_hooke_cuda.cu" -#include "pair_lj_charmm_coul_charmm_implicit_cuda.cu" -#include "pair_lj_charmm_coul_charmm_cuda.cu" -#include "pair_lj_charmm_coul_long_cuda.cu" -#include "pair_lj_class2_coul_cut_cuda.cu" -#include "pair_lj_class2_coul_long_cuda.cu" -#include "pair_lj_class2_cuda.cu" -#include "pair_lj_cut_coul_cut_cuda.cu" -#include "pair_lj_cut_coul_debye_cuda.cu" -#include "pair_lj_cut_coul_long_cuda.cu" -#include "pair_lj_cut_cuda.cu" -#include "pair_lj_cut_experimental_cuda.cu" -#include "pair_lj_expand_cuda.cu" -#include "pair_lj_gromacs_cuda.cu" -#include "pair_lj_gromacs_coul_gromacs_cuda.cu" -#include "pair_lj_smooth_cuda.cu" -#include "pair_lj96_cut_cuda.cu" -#include "pair_morse_coul_long_cuda.cu" -#include "pair_morse_cuda.cu" -#include "pair_eam_cuda.cu" - -#include "cuda_pair_kernel.cu" - -#include "pair_manybody_const.h" -#include "pair_tersoff_cuda.cu" -#include "pair_sw_cuda.cu" - -void Cuda_Pair_UpdateNmax(cuda_shared_data* sdata) -{ - CUT_CHECK_ERROR("Cuda_Pair: before updateNmax failed"); - cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int)); - cudaMemcpyToSymbol(MY_AP(nall) , & sdata->atom.nall , sizeof(int)); - cudaMemcpyToSymbol(MY_AP(nmax) , & sdata->atom.nmax , sizeof(int)); - cudaMemcpyToSymbol(MY_AP(type) , & sdata->atom.type .dev_data, sizeof(int*)); - cudaMemcpyToSymbol(MY_AP(x) , & sdata->atom.x .dev_data, sizeof(X_FLOAT*)); - cudaMemcpyToSymbol(MY_AP(x_type) , & sdata->atom.x_type .dev_data, sizeof(X_FLOAT4*)); - cudaMemcpyToSymbol(MY_AP(xhold) , & sdata->atom.xhold .dev_data, sizeof(X_FLOAT*)); - cudaMemcpyToSymbol(MY_AP(v) , & sdata->atom.v .dev_data, sizeof(V_FLOAT*)); - cudaMemcpyToSymbol(MY_AP(radius) , & sdata->atom.radius .dev_data, sizeof(X_FLOAT*)); - cudaMemcpyToSymbol(MY_AP(v_radius) , & sdata->atom.v_radius .dev_data, sizeof(V_FLOAT4*)); - cudaMemcpyToSymbol(MY_AP(omega) , & sdata->atom.omega .dev_data, sizeof(V_FLOAT*)); - cudaMemcpyToSymbol(MY_AP(rmass) , & sdata->atom.rmass .dev_data, sizeof(V_FLOAT*)); - cudaMemcpyToSymbol(MY_AP(omega_rmass), & sdata->atom.omega_rmass.dev_data, sizeof(V_FLOAT4*)); - cudaMemcpyToSymbol(MY_AP(map_array), & sdata->atom.map_array .dev_data, sizeof(int*)); - CUT_CHECK_ERROR("Cuda_Pair: updateNmax failed"); -} - - -void Cuda_Pair_GenerateXType(cuda_shared_data* sdata) -{ - MYDBG(printf(" # CUDA: GenerateXType ... 
start %i %i %i %p %p %p %p\n", sdata->atom.nlocal, sdata->atom.nall, sdata->atom.nmax, sdata->atom.x.dev_data, sdata->atom.x_type.dev_data, sdata->atom.xhold.dev_data, sdata->atom.type.dev_data);) - - if(sdata->atom.update_nmax) - Cuda_Pair_UpdateNmax(sdata); - - if(sdata->atom.update_nlocal) { - cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int)); - cudaMemcpyToSymbol(MY_AP(nall) , & sdata->atom.nall , sizeof(int)); - } - - MYDBG(printf(" # CUDA: GenerateXType ... getgrid\n"); fflush(stdout);) - - int3 layout = getgrid(sdata->atom.nall); - dim3 threads(layout.z, 1, 1); - dim3 grid(layout.x, layout.y, 1); - - MYDBG(printf(" # CUDA: GenerateXType ... kernel start test\n"); fflush(stdout);) - Pair_GenerateXType_Kernel <<< grid, threads, 0>>>(); - cudaThreadSynchronize(); - CUT_CHECK_ERROR("Cuda_Pair GenerateXType: Kernel failed"); - MYDBG(printf(" # CUDA: GenerateXType ... end\n"); fflush(stdout);) -} - -void Cuda_Pair_RevertXType(cuda_shared_data* sdata) -{ - MYDBG(printf(" # CUDA: RevertXType ... start\n");) - - if(sdata->atom.update_nmax) - Cuda_Pair_UpdateNmax(sdata); - - cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int)); - cudaMemcpyToSymbol(MY_AP(nall) , & sdata->atom.nall , sizeof(int)); - - int3 layout = getgrid(sdata->atom.nall); - dim3 threads(layout.z, 1, 1); - dim3 grid(layout.x, layout.y, 1); - - Pair_RevertXType_Kernel <<< grid, threads, 0>>>(); - cudaThreadSynchronize(); - CUT_CHECK_ERROR("Cuda_Pair GenerateXType: Kernel failed"); - MYDBG(printf(" # CUDA: RevertXType ... end\n");) -} - -void Cuda_Pair_GenerateVRadius(cuda_shared_data* sdata) -{ - MYDBG(printf(" # CUDA: GenerateVRadius ... start %i %i %i %p %p %p %p\n", sdata->atom.nlocal, sdata->atom.nall, sdata->atom.nmax, sdata->atom.x.dev_data, sdata->atom.x_type.dev_data, sdata->atom.xhold.dev_data, sdata->atom.type.dev_data);) - - if(sdata->atom.update_nmax) - Cuda_Pair_UpdateNmax(sdata); - - cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int)); - cudaMemcpyToSymbol(MY_AP(nall) , & sdata->atom.nall , sizeof(int)); - MYDBG(printf(" # CUDA: GenerateVRadius ... getgrid\n"); fflush(stdout);) - - int3 layout = getgrid(sdata->atom.nall); - dim3 threads(layout.z, 1, 1); - dim3 grid(layout.x, layout.y, 1); - - MYDBG(printf(" # CUDA: GenerateVRadius ... kernel start test\n"); fflush(stdout);) - Pair_GenerateVRadius_Kernel <<< grid, threads, 0>>>(); - cudaThreadSynchronize(); - CUT_CHECK_ERROR("Cuda_Pair GenerateVRadius: Kernel failed"); - MYDBG(printf(" # CUDA: GenerateVRadius ... end\n"); fflush(stdout);) -} - -void Cuda_Pair_GenerateOmegaRmass(cuda_shared_data* sdata) -{ - MYDBG(printf(" # CUDA: GenerateOmegaRmass ... start %i %i %i %p %p %p %p\n", sdata->atom.nlocal, sdata->atom.nall, sdata->atom.nmax, sdata->atom.x.dev_data, sdata->atom.x_type.dev_data, sdata->atom.xhold.dev_data, sdata->atom.type.dev_data);) - - if(sdata->atom.update_nmax) - Cuda_Pair_UpdateNmax(sdata); - - cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int)); - cudaMemcpyToSymbol(MY_AP(nall) , & sdata->atom.nall , sizeof(int)); - MYDBG(printf(" # CUDA: GenerateOmegaRmass ... getgrid\n"); fflush(stdout);) - - int3 layout = getgrid(sdata->atom.nall); - dim3 threads(layout.z, 1, 1); - dim3 grid(layout.x, layout.y, 1); - - MYDBG(printf(" # CUDA: GenerateOmegaRmass ... 
kernel start test\n"); fflush(stdout);) - Pair_GenerateOmegaRmass_Kernel <<< grid, threads, 0>>>(); - cudaThreadSynchronize(); - CUT_CHECK_ERROR("Cuda_Pair GenerateOmegaRmass: Kernel failed"); - MYDBG(printf(" # CUDA: GenerateOmegaRmass ... end\n"); fflush(stdout);) -} - -void Cuda_Pair_BuildXHold(cuda_shared_data* sdata) -{ - if(sdata->atom.update_nmax) - Cuda_Pair_UpdateNmax(sdata); - - cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int)); - cudaMemcpyToSymbol(MY_AP(nall) , & sdata->atom.nall , sizeof(int)); - - int3 layout = getgrid(sdata->atom.nall); - dim3 threads(layout.z, 1, 1); - dim3 grid(layout.x, layout.y, 1); - - Pair_BuildXHold_Kernel <<< grid, threads, 0>>>(); - cudaThreadSynchronize(); - CUT_CHECK_ERROR("Cuda_Pair GenerateXType: Kernel failed"); -} - -void Cuda_Pair_CollectForces(cuda_shared_data* sdata, int eflag, int vflag) -{ - cudaThreadSynchronize(); - clock_gettime(CLOCK_REALTIME, &endpairtime); - sdata->cuda_timings.pair_kernel += - endpairtime.tv_sec - startpairtime.tv_sec + 1.0 * (endpairtime.tv_nsec - startpairtime.tv_nsec) / 1000000000; - CUT_CHECK_ERROR("Cuda_Pair: Kernel execution failed"); - dim3 threads; - dim3 grid; - - if(eflag || vflag) { - int n = sdata->pair.lastgridsize; - grid.x = sdata->pair.n_energy_virial; - grid.y = 1; - threads.x = 128; - //printf("A grid.x: %i\n",grid.x); - MY_AP(PairVirialCompute_reduce) <<< grid, threads, threads.x* sizeof(ENERGY_FLOAT)>>>(n); - cudaThreadSynchronize(); - CUT_CHECK_ERROR("Cuda_Pair_CollectForces: virial compute Kernel execution failed"); - } - - int3 layout = getgrid(sdata->atom.nlocal); - threads.x = layout.z; - grid.x = layout.x; - grid.y = layout.y; - Pair_CollectForces_Kernel <<< grid, threads, 0>>>(sdata->pair.n_energy_virial, sdata->pair.lastgridsize); - cudaThreadSynchronize(); - CUT_CHECK_ERROR("Cuda_Pair_CollectForces: Force Summation Kernel execution failed"); - -} diff --git a/lib/cuda/cuda_pair_cu.h b/lib/cuda/cuda_pair_cu.h deleted file mode 100644 index 1844735a16..0000000000 --- a/lib/cuda/cuda_pair_cu.h +++ /dev/null @@ -1,30 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - - Original Version: - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - See the README file in the top-level LAMMPS directory. - - ----------------------------------------------------------------------- - - USER-CUDA Package and associated modifications: - https://sourceforge.net/projects/lammpscuda/ - - Christian Trott, christian.trott@tu-ilmenau.de - Lars Winterfeld, lars.winterfeld@tu-ilmenau.de - Theoretical Physics II, University of Technology Ilmenau, Germany - - See the README file in the USER-CUDA directory. - - This software is distributed under the GNU General Public License. 
-------------------------------------------------------------------------- */ -#include "cuda_shared.h" - -extern "C" void Cuda_Pair_GenerateXType(cuda_shared_data* sdata); -extern "C" void Cuda_Pair_RevertXType(cuda_shared_data* sdata); -extern "C" void Cuda_Pair_GenerateVRadius(cuda_shared_data* sdata); -extern "C" void Cuda_Pair_GenerateOmegaRmass(cuda_shared_data* sdata); -extern "C" void Cuda_Pair_BuildXHold(cuda_shared_data* sdata); -extern "C" void Cuda_Pair_CollectForces(cuda_shared_data* sdata,int eflag, int vflag); diff --git a/lib/cuda/cuda_pair_kernel.cu b/lib/cuda/cuda_pair_kernel.cu deleted file mode 100644 index 72c0e0aa25..0000000000 --- a/lib/cuda/cuda_pair_kernel.cu +++ /dev/null @@ -1,1349 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - - Original Version: - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - See the README file in the top-level LAMMPS directory. - - ----------------------------------------------------------------------- - - USER-CUDA Package and associated modifications: - https://sourceforge.net/projects/lammpscuda/ - - Christian Trott, christian.trott@tu-ilmenau.de - Lars Winterfeld, lars.winterfeld@tu-ilmenau.de - Theoretical Physics II, University of Technology Ilmenau, Germany - - See the README file in the USER-CUDA directory. - - This software is distributed under the GNU General Public License. -------------------------------------------------------------------------- */ -#define EWALD_F 1.12837917 -#define EWALD_P 0.3275911 -#define A1 0.254829592 -#define A2 -0.284496736 -#define A3 1.421413741 -#define A4 -1.453152027 -#define A5 1.061405429 - - -template -__global__ void Pair_Kernel_TpA(int eflag, int vflag,int eflag_atom,int vflag_atom) -{ - ENERGY_FLOAT evdwl = ENERGY_F(0.0); - ENERGY_FLOAT ecoul = ENERGY_F(0.0); - - ENERGY_FLOAT* sharedE; - ENERGY_FLOAT* sharedECoul; - ENERGY_FLOAT* sharedV = &sharedmem[threadIdx.x]; - - if(eflag||eflag_atom) - { - sharedE = &sharedmem[threadIdx.x]; - sharedE[0] = ENERGY_F(0.0); - sharedV += blockDim.x; - if(coul_type!=COUL_NONE) - { - sharedECoul = sharedE + blockDim.x; - sharedECoul[0] = ENERGY_F(0.0); - sharedV += blockDim.x; - } - } - if(vflag||vflag_atom) - { - sharedV[0*blockDim.x] = ENERGY_F(0.0); - sharedV[1*blockDim.x] = ENERGY_F(0.0); - sharedV[2*blockDim.x] = ENERGY_F(0.0); - sharedV[3*blockDim.x] = ENERGY_F(0.0); - sharedV[4*blockDim.x] = ENERGY_F(0.0); - sharedV[5*blockDim.x] = ENERGY_F(0.0); - } - - int ii = (blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x; - - X_FLOAT xtmp,ytmp,ztmp; - X_FLOAT4 myxtype; - F_FLOAT fxtmp,fytmp,fztmp,fpair; - F_FLOAT delx,dely,delz; - F_FLOAT factor_lj,factor_coul; - F_FLOAT qtmp; - int itype,i,j; - int jnum=0; - int* jlist; - - if(ii < _inum) - { - i = _ilist[ii]; - - myxtype=fetchXType(i); - xtmp=myxtype.x; - ytmp=myxtype.y; - ztmp=myxtype.z; - itype=static_cast (myxtype.w); - - - fxtmp = F_F(0.0); - fytmp = F_F(0.0); - fztmp = F_F(0.0); - - if(coul_type!=COUL_NONE) - qtmp = fetchQ(i); - - jnum = _numneigh[i]; - jlist = &_neighbors[i]; - } - __syncthreads(); - - for (int jj = 0; jj < jnum; jj++) - { - if(ii < _inum) - if(jj (myxtype.w); - - - const F_FLOAT rsq = delx*delx + dely*dely + delz*delz; - - bool in_cutoff = rsq < (_cutsq_global > X_F(0.0)? 
_cutsq_global : _cutsq[itype * _cuda_ntypes + jtype]); - if (in_cutoff) - { - switch(pair_type) - { - case PAIR_BORN: - fpair += PairBornCuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl); - break; - case PAIR_BUCK: - fpair += PairBuckCuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl); - break; - case PAIR_CG_CMM: - fpair += PairLJSDKCuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl); - break; - case PAIR_LJ_CHARMM: - fpair += PairLJCharmmCuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl); - break; - case PAIR_LJ_CLASS2: - fpair += PairLJClass2Cuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl); - break; - case PAIR_LJ_CUT: - fpair += PairLJCutCuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl); - break; - case PAIR_LJ_EXPAND: - fpair += PairLJExpandCuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl); - break; - case PAIR_LJ_GROMACS: - fpair += PairLJGromacsCuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl); - break; - case PAIR_LJ_SMOOTH: - fpair += PairLJSmoothCuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl); - break; - case PAIR_LJ96_CUT: - fpair += PairLJ96CutCuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl); - break; - case PAIR_MORSE_R6: - fpair += PairMorseR6Cuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl); - break; - case PAIR_MORSE: - fpair += PairMorseCuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl); - break; - } - } - - if(coul_type!=COUL_NONE) - { - const F_FLOAT qiqj=qtmp*fetchQ(j); - if(qiqj*qiqj>1e-8) - { - const bool in_coul_cutoff = - rsq < (_cut_coulsq_global > X_F(0.0)? _cut_coulsq_global : _cut_coulsq[itype * _cuda_ntypes + jtype]); - if (in_coul_cutoff) - { - switch(coul_type) - { - case COUL_CHARMM: - fpair += CoulCharmmCuda_Eval(rsq,factor_coul,eflag,ecoul,qiqj); - break; - - case COUL_CHARMM_IMPLICIT: - fpair += CoulCharmmImplicitCuda_Eval(rsq,factor_coul,eflag,ecoul,qiqj); - break; - - case COUL_CUT: - { - const F_FLOAT forcecoul = factor_coul*_qqrd2e* qiqj*_RSQRT_(rsq); - if(eflag) - { - ecoul += forcecoul; - } - fpair += forcecoul*(F_F(1.0)/rsq); - } - break; - - case COUL_DEBYE: - { - const F_FLOAT r2inv = F_F(1.0)/rsq; - const X_FLOAT r = _RSQRT_(r2inv); - const X_FLOAT rinv = F_F(1.0)/r; - const F_FLOAT screening = _EXP_(-_kappa*r); - F_FLOAT forcecoul = factor_coul * _qqrd2e * qiqj * screening ; - if(eflag) - { - ecoul += forcecoul*rinv; - } - forcecoul *= (_kappa + rinv); - fpair += forcecoul*r2inv; - } - break; - - case COUL_GROMACS: - fpair += CoulGromacsCuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_coul,eflag,ecoul,qiqj); - break; - - case COUL_LONG: - { - const F_FLOAT r2inv = F_F(1.0)/rsq; - const F_FLOAT r = _RSQRT_(r2inv); - const F_FLOAT grij = _g_ewald * r; - const F_FLOAT expm2 = _EXP_(-grij*grij); - const F_FLOAT t = F_F(1.0) / (F_F(1.0) + EWALD_P*grij); - const F_FLOAT erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2; - const F_FLOAT prefactor = _qqrd2e* qiqj*(F_F(1.0)/r); - F_FLOAT forcecoul = prefactor * (erfc + EWALD_F*grij*expm2); - if (factor_coul < 1.0) forcecoul -= (1.0-factor_coul)*prefactor; - if(eflag) - { - ecoul += prefactor*erfc; - if (factor_coul < 1.0) ecoul -= (1.0-factor_coul)*prefactor; - } - fpair += forcecoul*r2inv; - } - break; - } - } - in_cutoff=in_cutoff || in_coul_cutoff; - } - } - - - if (in_cutoff) - { - F_FLOAT dxfp,dyfp,dzfp; - fxtmp += dxfp = delx*fpair; - fytmp += dyfp = dely*fpair; - fztmp += dzfp = delz*fpair; - if(vflag) - { - 
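/* For reference, the COUL_LONG branch above is the real-space Ewald pair term, with
   erfc evaluated via the Abramowitz-Stegun 7.1.26 rational approximation; EWALD_P
   and A1..A5 at the top of this file are exactly those coefficients, and
   EWALD_F = 2/sqrt(pi). In LaTeX:

     t = \frac{1}{1 + p\,gr}, \qquad
     \operatorname{erfc}(gr) \approx t\,(a_1 + t(a_2 + t(a_3 + t(a_4 + t\,a_5))))\,e^{-g^2 r^2}

     \frac{F(r)}{r} = \mathrm{qqrd2e}\,\frac{q_i q_j}{r^3}
       \left(\operatorname{erfc}(gr) + \frac{2\,gr}{\sqrt{\pi}}\,e^{-g^2 r^2}\right)

   which is what fpair accumulates as forcecoul*r2inv with
   forcecoul = prefactor * (erfc + EWALD_F*grij*expm2). */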
sharedV[0 * blockDim.x]+= delx*dxfp; - sharedV[1 * blockDim.x]+= dely*dyfp; - sharedV[2 * blockDim.x]+= delz*dzfp; - sharedV[3 * blockDim.x]+= delx*dyfp; - sharedV[4 * blockDim.x]+= delx*dzfp; - sharedV[5 * blockDim.x]+= dely*dzfp; - } - } - } - } - __syncthreads(); - if(ii < _inum) - { - F_FLOAT* my_f; - if(_collect_forces_later) - { - ENERGY_FLOAT* buffer = (ENERGY_FLOAT*) _buffer; - if(eflag) - { - buffer=&buffer[1 * gridDim.x * gridDim.y]; - if(coul_type!=COUL_NONE) - buffer=&buffer[1 * gridDim.x * gridDim.y]; - } - if(vflag) - { - buffer=&buffer[6 * gridDim.x * gridDim.y]; - } - my_f = (F_FLOAT*) buffer; - my_f += i; - *my_f = fxtmp; my_f += _nmax; - *my_f = fytmp; my_f += _nmax; - *my_f = fztmp; - } - else - { - my_f = _f + i; - *my_f += fxtmp; my_f += _nmax; - *my_f += fytmp; my_f += _nmax; - *my_f += fztmp; - } - } - __syncthreads(); - - if(eflag) - { - sharedE[0] = evdwl; - if(coul_type!=COUL_NONE) - sharedECoul[0] = ecoul; - } - if(eflag_atom && i<_nlocal) - { - if(coul_type!=COUL_NONE) - _eatom[i] += evdwl + ecoul; - else - _eatom[i] += evdwl; - } - - if(vflag_atom && i<_nlocal) - { - _vatom[i] += ENERGY_F(0.5) * sharedV[0 * blockDim.x]; - _vatom[i+_nmax] += ENERGY_F(0.5) * sharedV[1 * blockDim.x]; - _vatom[i+2*_nmax] += ENERGY_F(0.5) * sharedV[2 * blockDim.x]; - _vatom[i+3*_nmax] += ENERGY_F(0.5) * sharedV[3 * blockDim.x]; - _vatom[i+4*_nmax] += ENERGY_F(0.5) * sharedV[4 * blockDim.x]; - _vatom[i+5*_nmax] += ENERGY_F(0.5) * sharedV[5 * blockDim.x]; - } - if(vflag||eflag) PairVirialCompute_A_Kernel(eflag,vflag,coul_type!=COUL_NONE?1:0); - } - -template - __global__ void Pair_Kernel_BpA(int eflag, int vflag,int eflag_atom,int vflag_atom) -{ - int ii = (blockIdx.x*gridDim.y+blockIdx.y); - if( ii >= _inum ) - return; - - ENERGY_FLOAT evdwl = ENERGY_F(0.0); - ENERGY_FLOAT ecoul = ENERGY_F(0.0); - F_FLOAT3* sharedVirial1; - F_FLOAT3* sharedVirial2; - F_FLOAT* sharedEnergy; - F_FLOAT* sharedEnergyCoul; - - F_FLOAT3* sharedForce = (F_FLOAT3*) &sharedmem[0]; - if(vflag) - { - sharedVirial1 = &sharedForce[64]; - sharedVirial2 = &sharedVirial1[64]; - } - else - { - sharedVirial1 = &sharedForce[0]; - sharedVirial2 = &sharedVirial1[0]; - } - - if(eflag) - { - if(vflag||vflag_atom) - sharedEnergy = (F_FLOAT*) &sharedVirial2[64]; - else - sharedEnergy = (F_FLOAT*) &sharedForce[64]; - - if(coul_type!=COUL_NONE) - sharedEnergyCoul = (F_FLOAT*) &sharedEnergy[64]; - - } - - F_FLOAT3 partialForce = { F_F(0.0), F_F(0.0), F_F(0.0) }; - F_FLOAT3 partialVirial1 = { F_F(0.0), F_F(0.0), F_F(0.0) }; - F_FLOAT3 partialVirial2 = { F_F(0.0), F_F(0.0), F_F(0.0) }; - - X_FLOAT xtmp,ytmp,ztmp; - X_FLOAT4 myxtype; - F_FLOAT delx,dely,delz; - F_FLOAT factor_lj,factor_coul; - F_FLOAT fpair; - F_FLOAT qtmp; - int itype,jnum,i,j; - int* jlist; - - i = _ilist[ii]; - - myxtype = fetchXType(i); - - xtmp=myxtype.x; - ytmp=myxtype.y; - ztmp=myxtype.z; - itype=static_cast (myxtype.w); - - if(coul_type!=COUL_NONE) - qtmp = fetchQ(i); - - jnum = _numneigh[i]; - - jlist = &_neighbors[i*_maxneighbors]; - __syncthreads(); - for (int jj = threadIdx.x; jj < jnum+blockDim.x; jj+=blockDim.x) - { - if(jj (myxtype.w); - - const F_FLOAT rsq = delx*delx + dely*dely + delz*delz; - - bool in_cutoff = rsq < (_cutsq_global > X_F(0.0)? 
_cutsq_global : _cutsq[itype * _cuda_ntypes + jtype]); - bool in_coul_cutoff; - if (in_cutoff) - { - switch(pair_type) - { - case PAIR_BORN: - fpair += PairBornCuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl); - break; - case PAIR_BUCK: - fpair += PairBuckCuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl); - break; - case PAIR_CG_CMM: - fpair += PairLJSDKCuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl); - break; - case PAIR_LJ_CHARMM: - fpair += PairLJCharmmCuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl); - break; - case PAIR_LJ_CLASS2: - fpair += PairLJClass2Cuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl); - break; - case PAIR_LJ_CUT: - fpair += PairLJCutCuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl); - break; - case PAIR_LJ_EXPAND: - fpair += PairLJExpandCuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl); - break; - case PAIR_LJ_GROMACS: - fpair += PairLJGromacsCuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl); - break; - case PAIR_LJ_SMOOTH: - fpair += PairLJSmoothCuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl); - break; - case PAIR_LJ96_CUT: - fpair += PairLJ96CutCuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl); - break; - case PAIR_MORSE_R6: - fpair += PairMorseR6Cuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl); - break; - case PAIR_MORSE: - fpair += PairMorseCuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl); - break; - } - } - - if(coul_type!=COUL_NONE) - { - const F_FLOAT qiqj=qtmp*fetchQ(j); - if(qiqj*qiqj>(1e-8f)) - { - in_coul_cutoff = - rsq < (_cut_coulsq_global > X_F(0.0)? _cut_coulsq_global : _cut_coulsq[itype * _cuda_ntypes + jtype]); - if (in_coul_cutoff) - { - switch(coul_type) - { - case COUL_CHARMM: - fpair += CoulCharmmCuda_Eval(rsq,factor_coul,eflag,ecoul,qiqj); - break; - - case COUL_CHARMM_IMPLICIT: - fpair += CoulCharmmImplicitCuda_Eval(rsq,factor_coul,eflag,ecoul,qiqj); - break; - - case COUL_GROMACS: - fpair += CoulGromacsCuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_coul,eflag,ecoul,qiqj); - break; - - case COUL_LONG: - { - const F_FLOAT r2inv = F_F(1.0)/rsq; - const F_FLOAT r = _RSQRT_(r2inv); - const F_FLOAT grij = _g_ewald * r; - const F_FLOAT expm2 = _EXP_(-grij*grij); - const F_FLOAT t = F_F(1.0) / (F_F(1.0) + EWALD_P*grij); - const F_FLOAT erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2; - const F_FLOAT prefactor = _qqrd2e* qiqj*(F_F(1.0)/r); - F_FLOAT forcecoul = prefactor * (erfc + EWALD_F*grij*expm2); - if (factor_coul < 1.0) forcecoul -= (1.0-factor_coul)*prefactor; - if(eflag) - { - ecoul += prefactor*erfc; - if (factor_coul < 1.0) ecoul -= (1.0-factor_coul)*prefactor; - } - fpair += forcecoul*r2inv; - } - break; - - case COUL_DEBYE: - { - const F_FLOAT r2inv = F_F(1.0)/rsq; - const X_FLOAT r = _RSQRT_(r2inv); - const X_FLOAT rinv = F_F(1.0)/r; - const F_FLOAT screening = _EXP_(-_kappa*r); - F_FLOAT forcecoul = factor_coul * _qqrd2e * qiqj * screening ; - if(eflag) - { - ecoul += forcecoul*rinv; - } - forcecoul *= (_kappa + rinv); - fpair += forcecoul*r2inv; - } - break; - - case COUL_CUT: - { - const F_FLOAT forcecoul = factor_coul*_qqrd2e* qiqj*_RSQRT_(rsq); - if(eflag) - { - ecoul += forcecoul; - } - fpair += forcecoul*(F_F(1.0)/rsq); - } - break; - - - } - } - } - } - - - - if (in_cutoff||in_coul_cutoff) - { - F_FLOAT dxfp,dyfp,dzfp; - partialForce.x += dxfp = delx*fpair; - partialForce.y += dyfp = dely*fpair; - partialForce.z += dzfp = 
delz*fpair; - if(vflag) - { - partialVirial1.x+= delx*dxfp; - partialVirial1.y+= dely*dyfp; - partialVirial1.z+= delz*dzfp; - partialVirial2.x+= delx*dyfp; - partialVirial2.y+= delx*dzfp; - partialVirial2.z+= dely*dzfp; - } - } - } - } - - if(eflag) - { - sharedEnergy[threadIdx.x]= evdwl; - if(coul_type!=COUL_NONE) - sharedEnergyCoul[threadIdx.x]= ecoul; - } - sharedForce[threadIdx.x]=partialForce; - if(vflag) - { - sharedVirial1[threadIdx.x]=partialVirial1; - sharedVirial2[threadIdx.x]=partialVirial2; - } - - __syncthreads(); - - - for( unsigned int s = blockDim.x >> 1; s > 0; s >>= 1 ) - { - - if( threadIdx.x < s ) - { - sharedForce[ threadIdx.x ].x += sharedForce[ threadIdx.x + s ].x; - sharedForce[ threadIdx.x ].y += sharedForce[ threadIdx.x + s ].y; - sharedForce[ threadIdx.x ].z += sharedForce[ threadIdx.x + s ].z; - - if(vflag) - { - sharedVirial1[ threadIdx.x ].x += sharedVirial1[ threadIdx.x + s ].x; - sharedVirial1[ threadIdx.x ].y += sharedVirial1[ threadIdx.x + s ].y; - sharedVirial1[ threadIdx.x ].z += sharedVirial1[ threadIdx.x + s ].z; - - sharedVirial2[ threadIdx.x ].x += sharedVirial2[ threadIdx.x + s ].x; - sharedVirial2[ threadIdx.x ].y += sharedVirial2[ threadIdx.x + s ].y; - sharedVirial2[ threadIdx.x ].z += sharedVirial2[ threadIdx.x + s ].z; - } - - if(eflag) - { - sharedEnergy[ threadIdx.x ] += sharedEnergy[ threadIdx.x + s ]; - if(coul_type!=COUL_NONE) - sharedEnergyCoul[ threadIdx.x ] += sharedEnergyCoul[ threadIdx.x + s ]; - } - } - __syncthreads(); - } - - if(threadIdx.x == 0) - { - - ENERGY_FLOAT* buffer = (ENERGY_FLOAT*) _buffer; - if(eflag) - { - ENERGY_FLOAT tmp_evdwl; - buffer[blockIdx.x * gridDim.y + blockIdx.y + 0 * gridDim.x * gridDim.y]=tmp_evdwl=ENERGY_F(0.5) * sharedEnergy[0]; - if(eflag_atom) - _eatom[i] = tmp_evdwl; - buffer=&buffer[gridDim.x * gridDim.y]; - if(coul_type!=COUL_NONE) - { - buffer[blockIdx.x * gridDim.y + blockIdx.y + 0 * gridDim.x * gridDim.y]=tmp_evdwl=ENERGY_F(0.5) * sharedEnergyCoul[0]; - if(eflag_atom) - _eatom[i] += tmp_evdwl; - buffer=&buffer[gridDim.x * gridDim.y]; - } - } - if(vflag) - { - ENERGY_FLOAT tmp; - buffer[blockIdx.x * gridDim.y + blockIdx.y + 0 * gridDim.x * gridDim.y]= tmp = ENERGY_F(0.5) * sharedVirial1[0].x; - if(vflag_atom) _vatom[i+0*_nmax] = tmp; - buffer[blockIdx.x * gridDim.y + blockIdx.y + 1 * gridDim.x * gridDim.y]= tmp = ENERGY_F(0.5) * sharedVirial1[0].y; - if(vflag_atom) _vatom[i+1*_nmax] = tmp; - buffer[blockIdx.x * gridDim.y + blockIdx.y + 2 * gridDim.x * gridDim.y]= tmp = ENERGY_F(0.5) * sharedVirial1[0].z; - if(vflag_atom) _vatom[i+2*_nmax] = tmp; - buffer[blockIdx.x * gridDim.y + blockIdx.y + 3 * gridDim.x * gridDim.y]= tmp = ENERGY_F(0.5) * sharedVirial2[0].x; - if(vflag_atom) _vatom[i+3*_nmax] = tmp; - buffer[blockIdx.x * gridDim.y + blockIdx.y + 4 * gridDim.x * gridDim.y]= tmp = ENERGY_F(0.5) * sharedVirial2[0].y; - if(vflag_atom) _vatom[i+4*_nmax] = tmp; - buffer[blockIdx.x * gridDim.y + blockIdx.y + 5 * gridDim.x * gridDim.y]= tmp = ENERGY_F(0.5) * sharedVirial2[0].z; - if(vflag_atom) _vatom[i+5*_nmax] = tmp; - buffer=&buffer[6 * gridDim.x * gridDim.y]; - } - F_FLOAT* my_f; - if(_collect_forces_later) - { - my_f = (F_FLOAT*) buffer; - my_f += i; - *my_f = sharedForce[0].x; my_f += _nmax; - *my_f = sharedForce[0].y; my_f += _nmax; - *my_f = sharedForce[0].z; - } - else - { - my_f = _f + i; - *my_f += sharedForce[0].x; my_f += _nmax; - *my_f += sharedForce[0].y; my_f += _nmax; - *my_f += sharedForce[0].z; - } - } -} - - -template -__global__ void Pair_Kernel_TpA_opt(int eflag, int vflag,int 
eflag_atom,int vflag_atom, int comm_phase) -{ - ENERGY_FLOAT evdwl = ENERGY_F(0.0); - ENERGY_FLOAT ecoul = ENERGY_F(0.0); - - ENERGY_FLOAT* sharedE; - ENERGY_FLOAT* sharedECoul; - ENERGY_FLOAT* sharedV = &sharedmem[threadIdx.x]; - - if(eflag||eflag_atom) - { - sharedE = &sharedmem[threadIdx.x]; - sharedE[0] = ENERGY_F(0.0); - sharedV += blockDim.x; - if(coul_type!=COUL_NONE) - { - sharedECoul = sharedE + blockDim.x; - sharedECoul[0] = ENERGY_F(0.0); - sharedV += blockDim.x; - } - } - if(vflag||vflag_atom) - { - sharedV[0*blockDim.x] = ENERGY_F(0.0); - sharedV[1*blockDim.x] = ENERGY_F(0.0); - sharedV[2*blockDim.x] = ENERGY_F(0.0); - sharedV[3*blockDim.x] = ENERGY_F(0.0); - sharedV[4*blockDim.x] = ENERGY_F(0.0); - sharedV[5*blockDim.x] = ENERGY_F(0.0); - } - - int ii = (blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x; - - X_FLOAT xtmp,ytmp,ztmp; - X_FLOAT4 myxtype; - F_FLOAT fxtmp,fytmp,fztmp,fpair; - F_FLOAT delx,dely,delz; - F_FLOAT factor_lj,factor_coul; - F_FLOAT qtmp; - int itype,i,j; - int jnum=0; - int* jlist; - - if(ii < (comm_phase<2?_inum:_inum_border[0])) - { - i = comm_phase<2? _ilist[ii] : _ilist_border[ii] ; - - myxtype=fetchXType(i); - myxtype=_x_type[i]; - xtmp=myxtype.x; - ytmp=myxtype.y; - ztmp=myxtype.z; - itype=static_cast (myxtype.w); - - - fxtmp = F_F(0.0); - fytmp = F_F(0.0); - fztmp = F_F(0.0); - - if(coul_type!=COUL_NONE) - qtmp = fetchQ(i); - jnum = comm_phase==0? _numneigh[i]: (comm_phase==1?_numneigh_inner[i]:_numneigh_border[ii]); - - - jlist = comm_phase==0? &_neighbors[i]: (comm_phase==1?&_neighbors_inner[i]:&_neighbors_border[ii]); - } - __syncthreads(); - - for (int jj = 0; jj < jnum; jj++) - { - if(ii < (comm_phase<2?_inum:_inum_border[0])) - if(jj (myxtype.w); - - - const F_FLOAT rsq = delx*delx + dely*dely + delz*delz; - - bool in_cutoff = rsq < (_cutsq_global > X_F(0.0)? _cutsq_global : _cutsq[itype * _cuda_ntypes + jtype]); - if (in_cutoff) - { - switch(pair_type) - { - case PAIR_BORN: - fpair += PairBornCuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl); - break; - case PAIR_BUCK: - fpair += PairBuckCuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl); - break; - case PAIR_CG_CMM: - fpair += PairLJSDKCuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl); - break; - case PAIR_LJ_CHARMM: - fpair += PairLJCharmmCuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl); - break; - case PAIR_LJ_CLASS2: - fpair += PairLJClass2Cuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl); - break; - case PAIR_LJ_CUT: - fpair += PairLJCutCuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl); - break; - case PAIR_LJ_EXPAND: - fpair += PairLJExpandCuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl); - break; - case PAIR_LJ_GROMACS: - fpair += PairLJGromacsCuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl); - break; - case PAIR_LJ_SMOOTH: - fpair += PairLJSmoothCuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl); - break; - case PAIR_LJ96_CUT: - fpair += PairLJ96CutCuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl); - break; - case PAIR_MORSE_R6: - fpair += PairMorseR6Cuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl); - break; - case PAIR_MORSE: - fpair += PairMorseCuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl); - break; - } - } - - if(coul_type!=COUL_NONE) - { - const F_FLOAT qiqj=qtmp*fetchQ(j); - if(qiqj*qiqj>1e-8) - { - const bool in_coul_cutoff = - rsq < (_cut_coulsq_global > X_F(0.0)? 
_cut_coulsq_global : _cut_coulsq[itype * _cuda_ntypes + jtype]); - if (in_coul_cutoff) - { - switch(coul_type) - { - case COUL_CHARMM: - fpair += CoulCharmmCuda_Eval(rsq,factor_coul,eflag,ecoul,qiqj); - break; - - case COUL_CHARMM_IMPLICIT: - fpair += CoulCharmmImplicitCuda_Eval(rsq,factor_coul,eflag,ecoul,qiqj); - break; - - case COUL_CUT: - { - const F_FLOAT forcecoul = factor_coul*_qqrd2e* qiqj*_RSQRT_(rsq); - if(eflag) - { - ecoul += forcecoul; - } - fpair += forcecoul*(F_F(1.0)/rsq); - } - break; - - case COUL_DEBYE: - { - const F_FLOAT r2inv = F_F(1.0)/rsq; - const X_FLOAT r = _RSQRT_(r2inv); - const X_FLOAT rinv = F_F(1.0)/r; - const F_FLOAT screening = _EXP_(-_kappa*r); - F_FLOAT forcecoul = factor_coul * _qqrd2e * qiqj * screening ; - if(eflag) - { - ecoul += forcecoul*rinv; - } - forcecoul *= (_kappa + rinv); - fpair += forcecoul*r2inv; - } - break; - - case COUL_GROMACS: - fpair += CoulGromacsCuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_coul,eflag,ecoul,qiqj); - break; - - case COUL_LONG: - { - const F_FLOAT r2inv = F_F(1.0)/rsq; - const F_FLOAT r = _RSQRT_(r2inv); - const F_FLOAT grij = _g_ewald * r; - const F_FLOAT expm2 = _EXP_(-grij*grij); - const F_FLOAT t = F_F(1.0) / (F_F(1.0) + EWALD_P*grij); - const F_FLOAT erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2; - const F_FLOAT prefactor = _qqrd2e* qiqj*(F_F(1.0)/r); - F_FLOAT forcecoul = prefactor * (erfc + EWALD_F*grij*expm2); - if (factor_coul < 1.0) forcecoul -= (1.0-factor_coul)*prefactor; - if(eflag) - { - ecoul += prefactor*erfc; - if (factor_coul < 1.0) ecoul -= (1.0-factor_coul)*prefactor; - } - fpair += forcecoul*r2inv; - } - break; - - } - } - in_cutoff=in_cutoff || in_coul_cutoff; - } - } - - - if (in_cutoff) - { - F_FLOAT dxfp,dyfp,dzfp; - fxtmp += dxfp = delx*fpair; - fytmp += dyfp = dely*fpair; - fztmp += dzfp = delz*fpair; - if(vflag) - { - sharedV[0 * blockDim.x]+= delx*dxfp; - sharedV[1 * blockDim.x]+= dely*dyfp; - sharedV[2 * blockDim.x]+= delz*dzfp; - sharedV[3 * blockDim.x]+= delx*dyfp; - sharedV[4 * blockDim.x]+= delx*dzfp; - sharedV[5 * blockDim.x]+= dely*dzfp; - } - } - } - } - __syncthreads(); - if(ii < (comm_phase<2?_inum:_inum_border[0])) - { - F_FLOAT* my_f; - if(_collect_forces_later) - { - ENERGY_FLOAT* buffer = (ENERGY_FLOAT*) _buffer; - if(eflag) - { - buffer=&buffer[1 * gridDim.x * gridDim.y]; - if(coul_type!=COUL_NONE) - buffer=&buffer[1 * gridDim.x * gridDim.y]; - } - if(vflag) - { - buffer=&buffer[6 * gridDim.x * gridDim.y]; - } - my_f = (F_FLOAT*) buffer; - my_f += i; - *my_f = fxtmp; my_f += _nmax; - *my_f = fytmp; my_f += _nmax; - *my_f = fztmp; - } - else - { - my_f = _f + i; - *my_f += fxtmp; my_f += _nmax; - *my_f += fytmp; my_f += _nmax; - *my_f += fztmp; - } - } - __syncthreads(); - - if(eflag) - { - sharedE[0] = evdwl; - if(coul_type!=COUL_NONE) - sharedECoul[0] = ecoul; - } - if(eflag_atom && i<_nlocal) - { - if(coul_type!=COUL_NONE) - _eatom[i] += evdwl + ecoul; - else - _eatom[i] += evdwl; - } - - if(vflag_atom && i<_nlocal) - { - _vatom[i] += ENERGY_F(0.5) * sharedV[0 * blockDim.x]; - _vatom[i+_nmax] += ENERGY_F(0.5) * sharedV[1 * blockDim.x]; - _vatom[i+2*_nmax] += ENERGY_F(0.5) * sharedV[2 * blockDim.x]; - _vatom[i+3*_nmax] += ENERGY_F(0.5) * sharedV[3 * blockDim.x]; - _vatom[i+4*_nmax] += ENERGY_F(0.5) * sharedV[4 * blockDim.x]; - _vatom[i+5*_nmax] += ENERGY_F(0.5) * sharedV[5 * blockDim.x]; - } - if(vflag||eflag) PairVirialCompute_A_Kernel(eflag,vflag,coul_type!=COUL_NONE?1:0); - } - -template - __global__ void Pair_Kernel_BpA_opt(int eflag, int vflag,int 
eflag_atom,int vflag_atom, int comm_phase) -{ - int ii = (blockIdx.x*gridDim.y+blockIdx.y); - if( ii >= (comm_phase<2?_inum:_inum_border[0])) - return; - - ENERGY_FLOAT evdwl = ENERGY_F(0.0); - ENERGY_FLOAT ecoul = ENERGY_F(0.0); - F_FLOAT3* sharedVirial1; - F_FLOAT3* sharedVirial2; - F_FLOAT* sharedEnergy; - F_FLOAT* sharedEnergyCoul; - - F_FLOAT3* sharedForce = (F_FLOAT3*) &sharedmem[0]; - if(vflag) - { - sharedVirial1 = &sharedForce[64]; - sharedVirial2 = &sharedVirial1[64]; - } - else - { - sharedVirial1 = &sharedForce[0]; - sharedVirial2 = &sharedVirial1[0]; - } - - if(eflag) - { - if(vflag||vflag_atom) - sharedEnergy = (F_FLOAT*) &sharedVirial2[64]; - else - sharedEnergy = (F_FLOAT*) &sharedForce[64]; - - if(coul_type!=COUL_NONE) - sharedEnergyCoul = (F_FLOAT*) &sharedEnergy[64]; - - } - - F_FLOAT3 partialForce = { F_F(0.0), F_F(0.0), F_F(0.0) }; - F_FLOAT3 partialVirial1 = { F_F(0.0), F_F(0.0), F_F(0.0) }; - F_FLOAT3 partialVirial2 = { F_F(0.0), F_F(0.0), F_F(0.0) }; - - X_FLOAT xtmp,ytmp,ztmp; - X_FLOAT4 myxtype; - F_FLOAT delx,dely,delz; - F_FLOAT factor_lj,factor_coul; - F_FLOAT fpair; - F_FLOAT qtmp; - int itype,jnum,i,j; - int* jlist; - - i = comm_phase<2? _ilist[ii] : _ilist_border[ii]; - - myxtype = fetchXType(i); - - xtmp=myxtype.x; - ytmp=myxtype.y; - ztmp=myxtype.z; - itype=static_cast (myxtype.w); - - if(coul_type!=COUL_NONE) - qtmp = fetchQ(i); - - jnum = comm_phase==0? _numneigh[i]: (comm_phase==1?_numneigh_inner[i]:_numneigh_border[ii]); - - jlist = comm_phase==0? &_neighbors[i*_maxneighbors]: (comm_phase==1?&_neighbors_inner[i*_maxneighbors]:&_neighbors_border[ii*_maxneighbors]); - __syncthreads(); - for (int jj = threadIdx.x; jj < jnum+blockDim.x; jj+=blockDim.x) - { - if(jj (myxtype.w); - - const F_FLOAT rsq = delx*delx + dely*dely + delz*delz; - - bool in_cutoff = rsq < (_cutsq_global > X_F(0.0)? 
_cutsq_global : _cutsq[itype * _cuda_ntypes + jtype]); - bool in_coul_cutoff; - if (in_cutoff) - { - switch(pair_type) - { - case PAIR_BORN: - fpair += PairBornCuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl); - break; - case PAIR_BUCK: - fpair += PairBuckCuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl); - break; - case PAIR_CG_CMM: - fpair += PairLJSDKCuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl); - break; - case PAIR_LJ_CHARMM: - fpair += PairLJCharmmCuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl); - break; - case PAIR_LJ_CLASS2: - fpair += PairLJClass2Cuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl); - break; - case PAIR_LJ_CUT: - fpair += PairLJCutCuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl); - break; - case PAIR_LJ_EXPAND: - fpair += PairLJExpandCuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl); - break; - case PAIR_LJ_GROMACS: - fpair += PairLJGromacsCuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl); - break; - case PAIR_LJ_SMOOTH: - fpair += PairLJSmoothCuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl); - break; - case PAIR_LJ96_CUT: - fpair += PairLJ96CutCuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl); - break; - case PAIR_MORSE_R6: - fpair += PairMorseR6Cuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl); - break; - case PAIR_MORSE: - fpair += PairMorseCuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl); - break; - } - } - - if(coul_type!=COUL_NONE) - { - const F_FLOAT qiqj=qtmp*fetchQ(j); - if(qiqj*qiqj>(1e-8f)) - { - in_coul_cutoff = - rsq < (_cut_coulsq_global > X_F(0.0)? _cut_coulsq_global : _cut_coulsq[itype * _cuda_ntypes + jtype]); - if (in_coul_cutoff) - { - switch(coul_type) - { - case COUL_CHARMM: - fpair += CoulCharmmCuda_Eval(rsq,factor_coul,eflag,ecoul,qiqj); - break; - - case COUL_CHARMM_IMPLICIT: - fpair += CoulCharmmImplicitCuda_Eval(rsq,factor_coul,eflag,ecoul,qiqj); - break; - - case COUL_GROMACS: - fpair += CoulGromacsCuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_coul,eflag,ecoul,qiqj); - break; - - case COUL_LONG: - { - const F_FLOAT r2inv = F_F(1.0)/rsq; - const F_FLOAT r = _RSQRT_(r2inv); - const F_FLOAT grij = _g_ewald * r; - const F_FLOAT expm2 = _EXP_(-grij*grij); - const F_FLOAT t = F_F(1.0) / (F_F(1.0) + EWALD_P*grij); - const F_FLOAT erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2; - const F_FLOAT prefactor = _qqrd2e* qiqj*(F_F(1.0)/r); - F_FLOAT forcecoul = prefactor * (erfc + EWALD_F*grij*expm2); - if (factor_coul < 1.0) forcecoul -= (1.0-factor_coul)*prefactor; - if(eflag) - { - ecoul += prefactor*erfc; - if (factor_coul < 1.0) ecoul -= (1.0-factor_coul)*prefactor; - } - fpair += forcecoul*r2inv; - } - break; - - case COUL_DEBYE: - { - const F_FLOAT r2inv = F_F(1.0)/rsq; - const X_FLOAT r = _RSQRT_(r2inv); - const X_FLOAT rinv = F_F(1.0)/r; - const F_FLOAT screening = _EXP_(-_kappa*r); - F_FLOAT forcecoul = factor_coul * _qqrd2e * qiqj * screening ; - if(eflag) - { - ecoul += forcecoul*rinv; - } - forcecoul *= (_kappa + rinv); - fpair += forcecoul*r2inv; - } - break; - - case COUL_CUT: - { - const F_FLOAT forcecoul = factor_coul*_qqrd2e* qiqj*_RSQRT_(rsq); - if(eflag) - { - ecoul += forcecoul; - } - fpair += forcecoul*(F_F(1.0)/rsq); - } - break; - - - } - } - } - } - - - - if (in_cutoff||in_coul_cutoff) - { - F_FLOAT dxfp,dyfp,dzfp; - partialForce.x += dxfp = delx*fpair; - partialForce.y += dyfp = dely*fpair; - partialForce.z += dzfp = 
delz*fpair; - if(vflag) - { - partialVirial1.x+= delx*dxfp; - partialVirial1.y+= dely*dyfp; - partialVirial1.z+= delz*dzfp; - partialVirial2.x+= delx*dyfp; - partialVirial2.y+= delx*dzfp; - partialVirial2.z+= dely*dzfp; - } - } - } - } - - if(eflag) - { - sharedEnergy[threadIdx.x]= evdwl; - if(coul_type!=COUL_NONE) - sharedEnergyCoul[threadIdx.x]= ecoul; - } - sharedForce[threadIdx.x]=partialForce; - if(vflag) - { - sharedVirial1[threadIdx.x]=partialVirial1; - sharedVirial2[threadIdx.x]=partialVirial2; - } - - __syncthreads(); - - - for( unsigned int s = blockDim.x >> 1; s > 0; s >>= 1 ) - { - - if( threadIdx.x < s ) - { - sharedForce[ threadIdx.x ].x += sharedForce[ threadIdx.x + s ].x; - sharedForce[ threadIdx.x ].y += sharedForce[ threadIdx.x + s ].y; - sharedForce[ threadIdx.x ].z += sharedForce[ threadIdx.x + s ].z; - - if(vflag) - { - sharedVirial1[ threadIdx.x ].x += sharedVirial1[ threadIdx.x + s ].x; - sharedVirial1[ threadIdx.x ].y += sharedVirial1[ threadIdx.x + s ].y; - sharedVirial1[ threadIdx.x ].z += sharedVirial1[ threadIdx.x + s ].z; - - sharedVirial2[ threadIdx.x ].x += sharedVirial2[ threadIdx.x + s ].x; - sharedVirial2[ threadIdx.x ].y += sharedVirial2[ threadIdx.x + s ].y; - sharedVirial2[ threadIdx.x ].z += sharedVirial2[ threadIdx.x + s ].z; - } - - if(eflag) - { - sharedEnergy[ threadIdx.x ] += sharedEnergy[ threadIdx.x + s ]; - if(coul_type!=COUL_NONE) - sharedEnergyCoul[ threadIdx.x ] += sharedEnergyCoul[ threadIdx.x + s ]; - } - } - __syncthreads(); - } - - if(threadIdx.x == 0) - { - - ENERGY_FLOAT* buffer = (ENERGY_FLOAT*) _buffer; - if(eflag) - { - ENERGY_FLOAT tmp_evdwl; - buffer[blockIdx.x * gridDim.y + blockIdx.y + 0 * gridDim.x * gridDim.y]=tmp_evdwl=ENERGY_F(0.5) * sharedEnergy[0]; - if(eflag_atom) - _eatom[i] = tmp_evdwl; - buffer=&buffer[gridDim.x * gridDim.y]; - if(coul_type!=COUL_NONE) - { - buffer[blockIdx.x * gridDim.y + blockIdx.y + 0 * gridDim.x * gridDim.y]=tmp_evdwl=ENERGY_F(0.5) * sharedEnergyCoul[0]; - if(eflag_atom) - _eatom[i] += tmp_evdwl; - buffer=&buffer[gridDim.x * gridDim.y]; - } - } - if(vflag) - { - ENERGY_FLOAT tmp; - buffer[blockIdx.x * gridDim.y + blockIdx.y + 0 * gridDim.x * gridDim.y]= tmp = ENERGY_F(0.5) * sharedVirial1[0].x; - if(vflag_atom) _vatom[i+0*_nmax] = tmp; - buffer[blockIdx.x * gridDim.y + blockIdx.y + 1 * gridDim.x * gridDim.y]= tmp = ENERGY_F(0.5) * sharedVirial1[0].y; - if(vflag_atom) _vatom[i+1*_nmax] = tmp; - buffer[blockIdx.x * gridDim.y + blockIdx.y + 2 * gridDim.x * gridDim.y]= tmp = ENERGY_F(0.5) * sharedVirial1[0].z; - if(vflag_atom) _vatom[i+2*_nmax] = tmp; - buffer[blockIdx.x * gridDim.y + blockIdx.y + 3 * gridDim.x * gridDim.y]= tmp = ENERGY_F(0.5) * sharedVirial2[0].x; - if(vflag_atom) _vatom[i+3*_nmax] = tmp; - buffer[blockIdx.x * gridDim.y + blockIdx.y + 4 * gridDim.x * gridDim.y]= tmp = ENERGY_F(0.5) * sharedVirial2[0].y; - if(vflag_atom) _vatom[i+4*_nmax] = tmp; - buffer[blockIdx.x * gridDim.y + blockIdx.y + 5 * gridDim.x * gridDim.y]= tmp = ENERGY_F(0.5) * sharedVirial2[0].z; - if(vflag_atom) _vatom[i+5*_nmax] = tmp; - buffer=&buffer[6 * gridDim.x * gridDim.y]; - } - F_FLOAT* my_f; - if(_collect_forces_later) - { - my_f = (F_FLOAT*) buffer; - my_f += i; - *my_f = sharedForce[0].x; my_f += _nmax; - *my_f = sharedForce[0].y; my_f += _nmax; - *my_f = sharedForce[0].z; - } - else - { - my_f = _f + i; - *my_f += sharedForce[0].x; my_f += _nmax; - *my_f += sharedForce[0].y; my_f += _nmax; - *my_f += sharedForce[0].z; - } - } -} - -__global__ void Pair_GenerateXType_Kernel() -{ - int 
i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x; - if(i < _nall) - { - X_FLOAT4 xtype; - xtype.x=_x[i]; - xtype.y=_x[i+_nmax]; - xtype.z=_x[i+2*_nmax]; - xtype.w=_type[i]; - _x_type[i]=xtype; - } - -} - -__global__ void Pair_GenerateVRadius_Kernel() -{ - int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x; - if(i < _nall) - { - V_FLOAT4 vradius; - vradius.x=_v[i]; - vradius.y=_v[i+_nmax]; - vradius.z=_v[i+2*_nmax]; - vradius.w=_radius[i]; - _v_radius[i]=vradius; - } -} - -__global__ void Pair_GenerateOmegaRmass_Kernel() -{ - int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x; - if(i < _nall) - { - V_FLOAT4 omegarmass; - omegarmass.x=_omega[i]; - omegarmass.y=_omega[i+_nmax]; - omegarmass.z=_omega[i+2*_nmax]; - omegarmass.w=_rmass[i]; - _omega_rmass[i]=omegarmass; - } -} - -__global__ void Pair_RevertXType_Kernel() -{ - int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x; - if(i < _nall) - { - X_FLOAT4 xtype=_x_type[i]; - _x[i]=xtype.x; - _x[i+_nmax]=xtype.y; - _x[i+2*_nmax]=xtype.z; - _type[i]=static_cast <int> (xtype.w); - } - -} - -__global__ void Pair_BuildXHold_Kernel() -{ - int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x; - if(i < _nall) - { - X_FLOAT4 xtype=_x_type[i]; - _xhold[i]=xtype.x; - _xhold[i+_nmax]=xtype.y; - _xhold[i+2*_nmax]=xtype.z; - } - -} - -__global__ void Pair_CollectForces_Kernel(int nperblock,int n) -{ - int i = (blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x; - if(i>=_nlocal) return; - ENERGY_FLOAT* buf = (ENERGY_FLOAT*) _buffer; - - F_FLOAT* buf_f = (F_FLOAT*) &buf[nperblock * n]; - F_FLOAT* my_f = _f + i; - buf_f += i; - *my_f += * buf_f; my_f+=_nmax; buf_f+=_nmax; - *my_f += * buf_f; my_f+=_nmax; buf_f+=_nmax; - *my_f += * buf_f; my_f+=_nmax; -} diff --git a/lib/cuda/cuda_pair_virial_kernel_nc.cu b/lib/cuda/cuda_pair_virial_kernel_nc.cu deleted file mode 100644 index b4bacf748a..0000000000 --- a/lib/cuda/cuda_pair_virial_kernel_nc.cu +++ /dev/null @@ -1,126 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - - Original Version: - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - See the README file in the top-level LAMMPS directory. - - ----------------------------------------------------------------------- - - USER-CUDA Package and associated modifications: - https://sourceforge.net/projects/lammpscuda/ - - Christian Trott, christian.trott@tu-ilmenau.de - Lars Winterfeld, lars.winterfeld@tu-ilmenau.de - Theoretical Physics II, University of Technology Ilmenau, Germany - - See the README file in the USER-CUDA directory. - - This software is distributed under the GNU General Public License.
-------------------------------------------------------------------------- */ - -extern __shared__ ENERGY_FLOAT sharedmem[]; - -static inline __device__ void PairVirialCompute_A_Kernel(int eflag,int vflag,int coulflag=0) -{ - __syncthreads(); - ENERGY_FLOAT* shared=sharedmem; - - if(eflag) - { - reduceBlock(shared); - shared+=blockDim.x; - if(coulflag) - { - reduceBlock(shared); - shared+=blockDim.x; - } - } - if(vflag) - { - reduceBlock(shared + 0 * blockDim.x); - reduceBlock(shared + 1 * blockDim.x); - reduceBlock(shared + 2 * blockDim.x); - reduceBlock(shared + 3 * blockDim.x); - reduceBlock(shared + 4 * blockDim.x); - reduceBlock(shared + 5 * blockDim.x); - } - if(threadIdx.x == 0) - { - shared=sharedmem; - ENERGY_FLOAT* buffer = (ENERGY_FLOAT*) _buffer; - if(eflag) - { - buffer[blockIdx.x * gridDim.y + blockIdx.y] = ENERGY_F(0.5)*shared[0]; - shared+=blockDim.x; buffer+=gridDim.x * gridDim.y; - if(coulflag) - { - buffer[blockIdx.x * gridDim.y + blockIdx.y] = ENERGY_F(0.5)*shared[0]; - shared+=blockDim.x; buffer+=gridDim.x * gridDim.y; - } - } - if(vflag) - { - buffer[blockIdx.x * gridDim.y + blockIdx.y + 0 * gridDim.x * gridDim.y] = ENERGY_F(0.5)*shared[0 * blockDim.x]; - buffer[blockIdx.x * gridDim.y + blockIdx.y + 1 * gridDim.x * gridDim.y] = ENERGY_F(0.5)*shared[1 * blockDim.x]; - buffer[blockIdx.x * gridDim.y + blockIdx.y + 2 * gridDim.x * gridDim.y] = ENERGY_F(0.5)*shared[2 * blockDim.x]; - buffer[blockIdx.x * gridDim.y + blockIdx.y + 3 * gridDim.x * gridDim.y] = ENERGY_F(0.5)*shared[3 * blockDim.x]; - buffer[blockIdx.x * gridDim.y + blockIdx.y + 4 * gridDim.x * gridDim.y] = ENERGY_F(0.5)*shared[4 * blockDim.x]; - buffer[blockIdx.x * gridDim.y + blockIdx.y + 5 * gridDim.x * gridDim.y] = ENERGY_F(0.5)*shared[5 * blockDim.x]; - } - } - __syncthreads(); -} - -__global__ void MY_AP(PairVirialCompute_reduce)(int n) -{ - sharedmem[threadIdx.x] = ENERGY_F(0.0); - ENERGY_FLOAT sum = ENERGY_F(0.0); - ENERGY_FLOAT* buf = (ENERGY_FLOAT*) _buffer; - buf = &buf[blockIdx.x * n]; - //if(blockIdx.x==2) buf=&buf[n]; - - for(int i = 0; i < n; i += blockDim.x) - { - sharedmem[threadIdx.x] = (i + threadIdx.x < n) ? buf[i + threadIdx.x] : ENERGY_F(0.0); - __syncthreads(); - reduceBlock(sharedmem); - if(threadIdx.x == 0) sum += sharedmem[0]; - } - if(threadIdx.x==0) - { - if(gridDim.x == 1) //evdwl - { - _eng_vdwl[0]+=sum; - } - if(gridDim.x == 2) //evdwl + ecoul only - { - if(blockIdx.x==0) - _eng_vdwl[0]+=sum; - else - _eng_coul[0]+=sum; - } - if(gridDim.x == 6) //virial - { - _virial[blockIdx.x] += sum; - } - if(gridDim.x == 7) //evdwl+virial - { - if(blockIdx.x==0) - _eng_vdwl[0]+=sum; - else _virial[blockIdx.x-1] += sum; - } - if(gridDim.x == 8) //evdwl+ecoul+virial - { - if(blockIdx.x==0) - _eng_vdwl[0]+=sum; - else - if(blockIdx.x==1) - _eng_coul[0]+=sum; - else - _virial[blockIdx.x-2] += sum; - } - } -} diff --git a/lib/cuda/cuda_precision.h b/lib/cuda/cuda_precision.h deleted file mode 100644 index 7300fc50d7..0000000000 --- a/lib/cuda/cuda_precision.h +++ /dev/null @@ -1,284 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - - Original Version: - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - See the README file in the top-level LAMMPS directory. 
- - ----------------------------------------------------------------------- - - USER-CUDA Package and associated modifications: - https://sourceforge.net/projects/lammpscuda/ - - Christian Trott, christian.trott@tu-ilmenau.de - Lars Winterfeld, lars.winterfeld@tu-ilmenau.de - Theoretical Physics II, University of Technology Ilmenau, Germany - - See the README file in the USER-CUDA directory. - - This software is distributed under the GNU General Public License. -------------------------------------------------------------------------- */ - -#ifndef CUDA_PRECISION_H_ -#define CUDA_PRECISION_H_ -/* This file gives type definitions for mixed-precision calculation in the CUDA part of LAMMPS-CUDA. - * The default behaviour is set by the global CUDA_PRECISION (can be overridden during compilation). - * ***_FLOAT: type definition of the given property - * ***_F: literal suffix for constants in code (1.0 is interpreted as double while 1.0f is interpreted as float; write CUDA_F(1.0) to get the suffix that matches the configured precision) - */ - -#ifdef CUDA_USE_BINNING -#define CUDA_IF_BINNING(a) a -#else -#define CUDA_IF_BINNING(a) -#endif - -//GLOBAL - -#ifdef CUDA_PRECISION - #if CUDA_PRECISION == 1 - #define CUDA_FLOAT float - #define CUDA_F(x) x##f - #endif - #if CUDA_PRECISION == 2 - #define CUDA_FLOAT double - #define CUDA_F(x) x - #endif -#endif - -#ifndef CUDA_PRECISION - #define CUDA_FLOAT double - #define CUDA_F(x) x - #define CUDA_PRECISION 2 -#endif -//-------------------------------- -//-----------FFT----------------- -//-------------------------------- - -#ifdef FFT_PRECISION_CU - #if FFT_PRECISION_CU == 1 - #define FFT_FLOAT float - #define FFT_F(x) x##f - #endif - #if FFT_PRECISION_CU == 2 - #define FFT_FLOAT double - #define FFT_F(x) x - #endif -#endif - -#ifndef FFT_PRECISION_CU - #define FFT_FLOAT CUDA_FLOAT - #define FFT_F(x) CUDA_F(x) - #define FFT_PRECISION_CU CUDA_PRECISION -#endif - -//-------------------------------- -//-----------PPPM----------------- -//-------------------------------- - -#ifndef PPPM_PRECISION - #define PPPM_PRECISION CUDA_PRECISION -#endif - -#ifdef PPPM_PRECISION - #if PPPM_PRECISION == 1 - #define PPPM_FLOAT float - #ifdef float3 - #define PPPM_FLOAT3 float3 - #else - struct PPPM_FLOAT3 - { - PPPM_FLOAT x; - PPPM_FLOAT y; - PPPM_FLOAT z; - }; - #endif - #define PPPM_F(x) x##f - #endif - #if PPPM_PRECISION == 2 - #define PPPM_FLOAT double - struct PPPM_FLOAT3 - { - PPPM_FLOAT x; - PPPM_FLOAT y; - PPPM_FLOAT z; - }; - #define PPPM_F(x) x - #endif -#endif - - -//-------------------------------- -//-----------FORCE----------------- -//-------------------------------- - - -#ifdef F_PRECISION - #if F_PRECISION == 1 - #define F_FLOAT float - #define F_F(x) x##f - #endif - #if F_PRECISION == 2 - #define F_FLOAT double - #define F_F(x) x - #endif -#endif - -#ifndef F_PRECISION - #define F_FLOAT CUDA_FLOAT - #define F_F(x) CUDA_F(x) - #define F_PRECISION CUDA_PRECISION -#endif - -#if F_PRECISION == 1 -#define _SQRT_ sqrtf -#define _RSQRT_ rsqrtf -#define _EXP_ expf -#else -#define _SQRT_ sqrt -#define _RSQRT_ rsqrt -#define _EXP_ exp -#endif - -#if F_PRECISION == 2 -struct F_FLOAT2 -{ - F_FLOAT x; - F_FLOAT y; -}; -struct F_FLOAT3 -{ - F_FLOAT x; - F_FLOAT y; - F_FLOAT z; -}; -struct F_FLOAT4 -{ - F_FLOAT x; - F_FLOAT y; - F_FLOAT z; - F_FLOAT w; -}; -#else -#define F_FLOAT2 float2 -#define F_FLOAT3 float3 -#define F_FLOAT4 float4 -#endif -//-------------------------------- -//-----------ENERGY----------------- -//--------------------------------
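- -/* Usage sketch (an illustrative addition, not part of the original header): - * each precision family provides a literal macro in this style, e.g. - * ENERGY_FLOAT half = ENERGY_F(0.5); // expands to 0.5 (double) or 0.5f (float) - * F_FLOAT r2inv = F_F(1.0)/rsq; // 'rsq' here stands for any F_FLOAT squared distance - */ -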
#ifdef ENERGY_PRECISION - #if ENERGY_PRECISION == 1 - #define ENERGY_FLOAT float - #define ENERGY_F(x) x##f - #endif - #if ENERGY_PRECISION == 2 - #define ENERGY_FLOAT double - #define ENERGY_F(x) x - #endif -#endif - -#ifndef ENERGY_PRECISION - #define ENERGY_FLOAT CUDA_FLOAT - #define ENERGY_F(x) CUDA_F(x) - #define ENERGY_PRECISION CUDA_PRECISION -#endif - -//-------------------------------- -//-----------POSITIONS------------ -//-------------------------------- - -#ifdef X_PRECISION - #if X_PRECISION == 1 - #define X_FLOAT float - #define X_F(x) x##f - #endif - #if X_PRECISION == 2 - #define X_FLOAT double - #define X_F(x) x - #endif -#endif - -#ifndef X_PRECISION - #define X_FLOAT CUDA_FLOAT - #define X_F(x) CUDA_F(x) - #define X_PRECISION CUDA_PRECISION -#endif - -#if X_PRECISION == 2 -struct X_FLOAT2 -{ - X_FLOAT x; - X_FLOAT y; -}; -struct X_FLOAT3 -{ - X_FLOAT x; - X_FLOAT y; - X_FLOAT z; -}; -struct X_FLOAT4 -{ - X_FLOAT x; - X_FLOAT y; - X_FLOAT z; - X_FLOAT w; -}; -#else -#define X_FLOAT2 float2 -#define X_FLOAT3 float3 -#define X_FLOAT4 float4 -#endif - -//-------------------------------- -//-----------velocities----------- -//-------------------------------- - -#ifdef V_PRECISION - #if V_PRECISION == 1 - #define V_FLOAT float - #define V_F(x) x##f - #endif - #if V_PRECISION == 2 - #define V_FLOAT double - #define V_F(x) x - #endif -#endif - -#ifndef V_PRECISION - #define V_FLOAT CUDA_FLOAT - #define V_F(x) CUDA_F(x) - #define V_PRECISION CUDA_PRECISION -#endif - -#if V_PRECISION == 2 -struct V_FLOAT4 -{ - V_FLOAT x; - V_FLOAT y; - V_FLOAT z; - V_FLOAT w; -}; -#else -#define V_FLOAT4 float4 -#endif - -#ifdef NO_PREC_TIMING -struct timespec_2 -{ - unsigned int tv_sec; - unsigned int tv_nsec; -}; - -#define timespec timespec_2 -#define clock_gettime(a,b) -#endif -#endif /*CUDA_PRECISION_H_*/ diff --git a/lib/cuda/cuda_shared.h b/lib/cuda/cuda_shared.h deleted file mode 100644 index a11c57dc22..0000000000 --- a/lib/cuda/cuda_shared.h +++ /dev/null @@ -1,380 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - - Original Version: - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - See the README file in the top-level LAMMPS directory. - - ----------------------------------------------------------------------- - - USER-CUDA Package and associated modifications: - https://sourceforge.net/projects/lammpscuda/ - - Christian Trott, christian.trott@tu-ilmenau.de - Lars Winterfeld, lars.winterfeld@tu-ilmenau.de - Theoretical Physics II, University of Technology Ilmenau, Germany - - See the README file in the USER-CUDA directory. - - This software is distributed under the GNU General Public License.
-------------------------------------------------------------------------- */ - -#ifndef _CUDA_SHARED_H_ -#define _CUDA_SHARED_H_ -#include "cuda_precision.h" - -#define CUDA_MAX_DEBUG_SIZE 1000 //size of debugdata array (allows for so many doubles or twice as many int) - -struct dev_array -{ - void* dev_data; // pointer to memory address on cuda device - unsigned dim[3]; // array dimensions -}; - -struct cuda_shared_atom // relevant data from the atom class -{ - dev_array dx; // cumulated distance for binning settings - dev_array x; // position - dev_array v; // velocity - dev_array f; // force - dev_array tag; // global ID number - dev_array type; // atom type; there are ghost types = ntypes (ntypescuda=ntypes+1) - dev_array mask; - dev_array image; - dev_array q; // charges - dev_array mass; // per-type masses - dev_array rmass; // per-atom masses - dev_array radius; // per-atom radius - dev_array density; - dev_array omega; - dev_array torque; - dev_array molecule; - - dev_array special; - int maxspecial; - dev_array nspecial; - int* special_flag; - int molecular; - - dev_array eatom; // per-atom energy - dev_array vatom; // per-atom virial - int need_eatom; - int need_vatom; - - dev_array x_type; // position + type in X_FLOAT4 struct - dev_array v_radius; // velocity + radius in V_FLOAT4 struct; currently only used for granular atom_style - dev_array omega_rmass; // omega + rmass in V_FLOAT4 struct; currently only used for granular atom_style - - double* mass_host; // remember per-type host pointer to masses - //int natoms; // total # of atoms in system, could be 0 - int nghost; // # of ghost atoms on this proc - int nlocal; // # of owned atoms on this proc - int nall; // total # of atoms in this proc - int nmax; // max # of owned+ghost in arrays on this proc - int ntypes; - int q_flag; // do we have charges? - int rmass_flag; // do we have per-atom masses? - int firstgroup; - int nfirst; - - int update_nlocal; - int update_nmax; - int update_neigh; - - dev_array xhold; // position at last neighbor-list build - X_FLOAT triggerneighsq; // maximum squared movement before reneighboring - int reneigh_flag; // is reneighboring necessary - int maxhold; // size of xhold - int dist_check; //perform distance check for reneighboring - dev_array binned_id; //id of each binned atom (not tag!!)
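- // note (added): binned_id/binned_idnew implement the id remap used when atoms are sorted by bins; compare the CUDA_USE_BINNING / CUDA_IF_BINNING switch in cuda_precision.h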
- dev_array binned_idnew; //new id of each binned atom for sorting, basically setting atom[binned_id[k]] at atom[binned_newid[k]] - float bin_extraspace; - int bin_dim[3]; - int bin_nmax; - dev_array map_array; -}; - -struct cuda_shared_pair // relevant data from the pair class -{ - char cudable_force; // check for (cudable_force!=0) - X_FLOAT cut_global; - X_FLOAT cut_inner_global; - X_FLOAT cut_coul_global; - double** cut; // type-type cutoff - double** cutsq; // type-type squared cutoff - double** cut_inner; // type-type inner cutoff - double** cut_coul; // type-type cutoff for coul - double** coeff1; // type-type pair parameters - double** coeff2; - double** coeff3; - double** coeff4; - double** coeff5; - double** coeff6; - double** coeff7; - double** coeff8; - double** coeff9; - double** coeff10; - double** offset; - double* special_lj; - double* special_coul; - dev_array virial; // ENERGY_FLOAT - dev_array eng_vdwl; // ENERGY_FLOAT - dev_array eng_coul; // ENERGY_FLOAT - X_FLOAT cut_coulsq_global; - F_FLOAT g_ewald,kappa; - int freeze_group_bit; - - dev_array coeff1_gm; - dev_array coeff2_gm; - dev_array coeff3_gm; - dev_array coeff4_gm; - dev_array coeff5_gm; - dev_array coeff6_gm; - dev_array coeff7_gm; - dev_array coeff8_gm; - dev_array coeff9_gm; - dev_array coeff10_gm; - - int lastgridsize; - int n_energy_virial; - int collect_forces_later; - int use_block_per_atom; - int override_block_per_atom; - bool neighall; - -}; - -struct cuda_shared_domain // relevant data from the domain class -{ - X_FLOAT sublo[3]; // orthogonal box -> sub-box bounds on this proc - X_FLOAT subhi[3]; - X_FLOAT boxlo[3]; - X_FLOAT boxhi[3]; - X_FLOAT prd[3]; - int periodicity[3]; // xyz periodicity as array - - int triclinic; - X_FLOAT xy; - X_FLOAT xz; - X_FLOAT yz; - X_FLOAT boxlo_lamda[3]; - X_FLOAT boxhi_lamda[3]; - X_FLOAT prd_lamda[3]; - X_FLOAT h[6]; - X_FLOAT h_inv[6]; - V_FLOAT h_rate[6]; - int update; -}; - -struct cuda_shared_pppm -{ - char cudable_force; -#ifdef FFT_CUFFT - FFT_FLOAT* work1; - FFT_FLOAT* work2; - FFT_FLOAT* work3; - PPPM_FLOAT* greensfn; - PPPM_FLOAT* fkx; - PPPM_FLOAT* fky; - PPPM_FLOAT* fkz; - PPPM_FLOAT* vg; -#endif - int* part2grid; - PPPM_FLOAT* density_brick; - int* density_brick_int; - PPPM_FLOAT density_intScale; - PPPM_FLOAT* vdx_brick; - PPPM_FLOAT* vdy_brick; - PPPM_FLOAT* vdz_brick; - PPPM_FLOAT* density_fft; - ENERGY_FLOAT* energy; - ENERGY_FLOAT* virial; - int nxlo_in; - int nxhi_in; - int nxlo_out; - int nxhi_out; - int nylo_in; - int nyhi_in; - int nylo_out; - int nyhi_out; - int nzlo_in; - int nzhi_in; - int nzlo_out; - int nzhi_out; - int nx_pppm; - int ny_pppm; - int nz_pppm; - PPPM_FLOAT qqrd2e; - int order; - // float3 sublo; - PPPM_FLOAT* rho_coeff; - int nmax; - int nlocal; - PPPM_FLOAT* debugdata; - PPPM_FLOAT delxinv; - PPPM_FLOAT delyinv; - PPPM_FLOAT delzinv; - int nlower; - int nupper; - PPPM_FLOAT shiftone; - PPPM_FLOAT3* fH; -}; - -struct cuda_shared_comm -{ - int maxswap; - int maxlistlength; - dev_array pbc; - dev_array slablo; - dev_array slabhi; - dev_array multilo; - dev_array multihi; - dev_array sendlist; - int grow_flag; - int comm_phase; - - int nsend; - int* nsend_swap; - int* send_size; - int* recv_size; - double** buf_send; - void** buf_send_dev; - double** buf_recv; - void** buf_recv_dev; - void* buffer; - int buffer_size; - double overlap_split_ratio; -}; - -struct cuda_shared_neighlist // member of CudaNeighList, has no instance in cuda_shared_data -{ - int maxlocal; - int inum; // # of I atoms neighbors are stored for -
int inum_border2; - dev_array inum_border; // # of atoms which interact with border atoms - dev_array ilist; - dev_array ilist_border; - dev_array numneigh; - dev_array numneigh_inner; - dev_array numneigh_border; - dev_array firstneigh; - dev_array neighbors; - dev_array neighbors_border; - dev_array neighbors_inner; - int maxpage; - dev_array page_pointers; - dev_array* pages; - int maxneighbors; - int neigh_lists_per_page; - double** cutneighsq; - CUDA_FLOAT* cu_cutneighsq; - int* binned_id; - int* bin_dim; - int bin_nmax; - float bin_extraspace; - double maxcut; - dev_array ex_type; - int nex_type; - dev_array ex1_bit; - dev_array ex2_bit; - int nex_group; - dev_array ex_mol_bit; - int nex_mol; - -}; - -struct cuda_compile_settings // this is used to compare compile settings (i.e. precision) of the cu files and the cpp files -{ - int prec_glob; - int prec_x; - int prec_v; - int prec_f; - int prec_pppm; - int prec_fft; - int cufft; - int arch; -}; - -struct cuda_timings_struct -{ - //Debug: - double test1; - double test2; - //transfers - double transfer_upload_tmp_constr; - double transfer_download_tmp_deconstr; - - //communication - double comm_forward_total; - double comm_forward_mpi_upper; - double comm_forward_mpi_lower; - double comm_forward_kernel_pack; - double comm_forward_kernel_unpack; - double comm_forward_kernel_self; - double comm_forward_upload; - double comm_forward_download; - - double comm_exchange_total; - double comm_exchange_mpi; - double comm_exchange_kernel_pack; - double comm_exchange_kernel_unpack; - double comm_exchange_kernel_fill; - double comm_exchange_cpu_pack; - double comm_exchange_upload; - double comm_exchange_download; - - double comm_border_total; - double comm_border_mpi; - double comm_border_kernel_pack; - double comm_border_kernel_unpack; - double comm_border_kernel_self; - double comm_border_kernel_buildlist; - double comm_border_upload; - double comm_border_download; - - //pair forces - double pair_xtype_conversion; - double pair_kernel; - double pair_virial; - double pair_force_collection; - - //neighbor - double neigh_bin; - double neigh_build; - double neigh_special; - - //PPPM - double pppm_particle_map; - double pppm_make_rho; - double pppm_brick2fft; - double pppm_poisson; - double pppm_fillbrick; - double pppm_fieldforce; - double pppm_compute; - -}; - -struct cuda_shared_data // holds space for all relevant data from the different classes -{ - void* buffer; //holds temporary GPU data [data used in subroutines, which does not have to be consistent outside of that routine] - int buffersize; //maxsize of buffer - int buffer_new; //should be 1 if the pointer to buffer has changed - void* flag; - void* debugdata; //array for easily collecting debug data from the device; class Cuda contains the corresponding cu_debugdata and host array - cuda_shared_atom atom; - cuda_shared_pair pair; - cuda_shared_domain domain; - cuda_shared_pppm pppm; - cuda_shared_comm comm; - cuda_compile_settings compile_settings; - cuda_timings_struct cuda_timings; - int exchange_dim; - int me; //mpi rank - unsigned int datamask; - int overlap_comm; -}; - - -#endif // #ifndef _CUDA_SHARED_H_ diff --git a/lib/cuda/cuda_wrapper.cu b/lib/cuda/cuda_wrapper.cu deleted file mode 100644 index 7168df138b..0000000000 --- a/lib/cuda/cuda_wrapper.cu +++ /dev/null @@ -1,317 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - - Original Version: - http://lammps.sandia.gov, Sandia National
Laboratories - Steve Plimpton, sjplimp@sandia.gov - - See the README file in the top-level LAMMPS directory. - - ----------------------------------------------------------------------- - - USER-CUDA Package and associated modifications: - https://sourceforge.net/projects/lammpscuda/ - - Christian Trott, christian.trott@tu-ilmenau.de - Lars Winterfeld, lars.winterfeld@tu-ilmenau.de - Theoretical Physics II, University of Technology Ilmenau, Germany - - See the README file in the USER-CUDA directory. - - This software is distributed under the GNU General Public License. -------------------------------------------------------------------------- */ - -#include <stdio.h> -#include "cuda_shared.h" -#include "cuda_common.h" -#include "cuda_wrapper_cu.h" -#include "cuda_wrapper_kernel.cu" - -static int CudaWrapper_total_gpu_mem=0; -static double CudaWrapper_total_upload_time=0; -static double CudaWrapper_total_download_time=0; -static double CudaWrapper_cpubuffer_upload_time=0; -static double CudaWrapper_cpubuffer_download_time=0; -static cudaStream_t* streams; -static int nstreams=0; - -void CudaWrapper_Init(int argc, char** argv,int me,int ppn,int* devicelist) -{ - MYDBG( printf("# CUDA: debug mode on\n"); ) - - #if __DEVICE_EMULATION__ - - printf("# CUDA: emulation mode on\n"); - - #else - - // modified from cutil.h - static int deviceCount=0; - static bool sharedmode=false; - if(deviceCount && !sharedmode) return; - if(deviceCount && sharedmode) cudaThreadExit(); - - CUDA_SAFE_CALL_NO_SYNC( cudaGetDeviceCount(&deviceCount) ); - if (deviceCount == 0) - { - fprintf(stderr, "cutil error: no devices supporting CUDA.\n"); - exit(EXIT_FAILURE); - } - MYDBG( printf("# CUDA There are %i devices supporting CUDA in this system.\n",deviceCount);) - - cudaDeviceProp deviceProp[deviceCount]; - for(int i=0;i<deviceCount;i++) - CUDA_SAFE_CALL_NO_SYNC( cudaGetDeviceProperties(&deviceProp[i], i) ); - - int dev_list[deviceCount]; - for(int i=0;i<deviceCount;i++) dev_list[i]=i; - - if(!sharedmode) - { - if(ppn>deviceCount) {printf("Asking for more GPUs per node than there are. Reduce gpu/node setting.\n"); exit(0);} - int devicea=me%ppn; - if(devicelist) devicea=devicelist[devicea]; - else - devicea=dev_list[devicea]; - if(devicea>=deviceCount) {printf("Asking for non-existent GPU %i.
Found only %i GPUs.\n",devicea,deviceCount); exit(0);} - MYDBG( - printf(" # CUDA myid: %i take device: %i\n",me,devicea); - ) - CUDA_SAFE_CALL( cudaSetDevice(devicea) ); - } - else - { - CUDA_SAFE_CALL( cudaSetValidDevices(dev_list,deviceCount) ); - } - cudaThreadSynchronize(); - - int dev; - CUDA_SAFE_CALL( cudaGetDevice(&dev)); - - if (deviceProp[dev].major < 1) - { - fprintf(stderr, "CUDA error: device does not support CUDA.\n"); - exit(EXIT_FAILURE); - } - else - if ((deviceProp[dev].major == 1)&&(deviceProp[dev].minor != 3)) - { - fprintf(stderr, "CUDA error: You need a device with compute capability 1.3 or higher (Device %i is a %s with CC %i.%i)\n",dev,deviceProp[dev].name,deviceProp[dev].major,deviceProp[dev].minor); - exit(EXIT_FAILURE); - } - if ((deviceProp[dev].major == 2)&&(CUDA_ARCH<20)) - { - fprintf(stderr, "CUDA warning: You are using a compute %i.%i or higher GPU while LAMMPScuda has been compiled for architecture 1.3\n",deviceProp[dev].major,deviceProp[dev].minor); - } - if ((deviceProp[dev].major == 1)&&(CUDA_ARCH>=20)) - { - fprintf(stderr, "CUDA error: You are using a compute 1.3 GPU while LAMMPScuda has been compiled for architecture %i\n",CUDA_ARCH); - exit(EXIT_FAILURE); - } - - -fprintf(stderr, "# Using device %d: %s\n", dev, deviceProp[dev].name); - MYDBG( fprintf(stderr, "# Using device %d: %s\n", dev, deviceProp[dev].name);) - - MYDBG - ( - printf("name = %s\n", deviceProp[dev].name); - printf("totalGlobalMem = %u\n", deviceProp[dev].totalGlobalMem); - printf("sharedMemPerBlock = %i\n", deviceProp[dev].sharedMemPerBlock); - printf("regsPerBlock = %i\n", deviceProp[dev].regsPerBlock); - printf("warpSize = %i\n", deviceProp[dev].warpSize); - printf("memPitch = %i\n", deviceProp[dev].memPitch); - printf("maxThreadsPerBlock = %i\n", deviceProp[dev].maxThreadsPerBlock); - printf("maxThreadsDim = [%i, %i, %i]\n", deviceProp[dev].maxThreadsDim[0], deviceProp[dev].maxThreadsDim[1], deviceProp[dev].maxThreadsDim[2]); - printf("maxGridSize = [%i, %i, %i]\n", deviceProp[dev].maxGridSize[0], deviceProp[dev].maxGridSize[1], deviceProp[dev].maxGridSize[2]); - printf("totalConstMem = %i\n", deviceProp[dev].totalConstMem); - printf("major . minor = %i . 
%i\n", deviceProp[dev].major, deviceProp[dev].minor); - printf("clockRate = %i\n", deviceProp[dev].clockRate); - printf("textureAlignment = %i\n", deviceProp[dev].textureAlignment); - printf("deviceOverlap = %i\n", deviceProp[dev].deviceOverlap); - printf("multiProcessorCount = %i\n", deviceProp[dev].multiProcessorCount); - printf("computeMode = %i\n", deviceProp[dev].computeMode); - ) - - #endif - } - -void* CudaWrapper_AllocCudaData(unsigned nbytes) -{ - void* dev_data; - CUDA_SAFE_CALL( cudaMalloc((void**)&dev_data, nbytes) ); - MYDBG( printf("# CUDA: allocated %u bytes on device at dev%p\n", nbytes, dev_data); ) - CudaWrapper_total_gpu_mem+=nbytes; - return dev_data; -} - -void CudaWrapper_UploadCudaData(void* host_data, void* dev_data, unsigned nbytes) -{ - MYDBG( printf("# CUDA: uploading %u bytes to device at dev%p from %p\n", nbytes, dev_data,host_data); ) - cudaThreadSynchronize(); - timespec time1,time2; - clock_gettime(CLOCK_REALTIME,&time1); - CUDA_SAFE_CALL( cudaMemcpy(dev_data, host_data, nbytes, cudaMemcpyHostToDevice) ); - clock_gettime(CLOCK_REALTIME,&time2); - CudaWrapper_total_upload_time+= - time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000; -} - -void CudaWrapper_UploadCudaDataAsync(void* host_data, void* dev_data, unsigned nbytes,int stream) -{ - MYDBG( printf("# CUDA: downloading %u bytes from device at dev%p\n", nbytes, dev_data); ) - cudaMemcpyAsync(dev_data, host_data, nbytes, cudaMemcpyHostToDevice,streams[stream]); -} - -void CudaWrapper_DownloadCudaData(void* host_data, void* dev_data, unsigned nbytes) -{ - MYDBG( printf("# CUDA: downloading %u bytes from device at dev%p\n", nbytes, dev_data); ) - cudaThreadSynchronize(); - timespec time1,time2; - clock_gettime(CLOCK_REALTIME,&time1); - CUDA_SAFE_CALL( cudaMemcpy(host_data, dev_data, nbytes, cudaMemcpyDeviceToHost) ); - clock_gettime(CLOCK_REALTIME,&time2); - CudaWrapper_total_download_time+= - time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000; -} - -void CudaWrapper_DownloadCudaDataAsync(void* host_data, void* dev_data, unsigned nbytes,int stream) -{ - MYDBG( printf("# CUDA: downloading %u bytes from device at dev%p\n", nbytes, dev_data); ) - cudaMemcpyAsync(host_data, dev_data, nbytes, cudaMemcpyDeviceToHost,streams[stream]); -} - -void CudaWrapper_FreeCudaData(void* dev_data,unsigned nbytes) -{ - MYDBG( printf("# CUDA: freeing memory at dev%p with %i bytes (last adress: %p)\n", dev_data,nbytes,(char*)dev_data+nbytes); ) - CUDA_SAFE_CALL( cudaFree(dev_data) ); - CudaWrapper_total_gpu_mem-=nbytes; -} - -void CudaWrapper_Memset(void* dev_data, int value, unsigned nbytes) -{ - MYDBG( printf("# CUDA: setting %u bytes to %i at dev%p\n", nbytes, value, dev_data); ) - CUDA_SAFE_CALL( cudaMemset(dev_data, value, nbytes) ); -} - -void CudaWrapper_CopyData(void* dev_dest, void* dev_source, unsigned nbytes) -{ - MYDBG( printf("# CUDA: copy %u bytes from dev%p to dev%p\n", nbytes, dev_source,dev_dest); ) - CUDA_SAFE_CALL( cudaMemcpy(dev_dest, dev_source, nbytes, cudaMemcpyDeviceToDevice) ); -} - -void* CudaWrapper_AllocPinnedHostData(unsigned nbytes,bool mapped,bool writeCombined) -{ - void* host_data; - int flags=0; - if(mapped) flags=flags | cudaHostAllocMapped; - if(writeCombined) flags=flags | cudaHostAllocWriteCombined; - - CUDA_SAFE_CALL( cudaHostAlloc((void**)&host_data, nbytes,flags) ); -// CUDA_SAFE_CALL( cudaMallocHost((void**)&host_data, nbytes) ); - MYDBG( printf("# CUDA: allocated %u bytes pinned memory on host at %p\n", nbytes, host_data); ) - return host_data; -} - 
-void CudaWrapper_FreePinnedHostData(void* host_data) -{ - MYDBG( printf("# CUDA: freeing pinned host memory at %p \n",host_data); ) - if(host_data) - CUDA_SAFE_CALL( cudaFreeHost(host_data) ); -} - -void cuda_check_error(char* comment) -{ - printf("ERROR-CUDA %s %s\n",comment,cudaGetErrorString(cudaGetLastError())); -} - -int CudaWrapper_CheckMemUseage() -{ - size_t free,total; - cudaMemGetInfo(&free,&total); - return total-free; //possible with cuda 3.0 ??? - //return CudaWrapper_total_gpu_mem; -} - -double CudaWrapper_CheckUploadTime(bool reset) -{ - if(reset) CudaWrapper_total_upload_time=0.0; - return CudaWrapper_total_upload_time; -} - -double CudaWrapper_CheckDownloadTime(bool reset) -{ - if(reset) CudaWrapper_total_download_time=0.0; - return CudaWrapper_total_download_time; -} - -double CudaWrapper_CheckCPUBufUploadTime(bool reset) -{ - if(reset) CudaWrapper_cpubuffer_upload_time=0.0; - return CudaWrapper_cpubuffer_upload_time; -} - -double CudaWrapper_CheckCPUBufDownloadTime(bool reset) -{ - if(reset) CudaWrapper_cpubuffer_download_time=0.0; - return CudaWrapper_cpubuffer_download_time; -} - -void CudaWrapper_AddCPUBufUploadTime(double dt) -{ - CudaWrapper_cpubuffer_upload_time+=dt; -} - -void CudaWrapper_AddCPUBufDownloadTime(double dt) -{ - CudaWrapper_cpubuffer_download_time+=dt; -} - -void CudaWrapper_Sync() -{ - cudaThreadSynchronize(); -} - -void CudaWrapper_SyncStream(int stream) -{ - cudaStreamSynchronize(streams[stream]); -} - -void CudaWrapper_AddStreams(int n) -{ - cudaStream_t* new_streams=new cudaStream_t[nstreams+n]; - for(int i=0;i<nstreams;i++) new_streams[i]=streams[i]; - for(int i=nstreams;i<nstreams+n;i++) cudaStreamCreate(&new_streams[i]); - if(nstreams>0) - delete [] streams; - streams=new_streams; - nstreams+=n; -} - -void* CudaWrapper_returnStreams() -{ - return (void*) streams; -} - -int CudaWrapper_returnNStreams() -{ - return nstreams; -} - diff --git a/lib/cuda/cuda_wrapper_cu.h b/lib/cuda/cuda_wrapper_cu.h deleted file mode 100644 index 85d51a8586..0000000000 --- a/lib/cuda/cuda_wrapper_cu.h +++ /dev/null @@ -1,52 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - - Original Version: - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - See the README file in the top-level LAMMPS directory. - - ----------------------------------------------------------------------- - - USER-CUDA Package and associated modifications: - https://sourceforge.net/projects/lammpscuda/ - - Christian Trott, christian.trott@tu-ilmenau.de - Lars Winterfeld, lars.winterfeld@tu-ilmenau.de - Theoretical Physics II, University of Technology Ilmenau, Germany - - See the README file in the USER-CUDA directory. - - This software is distributed under the GNU General Public License.
-------------------------------------------------------------------------- */ - -#ifndef _CUDA_DATA_WRAPPER_H_ -#define _CUDA_DATA_WRAPPER_H_ - -extern "C" void CudaWrapper_Init(int argc, char** argv,int me=0,int ppn=2,int* devicelist=NULL); -extern "C" void* CudaWrapper_AllocCudaData(unsigned nbytes); -extern "C" void CudaWrapper_UploadCudaData(void* host_data, void* dev_data, unsigned nbytes); -extern "C" void CudaWrapper_UploadCudaDataAsync(void* host_data, void* dev_data, unsigned nbytes, int stream_id); -extern "C" void CudaWrapper_DownloadCudaData(void* host_data, void* dev_data, unsigned nbytes); -extern "C" void CudaWrapper_DownloadCudaDataAsync(void* host_data, void* dev_data, unsigned nbytes, int stream_id); -extern "C" void CudaWrapper_FreeCudaData(void* dev_data, unsigned nbytes=0); -extern "C" void CudaWrapper_Memset(void* dev_data, int value, unsigned nbytes); -extern "C" void CudaWrapper_CopyData(void* dev_dest, void* dev_source, unsigned nbytes); -extern "C" void* CudaWrapper_AllocPinnedHostData(unsigned nbytes, bool mapped=false, bool writeCombined=false); -extern "C" void CudaWrapper_FreePinnedHostData(void* dev_data); -extern "C" void cuda_check_error(char* comment); -extern "C" int CudaWrapper_CheckMemUseage(); -extern "C" double CudaWrapper_CheckUploadTime(bool reset=false); -extern "C" double CudaWrapper_CheckDownloadTime(bool reset=false); -extern "C" double CudaWrapper_CheckCPUBufUploadTime(bool reset=false); -extern "C" double CudaWrapper_CheckCPUBufDownloadTime(bool reset=false); -extern "C" void CudaWrapper_AddCPUBufUploadTime(double dt); -extern "C" void CudaWrapper_AddCPUBufDownloadTime(double dt); -extern "C" void CudaWrapper_Sync(); -extern "C" void CudaWrapper_SyncStream(int n); -extern "C" void CudaWrapper_AddStreams(int n); -extern "C" void* CudaWrapper_returnStreams(); -extern "C" int CudaWrapper_returnNStreams(); - -#endif // _CUDA_DATA_WRAPPER_H_ diff --git a/lib/cuda/cuda_wrapper_kernel.cu b/lib/cuda/cuda_wrapper_kernel.cu deleted file mode 100644 index 951563b67b..0000000000 --- a/lib/cuda/cuda_wrapper_kernel.cu +++ /dev/null @@ -1,24 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - - Original Version: - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - See the README file in the top-level LAMMPS directory. - - ----------------------------------------------------------------------- - - USER-CUDA Package and associated modifications: - https://sourceforge.net/projects/lammpscuda/ - - Christian Trott, christian.trott@tu-ilmenau.de - Lars Winterfeld, lars.winterfeld@tu-ilmenau.de - Theoretical Physics II, University of Technology Ilmenau, Germany - - See the README file in the USER-CUDA directory. - - This software is distributed under the GNU General Public License. -------------------------------------------------------------------------- */ - -// empty file to obey common make rule diff --git a/lib/cuda/domain.cu b/lib/cuda/domain.cu deleted file mode 100644 index 0f1583dda1..0000000000 --- a/lib/cuda/domain.cu +++ /dev/null @@ -1,194 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - - Original Version: - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - See the README file in the top-level LAMMPS directory.
- - ----------------------------------------------------------------------- - - USER-CUDA Package and associated modifications: - https://sourceforge.net/projects/lammpscuda/ - - Christian Trott, christian.trott@tu-ilmenau.de - Lars Winterfeld, lars.winterfeld@tu-ilmenau.de - Theoretical Physics II, University of Technology Ilmenau, Germany - - See the README file in the USER-CUDA directory. - - This software is distributed under the GNU General Public License. -------------------------------------------------------------------------- */ - -#include <stdio.h> -#define MY_PREFIX domain -#include "cuda_shared.h" -#include "cuda_common.h" - -#include "crm_cuda_utils.cu" - -#include "domain_cu.h" -#include "domain_kernel.cu" - -void Cuda_Domain_UpdateBuffer(cuda_shared_data* sdata,int size) -{ - if(sdata->buffersize<size) - { - MYDBG(printf("Resizing buffer at %p with %i kB\n",sdata->buffer,sdata->buffersize);) - CudaWrapper_FreeCudaData(sdata->buffer,sdata->buffersize); - sdata->buffer = CudaWrapper_AllocCudaData(size); - sdata->buffersize=size; - sdata->buffer_new++; - MYDBG(printf("New buffer at %p with %i kB\n",sdata->buffer,sdata->buffersize);) - } - cudaMemcpyToSymbol(MY_CONST(buffer), & sdata->buffer, sizeof(int*) ); -} - -void Cuda_Domain_UpdateNmax(cuda_shared_data* sdata) -{ - cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); - cudaMemcpyToSymbol(MY_CONST(nmax) , & sdata->atom.nmax , sizeof(int) ); - cudaMemcpyToSymbol(MY_CONST(x) , & sdata->atom.x .dev_data, sizeof(X_FLOAT*) ); - cudaMemcpyToSymbol(MY_CONST(v) , & sdata->atom.v .dev_data, sizeof(V_FLOAT*) ); - cudaMemcpyToSymbol(MY_CONST(mask) , & sdata->atom.mask .dev_data, sizeof(int*) ); - cudaMemcpyToSymbol(MY_CONST(tag) , & sdata->atom.tag .dev_data, sizeof(int*) ); - cudaMemcpyToSymbol(MY_CONST(image) , & sdata->atom.image.dev_data, sizeof(int*) ); -} - -void Cuda_Domain_UpdateDomain(cuda_shared_data* sdata) -{ - cudaMemcpyToSymbol(MY_CONST(boxlo) , sdata->domain.boxlo , 3*sizeof(X_FLOAT)); - cudaMemcpyToSymbol(MY_CONST(boxhi) , sdata->domain.boxhi , 3*sizeof(X_FLOAT)); - cudaMemcpyToSymbol(MY_CONST(sublo) , sdata->domain.sublo , 3*sizeof(X_FLOAT)); - cudaMemcpyToSymbol(MY_CONST(subhi) , sdata->domain.subhi , 3*sizeof(X_FLOAT)); - cudaMemcpyToSymbol(MY_CONST(prd) , sdata->domain.prd , 3*sizeof(X_FLOAT)); - cudaMemcpyToSymbol(MY_CONST(periodicity) , sdata->domain.periodicity , 3*sizeof(int)); - cudaMemcpyToSymbol(MY_CONST(triclinic) , & sdata->domain.triclinic , sizeof(int) ); - cudaMemcpyToSymbol(MY_CONST(boxlo_lamda) , sdata->domain.boxlo_lamda , 3*sizeof(X_FLOAT)); - cudaMemcpyToSymbol(MY_CONST(boxhi_lamda) , sdata->domain.boxhi_lamda , 3*sizeof(X_FLOAT)); - cudaMemcpyToSymbol(MY_CONST(prd_lamda) , sdata->domain.prd_lamda , 3*sizeof(X_FLOAT)); - cudaMemcpyToSymbol(MY_CONST(h) , sdata->domain.h , 6*sizeof(X_FLOAT)); - cudaMemcpyToSymbol(MY_CONST(h_inv) , sdata->domain.h_inv , 6*sizeof(X_FLOAT)); - cudaMemcpyToSymbol(MY_CONST(h_rate) , sdata->domain.h_rate , 6*sizeof(V_FLOAT)); - cudaMemcpyToSymbol(MY_CONST(flag) , &sdata->flag , sizeof(int*)); - cudaMemcpyToSymbol(MY_CONST(debugdata) , &sdata->debugdata , sizeof(int*)); -} - -void Cuda_Domain_Init(cuda_shared_data* sdata) -{ - Cuda_Domain_UpdateNmax(sdata); - Cuda_Domain_UpdateDomain(sdata); -} - -void Cuda_Domain_PBC(cuda_shared_data* sdata,int deform_remap,int deform_groupbit,double* extent) -{ - Cuda_Domain_UpdateNmax(sdata); - //if(sdata->domain.update) - Cuda_Domain_UpdateDomain(sdata); - cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); - - int box_change=0; - if(extent) box_change=1; - - int
sharedmem=0; - if(box_change) sharedmem=6*sizeof(X_FLOAT); - - int3 layout=getgrid(sdata->atom.nlocal,sharedmem); - dim3 threads(layout.z, 1, 1); - dim3 grid(layout.x, layout.y, 1); - sharedmem*=threads.x; - - if((box_change)&&(sdata->buffer_new or (6*sizeof(X_FLOAT)*grid.x*grid.y>sdata->buffersize))) - Cuda_Domain_UpdateBuffer(sdata,layout.x*layout.y*6*sizeof(X_FLOAT)); - - - Domain_PBC_Kernel<<<grid,threads,sharedmem>>>(deform_remap,deform_groupbit,box_change); - cudaThreadSynchronize(); - - CUT_CHECK_ERROR("Cuda_Domain_PBC: Kernel execution failed"); - if(box_change) - { - X_FLOAT buf2[6*layout.x*layout.y]; - X_FLOAT* buf=buf2; - int flag; - cudaMemcpy(buf, sdata->buffer, 6*layout.x*layout.y*sizeof(X_FLOAT), cudaMemcpyDeviceToHost); - cudaMemcpy(&flag, sdata->flag, sizeof(int), cudaMemcpyDeviceToHost); - //printf("Flag: %i\n",flag); - X_FLOAT min,max; - min=1.0*BIG; - max=-1.0*BIG; - for(int i=0;i<layout.x*layout.y;i++) - { - if(buf[i]<min) min=buf[i]; - if(buf[i+layout.x*layout.y]>max) max=buf[i+layout.x*layout.y]; - } - extent[0]=min; - extent[1]=max; - - buf+=2*layout.x*layout.y; - min=1.0*BIG; - max=-1.0*BIG; - for(int i=0;i<layout.x*layout.y;i++) - { - if(buf[i]<min) min=buf[i]; - if(buf[i+layout.x*layout.y]>max) max=buf[i+layout.x*layout.y]; - } - extent[2]=min; - extent[3]=max; - - buf+=2*layout.x*layout.y; - min=1.0*BIG; - max=-1.0*BIG; - for(int i=0;i<layout.x*layout.y;i++) - { - if(buf[i]<min) min=buf[i]; - if(buf[i+layout.x*layout.y]>max) max=buf[i+layout.x*layout.y]; - } - extent[4]=min; - extent[5]=max; - //printf("Extent: %lf %lf %lf %lf %lf %lf\n",extent[0],extent[1],extent[2],extent[3],extent[4],extent[5]); -/* int n=grid.x*grid.y; - if(n<128) threads.x=32; - else if(n<256) threads.x=64; - else threads.x=128; - sharedmem=n*sizeof(X_FLOAT); - grid.x=6; - grid.y=1; - Domain_reduceBoxExtent<<<grid,threads,sharedmem>>>(extent,n); - cudaThreadSynchronize(); - CUT_CHECK_ERROR("Cuda_Domain_reduceBoxExtent: Kernel execution failed");*/ - } -} - -void Cuda_Domain_lamda2x(cuda_shared_data* sdata,int n) -{ - Cuda_Domain_UpdateNmax(sdata); - //if(sdata->domain.update) - Cuda_Domain_UpdateDomain(sdata); - cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); - - int3 layout=getgrid(n); - dim3 threads(layout.z, 1, 1); - dim3 grid(layout.x, layout.y, 1); - - Domain_lamda2x_Kernel<<<grid,threads>>>(n); - cudaThreadSynchronize(); - CUT_CHECK_ERROR("Cuda_Domain_lamda2x: Kernel execution failed"); -} - -void Cuda_Domain_x2lamda(cuda_shared_data* sdata,int n) -{ - Cuda_Domain_UpdateNmax(sdata); - //if(sdata->domain.update) - Cuda_Domain_UpdateDomain(sdata); - cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); - - int3 layout=getgrid(n); - dim3 threads(layout.z, 1, 1); - dim3 grid(layout.x, layout.y, 1); - - Domain_x2lamda_Kernel<<<grid,threads>>>(n); - cudaThreadSynchronize(); - CUT_CHECK_ERROR("Cuda_Domain_x2lamda: Kernel execution failed"); -} diff --git a/lib/cuda/domain_cu.h b/lib/cuda/domain_cu.h deleted file mode 100644 index f04e5610c2..0000000000 --- a/lib/cuda/domain_cu.h +++ /dev/null @@ -1,29 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - - Original Version: - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - See the README file in the top-level LAMMPS directory. - - ----------------------------------------------------------------------- - - USER-CUDA Package and associated modifications: - https://sourceforge.net/projects/lammpscuda/ - - Christian Trott, christian.trott@tu-ilmenau.de - Lars Winterfeld, lars.winterfeld@tu-ilmenau.de - Theoretical Physics II, University of Technology Ilmenau, Germany - - See the README file in the USER-CUDA directory.
- - This software is distributed under the GNU General Public License. -------------------------------------------------------------------------- */ - -#include "cuda_shared.h" - -extern "C" void Cuda_Domain_Init(cuda_shared_data* sdata); -extern "C" void Cuda_Domain_PBC(cuda_shared_data* sdata,int deform_remap,int deform_groupbit,double* extent=NULL); -extern "C" void Cuda_Domain_lamda2x(cuda_shared_data* sdata,int n); -extern "C" void Cuda_Domain_x2lamda(cuda_shared_data* sdata,int n); diff --git a/lib/cuda/domain_kernel.cu b/lib/cuda/domain_kernel.cu deleted file mode 100644 index fa76974076..0000000000 --- a/lib/cuda/domain_kernel.cu +++ /dev/null @@ -1,269 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - - Original Version: - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - See the README file in the top-level LAMMPS directory. - - ----------------------------------------------------------------------- - - USER-CUDA Package and associated modifications: - https://sourceforge.net/projects/lammpscuda/ - - Christian Trott, christian.trott@tu-ilmenau.de - Lars Winterfeld, lars.winterfeld@tu-ilmenau.de - Theoretical Physics II, University of Technology Ilmenau, Germany - - See the README file in the USER-CUDA directory. - - This software is distributed under the GNU General Public License. -------------------------------------------------------------------------- */ - -extern __shared__ X_FLOAT sharedmem[]; - -#define BIG 1e10 -__global__ void Domain_PBC_Kernel(int deform_remap,int deform_groupbit,int box_change) -{ - int idim,otherdims; - int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x; - - X_FLOAT lo[3]; - X_FLOAT hi[3]; - X_FLOAT* period; - - if (_triclinic == 0) { - lo[0] = _boxlo[0]; - lo[1] = _boxlo[1]; - lo[2] = _boxlo[2]; - - hi[0] = _boxhi[0]; - hi[1] = _boxhi[1]; - hi[2] = _boxhi[2]; - period = _prd; - } else { - lo[0] = _boxlo_lamda[0]; - lo[1] = _boxlo_lamda[1]; - lo[2] = _boxlo_lamda[2]; - - hi[0] = _boxhi_lamda[0]; - hi[1] = _boxhi_lamda[1]; - hi[2] = _boxhi_lamda[2]; - period = _prd_lamda; - } - - - X_FLOAT tmpx=X_F(0.5)*(hi[0]+lo[0]); - X_FLOAT tmpy=X_F(0.5)*(hi[1]+lo[1]); - X_FLOAT tmpz=X_F(0.5)*(hi[2]+lo[2]); - - X_FLOAT* buf=(X_FLOAT*) _buffer; - buf+=blockIdx.x*gridDim.y+blockIdx.y; - buf[0]=tmpx; - buf+=gridDim.x*gridDim.y; - buf[0]=tmpx; - buf+=gridDim.x*gridDim.y; - buf[0]=tmpy; - buf+=gridDim.x*gridDim.y; - buf[0]=tmpy; - buf+=gridDim.x*gridDim.y; - buf[0]=tmpz; - buf+=gridDim.x*gridDim.y; - buf[0]=tmpz; - - if(i<_nlocal) - { - - if (_periodicity[0]) { - if (_x[i] < lo[0]) { - _x[i] += period[0]; - if (deform_remap && _mask[i] & deform_groupbit) _v[i] += _h_rate[0]; - idim = _image[i] & 1023; - otherdims = _image[i] ^ idim; - idim--; - idim &= 1023; - _image[i] = otherdims | idim; - } - if (_x[i] >= hi[0]) { - _x[i] -= period[0]; - _x[i] = MAX(_x[i],lo[0]); - if (deform_remap && _mask[i] & deform_groupbit) _v[i] -= _h_rate[0]; - idim = _image[i] & 1023; - otherdims = _image[i] ^ idim; - idim++; - idim &= 1023; - _image[i] = otherdims | idim; - } - } - - if (_periodicity[1]) { - if (_x[i+_nmax] < lo[1]) { - _x[i+_nmax] += period[1]; - if (deform_remap && _mask[i] & deform_groupbit) { - _v[i] += _h_rate[5]; - _v[i+_nmax] += _h_rate[1]; - } - idim = (_image[i] >> 10) & 1023; - otherdims = _image[i] ^ (idim << 10); - idim--; - idim &= 1023; - _image[i] = otherdims | (idim << 10); - } - if (_x[i+_nmax] >= 
hi[1]) { - _x[i+_nmax] -= period[1]; - _x[i+_nmax] = MAX(_x[i+_nmax],lo[1]); - if (deform_remap && _mask[i] & deform_groupbit) { - _v[i] -= _h_rate[5]; - _v[i+_nmax] -= _h_rate[1]; - } - idim = (_image[i] >> 10) & 1023; - otherdims = _image[i] ^ (idim << 10); - idim++; - idim &= 1023; - _image[i] = otherdims | (idim << 10); - } - } - - if (_periodicity[2]) { - if (_x[i+2*_nmax] < lo[2]) { - _x[i+2*_nmax] += period[2]; - if (deform_remap && _mask[i] & deform_groupbit) { - _v[i] += _h_rate[4]; - _v[i+_nmax] += _h_rate[3]; - _v[i+2*_nmax] += _h_rate[2]; - } - idim = _image[i] >> 20; - otherdims = _image[i] ^ (idim << 20); - idim--; - idim &= 1023; - _image[i] = otherdims | (idim << 20); - } - if (_x[i+2*_nmax] >= hi[2]) { - _x[i+2*_nmax] -= period[2]; - _x[i+2*_nmax] = MAX(_x[i+2*_nmax],lo[2]); - if (deform_remap && _mask[i] & deform_groupbit) { - _v[i] -= _h_rate[4]; - _v[i+_nmax] -= _h_rate[3]; - _v[i+2*_nmax] -= _h_rate[2]; - } - idim = _image[i] >> 20; - otherdims = _image[i] ^ (idim << 20); - idim++; - idim &= 1023; - _image[i] = otherdims | (idim << 20); - } - } - if(box_change) - { - tmpx=_x[i]; - tmpy=_x[i+_nmax]; - tmpz=_x[i+2*_nmax]; - - - } - } - __syncthreads(); - if(box_change) - { - X_FLOAT minx=BIG; - X_FLOAT maxx=-BIG; - X_FLOAT miny=BIG; - X_FLOAT maxy=-BIG; - X_FLOAT minz=BIG; - X_FLOAT maxz=-BIG; - - if (not _periodicity[0]) { - sharedmem[threadIdx.x]=tmpx; - minOfBlock(sharedmem); - minx=sharedmem[0]; - __syncthreads(); - sharedmem[threadIdx.x]=tmpx; - maxOfBlock(sharedmem); - maxx=sharedmem[0]; - __syncthreads(); - } - else {minx=lo[0];maxx=hi[0];} - if (not _periodicity[1]) { - sharedmem[threadIdx.x]=tmpy; - minOfBlock(sharedmem); - miny=sharedmem[0]; - __syncthreads(); - sharedmem[threadIdx.x]=tmpy; - maxOfBlock(sharedmem); - maxy=sharedmem[0]; - __syncthreads(); - } - else {miny=lo[1];maxy=hi[1];} - if (not _periodicity[2]) { - sharedmem[threadIdx.x]=tmpz; - minOfBlock(sharedmem); - minz=sharedmem[0]; - __syncthreads(); - sharedmem[threadIdx.x]=tmpz; - maxOfBlock(sharedmem); - maxz=sharedmem[0]; - __syncthreads(); - } - else {minz=lo[2];maxz=hi[2];} - if(threadIdx.x==0) - { - buf=(X_FLOAT*) _buffer; - buf+=blockIdx.x*gridDim.y+blockIdx.y; - buf[0]=minx; - buf+=gridDim.x*gridDim.y; - buf[0]=maxx; - buf+=gridDim.x*gridDim.y; - buf[0]=miny; - buf+=gridDim.x*gridDim.y; - buf[0]=maxy; - buf+=gridDim.x*gridDim.y; - buf[0]=minz; - buf+=gridDim.x*gridDim.y; - buf[0]=maxz; - } - } -} - -__global__ void Domain_reduceBoxExtent(double* extent,int n) -{ - X_FLOAT* buf=(X_FLOAT*) _buffer; - buf+=blockIdx.x*n; - copyGlobToShared(buf,sharedmem,n); - if(blockIdx.x%2==0) - minOfData(sharedmem,n); - else - maxOfData(sharedmem,n); - extent[blockIdx.x]=sharedmem[0]; -} - -__global__ void Domain_lamda2x_Kernel(int n) -{ - int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x; - if(i<n) - { - _x[i] = _h[0]*_x[i] + _h[5]*_x[i+_nmax] + _h[4]*_x[i+2*_nmax] + _boxlo[0]; - _x[i+_nmax] = _h[1]*_x[i+_nmax] + _h[3]*_x[i+2*_nmax] + _boxlo[1]; - _x[i+2*_nmax] = _h[2]*_x[i+2*_nmax] + _boxlo[2]; - } -} - -__global__ void Domain_x2lamda_Kernel(int n) -{ - int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x; - if(i<n) - { - X_FLOAT delta[3]; - delta[0] = _x[i] - _boxlo[0]; - delta[1] = _x[i+_nmax] - _boxlo[1]; - delta[2] = _x[i+2*_nmax] - _boxlo[2]; - - _x[i] = _h_inv[0]*delta[0] + _h_inv[5]*delta[1] + _h_inv[4]*delta[2]; - _x[i+_nmax] = _h_inv[1]*delta[1] + _h_inv[3]*delta[2]; - _x[i+2*_nmax] = _h_inv[2]*delta[2]; - } -} diff --git a/lib/cuda/fft3d_cuda.cu b/lib/cuda/fft3d_cuda.cu deleted file mode 100644 --- a/lib/cuda/fft3d_cuda.cu +++ /dev/null -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - - Original Version: - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - See the README file in the top-level LAMMPS directory. - - ----------------------------------------------------------------------- - - USER-CUDA Package and associated modifications: - https://sourceforge.net/projects/lammpscuda/ - - Christian Trott, christian.trott@tu-ilmenau.de - Lars Winterfeld, lars.winterfeld@tu-ilmenau.de - Theoretical Physics II, University of Technology Ilmenau, Germany - - See the README file in the USER-CUDA directory. - - This software is distributed under the GNU General Public License. -------------------------------------------------------------------------- */ - -#include <stdio.h> -#include "cuda_shared.h" -#include "cuda_common.h" -#include "fft3d_cuda_cu.h" -#include "fft3d_cuda_kernel.cu" - -void initfftdata(double* in,FFT_FLOAT* out,int nfast,int nmid,int nslow) -{ - - dim3 grid; - grid.x=nslow; - grid.y=nmid; - grid.z=1; - dim3 threads; - threads.x=nfast; - threads.y=1; - threads.z=1; - cudaThreadSynchronize(); - initfftdata_kernel<<<grid,threads>>>(in,out); - cudaThreadSynchronize(); - MYDBG(printf("ERROR-CUDA initfftdata_kernel: %s\n",cudaGetErrorString(cudaGetLastError()))); -} - - -void permute(FFT_DATA* in,FFT_DATA* out,int nfast,int nmid,int nslow) -{ - - dim3 grid; - grid.x=nslow; - grid.y=nmid; - grid.z=1; - dim3 threads; - threads.x=nfast*2; - threads.y=1; - threads.z=1; - permute_kernel<<<grid,threads>>>((FFT_FLOAT*)in,(FFT_FLOAT*)out); - cudaThreadSynchronize(); - MYDBG(printf("ERROR-CUDA
permute_kernel: %s\n",cudaGetErrorString(cudaGetLastError()))); -} - -void permute_scale(FFT_DATA* in,FFT_DATA* out,int nfast,int nmid,int nslow) -{ - - dim3 grid; - grid.x=nslow; - grid.y=nmid; - grid.z=1; - dim3 threads; - threads.x=nfast*2; - threads.y=1; - threads.z=1; - permute_scale_kernel<<<grid,threads>>>((FFT_FLOAT*)in,(FFT_FLOAT*)out); - cudaThreadSynchronize(); -} -void permute_part(FFT_DATA* in,FFT_DATA* out,int nfast,int nmid,int nslow,int ihi,int ilo,int jhi,int jlo,int khi,int klo) -{ - - dim3 grid; - grid.x=(ihi-ilo+1); - grid.y=(jhi-jlo+1); - grid.z=1; - dim3 threads; - threads.x=(khi-klo+1)*2; - threads.y=1; - threads.z=1; - permute_part_kernel<<<grid,threads>>>((FFT_FLOAT*)in,(FFT_FLOAT*)out,nfast,nmid,nslow,ihi,ilo,jhi,jlo,khi,klo); - cudaThreadSynchronize(); - } - - void FFTsyncthreads() - { - cudaThreadSynchronize(); - } - diff --git a/lib/cuda/fft3d_cuda_cu.h b/lib/cuda/fft3d_cuda_cu.h deleted file mode 100644 index 426b61d40c..0000000000 --- a/lib/cuda/fft3d_cuda_cu.h +++ /dev/null @@ -1,30 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - - Original Version: - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - See the README file in the top-level LAMMPS directory. - - ----------------------------------------------------------------------- - - USER-CUDA Package and associated modifications: - https://sourceforge.net/projects/lammpscuda/ - - Christian Trott, christian.trott@tu-ilmenau.de - Lars Winterfeld, lars.winterfeld@tu-ilmenau.de - Theoretical Physics II, University of Technology Ilmenau, Germany - - See the README file in the USER-CUDA directory. - - This software is distributed under the GNU General Public License. -------------------------------------------------------------------------- */ - -#include "cuda_shared.h" - -extern "C" void initfftdata(double* in,FFT_FLOAT* out,int nfast,int nmid,int nslow); -extern "C" void permute(FFT_DATA* in,FFT_DATA* out,int nfast,int nmid,int nslow); -extern "C" void permute_scale(FFT_DATA* in,FFT_DATA* out,int nfast,int nmid,int nslow); -extern "C" void permute_part(FFT_DATA* in,FFT_DATA* out,int nfast,int nmid,int nslow,int ihi,int ilo,int jhi,int jlo,int khi,int klo); -extern "C" void FFTsyncthreads(); diff --git a/lib/cuda/fft3d_cuda_kernel.cu b/lib/cuda/fft3d_cuda_kernel.cu deleted file mode 100644 index 0ee414998f..0000000000 --- a/lib/cuda/fft3d_cuda_kernel.cu +++ /dev/null @@ -1,44 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - - Original Version: - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - See the README file in the top-level LAMMPS directory. - - ----------------------------------------------------------------------- - - USER-CUDA Package and associated modifications: - https://sourceforge.net/projects/lammpscuda/ - - Christian Trott, christian.trott@tu-ilmenau.de - Lars Winterfeld, lars.winterfeld@tu-ilmenau.de - Theoretical Physics II, University of Technology Ilmenau, Germany - - See the README file in the USER-CUDA directory. - - This software is distributed under the GNU General Public License.
-------------------------------------------------------------------------- */ - -__global__ void initfftdata_kernel(double* in,FFT_FLOAT* out) -{ - out[2*(((blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x)+threadIdx.x)]=in[((blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x)+threadIdx.x]; - out[2*(((blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x)+threadIdx.x)+1]=0; -} - - -__global__ void permute_kernel(FFT_FLOAT* in,FFT_FLOAT* out) -{ - out[2*(((threadIdx.x/2)*gridDim.x+blockIdx.x)*gridDim.y+blockIdx.y)+threadIdx.x-2*(threadIdx.x/2)]=in[((blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x)+threadIdx.x]; -} - -__global__ void permute_scale_kernel(FFT_FLOAT* in,FFT_FLOAT* out) -{ - out[2*(((threadIdx.x/2)*gridDim.x+blockIdx.x)*gridDim.y+blockIdx.y)+threadIdx.x-2*(threadIdx.x/2)]=in[((blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x)+threadIdx.x]*gridDim.x*gridDim.y*blockDim.x*0.5; -} - -__global__ void permute_part_kernel(FFT_FLOAT* in,FFT_FLOAT* out,int nfast,int nmid,int nslow,int ihi,int ilo,int jhi,int jlo,int khi,int klo) -{ - {out[2*((threadIdx.x/2)*(ihi-ilo+1)*(jhi-jlo+1)+(blockIdx.x)*(jhi-jlo+1)+blockIdx.y-jlo)+threadIdx.x-2*(threadIdx.x/2)]=in[2*(blockIdx.x+ilo)*nmid*nslow+2*(blockIdx.y+jlo)*nmid+threadIdx.x+2*klo]; } -} diff --git a/lib/cuda/fix_addforce_cuda.cu b/lib/cuda/fix_addforce_cuda.cu deleted file mode 100644 index 33700b44b6..0000000000 --- a/lib/cuda/fix_addforce_cuda.cu +++ /dev/null @@ -1,89 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - - Original Version: - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - See the README file in the top-level LAMMPS directory. - - ----------------------------------------------------------------------- - - USER-CUDA Package and associated modifications: - https://sourceforge.net/projects/lammpscuda/ - - Christian Trott, christian.trott@tu-ilmenau.de - Lars Winterfeld, lars.winterfeld@tu-ilmenau.de - Theoretical Physics II, University of Technology Ilmenau, Germany - - See the README file in the USER-CUDA directory. - - This software is distributed under the GNU General Public License. 
-------------------------------------------------------------------------- */ - -#include <stdio.h> -#define MY_PREFIX fix_add_force_cuda -#include "cuda_shared.h" -#include "cuda_common.h" - -#include "crm_cuda_utils.cu" - -#include "fix_addforce_cuda_cu.h" -#include "fix_addforce_cuda_kernel.cu" - -void Cuda_FixAddForceCuda_UpdateBuffer(cuda_shared_data* sdata) -{ - int3 layout=getgrid(sdata->atom.nlocal,4*sizeof(F_FLOAT)); - dim3 threads(layout.z, 1, 1); - dim3 grid(layout.x, layout.y, 1); - int size=(unsigned)(layout.z*layout.y*layout.x)*4*sizeof(F_FLOAT); - if(sdata->buffersize<size) - { - MYDBG(printf("Resize buffer at %p with %i kB\n",sdata->buffer,sdata->buffersize);) - CudaWrapper_FreeCudaData(sdata->buffer,sdata->buffersize); - sdata->buffer = CudaWrapper_AllocCudaData(size); - sdata->buffersize=size; - sdata->buffer_new++; - MYDBG(printf("New buffer at %p with %i kB\n",sdata->buffer,sdata->buffersize);) - } - cudaMemcpyToSymbol(MY_CONST(buffer), & sdata->buffer, sizeof(int*) ); -} - -void Cuda_FixAddForceCuda_UpdateNmax(cuda_shared_data* sdata) -{ - cudaMemcpyToSymbol(MY_CONST(mask) , & sdata->atom.mask .dev_data, sizeof(int*) ); - cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); - cudaMemcpyToSymbol(MY_CONST(nmax) , & sdata->atom.nmax , sizeof(int) ); - cudaMemcpyToSymbol(MY_CONST(x) , & sdata->atom.x .dev_data, sizeof(X_FLOAT*) ); - cudaMemcpyToSymbol(MY_CONST(f) , & sdata->atom.f .dev_data, sizeof(F_FLOAT*) ); -} - -void Cuda_FixAddForceCuda_Init(cuda_shared_data* sdata) -{ - Cuda_FixAddForceCuda_UpdateNmax(sdata); -} - -void Cuda_FixAddForceCuda_PostForce(cuda_shared_data* sdata, int groupbit,F_FLOAT axvalue,F_FLOAT ayvalue,F_FLOAT azvalue,F_FLOAT* aforiginal) -{ - if(sdata->atom.update_nmax) - Cuda_FixAddForceCuda_UpdateNmax(sdata); - if(sdata->atom.update_nlocal) - cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); - if(sdata->buffer_new) - Cuda_FixAddForceCuda_UpdateBuffer(sdata); - int3 layout=getgrid(sdata->atom.nlocal,4*sizeof(F_FLOAT)); - dim3 threads(layout.z, 1, 1); - dim3 grid(layout.x, layout.y, 1); - - Cuda_FixAddForceCuda_PostForce_Kernel<<<grid, threads, threads.x*4*sizeof(F_FLOAT)>>> (groupbit,axvalue,ayvalue,azvalue); - cudaThreadSynchronize(); - CUT_CHECK_ERROR("Cuda_FixAddForceCuda_PostForce: fix add_force post_force Compute Kernel execution failed"); - - int oldgrid=grid.x; - grid.x=4; - threads.x=512; - reduce_foriginal<<<grid, threads, threads.x*sizeof(F_FLOAT)>>> (oldgrid,aforiginal); - cudaThreadSynchronize(); - CUT_CHECK_ERROR("Cuda_FixAddForceCuda_PostForce: fix add_force post_force Reduce Kernel execution failed"); - -} 
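Cuda_FixAddForceCuda_PostForce above is a two-pass reduction: the compute kernel leaves one partial result per block in sdata->buffer, and a second launch (grid.x=4, 512 threads) folds those partials into the final foriginal values. A minimal self-contained sketch of the same pattern, under the assumption of a power-of-two block size and with generic names in place of the package's F_FLOAT and reduceBlock:

```cpp
#include <cuda_runtime.h>

// Pass 1: each block reduces its slice of the input into one partial sum.
__global__ void partial_sums(const float* in, float* partial, int n)
{
    extern __shared__ float sm[];
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    sm[threadIdx.x] = (i < n) ? in[i] : 0.0f;
    __syncthreads();
    for (int s = blockDim.x / 2; s > 0; s >>= 1) {  // shared-memory tree reduction
        if (threadIdx.x < s) sm[threadIdx.x] += sm[threadIdx.x + s];
        __syncthreads();
    }
    if (threadIdx.x == 0) partial[blockIdx.x] = sm[0];
}

// Pass 2: a single block folds the per-block partials into the result.
__global__ void final_sum(const float* partial, float* out, int nblocks)
{
    extern __shared__ float sm[];
    float acc = 0.0f;
    for (int i = threadIdx.x; i < nblocks; i += blockDim.x) acc += partial[i];
    sm[threadIdx.x] = acc;
    __syncthreads();
    for (int s = blockDim.x / 2; s > 0; s >>= 1) {
        if (threadIdx.x < s) sm[threadIdx.x] += sm[threadIdx.x + s];
        __syncthreads();
    }
    if (threadIdx.x == 0) *out = sm[0];
}

// Host side, mirroring the two launches above:
//   partial_sums<<<nblocks, 256, 256*sizeof(float)>>>(d_in, d_partial, n);
//   final_sum<<<1, 512, 512*sizeof(float)>>>(d_partial, d_out, nblocks);
```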
diff --git a/lib/cuda/fix_addforce_cuda_cu.h b/lib/cuda/fix_addforce_cuda_cu.h deleted file mode 100644 index 8aff462666..0000000000 --- a/lib/cuda/fix_addforce_cuda_cu.h +++ /dev/null @@ -1,27 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - - Original Version: - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - See the README file in the top-level LAMMPS directory. - - ----------------------------------------------------------------------- - - USER-CUDA Package and associated modifications: - https://sourceforge.net/projects/lammpscuda/ - - Christian Trott, christian.trott@tu-ilmenau.de - Lars Winterfeld, lars.winterfeld@tu-ilmenau.de - Theoretical Physics II, University of Technology Ilmenau, Germany - - See the README file in the USER-CUDA directory. - - This software is distributed under the GNU General Public License. -------------------------------------------------------------------------- */ - -#include "cuda_shared.h" - -extern "C" void Cuda_FixAddForceCuda_Init(cuda_shared_data* sdata); -extern "C" void Cuda_FixAddForceCuda_PostForce(cuda_shared_data* sdata, int groupbit,F_FLOAT axvalue,F_FLOAT ayvalue,F_FLOAT azvalue,F_FLOAT* aforiginal); diff --git a/lib/cuda/fix_addforce_cuda_kernel.cu b/lib/cuda/fix_addforce_cuda_kernel.cu deleted file mode 100644 index bbfbdbe35a..0000000000 --- a/lib/cuda/fix_addforce_cuda_kernel.cu +++ /dev/null @@ -1,86 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - - Original Version: - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - See the README file in the top-level LAMMPS directory. - - ----------------------------------------------------------------------- - - USER-CUDA Package and associated modifications: - https://sourceforge.net/projects/lammpscuda/ - - Christian Trott, christian.trott@tu-ilmenau.de - Lars Winterfeld, lars.winterfeld@tu-ilmenau.de - Theoretical Physics II, University of Technology Ilmenau, Germany - - See the README file in the USER-CUDA directory. - - This software is distributed under the GNU General Public License. -------------------------------------------------------------------------- */ - -extern __shared__ F_FLOAT sharedmem[]; - - -__global__ void Cuda_FixAddForceCuda_PostForce_Kernel(int groupbit,F_FLOAT xvalue,F_FLOAT yvalue,F_FLOAT zvalue) -{ - int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x; - - sharedmem[threadIdx.x]=0; - sharedmem[threadIdx.x+blockDim.x]=0; - sharedmem[threadIdx.x+2*blockDim.x]=0; - sharedmem[threadIdx.x+3*blockDim.x]=0; - - if(i < _nlocal) - if (_mask[i] & groupbit) - //if (iregion >= 0 && - //match(x[i][0],x[i][1],x[i][2],iregion)) //currently not supported - { - sharedmem[threadIdx.x]=-xvalue*_x[i] - yvalue*_x[i+1*_nmax] - zvalue*_x[i+2*_nmax]; - sharedmem[threadIdx.x+blockDim.x]=_f[i]; - sharedmem[threadIdx.x+2*blockDim.x]=_f[i+1*_nmax]; - sharedmem[threadIdx.x+3*blockDim.x]=_f[i+2*_nmax]; - _f[i] += xvalue; - _f[i+1*_nmax] += yvalue; - _f[i+2*_nmax] += zvalue; - } - - reduceBlock(sharedmem); - reduceBlock(&sharedmem[blockDim.x]); - reduceBlock(&sharedmem[2*blockDim.x]); - reduceBlock(&sharedmem[3*blockDim.x]); - F_FLOAT* buffer=(F_FLOAT*) _buffer; - if(threadIdx.x==0) - { - buffer[blockIdx.x*gridDim.y+blockIdx.y]=sharedmem[0]; - buffer[blockIdx.x*gridDim.y+blockIdx.y+gridDim.x*gridDim.y]=sharedmem[blockDim.x]; - buffer[blockIdx.x*gridDim.y+blockIdx.y+2*gridDim.x*gridDim.y]=sharedmem[2*blockDim.x]; - buffer[blockIdx.x*gridDim.y+blockIdx.y+3*gridDim.x*gridDim.y]=sharedmem[3*blockDim.x]; - } - -} - - -__global__ void reduce_foriginal(int n,F_FLOAT* foriginal) -{ - int i=0; - sharedmem[threadIdx.x]=0; - F_FLOAT myforig=0.0; - F_FLOAT* buf=(F_FLOAT*) _buffer; - buf=&buf[blockIdx.x*n]; - while(i -#define MY_PREFIX fix_ave_force_cuda -#include "cuda_shared.h" -#include "cuda_common.h" - -#include "crm_cuda_utils.cu" - -#include "fix_aveforce_cuda_cu.h" -#include "fix_aveforce_cuda_kernel.cu" - -void Cuda_FixAveForceCuda_UpdateBuffer(cuda_shared_data* sdata) -{ - int3 layout=getgrid(sdata->atom.nlocal,4*sizeof(F_FLOAT)); - dim3 threads(layout.z, 1, 1); - dim3 grid(layout.x, layout.y, 1); - int size=(unsigned)(layout.z*layout.y*layout.x)*4*sizeof(F_FLOAT); - if(sdata->buffersize<size) - { - MYDBG(printf("Resize buffer at %p with %i kB\n",sdata->buffer,sdata->buffersize);) - 
CudaWrapper_FreeCudaData(sdata->buffer,sdata->buffersize); - sdata->buffer = CudaWrapper_AllocCudaData(size); - sdata->buffersize=size; - sdata->buffer_new++; - MYDBG(printf("New buffer at %p with %i kB\n",sdata->buffer,sdata->buffersize);) - } - cudaMemcpyToSymbol(MY_CONST(buffer), & sdata->buffer, sizeof(int*) ); -} - -void Cuda_FixAveForceCuda_UpdateNmax(cuda_shared_data* sdata) -{ - cudaMemcpyToSymbol(MY_CONST(mask) , & sdata->atom.mask .dev_data, sizeof(int*) ); - cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); - cudaMemcpyToSymbol(MY_CONST(nmax) , & sdata->atom.nmax , sizeof(int) ); - cudaMemcpyToSymbol(MY_CONST(x) , & sdata->atom.x .dev_data, sizeof(X_FLOAT*) ); - cudaMemcpyToSymbol(MY_CONST(f) , & sdata->atom.f .dev_data, sizeof(F_FLOAT*) ); -} - -void Cuda_FixAveForceCuda_Init(cuda_shared_data* sdata) -{ - Cuda_FixAveForceCuda_UpdateNmax(sdata); -} - -void Cuda_FixAveForceCuda_PostForce_FOrg(cuda_shared_data* sdata, int groupbit,F_FLOAT* aforiginal) -{ - if(sdata->atom.update_nmax) - Cuda_FixAveForceCuda_UpdateNmax(sdata); - if(sdata->atom.update_nlocal) - cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); - if(sdata->buffer_new) - Cuda_FixAveForceCuda_UpdateBuffer(sdata); - - int3 layout=getgrid(sdata->atom.nlocal,4*sizeof(F_FLOAT)); - dim3 threads(layout.z, 1, 1); - dim3 grid(layout.x, layout.y, 1); - - - Cuda_FixAveForceCuda_PostForce_FOrg_Kernel<<<grid, threads, threads.x*4*sizeof(F_FLOAT)>>> (groupbit); - cudaThreadSynchronize(); - CUT_CHECK_ERROR("Cuda_FixAveForceCuda_PostForce: fix ave_force post_force Compute Kernel execution failed"); - - int oldgrid=grid.x; - grid.x=4; - threads.x=512; - Cuda_FixAveForceCuda_reduce_foriginal<<<grid, threads, threads.x*sizeof(F_FLOAT)>>> (oldgrid,aforiginal); - cudaThreadSynchronize(); - CUT_CHECK_ERROR("Cuda_FixAveForceCuda_PostForce: fix ave_force post_force Reduce Kernel execution failed"); - -} - -void Cuda_FixAveForceCuda_PostForce_Set(cuda_shared_data* sdata, int groupbit,int xflag,int yflag,int zflag,F_FLOAT axvalue,F_FLOAT ayvalue,F_FLOAT azvalue) -{ - int3 layout=getgrid(sdata->atom.nlocal); - dim3 threads(layout.z, 1, 1); - dim3 grid(layout.x, layout.y, 1); - - - Cuda_FixAveForceCuda_PostForce_Set_Kernel<<<grid, threads>>> (groupbit,xflag,yflag,zflag,axvalue,ayvalue,azvalue); - cudaThreadSynchronize(); - CUT_CHECK_ERROR("Cuda_FixAveForceCuda_PostForce_Set: fix ave_force post_force Compute Kernel execution failed"); - -} diff --git a/lib/cuda/fix_aveforce_cuda_cu.h b/lib/cuda/fix_aveforce_cuda_cu.h deleted file mode 100644 index dd9992d866..0000000000 --- a/lib/cuda/fix_aveforce_cuda_cu.h +++ /dev/null @@ -1,28 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - - Original Version: - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - See the README file in the top-level LAMMPS directory. - - ----------------------------------------------------------------------- - - USER-CUDA Package and associated modifications: - https://sourceforge.net/projects/lammpscuda/ - - Christian Trott, christian.trott@tu-ilmenau.de - Lars Winterfeld, lars.winterfeld@tu-ilmenau.de - Theoretical Physics II, University of Technology Ilmenau, Germany - - See the README file in the USER-CUDA directory. - - This software is distributed under the GNU General Public License. 
-------------------------------------------------------------------------- */ - -#include "cuda_shared.h" - -extern "C" void Cuda_FixAveForceCuda_Init(cuda_shared_data* sdata); -extern "C" void Cuda_FixAveForceCuda_PostForce_FOrg(cuda_shared_data* sdata, int groupbit,F_FLOAT* aforiginal); -extern "C" void Cuda_FixAveForceCuda_PostForce_Set(cuda_shared_data* sdata, int groupbit,int xflag,int yflag,int zflag,F_FLOAT axvalue,F_FLOAT ayvalue,F_FLOAT azvalue); diff --git a/lib/cuda/fix_aveforce_cuda_kernel.cu b/lib/cuda/fix_aveforce_cuda_kernel.cu deleted file mode 100644 index edccee8c4d..0000000000 --- a/lib/cuda/fix_aveforce_cuda_kernel.cu +++ /dev/null @@ -1,87 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - - Original Version: - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - See the README file in the top-level LAMMPS directory. - - ----------------------------------------------------------------------- - - USER-CUDA Package and associated modifications: - https://sourceforge.net/projects/lammpscuda/ - - Christian Trott, christian.trott@tu-ilmenau.de - Lars Winterfeld, lars.winterfeld@tu-ilmenau.de - Theoretical Physics II, University of Technology Ilmenau, Germany - - See the README file in the USER-CUDA directory. - - This software is distributed under the GNU General Public License. -------------------------------------------------------------------------- */ - -extern __shared__ F_FLOAT sharedmem[]; - - -__global__ void Cuda_FixAveForceCuda_PostForce_FOrg_Kernel(int groupbit) -{ - int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x; - sharedmem[threadIdx.x]=0; - sharedmem[threadIdx.x+blockDim.x]=0; - sharedmem[threadIdx.x+2*blockDim.x]=0; - sharedmem[threadIdx.x+3*blockDim.x]=0; - if(i < _nlocal) - if (_mask[i] & groupbit) { - sharedmem[threadIdx.x]=_f[i]; - sharedmem[threadIdx.x+blockDim.x]=_f[i+1*_nmax]; - sharedmem[threadIdx.x+2*blockDim.x]=_f[i+2*_nmax]; - sharedmem[threadIdx.x+3*blockDim.x]=1; - } - reduceBlock(sharedmem); - reduceBlock(&sharedmem[blockDim.x]); - reduceBlock(&sharedmem[2*blockDim.x]); - reduceBlock(&sharedmem[3*blockDim.x]); - F_FLOAT* buffer=(F_FLOAT*) _buffer; - if(threadIdx.x==0) - { - buffer[blockIdx.x*gridDim.y+blockIdx.y]=sharedmem[0]; - buffer[blockIdx.x*gridDim.y+blockIdx.y+gridDim.x*gridDim.y]=sharedmem[blockDim.x]; - buffer[blockIdx.x*gridDim.y+blockIdx.y+2*gridDim.x*gridDim.y]=sharedmem[2*blockDim.x]; - buffer[blockIdx.x*gridDim.y+blockIdx.y+3*gridDim.x*gridDim.y]=sharedmem[3*blockDim.x]; - } -} - - -__global__ void Cuda_FixAveForceCuda_reduce_foriginal(int n,F_FLOAT* foriginal) -{ - int i=0; - sharedmem[threadIdx.x]=0; - F_FLOAT myforig=0.0; - F_FLOAT* buf=(F_FLOAT*) _buffer; - buf=&buf[blockIdx.x*n]; - while(i -#define MY_PREFIX fix_enforce2d_cuda -#include "cuda_shared.h" -#include "cuda_common.h" -#include "crm_cuda_utils.cu" -#include "fix_enforce2d_cuda_cu.h" -#include "fix_enforce2d_cuda_kernel.cu" - -void Cuda_FixEnforce2dCuda_Init(cuda_shared_data* sdata) -{ - cudaMemcpyToSymbol(MY_CONST(mask) , & sdata->atom.mask .dev_data, sizeof(int*) ); - cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); - cudaMemcpyToSymbol(MY_CONST(nmax) , & sdata->atom.nmax , sizeof(int) ); - cudaMemcpyToSymbol(MY_CONST(v) , & sdata->atom.v .dev_data, sizeof(V_FLOAT*) ); - cudaMemcpyToSymbol(MY_CONST(f) , & sdata->atom.f .dev_data, sizeof(F_FLOAT*) ); -} - -void 
Cuda_FixEnforce2dCuda_PostForce(cuda_shared_data* sdata, int groupbit) -{ - if(sdata->atom.update_nmax) - Cuda_FixEnforce2dCuda_Init(sdata); - if(sdata->atom.update_nlocal) - cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); - - int3 layout=getgrid(sdata->atom.nlocal); - dim3 threads(layout.z, 1, 1); - dim3 grid(layout.x, layout.y, 1); - Cuda_FixEnforce2dCuda_PostForce_Kernel<<<grid, threads>>> (groupbit); - cudaThreadSynchronize(); - CUT_CHECK_ERROR("Cuda_FixEnforce2dCuda_PostForce: fix enforce2d post_force Kernel execution failed"); -} diff --git a/lib/cuda/fix_enforce2d_cuda_cu.h b/lib/cuda/fix_enforce2d_cuda_cu.h deleted file mode 100644 index a35fadf806..0000000000 --- a/lib/cuda/fix_enforce2d_cuda_cu.h +++ /dev/null @@ -1,27 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - - Original Version: - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - See the README file in the top-level LAMMPS directory. - - ----------------------------------------------------------------------- - - USER-CUDA Package and associated modifications: - https://sourceforge.net/projects/lammpscuda/ - - Christian Trott, christian.trott@tu-ilmenau.de - Lars Winterfeld, lars.winterfeld@tu-ilmenau.de - Theoretical Physics II, University of Technology Ilmenau, Germany - - See the README file in the USER-CUDA directory. - - This software is distributed under the GNU General Public License. -------------------------------------------------------------------------- */ - -#include "cuda_shared.h" - -extern "C" void Cuda_FixEnforce2dCuda_Init(cuda_shared_data* sdata); -extern "C" void Cuda_FixEnforce2dCuda_PostForce(cuda_shared_data* sdata, int groupbit); diff --git a/lib/cuda/fix_enforce2d_cuda_kernel.cu b/lib/cuda/fix_enforce2d_cuda_kernel.cu deleted file mode 100644 index c07f944901..0000000000 --- a/lib/cuda/fix_enforce2d_cuda_kernel.cu +++ /dev/null @@ -1,33 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - - Original Version: - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - See the README file in the top-level LAMMPS directory. - - ----------------------------------------------------------------------- - - USER-CUDA Package and associated modifications: - https://sourceforge.net/projects/lammpscuda/ - - Christian Trott, christian.trott@tu-ilmenau.de - Lars Winterfeld, lars.winterfeld@tu-ilmenau.de - Theoretical Physics II, University of Technology Ilmenau, Germany - - See the README file in the USER-CUDA directory. - - This software is distributed under the GNU General Public License. 
-------------------------------------------------------------------------- */ - - -__global__ void Cuda_FixEnforce2dCuda_PostForce_Kernel(int groupbit) -{ - int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x; - if(i < _nlocal) - if (_mask[i] & groupbit) { - _v[i+2*_nmax] = V_F(0.0); - _f[i+2*_nmax] = F_F(0.0); - } -} diff --git a/lib/cuda/fix_freeze_cuda.cu b/lib/cuda/fix_freeze_cuda.cu deleted file mode 100644 index ba6fe117ce..0000000000 --- a/lib/cuda/fix_freeze_cuda.cu +++ /dev/null @@ -1,95 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - - Original Version: - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - See the README file in the top-level LAMMPS directory. - - ----------------------------------------------------------------------- - - USER-CUDA Package and associated modifications: - https://sourceforge.net/projects/lammpscuda/ - - Christian Trott, christian.trott@tu-ilmenau.de - Lars Winterfeld, lars.winterfeld@tu-ilmenau.de - Theoretical Physics II, University of Technology Ilmenau, Germany - - See the README file in the USER-CUDA directory. - - This software is distributed under the GNU General Public License. -------------------------------------------------------------------------- */ - -#include <stdio.h> -#define MY_PREFIX fix_freeze_cuda -#include "cuda_shared.h" -#include "cuda_common.h" -#include "crm_cuda_utils.cu" - -#include "fix_freeze_cuda_cu.h" -#include "fix_freeze_cuda_kernel.cu" - -void Cuda_FixFreezeCuda_UpdateBuffer(cuda_shared_data* sdata) -{ - int3 layout=getgrid(sdata->atom.nlocal,3*sizeof(F_FLOAT)); - dim3 threads(layout.z, 1, 1); - dim3 grid(layout.x, layout.y, 1); - int size=(unsigned)(layout.z*layout.y*layout.x)*3*sizeof(F_FLOAT); - if(sdata->buffersize<size) - { - MYDBG(printf("Resize buffer at %p with %i kB\n",sdata->buffer,sdata->buffersize);) - CudaWrapper_FreeCudaData(sdata->buffer,sdata->buffersize); - sdata->buffer = CudaWrapper_AllocCudaData(size); - sdata->buffersize=size; - sdata->buffer_new++; - MYDBG(printf("New buffer at %p with %i kB\n",sdata->buffer,sdata->buffersize);) - - } - cudaMemcpyToSymbol(MY_CONST(buffer) , & sdata->buffer, sizeof(int*) ); -} - -void Cuda_FixFreezeCuda_UpdateNmax(cuda_shared_data* sdata) -{ - cudaMemcpyToSymbol(MY_CONST(mask) , & sdata->atom.mask .dev_data, sizeof(int*) ); - cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); - cudaMemcpyToSymbol(MY_CONST(nmax) , & sdata->atom.nmax , sizeof(int) ); - cudaMemcpyToSymbol(MY_CONST(x) , & sdata->atom.x .dev_data, sizeof(X_FLOAT*) ); - cudaMemcpyToSymbol(MY_CONST(f) , & sdata->atom.f .dev_data, sizeof(F_FLOAT*) ); - cudaMemcpyToSymbol(MY_CONST(torque) , & sdata->atom.torque .dev_data, sizeof(F_FLOAT*) ); -} - - -void Cuda_FixFreezeCuda_Init(cuda_shared_data* sdata) -{ - Cuda_FixFreezeCuda_UpdateNmax(sdata); - -} - - -void Cuda_FixFreezeCuda_PostForce(cuda_shared_data* sdata, int groupbit,F_FLOAT* foriginal) -{ - if(sdata->atom.update_nmax) - Cuda_FixFreezeCuda_UpdateNmax(sdata); - if(sdata->atom.update_nlocal) - cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); - if(sdata->buffer_new) - Cuda_FixFreezeCuda_UpdateBuffer(sdata); - - - int3 layout=getgrid(sdata->atom.nlocal,3*sizeof(F_FLOAT)); - dim3 threads(layout.z, 1, 1); - dim3 grid(layout.x, layout.y, 1); - - Cuda_FixFreezeCuda_PostForce_Kernel<<<grid, threads, threads.x*3*sizeof(F_FLOAT)>>> (groupbit); - cudaThreadSynchronize(); - CUT_CHECK_ERROR("Cuda_FixFreezeCuda_PostForce: fix freeze post_force compute Kernel execution failed"); - - int oldgrid=grid.x; - grid.x=3; - threads.x=512; - Cuda_FixFreezeCuda_Reduce_FOriginal<<<grid, threads, threads.x*sizeof(F_FLOAT)>>> (oldgrid,foriginal); - cudaThreadSynchronize(); - CUT_CHECK_ERROR("Cuda_FixFreezeCuda_PostForce: fix freeze post_force reduce Kernel execution failed"); - -} diff --git a/lib/cuda/fix_freeze_cuda_cu.h b/lib/cuda/fix_freeze_cuda_cu.h deleted file mode 100644 index 2df8743a6a..0000000000 --- a/lib/cuda/fix_freeze_cuda_cu.h +++ /dev/null @@ -1,27 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - - Original Version: - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - See the README file in the top-level LAMMPS directory. - - ----------------------------------------------------------------------- - - USER-CUDA Package and associated modifications: - https://sourceforge.net/projects/lammpscuda/ - - Christian Trott, christian.trott@tu-ilmenau.de - Lars Winterfeld, lars.winterfeld@tu-ilmenau.de - Theoretical Physics II, University of Technology Ilmenau, Germany - - See the README file in the USER-CUDA directory. - - This software is distributed under the GNU General Public License. -------------------------------------------------------------------------- */ - -#include "cuda_shared.h" - -extern "C" void Cuda_FixFreezeCuda_Init(cuda_shared_data* sdata); -extern "C" void Cuda_FixFreezeCuda_PostForce(cuda_shared_data* sdata, int groupbit,F_FLOAT* foriginal); diff --git a/lib/cuda/fix_freeze_cuda_kernel.cu b/lib/cuda/fix_freeze_cuda_kernel.cu deleted file mode 100644 index d6721311b6..0000000000 --- a/lib/cuda/fix_freeze_cuda_kernel.cu +++ /dev/null @@ -1,82 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - - Original Version: - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - See the README file in the top-level LAMMPS directory. - - ----------------------------------------------------------------------- - - USER-CUDA Package and associated modifications: - https://sourceforge.net/projects/lammpscuda/ - - Christian Trott, christian.trott@tu-ilmenau.de - Lars Winterfeld, lars.winterfeld@tu-ilmenau.de - Theoretical Physics II, University of Technology Ilmenau, Germany - - See the README file in the USER-CUDA directory. - - This software is distributed under the GNU General Public License. 
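Every fix in these files carries the same pair of helpers: *_UpdateNmax re-publishes the current device pointers and counts into file-local __constant__ variables (MY_CONST expands to a per-file prefixed name), and it must be rerun whenever LAMMPS reallocates its atom arrays. A reduced sketch of the mechanism, with illustrative names:

```cpp
#include <cuda_runtime.h>

__constant__ float* d_x_const;      // device-constant copy of the position pointer
__constant__ int    d_nlocal_const;

__global__ void touch_positions()
{
    // Kernels read the pointer out of constant memory instead of taking
    // it as an argument, which is why the refresh below is mandatory.
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < d_nlocal_const) d_x_const[i] += 0.0f;
}

void update_symbols(float* d_x, int nlocal)
{
    // Copy the host-side pointer *value* (and the count) into the
    // __constant__ slots; redo this after every reallocation of d_x.
    cudaMemcpyToSymbol(d_x_const, &d_x, sizeof(float*));
    cudaMemcpyToSymbol(d_nlocal_const, &nlocal, sizeof(int));
}
```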
-------------------------------------------------------------------------- */ - -extern __shared__ F_FLOAT sharedmem[]; - - -__global__ void Cuda_FixFreezeCuda_PostForce_Kernel(int groupbit) -{ - int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x; - sharedmem[threadIdx.x]=0; - sharedmem[threadIdx.x+blockDim.x]=0; - sharedmem[threadIdx.x+2*blockDim.x]=0; - if(i < _nlocal) - if (_mask[i] & groupbit) { - sharedmem[threadIdx.x]=_f[i]; - sharedmem[threadIdx.x+blockDim.x]=_f[i+1*_nmax]; - sharedmem[threadIdx.x+2*blockDim.x]=_f[i+2*_nmax]; - - _f[i] = F_F(0.0); - _f[i+1*_nmax] = F_F(0.0); - _f[i+2*_nmax] = F_F(0.0); - _torque[i] = F_F(0.0); - _torque[i+1*_nmax] = F_F(0.0); - _torque[i+2*_nmax] = F_F(0.0); - } - - - reduceBlock(sharedmem); - reduceBlock(&sharedmem[blockDim.x]); - reduceBlock(&sharedmem[2*blockDim.x]); - F_FLOAT* buffer=(F_FLOAT*)_buffer; - if(threadIdx.x==0) - { - buffer[blockIdx.x*gridDim.y+blockIdx.y]=sharedmem[0]; - buffer[blockIdx.x*gridDim.y+blockIdx.y+gridDim.x*gridDim.y]=sharedmem[blockDim.x]; - buffer[blockIdx.x*gridDim.y+blockIdx.y+2*gridDim.x*gridDim.y]=sharedmem[2*blockDim.x]; - } -} - - -__global__ void Cuda_FixFreezeCuda_Reduce_FOriginal(int n,F_FLOAT* foriginal) -{ - int i=0; - sharedmem[threadIdx.x]=0; - F_FLOAT myforig=0.0; - F_FLOAT* buf=(F_FLOAT*)_buffer; - buf=&buf[blockIdx.x*n]; - while(i -#define MY_PREFIX fix_gravity_cuda -#include "cuda_shared.h" -#include "cuda_common.h" -#include "crm_cuda_utils.cu" - -#include "fix_gravity_cuda_cu.h" -#include "fix_gravity_cuda_kernel.cu" - -void Cuda_FixGravityCuda_UpdateBuffer(cuda_shared_data* sdata) -{ - int3 layout=getgrid(sdata->atom.nlocal,3*sizeof(F_FLOAT)); - dim3 threads(layout.z, 1, 1); - dim3 grid(layout.x, layout.y, 1); - int size=(unsigned)(layout.z*layout.y*layout.x)*3*sizeof(F_FLOAT); - if(sdata->buffersize<size) - { - MYDBG(printf("Resize buffer at %p with %i kB\n",sdata->buffer,sdata->buffersize);) - CudaWrapper_FreeCudaData(sdata->buffer,sdata->buffersize); - sdata->buffer = CudaWrapper_AllocCudaData(size); - sdata->buffersize=size; - sdata->buffer_new++; - MYDBG(printf("New buffer at %p with %i kB\n",sdata->buffer,sdata->buffersize);) - - } - cudaMemcpyToSymbol(MY_CONST(buffer) , & sdata->buffer, sizeof(int*) ); -} - -void Cuda_FixGravityCuda_UpdateNmax(cuda_shared_data* sdata) -{ - cudaMemcpyToSymbol(MY_CONST(mask) , & sdata->atom.mask .dev_data, sizeof(int*) ); - cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); - cudaMemcpyToSymbol(MY_CONST(nmax) , & sdata->atom.nmax , sizeof(int) ); - cudaMemcpyToSymbol(MY_CONST(x) , & sdata->atom.x .dev_data, sizeof(X_FLOAT*) ); - cudaMemcpyToSymbol(MY_CONST(f) , & sdata->atom.f .dev_data, sizeof(F_FLOAT*) ); - cudaMemcpyToSymbol(MY_CONST(type) , & sdata->atom.type .dev_data, sizeof(int*) ); - cudaMemcpyToSymbol(MY_CONST(rmass_flag) , & sdata->atom.rmass_flag, sizeof(int) ); - cudaMemcpyToSymbol(MY_CONST(rmass) , & sdata->atom.rmass .dev_data, sizeof(V_FLOAT*) ); - cudaMemcpyToSymbol(MY_CONST(mass) , & sdata->atom.mass .dev_data, sizeof(V_FLOAT*) ); -} - -void Cuda_FixGravityCuda_Init(cuda_shared_data* sdata) -{ - Cuda_FixGravityCuda_UpdateNmax(sdata); - -}
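The gravity kernel further down selects between per-atom and per-type mass exactly as the NH and NVE kernels do: _rmass_flag picks the rmass array when the atom style provides one, otherwise the type-indexed mass table. The lookup in isolation, with simplified names:

```cpp
// One force component shown; the real kernel strides f by nmax for y and z.
__global__ void add_gravity_x(float* fx, const float* rmass,
                              const float* mass_by_type, const int* type,
                              int rmass_flag, float gx, int nlocal)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < nlocal) {
        float m = rmass_flag ? rmass[i] : mass_by_type[type[i]];
        fx[i] += m * gx;  // F = m * g
    }
}
```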
 - - -void Cuda_FixGravityCuda_PostForce(cuda_shared_data* sdata, int groupbit,F_FLOAT xacc,F_FLOAT yacc,F_FLOAT zacc) -{ - if(sdata->atom.update_nmax) - Cuda_FixGravityCuda_UpdateNmax(sdata); - if(sdata->atom.update_nlocal) - cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); - if(sdata->buffer_new) - Cuda_FixGravityCuda_UpdateBuffer(sdata); - - - int3 layout=getgrid(sdata->atom.nlocal); - dim3 threads(layout.z, 1, 1); - dim3 grid(layout.x, layout.y, 1); - - Cuda_FixGravityCuda_PostForce_Kernel<<<grid, threads>>> (groupbit,xacc,yacc,zacc); - cudaThreadSynchronize(); - CUT_CHECK_ERROR("Cuda_FixGravityCuda_PostForce: fix gravity post_force compute Kernel execution failed"); -} diff --git a/lib/cuda/fix_gravity_cuda_cu.h b/lib/cuda/fix_gravity_cuda_cu.h deleted file mode 100644 index d69816bb67..0000000000 --- a/lib/cuda/fix_gravity_cuda_cu.h +++ /dev/null @@ -1,27 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - - Original Version: - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - See the README file in the top-level LAMMPS directory. - - ----------------------------------------------------------------------- - - USER-CUDA Package and associated modifications: - https://sourceforge.net/projects/lammpscuda/ - - Christian Trott, christian.trott@tu-ilmenau.de - Lars Winterfeld, lars.winterfeld@tu-ilmenau.de - Theoretical Physics II, University of Technology Ilmenau, Germany - - See the README file in the USER-CUDA directory. - - This software is distributed under the GNU General Public License. -------------------------------------------------------------------------- */ - -#include "cuda_shared.h" - -extern "C" void Cuda_FixGravityCuda_Init(cuda_shared_data* sdata); -extern "C" void Cuda_FixGravityCuda_PostForce(cuda_shared_data* sdata, int groupbit,F_FLOAT xacc,F_FLOAT yacc,F_FLOAT zacc); diff --git a/lib/cuda/fix_gravity_cuda_kernel.cu b/lib/cuda/fix_gravity_cuda_kernel.cu deleted file mode 100644 index 6a77933acb..0000000000 --- a/lib/cuda/fix_gravity_cuda_kernel.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - - Original Version: - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - See the README file in the top-level LAMMPS directory. - - ----------------------------------------------------------------------- - - USER-CUDA Package and associated modifications: - https://sourceforge.net/projects/lammpscuda/ - - Christian Trott, christian.trott@tu-ilmenau.de - Lars Winterfeld, lars.winterfeld@tu-ilmenau.de - Theoretical Physics II, University of Technology Ilmenau, Germany - - See the README file in the USER-CUDA directory. - - This software is distributed under the GNU General Public License. -------------------------------------------------------------------------- */ - -__global__ void Cuda_FixGravityCuda_PostForce_Kernel(int groupbit,F_FLOAT xacc,F_FLOAT yacc,F_FLOAT zacc) -{ - int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x; - - if(i < _nlocal) - if (_mask[i] & groupbit) { - F_FLOAT mass = _rmass_flag?_rmass[i]:_mass[_type[i]]; - _f[i] += mass*xacc; - _f[i+1*_nmax] += mass*yacc; - _f[i+2*_nmax] += mass*zacc; - } -} - diff --git a/lib/cuda/fix_nh_cuda.cu b/lib/cuda/fix_nh_cuda.cu deleted file mode 100644 index 39abcea94e..0000000000 --- a/lib/cuda/fix_nh_cuda.cu +++ /dev/null @@ -1,220 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - - Original Version: - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - See the README file in the top-level LAMMPS directory. 
- - ----------------------------------------------------------------------- - - USER-CUDA Package and associated modifications: - https://sourceforge.net/projects/lammpscuda/ - - Christian Trott, christian.trott@tu-ilmenau.de - Lars Winterfeld, lars.winterfeld@tu-ilmenau.de - Theoretical Physics II, University of Technology Ilmenau, Germany - - See the README file in the USER-CUDA directory. - - This software is distributed under the GNU General Public License. -------------------------------------------------------------------------- */ - -#include <stdio.h> -#define MY_PREFIX fix_nh_cuda -#define IncludeCommonNeigh -#include "cuda_shared.h" -#include "cuda_common.h" -#include "crm_cuda_utils.cu" -#include "fix_nh_cuda_cu.h" -#include "fix_nh_cuda_kernel.cu" - -void Cuda_FixNHCuda_UpdateNmax(cuda_shared_data* sdata) -{ - cudaMemcpyToSymbol(MY_CONST(f) , & sdata->atom.f .dev_data, sizeof(F_FLOAT*) ); - cudaMemcpyToSymbol(MY_CONST(mask) , & sdata->atom.mask .dev_data, sizeof(int*) ); - cudaMemcpyToSymbol(MY_CONST(tag) , & sdata->atom.tag .dev_data, sizeof(int*) ); - cudaMemcpyToSymbol(MY_CONST(debugdata) , & sdata->debugdata, sizeof(int*) ); - cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); - cudaMemcpyToSymbol(MY_CONST(nmax) , & sdata->atom.nmax , sizeof(int) ); - cudaMemcpyToSymbol(MY_CONST(rmass) , & sdata->atom.rmass.dev_data, sizeof(V_FLOAT*) ); - cudaMemcpyToSymbol(MY_CONST(mass) , & sdata->atom.mass.dev_data, sizeof(V_FLOAT*) ); - cudaMemcpyToSymbol(MY_CONST(type) , & sdata->atom.type .dev_data, sizeof(int*) ); - cudaMemcpyToSymbol(MY_CONST(v) , & sdata->atom.v .dev_data, sizeof(V_FLOAT*) ); - cudaMemcpyToSymbol(MY_CONST(x) , & sdata->atom.x .dev_data, sizeof(X_FLOAT*) ); - cudaMemcpyToSymbol(MY_CONST(xhold) , & sdata->atom.xhold.dev_data, sizeof(X_FLOAT*) ); //might be moved to a neighbor record in sdata - cudaMemcpyToSymbol(MY_CONST(maxhold) , & sdata->atom.maxhold, sizeof(int) ); //might be moved to a neighbor record in sdata - cudaMemcpyToSymbol(MY_CONST(reneigh_flag), & sdata->buffer, sizeof(int*) ); //might be moved to a neighbor record in sdata - cudaMemcpyToSymbol(MY_CONST(triggerneighsq), & sdata->atom.triggerneighsq, sizeof(X_FLOAT)); //might be moved to a neighbor record in sdata -} - -void Cuda_FixNHCuda_UpdateBuffer(cuda_shared_data* sdata) -{ - int size=(unsigned)10*sizeof(int); - if(sdata->buffersize<size) - { - MYDBG(printf("Resize buffer at %p with %i kB\n",sdata->buffer,sdata->buffersize);) - CudaWrapper_FreeCudaData(sdata->buffer,sdata->buffersize); - sdata->buffer = CudaWrapper_AllocCudaData(size); - sdata->buffersize=size; - sdata->buffer_new++; - MYDBG(printf("New buffer at %p with %i kB\n",sdata->buffer,sdata->buffersize);) - - } - cudaMemcpyToSymbol(MY_CONST(buffer) , & sdata->buffer, sizeof(int*) ); - cudaMemcpyToSymbol(MY_CONST(reneigh_flag), & sdata->buffer, sizeof(int*) ); //might be moved to a neighbor record in sdata -} - -void Cuda_FixNHCuda_Init(cuda_shared_data* sdata, X_FLOAT dtv, V_FLOAT dtf) -{ - cudaMemcpyToSymbol(MY_CONST(mass) , & sdata->atom.mass.dev_data , sizeof(V_FLOAT*) ); - cudaMemcpyToSymbol(MY_CONST(dtf) , & dtf , sizeof(V_FLOAT) ); - cudaMemcpyToSymbol(MY_CONST(dtv) , & dtv , sizeof(X_FLOAT) ); - cudaMemcpyToSymbol(MY_CONST(triggerneighsq), &sdata->atom.triggerneighsq, sizeof(X_FLOAT) ); - cudaMemcpyToSymbol(MY_CONST(dist_check), & sdata->atom.dist_check , sizeof(int) ); - cudaMemcpyToSymbol(MY_CONST(rmass_flag), & sdata->atom.rmass_flag , sizeof(int) ); // - Cuda_FixNHCuda_UpdateNmax(sdata); -}
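The entry points below bracket their nmax/nlocal refresh between two clock_gettime(CLOCK_REALTIME, ...) samples and accumulate the elapsed wall time into sdata->cuda_timings.test1. The bookkeeping in isolation (the accumulator here is a stand-in for that field):

```cpp
#include <time.h>

static double cuda_timings_test1 = 0.0;  // stands in for sdata->cuda_timings.test1

void timed_refresh()
{
    timespec t1, t2;
    clock_gettime(CLOCK_REALTIME, &t1);
    /* ... the work being measured ... */
    clock_gettime(CLOCK_REALTIME, &t2);
    // Seconds plus nanoseconds, accumulated as a double, as in the code below.
    cuda_timings_test1 += (t2.tv_sec - t1.tv_sec)
                        + 1.0e-9 * (t2.tv_nsec - t1.tv_nsec);
}
```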
 - - -void Cuda_FixNHCuda_nh_v_press(cuda_shared_data* sdata, int groupbit, double* factor_h,int mynlocal,int p_triclinic)//mynlocal can be nfirst if firstgroup==igroup see cpp -{ - timespec atime1,atime2; - clock_gettime(CLOCK_REALTIME,&atime1); - if(sdata->atom.update_nmax) - Cuda_FixNHCuda_UpdateNmax(sdata); - if(sdata->atom.update_nlocal) - cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int)); - clock_gettime(CLOCK_REALTIME,&atime2); - sdata->cuda_timings.test1+= - atime2.tv_sec-atime1.tv_sec+1.0*(atime2.tv_nsec-atime1.tv_nsec)/1000000000; - if(sdata->buffer_new) - Cuda_FixNHCuda_UpdateBuffer(sdata); - F_FLOAT3 factor = {factor_h[0],factor_h[1],factor_h[2]}; - F_FLOAT3 factor2; - if(p_triclinic) {factor2.x=factor_h[3];factor2.y=factor_h[4];factor2.z=factor_h[5];} - - int3 layout=getgrid(mynlocal); - dim3 threads(layout.z, 1, 1); - dim3 grid(layout.x, layout.y, 1); - FixNHCuda_nh_v_press_Kernel<<<grid, threads>>> (groupbit,factor,p_triclinic,factor2); - cudaThreadSynchronize(); - CUT_CHECK_ERROR("FixNHCuda: fix nh v_press Kernel execution failed"); - -} - -void Cuda_FixNHCuda_nh_v_press_and_nve_v_NoBias(cuda_shared_data* sdata, int groupbit, double* factor_h,int mynlocal,int p_triclinic)//mynlocal can be nfirst if firstgroup==igroup see cpp -{ - if(sdata->atom.update_nmax) - Cuda_FixNHCuda_UpdateNmax(sdata); - if(sdata->atom.update_nlocal) - cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int)); - if(sdata->buffer_new) - Cuda_FixNHCuda_UpdateBuffer(sdata); - F_FLOAT3 factor = {factor_h[0],factor_h[1],factor_h[2]}; - F_FLOAT3 factor2; - if(p_triclinic) {factor2.x=factor_h[3];factor2.y=factor_h[4];factor2.z=factor_h[5];} - - int3 layout=getgrid(mynlocal); - dim3 threads(layout.z, 1, 1); - dim3 grid(layout.x, layout.y, 1); - CUT_CHECK_ERROR("FixNHCuda: fix nh v_press pre Kernel execution failed"); - FixNHCuda_nh_v_press_and_nve_v_NoBias_Kernel<<<grid, threads>>> (groupbit,factor,p_triclinic,factor2); - cudaThreadSynchronize(); - CUT_CHECK_ERROR("FixNHCuda: fix nh v_press Kernel execution failed"); - -} - -void Cuda_FixNHCuda_nh_v_temp(cuda_shared_data* sdata, int groupbit, F_FLOAT factor_eta,int mynlocal)//mynlocal can be nfirst if firstgroup==igroup see cpp -{ - timespec atime1,atime2; - clock_gettime(CLOCK_REALTIME,&atime1); - if(sdata->atom.update_nmax) - Cuda_FixNHCuda_UpdateNmax(sdata); - if(sdata->atom.update_nlocal) - cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int)); - clock_gettime(CLOCK_REALTIME,&atime2); - sdata->cuda_timings.test1+= - atime2.tv_sec-atime1.tv_sec+1.0*(atime2.tv_nsec-atime1.tv_nsec)/1000000000; - if(sdata->buffer_new) - Cuda_FixNHCuda_UpdateBuffer(sdata); - - int3 layout=getgrid(mynlocal); - dim3 threads(layout.z, 1, 1); - dim3 grid(layout.x, layout.y, 1); - FixNHCuda_nh_v_temp_Kernel<<<grid, threads>>> (groupbit,factor_eta); - cudaThreadSynchronize(); - CUT_CHECK_ERROR("FixNHCuda: fix nh v_temp Kernel execution failed"); - -} -void Cuda_FixNHCuda_nve_v(cuda_shared_data* sdata, int groupbit,int mynlocal)//mynlocal can be nfirst if firstgroup==igroup see cpp -{ - timespec atime1,atime2; - clock_gettime(CLOCK_REALTIME,&atime1); - if(sdata->atom.update_nmax) - Cuda_FixNHCuda_UpdateNmax(sdata); - if(sdata->atom.update_nlocal) - cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int)); - clock_gettime(CLOCK_REALTIME,&atime2); - sdata->cuda_timings.test1+= - atime2.tv_sec-atime1.tv_sec+1.0*(atime2.tv_nsec-atime1.tv_nsec)/1000000000; - if(sdata->buffer_new) - Cuda_FixNHCuda_UpdateBuffer(sdata); - - int3 layout=getgrid(mynlocal); - dim3 threads(layout.z, 1, 1); - dim3 grid(layout.x, layout.y, 1); - FixNHCuda_nve_v_Kernel<<<grid, threads>>> (groupbit); - cudaThreadSynchronize(); - CUT_CHECK_ERROR("FixNHCuda: nve_v Kernel execution failed"); -}
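All of these kernels assume the package's structure-of-arrays layout: component d of atom i lives at array[i + d*_nmax], so a per-atom pointer can be walked across components with "ptr += _nmax", as in the FixNHCuda kernels further down. The idiom reduced to a minimal kernel:

```cpp
// Structure-of-arrays: v = [x0..x(nmax-1) | y0.. | z0..]; stride nmax per component.
__global__ void scale_velocities(float* v, int nmax, int nlocal, float s)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < nlocal) {
        float* my_v = v + i;
        *my_v *= s; my_v += nmax;  // x component
        *my_v *= s; my_v += nmax;  // y component
        *my_v *= s;                // z component
    }
}
```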
 - - -void Cuda_FixNHCuda_nve_x(cuda_shared_data* sdata, int groupbit,int mynlocal)//mynlocal can be nfirst if firstgroup==igroup see cpp -{ - timespec atime1,atime2; - clock_gettime(CLOCK_REALTIME,&atime1); - if(sdata->atom.update_nmax) - Cuda_FixNHCuda_UpdateNmax(sdata); - if(sdata->atom.update_nlocal) - cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int)); - clock_gettime(CLOCK_REALTIME,&atime2); - sdata->cuda_timings.test1+= - atime2.tv_sec-atime1.tv_sec+1.0*(atime2.tv_nsec-atime1.tv_nsec)/1000000000; - if(sdata->buffer_new) - Cuda_FixNHCuda_UpdateBuffer(sdata); - - int3 layout=getgrid(mynlocal); - dim3 threads(layout.z, 1, 1); - dim3 grid(layout.x, layout.y, 1); - cudaMemset(sdata->buffer,0,sizeof(int)); - FixNHCuda_nve_x_Kernel<<<grid, threads>>> (groupbit); - cudaThreadSynchronize(); - int reneigh_flag; - cudaMemcpy((void*) (&reneigh_flag), sdata->buffer, sizeof(int),cudaMemcpyDeviceToHost); - sdata->atom.reneigh_flag+=reneigh_flag; - CUT_CHECK_ERROR("FixNHCuda: nve_x Kernel execution failed"); -} - -void Cuda_FixNHCuda_nve_v_and_nh_v_press_NoBias(cuda_shared_data* sdata,int groupbit, double* factor_h, int mynlocal,int p_triclinic)//mynlocal can be nfirst if firstgroup==igroup see cpp -{ - if(sdata->atom.update_nmax) - Cuda_FixNHCuda_UpdateNmax(sdata); - if(sdata->atom.update_nlocal) - cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int)); - if(sdata->buffer_new) - Cuda_FixNHCuda_UpdateBuffer(sdata); - - F_FLOAT3 factor = {factor_h[0],factor_h[1],factor_h[2]}; - F_FLOAT3 factor2; - if(p_triclinic) {factor2.x=factor_h[3];factor2.y=factor_h[4];factor2.z=factor_h[5];} - - int3 layout=getgrid(mynlocal); - dim3 threads(layout.z, 1, 1); - dim3 grid(layout.x, layout.y, 1); - FixNHCuda_nve_v_and_nh_v_press_NoBias_Kernel<<<grid, threads>>> (groupbit,factor,p_triclinic,factor2); - cudaThreadSynchronize(); - CUT_CHECK_ERROR("FixNHCuda__nve_v_and_nh_v_press_NoBias: Kernel execution failed"); -} - diff --git a/lib/cuda/fix_nh_cuda_cu.h b/lib/cuda/fix_nh_cuda_cu.h deleted file mode 100644 index e6ba4e08bd..0000000000 --- a/lib/cuda/fix_nh_cuda_cu.h +++ /dev/null @@ -1,32 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - - Original Version: - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - See the README file in the top-level LAMMPS directory. - - ----------------------------------------------------------------------- - - USER-CUDA Package and associated modifications: - https://sourceforge.net/projects/lammpscuda/ - - Christian Trott, christian.trott@tu-ilmenau.de - Lars Winterfeld, lars.winterfeld@tu-ilmenau.de - Theoretical Physics II, University of Technology Ilmenau, Germany - - See the README file in the USER-CUDA directory. - - This software is distributed under the GNU General Public License. 
-------------------------------------------------------------------------- */ - -#include "cuda_shared.h" - -extern "C" void Cuda_FixNHCuda_Init(cuda_shared_data* sdata, X_FLOAT dtv, V_FLOAT dtf); -extern "C" void Cuda_FixNHCuda_nh_v_press(cuda_shared_data* sdata, int groupbit, double* factor_h,int mynlocal,int p_triclinic);//mynlocal can be nfirst if firstgroup==igroup see cpp -extern "C" void Cuda_FixNHCuda_nh_v_temp(cuda_shared_data* sdata, int groupbit, F_FLOAT factor_eta,int mynlocal);//mynlocal can be nfirst if firstgroup==igroup see cpp -extern "C" void Cuda_FixNHCuda_nh_v_press_and_nve_v_NoBias(cuda_shared_data* sdata, int groupbit, double* factor_h,int mynlocal,int p_triclinic);//mynlocal can be nfirst if firstgroup==igroup see cpp -extern "C" void Cuda_FixNHCuda_nve_v(cuda_shared_data* sdata, int groupbit,int mynlocal);//mynlocal can be nfirst if firstgroup==igroup see cpp -extern "C" void Cuda_FixNHCuda_nve_x(cuda_shared_data* sdata, int groupbit,int mynlocal);//mynlocal can be nfirst if firstgroup==igroup see cpp -extern "C" void Cuda_FixNHCuda_nve_v_and_nh_v_press_NoBias(cuda_shared_data* sdata,int groupbit, double* factor_h, int mynlocal,int p_triclinic);//mynlocal can be nfirst if firstgroup==igroup see cpp diff --git a/lib/cuda/fix_nh_cuda_kernel.cu b/lib/cuda/fix_nh_cuda_kernel.cu deleted file mode 100644 index a6a3a52a87..0000000000 --- a/lib/cuda/fix_nh_cuda_kernel.cu +++ /dev/null @@ -1,187 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - - Original Version: - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - See the README file in the top-level LAMMPS directory. - - ----------------------------------------------------------------------- - - USER-CUDA Package and associated modifications: - https://sourceforge.net/projects/lammpscuda/ - - Christian Trott, christian.trott@tu-ilmenau.de - Lars Winterfeld, lars.winterfeld@tu-ilmenau.de - Theoretical Physics II, University of Technology Ilmenau, Germany - - See the README file in the USER-CUDA directory. - - This software is distributed under the GNU General Public License. 
-------------------------------------------------------------------------- */ - -static inline __device__ void check_distance(X_FLOAT &xtmp,X_FLOAT &ytmp,X_FLOAT &ztmp,int &i,int groupbit) -{ - if(_dist_check) - { - - X_FLOAT d=X_F(0.0); - if(i<_nlocal) - { - X_FLOAT tmp=xtmp-_xhold[i]; - d=tmp*tmp; - tmp=ytmp-_xhold[i+_maxhold]; - d+=tmp*tmp; - tmp=ztmp-_xhold[i+2*_maxhold]; - d+=tmp*tmp; - - d=((_mask[i] & groupbit))?d:X_F(0.0); - } - if(not __all(d<=_triggerneighsq)) - _reneigh_flag[0]=1; - } -} - -__global__ void FixNHCuda_nh_v_press_Kernel(int groupbit, F_FLOAT3 factor,int p_triclinic,F_FLOAT3 factor2) -{ - int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x; - if(i < _nlocal && _mask[i] & groupbit) - { - V_FLOAT* my_v = _v + i; - V_FLOAT vx=my_v[0]; - V_FLOAT vy=my_v[_nmax]; - V_FLOAT vz=my_v[2*_nmax]; - vx*=factor.x; - vy*=factor.y; - vz*=factor.z; - if(p_triclinic) { - vx += vy*factor2.z + vz*factor2.y; - vy += vz*factor2.x; - } - vx*=factor.x; - vy*=factor.y; - vz*=factor.z; - my_v[0] = vx; - my_v[_nmax] = vy; - my_v[2*_nmax] = vz; - } - -} - -__global__ void FixNHCuda_nh_v_temp_Kernel(int groupbit, F_FLOAT factor_eta) -{ - int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x; - if(i < _nlocal && _mask[i] & groupbit) - { - V_FLOAT* my_v = _v + i; - my_v[0]*=factor_eta; - my_v[_nmax]*=factor_eta; - my_v[2*_nmax]*=factor_eta; - } - -} - -__global__ void FixNHCuda_nh_v_press_and_nve_v_NoBias_Kernel(int groupbit, F_FLOAT3 factor,int p_triclinic,F_FLOAT3 factor2) -{ - int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x; - if(i < _nlocal && _mask[i] & groupbit) - { - F_FLOAT* my_f = _f + i; - V_FLOAT* my_v = _v + i; - - V_FLOAT dtfm = _dtf; - if(_rmass_flag) dtfm*= V_F(1.0) / _rmass[i]; - else dtfm*= V_F(1.0) / _mass[_type[i]]; - - V_FLOAT vx=my_v[0]; - V_FLOAT vy=my_v[_nmax]; - V_FLOAT vz=my_v[2*_nmax]; - vx*=factor.x; - vy*=factor.y; - vz*=factor.z; - if(p_triclinic) { - vx += vy*factor2.z + vz*factor2.y; - vy += vz*factor2.x; - } - vx*=factor.x; - vy*=factor.y; - vz*=factor.z; - my_v[0] = vx + dtfm * my_f[0]; - my_v[_nmax] = vy + dtfm * my_f[_nmax]; - my_v[2*_nmax] = vz + dtfm * my_f[_nmax*2]; - } - -} - -__global__ void FixNHCuda_nve_v_Kernel(int groupbit) -{ - - int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x; - if(i < _nlocal && _mask[i] & groupbit) - { - F_FLOAT* my_f = _f + i; - V_FLOAT* my_v = _v + i; - - V_FLOAT dtfm = _dtf; - if(_rmass_flag) dtfm*=V_F(1.0) / _rmass[i]; - else dtfm*=V_F(1.0) / _mass[_type[i]]; - - *my_v = (*my_v + dtfm*(*my_f)); my_f += _nmax; my_v += _nmax; - *my_v = (*my_v + dtfm*(*my_f)); my_f += _nmax; my_v += _nmax; - *my_v = (*my_v + dtfm*(*my_f)); - } -} - -__global__ void FixNHCuda_nve_x_Kernel(int groupbit) -{ - X_FLOAT xtmp,ytmp,ztmp; - - int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x; - if(i < _nlocal && _mask[i] & groupbit) - { - V_FLOAT* my_v = _v + i; - X_FLOAT* my_x = _x + i; - - xtmp = *my_x += _dtv * *my_v; my_v += _nmax; my_x += _nmax; - ytmp = *my_x += _dtv * *my_v; my_v += _nmax; my_x += _nmax; - ztmp = *my_x += _dtv * *my_v; - } - check_distance(xtmp,ytmp,ztmp,i,groupbit); -} - - -__global__ void FixNHCuda_nve_v_and_nh_v_press_NoBias_Kernel(int groupbit, F_FLOAT3 factor,int p_triclinic,F_FLOAT3 factor2) -{ - - int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x; - if(i < _nlocal && _mask[i] & groupbit) - { - F_FLOAT* my_f = _f + i; - V_FLOAT* my_v = _v + i; - - V_FLOAT dtfm = _dtf; - if(_rmass_flag) dtfm*=V_F(1.0) / _rmass[i]; - else dtfm*=V_F(1.0) / _mass[_type[i]]; 
- - V_FLOAT vx = my_v[0] + dtfm*my_f[0]; - V_FLOAT vy = my_v[_nmax] + dtfm*my_f[_nmax]; - V_FLOAT vz = my_v[2*_nmax] + dtfm*my_f[2*_nmax]; - - vx*=factor.x; - vy*=factor.y; - vz*=factor.z; - if(p_triclinic) { - vx += vy*factor2.z + vz*factor2.y; - vy += vz*factor2.x; - } - vx*=factor.x; - vy*=factor.y; - vz*=factor.z; - my_v[0] = vx; - my_v[_nmax] = vy; - my_v[2*_nmax] = vz; - - } -} - 
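Read off FixNHCuda_nh_v_press_Kernel above, the barostat update is scale, triclinic cross-coupling, scale again; with f = factor and g = factor2 (the g terms present only when p_triclinic is set) it composes to:

```latex
\begin{aligned}
v_x &\leftarrow f_x\,(f_x v_x + g_z f_y v_y + g_y f_z v_z)\\
v_y &\leftarrow f_y\,(f_y v_y + g_x f_z v_z)\\
v_z &\leftarrow f_z^{2}\, v_z
\end{aligned}
```

so in the orthogonal case each velocity component is simply scaled by the square of its factor.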
diff --git a/lib/cuda/fix_nve_cuda.cu b/lib/cuda/fix_nve_cuda.cu deleted file mode 100644 index 8154359ad4..0000000000 --- a/lib/cuda/fix_nve_cuda.cu +++ /dev/null @@ -1,161 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - - Original Version: - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - See the README file in the top-level LAMMPS directory. - - ----------------------------------------------------------------------- - - USER-CUDA Package and associated modifications: - https://sourceforge.net/projects/lammpscuda/ - - Christian Trott, christian.trott@tu-ilmenau.de - Lars Winterfeld, lars.winterfeld@tu-ilmenau.de - Theoretical Physics II, University of Technology Ilmenau, Germany - - See the README file in the USER-CUDA directory. - - This software is distributed under the GNU General Public License. -------------------------------------------------------------------------- */ - -#include <stdio.h> -#define MY_PREFIX fix_nve_cuda -#define IncludeCommonNeigh -#include "cuda_shared.h" -#include "cuda_common.h" -#include "crm_cuda_utils.cu" -#include "fix_nve_cuda_cu.h" -#include "fix_nve_cuda_kernel.cu" - -void Cuda_FixNVECuda_UpdateNmax(cuda_shared_data* sdata) -{ - #ifdef CUDA_USE_BINNING - - - cudaMemcpyToSymbol(MY_CONST(bin_count_all) , & sdata->atom.bin_count_all .dev_data, sizeof(unsigned*)); - cudaMemcpyToSymbol(MY_CONST(bin_count_local), & sdata->atom.bin_count_local.dev_data, sizeof(unsigned*)); - cudaMemcpyToSymbol(MY_CONST(bin_dim) , sdata->domain.bin_dim , sizeof(int)*3 ); - cudaMemcpyToSymbol(MY_CONST(binned_f) , & sdata->atom.binned_f .dev_data, sizeof(F_FLOAT*) ); - cudaMemcpyToSymbol(MY_CONST(binned_type) , & sdata->atom.binned_type .dev_data, sizeof(int*) ); - cudaMemcpyToSymbol(MY_CONST(binned_v) , & sdata->atom.binned_v .dev_data, sizeof(V_FLOAT*) ); - cudaMemcpyToSymbol(MY_CONST(binned_x) , & sdata->atom.binned_x .dev_data, sizeof(X_FLOAT*) ); - cudaMemcpyToSymbol(MY_CONST(binned_rmass) , & sdata->atom.binned_rmass .dev_data, sizeof(V_FLOAT*) ); - cudaMemcpyToSymbol(MY_CONST(mask) , & sdata->atom.mask .dev_data, sizeof(int*) ); - cudaMemcpyToSymbol(MY_CONST(rmass) , & sdata->atom.rmass .dev_data, sizeof(V_FLOAT*) ); - - } - - #else - - cudaMemcpyToSymbol(MY_CONST(f) , & sdata->atom.f .dev_data, sizeof(F_FLOAT*) ); - cudaMemcpyToSymbol(MY_CONST(mask) , & sdata->atom.mask .dev_data, sizeof(int*) ); - cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); - cudaMemcpyToSymbol(MY_CONST(nmax) , & sdata->atom.nmax , sizeof(int) ); - cudaMemcpyToSymbol(MY_CONST(rmass) , & sdata->atom.rmass.dev_data, sizeof(V_FLOAT*) ); - cudaMemcpyToSymbol(MY_CONST(mass) , & sdata->atom.mass.dev_data , sizeof(V_FLOAT*) ); - cudaMemcpyToSymbol(MY_CONST(type) , & sdata->atom.type .dev_data, sizeof(int*) ); - cudaMemcpyToSymbol(MY_CONST(v) , & sdata->atom.v .dev_data, sizeof(V_FLOAT*) ); - cudaMemcpyToSymbol(MY_CONST(x) , & sdata->atom.x .dev_data, sizeof(X_FLOAT*) ); - cudaMemcpyToSymbol(MY_CONST(xhold) , & sdata->atom.xhold.dev_data, sizeof(X_FLOAT*) ); //might be moved to a neighbor record in sdata - cudaMemcpyToSymbol(MY_CONST(maxhold) , & sdata->atom.maxhold, sizeof(int) ); //might be moved to a neighbor record in sdata - cudaMemcpyToSymbol(MY_CONST(reneigh_flag), & sdata->buffer, sizeof(int*) ); //might be moved to a neighbor record in sdata - cudaMemcpyToSymbol(MY_CONST(triggerneighsq), & sdata->atom.triggerneighsq, sizeof(X_FLOAT)); //might be moved to a neighbor record in sdata - - #endif -} - -void Cuda_FixNVECuda_UpdateBuffer(cuda_shared_data* sdata) -{ - int size=(unsigned)10*sizeof(int); - if(sdata->buffersize<size) - { - MYDBG(printf("Resize buffer at %p with %i kB\n",sdata->buffer,sdata->buffersize);) - CudaWrapper_FreeCudaData(sdata->buffer,sdata->buffersize); - sdata->buffer = CudaWrapper_AllocCudaData(size); - sdata->buffersize=size; - sdata->buffer_new++; - MYDBG(printf("New buffer at %p with %i kB\n",sdata->buffer,sdata->buffersize);) - - } - cudaMemcpyToSymbol(MY_CONST(buffer) , & sdata->buffer, sizeof(int*) ); - cudaMemcpyToSymbol(MY_CONST(reneigh_flag), & sdata->buffer, sizeof(int*) ); //might be moved to a neighbor record in sdata -} - -void Cuda_FixNVECuda_Init(cuda_shared_data* sdata, X_FLOAT dtv, V_FLOAT dtf) -{ - cudaMemcpyToSymbol(MY_CONST(mass) , & sdata->atom.mass.dev_data , sizeof(V_FLOAT*) ); - cudaMemcpyToSymbol(MY_CONST(dtf) , & dtf , sizeof(V_FLOAT) ); - cudaMemcpyToSymbol(MY_CONST(dtv) , & dtv , sizeof(X_FLOAT) ); - cudaMemcpyToSymbol(MY_CONST(triggerneighsq), &sdata->atom.triggerneighsq, sizeof(X_FLOAT) ); - cudaMemcpyToSymbol(MY_CONST(dist_check), & sdata->atom.dist_check , sizeof(int) ); - cudaMemcpyToSymbol(MY_CONST(rmass_flag), & sdata->atom.rmass_flag , sizeof(int) ); // - Cuda_FixNVECuda_UpdateNmax(sdata); -} - - -void Cuda_FixNVECuda_InitialIntegrate(cuda_shared_data* sdata, int groupbit, int mynlocal)//mynlocal can be nfirst if firstgroup==igroup see cpp -{ - if(sdata->atom.update_nmax) - Cuda_FixNVECuda_UpdateNmax(sdata); - if(sdata->atom.update_nlocal) - cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int)); - if(sdata->buffer_new) - Cuda_FixNVECuda_UpdateBuffer(sdata); - - #ifdef CUDA_USE_BINNING - - dim3 grid(sdata->domain.bin_dim[0], sdata->domain.bin_dim[1] * sdata->domain.bin_neighbors[2], 1); - dim3 threads(sdata->domain.bin_nmax, 1, 1); - FixNVECuda_InitialIntegrate_N_Kernel<<<grid, threads>>> (groupbit); - cudaThreadSynchronize(); - CUT_CHECK_ERROR("Cuda_FixNVECuda_InitialIntegrate_N: fix nve initial integrate (binning) Kernel execution failed"); - - #else - - int3 layout=getgrid(mynlocal); - dim3 threads(layout.z, 1, 1); - dim3 grid(layout.x, layout.y, 1); - cudaMemset(sdata->buffer,0,sizeof(int)); - FixNVECuda_InitialIntegrate_Kernel<<<grid, threads>>> (groupbit); - cudaThreadSynchronize(); - int reneigh_flag; - cudaMemcpy((void*) (&reneigh_flag), sdata->buffer, sizeof(int),cudaMemcpyDeviceToHost); - sdata->atom.reneigh_flag+=reneigh_flag; - CUT_CHECK_ERROR("Cuda_FixNVECuda_InitialIntegrate_N: fix nve initial integrate Kernel execution failed"); - - #endif - -} - -void Cuda_FixNVECuda_FinalIntegrate(cuda_shared_data* sdata, int groupbit, int mynlocal)//mynlocal can be nfirst if firstgroup==igroup see cpp -{ - if(sdata->atom.update_nmax) - Cuda_FixNVECuda_UpdateNmax(sdata); - if(sdata->atom.update_nlocal) - cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int)); - if(sdata->buffer_new) - Cuda_FixNVECuda_UpdateBuffer(sdata); - - #ifdef CUDA_USE_BINNING - - dim3 grid(sdata->domain.bin_dim[0], sdata->domain.bin_dim[1] * sdata->domain.bin_neighbors[2], 1); - dim3 threads(sdata->domain.bin_nmax, 1, 1); - FixNVECuda_FinalIntegrate_Kernel<<<grid, threads>>> 
(groupbit); - cudaThreadSynchronize(); - CUT_CHECK_ERROR("Cuda_FixNVECuda_FinalIntegrate: fix nve final integrate (binning) Kernel execution failed"); - - #else - - int3 layout=getgrid(mynlocal); - dim3 threads(layout.z, 1, 1); - dim3 grid(layout.x, layout.y, 1); - FixNVECuda_FinalIntegrate_Kernel<<<grid, threads>>> (groupbit); - cudaThreadSynchronize(); - CUT_CHECK_ERROR("Cuda_FixNVECuda_FinalIntegrate: fix nve final integrate Kernel execution failed"); - - #endif -} - diff --git a/lib/cuda/fix_nve_cuda_cu.h b/lib/cuda/fix_nve_cuda_cu.h deleted file mode 100644 index 93cabe8d8b..0000000000 --- a/lib/cuda/fix_nve_cuda_cu.h +++ /dev/null @@ -1,28 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - - Original Version: - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - See the README file in the top-level LAMMPS directory. - - ----------------------------------------------------------------------- - - USER-CUDA Package and associated modifications: - https://sourceforge.net/projects/lammpscuda/ - - Christian Trott, christian.trott@tu-ilmenau.de - Lars Winterfeld, lars.winterfeld@tu-ilmenau.de - Theoretical Physics II, University of Technology Ilmenau, Germany - - See the README file in the USER-CUDA directory. - - This software is distributed under the GNU General Public License. -------------------------------------------------------------------------- */ - -#include "cuda_shared.h" - -extern "C" void Cuda_FixNVECuda_Init(cuda_shared_data* sdata, X_FLOAT dtv, V_FLOAT dtf); -extern "C" void Cuda_FixNVECuda_InitialIntegrate(cuda_shared_data* sdata, int groupbit, int mynlocal); -extern "C" void Cuda_FixNVECuda_FinalIntegrate(cuda_shared_data* sdata, int groupbit, int mynlocal); 
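The kernels in the next file implement the two halves of velocity Verlet: InitialIntegrate applies a half kick plus a drift, FinalIntegrate the closing half kick, with dtf = dt/2 (in force units) and dtv = dt copied to constant memory by Cuda_FixNVECuda_Init above. A condensed sketch with simplified, flat-argument names:

```cpp
// Half kick + drift (initial integrate). The real kernels walk raw pointers
// with stride nmax; a loop over the three components is used here instead.
__global__ void initial_integrate(float* x, float* v, const float* f,
                                  const float* mass_by_type, const int* type,
                                  float dtv, float dtf, int nmax, int nlocal)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < nlocal) {
        float dtfm = dtf / mass_by_type[type[i]];  // per-atom rmass omitted here
        for (int d = 0; d < 3; ++d) {
            v[i + d*nmax] += dtfm * f[i + d*nmax];  // v(t+dt/2)
            x[i + d*nmax] += dtv  * v[i + d*nmax];  // x(t+dt)
        }
    }
}

// Closing half kick (final integrate) after new forces are computed.
__global__ void final_integrate(float* v, const float* f,
                                const float* mass_by_type, const int* type,
                                float dtf, int nmax, int nlocal)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < nlocal) {
        float dtfm = dtf / mass_by_type[type[i]];
        for (int d = 0; d < 3; ++d)
            v[i + d*nmax] += dtfm * f[i + d*nmax];  // v(t+dt)
    }
}
```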
-------------------------------------------------------------------------- */ - -static inline __device__ void check_distance(X_FLOAT &xtmp,X_FLOAT &ytmp,X_FLOAT &ztmp,int &i,int groupbit) -{ - if(_dist_check) - { - X_FLOAT tmp=xtmp-_xhold[i]; - X_FLOAT d=tmp*tmp; - tmp=ytmp-_xhold[i+_maxhold]; - d+=tmp*tmp; - tmp=ztmp-_xhold[i+2*_maxhold]; - d+=tmp*tmp; - - d=((i < _nlocal) && (_mask[i] & groupbit))?d:X_F(0.0); - - if(not __all(d<=_triggerneighsq)) - _reneigh_flag[0]=1; - } -} - - -__global__ void FixNVECuda_InitialIntegrate_Kernel(int groupbit) -{ - X_FLOAT xtmp,ytmp,ztmp; - #ifdef CUDA_USE_BINNING - - const unsigned bin = gridDim.y * blockIdx.x + blockIdx.y; - if(threadIdx.x < _bin_count_local[bin]) - { - const int i = 3*blockDim.x * bin + threadIdx.x; - if(_mask[i] & groupbit) - { - F_FLOAT* my_f = _binned_f + i; - V_FLOAT* my_v = _binned_v + i; - X_FLOAT* my_x = _binned_x + i; - - V_FLOAT dtfm = _dtf; - if(_rmass_flag) dtfm*= V_F(1.0) / _binned_rmass[i]; - else dtfm*= V_F(1.0) / _mass[_binned_type[blockDim.x * bin + threadIdx.x]]; - - V_FLOAT v_mem; - v_mem = *my_v += dtfm * (*my_f); xtmp = *my_x += _dtv * v_mem; my_f += blockDim.x; my_v += blockDim.x; my_x += blockDim.x; - v_mem = *my_v += dtfm * (*my_f); ytmp = *my_x += _dtv * v_mem; my_f += blockDim.x; my_v += blockDim.x; my_x += blockDim.x; - v_mem = *my_v += dtfm * (*my_f); ztmp = *my_x += _dtv * v_mem; - } - } - - #else - - int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x; - if(i < _nlocal && _mask[i] & groupbit) - { - F_FLOAT* my_f = _f + i; - V_FLOAT* my_v = _v + i; - X_FLOAT* my_x = _x + i; - - V_FLOAT dtfm = _dtf; - if(_rmass_flag) dtfm*= V_F(1.0) / _rmass[i]; - else dtfm*= V_F(1.0) / _mass[_type[i]]; - - V_FLOAT v_mem; - v_mem = *my_v += dtfm * (*my_f); xtmp=*my_x += _dtv * v_mem; my_f += _nmax; my_v += _nmax; my_x += _nmax; - v_mem = *my_v += dtfm * (*my_f); ytmp=*my_x += _dtv * v_mem; my_f += _nmax; my_v += _nmax; my_x += _nmax; - v_mem = *my_v += dtfm * (*my_f); ztmp=*my_x += _dtv * v_mem; - } - - #endif - - check_distance(xtmp,ytmp,ztmp,i,groupbit); -} - -__global__ void FixNVECuda_FinalIntegrate_Kernel(int groupbit) -{ - #ifdef CUDA_USE_BINNING - - const unsigned bin = gridDim.y * blockIdx.x + blockIdx.y; - if(threadIdx.x < _bin_count_local[bin]) - { - const int i = 3*blockDim.x * bin + threadIdx.x; - if(_mask[i] & groupbit) - { - F_FLOAT* my_f = _binned_f + i; - V_FLOAT* my_v = _binned_v + i; - - V_FLOAT dtfm = _dtf; - if(_rmass_flag) dtfm*= V_F(1.0) / _binned_rmass[i]; - else dtfm*= V_F(1.0) / _mass[_binned_type[blockDim.x * bin + threadIdx.x]]; - - *my_v += dtfm * (*my_f); my_f += blockDim.x; my_v += blockDim.x; - *my_v += dtfm * (*my_f); my_f += blockDim.x; my_v += blockDim.x; - *my_v += dtfm * (*my_f); - } - } - - #else - - int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x; - if(i < _nlocal && _mask[i] & groupbit) - { - F_FLOAT* my_f = _f + i; - V_FLOAT* my_v = _v + i; - - V_FLOAT dtfm = _dtf; - if(_rmass_flag) dtfm*=V_F(1.0) / _rmass[i]; - else dtfm*=V_F(1.0) / _mass[_type[i]]; - - *my_v += dtfm * (*my_f); my_f += _nmax; my_v += _nmax; - *my_v += dtfm * (*my_f); my_f += _nmax; my_v += _nmax; - *my_v += dtfm * (*my_f); - } - - #endif -} - - - diff --git a/lib/cuda/fix_set_force_cuda.cu b/lib/cuda/fix_set_force_cuda.cu deleted file mode 100644 index 6d0f2fde66..0000000000 --- a/lib/cuda/fix_set_force_cuda.cu +++ /dev/null @@ -1,93 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - - 
Original Version: - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - See the README file in the top-level LAMMPS directory. - - ----------------------------------------------------------------------- - - USER-CUDA Package and associated modifications: - https://sourceforge.net/projects/lammpscuda/ - - Christian Trott, christian.trott@tu-ilmenau.de - Lars Winterfeld, lars.winterfeld@tu-ilmenau.de - Theoretical Physics II, University of Technology Ilmenau, Germany - - See the README file in the USER-CUDA directory. - - This software is distributed under the GNU General Public License. -------------------------------------------------------------------------- */ - -#include <stdio.h> -#define MY_PREFIX fix_set_force_cuda -#include "cuda_shared.h" -#include "cuda_common.h" -#include "crm_cuda_utils.cu" - -#include "fix_set_force_cuda_cu.h" -#include "fix_set_force_cuda_kernel.cu" - -void Cuda_FixSetForceCuda_UpdateBuffer(cuda_shared_data* sdata) -{ - int3 layout=getgrid(sdata->atom.nlocal,3*sizeof(F_FLOAT)); - dim3 threads(layout.z, 1, 1); - dim3 grid(layout.x, layout.y, 1); - int size=(unsigned)(layout.z*layout.y*layout.x)*3*sizeof(F_FLOAT); - if(sdata->buffersize<size) - { - MYDBG(printf("Free buffer at %p with %i kB\n",sdata->buffer,sdata->buffersize);) - CudaWrapper_FreeCudaData(sdata->buffer,sdata->buffersize); - sdata->buffer = CudaWrapper_AllocCudaData(size); - sdata->buffersize=size; - sdata->buffer_new++; - MYDBG(printf("New buffer at %p with %i kB\n",sdata->buffer,sdata->buffersize);) - - } - cudaMemcpyToSymbol(MY_CONST(buffer) , & sdata->buffer, sizeof(int*) ); -} - -void Cuda_FixSetForceCuda_UpdateNmax(cuda_shared_data* sdata) -{ - cudaMemcpyToSymbol(MY_CONST(mask) , & sdata->atom.mask .dev_data, sizeof(int*) ); - cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); - cudaMemcpyToSymbol(MY_CONST(nmax) , & sdata->atom.nmax , sizeof(int) ); - cudaMemcpyToSymbol(MY_CONST(x) , & sdata->atom.x .dev_data, sizeof(X_FLOAT*) ); - cudaMemcpyToSymbol(MY_CONST(f) , & sdata->atom.f .dev_data, sizeof(F_FLOAT*) ); -} - -void Cuda_FixSetForceCuda_Init(cuda_shared_data* sdata) -{ - Cuda_FixSetForceCuda_UpdateNmax(sdata); - -} - - -void Cuda_FixSetForceCuda_PostForce(cuda_shared_data* sdata, int groupbit,F_FLOAT xvalue,F_FLOAT yvalue,F_FLOAT zvalue,F_FLOAT* foriginal,int flagx,int flagy,int flagz) -{ - if(sdata->atom.update_nmax) - Cuda_FixSetForceCuda_UpdateNmax(sdata); - if(sdata->atom.update_nlocal) - cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); - if(sdata->buffer_new) - Cuda_FixSetForceCuda_UpdateBuffer(sdata); - - - int3 layout=getgrid(sdata->atom.nlocal,3*sizeof(F_FLOAT)); - dim3 threads(layout.z, 1, 1); - dim3 grid(layout.x, layout.y, 1); - - Cuda_FixSetForceCuda_PostForce_Kernel<<<grid, threads, threads.x*3*sizeof(F_FLOAT)>>> (groupbit,xvalue,yvalue,zvalue,flagx,flagy,flagz); - cudaThreadSynchronize(); - CUT_CHECK_ERROR("Cuda_Cuda_FixSetForceCuda_PostForce: fix add_force post_force compute Kernel execution failed"); - - int oldgrid=grid.x; - grid.x=3; - threads.x=512; - Cuda_FixSetForceCuda_Reduce_FOriginal<<<grid, threads, threads.x*sizeof(F_FLOAT)>>> (oldgrid,foriginal); - cudaThreadSynchronize(); - CUT_CHECK_ERROR("Cuda_Cuda_FixSetForceCuda_PostForce: fix add_force post_force reduce Kernel execution failed"); - -} diff --git a/lib/cuda/fix_set_force_cuda_cu.h b/lib/cuda/fix_set_force_cuda_cu.h deleted file mode 100644 index 3121a684ad..0000000000 --- a/lib/cuda/fix_set_force_cuda_cu.h +++ /dev/null @@ -1,27 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel 
Simulator - - Original Version: - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - See the README file in the top-level LAMMPS directory. - - ----------------------------------------------------------------------- - - USER-CUDA Package and associated modifications: - https://sourceforge.net/projects/lammpscuda/ - - Christian Trott, christian.trott@tu-ilmenau.de - Lars Winterfeld, lars.winterfeld@tu-ilmenau.de - Theoretical Physics II, University of Technology Ilmenau, Germany - - See the README file in the USER-CUDA directory. - - This software is distributed under the GNU General Public License. -------------------------------------------------------------------------- */ - -#include "cuda_shared.h" - -extern "C" void Cuda_FixSetForceCuda_Init(cuda_shared_data* sdata); -extern "C" void Cuda_FixSetForceCuda_PostForce(cuda_shared_data* sdata, int groupbit,F_FLOAT xvalue,F_FLOAT yvalue,F_FLOAT zvalue,F_FLOAT* foriginal,int flagx,int flagy,int flagz); diff --git a/lib/cuda/fix_set_force_cuda_kernel.cu b/lib/cuda/fix_set_force_cuda_kernel.cu deleted file mode 100644 index f5836dee5f..0000000000 --- a/lib/cuda/fix_set_force_cuda_kernel.cu +++ /dev/null @@ -1,79 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - - Original Version: - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - See the README file in the top-level LAMMPS directory. - - ----------------------------------------------------------------------- - - USER-CUDA Package and associated modifications: - https://sourceforge.net/projects/lammpscuda/ - - Christian Trott, christian.trott@tu-ilmenau.de - Lars Winterfeld, lars.winterfeld@tu-ilmenau.de - Theoretical Physics II, University of Technology Ilmenau, Germany - - See the README file in the USER-CUDA directory. - - This software is distributed under the GNU General Public License. 
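The PostForce kernel in the file that follows first copies the force components it is about to overwrite into shared memory, folds them with reduceBlock, and writes one partial sum per block into _buffer; a second kernel then reduces the per-block partials into foriginal. A minimal sketch of that two-stage sum, assuming a power-of-two block size (block_sum and sum_per_block are illustrative names, not the package's reduceBlock):

    // Sketch: block-level shared-memory sum plus one partial per block.
    extern __shared__ double smem[];

    __device__ void block_sum(double* s)  // blockDim.x must be a power of two
    {
        for (int stride = blockDim.x / 2; stride > 0; stride >>= 1) {
            __syncthreads();
            if (threadIdx.x < stride) s[threadIdx.x] += s[threadIdx.x + stride];
        }
        __syncthreads();
    }

    __global__ void sum_per_block(const double* in, double* partial, int n)
    {
        int i = blockIdx.x * blockDim.x + threadIdx.x;
        smem[threadIdx.x] = (i < n) ? in[i] : 0.0;   // pad the tail with zeros
        block_sum(smem);
        if (threadIdx.x == 0) partial[blockIdx.x] = smem[0];
    }

A launch has to pass the dynamic shared-memory size explicitly, e.g. sum_per_block<<<blocks, threads, threads*sizeof(double)>>>(in, partial, n); the repaired launch configurations above follow the same convention.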
-------------------------------------------------------------------------- */ - -extern __shared__ F_FLOAT sharedmem[]; - - -__global__ void Cuda_FixSetForceCuda_PostForce_Kernel(int groupbit,F_FLOAT xvalue,F_FLOAT yvalue,F_FLOAT zvalue,int flagx,int flagy,int flagz) -{ - int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x; - sharedmem[threadIdx.x]=0; - sharedmem[threadIdx.x+blockDim.x]=0; - sharedmem[threadIdx.x+2*blockDim.x]=0; - if(i < _nlocal) - if (_mask[i] & groupbit) { - sharedmem[threadIdx.x]=_f[i]; - sharedmem[threadIdx.x+blockDim.x]=_f[i+1*_nmax]; - sharedmem[threadIdx.x+2*blockDim.x]=_f[i+2*_nmax]; - - if(flagx) _f[i] = xvalue; - if(flagy) _f[i+1*_nmax] = yvalue; - if(flagz) _f[i+2*_nmax] = zvalue; - } - - - reduceBlock(sharedmem); - reduceBlock(&sharedmem[blockDim.x]); - reduceBlock(&sharedmem[2*blockDim.x]); - F_FLOAT* buffer=(F_FLOAT*)_buffer; - if(threadIdx.x==0) - { - buffer[blockIdx.x*gridDim.y+blockIdx.y]=sharedmem[0]; - buffer[blockIdx.x*gridDim.y+blockIdx.y+gridDim.x*gridDim.y]=sharedmem[blockDim.x]; - buffer[blockIdx.x*gridDim.y+blockIdx.y+2*gridDim.x*gridDim.y]=sharedmem[2*blockDim.x]; - } -} - - -__global__ void Cuda_FixSetForceCuda_Reduce_FOriginal(int n,F_FLOAT* foriginal) -{ - int i=0; - sharedmem[threadIdx.x]=0; - F_FLOAT myforig=0.0; - F_FLOAT* buf=(F_FLOAT*)_buffer; - buf=&buf[blockIdx.x*n]; - while(i -#define MY_PREFIX fix_shake_cuda -#include "cuda_shared.h" -#include "cuda_common.h" -#include "crm_cuda_utils.cu" -#include "fix_shake_cuda_cu.h" -#include "cuda_pair_virial_kernel_nc.cu" - -#define _shake_atom MY_AP(shake_atom) -#define _shake_type MY_AP(shake_type) -#define _shake_flag MY_AP(shake_flag) -#define _xshake MY_AP(xshake) -#define _dtfsq MY_AP(dtfsq) -#define _bond_distance MY_AP(bond_distance) -#define _angle_distance MY_AP(angle_distance) -#define _max_iter MY_AP(max_iter) -#define _tolerance MY_AP(tolerance) -__device__ __constant__ int* _shake_atom; -__device__ __constant__ int* _shake_type; -__device__ __constant__ int* _shake_flag; -__device__ __constant__ X_FLOAT3* _xshake; -__device__ __constant__ F_FLOAT _dtfsq; -__device__ __constant__ X_FLOAT* _bond_distance; -__device__ __constant__ X_FLOAT* _angle_distance; -__device__ __constant__ int _max_iter; -__device__ __constant__ X_FLOAT _tolerance; - -#include "fix_shake_cuda_kernel.cu" - -void Cuda_FixShakeCuda_UpdateNmax(cuda_shared_data* sdata) -{ - cudaMemcpyToSymbol(MY_CONST(x) , & sdata->atom.x .dev_data, sizeof(X_FLOAT*) ); - cudaMemcpyToSymbol(MY_CONST(v) , & sdata->atom.v .dev_data, sizeof(V_FLOAT*) ); - cudaMemcpyToSymbol(MY_CONST(f) , & sdata->atom.f .dev_data, sizeof(F_FLOAT*) ); - cudaMemcpyToSymbol(MY_CONST(mask) , & sdata->atom.mask .dev_data, sizeof(int*) ); - cudaMemcpyToSymbol(MY_CONST(tag) , & sdata->atom.tag .dev_data, sizeof(int*) ); - cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); - cudaMemcpyToSymbol(MY_CONST(nmax) , & sdata->atom.nmax , sizeof(int) ); - cudaMemcpyToSymbol(MY_CONST(rmass) , & sdata->atom.rmass.dev_data, sizeof(V_FLOAT*) ); - cudaMemcpyToSymbol(MY_CONST(type) , & sdata->atom.type .dev_data, sizeof(int*) ); - cudaMemcpyToSymbol(MY_CONST(map_array), & sdata->atom.map_array .dev_data, sizeof(int*) ); - cudaMemcpyToSymbol(MY_CONST(vatom) , & sdata->atom.vatom.dev_data, sizeof(ENERGY_FLOAT*) ); - cudaMemcpyToSymbol(MY_CONST(debugdata), & sdata->debugdata , sizeof(int*) ); -} - -void Cuda_FixShakeCuda_UpdateDomain(cuda_shared_data* sdata) -{ - cudaMemcpyToSymbol(MY_CONST(periodicity), sdata->domain.periodicity , 
sizeof(int)*3 ); - cudaMemcpyToSymbol(MY_CONST(prd) , sdata->domain.prd , sizeof(X_FLOAT)*3 ); - cudaMemcpyToSymbol(MY_CONST(triclinic) , &sdata->domain.triclinic , sizeof(int) ); - cudaMemcpyToSymbol(MY_CONST(h) , sdata->domain.h , sizeof(X_FLOAT)*6 ); -} - -void Cuda_FixShakeCuda_UpdateBuffer(cuda_shared_data* sdata,int size) -{ - if(sdata->buffersize<size) - { - MYDBG(printf("Free buffer at %p with %i kB\n",sdata->buffer,sdata->buffersize);) - CudaWrapper_FreeCudaData(sdata->buffer,sdata->buffersize); - sdata->buffer = CudaWrapper_AllocCudaData(size); - sdata->buffersize=size; - sdata->buffer_new++; - MYDBG(printf("New buffer at %p with %i kB\n",sdata->buffer,sdata->buffersize);) - - } - cudaMemcpyToSymbol(MY_CONST(buffer) , & sdata->buffer, sizeof(int*) ); -} - -void Cuda_FixShakeCuda_Init(cuda_shared_data* sdata,X_FLOAT dtv, F_FLOAT dtfsq, - void* shake_flag,void* shake_atom,void* shake_type, void* xshake, - void* bond_distance,void* angle_distance,void* virial, - int max_iter,X_FLOAT tolerance) -{ - Cuda_FixShakeCuda_UpdateNmax(sdata); - Cuda_FixShakeCuda_UpdateDomain(sdata); - cudaMemcpyToSymbol(MY_CONST(shake_atom) , & shake_atom , sizeof(void*) ); - cudaMemcpyToSymbol(MY_CONST(shake_type) , & shake_type , sizeof(void*) ); - cudaMemcpyToSymbol(MY_CONST(shake_flag) , & shake_flag , sizeof(void*) ); - cudaMemcpyToSymbol(MY_CONST(xshake) , & xshake , sizeof(void*) ); - cudaMemcpyToSymbol(MY_CONST(dtv) , & dtv , sizeof(X_FLOAT)); - cudaMemcpyToSymbol(MY_CONST(dtfsq) , & dtfsq , sizeof(F_FLOAT)); - cudaMemcpyToSymbol(MY_CONST(bond_distance) , & bond_distance , sizeof(void*) ); - cudaMemcpyToSymbol(MY_CONST(angle_distance) , & angle_distance , sizeof(void*) ); - cudaMemcpyToSymbol(MY_CONST(virial) , & virial , sizeof(void*) ); - cudaMemcpyToSymbol(MY_CONST(flag) , &sdata->flag , sizeof(int*) ); - cudaMemcpyToSymbol(MY_CONST(max_iter) , &max_iter , sizeof(int) ); - cudaMemcpyToSymbol(MY_CONST(tolerance) , &tolerance , sizeof(X_FLOAT)); - - if(sdata->atom.mass_host) - cudaMemcpyToSymbol(MY_CONST(mass),& sdata->atom.mass.dev_data , sizeof(V_FLOAT*) ); - cudaMemcpyToSymbol(MY_CONST(rmass_flag), & sdata->atom.rmass_flag , sizeof(int) ); // - - cudaMemcpyToSymbol(MY_CONST(flag) , &sdata->flag, sizeof(int*)); - -} - -void Cuda_FixShakeCuda_UnconstrainedUpdate(cuda_shared_data* sdata) -{ - if(sdata->atom.update_nmax) - Cuda_FixShakeCuda_UpdateNmax(sdata); - if(sdata->atom.update_nlocal) - cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int)); - if(sdata->buffer_new) - Cuda_FixShakeCuda_UpdateBuffer(sdata,10*sizeof(double)); - int3 layout=getgrid(sdata->atom.nlocal); - dim3 threads(layout.z, 1, 1); - dim3 grid(layout.x, layout.y, 1); - - FixShakeCuda_UnconstrainedUpdate_Kernel<<<grid, threads>>> (); - cudaThreadSynchronize(); - CUT_CHECK_ERROR("FixShakeCuda_UnconstrainedUpdate: Kernel execution failed"); -} - -void Cuda_FixShakeCuda_Shake(cuda_shared_data* sdata,int vflag,int vflag_atom,int* list,int nlist) -{ - if(sdata->atom.update_nmax) - Cuda_FixShakeCuda_UpdateNmax(sdata); - if(sdata->domain.update) - Cuda_FixShakeCuda_UpdateDomain(sdata); - if(sdata->atom.update_nlocal) - cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int)); - int3 layout=getgrid(sdata->atom.nlocal,6*sizeof(ENERGY_FLOAT),64); - dim3 threads(layout.z, 1, 1); - dim3 grid(layout.x, layout.y, 1); - if(sdata->buffer_new) - Cuda_FixShakeCuda_UpdateBuffer(sdata,grid.x*grid.y*6*sizeof(ENERGY_FLOAT)); - - BindXTypeTexture(sdata); - - FixShakeCuda_Shake_Kernel<<<grid, threads, threads.x*6*sizeof(ENERGY_FLOAT)>>> (vflag,vflag_atom,list,nlist); - cudaThreadSynchronize(); - - CUT_CHECK_ERROR("FixShakeCuda_Shake: Kernel 
execution failed"); - - if(vflag) - { - int n=grid.x*grid.y; - grid.x=6; - grid.y=1; - threads.x=256; - MY_AP(PairVirialCompute_reduce)<<<grid, threads, threads.x*sizeof(ENERGY_FLOAT)>>>(n); - cudaThreadSynchronize(); - CUT_CHECK_ERROR("Cuda_FixShakeCuda: (no binning) virial compute Kernel execution failed"); - } - -} - -int Cuda_FixShakeCuda_PackComm(cuda_shared_data* sdata,int n,int iswap,void* buf_send,int* pbc,int pbc_flag) -{ - if(sdata->atom.update_nmax) - Cuda_FixShakeCuda_UpdateNmax(sdata); - if(sdata->atom.update_nlocal) - cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); - int size=n*3*sizeof(X_FLOAT); - if(sdata->buffer_new or (size>sdata->buffersize)) - Cuda_FixShakeCuda_UpdateBuffer(sdata,size); - - X_FLOAT dx=0.0; - X_FLOAT dy=0.0; - X_FLOAT dz=0.0; - if (pbc_flag != 0) { - if (sdata->domain.triclinic == 0) { - dx = pbc[0]*sdata->domain.prd[0]; - dy = pbc[1]*sdata->domain.prd[1]; - dz = pbc[2]*sdata->domain.prd[2]; - } else { - dx = pbc[0]*sdata->domain.prd[0] + pbc[5]*sdata->domain.xy + pbc[4]*sdata->domain.xz; - dy = pbc[1]*sdata->domain.prd[1] + pbc[3]*sdata->domain.yz; - dz = pbc[2]*sdata->domain.prd[2]; - }} - - int3 layout=getgrid(n); - dim3 threads(layout.z, 1, 1); - dim3 grid(layout.x, layout.y, 1); - - if(sdata->atom.nlocal>0) - { - cudaMemset( sdata->flag,0,sizeof(int)); - FixShakeCuda_PackComm_Kernel<<<grid, threads>>>((int*) sdata->comm.sendlist.dev_data,n,sdata->comm.maxlistlength,iswap,dx,dy,dz); - cudaThreadSynchronize(); - cudaMemcpy(buf_send, sdata->buffer, n*3*sizeof(X_FLOAT), cudaMemcpyDeviceToHost); - int aflag; - cudaMemcpy(&aflag, sdata->flag, sizeof(int), cudaMemcpyDeviceToHost); - if(aflag!=0) printf("aflag PackComm: %i\n",aflag); - CUT_CHECK_ERROR("Cuda_FixShakeCuda_PackComm: Kernel execution failed"); - - } - return 3*n; -} - -int Cuda_FixShakeCuda_PackComm_Self(cuda_shared_data* sdata,int n,int iswap,int first,int* pbc,int pbc_flag) -{ - if(sdata->atom.update_nmax) - Cuda_FixShakeCuda_UpdateNmax(sdata); - if(sdata->atom.update_nlocal) - cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); - int size=n*3*sizeof(X_FLOAT); - if(sdata->buffer_new or (size>sdata->buffersize)) - Cuda_FixShakeCuda_UpdateBuffer(sdata,size); - static int count=-1; - count++; - X_FLOAT dx=0.0; - X_FLOAT dy=0.0; - X_FLOAT dz=0.0; - if (pbc_flag != 0) { - if (sdata->domain.triclinic == 0) { - dx = pbc[0]*sdata->domain.prd[0]; - dy = pbc[1]*sdata->domain.prd[1]; - dz = pbc[2]*sdata->domain.prd[2]; - } else { - dx = pbc[0]*sdata->domain.prd[0] + pbc[5]*sdata->domain.xy + pbc[4]*sdata->domain.xz; - dy = pbc[1]*sdata->domain.prd[1] + pbc[3]*sdata->domain.yz; - dz = pbc[2]*sdata->domain.prd[2]; - }} - - - - int3 layout=getgrid(n); - dim3 threads(layout.z, 1, 1); - dim3 grid(layout.x, layout.y, 1); - if(sdata->atom.nlocal>0) - { - FixShakeCuda_PackComm_Self_Kernel<<<grid, threads>>>((int*) sdata->comm.sendlist.dev_data,n,sdata->comm.maxlistlength,iswap,dx,dy,dz,first); - cudaThreadSynchronize(); - CUT_CHECK_ERROR("Cuda_CommCuda_PackComm_Self: Kernel execution failed"); - } - - return 3*n; -} - -void Cuda_FixShakeCuda_UnpackComm(cuda_shared_data* sdata,int n,int first,void* buf_recv) -{ - if(sdata->atom.update_nmax) - Cuda_FixShakeCuda_UpdateNmax(sdata); - if(sdata->atom.update_nlocal) - cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); - int size=n*3*sizeof(X_FLOAT); - if(sdata->buffer_new or (size>sdata->buffersize)) - Cuda_FixShakeCuda_UpdateBuffer(sdata,size); - - int3 layout=getgrid(n); - dim3 threads(layout.z, 1, 1); - dim3 grid(layout.x, layout.y, 1); - if(sdata->atom.nlocal>0) - 
{ - cudaMemcpy(sdata->buffer,(void*)buf_recv, n*3*sizeof(X_FLOAT), cudaMemcpyHostToDevice); - FixShakeCuda_UnpackComm_Kernel<<<grid, threads>>>(n,first); - cudaThreadSynchronize(); - CUT_CHECK_ERROR("Cuda_FixShakeCuda_UnpackComm: Kernel execution failed"); - - } -} diff --git a/lib/cuda/fix_shake_cuda_cu.h b/lib/cuda/fix_shake_cuda_cu.h deleted file mode 100644 index b4276b741a..0000000000 --- a/lib/cuda/fix_shake_cuda_cu.h +++ /dev/null @@ -1,34 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - - Original Version: - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - See the README file in the top-level LAMMPS directory. - - ----------------------------------------------------------------------- - - USER-CUDA Package and associated modifications: - https://sourceforge.net/projects/lammpscuda/ - - Christian Trott, christian.trott@tu-ilmenau.de - Lars Winterfeld, lars.winterfeld@tu-ilmenau.de - Theoretical Physics II, University of Technology Ilmenau, Germany - - See the README file in the USER-CUDA directory. - - This software is distributed under the GNU General Public License. -------------------------------------------------------------------------- */ -#include "cuda_shared.h" - -extern "C" void Cuda_FixShakeCuda_Init(cuda_shared_data* sdata,X_FLOAT dtv, F_FLOAT dtfsq, - void* shake_flag,void* shake_atom,void* shake_type, void* xshake, - void* bond_distance,void* angle_distance,void* virial, - int max_iter,X_FLOAT tolerance); -extern "C" void Cuda_FixShakeCuda_UnconstrainedUpdate(cuda_shared_data* sdata); -extern "C" void Cuda_FixShakeCuda_Shake(cuda_shared_data* sdata,int vflag,int vflag_atom,int* list,int nlist); -extern "C" int Cuda_FixShakeCuda_PackComm(cuda_shared_data* sdata,int n,int iswap,void* buf_send,int* pbc,int pbc_flag); -extern "C" int Cuda_FixShakeCuda_PackComm_Self(cuda_shared_data* sdata,int n,int iswap,int first,int* pbc,int pbc_flag); -extern "C" void Cuda_FixShakeCuda_UnpackComm(cuda_shared_data* sdata,int n,int first,void* buf_recv); - diff --git a/lib/cuda/fix_shake_cuda_kernel.cu b/lib/cuda/fix_shake_cuda_kernel.cu deleted file mode 100644 index e4ca822a77..0000000000 --- a/lib/cuda/fix_shake_cuda_kernel.cu +++ /dev/null @@ -1,971 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - - Original Version: - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - See the README file in the top-level LAMMPS directory. - - ----------------------------------------------------------------------- - - USER-CUDA Package and associated modifications: - https://sourceforge.net/projects/lammpscuda/ - - Christian Trott, christian.trott@tu-ilmenau.de - Lars Winterfeld, lars.winterfeld@tu-ilmenau.de - Theoretical Physics II, University of Technology Ilmenau, Germany - - See the README file in the USER-CUDA directory. - - This software is distributed under the GNU General Public License.
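In the kernel file that follows, every SHAKE cluster is handled by one thread. For the two-atom case the constraint reduces to a single quadratic a*lamda^2 + b*lamda + c = 0 in the Lagrange multiplier, and the root of smaller magnitude is kept. A host-side sketch of that solve, built from the same quantities Shake2 uses: r01 (current separation), s01 (separation after the unconstrained update), the inverse masses, and the target bond length (shake2_lambda is an illustrative name):

    #include <math.h>

    /* Sketch: two-atom SHAKE multiplier, mirroring FixShakeCuda_Shake2. */
    double shake2_lambda(const double r01[3], const double s01[3],
                         double invm0, double invm1, double bond)
    {
        double r01sq = r01[0]*r01[0] + r01[1]*r01[1] + r01[2]*r01[2];
        double s01sq = s01[0]*s01[0] + s01[1]*s01[1] + s01[2]*s01[2];
        double sr    = s01[0]*r01[0] + s01[1]*r01[1] + s01[2]*r01[2];

        double a = (invm0 + invm1) * (invm0 + invm1) * r01sq;
        double b = 2.0 * (invm0 + invm1) * sr;
        double c = s01sq - bond * bond;

        double determ = b*b - 4.0*a*c;
        if (determ < 0.0) determ = 0.0;  /* constraint failure: clamp and flag */

        double l1 = (-b + sqrt(determ)) / (2.0*a);
        double l2 = (-b - sqrt(determ)) / (2.0*a);
        return (fabs(l1) <= fabs(l2)) ? l1 : l2;
    }

The three- and four-atom clusters below have no closed form; the kernels linearize the coupled quadratics and iterate until every multiplier moves by less than _tolerance, which is why they carry the _max_iter guard.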
-------------------------------------------------------------------------- */ - -__device__ void v_tally(int& vflag_global,int& vflag_atom,int& n, int *list, ENERGY_FLOAT total, ENERGY_FLOAT *v) -{ - /*if(vflag_global) - { - ENERGY_FLOAT fraction = n/total; - ENERGY_FLOAT* shared = &sharedmem[threadIdx.x]; - *shared += fraction*v[0]; shared+=blockDim.x; - *shared += fraction*v[1]; shared+=blockDim.x; - *shared += fraction*v[2]; shared+=blockDim.x; - *shared += fraction*v[3]; shared+=blockDim.x; - *shared += fraction*v[4]; shared+=blockDim.x; - *shared += fraction*v[5]; - }*/ - if (vflag_atom) { - ENERGY_FLOAT fraction = ENERGY_F(1.0)/total; - for (int i = 0; i < n; i++) { - int m = list[i]; - ENERGY_FLOAT* myvatom=&_vatom[m]; - - *myvatom += fraction*v[0]; myvatom+=_nmax; - *myvatom += fraction*v[1]; myvatom+=_nmax; - *myvatom += fraction*v[2]; myvatom+=_nmax; - *myvatom += fraction*v[3]; myvatom+=_nmax; - *myvatom += fraction*v[4]; myvatom+=_nmax; - *myvatom += fraction*v[5]; - } - } -} - -inline __device__ void minimum_image(X_FLOAT3& delta) -{ - if (_triclinic == 0) { - if (_periodicity[0]) { - delta.x += delta.x < -X_F(0.5)*_prd[0] ? _prd[0] : - (delta.x > X_F(0.5)*_prd[0] ?-_prd[0] : X_F(0.0)); - } - if (_periodicity[1]) { - delta.y += delta.y < -X_F(0.5)*_prd[1] ? _prd[1] : - (delta.y > X_F(0.5)*_prd[1] ?-_prd[1] : X_F(0.0)); - } - if (_periodicity[2]) { - delta.z += delta.z < -X_F(0.5)*_prd[2] ? _prd[2] : - (delta.z > X_F(0.5)*_prd[2] ?-_prd[2] : X_F(0.0)); - } - - } else { - if (_periodicity[1]) { - delta.z += delta.z < -X_F(0.5)*_prd[2] ? _prd[2] : - (delta.z > X_F(0.5)*_prd[2] ?-_prd[2] : X_F(0.0)); - delta.y += delta.z < -X_F(0.5)*_prd[2] ? _h[3] : - (delta.z > X_F(0.5)*_prd[2] ?-_h[3] : X_F(0.0)); - delta.x += delta.z < -X_F(0.5)*_prd[2] ? _h[4] : - (delta.z > X_F(0.5)*_prd[2] ?-_h[4] : X_F(0.0)); - - } - if (_periodicity[1]) { - delta.y += delta.y < -X_F(0.5)*_prd[1] ? _prd[1] : - (delta.y > X_F(0.5)*_prd[1] ?-_prd[1] : X_F(0.0)); - delta.x += delta.y < -X_F(0.5)*_prd[1] ? _h[5] : - (delta.y > X_F(0.5)*_prd[1] ?-_h[5] : X_F(0.0)); - - } - if (_periodicity[0]) { - delta.x += delta.x < -X_F(0.5)*_prd[0] ? 
_prd[0] : - (delta.x > X_F(0.5)*_prd[0] ?-_prd[0] : X_F(0.0)); - } - } -} - -__global__ void FixShakeCuda_UnconstrainedUpdate_Kernel() -{ - int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x; - if(i>=_nlocal) return; - - X_FLOAT3 my_xshake = {X_F(0.0),X_F(0.0),X_F(0.0)}; - if(_shake_flag[i]) - { - F_FLOAT* my_f = _f + i; - V_FLOAT* my_v = _v + i; - X_FLOAT* my_x = _x + i; - - V_FLOAT dtfmsq = _dtfsq; - if(_rmass_flag) dtfmsq*= V_F(1.0) / _rmass[i]; - else dtfmsq*= V_F(1.0) / _mass[_type[i]]; - - my_xshake.x = *my_x + _dtv* *my_v + dtfmsq* *my_f; my_f += _nmax; my_v += _nmax; my_x += _nmax; - my_xshake.y = *my_x + _dtv* *my_v + dtfmsq* *my_f; my_f += _nmax; my_v += _nmax; my_x += _nmax; - my_xshake.z = *my_x + _dtv* *my_v + dtfmsq* *my_f; - } - _xshake[i]=my_xshake; -} - - - - -__device__ void FixShakeCuda_Shake2(int& vflag,int& vflag_atom,int& m) -{ - int nlist,list[2]; - ENERGY_FLOAT v[6]; - X_FLOAT invmass0,invmass1; - - // local atom IDs and constraint distances - - int i0 = _map_array[_shake_atom[m]]; - int i1 = _map_array[_shake_atom[m+_nmax]]; - X_FLOAT bond1 = _bond_distance[_shake_type[m]]; - - // r01 = distance vec between atoms, with PBC - - X_FLOAT3 r01; - - X_FLOAT4 x_i0,x_i1; - x_i0=fetchXType(i0); - x_i1=fetchXType(i1); - - r01.x = x_i0.x - x_i1.x; - r01.y = x_i0.y - x_i1.y; - r01.z = x_i0.z - x_i1.z; - minimum_image(r01); - - // s01 = distance vec after unconstrained update, with PBC - - X_FLOAT3 s01; - X_FLOAT3 xs_i0=_xshake[i0]; - X_FLOAT3 xs_i1=_xshake[i1]; - - s01.x = xs_i0.x - xs_i1.x; - s01.y = xs_i0.y - xs_i1.y; - s01.z = xs_i0.z - xs_i1.z; - minimum_image(s01); - - // scalar distances between atoms - - X_FLOAT r01sq = r01.x*r01.x + r01.y*r01.y + r01.z*r01.z; - X_FLOAT s01sq = s01.x*s01.x + s01.y*s01.y + s01.z*s01.z; - - // a,b,c = coeffs in quadratic equation for lamda - - if (_rmass_flag) { - invmass0 = X_F(1.0)/_rmass[i0]; - invmass1 = X_F(1.0)/_rmass[i1]; - } else { - invmass0 = X_F(1.0)/_mass[static_cast<int> (x_i0.w)]; - invmass1 = X_F(1.0)/_mass[static_cast<int> (x_i1.w)]; - } - - X_FLOAT a = (invmass0+invmass1)*(invmass0+invmass1) * r01sq; - X_FLOAT b = X_F(2.0) * (invmass0+invmass1) * - (s01.x*r01.x + s01.y*r01.y + s01.z*r01.z); - X_FLOAT c = s01sq - bond1*bond1; - - // error check - - X_FLOAT determ = b*b - X_F(4.0)*a*c; - if (determ < X_F(0.0)) { - _flag[0]++; - determ = X_F(0.0); - } - - // exact quadratic solution for lamda - - X_FLOAT lamda,lamda1,lamda2; - lamda1 = -b+_SQRT_(determ); - lamda2 = -lamda1 - X_F(2.0)*b; - lamda1 *= X_F(1.0) / (X_F(2.0)*a); - lamda2 *= X_F(1.0) / (X_F(2.0)*a); - - lamda = (fabs(lamda1) <= fabs(lamda2))? lamda1 : lamda2; - - // update forces if atom is owned by this processor - - lamda*= X_F(1.0)/_dtfsq; - - - //attention: are shake clusters <-> atom unique? 
- nlist = 0; - if (i0 < _nlocal) { - _f[i0] += lamda*r01.x; - _f[i0+_nmax] += lamda*r01.y; - _f[i0+2*_nmax] += lamda*r01.z; - list[nlist++] = i0; - } - - if (i1 < _nlocal) { - _f[i1] -= lamda*r01.x; - _f[i1+_nmax] -= lamda*r01.y; - _f[i1+2*_nmax] -= lamda*r01.z; - list[nlist++] = i1; - } - - if (vflag||vflag_atom) { - ENERGY_FLOAT* shared = &sharedmem[threadIdx.x]; - X_FLOAT factor=nlist; - v[0] = lamda*r01.x*r01.x; *shared = factor*v[0]; shared+=blockDim.x; //times 2.0 since the reducing function is the same as in force calculations, which adds a factor 0.5 - v[1] = lamda*r01.y*r01.y; *shared = factor*v[1]; shared+=blockDim.x; - v[2] = lamda*r01.z*r01.z; *shared = factor*v[2]; shared+=blockDim.x; - v[3] = lamda*r01.x*r01.y; *shared = factor*v[3]; shared+=blockDim.x; - v[4] = lamda*r01.x*r01.z; *shared = factor*v[4]; shared+=blockDim.x; - v[5] = lamda*r01.y*r01.z; *shared = factor*v[5]; shared+=blockDim.x; - - v_tally(vflag,vflag_atom,nlist,list,2.0,v); - } -} - - -__device__ void FixShakeCuda_Shake3(int& vflag,int& vflag_atom,int& m) -{ - int nlist,list[3]; - ENERGY_FLOAT v[6]; - X_FLOAT invmass0,invmass1,invmass2; - - // local atom IDs and constraint distances - - int i0 = _map_array[_shake_atom[m]]; - int i1 = _map_array[_shake_atom[m+_nmax]]; - int i2 = _map_array[_shake_atom[m+2*_nmax]]; - X_FLOAT bond1 = _bond_distance[_shake_type[m]]; - X_FLOAT bond2 = _bond_distance[_shake_type[m+_nmax]]; - - // r01 = distance vec between atoms, with PBC - - X_FLOAT3 r01,r02; - - X_FLOAT4 x_i0,x_i1,x_i2; - x_i0=fetchXType(i0); - x_i1=fetchXType(i1); - x_i2=fetchXType(i2); - - r01.x = x_i0.x - x_i1.x; - r01.y = x_i0.y - x_i1.y; - r01.z = x_i0.z - x_i1.z; - minimum_image(r01); - - r02.x = x_i0.x - x_i2.x; - r02.y = x_i0.y - x_i2.y; - r02.z = x_i0.z - x_i2.z; - minimum_image(r02); - - // s01 = distance vec after unconstrained update, with PBC - - X_FLOAT3 s01,s02; - X_FLOAT3 xs_i0=_xshake[i0]; - X_FLOAT3 xs_i1=_xshake[i1]; - X_FLOAT3 xs_i2=_xshake[i2]; - - s01.x = xs_i0.x - xs_i1.x; - s01.y = xs_i0.y - xs_i1.y; - s01.z = xs_i0.z - xs_i1.z; - minimum_image(s01); - - s02.x = xs_i0.x - xs_i2.x; - s02.y = xs_i0.y - xs_i2.y; - s02.z = xs_i0.z - xs_i2.z; - minimum_image(s02); - - // scalar distances between atoms - - X_FLOAT r01sq = r01.x*r01.x + r01.y*r01.y + r01.z*r01.z; - X_FLOAT r02sq = r02.x*r02.x + r02.y*r02.y + r02.z*r02.z; - X_FLOAT s01sq = s01.x*s01.x + s01.y*s01.y + s01.z*s01.z; - X_FLOAT s02sq = s02.x*s02.x + s02.y*s02.y + s02.z*s02.z; - - // a,b,c = coeffs in quadratic equation for lamda - - if (_rmass_flag) { - invmass0 = X_F(1.0)/_rmass[i0]; - invmass1 = X_F(1.0)/_rmass[i1]; - invmass2 = X_F(1.0)/_rmass[i2]; - } else { - invmass0 = X_F(1.0)/_mass[static_cast<int> (x_i0.w)]; - invmass1 = X_F(1.0)/_mass[static_cast<int> (x_i1.w)]; - invmass2 = X_F(1.0)/_mass[static_cast<int> (x_i2.w)]; - } - - X_FLOAT a11 = X_F(2.0) * (invmass0+invmass1) * - (s01.x*r01.x + s01.y*r01.y + s01.z*r01.z); - X_FLOAT a12 = X_F(2.0) * invmass0 * - (s01.x*r02.x + s01.y*r02.y + s01.z*r02.z); - X_FLOAT a21 = X_F(2.0) * invmass0 * - (s02.x*r01.x + s02.y*r01.y + s02.z*r01.z); - X_FLOAT a22 = X_F(2.0) * (invmass0+invmass2) * - (s02.x*r02.x + s02.y*r02.y + s02.z*r02.z); - - // error check - - X_FLOAT determ = a11*a22 - a12*a21; - if (determ == X_F(0.0)) _flag[0]++; - X_FLOAT determinv = X_F(1.0)/determ; - - X_FLOAT a11inv = a22*determinv; - X_FLOAT a12inv = -a12*determinv; - X_FLOAT a21inv = -a21*determinv; - X_FLOAT a22inv = a11*determinv; - - // quadratic correction coeffs - - X_FLOAT r0102 = (r01.x*r02.x + r01.y*r02.y + r01.z*r02.z); - - 
X_FLOAT quad1_0101 = (invmass0+invmass1)*(invmass0+invmass1) * r01sq; - X_FLOAT quad1_0202 = invmass0*invmass0 * r02sq; - X_FLOAT quad1_0102 = X_F(2.0) * (invmass0+invmass1)*invmass0 * r0102; - - X_FLOAT quad2_0202 = (invmass0+invmass2)*(invmass0+invmass2) * r02sq; - X_FLOAT quad2_0101 = invmass0*invmass0 * r01sq; - X_FLOAT quad2_0102 = X_F(2.0) * (invmass0+invmass2)*invmass0 * r0102; - - // iterate until converged - - X_FLOAT lamda01 = X_F(0.0); - X_FLOAT lamda02 = X_F(0.0); - int niter = 0; - int done = 0; - - X_FLOAT quad1,quad2,b1,b2,lamda01_new,lamda02_new; - -//maybe all running full loop? - while (__any(!done) && niter < _max_iter) { - quad1 = quad1_0101 * lamda01*lamda01 + quad1_0202 * lamda02*lamda02 + - quad1_0102 * lamda01*lamda02; - quad2 = quad2_0101 * lamda01*lamda01 + quad2_0202 * lamda02*lamda02 + - quad2_0102 * lamda01*lamda02; - - b1 = bond1*bond1 - s01sq - quad1; - b2 = bond2*bond2 - s02sq - quad2; - - lamda01_new = a11inv*b1 + a12inv*b2; - lamda02_new = a21inv*b1 + a22inv*b2; - - done++; - done = (fabs(lamda01_new-lamda01) > _tolerance)?0: done; - done = (fabs(lamda02_new-lamda02) > _tolerance)?0: done; - - - lamda01 = done<2?lamda01_new:lamda01; - lamda02 = done<2?lamda02_new:lamda02; - niter++; - } - // update forces if atom is owned by this processor - - lamda01 *= X_F(1.0)/_dtfsq; - lamda02 *= X_F(1.0)/_dtfsq; - - - //attenion: are shake clusters <-> atom unique? - nlist = 0; - if (i0 < _nlocal) { - _f[i0] += lamda01*r01.x + lamda02*r02.x; - _f[i0+_nmax] += lamda01*r01.y + lamda02*r02.y; - _f[i0+2*_nmax] += lamda01*r01.z + lamda02*r02.z; - list[nlist++] = i0; - } - - if (i1 < _nlocal) { - _f[i1] -= lamda01*r01.x; - _f[i1+_nmax] -= lamda01*r01.y; - _f[i1+2*_nmax] -= lamda01*r01.z; - list[nlist++] = i1; - } - - if (i2 < _nlocal) { - _f[i2] -= lamda02*r02.x; - _f[i2+_nmax] -= lamda02*r02.y; - _f[i2+2*_nmax] -= lamda02*r02.z; - list[nlist++] = i2; - } - - if (vflag||vflag_atom) { - ENERGY_FLOAT* shared = &sharedmem[threadIdx.x]; - X_FLOAT factor=X_F(2.0)/X_F(3.0)*nlist; - v[0] = lamda01*r01.x*r01.x + lamda02*r02.x*r02.x; *shared = factor*v[0]; shared+=blockDim.x; //times 2.0 since the reducing function is the same as in force calculations, which adds a factor 0.5 - v[1] = lamda01*r01.y*r01.y + lamda02*r02.y*r02.y; *shared = factor*v[1]; shared+=blockDim.x; - v[2] = lamda01*r01.z*r01.z + lamda02*r02.z*r02.z; *shared = factor*v[2]; shared+=blockDim.x; - v[3] = lamda01*r01.x*r01.y + lamda02*r02.x*r02.y; *shared = factor*v[3]; shared+=blockDim.x; - v[4] = lamda01*r01.x*r01.z + lamda02*r02.x*r02.z; *shared = factor*v[4]; shared+=blockDim.x; - v[5] = lamda01*r01.y*r01.z + lamda02*r02.y*r02.z; *shared = factor*v[5]; shared+=blockDim.x; - - v_tally(vflag,vflag_atom,nlist,list,3.0,v); - } -} - -__device__ void FixShakeCuda_Shake4(int& vflag,int& vflag_atom,int& m) -{ - int nlist,list[4]; - ENERGY_FLOAT v[6]; - X_FLOAT invmass0,invmass1,invmass2,invmass3; - - // local atom IDs and constraint distances - - int i0 = _map_array[_shake_atom[m]]; - int i1 = _map_array[_shake_atom[m+_nmax]]; - int i2 = _map_array[_shake_atom[m+2*_nmax]]; - int i3 = _map_array[_shake_atom[m+3*_nmax]]; - X_FLOAT bond1 = _bond_distance[_shake_type[m]]; - X_FLOAT bond2 = _bond_distance[_shake_type[m+_nmax]]; - X_FLOAT bond3 = _bond_distance[_shake_type[m+2*_nmax]]; - - // r01 = distance vec between atoms, with PBC - - X_FLOAT3 r01,r02,r03; - - X_FLOAT4 x_i0,x_i1,x_i2,x_i3; - x_i0=fetchXType(i0); - x_i1=fetchXType(i1); - x_i2=fetchXType(i2); - x_i3=fetchXType(i3); - - r01.x = x_i0.x - x_i1.x; - r01.y = 
x_i0.y - x_i1.y; - r01.z = x_i0.z - x_i1.z; - minimum_image(r01); - - r02.x = x_i0.x - x_i2.x; - r02.y = x_i0.y - x_i2.y; - r02.z = x_i0.z - x_i2.z; - minimum_image(r02); - - r03.x = x_i0.x - x_i3.x; - r03.y = x_i0.y - x_i3.y; - r03.z = x_i0.z - x_i3.z; - minimum_image(r03); - - // s01 = distance vec after unconstrained update, with PBC - - X_FLOAT3 s01,s02,s03; - X_FLOAT3 xs_i0=_xshake[i0]; - X_FLOAT3 xs_i1=_xshake[i1]; - X_FLOAT3 xs_i2=_xshake[i2]; - X_FLOAT3 xs_i3=_xshake[i3]; - - s01.x = xs_i0.x - xs_i1.x; - s01.y = xs_i0.y - xs_i1.y; - s01.z = xs_i0.z - xs_i1.z; - minimum_image(s01); - - s02.x = xs_i0.x - xs_i2.x; - s02.y = xs_i0.y - xs_i2.y; - s02.z = xs_i0.z - xs_i2.z; - minimum_image(s02); - - s03.x = xs_i0.x - xs_i3.x; - s03.y = xs_i0.y - xs_i3.y; - s03.z = xs_i0.z - xs_i3.z; - minimum_image(s03); - - // scalar distances between atoms - - X_FLOAT r01sq = r01.x*r01.x + r01.y*r01.y + r01.z*r01.z; - X_FLOAT r02sq = r02.x*r02.x + r02.y*r02.y + r02.z*r02.z; - X_FLOAT r03sq = r03.x*r03.x + r03.y*r03.y + r03.z*r03.z; - X_FLOAT s01sq = s01.x*s01.x + s01.y*s01.y + s01.z*s01.z; - X_FLOAT s02sq = s02.x*s02.x + s02.y*s02.y + s02.z*s02.z; - X_FLOAT s03sq = s03.x*s03.x + s03.y*s03.y + s03.z*s03.z; - - // a,b,c = coeffs in quadratic equation for lamda - - if (_rmass_flag) { - invmass0 = X_F(1.0)/_rmass[i0]; - invmass1 = X_F(1.0)/_rmass[i1]; - invmass2 = X_F(1.0)/_rmass[i2]; - invmass3 = X_F(1.0)/_rmass[i3]; - } else { - invmass0 = X_F(1.0)/_mass[static_cast<int> (x_i0.w)]; - invmass1 = X_F(1.0)/_mass[static_cast<int> (x_i1.w)]; - invmass2 = X_F(1.0)/_mass[static_cast<int> (x_i2.w)]; - invmass3 = X_F(1.0)/_mass[static_cast<int> (x_i3.w)]; - } - - X_FLOAT a11 = X_F(2.0) * (invmass0+invmass1) * - (s01.x*r01.x + s01.y*r01.y + s01.z*r01.z); - X_FLOAT a12 = X_F(2.0) * invmass0 * - (s01.x*r02.x + s01.y*r02.y + s01.z*r02.z); - X_FLOAT a13 = X_F(2.0) * invmass0 * - (s01.x*r03.x + s01.y*r03.y + s01.z*r03.z); - X_FLOAT a21 = X_F(2.0) * invmass0 * - (s02.x*r01.x + s02.y*r01.y + s02.z*r01.z); - X_FLOAT a22 = X_F(2.0) * (invmass0+invmass2) * - (s02.x*r02.x + s02.y*r02.y + s02.z*r02.z); - X_FLOAT a23 = X_F(2.0) * (invmass0) * - (s02.x*r03.x + s02.y*r03.y + s02.z*r03.z); - X_FLOAT a31 = X_F(2.0) * (invmass0) * - (s03.x*r01.x + s03.y*r01.y + s03.z*r01.z); - X_FLOAT a32 = X_F(2.0) * (invmass0) * - (s03.x*r02.x + s03.y*r02.y + s03.z*r02.z); - X_FLOAT a33 = X_F(2.0) * (invmass0+invmass3) * - (s03.x*r03.x + s03.y*r03.y + s03.z*r03.z); - - // error check - - X_FLOAT determ = a11*a22*a33 + a12*a23*a31 + a13*a21*a32 - - a11*a23*a32 - a12*a21*a33 - a13*a22*a31; - if (determ == X_F(0.0)) _flag[0]++; - X_FLOAT determinv = X_F(1.0)/determ; - - X_FLOAT a11inv = determinv * (a22*a33 - a23*a32); - X_FLOAT a12inv = -determinv * (a12*a33 - a13*a32); - X_FLOAT a13inv = determinv * (a12*a23 - a13*a22); - X_FLOAT a21inv = -determinv * (a21*a33 - a23*a31); - X_FLOAT a22inv = determinv * (a11*a33 - a13*a31); - X_FLOAT a23inv = -determinv * (a11*a23 - a13*a21); - X_FLOAT a31inv = determinv * (a21*a32 - a22*a31); - X_FLOAT a32inv = -determinv * (a11*a32 - a12*a31); - X_FLOAT a33inv = determinv * (a11*a22 - a12*a21); - - // quadratic correction coeffs - - X_FLOAT r0102 = (r01.x*r02.x + r01.y*r02.y + r01.z*r02.z); - X_FLOAT r0103 = (r01.x*r03.x + r01.y*r03.y + r01.z*r03.z); - X_FLOAT r0203 = (r02.x*r03.x + r02.y*r03.y + r02.z*r03.z); - - X_FLOAT quad1_0101 = (invmass0+invmass1)*(invmass0+invmass1) * r01sq; - X_FLOAT quad1_0202 = invmass0*invmass0 * r02sq; - X_FLOAT quad1_0303 = invmass0*invmass0 * r03sq; - X_FLOAT quad1_0102 = X_F(2.0) * 
(invmass0+invmass1)*invmass0 * r0102; - X_FLOAT quad1_0103 = X_F(2.0) * (invmass0+invmass1)*invmass0 * r0103; - X_FLOAT quad1_0203 = X_F(2.0) * invmass0*invmass0 * r0203; - - X_FLOAT quad2_0101 = invmass0*invmass0 * r01sq; - X_FLOAT quad2_0202 = (invmass0+invmass2)*(invmass0+invmass2) * r02sq; - X_FLOAT quad2_0303 = invmass0*invmass0 * r03sq; - X_FLOAT quad2_0102 = X_F(2.0) * (invmass0+invmass2)*invmass0 * r0102; - X_FLOAT quad2_0103 = X_F(2.0) * invmass0*invmass0 * r0103; - X_FLOAT quad2_0203 = X_F(2.0) * (invmass0+invmass2)*invmass0 * r0203; - - X_FLOAT quad3_0101 = invmass0*invmass0 * r01sq; - X_FLOAT quad3_0202 = invmass0*invmass0 * r02sq; - X_FLOAT quad3_0303 = (invmass0+invmass3)*(invmass0+invmass3) * r03sq; - X_FLOAT quad3_0102 = X_F(2.0) * invmass0*invmass0 * r0102; - X_FLOAT quad3_0103 = X_F(2.0) * (invmass0+invmass3)*invmass0 * r0103; - X_FLOAT quad3_0203 = X_F(2.0) * (invmass0+invmass3)*invmass0 * r0203; - // iterate until converged - - X_FLOAT lamda01 = X_F(0.0); - X_FLOAT lamda02 = X_F(0.0); - X_FLOAT lamda03 = X_F(0.0); - int niter = 0; - int done = 0; - - X_FLOAT quad1,quad2,quad3,b1,b2,b3,lamda01_new,lamda02_new,lamda03_new; - -//maybe all running full loop? - while (__any(!done) && niter < _max_iter) { - quad1 = quad1_0101 * lamda01*lamda01 + - quad1_0202 * lamda02*lamda02 + - quad1_0303 * lamda03*lamda03 + - quad1_0102 * lamda01*lamda02 + - quad1_0103 * lamda01*lamda03 + - quad1_0203 * lamda02*lamda03; - - quad2 = quad2_0101 * lamda01*lamda01 + - quad2_0202 * lamda02*lamda02 + - quad2_0303 * lamda03*lamda03 + - quad2_0102 * lamda01*lamda02 + - quad2_0103 * lamda01*lamda03 + - quad2_0203 * lamda02*lamda03; - - quad3 = quad3_0101 * lamda01*lamda01 + - quad3_0202 * lamda02*lamda02 + - quad3_0303 * lamda03*lamda03 + - quad3_0102 * lamda01*lamda02 + - quad3_0103 * lamda01*lamda03 + - quad3_0203 * lamda02*lamda03; - - b1 = bond1*bond1 - s01sq - quad1; - b2 = bond2*bond2 - s02sq - quad2; - b3 = bond3*bond3 - s03sq - quad3; - - lamda01_new = a11inv*b1 + a12inv*b2 + a13inv*b3; - lamda02_new = a21inv*b1 + a22inv*b2 + a23inv*b3; - lamda03_new = a31inv*b1 + a32inv*b2 + a33inv*b3; - - done++; - done = (fabs(lamda01_new-lamda01) > _tolerance)? 0:done; - done = (fabs(lamda02_new-lamda02) > _tolerance)? 0:done; - done = (fabs(lamda03_new-lamda03) > _tolerance)? 0:done; - - lamda01 = done<2?lamda01_new:lamda01; - lamda02 = done<2?lamda02_new:lamda02; - lamda03 = done<2?lamda03_new:lamda03; - niter++; - } - // update forces if atom is owned by this processor - - lamda01 *= X_F(1.0)/_dtfsq; - lamda02 *= X_F(1.0)/_dtfsq; - lamda03 *= X_F(1.0)/_dtfsq; - - - //attenion: are shake clusters <-> atom unique? 
- nlist = 0; - if (i0 < _nlocal) { - _f[i0] += lamda01*r01.x + lamda02*r02.x + lamda03*r03.x; - _f[i0+_nmax] += lamda01*r01.y + lamda02*r02.y + lamda03*r03.y; - _f[i0+2*_nmax] += lamda01*r01.z + lamda02*r02.z + lamda03*r03.z; - list[nlist++] = i0; - } - - if (i1 < _nlocal) { - _f[i1] -= lamda01*r01.x; - _f[i1+_nmax] -= lamda01*r01.y; - _f[i1+2*_nmax] -= lamda01*r01.z; - list[nlist++] = i1; - } - - if (i2 < _nlocal) { - _f[i2] -= lamda02*r02.x; - _f[i2+_nmax] -= lamda02*r02.y; - _f[i2+2*_nmax] -= lamda02*r02.z; - list[nlist++] = i2; - } - - if (i3 < _nlocal) { - _f[i3] -= lamda03*r03.x; - _f[i3+_nmax] -= lamda03*r03.y; - _f[i3+2*_nmax] -= lamda03*r03.z; - list[nlist++] = i3; - } - - if (vflag||vflag_atom) { - ENERGY_FLOAT* shared = &sharedmem[threadIdx.x]; - X_FLOAT factor=X_F(2.0)/X_F(4.0)*nlist; - v[0] = lamda01*r01.x*r01.x + lamda02*r02.x*r02.x + lamda03*r03.x*r03.x; *shared = factor*v[0]; shared+=blockDim.x; //times 2.0 since the reducing function is the same as in force calculations, which adds a factor 0.5 - v[1] = lamda01*r01.y*r01.y + lamda02*r02.y*r02.y + lamda03*r03.y*r03.y; *shared = factor*v[1]; shared+=blockDim.x; - v[2] = lamda01*r01.z*r01.z + lamda02*r02.z*r02.z + lamda03*r03.z*r03.z; *shared = factor*v[2]; shared+=blockDim.x; - v[3] = lamda01*r01.x*r01.y + lamda02*r02.x*r02.y + lamda03*r03.x*r03.y; *shared = factor*v[3]; shared+=blockDim.x; - v[4] = lamda01*r01.x*r01.z + lamda02*r02.x*r02.z + lamda03*r03.x*r03.z; *shared = factor*v[4]; shared+=blockDim.x; - v[5] = lamda01*r01.y*r01.z + lamda02*r02.y*r02.z + lamda03*r03.y*r03.z; *shared = factor*v[5]; shared+=blockDim.x; - - v_tally(vflag,vflag_atom,nlist,list,4.0,v); - } -} - -__device__ void FixShakeCuda_Shake3Angle(int& vflag,int& vflag_atom,int& m) -{ - int nlist,list[3]; - ENERGY_FLOAT v[6]; - X_FLOAT invmass0,invmass1,invmass2; - - // local atom IDs and constraint distances - - int i0 = _map_array[_shake_atom[m]]; - int i1 = _map_array[_shake_atom[m+_nmax]]; - int i2 = _map_array[_shake_atom[m+2*_nmax]]; - X_FLOAT bond1 = _bond_distance[_shake_type[m]]; - X_FLOAT bond2 = _bond_distance[_shake_type[m+_nmax]]; - X_FLOAT bond12 = _angle_distance[_shake_type[m+2*_nmax]]; - - // r01 = distance vec between atoms, with PBC - - X_FLOAT3 r01,r02,r12; - - X_FLOAT4 x_i0,x_i1,x_i2; - x_i0=fetchXType(i0); - x_i1=fetchXType(i1); - x_i2=fetchXType(i2); - - r01.x = x_i0.x - x_i1.x; - r01.y = x_i0.y - x_i1.y; - r01.z = x_i0.z - x_i1.z; - minimum_image(r01); - - r02.x = x_i0.x - x_i2.x; - r02.y = x_i0.y - x_i2.y; - r02.z = x_i0.z - x_i2.z; - minimum_image(r02); - - r12.x = x_i1.x - x_i2.x; - r12.y = x_i1.y - x_i2.y; - r12.z = x_i1.z - x_i2.z; - minimum_image(r12); - - // s01 = distance vec after unconstrained update, with PBC - - X_FLOAT3 s01,s02,s12; - X_FLOAT3 xs_i0=_xshake[i0]; - X_FLOAT3 xs_i1=_xshake[i1]; - X_FLOAT3 xs_i2=_xshake[i2]; - - s01.x = xs_i0.x - xs_i1.x; - s01.y = xs_i0.y - xs_i1.y; - s01.z = xs_i0.z - xs_i1.z; - minimum_image(s01); - - s02.x = xs_i0.x - xs_i2.x; - s02.y = xs_i0.y - xs_i2.y; - s02.z = xs_i0.z - xs_i2.z; - minimum_image(s02); - - s12.x = xs_i1.x - xs_i2.x; - s12.y = xs_i1.y - xs_i2.y; - s12.z = xs_i1.z - xs_i2.z; - minimum_image(s12); - - // scalar distances between atoms - - X_FLOAT r01sq = r01.x*r01.x + r01.y*r01.y + r01.z*r01.z; - X_FLOAT r02sq = r02.x*r02.x + r02.y*r02.y + r02.z*r02.z; - X_FLOAT r12sq = r12.x*r12.x + r12.y*r12.y + r12.z*r12.z; - X_FLOAT s01sq = s01.x*s01.x + s01.y*s01.y + s01.z*s01.z; - X_FLOAT s02sq = s02.x*s02.x + s02.y*s02.y + s02.z*s02.z; - X_FLOAT s12sq = s12.x*s12.x + s12.y*s12.y 
+ s12.z*s12.z; - - // a,b,c = coeffs in quadratic equation for lamda - - if (_rmass_flag) { - invmass0 = X_F(1.0)/_rmass[i0]; - invmass1 = X_F(1.0)/_rmass[i1]; - invmass2 = X_F(1.0)/_rmass[i2]; - } else { - invmass0 = X_F(1.0)/_mass[static_cast<int> (x_i0.w)]; - invmass1 = X_F(1.0)/_mass[static_cast<int> (x_i1.w)]; - invmass2 = X_F(1.0)/_mass[static_cast<int> (x_i2.w)]; - } - - X_FLOAT a11 = X_F(2.0) * (invmass0+invmass1) * - (s01.x*r01.x + s01.y*r01.y + s01.z*r01.z); - X_FLOAT a12 = X_F(2.0) * invmass0 * - (s01.x*r02.x + s01.y*r02.y + s01.z*r02.z); - X_FLOAT a13 = - X_F(2.0) * invmass1 * - (s01.x*r12.x + s01.y*r12.y + s01.z*r12.z); - X_FLOAT a21 = X_F(2.0) * invmass0 * - (s02.x*r01.x + s02.y*r01.y + s02.z*r01.z); - X_FLOAT a22 = X_F(2.0) * (invmass0+invmass2) * - (s02.x*r02.x + s02.y*r02.y + s02.z*r02.z); - X_FLOAT a23 = X_F(2.0) * invmass2 * - (s02.x*r12.x + s02.y*r12.y + s02.z*r12.z); - X_FLOAT a31 = - X_F(2.0) * invmass1 * - (s12.x*r01.x + s12.y*r01.y + s12.z*r01.z); - X_FLOAT a32 = X_F(2.0) * invmass2 * - (s12.x*r02.x + s12.y*r02.y + s12.z*r02.z); - X_FLOAT a33 = X_F(2.0) * (invmass1+invmass2) * - (s12.x*r12.x + s12.y*r12.y + s12.z*r12.z); - - // inverse of matrix - - X_FLOAT determ = a11*a22*a33 + a12*a23*a31 + a13*a21*a32 - - a11*a23*a32 - a12*a21*a33 - a13*a22*a31; - if (determ == X_F(0.0)) _flag[0]++; - X_FLOAT determinv = X_F(1.0)/determ; - - X_FLOAT a11inv = determinv * (a22*a33 - a23*a32); - X_FLOAT a12inv = -determinv * (a12*a33 - a13*a32); - X_FLOAT a13inv = determinv * (a12*a23 - a13*a22); - X_FLOAT a21inv = -determinv * (a21*a33 - a23*a31); - X_FLOAT a22inv = determinv * (a11*a33 - a13*a31); - X_FLOAT a23inv = -determinv * (a11*a23 - a13*a21); - X_FLOAT a31inv = determinv * (a21*a32 - a22*a31); - X_FLOAT a32inv = -determinv * (a11*a32 - a12*a31); - X_FLOAT a33inv = determinv * (a11*a22 - a12*a21); - - // quadratic correction coeffs - - X_FLOAT r0102 = (r01.x*r02.x + r01.y*r02.y + r01.z*r02.z); - X_FLOAT r0112 = (r01.x*r12.x + r01.y*r12.y + r01.z*r12.z); - X_FLOAT r0212 = (r02.x*r12.x + r02.y*r12.y + r02.z*r12.z); - - X_FLOAT quad1_0101 = (invmass0+invmass1)*(invmass0+invmass1) * r01sq; - X_FLOAT quad1_0202 = invmass0*invmass0 * r02sq; - X_FLOAT quad1_1212 = invmass1*invmass1 * r12sq; - X_FLOAT quad1_0102 = X_F(2.0) * (invmass0+invmass1)*invmass0 * r0102; - X_FLOAT quad1_0112 = - X_F(2.0) * (invmass0+invmass1)*invmass1 * r0112; - X_FLOAT quad1_0212 = - X_F(2.0) * invmass0*invmass1 * r0212; - - X_FLOAT quad2_0101 = invmass0*invmass0 * r01sq; - X_FLOAT quad2_0202 = (invmass0+invmass2)*(invmass0+invmass2) * r02sq; - X_FLOAT quad2_1212 = invmass2*invmass2 * r12sq; - X_FLOAT quad2_0102 = X_F(2.0) * (invmass0+invmass2)*invmass0 * r0102; - X_FLOAT quad2_0112 = X_F(2.0) * invmass0*invmass2 * r0112; - X_FLOAT quad2_0212 = X_F(2.0) * (invmass0+invmass2)*invmass2 * r0212; - - X_FLOAT quad3_0101 = invmass1*invmass1 * r01sq; - X_FLOAT quad3_0202 = invmass2*invmass2 * r02sq; - X_FLOAT quad3_1212 = (invmass1+invmass2)*(invmass1+invmass2) * r12sq; - X_FLOAT quad3_0102 = - X_F(2.0) * invmass1*invmass2 * r0102; - X_FLOAT quad3_0112 = - X_F(2.0) * (invmass1+invmass2)*invmass1 * r0112; - X_FLOAT quad3_0212 = X_F(2.0) * (invmass1+invmass2)*invmass2 * r0212; - // iterate until converged - - X_FLOAT lamda01 = X_F(0.0); - X_FLOAT lamda02 = X_F(0.0); - X_FLOAT lamda12 = X_F(0.0); - int niter = 0; - int done = 0; - - X_FLOAT quad1,quad2,quad3,b1,b2,b3,lamda01_new,lamda02_new,lamda12_new; - -//maybe all running full loop? 
- while (__any(!done) && niter < _max_iter) { - quad1 = quad1_0101 * lamda01*lamda01 + - quad1_0202 * lamda02*lamda02 + - quad1_1212 * lamda12*lamda12 + - quad1_0102 * lamda01*lamda02 + - quad1_0112 * lamda01*lamda12 + - quad1_0212 * lamda02*lamda12; - - quad2 = quad2_0101 * lamda01*lamda01 + - quad2_0202 * lamda02*lamda02 + - quad2_1212 * lamda12*lamda12 + - quad2_0102 * lamda01*lamda02 + - quad2_0112 * lamda01*lamda12 + - quad2_0212 * lamda02*lamda12; - - quad3 = quad3_0101 * lamda01*lamda01 + - quad3_0202 * lamda02*lamda02 + - quad3_1212 * lamda12*lamda12 + - quad3_0102 * lamda01*lamda02 + - quad3_0112 * lamda01*lamda12 + - quad3_0212 * lamda02*lamda12; - - b1 = bond1*bond1 - s01sq - quad1; - b2 = bond2*bond2 - s02sq - quad2; - b3 = bond12*bond12 - s12sq - quad3; - - lamda01_new = a11inv*b1 + a12inv*b2 + a13inv*b3; - lamda02_new = a21inv*b1 + a22inv*b2 + a23inv*b3; - lamda12_new = a31inv*b1 + a32inv*b2 + a33inv*b3; - - done++; - done = (fabs(lamda01_new-lamda01) > _tolerance)?0: done; - done = (fabs(lamda02_new-lamda02) > _tolerance)?0: done; - done = (fabs(lamda12_new-lamda12) > _tolerance)?0: done; - - lamda01 = done<2?lamda01_new:lamda01; - lamda02 = done<2?lamda02_new:lamda02; - lamda12 = done<2?lamda12_new:lamda12; - niter++; - } - // update forces if atom is owned by this processor - - lamda01 *= X_F(1.0)/_dtfsq; - lamda02 *= X_F(1.0)/_dtfsq; - lamda12 *= X_F(1.0)/_dtfsq; - - - //attenion: are shake clusters <-> atom unique? - nlist = 0; - if (i0 < _nlocal) { - _f[i0] += lamda01*r01.x + lamda02*r02.x; - _f[i0+_nmax] += lamda01*r01.y + lamda02*r02.y; - _f[i0+2*_nmax] += lamda01*r01.z + lamda02*r02.z; - list[nlist++] = i0; - } - - if (i1 < _nlocal) { - _f[i1] -= lamda01*r01.x - lamda12*r12.x; - _f[i1+_nmax] -= lamda01*r01.y - lamda12*r12.y; - _f[i1+2*_nmax] -= lamda01*r01.z - lamda12*r12.z; - list[nlist++] = i1; - } - - if (i2 < _nlocal) { - _f[i2] -= lamda02*r02.x + lamda12*r12.x; - _f[i2+_nmax] -= lamda02*r02.y + lamda12*r12.y; - _f[i2+2*_nmax] -= lamda02*r02.z + lamda12*r12.z; - list[nlist++] = i2; - } - - if (vflag||vflag_atom) { - ENERGY_FLOAT* shared = &sharedmem[threadIdx.x]; - X_FLOAT factor=X_F(2.0)/X_F(3.0)*nlist; - v[0] = lamda01*r01.x*r01.x + lamda02*r02.x*r02.x + lamda12*r12.x*r12.x; *shared = factor*v[0]; shared+=blockDim.x; //times 2.0 since the reducing function is the same as in force calculations, which adds a factor 0.5 - v[1] = lamda01*r01.y*r01.y + lamda02*r02.y*r02.y + lamda12*r12.y*r12.y; *shared = factor*v[1]; shared+=blockDim.x; - v[2] = lamda01*r01.z*r01.z + lamda02*r02.z*r02.z + lamda12*r12.z*r12.z; *shared = factor*v[2]; shared+=blockDim.x; - v[3] = lamda01*r01.x*r01.y + lamda02*r02.x*r02.y + lamda12*r12.x*r12.y; *shared = factor*v[3]; shared+=blockDim.x; - v[4] = lamda01*r01.x*r01.z + lamda02*r02.x*r02.z + lamda12*r12.x*r12.z; *shared = factor*v[4]; shared+=blockDim.x; - v[5] = lamda01*r01.y*r01.z + lamda02*r02.y*r02.z + lamda12*r12.y*r12.z; *shared = factor*v[5]; shared+=blockDim.x; - - v_tally(vflag,vflag_atom,nlist,list,3.0,v); - } -} - -__global__ void FixShakeCuda_Shake_Kernel(int vflag,int vflag_atom,int* list,int nlist) -{ - int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x; - if(i_nmax) _flag[0]=1; - X_FLOAT3 xs=_xshake[j]; - ((X_FLOAT*) _buffer)[i]=xs.x + dx; - ((X_FLOAT*) _buffer)[i+1*n] = xs.y + dy; - ((X_FLOAT*) _buffer)[i+2*n] = xs.z + dz; - } - -} - -__global__ void FixShakeCuda_PackComm_Self_Kernel(int* sendlist,int n,int maxlistlength,int iswap,X_FLOAT dx,X_FLOAT dy,X_FLOAT dz,int first) -{ - int 
i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x; - int* list=sendlist+iswap*maxlistlength; - if(i<n) - { - int j=list[i]; - if(j>_nmax) _flag[0]=1; - X_FLOAT3 xs=_xshake[j]; - xs.x += dx; - xs.y += dy; - xs.z += dz; - _xshake[i+first]=xs; - } - -} - -__global__ void FixShakeCuda_UnpackComm_Kernel(int n,int first) -{ - int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x; - if(i -#define MY_PREFIX fix_temp_berendsen_cuda -#include "cuda_shared.h" -#include "cuda_common.h" -#include "crm_cuda_utils.cu" - -#include "fix_temp_berendsen_cuda_cu.h" -#include "fix_temp_berendsen_cuda_kernel.cu" - - -void Cuda_FixTempBerendsenCuda_UpdateNmax(cuda_shared_data* sdata) -{ - cudaMemcpyToSymbol(MY_CONST(mask) , & sdata->atom.mask .dev_data, sizeof(int*) ); - cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); - cudaMemcpyToSymbol(MY_CONST(nmax) , & sdata->atom.nmax , sizeof(int) ); - cudaMemcpyToSymbol(MY_CONST(v) , & sdata->atom.v .dev_data, sizeof(X_FLOAT*) ); -} - -void Cuda_FixTempBerendsenCuda_Init(cuda_shared_data* sdata) -{ - Cuda_FixTempBerendsenCuda_UpdateNmax(sdata); - -} - - -void Cuda_FixTempBerendsenCuda_EndOfStep(cuda_shared_data* sdata, int groupbit,double afactor) -{ - V_FLOAT factor=afactor; - if(sdata->atom.update_nmax) - Cuda_FixTempBerendsenCuda_UpdateNmax(sdata); - if(sdata->atom.update_nlocal) - cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); - - int3 layout=getgrid(sdata->atom.nlocal); - dim3 threads(layout.z, 1, 1); - dim3 grid(layout.x, layout.y, 1); - - Cuda_FixTempBerendsenCuda_EndOfStep_Kernel<<<grid, threads>>> (groupbit,factor); - cudaThreadSynchronize(); - CUT_CHECK_ERROR("Cuda_Cuda_FixTempBerendsenCuda_PostForce: fix add_force post_force compute Kernel execution failed"); -} diff --git a/lib/cuda/fix_temp_berendsen_cuda_cu.h b/lib/cuda/fix_temp_berendsen_cuda_cu.h deleted file mode 100644 index fd64f98e42..0000000000 --- a/lib/cuda/fix_temp_berendsen_cuda_cu.h +++ /dev/null @@ -1,27 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - - Original Version: - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - See the README file in the top-level LAMMPS directory. - - ----------------------------------------------------------------------- - - USER-CUDA Package and associated modifications: - https://sourceforge.net/projects/lammpscuda/ - - Christian Trott, christian.trott@tu-ilmenau.de - Lars Winterfeld, lars.winterfeld@tu-ilmenau.de - Theoretical Physics II, University of Technology Ilmenau, Germany - - See the README file in the USER-CUDA directory. - - This software is distributed under the GNU General Public License. 
-------------------------------------------------------------------------- */ - -#include "cuda_shared.h" - -extern "C" void Cuda_FixTempBerendsenCuda_Init(cuda_shared_data* sdata); -extern "C" void Cuda_FixTempBerendsenCuda_EndOfStep(cuda_shared_data* sdata, int groupbit,double afactor); diff --git a/lib/cuda/fix_temp_berendsen_cuda_kernel.cu b/lib/cuda/fix_temp_berendsen_cuda_kernel.cu deleted file mode 100644 index 716cbeac1e..0000000000 --- a/lib/cuda/fix_temp_berendsen_cuda_kernel.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - - Original Version: - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - See the README file in the top-level LAMMPS directory. - - ----------------------------------------------------------------------- - - USER-CUDA Package and associated modifications: - https://sourceforge.net/projects/lammpscuda/ - - Christian Trott, christian.trott@tu-ilmenau.de - Lars Winterfeld, lars.winterfeld@tu-ilmenau.de - Theoretical Physics II, University of Technology Ilmenau, Germany - - See the README file in the USER-CUDA directory. - - This software is distributed under the GNU General Public License. -------------------------------------------------------------------------- */ - - - -__global__ void Cuda_FixTempBerendsenCuda_EndOfStep_Kernel(int groupbit,V_FLOAT factor) -{ - int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x; - if(i < _nlocal) - if (_mask[i] & groupbit) { - _v[i]*=factor; - _v[i+_nmax]*=factor; - _v[i+2*_nmax]*=factor; - } -} - diff --git a/lib/cuda/fix_temp_rescale_cuda.cu b/lib/cuda/fix_temp_rescale_cuda.cu deleted file mode 100644 index 6ca0942970..0000000000 --- a/lib/cuda/fix_temp_rescale_cuda.cu +++ /dev/null @@ -1,64 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - - Original Version: - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - See the README file in the top-level LAMMPS directory. - - ----------------------------------------------------------------------- - - USER-CUDA Package and associated modifications: - https://sourceforge.net/projects/lammpscuda/ - - Christian Trott, christian.trott@tu-ilmenau.de - Lars Winterfeld, lars.winterfeld@tu-ilmenau.de - Theoretical Physics II, University of Technology Ilmenau, Germany - - See the README file in the USER-CUDA directory. - - This software is distributed under the GNU General Public License. 
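The EndOfStep kernel above only multiplies the group's velocities by a precomputed factor; the factor itself comes from the host-side fix. Assuming the standard Berendsen weak-coupling form used by plain LAMMPS, lamda = sqrt(1 + (dt/t_period)(T_target/T_current - 1)), a sketch of what arrives here as afactor (berendsen_factor is an illustrative name):

    #include <cmath>

    // Sketch: Berendsen velocity-scaling factor, computed once per step
    // on the host and passed to the kernel as 'afactor'.
    double berendsen_factor(double dt, double t_period,
                            double t_current, double t_target)
    {
        return std::sqrt(1.0 + dt / t_period * (t_target / t_current - 1.0));
    }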
-------------------------------------------------------------------------- */ - -#include <stdio.h> -#define MY_PREFIX fix_temp_rescale_cuda -#include "cuda_shared.h" -#include "cuda_common.h" -#include "crm_cuda_utils.cu" - -#include "fix_temp_rescale_cuda_cu.h" -#include "fix_temp_rescale_cuda_kernel.cu" - - -void Cuda_FixTempRescaleCuda_UpdateNmax(cuda_shared_data* sdata) -{ - cudaMemcpyToSymbol(MY_CONST(mask) , & sdata->atom.mask .dev_data, sizeof(int*) ); - cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); - cudaMemcpyToSymbol(MY_CONST(nmax) , & sdata->atom.nmax , sizeof(int) ); - cudaMemcpyToSymbol(MY_CONST(v) , & sdata->atom.v .dev_data, sizeof(X_FLOAT*) ); -} - -void Cuda_FixTempRescaleCuda_Init(cuda_shared_data* sdata) -{ - Cuda_FixTempRescaleCuda_UpdateNmax(sdata); - -} - - -void Cuda_FixTempRescaleCuda_EndOfStep(cuda_shared_data* sdata, int groupbit,double afactor) -{ - V_FLOAT factor=afactor; - //if(sdata->atom.update_nmax) //fix temp rescale is usually not called every timestep so it might miss an update step - Cuda_FixTempRescaleCuda_UpdateNmax(sdata); - //if(sdata->atom.update_nlocal) - //cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); - - int3 layout=getgrid(sdata->atom.nlocal); - dim3 threads(layout.z, 1, 1); - dim3 grid(layout.x, layout.y, 1); - - Cuda_FixTempRescaleCuda_EndOfStep_Kernel<<<grid, threads>>> (groupbit,factor); - cudaThreadSynchronize(); - CUT_CHECK_ERROR("Cuda_Cuda_FixTempRescaleCuda_PostForce: fix add_force post_force compute Kernel execution failed"); -} diff --git a/lib/cuda/fix_temp_rescale_cuda_cu.h b/lib/cuda/fix_temp_rescale_cuda_cu.h deleted file mode 100644 index 689b51a603..0000000000 --- a/lib/cuda/fix_temp_rescale_cuda_cu.h +++ /dev/null @@ -1,27 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - - Original Version: - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - See the README file in the top-level LAMMPS directory. - - ----------------------------------------------------------------------- - - USER-CUDA Package and associated modifications: - https://sourceforge.net/projects/lammpscuda/ - - Christian Trott, christian.trott@tu-ilmenau.de - Lars Winterfeld, lars.winterfeld@tu-ilmenau.de - Theoretical Physics II, University of Technology Ilmenau, Germany - - See the README file in the USER-CUDA directory. - - This software is distributed under the GNU General Public License. -------------------------------------------------------------------------- */ - -#include "cuda_shared.h" - -extern "C" void Cuda_FixTempRescaleCuda_Init(cuda_shared_data* sdata); -extern "C" void Cuda_FixTempRescaleCuda_EndOfStep(cuda_shared_data* sdata, int groupbit,double afactor); diff --git a/lib/cuda/fix_temp_rescale_cuda_kernel.cu b/lib/cuda/fix_temp_rescale_cuda_kernel.cu deleted file mode 100644 index 19d04a5156..0000000000 --- a/lib/cuda/fix_temp_rescale_cuda_kernel.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - - Original Version: - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - See the README file in the top-level LAMMPS directory.
- - ----------------------------------------------------------------------- - - USER-CUDA Package and associated modifications: - https://sourceforge.net/projects/lammpscuda/ - - Christian Trott, christian.trott@tu-ilmenau.de - Lars Winterfeld, lars.winterfeld@tu-ilmenau.de - Theoretical Physics II, University of Technology Ilmenau, Germany - - See the README file in the USER-CUDA directory. - - This software is distributed under the GNU General Public License. -------------------------------------------------------------------------- */ - - - -__global__ void Cuda_FixTempRescaleCuda_EndOfStep_Kernel(int groupbit,V_FLOAT factor) -{ - int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x; - if(i < _nlocal) - if (_mask[i] & groupbit) { - _v[i]*=factor; - _v[i+_nmax]*=factor; - _v[i+2*_nmax]*=factor; - } -} - diff --git a/lib/cuda/fix_temp_rescale_limit_cuda.cu b/lib/cuda/fix_temp_rescale_limit_cuda.cu deleted file mode 100644 index 5e2c43e932..0000000000 --- a/lib/cuda/fix_temp_rescale_limit_cuda.cu +++ /dev/null @@ -1,64 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - - Original Version: - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - See the README file in the top-level LAMMPS directory. - - ----------------------------------------------------------------------- - - USER-CUDA Package and associated modifications: - https://sourceforge.net/projects/lammpscuda/ - - Christian Trott, christian.trott@tu-ilmenau.de - Lars Winterfeld, lars.winterfeld@tu-ilmenau.de - Theoretical Physics II, University of Technology Ilmenau, Germany - - See the README file in the USER-CUDA directory. - - This software is distributed under the GNU General Public License. 
-------------------------------------------------------------------------- */ - -#include <stdio.h> -#define MY_PREFIX fix_temp_rescale_limit_cuda -#include "cuda_shared.h" -#include "cuda_common.h" -#include "crm_cuda_utils.cu" - -#include "fix_temp_rescale_limit_cuda_cu.h" -#include "fix_temp_rescale_limit_cuda_kernel.cu" - - -void Cuda_FixTempRescaleLimitCuda_UpdateNmax(cuda_shared_data* sdata) -{ - cudaMemcpyToSymbol(MY_CONST(mask) , & sdata->atom.mask .dev_data, sizeof(int*) ); - cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); - cudaMemcpyToSymbol(MY_CONST(nmax) , & sdata->atom.nmax , sizeof(int) ); - cudaMemcpyToSymbol(MY_CONST(v) , & sdata->atom.v .dev_data, sizeof(X_FLOAT*) ); -} - -void Cuda_FixTempRescaleLimitCuda_Init(cuda_shared_data* sdata) -{ - Cuda_FixTempRescaleLimitCuda_UpdateNmax(sdata); - -} - - -void Cuda_FixTempRescaleLimitCuda_EndOfStep(cuda_shared_data* sdata, int groupbit,double afactor,double limit) -{ - V_FLOAT factor=afactor; - //if(sdata->atom.update_nmax) //fix temp rescale is usually not called every timestep so it might miss an update step - Cuda_FixTempRescaleLimitCuda_UpdateNmax(sdata); - //if(sdata->atom.update_nlocal) - //cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); - - int3 layout=getgrid(sdata->atom.nlocal); - dim3 threads(layout.z, 1, 1); - dim3 grid(layout.x, layout.y, 1); - - Cuda_FixTempRescaleLimitCuda_EndOfStep_Kernel<<<grid, threads>>> (groupbit,factor,limit); - cudaThreadSynchronize(); - CUT_CHECK_ERROR("Cuda_Cuda_FixTempRescaleLimitCuda_PostForce: fix add_force post_force compute Kernel execution failed"); -} diff --git a/lib/cuda/fix_temp_rescale_limit_cuda_cu.h b/lib/cuda/fix_temp_rescale_limit_cuda_cu.h deleted file mode 100644 index 117bca28d8..0000000000 --- a/lib/cuda/fix_temp_rescale_limit_cuda_cu.h +++ /dev/null @@ -1,27 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - - Original Version: - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - See the README file in the top-level LAMMPS directory. - - ----------------------------------------------------------------------- - - USER-CUDA Package and associated modifications: - https://sourceforge.net/projects/lammpscuda/ - - Christian Trott, christian.trott@tu-ilmenau.de - Lars Winterfeld, lars.winterfeld@tu-ilmenau.de - Theoretical Physics II, University of Technology Ilmenau, Germany - - See the README file in the USER-CUDA directory. - - This software is distributed under the GNU General Public License. -------------------------------------------------------------------------- */ - -#include "cuda_shared.h" - -extern "C" void Cuda_FixTempRescaleLimitCuda_Init(cuda_shared_data* sdata); -extern "C" void Cuda_FixTempRescaleLimitCuda_EndOfStep(cuda_shared_data* sdata, int groupbit,double afactor,double limit); diff --git a/lib/cuda/fix_temp_rescale_limit_cuda_kernel.cu b/lib/cuda/fix_temp_rescale_limit_cuda_kernel.cu deleted file mode 100644 index a6cf446677..0000000000 --- a/lib/cuda/fix_temp_rescale_limit_cuda_kernel.cu +++ /dev/null @@ -1,43 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - - Original Version: - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - See the README file in the top-level LAMMPS directory.
- - ----------------------------------------------------------------------- - - USER-CUDA Package and associated modifications: - https://sourceforge.net/projects/lammpscuda/ - - Christian Trott, christian.trott@tu-ilmenau.de - Lars Winterfeld, lars.winterfeld@tu-ilmenau.de - Theoretical Physics II, University of Technology Ilmenau, Germany - - See the README file in the USER-CUDA directory. - - This software is distributed under the GNU General Public License. -------------------------------------------------------------------------- */ - - - -__global__ void Cuda_FixTempRescaleLimitCuda_EndOfStep_Kernel(int groupbit,V_FLOAT factor,V_FLOAT limit) -{ - int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x; - if(i < _nlocal) - if (_mask[i] & groupbit) { - V_FLOAT vx = _v[i]; - V_FLOAT vy = _v[i+_nmax]; - V_FLOAT vz = _v[i+2*_nmax]; - vx*=factor; - vy*=factor; - vz*=factor; - - _v[i]=vx>0?min(vx,limit):max(vx,-limit); - _v[i+_nmax]=vy>0?min(vy,limit):max(vy,-limit); - _v[i+2*_nmax]=vz>0?min(vz,limit):max(vz,-limit); - } -} - diff --git a/lib/cuda/fix_viscous_cuda.cu b/lib/cuda/fix_viscous_cuda.cu deleted file mode 100644 index 3406115e58..0000000000 --- a/lib/cuda/fix_viscous_cuda.cu +++ /dev/null @@ -1,66 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - - Original Version: - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - See the README file in the top-level LAMMPS directory. - - ----------------------------------------------------------------------- - - USER-CUDA Package and associated modifications: - https://sourceforge.net/projects/lammpscuda/ - - Christian Trott, christian.trott@tu-ilmenau.de - Lars Winterfeld, lars.winterfeld@tu-ilmenau.de - Theoretical Physics II, University of Technology Ilmenau, Germany - - See the README file in the USER-CUDA directory. - - This software is distributed under the GNU General Public License. 
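The limit variant above first rescales and then saturates each velocity component at plus or minus limit while preserving its sign, via vx>0?min(vx,limit):max(vx,-limit). A tiny host-side sketch of that clamp (standalone and hypothetical, not taken from the deleted file):

#include <cstdio>
#include <algorithm>

// Scale one velocity component, then cap its magnitude at 'limit', keeping the sign.
static double scale_and_clamp(double v, double factor, double limit)
{
  v *= factor;
  return v > 0 ? std::min(v, limit) : std::max(v, -limit);
}

int main()
{
  // factor 2.0, limit 1.5: 1.0 -> 2.0 -> clamped to 1.5; -0.4 -> -0.8 passes through.
  printf("%g %g\n", scale_and_clamp(1.0, 2.0, 1.5), scale_and_clamp(-0.4, 2.0, 1.5));
  return 0;
}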
-------------------------------------------------------------------------- */ - -#include <stdio.h> -#define MY_PREFIX fix_viscous_cuda -#include "cuda_shared.h" -#include "cuda_common.h" -#include "crm_cuda_utils.cu" - -#include "fix_viscous_cuda_cu.h" -#include "fix_viscous_cuda_kernel.cu" - -void Cuda_FixViscousCuda_UpdateNmax(cuda_shared_data* sdata) -{ - cudaMemcpyToSymbol(MY_CONST(mask) , & sdata->atom.mask .dev_data, sizeof(int*) ); - cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); - cudaMemcpyToSymbol(MY_CONST(nmax) , & sdata->atom.nmax , sizeof(int) ); - cudaMemcpyToSymbol(MY_CONST(v) , & sdata->atom.x .dev_data, sizeof(X_FLOAT*) ); - cudaMemcpyToSymbol(MY_CONST(f) , & sdata->atom.f .dev_data, sizeof(F_FLOAT*) ); - cudaMemcpyToSymbol(MY_CONST(type) , & sdata->atom.type .dev_data, sizeof(int*) ); -} - -void Cuda_FixViscousCuda_Init(cuda_shared_data* sdata) -{ - Cuda_FixViscousCuda_UpdateNmax(sdata); - -} - - -void Cuda_FixViscousCuda_PostForce(cuda_shared_data* sdata, int groupbit,void* gamma) -{ - if(sdata->atom.update_nmax) - Cuda_FixViscousCuda_UpdateNmax(sdata); - if(sdata->atom.update_nlocal) - cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); - - - int3 layout=getgrid(sdata->atom.nlocal,0); - dim3 threads(layout.z, 1, 1); - dim3 grid(layout.x, layout.y, 1); - - Cuda_FixViscousCuda_PostForce_Kernel<<<grid, threads>>> (groupbit,(F_FLOAT*) gamma); - cudaThreadSynchronize(); - CUT_CHECK_ERROR("Cuda_Cuda_FixViscousCuda_PostForce: Kernel execution failed"); - -} diff --git a/lib/cuda/fix_viscous_cuda_cu.h b/lib/cuda/fix_viscous_cuda_cu.h deleted file mode 100644 index b785a598a8..0000000000 --- a/lib/cuda/fix_viscous_cuda_cu.h +++ /dev/null @@ -1,27 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - - Original Version: - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - See the README file in the top-level LAMMPS directory. - - ----------------------------------------------------------------------- - - USER-CUDA Package and associated modifications: - https://sourceforge.net/projects/lammpscuda/ - - Christian Trott, christian.trott@tu-ilmenau.de - Lars Winterfeld, lars.winterfeld@tu-ilmenau.de - Theoretical Physics II, University of Technology Ilmenau, Germany - - See the README file in the USER-CUDA directory. - - This software is distributed under the GNU General Public License. -------------------------------------------------------------------------- */ - -#include "cuda_shared.h" - -extern "C" void Cuda_FixViscousCuda_Init(cuda_shared_data* sdata); -extern "C" void Cuda_FixViscousCuda_PostForce(cuda_shared_data* sdata, int groupbit,void* gamma); diff --git a/lib/cuda/fix_viscous_cuda_kernel.cu b/lib/cuda/fix_viscous_cuda_kernel.cu deleted file mode 100644 index 2cd225bbd1..0000000000 --- a/lib/cuda/fix_viscous_cuda_kernel.cu +++ /dev/null @@ -1,35 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - - Original Version: - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - See the README file in the top-level LAMMPS directory.
- - ----------------------------------------------------------------------- - - USER-CUDA Package and associated modifications: - https://sourceforge.net/projects/lammpscuda/ - - Christian Trott, christian.trott@tu-ilmenau.de - Lars Winterfeld, lars.winterfeld@tu-ilmenau.de - Theoretical Physics II, University of Technology Ilmenau, Germany - - See the README file in the USER-CUDA directory. - - This software is distributed under the GNU General Public License. -------------------------------------------------------------------------- */ - -__global__ void Cuda_FixViscousCuda_PostForce_Kernel(int groupbit,F_FLOAT* gamma) -{ - int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x; - - if(i < _nlocal) - if (_mask[i] & groupbit) { - F_FLOAT drag = gamma[_type[i]]; - _f[i] -= drag*_v[i]; - _f[i+1*_nmax] -= drag*_v[i+1*_nmax]; - _f[i+2*_nmax] -= drag*_v[i+2*_nmax]; - } -} diff --git a/lib/cuda/neighbor.cu b/lib/cuda/neighbor.cu deleted file mode 100644 index b1732bf9b6..0000000000 --- a/lib/cuda/neighbor.cu +++ /dev/null @@ -1,367 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - - Original Version: - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - See the README file in the top-level LAMMPS directory. - - ----------------------------------------------------------------------- - - USER-CUDA Package and associated modifications: - https://sourceforge.net/projects/lammpscuda/ - - Christian Trott, christian.trott@tu-ilmenau.de - Lars Winterfeld, lars.winterfeld@tu-ilmenau.de - Theoretical Physics II, University of Technology Ilmenau, Germany - - See the README file in the USER-CUDA directory. - - This software is distributed under the GNU General Public License. 
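The viscous kernel above applies a per-type Stokes drag, F = -gamma[type] * v, to each force component, again with the _nmax stride between the x, y and z blocks. A self-contained sketch of the same update (the array names and harness are assumptions for illustration, not the deleted code):

#include <cstdio>
#include <cuda_runtime.h>

// f and v hold x|y|z blocks of nmax entries each; gamma is indexed by atom type.
__global__ void viscous_drag(double* f, const double* v, const double* gamma,
                             const int* type, int nlocal, int nmax)
{
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < nlocal) {
    double drag = gamma[type[i]];
    f[i]            -= drag * v[i];
    f[i + nmax]     -= drag * v[i + nmax];
    f[i + 2 * nmax] -= drag * v[i + 2 * nmax];
  }
}

int main()
{
  const int nlocal = 256, nmax = 256, ntypes = 3;
  double *d_f, *d_v, *d_gamma; int* d_type;
  cudaMalloc(&d_f, sizeof(double) * 3 * nmax);
  cudaMalloc(&d_v, sizeof(double) * 3 * nmax);
  cudaMalloc(&d_gamma, sizeof(double) * (ntypes + 1)); // types start at 1
  cudaMalloc(&d_type, sizeof(int) * nmax);
  cudaMemset(d_f, 0, sizeof(double) * 3 * nmax);
  cudaMemset(d_v, 0, sizeof(double) * 3 * nmax);
  cudaMemset(d_type, 0, sizeof(int) * nmax);
  cudaMemset(d_gamma, 0, sizeof(double) * (ntypes + 1));

  viscous_drag<<<(nlocal + 127) / 128, 128>>>(d_f, d_v, d_gamma, d_type, nlocal, nmax);
  cudaDeviceSynchronize();
  printf("drag applied to %d atoms\n", nlocal);
  cudaFree(d_f); cudaFree(d_v); cudaFree(d_gamma); cudaFree(d_type);
  return 0;
}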
-------------------------------------------------------------------------- */ - -#include <stdio.h> -#include <time.h> -#define MY_PREFIX neighbor -#define IncludeCommonNeigh -#include "cuda_shared.h" -#include "cuda_common.h" -#include "crm_cuda_utils.cu" -#include "cuda_wrapper_cu.h" - -#define _cutneighsq MY_AP(cutneighsq) -#define _ex_type MY_AP(ex_type) -#define _nex_type MY_AP(nex_type) -#define _ex1_bit MY_AP(ex1_bit) -#define _ex2_bit MY_AP(ex2_bit) -#define _nex_group MY_AP(nex_group) -#define _ex_mol_bit MY_AP(ex_mol_bit) -#define _nex_mol MY_AP(nex_mol) -__device__ __constant__ CUDA_FLOAT* _cutneighsq; -__device__ __constant__ int* _ex_type; -__device__ __constant__ int _nex_type; -__device__ __constant__ int* _ex1_bit; -__device__ __constant__ int* _ex2_bit; -__device__ __constant__ int _nex_group; -__device__ __constant__ int* _ex_mol_bit; -__device__ __constant__ int _nex_mol; - -#include "neighbor_cu.h" -#include "neighbor_kernel.cu" - -void Cuda_Neighbor_UpdateBuffer(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist) -{ - CUT_CHECK_ERROR("Cuda_PairLJCutCuda: before updateBuffer failed"); - - int size=(unsigned)(sizeof(int)*20+sneighlist->bin_dim[0]*sneighlist->bin_dim[1]*sneighlist->bin_dim[2]*(sizeof(int)+sneighlist->bin_nmax*3*sizeof(CUDA_FLOAT))); - if(sdata->buffersize<size) - { - MYDBG(printf("Cuda_Neighbor Resizing Buffer at %p with %i kB to\n",sdata->buffer,sdata->buffersize);) - if(sdata->buffer!=NULL) CudaWrapper_FreeCudaData(sdata->buffer,sdata->buffersize); - sdata->buffer=CudaWrapper_AllocCudaData(size); - sdata->buffersize=size; - sdata->buffer_new++; - MYDBG(printf("New buffer at %p with %i kB\n",sdata->buffer,sdata->buffersize);) - } - cudaMemcpyToSymbol(MY_CONST(buffer), & sdata->buffer, sizeof(int*) ); - CUT_CHECK_ERROR("Cuda_PairLJCutCuda: updateBuffer failed"); -} - -int Cuda_BinAtoms(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist) -{ - if(sdata->buffer_new) - Cuda_Neighbor_UpdateBuffer(sdata,sneighlist); - - // initialize only on first call - CUDA_FLOAT rez_bin_size[3] = - { - (1.0 * sneighlist->bin_dim[0]-4.0) / (sdata->domain.subhi[0] - sdata->domain.sublo[0]), - (1.0 * sneighlist->bin_dim[1]-4.0) / (sdata->domain.subhi[1] - sdata->domain.sublo[1]), - (1.0 * sneighlist->bin_dim[2]-4.0) / (sdata->domain.subhi[2] - sdata->domain.sublo[2]) - }; - - short init = 0; - if(!
init) - { - init = 0; - cudaMemcpyToSymbol(MY_CONST(x) , & sdata->atom.x .dev_data, sizeof(X_FLOAT*) ); - cudaMemcpyToSymbol(MY_CONST(nall) , & sdata->atom.nall , sizeof(unsigned) ); - cudaMemcpyToSymbol(MY_CONST(nmax) , & sdata->atom.nmax , sizeof(unsigned) ); - cudaMemcpyToSymbol(MY_CONST(sublo) , sdata->domain.sublo , sizeof(X_FLOAT)*3); - } - - - int3 layout = getgrid(sdata->atom.nall); // sneighlist->inum - dim3 threads(layout.z, 1, 1); - dim3 grid(layout.x, layout.y, 1); - - timespec starttime,endtime; - clock_gettime(CLOCK_REALTIME,&starttime); - - cudaMemset((int*) (sdata->buffer),0,sizeof(int)*(20+(sneighlist->bin_dim[0])*(sneighlist->bin_dim[1])*(sneighlist->bin_dim[2]))+3*sizeof(CUDA_FLOAT)*(sneighlist->bin_dim[0])*(sneighlist->bin_dim[1])*(sneighlist->bin_dim[2])*(sneighlist->bin_nmax)); - - Binning_Kernel<<<grid, threads>>> (sneighlist->binned_id,sneighlist->bin_nmax,sneighlist->bin_dim[0],sneighlist->bin_dim[1],sneighlist->bin_dim[2],rez_bin_size[0],rez_bin_size[1],rez_bin_size[2]); - cudaThreadSynchronize(); - - clock_gettime(CLOCK_REALTIME,&endtime); - sdata->cuda_timings.neigh_bin+= - endtime.tv_sec-starttime.tv_sec+1.0*(endtime.tv_nsec-starttime.tv_nsec)/1000000000; - - - int binning_error; - cudaMemcpy((void*) &binning_error,(void*) sdata->buffer,1*sizeof(int),cudaMemcpyDeviceToHost); - if(binning_error) - { - sneighlist->bin_extraspace+=0.05; - } - else - { - MYDBG(printf("CUDA: binning successful\n");) - } - CUT_CHECK_ERROR("Cuda_Binning: binning Kernel execution failed"); - return binning_error; -} - -int Cuda_NeighborBuildFullBin(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist) -{ - //Cuda_Neighbor_UpdateBuffer(sdata,sneighlist); - CUDA_FLOAT globcutoff=-1.0; - - short init=0; - if(! init) - { - init = 1; - - // !! LAMMPS indexes atom types starting with 1 !!
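Cuda_BinAtoms maps each atom into a 3-D grid of bins: rez_bin_size holds the reciprocal bin widths, computed so that (bin_dim - 4) bins span the subdomain, with the remaining four bins forming what appears to be a two-bin ghost border on each side; that would explain the +2 shift and the clamp into [0, bin_dim-1] in the Binning_Kernel further down. The flat index is bin_dim_z*(bin_dim_y*bx+by)+bz, and a per-bin atomicAdd on bin_count hands out slots; when a bin overflows bin_nmax the kernel flags an error and the host retries with bin_extraspace enlarged by 5%. A host-side sketch of the index arithmetic (a hypothetical standalone helper, assuming that layout):

#include <cstdio>
#include <algorithm>

// Flat bin index for one coordinate triple, mirroring the Binning_Kernel math
// (int truncation matches __float2int_rd for in-domain, non-negative offsets).
int flat_bin(double x, double y, double z, const double sublo[3],
             const double rez[3], const int dim[3])
{
  int b[3] = { int(rez[0] * (x - sublo[0])) + 2,   // +2: two ghost bins per side
               int(rez[1] * (y - sublo[1])) + 2,
               int(rez[2] * (z - sublo[2])) + 2 };
  for (int k = 0; k < 3; k++) b[k] = std::max(0, std::min(dim[k] - 1, b[k]));
  return dim[2] * (dim[1] * b[0] + b[1]) + b[2];   // z fastest, x slowest
}

int main()
{
  double sublo[3] = {0, 0, 0}, rez[3] = {1, 1, 1}; // bin width 1 in each direction
  int dim[3] = {8, 8, 8};
  printf("%d\n", flat_bin(1.5, 2.5, 0.5, sublo, rez, dim)); // bin (3,4,2) -> 226
  return 0;
}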
- - unsigned cuda_ntypes = sdata->atom.ntypes + 1; - - unsigned nx = sizeof(CUDA_FLOAT) * cuda_ntypes * cuda_ntypes; - - CUDA_FLOAT* acutneighsq = (CUDA_FLOAT*) malloc(nx); - //printf("Allocate: %i\n",nx); - sneighlist->cu_cutneighsq = (CUDA_FLOAT*) CudaWrapper_AllocCudaData(nx); - - if(sneighlist->cutneighsq) - { - int cutoffsdiffer=0; - double cutoff0 = sneighlist->cutneighsq[1][1]; - for(int i=1; i<=sdata->atom.ntypes; ++i) - { - for(int j=1; j<=sdata->atom.ntypes; ++j) - { - acutneighsq[i * cuda_ntypes + j] = (CUDA_FLOAT) (sneighlist->cutneighsq[i][j]); - if((sneighlist->cutneighsq[i][j]-cutoff0)*(sneighlist->cutneighsq[i][j]-cutoff0)>1e-6) cutoffsdiffer++; - } - } - if(not cutoffsdiffer) globcutoff=(CUDA_FLOAT) cutoff0; - } - else - { - MYEMUDBG( printf("# CUDA: Cuda_NeighborBuild: cutneighsq == NULL\n"); ) - return 0; - } - - int size = 100; - if(sdata->buffersize < size) - { - MYDBG( printf("Cuda_NeighborBuild Resizing Buffer at %p with %i kB to\n", sdata->buffer, sdata->buffersize); ) - CudaWrapper_FreeCudaData(sdata->buffer,sdata->buffersize); - sdata->buffer = CudaWrapper_AllocCudaData(size); - sdata->buffersize = size; - sdata->buffer_new++; - MYDBG( printf("New buffer at %p with %i kB\n", sdata->buffer, sdata->buffersize); ) - } - - CudaWrapper_UploadCudaData(acutneighsq,sneighlist->cu_cutneighsq,nx); - cudaMemcpyToSymbol(MY_CONST(cutneighsq) , &sneighlist->cu_cutneighsq , sizeof(CUDA_FLOAT*) ); - - cudaMemcpyToSymbol(MY_CONST(cuda_ntypes) , & cuda_ntypes , sizeof(unsigned) ); - cudaMemcpyToSymbol(MY_CONST(special_flag) , sdata->atom.special_flag , 4*sizeof(int) ); - cudaMemcpyToSymbol(MY_CONST(molecular) , & sdata->atom.molecular , sizeof(int) ); - } - - cudaMemcpyToSymbol(MY_CONST(neighbor_maxlocal), & sneighlist->firstneigh.dim[0] , sizeof(unsigned) ); - //cudaMemcpyToSymbol(MY_CONST(firstneigh) , & sneighlist->firstneigh.dev_data, sizeof(int*) ); - cudaMemcpyToSymbol(MY_CONST(ilist) , & sneighlist->ilist .dev_data, sizeof(int*) ); - cudaMemcpyToSymbol(MY_CONST(inum) , & sneighlist->inum , sizeof(int) ); - cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); - cudaMemcpyToSymbol(MY_CONST(nall) , & sdata->atom.nall , sizeof(int) ); - cudaMemcpyToSymbol(MY_CONST(numneigh) , & sneighlist->numneigh .dev_data, sizeof(int*) ); - cudaMemcpyToSymbol(MY_CONST(type) , & sdata->atom.type .dev_data, sizeof(int*) ); - cudaMemcpyToSymbol(MY_CONST(mask) , & sdata->atom.mask .dev_data, sizeof(int*) ); - cudaMemcpyToSymbol(MY_CONST(tag) , & sdata->atom.tag .dev_data, sizeof(int*) ); - cudaMemcpyToSymbol(MY_CONST(special) , & sdata->atom.special .dev_data, sizeof(int*) ); - cudaMemcpyToSymbol(MY_CONST(maxspecial) , & sdata->atom.maxspecial , sizeof(int) ); - cudaMemcpyToSymbol(MY_CONST(nspecial) , & sdata->atom.nspecial .dev_data, sizeof(int*) ); - cudaMemcpyToSymbol(MY_CONST(maxneighbors) , & sneighlist->maxneighbors , sizeof(int) ); - cudaMemcpyToSymbol(MY_CONST(debugdata) , & sdata->debugdata , sizeof(int*) ); - cudaMemcpyToSymbol(MY_CONST(overlap_comm) , & sdata->overlap_comm, sizeof(int) ); - cudaMemcpyToSymbol(MY_CONST(neighbors) , & sneighlist->neighbors.dev_data, sizeof(int*)); - cudaMemcpyToSymbol(MY_CONST(ex_type) , & sneighlist->ex_type.dev_data, sizeof(int*)); - cudaMemcpyToSymbol(MY_CONST(ex1_bit) , & sneighlist->ex1_bit.dev_data, sizeof(int*)); - cudaMemcpyToSymbol(MY_CONST(ex2_bit) , & sneighlist->ex2_bit.dev_data, sizeof(int*)); - cudaMemcpyToSymbol(MY_CONST(ex_mol_bit) , & sneighlist->ex_mol_bit.dev_data, sizeof(int*)); - 
cudaMemcpyToSymbol(MY_CONST(nex_type) , & sneighlist->nex_type, sizeof(int) ); - cudaMemcpyToSymbol(MY_CONST(nex_group) , & sneighlist->nex_group, sizeof(int) ); - cudaMemcpyToSymbol(MY_CONST(nex_mol) , & sneighlist->nex_mol, sizeof(int) ); - - if(sdata->overlap_comm) - { - cudaMemcpyToSymbol(MY_CONST(numneigh_border) , & sneighlist->numneigh_border .dev_data, sizeof(int*)); - cudaMemcpyToSymbol(MY_CONST(numneigh_inner) , & sneighlist->numneigh_inner .dev_data, sizeof(int*)); - cudaMemcpyToSymbol(MY_CONST(neighbors_border) , & sneighlist->neighbors_border.dev_data, sizeof(int*)); - cudaMemcpyToSymbol(MY_CONST(neighbors_inner) , & sneighlist->neighbors_inner .dev_data, sizeof(int*)); - cudaMemcpyToSymbol(MY_CONST(ilist_border) , & sneighlist->ilist_border .dev_data, sizeof(int*)); - cudaMemcpyToSymbol(MY_CONST(inum_border) , & sneighlist->inum_border .dev_data, sizeof(int*) ); - } - - //dim3 threads(sneighlist->bin_nmax,1,1); - dim3 threads(MIN(128,sneighlist->bin_nmax),1,1); - dim3 grid(sneighlist->bin_dim[0]*sneighlist->bin_dim[1],sneighlist->bin_dim[2],1); - - //printf("Configuration: %i %i %i %i %i\n",grid.x,grid.y,threads.x,(sizeof(int)+3*sizeof(X_FLOAT))*threads.x,sneighlist->bin_nmax); - int buffer[20]; - buffer[0]=1; - buffer[1]=0; - CudaWrapper_UploadCudaData( buffer, sdata->buffer, 2*sizeof(int)); - CUT_CHECK_ERROR("Cuda_NeighborBuild: pre neighbor build kernel error"); - //cudaMemset(sdata->debugdata,0,100*sizeof(int)); - unsigned int shared_size=(sizeof(int)+3*sizeof(CUDA_FLOAT))*threads.x; - MYDBG(printf("Configuration: %i %i %i %u %i\n",grid.x,grid.y,threads.x,shared_size,sneighlist->bin_nmax);) - //shared_size=2056; - timespec starttime,endtime; - clock_gettime(CLOCK_REALTIME,&starttime); - //for(int i=0;i<100;i++) - { - if(sdata->overlap_comm) - NeighborBuildFullBin_OverlapComm_Kernel<<<grid, threads,shared_size>>> - (sneighlist->binned_id,sneighlist->bin_nmax,sneighlist->bin_dim[0],sneighlist->bin_dim[1],globcutoff,sdata->pair.use_block_per_atom); - else - { - int exclude=sneighlist->nex_mol|sneighlist->nex_group|sneighlist->nex_type; - if(exclude) - NeighborBuildFullBin_Kernel<1><<<grid, threads,shared_size>>> - (sneighlist->binned_id,sneighlist->bin_nmax,sneighlist->bin_dim[0],sneighlist->bin_dim[1],globcutoff,sdata->pair.use_block_per_atom,sdata->pair.neighall); - else - NeighborBuildFullBin_Kernel<0><<<grid, threads,shared_size>>> - (sneighlist->binned_id,sneighlist->bin_nmax,sneighlist->bin_dim[0],sneighlist->bin_dim[1],globcutoff,sdata->pair.use_block_per_atom,sdata->pair.neighall); - } - //NeighborBuildFullBin_Kernel_Restrict<<<grid, threads,shared_size>>> - // (sneighlist->binned_id,sneighlist->bin_nmax,sneighlist->bin_dim[0],sneighlist->bin_dim[1],globcutoff); - - cudaThreadSynchronize(); - CUT_CHECK_ERROR("Cuda_NeighborBuild: neighbor build kernel execution failed"); - clock_gettime(CLOCK_REALTIME,&endtime); - sdata->cuda_timings.neigh_build+= - endtime.tv_sec-starttime.tv_sec+1.0*(endtime.tv_nsec-starttime.tv_nsec)/1000000000; - //dim3 threads,grid; - CudaWrapper_DownloadCudaData(buffer, sdata->buffer, sizeof(int)); - if(buffer[0]>=0&&true&&sdata->atom.molecular) - { - //printf("Find Special: %i %i\n",sneighlist->inum,sdata->atom.nall); - clock_gettime(CLOCK_REALTIME,&starttime); - int3 layout=getgrid(sdata->atom.nlocal,0,512); - threads.x = layout.z; threads.y = 1; threads.z = 1; - grid.x = layout.x; grid.y = layout.y; grid.z = 1; - FindSpecial<<<grid, threads>>>(sdata->pair.use_block_per_atom); - cudaThreadSynchronize(); - CUT_CHECK_ERROR("Cuda_NeighborBuild: FindSpecial kernel execution failed"); - clock_gettime(CLOCK_REALTIME,&endtime); - sdata->cuda_timings.neigh_special+= -
endtime.tv_sec-starttime.tv_sec+1.0*(endtime.tv_nsec-starttime.tv_nsec)/1000000000; - } - } - //printf("Neightime: %lf\n",sdata->cuda_timings.test1); - CUT_CHECK_ERROR("Cuda_NeighborBuild: neighbor build kernel execution failed"); - - //CudaWrapper_DownloadCudaData(buffer, sneighlist->numneigh_border .dev_data, sizeof(int)); - - MYDBG(printf("Cuda_NeighborBuildFullBin build neighbor list ... end\n");) - return buffer[0]; -} - -int Cuda_NeighborBuildFullNsq(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist) -{ - MYDBG(printf("Cuda_NeighborBuildFullNsq build neighbor list ... start\n");) - // initialize only on first call - /*static*/ short init=0; - if(! init) - { - init = 1; - - // !! LAMMPS indexes atom types starting with 1 !! - - unsigned cuda_ntypes = sdata->atom.ntypes + 1; - if(cuda_ntypes*cuda_ntypes > CUDA_MAX_TYPES2) - printf("# CUDA: Cuda_PairLJCutCuda_Init: you need %u types. this is more than %u " - "(assumed at compile time). re-compile with -DCUDA_MAX_TYPES_PLUS_ONE=32 " - "or ajust this in cuda_common.h\n", cuda_ntypes, CUDA_MAX_TYPES2); - - unsigned nx = sizeof(CUDA_FLOAT) * cuda_ntypes * cuda_ntypes; - CUDA_FLOAT* acutneighsq = (CUDA_FLOAT*) malloc(nx); - - if(sneighlist->cutneighsq) - { - for(int i=1; i<=sdata->atom.ntypes; ++i) - { - for(int j=1; j<=sdata->atom.ntypes; ++j) - { - acutneighsq[i * cuda_ntypes + j] = (CUDA_FLOAT) (sneighlist->cutneighsq[i][j]); - //printf("CUTOFFS: %i %i %i %e\n",i,j,cuda_ntypes,acutneighsq[i * cuda_ntypes + j]); - } - } - } - else - { - MYEMUDBG( printf("# CUDA: Cuda_NeighborBuild: cutneighsq == NULL\n"); ) - return 0; - } - - int size = 100; - if(sdata->buffersize < size) - { - MYDBG( printf("Cuda_NeighborBuild Resizing Buffer at %p with %i kB to\n", sdata->buffer, sdata->buffersize); ) - CudaWrapper_FreeCudaData(sdata->buffer,sdata->buffersize); - sdata->buffer = CudaWrapper_AllocCudaData(size); - sdata->buffersize = size; - sdata->buffer_new++; - MYDBG( printf("New buffer at %p with %i kB\n", sdata->buffer, sdata->buffersize); ) - } - - cudaMemcpyToSymbol(MY_CONST(buffer) , & sdata->buffer , sizeof(int*) ); - cudaMemcpyToSymbol(MY_CONST(cuda_ntypes) , & cuda_ntypes , sizeof(unsigned) ); - cudaMemcpyToSymbol(MY_CONST(cutneighsq) , acutneighsq , nx ); - cudaMemcpyToSymbol(MY_CONST(neighbor_maxlocal), & sneighlist->firstneigh.dim[0] , sizeof(unsigned) ); - cudaMemcpyToSymbol(MY_CONST(firstneigh) , & sneighlist->firstneigh.dev_data, sizeof(int*) ); - cudaMemcpyToSymbol(MY_CONST(ilist) , & sneighlist->ilist .dev_data, sizeof(int*) ); - cudaMemcpyToSymbol(MY_CONST(inum) , & sneighlist->inum , sizeof(int) ); - cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); - cudaMemcpyToSymbol(MY_CONST(nall) , & sdata->atom.nall , sizeof(int) ); - cudaMemcpyToSymbol(MY_CONST(nmax) , & sdata->atom.nmax , sizeof(int) ); - cudaMemcpyToSymbol(MY_CONST(numneigh) , & sneighlist->numneigh .dev_data, sizeof(int*) ); - cudaMemcpyToSymbol(MY_CONST(type) , & sdata->atom.type .dev_data, sizeof(int*) ); - cudaMemcpyToSymbol(MY_CONST(x) , & sdata->atom.x .dev_data, sizeof(X_FLOAT*) ); - cudaMemcpyToSymbol(MY_CONST(maxneighbors) , & sneighlist->maxneighbors , sizeof(int) ); - - free(acutneighsq); - } - - int3 layout = getgrid(sdata->atom.nlocal); // sneighlist->inum - dim3 threads(layout.z, 1, 1); - dim3 grid(layout.x, layout.y, 1); - - int return_value = 1; - CudaWrapper_UploadCudaData(& return_value, sdata->buffer, sizeof(int)); - - CUT_CHECK_ERROR("Cuda_NeighborBuild: pre neighbor build kernel execution failed"); - 
NeighborBuildFullNsq_Kernel<<<grid, threads>>> (); - cudaThreadSynchronize(); - CUT_CHECK_ERROR("Cuda_NeighborBuild: neighbor build kernel execution failed"); - - int buffer[20]; - CudaWrapper_DownloadCudaData(buffer, sdata->buffer, sizeof(int)*20); - MYDBG(printf("Cuda_NeighborBuildFullNSQ build neighbor list ... end\n");) - return return_value=buffer[0]; -} diff --git a/lib/cuda/neighbor_cu.h b/lib/cuda/neighbor_cu.h deleted file mode 100644 index 6ca1440de0..0000000000 --- a/lib/cuda/neighbor_cu.h +++ /dev/null @@ -1,32 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - - Original Version: - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - See the README file in the top-level LAMMPS directory. - - ----------------------------------------------------------------------- - - USER-CUDA Package and associated modifications: - https://sourceforge.net/projects/lammpscuda/ - - Christian Trott, christian.trott@tu-ilmenau.de - Lars Winterfeld, lars.winterfeld@tu-ilmenau.de - Theoretical Physics II, University of Technology Ilmenau, Germany - - See the README file in the USER-CUDA directory. - - This software is distributed under the GNU General Public License. -------------------------------------------------------------------------- */ - -#ifndef NEIGHBOR_CU_H_ -#define NEIGHBOR_CU_H_ -#include "cuda_shared.h" - -extern "C" int Cuda_BinAtoms(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist); -extern "C" int Cuda_NeighborBuildFullBin(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist); -extern "C" int Cuda_NeighborBuildFullNsq(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist); - -#endif /*NEIGHBOR_CU_H_*/ diff --git a/lib/cuda/neighbor_kernel.cu b/lib/cuda/neighbor_kernel.cu deleted file mode 100644 index 965aa2b1cf..0000000000 --- a/lib/cuda/neighbor_kernel.cu +++ /dev/null @@ -1,626 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - - Original Version: - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - See the README file in the top-level LAMMPS directory. - - ----------------------------------------------------------------------- - - USER-CUDA Package and associated modifications: - https://sourceforge.net/projects/lammpscuda/ - - Christian Trott, christian.trott@tu-ilmenau.de - Lars Winterfeld, lars.winterfeld@tu-ilmenau.de - Theoretical Physics II, University of Technology Ilmenau, Germany - - See the README file in the USER-CUDA directory. - - This software is distributed under the GNU General Public License.
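In the kernels that follow, FindSpecial rewrites each neighbor entry whose partner is a bonded (1-2/1-3/1-4) atom as j ^ (which << SBBITS), with SBBITS defined as 30, so the top two bits of the 32-bit index carry the special-bond class while the low 30 bits still hold the atom index; a `which` of less than zero removes the neighbor instead. A sketch of the matching encode/decode pair (the decode mask is the usual LAMMPS convention, assumed here rather than shown in this diff):

#include <stdio.h>

#define SBBITS 30
#define NEIGHMASK ((1u << SBBITS) - 1u)   /* low 30 bits: plain atom index */

unsigned encode_special(unsigned j, unsigned which) { return j ^ (which << SBBITS); }
unsigned sbmask(unsigned j) { return (j >> SBBITS) & 3u; } /* 1,2,3 = 1-2,1-3,1-4 */

int main(void)
{
  unsigned e = encode_special(12345u, 2u);  /* mark j as a 1-3 partner */
  printf("index=%u class=%u\n", e & NEIGHMASK, sbmask(e)); /* 12345 2 */
  return 0;
}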
-------------------------------------------------------------------------- */ - -#define SBBITS 30 - -__global__ void Binning_Kernel(int* binned_id,int bin_nmax,int bin_dim_x,int bin_dim_y,int bin_dim_z, - CUDA_FLOAT rez_bin_size_x,CUDA_FLOAT rez_bin_size_y,CUDA_FLOAT rez_bin_size_z) -{ - int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; - - /*int* bin_count=(int*) _buffer; - bin_count=bin_count+20; - CUDA_FLOAT* binned_x=(CUDA_FLOAT*)(bin_count+bin_dim_x*bin_dim_y*bin_dim_z);*/ - CUDA_FLOAT* binned_x=(CUDA_FLOAT*) _buffer; - binned_x = &binned_x[2]; - int* bin_count=(int*) &binned_x[3*bin_dim_x*bin_dim_y*bin_dim_z*bin_nmax]; - if(i < _nall) - { - // copy atom position from global device memory to local register - // in this 3 steps to get as much coalesced access as possible - X_FLOAT* my_x = _x + i; - CUDA_FLOAT x_i = *my_x; my_x += _nmax; - CUDA_FLOAT y_i = *my_x; my_x += _nmax; - CUDA_FLOAT z_i = *my_x; - - - // calculate flat bin index - int bx=__float2int_rd(rez_bin_size_x * (x_i - _sublo[0]))+2; - int by=__float2int_rd(rez_bin_size_y * (y_i - _sublo[1]))+2; - int bz=__float2int_rd(rez_bin_size_z * (z_i - _sublo[2]))+2; - - bx-=bx*negativCUDA(1.0f*bx); - bx-=(bx-bin_dim_x+1)*negativCUDA(1.0f*bin_dim_x-1.0f-1.0f*bx); - by-=by*negativCUDA(1.0f*by); - by-=(by-bin_dim_y+1)*negativCUDA(1.0f*bin_dim_y-1.0f-1.0f*by); - bz-=bz*negativCUDA(1.0f*bz); - bz-=(bz-bin_dim_z+1)*negativCUDA(1.0f*bin_dim_z-1.0f-1.0f*bz); - - - const unsigned j = bin_dim_z * ( bin_dim_y *bx+by)+bz; - - // add new atom to bin, get bin-array position - const unsigned k = atomicAdd(& bin_count[j], 1); - if(k < bin_nmax) - { - binned_id [bin_nmax * j + k] = i; - binned_x [3 * bin_nmax * j + k] = x_i; - binned_x [3 * bin_nmax * j + k + bin_nmax] = y_i; - binned_x [3 * bin_nmax * j + k + 2*bin_nmax] = z_i; - } - else - { // normally, this should not happen: - int errorn=atomicAdd((int*) _buffer, 1); - MYEMUDBG( printf("# CUDA: Binning_Kernel: WARNING: atom %i ignored, no place left in bin %u\n", i, j); ) - } - } -} - - -__device__ inline int exclusion(int &i, int &j, int &itype, int &jtype) -{ - int m; - - if (_nex_type) - if( _ex_type[itype * _cuda_ntypes + jtype]) return 1; - - if (_nex_group) { - for (m = 0; m < _nex_group; m++) { - if (_mask[i] & _ex1_bit[m] && _mask[j] & _ex2_bit[m]) return 1; - if (_mask[i] & _ex2_bit[m] && _mask[j] & _ex1_bit[m]) return 1; - } - } - - if (_nex_mol) { - if(_molecule[i] == _molecule[j]) - for (m = 0; m < _nex_mol; m++) - if (_mask[i] & _ex_mol_bit[m] && _mask[j] & _ex_mol_bit[m] ) return 1; - } - - return 0; -} - -extern __shared__ CUDA_FLOAT shared[]; - -__device__ inline int find_special(int3 &n, int* list,int & tag,int3 flag) -{ - int k=n.z; - for (int l = 0; l < n.z; l++) k = ((list[l] == tag)?l:k); - - return k < n.x ? flag.x : (k < n.y ? flag.y : (k < n.z ? flag.z : 0)); -} - -template <const int exclude> -__global__ void NeighborBuildFullBin_Kernel(int* binned_id,int bin_nmax,int bin_dim_x,int bin_dim_y,CUDA_FLOAT globcutoff,int block_style, bool neighall) -{ - int natoms = neighall?_nall:_nlocal; - //const bool domol=false; - int bin_dim_z=gridDim.y; - CUDA_FLOAT* binned_x=(CUDA_FLOAT*) _buffer; - binned_x = &binned_x[2]; - int* bin_count=(int*) &binned_x[3*bin_dim_x*bin_dim_y*bin_dim_z*bin_nmax]; - int bin = __mul24(gridDim.y,blockIdx.x)+blockIdx.y; - int bin_x = blockIdx.x/bin_dim_y; - int bin_y = blockIdx.x-bin_x*bin_dim_y; - int bin_z = blockIdx.y; - int bin_c = bin_count[bin]; - - - CUDA_FLOAT cut; - if(globcutoff>0) - cut = globcutoff; - - int i=_nall; - CUDA_FLOAT* my_x; - CUDA_FLOAT x_i,y_i,z_i; - - for(int actOffset=0; 
actOffset=bin_dim_x||obin_y>=bin_dim_y||obin_z>=bin_dim_z) continue; - int other_bin=bin_dim_z * ( bin_dim_y * obin_x + obin_y) + obin_z; - if(other_bin==bin) continue; - - int obin_c=bin_count[other_bin]; - - for(int otherActOffset=0; otherActOffset _maxneighbors) ((int*)_buffer)[0] = -jnum; - - if(i=_nlocal) return; - int special_id[CUDA_MAX_NSPECIAL]; - - int i = _ilist[ii]; - if(i>=_nlocal) return; - int jnum = _numneigh[i]; - if (_special_flag[1] == 0) spec_flag.x = -1; - else if (_special_flag[1] == 1) spec_flag.x = 0; - else spec_flag.x = 1; - - if (_special_flag[2] == 0) spec_flag.y = -1; - else if (_special_flag[2] == 1) spec_flag.y = 0; - else spec_flag.y = 2; - - if (_special_flag[3] == 0) spec_flag.z = -1; - else if (_special_flag[3] == 1) spec_flag.z = 0; - else spec_flag.z = 3; - - mynspecial.x=_nspecial[i]; - mynspecial.y=_nspecial[i+_nmax]; - mynspecial.z=_nspecial[i+2*_nmax]; - - if(i<_nlocal) - { - int* list = &_special[i]; - for(int k=0;k0) - { - if(block_style) - _neighbors[i*_maxneighbors+k]=j ^ (which << SBBITS); - else - _neighbors[i+k*_nlocal]=j ^ (which << SBBITS); - } - else if(which<0) - { - if(block_style) - _neighbors[i*_maxneighbors+k]=_neighbors[i*_maxneighbors+jnum-1]; - else - _neighbors[i+k*_nlocal]=_neighbors[i+(jnum-1)*_nlocal]; - jnum--; - k--; - } - } - } - _numneigh[i]=jnum; -} - -__global__ void NeighborBuildFullBin_OverlapComm_Kernel(int* binned_id,int bin_nmax,int bin_dim_x,int bin_dim_y,CUDA_FLOAT globcutoff,int block_style) -{ - int bin_dim_z=gridDim.y; - CUDA_FLOAT* binned_x=(CUDA_FLOAT*) _buffer; - binned_x = &binned_x[2]; - int* bin_count=(int*) &binned_x[3*bin_dim_x*bin_dim_y*bin_dim_z*bin_nmax]; - int bin = __mul24(gridDim.y,blockIdx.x)+blockIdx.y; - int bin_x = blockIdx.x/bin_dim_y; - int bin_y = blockIdx.x-bin_x*bin_dim_y; - int bin_z = blockIdx.y; - int bin_c = bin_count[bin]; - - - CUDA_FLOAT cut; - if(globcutoff>0) - cut = globcutoff; - - int i=_nall; - CUDA_FLOAT* my_x; - CUDA_FLOAT x_i,y_i,z_i; - - for(int actOffset=0; actOffset=_nlocal)&&(i_border<0)) - i_border=atomicAdd(_inum_border,1); - - if(jnum<_maxneighbors) - { - if(block_style) - { - _neighbors[i*_maxneighbors+jnum]= j; - if(j>=_nlocal) - {_neighbors_border[i_border*_maxneighbors+jnum_border]=j;} - else - {_neighbors_inner[i*_maxneighbors+jnum_inner]=j;} - } - else - { - _neighbors[i+jnum*_nlocal]=j; - if(j>=_nlocal) - {_neighbors_border[i_border+jnum_border*_nlocal]=j;} - else - {_neighbors_inner[i+jnum_inner*_nlocal]=j;} - } - } - ++jnum; - if(j>=_nlocal) - jnum_border++; - else - jnum_inner++; - } - } - } - __syncthreads(); - } - for(int obin_x=bin_x-1;obin_x=bin_dim_x||obin_y>=bin_dim_y||obin_z>=bin_dim_z) continue; - int other_bin=bin_dim_z * ( bin_dim_y * obin_x + obin_y) + obin_z; - if(other_bin==bin) continue; - - int obin_c=bin_count[other_bin]; - - for(int otherActOffset=0; otherActOffset=_nlocal)&&(i_border<0)) - i_border=atomicAdd(_inum_border,1); - if(jnum<_maxneighbors) - { - if(block_style) - { - _neighbors[i*_maxneighbors+jnum]= j; - if(j>=_nlocal) - {_neighbors_border[i_border*_maxneighbors+jnum_border]=j;} - else - {_neighbors_inner[i*_maxneighbors+jnum_inner]=j;} - } - else - { - _neighbors[i+jnum*_nlocal]=j; - if(j>=_nlocal) - {_neighbors_border[i_border+jnum_border*_nlocal]=j;} - else - {_neighbors_inner[i+jnum_inner*_nlocal]=j;} - } - } - ++jnum; - if(j>=_nlocal) - jnum_border++; - else - jnum_inner++; - } - } - } - __syncthreads(); - } - } - - if(jnum > _maxneighbors) ((int*)_buffer)[0] = -jnum; - - if(i<_nlocal) - { - _numneigh[i] = jnum; - 
_numneigh_inner[i] = jnum_inner; - if(i_border>=0) _numneigh_border[i_border] = jnum_border; - if(i_border>=0) _ilist_border[i_border] = i; - - } - } -} - -__global__ void NeighborBuildFullNsq_Kernel() -{ - int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; - int* buffer = (int*) _buffer; - - if(i < _nlocal) - { - X_FLOAT* my_x = _x + i; - CUDA_FLOAT x_i = *my_x; my_x += _nmax; - CUDA_FLOAT y_i = *my_x; my_x += _nmax; - CUDA_FLOAT z_i = *my_x; - int jnum = 0; - int* jlist = _firstneigh[i]; - _ilist[i]=i; - - int itype = _type[i]; - __syncthreads(); - for(int j = 0; j < _nall; ++j) - { - my_x = _x + j; - CUDA_FLOAT x_j = *my_x; my_x += _nmax; - CUDA_FLOAT y_j = *my_x; my_x += _nmax; - CUDA_FLOAT z_j = *my_x; - CUDA_FLOAT delx = x_i - x_j; - CUDA_FLOAT dely = y_i - y_j; - CUDA_FLOAT delz = z_i - z_j; - CUDA_FLOAT rsq = delx*delx + dely*dely + delz*delz; - int jtype = _type[j]; - if(rsq <= _cutneighsq[itype * _cuda_ntypes + jtype] && i != j) - { - if(jnum<_maxneighbors) - jlist[jnum] = j; - if(i==151) ((int*)_buffer)[jnum+2]=j; - ++jnum; - } - __syncthreads(); - } - if(jnum > _maxneighbors) buffer[0] = 0; - _numneigh[i] = jnum; - if(i==151) ((int*)_buffer)[1]=jnum; - } -} - diff --git a/lib/cuda/pair_born_coul_long_cuda.cu b/lib/cuda/pair_born_coul_long_cuda.cu deleted file mode 100644 index 913d5eb2c5..0000000000 --- a/lib/cuda/pair_born_coul_long_cuda.cu +++ /dev/null @@ -1,78 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - - Original Version: - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - See the README file in the top-level LAMMPS directory. - - ----------------------------------------------------------------------- - - USER-CUDA Package and associated modifications: - https://sourceforge.net/projects/lammpscuda/ - - Christian Trott, christian.trott@tu-ilmenau.de - Lars Winterfeld, lars.winterfeld@tu-ilmenau.de - Theoretical Physics II, University of Technology Ilmenau, Germany - - See the README file in the USER-CUDA directory. - - This software is distributed under the GNU General Public License. -------------------------------------------------------------------------- */ - -#include <stdio.h> - -#define _rhoinv MY_AP(coeff1) -#define _sigma MY_AP(coeff2) -#define _a MY_AP(coeff3) -#define _c MY_AP(coeff4) -#define _d MY_AP(coeff5) - -#include "pair_born_coul_long_cuda_cu.h" -#include "pair_born_coul_long_cuda_kernel_nc.cu" - -#include <time.h> - -void Cuda_PairBornCoulLongCuda_Init(cuda_shared_data* sdata) -{ - Cuda_Pair_Init_AllStyles(sdata, 5,true); -} - -void Cuda_PairBornCoulLongCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,int eflag_atom,int vflag_atom) -{ - - - static short init=0; - if(!
init) - { - init = 1; - Cuda_PairBornCoulLongCuda_Init(sdata); - } - - dim3 grid,threads; - int sharedperproc; - - - Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc,true,192); - - cudaStream_t* streams = (cudaStream_t*) CudaWrapper_returnStreams(); - - if(sdata->pair.use_block_per_atom) - Pair_Kernel_BpA<PAIR_BORN,COUL_LONG,DATA_NONE> - <<<grid, threads,sharedperproc*sizeof(ENERGY_FLOAT)*threads.x,streams[1]>>> (eflag, vflag,eflag_atom,vflag_atom); - else - Pair_Kernel_TpA<PAIR_BORN,COUL_LONG,DATA_NONE> - <<<grid, threads,sharedperproc*sizeof(ENERGY_FLOAT)*threads.x,streams[1]>>> (eflag, vflag,eflag_atom,vflag_atom); - - Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag); -} - - - -#undef _rhoinv -#undef _sigma -#undef _a -#undef _c -#undef _d - diff --git a/lib/cuda/pair_born_coul_long_cuda_cu.h b/lib/cuda/pair_born_coul_long_cuda_cu.h deleted file mode 100644 index e47968d0f9..0000000000 --- a/lib/cuda/pair_born_coul_long_cuda_cu.h +++ /dev/null @@ -1,30 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - - Original Version: - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - See the README file in the top-level LAMMPS directory. - - ----------------------------------------------------------------------- - - USER-CUDA Package and associated modifications: - https://sourceforge.net/projects/lammpscuda/ - - Christian Trott, christian.trott@tu-ilmenau.de - Lars Winterfeld, lars.winterfeld@tu-ilmenau.de - Theoretical Physics II, University of Technology Ilmenau, Germany - - See the README file in the USER-CUDA directory. - - This software is distributed under the GNU General Public License. -------------------------------------------------------------------------- */ - -#include "cuda_shared.h" - -#ifdef CUDA_USE_BINNING -extern "C" void Cuda_PairBornCoulLongCuda(cuda_shared_data* sdata, int eflag, int vflag); -#else -extern "C" void Cuda_PairBornCoulLongCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,int eflag_atom,int vflag_atom); -#endif diff --git a/lib/cuda/pair_born_coul_long_cuda_kernel_nc.cu b/lib/cuda/pair_born_coul_long_cuda_kernel_nc.cu deleted file mode 100644 index 651326cb60..0000000000 --- a/lib/cuda/pair_born_coul_long_cuda_kernel_nc.cu +++ /dev/null @@ -1,34 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - - Original Version: - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - See the README file in the top-level LAMMPS directory. - - ----------------------------------------------------------------------- - - USER-CUDA Package and associated modifications: - https://sourceforge.net/projects/lammpscuda/ - - Christian Trott, christian.trott@tu-ilmenau.de - Lars Winterfeld, lars.winterfeld@tu-ilmenau.de - Theoretical Physics II, University of Technology Ilmenau, Germany - - See the README file in the USER-CUDA directory. - - This software is distributed under the GNU General Public License.
-------------------------------------------------------------------------- */ -__device__ inline F_FLOAT PairBornCuda_Eval(const F_FLOAT& rsq,const int ij_type,F_FLOAT& factor_lj,int& eflag, ENERGY_FLOAT& evdwl) -{ - const F_FLOAT r2inv = F_F(1.0)/rsq; - const F_FLOAT r = _RSQRT_(r2inv); - const F_FLOAT r6inv = r2inv*r2inv*r2inv; - const F_FLOAT rexp = _EXP_((_sigma[ij_type]-r)*_rhoinv[ij_type]); - const F_FLOAT forceborn = _a[ij_type]*_rhoinv[ij_type]*r*rexp - - F_F(6.0)*_c[ij_type]*r6inv + F_F(8.0)*_d[ij_type]*r2inv*r6inv; - if(eflag) evdwl += factor_lj*(_a[ij_type]*rexp - _c[ij_type]*r6inv - +_d[ij_type]*r2inv*r6inv-_offset[ij_type]); - return factor_lj*forceborn*r2inv; -} diff --git a/lib/cuda/pair_buck_coul_cut_cuda.cu b/lib/cuda/pair_buck_coul_cut_cuda.cu deleted file mode 100644 index b20de75efb..0000000000 --- a/lib/cuda/pair_buck_coul_cut_cuda.cu +++ /dev/null @@ -1,74 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - - Original Version: - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - See the README file in the top-level LAMMPS directory. - - ----------------------------------------------------------------------- - - USER-CUDA Package and associated modifications: - https://sourceforge.net/projects/lammpscuda/ - - Christian Trott, christian.trott@tu-ilmenau.de - Lars Winterfeld, lars.winterfeld@tu-ilmenau.de - Theoretical Physics II, University of Technology Ilmenau, Germany - - See the README file in the USER-CUDA directory. - - This software is distributed under the GNU General Public License. -------------------------------------------------------------------------- */ - -#include - -#define _rhoinv MY_AP(coeff1) -#define _buck1 MY_AP(coeff2) -#define _buck2 MY_AP(coeff3) -#define _a MY_AP(coeff4) -#define _c MY_AP(coeff5) - -#include "pair_buck_coul_cut_cuda_cu.h" - -#include -void Cuda_PairBuckCoulCutCuda_Init(cuda_shared_data* sdata) -{ - Cuda_Pair_Init_AllStyles(sdata, 5,true); -} - -void Cuda_PairBuckCoulCutCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,int eflag_atom,int vflag_atom) -{ - - - static short init=0; - if(! init) - { - init = 1; - Cuda_PairBuckCoulCutCuda_Init(sdata); - } - - dim3 grid,threads; - int sharedperproc; - - - Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc,true,192); - - cudaStream_t* streams = (cudaStream_t*) CudaWrapper_returnStreams(); - if(sdata->pair.use_block_per_atom) - Pair_Kernel_BpA - <<>> (eflag, vflag,eflag_atom,vflag_atom); - else - Pair_Kernel_TpA - <<>> (eflag, vflag,eflag_atom,vflag_atom); - - Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag); -} - - -#undef _rhoinv -#undef _buck1 -#undef _buck2 -#undef _a -#undef _c - diff --git a/lib/cuda/pair_buck_coul_cut_cuda_cu.h b/lib/cuda/pair_buck_coul_cut_cuda_cu.h deleted file mode 100644 index 1a2576ccae..0000000000 --- a/lib/cuda/pair_buck_coul_cut_cuda_cu.h +++ /dev/null @@ -1,30 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - - Original Version: - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - See the README file in the top-level LAMMPS directory. 
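PairBornCuda_Eval above returns F/r for the Born-Mayer-Huggins potential, so the caller can multiply directly by the component differences delx, dely, delz. Writing out the derivative (the standard form of the born pair style; the algebra below is a consistency check, not additional source material):

E(r) = A e^{(\sigma - r)/\rho} - \frac{C}{r^6} + \frac{D}{r^8},
\qquad
-\frac{1}{r}\frac{dE}{dr} = \frac{A}{\rho}\,\frac{e^{(\sigma - r)/\rho}}{r} - \frac{6C}{r^8} + \frac{8D}{r^{10}}

which is exactly factor_lj*forceborn*r2inv with forceborn = _a*_rhoinv*r*rexp - 6*_c*r6inv + 8*_d*r2inv*r6inv: multiplying by r2inv = 1/r^2 turns the r*rexp term into rexp/r, r6inv into 1/r^8, and r2inv*r6inv into 1/r^10.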
- - ----------------------------------------------------------------------- - - USER-CUDA Package and associated modifications: - https://sourceforge.net/projects/lammpscuda/ - - Christian Trott, christian.trott@tu-ilmenau.de - Lars Winterfeld, lars.winterfeld@tu-ilmenau.de - Theoretical Physics II, University of Technology Ilmenau, Germany - - See the README file in the USER-CUDA directory. - - This software is distributed under the GNU General Public License. -------------------------------------------------------------------------- */ - -#include <stdio.h> - -#define _rhoinv MY_AP(coeff1) -#define _buck1 MY_AP(coeff2) -#define _buck2 MY_AP(coeff3) -#define _a MY_AP(coeff4) -#define _c MY_AP(coeff5) - -#include "pair_buck_coul_cut_cuda_cu.h" - -#include <time.h> -void Cuda_PairBuckCoulCutCuda_Init(cuda_shared_data* sdata) -{ - Cuda_Pair_Init_AllStyles(sdata, 5,true); -} - -void Cuda_PairBuckCoulCutCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,int eflag_atom,int vflag_atom) -{ - - - static short init=0; - if(! init) - { - init = 1; - Cuda_PairBuckCoulCutCuda_Init(sdata); - } - - dim3 grid,threads; - int sharedperproc; - - - Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc,true,192); - - cudaStream_t* streams = (cudaStream_t*) CudaWrapper_returnStreams(); - if(sdata->pair.use_block_per_atom) - Pair_Kernel_BpA<PAIR_BUCK,COUL_CUT,DATA_NONE> - <<<grid, threads,sharedperproc*sizeof(ENERGY_FLOAT)*threads.x,streams[1]>>> (eflag, vflag,eflag_atom,vflag_atom); - else - Pair_Kernel_TpA<PAIR_BUCK,COUL_CUT,DATA_NONE> - <<<grid, threads,sharedperproc*sizeof(ENERGY_FLOAT)*threads.x,streams[1]>>> (eflag, vflag,eflag_atom,vflag_atom); - - Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag); -} - - -#undef _rhoinv -#undef _buck1 -#undef _buck2 -#undef _a -#undef _c - diff --git a/lib/cuda/pair_buck_coul_cut_cuda_cu.h b/lib/cuda/pair_buck_coul_cut_cuda_cu.h deleted file mode 100644 index 1a2576ccae..0000000000 --- a/lib/cuda/pair_buck_coul_cut_cuda_cu.h +++ /dev/null @@ -1,30 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - - Original Version: - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - See the README file in the top-level LAMMPS directory. - - ----------------------------------------------------------------------- - - USER-CUDA Package and associated modifications: - https://sourceforge.net/projects/lammpscuda/ - - Christian Trott, christian.trott@tu-ilmenau.de - Lars Winterfeld, lars.winterfeld@tu-ilmenau.de - Theoretical Physics II, University of Technology Ilmenau, Germany - - See the README file in the USER-CUDA directory. - - This software is distributed under the GNU General Public License. -------------------------------------------------------------------------- */ - -#include "cuda_shared.h" - -#ifdef CUDA_USE_BINNING -extern "C" void Cuda_PairBuckCoulCutCuda(cuda_shared_data* sdata, int eflag, int vflag); -#else -extern "C" void Cuda_PairBuckCoulCutCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,int eflag_atom,int vflag_atom); -#endif diff --git a/lib/cuda/pair_buck_coul_long_cuda.cu b/lib/cuda/pair_buck_coul_long_cuda.cu deleted file mode 100644 index 70e53edf08..0000000000 --- a/lib/cuda/pair_buck_coul_long_cuda.cu +++ /dev/null @@ -1,77 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - - Original Version: - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - See the README file in the top-level LAMMPS directory. - - ----------------------------------------------------------------------- - - USER-CUDA Package and associated modifications: - https://sourceforge.net/projects/lammpscuda/ - - Christian Trott, christian.trott@tu-ilmenau.de - Lars Winterfeld, lars.winterfeld@tu-ilmenau.de - Theoretical Physics II, University of Technology Ilmenau, Germany - - See the README file in the USER-CUDA directory. - - This software is distributed under the GNU General Public License. -------------------------------------------------------------------------- */ - -#include <stdio.h> - -#define _rhoinv MY_AP(coeff1) -#define _buck1 MY_AP(coeff2) -#define _buck2 MY_AP(coeff3) -#define _a MY_AP(coeff4) -#define _c MY_AP(coeff5) - -#include "pair_buck_coul_long_cuda_cu.h" - -#include <time.h> - -void Cuda_PairBuckCoulLongCuda_Init(cuda_shared_data* sdata) -{ - Cuda_Pair_Init_AllStyles(sdata, 5,true); -} - -void Cuda_PairBuckCoulLongCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,int eflag_atom,int vflag_atom) -{ - - - static short init=0; - if(!
init) - { - init = 1; - Cuda_PairBuckCoulLongCuda_Init(sdata); - } - - dim3 grid,threads; - int sharedperproc; - - - Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc,true,192); - - cudaStream_t* streams = (cudaStream_t*) CudaWrapper_returnStreams(); - if(sdata->pair.use_block_per_atom) - Pair_Kernel_BpA<PAIR_BUCK,COUL_LONG,DATA_NONE> - <<<grid, threads,sharedperproc*sizeof(ENERGY_FLOAT)*threads.x,streams[1]>>> (eflag, vflag,eflag_atom,vflag_atom); - else - Pair_Kernel_TpA<PAIR_BUCK,COUL_LONG,DATA_NONE> - <<<grid, threads,sharedperproc*sizeof(ENERGY_FLOAT)*threads.x,streams[1]>>> (eflag, vflag,eflag_atom,vflag_atom); - - Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag); -} - - - - -#undef _rhoinv -#undef _buck1 -#undef _buck2 -#undef _a -#undef _c - diff --git a/lib/cuda/pair_buck_coul_long_cuda_cu.h b/lib/cuda/pair_buck_coul_long_cuda_cu.h deleted file mode 100644 index 77cbb4c07f..0000000000 --- a/lib/cuda/pair_buck_coul_long_cuda_cu.h +++ /dev/null @@ -1,30 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - - Original Version: - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - See the README file in the top-level LAMMPS directory. - - ----------------------------------------------------------------------- - - USER-CUDA Package and associated modifications: - https://sourceforge.net/projects/lammpscuda/ - - Christian Trott, christian.trott@tu-ilmenau.de - Lars Winterfeld, lars.winterfeld@tu-ilmenau.de - Theoretical Physics II, University of Technology Ilmenau, Germany - - See the README file in the USER-CUDA directory. - - This software is distributed under the GNU General Public License. -------------------------------------------------------------------------- */ - -#include "cuda_shared.h" - -#ifdef CUDA_USE_BINNING -extern "C" void Cuda_PairBuckCoulLongCuda(cuda_shared_data* sdata, int eflag, int vflag); -#else -extern "C" void Cuda_PairBuckCoulLongCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,int eflag_atom,int vflag_atom); -#endif diff --git a/lib/cuda/pair_buck_cuda.cu b/lib/cuda/pair_buck_cuda.cu deleted file mode 100644 index c14abc0067..0000000000 --- a/lib/cuda/pair_buck_cuda.cu +++ /dev/null @@ -1,76 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - - Original Version: - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - See the README file in the top-level LAMMPS directory. - - ----------------------------------------------------------------------- - - USER-CUDA Package and associated modifications: - https://sourceforge.net/projects/lammpscuda/ - - Christian Trott, christian.trott@tu-ilmenau.de - Lars Winterfeld, lars.winterfeld@tu-ilmenau.de - Theoretical Physics II, University of Technology Ilmenau, Germany - - See the README file in the USER-CUDA directory. - - This software is distributed under the GNU General Public License.
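All of these wrappers pick between two work distributions exposed by the shared pair framework: Pair_Kernel_TpA (one thread per atom walks that atom's whole neighbor list) and Pair_Kernel_BpA (one block per atom, with the neighbor loop strided across the block and partial forces combined in shared memory), selected at run time through sdata->pair.use_block_per_atom. The framework itself is not part of this hunk, so the following is only a schematic of the two mappings; every name and the reduction are illustrative assumptions:

#include <cuda_runtime.h>

// Thread-per-atom: thread i owns atom i and its full neighbor list.
__global__ void force_tpa(const int* numneigh, const int* neighbors, int maxneigh,
                          const double* x, double* f, int nlocal)
{
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i >= nlocal) return;
  double fi = 0.0;
  for (int k = 0; k < numneigh[i]; k++) {
    int j = neighbors[i * maxneigh + k];
    fi += x[i] - x[j];                    // stand-in for a real pair force
  }
  f[i] = fi;
}

// Block-per-atom: block i owns atom i; threads stride the neighbor list and
// combine partial sums with a shared-memory tree reduction (blockDim.x must
// be a power of two; launch as force_bpa<<<nlocal, 128, 128*sizeof(double)>>>).
__global__ void force_bpa(const int* numneigh, const int* neighbors, int maxneigh,
                          const double* x, double* f)
{
  extern __shared__ double part[];
  int i = blockIdx.x;
  double fi = 0.0;
  for (int k = threadIdx.x; k < numneigh[i]; k += blockDim.x) {
    int j = neighbors[i * maxneigh + k];
    fi += x[i] - x[j];
  }
  part[threadIdx.x] = fi;
  __syncthreads();
  for (int s = blockDim.x / 2; s > 0; s >>= 1) {
    if (threadIdx.x < s) part[threadIdx.x] += part[threadIdx.x + s];
    __syncthreads();
  }
  if (threadIdx.x == 0) f[i] = part[0];
}

BpA pays one reduction per atom but keeps all threads of a warp busy on long neighbor lists; TpA is cheaper for short lists, which is presumably why the choice is left to a per-run flag rather than fixed at compile time.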
-------------------------------------------------------------------------- */ - -#include <stdio.h> - -#define _rhoinv MY_AP(coeff1) -#define _buck1 MY_AP(coeff2) -#define _buck2 MY_AP(coeff3) -#define _a MY_AP(coeff4) -#define _c MY_AP(coeff5) - -#include "pair_buck_cuda_cu.h" -#include "pair_buck_cuda_kernel_nc.cu" - -#include <time.h> - -void Cuda_PairBuckCuda_Init(cuda_shared_data* sdata) -{ - Cuda_Pair_Init_AllStyles(sdata, 5); -} - -void Cuda_PairBuckCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,int eflag_atom,int vflag_atom) -{ - - - static short init=0; - if(! init) - { - init = 1; - Cuda_PairBuckCuda_Init(sdata); - } - - dim3 grid,threads; - int sharedperproc; - - - Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc,false,192); - - cudaStream_t* streams = (cudaStream_t*) CudaWrapper_returnStreams(); - if(sdata->pair.use_block_per_atom) - Pair_Kernel_BpA<PAIR_BUCK,COUL_NONE,DATA_NONE> - <<<grid, threads,sharedperproc*sizeof(ENERGY_FLOAT)*threads.x,streams[1]>>> (eflag, vflag,eflag_atom,vflag_atom); - else - Pair_Kernel_TpA<PAIR_BUCK,COUL_NONE,DATA_NONE> - <<<grid, threads,sharedperproc*sizeof(ENERGY_FLOAT)*threads.x,streams[1]>>> (eflag, vflag,eflag_atom,vflag_atom); - - Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag); -} - - -#undef _rhoinv -#undef _buck1 -#undef _buck2 -#undef _a -#undef _c - diff --git a/lib/cuda/pair_buck_cuda_cu.h b/lib/cuda/pair_buck_cuda_cu.h deleted file mode 100644 index 92b6350d9f..0000000000 --- a/lib/cuda/pair_buck_cuda_cu.h +++ /dev/null @@ -1,30 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - - Original Version: - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - See the README file in the top-level LAMMPS directory. - - ----------------------------------------------------------------------- - - USER-CUDA Package and associated modifications: - https://sourceforge.net/projects/lammpscuda/ - - Christian Trott, christian.trott@tu-ilmenau.de - Lars Winterfeld, lars.winterfeld@tu-ilmenau.de - Theoretical Physics II, University of Technology Ilmenau, Germany - - See the README file in the USER-CUDA directory. - - This software is distributed under the GNU General Public License. -------------------------------------------------------------------------- */ - -#include "cuda_shared.h" - -#ifdef CUDA_USE_BINNING -extern "C" void Cuda_PairBuckCuda(cuda_shared_data* sdata, int eflag, int vflag); -#else -extern "C" void Cuda_PairBuckCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,int eflag_atom,int vflag_atom); -#endif diff --git a/lib/cuda/pair_buck_cuda_kernel_nc.cu b/lib/cuda/pair_buck_cuda_kernel_nc.cu deleted file mode 100644 index 3ec40a26f8..0000000000 --- a/lib/cuda/pair_buck_cuda_kernel_nc.cu +++ /dev/null @@ -1,33 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - - Original Version: - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - See the README file in the top-level LAMMPS directory. - - ----------------------------------------------------------------------- - - USER-CUDA Package and associated modifications: - https://sourceforge.net/projects/lammpscuda/ - - Christian Trott, christian.trott@tu-ilmenau.de - Lars Winterfeld, lars.winterfeld@tu-ilmenau.de - Theoretical Physics II, University of Technology Ilmenau, Germany - - See the README file in the USER-CUDA directory.
-
-   This software is distributed under the GNU General Public License.
-------------------------------------------------------------------------- */
-__device__ inline F_FLOAT PairBuckCuda_Eval(const F_FLOAT& rsq,const int ij_type,F_FLOAT& factor_lj,int& eflag, ENERGY_FLOAT& evdwl)
-{
-  const F_FLOAT r2inv = F_F(1.0)/rsq;
-  const F_FLOAT r6inv = r2inv*r2inv*r2inv;
-  const F_FLOAT r = _RSQRT_(r2inv);
-  const F_FLOAT rexp = _EXP_(-r*_rhoinv[ij_type]);
-  const F_FLOAT forcebuck = _buck1[ij_type]*r*rexp - _buck2[ij_type]*r6inv;
-  if(eflag) evdwl += factor_lj*(_a[ij_type]*rexp - _c[ij_type]*r6inv -
-                                _offset[ij_type]);
-  return (factor_lj*forcebuck) * r2inv;
-}
diff --git a/lib/cuda/pair_cg_cmm_coul_cut_cuda.cu b/lib/cuda/pair_cg_cmm_coul_cut_cuda.cu
deleted file mode 100644
index 1f780674c1..0000000000
--- a/lib/cuda/pair_cg_cmm_coul_cut_cuda.cu
+++ /dev/null
@@ -1,80 +0,0 @@
-/* ----------------------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-
-   Original Version:
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   See the README file in the top-level LAMMPS directory.
-
-   -----------------------------------------------------------------------
-
-   USER-CUDA Package and associated modifications:
-   https://sourceforge.net/projects/lammpscuda/
-
-   Christian Trott, christian.trott@tu-ilmenau.de
-   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
-   Theoretical Physics II, University of Technology Ilmenau, Germany
-
-   See the README file in the USER-CUDA directory.
-
-   This software is distributed under the GNU General Public License.
-------------------------------------------------------------------------- */
-
-#include <stdio.h>
-
-#define _lj1 MY_AP(coeff1)
-#define _lj2 MY_AP(coeff2)
-#define _lj3 MY_AP(coeff3)
-#define _lj4 MY_AP(coeff4)
-#define _cg_type MY_AP(coeff5)
-
-
-#include "pair_cg_cmm_coul_cut_cuda_cu.h"
-#include <time.h>
-
-
-
-
-void Cuda_PairCGCMMCoulCutCuda_Init(cuda_shared_data* sdata)
-{
-  Cuda_Pair_Init_AllStyles(sdata, 5, true, false );
-
-}
-
-
-
-
-void Cuda_PairCGCMMCoulCutCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,int eflag_atom,int vflag_atom)
-{
-
-  // initialize only on first call
-  static short init=0;
-  if(! init)
-  {
-    init = 1;
-    Cuda_PairCGCMMCoulCutCuda_Init(sdata);
-  }
-
-  dim3 grid,threads;
-  int sharedperproc;
-
-  Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc,true,128);
-
-  cudaStream_t* streams = (cudaStream_t*) CudaWrapper_returnStreams();
-  if(sdata->pair.use_block_per_atom)
-    Pair_Kernel_BpA<PAIR_CG_CMM,COUL_CUT,DATA_NONE>
-    <<<grid, threads.x, sharedperproc*sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag,eflag_atom,vflag_atom);
-  else
-    Pair_Kernel_TpA<PAIR_CG_CMM,COUL_CUT,DATA_NONE>
-    <<<grid, threads, sharedperproc*sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag,eflag_atom,vflag_atom);
-
-  Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag);
-}
-
-#undef _lj1
-#undef _lj2
-#undef _lj3
-#undef _lj4
-#undef _cg_type
-
diff --git a/lib/cuda/pair_cg_cmm_coul_cut_cuda_cu.h b/lib/cuda/pair_cg_cmm_coul_cut_cuda_cu.h
deleted file mode 100644
index 00eb4c983c..0000000000
--- a/lib/cuda/pair_cg_cmm_coul_cut_cuda_cu.h
+++ /dev/null
@@ -1,26 +0,0 @@
-/* ----------------------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-
-   Original Version:
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   See the README file in the top-level LAMMPS directory.
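PairBuckCuda_Eval above folds the Buckingham potential E(r) = A exp(-r/rho) - C/r^6 into precomputed per-type-pair coefficients, with coeff1 = 1/rho, coeff2 = A/rho ("buck1"), and coeff3 = 6C ("buck2"), the same mapping the CPU pair_buck style uses. A standalone scalar cross-check of that algebra (test values arbitrary; this is a verification sketch, not the kernel):

// Checks that (buck1*r*exp(-r/rho) - buck2/r^6)/r^2 equals -dE/dr * (1/r).
#include <cmath>
#include <cstdio>

int main()
{
  const double A = 1000.0, rho = 0.3, C = 1.5, r = 1.1;
  const double buck1 = A / rho, buck2 = 6.0 * C;

  // Direct derivative of E = A*exp(-r/rho) - C/r^6
  const double dEdr = -(A / rho) * std::exp(-r / rho) + 6.0 * C * std::pow(r, -7.0);
  const double fpair_ref = -dEdr / r;

  // The kernel's factored form
  const double r2inv = 1.0 / (r * r), r6inv = r2inv * r2inv * r2inv;
  const double fpair_gpu = (buck1 * r * std::exp(-r / rho) - buck2 * r6inv) * r2inv;

  std::printf("%.12e %.12e\n", fpair_ref, fpair_gpu);   // should agree to roundoff
  return 0;
}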
- - ----------------------------------------------------------------------- - - USER-CUDA Package and associated modifications: - https://sourceforge.net/projects/lammpscuda/ - - Christian Trott, christian.trott@tu-ilmenau.de - Lars Winterfeld, lars.winterfeld@tu-ilmenau.de - Theoretical Physics II, University of Technology Ilmenau, Germany - - See the README file in the USER-CUDA directory. - - This software is distributed under the GNU General Public License. -------------------------------------------------------------------------- */ - -#include "cuda_shared.h" - -extern "C" void Cuda_PairCGCMMCoulCutCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,int eflag_atom,int vflag_atom); diff --git a/lib/cuda/pair_cg_cmm_coul_debye_cuda.cu b/lib/cuda/pair_cg_cmm_coul_debye_cuda.cu deleted file mode 100644 index ead0fc9832..0000000000 --- a/lib/cuda/pair_cg_cmm_coul_debye_cuda.cu +++ /dev/null @@ -1,80 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - - Original Version: - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - See the README file in the top-level LAMMPS directory. - - ----------------------------------------------------------------------- - - USER-CUDA Package and associated modifications: - https://sourceforge.net/projects/lammpscuda/ - - Christian Trott, christian.trott@tu-ilmenau.de - Lars Winterfeld, lars.winterfeld@tu-ilmenau.de - Theoretical Physics II, University of Technology Ilmenau, Germany - - See the README file in the USER-CUDA directory. - - This software is distributed under the GNU General Public License. -------------------------------------------------------------------------- */ - -#include - -#define _lj1 MY_AP(coeff1) -#define _lj2 MY_AP(coeff2) -#define _lj3 MY_AP(coeff3) -#define _lj4 MY_AP(coeff4) -#define _cg_type MY_AP(coeff5) - - -#include "pair_cg_cmm_coul_debye_cuda_cu.h" -#include - - - - -void Cuda_PairCGCMMCoulDebyeCuda_Init(cuda_shared_data* sdata) -{ - Cuda_Pair_Init_AllStyles(sdata, 5, true, false ); - -} - - - - -void Cuda_PairCGCMMCoulDebyeCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,int eflag_atom,int vflag_atom) -{ - - // initialize only on first call - static short init=0; - if(! 
init) - { - init = 1; - Cuda_PairCGCMMCoulDebyeCuda_Init(sdata); - } - - dim3 grid,threads; - int sharedperproc; - - Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc,true,128); - - cudaStream_t* streams = (cudaStream_t*) CudaWrapper_returnStreams(); - if(sdata->pair.use_block_per_atom) - Pair_Kernel_BpA - <<>> (eflag, vflag,eflag_atom,vflag_atom); - else - Pair_Kernel_TpA - <<>> (eflag, vflag,eflag_atom,vflag_atom); - - Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag); -} - -#undef _lj1 -#undef _lj2 -#undef _lj3 -#undef _lj4 -#undef _cg_type - diff --git a/lib/cuda/pair_cg_cmm_coul_debye_cuda_cu.h b/lib/cuda/pair_cg_cmm_coul_debye_cuda_cu.h deleted file mode 100644 index 5b8bab44c5..0000000000 --- a/lib/cuda/pair_cg_cmm_coul_debye_cuda_cu.h +++ /dev/null @@ -1,26 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - - Original Version: - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - See the README file in the top-level LAMMPS directory. - - ----------------------------------------------------------------------- - - USER-CUDA Package and associated modifications: - https://sourceforge.net/projects/lammpscuda/ - - Christian Trott, christian.trott@tu-ilmenau.de - Lars Winterfeld, lars.winterfeld@tu-ilmenau.de - Theoretical Physics II, University of Technology Ilmenau, Germany - - See the README file in the USER-CUDA directory. - - This software is distributed under the GNU General Public License. -------------------------------------------------------------------------- */ - -#include "cuda_shared.h" - -extern "C" void Cuda_PairCGCMMCoulDebyeCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,int eflag_atom,int vflag_atom); diff --git a/lib/cuda/pair_cg_cmm_coul_long_cuda.cu b/lib/cuda/pair_cg_cmm_coul_long_cuda.cu deleted file mode 100644 index dbdc2d2a12..0000000000 --- a/lib/cuda/pair_cg_cmm_coul_long_cuda.cu +++ /dev/null @@ -1,80 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - - Original Version: - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - See the README file in the top-level LAMMPS directory. - - ----------------------------------------------------------------------- - - USER-CUDA Package and associated modifications: - https://sourceforge.net/projects/lammpscuda/ - - Christian Trott, christian.trott@tu-ilmenau.de - Lars Winterfeld, lars.winterfeld@tu-ilmenau.de - Theoretical Physics II, University of Technology Ilmenau, Germany - - See the README file in the USER-CUDA directory. - - This software is distributed under the GNU General Public License. 
-------------------------------------------------------------------------- */ - -#include - -#define _lj1 MY_AP(coeff1) -#define _lj2 MY_AP(coeff2) -#define _lj3 MY_AP(coeff3) -#define _lj4 MY_AP(coeff4) -#define _cg_type MY_AP(coeff5) - - -#include "pair_cg_cmm_coul_long_cuda_cu.h" -#include - - - - -void Cuda_PairCGCMMCoulLongCuda_Init(cuda_shared_data* sdata) -{ - Cuda_Pair_Init_AllStyles(sdata, 5, true, false ); - -} - - - - -void Cuda_PairCGCMMCoulLongCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,int eflag_atom,int vflag_atom) -{ - - // initialize only on first call - static short init=0; - if(! init) - { - init = 1; - Cuda_PairCGCMMCoulLongCuda_Init(sdata); - } - - dim3 grid,threads; - int sharedperproc; - - Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc,true,128); - - cudaStream_t* streams = (cudaStream_t*) CudaWrapper_returnStreams(); - if(sdata->pair.use_block_per_atom) - Pair_Kernel_BpA - <<>> (eflag, vflag,eflag_atom,vflag_atom); - else - Pair_Kernel_TpA - <<>> (eflag, vflag,eflag_atom,vflag_atom); - - Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag); -} - -#undef _lj1 -#undef _lj2 -#undef _lj3 -#undef _lj4 -#undef _cg_type - diff --git a/lib/cuda/pair_cg_cmm_coul_long_cuda_cu.h b/lib/cuda/pair_cg_cmm_coul_long_cuda_cu.h deleted file mode 100644 index bed897d5d3..0000000000 --- a/lib/cuda/pair_cg_cmm_coul_long_cuda_cu.h +++ /dev/null @@ -1,26 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - - Original Version: - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - See the README file in the top-level LAMMPS directory. - - ----------------------------------------------------------------------- - - USER-CUDA Package and associated modifications: - https://sourceforge.net/projects/lammpscuda/ - - Christian Trott, christian.trott@tu-ilmenau.de - Lars Winterfeld, lars.winterfeld@tu-ilmenau.de - Theoretical Physics II, University of Technology Ilmenau, Germany - - See the README file in the USER-CUDA directory. - - This software is distributed under the GNU General Public License. -------------------------------------------------------------------------- */ - -#include "cuda_shared.h" - -extern "C" void Cuda_PairCGCMMCoulLongCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,int eflag_atom,int vflag_atom); diff --git a/lib/cuda/pair_cg_cmm_cuda.cu b/lib/cuda/pair_cg_cmm_cuda.cu deleted file mode 100644 index b4bb31e094..0000000000 --- a/lib/cuda/pair_cg_cmm_cuda.cu +++ /dev/null @@ -1,85 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - - Original Version: - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - See the README file in the top-level LAMMPS directory. - - ----------------------------------------------------------------------- - - USER-CUDA Package and associated modifications: - https://sourceforge.net/projects/lammpscuda/ - - Christian Trott, christian.trott@tu-ilmenau.de - Lars Winterfeld, lars.winterfeld@tu-ilmenau.de - Theoretical Physics II, University of Technology Ilmenau, Germany - - See the README file in the USER-CUDA directory. - - This software is distributed under the GNU General Public License. 
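By this point the scaffold is identical for every style: the pre-kernel helper derives grid, block size, and the per-thread shared-memory slots for the energy/virial accumulators, and the post-kernel helper reduces the per-block partials. A hedged sketch of what such a setup step has to compute; the internals here are assumptions inferred from the calls above (maxthreads caps the block size at the 128/192/256 values passed in), not the deleted helper itself:

// Launch-setup sketch; compile with nvcc so dim3 is available.
#include <algorithm>

void prekernel_sketch(int inum, int eflag, int vflag, int maxthreads,
                      dim3& grid, dim3& threads, int& sharedperproc)
{
  // One shared slot per thread for the energy tally, six more for the
  // virial tensor: the same 1/7 split the EAM and granular paths use below.
  sharedperproc = 0;
  if(eflag) sharedperproc = 1;
  if(vflag) sharedperproc = 7;

  threads = dim3(maxthreads, 1, 1);
  int nblocks = (inum + maxthreads - 1) / maxthreads;
  // Pre-Kepler GPUs capped grid.x at 65535 blocks, hence the 2D grid
  // split that getgrid() performs in the deleted code (layout.x, layout.y).
  grid = dim3(std::min(nblocks, 65535), (nblocks + 65534) / 65535, 1);
}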
-------------------------------------------------------------------------- */ - -#include - -#define _lj1 MY_AP(coeff1) -#define _lj2 MY_AP(coeff2) -#define _lj3 MY_AP(coeff3) -#define _lj4 MY_AP(coeff4) -#define _cg_type MY_AP(coeff5) - - enum {CG_NOT_SET=0, CG_LJ9_6, CG_LJ12_4, CG_LJ12_6, NUM_CG_TYPES, - CG_COUL_NONE, CG_COUL_CUT, CG_COUL_DEBYE, CG_COUL_LONG}; - -#include "pair_cg_cmm_cuda_cu.h" -#include "pair_cg_cmm_cuda_kernel_nc.cu" -#include - - - - -void Cuda_PairCGCMMCuda_Init(cuda_shared_data* sdata) -{ - Cuda_Pair_Init_AllStyles(sdata, 5, false, false ); - -} - - - - -void Cuda_PairCGCMMCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,int eflag_atom,int vflag_atom) -{ - - // initialize only on first call - static short init=0; - if(! init) - { - init = 1; - Cuda_PairCGCMMCuda_Init(sdata); - } - - dim3 grid,threads; - int sharedperproc; - - int maxthreads=128; - - Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc,false,maxthreads); - - cudaStream_t* streams = (cudaStream_t*) CudaWrapper_returnStreams(); - if(sdata->pair.use_block_per_atom) - Pair_Kernel_BpA - <<>> (eflag, vflag,eflag_atom,vflag_atom); - else - Pair_Kernel_TpA - <<>> (eflag, vflag,eflag_atom,vflag_atom); - - Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag); -} - -#undef _lj1 -#undef _lj2 -#undef _lj3 -#undef _lj4 -#undef _cg_type - diff --git a/lib/cuda/pair_cg_cmm_cuda_cu.h b/lib/cuda/pair_cg_cmm_cuda_cu.h deleted file mode 100644 index da6d6075f0..0000000000 --- a/lib/cuda/pair_cg_cmm_cuda_cu.h +++ /dev/null @@ -1,26 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - - Original Version: - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - See the README file in the top-level LAMMPS directory. - - ----------------------------------------------------------------------- - - USER-CUDA Package and associated modifications: - https://sourceforge.net/projects/lammpscuda/ - - Christian Trott, christian.trott@tu-ilmenau.de - Lars Winterfeld, lars.winterfeld@tu-ilmenau.de - Theoretical Physics II, University of Technology Ilmenau, Germany - - See the README file in the USER-CUDA directory. - - This software is distributed under the GNU General Public License. -------------------------------------------------------------------------- */ - -#include "cuda_shared.h" - -extern "C" void Cuda_PairCGCMMCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,int eflag_atom,int vflag_atom); diff --git a/lib/cuda/pair_cg_cmm_cuda_kernel_nc.cu b/lib/cuda/pair_cg_cmm_cuda_kernel_nc.cu deleted file mode 100644 index dcaaab7955..0000000000 --- a/lib/cuda/pair_cg_cmm_cuda_kernel_nc.cu +++ /dev/null @@ -1,48 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - - Original Version: - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - See the README file in the top-level LAMMPS directory. 
- - ----------------------------------------------------------------------- - - USER-CUDA Package and associated modifications: - https://sourceforge.net/projects/lammpscuda/ - - Christian Trott, christian.trott@tu-ilmenau.de - Lars Winterfeld, lars.winterfeld@tu-ilmenau.de - Theoretical Physics II, University of Technology Ilmenau, Germany - - See the README file in the USER-CUDA directory. - - This software is distributed under the GNU General Public License. -------------------------------------------------------------------------- */ - -__device__ inline F_FLOAT PairCGCMMCuda_Eval(const F_FLOAT& rsq,const int ij_type,F_FLOAT& factor_lj,int& eflag, ENERGY_FLOAT& evdwl) //0.11 of 0.4 -{ - const F_FLOAT r2inv = F_F(1.0)/rsq; - const int cg_type = _cg_type[ij_type]; - const F_FLOAT r4inv = r2inv*r2inv; - const F_FLOAT rNinv_first = cg_type!=CG_LJ9_6?r4inv:_RSQRT_(rsq); - const F_FLOAT rNinv_second = cg_type!=CG_LJ12_4?-r2inv:-F_F(1.0); - const F_FLOAT forcelj = r4inv * (_lj1[ij_type]*r4inv*rNinv_first + _lj2[ij_type]*rNinv_second); - - if(eflag) evdwl += factor_lj*(r4inv*(_lj3[ij_type]*r4inv*rNinv_first+_lj4[ij_type]*rNinv_second) - _offset[ij_type]); - return factor_lj*forcelj*r2inv; -} - -/*__device__ inline F_FLOAT PairCGCMMCuda_Eval(const F_FLOAT& rsq,const int ij_type,F_FLOAT& factor_lj,int& eflag, ENERGY_FLOAT& evdwl) -{ - const int cg_type = tex1Dfetch(_coeff5_gm_tex,ij_type); - const F_FLOAT r2inv = F_F(1.0)/rsq; - const F_FLOAT r4inv = r2inv*r2inv; - const F_FLOAT rNinv_first = cg_type!=CG_LJ9_6?r4inv:_RSQRT_(rsq); - const F_FLOAT rNinv_second = cg_type!=CG_LJ12_4?r2inv:F_F(1.0); - const F_FLOAT forcelj = r4inv * (tex1Dfetch(_coeff1_gm_tex,ij_type)*r4inv*rNinv_first - tex1Dfetch(_coeff2_gm_tex,ij_type)*rNinv_second); - - if(eflag) evdwl += factor_lj*(r4inv*(tex1Dfetch(_coeff3_gm_tex,ij_type)*r4inv*rNinv_first-tex1Dfetch(_coeff4_gm_tex,ij_type)*rNinv_second)); - return factor_lj*forcelj*r2inv; -}*/ diff --git a/lib/cuda/pair_eam_cuda.cu b/lib/cuda/pair_eam_cuda.cu deleted file mode 100644 index cb20343770..0000000000 --- a/lib/cuda/pair_eam_cuda.cu +++ /dev/null @@ -1,351 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - - Original Version: - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - See the README file in the top-level LAMMPS directory. - - ----------------------------------------------------------------------- - - USER-CUDA Package and associated modifications: - https://sourceforge.net/projects/lammpscuda/ - - Christian Trott, christian.trott@tu-ilmenau.de - Lars Winterfeld, lars.winterfeld@tu-ilmenau.de - Theoretical Physics II, University of Technology Ilmenau, Germany - - See the README file in the USER-CUDA directory. - - This software is distributed under the GNU General Public License. 
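PairCGCMMCuda_Eval above evaluates all three CMM variants without divergent math by splitting r^-n into r^-4 blocks: with rNinv_first = r^-4 (or r^-1 for LJ9-6) and rNinv_second = -r^-2 (or -1 for LJ12-4), the expression r^-4*(lj1*r^-4*rNinv_first + lj2*rNinv_second) expands to lj1*r^-12 - lj2*r^-6, lj1*r^-9 - lj2*r^-6, or lj1*r^-12 - lj2*r^-4 as appropriate. A standalone check of that identity (test values arbitrary):

// Host-side check of the r^-4 factorization used by PairCGCMMCuda_Eval.
#include <cmath>
#include <cstdio>

static double factored(double r, double lj1, double lj2, bool lj96, bool lj124)
{
  const double r2inv = 1.0 / (r * r);
  const double r4inv = r2inv * r2inv;
  const double first  = lj96  ? 1.0 / r : r4inv;    // rNinv_first
  const double second = lj124 ? -1.0    : -r2inv;   // rNinv_second
  return r4inv * (lj1 * r4inv * first + lj2 * second);
}

int main()
{
  const double r = 1.3, lj1 = 2.0, lj2 = 3.0;
  std::printf("%g %g\n", factored(r, lj1, lj2, false, false),
              lj1 * std::pow(r, -12.0) - lj2 * std::pow(r, -6.0));   // LJ12-6
  std::printf("%g %g\n", factored(r, lj1, lj2, true, false),
              lj1 * std::pow(r, -9.0)  - lj2 * std::pow(r, -6.0));   // LJ9-6
  std::printf("%g %g\n", factored(r, lj1, lj2, false, true),
              lj1 * std::pow(r, -12.0) - lj2 * std::pow(r, -4.0));   // LJ12-4
  return 0;
}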
-------------------------------------------------------------------------- */
-
-#include <stdio.h>
-
-#define _type2frho MY_AP(coeff1)
-#define _type2rhor MY_AP(coeff2)
-#define _type2z2r MY_AP(coeff3)
-#define _rdr MY_AP(rdr)
-#define _rdrho MY_AP(rdrho)
-#define _nr MY_AP(nr)
-#define _nrho MY_AP(nrho)
-#define _nfrho MY_AP(nfrho)
-#define _nrhor MY_AP(nrhor)
-#define _nz2r MY_AP(nz2r)
-#define _frho_spline MY_AP(frho_spline)
-#define _rhor_spline MY_AP(rhor_spline)
-#define _z2r_spline MY_AP(z2r_spline)
-#define _rho MY_AP(rho)
-#define _fp MY_AP(fp)
-
-__device__ __constant__ F_FLOAT MY_AP(rdr);
-__device__ __constant__ F_FLOAT MY_AP(rdrho);
-__device__ __constant__ int MY_AP(nr);
-__device__ __constant__ int MY_AP(nrho);
-__device__ __constant__ int MY_AP(nfrho);
-__device__ __constant__ int MY_AP(nrhor);
-__device__ __constant__ int MY_AP(nz2r);
-__device__ __constant__ F_FLOAT* MY_AP(frho_spline);
-__device__ __constant__ F_FLOAT* MY_AP(rhor_spline);
-__device__ __constant__ F_FLOAT* MY_AP(z2r_spline);
-__device__ __constant__ F_FLOAT* MY_AP(rho);
-__device__ __constant__ F_FLOAT* MY_AP(fp);
-
-#define _rhor_spline_tex MY_AP(rhor_spline_tex)
-#if F_PRECISION == 1
-texture<float4, 1> _rhor_spline_tex;
-#else
-texture<int4, 1> _rhor_spline_tex;
-#endif
-
-
-#define _z2r_spline_tex MY_AP(z2r_spline_tex)
-#if F_PRECISION == 1
-texture<float4, 1> _z2r_spline_tex;
-#else
-texture<int4, 1> _z2r_spline_tex;
-#endif
-
-
-
-#include "pair_eam_cuda_cu.h"
-#include "pair_eam_cuda_kernel_nc.cu"
-#include <time.h>
-
-int eam_buff_offset;
-int rhor_spline_size;
-void* rhor_spline_pointer;
-int z2r_spline_size;
-void* z2r_spline_pointer;
-
-
-inline void BindEAMTextures(cuda_shared_data* sdata)
-{
-  _rhor_spline_tex.normalized = false;                    // access with unnormalized texture coordinates
-  _rhor_spline_tex.filterMode = cudaFilterModePoint;      // point mode, so no filtering
-  _rhor_spline_tex.addressMode[0] = cudaAddressModeWrap;  // wrap texture coordinates
-
-  const textureReference* rhor_spline_texture_ptr = &MY_AP(rhor_spline_tex);
-
-#if F_PRECISION == 1
-  cudaChannelFormatDesc channelDescRhor = cudaCreateChannelDesc<float4>();
-  cudaBindTexture(0, rhor_spline_texture_ptr, rhor_spline_pointer, &channelDescRhor, rhor_spline_size);
-#else
-  cudaChannelFormatDesc channelDescRhor = cudaCreateChannelDesc<int4>();
-  cudaBindTexture(0, rhor_spline_texture_ptr, rhor_spline_pointer, &channelDescRhor, rhor_spline_size);
-#endif
-
-  _z2r_spline_tex.normalized = false;                     // access with unnormalized texture coordinates
-  _z2r_spline_tex.filterMode = cudaFilterModePoint;       // point mode, so no filtering
-  _z2r_spline_tex.addressMode[0] = cudaAddressModeWrap;   // wrap texture coordinates
-
-  const textureReference* z2r_spline_texture_ptr = &MY_AP(z2r_spline_tex);
-
-#if F_PRECISION == 1
-  cudaChannelFormatDesc channelDescZ2r = cudaCreateChannelDesc<float4>();
-  cudaBindTexture(0, z2r_spline_texture_ptr, z2r_spline_pointer, &channelDescZ2r, z2r_spline_size);
-#else
-  cudaChannelFormatDesc channelDescZ2r = cudaCreateChannelDesc<int4>();
-  cudaBindTexture(0, z2r_spline_texture_ptr, z2r_spline_pointer, &channelDescZ2r, z2r_spline_size);
-#endif
-
-}
-
-void Cuda_PairEAMCuda_UpdateBuffer(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist)
-{
-  CUT_CHECK_ERROR("Cuda_PairEAMCuda: before updateBuffer failed");
-  int3 layout = getgrid(sneighlist->inum, 7 * sizeof(F_FLOAT));
-  dim3 threads(layout.z, 1, 1);
-  dim3 grid(layout.x, layout.y, 1);
-  int size = (unsigned)(layout.y * layout.x) * 7 * sizeof(F_FLOAT);
-
-  if(sdata->buffersize < size) {
-    MYDBG(printf("Cuda_PairEAMCuda Resizing Buffer at %p with %i kB to\n", sdata->buffer,
sdata->buffersize);) - - if(sdata->buffer != NULL) cudaFree(sdata->buffer); - - cudaMalloc((void**)&sdata->buffer, size); - sdata->buffersize = size; - sdata->buffer_new++; - MYDBG(printf("New buffer at %p with %i kB\n", sdata->buffer, sdata->buffersize);) - } - - cudaMemcpyToSymbol(MY_AP(buffer), & sdata->buffer, sizeof(int*)); - CUT_CHECK_ERROR("Cuda_PairEAMCuda: updateBuffer failed"); -} - -void Cuda_PairEAMCuda_UpdateNeighbor(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist) -{ - cudaMemcpyToSymbol(MY_AP(neighbor_maxlocal) , & sneighlist->firstneigh.dim[0] , sizeof(unsigned)); - cudaMemcpyToSymbol(MY_AP(firstneigh), & sneighlist->firstneigh.dev_data, sizeof(int*)); - cudaMemcpyToSymbol(MY_AP(ilist) , & sneighlist->ilist .dev_data, sizeof(int*)); - cudaMemcpyToSymbol(MY_AP(inum) , & sneighlist->inum , sizeof(int)); - cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int)); - cudaMemcpyToSymbol(MY_AP(nmax) , & sdata->atom.nmax , sizeof(int)); - cudaMemcpyToSymbol(MY_AP(numneigh) , & sneighlist->numneigh .dev_data, sizeof(int*)); - cudaMemcpyToSymbol(MY_AP(neighbors) , & sneighlist->neighbors .dev_data, sizeof(int*)); - cudaMemcpyToSymbol(MY_AP(maxneighbors) , & sneighlist->maxneighbors , sizeof(int)); -} - -void Cuda_PairEAMCuda_UpdateNmax(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist) -{ - CUT_CHECK_ERROR("Cuda_PairEAMCuda: before updateNmax failed"); - cudaMemcpyToSymbol(MY_AP(x) , & sdata->atom.x .dev_data, sizeof(X_FLOAT*)); - cudaMemcpyToSymbol(MY_AP(x_type) , & sdata->atom.x_type .dev_data, sizeof(X_FLOAT4*)); - cudaMemcpyToSymbol(MY_AP(f) , & sdata->atom.f .dev_data, sizeof(F_FLOAT*)); - cudaMemcpyToSymbol(MY_AP(type) , & sdata->atom.type .dev_data, sizeof(int*)); - cudaMemcpyToSymbol(MY_AP(tag) , & sdata->atom.tag .dev_data, sizeof(int*)); - cudaMemcpyToSymbol(MY_AP(eatom) , & sdata->atom.eatom .dev_data, sizeof(ENERGY_FLOAT*)); - cudaMemcpyToSymbol(MY_AP(vatom) , & sdata->atom.vatom .dev_data, sizeof(ENERGY_FLOAT*)); - CUT_CHECK_ERROR("Cuda_PairEAMCuda: updateNmax failed"); -} - - -void Cuda_PairEAMCuda_Init(cuda_shared_data* sdata, double rdr, double rdrho, int nfrho, int nrhor, int nr, int nrho, int nz2r, - void* frho_spline, void* rhor_spline, void* z2r_spline, void* rho, void* fp, - int* type2frho, int** type2z2r, int** type2rhor) -{ - // !! LAMMPS indexes atom types starting with 1 !! - - unsigned cuda_ntypes = sdata->atom.ntypes + 1; - - if(cuda_ntypes * cuda_ntypes > CUDA_MAX_TYPES2) - printf("# CUDA: Cuda_PairEAMCuda_Init: you need %u types. this is more than %u " - "(assumed at compile time). 
re-compile with -DCUDA_MAX_TYPES_PLUS_ONE=99 "
-           "or adjust this in cuda_common.h\n", cuda_ntypes, CUDA_MAX_TYPES2);
-
-  unsigned nI = sizeof(F_FLOAT) * cuda_ntypes * cuda_ntypes;
-
-  X_FLOAT cutsq_global;
-  cutsq_global = (X_FLOAT)(sdata->pair.cut_global);
-  cudaMemcpyToSymbol(MY_AP(cutsq_global) , &cutsq_global  , sizeof(X_FLOAT));
-
-
-  F_FLOAT* coeff_buf = new F_FLOAT[cuda_ntypes * cuda_ntypes];
-
-  for(int i = 0; i < cuda_ntypes; i++) coeff_buf[i] = type2frho[i];
-
-  cudaMemcpyToSymbol(MY_AP(coeff1) , coeff_buf , cuda_ntypes * sizeof(F_FLOAT));
-
-  for(int i = 0; i < cuda_ntypes * cuda_ntypes; i++) coeff_buf[i] = (&type2rhor[0][0])[i];
-
-  cudaMemcpyToSymbol(MY_AP(coeff2) , coeff_buf , nI);
-
-  for(int i = 0; i < cuda_ntypes * cuda_ntypes; i++) coeff_buf[i] = (&type2z2r[0][0])[i];
-
-  cudaMemcpyToSymbol(MY_AP(coeff3) , coeff_buf , nI);
-
-  delete [] coeff_buf;
-  X_FLOAT box_size[3] = {
-    sdata->domain.subhi[0] - sdata->domain.sublo[0],
-    sdata->domain.subhi[1] - sdata->domain.sublo[1],
-    sdata->domain.subhi[2] - sdata->domain.sublo[2]
-  };
-  F_FLOAT rdr_F = rdr;
-  F_FLOAT rdrho_F = rdrho;
-  cudaMemcpyToSymbol(MY_AP(box_size) , box_size , sizeof(X_FLOAT) * 3);
-  cudaMemcpyToSymbol(MY_AP(cuda_ntypes), & cuda_ntypes , sizeof(unsigned));
-  cudaMemcpyToSymbol(MY_AP(virial) , &sdata->pair.virial.dev_data , sizeof(ENERGY_FLOAT*));
-  cudaMemcpyToSymbol(MY_AP(eng_vdwl) , &sdata->pair.eng_vdwl.dev_data , sizeof(ENERGY_FLOAT*));
-  cudaMemcpyToSymbol(MY_AP(periodicity), sdata->domain.periodicity, sizeof(int) * 3);
-  cudaMemcpyToSymbol(MY_AP(collect_forces_later), &sdata->pair.collect_forces_later , sizeof(int));
-  cudaMemcpyToSymbol(MY_AP(rdr), &rdr_F, sizeof(F_FLOAT));
-  cudaMemcpyToSymbol(MY_AP(rdrho), &rdrho_F, sizeof(F_FLOAT));
-  cudaMemcpyToSymbol(MY_AP(nr), &nr, sizeof(int));
-  cudaMemcpyToSymbol(MY_AP(nrho), &nrho, sizeof(int));
-  cudaMemcpyToSymbol(MY_AP(nfrho), &nfrho, sizeof(int));
-  cudaMemcpyToSymbol(MY_AP(nrhor), &nrhor, sizeof(int));
-  cudaMemcpyToSymbol(MY_AP(rho), &rho, sizeof(F_FLOAT*));
-  cudaMemcpyToSymbol(MY_AP(fp), &fp, sizeof(F_FLOAT*));
-  cudaMemcpyToSymbol(MY_AP(frho_spline), &frho_spline, sizeof(F_FLOAT*));
-  cudaMemcpyToSymbol(MY_AP(rhor_spline), &rhor_spline, sizeof(F_FLOAT*));
-  cudaMemcpyToSymbol(MY_AP(z2r_spline), &z2r_spline, sizeof(F_FLOAT*));
-  cudaMemcpyToSymbol(MY_AP(nrhor), &nrhor, sizeof(int));
-
-  rhor_spline_size = nrhor * (nr + 1) * EAM_COEFF_LENGTH * sizeof(F_FLOAT);
-  z2r_spline_size = nz2r * (nr + 1) * EAM_COEFF_LENGTH * sizeof(F_FLOAT);
-  rhor_spline_pointer = rhor_spline;
-  z2r_spline_pointer = z2r_spline;
-
-  CUT_CHECK_ERROR("Cuda_PairEAMCuda: init failed");
-
-}
-
-
-
-void Cuda_PairEAM1Cuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom)
-{
-
-  if(sdata->atom.update_nmax)
-    Cuda_PairEAMCuda_UpdateNmax(sdata, sneighlist);
-
-  if(sdata->atom.update_neigh)
-    Cuda_PairEAMCuda_UpdateNeighbor(sdata, sneighlist);
-
-  if(sdata->atom.update_nlocal)
-    cudaMemcpyToSymbol(MY_AP(nlocal)  , & sdata->atom.nlocal  , sizeof(int));
-
-  if(sdata->buffer_new)
-    Cuda_PairEAMCuda_UpdateBuffer(sdata, sneighlist);
-
-  cudaMemcpyToSymbol(MY_AP(eatom) , & sdata->atom.eatom .dev_data, sizeof(ENERGY_FLOAT*));
-  cudaMemcpyToSymbol(MY_AP(vatom) , & sdata->atom.vatom .dev_data, sizeof(ENERGY_FLOAT*));
-
-  int sharedperproc = 0;
-
-  if(eflag || eflag_atom) sharedperproc = 1;
-
-  if(vflag || vflag_atom) sharedperproc = 7;
-
-  int3 layout = getgrid(sneighlist->inum, sharedperproc * sizeof(ENERGY_FLOAT));
-  dim3
threads(layout.z, 1, 1); - dim3 grid(layout.x, layout.y, 1); - - eam_buff_offset = grid.x * grid.y; - - BindXTypeTexture(sdata); - BindEAMTextures(sdata); // initialize only on first call - - - MYDBG(printf("# CUDA: Cuda_PairEAMCuda: kernel start eflag: %i vflag: %i\n", eflag, vflag);) - CUT_CHECK_ERROR("Cuda_PairEAMCuda: pre pair Kernel 1 problems before kernel invocation"); - PairEAMCuda_Kernel1 <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x>>> (eflag, vflag, eflag_atom, vflag_atom); - cudaThreadSynchronize(); - CUT_CHECK_ERROR("Cuda_PairEAMCuda: pair Kernel 1 execution failed"); - - - - MYDBG(printf("# CUDA: Cuda_PairEAMCoulLongCuda: kernel done\n");) - -} - -void Cuda_PairEAM2Cuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom) -{ - int sharedperproc = 0; - - if(eflag || eflag_atom) sharedperproc = 1; - - if(vflag || vflag_atom) sharedperproc = 7; - - int3 layout = getgrid(sneighlist->inum, sharedperproc * sizeof(ENERGY_FLOAT)); - dim3 threads(layout.z, 1, 1); - dim3 grid(layout.x, layout.y, 1); - - BindXTypeTexture(sdata); - BindEAMTextures(sdata); // initialize only on first call - // initialize only on first call - sdata->pair.lastgridsize = grid.x * grid.y; - sdata->pair.n_energy_virial = sharedperproc; - - MYDBG(printf("# CUDA: Cuda_PairEAMCuda: kernel start eflag: %i vflag: %i\n", eflag, vflag);) - CUT_CHECK_ERROR("Cuda_PairEAMCuda: pre pair Kernel 2 problems before kernel invocation"); - PairEAMCuda_Kernel2 <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x>>> (eflag, vflag, eflag_atom, vflag_atom); - CUT_CHECK_ERROR("Cuda_PairEAMCuda: pair Kernel 2 start failed"); - cudaThreadSynchronize(); - CUT_CHECK_ERROR("Cuda_PairEAMCuda: pair Kernel 2 execution failed"); - - if(eflag || vflag) { - int n = grid.x * grid.y; - grid.x = sharedperproc; - grid.y = 1; - threads.x = 256; - MY_AP(PairVirialCompute_reduce) <<< grid, threads, threads.x* sizeof(ENERGY_FLOAT)*sharedperproc>>>(n); - cudaThreadSynchronize(); - CUT_CHECK_ERROR("Cuda_PairEAMCuda: virial compute Kernel execution failed"); - } - - MYDBG(printf("# CUDA: Cuda_PairEAMCoulLongCuda: kernel done\n");) - -} - -void Cuda_PairEAMCuda_PackComm(cuda_shared_data* sdata, int n, int iswap, void* buf_send) -{ - int3 layout = getgrid(n, 0); - dim3 threads(layout.z, 1, 1); - dim3 grid(layout.x, layout.y, 1); - F_FLOAT* buf = (F_FLOAT*)(& ((double*)sdata->buffer)[eam_buff_offset]); - - PairEAMCuda_PackComm_Kernel <<< grid, threads, 0>>> ((int*) sdata->comm.sendlist.dev_data, n - , sdata->comm.maxlistlength, iswap, buf); - cudaThreadSynchronize(); - cudaMemcpy(buf_send, buf, n* sizeof(F_FLOAT), cudaMemcpyDeviceToHost); - cudaThreadSynchronize(); -} - -void Cuda_PairEAMCuda_UnpackComm(cuda_shared_data* sdata, int n, int first, void* buf_recv, void* fp) -{ - F_FLOAT* fp_first = &(((F_FLOAT*) fp)[first]); - cudaMemcpy(fp_first, buf_recv, n * sizeof(F_FLOAT), cudaMemcpyHostToDevice); -} - -#undef _type2frho -#undef _type2rhor -#undef _type2z2r - - -/* ---------------------------------------------------------------------- - tally eng_vdwl and virial into global and per-atom accumulators - need i < nlocal test since called by bond_quartic and dihedral_charmm -------------------------------------------------------------------------- */ - diff --git a/lib/cuda/pair_eam_cuda_cu.h b/lib/cuda/pair_eam_cuda_cu.h deleted file mode 100644 index dee4a036e2..0000000000 --- a/lib/cuda/pair_eam_cuda_cu.h +++ /dev/null @@ -1,33 +0,0 @@ -/* 
---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - - Original Version: - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - See the README file in the top-level LAMMPS directory. - - ----------------------------------------------------------------------- - - USER-CUDA Package and associated modifications: - https://sourceforge.net/projects/lammpscuda/ - - Christian Trott, christian.trott@tu-ilmenau.de - Lars Winterfeld, lars.winterfeld@tu-ilmenau.de - Theoretical Physics II, University of Technology Ilmenau, Germany - - See the README file in the USER-CUDA directory. - - This software is distributed under the GNU General Public License. -------------------------------------------------------------------------- */ - -#include "cuda_shared.h" -extern "C" void Cuda_PairEAMCuda_Init(cuda_shared_data* sdata,double rdr,double rdrho,int nfrho, int nrhor,int nr, int nrho,int nz2r, -void* frho_spline,void* rhor_spline,void* z2r_spline,void* rho,void* fp, -int* type2frho,int** type2z2r,int** type2rhor); -extern "C" void Cuda_PairEAM1Cuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,int eflag_atom,int vflag_atom); -extern "C" void Cuda_PairEAM2Cuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,int eflag_atom,int vflag_atom); -extern "C" void Cuda_PairEAMCuda_PackComm(cuda_shared_data* sdata,int n,int iswap,void* buf_send); -extern "C" void Cuda_PairEAMCuda_UnpackComm(cuda_shared_data* sdata,int n,int first,void* buf_recv,void* fp); - -#define EAM_COEFF_LENGTH 8 diff --git a/lib/cuda/pair_eam_cuda_kernel_nc.cu b/lib/cuda/pair_eam_cuda_kernel_nc.cu deleted file mode 100644 index a3dc30f397..0000000000 --- a/lib/cuda/pair_eam_cuda_kernel_nc.cu +++ /dev/null @@ -1,340 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - - Original Version: - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - See the README file in the top-level LAMMPS directory. - - ----------------------------------------------------------------------- - - USER-CUDA Package and associated modifications: - https://sourceforge.net/projects/lammpscuda/ - - Christian Trott, christian.trott@tu-ilmenau.de - Lars Winterfeld, lars.winterfeld@tu-ilmenau.de - Theoretical Physics II, University of Technology Ilmenau, Germany - - See the README file in the USER-CUDA directory. - - This software is distributed under the GNU General Public License. 
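The spline textures bound in BindEAMTextures are declared float4 in single precision but int4 when F_PRECISION is 2, because tex1Dfetch cannot return doubles; each double has to be reassembled from two 32-bit words. The fetchRhor/fetchZ2r wrappers in the next hunk rely on a helper of roughly this shape (a standard CUDA idiom of that era; the exact deleted tex1Dfetch_double_f may differ):

// Sketch of the int4 -> doubles reassembly behind tex1Dfetch_double_f.
// __hiloint2double is the standard CUDA intrinsic for this trick.
texture<int4, 1> spline_tex;   // two doubles packed per int4 texel

struct double4_sketch { double x, y, z, w; };

static __device__ inline double4_sketch fetch_double4(int i)
{
  const int4 lo = tex1Dfetch(spline_tex, 2 * i);       // first two doubles
  const int4 hi = tex1Dfetch(spline_tex, 2 * i + 1);   // second two doubles
  double4_sketch v;
  v.x = __hiloint2double(lo.y, lo.x);
  v.y = __hiloint2double(lo.w, lo.z);
  v.z = __hiloint2double(hi.y, hi.x);
  v.w = __hiloint2double(hi.w, hi.z);
  return v;
}

Routing the read-only spline tables through the texture cache was the usual way to get cached, broadcast-friendly loads on sm_13/sm_20 hardware, which is exactly the generation this Makefile targeted.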
-------------------------------------------------------------------------- */ - - - - -static __device__ inline F_FLOAT4 fetchRhor(int i) -{ - #ifdef CUDA_USE_TEXTURE - #if F_PRECISION == 1 - return tex1Dfetch(_rhor_spline_tex,i); - #else - return tex1Dfetch_double_f(_rhor_spline_tex,i); - #endif - #else - return _rhor_spline[i]; - #endif -} - -static __device__ inline F_FLOAT4 fetchZ2r(int i) -{ - #ifdef CUDA_USE_TEXTURE - #if F_PRECISION == 1 - return tex1Dfetch(_z2r_spline_tex,i); - #else - return tex1Dfetch_double_f(_z2r_spline_tex,i); - #endif - #else - return _z2r_spline[i]; - #endif -} - -__global__ void PairEAMCuda_Kernel1(int eflag, int vflag,int eflag_atom,int vflag_atom) -{ - ENERGY_FLOAT* sharedE; - ENERGY_FLOAT* sharedV = &sharedmem[threadIdx.x]; - - - if(eflag||eflag_atom) - { - sharedE = &sharedmem[threadIdx.x]; - sharedE[0] = ENERGY_F(0.0); - sharedV += blockDim.x; - } - if(vflag||vflag_atom) - { - sharedV[0*blockDim.x] = ENERGY_F(0.0); - sharedV[1*blockDim.x] = ENERGY_F(0.0); - sharedV[2*blockDim.x] = ENERGY_F(0.0); - sharedV[3*blockDim.x] = ENERGY_F(0.0); - sharedV[4*blockDim.x] = ENERGY_F(0.0); - sharedV[5*blockDim.x] = ENERGY_F(0.0); - } - - int ii = (blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x; - - X_FLOAT xtmp,ytmp,ztmp; - X_FLOAT4 myxtype; - F_FLOAT delx,dely,delz; - int itype; - int i=_nlocal; - int jnum=0; - int* jlist; - - if(ii < _inum) - { - i = _ilist[ii]; - - myxtype=fetchXType(i); - xtmp=myxtype.x; - ytmp=myxtype.y; - ztmp=myxtype.z; - itype=static_cast (myxtype.w); - - jnum = _numneigh[i]; - - jlist = &_neighbors[i]; - if(i<_nlocal) - _rho[i]=F_F(0.0); - } - __syncthreads(); - - for (int jj = 0; jj < jnum; jj++) - { - if(ii < _inum) - if(jj (myxtype.w); - const F_FLOAT rsq = delx*delx + dely*dely + delz*delz; - if (rsq < _cutsq_global) - { - F_FLOAT p = sqrt(rsq)*_rdr + F_F(1.0); - int m = static_cast (p); - m = MIN(m,_nr-1); - p -= m; - p = MIN(p,F_F(1.0)); - - int k=(static_cast (_type2rhor[jtype*_cuda_ntypes+itype])*(_nr+1)+m)*2; - F_FLOAT4 c=fetchRhor(k+1); - _rho[i] += ((c.w*p+c.x)*p+c.y)*p+c.z; - } - } - } - - if(ii < _inum) - { - - F_FLOAT p = _rho[i]*_rdrho + F_F(1.0); - int m = static_cast (p); - m = MAX(1,MIN(m,_nrho-1)); - p -= m; - p = MIN(p,F_F(1.0)); - F_FLOAT* coeff = &_frho_spline[(static_cast (_type2frho[itype])*(_nrho+1)+m)*EAM_COEFF_LENGTH]; - _fp[i] = (coeff[0]*p + coeff[1])*p + coeff[2]; - if (eflag||eflag_atom) { - sharedmem[threadIdx.x] += ((coeff[3]*p + coeff[4])*p + coeff[5])*p + coeff[6]; - } - - } - __syncthreads(); - if(eflag||eflag_atom) - { - if(i<_nlocal&&eflag_atom) - _eatom[i]+=sharedmem[threadIdx.x]; - reduceBlock(sharedmem); - ENERGY_FLOAT* buffer = (ENERGY_FLOAT*) _buffer; - buffer[blockIdx.x * gridDim.y + blockIdx.y] = ENERGY_F(2.0)*sharedmem[0]; - } -} - -__global__ void PairEAMCuda_Kernel2(int eflag, int vflag,int eflag_atom,int vflag_atom) -{ - ENERGY_FLOAT evdwl = ENERGY_F(0.0); - - ENERGY_FLOAT* sharedE; - ENERGY_FLOAT* sharedV = &sharedmem[threadIdx.x]; - - - if(eflag||eflag_atom) - { - sharedE = &sharedmem[threadIdx.x]; - sharedE[0] = ENERGY_F(0.0); - sharedV += blockDim.x; - } - if(vflag||vflag_atom) - { - sharedV[0*blockDim.x] = ENERGY_F(0.0); - sharedV[1*blockDim.x] = ENERGY_F(0.0); - sharedV[2*blockDim.x] = ENERGY_F(0.0); - sharedV[3*blockDim.x] = ENERGY_F(0.0); - sharedV[4*blockDim.x] = ENERGY_F(0.0); - sharedV[5*blockDim.x] = ENERGY_F(0.0); - } - - int ii = (blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x; - - X_FLOAT xtmp,ytmp,ztmp; - X_FLOAT4 myxtype; - F_FLOAT fxtmp,fytmp,fztmp,fpair; - 
F_FLOAT delx,dely,delz; - int itype,i; - int jnum=0; - int* jlist; - - if(ii < _inum) - { - i = _ilist[ii]; - - myxtype=fetchXType(i); - xtmp=myxtype.x; - ytmp=myxtype.y; - ztmp=myxtype.z; - itype=static_cast (myxtype.w); - fxtmp = F_F(0.0); - fytmp = F_F(0.0); - fztmp = F_F(0.0); - - jnum = _numneigh[i]; - - jlist = &_neighbors[i]; - if(i<_nlocal) - _rho[i]=F_F(0.0); - } - if(ii (myxtype.w); - const F_FLOAT rsq = delx*delx + dely*dely + delz*delz; - - if (rsq < _cutsq_global) - { - F_FLOAT r = _SQRT_(rsq); - F_FLOAT p = r*_rdr + F_F(1.0); - int m = static_cast (p); - m = MIN(m,_nr-1); - p -= m; - p = MIN(p,F_F(1.0)); - - int k=(static_cast (_type2rhor[itype*_cuda_ntypes+jtype])*(_nr+1)+m)*2; - F_FLOAT4 c=fetchRhor(k); - F_FLOAT rhoip = (c.x*p + c.y)*p + c.z; - k=(static_cast (_type2rhor[jtype*_cuda_ntypes+itype])*(_nr+1)+m)*2; - c=fetchRhor(k); - F_FLOAT rhojp = (c.x*p + c.y)*p + c.z; - k=(static_cast (_type2z2r[itype*_cuda_ntypes+jtype])*(_nr+1)+m)*2; - c=fetchZ2r(k); - F_FLOAT z2p = (c.x*p + c.y)*p + c.z; - c=fetchZ2r(k+1); - F_FLOAT z2 = ((c.w*p + c.x)*p + c.y)*p+c.z; - - F_FLOAT recip = F_F(1.0)/r; - F_FLOAT phi = z2*recip; - F_FLOAT phip = z2p*recip - phi*recip; - F_FLOAT psip = _fp[i]*rhojp + _fp[j]*rhoip + phip; - fpair = -psip*recip; - - F_FLOAT dxfp,dyfp,dzfp; - fxtmp += dxfp = delx*fpair; - fytmp += dyfp = dely*fpair; - fztmp += dzfp = delz*fpair; - evdwl+=phi; - if(vflag||vflag_atom) - { - sharedV[0 * blockDim.x]+= delx*dxfp; - sharedV[1 * blockDim.x]+= dely*dyfp; - sharedV[2 * blockDim.x]+= delz*dzfp; - sharedV[3 * blockDim.x]+= delx*dyfp; - sharedV[4 * blockDim.x]+= delx*dzfp; - sharedV[5 * blockDim.x]+= dely*dzfp; - } - } - } - } - - __syncthreads(); - if(ii < _inum) - { - F_FLOAT* my_f; - if(_collect_forces_later) - { - ENERGY_FLOAT* buffer = (ENERGY_FLOAT*) _buffer; - if(eflag) - { - buffer=&buffer[1 * gridDim.x * gridDim.y]; - } - if(vflag) - { - buffer=&buffer[6 * gridDim.x * gridDim.y]; - } - my_f = (F_FLOAT*) buffer; - my_f += i; - *my_f = fxtmp; my_f += _nmax; - *my_f = fytmp; my_f += _nmax; - *my_f = fztmp; - } - else - { - my_f = _f + i; - *my_f += fxtmp; my_f += _nmax; - *my_f += fytmp; my_f += _nmax; - *my_f += fztmp; - } - } - __syncthreads(); - - if(eflag) - { - sharedE[0] = evdwl; - } - if(eflag_atom && i<_nlocal) - { - _eatom[i] += evdwl; - } - - if(vflag_atom && i<_nlocal) - { - _vatom[i] += ENERGY_F(0.5) * sharedV[0 * blockDim.x]; - _vatom[i+_nmax] += ENERGY_F(0.5) * sharedV[1 * blockDim.x]; - _vatom[i+2*_nmax] += ENERGY_F(0.5) * sharedV[2 * blockDim.x]; - _vatom[i+3*_nmax] += ENERGY_F(0.5) * sharedV[3 * blockDim.x]; - _vatom[i+4*_nmax] += ENERGY_F(0.5) * sharedV[4 * blockDim.x]; - _vatom[i+5*_nmax] += ENERGY_F(0.5) * sharedV[5 * blockDim.x]; - } - if(vflag||eflag) PairVirialCompute_A_Kernel(eflag,vflag,0); -} - -__global__ void PairEAMCuda_PackComm_Kernel(int* sendlist,int n,int maxlistlength,int iswap,F_FLOAT* buffer) -{ - int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x; - int* list=sendlist+iswap*maxlistlength; - if(i - -#define _kn MY_AP(coeff1) //[0] -#define _kt MY_AP(coeff1) //[1] -#define _gamman MY_AP(coeff1) //[2] -#define _gammat MY_AP(coeff3) //[0] -#define _xmu MY_AP(coeff2) //[0] -#define _dampflag MY_AP(coeff2) //[1] - -#include "pair_gran_hooke_cuda_cu.h" -#include "pair_gran_hooke_cuda_kernel_nc.cu" -#include - -void Cuda_PairGranHookeCuda_UpdateBuffer(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist) -{ - CUT_CHECK_ERROR("Cuda_PairGranHookeCuda: before updateBuffer failed"); - int3 
layout=getgrid(sneighlist->inum,7*sizeof(ENERGY_FLOAT));
-  dim3 threads(layout.z, 1, 1);
-  dim3 grid(layout.x, layout.y, 1);
-  int size=(unsigned)(layout.y*layout.x)*7*sizeof(ENERGY_FLOAT);
-  if(sdata->buffersize<size)
-  {
-    MYDBG(printf("Cuda_PairGranHookeCuda Resizing Buffer at %p with %i kB to\n",sdata->buffer,sdata->buffersize);)
-    if(sdata->buffer!=NULL) cudaFree(sdata->buffer);
-    cudaMalloc((void**)&sdata->buffer,size);
-    sdata->buffersize=size;
-    sdata->buffer_new++;
-    MYDBG(printf("New buffer at %p with %i kB\n",sdata->buffer,sdata->buffersize);)
-  }
-  cudaMemcpyToSymbol(MY_CONST(buffer), & sdata->buffer, sizeof(int*) );
-  CUT_CHECK_ERROR("Cuda_PairGranHookeCuda: updateBuffer failed");
-}
-
-void Cuda_PairGranHookeCuda_UpdateNmax(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist)
-{
-  CUT_CHECK_ERROR("Cuda_PairGranHookeCuda: before updateNmax failed");
-  cudaMemcpyToSymbol(MY_CONST(neighbor_maxlocal) , & sneighlist->firstneigh.dim[0] , sizeof(unsigned) );
-  //cudaMemcpyToSymbol(MY_CONST(firstneigh), & sneighlist->firstneigh.dev_data, sizeof(int*) );
-  cudaMemcpyToSymbol(MY_CONST(ilist)     , & sneighlist->ilist    .dev_data, sizeof(int*) );
-  cudaMemcpyToSymbol(MY_CONST(inum)      , & sneighlist->inum              , sizeof(int) );
-  cudaMemcpyToSymbol(MY_CONST(nlocal)    , & sdata->atom.nlocal            , sizeof(int) );
-  cudaMemcpyToSymbol(MY_CONST(nall)      , & sdata->atom.nall              , sizeof(int) );
-  cudaMemcpyToSymbol(MY_CONST(nmax)      , & sdata->atom.nmax              , sizeof(int) );
-  cudaMemcpyToSymbol(MY_CONST(numneigh)  , & sneighlist->numneigh .dev_data, sizeof(int*) );
-  cudaMemcpyToSymbol(MY_CONST(neighbors) , & sneighlist->neighbors.dev_data, sizeof(int*) );
-  cudaMemcpyToSymbol(MY_CONST(type)      , & sdata->atom.type     .dev_data, sizeof(int*) );
-  cudaMemcpyToSymbol(MY_CONST(tag)       , & sdata->atom.tag      .dev_data, sizeof(int*) );
-  cudaMemcpyToSymbol(MY_CONST(mask)      , & sdata->atom.mask     .dev_data, sizeof(int*) );
-  cudaMemcpyToSymbol(MY_CONST(f)         , & sdata->atom.f        .dev_data, sizeof(F_FLOAT*) );
-  cudaMemcpyToSymbol(MY_CONST(x)         , & sdata->atom.x        .dev_data, sizeof(X_FLOAT*) );
-  cudaMemcpyToSymbol(MY_CONST(x_type)    , & sdata->atom.x_type   .dev_data, sizeof(X_FLOAT4*) );
-  cudaMemcpyToSymbol(MY_CONST(v_radius)  , & sdata->atom.v_radius .dev_data, sizeof(V_FLOAT4*) );
-  cudaMemcpyToSymbol(MY_CONST(omega_rmass),& sdata->atom.omega_rmass.dev_data,sizeof(V_FLOAT4*) );
-  cudaMemcpyToSymbol(MY_CONST(torque)    , & sdata->atom.torque   .dev_data, sizeof(F_FLOAT*) );
-  cudaMemcpyToSymbol(MY_CONST(maxneighbors),&sneighlist->maxneighbors      , sizeof(int) );
-  cudaMemcpyToSymbol(MY_CONST(eatom)     , & sdata->atom.eatom    .dev_data, sizeof(ENERGY_FLOAT*) );
-  cudaMemcpyToSymbol(MY_CONST(vatom)     , & sdata->atom.vatom    .dev_data, sizeof(ENERGY_FLOAT*) );
-  cudaMemcpyToSymbol(MY_CONST(debugdata) , & sdata->debugdata               , sizeof(int*) );
-  cudaMemcpyToSymbol(MY_CONST(freeze_group_bit) , & sdata->pair.freeze_group_bit, sizeof(int) );
-
-
-  CUT_CHECK_ERROR("Cuda_PairGranHookeCuda: updateNmax failed");
-}
-
-
-void Cuda_PairGranHookeCuda_Init(cuda_shared_data* sdata)
-{
-  // !! LAMMPS indexes atom types starting with 1 !!
-
-  unsigned cuda_ntypes = sdata->atom.ntypes + 2;
-  if(cuda_ntypes*cuda_ntypes > CUDA_MAX_TYPES2)
-    printf("# CUDA: Cuda_PairGranHookeCuda_Init: you need %u types. this is more than %u "
-           "(assumed at compile time). re-compile with -DCUDA_MAX_TYPES_PLUS_ONE=32 "
-           "or adjust this in cuda_common.h\n", cuda_ntypes, CUDA_MAX_TYPES_PLUS_ONE-1);
-  unsigned cuda_ntypes2 = cuda_ntypes * cuda_ntypes;
-  unsigned n = sizeof(F_FLOAT) * cuda_ntypes2;
-
-  F_FLOAT coeffs1[cuda_ntypes2];
-  coeffs1[0]= (F_FLOAT) sdata->pair.coeff1[0][0];
-  coeffs1[1]= (F_FLOAT) sdata->pair.coeff1[0][1];
-  coeffs1[2]= (F_FLOAT) sdata->pair.coeff1[1][0];
-  F_FLOAT coeffs3[cuda_ntypes2];
-  coeffs3[0]= (F_FLOAT) sdata->pair.coeff1[1][1];
-  F_FLOAT coeffs2[cuda_ntypes2];
-  coeffs2[0]= (F_FLOAT) sdata->pair.coeff2[0][0];
-  coeffs2[1]= (F_FLOAT) sdata->pair.coeff2[0][1];
-
-
-  X_FLOAT box_size[3] =
-  {
-    sdata->domain.subhi[0] - sdata->domain.sublo[0],
-    sdata->domain.subhi[1] - sdata->domain.sublo[1],
-    sdata->domain.subhi[2] - sdata->domain.sublo[2]
-  };
-  //printf("n: %i %i\n",n,CUDA_MAX_TYPES2);
-  cudaMemcpyToSymbol(MY_CONST(box_size)   , box_size                 , sizeof(X_FLOAT)*3);
-  cudaMemcpyToSymbol(MY_CONST(cuda_ntypes), & cuda_ntypes            , sizeof(unsigned) );
-  cudaMemcpyToSymbol(MY_CONST(coeff1)     , coeffs1                  , n );
-  cudaMemcpyToSymbol(MY_CONST(coeff2)     , coeffs2                  , n );
-  cudaMemcpyToSymbol(MY_CONST(coeff3)     , coeffs3                  , n );
-  cudaMemcpyToSymbol(MY_CONST(virial)     , &sdata->pair.virial.dev_data  , sizeof(ENERGY_FLOAT*) );
-  cudaMemcpyToSymbol(MY_CONST(eng_vdwl)   , &sdata->pair.eng_vdwl.dev_data, sizeof(ENERGY_FLOAT*) );
-  cudaMemcpyToSymbol(MY_CONST(periodicity), sdata->domain.periodicity, sizeof(int)*3 );
-  CUT_CHECK_ERROR("Cuda_PairGranHookeCuda: init failed");
-}
-
-
-
-void Cuda_PairGranHookeCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,int eflag_atom,int vflag_atom)
-{
-
-  //if(sdata->atom.update_nmax)
-  Cuda_PairGranHookeCuda_UpdateNmax(sdata,sneighlist);
-  //if(sdata->atom.update_nlocal)
-  {
-    cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) );
-    cudaMemcpyToSymbol(MY_CONST(nall)   , & sdata->atom.nall   , sizeof(int) );
-  }
-  //if(sdata->buffer_new)
-  Cuda_PairGranHookeCuda_UpdateBuffer(sdata,sneighlist);
-
-  BindXTypeTexture(sdata);
-  BindVRadiusTexture(sdata);
-  BindOmegaRmassTexture(sdata);
-
-  int sharedperproc=0;
-  if(eflag) sharedperproc+=1;
-  if(vflag) sharedperproc+=6;
-
-  int3 layout=getgrid(sneighlist->inum,sharedperproc*sizeof(ENERGY_FLOAT),128);
-  dim3 threads(layout.z, 1, 1);
-  dim3 grid(layout.x, layout.y, 1);
-
-  // initialize only on first call
-  static short init=0;
-  if(! init)
-  {
-    init = 1;
-    Cuda_PairGranHookeCuda_Init(sdata);
-  }
-
-  MYDBG( printf("# CUDA: Cuda_PairGranHookeCuda: kernel start eflag: %i vflag: %i config: %i %i %i %i\n",eflag,vflag,grid.x,grid.y, threads.x,sharedperproc*sizeof(ENERGY_FLOAT)*threads.x); )
-
-  CUT_CHECK_ERROR("Cuda_PairGranHookeCuda: (no binning) pre pair lj cut Kernel problems before kernel invocation");
-  PairGranHookeCuda_Kernel<<<grid, threads, sharedperproc*sizeof(ENERGY_FLOAT)*threads.x>>> (eflag, vflag,eflag_atom,vflag_atom,(int**)sneighlist->firstneigh.dev_data,sneighlist->binned_id
-      ,(F_FLOAT) sdata->pair.coeff1[0][0],(F_FLOAT) sdata->pair.coeff1[1][0],(F_FLOAT) sdata->pair.coeff1[1][1],(F_FLOAT) sdata->pair.coeff2[0][0]);
-  cudaThreadSynchronize();
-  CUT_CHECK_ERROR("Cuda_PairGranHookeCuda: (no binning) pair lj cut Kernel execution failed");
-
-  if(eflag||vflag)
-  {
-    int n=grid.x*grid.y;
-    grid.x=sharedperproc;
-    grid.y=1;
-    threads.x=256;
-    MY_AP(PairVirialCompute_reduce)<<<grid,threads,threads.x*sizeof(ENERGY_FLOAT)*sharedperproc>>>(n);
-    cudaThreadSynchronize();
-    CUT_CHECK_ERROR("Cuda_PairGranHookeCuda: (no binning) virial compute Kernel execution failed");
-  }
-
-  MYDBG( printf("# CUDA: Cuda_PairGranHookeCoulLongCuda: kernel done\n"); )
-
-}
-
-
-#undef _kn
-#undef _kt
-#undef _gamman
-#undef _gammat
-#undef _xmu
-#undef _dampflag
-
-
diff --git a/lib/cuda/pair_gran_hooke_cuda_cu.h b/lib/cuda/pair_gran_hooke_cuda_cu.h
deleted file mode 100644
index 03cbd36519..0000000000
--- a/lib/cuda/pair_gran_hooke_cuda_cu.h
+++ /dev/null
@@ -1,26 +0,0 @@
-/* ----------------------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-
-   Original Version:
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   See the README file in the top-level LAMMPS directory.
-
-   -----------------------------------------------------------------------
-
-   USER-CUDA Package and associated modifications:
-   https://sourceforge.net/projects/lammpscuda/
-
-   Christian Trott, christian.trott@tu-ilmenau.de
-   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
-   Theoretical Physics II, University of Technology Ilmenau, Germany
-
-   See the README file in the USER-CUDA directory.
-
-   This software is distributed under the GNU General Public License.
-------------------------------------------------------------------------- */
-
-#include "cuda_shared.h"
-
-extern "C" void Cuda_PairGranHookeCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,int eflag_atom,int vflag_atom);
diff --git a/lib/cuda/pair_gran_hooke_cuda_kernel_nc.cu b/lib/cuda/pair_gran_hooke_cuda_kernel_nc.cu
deleted file mode 100644
index f063def443..0000000000
--- a/lib/cuda/pair_gran_hooke_cuda_kernel_nc.cu
+++ /dev/null
@@ -1,219 +0,0 @@
-/* ----------------------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-
-   Original Version:
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   See the README file in the top-level LAMMPS directory.
-
-   -----------------------------------------------------------------------
-
-   USER-CUDA Package and associated modifications:
-   https://sourceforge.net/projects/lammpscuda/
-
-   Christian Trott, christian.trott@tu-ilmenau.de
-   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
-   Theoretical Physics II, University of Technology Ilmenau, Germany
-
-   See the README file in the USER-CUDA directory.
-
-   This software is distributed under the GNU General Public License.
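The energy/virial tally above is the same two-pass scheme the EAM path uses: the force kernel reduces its accumulators per block and writes one partial per block into _buffer, then PairVirialCompute_reduce collapses the n = grid.x*grid.y partials, one block per tallied quantity. A minimal standalone sketch of that second pass (names and layout assumed, not the deleted kernel):

// Second-pass reduction: one block per quantity (eng, vxx, vyy, ...),
// launched as reduce<<<nquantities, 256, 256*sizeof(double)>>>(partials, out, n).
__global__ void virial_reduce_sketch(const double* partials, double* out, int n)
{
  extern __shared__ double red[];
  const int q = blockIdx.x;              // which quantity this block reduces

  double sum = 0.0;
  for(int i = threadIdx.x; i < n; i += blockDim.x)
    sum += partials[q * n + i];          // grid-strided load of block partials
  red[threadIdx.x] = sum;

  for(int s = blockDim.x / 2; s > 0; s >>= 1) {   // power-of-two blockDim
    __syncthreads();
    if(threadIdx.x < s) red[threadIdx.x] += red[threadIdx.x + s];
  }
  if(threadIdx.x == 0) out[q] = red[0];
}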
-------------------------------------------------------------------------- */ - - -__global__ void PairGranHookeCuda_Kernel(int eflag, int vflag,int eflag_atom,int vflag_atom,int** firstneight,int* binned_id -,F_FLOAT kn,F_FLOAT gamman,F_FLOAT gammat, F_FLOAT xmu) -{ - ENERGY_FLOAT evdwl = ENERGY_F(0.0); - - ENERGY_FLOAT* sharedE; - ENERGY_FLOAT* sharedV; - - if(eflag||eflag_atom) - { - sharedE = &sharedmem[threadIdx.x]; - sharedV = &sharedmem[0]; - sharedE[0] = ENERGY_F(0.0); sharedV+=blockDim.x; - } - if(vflag||vflag_atom) - { - sharedV += threadIdx.x; - sharedV[0*blockDim.x] = ENERGY_F(0.0); - sharedV[1*blockDim.x] = ENERGY_F(0.0); - sharedV[2*blockDim.x] = ENERGY_F(0.0); - sharedV[3*blockDim.x] = ENERGY_F(0.0); - sharedV[4*blockDim.x] = ENERGY_F(0.0); - sharedV[5*blockDim.x] = ENERGY_F(0.0); - } - - int ii = (blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x; - MYEMUDBG( if(ii==0) printf("# CUDA: PairGranHookeCuda_Kernel: -- no binning --\n"); ) - - X_FLOAT xtmp,ytmp,ztmp; - X_FLOAT4 myxtype; - V_FLOAT4 myvradius, ovradius; - F_FLOAT fxtmp,fytmp,fztmp,torquextmp,torqueytmp,torqueztmp; - F_FLOAT delx,dely,delz; - F_FLOAT radi,radj,radsum,r,rsqinv; - F_FLOAT vr1,vr2,vr3,vnnr,vn1,vn2,vn3,vt1,vt2,vt3; - F_FLOAT wr1,wr2,wr3; - F_FLOAT vtr1,vtr2,vtr3,vrel; - F_FLOAT meff,damp,ccel,tor1,tor2,tor3; - F_FLOAT fn,fs,ft,fs1,fs2,fs3; - - int jnum =0; - int i,j; - int* jlist; - - if(ii < _inum) - { - i = _ilist[ii]; - - myxtype = fetchXType(i); - myvradius = fetchVRadius(i); - - xtmp=myxtype.x; - ytmp=myxtype.y; - ztmp=myxtype.z; - radi = myvradius.w; - - fxtmp = F_F(0.0); - fytmp = F_F(0.0); - fztmp = F_F(0.0); - torquextmp = F_F(0.0); - torqueytmp = F_F(0.0); - torqueztmp = F_F(0.0); - - jnum = _numneigh[i]; - - jlist = &_neighbors[i]; - } - __syncthreads(); - - for (int jj = 0; jj < jnum; jj++) - { - if(ii < _inum) - if(jj - -#define _lj1 MY_AP(coeff1) -#define _lj2 MY_AP(coeff2) -#define _lj3 MY_AP(coeff3) -#define _lj4 MY_AP(coeff4) - -#include "pair_lj96_cut_cuda_cu.h" -#include "pair_lj96_cut_cuda_kernel_nc.cu" -#include - - - - -void Cuda_PairLJ96CutCuda_Init(cuda_shared_data* sdata) -{ - Cuda_Pair_Init_AllStyles(sdata, 4, false, false ); -} - - - - -void Cuda_PairLJ96CutCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,int eflag_atom,int vflag_atom) -{ - - // initialize only on first call - static short init=0; - if(! init) - { - init = 1; - Cuda_PairLJ96CutCuda_Init(sdata); - } - - dim3 grid,threads; - int sharedperproc; - - Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc,false,256); - - cudaStream_t* streams = (cudaStream_t*) CudaWrapper_returnStreams(); - if(sdata->pair.use_block_per_atom) - Pair_Kernel_BpA - <<>> (eflag, vflag,eflag_atom,vflag_atom); - else - Pair_Kernel_TpA - <<>> (eflag, vflag,eflag_atom,vflag_atom); - - Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag); -} - -#undef _lj1 -#undef _lj2 -#undef _lj3 -#undef _lj4 - - diff --git a/lib/cuda/pair_lj96_cut_cuda_cu.h b/lib/cuda/pair_lj96_cut_cuda_cu.h deleted file mode 100644 index 24763103a7..0000000000 --- a/lib/cuda/pair_lj96_cut_cuda_cu.h +++ /dev/null @@ -1,26 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - - Original Version: - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - See the README file in the top-level LAMMPS directory. 
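The PairGranHookeCuda_Kernel shown above (its inner contact loop is truncated in this hunk) implements a Hookean granular contact: grains interact only while overlapping (r < radi + radj), with a spring force kn*delta along the normal, damped by the effective mass meff and gamman, and a tangential force capped by the Coulomb limit xmu*|Fn|. A scalar sketch of the normal part, following the convention of the CPU pair_gran_hooke style (F = ccel * (delx, dely, delz)); the deleted kernel may organize this differently:

// Normal-force coefficient of a Hookean granular contact, scalar sketch.
// vnnr = relative velocity projected on del; meff = mi*mj/(mi+mj).
#include <cmath>
#include <cstdio>

static double hooke_ccel(double rsq, double radsum, double kn,
                         double gamman, double meff, double vnnr)
{
  const double r = std::sqrt(rsq);
  if(r >= radsum) return 0.0;                      // spheres not in contact
  const double rinv = 1.0 / r;
  const double damp = meff * gamman * vnnr / rsq;  // normal velocity damping
  return kn * (radsum - r) * rinv - damp;          // spring minus damping
}

int main()
{
  // two unit-radius grains with a slight overlap, closing at vnnr = 0.1
  std::printf("%g\n", hooke_ccel(3.61, 2.0, 200000.0, 50.0, 0.5, 0.1));
  return 0;
}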
- - ----------------------------------------------------------------------- - - USER-CUDA Package and associated modifications: - https://sourceforge.net/projects/lammpscuda/ - - Christian Trott, christian.trott@tu-ilmenau.de - Lars Winterfeld, lars.winterfeld@tu-ilmenau.de - Theoretical Physics II, University of Technology Ilmenau, Germany - - See the README file in the USER-CUDA directory. - - This software is distributed under the GNU General Public License. -------------------------------------------------------------------------- */ - -#include "cuda_shared.h" - -extern "C" void Cuda_PairLJ96CutCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,int eflag_atom,int vflag_atom); diff --git a/lib/cuda/pair_lj96_cut_cuda_kernel_nc.cu b/lib/cuda/pair_lj96_cut_cuda_kernel_nc.cu deleted file mode 100644 index 28ccb839ba..0000000000 --- a/lib/cuda/pair_lj96_cut_cuda_kernel_nc.cu +++ /dev/null @@ -1,33 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - - Original Version: - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - See the README file in the top-level LAMMPS directory. - - ----------------------------------------------------------------------- - - USER-CUDA Package and associated modifications: - https://sourceforge.net/projects/lammpscuda/ - - Christian Trott, christian.trott@tu-ilmenau.de - Lars Winterfeld, lars.winterfeld@tu-ilmenau.de - Theoretical Physics II, University of Technology Ilmenau, Germany - - See the README file in the USER-CUDA directory. - - This software is distributed under the GNU General Public License. -------------------------------------------------------------------------- */ - -__device__ inline F_FLOAT PairLJ96CutCuda_Eval(const F_FLOAT& rsq,const int ij_type,F_FLOAT& factor_lj,int& eflag, ENERGY_FLOAT& evdwl) -{ - const F_FLOAT r2inv = F_F(1.0)/rsq; - const F_FLOAT r6inv = r2inv*r2inv*r2inv; - const F_FLOAT r3inv = _SQRT_(r6inv); - const F_FLOAT forcelj = r6inv * (_lj1[ij_type]*r3inv - _lj2[ij_type]); - if(eflag) evdwl += factor_lj*(r6inv*(_lj3[ij_type]*r3inv-_lj4[ij_type]) - _offset[ij_type]); - return factor_lj*forcelj*r2inv; -} - diff --git a/lib/cuda/pair_lj_charmm_coul_charmm_cuda.cu b/lib/cuda/pair_lj_charmm_coul_charmm_cuda.cu deleted file mode 100644 index b5a12755da..0000000000 --- a/lib/cuda/pair_lj_charmm_coul_charmm_cuda.cu +++ /dev/null @@ -1,78 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - - Original Version: - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - See the README file in the top-level LAMMPS directory. - - ----------------------------------------------------------------------- - - USER-CUDA Package and associated modifications: - https://sourceforge.net/projects/lammpscuda/ - - Christian Trott, christian.trott@tu-ilmenau.de - Lars Winterfeld, lars.winterfeld@tu-ilmenau.de - Theoretical Physics II, University of Technology Ilmenau, Germany - - See the README file in the USER-CUDA directory. - - This software is distributed under the GNU General Public License. 
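PairLJ96CutCuda_Eval above needs r^-3 as well as r^-6, hence r3inv = _SQRT_(r6inv). Writing the potential as E = A/r^9 - B/r^6, the returned force factor is -dE/dr * (1/r) = (9A/r^9 - 6B/r^6)/r^2, so the precomputed coefficients must satisfy lj1 = 9A, lj2 = 6B, lj3 = A, lj4 = B (up to the style's epsilon/sigma prefactors). A standalone check of that algebra:

// Verifies r6inv*(lj1*r3inv - lj2)*r2inv against the direct derivative.
#include <cmath>
#include <cstdio>

int main()
{
  const double A = 2.0, B = 3.0, r = 1.2;       // E(r) = A/r^9 - B/r^6
  const double lj1 = 9.0 * A, lj2 = 6.0 * B;    // assumed precomputation

  const double fpair_ref = (9.0 * A * std::pow(r, -10.0)
                          - 6.0 * B * std::pow(r, -7.0)) / r;

  const double r2inv = 1.0 / (r * r), r6inv = r2inv * r2inv * r2inv;
  const double r3inv = std::sqrt(r6inv);
  const double fpair_gpu = r6inv * (lj1 * r3inv - lj2) * r2inv;

  std::printf("%.12e %.12e\n", fpair_ref, fpair_gpu);   // agree to roundoff
  return 0;
}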
-------------------------------------------------------------------------- */ - -#include - -#define _lj1 MY_AP(coeff1_gm) -#define _lj2 MY_AP(coeff2_gm) -#define _lj3 MY_AP(coeff3_gm) -#define _lj4 MY_AP(coeff4_gm) - -#include "pair_lj_charmm_coul_charmm_cuda_cu.h" -#include "pair_lj_charmm_coul_charmm_cuda_kernel_nc.cu" - -#include - -void Cuda_PairLJCharmmCoulCharmmCuda_Init(cuda_shared_data* sdata,F_FLOAT cut_coul_innersq,F_FLOAT denom_lj_inv,F_FLOAT denom_coul_inv) -{ - Cuda_Pair_Init_AllStyles(sdata, 4,true,true,true); - cudaMemcpyToSymbol(MY_CONST(cut_coul_innersq_global) , &cut_coul_innersq , sizeof(F_FLOAT) ); - cudaMemcpyToSymbol(MY_CONST(denom_lj_inv) , &denom_lj_inv , sizeof(F_FLOAT) ); - cudaMemcpyToSymbol(MY_CONST(denom_coul_inv) , &denom_coul_inv , sizeof(F_FLOAT) ); - - return; -} - - - -void Cuda_PairLJCharmmCoulCharmmCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, - int eflag_atom,int vflag_atom,F_FLOAT denom_lj,F_FLOAT cut_coul_innersq,F_FLOAT denom_coul) -{ - - static short init=0; - if(! init) - { - init = 1; - Cuda_PairLJCharmmCoulCharmmCuda_Init(sdata,cut_coul_innersq,1.0/denom_lj,1.0/denom_coul); - } - - dim3 grid,threads; - int sharedperproc; - - Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc,true,192); - - cudaStream_t* streams = (cudaStream_t*) CudaWrapper_returnStreams(); - if(sdata->pair.use_block_per_atom) - Pair_Kernel_BpA - <<>> (eflag, vflag,eflag_atom,vflag_atom); - else - Pair_Kernel_TpA - <<>> (eflag, vflag,eflag_atom,vflag_atom); - - Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag); -} - -#undef _lj1 -#undef _lj2 -#undef _lj3 -#undef _lj4 diff --git a/lib/cuda/pair_lj_charmm_coul_charmm_cuda_cu.h b/lib/cuda/pair_lj_charmm_coul_charmm_cuda_cu.h deleted file mode 100644 index 3b96ab4481..0000000000 --- a/lib/cuda/pair_lj_charmm_coul_charmm_cuda_cu.h +++ /dev/null @@ -1,26 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - - Original Version: - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - See the README file in the top-level LAMMPS directory. - - ----------------------------------------------------------------------- - - USER-CUDA Package and associated modifications: - https://sourceforge.net/projects/lammpscuda/ - - Christian Trott, christian.trott@tu-ilmenau.de - Lars Winterfeld, lars.winterfeld@tu-ilmenau.de - Theoretical Physics II, University of Technology Ilmenau, Germany - - See the README file in the USER-CUDA directory. - - This software is distributed under the GNU General Public License. 
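The evaluators deleted just below implement CHARMM's energy switching between the inner and outer cutoffs. Writing x = r^2, rin2 for the squared inner cutoff, and rc2 for the squared outer cutoff, the switching function and the switched force read

  S(x)      = (rc2 - x)^2 * (rc2 + 2*x - 3*rin2) / (rc2 - rin2)^3
  E_sw      = E_lj * S(x)
  (F*r)_sw  = (F*r)_lj * S(x) + E_lj * 12*x*(rc2 - x)*(x - rin2) / (rc2 - rin2)^3

where the last term equals -2*r^2*dS/dx; switch1 and switch2 in the code are exactly these two factors, and denom_lj = (rc2 - rin2)^3 is the denominator whose reciprocal the host precomputes. One oddity worth noting: CoulCharmmCuda_Eval folds factor_coul into forcecoul and ecoul_tmp up front, then multiplies ecoul_tmp by factor_coul again when accumulating, which looks like a double application of the special-bonds factor to the energy.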
-------------------------------------------------------------------------- */ - -#include "cuda_shared.h" - -extern "C" void Cuda_PairLJCharmmCoulCharmmCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,int eflag_atom,int vflag_atom,F_FLOAT denom_lj,F_FLOAT cut_coul_innersq,F_FLOAT denom_coul); diff --git a/lib/cuda/pair_lj_charmm_coul_charmm_cuda_kernel_nc.cu b/lib/cuda/pair_lj_charmm_coul_charmm_cuda_kernel_nc.cu deleted file mode 100644 index baaea5d4e5..0000000000 --- a/lib/cuda/pair_lj_charmm_coul_charmm_cuda_kernel_nc.cu +++ /dev/null @@ -1,72 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - - Original Version: - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - See the README file in the top-level LAMMPS directory. - - ----------------------------------------------------------------------- - - USER-CUDA Package and associated modifications: - https://sourceforge.net/projects/lammpscuda/ - - Christian Trott, christian.trott@tu-ilmenau.de - Lars Winterfeld, lars.winterfeld@tu-ilmenau.de - Theoretical Physics II, University of Technology Ilmenau, Germany - - See the README file in the USER-CUDA directory. - - This software is distributed under the GNU General Public License. -------------------------------------------------------------------------- */ -__device__ inline F_FLOAT PairLJCharmmCuda_Eval(const F_FLOAT& rsq,const int ij_type,F_FLOAT& factor_lj,int& eflag, ENERGY_FLOAT& evdwl) -{ - const F_FLOAT r2inv = F_F(1.0)/rsq; - const F_FLOAT r6inv = r2inv*r2inv*r2inv; - F_FLOAT forcelj = r6inv * (_lj1[ij_type]*r6inv - _lj2[ij_type]); - F_FLOAT philj,switch1; - if(rsq > _cut_innersq_global) - { - switch1 = (_cutsq_global-rsq) * (_cutsq_global-rsq) * - (_cutsq_global + F_F(2.0)*rsq - F_F(3.0)*_cut_innersq_global) * _denom_lj_inv; - const F_FLOAT switch2 = F_F(12.0)*rsq * (_cutsq_global-rsq) * - (rsq-_cut_innersq_global) * _denom_lj_inv; - philj = r6inv * (_lj3[ij_type]*r6inv - _lj4[ij_type]); - forcelj = forcelj*switch1 + philj*switch2; - } - - if (eflag) - { - ENERGY_FLOAT evdwl_tmp = factor_lj; - if (rsq > _cut_innersq_global) - { - evdwl_tmp*=philj*switch1; - } - else - evdwl_tmp*= r6inv * (_lj3[ij_type]*r6inv - _lj4[ij_type]); - evdwl+=evdwl_tmp; - } - - return factor_lj*forcelj*r2inv; -} - -__device__ inline F_FLOAT CoulCharmmCuda_Eval(const F_FLOAT& rsq,F_FLOAT& factor_coul,int& eflag, ENERGY_FLOAT& ecoul, F_FLOAT qij) -{ - F_FLOAT forcecoul; - ENERGY_FLOAT ecoul_tmp = forcecoul = _qqrd2e * qij *_RSQRT_(rsq)*factor_coul; - if (rsq > _cut_coul_innersq_global) { - const F_FLOAT switch1 = (_cut_coulsq_global-rsq) * (_cut_coulsq_global-rsq) * - (_cut_coulsq_global + F_F(2.0)*rsq - F_F(3.0)*_cut_coul_innersq_global) * _denom_coul_inv; - ecoul_tmp *= switch1; - const F_FLOAT switch2 = F_F(12.0)*rsq * (_cut_coulsq_global-rsq) * - (rsq-_cut_coul_innersq_global) * _denom_coul_inv; - forcecoul *= switch1 + switch2; - } - if(eflag) - { - ecoul += ecoul_tmp*factor_coul; - } - return forcecoul*(F_F(1.0)/rsq); -} - diff --git a/lib/cuda/pair_lj_charmm_coul_charmm_implicit_cuda.cu b/lib/cuda/pair_lj_charmm_coul_charmm_implicit_cuda.cu deleted file mode 100644 index 9bfb0bcc0e..0000000000 --- a/lib/cuda/pair_lj_charmm_coul_charmm_implicit_cuda.cu +++ /dev/null @@ -1,85 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel 
Simulator - - Original Version: - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - See the README file in the top-level LAMMPS directory. - - ----------------------------------------------------------------------- - - USER-CUDA Package and associated modifications: - https://sourceforge.net/projects/lammpscuda/ - - Christian Trott, christian.trott@tu-ilmenau.de - Lars Winterfeld, lars.winterfeld@tu-ilmenau.de - Theoretical Physics II, University of Technology Ilmenau, Germany - - See the README file in the USER-CUDA directory. - - This software is distributed under the GNU General Public License. -------------------------------------------------------------------------- */ - -#include - -#define _lj1 MY_AP(coeff1_gm) -#define _lj2 MY_AP(coeff2_gm) -#define _lj3 MY_AP(coeff3_gm) -#define _lj4 MY_AP(coeff4_gm) -#define _cut_coul_innersq_global MY_AP(cut_coul_innersq_global) -#define _denom_lj_inv MY_AP(denom_lj_inv) -#define _denom_coul_inv MY_AP(denom_coul_inv) -__device__ __constant__ F_FLOAT _cut_coul_innersq_global; -__device__ __constant__ F_FLOAT _denom_lj_inv; -__device__ __constant__ F_FLOAT _denom_coul_inv; - - -#include "pair_lj_charmm_coul_charmm_implicit_cuda_cu.h" -#include "pair_lj_charmm_coul_charmm_implicit_cuda_kernel_nc.cu" - -#include - -void Cuda_PairLJCharmmCoulCharmmImplicitCuda_Init(cuda_shared_data* sdata,F_FLOAT cut_coul_innersq,F_FLOAT denom_lj_inv,F_FLOAT denom_coul_inv) -{ - Cuda_Pair_Init_AllStyles(sdata, 4,true,true,true); - cudaMemcpyToSymbol(MY_CONST(cut_coul_innersq_global) , &cut_coul_innersq , sizeof(F_FLOAT) ); - cudaMemcpyToSymbol(MY_CONST(denom_lj_inv) , &denom_lj_inv , sizeof(F_FLOAT) ); - cudaMemcpyToSymbol(MY_CONST(denom_coul_inv) , &denom_coul_inv , sizeof(F_FLOAT) ); - - return; -} - - - -void Cuda_PairLJCharmmCoulCharmmImplicitCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, - int eflag_atom,int vflag_atom,F_FLOAT denom_lj,F_FLOAT cut_coul_innersq,F_FLOAT denom_coul) -{ - - static short init=0; - if(! init) - { - init = 1; - Cuda_PairLJCharmmCoulCharmmImplicitCuda_Init(sdata,cut_coul_innersq,1.0/denom_lj,1.0/denom_coul); - } - - dim3 grid,threads; - int sharedperproc; - - Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc,true,192); - - cudaStream_t* streams = (cudaStream_t*) CudaWrapper_returnStreams(); - if(sdata->pair.use_block_per_atom) - Pair_Kernel_BpA - <<>> (eflag, vflag,eflag_atom,vflag_atom); - else - Pair_Kernel_TpA - <<>> (eflag, vflag,eflag_atom,vflag_atom); - - Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag); -} - -#undef _lj1 -#undef _lj2 -#undef _lj3 -#undef _lj4 diff --git a/lib/cuda/pair_lj_charmm_coul_charmm_implicit_cuda_cu.h b/lib/cuda/pair_lj_charmm_coul_charmm_implicit_cuda_cu.h deleted file mode 100644 index 119163b291..0000000000 --- a/lib/cuda/pair_lj_charmm_coul_charmm_implicit_cuda_cu.h +++ /dev/null @@ -1,26 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - - Original Version: - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - See the README file in the top-level LAMMPS directory. 
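The implicit variant above shows the constant-memory idiom used for every scalar parameter in these files: a __device__ __constant__ symbol is declared under a style-specific prefix via MY_AP/MY_CONST, the host inverts any denominators once, and cudaMemcpyToSymbol copies the value before the first kernel launch, so each thread reads it through the constant cache with no per-pair division. A self-contained sketch of the pattern, with the MY_AP prefixing dropped and the precision typedef stubbed out:

  #include <cuda_runtime.h>

  typedef double F_FLOAT;          // stand-in for the precision-selected typedef
  #define F_F(x) x                 // stand-in for the precision literal macro

  __device__ __constant__ F_FLOAT _denom_lj_inv;   // broadcast-read by all threads

  // Host side: invert once, copy into constant memory before the first launch.
  void init_constants(F_FLOAT denom_lj)
  {
    const F_FLOAT denom_lj_inv = F_F(1.0) / denom_lj;
    cudaMemcpyToSymbol(_denom_lj_inv, &denom_lj_inv, sizeof(F_FLOAT));
  }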
- - ----------------------------------------------------------------------- - - USER-CUDA Package and associated modifications: - https://sourceforge.net/projects/lammpscuda/ - - Christian Trott, christian.trott@tu-ilmenau.de - Lars Winterfeld, lars.winterfeld@tu-ilmenau.de - Theoretical Physics II, University of Technology Ilmenau, Germany - - See the README file in the USER-CUDA directory. - - This software is distributed under the GNU General Public License. -------------------------------------------------------------------------- */ - -#include "cuda_shared.h" - -extern "C" void Cuda_PairLJCharmmCoulCharmmImplicitCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,int eflag_atom,int vflag_atom,F_FLOAT denom_lj,F_FLOAT cut_coul_innersq,F_FLOAT denom_coul); diff --git a/lib/cuda/pair_lj_charmm_coul_charmm_implicit_cuda_kernel_nc.cu b/lib/cuda/pair_lj_charmm_coul_charmm_implicit_cuda_kernel_nc.cu deleted file mode 100644 index c67037b7ce..0000000000 --- a/lib/cuda/pair_lj_charmm_coul_charmm_implicit_cuda_kernel_nc.cu +++ /dev/null @@ -1,42 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - - Original Version: - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - See the README file in the top-level LAMMPS directory. - - ----------------------------------------------------------------------- - - USER-CUDA Package and associated modifications: - https://sourceforge.net/projects/lammpscuda/ - - Christian Trott, christian.trott@tu-ilmenau.de - Lars Winterfeld, lars.winterfeld@tu-ilmenau.de - Theoretical Physics II, University of Technology Ilmenau, Germany - - See the README file in the USER-CUDA directory. - - This software is distributed under the GNU General Public License. -------------------------------------------------------------------------- */ - -__device__ inline F_FLOAT CoulCharmmImplicitCuda_Eval(const F_FLOAT& rsq,F_FLOAT& factor_coul,int& eflag, ENERGY_FLOAT& ecoul, F_FLOAT qij) -{ - F_FLOAT forcecoul; - ENERGY_FLOAT ecoul_tmp = forcecoul = _qqrd2e * qij *(F_F(1.0)/rsq)*factor_coul; - if (rsq > _cut_coul_innersq_global) { - const F_FLOAT switch1 = (_cut_coulsq_global-rsq) * (_cut_coulsq_global-rsq) * - (_cut_coulsq_global + F_F(2.0)*rsq - F_F(3.0)*_cut_coul_innersq_global) * _denom_coul_inv; - ecoul_tmp *= switch1; - const F_FLOAT switch2 = F_F(12.0)*rsq * (_cut_coulsq_global-rsq) * - (rsq-_cut_coul_innersq_global) * _denom_coul_inv; - forcecoul *= (switch1 + switch2); - } - if(eflag) - { - ecoul += ecoul_tmp*factor_coul; - } - return F_F(2.0)*forcecoul*(F_F(1.0)/rsq); -} - diff --git a/lib/cuda/pair_lj_charmm_coul_long_cuda.cu b/lib/cuda/pair_lj_charmm_coul_long_cuda.cu deleted file mode 100644 index 7c1a5ac46c..0000000000 --- a/lib/cuda/pair_lj_charmm_coul_long_cuda.cu +++ /dev/null @@ -1,75 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - - Original Version: - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - See the README file in the top-level LAMMPS directory. 
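CoulCharmmImplicitCuda_Eval above replaces _RSQRT_(rsq) with 1/rsq: implicit solvent here means a distance-dependent dielectric proportional to r, so the pair energy is E = qqrd2e*qi*qj/r^2 instead of qqrd2e*qi*qj/r. Then F = -dE/dr = 2*qqrd2e*qi*qj/r^3, which is where the explicit factor F_F(2.0) in the return statement comes from; the CHARMM switching above the inner cutoff is applied unchanged.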
- - ----------------------------------------------------------------------- - - USER-CUDA Package and associated modifications: - https://sourceforge.net/projects/lammpscuda/ - - Christian Trott, christian.trott@tu-ilmenau.de - Lars Winterfeld, lars.winterfeld@tu-ilmenau.de - Theoretical Physics II, University of Technology Ilmenau, Germany - - See the README file in the USER-CUDA directory. - - This software is distributed under the GNU General Public License. -------------------------------------------------------------------------- */ - -#include - -#define _lj1 MY_AP(coeff1_gm) -#define _lj2 MY_AP(coeff2_gm) -#define _lj3 MY_AP(coeff3_gm) -#define _lj4 MY_AP(coeff4_gm) - -#include "pair_lj_charmm_coul_long_cuda_cu.h" - -#include - -void Cuda_PairLJCharmmCoulLongCuda_Init(cuda_shared_data* sdata,F_FLOAT denom_lj_inv) -{ - Cuda_Pair_Init_AllStyles(sdata, 4,true,true,true); - cudaMemcpyToSymbol(MY_CONST(denom_lj_inv) , &denom_lj_inv , sizeof(F_FLOAT) ); - - return; -} - - - -void Cuda_PairLJCharmmCoulLongCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, - int eflag_atom,int vflag_atom,F_FLOAT denom_lj) -{ - - static short init=0; - if(! init) - { - init = 1; - Cuda_PairLJCharmmCoulLongCuda_Init(sdata,1.0/denom_lj); - } - - dim3 grid,threads; - int sharedperproc; - - Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc,true,192); - - cudaStream_t* streams = (cudaStream_t*) CudaWrapper_returnStreams(); - if(sdata->pair.use_block_per_atom) - Pair_Kernel_BpA - <<>> (eflag, vflag,eflag_atom,vflag_atom); - else - Pair_Kernel_TpA - <<>> (eflag, vflag,eflag_atom,vflag_atom); - - Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag); -} - -#undef _lj1 -#undef _lj2 -#undef _lj3 -#undef _lj4 diff --git a/lib/cuda/pair_lj_charmm_coul_long_cuda_cu.h b/lib/cuda/pair_lj_charmm_coul_long_cuda_cu.h deleted file mode 100644 index 0f29e8f97b..0000000000 --- a/lib/cuda/pair_lj_charmm_coul_long_cuda_cu.h +++ /dev/null @@ -1,26 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - - Original Version: - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - See the README file in the top-level LAMMPS directory. - - ----------------------------------------------------------------------- - - USER-CUDA Package and associated modifications: - https://sourceforge.net/projects/lammpscuda/ - - Christian Trott, christian.trott@tu-ilmenau.de - Lars Winterfeld, lars.winterfeld@tu-ilmenau.de - Theoretical Physics II, University of Technology Ilmenau, Germany - - See the README file in the USER-CUDA directory. - - This software is distributed under the GNU General Public License. 
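The *coul/long drivers in this set (charmm/coul/long here, plus the class2 and lj/cut variants below) configure only the LJ side; at least in the files deleted here, no CoulLong evaluator appears, the real-space Coulomb term being handled by the shared pair kernels. A hedged sketch of that standard Ewald/PPPM real-space term, with the g_ewald parameter name an assumption:

  // Sketch of the real-space coul/long contribution (not the deleted original).
  __device__ inline double coul_long_sketch(double rsq, double qij,
                                            double g_ewald, double qqrd2e)
  {
    const double r      = sqrt(rsq);
    const double grij   = g_ewald * r;
    const double expm2  = exp(-grij * grij);
    const double prefac = qqrd2e * qij / r;
    // erfc(g*r)/r energy kernel plus its derivative correction; dividing by
    // r^2 converts the force magnitude into the F/r convention used here.
    const double forcecoul = prefac * (erfc(grij) + 1.1283791670955126 * grij * expm2);
    return forcecoul / rsq;   // 1.1283791670955126 = 2/sqrt(pi)
  }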
-------------------------------------------------------------------------- */ - -#include "cuda_shared.h" - -extern "C" void Cuda_PairLJCharmmCoulLongCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,int eflag_atom,int vflag_atom,F_FLOAT denom_lj); diff --git a/lib/cuda/pair_lj_class2_coul_cut_cuda.cu b/lib/cuda/pair_lj_class2_coul_cut_cuda.cu deleted file mode 100644 index 7cd53d31ff..0000000000 --- a/lib/cuda/pair_lj_class2_coul_cut_cuda.cu +++ /dev/null @@ -1,72 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - - Original Version: - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - See the README file in the top-level LAMMPS directory. - - ----------------------------------------------------------------------- - - USER-CUDA Package and associated modifications: - https://sourceforge.net/projects/lammpscuda/ - - Christian Trott, christian.trott@tu-ilmenau.de - Lars Winterfeld, lars.winterfeld@tu-ilmenau.de - Theoretical Physics II, University of Technology Ilmenau, Germany - - See the README file in the USER-CUDA directory. - - This software is distributed under the GNU General Public License. -------------------------------------------------------------------------- */ - -#include - -#define _lj1 MY_AP(coeff1) -#define _lj2 MY_AP(coeff2) -#define _lj3 MY_AP(coeff3) -#define _lj4 MY_AP(coeff4) - -#include "pair_lj_class2_coul_cut_cuda_cu.h" - -#include - -void Cuda_PairLJClass2CoulCutCuda_Init(cuda_shared_data* sdata) -{ - Cuda_Pair_Init_AllStyles(sdata, 4,true); -} - -void Cuda_PairLJClass2CoulCutCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,int eflag_atom,int vflag_atom) -{ - - - static short init=0; - if(! init) - { - init = 1; - Cuda_PairLJClass2CoulCutCuda_Init(sdata); - } - - dim3 grid,threads; - int sharedperproc; - - - Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc,true,192); - - cudaStream_t* streams = (cudaStream_t*) CudaWrapper_returnStreams(); - if(sdata->pair.use_block_per_atom) - Pair_Kernel_BpA - <<>> (eflag, vflag,eflag_atom,vflag_atom); - else - Pair_Kernel_TpA - <<>> (eflag, vflag,eflag_atom,vflag_atom); - - Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag); -} - - -#undef _lj1 -#undef _lj2 -#undef _lj3 -#undef _lj4 diff --git a/lib/cuda/pair_lj_class2_coul_cut_cuda_cu.h b/lib/cuda/pair_lj_class2_coul_cut_cuda_cu.h deleted file mode 100644 index a656ebbd89..0000000000 --- a/lib/cuda/pair_lj_class2_coul_cut_cuda_cu.h +++ /dev/null @@ -1,26 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - - Original Version: - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - See the README file in the top-level LAMMPS directory. - - ----------------------------------------------------------------------- - - USER-CUDA Package and associated modifications: - https://sourceforge.net/projects/lammpscuda/ - - Christian Trott, christian.trott@tu-ilmenau.de - Lars Winterfeld, lars.winterfeld@tu-ilmenau.de - Theoretical Physics II, University of Technology Ilmenau, Germany - - See the README file in the USER-CUDA directory. - - This software is distributed under the GNU General Public License. 
-------------------------------------------------------------------------- */ - -#include "cuda_shared.h" - -extern "C" void Cuda_PairLJClass2CoulCutCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,int eflag_atom,int vflag_atom); diff --git a/lib/cuda/pair_lj_class2_coul_long_cuda.cu b/lib/cuda/pair_lj_class2_coul_long_cuda.cu deleted file mode 100644 index 4f15d42936..0000000000 --- a/lib/cuda/pair_lj_class2_coul_long_cuda.cu +++ /dev/null @@ -1,72 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - - Original Version: - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - See the README file in the top-level LAMMPS directory. - - ----------------------------------------------------------------------- - - USER-CUDA Package and associated modifications: - https://sourceforge.net/projects/lammpscuda/ - - Christian Trott, christian.trott@tu-ilmenau.de - Lars Winterfeld, lars.winterfeld@tu-ilmenau.de - Theoretical Physics II, University of Technology Ilmenau, Germany - - See the README file in the USER-CUDA directory. - - This software is distributed under the GNU General Public License. -------------------------------------------------------------------------- */ - -#include - -#define _lj1 MY_AP(coeff1) -#define _lj2 MY_AP(coeff2) -#define _lj3 MY_AP(coeff3) -#define _lj4 MY_AP(coeff4) - -#include "pair_lj_class2_coul_long_cuda_cu.h" - -#include - -void Cuda_PairLJClass2CoulLongCuda_Init(cuda_shared_data* sdata) -{ - Cuda_Pair_Init_AllStyles(sdata, 4,true); -} - -void Cuda_PairLJClass2CoulLongCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,int eflag_atom,int vflag_atom) -{ - - - static short init=0; - if(! init) - { - init = 1; - Cuda_PairLJClass2CoulLongCuda_Init(sdata); - } - - dim3 grid,threads; - int sharedperproc; - - - Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc,true,192); - - cudaStream_t* streams = (cudaStream_t*) CudaWrapper_returnStreams(); - if(sdata->pair.use_block_per_atom) - Pair_Kernel_BpA - <<>> (eflag, vflag,eflag_atom,vflag_atom); - else - Pair_Kernel_TpA - <<>> (eflag, vflag,eflag_atom,vflag_atom); - - Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag); -} - - -#undef _lj1 -#undef _lj2 -#undef _lj3 -#undef _lj4 diff --git a/lib/cuda/pair_lj_class2_coul_long_cuda_cu.h b/lib/cuda/pair_lj_class2_coul_long_cuda_cu.h deleted file mode 100644 index dea620defe..0000000000 --- a/lib/cuda/pair_lj_class2_coul_long_cuda_cu.h +++ /dev/null @@ -1,26 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - - Original Version: - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - See the README file in the top-level LAMMPS directory. - - ----------------------------------------------------------------------- - - USER-CUDA Package and associated modifications: - https://sourceforge.net/projects/lammpscuda/ - - Christian Trott, christian.trott@tu-ilmenau.de - Lars Winterfeld, lars.winterfeld@tu-ilmenau.de - Theoretical Physics II, University of Technology Ilmenau, Germany - - See the README file in the USER-CUDA directory. - - This software is distributed under the GNU General Public License. 
-------------------------------------------------------------------------- */ - -#include "cuda_shared.h" - -extern "C" void Cuda_PairLJClass2CoulLongCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,int eflag_atom,int vflag_atom); diff --git a/lib/cuda/pair_lj_class2_cuda.cu b/lib/cuda/pair_lj_class2_cuda.cu deleted file mode 100644 index 1064d12cf6..0000000000 --- a/lib/cuda/pair_lj_class2_cuda.cu +++ /dev/null @@ -1,74 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - - Original Version: - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - See the README file in the top-level LAMMPS directory. - - ----------------------------------------------------------------------- - - USER-CUDA Package and associated modifications: - https://sourceforge.net/projects/lammpscuda/ - - Christian Trott, christian.trott@tu-ilmenau.de - Lars Winterfeld, lars.winterfeld@tu-ilmenau.de - Theoretical Physics II, University of Technology Ilmenau, Germany - - See the README file in the USER-CUDA directory. - - This software is distributed under the GNU General Public License. -------------------------------------------------------------------------- */ - -#include - -#define _lj1 MY_AP(coeff1) -#define _lj2 MY_AP(coeff2) -#define _lj3 MY_AP(coeff3) -#define _lj4 MY_AP(coeff4) - -#include "pair_lj_class2_cuda_cu.h" -#include "pair_lj_class2_cuda_kernel_nc.cu" - -#include - -void Cuda_PairLJClass2Cuda_Init(cuda_shared_data* sdata) -{ - Cuda_Pair_Init_AllStyles(sdata, 4); -} - -void Cuda_PairLJClass2Cuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,int eflag_atom,int vflag_atom) -{ - - - static short init=0; - if(! init) - { - init = 1; - Cuda_PairLJClass2Cuda_Init(sdata); - } - - dim3 grid,threads; - int sharedperproc; - - //int maxthreads=192*sizeof(double)/sizeof(F_FLOAT); - //if(CUDA_ARCH==20) maxthreads*=2; - //cudaFuncSetCacheConfig(Pair_Kernel_TpA_opt,cudaFuncCachePreferL1); - Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc,false,192); - cudaStream_t* streams = (cudaStream_t*) CudaWrapper_returnStreams(); - if(sdata->pair.use_block_per_atom) - Pair_Kernel_BpA - <<>> (eflag, vflag,eflag_atom,vflag_atom); - else - Pair_Kernel_TpA - <<>> (eflag, vflag,eflag_atom,vflag_atom); - - Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag); -} - - -#undef _lj1 -#undef _lj2 -#undef _lj3 -#undef _lj4 diff --git a/lib/cuda/pair_lj_class2_cuda_cu.h b/lib/cuda/pair_lj_class2_cuda_cu.h deleted file mode 100644 index cc14d9eda4..0000000000 --- a/lib/cuda/pair_lj_class2_cuda_cu.h +++ /dev/null @@ -1,26 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - - Original Version: - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - See the README file in the top-level LAMMPS directory. - - ----------------------------------------------------------------------- - - USER-CUDA Package and associated modifications: - https://sourceforge.net/projects/lammpscuda/ - - Christian Trott, christian.trott@tu-ilmenau.de - Lars Winterfeld, lars.winterfeld@tu-ilmenau.de - Theoretical Physics II, University of Technology Ilmenau, Germany - - See the README file in the USER-CUDA directory. 
- - This software is distributed under the GNU General Public License. -------------------------------------------------------------------------- */ - -#include "cuda_shared.h" - -extern "C" void Cuda_PairLJClass2Cuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,int eflag_atom,int vflag_atom); diff --git a/lib/cuda/pair_lj_class2_cuda_kernel_nc.cu b/lib/cuda/pair_lj_class2_cuda_kernel_nc.cu deleted file mode 100644 index e5674d8b74..0000000000 --- a/lib/cuda/pair_lj_class2_cuda_kernel_nc.cu +++ /dev/null @@ -1,33 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - - Original Version: - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - See the README file in the top-level LAMMPS directory. - - ----------------------------------------------------------------------- - - USER-CUDA Package and associated modifications: - https://sourceforge.net/projects/lammpscuda/ - - Christian Trott, christian.trott@tu-ilmenau.de - Lars Winterfeld, lars.winterfeld@tu-ilmenau.de - Theoretical Physics II, University of Technology Ilmenau, Germany - - See the README file in the USER-CUDA directory. - - This software is distributed under the GNU General Public License. -------------------------------------------------------------------------- */ - -__device__ inline F_FLOAT PairLJClass2Cuda_Eval(const F_FLOAT& rsq,const int ij_type,F_FLOAT& factor_lj,int& eflag, ENERGY_FLOAT& evdwl) -{ - const F_FLOAT r2inv = F_F(1.0)/rsq; - const F_FLOAT r6inv = r2inv*r2inv*r2inv; - const F_FLOAT r3inv = _SQRT_(r6inv); - if (eflag) evdwl += factor_lj*(r6inv*(_lj3[ij_type]*r3inv- - _lj4[ij_type]) - _offset[ij_type]); - return factor_lj*r6inv * (_lj1[ij_type]*r3inv - _lj2[ij_type])*r2inv; -} - diff --git a/lib/cuda/pair_lj_cut_coul_cut_cuda.cu b/lib/cuda/pair_lj_cut_coul_cut_cuda.cu deleted file mode 100644 index c3b4a40749..0000000000 --- a/lib/cuda/pair_lj_cut_coul_cut_cuda.cu +++ /dev/null @@ -1,72 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - - Original Version: - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - See the README file in the top-level LAMMPS directory. - - ----------------------------------------------------------------------- - - USER-CUDA Package and associated modifications: - https://sourceforge.net/projects/lammpscuda/ - - Christian Trott, christian.trott@tu-ilmenau.de - Lars Winterfeld, lars.winterfeld@tu-ilmenau.de - Theoretical Physics II, University of Technology Ilmenau, Germany - - See the README file in the USER-CUDA directory. - - This software is distributed under the GNU General Public License. -------------------------------------------------------------------------- */ - -#include - -#define _lj1 MY_AP(coeff1) -#define _lj2 MY_AP(coeff2) -#define _lj3 MY_AP(coeff3) -#define _lj4 MY_AP(coeff4) - -#include "pair_lj_cut_coul_cut_cuda_cu.h" - -#include - -void Cuda_PairLJCutCoulCutCuda_Init(cuda_shared_data* sdata) -{ - Cuda_Pair_Init_AllStyles(sdata, 4,true); -} - -void Cuda_PairLJCutCoulCutCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,int eflag_atom,int vflag_atom) -{ - - - static short init=0; - if(! 
init) - { - init = 1; - Cuda_PairLJCutCoulCutCuda_Init(sdata); - } - - dim3 grid,threads; - int sharedperproc; - - - Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc,true,192); - - cudaStream_t* streams = (cudaStream_t*) CudaWrapper_returnStreams(); - if(sdata->pair.use_block_per_atom) - Pair_Kernel_BpA - <<>> (eflag, vflag,eflag_atom,vflag_atom); - else - Pair_Kernel_TpA - <<>> (eflag, vflag,eflag_atom,vflag_atom); - - Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag); -} - - -#undef _lj1 -#undef _lj2 -#undef _lj3 -#undef _lj4 diff --git a/lib/cuda/pair_lj_cut_coul_cut_cuda_cu.h b/lib/cuda/pair_lj_cut_coul_cut_cuda_cu.h deleted file mode 100644 index 95fadcd39b..0000000000 --- a/lib/cuda/pair_lj_cut_coul_cut_cuda_cu.h +++ /dev/null @@ -1,26 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - - Original Version: - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - See the README file in the top-level LAMMPS directory. - - ----------------------------------------------------------------------- - - USER-CUDA Package and associated modifications: - https://sourceforge.net/projects/lammpscuda/ - - Christian Trott, christian.trott@tu-ilmenau.de - Lars Winterfeld, lars.winterfeld@tu-ilmenau.de - Theoretical Physics II, University of Technology Ilmenau, Germany - - See the README file in the USER-CUDA directory. - - This software is distributed under the GNU General Public License. -------------------------------------------------------------------------- */ - -#include "cuda_shared.h" - -extern "C" void Cuda_PairLJCutCoulCutCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,int eflag_atom,int vflag_atom); diff --git a/lib/cuda/pair_lj_cut_coul_debye_cuda.cu b/lib/cuda/pair_lj_cut_coul_debye_cuda.cu deleted file mode 100644 index f5e074ba82..0000000000 --- a/lib/cuda/pair_lj_cut_coul_debye_cuda.cu +++ /dev/null @@ -1,71 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - - Original Version: - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - See the README file in the top-level LAMMPS directory. - - ----------------------------------------------------------------------- - - USER-CUDA Package and associated modifications: - https://sourceforge.net/projects/lammpscuda/ - - Christian Trott, christian.trott@tu-ilmenau.de - Lars Winterfeld, lars.winterfeld@tu-ilmenau.de - Theoretical Physics II, University of Technology Ilmenau, Germany - - See the README file in the USER-CUDA directory. - - This software is distributed under the GNU General Public License. -------------------------------------------------------------------------- */ - -#include - -#define _lj1 MY_AP(coeff1) -#define _lj2 MY_AP(coeff2) -#define _lj3 MY_AP(coeff3) -#define _lj4 MY_AP(coeff4) - -#include "pair_lj_cut_coul_debye_cuda_cu.h" - -#include - -void Cuda_PairLJCutCoulDebyeCuda_Init(cuda_shared_data* sdata) -{ - Cuda_Pair_Init_AllStyles(sdata, 4,true); -} - -void Cuda_PairLJCutCoulDebyeCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,int eflag_atom,int vflag_atom) -{ - - - static short init=0; - if(! 
init) - { - init = 1; - Cuda_PairLJCutCoulDebyeCuda_Init(sdata); - } - - dim3 grid,threads; - int sharedperproc; - - - Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc,true,192); - - cudaStream_t* streams = (cudaStream_t*) CudaWrapper_returnStreams(); - if(sdata->pair.use_block_per_atom) - Pair_Kernel_BpA - <<>> (eflag, vflag,eflag_atom,vflag_atom); - else - Pair_Kernel_TpA - <<>> (eflag, vflag,eflag_atom,vflag_atom); - - Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag); -} - -#undef _lj1 -#undef _lj2 -#undef _lj3 -#undef _lj4 diff --git a/lib/cuda/pair_lj_cut_coul_debye_cuda_cu.h b/lib/cuda/pair_lj_cut_coul_debye_cuda_cu.h deleted file mode 100644 index b6df066ac1..0000000000 --- a/lib/cuda/pair_lj_cut_coul_debye_cuda_cu.h +++ /dev/null @@ -1,26 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - - Original Version: - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - See the README file in the top-level LAMMPS directory. - - ----------------------------------------------------------------------- - - USER-CUDA Package and associated modifications: - https://sourceforge.net/projects/lammpscuda/ - - Christian Trott, christian.trott@tu-ilmenau.de - Lars Winterfeld, lars.winterfeld@tu-ilmenau.de - Theoretical Physics II, University of Technology Ilmenau, Germany - - See the README file in the USER-CUDA directory. - - This software is distributed under the GNU General Public License. -------------------------------------------------------------------------- */ - -#include "cuda_shared.h" - -extern "C" void Cuda_PairLJCutCoulDebyeCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,int eflag_atom,int vflag_atom); diff --git a/lib/cuda/pair_lj_cut_coul_long_cuda.cu b/lib/cuda/pair_lj_cut_coul_long_cuda.cu deleted file mode 100644 index dd3e1df978..0000000000 --- a/lib/cuda/pair_lj_cut_coul_long_cuda.cu +++ /dev/null @@ -1,72 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - - Original Version: - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - See the README file in the top-level LAMMPS directory. - - ----------------------------------------------------------------------- - - USER-CUDA Package and associated modifications: - https://sourceforge.net/projects/lammpscuda/ - - Christian Trott, christian.trott@tu-ilmenau.de - Lars Winterfeld, lars.winterfeld@tu-ilmenau.de - Theoretical Physics II, University of Technology Ilmenau, Germany - - See the README file in the USER-CUDA directory. - - This software is distributed under the GNU General Public License. -------------------------------------------------------------------------- */ - -#include - -#define _lj1 MY_AP(coeff1) -#define _lj2 MY_AP(coeff2) -#define _lj3 MY_AP(coeff3) -#define _lj4 MY_AP(coeff4) - -#include "pair_lj_cut_coul_long_cuda_cu.h" - -#include - -void Cuda_PairLJCutCoulLongCuda_Init(cuda_shared_data* sdata) -{ - Cuda_Pair_Init_AllStyles(sdata, 4,true); -} - -void Cuda_PairLJCutCoulLongCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,int eflag_atom,int vflag_atom) -{ - - - static short init=0; - if(! 
init) - { - init = 1; - Cuda_PairLJCutCoulLongCuda_Init(sdata); - } - - dim3 grid,threads; - int sharedperproc; - - - Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc,true,192); - - cudaStream_t* streams = (cudaStream_t*) CudaWrapper_returnStreams(); - if(sdata->pair.use_block_per_atom) - Pair_Kernel_BpA - <<>> (eflag, vflag,eflag_atom,vflag_atom); - else - Pair_Kernel_TpA - <<>> (eflag, vflag,eflag_atom,vflag_atom); - - Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag); -} - - -#undef _lj1 -#undef _lj2 -#undef _lj3 -#undef _lj4 diff --git a/lib/cuda/pair_lj_cut_coul_long_cuda_cu.h b/lib/cuda/pair_lj_cut_coul_long_cuda_cu.h deleted file mode 100644 index 9cac5457bd..0000000000 --- a/lib/cuda/pair_lj_cut_coul_long_cuda_cu.h +++ /dev/null @@ -1,30 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - - Original Version: - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - See the README file in the top-level LAMMPS directory. - - ----------------------------------------------------------------------- - - USER-CUDA Package and associated modifications: - https://sourceforge.net/projects/lammpscuda/ - - Christian Trott, christian.trott@tu-ilmenau.de - Lars Winterfeld, lars.winterfeld@tu-ilmenau.de - Theoretical Physics II, University of Technology Ilmenau, Germany - - See the README file in the USER-CUDA directory. - - This software is distributed under the GNU General Public License. -------------------------------------------------------------------------- */ - -#include "cuda_shared.h" - -#ifdef CUDA_USE_BINNING -extern "C" void Cuda_PairLJCutCoulLongCuda(cuda_shared_data* sdata, int eflag, int vflag); -#else -extern "C" void Cuda_PairLJCutCoulLongCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,int eflag_atom,int vflag_atom); -#endif diff --git a/lib/cuda/pair_lj_cut_cuda.cu b/lib/cuda/pair_lj_cut_cuda.cu deleted file mode 100644 index 8f0c862004..0000000000 --- a/lib/cuda/pair_lj_cut_cuda.cu +++ /dev/null @@ -1,74 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - - Original Version: - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - See the README file in the top-level LAMMPS directory. - - ----------------------------------------------------------------------- - - USER-CUDA Package and associated modifications: - https://sourceforge.net/projects/lammpscuda/ - - Christian Trott, christian.trott@tu-ilmenau.de - Lars Winterfeld, lars.winterfeld@tu-ilmenau.de - Theoretical Physics II, University of Technology Ilmenau, Germany - - See the README file in the USER-CUDA directory. - - This software is distributed under the GNU General Public License. 
-------------------------------------------------------------------------- */ - -#include - -#define _lj1 MY_AP(coeff1) -#define _lj2 MY_AP(coeff2) -#define _lj3 MY_AP(coeff3) -#define _lj4 MY_AP(coeff4) - -#include "pair_lj_cut_cuda_cu.h" -#include "pair_lj_cut_cuda_kernel_nc.cu" - -#include - -void Cuda_PairLJCutCuda_Init(cuda_shared_data* sdata) -{ - Cuda_Pair_Init_AllStyles(sdata, 4); -} - -void Cuda_PairLJCutCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,int eflag_atom,int vflag_atom) -{ - - - static short init=0; - if(! init) - { - init = 1; - Cuda_PairLJCutCuda_Init(sdata); - } - - dim3 grid,threads; - int sharedperproc; - - //int maxthreads=192*sizeof(double)/sizeof(F_FLOAT); - //if(CUDA_ARCH==20) maxthreads*=2; - //cudaFuncSetCacheConfig(Pair_Kernel_TpA_opt,cudaFuncCachePreferL1); - Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc,false,192); - cudaStream_t* streams = (cudaStream_t*) CudaWrapper_returnStreams(); - if(sdata->pair.use_block_per_atom) - Pair_Kernel_BpA - <<>> (eflag, vflag,eflag_atom,vflag_atom); - else - Pair_Kernel_TpA - <<>> (eflag, vflag,eflag_atom,vflag_atom); - - Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag); -} - - -#undef _lj1 -#undef _lj2 -#undef _lj3 -#undef _lj4 diff --git a/lib/cuda/pair_lj_cut_cuda_cu.h b/lib/cuda/pair_lj_cut_cuda_cu.h deleted file mode 100644 index 9d9722501f..0000000000 --- a/lib/cuda/pair_lj_cut_cuda_cu.h +++ /dev/null @@ -1,30 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - - Original Version: - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - See the README file in the top-level LAMMPS directory. - - ----------------------------------------------------------------------- - - USER-CUDA Package and associated modifications: - https://sourceforge.net/projects/lammpscuda/ - - Christian Trott, christian.trott@tu-ilmenau.de - Lars Winterfeld, lars.winterfeld@tu-ilmenau.de - Theoretical Physics II, University of Technology Ilmenau, Germany - - See the README file in the USER-CUDA directory. - - This software is distributed under the GNU General Public License. -------------------------------------------------------------------------- */ - -#include "cuda_shared.h" - -#ifdef CUDA_USE_BINNING -extern "C" void Cuda_PairLJCutCuda(cuda_shared_data* sdata, int eflag, int vflag); -#else -extern "C" void Cuda_PairLJCutCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,int eflag_atom,int vflag_atom); -#endif diff --git a/lib/cuda/pair_lj_cut_cuda_kernel_nc.cu b/lib/cuda/pair_lj_cut_cuda_kernel_nc.cu deleted file mode 100644 index d263e4a5cf..0000000000 --- a/lib/cuda/pair_lj_cut_cuda_kernel_nc.cu +++ /dev/null @@ -1,32 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - - Original Version: - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - See the README file in the top-level LAMMPS directory. 
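The pair_lj_cut evaluator deleted just below is the reference 12-6 form that the other styles vary. Assuming the CPU style's coefficient packing (lj1 = 48*eps*sigma^12, lj2 = 24*eps*sigma^6, lj3 = 4*eps*sigma^12, lj4 = 4*eps*sigma^6), it accumulates

  E(r) = 4*eps * [ (sigma/r)^12 - (sigma/r)^6 ] - offset

with offset the energy at the cutoff when shifting is enabled, and returns F/r = r6inv*(lj1*r6inv - lj2)*r2inv, which is -dE/dr divided by r.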
- - ----------------------------------------------------------------------- - - USER-CUDA Package and associated modifications: - https://sourceforge.net/projects/lammpscuda/ - - Christian Trott, christian.trott@tu-ilmenau.de - Lars Winterfeld, lars.winterfeld@tu-ilmenau.de - Theoretical Physics II, University of Technology Ilmenau, Germany - - See the README file in the USER-CUDA directory. - - This software is distributed under the GNU General Public License. -------------------------------------------------------------------------- */ - -__device__ inline F_FLOAT PairLJCutCuda_Eval(const F_FLOAT& rsq,const int ij_type,F_FLOAT& factor_lj,int& eflag, ENERGY_FLOAT& evdwl) -{ - const F_FLOAT r2inv = F_F(1.0)/rsq; - const F_FLOAT r6inv = r2inv*r2inv*r2inv; - if (eflag) evdwl += factor_lj*(r6inv*(_lj3[ij_type]*r6inv- - _lj4[ij_type]) - _offset[ij_type]); - return factor_lj*r6inv * (_lj1[ij_type]*r6inv - _lj2[ij_type])*r2inv; -} - diff --git a/lib/cuda/pair_lj_cut_experimental_cuda.cu b/lib/cuda/pair_lj_cut_experimental_cuda.cu deleted file mode 100644 index 6996c02236..0000000000 --- a/lib/cuda/pair_lj_cut_experimental_cuda.cu +++ /dev/null @@ -1,75 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - - Original Version: - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - See the README file in the top-level LAMMPS directory. - - ----------------------------------------------------------------------- - - USER-CUDA Package and associated modifications: - https://sourceforge.net/projects/lammpscuda/ - - Christian Trott, christian.trott@tu-ilmenau.de - Lars Winterfeld, lars.winterfeld@tu-ilmenau.de - Theoretical Physics II, University of Technology Ilmenau, Germany - - See the README file in the USER-CUDA directory. - - This software is distributed under the GNU General Public License. -------------------------------------------------------------------------- */ - -#include - -#define _lj1 MY_AP(coeff1) -#define _lj2 MY_AP(coeff2) -#define _lj3 MY_AP(coeff3) -#define _lj4 MY_AP(coeff4) - -#include "pair_lj_cut_experimental_cuda_cu.h" - -#include - -void Cuda_PairLJCutExperimentalCuda_Init(cuda_shared_data* sdata) -{ - Cuda_Pair_Init_AllStyles(sdata, 4); -} - -void Cuda_PairLJCutExperimentalCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,int eflag_atom,int vflag_atom) -{ - - - static short init=0; - if(! 
init) - { - init = 1; - Cuda_PairLJCutExperimentalCuda_Init(sdata); - } - - dim3 grid,threads; - int sharedperproc; - - //int maxthreads=192*sizeof(double)/sizeof(F_FLOAT); - //if(CUDA_ARCH==20) maxthreads*=2; - //cudaFuncSetCacheConfig(Pair_Kernel_TpA_opt,cudaFuncCachePreferL1); - Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc,false,192); - if (sharedperproc==0) sharedperproc++; - //printf("comm_phase: %i\n",sdata->comm.comm_phase); - - cudaStream_t* streams = (cudaStream_t*) CudaWrapper_returnStreams(); - if(sdata->pair.use_block_per_atom) - Pair_Kernel_BpA - <<>> (eflag, vflag,eflag_atom,vflag_atom); - else - Pair_Kernel_TpA_opt - <<>> (eflag, vflag,eflag_atom,vflag_atom,sdata->comm.comm_phase); - Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag); -} - - -#undef _lj1 -#undef _lj2 -#undef _lj3 -#undef _lj4 diff --git a/lib/cuda/pair_lj_cut_experimental_cuda_cu.h b/lib/cuda/pair_lj_cut_experimental_cuda_cu.h deleted file mode 100644 index 4cc1f6de36..0000000000 --- a/lib/cuda/pair_lj_cut_experimental_cuda_cu.h +++ /dev/null @@ -1,26 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - - Original Version: - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - See the README file in the top-level LAMMPS directory. - - ----------------------------------------------------------------------- - - USER-CUDA Package and associated modifications: - https://sourceforge.net/projects/lammpscuda/ - - Christian Trott, christian.trott@tu-ilmenau.de - Lars Winterfeld, lars.winterfeld@tu-ilmenau.de - Theoretical Physics II, University of Technology Ilmenau, Germany - - See the README file in the USER-CUDA directory. - - This software is distributed under the GNU General Public License. -------------------------------------------------------------------------- */ - -#include "cuda_shared.h" - -extern "C" void Cuda_PairLJCutExperimentalCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,int eflag_atom,int vflag_atom); diff --git a/lib/cuda/pair_lj_expand_cuda.cu b/lib/cuda/pair_lj_expand_cuda.cu deleted file mode 100644 index e1fa43d050..0000000000 --- a/lib/cuda/pair_lj_expand_cuda.cu +++ /dev/null @@ -1,77 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - - Original Version: - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - See the README file in the top-level LAMMPS directory. - - ----------------------------------------------------------------------- - - USER-CUDA Package and associated modifications: - https://sourceforge.net/projects/lammpscuda/ - - Christian Trott, christian.trott@tu-ilmenau.de - Lars Winterfeld, lars.winterfeld@tu-ilmenau.de - Theoretical Physics II, University of Technology Ilmenau, Germany - - See the README file in the USER-CUDA directory. - - This software is distributed under the GNU General Public License. 
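Cuda_PairLJCutExperimentalCuda above is the one driver that departs from the common dispatch: it launches Pair_Kernel_TpA_opt, forwards sdata->comm.comm_phase, and forces sharedperproc to at least 1 so the opt kernel always has a shared-memory slot even when neither energy nor virial is requested. The comm_phase argument suggests the kernel partitions atoms by communication phase so force evaluation can overlap ghost-atom exchange; a toy illustration of that idea, not the deleted kernel itself:

  #include <cuda_runtime.h>

  // Toy phase-split kernel: atoms [offset, offset+n) belong to one phase, so
  // phase 0 can compute while data needed by later phases is still in flight.
  __global__ void force_phase(float* f, int offset, int n)
  {
    const int i = offset + blockIdx.x * blockDim.x + threadIdx.x;
    if (i < offset + n) f[i] += 1.0f;   // stand-in for the pair-force work
  }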
-------------------------------------------------------------------------- */ - -#include - -#define _lj1 MY_AP(coeff1) -#define _lj2 MY_AP(coeff2) -#define _lj3 MY_AP(coeff3) -#define _lj4 MY_AP(coeff4) -#define _shift MY_AP(coeff5) - -#include "pair_lj_expand_cuda_cu.h" -#include "pair_lj_expand_cuda_kernel_nc.cu" -#include - - -void Cuda_PairLJExpandCuda_Init(cuda_shared_data* sdata) -{ - Cuda_Pair_Init_AllStyles(sdata, 5); -} - - - - -void Cuda_PairLJExpandCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,int eflag_atom,int vflag_atom) -{ - - // initialize only on first call - static short init=0; - if(! init) - { - init = 1; - Cuda_PairLJExpandCuda_Init(sdata); - } - - dim3 grid,threads; - int sharedperproc; - - Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc,false,256); - - cudaStream_t* streams = (cudaStream_t*) CudaWrapper_returnStreams(); - if(sdata->pair.use_block_per_atom) - Pair_Kernel_BpA - <<>> (eflag, vflag,eflag_atom,vflag_atom); - else - Pair_Kernel_TpA - <<>> (eflag, vflag,eflag_atom,vflag_atom); - - Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag); -} - -#undef _lj1 -#undef _lj2 -#undef _lj3 -#undef _lj4 - - diff --git a/lib/cuda/pair_lj_expand_cuda_cu.h b/lib/cuda/pair_lj_expand_cuda_cu.h deleted file mode 100644 index 24164b6fa7..0000000000 --- a/lib/cuda/pair_lj_expand_cuda_cu.h +++ /dev/null @@ -1,26 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - - Original Version: - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - See the README file in the top-level LAMMPS directory. - - ----------------------------------------------------------------------- - - USER-CUDA Package and associated modifications: - https://sourceforge.net/projects/lammpscuda/ - - Christian Trott, christian.trott@tu-ilmenau.de - Lars Winterfeld, lars.winterfeld@tu-ilmenau.de - Theoretical Physics II, University of Technology Ilmenau, Germany - - See the README file in the USER-CUDA directory. - - This software is distributed under the GNU General Public License. -------------------------------------------------------------------------- */ - -#include "cuda_shared.h" - -extern "C" void Cuda_PairLJExpandCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,int eflag_atom,int vflag_atom); diff --git a/lib/cuda/pair_lj_expand_cuda_kernel_nc.cu b/lib/cuda/pair_lj_expand_cuda_kernel_nc.cu deleted file mode 100644 index 533bd761fc..0000000000 --- a/lib/cuda/pair_lj_expand_cuda_kernel_nc.cu +++ /dev/null @@ -1,33 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - - Original Version: - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - See the README file in the top-level LAMMPS directory. - - ----------------------------------------------------------------------- - - USER-CUDA Package and associated modifications: - https://sourceforge.net/projects/lammpscuda/ - - Christian Trott, christian.trott@tu-ilmenau.de - Lars Winterfeld, lars.winterfeld@tu-ilmenau.de - Theoretical Physics II, University of Technology Ilmenau, Germany - - See the README file in the USER-CUDA directory. - - This software is distributed under the GNU General Public License. 
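pair_lj_expand, whose evaluator follows just below, is the 12-6 form taken at a shifted separation: with delta stored as coeff5 (_shift),

  E(r) = 4*eps * [ (sigma/(r - delta))^12 - (sigma/(r - delta))^6 ] - offset

so the 12-6 derivative contributes a 1/(r - delta) factor while projecting the force onto the pair vector contributes 1/r, which is why the kernel returns factor_lj * forcelj * (1/rshift) * (1/r) rather than the usual forcelj * r2inv.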
-------------------------------------------------------------------------- */ -__device__ inline F_FLOAT PairLJExpandCuda_Eval(const F_FLOAT& rsq,const int ij_type,F_FLOAT& factor_lj,int& eflag, ENERGY_FLOAT& evdwl) -{ - const F_FLOAT r = _SQRT_(rsq); - const F_FLOAT rshift = r - _shift[ij_type]; - const F_FLOAT rshiftsq = rshift*rshift; - const F_FLOAT r2inv = F_F(1.0)/rshiftsq; - const F_FLOAT r6inv = r2inv*r2inv*r2inv; - const F_FLOAT forcelj = r6inv * (_lj1[ij_type]*r6inv - _lj2[ij_type]); - if(eflag) evdwl += factor_lj*(r6inv*(_lj3[ij_type]*r6inv-_lj4[ij_type]) - _offset[ij_type]); - return factor_lj*forcelj*(F_F(1.0)/rshift)*(F_F(1.0)/r); -} diff --git a/lib/cuda/pair_lj_gromacs_coul_gromacs_cuda.cu b/lib/cuda/pair_lj_gromacs_coul_gromacs_cuda.cu deleted file mode 100644 index 7532e4b643..0000000000 --- a/lib/cuda/pair_lj_gromacs_coul_gromacs_cuda.cu +++ /dev/null @@ -1,102 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - - Original Version: - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - See the README file in the top-level LAMMPS directory. - - ----------------------------------------------------------------------- - - USER-CUDA Package and associated modifications: - https://sourceforge.net/projects/lammpscuda/ - - Christian Trott, christian.trott@tu-ilmenau.de - Lars Winterfeld, lars.winterfeld@tu-ilmenau.de - Theoretical Physics II, University of Technology Ilmenau, Germany - - See the README file in the USER-CUDA directory. - - This software is distributed under the GNU General Public License. -------------------------------------------------------------------------- */ - -#include - -#define _lj1 MY_AP(coeff1_gm) -#define _lj2 MY_AP(coeff2_gm) -#define _lj3 MY_AP(coeff3_gm) -#define _lj4 MY_AP(coeff4_gm) -#define _ljsw1 MY_AP(coeff5_gm) -#define _ljsw2 MY_AP(coeff6_gm) -#define _ljsw3 MY_AP(coeff7_gm) -#define _ljsw4 MY_AP(coeff8_gm) -#define _ljsw5 MY_AP(coeff9_gm) - -#define _cut_coul_inner_global MY_AP(cut_coul_inner_global) -#define _coulsw1 MY_AP(coulsw1) -#define _coulsw2 MY_AP(coulsw2) -#define _coulsw5 MY_AP(coulsw5) -__device__ __constant__ F_FLOAT _cut_coul_inner_global; -__device__ __constant__ F_FLOAT _coulsw1; -__device__ __constant__ F_FLOAT _coulsw2; -__device__ __constant__ F_FLOAT _coulsw5; - - -#include "pair_lj_gromacs_coul_gromacs_cuda_cu.h" -#include "pair_lj_gromacs_coul_gromacs_cuda_kernel_nc.cu" - -#include - -void Cuda_PairLJGromacsCoulGromacsCuda_Init(cuda_shared_data* sdata,F_FLOAT cut_coul_inner,F_FLOAT coulsw1,F_FLOAT coulsw2,F_FLOAT coulsw5) -{ - Cuda_Pair_Init_AllStyles(sdata, 9,true,true,true); - cudaMemcpyToSymbol(MY_CONST(cut_coul_inner_global) , &cut_coul_inner , sizeof(F_FLOAT) ); - cudaMemcpyToSymbol(MY_CONST(coulsw1) , &coulsw1 , sizeof(F_FLOAT) ); - cudaMemcpyToSymbol(MY_CONST(coulsw2) , &coulsw2 , sizeof(F_FLOAT) ); - cudaMemcpyToSymbol(MY_CONST(coulsw5) , &coulsw5 , sizeof(F_FLOAT) ); - - return; -} - - - -void Cuda_PairLJGromacsCoulGromacsCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, - int eflag_atom,int vflag_atom,F_FLOAT cut_coul_inner,F_FLOAT coulsw1,F_FLOAT coulsw2,F_FLOAT coulsw5) -{ - static short init=0; - if(! 
init) - { - init = 1; - Cuda_PairLJGromacsCoulGromacsCuda_Init(sdata,cut_coul_inner,coulsw1,coulsw2,coulsw5); - } - - dim3 grid,threads; - int sharedperproc; - - Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc,true,192); - - cudaStream_t* streams = (cudaStream_t*) CudaWrapper_returnStreams(); - if(sdata->pair.use_block_per_atom) - Pair_Kernel_BpA - <<>> (eflag, vflag,eflag_atom,vflag_atom); - else - Pair_Kernel_TpA - <<>> (eflag, vflag,eflag_atom,vflag_atom); - - Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag); -} - -#undef _lj1 -#undef _lj2 -#undef _lj3 -#undef _lj4 -#undef _ljsw1 -#undef _ljsw2 -#undef _ljsw3 -#undef _ljsw4 -#undef _ljsw5 -#undef _cut_coul_inner_global -#undef _coulsw1 -#undef _coulsw2 -#undef _coulsw5 diff --git a/lib/cuda/pair_lj_gromacs_coul_gromacs_cuda_cu.h b/lib/cuda/pair_lj_gromacs_coul_gromacs_cuda_cu.h deleted file mode 100644 index 8dc5f8fcde..0000000000 --- a/lib/cuda/pair_lj_gromacs_coul_gromacs_cuda_cu.h +++ /dev/null @@ -1,26 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - - Original Version: - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - See the README file in the top-level LAMMPS directory. - - ----------------------------------------------------------------------- - - USER-CUDA Package and associated modifications: - https://sourceforge.net/projects/lammpscuda/ - - Christian Trott, christian.trott@tu-ilmenau.de - Lars Winterfeld, lars.winterfeld@tu-ilmenau.de - Theoretical Physics II, University of Technology Ilmenau, Germany - - See the README file in the USER-CUDA directory. - - This software is distributed under the GNU General Public License. -------------------------------------------------------------------------- */ - -#include "cuda_shared.h" - -extern "C" void Cuda_PairLJGromacsCoulGromacsCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,int eflag_atom,int vflag_atom,F_FLOAT cut_coul_inner,F_FLOAT coulsw1,F_FLOAT coulsw2,F_FLOAT coulsw5); diff --git a/lib/cuda/pair_lj_gromacs_coul_gromacs_cuda_kernel_nc.cu b/lib/cuda/pair_lj_gromacs_coul_gromacs_cuda_kernel_nc.cu deleted file mode 100644 index 29e0a63c90..0000000000 --- a/lib/cuda/pair_lj_gromacs_coul_gromacs_cuda_kernel_nc.cu +++ /dev/null @@ -1,46 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - - Original Version: - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - See the README file in the top-level LAMMPS directory. - - ----------------------------------------------------------------------- - - USER-CUDA Package and associated modifications: - https://sourceforge.net/projects/lammpscuda/ - - Christian Trott, christian.trott@tu-ilmenau.de - Lars Winterfeld, lars.winterfeld@tu-ilmenau.de - Theoretical Physics II, University of Technology Ilmenau, Germany - - See the README file in the USER-CUDA directory. - - This software is distributed under the GNU General Public License. 
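The gromacs styles switch the force, not the energy, with a polynomial tail beyond the inner cutoff. For the Coulomb part deleted just below, writing t = r - cut_coul_inner and qq = qqrd2e*qi*qj*factor_coul:

  F(r) = qq * [ 1/r^2 + t^2*(coulsw1 + coulsw2*t) ]              for r > cut_coul_inner
  E(r) = qq * [ 1/r - coulsw5 - t^3*(coulsw1/3 + coulsw2*t/4) ]

The t^3 energy term is minus the integral of the force correction, and coulsw5 is the constant that re-zeros the energy at the outer cutoff; in the code, forcecoul accumulates F(r)*r/qq and is multiplied by 1/rsq at the end to produce the F/r convention.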
-------------------------------------------------------------------------- */ - -__device__ inline F_FLOAT CoulGromacsCuda_Eval(const F_FLOAT& rsq,const int ij_type,F_FLOAT& factor_coul,int& eflag, ENERGY_FLOAT& ecoul, F_FLOAT qij) -{ - if (qij != F_F(0.0)) - { - F_FLOAT ecoul_tmp; - F_FLOAT forcecoul = _RSQRT_(rsq); - if(eflag) ecoul_tmp=forcecoul - _coulsw5; - if (rsq > _cut_coul_inner_global*_cut_coul_inner_global) { - const F_FLOAT r = F_F(1.0)/forcecoul; - const F_FLOAT tc = r - _cut_coul_inner_global; - forcecoul += r*tc*tc*(_coulsw1 + _coulsw2*tc); - if(eflag) ecoul_tmp-=tc*tc*tc*(_coulsw1*(F_F(1.0)/F_F(3.0)) + _coulsw2*tc*(F_F(1.0)/F_F(4.0))); - } - F_FLOAT qprod=_qqrd2e * qij*factor_coul; - forcecoul*=qprod; - if(eflag) - { - ecoul += ecoul_tmp*qprod; - } - return forcecoul*(F_F(1.0)/rsq); - } - return F_F(0.0); -} diff --git a/lib/cuda/pair_lj_gromacs_cuda.cu b/lib/cuda/pair_lj_gromacs_cuda.cu deleted file mode 100644 index ce0c08f6f0..0000000000 --- a/lib/cuda/pair_lj_gromacs_cuda.cu +++ /dev/null @@ -1,83 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - - Original Version: - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - See the README file in the top-level LAMMPS directory. - - ----------------------------------------------------------------------- - - USER-CUDA Package and associated modifications: - https://sourceforge.net/projects/lammpscuda/ - - Christian Trott, christian.trott@tu-ilmenau.de - Lars Winterfeld, lars.winterfeld@tu-ilmenau.de - Theoretical Physics II, University of Technology Ilmenau, Germany - - See the README file in the USER-CUDA directory. - - This software is distributed under the GNU General Public License. -------------------------------------------------------------------------- */ - -#include - -#define _lj1 MY_AP(coeff1_gm) -#define _lj2 MY_AP(coeff2_gm) -#define _lj3 MY_AP(coeff3_gm) -#define _lj4 MY_AP(coeff4_gm) -#define _ljsw1 MY_AP(coeff5_gm) -#define _ljsw2 MY_AP(coeff6_gm) -#define _ljsw3 MY_AP(coeff7_gm) -#define _ljsw4 MY_AP(coeff8_gm) -#define _ljsw5 MY_AP(coeff9_gm) - -#include "pair_lj_gromacs_cuda_cu.h" -#include "pair_lj_gromacs_cuda_kernel_nc.cu" - -#include - -void Cuda_PairLJGromacsCuda_Init(cuda_shared_data* sdata) -{ - Cuda_Pair_Init_AllStyles(sdata, 9,false,true,true); -} - - - -void Cuda_PairLJGromacsCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, - int eflag_atom,int vflag_atom) -{ - static short init=0; - if(! 
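CoulGromacsCuda_Eval above is a straight transcription of the GROMACS shifted-force Coulomb term. As a host-side scalar reference (double precision, names mirroring the constants the wrapper uploads to constant memory; fpair is the usual LAMMPS force-over-distance factor):

#include <math.h>

/* Scalar reference for CoulGromacsCuda_Eval: bare 1/r Coulomb force kernel
   plus the cubic switching correction beyond the inner cutoff. */
double coul_gromacs_fpair(double rsq, double qij, double factor_coul,
                          double qqrd2e, double cut_coul_inner,
                          double coulsw1, double coulsw2)
{
  double r = sqrt(rsq);
  double forcecoul = 1.0 / r;
  if (rsq > cut_coul_inner * cut_coul_inner) {
    double tc = r - cut_coul_inner;     /* distance past the switch point */
    forcecoul += r * tc * tc * (coulsw1 + coulsw2 * tc);
  }
  return qqrd2e * qij * factor_coul * forcecoul / rsq;
}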
init) - { - init = 1; - Cuda_PairLJGromacsCuda_Init(sdata); - } - - dim3 grid,threads; - int sharedperproc; - - Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc,false,192); - - cudaStream_t* streams = (cudaStream_t*) CudaWrapper_returnStreams(); - if(sdata->pair.use_block_per_atom) - Pair_Kernel_BpA - <<>> (eflag, vflag,eflag_atom,vflag_atom); - else - Pair_Kernel_TpA - <<>> (eflag, vflag,eflag_atom,vflag_atom); - - Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag); - -} - -#undef _lj1 -#undef _lj2 -#undef _lj3 -#undef _lj4 -#undef _ljsw1 -#undef _ljsw2 -#undef _ljsw3 -#undef _ljsw4 -#undef _ljsw5 diff --git a/lib/cuda/pair_lj_gromacs_cuda_cu.h b/lib/cuda/pair_lj_gromacs_cuda_cu.h deleted file mode 100644 index 970eb1f832..0000000000 --- a/lib/cuda/pair_lj_gromacs_cuda_cu.h +++ /dev/null @@ -1,26 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - - Original Version: - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - See the README file in the top-level LAMMPS directory. - - ----------------------------------------------------------------------- - - USER-CUDA Package and associated modifications: - https://sourceforge.net/projects/lammpscuda/ - - Christian Trott, christian.trott@tu-ilmenau.de - Lars Winterfeld, lars.winterfeld@tu-ilmenau.de - Theoretical Physics II, University of Technology Ilmenau, Germany - - See the README file in the USER-CUDA directory. - - This software is distributed under the GNU General Public License. -------------------------------------------------------------------------- */ - -#include "cuda_shared.h" - -extern "C" void Cuda_PairLJGromacsCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,int eflag_atom,int vflag_atom); diff --git a/lib/cuda/pair_lj_gromacs_cuda_kernel_nc.cu b/lib/cuda/pair_lj_gromacs_cuda_kernel_nc.cu deleted file mode 100644 index 818c9f55fc..0000000000 --- a/lib/cuda/pair_lj_gromacs_cuda_kernel_nc.cu +++ /dev/null @@ -1,51 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - - Original Version: - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - See the README file in the top-level LAMMPS directory. - - ----------------------------------------------------------------------- - - USER-CUDA Package and associated modifications: - https://sourceforge.net/projects/lammpscuda/ - - Christian Trott, christian.trott@tu-ilmenau.de - Lars Winterfeld, lars.winterfeld@tu-ilmenau.de - Theoretical Physics II, University of Technology Ilmenau, Germany - - See the README file in the USER-CUDA directory. - - This software is distributed under the GNU General Public License. -------------------------------------------------------------------------- */ - -__device__ inline F_FLOAT PairLJGromacsCuda_Eval(const F_FLOAT& rsq,const int ij_type,F_FLOAT& factor_lj,int& eflag, ENERGY_FLOAT& evdwl) -{ - F_FLOAT tlj; - const F_FLOAT r2inv = F_F(1.0)/rsq; - const F_FLOAT r = _RSQRT_(r2inv); - const F_FLOAT r6inv = r2inv*r2inv*r2inv; - F_FLOAT forcelj = r6inv * (_lj1[ij_type]*r6inv - _lj2[ij_type]); - const X_FLOAT cut_lj_innersq=(_cut_innersq_global > X_F(0.0)? 
_cut_innersq_global : _cut_innersq[ij_type]); - if (rsq > cut_lj_innersq) - { - tlj = r - _SQRT_(cut_lj_innersq); - forcelj += r*tlj*tlj*(_ljsw1[ij_type] + _ljsw2[ij_type]*tlj); - } - - if (eflag) - { - ENERGY_FLOAT evdwl_tmp = r6inv*(_lj3[ij_type]*r6inv-_lj4[ij_type]); - - if (rsq > cut_lj_innersq) - { - evdwl_tmp += tlj*tlj*tlj* - (_ljsw3[ij_type] + _ljsw4[ij_type]*tlj) + _ljsw5[ij_type]; - } - - evdwl+=evdwl_tmp*factor_lj; - } - return factor_lj*forcelj*r2inv; -} diff --git a/lib/cuda/pair_lj_sdk_coul_cut_cuda.cu b/lib/cuda/pair_lj_sdk_coul_cut_cuda.cu deleted file mode 100644 index 813d031476..0000000000 --- a/lib/cuda/pair_lj_sdk_coul_cut_cuda.cu +++ /dev/null @@ -1,80 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - - Original Version: - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - See the README file in the top-level LAMMPS directory. - - ----------------------------------------------------------------------- - - USER-CUDA Package and associated modifications: - https://sourceforge.net/projects/lammpscuda/ - - Christian Trott, christian.trott@tu-ilmenau.de - Lars Winterfeld, lars.winterfeld@tu-ilmenau.de - Theoretical Physics II, University of Technology Ilmenau, Germany - - See the README file in the USER-CUDA directory. - - This software is distributed under the GNU General Public License. -------------------------------------------------------------------------- */ - -#include - -#define _lj1 MY_AP(coeff1) -#define _lj2 MY_AP(coeff2) -#define _lj3 MY_AP(coeff3) -#define _lj4 MY_AP(coeff4) -#define _lj_type MY_AP(coeff5) - - -#include "pair_lj_sdk_coul_cut_cuda_cu.h" -#include - - - - -void Cuda_PairLJSDKCoulCutCuda_Init(cuda_shared_data* sdata) -{ - Cuda_Pair_Init_AllStyles(sdata, 5, true, false ); - -} - - - - -void Cuda_PairLJSDKCoulCutCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,int eflag_atom,int vflag_atom) -{ - - // initialize only on first call - static short init=0; - if(! init) - { - init = 1; - Cuda_PairLJSDKCoulCutCuda_Init(sdata); - } - - dim3 grid,threads; - int sharedperproc; - - Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc,true,128); - - cudaStream_t* streams = (cudaStream_t*) CudaWrapper_returnStreams(); - if(sdata->pair.use_block_per_atom) - Pair_Kernel_BpA - <<>> (eflag, vflag,eflag_atom,vflag_atom); - else - Pair_Kernel_TpA - <<>> (eflag, vflag,eflag_atom,vflag_atom); - - Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag); -} - -#undef _lj1 -#undef _lj2 -#undef _lj3 -#undef _lj4 -#undef _lj_type - diff --git a/lib/cuda/pair_lj_sdk_coul_cut_cuda_cu.h b/lib/cuda/pair_lj_sdk_coul_cut_cuda_cu.h deleted file mode 100644 index a8da1256da..0000000000 --- a/lib/cuda/pair_lj_sdk_coul_cut_cuda_cu.h +++ /dev/null @@ -1,26 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - - Original Version: - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - See the README file in the top-level LAMMPS directory. 
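PairLJGromacsCuda_Eval above applies the same switching idea to the 12-6 term. The scalar equivalent (per-type coefficients passed in, factor_lj applied by the caller):

#include <math.h>

/* Scalar reference for PairLJGromacsCuda_Eval: 12-6 force kernel with the
   GROMACS cubic switching correction beyond the inner LJ cutoff. */
double lj_gromacs_fpair(double rsq, double lj1, double lj2,
                        double ljsw1, double ljsw2, double cut_lj_inner)
{
  double r2inv = 1.0 / rsq;
  double r6inv = r2inv * r2inv * r2inv;
  double forcelj = r6inv * (lj1 * r6inv - lj2);
  if (rsq > cut_lj_inner * cut_lj_inner) {
    double t = sqrt(rsq) - cut_lj_inner;
    forcelj += sqrt(rsq) * t * t * (ljsw1 + ljsw2 * t);
  }
  return forcelj * r2inv;
}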
- - ----------------------------------------------------------------------- - - USER-CUDA Package and associated modifications: - https://sourceforge.net/projects/lammpscuda/ - - Christian Trott, christian.trott@tu-ilmenau.de - Lars Winterfeld, lars.winterfeld@tu-ilmenau.de - Theoretical Physics II, University of Technology Ilmenau, Germany - - See the README file in the USER-CUDA directory. - - This software is distributed under the GNU General Public License. -------------------------------------------------------------------------- */ - -#include "cuda_shared.h" - -extern "C" void Cuda_PairLJSDKCoulCutCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,int eflag_atom,int vflag_atom); diff --git a/lib/cuda/pair_lj_sdk_coul_debye_cuda.cu b/lib/cuda/pair_lj_sdk_coul_debye_cuda.cu deleted file mode 100644 index 0f1fe1dbe0..0000000000 --- a/lib/cuda/pair_lj_sdk_coul_debye_cuda.cu +++ /dev/null @@ -1,80 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - - Original Version: - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - See the README file in the top-level LAMMPS directory. - - ----------------------------------------------------------------------- - - USER-CUDA Package and associated modifications: - https://sourceforge.net/projects/lammpscuda/ - - Christian Trott, christian.trott@tu-ilmenau.de - Lars Winterfeld, lars.winterfeld@tu-ilmenau.de - Theoretical Physics II, University of Technology Ilmenau, Germany - - See the README file in the USER-CUDA directory. - - This software is distributed under the GNU General Public License. -------------------------------------------------------------------------- */ - -#include - -#define _lj1 MY_AP(coeff1) -#define _lj2 MY_AP(coeff2) -#define _lj3 MY_AP(coeff3) -#define _lj4 MY_AP(coeff4) -#define _lj_type MY_AP(coeff5) - - -#include "pair_lj_sdk_coul_debye_cuda_cu.h" -#include - - - - -void Cuda_PairLJSDKCoulDebyeCuda_Init(cuda_shared_data* sdata) -{ - Cuda_Pair_Init_AllStyles(sdata, 5, true, false ); - -} - - - - -void Cuda_PairLJSDKCoulDebyeCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,int eflag_atom,int vflag_atom) -{ - - // initialize only on first call - static short init=0; - if(! 
init) - { - init = 1; - Cuda_PairLJSDKCoulDebyeCuda_Init(sdata); - } - - dim3 grid,threads; - int sharedperproc; - - Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc,true,128); - - cudaStream_t* streams = (cudaStream_t*) CudaWrapper_returnStreams(); - if(sdata->pair.use_block_per_atom) - Pair_Kernel_BpA - <<>> (eflag, vflag,eflag_atom,vflag_atom); - else - Pair_Kernel_TpA - <<>> (eflag, vflag,eflag_atom,vflag_atom); - - Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag); -} - -#undef _lj1 -#undef _lj2 -#undef _lj3 -#undef _lj4 -#undef _lj_type - diff --git a/lib/cuda/pair_lj_sdk_coul_debye_cuda_cu.h b/lib/cuda/pair_lj_sdk_coul_debye_cuda_cu.h deleted file mode 100644 index bee1825a17..0000000000 --- a/lib/cuda/pair_lj_sdk_coul_debye_cuda_cu.h +++ /dev/null @@ -1,26 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - - Original Version: - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - See the README file in the top-level LAMMPS directory. - - ----------------------------------------------------------------------- - - USER-CUDA Package and associated modifications: - https://sourceforge.net/projects/lammpscuda/ - - Christian Trott, christian.trott@tu-ilmenau.de - Lars Winterfeld, lars.winterfeld@tu-ilmenau.de - Theoretical Physics II, University of Technology Ilmenau, Germany - - See the README file in the USER-CUDA directory. - - This software is distributed under the GNU General Public License. -------------------------------------------------------------------------- */ - -#include "cuda_shared.h" - -extern "C" void Cuda_PairLJSDKCoulDebyeCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,int eflag_atom,int vflag_atom); diff --git a/lib/cuda/pair_lj_sdk_coul_long_cuda.cu b/lib/cuda/pair_lj_sdk_coul_long_cuda.cu deleted file mode 100644 index a71538f10c..0000000000 --- a/lib/cuda/pair_lj_sdk_coul_long_cuda.cu +++ /dev/null @@ -1,80 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - - Original Version: - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - See the README file in the top-level LAMMPS directory. - - ----------------------------------------------------------------------- - - USER-CUDA Package and associated modifications: - https://sourceforge.net/projects/lammpscuda/ - - Christian Trott, christian.trott@tu-ilmenau.de - Lars Winterfeld, lars.winterfeld@tu-ilmenau.de - Theoretical Physics II, University of Technology Ilmenau, Germany - - See the README file in the USER-CUDA directory. - - This software is distributed under the GNU General Public License. 
-------------------------------------------------------------------------- */ - -#include - -#define _lj1 MY_AP(coeff1) -#define _lj2 MY_AP(coeff2) -#define _lj3 MY_AP(coeff3) -#define _lj4 MY_AP(coeff4) -#define _lj_type MY_AP(coeff5) - - -#include "pair_lj_sdk_coul_long_cuda_cu.h" -#include - - - - -void Cuda_PairLJSDKCoulLongCuda_Init(cuda_shared_data* sdata) -{ - Cuda_Pair_Init_AllStyles(sdata, 5, true, false ); - -} - - - - -void Cuda_PairLJSDKCoulLongCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,int eflag_atom,int vflag_atom) -{ - - // initialize only on first call - static short init=0; - if(! init) - { - init = 1; - Cuda_PairLJSDKCoulLongCuda_Init(sdata); - } - - dim3 grid,threads; - int sharedperproc; - - Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc,true,128); - - cudaStream_t* streams = (cudaStream_t*) CudaWrapper_returnStreams(); - if(sdata->pair.use_block_per_atom) - Pair_Kernel_BpA - <<>> (eflag, vflag,eflag_atom,vflag_atom); - else - Pair_Kernel_TpA - <<>> (eflag, vflag,eflag_atom,vflag_atom); - - Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag); -} - -#undef _lj1 -#undef _lj2 -#undef _lj3 -#undef _lj4 -#undef _lj_type - diff --git a/lib/cuda/pair_lj_sdk_coul_long_cuda_cu.h b/lib/cuda/pair_lj_sdk_coul_long_cuda_cu.h deleted file mode 100644 index cc7979d186..0000000000 --- a/lib/cuda/pair_lj_sdk_coul_long_cuda_cu.h +++ /dev/null @@ -1,26 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - - Original Version: - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - See the README file in the top-level LAMMPS directory. - - ----------------------------------------------------------------------- - - USER-CUDA Package and associated modifications: - https://sourceforge.net/projects/lammpscuda/ - - Christian Trott, christian.trott@tu-ilmenau.de - Lars Winterfeld, lars.winterfeld@tu-ilmenau.de - Theoretical Physics II, University of Technology Ilmenau, Germany - - See the README file in the USER-CUDA directory. - - This software is distributed under the GNU General Public License. -------------------------------------------------------------------------- */ - -#include "cuda_shared.h" - -extern "C" void Cuda_PairLJSDKCoulLongCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,int eflag_atom,int vflag_atom); diff --git a/lib/cuda/pair_lj_sdk_cuda.cu b/lib/cuda/pair_lj_sdk_cuda.cu deleted file mode 100644 index aae7c76734..0000000000 --- a/lib/cuda/pair_lj_sdk_cuda.cu +++ /dev/null @@ -1,85 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - - Original Version: - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - See the README file in the top-level LAMMPS directory. - - ----------------------------------------------------------------------- - - USER-CUDA Package and associated modifications: - https://sourceforge.net/projects/lammpscuda/ - - Christian Trott, christian.trott@tu-ilmenau.de - Lars Winterfeld, lars.winterfeld@tu-ilmenau.de - Theoretical Physics II, University of Technology Ilmenau, Germany - - See the README file in the USER-CUDA directory. - - This software is distributed under the GNU General Public License. 
-------------------------------------------------------------------------- */ - -#include - -#define _lj1 MY_AP(coeff1) -#define _lj2 MY_AP(coeff2) -#define _lj3 MY_AP(coeff3) -#define _lj4 MY_AP(coeff4) -#define _lj_type MY_AP(coeff5) - - enum {CG_NOT_SET=0, CG_LJ9_6, CG_LJ12_4, CG_LJ12_6, NUM_CG_TYPES, - CG_COUL_NONE, CG_COUL_CUT, CG_COUL_DEBYE, CG_COUL_LONG}; - -#include "pair_lj_sdk_cuda_cu.h" -#include "pair_lj_sdk_cuda_kernel_nc.cu" -#include - - - - -void Cuda_PairLJSDKCuda_Init(cuda_shared_data* sdata) -{ - Cuda_Pair_Init_AllStyles(sdata, 5, false, false ); - -} - - - - -void Cuda_PairLJSDKCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,int eflag_atom,int vflag_atom) -{ - - // initialize only on first call - static short init=0; - if(! init) - { - init = 1; - Cuda_PairLJSDKCuda_Init(sdata); - } - - dim3 grid,threads; - int sharedperproc; - - int maxthreads=128; - - Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc,false,maxthreads); - - cudaStream_t* streams = (cudaStream_t*) CudaWrapper_returnStreams(); - if(sdata->pair.use_block_per_atom) - Pair_Kernel_BpA - <<>> (eflag, vflag,eflag_atom,vflag_atom); - else - Pair_Kernel_TpA - <<>> (eflag, vflag,eflag_atom,vflag_atom); - - Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag); -} - -#undef _lj1 -#undef _lj2 -#undef _lj3 -#undef _lj4 -#undef _lj_type - diff --git a/lib/cuda/pair_lj_sdk_cuda_cu.h b/lib/cuda/pair_lj_sdk_cuda_cu.h deleted file mode 100644 index 5cb7ea2153..0000000000 --- a/lib/cuda/pair_lj_sdk_cuda_cu.h +++ /dev/null @@ -1,26 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - - Original Version: - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - See the README file in the top-level LAMMPS directory. - - ----------------------------------------------------------------------- - - USER-CUDA Package and associated modifications: - https://sourceforge.net/projects/lammpscuda/ - - Christian Trott, christian.trott@tu-ilmenau.de - Lars Winterfeld, lars.winterfeld@tu-ilmenau.de - Theoretical Physics II, University of Technology Ilmenau, Germany - - See the README file in the USER-CUDA directory. - - This software is distributed under the GNU General Public License. -------------------------------------------------------------------------- */ - -#include "cuda_shared.h" - -extern "C" void Cuda_PairLJSDKCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,int eflag_atom,int vflag_atom); diff --git a/lib/cuda/pair_lj_sdk_cuda_kernel_nc.cu b/lib/cuda/pair_lj_sdk_cuda_kernel_nc.cu deleted file mode 100644 index 1338d775e6..0000000000 --- a/lib/cuda/pair_lj_sdk_cuda_kernel_nc.cu +++ /dev/null @@ -1,48 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - - Original Version: - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - See the README file in the top-level LAMMPS directory. 
- - ----------------------------------------------------------------------- - - USER-CUDA Package and associated modifications: - https://sourceforge.net/projects/lammpscuda/ - - Christian Trott, christian.trott@tu-ilmenau.de - Lars Winterfeld, lars.winterfeld@tu-ilmenau.de - Theoretical Physics II, University of Technology Ilmenau, Germany - - See the README file in the USER-CUDA directory. - - This software is distributed under the GNU General Public License. -------------------------------------------------------------------------- */ - -__device__ inline F_FLOAT PairLJSDKCuda_Eval(const F_FLOAT& rsq,const int ij_type,F_FLOAT& factor_lj,int& eflag, ENERGY_FLOAT& evdwl) //0.11 of 0.4 -{ - const F_FLOAT r2inv = F_F(1.0)/rsq; - const int lj_type = _lj_type[ij_type]; - const F_FLOAT r4inv = r2inv*r2inv; - const F_FLOAT rNinv_first = lj_type!=CG_LJ9_6?r4inv:_RSQRT_(rsq); - const F_FLOAT rNinv_second = lj_type!=CG_LJ12_4?-r2inv:-F_F(1.0); - const F_FLOAT forcelj = r4inv * (_lj1[ij_type]*r4inv*rNinv_first + _lj2[ij_type]*rNinv_second); - - if(eflag) evdwl += factor_lj*(r4inv*(_lj3[ij_type]*r4inv*rNinv_first+_lj4[ij_type]*rNinv_second) - _offset[ij_type]); - return factor_lj*forcelj*r2inv; -} - -/*__device__ inline F_FLOAT PairLJSDKCuda_Eval(const F_FLOAT& rsq,const int ij_type,F_FLOAT& factor_lj,int& eflag, ENERGY_FLOAT& evdwl) -{ - const int lj_type = tex1Dfetch(_coeff5_gm_tex,ij_type); - const F_FLOAT r2inv = F_F(1.0)/rsq; - const F_FLOAT r4inv = r2inv*r2inv; - const F_FLOAT rNinv_first = lj_type!=CG_LJ9_6?r4inv:_RSQRT_(rsq); - const F_FLOAT rNinv_second = lj_type!=CG_LJ12_4?r2inv:F_F(1.0); - const F_FLOAT forcelj = r4inv * (tex1Dfetch(_coeff1_gm_tex,ij_type)*r4inv*rNinv_first - tex1Dfetch(_coeff2_gm_tex,ij_type)*rNinv_second); - - if(eflag) evdwl += factor_lj*(r4inv*(tex1Dfetch(_coeff3_gm_tex,ij_type)*r4inv*rNinv_first-tex1Dfetch(_coeff4_gm_tex,ij_type)*rNinv_second)); - return factor_lj*forcelj*r2inv; -}*/ diff --git a/lib/cuda/pair_lj_smooth_cuda.cu b/lib/cuda/pair_lj_smooth_cuda.cu deleted file mode 100644 index 5723ffc94c..0000000000 --- a/lib/cuda/pair_lj_smooth_cuda.cu +++ /dev/null @@ -1,83 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - - Original Version: - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - See the README file in the top-level LAMMPS directory. - - ----------------------------------------------------------------------- - - USER-CUDA Package and associated modifications: - https://sourceforge.net/projects/lammpscuda/ - - Christian Trott, christian.trott@tu-ilmenau.de - Lars Winterfeld, lars.winterfeld@tu-ilmenau.de - Theoretical Physics II, University of Technology Ilmenau, Germany - - See the README file in the USER-CUDA directory. - - This software is distributed under the GNU General Public License. 
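The two rNinv factors in PairLJSDKCuda_Eval above fold the three SDK coarse-grain exponent pairs into one branch-free expression. Written out per type (a scalar reference using the CG_* enum from pair_lj_sdk_cuda.cu; the caller still multiplies by r2inv and factor_lj):

#include <math.h>

/* rNinv selection in PairLJSDKCuda_Eval:
     CG_LJ9_6 :  first = r^-1, second = -r^-2  ->  lj1*r^-9  - lj2*r^-6
     CG_LJ12_4:  first = r^-4, second = -1     ->  lj1*r^-12 - lj2*r^-4
     CG_LJ12_6:  first = r^-4, second = -r^-2  ->  lj1*r^-12 - lj2*r^-6 */
double lj_sdk_forcelj(double rsq, int lj_type, double lj1, double lj2)
{
  double r2inv = 1.0 / rsq;
  double r4inv = r2inv * r2inv;
  double first  = (lj_type != CG_LJ9_6)  ? r4inv  : 1.0 / sqrt(rsq);
  double second = (lj_type != CG_LJ12_4) ? -r2inv : -1.0;
  return r4inv * (lj1 * r4inv * first + lj2 * second);
}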
-------------------------------------------------------------------------- */ - -#include - -#define _lj1 MY_AP(coeff1_gm) -#define _lj2 MY_AP(coeff2_gm) -#define _lj3 MY_AP(coeff3_gm) -#define _lj4 MY_AP(coeff4_gm) -#define _ljsw1 MY_AP(coeff5_gm) -#define _ljsw2 MY_AP(coeff6_gm) -#define _ljsw3 MY_AP(coeff7_gm) -#define _ljsw4 MY_AP(coeff8_gm) -#define _ljsw0 MY_AP(coeff9_gm) - -#include "pair_lj_smooth_cuda_cu.h" -#include "pair_lj_smooth_cuda_kernel_nc.cu" - -#include - -void Cuda_PairLJSmoothCuda_Init(cuda_shared_data* sdata) -{ - Cuda_Pair_Init_AllStyles(sdata, 9,false,true,true); -} - - - -void Cuda_PairLJSmoothCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, - int eflag_atom,int vflag_atom) -{ - // initialize only on first call - static short init=0; - if(! init) - { - init = 1; - Cuda_PairLJSmoothCuda_Init(sdata); - } - - dim3 grid,threads; - int sharedperproc; - - Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc,false,192); - - cudaStream_t* streams = (cudaStream_t*) CudaWrapper_returnStreams(); - if(sdata->pair.use_block_per_atom) - Pair_Kernel_BpA - <<>> (eflag, vflag,eflag_atom,vflag_atom); - else - Pair_Kernel_TpA - <<>> (eflag, vflag,eflag_atom,vflag_atom); - - Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag); -} - -#undef _lj1 -#undef _lj2 -#undef _lj3 -#undef _lj4 -#undef _ljsw1 -#undef _ljsw2 -#undef _ljsw3 -#undef _ljsw4 -#undef _ljsw0 diff --git a/lib/cuda/pair_lj_smooth_cuda_cu.h b/lib/cuda/pair_lj_smooth_cuda_cu.h deleted file mode 100644 index 504cf19f98..0000000000 --- a/lib/cuda/pair_lj_smooth_cuda_cu.h +++ /dev/null @@ -1,26 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - - Original Version: - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - See the README file in the top-level LAMMPS directory. - - ----------------------------------------------------------------------- - - USER-CUDA Package and associated modifications: - https://sourceforge.net/projects/lammpscuda/ - - Christian Trott, christian.trott@tu-ilmenau.de - Lars Winterfeld, lars.winterfeld@tu-ilmenau.de - Theoretical Physics II, University of Technology Ilmenau, Germany - - See the README file in the USER-CUDA directory. - - This software is distributed under the GNU General Public License. -------------------------------------------------------------------------- */ - -#include "cuda_shared.h" - -extern "C" void Cuda_PairLJSmoothCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,int eflag_atom,int vflag_atom); diff --git a/lib/cuda/pair_lj_smooth_cuda_kernel_nc.cu b/lib/cuda/pair_lj_smooth_cuda_kernel_nc.cu deleted file mode 100644 index bcac8bf88a..0000000000 --- a/lib/cuda/pair_lj_smooth_cuda_kernel_nc.cu +++ /dev/null @@ -1,66 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - - Original Version: - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - See the README file in the top-level LAMMPS directory. 
- - ----------------------------------------------------------------------- - - USER-CUDA Package and associated modifications: - https://sourceforge.net/projects/lammpscuda/ - - Christian Trott, christian.trott@tu-ilmenau.de - Lars Winterfeld, lars.winterfeld@tu-ilmenau.de - Theoretical Physics II, University of Technology Ilmenau, Germany - - See the README file in the USER-CUDA directory. - - This software is distributed under the GNU General Public License. -------------------------------------------------------------------------- */ - -__device__ inline F_FLOAT PairLJSmoothCuda_Eval(const F_FLOAT& rsq,const int ij_type,F_FLOAT& factor_lj,int& eflag, ENERGY_FLOAT& evdwl) -{ - F_FLOAT fskin,t,tsq,forcelj; - const F_FLOAT r2inv = F_F(1.0)/rsq; - const F_FLOAT r = _RSQRT_(r2inv); - const F_FLOAT r6inv = r2inv*r2inv*r2inv; - - - X_FLOAT cut_lj_innersq=(_cut_innersq_global > X_F(0.0)? _cut_innersq_global : _cut_innersq[ij_type]); - if (rsq < cut_lj_innersq) - { - forcelj = r6inv * (_lj1[ij_type]*r6inv - _lj2[ij_type]); - } - else - { - t = r - _SQRT_(cut_lj_innersq); - tsq = t*t; - fskin = _ljsw1[ij_type] + _ljsw2[ij_type]*t + - _ljsw3[ij_type]*tsq + _ljsw4[ij_type]*tsq*t; - forcelj = fskin*r; - - } - - if (eflag) - { - ENERGY_FLOAT evdwl_tmp; - - if (rsq < cut_lj_innersq) - { - evdwl_tmp = r6inv*(_lj3[ij_type]*r6inv-_lj4[ij_type]) - - _offset[ij_type]; - } - else - { - evdwl_tmp = _ljsw0[ij_type] - _ljsw1[ij_type]*t - - _ljsw2[ij_type]*tsq*F_F(0.5) - _ljsw3[ij_type]*tsq*t*(F_F(1.0)/F_F(3.0)) - - _ljsw4[ij_type]*tsq*tsq*(F_F(1.0)/F_F(4.0)) - _offset[ij_type]; - } - - evdwl+=evdwl_tmp*factor_lj; - } - return factor_lj*forcelj * r2inv; -} diff --git a/lib/cuda/pair_manybody_const.h b/lib/cuda/pair_manybody_const.h deleted file mode 100644 index 69bf32aead..0000000000 --- a/lib/cuda/pair_manybody_const.h +++ /dev/null @@ -1,16 +0,0 @@ -/* - * pair_manybody_const.h - * - * Created on: Oct 11, 2011 - * Author: chmu-tph - */ - -#define MANYBODY_NPAIR 3 - -__device__ __constant__ int elem2param[(MANYBODY_NPAIR+1)*(MANYBODY_NPAIR+1)*(MANYBODY_NPAIR+1)]; -__device__ __constant__ int nelements; -__device__ __constant__ int map[MANYBODY_NPAIR+2]; -__device__ __constant__ int* _glob_numneigh_red; //number of neighbors within force cutoff (as opposed to neighbor cutoff) -__device__ __constant__ int* _glob_neighbors_red; //indices of neighbors within force cutoff -__device__ __constant__ int* _glob_neightype_red; //type of neighbors within force cutoff - diff --git a/lib/cuda/pair_morse_coul_long_cuda.cu b/lib/cuda/pair_morse_coul_long_cuda.cu deleted file mode 100644 index cb226b58f4..0000000000 --- a/lib/cuda/pair_morse_coul_long_cuda.cu +++ /dev/null @@ -1,79 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - - Original Version: - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - See the README file in the top-level LAMMPS directory. - - ----------------------------------------------------------------------- - - USER-CUDA Package and associated modifications: - https://sourceforge.net/projects/lammpscuda/ - - Christian Trott, christian.trott@tu-ilmenau.de - Lars Winterfeld, lars.winterfeld@tu-ilmenau.de - Theoretical Physics II, University of Technology Ilmenau, Germany - - See the README file in the USER-CUDA directory. - - This software is distributed under the GNU General Public License. 
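In PairLJSmoothCuda_Eval above, the switched-region energy is, term by term, minus the antiderivative of the force polynomial, which keeps energy and force continuous at the inner cutoff (the kernel additionally subtracts the per-type _offset):

/* For t = r - r_inner inside the switching region:
     fskin(t) = ljsw1 + ljsw2*t + ljsw3*t^2 + ljsw4*t^3
     E(t)     = ljsw0 - ljsw1*t - ljsw2*t^2/2 - ljsw3*t^3/3 - ljsw4*t^4/4
   so dE/dt = -fskin(t), i.e. F = -dE/dr holds across the switch point. */
double lj_smooth_switched_energy(double t, double ljsw0, double ljsw1,
                                 double ljsw2, double ljsw3, double ljsw4)
{
  return ljsw0 - ljsw1 * t - ljsw2 * t * t / 2.0
               - ljsw3 * t * t * t / 3.0
               - ljsw4 * t * t * t * t / 4.0;
}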
-------------------------------------------------------------------------- */ - -#include - -#define _r0 MY_AP(coeff1) -#define _alpha MY_AP(coeff2) -#define _morse1 MY_AP(coeff3) -#define _d0 MY_AP(coeff4) -#define _c0 MY_AP(coeff5) - -#include "pair_morse_coul_long_cuda_cu.h" -#include "pair_morse_coul_long_cuda_kernel_nc.cu" - -#include - -void Cuda_PairMorseCoulLongCuda_Init(cuda_shared_data* sdata) -{ - Cuda_Pair_Init_AllStyles(sdata, 5,true); -} - -void Cuda_PairMorseCoulLongCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,int eflag_atom,int vflag_atom) -{ - - - static short init=0; - if(! init) - { - init = 1; - Cuda_PairMorseCoulLongCuda_Init(sdata); - } - - dim3 grid,threads; - int sharedperproc; - - - Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc,true,192); - - cudaStream_t* streams = (cudaStream_t*) CudaWrapper_returnStreams(); - - if(sdata->pair.use_block_per_atom) - Pair_Kernel_BpA - <<>> (eflag, vflag,eflag_atom,vflag_atom); - else - Pair_Kernel_TpA - <<>> (eflag, vflag,eflag_atom,vflag_atom); - - Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag); -} - - - -#undef _r0 -#undef _alpha -#undef _morse1 -#undef _d0 -#undef _c0 - diff --git a/lib/cuda/pair_morse_coul_long_cuda_cu.h b/lib/cuda/pair_morse_coul_long_cuda_cu.h deleted file mode 100644 index 63055289f4..0000000000 --- a/lib/cuda/pair_morse_coul_long_cuda_cu.h +++ /dev/null @@ -1,30 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - - Original Version: - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - See the README file in the top-level LAMMPS directory. - - ----------------------------------------------------------------------- - - USER-CUDA Package and associated modifications: - https://sourceforge.net/projects/lammpscuda/ - - Christian Trott, christian.trott@tu-ilmenau.de - Lars Winterfeld, lars.winterfeld@tu-ilmenau.de - Theoretical Physics II, University of Technology Ilmenau, Germany - - See the README file in the USER-CUDA directory. - - This software is distributed under the GNU General Public License. -------------------------------------------------------------------------- */ - -#include "cuda_shared.h" - -#ifdef CUDA_USE_BINNING -extern "C" void Cuda_PairMorseCoulLongCuda(cuda_shared_data* sdata, int eflag, int vflag); -#else -extern "C" void Cuda_PairMorseCoulLongCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,int eflag_atom,int vflag_atom); -#endif diff --git a/lib/cuda/pair_morse_coul_long_cuda_kernel_nc.cu b/lib/cuda/pair_morse_coul_long_cuda_kernel_nc.cu deleted file mode 100644 index b367914a78..0000000000 --- a/lib/cuda/pair_morse_coul_long_cuda_kernel_nc.cu +++ /dev/null @@ -1,33 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - - Original Version: - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - See the README file in the top-level LAMMPS directory. 
- - ----------------------------------------------------------------------- - - USER-CUDA Package and associated modifications: - https://sourceforge.net/projects/lammpscuda/ - - Christian Trott, christian.trott@tu-ilmenau.de - Lars Winterfeld, lars.winterfeld@tu-ilmenau.de - Theoretical Physics II, University of Technology Ilmenau, Germany - - See the README file in the USER-CUDA directory. - - This software is distributed under the GNU General Public License. -------------------------------------------------------------------------- */ -__device__ inline F_FLOAT PairMorseR6Cuda_Eval(const F_FLOAT& rsq,const int ij_type,F_FLOAT& factor_lj,int& eflag, ENERGY_FLOAT& evdwl) -{ - const F_FLOAT r2inv = F_F(1.0)/rsq; - const F_FLOAT r = _SQRT_(rsq); - const F_FLOAT r4inv = r2inv*r2inv; - const F_FLOAT dr = r-_r0[ij_type]; - const F_FLOAT dexp = _EXP_(-_alpha[ij_type]*dr); - if(eflag) evdwl += factor_lj*(_d0[ij_type]*(dexp*dexp-F_F(2.0)*dexp) + _c0[ij_type]*r4inv*r4inv*r4inv - - _offset[ij_type]); - return factor_lj*(_morse1[ij_type]*(dexp*dexp-dexp)*(F_F(1.0)/r)- F_F(12.0)*_c0[ij_type]*r4inv*r4inv*r4inv*r2inv); -} diff --git a/lib/cuda/pair_morse_cuda.cu b/lib/cuda/pair_morse_cuda.cu deleted file mode 100644 index d33ac842d3..0000000000 --- a/lib/cuda/pair_morse_cuda.cu +++ /dev/null @@ -1,77 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - - Original Version: - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - See the README file in the top-level LAMMPS directory. - - ----------------------------------------------------------------------- - - USER-CUDA Package and associated modifications: - https://sourceforge.net/projects/lammpscuda/ - - Christian Trott, christian.trott@tu-ilmenau.de - Lars Winterfeld, lars.winterfeld@tu-ilmenau.de - Theoretical Physics II, University of Technology Ilmenau, Germany - - See the README file in the USER-CUDA directory. - - This software is distributed under the GNU General Public License. -------------------------------------------------------------------------- */ - -#include - -#define _r0 MY_AP(coeff1) -#define _alpha MY_AP(coeff2) -#define _morse1 MY_AP(coeff3) -#define _d0 MY_AP(coeff4) - -#include "pair_morse_cuda_cu.h" -#include "pair_morse_cuda_kernel_nc.cu" -#include - - - -void Cuda_PairMorseCuda_Init(cuda_shared_data* sdata) -{ - Cuda_Pair_Init_AllStyles(sdata, 4); -} - - - - -void Cuda_PairMorseCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,int eflag_atom,int vflag_atom) -{ - - // initialize only on first call - static short init=0; - if(! 
init) - { - init = 1; - Cuda_PairMorseCuda_Init(sdata); - } - - dim3 grid,threads; - int sharedperproc; - - Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc,false,256); - - cudaStream_t* streams = (cudaStream_t*) CudaWrapper_returnStreams(); - if(sdata->pair.use_block_per_atom) - Pair_Kernel_BpA - <<>> (eflag, vflag,eflag_atom,vflag_atom); - else - Pair_Kernel_TpA - <<>> (eflag, vflag,eflag_atom,vflag_atom); - - Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag); -} - -#undef _r0 -#undef _alpha -#undef _morse1 -#undef _d0 - - diff --git a/lib/cuda/pair_morse_cuda_cu.h b/lib/cuda/pair_morse_cuda_cu.h deleted file mode 100644 index 2cfe350458..0000000000 --- a/lib/cuda/pair_morse_cuda_cu.h +++ /dev/null @@ -1,26 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - - Original Version: - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - See the README file in the top-level LAMMPS directory. - - ----------------------------------------------------------------------- - - USER-CUDA Package and associated modifications: - https://sourceforge.net/projects/lammpscuda/ - - Christian Trott, christian.trott@tu-ilmenau.de - Lars Winterfeld, lars.winterfeld@tu-ilmenau.de - Theoretical Physics II, University of Technology Ilmenau, Germany - - See the README file in the USER-CUDA directory. - - This software is distributed under the GNU General Public License. -------------------------------------------------------------------------- */ - -#include "cuda_shared.h" - -extern "C" void Cuda_PairMorseCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,int eflag_atom,int vflag_atom); diff --git a/lib/cuda/pair_morse_cuda_kernel_nc.cu b/lib/cuda/pair_morse_cuda_kernel_nc.cu deleted file mode 100644 index ead1c54fb2..0000000000 --- a/lib/cuda/pair_morse_cuda_kernel_nc.cu +++ /dev/null @@ -1,32 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - - Original Version: - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - See the README file in the top-level LAMMPS directory. - - ----------------------------------------------------------------------- - - USER-CUDA Package and associated modifications: - https://sourceforge.net/projects/lammpscuda/ - - Christian Trott, christian.trott@tu-ilmenau.de - Lars Winterfeld, lars.winterfeld@tu-ilmenau.de - Theoretical Physics II, University of Technology Ilmenau, Germany - - See the README file in the USER-CUDA directory. - - This software is distributed under the GNU General Public License. 
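The evaluator reproduced after this header returns the standard force-over-distance factor for the Morse potential. With morse1 = 2*alpha*d0 precomputed host-side (as in the corresponding CPU pair style), the algebra behind it:

#include <math.h>

/* Scalar reference for PairMorseCuda_Eval below.  With
   dexp = exp(-alpha*(r - r0)):
     E(r) = d0*(dexp*dexp - 2*dexp)
     F(r) = -dE/dr = 2*alpha*d0*(dexp*dexp - dexp) = morse1*(dexp*dexp - dexp)
   and the kernel returns fpair = F(r)/r (factor_lj applied by the caller). */
double morse_fpair(double rsq, double r0, double alpha, double morse1)
{
  double r = sqrt(rsq);
  double dexp = exp(-alpha * (r - r0));
  return morse1 * (dexp * dexp - dexp) / r;
}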
-------------------------------------------------------------------------- */ -__device__ inline F_FLOAT PairMorseCuda_Eval(const F_FLOAT& rsq,const int ij_type,F_FLOAT& factor_lj,int& eflag, ENERGY_FLOAT& evdwl) -{ - const F_FLOAT r = _SQRT_(rsq); - const F_FLOAT dr = r-_r0[ij_type]; - const F_FLOAT dexp = _EXP_(-_alpha[ij_type]*dr); - if(eflag) evdwl += factor_lj*(_d0[ij_type]*(dexp*dexp-F_F(2.0)*dexp) - - _offset[ij_type]); - return factor_lj*_morse1[ij_type]*(dexp*dexp-dexp)*(F_F(1.0)/r); -} - diff --git a/lib/cuda/pair_sw_cuda.cu b/lib/cuda/pair_sw_cuda.cu deleted file mode 100644 index 491d4d666f..0000000000 --- a/lib/cuda/pair_sw_cuda.cu +++ /dev/null @@ -1,139 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - - Original Version: - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - See the README file in the top-level LAMMPS directory. - - ----------------------------------------------------------------------- - - USER-CUDA Package and associated modifications: - https://sourceforge.net/projects/lammpscuda/ - - Christian Trott, christian.trott@tu-ilmenau.de - Lars Winterfeld, lars.winterfeld@tu-ilmenau.de - Theoretical Physics II, University of Technology Ilmenau, Germany - - See the README file in the USER-CUDA directory. - - This software is distributed under the GNU General Public License. -------------------------------------------------------------------------- */ - -#include - -#include "pair_sw_cuda_cu.h" -__device__ __constant__ ParamSW_Float params_sw[MANYBODY_NPAIR* MANYBODY_NPAIR* MANYBODY_NPAIR]; - -#include "pair_sw_cuda_kernel_nc.cu" - -#include - - -void Cuda_PairSWCuda_Init(cuda_shared_data* sdata, ParamSW_Float* params_host, void* map_host, void* elem2param_host, int nelements_h) -{ - unsigned cuda_ntypes = sdata->atom.ntypes + 1; - X_FLOAT box_size[3] = { - sdata->domain.subhi[0] - sdata->domain.sublo[0], - sdata->domain.subhi[1] - sdata->domain.sublo[1], - sdata->domain.subhi[2] - sdata->domain.sublo[2] - }; - - cudaMemcpyToSymbol(MY_AP(box_size) , box_size , sizeof(X_FLOAT) * 3); - cudaMemcpyToSymbol(MY_AP(cuda_ntypes) , &cuda_ntypes , sizeof(unsigned)); - cudaMemcpyToSymbol(MY_AP(virial) , &sdata->pair.virial.dev_data , sizeof(ENERGY_FLOAT*)); - cudaMemcpyToSymbol(MY_AP(eng_vdwl) , &sdata->pair.eng_vdwl.dev_data , sizeof(ENERGY_FLOAT*)); - cudaMemcpyToSymbol(MY_AP(periodicity) , sdata->domain.periodicity , sizeof(int) * 3); - cudaMemcpyToSymbol(MY_AP(collect_forces_later), &sdata->pair.collect_forces_later , sizeof(int)); - cudaMemcpyToSymbol(params_sw, params_host , sizeof(ParamSW_Float)*nelements_h * nelements_h * nelements_h); - cudaMemcpyToSymbol(elem2param, elem2param_host , sizeof(int)*nelements_h * nelements_h * nelements_h); - cudaMemcpyToSymbol(map, map_host , sizeof(int)*cuda_ntypes); - cudaMemcpyToSymbol(nelements, &nelements_h, sizeof(int)); -} - -void Cuda_PairSWCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom) -{ - static int glob_ij_size = 0; - static F_FLOAT4* glob_r_ij = NULL; - static int* glob_numneigh_red = NULL; - static int* glob_neighbors_red = NULL; - static int* glob_neightype_red = NULL; - - if(glob_ij_size < sdata->atom.nall * sneighlist->maxneighbors * sizeof(F_FLOAT)) { - glob_ij_size = sdata->atom.nall * sneighlist->maxneighbors * sizeof(F_FLOAT); - cudaFree(glob_r_ij); - cudaFree(glob_numneigh_red); - 
cudaFree(glob_neighbors_red); - cudaFree(glob_neightype_red); - cudaMalloc(&glob_r_ij, glob_ij_size * 4); - cudaMalloc(&glob_numneigh_red, sdata->atom.nall * sizeof(int)); - cudaMalloc(&glob_neighbors_red, sdata->atom.nall * sneighlist->maxneighbors * sizeof(int)); - cudaMalloc(&glob_neightype_red, sdata->atom.nall * sneighlist->maxneighbors * sizeof(int)); - cudaMemcpyToSymbol(_glob_numneigh_red, &glob_numneigh_red , sizeof(int*)); - cudaMemcpyToSymbol(_glob_neighbors_red, &glob_neighbors_red , sizeof(int*)); - cudaMemcpyToSymbol(_glob_neightype_red, &glob_neightype_red , sizeof(int*)); - cudaMemcpyToSymbol(_glob_r_ij, &glob_r_ij , sizeof(F_FLOAT4*)); - } - - dim3 grid, threads; - int sharedperproc; - - Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc, false, 64); - cudaStream_t* streams = (cudaStream_t*) CudaWrapper_returnStreams(); - - - - dim3 grid2; - - if(sdata->atom.nall <= 256 * 64000) { - grid2.x = (sdata->atom.nall + 255) / 256; - grid2.y = 1; - } else { - grid2.x = (sdata->atom.nall + 256 * 128 - 1) / (256 * 128); - grid2.y = 128; - } - - grid2.z = 1; - dim3 threads2; - threads2.x = 256; - threads2.y = 1; - threads2.z = 1; - - timespec time1, time2; - - //pre-calculate all neighbordistances and zeta_ij - clock_gettime(CLOCK_REALTIME, &time1); - Pair_SW_Kernel_TpA_RIJ <<< grid2, threads2, 0, streams[1]>>>(); - cudaThreadSynchronize(); - clock_gettime(CLOCK_REALTIME, &time2); - sdata->cuda_timings.test1 += - time2.tv_sec - time1.tv_sec + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000; - clock_gettime(CLOCK_REALTIME, &time1); - - //actual force calculation - unsigned int sharedsize = (sharedperproc * sizeof(ENERGY_FLOAT) + 4 * sizeof(F_FLOAT)) * threads.x; //extra 4 floats per thread used to reduce register pressure - - if(eflag) { - if(vflag) - Pair_SW_Kernel_TpA<1, 1> <<< grid, threads, sharedsize, streams[1]>>> - (eflag_atom, vflag_atom); - else - Pair_SW_Kernel_TpA<1, 0> <<< grid, threads, sharedsize, streams[1]>>> - (eflag_atom, vflag_atom); - } else { - if(vflag) - Pair_SW_Kernel_TpA<0, 1> <<< grid, threads, sharedsize, streams[1]>>> - (eflag_atom, vflag_atom); - else - Pair_SW_Kernel_TpA<0, 0> <<< grid, threads, sharedsize, streams[1]>>> - (eflag_atom, vflag_atom); - } - cudaThreadSynchronize(); - clock_gettime(CLOCK_REALTIME, &time2); - sdata->cuda_timings.test2 += - time2.tv_sec - time1.tv_sec + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000; - - Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag); -} - diff --git a/lib/cuda/pair_sw_cuda_cu.h b/lib/cuda/pair_sw_cuda_cu.h deleted file mode 100644 index 24e92689ff..0000000000 --- a/lib/cuda/pair_sw_cuda_cu.h +++ /dev/null @@ -1,39 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - - Original Version: - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - See the README file in the top-level LAMMPS directory. - - ----------------------------------------------------------------------- - - USER-CUDA Package and associated modifications: - https://sourceforge.net/projects/lammpscuda/ - - Christian Trott, christian.trott@tu-ilmenau.de - Lars Winterfeld, lars.winterfeld@tu-ilmenau.de - Theoretical Physics II, University of Technology Ilmenau, Germany - - See the README file in the USER-CUDA directory. - - This software is distributed under the GNU General Public License. 
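Cuda_PairSWCuda above sizes a second launch geometry, grid2, for the per-atom precompute kernel by hand; the same arithmetic reappears in Cuda_PairTersoffCuda below. Factored out (dim3 comes from the CUDA runtime; the 2-D split works around the 65535-block cap per grid dimension on the GPUs of that era):

/* Launch-geometry helper mirroring the grid2 setup: 256 threads per block
   covering all nall atoms, split over a second grid dimension once one
   dimension alone would need more than 64000 blocks. */
dim3 make_precompute_grid(int nall)
{
  dim3 grid2;
  if (nall <= 256 * 64000) {
    grid2.x = (nall + 255) / 256;                   /* ceil(nall / 256) */
    grid2.y = 1;
  } else {
    grid2.x = (nall + 256 * 128 - 1) / (256 * 128);
    grid2.y = 128;
  }
  grid2.z = 1;
  return grid2;
}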
-------------------------------------------------------------------------- */ - -#include "cuda_shared.h" - - struct ParamSW_Float { - F_FLOAT epsilon,sigma; - F_FLOAT littlea,lambda,gamma,costheta; - F_FLOAT biga,bigb; - F_FLOAT powerp,powerq; - F_FLOAT tol; - F_FLOAT cut,cutsq; - F_FLOAT sigma_gamma,lambda_epsilon,lambda_epsilon2; - F_FLOAT c1,c2,c3,c4,c5,c6; - int ielement,jelement,kelement; - }; - -extern "C" void Cuda_PairSWCuda_Init(cuda_shared_data* sdata,ParamSW_Float* params_host,void* map_host, void* elem2param_host,int nelements_h); -extern "C" void Cuda_PairSWCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,int eflag_atom,int vflag_atom); diff --git a/lib/cuda/pair_sw_cuda_kernel_nc.cu b/lib/cuda/pair_sw_cuda_kernel_nc.cu deleted file mode 100644 index e84dcd53b6..0000000000 --- a/lib/cuda/pair_sw_cuda_kernel_nc.cu +++ /dev/null @@ -1,450 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - - Original Version: - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - See the README file in the top-level LAMMPS directory. - - ----------------------------------------------------------------------- - - USER-CUDA Package and associated modifications: - https://sourceforge.net/projects/lammpscuda/ - - Christian Trott, christian.trott@tu-ilmenau.de - Lars Winterfeld, lars.winterfeld@tu-ilmenau.de - Theoretical Physics II, University of Technology Ilmenau, Germany - - See the README file in the USER-CUDA directory. - - This software is distributed under the GNU General Public License. -------------------------------------------------------------------------- */ -#define Pi F_F(3.1415926535897932384626433832795) -#define PI Pi -#define PI2 F_F(0.5)*Pi -#define PI4 F_F(0.25)*Pi - - - -__device__ void twobody(int iparam, F_FLOAT rsq, F_FLOAT &fforce, - int eflag, ENERGY_FLOAT &eng) -{ - F_FLOAT r,rp,rq,rainv,expsrainv; - - r = sqrt(rsq); - rp = pow(r,-params_sw[iparam].powerp); - rq = pow(r,-params_sw[iparam].powerq); - rainv = 1.0 / (r - params_sw[iparam].cut); - expsrainv = exp(params_sw[iparam].sigma * rainv); - fforce = (params_sw[iparam].c1*rp - params_sw[iparam].c2*rq + - (params_sw[iparam].c3*rp -params_sw[iparam].c4*rq) * rainv*rainv*r) * expsrainv / rsq; - if (eflag) eng += (params_sw[iparam].c5*rp - params_sw[iparam].c6*rq) * expsrainv; -} - -__device__ void threebody(int paramij, int paramik, int paramijk, - F_FLOAT4& delr1, - F_FLOAT4& delr2, - F_FLOAT3& fj, F_FLOAT3& fk, int eflag,ENERGY_FLOAT &eng) -{ - F_FLOAT r1,rinvsq1,rainv1,gsrainv1,gsrainvsq1,expgsrainv1; - F_FLOAT r2,rinvsq2,rainv2,gsrainv2,gsrainvsq2,expgsrainv2; - F_FLOAT rinv12,cs,delcs,delcssq,facexp,facrad,frad1,frad2; - F_FLOAT facang,facang12,csfacang,csfac1,csfac2; - - r1 = sqrt(delr1.w); - rinvsq1 = F_F(1.0)/delr1.w; - rainv1 = F_F(1.0)/(r1 - params_sw[paramij].cut); - gsrainv1 = params_sw[paramij].sigma_gamma * rainv1; - gsrainvsq1 = gsrainv1*rainv1/r1; - expgsrainv1 = exp(gsrainv1); - - r2 = sqrt(delr2.w); - rinvsq2 = F_F(1.0)/delr2.w; - rainv2 = F_F(1.0)/(r2 - params_sw[paramik].cut); - gsrainv2 = params_sw[paramik].sigma_gamma * rainv2; - gsrainvsq2 = gsrainv2*rainv2/r2; - expgsrainv2 = exp(gsrainv2); - - rinv12 = F_F(1.0)/(r1*r2); - cs = (delr1.x*delr2.x + delr1.y*delr2.y + delr1.z*delr2.z) * rinv12; - delcs = cs - params_sw[paramijk].costheta; - delcssq = delcs*delcs; - - facexp = expgsrainv1*expgsrainv2; - - // facrad = 
sqrt(paramij->lambda_epsilon*paramik->lambda_epsilon) * - // facexp*delcssq; - - facrad = params_sw[paramijk].lambda_epsilon * facexp*delcssq; - frad1 = facrad*gsrainvsq1; - frad2 = facrad*gsrainvsq2; - facang = params_sw[paramijk].lambda_epsilon2 * facexp*delcs; - facang12 = rinv12*facang; - csfacang = cs*facang; - csfac1 = rinvsq1*csfacang; - - fj.x = delr1.x*(frad1+csfac1)-delr2.x*facang12; - fj.y = delr1.y*(frad1+csfac1)-delr2.y*facang12; - fj.z = delr1.z*(frad1+csfac1)-delr2.z*facang12; - - csfac2 = rinvsq2*csfacang; - - fk.x = delr2.x*(frad2+csfac2)-delr1.x*facang12; - fk.y = delr2.y*(frad2+csfac2)-delr1.y*facang12; - fk.z = delr2.z*(frad2+csfac2)-delr1.z*facang12; - - if (eflag) eng+= F_F(2.0)*facrad; -} - -__device__ void threebody_fj(int paramij, int paramik, int paramijk, - F_FLOAT4& delr1, - F_FLOAT4& delr2, - F_FLOAT3& fj) -{ - F_FLOAT r1,rinvsq1,rainv1,gsrainv1,gsrainvsq1,expgsrainv1; - F_FLOAT r2,rainv2,gsrainv2,expgsrainv2; - F_FLOAT rinv12,cs,delcs,delcssq,facexp,facrad,frad1; - F_FLOAT facang,facang12,csfacang,csfac1; - - r1 = sqrt(delr1.w); - rinvsq1 = F_F(1.0)/delr1.w; - rainv1 = F_F(1.0)/(r1 - params_sw[paramij].cut); - gsrainv1 = params_sw[paramij].sigma_gamma * rainv1; - gsrainvsq1 = gsrainv1*rainv1/r1; - expgsrainv1 = exp(gsrainv1); - - r2 = sqrt(delr2.w); - rainv2 = F_F(1.0)/(r2 - params_sw[paramik].cut); - gsrainv2 = params_sw[paramik].sigma_gamma * rainv2; - expgsrainv2 = exp(gsrainv2); - - rinv12 = F_F(1.0)/(r1*r2); - cs = (delr1.x*delr2.x + delr1.y*delr2.y + delr1.z*delr2.z) * rinv12; - delcs = cs - params_sw[paramijk].costheta; - delcssq = delcs*delcs; - - facexp = expgsrainv1*expgsrainv2; - - // facrad = sqrt(paramij->lambda_epsilon*paramik->lambda_epsilon) * - // facexp*delcssq; - - facrad = params_sw[paramijk].lambda_epsilon * facexp*delcssq; - frad1 = facrad*gsrainvsq1; - facang = params_sw[paramijk].lambda_epsilon2 * facexp*delcs; - facang12 = rinv12*facang; - csfacang = cs*facang; - csfac1 = rinvsq1*csfacang; - - fj.x = delr1.x*(frad1+csfac1)-delr2.x*facang12; - fj.y = delr1.y*(frad1+csfac1)-delr2.y*facang12; - fj.z = delr1.z*(frad1+csfac1)-delr2.z*facang12; -} - - -__global__ void Pair_SW_Kernel_TpA_RIJ()//F_FLOAT4* _glob_r_ij,int* _glob_numneigh_red,int* _glob_neighbors_red,int* _glob_neightype_red) -{ - int ii = (blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x; - if( ii >= _nall ) return; - - X_FLOAT4 myxtype; - F_FLOAT4 delij; - F_FLOAT xtmp,ytmp,ztmp; - int itype,jnum,i,j; - int* jlist; - int neigh_red = 0; - i = ii;//_ilist[ii]; - myxtype = fetchXType(i); - - xtmp=myxtype.x; - ytmp=myxtype.y; - ztmp=myxtype.z; - itype=map[(static_cast (myxtype.w))]; - - jnum = _numneigh[i]; - jlist = &_neighbors[i]; - - __syncthreads(); - for (int jj = 0; jj < jnum; jj++) - { - if(jj (myxtype.w))]; - int iparam_ij = elem2param[(itype*nelements+jtype)*nelements+jtype]; - delij.w = vec3_dot(delij,delij); - if (delij.w < params_sw[iparam_ij].cutsq) - { - _glob_neighbors_red[i+neigh_red*_nall]=j; - _glob_neightype_red[i+neigh_red*_nall]=jtype; - _glob_r_ij[i+neigh_red*_nall]=delij; - neigh_red++; - } - } - } - _glob_numneigh_red[i]=neigh_red; -} - - - template - __global__ void Pair_SW_Kernel_TpA(int eflag_atom,int vflag_atom)//,F_FLOAT* _glob_zeta_ij,F_FLOAT4* _glob_r_ij,int* _glob_numneigh_red,int* _glob_neighbors_red,int* _glob_neightype_red) -{ - ENERGY_FLOAT evdwl = ENERGY_F(0.0); - - ENERGY_FLOAT* sharedE = &sharedmem[threadIdx.x]; - ENERGY_FLOAT* sharedV = &sharedmem[threadIdx.x]; - - F_FLOAT* shared_F_F = (F_FLOAT*) sharedmem; - 
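// Reading aid for the shared-memory partitioning below: with both energy and
// virial requested, sharedmem holds one ENERGY_FLOAT energy accumulator per
// thread in [0, blockDim.x), six virial accumulators per thread in
// [blockDim.x, 7*blockDim.x), and then 4 F_FLOATs of per-thread scratch
// reached through shared_F_F (the fxtmp/fytmp/fztmp macros), trading shared
// memory for register pressure.  If only one of eflag/vflagm is set, the
// scratch region shifts down accordingly, matching the sharedsize the host
// wrapper computes as
// (sharedperproc*sizeof(ENERGY_FLOAT) + 4*sizeof(F_FLOAT)) * threads.x.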
if((eflag||eflag_atom)&&(vflagm||vflag_atom)) shared_F_F = (F_FLOAT*) &sharedmem[7*blockDim.x]; - else - if(eflag) shared_F_F = (F_FLOAT*) &sharedmem[blockDim.x]; - else - if(vflagm) shared_F_F = (F_FLOAT*) &sharedmem[6*blockDim.x]; - shared_F_F+=threadIdx.x; - - if(eflag_atom||eflag) - { - sharedE[0] = ENERGY_F(0.0); - sharedV += blockDim.x; - } - - if(vflagm||vflag_atom) - { - sharedV[0*blockDim.x] = ENERGY_F(0.0); - sharedV[1*blockDim.x] = ENERGY_F(0.0); - sharedV[2*blockDim.x] = ENERGY_F(0.0); - sharedV[3*blockDim.x] = ENERGY_F(0.0); - sharedV[4*blockDim.x] = ENERGY_F(0.0); - sharedV[5*blockDim.x] = ENERGY_F(0.0); - } - - int jnum_red=0; -#define fxtmp shared_F_F[0] -#define fytmp shared_F_F[blockDim.x] -#define fztmp shared_F_F[2*blockDim.x] -//#define jnum_red (static_cast (shared_F_F[3*blockDim.x])) - - int ii = (blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x; - X_FLOAT4 myxtype_i,myxtype_j,myxtype_k; - F_FLOAT4 delij,delik,deljk; - F_FLOAT fpair; - - int itype,i,j; - int* jlist_red; - - if(ii < _inum) - { - i = _ilist[ii]; - - if(vflagm) - myxtype_i=fetchXType(i); - //itype=map[(static_cast (myxtype_i.w))]; - itype=map[_type[i]]; - - - fxtmp = F_F(0.0); - fytmp = F_F(0.0); - fztmp = F_F(0.0); - - - //shared_F_F[3*blockDim.x] = _glob_numneigh_red[i]; - jnum_red = _glob_numneigh_red[i]; - jlist_red = &_glob_neighbors_red[i]; - } - __syncthreads(); -#pragma unroll 1 - for (int jj = 0; jj < jnum_red; jj++) - { - if(i < _nlocal) - { - fpair=F_F(0.0); - j = jlist_red[jj*_nall]; - j &= NEIGHMASK; - - if(vflagm) - myxtype_j = fetchXType(j); - - int jtype = _glob_neightype_red[i+jj*_nall]; - delij = _glob_r_ij[i+jj*_nall]; - - volatile int iparam_ij = elem2param[(itype*nelements+jtype)*nelements+jtype]; - volatile int iparam_ji = elem2param[(jtype*nelements+itype)*nelements+itype]; - - if (delij.w(); - else if(eflag) PairVirialCompute_A_Kernel_Template<1,0>(); - else if(vflagm) PairVirialCompute_A_Kernel_Template<0,1>(); -#undef fxtmp -#undef fytmp -#undef fztmp -//#undef jnum_red -} diff --git a/lib/cuda/pair_tersoff_cuda.cu b/lib/cuda/pair_tersoff_cuda.cu deleted file mode 100644 index 0ae5e846a0..0000000000 --- a/lib/cuda/pair_tersoff_cuda.cu +++ /dev/null @@ -1,154 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - - Original Version: - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - See the README file in the top-level LAMMPS directory. - - ----------------------------------------------------------------------- - - USER-CUDA Package and associated modifications: - https://sourceforge.net/projects/lammpscuda/ - - Christian Trott, christian.trott@tu-ilmenau.de - Lars Winterfeld, lars.winterfeld@tu-ilmenau.de - Theoretical Physics II, University of Technology Ilmenau, Germany - - See the README file in the USER-CUDA directory. - - This software is distributed under the GNU General Public License. -------------------------------------------------------------------------- */ - -#include - - -#include "pair_tersoff_cuda_cu.h" -__device__ __constant__ Param_Float params[MANYBODY_NPAIR* MANYBODY_NPAIR* MANYBODY_NPAIR]; -__device__ __constant__ F_FLOAT* _glob_zeta_ij; //zeta_ij -__device__ __constant__ F_FLOAT4* _glob_r_ij; //r_ij (x,y,z,r^2) for pairs within force cutoff -__device__ __constant__ bool _zbl; //is tersoff zbl? 
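// Note on the _glob_* symbols above: only the pointer values live in
// constant memory.  The host allocates each scratch buffer in global memory
// once and publishes its address with cudaMemcpyToSymbol (pattern taken from
// Cuda_PairTersoffCuda below), e.g.
//
//   F_FLOAT* glob_zeta_ij;
//   cudaMalloc(&glob_zeta_ij, glob_zeta_ij_size);
//   cudaMemcpyToSymbol(_glob_zeta_ij, &glob_zeta_ij, sizeof(F_FLOAT*));
//
// so every kernel can reach the buffers without extra launch arguments.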
- - -#include "pair_tersoff_cuda_kernel_nc.cu" - -#include <time.h> - - void Cuda_PairTersoffCuda_Init(cuda_shared_data* sdata, Param_Float* params_host, void* map_host, void* elem2param_host, int nelements_h, bool zbl) -{ - unsigned cuda_ntypes = sdata->atom.ntypes + 1; - X_FLOAT box_size[3] = { - sdata->domain.subhi[0] - sdata->domain.sublo[0], - sdata->domain.subhi[1] - sdata->domain.sublo[1], - sdata->domain.subhi[2] - sdata->domain.sublo[2] - }; - - cudaMemcpyToSymbol(MY_AP(box_size) , box_size , sizeof(X_FLOAT) * 3); - cudaMemcpyToSymbol(MY_AP(cuda_ntypes) , &cuda_ntypes , sizeof(unsigned)); - cudaMemcpyToSymbol(MY_AP(virial) , &sdata->pair.virial.dev_data , sizeof(ENERGY_FLOAT*)); - cudaMemcpyToSymbol(MY_AP(eng_vdwl) , &sdata->pair.eng_vdwl.dev_data , sizeof(ENERGY_FLOAT*)); - cudaMemcpyToSymbol(MY_AP(periodicity) , sdata->domain.periodicity , sizeof(int) * 3); - cudaMemcpyToSymbol(MY_AP(collect_forces_later), &sdata->pair.collect_forces_later , sizeof(int)); - cudaMemcpyToSymbol(params, params_host , sizeof(Param_Float)*nelements_h * nelements_h * nelements_h); - cudaMemcpyToSymbol(elem2param, elem2param_host , sizeof(int)*nelements_h * nelements_h * nelements_h); - cudaMemcpyToSymbol(map, map_host , sizeof(int)*cuda_ntypes); - cudaMemcpyToSymbol(nelements, &nelements_h, sizeof(int)); - cudaMemcpyToSymbol(_zbl, &zbl, sizeof(bool)); - -} - -void Cuda_PairTersoffCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom) -{ - static F_FLOAT* glob_zeta_ij = NULL; - static int glob_zeta_ij_size = 0; - static F_FLOAT4* glob_r_ij = NULL; - static int* glob_numneigh_red = NULL; - static int* glob_neighbors_red = NULL; - static int* glob_neightype_red = NULL; - - if(glob_zeta_ij_size < sdata->atom.nall * sneighlist->maxneighbors * sizeof(F_FLOAT)) { - glob_zeta_ij_size = sdata->atom.nall * sneighlist->maxneighbors * sizeof(F_FLOAT); - cudaFree(glob_zeta_ij); - cudaFree(glob_r_ij); - cudaFree(glob_numneigh_red); - cudaFree(glob_neighbors_red); - cudaFree(glob_neightype_red); - cudaMalloc(&glob_zeta_ij, glob_zeta_ij_size); - cudaMalloc(&glob_r_ij, glob_zeta_ij_size * 4); - cudaMalloc(&glob_numneigh_red, sdata->atom.nall * sizeof(int)); - cudaMalloc(&glob_neighbors_red, sdata->atom.nall * sneighlist->maxneighbors * sizeof(int)); - cudaMalloc(&glob_neightype_red, sdata->atom.nall * sneighlist->maxneighbors * sizeof(int)); - cudaMemcpyToSymbol(_glob_numneigh_red, &glob_numneigh_red , sizeof(int*)); - cudaMemcpyToSymbol(_glob_neighbors_red, &glob_neighbors_red , sizeof(int*)); - cudaMemcpyToSymbol(_glob_neightype_red, &glob_neightype_red , sizeof(int*)); - cudaMemcpyToSymbol(_glob_r_ij, &glob_r_ij , sizeof(F_FLOAT4*)); - cudaMemcpyToSymbol(_glob_zeta_ij, &glob_zeta_ij , sizeof(F_FLOAT*)); - } - - dim3 grid, threads; - int sharedperproc; - - Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc, false, 64); - cudaStream_t* streams = (cudaStream_t*) CudaWrapper_returnStreams(); - - - - dim3 grid2; - - if(sdata->atom.nall <= 256 * 64000) { - grid2.x = (sdata->atom.nall + 255) / 256; - grid2.y = 1; - } else { - grid2.x = (sdata->atom.nall + 256 * 128 - 1) / (256 * 128); - grid2.y = 128; - } - - grid2.z = 1; - dim3 threads2; - threads2.x = 256; - threads2.y = 1; - threads2.z = 1; - - timespec time1, time2; - - //pre-calculate all neighbor distances and zeta_ij - clock_gettime(CLOCK_REALTIME, &time1); - Pair_Tersoff_Kernel_TpA_RIJ <<< grid2, threads2, 0, streams[1]>>> - (); - cudaThreadSynchronize(); -
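// Grid-shape note: grid2 above folds one flat index over _nall atoms into a
// 2D grid because a single grid dimension was capped at 65535 blocks on the
// GPUs this package targeted; the kernels rebuild the flat index as
// (blockIdx.x*gridDim.y + blockIdx.y)*blockDim.x + threadIdx.x. A sketch of
// the same sizing, assuming 256 threads per block:
//
//   dim3 grid((nall + 255) / 256, 1, 1);
//   if (nall > 256 * 64000) {                     // too many blocks for one axis
//     grid.x = (nall + 256 * 128 - 1) / (256 * 128);
//     grid.y = 128;                               // spread the blocks over two axes
//   }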
Pair_Tersoff_Kernel_TpA_ZetaIJ <<< grid2, threads2, 0, streams[1]>>> - (); - cudaThreadSynchronize(); - clock_gettime(CLOCK_REALTIME, &time2); - sdata->cuda_timings.test1 += - time2.tv_sec - time1.tv_sec + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000; - clock_gettime(CLOCK_REALTIME, &time1); - - //actual force calculation - unsigned int sharedsize = (sharedperproc * sizeof(ENERGY_FLOAT) + 4 * sizeof(F_FLOAT)) * threads.x; //extra 4 floats per thread used to reduce register pressure - - if(eflag) { - if(vflag) - Pair_Tersoff_Kernel_TpA<1, 1> <<< grid, threads, sharedsize, streams[1]>>> - (eflag_atom, vflag_atom); - else - Pair_Tersoff_Kernel_TpA<1, 0> <<< grid, threads, sharedsize, streams[1]>>> - (eflag_atom, vflag_atom); - } else { - if(vflag) - Pair_Tersoff_Kernel_TpA<0, 1> <<< grid, threads, sharedsize, streams[1]>>> - (eflag_atom, vflag_atom); - else - Pair_Tersoff_Kernel_TpA<0, 0> <<< grid, threads, sharedsize, streams[1]>>> - (eflag_atom, vflag_atom); - } - cudaThreadSynchronize(); - clock_gettime(CLOCK_REALTIME, &time2); - sdata->cuda_timings.test2 += - time2.tv_sec - time1.tv_sec + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000; - - Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag); -} - diff --git a/lib/cuda/pair_tersoff_cuda_cu.h b/lib/cuda/pair_tersoff_cuda_cu.h deleted file mode 100644 index 5276cd1c35..0000000000 --- a/lib/cuda/pair_tersoff_cuda_cu.h +++ /dev/null @@ -1,42 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - - Original Version: - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - See the README file in the top-level LAMMPS directory. - - ----------------------------------------------------------------------- - - USER-CUDA Package and associated modifications: - https://sourceforge.net/projects/lammpscuda/ - - Christian Trott, christian.trott@tu-ilmenau.de - Lars Winterfeld, lars.winterfeld@tu-ilmenau.de - Theoretical Physics II, University of Technology Ilmenau, Germany - - See the README file in the USER-CUDA directory. - - This software is distributed under the GNU General Public License. -------------------------------------------------------------------------- */ - -#include "cuda_shared.h" - - struct Param_Float { - F_FLOAT lam1,lam2,lam3; - F_FLOAT c,d,h; - F_FLOAT gamma,powerm; - F_FLOAT powern,beta; - F_FLOAT biga,bigb,bigd,bigr; - F_FLOAT cut,cutsq; - F_FLOAT c1,c2,c3,c4; - int ielement,jelement,kelement; - int powermint; - //F_FLOAT Z_i,Z_j; - F_FLOAT ZBLcut,ZBLexpscale; - F_FLOAT a_ij,premult; - }; - -extern "C" void Cuda_PairTersoffCuda_Init(cuda_shared_data* sdata,Param_Float* params_host,void* map_host, void* elem2param_host,int nelements_h,bool zbl); -extern "C" void Cuda_PairTersoffCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,int eflag_atom,int vflag_atom); diff --git a/lib/cuda/pair_tersoff_cuda_kernel_nc.cu b/lib/cuda/pair_tersoff_cuda_kernel_nc.cu deleted file mode 100644 index 125c73855a..0000000000 --- a/lib/cuda/pair_tersoff_cuda_kernel_nc.cu +++ /dev/null @@ -1,1055 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - - Original Version: - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - See the README file in the top-level LAMMPS directory. 
- - ----------------------------------------------------------------------- - - USER-CUDA Package and associated modifications: - https://sourceforge.net/projects/lammpscuda/ - - Christian Trott, christian.trott@tu-ilmenau.de - Lars Winterfeld, lars.winterfeld@tu-ilmenau.de - Theoretical Physics II, University of Technology Ilmenau, Germany - - See the README file in the USER-CUDA directory. - - This software is distributed under the GNU General Public License. -------------------------------------------------------------------------- */ -#define Pi F_F(3.1415926535897932384626433832795) -#define PI Pi -#define PI2 F_F(0.5)*Pi -#define PI4 F_F(0.25)*Pi -template <int eflag, int vflag> -static inline __device__ void PairVirialCompute_A_Kernel_Template() -{ - __syncthreads(); - ENERGY_FLOAT* shared=sharedmem; - - if(eflag) - { - reduceBlock(shared); - shared+=blockDim.x; - } - if(vflag) - { - reduceBlock(shared + 0 * blockDim.x); - reduceBlock(shared + 1 * blockDim.x); - reduceBlock(shared + 2 * blockDim.x); - reduceBlock(shared + 3 * blockDim.x); - reduceBlock(shared + 4 * blockDim.x); - reduceBlock(shared + 5 * blockDim.x); - } - if(threadIdx.x == 0) - { - shared=sharedmem; - ENERGY_FLOAT* buffer = (ENERGY_FLOAT*) _buffer; - if(eflag) - { - buffer[blockIdx.x * gridDim.y + blockIdx.y] = ENERGY_F(0.5)*shared[0]; - shared+=blockDim.x; buffer+=gridDim.x * gridDim.y; - } - if(vflag) - { - buffer[blockIdx.x * gridDim.y + blockIdx.y + 0 * gridDim.x * gridDim.y] = ENERGY_F(0.5)*shared[0 * blockDim.x]; - buffer[blockIdx.x * gridDim.y + blockIdx.y + 1 * gridDim.x * gridDim.y] = ENERGY_F(0.5)*shared[1 * blockDim.x]; - buffer[blockIdx.x * gridDim.y + blockIdx.y + 2 * gridDim.x * gridDim.y] = ENERGY_F(0.5)*shared[2 * blockDim.x]; - buffer[blockIdx.x * gridDim.y + blockIdx.y + 3 * gridDim.x * gridDim.y] = ENERGY_F(0.5)*shared[3 * blockDim.x]; - buffer[blockIdx.x * gridDim.y + blockIdx.y + 4 * gridDim.x * gridDim.y] = ENERGY_F(0.5)*shared[4 * blockDim.x]; - buffer[blockIdx.x * gridDim.y + blockIdx.y + 5 * gridDim.x * gridDim.y] = ENERGY_F(0.5)*shared[5 * blockDim.x]; - } - } - __syncthreads(); -} - -__global__ void virial_fdotr_compute_kernel(int eflag) -{ - int i = (blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x; - ENERGY_FLOAT* sharedE = (ENERGY_FLOAT*) &sharedmem[0]; - ENERGY_FLOAT* sharedVirial = (ENERGY_FLOAT*) &sharedE[blockDim.x]; - sharedE+=threadIdx.x; - sharedVirial+=threadIdx.x; - if(i<_nlocal) - { - - F_FLOAT x = _x[i]; - F_FLOAT y = _x[i+_nmax]; - F_FLOAT z = _x[i+2*_nmax]; - F_FLOAT fx = _f[i]; - F_FLOAT fy = _f[i+_nmax]; - F_FLOAT fz = _f[i+2*_nmax]; - //if(fz*z*fz*z>1e-5) printf("V %i %i %e %e %e %e %e %e\n",i,_tag[i],x,y,z,fx,fy,fz); - sharedVirial[0] = fx*x; - sharedVirial[1*blockDim.x] = fy*y; - sharedVirial[2*blockDim.x] = fz*z; - sharedVirial[3*blockDim.x] = fy*x; - sharedVirial[4*blockDim.x] = fz*x; - sharedVirial[5*blockDim.x] = fz*y; - } else { - sharedVirial[0] = 0; - sharedVirial[1*blockDim.x] = 0; - sharedVirial[2*blockDim.x] = 0; - sharedVirial[3*blockDim.x] = 0; - sharedVirial[4*blockDim.x] = 0; - sharedVirial[5*blockDim.x] = 0; - } - sharedVirial = (ENERGY_FLOAT*) &sharedmem[0]; - sharedVirial += blockDim.x; - reduceBlockP2(sharedVirial); - reduceBlockP2(&sharedVirial[1*blockDim.x]); - reduceBlockP2(&sharedVirial[2*blockDim.x]); - reduceBlockP2(&sharedVirial[3*blockDim.x]); - reduceBlockP2(&sharedVirial[4*blockDim.x]); - reduceBlockP2(&sharedVirial[5*blockDim.x]); - if(threadIdx.x<6) - { - ENERGY_FLOAT* buffer = (ENERGY_FLOAT*) _buffer; - if(eflag) buffer = &buffer[gridDim.x*gridDim.y]; -
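// Reduction note: reduceBlock()/reduceBlockP2() above are shared-memory tree
// reductions; afterwards slot 0 of each strided segment holds the block total,
// and threads 0-5 below each publish one virial component per block into
// _buffer for a later cross-block pass. A minimal power-of-two version of such
// a reduction:
//
//   __device__ void reduce_block_p2(ENERGY_FLOAT* s) {
//     __syncthreads();                            // all slots written
//     for (unsigned int stride = blockDim.x >> 1; stride > 0; stride >>= 1) {
//       if (threadIdx.x < stride) s[threadIdx.x] += s[threadIdx.x + stride];
//       __syncthreads();
//     }
//   }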
buffer[blockIdx.x * gridDim.y + blockIdx.y + threadIdx.x * gridDim.x * gridDim.y]= sharedVirial[threadIdx.x*blockDim.x]; - } -} - -/*#define vec3_scale(K,X,Y) Y.x = K*X.x; Y.y = K*X.y; Y.z = K*X.z; -#define vec3_scaleadd(K,X,Y,Z) Z.x = K*X.x+Y.x; Z.y = K*X.y+Y.y; Z.z = K*X.z+Y.z; -#define vec3_add(X,Y,Z) Z.x = X.x+Y.x; Z.y = X.y+Y.y; Z.z = X.z+Y.z; -#define vec3_dot(X,Y) (X.x*Y.x + X.y*Y.y + X.z*Y.z)*/ - -__device__ inline void vec3_scale(F_FLOAT k, F_FLOAT3& x, F_FLOAT3& y) { - y.x = k*x.x; y.y = k*x.y; y.z = k*x.z; -} - -__device__ inline void vec3_scale(F_FLOAT k, F_FLOAT4& x, F_FLOAT3& y) { - y.x = k*x.x; y.y = k*x.y; y.z = k*x.z; -} - -__device__ inline void vec3_scale(F_FLOAT k, F_FLOAT4& x, F_FLOAT4& y) { - y.x = k*x.x; y.y = k*x.y; y.z = k*x.z; -} - -__device__ inline void vec3_scaleadd(F_FLOAT k, F_FLOAT3& x, F_FLOAT3& y, F_FLOAT3& z) { - z.x = k*x.x+y.x; z.y = k*x.y+y.y; z.z = k*x.z+y.z; -} - -__device__ inline void vec3_add(F_FLOAT3& x, F_FLOAT3& y, F_FLOAT3& z) { - z.x = x.x+y.x; z.y = x.y+y.y; z.z = x.z+y.z; -} - -__device__ inline F_FLOAT vec3_dot(F_FLOAT3 x, F_FLOAT3 y) { - return x.x*y.x + x.y*y.y + x.z*y.z; -} - -__device__ inline F_FLOAT vec3_dot(F_FLOAT4 x, F_FLOAT4 y) { - return x.x*y.x + x.y*y.y + x.z*y.z; -} - -/* ---------------------------------------------------------------------- - Fermi-like smoothing function -------------------------------------------------------------------------- */ - -__device__ inline F_FLOAT F_fermi(F_FLOAT &r, int &iparam) -{ - return F_F(1.0) / (F_F(1.0) + exp(-params[iparam].ZBLexpscale*(r-params[iparam].ZBLcut))); -} - -/* ---------------------------------------------------------------------- - Fermi-like smoothing function derivative with respect to r -------------------------------------------------------------------------- */ - -__device__ inline F_FLOAT F_fermi_d(F_FLOAT &r, int &iparam) -{ - volatile const F_FLOAT tmp = exp(-params[iparam].ZBLexpscale*(r-params[iparam].ZBLcut)); - return params[iparam].ZBLexpscale*tmp / - ((F_F(1.0) +tmp)*(F_F(1.0) +tmp)); -} - -__device__ inline F_FLOAT ters_fc(F_FLOAT r, F_FLOAT ters_R, F_FLOAT ters_D) -{ - return (r < ters_R-ters_D)?F_F(1.0):((r > ters_R+ters_D)? - F_F(0.0):F_F(0.5)*(F_F(1.0) - sin(PI2*(r - ters_R)/ters_D))); -} - -__device__ inline F_FLOAT ters_fc_d(F_FLOAT r, F_FLOAT ters_R, F_FLOAT ters_D) -{ - return ((r < ters_R-ters_D)||(r > ters_R+ters_D))? 
- F_F(0.0):-(PI4/ters_D) * cos(PI2*(r - ters_R)/ters_D); -} - - -__device__ inline F_FLOAT ters_gijk(F_FLOAT& cos_theta, int iparam) -{ - F_FLOAT ters_c = params[iparam].c; - F_FLOAT ters_d = params[iparam].d; - - return params[iparam].gamma*(F_F(1.0) + pow(params[iparam].c/params[iparam].d,F_F(2.0)) - - pow(ters_c,F_F(2.0)) / (pow(ters_d,F_F(2.0)) + pow(params[iparam].h - cos_theta,F_F(2.0)))); -} - -__device__ F_FLOAT ters_gijk2(F_FLOAT& cos_theta, int iparam) -{ - F_FLOAT ters_c = params[iparam].c; - F_FLOAT ters_d = params[iparam].d; - - return params[iparam].gamma*(F_F(1.0) + pow(ters_c/ters_d,F_F(2.0)) - - pow(ters_c,F_F(2.0)) / (pow(ters_d,F_F(2.0)) + pow(params[iparam].h - cos_theta,F_F(2.0)))); -} - -__device__ inline F_FLOAT ters_gijk_d(F_FLOAT costheta, int iparam) -{ - F_FLOAT numerator = -F_F(2.0) * pow(params[iparam].c,F_F(2.0)) * (params[iparam].h - costheta); - F_FLOAT denominator = pow(pow(params[iparam].d,F_F(2.0)) + - pow(params[iparam].h - costheta,F_F(2.0)),F_F(2.0)); - return params[iparam].gamma*numerator/denominator; -} - -__device__ inline F_FLOAT zeta(int iparam, const F_FLOAT rsqij, const F_FLOAT rsqik, - F_FLOAT3& delij, F_FLOAT3& delik) -{ - F_FLOAT rij,rik,costheta,arg,ex_delr; - - rij = sqrt(rsqij); - rik = sqrt(rsqik); - costheta = vec3_dot(delij,delik) / (rij*rik); - - arg = (params[iparam].powermint == 3)? (params[iparam].lam3 * (rij-rik)*params[iparam].lam3 * (rij-rik)*params[iparam].lam3 * (rij-rik)) : params[iparam].lam3 * (rij-rik); - - if (arg > F_F(69.0776)) ex_delr = F_F(1.e30); - else if (arg < -F_F(69.0776)) ex_delr = F_F(0.0); - else ex_delr = exp(arg); - - return ters_fc(rik,params[iparam].bigr,params[iparam].bigd) * ex_delr * params[iparam].gamma*(F_F(1.0) + (params[iparam].c*params[iparam].c/(params[iparam].d*params[iparam].d)) - - (params[iparam].c*params[iparam].c) / ((params[iparam].d*params[iparam].d) + (params[iparam].h - costheta)*(params[iparam].h - costheta))); -} - -__device__ void repulsive(int iparam, F_FLOAT rsq, F_FLOAT &fforce, - int eflag, ENERGY_FLOAT &eng) -{ - F_FLOAT r,tmp_fc,tmp_fc_d,tmp_exp; - - F_FLOAT ters_R = params[iparam].bigr; - F_FLOAT ters_D = params[iparam].bigd; - r = sqrt(rsq); - tmp_fc = ters_fc(r,ters_R,ters_D); - tmp_fc_d = ters_fc_d(r,ters_R,ters_D); - tmp_exp = exp(-params[iparam].lam1 * r); - if(!_zbl) - { - fforce = -params[iparam].biga * tmp_exp * (tmp_fc_d - tmp_fc*params[iparam].lam1) / r; - if (eflag) eng += tmp_fc * params[iparam].biga * tmp_exp; - } - else - { - F_FLOAT const fforce_ters = params[iparam].biga * tmp_exp * (tmp_fc_d - tmp_fc*params[iparam].lam1); - ENERGY_FLOAT eng_ters = tmp_fc * params[iparam].biga * tmp_exp; - - F_FLOAT r_ov_a = r/params[iparam].a_ij; - F_FLOAT phi = F_F(0.1818)*exp(-F_F(3.2)*r_ov_a) + F_F(0.5099)*exp(-F_F(0.9423)*r_ov_a) + - F_F(0.2802)*exp(-F_F(0.4029)*r_ov_a) + F_F(0.02817)*exp(-F_F(0.2016)*r_ov_a); - F_FLOAT dphi = (F_F(1.0)/params[iparam].a_ij) * (-F_F(3.2)*F_F(0.1818)*exp(-F_F(3.2)*r_ov_a) - - F_F(0.9423)*F_F(0.5099)*exp(-F_F(0.9423)*r_ov_a) - - F_F(0.4029)*F_F(0.2802)*exp(-F_F(0.4029)*r_ov_a) - - F_F(0.2016)*F_F(0.02817)*exp(-F_F(0.2016)*r_ov_a)); - F_FLOAT fforce_ZBL = params[iparam].premult/(-r*r)* phi + params[iparam].premult/r*dphi; - ENERGY_FLOAT eng_ZBL = params[iparam].premult*(F_F(1.0)/r)*phi; - - fforce = -(-F_fermi_d(r,iparam) * (eng_ZBL - eng_ters) + fforce_ZBL + F_fermi(r,iparam)*(fforce_ters-fforce_ZBL))/r; - if(eflag) - eng += eng_ZBL + F_fermi(r,iparam)*(eng_ters-eng_ZBL); - } - - -} - -/* 
---------------------------------------------------------------------- */ - -__device__ inline F_FLOAT ters_fa(F_FLOAT r, int iparam, F_FLOAT ters_R, F_FLOAT ters_D) -{ - if (r > ters_R + ters_D) return F_F(0.0); - if(_zbl) - return -params[iparam].bigb * exp(-params[iparam].lam2 * r) * ters_fc(r,ters_R,ters_D) * F_fermi(r,iparam); - else - return -params[iparam].bigb * exp(-params[iparam].lam2 * r) * ters_fc(r,ters_R,ters_D); -} - -/* ---------------------------------------------------------------------- */ - -__device__ inline F_FLOAT ters_fa_d(F_FLOAT r, int iparam, F_FLOAT ters_R, F_FLOAT ters_D) -{ - if (r > ters_R + ters_D) return F_F(0.0); - if(_zbl) - return params[iparam].bigb * exp(-params[iparam].lam2 * r) * - ((params[iparam].lam2 * ters_fc(r,ters_R,ters_D) - ters_fc_d(r,ters_R,ters_D))*F_fermi(r,iparam) - -ters_fc(r,ters_R,ters_D)*F_fermi_d(r,iparam)); - else - return params[iparam].bigb * exp(-params[iparam].lam2 * r) * - (params[iparam].lam2 * ters_fc(r,ters_R,ters_D) - ters_fc_d(r,ters_R,ters_D)); -} - -/* ---------------------------------------------------------------------- */ - -__device__ inline F_FLOAT ters_bij(F_FLOAT zeta, int iparam) -{ - F_FLOAT tmp = params[iparam].beta * zeta; - if (tmp > params[iparam].c1) return F_F(1.0)/sqrt(tmp); - if (tmp > params[iparam].c2) - return (F_F(1.0) - pow(tmp,-params[iparam].powern) / (F_F(2.0)*params[iparam].powern))/sqrt(tmp); - if (tmp < params[iparam].c4) return F_F(1.0); - if (tmp < params[iparam].c3) - return F_F(1.0) - pow(tmp,params[iparam].powern)/(F_F(2.0)*params[iparam].powern); - return pow(F_F(1.0) + pow(tmp,params[iparam].powern), -F_F(1.0)/(F_F(2.0)*params[iparam].powern)); -} - -/* ---------------------------------------------------------------------- */ - -__device__ inline F_FLOAT ters_bij_d(F_FLOAT zeta, int iparam) -{ - F_FLOAT tmp = params[iparam].beta * zeta; - if (tmp > params[iparam].c1) return params[iparam].beta * -F_F(0.5)*pow(tmp,-F_F(1.5)); - if (tmp > params[iparam].c2) - return params[iparam].beta * (-F_F(0.5)*pow(tmp,-F_F(1.5)) * - (F_F(1.0) - F_F(0.5)*(F_F(1.0) + F_F(1.0)/(F_F(2.0)*params[iparam].powern)) * - pow(tmp,-params[iparam].powern))); - if (tmp < params[iparam].c4) return F_F(0.0); - if (tmp < params[iparam].c3) - return -F_F(0.5)*params[iparam].beta * pow(tmp,params[iparam].powern-F_F(1.0)); - - F_FLOAT tmp_n = pow(tmp,params[iparam].powern); - return -F_F(0.5) * pow(F_F(1.0)+tmp_n, -F_F(1.0)-(F_F(1.0)/(F_F(2.0)*params[iparam].powern)))*tmp_n / zeta; -} - -__device__ void force_zeta(int iparam, F_FLOAT rsq, F_FLOAT zeta_ij, - F_FLOAT &fforce, F_FLOAT &prefactor, - int eflag, F_FLOAT &eng) -{ - F_FLOAT r,fa,fa_d,bij; - F_FLOAT ters_R = params[iparam].bigr; - F_FLOAT ters_D = params[iparam].bigd; - r = sqrt(rsq); - fa = ters_fa(r,iparam,ters_R,ters_D); - fa_d = ters_fa_d(r,iparam,ters_R,ters_D); - bij = ters_bij(zeta_ij,iparam); - fforce = F_F(0.5)*bij*fa_d / r; - prefactor = -F_F(0.5)*fa * ters_bij_d(zeta_ij,iparam); - if (eflag) eng += bij*fa; -} - -__device__ void force_zeta_prefactor_force(int iparam, F_FLOAT rsq, F_FLOAT zeta_ij, - F_FLOAT &fforce, F_FLOAT &prefactor) -{ - F_FLOAT r,fa,fa_d,bij; - F_FLOAT ters_R = params[iparam].bigr; - F_FLOAT ters_D = params[iparam].bigd; - r = sqrt(rsq); - fa = ters_fa(r,iparam,ters_R,ters_D); - fa_d = ters_fa_d(r,iparam,ters_R,ters_D); - bij = ters_bij(zeta_ij,iparam); - fforce = F_F(0.5)*bij*fa_d / r; - prefactor = -F_F(0.5)*fa * ters_bij_d(zeta_ij,iparam); -} - -__device__ void force_zeta_prefactor(int iparam, F_FLOAT rsq, F_FLOAT zeta_ij, - 
F_FLOAT &prefactor) -{ - F_FLOAT r,fa; - r = sqrt(rsq); - fa = ters_fa(r,iparam,params[iparam].bigr,params[iparam].bigd); - prefactor = -F_F(0.5)*fa*ters_bij_d(zeta_ij,iparam); -} - - -__device__ void costheta_d(F_FLOAT3& rij_hat, F_FLOAT& rij, - F_FLOAT3& rik_hat, F_FLOAT& rik, - F_FLOAT3& dri, F_FLOAT3& drj, F_FLOAT3& drk) -{ - // first element is derivative wrt Ri, second wrt Rj, third wrt Rk - - F_FLOAT cos_theta = vec3_dot(rij_hat,rik_hat); - - vec3_scaleadd(-cos_theta,rij_hat,rik_hat,drj); - vec3_scale(F_F(1.0)/rij,drj,drj); - vec3_scaleadd(-cos_theta,rik_hat,rij_hat,drk); - vec3_scale(F_F(1.0)/rik,drk,drk); - vec3_add(drj,drk,dri); - vec3_scale(-F_F(1.0),dri,dri); -} - -__device__ void ters_zetaterm_d(F_FLOAT prefactor, - F_FLOAT3& rij_hat, F_FLOAT rij, - F_FLOAT3& rik_hat, F_FLOAT rik, - F_FLOAT3& dri, F_FLOAT3& drj, F_FLOAT3& drk, - int iparam) -{ - F_FLOAT ex_delr,ex_delr_d,tmp; - F_FLOAT3 dcosdri,dcosdrj,dcosdrk; - - if (params[iparam].powermint == 3) tmp = (params[iparam].lam3 * (rij-rik)*params[iparam].lam3 * (rij-rik)*params[iparam].lam3 * (rij-rik)); - else tmp = params[iparam].lam3 * (rij-rik); - - if (tmp > F_F(69.0776)) ex_delr = F_F(1.e30); - else if (tmp < -F_F(69.0776)) ex_delr = F_F(0.0); - else ex_delr = exp(tmp); - - if (params[iparam].powermint == 3) - ex_delr_d = F_F(3.0)*(params[iparam].lam3*params[iparam].lam3*params[iparam].lam3) * (rij-rik)*(rij-rik)*ex_delr; - else ex_delr_d = params[iparam].lam3 * ex_delr; - - - const F_FLOAT cos_theta = vec3_dot(rij_hat,rik_hat); - costheta_d(rij_hat,rij,rik_hat,rik,dcosdri,dcosdrj,dcosdrk); - - const F_FLOAT gijk = params[iparam].gamma*(F_F(1.0) + (params[iparam].c*params[iparam].c)/(params[iparam].d*params[iparam].d) - - (params[iparam].c*params[iparam].c) / (params[iparam].d*params[iparam].d + (params[iparam].h - cos_theta)*(params[iparam].h - cos_theta))); - const F_FLOAT numerator = -F_F(2.0) * params[iparam].c*params[iparam].c * (params[iparam].h - cos_theta); - const F_FLOAT denominator = (params[iparam].d*params[iparam].d) + - (params[iparam].h - cos_theta)*(params[iparam].h - cos_theta); - const F_FLOAT gijk_d = params[iparam].gamma*numerator/(denominator*denominator); // compute the derivative wrt Ri - // dri = -dfc*gijk*ex_delr*rik_hat; - // dri += fc*gijk_d*ex_delr*dcosdri; - // dri += fc*gijk*ex_delr_d*(rik_hat - rij_hat); - const F_FLOAT fc = ters_fc(rik,params[iparam].bigr,params[iparam].bigd); - const F_FLOAT dfc = ters_fc_d(rik,params[iparam].bigr,params[iparam].bigd); - - - vec3_scale(-dfc*gijk*ex_delr,rik_hat,dri); - vec3_scaleadd(fc*gijk_d*ex_delr,dcosdri,dri,dri); - vec3_scaleadd(fc*gijk*ex_delr_d,rik_hat,dri,dri); - vec3_scaleadd(-fc*gijk*ex_delr_d,rij_hat,dri,dri); - vec3_scale(prefactor,dri,dri); - // compute the derivative wrt Rj - // drj = fc*gijk_d*ex_delr*dcosdrj; - // drj += fc*gijk*ex_delr_d*rij_hat; - - vec3_scale(fc*gijk_d*ex_delr,dcosdrj,drj); - vec3_scaleadd(fc*gijk*ex_delr_d,rij_hat,drj,drj); - vec3_scale(prefactor,drj,drj); - - // compute the derivative wrt Rk - // drk = dfc*gijk*ex_delr*rik_hat; - // drk += fc*gijk_d*ex_delr*dcosdrk; - // drk += -fc*gijk*ex_delr_d*rik_hat; - - vec3_scale(dfc*gijk*ex_delr,rik_hat,drk); - vec3_scaleadd(fc*gijk_d*ex_delr,dcosdrk,drk,drk); - vec3_scaleadd(-fc*gijk*ex_delr_d,rik_hat,drk,drk); - vec3_scale(prefactor,drk,drk); -} - -__device__ void ters_zetaterm_d_fi(F_FLOAT &prefactor, - F_FLOAT3& rij_hat, F_FLOAT &rij, - F_FLOAT3& rik_hat, F_FLOAT &rik, - F_FLOAT3& dri, int &iparam) -{ - F_FLOAT ex_delr,ex_delr_d,tmp; - - if (params[iparam].powermint == 3) 
tmp = (params[iparam].lam3 * (rij-rik)*params[iparam].lam3 * (rij-rik)*params[iparam].lam3 * (rij-rik)); - else tmp = params[iparam].lam3 * (rij-rik); - - if (tmp > F_F(69.0776)) ex_delr = F_F(1.e30); - else if (tmp < -F_F(69.0776)) ex_delr = F_F(0.0); - else ex_delr = exp(tmp); - - if (params[iparam].powermint == 3) - ex_delr_d = F_F(3.0)*(params[iparam].lam3*params[iparam].lam3*params[iparam].lam3) * (rij-rik)*(rij-rik)*ex_delr; - else ex_delr_d = params[iparam].lam3 * ex_delr; - - const F_FLOAT cos_theta = vec3_dot(rij_hat,rik_hat); - //costheta_d(rij_hat,rij,rik_hat,rik,dcosdri,dcosdrj,dcosdrk); - - - F_FLOAT3 dcosdri; - vec3_scaleadd(-cos_theta,rij_hat,rik_hat,dri); - vec3_scale(F_F(1.0)/rij,dri,dri); - vec3_scaleadd(-cos_theta,rik_hat,rij_hat,dcosdri); - vec3_scale(F_F(1.0)/rik,dcosdri,dcosdri); - vec3_add(dri,dcosdri,dcosdri); - vec3_scale(-F_F(1.0),dcosdri,dcosdri); - - const F_FLOAT gijk = params[iparam].gamma*(F_F(1.0) + (params[iparam].c*params[iparam].c)/(params[iparam].d*params[iparam].d) - - (params[iparam].c*params[iparam].c) / (params[iparam].d*params[iparam].d + (params[iparam].h - cos_theta)*(params[iparam].h - cos_theta))); - const F_FLOAT numerator = -F_F(2.0) * params[iparam].c*params[iparam].c * (params[iparam].h - cos_theta); - const F_FLOAT denominator = (params[iparam].d*params[iparam].d) + - (params[iparam].h - cos_theta)*(params[iparam].h - cos_theta); - const F_FLOAT gijk_d = params[iparam].gamma*numerator/(denominator*denominator); // compute the derivative wrt Ri -// - const F_FLOAT fc = ters_fc(rik,params[iparam].bigr,params[iparam].bigd); - const F_FLOAT dfc = ters_fc_d(rik,params[iparam].bigr,params[iparam].bigd); - - vec3_scale(-dfc*gijk*ex_delr,rik_hat,dri); - vec3_scaleadd(fc*gijk_d*ex_delr,dcosdri,dri,dri); - vec3_scaleadd(fc*gijk*ex_delr_d,rik_hat,dri,dri); - vec3_scaleadd(-fc*gijk*ex_delr_d,rij_hat,dri,dri); - vec3_scale(prefactor,dri,dri); - -} - -__device__ void ters_zetaterm_d_fj(F_FLOAT &prefactor, - F_FLOAT3& rij_hat, F_FLOAT &rij, - F_FLOAT3& rik_hat, F_FLOAT &rik, - F_FLOAT3& drj, int &iparam) -{ - F_FLOAT ex_delr,ex_delr_d,tmp; - - if (params[iparam].powermint == 3) tmp = (params[iparam].lam3 * (rij-rik)*params[iparam].lam3 * (rij-rik)*params[iparam].lam3 * (rij-rik)); - else tmp = params[iparam].lam3 * (rij-rik); - - if (tmp > F_F(69.0776)) ex_delr = F_F(1.e30); - else if (tmp < -F_F(69.0776)) ex_delr = F_F(0.0); - else ex_delr = exp(tmp); - - if (params[iparam].powermint == 3) - ex_delr_d = F_F(3.0)*(params[iparam].lam3*params[iparam].lam3*params[iparam].lam3) * (rij-rik)*(rij-rik)*ex_delr; - else ex_delr_d = params[iparam].lam3 * ex_delr; - - const F_FLOAT cos_theta = vec3_dot(rij_hat,rik_hat); - vec3_scaleadd(-cos_theta,rij_hat,rik_hat,drj); - vec3_scale(F_F(1.0)/rij,drj,drj); - - const F_FLOAT gijk = params[iparam].gamma*(F_F(1.0) + (params[iparam].c*params[iparam].c)/(params[iparam].d*params[iparam].d) - - (params[iparam].c*params[iparam].c) / (params[iparam].d*params[iparam].d + (params[iparam].h - cos_theta)*(params[iparam].h - cos_theta))); - const F_FLOAT numerator = -F_F(2.0) * params[iparam].c*params[iparam].c * (params[iparam].h - cos_theta); - const F_FLOAT denominator = (params[iparam].d*params[iparam].d) + - (params[iparam].h - cos_theta)*(params[iparam].h - cos_theta); - const F_FLOAT gijk_d = params[iparam].gamma*numerator/(denominator*denominator); // compute the derivative wrt Ri - - const F_FLOAT fc = ters_fc(rik,params[iparam].bigr,params[iparam].bigd); - - vec3_scale(fc*gijk_d*ex_delr,drj,drj); - 
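// Chain-rule note: each zeta term has the form fc(rik) * g(costheta) *
// exp(arg(rij,rik)), and fc does not depend on Rj, so d/dRj keeps exactly two
// terms:
//   fc * g'(costheta) * (dcostheta/dRj) * exp(arg)      (the vec3_scale above)
//   fc * g(costheta) * exp(arg) * (darg/dRj) = fc * g * ex_delr_d * rij_hat
// (the vec3_scaleadd below); the sum is then scaled once by prefactor.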
vec3_scaleadd(fc*gijk*ex_delr_d,rij_hat,drj,drj); - vec3_scale(prefactor,drj,drj); -} - -__device__ void ters_zetaterm_d_fk(F_FLOAT &prefactor, - F_FLOAT3& rij_hat, F_FLOAT &rij, - F_FLOAT3& rik_hat, F_FLOAT &rik, - F_FLOAT3& drk, int &iparam) -{ - F_FLOAT ex_delr,ex_delr_d,tmp; - - if (params[iparam].powermint == 3) tmp = (params[iparam].lam3 * (rij-rik)*params[iparam].lam3 * (rij-rik)*params[iparam].lam3 * (rij-rik)); - else tmp = params[iparam].lam3 * (rij-rik); - - if (tmp > F_F(69.0776)) ex_delr = F_F(1.e30); - else if (tmp < -F_F(69.0776)) ex_delr = F_F(0.0); - else ex_delr = exp(tmp); - - if (params[iparam].powermint == 3) - ex_delr_d = F_F(3.0)*(params[iparam].lam3*params[iparam].lam3*params[iparam].lam3) * (rij-rik)*(rij-rik)*ex_delr; - else ex_delr_d = params[iparam].lam3 * ex_delr; - - const F_FLOAT cos_theta = vec3_dot(rij_hat,rik_hat); - vec3_scaleadd(-cos_theta,rik_hat,rij_hat,drk); - vec3_scale(F_F(1.0)/rik,drk,drk); - - const F_FLOAT gijk = params[iparam].gamma*(F_F(1.0) + (params[iparam].c*params[iparam].c)/(params[iparam].d*params[iparam].d) - - (params[iparam].c*params[iparam].c) / (params[iparam].d*params[iparam].d + (params[iparam].h - cos_theta)*(params[iparam].h - cos_theta))); - const F_FLOAT numerator = -F_F(2.0) * params[iparam].c*params[iparam].c * (params[iparam].h - cos_theta); - const F_FLOAT denominator = (params[iparam].d*params[iparam].d) + - (params[iparam].h - cos_theta)*(params[iparam].h - cos_theta); - const F_FLOAT gijk_d = params[iparam].gamma*numerator/(denominator*denominator); // compute the derivative wrt Ri - - const F_FLOAT fc = ters_fc(rik,params[iparam].bigr,params[iparam].bigd); - const F_FLOAT dfc = ters_fc_d(rik,params[iparam].bigr,params[iparam].bigd); - - vec3_scale(fc*gijk_d*ex_delr,drk,drk); - vec3_scaleadd(dfc*gijk*ex_delr,rik_hat,drk,drk); - vec3_scaleadd(-fc*gijk*ex_delr_d,rik_hat,drk,drk); - vec3_scale(prefactor,drk,drk); -} - -__device__ void attractive(int iparam, F_FLOAT prefactor, - F_FLOAT4& delij, - F_FLOAT4& delik, - F_FLOAT3& fi, F_FLOAT3& fj, F_FLOAT3& fk) -{ - F_FLOAT3 rij_hat,rik_hat; - F_FLOAT rij,rijinv,rik,rikinv; - - rij = sqrt(delij.w); - rijinv = F_F(1.0)/rij; - vec3_scale(rijinv,delij,rij_hat); - - rik = sqrt(delik.w); - rikinv = F_F(1.0)/rik; - vec3_scale(rikinv,delik,rik_hat); - - ters_zetaterm_d(prefactor,rij_hat,rij,rik_hat,rik,fi,fj,fk,iparam); -} - -__device__ void attractive_fi(int& iparam, F_FLOAT& prefactor, - F_FLOAT4& delij, - F_FLOAT4& delik, - F_FLOAT3& f) -{ - F_FLOAT3 rij_hat,rik_hat; - F_FLOAT rij,rijinv,rik,rikinv; - - rij = sqrt(delij.w); - rijinv = F_F(1.0)/rij; - vec3_scale(rijinv,delij,rij_hat); - - rik = sqrt(delik.w); - rikinv = F_F(1.0)/rik; - vec3_scale(rikinv,delik,rik_hat); - - ters_zetaterm_d_fi(prefactor,rij_hat,rij,rik_hat,rik,f,iparam); -} - -__device__ void attractive_fj(int iparam, F_FLOAT prefactor, - F_FLOAT4& delij, - F_FLOAT4& delik, - F_FLOAT3& f) -{ - F_FLOAT3 rij_hat,rik_hat; - F_FLOAT rij,rijinv,rik,rikinv; - - rij = sqrt(delij.w); - rijinv = F_F(1.0)/rij; - vec3_scale(rijinv,delij,rij_hat); - - rik = sqrt(delik.w); - rikinv = F_F(1.0)/rik; - vec3_scale(rikinv,delik,rik_hat); - - ters_zetaterm_d_fj(prefactor,rij_hat,rij,rik_hat,rik,f,iparam); -} - -__device__ void attractive_fk(int iparam, F_FLOAT prefactor, - F_FLOAT4& delij, - F_FLOAT4& delik, - F_FLOAT3& f) -{ - F_FLOAT3 rij_hat,rik_hat; - F_FLOAT rij,rijinv,rik,rikinv; - - rij = sqrt(delij.w); - rijinv = F_F(1.0)/rij; - vec3_scale(rijinv,delij,rij_hat); - - rik = sqrt(delik.w); - rikinv = F_F(1.0)/rik; - 
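// Caching note: the RIJ pre-pass stores each pair vector as an F_FLOAT4 with
// the squared distance kept in .w, so these helpers recover r and the unit
// vector with one sqrt and one scale instead of redoing the dot product:
//
//   F_FLOAT  r = sqrt(del.w);              // .w == dx*dx + dy*dy + dz*dz
//   F_FLOAT3 rhat;
//   vec3_scale(F_F(1.0) / r, del, rhat);   // componentwise (x,y,z) scale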
vec3_scale(rikinv,delik,rik_hat); - - ters_zetaterm_d_fk(prefactor,rij_hat,rij,rik_hat,rik,f,iparam); -} - -__global__ void Pair_Tersoff_Kernel_TpA_RIJ()//F_FLOAT4* _glob_r_ij,int* _glob_numneigh_red,int* _glob_neighbors_red,int* _glob_neightype_red) -{ - int ii = (blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x; - if( ii >= _nall ) return; - - X_FLOAT4 myxtype; - F_FLOAT4 delij; - F_FLOAT xtmp,ytmp,ztmp; - int itype,jnum,i,j; - int* jlist; - int neigh_red = 0; - i = ii;//_ilist[ii]; - myxtype = fetchXType(i); - - xtmp=myxtype.x; - ytmp=myxtype.y; - ztmp=myxtype.z; - itype=map[(static_cast <int> (myxtype.w))]; - - jnum = _numneigh[i]; - jlist = &_neighbors[i]; - - __syncthreads(); - for (int jj = 0; jj < jnum; jj++) - { - if(jj<jnum) - { - j = jlist[jj*_nall]; - j &= NEIGHMASK; - myxtype = fetchXType(j); - delij.x = xtmp - myxtype.x; - delij.y = ytmp - myxtype.y; - delij.z = ztmp - myxtype.z; - int jtype = map[(static_cast <int> (myxtype.w))]; - int iparam_ij = elem2param[(itype*nelements+jtype)*nelements+jtype]; - delij.w = vec3_dot(delij,delij); - if (delij.w < params[iparam_ij].cutsq) - { - _glob_neighbors_red[i+neigh_red*_nall]=j; - _glob_neightype_red[i+neigh_red*_nall]=jtype; - _glob_r_ij[i+neigh_red*_nall]=delij; - neigh_red++; - } - } - } - _glob_numneigh_red[i]=neigh_red; -} - - - __global__ void Pair_Tersoff_Kernel_TpA_ZetaIJ()//F_FLOAT* _glob_zeta_ij,F_FLOAT4* _glob_r_ij,int* _glob_numneigh_red,int* _glob_neighbors_red,int* _glob_neightype_red) - { - - int ii = (blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x; - if( ii >= _nall ) return; - - - F_FLOAT4 delij; - F_FLOAT4 delik; - - int itype,jnum,i,j; - int* jlist; - i = ii; - itype=map[(static_cast <int> (_type[i]))]; - - jnum = _glob_numneigh_red[i]; - jlist = &_glob_neighbors_red[i]; - - __syncthreads(); - for (int jj = 0; jj < jnum; jj++) - { - if(jj - template <int eflag, int vflagm> - __global__ void Pair_Tersoff_Kernel_TpA(int eflag_atom,int vflag_atom)//,F_FLOAT* _glob_zeta_ij,F_FLOAT4* _glob_r_ij,int* _glob_numneigh_red,int* _glob_neighbors_red,int* _glob_neightype_red) -{ - ENERGY_FLOAT evdwl = ENERGY_F(0.0); - - ENERGY_FLOAT* sharedE = &sharedmem[threadIdx.x]; - ENERGY_FLOAT* sharedV = &sharedmem[threadIdx.x]; - - F_FLOAT* shared_F_F = (F_FLOAT*) sharedmem; - if((eflag||eflag_atom)&&(vflagm||vflag_atom)) shared_F_F = (F_FLOAT*) &sharedmem[7*blockDim.x]; - else - if(eflag) shared_F_F = (F_FLOAT*) &sharedmem[blockDim.x]; - else - if(vflagm) shared_F_F = (F_FLOAT*) &sharedmem[6*blockDim.x]; - shared_F_F+=threadIdx.x; - - if(eflag_atom||eflag) - { - sharedE[0] = ENERGY_F(0.0); - sharedV += blockDim.x; - } - - if(vflagm||vflag_atom) - { - sharedV[0*blockDim.x] = ENERGY_F(0.0); - sharedV[1*blockDim.x] = ENERGY_F(0.0); - sharedV[2*blockDim.x] = ENERGY_F(0.0); - sharedV[3*blockDim.x] = ENERGY_F(0.0); - sharedV[4*blockDim.x] = ENERGY_F(0.0); - sharedV[5*blockDim.x] = ENERGY_F(0.0); - } - - int jnum_red=0; -#define fxtmp shared_F_F[0] -#define fytmp shared_F_F[blockDim.x] -#define fztmp shared_F_F[2*blockDim.x] -//#define jnum_red (static_cast <int> (shared_F_F[3*blockDim.x])) - - int ii = (blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x; - - X_FLOAT4 myxtype_i,myxtype_j,myxtype_k; - F_FLOAT4 delij,delik,deljk; - F_FLOAT fpair; - F_FLOAT prefactor_ij,prefactor_ji; - - int itype,i,j; - int* jlist_red; - - if(ii < _inum) - { - i = _ilist[ii]; - - if(vflagm) - myxtype_i=fetchXType(i); - //itype=map[(static_cast <int> (myxtype_i.w))]; - itype=map[_type[i]]; - - - fxtmp = F_F(0.0); - fytmp = F_F(0.0); - fztmp = F_F(0.0); - - - //shared_F_F[3*blockDim.x] = _glob_numneigh_red[i]; - jnum_red = _glob_numneigh_red[i]; - jlist_red = &_glob_neighbors_red[i]; - } - __syncthreads(); - -#pragma unroll 1 - for (int jj = 0; jj < jnum_red; jj++) - { - if(i < _nlocal) - { -
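// Access-pattern note: the reduced neighbor arrays are stored transposed,
// indexed as [i + jj*_nall] rather than [i*maxneighbors + jj], so at a fixed
// jj the near-consecutive atom indices handled by a warp touch consecutive
// addresses and the loads below coalesce:
//
//   j     = _glob_neighbors_red[i + jj * _nall];   // == jlist_red[jj * _nall]
//   jtype = _glob_neightype_red[i + jj * _nall];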
fpair=F_F(0.0); - j = jlist_red[jj*_nall]; - j &= NEIGHMASK; - - if(vflagm) - myxtype_j = fetchXType(j); - - int jtype = _glob_neightype_red[i+jj*_nall]; - delij = _glob_r_ij[i+jj*_nall]; - - volatile int iparam_ij = elem2param[(itype*nelements+jtype)*nelements+jtype]; - volatile int iparam_ji = elem2param[(jtype*nelements+itype)*nelements+itype]; - - if (delij.w(); - else if(eflag) PairVirialCompute_A_Kernel_Template<1,0>(); - else if(vflagm) PairVirialCompute_A_Kernel_Template<0,1>(); -#undef fxtmp -#undef fytmp -#undef fztmp -//#undef jnum_red -} diff --git a/lib/cuda/pair_virial_compute_cu.h b/lib/cuda/pair_virial_compute_cu.h deleted file mode 100644 index fdd2cecb8c..0000000000 --- a/lib/cuda/pair_virial_compute_cu.h +++ /dev/null @@ -1,26 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - - Original Version: - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - See the README file in the top-level LAMMPS directory. - - ----------------------------------------------------------------------- - - USER-CUDA Package and associated modifications: - https://sourceforge.net/projects/lammpscuda/ - - Christian Trott, christian.trott@tu-ilmenau.de - Lars Winterfeld, lars.winterfeld@tu-ilmenau.de - Theoretical Physics II, University of Technology Ilmenau, Germany - - See the README file in the USER-CUDA directory. - - This software is distributed under the GNU General Public License. -------------------------------------------------------------------------- */ - -#include "cuda_shared.h" - -extern "C" void Cuda_PairVirialCompute(cuda_shared_data* sdata, int offset, int end); diff --git a/lib/cuda/pppm_cuda.cu b/lib/cuda/pppm_cuda.cu deleted file mode 100644 index dd434b9bbf..0000000000 --- a/lib/cuda/pppm_cuda.cu +++ /dev/null @@ -1,588 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - - Original Version: - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - See the README file in the top-level LAMMPS directory. - - ----------------------------------------------------------------------- - - USER-CUDA Package and associated modifications: - https://sourceforge.net/projects/lammpscuda/ - - Christian Trott, christian.trott@tu-ilmenau.de - Lars Winterfeld, lars.winterfeld@tu-ilmenau.de - Theoretical Physics II, University of Technology Ilmenau, Germany - - See the README file in the USER-CUDA directory. - - This software is distributed under the GNU General Public License. -------------------------------------------------------------------------- */ - -#include "cuda_precision.h" -//#define FFT_CUFFT -#define MY_PREFIX pppm -#include "cuda_shared.h" -#include "cuda_common.h" -#include "pppm_cuda_cu.h" -#include "cuda_runtime.h" -#include - -//#include "crm_cuda_utils.cu" -#define MIN(a,b) ((a) < (b) ? (a) : (b)) -#define MAX(a,b) ((a) > (b) ? 
(a) : (b)) - -__device__ __constant__ FFT_FLOAT* work1; -__device__ __constant__ FFT_FLOAT* work2; -__device__ __constant__ FFT_FLOAT* work3; -__device__ __constant__ PPPM_FLOAT* greensfn; -__device__ __constant__ PPPM_FLOAT* gf_b; -__device__ __constant__ PPPM_FLOAT* fkx; -__device__ __constant__ PPPM_FLOAT* fky; -__device__ __constant__ PPPM_FLOAT* fkz; -__device__ __constant__ PPPM_FLOAT* vg; -__device__ __constant__ int* part2grid; -__device__ __constant__ PPPM_FLOAT* density_brick; -__device__ __constant__ int* density_brick_int; -__device__ __constant__ PPPM_FLOAT density_intScale; -__device__ __constant__ PPPM_FLOAT* vdx_brick; -__device__ __constant__ PPPM_FLOAT* vdy_brick; -__device__ __constant__ PPPM_FLOAT* vdz_brick; -__device__ __constant__ PPPM_FLOAT* density_fft; -__device__ __constant__ ENERGY_FLOAT* energy; -__device__ __constant__ ENERGY_FLOAT* virial; -__device__ __constant__ int nxlo_in; -__device__ __constant__ int nxhi_in; -__device__ __constant__ int nxlo_out; -__device__ __constant__ int nxhi_out; -__device__ __constant__ int nylo_in; -__device__ __constant__ int nyhi_in; -__device__ __constant__ int nylo_out; -__device__ __constant__ int nyhi_out; -__device__ __constant__ int nzlo_in; -__device__ __constant__ int nzhi_in; -__device__ __constant__ int nzlo_out; -__device__ __constant__ int nzhi_out; -__device__ __constant__ int nxlo_fft; -__device__ __constant__ int nxhi_fft; -__device__ __constant__ int nylo_fft; -__device__ __constant__ int nyhi_fft; -__device__ __constant__ int nzlo_fft; -__device__ __constant__ int nzhi_fft; -__device__ __constant__ int nx_pppm; -__device__ __constant__ int ny_pppm; -__device__ __constant__ int nz_pppm; -__device__ __constant__ int slabflag; -__device__ __constant__ PPPM_FLOAT qqrd2e; -__device__ __constant__ int order; -//__device__ __constant__ float3 sublo; -__device__ __constant__ PPPM_FLOAT* rho_coeff; -__device__ __constant__ int nmax; -__device__ __constant__ int nlocal; -__device__ __constant__ PPPM_FLOAT* debugdata; -__device__ __constant__ PPPM_FLOAT delxinv; -__device__ __constant__ PPPM_FLOAT delyinv; -__device__ __constant__ PPPM_FLOAT delzinv; -__device__ __constant__ int nlower; -__device__ __constant__ int nupper; -__device__ __constant__ PPPM_FLOAT shiftone; - - -#include "pppm_cuda_kernel.cu" -#include "stdio.h" -void pppm_device_init(void* cu_density_brick, void* cu_vdx_brick, void* cu_vdy_brick, void* cu_vdz_brick, void* cu_density_fft, void* cu_energy, void* cu_virial - , void* cu_work1, void* cu_work2, void* cu_work3, void* cu_greensfn, void* cu_fkx, void* cu_fky, void* cu_fkz, void* cu_vg - , int cu_nxlo_in, int cu_nxhi_in, int cu_nylo_in, int cu_nyhi_in, int cu_nzlo_in, int cu_nzhi_in, int cu_nxlo_out, int cu_nxhi_out, int cu_nylo_out, int cu_nyhi_out, int cu_nzlo_out, int cu_nzhi_out, int cu_nx_pppm, int cu_ny_pppm, int cu_nz_pppm - , int cu_nxlo_fft, int cu_nxhi_fft, int cu_nylo_fft, int cu_nyhi_fft, int cu_nzlo_fft, int cu_nzhi_fft, void* cu_gf_b - , double cu_qqrd2e, int cu_order, void* cu_rho_coeff, void* cu_debugdata, void* cu_density_brick_int, int cu_slabflag - ) -{ - CUT_CHECK_ERROR("ERROR-CUDA poisson_init Start"); - cudaMemcpyToSymbol(density_brick, &cu_density_brick, sizeof(PPPM_FLOAT*)); - cudaMemcpyToSymbol(density_brick_int, &cu_density_brick_int, sizeof(PPPM_FLOAT*)); - cudaMemcpyToSymbol(vdx_brick, &cu_vdx_brick, sizeof(PPPM_FLOAT*)); - cudaMemcpyToSymbol(vdy_brick, &cu_vdy_brick, sizeof(PPPM_FLOAT*)); - cudaMemcpyToSymbol(vdz_brick, &cu_vdz_brick, sizeof(PPPM_FLOAT*)); - 
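// Index-range note: PPPM keeps three nested per-dimension extents, mirrored
// into the __constant__ scalars copied below: *_in bounds the grid cells this
// process owns, *_out adds the ghost layers needed for charge spreading and
// force interpolation, and *_fft bounds this process's slab of the FFT
// decomposition. A brick array is then addressed relative to its *_out corner:
//
//   int nx = nxhi_out - nxlo_out + 1, ny = nyhi_out - nylo_out + 1;
//   // density_brick[((z - nzlo_out)*ny + (y - nylo_out))*nx + (x - nxlo_out)]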
cudaMemcpyToSymbol(density_fft, &cu_density_fft, sizeof(PPPM_FLOAT*)); - cudaMemcpyToSymbol(energy, &cu_energy, sizeof(ENERGY_FLOAT*)); - cudaMemcpyToSymbol(virial, &cu_virial, sizeof(ENERGY_FLOAT*)); - cudaMemcpyToSymbol(nxlo_in, &cu_nxlo_in, sizeof(int)); - cudaMemcpyToSymbol(nxhi_in, &cu_nxhi_in, sizeof(int)); - cudaMemcpyToSymbol(nxlo_out, &cu_nxlo_out, sizeof(int)); - cudaMemcpyToSymbol(nxhi_out, &cu_nxhi_out, sizeof(int)); - cudaMemcpyToSymbol(nylo_in, &cu_nylo_in, sizeof(int)); - cudaMemcpyToSymbol(nyhi_in, &cu_nyhi_in, sizeof(int)); - cudaMemcpyToSymbol(nylo_out, &cu_nylo_out, sizeof(int)); - cudaMemcpyToSymbol(nyhi_out, &cu_nyhi_out, sizeof(int)); - cudaMemcpyToSymbol(nzlo_in, &cu_nzlo_in, sizeof(int)); - cudaMemcpyToSymbol(nzhi_in, &cu_nzhi_in, sizeof(int)); - cudaMemcpyToSymbol(nzlo_out, &cu_nzlo_out, sizeof(int)); - cudaMemcpyToSymbol(nzhi_out, &cu_nzhi_out, sizeof(int)); - cudaMemcpyToSymbol(nxlo_fft, &cu_nxlo_fft, sizeof(int)); - cudaMemcpyToSymbol(nxhi_fft, &cu_nxhi_fft, sizeof(int)); - cudaMemcpyToSymbol(nylo_fft, &cu_nylo_fft, sizeof(int)); - cudaMemcpyToSymbol(nyhi_fft, &cu_nyhi_fft, sizeof(int)); - cudaMemcpyToSymbol(nzlo_fft, &cu_nzlo_fft, sizeof(int)); - cudaMemcpyToSymbol(nzhi_fft, &cu_nzhi_fft, sizeof(int)); - cudaMemcpyToSymbol(slabflag, &cu_slabflag, sizeof(int)); - cudaMemcpyToSymbol(nx_pppm, &cu_nx_pppm, sizeof(int)); - cudaMemcpyToSymbol(ny_pppm, &cu_ny_pppm, sizeof(int)); - cudaMemcpyToSymbol(nz_pppm, &cu_nz_pppm, sizeof(int)); - cudaMemcpyToSymbol(work1, &cu_work1, sizeof(FFT_FLOAT*)); - cudaMemcpyToSymbol(work2, &cu_work2, sizeof(FFT_FLOAT*)); - cudaMemcpyToSymbol(work3, &cu_work3, sizeof(FFT_FLOAT*)); - cudaMemcpyToSymbol(greensfn, &cu_greensfn, sizeof(PPPM_FLOAT*)); - cudaMemcpyToSymbol(gf_b, &cu_gf_b, sizeof(PPPM_FLOAT*)); - cudaMemcpyToSymbol(fkx, &cu_fkx, sizeof(PPPM_FLOAT*)); - cudaMemcpyToSymbol(fky, &cu_fky, sizeof(PPPM_FLOAT*)); - cudaMemcpyToSymbol(fkz, &cu_fkz, sizeof(PPPM_FLOAT*)); - cudaMemcpyToSymbol(vg, &cu_vg, sizeof(PPPM_FLOAT*)); - - PPPM_FLOAT cu_qqrd2e_a = cu_qqrd2e; - cudaMemcpyToSymbol(qqrd2e, &cu_qqrd2e_a, sizeof(PPPM_FLOAT)); - cudaMemcpyToSymbol(order, &cu_order, sizeof(int)); - cudaMemcpyToSymbol(rho_coeff, &cu_rho_coeff, sizeof(PPPM_FLOAT*)); - cudaMemcpyToSymbol(debugdata, &cu_debugdata, sizeof(PPPM_FLOAT*)); - - CUT_CHECK_ERROR("ERROR-CUDA poisson_init"); - - /*if(sizeof(CUDA_FLOAT)==sizeof(float)) printf("PPPMCuda Kernel: Using single precision\n"); - - #ifdef PPPM_PRECISION - if(sizeof(PPPM_FLOAT)==sizeof(float)) printf("PPPMCuda Kernel: Using single precision for pppm core\n"); - if(sizeof(PPPM_FLOAT)==sizeof(double)) printf("PPPMCuda Kernel: Using double precision for pppm core\n"); - #endif - #ifdef ENERGY_PRECISION - if(sizeof(ENERGY_FLOAT)==sizeof(float)) printf("PPPMCuda Kernel: Using single precision for energy\n"); - if(sizeof(ENERGY_FLOAT)==sizeof(double)) printf("PPPMCuda Kernel: Using double precision for energy\n"); - #endif - #ifdef ENERGY_PRECISION - if(sizeof(FFT_FLOAT)==sizeof(float)) printf("PPPMCuda Kernel: Using single precision for fft\n"); - if(sizeof(FFT_FLOAT)==sizeof(double)) printf("PPPMCuda Kernel: Using double precision for fft\n"); - #endif - #ifdef X_PRECISION - if(sizeof(X_FLOAT)==sizeof(float)) printf("PPPMCuda Kernel: Using single precision for positions\n"); - if(sizeof(X_FLOAT)==sizeof(double)) printf("PPPMCuda Kernel: Using double precision for positions\n"); - #endif - #ifdef F_PRECISION - if(sizeof(F_FLOAT)==sizeof(float)) printf("PPPMCuda Kernel: Using single precision for forces\n"); - 
if(sizeof(F_FLOAT)==sizeof(double)) printf("PPPMCuda Kernel: Using double precision for forces\n"); - #endif*/ -} - -void pppm_device_init_setup(cuda_shared_data* sdata, PPPM_FLOAT cu_shiftone, PPPM_FLOAT cu_delxinv, PPPM_FLOAT cu_delyinv, PPPM_FLOAT cu_delzinv, int cu_nlower, int cu_nupper) -{ - cudaMemcpyToSymbol(delxinv, &cu_delxinv, sizeof(PPPM_FLOAT)); - cudaMemcpyToSymbol(delyinv, &cu_delyinv, sizeof(PPPM_FLOAT)); - cudaMemcpyToSymbol(delzinv, &cu_delzinv, sizeof(PPPM_FLOAT)); - cudaMemcpyToSymbol(shiftone, &cu_shiftone, sizeof(PPPM_FLOAT)); - cudaMemcpyToSymbol(nlower, &cu_nlower, sizeof(int)); - cudaMemcpyToSymbol(nupper, &cu_nupper, sizeof(int)); - cudaMemcpyToSymbol(MY_AP(sublo) , sdata->domain.sublo, 3 * sizeof(X_FLOAT)); - cudaMemcpyToSymbol(MY_AP(subhi) , sdata->domain.subhi, 3 * sizeof(X_FLOAT)); - cudaMemcpyToSymbol(MY_AP(boxlo) , sdata->domain.boxlo, 3 * sizeof(X_FLOAT)); - CUT_CHECK_ERROR("ERROR-CUDA pppm_init_setup"); -} - -void pppm_device_update(cuda_shared_data* sdata, void* cu_part2grid, int nlocala, int nmaxa) -{ - cudaMemcpyToSymbol("part2grid", &cu_part2grid, sizeof(int*)); - cudaMemcpyToSymbol(MY_AP(x) , & sdata->atom.x .dev_data, sizeof(X_FLOAT*)); - cudaMemcpyToSymbol(MY_AP(f) , & sdata->atom.f .dev_data, sizeof(F_FLOAT*)); - cudaMemcpyToSymbol(MY_AP(q) , & sdata->atom.q .dev_data, sizeof(F_FLOAT*)); - cudaMemcpyToSymbol(MY_AP(tag) , & sdata->atom.tag .dev_data, sizeof(int*)); - //cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal .dev_data, sizeof(int)); - cudaMemcpyToSymbol(nlocal , &nlocala, sizeof(int)); - cudaMemcpyToSymbol(nmax , &nmaxa, sizeof(int)); - CUT_CHECK_ERROR("ERROR-CUDA pppm_device_update"); - -} - -void pppm_update_nlocal(int nlocala) -{ - cudaMemcpyToSymbol(nlocal , &nlocala, sizeof(int)); - CUT_CHECK_ERROR("ERROR-CUDA update_nlocal b"); -} - - -void Cuda_PPPM_Setup_fkxyz_vg(int nx_pppma, int ny_pppma, int nz_pppma, PPPM_FLOAT unitkx, PPPM_FLOAT unitky, PPPM_FLOAT unitkz, PPPM_FLOAT g_ewald) -{ - dim3 grid; - dim3 threads; - grid.x = nz_pppma; - grid.y = ny_pppma; - grid.z = 1; - threads.x = nx_pppma; - threads.y = 1; - threads.z = 1; - setup_fkxyz_vg <<< grid, threads, 0>>>(unitkx, unitky, unitkz, g_ewald); - cudaThreadSynchronize(); - - CUT_CHECK_ERROR("ERROR-CUDA Cuda_PPPM_Setup_fkxyz_vg "); -} - -void Cuda_PPPM_setup_greensfn(int nx_pppma, int ny_pppma, int nz_pppma, PPPM_FLOAT unitkx, PPPM_FLOAT unitky, PPPM_FLOAT unitkz, PPPM_FLOAT g_ewald, - int nbx, int nby, int nbz, PPPM_FLOAT xprd, PPPM_FLOAT yprd, PPPM_FLOAT zprd_slab) -{ - dim3 grid; - dim3 threads; - grid.x = nz_pppma; - grid.y = ny_pppma; - grid.z = 1; - threads.x = nx_pppma; - threads.y = 1; - threads.z = 1; - setup_greensfn <<< grid, threads, 0>>>(unitkx, unitky, unitkz, g_ewald, nbx, nby, nbz, xprd, yprd, zprd_slab); - cudaThreadSynchronize(); - CUT_CHECK_ERROR("ERROR-CUDA Cuda_PPPM_Setup_greensfn "); -} - -void poisson_scale(int nx_pppma, int ny_pppma, int nz_pppma) -{ - dim3 grid; - dim3 threads; - grid.x = nz_pppma; - grid.y = ny_pppma; - grid.z = 1; - threads.x = nx_pppma; - threads.y = 1; - threads.z = 1; - poisson_scale_kernel <<< grid, threads, 0>>>(); - CUT_CHECK_ERROR("ERROR-CUDA poisson_scale "); - -} - -void poisson_xgrad(int nx_pppma, int ny_pppma, int nz_pppma) -{ - dim3 grid; - dim3 threads; - grid.x = nz_pppma; - grid.y = ny_pppma; - grid.z = 1; - threads.x = nx_pppma; - threads.y = 1; - threads.z = 1; - poisson_xgrad_kernel <<< grid, threads, 0>>>(); - CUT_CHECK_ERROR("ERROR-CUDA poisson_xgrad "); -} - -void poisson_ygrad(int nx_pppma, int ny_pppma, int 
nz_pppma) -{ - dim3 grid; - dim3 threads; - grid.x = nz_pppma; - grid.y = ny_pppma; - grid.z = 1; - threads.x = nx_pppma; - threads.y = 1; - threads.z = 1; - poisson_ygrad_kernel <<< grid, threads, 0>>>(); - CUT_CHECK_ERROR("ERROR-CUDA poisson_ygrad "); -} - -void poisson_zgrad(int nx_pppma, int ny_pppma, int nz_pppma) -{ - dim3 grid; - dim3 threads; - grid.x = nz_pppma; - grid.y = ny_pppma; - grid.z = 1; - threads.x = nx_pppma; - threads.y = 1; - threads.z = 1; - poisson_zgrad_kernel <<< grid, threads, 0>>>(); - CUT_CHECK_ERROR("ERROR-CUDA poisson_zgrad "); -} - -void poisson_vdx_brick(int ihi, int ilo, int jhi, int jlo, int khi, int klo, int nx_pppma, int ny_pppma, int nz_pppma) -{ - - dim3 grid; - dim3 threads; - grid.x = khi - klo + 1; - grid.y = jhi - jlo + 1; - grid.z = 1; - threads.x = ihi - ilo + 1; - threads.y = 1; - threads.z = 1; - //printf("VDX_BRICK CUDA: %i %i %i\n",grid.x,grid.y,threads.x); - poisson_vdx_brick_kernel <<< grid, threads, 0>>>(ilo, jlo, klo); - CUT_CHECK_ERROR("ERROR-CUDA poisson_vdxbrick "); - cudaThreadSynchronize(); -} - -void poisson_vdy_brick(int ihi, int ilo, int jhi, int jlo, int khi, int klo, int nx_pppm, int ny_pppm, int nz_pppm) -{ - dim3 grid; - dim3 threads; - grid.x = khi - klo + 1; - grid.y = jhi - jlo + 1; - grid.z = 1; - threads.x = ihi - ilo + 1; - threads.y = 1; - threads.z = 1; - poisson_vdy_brick_kernel <<< grid, threads, 0>>>(ilo, jlo, klo); - CUT_CHECK_ERROR("ERROR-CUDA poisson_vdybrick "); - cudaThreadSynchronize(); -} - -void poisson_vdz_brick(int ihi, int ilo, int jhi, int jlo, int khi, int klo, int nx_pppm, int ny_pppm, int nz_pppm) -{ - dim3 grid; - dim3 threads; - grid.x = khi - klo + 1; - grid.y = jhi - jlo + 1; - grid.z = 1; - threads.x = ihi - ilo + 1; - threads.y = 1; - threads.z = 1; - poisson_vdz_brick_kernel <<< grid, threads, 0>>>(ilo, jlo, klo); - CUT_CHECK_ERROR("ERROR-CUDA poisson_vdzbrick "); - cudaThreadSynchronize(); -} - - -void poisson_energy(int nxlo_fft, int nxhi_fft, int nylo_fft, int nyhi_fft, int nzlo_fft, int nzhi_fft, int vflag) -{ - //printf("VFLAG_GPU: %i\n",vflag); - CUT_CHECK_ERROR("ERROR-CUDA poisson_energy start "); - dim3 grid; - dim3 threads; - grid.x = nzhi_fft - nzlo_fft + 1; - grid.y = nyhi_fft - nylo_fft + 1; - grid.z = 1; - threads.x = nxhi_fft - nxlo_fft + 1; - threads.y = 1; - threads.z = 1; - poisson_energy_kernel <<< grid, threads, threads.x* sizeof(ENERGY_FLOAT)>>>(nxlo_fft, nylo_fft, nzlo_fft, vflag); - - cudaThreadSynchronize(); - CUT_CHECK_ERROR("ERROR-CUDA poisson_energy end "); -} - -ENERGY_FLOAT sum_energy(void* cu_virial, void* cu_energy, int nx_pppma, int ny_pppma, int nz_pppma, int vflag, ENERGY_FLOAT* cpu_virial) -{ - ENERGY_FLOAT host_energy = 0; - dim3 grid; - dim3 threads; - - grid.x = nz_pppma; - grid.y = 1; - grid.z = 1; - threads.x = ny_pppma; - threads.y = 1; - threads.z = 1; - sum_energy_kernel1 <<< grid, threads, ny_pppma* sizeof(ENERGY_FLOAT)>>>(vflag); - cudaThreadSynchronize(); - CUT_CHECK_ERROR("ERROR-CUDA sumenergy_kernel1 "); - - grid.x = 1; - grid.y = 1; - grid.z = 1; - threads.x = nz_pppma; - threads.y = 1; - threads.z = 1; - sum_energy_kernel2 <<< grid, threads, nz_pppma* sizeof(ENERGY_FLOAT)>>>(vflag); - cudaThreadSynchronize(); - CUT_CHECK_ERROR("ERROR-CUDA sumenergy_kernel2 "); - - cudaMemcpy((void*)(&host_energy), cu_energy, sizeof(ENERGY_FLOAT), cudaMemcpyDeviceToHost); - - if(vflag) - cudaMemcpy((void*) cpu_virial, (void*) cu_virial, 6 * sizeof(ENERGY_FLOAT), cudaMemcpyDeviceToHost); - CUT_CHECK_ERROR("ERROR-CUDA sumenergy_memcopy"); - - return host_energy; -} 
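sum_energy() reduces the mesh in two launches: sum_energy_kernel1 collapses each z-plane to one partial with one block per plane, sum_energy_kernel2 collapses those partials with a single block, and only a single ENERGY_FLOAT is copied back to the host. A minimal sketch of such a two-pass reduction, using plain double, assumed names, a power-of-two block size, and at most 256 partials:

    __global__ void reduce_pass(const double* in, double* out, int n)
    {
      extern __shared__ double s[];
      int i = blockIdx.x * blockDim.x + threadIdx.x;
      s[threadIdx.x] = (i < n) ? in[i] : 0.0;          // load, pad the tail with zeros
      __syncthreads();
      for (int stride = blockDim.x / 2; stride > 0; stride /= 2) {
        if (threadIdx.x < stride) s[threadIdx.x] += s[threadIdx.x + stride];
        __syncthreads();
      }
      if (threadIdx.x == 0) out[blockIdx.x] = s[0];    // one partial per block
    }

    // pass 1: n values -> nblocks partials; pass 2: nblocks partials -> out[0]
    // reduce_pass<<<nblocks, 256, 256 * sizeof(double)>>>(d_in, d_partial, n);
    // reduce_pass<<<1, 256, 256 * sizeof(double)>>>(d_partial, d_sum, nblocks);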
- -void cuda_make_rho(cuda_shared_data* sdata, void* flag, PPPM_FLOAT* cu_density_intScale, int ihi, int ilo, int jhi, int jlo, int khi, int klo, void* cu_density_brick, void* cu_density_brick_int) -{ - CUT_CHECK_ERROR("cuda_make_rho begin"); - dim3 grid, threads; - int cpu_flag[3]; - grid.x = (sdata->atom.nlocal + 31) / 32; - grid.y = 1; - grid.z = 1; - threads.x = 32; - threads.y = 1; - threads.z = 1; - int sharedmemsize = (32 + 32 * (sdata->pppm.nupper - sdata->pppm.nlower + 1) + sdata->pppm.order * (sdata->pppm.order / 2 - (1 - sdata->pppm.order) / 2 + 1)) * sizeof(PPPM_FLOAT); - - do { - cpu_flag[0] = 0; - cpu_flag[1] = 0; - cpu_flag[2] = 0; - cudaMemcpyToSymbol("density_intScale", cu_density_intScale, sizeof(PPPM_FLOAT*)); - CUT_CHECK_ERROR("ERROR-CUDA make_rho pre Z"); - cudaMemset(flag, 0, 3 * sizeof(int)); - CUT_CHECK_ERROR("ERROR-CUDA make_rho pre A"); - cudaMemset(cu_density_brick, 0, (khi - klo + 1) * (jhi - jlo + 1) * (ihi - ilo + 1)*sizeof(PPPM_FLOAT)); - CUT_CHECK_ERROR("ERROR-CUDA make_rho pre B"); - cudaMemset(cu_density_brick_int, 0, (khi - klo + 1) * (jhi - jlo + 1) * (ihi - ilo + 1)*sizeof(int)); - CUT_CHECK_ERROR("ERROR-CUDA make_rho pre C"); - make_rho_kernel <<< grid, threads, sharedmemsize>>>((int*) flag, 32 / (sdata->pppm.nupper - sdata->pppm.nlower + 1)); - cudaThreadSynchronize(); - CUT_CHECK_ERROR("ERROR-CUDA make_rho A"); - cudaMemcpy((void*) &cpu_flag, flag, 3 * sizeof(int), cudaMemcpyDeviceToHost); - - if(cpu_flag[0] != 0) { - (*cu_density_intScale) /= 2; - MYDBG(printf("PPPM_Cuda::cuda_make_rho: Decrease cu_density_intScale to: %e\n", *cu_density_intScale);) - } - if((cpu_flag[0] == 0) && (cpu_flag[1] == 0)) { - (*cu_density_intScale) *= 2; - MYDBG(printf("PPPM_Cuda::cuda_make_rho: Increase cu_density_intScale to: %e\n", *cu_density_intScale);) - } - /* if((*cu_density_intScale)>0xe0000000) - { - printf("Error Scaling\n"); - cpu_flag[0]=0; - cpu_flag[1]=1; - }*/ - CUT_CHECK_ERROR("ERROR-CUDA make_rho B"); - } while((cpu_flag[0] != 0) || (cpu_flag[1] == 0)); - - - grid.x = khi - klo + 1; - grid.y = jhi - jlo + 1; - threads.x = ihi - ilo + 1; - scale_rho_kernel <<< grid, threads, 0>>>(); - cudaThreadSynchronize(); - CUT_CHECK_ERROR("ERROR-CUDA make_rho_scale"); -} - - -int cuda_particle_map(cuda_shared_data* sdata, void* flag) -{ - dim3 grid, threads; - int cpu_flag; - grid.x = (sdata->atom.nlocal + 31) / 32; - grid.y = 1; - grid.z = 1; - threads.x = 32; - threads.y = 1; - threads.z = 1; - CUT_CHECK_ERROR("ERROR-CUDA particla_map ..pre"); - particle_map_kernel <<< grid, threads, 0>>>((int*) flag); - cudaThreadSynchronize(); - CUT_CHECK_ERROR("ERROR-CUDA particla_map a"); - cudaMemcpy((void*) &cpu_flag, flag, sizeof(int), cudaMemcpyDeviceToHost); - CUT_CHECK_ERROR("ERROR-CUDA particla_map b"); - return cpu_flag; -} - - -void cuda_fieldforce(cuda_shared_data* sdata, void* flag) -{ - dim3 grid, threads; - grid.x = (sdata->atom.nlocal + 31) / 32; - grid.y = 1; - grid.z = 1; - threads.x = 32; - threads.y = 1; - threads.z = 1; - int sharedmemsize = (32 + 3 * 32 * (sdata->pppm.nupper - sdata->pppm.nlower + 1) + sdata->pppm.order * (sdata->pppm.order / 2 - (1 - sdata->pppm.order) / 2 + 1)) * sizeof(PPPM_FLOAT); - fieldforce_kernel <<< grid, threads, sharedmemsize>>> - (sdata->pppm.nupper - sdata->pppm.nlower + 1, 32 / (sdata->pppm.nupper - sdata->pppm.nlower + 1), (int*) flag); - cudaThreadSynchronize(); - CUT_CHECK_ERROR("ERROR-CUDA fieldforce"); -} - -double cuda_slabcorr_energy(cuda_shared_data* sdata, ENERGY_FLOAT* buf, ENERGY_FLOAT* dev_buf) -{ - dim3 grid, 
threads; - grid.x = (sdata->atom.nlocal + 31) / 32; - grid.y = 1; - grid.z = 1; - threads.x = 32; - threads.y = 1; - threads.z = 1; - slabcorr_energy_kernel <<< grid, threads, 32* sizeof(ENERGY_FLOAT)>>>(dev_buf); - cudaThreadSynchronize(); - cudaMemcpy((void*) buf, dev_buf, grid.x* sizeof(ENERGY_FLOAT), cudaMemcpyDeviceToHost); - - double dipole_all = 0.0; - - for(int i = 0; i < grid.x; i++) - dipole_all += buf[i]; - - return dipole_all; -} - -void cuda_slabcorr_force(cuda_shared_data* sdata, F_FLOAT ffact) -{ - dim3 grid, threads; - grid.x = (sdata->atom.nlocal + 31) / 32; - grid.y = 1; - grid.z = 1; - threads.x = 32; - threads.y = 1; - threads.z = 1; - slabcorr_force_kernel <<< grid, threads>>>(ffact); - cudaThreadSynchronize(); -} - -void sum_virial(double* host_virial) -{ -} - -void pppm_initfftdata(cuda_shared_data* sdata, PPPM_FLOAT* in, FFT_FLOAT* out) -{ - int nslow = sdata->pppm.nzhi_in - sdata->pppm.nzlo_in; - int nmid = sdata->pppm.nyhi_in - sdata->pppm.nylo_in; - int nfast = sdata->pppm.nxhi_in - sdata->pppm.nxlo_in; - int nrimz = MAX(sdata->pppm.nzlo_in - sdata->pppm.nzlo_out, sdata->pppm.nzhi_out - sdata->pppm.nzhi_in); - int nrimy = MAX(sdata->pppm.nylo_in - sdata->pppm.nylo_out, sdata->pppm.nyhi_out - sdata->pppm.nyhi_in); - int nrimx = MAX(sdata->pppm.nxlo_in - sdata->pppm.nxlo_out, sdata->pppm.nxhi_out - sdata->pppm.nxhi_in); - dim3 grid; - grid.x = nslow + 1; - grid.y = nmid + 1; - grid.z = 1; - dim3 threads; - threads.x = nfast + 1; - threads.y = 1; - threads.z = 1; - cudaThreadSynchronize(); - initfftdata_core_kernel <<< grid, threads, 0>>>(in, out); - cudaThreadSynchronize(); - grid.x = nrimz; - grid.y = nmid + 1; - threads.x = nfast + 1; - initfftdata_z_kernel <<< grid, threads, 0>>>(in, out); - cudaThreadSynchronize(); - grid.x = nslow + 1; - grid.y = nrimy; - threads.x = nfast + 1; - initfftdata_y_kernel <<< grid, threads, 0>>>(in, out); - cudaThreadSynchronize(); - grid.x = nslow + 1; - grid.y = nmid + 1; - threads.x = nrimx; - initfftdata_x_kernel <<< grid, threads, 0>>>(in, out); - cudaThreadSynchronize(); - grid.x = nrimz; - grid.y = nrimy; - threads.x = nfast + 1; - initfftdata_yz_kernel <<< grid, threads, 0>>>(in, out); - cudaThreadSynchronize(); - grid.x = nrimz; - grid.y = nmid + 1; - threads.x = nrimx; - initfftdata_xz_kernel <<< grid, threads, 0>>>(in, out); - cudaThreadSynchronize(); - grid.x = nslow + 1; - grid.y = nrimy; - threads.x = nrimx; - initfftdata_xy_kernel <<< grid, threads, 0>>>(in, out); - cudaThreadSynchronize(); - grid.x = nrimz; - grid.y = nrimy; - threads.x = nrimx; - initfftdata_xyz_kernel <<< grid, threads, 0>>>(in, out); - cudaThreadSynchronize(); - CUT_CHECK_ERROR("ERROR-CUDA initfftdata_kernel"); -} - - diff --git a/lib/cuda/pppm_cuda_cu.h b/lib/cuda/pppm_cuda_cu.h deleted file mode 100644 index b594715b7c..0000000000 --- a/lib/cuda/pppm_cuda_cu.h +++ /dev/null @@ -1,55 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - - Original Version: - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - See the README file in the top-level LAMMPS directory. 
diff --git a/lib/cuda/pppm_cuda_cu.h b/lib/cuda/pppm_cuda_cu.h
deleted file mode 100644
index b594715b7c..0000000000
--- a/lib/cuda/pppm_cuda_cu.h
+++ /dev/null
@@ -1,55 +0,0 @@
-/* ----------------------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-
-   Original Version:
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   See the README file in the top-level LAMMPS directory.
-
-   -----------------------------------------------------------------------
-
-   USER-CUDA Package and associated modifications:
-   https://sourceforge.net/projects/lammpscuda/
-
-   Christian Trott, christian.trott@tu-ilmenau.de
-   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
-   Theoretical Physics II, University of Technology Ilmenau, Germany
-
-   See the README file in the USER-CUDA directory.
-
-   This software is distributed under the GNU General Public License.
-
------------------------------------------------------------------------- */
-
-#ifndef PPPM_CUDA_CU_H_
-#define PPPM_CUDA_CU_H_
-
-extern "C" void pppm_device_init(void* cu_density_brick, void* cu_vdx_brick, void* cu_vdy_brick, void* cu_vdz_brick, void* cu_density_fft, void* cu_energy, void* cu_virial,
-                                 void* cu_work1, void* cu_work2, void* cu_work3, void* cu_greensfn, void* cu_fkx, void* cu_fky, void* cu_fkz, void* cu_vg,
-                                 int nxlo_in, int nxhi_in, int nylo_in, int nyhi_in, int nzlo_in, int nzhi_in, int nxlo_out, int nxhi_out, int nylo_out, int nyhi_out, int nzlo_out, int nzhi_out, int nx_pppm, int ny_pppm, int nz_pppm,
-                                 int cu_nxlo_fft, int cu_nxhi_fft, int cu_nylo_fft, int cu_nyhi_fft, int cu_nzlo_fft, int cu_nzhi_fft, void* cu_gf_b,
-                                 double cu_qqrd2e, int cu_order, void* cu_rho_coeff, void* cu_debugdata, void* cu_density_brick_lock, int slabflag);
-extern "C" void pppm_device_init_setup(cuda_shared_data* sdata, PPPM_FLOAT shiftone, PPPM_FLOAT delxinv, PPPM_FLOAT delyinv, PPPM_FLOAT delzinv, int nlower, int nupper);
-extern "C" void Cuda_PPPM_Setup_fkxyz_vg(int nx_pppma, int ny_pppma, int nz_pppma, PPPM_FLOAT unitkx, PPPM_FLOAT unitky, PPPM_FLOAT unitkz, PPPM_FLOAT g_ewald);
-extern "C" void Cuda_PPPM_setup_greensfn(int nx_pppma, int ny_pppma, int nz_pppma, PPPM_FLOAT unitkx, PPPM_FLOAT unitky, PPPM_FLOAT unitkz, PPPM_FLOAT g_ewald,
-                                         int nbx, int nby, int nbz, PPPM_FLOAT xprd, PPPM_FLOAT yprd, PPPM_FLOAT zprd_slab);
-
-extern "C" void pppm_device_update(cuda_shared_data* sdata, void* cu_part2grid, int nlocala, int nmaxa);
-extern "C" void pppm_update_nlocal(int nlocala);
-extern "C" void poisson_scale(int nx_pppm, int ny_pppm, int nz_pppm);
-extern "C" void poisson_xgrad(int nx_pppm, int ny_pppm, int nz_pppm);
-extern "C" void poisson_ygrad(int nx_pppm, int ny_pppm, int nz_pppm);
-extern "C" void poisson_zgrad(int nx_pppm, int ny_pppm, int nz_pppm);
-extern "C" void poisson_vdx_brick(int ihi, int ilo, int jhi, int jlo, int khi, int klo, int nx_pppm, int ny_pppm, int nz_pppm);
-extern "C" void poisson_vdy_brick(int ihi, int ilo, int jhi, int jlo, int khi, int klo, int nx_pppm, int ny_pppm, int nz_pppm);
-extern "C" void poisson_vdz_brick(int ihi, int ilo, int jhi, int jlo, int khi, int klo, int nx_pppm, int ny_pppm, int nz_pppm);
-extern "C" void poisson_energy(int nxlo_fft, int nxhi_fft, int nylo_fft, int nyhi_fft, int nzlo_fft, int nzhi_fft, int vflag);
-extern "C" ENERGY_FLOAT sum_energy(void* cu_virial, void* cu_energy, int nx_pppma, int ny_pppma, int nz_pppma, int vflag, ENERGY_FLOAT* cpu_virial);
-extern "C" int cuda_particle_map(cuda_shared_data* sdata, void* flag);
-extern "C" void cuda_make_rho(cuda_shared_data* sdata, void* flag, PPPM_FLOAT* cu_density_intScale, int ihi, int ilo, int jhi, int jlo, int khi, int klo, void* cu_density_brick, void* cu_density_brick_int);
-extern "C" void cuda_fieldforce(cuda_shared_data* sdata, void* flag);
-extern "C" double cuda_slabcorr_energy(cuda_shared_data* sdata, ENERGY_FLOAT* buf, ENERGY_FLOAT* dev_buf);
-extern "C" void cuda_slabcorr_force(cuda_shared_data* sdata, F_FLOAT ffact);
-extern "C" void pppm_initfftdata(cuda_shared_data* sdata, PPPM_FLOAT* in, FFT_FLOAT* out);
-#endif /*PPPM_CUDA_CU_H_*/
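Taken together, the declarations above are driven once per force evaluation in the order map, deposit, Poisson solve, interpolate. A compile-anywhere sketch of that sequence using local stubs rather than the real extern "C" entry points:

    #include <cstdio>

    // Local stand-ins for the library calls, so the ordering runs without a GPU.
    static int  cuda_particle_map_stub() { std::printf("1. map charges to grid points\n"); return 0; }
    static void cuda_make_rho_stub()     { std::printf("2. deposit charge density (make_rho)\n"); }
    static void poisson_stub()           { std::printf("3. FFT, poisson_scale/x/y/zgrad, inverse FFTs, vd*_brick\n"); }
    static void cuda_fieldforce_stub()   { std::printf("4. interpolate E-field back to particles (fieldforce)\n"); }

    int main() {
      // One PPPM force evaluation, as PPPM_Cuda drives it through this header:
      if (cuda_particle_map_stub() != 0) return 1;  // nonzero flag = stencil outside brick
      cuda_make_rho_stub();
      poisson_stub();
      cuda_fieldforce_stub();
      return 0;
    }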
diff --git a/lib/cuda/pppm_cuda_kernel.cu b/lib/cuda/pppm_cuda_kernel.cu
deleted file mode 100644
index 808c98fe39..0000000000
--- a/lib/cuda/pppm_cuda_kernel.cu
+++ /dev/null
@@ -1,816 +0,0 @@
-/* ----------------------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-
-   Original Version:
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   See the README file in the top-level LAMMPS directory.
-
-   -----------------------------------------------------------------------
-
-   USER-CUDA Package and associated modifications:
-   https://sourceforge.net/projects/lammpscuda/
-
-   Christian Trott, christian.trott@tu-ilmenau.de
-   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
-   Theoretical Physics II, University of Technology Ilmenau, Germany
-
-   See the README file in the USER-CUDA directory.
-
-   This software is distributed under the GNU General Public License.
-
------------------------------------------------------------------------- */
-
-#define OFFSET 4096
-__device__ int negativCUDA(float f)
-{
-  return ((unsigned int)1 << 31 & (__float_as_int(f))) >> 31;
-}
-
-__device__ void reduceBlock(float* data)
-{
-  int p2 = 1;
-
-  while(p2 * 2 < blockDim.x) p2 *= 2;
-
-  if(threadIdx.x < blockDim.x - p2)
-    data[threadIdx.x] += data[threadIdx.x + p2];
-
-  __syncthreads();
-
-  for(int i = 2; p2 / i > 0; i *= 2) {
-    if(threadIdx.x < p2 / i)
-      data[threadIdx.x] += data[threadIdx.x + p2 / i];
-
-    __syncthreads();
-  }
-}
-
-__device__ void reduceBlock(double* data)
-{
-  int p2 = 1;
-
-  while(p2 * 2 < blockDim.x) p2 *= 2;
-
-  if(threadIdx.x < blockDim.x - p2)
-    data[threadIdx.x] += data[threadIdx.x + p2];
-
-  __syncthreads();
-
-  for(int i = 2; p2 / i > 0; i *= 2) {
-    if(threadIdx.x < p2 / i)
-      data[threadIdx.x] += data[threadIdx.x + p2 / i];
-
-    __syncthreads();
-  }
-}
-
-extern __shared__ PPPM_FLOAT sharedmem[];
-
-__global__ void setup_fkxyz_vg(PPPM_FLOAT unitkx, PPPM_FLOAT unitky, PPPM_FLOAT unitkz, PPPM_FLOAT g_ewald)
-{
-  PPPM_FLOAT my_fkx = unitkx * (int(threadIdx.x) - nx_pppm * (2 * int(threadIdx.x) / nx_pppm));
-  PPPM_FLOAT my_fky = unitky * (int(blockIdx.y) - ny_pppm * (2 * int(blockIdx.y) / ny_pppm));
-  PPPM_FLOAT my_fkz = unitkz * (int(blockIdx.x) - nz_pppm * (2 * int(blockIdx.x) / nz_pppm));
-
-  if((blockIdx.x >= nzlo_fft) && (blockIdx.x <= nzhi_fft) &&
-     (blockIdx.y >= nylo_fft) && (blockIdx.y <= nyhi_fft) &&
-     (threadIdx.x >= nxlo_fft) && (threadIdx.x <= nxhi_fft)) {
-    int n = ((int(blockIdx.x) - nzlo_fft) * (nyhi_fft - nylo_fft + 1) + int(blockIdx.y) - nylo_fft) * (nxhi_fft - nxlo_fft + 1) + int(threadIdx.x) - nxlo_fft;
-    PPPM_FLOAT sqk = my_fkx * my_fkx + my_fky * my_fky + my_fkz * my_fkz;
-    PPPM_FLOAT vterm = (sqk == PPPM_F(0.0)) ? PPPM_F(0.0) : PPPM_F(-2.0) * (PPPM_F(1.0) / sqk + PPPM_F(0.25) / (g_ewald * g_ewald));
-    vg[6 * n + 0] = (sqk == PPPM_F(0.0)) ? PPPM_F(0.0) : PPPM_F(1.0) + vterm * my_fkx * my_fkx;
-    vg[6 * n + 1] = (sqk == PPPM_F(0.0)) ? PPPM_F(0.0) : PPPM_F(1.0) + vterm * my_fky * my_fky;
-    vg[6 * n + 2] = (sqk == PPPM_F(0.0)) ? PPPM_F(0.0) : PPPM_F(1.0) + vterm * my_fkz * my_fkz;
-    vg[6 * n + 3] = (sqk == PPPM_F(0.0)) ? PPPM_F(0.0) : vterm * my_fkx * my_fky;
-    vg[6 * n + 4] = (sqk == PPPM_F(0.0)) ? PPPM_F(0.0) : vterm * my_fkx * my_fkz;
-    vg[6 * n + 5] = (sqk == PPPM_F(0.0)) ? PPPM_F(0.0) : vterm * my_fky * my_fkz;
-  }
-}
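reduceBlock above handles block sizes that are not powers of two by folding the tail in once before the usual halving tree. A self-contained CUDA check of the same pattern (a rewritten reduce_block for illustration, not the original code), deliberately launched with 48 threads:

    #include <cstdio>

    __device__ void reduce_block(float* data)
    {
      int p2 = 1;
      while (p2 * 2 < blockDim.x) p2 *= 2;      // largest power of two below blockDim.x

      if (threadIdx.x < blockDim.x - p2)        // fold the non-power-of-two tail
        data[threadIdx.x] += data[threadIdx.x + p2];
      __syncthreads();

      for (int i = 2; p2 / i > 0; i *= 2) {     // standard halving tree
        if (threadIdx.x < p2 / i)
          data[threadIdx.x] += data[threadIdx.x + p2 / i];
        __syncthreads();
      }
    }

    __global__ void sum_kernel(float* out)
    {
      extern __shared__ float s[];
      s[threadIdx.x] = 1.0f;                    // every thread contributes 1
      __syncthreads();
      reduce_block(s);
      if (threadIdx.x == 0) *out = s[0];        // expect blockDim.x
    }

    int main()
    {
      float* d; cudaMalloc(&d, sizeof(float));
      sum_kernel<<<1, 48, 48 * sizeof(float)>>>(d);
      float h; cudaMemcpy(&h, d, sizeof(float), cudaMemcpyDeviceToHost);
      std::printf("sum = %f (expected 48)\n", h);
      cudaFree(d);
      return 0;
    }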
-
-__device__ PPPM_FLOAT gf_denom(PPPM_FLOAT x, PPPM_FLOAT y, PPPM_FLOAT z)
-{
-  PPPM_FLOAT sx, sy, sz;
-  sz = sy = sx = PPPM_F(0.0);
-
-  for(int l = order - 1; l >= 0; l--) {
-    sx = gf_b[l] + sx * x;
-    sy = gf_b[l] + sy * y;
-    sz = gf_b[l] + sz * z;
-  }
-
-  PPPM_FLOAT s = sx * sy * sz;
-  return s * s;
-}
-
-__global__ void setup_greensfn(PPPM_FLOAT unitkx, PPPM_FLOAT unitky, PPPM_FLOAT unitkz, PPPM_FLOAT g_ewald,
-                               int nbx, int nby, int nbz,
-                               PPPM_FLOAT xprd, PPPM_FLOAT yprd, PPPM_FLOAT zprd_slab)
-{
-  PPPM_FLOAT sqk;
-  int nx, ny, nz, kper, lper, mper, k, l, m;
-  PPPM_FLOAT snx, sny, snz, snx2, sny2, snz2;
-  PPPM_FLOAT argx, argy, argz, wx, wy, wz, sx, sy, sz, qx, qy, qz;
-  PPPM_FLOAT sum1, dot1, dot2;
-  PPPM_FLOAT numerator, denominator;
-
-  PPPM_FLOAT form = PPPM_F(1.0);
-  int n = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
-  m = blockIdx.x;
-  l = blockIdx.y;
-  k = threadIdx.x;
-
-  mper = m - nz_pppm * (2 * m / nz_pppm);
-  snz = sin(PPPM_F(0.5) * unitkz * mper * zprd_slab / nz_pppm);
-  snz2 = snz * snz;
-
-  lper = l - ny_pppm * (2 * l / ny_pppm);
-  sny = sin(PPPM_F(0.5) * unitky * lper * yprd / ny_pppm);
-  sny2 = sny * sny;
-
-  kper = k - nx_pppm * (2 * k / nx_pppm);
-  snx = sin(PPPM_F(0.5) * unitkx * kper * xprd / nx_pppm);
-  snx2 = snx * snx;
-
-  sqk = pow(unitkx * kper, PPPM_F(2.0)) + pow(unitky * lper, PPPM_F(2.0)) +
-        pow(unitkz * mper, PPPM_F(2.0));
-
-  if(sqk != PPPM_F(0.0)) {
-    numerator = form * PPPM_F(12.5663706) / sqk;
-    denominator = gf_denom(snx2, sny2, snz2);
-    sum1 = PPPM_F(0.0);
-
-    for(nx = -nbx; nx <= nbx; nx++) {
-      qx = unitkx * (kper + nx_pppm * nx);
-      sx = exp(PPPM_F(-.25) * pow(qx / g_ewald, PPPM_F(2.0)));
-      wx = PPPM_F(1.0);
-      argx = PPPM_F(0.5) * qx * xprd / nx_pppm;
-
-      if(argx != PPPM_F(0.0)) wx = pow(sin(argx) / argx, order);
-
-      for(ny = -nby; ny <= nby; ny++) {
-        qy = unitky * (lper + ny_pppm * ny);
-        sy = exp(PPPM_F(-.25) * pow(qy / g_ewald, PPPM_F(2.0)));
-        wy = PPPM_F(1.0);
-        argy = PPPM_F(0.5) * qy * yprd / ny_pppm;
-
-        if(argy != PPPM_F(0.0)) wy = pow(sin(argy) / argy, order);
-
-        for(nz = -nbz; nz <= nbz; nz++) {
-          qz = unitkz * (mper + nz_pppm * nz);
-          sz = exp(PPPM_F(-.25) * pow(qz / g_ewald, PPPM_F(2.0)));
-          wz = PPPM_F(1.0);
-          argz = PPPM_F(0.5) * qz * zprd_slab / nz_pppm;
-
-          if(argz != PPPM_F(0.0)) wz = pow(sin(argz) / argz, order);
-
-          dot1 = unitkx * kper * qx + unitky * lper * qy + unitkz * mper * qz;
-          dot2 = qx * qx + qy * qy + qz * qz;
-          sum1 += (dot1 / dot2) * sx * sy * sz * pow(wx * wy * wz, PPPM_F(2.0));
-        }
-      }
-    }
-
-    greensfn[n] = numerator * sum1 / denominator;
-  } else greensfn[n] = PPPM_F(0.0);
-}
-
-__global__ void poisson_scale_kernel()
-{
-  int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
-  FFT_FLOAT scaleinv = FFT_F(1.0) / (gridDim.x * gridDim.y * blockDim.x);
-  work1[2 * i] *= scaleinv * greensfn[i];
-  work1[2 * i + 1] *= scaleinv * greensfn[i];
-}
-
-__global__ void poisson_xgrad_kernel()
-{
-  int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
-  work2[2 * i] = fkx[threadIdx.x] * work1[2 * i + 1];
-  work2[2 * i + 1] = -fkx[threadIdx.x] * work1[2 * i];
-}
-
-__global__ void poisson_ygrad_kernel()
-{
-  int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
-  work2[2 * i] = fky[blockIdx.y] * work1[2 * i + 1];
-  work2[2 * i + 1] = -fky[blockIdx.y] * work1[2 * i];
-}
-
-__global__ void poisson_zgrad_kernel()
-{
-  int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
-  work2[2 * i] = fkz[blockIdx.x] * work1[2 * i + 1];
-  work2[2 * i + 1] = -fkz[blockIdx.x] * work1[2 * i];
-}
-
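The poisson_*grad_kernels above implement ik-differentiation: multiplying the transformed potential by -i*k_x (or k_y, k_z) yields that field component in k-space, which is why the real and imaginary parts swap with a sign change. A two-line host sketch with one assumed k-space value:

    #include <cstdio>

    int main() {
      double re = 0.3, im = -1.2;  // one k-space value of the potential: work1[2i], work1[2i+1]
      double k = 0.7;              // the fkx/fky/fkz entry for this grid line

      double grad_re = k * im;     // work2[2i]   =  fk * work1[2i+1]
      double grad_im = -k * re;    // work2[2i+1] = -fk * work1[2i]

      std::printf("(-i*k)*phi = (%g, %g)\n", grad_re, grad_im);
      return 0;
    }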
-__global__ void poisson_vdx_brick_kernel(int ilo, int jlo, int klo)
-{
-  int k = blockIdx.x + klo;
-  k += nz_pppm * negativCUDA(CUDA_F(1.0) * k) - nz_pppm * negativCUDA(CUDA_F(1.0) * (nz_pppm - k - 1));
-  int j = blockIdx.y + jlo;
-  j += ny_pppm * negativCUDA(CUDA_F(1.0) * j) - ny_pppm * negativCUDA(CUDA_F(1.0) * (ny_pppm - j - 1));
-  int i = threadIdx.x + ilo;
-  i += nx_pppm * negativCUDA(CUDA_F(1.0) * i) - nx_pppm * negativCUDA(CUDA_F(1.0) * (nx_pppm - i - 1));
-  vdx_brick[((blockIdx.x) * (nyhi_out - nylo_out + 1) + blockIdx.y) * (nxhi_out - nxlo_out + 1) + threadIdx.x] = work3[2 * (((k) * ny_pppm + (j)) * nx_pppm + i)];
-}
-
-__global__ void poisson_vdy_brick_kernel(int ilo, int jlo, int klo)
-{
-  int k = blockIdx.x + klo;
-  k += nz_pppm * negativCUDA(CUDA_F(1.0) * k) - nz_pppm * negativCUDA(CUDA_F(1.0) * (nz_pppm - k - 1));
-  int j = blockIdx.y + jlo;
-  j += ny_pppm * negativCUDA(CUDA_F(1.0) * j) - ny_pppm * negativCUDA(CUDA_F(1.0) * (ny_pppm - j - 1));
-  int i = threadIdx.x + ilo;
-  i += nx_pppm * negativCUDA(CUDA_F(1.0) * i) - nx_pppm * negativCUDA(CUDA_F(1.0) * (nx_pppm - i - 1));
-  vdy_brick[((blockIdx.x) * (nyhi_out - nylo_out + 1) + blockIdx.y) * (nxhi_out - nxlo_out + 1) + threadIdx.x] = work3[2 * (((k) * ny_pppm + (j)) * nx_pppm + i)];
-}
-
-__global__ void poisson_vdz_brick_kernel(int ilo, int jlo, int klo)
-{
-  int k = blockIdx.x + klo;
-  k += nz_pppm * negativCUDA(CUDA_F(1.0) * k) - nz_pppm * negativCUDA(CUDA_F(1.0) * (nz_pppm - k - 1));
-  int j = blockIdx.y + jlo;
-  j += ny_pppm * negativCUDA(CUDA_F(1.0) * j) - ny_pppm * negativCUDA(CUDA_F(1.0) * (ny_pppm - j - 1));
-  int i = threadIdx.x + ilo;
-  i += nx_pppm * negativCUDA(CUDA_F(1.0) * i) - nx_pppm * negativCUDA(CUDA_F(1.0) * (nx_pppm - i - 1));
-  vdz_brick[((blockIdx.x) * (nyhi_out - nylo_out + 1) + blockIdx.y) * (nxhi_out - nxlo_out + 1) + threadIdx.x] = work3[2 * (((k) * ny_pppm + (j)) * nx_pppm + i)];
-}
-
-__global__ void poisson_energy_kernel(int nxlo_fft, int nylo_fft, int nzlo_fft, int vflag)
-{
-  ENERGY_FLOAT scaleinv = FFT_F(1.0) / (nx_pppm * ny_pppm * nz_pppm);
-  int i = (blockIdx.x + nzlo_fft) * ny_pppm * nx_pppm + (blockIdx.y + nylo_fft) * nx_pppm + threadIdx.x + nxlo_fft;
-  ENERGY_FLOAT* s_energy = (ENERGY_FLOAT*) sharedmem;
-  ENERGY_FLOAT myenergy = scaleinv * scaleinv * greensfn[i] * (work1[2 * i] * work1[2 * i] + work1[2 * i + 1] * work1[2 * i + 1]);
-  s_energy[threadIdx.x] = myenergy;
-
-  __syncthreads();
-  reduceBlock(s_energy);
-
-  if(threadIdx.x == 0)
-    energy[blockIdx.x * ny_pppm + blockIdx.y] = s_energy[0];
-
-  if(vflag) {
-    __syncthreads();
-
-    for(int j = 0; j < 6; j++) {
-      s_energy[threadIdx.x] = myenergy * vg[((blockIdx.x * gridDim.y + blockIdx.y) * (blockDim.x) + threadIdx.x) * 6 + j];
-      __syncthreads();
-      reduceBlock(s_energy);
-
-      if(threadIdx.x == 0)
-        virial[blockIdx.x * ny_pppm + blockIdx.y + j * nz_pppm * ny_pppm] = s_energy[0];
-    }
-  }
-}
-
-__global__ void sum_energy_kernel1(int vflag)
-{
-  ENERGY_FLOAT myenergy = energy[(blockIdx.x * ny_pppm + threadIdx.x)];
-  ENERGY_FLOAT* s_energy = (ENERGY_FLOAT*) sharedmem;
-  s_energy[threadIdx.x] = myenergy;
-  __syncthreads();
-  reduceBlock(s_energy);
-
-  if(threadIdx.x == 0)
-    energy[blockIdx.x * ny_pppm] = s_energy[0];
-
-  if(vflag) {
-    __syncthreads();
-
-    for(int j = 0; j < 6; j++) {
-      myenergy = virial[blockIdx.x * ny_pppm + threadIdx.x + j * ny_pppm * nz_pppm];
-      s_energy[threadIdx.x] = myenergy;
-      __syncthreads();
-      reduceBlock(s_energy);
-
-      if(threadIdx.x == 0)
-        virial[blockIdx.x * ny_pppm + j * ny_pppm * nz_pppm] = s_energy[0];
-    }
-  }
-}
-
-__global__ void sum_energy_kernel2(int vflag)
-{
-  ENERGY_FLOAT myenergy = energy[threadIdx.x * ny_pppm];
-  ENERGY_FLOAT* s_energy = (ENERGY_FLOAT*) sharedmem;
-  s_energy[threadIdx.x] = myenergy;
-  __syncthreads();
-  reduceBlock(s_energy);
-
-  if(threadIdx.x == 0)
-    energy[0] = s_energy[0];
-
-  if(vflag) {
-    __syncthreads();
-
-    for(int j = 0; j < 6; j++) {
-      myenergy = virial[threadIdx.x * ny_pppm + j * ny_pppm * nz_pppm];
-      s_energy[threadIdx.x] = myenergy;
-      __syncthreads();
-      reduceBlock(s_energy);
-
-      if(threadIdx.x == 0)
-        virial[j] = s_energy[0];
-    }
-  }
-}
-
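poisson_energy_kernel, sum_energy_kernel1, and sum_energy_kernel2 above collapse the nz*ny*nx grid of per-cell energies one dimension at a time: first x within a block, then y, then z. The same staging with plain host loops and toy extents:

    #include <cstdio>
    #include <vector>

    int main() {
      const int nz = 4, ny = 3, nx = 5;
      std::vector<double> cell(nz * ny * nx, 1.0);     // per-cell energies

      std::vector<double> zy(nz * ny, 0.0);            // stage 1: reduce x (poisson_energy_kernel)
      for (int z = 0; z < nz; z++)
        for (int y = 0; y < ny; y++)
          for (int x = 0; x < nx; x++)
            zy[z * ny + y] += cell[(z * ny + y) * nx + x];

      std::vector<double> zcol(nz, 0.0);               // stage 2: reduce y (sum_energy_kernel1)
      for (int z = 0; z < nz; z++)
        for (int y = 0; y < ny; y++)
          zcol[z] += zy[z * ny + y];

      double total = 0.0;                              // stage 3: reduce z (sum_energy_kernel2)
      for (int z = 0; z < nz; z++) total += zcol[z];

      std::printf("total = %g (expected %d)\n", total, nz * ny * nx);
      return 0;
    }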
-__device__ PPPM_FLOAT rho1d(int k, PPPM_FLOAT d, PPPM_FLOAT* srho_coeff)
-{
-  PPPM_FLOAT rho1d_tmp = PPPM_F(0.0);
-
-  for(int l = order - 1; l >= 0; l--)
-    rho1d_tmp = srho_coeff[l * order + k - (1 - order) / 2] + rho1d_tmp * d;
-
-  return rho1d_tmp;
-}
-
-__global__ void particle_map_kernel(int* flag)
-{
-  int i = blockIdx.x * blockDim.x + threadIdx.x;
-
-  if(i < nlocal) {
-    // (nx,ny,nz) = global coords of grid pt to "lower left" of charge
-    int nx = (int)((_x[i] - _boxlo[0]) * delxinv + shiftone);
-    int ny = (int)((_x[i + _nmax] - _boxlo[1]) * delyinv + shiftone);
-    int nz = (int)((_x[i + 2 * _nmax] - _boxlo[2]) * delzinv + shiftone);
-
-    part2grid[i] = nx;
-    part2grid[i + nmax] = ny;
-    part2grid[i + 2 * nmax] = nz;
-
-    // check that entire stencil around nx,ny,nz will fit in my 3d brick
-    if(nx + nlower < nxlo_out || nx + nupper > nxhi_out ||
-       ny + nlower < nylo_out || ny + nupper > nyhi_out ||
-       nz + nlower < nzlo_out || nz + nupper > nzhi_out) {
-      flag[0]++;
-      debugdata[0] = i;
-      debugdata[1] = _boxlo[0];
-      debugdata[2] = _boxlo[1];
-      debugdata[3] = _boxlo[2];
-      debugdata[4] = nx;
-      debugdata[5] = ny;
-      debugdata[6] = nz;
-      debugdata[7] = _x[i];
-      debugdata[8] = _x[i + _nmax];
-      debugdata[9] = _x[i + 2 * _nmax];
-      debugdata[10] = nlocal;
-    }
-  }
-}
-
-__global__ void make_rho_kernelA()
-{
-  int i, l, m, n, nx, ny, nz, mx, my, mz;
-
-  // clear 3d density array
-
-  // loop over my charges, add their contribution to nearby grid points
-  // (nx,ny,nz) = global coords of grid pt to "lower left" of charge
-  // (dx,dy,dz) = distance to "lower left" grid pt
-  // (mx,my,mz) = global coords of moving stencil pt
-
-  i = blockIdx.x * blockDim.x + threadIdx.x;
-
-  if(i < nlocal) {
-    PPPM_FLOAT dx, dy, dz, x0, y0, z0;
-    nx = part2grid[i];
-    ny = part2grid[i + nmax];
-    nz = part2grid[i + 2 * nmax];
-    dx = nx + shiftone - (_x[i] - _boxlo[0]) * delxinv;
-    dy = ny + shiftone - (_x[i + nmax] - _boxlo[1]) * delyinv;
-    dz = nz + shiftone - (_x[i + 2 * nmax] - _boxlo[2]) * delzinv;
-
-    z0 = delxinv * delyinv * delzinv * _q[i];
-
-    for(n = nlower; n <= nupper; n++) {
-      mz = n + nz;
-      y0 = z0 * rho1d(n, dz, rho_coeff);
-
-      for(m = nlower; m <= nupper; m++) {
-        my = m + ny;
-        x0 = y0 * rho1d(m, dy, rho_coeff);
-
-        for(l = nlower; l <= nupper; l++) {
-          mx = l + nx;
-          int mzyx = ((mz - nzlo_out) * (nyhi_out - nylo_out + 1) + my - nylo_out) * (nxhi_out - nxlo_out + 1) + mx - nxlo_out;
-
-          while(atomicAdd(&density_brick_int[mzyx], 1) != 0) atomicAdd(&density_brick_int[mzyx], -1);
-
-          density_brick[mzyx] += x0 * rho1d(l, dx, rho_coeff);
-          __threadfence();
-          atomicAdd(&density_brick_int[mzyx], -1);
-          __syncthreads();
-        }
-      }
-    }
-  }
-}
-
-__global__ void make_rho_kernel(int* flag, int read_threads_at_same_time)
-{
-  int i, l, m, n, nx, ny, nz, mz, my, a, b;
-  PPPM_FLOAT dx, dy, dz, x0, y0, z0;
-
-  // loop over my charges, add their contribution to nearby grid points
-  // (nx,ny,nz) = global coords of grid pt to "lower left" of charge
-  // (dx,dy,dz) = distance to "lower left" grid pt
-  // (mx,my,mz) = global coords of moving stencil pt
-  // int nzxy = blockIdx.x * gridDim.y + blockIdx.y;
-
-  int nelements = nupper - nlower + 1;
-  int* idx = (int*) sharedmem;
-  int* sdensity_brick_int = &idx[blockDim.x];
-  PPPM_FLOAT* srho_coeff = (PPPM_FLOAT*) &sdensity_brick_int[nelements * blockDim.x];
-
-  if(threadIdx.x < order * (order / 2 - (1 - order) / 2 + 1))
-    srho_coeff[threadIdx.x] = rho_coeff[threadIdx.x];
-
-  __syncthreads();
-
-  i = blockIdx.x * blockDim.x + threadIdx.x;
-
-  if(i < nlocal) {
-    nx = part2grid[i];
-    ny = part2grid[i + nmax];
-    nz = part2grid[i + 2 * nmax];
-    dx = nx + shiftone - (_x[i] - _boxlo[0]) * delxinv;
-    dy = ny + shiftone - (_x[i + nmax] - _boxlo[1]) * delyinv;
-    dz = nz + shiftone - (_x[i + 2 * nmax] - _boxlo[2]) * delzinv;
-    z0 = delxinv * delyinv * delzinv * _q[i];
-  }
-
-  for(n = nlower; n <= nupper; n++) {
-    for(m = nlower; m <= nupper; m++) {
-      // each thread converts one x-line of its stencil into scaled integers
-      // in shared memory; the block then flushes those lines cooperatively
-      if(i < nlocal) {
-        mz = n + nz;
-        my = m + ny;
-        y0 = z0 * rho1d(n, dz, srho_coeff);
-        x0 = y0 * rho1d(m, dy, srho_coeff);
-        idx[threadIdx.x] = ((mz - nzlo_out) * (nyhi_out - nylo_out + 1) + my - nylo_out) * (nxhi_out - nxlo_out + 1) + nx + nlower - nxlo_out;
-
-        for(l = nlower; l <= nupper; l++)
-          sdensity_brick_int[threadIdx.x * nelements + l - nlower] = (int)(x0 * rho1d(l, dx, srho_coeff) * density_intScale);
-      } else idx[threadIdx.x] = -1;
-
-      __syncthreads();
-
-      for(int ii = 0; ii < blockDim.x; ii += read_threads_at_same_time) {
-        int kk = threadIdx.x / nelements;
-
-        if((ii + kk < blockDim.x) && (idx[ii + kk] > -1)) {
-          a = sdensity_brick_int[ii * nelements + threadIdx.x];
-          //if(a*a>1e-100)
-          b = (atomicAdd(&density_brick_int[idx[ii + kk] + threadIdx.x - kk * nelements], a) | a);
-          //else
-          //b=(density_brick_int[idx[ii+kk]+threadIdx.x-kk*nelements]|a);
-
-          if(((b) & (0x7c000000)) && (not((b) & (0x80000000)))) {
-            flag[1]++;
-
-            if((b) & (0x60000000)) flag[0]++;
-          }
-        }
-      }
-
-      __syncthreads(); //*/
-    }
-  }
-
-}
-
-__global__ void scale_rho_kernel()
-{
-  int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
-  density_brick[i] = (1.0 / density_intScale) * density_brick_int[i];
-}
-
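make_rho_kernel above deposits charge as integers scaled by density_intScale so that atomicAdd stays cheap and deterministic, with the high bits of the running sum doubling as an overflow flag; scale_rho_kernel then divides the scale back out. A host-side sketch of that round trip, with an assumed scale and contributions:

    #include <cstdio>

    int main() {
      double intScale = 1.0e6;          // plays the role of density_intScale
      double contrib[] = {0.25, 0.5, 0.125};

      int acc = 0;                      // one density_brick_int cell
      for (double c : contrib) {
        acc += (int)(c * intScale);     // the atomicAdd in the kernel
        if (acc & 0x7c000000)           // high bits set: nearing 2^31, flag and rescale
          std::printf("overflow risk, would set flag[1]\n");
      }

      double density = acc / intScale;  // scale_rho_kernel: (1/density_intScale) * int value
      std::printf("recovered density = %g (expected 0.875)\n", density);
      return 0;
    }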
-__global__ void fieldforce_kernel(int elements_per_thread, int read_threads_at_same_time, int* flag) //20*x64 0.36
-{
-  int i;
-
-  // loop over my charges, interpolate electric field from nearby grid points
-  // (nx,ny,nz) = global coords of grid pt to "lower left" of charge
-  // (dx,dy,dz) = distance to "lower left" grid pt
-  // (mx,my,mz) = global coords of moving stencil pt
-  // ek = 3 components of E-field on particle
-  i = blockIdx.x * blockDim.x + threadIdx.x;
-  int* idx = (int*) sharedmem;
-  PPPM_FLOAT* tmp_brick = (PPPM_FLOAT*) &idx[blockDim.x];
-  PPPM_FLOAT* srho_coeff = (PPPM_FLOAT*) &tmp_brick[3 * blockDim.x * elements_per_thread];
-
-  if(threadIdx.x < order * (order / 2 - (1 - order) / 2 + 1))
-    srho_coeff[threadIdx.x] = rho_coeff[threadIdx.x];
-
-  __syncthreads();
-
-  int m, n, nx, ny, nz, mz, my;
-  PPPM_FLOAT dx, dy, dz;
-
-  if(i < nlocal) {
-    nx = part2grid[i];
-    ny = part2grid[i + nmax];
-    nz = part2grid[i + 2 * nmax];
-    dx = nx + shiftone - (_x[i] - _boxlo[0]) * delxinv;
-    dy = ny + shiftone - (_x[i + nmax] - _boxlo[1]) * delyinv;
-    dz = nz + shiftone - (_x[i + 2 * nmax] - _boxlo[2]) * delzinv;
-  }
-
-  for(n = nlower; n <= nupper; n++) {
-    for(m = nlower; m <= nupper; m++) {
-      // each thread publishes the base index of its stencil x-line; the
-      // block then loads those lines of vdx/vdy/vdz cooperatively
-      if(i < nlocal) {
-        mz = n + nz;
-        my = m + ny;
-        idx[threadIdx.x] = ((mz - nzlo_out) * (nyhi_out - nylo_out + 1) + my - nylo_out) * (nxhi_out - nxlo_out + 1) + nx + nlower - nxlo_out;
-      } else idx[threadIdx.x] = -1;
-
-      __syncthreads();
-
-      for(int ii = 0; ii < blockDim.x; ii += read_threads_at_same_time) {
-        int kk = threadIdx.x / elements_per_thread;
-
-        if((ii + kk < blockDim.x) && (idx[ii + kk] > -1)) {
-          tmp_brick[ii * elements_per_thread + threadIdx.x] = vdx_brick[idx[ii + kk] + threadIdx.x - kk * elements_per_thread];
-          tmp_brick[(ii + blockDim.x) * elements_per_thread + threadIdx.x] = vdy_brick[idx[ii + kk] + threadIdx.x - kk * elements_per_thread];
-          tmp_brick[(ii + 2 * blockDim.x) * elements_per_thread + threadIdx.x] = vdz_brick[idx[ii + kk] + threadIdx.x - kk * elements_per_thread];
-        }
-      }
-
-      __syncthreads();
-
-      if(i