From 27b1805916abf6d8697afe56d278d8dfbc54decd Mon Sep 17 00:00:00 2001 From: sjplimp Date: Mon, 10 Aug 2009 20:28:57 +0000 Subject: [PATCH] git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@3042 f3b2605a-c512-4ea7-a41b-209d697bcdaa --- lib/gpu/Makefile | 75 --- lib/gpu/README | 94 ---- lib/gpu/gb_gpu.cu | 518 -------------------- lib/gpu/gb_gpu_extra.h | 337 ------------- lib/gpu/gb_gpu_kernel.h | 861 ---------------------------------- lib/gpu/gb_gpu_memory.cu | 146 ------ lib/gpu/gb_gpu_memory.h | 74 --- lib/gpu/lj_gpu.cu | 234 --------- lib/gpu/lj_gpu_kernel.h | 248 ---------- lib/gpu/lj_gpu_memory.cu | 167 ------- lib/gpu/lj_gpu_memory.h | 88 ---- lib/gpu/nvc_device.cu | 96 ---- lib/gpu/nvc_device.h | 92 ---- lib/gpu/nvc_get_devices.cu | 31 -- lib/gpu/nvc_macros.h | 121 ----- lib/gpu/nvc_memory.h | 447 ------------------ lib/gpu/nvc_timer.h | 83 ---- lib/gpu/pair_gpu_atom.cu | 173 ------- lib/gpu/pair_gpu_atom.h | 151 ------ lib/gpu/pair_gpu_nbor.cu | 80 ---- lib/gpu/pair_gpu_nbor.h | 91 ---- lib/gpu/pair_gpu_texture.h | 292 ------------ src/GPU/Install.csh | 34 -- src/GPU/pair_gayberne_gpu.cpp | 369 --------------- src/GPU/pair_gayberne_gpu.h | 43 -- src/GPU/pair_lj_cut_gpu.cpp | 270 ----------- src/GPU/pair_lj_cut_gpu.h | 40 -- src/GPU/style_gpu.h | 41 -- 28 files changed, 5296 deletions(-) delete mode 100644 lib/gpu/Makefile delete mode 100644 lib/gpu/README delete mode 100644 lib/gpu/gb_gpu.cu delete mode 100644 lib/gpu/gb_gpu_extra.h delete mode 100644 lib/gpu/gb_gpu_kernel.h delete mode 100644 lib/gpu/gb_gpu_memory.cu delete mode 100644 lib/gpu/gb_gpu_memory.h delete mode 100644 lib/gpu/lj_gpu.cu delete mode 100644 lib/gpu/lj_gpu_kernel.h delete mode 100644 lib/gpu/lj_gpu_memory.cu delete mode 100644 lib/gpu/lj_gpu_memory.h delete mode 100644 lib/gpu/nvc_device.cu delete mode 100644 lib/gpu/nvc_device.h delete mode 100644 lib/gpu/nvc_get_devices.cu delete mode 100644 lib/gpu/nvc_macros.h delete mode 100644 lib/gpu/nvc_memory.h delete mode 100644 lib/gpu/nvc_timer.h delete mode 100644 lib/gpu/pair_gpu_atom.cu delete mode 100644 lib/gpu/pair_gpu_atom.h delete mode 100644 lib/gpu/pair_gpu_nbor.cu delete mode 100644 lib/gpu/pair_gpu_nbor.h delete mode 100644 lib/gpu/pair_gpu_texture.h delete mode 100644 src/GPU/Install.csh delete mode 100644 src/GPU/pair_gayberne_gpu.cpp delete mode 100644 src/GPU/pair_gayberne_gpu.h delete mode 100644 src/GPU/pair_lj_cut_gpu.cpp delete mode 100644 src/GPU/pair_lj_cut_gpu.h delete mode 100644 src/GPU/style_gpu.h diff --git a/lib/gpu/Makefile b/lib/gpu/Makefile deleted file mode 100644 index 08e4982a24..0000000000 --- a/lib/gpu/Makefile +++ /dev/null @@ -1,75 +0,0 @@ -#*************************************************************************** -# Makefile -# ------------------- -# W. Michael Brown -# -# _________________________________________________________________________ -# Build for the LAMMPS GPU Force Library -# -# _________________________________________________________________________ -# -# begin : Tue June 23 2009 -# copyright : (C) 2009 by W. Michael Brown -# email : wmbrown@sandia.gov -# ***************************************************************************/ - -BIN_DIR = . -OBJ_DIR = . 
-AR = ar -CUDA_CPP = nvcc -I/usr/local/cuda/include -DUNIX -O3 -DDEBUG -Xptxas -v --use_fast_math -CUDA_ARCH = -maxrregcount 128 #-arch=sm_13 -CUDA_PREC = -D_SINGLE_SINGLE -CUDA_LINK = -L/usr/local/cuda/lib -lcudart $(CUDA_LIB) - -CUDA = $(CUDA_CPP) $(CUDA_ARCH) $(CUDA_PREC) - -CUDA_LIB = $(OBJ_DIR)/pair_gpu_lib.a - -# Headers for CUDA Stuff -NVC_H = nvc_macros.h nvc_device.h nvc_timer.h nvc_memory.h -# Headers for Pair Stuff -PAIR_H = pair_gpu_texture.h pair_gpu_atom.h pair_gpu_nbor.h - -ALL_H = $(NVC_H) $(PAIR_H) - -EXECS = $(BIN_DIR)/nvc_get_devices -OBJS = $(OBJ_DIR)/nvc_device.o $(OBJ_DIR)/gb_gpu.cu_o \ - $(OBJ_DIR)/gb_gpu_memory.cu_o $(OBJ_DIR)/lj_gpu.cu_o \ - $(OBJ_DIR)/lj_gpu_memory.cu_o $(OBJ_DIR)/pair_gpu_atom.cu_o \ - $(OBJ_DIR)/pair_gpu_nbor.cu_o - -all: $(CUDA_LIB) $(EXECS) - -$(OBJ_DIR)/nvc_device.o: nvc_device.cu $(NVC_H) - $(CUDA) -o $@ -c nvc_device.cu - -$(OBJ_DIR)/pair_gpu_atom.cu_o: pair_gpu_atom.cu pair_gpu_texture.h pair_gpu_atom.h $(NVC_H) - $(CUDA) -o $@ -c pair_gpu_atom.cu - -$(OBJ_DIR)/pair_gpu_nbor.cu_o: pair_gpu_nbor.cu pair_gpu_texture.h pair_gpu_nbor.h $(NVC_H) - $(CUDA) -o $@ -c pair_gpu_nbor.cu - -$(OBJ_DIR)/lj_gpu_memory.cu_o: lj_gpu_memory.cu lj_gpu_memory.h $(ALL_H) - $(CUDA) -o $@ -c lj_gpu_memory.cu - -$(OBJ_DIR)/lj_gpu.cu_o: lj_gpu.cu $(ALL_H) lj_gpu_memory.h lj_gpu_kernel.h - $(CUDA) -o $@ -c lj_gpu.cu - -$(OBJ_DIR)/gb_gpu_memory.cu_o: gb_gpu_memory.cu gb_gpu_memory.h $(ALL_H) - $(CUDA) -o $@ -c gb_gpu_memory.cu - -$(OBJ_DIR)/gb_gpu.cu_o: gb_gpu.cu $(ALL_H) gb_gpu_memory.h gb_gpu_kernel.h gb_gpu_extra.h - $(CUDA) -o $@ -c gb_gpu.cu - -$(BIN_DIR)/nvc_get_devices: nvc_get_devices.cu $(NVC_H) $(OBJ_DIR)/nvc_device.o - $(CUDA) -o $@ nvc_get_devices.cu $(CUDALNK) $(OBJ_DIR)/nvc_device.o - -$(CUDA_LIB): $(OBJS) - $(AR) -crusv $(CUDA_LIB) $(OBJS) - -clean: - rm -rf $(EXECS) $(CUDA_LIB) $(OBJS) *.linkinfo - -veryclean: clean - rm -rf *~ *.linkinfo - diff --git a/lib/gpu/README b/lib/gpu/README deleted file mode 100644 index 17f6e5457d..0000000000 --- a/lib/gpu/README +++ /dev/null @@ -1,94 +0,0 @@ -/*************************************************************************** - README - ------------------- - W. Michael Brown - - README for building LAMMPS GPU Library - - __________________________________________________________________________ - This file is part of the LAMMPS GPU Library - __________________________________________________________________________ - - begin : Thu Jun 25 2009 - copyright : (C) 2009 by W. Michael Brown - email : wmbrown@sandia.gov - ***************************************************************************/ - -/* ----------------------------------------------------------------------- - Copyright (2009) Sandia Corporation. Under the terms of Contract - DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains - certain rights in this software. This software is distributed under - the GNU General Public License. - ----------------------------------------------------------------------- */ - - - GENERAL NOTES - -This library, pair_gpu_lib.a, provides routines for GPGPU acceleration of -LAMMPS pair styles. Currently, only CUDA enabled GPUs are supported. -Compilation of this library requires installing the CUDA GPU driver and -CUDA toolkit for your operating system. In addition to the LAMMPS library, -the binary nvc_get_devices will also be built. This can be used to query -the names and properties of GPU devices on your system. - -NOTE: Installation of the CUDA SDK is not required. 
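For orientation, the kind of device query that nvc_get_devices performs can be sketched with the standard CUDA runtime API. The program below is an illustrative stand-alone sketch, not the library's actual nvc_get_devices.cu source; it reports the same sort of properties (name, processor count, memory, clock) that the library's name strings expose.

#include <cstdio>
#include <cuda_runtime.h>

int main() {
  int n = 0;
  // Count CUDA-capable devices; failure or zero means no usable GPU/driver.
  if (cudaGetDeviceCount(&n) != cudaSuccess || n == 0) {
    std::printf("No CUDA-enabled GPU found.\n");
    return 1;
  }
  for (int i = 0; i < n; ++i) {
    cudaDeviceProp p;
    cudaGetDeviceProperties(&p, i);
    // Memory is reported in GB; clockRate is stored in kHz, printed as GHz.
    std::printf("Device %d: %s, %d multiprocessors, %.2f GB, %.2f GHz\n",
                i, p.name, p.multiProcessorCount,
                p.totalGlobalMem / 1073741824.0, p.clockRate / 1.0e6);
  }
  return 0;
}

Compile with something like "nvcc -o device_query device_query.cu" (the file name is arbitrary).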
- -Current pair styles supporting GPU Accelartion: - - 1. lj/cut/gpu - 2. gayberne/gpu - - - - MULTIPLE LAMMPS PROCESSES - -When using GPGPU acceleration, you are restricted to one physical GPU per -LAMMPS process. This can be multiple GPUs on a single node or across multiple -nodes. Intructions on GPU assignment can be found in the LAMMPS documentation. - - - - SPEEDUPS - -The speedups that can be obtained using this library are highly -dependent on the GPU architecture and the computational expense of -the pair potential. When comparing a single precision Tesla C1060 run to a -serial Intel Xeon 5140 2.33 GHz serial run, the speedup is ~4.42x for lj/cut -with a cutoff of 2.5. For gayberne with a cutoff of 7, the speedup is >103x -for 8000 particles. The speedup will improve with an increase in the -number of particles or an increase in the cutoff. - - - - BUILDING AND PRECISION MODES - -To build, edit the CUDA_CPP, CUDA_ARCH, CUDA_PREC, and CUDA_LINK files for -your machine. Type make. Additionally, the GPU package must be installed and -compiled for LAMMPS. The library supports 3 precision modes as determined by -the CUDA_PREC variable: - - CUDA_PREC = -D_SINGLE_SINGLE # Single precision for all calculations - CUDA_PREC = -D_DOUBLE_DOUBLE # Double precision for all calculations - CUDA_PREC = -D_SINGLE_DOUBLE # Accumulation of forces, etc. in double - -NOTE: Double precision is only supported on certain GPUS (with - compute capability>=1.3). - -NOTE: The gayberne/gpu pair style will only be installed if the ASPHERE - package has been installed before installing the GPU package in LAMMPS. - - - - GPU MEMORY - -Upon initialization of the pair style, the library will reserve memory for -64K atoms per GPU or 70% of each cards GPU memory, whichever value is limiting. -The value of 70% can be changed by editing the PERCENT_GPU_MEMORY definition -in the source file. The value of 64K cannot be increased and is the maximum -number of atoms allowed per GPU. Using the 'neigh_modify one' modifier in -your LAMMPS input script can help to increase maximum number of atoms per -GPU for cards with limited memory. - - - - diff --git a/lib/gpu/gb_gpu.cu b/lib/gpu/gb_gpu.cu deleted file mode 100644 index cde56d569a..0000000000 --- a/lib/gpu/gb_gpu.cu +++ /dev/null @@ -1,518 +0,0 @@ -/*************************************************************************** - gb_gpu.cu - ------------------- - W. Michael Brown - - Gay-Berne anisotropic potential GPU calcultation - - *** Force decomposition by Atom Version *** - - __________________________________________________________________________ - This file is part of the LAMMPS GPU Library - __________________________________________________________________________ - - begin : Tue Jun 23 2009 - copyright : (C) 2009 by W. Michael Brown - email : wmbrown@sandia.gov - ***************************************************************************/ - -/* ----------------------------------------------------------------------- - Copyright (2009) Sandia Corporation. Under the terms of Contract - DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains - certain rights in this software. This software is distributed under - the GNU General Public License. 
- ----------------------------------------------------------------------- */ - -#include -#include -#include "nvc_macros.h" -#include "nvc_timer.h" -#include "nvc_device.h" -#include "gb_gpu_memory.h" -#include "gb_gpu_kernel.h" - -using namespace std; - -static GB_GPU_Memory GBMF[MAX_GPU_THREADS]; -#define GBMT GB_GPU_Memory - -// --------------------------------------------------------------------------- -// Pack neighbors from dev_ij array into dev_nbor matrix for coalesced access -// -- Only pack neighbors matching the specified inclusive range of forms -// -- Only pack neighbors within cutoff -// --------------------------------------------------------------------------- -template -__global__ void kernel_pack_nbor(int *dev_nbor, const int nbor_pitch, - const int start, const int inum, - const int *dev_ij, const int form_low, - const int form_high, const int nall) { - - // ii indexes the two interacting particles in gi - int ii=threadIdx.x+INT_MUL(blockIdx.x,blockDim.x)+start; - - if (ii(i,0); - numtyp iy=_x_(i,1); - numtyp iz=_x_(i,2); - int itype=_x_(i,7); - - int newj=0; - for ( ; list=nall) - j%=nall; - int jtype=_x_(j,7); - - if (_form_(itype,jtype)>=form_low && _form_(itype,jtype)<=form_high) { - // Compute r12; - numtyp rsq=_x_(j,0)-ix; - rsq*=rsq; - numtyp t=_x_(j,1)-iy; - rsq+=t*t; - t=_x_(j,2)-iz; - rsq+=t*t; - - if (rsq< _cutsq_(itype,jtype)) { - *nbor=j; - nbor+=nbor_pitch; - newj++; - } - } - } - *nbor_newj=newj; - } -} - -// --------------------------------------------------------------------------- -// Pack neighbors from dev_ij array into dev_nbor matrix for coalesced access -// -- Only pack neighbors matching the specified inclusive range of forms -// -- Only pack neighbors within cutoff -// -- Fast version of routine that uses shared memory for LJ constants -// --------------------------------------------------------------------------- -template -__global__ void kernel_pack_nbor_fast(int *dev_nbor, const int nbor_pitch, - const int start, const int inum, - const int *dev_ij, const int form_low, - const int form_high, const int nall) { - - int ii=threadIdx.x; - __shared__ int form[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; - __shared__ numtyp cutsq[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; - if (ii(itype,jtype); - form[ii]=_form_(itype,jtype); - } - ii+=INT_MUL(blockIdx.x,blockDim.x)+start; - - if (ii(i,0); - numtyp iy=_x_(i,1); - numtyp iz=_x_(i,2); - int itype=INT_MUL(MAX_SHARED_TYPES,_x_(i,7)); - - int newj=0; - for ( ; list=nall) - j%=nall; - int mtype=itype+_x_(j,7); - - if (form[mtype]>=form_low && form[mtype]<=form_high) { - // Compute r12; - numtyp rsq=_x_(j,0)-ix; - rsq*=rsq; - numtyp t=_x_(j,1)-iy; - rsq+=t*t; - t=_x_(j,2)-iz; - rsq+=t*t; - - if (rsq -void pack_nbors(GBMT &gbm, const int GX, const int BX, const int start, - const int inum, const int form_low, const int form_high) { - if (gbm.shared_types) - kernel_pack_nbor_fast<<>> - (gbm.nbor.dev_nbor.begin(), gbm.nbor.dev_nbor.row_size(), start, inum, - gbm.nbor.ij.begin(),form_low,form_high,gbm.atom.nall()); - else - kernel_pack_nbor<<>> - (gbm.nbor.dev_nbor.begin(), gbm.nbor.dev_nbor.row_size(), start, inum, - gbm.nbor.ij.begin(),form_low,form_high,gbm.atom.nall()); -} - -// --------------------------------------------------------------------------- -// Convert something to a string -// --------------------------------------------------------------------------- -#include -template -inline string gb_gpu_toa(const t& in) { - ostringstream o; - o.precision(2); - o << in; - return o.str(); -} - -// 
--------------------------------------------------------------------------- -// Return string with GPU info -// --------------------------------------------------------------------------- -string gb_gpu_name(const int id, const int max_nbors) { - string name=GBMF[0].gpu.name(id)+", "+ - gb_gpu_toa(GBMF[0].gpu.cores(id))+" cores, "+ - gb_gpu_toa(GBMF[0].gpu.gigabytes(id))+" GB, "+ - gb_gpu_toa(GBMF[0].gpu.clock_rate(id))+" GHZ, "+ - gb_gpu_toa(GBMF[0].get_max_atoms(GBMF[0].gpu.bytes(id), - max_nbors))+" Atoms"; - return name; -} - -// --------------------------------------------------------------------------- -// Allocate memory on host and device and copy constants to device -// --------------------------------------------------------------------------- -int * gb_gpu_init(int &ij_size, const int ntypes, const double gamma, - const double upsilon, const double mu, double **shape, - double **well, double **cutsq, double **sigma, - double **epsilon, double *host_lshape, int **form, - double **host_lj1, double **host_lj2, double **host_lj3, - double **host_lj4, double **offset, double *special_lj, - const int max_nbors, const int thread, const int gpu_id) { - assert(thread -inline void _gb_gpu_atom(PairGPUAtom &atom, double **host_x, - double **host_quat, const int *host_type, - const bool rebuild, cudaStream_t &stream) { - atom.time_atom.start(); - atom.reset_write_buffer(); - - // Rows 1-3 of dev_x are position; rows 4-7 are quaternion - atom.add_atom_data(host_x[0],3); - atom.add_atom_data(host_x[0]+1,3); - atom.add_atom_data(host_x[0]+2,3); - atom.add_atom_data(host_quat[0],4); - atom.add_atom_data(host_quat[0]+1,4); - atom.add_atom_data(host_quat[0]+2,4); - atom.add_atom_data(host_quat[0]+3,4); - - int csize=7; - - // If a rebuild occured, copy type data - if (rebuild) { - atom.add_atom_data(host_type); - csize++; - } - - atom.copy_atom_data(csize,stream); - atom.time_atom.stop(); -} - -void gb_gpu_atom(double **host_x, double **host_quat, - const int *host_type, const bool rebuild, const int thread) { - _gb_gpu_atom(GBMF[thread].atom, host_x, host_quat, host_type, rebuild, - GBMF[thread].pair_stream); -} - -// --------------------------------------------------------------------------- -// Signal that we need to transfer a new neighbor list -// --------------------------------------------------------------------------- -template -int * _gb_gpu_reset_nbors(gbmtyp &gbm, const int nall, const int nlocal, - const int inum, int *ilist, const int *numj, - const int *type, bool &success) { - if (nall>gbm.max_atoms) { - success=false; - return 0; - } - success=true; - - gbm.nbor.time_nbor.start(); - - gbm.atom.nall(nall); - gbm.atom.inum(inum); - - if (gbm.multiple_forms) { - int p=0, acc=0; - for (int i=0; i -void _gb_gpu_nbors(gbmtyp &gbm, const int num_ij, const bool eflag) { - gbm.nbor.time_nbor.add_to_total(); - // CUDA_SAFE_CALL(cudaStreamSynchronize(gbm.pair_stream)); // Not if timed - - gbm.nbor.time_nbor.start(); - gbm.nbor.add(num_ij,gbm.pair_stream); - gbm.nbor.time_nbor.stop(); -} - -void gb_gpu_nbors(const int num_ij, const bool eflag, const int thread) { - _gb_gpu_nbors(GBMF[thread],num_ij,eflag); -} - -// --------------------------------------------------------------------------- -// Calculate energies, forces, and torques for all ij interactions -// --------------------------------------------------------------------------- -template -void _gb_gpu_gayberne(GBMT &gbm, const bool eflag, const bool vflag, - const bool rebuild) { - // Compute the block size and grid size to keep all 
cores busy - const int BX=BLOCK_1D; - - int GX=static_cast(ceil(static_cast(gbm.atom.inum())/BX)); - - if (gbm.multiple_forms) { - gbm.time_kernel.start(); - if (gbm.last_ellipse>0) { - // ------------ ELLIPSE_ELLIPSE and ELLIPSE_SPHERE --------------- - GX=static_cast(ceil(static_cast(gbm.last_ellipse)/ - static_cast(BX))); - pack_nbors(gbm,GX,BX, 0, gbm.last_ellipse,SPHERE_ELLIPSE,ELLIPSE_ELLIPSE); - gbm.time_kernel.stop(); - - gbm.time_gayberne.start(); - kernel_gayberne<<>> - (gbm.gamma_upsilon_mu.begin(), gbm.special_lj.begin(), - gbm.nbor.dev_nbor.begin(), gbm.nbor.dev_nbor.row_size(), - gbm.atom.ans.begin(), gbm.atom.ans.row_size(),gbm.dev_error.begin(), - eflag, vflag, gbm.last_ellipse, gbm.atom.nall()); - gbm.time_gayberne.stop(); - - if (gbm.last_ellipse==gbm.atom.inum()) { - gbm.time_kernel2.start(); - gbm.time_kernel2.stop(); - gbm.time_gayberne2.start(); - gbm.time_gayberne2.stop(); - gbm.time_pair.start(); - gbm.time_pair.stop(); - return; - } - - // ------------ SPHERE_ELLIPSE --------------- - - gbm.time_kernel2.start(); - GX=static_cast(ceil(static_cast(gbm.atom.inum()- - gbm.last_ellipse)/BX)); - pack_nbors(gbm,GX,BX,gbm.last_ellipse,gbm.atom.inum(),ELLIPSE_SPHERE, - ELLIPSE_SPHERE); - gbm.time_kernel2.stop(); - - gbm.time_gayberne2.start(); - kernel_sphere_gb<<>> - (gbm.gamma_upsilon_mu.begin(), gbm.special_lj.begin(), - gbm.nbor.dev_nbor.begin(), gbm.nbor.dev_nbor.row_size(), - gbm.atom.ans.begin(), gbm.atom.ans.row_size(),gbm.dev_error.begin(), - eflag, vflag, gbm.last_ellipse, gbm.atom.inum(), gbm.atom.nall()); - gbm.time_gayberne2.stop(); - } else { - gbm.atom.ans.zero(); - gbm.time_kernel.stop(); - gbm.time_gayberne.start(); - gbm.time_gayberne.stop(); - gbm.time_kernel2.start(); - gbm.time_kernel2.stop(); - gbm.time_gayberne2.start(); - gbm.time_gayberne2.stop(); - } - - // ------------ LJ --------------- - gbm.time_pair.start(); - if (gbm.last_ellipse<<>> - (gbm.special_lj.begin(), gbm.nbor.dev_nbor.begin(), - gbm.nbor.ij.begin(), gbm.nbor.dev_nbor.row_size(), - gbm.atom.ans.begin(), gbm.atom.ans.row_size(), - gbm.dev_error.begin(), eflag, vflag, gbm.last_ellipse, - gbm.atom.inum(), gbm.atom.nall()); - else - kernel_lj<<>> - (gbm.special_lj.begin(), gbm.nbor.dev_nbor.begin(), - gbm.nbor.ij.begin(), gbm.nbor.dev_nbor.row_size(), - gbm.atom.ans.begin(), gbm.atom.ans.row_size(),gbm.dev_error.begin(), - eflag, vflag, gbm.last_ellipse, gbm.atom.inum(), gbm.atom.nall()); - } - gbm.time_pair.stop(); - } else { - gbm.time_kernel.start(); - pack_nbors(gbm, GX, BX, 0, gbm.atom.inum(),SPHERE_SPHERE,ELLIPSE_ELLIPSE); - gbm.time_kernel.stop(); - - gbm.time_gayberne.start(); - kernel_gayberne<<>> - (gbm.gamma_upsilon_mu.begin(), gbm.special_lj.begin(), - gbm.nbor.dev_nbor.begin(), gbm.nbor.dev_nbor.row_size(), - gbm.atom.ans.begin(), gbm.atom.ans.row_size(), gbm.dev_error.begin(), - eflag, vflag, gbm.atom.inum(), gbm.atom.nall()); - gbm.time_gayberne.stop(); - } -} - -void gb_gpu_gayberne(const bool eflag, const bool vflag, const bool rebuild, - const int thread) { - _gb_gpu_gayberne(GBMF[thread],eflag,vflag,rebuild); -} - -// --------------------------------------------------------------------------- -// Get energies, forces, and torques to host -// --------------------------------------------------------------------------- -template -double _gb_gpu_forces(GBMT &gbm, double **f, double **tor, const int *ilist, - const bool eflag, const bool vflag, const bool eflag_atom, - const bool vflag_atom, double *eatom, double **vatom, - double *virial) { - double evdw; - - 
gbm.atom.time_answer.start(); - gbm.atom.copy_answers(eflag,vflag,gbm.pair_stream); - - gbm.atom.time_atom.add_to_total(); - gbm.nbor.time_nbor.add_to_total(); - gbm.time_kernel.add_to_total(); - gbm.time_gayberne.add_to_total(); - if (gbm.multiple_forms) { - gbm.time_kernel2.add_to_total(); - gbm.time_gayberne2.add_to_total(); - gbm.time_pair.add_to_total(); - } - // CUDA_SAFE_CALL(cudaStreamSynchronize(gbm.pair_stream)); // Not if timed - - evdw=gbm.atom.energy_virial(ilist,eflag_atom,vflag_atom,eatom,vatom,virial); - gbm.atom.add_forces(ilist,f); - gbm.atom.add_torques(ilist,tor,gbm.last_ellipse); - gbm.atom.time_answer.stop(); - gbm.atom.time_answer.add_to_total(); - return evdw; -} - -double gb_gpu_forces(double **f, double **tor, const int *ilist, - const bool eflag, const bool vflag, const bool eflag_atom, - const bool vflag_atom, double *eatom, double **vatom, - double *virial, const int thread) { - return _gb_gpu_forces - (GBMF[thread],f,tor,ilist,eflag,vflag,eflag_atom, - vflag_atom,eatom,vatom,virial); -} - -void gb_gpu_time(const int i) { - cout.precision(4); - cout << "Atom copy: " << GBMF[i].atom.time_atom.total_seconds() - << " s.\n" - << "Neighbor copy: " << GBMF[i].nbor.time_nbor.total_seconds() - << " s.\n" - << "Neighbor pack: " << GBMF[i].time_kernel.total_seconds()+ - GBMF[i].time_kernel2.total_seconds() << " s.\n" - << "Force calc: " << GBMF[i].time_gayberne.total_seconds()+ - GBMF[i].time_gayberne2.total_seconds()<< " s.\n"; - if (GBMF[i].multiple_forms) - cout << "LJ calc: " << GBMF[i].time_pair.total_seconds() << " s.\n"; - cout << "Answer copy: " << GBMF[i].atom.time_answer.total_seconds() - << " s.\n"; -} - -int gb_gpu_num_devices() { - return GBMF[0].gpu.num_devices(); -} - -double gb_gpu_bytes() { - return GBMF[0].host_memory_usage(); -} diff --git a/lib/gpu/gb_gpu_extra.h b/lib/gpu/gb_gpu_extra.h deleted file mode 100644 index c918e07070..0000000000 --- a/lib/gpu/gb_gpu_extra.h +++ /dev/null @@ -1,337 +0,0 @@ -/*************************************************************************** - gb_gpu_extra.h - ------------------- - W. Michael Brown - - Inline GPU kernel routines ala math_extra for the CPU. - - __________________________________________________________________________ - This file is part of the LAMMPS GPU Library - __________________________________________________________________________ - - begin : Tue Jun 23 2009 - copyright : (C) 2009 by W. Michael Brown - email : wmbrown@sandia.gov - ***************************************************************************/ - -/* ----------------------------------------------------------------------- - Copyright (2009) Sandia Corporation. Under the terms of Contract - DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains - certain rights in this software. This software is distributed under - the GNU General Public License. 
- ----------------------------------------------------------------------- */ - -#ifndef GB_GPU_EXTRA_H -#define GB_GPU_EXTRA_H - -#include "math.h" -#include "stdio.h" -#include "string.h" - -/* ---------------------------------------------------------------------- - Atomic update of global memory -------------------------------------------------------------------------- */ -/* -template __device__ -inline void atomicAdd(numtyp *address, numtyp val); - -template <> -__device__ inline void atomicAdd(float *address, float val) -{ - int i_val = __float_as_int(val); - int tmp0 = 0; - int tmp1; - - while( (tmp1 = atomicCAS((int *)address, tmp0, i_val)) != tmp0) { - tmp0 = tmp1; - i_val = __float_as_int(val + __int_as_float(tmp1)); - } -}*/ - -/* ---------------------------------------------------------------------- - dot product of 2 vectors -------------------------------------------------------------------------- */ - -template -static __inline__ __device__ numtyp gpu_dot3(const numtyp *v1, const numtyp *v2) -{ - return v1[0]*v2[0]+v1[1]*v2[1]+v1[2]*v2[2]; -} - -/* ---------------------------------------------------------------------- - cross product of 2 vectors -------------------------------------------------------------------------- */ - -template -static __inline__ __device__ void gpu_cross3(const numtyp *v1, - const numtyp *v2, numtyp *ans) -{ - ans[0] = v1[1]*v2[2]-v1[2]*v2[1]; - ans[1] = v1[2]*v2[0]-v1[0]*v2[2]; - ans[2] = v1[0]*v2[1]-v1[1]*v2[0]; -} - -/* ---------------------------------------------------------------------- - determinant of a matrix -------------------------------------------------------------------------- */ - -template -static __inline__ __device__ numtyp gpu_det3(const numtyp m[9]) -{ - numtyp ans = m[0]*m[4]*m[8] - m[0]*m[5]*m[7] - - m[3]*m[1]*m[8] + m[3]*m[2]*m[7] + - m[6]*m[1]*m[5] - m[6]*m[2]*m[4]; - return ans; -} - -/* ---------------------------------------------------------------------- - diagonal matrix times a full matrix -------------------------------------------------------------------------- */ - -template -static __inline__ __device__ void gpu_well_times3(const int i, const numtyp m[9], - numtyp ans[9]) -{ - ans[0] = _well_(i,0)*m[0]; - ans[1] = _well_(i,0)*m[1]; - ans[2] = _well_(i,0)*m[2]; - ans[3] = _well_(i,1)*m[3]; - ans[4] = _well_(i,1)*m[4]; - ans[5] = _well_(i,1)*m[5]; - ans[6] = _well_(i,2)*m[6]; - ans[7] = _well_(i,2)*m[7]; - ans[8] = _well_(i,2)*m[8]; -} - -/* ---------------------------------------------------------------------- - diagonal matrix times a full matrix -------------------------------------------------------------------------- */ - -template -static __inline__ __device__ void gpu_shape_times3(const int i, const numtyp m[9], - numtyp ans[9]) -{ - ans[0] = _shape_(i,0)*m[0]; - ans[1] = _shape_(i,0)*m[1]; - ans[2] = _shape_(i,0)*m[2]; - ans[3] = _shape_(i,1)*m[3]; - ans[4] = _shape_(i,1)*m[4]; - ans[5] = _shape_(i,1)*m[5]; - ans[6] = _shape_(i,2)*m[6]; - ans[7] = _shape_(i,2)*m[7]; - ans[8] = _shape_(i,2)*m[8]; -} - -/* ---------------------------------------------------------------------- - add two matrices -------------------------------------------------------------------------- */ - -template -static __inline__ __device__ void gpu_plus3(const numtyp m[9], - const numtyp m2[9], numtyp ans[9]) -{ - ans[0] = m[0]+m2[0]; - ans[1] = m[1]+m2[1]; - ans[2] = m[2]+m2[2]; - ans[3] = m[3]+m2[3]; - ans[4] = m[4]+m2[4]; - ans[5] = m[5]+m2[5]; - ans[6] = m[6]+m2[6]; - ans[7] = m[7]+m2[7]; - ans[8] = m[8]+m2[8]; -} - -/* 
---------------------------------------------------------------------- - multiply the transpose of mat1 times mat2 -------------------------------------------------------------------------- */ - -template -static __inline__ __device__ void gpu_transpose_times3(const numtyp m[9], - const numtyp m2[9], - numtyp ans[9]) -{ - ans[0] = m[0]*m2[0]+m[3]*m2[3]+m[6]*m2[6]; - ans[1] = m[0]*m2[1]+m[3]*m2[4]+m[6]*m2[7]; - ans[2] = m[0]*m2[2]+m[3]*m2[5]+m[6]*m2[8]; - ans[3] = m[1]*m2[0]+m[4]*m2[3]+m[7]*m2[6]; - ans[4] = m[1]*m2[1]+m[4]*m2[4]+m[7]*m2[7]; - ans[5] = m[1]*m2[2]+m[4]*m2[5]+m[7]*m2[8]; - ans[6] = m[2]*m2[0]+m[5]*m2[3]+m[8]*m2[6]; - ans[7] = m[2]*m2[1]+m[5]*m2[4]+m[8]*m2[7]; - ans[8] = m[2]*m2[2]+m[5]*m2[5]+m[8]*m2[8]; -} - -/* ---------------------------------------------------------------------- - row vector times matrix -------------------------------------------------------------------------- */ - -template -static __inline__ __device__ void gpu_row_times3(const numtyp *v, - const numtyp m[9], numtyp *ans) -{ - ans[0] = m[0]*v[0]+v[1]*m[3]+v[2]*m[6]; - ans[1] = v[0]*m[1]+m[4]*v[1]+v[2]*m[7]; - ans[2] = v[0]*m[2]+v[1]*m[5]+m[8]*v[2]; -} - -/* ---------------------------------------------------------------------- - solve Ax = b or M ans = v - use gaussian elimination & partial pivoting on matrix - error_flag set to 2 if bad matrix inversion attempted -------------------------------------------------------------------------- */ - -template -static __inline__ __device__ void gpu_mldivide3(const numtyp m[9], - const numtyp *v, numtyp *ans, - int *error_flag) -{ - // create augmented matrix for pivoting - - numtyp aug[12], t; - - aug[3] = v[0]; - aug[0] = m[0]; - aug[1] = m[1]; - aug[2] = m[2]; - aug[7] = v[1]; - aug[4] = m[3]; - aug[5] = m[4]; - aug[6] = m[5]; - aug[11] = v[2]; - aug[8] = m[6]; - aug[9] = m[7]; - aug[10] = m[8]; - - if (fabs(aug[4]) > fabs(aug[0])) { - numtyp swapt; - swapt=aug[0]; aug[0]=aug[4]; aug[4]=swapt; - swapt=aug[1]; aug[1]=aug[5]; aug[5]=swapt; - swapt=aug[2]; aug[2]=aug[6]; aug[6]=swapt; - swapt=aug[3]; aug[3]=aug[7]; aug[7]=swapt; - } - if (fabs(aug[8]) > fabs(aug[0])) { - numtyp swapt; - swapt=aug[0]; aug[0]=aug[8]; aug[8]=swapt; - swapt=aug[1]; aug[1]=aug[9]; aug[9]=swapt; - swapt=aug[2]; aug[2]=aug[10]; aug[10]=swapt; - swapt=aug[3]; aug[3]=aug[11]; aug[11]=swapt; - } - - if (aug[0] != (numtyp)0.0) { - if (0!=0) { - numtyp swapt; - swapt=aug[0]; aug[0]=aug[0]; aug[0]=swapt; - swapt=aug[1]; aug[1]=aug[1]; aug[1]=swapt; - swapt=aug[2]; aug[2]=aug[2]; aug[2]=swapt; - swapt=aug[3]; aug[3]=aug[3]; aug[3]=swapt; - } - } else if (aug[4] != (numtyp)0.0) { - if (1!=0) { - numtyp swapt; - swapt=aug[0]; aug[0]=aug[4]; aug[4]=swapt; - swapt=aug[1]; aug[1]=aug[5]; aug[5]=swapt; - swapt=aug[2]; aug[2]=aug[6]; aug[6]=swapt; - swapt=aug[3]; aug[3]=aug[7]; aug[7]=swapt; - } - } else if (aug[8] != (numtyp)0.0) { - if (2!=0) { - numtyp swapt; - swapt=aug[0]; aug[0]=aug[8]; aug[8]=swapt; - swapt=aug[1]; aug[1]=aug[9]; aug[9]=swapt; - swapt=aug[2]; aug[2]=aug[10]; aug[10]=swapt; - swapt=aug[3]; aug[3]=aug[11]; aug[11]=swapt; - } - } else - *error_flag=2; - - t = aug[4]/aug[0]; - aug[5]-=t*aug[1]; - aug[6]-=t*aug[2]; - aug[7]-=t*aug[3]; - t = aug[8]/aug[0]; - aug[9]-=t*aug[1]; - aug[10]-=t*aug[2]; - aug[11]-=t*aug[3]; - - if (fabs(aug[9]) > fabs(aug[5])) { - numtyp swapt; - swapt=aug[4]; aug[4]=aug[8]; aug[8]=swapt; - swapt=aug[5]; aug[5]=aug[9]; aug[9]=swapt; - swapt=aug[6]; aug[6]=aug[10]; aug[10]=swapt; - swapt=aug[7]; aug[7]=aug[11]; aug[11]=swapt; - } - - if (aug[5] != 
(numtyp)0.0) { - if (1!=1) { - numtyp swapt; - swapt=aug[4]; aug[4]=aug[4]; aug[4]=swapt; - swapt=aug[5]; aug[5]=aug[5]; aug[5]=swapt; - swapt=aug[6]; aug[6]=aug[6]; aug[6]=swapt; - swapt=aug[7]; aug[7]=aug[7]; aug[7]=swapt; - } - } else if (aug[9] != (numtyp)0.0) { - if (2!=1) { - numtyp swapt; - swapt=aug[4]; aug[4]=aug[8]; aug[8]=swapt; - swapt=aug[5]; aug[5]=aug[9]; aug[9]=swapt; - swapt=aug[6]; aug[6]=aug[10]; aug[10]=swapt; - swapt=aug[7]; aug[7]=aug[11]; aug[11]=swapt; - } - } - - t = aug[9]/aug[5]; - aug[10]-=t*aug[6]; - aug[11]-=t*aug[7]; - - if (aug[10] == (numtyp)0.0) - *error_flag=2; - - ans[2] = aug[11]/aug[10]; - t = (numtyp)0.0; - t += aug[6]*ans[2]; - ans[1] = (aug[7]-t) / aug[5]; - t = (numtyp)0.0; - t += aug[1]*ans[1]; - t += aug[2]*ans[2]; - ans[0] = (aug[3]-t) / aug[0]; -} - -/* ---------------------------------------------------------------------- - compute rotation matrix from quaternion conjugate - quat = [w i j k] -------------------------------------------------------------------------- */ - -template -static __inline__ __device__ void gpu_quat_to_mat_trans(const int qi, - numtyp mat[9]) -{ - numtyp qi3=_x_(qi,3); - numtyp qi4=_x_(qi,4); - numtyp qi5=_x_(qi,5); - numtyp qi6=_x_(qi,6); - - numtyp w2 = qi3*qi3; - numtyp i2 = qi4*qi4; - numtyp j2 = qi5*qi5; - numtyp k2 = qi6*qi6; - numtyp twoij = 2.0*qi4*qi5; - numtyp twoik = 2.0*qi4*qi6; - numtyp twojk = 2.0*qi5*qi6; - numtyp twoiw = 2.0*qi4*qi3; - numtyp twojw = 2.0*qi5*qi3; - numtyp twokw = 2.0*qi6*qi3; - - mat[0] = w2+i2-j2-k2; - mat[3] = twoij-twokw; - mat[6] = twojw+twoik; - - mat[1] = twoij+twokw; - mat[4] = w2-i2+j2-k2; - mat[7] = twojk-twoiw; - - mat[2] = twoik-twojw; - mat[5] = twojk+twoiw; - mat[8] = w2-i2-j2+k2; -} - -#endif diff --git a/lib/gpu/gb_gpu_kernel.h b/lib/gpu/gb_gpu_kernel.h deleted file mode 100644 index 8ca047e0e1..0000000000 --- a/lib/gpu/gb_gpu_kernel.h +++ /dev/null @@ -1,861 +0,0 @@ -/*************************************************************************** - gb_gpu_kernel.cu - ------------------- - W. Michael Brown - - Routines that actually perform the force/torque computation - - *** Force Decomposition by Atom Version *** - __________________________________________________________________________ - This file is part of the LAMMPS GPU Library - __________________________________________________________________________ - - begin : Tue Jun 23 2009 - copyright : (C) 2009 by W. Michael Brown - email : wmbrown@sandia.gov - ***************************************************************************/ - -/* ----------------------------------------------------------------------- - Copyright (2009) Sandia Corporation. Under the terms of Contract - DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains - certain rights in this software. This software is distributed under - the GNU General Public License. 
- ----------------------------------------------------------------------- */ - -#ifndef GB_GPU_KERNEL -#define GB_GPU_KERNEL - -#include "gb_gpu_extra.h" - -template -static __inline__ __device__ void compute_eta_torque(numtyp m[9], - numtyp m2[9], - const int i, - numtyp ans[9]) -{ - numtyp den = m[3]*m[2]*m[7]-m[0]*m[5]*m[7]- - m[2]*m[6]*m[4]+m[1]*m[6]*m[5]- - m[3]*m[1]*m[8]+m[0]*m[4]*m[8]; - den = (numtyp)1.0/den; - - numtyp shapex=_shape_(i,0); - numtyp shapey=_shape_(i,1); - numtyp shapez=_shape_(i,2); - - ans[0] = shapex*(m[5]*m[1]*m2[2]+(numtyp)2.0*m[4]*m[8]*m2[0]- - m[4]*m2[2]*m[2]-(numtyp)2.0*m[5]*m2[0]*m[7]+ - m2[1]*m[2]*m[7]-m2[1]*m[1]*m[8]- - m[3]*m[8]*m2[1]+m[6]*m[5]*m2[1]+ - m[3]*m2[2]*m[7]-m2[2]*m[6]*m[4])*den; - - ans[1] = shapex*(m[2]*m2[0]*m[7]-m[8]*m2[0]*m[1]+ - (numtyp)2.0*m[0]*m[8]*m2[1]-m[0]*m2[2]*m[5]- - (numtyp)2.0*m[6]*m[2]*m2[1]+m2[2]*m[3]*m[2]- - m[8]*m[3]*m2[0]+m[6]*m2[0]*m[5]+ - m[6]*m2[2]*m[1]-m2[2]*m[0]*m[7])*den; - - ans[2] = shapex*(m[1]*m[5]*m2[0]-m[2]*m2[0]*m[4]- - m[0]*m[5]*m2[1]+m[3]*m[2]*m2[1]- - m2[1]*m[0]*m[7]-m[6]*m[4]*m2[0]+ - (numtyp)2.0*m[4]*m[0]*m2[2]-(numtyp)2.0*m[3]*m2[2]*m[1]+ - m[3]*m[7]*m2[0]+m[6]*m2[1]*m[1])*den; - - ans[3] = shapey*(-m[4]*m2[5]*m[2]+(numtyp)2.0*m[4]*m[8]*m2[3]+ - m[5]*m[1]*m2[5]-(numtyp)2.0*m[5]*m2[3]*m[7]+ - m2[4]*m[2]*m[7]-m2[4]*m[1]*m[8]- - m[3]*m[8]*m2[4]+m[6]*m[5]*m2[4]- - m2[5]*m[6]*m[4]+m[3]*m2[5]*m[7])*den; - - ans[4] = shapey*(m[2]*m2[3]*m[7]-m[1]*m[8]*m2[3]+ - (numtyp)2.0*m[8]*m[0]*m2[4]-m2[5]*m[0]*m[5]- - (numtyp)2.0*m[6]*m2[4]*m[2]-m[3]*m[8]*m2[3]+ - m[6]*m[5]*m2[3]+m[3]*m2[5]*m[2]- - m[0]*m2[5]*m[7]+m2[5]*m[1]*m[6])*den; - - ans[5] = shapey*(m[1]*m[5]*m2[3]-m[2]*m2[3]*m[4]- - m[0]*m[5]*m2[4]+m[3]*m[2]*m2[4]+ - (numtyp)2.0*m[4]*m[0]*m2[5]-m[0]*m2[4]*m[7]+ - m[1]*m[6]*m2[4]-m2[3]*m[6]*m[4]- - (numtyp)2.0*m[3]*m[1]*m2[5]+m[3]*m2[3]*m[7])*den; - - ans[6] = shapez*(-m[4]*m[2]*m2[8]+m[1]*m[5]*m2[8]+ - (numtyp)2.0*m[4]*m2[6]*m[8]-m[1]*m2[7]*m[8]+ - m[2]*m[7]*m2[7]-(numtyp)2.0*m2[6]*m[7]*m[5]- - m[3]*m2[7]*m[8]+m[5]*m[6]*m2[7]- - m[4]*m[6]*m2[8]+m[7]*m[3]*m2[8])*den; - - ans[7] = shapez*-(m[1]*m[8]*m2[6]-m[2]*m2[6]*m[7]- - (numtyp)2.0*m2[7]*m[0]*m[8]+m[5]*m2[8]*m[0]+ - (numtyp)2.0*m2[7]*m[2]*m[6]+m[3]*m2[6]*m[8]- - m[3]*m[2]*m2[8]-m[5]*m[6]*m2[6]+ - m[0]*m2[8]*m[7]-m2[8]*m[1]*m[6])*den; - - ans[8] = shapez*(m[1]*m[5]*m2[6]-m[2]*m2[6]*m[4]- - m[0]*m[5]*m2[7]+m[3]*m[2]*m2[7]- - m[4]*m[6]*m2[6]-m[7]*m2[7]*m[0]+ - (numtyp)2.0*m[4]*m2[8]*m[0]+m[7]*m[3]*m2[6]+ - m[6]*m[1]*m2[7]-(numtyp)2.0*m2[8]*m[3]*m[1])*den; -} - -template -__global__ void kernel_gayberne(const numtyp *gum, const numtyp *special_lj, - const int *dev_nbor, const int nbor_pitch, - acctyp *ans, size_t ans_pitch, int *err_flag, - const bool eflag, const bool vflag, - const int inum, const int nall) { - - __shared__ numtyp sp_lj[4]; - - // ii indexes the two interacting particles in gi - int ii=threadIdx.x; - if (ii<4) - sp_lj[ii]=special_lj[ii]; - ii+=INT_MUL(blockIdx.x,blockDim.x); - - if (ii(i,0); - numtyp iy=_x_(i,1); - numtyp iz=_x_(i,2); - int itype=_x_(i,7); - numtyp a1[9], b1[9], g1[9]; - { - numtyp t[9]; - gpu_quat_to_mat_trans(i,a1); - gpu_shape_times3(itype,a1,t); - gpu_transpose_times3(a1,t,g1); - gpu_well_times3(itype,a1,t); - gpu_transpose_times3(a1,t,b1); - } - - numtyp factor_lj; - for ( ; nbor(j,7); - - // Compute r12 - numtyp r12[3]; - r12[0] = _x_(j,0)-ix; - r12[1] = _x_(j,1)-iy; - r12[2] = _x_(j,2)-iz; - numtyp ir = gpu_dot3(r12,r12); - - ir = rsqrt(ir); - numtyp r = (numtyp)1.0/ir; - - numtyp a2[9]; - gpu_quat_to_mat_trans(j,a2); - - 
numtyp u_r, dUr[3], tUr[3], eta, teta[3]; - { // Compute U_r, dUr, eta, and teta - // Compute g12 - numtyp g12[9]; - { - numtyp g2[9]; - { - gpu_shape_times3(jtype,a2,g12); - gpu_transpose_times3(a2,g12,g2); - gpu_plus3(g1,g2,g12); - } - - { // Compute U_r and dUr - - // Compute kappa - numtyp kappa[3]; - gpu_mldivide3(g12,r12,kappa,err_flag); - - // -- replace r12 with r12 hat - r12[0]*=ir; - r12[1]*=ir; - r12[2]*=ir; - - // -- kappa is now / r - kappa[0]*=ir; - kappa[1]*=ir; - kappa[2]*=ir; - - // energy - - // compute u_r and dUr - numtyp uslj_rsq; - { - // Compute distance of closest approach - numtyp h12, sigma12; - sigma12 = gpu_dot3(r12,kappa); - sigma12 = rsqrt((numtyp)0.5*sigma12); - h12 = r-sigma12; - - // -- kappa is now ok - kappa[0]*=r; - kappa[1]*=r; - kappa[2]*=r; - - numtyp sigma = _sigma_(itype,jtype); - numtyp epsilon = _epsilon_(itype,jtype); - numtyp varrho = sigma/(h12+gum[0]*sigma); - numtyp varrho6 = varrho*varrho*varrho; - varrho6*=varrho6; - numtyp varrho12 = varrho6*varrho6; - u_r = (numtyp)4.0*epsilon*(varrho12-varrho6); - - numtyp temp1 = ((numtyp)2.0*varrho12*varrho-varrho6*varrho)/sigma; - temp1 = temp1*(numtyp)24.0*epsilon; - uslj_rsq = temp1*sigma12*sigma12*sigma12*(numtyp)0.5; - numtyp temp2 = gpu_dot3(kappa,r12); - uslj_rsq = uslj_rsq*ir*ir; - - dUr[0] = temp1*r12[0]+uslj_rsq*(kappa[0]-temp2*r12[0]); - dUr[1] = temp1*r12[1]+uslj_rsq*(kappa[1]-temp2*r12[1]); - dUr[2] = temp1*r12[2]+uslj_rsq*(kappa[2]-temp2*r12[2]); - } - - // torque for particle 1 - { - numtyp tempv[3], tempv2[3]; - tempv[0] = -uslj_rsq*kappa[0]; - tempv[1] = -uslj_rsq*kappa[1]; - tempv[2] = -uslj_rsq*kappa[2]; - gpu_row_times3(kappa,g1,tempv2); - gpu_cross3(tempv,tempv2,tUr); - } - } - } - - // Compute eta - { - eta = (numtyp)2.0*_lshape_(itype)*_lshape_(jtype); - numtyp det_g12 = gpu_det3(g12); - eta = pow(eta/det_g12,gum[1]); - } - - // Compute teta - numtyp temp[9], tempv[3], tempv2[3]; - compute_eta_torque(g12,a1,itype,temp); - numtyp temp1 = -eta*gum[1]; - - tempv[0] = temp1*temp[0]; - tempv[1] = temp1*temp[1]; - tempv[2] = temp1*temp[2]; - gpu_cross3(a1,tempv,tempv2); - teta[0] = tempv2[0]; - teta[1] = tempv2[1]; - teta[2] = tempv2[2]; - - tempv[0] = temp1*temp[3]; - tempv[1] = temp1*temp[4]; - tempv[2] = temp1*temp[5]; - gpu_cross3(a1+3,tempv,tempv2); - teta[0] += tempv2[0]; - teta[1] += tempv2[1]; - teta[2] += tempv2[2]; - - tempv[0] = temp1*temp[6]; - tempv[1] = temp1*temp[7]; - tempv[2] = temp1*temp[8]; - gpu_cross3(a1+6,tempv,tempv2); - teta[0] += tempv2[0]; - teta[1] += tempv2[1]; - teta[2] += tempv2[2]; - } - - numtyp chi, dchi[3], tchi[3]; - { // Compute chi and dchi - - // Compute b12 - numtyp b2[9], b12[9]; - { - gpu_well_times3(jtype,a2,b12); - gpu_transpose_times3(a2,b12,b2); - gpu_plus3(b1,b2,b12); - } - - // compute chi_12 - r12[0]*=r; - r12[1]*=r; - r12[2]*=r; - numtyp iota[3]; - gpu_mldivide3(b12,r12,iota,err_flag); - // -- iota is now iota/r - iota[0]*=ir; - iota[1]*=ir; - iota[2]*=ir; - r12[0]*=ir; - r12[1]*=ir; - r12[2]*=ir; - chi = gpu_dot3(r12,iota); - chi = pow(chi*(numtyp)2.0,gum[2]); - - // -- iota is now ok - iota[0]*=r; - iota[1]*=r; - iota[2]*=r; - - numtyp temp1 = gpu_dot3(iota,r12); - numtyp temp2 = (numtyp)-4.0*ir*ir*gum[2]*pow(chi,(gum[2]-(numtyp)1.0)/gum[2]); - dchi[0] = temp2*(iota[0]-temp1*r12[0]); - dchi[1] = temp2*(iota[1]-temp1*r12[1]); - dchi[2] = temp2*(iota[2]-temp1*r12[2]); - - // compute t_chi - numtyp tempv[3]; - gpu_row_times3(iota,b1,tempv); - gpu_cross3(tempv,iota,tchi); - temp1 = (numtyp)-4.0*ir*ir; - tchi[0] *= temp1; - tchi[1] *= 
temp1; - tchi[2] *= temp1; - } - - numtyp temp2 = factor_lj*eta*chi; - if (eflag) - energy+=u_r*temp2; - numtyp temp1 = -eta*u_r*factor_lj; - if (vflag) { - r12[0]*=-r; - r12[1]*=-r; - r12[2]*=-r; - numtyp ft=temp1*dchi[0]-temp2*dUr[0]; - fx+=ft; - virial[0]+=r12[0]*ft; - ft=temp1*dchi[1]-temp2*dUr[1]; - fy+=ft; - virial[1]+=r12[1]*ft; - virial[3]+=r12[0]*ft; - ft=temp1*dchi[2]-temp2*dUr[2]; - fz+=ft; - virial[2]+=r12[2]*ft; - virial[4]+=r12[0]*ft; - virial[5]+=r12[1]*ft; - } else { - fx+=temp1*dchi[0]-temp2*dUr[0]; - fy+=temp1*dchi[1]-temp2*dUr[1]; - fz+=temp1*dchi[2]-temp2*dUr[2]; - } - - // Torque on 1 - temp1 = -u_r*eta*factor_lj; - temp2 = -u_r*chi*factor_lj; - numtyp temp3 = -chi*eta*factor_lj; - torx+=temp1*tchi[0]+temp2*teta[0]+temp3*tUr[0]; - tory+=temp1*tchi[1]+temp2*teta[1]+temp3*tUr[1]; - torz+=temp1*tchi[2]+temp2*teta[2]+temp3*tUr[2]; - - } // for nbor - - // Store answers - acctyp *ap1=ans+ii; - if (eflag) { - *ap1=energy; - ap1+=ans_pitch; - } - if (vflag) { - for (int i=0; i<6; i++) { - *ap1=virial[i]; - ap1+=ans_pitch; - } - } - *ap1=fx; - ap1+=ans_pitch; - *ap1=fy; - ap1+=ans_pitch; - *ap1=fz; - ap1+=ans_pitch; - *ap1=torx; - ap1+=ans_pitch; - *ap1=tory; - ap1+=ans_pitch; - *ap1=torz; - - } // if ii -} - -template -__global__ void kernel_sphere_gb(const numtyp *gum, const numtyp *special_lj, - const int *dev_nbor, const int nbor_pitch, - acctyp *ans, size_t ans_pitch, int *err_flag, - const bool eflag, const bool vflag, - const int start, const int inum, - const int nall) { - __shared__ numtyp sp_lj[4]; - - // ii indexes the two interacting particles in gi - int ii=threadIdx.x; - if (ii<4) - sp_lj[ii]=special_lj[ii]; - ii+=INT_MUL(blockIdx.x,blockDim.x)+start; - - if (ii(i,0); - numtyp iy=_x_(i,1); - numtyp iz=_x_(i,2); - int itype=_x_(i,7); - - numtyp oner=_shape_(itype,0); - numtyp one_well=_well_(itype,0); - - numtyp factor_lj; - for ( ; nbor(j,7); - - // Compute r12 - numtyp r12[3]; - r12[0] = _x_(j,0)-ix; - r12[1] = _x_(j,1)-iy; - r12[2] = _x_(j,2)-iz; - numtyp ir = gpu_dot3(r12,r12); - - ir = rsqrt(ir); - numtyp r = (numtyp)1.0/ir; - - numtyp r12hat[3]; - r12hat[0]=r12[0]*ir; - r12hat[1]=r12[1]*ir; - r12hat[2]=r12[2]*ir; - - numtyp a2[9]; - gpu_quat_to_mat_trans(j,a2); - - numtyp u_r, dUr[3], eta; - { // Compute U_r, dUr, eta, and teta - // Compute g12 - numtyp g12[9]; - { - { - numtyp g2[9]; - gpu_shape_times3(jtype,a2,g12); - gpu_transpose_times3(a2,g12,g2); - g12[0]=g2[0]+oner; - g12[4]=g2[4]+oner; - g12[8]=g2[8]+oner; - g12[1]=g2[1]; - g12[2]=g2[2]; - g12[3]=g2[3]; - g12[5]=g2[5]; - g12[6]=g2[6]; - g12[7]=g2[7]; - } - - { // Compute U_r and dUr - - // Compute kappa - numtyp kappa[3]; - gpu_mldivide3(g12,r12,kappa,err_flag); - - // -- kappa is now / r - kappa[0]*=ir; - kappa[1]*=ir; - kappa[2]*=ir; - - // energy - - // compute u_r and dUr - numtyp uslj_rsq; - { - // Compute distance of closest approach - numtyp h12, sigma12; - sigma12 = gpu_dot3(r12hat,kappa); - sigma12 = rsqrt((numtyp)0.5*sigma12); - h12 = r-sigma12; - - // -- kappa is now ok - kappa[0]*=r; - kappa[1]*=r; - kappa[2]*=r; - - numtyp sigma = _sigma_(itype,jtype); - numtyp epsilon = _epsilon_(itype,jtype); - numtyp varrho = sigma/(h12+gum[0]*sigma); - numtyp varrho6 = varrho*varrho*varrho; - varrho6*=varrho6; - numtyp varrho12 = varrho6*varrho6; - u_r = (numtyp)4.0*epsilon*(varrho12-varrho6); - - numtyp temp1 = ((numtyp)2.0*varrho12*varrho-varrho6*varrho)/sigma; - temp1 = temp1*(numtyp)24.0*epsilon; - uslj_rsq = temp1*sigma12*sigma12*sigma12*(numtyp)0.5; - numtyp temp2 = gpu_dot3(kappa,r12hat); - 
uslj_rsq = uslj_rsq*ir*ir; - - dUr[0] = temp1*r12hat[0]+uslj_rsq*(kappa[0]-temp2*r12hat[0]); - dUr[1] = temp1*r12hat[1]+uslj_rsq*(kappa[1]-temp2*r12hat[1]); - dUr[2] = temp1*r12hat[2]+uslj_rsq*(kappa[2]-temp2*r12hat[2]); - } - } - } - - // Compute eta - { - eta = (numtyp)2.0*_lshape_(itype)*_lshape_(jtype); - numtyp det_g12 = gpu_det3(g12); - eta = pow(eta/det_g12,gum[1]); - } - } - - numtyp chi, dchi[3]; - { // Compute chi and dchi - - // Compute b12 - numtyp b12[9]; - { - numtyp b2[9]; - gpu_well_times3(jtype,a2,b12); - gpu_transpose_times3(a2,b12,b2); - b12[0]=b2[0]+one_well; - b12[4]=b2[4]+one_well; - b12[8]=b2[8]+one_well; - b12[1]=b2[1]; - b12[2]=b2[2]; - b12[3]=b2[3]; - b12[5]=b2[5]; - b12[6]=b2[6]; - b12[7]=b2[7]; - } - - // compute chi_12 - numtyp iota[3]; - gpu_mldivide3(b12,r12,iota,err_flag); - // -- iota is now iota/r - iota[0]*=ir; - iota[1]*=ir; - iota[2]*=ir; - chi = gpu_dot3(r12hat,iota); - chi = pow(chi*(numtyp)2.0,gum[2]); - - // -- iota is now ok - iota[0]*=r; - iota[1]*=r; - iota[2]*=r; - - numtyp temp1 = gpu_dot3(iota,r12hat); - numtyp temp2 = (numtyp)-4.0*ir*ir*gum[2]*pow(chi,(gum[2]-(numtyp)1.0)/gum[2]); - dchi[0] = temp2*(iota[0]-temp1*r12hat[0]); - dchi[1] = temp2*(iota[1]-temp1*r12hat[1]); - dchi[2] = temp2*(iota[2]-temp1*r12hat[2]); - } - - numtyp temp2 = factor_lj*eta*chi; - if (eflag) - energy+=u_r*temp2; - numtyp temp1 = -eta*u_r*factor_lj; - if (vflag) { - r12[0]*=-1; - r12[1]*=-1; - r12[2]*=-1; - numtyp ft=temp1*dchi[0]-temp2*dUr[0]; - fx+=ft; - virial[0]+=r12[0]*ft; - ft=temp1*dchi[1]-temp2*dUr[1]; - fy+=ft; - virial[1]+=r12[1]*ft; - virial[3]+=r12[0]*ft; - ft=temp1*dchi[2]-temp2*dUr[2]; - fz+=ft; - virial[2]+=r12[2]*ft; - virial[4]+=r12[0]*ft; - virial[5]+=r12[1]*ft; - } else { - fx+=temp1*dchi[0]-temp2*dUr[0]; - fy+=temp1*dchi[1]-temp2*dUr[1]; - fz+=temp1*dchi[2]-temp2*dUr[2]; - } - } // for nbor - - // Store answers - acctyp *ap1=ans+ii; - if (eflag) { - *ap1=energy; - ap1+=ans_pitch; - } - if (vflag) { - for (int i=0; i<6; i++) { - *ap1=virial[i]; - ap1+=ans_pitch; - } - } - *ap1=fx; - ap1+=ans_pitch; - *ap1=fy; - ap1+=ans_pitch; - *ap1=fz; - } // if ii -} - -template -__global__ void kernel_lj(const numtyp *special_lj, const int *dev_nbor, - const int *dev_ij, const int nbor_pitch, acctyp *ans, - size_t ans_pitch, int *err_flag, const bool eflag, - const bool vflag, const int start, const int inum, - const int nall) { - __shared__ numtyp sp_lj[4]; - - // ii indexes the two interacting particles in gi - int ii=threadIdx.x; - if (ii<4) - sp_lj[ii]=special_lj[ii]; - ii+=INT_MUL(blockIdx.x,blockDim.x)+start; - - if (ii(i,0); - numtyp iy=_x_(i,1); - numtyp iz=_x_(i,2); - int itype=_x_(i,7); - - numtyp factor_lj; - for ( ; list(j,7); - - // Compute r12 - numtyp delx = ix-_x_(j,0); - numtyp dely = iy-_x_(j,1); - numtyp delz = iz-_x_(j,2); - numtyp r2inv = delx*delx+dely*dely+delz*delz; - - if (r2inv<_cutsq_(itype,jtype) && - _form_(itype,jtype)==SPHERE_SPHERE) { - r2inv=(numtyp)1.0/r2inv; - numtyp r6inv = r2inv*r2inv*r2inv; - numtyp force = r2inv*r6inv*(_lj1_(itype,jtype).x*r6inv- - _lj1_(itype,jtype).y); - force*=factor_lj; - - fx+=delx*force; - fy+=dely*force; - fz+=delz*force; - - if (eflag) { - numtyp e=r6inv*(_lj3_(itype,jtype).x*r6inv- - _lj3_(itype,jtype).y); - energy+=factor_lj*(e-_offset_(1,1)); - } - if (vflag) { - virial[0] += delx*delx*force; - virial[1] += dely*dely*force; - virial[2] += delz*delz*force; - virial[3] += delx*dely*force; - virial[4] += delx*delz*force; - virial[5] += dely*delz*force; - } - } - - } // for nbor - - // Store answers 
- acctyp *ap1=ans+ii; - if (eflag) { - *ap1+=energy; - ap1+=ans_pitch; - } - if (vflag) { - for (int i=0; i<6; i++) { - *ap1+=virial[i]; - ap1+=ans_pitch; - } - } - *ap1+=fx; - ap1+=ans_pitch; - *ap1+=fy; - ap1+=ans_pitch; - *ap1+=fz; - - } // if ii -} - -template -__global__ void kernel_lj_fast(const numtyp *special_lj, const int *dev_nbor, - const int *dev_ij, const int nbor_pitch, - acctyp *ans, size_t ans_pitch,int *err_flag, - const bool eflag, const bool vflag, - const int start, const int inum, const int nall){ - // ii indexes the two interacting particles in gi - int ii=threadIdx.x; - __shared__ numtyp sp_lj[4]; - __shared__ int form[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; - __shared__ numtyp cutsq[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; - __shared__ numtyp lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; - __shared__ numtyp lj2[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; - __shared__ numtyp lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; - __shared__ numtyp lj4[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; - __shared__ numtyp offset[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; - if (ii<4) - sp_lj[ii]=special_lj[ii]; - if (ii(itype,jtype); - form[ii]=_form_(itype,jtype); - lj1[ii]=_lj1_(itype,jtype).x; - lj2[ii]=_lj1_(itype,jtype).y; - if (eflag) { - lj3[ii]=_lj3_(itype,jtype).x; - lj4[ii]=_lj3_(itype,jtype).y; - offset[ii]=_offset_(itype,jtype); - } - } - ii+=INT_MUL(blockIdx.x,blockDim.x)+start; - - if (ii(i,0); - numtyp iy=_x_(i,1); - numtyp iz=_x_(i,2); - int itype=INT_MUL(MAX_SHARED_TYPES,_x_(i,7)); - - numtyp factor_lj; - for ( ; list(j,7); - - // Compute r12 - numtyp delx = ix-_x_(j,0); - numtyp dely = iy-_x_(j,1); - numtyp delz = iz-_x_(j,2); - numtyp r2inv = delx*delx+dely*dely+delz*delz; - - if (r2inv - -template -GB_GPU_MemoryT::GB_GPU_Memory() : LJ_GPU_MemoryT() { - this->atom.atom_fields(8); - this->atom.ans_fields(13); - this->nbor.packing(true); -} - -template -GB_GPU_MemoryT::~GB_GPU_Memory() { - clear(); -} - -template -int* GB_GPU_MemoryT::init(const int ij_size, const int ntypes, - const double gamma, const double upsilon, - const double mu, double **host_shape, - double **host_well, double **host_cutsq, - double **host_sigma, double **host_epsilon, - double *host_lshape, int **h_form, double **host_lj1, - double **host_lj2, double **host_lj3, - double **host_lj4, double **host_offset, - double *host_special_lj, const int max_nbors, - const bool force_d, const int me) { - if (this->allocated) - clear(); - - LJ_GPU_MemoryT::init(ij_size,ntypes,host_cutsq,host_sigma,host_epsilon, - host_lj1, host_lj2, host_lj3, host_lj4, host_offset, - host_special_lj, max_nbors, me); - - host_form=h_form; - - // Initialize timers for the selected GPU - time_kernel.init(); - time_gayberne.init(); - time_kernel2.init(); - time_gayberne2.init(); - - // Use the write buffer from atom for data initialization - NVC_HostT &host_write=this->atom.host_write; - assert(host_write.numel()>4 && host_write.numel()>ntypes*ntypes*2); - - // Allocate, cast and asynchronous memcpy of constant data - gamma_upsilon_mu.safe_alloc(3); - host_write[0]=static_cast(gamma); - host_write[1]=static_cast(upsilon); - host_write[2]=static_cast(mu); - gamma_upsilon_mu.copy_from_host(host_write.begin()); - - lshape.safe_alloc(ntypes); - lshape.cast_copy(host_lshape,host_write); - lshape.copy_from_host(host_write.begin()); - - // Copy shape, well, sigma, epsilon, and cutsq onto GPU - shape.safe_alloc(ntypes,3); - shape.cast_copy(host_shape[0],host_write); - well.safe_alloc(ntypes,3); - well.cast_copy(host_well[0],host_write); - - // Copy LJ data onto GPU - int 
lj_types=ntypes; - if (lj_types<=MAX_SHARED_TYPES) - lj_types=MAX_SHARED_TYPES; - form.safe_alloc(lj_types,lj_types); - form.copy_2Dfrom_host(host_form[0],ntypes,ntypes); - - // See if we want fast GB-sphere or sphere-sphere calculations - multiple_forms=false; - for (int i=1; imax_atoms); - - // Bind constant data to textures - lshape_bind_texture(lshape); - shape_bind_texture(shape); - well_bind_texture(well); - form_bind_texture(form); - - return this->nbor.host_ij.begin(); -} - -template -void GB_GPU_MemoryT::clear() { - if (!this->allocated) - return; - - int err_flag; - this->dev_error.copy_to_host(&err_flag); - if (err_flag == 1) - std::cerr << "COLLISION BUFFER OVERFLOW OCCURED. INCREASE COLLISION_N " - << "and RECOMPILE.\n"; - else if (err_flag == 2) - std::cerr << "BAD MATRIX INVERSION IN FORCE COMPUTATION.\n"; - - LJ_GPU_MemoryT::clear(); - - shape_unbind_texture(); - well_unbind_texture(); - form_unbind_texture(); - - shape.clear(); - well.clear(); - form.clear(); - lshape.clear(); - gamma_upsilon_mu.clear(); - host_olist.clear(); -} - -template -double GB_GPU_MemoryT::host_memory_usage() { - return this->atom.host_memory_usage(this->max_atoms)+ - this->nbor.host_memory_usage()+4*sizeof(numtyp)+ - sizeof(GB_GPU_Memory)+this->max_atoms*sizeof(int); -} - -template class GB_GPU_Memory; diff --git a/lib/gpu/gb_gpu_memory.h b/lib/gpu/gb_gpu_memory.h deleted file mode 100644 index 8886931c10..0000000000 --- a/lib/gpu/gb_gpu_memory.h +++ /dev/null @@ -1,74 +0,0 @@ -/*************************************************************************** - gb_gpu_memory.h - ------------------- - W. Michael Brown - - Global variables for GPU Gayberne Library - - __________________________________________________________________________ - This file is part of the LAMMPS GPU Library - __________________________________________________________________________ - - begin : Thu Jun 25 2009 - copyright : (C) 2009 by W. Michael Brown - email : wmbrown@sandia.gov - ***************************************************************************/ - -/* ----------------------------------------------------------------------- - Copyright (2009) Sandia Corporation. Under the terms of Contract - DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains - certain rights in this software. This software is distributed under - the GNU General Public License. 
- ----------------------------------------------------------------------- */ - -#ifndef GB_GPU_MEMORY_H -#define GB_GPU_MEMORY_H - -#define MAX_GPU_THREADS 4 -#include "lj_gpu_memory.h" -#define LJ_GPU_MemoryT LJ_GPU_Memory - -enum{SPHERE_SPHERE,SPHERE_ELLIPSE,ELLIPSE_SPHERE,ELLIPSE_ELLIPSE}; - -template -class GB_GPU_Memory : public LJ_GPU_MemoryT { - public: - GB_GPU_Memory(); - ~GB_GPU_Memory(); - - int* init(const int ij_size, const int ntypes, const double gamma, - const double upsilon, const double mu, double **host_shape, - double **host_well, double **host_cutsq, double **host_sigma, - double **host_epsilon, double *host_lshape, int **h_form, - double **host_lj1, double **host_lj2, double **host_lj3, - double **host_lj4, double **host_offset, double *host_special_lj, - const int max_nbors, const bool force_d, const int me); - - void clear(); - - double host_memory_usage(); - - // ---------------------------- DATA ---------------------------- - - // ilist with particles sorted by type - NVC_HostI host_olist; - - // --------------- Const Data for Atoms - NVC_ConstMatT shape, well; - NVC_ConstMatI form; - NVC_VecT lshape, gamma_upsilon_mu; - - - // --------------- Timing Stuff - NVCTimer time_kernel, time_gayberne, time_kernel2, time_gayberne2; - - // True if we want to use fast GB-sphere or sphere-sphere calculations - bool multiple_forms; - int **host_form; - int last_ellipse; - - private: -}; - -#endif - diff --git a/lib/gpu/lj_gpu.cu b/lib/gpu/lj_gpu.cu deleted file mode 100644 index ea5ba440cc..0000000000 --- a/lib/gpu/lj_gpu.cu +++ /dev/null @@ -1,234 +0,0 @@ -/*************************************************************************** - lj_gpu.cu - ------------------- - W. Michael Brown - - Lennard-Jones potential GPU calcultation - - __________________________________________________________________________ - This file is part of the LAMMPS GPU Library - __________________________________________________________________________ - - begin : Tue Aug 4 2009 - copyright : (C) 2009 by W. Michael Brown - email : wmbrown@sandia.gov - ***************************************************************************/ - -/* ----------------------------------------------------------------------- - Copyright (2009) Sandia Corporation. Under the terms of Contract - DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains - certain rights in this software. This software is distributed under - the GNU General Public License. 
- ----------------------------------------------------------------------- */ - -#include -#include -#include "nvc_macros.h" -#include "nvc_timer.h" -#include "nvc_device.h" -#include "lj_gpu_memory.h" -#include "lj_gpu_kernel.h" - -using namespace std; - -static LJ_GPU_Memory LJMF; -#define LJMT LJ_GPU_Memory - -// --------------------------------------------------------------------------- -// Convert something to a string -// --------------------------------------------------------------------------- -#include -template -inline string lj_gpu_toa(const t& in) { - ostringstream o; - o.precision(2); - o << in; - return o.str(); -} - -// --------------------------------------------------------------------------- -// Return string with GPU info -// --------------------------------------------------------------------------- -string lj_gpu_name(const int id, const int max_nbors) { - string name=LJMF.gpu.name(id)+", "+ - lj_gpu_toa(LJMF.gpu.cores(id))+" cores, "+ - lj_gpu_toa(LJMF.gpu.gigabytes(id))+" GB, "+ - lj_gpu_toa(LJMF.gpu.clock_rate(id))+" GHZ, "+ - lj_gpu_toa(LJMF.get_max_atoms(LJMF.gpu.bytes(id), - max_nbors))+" Atoms"; - return name; -} - -// --------------------------------------------------------------------------- -// Allocate memory on host and device and copy constants to device -// --------------------------------------------------------------------------- -int * lj_gpu_init(int &ij_size, const int ntypes, double **cutsq,double **sigma, - double **epsilon, double **host_lj1, double **host_lj2, - double **host_lj3, double **host_lj4, double **offset, - double *special_lj, const int max_nbors, const int gpu_id) { - if (LJMF.gpu.num_devices()==0) - return 0; - - ij_size=IJ_SIZE; - return LJMF.init(ij_size, ntypes, cutsq, sigma, epsilon, host_lj1, host_lj2, - host_lj3, host_lj4, offset, special_lj, max_nbors, gpu_id); -} - -// --------------------------------------------------------------------------- -// Clear memory on host and device -// --------------------------------------------------------------------------- -void lj_gpu_clear() { - LJMF.clear(); -} - -// --------------------------------------------------------------------------- -// copy atom positions and optionally types to device -// --------------------------------------------------------------------------- -template -inline void _lj_gpu_atom(PairGPUAtom &atom, double **host_x, - const int *host_type, const bool rebuild, - cudaStream_t &stream) { - atom.time_atom.start(); - atom.reset_write_buffer(); - - // First row of dev_x is x position, second is y, third is z - atom.add_atom_data(host_x[0],3); - atom.add_atom_data(host_x[0]+1,3); - atom.add_atom_data(host_x[0]+2,3); - - int csize=3; - - // If a rebuild occured, copy type data - if (rebuild) { - atom.add_atom_data(host_type); - csize++; - } - - atom.copy_atom_data(csize,stream); - atom.time_atom.stop(); -} - -void lj_gpu_atom(double **host_x, const int *host_type, const bool rebuild) { - _lj_gpu_atom(LJMF.atom, host_x, host_type, rebuild, LJMF.pair_stream); -} - -// --------------------------------------------------------------------------- -// Signal that we need to transfer a new neighbor list -// --------------------------------------------------------------------------- -template -bool _lj_gpu_reset_nbors(LJMTyp &ljm, const int nall, const int inum, - int *ilist, const int *numj) { - if (nall>ljm.max_atoms) - return false; - - ljm.nbor.time_nbor.start(); - - ljm.atom.nall(nall); - ljm.atom.inum(inum); - ljm.nbor.reset(inum,ilist,numj,ljm.pair_stream); - - 
ljm.nbor.time_nbor.stop(); - return true; -} - -bool lj_gpu_reset_nbors(const int nall, const int inum, int *ilist, - const int *numj) { - return _lj_gpu_reset_nbors(LJMF,nall,inum,ilist,numj); -} - -// --------------------------------------------------------------------------- -// Copy a set of ij_size ij interactions to device and compute energies, -// forces, and torques for those interactions -// --------------------------------------------------------------------------- -template -void _lj_gpu_nbors(LJMTyp &ljm, const int num_ij) { - ljm.nbor.time_nbor.add_to_total(); - // CUDA_SAFE_CALL(cudaStreamSynchronize(ljm.pair_stream)); // Not if timed - - ljm.nbor.time_nbor.start(); - ljm.nbor.add(num_ij,ljm.pair_stream); - ljm.nbor.time_nbor.stop(); -} - -void lj_gpu_nbors(const int num_ij) { - _lj_gpu_nbors(LJMF,num_ij); -} - -// --------------------------------------------------------------------------- -// Calculate energies and forces for all ij interactions -// --------------------------------------------------------------------------- -template -void _lj_gpu(LJMT &ljm, const bool eflag, const bool vflag, const bool rebuild){ - // Compute the block size and grid size to keep all cores busy - const int BX=BLOCK_1D; - - int GX=static_cast(ceil(static_cast(ljm.atom.inum())/BX)); - - ljm.time_pair.start(); - if (ljm.shared_types) - kernel_lj_fast<<>> - (ljm.special_lj.begin(), ljm.nbor.dev_nbor.begin(), - ljm.nbor.ij.begin(), ljm.nbor.dev_nbor.row_size(), - ljm.atom.ans.begin(), ljm.atom.ans.row_size(), eflag, - vflag, ljm.atom.inum(), ljm.atom.nall()); - else - kernel_lj<<>> - (ljm.special_lj.begin(), ljm.nbor.dev_nbor.begin(), - ljm.nbor.ij.begin(), ljm.nbor.dev_nbor.row_size(), - ljm.atom.ans.begin(), ljm.atom.ans.row_size(), eflag, - vflag, ljm.atom.inum(), ljm.atom.nall()); - ljm.time_pair.stop(); -} - -void lj_gpu(const bool eflag, const bool vflag, const bool rebuild) { - _lj_gpu(LJMF,eflag,vflag,rebuild); -} - -// --------------------------------------------------------------------------- -// Get energies and forces to host -// --------------------------------------------------------------------------- -template -double _lj_gpu_forces(LJMT &ljm, double **f, const int *ilist, - const bool eflag, const bool vflag, const bool eflag_atom, - const bool vflag_atom, double *eatom, double **vatom, - double *virial) { - double evdw; - - ljm.atom.time_answer.start(); - ljm.atom.copy_answers(eflag,vflag,ljm.pair_stream); - - ljm.atom.time_atom.add_to_total(); - ljm.nbor.time_nbor.add_to_total(); - ljm.time_pair.add_to_total(); - // CUDA_SAFE_CALL(cudaStreamSynchronize(ljm.pair_stream)); // not if timed - - evdw=ljm.atom.energy_virial(ilist,eflag_atom,vflag_atom,eatom,vatom,virial); - ljm.atom.add_forces(ilist,f); - ljm.atom.time_answer.stop(); - ljm.atom.time_answer.add_to_total(); - return evdw; -} - -double lj_gpu_forces(double **f, const int *ilist, const bool eflag, - const bool vflag, const bool eflag_atom, - const bool vflag_atom, double *eatom, double **vatom, - double *virial) { - return _lj_gpu_forces - (LJMF,f,ilist,eflag,vflag,eflag_atom,vflag_atom,eatom,vatom,virial); -} - -void lj_gpu_time() { - cout.precision(4); - cout << "Atom copy: " << LJMF.atom.time_atom.total_seconds() << " s.\n"; - cout << "Neighbor copy: " << LJMF.nbor.time_nbor.total_seconds() << " s.\n"; - cout << "LJ calc: " << LJMF.time_pair.total_seconds() << " s.\n"; - cout << "Answer copy: " << LJMF.atom.time_answer.total_seconds() << " s.\n"; -} - -int lj_gpu_num_devices() { - return LJMF.gpu.num_devices(); -} - 
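Taken together, the entry points defined in this file are meant to be driven once per timestep from the pair style on the CPU: copy atom data, push neighbor data when the list was rebuilt, launch the kernel, then read energies, virials, and forces back. The sketch below only illustrates that inferred call order; compute_step() and its arguments (x, type, f, ilist, numj, ...) are stand-ins for the LAMMPS-side arrays and are not part of this library.

// Hypothetical per-timestep driver using the lj_gpu_* entry points above.
double compute_step(double **x, int *type, double **f, const int nall,
                    const int inum, int *ilist, int *numj, const bool rebuild,
                    const bool eflag, const bool vflag,
                    double *eatom, double **vatom, double *virial) {
  lj_gpu_atom(x, type, rebuild);           // positions (and types on rebuild) to GPU
  if (rebuild) {
    // returns false if nall exceeds the GPU atom limit
    lj_gpu_reset_nbors(nall, inum, ilist, numj);
    // ... refill the host ij buffer returned by lj_gpu_init() in chunks of
    //     ij_size, calling lj_gpu_nbors(num_ij) after each chunk ...
  }
  lj_gpu(eflag, vflag, rebuild);           // launch kernel_lj or kernel_lj_fast
  return lj_gpu_forces(f, ilist, eflag, vflag, /*eflag_atom=*/false,
                       /*vflag_atom=*/false, eatom, vatom, virial);
}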
-double lj_gpu_bytes() { - return LJMF.host_memory_usage(); -} diff --git a/lib/gpu/lj_gpu_kernel.h b/lib/gpu/lj_gpu_kernel.h deleted file mode 100644 index 5f3f0ff7c6..0000000000 --- a/lib/gpu/lj_gpu_kernel.h +++ /dev/null @@ -1,248 +0,0 @@ -/*************************************************************************** - lj_gpu_kernel.cu - ------------------- - W. Michael Brown - - Routines that actually perform the force computation - - __________________________________________________________________________ - This file is part of the LAMMPS GPU Library - __________________________________________________________________________ - - begin : Tue Aug 4 2009 - copyright : (C) 2009 by W. Michael Brown - email : wmbrown@sandia.gov - ***************************************************************************/ - -/* ----------------------------------------------------------------------- - Copyright (2009) Sandia Corporation. Under the terms of Contract - DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains - certain rights in this software. This software is distributed under - the GNU General Public License. - ----------------------------------------------------------------------- */ - -#ifndef LJ_GPU_KERNEL -#define LJ_GPU_KERNEL - -template -__global__ void kernel_lj(const numtyp *special_lj, const int *dev_nbor, - const int *dev_ij, const int nbor_pitch, acctyp *ans, - size_t ans_pitch, const bool eflag, - const bool vflag, const int inum, const int nall) { - __shared__ numtyp sp_lj[4]; - - // ii indexes the two interacting particles in gi - int ii=threadIdx.x; - if (ii<4) - sp_lj[ii]=special_lj[ii]; - ii+=INT_MUL(blockIdx.x,blockDim.x); - - if (ii(i,0); - numtyp iy=_x_(i,1); - numtyp iz=_x_(i,2); - int itype=_x_(i,3); - - numtyp factor_lj; - for ( ; list(j,3); - - // Compute r12 - numtyp delx = ix-_x_(j,0); - numtyp dely = iy-_x_(j,1); - numtyp delz = iz-_x_(j,2); - numtyp r2inv = delx*delx+dely*dely+delz*delz; - - if (r2inv<_cutsq_(itype,jtype)) { - r2inv=(numtyp)1.0/r2inv; - numtyp r6inv =r2inv*r2inv*r2inv; - numtyp force =factor_lj*r2inv*r6inv*(_lj1_(itype,jtype).x*r6inv- - _lj1_(itype,jtype).y); - - fx+=delx*force; - fy+=dely*force; - fz+=delz*force; - - if (eflag) { - numtyp e=r6inv*(_lj3_(itype,jtype).x*r6inv- - _lj3_(itype,jtype).y); - energy+=factor_lj*(e-_offset_(1,1)); - } - if (vflag) { - virial[0] += delx*delx*force; - virial[1] += dely*dely*force; - virial[2] += delz*delz*force; - virial[3] += delx*dely*force; - virial[4] += delx*delz*force; - virial[5] += dely*delz*force; - } - } - - } // for nbor - - // Store answers - acctyp *ap1=ans+ii; - if (eflag) { - *ap1=energy; - ap1+=ans_pitch; - } - if (vflag) { - for (int i=0; i<6; i++) { - *ap1=virial[i]; - ap1+=ans_pitch; - } - } - *ap1=fx; - ap1+=ans_pitch; - *ap1=fy; - ap1+=ans_pitch; - *ap1=fz; - - } // if ii -} - -template -__global__ void kernel_lj_fast(const numtyp *special_lj, const int *dev_nbor, - const int *dev_ij, const int nbor_pitch, - acctyp *ans, size_t ans_pitch,const bool eflag, - const bool vflag, const int inum, - const int nall) { - - // ii indexes the two interacting particles in gi - int ii=threadIdx.x; - __shared__ numtyp sp_lj[4]; - __shared__ numtyp cutsq[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; - __shared__ numtyp lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; - __shared__ numtyp lj2[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; - __shared__ numtyp lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; - __shared__ numtyp lj4[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; - __shared__ numtyp offset[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; 
- if (ii<4) - sp_lj[ii]=special_lj[ii]; - if (ii(itype,jtype); - lj1[ii]=_lj1_(itype,jtype).x; - lj2[ii]=_lj1_(itype,jtype).y; - if (eflag) { - lj3[ii]=_lj3_(itype,jtype).x; - lj4[ii]=_lj3_(itype,jtype).y; - offset[ii]=_offset_(itype,jtype); - } - } - ii+=INT_MUL(blockIdx.x,blockDim.x); - - if (ii(i,0); - numtyp iy=_x_(i,1); - numtyp iz=_x_(i,2); - int itype=INT_MUL(MAX_SHARED_TYPES,_x_(i,3)); - - numtyp factor_lj; - for ( ; list(j,3); - - // Compute r12 - numtyp delx = ix-_x_(j,0); - numtyp dely = iy-_x_(j,1); - numtyp delz = iz-_x_(j,2); - numtyp r2inv = delx*delx+dely*dely+delz*delz; - - if (r2inv - -template -int LJ_GPU_MemoryT::bytes_per_atom(const int max_nbors) const { - return atom.bytes_per_atom()+nbor.bytes_per_atom(max_nbors); -} - -template -int LJ_GPU_MemoryT::get_max_atoms(const int gpu_bytes, const int max_nbors) { - int matoms=static_cast(PERCENT_GPU_MEMORY*gpu_bytes/ - bytes_per_atom(max_nbors)); - if (matoms>MAX_ATOMS) - matoms=MAX_ATOMS; - return matoms; -} - -template -int* LJ_GPU_MemoryT::init(const int ij_size, const int ntypes, - double **host_cutsq, double **host_sigma, - double **host_epsilon, double **host_lj1, - double **host_lj2, double **host_lj3, - double **host_lj4, double **host_offset, - double *host_special_lj, const int max_nbors, - const int me) { - if (allocated) - clear(); - - if (me>=gpu.num_devices()) - return 0; - gpu.set(me); - if (gpu.revision()<1.0) - return 0; - - // Initialize timers for the selected GPU - time_pair.init(); - - // Initialize atom and nbor data - max_atoms=get_max_atoms(gpu.bytes(),max_nbors); - atom.init(max_atoms); - nbor.init(ij_size,max_atoms,max_nbors); - - // Get a stream for computing pair potentials - CUDA_SAFE_CALL(cudaStreamCreate(&pair_stream)); - - // Use the write buffer from atom for data initialization - NVC_HostT &host_write=atom.host_write; - assert(host_write.numel()>4 && host_write.numel()>ntypes*ntypes*2); - - // Copy data for bonded interactions - special_lj.safe_alloc(4); - special_lj.cast_copy(host_special_lj,host_write); - - // Copy sigma, epsilon, and cutsq onto GPU - sigma.safe_alloc(ntypes,ntypes); - sigma.cast_copy(host_sigma[0],host_write); - epsilon.safe_alloc(ntypes,ntypes); - epsilon.cast_copy(host_epsilon[0],host_write); - cutsq.safe_alloc(ntypes,ntypes); - cutsq.cast_copy(host_cutsq[0],host_write); - - // If atom type constants fit in shared memory use fast kernel - int lj_types=ntypes; - shared_types=false; - if (lj_types<=MAX_SHARED_TYPES) { - lj_types=MAX_SHARED_TYPES; - shared_types=true; - } - offset.safe_alloc(lj_types,lj_types); - offset.cast_copy2D(host_offset[0],host_write,ntypes,ntypes); - double *t1=host_lj1[0]; - double *t2=host_lj2[0]; - for (int i=0; i::vec2 *> (host_write.begin()), - ntypes,ntypes); - t1=host_lj3[0]; - t2=host_lj4[0]; - for (int i=0; i::vec2 *> (host_write.begin()), - ntypes,ntypes); - - // Bind constant data to textures - sigma_bind_texture(sigma); - epsilon_bind_texture(epsilon); - cutsq_bind_texture(cutsq); - offset_bind_texture(offset); - lj1_bind_texture::vec2>(lj1); - lj3_bind_texture::vec2>(lj3); - - dev_error.safe_alloc(1); - dev_error.zero(); - - allocated=true; - - return nbor.host_ij.begin(); -} - -template -void LJ_GPU_MemoryT::clear() { - if (!allocated) - return; - allocated=false; - - // Check for any pair style specific errors here - int err_flag; - dev_error.copy_to_host(&err_flag); - - atom.clear(); - nbor.clear(); - - sigma_unbind_texture(); - epsilon_unbind_texture(); - cutsq_unbind_texture(); - offset_unbind_texture(); - 
lj1_unbind_texture::vec2>(); - lj3_unbind_texture::vec2>(); - - CUDA_SAFE_CALL(cudaStreamDestroy(pair_stream)); - - dev_error.clear(); - sigma.clear(); - epsilon.clear(); - special_lj.clear(); - cutsq.clear(); - offset.clear(); - lj1.clear(); - lj3.clear(); -} - -template -double LJ_GPU_MemoryT::host_memory_usage() const { - return atom.host_memory_usage(max_atoms)+nbor.host_memory_usage()+ - sizeof(LJ_GPU_Memory); -} - -template class LJ_GPU_Memory; diff --git a/lib/gpu/lj_gpu_memory.h b/lib/gpu/lj_gpu_memory.h deleted file mode 100644 index a3b4358192..0000000000 --- a/lib/gpu/lj_gpu_memory.h +++ /dev/null @@ -1,88 +0,0 @@ -/*************************************************************************** - lj_gpu_memory.h - ------------------- - W. Michael Brown - - Global variables for GPU Lennard-Jones Library - - __________________________________________________________________________ - This file is part of the LAMMPS GPU Library - __________________________________________________________________________ - - begin : Tue Aug 4 2009 - copyright : (C) 2009 by W. Michael Brown - email : wmbrown@sandia.gov - ***************************************************************************/ - -/* ----------------------------------------------------------------------- - Copyright (2009) Sandia Corporation. Under the terms of Contract - DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains - certain rights in this software. This software is distributed under - the GNU General Public License. - ----------------------------------------------------------------------- */ - -#ifndef LJ_GPU_MEMORY_H -#define LJ_GPU_MEMORY_H - -#include "nvc_device.h" -#include "pair_gpu_atom.h" -#include "pair_gpu_nbor.h" - -#define BLOCK_1D 64 -#define MAX_SHARED_TYPES 8 -#define PERCENT_GPU_MEMORY 0.7 - -template -class LJ_GPU_Memory { - public: - LJ_GPU_Memory() : allocated(false) {} - ~LJ_GPU_Memory() { clear(); } - - /// Allocate memory on host and device - int* init(const int ij_size, const int ntypes, double **host_cutsq, - double **host_sigma, double **host_epsilon, - double **host_lj1, double **host_lj2, double **host_lj3, - double **host_lj4, double **host_offset, double *host_special_lj, - const int max_nbors, const int me); - /// Free any memory on host and device - void clear(); - - /// Returns memory usage on GPU per atom - int bytes_per_atom(const int max_nbors) const; - /// Maximum number of atoms that can be stored on GPU - int get_max_atoms(const int gpu_bytes, const int max_nbors); - /// Total host memory used by library - double host_memory_usage() const; - - // ------------------------- DATA ----------------------------- - - // Device Properties - NVCDevice gpu; - // Device Error Flag - NVC_VecI dev_error; - // Stream for asynchronous work - cudaStream_t pair_stream; - - // Atom Data - PairGPUAtom atom; - // Neighbor Data - PairGPUNbor nbor; - - // --------------- Const Data for Atoms - NVC_ConstMatT sigma, epsilon, cutsq, offset; - NVC_ConstMat< typename cu_vec_traits::vec2 > lj1, lj3; - NVC_VecT special_lj; - - size_t max_atoms; - - // Timing for pair calculation - NVCTimer time_pair; - - // If atom type constants fit in shared memory, use fast kernels - bool shared_types; - - protected: - bool allocated; -}; - -#endif diff --git a/lib/gpu/nvc_device.cu b/lib/gpu/nvc_device.cu deleted file mode 100644 index 01a2db1bd7..0000000000 --- a/lib/gpu/nvc_device.cu +++ /dev/null @@ -1,96 +0,0 @@ -/*************************************************************************** - nvc_device.cu - 
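One detail worth noting in init() above: the lj1/lj2 and lj3/lj4 coefficient tables are interleaved element by element on the host before the copy, so each table pair becomes a single two-component texture fetch (_lj1_ and _lj3_) in the kernels. The fragment below is only a stripped-down sketch of that packing; pack_pair_coeffs() and its pointer arguments are illustrative, and a plain float2 stands in for cu_vec_traits<numtyp>::vec2.

// Sketch: interleave two ntypes*ntypes coefficient tables into one float2 table
// so the kernel reads both coefficients with a single texture fetch.
#include <cuda_runtime.h>

void pack_pair_coeffs(const double *t1, const double *t2, float2 *packed,
                      const int ntypes) {
  for (int i = 0; i < ntypes * ntypes; ++i) {
    packed[i].x = static_cast<float>(t1[i]);   // e.g. lj1 = 48*eps*sigma^12
    packed[i].y = static_cast<float>(t2[i]);   // e.g. lj2 = 24*eps*sigma^6
  }
}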
------------------- - W. Michael Brown - - Utilities for dealing with cuda devices - - __________________________________________________________________________ - This file is part of the NVC Library - __________________________________________________________________________ - - begin : Wed Jan 28 2009 - copyright : (C) 2009 by W. Michael Brown - email : wmbrown@sandia.gov - ***************************************************************************/ - -/* ----------------------------------------------------------------------- - Copyright (2009) Sandia Corporation. Under the terms of Contract - DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains - certain rights in this software. This software is distributed under - the GNU General Public License. - ----------------------------------------------------------------------- */ - -#include -#include -#include -#include "nvc_macros.h" -#include "nvc_device.h" - -// Grabs the properties for all devices -NVCDevice::NVCDevice() { - CUDA_SAFE_CALL(cudaGetDeviceCount(&_num_devices)); - for (int dev=0; dev<_num_devices; ++dev) { - cudaDeviceProp deviceProp; - CUDA_SAFE_CALL(cudaGetDeviceProperties(&deviceProp, dev)); - if (deviceProp.major == 9999 && deviceProp.minor == 9999) - break; - _properties.push_back(deviceProp); - } - _device=0; -} - -// Set the CUDA device to the specified device number -void NVCDevice::set(int num) { - if (_device==num) - return; - cudaThreadExit(); - CUDA_SAFE_CALL(cudaSetDevice(num)); - _device=num; -} - -// List all devices along with all properties -void NVCDevice::print_all(ostream &out) { - if (num_devices() == 0) - printf("There is no device supporting CUDA\n"); - for (int i=0; i= 2000 - printf(" Number of multiprocessors: %d\n", - _properties[i].multiProcessorCount); - printf(" Number of cores: %d\n",cores(i)); - #endif - printf(" Total amount of constant memory: %u bytes\n", - _properties[i].totalConstMem); - printf(" Total amount of shared memory per block: %u bytes\n", - _properties[i].sharedMemPerBlock); - printf(" Total number of registers available per block: %d\n", - _properties[i].regsPerBlock); - printf(" Warp size: %d\n", - _properties[i].warpSize); - printf(" Maximum number of threads per block: %d\n", - _properties[i].maxThreadsPerBlock); - printf(" Maximum sizes of each dimension of a block: %d x %d x %d\n", - _properties[i].maxThreadsDim[0], - _properties[i].maxThreadsDim[1], - _properties[i].maxThreadsDim[2]); - printf(" Maximum sizes of each dimension of a grid: %d x %d x %d\n", - _properties[i].maxGridSize[0], - _properties[i].maxGridSize[1], - _properties[i].maxGridSize[2]); - printf(" Maximum memory pitch: %u bytes\n", - _properties[i].memPitch); - printf(" Texture alignment: %u bytes\n", - _properties[i].textureAlignment); - printf(" Clock rate: %.2f GHz\n", - clock_rate(i)); - #if CUDART_VERSION >= 2000 - printf(" Concurrent copy and execution: %s\n", - _properties[i].deviceOverlap ? "Yes" : "No"); - #endif - } -} - diff --git a/lib/gpu/nvc_device.h b/lib/gpu/nvc_device.h deleted file mode 100644 index 1ba5f2bc4c..0000000000 --- a/lib/gpu/nvc_device.h +++ /dev/null @@ -1,92 +0,0 @@ -/*************************************************************************** - nvc_device.h - ------------------- - W. 
Michael Brown - - Utilities for dealing with cuda devices - - __________________________________________________________________________ - This file is part of the NVC Library - __________________________________________________________________________ - - begin : Wed Jan 28 2009 - copyright : (C) 2009 by W. Michael Brown - email : wmbrown@sandia.gov - ***************************************************************************/ - -/* ----------------------------------------------------------------------- - Copyright (2009) Sandia Corporation. Under the terms of Contract - DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains - certain rights in this software. This software is distributed under - the GNU General Public License. - ----------------------------------------------------------------------- */ - -#ifndef NVC_DEVICE -#define NVC_DEVICE - -#include -#include -#include - -using namespace std; - -/// Class for looking at device properties -/** \note Calls to change the device outside of the class results in incorrect - * behavior - * \note There is no error checking for indexing past the number of devices **/ -class NVCDevice { - public: - /// Grabs the properties for all devices - NVCDevice(); - - /// Return the number of devices that support CUDA - inline int num_devices() { return _properties.size(); } - - /// Set the CUDA device to the specified device number - void set(int num); - - /// Get the current device number - inline int device_num() { return _device; } - - /// Get the current CUDA device name - inline string name() { return name(_device); } - /// Get the CUDA device name - inline string name(const int i) { return string(_properties[i].name); } - - /// Get the number of cores in the current device - inline unsigned cores() { return cores(_device); } - /// Get the number of cores - inline unsigned cores(const int i) - { return _properties[i].multiProcessorCount*8; } - - /// Get the gigabytes of global memory in the current device - inline double gigabytes() { return gigabytes(_device); } - /// Get the gigabytes of global memory - inline double gigabytes(const int i) - { return static_cast(_properties[i].totalGlobalMem)/1073741824; } - - /// Get the bytes of global memory in the current device - inline size_t bytes() { return bytes(_device); } - /// Get the bytes of global memory - inline size_t bytes(const int i) { return _properties[i].totalGlobalMem; } - - /// Return the GPGPU revision number for current device - inline double revision() { return revision(_device); } - /// Return the GPGPU revision number - inline double revision(const int i) - { return static_cast(_properties[i].minor)/10+_properties[i].major;} - - /// Clock rate in GHz for current device - inline double clock_rate() { return clock_rate(_device); } - /// Clock rate in GHz - inline double clock_rate(const int i) { return _properties[i].clockRate*1e-6;} - - /// List all devices along with all properties - void print_all(ostream &out); - - private: - int _device, _num_devices; - vector _properties; -}; - -#endif diff --git a/lib/gpu/nvc_get_devices.cu b/lib/gpu/nvc_get_devices.cu deleted file mode 100644 index a1d35232e6..0000000000 --- a/lib/gpu/nvc_get_devices.cu +++ /dev/null @@ -1,31 +0,0 @@ -/*************************************************************************** - nvc_get_devices.h - ------------------- - W. 
Michael Brown - - List properties of cuda devices - - __________________________________________________________________________ - This file is part of the NVC Library - __________________________________________________________________________ - - begin : Wed Jan 28 2009 - copyright : (C) 2009 by W. Michael Brown - email : wmbrown@sandia.gov - ***************************************************************************/ - -/* ----------------------------------------------------------------------- - Copyright (2009) Sandia Corporation. Under the terms of Contract - DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains - certain rights in this software. This software is distributed under - the GNU General Public License. - ----------------------------------------------------------------------- */ - -#include "nvc_device.h" - -int main(int argc, char** argv) { - NVCDevice gpu; - gpu.print_all(cout); - return 0; -} - diff --git a/lib/gpu/nvc_macros.h b/lib/gpu/nvc_macros.h deleted file mode 100644 index 56bb5c64ce..0000000000 --- a/lib/gpu/nvc_macros.h +++ /dev/null @@ -1,121 +0,0 @@ -#ifndef NVC_MACROS_H -#define NVC_MACROS_H - -#include -#include "math_constants.h" -#define INT_MUL(x,y) (__mul24(x,y)) -//#define INT_MUL(x,y) ((x)*(y)) - -template -static __inline__ __device__ numbr cuda_inf() { return CUDART_INF_F; } - -#ifdef CUDA_DOUBLE -template <> -static __inline__ __device__ double cuda_inf() { return CUDART_INF; } -#endif - -template -static __inline__ __device__ numbr cuda_zero() { return 0.0; } - -template <> -static __inline__ __device__ float cuda_zero() { return 0.0f; } - -#ifdef DEBUG - -# define CU_SAFE_CALL_NO_SYNC( call ) do { \ - CUresult err = call; \ - if( CUDA_SUCCESS != err) { \ - fprintf(stderr, "Cuda driver error %x in file '%s' in line %i.\n", \ - err, __FILE__, __LINE__ ); \ - exit(EXIT_FAILURE); \ - } } while (0) - -# define CU_SAFE_CALL( call ) do { \ - CU_SAFE_CALL_NO_SYNC(call); \ - CUresult err = cuCtxSynchronize(); \ - if( CUDA_SUCCESS != err) { \ - fprintf(stderr, "Cuda driver error %x in file '%s' in line %i.\n", \ - err, __FILE__, __LINE__ ); \ - exit(EXIT_FAILURE); \ - } } while (0) - -# define CUDA_SAFE_CALL_NO_SYNC( call) do { \ - cudaError err = call; \ - if( cudaSuccess != err) { \ - fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n", \ - __FILE__, __LINE__, cudaGetErrorString( err) ); \ - exit(EXIT_FAILURE); \ - } } while (0) - -# define CUDA_SAFE_CALL( call) do { \ - CUDA_SAFE_CALL_NO_SYNC(call); \ - cudaError err = cudaThreadSynchronize(); \ - if( cudaSuccess != err) { \ - fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n", \ - __FILE__, __LINE__, cudaGetErrorString( err) ); \ - exit(EXIT_FAILURE); \ - } } while (0) - -# define CUFFT_SAFE_CALL( call) do { \ - cufftResult err = call; \ - if( CUFFT_SUCCESS != err) { \ - fprintf(stderr, "CUFFT error in file '%s' in line %i.\n", \ - __FILE__, __LINE__); \ - exit(EXIT_FAILURE); \ - } } while (0) - -# define CUT_SAFE_CALL( call) \ - if( CUTTrue != call) { \ - fprintf(stderr, "Cut error in file '%s' in line %i.\n", \ - __FILE__, __LINE__); \ - exit(EXIT_FAILURE); \ - } - - //! 
Check for CUDA error -# define CUT_CHECK_ERROR(errorMessage) do { \ - cudaError_t err = cudaGetLastError(); \ - if( cudaSuccess != err) { \ - fprintf(stderr, "Cuda error: %s in file '%s' in line %i : %s.\n", \ - errorMessage, __FILE__, __LINE__, cudaGetErrorString( err) );\ - exit(EXIT_FAILURE); \ - } \ - err = cudaThreadSynchronize(); \ - if( cudaSuccess != err) { \ - fprintf(stderr, "Cuda error: %s in file '%s' in line %i : %s.\n", \ - errorMessage, __FILE__, __LINE__, cudaGetErrorString( err) );\ - exit(EXIT_FAILURE); \ - } } while (0) - - //! Check for malloc error -# define CUT_SAFE_MALLOC( mallocCall ) do{ \ - if( !(mallocCall)) { \ - fprintf(stderr, "Host malloc failure in file '%s' in line %i\n", \ - __FILE__, __LINE__); \ - exit(EXIT_FAILURE); \ - } } while(0); - - //! Check if conditon is true (flexible assert) -# define CUT_CONDITION( val) \ - if( CUTFalse == cutCheckCondition( val, __FILE__, __LINE__)) { \ - exit(EXIT_FAILURE); \ - } - -#else // not DEBUG - -#define CUT_BANK_CHECKER( array, index) array[index] - - // void macros for performance reasons -# define CUT_CHECK_ERROR(errorMessage) -# define CUT_CHECK_ERROR_GL() -# define CUT_CONDITION( val) -# define CU_SAFE_CALL_NO_SYNC( call) call -# define CU_SAFE_CALL( call) call -# define CUDA_SAFE_CALL_NO_SYNC( call) call -# define CUDA_SAFE_CALL( call) call -# define CUT_SAFE_CALL( call) call -# define CUFFT_SAFE_CALL( call) call -# define CUT_SAFE_MALLOC( mallocCall ) mallocCall - -#endif - -#endif diff --git a/lib/gpu/nvc_memory.h b/lib/gpu/nvc_memory.h deleted file mode 100644 index a83178985c..0000000000 --- a/lib/gpu/nvc_memory.h +++ /dev/null @@ -1,447 +0,0 @@ -/*************************************************************************** - nvc_memory.h - ------------------- - W. Michael Brown - - Routines for memory management on CUDA devices - - __________________________________________________________________________ - This file is part of the NVC Library - __________________________________________________________________________ - - begin : Thu Jun 25 2009 - copyright : (C) 2009 by W. Michael Brown - email : wmbrown@sandia.gov - ***************************************************************************/ - -/* ----------------------------------------------------------------------- - Copyright (2009) Sandia Corporation. Under the terms of Contract - DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains - certain rights in this software. This software is distributed under - the GNU General Public License. 
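The wrappers in nvc_macros.h above give every CUDA call two behaviors from a single spelling: with DEBUG defined, each call is followed by a synchronize and an error check that aborts with file and line, while in an optimized build the macro reduces to the bare call. A small usage sketch follows; the function and buffer names are arbitrary, and only the CUDA_SAFE_CALL / CUT_CHECK_ERROR macros come from the header.

// Sketch: the same source lines are error-checked in DEBUG builds and free otherwise.
#include <cuda_runtime.h>
#include "nvc_macros.h"

void alloc_and_zero(float **dev_buf, const size_t n) {
  CUDA_SAFE_CALL(cudaMalloc((void **)dev_buf, n * sizeof(float)));
  CUDA_SAFE_CALL(cudaMemset(*dev_buf, 0, n * sizeof(float)));
  CUT_CHECK_ERROR("alloc_and_zero failed");   // no-op unless compiled with DEBUG
}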
- ----------------------------------------------------------------------- */ - -#ifndef NVC_MEMORY_H -#define NVC_MEMORY_H - -#include - -#define NVC_HostT NVC_Host -#define NVC_HostD NVC_Host -#define NVC_HostS NVC_Host -#define NVC_HostI NVC_Host - -#define NVC_VecT NVC_Vec -#define NVC_VecD NVC_Vec -#define NVC_VecS NVC_Vec -#define NVC_VecI NVC_Vec -#define NVC_VecI2 NVC_Vec -#define NVC_VecU2 NVC_Vec - -#define NVC_MatT NVC_Mat -#define NVC_MatD NVC_Mat -#define NVC_MatS NVC_Mat -#define NVC_MatI NVC_Mat - -#define NVC_ConstMatT NVC_ConstMat -#define NVC_ConstMatD NVC_ConstMat -#define NVC_ConstMatS NVC_ConstMat -#define NVC_ConstMatI NVC_ConstMat -#define NVC_ConstMatD2 NVC_ConstMat - -namespace NVC { - -// Get a channel for float array -template -inline void cuda_gb_get_channel(cudaChannelFormatDesc &channel) { - channel = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat); -} - -// Get a channel for float2 array -template <> -inline void cuda_gb_get_channel(cudaChannelFormatDesc &channel) { - channel = cudaCreateChannelDesc(32, 32, 0, 0, cudaChannelFormatKindFloat); -} - -// Get a channel for double array -template <> -inline void cuda_gb_get_channel(cudaChannelFormatDesc &channel) { - channel = cudaCreateChannelDesc(32, 32, 0, 0, cudaChannelFormatKindSigned); -} - -// Get a channel for double array -template <> -inline void cuda_gb_get_channel(cudaChannelFormatDesc &channel) { - channel = cudaCreateChannelDesc(32, 32, 32, 32, cudaChannelFormatKindSigned); -} - -// Get a channel for int array -template <> -inline void cuda_gb_get_channel(cudaChannelFormatDesc &channel) { - channel = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindSigned); -} - -} - -/// Page-locked Row Vector on Host -template -class NVC_Host { - public: - NVC_Host() { _cols=0; } - ~NVC_Host() { if (_cols>0) CUDA_SAFE_CALL(cudaFreeHost(_array)); } - - // Allocate page-locked memory with fast write/slow read on host - inline void safe_alloc_w(const size_t cols) { - _cols=cols; - _row_bytes=cols*sizeof(numtyp); - CUDA_SAFE_CALL(cudaHostAlloc((void **)&_array,_row_bytes, - cudaHostAllocWriteCombined)); - _end=_array+cols; - } - - // Allocate page-locked memory with fast read/write on host - inline void safe_alloc_rw(const size_t cols) { - _cols=cols; - _row_bytes=cols*sizeof(numtyp); - CUDA_SAFE_CALL(cudaMallocHost((void **)&_array,_row_bytes)); - _end=_array+cols; - } - - /// Free any memory associated with device - inline void clear() - { if (_cols>0) { _cols=0; CUDA_SAFE_CALL(cudaFreeHost(_array)); } } - - /// Set each element to zero - inline void zero() { memset(_array,0,row_bytes()); } - - /// Set first n elements to zero - inline void zero(const int n) { memset(_array,0,n*sizeof(numtyp)); } - - inline numtyp * begin() { return _array; } - inline const numtyp * begin() const { return _array; } - inline numtyp * end() { return _end; } - inline const numtyp * end() const { return _end; } - - inline size_t numel() const { return _cols; } - inline size_t rows() const { return 1; } - inline size_t cols() const { return _cols; } - inline size_t row_size() const { return _cols; } - inline size_t row_bytes() const { return _row_bytes; } - - inline numtyp & operator[](const int i) { return _array[i]; } - inline const numtyp & operator[](const int i) const { return _array[i]; } - - /// Copy from device (numel is not bytes) - inline void copy_from_device(const numtyp *device_p, size_t numel) { - CUDA_SAFE_CALL(cudaMemcpy(_array,device_p,numel*sizeof(numtyp), - cudaMemcpyDeviceToHost)); - } - - /// Copy 
to device (numel is not bytes) - inline void copy_to_device(numtyp *device_p, size_t numel) { - CUDA_SAFE_CALL(cudaMemcpy(device_p,_array,numel*sizeof(numtyp), - cudaMemcpyHostToDevice)); - } - - /// Copy to 2D matrix on device (numel is not bytes) - inline void copy_to_2Ddevice(numtyp *device_p, const size_t dev_row_size, - const size_t rows, const size_t cols) { - CUDA_SAFE_CALL(cudaMemcpy2D(device_p,dev_row_size*sizeof(numtyp), - _array,cols*sizeof(numtyp), - cols*sizeof(numtyp),rows, - cudaMemcpyHostToDevice)); - } - - /// Asynchronous copy from device (numel is not bytes) - inline void copy_from_device(const numtyp *device_p, size_t numel, - cudaStream_t &stream) { - CUDA_SAFE_CALL(cudaMemcpyAsync(_array,device_p,numel*sizeof(numtyp), - cudaMemcpyDeviceToHost,stream)); - } - - /// Asynchronous copy to device (numel is not bytes) - inline void copy_to_device(numtyp *device_p, size_t numel, - cudaStream_t &stream) { - CUDA_SAFE_CALL(cudaMemcpyAsync(device_p,_array,numel*sizeof(numtyp), - cudaMemcpyHostToDevice,stream)); - } - - /// Asynchronous copy to 2D matrix on device (numel is not bytes) - inline void copy_to_2Ddevice(numtyp *device_p, const size_t dev_row_size, - const size_t rows, const size_t cols, - cudaStream_t &stream) { - CUDA_SAFE_CALL(cudaMemcpy2DAsync(device_p,dev_row_size*sizeof(numtyp), - _array,cols*sizeof(numtyp), - cols*sizeof(numtyp),rows, - cudaMemcpyHostToDevice,stream)); - } - - private: - numtyp *_array, *_end; - size_t _row_bytes, _row_size, _rows, _cols; -}; - -/// Row vector on device -template -class NVC_Vec { - public: - NVC_Vec() { _cols=0; } - ~NVC_Vec() { if (_cols>0) CUDA_SAFE_CALL(cudaFree(_array)); } - - // Row vector on device - inline void safe_alloc(const size_t cols) { - _cols=cols; - _row_bytes=cols*sizeof(numtyp); - CUDA_SAFE_CALL(cudaMalloc((void **)&_array,_row_bytes)); - _end=_array+cols; - } - - /// Free any memory associated with device - inline void clear() - { if (_cols>0) { _cols=0; CUDA_SAFE_CALL(cudaFree(_array)); } } - - /// Set each element to zero - inline void zero() { CUDA_SAFE_CALL(cudaMemset(_array,0,row_bytes())); } - - inline numtyp * begin() { return _array; } - inline const numtyp * begin() const { return _array; } - inline numtyp * end() { return _end; } - inline const numtyp * end() const { return _end; } - - inline size_t numel() const { return _cols; } - inline size_t rows() const { return 1; } - inline size_t cols() const { return _cols; } - inline size_t row_size() const { return _cols; } - inline size_t row_bytes() const { return _row_bytes; } - - /// Copy from host - inline void copy_from_host(const numtyp *host_p) - { CUDA_SAFE_CALL(cudaMemcpy(_array,host_p,row_bytes(), - cudaMemcpyHostToDevice)); } - - /// Asynchronous copy from host - inline void copy_from_host(const numtyp *host_p, cudaStream_t &stream) - { CUDA_SAFE_CALL(cudaMemcpyAsync(_array,host_p,row_bytes(), - cudaMemcpyHostToDevice, stream)); } - - /// Copy to host - inline void copy_to_host(numtyp *host_p) - { CUDA_SAFE_CALL(cudaMemcpy(host_p,_array,row_bytes(), - cudaMemcpyDeviceToHost)); } - - /// Copy n elements to host - inline void copy_to_host(numtyp *host_p, const int n) - { CUDA_SAFE_CALL(cudaMemcpy(host_p,_array,n*sizeof(numtyp), - cudaMemcpyDeviceToHost)); } - - /// Cast and then copy to device - template - inline void cast_copy(const numtyp2 *buffer, NVC_HostT &host_write) { - for (int i=0; i(buffer[i]); - copy_from_host(host_write.begin()); - } - - /// Bind to texture - template - inline void bind_texture(texture &texi, cudaChannelFormatDesc 
&channel) { - NVC::cuda_gb_get_channel(channel); - texi.addressMode[0] = cudaAddressModeClamp; - texi.addressMode[1] = cudaAddressModeClamp; - texi.filterMode = cudaFilterModePoint; - texi.normalized = false; - CUDA_SAFE_CALL(cudaBindTexture(NULL,&texi,_array,&channel)); - } - - /// Output the vector (debugging) - inline void print(std::ostream &out) { print (out, numel()); } - - // Output first n elements of vector - inline void print(std::ostream &out, const int n) { - numtyp *t=new numtyp[n]; - copy_to_host(t,n); - for (int i=0; i -class NVC_Mat { - public: - NVC_Mat() { _rows=0; } - ~NVC_Mat() { if (_rows>0) CUDA_SAFE_CALL(cudaFree(_array)); } - - // Row major matrix on device - // - Coalesced access using adjacent cols on same row - // - NVC_Mat(row,col) given by array[row*row_size()+col] - inline void safe_alloc(const size_t rows, const size_t cols) { - _rows=rows; - _cols=cols; - CUDA_SAFE_CALL(cudaMallocPitch((void **)&_array,&_pitch, - cols*sizeof(numtyp),rows)); - _row_size=_pitch/sizeof(numtyp); - _end=_array+_row_size*cols; - } - - /// Free any memory associated with device - inline void clear() - { if (_rows>0) { _rows=0; CUDA_SAFE_CALL(cudaFree(_array)); } } - - /// Set each element to zero - inline void zero() { CUDA_SAFE_CALL(cudaMemset(_array,0, _pitch*_rows)); } - - inline numtyp * begin() { return _array; } - inline const numtyp * begin() const { return _array; } - inline numtyp * end() { return _end; } - inline const numtyp * end() const { return _end; } - - - inline size_t numel() const { return _cols*_rows; } - inline size_t rows() const { return _rows; } - inline size_t cols() const { return _cols; } - inline size_t row_size() const { return _row_size; } - inline size_t row_bytes() const { return _pitch; } - - /// Copy from host (elements not bytes) - inline void copy_from_host(const numtyp *host_p, const size_t numel) - { CUDA_SAFE_CALL(cudaMemcpy(_array,host_p,numel*sizeof(numtyp), - cudaMemcpyHostToDevice)); } - - /// Asynchronous copy from host (elements not bytes) - inline void copy_from_host(const numtyp *host_p, const size_t numel, - cudaStream_t &stream) - { CUDA_SAFE_CALL(cudaMemcpyAsync(_array,host_p,numel*sizeof(numtyp), - cudaMemcpyHostToDevice, stream)); } - - /// Asynchronous Copy from Host - /** \note Used when the number of columns/rows allocated on host smaller than - * on device **/ - inline void copy_2Dfrom_host(const numtyp *host_p, const size_t rows, - const size_t cols, cudaStream_t &stream) { - CUDA_SAFE_CALL(cudaMemcpy2DAsync(_array, _pitch, host_p,cols*sizeof(numtyp), - cols*sizeof(numtyp), rows, - cudaMemcpyHostToDevice,stream)); - } - - private: - numtyp *_array, *_end; - size_t _pitch, _row_size, _rows, _cols; -}; - -/// Const 2D Matrix on device (requires texture binding) -template -class NVC_ConstMat { - public: - NVC_ConstMat() { _rows=0; } - ~NVC_ConstMat() { if (_rows>0) CUDA_SAFE_CALL(cudaFreeArray(_array)); } - - /// Row major matrix on device - inline void safe_alloc(const size_t rows, const size_t cols) { - _rows=rows; - _cols=cols; - - NVC::cuda_gb_get_channel(_channel); - CUDA_SAFE_CALL(cudaMallocArray(&_array, &_channel, cols, rows)); - } - - /// Bind to texture - template - inline void bind_texture(texture &texi) { - texi.addressMode[0] = cudaAddressModeClamp; - texi.addressMode[1] = cudaAddressModeClamp; - texi.filterMode = cudaFilterModePoint; - texi.normalized = false; - CUDA_SAFE_CALL(cudaBindTextureToArray(&texi,_array,&_channel)); - } - - /// Free any memory associated with device - inline void clear() - { if (_rows>0) { 
_rows=0; CUDA_SAFE_CALL(cudaFreeArray(_array)); } } - - inline size_t numel() const { return _cols*_rows; } - inline size_t rows() const { return _rows; } - inline size_t cols() const { return _cols; } - inline size_t row_size() const { return _cols; } - inline size_t row_bytes() const { return _cols*sizeof(numtyp); } - - /// Copy from Host - inline void copy_from_host(const numtyp *host_p) { - CUDA_SAFE_CALL(cudaMemcpyToArray(_array, 0, 0, host_p, - numel()*sizeof(numtyp), - cudaMemcpyHostToDevice)); - } - - /// Copy from Host - /** \note Used when the number of columns/rows allocated on host smaller than - * on device **/ - inline void copy_2Dfrom_host(const numtyp *host_p, const size_t rows, - const size_t cols) { - CUDA_SAFE_CALL(cudaMemcpy2DToArray(_array, 0, 0, host_p, - cols*sizeof(numtyp), cols*sizeof(numtyp), rows, - cudaMemcpyHostToDevice)); - } - - /// Asynchronous Copy from Host - inline void copy_from_host(const numtyp *host_p, cudaStream_t &stream) { - CUDA_SAFE_CALL(cudaMemcpyToArrayAsync(_array, 0, 0, host_p, - numel()*sizeof(numtyp), - cudaMemcpyHostToDevice,stream)); - } - - /// Asynchronous Copy from Host - /** \note Used when the number of columns/rows allocated on host smaller than - * on device **/ - inline void copy_2Dfrom_host(const numtyp *host_p, const size_t rows, - const size_t cols, cudaStream_t &stream) { - CUDA_SAFE_CALL(cudaMemcpy2DToArrayAsync(_array, 0, 0, host_p, - cols*sizeof(numtyp), cols*sizeof(numtyp), rows, - cudaMemcpyHostToDevice,stream)); - } - - /// Cast buffer to numtyp in host_write and copy to array - template - inline void cast_copy(const numtyp2 *buffer, NVC_HostT &host_write) { - int n=numel(); - for (int i=0; i(*buffer); buffer++; - } - copy_from_host(host_write.begin()); - } - - /// Cast buffer to numtyp in host_write and copy to array - /** \note Used when the number of columns/rows allocated on host smaller than - * on device **/ - template - inline void cast_copy2D(const numtyp2 *buffer, NVC_HostT &host_write, - const size_t rows, const size_t cols) { - int n=rows*cols; - for (int i=0; i(*buffer); buffer++; - } - copy_2Dfrom_host(host_write.begin(),rows,cols); - } - - /// Cast buffer to numtyp in host_write and copy to array asynchronously - template - inline void cast_copy(const numtyp2 *buffer, NVC_HostT &host_write, - cudaStream_t &stream) { - int n=numel(); - for (int i=0; i(*buffer); buffer++; - } - copy_from_host(host_write.begin(),stream); - } - - private: - size_t _rows, _cols; - cudaArray *_array; - cudaChannelFormatDesc _channel; -}; - -#endif diff --git a/lib/gpu/nvc_timer.h b/lib/gpu/nvc_timer.h deleted file mode 100644 index 12346b5ea2..0000000000 --- a/lib/gpu/nvc_timer.h +++ /dev/null @@ -1,83 +0,0 @@ -/*************************************************************************** - nvc_timer.h - ------------------- - W. Michael Brown - - Class for timing CUDA routines - - __________________________________________________________________________ - This file is part of the NVC Library - __________________________________________________________________________ - - begin : Tue Feb 3 2009 - copyright : (C) 2009 by W. Michael Brown - email : wmbrown@sandia.gov - ***************************************************************************/ - -/* ----------------------------------------------------------------------- - Copyright (2009) Sandia Corporation. Under the terms of Contract - DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains - certain rights in this software. 
This software is distributed under - the GNU General Public License. - ----------------------------------------------------------------------- */ - -#ifndef NVC_TIMER_H -#define NVC_TIMER_H - -#include "nvc_macros.h" - -#define cudaEventDestroy(a) - -/// Class for timing CUDA events -class NVCTimer { - public: - NVCTimer() : _total_time(0.0f), initialized(false) { } - - ~NVCTimer() { - if (initialized) - { cudaEventDestroy(start_event); cudaEventDestroy(stop_event); } - } - - inline void init() { - if (!initialized) { - initialized=true; - CUDA_SAFE_CALL( cudaEventCreate(&start_event) ); - CUDA_SAFE_CALL( cudaEventCreate(&stop_event) ); - } - } - - /// Start timing - inline void start() { cudaEventRecord(start_event,0); } - - /// Stop timing and store event time - inline void stop() { cudaEventRecord(stop_event,0); } - - /// Set the time elapsed to zero (not the total_time) - inline void zero() - { cudaEventRecord(start_event,0); cudaEventRecord(stop_event,0); } - - /// Add time from previous start and stop to total - /** Forces synchronization **/ - inline void add_to_total() { _total_time+=time(); } - - /// Return the time (ms) of last start to stop - Forces synchronization - inline double time() { - float timer; - cudaEventSynchronize(stop_event); - CUDA_SAFE_CALL( cudaEventElapsedTime(&timer,start_event,stop_event) ); - return timer; - } - - /// Return the total time in ms - inline double total_time() { return _total_time; } - - /// Return the total time in seconds - inline double total_seconds() { return _total_time/1000.0; } - - private: - cudaEvent_t start_event, stop_event; - double _total_time; - bool initialized; -}; - -#endif diff --git a/lib/gpu/pair_gpu_atom.cu b/lib/gpu/pair_gpu_atom.cu deleted file mode 100644 index 94617a5c11..0000000000 --- a/lib/gpu/pair_gpu_atom.cu +++ /dev/null @@ -1,173 +0,0 @@ -/*************************************************************************** - pair_gpu_atom.cu - ------------------- - W. Michael Brown - - Memory routines for moving atom and force data between host and gpu - - __________________________________________________________________________ - This file is part of the LAMMPS GPU Library - __________________________________________________________________________ - - begin : Tue Aug 4 2009 - copyright : (C) 2009 by W. Michael Brown - email : wmbrown@sandia.gov - ***************************************************************************/ - -/* ----------------------------------------------------------------------- - Copyright (2009) Sandia Corporation. Under the terms of Contract - DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains - certain rights in this software. This software is distributed under - the GNU General Public License. 
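The timer above is a thin wrapper around a pair of CUDA events; the call pattern used throughout the library is init() once per timer, start()/stop() around each piece of GPU work, add_to_total() when the elapsed time is wanted (this synchronizes on the stop event), and total_seconds() for the final report. A compact usage sketch follows; time_loop() and its buffer are arbitrary, with cudaMemset standing in for a real kernel or copy.

// Sketch: accumulate GPU timing across repeated operations with NVCTimer.
#include <cstdio>
#include <cuda_runtime.h>
#include "nvc_timer.h"

void time_loop(float *dev_buf, const size_t bytes, const int nsteps) {
  NVCTimer t;
  t.init();                           // create the CUDA events once
  for (int i = 0; i < nsteps; ++i) {
    t.start();                        // record the start event
    cudaMemset(dev_buf, 0, bytes);    // stand-in for the timed GPU work
    t.stop();                         // record the stop event
    t.add_to_total();                 // synchronize and accumulate milliseconds
  }
  printf("GPU work: %.4f s.\n", t.total_seconds());
}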
- ----------------------------------------------------------------------- */ - -#include "pair_gpu_atom.h" -#define PairGPUAtomT PairGPUAtom - -template -int PairGPUAtomT::bytes_per_atom() const { - return atom_fields()*sizeof(numtyp)+ans_fields()*sizeof(acctyp); -} - -template -void PairGPUAtomT::init(const int max_atoms) { - if (allocated) - clear(); - - _max_atoms=max_atoms; - - // Initialize timers for the selected GPU - time_atom.init(); - time_answer.init(); - - // Device matrices for atom and force data - dev_x.safe_alloc(atom_fields(),max_atoms); - x_bind_texture(dev_x); - ans.safe_alloc(ans_fields(),max_atoms); - - // Get a host write only buffer - host_write.safe_alloc_w(max_atoms*4); - // Get a host read/write buffer - host_read.safe_alloc_rw(ans.row_size()*ans_fields()); - - allocated=true; -} - -template -void PairGPUAtomT::clear() { - if (!allocated) - return; - allocated=false; - - x_unbind_texture(); - ans.clear(); - host_write.clear(); - host_read.clear(); - dev_x.clear(); -} - -template -double PairGPUAtomT::host_memory_usage(const int max_atoms) const { - return max_atoms*atom_fields()*sizeof(numtyp)+ - ans_fields()*(max_atoms)*sizeof(acctyp)+ - sizeof(PairGPUAtom); -} - -template -void PairGPUAtomT::copy_answers(const bool eflag, const bool vflag, - cudaStream_t &s) { - _eflag=eflag; - _vflag=vflag; - - int csize=ans_fields(); - if (!eflag) - csize--; - if (!vflag) - csize-=6; - - host_read.copy_from_device(ans.begin(),ans.row_size()*csize,s); -} - -template -double PairGPUAtomT::energy_virial(const int *ilist, const bool eflag_atom, - const bool vflag_atom, double *eatom, - double **vatom, double *virial) { - double evdwl=0.0; - int gap=ans.row_size()-_inum; - - acctyp *ap=host_read.begin(); - if (_eflag) { - if (eflag_atom) { - for (int i=0; i<_inum; i++) { - evdwl+=*ap; - eatom[ilist[i]]+=*ap*0.5; - ap++; - } - } else - for (int i=0; i<_inum; i++) { - evdwl+=*ap; - ap++; - } - ap+=gap; - evdwl*=0.5; - } - _read_loc=ap; - gap=ans.row_size(); - if (_vflag) { - if (vflag_atom) { - for (int ii=0; ii<_inum; ii++) { - int i=ilist[ii]; - ap=_read_loc+ii; - for (int j=0; j<6; j++) { - vatom[i][j]+=*ap*0.5; - virial[j]+=*ap; - ap+=gap; - } - } - } else { - for (int ii=0; ii<_inum; ii++) { - ap=_read_loc+ii; - for (int j=0; j<6; j++) { - virial[j]+=*ap; - ap+=gap; - } - } - } - for (int j=0; j<6; j++) - virial[j]*=0.5; - _read_loc+=gap*6; - } - - return evdwl; -} - -template -void PairGPUAtomT::add_forces(const int *ilist, double **f) { - int gap=ans.row_size(); - for (int ii=0; ii<_inum; ii++) { - acctyp *ap=_read_loc+ii; - int i=ilist[ii]; - f[i][0]+=*ap; - ap+=gap; - f[i][1]+=*ap; - ap+=gap; - f[i][2]+=*ap; - } -} - -template -void PairGPUAtomT::add_torques(const int *ilist, double **tor, const int n) { - int gap=ans.row_size(); - _read_loc+=gap*3; - for (int ii=0; ii; diff --git a/lib/gpu/pair_gpu_atom.h b/lib/gpu/pair_gpu_atom.h deleted file mode 100644 index a5796db45a..0000000000 --- a/lib/gpu/pair_gpu_atom.h +++ /dev/null @@ -1,151 +0,0 @@ -/*************************************************************************** - pair_gpu_atom.h - ------------------- - W. Michael Brown - - Memory routines for moving atom and force data between host and gpu - - __________________________________________________________________________ - This file is part of the LAMMPS GPU Library - __________________________________________________________________________ - - begin : Tue Aug 4 2009 - copyright : (C) 2009 by W. 
Michael Brown - email : wmbrown@sandia.gov - ***************************************************************************/ - -/* ----------------------------------------------------------------------- - Copyright (2009) Sandia Corporation. Under the terms of Contract - DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains - certain rights in this software. This software is distributed under - the GNU General Public License. - ----------------------------------------------------------------------- */ - -#ifndef PAIR_GPU_ATOM_H -#define PAIR_GPU_ATOM_H - -// PRECISION - Precision for rsq, energy, force, and torque calculation -// ACC_PRECISION - Precision for accumulation of energies, forces, and torques -#ifdef _SINGLE_DOUBLE -#define PRECISION float -#define ACC_PRECISION double -#endif - -#ifdef _DOUBLE_DOUBLE -#define PRECISION double -#define ACC_PRECISION double -#endif - -#ifndef PRECISION -#define PRECISION float -#define ACC_PRECISION double -#endif - -#define MAX_ATOMS 65536 -#include "nvc_timer.h" -#include "pair_gpu_texture.h" - -template -class PairGPUAtom { - public: - PairGPUAtom() : _atom_fields(4), _ans_fields(10), allocated(false) {} - ~PairGPUAtom() { clear(); } - - // Accessors - inline int atom_fields() const { return _atom_fields; } - inline int ans_fields() const { return _ans_fields; } - inline int max_atoms() const { return _max_atoms; } - inline int nall() const { return _nall; } - inline int inum() const { return _inum; } - - /// Set number of atoms for future copy operations - inline void nall(const int n) { _nall=n; } - /// Set number of inum for future copy operations - inline void inum(const int n) { _inum=n; } - /// Set the number of atom fields (x, y, z, type, etc) - inline void atom_fields(const int n) { _atom_fields=n; } - /// Set the number of answer fields (energy, virial, force, etc.) 
- inline void ans_fields(const int n) { _ans_fields=n; } - - /// Memory usage per atom in this class - /** \note atom_fields and ans_fields should be set for correct answer **/ - int bytes_per_atom() const; - - /// Must be called once to allocate host and device memory - /** \note atom_fields and ans_fields should be set first if not default **/ - void init(const int max_atoms); - - /// Free all memory on host and device - void clear(); - - /// Return the total amount of host memory used by class - double host_memory_usage(const int max_atoms) const; - - - // -------------------------COPY TO GPU ---------------------------------- - - /// Reset the write buffer pointer (Start copying new atom data) - inline void reset_write_buffer() { _write_loc=host_write.begin(); } - - /// Add a row to write buffer with unit stride - /** Copies nall() elements **/ - template - inline void add_atom_data(const cpytyp *host_ptr) - { for (int i=0; i<_nall; i++) { *_write_loc=host_ptr[i]; _write_loc++; } } - - /// Add a row to write buffer with non-unit stride - /** Copies nall() elements **/ - template - inline void add_atom_data(const cpytyp *hostptr, const int stride) { - int t=_nall*stride; - for (int i=0; i ans; - - // Buffer for moving floating point data to GPU - NVC_HostT host_write; - // Buffer for moving floating point data to CPU - NVC_Host host_read; - - // Timing Stuff - NVCTimer time_atom, time_answer; - - private: - bool allocated, _eflag, _vflag; - int _atom_fields, _ans_fields; - int _max_atoms, _nall, _inum; - numtyp * _write_loc; - acctyp * _read_loc; -}; - -#endif diff --git a/lib/gpu/pair_gpu_nbor.cu b/lib/gpu/pair_gpu_nbor.cu deleted file mode 100644 index e3fc2b28d8..0000000000 --- a/lib/gpu/pair_gpu_nbor.cu +++ /dev/null @@ -1,80 +0,0 @@ -/*************************************************************************** - pair_gpu_nbor.cu - ------------------- - W. Michael Brown - - Neighbor memory operations for LAMMPS GPU Library - - __________________________________________________________________________ - This file is part of the LAMMPS GPU Library - __________________________________________________________________________ - - begin : Tue Aug 4 2009 - copyright : (C) 2009 by W. Michael Brown - email : wmbrown@sandia.gov - ***************************************************************************/ - -/* ----------------------------------------------------------------------- - Copyright (2009) Sandia Corporation. Under the terms of Contract - DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains - certain rights in this software. This software is distributed under - the GNU General Public License. 
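A note on the answer layout that copy_answers(), energy_virial(), and add_forces() above walk: the device stores one field per row with a pitch of ans.row_size(), writing the energy row first (only if eflag), then the six virial rows (only if vflag), then fx, fy, fz. The helper below is only a sketch of that indexing for reading a force component out of the copied-back host buffer; force_x(), ans_host, and pitch stand for host_read.begin() and ans.row_size(), and ACC_PRECISION is taken as double, its default.

// Sketch: locate atom ii's x-force in the buffer filled by copy_answers().
// Row order: [energy if eflag][6 virial rows if vflag][fx][fy][fz].
inline double force_x(const double *ans_host, const size_t pitch, const int ii,
                      const bool eflag, const bool vflag) {
  size_t row = 0;
  if (eflag) row += 1;     // skip the energy row
  if (vflag) row += 6;     // skip the six virial rows
  return ans_host[row * pitch + ii];
}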
- ----------------------------------------------------------------------- */ - -#include "pair_gpu_nbor.h" - -int PairGPUNbor::bytes_per_atom(const int max_nbors) const { - if (_use_packing) - return (max_nbors*2+4)*sizeof(int); - else - return (max_nbors+3)*sizeof(int); -} - -void PairGPUNbor::init(const int ij_size, const int max_atoms, - const int max_nbors) { - if (allocated) - clear(); - - // Initialize timers for the selected GPU - time_nbor.init(); - - if (_use_packing) - dev_nbor.safe_alloc(max_nbors+4,max_atoms); - else - dev_nbor.safe_alloc(3,max_atoms); - - ij.safe_alloc(max_nbors*max_atoms); - host_ij.safe_alloc_w(ij_size); - - allocated=true; -} - -void PairGPUNbor::clear() { - if (!allocated) - return; - allocated=false; - - ij.clear(); - host_ij.clear(); - dev_nbor.clear(); -} - -double PairGPUNbor::host_memory_usage() const { - return IJ_SIZE*sizeof(int)+sizeof(PairGPUNbor); -} - -void PairGPUNbor::reset(const int inum, int *ilist, const int *numj, - cudaStream_t &s) { - ij_total=0; - - dev_nbor.copy_from_host(ilist,inum); - int acc=0; - for (int i=0; i class cu_vec_traits; -template <> class cu_vec_traits { public: typedef float2 vec2; }; -template <> class cu_vec_traits { public: typedef double2 vec2; }; - -// ------------------------------- x ------------------------------------ - -static texture x_float_tex; -static texture x_double_tex; -template inline void x_bind_texture(NVC_ConstMatT &mat) - { mat.bind_texture(x_float_tex); } - -template <> inline void x_bind_texture(NVC_ConstMatD &mat) - { mat.bind_texture(x_double_tex); } -template inline void x_unbind_texture() - { cudaUnbindTexture(x_float_tex); } -template <> inline void x_unbind_texture() - { cudaUnbindTexture(x_double_tex); } -template -static __inline__ __device__ numtyp _x_(const int i, const int j) { - return tex2D(x_float_tex,i,j); -} -#ifdef GB_GPU_DOUBLE -template <> -static __inline__ __device__ double _x_(const int i,const int j) { - int2 t=tex2D(x_double_tex,i,j); - return __hiloint2double(t.y, t.x); -} -#endif - -// ------------------------------- form ------------------------------------ - -static texture form_tex; -inline void form_bind_texture(NVC_ConstMatI &mat) - { mat.bind_texture(form_tex); } -inline void form_unbind_texture() - { cudaUnbindTexture(form_tex); } -static __inline__ __device__ int _form_(const int i, const int j) { - return tex2D(form_tex,i,j); -} - -// ------------------------------- lshape ------------------------------------ - -static texture lshape_float_tex; -static texture lshape_double_tex; -static cudaChannelFormatDesc channel_lshape; -template inline void lshape_bind_texture(NVC_VecT &vec) - { vec.bind_texture(lshape_float_tex,channel_lshape); } -template <> inline void lshape_bind_texture(NVC_VecD &vec) - { vec.bind_texture(lshape_double_tex,channel_lshape); } -template inline void lshape_unbind_texture() - { cudaUnbindTexture(lshape_float_tex); } -template <> inline void lshape_unbind_texture() - { cudaUnbindTexture(lshape_double_tex); } -template -static __inline__ __device__ numtyp _lshape_(const int i) - { return tex1Dfetch(lshape_float_tex,i); } -#ifdef GB_GPU_DOUBLE -template <> -static __inline__ __device__ double _lshape_(const int i) { - int2 t=tex1Dfetch(lshape_double_tex,i); - return __hiloint2double(t.y, t.x); -} -#endif - -// ------------------------------- shape ------------------------------------ - -static texture shape_float_tex; -static texture shape_double_tex; -template inline void shape_bind_texture(NVC_ConstMatT &mat) - { 
mat.bind_texture(shape_float_tex); } -template <> inline void shape_bind_texture(NVC_ConstMatD &mat) - { mat.bind_texture(shape_double_tex); } -template inline void shape_unbind_texture() - { cudaUnbindTexture(shape_float_tex); } -template <> inline void shape_unbind_texture() - { cudaUnbindTexture(shape_double_tex); } -template -static __inline__ __device__ numtyp _shape_(const int i, const int j) { - return tex2D(shape_float_tex,j,i); -} -#ifdef GB_GPU_DOUBLE -template <> -static __inline__ __device__ double _shape_(const int i, const int j) { - int2 t=tex2D(shape_double_tex,j,i); - return __hiloint2double(t.y, t.x); -} -#endif - -// ------------------------------- well ------------------------------------ - -static texture well_float_tex; -static texture well_double_tex; -template inline void well_bind_texture(NVC_ConstMatT &mat) - { mat.bind_texture(well_float_tex); } -template <> inline void well_bind_texture(NVC_ConstMatD &mat) - { mat.bind_texture(well_double_tex); } -template inline void well_unbind_texture() - { cudaUnbindTexture(well_float_tex); } -template <> inline void well_unbind_texture() - { cudaUnbindTexture(well_double_tex); } -template -static __inline__ __device__ numtyp _well_(const int i, const int j) - { return tex2D(well_float_tex,j,i); } -#ifdef GB_GPU_DOUBLE -template <> -static __inline__ __device__ double _well_(const int i,const int j) { - int2 t=tex2D(well_double_tex,j,i); - return __hiloint2double(t.y, t.x); -} -#endif - -// ------------------------------- sigma ------------------------------------ - -static texture sigma_float_tex; -static texture sigma_double_tex; -template inline void sigma_bind_texture(NVC_ConstMatT &mat) - { mat.bind_texture(sigma_float_tex); } -template <> inline void sigma_bind_texture(NVC_ConstMatD &mat) - { mat.bind_texture(sigma_double_tex); } -template inline void sigma_unbind_texture() - { cudaUnbindTexture(sigma_float_tex); } -template <> inline void sigma_unbind_texture() - { cudaUnbindTexture(sigma_double_tex); } -template -static __inline__ __device__ numtyp _sigma_(const int i, const int j) { - return tex2D(sigma_float_tex,j,i); -} -#ifdef GB_GPU_DOUBLE -template <> -static __inline__ __device__ double _sigma_(const int i,const int j) { - int2 t=tex2D(sigma_double_tex,j,i); - return __hiloint2double(t.y, t.x); -} -#endif - -// ------------------------------- epsilon ------------------------------------ - -static texture epsilon_float_tex; -static texture epsilon_double_tex; -template inline void epsilon_bind_texture(NVC_ConstMatT &mat) - { mat.bind_texture(epsilon_float_tex); } -template <> inline void epsilon_bind_texture(NVC_ConstMatD &mat) - { mat.bind_texture(epsilon_double_tex); } -template inline void epsilon_unbind_texture() - { cudaUnbindTexture(epsilon_float_tex); } -template <> inline void epsilon_unbind_texture() - { cudaUnbindTexture(epsilon_double_tex); } -template -static __inline__ __device__ numtyp _epsilon_(const int i, const int j) { - return tex2D(epsilon_float_tex,j,i); -} -#ifdef GB_GPU_DOUBLE -template <> -static __inline__ __device__ double _epsilon_(const int i,const int j) { - int2 t=tex2D(epsilon_double_tex,j,i); - return __hiloint2double(t.y, t.x); -} -#endif - -// ------------------------------- cutsq ------------------------------------ - -static texture cutsq_float_tex; -static texture cutsq_double_tex; -template inline void cutsq_bind_texture(NVC_ConstMatT &mat) - { mat.bind_texture(cutsq_float_tex); } -template <> inline void cutsq_bind_texture(NVC_ConstMatD &mat) - { 
mat.bind_texture(cutsq_double_tex); } -template inline void cutsq_unbind_texture() - { cudaUnbindTexture(cutsq_float_tex); } -template <> inline void cutsq_unbind_texture() - { cudaUnbindTexture(cutsq_double_tex); } -template -static __inline__ __device__ numtyp _cutsq_(const int i, const int j) { - return tex2D(cutsq_float_tex,j,i); -} -#ifdef GB_GPU_DOUBLE -template <> -static __inline__ __device__ double _cutsq_(const int i,const int j) { - int2 t=tex2D(cutsq_double_tex,j,i); - return __hiloint2double(t.y, t.x); -} -#endif - -// ------------------------------- lj1 ------------------------------------ - -static texture lj1_float_tex; -static texture lj1_double_tex; -template inline void lj1_bind_texture(NVC_ConstMatT &mat) - { mat.bind_texture(lj1_float_tex); } -template <> inline void lj1_bind_texture(NVC_ConstMatD2 &mat) - { mat.bind_texture(lj1_double_tex); } -template inline void lj1_unbind_texture() - { cudaUnbindTexture(lj1_float_tex); } -template <> inline void lj1_unbind_texture() - { cudaUnbindTexture(lj1_double_tex); } -template -static __inline__ __device__ -typename cu_vec_traits::vec2 _lj1_(const int i, const int j) { - return tex2D(lj1_float_tex,j,i); -} -#ifdef GB_GPU_DOUBLE -template <> -static __inline__ __device__ double2 _lj1_(const int i,const int j) { - int4 t=tex2D(lj1_double_tex,j,i); - double2 ans; - ans.x=__hiloint2double(t.y, t.x); - ans.y=__hiloint2double(t.w, t.z); - return ans; -} -#endif - -// ------------------------------- lj3 ------------------------------------ - -static texture lj3_float_tex; -static texture lj3_double_tex; -template inline void lj3_bind_texture(NVC_ConstMatT &mat) - { mat.bind_texture(lj3_float_tex); } -template <> inline void lj3_bind_texture(NVC_ConstMatD2 &mat) - { mat.bind_texture(lj3_double_tex); } -template inline void lj3_unbind_texture() - { cudaUnbindTexture(lj3_float_tex); } -template <> inline void lj3_unbind_texture() - { cudaUnbindTexture(lj3_double_tex); } -template -static __inline__ __device__ -typename cu_vec_traits::vec2 _lj3_(const int i, const int j) { - return tex2D(lj3_float_tex,j,i); -} -#ifdef GB_GPU_DOUBLE -template <> -static __inline__ __device__ double2 _lj3_(const int i,const int j) { - int4 t=tex2D(lj3_double_tex,j,i); - double2 ans; - ans.x=__hiloint2double(t.y, t.x); - ans.y=__hiloint2double(t.w, t.z); - return ans; -} -#endif - -// ------------------------------- offset ------------------------------------ - -static texture offset_float_tex; -static texture offset_double_tex; -template inline void offset_bind_texture(NVC_ConstMatT &mat) - { mat.bind_texture(offset_float_tex); } -template <> inline void offset_bind_texture(NVC_ConstMatD &mat) - { mat.bind_texture(offset_double_tex); } -template inline void offset_unbind_texture() - { cudaUnbindTexture(offset_float_tex); } -template <> inline void offset_unbind_texture() - { cudaUnbindTexture(offset_double_tex); } -template -static __inline__ __device__ numtyp _offset_(const int i, const int j) { - return tex2D(offset_float_tex,j,i); -} -#ifdef GB_GPU_DOUBLE -template <> -static __inline__ __device__ double _offset_(const int i,const int j) { - int2 t=tex2D(offset_double_tex,j,i); - return __hiloint2double(t.y, t.x); -} -#endif - -#endif diff --git a/src/GPU/Install.csh b/src/GPU/Install.csh deleted file mode 100644 index d5d521911f..0000000000 --- a/src/GPU/Install.csh +++ /dev/null @@ -1,34 +0,0 @@ -# Install/unInstall package classes in LAMMPS -# edit Makefile.package to include/exclude GPU library - -if ($1 == 1) then - -# sed -i 's/\S*gpu //' 
../Makefile.package -# sed -i 's|^PKGPATH =\s*|&-L../../lib/gpu |' ../Makefile.package -# sed -i 's|^PKGLIB =\s*|&-lpair_gpu_lib |' ../Makefile.package - - cp -f pair_lj_cut_gpu.h ../ - cp -f pair_lj_cut_gpu.cpp ../ - - if (-e ../pair_gayberne.cpp) then - cp -f style_gpu.h ../ - cp -f pair_gayberne_gpu.h ../ - cp -f pair_gayberne_gpu.cpp ../ - else - grep -v erne style_gpu.h > ../style_gpu.h - endif - -else if ($1 == 0) then - -# sed -i 's/\S*gpu //' ../Makefile.package - - rm -f ../style_gpu.h - touch ../style_gpu.h - - rm -f ../pair_lj_cut_gpu.h - rm -f ../pair_lj_cut_gpu.cpp - - rm -f ../pair_gayberne_gpu.h - rm -f ../pair_gayberne_gpu.cpp - -endif diff --git a/src/GPU/pair_gayberne_gpu.cpp b/src/GPU/pair_gayberne_gpu.cpp deleted file mode 100644 index 23ddff98dd..0000000000 --- a/src/GPU/pair_gayberne_gpu.cpp +++ /dev/null @@ -1,369 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - Copyright (2003) Sandia Corporation. Under the terms of Contract - DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains - cetain rights in this software. This software is distributed under - the GNU General Public License. - - See the README file in the top-level LAMMPS directory. -------------------------------------------------------------------------- */ - -/* ---------------------------------------------------------------------- - Contributing author: Mike Brown (SNL) -------------------------------------------------------------------------- */ - -#include "math.h" -#include "stdio.h" -#include "stdlib.h" -#include "string.h" -#include "pair_gayberne_gpu.h" -#include "math_extra.h" -#include "atom.h" -#include "atom_vec.h" -#include "comm.h" -#include "force.h" -#include "neighbor.h" -#include "neigh_list.h" -#include "integrate.h" -#include "memory.h" -#include "error.h" -#include "neigh_request.h" -#include "universe.h" - -#ifdef GB_GPU_OMP -#include "omp.h" -#endif - -#define MIN(a,b) ((a) < (b) ? (a) : (b)) -#define MAX(a,b) ((a) > (b) ? 
(a) : (b)) - -// External functions from cuda library for atom decomposition -int * gb_gpu_init(int &ij_size, const int ntypes, const double gamma, - const double upsilon, const double mu, double **shape, - double **well, double **cutsq, double **sigma, - double **epsilon, double *host_lshape, int **form, - double **host_lj1, double **host_lj2, double **host_lj3, - double **host_lj4, double **offset, double *special_lj, - const int max_nbors, const int thread, const int gpu_id); -void gb_gpu_clear(const int thread); -int * gb_gpu_reset_nbors(const int nall, const int nlocal, const int inum, - int *ilist, const int *numj, const int *type, - const int thread, bool &success); -void gb_gpu_nbors(const int num_ij, const bool eflag, const int thread); -void gb_gpu_atom(double **host_x, double **host_quat, const int *host_type, - const bool rebuild, const int thread); -void gb_gpu_gayberne(const bool eflag, const bool vflag, const bool rebuild, - const int thread); -double gb_gpu_forces(double **f, double **tor, const int *ilist, - const bool eflag, const bool vflag, const bool eflag_atom, - const bool vflag_atom, double *eatom, double **vatom, - double *virial, const int thread); -std::string gb_gpu_name(const int i, const int max_nbors); -void gb_gpu_time(const int thread); -int gb_gpu_num_devices(); -double gb_gpu_bytes(); - -using namespace LAMMPS_NS; - -/* ---------------------------------------------------------------------- */ - -PairGayBerneGPU::PairGayBerneGPU(LAMMPS *lmp) : PairGayBerne(lmp), my_thread(0), - omp_chunk(0), nthreads(1), - multi_gpu_mode(ONE_NODE), - multi_gpu_param(0) -{ - ij_new[0]=NULL; -} - -/* ---------------------------------------------------------------------- - free all arrays -------------------------------------------------------------------------- */ - -PairGayBerneGPU::~PairGayBerneGPU() -{ - if (comm->me == 0 && screen) { - printf("\n\n-------------------------------------"); - printf("--------------------------------\n"); - printf(" GPU Time Stamps: "); - printf("\n-------------------------------------"); - printf("--------------------------------\n"); - gb_gpu_time(my_thread); - std::cout << "Procs: " << universe->nprocs << std::endl; - printf("-------------------------------------"); - printf("--------------------------------\n\n"); - } - #pragma omp parallel - { - #ifdef GB_GPU_OMP - int my_thread=omp_get_thread_num(); - #endif - gb_gpu_clear(my_thread); - if (ij_new[my_thread]!=NULL) { - ij_new[my_thread]=NULL; - delete [] ij_new[my_thread]; - } - } -} - -/* ---------------------------------------------------------------------- */ - -void PairGayBerneGPU::compute(int eflag, int vflag) -{ - if (eflag || vflag) ev_setup(eflag,vflag); - else evflag = vflag_fdotr = 0; - if (vflag_atom) - error->all("Per-atom virial not available with GPU Gay-Berne."); - - int nlocal = atom->nlocal; - int nall = nlocal + atom->nghost; - int inum = list->inum; - - bool rebuild=false; - if (neighbor->ncalls > last_neighbor) { - last_neighbor=neighbor->ncalls; - rebuild=true; - } - - #pragma omp parallel - { - - bool success=true; - #ifdef GB_GPU_OMP - int my_thread=omp_get_thread_num(); - if (rebuild) { - omp_chunk=static_cast(ceil(static_cast(inum)/nthreads)); - if (my_thread==nthreads-1) - thread_inum[my_thread]=inum-(nthreads-1)*omp_chunk; - else - thread_inum[my_thread]=omp_chunk; - olist[my_thread]=gb_gpu_reset_nbors(nall, atom->nlocal, - thread_inum[my_thread], - list->ilist+omp_chunk*my_thread, - list->numneigh, atom->type, my_thread, - success); - } - #else - if 
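In the OpenMP branch above the local neighbor list of inum atoms is divided into equal chunks, one GPU thread per chunk, with the last thread absorbing the remainder. The same split in isolation (illustrative helper, not part of the library):

#include <cmath>

// Number of list entries handled by thread 'tid' out of 'nthreads'.
// Example: inum=10, nthreads=4 -> chunk=3, threads 0-2 get 3, thread 3 gets 1.
static int chunk_count(const int inum, const int nthreads, const int tid) {
  int chunk = static_cast<int>(std::ceil(static_cast<double>(inum) / nthreads));
  if (tid == nthreads - 1)
    return inum - (nthreads - 1) * chunk;
  return chunk;
}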
(rebuild) - olist[my_thread]=gb_gpu_reset_nbors(nall, atom->nlocal, inum, list->ilist, - list->numneigh, atom->type, my_thread, - success); - #endif - if (!success) - error->one("Total # of atoms exceeds maximum allowed per GPGPU.\n"); - - // copy atom data to GPU - gb_gpu_atom(atom->x,atom->quat,atom->type,rebuild,my_thread); - - int i,j,ii,jj,jnum; - double factor_lj; - int *jlist; - - if (rebuild==true) { - int num_ij = 0; - - // loop over neighbors of my atoms - int *ijp=ij_new[my_thread]; - #ifdef GB_GPU_OMP - int mgo=my_thread*omp_chunk; - int mgot=mgo+thread_inum[my_thread]; - #else - int mgo=0, mgot=inum; - #endif - for (ii = mgo; iifirstneigh[i]; - jnum = list->numneigh[i]; - - for (jj = 0; jj < jnum; jj++) { - j = jlist[jj]; - - *ijp=j; - ijp++; - num_ij++; - - if (num_ij==ij_size) { - memcpy(ij[my_thread],ij_new[my_thread],num_ij*sizeof(int)); - gb_gpu_nbors(num_ij,eflag,my_thread); - ijp=ij_new[my_thread]; - num_ij=0; - } - } - } - if (num_ij>0) { - memcpy(ij[my_thread],ij_new[my_thread],num_ij*sizeof(int)); - gb_gpu_nbors(num_ij,eflag,my_thread); - } - } - - gb_gpu_gayberne(eflag,vflag,rebuild,my_thread); - double lvirial[6]; - for (int i=0; i<6; i++) lvirial[i]=0.0; - double my_eng=gb_gpu_forces(atom->f,atom->torque,olist[my_thread],eflag,vflag, - eflag_atom, vflag_atom, eatom, vatom, lvirial, - my_thread); - #pragma omp critical - { - eng_vdwl+=my_eng; - virial[0]+=lvirial[0]; - virial[1]+=lvirial[1]; - virial[2]+=lvirial[2]; - virial[3]+=lvirial[3]; - virial[4]+=lvirial[4]; - virial[5]+=lvirial[5]; - } - - } //End parallel - - if (vflag_fdotr) virial_compute(); -} - -/* ---------------------------------------------------------------------- - global settings -------------------------------------------------------------------------- */ - -void PairGayBerneGPU::settings(int narg, char **arg) -{ - if (narg != 4 && narg != 6) error->all("Illegal pair_style command"); - - // Set multi_gpu_mode to one_node for multiple gpus on 1 node - // -- param is starting gpu id - // Set multi_gpu_mode to one_gpu to select the same gpu id on every node - // -- param is id of gpu - // Set multi_gpu_mode to multi_gpu to get ma - // -- param is number of gpus per node - multi_gpu_mode=ONE_NODE; - multi_gpu_param=0; - if (narg==6) { - multi_gpu_param=atoi(arg[5]); - if (strcmp("one_node",arg[4])==0) - multi_gpu_mode=ONE_NODE; - else if (strcmp("one_gpu",arg[4])==0) - multi_gpu_mode=ONE_GPU; - else if (strcmp("multi_gpu",arg[4])==0) { - multi_gpu_mode=MULTI_GPU; - if (multi_gpu_param<1) - error->all("Illegal pair_style command"); - } else - error->all("Illegal pair_style command"); - narg-=2; - } - - PairGayBerne::settings(narg,arg); -} - -/* ---------------------------------------------------------------------- - init specific to this pair style -------------------------------------------------------------------------- */ - -void PairGayBerneGPU::init_style() -{ - // Set the GPU ID - int my_gpu=comm->me+multi_gpu_param; - int ngpus=universe->nprocs; - if (multi_gpu_mode==ONE_GPU) { - my_gpu=multi_gpu_param; - ngpus=1; - } else if (multi_gpu_mode==MULTI_GPU) { - ngpus=multi_gpu_param; - my_gpu=comm->me%ngpus; - if (ngpus>universe->nprocs) - ngpus=universe->nprocs; - } - - if (!atom->quat_flag || !atom->torque_flag || !atom->avec->shape_type) - error->all("Pair gayberne requires atom attributes quat, torque, shape"); - if (atom->radius_flag) - error->all("Pair gayberne cannot be used with atom attribute diameter"); - - int irequest = neighbor->request(this); - - // per-type shape precalculations - - 
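The rebuild branch above streams neighbor indices to the GPU through a fixed-size staging array: indices accumulate in ij_new and are shipped with memcpy plus gb_gpu_nbors whenever ij_size entries are reached, then once more for the remainder. A simplified sketch of that pattern (flush_to_gpu stands in for the library call; names are illustrative):

// Append one atom's neighbors into a staging buffer of capacity 'cap',
// flushing to the device whenever the buffer fills.
void pack_neighbors(const int *jlist, const int jnum, int *staging,
                    const int cap, int &count, void (*flush_to_gpu)(int)) {
  for (int jj = 0; jj < jnum; jj++) {
    staging[count++] = jlist[jj];
    if (count == cap) {          // buffer full: copy to device, start over
      flush_to_gpu(count);
      count = 0;
    }
  }
}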
for (int i = 1; i <= atom->ntypes; i++) { - if (setwell[i]) { - double *one = atom->shape[i]; - shape[i][0] = one[0]*one[0]; - shape[i][1] = one[1]*one[1]; - shape[i][2] = one[2]*one[2]; - lshape[i] = (one[0]*one[1]+one[2]*one[2])*sqrt(one[0]*one[1]); - } - } - - // Repeat cutsq calculation because done after call to init_style - double cut; - for (int i = 1; i <= atom->ntypes; i++) - for (int j = i; j <= atom->ntypes; j++) { - cut = init_one(i,j); - cutsq[i][j] = cutsq[j][i] = cut*cut; - } - - // If compiled with OpenMP and only 1 proc, try to use multiple GPUs w/threads - #ifdef GB_GPU_OMP - if (multi_gpu_mode!=ONE_GPU) - nthreads=ngpus=gb_gpu_num_devices(); - else - nthreads=ngpus=1; - if (nthreads>MAX_GPU_THREADS) - nthreads=MAX_GPU_THREADS; - omp_set_num_threads(nthreads); - #endif - - #pragma omp parallel firstprivate(my_gpu) - { - #ifdef GB_GPU_OMP - int my_thread = omp_get_thread_num(); - if (multi_gpu_mode!=ONE_GPU) - my_gpu=my_thread; - if (multi_gpu_mode==ONE_NODE) - my_gpu+=multi_gpu_param; - #endif - - ij[my_thread]=gb_gpu_init(ij_size, atom->ntypes+1, gamma, upsilon, mu, - shape, well, cutsq, sigma, epsilon, lshape, form, - lj1, lj2, lj3, lj4, offset, force->special_lj, - neighbor->oneatom, my_thread, my_gpu); - if (ij[my_thread]==0) - error->one("AT LEAST ONE PROCESS COULD NOT ALLOCATE A CUDA-ENABLED GPU."); - - if (ij_new[my_thread]!=NULL) - delete [] ij_new[my_thread]; - ij_new[my_thread]=new int[ij_size]; - } - - last_neighbor = -1; - neighbor->requests[irequest]->half = 0; - neighbor->requests[irequest]->full = 1; - if (force->newton_pair) - error->all("Cannot use newton with GPU GayBerne pair style."); - - if (comm->me == 0 && screen) { - printf("\n-------------------------------------"); - printf("-------------------------------------\n"); - printf("- Using GPGPU acceleration for Gay-Berne:\n"); - printf("-------------------------------------"); - printf("-------------------------------------\n"); - - for (int i=0; ioneatom); - printf("GPU %d: %s\n",gpui,gpu_string.c_str()); - } - printf("-------------------------------------"); - printf("-------------------------------------\n\n"); - } -} - -/* ---------------------------------------------------------------------- */ - -double PairGayBerneGPU::memory_usage() -{ - double bytes=Pair::memory_usage()+nthreads*ij_size*sizeof(int); - return bytes+gb_gpu_bytes(); -} diff --git a/src/GPU/pair_gayberne_gpu.h b/src/GPU/pair_gayberne_gpu.h deleted file mode 100644 index 7be34c3531..0000000000 --- a/src/GPU/pair_gayberne_gpu.h +++ /dev/null @@ -1,43 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - Copyright (2003) Sandia Corporation. Under the terms of Contract - DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains - certain rights in this software. This software is distributed under - the GNU General Public License. - - See the README file in the top-level LAMMPS directory. 
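To make the per-type precalculation in init_style above concrete: a type with semi-axes (1, 1, 3) gets shape = (1, 1, 9) and, by the formula shown, lshape = (1*1 + 3*3)*sqrt(1*1) = 10; these are the squared axes and shape scalar handed to the Gay-Berne GPU kernels via gb_gpu_init.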
-------------------------------------------------------------------------- */ - -#ifndef PAIR_GPU_H -#define PAIR_GPU_H - -#include "pair_gayberne.h" -#define MAX_GPU_THREADS 4 - -namespace LAMMPS_NS { - -class PairGayBerneGPU : public PairGayBerne { - public: - PairGayBerneGPU(LAMMPS *lmp); - ~PairGayBerneGPU(); - void compute(int, int); - void settings(int, char **); - void init_style(); - double memory_usage(); - - enum { ONE_NODE, ONE_GPU, MULTI_GPU }; - - private: - int ij_size; - int *ij[MAX_GPU_THREADS], *ij_new[MAX_GPU_THREADS], *olist[MAX_GPU_THREADS]; - - int my_thread, nthreads, thread_inum[MAX_GPU_THREADS], omp_chunk; - - int last_neighbor, multi_gpu_mode, multi_gpu_param; -}; - -} -#endif diff --git a/src/GPU/pair_lj_cut_gpu.cpp b/src/GPU/pair_lj_cut_gpu.cpp deleted file mode 100644 index d82f99e87d..0000000000 --- a/src/GPU/pair_lj_cut_gpu.cpp +++ /dev/null @@ -1,270 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - Copyright (2003) Sandia Corporation. Under the terms of Contract - DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains - cetain rights in this software. This software is distributed under - the GNU General Public License. - - See the README file in the top-level LAMMPS directory. -------------------------------------------------------------------------- */ - -/* ---------------------------------------------------------------------- - Contributing author: Mike Brown (SNL) -------------------------------------------------------------------------- */ - -#include "math.h" -#include "stdio.h" -#include "stdlib.h" -#include "string.h" -#include "pair_lj_cut_gpu.h" -#include "math_extra.h" -#include "atom.h" -#include "atom_vec.h" -#include "comm.h" -#include "force.h" -#include "neighbor.h" -#include "neigh_list.h" -#include "integrate.h" -#include "memory.h" -#include "error.h" -#include "neigh_request.h" -#include "universe.h" - -#define MIN(a,b) ((a) < (b) ? (a) : (b)) -#define MAX(a,b) ((a) > (b) ? 
(a) : (b)) - -// External functions from cuda library for force decomposition -int * lj_gpu_init(int &ij_size, const int ntypes, double **cutsq, - double **sigma, double **epsilon, double **host_lj1, - double **host_lj2, double **host_lj3, double **host_lj4, - double **offset, double *special_lj, const int max_nbors, - const int gpu_id); -void lj_gpu_clear(); -bool lj_gpu_reset_nbors(const int nall, const int inum, int *ilist, - const int *numj); -void lj_gpu_nbors(const int num_ij); -void lj_gpu_atom(double **host_x, const int *host_type, const bool rebuild); -void lj_gpu(const bool eflag, const bool vflag, const bool rebuild); -double lj_gpu_forces(double **f, const int *ilist, const bool eflag, - const bool vflag, const bool eflag_atom, - const bool vflag_atom, double *eatom, double **vatom, - double *virial); -std::string lj_gpu_name(const int gpu_id, const int max_nbors); -void lj_gpu_time(); -int lj_gpu_num_devices(); -double lj_gpu_bytes(); - -using namespace LAMMPS_NS; - -/* ---------------------------------------------------------------------- */ - -PairLJCutGPU::PairLJCutGPU(LAMMPS *lmp) : PairLJCut(lmp), multi_gpu_mode(0) -{ - ij_new=NULL; - respa_enable = 0; -} - -/* ---------------------------------------------------------------------- - free all arrays -------------------------------------------------------------------------- */ - -PairLJCutGPU::~PairLJCutGPU() -{ - if (comm->me == 0 && screen) { - printf("\n\n-------------------------------------"); - printf("--------------------------------\n"); - printf(" GPU Time Stamps: "); - printf("\n-------------------------------------"); - printf("--------------------------------\n"); - lj_gpu_time(); - std::cout << "Procs: " << universe->nprocs << std::endl; - printf("-------------------------------------"); - printf("--------------------------------\n\n"); - } - lj_gpu_clear(); - if (ij_new!=NULL) { - ij_new=NULL; - delete [] ij_new; - } -} - -/* ---------------------------------------------------------------------- */ - -void PairLJCutGPU::compute(int eflag, int vflag) -{ - if (eflag || vflag) ev_setup(eflag,vflag); - else evflag = vflag_fdotr = 0; - if (vflag_atom) - error->all("Per-atom virial not available with GPU Gay-Berne."); - - int nlocal = atom->nlocal; - int nall = nlocal + atom->nghost; - int inum = list->inum; - int *ilist = list->ilist; - - bool rebuild=false; - if (neighbor->ncalls > last_neighbor) { - last_neighbor=neighbor->ncalls; - rebuild=true; - } - - // copy nbors to GPU - if (rebuild) - if (!lj_gpu_reset_nbors(nall, inum, ilist, list->numneigh)) - error->one("Total # of atoms exceed maximum allowed per GPGPU.\n"); - - // copy atom data to GPU - lj_gpu_atom(atom->x,atom->type,rebuild); - - int i,j,ii,jj,jnum; - double factor_lj; - int *jlist; - - if (rebuild==true) { - int num_ij = 0; - - // loop over neighbors of my atoms - int *ijp=ij_new; - for (ii = 0; iifirstneigh[i]; - jnum = list->numneigh[i]; - - for (jj = 0; jj < jnum; jj++) { - j = jlist[jj]; - - *ijp=j; - ijp++; - num_ij++; - - if (num_ij==ij_size) { - memcpy(ij,ij_new,num_ij*sizeof(int)); - lj_gpu_nbors(num_ij); - ijp=ij_new; - num_ij=0; - } - } - } - if (num_ij>0) { - memcpy(ij,ij_new,num_ij*sizeof(int)); - lj_gpu_nbors(num_ij); - } - } - - lj_gpu(eflag,vflag,rebuild); - eng_vdwl=lj_gpu_forces(atom->f,ilist,eflag,vflag, eflag_atom, vflag_atom, - eatom, vatom, virial); - - if (vflag_fdotr) virial_compute(); -} - -/* ---------------------------------------------------------------------- - global settings 
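Both GPU pair styles use the same rebuild test seen above: cache neighbor->ncalls and treat any increase as a signal that the neighbor list changed and must be re-sent to the device. In isolation:

// Returns true exactly once per neighbor-list rebuild (illustrative helper).
static bool neighbor_rebuilt(const int ncalls, int &last_neighbor) {
  if (ncalls > last_neighbor) {
    last_neighbor = ncalls;
    return true;
  }
  return false;
}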
-------------------------------------------------------------------------- */ - -void PairLJCutGPU::settings(int narg, char **arg) -{ - if (narg != 1 && narg != 3) error->all("Illegal pair_style command"); - - // Set multi_gpu_mode to one_node for multiple gpus on 1 node - // -- param is starting gpu id - // Set multi_gpu_mode to one_gpu to select the same gpu id on every node - // -- param is id of gpu - // Set multi_gpu_mode to multi_gpu to get ma - // -- param is number of gpus per node - multi_gpu_mode=ONE_NODE; - multi_gpu_param=0; - if (narg==3) { - multi_gpu_param=atoi(arg[2]); - if (strcmp("one_node",arg[1])==0) - multi_gpu_mode=ONE_NODE; - else if (strcmp("one_gpu",arg[1])==0) - multi_gpu_mode=ONE_GPU; - else if (strcmp("multi_gpu",arg[1])==0) { - multi_gpu_mode=MULTI_GPU; - if (multi_gpu_param<1) - error->all("Illegal pair_style command"); - } else - error->all("Illegal pair_style command"); - narg-=2; - } - - PairLJCut::settings(narg,arg); -} - -/* ---------------------------------------------------------------------- - init specific to this pair style -------------------------------------------------------------------------- */ - -void PairLJCutGPU::init_style() -{ - // Set the GPU ID - int my_gpu=comm->me+multi_gpu_param; - int ngpus=universe->nprocs; - if (multi_gpu_mode==ONE_GPU) { - my_gpu=multi_gpu_param; - ngpus=1; - } else if (multi_gpu_mode==MULTI_GPU) { - ngpus=multi_gpu_param; - my_gpu=comm->me%ngpus; - if (ngpus>universe->nprocs) - ngpus=universe->nprocs; - } - - int irequest = neighbor->request(this); - cut_respa=NULL; - - // Repeat cutsq calculation because done after call to init_style - double cut; - for (int i = 1; i <= atom->ntypes; i++) - for (int j = i; j <= atom->ntypes; j++) { - cut = init_one(i,j); - cutsq[i][j] = cutsq[j][i] = cut*cut; - } - - ij=lj_gpu_init(ij_size, atom->ntypes+1, cutsq, sigma, epsilon, lj1, lj2, lj3, - lj4, offset, force->special_lj, neighbor->oneatom, my_gpu); - if (ij==0) - error->one("AT LEAST ONE PROCESS COULD NOT ALLOCATE A CUDA-ENABLED GPU."); - - if (ij_new!=NULL) - delete [] ij_new; - ij_new=new int[ij_size]; - - last_neighbor = -1; - neighbor->requests[irequest]->half = 0; - neighbor->requests[irequest]->full = 1; - if (force->newton_pair) - error->all("Cannot use newton with GPU LJCut pair style."); - - if (comm->me == 0 && screen) { - printf("\n-------------------------------------"); - printf("-------------------------------------\n"); - printf("- Using GPGPU acceleration for LJ-Cut:\n"); - printf("-------------------------------------"); - printf("-------------------------------------\n"); - - for (int i=0; ioneatom); - printf("GPU %d: %s\n",gpui,gpu_string.c_str()); - } - printf("-------------------------------------"); - printf("-------------------------------------\n\n"); - } -} - -/* ---------------------------------------------------------------------- */ - -double PairLJCutGPU::memory_usage() -{ - double bytes=Pair::memory_usage()+ij_size*sizeof(int); - return bytes+lj_gpu_bytes(); -} diff --git a/src/GPU/pair_lj_cut_gpu.h b/src/GPU/pair_lj_cut_gpu.h deleted file mode 100644 index 2774e723ed..0000000000 --- a/src/GPU/pair_lj_cut_gpu.h +++ /dev/null @@ -1,40 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - Copyright (2003) Sandia Corporation. 
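The device selection at the top of init_style above maps an MPI rank to a GPU according to the pair_style mode: one_node offsets the rank by a starting id, one_gpu pins every rank to one device, and multi_gpu round-robins ranks over a per-node GPU count. A condensed sketch (the enum matches the one in the class headers; the clamp of ngpus used only for the startup report is omitted):

enum { ONE_NODE, ONE_GPU, MULTI_GPU };

// me = MPI rank, param = trailing pair_style argument
static int select_gpu(const int mode, const int me, const int param) {
  if (mode == ONE_GPU)   return param;        // same device id on every node
  if (mode == MULTI_GPU) return me % param;   // round-robin over param GPUs
  return me + param;                          // ONE_NODE: rank plus starting id
}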
Under the terms of Contract - DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains - certain rights in this software. This software is distributed under - the GNU General Public License. - - See the README file in the top-level LAMMPS directory. -------------------------------------------------------------------------- */ - -#ifndef PAIR_LJ_CUT_GPU_H -#define PAIR_LJ_CUT_GPU_H - -#include "pair_lj_cut.h" - -namespace LAMMPS_NS { - -class PairLJCutGPU : public PairLJCut { - public: - PairLJCutGPU(LAMMPS *lmp); - ~PairLJCutGPU(); - void compute(int, int); - void settings(int, char **); - void init_style(); - double memory_usage(); - - enum { ONE_NODE, ONE_GPU, MULTI_GPU }; - - private: - int ij_size; - int *ij, *ij_new; - - int last_neighbor, multi_gpu_mode, multi_gpu_param; -}; - -} -#endif diff --git a/src/GPU/style_gpu.h b/src/GPU/style_gpu.h deleted file mode 100644 index 1be999ccdb..0000000000 --- a/src/GPU/style_gpu.h +++ /dev/null @@ -1,41 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - Copyright (2003) Sandia Corporation. Under the terms of Contract - DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains - certain rights in this software. This software is distributed under - the GNU General Public License. - - See the README file in the top-level LAMMPS directory. -------------------------------------------------------------------------- */ - -#ifdef AtomInclude -#endif - -#ifdef AtomClass -# endif - -#ifdef ComputeInclude -#endif - -#ifdef ComputeClass -#endif - -#ifdef FixInclude -#endif - -#ifdef FixClass -#endif - -#ifdef PairInclude -#include "pair_lj_cut_gpu.h" -#include "pair_gayberne_gpu.h" -#endif - -#ifdef PairClass -PairStyle(lj/cut/gpu,PairLJCutGPU) -PairStyle(gayberne/gpu,PairGayBerneGPU) -#endif -
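style_gpu.h follows the usual LAMMPS style-header layout: package headers are pulled in under PairInclude and mapped to style names under PairClass. A further GPU pair style would be registered the same way, for example (hypothetical style, shown only to illustrate the convention):

#ifdef PairInclude
#include "pair_morse_gpu.h"          // hypothetical new GPU pair style
#endif

#ifdef PairClass
PairStyle(morse/gpu,PairMorseGPU)    // exposes "pair_style morse/gpu"
#endif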