diff --git a/lib/README b/lib/README index a697ac0b9e..cd213217cf 100644 --- a/lib/README +++ b/lib/README @@ -9,6 +9,7 @@ not exist, you will need to edit one of the existing Makefiles. The libraries included with LAMMPS are the following: +gpu graphics processing unit (GPU) routines, currently NVIDIA specific poems POEMS rigid-body integration package from RPI meam modified embedded atom method (MEAM) potential from Greg Wagner reax ReaxFF potential, from Adri van Duin via Aidan Thompson diff --git a/lib/gpu/Makefile.nvidia b/lib/gpu/Makefile.nvidia new file mode 100644 index 0000000000..47e1fefbe4 --- /dev/null +++ b/lib/gpu/Makefile.nvidia @@ -0,0 +1,75 @@ +#*************************************************************************** +# Makefile +# ------------------- +# W. Michael Brown +# +# _________________________________________________________________________ +# Build for the LAMMPS GPU Force Library +# +# _________________________________________________________________________ +# +# begin : Tue June 23 2009 +# copyright : (C) 2009 by W. Michael Brown +# email : wmbrown@sandia.gov +# ***************************************************************************/ + +BIN_DIR = . +OBJ_DIR = . +AR = ar +CUDA_CPP = nvcc -I/usr/local/cuda/include -DUNIX -O3 -DDEBUG -Xptxas -v --use_fast_math +CUDA_ARCH = -maxrregcount 128 #-arch=sm_13 +CUDA_PREC = -D_SINGLE_SINGLE +CUDA_LINK = -L/usr/local/cuda/lib -lcudart $(CUDA_LIB) + +CUDA = $(CUDA_CPP) $(CUDA_ARCH) $(CUDA_PREC) + +CUDA_LIB = $(OBJ_DIR)/libgpu.a + +# Headers for CUDA Stuff +NVC_H = nvc_macros.h nvc_device.h nvc_timer.h nvc_memory.h +# Headers for Pair Stuff +PAIR_H = pair_gpu_texture.h pair_gpu_atom.h pair_gpu_nbor.h + +ALL_H = $(NVC_H) $(PAIR_H) + +EXECS = $(BIN_DIR)/nvc_get_devices +OBJS = $(OBJ_DIR)/nvc_device.o $(OBJ_DIR)/gb_gpu.cu_o \ + $(OBJ_DIR)/gb_gpu_memory.cu_o $(OBJ_DIR)/lj_gpu.cu_o \ + $(OBJ_DIR)/lj_gpu_memory.cu_o $(OBJ_DIR)/pair_gpu_atom.cu_o \ + $(OBJ_DIR)/pair_gpu_nbor.cu_o + +all: $(CUDA_LIB) $(EXECS) + +$(OBJ_DIR)/nvc_device.o: nvc_device.cu $(NVC_H) + $(CUDA) -o $@ -c nvc_device.cu + +$(OBJ_DIR)/pair_gpu_atom.cu_o: pair_gpu_atom.cu pair_gpu_texture.h pair_gpu_atom.h $(NVC_H) + $(CUDA) -o $@ -c pair_gpu_atom.cu + +$(OBJ_DIR)/pair_gpu_nbor.cu_o: pair_gpu_nbor.cu pair_gpu_texture.h pair_gpu_nbor.h $(NVC_H) + $(CUDA) -o $@ -c pair_gpu_nbor.cu + +$(OBJ_DIR)/lj_gpu_memory.cu_o: lj_gpu_memory.cu lj_gpu_memory.h $(ALL_H) + $(CUDA) -o $@ -c lj_gpu_memory.cu + +$(OBJ_DIR)/lj_gpu.cu_o: lj_gpu.cu $(ALL_H) lj_gpu_memory.h lj_gpu_kernel.h + $(CUDA) -o $@ -c lj_gpu.cu + +$(OBJ_DIR)/gb_gpu_memory.cu_o: gb_gpu_memory.cu gb_gpu_memory.h $(ALL_H) + $(CUDA) -o $@ -c gb_gpu_memory.cu + +$(OBJ_DIR)/gb_gpu.cu_o: gb_gpu.cu $(ALL_H) gb_gpu_memory.h gb_gpu_kernel.h gb_gpu_extra.h + $(CUDA) -o $@ -c gb_gpu.cu + +$(BIN_DIR)/nvc_get_devices: nvc_get_devices.cu $(NVC_H) $(OBJ_DIR)/nvc_device.o + $(CUDA) -o $@ nvc_get_devices.cu $(CUDA_LINK) $(OBJ_DIR)/nvc_device.o + +$(CUDA_LIB): $(OBJS) + $(AR) -crusv $(CUDA_LIB) $(OBJS) + +clean: + rm -rf $(EXECS) $(CUDA_LIB) $(OBJS) *.linkinfo + +veryclean: clean + rm -rf *~ *.linkinfo + diff --git a/lib/gpu/README b/lib/gpu/README new file mode 100644 index 0000000000..ed95c3eeab --- /dev/null +++ b/lib/gpu/README @@ -0,0 +1,84 @@ +/*************************************************************************** + README + ------------------- + W.
Michael Brown + + README for building LAMMPS GPU Library + + __________________________________________________________________________ + This file is part of the LAMMPS GPU Library + __________________________________________________________________________ + + begin : Thu Jun 25 2009 + copyright : (C) 2009 by W. Michael Brown + email : wmbrown@sandia.gov + ***************************************************************************/ + +/* ----------------------------------------------------------------------- + Copyright (2009) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + ----------------------------------------------------------------------- */ + + GENERAL NOTES + +This library, pair_gpu_lib.a, provides routines for GPGPU acceleration +of LAMMPS pair styles. Currently, only CUDA-enabled GPUs are +supported. Compilation of this library requires installing the CUDA +GPU driver and CUDA toolkit for your operating system. In addition to +the LAMMPS library, the binary nvc_get_devices will also be +built. This can be used to query the names and properties of GPU +devices on your system. + +NOTE: Installation of the CUDA SDK is not required. + +Current pair styles supporting GPU acceleration: + + 1. lj/cut/gpu + 2. gayberne/gpu + + MULTIPLE LAMMPS PROCESSES + +When using GPGPU acceleration, you are restricted to one physical GPU +per LAMMPS process. This can be multiple GPUs on a single node or +across multiple nodes. Instructions on GPU assignment can be found in +the LAMMPS documentation. + + SPEEDUPS + +The speedups that can be obtained using this library are highly +dependent on the GPU architecture and the computational expense of the +pair potential. When comparing a single-precision Tesla C1060 run to a +serial Intel Xeon 5140 2.33 GHz run, the speedup is ~4.42x for +lj/cut with a cutoff of 2.5. For gayberne with a cutoff of 7, the +speedup is >103x for 8000 particles. The speedup will improve with an +increase in the number of particles or an increase in the cutoff. + + BUILDING AND PRECISION MODES + +To build, edit the CUDA_CPP, CUDA_ARCH, CUDA_PREC, and CUDA_LINK variables in +the Makefile for your machine. Type make. Additionally, the GPU package must +be installed and compiled for LAMMPS. The library supports three precision +modes as determined by the CUDA_PREC variable: + + CUDA_PREC = -D_SINGLE_SINGLE # Single precision for all calculations + CUDA_PREC = -D_DOUBLE_DOUBLE # Double precision for all calculations + CUDA_PREC = -D_SINGLE_DOUBLE # Accumulation of forces, etc. in double + +NOTE: Double precision is only supported on certain GPUs (with + compute capability >= 1.3). + +NOTE: The gayberne/gpu pair style will only be installed if the ASPHERE + package has been installed before installing the GPU package in LAMMPS. + + GPU MEMORY + +Upon initialization of the pair style, the library will reserve memory +for 64K atoms per GPU or 70% of each card's GPU memory, whichever value +is limiting. The value of 70% can be changed by editing the +PERCENT_GPU_MEMORY definition in the source file. The value of 64K +cannot be increased and is the maximum number of atoms allowed per +GPU. Using the 'neigh_modify one' modifier in your LAMMPS input script +can help to increase the maximum number of atoms per GPU for cards with +limited memory.
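The three CUDA_PREC settings above select the floating-point type used for the pairwise computation and the type used for accumulation; the kernels in this library are templated on these as numtyp and acctyp. The sketch below shows one plausible way such flags can be mapped onto concrete types; the typedef names and header layout are illustrative assumptions, not the library's actual source.

// Illustrative sketch only (assumed names, not the library's actual header):
// how the -D_SINGLE_SINGLE / -D_DOUBLE_DOUBLE / -D_SINGLE_DOUBLE build flags
// could select the compute type (numtyp) and accumulation type (acctyp)
// that the templated kernels are instantiated with.
#ifdef _SINGLE_SINGLE
typedef float  numtyp;    // all pairwise math in single precision
typedef float  acctyp;    // forces, torques, energies accumulated in single
#endif
#ifdef _SINGLE_DOUBLE
typedef float  numtyp;    // pairwise math in single precision
typedef double acctyp;    // accumulation of forces, etc. in double
#endif
#ifdef _DOUBLE_DOUBLE
typedef double numtyp;    // requires a GPU with compute capability >= 1.3
typedef double acctyp;
#endif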
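The memory rule described above (64K atoms per GPU or 70% of the card's memory, whichever is limiting) amounts to taking the smaller of a hard cap and a budget-derived atom count. The helper below is a hypothetical illustration of that rule; the function name, the per-atom cost arguments, and their breakdown are assumptions, not the library's get_max_atoms() implementation.

// Hypothetical sketch of the sizing rule from the README: room for at most
// 64K atoms per GPU, or for as many atoms as fit in PERCENT_GPU_MEMORY of
// the card, whichever is smaller. Real per-atom costs depend on the
// precision mode and the maximum number of neighbors.
#include <cstddef>

#define PERCENT_GPU_MEMORY 0.7   // fraction of card memory the library may use

inline std::size_t sketch_max_atoms(const std::size_t gpu_bytes,
                                    const std::size_t bytes_per_atom,
                                    const std::size_t max_nbors,
                                    const std::size_t bytes_per_nbor) {
  const std::size_t hard_cap = 64 * 1024;   // 64K atom limit per GPU
  const std::size_t per_atom = bytes_per_atom + max_nbors * bytes_per_nbor;
  const std::size_t budget =
      static_cast<std::size_t>(PERCENT_GPU_MEMORY * gpu_bytes);
  const std::size_t fit = budget / per_atom; // atoms that fit in the budget
  return fit < hard_cap ? fit : hard_cap;    // whichever value is limiting
}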
diff --git a/lib/gpu/gb_gpu.cu b/lib/gpu/gb_gpu.cu new file mode 100644 index 0000000000..cde56d569a --- /dev/null +++ b/lib/gpu/gb_gpu.cu @@ -0,0 +1,518 @@ +/*************************************************************************** + gb_gpu.cu + ------------------- + W. Michael Brown + + Gay-Berne anisotropic potential GPU calcultation + + *** Force decomposition by Atom Version *** + + __________________________________________________________________________ + This file is part of the LAMMPS GPU Library + __________________________________________________________________________ + + begin : Tue Jun 23 2009 + copyright : (C) 2009 by W. Michael Brown + email : wmbrown@sandia.gov + ***************************************************************************/ + +/* ----------------------------------------------------------------------- + Copyright (2009) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + ----------------------------------------------------------------------- */ + +#include +#include +#include "nvc_macros.h" +#include "nvc_timer.h" +#include "nvc_device.h" +#include "gb_gpu_memory.h" +#include "gb_gpu_kernel.h" + +using namespace std; + +static GB_GPU_Memory GBMF[MAX_GPU_THREADS]; +#define GBMT GB_GPU_Memory + +// --------------------------------------------------------------------------- +// Pack neighbors from dev_ij array into dev_nbor matrix for coalesced access +// -- Only pack neighbors matching the specified inclusive range of forms +// -- Only pack neighbors within cutoff +// --------------------------------------------------------------------------- +template +__global__ void kernel_pack_nbor(int *dev_nbor, const int nbor_pitch, + const int start, const int inum, + const int *dev_ij, const int form_low, + const int form_high, const int nall) { + + // ii indexes the two interacting particles in gi + int ii=threadIdx.x+INT_MUL(blockIdx.x,blockDim.x)+start; + + if (ii(i,0); + numtyp iy=_x_(i,1); + numtyp iz=_x_(i,2); + int itype=_x_(i,7); + + int newj=0; + for ( ; list=nall) + j%=nall; + int jtype=_x_(j,7); + + if (_form_(itype,jtype)>=form_low && _form_(itype,jtype)<=form_high) { + // Compute r12; + numtyp rsq=_x_(j,0)-ix; + rsq*=rsq; + numtyp t=_x_(j,1)-iy; + rsq+=t*t; + t=_x_(j,2)-iz; + rsq+=t*t; + + if (rsq< _cutsq_(itype,jtype)) { + *nbor=j; + nbor+=nbor_pitch; + newj++; + } + } + } + *nbor_newj=newj; + } +} + +// --------------------------------------------------------------------------- +// Pack neighbors from dev_ij array into dev_nbor matrix for coalesced access +// -- Only pack neighbors matching the specified inclusive range of forms +// -- Only pack neighbors within cutoff +// -- Fast version of routine that uses shared memory for LJ constants +// --------------------------------------------------------------------------- +template +__global__ void kernel_pack_nbor_fast(int *dev_nbor, const int nbor_pitch, + const int start, const int inum, + const int *dev_ij, const int form_low, + const int form_high, const int nall) { + + int ii=threadIdx.x; + __shared__ int form[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; + __shared__ numtyp cutsq[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; + if (ii(itype,jtype); + form[ii]=_form_(itype,jtype); + } + ii+=INT_MUL(blockIdx.x,blockDim.x)+start; + + if (ii(i,0); + numtyp iy=_x_(i,1); + numtyp iz=_x_(i,2); + int itype=INT_MUL(MAX_SHARED_TYPES,_x_(i,7)); + + int 
newj=0; + for ( ; list=nall) + j%=nall; + int mtype=itype+_x_(j,7); + + if (form[mtype]>=form_low && form[mtype]<=form_high) { + // Compute r12; + numtyp rsq=_x_(j,0)-ix; + rsq*=rsq; + numtyp t=_x_(j,1)-iy; + rsq+=t*t; + t=_x_(j,2)-iz; + rsq+=t*t; + + if (rsq +void pack_nbors(GBMT &gbm, const int GX, const int BX, const int start, + const int inum, const int form_low, const int form_high) { + if (gbm.shared_types) + kernel_pack_nbor_fast<<>> + (gbm.nbor.dev_nbor.begin(), gbm.nbor.dev_nbor.row_size(), start, inum, + gbm.nbor.ij.begin(),form_low,form_high,gbm.atom.nall()); + else + kernel_pack_nbor<<>> + (gbm.nbor.dev_nbor.begin(), gbm.nbor.dev_nbor.row_size(), start, inum, + gbm.nbor.ij.begin(),form_low,form_high,gbm.atom.nall()); +} + +// --------------------------------------------------------------------------- +// Convert something to a string +// --------------------------------------------------------------------------- +#include +template +inline string gb_gpu_toa(const t& in) { + ostringstream o; + o.precision(2); + o << in; + return o.str(); +} + +// --------------------------------------------------------------------------- +// Return string with GPU info +// --------------------------------------------------------------------------- +string gb_gpu_name(const int id, const int max_nbors) { + string name=GBMF[0].gpu.name(id)+", "+ + gb_gpu_toa(GBMF[0].gpu.cores(id))+" cores, "+ + gb_gpu_toa(GBMF[0].gpu.gigabytes(id))+" GB, "+ + gb_gpu_toa(GBMF[0].gpu.clock_rate(id))+" GHZ, "+ + gb_gpu_toa(GBMF[0].get_max_atoms(GBMF[0].gpu.bytes(id), + max_nbors))+" Atoms"; + return name; +} + +// --------------------------------------------------------------------------- +// Allocate memory on host and device and copy constants to device +// --------------------------------------------------------------------------- +int * gb_gpu_init(int &ij_size, const int ntypes, const double gamma, + const double upsilon, const double mu, double **shape, + double **well, double **cutsq, double **sigma, + double **epsilon, double *host_lshape, int **form, + double **host_lj1, double **host_lj2, double **host_lj3, + double **host_lj4, double **offset, double *special_lj, + const int max_nbors, const int thread, const int gpu_id) { + assert(thread +inline void _gb_gpu_atom(PairGPUAtom &atom, double **host_x, + double **host_quat, const int *host_type, + const bool rebuild, cudaStream_t &stream) { + atom.time_atom.start(); + atom.reset_write_buffer(); + + // Rows 1-3 of dev_x are position; rows 4-7 are quaternion + atom.add_atom_data(host_x[0],3); + atom.add_atom_data(host_x[0]+1,3); + atom.add_atom_data(host_x[0]+2,3); + atom.add_atom_data(host_quat[0],4); + atom.add_atom_data(host_quat[0]+1,4); + atom.add_atom_data(host_quat[0]+2,4); + atom.add_atom_data(host_quat[0]+3,4); + + int csize=7; + + // If a rebuild occured, copy type data + if (rebuild) { + atom.add_atom_data(host_type); + csize++; + } + + atom.copy_atom_data(csize,stream); + atom.time_atom.stop(); +} + +void gb_gpu_atom(double **host_x, double **host_quat, + const int *host_type, const bool rebuild, const int thread) { + _gb_gpu_atom(GBMF[thread].atom, host_x, host_quat, host_type, rebuild, + GBMF[thread].pair_stream); +} + +// --------------------------------------------------------------------------- +// Signal that we need to transfer a new neighbor list +// --------------------------------------------------------------------------- +template +int * _gb_gpu_reset_nbors(gbmtyp &gbm, const int nall, const int nlocal, + const int inum, int *ilist, 
const int *numj, + const int *type, bool &success) { + if (nall>gbm.max_atoms) { + success=false; + return 0; + } + success=true; + + gbm.nbor.time_nbor.start(); + + gbm.atom.nall(nall); + gbm.atom.inum(inum); + + if (gbm.multiple_forms) { + int p=0, acc=0; + for (int i=0; i +void _gb_gpu_nbors(gbmtyp &gbm, const int num_ij, const bool eflag) { + gbm.nbor.time_nbor.add_to_total(); + // CUDA_SAFE_CALL(cudaStreamSynchronize(gbm.pair_stream)); // Not if timed + + gbm.nbor.time_nbor.start(); + gbm.nbor.add(num_ij,gbm.pair_stream); + gbm.nbor.time_nbor.stop(); +} + +void gb_gpu_nbors(const int num_ij, const bool eflag, const int thread) { + _gb_gpu_nbors(GBMF[thread],num_ij,eflag); +} + +// --------------------------------------------------------------------------- +// Calculate energies, forces, and torques for all ij interactions +// --------------------------------------------------------------------------- +template +void _gb_gpu_gayberne(GBMT &gbm, const bool eflag, const bool vflag, + const bool rebuild) { + // Compute the block size and grid size to keep all cores busy + const int BX=BLOCK_1D; + + int GX=static_cast(ceil(static_cast(gbm.atom.inum())/BX)); + + if (gbm.multiple_forms) { + gbm.time_kernel.start(); + if (gbm.last_ellipse>0) { + // ------------ ELLIPSE_ELLIPSE and ELLIPSE_SPHERE --------------- + GX=static_cast(ceil(static_cast(gbm.last_ellipse)/ + static_cast(BX))); + pack_nbors(gbm,GX,BX, 0, gbm.last_ellipse,SPHERE_ELLIPSE,ELLIPSE_ELLIPSE); + gbm.time_kernel.stop(); + + gbm.time_gayberne.start(); + kernel_gayberne<<>> + (gbm.gamma_upsilon_mu.begin(), gbm.special_lj.begin(), + gbm.nbor.dev_nbor.begin(), gbm.nbor.dev_nbor.row_size(), + gbm.atom.ans.begin(), gbm.atom.ans.row_size(),gbm.dev_error.begin(), + eflag, vflag, gbm.last_ellipse, gbm.atom.nall()); + gbm.time_gayberne.stop(); + + if (gbm.last_ellipse==gbm.atom.inum()) { + gbm.time_kernel2.start(); + gbm.time_kernel2.stop(); + gbm.time_gayberne2.start(); + gbm.time_gayberne2.stop(); + gbm.time_pair.start(); + gbm.time_pair.stop(); + return; + } + + // ------------ SPHERE_ELLIPSE --------------- + + gbm.time_kernel2.start(); + GX=static_cast(ceil(static_cast(gbm.atom.inum()- + gbm.last_ellipse)/BX)); + pack_nbors(gbm,GX,BX,gbm.last_ellipse,gbm.atom.inum(),ELLIPSE_SPHERE, + ELLIPSE_SPHERE); + gbm.time_kernel2.stop(); + + gbm.time_gayberne2.start(); + kernel_sphere_gb<<>> + (gbm.gamma_upsilon_mu.begin(), gbm.special_lj.begin(), + gbm.nbor.dev_nbor.begin(), gbm.nbor.dev_nbor.row_size(), + gbm.atom.ans.begin(), gbm.atom.ans.row_size(),gbm.dev_error.begin(), + eflag, vflag, gbm.last_ellipse, gbm.atom.inum(), gbm.atom.nall()); + gbm.time_gayberne2.stop(); + } else { + gbm.atom.ans.zero(); + gbm.time_kernel.stop(); + gbm.time_gayberne.start(); + gbm.time_gayberne.stop(); + gbm.time_kernel2.start(); + gbm.time_kernel2.stop(); + gbm.time_gayberne2.start(); + gbm.time_gayberne2.stop(); + } + + // ------------ LJ --------------- + gbm.time_pair.start(); + if (gbm.last_ellipse<<>> + (gbm.special_lj.begin(), gbm.nbor.dev_nbor.begin(), + gbm.nbor.ij.begin(), gbm.nbor.dev_nbor.row_size(), + gbm.atom.ans.begin(), gbm.atom.ans.row_size(), + gbm.dev_error.begin(), eflag, vflag, gbm.last_ellipse, + gbm.atom.inum(), gbm.atom.nall()); + else + kernel_lj<<>> + (gbm.special_lj.begin(), gbm.nbor.dev_nbor.begin(), + gbm.nbor.ij.begin(), gbm.nbor.dev_nbor.row_size(), + gbm.atom.ans.begin(), gbm.atom.ans.row_size(),gbm.dev_error.begin(), + eflag, vflag, gbm.last_ellipse, gbm.atom.inum(), gbm.atom.nall()); + } + gbm.time_pair.stop(); + } else { + 
gbm.time_kernel.start(); + pack_nbors(gbm, GX, BX, 0, gbm.atom.inum(),SPHERE_SPHERE,ELLIPSE_ELLIPSE); + gbm.time_kernel.stop(); + + gbm.time_gayberne.start(); + kernel_gayberne<<>> + (gbm.gamma_upsilon_mu.begin(), gbm.special_lj.begin(), + gbm.nbor.dev_nbor.begin(), gbm.nbor.dev_nbor.row_size(), + gbm.atom.ans.begin(), gbm.atom.ans.row_size(), gbm.dev_error.begin(), + eflag, vflag, gbm.atom.inum(), gbm.atom.nall()); + gbm.time_gayberne.stop(); + } +} + +void gb_gpu_gayberne(const bool eflag, const bool vflag, const bool rebuild, + const int thread) { + _gb_gpu_gayberne(GBMF[thread],eflag,vflag,rebuild); +} + +// --------------------------------------------------------------------------- +// Get energies, forces, and torques to host +// --------------------------------------------------------------------------- +template +double _gb_gpu_forces(GBMT &gbm, double **f, double **tor, const int *ilist, + const bool eflag, const bool vflag, const bool eflag_atom, + const bool vflag_atom, double *eatom, double **vatom, + double *virial) { + double evdw; + + gbm.atom.time_answer.start(); + gbm.atom.copy_answers(eflag,vflag,gbm.pair_stream); + + gbm.atom.time_atom.add_to_total(); + gbm.nbor.time_nbor.add_to_total(); + gbm.time_kernel.add_to_total(); + gbm.time_gayberne.add_to_total(); + if (gbm.multiple_forms) { + gbm.time_kernel2.add_to_total(); + gbm.time_gayberne2.add_to_total(); + gbm.time_pair.add_to_total(); + } + // CUDA_SAFE_CALL(cudaStreamSynchronize(gbm.pair_stream)); // Not if timed + + evdw=gbm.atom.energy_virial(ilist,eflag_atom,vflag_atom,eatom,vatom,virial); + gbm.atom.add_forces(ilist,f); + gbm.atom.add_torques(ilist,tor,gbm.last_ellipse); + gbm.atom.time_answer.stop(); + gbm.atom.time_answer.add_to_total(); + return evdw; +} + +double gb_gpu_forces(double **f, double **tor, const int *ilist, + const bool eflag, const bool vflag, const bool eflag_atom, + const bool vflag_atom, double *eatom, double **vatom, + double *virial, const int thread) { + return _gb_gpu_forces + (GBMF[thread],f,tor,ilist,eflag,vflag,eflag_atom, + vflag_atom,eatom,vatom,virial); +} + +void gb_gpu_time(const int i) { + cout.precision(4); + cout << "Atom copy: " << GBMF[i].atom.time_atom.total_seconds() + << " s.\n" + << "Neighbor copy: " << GBMF[i].nbor.time_nbor.total_seconds() + << " s.\n" + << "Neighbor pack: " << GBMF[i].time_kernel.total_seconds()+ + GBMF[i].time_kernel2.total_seconds() << " s.\n" + << "Force calc: " << GBMF[i].time_gayberne.total_seconds()+ + GBMF[i].time_gayberne2.total_seconds()<< " s.\n"; + if (GBMF[i].multiple_forms) + cout << "LJ calc: " << GBMF[i].time_pair.total_seconds() << " s.\n"; + cout << "Answer copy: " << GBMF[i].atom.time_answer.total_seconds() + << " s.\n"; +} + +int gb_gpu_num_devices() { + return GBMF[0].gpu.num_devices(); +} + +double gb_gpu_bytes() { + return GBMF[0].host_memory_usage(); +} diff --git a/lib/gpu/gb_gpu_extra.h b/lib/gpu/gb_gpu_extra.h new file mode 100644 index 0000000000..c918e07070 --- /dev/null +++ b/lib/gpu/gb_gpu_extra.h @@ -0,0 +1,337 @@ +/*************************************************************************** + gb_gpu_extra.h + ------------------- + W. Michael Brown + + Inline GPU kernel routines ala math_extra for the CPU. + + __________________________________________________________________________ + This file is part of the LAMMPS GPU Library + __________________________________________________________________________ + + begin : Tue Jun 23 2009 + copyright : (C) 2009 by W. 
Michael Brown + email : wmbrown@sandia.gov + ***************************************************************************/ + +/* ----------------------------------------------------------------------- + Copyright (2009) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + ----------------------------------------------------------------------- */ + +#ifndef GB_GPU_EXTRA_H +#define GB_GPU_EXTRA_H + +#include "math.h" +#include "stdio.h" +#include "string.h" + +/* ---------------------------------------------------------------------- + Atomic update of global memory +------------------------------------------------------------------------- */ +/* +template __device__ +inline void atomicAdd(numtyp *address, numtyp val); + +template <> +__device__ inline void atomicAdd(float *address, float val) +{ + int i_val = __float_as_int(val); + int tmp0 = 0; + int tmp1; + + while( (tmp1 = atomicCAS((int *)address, tmp0, i_val)) != tmp0) { + tmp0 = tmp1; + i_val = __float_as_int(val + __int_as_float(tmp1)); + } +}*/ + +/* ---------------------------------------------------------------------- + dot product of 2 vectors +------------------------------------------------------------------------- */ + +template +static __inline__ __device__ numtyp gpu_dot3(const numtyp *v1, const numtyp *v2) +{ + return v1[0]*v2[0]+v1[1]*v2[1]+v1[2]*v2[2]; +} + +/* ---------------------------------------------------------------------- + cross product of 2 vectors +------------------------------------------------------------------------- */ + +template +static __inline__ __device__ void gpu_cross3(const numtyp *v1, + const numtyp *v2, numtyp *ans) +{ + ans[0] = v1[1]*v2[2]-v1[2]*v2[1]; + ans[1] = v1[2]*v2[0]-v1[0]*v2[2]; + ans[2] = v1[0]*v2[1]-v1[1]*v2[0]; +} + +/* ---------------------------------------------------------------------- + determinant of a matrix +------------------------------------------------------------------------- */ + +template +static __inline__ __device__ numtyp gpu_det3(const numtyp m[9]) +{ + numtyp ans = m[0]*m[4]*m[8] - m[0]*m[5]*m[7] - + m[3]*m[1]*m[8] + m[3]*m[2]*m[7] + + m[6]*m[1]*m[5] - m[6]*m[2]*m[4]; + return ans; +} + +/* ---------------------------------------------------------------------- + diagonal matrix times a full matrix +------------------------------------------------------------------------- */ + +template +static __inline__ __device__ void gpu_well_times3(const int i, const numtyp m[9], + numtyp ans[9]) +{ + ans[0] = _well_(i,0)*m[0]; + ans[1] = _well_(i,0)*m[1]; + ans[2] = _well_(i,0)*m[2]; + ans[3] = _well_(i,1)*m[3]; + ans[4] = _well_(i,1)*m[4]; + ans[5] = _well_(i,1)*m[5]; + ans[6] = _well_(i,2)*m[6]; + ans[7] = _well_(i,2)*m[7]; + ans[8] = _well_(i,2)*m[8]; +} + +/* ---------------------------------------------------------------------- + diagonal matrix times a full matrix +------------------------------------------------------------------------- */ + +template +static __inline__ __device__ void gpu_shape_times3(const int i, const numtyp m[9], + numtyp ans[9]) +{ + ans[0] = _shape_(i,0)*m[0]; + ans[1] = _shape_(i,0)*m[1]; + ans[2] = _shape_(i,0)*m[2]; + ans[3] = _shape_(i,1)*m[3]; + ans[4] = _shape_(i,1)*m[4]; + ans[5] = _shape_(i,1)*m[5]; + ans[6] = _shape_(i,2)*m[6]; + ans[7] = _shape_(i,2)*m[7]; + ans[8] = _shape_(i,2)*m[8]; +} + +/* ---------------------------------------------------------------------- + 
add two matrices +------------------------------------------------------------------------- */ + +template +static __inline__ __device__ void gpu_plus3(const numtyp m[9], + const numtyp m2[9], numtyp ans[9]) +{ + ans[0] = m[0]+m2[0]; + ans[1] = m[1]+m2[1]; + ans[2] = m[2]+m2[2]; + ans[3] = m[3]+m2[3]; + ans[4] = m[4]+m2[4]; + ans[5] = m[5]+m2[5]; + ans[6] = m[6]+m2[6]; + ans[7] = m[7]+m2[7]; + ans[8] = m[8]+m2[8]; +} + +/* ---------------------------------------------------------------------- + multiply the transpose of mat1 times mat2 +------------------------------------------------------------------------- */ + +template +static __inline__ __device__ void gpu_transpose_times3(const numtyp m[9], + const numtyp m2[9], + numtyp ans[9]) +{ + ans[0] = m[0]*m2[0]+m[3]*m2[3]+m[6]*m2[6]; + ans[1] = m[0]*m2[1]+m[3]*m2[4]+m[6]*m2[7]; + ans[2] = m[0]*m2[2]+m[3]*m2[5]+m[6]*m2[8]; + ans[3] = m[1]*m2[0]+m[4]*m2[3]+m[7]*m2[6]; + ans[4] = m[1]*m2[1]+m[4]*m2[4]+m[7]*m2[7]; + ans[5] = m[1]*m2[2]+m[4]*m2[5]+m[7]*m2[8]; + ans[6] = m[2]*m2[0]+m[5]*m2[3]+m[8]*m2[6]; + ans[7] = m[2]*m2[1]+m[5]*m2[4]+m[8]*m2[7]; + ans[8] = m[2]*m2[2]+m[5]*m2[5]+m[8]*m2[8]; +} + +/* ---------------------------------------------------------------------- + row vector times matrix +------------------------------------------------------------------------- */ + +template +static __inline__ __device__ void gpu_row_times3(const numtyp *v, + const numtyp m[9], numtyp *ans) +{ + ans[0] = m[0]*v[0]+v[1]*m[3]+v[2]*m[6]; + ans[1] = v[0]*m[1]+m[4]*v[1]+v[2]*m[7]; + ans[2] = v[0]*m[2]+v[1]*m[5]+m[8]*v[2]; +} + +/* ---------------------------------------------------------------------- + solve Ax = b or M ans = v + use gaussian elimination & partial pivoting on matrix + error_flag set to 2 if bad matrix inversion attempted +------------------------------------------------------------------------- */ + +template +static __inline__ __device__ void gpu_mldivide3(const numtyp m[9], + const numtyp *v, numtyp *ans, + int *error_flag) +{ + // create augmented matrix for pivoting + + numtyp aug[12], t; + + aug[3] = v[0]; + aug[0] = m[0]; + aug[1] = m[1]; + aug[2] = m[2]; + aug[7] = v[1]; + aug[4] = m[3]; + aug[5] = m[4]; + aug[6] = m[5]; + aug[11] = v[2]; + aug[8] = m[6]; + aug[9] = m[7]; + aug[10] = m[8]; + + if (fabs(aug[4]) > fabs(aug[0])) { + numtyp swapt; + swapt=aug[0]; aug[0]=aug[4]; aug[4]=swapt; + swapt=aug[1]; aug[1]=aug[5]; aug[5]=swapt; + swapt=aug[2]; aug[2]=aug[6]; aug[6]=swapt; + swapt=aug[3]; aug[3]=aug[7]; aug[7]=swapt; + } + if (fabs(aug[8]) > fabs(aug[0])) { + numtyp swapt; + swapt=aug[0]; aug[0]=aug[8]; aug[8]=swapt; + swapt=aug[1]; aug[1]=aug[9]; aug[9]=swapt; + swapt=aug[2]; aug[2]=aug[10]; aug[10]=swapt; + swapt=aug[3]; aug[3]=aug[11]; aug[11]=swapt; + } + + if (aug[0] != (numtyp)0.0) { + if (0!=0) { + numtyp swapt; + swapt=aug[0]; aug[0]=aug[0]; aug[0]=swapt; + swapt=aug[1]; aug[1]=aug[1]; aug[1]=swapt; + swapt=aug[2]; aug[2]=aug[2]; aug[2]=swapt; + swapt=aug[3]; aug[3]=aug[3]; aug[3]=swapt; + } + } else if (aug[4] != (numtyp)0.0) { + if (1!=0) { + numtyp swapt; + swapt=aug[0]; aug[0]=aug[4]; aug[4]=swapt; + swapt=aug[1]; aug[1]=aug[5]; aug[5]=swapt; + swapt=aug[2]; aug[2]=aug[6]; aug[6]=swapt; + swapt=aug[3]; aug[3]=aug[7]; aug[7]=swapt; + } + } else if (aug[8] != (numtyp)0.0) { + if (2!=0) { + numtyp swapt; + swapt=aug[0]; aug[0]=aug[8]; aug[8]=swapt; + swapt=aug[1]; aug[1]=aug[9]; aug[9]=swapt; + swapt=aug[2]; aug[2]=aug[10]; aug[10]=swapt; + swapt=aug[3]; aug[3]=aug[11]; aug[11]=swapt; + } + } else + *error_flag=2; + + t = 
aug[4]/aug[0]; + aug[5]-=t*aug[1]; + aug[6]-=t*aug[2]; + aug[7]-=t*aug[3]; + t = aug[8]/aug[0]; + aug[9]-=t*aug[1]; + aug[10]-=t*aug[2]; + aug[11]-=t*aug[3]; + + if (fabs(aug[9]) > fabs(aug[5])) { + numtyp swapt; + swapt=aug[4]; aug[4]=aug[8]; aug[8]=swapt; + swapt=aug[5]; aug[5]=aug[9]; aug[9]=swapt; + swapt=aug[6]; aug[6]=aug[10]; aug[10]=swapt; + swapt=aug[7]; aug[7]=aug[11]; aug[11]=swapt; + } + + if (aug[5] != (numtyp)0.0) { + if (1!=1) { + numtyp swapt; + swapt=aug[4]; aug[4]=aug[4]; aug[4]=swapt; + swapt=aug[5]; aug[5]=aug[5]; aug[5]=swapt; + swapt=aug[6]; aug[6]=aug[6]; aug[6]=swapt; + swapt=aug[7]; aug[7]=aug[7]; aug[7]=swapt; + } + } else if (aug[9] != (numtyp)0.0) { + if (2!=1) { + numtyp swapt; + swapt=aug[4]; aug[4]=aug[8]; aug[8]=swapt; + swapt=aug[5]; aug[5]=aug[9]; aug[9]=swapt; + swapt=aug[6]; aug[6]=aug[10]; aug[10]=swapt; + swapt=aug[7]; aug[7]=aug[11]; aug[11]=swapt; + } + } + + t = aug[9]/aug[5]; + aug[10]-=t*aug[6]; + aug[11]-=t*aug[7]; + + if (aug[10] == (numtyp)0.0) + *error_flag=2; + + ans[2] = aug[11]/aug[10]; + t = (numtyp)0.0; + t += aug[6]*ans[2]; + ans[1] = (aug[7]-t) / aug[5]; + t = (numtyp)0.0; + t += aug[1]*ans[1]; + t += aug[2]*ans[2]; + ans[0] = (aug[3]-t) / aug[0]; +} + +/* ---------------------------------------------------------------------- + compute rotation matrix from quaternion conjugate + quat = [w i j k] +------------------------------------------------------------------------- */ + +template +static __inline__ __device__ void gpu_quat_to_mat_trans(const int qi, + numtyp mat[9]) +{ + numtyp qi3=_x_(qi,3); + numtyp qi4=_x_(qi,4); + numtyp qi5=_x_(qi,5); + numtyp qi6=_x_(qi,6); + + numtyp w2 = qi3*qi3; + numtyp i2 = qi4*qi4; + numtyp j2 = qi5*qi5; + numtyp k2 = qi6*qi6; + numtyp twoij = 2.0*qi4*qi5; + numtyp twoik = 2.0*qi4*qi6; + numtyp twojk = 2.0*qi5*qi6; + numtyp twoiw = 2.0*qi4*qi3; + numtyp twojw = 2.0*qi5*qi3; + numtyp twokw = 2.0*qi6*qi3; + + mat[0] = w2+i2-j2-k2; + mat[3] = twoij-twokw; + mat[6] = twojw+twoik; + + mat[1] = twoij+twokw; + mat[4] = w2-i2+j2-k2; + mat[7] = twojk-twoiw; + + mat[2] = twoik-twojw; + mat[5] = twojk+twoiw; + mat[8] = w2-i2-j2+k2; +} + +#endif diff --git a/lib/gpu/gb_gpu_kernel.h b/lib/gpu/gb_gpu_kernel.h new file mode 100644 index 0000000000..8ca047e0e1 --- /dev/null +++ b/lib/gpu/gb_gpu_kernel.h @@ -0,0 +1,861 @@ +/*************************************************************************** + gb_gpu_kernel.cu + ------------------- + W. Michael Brown + + Routines that actually perform the force/torque computation + + *** Force Decomposition by Atom Version *** + __________________________________________________________________________ + This file is part of the LAMMPS GPU Library + __________________________________________________________________________ + + begin : Tue Jun 23 2009 + copyright : (C) 2009 by W. Michael Brown + email : wmbrown@sandia.gov + ***************************************************************************/ + +/* ----------------------------------------------------------------------- + Copyright (2009) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. 
+ ----------------------------------------------------------------------- */ + +#ifndef GB_GPU_KERNEL +#define GB_GPU_KERNEL + +#include "gb_gpu_extra.h" + +template +static __inline__ __device__ void compute_eta_torque(numtyp m[9], + numtyp m2[9], + const int i, + numtyp ans[9]) +{ + numtyp den = m[3]*m[2]*m[7]-m[0]*m[5]*m[7]- + m[2]*m[6]*m[4]+m[1]*m[6]*m[5]- + m[3]*m[1]*m[8]+m[0]*m[4]*m[8]; + den = (numtyp)1.0/den; + + numtyp shapex=_shape_(i,0); + numtyp shapey=_shape_(i,1); + numtyp shapez=_shape_(i,2); + + ans[0] = shapex*(m[5]*m[1]*m2[2]+(numtyp)2.0*m[4]*m[8]*m2[0]- + m[4]*m2[2]*m[2]-(numtyp)2.0*m[5]*m2[0]*m[7]+ + m2[1]*m[2]*m[7]-m2[1]*m[1]*m[8]- + m[3]*m[8]*m2[1]+m[6]*m[5]*m2[1]+ + m[3]*m2[2]*m[7]-m2[2]*m[6]*m[4])*den; + + ans[1] = shapex*(m[2]*m2[0]*m[7]-m[8]*m2[0]*m[1]+ + (numtyp)2.0*m[0]*m[8]*m2[1]-m[0]*m2[2]*m[5]- + (numtyp)2.0*m[6]*m[2]*m2[1]+m2[2]*m[3]*m[2]- + m[8]*m[3]*m2[0]+m[6]*m2[0]*m[5]+ + m[6]*m2[2]*m[1]-m2[2]*m[0]*m[7])*den; + + ans[2] = shapex*(m[1]*m[5]*m2[0]-m[2]*m2[0]*m[4]- + m[0]*m[5]*m2[1]+m[3]*m[2]*m2[1]- + m2[1]*m[0]*m[7]-m[6]*m[4]*m2[0]+ + (numtyp)2.0*m[4]*m[0]*m2[2]-(numtyp)2.0*m[3]*m2[2]*m[1]+ + m[3]*m[7]*m2[0]+m[6]*m2[1]*m[1])*den; + + ans[3] = shapey*(-m[4]*m2[5]*m[2]+(numtyp)2.0*m[4]*m[8]*m2[3]+ + m[5]*m[1]*m2[5]-(numtyp)2.0*m[5]*m2[3]*m[7]+ + m2[4]*m[2]*m[7]-m2[4]*m[1]*m[8]- + m[3]*m[8]*m2[4]+m[6]*m[5]*m2[4]- + m2[5]*m[6]*m[4]+m[3]*m2[5]*m[7])*den; + + ans[4] = shapey*(m[2]*m2[3]*m[7]-m[1]*m[8]*m2[3]+ + (numtyp)2.0*m[8]*m[0]*m2[4]-m2[5]*m[0]*m[5]- + (numtyp)2.0*m[6]*m2[4]*m[2]-m[3]*m[8]*m2[3]+ + m[6]*m[5]*m2[3]+m[3]*m2[5]*m[2]- + m[0]*m2[5]*m[7]+m2[5]*m[1]*m[6])*den; + + ans[5] = shapey*(m[1]*m[5]*m2[3]-m[2]*m2[3]*m[4]- + m[0]*m[5]*m2[4]+m[3]*m[2]*m2[4]+ + (numtyp)2.0*m[4]*m[0]*m2[5]-m[0]*m2[4]*m[7]+ + m[1]*m[6]*m2[4]-m2[3]*m[6]*m[4]- + (numtyp)2.0*m[3]*m[1]*m2[5]+m[3]*m2[3]*m[7])*den; + + ans[6] = shapez*(-m[4]*m[2]*m2[8]+m[1]*m[5]*m2[8]+ + (numtyp)2.0*m[4]*m2[6]*m[8]-m[1]*m2[7]*m[8]+ + m[2]*m[7]*m2[7]-(numtyp)2.0*m2[6]*m[7]*m[5]- + m[3]*m2[7]*m[8]+m[5]*m[6]*m2[7]- + m[4]*m[6]*m2[8]+m[7]*m[3]*m2[8])*den; + + ans[7] = shapez*-(m[1]*m[8]*m2[6]-m[2]*m2[6]*m[7]- + (numtyp)2.0*m2[7]*m[0]*m[8]+m[5]*m2[8]*m[0]+ + (numtyp)2.0*m2[7]*m[2]*m[6]+m[3]*m2[6]*m[8]- + m[3]*m[2]*m2[8]-m[5]*m[6]*m2[6]+ + m[0]*m2[8]*m[7]-m2[8]*m[1]*m[6])*den; + + ans[8] = shapez*(m[1]*m[5]*m2[6]-m[2]*m2[6]*m[4]- + m[0]*m[5]*m2[7]+m[3]*m[2]*m2[7]- + m[4]*m[6]*m2[6]-m[7]*m2[7]*m[0]+ + (numtyp)2.0*m[4]*m2[8]*m[0]+m[7]*m[3]*m2[6]+ + m[6]*m[1]*m2[7]-(numtyp)2.0*m2[8]*m[3]*m[1])*den; +} + +template +__global__ void kernel_gayberne(const numtyp *gum, const numtyp *special_lj, + const int *dev_nbor, const int nbor_pitch, + acctyp *ans, size_t ans_pitch, int *err_flag, + const bool eflag, const bool vflag, + const int inum, const int nall) { + + __shared__ numtyp sp_lj[4]; + + // ii indexes the two interacting particles in gi + int ii=threadIdx.x; + if (ii<4) + sp_lj[ii]=special_lj[ii]; + ii+=INT_MUL(blockIdx.x,blockDim.x); + + if (ii(i,0); + numtyp iy=_x_(i,1); + numtyp iz=_x_(i,2); + int itype=_x_(i,7); + numtyp a1[9], b1[9], g1[9]; + { + numtyp t[9]; + gpu_quat_to_mat_trans(i,a1); + gpu_shape_times3(itype,a1,t); + gpu_transpose_times3(a1,t,g1); + gpu_well_times3(itype,a1,t); + gpu_transpose_times3(a1,t,b1); + } + + numtyp factor_lj; + for ( ; nbor(j,7); + + // Compute r12 + numtyp r12[3]; + r12[0] = _x_(j,0)-ix; + r12[1] = _x_(j,1)-iy; + r12[2] = _x_(j,2)-iz; + numtyp ir = gpu_dot3(r12,r12); + + ir = rsqrt(ir); + numtyp r = (numtyp)1.0/ir; + + numtyp a2[9]; + gpu_quat_to_mat_trans(j,a2); + + 
numtyp u_r, dUr[3], tUr[3], eta, teta[3]; + { // Compute U_r, dUr, eta, and teta + // Compute g12 + numtyp g12[9]; + { + numtyp g2[9]; + { + gpu_shape_times3(jtype,a2,g12); + gpu_transpose_times3(a2,g12,g2); + gpu_plus3(g1,g2,g12); + } + + { // Compute U_r and dUr + + // Compute kappa + numtyp kappa[3]; + gpu_mldivide3(g12,r12,kappa,err_flag); + + // -- replace r12 with r12 hat + r12[0]*=ir; + r12[1]*=ir; + r12[2]*=ir; + + // -- kappa is now / r + kappa[0]*=ir; + kappa[1]*=ir; + kappa[2]*=ir; + + // energy + + // compute u_r and dUr + numtyp uslj_rsq; + { + // Compute distance of closest approach + numtyp h12, sigma12; + sigma12 = gpu_dot3(r12,kappa); + sigma12 = rsqrt((numtyp)0.5*sigma12); + h12 = r-sigma12; + + // -- kappa is now ok + kappa[0]*=r; + kappa[1]*=r; + kappa[2]*=r; + + numtyp sigma = _sigma_(itype,jtype); + numtyp epsilon = _epsilon_(itype,jtype); + numtyp varrho = sigma/(h12+gum[0]*sigma); + numtyp varrho6 = varrho*varrho*varrho; + varrho6*=varrho6; + numtyp varrho12 = varrho6*varrho6; + u_r = (numtyp)4.0*epsilon*(varrho12-varrho6); + + numtyp temp1 = ((numtyp)2.0*varrho12*varrho-varrho6*varrho)/sigma; + temp1 = temp1*(numtyp)24.0*epsilon; + uslj_rsq = temp1*sigma12*sigma12*sigma12*(numtyp)0.5; + numtyp temp2 = gpu_dot3(kappa,r12); + uslj_rsq = uslj_rsq*ir*ir; + + dUr[0] = temp1*r12[0]+uslj_rsq*(kappa[0]-temp2*r12[0]); + dUr[1] = temp1*r12[1]+uslj_rsq*(kappa[1]-temp2*r12[1]); + dUr[2] = temp1*r12[2]+uslj_rsq*(kappa[2]-temp2*r12[2]); + } + + // torque for particle 1 + { + numtyp tempv[3], tempv2[3]; + tempv[0] = -uslj_rsq*kappa[0]; + tempv[1] = -uslj_rsq*kappa[1]; + tempv[2] = -uslj_rsq*kappa[2]; + gpu_row_times3(kappa,g1,tempv2); + gpu_cross3(tempv,tempv2,tUr); + } + } + } + + // Compute eta + { + eta = (numtyp)2.0*_lshape_(itype)*_lshape_(jtype); + numtyp det_g12 = gpu_det3(g12); + eta = pow(eta/det_g12,gum[1]); + } + + // Compute teta + numtyp temp[9], tempv[3], tempv2[3]; + compute_eta_torque(g12,a1,itype,temp); + numtyp temp1 = -eta*gum[1]; + + tempv[0] = temp1*temp[0]; + tempv[1] = temp1*temp[1]; + tempv[2] = temp1*temp[2]; + gpu_cross3(a1,tempv,tempv2); + teta[0] = tempv2[0]; + teta[1] = tempv2[1]; + teta[2] = tempv2[2]; + + tempv[0] = temp1*temp[3]; + tempv[1] = temp1*temp[4]; + tempv[2] = temp1*temp[5]; + gpu_cross3(a1+3,tempv,tempv2); + teta[0] += tempv2[0]; + teta[1] += tempv2[1]; + teta[2] += tempv2[2]; + + tempv[0] = temp1*temp[6]; + tempv[1] = temp1*temp[7]; + tempv[2] = temp1*temp[8]; + gpu_cross3(a1+6,tempv,tempv2); + teta[0] += tempv2[0]; + teta[1] += tempv2[1]; + teta[2] += tempv2[2]; + } + + numtyp chi, dchi[3], tchi[3]; + { // Compute chi and dchi + + // Compute b12 + numtyp b2[9], b12[9]; + { + gpu_well_times3(jtype,a2,b12); + gpu_transpose_times3(a2,b12,b2); + gpu_plus3(b1,b2,b12); + } + + // compute chi_12 + r12[0]*=r; + r12[1]*=r; + r12[2]*=r; + numtyp iota[3]; + gpu_mldivide3(b12,r12,iota,err_flag); + // -- iota is now iota/r + iota[0]*=ir; + iota[1]*=ir; + iota[2]*=ir; + r12[0]*=ir; + r12[1]*=ir; + r12[2]*=ir; + chi = gpu_dot3(r12,iota); + chi = pow(chi*(numtyp)2.0,gum[2]); + + // -- iota is now ok + iota[0]*=r; + iota[1]*=r; + iota[2]*=r; + + numtyp temp1 = gpu_dot3(iota,r12); + numtyp temp2 = (numtyp)-4.0*ir*ir*gum[2]*pow(chi,(gum[2]-(numtyp)1.0)/gum[2]); + dchi[0] = temp2*(iota[0]-temp1*r12[0]); + dchi[1] = temp2*(iota[1]-temp1*r12[1]); + dchi[2] = temp2*(iota[2]-temp1*r12[2]); + + // compute t_chi + numtyp tempv[3]; + gpu_row_times3(iota,b1,tempv); + gpu_cross3(tempv,iota,tchi); + temp1 = (numtyp)-4.0*ir*ir; + tchi[0] *= temp1; + tchi[1] *= 
temp1; + tchi[2] *= temp1; + } + + numtyp temp2 = factor_lj*eta*chi; + if (eflag) + energy+=u_r*temp2; + numtyp temp1 = -eta*u_r*factor_lj; + if (vflag) { + r12[0]*=-r; + r12[1]*=-r; + r12[2]*=-r; + numtyp ft=temp1*dchi[0]-temp2*dUr[0]; + fx+=ft; + virial[0]+=r12[0]*ft; + ft=temp1*dchi[1]-temp2*dUr[1]; + fy+=ft; + virial[1]+=r12[1]*ft; + virial[3]+=r12[0]*ft; + ft=temp1*dchi[2]-temp2*dUr[2]; + fz+=ft; + virial[2]+=r12[2]*ft; + virial[4]+=r12[0]*ft; + virial[5]+=r12[1]*ft; + } else { + fx+=temp1*dchi[0]-temp2*dUr[0]; + fy+=temp1*dchi[1]-temp2*dUr[1]; + fz+=temp1*dchi[2]-temp2*dUr[2]; + } + + // Torque on 1 + temp1 = -u_r*eta*factor_lj; + temp2 = -u_r*chi*factor_lj; + numtyp temp3 = -chi*eta*factor_lj; + torx+=temp1*tchi[0]+temp2*teta[0]+temp3*tUr[0]; + tory+=temp1*tchi[1]+temp2*teta[1]+temp3*tUr[1]; + torz+=temp1*tchi[2]+temp2*teta[2]+temp3*tUr[2]; + + } // for nbor + + // Store answers + acctyp *ap1=ans+ii; + if (eflag) { + *ap1=energy; + ap1+=ans_pitch; + } + if (vflag) { + for (int i=0; i<6; i++) { + *ap1=virial[i]; + ap1+=ans_pitch; + } + } + *ap1=fx; + ap1+=ans_pitch; + *ap1=fy; + ap1+=ans_pitch; + *ap1=fz; + ap1+=ans_pitch; + *ap1=torx; + ap1+=ans_pitch; + *ap1=tory; + ap1+=ans_pitch; + *ap1=torz; + + } // if ii +} + +template +__global__ void kernel_sphere_gb(const numtyp *gum, const numtyp *special_lj, + const int *dev_nbor, const int nbor_pitch, + acctyp *ans, size_t ans_pitch, int *err_flag, + const bool eflag, const bool vflag, + const int start, const int inum, + const int nall) { + __shared__ numtyp sp_lj[4]; + + // ii indexes the two interacting particles in gi + int ii=threadIdx.x; + if (ii<4) + sp_lj[ii]=special_lj[ii]; + ii+=INT_MUL(blockIdx.x,blockDim.x)+start; + + if (ii(i,0); + numtyp iy=_x_(i,1); + numtyp iz=_x_(i,2); + int itype=_x_(i,7); + + numtyp oner=_shape_(itype,0); + numtyp one_well=_well_(itype,0); + + numtyp factor_lj; + for ( ; nbor(j,7); + + // Compute r12 + numtyp r12[3]; + r12[0] = _x_(j,0)-ix; + r12[1] = _x_(j,1)-iy; + r12[2] = _x_(j,2)-iz; + numtyp ir = gpu_dot3(r12,r12); + + ir = rsqrt(ir); + numtyp r = (numtyp)1.0/ir; + + numtyp r12hat[3]; + r12hat[0]=r12[0]*ir; + r12hat[1]=r12[1]*ir; + r12hat[2]=r12[2]*ir; + + numtyp a2[9]; + gpu_quat_to_mat_trans(j,a2); + + numtyp u_r, dUr[3], eta; + { // Compute U_r, dUr, eta, and teta + // Compute g12 + numtyp g12[9]; + { + { + numtyp g2[9]; + gpu_shape_times3(jtype,a2,g12); + gpu_transpose_times3(a2,g12,g2); + g12[0]=g2[0]+oner; + g12[4]=g2[4]+oner; + g12[8]=g2[8]+oner; + g12[1]=g2[1]; + g12[2]=g2[2]; + g12[3]=g2[3]; + g12[5]=g2[5]; + g12[6]=g2[6]; + g12[7]=g2[7]; + } + + { // Compute U_r and dUr + + // Compute kappa + numtyp kappa[3]; + gpu_mldivide3(g12,r12,kappa,err_flag); + + // -- kappa is now / r + kappa[0]*=ir; + kappa[1]*=ir; + kappa[2]*=ir; + + // energy + + // compute u_r and dUr + numtyp uslj_rsq; + { + // Compute distance of closest approach + numtyp h12, sigma12; + sigma12 = gpu_dot3(r12hat,kappa); + sigma12 = rsqrt((numtyp)0.5*sigma12); + h12 = r-sigma12; + + // -- kappa is now ok + kappa[0]*=r; + kappa[1]*=r; + kappa[2]*=r; + + numtyp sigma = _sigma_(itype,jtype); + numtyp epsilon = _epsilon_(itype,jtype); + numtyp varrho = sigma/(h12+gum[0]*sigma); + numtyp varrho6 = varrho*varrho*varrho; + varrho6*=varrho6; + numtyp varrho12 = varrho6*varrho6; + u_r = (numtyp)4.0*epsilon*(varrho12-varrho6); + + numtyp temp1 = ((numtyp)2.0*varrho12*varrho-varrho6*varrho)/sigma; + temp1 = temp1*(numtyp)24.0*epsilon; + uslj_rsq = temp1*sigma12*sigma12*sigma12*(numtyp)0.5; + numtyp temp2 = gpu_dot3(kappa,r12hat); + 
uslj_rsq = uslj_rsq*ir*ir; + + dUr[0] = temp1*r12hat[0]+uslj_rsq*(kappa[0]-temp2*r12hat[0]); + dUr[1] = temp1*r12hat[1]+uslj_rsq*(kappa[1]-temp2*r12hat[1]); + dUr[2] = temp1*r12hat[2]+uslj_rsq*(kappa[2]-temp2*r12hat[2]); + } + } + } + + // Compute eta + { + eta = (numtyp)2.0*_lshape_(itype)*_lshape_(jtype); + numtyp det_g12 = gpu_det3(g12); + eta = pow(eta/det_g12,gum[1]); + } + } + + numtyp chi, dchi[3]; + { // Compute chi and dchi + + // Compute b12 + numtyp b12[9]; + { + numtyp b2[9]; + gpu_well_times3(jtype,a2,b12); + gpu_transpose_times3(a2,b12,b2); + b12[0]=b2[0]+one_well; + b12[4]=b2[4]+one_well; + b12[8]=b2[8]+one_well; + b12[1]=b2[1]; + b12[2]=b2[2]; + b12[3]=b2[3]; + b12[5]=b2[5]; + b12[6]=b2[6]; + b12[7]=b2[7]; + } + + // compute chi_12 + numtyp iota[3]; + gpu_mldivide3(b12,r12,iota,err_flag); + // -- iota is now iota/r + iota[0]*=ir; + iota[1]*=ir; + iota[2]*=ir; + chi = gpu_dot3(r12hat,iota); + chi = pow(chi*(numtyp)2.0,gum[2]); + + // -- iota is now ok + iota[0]*=r; + iota[1]*=r; + iota[2]*=r; + + numtyp temp1 = gpu_dot3(iota,r12hat); + numtyp temp2 = (numtyp)-4.0*ir*ir*gum[2]*pow(chi,(gum[2]-(numtyp)1.0)/gum[2]); + dchi[0] = temp2*(iota[0]-temp1*r12hat[0]); + dchi[1] = temp2*(iota[1]-temp1*r12hat[1]); + dchi[2] = temp2*(iota[2]-temp1*r12hat[2]); + } + + numtyp temp2 = factor_lj*eta*chi; + if (eflag) + energy+=u_r*temp2; + numtyp temp1 = -eta*u_r*factor_lj; + if (vflag) { + r12[0]*=-1; + r12[1]*=-1; + r12[2]*=-1; + numtyp ft=temp1*dchi[0]-temp2*dUr[0]; + fx+=ft; + virial[0]+=r12[0]*ft; + ft=temp1*dchi[1]-temp2*dUr[1]; + fy+=ft; + virial[1]+=r12[1]*ft; + virial[3]+=r12[0]*ft; + ft=temp1*dchi[2]-temp2*dUr[2]; + fz+=ft; + virial[2]+=r12[2]*ft; + virial[4]+=r12[0]*ft; + virial[5]+=r12[1]*ft; + } else { + fx+=temp1*dchi[0]-temp2*dUr[0]; + fy+=temp1*dchi[1]-temp2*dUr[1]; + fz+=temp1*dchi[2]-temp2*dUr[2]; + } + } // for nbor + + // Store answers + acctyp *ap1=ans+ii; + if (eflag) { + *ap1=energy; + ap1+=ans_pitch; + } + if (vflag) { + for (int i=0; i<6; i++) { + *ap1=virial[i]; + ap1+=ans_pitch; + } + } + *ap1=fx; + ap1+=ans_pitch; + *ap1=fy; + ap1+=ans_pitch; + *ap1=fz; + } // if ii +} + +template +__global__ void kernel_lj(const numtyp *special_lj, const int *dev_nbor, + const int *dev_ij, const int nbor_pitch, acctyp *ans, + size_t ans_pitch, int *err_flag, const bool eflag, + const bool vflag, const int start, const int inum, + const int nall) { + __shared__ numtyp sp_lj[4]; + + // ii indexes the two interacting particles in gi + int ii=threadIdx.x; + if (ii<4) + sp_lj[ii]=special_lj[ii]; + ii+=INT_MUL(blockIdx.x,blockDim.x)+start; + + if (ii(i,0); + numtyp iy=_x_(i,1); + numtyp iz=_x_(i,2); + int itype=_x_(i,7); + + numtyp factor_lj; + for ( ; list(j,7); + + // Compute r12 + numtyp delx = ix-_x_(j,0); + numtyp dely = iy-_x_(j,1); + numtyp delz = iz-_x_(j,2); + numtyp r2inv = delx*delx+dely*dely+delz*delz; + + if (r2inv<_cutsq_(itype,jtype) && + _form_(itype,jtype)==SPHERE_SPHERE) { + r2inv=(numtyp)1.0/r2inv; + numtyp r6inv = r2inv*r2inv*r2inv; + numtyp force = r2inv*r6inv*(_lj1_(itype,jtype).x*r6inv- + _lj1_(itype,jtype).y); + force*=factor_lj; + + fx+=delx*force; + fy+=dely*force; + fz+=delz*force; + + if (eflag) { + numtyp e=r6inv*(_lj3_(itype,jtype).x*r6inv- + _lj3_(itype,jtype).y); + energy+=factor_lj*(e-_offset_(1,1)); + } + if (vflag) { + virial[0] += delx*delx*force; + virial[1] += dely*dely*force; + virial[2] += delz*delz*force; + virial[3] += delx*dely*force; + virial[4] += delx*delz*force; + virial[5] += dely*delz*force; + } + } + + } // for nbor + + // Store answers 
+ acctyp *ap1=ans+ii; + if (eflag) { + *ap1+=energy; + ap1+=ans_pitch; + } + if (vflag) { + for (int i=0; i<6; i++) { + *ap1+=virial[i]; + ap1+=ans_pitch; + } + } + *ap1+=fx; + ap1+=ans_pitch; + *ap1+=fy; + ap1+=ans_pitch; + *ap1+=fz; + + } // if ii +} + +template +__global__ void kernel_lj_fast(const numtyp *special_lj, const int *dev_nbor, + const int *dev_ij, const int nbor_pitch, + acctyp *ans, size_t ans_pitch,int *err_flag, + const bool eflag, const bool vflag, + const int start, const int inum, const int nall){ + // ii indexes the two interacting particles in gi + int ii=threadIdx.x; + __shared__ numtyp sp_lj[4]; + __shared__ int form[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; + __shared__ numtyp cutsq[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; + __shared__ numtyp lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; + __shared__ numtyp lj2[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; + __shared__ numtyp lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; + __shared__ numtyp lj4[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; + __shared__ numtyp offset[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; + if (ii<4) + sp_lj[ii]=special_lj[ii]; + if (ii(itype,jtype); + form[ii]=_form_(itype,jtype); + lj1[ii]=_lj1_(itype,jtype).x; + lj2[ii]=_lj1_(itype,jtype).y; + if (eflag) { + lj3[ii]=_lj3_(itype,jtype).x; + lj4[ii]=_lj3_(itype,jtype).y; + offset[ii]=_offset_(itype,jtype); + } + } + ii+=INT_MUL(blockIdx.x,blockDim.x)+start; + + if (ii(i,0); + numtyp iy=_x_(i,1); + numtyp iz=_x_(i,2); + int itype=INT_MUL(MAX_SHARED_TYPES,_x_(i,7)); + + numtyp factor_lj; + for ( ; list(j,7); + + // Compute r12 + numtyp delx = ix-_x_(j,0); + numtyp dely = iy-_x_(j,1); + numtyp delz = iz-_x_(j,2); + numtyp r2inv = delx*delx+dely*dely+delz*delz; + + if (r2inv + +template +GB_GPU_MemoryT::GB_GPU_Memory() : LJ_GPU_MemoryT() { + this->atom.atom_fields(8); + this->atom.ans_fields(13); + this->nbor.packing(true); +} + +template +GB_GPU_MemoryT::~GB_GPU_Memory() { + clear(); +} + +template +int* GB_GPU_MemoryT::init(const int ij_size, const int ntypes, + const double gamma, const double upsilon, + const double mu, double **host_shape, + double **host_well, double **host_cutsq, + double **host_sigma, double **host_epsilon, + double *host_lshape, int **h_form, double **host_lj1, + double **host_lj2, double **host_lj3, + double **host_lj4, double **host_offset, + double *host_special_lj, const int max_nbors, + const bool force_d, const int me) { + if (this->allocated) + clear(); + + LJ_GPU_MemoryT::init(ij_size,ntypes,host_cutsq,host_sigma,host_epsilon, + host_lj1, host_lj2, host_lj3, host_lj4, host_offset, + host_special_lj, max_nbors, me); + + host_form=h_form; + + // Initialize timers for the selected GPU + time_kernel.init(); + time_gayberne.init(); + time_kernel2.init(); + time_gayberne2.init(); + + // Use the write buffer from atom for data initialization + NVC_HostT &host_write=this->atom.host_write; + assert(host_write.numel()>4 && host_write.numel()>ntypes*ntypes*2); + + // Allocate, cast and asynchronous memcpy of constant data + gamma_upsilon_mu.safe_alloc(3); + host_write[0]=static_cast(gamma); + host_write[1]=static_cast(upsilon); + host_write[2]=static_cast(mu); + gamma_upsilon_mu.copy_from_host(host_write.begin()); + + lshape.safe_alloc(ntypes); + lshape.cast_copy(host_lshape,host_write); + lshape.copy_from_host(host_write.begin()); + + // Copy shape, well, sigma, epsilon, and cutsq onto GPU + shape.safe_alloc(ntypes,3); + shape.cast_copy(host_shape[0],host_write); + well.safe_alloc(ntypes,3); + well.cast_copy(host_well[0],host_write); + + // Copy LJ data onto GPU + int 
lj_types=ntypes; + if (lj_types<=MAX_SHARED_TYPES) + lj_types=MAX_SHARED_TYPES; + form.safe_alloc(lj_types,lj_types); + form.copy_2Dfrom_host(host_form[0],ntypes,ntypes); + + // See if we want fast GB-sphere or sphere-sphere calculations + multiple_forms=false; + for (int i=1; imax_atoms); + + // Bind constant data to textures + lshape_bind_texture(lshape); + shape_bind_texture(shape); + well_bind_texture(well); + form_bind_texture(form); + + return this->nbor.host_ij.begin(); +} + +template +void GB_GPU_MemoryT::clear() { + if (!this->allocated) + return; + + int err_flag; + this->dev_error.copy_to_host(&err_flag); + if (err_flag == 1) + std::cerr << "COLLISION BUFFER OVERFLOW OCCURED. INCREASE COLLISION_N " + << "and RECOMPILE.\n"; + else if (err_flag == 2) + std::cerr << "BAD MATRIX INVERSION IN FORCE COMPUTATION.\n"; + + LJ_GPU_MemoryT::clear(); + + shape_unbind_texture(); + well_unbind_texture(); + form_unbind_texture(); + + shape.clear(); + well.clear(); + form.clear(); + lshape.clear(); + gamma_upsilon_mu.clear(); + host_olist.clear(); +} + +template +double GB_GPU_MemoryT::host_memory_usage() { + return this->atom.host_memory_usage(this->max_atoms)+ + this->nbor.host_memory_usage()+4*sizeof(numtyp)+ + sizeof(GB_GPU_Memory)+this->max_atoms*sizeof(int); +} + +template class GB_GPU_Memory; diff --git a/lib/gpu/gb_gpu_memory.h b/lib/gpu/gb_gpu_memory.h new file mode 100644 index 0000000000..8886931c10 --- /dev/null +++ b/lib/gpu/gb_gpu_memory.h @@ -0,0 +1,74 @@ +/*************************************************************************** + gb_gpu_memory.h + ------------------- + W. Michael Brown + + Global variables for GPU Gayberne Library + + __________________________________________________________________________ + This file is part of the LAMMPS GPU Library + __________________________________________________________________________ + + begin : Thu Jun 25 2009 + copyright : (C) 2009 by W. Michael Brown + email : wmbrown@sandia.gov + ***************************************************************************/ + +/* ----------------------------------------------------------------------- + Copyright (2009) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. 
+ ----------------------------------------------------------------------- */ + +#ifndef GB_GPU_MEMORY_H +#define GB_GPU_MEMORY_H + +#define MAX_GPU_THREADS 4 +#include "lj_gpu_memory.h" +#define LJ_GPU_MemoryT LJ_GPU_Memory + +enum{SPHERE_SPHERE,SPHERE_ELLIPSE,ELLIPSE_SPHERE,ELLIPSE_ELLIPSE}; + +template +class GB_GPU_Memory : public LJ_GPU_MemoryT { + public: + GB_GPU_Memory(); + ~GB_GPU_Memory(); + + int* init(const int ij_size, const int ntypes, const double gamma, + const double upsilon, const double mu, double **host_shape, + double **host_well, double **host_cutsq, double **host_sigma, + double **host_epsilon, double *host_lshape, int **h_form, + double **host_lj1, double **host_lj2, double **host_lj3, + double **host_lj4, double **host_offset, double *host_special_lj, + const int max_nbors, const bool force_d, const int me); + + void clear(); + + double host_memory_usage(); + + // ---------------------------- DATA ---------------------------- + + // ilist with particles sorted by type + NVC_HostI host_olist; + + // --------------- Const Data for Atoms + NVC_ConstMatT shape, well; + NVC_ConstMatI form; + NVC_VecT lshape, gamma_upsilon_mu; + + + // --------------- Timing Stuff + NVCTimer time_kernel, time_gayberne, time_kernel2, time_gayberne2; + + // True if we want to use fast GB-sphere or sphere-sphere calculations + bool multiple_forms; + int **host_form; + int last_ellipse; + + private: +}; + +#endif + diff --git a/lib/gpu/lj_gpu.cu b/lib/gpu/lj_gpu.cu new file mode 100644 index 0000000000..ea5ba440cc --- /dev/null +++ b/lib/gpu/lj_gpu.cu @@ -0,0 +1,234 @@ +/*************************************************************************** + lj_gpu.cu + ------------------- + W. Michael Brown + + Lennard-Jones potential GPU calcultation + + __________________________________________________________________________ + This file is part of the LAMMPS GPU Library + __________________________________________________________________________ + + begin : Tue Aug 4 2009 + copyright : (C) 2009 by W. Michael Brown + email : wmbrown@sandia.gov + ***************************************************************************/ + +/* ----------------------------------------------------------------------- + Copyright (2009) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. 
+ ----------------------------------------------------------------------- */ + +#include +#include +#include "nvc_macros.h" +#include "nvc_timer.h" +#include "nvc_device.h" +#include "lj_gpu_memory.h" +#include "lj_gpu_kernel.h" + +using namespace std; + +static LJ_GPU_Memory LJMF; +#define LJMT LJ_GPU_Memory + +// --------------------------------------------------------------------------- +// Convert something to a string +// --------------------------------------------------------------------------- +#include +template +inline string lj_gpu_toa(const t& in) { + ostringstream o; + o.precision(2); + o << in; + return o.str(); +} + +// --------------------------------------------------------------------------- +// Return string with GPU info +// --------------------------------------------------------------------------- +string lj_gpu_name(const int id, const int max_nbors) { + string name=LJMF.gpu.name(id)+", "+ + lj_gpu_toa(LJMF.gpu.cores(id))+" cores, "+ + lj_gpu_toa(LJMF.gpu.gigabytes(id))+" GB, "+ + lj_gpu_toa(LJMF.gpu.clock_rate(id))+" GHZ, "+ + lj_gpu_toa(LJMF.get_max_atoms(LJMF.gpu.bytes(id), + max_nbors))+" Atoms"; + return name; +} + +// --------------------------------------------------------------------------- +// Allocate memory on host and device and copy constants to device +// --------------------------------------------------------------------------- +int * lj_gpu_init(int &ij_size, const int ntypes, double **cutsq,double **sigma, + double **epsilon, double **host_lj1, double **host_lj2, + double **host_lj3, double **host_lj4, double **offset, + double *special_lj, const int max_nbors, const int gpu_id) { + if (LJMF.gpu.num_devices()==0) + return 0; + + ij_size=IJ_SIZE; + return LJMF.init(ij_size, ntypes, cutsq, sigma, epsilon, host_lj1, host_lj2, + host_lj3, host_lj4, offset, special_lj, max_nbors, gpu_id); +} + +// --------------------------------------------------------------------------- +// Clear memory on host and device +// --------------------------------------------------------------------------- +void lj_gpu_clear() { + LJMF.clear(); +} + +// --------------------------------------------------------------------------- +// copy atom positions and optionally types to device +// --------------------------------------------------------------------------- +template +inline void _lj_gpu_atom(PairGPUAtom &atom, double **host_x, + const int *host_type, const bool rebuild, + cudaStream_t &stream) { + atom.time_atom.start(); + atom.reset_write_buffer(); + + // First row of dev_x is x position, second is y, third is z + atom.add_atom_data(host_x[0],3); + atom.add_atom_data(host_x[0]+1,3); + atom.add_atom_data(host_x[0]+2,3); + + int csize=3; + + // If a rebuild occured, copy type data + if (rebuild) { + atom.add_atom_data(host_type); + csize++; + } + + atom.copy_atom_data(csize,stream); + atom.time_atom.stop(); +} + +void lj_gpu_atom(double **host_x, const int *host_type, const bool rebuild) { + _lj_gpu_atom(LJMF.atom, host_x, host_type, rebuild, LJMF.pair_stream); +} + +// --------------------------------------------------------------------------- +// Signal that we need to transfer a new neighbor list +// --------------------------------------------------------------------------- +template +bool _lj_gpu_reset_nbors(LJMTyp &ljm, const int nall, const int inum, + int *ilist, const int *numj) { + if (nall>ljm.max_atoms) + return false; + + ljm.nbor.time_nbor.start(); + + ljm.atom.nall(nall); + ljm.atom.inum(inum); + ljm.nbor.reset(inum,ilist,numj,ljm.pair_stream); + + 
ljm.nbor.time_nbor.stop(); + return true; +} + +bool lj_gpu_reset_nbors(const int nall, const int inum, int *ilist, + const int *numj) { + return _lj_gpu_reset_nbors(LJMF,nall,inum,ilist,numj); +} + +// --------------------------------------------------------------------------- +// Copy a set of ij_size ij interactions to device and compute energies, +// forces, and torques for those interactions +// --------------------------------------------------------------------------- +template +void _lj_gpu_nbors(LJMTyp &ljm, const int num_ij) { + ljm.nbor.time_nbor.add_to_total(); + // CUDA_SAFE_CALL(cudaStreamSynchronize(ljm.pair_stream)); // Not if timed + + ljm.nbor.time_nbor.start(); + ljm.nbor.add(num_ij,ljm.pair_stream); + ljm.nbor.time_nbor.stop(); +} + +void lj_gpu_nbors(const int num_ij) { + _lj_gpu_nbors(LJMF,num_ij); +} + +// --------------------------------------------------------------------------- +// Calculate energies and forces for all ij interactions +// --------------------------------------------------------------------------- +template +void _lj_gpu(LJMT &ljm, const bool eflag, const bool vflag, const bool rebuild){ + // Compute the block size and grid size to keep all cores busy + const int BX=BLOCK_1D; + + int GX=static_cast(ceil(static_cast(ljm.atom.inum())/BX)); + + ljm.time_pair.start(); + if (ljm.shared_types) + kernel_lj_fast<<>> + (ljm.special_lj.begin(), ljm.nbor.dev_nbor.begin(), + ljm.nbor.ij.begin(), ljm.nbor.dev_nbor.row_size(), + ljm.atom.ans.begin(), ljm.atom.ans.row_size(), eflag, + vflag, ljm.atom.inum(), ljm.atom.nall()); + else + kernel_lj<<>> + (ljm.special_lj.begin(), ljm.nbor.dev_nbor.begin(), + ljm.nbor.ij.begin(), ljm.nbor.dev_nbor.row_size(), + ljm.atom.ans.begin(), ljm.atom.ans.row_size(), eflag, + vflag, ljm.atom.inum(), ljm.atom.nall()); + ljm.time_pair.stop(); +} + +void lj_gpu(const bool eflag, const bool vflag, const bool rebuild) { + _lj_gpu(LJMF,eflag,vflag,rebuild); +} + +// --------------------------------------------------------------------------- +// Get energies and forces to host +// --------------------------------------------------------------------------- +template +double _lj_gpu_forces(LJMT &ljm, double **f, const int *ilist, + const bool eflag, const bool vflag, const bool eflag_atom, + const bool vflag_atom, double *eatom, double **vatom, + double *virial) { + double evdw; + + ljm.atom.time_answer.start(); + ljm.atom.copy_answers(eflag,vflag,ljm.pair_stream); + + ljm.atom.time_atom.add_to_total(); + ljm.nbor.time_nbor.add_to_total(); + ljm.time_pair.add_to_total(); + // CUDA_SAFE_CALL(cudaStreamSynchronize(ljm.pair_stream)); // not if timed + + evdw=ljm.atom.energy_virial(ilist,eflag_atom,vflag_atom,eatom,vatom,virial); + ljm.atom.add_forces(ilist,f); + ljm.atom.time_answer.stop(); + ljm.atom.time_answer.add_to_total(); + return evdw; +} + +double lj_gpu_forces(double **f, const int *ilist, const bool eflag, + const bool vflag, const bool eflag_atom, + const bool vflag_atom, double *eatom, double **vatom, + double *virial) { + return _lj_gpu_forces + (LJMF,f,ilist,eflag,vflag,eflag_atom,vflag_atom,eatom,vatom,virial); +} + +void lj_gpu_time() { + cout.precision(4); + cout << "Atom copy: " << LJMF.atom.time_atom.total_seconds() << " s.\n"; + cout << "Neighbor copy: " << LJMF.nbor.time_nbor.total_seconds() << " s.\n"; + cout << "LJ calc: " << LJMF.time_pair.total_seconds() << " s.\n"; + cout << "Answer copy: " << LJMF.atom.time_answer.total_seconds() << " s.\n"; +} + +int lj_gpu_num_devices() { + return LJMF.gpu.num_devices(); +} + 
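// ---------------------------------------------------------------------------
// Editorial sketch, not part of the library: the per-timestep call order a
// host pair style is expected to follow using the routines above.  The name
// example_lj_compute and all of its arguments are hypothetical; error
// handling, the loop that packs neighbor indices into the buffer returned by
// lj_gpu_init(), and per-atom energy/virial tallies are omitted.
// ---------------------------------------------------------------------------
static double example_lj_compute(double **x, int *type, double **f,
                                 int nall, int inum, int *ilist, int *numj,
                                 int num_ij, bool rebuild,
                                 bool eflag, bool vflag,
                                 double *eatom, double **vatom,
                                 double *virial) {
  lj_gpu_atom(x, type, rebuild);                  // positions (and types on rebuild)
  if (rebuild)
    lj_gpu_reset_nbors(nall, inum, ilist, numj);  // stage per-atom neighbor header
  // ...pack up to ij_size neighbor indices into the host ij buffer, then
  //    ship each chunk:
  lj_gpu_nbors(num_ij);
  lj_gpu(eflag, vflag, rebuild);                  // launch kernel_lj / kernel_lj_fast
  return lj_gpu_forces(f, ilist, eflag, vflag, false, false,
                       eatom, vatom, virial);     // copy energies and forces back
}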
+double lj_gpu_bytes() { + return LJMF.host_memory_usage(); +} diff --git a/lib/gpu/lj_gpu_kernel.h b/lib/gpu/lj_gpu_kernel.h new file mode 100644 index 0000000000..5f3f0ff7c6 --- /dev/null +++ b/lib/gpu/lj_gpu_kernel.h @@ -0,0 +1,248 @@ +/*************************************************************************** + lj_gpu_kernel.cu + ------------------- + W. Michael Brown + + Routines that actually perform the force computation + + __________________________________________________________________________ + This file is part of the LAMMPS GPU Library + __________________________________________________________________________ + + begin : Tue Aug 4 2009 + copyright : (C) 2009 by W. Michael Brown + email : wmbrown@sandia.gov + ***************************************************************************/ + +/* ----------------------------------------------------------------------- + Copyright (2009) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + ----------------------------------------------------------------------- */ + +#ifndef LJ_GPU_KERNEL +#define LJ_GPU_KERNEL + +template +__global__ void kernel_lj(const numtyp *special_lj, const int *dev_nbor, + const int *dev_ij, const int nbor_pitch, acctyp *ans, + size_t ans_pitch, const bool eflag, + const bool vflag, const int inum, const int nall) { + __shared__ numtyp sp_lj[4]; + + // ii indexes the two interacting particles in gi + int ii=threadIdx.x; + if (ii<4) + sp_lj[ii]=special_lj[ii]; + ii+=INT_MUL(blockIdx.x,blockDim.x); + + if (ii(i,0); + numtyp iy=_x_(i,1); + numtyp iz=_x_(i,2); + int itype=_x_(i,3); + + numtyp factor_lj; + for ( ; list(j,3); + + // Compute r12 + numtyp delx = ix-_x_(j,0); + numtyp dely = iy-_x_(j,1); + numtyp delz = iz-_x_(j,2); + numtyp r2inv = delx*delx+dely*dely+delz*delz; + + if (r2inv<_cutsq_(itype,jtype)) { + r2inv=(numtyp)1.0/r2inv; + numtyp r6inv =r2inv*r2inv*r2inv; + numtyp force =factor_lj*r2inv*r6inv*(_lj1_(itype,jtype).x*r6inv- + _lj1_(itype,jtype).y); + + fx+=delx*force; + fy+=dely*force; + fz+=delz*force; + + if (eflag) { + numtyp e=r6inv*(_lj3_(itype,jtype).x*r6inv- + _lj3_(itype,jtype).y); + energy+=factor_lj*(e-_offset_(1,1)); + } + if (vflag) { + virial[0] += delx*delx*force; + virial[1] += dely*dely*force; + virial[2] += delz*delz*force; + virial[3] += delx*dely*force; + virial[4] += delx*delz*force; + virial[5] += dely*delz*force; + } + } + + } // for nbor + + // Store answers + acctyp *ap1=ans+ii; + if (eflag) { + *ap1=energy; + ap1+=ans_pitch; + } + if (vflag) { + for (int i=0; i<6; i++) { + *ap1=virial[i]; + ap1+=ans_pitch; + } + } + *ap1=fx; + ap1+=ans_pitch; + *ap1=fy; + ap1+=ans_pitch; + *ap1=fz; + + } // if ii +} + +template +__global__ void kernel_lj_fast(const numtyp *special_lj, const int *dev_nbor, + const int *dev_ij, const int nbor_pitch, + acctyp *ans, size_t ans_pitch,const bool eflag, + const bool vflag, const int inum, + const int nall) { + + // ii indexes the two interacting particles in gi + int ii=threadIdx.x; + __shared__ numtyp sp_lj[4]; + __shared__ numtyp cutsq[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; + __shared__ numtyp lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; + __shared__ numtyp lj2[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; + __shared__ numtyp lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; + __shared__ numtyp lj4[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; + __shared__ numtyp offset[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; + if 
(ii<4) + sp_lj[ii]=special_lj[ii]; + if (ii(itype,jtype); + lj1[ii]=_lj1_(itype,jtype).x; + lj2[ii]=_lj1_(itype,jtype).y; + if (eflag) { + lj3[ii]=_lj3_(itype,jtype).x; + lj4[ii]=_lj3_(itype,jtype).y; + offset[ii]=_offset_(itype,jtype); + } + } + ii+=INT_MUL(blockIdx.x,blockDim.x); + + if (ii(i,0); + numtyp iy=_x_(i,1); + numtyp iz=_x_(i,2); + int itype=INT_MUL(MAX_SHARED_TYPES,_x_(i,3)); + + numtyp factor_lj; + for ( ; list(j,3); + + // Compute r12 + numtyp delx = ix-_x_(j,0); + numtyp dely = iy-_x_(j,1); + numtyp delz = iz-_x_(j,2); + numtyp r2inv = delx*delx+dely*dely+delz*delz; + + if (r2inv + +template +int LJ_GPU_MemoryT::bytes_per_atom(const int max_nbors) const { + return atom.bytes_per_atom()+nbor.bytes_per_atom(max_nbors); +} + +template +int LJ_GPU_MemoryT::get_max_atoms(const int gpu_bytes, const int max_nbors) { + int matoms=static_cast(PERCENT_GPU_MEMORY*gpu_bytes/ + bytes_per_atom(max_nbors)); + if (matoms>MAX_ATOMS) + matoms=MAX_ATOMS; + return matoms; +} + +template +int* LJ_GPU_MemoryT::init(const int ij_size, const int ntypes, + double **host_cutsq, double **host_sigma, + double **host_epsilon, double **host_lj1, + double **host_lj2, double **host_lj3, + double **host_lj4, double **host_offset, + double *host_special_lj, const int max_nbors, + const int me) { + if (allocated) + clear(); + + if (me>=gpu.num_devices()) + return 0; + gpu.set(me); + if (gpu.revision()<1.0) + return 0; + + // Initialize timers for the selected GPU + time_pair.init(); + + // Initialize atom and nbor data + max_atoms=get_max_atoms(gpu.bytes(),max_nbors); + atom.init(max_atoms); + nbor.init(ij_size,max_atoms,max_nbors); + + // Get a stream for computing pair potentials + CUDA_SAFE_CALL(cudaStreamCreate(&pair_stream)); + + // Use the write buffer from atom for data initialization + NVC_HostT &host_write=atom.host_write; + assert(host_write.numel()>4 && host_write.numel()>ntypes*ntypes*2); + + // Copy data for bonded interactions + special_lj.safe_alloc(4); + special_lj.cast_copy(host_special_lj,host_write); + + // Copy sigma, epsilon, and cutsq onto GPU + sigma.safe_alloc(ntypes,ntypes); + sigma.cast_copy(host_sigma[0],host_write); + epsilon.safe_alloc(ntypes,ntypes); + epsilon.cast_copy(host_epsilon[0],host_write); + cutsq.safe_alloc(ntypes,ntypes); + cutsq.cast_copy(host_cutsq[0],host_write); + + // If atom type constants fit in shared memory use fast kernel + int lj_types=ntypes; + shared_types=false; + if (lj_types<=MAX_SHARED_TYPES) { + lj_types=MAX_SHARED_TYPES; + shared_types=true; + } + offset.safe_alloc(lj_types,lj_types); + offset.cast_copy2D(host_offset[0],host_write,ntypes,ntypes); + double *t1=host_lj1[0]; + double *t2=host_lj2[0]; + for (int i=0; i::vec2 *> (host_write.begin()), + ntypes,ntypes); + t1=host_lj3[0]; + t2=host_lj4[0]; + for (int i=0; i::vec2 *> (host_write.begin()), + ntypes,ntypes); + + // Bind constant data to textures + sigma_bind_texture(sigma); + epsilon_bind_texture(epsilon); + cutsq_bind_texture(cutsq); + offset_bind_texture(offset); + lj1_bind_texture::vec2>(lj1); + lj3_bind_texture::vec2>(lj3); + + dev_error.safe_alloc(1); + dev_error.zero(); + + allocated=true; + + return nbor.host_ij.begin(); +} + +template +void LJ_GPU_MemoryT::clear() { + if (!allocated) + return; + allocated=false; + + // Check for any pair style specific errors here + int err_flag; + dev_error.copy_to_host(&err_flag); + + atom.clear(); + nbor.clear(); + + sigma_unbind_texture(); + epsilon_unbind_texture(); + cutsq_unbind_texture(); + offset_unbind_texture(); + 
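  // Editorial note: lj1 and lj3 hold coefficient pairs packed into a vec2
  // (float2 or double2 via cu_vec_traits), which is why the unbind helpers
  // below are instantiated on that vector type rather than on numtyp itself.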
lj1_unbind_texture::vec2>(); + lj3_unbind_texture::vec2>(); + + CUDA_SAFE_CALL(cudaStreamDestroy(pair_stream)); + + dev_error.clear(); + sigma.clear(); + epsilon.clear(); + special_lj.clear(); + cutsq.clear(); + offset.clear(); + lj1.clear(); + lj3.clear(); +} + +template +double LJ_GPU_MemoryT::host_memory_usage() const { + return atom.host_memory_usage(max_atoms)+nbor.host_memory_usage()+ + sizeof(LJ_GPU_Memory); +} + +template class LJ_GPU_Memory; diff --git a/lib/gpu/lj_gpu_memory.h b/lib/gpu/lj_gpu_memory.h new file mode 100644 index 0000000000..a3b4358192 --- /dev/null +++ b/lib/gpu/lj_gpu_memory.h @@ -0,0 +1,88 @@ +/*************************************************************************** + lj_gpu_memory.h + ------------------- + W. Michael Brown + + Global variables for GPU Lennard-Jones Library + + __________________________________________________________________________ + This file is part of the LAMMPS GPU Library + __________________________________________________________________________ + + begin : Tue Aug 4 2009 + copyright : (C) 2009 by W. Michael Brown + email : wmbrown@sandia.gov + ***************************************************************************/ + +/* ----------------------------------------------------------------------- + Copyright (2009) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + ----------------------------------------------------------------------- */ + +#ifndef LJ_GPU_MEMORY_H +#define LJ_GPU_MEMORY_H + +#include "nvc_device.h" +#include "pair_gpu_atom.h" +#include "pair_gpu_nbor.h" + +#define BLOCK_1D 64 +#define MAX_SHARED_TYPES 8 +#define PERCENT_GPU_MEMORY 0.7 + +template +class LJ_GPU_Memory { + public: + LJ_GPU_Memory() : allocated(false) {} + ~LJ_GPU_Memory() { clear(); } + + /// Allocate memory on host and device + int* init(const int ij_size, const int ntypes, double **host_cutsq, + double **host_sigma, double **host_epsilon, + double **host_lj1, double **host_lj2, double **host_lj3, + double **host_lj4, double **host_offset, double *host_special_lj, + const int max_nbors, const int me); + /// Free any memory on host and device + void clear(); + + /// Returns memory usage on GPU per atom + int bytes_per_atom(const int max_nbors) const; + /// Maximum number of atoms that can be stored on GPU + int get_max_atoms(const int gpu_bytes, const int max_nbors); + /// Total host memory used by library + double host_memory_usage() const; + + // ------------------------- DATA ----------------------------- + + // Device Properties + NVCDevice gpu; + // Device Error Flag + NVC_VecI dev_error; + // Stream for asynchronous work + cudaStream_t pair_stream; + + // Atom Data + PairGPUAtom atom; + // Neighbor Data + PairGPUNbor nbor; + + // --------------- Const Data for Atoms + NVC_ConstMatT sigma, epsilon, cutsq, offset; + NVC_ConstMat< typename cu_vec_traits::vec2 > lj1, lj3; + NVC_VecT special_lj; + + size_t max_atoms; + + // Timing for pair calculation + NVCTimer time_pair; + + // If atom type constants fit in shared memory, use fast kernels + bool shared_types; + + protected: + bool allocated; +}; + +#endif diff --git a/lib/gpu/nvc_device.cu b/lib/gpu/nvc_device.cu new file mode 100644 index 0000000000..01a2db1bd7 --- /dev/null +++ b/lib/gpu/nvc_device.cu @@ -0,0 +1,96 @@ +/*************************************************************************** + nvc_device.cu + 
------------------- + W. Michael Brown + + Utilities for dealing with cuda devices + + __________________________________________________________________________ + This file is part of the NVC Library + __________________________________________________________________________ + + begin : Wed Jan 28 2009 + copyright : (C) 2009 by W. Michael Brown + email : wmbrown@sandia.gov + ***************************************************************************/ + +/* ----------------------------------------------------------------------- + Copyright (2009) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + ----------------------------------------------------------------------- */ + +#include +#include +#include +#include "nvc_macros.h" +#include "nvc_device.h" + +// Grabs the properties for all devices +NVCDevice::NVCDevice() { + CUDA_SAFE_CALL(cudaGetDeviceCount(&_num_devices)); + for (int dev=0; dev<_num_devices; ++dev) { + cudaDeviceProp deviceProp; + CUDA_SAFE_CALL(cudaGetDeviceProperties(&deviceProp, dev)); + if (deviceProp.major == 9999 && deviceProp.minor == 9999) + break; + _properties.push_back(deviceProp); + } + _device=0; +} + +// Set the CUDA device to the specified device number +void NVCDevice::set(int num) { + if (_device==num) + return; + cudaThreadExit(); + CUDA_SAFE_CALL(cudaSetDevice(num)); + _device=num; +} + +// List all devices along with all properties +void NVCDevice::print_all(ostream &out) { + if (num_devices() == 0) + printf("There is no device supporting CUDA\n"); + for (int i=0; i= 2000 + printf(" Number of multiprocessors: %d\n", + _properties[i].multiProcessorCount); + printf(" Number of cores: %d\n",cores(i)); + #endif + printf(" Total amount of constant memory: %u bytes\n", + _properties[i].totalConstMem); + printf(" Total amount of shared memory per block: %u bytes\n", + _properties[i].sharedMemPerBlock); + printf(" Total number of registers available per block: %d\n", + _properties[i].regsPerBlock); + printf(" Warp size: %d\n", + _properties[i].warpSize); + printf(" Maximum number of threads per block: %d\n", + _properties[i].maxThreadsPerBlock); + printf(" Maximum sizes of each dimension of a block: %d x %d x %d\n", + _properties[i].maxThreadsDim[0], + _properties[i].maxThreadsDim[1], + _properties[i].maxThreadsDim[2]); + printf(" Maximum sizes of each dimension of a grid: %d x %d x %d\n", + _properties[i].maxGridSize[0], + _properties[i].maxGridSize[1], + _properties[i].maxGridSize[2]); + printf(" Maximum memory pitch: %u bytes\n", + _properties[i].memPitch); + printf(" Texture alignment: %u bytes\n", + _properties[i].textureAlignment); + printf(" Clock rate: %.2f GHz\n", + clock_rate(i)); + #if CUDART_VERSION >= 2000 + printf(" Concurrent copy and execution: %s\n", + _properties[i].deviceOverlap ? "Yes" : "No"); + #endif + } +} + diff --git a/lib/gpu/nvc_device.h b/lib/gpu/nvc_device.h new file mode 100644 index 0000000000..1ba5f2bc4c --- /dev/null +++ b/lib/gpu/nvc_device.h @@ -0,0 +1,92 @@ +/*************************************************************************** + nvc_device.h + ------------------- + W. 
Michael Brown + + Utilities for dealing with cuda devices + + __________________________________________________________________________ + This file is part of the NVC Library + __________________________________________________________________________ + + begin : Wed Jan 28 2009 + copyright : (C) 2009 by W. Michael Brown + email : wmbrown@sandia.gov + ***************************************************************************/ + +/* ----------------------------------------------------------------------- + Copyright (2009) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + ----------------------------------------------------------------------- */ + +#ifndef NVC_DEVICE +#define NVC_DEVICE + +#include +#include +#include + +using namespace std; + +/// Class for looking at device properties +/** \note Calls to change the device outside of the class results in incorrect + * behavior + * \note There is no error checking for indexing past the number of devices **/ +class NVCDevice { + public: + /// Grabs the properties for all devices + NVCDevice(); + + /// Return the number of devices that support CUDA + inline int num_devices() { return _properties.size(); } + + /// Set the CUDA device to the specified device number + void set(int num); + + /// Get the current device number + inline int device_num() { return _device; } + + /// Get the current CUDA device name + inline string name() { return name(_device); } + /// Get the CUDA device name + inline string name(const int i) { return string(_properties[i].name); } + + /// Get the number of cores in the current device + inline unsigned cores() { return cores(_device); } + /// Get the number of cores + inline unsigned cores(const int i) + { return _properties[i].multiProcessorCount*8; } + + /// Get the gigabytes of global memory in the current device + inline double gigabytes() { return gigabytes(_device); } + /// Get the gigabytes of global memory + inline double gigabytes(const int i) + { return static_cast(_properties[i].totalGlobalMem)/1073741824; } + + /// Get the bytes of global memory in the current device + inline size_t bytes() { return bytes(_device); } + /// Get the bytes of global memory + inline size_t bytes(const int i) { return _properties[i].totalGlobalMem; } + + /// Return the GPGPU revision number for current device + inline double revision() { return revision(_device); } + /// Return the GPGPU revision number + inline double revision(const int i) + { return static_cast(_properties[i].minor)/10+_properties[i].major;} + + /// Clock rate in GHz for current device + inline double clock_rate() { return clock_rate(_device); } + /// Clock rate in GHz + inline double clock_rate(const int i) { return _properties[i].clockRate*1e-6;} + + /// List all devices along with all properties + void print_all(ostream &out); + + private: + int _device, _num_devices; + vector _properties; +}; + +#endif diff --git a/lib/gpu/nvc_get_devices.cu b/lib/gpu/nvc_get_devices.cu new file mode 100644 index 0000000000..a1d35232e6 --- /dev/null +++ b/lib/gpu/nvc_get_devices.cu @@ -0,0 +1,31 @@ +/*************************************************************************** + nvc_get_devices.h + ------------------- + W. 
Michael Brown + + List properties of cuda devices + + __________________________________________________________________________ + This file is part of the NVC Library + __________________________________________________________________________ + + begin : Wed Jan 28 2009 + copyright : (C) 2009 by W. Michael Brown + email : wmbrown@sandia.gov + ***************************************************************************/ + +/* ----------------------------------------------------------------------- + Copyright (2009) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + ----------------------------------------------------------------------- */ + +#include "nvc_device.h" + +int main(int argc, char** argv) { + NVCDevice gpu; + gpu.print_all(cout); + return 0; +} + diff --git a/lib/gpu/nvc_macros.h b/lib/gpu/nvc_macros.h new file mode 100644 index 0000000000..56bb5c64ce --- /dev/null +++ b/lib/gpu/nvc_macros.h @@ -0,0 +1,121 @@ +#ifndef NVC_MACROS_H +#define NVC_MACROS_H + +#include +#include "math_constants.h" +#define INT_MUL(x,y) (__mul24(x,y)) +//#define INT_MUL(x,y) ((x)*(y)) + +template +static __inline__ __device__ numbr cuda_inf() { return CUDART_INF_F; } + +#ifdef CUDA_DOUBLE +template <> +static __inline__ __device__ double cuda_inf() { return CUDART_INF; } +#endif + +template +static __inline__ __device__ numbr cuda_zero() { return 0.0; } + +template <> +static __inline__ __device__ float cuda_zero() { return 0.0f; } + +#ifdef DEBUG + +# define CU_SAFE_CALL_NO_SYNC( call ) do { \ + CUresult err = call; \ + if( CUDA_SUCCESS != err) { \ + fprintf(stderr, "Cuda driver error %x in file '%s' in line %i.\n", \ + err, __FILE__, __LINE__ ); \ + exit(EXIT_FAILURE); \ + } } while (0) + +# define CU_SAFE_CALL( call ) do { \ + CU_SAFE_CALL_NO_SYNC(call); \ + CUresult err = cuCtxSynchronize(); \ + if( CUDA_SUCCESS != err) { \ + fprintf(stderr, "Cuda driver error %x in file '%s' in line %i.\n", \ + err, __FILE__, __LINE__ ); \ + exit(EXIT_FAILURE); \ + } } while (0) + +# define CUDA_SAFE_CALL_NO_SYNC( call) do { \ + cudaError err = call; \ + if( cudaSuccess != err) { \ + fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n", \ + __FILE__, __LINE__, cudaGetErrorString( err) ); \ + exit(EXIT_FAILURE); \ + } } while (0) + +# define CUDA_SAFE_CALL( call) do { \ + CUDA_SAFE_CALL_NO_SYNC(call); \ + cudaError err = cudaThreadSynchronize(); \ + if( cudaSuccess != err) { \ + fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n", \ + __FILE__, __LINE__, cudaGetErrorString( err) ); \ + exit(EXIT_FAILURE); \ + } } while (0) + +# define CUFFT_SAFE_CALL( call) do { \ + cufftResult err = call; \ + if( CUFFT_SUCCESS != err) { \ + fprintf(stderr, "CUFFT error in file '%s' in line %i.\n", \ + __FILE__, __LINE__); \ + exit(EXIT_FAILURE); \ + } } while (0) + +# define CUT_SAFE_CALL( call) \ + if( CUTTrue != call) { \ + fprintf(stderr, "Cut error in file '%s' in line %i.\n", \ + __FILE__, __LINE__); \ + exit(EXIT_FAILURE); \ + } + + //! 
Check for CUDA error +# define CUT_CHECK_ERROR(errorMessage) do { \ + cudaError_t err = cudaGetLastError(); \ + if( cudaSuccess != err) { \ + fprintf(stderr, "Cuda error: %s in file '%s' in line %i : %s.\n", \ + errorMessage, __FILE__, __LINE__, cudaGetErrorString( err) );\ + exit(EXIT_FAILURE); \ + } \ + err = cudaThreadSynchronize(); \ + if( cudaSuccess != err) { \ + fprintf(stderr, "Cuda error: %s in file '%s' in line %i : %s.\n", \ + errorMessage, __FILE__, __LINE__, cudaGetErrorString( err) );\ + exit(EXIT_FAILURE); \ + } } while (0) + + //! Check for malloc error +# define CUT_SAFE_MALLOC( mallocCall ) do{ \ + if( !(mallocCall)) { \ + fprintf(stderr, "Host malloc failure in file '%s' in line %i\n", \ + __FILE__, __LINE__); \ + exit(EXIT_FAILURE); \ + } } while(0); + + //! Check if conditon is true (flexible assert) +# define CUT_CONDITION( val) \ + if( CUTFalse == cutCheckCondition( val, __FILE__, __LINE__)) { \ + exit(EXIT_FAILURE); \ + } + +#else // not DEBUG + +#define CUT_BANK_CHECKER( array, index) array[index] + + // void macros for performance reasons +# define CUT_CHECK_ERROR(errorMessage) +# define CUT_CHECK_ERROR_GL() +# define CUT_CONDITION( val) +# define CU_SAFE_CALL_NO_SYNC( call) call +# define CU_SAFE_CALL( call) call +# define CUDA_SAFE_CALL_NO_SYNC( call) call +# define CUDA_SAFE_CALL( call) call +# define CUT_SAFE_CALL( call) call +# define CUFFT_SAFE_CALL( call) call +# define CUT_SAFE_MALLOC( mallocCall ) mallocCall + +#endif + +#endif diff --git a/lib/gpu/nvc_memory.h b/lib/gpu/nvc_memory.h new file mode 100644 index 0000000000..a83178985c --- /dev/null +++ b/lib/gpu/nvc_memory.h @@ -0,0 +1,447 @@ +/*************************************************************************** + nvc_memory.h + ------------------- + W. Michael Brown + + Routines for memory management on CUDA devices + + __________________________________________________________________________ + This file is part of the NVC Library + __________________________________________________________________________ + + begin : Thu Jun 25 2009 + copyright : (C) 2009 by W. Michael Brown + email : wmbrown@sandia.gov + ***************************************************************************/ + +/* ----------------------------------------------------------------------- + Copyright (2009) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. 
+ ----------------------------------------------------------------------- */ + +#ifndef NVC_MEMORY_H +#define NVC_MEMORY_H + +#include + +#define NVC_HostT NVC_Host +#define NVC_HostD NVC_Host +#define NVC_HostS NVC_Host +#define NVC_HostI NVC_Host + +#define NVC_VecT NVC_Vec +#define NVC_VecD NVC_Vec +#define NVC_VecS NVC_Vec +#define NVC_VecI NVC_Vec +#define NVC_VecI2 NVC_Vec +#define NVC_VecU2 NVC_Vec + +#define NVC_MatT NVC_Mat +#define NVC_MatD NVC_Mat +#define NVC_MatS NVC_Mat +#define NVC_MatI NVC_Mat + +#define NVC_ConstMatT NVC_ConstMat +#define NVC_ConstMatD NVC_ConstMat +#define NVC_ConstMatS NVC_ConstMat +#define NVC_ConstMatI NVC_ConstMat +#define NVC_ConstMatD2 NVC_ConstMat + +namespace NVC { + +// Get a channel for float array +template +inline void cuda_gb_get_channel(cudaChannelFormatDesc &channel) { + channel = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat); +} + +// Get a channel for float2 array +template <> +inline void cuda_gb_get_channel(cudaChannelFormatDesc &channel) { + channel = cudaCreateChannelDesc(32, 32, 0, 0, cudaChannelFormatKindFloat); +} + +// Get a channel for double array +template <> +inline void cuda_gb_get_channel(cudaChannelFormatDesc &channel) { + channel = cudaCreateChannelDesc(32, 32, 0, 0, cudaChannelFormatKindSigned); +} + +// Get a channel for double array +template <> +inline void cuda_gb_get_channel(cudaChannelFormatDesc &channel) { + channel = cudaCreateChannelDesc(32, 32, 32, 32, cudaChannelFormatKindSigned); +} + +// Get a channel for int array +template <> +inline void cuda_gb_get_channel(cudaChannelFormatDesc &channel) { + channel = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindSigned); +} + +} + +/// Page-locked Row Vector on Host +template +class NVC_Host { + public: + NVC_Host() { _cols=0; } + ~NVC_Host() { if (_cols>0) CUDA_SAFE_CALL(cudaFreeHost(_array)); } + + // Allocate page-locked memory with fast write/slow read on host + inline void safe_alloc_w(const size_t cols) { + _cols=cols; + _row_bytes=cols*sizeof(numtyp); + CUDA_SAFE_CALL(cudaHostAlloc((void **)&_array,_row_bytes, + cudaHostAllocWriteCombined)); + _end=_array+cols; + } + + // Allocate page-locked memory with fast read/write on host + inline void safe_alloc_rw(const size_t cols) { + _cols=cols; + _row_bytes=cols*sizeof(numtyp); + CUDA_SAFE_CALL(cudaMallocHost((void **)&_array,_row_bytes)); + _end=_array+cols; + } + + /// Free any memory associated with device + inline void clear() + { if (_cols>0) { _cols=0; CUDA_SAFE_CALL(cudaFreeHost(_array)); } } + + /// Set each element to zero + inline void zero() { memset(_array,0,row_bytes()); } + + /// Set first n elements to zero + inline void zero(const int n) { memset(_array,0,n*sizeof(numtyp)); } + + inline numtyp * begin() { return _array; } + inline const numtyp * begin() const { return _array; } + inline numtyp * end() { return _end; } + inline const numtyp * end() const { return _end; } + + inline size_t numel() const { return _cols; } + inline size_t rows() const { return 1; } + inline size_t cols() const { return _cols; } + inline size_t row_size() const { return _cols; } + inline size_t row_bytes() const { return _row_bytes; } + + inline numtyp & operator[](const int i) { return _array[i]; } + inline const numtyp & operator[](const int i) const { return _array[i]; } + + /// Copy from device (numel is not bytes) + inline void copy_from_device(const numtyp *device_p, size_t numel) { + CUDA_SAFE_CALL(cudaMemcpy(_array,device_p,numel*sizeof(numtyp), + cudaMemcpyDeviceToHost)); + } + + /// Copy 
to device (numel is not bytes) + inline void copy_to_device(numtyp *device_p, size_t numel) { + CUDA_SAFE_CALL(cudaMemcpy(device_p,_array,numel*sizeof(numtyp), + cudaMemcpyHostToDevice)); + } + + /// Copy to 2D matrix on device (numel is not bytes) + inline void copy_to_2Ddevice(numtyp *device_p, const size_t dev_row_size, + const size_t rows, const size_t cols) { + CUDA_SAFE_CALL(cudaMemcpy2D(device_p,dev_row_size*sizeof(numtyp), + _array,cols*sizeof(numtyp), + cols*sizeof(numtyp),rows, + cudaMemcpyHostToDevice)); + } + + /// Asynchronous copy from device (numel is not bytes) + inline void copy_from_device(const numtyp *device_p, size_t numel, + cudaStream_t &stream) { + CUDA_SAFE_CALL(cudaMemcpyAsync(_array,device_p,numel*sizeof(numtyp), + cudaMemcpyDeviceToHost,stream)); + } + + /// Asynchronous copy to device (numel is not bytes) + inline void copy_to_device(numtyp *device_p, size_t numel, + cudaStream_t &stream) { + CUDA_SAFE_CALL(cudaMemcpyAsync(device_p,_array,numel*sizeof(numtyp), + cudaMemcpyHostToDevice,stream)); + } + + /// Asynchronous copy to 2D matrix on device (numel is not bytes) + inline void copy_to_2Ddevice(numtyp *device_p, const size_t dev_row_size, + const size_t rows, const size_t cols, + cudaStream_t &stream) { + CUDA_SAFE_CALL(cudaMemcpy2DAsync(device_p,dev_row_size*sizeof(numtyp), + _array,cols*sizeof(numtyp), + cols*sizeof(numtyp),rows, + cudaMemcpyHostToDevice,stream)); + } + + private: + numtyp *_array, *_end; + size_t _row_bytes, _row_size, _rows, _cols; +}; + +/// Row vector on device +template +class NVC_Vec { + public: + NVC_Vec() { _cols=0; } + ~NVC_Vec() { if (_cols>0) CUDA_SAFE_CALL(cudaFree(_array)); } + + // Row vector on device + inline void safe_alloc(const size_t cols) { + _cols=cols; + _row_bytes=cols*sizeof(numtyp); + CUDA_SAFE_CALL(cudaMalloc((void **)&_array,_row_bytes)); + _end=_array+cols; + } + + /// Free any memory associated with device + inline void clear() + { if (_cols>0) { _cols=0; CUDA_SAFE_CALL(cudaFree(_array)); } } + + /// Set each element to zero + inline void zero() { CUDA_SAFE_CALL(cudaMemset(_array,0,row_bytes())); } + + inline numtyp * begin() { return _array; } + inline const numtyp * begin() const { return _array; } + inline numtyp * end() { return _end; } + inline const numtyp * end() const { return _end; } + + inline size_t numel() const { return _cols; } + inline size_t rows() const { return 1; } + inline size_t cols() const { return _cols; } + inline size_t row_size() const { return _cols; } + inline size_t row_bytes() const { return _row_bytes; } + + /// Copy from host + inline void copy_from_host(const numtyp *host_p) + { CUDA_SAFE_CALL(cudaMemcpy(_array,host_p,row_bytes(), + cudaMemcpyHostToDevice)); } + + /// Asynchronous copy from host + inline void copy_from_host(const numtyp *host_p, cudaStream_t &stream) + { CUDA_SAFE_CALL(cudaMemcpyAsync(_array,host_p,row_bytes(), + cudaMemcpyHostToDevice, stream)); } + + /// Copy to host + inline void copy_to_host(numtyp *host_p) + { CUDA_SAFE_CALL(cudaMemcpy(host_p,_array,row_bytes(), + cudaMemcpyDeviceToHost)); } + + /// Copy n elements to host + inline void copy_to_host(numtyp *host_p, const int n) + { CUDA_SAFE_CALL(cudaMemcpy(host_p,_array,n*sizeof(numtyp), + cudaMemcpyDeviceToHost)); } + + /// Cast and then copy to device + template + inline void cast_copy(const numtyp2 *buffer, NVC_HostT &host_write) { + for (int i=0; i(buffer[i]); + copy_from_host(host_write.begin()); + } + + /// Bind to texture + template + inline void bind_texture(texture &texi, cudaChannelFormatDesc 
&channel) { + NVC::cuda_gb_get_channel(channel); + texi.addressMode[0] = cudaAddressModeClamp; + texi.addressMode[1] = cudaAddressModeClamp; + texi.filterMode = cudaFilterModePoint; + texi.normalized = false; + CUDA_SAFE_CALL(cudaBindTexture(NULL,&texi,_array,&channel)); + } + + /// Output the vector (debugging) + inline void print(std::ostream &out) { print (out, numel()); } + + // Output first n elements of vector + inline void print(std::ostream &out, const int n) { + numtyp *t=new numtyp[n]; + copy_to_host(t,n); + for (int i=0; i +class NVC_Mat { + public: + NVC_Mat() { _rows=0; } + ~NVC_Mat() { if (_rows>0) CUDA_SAFE_CALL(cudaFree(_array)); } + + // Row major matrix on device + // - Coalesced access using adjacent cols on same row + // - NVC_Mat(row,col) given by array[row*row_size()+col] + inline void safe_alloc(const size_t rows, const size_t cols) { + _rows=rows; + _cols=cols; + CUDA_SAFE_CALL(cudaMallocPitch((void **)&_array,&_pitch, + cols*sizeof(numtyp),rows)); + _row_size=_pitch/sizeof(numtyp); + _end=_array+_row_size*cols; + } + + /// Free any memory associated with device + inline void clear() + { if (_rows>0) { _rows=0; CUDA_SAFE_CALL(cudaFree(_array)); } } + + /// Set each element to zero + inline void zero() { CUDA_SAFE_CALL(cudaMemset(_array,0, _pitch*_rows)); } + + inline numtyp * begin() { return _array; } + inline const numtyp * begin() const { return _array; } + inline numtyp * end() { return _end; } + inline const numtyp * end() const { return _end; } + + + inline size_t numel() const { return _cols*_rows; } + inline size_t rows() const { return _rows; } + inline size_t cols() const { return _cols; } + inline size_t row_size() const { return _row_size; } + inline size_t row_bytes() const { return _pitch; } + + /// Copy from host (elements not bytes) + inline void copy_from_host(const numtyp *host_p, const size_t numel) + { CUDA_SAFE_CALL(cudaMemcpy(_array,host_p,numel*sizeof(numtyp), + cudaMemcpyHostToDevice)); } + + /// Asynchronous copy from host (elements not bytes) + inline void copy_from_host(const numtyp *host_p, const size_t numel, + cudaStream_t &stream) + { CUDA_SAFE_CALL(cudaMemcpyAsync(_array,host_p,numel*sizeof(numtyp), + cudaMemcpyHostToDevice, stream)); } + + /// Asynchronous Copy from Host + /** \note Used when the number of columns/rows allocated on host smaller than + * on device **/ + inline void copy_2Dfrom_host(const numtyp *host_p, const size_t rows, + const size_t cols, cudaStream_t &stream) { + CUDA_SAFE_CALL(cudaMemcpy2DAsync(_array, _pitch, host_p,cols*sizeof(numtyp), + cols*sizeof(numtyp), rows, + cudaMemcpyHostToDevice,stream)); + } + + private: + numtyp *_array, *_end; + size_t _pitch, _row_size, _rows, _cols; +}; + +/// Const 2D Matrix on device (requires texture binding) +template +class NVC_ConstMat { + public: + NVC_ConstMat() { _rows=0; } + ~NVC_ConstMat() { if (_rows>0) CUDA_SAFE_CALL(cudaFreeArray(_array)); } + + /// Row major matrix on device + inline void safe_alloc(const size_t rows, const size_t cols) { + _rows=rows; + _cols=cols; + + NVC::cuda_gb_get_channel(_channel); + CUDA_SAFE_CALL(cudaMallocArray(&_array, &_channel, cols, rows)); + } + + /// Bind to texture + template + inline void bind_texture(texture &texi) { + texi.addressMode[0] = cudaAddressModeClamp; + texi.addressMode[1] = cudaAddressModeClamp; + texi.filterMode = cudaFilterModePoint; + texi.normalized = false; + CUDA_SAFE_CALL(cudaBindTextureToArray(&texi,_array,&_channel)); + } + + /// Free any memory associated with device + inline void clear() + { if (_rows>0) { 
_rows=0; CUDA_SAFE_CALL(cudaFreeArray(_array)); } } + + inline size_t numel() const { return _cols*_rows; } + inline size_t rows() const { return _rows; } + inline size_t cols() const { return _cols; } + inline size_t row_size() const { return _cols; } + inline size_t row_bytes() const { return _cols*sizeof(numtyp); } + + /// Copy from Host + inline void copy_from_host(const numtyp *host_p) { + CUDA_SAFE_CALL(cudaMemcpyToArray(_array, 0, 0, host_p, + numel()*sizeof(numtyp), + cudaMemcpyHostToDevice)); + } + + /// Copy from Host + /** \note Used when the number of columns/rows allocated on host smaller than + * on device **/ + inline void copy_2Dfrom_host(const numtyp *host_p, const size_t rows, + const size_t cols) { + CUDA_SAFE_CALL(cudaMemcpy2DToArray(_array, 0, 0, host_p, + cols*sizeof(numtyp), cols*sizeof(numtyp), rows, + cudaMemcpyHostToDevice)); + } + + /// Asynchronous Copy from Host + inline void copy_from_host(const numtyp *host_p, cudaStream_t &stream) { + CUDA_SAFE_CALL(cudaMemcpyToArrayAsync(_array, 0, 0, host_p, + numel()*sizeof(numtyp), + cudaMemcpyHostToDevice,stream)); + } + + /// Asynchronous Copy from Host + /** \note Used when the number of columns/rows allocated on host smaller than + * on device **/ + inline void copy_2Dfrom_host(const numtyp *host_p, const size_t rows, + const size_t cols, cudaStream_t &stream) { + CUDA_SAFE_CALL(cudaMemcpy2DToArrayAsync(_array, 0, 0, host_p, + cols*sizeof(numtyp), cols*sizeof(numtyp), rows, + cudaMemcpyHostToDevice,stream)); + } + + /// Cast buffer to numtyp in host_write and copy to array + template + inline void cast_copy(const numtyp2 *buffer, NVC_HostT &host_write) { + int n=numel(); + for (int i=0; i(*buffer); buffer++; + } + copy_from_host(host_write.begin()); + } + + /// Cast buffer to numtyp in host_write and copy to array + /** \note Used when the number of columns/rows allocated on host smaller than + * on device **/ + template + inline void cast_copy2D(const numtyp2 *buffer, NVC_HostT &host_write, + const size_t rows, const size_t cols) { + int n=rows*cols; + for (int i=0; i(*buffer); buffer++; + } + copy_2Dfrom_host(host_write.begin(),rows,cols); + } + + /// Cast buffer to numtyp in host_write and copy to array asynchronously + template + inline void cast_copy(const numtyp2 *buffer, NVC_HostT &host_write, + cudaStream_t &stream) { + int n=numel(); + for (int i=0; i(*buffer); buffer++; + } + copy_from_host(host_write.begin(),stream); + } + + private: + size_t _rows, _cols; + cudaArray *_array; + cudaChannelFormatDesc _channel; +}; + +#endif diff --git a/lib/gpu/nvc_timer.h b/lib/gpu/nvc_timer.h new file mode 100644 index 0000000000..12346b5ea2 --- /dev/null +++ b/lib/gpu/nvc_timer.h @@ -0,0 +1,83 @@ +/*************************************************************************** + nvc_timer.h + ------------------- + W. Michael Brown + + Class for timing CUDA routines + + __________________________________________________________________________ + This file is part of the NVC Library + __________________________________________________________________________ + + begin : Tue Feb 3 2009 + copyright : (C) 2009 by W. Michael Brown + email : wmbrown@sandia.gov + ***************************************************************************/ + +/* ----------------------------------------------------------------------- + Copyright (2009) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. 
This software is distributed under + the GNU General Public License. + ----------------------------------------------------------------------- */ + +#ifndef NVC_TIMER_H +#define NVC_TIMER_H + +#include "nvc_macros.h" + +#define cudaEventDestroy(a) + +/// Class for timing CUDA events +class NVCTimer { + public: + NVCTimer() : _total_time(0.0f), initialized(false) { } + + ~NVCTimer() { + if (initialized) + { cudaEventDestroy(start_event); cudaEventDestroy(stop_event); } + } + + inline void init() { + if (!initialized) { + initialized=true; + CUDA_SAFE_CALL( cudaEventCreate(&start_event) ); + CUDA_SAFE_CALL( cudaEventCreate(&stop_event) ); + } + } + + /// Start timing + inline void start() { cudaEventRecord(start_event,0); } + + /// Stop timing and store event time + inline void stop() { cudaEventRecord(stop_event,0); } + + /// Set the time elapsed to zero (not the total_time) + inline void zero() + { cudaEventRecord(start_event,0); cudaEventRecord(stop_event,0); } + + /// Add time from previous start and stop to total + /** Forces synchronization **/ + inline void add_to_total() { _total_time+=time(); } + + /// Return the time (ms) of last start to stop - Forces synchronization + inline double time() { + float timer; + cudaEventSynchronize(stop_event); + CUDA_SAFE_CALL( cudaEventElapsedTime(&timer,start_event,stop_event) ); + return timer; + } + + /// Return the total time in ms + inline double total_time() { return _total_time; } + + /// Return the total time in seconds + inline double total_seconds() { return _total_time/1000.0; } + + private: + cudaEvent_t start_event, stop_event; + double _total_time; + bool initialized; +}; + +#endif diff --git a/lib/gpu/pair_gpu_atom.cu b/lib/gpu/pair_gpu_atom.cu new file mode 100644 index 0000000000..94617a5c11 --- /dev/null +++ b/lib/gpu/pair_gpu_atom.cu @@ -0,0 +1,173 @@ +/*************************************************************************** + pair_gpu_atom.cu + ------------------- + W. Michael Brown + + Memory routines for moving atom and force data between host and gpu + + __________________________________________________________________________ + This file is part of the LAMMPS GPU Library + __________________________________________________________________________ + + begin : Tue Aug 4 2009 + copyright : (C) 2009 by W. Michael Brown + email : wmbrown@sandia.gov + ***************************************************************************/ + +/* ----------------------------------------------------------------------- + Copyright (2009) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. 
+ ----------------------------------------------------------------------- */ + +#include "pair_gpu_atom.h" +#define PairGPUAtomT PairGPUAtom + +template +int PairGPUAtomT::bytes_per_atom() const { + return atom_fields()*sizeof(numtyp)+ans_fields()*sizeof(acctyp); +} + +template +void PairGPUAtomT::init(const int max_atoms) { + if (allocated) + clear(); + + _max_atoms=max_atoms; + + // Initialize timers for the selected GPU + time_atom.init(); + time_answer.init(); + + // Device matrices for atom and force data + dev_x.safe_alloc(atom_fields(),max_atoms); + x_bind_texture(dev_x); + ans.safe_alloc(ans_fields(),max_atoms); + + // Get a host write only buffer + host_write.safe_alloc_w(max_atoms*4); + // Get a host read/write buffer + host_read.safe_alloc_rw(ans.row_size()*ans_fields()); + + allocated=true; +} + +template +void PairGPUAtomT::clear() { + if (!allocated) + return; + allocated=false; + + x_unbind_texture(); + ans.clear(); + host_write.clear(); + host_read.clear(); + dev_x.clear(); +} + +template +double PairGPUAtomT::host_memory_usage(const int max_atoms) const { + return max_atoms*atom_fields()*sizeof(numtyp)+ + ans_fields()*(max_atoms)*sizeof(acctyp)+ + sizeof(PairGPUAtom); +} + +template +void PairGPUAtomT::copy_answers(const bool eflag, const bool vflag, + cudaStream_t &s) { + _eflag=eflag; + _vflag=vflag; + + int csize=ans_fields(); + if (!eflag) + csize--; + if (!vflag) + csize-=6; + + host_read.copy_from_device(ans.begin(),ans.row_size()*csize,s); +} + +template +double PairGPUAtomT::energy_virial(const int *ilist, const bool eflag_atom, + const bool vflag_atom, double *eatom, + double **vatom, double *virial) { + double evdwl=0.0; + int gap=ans.row_size()-_inum; + + acctyp *ap=host_read.begin(); + if (_eflag) { + if (eflag_atom) { + for (int i=0; i<_inum; i++) { + evdwl+=*ap; + eatom[ilist[i]]+=*ap*0.5; + ap++; + } + } else + for (int i=0; i<_inum; i++) { + evdwl+=*ap; + ap++; + } + ap+=gap; + evdwl*=0.5; + } + _read_loc=ap; + gap=ans.row_size(); + if (_vflag) { + if (vflag_atom) { + for (int ii=0; ii<_inum; ii++) { + int i=ilist[ii]; + ap=_read_loc+ii; + for (int j=0; j<6; j++) { + vatom[i][j]+=*ap*0.5; + virial[j]+=*ap; + ap+=gap; + } + } + } else { + for (int ii=0; ii<_inum; ii++) { + ap=_read_loc+ii; + for (int j=0; j<6; j++) { + virial[j]+=*ap; + ap+=gap; + } + } + } + for (int j=0; j<6; j++) + virial[j]*=0.5; + _read_loc+=gap*6; + } + + return evdwl; +} + +template +void PairGPUAtomT::add_forces(const int *ilist, double **f) { + int gap=ans.row_size(); + for (int ii=0; ii<_inum; ii++) { + acctyp *ap=_read_loc+ii; + int i=ilist[ii]; + f[i][0]+=*ap; + ap+=gap; + f[i][1]+=*ap; + ap+=gap; + f[i][2]+=*ap; + } +} + +template +void PairGPUAtomT::add_torques(const int *ilist, double **tor, const int n) { + int gap=ans.row_size(); + _read_loc+=gap*3; + for (int ii=0; ii; diff --git a/lib/gpu/pair_gpu_atom.h b/lib/gpu/pair_gpu_atom.h new file mode 100644 index 0000000000..a5796db45a --- /dev/null +++ b/lib/gpu/pair_gpu_atom.h @@ -0,0 +1,151 @@ +/*************************************************************************** + pair_gpu_atom.h + ------------------- + W. Michael Brown + + Memory routines for moving atom and force data between host and gpu + + __________________________________________________________________________ + This file is part of the LAMMPS GPU Library + __________________________________________________________________________ + + begin : Tue Aug 4 2009 + copyright : (C) 2009 by W. 
Michael Brown + email : wmbrown@sandia.gov + ***************************************************************************/ + +/* ----------------------------------------------------------------------- + Copyright (2009) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + ----------------------------------------------------------------------- */ + +#ifndef PAIR_GPU_ATOM_H +#define PAIR_GPU_ATOM_H + +// PRECISION - Precision for rsq, energy, force, and torque calculation +// ACC_PRECISION - Precision for accumulation of energies, forces, and torques +#ifdef _SINGLE_DOUBLE +#define PRECISION float +#define ACC_PRECISION double +#endif + +#ifdef _DOUBLE_DOUBLE +#define PRECISION double +#define ACC_PRECISION double +#endif + +#ifndef PRECISION +#define PRECISION float +#define ACC_PRECISION double +#endif + +#define MAX_ATOMS 65536 +#include "nvc_timer.h" +#include "pair_gpu_texture.h" + +template +class PairGPUAtom { + public: + PairGPUAtom() : _atom_fields(4), _ans_fields(10), allocated(false) {} + ~PairGPUAtom() { clear(); } + + // Accessors + inline int atom_fields() const { return _atom_fields; } + inline int ans_fields() const { return _ans_fields; } + inline int max_atoms() const { return _max_atoms; } + inline int nall() const { return _nall; } + inline int inum() const { return _inum; } + + /// Set number of atoms for future copy operations + inline void nall(const int n) { _nall=n; } + /// Set number of inum for future copy operations + inline void inum(const int n) { _inum=n; } + /// Set the number of atom fields (x, y, z, type, etc) + inline void atom_fields(const int n) { _atom_fields=n; } + /// Set the number of answer fields (energy, virial, force, etc.) 
+ inline void ans_fields(const int n) { _ans_fields=n; } + + /// Memory usage per atom in this class + /** \note atom_fields and ans_fields should be set for correct answer **/ + int bytes_per_atom() const; + + /// Must be called once to allocate host and device memory + /** \note atom_fields and ans_fields should be set first if not default **/ + void init(const int max_atoms); + + /// Free all memory on host and device + void clear(); + + /// Return the total amount of host memory used by class + double host_memory_usage(const int max_atoms) const; + + + // -------------------------COPY TO GPU ---------------------------------- + + /// Reset the write buffer pointer (Start copying new atom data) + inline void reset_write_buffer() { _write_loc=host_write.begin(); } + + /// Add a row to write buffer with unit stride + /** Copies nall() elements **/ + template + inline void add_atom_data(const cpytyp *host_ptr) + { for (int i=0; i<_nall; i++) { *_write_loc=host_ptr[i]; _write_loc++; } } + + /// Add a row to write buffer with non-unit stride + /** Copies nall() elements **/ + template + inline void add_atom_data(const cpytyp *hostptr, const int stride) { + int t=_nall*stride; + for (int i=0; i ans; + + // Buffer for moving floating point data to GPU + NVC_HostT host_write; + // Buffer for moving floating point data to CPU + NVC_Host host_read; + + // Timing Stuff + NVCTimer time_atom, time_answer; + + private: + bool allocated, _eflag, _vflag; + int _atom_fields, _ans_fields; + int _max_atoms, _nall, _inum; + numtyp * _write_loc; + acctyp * _read_loc; +}; + +#endif diff --git a/lib/gpu/pair_gpu_nbor.cu b/lib/gpu/pair_gpu_nbor.cu new file mode 100644 index 0000000000..e3fc2b28d8 --- /dev/null +++ b/lib/gpu/pair_gpu_nbor.cu @@ -0,0 +1,80 @@ +/*************************************************************************** + pair_gpu_nbor.cu + ------------------- + W. Michael Brown + + Neighbor memory operations for LAMMPS GPU Library + + __________________________________________________________________________ + This file is part of the LAMMPS GPU Library + __________________________________________________________________________ + + begin : Tue Aug 4 2009 + copyright : (C) 2009 by W. Michael Brown + email : wmbrown@sandia.gov + ***************************************************************************/ + +/* ----------------------------------------------------------------------- + Copyright (2009) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. 
+ ----------------------------------------------------------------------- */ + +#include "pair_gpu_nbor.h" + +int PairGPUNbor::bytes_per_atom(const int max_nbors) const { + if (_use_packing) + return (max_nbors*2+4)*sizeof(int); + else + return (max_nbors+3)*sizeof(int); +} + +void PairGPUNbor::init(const int ij_size, const int max_atoms, + const int max_nbors) { + if (allocated) + clear(); + + // Initialize timers for the selected GPU + time_nbor.init(); + + if (_use_packing) + dev_nbor.safe_alloc(max_nbors+4,max_atoms); + else + dev_nbor.safe_alloc(3,max_atoms); + + ij.safe_alloc(max_nbors*max_atoms); + host_ij.safe_alloc_w(ij_size); + + allocated=true; +} + +void PairGPUNbor::clear() { + if (!allocated) + return; + allocated=false; + + ij.clear(); + host_ij.clear(); + dev_nbor.clear(); +} + +double PairGPUNbor::host_memory_usage() const { + return IJ_SIZE*sizeof(int)+sizeof(PairGPUNbor); +} + +void PairGPUNbor::reset(const int inum, int *ilist, const int *numj, + cudaStream_t &s) { + ij_total=0; + + dev_nbor.copy_from_host(ilist,inum); + int acc=0; + for (int i=0; i class cu_vec_traits; +template <> class cu_vec_traits { public: typedef float2 vec2; }; +template <> class cu_vec_traits { public: typedef double2 vec2; }; + +// ------------------------------- x ------------------------------------ + +static texture x_float_tex; +static texture x_double_tex; +template inline void x_bind_texture(NVC_ConstMatT &mat) + { mat.bind_texture(x_float_tex); } + +template <> inline void x_bind_texture(NVC_ConstMatD &mat) + { mat.bind_texture(x_double_tex); } +template inline void x_unbind_texture() + { cudaUnbindTexture(x_float_tex); } +template <> inline void x_unbind_texture() + { cudaUnbindTexture(x_double_tex); } +template +static __inline__ __device__ numtyp _x_(const int i, const int j) { + return tex2D(x_float_tex,i,j); +} +#ifdef GB_GPU_DOUBLE +template <> +static __inline__ __device__ double _x_(const int i,const int j) { + int2 t=tex2D(x_double_tex,i,j); + return __hiloint2double(t.y, t.x); +} +#endif + +// ------------------------------- form ------------------------------------ + +static texture form_tex; +inline void form_bind_texture(NVC_ConstMatI &mat) + { mat.bind_texture(form_tex); } +inline void form_unbind_texture() + { cudaUnbindTexture(form_tex); } +static __inline__ __device__ int _form_(const int i, const int j) { + return tex2D(form_tex,i,j); +} + +// ------------------------------- lshape ------------------------------------ + +static texture lshape_float_tex; +static texture lshape_double_tex; +static cudaChannelFormatDesc channel_lshape; +template inline void lshape_bind_texture(NVC_VecT &vec) + { vec.bind_texture(lshape_float_tex,channel_lshape); } +template <> inline void lshape_bind_texture(NVC_VecD &vec) + { vec.bind_texture(lshape_double_tex,channel_lshape); } +template inline void lshape_unbind_texture() + { cudaUnbindTexture(lshape_float_tex); } +template <> inline void lshape_unbind_texture() + { cudaUnbindTexture(lshape_double_tex); } +template +static __inline__ __device__ numtyp _lshape_(const int i) + { return tex1Dfetch(lshape_float_tex,i); } +#ifdef GB_GPU_DOUBLE +template <> +static __inline__ __device__ double _lshape_(const int i) { + int2 t=tex1Dfetch(lshape_double_tex,i); + return __hiloint2double(t.y, t.x); +} +#endif + +// ------------------------------- shape ------------------------------------ + +static texture shape_float_tex; +static texture shape_double_tex; +template inline void shape_bind_texture(NVC_ConstMatT &mat) + { 
mat.bind_texture(shape_float_tex); } +template <> inline void shape_bind_texture(NVC_ConstMatD &mat) + { mat.bind_texture(shape_double_tex); } +template inline void shape_unbind_texture() + { cudaUnbindTexture(shape_float_tex); } +template <> inline void shape_unbind_texture() + { cudaUnbindTexture(shape_double_tex); } +template +static __inline__ __device__ numtyp _shape_(const int i, const int j) { + return tex2D(shape_float_tex,j,i); +} +#ifdef GB_GPU_DOUBLE +template <> +static __inline__ __device__ double _shape_(const int i, const int j) { + int2 t=tex2D(shape_double_tex,j,i); + return __hiloint2double(t.y, t.x); +} +#endif + +// ------------------------------- well ------------------------------------ + +static texture well_float_tex; +static texture well_double_tex; +template inline void well_bind_texture(NVC_ConstMatT &mat) + { mat.bind_texture(well_float_tex); } +template <> inline void well_bind_texture(NVC_ConstMatD &mat) + { mat.bind_texture(well_double_tex); } +template inline void well_unbind_texture() + { cudaUnbindTexture(well_float_tex); } +template <> inline void well_unbind_texture() + { cudaUnbindTexture(well_double_tex); } +template +static __inline__ __device__ numtyp _well_(const int i, const int j) + { return tex2D(well_float_tex,j,i); } +#ifdef GB_GPU_DOUBLE +template <> +static __inline__ __device__ double _well_(const int i,const int j) { + int2 t=tex2D(well_double_tex,j,i); + return __hiloint2double(t.y, t.x); +} +#endif + +// ------------------------------- sigma ------------------------------------ + +static texture sigma_float_tex; +static texture sigma_double_tex; +template inline void sigma_bind_texture(NVC_ConstMatT &mat) + { mat.bind_texture(sigma_float_tex); } +template <> inline void sigma_bind_texture(NVC_ConstMatD &mat) + { mat.bind_texture(sigma_double_tex); } +template inline void sigma_unbind_texture() + { cudaUnbindTexture(sigma_float_tex); } +template <> inline void sigma_unbind_texture() + { cudaUnbindTexture(sigma_double_tex); } +template +static __inline__ __device__ numtyp _sigma_(const int i, const int j) { + return tex2D(sigma_float_tex,j,i); +} +#ifdef GB_GPU_DOUBLE +template <> +static __inline__ __device__ double _sigma_(const int i,const int j) { + int2 t=tex2D(sigma_double_tex,j,i); + return __hiloint2double(t.y, t.x); +} +#endif + +// ------------------------------- epsilon ------------------------------------ + +static texture epsilon_float_tex; +static texture epsilon_double_tex; +template inline void epsilon_bind_texture(NVC_ConstMatT &mat) + { mat.bind_texture(epsilon_float_tex); } +template <> inline void epsilon_bind_texture(NVC_ConstMatD &mat) + { mat.bind_texture(epsilon_double_tex); } +template inline void epsilon_unbind_texture() + { cudaUnbindTexture(epsilon_float_tex); } +template <> inline void epsilon_unbind_texture() + { cudaUnbindTexture(epsilon_double_tex); } +template +static __inline__ __device__ numtyp _epsilon_(const int i, const int j) { + return tex2D(epsilon_float_tex,j,i); +} +#ifdef GB_GPU_DOUBLE +template <> +static __inline__ __device__ double _epsilon_(const int i,const int j) { + int2 t=tex2D(epsilon_double_tex,j,i); + return __hiloint2double(t.y, t.x); +} +#endif + +// ------------------------------- cutsq ------------------------------------ + +static texture cutsq_float_tex; +static texture cutsq_double_tex; +template inline void cutsq_bind_texture(NVC_ConstMatT &mat) + { mat.bind_texture(cutsq_float_tex); } +template <> inline void cutsq_bind_texture(NVC_ConstMatD &mat) + { 
mat.bind_texture(cutsq_double_tex); } +template inline void cutsq_unbind_texture() + { cudaUnbindTexture(cutsq_float_tex); } +template <> inline void cutsq_unbind_texture() + { cudaUnbindTexture(cutsq_double_tex); } +template +static __inline__ __device__ numtyp _cutsq_(const int i, const int j) { + return tex2D(cutsq_float_tex,j,i); +} +#ifdef GB_GPU_DOUBLE +template <> +static __inline__ __device__ double _cutsq_(const int i,const int j) { + int2 t=tex2D(cutsq_double_tex,j,i); + return __hiloint2double(t.y, t.x); +} +#endif + +// ------------------------------- lj1 ------------------------------------ + +static texture lj1_float_tex; +static texture lj1_double_tex; +template inline void lj1_bind_texture(NVC_ConstMatT &mat) + { mat.bind_texture(lj1_float_tex); } +template <> inline void lj1_bind_texture(NVC_ConstMatD2 &mat) + { mat.bind_texture(lj1_double_tex); } +template inline void lj1_unbind_texture() + { cudaUnbindTexture(lj1_float_tex); } +template <> inline void lj1_unbind_texture() + { cudaUnbindTexture(lj1_double_tex); } +template +static __inline__ __device__ +typename cu_vec_traits::vec2 _lj1_(const int i, const int j) { + return tex2D(lj1_float_tex,j,i); +} +#ifdef GB_GPU_DOUBLE +template <> +static __inline__ __device__ double2 _lj1_(const int i,const int j) { + int4 t=tex2D(lj1_double_tex,j,i); + double2 ans; + ans.x=__hiloint2double(t.y, t.x); + ans.y=__hiloint2double(t.w, t.z); + return ans; +} +#endif + +// ------------------------------- lj3 ------------------------------------ + +static texture lj3_float_tex; +static texture lj3_double_tex; +template inline void lj3_bind_texture(NVC_ConstMatT &mat) + { mat.bind_texture(lj3_float_tex); } +template <> inline void lj3_bind_texture(NVC_ConstMatD2 &mat) + { mat.bind_texture(lj3_double_tex); } +template inline void lj3_unbind_texture() + { cudaUnbindTexture(lj3_float_tex); } +template <> inline void lj3_unbind_texture() + { cudaUnbindTexture(lj3_double_tex); } +template +static __inline__ __device__ +typename cu_vec_traits::vec2 _lj3_(const int i, const int j) { + return tex2D(lj3_float_tex,j,i); +} +#ifdef GB_GPU_DOUBLE +template <> +static __inline__ __device__ double2 _lj3_(const int i,const int j) { + int4 t=tex2D(lj3_double_tex,j,i); + double2 ans; + ans.x=__hiloint2double(t.y, t.x); + ans.y=__hiloint2double(t.w, t.z); + return ans; +} +#endif + +// ------------------------------- offset ------------------------------------ + +static texture offset_float_tex; +static texture offset_double_tex; +template inline void offset_bind_texture(NVC_ConstMatT &mat) + { mat.bind_texture(offset_float_tex); } +template <> inline void offset_bind_texture(NVC_ConstMatD &mat) + { mat.bind_texture(offset_double_tex); } +template inline void offset_unbind_texture() + { cudaUnbindTexture(offset_float_tex); } +template <> inline void offset_unbind_texture() + { cudaUnbindTexture(offset_double_tex); } +template +static __inline__ __device__ numtyp _offset_(const int i, const int j) { + return tex2D(offset_float_tex,j,i); +} +#ifdef GB_GPU_DOUBLE +template <> +static __inline__ __device__ double _offset_(const int i,const int j) { + int2 t=tex2D(offset_double_tex,j,i); + return __hiloint2double(t.y, t.x); +} +#endif + +#endif
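What follows is an editorial sketch, not part of the library, showing how the
texture helpers above are meant to be combined with the containers in
nvc_memory.h, mirroring LJ_GPU_Memory::init(): cast-copy a host table of
doubles through a page-locked write buffer into a cudaArray-backed
NVC_ConstMat, bind it with the matching *_bind_texture helper, and read it on
the device through the typed accessor.  Single precision (numtyp = float) is
assumed, as is an explicit precision template argument on the bind helper, and
the names example_bind_cutsq, example_cutsq_kernel, example_host_write,
example_cutsq, host_cutsq and ntypes are hypothetical.

#include "pair_gpu_texture.h"   // pulls in nvc_memory.h and the cutsq textures

// Device side: fetch one entry of the bound cutoff table.
template <class numtyp>
__global__ void example_cutsq_kernel(const int itype, const int jtype,
                                     numtyp *out) {
  *out = _cutsq_<numtyp>(itype, jtype);
}

// Host side: stage an ntypes x ntypes cutoff table and bind the texture.
// These objects must outlive the binding; in the library they are members
// of LJ_GPU_Memory.
static NVC_HostT example_host_write;   // page-locked staging buffer
static NVC_ConstMatT example_cutsq;    // cudaArray-backed constant matrix

void example_bind_cutsq(double **host_cutsq, const int ntypes) {
  example_host_write.safe_alloc_w(ntypes*ntypes);
  example_cutsq.safe_alloc(ntypes, ntypes);
  example_cutsq.cast_copy(host_cutsq[0], example_host_write);  // double -> float
  cutsq_bind_texture<float>(example_cutsq);                    // binds cutsq_float_tex
}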