/* ----------------------------------------------------------------------
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
   http://lammps.sandia.gov, Sandia National Laboratories
   Steve Plimpton, sjplimp@sandia.gov

   Copyright (2003) Sandia Corporation.  Under the terms of Contract
   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
   certain rights in this software.  This software is distributed under
   the GNU General Public License.

   See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */

/* ----------------------------------------------------------------------
   Contributing authors: Mike Brown (SNL), wmbrown@sandia.gov
                         Peng Wang (Nvidia), penwang@nvidia.com
                         Paul Crozier (SNL), pscrozi@sandia.gov
------------------------------------------------------------------------- */

#ifndef NVC_MEMORY_H
#define NVC_MEMORY_H

#include <iostream>
#include <cstring>      // memset
#include "nvc_macros.h"

#define NVC_HostT NVC_Host<numtyp>
#define NVC_HostD NVC_Host<double>
#define NVC_HostS NVC_Host<float>
#define NVC_HostI NVC_Host<int>

#define NVC_VecT NVC_Vec<numtyp>
#define NVC_VecD NVC_Vec<double>
#define NVC_VecS NVC_Vec<float>
#define NVC_VecI NVC_Vec<int>
#define NVC_VecI2 NVC_Vec<int2>
#define NVC_VecU2 NVC_Vec<uint2>

#define NVC_MatT NVC_Mat<numtyp>
#define NVC_MatD NVC_Mat<double>
#define NVC_MatS NVC_Mat<float>
#define NVC_MatI NVC_Mat<int>

#define NVC_ConstMatT NVC_ConstMat<numtyp>
#define NVC_ConstMatD NVC_ConstMat<double>
#define NVC_ConstMatS NVC_ConstMat<float>
#define NVC_ConstMatI NVC_ConstMat<int>
#define NVC_ConstMatD2 NVC_ConstMat<double2>

namespace NVC {

// Get a channel for float array
template <class numtyp>
inline void cuda_gb_get_channel(cudaChannelFormatDesc &channel) {
  channel = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
}

// Get a channel for float2 array
template <>
inline void cuda_gb_get_channel<float2>(cudaChannelFormatDesc &channel) {
  channel = cudaCreateChannelDesc(32, 32, 0, 0, cudaChannelFormatKindFloat);
}

// Get a channel for double array (fetched as two 32-bit signed words)
template <>
inline void cuda_gb_get_channel<double>(cudaChannelFormatDesc &channel) {
  channel = cudaCreateChannelDesc(32, 32, 0, 0, cudaChannelFormatKindSigned);
}

// Get a channel for double2 array (fetched as four 32-bit signed words)
template <>
inline void cuda_gb_get_channel<double2>(cudaChannelFormatDesc &channel) {
  channel = cudaCreateChannelDesc(32, 32, 32, 32, cudaChannelFormatKindSigned);
}

// Get a channel for int array
template <>
inline void cuda_gb_get_channel<int>(cudaChannelFormatDesc &channel) {
  channel = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindSigned);
}

}
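/* Example (illustrative sketch, not part of the original file): the channel
   descriptors above control how texture fetches interpret memory.  Doubles
   use a 32+32-bit signed channel, so a kernel fetches an int2 and reassembles
   the double, e.g. with __hiloint2double(v.y,v.x).  Host-side selection:

     cudaChannelFormatDesc ch;
     NVC::cuda_gb_get_channel<double>(ch);   // 64 bits as two signed words
*/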
/// Page-locked Row Vector on Host
template <class numtyp>
class NVC_Host {
 public:
  NVC_Host() { _cols=0; }
  ~NVC_Host() { if (_cols>0) CUDA_SAFE_CALL(cudaFreeHost(_array)); }

  // Allocate page-locked memory with fast write/slow read on host
  inline void safe_alloc_w(const size_t cols) {
    _cols=cols;
    _row_bytes=cols*sizeof(numtyp);
    CUDA_SAFE_CALL(cudaHostAlloc((void **)&_array,_row_bytes,
                                 cudaHostAllocWriteCombined));
    _end=_array+cols;
  }

  // Allocate page-locked memory with fast write/slow read on host
  inline bool alloc_w(const size_t cols) {
    _cols=cols;
    _row_bytes=cols*sizeof(numtyp);
    if (cudaHostAlloc((void **)&_array,_row_bytes,
                      cudaHostAllocWriteCombined)!=cudaSuccess)
      return false;
    _end=_array+cols;
    return true;
  }

  // Allocate page-locked memory with fast read/write on host
  inline void safe_alloc_rw(const size_t cols) {
    _cols=cols;
    _row_bytes=cols*sizeof(numtyp);
    CUDA_SAFE_CALL(cudaMallocHost((void **)&_array,_row_bytes));
    _end=_array+cols;
  }

  // Allocate page-locked memory with fast read/write on host
  inline bool alloc_rw(const size_t cols) {
    _cols=cols;
    _row_bytes=cols*sizeof(numtyp);
    if (cudaMallocHost((void **)&_array,_row_bytes)!=cudaSuccess)
      return false;
    _end=_array+cols;
    return true;
  }

  /// Free any memory associated with host
  inline void clear()
    { if (_cols>0) { _cols=0; CUDA_SAFE_CALL(cudaFreeHost(_array)); } }

  /// Set each element to zero
  inline void zero() { memset(_array,0,row_bytes()); }
  /// Set first n elements to zero
  inline void zero(const int n) { memset(_array,0,n*sizeof(numtyp)); }

  inline numtyp * begin() { return _array; }
  inline const numtyp * begin() const { return _array; }
  inline numtyp * end() { return _end; }
  inline const numtyp * end() const { return _end; }

  inline size_t numel() const { return _cols; }
  inline size_t rows() const { return 1; }
  inline size_t cols() const { return _cols; }
  inline size_t row_size() const { return _cols; }
  inline size_t row_bytes() const { return _row_bytes; }

  inline numtyp & operator[](const int i) { return _array[i]; }
  inline const numtyp & operator[](const int i) const { return _array[i]; }

  /// Copy from device (numel is not bytes)
  inline void copy_from_device(const numtyp *device_p, size_t numel) {
    CUDA_SAFE_CALL(cudaMemcpy(_array,device_p,numel*sizeof(numtyp),
                              cudaMemcpyDeviceToHost));
  }

  /// Copy to device (numel is not bytes)
  inline void copy_to_device(numtyp *device_p, size_t numel) {
    CUDA_SAFE_CALL(cudaMemcpy(device_p,_array,numel*sizeof(numtyp),
                              cudaMemcpyHostToDevice));
  }

  /// Copy to 2D matrix on device (numel is not bytes)
  inline void copy_to_2Ddevice(numtyp *device_p, const size_t dev_row_size,
                               const size_t rows, const size_t cols) {
    CUDA_SAFE_CALL(cudaMemcpy2D(device_p,dev_row_size*sizeof(numtyp),
                                _array,cols*sizeof(numtyp),
                                cols*sizeof(numtyp),rows,
                                cudaMemcpyHostToDevice));
  }

  /// Asynchronous copy from device (numel is not bytes)
  inline void copy_from_device(const numtyp *device_p, size_t numel,
                               cudaStream_t &stream) {
    CUDA_SAFE_CALL_NO_SYNC(cudaMemcpyAsync(_array,device_p,
                                           numel*sizeof(numtyp),
                                           cudaMemcpyDeviceToHost,stream));
  }

  /// Asynchronous copy to device (numel is not bytes)
  inline void copy_to_device(numtyp *device_p, size_t numel,
                             cudaStream_t &stream) {
    CUDA_SAFE_CALL_NO_SYNC(cudaMemcpyAsync(device_p,_array,
                                           numel*sizeof(numtyp),
                                           cudaMemcpyHostToDevice,stream));
  }

  /// Asynchronous copy to device starting at offset (numel is not bytes)
  inline void copy_to_device(size_t offset, numtyp *device_p, size_t numel,
                             cudaStream_t &stream) {
    CUDA_SAFE_CALL_NO_SYNC(cudaMemcpyAsync(device_p,_array+offset,
                                           numel*sizeof(numtyp),
                                           cudaMemcpyHostToDevice,stream));
  }

  /// Asynchronous copy to 2D matrix on device (numel is not bytes)
  inline void copy_to_2Ddevice(numtyp *device_p, const size_t dev_row_size,
                               const size_t rows, const size_t cols,
                               cudaStream_t &stream) {
    CUDA_SAFE_CALL_NO_SYNC(cudaMemcpy2DAsync(device_p,
                                             dev_row_size*sizeof(numtyp),
                                             _array,cols*sizeof(numtyp),
                                             cols*sizeof(numtyp),rows,
                                             cudaMemcpyHostToDevice,stream));
  }

 private:
  numtyp *_array, *_end;
  size_t _row_bytes, _row_size, _rows, _cols;
};
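/* Example usage (illustrative sketch, not part of the original file):
   stage data in write-combined page-locked memory for a fast asynchronous
   host->device transfer.  "forces", "dev_forces", "n", and "stream" are
   hypothetical names.  Write-combined memory is fast to fill from the host
   but slow to read back, so it suits one-way staging buffers.

     NVC_Host<float> forces;
     forces.safe_alloc_w(n);                    // fill it, don't read it
     for (int i=0; i<n; i++) forces[i]=0.0f;
     forces.copy_to_device(dev_forces,n,stream);
*/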
/// Row vector on device
template <class numtyp>
class NVC_Vec {
 public:
  NVC_Vec() { _cols=0; }
  ~NVC_Vec() { if (_cols>0) CUDA_SAFE_CALL(cudaFree(_array)); }

  // Row vector on device
  inline void safe_alloc(const size_t cols) {
    _cols=cols;
    _row_bytes=cols*sizeof(numtyp);
    CUDA_SAFE_CALL(cudaMalloc((void **)&_array,_row_bytes));
    _end=_array+cols;
  }

  // Row vector on device
  inline bool alloc(const size_t cols) {
    _cols=cols;
    _row_bytes=cols*sizeof(numtyp);
    if (cudaMalloc((void **)&_array,_row_bytes)!=cudaSuccess)
      return false;
    _end=_array+cols;
    return true;
  }

  // Row vector on device (allocate, assign texture, and bind)
  inline void safe_alloc(const size_t cols, textureReference *t)
    { safe_alloc(cols); assign_texture(t); bind(); }

  /// Free any memory associated with device
  inline void clear()
    { if (_cols>0) { _cols=0; CUDA_SAFE_CALL(cudaFree(_array)); } }

  /// Set each element to zero
  inline void zero() { CUDA_SAFE_CALL(cudaMemset(_array,0,row_bytes())); }

  inline numtyp * begin() { return _array; }
  inline const numtyp * begin() const { return _array; }
  inline numtyp * end() { return _end; }
  inline const numtyp * end() const { return _end; }

  inline size_t numel() const { return _cols; }
  inline size_t rows() const { return 1; }
  inline size_t cols() const { return _cols; }
  inline size_t row_size() const { return _cols; }
  inline size_t row_bytes() const { return _row_bytes; }

  /// Copy from host
  inline void copy_from_host(const numtyp *host_p) {
    CUDA_SAFE_CALL(cudaMemcpy(_array,host_p,row_bytes(),
                              cudaMemcpyHostToDevice));
  }

  /// Copy from host (n elements)
  inline void copy_from_host(const numtyp *host_p, const size_t n) {
    CUDA_SAFE_CALL(cudaMemcpy(_array,host_p,n*sizeof(numtyp),
                              cudaMemcpyHostToDevice));
  }

  /// Asynchronous copy from host
  inline void copy_from_host(const numtyp *host_p, cudaStream_t &stream) {
    CUDA_SAFE_CALL_NO_SYNC(cudaMemcpyAsync(_array,host_p,row_bytes(),
                                           cudaMemcpyHostToDevice,stream));
  }

  /// Asynchronous copy from host (n elements)
  inline void copy_from_host(const numtyp *host_p, const size_t n,
                             cudaStream_t &stream) {
    CUDA_SAFE_CALL_NO_SYNC(cudaMemcpyAsync(_array,host_p,n*sizeof(numtyp),
                                           cudaMemcpyHostToDevice,stream));
  }

  /// Copy to host
  inline void copy_to_host(numtyp *host_p) {
    CUDA_SAFE_CALL(cudaMemcpy(host_p,_array,row_bytes(),
                              cudaMemcpyDeviceToHost));
  }

  /// Copy n elements to host
  inline void copy_to_host(numtyp *host_p, const int n) {
    CUDA_SAFE_CALL(cudaMemcpy(host_p,_array,n*sizeof(numtyp),
                              cudaMemcpyDeviceToHost));
  }

  /// Cast each element to numtyp in host buffer, then copy to device
  template <class numtyp2>
  inline void cast_copy(const numtyp2 *buffer, NVC_HostT &host_write) {
    for (int i=0; i<numel(); i++)
      host_write[i]=static_cast<numtyp>(buffer[i]);
    copy_from_host(host_write.begin());
  }

  /// Assign a texture to matrix
  inline void assign_texture(textureReference *t) { _tex_ptr=t; }

  /// Bind to texture
  inline void bind() {
    NVC::cuda_gb_get_channel<numtyp>(_channel);
    (*_tex_ptr).addressMode[0] = cudaAddressModeClamp;
    (*_tex_ptr).addressMode[1] = cudaAddressModeClamp;
    (*_tex_ptr).filterMode = cudaFilterModePoint;
    (*_tex_ptr).normalized = false;
    CUDA_SAFE_CALL(cudaBindTexture(NULL,_tex_ptr,_array,&_channel));
  }

  /// Unbind texture
  inline void unbind() { CUDA_SAFE_CALL(cudaUnbindTexture(_tex_ptr)); }

  /// Output the vector (debugging)
  inline void print(std::ostream &out) { print(out,numel()); }

  // Output first n elements of vector
  inline void print(std::ostream &out, const int n) {
    numtyp *t=new numtyp[n];
    copy_to_host(t,n);
    for (int i=0; i<n; i++)
      out << t[i] << " ";
    delete []t;
  }

 private:
  numtyp *_array, *_end;
  size_t _row_bytes, _cols;
  cudaChannelFormatDesc _channel;
  textureReference *_tex_ptr;
};
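/* Example usage (illustrative sketch, not part of the original file):
   allocate a device vector, bind it to a texture declared at file scope in
   the .cu file, and upload data.  "pos_tex", "host_pos", and "n" are
   hypothetical; texture<float> derives from textureReference, so its
   address can be passed directly on the host side.

     texture<float> pos_tex;                 // file-scope texture reference
     NVC_Vec<float> pos;
     pos.safe_alloc(n,&pos_tex);             // alloc + assign + bind
     pos.copy_from_host(host_pos);           // n elements from host
     // ... launch kernels fetching through pos_tex ...
     pos.unbind();
     pos.clear();
*/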
/// 2D row-major Matrix on device
template <class numtyp>
class NVC_Mat {
 public:
  NVC_Mat() { _rows=0; }
  ~NVC_Mat() { if (_rows>0) CUDA_SAFE_CALL(cudaFree(_array)); }

  // Row major matrix on device
  //   - Coalesced access using adjacent cols on same row
  //   - NVC_Mat(row,col) given by array[row*row_size()+col]
  inline void safe_alloc(const size_t rows, const size_t cols) {
    _rows=rows;
    _cols=cols;
    CUDA_SAFE_CALL(cudaMallocPitch((void **)&_array,&_pitch,
                                   cols*sizeof(numtyp),rows));
    _row_size=_pitch/sizeof(numtyp);
    _end=_array+_row_size*rows;
  }

  /// Free any memory associated with device
  inline void clear()
    { if (_rows>0) { _rows=0; CUDA_SAFE_CALL(cudaFree(_array)); } }

  /// Set each element to zero
  inline void zero() { CUDA_SAFE_CALL(cudaMemset(_array,0,_pitch*_rows)); }

  inline numtyp * begin() { return _array; }
  inline const numtyp * begin() const { return _array; }
  inline numtyp * end() { return _end; }
  inline const numtyp * end() const { return _end; }

  inline size_t numel() const { return _cols*_rows; }
  inline size_t rows() const { return _rows; }
  inline size_t cols() const { return _cols; }
  inline size_t row_size() const { return _row_size; }
  inline size_t row_bytes() const { return _pitch; }

  /// Copy from host (elements not bytes)
  inline void copy_from_host(const numtyp *host_p, const size_t numel) {
    CUDA_SAFE_CALL(cudaMemcpy(_array,host_p,numel*sizeof(numtyp),
                              cudaMemcpyHostToDevice));
  }

  /// Asynchronous copy from host (elements not bytes)
  inline void copy_from_host(const numtyp *host_p, const size_t numel,
                             cudaStream_t &stream) {
    CUDA_SAFE_CALL_NO_SYNC(cudaMemcpyAsync(_array,host_p,numel*sizeof(numtyp),
                                           cudaMemcpyHostToDevice,stream));
  }

  /// Asynchronous copy from host
  /** \note Used when the number of columns/rows allocated on host is smaller
    * than on device **/
  inline void copy_2Dfrom_host(const numtyp *host_p, const size_t rows,
                               const size_t cols, cudaStream_t &stream) {
    CUDA_SAFE_CALL_NO_SYNC(cudaMemcpy2DAsync(_array,_pitch,
                                             host_p,cols*sizeof(numtyp),
                                             cols*sizeof(numtyp),rows,
                                             cudaMemcpyHostToDevice,stream));
  }

 private:
  numtyp *_array, *_end;
  size_t _pitch, _row_size, _rows, _cols;
};
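/* Example usage (illustrative sketch, not part of the original file):
   a pitched rows x cols matrix.  Note that row_size() (elements per padded
   row) can exceed cols() because cudaMallocPitch pads each row for
   alignment; device indexing is therefore array[row*row_size()+col].
   "nbor", "host_buf", "rows", "cols", and "stream" are hypothetical.

     NVC_Mat<int> nbor;
     nbor.safe_alloc(rows,cols);
     nbor.zero();                                       // clears padding too
     nbor.copy_2Dfrom_host(host_buf,rows,cols,stream);  // pitched async copy
*/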
/// Const 2D Matrix on device (requires texture binding)
template <class numtyp>
class NVC_ConstMat {
 public:
  NVC_ConstMat() { _rows=0; }
  ~NVC_ConstMat() { if (_rows>0) CUDA_SAFE_CALL(cudaFreeArray(_array)); }

  /// Assign a texture to matrix
  inline void assign_texture(textureReference *t) { _tex_ptr=t; }

  /// Row major matrix on device
  inline void safe_alloc(const size_t rows, const size_t cols) {
    _rows=rows;
    _cols=cols;
    NVC::cuda_gb_get_channel<numtyp>(_channel);
    CUDA_SAFE_CALL(cudaMallocArray(&_array,&_channel,cols,rows));
  }

  /// Row major matrix on device (allocate, assign texture, and bind)
  inline void safe_alloc(const size_t rows, const size_t cols,
                         textureReference *t)
    { safe_alloc(rows,cols); assign_texture(t); bind(); }

  /// Bind to texture
  inline void bind() {
    (*_tex_ptr).addressMode[0] = cudaAddressModeClamp;
    (*_tex_ptr).addressMode[1] = cudaAddressModeClamp;
    (*_tex_ptr).filterMode = cudaFilterModePoint;
    (*_tex_ptr).normalized = false;
    CUDA_SAFE_CALL(cudaBindTextureToArray(_tex_ptr,_array,&_channel));
  }

  /// Unbind texture
  inline void unbind() { CUDA_SAFE_CALL(cudaUnbindTexture(_tex_ptr)); }

  /// Free any memory associated with device and unbind
  inline void clear() {
    if (_rows>0) {
      _rows=0;
      CUDA_SAFE_CALL(cudaUnbindTexture(_tex_ptr));
      CUDA_SAFE_CALL(cudaFreeArray(_array));
    }
  }

  inline size_t numel() const { return _cols*_rows; }
  inline size_t rows() const { return _rows; }
  inline size_t cols() const { return _cols; }
  inline size_t row_size() const { return _cols; }
  inline size_t row_bytes() const { return _cols*sizeof(numtyp); }

  /// Copy from host
  inline void copy_from_host(const numtyp *host_p) {
    CUDA_SAFE_CALL(cudaMemcpyToArray(_array,0,0,host_p,
                                     numel()*sizeof(numtyp),
                                     cudaMemcpyHostToDevice));
  }

  /// Copy from host
  /** \note Used when the number of columns/rows allocated on host is smaller
    * than on device **/
  inline void copy_2Dfrom_host(const numtyp *host_p, const size_t rows,
                               const size_t cols) {
    CUDA_SAFE_CALL(cudaMemcpy2DToArray(_array,0,0,host_p,
                                       cols*sizeof(numtyp),
                                       cols*sizeof(numtyp),rows,
                                       cudaMemcpyHostToDevice));
  }

  /// Asynchronous copy from host
  inline void copy_from_host(const numtyp *host_p, cudaStream_t &stream) {
    CUDA_SAFE_CALL_NO_SYNC(cudaMemcpyToArrayAsync(_array,0,0,host_p,
                                                  numel()*sizeof(numtyp),
                                                  cudaMemcpyHostToDevice,
                                                  stream));
  }

  /// Asynchronous copy from host
  /** \note Used when the number of columns/rows allocated on host is smaller
    * than on device **/
  inline void copy_2Dfrom_host(const numtyp *host_p, const size_t rows,
                               const size_t cols, cudaStream_t &stream) {
    CUDA_SAFE_CALL_NO_SYNC(cudaMemcpy2DToArrayAsync(_array,0,0,host_p,
                                                    cols*sizeof(numtyp),
                                                    cols*sizeof(numtyp),rows,
                                                    cudaMemcpyHostToDevice,
                                                    stream));
  }

  /// Cast buffer to numtyp in host_write and copy to array
  template <class numtyp2>
  inline void cast_copy(const numtyp2 *buffer, NVC_HostT &host_write) {
    int n=numel();
    for (int i=0; i<n; i++) {
      host_write[i]=static_cast<numtyp>(*buffer);
      buffer++;
    }
    copy_from_host(host_write.begin());
  }

  /// Cast buffer to numtyp in host_write and copy to array
  /** \note Used when the number of columns/rows allocated on host is smaller
    * than on device **/
  template <class numtyp2>
  inline void cast_copy2D(const numtyp2 *buffer, NVC_HostT &host_write,
                          const size_t rows, const size_t cols) {
    int n=rows*cols;
    for (int i=0; i<n; i++) {
      host_write[i]=static_cast<numtyp>(*buffer);
      buffer++;
    }
    copy_2Dfrom_host(host_write.begin(),rows,cols);
  }

  /// Cast buffer to numtyp in host_write and copy to array asynchronously
  template <class numtyp2>
  inline void cast_copy(const numtyp2 *buffer, NVC_HostT &host_write,
                        cudaStream_t &stream) {
    int n=numel();
    for (int i=0; i<n; i++) {
      host_write[i]=static_cast<numtyp>(*buffer);
      buffer++;
    }
    copy_from_host(host_write.begin(),stream);
  }

 private:
  size_t _rows, _cols;
  cudaArray *_array;
  cudaChannelFormatDesc _channel;
  textureReference *_tex_ptr;
};
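/* Example usage (illustrative sketch, not part of the original file):
   upload read-only coefficients into a cudaArray-backed texture, casting
   double-precision input to float through a page-locked staging buffer.
   "coeff_tex", "host_stage", "host_coeff", "rows", and "cols" are
   hypothetical names.

     texture<float,2> coeff_tex;             // file-scope texture reference
     NVC_ConstMat<float> coeff;
     NVC_Host<float> host_stage;
     host_stage.safe_alloc_w(rows*cols);
     coeff.safe_alloc(rows,cols,&coeff_tex); // alloc + assign + bind
     coeff.cast_copy(host_coeff,host_stage); // double -> float, then upload
     // ... launch kernels fetching through coeff_tex ...
     coeff.clear();                          // unbind + free
*/

#endif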