git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@7885 f3b2605a-c512-4ea7-a41b-209d697bcdaa
This commit is contained in:
@ -1,2 +1 @@
|
|||||||
Geryon Version 11.094
|
Geryon Version 12.034
|
||||||
|
|
||||||
|
|||||||
@ -117,6 +117,14 @@ class UCL_Device {
|
|||||||
_cq.pop_back();
|
_cq.pop_back();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Set the default command queue (by default this is the null stream)
|
||||||
|
/** \param i index of the command queue (as added by push_command_queue())
|
||||||
|
If i is 0, the default command queue is set to the null stream **/
|
||||||
|
inline void set_command_queue(const int i) {
|
||||||
|
if (i==0) _cq[0]=0;
|
||||||
|
else _cq[0]=_cq[i];
|
||||||
|
}
|
||||||
|
|
||||||
/// Get the current CUDA device name
|
/// Get the current CUDA device name
|
||||||
inline std::string name() { return name(_device); }
|
inline std::string name() { return name(_device); }
|
||||||
/// Get the CUDA device name
|
/// Get the CUDA device name
|
||||||
@ -280,6 +288,7 @@ inline int UCL_Device::set(int num) {
|
|||||||
if (_device>-1) {
|
if (_device>-1) {
|
||||||
CU_SAFE_CALL_NS(cuCtxDestroy(_context));
|
CU_SAFE_CALL_NS(cuCtxDestroy(_context));
|
||||||
for (int i=1; i<num_queues(); i++) pop_command_queue();
|
for (int i=1; i<num_queues(); i++) pop_command_queue();
|
||||||
|
_cq[0]=0;
|
||||||
}
|
}
|
||||||
_device=_properties[num].device_id;
|
_device=_properties[num].device_id;
|
||||||
CU_SAFE_CALL_NS(cuDeviceGet(&_cu_device,_device));
|
CU_SAFE_CALL_NS(cuDeviceGet(&_cu_device,_device));
|
||||||
|
|||||||
@ -34,11 +34,11 @@ class UCL_Texture;
|
|||||||
/// Class storing 1 or more kernel functions from a single string or file
|
/// Class storing 1 or more kernel functions from a single string or file
|
||||||
class UCL_Program {
|
class UCL_Program {
|
||||||
public:
|
public:
|
||||||
inline UCL_Program(UCL_Device &device) {}
|
inline UCL_Program(UCL_Device &device) { _cq=device.cq(); }
|
||||||
inline ~UCL_Program() {}
|
inline ~UCL_Program() {}
|
||||||
|
|
||||||
/// Initialize the program with a device
|
/// Initialize the program with a device
|
||||||
inline void init(UCL_Device &device) { }
|
inline void init(UCL_Device &device) { _cq=device.cq(); }
|
||||||
|
|
||||||
/// Clear any data associated with program
|
/// Clear any data associated with program
|
||||||
/** \note Must call init() after each clear **/
|
/** \note Must call init() after each clear **/
|
||||||
@ -130,6 +130,7 @@ class UCL_Program {
|
|||||||
friend class UCL_Kernel;
|
friend class UCL_Kernel;
|
||||||
private:
|
private:
|
||||||
CUmodule _module;
|
CUmodule _module;
|
||||||
|
CUstream _cq;
|
||||||
friend class UCL_Texture;
|
friend class UCL_Texture;
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -141,7 +142,7 @@ class UCL_Kernel {
|
|||||||
|
|
||||||
UCL_Kernel(UCL_Program &program, const char *function) :
|
UCL_Kernel(UCL_Program &program, const char *function) :
|
||||||
_dimensions(1), _num_args(0), _param_size(0)
|
_dimensions(1), _num_args(0), _param_size(0)
|
||||||
{ _num_blocks[0]=0; set_function(program,function); }
|
{ _num_blocks[0]=0; set_function(program,function); _cq=program._cq; }
|
||||||
|
|
||||||
~UCL_Kernel() {}
|
~UCL_Kernel() {}
|
||||||
|
|
||||||
@ -160,6 +161,7 @@ class UCL_Kernel {
|
|||||||
#endif
|
#endif
|
||||||
return UCL_FUNCTION_NOT_FOUND;
|
return UCL_FUNCTION_NOT_FOUND;
|
||||||
}
|
}
|
||||||
|
_cq=program._cq;
|
||||||
return UCL_SUCCESS;
|
return UCL_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -229,7 +231,7 @@ class UCL_Kernel {
|
|||||||
/// Run the kernel in the default command queue
|
/// Run the kernel in the default command queue
|
||||||
inline void run() {
|
inline void run() {
|
||||||
CU_SAFE_CALL(cuParamSetSize(_kernel,_param_size));
|
CU_SAFE_CALL(cuParamSetSize(_kernel,_param_size));
|
||||||
CU_SAFE_CALL(cuLaunchGridAsync(_kernel,_num_blocks[0],_num_blocks[1],0));
|
CU_SAFE_CALL(cuLaunchGridAsync(_kernel,_num_blocks[0],_num_blocks[1],_cq));
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Run the kernel in the specified command queue
|
/// Run the kernel in the specified command queue
|
||||||
@ -245,6 +247,7 @@ class UCL_Kernel {
|
|||||||
|
|
||||||
private:
|
private:
|
||||||
CUfunction _kernel;
|
CUfunction _kernel;
|
||||||
|
CUstream _cq;
|
||||||
unsigned _dimensions;
|
unsigned _dimensions;
|
||||||
unsigned _num_blocks[2];
|
unsigned _num_blocks[2];
|
||||||
unsigned _num_args;
|
unsigned _num_args;
|
||||||
|
|||||||
@ -57,6 +57,7 @@ inline int _host_alloc(mat_type &mat, copy_type &cm, const size_t n,
|
|||||||
*(mat.host_ptr())=(typename mat_type::data_type*)malloc(n);
|
*(mat.host_ptr())=(typename mat_type::data_type*)malloc(n);
|
||||||
if (err!=CUDA_SUCCESS || *(mat.host_ptr())==NULL)
|
if (err!=CUDA_SUCCESS || *(mat.host_ptr())==NULL)
|
||||||
return UCL_MEMORY_ERROR;
|
return UCL_MEMORY_ERROR;
|
||||||
|
mat.cq()=cm.cq();
|
||||||
return UCL_SUCCESS;
|
return UCL_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -72,6 +73,7 @@ inline int _host_alloc(mat_type &mat, UCL_Device &dev, const size_t n,
|
|||||||
*(mat.host_ptr())=(typename mat_type::data_type*)malloc(n);
|
*(mat.host_ptr())=(typename mat_type::data_type*)malloc(n);
|
||||||
if (err!=CUDA_SUCCESS || *(mat.host_ptr())==NULL)
|
if (err!=CUDA_SUCCESS || *(mat.host_ptr())==NULL)
|
||||||
return UCL_MEMORY_ERROR;
|
return UCL_MEMORY_ERROR;
|
||||||
|
mat.cq()=dev.cq();
|
||||||
return UCL_SUCCESS;
|
return UCL_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -92,6 +94,7 @@ inline int _device_alloc(mat_type &mat, copy_type &cm, const size_t n,
|
|||||||
CUresult err=cuMemAlloc(&mat.cbegin(),n);
|
CUresult err=cuMemAlloc(&mat.cbegin(),n);
|
||||||
if (err!=CUDA_SUCCESS)
|
if (err!=CUDA_SUCCESS)
|
||||||
return UCL_MEMORY_ERROR;
|
return UCL_MEMORY_ERROR;
|
||||||
|
mat.cq()=cm.cq();
|
||||||
return UCL_SUCCESS;
|
return UCL_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -101,6 +104,7 @@ inline int _device_alloc(mat_type &mat, UCL_Device &dev, const size_t n,
|
|||||||
CUresult err=cuMemAlloc(&mat.cbegin(),n);
|
CUresult err=cuMemAlloc(&mat.cbegin(),n);
|
||||||
if (err!=CUDA_SUCCESS)
|
if (err!=CUDA_SUCCESS)
|
||||||
return UCL_MEMORY_ERROR;
|
return UCL_MEMORY_ERROR;
|
||||||
|
mat.cq()=dev.cq();
|
||||||
return UCL_SUCCESS;
|
return UCL_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -115,6 +119,7 @@ inline int _device_alloc(mat_type &mat, copy_type &cm, const size_t rows,
|
|||||||
pitch=static_cast<size_t>(upitch);
|
pitch=static_cast<size_t>(upitch);
|
||||||
if (err!=CUDA_SUCCESS)
|
if (err!=CUDA_SUCCESS)
|
||||||
return UCL_MEMORY_ERROR;
|
return UCL_MEMORY_ERROR;
|
||||||
|
mat.cq()=cm.cq();
|
||||||
return UCL_SUCCESS;
|
return UCL_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -129,6 +134,7 @@ inline int _device_alloc(mat_type &mat, UCL_Device &d, const size_t rows,
|
|||||||
pitch=static_cast<size_t>(upitch);
|
pitch=static_cast<size_t>(upitch);
|
||||||
if (err!=CUDA_SUCCESS)
|
if (err!=CUDA_SUCCESS)
|
||||||
return UCL_MEMORY_ERROR;
|
return UCL_MEMORY_ERROR;
|
||||||
|
mat.cq()=d.cq();
|
||||||
return UCL_SUCCESS;
|
return UCL_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -243,8 +249,8 @@ template<> struct _ucl_memcpy<2,2> {
|
|||||||
const size_t rows) {
|
const size_t rows) {
|
||||||
CUDA_MEMCPY2D ins;
|
CUDA_MEMCPY2D ins;
|
||||||
_nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
|
_nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
|
||||||
ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
|
ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
|
||||||
ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
|
ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
|
||||||
ins.dstArray=dst.cbegin();
|
ins.dstArray=dst.cbegin();
|
||||||
ins.srcArray=src.cbegin();
|
ins.srcArray=src.cbegin();
|
||||||
CU_SAFE_CALL(cuMemcpy2D(&ins));
|
CU_SAFE_CALL(cuMemcpy2D(&ins));
|
||||||
@ -255,8 +261,8 @@ template<> struct _ucl_memcpy<2,2> {
|
|||||||
const size_t rows, CUstream &cq) {
|
const size_t rows, CUstream &cq) {
|
||||||
CUDA_MEMCPY2D ins;
|
CUDA_MEMCPY2D ins;
|
||||||
_nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
|
_nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
|
||||||
ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
|
ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
|
||||||
ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
|
ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
|
||||||
ins.dstArray=dst.cbegin();
|
ins.dstArray=dst.cbegin();
|
||||||
ins.srcArray=src.cbegin();
|
ins.srcArray=src.cbegin();
|
||||||
CU_SAFE_CALL(cuMemcpy2DAsync(&ins,cq));
|
CU_SAFE_CALL(cuMemcpy2DAsync(&ins,cq));
|
||||||
@ -280,8 +286,8 @@ template<> struct _ucl_memcpy<2,0> {
|
|||||||
const size_t rows) {
|
const size_t rows) {
|
||||||
CUDA_MEMCPY2D ins;
|
CUDA_MEMCPY2D ins;
|
||||||
_nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
|
_nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
|
||||||
ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
|
ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
|
||||||
ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
|
ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
|
||||||
ins.dstArray=dst.cbegin();
|
ins.dstArray=dst.cbegin();
|
||||||
ins.srcDevice=src.cbegin();
|
ins.srcDevice=src.cbegin();
|
||||||
CU_SAFE_CALL(cuMemcpy2D(&ins));
|
CU_SAFE_CALL(cuMemcpy2D(&ins));
|
||||||
@ -292,8 +298,8 @@ template<> struct _ucl_memcpy<2,0> {
|
|||||||
const size_t rows, CUstream &cq) {
|
const size_t rows, CUstream &cq) {
|
||||||
CUDA_MEMCPY2D ins;
|
CUDA_MEMCPY2D ins;
|
||||||
_nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
|
_nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
|
||||||
ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
|
ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
|
||||||
ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
|
ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
|
||||||
ins.dstArray=dst.cbegin();
|
ins.dstArray=dst.cbegin();
|
||||||
ins.srcDevice=src.cbegin();
|
ins.srcDevice=src.cbegin();
|
||||||
CU_SAFE_CALL(cuMemcpy2DAsync(&ins,cq));
|
CU_SAFE_CALL(cuMemcpy2DAsync(&ins,cq));
|
||||||
@ -317,8 +323,8 @@ template<> struct _ucl_memcpy<2,1> {
|
|||||||
const size_t rows) {
|
const size_t rows) {
|
||||||
CUDA_MEMCPY2D ins;
|
CUDA_MEMCPY2D ins;
|
||||||
_nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
|
_nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
|
||||||
ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
|
ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
|
||||||
ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
|
ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
|
||||||
ins.dstArray=dst.cbegin();
|
ins.dstArray=dst.cbegin();
|
||||||
ins.srcHost=src.begin();
|
ins.srcHost=src.begin();
|
||||||
CU_SAFE_CALL(cuMemcpy2D(&ins));
|
CU_SAFE_CALL(cuMemcpy2D(&ins));
|
||||||
@ -329,8 +335,8 @@ template<> struct _ucl_memcpy<2,1> {
|
|||||||
const size_t rows, CUstream &cq) {
|
const size_t rows, CUstream &cq) {
|
||||||
CUDA_MEMCPY2D ins;
|
CUDA_MEMCPY2D ins;
|
||||||
_nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
|
_nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
|
||||||
ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
|
ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
|
||||||
ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
|
ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
|
||||||
ins.dstArray=dst.cbegin();
|
ins.dstArray=dst.cbegin();
|
||||||
ins.srcHost=src.begin();
|
ins.srcHost=src.begin();
|
||||||
CU_SAFE_CALL(cuMemcpy2DAsync(&ins,cq));
|
CU_SAFE_CALL(cuMemcpy2DAsync(&ins,cq));
|
||||||
@ -354,8 +360,8 @@ template<> struct _ucl_memcpy<0,2> {
|
|||||||
const size_t rows) {
|
const size_t rows) {
|
||||||
CUDA_MEMCPY2D ins;
|
CUDA_MEMCPY2D ins;
|
||||||
_nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
|
_nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
|
||||||
ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
|
ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
|
||||||
ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
|
ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
|
||||||
ins.dstDevice=dst.cbegin();
|
ins.dstDevice=dst.cbegin();
|
||||||
ins.srcArray=src.cbegin();
|
ins.srcArray=src.cbegin();
|
||||||
CU_SAFE_CALL(cuMemcpy2D(&ins));
|
CU_SAFE_CALL(cuMemcpy2D(&ins));
|
||||||
@ -366,8 +372,8 @@ template<> struct _ucl_memcpy<0,2> {
|
|||||||
const size_t rows, CUstream &cq) {
|
const size_t rows, CUstream &cq) {
|
||||||
CUDA_MEMCPY2D ins;
|
CUDA_MEMCPY2D ins;
|
||||||
_nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
|
_nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
|
||||||
ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
|
ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
|
||||||
ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
|
ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
|
||||||
ins.dstDevice=dst.cbegin();
|
ins.dstDevice=dst.cbegin();
|
||||||
ins.srcArray=src.cbegin();
|
ins.srcArray=src.cbegin();
|
||||||
CU_SAFE_CALL(cuMemcpy2DAsync(&ins,cq));
|
CU_SAFE_CALL(cuMemcpy2DAsync(&ins,cq));
|
||||||
@ -391,8 +397,8 @@ template<> struct _ucl_memcpy<1,2> {
|
|||||||
const size_t rows) {
|
const size_t rows) {
|
||||||
CUDA_MEMCPY2D ins;
|
CUDA_MEMCPY2D ins;
|
||||||
_nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
|
_nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
|
||||||
ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
|
ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
|
||||||
ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
|
ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
|
||||||
ins.dstHost=dst.begin();
|
ins.dstHost=dst.begin();
|
||||||
ins.srcArray=src.cbegin();
|
ins.srcArray=src.cbegin();
|
||||||
CU_SAFE_CALL(cuMemcpy2D(&ins));
|
CU_SAFE_CALL(cuMemcpy2D(&ins));
|
||||||
@ -403,8 +409,8 @@ template<> struct _ucl_memcpy<1,2> {
|
|||||||
const size_t rows, CUstream &cq) {
|
const size_t rows, CUstream &cq) {
|
||||||
CUDA_MEMCPY2D ins;
|
CUDA_MEMCPY2D ins;
|
||||||
_nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
|
_nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
|
||||||
ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
|
ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
|
||||||
ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
|
ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
|
||||||
ins.dstHost=dst.begin();
|
ins.dstHost=dst.begin();
|
||||||
ins.srcArray=src.cbegin();
|
ins.srcArray=src.cbegin();
|
||||||
CU_SAFE_CALL(cuMemcpy2DAsync(&ins,cq));
|
CU_SAFE_CALL(cuMemcpy2DAsync(&ins,cq));
|
||||||
@ -428,8 +434,8 @@ template <> struct _ucl_memcpy<1,0> {
|
|||||||
const size_t rows) {
|
const size_t rows) {
|
||||||
CUDA_MEMCPY2D ins;
|
CUDA_MEMCPY2D ins;
|
||||||
_nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
|
_nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
|
||||||
ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
|
ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
|
||||||
ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
|
ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
|
||||||
ins.dstHost=dst.begin();
|
ins.dstHost=dst.begin();
|
||||||
ins.srcDevice=src.cbegin();
|
ins.srcDevice=src.cbegin();
|
||||||
CU_SAFE_CALL(cuMemcpy2D(&ins));
|
CU_SAFE_CALL(cuMemcpy2D(&ins));
|
||||||
@ -440,8 +446,8 @@ template <> struct _ucl_memcpy<1,0> {
|
|||||||
const size_t rows, CUstream &cq) {
|
const size_t rows, CUstream &cq) {
|
||||||
CUDA_MEMCPY2D ins;
|
CUDA_MEMCPY2D ins;
|
||||||
_nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
|
_nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
|
||||||
ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
|
ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
|
||||||
ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
|
ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
|
||||||
ins.dstHost=dst.begin();
|
ins.dstHost=dst.begin();
|
||||||
ins.srcDevice=src.cbegin();
|
ins.srcDevice=src.cbegin();
|
||||||
CU_SAFE_CALL(cuMemcpy2DAsync(&ins,cq));
|
CU_SAFE_CALL(cuMemcpy2DAsync(&ins,cq));
|
||||||
@ -465,8 +471,8 @@ template <> struct _ucl_memcpy<0,1> {
|
|||||||
const size_t rows) {
|
const size_t rows) {
|
||||||
CUDA_MEMCPY2D ins;
|
CUDA_MEMCPY2D ins;
|
||||||
_nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
|
_nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
|
||||||
ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
|
ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
|
||||||
ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
|
ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
|
||||||
ins.dstDevice=dst.cbegin();
|
ins.dstDevice=dst.cbegin();
|
||||||
ins.srcHost=src.begin();
|
ins.srcHost=src.begin();
|
||||||
CU_SAFE_CALL(cuMemcpy2D(&ins));
|
CU_SAFE_CALL(cuMemcpy2D(&ins));
|
||||||
@ -477,8 +483,8 @@ template <> struct _ucl_memcpy<0,1> {
|
|||||||
const size_t rows, CUstream &cq) {
|
const size_t rows, CUstream &cq) {
|
||||||
CUDA_MEMCPY2D ins;
|
CUDA_MEMCPY2D ins;
|
||||||
_nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
|
_nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
|
||||||
ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
|
ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
|
||||||
ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
|
ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
|
||||||
ins.dstDevice=dst.cbegin();
|
ins.dstDevice=dst.cbegin();
|
||||||
ins.srcHost=src.begin();
|
ins.srcHost=src.begin();
|
||||||
CU_SAFE_CALL(cuMemcpy2DAsync(&ins,cq));
|
CU_SAFE_CALL(cuMemcpy2DAsync(&ins,cq));
|
||||||
@ -500,8 +506,8 @@ template <> struct _ucl_memcpy<1,1> {
|
|||||||
const size_t rows) {
|
const size_t rows) {
|
||||||
CUDA_MEMCPY2D ins;
|
CUDA_MEMCPY2D ins;
|
||||||
_nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
|
_nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
|
||||||
ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
|
ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
|
||||||
ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
|
ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
|
||||||
ins.dstHost=dst.begin();
|
ins.dstHost=dst.begin();
|
||||||
ins.srcHost=src.begin();
|
ins.srcHost=src.begin();
|
||||||
CU_SAFE_CALL(cuMemcpy2D(&ins));
|
CU_SAFE_CALL(cuMemcpy2D(&ins));
|
||||||
@ -512,8 +518,8 @@ template <> struct _ucl_memcpy<1,1> {
|
|||||||
const size_t rows, CUstream &cq) {
|
const size_t rows, CUstream &cq) {
|
||||||
CUDA_MEMCPY2D ins;
|
CUDA_MEMCPY2D ins;
|
||||||
_nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
|
_nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
|
||||||
ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
|
ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
|
||||||
ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
|
ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
|
||||||
ins.dstHost=dst.begin();
|
ins.dstHost=dst.begin();
|
||||||
ins.srcHost=src.begin();
|
ins.srcHost=src.begin();
|
||||||
CU_SAFE_CALL(cuMemcpy2DAsync(&ins,cq));
|
CU_SAFE_CALL(cuMemcpy2DAsync(&ins,cq));
|
||||||
@ -529,7 +535,7 @@ template <int mem1, int mem2> struct _ucl_memcpy {
|
|||||||
template <class p1, class p2>
|
template <class p1, class p2>
|
||||||
static inline void mc(p1 &dst, const p2 &src, const size_t n,
|
static inline void mc(p1 &dst, const p2 &src, const size_t n,
|
||||||
CUstream &cq) {
|
CUstream &cq) {
|
||||||
CU_SAFE_CALL(cuMemcpyDtoD(dst.cbegin(),src.cbegin(),n));
|
CU_SAFE_CALL(cuMemcpyDtoDAsync(dst.cbegin(),src.cbegin(),n,cq));
|
||||||
}
|
}
|
||||||
template <class p1, class p2>
|
template <class p1, class p2>
|
||||||
static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
|
static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
|
||||||
@ -546,8 +552,8 @@ template <int mem1, int mem2> struct _ucl_memcpy {
|
|||||||
} else {
|
} else {
|
||||||
CUDA_MEMCPY2D ins;
|
CUDA_MEMCPY2D ins;
|
||||||
_nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
|
_nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
|
||||||
ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
|
ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
|
||||||
ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
|
ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
|
||||||
ins.dstDevice=dst.cbegin();
|
ins.dstDevice=dst.cbegin();
|
||||||
ins.srcDevice=src.cbegin();
|
ins.srcDevice=src.cbegin();
|
||||||
CU_SAFE_CALL(cuMemcpy2D(&ins));
|
CU_SAFE_CALL(cuMemcpy2D(&ins));
|
||||||
@ -560,16 +566,16 @@ template <int mem1, int mem2> struct _ucl_memcpy {
|
|||||||
if (p1::PADDED==0 || p2::PADDED==0) {
|
if (p1::PADDED==0 || p2::PADDED==0) {
|
||||||
size_t src_offset=0, dst_offset=0;
|
size_t src_offset=0, dst_offset=0;
|
||||||
for (size_t i=0; i<rows; i++) {
|
for (size_t i=0; i<rows; i++) {
|
||||||
CU_SAFE_CALL(cuMemcpyDtoD(dst.cbegin()+dst_offset,
|
CU_SAFE_CALL(cuMemcpyDtoDAsync(dst.cbegin()+dst_offset,
|
||||||
src.cbegin()+src_offset,cols));
|
src.cbegin()+src_offset,cols,cq));
|
||||||
src_offset+=spitch;
|
src_offset+=spitch;
|
||||||
dst_offset+=dpitch;
|
dst_offset+=dpitch;
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
CUDA_MEMCPY2D ins;
|
CUDA_MEMCPY2D ins;
|
||||||
_nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
|
_nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
|
||||||
ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
|
ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
|
||||||
ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
|
ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
|
||||||
ins.dstDevice=dst.cbegin();
|
ins.dstDevice=dst.cbegin();
|
||||||
ins.srcDevice=src.cbegin();
|
ins.srcDevice=src.cbegin();
|
||||||
CU_SAFE_CALL(cuMemcpy2DAsync(&ins,cq));
|
CU_SAFE_CALL(cuMemcpy2DAsync(&ins,cq));
|
||||||
|
|||||||
@ -124,16 +124,25 @@ int DeviceT::init_device(MPI_Comm world, MPI_Comm replica,
|
|||||||
if (my_gpu>=gpu->num_devices())
|
if (my_gpu>=gpu->num_devices())
|
||||||
return -2;
|
return -2;
|
||||||
|
|
||||||
|
#ifndef CUDA_PRX
|
||||||
if (_procs_per_gpu>1 && gpu->sharing_supported(my_gpu)==false)
|
if (_procs_per_gpu>1 && gpu->sharing_supported(my_gpu)==false)
|
||||||
return -7;
|
return -7;
|
||||||
|
#endif
|
||||||
|
|
||||||
if (gpu->set(my_gpu)!=UCL_SUCCESS)
|
if (gpu->set(my_gpu)!=UCL_SUCCESS)
|
||||||
return -6;
|
return -6;
|
||||||
|
|
||||||
|
gpu->push_command_queue();
|
||||||
|
gpu->set_command_queue(1);
|
||||||
|
|
||||||
_long_range_precompute=0;
|
_long_range_precompute=0;
|
||||||
|
|
||||||
int flag=compile_kernels();
|
int flag=0;
|
||||||
|
for (int i=0; i<_procs_per_gpu; i++) {
|
||||||
|
if (_gpu_rank==i)
|
||||||
|
flag=compile_kernels();
|
||||||
|
gpu_barrier();
|
||||||
|
}
|
||||||
return flag;
|
return flag;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -348,8 +348,9 @@ void EAMT::compute(const int f_ago, const int inum_full, const int nlocal,
|
|||||||
// copy fp from device to host for comm
|
// copy fp from device to host for comm
|
||||||
_nlocal=nlocal;
|
_nlocal=nlocal;
|
||||||
time_fp1.start();
|
time_fp1.start();
|
||||||
ucl_copy(host_fp,dev_fp,nlocal,false);
|
ucl_copy(host_fp,dev_fp,nlocal,true);
|
||||||
time_fp1.stop();
|
time_fp1.stop();
|
||||||
|
time_fp1.sync_stop();
|
||||||
}
|
}
|
||||||
|
|
||||||
// ---------------------------------------------------------------------------
|
// ---------------------------------------------------------------------------
|
||||||
@ -427,8 +428,9 @@ int** EAMT::compute(const int ago, const int inum_full, const int nall,
|
|||||||
// copy fp from device to host for comm
|
// copy fp from device to host for comm
|
||||||
_nlocal=inum_full;
|
_nlocal=inum_full;
|
||||||
time_fp1.start();
|
time_fp1.start();
|
||||||
ucl_copy(host_fp,dev_fp,inum_full,false);
|
ucl_copy(host_fp,dev_fp,inum_full,true);
|
||||||
time_fp1.stop();
|
time_fp1.stop();
|
||||||
|
time_fp1.sync_stop();
|
||||||
|
|
||||||
return this->nbor->host_jlist.begin()-host_start;
|
return this->nbor->host_jlist.begin()-host_start;
|
||||||
}
|
}
|
||||||
|
|||||||
@ -69,10 +69,12 @@ bool Neighbor::init(NeighborShared *shared, const int inum,
|
|||||||
time_kernel.init(*dev);
|
time_kernel.init(*dev);
|
||||||
time_hybrid1.init(*dev);
|
time_hybrid1.init(*dev);
|
||||||
time_hybrid2.init(*dev);
|
time_hybrid2.init(*dev);
|
||||||
|
time_transpose.init(*dev);
|
||||||
time_nbor.zero();
|
time_nbor.zero();
|
||||||
time_kernel.zero();
|
time_kernel.zero();
|
||||||
time_hybrid1.zero();
|
time_hybrid1.zero();
|
||||||
time_hybrid2.zero();
|
time_hybrid2.zero();
|
||||||
|
time_transpose.zero();
|
||||||
|
|
||||||
_max_atoms=static_cast<int>(static_cast<double>(inum)*1.10);
|
_max_atoms=static_cast<int>(static_cast<double>(inum)*1.10);
|
||||||
if (_max_atoms==0)
|
if (_max_atoms==0)
|
||||||
@ -203,6 +205,7 @@ void Neighbor::clear() {
|
|||||||
time_nbor.clear();
|
time_nbor.clear();
|
||||||
time_hybrid1.clear();
|
time_hybrid1.clear();
|
||||||
time_hybrid2.clear();
|
time_hybrid2.clear();
|
||||||
|
time_transpose.clear();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -285,6 +288,41 @@ void Neighbor::get_host(const int inum, int *ilist, int *numj,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
template <class numtyp, class acctyp>
|
||||||
|
void Neighbor::resize_max_neighbors(const int maxn, bool &success) {
|
||||||
|
if (maxn>_max_nbors) {
|
||||||
|
int mn=static_cast<int>(static_cast<double>(maxn)*1.10);
|
||||||
|
dev_nbor.clear();
|
||||||
|
success=success &&
|
||||||
|
(dev_nbor.alloc((mn+1)*_max_atoms,*dev)==UCL_SUCCESS);
|
||||||
|
_gpu_bytes=dev_nbor.row_bytes();
|
||||||
|
if (_max_host>0) {
|
||||||
|
host_nbor.clear();
|
||||||
|
dev_host_nbor.clear();
|
||||||
|
success=success && (host_nbor.alloc(mn*_max_host,*dev,
|
||||||
|
UCL_RW_OPTIMIZED)==UCL_SUCCESS);
|
||||||
|
success=success && (dev_host_nbor.alloc(mn*_max_host,
|
||||||
|
*dev,UCL_WRITE_ONLY)==UCL_SUCCESS);
|
||||||
|
int *ptr=host_nbor.begin();
|
||||||
|
for (int i=0; i<_max_host; i++) {
|
||||||
|
host_jlist[i]=ptr;
|
||||||
|
ptr+=mn;
|
||||||
|
}
|
||||||
|
_gpu_bytes+=dev_host_nbor.row_bytes();
|
||||||
|
} else {
|
||||||
|
dev_host_nbor.view(dev_nbor);
|
||||||
|
dev_host_numj.view(dev_nbor);
|
||||||
|
}
|
||||||
|
if (_alloc_packed) {
|
||||||
|
dev_packed.clear();
|
||||||
|
success=success && (dev_packed.alloc((mn+2)*_max_atoms,*dev,
|
||||||
|
UCL_READ_ONLY)==UCL_SUCCESS);
|
||||||
|
_gpu_bytes+=dev_packed.row_bytes();
|
||||||
|
}
|
||||||
|
_max_nbors=mn;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
template <class numtyp, class acctyp>
|
template <class numtyp, class acctyp>
|
||||||
void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum,
|
void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum,
|
||||||
const int nall, Atom<numtyp,acctyp> &atom,
|
const int nall, Atom<numtyp,acctyp> &atom,
|
||||||
@ -320,6 +358,29 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum,
|
|||||||
|
|
||||||
const numtyp cell_size_cast=static_cast<numtyp>(_cell_size);
|
const numtyp cell_size_cast=static_cast<numtyp>(_cell_size);
|
||||||
|
|
||||||
|
if (_maxspecial>0) {
|
||||||
|
time_nbor.start();
|
||||||
|
UCL_H_Vec<int> view_nspecial, view_special, view_tag;
|
||||||
|
view_nspecial.view(nspecial[0],nt*3,*dev);
|
||||||
|
view_special.view(special[0],nt*_maxspecial,*dev);
|
||||||
|
view_tag.view(tag,nall,*dev);
|
||||||
|
ucl_copy(dev_nspecial,view_nspecial,nt*3,false);
|
||||||
|
ucl_copy(dev_special_t,view_special,nt*_maxspecial,false);
|
||||||
|
ucl_copy(atom.dev_tag,view_tag,nall,false);
|
||||||
|
time_nbor.stop();
|
||||||
|
if (_time_device)
|
||||||
|
time_nbor.add_to_total();
|
||||||
|
time_transpose.start();
|
||||||
|
const int b2x=_block_cell_2d;
|
||||||
|
const int b2y=_block_cell_2d;
|
||||||
|
const int g2x=static_cast<int>(ceil(static_cast<double>(_maxspecial)/b2x));
|
||||||
|
const int g2y=static_cast<int>(ceil(static_cast<double>(nt)/b2y));
|
||||||
|
_shared->k_transpose.set_size(g2x,g2y,b2x,b2y);
|
||||||
|
_shared->k_transpose.run(&dev_special.begin(),&dev_special_t.begin(),
|
||||||
|
&_maxspecial,&nt);
|
||||||
|
time_transpose.stop();
|
||||||
|
}
|
||||||
|
|
||||||
// If binning on CPU, do this now
|
// If binning on CPU, do this now
|
||||||
if (_gpu_nbor==2) {
|
if (_gpu_nbor==2) {
|
||||||
double stime = MPI_Wtime();
|
double stime = MPI_Wtime();
|
||||||
@ -352,6 +413,16 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum,
|
|||||||
cell_id[i]=id;
|
cell_id[i]=id;
|
||||||
host_cell_counts[id+1]++;
|
host_cell_counts[id+1]++;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
mn=0;
|
||||||
|
for (int i=0; i<_ncells; i++)
|
||||||
|
mn=std::max(mn,host_cell_counts[i]);
|
||||||
|
mn*=8;
|
||||||
|
resize_max_neighbors<numtyp,acctyp>(mn,success);
|
||||||
|
if (!success)
|
||||||
|
return;
|
||||||
|
_total_atoms=nt;
|
||||||
|
|
||||||
cell_iter[0]=0;
|
cell_iter[0]=0;
|
||||||
for (int i=1; i<_ncells; i++) {
|
for (int i=1; i<_ncells; i++) {
|
||||||
host_cell_counts[i]+=host_cell_counts[i-1];
|
host_cell_counts[i]+=host_cell_counts[i-1];
|
||||||
@ -372,28 +443,7 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum,
|
|||||||
_bin_time+=MPI_Wtime()-stime;
|
_bin_time+=MPI_Wtime()-stime;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (_maxspecial>0) {
|
time_kernel.start();
|
||||||
time_nbor.start();
|
|
||||||
UCL_H_Vec<int> view_nspecial, view_special, view_tag;
|
|
||||||
view_nspecial.view(nspecial[0],nt*3,*dev);
|
|
||||||
view_special.view(special[0],nt*_maxspecial,*dev);
|
|
||||||
view_tag.view(tag,nall,*dev);
|
|
||||||
ucl_copy(dev_nspecial,view_nspecial,nt*3,false);
|
|
||||||
ucl_copy(dev_special_t,view_special,nt*_maxspecial,false);
|
|
||||||
ucl_copy(atom.dev_tag,view_tag,nall,false);
|
|
||||||
time_nbor.stop();
|
|
||||||
if (_time_device)
|
|
||||||
time_nbor.add_to_total();
|
|
||||||
time_kernel.start();
|
|
||||||
const int b2x=_block_cell_2d;
|
|
||||||
const int b2y=_block_cell_2d;
|
|
||||||
const int g2x=static_cast<int>(ceil(static_cast<double>(_maxspecial)/b2x));
|
|
||||||
const int g2y=static_cast<int>(ceil(static_cast<double>(nt)/b2y));
|
|
||||||
_shared->k_transpose.set_size(g2x,g2y,b2x,b2y);
|
|
||||||
_shared->k_transpose.run(&dev_special.begin(),&dev_special_t.begin(),
|
|
||||||
&_maxspecial,&nt);
|
|
||||||
} else
|
|
||||||
time_kernel.start();
|
|
||||||
|
|
||||||
_nbor_pitch=inum;
|
_nbor_pitch=inum;
|
||||||
_shared->neigh_tex.bind_float(atom.dev_x,4);
|
_shared->neigh_tex.bind_float(atom.dev_x,4);
|
||||||
@ -435,54 +485,30 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum,
|
|||||||
/* Get the maximum number of nbors and realloc if necessary */
|
/* Get the maximum number of nbors and realloc if necessary */
|
||||||
UCL_D_Vec<int> numj;
|
UCL_D_Vec<int> numj;
|
||||||
numj.view_offset(inum,dev_nbor,inum);
|
numj.view_offset(inum,dev_nbor,inum);
|
||||||
ucl_copy(host_acc,numj,inum,false);
|
ucl_copy(host_acc,numj,inum,true);
|
||||||
if (nt>inum) {
|
if (nt>inum) {
|
||||||
UCL_H_Vec<int> host_offset;
|
UCL_H_Vec<int> host_offset;
|
||||||
host_offset.view_offset(inum,host_acc,nt-inum);
|
host_offset.view_offset(inum,host_acc,nt-inum);
|
||||||
ucl_copy(host_offset,dev_host_numj,nt-inum,false);
|
ucl_copy(host_offset,dev_host_numj,nt-inum,true);
|
||||||
}
|
}
|
||||||
mn=host_acc[0];
|
|
||||||
for (int i=1; i<nt; i++)
|
if (_gpu_nbor!=2) {
|
||||||
mn=std::max(mn,host_acc[i]);
|
host_acc.sync();
|
||||||
|
mn=host_acc[0];
|
||||||
|
for (int i=1; i<nt; i++)
|
||||||
|
mn=std::max(mn,host_acc[i]);
|
||||||
|
|
||||||
if (mn>_max_nbors) {
|
if (mn>_max_nbors) {
|
||||||
mn=static_cast<int>(static_cast<double>(mn)*1.10);
|
resize_max_neighbors<numtyp,acctyp>(mn,success);
|
||||||
dev_nbor.clear();
|
if (!success)
|
||||||
success=success &&
|
return;
|
||||||
(dev_nbor.alloc((mn+1)*_max_atoms,atom.dev_x)==UCL_SUCCESS);
|
time_kernel.stop();
|
||||||
_gpu_bytes=dev_nbor.row_bytes();
|
if (_time_device)
|
||||||
if (_max_host>0) {
|
time_kernel.add_to_total();
|
||||||
host_nbor.clear();
|
build_nbor_list(x, inum, host_inum, nall, atom, sublo, subhi, tag,
|
||||||
dev_host_nbor.clear();
|
nspecial, special, success, mn);
|
||||||
success=success && (host_nbor.alloc(mn*_max_host,dev_nbor,
|
|
||||||
UCL_RW_OPTIMIZED)==UCL_SUCCESS);
|
|
||||||
success=success && (dev_host_nbor.alloc(mn*_max_host,
|
|
||||||
dev_nbor,UCL_WRITE_ONLY)==UCL_SUCCESS);
|
|
||||||
int *ptr=host_nbor.begin();
|
|
||||||
for (int i=0; i<_max_host; i++) {
|
|
||||||
host_jlist[i]=ptr;
|
|
||||||
ptr+=mn;
|
|
||||||
}
|
|
||||||
_gpu_bytes+=dev_host_nbor.row_bytes();
|
|
||||||
} else {
|
|
||||||
dev_host_nbor.view(dev_nbor);
|
|
||||||
dev_host_numj.view(dev_nbor);
|
|
||||||
}
|
|
||||||
if (_alloc_packed) {
|
|
||||||
dev_packed.clear();
|
|
||||||
success=success && (dev_packed.alloc((mn+2)*_max_atoms,*dev,
|
|
||||||
UCL_READ_ONLY)==UCL_SUCCESS);
|
|
||||||
_gpu_bytes+=dev_packed.row_bytes();
|
|
||||||
}
|
|
||||||
if (!success)
|
|
||||||
return;
|
return;
|
||||||
_max_nbors=mn;
|
}
|
||||||
time_kernel.stop();
|
|
||||||
if (_time_device)
|
|
||||||
time_kernel.add_to_total();
|
|
||||||
build_nbor_list(x, inum, host_inum, nall, atom, sublo, subhi, tag, nspecial,
|
|
||||||
special, success, mn);
|
|
||||||
return;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if (_maxspecial>0) {
|
if (_maxspecial>0) {
|
||||||
@ -497,8 +523,10 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum,
|
|||||||
time_kernel.stop();
|
time_kernel.stop();
|
||||||
|
|
||||||
time_nbor.start();
|
time_nbor.start();
|
||||||
if (inum<nt)
|
if (inum<nt) {
|
||||||
ucl_copy(host_nbor,dev_host_nbor,false);
|
ucl_copy(host_nbor,dev_host_nbor,true);
|
||||||
|
host_nbor.sync();
|
||||||
|
}
|
||||||
time_nbor.stop();
|
time_nbor.stop();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -106,13 +106,24 @@ class Neighbor {
|
|||||||
|
|
||||||
inline void acc_timers() {
|
inline void acc_timers() {
|
||||||
if (_nbor_time_avail) {
|
if (_nbor_time_avail) {
|
||||||
time_nbor.add_to_total();
|
|
||||||
time_kernel.add_to_total();
|
|
||||||
if (_gpu_nbor==2) {
|
if (_gpu_nbor==2) {
|
||||||
time_hybrid1.add_to_total();
|
int mn=0;
|
||||||
time_hybrid2.add_to_total();
|
for (int i=0; i<_total_atoms; i++)
|
||||||
|
mn=std::max(mn,host_acc[i]);
|
||||||
|
if (mn>_max_nbors)
|
||||||
|
assert(0==1);
|
||||||
|
}
|
||||||
|
if (_time_device) {
|
||||||
|
time_nbor.add_to_total();
|
||||||
|
time_kernel.add_to_total();
|
||||||
|
if (_gpu_nbor==2) {
|
||||||
|
time_hybrid1.add_to_total();
|
||||||
|
time_hybrid2.add_to_total();
|
||||||
|
}
|
||||||
|
if (_maxspecial>0)
|
||||||
|
time_transpose.add_to_total();
|
||||||
|
_nbor_time_avail=false;
|
||||||
}
|
}
|
||||||
_nbor_time_avail=false;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -213,7 +224,7 @@ class Neighbor {
|
|||||||
UCL_D_Vec<int> dev_cell_counts;
|
UCL_D_Vec<int> dev_cell_counts;
|
||||||
|
|
||||||
/// Device timers
|
/// Device timers
|
||||||
UCL_Timer time_nbor, time_kernel, time_hybrid1, time_hybrid2;
|
UCL_Timer time_nbor, time_kernel, time_hybrid1, time_hybrid2, time_transpose;
|
||||||
|
|
||||||
private:
|
private:
|
||||||
NeighborShared *_shared;
|
NeighborShared *_shared;
|
||||||
@ -228,6 +239,10 @@ class Neighbor {
|
|||||||
|
|
||||||
int _block_cell_2d, _block_cell_id, _block_nbor_build, _ncells;
|
int _block_cell_2d, _block_cell_id, _block_nbor_build, _ncells;
|
||||||
int _threads_per_atom;
|
int _threads_per_atom;
|
||||||
|
int _total_atoms;
|
||||||
|
|
||||||
|
template <class numtyp, class acctyp>
|
||||||
|
inline void resize_max_neighbors(const int maxn, bool &success);
|
||||||
};
|
};
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -85,7 +85,7 @@ __kernel void kernel_calc_cell_counts(unsigned *cell_id,
|
|||||||
__kernel void transpose(__global int *out, __global int *in, int columns_in,
|
__kernel void transpose(__global int *out, __global int *in, int columns_in,
|
||||||
int rows_in)
|
int rows_in)
|
||||||
{
|
{
|
||||||
__local float block[BLOCK_CELL_2D][BLOCK_CELL_2D+1];
|
__local int block[BLOCK_CELL_2D][BLOCK_CELL_2D+1];
|
||||||
|
|
||||||
unsigned ti=THREAD_ID_X;
|
unsigned ti=THREAD_ID_X;
|
||||||
unsigned tj=THREAD_ID_Y;
|
unsigned tj=THREAD_ID_Y;
|
||||||
|
|||||||
Reference in New Issue
Block a user