Update Kokkos to v2.04.11
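Reviewer note: the dominant change below is how DeviceIterateTile computes a thread's global index. Each per-dimension offset now casts the CUDA thread index to index_type and adds the range's lower bound, so MDRange tiles handle sub-ranges [lower, upper) instead of assuming zero-based ranges. A minimal sketch of that loop shape (not the library's code; the parameter names stand in for the m_rp members used in the hunks):

    // Sketch only: the per-dimension tile loop repeated throughout this diff.
    // tile_id advances grid-stride over tiles; offset adds the lower bound.
    template <typename index_type, typename Functor>
    __device__ void iterate_dim_y(index_type lower, index_type upper,
                                  index_type tile, index_type tile_end,
                                  Functor const& f) {
      for (index_type tile_id = blockIdx.y; tile_id < tile_end; tile_id += gridDim.y) {
        const index_type offset = tile_id * tile + (index_type)threadIdx.y + lower;
        // Guard both the range bound and the (possibly truncated) last tile.
        if (offset < upper && threadIdx.y < tile) {
          f(offset);  // the real code nests one such loop per rank
        }
      }
    }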
@@ -63,7 +63,7 @@
#include <typeinfo>
#endif

- namespace Kokkos { namespace Experimental { namespace Impl {
+ namespace Kokkos { namespace Impl {

// ------------------------------------------------------------------ //

@@ -110,21 +110,12 @@ struct apply_impl<2,RP,Functor,void >
{
// LL
if (RP::inner_direction == RP::Left) {
- /*
- index_type offset_1 = blockIdx.y*m_rp.m_tile[1] + threadIdx.y;
- index_type offset_0 = blockIdx.x*m_rp.m_tile[0] + threadIdx.x;
-
- for ( index_type j = offset_1; j < m_rp.m_upper[1], threadIdx.y < m_rp.m_tile[1]; j += (gridDim.y*m_rp.m_tile[1]) ) {
- for ( index_type i = offset_0; i < m_rp.m_upper[0], threadIdx.x < m_rp.m_tile[0]; i += (gridDim.x*m_rp.m_tile[0]) ) {
- m_func(i, j);
- } }
- */
for ( index_type tile_id1 = blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) {
- const index_type offset_1 = tile_id1*m_rp.m_tile[1] + threadIdx.y;
+ const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[1];
if ( offset_1 < m_rp.m_upper[1] && threadIdx.y < m_rp.m_tile[1] ) {

for ( index_type tile_id0 = blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) {
- const index_type offset_0 = tile_id0*m_rp.m_tile[0] + threadIdx.x;
+ const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x + (index_type)m_rp.m_lower[0];
if ( offset_0 < m_rp.m_upper[0] && threadIdx.x < m_rp.m_tile[0] ) {
m_func(offset_0 , offset_1);
}

@@ -134,21 +125,12 @@ struct apply_impl<2,RP,Functor,void >
}
// LR
else {
- /*
- index_type offset_1 = blockIdx.y*m_rp.m_tile[1] + threadIdx.y;
- index_type offset_0 = blockIdx.x*m_rp.m_tile[0] + threadIdx.x;
-
- for ( index_type i = offset_0; i < m_rp.m_upper[0], threadIdx.x < m_rp.m_tile[0]; i += (gridDim.x*m_rp.m_tile[0]) ) {
- for ( index_type j = offset_1; j < m_rp.m_upper[1], threadIdx.y < m_rp.m_tile[1]; j += (gridDim.y*m_rp.m_tile[1]) ) {
- m_func(i, j);
- } }
- */
for ( index_type tile_id0 = blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) {
- const index_type offset_0 = tile_id0*m_rp.m_tile[0] + threadIdx.x;
+ const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x + (index_type)m_rp.m_lower[0];
if ( offset_0 < m_rp.m_upper[0] && threadIdx.x < m_rp.m_tile[0] ) {

for ( index_type tile_id1 = blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) {
- const index_type offset_1 = tile_id1*m_rp.m_tile[1] + threadIdx.y;
+ const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[1];
if ( offset_1 < m_rp.m_upper[1] && threadIdx.y < m_rp.m_tile[1] ) {
m_func(offset_0 , offset_1);
}

@@ -182,21 +164,12 @@ struct apply_impl<2,RP,Functor,Tag>
{
if (RP::inner_direction == RP::Left) {
// Loop over size maxnumblocks until full range covered
- /*
- index_type offset_1 = blockIdx.y*m_rp.m_tile[1] + threadIdx.y;
- index_type offset_0 = blockIdx.x*m_rp.m_tile[0] + threadIdx.x;
-
- for ( index_type j = offset_1; j < m_rp.m_upper[1], threadIdx.y < m_rp.m_tile[1]; j += (gridDim.y*m_rp.m_tile[1]) ) {
- for ( index_type i = offset_0; i < m_rp.m_upper[0], threadIdx.x < m_rp.m_tile[0]; i += (gridDim.x*m_rp.m_tile[0]) ) {
- m_func(Tag(), i, j);
- } }
- */
for ( index_type tile_id1 = blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) {
- const index_type offset_1 = tile_id1*m_rp.m_tile[1] + threadIdx.y;
+ const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[1];
if ( offset_1 < m_rp.m_upper[1] && threadIdx.y < m_rp.m_tile[1] ) {

for ( index_type tile_id0 = blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) {
- const index_type offset_0 = tile_id0*m_rp.m_tile[0] + threadIdx.x;
+ const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x + (index_type)m_rp.m_lower[0];
if ( offset_0 < m_rp.m_upper[0] && threadIdx.x < m_rp.m_tile[0] ) {
m_func(Tag(), offset_0 , offset_1);
}

@@ -205,21 +178,12 @@ struct apply_impl<2,RP,Functor,Tag>
}
}
else {
- /*
- index_type offset_1 = blockIdx.y*m_rp.m_tile[1] + threadIdx.y;
- index_type offset_0 = blockIdx.x*m_rp.m_tile[0] + threadIdx.x;
-
- for ( index_type i = offset_0; i < m_rp.m_upper[0], threadIdx.x < m_rp.m_tile[0]; i += (gridDim.x*m_rp.m_tile[0]) ) {
- for ( index_type j = offset_1; j < m_rp.m_upper[1], threadIdx.y < m_rp.m_tile[1]; j += (gridDim.y*m_rp.m_tile[1]) ) {
- m_func(Tag(), i, j);
- } }
- */
for ( index_type tile_id0 = blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) {
- const index_type offset_0 = tile_id0*m_rp.m_tile[0] + threadIdx.x;
+ const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x + (index_type)m_rp.m_lower[0];
if ( offset_0 < m_rp.m_upper[0] && threadIdx.x < m_rp.m_tile[0] ) {

for ( index_type tile_id1 = blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) {
- const index_type offset_1 = tile_id1*m_rp.m_tile[1] + threadIdx.y;
+ const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[1];
if ( offset_1 < m_rp.m_upper[1] && threadIdx.y < m_rp.m_tile[1] ) {
m_func(Tag(), offset_0 , offset_1);
}

@@ -255,15 +219,15 @@ struct apply_impl<3,RP,Functor,void >
// LL
if (RP::inner_direction == RP::Left) {
for ( index_type tile_id2 = blockIdx.z; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.z ) {
- const index_type offset_2 = tile_id2*m_rp.m_tile[2] + threadIdx.z;
+ const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[2];
if ( offset_2 < m_rp.m_upper[2] && threadIdx.z < m_rp.m_tile[2] ) {

for ( index_type tile_id1 = blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) {
- const index_type offset_1 = tile_id1*m_rp.m_tile[1] + threadIdx.y;
+ const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[1];
if ( offset_1 < m_rp.m_upper[1] && threadIdx.y < m_rp.m_tile[1] ) {

for ( index_type tile_id0 = blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) {
- const index_type offset_0 = tile_id0*m_rp.m_tile[0] + threadIdx.x;
+ const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x + (index_type)m_rp.m_lower[0];
if ( offset_0 < m_rp.m_upper[0] && threadIdx.x < m_rp.m_tile[0] ) {
m_func(offset_0 , offset_1 , offset_2);
}

@@ -276,15 +240,15 @@ struct apply_impl<3,RP,Functor,void >
// LR
else {
for ( index_type tile_id0 = blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) {
- const index_type offset_0 = tile_id0*m_rp.m_tile[0] + threadIdx.x;
+ const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x + (index_type)m_rp.m_lower[0];
if ( offset_0 < m_rp.m_upper[0] && threadIdx.x < m_rp.m_tile[0] ) {

for ( index_type tile_id1 = blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) {
- const index_type offset_1 = tile_id1*m_rp.m_tile[1] + threadIdx.y;
+ const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[1];
if ( offset_1 < m_rp.m_upper[1] && threadIdx.y < m_rp.m_tile[1] ) {

for ( index_type tile_id2 = blockIdx.z; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.z ) {
- const index_type offset_2 = tile_id2*m_rp.m_tile[2] + threadIdx.z;
+ const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[2];
if ( offset_2 < m_rp.m_upper[2] && threadIdx.z < m_rp.m_tile[2] ) {
m_func(offset_0 , offset_1 , offset_2);
}

@@ -319,15 +283,15 @@ struct apply_impl<3,RP,Functor,Tag>
{
if (RP::inner_direction == RP::Left) {
for ( index_type tile_id2 = blockIdx.z; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.z ) {
- const index_type offset_2 = tile_id2*m_rp.m_tile[2] + threadIdx.z;
+ const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[2];
if ( offset_2 < m_rp.m_upper[2] && threadIdx.z < m_rp.m_tile[2] ) {

for ( index_type tile_id1 = blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) {
- const index_type offset_1 = tile_id1*m_rp.m_tile[1] + threadIdx.y;
+ const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[1];
if ( offset_1 < m_rp.m_upper[1] && threadIdx.y < m_rp.m_tile[1] ) {

for ( index_type tile_id0 = blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) {
- const index_type offset_0 = tile_id0*m_rp.m_tile[0] + threadIdx.x;
+ const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x + (index_type)m_rp.m_lower[0];
if ( offset_0 < m_rp.m_upper[0] && threadIdx.x < m_rp.m_tile[0] ) {
m_func(Tag(), offset_0 , offset_1 , offset_2);
}

@@ -339,15 +303,15 @@ struct apply_impl<3,RP,Functor,Tag>
}
else {
for ( index_type tile_id0 = blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) {
- const index_type offset_0 = tile_id0*m_rp.m_tile[0] + threadIdx.x;
+ const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x + (index_type)m_rp.m_lower[0];
if ( offset_0 < m_rp.m_upper[0] && threadIdx.x < m_rp.m_tile[0] ) {

for ( index_type tile_id1 = blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) {
- const index_type offset_1 = tile_id1*m_rp.m_tile[1] + threadIdx.y;
+ const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[1];
if ( offset_1 < m_rp.m_upper[1] && threadIdx.y < m_rp.m_tile[1] ) {

for ( index_type tile_id2 = blockIdx.z; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.z ) {
- const index_type offset_2 = tile_id2*m_rp.m_tile[2] + threadIdx.z;
+ const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[2];
if ( offset_2 < m_rp.m_upper[2] && threadIdx.z < m_rp.m_tile[2] ) {
m_func(Tag(), offset_0 , offset_1 , offset_2);
}

@@ -398,19 +362,19 @@ struct apply_impl<4,RP,Functor,void >
const index_type thr_id1 = threadIdx.x / m_rp.m_tile[0];

for ( index_type tile_id3 = blockIdx.z; tile_id3 < m_rp.m_tile_end[3]; tile_id3 += gridDim.z ) {
- const index_type offset_3 = tile_id3*m_rp.m_tile[3] + threadIdx.z;
+ const index_type offset_3 = tile_id3*m_rp.m_tile[3] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[3];
if ( offset_3 < m_rp.m_upper[3] && threadIdx.z < m_rp.m_tile[3] ) {

for ( index_type tile_id2 = blockIdx.y; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.y ) {
- const index_type offset_2 = tile_id2*m_rp.m_tile[2] + threadIdx.y;
+ const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[2];
if ( offset_2 < m_rp.m_upper[2] && threadIdx.y < m_rp.m_tile[2] ) {

for ( index_type j = tile_id1 ; j < m_rp.m_tile_end[1]; j += numbl1 ) {
- const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
+ const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {

for ( index_type i = tile_id0 ; i < m_rp.m_tile_end[0]; i += numbl0 ) {
- const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
+ const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
m_func(offset_0 , offset_1 , offset_2 , offset_3);
}

@@ -436,19 +400,19 @@ struct apply_impl<4,RP,Functor,void >
const index_type thr_id1 = threadIdx.x % m_rp.m_tile[1];

for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) {
- const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
+ const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {

for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) {
- const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
+ const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {

for ( index_type tile_id2 = blockIdx.y; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.y ) {
- const index_type offset_2 = tile_id2*m_rp.m_tile[2] + threadIdx.y;
+ const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[2];
if ( offset_2 < m_rp.m_upper[2] && threadIdx.y < m_rp.m_tile[2] ) {

for ( index_type tile_id3 = blockIdx.z; tile_id3 < m_rp.m_tile_end[3]; tile_id3 += gridDim.z ) {
- const index_type offset_3 = tile_id3*m_rp.m_tile[3] + threadIdx.z;
+ const index_type offset_3 = tile_id3*m_rp.m_tile[3] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[3];
if ( offset_3 < m_rp.m_upper[3] && threadIdx.z < m_rp.m_tile[3] ) {
m_func(offset_0 , offset_1 , offset_2 , offset_3);
}

@@ -498,19 +462,19 @@ struct apply_impl<4,RP,Functor,Tag>
const index_type thr_id1 = threadIdx.x / m_rp.m_tile[0];

for ( index_type tile_id3 = blockIdx.z; tile_id3 < m_rp.m_tile_end[3]; tile_id3 += gridDim.z ) {
- const index_type offset_3 = tile_id3*m_rp.m_tile[3] + threadIdx.z;
+ const index_type offset_3 = tile_id3*m_rp.m_tile[3] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[3];
if ( offset_3 < m_rp.m_upper[3] && threadIdx.z < m_rp.m_tile[3] ) {

for ( index_type tile_id2 = blockIdx.y; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.y ) {
- const index_type offset_2 = tile_id2*m_rp.m_tile[2] + threadIdx.y;
+ const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[2];
if ( offset_2 < m_rp.m_upper[2] && threadIdx.y < m_rp.m_tile[2] ) {

for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) {
- const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
+ const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {

for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) {
- const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
+ const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
m_func(Tag(), offset_0 , offset_1 , offset_2 , offset_3);
}

@@ -535,19 +499,19 @@ struct apply_impl<4,RP,Functor,Tag>
const index_type thr_id1 = threadIdx.x % m_rp.m_tile[1];

for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) {
- const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
+ const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {

for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) {
- const index_type offset_1 = tile_id1*m_rp.m_tile[1] + thr_id1;
+ const index_type offset_1 = tile_id1*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {

for ( index_type tile_id2 = blockIdx.y; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.y ) {
- const index_type offset_2 = tile_id2*m_rp.m_tile[2] + threadIdx.y;
+ const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[2];
if ( offset_2 < m_rp.m_upper[2] && threadIdx.y < m_rp.m_tile[2] ) {

for ( index_type tile_id3 = blockIdx.z; tile_id3 < m_rp.m_tile_end[3]; tile_id3 += gridDim.z ) {
- const index_type offset_3 = tile_id3*m_rp.m_tile[3] + threadIdx.z;
+ const index_type offset_3 = tile_id3*m_rp.m_tile[3] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[3];
if ( offset_3 < m_rp.m_upper[3] && threadIdx.z < m_rp.m_tile[3] ) {
m_func(Tag() , offset_0 , offset_1 , offset_2 , offset_3);
}

@@ -612,23 +576,23 @@ struct apply_impl<5,RP,Functor,void >
const index_type thr_id3 = threadIdx.y / m_rp.m_tile[2];

for ( index_type tile_id4 = blockIdx.z; tile_id4 < m_rp.m_tile_end[4]; tile_id4 += gridDim.z ) {
- const index_type offset_4 = tile_id4*m_rp.m_tile[4] + threadIdx.z;
+ const index_type offset_4 = tile_id4*m_rp.m_tile[4] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[4];
if ( offset_4 < m_rp.m_upper[4] && threadIdx.z < m_rp.m_tile[4] ) {

for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) {
- const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3;
+ const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3];
if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) {

for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) {
- const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2;
+ const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2];
if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) {

for ( index_type j = tile_id1 ; j < m_rp.m_tile_end[1]; j += numbl1 ) {
- const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
+ const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {

for ( index_type i = tile_id0 ; i < m_rp.m_tile_end[0]; i += numbl0 ) {
- const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
+ const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
m_func(offset_0 , offset_1 , offset_2 , offset_3, offset_4);
}

@@ -667,23 +631,23 @@ struct apply_impl<5,RP,Functor,void >
const index_type thr_id3 = threadIdx.y % m_rp.m_tile[3];

for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) {
- const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
+ const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {

for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) {
- const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
+ const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {

for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) {
- const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2;
+ const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2];
if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) {

for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) {
- const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3;
+ const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3];
if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) {

for ( index_type tile_id4 = blockIdx.z; tile_id4 < m_rp.m_tile_end[4]; tile_id4 += gridDim.z ) {
- const index_type offset_4 = tile_id4*m_rp.m_tile[4] + threadIdx.z;
+ const index_type offset_4 = tile_id4*m_rp.m_tile[4] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[4];
if ( offset_4 < m_rp.m_upper[4] && threadIdx.z < m_rp.m_tile[4] ) {
m_func(offset_0 , offset_1 , offset_2 , offset_3 , offset_4);
}

@@ -747,23 +711,23 @@ struct apply_impl<5,RP,Functor,Tag>
const index_type thr_id3 = threadIdx.y / m_rp.m_tile[2];

for ( index_type tile_id4 = blockIdx.z; tile_id4 < m_rp.m_tile_end[4]; tile_id4 += gridDim.z ) {
- const index_type offset_4 = tile_id4*m_rp.m_tile[4] + threadIdx.z;
+ const index_type offset_4 = tile_id4*m_rp.m_tile[4] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[4];
if ( offset_4 < m_rp.m_upper[4] && threadIdx.z < m_rp.m_tile[4] ) {

for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) {
- const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3;
+ const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3];
if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) {

for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) {
- const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2;
+ const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2];
if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) {

for ( index_type j = tile_id1 ; j < m_rp.m_tile_end[1]; j += numbl1 ) {
- const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
+ const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {

for ( index_type i = tile_id0 ; i < m_rp.m_tile_end[0]; i += numbl0 ) {
- const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
+ const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
m_func(Tag() , offset_0 , offset_1 , offset_2 , offset_3, offset_4);
}

@@ -802,23 +766,23 @@ struct apply_impl<5,RP,Functor,Tag>
const index_type thr_id3 = threadIdx.y % m_rp.m_tile[3];

for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) {
- const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
+ const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {

for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) {
- const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
+ const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {

for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) {
- const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2;
+ const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2];
if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) {

for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) {
- const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3;
+ const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3];
if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) {

for ( index_type tile_id4 = blockIdx.z; tile_id4 < m_rp.m_tile_end[4]; tile_id4 += gridDim.z ) {
- const index_type offset_4 = tile_id4*m_rp.m_tile[4] + threadIdx.z;
+ const index_type offset_4 = tile_id4*m_rp.m_tile[4] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[4];
if ( offset_4 < m_rp.m_upper[4] && threadIdx.z < m_rp.m_tile[4] ) {
m_func(Tag() , offset_0 , offset_1 , offset_2 , offset_3 , offset_4);
}

@@ -895,27 +859,27 @@ struct apply_impl<6,RP,Functor,void >
const index_type thr_id5 = threadIdx.z / m_rp.m_tile[4];

for ( index_type n = tile_id5; n < m_rp.m_tile_end[5]; n += numbl5 ) {
- const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5;
+ const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5 + (index_type)m_rp.m_lower[5];
if ( offset_5 < m_rp.m_upper[5] && thr_id5 < m_rp.m_tile[5] ) {

for ( index_type m = tile_id4; m < m_rp.m_tile_end[4]; m += numbl4 ) {
- const index_type offset_4 = m*m_rp.m_tile[4] + thr_id4;
+ const index_type offset_4 = m*m_rp.m_tile[4] + thr_id4 + (index_type)m_rp.m_lower[4];
if ( offset_4 < m_rp.m_upper[4] && thr_id4 < m_rp.m_tile[4] ) {

for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) {
- const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3;
+ const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3];
if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) {

for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) {
- const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2;
+ const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2];
if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) {

for ( index_type j = tile_id1 ; j < m_rp.m_tile_end[1]; j += numbl1 ) {
- const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
+ const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {

for ( index_type i = tile_id0 ; i < m_rp.m_tile_end[0]; i += numbl0 ) {
- const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
+ const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
m_func(offset_0 , offset_1 , offset_2 , offset_3, offset_4, offset_5);
}

@@ -967,27 +931,27 @@ struct apply_impl<6,RP,Functor,void >
const index_type thr_id5 = threadIdx.z % m_rp.m_tile[5];

for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) {
- const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
+ const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {

for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) {
- const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
+ const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {

for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) {
- const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2;
+ const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2];
if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) {

for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) {
- const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3;
+ const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3];
if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) {

for ( index_type m = tile_id4; m < m_rp.m_tile_end[4]; m += numbl4 ) {
- const index_type offset_4 = m*m_rp.m_tile[4] + thr_id4;
+ const index_type offset_4 = m*m_rp.m_tile[4] + thr_id4 + (index_type)m_rp.m_lower[4];
if ( offset_4 < m_rp.m_upper[4] && thr_id4 < m_rp.m_tile[4] ) {

for ( index_type n = tile_id5; n < m_rp.m_tile_end[5]; n += numbl5 ) {
- const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5;
+ const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5 + (index_type)m_rp.m_lower[5];
if ( offset_5 < m_rp.m_upper[5] && thr_id5 < m_rp.m_tile[5] ) {
m_func(offset_0 , offset_1 , offset_2 , offset_3 , offset_4 , offset_5);
}

@@ -1064,27 +1028,27 @@ struct apply_impl<6,RP,Functor,Tag>
const index_type thr_id5 = threadIdx.z / m_rp.m_tile[4];

for ( index_type n = tile_id5; n < m_rp.m_tile_end[5]; n += numbl5 ) {
- const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5;
+ const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5 + (index_type)m_rp.m_lower[5];
if ( offset_5 < m_rp.m_upper[5] && thr_id5 < m_rp.m_tile[5] ) {

for ( index_type m = tile_id4; m < m_rp.m_tile_end[4]; m += numbl4 ) {
- const index_type offset_4 = m*m_rp.m_tile[4] + thr_id4;
+ const index_type offset_4 = m*m_rp.m_tile[4] + thr_id4 + (index_type)m_rp.m_lower[4];
if ( offset_4 < m_rp.m_upper[4] && thr_id4 < m_rp.m_tile[4] ) {

for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) {
- const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3;
+ const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3];
if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) {

for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) {
- const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2;
+ const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2];
if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) {

for ( index_type j = tile_id1 ; j < m_rp.m_tile_end[1]; j += numbl1 ) {
- const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
+ const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {

for ( index_type i = tile_id0 ; i < m_rp.m_tile_end[0]; i += numbl0 ) {
- const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
+ const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
m_func(Tag() , offset_0 , offset_1 , offset_2 , offset_3, offset_4, offset_5);
}

@@ -1136,27 +1100,27 @@ struct apply_impl<6,RP,Functor,Tag>
const index_type thr_id5 = threadIdx.z % m_rp.m_tile[5];

for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) {
- const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
+ const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {

for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) {
- const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
+ const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {

for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) {
- const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2;
+ const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2];
if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) {

for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) {
- const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3;
+ const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3];
if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) {

for ( index_type m = tile_id4; m < m_rp.m_tile_end[4]; m += numbl4 ) {
- const index_type offset_4 = m*m_rp.m_tile[4] + thr_id4;
+ const index_type offset_4 = m*m_rp.m_tile[4] + thr_id4 + (index_type)m_rp.m_lower[4];
if ( offset_4 < m_rp.m_upper[4] && thr_id4 < m_rp.m_tile[4] ) {

for ( index_type n = tile_id5; n < m_rp.m_tile_end[5]; n += numbl5 ) {
- const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5;
+ const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5 + (index_type)m_rp.m_lower[5];
if ( offset_5 < m_rp.m_upper[5] && thr_id5 < m_rp.m_tile[5] ) {
m_func(Tag() , offset_0 , offset_1 , offset_2 , offset_3 , offset_4 , offset_5);
}

@@ -1292,7 +1256,7 @@ protected:
const Functor m_func;
};

- } } } //end namespace Kokkos::Experimental::Impl
+ } } //end namespace Kokkos::Impl

#endif
#endif

@@ -63,7 +63,7 @@
#include <typeinfo>
#endif

- namespace Kokkos { namespace Experimental { namespace Impl {
+ namespace Kokkos { namespace Impl {

namespace Refactor {

@@ -2709,7 +2709,7 @@ private:

// ----------------------------------------------------------------------------------

- } } } //end namespace Kokkos::Experimental::Impl
+ } } //end namespace Kokkos::Impl

#endif
#endif

@@ -164,7 +164,7 @@ static void cuda_parallel_launch_constant_memory()

template< class DriverType, unsigned int maxTperB, unsigned int minBperSM >
__global__
- //__launch_bounds__(maxTperB, minBperSM)
+ __launch_bounds__(maxTperB, minBperSM)
static void cuda_parallel_launch_constant_memory()
{
const DriverType & driver =

@@ -182,7 +182,7 @@ static void cuda_parallel_launch_local_memory( const DriverType driver )

template< class DriverType, unsigned int maxTperB, unsigned int minBperSM >
__global__
- //__launch_bounds__(maxTperB, minBperSM)
+ __launch_bounds__(maxTperB, minBperSM)
static void cuda_parallel_launch_local_memory( const DriverType driver )
{
driver();

@@ -193,9 +193,14 @@ template < class DriverType
, bool Large = ( CudaTraits::ConstantMemoryUseThreshold < sizeof(DriverType) ) >
struct CudaParallelLaunch ;

- template < class DriverType, class LaunchBounds >
- struct CudaParallelLaunch< DriverType, LaunchBounds, true > {
+ template < class DriverType
+          , unsigned int MaxThreadsPerBlock
+          , unsigned int MinBlocksPerSM >
+ struct CudaParallelLaunch< DriverType
+                          , Kokkos::LaunchBounds< MaxThreadsPerBlock
+                                                , MinBlocksPerSM >
+                          , true >
+ {
inline
CudaParallelLaunch( const DriverType & driver
, const dim3 & grid

@@ -216,21 +221,28 @@ struct CudaParallelLaunch< DriverType, LaunchBounds, true > {
if ( CudaTraits::SharedMemoryCapacity < shmem ) {
Kokkos::Impl::throw_runtime_exception( std::string("CudaParallelLaunch FAILED: shared memory request is too large") );
}
- #ifndef KOKKOS_ARCH_KEPLER //On Kepler the L1 has no benefit since it doesn't cache reads
- else if ( shmem ) {
- CUDA_SAFE_CALL( cudaFuncSetCacheConfig( cuda_parallel_launch_constant_memory< DriverType, LaunchBounds::maxTperB, LaunchBounds::minBperSM > , cudaFuncCachePreferShared ) );
- } else {
- CUDA_SAFE_CALL( cudaFuncSetCacheConfig( cuda_parallel_launch_constant_memory< DriverType, LaunchBounds::maxTperB, LaunchBounds::minBperSM > , cudaFuncCachePreferL1 ) );
+ #ifndef KOKKOS_ARCH_KEPLER
+ // On Kepler the L1 has no benefit since it doesn't cache reads
+ else {
+ CUDA_SAFE_CALL(
+ cudaFuncSetCacheConfig
+ ( cuda_parallel_launch_constant_memory
+ < DriverType, MaxThreadsPerBlock, MinBlocksPerSM >
+ , ( shmem ? cudaFuncCachePreferShared : cudaFuncCachePreferL1 )
+ ) );
}
#endif

// Copy functor to constant memory on the device
- cudaMemcpyToSymbol( kokkos_impl_cuda_constant_memory_buffer , & driver , sizeof(DriverType) );
+ cudaMemcpyToSymbol(
+ kokkos_impl_cuda_constant_memory_buffer, &driver, sizeof(DriverType) );

KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE();

// Invoke the driver function on the device
- cuda_parallel_launch_constant_memory< DriverType, LaunchBounds::maxTperB, LaunchBounds::minBperSM ><<< grid , block , shmem , stream >>>();
+ cuda_parallel_launch_constant_memory
+ < DriverType, MaxThreadsPerBlock, MinBlocksPerSM >
+ <<< grid , block , shmem , stream >>>();

#if defined( KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK )
CUDA_SAFE_CALL( cudaGetLastError() );

@@ -240,9 +252,11 @@ struct CudaParallelLaunch< DriverType, LaunchBounds, true > {
}
};

- template < class DriverType, class LaunchBounds >
- struct CudaParallelLaunch< DriverType, LaunchBounds, false > {
+ template < class DriverType >
+ struct CudaParallelLaunch< DriverType
+                          , Kokkos::LaunchBounds<>
+                          , true >
+ {
inline
CudaParallelLaunch( const DriverType & driver
, const dim3 & grid

@@ -252,20 +266,136 @@ struct CudaParallelLaunch< DriverType, LaunchBounds, false > {
{
if ( grid.x && ( block.x * block.y * block.z ) ) {

if ( sizeof( Kokkos::Impl::CudaTraits::ConstantGlobalBufferType ) <
sizeof( DriverType ) ) {
Kokkos::Impl::throw_runtime_exception( std::string("CudaParallelLaunch FAILED: Functor is too large") );
}

// Fence before changing settings and copying closure
Kokkos::Cuda::fence();

if ( CudaTraits::SharedMemoryCapacity < shmem ) {
Kokkos::Impl::throw_runtime_exception( std::string("CudaParallelLaunch FAILED: shared memory request is too large") );
}
- #ifndef KOKKOS_ARCH_KEPLER //On Kepler the L1 has no benefit since it doesn't cache reads
- else if ( shmem ) {
- CUDA_SAFE_CALL( cudaFuncSetCacheConfig( cuda_parallel_launch_local_memory< DriverType, LaunchBounds::maxTperB, LaunchBounds::minBperSM > , cudaFuncCachePreferShared ) );
- } else {
- CUDA_SAFE_CALL( cudaFuncSetCacheConfig( cuda_parallel_launch_local_memory< DriverType, LaunchBounds::maxTperB, LaunchBounds::minBperSM > , cudaFuncCachePreferL1 ) );
+ #ifndef KOKKOS_ARCH_KEPLER
+ // On Kepler the L1 has no benefit since it doesn't cache reads
+ else {
+ CUDA_SAFE_CALL(
+ cudaFuncSetCacheConfig
+ ( cuda_parallel_launch_constant_memory< DriverType >
+ , ( shmem ? cudaFuncCachePreferShared : cudaFuncCachePreferL1 )
+ ) );
}
#endif

// Copy functor to constant memory on the device
cudaMemcpyToSymbol(
kokkos_impl_cuda_constant_memory_buffer, &driver, sizeof(DriverType) );

KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE();

// Invoke the driver function on the device
cuda_parallel_launch_constant_memory< DriverType >
<<< grid , block , shmem , stream >>>();

#if defined( KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK )
CUDA_SAFE_CALL( cudaGetLastError() );
Kokkos::Cuda::fence();
#endif
}
}
};

template < class DriverType
         , unsigned int MaxThreadsPerBlock
         , unsigned int MinBlocksPerSM >
struct CudaParallelLaunch< DriverType
                         , Kokkos::LaunchBounds< MaxThreadsPerBlock
                                               , MinBlocksPerSM >
                         , false >
{
inline
CudaParallelLaunch( const DriverType & driver
, const dim3 & grid
, const dim3 & block
, const int shmem
, const cudaStream_t stream = 0 )
{
if ( grid.x && ( block.x * block.y * block.z ) ) {

if ( sizeof( Kokkos::Impl::CudaTraits::ConstantGlobalBufferType ) <
sizeof( DriverType ) ) {
Kokkos::Impl::throw_runtime_exception( std::string("CudaParallelLaunch FAILED: Functor is too large") );
}

if ( CudaTraits::SharedMemoryCapacity < shmem ) {
Kokkos::Impl::throw_runtime_exception( std::string("CudaParallelLaunch FAILED: shared memory request is too large") );
}
#ifndef KOKKOS_ARCH_KEPLER
// On Kepler the L1 has no benefit since it doesn't cache reads
else {
CUDA_SAFE_CALL(
cudaFuncSetCacheConfig
( cuda_parallel_launch_local_memory
< DriverType, MaxThreadsPerBlock, MinBlocksPerSM >
, ( shmem ? cudaFuncCachePreferShared : cudaFuncCachePreferL1 )
) );
}
#endif

KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE();

- cuda_parallel_launch_local_memory< DriverType, LaunchBounds::maxTperB, LaunchBounds::minBperSM ><<< grid , block , shmem , stream >>>( driver );
+ // Invoke the driver function on the device
+ cuda_parallel_launch_local_memory
+ < DriverType, MaxThreadsPerBlock, MinBlocksPerSM >
+ <<< grid , block , shmem , stream >>>( driver );

#if defined( KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK )
CUDA_SAFE_CALL( cudaGetLastError() );
Kokkos::Cuda::fence();
#endif
}
}
};

template < class DriverType >
struct CudaParallelLaunch< DriverType
                         , Kokkos::LaunchBounds<>
                         , false >
{
inline
CudaParallelLaunch( const DriverType & driver
, const dim3 & grid
, const dim3 & block
, const int shmem
, const cudaStream_t stream = 0 )
{
if ( grid.x && ( block.x * block.y * block.z ) ) {

if ( sizeof( Kokkos::Impl::CudaTraits::ConstantGlobalBufferType ) <
sizeof( DriverType ) ) {
Kokkos::Impl::throw_runtime_exception( std::string("CudaParallelLaunch FAILED: Functor is too large") );
}

if ( CudaTraits::SharedMemoryCapacity < shmem ) {
Kokkos::Impl::throw_runtime_exception( std::string("CudaParallelLaunch FAILED: shared memory request is too large") );
}
#ifndef KOKKOS_ARCH_KEPLER
// On Kepler the L1 has no benefit since it doesn't cache reads
else {
CUDA_SAFE_CALL(
cudaFuncSetCacheConfig
( cuda_parallel_launch_local_memory< DriverType >
, ( shmem ? cudaFuncCachePreferShared : cudaFuncCachePreferL1 )
) );
}
#endif

KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE();

// Invoke the driver function on the device
cuda_parallel_launch_local_memory< DriverType >
<<< grid , block , shmem , stream >>>( driver );

#if defined( KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK )
CUDA_SAFE_CALL( cudaGetLastError() );

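Reviewer note: the rewritten CudaParallelLaunch specializations thread user-specified launch bounds through to __launch_bounds__ on the kernel. From user code the bounds travel as a policy template argument; a hedged usage sketch (N and the lambda body are placeholders, values are examples only):

    // Sketch: requesting MaxThreadsPerBlock = 256, MinBlocksPerSM = 4,
    // which selects the LaunchBounds<256,4> specializations above.
    Kokkos::parallel_for(
        Kokkos::RangePolicy<Kokkos::Cuda, Kokkos::LaunchBounds<256, 4>>(0, N),
        KOKKOS_LAMBDA(const int i) { /* ... */ });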
@@ -713,7 +713,7 @@ SharedAllocationRecord< Kokkos::CudaHostPinnedSpace , void >::get_record( void *
// Iterate records to print orphaned memory ...
void
SharedAllocationRecord< Kokkos::CudaSpace , void >::
- print_records( std::ostream & s , const Kokkos::CudaSpace & space , bool detail )
+ print_records( std::ostream & s , const Kokkos::CudaSpace & , bool detail )
{
SharedAllocationRecord< void , void > * r = & s_root_record ;

@@ -751,7 +751,7 @@ print_records( std::ostream & s , const Kokkos::CudaSpace & space , bool detail
, reinterpret_cast<uintptr_t>( r->m_dealloc )
, head.m_label
);
- std::cout << buffer ;
+ s << buffer ;
r = r->m_next ;
} while ( r != & s_root_record );
}

@@ -781,7 +781,7 @@ print_records( std::ostream & s , const Kokkos::CudaSpace & space , bool detail
else {
snprintf( buffer , 256 , "Cuda [ 0 + 0 ]\n" );
}
- std::cout << buffer ;
+ s << buffer ;
r = r->m_next ;
} while ( r != & s_root_record );
}

@@ -789,14 +789,14 @@ print_records( std::ostream & s , const Kokkos::CudaSpace & space , bool detail

void
SharedAllocationRecord< Kokkos::CudaUVMSpace , void >::
- print_records( std::ostream & s , const Kokkos::CudaUVMSpace & space , bool detail )
+ print_records( std::ostream & s , const Kokkos::CudaUVMSpace & , bool detail )
{
SharedAllocationRecord< void , void >::print_host_accessible_records( s , "CudaUVM" , & s_root_record , detail );
}

void
SharedAllocationRecord< Kokkos::CudaHostPinnedSpace , void >::
- print_records( std::ostream & s , const Kokkos::CudaHostPinnedSpace & space , bool detail )
+ print_records( std::ostream & s , const Kokkos::CudaHostPinnedSpace & , bool detail )
{
SharedAllocationRecord< void , void >::print_host_accessible_records( s , "CudaHostPinned" , & s_root_record , detail );
}

@@ -421,7 +421,7 @@ void CudaInternal::initialize( int cuda_device_id , int stream_count )
std::string msg = ss.str();
Kokkos::abort( msg.c_str() );
}
- if ( compiled_major != cudaProp.major || compiled_minor != cudaProp.minor ) {
+ if ( Kokkos::show_warnings() && (compiled_major != cudaProp.major || compiled_minor != cudaProp.minor) ) {
std::cerr << "Kokkos::Cuda::initialize WARNING: running kernels compiled for compute capability "
<< compiled_major << "." << compiled_minor
<< " on device with compute capability "

@@ -467,7 +467,7 @@ void CudaInternal::initialize( int cuda_device_id , int stream_count )

m_scratchUnifiedSupported = cudaProp.unifiedAddressing ;

- if ( ! m_scratchUnifiedSupported ) {
+ if ( Kokkos::show_warnings() && ! m_scratchUnifiedSupported ) {
std::cout << "Kokkos::Cuda device "
<< cudaProp.name << " capability "
<< cudaProp.major << "." << cudaProp.minor

@@ -545,7 +545,7 @@ void CudaInternal::initialize( int cuda_device_id , int stream_count )
}

#ifdef KOKKOS_ENABLE_CUDA_UVM
- if(!cuda_launch_blocking()) {
+ if( Kokkos::show_warnings() && !cuda_launch_blocking() ) {
std::cout << "Kokkos::Cuda::initialize WARNING: Cuda is allocating into UVMSpace by default" << std::endl;
std::cout << " without setting CUDA_LAUNCH_BLOCKING=1." << std::endl;
std::cout << " The code must call Cuda::fence() after each kernel" << std::endl;

@@ -561,7 +561,7 @@ void CudaInternal::initialize( int cuda_device_id , int stream_count )
bool visible_devices_one=true;
if (env_visible_devices == 0) visible_devices_one=false;

- if(!visible_devices_one && !force_device_alloc) {
+ if( Kokkos::show_warnings() && (!visible_devices_one && !force_device_alloc) ) {
std::cout << "Kokkos::Cuda::initialize WARNING: Cuda is allocating into UVMSpace by default" << std::endl;
std::cout << " without setting CUDA_MANAGED_FORCE_DEVICE_ALLOC=1 or " << std::endl;
std::cout << " setting CUDA_VISIBLE_DEVICES." << std::endl;

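Reviewer note: the four hunks above gate initialization-time diagnostics behind Kokkos::show_warnings(), which pairs with the disable_warnings field added to InitArguments at the end of this diff. A hedged sketch of turning the warnings off from an application:

    // Sketch: suppressing the warnings gated by Kokkos::show_warnings().
    Kokkos::InitArguments args;
    args.disable_warnings = true;  // field introduced in this commit
    Kokkos::initialize(args);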
@@ -381,12 +381,12 @@ public:
// MDRangePolicy impl
template< class FunctorType , class ... Traits >
class ParallelFor< FunctorType
- , Kokkos::Experimental::MDRangePolicy< Traits ... >
+ , Kokkos::MDRangePolicy< Traits ... >
, Kokkos::Cuda
>
{
private:
- typedef Kokkos::Experimental::MDRangePolicy< Traits ... > Policy ;
+ typedef Kokkos::MDRangePolicy< Traits ... > Policy ;
using RP = Policy;
typedef typename Policy::array_index_type array_index_type;
typedef typename Policy::index_type index_type;

@@ -402,7 +402,7 @@ public:
__device__
void operator()(void) const
{
- Kokkos::Experimental::Impl::Refactor::DeviceIterateTile<Policy::rank,Policy,FunctorType,typename Policy::work_tag>(m_rp,m_functor).exec_range();
+ Kokkos::Impl::Refactor::DeviceIterateTile<Policy::rank,Policy,FunctorType,typename Policy::work_tag>(m_rp,m_functor).exec_range();
}

@@ -858,14 +858,14 @@ public:
// MDRangePolicy impl
template< class FunctorType , class ReducerType, class ... Traits >
class ParallelReduce< FunctorType
- , Kokkos::Experimental::MDRangePolicy< Traits ... >
+ , Kokkos::MDRangePolicy< Traits ... >
, ReducerType
, Kokkos::Cuda
>
{
private:

- typedef Kokkos::Experimental::MDRangePolicy< Traits ... > Policy ;
+ typedef Kokkos::MDRangePolicy< Traits ... > Policy ;
typedef typename Policy::array_index_type array_index_type;
typedef typename Policy::index_type index_type;

@@ -898,7 +898,7 @@ public:
size_type * m_scratch_flags ;
size_type * m_unified_space ;

- typedef typename Kokkos::Experimental::Impl::Reduce::DeviceIterateTile<Policy::rank, Policy, FunctorType, typename Policy::work_tag, reference_type> DeviceIteratePattern;
+ typedef typename Kokkos::Impl::Reduce::DeviceIterateTile<Policy::rank, Policy, FunctorType, typename Policy::work_tag, reference_type> DeviceIteratePattern;

// Shall we use the shfl based reduction or not (only use it for static sized types of more than 128bit
enum { UseShflReduction = ((sizeof(value_type)>2*sizeof(double)) && ValueTraits::StaticValueSize) };

@@ -913,7 +913,7 @@ public:
void
exec_range( reference_type update ) const
{
- Kokkos::Experimental::Impl::Reduce::DeviceIterateTile<Policy::rank,Policy,FunctorType,typename Policy::work_tag, reference_type>(m_policy, m_functor, update).exec_range();
+ Kokkos::Impl::Reduce::DeviceIterateTile<Policy::rank,Policy,FunctorType,typename Policy::work_tag, reference_type>(m_policy, m_functor, update).exec_range();
}

inline

@@ -127,11 +127,11 @@ struct CudaTextureFetch {
template< class CudaMemorySpace >
inline explicit
CudaTextureFetch( const ValueType * const arg_ptr
- , Kokkos::Experimental::Impl::SharedAllocationRecord< CudaMemorySpace , void > & record
+ , Kokkos::Impl::SharedAllocationRecord< CudaMemorySpace , void > * record
)
- : m_obj( record.template attach_texture_object< AliasType >() )
+ : m_obj( record->template attach_texture_object< AliasType >() )
, m_ptr( arg_ptr )
- , m_offset( record.attach_texture_object_offset( reinterpret_cast<const AliasType*>( arg_ptr ) ) )
+ , m_offset( record->attach_texture_object_offset( reinterpret_cast<const AliasType*>( arg_ptr ) ) )
{}

// Texture object spans the entire allocation.

@@ -199,8 +199,8 @@ struct CudaLDGFetch {
template< class CudaMemorySpace >
inline explicit
CudaLDGFetch( const ValueType * const arg_ptr
- , Kokkos::Experimental::Impl::SharedAllocationRecord< CudaMemorySpace , void > const &
- )
+ , Kokkos::Impl::SharedAllocationRecord<CudaMemorySpace,void>*
+ )
: m_ptr( arg_ptr )
{}

@@ -285,7 +285,21 @@ public:
// Assignment of texture = non-texture requires creation of a texture object
// which can only occur on the host. In addition, 'get_record' is only valid
// if called in a host execution space
- return handle_type( arg_data_ptr , arg_tracker.template get_record< typename Traits::memory_space >() );
+
+ typedef typename Traits::memory_space memory_space ;
+ typedef typename Impl::SharedAllocationRecord<memory_space,void> record ;
+
+ record * const r = arg_tracker.template get_record< memory_space >();
+
+ #if ! defined( KOKKOS_ENABLE_CUDA_LDG_INTRINSIC )
+ if ( 0 == r ) {
+ Kokkos::abort("Cuda const random access View using Cuda texture memory requires Kokkos to allocate the View's memory");
+ }
+ #endif
+
+ return handle_type( arg_data_ptr , r );

#else
Kokkos::Impl::cuda_abort("Cannot create Cuda texture object from within a Cuda kernel");
return handle_type();

@@ -48,50 +48,52 @@ namespace Kokkos {
namespace Impl {

template< class FunctorType , class ... Traits >
- class ParallelFor< FunctorType ,
-                    Kokkos::Experimental::WorkGraphPolicy< Traits ... > ,
-                    Kokkos::Cuda
+ class ParallelFor< FunctorType
+                  , Kokkos::WorkGraphPolicy< Traits ... >
+                  , Kokkos::Cuda
>
- : public Kokkos::Impl::Experimental::
-          WorkGraphExec< FunctorType,
-                         Kokkos::Cuda,
-                         Traits ...
-                       >
{
public:

- typedef Kokkos::Experimental::WorkGraphPolicy< Traits ... > Policy ;
- typedef Kokkos::Impl::Experimental::
-         WorkGraphExec<FunctorType, Kokkos::Cuda, Traits ... > Base ;
+ typedef Kokkos::WorkGraphPolicy< Traits ... > Policy ;
+ typedef ParallelFor<FunctorType, Policy, Kokkos::Cuda> Self ;

private:

- template< class TagType >
- __device__
- typename std::enable_if< std::is_same< TagType , void >::value >::type
- exec_one(const typename Policy::member_type& i) const {
-   Base::m_functor( i );
- }
+ Policy m_policy ;
+ FunctorType m_functor ;

template< class TagType >
- __device__
+ __device__ inline
typename std::enable_if< std::is_same< TagType , void >::value >::type
+ exec_one( const std::int32_t w ) const noexcept
+   { m_functor( w ); }
+
+ template< class TagType >
+ __device__ inline
typename std::enable_if< ! std::is_same< TagType , void >::value >::type
- exec_one(const typename Policy::member_type& i) const {
-   const TagType t{} ;
-   Base::m_functor( t , i );
- }
+ exec_one( const std::int32_t w ) const noexcept
+   { const TagType t{} ; m_functor( t , w ); }

public:

- __device__
- inline
- void operator()() const {
-   for (std::int32_t i; (-1 != (i = Base::before_work())); ) {
-     exec_one< typename Policy::work_tag >( i );
-     Base::after_work(i);
+ __device__ inline
+ void operator()() const noexcept
+   {
+     if ( 0 == ( threadIdx.y % 16 ) ) {
+
+       // Spin until COMPLETED_TOKEN.
+       // END_TOKEN indicates no work is currently available.
+
+       for ( std::int32_t w = Policy::END_TOKEN ;
+             Policy::COMPLETED_TOKEN != ( w = m_policy.pop_work() ) ; ) {
+         if ( Policy::END_TOKEN != w ) {
+           exec_one< typename Policy::work_tag >( w );
+           m_policy.completed_work(w);
+         }
+       }
     }
   }

inline
void execute()

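Reviewer note: the WorkGraph ParallelFor no longer derives from WorkGraphExec; instead one thread per 16 in y spins on m_policy.pop_work(), retiring items with completed_work() until COMPLETED_TOKEN arrives. A minimal host-side sketch of that token protocol (queue and run are stand-ins, not library code):

    // Sketch of the consume loop above:
    //   END_TOKEN       -> nothing ready yet, keep polling;
    //   COMPLETED_TOKEN -> all work retired, exit.
    for (std::int32_t w = queue.END_TOKEN;
         queue.COMPLETED_TOKEN != (w = queue.pop_work());) {
      if (queue.END_TOKEN != w) {
        run(w);                   // user functor on work item w
        queue.completed_work(w);  // unblocks successors in the graph
      }
    }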
@@ -108,9 +110,9 @@ public:
inline
ParallelFor( const FunctorType & arg_functor
, const Policy & arg_policy )
- : Base( arg_functor, arg_policy )
- {
- }
+ : m_policy( arg_policy )
+ , m_functor( arg_functor )
+ {}
};

} // namespace Impl

@@ -55,7 +55,7 @@
#include <Cuda/KokkosExp_Cuda_IterateTile_Refactor.hpp>
#endif

- namespace Kokkos { namespace Experimental {
+ namespace Kokkos {

// ------------------------------------------------------------------ //

@@ -331,11 +331,23 @@ struct MDRangePolicy
}

};

+ } // namespace Kokkos
+
+ // For backward compatibility
+ namespace Kokkos { namespace Experimental {
+ using Kokkos::MDRangePolicy;
+ using Kokkos::Rank;
+ using Kokkos::Iterate;
+ } } // end Kokkos::Experimental
// ------------------------------------------------------------------ //

// ------------------------------------------------------------------ //
//md_parallel_for - deprecated use parallel_for
// ------------------------------------------------------------------ //

namespace Kokkos { namespace Experimental {

template <typename MDRange, typename Functor, typename Enable = void>
void md_parallel_for( MDRange const& range
, Functor const& f

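Reviewer note: with MDRangePolicy, Rank, and Iterate promoted out of Kokkos::Experimental (the aliases above keep old code compiling), the policy is now spelled directly in namespace Kokkos. A hedged 2-D usage example (N0 and N1 are placeholders):

    // Sketch: MDRangePolicy after the namespace promotion.
    Kokkos::parallel_for(
        Kokkos::MDRangePolicy<Kokkos::Rank<2>>({0, 0}, {N0, N1}),
        KOKKOS_LAMBDA(const int i, const int j) { /* ... */ });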
@@ -347,7 +359,7 @@ void md_parallel_for( MDRange const& range
) >::type* = 0
)
{
- Impl::MDFunctor<MDRange, Functor, void> g(range, f);
+ Kokkos::Impl::Experimental::MDFunctor<MDRange, Functor, void> g(range, f);

using range_policy = typename MDRange::impl_range_policy;

@@ -365,7 +377,7 @@ void md_parallel_for( const std::string& str
) >::type* = 0
)
{
- Impl::MDFunctor<MDRange, Functor, void> g(range, f);
+ Kokkos::Impl::Experimental::MDFunctor<MDRange, Functor, void> g(range, f);

using range_policy = typename MDRange::impl_range_policy;

@ -385,7 +397,7 @@ void md_parallel_for( const std::string& str
|
||||
) >::type* = 0
|
||||
)
|
||||
{
|
||||
Impl::DeviceIterateTile<MDRange, Functor, typename MDRange::work_tag> closure(range, f);
|
||||
Kokkos::Impl::DeviceIterateTile<MDRange, Functor, typename MDRange::work_tag> closure(range, f);
|
||||
closure.execute();
|
||||
}
|
||||
|
||||
@ -400,7 +412,7 @@ void md_parallel_for( MDRange const& range
|
||||
) >::type* = 0
|
||||
)
|
||||
{
|
||||
Impl::DeviceIterateTile<MDRange, Functor, typename MDRange::work_tag> closure(range, f);
|
||||
Kokkos::Impl::DeviceIterateTile<MDRange, Functor, typename MDRange::work_tag> closure(range, f);
|
||||
closure.execute();
|
||||
}
|
||||
#endif
|
||||
@ -421,7 +433,7 @@ void md_parallel_reduce( MDRange const& range
|
||||
) >::type* = 0
|
||||
)
|
||||
{
|
||||
Impl::MDFunctor<MDRange, Functor, ValueType> g(range, f);
|
||||
Kokkos::Impl::Experimental::MDFunctor<MDRange, Functor, ValueType> g(range, f);
|
||||
|
||||
using range_policy = typename MDRange::impl_range_policy;
|
||||
Kokkos::parallel_reduce( str, range_policy(0, range.m_num_tiles).set_chunk_size(1), g, v );
|
||||
@ -439,7 +451,7 @@ void md_parallel_reduce( const std::string& str
|
||||
) >::type* = 0
|
||||
)
|
||||
{
|
||||
Impl::MDFunctor<MDRange, Functor, ValueType> g(range, f);
|
||||
Kokkos::Impl::Experimental::MDFunctor<MDRange, Functor, ValueType> g(range, f);
|
||||
|
||||
using range_policy = typename MDRange::impl_range_policy;
|
||||
|
||||
@ -448,7 +460,7 @@ void md_parallel_reduce( const std::string& str
|
||||
|
||||
// Cuda - md_parallel_reduce not implemented - use parallel_reduce
|
||||
|
||||
}} // namespace Kokkos::Experimental
|
||||
} } // namespace Kokkos::Experimental
|
||||
|
||||
#endif //KOKKOS_CORE_EXP_MD_RANGE_POLICY_HPP
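
// [Editorial sketch -- not part of the commit.] Migration away from the
// deprecated entry points above, assuming a functor 'f' callable as f(i, j)
// and extents n0 x n1:
//
//   // Deprecated:
//   //   Kokkos::Experimental::md_parallel_for( range, f );
//   // Preferred:
//   Kokkos::parallel_for(
//     Kokkos::MDRangePolicy< Kokkos::Rank<2> >( {0, 0}, {n0, n1} ), f );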


@ -81,10 +81,10 @@ struct IndexType

/**\brief Specify Launch Bounds for CUDA execution.
*
* The "best" defaults may be architecture specific.
* If no launch bounds specified then do not set launch bounds.
*/
template< unsigned int maxT = 1024 /* Max threads per block */
, unsigned int minB = 1 /* Min blocks per SM */
template< unsigned int maxT = 0 /* Max threads per block */
, unsigned int minB = 0 /* Min blocks per SM */
>
struct LaunchBounds
{
@ -280,6 +280,9 @@ struct MemorySpaceAccess {
enum { deepcopy = assignable };
};

}} // namespace Kokkos::Impl

namespace Kokkos {

/**\brief Can AccessSpace access MemorySpace ?
*
@ -358,6 +361,13 @@ public:
>::type space ;
};

} // namespace Kokkos

namespace Kokkos {
namespace Impl {

using Kokkos::SpaceAccessibility ; // For backward compatibility

}} // namespace Kokkos::Impl

//----------------------------------------------------------------------------

@ -99,13 +99,17 @@ struct InitArguments {
int num_threads;
int num_numa;
int device_id;
bool disable_warnings;

InitArguments( int nt = -1
, int nn = -1
, int dv = -1)
: num_threads( nt )
, num_numa( nn )
, device_id( dv )
, int dv = -1
, bool dw = false
)
: num_threads{ nt }
, num_numa{ nn }
, device_id{ dv }
, disable_warnings{ dw }
{}
};

@ -113,6 +117,10 @@ void initialize(int& narg, char* arg[]);

void initialize(const InitArguments& args = InitArguments());

bool is_initialized() noexcept;

bool show_warnings() noexcept;

/** \brief Finalize the spaces that were initialized via Kokkos::initialize */
void finalize();
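
// [Editorial sketch -- not part of the commit.] The new fourth constructor
// argument suppresses runtime warnings; show_warnings() presumably reports
// the resulting global setting after initialize(). Argument order as in the
// constructor above:
//
//   Kokkos::InitArguments args( 4    /* num_threads */
//                             , -1   /* num_numa */
//                             , -1   /* device_id */
//                             , true /* disable_warnings */ );
//   Kokkos::initialize( args );
//   /* ... Kokkos::show_warnings() is expected to return false here ... */
//   Kokkos::finalize();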


@ -45,7 +45,6 @@
#define KOKKOS_CRS_HPP

namespace Kokkos {
namespace Experimental {

/// \class Crs
/// \brief Compressed row storage array.
@ -164,7 +163,7 @@ void transpose_crs(
Crs<DataType, Arg1Type, Arg2Type, SizeType>& out,
Crs<DataType, Arg1Type, Arg2Type, SizeType> const& in);

}} // namespace Kokkos::Experimental
} // namespace Kokkos

/*--------------------------------------------------------------------------*/

@ -172,7 +171,6 @@ void transpose_crs(

namespace Kokkos {
namespace Impl {
namespace Experimental {

template <class InCrs, class OutCounts>
class GetCrsTransposeCounts {
@ -277,14 +275,13 @@ class FillCrsTransposeEntries {
}
};

}}} // namespace Kokkos::Impl::Experimental
}} // namespace Kokkos::Impl

/*--------------------------------------------------------------------------*/

/*--------------------------------------------------------------------------*/

namespace Kokkos {
namespace Experimental {

template< class OutCounts,
class DataType,
@ -297,8 +294,7 @@ void get_crs_transpose_counts(
std::string const& name) {
using InCrs = Crs<DataType, Arg1Type, Arg2Type, SizeType>;
out = OutCounts(name, in.numRows());
Kokkos::Impl::Experimental::
GetCrsTransposeCounts<InCrs, OutCounts> functor(in, out);
Kokkos::Impl::GetCrsTransposeCounts<InCrs, OutCounts> functor(in, out);
}

template< class OutRowMap,
@ -308,8 +304,7 @@ typename OutRowMap::value_type get_crs_row_map_from_counts(
InCounts const& in,
std::string const& name) {
out = OutRowMap(ViewAllocateWithoutInitializing(name), in.size() + 1);
Kokkos::Impl::Experimental::
CrsRowMapFromCounts<InCounts, OutRowMap> functor(in, out);
Kokkos::Impl::CrsRowMapFromCounts<InCounts, OutRowMap> functor(in, out);
return functor.execute();
}

@ -326,32 +321,37 @@ void transpose_crs(
typedef View<SizeType*, memory_space> counts_type ;
{
counts_type counts;
Kokkos::Experimental::get_crs_transpose_counts(counts, in);
Kokkos::Experimental::get_crs_row_map_from_counts(out.row_map, counts,
Kokkos::get_crs_transpose_counts(counts, in);
Kokkos::get_crs_row_map_from_counts(out.row_map, counts,
"tranpose_row_map");
}
out.entries = decltype(out.entries)("transpose_entries", in.entries.size());
Kokkos::Impl::Experimental::
Kokkos::Impl::
FillCrsTransposeEntries<crs_type, crs_type> entries_functor(in, out);
}

template< class CrsType,
class Functor>
struct CountAndFill {
class Functor,
class ExecutionSpace = typename CrsType::execution_space>
struct CountAndFillBase;

template< class CrsType,
class Functor,
class ExecutionSpace>
struct CountAndFillBase {
using data_type = typename CrsType::size_type;
using size_type = typename CrsType::size_type;
using row_map_type = typename CrsType::row_map_type;
using entries_type = typename CrsType::entries_type;
using counts_type = row_map_type;
CrsType m_crs;
Functor m_functor;
counts_type m_counts;
struct Count {};
KOKKOS_INLINE_FUNCTION void operator()(Count, size_type i) const {
inline void operator()(Count, size_type i) const {
m_counts(i) = m_functor(i, nullptr);
}
struct Fill {};
KOKKOS_INLINE_FUNCTION void operator()(Fill, size_type i) const {
inline void operator()(Fill, size_type i) const {
auto j = m_crs.row_map(i);
/* we don't want to access entries(entries.size()), even if its just to get its
address and never use it.
@ -363,13 +363,63 @@ struct CountAndFill {
nullptr : (&(m_crs.entries(j)));
m_functor(i, fill);
}
using self_type = CountAndFill<CrsType, Functor>;
CountAndFill(CrsType& crs, size_type nrows, Functor const& f):
CountAndFillBase(CrsType& crs, Functor const& f):
m_crs(crs),
m_functor(f)
{}
};

#if defined( KOKKOS_ENABLE_CUDA )
template< class CrsType,
class Functor>
struct CountAndFillBase<CrsType, Functor, Kokkos::Cuda> {
using data_type = typename CrsType::size_type;
using size_type = typename CrsType::size_type;
using row_map_type = typename CrsType::row_map_type;
using counts_type = row_map_type;
CrsType m_crs;
Functor m_functor;
counts_type m_counts;
struct Count {};
__device__ inline void operator()(Count, size_type i) const {
m_counts(i) = m_functor(i, nullptr);
}
struct Fill {};
__device__ inline void operator()(Fill, size_type i) const {
auto j = m_crs.row_map(i);
/* we don't want to access entries(entries.size()), even if its just to get its
address and never use it.
this can happen when row (i) is empty and all rows after it are also empty.
we could compare to row_map(i + 1), but that is a read from global memory,
whereas dimension_0() should be part of the View in registers (or constant memory) */
data_type* fill =
(j == static_cast<decltype(j)>(m_crs.entries.dimension_0())) ?
nullptr : (&(m_crs.entries(j)));
m_functor(i, fill);
}
CountAndFillBase(CrsType& crs, Functor const& f):
m_crs(crs),
m_functor(f)
{}
};
#endif

template< class CrsType,
class Functor>
struct CountAndFill : public CountAndFillBase<CrsType, Functor> {
using base_type = CountAndFillBase<CrsType, Functor>;
using typename base_type::data_type;
using typename base_type::size_type;
using typename base_type::counts_type;
using typename base_type::Count;
using typename base_type::Fill;
using entries_type = typename CrsType::entries_type;
using self_type = CountAndFill<CrsType, Functor>;
CountAndFill(CrsType& crs, size_type nrows, Functor const& f):
base_type(crs, f)
{
using execution_space = typename CrsType::execution_space;
m_counts = counts_type("counts", nrows);
this->m_counts = counts_type("counts", nrows);
{
using count_policy_type = RangePolicy<size_type, execution_space, Count>;
using count_closure_type =
@ -377,10 +427,10 @@ struct CountAndFill {
const count_closure_type closure(*this, count_policy_type(0, nrows));
closure.execute();
}
auto nentries = Kokkos::Experimental::
get_crs_row_map_from_counts(m_crs.row_map, m_counts);
m_counts = counts_type();
m_crs.entries = entries_type("entries", nentries);
auto nentries = Kokkos::
get_crs_row_map_from_counts(this->m_crs.row_map, this->m_counts);
this->m_counts = counts_type();
this->m_crs.entries = entries_type("entries", nentries);
{
using fill_policy_type = RangePolicy<size_type, execution_space, Fill>;
using fill_closure_type =
@ -388,7 +438,7 @@ struct CountAndFill {
const fill_closure_type closure(*this, fill_policy_type(0, nrows));
closure.execute();
}
crs = m_crs;
crs = this->m_crs;
}
};

@ -398,9 +448,9 @@ void count_and_fill_crs(
CrsType& crs,
typename CrsType::size_type nrows,
Functor const& f) {
Kokkos::Experimental::CountAndFill<CrsType, Functor>(crs, nrows, f);
Kokkos::CountAndFill<CrsType, Functor>(crs, nrows, f);
}

}} // namespace Kokkos::Experimental
} // namespace Kokkos

#endif /* #define KOKKOS_CRS_HPP */
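
// [Editorial sketch -- not part of the commit.] count_and_fill_crs drives
// the two-pass CountAndFill protocol above: the user functor is first
// invoked with fill == nullptr and must return the row length, then invoked
// again with 'fill' pointing at that row's entry storage. A hypothetical
// functor putting {row, row+1} in each row (entry/width types assumed to be
// std::int32_t for illustration):
//
//   struct TwoPerRow {
//     KOKKOS_INLINE_FUNCTION
//     std::int32_t operator()( std::int32_t row, std::int32_t* fill ) const {
//       if ( fill ) { fill[0] = row ; fill[1] = row + 1 ; }
//       return 2 ;
//     }
//   };
//   Kokkos::Crs<std::int32_t, Kokkos::DefaultExecutionSpace, void, std::int32_t> crs;
//   Kokkos::count_and_fill_crs( crs, nrows, TwoPerRow() );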

@ -379,12 +379,13 @@ Impl::PerThreadValue PerThread(const int& arg);
* uses variadic templates. Each and any of the template arguments can
* be omitted.
*
* Possible Template arguments and there default values:
* Possible Template arguments and their default values:
* ExecutionSpace (DefaultExecutionSpace): where to execute code. Must be enabled.
* WorkTag (none): Tag which is used as the first argument for the functor operator.
* Schedule<Type> (Schedule<Static>): Scheduling Policy (Dynamic, or Static).
* IndexType<Type> (IndexType<ExecutionSpace::size_type>: Integer Index type used to iterate over the Index space.
* LaunchBounds<int,int> (LaunchBounds<1024,1>: Launch Bounds for CUDA compilation.
* LaunchBounds<unsigned,unsigned> Launch Bounds for CUDA compilation,
* default of LaunchBounds<0,0> indicates no launch bounds specified.
*/
template< class ... Properties>
class TeamPolicy: public
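
// [Editorial sketch -- not part of the commit.] With the new LaunchBounds<0,0>
// default, occupancy bounds are attached to a kernel only when spelled out
// explicitly, e.g. (assuming CUDA is enabled and 'functor' is a team functor):
//
//   typedef Kokkos::TeamPolicy< Kokkos::Cuda
//                             , Kokkos::LaunchBounds<256,4> > bounded_policy ;
//   Kokkos::parallel_for( bounded_policy( league_size, Kokkos::AUTO ), functor );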

@ -274,18 +274,14 @@
#define KOKKOS_ENABLE_PRAGMA_IVDEP 1
#endif

#if ! defined( KOKKOS_MEMORY_ALIGNMENT )
#define KOKKOS_MEMORY_ALIGNMENT 64
#endif

#define KOKKOS_RESTRICT __restrict__

#ifndef KOKKOS_ALIGN
#define KOKKOS_ALIGN(size) __attribute__((aligned(size)))
#endif

#ifndef KOKKOS_ALIGN_PTR
#define KOKKOS_ALIGN_PTR(size) __attribute__((align_value(size)))
#endif

#ifndef KOKKOS_ALIGN_SIZE
#define KOKKOS_ALIGN_SIZE 64
#ifndef KOKKOS_IMPL_ALIGN_PTR
#define KOKKOS_IMPL_ALIGN_PTR(size) __attribute__((align_value(size)))
#endif

#if ( 1400 > KOKKOS_COMPILER_INTEL )
@ -351,6 +347,11 @@
#if !defined( KOKKOS_FORCEINLINE_FUNCTION )
#define KOKKOS_FORCEINLINE_FUNCTION inline __attribute__((always_inline))
#endif

#if !defined( KOKKOS_IMPL_ALIGN_PTR )
#define KOKKOS_IMPL_ALIGN_PTR(size) __attribute__((aligned(size)))
#endif

#endif

//----------------------------------------------------------------------------
@ -426,16 +427,16 @@
//----------------------------------------------------------------------------
// Define Macro for alignment:

#if !defined KOKKOS_ALIGN_SIZE
#define KOKKOS_ALIGN_SIZE 16
#if ! defined( KOKKOS_MEMORY_ALIGNMENT )
#define KOKKOS_MEMORY_ALIGNMENT 16
#endif

#if !defined( KOKKOS_ALIGN )
#define KOKKOS_ALIGN(size) __attribute__((aligned(size)))
#if ! defined( KOKKOS_MEMORY_ALIGNMENT_THRESHOLD )
#define KOKKOS_MEMORY_ALIGNMENT_THRESHOLD 4
#endif

#if !defined( KOKKOS_ALIGN_PTR )
#define KOKKOS_ALIGN_PTR(size) __attribute__((aligned(size)))
#if !defined( KOKKOS_IMPL_ALIGN_PTR )
#define KOKKOS_IMPL_ALIGN_PTR(size) /* */
#endif

//----------------------------------------------------------------------------

@ -111,6 +111,10 @@ private:

public:

/**\brief The maximum size of a superblock and block */
enum : uint32_t { max_superblock_size = 1LU << 31 /* 2 gigabytes */ };
enum : uint32_t { max_block_per_superblock = max_bit_count };

//--------------------------------------------------------------------------

KOKKOS_INLINE_FUNCTION
@ -206,7 +210,7 @@ public:
const uint32_t * sb_state_ptr = sb_state_array ;

s << "pool_size(" << ( size_t(m_sb_count) << m_sb_size_lg2 ) << ")"
<< " superblock_size(" << ( 1 << m_sb_size_lg2 ) << ")" << std::endl ;
<< " superblock_size(" << ( 1LU << m_sb_size_lg2 ) << ")" << std::endl ;

for ( int32_t i = 0 ; i < m_sb_count
; ++i , sb_state_ptr += m_sb_state_size ) {
@ -215,7 +219,7 @@ public:

const uint32_t block_count_lg2 = (*sb_state_ptr) >> state_shift ;
const uint32_t block_size_lg2 = m_sb_size_lg2 - block_count_lg2 ;
const uint32_t block_count = 1 << block_count_lg2 ;
const uint32_t block_count = 1u << block_count_lg2 ;
const uint32_t block_used = (*sb_state_ptr) & state_used_mask ;

s << "Superblock[ " << i << " / " << m_sb_count << " ] {"
@ -284,43 +288,87 @@ public:
{
const uint32_t int_align_lg2 = 3 ; /* align as int[8] */
const uint32_t int_align_mask = ( 1u << int_align_lg2 ) - 1 ;
const uint32_t default_min_block_size = 1u << 6 ; /* 64 bytes */
const uint32_t default_max_block_size = 1u << 12 ;/* 4k bytes */
const uint32_t default_min_superblock_size = 1u << 20 ;/* 1M bytes */

// Constraints and defaults:
// min_block_alloc_size <= max_block_alloc_size
// max_block_alloc_size <= min_superblock_size
// min_superblock_size <= min_total_alloc_size
//--------------------------------------------------
// Default block and superblock sizes:

const uint32_t MIN_BLOCK_SIZE = 1u << 6 /* 64 bytes */ ;
const uint32_t MAX_BLOCK_SIZE = 1u << 12 /* 4k bytes */ ;
if ( 0 == min_block_alloc_size ) {
// Default all sizes:

if ( 0 == min_block_alloc_size ) min_block_alloc_size = MIN_BLOCK_SIZE ;
min_superblock_size =
std::min( size_t(default_min_superblock_size)
, min_total_alloc_size );

min_block_alloc_size =
std::min( size_t(default_min_block_size)
, min_superblock_size );

max_block_alloc_size =
std::min( size_t(default_max_block_size)
, min_superblock_size );
}
else if ( 0 == min_superblock_size ) {

// Choose superblock size as minimum of:
// max_block_per_superblock * min_block_size
// max_superblock_size
// min_total_alloc_size

const size_t max_superblock =
min_block_alloc_size * max_block_per_superblock ;

min_superblock_size =
std::min( max_superblock ,
std::min( size_t(max_superblock_size)
, min_total_alloc_size ) );
}

if ( 0 == max_block_alloc_size ) {

max_block_alloc_size = MAX_BLOCK_SIZE ;

// Upper bound of total allocation size
max_block_alloc_size = std::min( size_t(max_block_alloc_size)
, min_total_alloc_size );

// Lower bound of minimum block size
max_block_alloc_size = std::max( max_block_alloc_size
, min_block_alloc_size );
max_block_alloc_size = min_superblock_size ;
}

if ( 0 == min_superblock_size ) {
min_superblock_size = max_block_alloc_size ;
//--------------------------------------------------

// Upper bound of total allocation size
min_superblock_size = std::min( size_t(min_superblock_size)
, min_total_alloc_size );
{
/* Enforce size constraints:
* min_block_alloc_size <= max_block_alloc_size
* max_block_alloc_size <= min_superblock_size
* min_superblock_size <= max_superblock_size
* min_superblock_size <= min_total_alloc_size
* min_superblock_size <= min_block_alloc_size *
* max_block_per_superblock
*/

// Lower bound of maximum block size
min_superblock_size = std::max( min_superblock_size
, max_block_alloc_size );
const size_t max_superblock =
min_block_alloc_size * max_block_per_superblock ;

if ( ( size_t(max_superblock_size) < min_superblock_size ) ||
( min_total_alloc_size < min_superblock_size ) ||
( max_superblock < min_superblock_size ) ||
( min_superblock_size < max_block_alloc_size ) ||
( max_block_alloc_size < min_block_alloc_size ) ) {

#if 1
printf( " MemoryPool min_block_alloc_size(%ld) max_block_alloc_size(%ld) min_superblock_size(%ld) min_total_alloc_size(%ld) ; max_superblock_size(%ld) max_block_per_superblock(%ld)\n"
, min_block_alloc_size
, max_block_alloc_size
, min_superblock_size
, min_total_alloc_size
, size_t(max_superblock_size)
, size_t(max_block_per_superblock)
);
#endif

Kokkos::abort("Kokkos MemoryPool size constraint violation");
}
}
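
// [Editorial sketch -- not part of the commit.] Under the defaulting logic
// above only 'min_total_alloc_size' is mandatory; any size argument left
// zero is derived, and inconsistent explicit sizes now abort instead of
// being silently adjusted. Parameter order below is an assumption based on
// this constructor:
//
//   Kokkos::MemoryPool< Kokkos::Cuda >
//     pool( Kokkos::CudaSpace()
//         , size_t(1) << 26   /* min_total_alloc_size : 64M pool        */
//         , 64                /* min_block_alloc_size : defaulted if 0  */
//         , 4096              /* max_block_alloc_size : defaulted if 0  */
//         , size_t(1) << 20   /* min_superblock_size  : defaulted if 0  */ );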

//--------------------------------------------------
// Block and superblock size is power of two:
// Maximum value is 'max_superblock_size'

m_min_block_size_lg2 =
Kokkos::Impl::integral_power_of_two_that_contains(min_block_alloc_size);
@ -331,45 +379,26 @@ public:
m_sb_size_lg2 =
Kokkos::Impl::integral_power_of_two_that_contains(min_superblock_size);

// Constraints:
// m_min_block_size_lg2 <= m_max_block_size_lg2 <= m_sb_size_lg2
// m_sb_size_lg2 <= m_min_block_size + max_bit_count_lg2
{
// number of superblocks is multiple of superblock size that
// can hold min_total_alloc_size.

if ( m_min_block_size_lg2 + max_bit_count_lg2 < m_sb_size_lg2 ) {
m_min_block_size_lg2 = m_sb_size_lg2 - max_bit_count_lg2 ;
}
if ( m_min_block_size_lg2 + max_bit_count_lg2 < m_max_block_size_lg2 ) {
m_min_block_size_lg2 = m_max_block_size_lg2 - max_bit_count_lg2 ;
}
if ( m_max_block_size_lg2 < m_min_block_size_lg2 ) {
m_max_block_size_lg2 = m_min_block_size_lg2 ;
}
if ( m_sb_size_lg2 < m_max_block_size_lg2 ) {
m_sb_size_lg2 = m_max_block_size_lg2 ;
const uint64_t sb_size_mask = ( 1LU << m_sb_size_lg2 ) - 1 ;

m_sb_count = ( min_total_alloc_size + sb_size_mask ) >> m_sb_size_lg2 ;
}

// At least 32 minimum size blocks in a superblock
{
// Any superblock can be assigned to the smallest size block
// Size the block bitset to maximum number of blocks

if ( m_sb_size_lg2 < m_min_block_size_lg2 + 5 ) {
m_sb_size_lg2 = m_min_block_size_lg2 + 5 ;
const uint32_t max_block_count_lg2 =
m_sb_size_lg2 - m_min_block_size_lg2 ;

m_sb_state_size =
( CB::buffer_bound_lg2( max_block_count_lg2 ) + int_align_mask ) & ~int_align_mask ;
}

// number of superblocks is multiple of superblock size that
// can hold min_total_alloc_size.

const uint32_t sb_size_mask = ( 1u << m_sb_size_lg2 ) - 1 ;

m_sb_count = ( min_total_alloc_size + sb_size_mask ) >> m_sb_size_lg2 ;

// Any superblock can be assigned to the smallest size block
// Size the block bitset to maximum number of blocks

const uint32_t max_block_count_lg2 =
m_sb_size_lg2 - m_min_block_size_lg2 ;

m_sb_state_size =
( CB::buffer_bound_lg2( max_block_count_lg2 ) + int_align_mask ) & ~int_align_mask ;

// Array of all superblock states

const size_t all_sb_state_size =
@ -454,7 +483,7 @@ private:
* Restrict lower bound to minimum block size.
*/
KOKKOS_FORCEINLINE_FUNCTION
unsigned get_block_size_lg2( unsigned n ) const noexcept
uint32_t get_block_size_lg2( uint32_t n ) const noexcept
{
const unsigned i = Kokkos::Impl::integral_power_of_two_that_contains( n );

@ -463,11 +492,12 @@ private:

public:

/* Return 0 for invalid block size */
KOKKOS_INLINE_FUNCTION
uint32_t allocate_block_size( uint32_t alloc_size ) const noexcept
uint32_t allocate_block_size( uint64_t alloc_size ) const noexcept
{
return alloc_size <= (1UL << m_max_block_size_lg2)
? ( 1u << get_block_size_lg2( alloc_size ) )
? ( 1UL << get_block_size_lg2( uint32_t(alloc_size) ) )
: 0 ;
}

@ -485,246 +515,253 @@ public:
void * allocate( size_t alloc_size
, int32_t attempt_limit = 1 ) const noexcept
{
if ( size_t(1LU << m_max_block_size_lg2) < alloc_size ) {
Kokkos::abort("Kokkos MemoryPool allocation request exceeded specified maximum allocation size");
}

if ( 0 == alloc_size ) return (void*) 0 ;

void * p = 0 ;

const uint32_t block_size_lg2 = get_block_size_lg2( alloc_size );

if ( block_size_lg2 <= m_max_block_size_lg2 ) {
// Allocation will fit within a superblock
// that has block sizes ( 1 << block_size_lg2 )

// Allocation will fit within a superblock
// that has block sizes ( 1 << block_size_lg2 )
const uint32_t block_count_lg2 = m_sb_size_lg2 - block_size_lg2 ;
const uint32_t block_state = block_count_lg2 << state_shift ;
const uint32_t block_count = 1u << block_count_lg2 ;

const uint32_t block_count_lg2 = m_sb_size_lg2 - block_size_lg2 ;
const uint32_t block_state = block_count_lg2 << state_shift ;
const uint32_t block_count = 1u << block_count_lg2 ;
// Superblock hints for this block size:
// hint_sb_id_ptr[0] is the dynamically changing hint
// hint_sb_id_ptr[1] is the static start point

// Superblock hints for this block size:
// hint_sb_id_ptr[0] is the dynamically changing hint
// hint_sb_id_ptr[1] is the static start point
volatile uint32_t * const hint_sb_id_ptr
= m_sb_state_array /* memory pool state array */
+ m_hint_offset /* offset to hint portion of array */
+ HINT_PER_BLOCK_SIZE /* number of hints per block size */
* ( block_size_lg2 - m_min_block_size_lg2 ); /* block size id */

volatile uint32_t * const hint_sb_id_ptr
= m_sb_state_array /* memory pool state array */
+ m_hint_offset /* offset to hint portion of array */
+ HINT_PER_BLOCK_SIZE /* number of hints per block size */
* ( block_size_lg2 - m_min_block_size_lg2 ); /* block size id */
const int32_t sb_id_begin = int32_t( hint_sb_id_ptr[1] );

const int32_t sb_id_begin = int32_t( hint_sb_id_ptr[1] );
// Fast query clock register 'tic' to pseudo-randomize
// the guess for which block within a superblock should
// be claimed. If not available then a search occurs.

// Fast query clock register 'tic' to pseudo-randomize
// the guess for which block within a superblock should
// be claimed. If not available then a search occurs.

const uint32_t block_id_hint =
(uint32_t)( Kokkos::Impl::clock_tic()
const uint32_t block_id_hint =
(uint32_t)( Kokkos::Impl::clock_tic()
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_CUDA )
// Spread out potentially concurrent access
// by threads within a warp or thread block.
+ ( threadIdx.x + blockDim.x * threadIdx.y )
// Spread out potentially concurrent access
// by threads within a warp or thread block.
+ ( threadIdx.x + blockDim.x * threadIdx.y )
#endif
);
);

// expected state of superblock for allocation
uint32_t sb_state = block_state ;
// expected state of superblock for allocation
uint32_t sb_state = block_state ;

int32_t sb_id = -1 ;
int32_t sb_id = -1 ;

volatile uint32_t * sb_state_array = 0 ;
volatile uint32_t * sb_state_array = 0 ;

while ( attempt_limit ) {
while ( attempt_limit ) {

int32_t hint_sb_id = -1 ;
int32_t hint_sb_id = -1 ;

if ( sb_id < 0 ) {
if ( sb_id < 0 ) {

// No superblock specified, try the hint for this block size
// No superblock specified, try the hint for this block size

sb_id = hint_sb_id = int32_t( *hint_sb_id_ptr );
sb_id = hint_sb_id = int32_t( *hint_sb_id_ptr );

sb_state_array = m_sb_state_array + ( sb_id * m_sb_state_size );
}

// Require:
// 0 <= sb_id
// sb_state_array == m_sb_state_array + m_sb_state_size * sb_id

if ( sb_state == ( state_header_mask & *sb_state_array ) ) {

// This superblock state is as expected, for the moment.
// Attempt to claim a bit. The attempt updates the state
// so have already made sure the state header is as expected.

const uint32_t count_lg2 = sb_state >> state_shift ;
const uint32_t mask = ( 1u << count_lg2 ) - 1 ;

const Kokkos::pair<int,int> result =
CB::acquire_bounded_lg2( sb_state_array
, count_lg2
, block_id_hint & mask
, sb_state
);

// If result.first < 0 then failed to acquire
// due to either full or buffer was wrong state.
// Could be wrong state if a deallocation raced the
// superblock to empty before the acquire could succeed.

if ( 0 <= result.first ) { // acquired a bit

const uint32_t size_lg2 = m_sb_size_lg2 - count_lg2 ;

// Set the allocated block pointer

p = ((char*)( m_sb_state_array + m_data_offset ))
+ ( uint64_t(sb_id) << m_sb_size_lg2 ) // superblock memory
+ ( uint64_t(result.first) << size_lg2 ); // block memory

#if 0
printf( " MemoryPool(0x%lx) pointer(0x%lx) allocate(%lu) sb_id(%d) sb_state(0x%x) block_size(%d) block_capacity(%d) block_id(%d) block_claimed(%d)\n"
, (uintptr_t)m_sb_state_array
, (uintptr_t)p
, alloc_size
, sb_id
, sb_state
, (1u << size_lg2)
, (1u << count_lg2)
, result.first
, result.second );
#endif

break ; // Success
}
}
//------------------------------------------------------------------
// Arrive here if failed to acquire a block.
// Must find a new superblock.

// Start searching at designated index for this block size.
// Look for superblock that, in preferential order,
// 1) part-full superblock of this block size
// 2) empty superblock to claim for this block size
// 3) part-full superblock of the next larger block size

sb_state = block_state ; // Expect to find the desired state
sb_id = -1 ;

bool update_hint = false ;
int32_t sb_id_empty = -1 ;
int32_t sb_id_large = -1 ;
uint32_t sb_state_large = 0 ;

sb_state_array = m_sb_state_array + sb_id_begin * m_sb_state_size ;

for ( int32_t i = 0 , id = sb_id_begin ; i < m_sb_count ; ++i ) {

// Query state of the candidate superblock.
// Note that the state may change at any moment
// as concurrent allocations and deallocations occur.

const uint32_t full_state = *sb_state_array ;
const uint32_t used = full_state & state_used_mask ;
const uint32_t state = full_state & state_header_mask ;

if ( state == block_state ) {

// Superblock is assigned to this block size

if ( used < block_count ) {

// There is room to allocate one block

sb_id = id ;

// Is there room to allocate more than one block?

update_hint = used + 1 < block_count ;

break ;
}
}
else if ( 0 == used ) {

// Superblock is empty

if ( -1 == sb_id_empty ) {

// Superblock is not assigned to this block size
// and is the first empty superblock encountered.
// Save this id to use if a partfull superblock is not found.

sb_id_empty = id ;
}
}
else if ( ( -1 == sb_id_empty /* have not found an empty */ ) &&
( -1 == sb_id_large /* have not found a larger */ ) &&
( state < block_state /* a larger block */ ) &&
// is not full:
( used < ( 1u << ( state >> state_shift ) ) ) ) {
// First superblock encountered that is
// larger than this block size and
// has room for an allocation.
// Save this id to use of partfull or empty superblock not found
sb_id_large = id ;
sb_state_large = state ;
}

// Iterate around the superblock array:

if ( ++id < m_sb_count ) {
sb_state_array += m_sb_state_size ;
}
else {
id = 0 ;
sb_state_array = m_sb_state_array ;
}
}

// printf(" search m_sb_count(%d) sb_id(%d) sb_id_empty(%d) sb_id_large(%d)\n" , m_sb_count , sb_id , sb_id_empty , sb_id_large);

if ( sb_id < 0 ) {

// Did not find a partfull superblock for this block size.

if ( 0 <= sb_id_empty ) {

// Found first empty superblock following designated superblock
// Attempt to claim it for this block size.
// If the claim fails assume that another thread claimed it
// for this block size and try to use it anyway,
// but do not update hint.

sb_id = sb_id_empty ;

sb_state_array = m_sb_state_array + ( sb_id * m_sb_state_size );

// If successfully changed assignment of empty superblock 'sb_id'
// to this block_size then update the hint.

const uint32_t state_empty = state_header_mask & *sb_state_array ;

// If this thread claims the empty block then update the hint
update_hint =
state_empty ==
Kokkos::atomic_compare_exchange
(sb_state_array,state_empty,block_state);
}
else if ( 0 <= sb_id_large ) {

// Found a larger superblock with space available

sb_id = sb_id_large ;
sb_state = sb_state_large ;

sb_state_array = m_sb_state_array + ( sb_id * m_sb_state_size );
}

// Require:
// 0 <= sb_id
// sb_state_array == m_sb_state_array + m_sb_state_size * sb_id

if ( sb_state == ( state_header_mask & *sb_state_array ) ) {

// This superblock state is as expected, for the moment.
// Attempt to claim a bit. The attempt updates the state
// so have already made sure the state header is as expected.

const uint32_t count_lg2 = sb_state >> state_shift ;
const uint32_t mask = ( 1u << count_lg2 ) - 1 ;

const Kokkos::pair<int,int> result =
CB::acquire_bounded_lg2( sb_state_array
, count_lg2
, block_id_hint & mask
, sb_state
);

// If result.first < 0 then failed to acquire
// due to either full or buffer was wrong state.
// Could be wrong state if a deallocation raced the
// superblock to empty before the acquire could succeed.

if ( 0 <= result.first ) { // acquired a bit

const uint32_t size_lg2 = m_sb_size_lg2 - count_lg2 ;

// Set the allocated block pointer

p = ((char*)( m_sb_state_array + m_data_offset ))
+ ( uint32_t(sb_id) << m_sb_size_lg2 ) // superblock memory
+ ( result.first << size_lg2 ); // block memory

break ; // Success
}

// printf(" acquire count_lg2(%d) sb_state(0x%x) sb_id(%d) result(%d,%d)\n" , count_lg2 , sb_state , sb_id , result.first , result.second );

else {
// Did not find a potentially usable superblock
--attempt_limit ;
}
//------------------------------------------------------------------
// Arrive here if failed to acquire a block.
// Must find a new superblock.
}

// Start searching at designated index for this block size.
// Look for superblock that, in preferential order,
// 1) part-full superblock of this block size
// 2) empty superblock to claim for this block size
// 3) part-full superblock of the next larger block size

sb_state = block_state ; // Expect to find the desired state
sb_id = -1 ;

bool update_hint = false ;
int32_t sb_id_empty = -1 ;
int32_t sb_id_large = -1 ;
uint32_t sb_state_large = 0 ;

sb_state_array = m_sb_state_array + sb_id_begin * m_sb_state_size ;

for ( int32_t i = 0 , id = sb_id_begin ; i < m_sb_count ; ++i ) {

// Query state of the candidate superblock.
// Note that the state may change at any moment
// as concurrent allocations and deallocations occur.

const uint32_t full_state = *sb_state_array ;
const uint32_t used = full_state & state_used_mask ;
const uint32_t state = full_state & state_header_mask ;

if ( state == block_state ) {

// Superblock is assigned to this block size

if ( used < block_count ) {

// There is room to allocate one block

sb_id = id ;

// Is there room to allocate more than one block?

update_hint = used + 1 < block_count ;

break ;
}
}
else if ( 0 == used ) {

// Superblock is empty

if ( -1 == sb_id_empty ) {

// Superblock is not assigned to this block size
// and is the first empty superblock encountered.
// Save this id to use if a partfull superblock is not found.

sb_id_empty = id ;
}
}
else if ( ( -1 == sb_id_empty /* have not found an empty */ ) &&
( -1 == sb_id_large /* have not found a larger */ ) &&
( state < block_state /* a larger block */ ) &&
// is not full:
( used < ( 1u << ( state >> state_shift ) ) ) ) {
// First superblock encountered that is
// larger than this block size and
// has room for an allocation.
// Save this id to use of partfull or empty superblock not found
sb_id_large = id ;
sb_state_large = state ;
}

// Iterate around the superblock array:

if ( ++id < m_sb_count ) {
sb_state_array += m_sb_state_size ;
}
else {
id = 0 ;
sb_state_array = m_sb_state_array ;
}
}

// printf(" search m_sb_count(%d) sb_id(%d) sb_id_empty(%d) sb_id_large(%d)\n" , m_sb_count , sb_id , sb_id_empty , sb_id_large);

if ( sb_id < 0 ) {

// Did not find a partfull superblock for this block size.

if ( 0 <= sb_id_empty ) {

// Found first empty superblock following designated superblock
// Attempt to claim it for this block size.
// If the claim fails assume that another thread claimed it
// for this block size and try to use it anyway,
// but do not update hint.

sb_id = sb_id_empty ;

sb_state_array = m_sb_state_array + ( sb_id * m_sb_state_size );

// If successfully changed assignment of empty superblock 'sb_id'
// to this block_size then update the hint.

const uint32_t state_empty = state_header_mask & *sb_state_array ;

// If this thread claims the empty block then update the hint
update_hint =
state_empty ==
Kokkos::atomic_compare_exchange
(sb_state_array,state_empty,block_state);
}
else if ( 0 <= sb_id_large ) {

// Found a larger superblock with space available

sb_id = sb_id_large ;
sb_state = sb_state_large ;

sb_state_array = m_sb_state_array + ( sb_id * m_sb_state_size );
}
else {
// Did not find a potentially usable superblock
--attempt_limit ;
}
}

if ( update_hint ) {
Kokkos::atomic_compare_exchange
( hint_sb_id_ptr , uint32_t(hint_sb_id) , uint32_t(sb_id) );
}
} // end allocation attempt loop

//--------------------------------------------------------------------
}
else {
Kokkos::abort("Kokkos MemoryPool allocation request exceeded specified maximum allocation size");
}
if ( update_hint ) {
Kokkos::atomic_compare_exchange
( hint_sb_id_ptr , uint32_t(hint_sb_id) , uint32_t(sb_id) );
}
} // end allocation attempt loop
//--------------------------------------------------------------------

return p ;
}
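
// [Editorial sketch -- not part of the commit.] allocate() may be called
// concurrently from kernels; after 'attempt_limit' failed passes over the
// superblocks it returns the null pointer, so callers must check the result.
// 'pool_type' is a hypothetical alias for a MemoryPool as constructed above:
//
//   KOKKOS_INLINE_FUNCTION void scratch_use( const pool_type & pool )
//   {
//     void * const p = pool.allocate( 256 /* bytes */ , 4 /* attempts */ );
//     if ( p ) {
//       /* ... use the block ... */
//       pool.deallocate( p , 256 );
//     }
//   }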
@ -765,7 +802,7 @@ public:
const uint32_t block_size_lg2 =
m_sb_size_lg2 - ( block_state >> state_shift );

ok_block_aligned = 0 == ( d & ( ( 1 << block_size_lg2 ) - 1 ) );
ok_block_aligned = 0 == ( d & ( ( 1UL << block_size_lg2 ) - 1 ) );

if ( ok_block_aligned ) {

@ -773,31 +810,70 @@ public:
// mask into superblock and then shift down for block index

const uint32_t bit =
( d & ( ptrdiff_t( 1 << m_sb_size_lg2 ) - 1 ) ) >> block_size_lg2 ;
( d & ( ptrdiff_t( 1LU << m_sb_size_lg2 ) - 1 ) ) >> block_size_lg2 ;

const int result =
CB::release( sb_state_array , bit , block_state );

ok_dealloc_once = 0 <= result ;

// printf(" deallocate from sb_id(%d) result(%d) bit(%d) state(0x%x)\n"
// , sb_id
// , result
// , uint32_t(d >> block_size_lg2)
// , *sb_state_array );

#if 0
printf( " MemoryPool(0x%lx) pointer(0x%lx) deallocate sb_id(%d) block_size(%d) block_capacity(%d) block_id(%d) block_claimed(%d)\n"
, (uintptr_t)m_sb_state_array
, (uintptr_t)p
, sb_id
, (1u << block_size_lg2)
, (1u << (m_sb_size_lg2 - block_size_lg2))
, bit
, result );
#endif
}
}

if ( ! ok_contains || ! ok_block_aligned || ! ok_dealloc_once ) {
#if 0
printf("Kokkos MemoryPool deallocate(0x%lx) contains(%d) block_aligned(%d) dealloc_once(%d)\n",(uintptr_t)p,ok_contains,ok_block_aligned,ok_dealloc_once);
printf( " MemoryPool(0x%lx) pointer(0x%lx) deallocate ok_contains(%d) ok_block_aligned(%d) ok_dealloc_once(%d)\n"
, (uintptr_t)m_sb_state_array
, (uintptr_t)p
, int(ok_contains)
, int(ok_block_aligned)
, int(ok_dealloc_once) );
#endif
Kokkos::abort("Kokkos MemoryPool::deallocate given erroneous pointer");
}
}
// end deallocate
//--------------------------------------------------------------------------

KOKKOS_INLINE_FUNCTION
int number_of_superblocks() const noexcept { return m_sb_count ; }

KOKKOS_INLINE_FUNCTION
void superblock_state( int sb_id
, int & block_size
, int & block_count_capacity
, int & block_count_used ) const noexcept
{
block_size = 0 ;
block_count_capacity = 0 ;
block_count_used = 0 ;

if ( Kokkos::Impl::MemorySpaceAccess
< Kokkos::Impl::ActiveExecutionMemorySpace
, base_memory_space >::accessible ) {
// Can access the state array

const uint32_t state =
((uint32_t volatile *)m_sb_state_array)[sb_id*m_sb_state_size];

const uint32_t block_count_lg2 = state >> state_shift ;
const uint32_t block_used = state & state_used_mask ;

block_size = 1LU << ( m_sb_size_lg2 - block_count_lg2 );
block_count_capacity = 1LU << block_count_lg2 ;
block_count_used = block_used ;
}
}
};

} // namespace Kokkos
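
// [Editorial sketch -- not part of the commit.] The introspection hooks added
// above permit a host-side usage tally whenever the state array is accessible
// from the host; all member names come directly from the header above:
//
//   template< typename PoolType >
//   size_t used_bytes( const PoolType & pool )
//   {
//     size_t used = 0 ;
//     for ( int i = 0 ; i < pool.number_of_superblocks() ; ++i ) {
//       int size = 0 , capacity = 0 , count = 0 ;
//       pool.superblock_state( i , size , capacity , count );
//       used += size_t(size) * size_t(count);
//     }
//     return used ;
//   }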

@ -97,26 +97,22 @@ typedef Kokkos::MemoryTraits< Kokkos::Unmanaged | Kokkos::RandomAccess > MemoryR
namespace Kokkos {
namespace Impl {

static_assert(
( 0 < int(KOKKOS_MEMORY_ALIGNMENT) ) &&
( 0 == ( int(KOKKOS_MEMORY_ALIGNMENT) & (int(KOKKOS_MEMORY_ALIGNMENT)-1))) ,
"KOKKOS_MEMORY_ALIGNMENT must be a power of two" );

/** \brief Memory alignment settings
*
* Sets global value for memory alignment. Must be a power of two!
* Enable compatibility of views from different devices with static stride.
* Use compiler flag to enable overwrites.
*/
enum { MEMORY_ALIGNMENT =
#if defined( KOKKOS_MEMORY_ALIGNMENT )
( 1 << Kokkos::Impl::integral_power_of_two( KOKKOS_MEMORY_ALIGNMENT ) )
#else
( 1 << Kokkos::Impl::integral_power_of_two( 128 ) )
#endif
#if defined( KOKKOS_MEMORY_ALIGNMENT_THRESHOLD )
enum : unsigned
{ MEMORY_ALIGNMENT = KOKKOS_MEMORY_ALIGNMENT
, MEMORY_ALIGNMENT_THRESHOLD = KOKKOS_MEMORY_ALIGNMENT_THRESHOLD
#else
, MEMORY_ALIGNMENT_THRESHOLD = 4
#endif
};


} //namespace Impl
} // namespace Kokkos


@ -204,8 +204,8 @@ struct VerifyExecutionCanAccessMemorySpace
>
{
enum { value = false };
inline static void verify( void ) { Experimental::ROCmSpace::access_error(); }
inline static void verify( const void * p ) { Experimental::ROCmSpace::access_error(p); }
inline static void verify( void ) { Kokkos::Experimental::ROCmSpace::access_error(); }
inline static void verify( const void * p ) { Kokkos::Experimental::ROCmSpace::access_error(p); }
};
} // namespace Experimental
} // namespace Kokkos

@ -619,16 +619,16 @@ namespace Impl {

template< class FunctorType , class ... Traits >
class ParallelFor< FunctorType ,
Kokkos::Experimental::MDRangePolicy< Traits ... > ,
Kokkos::MDRangePolicy< Traits ... > ,
Kokkos::Serial
>
{
private:

typedef Kokkos::Experimental::MDRangePolicy< Traits ... > MDRangePolicy ;
typedef Kokkos::MDRangePolicy< Traits ... > MDRangePolicy ;
typedef typename MDRangePolicy::impl_range_policy Policy ;

typedef typename Kokkos::Experimental::Impl::HostIterateTile< MDRangePolicy, FunctorType, typename MDRangePolicy::work_tag, void > iterate_type;
typedef typename Kokkos::Impl::HostIterateTile< MDRangePolicy, FunctorType, typename MDRangePolicy::work_tag, void > iterate_type;

const FunctorType m_functor ;
const MDRangePolicy m_mdr_policy ;
@ -661,14 +661,14 @@ public:

template< class FunctorType , class ReducerType , class ... Traits >
class ParallelReduce< FunctorType
, Kokkos::Experimental::MDRangePolicy< Traits ... >
, Kokkos::MDRangePolicy< Traits ... >
, ReducerType
, Kokkos::Serial
>
{
private:

typedef Kokkos::Experimental::MDRangePolicy< Traits ... > MDRangePolicy ;
typedef Kokkos::MDRangePolicy< Traits ... > MDRangePolicy ;
typedef typename MDRangePolicy::impl_range_policy Policy ;

typedef typename MDRangePolicy::work_tag WorkTag ;
@ -686,7 +686,7 @@ private:
typedef typename Analysis::reference_type reference_type ;


using iterate_type = typename Kokkos::Experimental::Impl::HostIterateTile< MDRangePolicy
using iterate_type = typename Kokkos::Impl::HostIterateTile< MDRangePolicy
, FunctorType
, WorkTag
, ValueType

@ -408,7 +408,7 @@ view_alloc( Args const & ... args )
}

template< class ... Args >
inline
KOKKOS_INLINE_FUNCTION
Impl::ViewCtorProp< typename Impl::ViewCtorProp< void , Args >::type ... >
view_wrap( Args const & ... args )
{
@ -1216,6 +1216,13 @@ public:
m_track.assign_allocated_record_to_uninitialized( record );
}

KOKKOS_INLINE_FUNCTION
void assign_data( pointer_type arg_data )
{
m_track.clear();
m_map.assign_data( arg_data );
}

// Wrap memory according to properties and array layout
template< class ... P >
explicit KOKKOS_INLINE_FUNCTION
@ -2235,6 +2242,29 @@ create_mirror_view(const Space& , const Kokkos::View<T,P...> & src
return typename Impl::MirrorViewType<Space,T,P ...>::view_type(src.label(),src.layout());
}

// Create a mirror view and deep_copy in a new space (specialization for same space)
template<class Space, class T, class ... P>
typename Impl::MirrorViewType<Space,T,P ...>::view_type
create_mirror_view_and_copy(const Space& , const Kokkos::View<T,P...> & src
, std::string const& name = ""
, typename std::enable_if<Impl::MirrorViewType<Space,T,P ...>::is_same_memspace>::type* = 0 ) {
(void)name;
return src;
}

// Create a mirror view and deep_copy in a new space (specialization for different space)
template<class Space, class T, class ... P>
typename Impl::MirrorViewType<Space,T,P ...>::view_type
create_mirror_view_and_copy(const Space& , const Kokkos::View<T,P...> & src
, std::string const& name = ""
, typename std::enable_if<!Impl::MirrorViewType<Space,T,P ...>::is_same_memspace>::type* = 0 ) {
using Mirror = typename Impl::MirrorViewType<Space,T,P ...>::view_type;
std::string label = name.empty() ? src.label() : name;
auto mirror = Mirror(ViewAllocateWithoutInitializing(label), src.layout());
deep_copy(mirror, src);
return mirror;
}

} /* namespace Kokkos */

//----------------------------------------------------------------------------
@ -2432,6 +2462,7 @@ struct CommonViewAllocProp< void, ValueType >
using scalar_array_type = ValueType;

template < class ... Views >
KOKKOS_INLINE_FUNCTION
CommonViewAllocProp( const Views & ... ) {}
};

@ -2499,6 +2530,7 @@ using DeducedCommonPropsType = typename Impl::DeduceCommonViewAllocProp<Views...

// User function
template < class ... Views >
KOKKOS_INLINE_FUNCTION
DeducedCommonPropsType<Views...>
common_view_alloc_prop( Views const & ... views )
{

@ -46,205 +46,198 @@

namespace Kokkos {
namespace Impl {
namespace Experimental {

template< class functor_type , class execution_space, class ... policy_args >
class WorkGraphExec;

}}} // namespace Kokkos::Impl::Experimental
}} // namespace Kokkos::Impl

namespace Kokkos {
namespace Experimental {

template< class ... Properties >
class WorkGraphPolicy
{
public:

using self_type = WorkGraphPolicy<Properties ... >;
using traits = Kokkos::Impl::PolicyTraits<Properties ... >;
using index_type = typename traits::index_type;
using self_type = WorkGraphPolicy<Properties ... >;
using traits = Kokkos::Impl::PolicyTraits<Properties ... >;
using index_type = typename traits::index_type;
using member_type = index_type;
using work_tag = typename traits::work_tag;
using execution_space = typename traits::execution_space;
using work_tag = typename traits::work_tag;
using memory_space = typename execution_space::memory_space;
using graph_type = Kokkos::Experimental::Crs<index_type, execution_space, void, index_type>;
using member_type = index_type;
using memory_space = typename execution_space::memory_space;
using graph_type = Kokkos::Crs<index_type,execution_space,void,index_type>;

enum : std::int32_t {
END_TOKEN = -1 ,
BEGIN_TOKEN = -2 ,
COMPLETED_TOKEN = -3 };

private:

graph_type m_graph;

using ints_type = Kokkos::View<std::int32_t*, memory_space>;
using range_type = Kokkos::pair<std::int32_t, std::int32_t>;
using ranges_type = Kokkos::View<range_type*, memory_space>;
const std::int32_t m_total_work;
ints_type m_counts;
ints_type m_queue;
ranges_type m_ranges;

// Let N = m_graph.numRows(), the total work
// m_queue[ 0 .. N-1] = the ready queue
// m_queue[ N .. 2*N-1] = the waiting queue counts
// m_queue[2*N .. 2*N+2] = the ready queue hints

graph_type const m_graph;
ints_type m_queue ;

KOKKOS_INLINE_FUNCTION
void push_work( const std::int32_t w ) const noexcept
{
const std::int32_t N = m_graph.numRows();

std::int32_t volatile * const ready_queue = & m_queue[0] ;
std::int32_t volatile * const end_hint = & m_queue[2*N+1] ;

// Push work to end of queue
const std::int32_t j = atomic_fetch_add( end_hint , 1 );

if ( ( N <= j ) ||
( END_TOKEN != atomic_exchange(ready_queue+j,w) ) ) {
// ERROR: past the end of queue or did not replace END_TOKEN
Kokkos::abort("WorkGraphPolicy push_work error");
}

memory_fence();
}

public:

struct TagZeroRanges {};
/**\brief Attempt to pop the work item at the head of the queue.
*
* Find entry 'i' such that
* ( m_queue[i] != BEGIN_TOKEN ) AND
* ( i == 0 OR m_queue[i-1] == BEGIN_TOKEN )
* if found then
* increment begin hint
* return atomic_exchange( m_queue[i] , BEGIN_TOKEN )
* else if i < total work
* return END_TOKEN
* else
* return COMPLETED_TOKEN
*
*/
KOKKOS_INLINE_FUNCTION
void operator()(TagZeroRanges, std::int32_t i) const {
m_ranges[i] = range_type(0, 0);
}
void zero_ranges() {
using policy_type = RangePolicy<std::int32_t, execution_space, TagZeroRanges>;
using closure_type = Kokkos::Impl::ParallelFor<self_type, policy_type>;
const closure_type closure(*this, policy_type(0, 1));
closure.execute();
execution_space::fence();
}
std::int32_t pop_work() const noexcept
{
const std::int32_t N = m_graph.numRows();

struct TagFillQueue {};
KOKKOS_INLINE_FUNCTION
void operator()(TagFillQueue, std::int32_t i) const {
if (*((volatile std::int32_t*)(&m_counts(i))) == 0) push_work(i);
}
void fill_queue() {
using policy_type = RangePolicy<std::int32_t, execution_space, TagFillQueue>;
using closure_type = Kokkos::Impl::ParallelFor<self_type, policy_type>;
const closure_type closure(*this, policy_type(0, m_total_work));
closure.execute();
execution_space::fence();
}
std::int32_t volatile * const ready_queue = & m_queue[0] ;
std::int32_t volatile * const begin_hint = & m_queue[2*N] ;

private:
// begin hint is guaranteed to be less than or equal to
// actual begin location in the queue.

inline
void setup() {
if (m_graph.numRows() > std::numeric_limits<std::int32_t>::max()) {
Kokkos::abort("WorkGraphPolicy work must be indexable using int32_t");
}
get_crs_transpose_counts(m_counts, m_graph);
m_queue = ints_type(ViewAllocateWithoutInitializing("queue"), m_total_work);
deep_copy(m_queue, std::int32_t(-1));
m_ranges = ranges_type("ranges", 1);
fill_queue();
}
for ( std::int32_t i = *begin_hint ; i < N ; ++i ) {

KOKKOS_INLINE_FUNCTION
std::int32_t pop_work() const {
range_type w(-1,-1);
while (true) {
const range_type w_new( w.first + 1 , w.second );
w = atomic_compare_exchange( &m_ranges(0) , w , w_new );
if ( w.first < w.second ) { // there was work in the queue
if ( w_new.first == w.first + 1 && w_new.second == w.second ) {
// we got a work item
std::int32_t i;
// the push_work function may have incremented the end counter
// but not yet written the work index into the queue.
// wait until the entry is valid.
while ( -1 == ( i = *((volatile std::int32_t*)(&m_queue( w.first ))) ) );
return i;
} // we got a work item
} else { // there was no work in the queue
#ifdef KOKKOS_DEBUG
if ( w_new.first == w.first + 1 && w_new.second == w.second ) {
Kokkos::abort("bug in pop_work");
const std::int32_t w = ready_queue[i] ;

if ( w == END_TOKEN ) { return END_TOKEN ; }

if ( ( w != BEGIN_TOKEN ) &&
( w == atomic_compare_exchange(ready_queue+i,w,BEGIN_TOKEN) ) ) {
// Attempt to claim ready work index succeeded,
// update the hint and return work index
atomic_increment( begin_hint );
return w ;
}
#endif
if (w.first == m_total_work) { // all work is done
return -1;
} else { // need to wait for more work to be pushed
// take a guess that one work item will be pushed
// the key thing is we can't leave (w) alone, because
// otherwise the next compare_exchange may succeed in
// popping work from an empty queue
w.second++;
}
} // there was no work in the queue
} // while (true)
}
// arrive here when ready_queue[i] == BEGIN_TOKEN
}

return COMPLETED_TOKEN ;
}


KOKKOS_INLINE_FUNCTION
void push_work(std::int32_t i) const {
range_type w(-1,-1);
while (true) {
const range_type w_new( w.first , w.second + 1 );
// try to increment the end counter
w = atomic_compare_exchange( &m_ranges(0) , w , w_new );
// stop trying if the increment was successful
if ( w.first == w_new.first && w.second + 1 == w_new.second ) break;
void completed_work( std::int32_t w ) const noexcept
{
Kokkos::memory_fence();

// Make sure the completed work function's memory accesses are flushed.

const std::int32_t N = m_graph.numRows();

std::int32_t volatile * const count_queue = & m_queue[N] ;

const std::int32_t B = m_graph.row_map(w);
const std::int32_t E = m_graph.row_map(w+1);

for ( std::int32_t i = B ; i < E ; ++i ) {
const std::int32_t j = m_graph.entries(i);
if ( 1 == atomic_fetch_add(count_queue+j,-1) ) {
push_work(j);
}
}
}
// write the work index into the claimed spot in the queue
*((volatile std::int32_t*)(&m_queue( w.second ))) = i;
// push this write out into the memory system
memory_fence();
}

template< class functor_type , class execution_space, class ... policy_args >
friend class Kokkos::Impl::Experimental::WorkGraphExec;
struct TagInit {};
struct TagCount {};
struct TagReady {};

public:
/**\brief Initialize queue
*
* m_queue[0..N-1] = END_TOKEN, the ready queue
* m_queue[N..2*N-1] = 0, the waiting count queue
* m_queue[2*N..2*N+1] = 0, begin/end hints for ready queue
*/
KOKKOS_INLINE_FUNCTION
void operator()( const TagInit , int i ) const noexcept
{ m_queue[i] = i < m_graph.numRows() ? END_TOKEN : 0 ; }

WorkGraphPolicy(graph_type arg_graph)
KOKKOS_INLINE_FUNCTION
void operator()( const TagCount , int i ) const noexcept
{
std::int32_t volatile * const count_queue =
& m_queue[ m_graph.numRows() ] ;

atomic_increment( count_queue + m_graph.entries[i] );
}

KOKKOS_INLINE_FUNCTION
void operator()( const TagReady , int w ) const noexcept
{
std::int32_t const * const count_queue =
& m_queue[ m_graph.numRows() ] ;

if ( 0 == count_queue[w] ) push_work(w);
}

WorkGraphPolicy( const graph_type & arg_graph )
: m_graph(arg_graph)
, m_total_work( arg_graph.numRows() )
, m_queue( view_alloc( "queue" , WithoutInitializing )
, arg_graph.numRows() * 2 + 2 )
{
setup();
}
{ // Initialize
using policy_type = RangePolicy<std::int32_t, execution_space, TagInit>;
using closure_type = Kokkos::Impl::ParallelFor<self_type, policy_type>;
|
||||
const closure_type closure(*this, policy_type(0, m_queue.size()));
|
||||
closure.execute();
|
||||
execution_space::fence();
|
||||
}
|
||||
|
||||
};
|
||||
{ // execute-after counts
|
||||
using policy_type = RangePolicy<std::int32_t, execution_space, TagCount>;
|
||||
using closure_type = Kokkos::Impl::ParallelFor<self_type, policy_type>;
|
||||
const closure_type closure(*this,policy_type(0,m_graph.entries.size()));
|
||||
closure.execute();
|
||||
execution_space::fence();
|
||||
}
|
||||
|
||||
}} // namespace Kokkos::Experimental
|
||||
|
||||
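The pop_work/push_work pair above implements a single queue guarded by a packed
(begin, end) counter range: push claims the next end slot and then publishes the
work index into it, while pop advances begin by compare-exchange and may observe
a claimed-but-not-yet-written slot, which it spins on. A minimal standalone
sketch of that protocol, with std::atomic standing in for the Kokkos atomics
(all names here are illustrative, not Kokkos API):

#include <atomic>
#include <cstdint>
#include <vector>

struct RangeQueue {
  // begin in the low 32 bits, end in the high 32 bits,
  // so both counters move under a single compare-exchange
  std::atomic<std::uint64_t> range{0};
  std::vector<std::atomic<std::int32_t>> slots;

  explicit RangeQueue(std::size_t n) : slots(n) {
    for (auto& s : slots) s.store(-1, std::memory_order_relaxed);
  }

  void push(std::int32_t w) {
    // claim the next end slot, then publish the work index into it
    std::uint64_t r = range.load();
    while (!range.compare_exchange_weak(r, r + (std::uint64_t(1) << 32))) {}
    slots[std::uint32_t(r >> 32)].store(w, std::memory_order_release);
  }

  std::int32_t pop() {  // returns -1 while the queue is empty
    std::uint64_t r = range.load();
    for (;;) {
      const std::uint32_t begin = std::uint32_t(r);
      const std::uint32_t end   = std::uint32_t(r >> 32);
      if (begin == end) return -1;  // nothing claimable right now
      if (range.compare_exchange_weak(r, r + 1)) {
        // claimed slot 'begin'; wait until the producer's store lands
        std::int32_t w;
        while (-1 == (w = slots[begin].load(std::memory_order_acquire))) {}
        return w;
      }
    }
  }
};
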
/*--------------------------------------------------------------------------*/

/*--------------------------------------------------------------------------*/

namespace Kokkos {
namespace Impl {
namespace Experimental {

template< class functor_type , class execution_space, class ... policy_args >
class WorkGraphExec
{
public:

  using self_type    = WorkGraphExec< functor_type, execution_space, policy_args ... >;
  using policy_type  = Kokkos::Experimental::WorkGraphPolicy< policy_args ... >;
  using member_type  = typename policy_type::member_type;
  using memory_space = typename execution_space::memory_space;

protected:

  const functor_type m_functor;
  const policy_type  m_policy;

protected:

  KOKKOS_INLINE_FUNCTION
  std::int32_t before_work() const {
    return m_policy.pop_work();
  }

  KOKKOS_INLINE_FUNCTION
  void after_work(std::int32_t i) const {
    /* fence any writes that were done by the work item itself
       (usually writing its result to global memory) */
    memory_fence();
    const std::int32_t begin = m_policy.m_graph.row_map( i );
    const std::int32_t end   = m_policy.m_graph.row_map( i + 1 );
    for (std::int32_t j = begin; j < end; ++j) {
      const std::int32_t next = m_policy.m_graph.entries( j );
      const std::int32_t old_count = atomic_fetch_add( &(m_policy.m_counts(next)), -1 );
      if ( old_count == 1 ) m_policy.push_work( next );
  { // Scheduling ready tasks
    using policy_type = RangePolicy<std::int32_t, execution_space, TagReady>;
    using closure_type = Kokkos::Impl::ParallelFor<self_type, policy_type>;
    const closure_type closure(*this,policy_type(0,m_graph.numRows()));
    closure.execute();
    execution_space::fence();
  }
    }

  inline
  WorkGraphExec( const functor_type & arg_functor
               , const policy_type  & arg_policy )
    : m_functor( arg_functor )
    , m_policy( arg_policy )
  {
  }
};

}}} // namespace Kokkos::Impl::Experimental
} // namespace Kokkos

#ifdef KOKKOS_ENABLE_SERIAL
#include "impl/Kokkos_Serial_WorkGraphPolicy.hpp"

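With the Serial backend pulled in above, a work-graph dispatch looks like an
ordinary parallel_for over the policy. A hedged usage sketch (the crs graph
construction and the exact template parameters of Kokkos::Crs and
WorkGraphPolicy are assumptions; only the policy/parallel_for shape is taken
from this commit):

struct GraphFunctor {
  Kokkos::View<long*> results;
  KOKKOS_INLINE_FUNCTION
  void operator()(std::int32_t w) const {
    // the policy guarantees every predecessor of w has already run,
    // so reading their results here is safe
    results(w) = (w < 2) ? w : results(w - 1) + results(w - 2);
  }
};

// given a crs graph 'deps' whose row w lists the entries unlocked by w:
//   Kokkos::WorkGraphPolicy<Kokkos::Serial> policy(deps);
//   Kokkos::parallel_for(policy, GraphFunctor{results});
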
@ -294,7 +294,7 @@ void OpenMP::initialize( int thread_count )
}

{
  if (nullptr == std::getenv("OMP_PROC_BIND") ) {
  if ( Kokkos::show_warnings() && nullptr == std::getenv("OMP_PROC_BIND") ) {
    printf("Kokkos::OpenMP::initialize WARNING: OMP_PROC_BIND environment variable not set\n");
    printf("  In general, for best performance with OpenMP 4.0 or better set OMP_PROC_BIND=spread and OMP_PLACES=threads\n");
    printf("  For best performance with OpenMP 3.1 set OMP_PROC_BIND=true\n");
@ -327,7 +327,7 @@ void OpenMP::initialize( int thread_count )
  omp_set_num_threads(Impl::g_openmp_hardware_max_threads);
}
else {
  if( thread_count > process_num_threads ) {
  if( Kokkos::show_warnings() && thread_count > process_num_threads ) {
    printf( "Kokkos::OpenMP::initialize WARNING: You are likely oversubscribing your CPU cores.\n");
    printf( "  process threads available : %3d,  requested thread : %3d\n", process_num_threads, thread_count );
  }
@ -364,12 +364,12 @@ void OpenMP::initialize( int thread_count )


// Check for over-subscription
//if( Impl::mpi_ranks_per_node() * long(thread_count) > Impl::processors_per_node() ) {
//  std::cout << "Kokkos::OpenMP::initialize WARNING: You are likely oversubscribing your CPU cores." << std::endl;
//  std::cout << "  Detected: " << Impl::processors_per_node() << " cores per node." << std::endl;
//  std::cout << "  Detected: " << Impl::mpi_ranks_per_node() << " MPI_ranks per node." << std::endl;
//  std::cout << "  Requested: " << thread_count << " threads per process." << std::endl;
//}
if( Kokkos::show_warnings() && (Impl::mpi_ranks_per_node() * long(thread_count) > Impl::processors_per_node()) ) {
  std::cout << "Kokkos::OpenMP::initialize WARNING: You are likely oversubscribing your CPU cores." << std::endl;
  std::cout << "  Detected: " << Impl::processors_per_node() << " cores per node." << std::endl;
  std::cout << "  Detected: " << Impl::mpi_ranks_per_node() << " MPI_ranks per node." << std::endl;
  std::cout << "  Requested: " << thread_count << " threads per process." << std::endl;
}
// Init the array used for arbitrarily sized atomics
Impl::init_lock_array_host_space();


@ -170,20 +170,20 @@ public:
// MDRangePolicy impl
template< class FunctorType , class ... Traits >
class ParallelFor< FunctorType
                 , Kokkos::Experimental::MDRangePolicy< Traits ... >
                 , Kokkos::MDRangePolicy< Traits ... >
                 , Kokkos::OpenMP
                 >
{
private:

  typedef Kokkos::Experimental::MDRangePolicy< Traits ... > MDRangePolicy ;
  typedef Kokkos::MDRangePolicy< Traits ... > MDRangePolicy ;
  typedef typename MDRangePolicy::impl_range_policy Policy ;
  typedef typename MDRangePolicy::work_tag WorkTag ;

  typedef typename Policy::WorkRange WorkRange ;
  typedef typename Policy::member_type Member ;

  typedef typename Kokkos::Experimental::Impl::HostIterateTile< MDRangePolicy, FunctorType, typename MDRangePolicy::work_tag, void > iterate_type;
  typedef typename Kokkos::Impl::HostIterateTile< MDRangePolicy, FunctorType, typename MDRangePolicy::work_tag, void > iterate_type;

  OpenMPExec * m_instance ;
  const FunctorType m_functor ;
@ -445,14 +445,14 @@ public:
// MDRangePolicy impl
template< class FunctorType , class ReducerType, class ... Traits >
class ParallelReduce< FunctorType
                    , Kokkos::Experimental::MDRangePolicy< Traits ...>
                    , Kokkos::MDRangePolicy< Traits ...>
                    , ReducerType
                    , Kokkos::OpenMP
                    >
{
private:

  typedef Kokkos::Experimental::MDRangePolicy< Traits ... > MDRangePolicy ;
  typedef Kokkos::MDRangePolicy< Traits ... > MDRangePolicy ;
  typedef typename MDRangePolicy::impl_range_policy Policy ;

  typedef typename MDRangePolicy::work_tag WorkTag ;
@ -472,7 +472,7 @@ private:
  typedef typename Analysis::pointer_type pointer_type ;
  typedef typename Analysis::reference_type reference_type ;

  using iterate_type = typename Kokkos::Experimental::Impl::HostIterateTile< MDRangePolicy
  using iterate_type = typename Kokkos::Impl::HostIterateTile< MDRangePolicy
                                                             , FunctorType
                                                             , WorkTag
                                                             , ValueType

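The specializations above track MDRangePolicy's promotion out of the
Experimental namespace. A hedged usage sketch of the spelling this enables
(the view A and extents M, N are assumed to be defined by the caller):

Kokkos::MDRangePolicy<Kokkos::OpenMP, Kokkos::Rank<2>> policy({0, 0}, {M, N});
Kokkos::parallel_for(policy, KOKKOS_LAMBDA(const int i, const int j) {
  A(i, j) = i + j;  // one tile-decomposed iteration per (i, j) pair
});
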
@ -49,33 +49,26 @@ namespace Impl {

template< class FunctorType , class ... Traits >
class ParallelFor< FunctorType ,
                   Kokkos::Experimental::WorkGraphPolicy< Traits ... > ,
                   Kokkos::WorkGraphPolicy< Traits ... > ,
                   Kokkos::OpenMP
                 >
  : public Kokkos::Impl::Experimental::
           WorkGraphExec< FunctorType,
                          Kokkos::OpenMP,
                          Traits ...
                        >
{
private:

  typedef Kokkos::Experimental::WorkGraphPolicy< Traits ... > Policy ;
  typedef Kokkos::Impl::Experimental::
          WorkGraphExec<FunctorType, Kokkos::OpenMP, Traits ... > Base ;
  typedef Kokkos::WorkGraphPolicy< Traits ... > Policy ;

  Policy m_policy ;
  FunctorType m_functor ;

  template< class TagType >
  typename std::enable_if< std::is_same< TagType , void >::value >::type
  exec_one(const typename Policy::member_type& i) const {
    Base::m_functor( i );
  }
  exec_one( const std::int32_t w ) const noexcept
    { m_functor( w ); }

  template< class TagType >
  typename std::enable_if< ! std::is_same< TagType , void >::value >::type
  exec_one(const typename Policy::member_type& i) const {
    const TagType t{} ;
    Base::m_functor( t , i );
  }
  exec_one( const std::int32_t w ) const noexcept
    { const TagType t{} ; m_functor( t , w ); }

public:

@ -86,9 +79,15 @@ public:

    #pragma omp parallel num_threads(pool_size)
    {
      for (std::int32_t i; (-1 != (i = Base::before_work())); ) {
        exec_one< typename Policy::work_tag >( i );
        Base::after_work(i);
      // Spin until COMPLETED_TOKEN.
      // END_TOKEN indicates no work is currently available.

      for ( std::int32_t w = Policy::END_TOKEN ;
            Policy::COMPLETED_TOKEN != ( w = m_policy.pop_work() ) ; ) {
        if ( Policy::END_TOKEN != w ) {
          exec_one< typename Policy::work_tag >( w );
          m_policy.completed_work(w);
        }
      }
    }
  }
@ -96,12 +95,13 @@ public:
  inline
  ParallelFor( const FunctorType & arg_functor
             , const Policy      & arg_policy )
    : Base( arg_functor, arg_policy )
  {
  }
    : m_policy( arg_policy )
    , m_functor( arg_functor )
  {}
};

} // namespace Impl
} // namespace Kokkos

#endif /* #define KOKKOS_OPENMP_WORKGRAPHPOLICY_HPP */


@ -125,7 +125,7 @@ namespace Kokkos {
  oldval.t = *dest ;
  assume.i = oldval.i ;
  newval.t = val ;
  atomic_compare_exchange( reinterpret_cast<int*>(dest) , assume.i, newval.i );
  atomic_compare_exchange( (int*)(dest) , assume.i, newval.i );

  return oldval.t ;
}

@ -608,6 +608,7 @@ ROCmInternal::scratch_space( const Kokkos::Experimental::ROCm::size_type size )

void ROCmInternal::finalize()
{
  Kokkos::Impl::rocm_device_synchronize();
  was_finalized = 1;
  if ( 0 != m_scratchSpace || 0 != m_scratchFlags ) {


@ -277,7 +277,7 @@ public:
  this->team_barrier();
  value = local_value;
}
// Reduce accross a team of threads.
// Reduce across a team of threads.
//
// Each thread has vector_length elements.
// This reduction is for TeamThreadRange operations, where the range
@ -354,6 +354,80 @@ public:
  return buffer[0];
}

// Reduce across a team of threads, with a reducer data type
//
// Each thread has vector_length elements.
// This reduction is for TeamThreadRange operations, where the range
// is spread across threads. Effectively, there are vector_length
// independent reduction operations.
// This is different from a reduction across the elements of a thread,
// which reduces every vector element.

template< class ReducerType >
KOKKOS_INLINE_FUNCTION
typename std::enable_if< is_reducer< ReducerType >::value >::type
team_reduce( const ReducerType & reducer) const
{
  typedef typename ReducerType::value_type value_type ;

  tile_static value_type buffer[512];
  const auto local = lindex();
  const auto team  = team_rank();
  auto vector_rank = local%m_vector_length;
  auto thread_base = team*m_vector_length;

  const std::size_t size = next_pow_2(m_team_size+1)/2;
#if defined(ROCM15)
  buffer[local] = reducer.reference();
#else
  // ROCM 1.5 handles address spaces better, previous version didn't
  lds_for(buffer[local], [&](ValueType& x)
  {
    x = value;
  });
#endif
  m_idx.barrier.wait();

  for(std::size_t s = 1; s < size; s *= 2)
  {
    const std::size_t index = 2 * s * team;
    if (index < size)
    {
#if defined(ROCM15)
      reducer.join(buffer[vector_rank+index*m_vector_length],
                   buffer[vector_rank+(index+s)*m_vector_length]);
#else
      lds_for(buffer[vector_rank+index*m_vector_length], [&](ValueType& x)
      {
        lds_for(buffer[vector_rank+(index+s)*m_vector_length],
                [&](ValueType& y)
        {
          reducer.join(x, y);
        });
      });
#endif
    }
    m_idx.barrier.wait();
  }

  if (local == 0)
  {
    for(int i=size*m_vector_length; i<m_team_size*m_vector_length; i+=m_vector_length)
#if defined(ROCM15)
      reducer.join(buffer[vector_rank], buffer[vector_rank+i]);
#else
      lds_for(buffer[vector_rank], [&](ValueType& x)
      {
        lds_for(buffer[vector_rank+i],
                [&](ValueType& y)
        {
          reducer.join(x, y);
        });
      });
#endif
  }
  m_idx.barrier.wait();
}

/** \brief  Intra-team vector reduce
 *          with intra-team non-deterministic ordering accumulation.
@ -406,6 +480,33 @@ public:
  return buffer[thread_base];
}

template< typename ReducerType >
KOKKOS_INLINE_FUNCTION static
typename std::enable_if< is_reducer< ReducerType >::value >::type
vector_reduce( ReducerType const & reducer )
{
  #ifdef __HCC_ACCELERATOR__
  if(blockDim_x == 1) return;

  // Intra vector lane shuffle reduction:
  typename ReducerType::value_type tmp ( reducer.reference() );

  for ( int i = blockDim_x ; ( i >>= 1 ) ; ) {
    shfl_down( reducer.reference() , i , blockDim_x );
    if ( (int)threadIdx_x < i ) { reducer.join( tmp , reducer.reference() ); }
  }

  // Broadcast from root lane to all other lanes.
  // Cannot use "butterfly" algorithm to avoid the broadcast
  // because floating point summation is not associative
  // and thus different threads could have different results.

  shfl( reducer.reference() , 0 , blockDim_x );
  #endif
}



/** \brief  Intra-team exclusive prefix sum with team_rank() ordering
 *          with intra-team non-deterministic ordering accumulation.
 *
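A hedged sketch of what the reducer overloads above enable inside a team-level
kernel: each thread contributes a local value and the whole team agrees on the
maximum. Kokkos::Max is the stock reducer; compute_local() and 'team' are
caller-side assumptions, not part of this change:

double my_val = compute_local();      // per-thread contribution (assumed)
Kokkos::Max<double> reducer(my_val);  // wraps a reference to my_val
team.team_reduce(reducer);            // my_val now holds the team-wide max
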
@ -1075,6 +1176,22 @@ void parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::ROC
//                    Impl::JoinAdd<ValueType>());
}

/** \brief  Inter-thread thread range parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1.
 *
 * The range i=0..N-1 is mapped to all threads of the calling thread team and a summation of
 * val is performed and put into result. This functionality requires C++11 support.*/
template< typename iType, class Lambda, typename ReducerType >
KOKKOS_INLINE_FUNCTION
void parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::ROCmTeamMember>& loop_boundaries,
                     const Lambda & lambda, ReducerType const & reducer) {
  reducer.init( reducer.reference() );

  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
    lambda(i,reducer.reference());
  }
  loop_boundaries.thread.team_reduce(reducer);
}

/** \brief  Intra-thread thread range parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1.
 *
 * The range i=0..N-1 is mapped to all vector lanes of the calling thread and a reduction of
@ -1161,6 +1278,41 @@ void parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::R
  result = loop_boundaries.thread.thread_reduce(result,join);
}


/** \brief  Intra-thread vector parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1.
 *
 * The range i=0..N-1 is mapped to all vector lanes of the calling thread and a summation of
 * val is performed and put into result. This functionality requires C++11 support.*/
template< typename iType, class Lambda, typename ReducerType >
KOKKOS_INLINE_FUNCTION
void parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::ROCmTeamMember >&
      loop_boundaries, const Lambda & lambda, ReducerType const & reducer) {
  reducer.init( reducer.reference() );

  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
    lambda(i,reducer.reference());
  }
  loop_boundaries.thread.vector_reduce(reducer);
}
/** \brief  Intra-thread vector parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1.
 *
 * The range i=0..N-1 is mapped to all vector lanes of the calling thread and a reduction of
 * val is performed using JoinType(ValueType& val, const ValueType& update) and put into init_result.
 * The input value of init_result is used as initializer for temporary variables of ValueType. Therefore
 * the input value should be the neutral element with respect to the join operation (e.g. '0 for +-' or
 * '1 for *'). This functionality requires C++11 support.*/
template< typename iType, class Lambda, typename ReducerType, class JoinType >
KOKKOS_INLINE_FUNCTION
void parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::ROCmTeamMember >&
      loop_boundaries, const Lambda & lambda, const JoinType& join, ReducerType const & reducer) {

  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
    lambda(i,reducer.reference());
    loop_boundaries.thread.team_barrier();
  }
  reducer.reference() = loop_boundaries.thread.thread_reduce(reducer.reference(),join);
}

/** \brief  Intra-thread vector parallel exclusive prefix sum. Executes lambda(iType i, ValueType & val, bool final)
 *          for each i=0..N-1.
 *

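A hedged usage sketch pairing the new reducer overload with TeamThreadRange
(the view x, its length n, and the enclosing team kernel are assumed):

double team_max;
Kokkos::parallel_reduce(Kokkos::TeamThreadRange(team, n),
  [=](const int i, double& m) { if (x(i) > m) m = x(i); },
  Kokkos::Max<double>(team_max));
// team_max now holds max(x(0..n-1)) for every thread of the team
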
@ -266,7 +266,7 @@ void ThreadsExec::execute_sleep( ThreadsExec & exec , const void * )
  const int rank_rev = exec.m_pool_size - ( exec.m_pool_rank + 1 );

  for ( int i = 0 ; i < n ; ++i ) {
    Impl::spinwait_while_equal( exec.m_pool_base[ rank_rev + (1<<i) ]->m_pool_state , ThreadsExec::Active );
    Impl::spinwait_while_equal<int>( exec.m_pool_base[ rank_rev + (1<<i) ]->m_pool_state , ThreadsExec::Active );
  }

  exec.m_pool_state = ThreadsExec::Inactive ;
@ -310,7 +310,7 @@ void ThreadsExec::fence()
{
  if ( s_thread_pool_size[0] ) {
    // Wait for the root thread to complete:
    Impl::spinwait_while_equal( s_threads_exec[0]->m_pool_state , ThreadsExec::Active );
    Impl::spinwait_while_equal<int>( s_threads_exec[0]->m_pool_state , ThreadsExec::Active );
  }

  s_current_function = 0 ;
@ -716,12 +716,12 @@ void ThreadsExec::initialize( unsigned thread_count ,
}

// Check for over-subscription
//if( Impl::mpi_ranks_per_node() * long(thread_count) > Impl::processors_per_node() ) {
//  std::cout << "Kokkos::Threads::initialize WARNING: You are likely oversubscribing your CPU cores." << std::endl;
//  std::cout << "  Detected: " << Impl::processors_per_node() << " cores per node." << std::endl;
//  std::cout << "  Detected: " << Impl::mpi_ranks_per_node() << " MPI_ranks per node." << std::endl;
//  std::cout << "  Requested: " << thread_count << " threads per process." << std::endl;
//}
if( Kokkos::show_warnings() && (Impl::mpi_ranks_per_node() * long(thread_count) > Impl::processors_per_node()) ) {
  std::cout << "Kokkos::Threads::initialize WARNING: You are likely oversubscribing your CPU cores." << std::endl;
  std::cout << "  Detected: " << Impl::processors_per_node() << " cores per node." << std::endl;
  std::cout << "  Detected: " << Impl::mpi_ranks_per_node() << " MPI_ranks per node." << std::endl;
  std::cout << "  Requested: " << thread_count << " threads per process." << std::endl;
}

// Init the array used for arbitrarily sized atomics
Impl::init_lock_array_host_space();

@ -50,6 +50,7 @@
#include <cstdio>

#include <utility>
#include <cstdalign>
#include <impl/Kokkos_Spinwait.hpp>
#include <impl/Kokkos_FunctorAdapter.hpp>

@ -107,7 +108,7 @@ private:
  // Which thread am I stealing from currently
  int m_current_steal_target;
  // This thread's owned work_range
  Kokkos::pair<long,long> m_work_range KOKKOS_ALIGN(16);
  Kokkos::pair<long,long> m_work_range __attribute__((aligned(16))) ;
  // Team Offset if one thread determines work_range for others
  long m_team_work_index;

@ -191,13 +192,13 @@ public:
  // Fan-in reduction with highest ranking thread as the root
  for ( int i = 0 ; i < m_pool_fan_size ; ++i ) {
    // Wait: Active -> Rendezvous
    Impl::spinwait_while_equal( m_pool_base[ rev_rank + (1<<i) ]->m_pool_state , ThreadsExec::Active );
    Impl::spinwait_while_equal<int>( m_pool_base[ rev_rank + (1<<i) ]->m_pool_state , ThreadsExec::Active );
  }

  if ( rev_rank ) {
    m_pool_state = ThreadsExec::Rendezvous ;
    // Wait: Rendezvous -> Active
    Impl::spinwait_while_equal( m_pool_state , ThreadsExec::Rendezvous );
    Impl::spinwait_while_equal<int>( m_pool_state , ThreadsExec::Rendezvous );
  }
  else {
    // Root thread does the reduction and broadcast
@ -233,13 +234,13 @@ public:
  // Fan-in reduction with highest ranking thread as the root
  for ( int i = 0 ; i < m_pool_fan_size ; ++i ) {
    // Wait: Active -> Rendezvous
    Impl::spinwait_while_equal( m_pool_base[ rev_rank + (1<<i) ]->m_pool_state , ThreadsExec::Active );
    Impl::spinwait_while_equal<int>( m_pool_base[ rev_rank + (1<<i) ]->m_pool_state , ThreadsExec::Active );
  }

  if ( rev_rank ) {
    m_pool_state = ThreadsExec::Rendezvous ;
    // Wait: Rendezvous -> Active
    Impl::spinwait_while_equal( m_pool_state , ThreadsExec::Rendezvous );
    Impl::spinwait_while_equal<int>( m_pool_state , ThreadsExec::Rendezvous );
  }
  else {
    // Root thread does the reduction and broadcast
@ -268,7 +269,7 @@ public:

  ThreadsExec & fan = *m_pool_base[ rev_rank + ( 1 << i ) ] ;

  Impl::spinwait_while_equal( fan.m_pool_state , ThreadsExec::Active );
  Impl::spinwait_while_equal<int>( fan.m_pool_state , ThreadsExec::Active );

  Join::join( f , reduce_memory() , fan.reduce_memory() );
}
@ -295,7 +296,7 @@ public:
  const int rev_rank = m_pool_size - ( m_pool_rank + 1 );

  for ( int i = 0 ; i < m_pool_fan_size ; ++i ) {
    Impl::spinwait_while_equal( m_pool_base[rev_rank+(1<<i)]->m_pool_state , ThreadsExec::Active );
    Impl::spinwait_while_equal<int>( m_pool_base[rev_rank+(1<<i)]->m_pool_state , ThreadsExec::Active );
  }
}

@ -327,7 +328,7 @@ public:
  ThreadsExec & fan = *m_pool_base[ rev_rank + (1<<i) ];

  // Wait: Active -> ReductionAvailable (or ScanAvailable)
  Impl::spinwait_while_equal( fan.m_pool_state , ThreadsExec::Active );
  Impl::spinwait_while_equal<int>( fan.m_pool_state , ThreadsExec::Active );
  Join::join( f , work_value , fan.reduce_memory() );
}

@ -345,8 +346,8 @@ public:

  // Wait: Active -> ReductionAvailable
  // Wait: ReductionAvailable -> ScanAvailable
  Impl::spinwait_while_equal( th.m_pool_state , ThreadsExec::Active );
  Impl::spinwait_while_equal( th.m_pool_state , ThreadsExec::ReductionAvailable );
  Impl::spinwait_while_equal<int>( th.m_pool_state , ThreadsExec::Active );
  Impl::spinwait_while_equal<int>( th.m_pool_state , ThreadsExec::ReductionAvailable );

  Join::join( f , work_value + count , ((scalar_type *)th.reduce_memory()) + count );
}
@ -357,7 +358,7 @@ public:

  // Wait for all threads to complete inclusive scan
  // Wait: ScanAvailable -> Rendezvous
  Impl::spinwait_while_equal( m_pool_state , ThreadsExec::ScanAvailable );
  Impl::spinwait_while_equal<int>( m_pool_state , ThreadsExec::ScanAvailable );
}

//--------------------------------
@ -365,7 +366,7 @@ public:
  for ( int i = 0 ; i < m_pool_fan_size ; ++i ) {
    ThreadsExec & fan = *m_pool_base[ rev_rank + (1<<i) ];
    // Wait: ReductionAvailable -> ScanAvailable
    Impl::spinwait_while_equal( fan.m_pool_state , ThreadsExec::ReductionAvailable );
    Impl::spinwait_while_equal<int>( fan.m_pool_state , ThreadsExec::ReductionAvailable );
    // Set: ScanAvailable -> Rendezvous
    fan.m_pool_state = ThreadsExec::Rendezvous ;
  }
@ -392,13 +393,13 @@ public:
  // Wait for all threads to copy previous thread's inclusive scan value
  // Wait for all threads: Rendezvous -> ScanCompleted
  for ( int i = 0 ; i < m_pool_fan_size ; ++i ) {
    Impl::spinwait_while_equal( m_pool_base[ rev_rank + (1<<i) ]->m_pool_state , ThreadsExec::Rendezvous );
    Impl::spinwait_while_equal<int>( m_pool_base[ rev_rank + (1<<i) ]->m_pool_state , ThreadsExec::Rendezvous );
  }
  if ( rev_rank ) {
    // Set: ScanAvailable -> ScanCompleted
    m_pool_state = ThreadsExec::ScanCompleted ;
    // Wait: ScanCompleted -> Active
    Impl::spinwait_while_equal( m_pool_state , ThreadsExec::ScanCompleted );
    Impl::spinwait_while_equal<int>( m_pool_state , ThreadsExec::ScanCompleted );
  }
  // Set: ScanCompleted -> Active
  for ( int i = 0 ; i < m_pool_fan_size ; ++i ) {
@ -425,7 +426,7 @@ public:
  // Fan-in reduction with highest ranking thread as the root
  for ( int i = 0 ; i < m_pool_fan_size ; ++i ) {
    // Wait: Active -> Rendezvous
    Impl::spinwait_while_equal( m_pool_base[ rev_rank + (1<<i) ]->m_pool_state , ThreadsExec::Active );
    Impl::spinwait_while_equal<int>( m_pool_base[ rev_rank + (1<<i) ]->m_pool_state , ThreadsExec::Active );
  }

  for ( unsigned i = 0 ; i < count ; ++i ) { work_value[i+count] = work_value[i]; }
@ -433,7 +434,7 @@ public:
  if ( rev_rank ) {
    m_pool_state = ThreadsExec::Rendezvous ;
    // Wait: Rendezvous -> Active
    Impl::spinwait_while_equal( m_pool_state , ThreadsExec::Rendezvous );
    Impl::spinwait_while_equal<int>( m_pool_state , ThreadsExec::Rendezvous );
  }
  else {
    // Root thread does the thread-scan before releasing threads

@ -107,13 +107,13 @@ public:

  // Wait for fan-in threads
  for ( n = 1 ; ( ! ( m_team_rank_rev & n ) ) && ( ( j = m_team_rank_rev + n ) < m_team_size ) ; n <<= 1 ) {
    Impl::spinwait_while_equal( m_team_base[j]->state() , ThreadsExec::Active );
    Impl::spinwait_while_equal<int>( m_team_base[j]->state() , ThreadsExec::Active );
  }

  // If not root then wait for release
  if ( m_team_rank_rev ) {
    m_exec->state() = ThreadsExec::Rendezvous ;
    Impl::spinwait_while_equal( m_exec->state() , ThreadsExec::Rendezvous );
    Impl::spinwait_while_equal<int>( m_exec->state() , ThreadsExec::Rendezvous );
  }

  return ! m_team_rank_rev ;

@ -180,12 +180,12 @@ public:
// MDRangePolicy impl
template< class FunctorType , class ... Traits >
class ParallelFor< FunctorType
                 , Kokkos::Experimental::MDRangePolicy< Traits ... >
                 , Kokkos::MDRangePolicy< Traits ... >
                 , Kokkos::Threads
                 >
{
private:
  typedef Kokkos::Experimental::MDRangePolicy< Traits ... > MDRangePolicy ;
  typedef Kokkos::MDRangePolicy< Traits ... > MDRangePolicy ;
  typedef typename MDRangePolicy::impl_range_policy Policy ;

  typedef typename MDRangePolicy::work_tag WorkTag ;
@ -193,7 +193,7 @@ private:
  typedef typename Policy::WorkRange WorkRange ;
  typedef typename Policy::member_type Member ;

  typedef typename Kokkos::Experimental::Impl::HostIterateTile< MDRangePolicy, FunctorType, typename MDRangePolicy::work_tag, void > iterate_type;
  typedef typename Kokkos::Impl::HostIterateTile< MDRangePolicy, FunctorType, typename MDRangePolicy::work_tag, void > iterate_type;

  const FunctorType m_functor ;
  const MDRangePolicy m_mdr_policy ;
@ -548,14 +548,14 @@ public:
// MDRangePolicy impl
template< class FunctorType , class ReducerType, class ... Traits >
class ParallelReduce< FunctorType
                    , Kokkos::Experimental::MDRangePolicy< Traits ... >
                    , Kokkos::MDRangePolicy< Traits ... >
                    , ReducerType
                    , Kokkos::Threads
                    >
{
private:

  typedef Kokkos::Experimental::MDRangePolicy< Traits ... > MDRangePolicy ;
  typedef Kokkos::MDRangePolicy< Traits ... > MDRangePolicy ;
  typedef typename MDRangePolicy::impl_range_policy Policy ;

  typedef typename MDRangePolicy::work_tag WorkTag ;
@ -573,7 +573,7 @@ private:
  typedef typename ValueTraits::pointer_type pointer_type ;
  typedef typename ValueTraits::reference_type reference_type ;

  using iterate_type = typename Kokkos::Experimental::Impl::HostIterateTile< MDRangePolicy
  using iterate_type = typename Kokkos::Impl::HostIterateTile< MDRangePolicy
                                                             , FunctorType
                                                             , WorkTag
                                                             , ValueType

@ -49,48 +49,50 @@ namespace Impl {

template< class FunctorType , class ... Traits >
class ParallelFor< FunctorType ,
                   Kokkos::Experimental::WorkGraphPolicy< Traits ... > ,
                   Kokkos::WorkGraphPolicy< Traits ... > ,
                   Kokkos::Threads
                 >
  : public Kokkos::Impl::Experimental::
           WorkGraphExec< FunctorType,
                          Kokkos::Threads,
                          Traits ...
                        >
{
private:

  typedef Kokkos::Experimental::WorkGraphPolicy< Traits ... > Policy ;
  typedef Kokkos::Impl::Experimental::
          WorkGraphExec<FunctorType, Kokkos::Threads, Traits ... > Base ;
  typedef Kokkos::WorkGraphPolicy< Traits ... > Policy ;

  typedef ParallelFor<FunctorType,
                      Kokkos::Experimental::WorkGraphPolicy<Traits ...>,
                      Kokkos::WorkGraphPolicy<Traits ...>,
                      Kokkos::Threads> Self ;

  Policy m_policy ;
  FunctorType m_functor ;

  template< class TagType >
  typename std::enable_if< std::is_same< TagType , void >::value >::type
  exec_one(const typename Policy::member_type& i) const {
    Base::m_functor( i );
  }
  exec_one( const std::int32_t w ) const noexcept
    { m_functor( w ); }

  template< class TagType >
  typename std::enable_if< ! std::is_same< TagType , void >::value >::type
  exec_one(const typename Policy::member_type& i) const {
    const TagType t{} ;
    Base::m_functor( t , i );
  }
  exec_one( const std::int32_t w ) const noexcept
    { const TagType t{}; m_functor( t , w ); }

  inline void exec_one_thread() const {
    for (std::int32_t i; (-1 != (i = Base::before_work())); ) {
      exec_one< typename Policy::work_tag >( i );
      Base::after_work(i);
  inline void exec_one_thread() const noexcept
  {
    // Spin until COMPLETED_TOKEN.
    // END_TOKEN indicates no work is currently available.

    for ( std::int32_t w = Policy::END_TOKEN ;
          Policy::COMPLETED_TOKEN != ( w = m_policy.pop_work() ) ; ) {
      if ( Policy::END_TOKEN != w ) {
        exec_one< typename Policy::work_tag >( w );
        m_policy.completed_work(w);
      }
    }
  }
}

  static inline void thread_main( ThreadsExec&, const void* arg ) {
    const Self& self = *(static_cast<const Self*>(arg));
    self.exec_one_thread();
  }
  static inline void thread_main( ThreadsExec&, const void* arg ) noexcept
  {
    const Self& self = *(static_cast<const Self*>(arg));
    self.exec_one_thread();
  }

public:

@ -104,9 +106,9 @@ public:
  inline
  ParallelFor( const FunctorType & arg_functor
             , const Policy      & arg_policy )
    : Base( arg_functor, arg_policy )
  {
  }
    : m_policy( arg_policy )
    , m_functor( arg_functor )
  {}
};

} // namespace Impl

@ -59,7 +59,7 @@
#include <algorithm>
#include <cstdio>

namespace Kokkos { namespace Experimental { namespace Impl {
namespace Kokkos { namespace Impl {

// Temporary, for testing new loop macros
#define KOKKOS_ENABLE_NEW_LOOP_MACROS 1
@ -1274,7 +1274,7 @@ struct Tile_Loop_Type<8, IsLeft, IType, Tagged, typename std::enable_if< !std::i


template <typename T>
using is_void = std::is_same< T , void >;
using is_void_type = std::is_same< T , void >;

template <typename T>
struct is_type_array : std::false_type
@ -1303,7 +1303,7 @@ template < typename RP
         , typename Tag
         , typename ValueType
         >
struct HostIterateTile < RP , Functor , Tag , ValueType , typename std::enable_if< is_void<ValueType >::value >::type >
struct HostIterateTile < RP , Functor , Tag , ValueType , typename std::enable_if< is_void_type<ValueType >::value >::type >
{
  using index_type = typename RP::index_type;
  using point_type = typename RP::point_type;
@ -1781,7 +1781,7 @@ template < typename RP
         , typename Tag
         , typename ValueType
         >
struct HostIterateTile < RP , Functor , Tag , ValueType , typename std::enable_if< !is_void<ValueType >::value && !is_type_array<ValueType>::value >::type >
struct HostIterateTile < RP , Functor , Tag , ValueType , typename std::enable_if< !is_void_type<ValueType >::value && !is_type_array<ValueType>::value >::type >
{
  using index_type = typename RP::index_type;
  using point_type = typename RP::point_type;
@ -2268,7 +2268,7 @@ template < typename RP
         , typename Tag
         , typename ValueType
         >
struct HostIterateTile < RP , Functor , Tag , ValueType , typename std::enable_if< !is_void<ValueType >::value && is_type_array<ValueType>::value >::type >
struct HostIterateTile < RP , Functor , Tag , ValueType , typename std::enable_if< !is_void_type<ValueType >::value && is_type_array<ValueType>::value >::type >
{
  using index_type = typename RP::index_type;
  using point_type = typename RP::point_type;
@ -2750,6 +2750,8 @@ struct HostIterateTile < RP , Functor , Tag , ValueType , typename std::enable_i
// Cuda uses DeviceIterateTile directly within md_parallel_for
// TODO Once md_parallel_{for,reduce} removed, this can be removed

namespace Experimental {

// ParallelReduce - scalar reductions
template < typename MDRange, typename Functor, typename ValueType = void >
struct MDFunctor
@ -2759,11 +2761,11 @@ struct MDFunctor
  using value_type = ValueType;
  using work_tag = typename range_policy::work_tag;
  using index_type = typename range_policy::index_type;
  using iterate_type = typename Kokkos::Experimental::Impl::HostIterateTile< MDRange
                                                                           , Functor
                                                                           , work_tag
                                                                           , value_type
                                                                           >;
  using iterate_type = typename Kokkos::Impl::HostIterateTile< MDRange
                                                             , Functor
                                                             , work_tag
                                                             , value_type
                                                             >;


  inline
@ -2804,11 +2806,11 @@ struct MDFunctor< MDRange, Functor, ValueType[] >
  using value_type = ValueType[];
  using work_tag = typename range_policy::work_tag;
  using index_type = typename range_policy::index_type;
  using iterate_type = typename Kokkos::Experimental::Impl::HostIterateTile< MDRange
                                                                           , Functor
                                                                           , work_tag
                                                                           , value_type
                                                                           >;
  using iterate_type = typename Kokkos::Impl::HostIterateTile< MDRange
                                                             , Functor
                                                             , work_tag
                                                             , value_type
                                                             >;


  inline
@ -2852,11 +2854,11 @@ struct MDFunctor< MDRange, Functor, void >
  using functor_type = Functor;
  using work_tag = typename range_policy::work_tag;
  using index_type = typename range_policy::index_type;
  using iterate_type = typename Kokkos::Experimental::Impl::HostIterateTile< MDRange
                                                                           , Functor
                                                                           , work_tag
                                                                           , void
                                                                           >;
  using iterate_type = typename Kokkos::Impl::HostIterateTile< MDRange
                                                             , Functor
                                                             , work_tag
                                                             , void
                                                             >;


  inline
@ -2887,8 +2889,9 @@ struct MDFunctor< MDRange, Functor, void >
  Functor m_func;
};

} // end namespace Experimental
#undef KOKKOS_ENABLE_NEW_LOOP_MACROS

} } } //end namespace Kokkos::Experimental::Impl
} } //end namespace Kokkos::Impl

#endif

@ -51,9 +51,12 @@

//----------------------------------------------------------------------------

namespace Kokkos {
namespace Impl {
namespace {
bool g_is_initialized = false;
bool g_show_warnings = true;
}

namespace Kokkos { namespace Impl { namespace {

bool is_unsigned_int(const char* str)
{
@ -75,6 +78,10 @@ void initialize_internal(const InitArguments& args)
setenv("MEMKIND_HBW_NODES", "1", 0);
#endif

  if (args.disable_warnings) {
    g_show_warnings = false;
  }

// Protect declarations, to prevent "unused variable" warnings.
#if defined( KOKKOS_ENABLE_OPENMP ) || defined( KOKKOS_ENABLE_THREADS ) || defined( KOKKOS_ENABLE_OPENMPTARGET )
  const int num_threads = args.num_threads;
@ -177,6 +184,7 @@ setenv("MEMKIND_HBW_NODES", "1", 0);
#if defined(KOKKOS_ENABLE_PROFILING)
  Kokkos::Profiling::initialize();
#endif
  g_is_initialized = true;
}

void finalize_internal( const bool all_spaces = false )
@ -233,6 +241,9 @@ void finalize_internal( const bool all_spaces = false )
  Kokkos::Serial::finalize();
}
#endif

  g_is_initialized = false;
  g_show_warnings = true;
}

void fence_internal()
@ -306,9 +317,7 @@ bool check_int_arg(char const* arg, char const* expected, int* value) {
  return true;
}

} // namespace
} // namespace Impl
} // namespace Kokkos
}}} // namespace Kokkos::Impl::{unnamed}

//----------------------------------------------------------------------------

@ -319,6 +328,7 @@ void initialize(int& narg, char* arg[])
  int num_threads = -1;
  int numa = -1;
  int device = -1;
  bool disable_warnings = false;

  int kokkos_threads_found = 0;
  int kokkos_numa_found = 0;
@ -415,6 +425,12 @@ void initialize(int& narg, char* arg[])
    } else {
      iarg++;
    }
  } else if ( strcmp(arg[iarg],"--kokkos-disable-warnings") == 0) {
    disable_warnings = true;
    for(int k=iarg;k<narg-1;k++) {
      arg[k] = arg[k+1];
    }
    narg--;
  } else if ((strcmp(arg[iarg],"--kokkos-help") == 0) || (strcmp(arg[iarg],"--help") == 0)) {
    std::cout << std::endl;
    std::cout << "--------------------------------------------------------------------------------" << std::endl;
@ -427,6 +443,7 @@ void initialize(int& narg, char* arg[])
    std::cout << "settings." << std::endl;
    std::cout << std::endl;
    std::cout << "--kokkos-help               : print this message" << std::endl;
    std::cout << "--kokkos-disable-warnings   : disable kokkos warning messages" << std::endl;
    std::cout << "--kokkos-threads=INT        : specify total number of threads or" << std::endl;
    std::cout << "                              number of threads per NUMA region if " << std::endl;
    std::cout << "                              used in conjunction with '--numa' option. " << std::endl;
@ -457,7 +474,7 @@ void initialize(int& narg, char* arg[])
    iarg++;
  }

  InitArguments arguments{num_threads, numa, device};
  InitArguments arguments{num_threads, numa, device, disable_warnings};
  Impl::initialize_internal(arguments);
}

@ -787,5 +804,9 @@ void print_configuration( std::ostream & out , const bool detail )
  out << msg.str() << std::endl;
}

bool is_initialized() noexcept { return g_is_initialized; }

bool show_warnings() noexcept { return g_show_warnings; }

} // namespace Kokkos


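The new flag threads through initialize() to the g_show_warnings guard above.
Two ways to use it; the command-line spelling is taken directly from this
change, while the programmatic initialize(InitArguments) overload is an
assumption about the surrounding API:

  ./my_app --kokkos-disable-warnings

  // or, programmatically (the field itself is added by this change):
  Kokkos::InitArguments args;
  args.disable_warnings = true;
  Kokkos::initialize(args);
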
@ -476,7 +476,7 @@ template< class FunctorType , class ArgTag , class T , class Enable >
struct FunctorValueInit< FunctorType , ArgTag , T & , Enable >
{
  KOKKOS_FORCEINLINE_FUNCTION static
  T & init( const FunctorType & f , void * p )
  T & init( const FunctorType & , void * p )
    { return *( new(p) T() ); };
};


@ -254,7 +254,12 @@ void * HostSpace::allocate( const size_t arg_alloc_size ) const
}


void HostSpace::deallocate( void * const arg_alloc_ptr , const size_t arg_alloc_size ) const
void HostSpace::deallocate( void * const arg_alloc_ptr
                          , const size_t
#if defined( KOKKOS_IMPL_POSIX_MMAP_FLAGS )
                            arg_alloc_size
#endif
                          ) const
{
  if ( arg_alloc_ptr ) {

@ -409,7 +414,7 @@ SharedAllocationRecord< Kokkos::HostSpace , void >::get_record( void * alloc_ptr

// Iterate records to print orphaned memory ...
void SharedAllocationRecord< Kokkos::HostSpace , void >::
print_records( std::ostream & s , const Kokkos::HostSpace & space , bool detail )
print_records( std::ostream & s , const Kokkos::HostSpace & , bool detail )
{
  SharedAllocationRecord< void , void >::print_host_accessible_records( s , "HostSpace" , & s_root_record , detail );
}

@ -44,6 +44,9 @@
#include <Kokkos_Macros.hpp>
#if defined( KOKKOS_ATOMIC_HPP ) && ! defined( KOKKOS_MEMORY_FENCE_HPP )
#define KOKKOS_MEMORY_FENCE_HPP

#include <atomic>

namespace Kokkos {

//----------------------------------------------------------------------------
@ -53,23 +56,8 @@ void memory_fence()
{
#if defined( __CUDA_ARCH__ )
  __threadfence();
#elif defined( KOKKOS_ENABLE_ROCM_ATOMICS )
  amp_barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);
#elif defined( KOKKOS_ENABLE_ASM ) && defined( KOKKOS_ENABLE_ISA_X86_64 )
  asm volatile (
    "mfence" ::: "memory"
  );
#elif defined( KOKKOS_ENABLE_GNU_ATOMICS ) || \
      ( defined( KOKKOS_COMPILER_NVCC ) && defined( KOKKOS_ENABLE_INTEL_ATOMICS ) )
  __sync_synchronize();
#elif defined( KOKKOS_ENABLE_INTEL_ATOMICS )
  _mm_mfence();
#elif defined( KOKKOS_ENABLE_OPENMP_ATOMICS )
  #pragma omp flush
#elif defined( KOKKOS_ENABLE_WINDOWS_ATOMICS )
  MemoryBarrier();
#else
  #error "Error: memory_fence() not defined"
  std::atomic_thread_fence( std::memory_order_seq_cst );
#endif
}

@ -81,12 +69,10 @@ void memory_fence()
KOKKOS_FORCEINLINE_FUNCTION
void store_fence()
{
#if defined( KOKKOS_ENABLE_ASM ) && defined( KOKKOS_ENABLE_ISA_X86_64 )
  asm volatile (
    "sfence" ::: "memory"
  );
#if defined( __CUDA_ARCH__ )
  __threadfence();
#else
  memory_fence();
  std::atomic_thread_fence( std::memory_order_seq_cst );
#endif
}

@ -98,12 +84,10 @@ void store_fence()
KOKKOS_FORCEINLINE_FUNCTION
void load_fence()
{
#if defined( KOKKOS_ENABLE_ASM ) && defined( KOKKOS_ENABLE_ISA_X86_64 )
  asm volatile (
    "lfence" ::: "memory"
  );
#if defined( __CUDA_ARCH__ )
  __threadfence();
#else
  memory_fence();
  std::atomic_thread_fence( std::memory_order_seq_cst );
#endif
}


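The rewrite above retires the per-compiler fence selection in favor of the
portable C++11 primitive on host. A standalone illustration of the resulting
publish pattern (the names flag_ready/payload are illustrative only):

#include <atomic>

void flag_ready(std::atomic<int>& flag, int* payload) {
  *payload = 42;                                        // plain store
  std::atomic_thread_fence(std::memory_order_seq_cst);  // host memory_fence() now maps to this
  flag.store(1, std::memory_order_relaxed);             // publish after the fence
}
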
@ -49,51 +49,50 @@ namespace Impl {

template< class FunctorType , class ... Traits >
class ParallelFor< FunctorType ,
                   Kokkos::Experimental::WorkGraphPolicy< Traits ... > ,
                   Kokkos::WorkGraphPolicy< Traits ... > ,
                   Kokkos::Serial
                 >
  : public Kokkos::Impl::Experimental::
           WorkGraphExec< FunctorType,
                          Kokkos::Serial,
                          Traits ...
                        >
{
private:

  typedef Kokkos::Experimental::WorkGraphPolicy< Traits ... > Policy ;
  typedef Kokkos::Impl::Experimental::
          WorkGraphExec<FunctorType, Kokkos::Serial, Traits ... > Base ;
  typedef Kokkos::WorkGraphPolicy< Traits ... > Policy ;

  Policy m_policy ;
  FunctorType m_functor ;

  template< class TagType >
  typename std::enable_if< std::is_same< TagType , void >::value >::type
  exec_one(const typename Policy::member_type& i) const {
    Base::m_functor( i );
  }
  exec_one( const std::int32_t w ) const noexcept
    { m_functor( w ); }

  template< class TagType >
  typename std::enable_if< ! std::is_same< TagType , void >::value >::type
  exec_one(const typename Policy::member_type& i) const {
    const TagType t{} ;
    Base::m_functor( t , i );
  }
  exec_one( const std::int32_t w ) const noexcept
    { const TagType t{}; m_functor( t , w ); }

public:

  inline
  void execute()
  {
    for (std::int32_t i; (-1 != (i = Base::before_work())); ) {
      exec_one< typename Policy::work_tag >( i );
      Base::after_work(i);
  void execute() const noexcept
  {
    // Spin until COMPLETED_TOKEN.
    // END_TOKEN indicates no work is currently available.

    for ( std::int32_t w = Policy::END_TOKEN ;
          Policy::COMPLETED_TOKEN != ( w = m_policy.pop_work() ) ; ) {
      if ( Policy::END_TOKEN != w ) {
        exec_one< typename Policy::work_tag >( w );
        m_policy.completed_work(w);
      }
    }
  }
}

  inline
  ParallelFor( const FunctorType & arg_functor
             , const Policy      & arg_policy )
    : Base( arg_functor, arg_policy )
  {
  }
    : m_policy( arg_policy )
    , m_functor( arg_functor )
  {}
};

} // namespace Impl

@ -306,7 +306,7 @@ print_host_accessible_records( std::ostream & s
  , reinterpret_cast<uintptr_t>( r->m_dealloc )
  , r->m_alloc_ptr->m_label
  );
  std::cout << buffer ;
  s << buffer ;
  r = r->m_next ;
} while ( r != root );
}
@ -334,7 +334,7 @@ print_host_accessible_records( std::ostream & s
  else {
    snprintf( buffer , 256 , "%s [ 0 + 0 ]\n" , space_name );
  }
  std::cout << buffer ;
  s << buffer ;
  r = r->m_next ;
} while ( r != root );
}

@ -294,9 +294,13 @@ public:

template< class MemorySpace >
constexpr
SharedAllocationRecord< MemorySpace , void > &
get_record() const
  { return * static_cast< SharedAllocationRecord< MemorySpace , void > * >( m_record ); }
SharedAllocationRecord< MemorySpace , void > *
get_record() const noexcept
  {
    return ( m_record_bits & DO_NOT_DEREF_FLAG )
      ? (SharedAllocationRecord< MemorySpace,void>*) 0
      : static_cast<SharedAllocationRecord<MemorySpace,void>*>(m_record);
  }

template< class MemorySpace >
std::string get_label() const
@ -323,6 +327,16 @@ public:
  return (m_record_bits & (~DO_NOT_DEREF_FLAG)) != 0;
}

KOKKOS_FORCEINLINE_FUNCTION
void clear()
{
  // If this is tracking then must decrement
  KOKKOS_IMPL_SHARED_ALLOCATION_TRACKER_DECREMENT
  // Reset to default constructed value.
  m_record_bits = DO_NOT_DEREF_FLAG ;
}

// Copy:
KOKKOS_FORCEINLINE_FUNCTION
~SharedAllocationTracker()
  { KOKKOS_IMPL_SHARED_ALLOCATION_TRACKER_DECREMENT }

@ -48,7 +48,7 @@
#include <impl/Kokkos_Spinwait.hpp>
#include <impl/Kokkos_BitOps.hpp>

#if defined( KOKKOS_ENABLE_STDTHREAD )
#if defined( KOKKOS_ENABLE_STDTHREAD ) || defined( _WIN32 )
  #include <thread>
#elif !defined( _WIN32 )
  #include <sched.h>
@ -63,9 +63,8 @@

namespace Kokkos {
namespace Impl {
namespace {

void host_thread_yield( const uint32_t i , const int force_yield )
void host_thread_yield( const uint32_t i , const WaitMode mode )
{
  static constexpr uint32_t sleep_limit = 1 << 13 ;
  static constexpr uint32_t yield_limit = 1 << 12 ;
@ -76,28 +75,26 @@ void host_thread_yield( const uint32_t i , const int force_yield )

  // Attempt to put the thread to sleep for 'c' milliseconds

  #if defined( KOKKOS_ENABLE_STDTHREAD )
    std::this_thread::sleep_for( std::chrono::nanoseconds( c * 1000 ) )
  #elif !defined( _WIN32 )
  #if defined( KOKKOS_ENABLE_STDTHREAD ) || defined( _WIN32 )
    auto start = std::chrono::high_resolution_clock::now();
    std::this_thread::yield();
    std::this_thread::sleep_until( start + std::chrono::nanoseconds( c * 1000 ) );
  #else
    timespec req ;
    req.tv_sec  = 0 ;
    req.tv_nsec = 1000 * c ;
    nanosleep( &req, nullptr );
  #else /* defined( _WIN32 ) IS Microsoft Windows */
    Sleep(c);
  #endif
}

else if ( force_yield || yield_limit < i ) {
else if ( mode == WaitMode::PASSIVE || yield_limit < i ) {

  // Attempt to yield thread resources to runtime

  #if defined( KOKKOS_ENABLE_STDTHREAD )
  #if defined( KOKKOS_ENABLE_STDTHREAD ) || defined( _WIN32 )
    std::this_thread::yield();
  #elif !defined( _WIN32 )
  #else
    sched_yield();
  #else /* defined( _WIN32 ) IS Microsoft Windows */
    YieldProcessor();
  #endif
}

@ -110,9 +107,9 @@ void host_thread_yield( const uint32_t i , const int force_yield )
  for ( int k = 0 ; k < c ; ++k ) {
    #if defined( __amd64 )  || defined( __amd64__ ) || \
        defined( __x86_64 ) || defined( __x86_64__ )
#if !defined( _WIN32 ) /* IS NOT Microsoft Windows */
    #if !defined( _WIN32 ) /* IS NOT Microsoft Windows */
      asm volatile( "nop\n" );
#else
    #else
      __asm__ __volatile__( "nop\n" );
    #endif
    #elif defined(__PPC64__)
@ -123,86 +120,22 @@ void host_thread_yield( const uint32_t i , const int force_yield )

{
  // Insert memory pause
#if defined( __amd64 )  || defined( __amd64__ ) || \
    defined( __x86_64 ) || defined( __x86_64__ )
#if !defined( _WIN32 ) /* IS NOT Microsoft Windows */
  #if defined( __amd64 )  || defined( __amd64__ ) || \
      defined( __x86_64 ) || defined( __x86_64__ )
  #if !defined( _WIN32 ) /* IS NOT Microsoft Windows */
    asm volatile( "pause\n":::"memory" );
#else
  #else
    __asm__ __volatile__( "pause\n":::"memory" );
  #endif
#elif defined(__PPC64__)
  asm volatile( "or 27, 27, 27" ::: "memory" );
  asm volatile( "or 27, 27, 27" ::: "memory" );
#endif
}

#endif /* defined( KOKKOS_ENABLE_ASM ) */
}

}}} // namespace Kokkos::Impl::{anonymous}

/*--------------------------------------------------------------------------*/

namespace Kokkos {
namespace Impl {

void spinwait_while_equal( volatile int32_t & flag , const int32_t value )
{
  Kokkos::store_fence();
  uint32_t i = 0 ; while( value == flag ) host_thread_yield(++i,0);
  Kokkos::load_fence();
}

void spinwait_until_equal( volatile int32_t & flag , const int32_t value )
{
  Kokkos::store_fence();
  uint32_t i = 0 ; while( value != flag ) host_thread_yield(++i,0);
  Kokkos::load_fence();
}

void spinwait_while_equal( volatile int64_t & flag , const int64_t value )
{
  Kokkos::store_fence();
  uint32_t i = 0 ; while( value == flag ) host_thread_yield(++i,0);
  Kokkos::load_fence();
}

void spinwait_until_equal( volatile int64_t & flag , const int64_t value )
{
  Kokkos::store_fence();
  uint32_t i = 0 ; while( value != flag ) host_thread_yield(++i,0);
  Kokkos::load_fence();
}

void yield_while_equal( volatile int32_t & flag , const int32_t value )
{
  Kokkos::store_fence();
  uint32_t i = 0 ; while( value == flag ) host_thread_yield(++i,1);
  Kokkos::load_fence();
}

void yield_until_equal( volatile int32_t & flag , const int32_t value )
{
  Kokkos::store_fence();
  uint32_t i = 0 ; while( value != flag ) host_thread_yield(++i,1);
  Kokkos::load_fence();
}

void yield_while_equal( volatile int64_t & flag , const int64_t value )
{
  Kokkos::store_fence();
  uint32_t i = 0 ; while( value == flag ) host_thread_yield(++i,1);
  Kokkos::load_fence();
}

void yield_until_equal( volatile int64_t & flag , const int64_t value )
{
  Kokkos::store_fence();
  uint32_t i = 0 ; while( value != flag ) host_thread_yield(++i,1);
  Kokkos::load_fence();
}

} /* namespace Impl */
} /* namespace Kokkos */
}} // namespace Kokkos::Impl

#else
void KOKKOS_CORE_SRC_IMPL_SPINWAIT_PREVENT_LINK_ERROR() {}


@ -46,47 +46,95 @@

#define KOKKOS_SPINWAIT_HPP

#include <Kokkos_Macros.hpp>
#include <Kokkos_Atomic.hpp>

#include <cstdint>
#include <type_traits>

namespace Kokkos {
namespace Impl {

#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )

void spinwait_while_equal( volatile int32_t & flag , const int32_t value );
void spinwait_until_equal( volatile int32_t & flag , const int32_t value );
enum class WaitMode : int {
ACTIVE // Used for tight loops to keep threads active longest
, PASSIVE // Used to quickly yield the thread to quiet down the system
};

void spinwait_while_equal( volatile int64_t & flag , const int64_t value );
void spinwait_until_equal( volatile int64_t & flag , const int64_t value );

void yield_while_equal( volatile int32_t & flag , const int32_t value );
void yield_until_equal( volatile int32_t & flag , const int32_t value );
void host_thread_yield( const uint32_t i , const WaitMode mode );

void yield_while_equal( volatile int64_t & flag , const int64_t value );
void yield_until_equal( volatile int64_t & flag , const int64_t value );

template <typename T>
typename std::enable_if< std::is_integral<T>::value, void>::type
spinwait_while_equal( T const volatile & flag, const T value )
{
Kokkos::store_fence();
uint32_t i = 0 ;
while( value == flag ) {
host_thread_yield(++i, WaitMode::ACTIVE);
}
Kokkos::load_fence();
}

template <typename T>
typename std::enable_if< std::is_integral<T>::value, void>::type
yield_while_equal( T const volatile & flag, const T value )
{
Kokkos::store_fence();
uint32_t i = 0 ;
while( value == flag ) {
host_thread_yield(++i, WaitMode::PASSIVE);
}
Kokkos::load_fence();
}

template <typename T>
typename std::enable_if< std::is_integral<T>::value, void>::type
spinwait_until_equal( T const volatile & flag, const T value )
{
Kokkos::store_fence();
uint32_t i = 0 ;
while( value != flag ) {
host_thread_yield(++i, WaitMode::ACTIVE);
}
Kokkos::load_fence();
}

template <typename T>
typename std::enable_if< std::is_integral<T>::value, void>::type
yield_until_equal( T const volatile & flag, const T value )
{
Kokkos::store_fence();
uint32_t i = 0 ;
while( value != flag ) {
host_thread_yield(++i, WaitMode::PASSIVE);
}
Kokkos::load_fence();
}

#else

KOKKOS_INLINE_FUNCTION
void spinwait_while_equal( volatile int32_t & , const int32_t ) {}
KOKKOS_INLINE_FUNCTION
void spinwait_until_equal( volatile int32_t & , const int32_t ) {}
KOKKOS_INLINE_FUNCTION
void spinwait_while_equal( volatile int64_t & , const int64_t ) {}
KOKKOS_INLINE_FUNCTION
void spinwait_until_equal( volatile int64_t & , const int64_t ) {}
KOKKOS_INLINE_FUNCTION
void yield_while_equal( volatile int32_t & , const int32_t ) {}
KOKKOS_INLINE_FUNCTION
void yield_until_equal( volatile int32_t & , const int32_t ) {}
KOKKOS_INLINE_FUNCTION
void yield_while_equal( volatile int64_t & , const int64_t ) {}
KOKKOS_INLINE_FUNCTION
void yield_until_equal( volatile int64_t & , const int64_t ) {}

template <typename T>
KOKKOS_INLINE_FUNCTION
typename std::enable_if< std::is_integral<T>::value, void>::type
spinwait_while_equal( T const volatile & flag, const T value ) {}

template <typename T>
KOKKOS_INLINE_FUNCTION
typename std::enable_if< std::is_integral<T>::value, void>::type
yield_while_equal( T const volatile & flag, const T value ) {}

template <typename T>
KOKKOS_INLINE_FUNCTION
typename std::enable_if< std::is_integral<T>::value, void>::type
spinwait_until_equal( T const volatile & flag, const T value ) {}

template <typename T>
KOKKOS_INLINE_FUNCTION
typename std::enable_if< std::is_integral<T>::value, void>::type
yield_until_equal( T const volatile & flag, const T value ) {}

#endif
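
With the typed declarations replaced by enable_if templates, every integral flag width now routes through the same four entry points; T is deduced from the flag, and non-integral types are rejected at compile time. A minimal usage sketch of this internal API, shown only to illustrate the deduction and assuming the patched header above is on the include path:

#include <cstdint>
#include <impl/Kokkos_Spinwait.hpp>

volatile int32_t f32 = 0 ;
volatile int64_t f64 = 0 ;

void wait_examples()
{
  // T = int32_t: spins actively (WaitMode::ACTIVE) until another thread changes f32.
  Kokkos::Impl::spinwait_while_equal( f32 , int32_t(0) );
  // T = int64_t: yields (WaitMode::PASSIVE) until f64 becomes 1.
  Kokkos::Impl::yield_until_equal( f64 , int64_t(1) );
  // A floating-point flag would fail the std::is_integral constraint.
}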

@ -111,7 +111,9 @@ struct ViewCtorProp< void , CommonViewAllocProp<Specialize,T> >

using type = CommonViewAllocProp<Specialize,T> ;

KOKKOS_INLINE_FUNCTION
ViewCtorProp( const type & arg ) : value( arg ) {}
KOKKOS_INLINE_FUNCTION
ViewCtorProp( type && arg ) : value( arg ) {}

type value ;

@ -128,6 +130,7 @@ struct ViewCtorProp< void , std::integral_constant<unsigned,I> >

ViewCtorProp & operator = ( const ViewCtorProp & ) = default ;

template< typename P >
KOKKOS_INLINE_FUNCTION
ViewCtorProp( const P & ) {}
};

@ -369,9 +369,9 @@ private:

template< size_t ... DimArgs >
KOKKOS_FORCEINLINE_FUNCTION
bool set( unsigned domain_rank
, unsigned range_rank
, const ViewDimension< DimArgs ... > & dim )
bool set( unsigned
, unsigned
, const ViewDimension< DimArgs ... > & )
{ return true ; }

template< class T , size_t ... DimArgs , class ... Args >

@ -1047,7 +1047,7 @@ struct ViewOffset< Dimension , Kokkos::LayoutLeft

template< class DimRHS >
KOKKOS_INLINE_FUNCTION
constexpr ViewOffset(
const ViewOffset< DimRHS , Kokkos::LayoutLeft , void > & rhs ,
const ViewOffset< DimRHS , Kokkos::LayoutLeft , void > & ,
const SubviewExtents< DimRHS::rank , dimension_type::rank > & sub )
: m_dim( sub.range_extent(0), 0, 0, 0, 0, 0, 0, 0 )
{

@ -1252,7 +1252,7 @@ public:

template< unsigned TrivialScalarSize >
KOKKOS_INLINE_FUNCTION
constexpr ViewOffset
( std::integral_constant<unsigned,TrivialScalarSize> const & padding_type_size
( std::integral_constant<unsigned,TrivialScalarSize> const &
, Kokkos::LayoutLeft const & arg_layout
)
: m_dim( arg_layout.dimension[0] , arg_layout.dimension[1]

@ -1741,7 +1741,7 @@ public:

template< unsigned TrivialScalarSize >
KOKKOS_INLINE_FUNCTION
constexpr ViewOffset
( std::integral_constant<unsigned,TrivialScalarSize> const & padding_type_size
( std::integral_constant<unsigned,TrivialScalarSize> const &
, Kokkos::LayoutRight const & arg_layout
)
: m_dim( arg_layout.dimension[0] , arg_layout.dimension[1]
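
The four hunks above share one mechanical fix: parameters that exist only to select an overload, or are otherwise unused, lose their names, which silences unused-parameter warnings without changing any signature. A minimal illustration with hypothetical names:

#include <type_traits>

struct Dim { int n ; };

// The integral_constant tag drives overload resolution only, so it is
// left unnamed; 'd' keeps its name because the body uses it.
template< unsigned TrivialScalarSize >
constexpr int first_extent( std::integral_constant<unsigned,TrivialScalarSize> const &
                          , const Dim & d )
{ return d.n ; }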

@ -2368,7 +2368,7 @@ struct ViewDataHandle< Traits ,

)>::type >
{
typedef typename Traits::value_type value_type ;
typedef typename Traits::value_type * KOKKOS_ALIGN_PTR(KOKKOS_ALIGN_SIZE) handle_type ;
typedef typename Traits::value_type * KOKKOS_IMPL_ALIGN_PTR(KOKKOS_MEMORY_ALIGNMENT) handle_type ;
typedef typename Traits::value_type & return_type ;
typedef Kokkos::Impl::SharedAllocationTracker track_type ;

@ -2376,7 +2376,7 @@ struct ViewDataHandle< Traits ,

static handle_type assign( value_type * arg_data_ptr
, track_type const & /*arg_tracker*/ )
{
if ( reinterpret_cast<uintptr_t>(arg_data_ptr) % KOKKOS_ALIGN_SIZE ) {
if ( reinterpret_cast<uintptr_t>(arg_data_ptr) % Impl::MEMORY_ALIGNMENT ) {
Kokkos::abort("Assigning NonAligned View or Pointer to Kokkos::View with Aligned attribute");
}
return handle_type( arg_data_ptr );

@ -2386,7 +2386,7 @@ struct ViewDataHandle< Traits ,

static handle_type assign( handle_type const arg_data_ptr
, size_t offset )
{
if ( reinterpret_cast<uintptr_t>(arg_data_ptr+offset) % KOKKOS_ALIGN_SIZE ) {
if ( reinterpret_cast<uintptr_t>(arg_data_ptr+offset) % Impl::MEMORY_ALIGNMENT ) {
Kokkos::abort("Assigning NonAligned View or Pointer to Kokkos::View with Aligned attribute");
}
return handle_type( arg_data_ptr + offset );

@ -2411,7 +2411,7 @@ struct ViewDataHandle< Traits ,

)>::type >
{
typedef typename Traits::value_type value_type ;
typedef typename Traits::value_type * KOKKOS_RESTRICT KOKKOS_ALIGN_PTR(KOKKOS_ALIGN_SIZE) handle_type ;
typedef typename Traits::value_type * KOKKOS_RESTRICT KOKKOS_IMPL_ALIGN_PTR(KOKKOS_MEMORY_ALIGNMENT) handle_type ;
typedef typename Traits::value_type & return_type ;
typedef Kokkos::Impl::SharedAllocationTracker track_type ;

@ -2419,7 +2419,7 @@ struct ViewDataHandle< Traits ,

static handle_type assign( value_type * arg_data_ptr
, track_type const & /*arg_tracker*/ )
{
if ( reinterpret_cast<uintptr_t>(arg_data_ptr) % KOKKOS_ALIGN_SIZE ) {
if ( reinterpret_cast<uintptr_t>(arg_data_ptr) % Impl::MEMORY_ALIGNMENT ) {
Kokkos::abort("Assigning NonAligned View or Pointer to Kokkos::View with Aligned attribute");
}
return handle_type( arg_data_ptr );

@ -2429,7 +2429,7 @@ struct ViewDataHandle< Traits ,

static handle_type assign( handle_type const arg_data_ptr
, size_t offset )
{
if ( reinterpret_cast<uintptr_t>(arg_data_ptr+offset) % KOKKOS_ALIGN_SIZE ) {
if ( reinterpret_cast<uintptr_t>(arg_data_ptr+offset) % Impl::MEMORY_ALIGNMENT ) {
Kokkos::abort("Assigning NonAligned View or Pointer to Kokkos::View with Aligned attribute");
}
return handle_type( arg_data_ptr + offset );
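
All four assign overloads guard the same invariant: a pointer handed to an Aligned view must sit on a MEMORY_ALIGNMENT boundary, including after the element offset is applied. A standalone sketch of that check, assuming a 64-byte boundary in place of Impl::MEMORY_ALIGNMENT and a hypothetical helper name:

#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <cstdlib>

constexpr uintptr_t MEMORY_ALIGNMENT = 64 ;   // stand-in for Impl::MEMORY_ALIGNMENT

template< class T >
T * checked_aligned_ptr( T * p , std::size_t offset = 0 )
{
  // Mirrors the guard in ViewDataHandle::assign: any nonzero remainder
  // means the (offset) pointer is not aligned for this view type.
  if ( reinterpret_cast<uintptr_t>( p + offset ) % MEMORY_ALIGNMENT ) {
    std::fprintf( stderr , "Assigning NonAligned View or Pointer to Kokkos::View with Aligned attribute\n" );
    std::abort();
  }
  return p + offset ;
}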

@ -2783,6 +2783,11 @@ public:

, m_offset( std::integral_constant< unsigned , 0 >() , arg_layout )
{}

/**\brief Assign data */
KOKKOS_INLINE_FUNCTION
void assign_data( pointer_type arg_ptr )
{ m_handle = handle_type( arg_ptr ); }

//----------------------------------------
/* Allocate and construct mapped array.
 * Allocate via shared allocation record and

@ -48,6 +48,7 @@

#include <algorithm>

#include <Kokkos_Macros.hpp>
#include <Kokkos_Core.hpp>
#include <Kokkos_hwloc.hpp>
#include <impl/Kokkos_Error.hpp>

@ -312,14 +313,18 @@ Sentinel::Sentinel()

hwloc_get_cpubind( s_hwloc_topology , s_process_binding , HWLOC_CPUBIND_PROCESS );

if ( hwloc_bitmap_iszero( s_process_binding ) ) {
std::cerr << "WARNING: Cannot detect process binding -- ASSUMING ALL processing units" << std::endl;
if ( Kokkos::show_warnings() ) {
std::cerr << "WARNING: Cannot detect process binding -- ASSUMING ALL processing units" << std::endl;
}
const int pu_depth = hwloc_get_type_depth( s_hwloc_topology, HWLOC_OBJ_PU );
int num_pu = 1;
if ( pu_depth != HWLOC_TYPE_DEPTH_UNKNOWN ) {
num_pu = hwloc_get_nbobjs_by_depth( s_hwloc_topology, pu_depth );
}
else {
std::cerr << "WARNING: Cannot detect number of processing units -- ASSUMING 1 (serial)." << std::endl;
if ( Kokkos::show_warnings() ) {
std::cerr << "WARNING: Cannot detect number of processing units -- ASSUMING 1 (serial)." << std::endl;
}
num_pu = 1;
}
hwloc_bitmap_set_range( s_process_binding, 0, num_pu-1);

@ -349,7 +354,7 @@ Sentinel::Sentinel()

hwloc_bitmap_free( s_process_no_core_zero );

if ( ! ok ) {
if ( Kokkos::show_warnings() && ! ok ) {
std::cerr << "WARNING: Kokkos::hwloc attempted and failed to move process off of core #0" << std::endl ;
}
}

@ -503,8 +508,8 @@ Sentinel::Sentinel()

hwloc_bitmap_free( proc_cpuset_location );

if ( ! symmetric ) {
std::cout << "Kokkos::hwloc WARNING: Using a symmetric subset of a non-symmetric core topology."
if ( Kokkos::show_warnings() && ! symmetric ) {
std::cerr << "Kokkos::hwloc WARNING: Using a symmetric subset of a non-symmetric core topology."
<< std::endl ;
}
}
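
All three Sentinel hunks apply the same change: diagnostics now go to std::cerr and are emitted only when Kokkos::show_warnings() allows it. A hedged sketch of the pattern; the stand-in query below keys off an environment variable, which is an assumption for illustration, not the real wiring of Kokkos::show_warnings():

#include <cstdlib>
#include <iostream>

// Illustrative stand-in for Kokkos::show_warnings(); the real query is
// wired to Kokkos initialization options.
inline bool show_warnings()
{
  static const bool enabled = ( std::getenv("KOKKOS_DISABLE_WARNINGS") == nullptr );
  return enabled ;
}

void warn_unknown_binding()
{
  if ( show_warnings() ) {
    std::cerr << "WARNING: Cannot detect process binding -- ASSUMING ALL processing units"
              << std::endl ;
  }
}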