Update Kokkos to v2.04.11

Stan Moore
2017-11-06 13:47:33 -07:00
parent 39df9f5d94
commit 16b5315845
117 changed files with 33239 additions and 1093 deletions

View File

@ -63,7 +63,7 @@
#include <typeinfo>
#endif
namespace Kokkos { namespace Experimental { namespace Impl {
namespace Kokkos { namespace Impl {
// ------------------------------------------------------------------ //
@ -110,21 +110,12 @@ struct apply_impl<2,RP,Functor,void >
{
// LL
if (RP::inner_direction == RP::Left) {
/*
index_type offset_1 = blockIdx.y*m_rp.m_tile[1] + threadIdx.y;
index_type offset_0 = blockIdx.x*m_rp.m_tile[0] + threadIdx.x;
for ( index_type j = offset_1; j < m_rp.m_upper[1], threadIdx.y < m_rp.m_tile[1]; j += (gridDim.y*m_rp.m_tile[1]) ) {
for ( index_type i = offset_0; i < m_rp.m_upper[0], threadIdx.x < m_rp.m_tile[0]; i += (gridDim.x*m_rp.m_tile[0]) ) {
m_func(i, j);
} }
*/
for ( index_type tile_id1 = blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) {
const index_type offset_1 = tile_id1*m_rp.m_tile[1] + threadIdx.y;
const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[1];
if ( offset_1 < m_rp.m_upper[1] && threadIdx.y < m_rp.m_tile[1] ) {
for ( index_type tile_id0 = blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) {
const index_type offset_0 = tile_id0*m_rp.m_tile[0] + threadIdx.x;
const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x + (index_type)m_rp.m_lower[0];
if ( offset_0 < m_rp.m_upper[0] && threadIdx.x < m_rp.m_tile[0] ) {
m_func(offset_0 , offset_1);
}
@ -134,21 +125,12 @@ struct apply_impl<2,RP,Functor,void >
}
// LR
else {
/*
index_type offset_1 = blockIdx.y*m_rp.m_tile[1] + threadIdx.y;
index_type offset_0 = blockIdx.x*m_rp.m_tile[0] + threadIdx.x;
for ( index_type i = offset_0; i < m_rp.m_upper[0], threadIdx.x < m_rp.m_tile[0]; i += (gridDim.x*m_rp.m_tile[0]) ) {
for ( index_type j = offset_1; j < m_rp.m_upper[1], threadIdx.y < m_rp.m_tile[1]; j += (gridDim.y*m_rp.m_tile[1]) ) {
m_func(i, j);
} }
*/
for ( index_type tile_id0 = blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) {
const index_type offset_0 = tile_id0*m_rp.m_tile[0] + threadIdx.x;
const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x + (index_type)m_rp.m_lower[0];
if ( offset_0 < m_rp.m_upper[0] && threadIdx.x < m_rp.m_tile[0] ) {
for ( index_type tile_id1 = blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) {
const index_type offset_1 = tile_id1*m_rp.m_tile[1] + threadIdx.y;
const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[1];
if ( offset_1 < m_rp.m_upper[1] && threadIdx.y < m_rp.m_tile[1] ) {
m_func(offset_0 , offset_1);
}
@ -182,21 +164,12 @@ struct apply_impl<2,RP,Functor,Tag>
{
if (RP::inner_direction == RP::Left) {
// Loop over size maxnumblocks until full range covered
/*
index_type offset_1 = blockIdx.y*m_rp.m_tile[1] + threadIdx.y;
index_type offset_0 = blockIdx.x*m_rp.m_tile[0] + threadIdx.x;
for ( index_type j = offset_1; j < m_rp.m_upper[1], threadIdx.y < m_rp.m_tile[1]; j += (gridDim.y*m_rp.m_tile[1]) ) {
for ( index_type i = offset_0; i < m_rp.m_upper[0], threadIdx.x < m_rp.m_tile[0]; i += (gridDim.x*m_rp.m_tile[0]) ) {
m_func(Tag(), i, j);
} }
*/
for ( index_type tile_id1 = blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) {
const index_type offset_1 = tile_id1*m_rp.m_tile[1] + threadIdx.y;
const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[1];
if ( offset_1 < m_rp.m_upper[1] && threadIdx.y < m_rp.m_tile[1] ) {
for ( index_type tile_id0 = blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) {
const index_type offset_0 = tile_id0*m_rp.m_tile[0] + threadIdx.x;
const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x + (index_type)m_rp.m_lower[0];
if ( offset_0 < m_rp.m_upper[0] && threadIdx.x < m_rp.m_tile[0] ) {
m_func(Tag(), offset_0 , offset_1);
}
@ -205,21 +178,12 @@ struct apply_impl<2,RP,Functor,Tag>
}
}
else {
/*
index_type offset_1 = blockIdx.y*m_rp.m_tile[1] + threadIdx.y;
index_type offset_0 = blockIdx.x*m_rp.m_tile[0] + threadIdx.x;
for ( index_type i = offset_0; i < m_rp.m_upper[0], threadIdx.x < m_rp.m_tile[0]; i += (gridDim.x*m_rp.m_tile[0]) ) {
for ( index_type j = offset_1; j < m_rp.m_upper[1], threadIdx.y < m_rp.m_tile[1]; j += (gridDim.y*m_rp.m_tile[1]) ) {
m_func(Tag(), i, j);
} }
*/
for ( index_type tile_id0 = blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) {
const index_type offset_0 = tile_id0*m_rp.m_tile[0] + threadIdx.x;
const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x + (index_type)m_rp.m_lower[0];
if ( offset_0 < m_rp.m_upper[0] && threadIdx.x < m_rp.m_tile[0] ) {
for ( index_type tile_id1 = blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) {
const index_type offset_1 = tile_id1*m_rp.m_tile[1] + threadIdx.y;
const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[1];
if ( offset_1 < m_rp.m_upper[1] && threadIdx.y < m_rp.m_tile[1] ) {
m_func(Tag(), offset_0 , offset_1);
}
@ -255,15 +219,15 @@ struct apply_impl<3,RP,Functor,void >
// LL
if (RP::inner_direction == RP::Left) {
for ( index_type tile_id2 = blockIdx.z; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.z ) {
const index_type offset_2 = tile_id2*m_rp.m_tile[2] + threadIdx.z;
const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[2];
if ( offset_2 < m_rp.m_upper[2] && threadIdx.z < m_rp.m_tile[2] ) {
for ( index_type tile_id1 = blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) {
const index_type offset_1 = tile_id1*m_rp.m_tile[1] + threadIdx.y;
const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[1];
if ( offset_1 < m_rp.m_upper[1] && threadIdx.y < m_rp.m_tile[1] ) {
for ( index_type tile_id0 = blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) {
const index_type offset_0 = tile_id0*m_rp.m_tile[0] + threadIdx.x;
const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x + (index_type)m_rp.m_lower[0];
if ( offset_0 < m_rp.m_upper[0] && threadIdx.x < m_rp.m_tile[0] ) {
m_func(offset_0 , offset_1 , offset_2);
}
@ -276,15 +240,15 @@ struct apply_impl<3,RP,Functor,void >
// LR
else {
for ( index_type tile_id0 = blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) {
const index_type offset_0 = tile_id0*m_rp.m_tile[0] + threadIdx.x;
const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x + (index_type)m_rp.m_lower[0];
if ( offset_0 < m_rp.m_upper[0] && threadIdx.x < m_rp.m_tile[0] ) {
for ( index_type tile_id1 = blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) {
const index_type offset_1 = tile_id1*m_rp.m_tile[1] + threadIdx.y;
const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[1];
if ( offset_1 < m_rp.m_upper[1] && threadIdx.y < m_rp.m_tile[1] ) {
for ( index_type tile_id2 = blockIdx.z; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.z ) {
const index_type offset_2 = tile_id2*m_rp.m_tile[2] + threadIdx.z;
const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[2];
if ( offset_2 < m_rp.m_upper[2] && threadIdx.z < m_rp.m_tile[2] ) {
m_func(offset_0 , offset_1 , offset_2);
}
@ -319,15 +283,15 @@ struct apply_impl<3,RP,Functor,Tag>
{
if (RP::inner_direction == RP::Left) {
for ( index_type tile_id2 = blockIdx.z; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.z ) {
const index_type offset_2 = tile_id2*m_rp.m_tile[2] + threadIdx.z;
const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[2];
if ( offset_2 < m_rp.m_upper[2] && threadIdx.z < m_rp.m_tile[2] ) {
for ( index_type tile_id1 = blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) {
const index_type offset_1 = tile_id1*m_rp.m_tile[1] + threadIdx.y;
const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[1];
if ( offset_1 < m_rp.m_upper[1] && threadIdx.y < m_rp.m_tile[1] ) {
for ( index_type tile_id0 = blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) {
const index_type offset_0 = tile_id0*m_rp.m_tile[0] + threadIdx.x;
const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x + (index_type)m_rp.m_lower[0];
if ( offset_0 < m_rp.m_upper[0] && threadIdx.x < m_rp.m_tile[0] ) {
m_func(Tag(), offset_0 , offset_1 , offset_2);
}
@ -339,15 +303,15 @@ struct apply_impl<3,RP,Functor,Tag>
}
else {
for ( index_type tile_id0 = blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) {
const index_type offset_0 = tile_id0*m_rp.m_tile[0] + threadIdx.x;
const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x + (index_type)m_rp.m_lower[0];
if ( offset_0 < m_rp.m_upper[0] && threadIdx.x < m_rp.m_tile[0] ) {
for ( index_type tile_id1 = blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) {
const index_type offset_1 = tile_id1*m_rp.m_tile[1] + threadIdx.y;
const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[1];
if ( offset_1 < m_rp.m_upper[1] && threadIdx.y < m_rp.m_tile[1] ) {
for ( index_type tile_id2 = blockIdx.z; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.z ) {
const index_type offset_2 = tile_id2*m_rp.m_tile[2] + threadIdx.z;
const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[2];
if ( offset_2 < m_rp.m_upper[2] && threadIdx.z < m_rp.m_tile[2] ) {
m_func(Tag(), offset_0 , offset_1 , offset_2);
}
@ -398,19 +362,19 @@ struct apply_impl<4,RP,Functor,void >
const index_type thr_id1 = threadIdx.x / m_rp.m_tile[0];
for ( index_type tile_id3 = blockIdx.z; tile_id3 < m_rp.m_tile_end[3]; tile_id3 += gridDim.z ) {
const index_type offset_3 = tile_id3*m_rp.m_tile[3] + threadIdx.z;
const index_type offset_3 = tile_id3*m_rp.m_tile[3] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[3];
if ( offset_3 < m_rp.m_upper[3] && threadIdx.z < m_rp.m_tile[3] ) {
for ( index_type tile_id2 = blockIdx.y; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.y ) {
const index_type offset_2 = tile_id2*m_rp.m_tile[2] + threadIdx.y;
const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[2];
if ( offset_2 < m_rp.m_upper[2] && threadIdx.y < m_rp.m_tile[2] ) {
for ( index_type j = tile_id1 ; j < m_rp.m_tile_end[1]; j += numbl1 ) {
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
for ( index_type i = tile_id0 ; i < m_rp.m_tile_end[0]; i += numbl0 ) {
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
m_func(offset_0 , offset_1 , offset_2 , offset_3);
}
@ -436,19 +400,19 @@ struct apply_impl<4,RP,Functor,void >
const index_type thr_id1 = threadIdx.x % m_rp.m_tile[1];
for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) {
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) {
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
for ( index_type tile_id2 = blockIdx.y; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.y ) {
const index_type offset_2 = tile_id2*m_rp.m_tile[2] + threadIdx.y;
const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[2];
if ( offset_2 < m_rp.m_upper[2] && threadIdx.y < m_rp.m_tile[2] ) {
for ( index_type tile_id3 = blockIdx.z; tile_id3 < m_rp.m_tile_end[3]; tile_id3 += gridDim.z ) {
const index_type offset_3 = tile_id3*m_rp.m_tile[3] + threadIdx.z;
const index_type offset_3 = tile_id3*m_rp.m_tile[3] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[3];
if ( offset_3 < m_rp.m_upper[3] && threadIdx.z < m_rp.m_tile[3] ) {
m_func(offset_0 , offset_1 , offset_2 , offset_3);
}
@ -498,19 +462,19 @@ struct apply_impl<4,RP,Functor,Tag>
const index_type thr_id1 = threadIdx.x / m_rp.m_tile[0];
for ( index_type tile_id3 = blockIdx.z; tile_id3 < m_rp.m_tile_end[3]; tile_id3 += gridDim.z ) {
const index_type offset_3 = tile_id3*m_rp.m_tile[3] + threadIdx.z;
const index_type offset_3 = tile_id3*m_rp.m_tile[3] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[3];
if ( offset_3 < m_rp.m_upper[3] && threadIdx.z < m_rp.m_tile[3] ) {
for ( index_type tile_id2 = blockIdx.y; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.y ) {
const index_type offset_2 = tile_id2*m_rp.m_tile[2] + threadIdx.y;
const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[2];
if ( offset_2 < m_rp.m_upper[2] && threadIdx.y < m_rp.m_tile[2] ) {
for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) {
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) {
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
m_func(Tag(), offset_0 , offset_1 , offset_2 , offset_3);
}
@ -535,19 +499,19 @@ struct apply_impl<4,RP,Functor,Tag>
const index_type thr_id1 = threadIdx.x % m_rp.m_tile[1];
for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) {
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) {
const index_type offset_1 = tile_id1*m_rp.m_tile[1] + thr_id1;
const index_type offset_1 = tile_id1*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
for ( index_type tile_id2 = blockIdx.y; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.y ) {
const index_type offset_2 = tile_id2*m_rp.m_tile[2] + threadIdx.y;
const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[2];
if ( offset_2 < m_rp.m_upper[2] && threadIdx.y < m_rp.m_tile[2] ) {
for ( index_type tile_id3 = blockIdx.z; tile_id3 < m_rp.m_tile_end[3]; tile_id3 += gridDim.z ) {
const index_type offset_3 = tile_id3*m_rp.m_tile[3] + threadIdx.z;
const index_type offset_3 = tile_id3*m_rp.m_tile[3] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[3];
if ( offset_3 < m_rp.m_upper[3] && threadIdx.z < m_rp.m_tile[3] ) {
m_func(Tag() , offset_0 , offset_1 , offset_2 , offset_3);
}
@ -612,23 +576,23 @@ struct apply_impl<5,RP,Functor,void >
const index_type thr_id3 = threadIdx.y / m_rp.m_tile[2];
for ( index_type tile_id4 = blockIdx.z; tile_id4 < m_rp.m_tile_end[4]; tile_id4 += gridDim.z ) {
const index_type offset_4 = tile_id4*m_rp.m_tile[4] + threadIdx.z;
const index_type offset_4 = tile_id4*m_rp.m_tile[4] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[4];
if ( offset_4 < m_rp.m_upper[4] && threadIdx.z < m_rp.m_tile[4] ) {
for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) {
const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3;
const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3];
if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) {
for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) {
const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2;
const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2];
if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) {
for ( index_type j = tile_id1 ; j < m_rp.m_tile_end[1]; j += numbl1 ) {
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
for ( index_type i = tile_id0 ; i < m_rp.m_tile_end[0]; i += numbl0 ) {
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
m_func(offset_0 , offset_1 , offset_2 , offset_3, offset_4);
}
@ -667,23 +631,23 @@ struct apply_impl<5,RP,Functor,void >
const index_type thr_id3 = threadIdx.y % m_rp.m_tile[3];
for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) {
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) {
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) {
const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2;
const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2];
if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) {
for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) {
const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3;
const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3];
if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) {
for ( index_type tile_id4 = blockIdx.z; tile_id4 < m_rp.m_tile_end[4]; tile_id4 += gridDim.z ) {
const index_type offset_4 = tile_id4*m_rp.m_tile[4] + threadIdx.z;
const index_type offset_4 = tile_id4*m_rp.m_tile[4] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[4];
if ( offset_4 < m_rp.m_upper[4] && threadIdx.z < m_rp.m_tile[4] ) {
m_func(offset_0 , offset_1 , offset_2 , offset_3 , offset_4);
}
@ -747,23 +711,23 @@ struct apply_impl<5,RP,Functor,Tag>
const index_type thr_id3 = threadIdx.y / m_rp.m_tile[2];
for ( index_type tile_id4 = blockIdx.z; tile_id4 < m_rp.m_tile_end[4]; tile_id4 += gridDim.z ) {
const index_type offset_4 = tile_id4*m_rp.m_tile[4] + threadIdx.z;
const index_type offset_4 = tile_id4*m_rp.m_tile[4] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[4];
if ( offset_4 < m_rp.m_upper[4] && threadIdx.z < m_rp.m_tile[4] ) {
for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) {
const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3;
const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3];
if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) {
for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) {
const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2;
const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2];
if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) {
for ( index_type j = tile_id1 ; j < m_rp.m_tile_end[1]; j += numbl1 ) {
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
for ( index_type i = tile_id0 ; i < m_rp.m_tile_end[0]; i += numbl0 ) {
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
m_func(Tag() , offset_0 , offset_1 , offset_2 , offset_3, offset_4);
}
@ -802,23 +766,23 @@ struct apply_impl<5,RP,Functor,Tag>
const index_type thr_id3 = threadIdx.y % m_rp.m_tile[3];
for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) {
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) {
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) {
const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2;
const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2];
if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) {
for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) {
const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3;
const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3];
if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) {
for ( index_type tile_id4 = blockIdx.z; tile_id4 < m_rp.m_tile_end[4]; tile_id4 += gridDim.z ) {
const index_type offset_4 = tile_id4*m_rp.m_tile[4] + threadIdx.z;
const index_type offset_4 = tile_id4*m_rp.m_tile[4] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[4];
if ( offset_4 < m_rp.m_upper[4] && threadIdx.z < m_rp.m_tile[4] ) {
m_func(Tag() , offset_0 , offset_1 , offset_2 , offset_3 , offset_4);
}
@ -895,27 +859,27 @@ struct apply_impl<6,RP,Functor,void >
const index_type thr_id5 = threadIdx.z / m_rp.m_tile[4];
for ( index_type n = tile_id5; n < m_rp.m_tile_end[5]; n += numbl5 ) {
const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5;
const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5 + (index_type)m_rp.m_lower[5];
if ( offset_5 < m_rp.m_upper[5] && thr_id5 < m_rp.m_tile[5] ) {
for ( index_type m = tile_id4; m < m_rp.m_tile_end[4]; m += numbl4 ) {
const index_type offset_4 = m*m_rp.m_tile[4] + thr_id4;
const index_type offset_4 = m*m_rp.m_tile[4] + thr_id4 + (index_type)m_rp.m_lower[4];
if ( offset_4 < m_rp.m_upper[4] && thr_id4 < m_rp.m_tile[4] ) {
for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) {
const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3;
const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3];
if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) {
for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) {
const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2;
const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2];
if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) {
for ( index_type j = tile_id1 ; j < m_rp.m_tile_end[1]; j += numbl1 ) {
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
for ( index_type i = tile_id0 ; i < m_rp.m_tile_end[0]; i += numbl0 ) {
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
m_func(offset_0 , offset_1 , offset_2 , offset_3, offset_4, offset_5);
}
@ -967,27 +931,27 @@ struct apply_impl<6,RP,Functor,void >
const index_type thr_id5 = threadIdx.z % m_rp.m_tile[5];
for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) {
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) {
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) {
const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2;
const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2];
if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) {
for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) {
const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3;
const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3];
if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) {
for ( index_type m = tile_id4; m < m_rp.m_tile_end[4]; m += numbl4 ) {
const index_type offset_4 = m*m_rp.m_tile[4] + thr_id4;
const index_type offset_4 = m*m_rp.m_tile[4] + thr_id4 + (index_type)m_rp.m_lower[4];
if ( offset_4 < m_rp.m_upper[4] && thr_id4 < m_rp.m_tile[4] ) {
for ( index_type n = tile_id5; n < m_rp.m_tile_end[5]; n += numbl5 ) {
const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5;
const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5 + (index_type)m_rp.m_lower[5];
if ( offset_5 < m_rp.m_upper[5] && thr_id5 < m_rp.m_tile[5] ) {
m_func(offset_0 , offset_1 , offset_2 , offset_3 , offset_4 , offset_5);
}
@ -1064,27 +1028,27 @@ struct apply_impl<6,RP,Functor,Tag>
const index_type thr_id5 = threadIdx.z / m_rp.m_tile[4];
for ( index_type n = tile_id5; n < m_rp.m_tile_end[5]; n += numbl5 ) {
const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5;
const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5 + (index_type)m_rp.m_lower[5];
if ( offset_5 < m_rp.m_upper[5] && thr_id5 < m_rp.m_tile[5] ) {
for ( index_type m = tile_id4; m < m_rp.m_tile_end[4]; m += numbl4 ) {
const index_type offset_4 = m*m_rp.m_tile[4] + thr_id4;
const index_type offset_4 = m*m_rp.m_tile[4] + thr_id4 + (index_type)m_rp.m_lower[4];
if ( offset_4 < m_rp.m_upper[4] && thr_id4 < m_rp.m_tile[4] ) {
for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) {
const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3;
const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3];
if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) {
for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) {
const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2;
const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2];
if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) {
for ( index_type j = tile_id1 ; j < m_rp.m_tile_end[1]; j += numbl1 ) {
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
for ( index_type i = tile_id0 ; i < m_rp.m_tile_end[0]; i += numbl0 ) {
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
m_func(Tag() , offset_0 , offset_1 , offset_2 , offset_3, offset_4, offset_5);
}
@ -1136,27 +1100,27 @@ struct apply_impl<6,RP,Functor,Tag>
const index_type thr_id5 = threadIdx.z % m_rp.m_tile[5];
for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) {
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) {
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) {
const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2;
const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2];
if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) {
for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) {
const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3;
const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3];
if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) {
for ( index_type m = tile_id4; m < m_rp.m_tile_end[4]; m += numbl4 ) {
const index_type offset_4 = m*m_rp.m_tile[4] + thr_id4;
const index_type offset_4 = m*m_rp.m_tile[4] + thr_id4 + (index_type)m_rp.m_lower[4];
if ( offset_4 < m_rp.m_upper[4] && thr_id4 < m_rp.m_tile[4] ) {
for ( index_type n = tile_id5; n < m_rp.m_tile_end[5]; n += numbl5 ) {
const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5;
const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5 + (index_type)m_rp.m_lower[5];
if ( offset_5 < m_rp.m_upper[5] && thr_id5 < m_rp.m_tile[5] ) {
m_func(Tag() , offset_0 , offset_1 , offset_2 , offset_3 , offset_4 , offset_5);
}
@ -1292,7 +1256,7 @@ protected:
const Functor m_func;
};
} } } //end namespace Kokkos::Experimental::Impl
} } //end namespace Kokkos::Impl
#endif
#endif
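
The (index_type)m_rp.m_lower[*] terms added throughout this file make the device-side tile iteration honor a non-zero lower bound of the MDRangePolicy rather than silently assuming zero. A minimal user-side sketch of what that enables (names are illustrative; assumes a build with lambda support on the Cuda backend):

#include <Kokkos_Core.hpp>

int main(int argc, char* argv[]) {
  Kokkos::initialize(argc, argv);
  {
    const int N0 = 100, N1 = 80;
    Kokkos::View<double**> a("a", N0, N1);
    // 2-D range with lower bounds {2,3}: with the offsets above, i runs over
    // [2,N0) and j over [3,N1) on the device, matching the host backends.
    Kokkos::parallel_for(
      Kokkos::MDRangePolicy<Kokkos::Rank<2>>({{2, 3}}, {{N0, N1}}),
      KOKKOS_LAMBDA(const int i, const int j) { a(i, j) = 1.0 * i + j; });
  }
  Kokkos::finalize();
  return 0;
}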

View File

@ -63,7 +63,7 @@
#include <typeinfo>
#endif
namespace Kokkos { namespace Experimental { namespace Impl {
namespace Kokkos { namespace Impl {
namespace Refactor {
@ -2709,7 +2709,7 @@ private:
// ----------------------------------------------------------------------------------
} } } //end namespace Kokkos::Experimental::Impl
} } //end namespace Kokkos::Impl
#endif
#endif

View File

@ -164,7 +164,7 @@ static void cuda_parallel_launch_constant_memory()
template< class DriverType, unsigned int maxTperB, unsigned int minBperSM >
__global__
//__launch_bounds__(maxTperB, minBperSM)
__launch_bounds__(maxTperB, minBperSM)
static void cuda_parallel_launch_constant_memory()
{
const DriverType & driver =
@ -182,7 +182,7 @@ static void cuda_parallel_launch_local_memory( const DriverType driver )
template< class DriverType, unsigned int maxTperB, unsigned int minBperSM >
__global__
//__launch_bounds__(maxTperB, minBperSM)
__launch_bounds__(maxTperB, minBperSM)
static void cuda_parallel_launch_local_memory( const DriverType driver )
{
driver();
@ -193,9 +193,14 @@ template < class DriverType
, bool Large = ( CudaTraits::ConstantMemoryUseThreshold < sizeof(DriverType) ) >
struct CudaParallelLaunch ;
template < class DriverType, class LaunchBounds >
struct CudaParallelLaunch< DriverType, LaunchBounds, true > {
template < class DriverType
, unsigned int MaxThreadsPerBlock
, unsigned int MinBlocksPerSM >
struct CudaParallelLaunch< DriverType
, Kokkos::LaunchBounds< MaxThreadsPerBlock
, MinBlocksPerSM >
, true >
{
inline
CudaParallelLaunch( const DriverType & driver
, const dim3 & grid
@ -216,21 +221,28 @@ struct CudaParallelLaunch< DriverType, LaunchBounds, true > {
if ( CudaTraits::SharedMemoryCapacity < shmem ) {
Kokkos::Impl::throw_runtime_exception( std::string("CudaParallelLaunch FAILED: shared memory request is too large") );
}
#ifndef KOKKOS_ARCH_KEPLER //On Kepler the L1 has no benefit since it doesn't cache reads
else if ( shmem ) {
CUDA_SAFE_CALL( cudaFuncSetCacheConfig( cuda_parallel_launch_constant_memory< DriverType, LaunchBounds::maxTperB, LaunchBounds::minBperSM > , cudaFuncCachePreferShared ) );
} else {
CUDA_SAFE_CALL( cudaFuncSetCacheConfig( cuda_parallel_launch_constant_memory< DriverType, LaunchBounds::maxTperB, LaunchBounds::minBperSM > , cudaFuncCachePreferL1 ) );
#ifndef KOKKOS_ARCH_KEPLER
// On Kepler the L1 has no benefit since it doesn't cache reads
else {
CUDA_SAFE_CALL(
cudaFuncSetCacheConfig
( cuda_parallel_launch_constant_memory
< DriverType, MaxThreadsPerBlock, MinBlocksPerSM >
, ( shmem ? cudaFuncCachePreferShared : cudaFuncCachePreferL1 )
) );
}
#endif
// Copy functor to constant memory on the device
cudaMemcpyToSymbol( kokkos_impl_cuda_constant_memory_buffer , & driver , sizeof(DriverType) );
cudaMemcpyToSymbol(
kokkos_impl_cuda_constant_memory_buffer, &driver, sizeof(DriverType) );
KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE();
// Invoke the driver function on the device
cuda_parallel_launch_constant_memory< DriverType, LaunchBounds::maxTperB, LaunchBounds::minBperSM ><<< grid , block , shmem , stream >>>();
cuda_parallel_launch_constant_memory
< DriverType, MaxThreadsPerBlock, MinBlocksPerSM >
<<< grid , block , shmem , stream >>>();
#if defined( KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK )
CUDA_SAFE_CALL( cudaGetLastError() );
@ -240,9 +252,11 @@ struct CudaParallelLaunch< DriverType, LaunchBounds, true > {
}
};
template < class DriverType, class LaunchBounds >
struct CudaParallelLaunch< DriverType, LaunchBounds, false > {
template < class DriverType >
struct CudaParallelLaunch< DriverType
, Kokkos::LaunchBounds<>
, true >
{
inline
CudaParallelLaunch( const DriverType & driver
, const dim3 & grid
@ -252,20 +266,136 @@ struct CudaParallelLaunch< DriverType, LaunchBounds, false > {
{
if ( grid.x && ( block.x * block.y * block.z ) ) {
if ( sizeof( Kokkos::Impl::CudaTraits::ConstantGlobalBufferType ) <
sizeof( DriverType ) ) {
Kokkos::Impl::throw_runtime_exception( std::string("CudaParallelLaunch FAILED: Functor is too large") );
}
// Fence before changing settings and copying closure
Kokkos::Cuda::fence();
if ( CudaTraits::SharedMemoryCapacity < shmem ) {
Kokkos::Impl::throw_runtime_exception( std::string("CudaParallelLaunch FAILED: shared memory request is too large") );
}
#ifndef KOKKOS_ARCH_KEPLER //On Kepler the L1 has no benefit since it doesn't cache reads
else if ( shmem ) {
CUDA_SAFE_CALL( cudaFuncSetCacheConfig( cuda_parallel_launch_local_memory< DriverType, LaunchBounds::maxTperB, LaunchBounds::minBperSM > , cudaFuncCachePreferShared ) );
} else {
CUDA_SAFE_CALL( cudaFuncSetCacheConfig( cuda_parallel_launch_local_memory< DriverType, LaunchBounds::maxTperB, LaunchBounds::minBperSM > , cudaFuncCachePreferL1 ) );
#ifndef KOKKOS_ARCH_KEPLER
// On Kepler the L1 has no benefit since it doesn't cache reads
else {
CUDA_SAFE_CALL(
cudaFuncSetCacheConfig
( cuda_parallel_launch_constant_memory< DriverType >
, ( shmem ? cudaFuncCachePreferShared : cudaFuncCachePreferL1 )
) );
}
#endif
// Copy functor to constant memory on the device
cudaMemcpyToSymbol(
kokkos_impl_cuda_constant_memory_buffer, &driver, sizeof(DriverType) );
KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE();
// Invoke the driver function on the device
cuda_parallel_launch_constant_memory< DriverType >
<<< grid , block , shmem , stream >>>();
#if defined( KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK )
CUDA_SAFE_CALL( cudaGetLastError() );
Kokkos::Cuda::fence();
#endif
}
}
};
template < class DriverType
, unsigned int MaxThreadsPerBlock
, unsigned int MinBlocksPerSM >
struct CudaParallelLaunch< DriverType
, Kokkos::LaunchBounds< MaxThreadsPerBlock
, MinBlocksPerSM >
, false >
{
inline
CudaParallelLaunch( const DriverType & driver
, const dim3 & grid
, const dim3 & block
, const int shmem
, const cudaStream_t stream = 0 )
{
if ( grid.x && ( block.x * block.y * block.z ) ) {
if ( sizeof( Kokkos::Impl::CudaTraits::ConstantGlobalBufferType ) <
sizeof( DriverType ) ) {
Kokkos::Impl::throw_runtime_exception( std::string("CudaParallelLaunch FAILED: Functor is too large") );
}
if ( CudaTraits::SharedMemoryCapacity < shmem ) {
Kokkos::Impl::throw_runtime_exception( std::string("CudaParallelLaunch FAILED: shared memory request is too large") );
}
#ifndef KOKKOS_ARCH_KEPLER
// On Kepler the L1 has no benefit since it doesn't cache reads
else {
CUDA_SAFE_CALL(
cudaFuncSetCacheConfig
( cuda_parallel_launch_local_memory
< DriverType, MaxThreadsPerBlock, MinBlocksPerSM >
, ( shmem ? cudaFuncCachePreferShared : cudaFuncCachePreferL1 )
) );
}
#endif
KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE();
cuda_parallel_launch_local_memory< DriverType, LaunchBounds::maxTperB, LaunchBounds::minBperSM ><<< grid , block , shmem , stream >>>( driver );
// Invoke the driver function on the device
cuda_parallel_launch_local_memory
< DriverType, MaxThreadsPerBlock, MinBlocksPerSM >
<<< grid , block , shmem , stream >>>( driver );
#if defined( KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK )
CUDA_SAFE_CALL( cudaGetLastError() );
Kokkos::Cuda::fence();
#endif
}
}
};
template < class DriverType >
struct CudaParallelLaunch< DriverType
, Kokkos::LaunchBounds<>
, false >
{
inline
CudaParallelLaunch( const DriverType & driver
, const dim3 & grid
, const dim3 & block
, const int shmem
, const cudaStream_t stream = 0 )
{
if ( grid.x && ( block.x * block.y * block.z ) ) {
if ( sizeof( Kokkos::Impl::CudaTraits::ConstantGlobalBufferType ) <
sizeof( DriverType ) ) {
Kokkos::Impl::throw_runtime_exception( std::string("CudaParallelLaunch FAILED: Functor is too large") );
}
if ( CudaTraits::SharedMemoryCapacity < shmem ) {
Kokkos::Impl::throw_runtime_exception( std::string("CudaParallelLaunch FAILED: shared memory request is too large") );
}
#ifndef KOKKOS_ARCH_KEPLER
// On Kepler the L1 has no benefit since it doesn't cache reads
else {
CUDA_SAFE_CALL(
cudaFuncSetCacheConfig
( cuda_parallel_launch_local_memory< DriverType >
, ( shmem ? cudaFuncCachePreferShared : cudaFuncCachePreferL1 )
) );
}
#endif
KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE();
// Invoke the driver function on the device
cuda_parallel_launch_local_memory< DriverType >
<<< grid , block , shmem , stream >>>( driver );
#if defined( KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK )
CUDA_SAFE_CALL( cudaGetLastError() );
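
A standalone CUDA sketch (names are illustrative, not Kokkos code) of the pattern the hunks above implement: the policy's LaunchBounds values become compile-time template parameters of the launch kernel, so each instantiation carries its own __launch_bounds__ annotation instead of a commented-out one:

#include <cuda_runtime.h>

// Occupancy hints as template parameters, mirroring
// cuda_parallel_launch_local_memory<DriverType,maxTperB,minBperSM>.
template <class Driver, unsigned int MaxTperB, unsigned int MinBperSM>
__global__
__launch_bounds__(MaxTperB, MinBperSM)
void launch_driver(const Driver driver) { driver(); }

struct SaxpyDriver {
  float a; const float* x; float* y; int n;
  __device__ void operator()() const {
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) y[i] = a * x[i] + y[i];
  }
};

void run_saxpy(float a, const float* x, float* y, int n) {
  const dim3 block(256), grid((n + 255) / 256);
  // At most 256 threads per block, at least 4 resident blocks per SM.
  launch_driver<SaxpyDriver, 256, 4><<<grid, block>>>(SaxpyDriver{a, x, y, n});
}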

View File

@ -713,7 +713,7 @@ SharedAllocationRecord< Kokkos::CudaHostPinnedSpace , void >::get_record( void *
// Iterate records to print orphaned memory ...
void
SharedAllocationRecord< Kokkos::CudaSpace , void >::
print_records( std::ostream & s , const Kokkos::CudaSpace & space , bool detail )
print_records( std::ostream & s , const Kokkos::CudaSpace & , bool detail )
{
SharedAllocationRecord< void , void > * r = & s_root_record ;
@ -751,7 +751,7 @@ print_records( std::ostream & s , const Kokkos::CudaSpace & space , bool detail
, reinterpret_cast<uintptr_t>( r->m_dealloc )
, head.m_label
);
std::cout << buffer ;
s << buffer ;
r = r->m_next ;
} while ( r != & s_root_record );
}
@ -781,7 +781,7 @@ print_records( std::ostream & s , const Kokkos::CudaSpace & space , bool detail
else {
snprintf( buffer , 256 , "Cuda [ 0 + 0 ]\n" );
}
std::cout << buffer ;
s << buffer ;
r = r->m_next ;
} while ( r != & s_root_record );
}
@ -789,14 +789,14 @@ print_records( std::ostream & s , const Kokkos::CudaSpace & space , bool detail
void
SharedAllocationRecord< Kokkos::CudaUVMSpace , void >::
print_records( std::ostream & s , const Kokkos::CudaUVMSpace & space , bool detail )
print_records( std::ostream & s , const Kokkos::CudaUVMSpace & , bool detail )
{
SharedAllocationRecord< void , void >::print_host_accessible_records( s , "CudaUVM" , & s_root_record , detail );
}
void
SharedAllocationRecord< Kokkos::CudaHostPinnedSpace , void >::
print_records( std::ostream & s , const Kokkos::CudaHostPinnedSpace & space , bool detail )
print_records( std::ostream & s , const Kokkos::CudaHostPinnedSpace & , bool detail )
{
SharedAllocationRecord< void , void >::print_host_accessible_records( s , "CudaHostPinned" , & s_root_record , detail );
}
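
A usage sketch of the repaired routine (hedged: assumes a build where allocation tracking is active and the SharedAllocationRecord declarations are visible through the Cuda headers). With the s << buffer fixes above, the report actually lands in the caller-supplied stream instead of leaking to std::cout:

#include <Kokkos_Core.hpp>
#include <sstream>
#include <iostream>

void dump_cuda_allocations() {
  std::ostringstream report;
  Kokkos::Impl::SharedAllocationRecord<Kokkos::CudaSpace, void>::
    print_records(report, Kokkos::CudaSpace(), /*detail=*/true);
  std::cout << report.str();   // caller decides where the text ultimately goes
}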

View File

@ -421,7 +421,7 @@ void CudaInternal::initialize( int cuda_device_id , int stream_count )
std::string msg = ss.str();
Kokkos::abort( msg.c_str() );
}
if ( compiled_major != cudaProp.major || compiled_minor != cudaProp.minor ) {
if ( Kokkos::show_warnings() && (compiled_major != cudaProp.major || compiled_minor != cudaProp.minor) ) {
std::cerr << "Kokkos::Cuda::initialize WARNING: running kernels compiled for compute capability "
<< compiled_major << "." << compiled_minor
<< " on device with compute capability "
@ -467,7 +467,7 @@ void CudaInternal::initialize( int cuda_device_id , int stream_count )
m_scratchUnifiedSupported = cudaProp.unifiedAddressing ;
if ( ! m_scratchUnifiedSupported ) {
if ( Kokkos::show_warnings() && ! m_scratchUnifiedSupported ) {
std::cout << "Kokkos::Cuda device "
<< cudaProp.name << " capability "
<< cudaProp.major << "." << cudaProp.minor
@ -545,7 +545,7 @@ void CudaInternal::initialize( int cuda_device_id , int stream_count )
}
#ifdef KOKKOS_ENABLE_CUDA_UVM
if(!cuda_launch_blocking()) {
if( Kokkos::show_warnings() && !cuda_launch_blocking() ) {
std::cout << "Kokkos::Cuda::initialize WARNING: Cuda is allocating into UVMSpace by default" << std::endl;
std::cout << " without setting CUDA_LAUNCH_BLOCKING=1." << std::endl;
std::cout << " The code must call Cuda::fence() after each kernel" << std::endl;
@ -561,7 +561,7 @@ void CudaInternal::initialize( int cuda_device_id , int stream_count )
bool visible_devices_one=true;
if (env_visible_devices == 0) visible_devices_one=false;
if(!visible_devices_one && !force_device_alloc) {
if( Kokkos::show_warnings() && (!visible_devices_one && !force_device_alloc) ) {
std::cout << "Kokkos::Cuda::initialize WARNING: Cuda is allocating into UVMSpace by default" << std::endl;
std::cout << " without setting CUDA_MANAGED_FORCE_DEVICE_ALLOC=1 or " << std::endl;
std::cout << " setting CUDA_VISIBLE_DEVICES." << std::endl;

View File

@ -381,12 +381,12 @@ public:
// MDRangePolicy impl
template< class FunctorType , class ... Traits >
class ParallelFor< FunctorType
, Kokkos::Experimental::MDRangePolicy< Traits ... >
, Kokkos::MDRangePolicy< Traits ... >
, Kokkos::Cuda
>
{
private:
typedef Kokkos::Experimental::MDRangePolicy< Traits ... > Policy ;
typedef Kokkos::MDRangePolicy< Traits ... > Policy ;
using RP = Policy;
typedef typename Policy::array_index_type array_index_type;
typedef typename Policy::index_type index_type;
@ -402,7 +402,7 @@ public:
__device__
void operator()(void) const
{
Kokkos::Experimental::Impl::Refactor::DeviceIterateTile<Policy::rank,Policy,FunctorType,typename Policy::work_tag>(m_rp,m_functor).exec_range();
Kokkos::Impl::Refactor::DeviceIterateTile<Policy::rank,Policy,FunctorType,typename Policy::work_tag>(m_rp,m_functor).exec_range();
}
@ -858,14 +858,14 @@ public:
// MDRangePolicy impl
template< class FunctorType , class ReducerType, class ... Traits >
class ParallelReduce< FunctorType
, Kokkos::Experimental::MDRangePolicy< Traits ... >
, Kokkos::MDRangePolicy< Traits ... >
, ReducerType
, Kokkos::Cuda
>
{
private:
typedef Kokkos::Experimental::MDRangePolicy< Traits ... > Policy ;
typedef Kokkos::MDRangePolicy< Traits ... > Policy ;
typedef typename Policy::array_index_type array_index_type;
typedef typename Policy::index_type index_type;
@ -898,7 +898,7 @@ public:
size_type * m_scratch_flags ;
size_type * m_unified_space ;
typedef typename Kokkos::Experimental::Impl::Reduce::DeviceIterateTile<Policy::rank, Policy, FunctorType, typename Policy::work_tag, reference_type> DeviceIteratePattern;
typedef typename Kokkos::Impl::Reduce::DeviceIterateTile<Policy::rank, Policy, FunctorType, typename Policy::work_tag, reference_type> DeviceIteratePattern;
// Shall we use the shfl based reduction or not (only use it for static sized types of more than 128bit
enum { UseShflReduction = ((sizeof(value_type)>2*sizeof(double)) && ValueTraits::StaticValueSize) };
@ -913,7 +913,7 @@ public:
void
exec_range( reference_type update ) const
{
Kokkos::Experimental::Impl::Reduce::DeviceIterateTile<Policy::rank,Policy,FunctorType,typename Policy::work_tag, reference_type>(m_policy, m_functor, update).exec_range();
Kokkos::Impl::Reduce::DeviceIterateTile<Policy::rank,Policy,FunctorType,typename Policy::work_tag, reference_type>(m_policy, m_functor, update).exec_range();
}
inline
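
The specialization above now matches the non-Experimental policy spelling; a minimal user-side sketch (names are illustrative; assumes lambda support on the Cuda backend):

#include <Kokkos_Core.hpp>

double sum_entries(const Kokkos::View<double**>& a, const int N0, const int N1) {
  double total = 0.0;
  // Kokkos::MDRangePolicy (no Experimental namespace) now dispatches to the
  // ParallelReduce specialization shown above on Cuda.
  Kokkos::parallel_reduce(
    Kokkos::MDRangePolicy<Kokkos::Rank<2>>({{0, 0}}, {{N0, N1}}),
    KOKKOS_LAMBDA(const int i, const int j, double& lsum) { lsum += a(i, j); },
    total);
  return total;
}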

View File

@ -127,11 +127,11 @@ struct CudaTextureFetch {
template< class CudaMemorySpace >
inline explicit
CudaTextureFetch( const ValueType * const arg_ptr
, Kokkos::Experimental::Impl::SharedAllocationRecord< CudaMemorySpace , void > & record
, Kokkos::Impl::SharedAllocationRecord< CudaMemorySpace , void > * record
)
: m_obj( record.template attach_texture_object< AliasType >() )
: m_obj( record->template attach_texture_object< AliasType >() )
, m_ptr( arg_ptr )
, m_offset( record.attach_texture_object_offset( reinterpret_cast<const AliasType*>( arg_ptr ) ) )
, m_offset( record->attach_texture_object_offset( reinterpret_cast<const AliasType*>( arg_ptr ) ) )
{}
// Texture object spans the entire allocation.
@ -199,8 +199,8 @@ struct CudaLDGFetch {
template< class CudaMemorySpace >
inline explicit
CudaLDGFetch( const ValueType * const arg_ptr
, Kokkos::Experimental::Impl::SharedAllocationRecord< CudaMemorySpace , void > const &
)
, Kokkos::Impl::SharedAllocationRecord<CudaMemorySpace,void>*
)
: m_ptr( arg_ptr )
{}
@ -285,7 +285,21 @@ public:
// Assignment of texture = non-texture requires creation of a texture object
// which can only occur on the host. In addition, 'get_record' is only valid
// if called in a host execution space
return handle_type( arg_data_ptr , arg_tracker.template get_record< typename Traits::memory_space >() );
typedef typename Traits::memory_space memory_space ;
typedef typename Impl::SharedAllocationRecord<memory_space,void> record ;
record * const r = arg_tracker.template get_record< memory_space >();
#if ! defined( KOKKOS_ENABLE_CUDA_LDG_INTRINSIC )
if ( 0 == r ) {
Kokkos::abort("Cuda const random access View using Cuda texture memory requires Kokkos to allocate the View's memory");
}
#endif
return handle_type( arg_data_ptr , r );
#else
Kokkos::Impl::cuda_abort("Cannot create Cuda texture object from within a Cuda kernel");
return handle_type();
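
The new abort documents a user-side requirement of the texture path: the handle is built from the allocation's tracking record, so the source View must be Kokkos-allocated. A minimal sketch (assumes KOKKOS_ENABLE_CUDA_LDG_INTRINSIC is not defined, i.e. the texture-object branch above is taken):

#include <Kokkos_Core.hpp>

void make_random_access_copy() {
  Kokkos::View<const double*, Kokkos::CudaSpace,
               Kokkos::MemoryTraits<Kokkos::RandomAccess>> cv;
  Kokkos::View<double*, Kokkos::CudaSpace> v("v", 1000);  // Kokkos-allocated
  cv = v;  // OK: a tracking record exists, so a texture object can be attached
  // An unmanaged View wrapping a raw device pointer has no record and would
  // hit the Kokkos::abort path above instead.
}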

View File

@ -48,50 +48,52 @@ namespace Kokkos {
namespace Impl {
template< class FunctorType , class ... Traits >
class ParallelFor< FunctorType ,
Kokkos::Experimental::WorkGraphPolicy< Traits ... > ,
Kokkos::Cuda
class ParallelFor< FunctorType
, Kokkos::WorkGraphPolicy< Traits ... >
, Kokkos::Cuda
>
: public Kokkos::Impl::Experimental::
WorkGraphExec< FunctorType,
Kokkos::Cuda,
Traits ...
>
{
public:
typedef Kokkos::Experimental::WorkGraphPolicy< Traits ... > Policy ;
typedef Kokkos::Impl::Experimental::
WorkGraphExec<FunctorType, Kokkos::Cuda, Traits ... > Base ;
typedef Kokkos::WorkGraphPolicy< Traits ... > Policy ;
typedef ParallelFor<FunctorType, Policy, Kokkos::Cuda> Self ;
private:
template< class TagType >
__device__
typename std::enable_if< std::is_same< TagType , void >::value >::type
exec_one(const typename Policy::member_type& i) const {
Base::m_functor( i );
}
Policy m_policy ;
FunctorType m_functor ;
template< class TagType >
__device__
__device__ inline
typename std::enable_if< std::is_same< TagType , void >::value >::type
exec_one( const std::int32_t w ) const noexcept
{ m_functor( w ); }
template< class TagType >
__device__ inline
typename std::enable_if< ! std::is_same< TagType , void >::value >::type
exec_one(const typename Policy::member_type& i) const {
const TagType t{} ;
Base::m_functor( t , i );
}
exec_one( const std::int32_t w ) const noexcept
{ const TagType t{} ; m_functor( t , w ); }
public:
__device__
inline
void operator()() const {
for (std::int32_t i; (-1 != (i = Base::before_work())); ) {
exec_one< typename Policy::work_tag >( i );
Base::after_work(i);
__device__ inline
void operator()() const noexcept
{
if ( 0 == ( threadIdx.y % 16 ) ) {
// Spin until COMPLETED_TOKEN.
// END_TOKEN indicates no work is currently available.
for ( std::int32_t w = Policy::END_TOKEN ;
Policy::COMPLETED_TOKEN != ( w = m_policy.pop_work() ) ; ) {
if ( Policy::END_TOKEN != w ) {
exec_one< typename Policy::work_tag >( w );
m_policy.completed_work(w);
}
}
}
}
}
inline
void execute()
@ -108,9 +110,9 @@ public:
inline
ParallelFor( const FunctorType & arg_functor
, const Policy & arg_policy )
: Base( arg_functor, arg_policy )
{
}
: m_policy( arg_policy )
, m_functor( arg_functor )
{}
};
} // namespace Impl
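
A hedged usage sketch of the policy this specialization serves; the graph_type alias and the graph-taking constructor are assumptions inferred from the pop_work()/completed_work() calls above, not confirmed by this diff:

#include <Kokkos_Core.hpp>
#include <cstdint>

struct ProcessItem {
  KOKKOS_INLINE_FUNCTION void operator()(const std::int32_t w) const {
    // ... process work item w once its dependences have completed ...
  }
};

void run_work_graph() {
  using policy_type = Kokkos::WorkGraphPolicy<Kokkos::Cuda>;
  // graph: a Kokkos::Crs describing the dependences between work items
  // (see Kokkos_WorkGraphPolicy.hpp for the exact edge convention).
  policy_type::graph_type graph /* = ... build the dependence graph ... */;
  Kokkos::parallel_for(policy_type(graph), ProcessItem());
}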

View File

@ -55,7 +55,7 @@
#include <Cuda/KokkosExp_Cuda_IterateTile_Refactor.hpp>
#endif
namespace Kokkos { namespace Experimental {
namespace Kokkos {
// ------------------------------------------------------------------ //
@ -331,11 +331,23 @@ struct MDRangePolicy
}
};
} // namespace Kokkos
// For backward compatibility
namespace Kokkos { namespace Experimental {
using Kokkos::MDRangePolicy;
using Kokkos::Rank;
using Kokkos::Iterate;
} } // end Kokkos::Experimental
// ------------------------------------------------------------------ //
// ------------------------------------------------------------------ //
//md_parallel_for - deprecated use parallel_for
// ------------------------------------------------------------------ //
namespace Kokkos { namespace Experimental {
template <typename MDRange, typename Functor, typename Enable = void>
void md_parallel_for( MDRange const& range
, Functor const& f
@ -347,7 +359,7 @@ void md_parallel_for( MDRange const& range
) >::type* = 0
)
{
Impl::MDFunctor<MDRange, Functor, void> g(range, f);
Kokkos::Impl::Experimental::MDFunctor<MDRange, Functor, void> g(range, f);
using range_policy = typename MDRange::impl_range_policy;
@ -365,7 +377,7 @@ void md_parallel_for( const std::string& str
) >::type* = 0
)
{
Impl::MDFunctor<MDRange, Functor, void> g(range, f);
Kokkos::Impl::Experimental::MDFunctor<MDRange, Functor, void> g(range, f);
using range_policy = typename MDRange::impl_range_policy;
@ -385,7 +397,7 @@ void md_parallel_for( const std::string& str
) >::type* = 0
)
{
Impl::DeviceIterateTile<MDRange, Functor, typename MDRange::work_tag> closure(range, f);
Kokkos::Impl::DeviceIterateTile<MDRange, Functor, typename MDRange::work_tag> closure(range, f);
closure.execute();
}
@ -400,7 +412,7 @@ void md_parallel_for( MDRange const& range
) >::type* = 0
)
{
Impl::DeviceIterateTile<MDRange, Functor, typename MDRange::work_tag> closure(range, f);
Kokkos::Impl::DeviceIterateTile<MDRange, Functor, typename MDRange::work_tag> closure(range, f);
closure.execute();
}
#endif
@ -421,7 +433,7 @@ void md_parallel_reduce( MDRange const& range
) >::type* = 0
)
{
Impl::MDFunctor<MDRange, Functor, ValueType> g(range, f);
Kokkos::Impl::Experimental::MDFunctor<MDRange, Functor, ValueType> g(range, f);
using range_policy = typename MDRange::impl_range_policy;
Kokkos::parallel_reduce( str, range_policy(0, range.m_num_tiles).set_chunk_size(1), g, v );
@ -439,7 +451,7 @@ void md_parallel_reduce( const std::string& str
) >::type* = 0
)
{
Impl::MDFunctor<MDRange, Functor, ValueType> g(range, f);
Kokkos::Impl::Experimental::MDFunctor<MDRange, Functor, ValueType> g(range, f);
using range_policy = typename MDRange::impl_range_policy;
@ -448,7 +460,7 @@ void md_parallel_reduce( const std::string& str
// Cuda - md_parallel_reduce not implemented - use parallel_reduce
}} // namespace Kokkos::Experimental
} } // namespace Kokkos::Experimental
#endif //KOKKOS_CORE_EXP_MD_RANGE_POLICY_HPP
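
Because the using-declarations above turn the Experimental names into aliases, both spellings refer to the same type: existing code keeps compiling while new code drops the namespace (and calls parallel_for/parallel_reduce instead of the deprecated md_parallel_* wrappers). A compile-time check illustrating this:

#include <Kokkos_Core.hpp>
#include <type_traits>

static_assert(
  std::is_same< Kokkos::MDRangePolicy< Kokkos::Rank<2> >
              , Kokkos::Experimental::MDRangePolicy< Kokkos::Rank<2> > >::value,
  "Kokkos::Experimental::MDRangePolicy is an alias of Kokkos::MDRangePolicy");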

View File

@ -81,10 +81,10 @@ struct IndexType
/**\brief Specify Launch Bounds for CUDA execution.
*
* The "best" defaults may be architecture specific.
* If no launch bounds specified then do not set launch bounds.
*/
template< unsigned int maxT = 1024 /* Max threads per block */
, unsigned int minB = 1 /* Min blocks per SM */
template< unsigned int maxT = 0 /* Max threads per block */
, unsigned int minB = 0 /* Min blocks per SM */
>
struct LaunchBounds
{
@ -280,6 +280,9 @@ struct MemorySpaceAccess {
enum { deepcopy = assignable };
};
}} // namespace Kokkos::Impl
namespace Kokkos {
/**\brief Can AccessSpace access MemorySpace ?
*
@ -358,6 +361,13 @@ public:
>::type space ;
};
} // namespace Kokkos
namespace Kokkos {
namespace Impl {
using Kokkos::SpaceAccessibility ; // For backward compatibility
}} // namespace Kokkos::Impl
//----------------------------------------------------------------------------
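
With the default changed from LaunchBounds<1024,1> to LaunchBounds<0,0>, a policy that does not name launch bounds no longer forces a __launch_bounds__ annotation onto the generated kernel. An illustrative sketch of both spellings (names are examples; assumes lambda support):

#include <Kokkos_Core.hpp>

void axpy(const double alpha, Kokkos::View<double*> x, Kokkos::View<double*> y) {
  const int n = static_cast<int>(x.dimension_0());

  // No LaunchBounds trait: with the new <0,0> default, nothing is imposed.
  Kokkos::parallel_for(Kokkos::RangePolicy<>(0, n),
    KOKKOS_LAMBDA(const int i) { y(i) += alpha * x(i); });

  // Explicit occupancy hint: at most 256 threads/block, at least 4 blocks/SM.
  Kokkos::parallel_for(
    Kokkos::RangePolicy< Kokkos::LaunchBounds<256, 4> >(0, n),
    KOKKOS_LAMBDA(const int i) { y(i) += alpha * x(i); });
}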

View File

@ -99,13 +99,17 @@ struct InitArguments {
int num_threads;
int num_numa;
int device_id;
bool disable_warnings;
InitArguments( int nt = -1
, int nn = -1
, int dv = -1)
: num_threads( nt )
, num_numa( nn )
, device_id( dv )
, int dv = -1
, bool dw = false
)
: num_threads{ nt }
, num_numa{ nn }
, device_id{ dv }
, disable_warnings{ dw }
{}
};
@ -113,6 +117,10 @@ void initialize(int& narg, char* arg[]);
void initialize(const InitArguments& args = InitArguments());
bool is_initialized() noexcept;
bool show_warnings() noexcept;
/** \brief Finalize the spaces that were initialized via Kokkos::initialize */
void finalize();
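
A minimal sketch of the new knob, using only the declarations added above:

#include <Kokkos_Core.hpp>

int main() {
  Kokkos::InitArguments args;
  args.disable_warnings = true;   // suppress output gated by Kokkos::show_warnings()
  Kokkos::initialize(args);
  // ... application ...
  Kokkos::finalize();
  return 0;
}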

View File

@ -45,7 +45,6 @@
#define KOKKOS_CRS_HPP
namespace Kokkos {
namespace Experimental {
/// \class Crs
/// \brief Compressed row storage array.
@ -164,7 +163,7 @@ void transpose_crs(
Crs<DataType, Arg1Type, Arg2Type, SizeType>& out,
Crs<DataType, Arg1Type, Arg2Type, SizeType> const& in);
}} // namespace Kokkos::Experimental
} // namespace Kokkos
/*--------------------------------------------------------------------------*/
@ -172,7 +171,6 @@ void transpose_crs(
namespace Kokkos {
namespace Impl {
namespace Experimental {
template <class InCrs, class OutCounts>
class GetCrsTransposeCounts {
@ -277,14 +275,13 @@ class FillCrsTransposeEntries {
}
};
}}} // namespace Kokkos::Impl::Experimental
}} // namespace Kokkos::Impl
/*--------------------------------------------------------------------------*/
/*--------------------------------------------------------------------------*/
namespace Kokkos {
namespace Experimental {
template< class OutCounts,
class DataType,
@ -297,8 +294,7 @@ void get_crs_transpose_counts(
std::string const& name) {
using InCrs = Crs<DataType, Arg1Type, Arg2Type, SizeType>;
out = OutCounts(name, in.numRows());
Kokkos::Impl::Experimental::
GetCrsTransposeCounts<InCrs, OutCounts> functor(in, out);
Kokkos::Impl::GetCrsTransposeCounts<InCrs, OutCounts> functor(in, out);
}
template< class OutRowMap,
@ -308,8 +304,7 @@ typename OutRowMap::value_type get_crs_row_map_from_counts(
InCounts const& in,
std::string const& name) {
out = OutRowMap(ViewAllocateWithoutInitializing(name), in.size() + 1);
Kokkos::Impl::Experimental::
CrsRowMapFromCounts<InCounts, OutRowMap> functor(in, out);
Kokkos::Impl::CrsRowMapFromCounts<InCounts, OutRowMap> functor(in, out);
return functor.execute();
}
@ -326,32 +321,37 @@ void transpose_crs(
typedef View<SizeType*, memory_space> counts_type ;
{
counts_type counts;
Kokkos::Experimental::get_crs_transpose_counts(counts, in);
Kokkos::Experimental::get_crs_row_map_from_counts(out.row_map, counts,
Kokkos::get_crs_transpose_counts(counts, in);
Kokkos::get_crs_row_map_from_counts(out.row_map, counts,
"tranpose_row_map");
}
out.entries = decltype(out.entries)("transpose_entries", in.entries.size());
Kokkos::Impl::Experimental::
Kokkos::Impl::
FillCrsTransposeEntries<crs_type, crs_type> entries_functor(in, out);
}
template< class CrsType,
class Functor>
struct CountAndFill {
class Functor,
class ExecutionSpace = typename CrsType::execution_space>
struct CountAndFillBase;
template< class CrsType,
class Functor,
class ExecutionSpace>
struct CountAndFillBase {
using data_type = typename CrsType::size_type;
using size_type = typename CrsType::size_type;
using row_map_type = typename CrsType::row_map_type;
using entries_type = typename CrsType::entries_type;
using counts_type = row_map_type;
CrsType m_crs;
Functor m_functor;
counts_type m_counts;
struct Count {};
KOKKOS_INLINE_FUNCTION void operator()(Count, size_type i) const {
inline void operator()(Count, size_type i) const {
m_counts(i) = m_functor(i, nullptr);
}
struct Fill {};
KOKKOS_INLINE_FUNCTION void operator()(Fill, size_type i) const {
inline void operator()(Fill, size_type i) const {
auto j = m_crs.row_map(i);
/* we don't want to access entries(entries.size()), even if its just to get its
address and never use it.
@ -363,13 +363,63 @@ struct CountAndFill {
nullptr : (&(m_crs.entries(j)));
m_functor(i, fill);
}
using self_type = CountAndFill<CrsType, Functor>;
CountAndFill(CrsType& crs, size_type nrows, Functor const& f):
CountAndFillBase(CrsType& crs, Functor const& f):
m_crs(crs),
m_functor(f)
{}
};
#if defined( KOKKOS_ENABLE_CUDA )
template< class CrsType,
class Functor>
struct CountAndFillBase<CrsType, Functor, Kokkos::Cuda> {
using data_type = typename CrsType::size_type;
using size_type = typename CrsType::size_type;
using row_map_type = typename CrsType::row_map_type;
using counts_type = row_map_type;
CrsType m_crs;
Functor m_functor;
counts_type m_counts;
struct Count {};
__device__ inline void operator()(Count, size_type i) const {
m_counts(i) = m_functor(i, nullptr);
}
struct Fill {};
__device__ inline void operator()(Fill, size_type i) const {
auto j = m_crs.row_map(i);
/* we don't want to access entries(entries.size()), even if its just to get its
address and never use it.
this can happen when row (i) is empty and all rows after it are also empty.
we could compare to row_map(i + 1), but that is a read from global memory,
whereas dimension_0() should be part of the View in registers (or constant memory) */
data_type* fill =
(j == static_cast<decltype(j)>(m_crs.entries.dimension_0())) ?
nullptr : (&(m_crs.entries(j)));
m_functor(i, fill);
}
CountAndFillBase(CrsType& crs, Functor const& f):
m_crs(crs),
m_functor(f)
{}
};
#endif
template< class CrsType,
class Functor>
struct CountAndFill : public CountAndFillBase<CrsType, Functor> {
using base_type = CountAndFillBase<CrsType, Functor>;
using typename base_type::data_type;
using typename base_type::size_type;
using typename base_type::counts_type;
using typename base_type::Count;
using typename base_type::Fill;
using entries_type = typename CrsType::entries_type;
using self_type = CountAndFill<CrsType, Functor>;
CountAndFill(CrsType& crs, size_type nrows, Functor const& f):
base_type(crs, f)
{
using execution_space = typename CrsType::execution_space;
m_counts = counts_type("counts", nrows);
this->m_counts = counts_type("counts", nrows);
{
using count_policy_type = RangePolicy<size_type, execution_space, Count>;
using count_closure_type =
@ -377,10 +427,10 @@ struct CountAndFill {
const count_closure_type closure(*this, count_policy_type(0, nrows));
closure.execute();
}
auto nentries = Kokkos::Experimental::
get_crs_row_map_from_counts(m_crs.row_map, m_counts);
m_counts = counts_type();
m_crs.entries = entries_type("entries", nentries);
auto nentries = Kokkos::
get_crs_row_map_from_counts(this->m_crs.row_map, this->m_counts);
this->m_counts = counts_type();
this->m_crs.entries = entries_type("entries", nentries);
{
using fill_policy_type = RangePolicy<size_type, execution_space, Fill>;
using fill_closure_type =
@ -388,7 +438,7 @@ struct CountAndFill {
const fill_closure_type closure(*this, fill_policy_type(0, nrows));
closure.execute();
}
crs = m_crs;
crs = this->m_crs;
}
};
@ -398,9 +448,9 @@ void count_and_fill_crs(
CrsType& crs,
typename CrsType::size_type nrows,
Functor const& f) {
Kokkos::Experimental::CountAndFill<CrsType, Functor>(crs, nrows, f);
Kokkos::CountAndFill<CrsType, Functor>(crs, nrows, f);
}
}} // namespace Kokkos::Experimental
} // namespace Kokkos
#endif /* #define KOKKOS_CRS_HPP */
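A minimal usage sketch of the count-and-fill interface defined above (illustrative, not part of the diff; the functor, graph type, and row count are made up). The functor is called once per row with a null pointer to report its entry count, then again with a pointer into the entries array to write the entries.
// Hypothetical functor: row i receives the two entries {i, i+1}.
struct TwoPerRow {
  KOKKOS_INLINE_FUNCTION
  std::int32_t operator()( std::int32_t row , std::int32_t * fill ) const {
    if ( fill != nullptr ) { fill[0] = row ; fill[1] = row + 1 ; }
    return 2 ;
  }
};
void build_example_graph() {
  Kokkos::Crs< std::int32_t , Kokkos::DefaultExecutionSpace , void , std::int32_t > graph ;
  Kokkos::count_and_fill_crs( graph , /* nrows = */ 100 , TwoPerRow() ) ;
}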
View File
@ -379,12 +379,13 @@ Impl::PerThreadValue PerThread(const int& arg);
* uses variadic templates. Any of the template arguments can
* be omitted.
*
* Possible Template arguments and there default values:
* Possible Template arguments and their default values:
* ExecutionSpace (DefaultExecutionSpace): where to execute code. Must be enabled.
* WorkTag (none): Tag which is used as the first argument for the functor operator.
* Schedule<Type> (Schedule<Static>): Scheduling Policy (Dynamic, or Static).
* IndexType<Type> (IndexType<ExecutionSpace::size_type>): Integer Index type used to iterate over the Index space.
* LaunchBounds<int,int> (LaunchBounds<1024,1>: Launch Bounds for CUDA compilation.
* LaunchBounds<unsigned,unsigned> Launch Bounds for CUDA compilation,
* default of LaunchBounds<0,0> indicates no launch bounds specified.
*/
template< class ... Properties>
class TeamPolicy: public
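To illustrate the documented properties, a fully specified policy could be written as in the sketch below (hedged, not taken from this commit; the tag, league size, and chosen properties are made up, and any subset of them may be omitted or reordered).
struct MyTag {};   // hypothetical work tag
void make_policy_example() {
  Kokkos::TeamPolicy< Kokkos::DefaultExecutionSpace
                    , Kokkos::Schedule< Kokkos::Dynamic >
                    , Kokkos::IndexType< int >
                    , MyTag
                    > policy( /* league_size = */ 64 , Kokkos::AUTO ) ;
  (void) policy ;
}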
View File
@ -274,18 +274,14 @@
#define KOKKOS_ENABLE_PRAGMA_IVDEP 1
#endif
#if ! defined( KOKKOS_MEMORY_ALIGNMENT )
#define KOKKOS_MEMORY_ALIGNMENT 64
#endif
#define KOKKOS_RESTRICT __restrict__
#ifndef KOKKOS_ALIGN
#define KOKKOS_ALIGN(size) __attribute__((aligned(size)))
#endif
#ifndef KOKKOS_ALIGN_PTR
#define KOKKOS_ALIGN_PTR(size) __attribute__((align_value(size)))
#endif
#ifndef KOKKOS_ALIGN_SIZE
#define KOKKOS_ALIGN_SIZE 64
#ifndef KOKKOS_IMPL_ALIGN_PTR
#define KOKKOS_IMPL_ALIGN_PTR(size) __attribute__((align_value(size)))
#endif
#if ( 1400 > KOKKOS_COMPILER_INTEL )
@ -351,6 +347,11 @@
#if !defined( KOKKOS_FORCEINLINE_FUNCTION )
#define KOKKOS_FORCEINLINE_FUNCTION inline __attribute__((always_inline))
#endif
#if !defined( KOKKOS_IMPL_ALIGN_PTR )
#define KOKKOS_IMPL_ALIGN_PTR(size) __attribute__((aligned(size)))
#endif
#endif
//----------------------------------------------------------------------------
@ -426,16 +427,16 @@
//----------------------------------------------------------------------------
// Define Macro for alignment:
#if !defined KOKKOS_ALIGN_SIZE
#define KOKKOS_ALIGN_SIZE 16
#if ! defined( KOKKOS_MEMORY_ALIGNMENT )
#define KOKKOS_MEMORY_ALIGNMENT 16
#endif
#if !defined( KOKKOS_ALIGN )
#define KOKKOS_ALIGN(size) __attribute__((aligned(size)))
#if ! defined( KOKKOS_MEMORY_ALIGNMENT_THRESHOLD )
#define KOKKOS_MEMORY_ALIGNMENT_THRESHOLD 4
#endif
#if !defined( KOKKOS_ALIGN_PTR )
#define KOKKOS_ALIGN_PTR(size) __attribute__((aligned(size)))
#if !defined( KOKKOS_IMPL_ALIGN_PTR )
#define KOKKOS_IMPL_ALIGN_PTR(size) /* */
#endif
//----------------------------------------------------------------------------
View File
@ -111,6 +111,10 @@ private:
public:
/**\brief The maximum size of a superblock and block */
enum : uint32_t { max_superblock_size = 1LU << 31 /* 2 gigabytes */ };
enum : uint32_t { max_block_per_superblock = max_bit_count };
//--------------------------------------------------------------------------
KOKKOS_INLINE_FUNCTION
@ -206,7 +210,7 @@ public:
const uint32_t * sb_state_ptr = sb_state_array ;
s << "pool_size(" << ( size_t(m_sb_count) << m_sb_size_lg2 ) << ")"
<< " superblock_size(" << ( 1 << m_sb_size_lg2 ) << ")" << std::endl ;
<< " superblock_size(" << ( 1LU << m_sb_size_lg2 ) << ")" << std::endl ;
for ( int32_t i = 0 ; i < m_sb_count
; ++i , sb_state_ptr += m_sb_state_size ) {
@ -215,7 +219,7 @@ public:
const uint32_t block_count_lg2 = (*sb_state_ptr) >> state_shift ;
const uint32_t block_size_lg2 = m_sb_size_lg2 - block_count_lg2 ;
const uint32_t block_count = 1 << block_count_lg2 ;
const uint32_t block_count = 1u << block_count_lg2 ;
const uint32_t block_used = (*sb_state_ptr) & state_used_mask ;
s << "Superblock[ " << i << " / " << m_sb_count << " ] {"
@ -284,43 +288,87 @@ public:
{
const uint32_t int_align_lg2 = 3 ; /* align as int[8] */
const uint32_t int_align_mask = ( 1u << int_align_lg2 ) - 1 ;
const uint32_t default_min_block_size = 1u << 6 ; /* 64 bytes */
const uint32_t default_max_block_size = 1u << 12 ;/* 4k bytes */
const uint32_t default_min_superblock_size = 1u << 20 ;/* 1M bytes */
// Constraints and defaults:
// min_block_alloc_size <= max_block_alloc_size
// max_block_alloc_size <= min_superblock_size
// min_superblock_size <= min_total_alloc_size
//--------------------------------------------------
// Default block and superblock sizes:
const uint32_t MIN_BLOCK_SIZE = 1u << 6 /* 64 bytes */ ;
const uint32_t MAX_BLOCK_SIZE = 1u << 12 /* 4k bytes */ ;
if ( 0 == min_block_alloc_size ) {
// Default all sizes:
if ( 0 == min_block_alloc_size ) min_block_alloc_size = MIN_BLOCK_SIZE ;
min_superblock_size =
std::min( size_t(default_min_superblock_size)
, min_total_alloc_size );
min_block_alloc_size =
std::min( size_t(default_min_block_size)
, min_superblock_size );
max_block_alloc_size =
std::min( size_t(default_max_block_size)
, min_superblock_size );
}
else if ( 0 == min_superblock_size ) {
// Choose superblock size as minimum of:
// max_block_per_superblock * min_block_size
// max_superblock_size
// min_total_alloc_size
const size_t max_superblock =
min_block_alloc_size * max_block_per_superblock ;
min_superblock_size =
std::min( max_superblock ,
std::min( size_t(max_superblock_size)
, min_total_alloc_size ) );
}
if ( 0 == max_block_alloc_size ) {
max_block_alloc_size = MAX_BLOCK_SIZE ;
// Upper bound of total allocation size
max_block_alloc_size = std::min( size_t(max_block_alloc_size)
, min_total_alloc_size );
// Lower bound of minimum block size
max_block_alloc_size = std::max( max_block_alloc_size
, min_block_alloc_size );
max_block_alloc_size = min_superblock_size ;
}
if ( 0 == min_superblock_size ) {
min_superblock_size = max_block_alloc_size ;
//--------------------------------------------------
// Upper bound of total allocation size
min_superblock_size = std::min( size_t(min_superblock_size)
, min_total_alloc_size );
{
/* Enforce size constraints:
* min_block_alloc_size <= max_block_alloc_size
* max_block_alloc_size <= min_superblock_size
* min_superblock_size <= max_superblock_size
* min_superblock_size <= min_total_alloc_size
* min_superblock_size <= min_block_alloc_size *
* max_block_per_superblock
*/
// Lower bound of maximum block size
min_superblock_size = std::max( min_superblock_size
, max_block_alloc_size );
const size_t max_superblock =
min_block_alloc_size * max_block_per_superblock ;
if ( ( size_t(max_superblock_size) < min_superblock_size ) ||
( min_total_alloc_size < min_superblock_size ) ||
( max_superblock < min_superblock_size ) ||
( min_superblock_size < max_block_alloc_size ) ||
( max_block_alloc_size < min_block_alloc_size ) ) {
#if 1
printf( " MemoryPool min_block_alloc_size(%ld) max_block_alloc_size(%ld) min_superblock_size(%ld) min_total_alloc_size(%ld) ; max_superblock_size(%ld) max_block_per_superblock(%ld)\n"
, min_block_alloc_size
, max_block_alloc_size
, min_superblock_size
, min_total_alloc_size
, size_t(max_superblock_size)
, size_t(max_block_per_superblock)
);
#endif
Kokkos::abort("Kokkos MemoryPool size constraint violation");
}
}
//--------------------------------------------------
// Block and superblock size is power of two:
// Maximum value is 'max_superblock_size'
m_min_block_size_lg2 =
Kokkos::Impl::integral_power_of_two_that_contains(min_block_alloc_size);
@ -331,45 +379,26 @@ public:
m_sb_size_lg2 =
Kokkos::Impl::integral_power_of_two_that_contains(min_superblock_size);
// Constraints:
// m_min_block_size_lg2 <= m_max_block_size_lg2 <= m_sb_size_lg2
// m_sb_size_lg2 <= m_min_block_size + max_bit_count_lg2
{
// number of superblocks is multiple of superblock size that
// can hold min_total_alloc_size.
if ( m_min_block_size_lg2 + max_bit_count_lg2 < m_sb_size_lg2 ) {
m_min_block_size_lg2 = m_sb_size_lg2 - max_bit_count_lg2 ;
}
if ( m_min_block_size_lg2 + max_bit_count_lg2 < m_max_block_size_lg2 ) {
m_min_block_size_lg2 = m_max_block_size_lg2 - max_bit_count_lg2 ;
}
if ( m_max_block_size_lg2 < m_min_block_size_lg2 ) {
m_max_block_size_lg2 = m_min_block_size_lg2 ;
}
if ( m_sb_size_lg2 < m_max_block_size_lg2 ) {
m_sb_size_lg2 = m_max_block_size_lg2 ;
const uint64_t sb_size_mask = ( 1LU << m_sb_size_lg2 ) - 1 ;
m_sb_count = ( min_total_alloc_size + sb_size_mask ) >> m_sb_size_lg2 ;
}
// At least 32 minimum size blocks in a superblock
{
// Any superblock can be assigned to the smallest size block
// Size the block bitset to maximum number of blocks
if ( m_sb_size_lg2 < m_min_block_size_lg2 + 5 ) {
m_sb_size_lg2 = m_min_block_size_lg2 + 5 ;
const uint32_t max_block_count_lg2 =
m_sb_size_lg2 - m_min_block_size_lg2 ;
m_sb_state_size =
( CB::buffer_bound_lg2( max_block_count_lg2 ) + int_align_mask ) & ~int_align_mask ;
}
// number of superblocks is multiple of superblock size that
// can hold min_total_alloc_size.
const uint32_t sb_size_mask = ( 1u << m_sb_size_lg2 ) - 1 ;
m_sb_count = ( min_total_alloc_size + sb_size_mask ) >> m_sb_size_lg2 ;
// Any superblock can be assigned to the smallest size block
// Size the block bitset to maximum number of blocks
const uint32_t max_block_count_lg2 =
m_sb_size_lg2 - m_min_block_size_lg2 ;
m_sb_state_size =
( CB::buffer_bound_lg2( max_block_count_lg2 ) + int_align_mask ) & ~int_align_mask ;
// Array of all superblock states
const size_t all_sb_state_size =
@ -454,7 +483,7 @@ private:
* Restrict lower bound to minimum block size.
*/
KOKKOS_FORCEINLINE_FUNCTION
unsigned get_block_size_lg2( unsigned n ) const noexcept
uint32_t get_block_size_lg2( uint32_t n ) const noexcept
{
const unsigned i = Kokkos::Impl::integral_power_of_two_that_contains( n );
@ -463,11 +492,12 @@ private:
public:
/* Return 0 for invalid block size */
KOKKOS_INLINE_FUNCTION
uint32_t allocate_block_size( uint32_t alloc_size ) const noexcept
uint32_t allocate_block_size( uint64_t alloc_size ) const noexcept
{
return alloc_size <= (1UL << m_max_block_size_lg2)
? ( 1u << get_block_size_lg2( alloc_size ) )
? ( 1UL << get_block_size_lg2( uint32_t(alloc_size) ) )
: 0 ;
}
@ -485,246 +515,253 @@ public:
void * allocate( size_t alloc_size
, int32_t attempt_limit = 1 ) const noexcept
{
if ( size_t(1LU << m_max_block_size_lg2) < alloc_size ) {
Kokkos::abort("Kokkos MemoryPool allocation request exceeded specified maximum allocation size");
}
if ( 0 == alloc_size ) return (void*) 0 ;
void * p = 0 ;
const uint32_t block_size_lg2 = get_block_size_lg2( alloc_size );
if ( block_size_lg2 <= m_max_block_size_lg2 ) {
// Allocation will fit within a superblock
// that has block sizes ( 1 << block_size_lg2 )
// Allocation will fit within a superblock
// that has block sizes ( 1 << block_size_lg2 )
const uint32_t block_count_lg2 = m_sb_size_lg2 - block_size_lg2 ;
const uint32_t block_state = block_count_lg2 << state_shift ;
const uint32_t block_count = 1u << block_count_lg2 ;
const uint32_t block_count_lg2 = m_sb_size_lg2 - block_size_lg2 ;
const uint32_t block_state = block_count_lg2 << state_shift ;
const uint32_t block_count = 1u << block_count_lg2 ;
// Superblock hints for this block size:
// hint_sb_id_ptr[0] is the dynamically changing hint
// hint_sb_id_ptr[1] is the static start point
// Superblock hints for this block size:
// hint_sb_id_ptr[0] is the dynamically changing hint
// hint_sb_id_ptr[1] is the static start point
volatile uint32_t * const hint_sb_id_ptr
= m_sb_state_array /* memory pool state array */
+ m_hint_offset /* offset to hint portion of array */
+ HINT_PER_BLOCK_SIZE /* number of hints per block size */
* ( block_size_lg2 - m_min_block_size_lg2 ); /* block size id */
volatile uint32_t * const hint_sb_id_ptr
= m_sb_state_array /* memory pool state array */
+ m_hint_offset /* offset to hint portion of array */
+ HINT_PER_BLOCK_SIZE /* number of hints per block size */
* ( block_size_lg2 - m_min_block_size_lg2 ); /* block size id */
const int32_t sb_id_begin = int32_t( hint_sb_id_ptr[1] );
const int32_t sb_id_begin = int32_t( hint_sb_id_ptr[1] );
// Fast query clock register 'tic' to pseudo-randomize
// the guess for which block within a superblock should
// be claimed. If not available then a search occurs.
// Fast query clock register 'tic' to pseudo-randomize
// the guess for which block within a superblock should
// be claimed. If not available then a search occurs.
const uint32_t block_id_hint =
(uint32_t)( Kokkos::Impl::clock_tic()
const uint32_t block_id_hint =
(uint32_t)( Kokkos::Impl::clock_tic()
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_CUDA )
// Spread out potentially concurrent access
// by threads within a warp or thread block.
+ ( threadIdx.x + blockDim.x * threadIdx.y )
// Spread out potentially concurrent access
// by threads within a warp or thread block.
+ ( threadIdx.x + blockDim.x * threadIdx.y )
#endif
);
);
// expected state of superblock for allocation
uint32_t sb_state = block_state ;
// expected state of superblock for allocation
uint32_t sb_state = block_state ;
int32_t sb_id = -1 ;
int32_t sb_id = -1 ;
volatile uint32_t * sb_state_array = 0 ;
volatile uint32_t * sb_state_array = 0 ;
while ( attempt_limit ) {
while ( attempt_limit ) {
int32_t hint_sb_id = -1 ;
int32_t hint_sb_id = -1 ;
if ( sb_id < 0 ) {
if ( sb_id < 0 ) {
// No superblock specified, try the hint for this block size
// No superblock specified, try the hint for this block size
sb_id = hint_sb_id = int32_t( *hint_sb_id_ptr );
sb_id = hint_sb_id = int32_t( *hint_sb_id_ptr );
sb_state_array = m_sb_state_array + ( sb_id * m_sb_state_size );
}
// Require:
// 0 <= sb_id
// sb_state_array == m_sb_state_array + m_sb_state_size * sb_id
if ( sb_state == ( state_header_mask & *sb_state_array ) ) {
// This superblock state is as expected, for the moment.
// Attempt to claim a bit. The attempt updates the state
// so have already made sure the state header is as expected.
const uint32_t count_lg2 = sb_state >> state_shift ;
const uint32_t mask = ( 1u << count_lg2 ) - 1 ;
const Kokkos::pair<int,int> result =
CB::acquire_bounded_lg2( sb_state_array
, count_lg2
, block_id_hint & mask
, sb_state
);
// If result.first < 0 then failed to acquire
// due to either full or buffer was wrong state.
// Could be wrong state if a deallocation raced the
// superblock to empty before the acquire could succeed.
if ( 0 <= result.first ) { // acquired a bit
const uint32_t size_lg2 = m_sb_size_lg2 - count_lg2 ;
// Set the allocated block pointer
p = ((char*)( m_sb_state_array + m_data_offset ))
+ ( uint64_t(sb_id) << m_sb_size_lg2 ) // superblock memory
+ ( uint64_t(result.first) << size_lg2 ); // block memory
#if 0
printf( " MemoryPool(0x%lx) pointer(0x%lx) allocate(%lu) sb_id(%d) sb_state(0x%x) block_size(%d) block_capacity(%d) block_id(%d) block_claimed(%d)\n"
, (uintptr_t)m_sb_state_array
, (uintptr_t)p
, alloc_size
, sb_id
, sb_state
, (1u << size_lg2)
, (1u << count_lg2)
, result.first
, result.second );
#endif
break ; // Success
}
}
//------------------------------------------------------------------
// Arrive here if failed to acquire a block.
// Must find a new superblock.
// Start searching at designated index for this block size.
// Look for superblock that, in preferential order,
// 1) part-full superblock of this block size
// 2) empty superblock to claim for this block size
// 3) part-full superblock of the next larger block size
sb_state = block_state ; // Expect to find the desired state
sb_id = -1 ;
bool update_hint = false ;
int32_t sb_id_empty = -1 ;
int32_t sb_id_large = -1 ;
uint32_t sb_state_large = 0 ;
sb_state_array = m_sb_state_array + sb_id_begin * m_sb_state_size ;
for ( int32_t i = 0 , id = sb_id_begin ; i < m_sb_count ; ++i ) {
// Query state of the candidate superblock.
// Note that the state may change at any moment
// as concurrent allocations and deallocations occur.
const uint32_t full_state = *sb_state_array ;
const uint32_t used = full_state & state_used_mask ;
const uint32_t state = full_state & state_header_mask ;
if ( state == block_state ) {
// Superblock is assigned to this block size
if ( used < block_count ) {
// There is room to allocate one block
sb_id = id ;
// Is there room to allocate more than one block?
update_hint = used + 1 < block_count ;
break ;
}
}
else if ( 0 == used ) {
// Superblock is empty
if ( -1 == sb_id_empty ) {
// Superblock is not assigned to this block size
// and is the first empty superblock encountered.
// Save this id to use if a partfull superblock is not found.
sb_id_empty = id ;
}
}
else if ( ( -1 == sb_id_empty /* have not found an empty */ ) &&
( -1 == sb_id_large /* have not found a larger */ ) &&
( state < block_state /* a larger block */ ) &&
// is not full:
( used < ( 1u << ( state >> state_shift ) ) ) ) {
// First superblock encountered that is
// larger than this block size and
// has room for an allocation.
// Save this id to use if a partfull or empty superblock is not found
sb_id_large = id ;
sb_state_large = state ;
}
// Iterate around the superblock array:
if ( ++id < m_sb_count ) {
sb_state_array += m_sb_state_size ;
}
else {
id = 0 ;
sb_state_array = m_sb_state_array ;
}
}
// printf(" search m_sb_count(%d) sb_id(%d) sb_id_empty(%d) sb_id_large(%d)\n" , m_sb_count , sb_id , sb_id_empty , sb_id_large);
if ( sb_id < 0 ) {
// Did not find a partfull superblock for this block size.
if ( 0 <= sb_id_empty ) {
// Found first empty superblock following designated superblock
// Attempt to claim it for this block size.
// If the claim fails assume that another thread claimed it
// for this block size and try to use it anyway,
// but do not update hint.
sb_id = sb_id_empty ;
sb_state_array = m_sb_state_array + ( sb_id * m_sb_state_size );
// If successfully changed assignment of empty superblock 'sb_id'
// to this block_size then update the hint.
const uint32_t state_empty = state_header_mask & *sb_state_array ;
// If this thread claims the empty block then update the hint
update_hint =
state_empty ==
Kokkos::atomic_compare_exchange
(sb_state_array,state_empty,block_state);
}
else if ( 0 <= sb_id_large ) {
// Found a larger superblock with space available
sb_id = sb_id_large ;
sb_state = sb_state_large ;
sb_state_array = m_sb_state_array + ( sb_id * m_sb_state_size );
}
// Require:
// 0 <= sb_id
// sb_state_array == m_sb_state_array + m_sb_state_size * sb_id
if ( sb_state == ( state_header_mask & *sb_state_array ) ) {
// This superblock state is as expected, for the moment.
// Attempt to claim a bit. The attempt updates the state
// so have already made sure the state header is as expected.
const uint32_t count_lg2 = sb_state >> state_shift ;
const uint32_t mask = ( 1u << count_lg2 ) - 1 ;
const Kokkos::pair<int,int> result =
CB::acquire_bounded_lg2( sb_state_array
, count_lg2
, block_id_hint & mask
, sb_state
);
// If result.first < 0 then failed to acquire
// due to either full or buffer was wrong state.
// Could be wrong state if a deallocation raced the
// superblock to empty before the acquire could succeed.
if ( 0 <= result.first ) { // acquired a bit
const uint32_t size_lg2 = m_sb_size_lg2 - count_lg2 ;
// Set the allocated block pointer
p = ((char*)( m_sb_state_array + m_data_offset ))
+ ( uint32_t(sb_id) << m_sb_size_lg2 ) // superblock memory
+ ( result.first << size_lg2 ); // block memory
break ; // Success
}
// printf(" acquire count_lg2(%d) sb_state(0x%x) sb_id(%d) result(%d,%d)\n" , count_lg2 , sb_state , sb_id , result.first , result.second );
else {
// Did not find a potentially usable superblock
--attempt_limit ;
}
//------------------------------------------------------------------
// Arrive here if failed to acquire a block.
// Must find a new superblock.
}
// Start searching at designated index for this block size.
// Look for superblock that, in preferential order,
// 1) part-full superblock of this block size
// 2) empty superblock to claim for this block size
// 3) part-full superblock of the next larger block size
sb_state = block_state ; // Expect to find the desired state
sb_id = -1 ;
bool update_hint = false ;
int32_t sb_id_empty = -1 ;
int32_t sb_id_large = -1 ;
uint32_t sb_state_large = 0 ;
sb_state_array = m_sb_state_array + sb_id_begin * m_sb_state_size ;
for ( int32_t i = 0 , id = sb_id_begin ; i < m_sb_count ; ++i ) {
// Query state of the candidate superblock.
// Note that the state may change at any moment
// as concurrent allocations and deallocations occur.
const uint32_t full_state = *sb_state_array ;
const uint32_t used = full_state & state_used_mask ;
const uint32_t state = full_state & state_header_mask ;
if ( state == block_state ) {
// Superblock is assigned to this block size
if ( used < block_count ) {
// There is room to allocate one block
sb_id = id ;
// Is there room to allocate more than one block?
update_hint = used + 1 < block_count ;
break ;
}
}
else if ( 0 == used ) {
// Superblock is empty
if ( -1 == sb_id_empty ) {
// Superblock is not assigned to this block size
// and is the first empty superblock encountered.
// Save this id to use if a partfull superblock is not found.
sb_id_empty = id ;
}
}
else if ( ( -1 == sb_id_empty /* have not found an empty */ ) &&
( -1 == sb_id_large /* have not found a larger */ ) &&
( state < block_state /* a larger block */ ) &&
// is not full:
( used < ( 1u << ( state >> state_shift ) ) ) ) {
// First superblock encountered that is
// larger than this block size and
// has room for an allocation.
// Save this id to use if a partfull or empty superblock is not found
sb_id_large = id ;
sb_state_large = state ;
}
// Iterate around the superblock array:
if ( ++id < m_sb_count ) {
sb_state_array += m_sb_state_size ;
}
else {
id = 0 ;
sb_state_array = m_sb_state_array ;
}
}
// printf(" search m_sb_count(%d) sb_id(%d) sb_id_empty(%d) sb_id_large(%d)\n" , m_sb_count , sb_id , sb_id_empty , sb_id_large);
if ( sb_id < 0 ) {
// Did not find a partfull superblock for this block size.
if ( 0 <= sb_id_empty ) {
// Found first empty superblock following designated superblock
// Attempt to claim it for this block size.
// If the claim fails assume that another thread claimed it
// for this block size and try to use it anyway,
// but do not update hint.
sb_id = sb_id_empty ;
sb_state_array = m_sb_state_array + ( sb_id * m_sb_state_size );
// If successfully changed assignment of empty superblock 'sb_id'
// to this block_size then update the hint.
const uint32_t state_empty = state_header_mask & *sb_state_array ;
// If this thread claims the empty block then update the hint
update_hint =
state_empty ==
Kokkos::atomic_compare_exchange
(sb_state_array,state_empty,block_state);
}
else if ( 0 <= sb_id_large ) {
// Found a larger superblock with space available
sb_id = sb_id_large ;
sb_state = sb_state_large ;
sb_state_array = m_sb_state_array + ( sb_id * m_sb_state_size );
}
else {
// Did not find a potentially usable superblock
--attempt_limit ;
}
}
if ( update_hint ) {
Kokkos::atomic_compare_exchange
( hint_sb_id_ptr , uint32_t(hint_sb_id) , uint32_t(sb_id) );
}
} // end allocation attempt loop
//--------------------------------------------------------------------
}
else {
Kokkos::abort("Kokkos MemoryPool allocation request exceeded specified maximum allocation size");
}
if ( update_hint ) {
Kokkos::atomic_compare_exchange
( hint_sb_id_ptr , uint32_t(hint_sb_id) , uint32_t(sb_id) );
}
} // end allocation attempt loop
//--------------------------------------------------------------------
return p ;
}
@ -765,7 +802,7 @@ public:
const uint32_t block_size_lg2 =
m_sb_size_lg2 - ( block_state >> state_shift );
ok_block_aligned = 0 == ( d & ( ( 1 << block_size_lg2 ) - 1 ) );
ok_block_aligned = 0 == ( d & ( ( 1UL << block_size_lg2 ) - 1 ) );
if ( ok_block_aligned ) {
@ -773,31 +810,70 @@ public:
// mask into superblock and then shift down for block index
const uint32_t bit =
( d & ( ptrdiff_t( 1 << m_sb_size_lg2 ) - 1 ) ) >> block_size_lg2 ;
( d & ( ptrdiff_t( 1LU << m_sb_size_lg2 ) - 1 ) ) >> block_size_lg2 ;
const int result =
CB::release( sb_state_array , bit , block_state );
ok_dealloc_once = 0 <= result ;
// printf(" deallocate from sb_id(%d) result(%d) bit(%d) state(0x%x)\n"
// , sb_id
// , result
// , uint32_t(d >> block_size_lg2)
// , *sb_state_array );
#if 0
printf( " MemoryPool(0x%lx) pointer(0x%lx) deallocate sb_id(%d) block_size(%d) block_capacity(%d) block_id(%d) block_claimed(%d)\n"
, (uintptr_t)m_sb_state_array
, (uintptr_t)p
, sb_id
, (1u << block_size_lg2)
, (1u << (m_sb_size_lg2 - block_size_lg2))
, bit
, result );
#endif
}
}
if ( ! ok_contains || ! ok_block_aligned || ! ok_dealloc_once ) {
#if 0
printf("Kokkos MemoryPool deallocate(0x%lx) contains(%d) block_aligned(%d) dealloc_once(%d)\n",(uintptr_t)p,ok_contains,ok_block_aligned,ok_dealloc_once);
printf( " MemoryPool(0x%lx) pointer(0x%lx) deallocate ok_contains(%d) ok_block_aligned(%d) ok_dealloc_once(%d)\n"
, (uintptr_t)m_sb_state_array
, (uintptr_t)p
, int(ok_contains)
, int(ok_block_aligned)
, int(ok_dealloc_once) );
#endif
Kokkos::abort("Kokkos MemoryPool::deallocate given erroneous pointer");
}
}
// end deallocate
//--------------------------------------------------------------------------
KOKKOS_INLINE_FUNCTION
int number_of_superblocks() const noexcept { return m_sb_count ; }
KOKKOS_INLINE_FUNCTION
void superblock_state( int sb_id
, int & block_size
, int & block_count_capacity
, int & block_count_used ) const noexcept
{
block_size = 0 ;
block_count_capacity = 0 ;
block_count_used = 0 ;
if ( Kokkos::Impl::MemorySpaceAccess
< Kokkos::Impl::ActiveExecutionMemorySpace
, base_memory_space >::accessible ) {
// Can access the state array
const uint32_t state =
((uint32_t volatile *)m_sb_state_array)[sb_id*m_sb_state_size];
const uint32_t block_count_lg2 = state >> state_shift ;
const uint32_t block_used = state & state_used_mask ;
block_size = 1LU << ( m_sb_size_lg2 - block_count_lg2 );
block_count_capacity = 1LU << block_count_lg2 ;
block_count_used = block_used ;
}
}
};
} // namespace Kokkos
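A hedged host-side sketch of the pool interface touched above, including the newly added number_of_superblocks() and superblock_state() queries. The constructor argument order (memory space, total size, then optional block/superblock sizes) and the deallocate(pointer, size) form are assumptions, as is using a host-accessible memory space so that allocate() can be called outside a kernel.
// Illustrative only: create a 4 MB host pool, allocate, inspect, release.
void pool_example() {
  using pool_type = Kokkos::MemoryPool< Kokkos::DefaultHostExecutionSpace > ;
  pool_type pool( Kokkos::DefaultHostExecutionSpace::memory_space() , size_t(1) << 22 );
  void * p = pool.allocate( 256 );
  for ( int i = 0 ; i < pool.number_of_superblocks() ; ++i ) {
    int block_size , capacity , used ;
    pool.superblock_state( i , block_size , capacity , used );
    // block_size / capacity / used describe superblock i's current assignment
  }
  pool.deallocate( p , 256 );
}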
View File
@ -97,26 +97,22 @@ typedef Kokkos::MemoryTraits< Kokkos::Unmanaged | Kokkos::RandomAccess > MemoryR
namespace Kokkos {
namespace Impl {
static_assert(
( 0 < int(KOKKOS_MEMORY_ALIGNMENT) ) &&
( 0 == ( int(KOKKOS_MEMORY_ALIGNMENT) & (int(KOKKOS_MEMORY_ALIGNMENT)-1))) ,
"KOKKOS_MEMORY_ALIGNMENT must be a power of two" );
/** \brief Memory alignment settings
*
* Sets global value for memory alignment. Must be a power of two!
* Enable compatibility of views from different devices with static stride.
* Use compiler flag to enable overwrites.
*/
enum { MEMORY_ALIGNMENT =
#if defined( KOKKOS_MEMORY_ALIGNMENT )
( 1 << Kokkos::Impl::integral_power_of_two( KOKKOS_MEMORY_ALIGNMENT ) )
#else
( 1 << Kokkos::Impl::integral_power_of_two( 128 ) )
#endif
#if defined( KOKKOS_MEMORY_ALIGNMENT_THRESHOLD )
enum : unsigned
{ MEMORY_ALIGNMENT = KOKKOS_MEMORY_ALIGNMENT
, MEMORY_ALIGNMENT_THRESHOLD = KOKKOS_MEMORY_ALIGNMENT_THRESHOLD
#else
, MEMORY_ALIGNMENT_THRESHOLD = 4
#endif
};
} //namespace Impl
} // namespace Kokkos
View File
@ -204,8 +204,8 @@ struct VerifyExecutionCanAccessMemorySpace
>
{
enum { value = false };
inline static void verify( void ) { Experimental::ROCmSpace::access_error(); }
inline static void verify( const void * p ) { Experimental::ROCmSpace::access_error(p); }
inline static void verify( void ) { Kokkos::Experimental::ROCmSpace::access_error(); }
inline static void verify( const void * p ) { Kokkos::Experimental::ROCmSpace::access_error(p); }
};
} // namespace Experimental
} // namespace Kokkos
View File
@ -619,16 +619,16 @@ namespace Impl {
template< class FunctorType , class ... Traits >
class ParallelFor< FunctorType ,
Kokkos::Experimental::MDRangePolicy< Traits ... > ,
Kokkos::MDRangePolicy< Traits ... > ,
Kokkos::Serial
>
{
private:
typedef Kokkos::Experimental::MDRangePolicy< Traits ... > MDRangePolicy ;
typedef Kokkos::MDRangePolicy< Traits ... > MDRangePolicy ;
typedef typename MDRangePolicy::impl_range_policy Policy ;
typedef typename Kokkos::Experimental::Impl::HostIterateTile< MDRangePolicy, FunctorType, typename MDRangePolicy::work_tag, void > iterate_type;
typedef typename Kokkos::Impl::HostIterateTile< MDRangePolicy, FunctorType, typename MDRangePolicy::work_tag, void > iterate_type;
const FunctorType m_functor ;
const MDRangePolicy m_mdr_policy ;
@ -661,14 +661,14 @@ public:
template< class FunctorType , class ReducerType , class ... Traits >
class ParallelReduce< FunctorType
, Kokkos::Experimental::MDRangePolicy< Traits ... >
, Kokkos::MDRangePolicy< Traits ... >
, ReducerType
, Kokkos::Serial
>
{
private:
typedef Kokkos::Experimental::MDRangePolicy< Traits ... > MDRangePolicy ;
typedef Kokkos::MDRangePolicy< Traits ... > MDRangePolicy ;
typedef typename MDRangePolicy::impl_range_policy Policy ;
typedef typename MDRangePolicy::work_tag WorkTag ;
@ -686,7 +686,7 @@ private:
typedef typename Analysis::reference_type reference_type ;
using iterate_type = typename Kokkos::Experimental::Impl::HostIterateTile< MDRangePolicy
using iterate_type = typename Kokkos::Impl::HostIterateTile< MDRangePolicy
, FunctorType
, WorkTag
, ValueType
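Since MDRangePolicy moves out of the Experimental namespace in this commit, a call site now reads as in the hedged sketch below (illustrative only; M and N are placeholders for the iteration extents).
// Hedged sketch: 2-D tiled iteration with the promoted Kokkos::MDRangePolicy.
void mdrange_example( const int M , const int N ) {
  Kokkos::parallel_for(
    Kokkos::MDRangePolicy< Kokkos::Rank<2> >( { 0 , 0 } , { M , N } ) ,
    KOKKOS_LAMBDA( const int i , const int j ) { /* body for element (i,j) */ } );
}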
View File
@ -408,7 +408,7 @@ view_alloc( Args const & ... args )
}
template< class ... Args >
inline
KOKKOS_INLINE_FUNCTION
Impl::ViewCtorProp< typename Impl::ViewCtorProp< void , Args >::type ... >
view_wrap( Args const & ... args )
{
@ -1216,6 +1216,13 @@ public:
m_track.assign_allocated_record_to_uninitialized( record );
}
KOKKOS_INLINE_FUNCTION
void assign_data( pointer_type arg_data )
{
m_track.clear();
m_map.assign_data( arg_data );
}
// Wrap memory according to properties and array layout
template< class ... P >
explicit KOKKOS_INLINE_FUNCTION
@ -2235,6 +2242,29 @@ create_mirror_view(const Space& , const Kokkos::View<T,P...> & src
return typename Impl::MirrorViewType<Space,T,P ...>::view_type(src.label(),src.layout());
}
// Create a mirror view and deep_copy in a new space (specialization for same space)
template<class Space, class T, class ... P>
typename Impl::MirrorViewType<Space,T,P ...>::view_type
create_mirror_view_and_copy(const Space& , const Kokkos::View<T,P...> & src
, std::string const& name = ""
, typename std::enable_if<Impl::MirrorViewType<Space,T,P ...>::is_same_memspace>::type* = 0 ) {
(void)name;
return src;
}
// Create a mirror view and deep_copy in a new space (specialization for different space)
template<class Space, class T, class ... P>
typename Impl::MirrorViewType<Space,T,P ...>::view_type
create_mirror_view_and_copy(const Space& , const Kokkos::View<T,P...> & src
, std::string const& name = ""
, typename std::enable_if<!Impl::MirrorViewType<Space,T,P ...>::is_same_memspace>::type* = 0 ) {
using Mirror = typename Impl::MirrorViewType<Space,T,P ...>::view_type;
std::string label = name.empty() ? src.label() : name;
auto mirror = Mirror(ViewAllocateWithoutInitializing(label), src.layout());
deep_copy(mirror, src);
return mirror;
}
} /* namespace Kokkos */
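A brief hedged illustration of the create_mirror_view_and_copy overloads added above (the view name is made up): when the requested space matches the source's memory space the source view is returned as-is, otherwise a new view is allocated and deep-copied.
// Illustrative only: obtain host-accessible data for a (possibly device) view.
void mirror_example() {
  Kokkos::View<double*> d_data( "d_data" , 1000 );
  auto h_data = Kokkos::create_mirror_view_and_copy( Kokkos::HostSpace() , d_data );
  (void) h_data ;
}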
//----------------------------------------------------------------------------
@ -2432,6 +2462,7 @@ struct CommonViewAllocProp< void, ValueType >
using scalar_array_type = ValueType;
template < class ... Views >
KOKKOS_INLINE_FUNCTION
CommonViewAllocProp( const Views & ... ) {}
};
@ -2499,6 +2530,7 @@ using DeducedCommonPropsType = typename Impl::DeduceCommonViewAllocProp<Views...
// User function
template < class ... Views >
KOKKOS_INLINE_FUNCTION
DeducedCommonPropsType<Views...>
common_view_alloc_prop( Views const & ... views )
{
View File
@ -46,205 +46,198 @@
namespace Kokkos {
namespace Impl {
namespace Experimental {
template< class functor_type , class execution_space, class ... policy_args >
class WorkGraphExec;
}}} // namespace Kokkos::Impl::Experimental
}} // namespace Kokkos::Impl
namespace Kokkos {
namespace Experimental {
template< class ... Properties >
class WorkGraphPolicy
{
public:
using self_type = WorkGraphPolicy<Properties ... >;
using traits = Kokkos::Impl::PolicyTraits<Properties ... >;
using index_type = typename traits::index_type;
using self_type = WorkGraphPolicy<Properties ... >;
using traits = Kokkos::Impl::PolicyTraits<Properties ... >;
using index_type = typename traits::index_type;
using member_type = index_type;
using work_tag = typename traits::work_tag;
using execution_space = typename traits::execution_space;
using work_tag = typename traits::work_tag;
using memory_space = typename execution_space::memory_space;
using graph_type = Kokkos::Experimental::Crs<index_type, execution_space, void, index_type>;
using member_type = index_type;
using memory_space = typename execution_space::memory_space;
using graph_type = Kokkos::Crs<index_type,execution_space,void,index_type>;
enum : std::int32_t {
END_TOKEN = -1 ,
BEGIN_TOKEN = -2 ,
COMPLETED_TOKEN = -3 };
private:
graph_type m_graph;
using ints_type = Kokkos::View<std::int32_t*, memory_space>;
using range_type = Kokkos::pair<std::int32_t, std::int32_t>;
using ranges_type = Kokkos::View<range_type*, memory_space>;
const std::int32_t m_total_work;
ints_type m_counts;
ints_type m_queue;
ranges_type m_ranges;
// Let N = m_graph.numRows(), the total work
// m_queue[ 0 .. N-1] = the ready queue
// m_queue[ N .. 2*N-1] = the waiting queue counts
// m_queue[2*N .. 2*N+1] = the ready queue hints
graph_type const m_graph;
ints_type m_queue ;
KOKKOS_INLINE_FUNCTION
void push_work( const std::int32_t w ) const noexcept
{
const std::int32_t N = m_graph.numRows();
std::int32_t volatile * const ready_queue = & m_queue[0] ;
std::int32_t volatile * const end_hint = & m_queue[2*N+1] ;
// Push work to end of queue
const std::int32_t j = atomic_fetch_add( end_hint , 1 );
if ( ( N <= j ) ||
( END_TOKEN != atomic_exchange(ready_queue+j,w) ) ) {
// ERROR: past the end of queue or did not replace END_TOKEN
Kokkos::abort("WorkGraphPolicy push_work error");
}
memory_fence();
}
public:
struct TagZeroRanges {};
/**\brief Attempt to pop the work item at the head of the queue.
*
* Find entry 'i' such that
* ( m_queue[i] != BEGIN_TOKEN ) AND
* ( i == 0 OR m_queue[i-1] == BEGIN_TOKEN )
* if found then
* increment begin hint
* return atomic_exchange( m_queue[i] , BEGIN_TOKEN )
* else if i < total work
* return END_TOKEN
* else
* return COMPLETED_TOKEN
*
*/
KOKKOS_INLINE_FUNCTION
void operator()(TagZeroRanges, std::int32_t i) const {
m_ranges[i] = range_type(0, 0);
}
void zero_ranges() {
using policy_type = RangePolicy<std::int32_t, execution_space, TagZeroRanges>;
using closure_type = Kokkos::Impl::ParallelFor<self_type, policy_type>;
const closure_type closure(*this, policy_type(0, 1));
closure.execute();
execution_space::fence();
}
std::int32_t pop_work() const noexcept
{
const std::int32_t N = m_graph.numRows();
struct TagFillQueue {};
KOKKOS_INLINE_FUNCTION
void operator()(TagFillQueue, std::int32_t i) const {
if (*((volatile std::int32_t*)(&m_counts(i))) == 0) push_work(i);
}
void fill_queue() {
using policy_type = RangePolicy<std::int32_t, execution_space, TagFillQueue>;
using closure_type = Kokkos::Impl::ParallelFor<self_type, policy_type>;
const closure_type closure(*this, policy_type(0, m_total_work));
closure.execute();
execution_space::fence();
}
std::int32_t volatile * const ready_queue = & m_queue[0] ;
std::int32_t volatile * const begin_hint = & m_queue[2*N] ;
private:
// begin hint is guaranteed to be less than or equal to
// actual begin location in the queue.
inline
void setup() {
if (m_graph.numRows() > std::numeric_limits<std::int32_t>::max()) {
Kokkos::abort("WorkGraphPolicy work must be indexable using int32_t");
}
get_crs_transpose_counts(m_counts, m_graph);
m_queue = ints_type(ViewAllocateWithoutInitializing("queue"), m_total_work);
deep_copy(m_queue, std::int32_t(-1));
m_ranges = ranges_type("ranges", 1);
fill_queue();
}
for ( std::int32_t i = *begin_hint ; i < N ; ++i ) {
KOKKOS_INLINE_FUNCTION
std::int32_t pop_work() const {
range_type w(-1,-1);
while (true) {
const range_type w_new( w.first + 1 , w.second );
w = atomic_compare_exchange( &m_ranges(0) , w , w_new );
if ( w.first < w.second ) { // there was work in the queue
if ( w_new.first == w.first + 1 && w_new.second == w.second ) {
// we got a work item
std::int32_t i;
// the push_work function may have incremented the end counter
// but not yet written the work index into the queue.
// wait until the entry is valid.
while ( -1 == ( i = *((volatile std::int32_t*)(&m_queue( w.first ))) ) );
return i;
} // we got a work item
} else { // there was no work in the queue
#ifdef KOKKOS_DEBUG
if ( w_new.first == w.first + 1 && w_new.second == w.second ) {
Kokkos::abort("bug in pop_work");
const std::int32_t w = ready_queue[i] ;
if ( w == END_TOKEN ) { return END_TOKEN ; }
if ( ( w != BEGIN_TOKEN ) &&
( w == atomic_compare_exchange(ready_queue+i,w,BEGIN_TOKEN) ) ) {
// Attempt to claim ready work index succeeded,
// update the hint and return work index
atomic_increment( begin_hint );
return w ;
}
#endif
if (w.first == m_total_work) { // all work is done
return -1;
} else { // need to wait for more work to be pushed
// take a guess that one work item will be pushed
// the key thing is we can't leave (w) alone, because
// otherwise the next compare_exchange may succeed in
// popping work from an empty queue
w.second++;
}
} // there was no work in the queue
} // while (true)
}
// arrive here when ready_queue[i] == BEGIN_TOKEN
}
return COMPLETED_TOKEN ;
}
KOKKOS_INLINE_FUNCTION
void push_work(std::int32_t i) const {
range_type w(-1,-1);
while (true) {
const range_type w_new( w.first , w.second + 1 );
// try to increment the end counter
w = atomic_compare_exchange( &m_ranges(0) , w , w_new );
// stop trying if the increment was successful
if ( w.first == w_new.first && w.second + 1 == w_new.second ) break;
void completed_work( std::int32_t w ) const noexcept
{
Kokkos::memory_fence();
// Make sure the completed work function's memory accesses are flushed.
const std::int32_t N = m_graph.numRows();
std::int32_t volatile * const count_queue = & m_queue[N] ;
const std::int32_t B = m_graph.row_map(w);
const std::int32_t E = m_graph.row_map(w+1);
for ( std::int32_t i = B ; i < E ; ++i ) {
const std::int32_t j = m_graph.entries(i);
if ( 1 == atomic_fetch_add(count_queue+j,-1) ) {
push_work(j);
}
}
}
// write the work index into the claimed spot in the queue
*((volatile std::int32_t*)(&m_queue( w.second ))) = i;
// push this write out into the memory system
memory_fence();
}
template< class functor_type , class execution_space, class ... policy_args >
friend class Kokkos::Impl::Experimental::WorkGraphExec;
struct TagInit {};
struct TagCount {};
struct TagReady {};
public:
/**\brief Initialize queue
*
* m_queue[0..N-1] = END_TOKEN, the ready queue
* m_queue[N..2*N-1] = 0, the waiting count queue
* m_queue[2*N..2*N+1] = 0, begin/end hints for ready queue
*/
KOKKOS_INLINE_FUNCTION
void operator()( const TagInit , int i ) const noexcept
{ m_queue[i] = i < m_graph.numRows() ? END_TOKEN : 0 ; }
WorkGraphPolicy(graph_type arg_graph)
KOKKOS_INLINE_FUNCTION
void operator()( const TagCount , int i ) const noexcept
{
std::int32_t volatile * const count_queue =
& m_queue[ m_graph.numRows() ] ;
atomic_increment( count_queue + m_graph.entries[i] );
}
KOKKOS_INLINE_FUNCTION
void operator()( const TagReady , int w ) const noexcept
{
std::int32_t const * const count_queue =
& m_queue[ m_graph.numRows() ] ;
if ( 0 == count_queue[w] ) push_work(w);
}
WorkGraphPolicy( const graph_type & arg_graph )
: m_graph(arg_graph)
, m_total_work( arg_graph.numRows() )
, m_queue( view_alloc( "queue" , WithoutInitializing )
, arg_graph.numRows() * 2 + 2 )
{
setup();
}
{ // Initialize
using policy_type = RangePolicy<std::int32_t, execution_space, TagInit>;
using closure_type = Kokkos::Impl::ParallelFor<self_type, policy_type>;
const closure_type closure(*this, policy_type(0, m_queue.size()));
closure.execute();
execution_space::fence();
}
};
{ // execute-after counts
using policy_type = RangePolicy<std::int32_t, execution_space, TagCount>;
using closure_type = Kokkos::Impl::ParallelFor<self_type, policy_type>;
const closure_type closure(*this,policy_type(0,m_graph.entries.size()));
closure.execute();
execution_space::fence();
}
}} // namespace Kokkos::Experimental
/*--------------------------------------------------------------------------*/
/*--------------------------------------------------------------------------*/
namespace Kokkos {
namespace Impl {
namespace Experimental {
template< class functor_type , class execution_space, class ... policy_args >
class WorkGraphExec
{
public:
using self_type = WorkGraphExec< functor_type, execution_space, policy_args ... >;
using policy_type = Kokkos::Experimental::WorkGraphPolicy< policy_args ... >;
using member_type = typename policy_type::member_type;
using memory_space = typename execution_space::memory_space;
protected:
const functor_type m_functor;
const policy_type m_policy;
protected:
KOKKOS_INLINE_FUNCTION
std::int32_t before_work() const {
return m_policy.pop_work();
}
KOKKOS_INLINE_FUNCTION
void after_work(std::int32_t i) const {
/* fence any writes that were done by the work item itself
(usually writing its result to global memory) */
memory_fence();
const std::int32_t begin = m_policy.m_graph.row_map( i );
const std::int32_t end = m_policy.m_graph.row_map( i + 1 );
for (std::int32_t j = begin; j < end; ++j) {
const std::int32_t next = m_policy.m_graph.entries( j );
const std::int32_t old_count = atomic_fetch_add( &(m_policy.m_counts(next)), -1 );
if ( old_count == 1 ) m_policy.push_work( next );
{ // Scheduling ready tasks
using policy_type = RangePolicy<std::int32_t, execution_space, TagReady>;
using closure_type = Kokkos::Impl::ParallelFor<self_type, policy_type>;
const closure_type closure(*this,policy_type(0,m_graph.numRows()));
closure.execute();
execution_space::fence();
}
}
inline
WorkGraphExec( const functor_type & arg_functor
, const policy_type & arg_policy )
: m_functor( arg_functor )
, m_policy( arg_policy )
{
}
};
}}} // namespace Kokkos::Impl::Experimental
} // namespace Kokkos
#ifdef KOKKOS_ENABLE_SERIAL
#include "impl/Kokkos_Serial_WorkGraphPolicy.hpp"
View File
@ -294,7 +294,7 @@ void OpenMP::initialize( int thread_count )
}
{
if (nullptr == std::getenv("OMP_PROC_BIND") ) {
if ( Kokkos::show_warnings() && nullptr == std::getenv("OMP_PROC_BIND") ) {
printf("Kokkos::OpenMP::initialize WARNING: OMP_PROC_BIND environment variable not set\n");
printf(" In general, for best performance with OpenMP 4.0 or better set OMP_PROC_BIND=spread and OMP_PLACES=threads\n");
printf(" For best performance with OpenMP 3.1 set OMP_PROC_BIND=true\n");
@ -327,7 +327,7 @@ void OpenMP::initialize( int thread_count )
omp_set_num_threads(Impl::g_openmp_hardware_max_threads);
}
else {
if( thread_count > process_num_threads ) {
if( Kokkos::show_warnings() && thread_count > process_num_threads ) {
printf( "Kokkos::OpenMP::initialize WARNING: You are likely oversubscribing your CPU cores.\n");
printf( " process threads available : %3d, requested thread : %3d\n", process_num_threads, thread_count );
}
@ -364,12 +364,12 @@ void OpenMP::initialize( int thread_count )
// Check for over-subscription
//if( Impl::mpi_ranks_per_node() * long(thread_count) > Impl::processors_per_node() ) {
// std::cout << "Kokkos::OpenMP::initialize WARNING: You are likely oversubscribing your CPU cores." << std::endl;
// std::cout << " Detected: " << Impl::processors_per_node() << " cores per node." << std::endl;
// std::cout << " Detected: " << Impl::mpi_ranks_per_node() << " MPI_ranks per node." << std::endl;
// std::cout << " Requested: " << thread_count << " threads per process." << std::endl;
//}
if( Kokkos::show_warnings() && (Impl::mpi_ranks_per_node() * long(thread_count) > Impl::processors_per_node()) ) {
std::cout << "Kokkos::OpenMP::initialize WARNING: You are likely oversubscribing your CPU cores." << std::endl;
std::cout << " Detected: " << Impl::processors_per_node() << " cores per node." << std::endl;
std::cout << " Detected: " << Impl::mpi_ranks_per_node() << " MPI_ranks per node." << std::endl;
std::cout << " Requested: " << thread_count << " threads per process." << std::endl;
}
// Init the array for used for arbitrarily sized atomics
Impl::init_lock_array_host_space();
View File
@ -170,20 +170,20 @@ public:
// MDRangePolicy impl
template< class FunctorType , class ... Traits >
class ParallelFor< FunctorType
, Kokkos::Experimental::MDRangePolicy< Traits ... >
, Kokkos::MDRangePolicy< Traits ... >
, Kokkos::OpenMP
>
{
private:
typedef Kokkos::Experimental::MDRangePolicy< Traits ... > MDRangePolicy ;
typedef Kokkos::MDRangePolicy< Traits ... > MDRangePolicy ;
typedef typename MDRangePolicy::impl_range_policy Policy ;
typedef typename MDRangePolicy::work_tag WorkTag ;
typedef typename Policy::WorkRange WorkRange ;
typedef typename Policy::member_type Member ;
typedef typename Kokkos::Experimental::Impl::HostIterateTile< MDRangePolicy, FunctorType, typename MDRangePolicy::work_tag, void > iterate_type;
typedef typename Kokkos::Impl::HostIterateTile< MDRangePolicy, FunctorType, typename MDRangePolicy::work_tag, void > iterate_type;
OpenMPExec * m_instance ;
const FunctorType m_functor ;
@ -445,14 +445,14 @@ public:
// MDRangePolicy impl
template< class FunctorType , class ReducerType, class ... Traits >
class ParallelReduce< FunctorType
, Kokkos::Experimental::MDRangePolicy< Traits ...>
, Kokkos::MDRangePolicy< Traits ...>
, ReducerType
, Kokkos::OpenMP
>
{
private:
typedef Kokkos::Experimental::MDRangePolicy< Traits ... > MDRangePolicy ;
typedef Kokkos::MDRangePolicy< Traits ... > MDRangePolicy ;
typedef typename MDRangePolicy::impl_range_policy Policy ;
typedef typename MDRangePolicy::work_tag WorkTag ;
@ -472,7 +472,7 @@ private:
typedef typename Analysis::pointer_type pointer_type ;
typedef typename Analysis::reference_type reference_type ;
using iterate_type = typename Kokkos::Experimental::Impl::HostIterateTile< MDRangePolicy
using iterate_type = typename Kokkos::Impl::HostIterateTile< MDRangePolicy
, FunctorType
, WorkTag
, ValueType
View File
@ -49,33 +49,26 @@ namespace Impl {
template< class FunctorType , class ... Traits >
class ParallelFor< FunctorType ,
Kokkos::Experimental::WorkGraphPolicy< Traits ... > ,
Kokkos::WorkGraphPolicy< Traits ... > ,
Kokkos::OpenMP
>
: public Kokkos::Impl::Experimental::
WorkGraphExec< FunctorType,
Kokkos::OpenMP,
Traits ...
>
{
private:
typedef Kokkos::Experimental::WorkGraphPolicy< Traits ... > Policy ;
typedef Kokkos::Impl::Experimental::
WorkGraphExec<FunctorType, Kokkos::OpenMP, Traits ... > Base ;
typedef Kokkos::WorkGraphPolicy< Traits ... > Policy ;
Policy m_policy ;
FunctorType m_functor ;
template< class TagType >
typename std::enable_if< std::is_same< TagType , void >::value >::type
exec_one(const typename Policy::member_type& i) const {
Base::m_functor( i );
}
exec_one( const std::int32_t w ) const noexcept
{ m_functor( w ); }
template< class TagType >
typename std::enable_if< ! std::is_same< TagType , void >::value >::type
exec_one(const typename Policy::member_type& i) const {
const TagType t{} ;
Base::m_functor( t , i );
}
exec_one( const std::int32_t w ) const noexcept
{ const TagType t{} ; m_functor( t , w ); }
public:
@ -86,9 +79,15 @@ public:
#pragma omp parallel num_threads(pool_size)
{
for (std::int32_t i; (-1 != (i = Base::before_work())); ) {
exec_one< typename Policy::work_tag >( i );
Base::after_work(i);
// Spin until COMPLETED_TOKEN.
// END_TOKEN indicates no work is currently available.
for ( std::int32_t w = Policy::END_TOKEN ;
Policy::COMPLETED_TOKEN != ( w = m_policy.pop_work() ) ; ) {
if ( Policy::END_TOKEN != w ) {
exec_one< typename Policy::work_tag >( w );
m_policy.completed_work(w);
}
}
}
}
@ -96,12 +95,13 @@ public:
inline
ParallelFor( const FunctorType & arg_functor
, const Policy & arg_policy )
: Base( arg_functor, arg_policy )
{
}
: m_policy( arg_policy )
, m_functor( arg_functor )
{}
};
} // namespace Impl
} // namespace Kokkos
#endif /* #define KOKKOS_OPENMP_WORKGRAPHPOLICY_HPP */
View File
@ -125,7 +125,7 @@ namespace Kokkos {
oldval.t = *dest ;
assume.i = oldval.i ;
newval.t = val ;
atomic_compare_exchange( reinterpret_cast<int*>(dest) , assume.i, newval.i );
atomic_compare_exchange( (int*)(dest) , assume.i, newval.i );
return oldval.t ;
}
View File
@ -608,6 +608,7 @@ ROCmInternal::scratch_space( const Kokkos::Experimental::ROCm::size_type size )
void ROCmInternal::finalize()
{
Kokkos::Impl::rocm_device_synchronize();
was_finalized = 1;
if ( 0 != m_scratchSpace || 0 != m_scratchFlags ) {
View File
@ -277,7 +277,7 @@ public:
this->team_barrier();
value = local_value;
}
// Reduce accross a team of threads.
// Reduce across a team of threads.
//
// Each thread has vector_length elements.
// This reduction is for TeamThreadRange operations, where the range
@ -354,6 +354,80 @@ public:
return buffer[0];
}
// Reduce across a team of threads, with a reducer data type
//
// Each thread has vector_length elements.
// This reduction is for TeamThreadRange operations, where the range
// is spread across threads. Effectively, there are vector_length
// independent reduction operations.
// This is different from a reduction across the elements of a thread,
// which reduces every vector element.
template< class ReducerType >
KOKKOS_INLINE_FUNCTION
typename std::enable_if< is_reducer< ReducerType >::value >::type
team_reduce( const ReducerType & reducer) const
{
typedef typename ReducerType::value_type value_type ;
tile_static value_type buffer[512];
const auto local = lindex();
const auto team = team_rank();
auto vector_rank = local%m_vector_length;
auto thread_base = team*m_vector_length;
const std::size_t size = next_pow_2(m_team_size+1)/2;
#if defined(ROCM15)
buffer[local] = reducer.reference();
#else
// ROCM 1.5 handles address spaces better, previous version didn't
lds_for(buffer[local], [&](value_type& x)
{
x = reducer.reference();
});
#endif
m_idx.barrier.wait();
for(std::size_t s = 1; s < size; s *= 2)
{
const std::size_t index = 2 * s * team;
if (index < size)
{
#if defined(ROCM15)
reducer.join(buffer[vector_rank+index*m_vector_length],
buffer[vector_rank+(index+s)*m_vector_length]);
#else
lds_for(buffer[vector_rank+index*m_vector_length], [&](value_type& x)
{
lds_for(buffer[vector_rank+(index+s)*m_vector_length],
[&](value_type& y)
{
reducer.join(x, y);
});
});
#endif
}
m_idx.barrier.wait();
}
if (local == 0)
{
for(int i=size*m_vector_length; i<m_team_size*m_vector_length; i+=m_vector_length)
#if defined(ROCM15)
reducer.join(buffer[vector_rank], buffer[vector_rank+i]);
#else
lds_for(buffer[vector_rank], [&](value_type& x)
{
lds_for(buffer[vector_rank+i],
[&](value_type& y)
{
reducer.join(x, y);
});
});
#endif
}
m_idx.barrier.wait();
}
/** \brief Intra-team vector reduce
* with intra-team non-deterministic ordering accumulation.
@ -406,6 +480,33 @@ public:
return buffer[thread_base];
}
template< typename ReducerType >
KOKKOS_INLINE_FUNCTION static
typename std::enable_if< is_reducer< ReducerType >::value >::type
vector_reduce( ReducerType const & reducer )
{
#ifdef __HCC_ACCELERATOR__
if(blockDim_x == 1) return;
// Intra vector lane shuffle reduction:
typename ReducerType::value_type tmp ( reducer.reference() );
for ( int i = blockDim_x ; ( i >>= 1 ) ; ) {
shfl_down( reducer.reference() , i , blockDim_x );
if ( (int)threadIdx_x < i ) { reducer.join( tmp , reducer.reference() ); }
}
// Broadcast from root lane to all other lanes.
// Cannot use "butterfly" algorithm to avoid the broadcast
// because floating point summation is not associative
// and thus different threads could have different results.
shfl( reducer.reference() , 0 , blockDim_x );
#endif
}
/** \brief Intra-team exclusive prefix sum with team_rank() ordering
* with intra-team non-deterministic ordering accumulation.
*
@ -1075,6 +1176,22 @@ void parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::ROC
// Impl::JoinAdd<ValueType>());
}
/** \brief Inter-thread thread range parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1.
*
* The range i=0..N-1 is mapped to all threads of the calling thread team and a summation of
* val is performed and put into result. This functionality requires C++11 support.*/
template< typename iType, class Lambda, typename ReducerType >
KOKKOS_INLINE_FUNCTION
void parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::ROCmTeamMember>& loop_boundaries,
const Lambda & lambda, ReducerType const & reducer) {
reducer.init( reducer.reference() );
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
lambda(i,reducer.reference());
}
loop_boundaries.thread.team_reduce(reducer);
}
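A hedged call-site sketch for the reducer-based team reduction above (the team-member type, extent N, and the per-element value are placeholders, and Kokkos::Max is just one example of a reducer; this would be called from inside a team-parallel kernel).
// Illustrative only: maximum over a TeamThreadRange, via a reducer.
template< class TeamMember >
KOKKOS_INLINE_FUNCTION
double team_max_example( const TeamMember & team_member , const int N ) {
  double team_max = 0 ;
  Kokkos::parallel_reduce( Kokkos::TeamThreadRange( team_member , N ) ,
    [&] ( const int i , double & lmax ) { const double v = double(i) /* stand-in for f(i) */ ; if ( lmax < v ) lmax = v ; } ,
    Kokkos::Max<double>( team_max ) ) ;
  return team_max ;
}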
/** \brief Intra-thread thread range parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1.
*
* The range i=0..N-1 is mapped to all vector lanes of the calling thread and a reduction of
@ -1161,6 +1278,41 @@ void parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::R
result = loop_boundaries.thread.thread_reduce(result,join);
}
/** \brief Intra-thread vector parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1.
*
* The range i=0..N-1 is mapped to all vector lanes of the calling thread and a summation of
* val is performed and put into result. This functionality requires C++11 support.*/
template< typename iType, class Lambda, typename ReducerType >
KOKKOS_INLINE_FUNCTION
void parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::ROCmTeamMember >&
loop_boundaries, const Lambda & lambda, ReducerType const & reducer) {
reducer.init( reducer.reference() );
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
lambda(i,reducer.reference());
}
loop_boundaries.thread.vector_reduce(reducer);
}
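And the analogous vector-level form (again a hedged sketch with placeholder names), using the ThreadVectorRange reducer overload added above.
// Illustrative only: sum over the vector lanes of the calling thread.
template< class TeamMember >
KOKKOS_INLINE_FUNCTION
double lane_sum_example( const TeamMember & team_member , const int V ) {
  double lane_sum = 0 ;
  Kokkos::parallel_reduce( Kokkos::ThreadVectorRange( team_member , V ) ,
    [&] ( const int i , double & lsum ) { lsum += double(i) /* stand-in for f(i) */ ; } ,
    Kokkos::Sum<double>( lane_sum ) ) ;
  return lane_sum ;
}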
/** \brief Intra-thread vector parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1.
*
* The range i=0..N-1 is mapped to all vector lanes of the calling thread and a reduction of
* val is performed using JoinType(ValueType& val, const ValueType& update) and put into init_result.
* The input value of init_result is used as initializer for temporary variables of ValueType. Therefore
* the input value should be the neutral element with respect to the join operation (e.g. '0 for +-' or
* '1 for *'). This functionality requires C++11 support.*/
template< typename iType, class Lambda, typename ReducerType, class JoinType >
KOKKOS_INLINE_FUNCTION
void parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::ROCmTeamMember >&
loop_boundaries, const Lambda & lambda, const JoinType& join, ReducerType const & reducer) {
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
lambda(i,reducer.reference());
loop_boundaries.thread.team_barrier();
}
reducer.reference() = loop_boundaries.thread.thread_reduce(reducer.reference(),join);
}
/** \brief Intra-thread vector parallel exclusive prefix sum. Executes lambda(iType i, ValueType & val, bool final)
* for each i=0..N-1.
*
View File
@ -266,7 +266,7 @@ void ThreadsExec::execute_sleep( ThreadsExec & exec , const void * )
const int rank_rev = exec.m_pool_size - ( exec.m_pool_rank + 1 );
for ( int i = 0 ; i < n ; ++i ) {
Impl::spinwait_while_equal( exec.m_pool_base[ rank_rev + (1<<i) ]->m_pool_state , ThreadsExec::Active );
Impl::spinwait_while_equal<int>( exec.m_pool_base[ rank_rev + (1<<i) ]->m_pool_state , ThreadsExec::Active );
}
exec.m_pool_state = ThreadsExec::Inactive ;
@ -310,7 +310,7 @@ void ThreadsExec::fence()
{
if ( s_thread_pool_size[0] ) {
// Wait for the root thread to complete:
Impl::spinwait_while_equal( s_threads_exec[0]->m_pool_state , ThreadsExec::Active );
Impl::spinwait_while_equal<int>( s_threads_exec[0]->m_pool_state , ThreadsExec::Active );
}
s_current_function = 0 ;
@ -716,12 +716,12 @@ void ThreadsExec::initialize( unsigned thread_count ,
}
// Check for over-subscription
//if( Impl::mpi_ranks_per_node() * long(thread_count) > Impl::processors_per_node() ) {
// std::cout << "Kokkos::Threads::initialize WARNING: You are likely oversubscribing your CPU cores." << std::endl;
// std::cout << " Detected: " << Impl::processors_per_node() << " cores per node." << std::endl;
// std::cout << " Detected: " << Impl::mpi_ranks_per_node() << " MPI_ranks per node." << std::endl;
// std::cout << " Requested: " << thread_count << " threads per process." << std::endl;
//}
if( Kokkos::show_warnings() && (Impl::mpi_ranks_per_node() * long(thread_count) > Impl::processors_per_node()) ) {
std::cout << "Kokkos::Threads::initialize WARNING: You are likely oversubscribing your CPU cores." << std::endl;
std::cout << " Detected: " << Impl::processors_per_node() << " cores per node." << std::endl;
std::cout << " Detected: " << Impl::mpi_ranks_per_node() << " MPI_ranks per node." << std::endl;
std::cout << " Requested: " << thread_count << " threads per process." << std::endl;
}
// Init the array for used for arbitrarily sized atomics
Impl::init_lock_array_host_space();

View File

@ -50,6 +50,7 @@
#include <cstdio>
#include <utility>
#include <cstdalign>
#include <impl/Kokkos_Spinwait.hpp>
#include <impl/Kokkos_FunctorAdapter.hpp>
@ -107,7 +108,7 @@ private:
// Which thread am I stealing from currently
int m_current_steal_target;
// This thread's owned work_range
Kokkos::pair<long,long> m_work_range KOKKOS_ALIGN(16);
Kokkos::pair<long,long> m_work_range __attribute__((aligned(16))) ;
// Team Offset if one thread determines work_range for others
long m_team_work_index;
@ -191,13 +192,13 @@ public:
// Fan-in reduction with highest ranking thread as the root
for ( int i = 0 ; i < m_pool_fan_size ; ++i ) {
// Wait: Active -> Rendezvous
Impl::spinwait_while_equal( m_pool_base[ rev_rank + (1<<i) ]->m_pool_state , ThreadsExec::Active );
Impl::spinwait_while_equal<int>( m_pool_base[ rev_rank + (1<<i) ]->m_pool_state , ThreadsExec::Active );
}
if ( rev_rank ) {
m_pool_state = ThreadsExec::Rendezvous ;
// Wait: Rendezvous -> Active
Impl::spinwait_while_equal( m_pool_state , ThreadsExec::Rendezvous );
Impl::spinwait_while_equal<int>( m_pool_state , ThreadsExec::Rendezvous );
}
else {
// Root thread does the reduction and broadcast
@ -233,13 +234,13 @@ public:
// Fan-in reduction with highest ranking thread as the root
for ( int i = 0 ; i < m_pool_fan_size ; ++i ) {
// Wait: Active -> Rendezvous
Impl::spinwait_while_equal( m_pool_base[ rev_rank + (1<<i) ]->m_pool_state , ThreadsExec::Active );
Impl::spinwait_while_equal<int>( m_pool_base[ rev_rank + (1<<i) ]->m_pool_state , ThreadsExec::Active );
}
if ( rev_rank ) {
m_pool_state = ThreadsExec::Rendezvous ;
// Wait: Rendezvous -> Active
Impl::spinwait_while_equal( m_pool_state , ThreadsExec::Rendezvous );
Impl::spinwait_while_equal<int>( m_pool_state , ThreadsExec::Rendezvous );
}
else {
// Root thread does the reduction and broadcast
@ -268,7 +269,7 @@ public:
ThreadsExec & fan = *m_pool_base[ rev_rank + ( 1 << i ) ] ;
Impl::spinwait_while_equal( fan.m_pool_state , ThreadsExec::Active );
Impl::spinwait_while_equal<int>( fan.m_pool_state , ThreadsExec::Active );
Join::join( f , reduce_memory() , fan.reduce_memory() );
}
@ -295,7 +296,7 @@ public:
const int rev_rank = m_pool_size - ( m_pool_rank + 1 );
for ( int i = 0 ; i < m_pool_fan_size ; ++i ) {
Impl::spinwait_while_equal( m_pool_base[rev_rank+(1<<i)]->m_pool_state , ThreadsExec::Active );
Impl::spinwait_while_equal<int>( m_pool_base[rev_rank+(1<<i)]->m_pool_state , ThreadsExec::Active );
}
}
@ -327,7 +328,7 @@ public:
ThreadsExec & fan = *m_pool_base[ rev_rank + (1<<i) ];
// Wait: Active -> ReductionAvailable (or ScanAvailable)
Impl::spinwait_while_equal( fan.m_pool_state , ThreadsExec::Active );
Impl::spinwait_while_equal<int>( fan.m_pool_state , ThreadsExec::Active );
Join::join( f , work_value , fan.reduce_memory() );
}
@ -345,8 +346,8 @@ public:
// Wait: Active -> ReductionAvailable
// Wait: ReductionAvailable -> ScanAvailable
Impl::spinwait_while_equal( th.m_pool_state , ThreadsExec::Active );
Impl::spinwait_while_equal( th.m_pool_state , ThreadsExec::ReductionAvailable );
Impl::spinwait_while_equal<int>( th.m_pool_state , ThreadsExec::Active );
Impl::spinwait_while_equal<int>( th.m_pool_state , ThreadsExec::ReductionAvailable );
Join::join( f , work_value + count , ((scalar_type *)th.reduce_memory()) + count );
}
@ -357,7 +358,7 @@ public:
// Wait for all threads to complete inclusive scan
// Wait: ScanAvailable -> Rendezvous
Impl::spinwait_while_equal( m_pool_state , ThreadsExec::ScanAvailable );
Impl::spinwait_while_equal<int>( m_pool_state , ThreadsExec::ScanAvailable );
}
//--------------------------------
@ -365,7 +366,7 @@ public:
for ( int i = 0 ; i < m_pool_fan_size ; ++i ) {
ThreadsExec & fan = *m_pool_base[ rev_rank + (1<<i) ];
// Wait: ReductionAvailable -> ScanAvailable
Impl::spinwait_while_equal( fan.m_pool_state , ThreadsExec::ReductionAvailable );
Impl::spinwait_while_equal<int>( fan.m_pool_state , ThreadsExec::ReductionAvailable );
// Set: ScanAvailable -> Rendezvous
fan.m_pool_state = ThreadsExec::Rendezvous ;
}
@ -392,13 +393,13 @@ public:
// Wait for all threads to copy previous thread's inclusive scan value
// Wait for all threads: Rendezvous -> ScanCompleted
for ( int i = 0 ; i < m_pool_fan_size ; ++i ) {
Impl::spinwait_while_equal( m_pool_base[ rev_rank + (1<<i) ]->m_pool_state , ThreadsExec::Rendezvous );
Impl::spinwait_while_equal<int>( m_pool_base[ rev_rank + (1<<i) ]->m_pool_state , ThreadsExec::Rendezvous );
}
if ( rev_rank ) {
// Set: ScanAvailable -> ScanCompleted
m_pool_state = ThreadsExec::ScanCompleted ;
// Wait: ScanCompleted -> Active
Impl::spinwait_while_equal( m_pool_state , ThreadsExec::ScanCompleted );
Impl::spinwait_while_equal<int>( m_pool_state , ThreadsExec::ScanCompleted );
}
// Set: ScanCompleted -> Active
for ( int i = 0 ; i < m_pool_fan_size ; ++i ) {
@ -425,7 +426,7 @@ public:
// Fan-in reduction with highest ranking thread as the root
for ( int i = 0 ; i < m_pool_fan_size ; ++i ) {
// Wait: Active -> Rendezvous
Impl::spinwait_while_equal( m_pool_base[ rev_rank + (1<<i) ]->m_pool_state , ThreadsExec::Active );
Impl::spinwait_while_equal<int>( m_pool_base[ rev_rank + (1<<i) ]->m_pool_state , ThreadsExec::Active );
}
for ( unsigned i = 0 ; i < count ; ++i ) { work_value[i+count] = work_value[i]; }
@ -433,7 +434,7 @@ public:
if ( rev_rank ) {
m_pool_state = ThreadsExec::Rendezvous ;
// Wait: Rendezvous -> Active
Impl::spinwait_while_equal( m_pool_state , ThreadsExec::Rendezvous );
Impl::spinwait_while_equal<int>( m_pool_state , ThreadsExec::Rendezvous );
}
else {
// Root thread does the thread-scan before releasing threads

View File

@ -107,13 +107,13 @@ public:
// Wait for fan-in threads
for ( n = 1 ; ( ! ( m_team_rank_rev & n ) ) && ( ( j = m_team_rank_rev + n ) < m_team_size ) ; n <<= 1 ) {
Impl::spinwait_while_equal( m_team_base[j]->state() , ThreadsExec::Active );
Impl::spinwait_while_equal<int>( m_team_base[j]->state() , ThreadsExec::Active );
}
// If not root then wait for release
if ( m_team_rank_rev ) {
m_exec->state() = ThreadsExec::Rendezvous ;
Impl::spinwait_while_equal( m_exec->state() , ThreadsExec::Rendezvous );
Impl::spinwait_while_equal<int>( m_exec->state() , ThreadsExec::Rendezvous );
}
return ! m_team_rank_rev ;

View File

@ -180,12 +180,12 @@ public:
// MDRangePolicy impl
template< class FunctorType , class ... Traits >
class ParallelFor< FunctorType
, Kokkos::Experimental::MDRangePolicy< Traits ... >
, Kokkos::MDRangePolicy< Traits ... >
, Kokkos::Threads
>
{
private:
typedef Kokkos::Experimental::MDRangePolicy< Traits ... > MDRangePolicy ;
typedef Kokkos::MDRangePolicy< Traits ... > MDRangePolicy ;
typedef typename MDRangePolicy::impl_range_policy Policy ;
typedef typename MDRangePolicy::work_tag WorkTag ;
@ -193,7 +193,7 @@ private:
typedef typename Policy::WorkRange WorkRange ;
typedef typename Policy::member_type Member ;
typedef typename Kokkos::Experimental::Impl::HostIterateTile< MDRangePolicy, FunctorType, typename MDRangePolicy::work_tag, void > iterate_type;
typedef typename Kokkos::Impl::HostIterateTile< MDRangePolicy, FunctorType, typename MDRangePolicy::work_tag, void > iterate_type;
const FunctorType m_functor ;
const MDRangePolicy m_mdr_policy ;
@ -548,14 +548,14 @@ public:
// MDRangePolicy impl
template< class FunctorType , class ReducerType, class ... Traits >
class ParallelReduce< FunctorType
, Kokkos::Experimental::MDRangePolicy< Traits ... >
, Kokkos::MDRangePolicy< Traits ... >
, ReducerType
, Kokkos::Threads
>
{
private:
typedef Kokkos::Experimental::MDRangePolicy< Traits ... > MDRangePolicy ;
typedef Kokkos::MDRangePolicy< Traits ... > MDRangePolicy ;
typedef typename MDRangePolicy::impl_range_policy Policy ;
typedef typename MDRangePolicy::work_tag WorkTag ;
@ -573,7 +573,7 @@ private:
typedef typename ValueTraits::pointer_type pointer_type ;
typedef typename ValueTraits::reference_type reference_type ;
using iterate_type = typename Kokkos::Experimental::Impl::HostIterateTile< MDRangePolicy
using iterate_type = typename Kokkos::Impl::HostIterateTile< MDRangePolicy
, FunctorType
, WorkTag
, ValueType

View File

@ -49,48 +49,50 @@ namespace Impl {
template< class FunctorType , class ... Traits >
class ParallelFor< FunctorType ,
Kokkos::Experimental::WorkGraphPolicy< Traits ... > ,
Kokkos::WorkGraphPolicy< Traits ... > ,
Kokkos::Threads
>
: public Kokkos::Impl::Experimental::
WorkGraphExec< FunctorType,
Kokkos::Threads,
Traits ...
>
{
private:
typedef Kokkos::Experimental::WorkGraphPolicy< Traits ... > Policy ;
typedef Kokkos::Impl::Experimental::
WorkGraphExec<FunctorType, Kokkos::Threads, Traits ... > Base ;
typedef Kokkos::WorkGraphPolicy< Traits ... > Policy ;
typedef ParallelFor<FunctorType,
Kokkos::Experimental::WorkGraphPolicy<Traits ...>,
Kokkos::WorkGraphPolicy<Traits ...>,
Kokkos::Threads> Self ;
Policy m_policy ;
FunctorType m_functor ;
template< class TagType >
typename std::enable_if< std::is_same< TagType , void >::value >::type
exec_one(const typename Policy::member_type& i) const {
Base::m_functor( i );
}
exec_one( const std::int32_t w ) const noexcept
{ m_functor( w ); }
template< class TagType >
typename std::enable_if< ! std::is_same< TagType , void >::value >::type
exec_one(const typename Policy::member_type& i) const {
const TagType t{} ;
Base::m_functor( t , i );
}
exec_one( const std::int32_t w ) const noexcept
{ const TagType t{}; m_functor( t , w ); }
inline void exec_one_thread() const {
for (std::int32_t i; (-1 != (i = Base::before_work())); ) {
exec_one< typename Policy::work_tag >( i );
Base::after_work(i);
inline void exec_one_thread() const noexcept
{
// Spin until COMPLETED_TOKEN.
// END_TOKEN indicates no work is currently available.
for ( std::int32_t w = Policy::END_TOKEN ;
Policy::COMPLETED_TOKEN != ( w = m_policy.pop_work() ) ; ) {
if ( Policy::END_TOKEN != w ) {
exec_one< typename Policy::work_tag >( w );
m_policy.completed_work(w);
}
}
}
}
static inline void thread_main( ThreadsExec&, const void* arg ) {
const Self& self = *(static_cast<const Self*>(arg));
self.exec_one_thread();
}
static inline void thread_main( ThreadsExec&, const void* arg ) noexcept
{
const Self& self = *(static_cast<const Self*>(arg));
self.exec_one_thread();
}
public:
@ -104,9 +106,9 @@ public:
inline
ParallelFor( const FunctorType & arg_functor
, const Policy & arg_policy )
: Base( arg_functor, arg_policy )
{
}
: m_policy( arg_policy )
, m_functor( arg_functor )
{}
};
} // namespace Impl

View File

@ -59,7 +59,7 @@
#include <algorithm>
#include <cstdio>
namespace Kokkos { namespace Experimental { namespace Impl {
namespace Kokkos { namespace Impl {
// Temporary, for testing new loop macros
#define KOKKOS_ENABLE_NEW_LOOP_MACROS 1
@ -1274,7 +1274,7 @@ struct Tile_Loop_Type<8, IsLeft, IType, Tagged, typename std::enable_if< !std::i
template <typename T>
using is_void = std::is_same< T , void >;
using is_void_type = std::is_same< T , void >;
template <typename T>
struct is_type_array : std::false_type
@ -1303,7 +1303,7 @@ template < typename RP
, typename Tag
, typename ValueType
>
struct HostIterateTile < RP , Functor , Tag , ValueType , typename std::enable_if< is_void<ValueType >::value >::type >
struct HostIterateTile < RP , Functor , Tag , ValueType , typename std::enable_if< is_void_type<ValueType >::value >::type >
{
using index_type = typename RP::index_type;
using point_type = typename RP::point_type;
@ -1781,7 +1781,7 @@ template < typename RP
, typename Tag
, typename ValueType
>
struct HostIterateTile < RP , Functor , Tag , ValueType , typename std::enable_if< !is_void<ValueType >::value && !is_type_array<ValueType>::value >::type >
struct HostIterateTile < RP , Functor , Tag , ValueType , typename std::enable_if< !is_void_type<ValueType >::value && !is_type_array<ValueType>::value >::type >
{
using index_type = typename RP::index_type;
using point_type = typename RP::point_type;
@ -2268,7 +2268,7 @@ template < typename RP
, typename Tag
, typename ValueType
>
struct HostIterateTile < RP , Functor , Tag , ValueType , typename std::enable_if< !is_void<ValueType >::value && is_type_array<ValueType>::value >::type >
struct HostIterateTile < RP , Functor , Tag , ValueType , typename std::enable_if< !is_void_type<ValueType >::value && is_type_array<ValueType>::value >::type >
{
using index_type = typename RP::index_type;
using point_type = typename RP::point_type;
@ -2750,6 +2750,8 @@ struct HostIterateTile < RP , Functor , Tag , ValueType , typename std::enable_i
// Cuda uses DeviceIterateTile directly within md_parallel_for
// TODO Once md_parallel_{for,reduce} removed, this can be removed
namespace Experimental {
// ParallelReduce - scalar reductions
template < typename MDRange, typename Functor, typename ValueType = void >
struct MDFunctor
@ -2759,11 +2761,11 @@ struct MDFunctor
using value_type = ValueType;
using work_tag = typename range_policy::work_tag;
using index_type = typename range_policy::index_type;
using iterate_type = typename Kokkos::Experimental::Impl::HostIterateTile< MDRange
, Functor
, work_tag
, value_type
>;
using iterate_type = typename Kokkos::Impl::HostIterateTile< MDRange
, Functor
, work_tag
, value_type
>;
inline
@ -2804,11 +2806,11 @@ struct MDFunctor< MDRange, Functor, ValueType[] >
using value_type = ValueType[];
using work_tag = typename range_policy::work_tag;
using index_type = typename range_policy::index_type;
using iterate_type = typename Kokkos::Experimental::Impl::HostIterateTile< MDRange
, Functor
, work_tag
, value_type
>;
using iterate_type = typename Kokkos::Impl::HostIterateTile< MDRange
, Functor
, work_tag
, value_type
>;
inline
@ -2852,11 +2854,11 @@ struct MDFunctor< MDRange, Functor, void >
using functor_type = Functor;
using work_tag = typename range_policy::work_tag;
using index_type = typename range_policy::index_type;
using iterate_type = typename Kokkos::Experimental::Impl::HostIterateTile< MDRange
, Functor
, work_tag
, void
>;
using iterate_type = typename Kokkos::Impl::HostIterateTile< MDRange
, Functor
, work_tag
, void
>;
inline
@ -2887,8 +2889,9 @@ struct MDFunctor< MDRange, Functor, void >
Functor m_func;
};
} // end namespace Experimental
#undef KOKKOS_ENABLE_NEW_LOOP_MACROS
} } } //end namespace Kokkos::Experimental::Impl
} } //end namespace Kokkos::Impl
#endif

View File

@ -51,9 +51,12 @@
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
namespace {
bool g_is_initialized = false;
bool g_show_warnings = true;
}
namespace Kokkos { namespace Impl { namespace {
bool is_unsigned_int(const char* str)
{
@ -75,6 +78,10 @@ void initialize_internal(const InitArguments& args)
setenv("MEMKIND_HBW_NODES", "1", 0);
#endif
if (args.disable_warnings) {
g_show_warnings = false;
}
// Protect declarations, to prevent "unused variable" warnings.
#if defined( KOKKOS_ENABLE_OPENMP ) || defined( KOKKOS_ENABLE_THREADS ) || defined( KOKKOS_ENABLE_OPENMPTARGET )
const int num_threads = args.num_threads;
@ -177,6 +184,7 @@ setenv("MEMKIND_HBW_NODES", "1", 0);
#if defined(KOKKOS_ENABLE_PROFILING)
Kokkos::Profiling::initialize();
#endif
g_is_initialized = true;
}
void finalize_internal( const bool all_spaces = false )
@ -233,6 +241,9 @@ void finalize_internal( const bool all_spaces = false )
Kokkos::Serial::finalize();
}
#endif
g_is_initialized = false;
g_show_warnings = true;
}
void fence_internal()
@ -306,9 +317,7 @@ bool check_int_arg(char const* arg, char const* expected, int* value) {
return true;
}
} // namespace
} // namespace Impl
} // namespace Kokkos
}}} // namespace Kokkos::Impl::{unnamed}
//----------------------------------------------------------------------------
@ -319,6 +328,7 @@ void initialize(int& narg, char* arg[])
int num_threads = -1;
int numa = -1;
int device = -1;
bool disable_warnings = false;
int kokkos_threads_found = 0;
int kokkos_numa_found = 0;
@ -415,6 +425,12 @@ void initialize(int& narg, char* arg[])
} else {
iarg++;
}
} else if ( strcmp(arg[iarg],"--kokkos-disable-warnings") == 0) {
disable_warnings = true;
for(int k=iarg;k<narg-1;k++) {
arg[k] = arg[k+1];
}
narg--;
} else if ((strcmp(arg[iarg],"--kokkos-help") == 0) || (strcmp(arg[iarg],"--help") == 0)) {
std::cout << std::endl;
std::cout << "--------------------------------------------------------------------------------" << std::endl;
@ -427,6 +443,7 @@ void initialize(int& narg, char* arg[])
std::cout << "settings." << std::endl;
std::cout << std::endl;
std::cout << "--kokkos-help : print this message" << std::endl;
std::cout << "--kokkos-disable-warnings : disable kokkos warning messages" << std::endl;
std::cout << "--kokkos-threads=INT : specify total number of threads or" << std::endl;
std::cout << " number of threads per NUMA region if " << std::endl;
std::cout << " used in conjunction with '--numa' option. " << std::endl;
@ -457,7 +474,7 @@ void initialize(int& narg, char* arg[])
iarg++;
}
InitArguments arguments{num_threads, numa, device};
InitArguments arguments{num_threads, numa, device, disable_warnings};
Impl::initialize_internal(arguments);
}
@ -787,5 +804,9 @@ void print_configuration( std::ostream & out , const bool detail )
out << msg.str() << std::endl;
}
bool is_initialized() noexcept { return g_is_initialized; }
bool show_warnings() noexcept { return g_show_warnings; }
} // namespace Kokkos
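The new switch is stripped from the argument list before the remaining options are parsed, and the resulting state is queried through show_warnings(). A minimal sketch (the binary name is hypothetical):
// Launch example (hypothetical binary): ./my_app --kokkos-threads=16 --kokkos-disable-warnings
// Backend code then gates its diagnostics on the stored flag:
if ( Kokkos::show_warnings() ) {
  std::cerr << "WARNING: backend-specific diagnostic" << std::endl ;
}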

View File

@ -476,7 +476,7 @@ template< class FunctorType , class ArgTag , class T , class Enable >
struct FunctorValueInit< FunctorType , ArgTag , T & , Enable >
{
KOKKOS_FORCEINLINE_FUNCTION static
T & init( const FunctorType & f , void * p )
T & init( const FunctorType & , void * p )
{ return *( new(p) T() ); };
};

View File

@ -254,7 +254,12 @@ void * HostSpace::allocate( const size_t arg_alloc_size ) const
}
void HostSpace::deallocate( void * const arg_alloc_ptr , const size_t arg_alloc_size ) const
void HostSpace::deallocate( void * const arg_alloc_ptr
, const size_t
#if defined( KOKKOS_IMPL_POSIX_MMAP_FLAGS )
arg_alloc_size
#endif
) const
{
if ( arg_alloc_ptr ) {
@ -409,7 +414,7 @@ SharedAllocationRecord< Kokkos::HostSpace , void >::get_record( void * alloc_ptr
// Iterate records to print orphaned memory ...
void SharedAllocationRecord< Kokkos::HostSpace , void >::
print_records( std::ostream & s , const Kokkos::HostSpace & space , bool detail )
print_records( std::ostream & s , const Kokkos::HostSpace & , bool detail )
{
SharedAllocationRecord< void , void >::print_host_accessible_records( s , "HostSpace" , & s_root_record , detail );
}

View File

@ -44,6 +44,9 @@
#include <Kokkos_Macros.hpp>
#if defined( KOKKOS_ATOMIC_HPP ) && ! defined( KOKKOS_MEMORY_FENCE_HPP )
#define KOKKOS_MEMORY_FENCE_HPP
#include <atomic>
namespace Kokkos {
//----------------------------------------------------------------------------
@ -53,23 +56,8 @@ void memory_fence()
{
#if defined( __CUDA_ARCH__ )
__threadfence();
#elif defined( KOKKOS_ENABLE_ROCM_ATOMICS )
amp_barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);
#elif defined( KOKKOS_ENABLE_ASM ) && defined( KOKKOS_ENABLE_ISA_X86_64 )
asm volatile (
"mfence" ::: "memory"
);
#elif defined( KOKKOS_ENABLE_GNU_ATOMICS ) || \
( defined( KOKKOS_COMPILER_NVCC ) && defined( KOKKOS_ENABLE_INTEL_ATOMICS ) )
__sync_synchronize();
#elif defined( KOKKOS_ENABLE_INTEL_ATOMICS )
_mm_mfence();
#elif defined( KOKKOS_ENABLE_OPENMP_ATOMICS )
#pragma omp flush
#elif defined( KOKKOS_ENABLE_WINDOWS_ATOMICS )
MemoryBarrier();
#else
#error "Error: memory_fence() not defined"
std::atomic_thread_fence( std::memory_order_seq_cst );
#endif
}
@ -81,12 +69,10 @@ void memory_fence()
KOKKOS_FORCEINLINE_FUNCTION
void store_fence()
{
#if defined( KOKKOS_ENABLE_ASM ) && defined( KOKKOS_ENABLE_ISA_X86_64 )
asm volatile (
"sfence" ::: "memory"
);
#if defined( __CUDA_ARCH__ )
__threadfence();
#else
memory_fence();
std::atomic_thread_fence( std::memory_order_seq_cst );
#endif
}
@ -98,12 +84,10 @@ void store_fence()
KOKKOS_FORCEINLINE_FUNCTION
void load_fence()
{
#if defined( KOKKOS_ENABLE_ASM ) && defined( KOKKOS_ENABLE_ISA_X86_64 )
asm volatile (
"lfence" ::: "memory"
);
#if defined( __CUDA_ARCH__ )
__threadfence();
#else
memory_fence();
std::atomic_thread_fence( std::memory_order_seq_cst );
#endif
}
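With the fences now lowered to std::atomic_thread_fence on host (and __threadfence on device), a typical producer-side handshake looks like the sketch below; data and flag stand for variables shared with another thread and compute_value() is hypothetical:
data = compute_value();   // 1. publish the payload
Kokkos::memory_fence();   // 2. order the store to 'data' before the flag update
flag = 1 ;                // 3. a consumer that observes flag == 1 also observes 'data'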

View File

@ -49,51 +49,50 @@ namespace Impl {
template< class FunctorType , class ... Traits >
class ParallelFor< FunctorType ,
Kokkos::Experimental::WorkGraphPolicy< Traits ... > ,
Kokkos::WorkGraphPolicy< Traits ... > ,
Kokkos::Serial
>
: public Kokkos::Impl::Experimental::
WorkGraphExec< FunctorType,
Kokkos::Serial,
Traits ...
>
{
private:
typedef Kokkos::Experimental::WorkGraphPolicy< Traits ... > Policy ;
typedef Kokkos::Impl::Experimental::
WorkGraphExec<FunctorType, Kokkos::Serial, Traits ... > Base ;
typedef Kokkos::WorkGraphPolicy< Traits ... > Policy ;
Policy m_policy ;
FunctorType m_functor ;
template< class TagType >
typename std::enable_if< std::is_same< TagType , void >::value >::type
exec_one(const typename Policy::member_type& i) const {
Base::m_functor( i );
}
exec_one( const std::int32_t w ) const noexcept
{ m_functor( w ); }
template< class TagType >
typename std::enable_if< ! std::is_same< TagType , void >::value >::type
exec_one(const typename Policy::member_type& i) const {
const TagType t{} ;
Base::m_functor( t , i );
}
exec_one( const std::int32_t w ) const noexcept
{ const TagType t{}; m_functor( t , w ); }
public:
inline
void execute()
{
for (std::int32_t i; (-1 != (i = Base::before_work())); ) {
exec_one< typename Policy::work_tag >( i );
Base::after_work(i);
void execute() const noexcept
{
// Spin until COMPLETED_TOKEN.
// END_TOKEN indicates no work is currently available.
for ( std::int32_t w = Policy::END_TOKEN ;
Policy::COMPLETED_TOKEN != ( w = m_policy.pop_work() ) ; ) {
if ( Policy::END_TOKEN != w ) {
exec_one< typename Policy::work_tag >( w );
m_policy.completed_work(w);
}
}
}
}
inline
ParallelFor( const FunctorType & arg_functor
, const Policy & arg_policy )
: Base( arg_functor, arg_policy )
{
}
: m_policy( arg_policy )
, m_functor( arg_functor )
{}
};
} // namespace Impl

View File

@ -306,7 +306,7 @@ print_host_accessible_records( std::ostream & s
, reinterpret_cast<uintptr_t>( r->m_dealloc )
, r->m_alloc_ptr->m_label
);
std::cout << buffer ;
s << buffer ;
r = r->m_next ;
} while ( r != root );
}
@ -334,7 +334,7 @@ print_host_accessible_records( std::ostream & s
else {
snprintf( buffer , 256 , "%s [ 0 + 0 ]\n" , space_name );
}
std::cout << buffer ;
s << buffer ;
r = r->m_next ;
} while ( r != root );
}

View File

@ -294,9 +294,13 @@ public:
template< class MemorySpace >
constexpr
SharedAllocationRecord< MemorySpace , void > &
get_record() const
{ return * static_cast< SharedAllocationRecord< MemorySpace , void > * >( m_record ); }
SharedAllocationRecord< MemorySpace , void > *
get_record() const noexcept
{
return ( m_record_bits & DO_NOT_DEREF_FLAG )
? (SharedAllocationRecord< MemorySpace,void>*) 0
: static_cast<SharedAllocationRecord<MemorySpace,void>*>(m_record);
}
template< class MemorySpace >
std::string get_label() const
@ -323,6 +327,16 @@ public:
return (m_record_bits & (~DO_NOT_DEREF_FLAG)) != 0;
}
KOKKOS_FORCEINLINE_FUNCTION
void clear()
{
// If this is tracking then must decrement
KOKKOS_IMPL_SHARED_ALLOCATION_TRACKER_DECREMENT
// Reset to default constructed value.
m_record_bits = DO_NOT_DEREF_FLAG ;
}
// Copy:
KOKKOS_FORCEINLINE_FUNCTION
~SharedAllocationTracker()
{ KOKKOS_IMPL_SHARED_ALLOCATION_TRACKER_DECREMENT }

View File

@ -48,7 +48,7 @@
#include <impl/Kokkos_Spinwait.hpp>
#include <impl/Kokkos_BitOps.hpp>
#if defined( KOKKOS_ENABLE_STDTHREAD )
#if defined( KOKKOS_ENABLE_STDTHREAD) || defined( _WIN32 )
#include <thread>
#elif !defined( _WIN32 )
#include <sched.h>
@ -63,9 +63,8 @@
namespace Kokkos {
namespace Impl {
namespace {
void host_thread_yield( const uint32_t i , const int force_yield )
void host_thread_yield( const uint32_t i , const WaitMode mode )
{
static constexpr uint32_t sleep_limit = 1 << 13 ;
static constexpr uint32_t yield_limit = 1 << 12 ;
@ -76,28 +75,26 @@ void host_thread_yield( const uint32_t i , const int force_yield )
// Attempt to put the thread to sleep for 'c' milliseconds
#if defined( KOKKOS_ENABLE_STDTHREAD )
std::this_thread::sleep_for( std::chrono::nanoseconds( c * 1000 ) )
#elif !defined( _WIN32 )
#if defined( KOKKOS_ENABLE_STDTHREAD ) || defined( _WIN32 )
auto start = std::chrono::high_resolution_clock::now();
std::this_thread::yield();
std::this_thread::sleep_until( start + std::chrono::nanoseconds( c * 1000 ) );
#else
timespec req ;
req.tv_sec = 0 ;
req.tv_nsec = 1000 * c ;
nanosleep( &req, nullptr );
#else /* defined( _WIN32 ) IS Microsoft Windows */
Sleep(c);
#endif
}
else if ( force_yield || yield_limit < i ) {
else if ( mode == WaitMode::PASSIVE || yield_limit < i ) {
// Attempt to yield thread resources to runtime
#if defined( KOKKOS_ENABLE_STDTHREAD )
#if defined( KOKKOS_ENABLE_STDTHREAD ) || defined( _WIN32 )
std::this_thread::yield();
#elif !defined( _WIN32 )
#else
sched_yield();
#else /* defined( _WIN32 ) IS Microsoft Windows */
YieldProcessor();
#endif
}
@ -110,9 +107,9 @@ void host_thread_yield( const uint32_t i , const int force_yield )
for ( int k = 0 ; k < c ; ++k ) {
#if defined( __amd64 ) || defined( __amd64__ ) || \
defined( __x86_64 ) || defined( __x86_64__ )
#if !defined( _WIN32 ) /* IS NOT Microsoft Windows */
#if !defined( _WIN32 ) /* IS NOT Microsoft Windows */
asm volatile( "nop\n" );
#else
#else
__asm__ __volatile__( "nop\n" );
#endif
#elif defined(__PPC64__)
@ -123,86 +120,22 @@ void host_thread_yield( const uint32_t i , const int force_yield )
{
// Insert memory pause
#if defined( __amd64 ) || defined( __amd64__ ) || \
defined( __x86_64 ) || defined( __x86_64__ )
#if !defined( _WIN32 ) /* IS NOT Microsoft Windows */
#if defined( __amd64 ) || defined( __amd64__ ) || \
defined( __x86_64 ) || defined( __x86_64__ )
#if !defined( _WIN32 ) /* IS NOT Microsoft Windows */
asm volatile( "pause\n":::"memory" );
#else
#else
__asm__ __volatile__( "pause\n":::"memory" );
#endif
#elif defined(__PPC64__)
asm volatile( "or 27, 27, 27" ::: "memory" );
asm volatile( "or 27, 27, 27" ::: "memory" );
#endif
}
#endif /* defined( KOKKOS_ENABLE_ASM ) */
}
}}} // namespace Kokkos::Impl::{anonymous}
/*--------------------------------------------------------------------------*/
namespace Kokkos {
namespace Impl {
void spinwait_while_equal( volatile int32_t & flag , const int32_t value )
{
Kokkos::store_fence();
uint32_t i = 0 ; while( value == flag ) host_thread_yield(++i,0);
Kokkos::load_fence();
}
void spinwait_until_equal( volatile int32_t & flag , const int32_t value )
{
Kokkos::store_fence();
uint32_t i = 0 ; while( value != flag ) host_thread_yield(++i,0);
Kokkos::load_fence();
}
void spinwait_while_equal( volatile int64_t & flag , const int64_t value )
{
Kokkos::store_fence();
uint32_t i = 0 ; while( value == flag ) host_thread_yield(++i,0);
Kokkos::load_fence();
}
void spinwait_until_equal( volatile int64_t & flag , const int64_t value )
{
Kokkos::store_fence();
uint32_t i = 0 ; while( value != flag ) host_thread_yield(++i,0);
Kokkos::load_fence();
}
void yield_while_equal( volatile int32_t & flag , const int32_t value )
{
Kokkos::store_fence();
uint32_t i = 0 ; while( value == flag ) host_thread_yield(++i,1);
Kokkos::load_fence();
}
void yield_until_equal( volatile int32_t & flag , const int32_t value )
{
Kokkos::store_fence();
uint32_t i = 0 ; while( value != flag ) host_thread_yield(++i,1);
Kokkos::load_fence();
}
void yield_while_equal( volatile int64_t & flag , const int64_t value )
{
Kokkos::store_fence();
uint32_t i = 0 ; while( value == flag ) host_thread_yield(++i,1);
Kokkos::load_fence();
}
void yield_until_equal( volatile int64_t & flag , const int64_t value )
{
Kokkos::store_fence();
uint32_t i = 0 ; while( value != flag ) host_thread_yield(++i,1);
Kokkos::load_fence();
}
} /* namespace Impl */
} /* namespace Kokkos */
}} // namespace Kokkos::Impl
#else
void KOKKOS_CORE_SRC_IMPL_SPINWAIT_PREVENT_LINK_ERROR() {}

View File

@ -46,47 +46,95 @@
#define KOKKOS_SPINWAIT_HPP
#include <Kokkos_Macros.hpp>
#include <Kokkos_Atomic.hpp>
#include <cstdint>
#include <type_traits>
namespace Kokkos {
namespace Impl {
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
void spinwait_while_equal( volatile int32_t & flag , const int32_t value );
void spinwait_until_equal( volatile int32_t & flag , const int32_t value );
enum class WaitMode : int {
ACTIVE // Used for tight loops to keep threads active longest
, PASSIVE // Used to quickly yield the thread to quiet down the system
};
void spinwait_while_equal( volatile int64_t & flag , const int64_t value );
void spinwait_until_equal( volatile int64_t & flag , const int64_t value );
void yield_while_equal( volatile int32_t & flag , const int32_t value );
void yield_until_equal( volatile int32_t & flag , const int32_t value );
void host_thread_yield( const uint32_t i , const WaitMode mode );
void yield_while_equal( volatile int64_t & flag , const int64_t value );
void yield_until_equal( volatile int64_t & flag , const int64_t value );
template <typename T>
typename std::enable_if< std::is_integral<T>::value, void>::type
spinwait_while_equal( T const volatile & flag, const T value )
{
Kokkos::store_fence();
uint32_t i = 0 ;
while( value == flag ) {
host_thread_yield(++i, WaitMode::ACTIVE);
}
Kokkos::load_fence();
}
template <typename T>
typename std::enable_if< std::is_integral<T>::value, void>::type
yield_while_equal( T const volatile & flag, const T value )
{
Kokkos::store_fence();
uint32_t i = 0 ;
while( value == flag ) {
host_thread_yield(++i, WaitMode::PASSIVE);
}
Kokkos::load_fence();
}
template <typename T>
typename std::enable_if< std::is_integral<T>::value, void>::type
spinwait_until_equal( T const volatile & flag, const T value )
{
Kokkos::store_fence();
uint32_t i = 0 ;
while( value != flag ) {
host_thread_yield(++i, WaitMode::ACTIVE);
}
Kokkos::load_fence();
}
template <typename T>
typename std::enable_if< std::is_integral<T>::value, void>::type
yield_until_equal( T const volatile & flag, const T value )
{
Kokkos::store_fence();
uint32_t i = 0 ;
while( value != flag ) {
host_thread_yield(++i, WaitMode::PASSIVE);
}
Kokkos::load_fence();
}
#else
template <typename T>
KOKKOS_INLINE_FUNCTION
void spinwait_while_equal( volatile int32_t & , const int32_t ) {}
KOKKOS_INLINE_FUNCTION
void spinwait_until_equal( volatile int32_t & , const int32_t ) {}
typename std::enable_if< std::is_integral<T>::value, void>::type
spinwait_while_equal( T const volatile & flag, const T value ) {}
template <typename T>
KOKKOS_INLINE_FUNCTION
void spinwait_while_equal( volatile int64_t & , const int64_t ) {}
KOKKOS_INLINE_FUNCTION
void spinwait_until_equal( volatile int64_t & , const int64_t ) {}
typename std::enable_if< std::is_integral<T>::value, void>::type
yield_while_equal( T const volatile & flag, const T value ) {}
template <typename T>
KOKKOS_INLINE_FUNCTION
void yield_while_equal( volatile int32_t & , const int32_t ) {}
KOKKOS_INLINE_FUNCTION
void yield_until_equal( volatile int32_t & , const int32_t ) {}
typename std::enable_if< std::is_integral<T>::value, void>::type
spinwait_until_equal( T const volatile & flag, const T value ) {}
template <typename T>
KOKKOS_INLINE_FUNCTION
void yield_while_equal( volatile int64_t & , const int64_t ) {}
KOKKOS_INLINE_FUNCTION
void yield_until_equal( volatile int64_t & , const int64_t ) {}
typename std::enable_if< std::is_integral<T>::value, void>::type
yield_until_equal( T const volatile & flag, const T value ) {}
#endif
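A host-side wait on a shared state flag then reads as follows. This is a sketch only; m_pool_state stands for a flag that another thread will eventually change, as in the ThreadsExec code earlier in this commit:
// Stay hot (ACTIVE mode) while the peer still reports Active ...
Kokkos::Impl::spinwait_while_equal<int>( m_pool_state , ThreadsExec::Active );
// ... or yield the core (PASSIVE mode) until a specific state is reached.
Kokkos::Impl::yield_until_equal<int>( m_pool_state , ThreadsExec::ScanCompleted );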

View File

@ -111,7 +111,9 @@ struct ViewCtorProp< void , CommonViewAllocProp<Specialize,T> >
using type = CommonViewAllocProp<Specialize,T> ;
KOKKOS_INLINE_FUNCTION
ViewCtorProp( const type & arg ) : value( arg ) {}
KOKKOS_INLINE_FUNCTION
ViewCtorProp( type && arg ) : value( arg ) {}
type value ;
@ -128,6 +130,7 @@ struct ViewCtorProp< void , std::integral_constant<unsigned,I> >
ViewCtorProp & operator = ( const ViewCtorProp & ) = default ;
template< typename P >
KOKKOS_INLINE_FUNCTION
ViewCtorProp( const P & ) {}
};

View File

@ -369,9 +369,9 @@ private:
template< size_t ... DimArgs >
KOKKOS_FORCEINLINE_FUNCTION
bool set( unsigned domain_rank
, unsigned range_rank
, const ViewDimension< DimArgs ... > & dim )
bool set( unsigned
, unsigned
, const ViewDimension< DimArgs ... > & )
{ return true ; }
template< class T , size_t ... DimArgs , class ... Args >
@ -1047,7 +1047,7 @@ struct ViewOffset< Dimension , Kokkos::LayoutLeft
template< class DimRHS >
KOKKOS_INLINE_FUNCTION
constexpr ViewOffset(
const ViewOffset< DimRHS , Kokkos::LayoutLeft , void > & rhs ,
const ViewOffset< DimRHS , Kokkos::LayoutLeft , void > & ,
const SubviewExtents< DimRHS::rank , dimension_type::rank > & sub )
: m_dim( sub.range_extent(0), 0, 0, 0, 0, 0, 0, 0 )
{
@ -1252,7 +1252,7 @@ public:
template< unsigned TrivialScalarSize >
KOKKOS_INLINE_FUNCTION
constexpr ViewOffset
( std::integral_constant<unsigned,TrivialScalarSize> const & padding_type_size
( std::integral_constant<unsigned,TrivialScalarSize> const &
, Kokkos::LayoutLeft const & arg_layout
)
: m_dim( arg_layout.dimension[0] , arg_layout.dimension[1]
@ -1741,7 +1741,7 @@ public:
template< unsigned TrivialScalarSize >
KOKKOS_INLINE_FUNCTION
constexpr ViewOffset
( std::integral_constant<unsigned,TrivialScalarSize> const & padding_type_size
( std::integral_constant<unsigned,TrivialScalarSize> const &
, Kokkos::LayoutRight const & arg_layout
)
: m_dim( arg_layout.dimension[0] , arg_layout.dimension[1]
@ -2368,7 +2368,7 @@ struct ViewDataHandle< Traits ,
)>::type >
{
typedef typename Traits::value_type value_type ;
typedef typename Traits::value_type * KOKKOS_ALIGN_PTR(KOKKOS_ALIGN_SIZE) handle_type ;
typedef typename Traits::value_type * KOKKOS_IMPL_ALIGN_PTR(KOKKOS_MEMORY_ALIGNMENT) handle_type ;
typedef typename Traits::value_type & return_type ;
typedef Kokkos::Impl::SharedAllocationTracker track_type ;
@ -2376,7 +2376,7 @@ struct ViewDataHandle< Traits ,
static handle_type assign( value_type * arg_data_ptr
, track_type const & /*arg_tracker*/ )
{
if ( reinterpret_cast<uintptr_t>(arg_data_ptr) % KOKKOS_ALIGN_SIZE ) {
if ( reinterpret_cast<uintptr_t>(arg_data_ptr) % Impl::MEMORY_ALIGNMENT ) {
Kokkos::abort("Assigning NonAligned View or Pointer to Kokkos::View with Aligned attribute");
}
return handle_type( arg_data_ptr );
@ -2386,7 +2386,7 @@ struct ViewDataHandle< Traits ,
static handle_type assign( handle_type const arg_data_ptr
, size_t offset )
{
if ( reinterpret_cast<uintptr_t>(arg_data_ptr+offset) % KOKKOS_ALIGN_SIZE ) {
if ( reinterpret_cast<uintptr_t>(arg_data_ptr+offset) % Impl::MEMORY_ALIGNMENT ) {
Kokkos::abort("Assigning NonAligned View or Pointer to Kokkos::View with Aligned attribute");
}
return handle_type( arg_data_ptr + offset );
@ -2411,7 +2411,7 @@ struct ViewDataHandle< Traits ,
)>::type >
{
typedef typename Traits::value_type value_type ;
typedef typename Traits::value_type * KOKKOS_RESTRICT KOKKOS_ALIGN_PTR(KOKKOS_ALIGN_SIZE) handle_type ;
typedef typename Traits::value_type * KOKKOS_RESTRICT KOKKOS_IMPL_ALIGN_PTR(KOKKOS_MEMORY_ALIGNMENT) handle_type ;
typedef typename Traits::value_type & return_type ;
typedef Kokkos::Impl::SharedAllocationTracker track_type ;
@ -2419,7 +2419,7 @@ struct ViewDataHandle< Traits ,
static handle_type assign( value_type * arg_data_ptr
, track_type const & /*arg_tracker*/ )
{
if ( reinterpret_cast<uintptr_t>(arg_data_ptr) % KOKKOS_ALIGN_SIZE ) {
if ( reinterpret_cast<uintptr_t>(arg_data_ptr) % Impl::MEMORY_ALIGNMENT ) {
Kokkos::abort("Assigning NonAligned View or Pointer to Kokkos::View with Aligned attribute");
}
return handle_type( arg_data_ptr );
@ -2429,7 +2429,7 @@ struct ViewDataHandle< Traits ,
static handle_type assign( handle_type const arg_data_ptr
, size_t offset )
{
if ( reinterpret_cast<uintptr_t>(arg_data_ptr+offset) % KOKKOS_ALIGN_SIZE ) {
if ( reinterpret_cast<uintptr_t>(arg_data_ptr+offset) % Impl::MEMORY_ALIGNMENT ) {
Kokkos::abort("Assigning NonAligned View or Pointer to Kokkos::View with Aligned attribute");
}
return handle_type( arg_data_ptr + offset );
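These handles back views declared with the Aligned memory trait; the runtime check aborts when a pointer that is not aligned to KOKKOS_MEMORY_ALIGNMENT is assigned. Roughly (a sketch; n is an assumed extent and the assignment path is the one named in the abort message):
typedef Kokkos::View< double* , Kokkos::MemoryTraits< Kokkos::Aligned > > aligned_view ;
Kokkos::View< double* > plain( "plain" , n );
aligned_view a = plain ;   // aborts at runtime if plain.data() is not suitably aligned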
@ -2783,6 +2783,11 @@ public:
, m_offset( std::integral_constant< unsigned , 0 >() , arg_layout )
{}
/**\brief Assign data */
KOKKOS_INLINE_FUNCTION
void assign_data( pointer_type arg_ptr )
{ m_handle = handle_type( arg_ptr ); }
//----------------------------------------
/* Allocate and construct mapped array.
* Allocate via shared allocation record and

View File

@ -48,6 +48,7 @@
#include <algorithm>
#include <Kokkos_Macros.hpp>
#include <Kokkos_Core.hpp>
#include <Kokkos_hwloc.hpp>
#include <impl/Kokkos_Error.hpp>
@ -312,14 +313,18 @@ Sentinel::Sentinel()
hwloc_get_cpubind( s_hwloc_topology , s_process_binding , HWLOC_CPUBIND_PROCESS );
if ( hwloc_bitmap_iszero( s_process_binding ) ) {
std::cerr << "WARNING: Cannot detect process binding -- ASSUMING ALL processing units" << std::endl;
if (Kokkos::show_warnings() ) {
std::cerr << "WARNING: Cannot detect process binding -- ASSUMING ALL processing units" << std::endl;
}
const int pu_depth = hwloc_get_type_depth( s_hwloc_topology, HWLOC_OBJ_PU );
int num_pu = 1;
if ( pu_depth != HWLOC_TYPE_DEPTH_UNKNOWN ) {
num_pu = hwloc_get_nbobjs_by_depth( s_hwloc_topology, pu_depth );
}
else {
std::cerr << "WARNING: Cannot detect number of processing units -- ASSUMING 1 (serial)." << std::endl;
if (Kokkos::show_warnings() ) {
std::cerr << "WARNING: Cannot detect number of processing units -- ASSUMING 1 (serial)." << std::endl;
}
num_pu = 1;
}
hwloc_bitmap_set_range( s_process_binding, 0, num_pu-1);
@ -349,7 +354,7 @@ Sentinel::Sentinel()
hwloc_bitmap_free( s_process_no_core_zero );
if ( ! ok ) {
if ( Kokkos::show_warnings() && ! ok ) {
std::cerr << "WARNING: Kokkos::hwloc attempted and failed to move process off of core #0" << std::endl ;
}
}
@ -503,8 +508,8 @@ Sentinel::Sentinel()
hwloc_bitmap_free( proc_cpuset_location );
if ( ! symmetric ) {
std::cout << "Kokkos::hwloc WARNING: Using a symmetric subset of a non-symmetric core topology."
if ( Kokkos::show_warnings() && ! symmetric ) {
std::cerr << "Kokkos::hwloc WARNING: Using a symmetric subset of a non-symmetric core topology."
<< std::endl ;
}
}