Merge pull request #12 from weinbe2/neigh_transpose
Addition of a "smart" transpose routine with scratch memory staging
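The change replaces a generic `Kokkos::deep_copy` between layout-mismatched neighbor-list views with a transpose staged through scratch memory: each team loads a 32x32 tile so that global reads are contiguous in the source layout, then writes the tile back so that global stores are contiguous in the destination layout. Below is a minimal, self-contained sketch of that staging idea, not code from this PR; the names `layout_remap`, `TILE`, and `PAD` are illustrative, and it assumes a Kokkos build with a default device execution space.

#include <Kokkos_Core.hpp>

using Device = Kokkos::DefaultExecutionSpace;
using team_t = Kokkos::TeamPolicy<Device>::member_type;

constexpr int TILE = 32;  // tile edge, matching the helper's vector_length
constexpr int PAD  = 1;   // row padding to avoid shared-memory bank conflicts

// Copy a LayoutRight view into a LayoutLeft view with the same extents,
// staging TILE x TILE tiles through team scratch so that both the global
// read and the global write are contiguous. One team handles one tile.
void layout_remap(Kokkos::View<int**, Kokkos::LayoutLeft,  Device> dst,
                  Kokkos::View<int**, Kokkos::LayoutRight, Device> src) {
  const int tiles0 = (src.extent_int(0) + TILE - 1) / TILE;
  const int tiles1 = (src.extent_int(1) + TILE - 1) / TILE;
  const size_t scratch_bytes = TILE * (TILE + PAD) * sizeof(int);

  Kokkos::TeamPolicy<Device> policy(tiles0 * tiles1, 1, TILE);
  policy = policy.set_scratch_size(0, Kokkos::PerTeam(scratch_bytes));

  Kokkos::parallel_for("layout_remap", policy, KOKKOS_LAMBDA(const team_t& t) {
    int* buf = (int*) t.team_shmem().get_shmem(scratch_bytes);
    const int t0 = t.league_rank() / tiles1;   // tile index along extent 0
    const int t1 = t.league_rank() % tiles1;   // tile index along extent 1
    const int i0 = t0 * TILE, j0 = t1 * TILE;

    // Load phase: vector lanes stride along extent 1, which is the
    // contiguous direction of the LayoutRight source.
    Kokkos::parallel_for(Kokkos::ThreadVectorRange(t, TILE), [&](const int j) {
      for (int i = 0; i < TILE; i++)
        if (i0 + i < src.extent_int(0) && j0 + j < src.extent_int(1))
          buf[i * (TILE + PAD) + j] = src(i0 + i, j0 + j);
    });
    t.team_barrier();  // make the whole tile visible before the save phase

    // Save phase: vector lanes stride along extent 0, which is the
    // contiguous direction of the LayoutLeft destination.
    Kokkos::parallel_for(Kokkos::ThreadVectorRange(t, TILE), [&](const int i) {
      for (int j = 0; j < TILE; j++)
        if (i0 + i < dst.extent_int(0) && j0 + j < dst.extent_int(1))
          dst(i0 + i, j0 + j) = buf[i * (TILE + PAD) + j];
    });
  });
}

int main(int argc, char** argv) {
  Kokkos::ScopeGuard guard(argc, argv);
  Kokkos::View<int**, Kokkos::LayoutRight, Device> a("a", 100, 70);
  Kokkos::View<int**, Kokkos::LayoutLeft,  Device> b("b", 100, 70);
  layout_remap(b, a);
  Kokkos::fence();
}

Padding each scratch row by one element (`PAD`) keeps the save phase's strided accesses from piling into a single shared-memory bank on GPUs; the helper's `bank_pad` constant in the diff below serves the same purpose.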
@@ -338,7 +338,11 @@ void NPairKokkos<DeviceType,HALF_NEIGH,GHOST,TRI,SIZE>::build(NeighList *list_)
   list->k_ilist.template modify<DeviceType>();
 
-  Kokkos::deep_copy(list->d_neighbors,list->d_neighbors_build);
+  //Kokkos::deep_copy(list->d_neighbors,list->d_neighbors_build);
+  {
+    // Perform an optimized transpose
+    NPairKokkosTransposeHelper<DeviceType, typename AT::t_neighbors_2d, typename AT::t_neighbors_2d_lr>(list->d_neighbors, list->d_neighbors_build);
+  }
 }
 
 /* ---------------------------------------------------------------------- */
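The replaced `deep_copy` is kept as a comment for reference, and the helper is invoked as a block-scoped temporary: all of the work, either a plain `deep_copy` or the staged-transpose kernel launch, happens in its constructor. A minimal sketch of that construct-to-execute pattern, using a hypothetical `FillFunctor` not taken from this PR:

#include <Kokkos_Core.hpp>

// A functor whose constructor dispatches a parallel_for on a copy of the
// functor itself, so building a temporary performs the work.
struct FillFunctor {
  Kokkos::View<double*> v;

  explicit FillFunctor(Kokkos::View<double*> v_) : v(v_) {
    // The member view is initialized above, so copying *this is safe here.
    Kokkos::parallel_for(v.extent(0), *this);
  }

  KOKKOS_INLINE_FUNCTION
  void operator()(const int i) const { v(i) = i; }
};

int main(int argc, char** argv) {
  Kokkos::ScopeGuard guard(argc, argv);
  Kokkos::View<double*> v("v", 128);
  FillFunctor{v};   // block-scoped temporary, mirroring the call site above
  Kokkos::fence();
}

The braced form `FillFunctor{v};` keeps the statement from parsing as a variable declaration; the templated call in the diff above is unambiguous as written.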
@@ -473,6 +473,186 @@ struct NPairKokkosBuildFunctorSize<LMPHostType,HALF_NEIGH,GHOST_NEWTON,TRI> {
   void operator() (typename Kokkos::TeamPolicy<LMPHostType>::member_type /*dev*/) const {} // Should error out
 };
 
+// This helper class implements optimized out-of-place transposes for Rank-2 views.
+// In the case where both views have the same layout, it uses Kokkos' default `deep_copy`.
+// In the case where the views have different layouts (LayoutLeft/LayoutRight), it implements
+// a transpose through scratch memory staging.
+template <class DeviceType_, class t_view_dst_, class t_view_src_>
+struct NPairKokkosTransposeHelper {
+
+  using DeviceType = DeviceType_;
+
+  using t_view_dst = t_view_dst_;
+  using t_view_src = t_view_src_;
+
+  static_assert(std::is_same<typename t_view_dst::value_type, typename t_view_src::value_type>::value, "Value types do not match");
+  static_assert(t_view_dst::Rank == 2, "Destination view rank != 2");
+  static_assert(t_view_src::Rank == 2, "Source view rank != 2");
+
+  using dst_layout = typename t_view_dst::traits::array_layout;
+  using src_layout = typename t_view_src::traits::array_layout;
+
+  typedef ArrayTypes<DeviceType> AT;
+
+  using t_view_value = typename t_view_dst::value_type;
+
+  // 32x32 tiles, will update so each thread does multiple loads
+  static constexpr int vector_length = 32;
+  static constexpr int bank_pad = 1;
+  static constexpr int elem_size = sizeof(t_view_value);
+
+  static constexpr int threads_per_team = 4;
+
+  t_view_dst d_dst;
+  t_view_src d_src;
+
+  bool src_is_layout_right;
+
+  // extents divided by vector length, rounded up
+  int extent_tiles[2];
+
+  // 1 if extent is divisible by vector length, 0 otherwise
+  int extent_is_multiple_vector_length[2];
+
+  // number of teams
+  int n_teams;
+
+  // amount of shared memory per thread
+  int shared_mem_per_thread;
+
+  NPairKokkosTransposeHelper(t_view_dst d_dst_, t_view_src d_src_)
+    : d_dst(d_dst_), d_src(d_src_) {
+
+    assert(d_dst.extent(0) == d_src.extent(0) && d_dst.extent(1) == d_src.extent(1));
+
+    if (std::is_same<dst_layout, src_layout>::value) {
+      Kokkos::deep_copy(d_dst, d_src);
+    } else {
+
+      src_is_layout_right = std::is_same<src_layout, Kokkos::LayoutRight>::value;
+
+      extent_tiles[0] = (d_dst.extent(0) + vector_length - 1) / vector_length;
+      extent_tiles[1] = (d_dst.extent(1) + vector_length - 1) / vector_length;
+
+      extent_is_multiple_vector_length[0] = (extent_tiles[0] * vector_length == d_dst.extent(0)) ? 1 : 0;
+      extent_is_multiple_vector_length[1] = (extent_tiles[1] * vector_length == d_dst.extent(1)) ? 1 : 0;
+
+      n_teams = (extent_tiles[0] * extent_tiles[1] + threads_per_team - 1) / threads_per_team;
+
+      shared_mem_per_thread = vector_length * (vector_length + bank_pad) * elem_size;
+
+      Kokkos::TeamPolicy<DeviceType> transpose_policy(n_teams, threads_per_team, vector_length);
+      transpose_policy = transpose_policy.set_scratch_size(0, Kokkos::PerThread(shared_mem_per_thread));
+
+      Kokkos::parallel_for(transpose_policy, *this);
+    }
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team_member) const {
+
+    t_view_value* buffer = (t_view_value*)(team_member.team_shmem().get_shmem(shared_mem_per_thread * threads_per_team, 0)) + (shared_mem_per_thread / elem_size) * team_member.team_rank();
+
+    // extract flattened tile
+    const int flattened_idx = team_member.team_rank() + team_member.league_rank() * threads_per_team;
+
+    // get range x, range y tile
+    int extent_tile_id[2];
+    if (src_is_layout_right) {
+      // keep extent 1 tiles close together b/c loading from layout right
+      extent_tile_id[0] = flattened_idx / extent_tiles[1];
+      extent_tile_id[1] = flattened_idx - extent_tile_id[0] * extent_tiles[1];
+    } else {
+      // keep extent 0 tiles close together b/c loading from layout left
+      extent_tile_id[1] = flattened_idx / extent_tiles[0];
+      extent_tile_id[0] = flattened_idx - extent_tile_id[1] * extent_tiles[0];
+    }
+
+    int elem[2];
+    elem[0] = extent_tile_id[0] * vector_length;
+    elem[1] = extent_tile_id[1] * vector_length;
+
+    if (elem[0] >= d_dst.extent(0) ||
+        elem[1] >= d_dst.extent(1)) return;
+
+    // determine if a row/column is a full `vector_length` in size or not
+    bool perfect_pad[2];
+    perfect_pad[0] = (extent_is_multiple_vector_length[0] == 1 || extent_tile_id[0] + 1 < extent_tiles[0]);
+    perfect_pad[1] = (extent_is_multiple_vector_length[1] == 1 || extent_tile_id[1] + 1 < extent_tiles[1]);
+
+    // load phase
+    if (src_is_layout_right) {
+      Kokkos::parallel_for(Kokkos::ThreadVectorRange(team_member, vector_length),
+        [&] (const int j) {
+
+        if (elem[1] + j < d_src.extent(1)) {
+          if (perfect_pad[0]) {
+            for (int i = 0; i < vector_length; i++)
+              buffer[i * (vector_length + bank_pad) + j] = d_src(elem[0] + i, elem[1] + j);
+          } else {
+            for (int i = 0; i < (d_src.extent(0) - elem[0]); i++)
+              buffer[i * (vector_length + bank_pad) + j] = d_src(elem[0] + i, elem[1] + j);
+          }
+        }
+      });
+
+    } else {
+      // src is layout left; the inner loop runs over extent 1, so the
+      // full-tile check must use perfect_pad[1]
+      Kokkos::parallel_for(Kokkos::ThreadVectorRange(team_member, vector_length),
+        [&] (const int i) {
+
+        if (elem[0] + i < d_src.extent(0)) {
+          if (perfect_pad[1]) {
+            for (int j = 0; j < vector_length; j++)
+              buffer[i * (vector_length + bank_pad) + j] = d_src(elem[0] + i, elem[1] + j);
+          } else {
+            for (int j = 0; j < (d_src.extent(1) - elem[1]); j++)
+              buffer[i * (vector_length + bank_pad) + j] = d_src(elem[0] + i, elem[1] + j);
+          }
+        }
+      });
+    }
+
+    // No need for an extra sync b/c, as confirmed by asking on the Kokkos Slack, there
+    // is an implicit sync at the end of a ThreadVectorRange as per the Kokkos
+    // programming model.
+
+    // save phase
+    if (src_is_layout_right) {
+      Kokkos::parallel_for(Kokkos::ThreadVectorRange(team_member, vector_length),
+        [&] (const int i) {
+
+        if (elem[0] + i < d_dst.extent(0)) {
+          if (perfect_pad[1]) {
+            for (int j = 0; j < vector_length; j++)
+              d_dst(elem[0] + i, elem[1] + j) = buffer[i * (vector_length + bank_pad) + j];
+          } else {
+            for (int j = 0; j < (d_dst.extent(1) - elem[1]); j++)
+              d_dst(elem[0] + i, elem[1] + j) = buffer[i * (vector_length + bank_pad) + j];
+          }
+        }
+      });
+    } else {
+
+      // src is layout left; the inner loop runs over extent 0, so the
+      // full-tile check must use perfect_pad[0]
+      Kokkos::parallel_for(Kokkos::ThreadVectorRange(team_member, vector_length),
+        [&] (const int j) {
+
+        if (elem[1] + j < d_dst.extent(1)) {
+          if (perfect_pad[0]) {
+            for (int i = 0; i < vector_length; i++)
+              d_dst(elem[0] + i, elem[1] + j) = buffer[i * (vector_length + bank_pad) + j];
+          } else {
+            for (int i = 0; i < (d_dst.extent(0) - elem[0]); i++)
+              d_dst(elem[0] + i, elem[1] + j) = buffer[i * (vector_length + bank_pad) + j];
+          }
+        }
+      });
+    }
+
+  }
+};
 
 }
 
 #endif
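Since `Kokkos::deep_copy` already performs a (slower) element-wise remap between mismatched layouts, it doubles as a reference implementation for validating the helper. A sketch of such a check, not part of the PR: it assumes the header defining `NPairKokkosTransposeHelper` (and LAMMPS' `ArrayTypes`) is available, that the helper lives in the `LAMMPS_NS` namespace as LAMMPS headers do, and it picks extents that are deliberately not multiples of 32 to exercise the partial-tile paths.

#include <Kokkos_Core.hpp>
#include <cassert>
// #include "npair_kokkos.h"   // assumed location of NPairKokkosTransposeHelper

int main(int argc, char** argv) {
  Kokkos::ScopeGuard guard(argc, argv);
  using Device  = Kokkos::DefaultExecutionSpace;
  using right_t = Kokkos::View<int**, Kokkos::LayoutRight, Device>;
  using left_t  = Kokkos::View<int**, Kokkos::LayoutLeft,  Device>;

  const int n0 = 1000, n1 = 77;   // deliberately not multiples of 32
  right_t src("src", n0, n1);
  left_t  fast("fast", n0, n1), ref("ref", n0, n1);

  // Fill the source with a distinct value per element.
  Kokkos::parallel_for(Kokkos::MDRangePolicy<Kokkos::Rank<2>>({0, 0}, {n0, n1}),
    KOKKOS_LAMBDA(const int i, const int j) { src(i, j) = i * n1 + j; });

  Kokkos::deep_copy(ref, src);    // built-in remap as the reference result
  LAMMPS_NS::NPairKokkosTransposeHelper<Device, left_t, right_t>(fast, src);
  Kokkos::fence();

  auto h_fast = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), fast);
  auto h_ref  = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), ref);
  for (int i = 0; i < n0; i++)
    for (int j = 0; j < n1; j++)
      assert(h_fast(i, j) == h_ref(i, j));
  return 0;
}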