Update Kokkos library in LAMMPS to v3.0
This commit is contained in:
@ -3,8 +3,7 @@ INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR})
|
||||
INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
|
||||
|
||||
# This is a tutorial, not a test, so we don't ask CTest to run it.
|
||||
TRIBITS_ADD_EXECUTABLE(
|
||||
KOKKOS_ADD_EXECUTABLE(
|
||||
tutorial_hierarchicalparallelism_01_thread_teams
|
||||
SOURCES thread_teams.cpp
|
||||
COMM serial mpi
|
||||
)
|
||||
)
|
||||
|
||||
@ -1,13 +1,14 @@
|
||||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
//
|
||||
// Kokkos v. 3.0
|
||||
// Copyright (2020) National Technology & Engineering
|
||||
// Solutions of Sandia, LLC (NTESS).
|
||||
//
|
||||
// Under the terms of Contract DE-NA0003525 with NTESS,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
@ -23,10 +24,10 @@
|
||||
// contributors may be used to endorse or promote products derived from
|
||||
// this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
|
||||
// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
@ -36,7 +37,7 @@
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
|
||||
//
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
@ -51,16 +52,17 @@
|
||||
// to identify a thread uniquely and some team related function calls such as a
|
||||
// barrier (which will be used in a subsequent example).
|
||||
// A ThreadTeam consists of 1 to n threads where the maximum value of n is
|
||||
// determined by the hardware. On a dual socket CPU machine with 8 cores per socket
|
||||
// the maximum size of a team is 8. The number of teams (i.e. the league_size) is
|
||||
// not limited by physical constraints. It is a purely logical number.
|
||||
// determined by the hardware. On a dual socket CPU machine with 8 cores per
|
||||
// socket the maximum size of a team is 8. The number of teams (i.e. the
|
||||
// league_size) is not limited by physical constraints. It is a purely logical
|
||||
// number.
|
||||
|
||||
typedef Kokkos::TeamPolicy<> team_policy ;
|
||||
typedef team_policy::member_type team_member ;
|
||||
typedef Kokkos::TeamPolicy<> team_policy;
|
||||
typedef team_policy::member_type team_member;
|
||||
|
||||
// Define a functor which can be launched using the TeamPolicy
|
||||
struct hello_world {
|
||||
typedef int value_type; //Specify value type for reduction target, sum
|
||||
typedef int value_type; // Specify value type for reduction target, sum
|
||||
|
||||
// This is a reduction operator which now takes as first argument the
|
||||
// TeamPolicy member_type. Every member of the team contributes to the
|
||||
@ -68,36 +70,39 @@ struct hello_world {
|
||||
// It is helpful to think of this operator as a parallel region for a team
|
||||
// (i.e. every team member is active and will execute the code).
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator() ( const team_member & thread, int& sum) const {
|
||||
sum+=1;
|
||||
void operator()(const team_member& thread, int& sum) const {
|
||||
sum += 1;
|
||||
// The TeamPolicy<>::member_type provides functions to query the multi
|
||||
// dimensional index of a thread as well as the number of thread-teams and the size
|
||||
// of each team.
|
||||
printf("Hello World: %i %i // %i %i\n",thread.league_rank(),thread.team_rank(),thread.league_size(),thread.team_size());
|
||||
// dimensional index of a thread as well as the number of thread-teams and
|
||||
// the size of each team.
|
||||
printf("Hello World: %i %i // %i %i\n", thread.league_rank(),
|
||||
thread.team_rank(), thread.league_size(), thread.team_size());
|
||||
}
|
||||
};
|
||||
|
||||
int main(int narg, char* args[]) {
|
||||
Kokkos::initialize(narg,args);
|
||||
Kokkos::initialize(narg, args);
|
||||
|
||||
// Launch 12 teams of the maximum number of threads per team
|
||||
const int team_size_max = team_policy(1,1).team_size_max(hello_world(), Kokkos::ParallelReduceTag());
|
||||
const team_policy policy_a( 12 , team_size_max );
|
||||
const int team_size_max = team_policy(1, 1).team_size_max(
|
||||
hello_world(), Kokkos::ParallelReduceTag());
|
||||
const team_policy policy_a(12, team_size_max);
|
||||
|
||||
int sum = 0;
|
||||
Kokkos::parallel_reduce( policy_a , hello_world() , sum );
|
||||
Kokkos::parallel_reduce(policy_a, hello_world(), sum);
|
||||
|
||||
// The result will be 12*team_size_max
|
||||
printf("Result A: %i == %i\n",sum, team_size_max*12);
|
||||
printf("Result A: %i == %i\n", sum, team_size_max * 12);
|
||||
|
||||
// In practice it is often better to let Kokkos decide on the team_size
|
||||
const team_policy policy_b( 12 , Kokkos::AUTO );
|
||||
const team_policy policy_b(12, Kokkos::AUTO);
|
||||
|
||||
Kokkos::parallel_reduce( policy_b , hello_world() , sum );
|
||||
// The result will be 12*policy_b.team_size_recommended( hello_world(), Kokkos::ParallelReduceTag())
|
||||
const int team_size_recommended = policy_b.team_size_recommended( hello_world(), Kokkos::ParallelReduceTag());
|
||||
printf("Result B: %i %i\n",sum, team_size_recommended*12);
|
||||
Kokkos::parallel_reduce(policy_b, hello_world(), sum);
|
||||
// The result will be 12*policy_b.team_size_recommended( hello_world(),
|
||||
// Kokkos::ParallelReduceTag())
|
||||
const int team_size_recommended = policy_b.team_size_recommended(
|
||||
hello_world(), Kokkos::ParallelReduceTag());
|
||||
printf("Result B: %i %i\n", sum, team_size_recommended * 12);
|
||||
|
||||
Kokkos::finalize();
|
||||
}
|
||||
|
||||
|
||||
@ -2,12 +2,9 @@
|
||||
INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR})
|
||||
INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
|
||||
|
||||
IF (Kokkos_ENABLE_CXX11)
|
||||
# This is a tutorial, not a test, so we don't ask CTest to run it.
|
||||
TRIBITS_ADD_EXECUTABLE(
|
||||
tutorial_hierarchical_01_thread_teams_lambda
|
||||
SOURCES thread_teams_lambda.cpp
|
||||
COMM serial mpi
|
||||
)
|
||||
ENDIF ()
|
||||
# This is a tutorial, not a test, so we don't ask CTest to run it.
|
||||
KOKKOS_ADD_EXECUTABLE(
|
||||
tutorial_hierarchical_01_thread_teams_lambda
|
||||
SOURCES thread_teams_lambda.cpp
|
||||
)
|
||||
|
||||
|
||||
@ -2,10 +2,11 @@
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
// Kokkos v. 3.0
|
||||
// Copyright (2020) National Technology & Engineering
|
||||
// Solutions of Sandia, LLC (NTESS).
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// Under the terms of Contract DE-NA0003525 with NTESS,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
@ -23,10 +24,10 @@
|
||||
// contributors may be used to endorse or promote products derived from
|
||||
// this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
|
||||
// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
@ -53,17 +54,17 @@
|
||||
// some reasonable bound, which eventually depends upon the hardware
|
||||
// and programming model implementation).
|
||||
|
||||
int main (int narg, char* args[]) {
|
||||
int main(int narg, char* args[]) {
|
||||
using Kokkos::parallel_reduce;
|
||||
typedef Kokkos::TeamPolicy<> team_policy;
|
||||
typedef typename team_policy::member_type team_member;
|
||||
typedef Kokkos::TeamPolicy<> team_policy;
|
||||
typedef typename team_policy::member_type team_member;
|
||||
|
||||
Kokkos::initialize (narg, args);
|
||||
Kokkos::initialize(narg, args);
|
||||
|
||||
// Set up a policy that launches 12 teams and lets Kokkos choose the
|
||||
// number of threads per team (Kokkos::AUTO).
|
||||
|
||||
const team_policy policy (12, Kokkos::AUTO);
|
||||
const team_policy policy(12, Kokkos::AUTO);
|
||||
|
||||
// This is a reduction with a team policy. The team policy changes
|
||||
// the first argument of the lambda. Rather than an integer index
|
||||
@ -77,21 +78,23 @@ int main (int narg, char* args[]) {
|
||||
// region." That is, every team member is active and will execute
|
||||
// the body of the lambda.
|
||||
int sum = 0;
|
||||
// We also need to protect the usage of a lambda against compiling
|
||||
// with a backend which doesn't support it (e.g. Cuda 6.5/7.0).
|
||||
#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA)
|
||||
parallel_reduce (policy, KOKKOS_LAMBDA (const team_member& thread, int& lsum) {
|
||||
lsum += 1;
|
||||
// TeamPolicy<>::member_type provides functions to query the
|
||||
// multidimensional index of a thread, as well as the number of
|
||||
// thread teams and the size of each team.
|
||||
printf ("Hello World: %i %i // %i %i\n", thread.league_rank (),
|
||||
thread.team_rank (), thread.league_size (), thread.team_size ());
|
||||
}, sum);
|
||||
#endif
|
||||
// We also need to protect the usage of a lambda against compiling
|
||||
// with a backend which doesn't support it (e.g. Cuda 6.5/7.0).
|
||||
#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA)
|
||||
parallel_reduce(
|
||||
policy,
|
||||
KOKKOS_LAMBDA(const team_member& thread, int& lsum) {
|
||||
lsum += 1;
|
||||
// TeamPolicy<>::member_type provides functions to query the
|
||||
// multidimensional index of a thread, as well as the number of
|
||||
// thread teams and the size of each team.
|
||||
printf("Hello World: %i %i // %i %i\n", thread.league_rank(),
|
||||
thread.team_rank(), thread.league_size(), thread.team_size());
|
||||
},
|
||||
sum);
|
||||
#endif
|
||||
// The result will be 12*team_policy::team_size_max([=]{})
|
||||
printf ("Result %i\n",sum);
|
||||
printf("Result %i\n", sum);
|
||||
|
||||
Kokkos::finalize ();
|
||||
Kokkos::finalize();
|
||||
}
|
||||
|
||||
|
||||
@ -3,8 +3,7 @@ INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR})
|
||||
INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
|
||||
|
||||
# This is a tutorial, not a test, so we don't ask CTest to run it.
|
||||
TRIBITS_ADD_EXECUTABLE(
|
||||
KOKKOS_ADD_EXECUTABLE(
|
||||
tutorial_hierarchicalparallelism_02_nested_parallel_for
|
||||
SOURCES nested_parallel_for.cpp
|
||||
COMM serial mpi
|
||||
)
|
||||
)
|
||||
|
||||
@ -1,13 +1,14 @@
|
||||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
//
|
||||
// Kokkos v. 3.0
|
||||
// Copyright (2020) National Technology & Engineering
|
||||
// Solutions of Sandia, LLC (NTESS).
|
||||
//
|
||||
// Under the terms of Contract DE-NA0003525 with NTESS,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
@ -23,10 +24,10 @@
|
||||
// contributors may be used to endorse or promote products derived from
|
||||
// this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
|
||||
// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
@ -36,7 +37,7 @@
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
|
||||
//
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
@ -45,46 +46,50 @@
|
||||
#include <cstdio>
|
||||
|
||||
// See 01_thread_teams for an explanation of a basic TeamPolicy
|
||||
typedef Kokkos::TeamPolicy<> team_policy ;
|
||||
typedef typename team_policy::member_type team_member ;
|
||||
typedef Kokkos::TeamPolicy<> team_policy;
|
||||
typedef typename team_policy::member_type team_member;
|
||||
|
||||
struct hello_world {
|
||||
typedef int value_type; //Specify value type for reduction target, sum
|
||||
typedef int value_type; // Specify value type for reduction target, sum
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator() ( const team_member & thread, int& sum) const {
|
||||
sum+=1;
|
||||
void operator()(const team_member& thread, int& sum) const {
|
||||
sum += 1;
|
||||
// When using the TeamPolicy Kokkos allows for nested parallel loops.
|
||||
// All three Kokkos parallel patterns are allowed (for, reduce, scan) and they
|
||||
// largely follow the same syntax as on the global level.
|
||||
// The execution policy for the Thread level nesting (the Vector level is in the next
|
||||
// tutorial example) is Kokkos::TeamThreadRange. This means the loop will be executed
|
||||
// by all members of the team and the loop count will be split between threads of the
|
||||
// team. Its arguments are the team_member, and a loop count.
|
||||
// Not every thread will do the same amount of iterations. On a GPU for example with
|
||||
// a team_size() larger than 31 only the first 31 threads would actually do anything.
|
||||
// On a CPU with 8 threads 7 would execute 4 loop iterations, and 1 thread would do
|
||||
// 3. Note also that the mode of splitting the count is architecture dependent similar
|
||||
// to what the RangePolicy on a global level does.
|
||||
// The call itself is not guaranteed to be synchronous. Also keep in mind that the
|
||||
// operator using a team_policy acts like a parallel region for the team. That means
|
||||
// that everything outside of the nested parallel_for is also executed by all threads
|
||||
// of the team.
|
||||
Kokkos::parallel_for(Kokkos::TeamThreadRange(thread,31), [&] (const int& i) {
|
||||
printf("Hello World: (%i , %i) executed loop %i \n",thread.league_rank(),thread.team_rank(),i);
|
||||
});
|
||||
// All three Kokkos parallel patterns are allowed (for, reduce, scan) and
|
||||
// they largely follow the same syntax as on the global level. The execution
|
||||
// policy for the Thread level nesting (the Vector level is in the next
|
||||
// tutorial example) is Kokkos::TeamThreadRange. This means the loop will be
|
||||
// executed by all members of the team and the loop count will be split
|
||||
// between threads of the team. Its arguments are the team_member, and a
|
||||
// loop count. Not every thread will do the same amount of iterations. On a
|
||||
// GPU for example with a team_size() larger than 31 only the first 31
|
||||
// threads would actually do anything. On a CPU with 8 threads 7 would
|
||||
// execute 4 loop iterations, and 1 thread would do
|
||||
// 3. Note also that the mode of splitting the count is architecture
|
||||
// dependent similar to what the RangePolicy on a global level does. The
|
||||
// call itself is not guaranteed to be synchronous. Also keep in mind that
|
||||
// the operator using a team_policy acts like a parallel region for the
|
||||
// team. That means that everything outside of the nested parallel_for is
|
||||
// also executed by all threads of the team.
|
||||
Kokkos::parallel_for(Kokkos::TeamThreadRange(thread, 31),
|
||||
[&](const int& i) {
|
||||
printf("Hello World: (%i , %i) executed loop %i \n",
|
||||
thread.league_rank(), thread.team_rank(), i);
|
||||
});
|
||||
}
|
||||
};
|
||||
|
||||
int main(int narg, char* args[]) {
|
||||
Kokkos::initialize(narg,args);
|
||||
Kokkos::initialize(narg, args);
|
||||
|
||||
// Launch 3 teams of the maximum number of threads per team
|
||||
const int team_size_max = team_policy(3,1).team_size_max( hello_world(), Kokkos::ParallelReduceTag());
|
||||
const team_policy policy( 3 , team_size_max );
|
||||
|
||||
const int team_size_max = team_policy(3, 1).team_size_max(
|
||||
hello_world(), Kokkos::ParallelReduceTag());
|
||||
const team_policy policy(3, team_size_max);
|
||||
|
||||
int sum = 0;
|
||||
Kokkos::parallel_reduce( policy , hello_world() , sum );
|
||||
printf("Result %i\n",sum);
|
||||
Kokkos::parallel_reduce(policy, hello_world(), sum);
|
||||
printf("Result %i\n", sum);
|
||||
|
||||
Kokkos::finalize();
|
||||
}
|
||||
|
||||
@ -3,14 +3,8 @@ INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR})
|
||||
INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
|
||||
|
||||
# This is a tutorial, not a test, so we don't ask CTest to run it.
|
||||
|
||||
IF(Kokkos_ENABLE_CXX11)
|
||||
|
||||
TRIBITS_ADD_EXECUTABLE(
|
||||
KOKKOS_ADD_EXECUTABLE(
|
||||
tutorial_hierarchicalparallelism_03_vectorization
|
||||
SOURCES vectorization.cpp
|
||||
COMM serial mpi
|
||||
)
|
||||
|
||||
ENDIF()
|
||||
)
|
||||
|
||||
|
||||
@ -2,10 +2,11 @@
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
// Kokkos v. 3.0
|
||||
// Copyright (2020) National Technology & Engineering
|
||||
// Solutions of Sandia, LLC (NTESS).
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// Under the terms of Contract DE-NA0003525 with NTESS,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
@ -23,10 +24,10 @@
|
||||
// contributors may be used to endorse or promote products derived from
|
||||
// this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
|
||||
// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
@ -45,118 +46,131 @@
|
||||
#include <Kokkos_Random.hpp>
|
||||
#include <cstdio>
|
||||
|
||||
|
||||
// The TeamPolicy actually supports 3D parallelism: Teams, Threads, Vector
|
||||
// Kokkos::parallel_{for/reduce/scan} calls can be completely free nested.
|
||||
// The execution policies for the nested layers are TeamThreadRange and
|
||||
// ThreadVectorRange.
|
||||
// The only restriction on nesting is that a given level can only be nested in a
|
||||
// higher one. e.g. a ThreadVectorRange can be nested inside a TeamPolicy operator
|
||||
// and inside a TeamThreadRange, but you can not nest a ThreadVectorRange or a
|
||||
// TeamThreadRange inside another ThreadVectorRange.
|
||||
// As with the 2D execution of TeamPolicy the operator has to be considered as
|
||||
// a parallel region even with respect to VectorLanes. That means even outside
|
||||
// a TeamThread or VectorThread loop all threads of a team and all vector lanes
|
||||
// of a thread execute every line of the operator as long as there are no restrictions
|
||||
// on them.
|
||||
// Code lines can be restricted using Kokkos::single to either execute once PerThread
|
||||
// or execute once PerTeam.
|
||||
typedef typename Kokkos::TeamPolicy<>::member_type team_member ;
|
||||
// higher one. e.g. a ThreadVectorRange can be nested inside a TeamPolicy
|
||||
// operator and inside a TeamThreadRange, but you can not nest a
|
||||
// ThreadVectorRange or a TeamThreadRange inside another ThreadVectorRange. As
|
||||
// with the 2D execution of TeamPolicy the operator has to be considered as a
|
||||
// parallel region even with respect to VectorLanes. That means even outside a
|
||||
// TeamThread or VectorThread loop all threads of a team and all vector lanes of
|
||||
// a thread execute every line of the operator as long as there are no
|
||||
// restrictions on them. Code lines can be restricted using Kokkos::single to
|
||||
// either execute once PerThread or execute once PerTeam.
|
||||
typedef typename Kokkos::TeamPolicy<>::member_type team_member;
|
||||
|
||||
struct SomeCorrelation {
|
||||
typedef int value_type; //Specify value type for reduction target, sum
|
||||
typedef int value_type; // Specify value type for reduction target, sum
|
||||
typedef Kokkos::DefaultExecutionSpace::scratch_memory_space shared_space;
|
||||
typedef Kokkos::View<int*,shared_space,Kokkos::MemoryUnmanaged> shared_1d_int;
|
||||
typedef Kokkos::View<int*, shared_space, Kokkos::MemoryUnmanaged>
|
||||
shared_1d_int;
|
||||
|
||||
Kokkos::View<const int***,Kokkos::LayoutRight> data;
|
||||
Kokkos::View<const int***, Kokkos::LayoutRight> data;
|
||||
Kokkos::View<int> gsum;
|
||||
|
||||
SomeCorrelation(Kokkos::View<int***,Kokkos::LayoutRight> data_in,
|
||||
Kokkos::View<int> sum):data(data_in),gsum(sum){}
|
||||
SomeCorrelation(Kokkos::View<int***, Kokkos::LayoutRight> data_in,
|
||||
Kokkos::View<int> sum)
|
||||
: data(data_in), gsum(sum) {}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator() ( const team_member & thread) const {
|
||||
void operator()(const team_member& thread) const {
|
||||
int i = thread.league_rank();
|
||||
|
||||
// Allocate a shared array for the team.
|
||||
shared_1d_int count(thread.team_shmem(),data.extent(1));
|
||||
shared_1d_int count(thread.team_shmem(), data.extent(1));
|
||||
|
||||
// With each team run a parallel_for with its threads
|
||||
Kokkos::parallel_for(Kokkos::TeamThreadRange(thread,data.extent(1)), [=] (const int& j) {
|
||||
int tsum;
|
||||
// Run a vector loop reduction over the inner dimension of data
|
||||
// Count how many values are multiples of 4
|
||||
// Every vector lane gets the same reduction value (tsum) back, it is broadcast to all vector lanes
|
||||
Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(thread,data.extent(2)), [=] (const int& k, int & vsum) {
|
||||
vsum+= (data(i,j,k) % 4 == 0)?1:0;
|
||||
},tsum);
|
||||
Kokkos::parallel_for(
|
||||
Kokkos::TeamThreadRange(thread, data.extent(1)), [=](const int& j) {
|
||||
int tsum;
|
||||
// Run a vector loop reduction over the inner dimension of data
|
||||
// Count how many values are multiples of 4
|
||||
// Every vector lane gets the same reduction value (tsum) back, it is
|
||||
// broadcast to all vector lanes
|
||||
Kokkos::parallel_reduce(
|
||||
Kokkos::ThreadVectorRange(thread, data.extent(2)),
|
||||
[=](const int& k, int& vsum) {
|
||||
vsum += (data(i, j, k) % 4 == 0) ? 1 : 0;
|
||||
},
|
||||
tsum);
|
||||
|
||||
// Make sure only one vector lane adds the reduction value to the shared array, i.e. execute
|
||||
// the next line only once PerThread
|
||||
Kokkos::single(Kokkos::PerThread(thread),[=] () {
|
||||
count(j) = tsum;
|
||||
});
|
||||
});
|
||||
// Make sure only one vector lane adds the reduction value to the
|
||||
// shared array, i.e. execute the next line only once PerThread
|
||||
Kokkos::single(Kokkos::PerThread(thread), [=]() { count(j) = tsum; });
|
||||
});
|
||||
|
||||
// Wait for all threads to finish the parallel_for so that all shared memory writes are done
|
||||
// Wait for all threads to finish the parallel_for so that all shared memory
|
||||
// writes are done
|
||||
thread.team_barrier();
|
||||
|
||||
// Check with one vector lane from each thread how many consecutive
|
||||
// data segments have the same number of values divisible by 4
|
||||
// The team reduction value is again broadcast to every team member (and every vector lane)
|
||||
// The team reduction value is again broadcast to every team member (and
|
||||
// every vector lane)
|
||||
int team_sum = 0;
|
||||
Kokkos::parallel_reduce(Kokkos::TeamThreadRange(thread, data.extent(1)-1), [=] (const int& j, int& thread_sum) {
|
||||
// It is not valid to directly add to thread_sum
|
||||
// Use a single function with broadcast instead
|
||||
// team_sum will be used as input to the operator (i.e. it is used to initialize sum)
|
||||
// the end value of sum will be broadcast to all vector lanes in the thread.
|
||||
Kokkos::single(Kokkos::PerThread(thread),[=] (int& sum) {
|
||||
if(count(j)==count(j+1)) sum++;
|
||||
},thread_sum);
|
||||
},team_sum);
|
||||
Kokkos::parallel_reduce(
|
||||
Kokkos::TeamThreadRange(thread, data.extent(1) - 1),
|
||||
[=](const int& j, int& thread_sum) {
|
||||
// It is not valid to directly add to thread_sum
|
||||
// Use a single function with broadcast instead
|
||||
// team_sum will be used as input to the operator (i.e. it is used to
|
||||
// initialize sum) the end value of sum will be broadcast to all
|
||||
// vector lanes in the thread.
|
||||
Kokkos::single(
|
||||
Kokkos::PerThread(thread),
|
||||
[=](int& sum) {
|
||||
if (count(j) == count(j + 1)) sum++;
|
||||
},
|
||||
thread_sum);
|
||||
},
|
||||
team_sum);
|
||||
|
||||
// Add with one thread and vectorlane of the team the team_sum to the global value
|
||||
Kokkos::single(Kokkos::PerTeam(thread),[=] () {
|
||||
Kokkos::atomic_add(&gsum(),team_sum);
|
||||
});
|
||||
// Add with one thread and vectorlane of the team the team_sum to the global
|
||||
// value
|
||||
Kokkos::single(Kokkos::PerTeam(thread),
|
||||
[=]() { Kokkos::atomic_add(&gsum(), team_sum); });
|
||||
}
|
||||
|
||||
// The functor needs to define how much shared memory it requests given a team_size.
|
||||
size_t team_shmem_size( int team_size ) const {
|
||||
// The functor needs to define how much shared memory it requests given a
|
||||
// team_size.
|
||||
size_t team_shmem_size(int team_size) const {
|
||||
return shared_1d_int::shmem_size(data.extent(1));
|
||||
}
|
||||
};
|
||||
|
||||
int main(int narg, char* args[]) {
|
||||
Kokkos::initialize(narg,args);
|
||||
Kokkos::initialize(narg, args);
|
||||
|
||||
{
|
||||
// Produce some 3D random data (see Algorithms/01_random_numbers for more info)
|
||||
Kokkos::View<int***,Kokkos::LayoutRight> data("Data",512,512,32);
|
||||
// Produce some 3D random data (see Algorithms/01_random_numbers for more
|
||||
// info)
|
||||
Kokkos::View<int***, Kokkos::LayoutRight> data("Data", 512, 512, 32);
|
||||
Kokkos::Random_XorShift64_Pool<> rand_pool64(5374857);
|
||||
Kokkos::fill_random(data,rand_pool64,100);
|
||||
Kokkos::fill_random(data, rand_pool64, 100);
|
||||
|
||||
// A global value to put the result in
|
||||
Kokkos::View<int> gsum("Sum");
|
||||
|
||||
// Each team handles a slice of the data
|
||||
// Set up TeamPolicy with 512 teams with maximum number of threads per team and 16 vector lanes.
|
||||
// Kokkos::AUTO will determine the number of threads
|
||||
// The maximum vector length is hardware dependent but can always be smaller than the hardware allows.
|
||||
// The vector length must be a power of 2.
|
||||
// Set up TeamPolicy with 512 teams with maximum number of threads per team
|
||||
// and 16 vector lanes. Kokkos::AUTO will determine the number of threads
|
||||
// The maximum vector length is hardware dependent but can always be smaller
|
||||
// than the hardware allows. The vector length must be a power of 2.
|
||||
|
||||
const Kokkos::TeamPolicy<> policy( 512 , Kokkos::AUTO , 16);
|
||||
const Kokkos::TeamPolicy<> policy(512, Kokkos::AUTO, 16);
|
||||
|
||||
Kokkos::parallel_for( policy , SomeCorrelation(data,gsum) );
|
||||
Kokkos::parallel_for(policy, SomeCorrelation(data, gsum));
|
||||
|
||||
Kokkos::fence();
|
||||
|
||||
// Copy result value back
|
||||
int sum = 0;
|
||||
Kokkos::deep_copy(sum,gsum);
|
||||
printf("Result %i\n",sum);
|
||||
Kokkos::deep_copy(sum, gsum);
|
||||
printf("Result %i\n", sum);
|
||||
}
|
||||
|
||||
Kokkos::finalize();
|
||||
}
|
||||
|
||||
|
||||
@ -3,8 +3,8 @@ INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR})
|
||||
INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
|
||||
|
||||
# This is a tutorial, not a test, so we don't ask CTest to run it.
|
||||
TRIBITS_ADD_EXECUTABLE(
|
||||
KOKKOS_ADD_EXECUTABLE(
|
||||
tutorial_hierarchicalparallelism_04_team_scan
|
||||
SOURCES team_scan.cpp
|
||||
COMM serial mpi
|
||||
)
|
||||
)
|
||||
|
||||
|
||||
@ -1,13 +1,14 @@
|
||||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
//
|
||||
// Kokkos v. 3.0
|
||||
// Copyright (2020) National Technology & Engineering
|
||||
// Solutions of Sandia, LLC (NTESS).
|
||||
//
|
||||
// Under the terms of Contract DE-NA0003525 with NTESS,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
@ -23,10 +24,10 @@
|
||||
// contributors may be used to endorse or promote products derived from
|
||||
// this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
|
||||
// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
@ -36,7 +37,7 @@
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
|
||||
//
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
@ -47,13 +48,13 @@
|
||||
#include <cstdio>
|
||||
#include <cstdlib>
|
||||
|
||||
typedef Kokkos::DefaultExecutionSpace Device ;
|
||||
typedef Kokkos::HostSpace::execution_space Host ;
|
||||
typedef Kokkos::DefaultExecutionSpace Device;
|
||||
typedef Kokkos::HostSpace::execution_space Host;
|
||||
|
||||
typedef Kokkos::TeamPolicy< Device > team_policy ;
|
||||
typedef team_policy::member_type team_member ;
|
||||
typedef Kokkos::TeamPolicy<Device> team_policy;
|
||||
typedef team_policy::member_type team_member;
|
||||
|
||||
static const int TEAM_SIZE = 16 ;
|
||||
static const int TEAM_SIZE = 16;
|
||||
|
||||
struct find_2_tuples {
|
||||
int chunk_size;
|
||||
@ -61,89 +62,91 @@ struct find_2_tuples {
|
||||
Kokkos::View<int**> histogram;
|
||||
|
||||
find_2_tuples(int chunk_size_, Kokkos::DualView<int*> data_,
|
||||
Kokkos::DualView<int**> histogram_):chunk_size(chunk_size_),
|
||||
data(data_.d_view),histogram(histogram_.d_view) {
|
||||
data_.sync<Device>();
|
||||
histogram_.sync<Device>();
|
||||
histogram_.modify<Device>();
|
||||
Kokkos::DualView<int**> histogram_)
|
||||
: chunk_size(chunk_size_),
|
||||
data(data_.d_view),
|
||||
histogram(histogram_.d_view) {
|
||||
data_.sync<Device>();
|
||||
histogram_.sync<Device>();
|
||||
histogram_.modify<Device>();
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator() ( const team_member & dev) const {
|
||||
Kokkos::View<int**,Kokkos::MemoryUnmanaged> l_histogram(dev.team_shmem(),TEAM_SIZE,TEAM_SIZE);
|
||||
Kokkos::View<int*,Kokkos::MemoryUnmanaged> l_data(dev.team_shmem(),chunk_size+1);
|
||||
void operator()(const team_member& dev) const {
|
||||
Kokkos::View<int**, Kokkos::MemoryUnmanaged> l_histogram(
|
||||
dev.team_shmem(), TEAM_SIZE, TEAM_SIZE);
|
||||
Kokkos::View<int*, Kokkos::MemoryUnmanaged> l_data(dev.team_shmem(),
|
||||
chunk_size + 1);
|
||||
|
||||
const int i = dev.league_rank() * chunk_size;
|
||||
for(int j = dev.team_rank(); j<chunk_size+1; j+=dev.team_size())
|
||||
l_data(j) = data(i+j);
|
||||
for (int j = dev.team_rank(); j < chunk_size + 1; j += dev.team_size())
|
||||
l_data(j) = data(i + j);
|
||||
|
||||
for(int k = dev.team_rank(); k < TEAM_SIZE; k+=dev.team_size())
|
||||
for(int l = 0; l < TEAM_SIZE; l++)
|
||||
l_histogram(k,l) = 0;
|
||||
for (int k = dev.team_rank(); k < TEAM_SIZE; k += dev.team_size())
|
||||
for (int l = 0; l < TEAM_SIZE; l++) l_histogram(k, l) = 0;
|
||||
dev.team_barrier();
|
||||
|
||||
for(int j = 0; j<chunk_size; j++) {
|
||||
for(int k = dev.team_rank(); k < TEAM_SIZE; k+=dev.team_size())
|
||||
for(int l = 0; l < TEAM_SIZE; l++) {
|
||||
if((l_data(j) == k) && (l_data(j+1)==l))
|
||||
l_histogram(k,l)++;
|
||||
for (int j = 0; j < chunk_size; j++) {
|
||||
for (int k = dev.team_rank(); k < TEAM_SIZE; k += dev.team_size())
|
||||
for (int l = 0; l < TEAM_SIZE; l++) {
|
||||
if ((l_data(j) == k) && (l_data(j + 1) == l)) l_histogram(k, l)++;
|
||||
}
|
||||
}
|
||||
|
||||
for(int k = dev.team_rank(); k < TEAM_SIZE; k+=dev.team_size())
|
||||
for(int l = 0; l < TEAM_SIZE; l++) {
|
||||
Kokkos::atomic_fetch_add(&histogram(k,l),l_histogram(k,l));
|
||||
for (int k = dev.team_rank(); k < TEAM_SIZE; k += dev.team_size())
|
||||
for (int l = 0; l < TEAM_SIZE; l++) {
|
||||
Kokkos::atomic_fetch_add(&histogram(k, l), l_histogram(k, l));
|
||||
}
|
||||
dev.team_barrier();
|
||||
}
|
||||
size_t team_shmem_size( int team_size ) const {
|
||||
return Kokkos::View<int**,Kokkos::MemoryUnmanaged>::shmem_size(TEAM_SIZE,TEAM_SIZE) +
|
||||
Kokkos::View<int*,Kokkos::MemoryUnmanaged>::shmem_size(chunk_size+1);
|
||||
size_t team_shmem_size(int team_size) const {
|
||||
return Kokkos::View<int**, Kokkos::MemoryUnmanaged>::shmem_size(TEAM_SIZE,
|
||||
TEAM_SIZE) +
|
||||
Kokkos::View<int*, Kokkos::MemoryUnmanaged>::shmem_size(chunk_size +
|
||||
1);
|
||||
}
|
||||
};
|
||||
|
||||
int main(int narg, char* args[]) {
|
||||
Kokkos::initialize(narg,args);
|
||||
Kokkos::initialize(narg, args);
|
||||
|
||||
{
|
||||
int chunk_size = 1024;
|
||||
int nchunks = 100000; //1024*1024;
|
||||
Kokkos::DualView<int*> data("data",nchunks*chunk_size+1);
|
||||
int chunk_size = 1024;
|
||||
int nchunks = 100000; // 1024*1024;
|
||||
Kokkos::DualView<int*> data("data", nchunks * chunk_size + 1);
|
||||
|
||||
srand(1231093);
|
||||
srand(1231093);
|
||||
|
||||
for(int i = 0; i < (int) data.extent(0); i++) {
|
||||
data.h_view(i) = rand()%TEAM_SIZE;
|
||||
}
|
||||
data.modify<Host>();
|
||||
data.sync<Device>();
|
||||
|
||||
Kokkos::DualView<int**> histogram("histogram",TEAM_SIZE,TEAM_SIZE);
|
||||
|
||||
|
||||
Kokkos::Timer timer;
|
||||
// threads/team is automatically limited to maximum supported by the device.
|
||||
int team_size = TEAM_SIZE;
|
||||
if( team_size > Device::execution_space::concurrency() )
|
||||
team_size = Device::execution_space::concurrency();
|
||||
Kokkos::parallel_for( team_policy( nchunks , team_size )
|
||||
, find_2_tuples(chunk_size,data,histogram) );
|
||||
Kokkos::fence();
|
||||
double time = timer.seconds();
|
||||
|
||||
histogram.sync<Host>();
|
||||
|
||||
printf("Time: %f \n\n",time);
|
||||
int sum = 0;
|
||||
for(int k=0; k<TEAM_SIZE; k++) {
|
||||
for(int l=0; l<TEAM_SIZE; l++) {
|
||||
printf("%i ",histogram.h_view(k,l));
|
||||
sum += histogram.h_view(k,l);
|
||||
for (int i = 0; i < (int)data.extent(0); i++) {
|
||||
data.h_view(i) = rand() % TEAM_SIZE;
|
||||
}
|
||||
printf("\n");
|
||||
}
|
||||
printf("Result: %i %i\n",sum,chunk_size*nchunks);
|
||||
data.modify<Host>();
|
||||
data.sync<Device>();
|
||||
|
||||
Kokkos::DualView<int**> histogram("histogram", TEAM_SIZE, TEAM_SIZE);
|
||||
|
||||
Kokkos::Timer timer;
|
||||
// threads/team is automatically limited to maximum supported by the device.
|
||||
int team_size = TEAM_SIZE;
|
||||
if (team_size > Device::execution_space::concurrency())
|
||||
team_size = Device::execution_space::concurrency();
|
||||
Kokkos::parallel_for(team_policy(nchunks, team_size),
|
||||
find_2_tuples(chunk_size, data, histogram));
|
||||
Kokkos::fence();
|
||||
double time = timer.seconds();
|
||||
|
||||
histogram.sync<Host>();
|
||||
|
||||
printf("Time: %f \n\n", time);
|
||||
int sum = 0;
|
||||
for (int k = 0; k < TEAM_SIZE; k++) {
|
||||
for (int l = 0; l < TEAM_SIZE; l++) {
|
||||
printf("%i ", histogram.h_view(k, l));
|
||||
sum += histogram.h_view(k, l);
|
||||
}
|
||||
printf("\n");
|
||||
}
|
||||
printf("Result: %i %i\n", sum, chunk_size * nchunks);
|
||||
}
|
||||
Kokkos::finalize();
|
||||
}
|
||||
|
||||
|
||||
@ -1,8 +1,6 @@
|
||||
|
||||
TRIBITS_ADD_EXAMPLE_DIRECTORIES(01_thread_teams)
|
||||
KOKKOS_ADD_EXAMPLE_DIRECTORIES(01_thread_teams)
|
||||
KOKKOS_ADD_EXAMPLE_DIRECTORIES(01_thread_teams_lambda)
|
||||
KOKKOS_ADD_EXAMPLE_DIRECTORIES(02_nested_parallel_for)
|
||||
KOKKOS_ADD_EXAMPLE_DIRECTORIES(03_vectorization)
|
||||
|
||||
IF (Kokkos_ENABLE_CXX11)
|
||||
TRIBITS_ADD_EXAMPLE_DIRECTORIES(01_thread_teams_lambda)
|
||||
TRIBITS_ADD_EXAMPLE_DIRECTORIES(02_nested_parallel_for)
|
||||
TRIBITS_ADD_EXAMPLE_DIRECTORIES(03_vectorization)
|
||||
ENDIF ()
|
||||
|
||||
Reference in New Issue
Block a user