Update Kokkos library in LAMMPS to v3.0

This commit is contained in:
Stan Moore
2020-03-25 14:08:39 -06:00
parent 0252d8c210
commit 60864e38d1
2169 changed files with 121406 additions and 126492 deletions

View File

@ -1,11 +1,10 @@
INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR})
INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
KOKKOS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR})
KOKKOS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
# This is a tutorial, not a test, so we don't ask CTest to run it.
TRIBITS_ADD_EXECUTABLE(
KOKKOS_ADD_EXECUTABLE(
tutorial_01_hello_world
SOURCES hello_world.cpp
COMM serial mpi
)
)

View File

@ -1,13 +1,14 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
//
// Kokkos v. 3.0
// Copyright (2020) National Technology & Engineering
// Solutions of Sandia, LLC (NTESS).
//
// Under the terms of Contract DE-NA0003525 with NTESS,
// the U.S. Government retains certain rights in this software.
//
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
@ -23,10 +24,10 @@
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
@ -36,7 +37,7 @@
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
//
//
// ************************************************************************
//@HEADER
*/
@ -84,12 +85,10 @@ struct hello_world {
// (as well as on the host). If not building with CUDA, the macro
// is unnecessary but harmless.
KOKKOS_INLINE_FUNCTION
void operator() (const int i) const {
printf ("Hello from i = %i\n", i);
}
void operator()(const int i) const { printf("Hello from i = %i\n", i); }
};
int main (int argc, char* argv[]) {
int main(int argc, char* argv[]) {
// You must call initialize() before you may call Kokkos.
//
// With no arguments, this initializes the default execution space
@ -97,13 +96,13 @@ int main (int argc, char* argv[]) {
// parameters. You may also pass in argc and argv, analogously to
// MPI_Init(). It reads and removes command-line arguments that
// start with "--kokkos-".
Kokkos::initialize (argc, argv);
Kokkos::initialize(argc, argv);
// Print the name of Kokkos' default execution space. We're using
// typeid here, so the name might get a bit mangled by the compiler,
// but you should still be able to figure out what it is.
printf ("Hello World on Kokkos execution space %s\n",
typeid (Kokkos::DefaultExecutionSpace).name ());
printf("Hello World on Kokkos execution space %s\n",
typeid(Kokkos::DefaultExecutionSpace).name());
// Run the above functor on the default Kokkos execution space in
// parallel, with a parallel for loop count of 15.
@ -122,9 +121,8 @@ int main (int argc, char* argv[]) {
//
// You may notice that the printed numbers do not print out in
// order. Parallel for loops may execute in any order.
Kokkos::parallel_for ("HelloWorld",15, hello_world ());
Kokkos::parallel_for("HelloWorld", 15, hello_world());
// You must call finalize() after you are done using Kokkos.
Kokkos::finalize ();
Kokkos::finalize();
}

View File

@ -1,13 +1,10 @@
INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR})
INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
KOKKOS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR})
KOKKOS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
IF (Kokkos_ENABLE_CXX11)
# This is a tutorial, not a test, so we don't ask CTest to run it.
TRIBITS_ADD_EXECUTABLE(
tutorial_01_hello_world_lambda
SOURCES hello_world_lambda.cpp
COMM serial mpi
)
ENDIF ()
# This is a tutorial, not a test, so we don't ask CTest to run it.
KOKKOS_ADD_EXECUTABLE(
tutorial_01_hello_world_lambda
SOURCES hello_world_lambda.cpp
)

View File

@ -2,10 +2,11 @@
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
// Kokkos v. 3.0
// Copyright (2020) National Technology & Engineering
// Solutions of Sandia, LLC (NTESS).
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// Under the terms of Contract DE-NA0003525 with NTESS,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
@ -23,10 +24,10 @@
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
@ -58,7 +59,7 @@
// lambdas have their places.
//
int main (int argc, char* argv[]) {
int main(int argc, char* argv[]) {
// You must call initialize() before you may call Kokkos.
//
// With no arguments, this initializes the default execution space
@ -66,13 +67,13 @@ int main (int argc, char* argv[]) {
// parameters. You may also pass in argc and argv, analogously to
// MPI_Init(). It reads and removes command-line arguments that
// start with "--kokkos-".
Kokkos::initialize (argc, argv);
Kokkos::initialize(argc, argv);
// Print the name of Kokkos' default execution space. We're using
// typeid here, so the name might get a bit mangled by the compiler,
// but you should still be able to figure out what it is.
printf ("Hello World on Kokkos execution space %s\n",
typeid (Kokkos::DefaultExecutionSpace).name ());
printf("Hello World on Kokkos execution space %s\n",
typeid(Kokkos::DefaultExecutionSpace).name());
// Run lambda on the default Kokkos execution space in parallel,
// with a parallel for loop count of 15. The lambda's argument is
@ -101,12 +102,12 @@ int main (int argc, char* argv[]) {
// We also need to protect the usage of a lambda against compiling
// with a backend which doesn't support it (i.e. Cuda 6.5/7.0).
#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA)
Kokkos::parallel_for (15, KOKKOS_LAMBDA (const int i) {
// printf works in a CUDA parallel kernel; std::ostream does not.
printf ("Hello from i = %i\n", i);
});
Kokkos::parallel_for(
15, KOKKOS_LAMBDA(const int i) {
// printf works in a CUDA parallel kernel; std::ostream does not.
printf("Hello from i = %i\n", i);
});
#endif
// You must call finalize() after you are done using Kokkos.
Kokkos::finalize ();
Kokkos::finalize();
}

View File

@ -1,10 +1,9 @@
INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR})
INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
KOKKOS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR})
KOKKOS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
# This is a tutorial, not a test, so we don't ask CTest to run it.
TRIBITS_ADD_EXECUTABLE(
KOKKOS_ADD_EXECUTABLE(
tutorial_02_simple_reduce
SOURCES simple_reduce.cpp
COMM serial mpi
)
)

View File

@ -1,13 +1,14 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
//
// Kokkos v. 3.0
// Copyright (2020) National Technology & Engineering
// Solutions of Sandia, LLC (NTESS).
//
// Under the terms of Contract DE-NA0003525 with NTESS,
// the U.S. Government retains certain rights in this software.
//
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
@ -23,10 +24,10 @@
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
@ -36,7 +37,7 @@
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
//
//
// ************************************************************************
//@HEADER
*/
@ -72,30 +73,33 @@ struct squaresum {
// (If the reduction type is an array like int[], indicating an
// array reduction result, then the second argument is just int[].)
KOKKOS_INLINE_FUNCTION
void operator () (const int i, int& lsum) const {
lsum += i*i; // compute the sum of squares
void operator()(const int i, int& lsum) const {
lsum += i * i; // compute the sum of squares
}
};
int main (int argc, char* argv[]) {
Kokkos::initialize (argc, argv);
int main(int argc, char* argv[]) {
Kokkos::initialize(argc, argv);
const int n = 10;
// Compute the sum of squares of integers from 0 to n-1, in
// parallel, using Kokkos.
int sum = 0;
Kokkos::parallel_reduce (n, squaresum (), sum);
printf ("Sum of squares of integers from 0 to %i, "
"computed in parallel, is %i\n", n - 1, sum);
Kokkos::parallel_reduce(n, squaresum(), sum);
printf(
"Sum of squares of integers from 0 to %i, "
"computed in parallel, is %i\n",
n - 1, sum);
// Compare to a sequential loop.
int seqSum = 0;
for (int i = 0; i < n; ++i) {
seqSum += i*i;
seqSum += i * i;
}
printf ("Sum of squares of integers from 0 to %i, "
"computed sequentially, is %i\n", n - 1, seqSum);
Kokkos::finalize ();
printf(
"Sum of squares of integers from 0 to %i, "
"computed sequentially, is %i\n",
n - 1, seqSum);
Kokkos::finalize();
return (sum == seqSum) ? 0 : -1;
}

View File

@ -1,12 +1,9 @@
INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR})
INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
KOKKOS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR})
KOKKOS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
KOKKOS_ADD_EXECUTABLE(
tutorial_02_simple_reduce_lambda
SOURCES simple_reduce_lambda.cpp
)
IF (Kokkos_ENABLE_CXX11)
# This is a tutorial, not a test, so we don't ask CTest to run it.
TRIBITS_ADD_EXECUTABLE(
tutorial_02_simple_reduce_lambda
SOURCES simple_reduce_lambda.cpp
COMM serial mpi
)
ENDIF ()

View File

@ -2,10 +2,11 @@
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
// Kokkos v. 3.0
// Copyright (2020) National Technology & Engineering
// Solutions of Sandia, LLC (NTESS).
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// Under the terms of Contract DE-NA0003525 with NTESS,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
@ -23,10 +24,10 @@
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
@ -56,8 +57,8 @@
// of the parallel_reduce.
//
int main (int argc, char* argv[]) {
Kokkos::initialize (argc, argv);
int main(int argc, char* argv[]) {
Kokkos::initialize(argc, argv);
const int n = 10;
// Compute the sum of squares of integers from 0 to n-1, in
@ -65,30 +66,32 @@ int main (int argc, char* argv[]) {
// functor. The lambda takes the same arguments as the functor's
// operator().
int sum = 0;
// The KOKKOS_LAMBDA macro replaces the capture-by-value clause [=].
// It also handles any other syntax needed for CUDA.
// We also need to protect the usage of a lambda against compiling
// with a backend which doesn't support it (i.e. Cuda 6.5/7.0).
#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA)
Kokkos::parallel_reduce (n, KOKKOS_LAMBDA (const int i, int& lsum) {
lsum += i*i;
}, sum);
#endif
printf ("Sum of squares of integers from 0 to %i, "
"computed in parallel, is %i\n", n - 1, sum);
// The KOKKOS_LAMBDA macro replaces the capture-by-value clause [=].
// It also handles any other syntax needed for CUDA.
// We also need to protect the usage of a lambda against compiling
// with a backend which doesn't support it (i.e. Cuda 6.5/7.0).
#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA)
Kokkos::parallel_reduce(
n, KOKKOS_LAMBDA(const int i, int& lsum) { lsum += i * i; }, sum);
#endif
printf(
"Sum of squares of integers from 0 to %i, "
"computed in parallel, is %i\n",
n - 1, sum);
// Compare to a sequential loop.
int seqSum = 0;
for (int i = 0; i < n; ++i) {
seqSum += i*i;
seqSum += i * i;
}
printf ("Sum of squares of integers from 0 to %i, "
"computed sequentially, is %i\n", n - 1, seqSum);
Kokkos::finalize ();
printf(
"Sum of squares of integers from 0 to %i, "
"computed sequentially, is %i\n",
n - 1, seqSum);
Kokkos::finalize();
#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA)
return (sum == seqSum) ? 0 : -1;
#else
return 0;
#endif
}

View File

@ -1,10 +1,9 @@
INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR})
INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
KOKKOS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR})
KOKKOS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
# This is a tutorial, not a test, so we don't ask CTest to run it.
TRIBITS_ADD_EXECUTABLE(
KOKKOS_ADD_EXECUTABLE(
tutorial_03_simple_view
SOURCES simple_view.cpp
COMM serial mpi
)
)

View File

@ -43,7 +43,7 @@ include $(KOKKOS_PATH)/Makefile.kokkos
build: $(EXE)
#for unit testing only, for best performance with OpenMP 4.0 or better
#for unit testing only, for best performance with OpenMP 4.0 or better
test: $(EXE)
./$(EXE)

View File

@ -1,13 +1,14 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
//
// Kokkos v. 3.0
// Copyright (2020) National Technology & Engineering
// Solutions of Sandia, LLC (NTESS).
//
// Under the terms of Contract DE-NA0003525 with NTESS,
// the U.S. Government retains certain rights in this software.
//
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
@ -23,10 +24,10 @@
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
@ -36,7 +37,7 @@
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
//
//
// ************************************************************************
//@HEADER
*/
@ -66,7 +67,7 @@
//
// The first dimension of the View is the dimension over which it is
// efficient for Kokkos to parallelize.
typedef Kokkos::View<double*[3]> view_type;
typedef Kokkos::View<double * [3]> view_type;
// parallel_for functor that fills the View given to its constructor.
// The View must already have been allocated.
@ -78,20 +79,18 @@ struct InitView {
// operator= only do shallow copies. Thus, you can pass View
// objects around by "value"; they won't do a deep copy unless you
// explicitly ask for a deep copy.
InitView (view_type a_) :
a (a_)
{}
InitView(view_type a_) : a(a_) {}
// Fill the View with some data. The parallel_for loop will iterate
// over the View's first dimension N.
KOKKOS_INLINE_FUNCTION
void operator () (const int i) const {
void operator()(const int i) const {
// Access the View just like a Fortran array. The layout depends
// on the View's memory space, so don't rely on the View's
// physical memory layout unless you know what you're doing.
a(i,0) = 1.0*i;
a(i,1) = 1.0*i*i;
a(i,2) = 1.0*i*i*i;
a(i, 0) = 1.0 * i;
a(i, 1) = 1.0 * i * i;
a(i, 2) = 1.0 * i * i * i;
}
};
@ -100,20 +99,20 @@ struct ReduceFunctor {
view_type a;
// Constructor takes View by "value"; this does a shallow copy.
ReduceFunctor (view_type a_) : a (a_) {}
ReduceFunctor(view_type a_) : a(a_) {}
// If you write a functor to do a reduction, you must specify the
// type of the reduction result via a public 'value_type' typedef.
typedef double value_type;
KOKKOS_INLINE_FUNCTION
void operator() (int i, double &lsum) const {
lsum += a(i,0)*a(i,1)/(a(i,2)+0.1);
void operator()(int i, double& lsum) const {
lsum += a(i, 0) * a(i, 1) / (a(i, 2) + 0.1);
}
};
int main (int argc, char* argv[]) {
Kokkos::initialize (argc, argv);
int main(int argc, char* argv[]) {
Kokkos::initialize(argc, argv);
{
const int N = 10;
@ -132,13 +131,12 @@ int main (int argc, char* argv[]) {
//
// The string "A" is just the label; it only matters for debugging.
// Different Views may have the same label.
view_type a ("A", N);
view_type a("A", N);
Kokkos::parallel_for (N, InitView (a));
Kokkos::parallel_for(N, InitView(a));
double sum = 0;
Kokkos::parallel_reduce (N, ReduceFunctor (a), sum);
printf ("Result: %f\n", sum);
} // use this scope to ensure the lifetime of "A" ends before finalize
Kokkos::finalize ();
Kokkos::parallel_reduce(N, ReduceFunctor(a), sum);
printf("Result: %f\n", sum);
} // use this scope to ensure the lifetime of "A" ends before finalize
Kokkos::finalize();
}

View File

@ -1,10 +1,10 @@
INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR})
INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
KOKKOS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR})
KOKKOS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
IF (Kokkos_ENABLE_CXX11)
# This is a tutorial, not a test, so we don't ask CTest to run it.
TRIBITS_ADD_EXECUTABLE(
KOKKOS_ADD_EXECUTABLE(
tutorial_03_simple_view_lambda
SOURCES simple_view_lambda.cpp
COMM serial mpi

View File

@ -2,10 +2,11 @@
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
// Kokkos v. 3.0
// Copyright (2020) National Technology & Engineering
// Solutions of Sandia, LLC (NTESS).
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// Under the terms of Contract DE-NA0003525 with NTESS,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
@ -23,10 +24,10 @@
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
@ -65,10 +66,10 @@
//
// The first dimension of the View is the dimension over which it is
// efficient for Kokkos to parallelize.
typedef Kokkos::View<double*[3]> view_type;
typedef Kokkos::View<double * [3]> view_type;
int main (int argc, char* argv[]) {
Kokkos::initialize (argc, argv);
int main(int argc, char* argv[]) {
Kokkos::initialize(argc, argv);
{
// Allocate the View. The first dimension is a run-time parameter
@ -86,37 +87,40 @@ int main (int argc, char* argv[]) {
//
// The string "A" is just the label; it only matters for debugging.
// Different Views may have the same label.
view_type a ("A", 10);
view_type a("A", 10);
// Fill the View with some data. The parallel_for loop will iterate
// over the View's first dimension N.
//
// Note that the View is passed by value into the lambda. The macro
// KOKKOS_LAMBDA includes the "capture by value" clause [=]. This
// tells the lambda to "capture all variables in the enclosing scope
// by value." Views have "view semantics"; they behave like
// pointers, not like std::vector. Passing them by value does a
// shallow copy. A deep copy never happens unless you explicitly
// ask for one.
// We also need to protect the usage of a lambda against compiling
// with a backend which doesn't support it (i.e. Cuda 6.5/7.0).
#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA)
Kokkos::parallel_for (10, KOKKOS_LAMBDA (const int i) {
// Access the View just like a Fortran array. The layout depends
// on the View's memory space, so don't rely on the View's
// physical memory layout unless you know what you're doing.
a(i,0) = 1.0*i;
a(i,1) = 1.0*i*i;
a(i,2) = 1.0*i*i*i;
});
// Fill the View with some data. The parallel_for loop will iterate
// over the View's first dimension N.
//
// Note that the View is passed by value into the lambda. The macro
// KOKKOS_LAMBDA includes the "capture by value" clause [=]. This
// tells the lambda to "capture all variables in the enclosing scope
// by value." Views have "view semantics"; they behave like
// pointers, not like std::vector. Passing them by value does a
// shallow copy. A deep copy never happens unless you explicitly
// ask for one.
// We also need to protect the usage of a lambda against compiling
// with a backend which doesn't support it (i.e. Cuda 6.5/7.0).
#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA)
Kokkos::parallel_for(
10, KOKKOS_LAMBDA(const int i) {
// Access the View just like a Fortran array. The layout depends
// on the View's memory space, so don't rely on the View's
// physical memory layout unless you know what you're doing.
a(i, 0) = 1.0 * i;
a(i, 1) = 1.0 * i * i;
a(i, 2) = 1.0 * i * i * i;
});
// Reduction functor that reads the View given to its constructor.
double sum = 0;
Kokkos::parallel_reduce (10, KOKKOS_LAMBDA (const int i, double& lsum) {
lsum += a(i,0)*a(i,1)/(a(i,2)+0.1);
}, sum);
printf ("Result: %f\n", sum);
#endif
Kokkos::parallel_reduce(
10,
KOKKOS_LAMBDA(const int i, double& lsum) {
lsum += a(i, 0) * a(i, 1) / (a(i, 2) + 0.1);
},
sum);
printf("Result: %f\n", sum);
#endif
}
Kokkos::finalize ();
Kokkos::finalize();
}

View File

@ -3,8 +3,7 @@ INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR})
INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
# This is a tutorial, not a test, so we don't ask CTest to run it.
TRIBITS_ADD_EXECUTABLE(
KOKKOS_ADD_EXECUTABLE(
tutorial_04_simple_memoryspaces
SOURCES simple_memoryspaces.cpp
COMM serial mpi
)
)

View File

@ -1,13 +1,14 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
//
// Kokkos v. 3.0
// Copyright (2020) National Technology & Engineering
// Solutions of Sandia, LLC (NTESS).
//
// Under the terms of Contract DE-NA0003525 with NTESS,
// the U.S. Government retains certain rights in this software.
//
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
@ -23,10 +24,10 @@
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
@ -36,7 +37,7 @@
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
//
//
// ************************************************************************
//@HEADER
*/
@ -46,7 +47,7 @@
// The type of a two-dimensional N x 3 array of double.
// It lives in Kokkos' default memory space.
typedef Kokkos::View<double*[3]> view_type;
typedef Kokkos::View<double * [3]> view_type;
// The "HostMirror" type corresponding to view_type above is also a
// two-dimensional N x 3 array of double. However, it lives in the
@ -64,12 +65,12 @@ typedef view_type::HostMirror host_view_type;
struct ReduceFunctor {
view_type a;
ReduceFunctor (view_type a_) : a (a_) {}
typedef int value_type; //Specify type for reduction value, lsum
ReduceFunctor(view_type a_) : a(a_) {}
typedef int value_type; // Specify type for reduction value, lsum
KOKKOS_INLINE_FUNCTION
void operator() (int i, int &lsum) const {
lsum += a(i,0)-a(i,1)+a(i,2);
void operator()(int i, int &lsum) const {
lsum += a(i, 0) - a(i, 1) + a(i, 2);
}
};
@ -77,27 +78,26 @@ int main() {
Kokkos::initialize();
{
view_type a ("A", 10);
view_type a("A", 10);
// If view_type and host_mirror_type live in the same memory space,
// a "mirror view" is just an alias, and deep_copy does nothing.
// Otherwise, a mirror view of a device View lives in host memory,
// and deep_copy does a deep copy.
host_view_type h_a = Kokkos::create_mirror_view (a);
host_view_type h_a = Kokkos::create_mirror_view(a);
// The View h_a lives in host (CPU) memory, so it's legal to fill
// the view sequentially using ordinary code, like this.
for (int i = 0; i < 10; i++) {
for (int j = 0; j < 3; j++) {
h_a(i,j) = i*10 + j;
h_a(i, j) = i * 10 + j;
}
}
Kokkos::deep_copy (a, h_a); // Copy from host to device.
Kokkos::deep_copy(a, h_a); // Copy from host to device.
int sum = 0;
Kokkos::parallel_reduce (10, ReduceFunctor (a), sum);
printf ("Result is %i\n",sum);
Kokkos::parallel_reduce(10, ReduceFunctor(a), sum);
printf("Result is %i\n", sum);
}
Kokkos::finalize ();
Kokkos::finalize();
}

View File

@ -3,8 +3,8 @@ INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR})
INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
# This is a tutorial, not a test, so we don't ask CTest to run it.
TRIBITS_ADD_EXECUTABLE(
KOKKOS_ADD_EXECUTABLE(
tutorial_05_simple_atomics
SOURCES simple_atomics.cpp
COMM serial mpi
)
)

View File

@ -1,13 +1,14 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
//
// Kokkos v. 3.0
// Copyright (2020) National Technology & Engineering
// Solutions of Sandia, LLC (NTESS).
//
// Under the terms of Contract DE-NA0003525 with NTESS,
// the U.S. Government retains certain rights in this software.
//
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
@ -23,10 +24,10 @@
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
@ -36,7 +37,7 @@
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
//
//
// ************************************************************************
//@HEADER
*/
@ -59,7 +60,6 @@ typedef view_type::HostMirror host_view_type;
typedef Kokkos::View<int> count_type;
typedef count_type::HostMirror host_count_type;
// Functor for finding a list of primes in a given set of numbers. If
// run in parallel, the order of results is nondeterministic, because
// hardware atomic updates do not guarantee an order of execution.
@ -68,26 +68,25 @@ struct findprimes {
view_type result;
count_type count;
findprimes (view_type data_, view_type result_, count_type count_) :
data (data_), result (result_), count (count_)
{}
findprimes(view_type data_, view_type result_, count_type count_)
: data(data_), result(result_), count(count_) {}
// Test if data(i) is prime. If it is, increment the count of
// primes (stored in the zero-dimensional View 'count') and add the
// value to the current list of primes 'result'.
KOKKOS_INLINE_FUNCTION
void operator() (const int i) const {
const int number = data(i); // the current number
void operator()(const int i) const {
const int number = data(i); // the current number
// Test all numbers from 3 to ceiling(sqrt(data(i))), to see if
// they are factors of data(i). It's not the most efficient prime
// test, but it works.
const int upper_bound = std::sqrt(1.0*number)+1;
bool is_prime = !(number%2 == 0);
int k = 3;
const int upper_bound = std::sqrt(1.0 * number) + 1;
bool is_prime = !(number % 2 == 0);
int k = 3;
while (k < upper_bound && is_prime) {
is_prime = !(number%k == 0);
k += 2; // don't have to test even numbers
is_prime = !(number % k == 0);
k += 2; // don't have to test even numbers
}
if (is_prime) {
@ -98,42 +97,41 @@ struct findprimes {
// atomic_fetch_add returns the _current_ count, but increments
// it (by 1 in this case). The current count of primes indexes
// into the first unoccupied position of the 'result' array.
const int idx = Kokkos::atomic_fetch_add (&count(), 1);
result(idx) = number;
const int idx = Kokkos::atomic_fetch_add(&count(), 1);
result(idx) = number;
}
}
};
int main () {
Kokkos::initialize ();
int main() {
Kokkos::initialize();
{
srand (61391); // Set the random seed
srand(61391); // Set the random seed
int nnumbers = 100000;
view_type data ("RND", nnumbers);
view_type result ("Prime", nnumbers);
count_type count ("Count");
view_type data("RND", nnumbers);
view_type result("Prime", nnumbers);
count_type count("Count");
host_view_type h_data = Kokkos::create_mirror_view (data);
host_view_type h_result = Kokkos::create_mirror_view (result);
host_count_type h_count = Kokkos::create_mirror_view (count);
host_view_type h_data = Kokkos::create_mirror_view(data);
host_view_type h_result = Kokkos::create_mirror_view(result);
host_count_type h_count = Kokkos::create_mirror_view(count);
typedef view_type::size_type size_type;
// Fill the 'data' array on the host with random numbers. We assume
// that they come from some process which is only implemented on the
// host, via some library. (That's true in this case.)
for (size_type i = 0; i < data.extent(0); ++i) {
h_data(i) = rand () % nnumbers;
h_data(i) = rand() % nnumbers;
}
Kokkos::deep_copy (data, h_data); // copy from host to device
Kokkos::deep_copy(data, h_data); // copy from host to device
Kokkos::parallel_for (data.extent(0), findprimes (data, result, count));
Kokkos::deep_copy (h_count, count); // copy from device to host
Kokkos::parallel_for(data.extent(0), findprimes(data, result, count));
Kokkos::deep_copy(h_count, count); // copy from device to host
printf ("Found %i prime numbers in %i random numbers\n", h_count(), nnumbers);
printf("Found %i prime numbers in %i random numbers\n", h_count(),
nnumbers);
}
Kokkos::finalize ();
Kokkos::finalize();
}

View File

@ -3,8 +3,8 @@ INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR})
INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
# This is a tutorial, not a test, so we don't ask CTest to run it.
TRIBITS_ADD_EXECUTABLE(
KOKKOS_ADD_EXECUTABLE(
tutorial_06_simple_mdrangepolicy
SOURCES simple_mdrangepolicy.cpp
COMM serial mpi
)
)

View File

@ -1,13 +1,14 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
//
// Kokkos v. 3.0
// Copyright (2020) National Technology & Engineering
// Solutions of Sandia, LLC (NTESS).
//
// Under the terms of Contract DE-NA0003525 with NTESS,
// the U.S. Government retains certain rights in this software.
//
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
@ -23,10 +24,10 @@
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
@ -36,7 +37,7 @@
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
//
//
// ************************************************************************
//@HEADER
*/
@ -52,150 +53,163 @@
// 3. Shut down Kokkos
//
// Two examples are provided:
// Example 1: Rank 2 case with minimal default parameters and arguments used
// Example 1: Rank 2 case with minimal default parameters and arguments used
// in the MDRangePolicy
//
// Example 2: Rank 3 case with additional outer/inner iterate pattern parameters
// and tile dims passed to the ctor
// Simple functor for computing/storing the product of indices in a View v
template < class ViewType >
template <class ViewType>
struct MDFunctor {
typedef long value_type;
ViewType v;
size_t size;
size_t size;
MDFunctor( const ViewType & v_, const size_t size_ )
: v(v_), size(size_) {}
MDFunctor(const ViewType& v_, const size_t size_) : v(v_), size(size_) {}
// 2D case - used by parallel_for
KOKKOS_INLINE_FUNCTION
void operator () (const int i, const int j) const {
v(i,j) = i*j; // compute the product of indices
void operator()(const int i, const int j) const {
v(i, j) = i * j; // compute the product of indices
}
// 3D case - used by parallel_for
KOKKOS_INLINE_FUNCTION
void operator () (const int i, const int j, const int k) const {
v(i,j,k) = i*j*k; // compute the product of indices
void operator()(const int i, const int j, const int k) const {
v(i, j, k) = i * j * k; // compute the product of indices
}
// 2D case - reduction
KOKKOS_INLINE_FUNCTION
void operator () (const int i, const int j, value_type & incorrect_count) const {
if ( v(i,j) != i*j ) {
void operator()(const int i, const int j, value_type& incorrect_count) const {
if (v(i, j) != i * j) {
incorrect_count += 1;
}
}
// 3D case - reduction
KOKKOS_INLINE_FUNCTION
void operator () (const int i, const int j, const int k, value_type & incorrect_count) const {
if ( v(i,j,k) != i*j*k ) {
void operator()(const int i, const int j, const int k,
value_type& incorrect_count) const {
if (v(i, j, k) != i * j * k) {
incorrect_count += 1;
}
}
};
int main (int argc, char* argv[]) {
Kokkos::initialize (argc, argv);
int main(int argc, char* argv[]) {
Kokkos::initialize(argc, argv);
// Bound(s) for MDRangePolicy
// Bound(s) for MDRangePolicy
const int n = 100;
// ViewType typedefs for Rank<2>, Rank<3> for example usage
typedef double ScalarType;
typedef typename Kokkos::View<ScalarType**> ViewType_2D;
typedef typename Kokkos::View<ScalarType***> ViewType_3D;
typedef typename Kokkos::View<ScalarType**> ViewType_2D;
typedef typename Kokkos::View<ScalarType***> ViewType_3D;
/////////////////////////////////////////////////////////////////////////////
// Explanation of MDRangePolicy usage, template parameters, constructor arguments
// Explanation of MDRangePolicy usage, template parameters, constructor
// arguments
//
// MDRangePolicy typedefs for Rank<2>, Rank<3> cases
// Required template parameters:
// Required template parameters:
// Kokkos::Rank<N>: where N=rank
//
// Optional template parameters to Rank<...>:
// Kokkos::Iterate::{Default,Left,Right}: Outer iteration pattern across tiles;
// Kokkos::Iterate::{Default,Left,Right}: Outer iteration pattern across
// tiles;
// defaults based on the execution space similar to Kokkos::Layout
// Kokkos::Iterate::{Default,Left,Right}: Inner iteration pattern within tiles;
// Kokkos::Iterate::{Default,Left,Right}: Inner iteration pattern within
// tiles;
// defaults based on the execution space similar to Kokkos::Layout
//
// e.g. typedef Rank<2, Iterate::Left, Iterate::Left> rank2ll;
//
//
// Optional template parameters to MDRangePolicy:
// ExecutionSpace: Kokkos::Serial, Kokkos::OpenMP, Kokkos::Cuda, etc.
// ExecutionSpace: Kokkos::Serial, Kokkos::OpenMP, Kokkos::Cuda, etc.
//
// Kokkos::IndexType< T >: where T = int, long, unsigned int, etc.
//
// struct Tag{}: A user-provided tag for tagging functor operators
//
// e.g. 1: MDRangePolicy< Kokkos::Serial, Rank<2, Iterate::Left, Iterate::Left>, IndexType<int>, Tag > mdpolicy;
// e.g. 2: MDRangePolicy< Kokkos::Serial, rank2ll, IndexType<int>, Tag > mdpolicy;
// e.g. 1: MDRangePolicy< Kokkos::Serial, Rank<2, Iterate::Left,
//         Iterate::Left>, IndexType<int>, Tag > mdpolicy;
// e.g. 2: MDRangePolicy< Kokkos::Serial, rank2ll, IndexType<int>, Tag >
//         mdpolicy;
//
//
// Required arguments to ctor:
// {{ l0, l1, ... }}: Lower bounds, provided as Kokkos::Array or std::initializer_list
// {{ u0, u1, ... }}: Upper bounds, provided as Kokkos::Array or std::initializer_list
// {{ l0, l1, ... }}: Lower bounds, provided as Kokkos::Array or
// std::initializer_list
// {{ u0, u1, ... }}: Upper bounds, provided as Kokkos::Array or
// std::initializer_list
//
// Optional arguments to ctor:
// {{ t0, t1, ... }}: Tile dimensions, provided as Kokkos::Array or std::initializer_list
// {{ t0, t1, ... }}: Tile dimensions, provided as Kokkos::Array or
// std::initializer_list
// defaults based on the execution space
//
// e.g. mdpolicy( {{0,0}}, {{u0,u1}}, {{t0,t1}} );
//
//
/////////////////////////////////////////////////////////////////////////////
// Example 1:
// Example 1:
long incorrect_count_2d = 0;
{
// Rank<2> Case: Rank is provided, all other parameters are default
typedef typename Kokkos::Experimental::MDRangePolicy< Kokkos::Experimental::Rank<2> > MDPolicyType_2D;
typedef typename Kokkos::Experimental::MDRangePolicy<
Kokkos::Experimental::Rank<2> >
MDPolicyType_2D;
// Construct 2D MDRangePolicy: lower and upper bounds provided, tile dims defaulted
MDPolicyType_2D mdpolicy_2d( {{0,0}}, {{n,n}} );
// Construct 2D MDRangePolicy: lower and upper bounds provided, tile dims
// defaulted
MDPolicyType_2D mdpolicy_2d({{0, 0}}, {{n, n}});
// Construct a 2D view to store result of product of indices
ViewType_2D v2("v2", n, n);
// Execute parallel_for with rank 2 MDRangePolicy
Kokkos::parallel_for( "md2d", mdpolicy_2d, MDFunctor<ViewType_2D>(v2, n) );
Kokkos::parallel_for("md2d", mdpolicy_2d, MDFunctor<ViewType_2D>(v2, n));
// Check results with a parallel_reduce using the MDRangePolicy
Kokkos::parallel_reduce( "md2dredux", mdpolicy_2d, MDFunctor<ViewType_2D>(v2, n), incorrect_count_2d );
Kokkos::parallel_reduce("md2dredux", mdpolicy_2d,
MDFunctor<ViewType_2D>(v2, n), incorrect_count_2d);
printf("Rank 2 MDRangePolicy incorrect count: %ld\n", incorrect_count_2d); // should be 0
printf("Rank 2 MDRangePolicy incorrect count: %ld\n",
incorrect_count_2d); // should be 0
}
// Example 2:
// Example 2:
long incorrect_count_3d = 0;
{
// Rank<3> Case: Rank, inner iterate pattern, outer iterate pattern provided
typedef typename Kokkos::Experimental::MDRangePolicy< Kokkos::Experimental::Rank<3, Kokkos::Experimental::Iterate::Left, Kokkos::Experimental::Iterate::Left> > MDPolicyType_3D;
typedef typename Kokkos::Experimental::MDRangePolicy<
Kokkos::Experimental::Rank<3, Kokkos::Experimental::Iterate::Left,
Kokkos::Experimental::Iterate::Left> >
MDPolicyType_3D;
// Construct 3D MDRangePolicy: lower, upper bounds, tile dims provided
MDPolicyType_3D mdpolicy_3d( {{0,0,0}}, {{n,n,n}}, {{4,4,4}} );
MDPolicyType_3D mdpolicy_3d({{0, 0, 0}}, {{n, n, n}}, {{4, 4, 4}});
// Construct a 3D view to store result of product of indices
ViewType_3D v3("v3", n, n, n);
// Execute parallel_for with rank 3 MDRangePolicy
Kokkos::parallel_for( "md3d", mdpolicy_3d, MDFunctor<ViewType_3D>(v3, n) );
Kokkos::parallel_for("md3d", mdpolicy_3d, MDFunctor<ViewType_3D>(v3, n));
// Check results with a parallel_reduce using the MDRangePolicy
Kokkos::parallel_reduce( "md3dredux", mdpolicy_3d, MDFunctor<ViewType_3D>(v3, n), incorrect_count_3d );
Kokkos::parallel_reduce("md3dredux", mdpolicy_3d,
MDFunctor<ViewType_3D>(v3, n), incorrect_count_3d);
printf("Rank 3 MDRangePolicy incorrect count: %ld\n", incorrect_count_3d); // should be 0
printf("Rank 3 MDRangePolicy incorrect count: %ld\n",
incorrect_count_3d); // should be 0
}
Kokkos::finalize ();
Kokkos::finalize();
return (incorrect_count_2d == long(0) && incorrect_count_3d == long(0)) ? 0 : -1;
return (incorrect_count_2d == long(0) && incorrect_count_3d == long(0)) ? 0
: -1;
}

View File

@ -3,8 +3,7 @@ INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR})
INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
# This is a tutorial, not a test, so we don't ask CTest to run it.
TRIBITS_ADD_EXECUTABLE(
KOKKOS_ADD_EXECUTABLE(
tutorial_advancedviews_01_data_layouts
SOURCES data_layouts.cpp
COMM serial mpi
)
)

View File

@ -1,13 +1,14 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
//
// Kokkos v. 3.0
// Copyright (2020) National Technology & Engineering
// Solutions of Sandia, LLC (NTESS).
//
// Under the terms of Contract DE-NA0003525 with NTESS,
// the U.S. Government retains certain rights in this software.
//
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
@ -23,10 +24,10 @@
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
@ -36,7 +37,7 @@
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
//
//
// ************************************************************************
//@HEADER
*/
@ -62,19 +63,19 @@ typedef Kokkos::View<double*> view_type;
// parallel_for functor that fills the given View with some data. It
// expects to access the View by rows in parallel: each call i of
// operator() accesses a row.
template<class ViewType>
template <class ViewType>
struct init_view {
ViewType a;
init_view (ViewType a_) : a (a_) {}
init_view(ViewType a_) : a(a_) {}
KOKKOS_INLINE_FUNCTION
void operator() (const typename ViewType::size_type i) const {
void operator()(const typename ViewType::size_type i) const {
// On CPUs this loop could be vectorized so j should do stride 1
// access on a for optimal performance. I.e. a should be LayoutRight.
// On GPUs threads should do coalesced loads and stores. That means
// that i should be the stride one access for optimal performance.
for (typename ViewType::size_type j = 0; j < a.extent(1); ++j) {
a(i,j) = 1.0*a.extent(0)*i + 1.0*j;
a(i, j) = 1.0 * a.extent(0) * i + 1.0 * j;
}
}
};
@ -86,14 +87,13 @@ struct init_view {
// Since the functor is templated on the ViewTypes themselves, it doesn't matter
// what their layouts are. That means you can use different layouts on different
// architectures.
template<class ViewType1, class ViewType2>
template <class ViewType1, class ViewType2>
struct contraction {
view_type a;
typename ViewType1::const_type v1;
typename ViewType2::const_type v2;
contraction (view_type a_, ViewType1 v1_, ViewType2 v2_) :
a (a_), v1 (v1_), v2 (v2_)
{}
contraction(view_type a_, ViewType1 v1_, ViewType2 v2_)
: a(a_), v1(v1_), v2(v2_) {}
// As with the initialization functor the performance of this operator
// depends on the architecture and the chosen data layouts.
@ -103,9 +103,9 @@ struct contraction {
// the thread Index, i must be the stride 1 dimension. That means v1 should be
// LayoutLeft and v2 LayoutRight.
KOKKOS_INLINE_FUNCTION
void operator() (const view_type::size_type i) const {
void operator()(const view_type::size_type i) const {
for (view_type::size_type j = 0; j < v1.extent(1); ++j) {
a(i) = v1(i,j)*v2(j,i);
a(i) = v1(i, j) * v2(j, i);
}
}
};
@ -113,61 +113,61 @@ struct contraction {
// Compute a dot product. This is used for result verification.
struct dot {
view_type a;
dot (view_type a_) : a (a_) {}
typedef double value_type; //Specify type for reduction target, lsum
dot(view_type a_) : a(a_) {}
typedef double value_type; // Specify type for reduction target, lsum
KOKKOS_INLINE_FUNCTION
void operator() (const view_type::size_type i, double &lsum) const {
lsum += a(i)*a(i);
void operator()(const view_type::size_type i, double& lsum) const {
lsum += a(i) * a(i);
}
};
int main (int narg, char* arg[]) {
int main(int narg, char* arg[]) {
// When initializing Kokkos, you may pass in command-line arguments,
// just like with MPI_Init(). Kokkos reserves the right to remove
// arguments from the list that start with '--kokkos-'.
Kokkos::initialize (narg, arg);
Kokkos::initialize(narg, arg);
{
int size = 10000;
view_type a("A",size);
view_type a("A", size);
// Define two views with LayoutLeft and LayoutRight.
left_type l("L",size,10000);
right_type r("R",size,10000);
left_type l("L", size, 10000);
right_type r("R", size, 10000);
// Initialize the data in the views.
Kokkos::parallel_for(size,init_view<left_type>(l));
Kokkos::parallel_for(size,init_view<right_type>(r));
Kokkos::parallel_for(size, init_view<left_type>(l));
Kokkos::parallel_for(size, init_view<right_type>(r));
Kokkos::fence();
// Measure time to execute the contraction kernel when giving it a
// LayoutLeft view for v1 and a LayoutRight view for v2. This should be
// fast on GPUs and slow on CPUs
Kokkos::Timer time1;
Kokkos::parallel_for(size,contraction<left_type,right_type>(a,l,r));
Kokkos::parallel_for(size, contraction<left_type, right_type>(a, l, r));
Kokkos::fence();
double sec1 = time1.seconds();
double sum1 = 0;
Kokkos::parallel_reduce(size,dot(a),sum1);
Kokkos::parallel_reduce(size, dot(a), sum1);
Kokkos::fence();
// Measure time to execute the contraction kernel when giving it a
// LayoutRight view for v1 and a LayoutLeft view for v2. This should be
// fast on CPUs and slow on GPUs
Kokkos::Timer time2;
Kokkos::parallel_for(size,contraction<right_type,left_type>(a,r,l));
Kokkos::parallel_for(size, contraction<right_type, left_type>(a, r, l));
Kokkos::fence();
double sec2 = time2.seconds();
double sum2 = 0;
Kokkos::parallel_reduce(size,dot(a),sum2);
Kokkos::parallel_reduce(size, dot(a), sum2);
// Kokkos' reductions are deterministic.
// The results should always be equal.
printf("Result Left/Right %f Right/Left %f (equal result: %i)\n",sec1,sec2,sum2==sum1);
printf("Result Left/Right %f Right/Left %f (equal result: %i)\n", sec1,
sec2, sum2 == sum1);
}
Kokkos::finalize();
}

View File

@ -3,8 +3,7 @@ INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR})
INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
# This is a tutorial, not a test, so we don't ask CTest to run it.
TRIBITS_ADD_EXECUTABLE(
KOKKOS_ADD_EXECUTABLE(
tutorial_advancedviews_02_memory_traits
SOURCES memory_traits.cpp
COMM serial mpi
)
)

View File

@ -1,13 +1,14 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
//
// Kokkos v. 3.0
// Copyright (2020) National Technology & Engineering
// Solutions of Sandia, LLC (NTESS).
//
// Under the terms of Contract DE-NA0003525 with NTESS,
// the U.S. Government retains certain rights in this software.
//
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
@ -23,10 +24,10 @@
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
@ -36,7 +37,7 @@
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
//
//
// ************************************************************************
//@HEADER
*/
@ -62,82 +63,85 @@ typedef Kokkos::View<double*> view_type;
// cache. This only works if the View is read-only, which we enforce
// through the first template parameter.
//
// Note that we are still talking about views of the data, it's not a new allocation.
// For example you can have an atomic view of a default view. While you even
// could use both in the same kernel, this could lead to undefined behaviour because
// one of your access paths is not atomic. Think of it in the same way as you think of
// pointers to const data and pointers to non-const data (i.e. const double* and double*).
// While these pointers can point to the same data you should not use them together if that
// breaks the const guarantee of the first pointer.
typedef Kokkos::View<const double*, Kokkos::MemoryTraits<Kokkos::RandomAccess> > view_type_rnd;
// Note that we are still talking about views of the data, it's not a new
// allocation. For example you can have an atomic view of a default view. While
// you even could use both in the same kernel, this could lead to undefined
// behaviour because one of your access paths is not atomic. Think of it in the
// same way as you think of pointers to const data and pointers to non-const
// data (i.e. const double* and double*). While these pointers can point to the
// same data you should not use them together if that breaks the const guarantee
// of the first pointer.
typedef Kokkos::View<const double*, Kokkos::MemoryTraits<Kokkos::RandomAccess> >
view_type_rnd;
typedef Kokkos::View<int**> idx_type;
typedef idx_type::HostMirror idx_type_host;
// We template this functor on the ViewTypes to show the effect of the RandomAccess trait.
template<class DestType, class SrcType>
// We template this functor on the ViewTypes to show the effect of the
// RandomAccess trait.
template <class DestType, class SrcType>
struct localsum {
idx_type::const_type idx;
DestType dest;
SrcType src;
localsum (idx_type idx_, DestType dest_, SrcType src_) :
idx (idx_), dest (dest_), src (src_)
{}
localsum(idx_type idx_, DestType dest_, SrcType src_)
: idx(idx_), dest(dest_), src(src_) {}
// Calculate a local sum of values
KOKKOS_INLINE_FUNCTION
void operator() (const int i) const {
void operator()(const int i) const {
double tmp = 0.0;
for (int j = 0; j < (int) idx.extent(1); ++j) {
for (int j = 0; j < (int)idx.extent(1); ++j) {
// This is an indirect access on src
const double val = src(idx(i,j));
tmp += val*val + 0.5*(idx.extent(0)*val -idx.extent(1)*val);
const double val = src(idx(i, j));
tmp += val * val + 0.5 * (idx.extent(0) * val - idx.extent(1) * val);
}
dest(i) = tmp;
}
};
int main(int narg, char* arg[]) {
Kokkos::initialize (narg, arg);
Kokkos::initialize(narg, arg);
{
int size = 1000000;
idx_type idx("Idx",size,64);
idx_type_host h_idx = Kokkos::create_mirror_view (idx);
idx_type idx("Idx", size, 64);
idx_type_host h_idx = Kokkos::create_mirror_view(idx);
view_type dest ("Dest", size);
view_type src ("Src", size);
view_type dest("Dest", size);
view_type src("Src", size);
srand(134231);
for (int i = 0; i < size; i++) {
for (view_type::size_type j = 0; j < h_idx.extent(1); ++j) {
h_idx(i,j) = (size + i + (rand () % 500 - 250)) % size;
h_idx(i, j) = (size + i + (rand() % 500 - 250)) % size;
}
}
// Deep copy the initial data to the device
Kokkos::deep_copy(idx,h_idx);
Kokkos::deep_copy(idx, h_idx);
// Run the first kernel to warmup caches
Kokkos::parallel_for(size,localsum<view_type,view_type_rnd>(idx,dest,src));
Kokkos::parallel_for(size,
localsum<view_type, view_type_rnd>(idx, dest, src));
Kokkos::fence();
// Run the localsum functor using the RandomAccess trait. On CPUs there should
// not be any difference in performance to not using the RandomAccess trait.
// On GPUs there can be a dramatic difference.
// Run the localsum functor using the RandomAccess trait. On CPUs there
// should not be any difference in performance to not using the RandomAccess
// trait. On GPUs there can be a dramatic difference.
Kokkos::Timer time1;
Kokkos::parallel_for(size,localsum<view_type,view_type_rnd>(idx,dest,src));
Kokkos::parallel_for(size,
localsum<view_type, view_type_rnd>(idx, dest, src));
Kokkos::fence();
double sec1 = time1.seconds();
Kokkos::Timer time2;
Kokkos::parallel_for(size,localsum<view_type,view_type>(idx,dest,src));
Kokkos::parallel_for(size, localsum<view_type, view_type>(idx, dest, src));
Kokkos::fence();
double sec2 = time2.seconds();
printf("Time with Trait RandomAccess: %f with Plain: %f \n",sec1,sec2);
printf("Time with Trait RandomAccess: %f with Plain: %f \n", sec1, sec2);
}
Kokkos::finalize();
}

View File

@ -3,8 +3,7 @@ INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR})
INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
# This is a tutorial, not a test, so we don't ask CTest to run it.
TRIBITS_ADD_EXECUTABLE(
KOKKOS_ADD_EXECUTABLE(
tutorial_advancedviews_03_subviews
SOURCES subviews.cpp
COMM serial mpi
)
)

View File

@ -1,13 +1,14 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
//
// Kokkos v. 3.0
// Copyright (2020) National Technology & Engineering
// Solutions of Sandia, LLC (NTESS).
//
// Under the terms of Contract DE-NA0003525 with NTESS,
// the U.S. Government retains certain rights in this software.
//
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
@ -23,10 +24,10 @@
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
@ -36,7 +37,7 @@
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
//
//
// ************************************************************************
//@HEADER
*/
@ -70,19 +71,17 @@ typedef Kokkos::View<double***, Kokkos::LayoutStride> inner_mesh_type;
// Functor to set all entries of a boundary of the mesh to a constant
// value. The functor is templated on ViewType because different
// boundaries may have different layouts.
template<class ViewType>
template <class ViewType>
struct set_boundary {
ViewType a;
double value;
set_boundary (ViewType a_, double value_) :
a (a_), value (value_)
{}
set_boundary(ViewType a_, double value_) : a(a_), value(value_) {}
KOKKOS_INLINE_FUNCTION
void operator() (const typename ViewType::size_type i) const {
void operator()(const typename ViewType::size_type i) const {
for (typename ViewType::size_type j = 0; j < a.extent(1); ++j) {
a(i,j) = value;
a(i, j) = value;
}
}
};
@ -90,21 +89,19 @@ struct set_boundary {
// Functor to set all entries of the interior of the mesh to a constant
// value. The functor is templated on ViewType so that it works with
// whatever layout the interior View has.
template<class ViewType>
template <class ViewType>
struct set_inner {
ViewType a;
double value;
set_inner (ViewType a_, double value_) :
a (a_), value (value_)
{}
set_inner(ViewType a_, double value_) : a(a_), value(value_) {}
KOKKOS_INLINE_FUNCTION
void operator () (const typename ViewType::size_type i) const {
void operator()(const typename ViewType::size_type i) const {
typedef typename ViewType::size_type size_type;
for (size_type j = 0; j < a.extent(1); ++j) {
for (size_type k = 0; k < a.extent(2); ++k) {
a(i,j,k) = value;
a(i, j, k) = value;
}
}
}
@ -112,38 +109,34 @@ struct set_inner {
// Update the interior of the mesh. This simulates one timestep of a
// finite-difference method.
template<class ViewType>
template <class ViewType>
struct update {
ViewType a;
const double dt;
update (ViewType a_, const double dt_) :
a (a_), dt (dt_)
{}
update(ViewType a_, const double dt_) : a(a_), dt(dt_) {}
KOKKOS_INLINE_FUNCTION
void operator() (typename ViewType::size_type i) const {
void operator()(typename ViewType::size_type i) const {
typedef typename ViewType::size_type size_type;
i++;
for (size_type j = 1; j < a.extent(1)-1; j++) {
for (size_type k = 1; k < a.extent(2)-1; k++) {
a(i,j,k) += dt* (a(i,j,k+1) - a(i,j,k-1) +
a(i,j+1,k) - a(i,j-1,k) +
a(i+1,j,k) - a(i-1,j,k));
for (size_type j = 1; j < a.extent(1) - 1; j++) {
for (size_type k = 1; k < a.extent(2) - 1; k++) {
a(i, j, k) += dt * (a(i, j, k + 1) - a(i, j, k - 1) + a(i, j + 1, k) -
a(i, j - 1, k) + a(i + 1, j, k) - a(i - 1, j, k));
}
}
}
};
int main (int narg, char* arg[]) {
int main(int narg, char* arg[]) {
using Kokkos::ALL;
using Kokkos::pair;
using Kokkos::parallel_for;
using Kokkos::subview;
typedef mesh_type::size_type size_type;
Kokkos::initialize (narg, arg);
Kokkos::initialize(narg, arg);
{
// The number of mesh points along each dimension of the mesh, not
@ -151,43 +144,48 @@ int main (int narg, char* arg[]) {
const size_type size = 100;
// A is the full cubic 3-D mesh, including the boundaries.
mesh_type A ("A", size+2, size+2, size+2);
mesh_type A("A", size + 2, size + 2, size + 2);
// Ai is the "inner" part of A, _not_ including the boundaries.
//
// A pair of indices in a particular dimension means the contiguous
// zero-based index range in that dimension, including the first
// entry of the pair but _not_ including the second entry.
inner_mesh_type Ai = subview(A, pair<size_type, size_type> (1, size+1),
pair<size_type, size_type> (1, size+1),
pair<size_type, size_type> (1, size+1));
inner_mesh_type Ai = subview(A, pair<size_type, size_type>(1, size + 1),
pair<size_type, size_type>(1, size + 1),
pair<size_type, size_type>(1, size + 1));
// A has six boundaries, one for each face of the cube.
// Create a View of each of these boundaries.
// ALL() means "select all indices in that dimension."
xy_plane_type Zneg_halo = subview(A, ALL (), ALL (), 0);
xy_plane_type Zpos_halo = subview(A, ALL (), ALL (), 101);
xz_plane_type Yneg_halo = subview(A, ALL (), 0, ALL ());
xz_plane_type Ypos_halo = subview(A, ALL (), 101, ALL ());
yz_plane_type Xneg_halo = subview(A, 0, ALL (), ALL ());
yz_plane_type Xpos_halo = subview(A, 101, ALL (), ALL ());
xy_plane_type Zneg_halo = subview(A, ALL(), ALL(), 0);
xy_plane_type Zpos_halo = subview(A, ALL(), ALL(), 101);
xz_plane_type Yneg_halo = subview(A, ALL(), 0, ALL());
xz_plane_type Ypos_halo = subview(A, ALL(), 101, ALL());
yz_plane_type Xneg_halo = subview(A, 0, ALL(), ALL());
yz_plane_type Xpos_halo = subview(A, 101, ALL(), ALL());
// Set the boundaries to their initial conditions.
parallel_for (Zneg_halo.extent(0), set_boundary<xy_plane_type> (Zneg_halo, 1));
parallel_for (Zpos_halo.extent(0), set_boundary<xy_plane_type> (Zpos_halo, -1));
parallel_for (Yneg_halo.extent(0), set_boundary<xz_plane_type> (Yneg_halo, 2));
parallel_for (Ypos_halo.extent(0), set_boundary<xz_plane_type> (Ypos_halo, -2));
parallel_for (Xneg_halo.extent(0), set_boundary<yz_plane_type> (Xneg_halo, 3));
parallel_for (Xpos_halo.extent(0), set_boundary<yz_plane_type> (Xpos_halo, -3));
parallel_for(Zneg_halo.extent(0),
set_boundary<xy_plane_type>(Zneg_halo, 1));
parallel_for(Zpos_halo.extent(0),
set_boundary<xy_plane_type>(Zpos_halo, -1));
parallel_for(Yneg_halo.extent(0),
set_boundary<xz_plane_type>(Yneg_halo, 2));
parallel_for(Ypos_halo.extent(0),
set_boundary<xz_plane_type>(Ypos_halo, -2));
parallel_for(Xneg_halo.extent(0),
set_boundary<yz_plane_type>(Xneg_halo, 3));
parallel_for(Xpos_halo.extent(0),
set_boundary<yz_plane_type>(Xpos_halo, -3));
// Set the interior of the mesh to its initial condition.
parallel_for (Ai.extent(0), set_inner<inner_mesh_type> (Ai, 0));
parallel_for(Ai.extent(0), set_inner<inner_mesh_type>(Ai, 0));
// Update the interior of the mesh.
// This simulates one timestep with dt = 0.1.
parallel_for (Ai.extent(0), update<mesh_type> (A, 0.1));
parallel_for(Ai.extent(0), update<mesh_type>(A, 0.1));
Kokkos::fence();
printf ("Done\n");
printf("Done\n");
}
Kokkos::finalize ();
Kokkos::finalize();
}

View File

@ -3,8 +3,7 @@ INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR})
INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
# This is a tutorial, not a test, so we don't ask CTest to run it.
TRIBITS_ADD_EXECUTABLE(
KOKKOS_ADD_EXECUTABLE(
tutorial_advancedviews_04_dualviews
SOURCES dual_view.cpp
COMM serial mpi
)
)

View File

@ -1,13 +1,14 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
//
// Kokkos v. 3.0
// Copyright (2020) National Technology & Engineering
// Solutions of Sandia, LLC (NTESS).
//
// Under the terms of Contract DE-NA0003525 with NTESS,
// the U.S. Government retains certain rights in this software.
//
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
@ -23,10 +24,10 @@
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
@ -36,7 +37,7 @@
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
//
//
// ************************************************************************
//@HEADER
*/
@ -68,16 +69,16 @@
typedef Kokkos::DualView<double*> view_type;
typedef Kokkos::DualView<int**> idx_type;
template<class ExecutionSpace>
template <class ExecutionSpace>
struct localsum {
// If the functor has a public 'execution_space' typedef, that defines
// the functor's execution space (where it runs in parallel). This
// overrides Kokkos' default execution space.
typedef ExecutionSpace execution_space;
typedef typename Kokkos::Impl::if_c<std::is_same<ExecutionSpace,Kokkos::DefaultExecutionSpace>::value ,
idx_type::memory_space, idx_type::host_mirror_space>::type memory_space;
typedef typename Kokkos::Impl::if_c<
std::is_same<ExecutionSpace, Kokkos::DefaultExecutionSpace>::value,
idx_type::memory_space, idx_type::host_mirror_space>::type memory_space;
// Get the view types on the particular device for which the functor
// is instantiated.
@ -86,25 +87,28 @@ struct localsum {
// the const version of the first template parameter of the View.
// For example, the const_data_type version of double** is const
// double**.
Kokkos::View<idx_type::const_data_type, idx_type::array_layout, memory_space> idx;
Kokkos::View<idx_type::const_data_type, idx_type::array_layout, memory_space>
idx;
// "scalar_array_type" is a typedef in ViewTraits (and DualView) which is the
// array version of the value(s) stored in the View.
Kokkos::View<view_type::scalar_array_type, view_type::array_layout, memory_space> dest;
Kokkos::View<view_type::scalar_array_type, view_type::array_layout,
memory_space>
dest;
Kokkos::View<view_type::const_data_type, view_type::array_layout,
memory_space, Kokkos::MemoryRandomAccess> src;
memory_space, Kokkos::MemoryRandomAccess>
src;
// Constructor takes DualViews, synchronizes them to the device,
// then marks them as modified on the device.
localsum (idx_type dv_idx, view_type dv_dest, view_type dv_src)
{
localsum(idx_type dv_idx, view_type dv_dest, view_type dv_src) {
// Extract the view on the correct Device (i.e., the correct
// memory space) from the DualView. DualView has a template
// method, view(), which is templated on the memory space. If the
// DualView has a View from that memory space, view() returns the
// View in that space.
idx = dv_idx.view<memory_space> ();
dest = dv_dest.template view<memory_space> ();
src = dv_src.template view<memory_space> ();
idx = dv_idx.view<memory_space>();
dest = dv_dest.template view<memory_space>();
src = dv_src.template view<memory_space>();
// Synchronize the DualView to the correct Device.
//
@ -116,103 +120,106 @@ struct localsum {
// determines this by the user manually marking one side or the
// other as modified; see the modify() call below.
dv_idx.sync<memory_space> ();
dv_dest.template sync<memory_space> ();
dv_src.template sync<memory_space> ();
dv_idx.sync<memory_space>();
dv_dest.template sync<memory_space>();
dv_src.template sync<memory_space>();
// Mark dest as modified on Device.
dv_dest.template modify<memory_space> ();
dv_dest.template modify<memory_space>();
}
KOKKOS_INLINE_FUNCTION
void operator() (const int i) const {
void operator()(const int i) const {
double tmp = 0.0;
for (int j = 0; j < (int) idx.extent(1); ++j) {
const double val = src(idx(i,j));
tmp += val*val + 0.5*(idx.extent(0)*val -idx.extent(1)*val);
for (int j = 0; j < (int)idx.extent(1); ++j) {
const double val = src(idx(i, j));
tmp += val * val + 0.5 * (idx.extent(0) * val - idx.extent(1) * val);
}
dest(i) += tmp;
}
};
// Plain value type with three public double members. In this file it is the
// element type of the ParticleTypes DualView (ParticleType[10]).
// NOTE(review): q and m presumably stand for charge and mass - confirm.
class ParticleType {
public:
double q;
double m;
double q_over_m;
KOKKOS_INLINE_FUNCTION
ParticleType(double q_ = -1, double m_ = 1):
q(q_), m(m_), q_over_m(q/m) {}
protected:
public:
double q;
double m;
double q_over_m;
// Both arguments are defaulted (q_ = -1, m_ = 1), so this also serves as the
// default constructor. q_over_m is computed from the members q and m, which
// are declared before it and are therefore already initialized at that point.
KOKKOS_INLINE_FUNCTION
ParticleType(double q_ = -1, double m_ = 1) : q(q_), m(m_), q_over_m(q / m) {}
protected:
};
typedef Kokkos::DualView<ParticleType[10]> ParticleTypes;
// Entry point of the DualView example: initializes Kokkos, builds the idx,
// dest and src DualViews, fills idx on the host, then times the localsum
// functor twice on view_type::execution_space and twice on
// Kokkos::HostSpace::execution_space and prints the four timings.
int main (int narg, char* arg[]) {
Kokkos::initialize (narg, arg);
typedef Kokkos::DualView<ParticleType[10]> ParticleTypes;
int main(int narg, char* arg[]) {
Kokkos::initialize(narg, arg);
// If View is non-trivial constructible type then add braces so it is out of scope
// before Kokkos::finalize() call
{
ParticleTypes test("Test");
Kokkos::fence();
test.h_view(0) = ParticleType(-1e4,1);
Kokkos::fence();
// If View is non-trivial constructible type then add braces so it is out of
// scope before Kokkos::finalize() call
{
ParticleTypes test("Test");
Kokkos::fence();
test.h_view(0) = ParticleType(-1e4, 1);
Kokkos::fence();
int size = 1000000;
int size = 1000000;
// Create DualViews. This will allocate on both the device and its
// host_mirror_device.
idx_type idx ("Idx",size,64);
view_type dest ("Dest",size);
view_type src ("Src",size);
// Create DualViews. This will allocate on both the device and its
// host_mirror_device.
idx_type idx("Idx", size, 64);
view_type dest("Dest", size);
view_type src("Src", size);
// Fixed seed, so the rand() sequence used to fill idx below is the same on
// every run of the same binary.
srand(134231);
srand (134231);
// Get a reference to the host view of idx directly (equivalent to
// idx.view<idx_type::host_mirror_space>() )
idx_type::t_host h_idx = idx.h_view;
for (int i = 0; i < size; ++i) {
for (view_type::size_type j = 0; j < h_idx.extent(1); ++j) {
h_idx(i,j) = (size + i + (rand () % 500 - 250)) % size;
// Get a reference to the host view of idx directly (equivalent to
// idx.view<idx_type::host_mirror_space>() )
idx_type::t_host h_idx = idx.h_view;
for (int i = 0; i < size; ++i) {
for (view_type::size_type j = 0; j < h_idx.extent(1); ++j) {
// rand() % 500 - 250 lies in [-250, 249]; adding size before taking the
// modulus keeps the operand non-negative, so the stored index is always
// in [0, size).
h_idx(i, j) = (size + i + (rand() % 500 - 250)) % size;
}
}
// Mark idx as modified on the host_mirror_space so that a
// sync to the device will actually move data. The sync happens in
// the functor's constructor.
idx.modify<idx_type::host_mirror_space>();
// Run on the device. This will cause a sync of idx to the device,
// since it was marked as modified on the host.
Kokkos::Timer timer;
Kokkos::parallel_for(size,
localsum<view_type::execution_space>(idx, dest, src));
Kokkos::fence();
double sec1_dev = timer.seconds();
timer.reset();
Kokkos::parallel_for(size,
localsum<view_type::execution_space>(idx, dest, src));
Kokkos::fence();
double sec2_dev = timer.seconds();
// Run on the host's default execution space (could be the same as device).
// This will cause a sync back to the host of dest. Note that if the Device
// is CUDA, the data layout will not be optimal on host, so performance is
// lower than what it would be for a pure host compilation.
timer.reset();
Kokkos::parallel_for(
size, localsum<Kokkos::HostSpace::execution_space>(idx, dest, src));
Kokkos::fence();
double sec1_host = timer.seconds();
timer.reset();
Kokkos::parallel_for(
size, localsum<Kokkos::HostSpace::execution_space>(idx, dest, src));
Kokkos::fence();
double sec2_host = timer.seconds();
printf("Device Time with Sync: %f without Sync: %f \n", sec1_dev, sec2_dev);
printf("Host Time with Sync: %f without Sync: %f \n", sec1_host,
sec2_host);
}
// Mark idx as modified on the host_mirror_space so that a
// sync to the device will actually move data. The sync happens in
// the functor's constructor.
idx.modify<idx_type::host_mirror_space> ();
// Run on the device. This will cause a sync of idx to the device,
// since it was marked as modified on the host.
Kokkos::Timer timer;
Kokkos::parallel_for(size,localsum<view_type::execution_space>(idx,dest,src));
Kokkos::fence();
double sec1_dev = timer.seconds();
timer.reset();
Kokkos::parallel_for(size,localsum<view_type::execution_space>(idx,dest,src));
Kokkos::fence();
double sec2_dev = timer.seconds();
// Run on the host's default execution space (could be the same as device).
// This will cause a sync back to the host of dest. Note that if the Device is CUDA,
// the data layout will not be optimal on host, so performance is
// lower than what it would be for a pure host compilation.
timer.reset();
Kokkos::parallel_for(size,localsum<Kokkos::HostSpace::execution_space>(idx,dest,src));
Kokkos::fence();
double sec1_host = timer.seconds();
timer.reset();
Kokkos::parallel_for(size,localsum<Kokkos::HostSpace::execution_space>(idx,dest,src));
Kokkos::fence();
double sec2_host = timer.seconds();
printf("Device Time with Sync: %f without Sync: %f \n",sec1_dev,sec2_dev);
printf("Host Time with Sync: %f without Sync: %f \n",sec1_host,sec2_host);
}
Kokkos::finalize();
}

View File

@ -2,12 +2,10 @@
INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR})
INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
IF (Kokkos_ENABLE_Cuda_UVM)
# This is a tutorial, not a test, so we don't ask CTest to run it.
TRIBITS_ADD_EXECUTABLE(
tutorial_advancedviews_05_nvidia_uvm
SOURCES uvm_example.cpp
COMM serial mpi
DEPLIBS kokkoscontainers kokkoscore
)
IF (Kokkos_ENABLE_CUDA_UVM)
# This is a tutorial, not a test, so we don't ask CTest to run it.
KOKKOS_ADD_EXECUTABLE(
tutorial_advancedviews_05_nvidia_uvm
SOURCES uvm_example.cpp
)
ENDIF ()

View File

@ -1,13 +1,14 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
//
// Kokkos v. 3.0
// Copyright (2020) National Technology & Engineering
// Solutions of Sandia, LLC (NTESS).
//
// Under the terms of Contract DE-NA0003525 with NTESS,
// the U.S. Government retains certain rights in this software.
//
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
@ -23,10 +24,10 @@
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
@ -36,7 +37,7 @@
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
//
//
// ************************************************************************
//@HEADER
*/
@ -51,92 +52,98 @@
typedef Kokkos::View<double*, Kokkos::CudaUVMSpace> view_type;
typedef Kokkos::View<int**, Kokkos::CudaUVMSpace> idx_type;
#else
typedef Kokkos::View<double*,Kokkos::HostSpace> view_type;
typedef Kokkos::View<int**,Kokkos::HostSpace> idx_type;
typedef Kokkos::View<double*, Kokkos::HostSpace> view_type;
typedef Kokkos::View<int**, Kokkos::HostSpace> idx_type;
#endif
// Functor used with Kokkos::parallel_for in this example. For each index i it
// reads src at the positions listed in row i of idx and accumulates a value
// derived from those reads into dest(i).
// The Device template parameter only selects the execution space; the view
// types themselves come from the file-level idx_type / view_type typedefs.
template<class Device>
template <class Device>
struct localsum {
// Define the execution space for the functor (overrides the DefaultExecutionSpace)
// Define the execution space for the functor (overrides the
// DefaultExecutionSpace)
typedef Device execution_space;
// Get the view types on the particular device the functor is instantiated for
idx_type::const_type idx;
view_type dest;
// src is declared with a const data type and the MemoryRandomAccess trait;
// operator() only ever reads from it.
Kokkos::View<view_type::const_data_type, view_type::array_layout, view_type::device_type, Kokkos::MemoryRandomAccess > src;
Kokkos::View<view_type::const_data_type, view_type::array_layout,
view_type::device_type, Kokkos::MemoryRandomAccess>
src;
// Stores the three views passed in; no other work is done at construction.
localsum(idx_type idx_, view_type dest_,
view_type src_):idx(idx_),dest(dest_),src(src_) {
}
localsum(idx_type idx_, view_type dest_, view_type src_)
: idx(idx_), dest(dest_), src(src_) {}
// For row i: for every column j of idx, load val = src(idx(i, j)) and add
// val*val + 0.5*(idx.extent(0)*val - idx.extent(1)*val) to a local
// accumulator, then add the accumulated total to dest(i).
KOKKOS_INLINE_FUNCTION
void operator() (int i) const {
void operator()(int i) const {
double tmp = 0.0;
for(int j = 0; j < int(idx.extent(1)); j++) {
const double val = src(idx(i,j));
tmp += val*val + 0.5*(idx.extent(0)*val -idx.extent(1)*val);
for (int j = 0; j < int(idx.extent(1)); j++) {
const double val = src(idx(i, j));
tmp += val * val + 0.5 * (idx.extent(0) * val - idx.extent(1) * val);
}
dest(i) += tmp;
}
};
// Entry point of the UVM example: initializes Kokkos, creates the idx, dest
// and src views, fills idx directly from host code, then times the localsum
// functor twice on view_type::execution_space and twice on
// Kokkos::HostSpace::execution_space and prints the four timings.
int main(int narg, char* arg[]) {
Kokkos::initialize(narg,arg);
Kokkos::initialize(narg, arg);
// The views live in this inner scope, which closes before Kokkos::finalize().
{
int size = 1000000;
// Create Views
idx_type idx("Idx",size,64);
view_type dest("Dest",size);
view_type src("Src",size);
idx_type idx("Idx", size, 64);
view_type dest("Dest", size);
view_type src("Src", size);
srand(134231);
Kokkos::fence();
// When using UVM Cuda views can be accessed on the Host directly
for(int i=0; i<size; i++) {
for(int j=0; j<int(idx.extent(1)); j++)
idx(i,j) = (size + i + (rand()%500 - 250))%size;
// rand() % 500 - 250 lies in [-250, 249]; adding size before taking the
// modulus keeps the operand non-negative, so the stored index is always in
// [0, size).
for (int i = 0; i < size; i++) {
for (int j = 0; j < int(idx.extent(1)); j++)
idx(i, j) = (size + i + (rand() % 500 - 250)) % size;
}
Kokkos::fence();
// Run on the device
// This will cause a sync of idx to the device since it was modified on the host
// This will cause a sync of idx to the device since it was modified on the
// host
Kokkos::Timer timer;
Kokkos::parallel_for(size,localsum<view_type::execution_space>(idx,dest,src));
Kokkos::parallel_for(size,
localsum<view_type::execution_space>(idx, dest, src));
Kokkos::fence();
double sec1_dev = timer.seconds();
// No data transfer will happen now, since nothing is accessed on the host
timer.reset();
Kokkos::parallel_for(size,localsum<view_type::execution_space>(idx,dest,src));
Kokkos::parallel_for(size,
localsum<view_type::execution_space>(idx, dest, src));
Kokkos::fence();
double sec2_dev = timer.seconds();
// Run on the host
// This will cause a sync back to the host of dest which was changed on the device
// Compare runtime here with the dual_view example: dest will be copied back in 4k blocks
// when they are accessed the first time during the parallel_for. Due to the latency of a memcpy
// this gives lower effective bandwidth when doing a manual copy via dual views
// This will cause a sync back to the host of dest which was changed on the
// device.
// Compare runtime here with the dual_view example: dest will be copied back
// in 4k blocks when they are accessed the first time during the
// parallel_for. Due to the latency of a memcpy this gives lower effective
// bandwidth when doing a manual copy via dual views
timer.reset();
Kokkos::parallel_for(size,localsum<Kokkos::HostSpace::execution_space>(idx,dest,src));
Kokkos::parallel_for(
size, localsum<Kokkos::HostSpace::execution_space>(idx, dest, src));
Kokkos::fence();
double sec1_host = timer.seconds();
// No data transfers will happen now
timer.reset();
Kokkos::parallel_for(size,localsum<Kokkos::HostSpace::execution_space>(idx,dest,src));
Kokkos::parallel_for(
size, localsum<Kokkos::HostSpace::execution_space>(idx, dest, src));
Kokkos::fence();
double sec2_host = timer.seconds();
printf("Device Time with Sync: %e without Sync: %e \n",sec1_dev,sec2_dev);
printf("Host Time with Sync: %e without Sync: %e \n",sec1_host,sec2_host);
printf("Device Time with Sync: %e without Sync: %e \n", sec1_dev, sec2_dev);
printf("Host Time with Sync: %e without Sync: %e \n", sec1_host,
sec2_host);
}
Kokkos::finalize();
}

View File

@ -2,10 +2,11 @@
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
// Kokkos v. 3.0
// Copyright (2020) National Technology & Engineering
// Solutions of Sandia, LLC (NTESS).
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// Under the terms of Contract DE-NA0003525 with NTESS,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
@ -23,10 +24,10 @@
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
@ -49,100 +50,100 @@
// Functor that writes one constant into every element it is invoked for:
// operator()(i) sets a(i) = value. The target is a 1-D LayoutLeft View in
// Kokkos::CudaSpace.
struct FillDevice {
double value;
Kokkos::View<double*,Kokkos::LayoutLeft,Kokkos::CudaSpace> a;
FillDevice(const double& val, const Kokkos::View<double*,Kokkos::LayoutLeft,Kokkos::CudaSpace>& d_a):
value(val),a(d_a){}
Kokkos::View<double*, Kokkos::LayoutLeft, Kokkos::CudaSpace> a;
// val is the fill value; d_a is the View to be filled.
FillDevice(
const double& val,
const Kokkos::View<double*, Kokkos::LayoutLeft, Kokkos::CudaSpace>& d_a)
: value(val), a(d_a) {}
KOKKOS_INLINE_FUNCTION
void operator() (const int& i) const {
a(i) = value;
}
void operator()(const int& i) const { a(i) = value; }
};
// Functor that repeatedly accumulates a power of b(i) into a(i). Both a and b
// are 1-D LayoutLeft Views in Kokkos::CudaSpace.
struct ComputeADevice {
int iter;
Kokkos::View<double*,Kokkos::LayoutLeft,Kokkos::CudaSpace> a;
Kokkos::View<double*,Kokkos::LayoutLeft,Kokkos::CudaSpace> b;
ComputeADevice(const int& iter_,
const Kokkos::View<double*,Kokkos::LayoutLeft,Kokkos::CudaSpace>& d_a,
const Kokkos::View<double*,Kokkos::LayoutLeft,Kokkos::CudaSpace>& d_b):
iter(iter_),a(d_a),b(d_b){}
Kokkos::View<double*, Kokkos::LayoutLeft, Kokkos::CudaSpace> a;
Kokkos::View<double*, Kokkos::LayoutLeft, Kokkos::CudaSpace> b;
// iter_ controls both the loop bound and the exponent used in operator();
// d_a is the accumulation target and d_b the input.
ComputeADevice(
const int& iter_,
const Kokkos::View<double*, Kokkos::LayoutLeft, Kokkos::CudaSpace>& d_a,
const Kokkos::View<double*, Kokkos::LayoutLeft, Kokkos::CudaSpace>& d_b)
: iter(iter_), a(d_a), b(d_b) {}
// The loop starts at j = 1, so the update runs iter - 1 times (not at all
// when iter <= 1). The added term pow(b(i), 1.0 + 1.0 / iter) does not depend
// on j; 1.0 / iter is a floating-point division.
KOKKOS_INLINE_FUNCTION
void operator() (const int& i) const {
for(int j=1;j<iter;j++) {
a(i) += std::pow(b(i),1.0+1.0/iter);
void operator()(const int& i) const {
for (int j = 1; j < iter; j++) {
a(i) += std::pow(b(i), 1.0 + 1.0 / iter);
}
}
};
// Functor that adds b(i) to a(i) element-wise. Both a and b are 1-D
// LayoutLeft Views in Kokkos::CudaHostPinnedSpace.
struct ComputeAHost {
Kokkos::View<double*,Kokkos::LayoutLeft,Kokkos::CudaHostPinnedSpace> a;
Kokkos::View<double*,Kokkos::LayoutLeft,Kokkos::CudaHostPinnedSpace> b;
ComputeAHost( const Kokkos::View<double*,Kokkos::LayoutLeft,Kokkos::CudaHostPinnedSpace>& d_a,
const Kokkos::View<double*,Kokkos::LayoutLeft,Kokkos::CudaHostPinnedSpace>& d_b):
a(d_a),b(d_b){}
Kokkos::View<double*, Kokkos::LayoutLeft, Kokkos::CudaHostPinnedSpace> a;
Kokkos::View<double*, Kokkos::LayoutLeft, Kokkos::CudaHostPinnedSpace> b;
// Note: the parameters are named d_a / d_b like in the device functors, but
// their type is a CudaHostPinnedSpace View.
ComputeAHost(const Kokkos::View<double*, Kokkos::LayoutLeft,
Kokkos::CudaHostPinnedSpace>& d_a,
const Kokkos::View<double*, Kokkos::LayoutLeft,
Kokkos::CudaHostPinnedSpace>& d_b)
: a(d_a), b(d_b) {}
KOKKOS_INLINE_FUNCTION
void operator() (const int& i) const {
a(i) += b(i);
}
void operator()(const int& i) const { a(i) += b(i); }
};
// Functor that adds b(i) to a(i) element-wise. Both a and b are 1-D
// LayoutLeft Views in Kokkos::CudaSpace.
struct MergeDevice {
Kokkos::View<double*,Kokkos::LayoutLeft,Kokkos::CudaSpace> a;
Kokkos::View<double*,Kokkos::LayoutLeft,Kokkos::CudaSpace> b;
Kokkos::View<double*, Kokkos::LayoutLeft, Kokkos::CudaSpace> a;
Kokkos::View<double*, Kokkos::LayoutLeft, Kokkos::CudaSpace> b;
// d_a is the accumulation target; d_b is the View whose values are added.
MergeDevice(
const Kokkos::View<double*,Kokkos::LayoutLeft,Kokkos::CudaSpace>& d_a,
const Kokkos::View<double*,Kokkos::LayoutLeft,Kokkos::CudaSpace>& d_b):
a(d_a),b(d_b){}
const Kokkos::View<double*, Kokkos::LayoutLeft, Kokkos::CudaSpace>& d_a,
const Kokkos::View<double*, Kokkos::LayoutLeft, Kokkos::CudaSpace>& d_b)
: a(d_a), b(d_b) {}
KOKKOS_INLINE_FUNCTION
void operator() (const int& i) const {
a(i) += b(i);
}
void operator()(const int& i) const { a(i) += b(i); }
};
// Entry point of the device/host overlap example. It fills two device views,
// launches ComputeADevice on Kokkos::Cuda, copies d_b to pinned host memory,
// runs ComputeAHost on Kokkos::OpenMP, copies the host result back into d_tmp,
// prints the elapsed time, merges d_tmp into d_a and prints h_a(0).
//
// The first command-line argument selects the copy variant:
//   1 - deep_copy overload taking an execution-space instance
//       (Kokkos::OpenMP())
//   2 - deep_copy overload without an execution-space instance
//   any other value - neither copy is performed
// The printed reference value 27.4154 matches 19 * pow(1.3513, 1.05) + 1.3513,
// i.e. the result when both copies are performed (synch is 1 or 2).
//
// NOTE(review): argv[1] is read without checking argc; running the program
// with no argument dereferences a null argv[1] - consider validating argc.
// NOTE(review): Kokkos::initialize() is called without argc/argv here, unlike
// the other examples in this file - confirm this is intended.
// NOTE(review): the Views declared below are still in scope when
// Kokkos::finalize() runs; the dual_view example wraps its views in a brace
// scope for that reason - confirm whether the same is needed here.
int main(int argc, char * argv[]) {
int main(int argc, char* argv[]) {
int size = 100000000;
Kokkos::initialize();
int synch = atoi(argv[1]);
Kokkos::View<double*,Kokkos::LayoutLeft,Kokkos::CudaSpace> d_a("Device A",size);
Kokkos::View<double*,Kokkos::LayoutLeft,Kokkos::CudaSpace> d_b("Device B",size);
Kokkos::View<double*,Kokkos::LayoutLeft,Kokkos::CudaSpace> d_tmp("Device tmp",size);
Kokkos::View<double*,Kokkos::LayoutLeft,Kokkos::CudaHostPinnedSpace> h_a("Host A",size);
Kokkos::View<double*,Kokkos::LayoutLeft,Kokkos::CudaHostPinnedSpace> h_b("Host B",size);
Kokkos::View<double*, Kokkos::LayoutLeft, Kokkos::CudaSpace> d_a("Device A",
size);
Kokkos::View<double*, Kokkos::LayoutLeft, Kokkos::CudaSpace> d_b("Device B",
size);
Kokkos::View<double*, Kokkos::LayoutLeft, Kokkos::CudaSpace> d_tmp(
"Device tmp", size);
Kokkos::View<double*, Kokkos::LayoutLeft, Kokkos::CudaHostPinnedSpace> h_a(
"Host A", size);
Kokkos::View<double*, Kokkos::LayoutLeft, Kokkos::CudaHostPinnedSpace> h_b(
"Host B", size);
Kokkos::parallel_for(Kokkos::RangePolicy<Kokkos::Cuda>(0,size),FillDevice(0.0,d_a));
Kokkos::parallel_for(Kokkos::RangePolicy<Kokkos::Cuda>(0,size),FillDevice(1.3513,d_b));
// Initial state: d_a is all 0.0 and d_b is all 1.3513.
Kokkos::parallel_for(Kokkos::RangePolicy<Kokkos::Cuda>(0, size),
FillDevice(0.0, d_a));
Kokkos::parallel_for(Kokkos::RangePolicy<Kokkos::Cuda>(0, size),
FillDevice(1.3513, d_b));
Kokkos::fence();
// Everything from here up to the "Time" print below is inside the timed
// region.
Kokkos::Timer timer;
Kokkos::parallel_for(Kokkos::RangePolicy<Kokkos::Cuda>(0,size),ComputeADevice(20,d_a,d_b));
Kokkos::parallel_for(Kokkos::RangePolicy<Kokkos::Cuda>(0, size),
ComputeADevice(20, d_a, d_b));
if(synch==1)
Kokkos::deep_copy(Kokkos::OpenMP(),h_b,d_b);
if(synch==2)
Kokkos::deep_copy(h_b,d_b);
if (synch == 1) Kokkos::deep_copy(Kokkos::OpenMP(), h_b, d_b);
if (synch == 2) Kokkos::deep_copy(h_b, d_b);
Kokkos::parallel_for(Kokkos::RangePolicy<Kokkos::OpenMP>(0,size),[=] (const int& i) {
h_a(i) = 0.0;
});
Kokkos::parallel_for(Kokkos::RangePolicy<Kokkos::OpenMP>(0,size),ComputeAHost(h_a,h_b));
// Host side: zero h_a, then add h_b into it, so h_a ends up equal to h_b.
Kokkos::parallel_for(Kokkos::RangePolicy<Kokkos::OpenMP>(0, size),
[=](const int& i) { h_a(i) = 0.0; });
Kokkos::parallel_for(Kokkos::RangePolicy<Kokkos::OpenMP>(0, size),
ComputeAHost(h_a, h_b));
Kokkos::OpenMP().fence();
if(synch==1)
Kokkos::deep_copy(Kokkos::OpenMP(), d_tmp,h_a);
if(synch==2)
Kokkos::deep_copy(d_tmp,h_a);
if (synch == 1) Kokkos::deep_copy(Kokkos::OpenMP(), d_tmp, h_a);
if (synch == 2) Kokkos::deep_copy(d_tmp, h_a);
Kokkos::fence();
std::cout << "Time " << timer.seconds() << std::endl;
Kokkos::parallel_for(Kokkos::RangePolicy<Kokkos::Cuda>(0,size),MergeDevice(d_a,d_tmp));
// Combine the host contribution (d_tmp) with the device result (d_a), then
// bring the final values back to the host for printing.
Kokkos::parallel_for(Kokkos::RangePolicy<Kokkos::Cuda>(0, size),
MergeDevice(d_a, d_tmp));
Kokkos::deep_copy(h_a,d_a);
Kokkos::deep_copy(h_a, d_a);
std::cout << "h_a(0): " << h_a(0) << " ( Correct: 27.4154 )" << std::endl;
Kokkos::finalize();
}

View File

@ -1,9 +1,9 @@
TRIBITS_ADD_EXAMPLE_DIRECTORIES(01_data_layouts)
TRIBITS_ADD_EXAMPLE_DIRECTORIES(02_memory_traits)
TRIBITS_ADD_EXAMPLE_DIRECTORIES(03_subviews)
TRIBITS_ADD_EXAMPLE_DIRECTORIES(04_dualviews)
KOKKOS_ADD_EXAMPLE_DIRECTORIES(01_data_layouts)
KOKKOS_ADD_EXAMPLE_DIRECTORIES(02_memory_traits)
KOKKOS_ADD_EXAMPLE_DIRECTORIES(03_subviews)
KOKKOS_ADD_EXAMPLE_DIRECTORIES(04_dualviews)
IF (Kokkos_ENABLE_Cuda_UVM)
TRIBITS_ADD_EXAMPLE_DIRECTORIES(05_NVIDIA_UVM)
IF (Kokkos_ENABLE_CUDA_UVM)
KOKKOS_ADD_EXAMPLE_DIRECTORIES(05_NVIDIA_UVM)
ENDIF ()

View File

@ -1,13 +1,14 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
//
// Kokkos v. 3.0
// Copyright (2020) National Technology & Engineering
// Solutions of Sandia, LLC (NTESS).
//
// Under the terms of Contract DE-NA0003525 with NTESS,
// the U.S. Government retains certain rights in this software.
//
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
@ -23,10 +24,10 @@
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
@ -36,7 +37,7 @@
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
//
//
// ************************************************************************
//@HEADER
*/
@ -49,106 +50,109 @@
typedef Kokkos::HostSpace::execution_space DefaultHostType;
// Kokkos provides two different random number generators with a 64 bit and a 1024 bit state.
// These generators are based on Vigna, Sebastiano (2014). "An experimental exploration of Marsaglia's xorshift generators, scrambled"
// See: http://arxiv.org/abs/1402.6246
// The generators can be used fully independently on each thread and have been tested to
// produce good statistics for both inter and intra thread numbers.
// Note that within a kernel NO random number operations are (team) collective operations.
// Everything can be called within branches. This is a difference to the curand library where
// certain operations are required to be called by all threads in a block.
// Kokkos provides two different random number generators with a 64 bit and a
// 1024 bit state. These generators are based on Vigna, Sebastiano (2014). "An
// experimental exploration of Marsaglia's xorshift generators, scrambled".
// See: http://arxiv.org/abs/1402.6246
// The generators can be used fully independently on each thread and have been
// tested to produce good statistics for both inter and intra thread numbers.
// Note that within a kernel NO random number operations are (team) collective
// operations. Everything can be called within branches. This is a difference
// to the curand library where certain operations are required to be called by
// all threads in a block.
//
// In Kokkos you are required to create a pool of generator states, so that threads can
// grab their own. On CPU architectures the pool size is equal to the thread number,
// on CUDA about 128k states are generated (enough to give every potentially simultaneously
// running thread its own state). Within a kernel a thread is required to acquire a state from the
// pool and later return it.
// On CPUs the Random number generator is deterministic if using the same number of threads.
// On GPUs (i.e. using the CUDA backend) it is not deterministic because threads acquire states via
// atomics.
// In Kokkos you are required to create a pool of generator states, so that
// threads can grab their own. On CPU architectures the pool size is equal to
// the thread number, on CUDA about 128k states are generated (enough to give
// every potentially simultaneously running thread its own state). Within a
// kernel a thread is required to acquire a state from the pool and later
// return it. On CPUs the Random number generator is deterministic if using the
// same number of threads. On GPUs (i.e. using the CUDA backend) it is not
// deterministic because threads acquire states via atomics.
// A Functor for generating uint64_t random numbers templated on the GeneratorPool type
template<class GeneratorPool>
// A Functor for generating uint64_t random numbers templated on the
// GeneratorPool type.
// Each invocation operator()(i) writes `samples` consecutive values into
// vals, starting at element i * samples, so vals must hold at least
// (number of work items) * samples entries.
template <class GeneratorPool>
struct generate_random {
// Output View for the random numbers
Kokkos::View<uint64_t*> vals;
// The GeneratorPool
GeneratorPool rand_pool;
// Number of values generated per work item
int samples;
// Initialize all members
generate_random(Kokkos::View<uint64_t*> vals_,
GeneratorPool rand_pool_,
int samples_):
vals(vals_),rand_pool(rand_pool_),samples(samples_) {}
generate_random(Kokkos::View<uint64_t*> vals_, GeneratorPool rand_pool_,
int samples_)
: vals(vals_), rand_pool(rand_pool_), samples(samples_) {}
KOKKOS_INLINE_FUNCTION
void operator() (int i) const {
void operator()(int i) const {
// Get a random number state from the pool for the active thread
typename GeneratorPool::generator_type rand_gen = rand_pool.get_state();
// Draw samples numbers from the pool as urand64 between 0 and rand_pool.MAX_URAND64
// Note there are function calls to get other type of scalars, and also to specify
// Ranges or get a normal distributed float.
for(int k = 0;k<samples;k++)
vals(i*samples+k) = rand_gen.urand64();
// Draw samples numbers from the pool as urand64 between 0 and
// rand_pool.MAX_URAND64. Note there are function calls to get other type of
// scalars, and also to specify Ranges or get a normal distributed float.
// NOTE(review): i * samples + k is evaluated in int - confirm the product
// stays within int range for the sizes callers pass.
for (int k = 0; k < samples; k++)
vals(i * samples + k) = rand_gen.urand64();
// Give the state back, which will allow another thread to acquire it
// Give the state back, which will allow another thread to acquire it
rand_pool.free_state(rand_gen);
}
};
// Benchmark driver for the XorShift64 and XorShift1024 random number pools.
// Expects exactly two integer command-line arguments: args[1] is `size` (the
// parallel_for iteration count) and args[2] is `samples` (the number of values
// each iteration draws). With any other argument count it only prints a usage
// message. The function returns 0 in both cases.
// NOTE(review): in this listing every reformatted statement appears in two
// spellings, so `size`, `samples`, `timer`, `time_64`, ... are each declared
// twice in the same scope; only one spelling of each pair can remain in a
// compilable file.
int main(int argc, char* args[]) {
if (argc != 3){
printf("Please pass two integers on the command line\n");
}
else {
if (argc != 3) {
printf("Please pass two integers on the command line\n");
} else {
// Initialize Kokkos
Kokkos::initialize(argc, args);
// atoi reports no errors: a non-numeric argument simply yields 0.
int size = atoi(args[1]);
int samples = atoi(args[2]);
// Initialize Kokkos
Kokkos::initialize(argc,args);
int size = atoi(args[1]);
int samples = atoi(args[2]);
// Create two random number generator pools one for 64bit states and one for
// 1024 bit states Both take an 64 bit unsigned integer seed to initialize a
// Random_XorShift64 generator which is used to fill the generators of the
// pool.
Kokkos::Random_XorShift64_Pool<> rand_pool64(5374857);
Kokkos::Random_XorShift1024_Pool<> rand_pool1024(5374857);
Kokkos::DualView<uint64_t*> vals("Vals", size * samples);
// Create two random number generator pools one for 64bit states and one for 1024 bit states
// Both take an 64 bit unsigned integer seed to initialize a Random_XorShift64 generator which
// is used to fill the generators of the pool.
Kokkos::Random_XorShift64_Pool<> rand_pool64(5374857);
Kokkos::Random_XorShift1024_Pool<> rand_pool1024(5374857);
Kokkos::DualView<uint64_t*> vals("Vals",size*samples);
// Run some performance comparisons
Kokkos::Timer timer;
Kokkos::parallel_for(size,
generate_random<Kokkos::Random_XorShift64_Pool<> >(
vals.d_view, rand_pool64, samples));
Kokkos::fence();
// Run some performance comparisons
Kokkos::Timer timer;
Kokkos::parallel_for(size,generate_random<Kokkos::Random_XorShift64_Pool<> >(vals.d_view,rand_pool64,samples));
Kokkos::fence();
// Restart the clock here so that only the launch below is measured; the
// launch above is excluded from the timing (presumably a warm-up run).
timer.reset();
Kokkos::parallel_for(size,
generate_random<Kokkos::Random_XorShift64_Pool<> >(
vals.d_view, rand_pool64, samples));
Kokkos::fence();
double time_64 = timer.seconds();
timer.reset();
Kokkos::parallel_for(size,generate_random<Kokkos::Random_XorShift64_Pool<> >(vals.d_view,rand_pool64,samples));
Kokkos::fence();
double time_64 = timer.seconds();
// Same untimed-then-timed sequence for the 1024 bit pool.
Kokkos::parallel_for(size,
generate_random<Kokkos::Random_XorShift1024_Pool<> >(
vals.d_view, rand_pool1024, samples));
Kokkos::fence();
Kokkos::parallel_for(size,generate_random<Kokkos::Random_XorShift1024_Pool<> >(vals.d_view,rand_pool1024,samples));
Kokkos::fence();
timer.reset();
Kokkos::parallel_for(size,
generate_random<Kokkos::Random_XorShift1024_Pool<> >(
vals.d_view, rand_pool1024, samples));
Kokkos::fence();
double time_1024 = timer.seconds();
timer.reset();
Kokkos::parallel_for(size,generate_random<Kokkos::Random_XorShift1024_Pool<> >(vals.d_view,rand_pool1024,samples));
Kokkos::fence();
double time_1024 = timer.seconds();
// Output columns: elapsed seconds, then 1e-9 * samples * size / seconds,
// i.e. billions of generated values per second.
printf("#Time XorShift64*: %e %e\n", time_64,
1.0e-9 * samples * size / time_64);
printf("#Time XorShift1024*: %e %e\n", time_1024,
1.0e-9 * samples * size / time_1024);
printf("#Time XorShift64*: %e %e\n",time_64,1.0e-9*samples*size/time_64 );
printf("#Time XorShift1024*: %e %e\n",time_1024,1.0e-9*samples*size/time_1024 );
// Copy the generated values from vals.d_view into vals.h_view.
Kokkos::deep_copy(vals.h_view, vals.d_view);
Kokkos::deep_copy(vals.h_view,vals.d_view);
Kokkos::finalize();
Kokkos::finalize();
}
return 0;
}

View File

@ -1,17 +1,17 @@
# Lists the tutorial subdirectories that are added as example directories.
# NOTE(review): each entry appears with both the TRIBITS_ and the KOKKOS_
# spelling of the call; presumably the KOKKOS_ADD_EXAMPLE_DIRECTORIES lines
# replace the TRIBITS_ADD_EXAMPLE_DIRECTORIES ones -- confirm before building.
TRIBITS_ADD_EXAMPLE_DIRECTORIES(01_hello_world)
TRIBITS_ADD_EXAMPLE_DIRECTORIES(02_simple_reduce)
TRIBITS_ADD_EXAMPLE_DIRECTORIES(03_simple_view)
TRIBITS_ADD_EXAMPLE_DIRECTORIES(04_simple_memoryspaces)
TRIBITS_ADD_EXAMPLE_DIRECTORIES(05_simple_atomics)
TRIBITS_ADD_EXAMPLE_DIRECTORIES(06_simple_mdrangepolicy)
TRIBITS_ADD_EXAMPLE_DIRECTORIES(Advanced_Views)
TRIBITS_ADD_EXAMPLE_DIRECTORIES(Hierarchical_Parallelism)
KOKKOS_ADD_EXAMPLE_DIRECTORIES(01_hello_world)
KOKKOS_ADD_EXAMPLE_DIRECTORIES(02_simple_reduce)
KOKKOS_ADD_EXAMPLE_DIRECTORIES(03_simple_view)
KOKKOS_ADD_EXAMPLE_DIRECTORIES(04_simple_memoryspaces)
KOKKOS_ADD_EXAMPLE_DIRECTORIES(05_simple_atomics)
KOKKOS_ADD_EXAMPLE_DIRECTORIES(06_simple_mdrangepolicy)
KOKKOS_ADD_EXAMPLE_DIRECTORIES(Advanced_Views)
KOKKOS_ADD_EXAMPLE_DIRECTORIES(Hierarchical_Parallelism)
# The *_lambda variants of the tutorials are only added when
# Kokkos_ENABLE_CXX11 is set.
IF (Kokkos_ENABLE_CXX11)
TRIBITS_ADD_EXAMPLE_DIRECTORIES(01_hello_world_lambda)
TRIBITS_ADD_EXAMPLE_DIRECTORIES(02_simple_reduce_lambda)
TRIBITS_ADD_EXAMPLE_DIRECTORIES(03_simple_view_lambda)
KOKKOS_ADD_EXAMPLE_DIRECTORIES(01_hello_world_lambda)
KOKKOS_ADD_EXAMPLE_DIRECTORIES(02_simple_reduce_lambda)
KOKKOS_ADD_EXAMPLE_DIRECTORIES(03_simple_view_lambda)
ENDIF ()

View File

@ -3,8 +3,7 @@ INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR})
INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
# This is a tutorial, not a test, so we don't ask CTest to run it.
TRIBITS_ADD_EXECUTABLE(
KOKKOS_ADD_EXECUTABLE(
tutorial_hierarchicalparallelism_01_thread_teams
SOURCES thread_teams.cpp
COMM serial mpi
)
)

View File

@ -1,13 +1,14 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
//
// Kokkos v. 3.0
// Copyright (2020) National Technology & Engineering
// Solutions of Sandia, LLC (NTESS).
//
// Under the terms of Contract DE-NA0003525 with NTESS,
// the U.S. Government retains certain rights in this software.
//
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
@ -23,10 +24,10 @@
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
@ -36,7 +37,7 @@
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
//
//
// ************************************************************************
//@HEADER
*/
@ -51,16 +52,17 @@
// to identify a thread uniquely and some team related function calls such as a
// barrier (which will be used in a subsequent example).
// A ThreadTeam consists of 1 to n threads where the maximum value of n is
// determined by the hardware. On a dual socket CPU machine with 8 cores per socket
// the maximum size of a team is 8. The number of teams (i.e. the league_size) is
// not limited by physical constraints. It is a purely logical number.
// determined by the hardware. On a dual socket CPU machine with 8 cores per
// socket the maximum size of a team is 8. The number of teams (i.e. the
// league_size) is not limited by physical constraints. It is a purely logical
// number.
typedef Kokkos::TeamPolicy<> team_policy ;
typedef team_policy::member_type team_member ;
typedef Kokkos::TeamPolicy<> team_policy;
typedef team_policy::member_type team_member;
// Define a functor which can be launched using the TeamPolicy
struct hello_world {
typedef int value_type; //Specify value type for reduction target, sum
typedef int value_type; // Specify value type for reduction target, sum
// This is a reduction operator which now takes as first argument the
// TeamPolicy member_type. Every member of the team contributes to the
@ -68,36 +70,39 @@ struct hello_world {
// It is helpful to think of this operator as a parallel region for a team
// (i.e. every team member is active and will execute the code).
KOKKOS_INLINE_FUNCTION
void operator() ( const team_member & thread, int& sum) const {
sum+=1;
void operator()(const team_member& thread, int& sum) const {
sum += 1;
// The TeamPolicy<>::member_type provides functions to query the multi
// dimensional index of a thread as well as the number of thread-teams and the size
// of each team.
printf("Hello World: %i %i // %i %i\n",thread.league_rank(),thread.team_rank(),thread.league_size(),thread.team_size());
// dimensional index of a thread as well as the number of thread-teams and
// the size of each team.
printf("Hello World: %i %i // %i %i\n", thread.league_rank(),
thread.team_rank(), thread.league_size(), thread.team_size());
}
};
// Runs the hello_world reduction functor twice under a TeamPolicy with 12
// teams: first with an explicitly queried maximum team size (policy_a), then
// with Kokkos::AUTO as the team size (policy_b). After each reduction the
// reduced sum is printed next to 12 * the corresponding team size.
// Both reductions write into the same `sum` variable.
// NOTE(review): in this listing every reformatted statement appears in two
// spellings, so e.g. `team_size_max`, `policy_a` and `policy_b` are declared
// twice in the same scope; only one spelling of each pair can remain in a
// compilable file.
int main(int narg, char* args[]) {
Kokkos::initialize(narg,args);
Kokkos::initialize(narg, args);
// Launch 12 teams of the maximum number of threads per team
const int team_size_max = team_policy(1,1).team_size_max(hello_world(), Kokkos::ParallelReduceTag());
const team_policy policy_a( 12 , team_size_max );
const int team_size_max = team_policy(1, 1).team_size_max(
hello_world(), Kokkos::ParallelReduceTag());
const team_policy policy_a(12, team_size_max);
int sum = 0;
Kokkos::parallel_reduce( policy_a , hello_world() , sum );
Kokkos::parallel_reduce(policy_a, hello_world(), sum);
// The result will be 12*team_size_max
printf("Result A: %i == %i\n",sum, team_size_max*12);
printf("Result A: %i == %i\n", sum, team_size_max * 12);
// In practice it is often better to let Kokkos decide on the team_size
const team_policy policy_b( 12 , Kokkos::AUTO );
const team_policy policy_b(12, Kokkos::AUTO);
Kokkos::parallel_reduce( policy_b , hello_world() , sum );
// The result will be 12*policy_b.team_size_recommended( hello_world(), Kokkos::ParallelReduceTag())
const int team_size_recommended = policy_b.team_size_recommended( hello_world(), Kokkos::ParallelReduceTag());
printf("Result B: %i %i\n",sum, team_size_recommended*12);
Kokkos::parallel_reduce(policy_b, hello_world(), sum);
// The result will be 12*policy_b.team_size_recommended( hello_world(),
// Kokkos::ParallelReduceTag())
const int team_size_recommended = policy_b.team_size_recommended(
hello_world(), Kokkos::ParallelReduceTag());
printf("Result B: %i %i\n", sum, team_size_recommended * 12);
Kokkos::finalize();
}

View File

@ -2,12 +2,9 @@
INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR})
INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
IF (Kokkos_ENABLE_CXX11)
# This is a tutorial, not a test, so we don't ask CTest to run it.
TRIBITS_ADD_EXECUTABLE(
tutorial_hierarchical_01_thread_teams_lambda
SOURCES thread_teams_lambda.cpp
COMM serial mpi
)
ENDIF ()
# This is a tutorial, not a test, so we don't ask CTest to run it.
KOKKOS_ADD_EXECUTABLE(
tutorial_hierarchical_01_thread_teams_lambda
SOURCES thread_teams_lambda.cpp
)

View File

@ -2,10 +2,11 @@
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
// Kokkos v. 3.0
// Copyright (2020) National Technology & Engineering
// Solutions of Sandia, LLC (NTESS).
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// Under the terms of Contract DE-NA0003525 with NTESS,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
@ -23,10 +24,10 @@
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
@ -53,17 +54,17 @@
// some reasonable bound, which eventually depends upon the hardware
// and programming model implementation).
int main (int narg, char* args[]) {
int main(int narg, char* args[]) {
using Kokkos::parallel_reduce;
typedef Kokkos::TeamPolicy<> team_policy;
typedef typename team_policy::member_type team_member;
typedef Kokkos::TeamPolicy<> team_policy;
typedef typename team_policy::member_type team_member;
Kokkos::initialize (narg, args);
Kokkos::initialize(narg, args);
// Set up a policy that launches 12 teams, with the maximum number
// of threads per team.
const team_policy policy (12, Kokkos::AUTO);
const team_policy policy(12, Kokkos::AUTO);
// This is a reduction with a team policy. The team policy changes
// the first argument of the lambda. Rather than an integer index
@ -77,21 +78,23 @@ int main (int narg, char* args[]) {
// region." That is, every team member is active and will execute
// the body of the lambda.
int sum = 0;
// We also need to protect the usage of a lambda against compiling
// with a backend which doesn't support it (i.e. Cuda 6.5/7.0).
#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA)
parallel_reduce (policy, KOKKOS_LAMBDA (const team_member& thread, int& lsum) {
lsum += 1;
// TeamPolicy<>::member_type provides functions to query the
// multidimensional index of a thread, as well as the number of
// thread teams and the size of each team.
printf ("Hello World: %i %i // %i %i\n", thread.league_rank (),
thread.team_rank (), thread.league_size (), thread.team_size ());
}, sum);
#endif
// We also need to protect the usage of a lambda against compiling
// with a backend which doesn't support it (i.e. Cuda 6.5/7.0).
#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA)
parallel_reduce(
policy,
KOKKOS_LAMBDA(const team_member& thread, int& lsum) {
lsum += 1;
// TeamPolicy<>::member_type provides functions to query the
// multidimensional index of a thread, as well as the number of
// thread teams and the size of each team.
printf("Hello World: %i %i // %i %i\n", thread.league_rank(),
thread.team_rank(), thread.league_size(), thread.team_size());
},
sum);
#endif
// The result will be 12*team_policy::team_size_max([=]{})
printf ("Result %i\n",sum);
printf("Result %i\n", sum);
Kokkos::finalize ();
Kokkos::finalize();
}

View File

@ -3,8 +3,7 @@ INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR})
INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
# This is a tutorial, not a test, so we don't ask CTest to run it.
TRIBITS_ADD_EXECUTABLE(
KOKKOS_ADD_EXECUTABLE(
tutorial_hierarchicalparallelism_02_nested_parallel_for
SOURCES nested_parallel_for.cpp
COMM serial mpi
)
)

View File

@ -1,13 +1,14 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
//
// Kokkos v. 3.0
// Copyright (2020) National Technology & Engineering
// Solutions of Sandia, LLC (NTESS).
//
// Under the terms of Contract DE-NA0003525 with NTESS,
// the U.S. Government retains certain rights in this software.
//
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
@ -23,10 +24,10 @@
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
@ -36,7 +37,7 @@
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
//
//
// ************************************************************************
//@HEADER
*/
@ -45,46 +46,50 @@
#include <cstdio>
// See 01_thread_teams for an explanation of a basic TeamPolicy
typedef Kokkos::TeamPolicy<> team_policy ;
typedef typename team_policy::member_type team_member ;
typedef Kokkos::TeamPolicy<> team_policy;
typedef typename team_policy::member_type team_member;
// Reduction functor launched with a TeamPolicy. Every team member that runs
// operator() adds 1 to the reduction value and then takes part in a nested
// 31-iteration TeamThreadRange loop whose body prints the league rank, the
// team rank and the loop index.
// NOTE(review): in this listing the reformatted lines appear in two spellings
// (value_type, the operator() header and the nested parallel_for are each
// shown twice); only one spelling of each pair can remain in a compilable file.
struct hello_world {
typedef int value_type; //Specify value type for reduction target, sum
typedef int value_type; // Specify value type for reduction target, sum
KOKKOS_INLINE_FUNCTION
void operator() ( const team_member & thread, int& sum) const {
sum+=1;
void operator()(const team_member& thread, int& sum) const {
sum += 1;
// When using the TeamPolicy Kokkos allows for nested parallel loops.
// All three Kokkos parallel patterns are allowed (for, reduce, scan) and they
// largely follow the same syntax as on the global level.
// The execution policy for the Thread level nesting (the Vector level is in the next
// tutorial example) is Kokkos::TeamThreadRange. This means the loop will be executed
// by all members of the team and the loop count will be split between threads of the
// team. Its arguments are the team_member, and a loop count.
// Not every thread will do the same amount of iterations. On a GPU for example with
// a team_size() larger than 31 only the first 31 threads would actually do anything.
// On a CPU with 8 threads 7 would execute 4 loop iterations, and 1 thread would do
// 3. Note also that the mode of splitting the count is architecture dependent similar
// to what the RangePolicy on a global level does.
// The call itself is not guaranteed to be synchronous. Also keep in mind that the
// operator using a team_policy acts like a parallel region for the team. That means
// that everything outside of the nested parallel_for is also executed by all threads
// of the team.
Kokkos::parallel_for(Kokkos::TeamThreadRange(thread,31), [&] (const int& i) {
printf("Hello World: (%i , %i) executed loop %i \n",thread.league_rank(),thread.team_rank(),i);
});
// All three Kokkos parallel patterns are allowed (for, reduce, scan) and
// they largely follow the same syntax as on the global level. The execution
// policy for the Thread level nesting (the Vector level is in the next
// tutorial example) is Kokkos::TeamThreadRange. This means the loop will be
// executed by all members of the team and the loop count will be split
// between threads of the team. Its arguments are the team_member, and a
// loop count. Not every thread will do the same amount of iterations. On a
// GPU for example with a team_size() larger than 31 only the first 31
// threads would actually do anything. On a CPU with 8 threads 7 would
// execute 4 loop iterations, and 1 thread would do
// 3. Note also that the mode of splitting the count is architecture
// dependent similar to what the RangePolicy on a global level does. The
// call itself is not guaranteed to be synchronous. Also keep in mind that
// the operator using a team_policy acts like a parallel region for the
// team. That means that everything outside of the nested parallel_for is
// also executed by all threads of the team.
// The lambda captures by reference ([&]), so `thread` is used directly
// rather than copied into the closure.
Kokkos::parallel_for(Kokkos::TeamThreadRange(thread, 31),
[&](const int& i) {
printf("Hello World: (%i , %i) executed loop %i \n",
thread.league_rank(), thread.team_rank(), i);
});
}
};
// Queries the maximum team size for the hello_world functor, launches it as a
// parallel_reduce over 3 teams of that size, and prints the reduced sum.
// NOTE(review): in this listing the reformatted statements appear in two
// spellings (`team_size_max` and `policy` are declared twice in the same
// scope); only one spelling of each pair can remain in a compilable file.
int main(int narg, char* args[]) {
Kokkos::initialize(narg,args);
Kokkos::initialize(narg, args);
// Launch 3 teams of the maximum number of threads per team
const int team_size_max = team_policy(3,1).team_size_max( hello_world(), Kokkos::ParallelReduceTag());
const team_policy policy( 3 , team_size_max );
const int team_size_max = team_policy(3, 1).team_size_max(
hello_world(), Kokkos::ParallelReduceTag());
const team_policy policy(3, team_size_max);
int sum = 0;
Kokkos::parallel_reduce( policy , hello_world() , sum );
printf("Result %i\n",sum);
Kokkos::parallel_reduce(policy, hello_world(), sum);
printf("Result %i\n", sum);
Kokkos::finalize();
}

View File

@ -3,14 +3,8 @@ INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR})
INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
# This is a tutorial, not a test, so we don't ask CTest to run it.
IF(Kokkos_ENABLE_CXX11)
TRIBITS_ADD_EXECUTABLE(
KOKKOS_ADD_EXECUTABLE(
tutorial_hierarchicalparallelism_03_vectorization
SOURCES vectorization.cpp
COMM serial mpi
)
ENDIF()
)

View File

@ -2,10 +2,11 @@
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
// Kokkos v. 3.0
// Copyright (2020) National Technology & Engineering
// Solutions of Sandia, LLC (NTESS).
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// Under the terms of Contract DE-NA0003525 with NTESS,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
@ -23,10 +24,10 @@
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
@ -45,118 +46,131 @@
#include <Kokkos_Random.hpp>
#include <cstdio>
// The TeamPolicy actually supports 3D parallelism: Teams, Threads, Vector
// Kokkos::parallel_{for/reduce/scan} calls can be completely free nested.
// The execution policies for the nested layers are TeamThreadRange and
// ThreadVectorRange.
// The only restriction on nesting is that a given level can only be nested in a
// higher one. e.g. a ThreadVectorRange can be nested inside a TeamPolicy operator
// and inside a TeamThreadRange, but you can not nest a ThreadVectorRange or a
// TeamThreadRange inside another ThreadVectorRange.
// As with the 2D execution of TeamPolicy the operator has to be considered as
// a parallel region even with respect to VectorLanes. That means even outside
// a TeamThread or VectorThread loop all threads of a team and all vector lanes
// of a thread execute every line of the operator as long as there are no restrictions
// on them.
// Code lines can be restricted using Kokkos::single to either execute once PerThread
// or execute once PerTeam.
typedef typename Kokkos::TeamPolicy<>::member_type team_member ;
// higher one. e.g. a ThreadVectorRange can be nested inside a TeamPolicy
// operator and inside a TeamThreadRange, but you can not nest a
// ThreadVectorRange or a TeamThreadRange inside another ThreadVectorRange. As
// with the 2D execution of TeamPolicy the operator has to be considered as a
// parallel region even with respect to VectorLanes. That means even outside a
// TeamThread or VectorThread loop all threads of a team and all vector lanes of
// a thread execute every line of the operator as long as there are no
// restrictions on them. Code lines can be restricted using Kokkos::single to
// either execute once PerThread or execute once PerTeam.
typedef typename Kokkos::TeamPolicy<>::member_type team_member;
// Team-level functor over a 3D integer view `data`.
// For the team with league rank i it
//   1. counts, for every index j of dimension 1, how many entries data(i,j,k)
//      are multiples of 4 and stores that count in a team scratch array,
//   2. counts how many neighbouring pairs (j, j+1) have equal counts, and
//   3. atomically adds that number to the rank-0 view `gsum`.
// team_shmem_size() reports the scratch memory needed for the count array.
// NOTE(review): in this listing the reformatted lines appear in two spellings
// (typedefs, members, constructor, operator() and the nested parallel calls
// are each shown twice); only one spelling of each pair can remain in a
// compilable file.
struct SomeCorrelation {
typedef int value_type; //Specify value type for reduction target, sum
typedef int value_type; // Specify value type for reduction target, sum
typedef Kokkos::DefaultExecutionSpace::scratch_memory_space shared_space;
typedef Kokkos::View<int*,shared_space,Kokkos::MemoryUnmanaged> shared_1d_int;
typedef Kokkos::View<int*, shared_space, Kokkos::MemoryUnmanaged>
shared_1d_int;
Kokkos::View<const int***,Kokkos::LayoutRight> data;
Kokkos::View<const int***, Kokkos::LayoutRight> data;
Kokkos::View<int> gsum;
// Stores the input data (held as a view of const int) and the result view.
SomeCorrelation(Kokkos::View<int***,Kokkos::LayoutRight> data_in,
Kokkos::View<int> sum):data(data_in),gsum(sum){}
SomeCorrelation(Kokkos::View<int***, Kokkos::LayoutRight> data_in,
Kokkos::View<int> sum)
: data(data_in), gsum(sum) {}
KOKKOS_INLINE_FUNCTION
void operator() ( const team_member & thread) const {
void operator()(const team_member& thread) const {
// The league rank selects the index into the first dimension of data.
int i = thread.league_rank();
// Allocate a shared array for the team.
shared_1d_int count(thread.team_shmem(),data.extent(1));
shared_1d_int count(thread.team_shmem(), data.extent(1));
// With each team run a parallel_for with its threads
Kokkos::parallel_for(Kokkos::TeamThreadRange(thread,data.extent(1)), [=] (const int& j) {
int tsum;
// Run a vector loop reduction over the inner dimension of data
// Count how many values are multiples of 4
// Every vector lane gets the same reduction value (tsum) back, it is broadcast to all vector lanes
Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(thread,data.extent(2)), [=] (const int& k, int & vsum) {
vsum+= (data(i,j,k) % 4 == 0)?1:0;
},tsum);
Kokkos::parallel_for(
Kokkos::TeamThreadRange(thread, data.extent(1)), [=](const int& j) {
int tsum;
// Run a vector loop reduction over the inner dimension of data
// Count how many values are multiples of 4
// Every vector lane gets the same reduction value (tsum) back, it is
// broadcast to all vector lanes
Kokkos::parallel_reduce(
Kokkos::ThreadVectorRange(thread, data.extent(2)),
[=](const int& k, int& vsum) {
vsum += (data(i, j, k) % 4 == 0) ? 1 : 0;
},
tsum);
// Make sure only one vector lane adds the reduction value to the shared array, i.e. execute
// the next line only once PerThread
Kokkos::single(Kokkos::PerThread(thread),[=] () {
count(j) = tsum;
});
});
// Make sure only one vector lane adds the reduction value to the
// shared array, i.e. execute the next line only once PerThread
Kokkos::single(Kokkos::PerThread(thread), [=]() { count(j) = tsum; });
});
// Wait for all threads to finish the parallel_for so that all shared memory writes are done
// Wait for all threads to finish the parallel_for so that all shared memory
// writes are done
thread.team_barrier();
// Check with one vector lane from each thread how many consecutive
// data segments have the same number of values divisible by 4
// The team reduction value is again broadcast to every team member (and every vector lane)
// The team reduction value is again broadcast to every team member (and
// every vector lane)
int team_sum = 0;
// The range stops at extent(1)-1 because the body reads count(j+1).
Kokkos::parallel_reduce(Kokkos::TeamThreadRange(thread, data.extent(1)-1), [=] (const int& j, int& thread_sum) {
// It is not valid to directly add to thread_sum
// Use a single function with broadcast instead
// team_sum will be used as input to the operator (i.e. it is used to initialize sum)
// the end value of sum will be broadcast to all vector lanes in the thread.
Kokkos::single(Kokkos::PerThread(thread),[=] (int& sum) {
if(count(j)==count(j+1)) sum++;
},thread_sum);
},team_sum);
Kokkos::parallel_reduce(
Kokkos::TeamThreadRange(thread, data.extent(1) - 1),
[=](const int& j, int& thread_sum) {
// It is not valid to directly add to thread_sum
// Use a single function with broadcast instead
// team_sum will be used as input to the operator (i.e. it is used to
// initialize sum) the end value of sum will be broadcast to all
// vector lanes in the thread.
Kokkos::single(
Kokkos::PerThread(thread),
[=](int& sum) {
if (count(j) == count(j + 1)) sum++;
},
thread_sum);
},
team_sum);
// Add with one thread and vectorlane of the team the team_sum to the global value
Kokkos::single(Kokkos::PerTeam(thread),[=] () {
Kokkos::atomic_add(&gsum(),team_sum);
});
// Add with one thread and vectorlane of the team the team_sum to the global
// value
Kokkos::single(Kokkos::PerTeam(thread),
[=]() { Kokkos::atomic_add(&gsum(), team_sum); });
}
// The functor needs to define how much shared memory it requests given a team_size.
size_t team_shmem_size( int team_size ) const {
// The functor needs to define how much shared memory it requests given a
// team_size.
// Note: team_size is not used here; the request depends only on
// data.extent(1), matching the size of the `count` array in operator().
size_t team_shmem_size(int team_size) const {
return shared_1d_int::shmem_size(data.extent(1));
}
};
// Fills a 512 x 512 x 32 integer view with random values, runs the
// SomeCorrelation functor over it with a TeamPolicy of 512 teams
// (Kokkos::AUTO team size, vector length 16), copies the accumulated result
// out of the rank-0 view `gsum` and prints it.
// NOTE(review): in this listing the reformatted statements appear in two
// spellings (`data` and `policy` are declared twice in the same scope); only
// one spelling of each pair can remain in a compilable file.
int main(int narg, char* args[]) {
Kokkos::initialize(narg,args);
Kokkos::initialize(narg, args);
// The Views live in this inner scope, which closes before
// Kokkos::finalize() is called (presumably so they are released first).
{
// Produce some 3D random data (see Algorithms/01_random_numbers for more info)
Kokkos::View<int***,Kokkos::LayoutRight> data("Data",512,512,32);
// Produce some 3D random data (see Algorithms/01_random_numbers for more
// info)
Kokkos::View<int***, Kokkos::LayoutRight> data("Data", 512, 512, 32);
Kokkos::Random_XorShift64_Pool<> rand_pool64(5374857);
Kokkos::fill_random(data,rand_pool64,100);
Kokkos::fill_random(data, rand_pool64, 100);
// A global value to put the result in
Kokkos::View<int> gsum("Sum");
// Each team handles a slice of the data
// Set up TeamPolicy with 512 teams with maximum number of threads per team and 16 vector lanes.
// Kokkos::AUTO will determine the number of threads
// The maximum vector length is hardware dependent but can always be smaller than the hardware allows.
// The vector length must be a power of 2.
// Set up TeamPolicy with 512 teams with maximum number of threads per team
// and 16 vector lanes. Kokkos::AUTO will determine the number of threads
// The maximum vector length is hardware dependent but can always be smaller
// than the hardware allows. The vector length must be a power of 2.
const Kokkos::TeamPolicy<> policy( 512 , Kokkos::AUTO , 16);
const Kokkos::TeamPolicy<> policy(512, Kokkos::AUTO, 16);
Kokkos::parallel_for( policy , SomeCorrelation(data,gsum) );
Kokkos::parallel_for(policy, SomeCorrelation(data, gsum));
Kokkos::fence();
// Copy result value back
int sum = 0;
Kokkos::deep_copy(sum,gsum);
printf("Result %i\n",sum);
Kokkos::deep_copy(sum, gsum);
printf("Result %i\n", sum);
}
Kokkos::finalize();
}

View File

@ -3,8 +3,8 @@ INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR})
INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
# This is a tutorial, not a test, so we don't ask CTest to run it.
TRIBITS_ADD_EXECUTABLE(
KOKKOS_ADD_EXECUTABLE(
tutorial_hierarchicalparallelism_04_team_scan
SOURCES team_scan.cpp
COMM serial mpi
)
)

View File

@ -1,13 +1,14 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
//
// Kokkos v. 3.0
// Copyright (2020) National Technology & Engineering
// Solutions of Sandia, LLC (NTESS).
//
// Under the terms of Contract DE-NA0003525 with NTESS,
// the U.S. Government retains certain rights in this software.
//
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
@ -23,10 +24,10 @@
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
@ -36,7 +37,7 @@
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
//
//
// ************************************************************************
//@HEADER
*/
@ -47,13 +48,13 @@
#include <cstdio>
#include <cstdlib>
typedef Kokkos::DefaultExecutionSpace Device ;
typedef Kokkos::HostSpace::execution_space Host ;
typedef Kokkos::DefaultExecutionSpace Device;
typedef Kokkos::HostSpace::execution_space Host;
typedef Kokkos::TeamPolicy< Device > team_policy ;
typedef team_policy::member_type team_member ;
typedef Kokkos::TeamPolicy<Device> team_policy;
typedef team_policy::member_type team_member;
static const int TEAM_SIZE = 16 ;
static const int TEAM_SIZE = 16;
struct find_2_tuples {
int chunk_size;
@ -61,89 +62,91 @@ struct find_2_tuples {
Kokkos::View<int**> histogram;
find_2_tuples(int chunk_size_, Kokkos::DualView<int*> data_,
Kokkos::DualView<int**> histogram_):chunk_size(chunk_size_),
data(data_.d_view),histogram(histogram_.d_view) {
data_.sync<Device>();
histogram_.sync<Device>();
histogram_.modify<Device>();
Kokkos::DualView<int**> histogram_)
: chunk_size(chunk_size_),
data(data_.d_view),
histogram(histogram_.d_view) {
data_.sync<Device>();
histogram_.sync<Device>();
histogram_.modify<Device>();
}
KOKKOS_INLINE_FUNCTION
void operator() ( const team_member & dev) const {
Kokkos::View<int**,Kokkos::MemoryUnmanaged> l_histogram(dev.team_shmem(),TEAM_SIZE,TEAM_SIZE);
Kokkos::View<int*,Kokkos::MemoryUnmanaged> l_data(dev.team_shmem(),chunk_size+1);
void operator()(const team_member& dev) const {
Kokkos::View<int**, Kokkos::MemoryUnmanaged> l_histogram(
dev.team_shmem(), TEAM_SIZE, TEAM_SIZE);
Kokkos::View<int*, Kokkos::MemoryUnmanaged> l_data(dev.team_shmem(),
chunk_size + 1);
const int i = dev.league_rank() * chunk_size;
for(int j = dev.team_rank(); j<chunk_size+1; j+=dev.team_size())
l_data(j) = data(i+j);
for (int j = dev.team_rank(); j < chunk_size + 1; j += dev.team_size())
l_data(j) = data(i + j);
for(int k = dev.team_rank(); k < TEAM_SIZE; k+=dev.team_size())
for(int l = 0; l < TEAM_SIZE; l++)
l_histogram(k,l) = 0;
for (int k = dev.team_rank(); k < TEAM_SIZE; k += dev.team_size())
for (int l = 0; l < TEAM_SIZE; l++) l_histogram(k, l) = 0;
dev.team_barrier();
for(int j = 0; j<chunk_size; j++) {
for(int k = dev.team_rank(); k < TEAM_SIZE; k+=dev.team_size())
for(int l = 0; l < TEAM_SIZE; l++) {
if((l_data(j) == k) && (l_data(j+1)==l))
l_histogram(k,l)++;
for (int j = 0; j < chunk_size; j++) {
for (int k = dev.team_rank(); k < TEAM_SIZE; k += dev.team_size())
for (int l = 0; l < TEAM_SIZE; l++) {
if ((l_data(j) == k) && (l_data(j + 1) == l)) l_histogram(k, l)++;
}
}
for(int k = dev.team_rank(); k < TEAM_SIZE; k+=dev.team_size())
for(int l = 0; l < TEAM_SIZE; l++) {
Kokkos::atomic_fetch_add(&histogram(k,l),l_histogram(k,l));
for (int k = dev.team_rank(); k < TEAM_SIZE; k += dev.team_size())
for (int l = 0; l < TEAM_SIZE; l++) {
Kokkos::atomic_fetch_add(&histogram(k, l), l_histogram(k, l));
}
dev.team_barrier();
}
size_t team_shmem_size( int team_size ) const {
return Kokkos::View<int**,Kokkos::MemoryUnmanaged>::shmem_size(TEAM_SIZE,TEAM_SIZE) +
Kokkos::View<int*,Kokkos::MemoryUnmanaged>::shmem_size(chunk_size+1);
size_t team_shmem_size(int team_size) const {
return Kokkos::View<int**, Kokkos::MemoryUnmanaged>::shmem_size(TEAM_SIZE,
TEAM_SIZE) +
Kokkos::View<int*, Kokkos::MemoryUnmanaged>::shmem_size(chunk_size +
1);
}
};
int main(int narg, char* args[]) {
Kokkos::initialize(narg,args);
Kokkos::initialize(narg, args);
{
int chunk_size = 1024;
int nchunks = 100000; //1024*1024;
Kokkos::DualView<int*> data("data",nchunks*chunk_size+1);
int chunk_size = 1024;
int nchunks = 100000; // 1024*1024;
Kokkos::DualView<int*> data("data", nchunks * chunk_size + 1);
srand(1231093);
srand(1231093);
for(int i = 0; i < (int) data.extent(0); i++) {
data.h_view(i) = rand()%TEAM_SIZE;
}
data.modify<Host>();
data.sync<Device>();
Kokkos::DualView<int**> histogram("histogram",TEAM_SIZE,TEAM_SIZE);
Kokkos::Timer timer;
// threads/team is automatically limited to maximum supported by the device.
int team_size = TEAM_SIZE;
if( team_size > Device::execution_space::concurrency() )
team_size = Device::execution_space::concurrency();
Kokkos::parallel_for( team_policy( nchunks , team_size )
, find_2_tuples(chunk_size,data,histogram) );
Kokkos::fence();
double time = timer.seconds();
histogram.sync<Host>();
printf("Time: %f \n\n",time);
int sum = 0;
for(int k=0; k<TEAM_SIZE; k++) {
for(int l=0; l<TEAM_SIZE; l++) {
printf("%i ",histogram.h_view(k,l));
sum += histogram.h_view(k,l);
for (int i = 0; i < (int)data.extent(0); i++) {
data.h_view(i) = rand() % TEAM_SIZE;
}
printf("\n");
}
printf("Result: %i %i\n",sum,chunk_size*nchunks);
data.modify<Host>();
data.sync<Device>();
Kokkos::DualView<int**> histogram("histogram", TEAM_SIZE, TEAM_SIZE);
Kokkos::Timer timer;
// Clamp threads/team to the maximum concurrency supported by the device.
int team_size = TEAM_SIZE;
if (team_size > Device::execution_space::concurrency())
team_size = Device::execution_space::concurrency();
Kokkos::parallel_for(team_policy(nchunks, team_size),
find_2_tuples(chunk_size, data, histogram));
Kokkos::fence();
double time = timer.seconds();
histogram.sync<Host>();
printf("Time: %f \n\n", time);
int sum = 0;
for (int k = 0; k < TEAM_SIZE; k++) {
for (int l = 0; l < TEAM_SIZE; l++) {
printf("%i ", histogram.h_view(k, l));
sum += histogram.h_view(k, l);
}
printf("\n");
}
printf("Result: %i %i\n", sum, chunk_size * nchunks);
}
Kokkos::finalize();
}

View File

@ -1,8 +1,6 @@
TRIBITS_ADD_EXAMPLE_DIRECTORIES(01_thread_teams)
KOKKOS_ADD_EXAMPLE_DIRECTORIES(01_thread_teams)
KOKKOS_ADD_EXAMPLE_DIRECTORIES(01_thread_teams_lambda)
KOKKOS_ADD_EXAMPLE_DIRECTORIES(02_nested_parallel_for)
KOKKOS_ADD_EXAMPLE_DIRECTORIES(03_vectorization)
IF (Kokkos_ENABLE_CXX11)
TRIBITS_ADD_EXAMPLE_DIRECTORIES(01_thread_teams_lambda)
TRIBITS_ADD_EXAMPLE_DIRECTORIES(02_nested_parallel_for)
TRIBITS_ADD_EXAMPLE_DIRECTORIES(03_vectorization)
ENDIF ()

View File

@ -3,8 +3,7 @@ INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR})
INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
# This is a tutorial, not a test, so we don't ask CTest to run it.
TRIBITS_ADD_EXECUTABLE(
KOKKOS_ADD_EXECUTABLE(
tutorial_02_simple_reduce
SOURCES simple_reduce.cpp
COMM serial mpi
)
)

View File

@ -1,13 +1,14 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
//
// Kokkos v. 3.0
// Copyright (2020) National Technology & Engineering
// Solutions of Sandia, LLC (NTESS).
//
// Under the terms of Contract DE-NA0003525 with NTESS,
// the U.S. Government retains certain rights in this software.
//
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
@ -23,10 +24,10 @@
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
@ -36,7 +37,7 @@
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
//
//
// ************************************************************************
//@HEADER
*/
@ -52,96 +53,92 @@
// 3. Shut down Kokkos
//
struct collision {
// Reduction functor
// For each i, we generate 10 hashes, look for and count collisions
// We use parallel_reduce to count the total collisions
// Note that we're just counting collisions within the 10 generated
// one i.
// This function was chosen as one that very simply can increase the
// register count.
// Reduction functor
// For each i, we generate 10 hashes, look for and count collisions
// We use parallel_reduce to count the total collisions
// Note that we're just counting collisions within the 10 hashes generated
// for one i.
// This function was chosen as one that very simply can increase the
// register count.
typedef int value_type;
KOKKOS_INLINE_FUNCTION
int hash(int q) const {
// A simple hash by Justin Sobel
// Thanks to Arash Partow (partow.net)
char* fourchars = (char*)&q;
int hash = 1315423911;
for (int i=0; i<4; fourchars++, i++) {
hash ^= ((hash<<5) + *fourchars + (hash >> 2));
}
return hash;
// A simple hash by Justin Sobel
// Thanks to Arash Partow (partow.net)
char* fourchars = (char*)&q;
int hash = 1315423911;
for (int i = 0; i < 4; fourchars++, i++) {
hash ^= ((hash << 5) + *fourchars + (hash >> 2));
}
return hash;
}
KOKKOS_INLINE_FUNCTION
void operator () (const int i, int& lsum) const {
//This is a silly function which generates 10 hashes
// then checks for collisions
int a = hash(i)%64;
int b = hash(i*3)%64;
int c = hash(i*5)%64;
int d = hash(i*7)%64;
int e = hash(i*11)%64;
int f = hash(i*17)%64;
int g = hash(i*23)%64;
int h = hash(i*29)%64;
int j = hash(i*31)%64;
int k = hash(i*37)%64;
void operator()(const int i, int& lsum) const {
// This is a silly function which generates 10 hashes
// then checks for collisions
int a = hash(i) % 64;
int b = hash(i * 3) % 64;
int c = hash(i * 5) % 64;
int d = hash(i * 7) % 64;
int e = hash(i * 11) % 64;
int f = hash(i * 17) % 64;
int g = hash(i * 23) % 64;
int h = hash(i * 29) % 64;
int j = hash(i * 31) % 64;
int k = hash(i * 37) % 64;
if (a==b) lsum++;
if (a==c) lsum++;
if (a==d) lsum++;
if (a==e) lsum++;
if (a==f) lsum++;
if (a==g) lsum++;
if (a==h) lsum++;
if (a==j) lsum++;
if (a==k) lsum++;
if (b==c) lsum++;
if (b==d) lsum++;
if (b==e) lsum++;
if (b==f) lsum++;
if (b==g) lsum++;
if (b==h) lsum++;
if (b==j) lsum++;
if (b==k) lsum++;
if (c==d) lsum++;
if (c==e) lsum++;
if (c==f) lsum++;
if (c==g) lsum++;
if (c==h) lsum++;
if (c==j) lsum++;
if (c==k) lsum++;
if (d==e) lsum++;
if (d==f) lsum++;
if (d==g) lsum++;
if (d==h) lsum++;
if (d==j) lsum++;
if (d==k) lsum++;
if (e==f) lsum++;
if (e==g) lsum++;
if (e==h) lsum++;
if (e==j) lsum++;
if (e==k) lsum++;
if (f==g) lsum++;
if (f==h) lsum++;
if (f==j) lsum++;
if (f==k) lsum++;
if (g==h) lsum++;
if (g==j) lsum++;
if (g==k) lsum++;
if (h==j) lsum++;
if (h==k) lsum++;
if (j==k) lsum++;
if (a == b) lsum++;
if (a == c) lsum++;
if (a == d) lsum++;
if (a == e) lsum++;
if (a == f) lsum++;
if (a == g) lsum++;
if (a == h) lsum++;
if (a == j) lsum++;
if (a == k) lsum++;
if (b == c) lsum++;
if (b == d) lsum++;
if (b == e) lsum++;
if (b == f) lsum++;
if (b == g) lsum++;
if (b == h) lsum++;
if (b == j) lsum++;
if (b == k) lsum++;
if (c == d) lsum++;
if (c == e) lsum++;
if (c == f) lsum++;
if (c == g) lsum++;
if (c == h) lsum++;
if (c == j) lsum++;
if (c == k) lsum++;
if (d == e) lsum++;
if (d == f) lsum++;
if (d == g) lsum++;
if (d == h) lsum++;
if (d == j) lsum++;
if (d == k) lsum++;
if (e == f) lsum++;
if (e == g) lsum++;
if (e == h) lsum++;
if (e == j) lsum++;
if (e == k) lsum++;
if (f == g) lsum++;
if (f == h) lsum++;
if (f == j) lsum++;
if (f == k) lsum++;
if (g == h) lsum++;
if (g == j) lsum++;
if (g == k) lsum++;
if (h == j) lsum++;
if (h == k) lsum++;
if (j == k) lsum++;
}
};
int main (int argc, char* argv[]) {
Kokkos::initialize (argc, argv);
int main(int argc, char* argv[]) {
Kokkos::initialize(argc, argv);
const int n = 10000;
// Compute and count hash collisions in
@ -150,24 +147,26 @@ int main (int argc, char* argv[]) {
// LaunchBounds functionality
int sum1 = 0;
int sum2 = 0;
//Without LaunchBounds, the kernel uses 56 registers
Kokkos::parallel_reduce (n, collision (), sum1);
//With LaunchBounds, we can reduce the register usage to 32
Kokkos::parallel_reduce (Kokkos::RangePolicy<Kokkos::LaunchBounds<512,4>>(0,n), collision (), sum2);
// Without LaunchBounds, the kernel uses 56 registers
Kokkos::parallel_reduce(n, collision(), sum1);
printf ("Number of collisions, "
"computed in parallel, is %i\n", sum1);
// With LaunchBounds, we can reduce the register usage to 32
Kokkos::parallel_reduce(
Kokkos::RangePolicy<Kokkos::LaunchBounds<512, 4>>(0, n), collision(),
sum2);
printf(
"Number of collisions, "
"computed in parallel, is %i\n",
sum1);
if (sum1 != sum2) {
printf( "Uh-oh! Results do not match\n");
return -1;
printf("Uh-oh! Results do not match\n");
return -1;
}
Kokkos::finalize();
return 0;
}