Updating Kokkos lib

git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@15556 f3b2605a-c512-4ea7-a41b-209d697bcdaa
This commit is contained in:
stamoor
2016-09-06 23:06:32 +00:00
parent 1ad033ec0c
commit 39be4185c4
502 changed files with 157510 additions and 0 deletions

View File

@ -0,0 +1,10 @@
INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR})
INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
# This is a tutorial, not a test, so we don't ask CTest to run it.
TRIBITS_ADD_EXECUTABLE(
tutorial_hierarchicalparallelism_01_thread_teams
SOURCES thread_teams.cpp
COMM serial mpi
)

View File

@ -0,0 +1,43 @@
KOKKOS_PATH = ../../../..
SRC = $(wildcard *.cpp)
default: build
echo "Start Build"
ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES)))
CXX = ../../../../config/nvcc_wrapper
CXXFLAGS = -O3
LINK = ${CXX}
LINKFLAGS =
EXE = $(SRC:.cpp=.cuda)
KOKKOS_DEVICES = "Cuda,OpenMP"
KOKKOS_ARCH = "SNB,Kepler35"
else
CXX = g++
CXXFLAGS = -O3
LINK = ${CXX}
LINKFLAGS =
EXE = $(SRC:.cpp=.host)
KOKKOS_DEVICES = "OpenMP"
KOKKOS_ARCH = "SNB"
endif
DEPFLAGS = -M
OBJ = $(SRC:.cpp=.o)
LIB =
include $(KOKKOS_PATH)/Makefile.kokkos
build: $(EXE)
$(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS)
$(LINK) $(KOKKOS_LDFLAGS) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE)
clean: kokkos-clean
rm -f *.o *.cuda *.host
# Compilation rules
%.o:%.cpp $(KOKKOS_CPP_DEPENDS)
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $<

View File

@ -0,0 +1,94 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#include <Kokkos_Core.hpp>
#include <cstdio>
// Using default execution space define a TeamPolicy and its member_type
// The member_type is what the operator of a functor or Lambda gets, for
// a simple RangePolicy the member_type is simply an integer
// For a TeamPolicy its a much richer object, since it provides all information
// to identify a thread uniquely and some team related function calls such as a
// barrier (which will be used in a subsequent example).
// A ThreadTeam consists of 1 to n threads where the maxmimum value of n is
// determined by the hardware. On a dual socket CPU machine with 8 cores per socket
// the maximum size of a team is 8. The number of teams (i.e. the league_size) is
// not limited by physical constraints. Its a pure logical number.
typedef Kokkos::TeamPolicy<> team_policy ;
typedef team_policy::member_type team_member ;
// Define a functor which can be launched using the TeamPolicy
struct hello_world {
typedef int value_type; //Specify value type for reduction target, sum
// This is a reduction operator which now takes as first argument the
// TeamPolicy member_type. Every member of the team contributes to the
// total sum.
// It is helpful to think of this operator as a parallel region for a team
// (i.e. every team member is active and will execute the code).
KOKKOS_INLINE_FUNCTION
void operator() ( const team_member & thread, int& sum) const {
sum+=1;
// The TeamPolicy<>::member_type provides functions to query the multi
// dimensional index of a thread as well as the number of thread-teams and the size
// of each team.
printf("Hello World: %i %i // %i %i\n",thread.league_rank(),thread.team_rank(),thread.league_size(),thread.team_size());
}
};
int main(int narg, char* args[]) {
Kokkos::initialize(narg,args);
// Launch 12 teams of the maximum number of threads per team
const team_policy policy( 12 , team_policy::team_size_max( hello_world() ) );
int sum = 0;
Kokkos::parallel_reduce( policy , hello_world() , sum );
// The result will be 12*team_policy::team_size_max( hello_world())
printf("Result %i\n",sum);
Kokkos::finalize();
}

View File

@ -0,0 +1,13 @@
INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR})
INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
IF (Kokkos_ENABLE_CXX11)
# This is a tutorial, not a test, so we don't ask CTest to run it.
TRIBITS_ADD_EXECUTABLE(
tutorial_hierarchical_01_thread_teams_lambda
SOURCES thread_teams_lambda.cpp
COMM serial mpi
)
ENDIF ()

View File

@ -0,0 +1,44 @@
KOKKOS_PATH = ../../../..
SRC = $(wildcard *.cpp)
default: build
echo "Start Build"
ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES)))
CXX = ../../../../config/nvcc_wrapper
CXXFLAGS = -O3
LINK = ${CXX}
LINKFLAGS =
EXE = $(SRC:.cpp=.cuda)
KOKKOS_DEVICES = "Cuda,OpenMP"
KOKKOS_ARCH = "SNB,Kepler35"
KOKKOS_CUDA_OPTIONS = "enable_lambda"
else
CXX = g++
CXXFLAGS = -O3
LINK = ${CXX}
LINKFLAGS =
EXE = $(SRC:.cpp=.host)
KOKKOS_DEVICES = "OpenMP"
KOKKOS_ARCH = "SNB"
endif
DEPFLAGS = -M
OBJ = $(SRC:.cpp=.o)
LIB =
include $(KOKKOS_PATH)/Makefile.kokkos
build: $(EXE)
$(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS)
$(LINK) $(KOKKOS_LDFLAGS) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE)
clean: kokkos-clean
rm -f *.o *.cuda *.host
# Compilation rules
%.o:%.cpp $(KOKKOS_CPP_DEPENDS)
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $<

View File

@ -0,0 +1,94 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#include <Kokkos_Core.hpp>
#include <cstdio>
// Demonstrate a parallel reduction using thread teams (TeamPolicy).
//
// A thread team consists of 1 to n threads. The hardware determines
// the maxmimum value of n. On a dual-socket CPU machine with 8 cores
// per socket, the maximum size of a team is 8. The number of teams
// (the league_size) is not limited by physical constraints (up to
// some reasonable bound, which eventually depends upon the hardware
// and programming model implementation).
int main (int narg, char* args[]) {
using Kokkos::parallel_reduce;
typedef Kokkos::TeamPolicy<> team_policy;
typedef typename team_policy::member_type team_member;
Kokkos::initialize (narg, args);
// Set up a policy that launches 12 teams, with the maximum number
// of threads per team.
const team_policy policy (12, Kokkos::AUTO);
// This is a reduction with a team policy. The team policy changes
// the first argument of the lambda. Rather than an integer index
// (as with RangePolicy), it's now TeamPolicy::member_type. This
// object provides all information to identify a thread uniquely.
// It also provides some team-related function calls such as a team
// barrier (which a subsequent example will use).
//
// Every member of the team contributes to the total sum. It is
// helpful to think of the lambda's body as a "team parallel
// region." That is, every team member is active and will execute
// the body of the lambda.
int sum = 0;
parallel_reduce (policy, KOKKOS_LAMBDA (const team_member& thread, int& lsum) {
lsum += 1;
// TeamPolicy<>::member_type provides functions to query the
// multidimensional index of a thread, as well as the number of
// thread teams and the size of each team.
printf ("Hello World: %i %i // %i %i\n", thread.league_rank (),
thread.team_rank (), thread.league_size (), thread.team_size ());
}, sum);
// The result will be 12*team_policy::team_size_max([=]{})
printf ("Result %i\n",sum);
Kokkos::finalize ();
}

View File

@ -0,0 +1,10 @@
INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR})
INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
# This is a tutorial, not a test, so we don't ask CTest to run it.
TRIBITS_ADD_EXECUTABLE(
tutorial_hierarchicalparallelism_02_nested_parallel_for
SOURCES nested_parallel_for.cpp
COMM serial mpi
)

View File

@ -0,0 +1,43 @@
KOKKOS_PATH = ../../../..
SRC = $(wildcard *.cpp)
default: build
echo "Start Build"
ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES)))
CXX = ../../../../config/nvcc_wrapper
CXXFLAGS = -O3
LINK = ${CXX}
LINKFLAGS =
EXE = $(SRC:.cpp=.cuda)
KOKKOS_DEVICES = "Cuda,OpenMP"
KOKKOS_ARCH = "SNB,Kepler35"
else
CXX = g++
CXXFLAGS = -O3
LINK = ${CXX}
LINKFLAGS =
EXE = $(SRC:.cpp=.host)
KOKKOS_DEVICES = "OpenMP"
KOKKOS_ARCH = "SNB"
endif
DEPFLAGS = -M
OBJ = $(SRC:.cpp=.o)
LIB =
include $(KOKKOS_PATH)/Makefile.kokkos
build: $(EXE)
$(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS)
$(LINK) $(KOKKOS_LDFLAGS) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE)
clean: kokkos-clean
rm -f *.o *.cuda *.host
# Compilation rules
%.o:%.cpp $(KOKKOS_CPP_DEPENDS)
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $<

View File

@ -0,0 +1,89 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#include <Kokkos_Core.hpp>
#include <cstdio>
// See 01_thread_teams for an explanation of a basic TeamPolicy
typedef Kokkos::TeamPolicy<> team_policy ;
typedef typename team_policy::member_type team_member ;
struct hello_world {
typedef int value_type; //Specify value type for reduction target, sum
KOKKOS_INLINE_FUNCTION
void operator() ( const team_member & thread, int& sum) const {
sum+=1;
// When using the TeamPolicy Kokkos allows for nested parallel loops.
// All three Kokkos parallel patterns are allowed (for, reduce, scan) and they
// largely follow the same syntax as on the global level.
// The execution policy for the Thread level nesting (the Vector level is in the next
// tutorial example) is Kokkos::TeamThreadRange. This means the loop will be executed
// by all members of the team and the loop count will be split between threads of the
// team. Its arguments are the team_member, and a loop count.
// Not every thread will do the same amount of iterations. On a GPU for example with
// a team_size() larger than 31 only the first 31 threads would actually do anything.
// On a CPU with 8 threads 7 would execute 4 loop iterations, and 1 thread would do
// 3. Note also that the mode of splitting the count is architecture dependent similar
// to what the RangePolicy on a global level does.
// The call itself is not guaranteed to be synchronous. Also keep in mind that the
// operator using a team_policy acts like a parallel region for the team. That means
// that everything outside of the nested parallel_for is also executed by all threads
// of the team.
Kokkos::parallel_for(Kokkos::TeamThreadRange(thread,31), [&] (const int& i) {
printf("Hello World: (%i , %i) executed loop %i \n",thread.league_rank(),thread.team_rank(),i);
});
}
};
int main(int narg, char* args[]) {
Kokkos::initialize(narg,args);
// Launch 3 teams of the maximum number of threads per team
const team_policy policy( 3 , team_policy::team_size_max( hello_world() ) );
int sum = 0;
Kokkos::parallel_reduce( policy , hello_world() , sum );
printf("Result %i\n",sum);
Kokkos::finalize();
}

View File

@ -0,0 +1,16 @@
INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR})
INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
# This is a tutorial, not a test, so we don't ask CTest to run it.
IF(Kokkos_ENABLE_CXX11)
TRIBITS_ADD_EXECUTABLE(
tutorial_hierarchicalparallelism_03_vectorization
SOURCES vectorization.cpp
COMM serial mpi
)
ENDIF()

View File

@ -0,0 +1,43 @@
KOKKOS_PATH = ../../../..
SRC = $(wildcard *.cpp)
default: build
echo "Start Build"
ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES)))
CXX = ../../../../config/nvcc_wrapper
CXXFLAGS = -O3
LINK = ${CXX}
LINKFLAGS =
EXE = $(SRC:.cpp=.cuda)
KOKKOS_DEVICES = "Cuda,OpenMP"
KOKKOS_ARCH = "SNB,Kepler35"
else
CXX = g++
CXXFLAGS = -O3
LINK = ${CXX}
LINKFLAGS =
EXE = $(SRC:.cpp=.host)
KOKKOS_DEVICES = "OpenMP"
KOKKOS_ARCH = "SNB"
endif
DEPFLAGS = -M
OBJ = $(SRC:.cpp=.o)
LIB =
include $(KOKKOS_PATH)/Makefile.kokkos
build: $(EXE)
$(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS)
$(LINK) $(KOKKOS_LDFLAGS) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE)
clean: kokkos-clean
rm -f *.o *.cuda *.host
# Compilation rules
%.o:%.cpp $(KOKKOS_CPP_DEPENDS)
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $<

View File

@ -0,0 +1,162 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#include <Kokkos_Core.hpp>
#include <Kokkos_Random.hpp>
#include <cstdio>
#ifdef KOKKOS_HAVE_CXX11
// The TeamPolicy actually supports 3D parallelism: Teams, Threads, Vector
// Kokkos::parallel_{for/reduce/scan} calls can be completely free nested.
// The execution policies for the nested layers are TeamThreadRange and
// ThreadVectorRange.
// The only restriction on nesting is that a given level can only be nested in a
// higher one. e.g. a ThreadVectorRange can be nested inside a TeamPolicy operator
// and inside a TeamThreadRange, but you can not nest a ThreadVectorRange or a
// TeamThreadRange inside another ThreadVectorRange.
// As with the 2D execution of TeamPolicy the operator has to be considered as
// a parallel region even with respect to VectorLanes. That means even outside
// a TeamThread or VectorThread loop all threads of a team and all vector lanes
// of a thread execute every line of the operator as long as there are no restricitons
// on them.
// Code lines can be restricted using Kokkos::single to either execute once PerThread
// or execute once PerTeam.
typedef typename Kokkos::TeamPolicy<>::member_type team_member ;
struct SomeCorrelation {
typedef int value_type; //Specify value type for reduction target, sum
typedef Kokkos::DefaultExecutionSpace::scratch_memory_space shared_space;
typedef Kokkos::View<int*,shared_space,Kokkos::MemoryUnmanaged> shared_1d_int;
Kokkos::View<const int***,Kokkos::LayoutRight> data;
Kokkos::View<int> gsum;
SomeCorrelation(Kokkos::View<int***,Kokkos::LayoutRight> data_in,
Kokkos::View<int> sum):data(data_in),gsum(sum){}
KOKKOS_INLINE_FUNCTION
void operator() ( const team_member & thread) const {
int i = thread.league_rank();
// Allocate a shared array for the team.
shared_1d_int count(thread.team_shmem(),data.dimension_1());
// With each team run a parallel_for with its threads
Kokkos::parallel_for(Kokkos::TeamThreadRange(thread,data.dimension_1()), [=] (const int& j) {
int tsum;
// Run a vector loop reduction over the inner dimension of data
// Count how many values are multiples of 4
// Every vector lane gets the same reduction value (tsum) back, it is broadcast to all vector lanes
Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(thread,data.dimension_2()), [=] (const int& k, int & vsum) {
vsum+= (data(i,j,k) % 4 == 0)?1:0;
},tsum);
// Make sure only one vector lane adds the reduction value to the shared array, i.e. execute
// the next line only once PerThread
Kokkos::single(Kokkos::PerThread(thread),[=] () {
count(j) = tsum;
});
});
// Wait for all threads to finish the parallel_for so that all shared memory writes are done
thread.team_barrier();
// Check with one vector lane from each thread how many consecutive
// data segments have the same number of values divisible by 4
// The team reduction value is again broadcast to every team member (and every vector lane)
int team_sum = 0;
Kokkos::parallel_reduce(Kokkos::TeamThreadRange(thread, data.dimension_1()-1), [=] (const int& j, int& thread_sum) {
// It is not valid to directly add to thread_sum
// Use a single function with broadcast instead
// team_sum will be used as input to the operator (i.e. it is used to initialize sum)
// the end value of sum will be broadcast to all vector lanes in the thread.
Kokkos::single(Kokkos::PerThread(thread),[=] (int& sum) {
if(count(j)==count(j+1)) sum++;
},thread_sum);
},team_sum);
// Add with one thread and vectorlane of the team the team_sum to the global value
Kokkos::single(Kokkos::PerTeam(thread),[=] () {
Kokkos::atomic_add(&gsum(),team_sum);
});
}
// The functor needs to define how much shared memory it requests given a team_size.
size_t team_shmem_size( int team_size ) const {
return shared_1d_int::shmem_size(data.dimension_1());
}
};
int main(int narg, char* args[]) {
Kokkos::initialize(narg,args);
// Produce some 3D random data (see Algorithms/01_random_numbers for more info)
Kokkos::View<int***,Kokkos::LayoutRight> data("Data",512,512,32);
Kokkos::Random_XorShift64_Pool<> rand_pool64(5374857);
Kokkos::fill_random(data,rand_pool64,100);
// A global value to put the result in
Kokkos::View<int> gsum("Sum");
// Each team handles a slice of the data
// Set up TeamPolicy with 512 teams with maximum number of threads per team and 16 vector lanes.
// Kokkos::AUTO will determine the number of threads
// The maximum vector length is hardware dependent but can always be smaller than the hardware allows.
// The vector length must be a power of 2.
const Kokkos::TeamPolicy<> policy( 512 , Kokkos::AUTO , 16);
Kokkos::parallel_for( policy , SomeCorrelation(data,gsum) );
Kokkos::fence();
// Copy result value back
int sum = 0;
Kokkos::deep_copy(sum,gsum);
printf("Result %i\n",sum);
Kokkos::finalize();
}
#endif //KOKKOS_HAVE_CXX11

View File

@ -0,0 +1,10 @@
INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR})
INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
# This is a tutorial, not a test, so we don't ask CTest to run it.
TRIBITS_ADD_EXECUTABLE(
tutorial_hierarchicalparallelism_04_team_scan
SOURCES team_scan.cpp
COMM serial mpi
)

View File

@ -0,0 +1,43 @@
KOKKOS_PATH = ../../../..
SRC = $(wildcard *.cpp)
default: build
echo "Start Build"
ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES)))
CXX = ../../../../config/nvcc_wrapper
CXXFLAGS = -O3
LINK = ${CXX}
LINKFLAGS =
EXE = $(SRC:.cpp=.cuda)
KOKKOS_DEVICES = "Cuda,OpenMP"
KOKKOS_ARCH = "SNB,Kepler35"
else
CXX = g++
CXXFLAGS = -O3
LINK = ${CXX}
LINKFLAGS =
EXE = $(SRC:.cpp=.host)
KOKKOS_DEVICES = "OpenMP"
KOKKOS_ARCH = "SNB"
endif
DEPFLAGS = -M
OBJ = $(SRC:.cpp=.o)
LIB =
include $(KOKKOS_PATH)/Makefile.kokkos
build: $(EXE)
$(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS)
$(LINK) $(KOKKOS_LDFLAGS) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE)
clean: kokkos-clean
rm -f *.o *.cuda *.host
# Compilation rules
%.o:%.cpp $(KOKKOS_CPP_DEPENDS)
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $<

View File

@ -0,0 +1,141 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#include <Kokkos_Core.hpp>
#include <Kokkos_DualView.hpp>
#include <impl/Kokkos_Timer.hpp>
#include <cstdio>
#include <cstdlib>
typedef Kokkos::DefaultExecutionSpace Device ;
typedef Kokkos::HostSpace::execution_space Host ;
typedef Kokkos::TeamPolicy< Device > team_policy ;
typedef team_policy::member_type team_member ;
static const int TEAM_SIZE = 16 ;
struct find_2_tuples {
int chunk_size;
Kokkos::View<const int*> data;
Kokkos::View<int**> histogram;
find_2_tuples(int chunk_size_, Kokkos::DualView<int*> data_,
Kokkos::DualView<int**> histogram_):chunk_size(chunk_size_),
data(data_.d_view),histogram(histogram_.d_view) {
data_.sync<Device>();
histogram_.sync<Device>();
histogram_.modify<Device>();
}
KOKKOS_INLINE_FUNCTION
void operator() ( const team_member & dev) const {
Kokkos::View<int**,Kokkos::MemoryUnmanaged> l_histogram(dev.team_shmem(),TEAM_SIZE,TEAM_SIZE);
Kokkos::View<int*,Kokkos::MemoryUnmanaged> l_data(dev.team_shmem(),chunk_size+1);
const int i = dev.league_rank() * chunk_size;
for(int j = dev.team_rank(); j<chunk_size+1; j+=dev.team_size())
l_data(j) = data(i+j);
for(int k = dev.team_rank(); k < TEAM_SIZE; k+=dev.team_size())
for(int l = 0; l < TEAM_SIZE; l++)
l_histogram(k,l) = 0;
dev.team_barrier();
for(int j = 0; j<chunk_size; j++) {
for(int k = dev.team_rank(); k < TEAM_SIZE; k+=dev.team_size())
for(int l = 0; l < TEAM_SIZE; l++) {
if((l_data(j) == k) && (l_data(j+1)==l))
l_histogram(k,l)++;
}
}
for(int k = dev.team_rank(); k < TEAM_SIZE; k+=dev.team_size())
for(int l = 0; l < TEAM_SIZE; l++) {
Kokkos::atomic_fetch_add(&histogram(k,l),l_histogram(k,l));
}
dev.team_barrier();
}
size_t team_shmem_size( int team_size ) const { return sizeof(int)*(chunk_size+2 + team_size * team_size ); }
};
int main(int narg, char* args[]) {
Kokkos::initialize(narg,args);
int chunk_size = 1024;
int nchunks = 100000; //1024*1024;
Kokkos::DualView<int*> data("data",nchunks*chunk_size+1);
srand(1231093);
for(int i = 0; i < (int) data.dimension_0(); i++) {
data.h_view(i) = rand()%TEAM_SIZE;
}
data.modify<Host>();
data.sync<Device>();
Kokkos::DualView<int**> histogram("histogram",TEAM_SIZE,TEAM_SIZE);
Kokkos::Timer timer;
// threads/team is automatically limited to maximum supported by the device.
Kokkos::parallel_for( team_policy( nchunks , TEAM_SIZE )
, find_2_tuples(chunk_size,data,histogram) );
Kokkos::fence();
double time = timer.seconds();
histogram.sync<Host>();
printf("Time: %f \n\n",time);
int sum = 0;
for(int k=0; k<TEAM_SIZE; k++) {
for(int l=0; l<TEAM_SIZE; l++) {
printf("%i ",histogram.h_view(k,l));
sum += histogram.h_view(k,l);
}
printf("\n");
}
printf("Result: %i %i\n",sum,chunk_size*nchunks);
Kokkos::finalize();
}

View File

@ -0,0 +1,8 @@
TRIBITS_ADD_EXAMPLE_DIRECTORIES(01_thread_teams)
IF (Kokkos_ENABLE_CXX11)
TRIBITS_ADD_EXAMPLE_DIRECTORIES(01_thread_teams_lambda)
TRIBITS_ADD_EXAMPLE_DIRECTORIES(02_nested_parallel_for)
TRIBITS_ADD_EXAMPLE_DIRECTORIES(03_vectorization)
ENDIF ()

View File

@ -0,0 +1,72 @@
default:
cd ./01_thread_teams; \
make -j 4
cd ./01_thread_teams_lambda; \
make -j 4
cd ./02_nested_parallel_for; \
make -j 4
cd ./03_vectorization; \
make -j 4
cd ./04_team_scan; \
make -j 4
openmp:
cd ./01_thread_teams; \
make -j 4 KOKKOS_DEVICES=OpenMP
cd ./01_thread_teams_lambda; \
make -j 4 KOKKOS_DEVICES=OpenMP
cd ./02_nested_parallel_for; \
make -j 4 KOKKOS_DEVICES=OpenMP
cd ./03_vectorization; \
make -j 4 KOKKOS_DEVICES=OpenMP
cd ./04_team_scan; \
make -j 4 KOKKOS_DEVICES=OpenMP
pthreads:
cd ./01_thread_teams; \
make -j 4 KOKKOS_DEVICES=Pthreads
cd ./01_thread_teams_lambda; \
make -j 4 KOKKOS_DEVICES=Pthreads
cd ./02_nested_parallel_for; \
make -j 4 KOKKOS_DEVICES=Pthreads
cd ./03_vectorization; \
make -j 4 KOKKOS_DEVICES=Pthreads
cd ./04_team_scan; \
make -j 4 KOKKOS_DEVICES=Pthreads
serial:
cd ./01_thread_teams; \
make -j 4 KOKKOS_DEVICES=Serial
cd ./01_thread_teams_lambda; \
make -j 4 KOKKOS_DEVICES=Serial
cd ./02_nested_parallel_for; \
make -j 4 KOKKOS_DEVICES=Serial
cd ./03_vectorization; \
make -j 4 KOKKOS_DEVICES=Serial
cd ./04_team_scan; \
make -j 4 KOKKOS_DEVICES=Serial
cuda:
cd ./01_thread_teams; \
make -j 4 KOKKOS_DEVICES=Cuda,Serial
cd ./01_thread_teams_lambda; \
make -j 4 KOKKOS_DEVICES=Cuda,Serial
cd ./02_nested_parallel_for; \
make -j 4 KOKKOS_DEVICES=Cuda,Serial
cd ./03_vectorization; \
make -j 4 KOKKOS_DEVICES=Cuda,Serial
cd ./04_team_scan; \
make -j 4 KOKKOS_DEVICES=Cuda,Serial
clean:
cd ./01_thread_teams; \
make clean
cd ./01_thread_teams_lambda; \
make clean
cd ./02_nested_parallel_for; \
make clean
cd ./03_vectorization; \
make clean
cd ./04_team_scan; \
make clean