Update Kokkos library to v2.7.00

This commit is contained in:
Stan Moore
2018-05-25 15:00:53 -06:00
parent ce4a446cea
commit 1422b0413b
1248 changed files with 64103 additions and 5133 deletions

View File

@ -127,44 +127,46 @@ int main (int narg, char* arg[]) {
// arguments from the list that start with '--kokkos-'.
Kokkos::initialize (narg, arg);
int size = 10000;
view_type a("A",size);
{
int size = 10000;
view_type a("A",size);
// Define two views with LayoutLeft and LayoutRight.
left_type l("L",size,10000);
right_type r("R",size,10000);
// Define two views with LayoutLeft and LayoutRight.
left_type l("L",size,10000);
right_type r("R",size,10000);
// Initialize the data in the views.
Kokkos::parallel_for(size,init_view<left_type>(l));
Kokkos::parallel_for(size,init_view<right_type>(r));
Kokkos::fence();
// Initialize the data in the views.
Kokkos::parallel_for(size,init_view<left_type>(l));
Kokkos::parallel_for(size,init_view<right_type>(r));
Kokkos::fence();
// Measure time to execute the contraction kernel when giving it a
// LayoutLeft view for v1 and a LayoutRight view for v2. This should be
// fast on GPUs and slow on CPUs
Kokkos::Timer time1;
Kokkos::parallel_for(size,contraction<left_type,right_type>(a,l,r));
Kokkos::fence();
double sec1 = time1.seconds();
// Measure time to execute the contraction kernel when giving it a
// LayoutLeft view for v1 and a LayoutRight view for v2. This should be
// fast on GPUs and slow on CPUs
Kokkos::Timer time1;
Kokkos::parallel_for(size,contraction<left_type,right_type>(a,l,r));
Kokkos::fence();
double sec1 = time1.seconds();
double sum1 = 0;
Kokkos::parallel_reduce(size,dot(a),sum1);
Kokkos::fence();
double sum1 = 0;
Kokkos::parallel_reduce(size,dot(a),sum1);
Kokkos::fence();
// Measure time to execute the contraction kernel when giving it a
// LayoutRight view for v1 and a LayoutLeft view for v2. This should be
// fast on CPUs and slow on GPUs
Kokkos::Timer time2;
Kokkos::parallel_for(size,contraction<right_type,left_type>(a,r,l));
Kokkos::fence();
double sec2 = time2.seconds();
// Measure time to execute the contraction kernel when giving it a
// LayoutRight view for v1 and a LayoutLeft view for v2. This should be
// fast on CPUs and slow on GPUs
Kokkos::Timer time2;
Kokkos::parallel_for(size,contraction<right_type,left_type>(a,r,l));
Kokkos::fence();
double sec2 = time2.seconds();
double sum2 = 0;
Kokkos::parallel_reduce(size,dot(a),sum2);
double sum2 = 0;
Kokkos::parallel_reduce(size,dot(a),sum2);
// Kokkos' reductions are deterministic.
// The results should always be equal.
printf("Result Left/Right %f Right/Left %f (equal result: %i)\n",sec1,sec2,sum2==sum1);
// Kokkos' reductions are deterministic.
// The results should always be equal.
printf("Result Left/Right %f Right/Left %f (equal result: %i)\n",sec1,sec2,sum2==sum1);
}
Kokkos::finalize();
}

View File

@ -99,43 +99,45 @@ struct localsum {
int main(int narg, char* arg[]) {
Kokkos::initialize (narg, arg);
int size = 1000000;
{
int size = 1000000;
idx_type idx("Idx",size,64);
idx_type_host h_idx = Kokkos::create_mirror_view (idx);
idx_type idx("Idx",size,64);
idx_type_host h_idx = Kokkos::create_mirror_view (idx);
view_type dest ("Dest", size);
view_type src ("Src", size);
view_type dest ("Dest", size);
view_type src ("Src", size);
srand(134231);
srand(134231);
for (int i = 0; i < size; i++) {
for (view_type::size_type j = 0; j < h_idx.extent(1); ++j) {
h_idx(i,j) = (size + i + (rand () % 500 - 250)) % size;
for (int i = 0; i < size; i++) {
for (view_type::size_type j = 0; j < h_idx.extent(1); ++j) {
h_idx(i,j) = (size + i + (rand () % 500 - 250)) % size;
}
}
// Deep copy the initial data to the device
Kokkos::deep_copy(idx,h_idx);
// Run the first kernel to warmup caches
Kokkos::parallel_for(size,localsum<view_type,view_type_rnd>(idx,dest,src));
Kokkos::fence();
// Run the localsum functor using the RandomAccess trait. On CPUs there should
// not be any difference in performance compared to not using the RandomAccess trait.
// On GPUs there can be a dramatic difference.
Kokkos::Timer time1;
Kokkos::parallel_for(size,localsum<view_type,view_type_rnd>(idx,dest,src));
Kokkos::fence();
double sec1 = time1.seconds();
Kokkos::Timer time2;
Kokkos::parallel_for(size,localsum<view_type,view_type>(idx,dest,src));
Kokkos::fence();
double sec2 = time2.seconds();
printf("Time with Trait RandomAccess: %f with Plain: %f \n",sec1,sec2);
}
// Deep copy the initial data to the device
Kokkos::deep_copy(idx,h_idx);
// Run the first kernel to warmup caches
Kokkos::parallel_for(size,localsum<view_type,view_type_rnd>(idx,dest,src));
Kokkos::fence();
// Run the localsum functor using the RandomAccess trait. On CPUs there should
// not be any difference in performance compared to not using the RandomAccess trait.
// On GPUs there can be a dramatic difference.
Kokkos::Timer time1;
Kokkos::parallel_for(size,localsum<view_type,view_type_rnd>(idx,dest,src));
Kokkos::fence();
double sec1 = time1.seconds();
Kokkos::Timer time2;
Kokkos::parallel_for(size,localsum<view_type,view_type>(idx,dest,src));
Kokkos::fence();
double sec2 = time2.seconds();
printf("Time with Trait RandomAccess: %f with Plain: %f \n",sec1,sec2);
Kokkos::finalize();
}

View File

@ -145,46 +145,48 @@ int main (int narg, char* arg[]) {
Kokkos::initialize (narg, arg);
// The number of mesh points along each dimension of the mesh, not
// including boundaries.
const size_type size = 100;
{
// The number of mesh points along each dimension of the mesh, not
// including boundaries.
const size_type size = 100;
// A is the full cubic 3-D mesh, including the boundaries.
mesh_type A ("A", size+2, size+2, size+2);
// Ai is the "inner" part of A, _not_ including the boundaries.
//
// A pair of indices in a particular dimension means the contiguous
// zero-based index range in that dimension, including the first
// entry of the pair but _not_ including the second entry.
inner_mesh_type Ai = subview(A, pair<size_type, size_type> (1, size+1),
pair<size_type, size_type> (1, size+1),
pair<size_type, size_type> (1, size+1));
// A has six boundaries, one for each face of the cube.
// Create a View of each of these boundaries.
// ALL() means "select all indices in that dimension."
xy_plane_type Zneg_halo = subview(A, ALL (), ALL (), 0);
xy_plane_type Zpos_halo = subview(A, ALL (), ALL (), 101);
xz_plane_type Yneg_halo = subview(A, ALL (), 0, ALL ());
xz_plane_type Ypos_halo = subview(A, ALL (), 101, ALL ());
yz_plane_type Xneg_halo = subview(A, 0, ALL (), ALL ());
yz_plane_type Xpos_halo = subview(A, 101, ALL (), ALL ());
// A is the full cubic 3-D mesh, including the boundaries.
mesh_type A ("A", size+2, size+2, size+2);
// Ai is the "inner" part of A, _not_ including the boundaries.
//
// A pair of indices in a particular dimension means the contiguous
// zero-based index range in that dimension, including the first
// entry of the pair but _not_ including the second entry.
inner_mesh_type Ai = subview(A, pair<size_type, size_type> (1, size+1),
pair<size_type, size_type> (1, size+1),
pair<size_type, size_type> (1, size+1));
// A has six boundaries, one for each face of the cube.
// Create a View of each of these boundaries.
// ALL() means "select all indices in that dimension."
xy_plane_type Zneg_halo = subview(A, ALL (), ALL (), 0);
xy_plane_type Zpos_halo = subview(A, ALL (), ALL (), 101);
xz_plane_type Yneg_halo = subview(A, ALL (), 0, ALL ());
xz_plane_type Ypos_halo = subview(A, ALL (), 101, ALL ());
yz_plane_type Xneg_halo = subview(A, 0, ALL (), ALL ());
yz_plane_type Xpos_halo = subview(A, 101, ALL (), ALL ());
// Set the boundaries to their initial conditions.
parallel_for (Zneg_halo.extent(0), set_boundary<xy_plane_type> (Zneg_halo, 1));
parallel_for (Zpos_halo.extent(0), set_boundary<xy_plane_type> (Zpos_halo, -1));
parallel_for (Yneg_halo.extent(0), set_boundary<xz_plane_type> (Yneg_halo, 2));
parallel_for (Ypos_halo.extent(0), set_boundary<xz_plane_type> (Ypos_halo, -2));
parallel_for (Xneg_halo.extent(0), set_boundary<yz_plane_type> (Xneg_halo, 3));
parallel_for (Xpos_halo.extent(0), set_boundary<yz_plane_type> (Xpos_halo, -3));
// Set the boundaries to their initial conditions.
parallel_for (Zneg_halo.extent(0), set_boundary<xy_plane_type> (Zneg_halo, 1));
parallel_for (Zpos_halo.extent(0), set_boundary<xy_plane_type> (Zpos_halo, -1));
parallel_for (Yneg_halo.extent(0), set_boundary<xz_plane_type> (Yneg_halo, 2));
parallel_for (Ypos_halo.extent(0), set_boundary<xz_plane_type> (Ypos_halo, -2));
parallel_for (Xneg_halo.extent(0), set_boundary<yz_plane_type> (Xneg_halo, 3));
parallel_for (Xpos_halo.extent(0), set_boundary<yz_plane_type> (Xpos_halo, -3));
// Set the interior of the mesh to its initial condition.
parallel_for (Ai.extent(0), set_inner<inner_mesh_type> (Ai, 0));
// Set the interior of the mesh to its initial condition.
parallel_for (Ai.extent(0), set_inner<inner_mesh_type> (Ai, 0));
// Update the interior of the mesh.
// This simulates one timestep with dt = 0.1.
parallel_for (Ai.extent(0), update<mesh_type> (A, 0.1));
// Update the interior of the mesh.
// This simulates one timestep with dt = 0.1.
parallel_for (Ai.extent(0), update<mesh_type> (A, 0.1));
printf ("Done\n");
printf ("Done\n");
}
Kokkos::finalize ();
}

View File

@ -83,58 +83,60 @@ struct localsum {
int main(int narg, char* arg[]) {
Kokkos::initialize(narg,arg);
int size = 1000000;
{
int size = 1000000;
// Create Views
idx_type idx("Idx",size,64);
view_type dest("Dest",size);
view_type src("Src",size);
// Create Views
idx_type idx("Idx",size,64);
view_type dest("Dest",size);
view_type src("Src",size);
srand(134231);
srand(134231);
Kokkos::fence();
Kokkos::fence();
// When using UVM, Cuda views can be accessed on the Host directly
for(int i=0; i<size; i++) {
for(int j=0; j<int(idx.extent(1)); j++)
idx(i,j) = (size + i + (rand()%500 - 250))%size;
// When using UVM, Cuda views can be accessed on the Host directly
for(int i=0; i<size; i++) {
for(int j=0; j<int(idx.extent(1)); j++)
idx(i,j) = (size + i + (rand()%500 - 250))%size;
}
Kokkos::fence();
// Run on the device
// This will cause a sync of idx to the device since it was modified on the host
Kokkos::Timer timer;
Kokkos::parallel_for(size,localsum<view_type::execution_space>(idx,dest,src));
Kokkos::fence();
double sec1_dev = timer.seconds();
// No data transfer will happen now, since nothing is accessed on the host
timer.reset();
Kokkos::parallel_for(size,localsum<view_type::execution_space>(idx,dest,src));
Kokkos::fence();
double sec2_dev = timer.seconds();
// Run on the host
// This will cause a sync back to the host of dest which was changed on the device
// Compare runtime here with the dual_view example: dest will be copied back in 4k blocks
// when they are accessed the first time during the parallel_for. Due to the latency of a memcpy
// this gives lower effective bandwidth than doing a manual copy via dual views
timer.reset();
Kokkos::parallel_for(size,localsum<Kokkos::HostSpace::execution_space>(idx,dest,src));
Kokkos::fence();
double sec1_host = timer.seconds();
// No data transfers will happen now
timer.reset();
Kokkos::parallel_for(size,localsum<Kokkos::HostSpace::execution_space>(idx,dest,src));
Kokkos::fence();
double sec2_host = timer.seconds();
printf("Device Time with Sync: %e without Sync: %e \n",sec1_dev,sec2_dev);
printf("Host Time with Sync: %e without Sync: %e \n",sec1_host,sec2_host);
}
Kokkos::fence();
// Run on the device
// This will cause a sync of idx to the device since it was modified on the host
Kokkos::Timer timer;
Kokkos::parallel_for(size,localsum<view_type::execution_space>(idx,dest,src));
Kokkos::fence();
double sec1_dev = timer.seconds();
// No data transfer will happen now, since nothing is accessed on the host
timer.reset();
Kokkos::parallel_for(size,localsum<view_type::execution_space>(idx,dest,src));
Kokkos::fence();
double sec2_dev = timer.seconds();
// Run on the host
// This will cause a sync back to the host of dest which was changed on the device
// Compare runtime here with the dual_view example: dest will be copied back in 4k blocks
// when they are accessed the first time during the parallel_for. Due to the latency of a memcpy
// this gives lower effective bandwidth than doing a manual copy via dual views
timer.reset();
Kokkos::parallel_for(size,localsum<Kokkos::HostSpace::execution_space>(idx,dest,src));
Kokkos::fence();
double sec1_host = timer.seconds();
// No data transfers will happen now
timer.reset();
Kokkos::parallel_for(size,localsum<Kokkos::HostSpace::execution_space>(idx,dest,src));
Kokkos::fence();
double sec2_host = timer.seconds();
printf("Device Time with Sync: %e without Sync: %e \n",sec1_dev,sec2_dev);
printf("Host Time with Sync: %e without Sync: %e \n",sec1_host,sec2_host);
Kokkos::finalize();
}