Update Kokkos library to v2.7.00
This commit is contained in:
@ -127,44 +127,46 @@ int main (int narg, char* arg[]) {
|
||||
// arguments from the list that start with '--kokkos-'.
|
||||
Kokkos::initialize (narg, arg);
|
||||
|
||||
int size = 10000;
|
||||
view_type a("A",size);
|
||||
{
|
||||
int size = 10000;
|
||||
view_type a("A",size);
|
||||
|
||||
// Define two views with LayoutLeft and LayoutRight.
|
||||
left_type l("L",size,10000);
|
||||
right_type r("R",size,10000);
|
||||
// Define two views with LayoutLeft and LayoutRight.
|
||||
left_type l("L",size,10000);
|
||||
right_type r("R",size,10000);
|
||||
|
||||
// Initialize the data in the views.
|
||||
Kokkos::parallel_for(size,init_view<left_type>(l));
|
||||
Kokkos::parallel_for(size,init_view<right_type>(r));
|
||||
Kokkos::fence();
|
||||
// Initialize the data in the views.
|
||||
Kokkos::parallel_for(size,init_view<left_type>(l));
|
||||
Kokkos::parallel_for(size,init_view<right_type>(r));
|
||||
Kokkos::fence();
|
||||
|
||||
// Measure time to execute the contraction kernel when giving it a
|
||||
// LayoutLeft view for v1 and a LayoutRight view for v2. This should be
|
||||
// fast on GPUs and slow on CPUs
|
||||
Kokkos::Timer time1;
|
||||
Kokkos::parallel_for(size,contraction<left_type,right_type>(a,l,r));
|
||||
Kokkos::fence();
|
||||
double sec1 = time1.seconds();
|
||||
// Measure time to execute the contraction kernel when giving it a
|
||||
// LayoutLeft view for v1 and a LayoutRight view for v2. This should be
|
||||
// fast on GPUs and slow on CPUs
|
||||
Kokkos::Timer time1;
|
||||
Kokkos::parallel_for(size,contraction<left_type,right_type>(a,l,r));
|
||||
Kokkos::fence();
|
||||
double sec1 = time1.seconds();
|
||||
|
||||
double sum1 = 0;
|
||||
Kokkos::parallel_reduce(size,dot(a),sum1);
|
||||
Kokkos::fence();
|
||||
double sum1 = 0;
|
||||
Kokkos::parallel_reduce(size,dot(a),sum1);
|
||||
Kokkos::fence();
|
||||
|
||||
// Measure time to execute the contraction kernel when giving it a
|
||||
// LayoutRight view for v1 and a LayoutLeft view for v2. This should be
|
||||
// fast on CPUs and slow on GPUs
|
||||
Kokkos::Timer time2;
|
||||
Kokkos::parallel_for(size,contraction<right_type,left_type>(a,r,l));
|
||||
Kokkos::fence();
|
||||
double sec2 = time2.seconds();
|
||||
// Measure time to execute the contraction kernel when giving it a
|
||||
// LayoutRight view for v1 and a LayoutLeft view for v2. This should be
|
||||
// fast on CPUs and slow on GPUs
|
||||
Kokkos::Timer time2;
|
||||
Kokkos::parallel_for(size,contraction<right_type,left_type>(a,r,l));
|
||||
Kokkos::fence();
|
||||
double sec2 = time2.seconds();
|
||||
|
||||
double sum2 = 0;
|
||||
Kokkos::parallel_reduce(size,dot(a),sum2);
|
||||
double sum2 = 0;
|
||||
Kokkos::parallel_reduce(size,dot(a),sum2);
|
||||
|
||||
// Kokkos' reductions are deterministic.
|
||||
// The results should always be equal.
|
||||
printf("Result Left/Right %f Right/Left %f (equal result: %i)\n",sec1,sec2,sum2==sum1);
|
||||
// Kokkos' reductions are deterministic.
|
||||
// The results should always be equal.
|
||||
printf("Result Left/Right %f Right/Left %f (equal result: %i)\n",sec1,sec2,sum2==sum1);
|
||||
}
|
||||
|
||||
Kokkos::finalize();
|
||||
}
|
||||
|
||||
@ -99,43 +99,45 @@ struct localsum {
|
||||
int main(int narg, char* arg[]) {
|
||||
Kokkos::initialize (narg, arg);
|
||||
|
||||
int size = 1000000;
|
||||
{
|
||||
int size = 1000000;
|
||||
|
||||
idx_type idx("Idx",size,64);
|
||||
idx_type_host h_idx = Kokkos::create_mirror_view (idx);
|
||||
idx_type idx("Idx",size,64);
|
||||
idx_type_host h_idx = Kokkos::create_mirror_view (idx);
|
||||
|
||||
view_type dest ("Dest", size);
|
||||
view_type src ("Src", size);
|
||||
view_type dest ("Dest", size);
|
||||
view_type src ("Src", size);
|
||||
|
||||
srand(134231);
|
||||
srand(134231);
|
||||
|
||||
for (int i = 0; i < size; i++) {
|
||||
for (view_type::size_type j = 0; j < h_idx.extent(1); ++j) {
|
||||
h_idx(i,j) = (size + i + (rand () % 500 - 250)) % size;
|
||||
for (int i = 0; i < size; i++) {
|
||||
for (view_type::size_type j = 0; j < h_idx.extent(1); ++j) {
|
||||
h_idx(i,j) = (size + i + (rand () % 500 - 250)) % size;
|
||||
}
|
||||
}
|
||||
|
||||
// Deep copy the initial data to the device
|
||||
Kokkos::deep_copy(idx,h_idx);
|
||||
// Run the first kernel to warmup caches
|
||||
Kokkos::parallel_for(size,localsum<view_type,view_type_rnd>(idx,dest,src));
|
||||
Kokkos::fence();
|
||||
|
||||
// Run the localsum functor using the RandomAccess trait. On CPUs there should
|
||||
// not be any difference in performance compared to not using the RandomAccess trait.
|
||||
// On GPUs there can be a dramatic difference
|
||||
Kokkos::Timer time1;
|
||||
Kokkos::parallel_for(size,localsum<view_type,view_type_rnd>(idx,dest,src));
|
||||
Kokkos::fence();
|
||||
double sec1 = time1.seconds();
|
||||
|
||||
Kokkos::Timer time2;
|
||||
Kokkos::parallel_for(size,localsum<view_type,view_type>(idx,dest,src));
|
||||
Kokkos::fence();
|
||||
double sec2 = time2.seconds();
|
||||
|
||||
printf("Time with Trait RandomAccess: %f with Plain: %f \n",sec1,sec2);
|
||||
}
|
||||
|
||||
// Deep copy the initial data to the device
|
||||
Kokkos::deep_copy(idx,h_idx);
|
||||
// Run the first kernel to warmup caches
|
||||
Kokkos::parallel_for(size,localsum<view_type,view_type_rnd>(idx,dest,src));
|
||||
Kokkos::fence();
|
||||
|
||||
// Run the localsum functor using the RandomAccess trait. On CPUs there should
|
||||
// not be any difference in performance compared to not using the RandomAccess trait.
|
||||
// On GPUs there can be a dramatic difference
|
||||
Kokkos::Timer time1;
|
||||
Kokkos::parallel_for(size,localsum<view_type,view_type_rnd>(idx,dest,src));
|
||||
Kokkos::fence();
|
||||
double sec1 = time1.seconds();
|
||||
|
||||
Kokkos::Timer time2;
|
||||
Kokkos::parallel_for(size,localsum<view_type,view_type>(idx,dest,src));
|
||||
Kokkos::fence();
|
||||
double sec2 = time2.seconds();
|
||||
|
||||
printf("Time with Trait RandomAccess: %f with Plain: %f \n",sec1,sec2);
|
||||
|
||||
Kokkos::finalize();
|
||||
}
|
||||
|
||||
|
||||
@ -145,46 +145,48 @@ int main (int narg, char* arg[]) {
|
||||
|
||||
Kokkos::initialize (narg, arg);
|
||||
|
||||
// The number of mesh points along each dimension of the mesh, not
|
||||
// including boundaries.
|
||||
const size_type size = 100;
|
||||
{
|
||||
// The number of mesh points along each dimension of the mesh, not
|
||||
// including boundaries.
|
||||
const size_type size = 100;
|
||||
|
||||
// A is the full cubic 3-D mesh, including the boundaries.
|
||||
mesh_type A ("A", size+2, size+2, size+2);
|
||||
// Ai is the "inner" part of A, _not_ including the boundaries.
|
||||
//
|
||||
// A pair of indices in a particular dimension means the contiguous
|
||||
// zero-based index range in that dimension, including the first
|
||||
// entry of the pair but _not_ including the second entry.
|
||||
inner_mesh_type Ai = subview(A, pair<size_type, size_type> (1, size+1),
|
||||
pair<size_type, size_type> (1, size+1),
|
||||
pair<size_type, size_type> (1, size+1));
|
||||
// A has six boundaries, one for each face of the cube.
|
||||
// Create a View of each of these boundaries.
|
||||
// ALL() means "select all indices in that dimension."
|
||||
xy_plane_type Zneg_halo = subview(A, ALL (), ALL (), 0);
|
||||
xy_plane_type Zpos_halo = subview(A, ALL (), ALL (), 101);
|
||||
xz_plane_type Yneg_halo = subview(A, ALL (), 0, ALL ());
|
||||
xz_plane_type Ypos_halo = subview(A, ALL (), 101, ALL ());
|
||||
yz_plane_type Xneg_halo = subview(A, 0, ALL (), ALL ());
|
||||
yz_plane_type Xpos_halo = subview(A, 101, ALL (), ALL ());
|
||||
// A is the full cubic 3-D mesh, including the boundaries.
|
||||
mesh_type A ("A", size+2, size+2, size+2);
|
||||
// Ai is the "inner" part of A, _not_ including the boundaries.
|
||||
//
|
||||
// A pair of indices in a particular dimension means the contiguous
|
||||
// zero-based index range in that dimension, including the first
|
||||
// entry of the pair but _not_ including the second entry.
|
||||
inner_mesh_type Ai = subview(A, pair<size_type, size_type> (1, size+1),
|
||||
pair<size_type, size_type> (1, size+1),
|
||||
pair<size_type, size_type> (1, size+1));
|
||||
// A has six boundaries, one for each face of the cube.
|
||||
// Create a View of each of these boundaries.
|
||||
// ALL() means "select all indices in that dimension."
|
||||
xy_plane_type Zneg_halo = subview(A, ALL (), ALL (), 0);
|
||||
xy_plane_type Zpos_halo = subview(A, ALL (), ALL (), 101);
|
||||
xz_plane_type Yneg_halo = subview(A, ALL (), 0, ALL ());
|
||||
xz_plane_type Ypos_halo = subview(A, ALL (), 101, ALL ());
|
||||
yz_plane_type Xneg_halo = subview(A, 0, ALL (), ALL ());
|
||||
yz_plane_type Xpos_halo = subview(A, 101, ALL (), ALL ());
|
||||
|
||||
// Set the boundaries to their initial conditions.
|
||||
parallel_for (Zneg_halo.extent(0), set_boundary<xy_plane_type> (Zneg_halo, 1));
|
||||
parallel_for (Zpos_halo.extent(0), set_boundary<xy_plane_type> (Zpos_halo, -1));
|
||||
parallel_for (Yneg_halo.extent(0), set_boundary<xz_plane_type> (Yneg_halo, 2));
|
||||
parallel_for (Ypos_halo.extent(0), set_boundary<xz_plane_type> (Ypos_halo, -2));
|
||||
parallel_for (Xneg_halo.extent(0), set_boundary<yz_plane_type> (Xneg_halo, 3));
|
||||
parallel_for (Xpos_halo.extent(0), set_boundary<yz_plane_type> (Xpos_halo, -3));
|
||||
// Set the boundaries to their initial conditions.
|
||||
parallel_for (Zneg_halo.extent(0), set_boundary<xy_plane_type> (Zneg_halo, 1));
|
||||
parallel_for (Zpos_halo.extent(0), set_boundary<xy_plane_type> (Zpos_halo, -1));
|
||||
parallel_for (Yneg_halo.extent(0), set_boundary<xz_plane_type> (Yneg_halo, 2));
|
||||
parallel_for (Ypos_halo.extent(0), set_boundary<xz_plane_type> (Ypos_halo, -2));
|
||||
parallel_for (Xneg_halo.extent(0), set_boundary<yz_plane_type> (Xneg_halo, 3));
|
||||
parallel_for (Xpos_halo.extent(0), set_boundary<yz_plane_type> (Xpos_halo, -3));
|
||||
|
||||
// Set the interior of the mesh to its initial condition.
|
||||
parallel_for (Ai.extent(0), set_inner<inner_mesh_type> (Ai, 0));
|
||||
// Set the interior of the mesh to its initial condition.
|
||||
parallel_for (Ai.extent(0), set_inner<inner_mesh_type> (Ai, 0));
|
||||
|
||||
// Update the interior of the mesh.
|
||||
// This simulates one timestep with dt = 0.1.
|
||||
parallel_for (Ai.extent(0), update<mesh_type> (A, 0.1));
|
||||
// Update the interior of the mesh.
|
||||
// This simulates one timestep with dt = 0.1.
|
||||
parallel_for (Ai.extent(0), update<mesh_type> (A, 0.1));
|
||||
|
||||
printf ("Done\n");
|
||||
printf ("Done\n");
|
||||
}
|
||||
Kokkos::finalize ();
|
||||
}
|
||||
|
||||
|
||||
@ -83,58 +83,60 @@ struct localsum {
|
||||
int main(int narg, char* arg[]) {
|
||||
Kokkos::initialize(narg,arg);
|
||||
|
||||
int size = 1000000;
|
||||
{
|
||||
int size = 1000000;
|
||||
|
||||
// Create Views
|
||||
idx_type idx("Idx",size,64);
|
||||
view_type dest("Dest",size);
|
||||
view_type src("Src",size);
|
||||
// Create Views
|
||||
idx_type idx("Idx",size,64);
|
||||
view_type dest("Dest",size);
|
||||
view_type src("Src",size);
|
||||
|
||||
srand(134231);
|
||||
srand(134231);
|
||||
|
||||
Kokkos::fence();
|
||||
Kokkos::fence();
|
||||
|
||||
// When using UVM Cuda views can be accessed on the Host directly
|
||||
for(int i=0; i<size; i++) {
|
||||
for(int j=0; j<int(idx.extent(1)); j++)
|
||||
idx(i,j) = (size + i + (rand()%500 - 250))%size;
|
||||
// When using UVM Cuda views can be accessed on the Host directly
|
||||
for(int i=0; i<size; i++) {
|
||||
for(int j=0; j<int(idx.extent(1)); j++)
|
||||
idx(i,j) = (size + i + (rand()%500 - 250))%size;
|
||||
}
|
||||
|
||||
Kokkos::fence();
|
||||
// Run on the device
|
||||
// This will cause a sync of idx to the device since it was modified on the host
|
||||
Kokkos::Timer timer;
|
||||
Kokkos::parallel_for(size,localsum<view_type::execution_space>(idx,dest,src));
|
||||
Kokkos::fence();
|
||||
double sec1_dev = timer.seconds();
|
||||
|
||||
// No data transfer will happen now, since nothing is accessed on the host
|
||||
timer.reset();
|
||||
Kokkos::parallel_for(size,localsum<view_type::execution_space>(idx,dest,src));
|
||||
Kokkos::fence();
|
||||
double sec2_dev = timer.seconds();
|
||||
|
||||
// Run on the host
|
||||
// This will cause a sync back to the host of dest which was changed on the device
|
||||
// Compare runtime here with the dual_view example: dest will be copied back in 4k blocks
|
||||
// when they are accessed the first time during the parallel_for. Due to the latency of a memcpy
|
||||
// this gives lower effective bandwidth than when doing a manual copy via dual views
|
||||
timer.reset();
|
||||
Kokkos::parallel_for(size,localsum<Kokkos::HostSpace::execution_space>(idx,dest,src));
|
||||
Kokkos::fence();
|
||||
double sec1_host = timer.seconds();
|
||||
|
||||
// No data transfers will happen now
|
||||
timer.reset();
|
||||
Kokkos::parallel_for(size,localsum<Kokkos::HostSpace::execution_space>(idx,dest,src));
|
||||
Kokkos::fence();
|
||||
double sec2_host = timer.seconds();
|
||||
|
||||
|
||||
|
||||
printf("Device Time with Sync: %e without Sync: %e \n",sec1_dev,sec2_dev);
|
||||
printf("Host Time with Sync: %e without Sync: %e \n",sec1_host,sec2_host);
|
||||
}
|
||||
|
||||
Kokkos::fence();
|
||||
// Run on the device
|
||||
// This will cause a sync of idx to the device since it was modified on the host
|
||||
Kokkos::Timer timer;
|
||||
Kokkos::parallel_for(size,localsum<view_type::execution_space>(idx,dest,src));
|
||||
Kokkos::fence();
|
||||
double sec1_dev = timer.seconds();
|
||||
|
||||
// No data transfer will happen now, since nothing is accessed on the host
|
||||
timer.reset();
|
||||
Kokkos::parallel_for(size,localsum<view_type::execution_space>(idx,dest,src));
|
||||
Kokkos::fence();
|
||||
double sec2_dev = timer.seconds();
|
||||
|
||||
// Run on the host
|
||||
// This will cause a sync back to the host of dest which was changed on the device
|
||||
// Compare runtime here with the dual_view example: dest will be copied back in 4k blocks
|
||||
// when they are accessed the first time during the parallel_for. Due to the latency of a memcpy
|
||||
// this gives lower effective bandwidth than when doing a manual copy via dual views
|
||||
timer.reset();
|
||||
Kokkos::parallel_for(size,localsum<Kokkos::HostSpace::execution_space>(idx,dest,src));
|
||||
Kokkos::fence();
|
||||
double sec1_host = timer.seconds();
|
||||
|
||||
// No data transfers will happen now
|
||||
timer.reset();
|
||||
Kokkos::parallel_for(size,localsum<Kokkos::HostSpace::execution_space>(idx,dest,src));
|
||||
Kokkos::fence();
|
||||
double sec2_host = timer.seconds();
|
||||
|
||||
|
||||
|
||||
printf("Device Time with Sync: %e without Sync: %e \n",sec1_dev,sec2_dev);
|
||||
printf("Host Time with Sync: %e without Sync: %e \n",sec1_host,sec2_host);
|
||||
|
||||
Kokkos::finalize();
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user