Updating kokkos lib
git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@14918 f3b2605a-c512-4ea7-a41b-209d697bcdaa
This commit is contained in:
@ -1,452 +0,0 @@
|
||||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
//
|
||||
// 1. Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
//
|
||||
// 2. Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
//
|
||||
// 3. Neither the name of the Corporation nor the names of the
|
||||
// contributors may be used to endorse or promote products derived from
|
||||
// this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
|
||||
#ifndef EXPLICIT_DRIVER_HPP
|
||||
#define EXPLICIT_DRIVER_HPP
|
||||
|
||||
#include <sys/time.h>
|
||||
#include <iostream>
|
||||
#include <iomanip>
|
||||
#include <cstdlib>
|
||||
#include <cmath>
|
||||
|
||||
#include <impl/Kokkos_Timer.hpp>
|
||||
|
||||
#include <ExplicitFunctors.hpp>
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
namespace Explicit {
|
||||
|
||||
struct PerformanceData {
|
||||
double mesh_time ;
|
||||
double init_time ;
|
||||
double internal_force_time ;
|
||||
double central_diff ;
|
||||
double comm_time ;
|
||||
size_t number_of_steps ;
|
||||
|
||||
PerformanceData()
|
||||
: mesh_time(0)
|
||||
, init_time(0)
|
||||
, internal_force_time(0)
|
||||
, central_diff(0)
|
||||
, comm_time(0)
|
||||
, number_of_steps(0)
|
||||
{}
|
||||
|
||||
void best( const PerformanceData & rhs )
|
||||
{
|
||||
if ( rhs.mesh_time < mesh_time ) mesh_time = rhs.mesh_time ;
|
||||
if ( rhs.init_time < init_time ) init_time = rhs.init_time ;
|
||||
if ( rhs.internal_force_time < internal_force_time ) internal_force_time = rhs.internal_force_time ;
|
||||
if ( rhs.central_diff < central_diff ) central_diff = rhs.central_diff ;
|
||||
if ( rhs.comm_time < comm_time ) comm_time = rhs.comm_time ;
|
||||
}
|
||||
};
|
||||
|
||||
template< typename Scalar , class FixtureType >
|
||||
PerformanceData run( const typename FixtureType::FEMeshType & mesh ,
|
||||
const int global_max_x ,
|
||||
const int global_max_y ,
|
||||
const int global_max_z ,
|
||||
const int steps ,
|
||||
const int print_sample )
|
||||
{
|
||||
typedef Scalar scalar_type ;
|
||||
typedef FixtureType fixture_type ;
|
||||
typedef typename fixture_type::execution_space execution_space ;
|
||||
//typedef typename fixture_type::FEMeshType mesh_type ; // unused
|
||||
|
||||
enum { ElementNodeCount = fixture_type::element_node_count };
|
||||
|
||||
const int NumStates = 2;
|
||||
|
||||
const int total_num_steps = steps ;
|
||||
|
||||
const Scalar user_dt = 5.0e-6;
|
||||
//const Scalar end_time = 0.0050;
|
||||
|
||||
// element block parameters
|
||||
const Scalar lin_bulk_visc = 0.0;
|
||||
const Scalar quad_bulk_visc = 0.0;
|
||||
|
||||
// const Scalar lin_bulk_visc = 0.06;
|
||||
// const Scalar quad_bulk_visc = 1.2;
|
||||
// const Scalar hg_stiffness = 0.0;
|
||||
// const Scalar hg_viscosity = 0.0;
|
||||
// const Scalar hg_stiffness = 0.03;
|
||||
// const Scalar hg_viscosity = 0.001;
|
||||
|
||||
// material properties
|
||||
const Scalar youngs_modulus=1.0e6;
|
||||
const Scalar poissons_ratio=0.0;
|
||||
const Scalar density = 8.0e-4;
|
||||
|
||||
const comm::Machine machine = mesh.parallel_data_map.machine ;
|
||||
|
||||
PerformanceData perf_data ;
|
||||
|
||||
Kokkos::Impl::Timer wall_clock ;
|
||||
|
||||
//------------------------------------
|
||||
// Generate fields
|
||||
|
||||
typedef Fields< scalar_type , execution_space > fields_type ;
|
||||
|
||||
fields_type mesh_fields( mesh ,
|
||||
lin_bulk_visc ,
|
||||
quad_bulk_visc ,
|
||||
youngs_modulus ,
|
||||
poissons_ratio ,
|
||||
density );
|
||||
|
||||
typename fields_type::node_coords_type::HostMirror
|
||||
model_coords_h = Kokkos::create_mirror( mesh_fields.model_coords );
|
||||
|
||||
typename fields_type::geom_state_array_type::HostMirror
|
||||
displacement_h = Kokkos::create_mirror( mesh_fields.displacement );
|
||||
|
||||
typename fields_type::geom_state_array_type::HostMirror
|
||||
velocity_h = Kokkos::create_mirror( mesh_fields.velocity );
|
||||
|
||||
Kokkos::deep_copy( model_coords_h , mesh_fields.model_coords );
|
||||
|
||||
//------------------------------------
|
||||
// Initialization
|
||||
|
||||
initialize_element<Scalar,execution_space>::apply( mesh_fields );
|
||||
initialize_node< Scalar,execution_space>::apply( mesh_fields );
|
||||
|
||||
const Scalar x_bc = global_max_x ;
|
||||
|
||||
// Initial condition on velocity to initiate a pulse along the X axis
|
||||
{
|
||||
const unsigned X = 0;
|
||||
for (int inode = 0; inode< mesh_fields.num_nodes; ++inode) {
|
||||
if ( model_coords_h(inode,X) == 0) {
|
||||
velocity_h(inode,X,0) = 1.0e3;
|
||||
velocity_h(inode,X,1) = 1.0e3;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Kokkos::deep_copy( mesh_fields.velocity , velocity_h );
|
||||
|
||||
//--------------------------------------------------------------------------
|
||||
// We will call a sequence of functions. These functions have been
|
||||
// grouped into several functors to balance the number of global memory
|
||||
// accesses versus requiring too many registers or too much L1 cache.
|
||||
// Global memory accees have read/write cost and memory subsystem contention cost.
|
||||
//--------------------------------------------------------------------------
|
||||
|
||||
perf_data.init_time = comm::max( machine , wall_clock.seconds() );
|
||||
|
||||
// Parameters required for the internal force computations.
|
||||
|
||||
int current_state = 0;
|
||||
int previous_state = 0;
|
||||
int next_state = 0;
|
||||
|
||||
perf_data.number_of_steps = total_num_steps ;
|
||||
|
||||
#if defined( KOKKOS_HAVE_MPI )
|
||||
|
||||
typedef typename
|
||||
fields_type::geom_state_array_type::value_type comm_value_type ;
|
||||
|
||||
const unsigned comm_value_count = 6 ;
|
||||
|
||||
Kokkos::AsyncExchange< comm_value_type , execution_space ,
|
||||
Kokkos::ParallelDataMap >
|
||||
comm_exchange( mesh.parallel_data_map , comm_value_count );
|
||||
|
||||
#endif
|
||||
|
||||
for (int step = 0; step < total_num_steps; ++step) {
|
||||
|
||||
wall_clock.reset();
|
||||
|
||||
//------------------------------------------------------------------------
|
||||
#if defined( KOKKOS_HAVE_MPI )
|
||||
{
|
||||
// Communicate "send" nodes' displacement and velocity next_state
|
||||
// to the ghosted nodes.
|
||||
// buffer packages: { { dx , dy , dz , vx , vy , vz }_node }
|
||||
|
||||
pack_state< Scalar , execution_space >
|
||||
::apply( comm_exchange.buffer() ,
|
||||
mesh.parallel_data_map.count_interior ,
|
||||
mesh.parallel_data_map.count_send ,
|
||||
mesh_fields , next_state );
|
||||
|
||||
comm_exchange.setup();
|
||||
|
||||
comm_exchange.send_receive();
|
||||
|
||||
unpack_state< Scalar , execution_space >
|
||||
::apply( mesh_fields , next_state ,
|
||||
comm_exchange.buffer() ,
|
||||
mesh.parallel_data_map.count_owned ,
|
||||
mesh.parallel_data_map.count_receive );
|
||||
|
||||
execution_space::fence();
|
||||
}
|
||||
#endif
|
||||
|
||||
perf_data.comm_time += comm::max( machine , wall_clock.seconds() );
|
||||
|
||||
//------------------------------------------------------------------------
|
||||
// rotate the states
|
||||
|
||||
previous_state = current_state;
|
||||
current_state = next_state;
|
||||
++next_state;
|
||||
next_state %= NumStates;
|
||||
|
||||
wall_clock.reset();
|
||||
|
||||
// First kernel 'grad_hgop' combines two functions:
|
||||
// gradient, velocity gradient
|
||||
grad< Scalar , execution_space >::apply( mesh_fields ,
|
||||
current_state ,
|
||||
previous_state );
|
||||
|
||||
// Combine tensor decomposition and rotation functions.
|
||||
decomp_rotate< Scalar , execution_space >::apply( mesh_fields ,
|
||||
current_state ,
|
||||
previous_state );
|
||||
|
||||
internal_force< Scalar , execution_space >::apply( mesh_fields ,
|
||||
user_dt ,
|
||||
current_state );
|
||||
|
||||
execution_space::fence();
|
||||
|
||||
perf_data.internal_force_time +=
|
||||
comm::max( machine , wall_clock.seconds() );
|
||||
|
||||
wall_clock.reset();
|
||||
|
||||
// Assembly of elements' contributions to nodal force into
|
||||
// a nodal force vector. Update the accelerations, velocities,
|
||||
// displacements.
|
||||
// The same pattern can be used for matrix-free residual computations.
|
||||
nodal_step< Scalar , execution_space >::apply( mesh_fields ,
|
||||
x_bc ,
|
||||
current_state,
|
||||
next_state );
|
||||
execution_space::fence();
|
||||
|
||||
perf_data.central_diff +=
|
||||
comm::max( machine , wall_clock.seconds() );
|
||||
|
||||
if ( print_sample && 0 == step % 100 ) {
|
||||
Kokkos::deep_copy( displacement_h , mesh_fields.displacement );
|
||||
Kokkos::deep_copy( velocity_h , mesh_fields.velocity );
|
||||
|
||||
if ( 1 == print_sample ) {
|
||||
|
||||
std::cout << "step " << step
|
||||
<< " : displacement(*,0,0) =" ;
|
||||
for ( int i = 0 ; i < mesh_fields.num_nodes_owned ; ++i ) {
|
||||
if ( model_coords_h(i,1) == 0 && model_coords_h(i,2) == 0 ) {
|
||||
std::cout << " " << displacement_h(i,0,next_state);
|
||||
}
|
||||
}
|
||||
std::cout << std::endl ;
|
||||
|
||||
const float tol = 1.0e-6 ;
|
||||
const int yb = global_max_y ;
|
||||
const int zb = global_max_z ;
|
||||
std::cout << "step " << step
|
||||
<< " : displacement(*," << yb << "," << zb << ") =" ;
|
||||
for ( int i = 0 ; i < mesh_fields.num_nodes_owned ; ++i ) {
|
||||
if ( fabs( model_coords_h(i,1) - yb ) < tol &&
|
||||
fabs( model_coords_h(i,2) - zb ) < tol ) {
|
||||
std::cout << " " << displacement_h(i,0,next_state);
|
||||
}
|
||||
}
|
||||
std::cout << std::endl ;
|
||||
}
|
||||
else if ( 2 == print_sample ) {
|
||||
|
||||
const float tol = 1.0e-6 ;
|
||||
const int xb = global_max_x / 2 ;
|
||||
const int yb = global_max_y / 2 ;
|
||||
const int zb = global_max_z / 2 ;
|
||||
|
||||
for ( int i = 0 ; i < mesh_fields.num_nodes_owned ; ++i ) {
|
||||
if ( fabs( model_coords_h(i,0) - xb ) < tol &&
|
||||
fabs( model_coords_h(i,1) - yb ) < tol &&
|
||||
fabs( model_coords_h(i,2) - zb ) < tol ) {
|
||||
std::cout << "step " << step
|
||||
<< " : displacement("
|
||||
<< xb << "," << yb << "," << zb << ") = {"
|
||||
<< std::setprecision(6)
|
||||
<< " " << displacement_h(i,0,next_state)
|
||||
<< std::setprecision(2)
|
||||
<< " " << displacement_h(i,1,next_state)
|
||||
<< std::setprecision(2)
|
||||
<< " " << displacement_h(i,2,next_state)
|
||||
<< " }" << std::endl ;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return perf_data ;
|
||||
}
|
||||
|
||||
|
||||
template <typename Scalar, typename Device>
|
||||
static void driver( const char * const label ,
|
||||
comm::Machine machine ,
|
||||
const int gang_count ,
|
||||
const int elem_count_beg ,
|
||||
const int elem_count_end ,
|
||||
const int runs )
|
||||
{
|
||||
typedef Scalar scalar_type ;
|
||||
typedef Device execution_space ;
|
||||
typedef double coordinate_scalar_type ;
|
||||
typedef FixtureElementHex8 fixture_element_type ;
|
||||
|
||||
typedef BoxMeshFixture< coordinate_scalar_type ,
|
||||
execution_space ,
|
||||
fixture_element_type > fixture_type ;
|
||||
|
||||
typedef typename fixture_type::FEMeshType mesh_type ;
|
||||
|
||||
const size_t proc_count = comm::size( machine );
|
||||
const size_t proc_rank = comm::rank( machine );
|
||||
|
||||
const int space = 15 ;
|
||||
const int steps = 1000 ;
|
||||
const int print_sample = 0 ;
|
||||
|
||||
if ( comm::rank( machine ) == 0 ) {
|
||||
|
||||
std::cout << std::endl ;
|
||||
std::cout << "\"MiniExplicitDynamics with Kokkos " << label
|
||||
<< " time_steps(" << steps << ")"
|
||||
<< "\"" << std::endl;
|
||||
std::cout << std::left << std::setw(space) << "\"Element\" , ";
|
||||
std::cout << std::left << std::setw(space) << "\"Node\" , ";
|
||||
std::cout << std::left << std::setw(space) << "\"Initialize\" , ";
|
||||
std::cout << std::left << std::setw(space) << "\"ElemForce\" , ";
|
||||
std::cout << std::left << std::setw(space) << "\"NodeUpdate\" , ";
|
||||
std::cout << std::left << std::setw(space) << "\"NodeComm\" , ";
|
||||
std::cout << std::left << std::setw(space) << "\"Time/Elem\" , ";
|
||||
std::cout << std::left << std::setw(space) << "\"Time/Node\"";
|
||||
|
||||
std::cout << std::endl;
|
||||
|
||||
std::cout << std::left << std::setw(space) << "\"count\" , ";
|
||||
std::cout << std::left << std::setw(space) << "\"count\" , ";
|
||||
std::cout << std::left << std::setw(space) << "\"microsec\" , ";
|
||||
std::cout << std::left << std::setw(space) << "\"microsec\" , ";
|
||||
std::cout << std::left << std::setw(space) << "\"microsec\" , ";
|
||||
std::cout << std::left << std::setw(space) << "\"microsec\" , ";
|
||||
std::cout << std::left << std::setw(space) << "\"microsec\" , ";
|
||||
std::cout << std::left << std::setw(space) << "\"microsec\"";
|
||||
|
||||
std::cout << std::endl;
|
||||
}
|
||||
|
||||
for(int i = elem_count_beg ; i < elem_count_end ; i *= 2 )
|
||||
{
|
||||
const int iz = std::max( 1 , (int) cbrt( ((double) i) / 2.0 ) );
|
||||
const int iy = iz + 1 ;
|
||||
const int ix = 2 * iy ;
|
||||
const int nelem = ix * iy * iz ;
|
||||
const int nnode = ( ix + 1 ) * ( iy + 1 ) * ( iz + 1 );
|
||||
|
||||
mesh_type mesh =
|
||||
fixture_type::create( proc_count , proc_rank , gang_count ,
|
||||
ix , iy , iz );
|
||||
|
||||
mesh.parallel_data_map.machine = machine ;
|
||||
|
||||
PerformanceData perf , best ;
|
||||
|
||||
for(int j = 0; j < runs; j++){
|
||||
|
||||
perf = run<scalar_type,fixture_type>(mesh,ix,iy,iz,steps,print_sample);
|
||||
|
||||
if( j == 0 ) {
|
||||
best = perf ;
|
||||
}
|
||||
else {
|
||||
best.best( perf );
|
||||
}
|
||||
}
|
||||
|
||||
if ( comm::rank( machine ) == 0 ) {
|
||||
double time_per_element =
|
||||
( best.internal_force_time ) / ( nelem * perf.number_of_steps );
|
||||
double time_per_node =
|
||||
( best.comm_time + best.central_diff ) / ( nnode * perf.number_of_steps );
|
||||
|
||||
std::cout << std::setw(space-3) << nelem << " , "
|
||||
<< std::setw(space-3) << nnode << " , "
|
||||
<< std::setw(space-3) << best.number_of_steps << " , "
|
||||
<< std::setw(space-3) << best.init_time * 1000000 << " , "
|
||||
<< std::setw(space-3)
|
||||
<< ( best.internal_force_time * 1000000 ) / best.number_of_steps << " , "
|
||||
<< std::setw(space-3)
|
||||
<< ( best.central_diff * 1000000 ) / best.number_of_steps << " , "
|
||||
<< std::setw(space-3)
|
||||
<< ( best.comm_time * 1000000 ) / best.number_of_steps << " , "
|
||||
<< std::setw(space-3) << time_per_element * 1000000 << " , "
|
||||
<< std::setw(space-3) << time_per_node * 1000000
|
||||
<< std::endl ;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
} // namespace Explicit
|
||||
|
||||
#endif /* #ifndef EXPLICIT_DRIVER_HPP */
|
||||
Reference in New Issue
Block a user