453 lines
16 KiB
C++
453 lines
16 KiB
C++
/*
|
|
//@HEADER
|
|
// ************************************************************************
|
|
//
|
|
// Kokkos v. 2.0
|
|
// Copyright (2014) Sandia Corporation
|
|
//
|
|
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
|
// the U.S. Government retains certain rights in this software.
|
|
//
|
|
// Redistribution and use in source and binary forms, with or without
|
|
// modification, are permitted provided that the following conditions are
|
|
// met:
|
|
//
|
|
// 1. Redistributions of source code must retain the above copyright
|
|
// notice, this list of conditions and the following disclaimer.
|
|
//
|
|
// 2. Redistributions in binary form must reproduce the above copyright
|
|
// notice, this list of conditions and the following disclaimer in the
|
|
// documentation and/or other materials provided with the distribution.
|
|
//
|
|
// 3. Neither the name of the Corporation nor the names of the
|
|
// contributors may be used to endorse or promote products derived from
|
|
// this software without specific prior written permission.
|
|
//
|
|
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
|
|
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
|
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
|
|
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
|
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
|
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
|
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
|
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
|
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
|
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
//
|
|
// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
|
|
//
|
|
// ************************************************************************
|
|
//@HEADER
|
|
*/
|
|
|
|
#ifndef EXPLICIT_DRIVER_HPP
|
|
#define EXPLICIT_DRIVER_HPP
|
|
|
|
#include <sys/time.h>
|
|
#include <iostream>
|
|
#include <iomanip>
|
|
#include <cstdlib>
|
|
#include <cmath>
|
|
|
|
#include <impl/Kokkos_Timer.hpp>
|
|
|
|
#include <ExplicitFunctors.hpp>
|
|
|
|
//----------------------------------------------------------------------------
|
|
|
|
namespace Explicit {
|
|
|
|
struct PerformanceData {
|
|
double mesh_time ;
|
|
double init_time ;
|
|
double internal_force_time ;
|
|
double central_diff ;
|
|
double comm_time ;
|
|
size_t number_of_steps ;
|
|
|
|
PerformanceData()
|
|
: mesh_time(0)
|
|
, init_time(0)
|
|
, internal_force_time(0)
|
|
, central_diff(0)
|
|
, comm_time(0)
|
|
, number_of_steps(0)
|
|
{}
|
|
|
|
void best( const PerformanceData & rhs )
|
|
{
|
|
if ( rhs.mesh_time < mesh_time ) mesh_time = rhs.mesh_time ;
|
|
if ( rhs.init_time < init_time ) init_time = rhs.init_time ;
|
|
if ( rhs.internal_force_time < internal_force_time ) internal_force_time = rhs.internal_force_time ;
|
|
if ( rhs.central_diff < central_diff ) central_diff = rhs.central_diff ;
|
|
if ( rhs.comm_time < comm_time ) comm_time = rhs.comm_time ;
|
|
}
|
|
};
|
|
|
|
template< typename Scalar , class FixtureType >
|
|
PerformanceData run( const typename FixtureType::FEMeshType & mesh ,
|
|
const int global_max_x ,
|
|
const int global_max_y ,
|
|
const int global_max_z ,
|
|
const int steps ,
|
|
const int print_sample )
|
|
{
|
|
typedef Scalar scalar_type ;
|
|
typedef FixtureType fixture_type ;
|
|
typedef typename fixture_type::execution_space execution_space ;
|
|
//typedef typename fixture_type::FEMeshType mesh_type ; // unused
|
|
|
|
enum { ElementNodeCount = fixture_type::element_node_count };
|
|
|
|
const int NumStates = 2;
|
|
|
|
const int total_num_steps = steps ;
|
|
|
|
const Scalar user_dt = 5.0e-6;
|
|
//const Scalar end_time = 0.0050;
|
|
|
|
// element block parameters
|
|
const Scalar lin_bulk_visc = 0.0;
|
|
const Scalar quad_bulk_visc = 0.0;
|
|
|
|
// const Scalar lin_bulk_visc = 0.06;
|
|
// const Scalar quad_bulk_visc = 1.2;
|
|
// const Scalar hg_stiffness = 0.0;
|
|
// const Scalar hg_viscosity = 0.0;
|
|
// const Scalar hg_stiffness = 0.03;
|
|
// const Scalar hg_viscosity = 0.001;
|
|
|
|
// material properties
|
|
const Scalar youngs_modulus=1.0e6;
|
|
const Scalar poissons_ratio=0.0;
|
|
const Scalar density = 8.0e-4;
|
|
|
|
const comm::Machine machine = mesh.parallel_data_map.machine ;
|
|
|
|
PerformanceData perf_data ;
|
|
|
|
Kokkos::Timer wall_clock ;
|
|
|
|
//------------------------------------
|
|
// Generate fields
|
|
|
|
typedef Fields< scalar_type , execution_space > fields_type ;
|
|
|
|
fields_type mesh_fields( mesh ,
|
|
lin_bulk_visc ,
|
|
quad_bulk_visc ,
|
|
youngs_modulus ,
|
|
poissons_ratio ,
|
|
density );
|
|
|
|
typename fields_type::node_coords_type::HostMirror
|
|
model_coords_h = Kokkos::create_mirror( mesh_fields.model_coords );
|
|
|
|
typename fields_type::geom_state_array_type::HostMirror
|
|
displacement_h = Kokkos::create_mirror( mesh_fields.displacement );
|
|
|
|
typename fields_type::geom_state_array_type::HostMirror
|
|
velocity_h = Kokkos::create_mirror( mesh_fields.velocity );
|
|
|
|
Kokkos::deep_copy( model_coords_h , mesh_fields.model_coords );
|
|
|
|
//------------------------------------
|
|
// Initialization
|
|
|
|
initialize_element<Scalar,execution_space>::apply( mesh_fields );
|
|
initialize_node< Scalar,execution_space>::apply( mesh_fields );
|
|
|
|
const Scalar x_bc = global_max_x ;
|
|
|
|
// Initial condition on velocity to initiate a pulse along the X axis
|
|
{
|
|
const unsigned X = 0;
|
|
for (int inode = 0; inode< mesh_fields.num_nodes; ++inode) {
|
|
if ( model_coords_h(inode,X) == 0) {
|
|
velocity_h(inode,X,0) = 1.0e3;
|
|
velocity_h(inode,X,1) = 1.0e3;
|
|
}
|
|
}
|
|
}
|
|
|
|
Kokkos::deep_copy( mesh_fields.velocity , velocity_h );
|
|
|
|
//--------------------------------------------------------------------------
|
|
// We will call a sequence of functions. These functions have been
|
|
// grouped into several functors to balance the number of global memory
|
|
// accesses versus requiring too many registers or too much L1 cache.
|
|
// Global memory accees have read/write cost and memory subsystem contention cost.
|
|
//--------------------------------------------------------------------------
|
|
|
|
perf_data.init_time = comm::max( machine , wall_clock.seconds() );
|
|
|
|
// Parameters required for the internal force computations.
|
|
|
|
int current_state = 0;
|
|
int previous_state = 0;
|
|
int next_state = 0;
|
|
|
|
perf_data.number_of_steps = total_num_steps ;
|
|
|
|
#if defined( KOKKOS_ENABLE_MPI )
|
|
|
|
typedef typename
|
|
fields_type::geom_state_array_type::value_type comm_value_type ;
|
|
|
|
const unsigned comm_value_count = 6 ;
|
|
|
|
Kokkos::AsyncExchange< comm_value_type , execution_space ,
|
|
Kokkos::ParallelDataMap >
|
|
comm_exchange( mesh.parallel_data_map , comm_value_count );
|
|
|
|
#endif
|
|
|
|
for (int step = 0; step < total_num_steps; ++step) {
|
|
|
|
wall_clock.reset();
|
|
|
|
//------------------------------------------------------------------------
|
|
#if defined( KOKKOS_ENABLE_MPI )
|
|
{
|
|
// Communicate "send" nodes' displacement and velocity next_state
|
|
// to the ghosted nodes.
|
|
// buffer packages: { { dx , dy , dz , vx , vy , vz }_node }
|
|
|
|
pack_state< Scalar , execution_space >
|
|
::apply( comm_exchange.buffer() ,
|
|
mesh.parallel_data_map.count_interior ,
|
|
mesh.parallel_data_map.count_send ,
|
|
mesh_fields , next_state );
|
|
|
|
comm_exchange.setup();
|
|
|
|
comm_exchange.send_receive();
|
|
|
|
unpack_state< Scalar , execution_space >
|
|
::apply( mesh_fields , next_state ,
|
|
comm_exchange.buffer() ,
|
|
mesh.parallel_data_map.count_owned ,
|
|
mesh.parallel_data_map.count_receive );
|
|
|
|
execution_space().fence();
|
|
}
|
|
#endif
|
|
|
|
perf_data.comm_time += comm::max( machine , wall_clock.seconds() );
|
|
|
|
//------------------------------------------------------------------------
|
|
// rotate the states
|
|
|
|
previous_state = current_state;
|
|
current_state = next_state;
|
|
++next_state;
|
|
next_state %= NumStates;
|
|
|
|
wall_clock.reset();
|
|
|
|
// First kernel 'grad_hgop' combines two functions:
|
|
// gradient, velocity gradient
|
|
grad< Scalar , execution_space >::apply( mesh_fields ,
|
|
current_state ,
|
|
previous_state );
|
|
|
|
// Combine tensor decomposition and rotation functions.
|
|
decomp_rotate< Scalar , execution_space >::apply( mesh_fields ,
|
|
current_state ,
|
|
previous_state );
|
|
|
|
internal_force< Scalar , execution_space >::apply( mesh_fields ,
|
|
user_dt ,
|
|
current_state );
|
|
|
|
execution_space().fence();
|
|
|
|
perf_data.internal_force_time +=
|
|
comm::max( machine , wall_clock.seconds() );
|
|
|
|
wall_clock.reset();
|
|
|
|
// Assembly of elements' contributions to nodal force into
|
|
// a nodal force vector. Update the accelerations, velocities,
|
|
// displacements.
|
|
// The same pattern can be used for matrix-free residual computations.
|
|
nodal_step< Scalar , execution_space >::apply( mesh_fields ,
|
|
x_bc ,
|
|
current_state,
|
|
next_state );
|
|
execution_space().fence();
|
|
|
|
perf_data.central_diff +=
|
|
comm::max( machine , wall_clock.seconds() );
|
|
|
|
if ( print_sample && 0 == step % 100 ) {
|
|
Kokkos::deep_copy( displacement_h , mesh_fields.displacement );
|
|
Kokkos::deep_copy( velocity_h , mesh_fields.velocity );
|
|
|
|
if ( 1 == print_sample ) {
|
|
|
|
std::cout << "step " << step
|
|
<< " : displacement(*,0,0) =" ;
|
|
for ( int i = 0 ; i < mesh_fields.num_nodes_owned ; ++i ) {
|
|
if ( model_coords_h(i,1) == 0 && model_coords_h(i,2) == 0 ) {
|
|
std::cout << " " << displacement_h(i,0,next_state);
|
|
}
|
|
}
|
|
std::cout << std::endl ;
|
|
|
|
const float tol = 1.0e-6 ;
|
|
const int yb = global_max_y ;
|
|
const int zb = global_max_z ;
|
|
std::cout << "step " << step
|
|
<< " : displacement(*," << yb << "," << zb << ") =" ;
|
|
for ( int i = 0 ; i < mesh_fields.num_nodes_owned ; ++i ) {
|
|
if ( fabs( model_coords_h(i,1) - yb ) < tol &&
|
|
fabs( model_coords_h(i,2) - zb ) < tol ) {
|
|
std::cout << " " << displacement_h(i,0,next_state);
|
|
}
|
|
}
|
|
std::cout << std::endl ;
|
|
}
|
|
else if ( 2 == print_sample ) {
|
|
|
|
const float tol = 1.0e-6 ;
|
|
const int xb = global_max_x / 2 ;
|
|
const int yb = global_max_y / 2 ;
|
|
const int zb = global_max_z / 2 ;
|
|
|
|
for ( int i = 0 ; i < mesh_fields.num_nodes_owned ; ++i ) {
|
|
if ( fabs( model_coords_h(i,0) - xb ) < tol &&
|
|
fabs( model_coords_h(i,1) - yb ) < tol &&
|
|
fabs( model_coords_h(i,2) - zb ) < tol ) {
|
|
std::cout << "step " << step
|
|
<< " : displacement("
|
|
<< xb << "," << yb << "," << zb << ") = {"
|
|
<< std::setprecision(6)
|
|
<< " " << displacement_h(i,0,next_state)
|
|
<< std::setprecision(2)
|
|
<< " " << displacement_h(i,1,next_state)
|
|
<< std::setprecision(2)
|
|
<< " " << displacement_h(i,2,next_state)
|
|
<< " }" << std::endl ;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
return perf_data ;
|
|
}
|
|
|
|
|
|
template <typename Scalar, typename Device>
|
|
static void driver( const char * const label ,
|
|
comm::Machine machine ,
|
|
const int gang_count ,
|
|
const int elem_count_beg ,
|
|
const int elem_count_end ,
|
|
const int runs )
|
|
{
|
|
typedef Scalar scalar_type ;
|
|
typedef Device execution_space ;
|
|
typedef double coordinate_scalar_type ;
|
|
typedef FixtureElementHex8 fixture_element_type ;
|
|
|
|
typedef BoxMeshFixture< coordinate_scalar_type ,
|
|
execution_space ,
|
|
fixture_element_type > fixture_type ;
|
|
|
|
typedef typename fixture_type::FEMeshType mesh_type ;
|
|
|
|
const size_t proc_count = comm::size( machine );
|
|
const size_t proc_rank = comm::rank( machine );
|
|
|
|
const int space = 15 ;
|
|
const int steps = 1000 ;
|
|
const int print_sample = 0 ;
|
|
|
|
if ( comm::rank( machine ) == 0 ) {
|
|
|
|
std::cout << std::endl ;
|
|
std::cout << "\"MiniExplicitDynamics with Kokkos " << label
|
|
<< " time_steps(" << steps << ")"
|
|
<< "\"" << std::endl;
|
|
std::cout << std::left << std::setw(space) << "\"Element\" , ";
|
|
std::cout << std::left << std::setw(space) << "\"Node\" , ";
|
|
std::cout << std::left << std::setw(space) << "\"Initialize\" , ";
|
|
std::cout << std::left << std::setw(space) << "\"ElemForce\" , ";
|
|
std::cout << std::left << std::setw(space) << "\"NodeUpdate\" , ";
|
|
std::cout << std::left << std::setw(space) << "\"NodeComm\" , ";
|
|
std::cout << std::left << std::setw(space) << "\"Time/Elem\" , ";
|
|
std::cout << std::left << std::setw(space) << "\"Time/Node\"";
|
|
|
|
std::cout << std::endl;
|
|
|
|
std::cout << std::left << std::setw(space) << "\"count\" , ";
|
|
std::cout << std::left << std::setw(space) << "\"count\" , ";
|
|
std::cout << std::left << std::setw(space) << "\"microsec\" , ";
|
|
std::cout << std::left << std::setw(space) << "\"microsec\" , ";
|
|
std::cout << std::left << std::setw(space) << "\"microsec\" , ";
|
|
std::cout << std::left << std::setw(space) << "\"microsec\" , ";
|
|
std::cout << std::left << std::setw(space) << "\"microsec\" , ";
|
|
std::cout << std::left << std::setw(space) << "\"microsec\"";
|
|
|
|
std::cout << std::endl;
|
|
}
|
|
|
|
for(int i = elem_count_beg ; i < elem_count_end ; i *= 2 )
|
|
{
|
|
const int iz = std::max( 1 , (int) cbrt( ((double) i) / 2.0 ) );
|
|
const int iy = iz + 1 ;
|
|
const int ix = 2 * iy ;
|
|
const int nelem = ix * iy * iz ;
|
|
const int nnode = ( ix + 1 ) * ( iy + 1 ) * ( iz + 1 );
|
|
|
|
mesh_type mesh =
|
|
fixture_type::create( proc_count , proc_rank , gang_count ,
|
|
ix , iy , iz );
|
|
|
|
mesh.parallel_data_map.machine = machine ;
|
|
|
|
PerformanceData perf , best ;
|
|
|
|
for(int j = 0; j < runs; j++){
|
|
|
|
perf = run<scalar_type,fixture_type>(mesh,ix,iy,iz,steps,print_sample);
|
|
|
|
if( j == 0 ) {
|
|
best = perf ;
|
|
}
|
|
else {
|
|
best.best( perf );
|
|
}
|
|
}
|
|
|
|
if ( comm::rank( machine ) == 0 ) {
|
|
double time_per_element =
|
|
( best.internal_force_time ) / ( nelem * perf.number_of_steps );
|
|
double time_per_node =
|
|
( best.comm_time + best.central_diff ) / ( nnode * perf.number_of_steps );
|
|
|
|
std::cout << std::setw(space-3) << nelem << " , "
|
|
<< std::setw(space-3) << nnode << " , "
|
|
<< std::setw(space-3) << best.number_of_steps << " , "
|
|
<< std::setw(space-3) << best.init_time * 1000000 << " , "
|
|
<< std::setw(space-3)
|
|
<< ( best.internal_force_time * 1000000 ) / best.number_of_steps << " , "
|
|
<< std::setw(space-3)
|
|
<< ( best.central_diff * 1000000 ) / best.number_of_steps << " , "
|
|
<< std::setw(space-3)
|
|
<< ( best.comm_time * 1000000 ) / best.number_of_steps << " , "
|
|
<< std::setw(space-3) << time_per_element * 1000000 << " , "
|
|
<< std::setw(space-3) << time_per_node * 1000000
|
|
<< std::endl ;
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
} // namespace Explicit
|
|
|
|
#endif /* #ifndef EXPLICIT_DRIVER_HPP */
|