Updating kokkos lib

git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@14918 f3b2605a-c512-4ea7-a41b-209d697bcdaa
This commit is contained in:
stamoor
2016-05-02 22:06:50 +00:00
parent c5d0c55bee
commit 0a1b765248
411 changed files with 0 additions and 133424 deletions

View File

@ -1,610 +0,0 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_BOXMESHFIXTURE_HPP
#define KOKKOS_BOXMESHFIXTURE_HPP
#include <cmath>
#include <stdexcept>
#include <sstream>
#include <Kokkos_Core.hpp>
#include <BoxMeshPartition.hpp>
#include <FEMesh.hpp>
#include <HexElement.hpp>
//----------------------------------------------------------------------------
/** \brief  Element fixture for the eight-node (linear) hexahedron.
 *
 *  For the linear element the node grid coincides with the vertex grid,
 *  so node boxes are straight copies of the vertex boxes.
 */
struct FixtureElementHex8 {
  static const unsigned element_node_count = 8 ;

  HybridFEM::HexElement_TensorData< element_node_count > elem_data ;
  BoxBoundsLinear box_bounds ;

  FixtureElementHex8() : elem_data(), box_bounds() {}

  /** \brief  Node grid equals vertex grid: copy global box and partitions. */
  static void create_node_boxes_from_vertex_boxes(
    const BoxType                & vertex_box_global ,
    const std::vector< BoxType > & vertex_box_parts ,
    BoxType                      & node_box_global ,
    std::vector< BoxType >       & node_box_parts )
  {
    node_box_global = vertex_box_global ;
    node_box_parts  = vertex_box_parts ;
  }

  /** \brief  Offset an element's grid coordinate to one of its nodes
   *          via the tensor-product evaluation map.
   */
  void elem_to_node( const unsigned node_local , unsigned coord[] ) const
  {
    for ( unsigned axis = 0 ; axis < 3 ; ++axis ) {
      coord[axis] += elem_data.eval_map[ node_local ][axis] ;
    }
  }
};
/** \brief  Element fixture for the 27-node (quadratic) hexahedron.
 *
 *  The node grid is twice as fine as the vertex grid, hence the
 *  doubling of box bounds below.
 */
struct FixtureElementHex27 {
  static const unsigned element_node_count = 27 ;

  HybridFEM::HexElement_TensorData< element_node_count > elem_data ;
  BoxBoundsQuadratic box_bounds ;

  FixtureElementHex27() : elem_data(), box_bounds() {}

  /** \brief  Refine the vertex boxes onto the twice-as-fine node grid. */
  static void create_node_boxes_from_vertex_boxes(
    const BoxType                & vertex_box_global ,
    const std::vector< BoxType > & vertex_box_parts ,
    BoxType                      & node_box_global ,
    std::vector< BoxType >       & node_box_parts )
  {
    node_box_global = vertex_box_global ;
    node_box_parts  = vertex_box_parts ;

    // Global node grid: double the vertex extent minus the shared endpoint.
    for ( unsigned axis = 0 ; axis < 3 ; ++axis ) {
      node_box_global[axis][1] = 2 * node_box_global[axis][1] - 1 ;
    }

    // Each partition's bounds are doubled, with the upper bound clipped
    // to the global node box.
    for ( unsigned i = 0 ; i < vertex_box_parts.size() ; ++i ) {
      for ( unsigned axis = 0 ; axis < 3 ; ++axis ) {
        node_box_parts[i][axis][0] = 2 * node_box_parts[i][axis][0] ;
        node_box_parts[i][axis][1] =
          std::min( node_box_global[axis][1] , 2 * node_box_parts[i][axis][1] );
      }
    }
  }

  /** \brief  Map an element's grid coordinate to one of its nodes on
   *          the refined node grid.
   */
  void elem_to_node( const unsigned node_local , unsigned coord[] ) const
  {
    for ( unsigned axis = 0 ; axis < 3 ; ++axis ) {
      coord[axis] = 2 * coord[axis] + elem_data.eval_map[ node_local ][axis] ;
    }
  }
};
//----------------------------------------------------------------------------
// BoxMeshFixture: builds a structured hexahedral "box" finite element mesh,
// partitioned across 'proc_count' processes via recursive coordinate
// bisection, and returns a HybridFEM::FEMesh with node coordinates,
// element->node and node->element connectivity plus parallel
// communication maps.  ElementSpec selects the linear (Hex8) or
// quadratic (Hex27) node layout.
template< typename Scalar , class Device , class ElementSpec >
struct BoxMeshFixture {
typedef Scalar coordinate_scalar_type ;
typedef Device execution_space ;
static const unsigned element_node_count = ElementSpec::element_node_count ;
typedef HybridFEM::FEMesh< coordinate_scalar_type ,
element_node_count ,
execution_space > FEMeshType ;
typedef typename FEMeshType::node_coords_type node_coords_type ;
typedef typename FEMeshType::elem_node_ids_type elem_node_ids_type ;
typedef typename FEMeshType::node_elem_ids_type node_elem_ids_type ;
// Consistency check of the constructed connectivity.  Assumes node
// coordinates are still the raw grid indices (set in create() before
// the nonlinear scaling).  Throws std::runtime_error on any mismatch.
static void verify(
const typename FEMeshType::node_coords_type::HostMirror & node_coords ,
const typename FEMeshType::elem_node_ids_type::HostMirror & elem_node_ids ,
const typename FEMeshType::node_elem_ids_type::HostMirror & node_elem_ids )
{
typedef typename FEMeshType::size_type size_type ;
//typedef typename node_coords_type::value_type coords_type ; // unused
const size_type node_count_total = node_coords.dimension_0();
const size_type elem_count_total = elem_node_ids.dimension_0();
const ElementSpec element ;
// Check node->element CRS map: every (element, local-node) entry listed
// for a node must map back to that node through elem_node_ids.
for ( size_type node_index = 0 ;
node_index < node_count_total ; ++node_index ) {
for ( size_type
j = node_elem_ids.row_map[ node_index ] ;
j < node_elem_ids.row_map[ node_index + 1 ] ; ++j ) {
const size_type elem_index = node_elem_ids.entries(j,0);
const size_type node_local = node_elem_ids.entries(j,1);
const size_type en_id = elem_node_ids(elem_index,node_local);
if ( node_index != en_id ) {
std::ostringstream msg ;
msg << "BoxMeshFixture node_elem_ids error"
<< " : node_index(" << node_index
<< ") entry(" << j
<< ") elem_index(" << elem_index
<< ") node_local(" << node_local
<< ") elem_node_id(" << en_id
<< ")" ;
throw std::runtime_error( msg.str() );
}
}
}
// Check element->node coordinates: each node of an element must sit at
// the element-origin node (local node 0) plus the eval_map offset.
for ( size_type elem_index = 0 ;
elem_index < elem_count_total; ++elem_index ) {
coordinate_scalar_type elem_node_coord[ element_node_count ][3] ;
for ( size_type nn = 0 ; nn < element_node_count ; ++nn ) {
const size_type node_index = elem_node_ids( elem_index , nn );
for ( size_type nc = 0 ; nc < 3 ; ++nc ) {
elem_node_coord[nn][nc] = node_coords( node_index , nc );
}
}
for ( size_type nn = 0 ; nn < element_node_count ; ++nn ) {
const unsigned ix = element.elem_data.eval_map[nn][0] ;
const unsigned iy = element.elem_data.eval_map[nn][1] ;
const unsigned iz = element.elem_data.eval_map[nn][2] ;
if ( elem_node_coord[nn][0] != elem_node_coord[0][0] + ix ||
elem_node_coord[nn][1] != elem_node_coord[0][1] + iy ||
elem_node_coord[nn][2] != elem_node_coord[0][2] + iz ) {
std::ostringstream msg ;
msg << "BoxMeshFixture elem_node_coord mapping failure { "
<< elem_node_coord[nn][0] << " "
<< elem_node_coord[nn][1] << " "
<< elem_node_coord[nn][2] << " } != { "
<< elem_node_coord[ 0][0] + ix << " "
<< elem_node_coord[ 0][1] + iy << " "
<< elem_node_coord[ 0][2] + iz
<< " }" ;
throw std::runtime_error( msg.str() );
}
}
}
}
//------------------------------------
// Initialize element-node connectivity:
// Order elements that only depend on owned nodes first.
// These elements could be computed while waiting for
// received node data.
// Interior elements receive indices [0, elem_count_interior);
// boundary elements (touching non-owned vertices) follow.
static void layout_elements_interior_exterior(
const BoxType vertex_box_local_used ,
const BoxType vertex_box_local_owned ,
const BoxType node_box_local_used ,
const std::vector<size_t> & node_used_id_map ,
const ElementSpec element_fixture ,
const size_t elem_count_interior ,
const typename elem_node_ids_type::HostMirror elem_node_ids )
{
size_t elem_index_interior = 0 ;
size_t elem_index_boundary = elem_count_interior ;
// One element per vertex cell of the locally-used vertex box:
for ( size_t iz = vertex_box_local_used[2][0] ;
iz < vertex_box_local_used[2][1] - 1 ; ++iz ) {
for ( size_t iy = vertex_box_local_used[1][0] ;
iy < vertex_box_local_used[1][1] - 1 ; ++iy ) {
for ( size_t ix = vertex_box_local_used[0][0] ;
ix < vertex_box_local_used[0][1] - 1 ; ++ix ) {
size_t elem_index ;
// If lower and upper vertices are owned then element is interior
if ( contain( vertex_box_local_owned, ix, iy, iz ) &&
contain( vertex_box_local_owned, ix+1, iy+1, iz+1 ) ) {
elem_index = elem_index_interior++ ;
}
else {
elem_index = elem_index_boundary++ ;
}
// Map each local node of the element to its local node id:
for ( size_t nn = 0 ; nn < element_node_count ; ++nn ) {
unsigned coord[3] = { static_cast<unsigned>(ix) , static_cast<unsigned>(iy) , static_cast<unsigned>(iz) };
element_fixture.elem_to_node( nn , coord );
const size_t node_local_id =
box_map_id( node_box_local_used ,
node_used_id_map ,
coord[0] , coord[1] , coord[2] );
elem_node_ids( elem_index , nn ) = node_local_id ;
}
}}}
}
//------------------------------------
// Nested partitioning of elements by number of thread 'gangs'
// Elements are numbered contiguously gang-by-gang so that each gang
// owns a contiguous index range.
static void layout_elements_partitioned(
const BoxType vertex_box_local_used ,
const BoxType /*vertex_box_local_owned*/ ,
const BoxType node_box_local_used ,
const std::vector<size_t> & node_used_id_map ,
const ElementSpec element_fixture ,
const size_t thread_gang_count ,
const typename elem_node_ids_type::HostMirror elem_node_ids )
{
std::vector< BoxType > element_box_gangs( thread_gang_count );
// The element box has one fewer cell than vertices along each axis:
BoxType element_box_local_used = vertex_box_local_used ;
element_box_local_used[0][1] -= 1 ;
element_box_local_used[1][1] -= 1 ;
element_box_local_used[2][1] -= 1 ;
box_partition_rcb( element_box_local_used , element_box_gangs );
size_t elem_index = 0 ;
for ( size_t ig = 0 ; ig < thread_gang_count ; ++ig ) {
const BoxType box = element_box_gangs[ig] ;
for ( size_t iz = box[2][0] ; iz < box[2][1] ; ++iz ) {
for ( size_t iy = box[1][0] ; iy < box[1][1] ; ++iy ) {
for ( size_t ix = box[0][0] ; ix < box[0][1] ; ++ix , ++elem_index ) {
for ( size_t nn = 0 ; nn < element_node_count ; ++nn ) {
unsigned coord[3] = { static_cast<unsigned>(ix) , static_cast<unsigned>(iy) , static_cast<unsigned>(iz) };
element_fixture.elem_to_node( nn , coord );
const size_t node_local_id =
box_map_id( node_box_local_used ,
node_used_id_map ,
coord[0] , coord[1] , coord[2] );
elem_node_ids( elem_index , nn ) = node_local_id ;
}
}}}
}
}
//------------------------------------
// Build and return the distributed mesh for this process.
//   proc_count / proc_local : number of partitions and this process' part
//   gang_count              : >1 selects gang-partitioned element layout,
//                             otherwise interior/exterior ordering
//   elems_x/y/z             : global element counts per axis
//   *_coord_curve           : exponent of the nonlinear coordinate map
//                             (1 keeps a uniform grid)
// Throws std::runtime_error (via verify) on inconsistent connectivity.
static FEMeshType create( const size_t proc_count ,
const size_t proc_local ,
const size_t gang_count ,
const size_t elems_x ,
const size_t elems_y ,
const size_t elems_z ,
const double x_coord_curve = 1 ,
const double y_coord_curve = 1 ,
const double z_coord_curve = 1 )
{
const size_t vertices_x = elems_x + 1 ;
const size_t vertices_y = elems_y + 1 ;
const size_t vertices_z = elems_z + 1 ;
const BoxBoundsLinear vertex_box_bounds ;
const ElementSpec element ;
// Partition based upon vertices:
BoxType vertex_box_global ;
std::vector< BoxType > vertex_box_parts( proc_count );
vertex_box_global[0][0] = 0 ; vertex_box_global[0][1] = vertices_x ;
vertex_box_global[1][0] = 0 ; vertex_box_global[1][1] = vertices_y ;
vertex_box_global[2][0] = 0 ; vertex_box_global[2][1] = vertices_z ;
box_partition_rcb( vertex_box_global , vertex_box_parts );
const BoxType vertex_box_local_owned = vertex_box_parts[ proc_local ];
// Determine interior and used vertices:
BoxType vertex_box_local_interior ;
BoxType vertex_box_local_used ;
vertex_box_bounds.apply( vertex_box_global ,
vertex_box_local_owned ,
vertex_box_local_interior ,
vertex_box_local_used );
// Element counts:
// Counts may be negative for degenerate partitions, hence the
// signed intermediate and the max() clamp to zero below.
const long local_elems_x =
( vertex_box_local_used[0][1] - vertex_box_local_used[0][0] ) - 1 ;
const long local_elems_y =
( vertex_box_local_used[1][1] - vertex_box_local_used[1][0] ) - 1 ;
const long local_elems_z =
( vertex_box_local_used[2][1] - vertex_box_local_used[2][0] ) - 1 ;
const size_t elem_count_total = std::max( long(0) , local_elems_x ) *
std::max( long(0) , local_elems_y ) *
std::max( long(0) , local_elems_z );
const long interior_elems_x =
( vertex_box_local_owned[0][1] - vertex_box_local_owned[0][0] ) - 1 ;
const long interior_elems_y =
( vertex_box_local_owned[1][1] - vertex_box_local_owned[1][0] ) - 1 ;
const long interior_elems_z =
( vertex_box_local_owned[2][1] - vertex_box_local_owned[2][0] ) - 1 ;
const size_t elem_count_interior = std::max( long(0) , interior_elems_x ) *
std::max( long(0) , interior_elems_y ) *
std::max( long(0) , interior_elems_z );
// Expand vertex boxes to node boxes:
// (identity for Hex8, doubled grid for Hex27; static member invoked
// through the instance)
BoxType node_box_global ;
BoxType node_box_local_used ;
std::vector< BoxType > node_box_parts ;
element.create_node_boxes_from_vertex_boxes(
vertex_box_global , vertex_box_parts ,
node_box_global , node_box_parts );
// Node communication maps:
size_t node_count_interior = 0 ;
size_t node_count_owned = 0 ;
size_t node_count_total = 0 ;
std::vector<size_t> node_used_id_map ;
std::vector<size_t> node_part_counts ;
std::vector< std::vector<size_t> > node_send_map ;
box_partition_maps( node_box_global ,
node_box_parts ,
element.box_bounds ,
proc_local ,
node_box_local_used ,
node_used_id_map ,
node_count_interior ,
node_count_owned ,
node_count_total ,
node_part_counts ,
node_send_map );
size_t node_count_send = 0 ;
for ( size_t i = 0 ; i < node_send_map.size() ; ++i ) {
node_count_send += node_send_map[i].size();
}
// Count messages; index i is the rank offset, not the rank itself:
size_t recv_msg_count = 0 ;
size_t send_msg_count = 0 ;
size_t send_count = 0 ;
for ( size_t i = 1 ; i < proc_count ; ++i ) {
if ( node_part_counts[i] ) ++recv_msg_count ;
if ( node_send_map[i].size() ) {
++send_msg_count ;
send_count += node_send_map[i].size();
}
}
// Finite element mesh:
FEMeshType mesh ;
if ( node_count_total ) {
mesh.node_coords = node_coords_type( "node_coords", node_count_total );
}
if ( elem_count_total ) {
mesh.elem_node_ids =
elem_node_ids_type( "elem_node_ids", elem_count_total );
}
mesh.parallel_data_map.assign( node_count_interior ,
node_count_owned ,
node_count_total ,
recv_msg_count ,
send_msg_count ,
send_count );
// Host mirrors are populated here, verified, then deep-copied to the
// device views at the end.
typename node_coords_type::HostMirror node_coords =
Kokkos::create_mirror( mesh.node_coords );
typename elem_node_ids_type::HostMirror elem_node_ids =
Kokkos::create_mirror( mesh.elem_node_ids );
//------------------------------------
// set node coordinates to grid location for subsequent verification
for ( size_t iz = node_box_local_used[2][0] ;
iz < node_box_local_used[2][1] ; ++iz ) {
for ( size_t iy = node_box_local_used[1][0] ;
iy < node_box_local_used[1][1] ; ++iy ) {
for ( size_t ix = node_box_local_used[0][0] ;
ix < node_box_local_used[0][1] ; ++ix ) {
const size_t node_local_id =
box_map_id( node_box_local_used , node_used_id_map , ix , iy , iz );
node_coords( node_local_id , 0 ) = ix ;
node_coords( node_local_id , 1 ) = iy ;
node_coords( node_local_id , 2 ) = iz ;
}}}
//------------------------------------
// Initialize element-node connectivity:
if ( 1 < gang_count ) {
layout_elements_partitioned( vertex_box_local_used ,
vertex_box_local_owned ,
node_box_local_used ,
node_used_id_map ,
element ,
gang_count ,
elem_node_ids );
}
else {
layout_elements_interior_exterior( vertex_box_local_used ,
vertex_box_local_owned ,
node_box_local_used ,
node_used_id_map ,
element ,
elem_count_interior ,
elem_node_ids );
}
//------------------------------------
// Populate node->element connectivity:
// First pass counts elements per node to size the CRS graph.
std::vector<size_t> node_elem_work( node_count_total , (size_t) 0 );
for ( size_t i = 0 ; i < elem_count_total ; ++i ) {
for ( size_t n = 0 ; n < element_node_count ; ++n ) {
++node_elem_work[ elem_node_ids(i,n) ];
}
}
mesh.node_elem_ids =
Kokkos::create_staticcrsgraph< node_elem_ids_type >( "node_elem_ids" , node_elem_work );
typename node_elem_ids_type::HostMirror
node_elem_ids = Kokkos::create_mirror( mesh.node_elem_ids );
// Reuse the work array as per-node insertion cursors into the CRS rows:
for ( size_t i = 0 ; i < node_count_total ; ++i ) {
node_elem_work[i] = node_elem_ids.row_map[i];
}
// Looping in element order insures the list of elements
// is sorted by element index.
for ( size_t i = 0 ; i < elem_count_total ; ++i ) {
for ( size_t n = 0 ; n < element_node_count ; ++n ) {
const unsigned nid = elem_node_ids(i, n);
const unsigned j = node_elem_work[nid] ; ++node_elem_work[nid] ;
node_elem_ids.entries( j , 0 ) = i ;
node_elem_ids.entries( j , 1 ) = n ;
}
}
//------------------------------------
// Verify setup with node coordinates matching grid indices.
verify( node_coords , elem_node_ids , node_elem_ids );
//------------------------------------
// Scale node coordinates to problem extent with
// nonlinear mapping.
{
const double problem_extent[3] =
{ static_cast<double>( vertex_box_global[0][1] - 1 ) ,
static_cast<double>( vertex_box_global[1][1] - 1 ) ,
static_cast<double>( vertex_box_global[2][1] - 1 ) };
const double grid_extent[3] =
{ static_cast<double>( node_box_global[0][1] - 1 ) ,
static_cast<double>( node_box_global[1][1] - 1 ) ,
static_cast<double>( node_box_global[2][1] - 1 ) };
// Map grid index -> [0,1] -> pow(.,curve) -> problem extent:
for ( size_t i = 0 ; i < node_count_total ; ++i ) {
const double x_unit = node_coords(i,0) / grid_extent[0] ;
const double y_unit = node_coords(i,1) / grid_extent[1] ;
const double z_unit = node_coords(i,2) / grid_extent[2] ;
node_coords(i,0) = coordinate_scalar_type( problem_extent[0] * std::pow( x_unit , x_coord_curve ) );
node_coords(i,1) = coordinate_scalar_type( problem_extent[1] * std::pow( y_unit , y_coord_curve ) );
node_coords(i,2) = coordinate_scalar_type( problem_extent[2] * std::pow( z_unit , z_coord_curve ) );
}
}
Kokkos::deep_copy( mesh.node_coords , node_coords );
Kokkos::deep_copy( mesh.elem_node_ids , elem_node_ids );
Kokkos::deep_copy( mesh.node_elem_ids.entries , node_elem_ids.entries );
//------------------------------------
// Communication lists:
{
recv_msg_count = 0 ;
send_msg_count = 0 ;
send_count = 0 ;
for ( size_t i = 1 ; i < proc_count ; ++i ) {
// Order sending starting with the local processor rank
// to try to smooth out the amount of messages simultaneously
// send to a particular processor.
const int proc = ( proc_local + i ) % proc_count ;
if ( node_part_counts[i] ) {
mesh.parallel_data_map.host_recv(recv_msg_count,0) = proc ;
mesh.parallel_data_map.host_recv(recv_msg_count,1) = node_part_counts[i] ;
++recv_msg_count ;
}
if ( node_send_map[i].size() ) {
mesh.parallel_data_map.host_send(send_msg_count,0) = proc ;
mesh.parallel_data_map.host_send(send_msg_count,1) = node_send_map[i].size() ;
// Send items are offsets relative to the owned-parallel section:
for ( size_t j = 0 ; j < node_send_map[i].size() ; ++j , ++send_count ) {
mesh.parallel_data_map.host_send_item(send_count) = node_send_map[i][j] - node_count_interior ;
}
++send_msg_count ;
}
}
}
return mesh ;
}
};
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
#endif /* #ifndef KOKKOS_BOXMESHFIXTURE_HPP */

View File

@ -1,381 +0,0 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#include <iostream>
#include <sstream>
#include <stdexcept>
#include <limits>
#include <BoxMeshPartition.hpp>
//----------------------------------------------------------------------------
namespace {
// Recursive coordinate bisection: split 'box' among processor ranks
// [ip,up) and store each rank's sub-box in p_box[rank].  Splits the
// axis with the largest extent; a rank count divisible by three is
// split three ways, otherwise two ways.
void box_partition( size_t ip , size_t up ,
const BoxType & box ,
BoxType * const p_box )
{
const size_t np = up - ip ;
if ( 1 == np ) {
// Single rank remaining: it owns the whole box.
p_box[ip] = box ;
}
else {
// Choose axis with largest count:
const size_t n0 = box[0][1] - box[0][0] ;
const size_t n1 = box[1][1] - box[1][0] ;
const size_t n2 = box[2][1] - box[2][0] ;
const size_t axis = n2 > n1 ? ( n2 > n0 ? 2 : ( n1 > n0 ? 1 : 0 ) ) :
( n1 > n0 ? 1 : 0 );
const size_t n = box[ axis ][1] - box[ axis ][0] ;
if ( 0 == np % 3 ) {
// Three-way split at roughly 1/3 and 2/3 of the chosen axis:
const size_t np_part = np / 3 ; // exact
const size_t nbox_low = (size_t)(( (double) n ) * ( 1.0 / 3.0 ));
const size_t nbox_mid = (size_t)(( (double) n ) * ( 2.0 / 3.0 ));
BoxType dbox_low = box ; // P = [ip,ip+np/3)
BoxType dbox_mid = box ; // P = [ip+np/3,ip+2*np/3)
BoxType dbox_upp = box ; // P = [ip+2*np/3,ip+np)
dbox_low[ axis ][1] = box[ axis ][0] + nbox_low ;
dbox_mid[ axis ][1] = box[ axis ][0] + nbox_mid ;
dbox_mid[ axis ][0] = dbox_low[ axis ][1];
dbox_upp[ axis ][0] = dbox_mid[ axis ][1];
box_partition( ip, ip + np_part, dbox_low , p_box );
box_partition( ip+ np_part, ip + 2*np_part, dbox_mid , p_box );
box_partition( ip+2*np_part, up, dbox_upp , p_box );
}
else {
// Two-way split, proportional to the rank counts on each side:
const size_t np_low = np / 2 ; /* Rounded down */
const size_t nbox_low = (size_t)
(((double)n) * ( ((double) np_low ) / ((double) np ) ));
BoxType dbox_low = box ;
BoxType dbox_upp = box ;
dbox_low[ axis ][1] = dbox_low[ axis ][0] + nbox_low ;
dbox_upp[ axis ][0] = dbox_low[ axis ][1];
box_partition( ip, ip + np_low, dbox_low , p_box );
box_partition( ip + np_low, up, dbox_upp , p_box );
}
}
}
// Map global grid coordinate (global_i, global_j, global_k) to a linear
// offset within 'local_use', with i varying fastest.  Throws
// std::runtime_error if the coordinate lies outside the box.
size_t box_map_offset( const BoxType & local_use ,
const size_t global_i ,
const size_t global_j ,
const size_t global_k )
{
// 'max' is the out-of-range sentinel; unsigned subtraction below is
// guarded so an out-of-box coordinate maps to the sentinel.
const size_t max = std::numeric_limits<size_t>::max();
const size_t n[3] =
{ local_use[0][1] - local_use[0][0] ,
local_use[1][1] - local_use[1][0] ,
local_use[2][1] - local_use[2][0] };
const size_t use[3] = {
( global_i >= local_use[0][0] ? global_i - local_use[0][0] : max ) ,
( global_j >= local_use[1][0] ? global_j - local_use[1][0] : max ) ,
( global_k >= local_use[2][0] ? global_k - local_use[2][0] : max ) };
const size_t offset =
( use[0] < n[0] && use[1] < n[1] && use[2] < n[2] ) ?
( use[0] + n[0] * ( use[1] + n[1] * use[2] ) ) : max ;
if ( offset == max ) {
std::ostringstream msg ;
msg << "box_map_offset ERROR: "
<< " use " << local_use
<< " ( " << global_i
<< " , " << global_j
<< " , " << global_k
<< " )" ;
throw std::runtime_error( msg.str() );
}
return offset ;
}
} // namespace
//----------------------------------------------------------------------------
/** \brief  Linear-element ghosting: compute the strictly-interior box
 *          and the "used" box (owned plus one ghost layer) for a
 *          partition, both clipped to the global box.
 */
void BoxBoundsLinear::apply( const BoxType & box_global ,
                             const BoxType & box_part ,
                             BoxType & box_interior ,
                             BoxType & box_use ) const
{
  const unsigned ghost = 1 ;

  if ( 0 == count( box_part ) ) {
    // Empty partition: both derived boxes are the same empty box.
    box_interior = box_part ;
    box_use      = box_part ;
    return ;
  }

  for ( size_t axis = 0 ; axis < 3 ; ++axis ) {
    const size_t global_begin = box_global[axis][0] ;
    const size_t global_end   = box_global[axis][1] ;
    const size_t part_begin   = box_part[axis][0] ;
    const size_t part_end     = box_part[axis][1] ;

    // Interior shrinks inward at partition faces that are not global
    // faces, without collapsing past the opposite side.
    if ( part_begin == global_begin ) {
      box_interior[axis][0] = part_begin ;
    }
    else if ( part_begin + ghost < part_end ) {
      box_interior[axis][0] = part_begin + ghost ;
    }
    else {
      box_interior[axis][0] = part_end ;
    }

    if ( part_end == global_end ) {
      box_interior[axis][1] = part_end ;
    }
    else if ( part_begin + ghost < part_end ) {
      box_interior[axis][1] = part_end - ghost ;
    }
    else {
      box_interior[axis][1] = part_begin ;
    }

    // Used box grows by the ghost layer, clipped to the global box.
    box_use[axis][0] = ( part_begin > ghost + global_begin )
                     ? part_begin - ghost : global_begin ;
    box_use[axis][1] = ( part_end + ghost < global_end )
                     ? part_end + ghost : global_end ;
  }
}
/** \brief  Quadratic-element ghosting: same construction as the linear
 *          case, but the ghost depth per axis depends on the parity of
 *          the partition's offset from the global origin.
 */
void BoxBoundsQuadratic::apply( const BoxType & box_global ,
                                const BoxType & box_part ,
                                BoxType & box_interior ,
                                BoxType & box_use ) const
{
  if ( 0 == count( box_part ) ) {
    // Empty partition: both derived boxes are the same empty box.
    box_interior = box_part ;
    box_use      = box_part ;
    return ;
  }

  for ( size_t axis = 0 ; axis < 3 ; ++axis ) {
    // Odd offset from the global origin => ghost depth 1, else 2.
    // NOTE(review): presumably this keeps ghosting aligned with the
    // quadratic node pattern — confirm against the Hex27 fixture.
    const bool odd = ( box_part[axis][0] - box_global[axis][0] ) & 01 ;
    const unsigned ghost = odd ? 1 : 2 ;

    const size_t global_begin = box_global[axis][0] ;
    const size_t global_end   = box_global[axis][1] ;
    const size_t part_begin   = box_part[axis][0] ;
    const size_t part_end     = box_part[axis][1] ;

    if ( part_begin == global_begin ) {
      box_interior[axis][0] = part_begin ;
    }
    else if ( part_begin + ghost < part_end ) {
      box_interior[axis][0] = part_begin + ghost ;
    }
    else {
      box_interior[axis][0] = part_end ;
    }

    if ( part_end == global_end ) {
      box_interior[axis][1] = part_end ;
    }
    else if ( part_begin + ghost < part_end ) {
      box_interior[axis][1] = part_end - ghost ;
    }
    else {
      box_interior[axis][1] = part_begin ;
    }

    box_use[axis][0] = ( part_begin > ghost + global_begin )
                     ? part_begin - ghost : global_begin ;
    box_use[axis][1] = ( part_end + ghost < global_end )
                     ? part_end + ghost : global_end ;
  }
}
//----------------------------------------------------------------------------
void box_partition_rcb( const BoxType & root_box ,
std::vector<BoxType> & part_boxes )
{
const BoxBoundsLinear use_boxes ;
const size_t part_count = part_boxes.size();
box_partition( 0 , part_count , root_box , & part_boxes[0] );
// Verify partitioning
size_t total_cell = 0 ;
for ( size_t i = 0 ; i < part_count ; ++i ) {
total_cell += count( part_boxes[i] );
BoxType box_interior , box_use ;
use_boxes.apply( root_box , part_boxes[i] , box_interior , box_use );
if ( count( box_use ) < count( part_boxes[i] ) ||
count( part_boxes[i] ) < count( box_interior ) ||
part_boxes[i] != intersect( part_boxes[i] , box_use ) ||
box_interior != intersect( part_boxes[i] , box_interior )) {
std::ostringstream msg ;
msg << "box_partition_rcb ERROR : "
<< "part_boxes[" << i << "] = "
<< part_boxes[i]
<< " use " << box_use
<< " interior " << box_interior
<< std::endl
<< " part ^ use " << intersect( part_boxes[i] , box_use )
<< " part ^ interior " << intersect( part_boxes[i] , box_interior );
throw std::runtime_error( msg.str() );
}
for ( size_t j = i + 1 ; j < part_count ; ++j ) {
const BoxType tmp = intersect( part_boxes[i] , part_boxes[j] );
if ( count( tmp ) ) {
throw std::runtime_error( std::string("box partition intersection") );
}
}
}
if ( total_cell != count( root_box ) ) {
throw std::runtime_error( std::string("box partition count") );
}
}
//----------------------------------------------------------------------------
/** \brief  Translate a global grid coordinate to its local node id:
 *          linear offset within 'local_use', then through the id map.
 */
size_t box_map_id( const BoxType & local_use ,
                   const std::vector<size_t> & local_use_id_map ,
                   const size_t global_i ,
                   const size_t global_j ,
                   const size_t global_k )
{
  return local_use_id_map[
    box_map_offset( local_use , global_i , global_j , global_k ) ];
}
//----------------------------------------------------------------------------
// Build the local node numbering and parallel communication maps for
// partition 'my_part'.  Outputs:
//   my_use_box          : owned box expanded by ghosting
//   my_use_id_map       : linear-offset-in-use-box -> local node id
//   my_count_interior/owned/uses : node counts for each category
//   my_part_counts[i]   : nodes received from rank (my_part+i)%np
//                         (indexed by rank OFFSET i, not by rank)
//   my_send_map[i]      : local ids of nodes sent to rank (my_part+i)%np
// Throws std::runtime_error if my_part is out of range.
void box_partition_maps( const BoxType & root_box ,
const std::vector<BoxType> & part_boxes ,
const BoxBounds & use_boxes ,
const size_t my_part ,
BoxType & my_use_box ,
std::vector<size_t> & my_use_id_map ,
size_t & my_count_interior ,
size_t & my_count_owned ,
size_t & my_count_uses ,
std::vector<size_t> & my_part_counts ,
std::vector<std::vector<size_t> > & my_send_map )
{
const size_t np = part_boxes.size();
if ( np <= my_part ) {
std::ostringstream msg ;
msg << "box_partition_maps ERROR : "
<< " np(" << np << ") <= my_part(" << my_part << ")" ;
throw std::runtime_error( msg.str() );
}
const BoxType my_owned_box = part_boxes[my_part];
BoxType my_interior_box ;
use_boxes.apply( root_box, my_owned_box, my_interior_box, my_use_box );
my_count_interior = count( my_interior_box );
my_count_owned = count( my_owned_box );
my_count_uses = count( my_use_box );
// Initialize the id map to an out-of-range sentinel:
my_use_id_map.assign( my_count_uses , std::numeric_limits<size_t>::max() );
// Order ids as { owned-interior , owned-parallel , received_{(p+i)%np} }
size_t offset_interior = 0 ;
size_t offset_parallel = my_count_interior ;
// Number the owned nodes: interior nodes first, then owned nodes that
// other ranks may use ("owned-parallel"):
for ( size_t iz = my_owned_box[2][0] ; iz < my_owned_box[2][1] ; ++iz ) {
for ( size_t iy = my_owned_box[1][0] ; iy < my_owned_box[1][1] ; ++iy ) {
for ( size_t ix = my_owned_box[0][0] ; ix < my_owned_box[0][1] ; ++ix ) {
const size_t offset = box_map_offset( my_use_box , ix , iy , iz );
if ( contain( my_interior_box , ix , iy , iz ) ) {
my_use_id_map[ offset ] = offset_interior++ ;
}
else {
my_use_id_map[ offset ] = offset_parallel++ ;
}
}}}
my_part_counts.assign( np , (size_t) 0 );
my_send_map.assign( np , std::vector<size_t>() );
my_part_counts[0] = my_count_owned ;
// For every other rank, in (my_part+i)%np order: number the nodes
// received from that rank, and record the local ids of nodes sent
// to it.
for ( size_t i = 1 ; i < np ; ++i ) {
const size_t ip = ( my_part + i ) % np ;
const BoxType p_owned_box = part_boxes[ip];
BoxType p_use_box , p_interior_box ;
use_boxes.apply( root_box, p_owned_box, p_interior_box, p_use_box );
// Received nodes: my used box overlapping rank ip's owned box.
// Sent nodes: my owned box overlapping rank ip's used box.
const BoxType recv_box = intersect( my_use_box , p_owned_box );
const BoxType send_box = intersect( my_owned_box , p_use_box );
if ( 0 != ( my_part_counts[i] = count( recv_box ) ) ) {
for ( size_t iz = recv_box[2][0] ; iz < recv_box[2][1] ; ++iz ) {
for ( size_t iy = recv_box[1][0] ; iy < recv_box[1][1] ; ++iy ) {
for ( size_t ix = recv_box[0][0] ; ix < recv_box[0][1] ; ++ix ) {
const size_t offset = box_map_offset( my_use_box , ix , iy , iz );
my_use_id_map[ offset ] = offset_parallel++ ;
}}}
}
if ( 0 != count( send_box ) ) {
for ( size_t iz = send_box[2][0] ; iz < send_box[2][1] ; ++iz ) {
for ( size_t iy = send_box[1][0] ; iy < send_box[1][1] ; ++iy ) {
for ( size_t ix = send_box[0][0] ; ix < send_box[0][1] ; ++ix ) {
const size_t offset = box_map_offset( my_use_box , ix , iy , iz );
my_send_map[ i ].push_back( my_use_id_map[ offset ] );
}}}
}
}
}

View File

@ -1,210 +0,0 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef BOXMESHPARTITION_HPP
#define BOXMESHPARTITION_HPP
#include <cstddef>
#include <utility>
#include <vector>
#include <iostream>
//----------------------------------------------------------------------------
/** \brief Axis-aligned box of global grid indices.
 *
 *  data[dim] = { begin , end } is a half-open index range for
 *  dim = 0 (X), 1 (Y), 2 (Z).
 */
struct BoxType {
  size_t data[3][2] ;

  typedef size_t range_type[2] ;

  inline
  const range_type & operator[]( size_t i ) const { return data[i]; }

  inline
  range_type & operator[]( size_t i ) { return data[i]; }

  /** \brief True when all six bounds match.
   *
   *  Bug fix: the original compared data[1][1] against rhs.data[2][1],
   *  so boxes that were actually equal (but whose Y and Z upper bounds
   *  differed from each other) compared unequal.
   */
  inline
  bool operator == ( const BoxType & rhs ) const
  {
    return data[0][0] == rhs.data[0][0] && data[0][1] == rhs.data[0][1] &&
           data[1][0] == rhs.data[1][0] && data[1][1] == rhs.data[1][1] &&
           data[2][0] == rhs.data[2][0] && data[2][1] == rhs.data[2][1] ;
  }

  /** \brief Logical negation of operator==, kept consistent by construction. */
  inline
  bool operator != ( const BoxType & rhs ) const
  {
    return ! ( *this == rhs );
  }
};
/** \brief Number of grid points contained in a box.
 *         A dimension with an empty or inverted range yields zero.
 */
inline
size_t count( const BoxType & b )
{
  size_t total = 1 ;
  for ( size_t dim = 0 ; dim < 3 ; ++dim ) {
    const size_t lo = b[dim][0] ;
    const size_t hi = b[dim][1] ;
    total *= ( lo < hi ) ? ( hi - lo ) : 0 ;
  }
  return total ;
}
inline
bool contain( const BoxType & b , size_t i , size_t j , size_t k )
{
return b[0][0] <= i && i < b[0][1] &&
b[1][0] <= j && j < b[1][1] &&
b[2][0] <= k && k < b[2][1] ;
}
/** \brief Intersection of two boxes.
 *         A dimension with no overlap produces an empty (possibly
 *         inverted) range, which count() treats as zero points.
 */
inline
BoxType intersect( const BoxType & x , const BoxType & y )
{
  BoxType result ;
  for ( size_t dim = 0 ; dim < 3 ; ++dim ) {
    result[dim][0] = std::max( x[dim][0] , y[dim][0] );
    result[dim][1] = std::min( x[dim][1] , y[dim][1] );
  }
  return result ;
}
inline
std::ostream & operator << ( std::ostream & s , const BoxType & box )
{
s << "{ "
<< box[0][0] << " " << box[0][1] << " , "
<< box[1][0] << " " << box[1][1] << " , "
<< box[2][0] << " " << box[2][1] << " }" ;
return s ;
}
//----------------------------------------------------------------------------
/** \brief Abstract policy for expanding a partitioned box with ghosting.
 *
 *  Given the global box and one part's box, an implementation computes
 *  box_interior (the part's cells that are not communicated) and
 *  box_use (the part's owned cells plus its ghosted cells).
 */
class BoxBounds {
public:
/** \brief Default bounds to one layer of ghosting */
virtual
void apply( const BoxType & box_global ,
const BoxType & box_part ,
BoxType & box_interior ,
BoxType & box_use ) const = 0 ;
virtual ~BoxBounds() {}
BoxBounds() {}
};
/** \brief Ghosting policy with a single ghost layer in each direction
 *  (implementation of apply() lives in the corresponding .cpp file).
 */
class BoxBoundsLinear : public BoxBounds
{
public:
/** \brief Default bounds to one layer of ghosting */
virtual
void apply( const BoxType & box_global ,
const BoxType & box_part ,
BoxType & box_interior ,
BoxType & box_use ) const ;
virtual ~BoxBoundsLinear() {}
BoxBoundsLinear() {}
};
/** \brief Ghosting policy for quadratic meshes
 *  (implementation of apply() lives in the corresponding .cpp file).
 */
class BoxBoundsQuadratic : public BoxBounds {
public:
/** \brief Quadratic mesh: even ordinates have two layers,
* odd ordinates have one layer.
*/
virtual
void apply( const BoxType & box_global ,
const BoxType & box_part ,
BoxType & box_interior ,
BoxType & box_use ) const ;
virtual ~BoxBoundsQuadratic() {}
BoxBoundsQuadratic() {}
};
//----------------------------------------------------------------------------
/* Partition box into part_boxes.size() sub-boxes */
void box_partition_rcb( const BoxType & root_box ,
std::vector<BoxType> & part_boxes );
//----------------------------------------------------------------------------
/* Determine local id layout and communication maps for partitioned boxes.
*
* Local ids are layed out as follows:
* { [ owned-interior ids not sent ] ,
* [ owned-boundary ids to be sent to other processes ] ,
* [ received ids from processor ( my_part + 1 ) % part_count ]
* [ received ids from processor ( my_part + 2 ) % part_count ]
* [ received ids from processor ( my_part + 3 ) % part_count ]
* ... };
*
* This layout allows
* (1) received data to be copied into a contiguous block of memory
* (2) send data to be extracted from a contiguous block of memory.
*/
void box_partition_maps(
const BoxType & root_box , // [in] Global box
const std::vector<BoxType> & part_boxes , // [in] Partitioned boxes
const BoxBounds & use_boxes , // [in] Ghost boundaries
const size_t my_part , // [in] My local part
BoxType & my_use_box , // [out] My used box with ghost
std::vector<size_t> & my_use_id_map , // [out] Local ordering map
size_t & my_count_interior , // [out] How many interior
size_t & my_count_owned , // [out] How many owned
size_t & my_count_uses , // [out] How may used
std::vector<size_t> & my_part_counts , // [out] Partitioning of my_use_id_map
std::vector<std::vector<size_t> > & my_send_map ); // [out] Send id map
/* Mapping of cartesian coordinate to local id */
size_t box_map_id( const BoxType & my_use_box ,
const std::vector<size_t> & my_use_id_map ,
const size_t global_i ,
const size_t global_j ,
const size_t global_k );
//----------------------------------------------------------------------------
#endif /* #ifndef BOXMESHPARTITION_HPP */

View File

@ -1,16 +0,0 @@
# Search generated headers (binary dir) before the source tree.
INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR})
INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
# NOTE(review): SOURCES is immediately overwritten by the glob below,
# so the empty SET is redundant but harmless.
SET(SOURCES "")
FILE(GLOB SOURCES *.cpp)
SET(LIBRARIES kokkoscore)
# Build the multi_fem example for both serial and MPI configurations.
TRIBITS_ADD_EXECUTABLE(
multi_fem
SOURCES ${SOURCES}
COMM serial mpi
)

View File

@ -1,452 +0,0 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef EXPLICIT_DRIVER_HPP
#define EXPLICIT_DRIVER_HPP
#include <sys/time.h>
#include <iostream>
#include <iomanip>
#include <cstdlib>
#include <cmath>
#include <impl/Kokkos_Timer.hpp>
#include <ExplicitFunctors.hpp>
//----------------------------------------------------------------------------
namespace Explicit {
/** \brief Wall-clock timings (seconds) gathered by one run of the
 *         explicit-dynamics mini-app, plus the number of time steps.
 */
struct PerformanceData {
  double mesh_time ;
  double init_time ;
  double internal_force_time ;
  double central_diff ;
  double comm_time ;
  size_t number_of_steps ;

  /** \brief All timings start at zero. */
  PerformanceData()
    : mesh_time(0)
    , init_time(0)
    , internal_force_time(0)
    , central_diff(0)
    , comm_time(0)
    , number_of_steps(0)
    {}

  /** \brief Keep the minimum of each timing over repeated runs;
   *         number_of_steps is left unchanged.
   */
  void best( const PerformanceData & rhs )
  {
    if ( rhs.mesh_time           < mesh_time )           { mesh_time           = rhs.mesh_time ; }
    if ( rhs.init_time           < init_time )           { init_time           = rhs.init_time ; }
    if ( rhs.internal_force_time < internal_force_time ) { internal_force_time = rhs.internal_force_time ; }
    if ( rhs.central_diff        < central_diff )        { central_diff        = rhs.central_diff ; }
    if ( rhs.comm_time           < comm_time )           { comm_time           = rhs.comm_time ; }
  }
};
/** \brief Execute the explicit-dynamics time-step loop on an existing mesh.
 *
 *  \param mesh          Distributed finite-element mesh fixture.
 *  \param global_max_x  Global X extent; also used as the X displacement
 *                       boundary condition value (x_bc).
 *  \param global_max_y  Global Y extent (used only for sampled output).
 *  \param global_max_z  Global Z extent (used only for sampled output).
 *  \param steps         Number of time steps to run.
 *  \param print_sample  0 = silent, 1 = sample along two X edges,
 *                       2 = sample the domain-center node.
 *  \return Per-phase timings reduced with comm::max over all processes.
 */
template< typename Scalar , class FixtureType >
PerformanceData run( const typename FixtureType::FEMeshType & mesh ,
const int global_max_x ,
const int global_max_y ,
const int global_max_z ,
const int steps ,
const int print_sample )
{
typedef Scalar scalar_type ;
typedef FixtureType fixture_type ;
typedef typename fixture_type::execution_space execution_space ;
//typedef typename fixture_type::FEMeshType mesh_type ; // unused
enum { ElementNodeCount = fixture_type::element_node_count };
// Double-buffered state: "current" and "next" alternate every step.
const int NumStates = 2;
const int total_num_steps = steps ;
const Scalar user_dt = 5.0e-6;
//const Scalar end_time = 0.0050;
// element block parameters
const Scalar lin_bulk_visc = 0.0;
const Scalar quad_bulk_visc = 0.0;
// const Scalar lin_bulk_visc = 0.06;
// const Scalar quad_bulk_visc = 1.2;
// const Scalar hg_stiffness = 0.0;
// const Scalar hg_viscosity = 0.0;
// const Scalar hg_stiffness = 0.03;
// const Scalar hg_viscosity = 0.001;
// material properties
const Scalar youngs_modulus=1.0e6;
const Scalar poissons_ratio=0.0;
const Scalar density = 8.0e-4;
const comm::Machine machine = mesh.parallel_data_map.machine ;
PerformanceData perf_data ;
Kokkos::Impl::Timer wall_clock ;
//------------------------------------
// Generate fields
typedef Fields< scalar_type , execution_space > fields_type ;
fields_type mesh_fields( mesh ,
lin_bulk_visc ,
quad_bulk_visc ,
youngs_modulus ,
poissons_ratio ,
density );
// Host mirrors used for initialization and for sampled output below.
typename fields_type::node_coords_type::HostMirror
model_coords_h = Kokkos::create_mirror( mesh_fields.model_coords );
typename fields_type::geom_state_array_type::HostMirror
displacement_h = Kokkos::create_mirror( mesh_fields.displacement );
typename fields_type::geom_state_array_type::HostMirror
velocity_h = Kokkos::create_mirror( mesh_fields.velocity );
Kokkos::deep_copy( model_coords_h , mesh_fields.model_coords );
//------------------------------------
// Initialization
initialize_element<Scalar,execution_space>::apply( mesh_fields );
initialize_node< Scalar,execution_space>::apply( mesh_fields );
const Scalar x_bc = global_max_x ;
// Initial condition on velocity to initiate a pulse along the X axis
{
const unsigned X = 0;
for (int inode = 0; inode< mesh_fields.num_nodes; ++inode) {
if ( model_coords_h(inode,X) == 0) {
velocity_h(inode,X,0) = 1.0e3;
velocity_h(inode,X,1) = 1.0e3;
}
}
}
Kokkos::deep_copy( mesh_fields.velocity , velocity_h );
//--------------------------------------------------------------------------
// We will call a sequence of functions. These functions have been
// grouped into several functors to balance the number of global memory
// accesses versus requiring too many registers or too much L1 cache.
// Global memory accees have read/write cost and memory subsystem contention cost.
//--------------------------------------------------------------------------
perf_data.init_time = comm::max( machine , wall_clock.seconds() );
// Parameters required for the internal force computations.
int current_state = 0;
int previous_state = 0;
int next_state = 0;
perf_data.number_of_steps = total_num_steps ;
#if defined( KOKKOS_HAVE_MPI )
typedef typename
fields_type::geom_state_array_type::value_type comm_value_type ;
// Six values per node: 3 displacement + 3 velocity components.
const unsigned comm_value_count = 6 ;
Kokkos::AsyncExchange< comm_value_type , execution_space ,
Kokkos::ParallelDataMap >
comm_exchange( mesh.parallel_data_map , comm_value_count );
#endif
for (int step = 0; step < total_num_steps; ++step) {
wall_clock.reset();
//------------------------------------------------------------------------
#if defined( KOKKOS_HAVE_MPI )
{
// Communicate "send" nodes' displacement and velocity next_state
// to the ghosted nodes.
// buffer packages: { { dx , dy , dz , vx , vy , vz }_node }
pack_state< Scalar , execution_space >
::apply( comm_exchange.buffer() ,
mesh.parallel_data_map.count_interior ,
mesh.parallel_data_map.count_send ,
mesh_fields , next_state );
comm_exchange.setup();
comm_exchange.send_receive();
unpack_state< Scalar , execution_space >
::apply( mesh_fields , next_state ,
comm_exchange.buffer() ,
mesh.parallel_data_map.count_owned ,
mesh.parallel_data_map.count_receive );
execution_space::fence();
}
#endif
perf_data.comm_time += comm::max( machine , wall_clock.seconds() );
//------------------------------------------------------------------------
// rotate the states
previous_state = current_state;
current_state = next_state;
++next_state;
next_state %= NumStates;
wall_clock.reset();
// First kernel 'grad_hgop' combines two functions:
// gradient, velocity gradient
grad< Scalar , execution_space >::apply( mesh_fields ,
current_state ,
previous_state );
// Combine tensor decomposition and rotation functions.
decomp_rotate< Scalar , execution_space >::apply( mesh_fields ,
current_state ,
previous_state );
internal_force< Scalar , execution_space >::apply( mesh_fields ,
user_dt ,
current_state );
execution_space::fence();
perf_data.internal_force_time +=
comm::max( machine , wall_clock.seconds() );
wall_clock.reset();
// Assembly of elements' contributions to nodal force into
// a nodal force vector. Update the accelerations, velocities,
// displacements.
// The same pattern can be used for matrix-free residual computations.
nodal_step< Scalar , execution_space >::apply( mesh_fields ,
x_bc ,
current_state,
next_state );
execution_space::fence();
perf_data.central_diff +=
comm::max( machine , wall_clock.seconds() );
// Optional sampled output every 100 steps (timed outside the phases above).
if ( print_sample && 0 == step % 100 ) {
Kokkos::deep_copy( displacement_h , mesh_fields.displacement );
Kokkos::deep_copy( velocity_h , mesh_fields.velocity );
if ( 1 == print_sample ) {
std::cout << "step " << step
<< " : displacement(*,0,0) =" ;
for ( int i = 0 ; i < mesh_fields.num_nodes_owned ; ++i ) {
if ( model_coords_h(i,1) == 0 && model_coords_h(i,2) == 0 ) {
std::cout << " " << displacement_h(i,0,next_state);
}
}
std::cout << std::endl ;
const float tol = 1.0e-6 ;
const int yb = global_max_y ;
const int zb = global_max_z ;
std::cout << "step " << step
<< " : displacement(*," << yb << "," << zb << ") =" ;
for ( int i = 0 ; i < mesh_fields.num_nodes_owned ; ++i ) {
if ( fabs( model_coords_h(i,1) - yb ) < tol &&
fabs( model_coords_h(i,2) - zb ) < tol ) {
std::cout << " " << displacement_h(i,0,next_state);
}
}
std::cout << std::endl ;
}
else if ( 2 == print_sample ) {
const float tol = 1.0e-6 ;
const int xb = global_max_x / 2 ;
const int yb = global_max_y / 2 ;
const int zb = global_max_z / 2 ;
for ( int i = 0 ; i < mesh_fields.num_nodes_owned ; ++i ) {
if ( fabs( model_coords_h(i,0) - xb ) < tol &&
fabs( model_coords_h(i,1) - yb ) < tol &&
fabs( model_coords_h(i,2) - zb ) < tol ) {
std::cout << "step " << step
<< " : displacement("
<< xb << "," << yb << "," << zb << ") = {"
<< std::setprecision(6)
<< " " << displacement_h(i,0,next_state)
<< std::setprecision(2)
<< " " << displacement_h(i,1,next_state)
<< std::setprecision(2)
<< " " << displacement_h(i,2,next_state)
<< " }" << std::endl ;
}
}
}
}
}
return perf_data ;
}
/** \brief Scaling-study driver: runs the explicit mini-app over a doubling
 *  sequence of problem sizes and reports the best timings for each size.
 *
 *  \param label           Human-readable device/config label for the header.
 *  \param machine         Parallel machine handle (MPI communicator wrapper).
 *  \param gang_count      Passed through to the mesh fixture creation.
 *  \param elem_count_beg  First (approximate) element count.
 *  \param elem_count_end  Exclusive upper bound; sizes double each iteration.
 *  \param runs            Repetitions per size; minimum timing is reported.
 */
template <typename Scalar, typename Device>
static void driver( const char * const label ,
comm::Machine machine ,
const int gang_count ,
const int elem_count_beg ,
const int elem_count_end ,
const int runs )
{
typedef Scalar scalar_type ;
typedef Device execution_space ;
typedef double coordinate_scalar_type ;
typedef FixtureElementHex8 fixture_element_type ;
typedef BoxMeshFixture< coordinate_scalar_type ,
execution_space ,
fixture_element_type > fixture_type ;
typedef typename fixture_type::FEMeshType mesh_type ;
const size_t proc_count = comm::size( machine );
const size_t proc_rank = comm::rank( machine );
const int space = 15 ;
const int steps = 1000 ;
const int print_sample = 0 ;
// Rank 0 prints the CSV-style table header.
// NOTE(review): the header lists 8 columns but the data rows below emit
// 9 values (the step count has no matching header) -- confirm the
// intended output format before relying on column alignment.
if ( comm::rank( machine ) == 0 ) {
std::cout << std::endl ;
std::cout << "\"MiniExplicitDynamics with Kokkos " << label
<< " time_steps(" << steps << ")"
<< "\"" << std::endl;
std::cout << std::left << std::setw(space) << "\"Element\" , ";
std::cout << std::left << std::setw(space) << "\"Node\" , ";
std::cout << std::left << std::setw(space) << "\"Initialize\" , ";
std::cout << std::left << std::setw(space) << "\"ElemForce\" , ";
std::cout << std::left << std::setw(space) << "\"NodeUpdate\" , ";
std::cout << std::left << std::setw(space) << "\"NodeComm\" , ";
std::cout << std::left << std::setw(space) << "\"Time/Elem\" , ";
std::cout << std::left << std::setw(space) << "\"Time/Node\"";
std::cout << std::endl;
std::cout << std::left << std::setw(space) << "\"count\" , ";
std::cout << std::left << std::setw(space) << "\"count\" , ";
std::cout << std::left << std::setw(space) << "\"microsec\" , ";
std::cout << std::left << std::setw(space) << "\"microsec\" , ";
std::cout << std::left << std::setw(space) << "\"microsec\" , ";
std::cout << std::left << std::setw(space) << "\"microsec\" , ";
std::cout << std::left << std::setw(space) << "\"microsec\" , ";
std::cout << std::left << std::setw(space) << "\"microsec\"";
std::cout << std::endl;
}
// Problem size doubles each iteration; the box has roughly 2:1:1 aspect.
for(int i = elem_count_beg ; i < elem_count_end ; i *= 2 )
{
const int iz = std::max( 1 , (int) cbrt( ((double) i) / 2.0 ) );
const int iy = iz + 1 ;
const int ix = 2 * iy ;
const int nelem = ix * iy * iz ;
const int nnode = ( ix + 1 ) * ( iy + 1 ) * ( iz + 1 );
mesh_type mesh =
fixture_type::create( proc_count , proc_rank , gang_count ,
ix , iy , iz );
mesh.parallel_data_map.machine = machine ;
// Keep the minimum of each phase timing over 'runs' repetitions.
PerformanceData perf , best ;
for(int j = 0; j < runs; j++){
perf = run<scalar_type,fixture_type>(mesh,ix,iy,iz,steps,print_sample);
if( j == 0 ) {
best = perf ;
}
else {
best.best( perf );
}
}
if ( comm::rank( machine ) == 0 ) {
double time_per_element =
( best.internal_force_time ) / ( nelem * perf.number_of_steps );
double time_per_node =
( best.comm_time + best.central_diff ) / ( nnode * perf.number_of_steps );
std::cout << std::setw(space-3) << nelem << " , "
<< std::setw(space-3) << nnode << " , "
<< std::setw(space-3) << best.number_of_steps << " , "
<< std::setw(space-3) << best.init_time * 1000000 << " , "
<< std::setw(space-3)
<< ( best.internal_force_time * 1000000 ) / best.number_of_steps << " , "
<< std::setw(space-3)
<< ( best.central_diff * 1000000 ) / best.number_of_steps << " , "
<< std::setw(space-3)
<< ( best.comm_time * 1000000 ) / best.number_of_steps << " , "
<< std::setw(space-3) << time_per_element * 1000000 << " , "
<< std::setw(space-3) << time_per_node * 1000000
<< std::endl ;
}
}
}
} // namespace Explicit
#endif /* #ifndef EXPLICIT_DRIVER_HPP */

File diff suppressed because it is too large Load Diff

View File

@ -1,86 +0,0 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_FEMESH_HPP
#define KOKKOS_FEMESH_HPP
#include <utility>
#include <limits>
#include <iostream>
#include <sstream>
#include <stdexcept>
#include <Kokkos_Core.hpp>
#include <Kokkos_StaticCrsGraph.hpp>
#include <ParallelComm.hpp>
#include <ParallelDataMap.hpp>
namespace HybridFEM {
//----------------------------------------------------------------------------
/** \brief Finite element mesh fixture for hybrid parallel performance tests.
*/
/** \brief Plain-data mesh fixture: device views of geometry/connectivity
 *  plus the parallel distribution map consumed by the drivers.
 */
template< typename CoordScalarType , unsigned ElemNodeCount , class Device >
struct FEMesh {
typedef typename Device::size_type size_type ;
static const size_type element_node_count = ElemNodeCount ;
// Per-node (x,y,z) coordinates.
typedef Kokkos::View< CoordScalarType*[3] , Device > node_coords_type ;
// Per-element list of ElemNodeCount node ids.
typedef Kokkos::View< size_type*[ElemNodeCount], Device > elem_node_ids_type ;
// CRS graph with size_type[2] entries per node -- presumably the
// (element, local-node) pairs touching each node; confirm with callers.
typedef Kokkos::StaticCrsGraph< size_type[2] , Device > node_elem_ids_type ;
node_coords_type node_coords ;
elem_node_ids_type elem_node_ids ;
node_elem_ids_type node_elem_ids ;
// Owned/send/receive node partitioning used for MPI exchange
// (see count_interior / count_send / count_owned / count_receive users).
Kokkos::ParallelDataMap parallel_data_map ;
};
//----------------------------------------------------------------------------
} /* namespace HybridFEM */
#endif /* #ifndef KOKKOS_FEMESH_HPP */

View File

@ -1,268 +0,0 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef ELEMENTHEX_HPP
#define ELEMENTHEX_HPP
namespace HybridFEM {
template< unsigned NodeCount >
class HexElement_TensorData ;

template< unsigned NodeCount , class Device >
class HexElement_TensorEval ;

//----------------------------------------------------------------------------
/** \brief Evaluate Hex element on interval [-1,1]^3 :
 *         1d tables for the 8-node (trilinear) hexahedron.
 */
template<>
class HexElement_TensorData< 8 > {
public:

  static const unsigned element_node_count = 8 ;
  static const unsigned spatial_dimension = 3 ;
  static const unsigned integration_count_1d = 2 ;
  static const unsigned function_count_1d = 2 ;

  float values_1d [ function_count_1d ][ integration_count_1d ];
  float derivs_1d [ function_count_1d ][ integration_count_1d ];
  float weights_1d[ integration_count_1d ];

  unsigned char eval_map[ element_node_count ][4] ;

  /** \brief Linear 1d shape function value on [-1,1]. */
  static float eval_value_1d( const unsigned jf , const float x )
  {
    if ( 0 == jf ) { return 0.5 * ( 1.0 - x ); }
    if ( 1 == jf ) { return 0.5 * ( 1.0 + x ); }
    return 0 ;
  }

  /** \brief Linear 1d shape function derivative (constant in x). */
  static float eval_deriv_1d( const unsigned jf , const float )
  {
    if ( 0 == jf ) { return -0.5 ; }
    if ( 1 == jf ) { return 0.5 ; }
    return 0 ;
  }

  HexElement_TensorData()
  {
    // Tensor index (i,j,k in {0,1}) of each of the eight corner nodes.
    const unsigned char tmp_map[ element_node_count ][ spatial_dimension ] =
      { { 0 , 0 , 0 },
        { 1 , 0 , 0 },
        { 1 , 1 , 0 },
        { 0 , 1 , 0 },
        { 0 , 0 , 1 },
        { 1 , 0 , 1 },
        { 1 , 1 , 1 },
        { 0 , 1 , 1 } };

    // Two-point Gauss rule on [-1,1]: unit weights at +/- 1/sqrt(3).
    weights_1d[0] = 1 ;
    weights_1d[1] = 1 ;

    const float points_1d[ integration_count_1d ] =
      { -0.577350269 , 0.577350269 };

    for ( unsigned node = 0 ; node < element_node_count ; ++node ) {
      for ( unsigned dim = 0 ; dim < spatial_dimension ; ++dim ) {
        eval_map[node][dim] = tmp_map[node][dim];
      }
    }

    for ( unsigned ip = 0 ; ip < integration_count_1d ; ++ip ) {
      for ( unsigned jf = 0 ; jf < function_count_1d ; ++jf ) {
        values_1d[ip][jf] = eval_value_1d( jf , points_1d[ip] );
        derivs_1d[ip][jf] = eval_deriv_1d( jf , points_1d[ip] );
      }
    }
  }
};
//----------------------------------------------------------------------------
/** \brief 1d tables for the 27-node (triquadratic) hexahedral element.
 *  3d values/weights are formed as tensor products of these 1d tables
 *  (see HexElement_Data).
 */
template<>
class HexElement_TensorData< 27 > {
public:
static const unsigned element_node_count = 27 ;
static const unsigned spatial_dimension = 3 ;
static const unsigned integration_count_1d = 3 ;
static const unsigned function_count_1d = 3 ;
// NOTE(review): declared [function][integration] but filled as
// [integration point][function] in the constructor; correct only because
// both dimensions are 3 here.
float values_1d [ function_count_1d ][ integration_count_1d ];
float derivs_1d [ function_count_1d ][ integration_count_1d ];
float weights_1d[ integration_count_1d ];
unsigned char eval_map[ element_node_count ][4] ;
// sizeof(EvaluateElementHex) = 111 bytes =
// sizeof(float) * 9 +
// sizeof(float) * 9 +
// sizeof(float) * 3 +
// sizeof(char) * 27
/** \brief Quadratic 1d shape function value at p in [-1,1]. */
static float eval_value_1d( const unsigned jf , const float p )
{
return 0 == jf ? 0.5 * p * ( p - 1 ) : (
1 == jf ? 1.0 - p * p : (
2 == jf ? 0.5 * p * ( p + 1 ) : 0 ));
}
/** \brief Quadratic 1d shape function derivative at p. */
static float eval_deriv_1d( const unsigned jf , const float p )
{
return 0 == jf ? p - 0.5 : (
1 == jf ? -2.0 * p : (
2 == jf ? p + 0.5 : 0 ));
}
HexElement_TensorData()
{
// Tensor index (0,1,2 per axis) of each of the 27 nodes; ordering
// presumably matches the mesh fixture's node numbering -- confirm there.
const unsigned char tmp_map[ element_node_count ][ spatial_dimension ] =
{ { 0 , 0 , 0 },
{ 2 , 0 , 0 },
{ 2 , 2 , 0 },
{ 0 , 2 , 0 },
{ 0 , 0 , 2 },
{ 2 , 0 , 2 },
{ 2 , 2 , 2 },
{ 0 , 2 , 2 },
{ 1 , 0 , 0 },
{ 2 , 1 , 0 },
{ 1 , 2 , 0 },
{ 0 , 1 , 0 },
{ 0 , 0 , 1 },
{ 2 , 0 , 1 },
{ 2 , 2 , 1 },
{ 0 , 2 , 1 },
{ 1 , 0 , 2 },
{ 2 , 1 , 2 },
{ 1 , 2 , 2 },
{ 0 , 1 , 2 },
{ 1 , 1 , 1 },
{ 1 , 1 , 0 },
{ 1 , 1 , 2 },
{ 0 , 1 , 1 },
{ 2 , 1 , 1 },
{ 1 , 0 , 1 },
{ 1 , 2 , 1 } };
// Interval [-1,1]
// Three-point Gauss-Legendre rule: weights {5/9, 8/9, 5/9}.
weights_1d[0] = 0.555555556 ;
weights_1d[1] = 0.888888889 ;
weights_1d[2] = 0.555555556 ;
const float points_1d[3] = { -0.774596669 ,
0.000000000 ,
0.774596669 };
for ( unsigned i = 0 ; i < element_node_count ; ++i ) {
eval_map[i][0] = tmp_map[i][0];
eval_map[i][1] = tmp_map[i][1];
eval_map[i][2] = tmp_map[i][2];
}
for ( unsigned xp = 0 ; xp < integration_count_1d ; ++xp ) {
for ( unsigned xf = 0 ; xf < function_count_1d ; ++xf ) {
values_1d[xp][xf] = eval_value_1d( xf , points_1d[xp] );
derivs_1d[xp][xf] = eval_deriv_1d( xf , points_1d[xp] );
}}
}
};
//----------------------------------------------------------------------------
/** \brief Full 3d quadrature tables for a NodeCount-node hex element,
 *  assembled as tensor products of the 1d HexElement_TensorData tables.
 */
template< unsigned NodeCount >
class HexElement_Data {
public:
static const unsigned spatial_dimension = 3 ;
static const unsigned element_node_count = NodeCount ;
// One integration point and one basis function per node (tensor rule).
static const unsigned integration_count = NodeCount ;
static const unsigned function_count = NodeCount ;
float weights[ integration_count ] ;
// values[ip][jf]: basis function jf evaluated at integration point ip.
float values[ integration_count ][ function_count ];
// gradients[ip][dim][jf]: d(basis jf)/d(dim) at integration point ip.
float gradients[ integration_count ][ spatial_dimension ][ function_count ];
HexElement_Data()
{
HexElement_TensorData< NodeCount > tensor_data ;
for ( unsigned ip = 0 ; ip < integration_count ; ++ip ) {
// 1d tensor indices of this integration point.
const unsigned ipx = tensor_data.eval_map[ip][0] ;
const unsigned ipy = tensor_data.eval_map[ip][1] ;
const unsigned ipz = tensor_data.eval_map[ip][2] ;
// 3d weight is the product of the three 1d weights.
weights[ip] = tensor_data.weights_1d[ ipx ] *
tensor_data.weights_1d[ ipy ] *
tensor_data.weights_1d[ ipz ] ;
for ( unsigned jf = 0 ; jf < function_count ; ++jf ) {
const unsigned jfx = tensor_data.eval_map[jf][0] ;
const unsigned jfy = tensor_data.eval_map[jf][1] ;
const unsigned jfz = tensor_data.eval_map[jf][2] ;
values[ip][jf] = tensor_data.values_1d[ ipx ][ jfx ] *
tensor_data.values_1d[ ipy ][ jfy ] *
tensor_data.values_1d[ ipz ][ jfz ] ;
// Each gradient component differentiates in exactly one direction.
gradients[ip][0][jf] = tensor_data.derivs_1d[ ipx ][ jfx ] *
tensor_data.values_1d[ ipy ][ jfy ] *
tensor_data.values_1d[ ipz ][ jfz ] ;
gradients[ip][1][jf] = tensor_data.values_1d[ ipx ][ jfx ] *
tensor_data.derivs_1d[ ipy ][ jfy ] *
tensor_data.values_1d[ ipz ][ jfz ] ;
gradients[ip][2][jf] = tensor_data.values_1d[ ipx ][ jfx ] *
tensor_data.values_1d[ ipy ][ jfy ] *
tensor_data.derivs_1d[ ipz ][ jfz ] ;
}
}
}
};
//----------------------------------------------------------------------------
} /* namespace HybridFEM */
#endif /* #ifndef ELEMENTHEX_HPP */

View File

@ -1,443 +0,0 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_HEXEXPLICITFUNCTIONS_HPP
#define KOKKOS_HEXEXPLICITFUNCTIONS_HPP
#include <math.h>
namespace Explicit {
//----------------------------------------------------------------------------
// Hex8Functions: device-callable kernels for explicit dynamics on 8-node
// hexahedral elements: nodal gradient operators, incremental polar
// decomposition of the velocity gradient, tensor rotations, hypoelastic
// stress update, and nodal internal-force / energy accumulation.
//
// Tensor storage conventions (index constants below):
//   K_F_* : full (nonsymmetric) 3x3 tensor stored as a length-9 vector
//   K_S_* : symmetric 3x3 tensor stored as a length-6 vector
//           (K_S_YX / K_S_ZY / K_S_XZ alias the symmetric entries)
//   K_V_* : skew-symmetric 3x3 tensor stored as its length-3 dual vector
struct Hex8Functions
{
static const unsigned SpatialDim = 3 ;
static const unsigned ElemNodeCount = 8 ;
// Indices for full 3x3 tensor:
static const unsigned K_F_XX = 0 ;
static const unsigned K_F_YY = 1 ;
static const unsigned K_F_ZZ = 2 ;
static const unsigned K_F_XY = 3 ;
static const unsigned K_F_YZ = 4 ;
static const unsigned K_F_ZX = 5 ;
static const unsigned K_F_YX = 6 ;
static const unsigned K_F_ZY = 7 ;
static const unsigned K_F_XZ = 8 ;
static const unsigned K_F_SIZE = 9 ;
// Indexes into a 3 by 3 symmetric tensor stored as a length 6 vector
static const unsigned K_S_XX = 0 ;
static const unsigned K_S_YY = 1 ;
static const unsigned K_S_ZZ = 2 ;
static const unsigned K_S_XY = 3 ;
static const unsigned K_S_YZ = 4 ;
static const unsigned K_S_ZX = 5 ;
// The transposed entries alias their symmetric counterparts:
static const unsigned K_S_YX = 3 ;
static const unsigned K_S_ZY = 4 ;
static const unsigned K_S_XZ = 5 ;
static const unsigned K_S_SIZE = 6 ;
// Indexes into a 3 by 3 skew symmetric tensor stored as a length 3 vector
static const unsigned K_V_XY = 0 ;
static const unsigned K_V_YZ = 1 ;
static const unsigned K_V_ZX = 2 ;
static const unsigned K_V_SIZE = 3 ;
//--------------------------------------------------------------------------
// Dot product of two length-8 arrays (one entry per element node).
template< typename ScalarA , typename ScalarB >
KOKKOS_INLINE_FUNCTION static
double dot8( const ScalarA * const a , const ScalarB * const b )
{ return a[0] * b[0] + a[1] * b[1] + a[2] * b[2] + a[3] * b[3] +
a[4] * b[4] + a[5] * b[5] + a[6] * b[6] + a[7] * b[7] ; }
//--------------------------------------------------------------------------
// Gradient operator for the 8-node hexahedron: given nodal x- and
// z-coordinates, compute the 8 nodal coefficients grad_y[] of the
// y-gradient.  The Rab temporaries are nodal coordinate differences,
// Rab = x[a-1] - x[b-1] in 1-based node numbering.
// NOTE(review): the result presumably carries an element-volume scaling
// handled by callers -- confirm against call sites.
template< class ScalarPrecise ,
class ScalarCompact >
KOKKOS_INLINE_FUNCTION static
void grad( const ScalarPrecise x[] ,
const ScalarPrecise z[] ,
ScalarCompact grad_y[] )
{
const ScalarCompact R42=(x[3] - x[1]);
const ScalarCompact R52=(x[4] - x[1]);
const ScalarCompact R54=(x[4] - x[3]);
const ScalarCompact R63=(x[5] - x[2]);
const ScalarCompact R83=(x[7] - x[2]);
const ScalarCompact R86=(x[7] - x[5]);
const ScalarCompact R31=(x[2] - x[0]);
const ScalarCompact R61=(x[5] - x[0]);
const ScalarCompact R74=(x[6] - x[3]);
const ScalarCompact R72=(x[6] - x[1]);
const ScalarCompact R75=(x[6] - x[4]);
const ScalarCompact R81=(x[7] - x[0]);
// Common subexpressions shared by several nodes:
const ScalarCompact t1=(R63 + R54);
const ScalarCompact t2=(R61 + R74);
const ScalarCompact t3=(R72 + R81);
const ScalarCompact t4 =(R86 + R42);
const ScalarCompact t5 =(R83 + R52);
const ScalarCompact t6 =(R75 + R31);
// Calculate Y gradient from X and Z data
grad_y[0] = (z[1] * t1) - (z[2] * R42) - (z[3] * t5) + (z[4] * t4) + (z[5] * R52) - (z[7] * R54);
grad_y[1] = (z[2] * t2) + (z[3] * R31) - (z[0] * t1) - (z[5] * t6) + (z[6] * R63) - (z[4] * R61);
grad_y[2] = (z[3] * t3) + (z[0] * R42) - (z[1] * t2) - (z[6] * t4) + (z[7] * R74) - (z[5] * R72);
grad_y[3] = (z[0] * t5) - (z[1] * R31) - (z[2] * t3) + (z[7] * t6) + (z[4] * R81) - (z[6] * R83);
grad_y[4] = (z[5] * t3) + (z[6] * R86) - (z[7] * t2) - (z[0] * t4) - (z[3] * R81) + (z[1] * R61);
grad_y[5] = (z[6] * t5) - (z[4] * t3) - (z[7] * R75) + (z[1] * t6) - (z[0] * R52) + (z[2] * R72);
grad_y[6] = (z[7] * t1) - (z[5] * t5) - (z[4] * R86) + (z[2] * t4) - (z[1] * R63) + (z[3] * R83);
grad_y[7] = (z[4] * t2) - (z[6] * t1) + (z[5] * R75) - (z[3] * t6) - (z[2] * R74) + (z[0] * R54);
}
// Compute all three gradient components by cyclic permutation of the
// coordinate arrays through the two-argument grad() above.
template< class ScalarPrecise ,
class ScalarCompact >
static KOKKOS_INLINE_FUNCTION
void grad( const ScalarPrecise x[] ,
const ScalarPrecise y[] ,
const ScalarPrecise z[] ,
ScalarCompact grad_x[] ,
ScalarCompact grad_y[] ,
ScalarCompact grad_z[] )
{
grad( x , z , grad_y );
grad( z , y , grad_x );
grad( y , x , grad_z );
}
//--------------------------------------------------------------------------
// Incremental polar decomposition update.
// Inputs : dt    -- time step
//          v_gr  -- velocity gradient, full 3x3 (K_F_* layout)
// In/out : stretch -- symmetric stretch tensor (K_S_* layout)
//          rot     -- rotation tensor (K_F_* layout).  NOTE(review): rot is
//                     read at entry as the previous rotation and then
//                     overwritten, so it is effectively INOUT despite the
//                     original OUT annotation.
// Output : str_ten -- symmetric part of v_gr (rate of deformation).
template< class ScalarPrecise ,
class ScalarCompact >
KOKKOS_INLINE_FUNCTION static
void polar_decomp( const float dt ,
const ScalarCompact v_gr[] ,
ScalarPrecise stretch[] /* INOUT */ ,
ScalarCompact str_ten[] /* OUT */ ,
ScalarCompact rot[] /* INOUT: previous rotation in, updated rotation out */ )
{
const float dt_half = 0.5 * dt;
ScalarCompact vort[ K_V_SIZE ]; // Vorticity
// Symmetric part
str_ten[K_S_XX] = v_gr[K_F_XX];
str_ten[K_S_YY] = v_gr[K_F_YY];
str_ten[K_S_ZZ] = v_gr[K_F_ZZ];
str_ten[K_S_XY] = 0.5 * ( v_gr[K_F_XY] + v_gr[K_F_YX] );
str_ten[K_S_YZ] = 0.5 * ( v_gr[K_F_YZ] + v_gr[K_F_ZY] );
str_ten[K_S_ZX] = 0.5 * ( v_gr[K_F_ZX] + v_gr[K_F_XZ] );
// Skew Symmetric part
vort[K_V_XY] = 0.5 * ( v_gr[K_F_XY] - v_gr[K_F_YX] );
vort[K_V_YZ] = 0.5 * ( v_gr[K_F_YZ] - v_gr[K_F_ZY] );
vort[K_V_ZX] = 0.5 * ( v_gr[K_F_ZX] - v_gr[K_F_XZ] );
// calculate the rates of rotation via gauss elimination.
// z1..z3 form the right-hand side built from str_ten and stretch.
ScalarCompact z1 = str_ten[K_S_XY] * stretch[K_S_ZX] -
str_ten[K_S_ZX] * stretch[K_S_XY] +
str_ten[K_S_YY] * stretch[K_S_YZ] -
str_ten[K_S_YZ] * stretch[K_S_YY] +
str_ten[K_S_YZ] * stretch[K_S_ZZ] -
str_ten[K_S_ZZ] * stretch[K_S_YZ];
ScalarCompact z2 = str_ten[K_S_ZX] * stretch[K_S_XX] -
str_ten[K_S_XX] * stretch[K_S_ZX] +
str_ten[K_S_YZ] * stretch[K_S_XY] -
str_ten[K_S_XY] * stretch[K_S_YZ] +
str_ten[K_S_ZZ] * stretch[K_S_ZX] -
str_ten[K_S_ZX] * stretch[K_S_ZZ];
ScalarCompact z3 = str_ten[K_S_XX] * stretch[K_S_XY] -
str_ten[K_S_XY] * stretch[K_S_XX] +
str_ten[K_S_XY] * stretch[K_S_YY] -
str_ten[K_S_YY] * stretch[K_S_XY] +
str_ten[K_S_ZX] * stretch[K_S_YZ] -
str_ten[K_S_YZ] * stretch[K_S_ZX];
{
// forward elimination
const ScalarCompact a1inv = 1.0 / (stretch[K_S_YY] + stretch[K_S_ZZ]);
const ScalarCompact a4BYa1 = -1 * stretch[K_S_XY] * a1inv;
const ScalarCompact a2inv = 1.0 / (stretch[K_S_ZZ] + stretch[K_S_XX] + stretch[K_S_XY] * a4BYa1);
const ScalarCompact a5 = -stretch[K_S_YZ] + stretch[K_S_ZX] * a4BYa1;
z2 -= z1 * a4BYa1;
const ScalarCompact a6BYa1 = -1 * stretch[K_S_ZX] * a1inv;
const ScalarCompact a5BYa2 = a5 * a2inv;
z3 -= z1 * a6BYa1 - z2 * a5BYa2;
// backward substitution -
z3 /= (stretch[K_S_XX] + stretch[K_S_YY] + stretch[K_S_ZX] * a6BYa1 + a5 * a5BYa2);
z2 = (z2 - a5 * z3) * a2inv;
z1 = (z1*a1inv - a6BYa1 * z3 -a4BYa1 * z2);
}
// calculate rotation rates - recall that spin_rate is an asymmetric tensor,
// so compute spin rate vector as dual of spin rate tensor,
// i.e w_i = e_ijk * spin_rate_jk
z1 += vort[K_V_YZ];
z2 += vort[K_V_ZX];
z3 += vort[K_V_XY];
{
// update rotation tensor:
// 1) premultiply old rotation tensor to get right-hand side.
ScalarCompact r_XX = rot[K_F_XX] + dt_half*( z3 * rot[K_F_YX] - z2 * rot[K_F_ZX] );
ScalarCompact r_YX = rot[K_F_YX] + dt_half*( z1 * rot[K_F_ZX] - z3 * rot[K_F_XX] );
ScalarCompact r_ZX = rot[K_F_ZX] + dt_half*( z2 * rot[K_F_XX] - z1 * rot[K_F_YX] );
ScalarCompact r_XY = rot[K_F_XY] + dt_half*( z3 * rot[K_F_YY] - z2 * rot[K_F_ZY] );
ScalarCompact r_YY = rot[K_F_YY] + dt_half*( z1 * rot[K_F_ZY] - z3 * rot[K_F_XY] );
ScalarCompact r_ZY = rot[K_F_ZY] + dt_half*( z2 * rot[K_F_XY] - z1 * rot[K_F_YY] );
ScalarCompact r_XZ = rot[K_F_XZ] + dt_half*( z3 * rot[K_F_YZ] - z2 * rot[K_F_ZZ] );
ScalarCompact r_YZ = rot[K_F_YZ] + dt_half*( z1 * rot[K_F_ZZ] - z3 * rot[K_F_XZ] );
ScalarCompact r_ZZ = rot[K_F_ZZ] + dt_half*( z2 * rot[K_F_XZ] - z1 * rot[K_F_YZ] );
// 2) solve for new rotation tensor via gauss elimination.
// forward elimination -
const ScalarCompact a12 = - dt_half * z3;
const ScalarCompact a13 = dt_half * z2;
ScalarCompact b32 = - dt_half * z1;
const ScalarCompact a22inv = 1.0 / (1.0 + a12 * a12);
const ScalarCompact a13a12 = a13*a12;
const ScalarCompact a23 = b32 + a13a12;
r_YX += r_XX * a12;
r_YY += r_XY * a12;
r_YZ += r_XZ * a12;
b32 = (b32 - a13a12) * a22inv;
r_ZX += r_XX * a13 + r_YX * b32;
r_ZY += r_XY * a13 + r_YY * b32;
r_ZZ += r_XZ * a13 + r_YZ * b32;
// backward substitution -
const ScalarCompact a33inv = 1.0 / (1.0 + a13 * a13 + a23 * b32);
rot[K_F_ZX] = r_ZX * a33inv;
rot[K_F_ZY] = r_ZY * a33inv;
rot[K_F_ZZ] = r_ZZ * a33inv;
rot[K_F_YX] = ( r_YX - rot[K_F_ZX] * a23 ) * a22inv;
rot[K_F_YY] = ( r_YY - rot[K_F_ZY] * a23 ) * a22inv;
rot[K_F_YZ] = ( r_YZ - rot[K_F_ZZ] * a23 ) * a22inv;
rot[K_F_XX] = r_XX - rot[K_F_ZX] * a13 - rot[K_F_YX] * a12;
rot[K_F_XY] = r_XY - rot[K_F_ZY] * a13 - rot[K_F_YY] * a12;
rot[K_F_XZ] = r_XZ - rot[K_F_ZZ] * a13 - rot[K_F_YZ] * a12;
}
// update stretch tensor in the new configuration -
const ScalarCompact a1 = str_ten[K_S_XY] + vort[K_V_XY];
const ScalarCompact a2 = str_ten[K_S_YZ] + vort[K_V_YZ];
const ScalarCompact a3 = str_ten[K_S_ZX] + vort[K_V_ZX];
const ScalarCompact b1 = str_ten[K_S_ZX] - vort[K_V_ZX];
const ScalarCompact b2 = str_ten[K_S_XY] - vort[K_V_XY];
const ScalarCompact b3 = str_ten[K_S_YZ] - vort[K_V_YZ];
// Snapshot the current stretch so the update below uses consistent values.
const ScalarCompact s_XX = stretch[K_S_XX];
const ScalarCompact s_YY = stretch[K_S_YY];
const ScalarCompact s_ZZ = stretch[K_S_ZZ];
const ScalarCompact s_XY = stretch[K_S_XY];
const ScalarCompact s_YZ = stretch[K_S_YZ];
const ScalarCompact s_ZX = stretch[K_S_ZX];
stretch[K_S_XX] += dt * (str_ten[K_S_XX] * s_XX + ( a1 + z3 ) * s_XY + ( b1 - z2 ) * s_ZX);
stretch[K_S_YY] += dt * (str_ten[K_S_YY] * s_YY + ( a2 + z1 ) * s_YZ + ( b2 - z3 ) * s_XY);
stretch[K_S_ZZ] += dt * (str_ten[K_S_ZZ] * s_ZZ + ( a3 + z2 ) * s_ZX + ( b3 - z1 ) * s_YZ);
stretch[K_S_XY] += dt * (str_ten[K_S_XX] * s_XY + ( a1 ) * s_YY + ( b1 ) * s_YZ - z3 * s_XX + z1 * s_ZX);
stretch[K_S_YZ] += dt * (str_ten[K_S_YY] * s_YZ + ( a2 ) * s_ZZ + ( b2 ) * s_ZX - z1 * s_YY + z2 * s_XY);
stretch[K_S_ZX] += dt * (str_ten[K_S_ZZ] * s_ZX + ( a3 ) * s_XX + ( b3 ) * s_XY - z2 * s_ZZ + z3 * s_YZ);
}
//--------------------------------------------------------------------------
// Rotate a symmetric tensor forward: rot_str = R^T * str_ten * R,
// where R is the full 3x3 rotation 'rot' (t[] holds str_ten * R).
template< typename ScalarCompact >
static KOKKOS_INLINE_FUNCTION
void rotate_tensor( const ScalarCompact str_ten[] ,
const ScalarCompact rot[] ,
ScalarCompact rot_str[] )
{
ScalarCompact t[9];
t[0] = str_ten[K_S_XX]*rot[K_F_XX] + str_ten[K_S_XY]*rot[K_F_YX] + str_ten[K_S_XZ]*rot[K_F_ZX];
t[1] = str_ten[K_S_YX]*rot[K_F_XX] + str_ten[K_S_YY]*rot[K_F_YX] + str_ten[K_S_YZ]*rot[K_F_ZX];
t[2] = str_ten[K_S_ZX]*rot[K_F_XX] + str_ten[K_S_ZY]*rot[K_F_YX] + str_ten[K_S_ZZ]*rot[K_F_ZX];
t[3] = str_ten[K_S_XX]*rot[K_F_XY] + str_ten[K_S_XY]*rot[K_F_YY] + str_ten[K_S_XZ]*rot[K_F_ZY];
t[4] = str_ten[K_S_YX]*rot[K_F_XY] + str_ten[K_S_YY]*rot[K_F_YY] + str_ten[K_S_YZ]*rot[K_F_ZY];
t[5] = str_ten[K_S_ZX]*rot[K_F_XY] + str_ten[K_S_ZY]*rot[K_F_YY] + str_ten[K_S_ZZ]*rot[K_F_ZY];
t[6] = str_ten[K_S_XX]*rot[K_F_XZ] + str_ten[K_S_XY]*rot[K_F_YZ] + str_ten[K_S_XZ]*rot[K_F_ZZ];
t[7] = str_ten[K_S_YX]*rot[K_F_XZ] + str_ten[K_S_YY]*rot[K_F_YZ] + str_ten[K_S_YZ]*rot[K_F_ZZ];
t[8] = str_ten[K_S_ZX]*rot[K_F_XZ] + str_ten[K_S_ZY]*rot[K_F_YZ] + str_ten[K_S_ZZ]*rot[K_F_ZZ];
// Only the six symmetric entries of the result are stored.
rot_str[ K_S_XX ] = rot[K_F_XX] * t[0] + rot[K_F_YX] * t[1] + rot[K_F_ZX] * t[2];
rot_str[ K_S_YY ] = rot[K_F_XY] * t[3] + rot[K_F_YY] * t[4] + rot[K_F_ZY] * t[5];
rot_str[ K_S_ZZ ] = rot[K_F_XZ] * t[6] + rot[K_F_YZ] * t[7] + rot[K_F_ZZ] * t[8];
rot_str[ K_S_XY ] = rot[K_F_XX] * t[3] + rot[K_F_YX] * t[4] + rot[K_F_ZX] * t[5];
rot_str[ K_S_YZ ] = rot[K_F_XY] * t[6] + rot[K_F_YY] * t[7] + rot[K_F_ZY] * t[8];
rot_str[ K_S_ZX ] = rot[K_F_XZ] * t[0] + rot[K_F_YZ] * t[1] + rot[K_F_ZZ] * t[2];
}
//--------------------------------------------------------------------------
// Rotate a symmetric tensor backward: rot_stress = R * stress * R^T
// (the inverse transformation of rotate_tensor above; t[] = stress * R^T).
template< class ScalarPrecise ,
class ScalarCompact >
static KOKKOS_INLINE_FUNCTION
void rotate_tensor_backward( const ScalarPrecise stress[] ,
const ScalarCompact rot[] ,
ScalarCompact rot_stress[] )
{
ScalarCompact t[9] ;
t[0] = stress[K_S_XX]*rot[K_F_XX]+ stress[K_S_XY]*rot[K_F_XY]+ stress[K_S_XZ]*rot[K_F_XZ];
t[1] = stress[K_S_YX]*rot[K_F_XX]+ stress[K_S_YY]*rot[K_F_XY]+ stress[K_S_YZ]*rot[K_F_XZ];
t[2] = stress[K_S_ZX]*rot[K_F_XX]+ stress[K_S_ZY]*rot[K_F_XY]+ stress[K_S_ZZ]*rot[K_F_XZ];
t[3] = stress[K_S_XX]*rot[K_F_YX]+ stress[K_S_XY]*rot[K_F_YY]+ stress[K_S_XZ]*rot[K_F_YZ];
t[4] = stress[K_S_YX]*rot[K_F_YX]+ stress[K_S_YY]*rot[K_F_YY]+ stress[K_S_YZ]*rot[K_F_YZ];
t[5] = stress[K_S_ZX]*rot[K_F_YX]+ stress[K_S_ZY]*rot[K_F_YY]+ stress[K_S_ZZ]*rot[K_F_YZ];
t[6] = stress[K_S_XX]*rot[K_F_ZX]+ stress[K_S_XY]*rot[K_F_ZY]+ stress[K_S_XZ]*rot[K_F_ZZ];
t[7] = stress[K_S_YX]*rot[K_F_ZX]+ stress[K_S_YY]*rot[K_F_ZY]+ stress[K_S_YZ]*rot[K_F_ZZ];
t[8] = stress[K_S_ZX]*rot[K_F_ZX]+ stress[K_S_ZY]*rot[K_F_ZY]+ stress[K_S_ZZ]*rot[K_F_ZZ];
rot_stress[ K_S_XX ] = rot[K_F_XX]*t[0] + rot[K_F_XY]*t[1] + rot[K_F_XZ]*t[2];
rot_stress[ K_S_YY ] = rot[K_F_YX]*t[3] + rot[K_F_YY]*t[4] + rot[K_F_YZ]*t[5];
rot_stress[ K_S_ZZ ] = rot[K_F_ZX]*t[6] + rot[K_F_ZY]*t[7] + rot[K_F_ZZ]*t[8];
rot_stress[ K_S_XY ] = rot[K_F_XX]*t[3] + rot[K_F_XY]*t[4] + rot[K_F_XZ]*t[5];
rot_stress[ K_S_YZ ] = rot[K_F_YX]*t[6] + rot[K_F_YY]*t[7] + rot[K_F_YZ]*t[8];
rot_stress[ K_S_ZX ] = rot[K_F_ZX]*t[0] + rot[K_F_ZY]*t[1] + rot[K_F_ZZ]*t[2];
}
//--------------------------------------------------------------------------
// Hypoelastic stress update over one step dt: the deviatoric part of the
// rotated strain rate is scaled by two_mu, the volumetric part (trace e)
// by bulk_modulus.
template< class ScalarPrecise ,
class ScalarCompact >
KOKKOS_INLINE_FUNCTION static
void update_stress( const float dt ,
const float two_mu ,
const float bulk_modulus ,
const ScalarCompact rot_str[] ,
ScalarPrecise stress[] )
{
const ScalarCompact e = rot_str[ K_S_XX ] + rot_str[ K_S_YY ] + rot_str[ K_S_ZZ ] ;
const ScalarCompact eb = e * bulk_modulus ;
const ScalarCompact e3 = e / 3.0 ;
stress[K_S_XX] += dt * ( two_mu * ( rot_str[K_S_XX] - e3 ) + eb );
stress[K_S_YY] += dt * ( two_mu * ( rot_str[K_S_YY] - e3 ) + eb );
stress[K_S_ZZ] += dt * ( two_mu * ( rot_str[K_S_ZZ] - e3 ) + eb );
stress[K_S_XY] += dt * two_mu * rot_str[K_S_XY];
stress[K_S_YZ] += dt * two_mu * rot_str[K_S_YZ];
stress[K_S_ZX] += dt * two_mu * rot_str[K_S_ZX];
}
//--------------------------------------------------------------------------
// Per-node internal force: force[node] = stress . grad[node], accumulating
// the element internal energy as sum of force . velocity over the nodes.
// NOTE(review): 'total_stress12th' is presumably the stress pre-scaled by
// 1/12 (per the name) -- confirm against the caller that supplies it.
template< class ScalarPrecise ,
class ScalarCompact >
static KOKKOS_INLINE_FUNCTION
void comp_force( const ScalarPrecise vx[] ,
const ScalarPrecise vy[] ,
const ScalarPrecise vz[] ,
const ScalarCompact grad_x[] ,
const ScalarCompact grad_y[] ,
const ScalarCompact grad_z[] ,
const ScalarCompact total_stress12th[] ,
ScalarCompact force[][ SpatialDim ] ,
ScalarCompact & energy )
{
ScalarPrecise internal_energy = 0 ;
for ( unsigned inode = 0; inode < ElemNodeCount ; ++inode ) {
force[inode][0] = total_stress12th[K_S_XX] * grad_x[inode] +
total_stress12th[K_S_XY] * grad_y[inode] +
total_stress12th[K_S_XZ] * grad_z[inode] ;
force[inode][1] = total_stress12th[K_S_YX] * grad_x[inode] +
total_stress12th[K_S_YY] * grad_y[inode] +
total_stress12th[K_S_YZ] * grad_z[inode] ;
force[inode][2] = total_stress12th[K_S_ZX] * grad_x[inode] +
total_stress12th[K_S_ZY] * grad_y[inode] +
total_stress12th[K_S_ZZ] * grad_z[inode] ;
internal_energy += force[inode][0] * vx[inode] +
force[inode][1] * vy[inode] +
force[inode][2] * vz[inode] ;
}
energy = internal_energy ;
}
//--------------------------------------------------------------------------
};
} // namespace Explicit
#endif /* #ifndef KOKKOS_HEXEXPLICITFUNCTIONS_HPP */

View File

@ -1,341 +0,0 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef HYBRIDFEM_IMPLICIT_HPP
#define HYBRIDFEM_IMPLICIT_HPP
#include <utility>
#include <iostream>
#include <iomanip>
#include <Kokkos_Core.hpp>
#include <SparseLinearSystem.hpp>
#include <SparseLinearSystemFill.hpp>
#include <ImplicitFunctors.hpp>
#include <FEMesh.hpp>
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace HybridFEM {
namespace Implicit {
// Wall-clock timings (seconds) for each phase of one implicit FEM solve.
struct PerformanceData {
  double mesh_time ;                      // mesh generation
  double graph_time ;                     // sparse-graph construction
  double elem_time ;                      // element matrix/vector computation
  double matrix_gather_fill_time ;        // scatter of element data into the matrix
  double matrix_boundary_condition_time ; // boundary-condition application
  double cg_iteration_time ;              // conjugate-gradient solve

  // Every phase starts at zero.
  PerformanceData()
    : mesh_time(0) , graph_time(0) , elem_time(0)
    , matrix_gather_fill_time(0)
    , matrix_boundary_condition_time(0)
    , cg_iteration_time(0)
  {}

  // Fold 'rhs' into *this, keeping the fastest (minimum) time per phase.
  void best( const PerformanceData & rhs )
  {
    if ( rhs.mesh_time                      < mesh_time                      ) mesh_time                      = rhs.mesh_time ;
    if ( rhs.graph_time                     < graph_time                     ) graph_time                     = rhs.graph_time ;
    if ( rhs.elem_time                      < elem_time                      ) elem_time                      = rhs.elem_time ;
    if ( rhs.matrix_gather_fill_time        < matrix_gather_fill_time        ) matrix_gather_fill_time        = rhs.matrix_gather_fill_time ;
    if ( rhs.matrix_boundary_condition_time < matrix_boundary_condition_time ) matrix_boundary_condition_time = rhs.matrix_boundary_condition_time ;
    if ( rhs.cg_iteration_time              < cg_iteration_time              ) cg_iteration_time              = rhs.cg_iteration_time ;
  }
};
//----------------------------------------------------------------------------
// Assemble and solve one implicit FEM problem on the given mesh, returning
// per-phase wall-clock timings.  The first two size arguments are unused
// (kept for interface compatibility); global_max_z bounds the Dirichlet
// boundary.  When print_sample is set, rank 0-local sample solution values
// are written to stdout.
template< typename Scalar , class FixtureType >
PerformanceData run( const typename FixtureType::FEMeshType & mesh ,
const int , // global_max_x ,
const int , // global_max_y ,
const int global_max_z ,
const bool print_sample )
{
typedef Scalar scalar_type ;
typedef FixtureType fixture_type ;
typedef typename fixture_type::execution_space execution_space;
//typedef typename execution_space::size_type size_type ; // unused
typedef typename fixture_type::FEMeshType mesh_type ;
typedef typename fixture_type::coordinate_scalar_type coordinate_scalar_type ;
enum { ElementNodeCount = fixture_type::element_node_count };
const comm::Machine machine = mesh.parallel_data_map.machine ;
const size_t element_count = mesh.elem_node_ids.dimension_0();
// CG solver controls:
const size_t iteration_limit = 200 ;
const double residual_tolerance = 1e-14 ;
size_t iteration_count = 0 ;
double residual_norm = 0 ;
PerformanceData perf_data ;
//------------------------------------
// Sparse linear system types:
typedef Kokkos::View< scalar_type* , execution_space > vector_type ;
typedef Kokkos::CrsMatrix< scalar_type , execution_space > matrix_type ;
typedef typename matrix_type::graph_type matrix_graph_type ;
typedef typename matrix_type::coefficients_type matrix_coefficients_type ;
typedef GraphFactory< matrix_graph_type , mesh_type > graph_factory ;
//------------------------------------
// Problem setup types:
typedef ElementComputation< scalar_type , scalar_type , execution_space > ElementFunctor ;
typedef DirichletBoundary< scalar_type , scalar_type , execution_space > BoundaryFunctor ;
typedef typename ElementFunctor::elem_matrices_type elem_matrices_type ;
typedef typename ElementFunctor::elem_vectors_type elem_vectors_type ;
typedef GatherFill< matrix_type ,
mesh_type ,
elem_matrices_type ,
elem_vectors_type > GatherFillFunctor ;
//------------------------------------
// PDE coefficients: diffusion K and load Q.
const scalar_type elem_coeff_K = 2 ;
const scalar_type elem_load_Q = 1 ;
matrix_type linsys_matrix ;
vector_type linsys_rhs ;
vector_type linsys_solution ;
typename graph_factory::element_map_type element_map ;
// Timer starts at construction, so graph_time below covers graph creation.
Kokkos::Impl::Timer wall_clock ;
//------------------------------------
// Generate sparse matrix graph and element->graph map.
graph_factory::create( mesh , linsys_matrix.graph , element_map );
execution_space::fence();
// Reduce to the slowest rank so timings reflect the whole machine.
perf_data.graph_time = comm::max( machine , wall_clock.seconds() );
//------------------------------------
// Allocate linear system coefficients and rhs:
// (row_map has one extra entry, hence the -1 for the owned row count.)
const size_t local_owned_length =
linsys_matrix.graph.row_map.dimension_0() - 1 ;
linsys_matrix.coefficients =
matrix_coefficients_type( "coeff" , linsys_matrix.graph.entries.dimension_0() );
linsys_rhs = vector_type( "rhs" , local_owned_length );
linsys_solution = vector_type( "solution" , local_owned_length );
//------------------------------------
// Fill linear system
{
elem_matrices_type elem_matrices ;
elem_vectors_type elem_vectors ;
if ( element_count ) {
elem_matrices = elem_matrices_type( std::string("elem_matrices"), element_count );
elem_vectors = elem_vectors_type ( std::string("elem_vectors"), element_count );
}
//------------------------------------
// Compute element matrices and vectors:
wall_clock.reset();
ElementFunctor::apply( mesh ,
elem_matrices , elem_vectors ,
elem_coeff_K , elem_load_Q );
execution_space::fence();
perf_data.elem_time = comm::max( machine , wall_clock.seconds() );
//------------------------------------
// Fill linear system coefficients:
wall_clock.reset();
GatherFillFunctor::apply( linsys_matrix , linsys_rhs ,
mesh , element_map , elem_matrices , elem_vectors );
execution_space::fence();
perf_data.matrix_gather_fill_time = comm::max( machine , wall_clock.seconds() );
// Apply boundary conditions:
// NOTE(review): both coordinate ranges are passed as (0, global_max_z)
// while global_max_x/global_max_y are ignored above -- confirm intended.
wall_clock.reset();
BoundaryFunctor::apply( linsys_matrix , linsys_rhs , mesh ,
0 , global_max_z , 0 , global_max_z );
execution_space::fence();
perf_data.matrix_boundary_condition_time = comm::max( machine , wall_clock.seconds() );
}
//------------------------------------
// Solve linear system
cgsolve( mesh.parallel_data_map ,
linsys_matrix , linsys_rhs , linsys_solution ,
iteration_count , residual_norm ,
perf_data.cg_iteration_time ,
iteration_limit , residual_tolerance );
//------------------------------------
// Optionally print solution values at nodes with x <= 0 and y <= 0.
if ( print_sample ) {
typename mesh_type::node_coords_type::HostMirror coords_h =
Kokkos::create_mirror( mesh.node_coords );
typename vector_type::HostMirror X_h =
Kokkos::create_mirror( linsys_solution );
Kokkos::deep_copy( coords_h , mesh.node_coords );
Kokkos::deep_copy( X_h , linsys_solution );
for ( size_t i = 0 ; i < mesh.parallel_data_map.count_owned ; ++i ) {
const coordinate_scalar_type x = coords_h(i,0);
const coordinate_scalar_type y = coords_h(i,1);
const coordinate_scalar_type z = coords_h(i,2);
if ( x <= 0 && y <= 0 ) {
std::cout << " node( " << x << " " << y << " " << z << " ) = "
<< X_h(i) << std::endl ;
}
}
}
return perf_data ;
}
//----------------------------------------------------------------------------
template< typename Scalar , class Device >
void driver( const char * const label ,
comm::Machine machine ,
const int gang_count ,
const int elem_count_beg ,
const int elem_count_end ,
const int runs )
{
typedef Scalar scalar_type ;
typedef Device execution_space ;
typedef double coordinate_scalar_type ;
typedef FixtureElementHex8 fixture_element_type ;
typedef BoxMeshFixture< coordinate_scalar_type ,
execution_space ,
fixture_element_type > fixture_type ;
typedef typename fixture_type::FEMeshType mesh_type ;
const size_t proc_count = comm::size( machine );
const size_t proc_rank = comm::rank( machine );
if ( elem_count_beg == 0 || elem_count_end == 0 || runs == 0 ) return ;
if ( comm::rank( machine ) == 0 ) {
std::cout << std::endl ;
std::cout << "\"Kokkos::HybridFE::Implicit " << label << "\"" << std::endl;
std::cout << "\"Size\" , \"Graphing\" , \"Element\" , \"Fill\" , \"Boundary\" , \"CG-Iter\"" << std::endl
<< "\"elems\" , \"millisec\" , \"millisec\" , \"millisec\" , \"millisec\" , \"millisec\"" << std::endl ;
}
for(int i = elem_count_beg ; i < elem_count_end ; i *= 2 )
{
const int ix = std::max( 1 , (int) cbrt( ((double) i) / 2.0 ) );
const int iy = ix + 1 ;
const int iz = 2 * iy ;
const int n = ix * iy * iz ;
mesh_type mesh =
fixture_type::create( proc_count , proc_rank , gang_count ,
ix , iy , iz );
mesh.parallel_data_map.machine = machine ;
PerformanceData perf_data , perf_best ;
for(int j = 0; j < runs; j++){
perf_data = run<scalar_type,fixture_type>(mesh,ix,iy,iz, false );
if( j == 0 ) {
perf_best = perf_data ;
}
else {
perf_best.best( perf_data );
}
}
if ( comm::rank( machine ) == 0 ) {
std::cout << std::setw(8) << n << " , "
<< std::setw(10) << perf_best.graph_time * 1000 << " , "
<< std::setw(10) << perf_best.elem_time * 1000 << " , "
<< std::setw(10) << perf_best.matrix_gather_fill_time * 1000 << " , "
<< std::setw(10) << perf_best.matrix_boundary_condition_time * 1000 << " , "
<< std::setw(10) << perf_best.cg_iteration_time * 1000
<< std::endl ;
}
}
}
//----------------------------------------------------------------------------
} /* namespace Implicit */
} /* namespace HybridFEM */
#endif /* #ifndef HYBRIDFEM_IMPLICIT_HPP */

View File

@ -1,585 +0,0 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#include <iostream>
#include <fstream>
#include <iomanip>
#include <cstdlib>
#include <cmath>
namespace HybridFEM {
namespace Implicit {
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
// Gauss-Legendre integration rules on [-1,1]^Dim with N points per
// direction.  The 1-D rules are specialized explicitly; the 3-D rule is
// their tensor product.
template< typename Scalar , unsigned Dim , unsigned N >
struct TensorIntegration ;

// 1-D one-point rule (midpoint): exact for linear polynomials.
template<typename Scalar >
struct TensorIntegration<Scalar,1,1> {
  Scalar pts[1] ;
  Scalar wts[1] ;
  TensorIntegration() { pts[0] = 0 ; wts[0] = 2 ; }
};

// 1-D two-point Gauss rule: nodes at +/- 1/sqrt(3), unit weights.
template<typename Scalar >
struct TensorIntegration<Scalar,1,2>
{
  Scalar pts[2] ;
  Scalar wts[2] ;

  TensorIntegration()
  {
    const Scalar x2 = 0.577350269 ;
    pts[0] = -x2; wts[0] = 1.0;
    pts[1] =  x2; wts[1] = 1.0;
  }
};

// 1-D three-point Gauss rule: nodes at 0 and +/- sqrt(3/5).
template<typename Scalar >
struct TensorIntegration<Scalar,1,3>
{
  Scalar pts[3] ;
  Scalar wts[3] ;

  TensorIntegration()
  {
    const Scalar x3 = 0.774596669 ;
    const Scalar w1 = 0.555555556 ;
    const Scalar w2 = 0.888888889 ;
    pts[0] = -x3 ; wts[0] = w1 ;
    pts[1] =  0  ; wts[1] = w2 ;
    pts[2] =  x3 ; wts[2] = w1 ;
  }
};

// 3-D tensor-product rule: Order^3 points; point n = (k*Order + j)*Order + i
// combines the i-th, j-th, and k-th 1-D abscissae, and its weight is the
// product of the corresponding 1-D weights.
template< typename Scalar , unsigned Order >
struct TensorIntegration<Scalar,3,Order>
{
  static const unsigned N = Order * Order * Order ;

  Scalar pts[N][3] ;
  Scalar wts[N];

  TensorIntegration()
  {
    const TensorIntegration<Scalar,1,Order> line ;

    for ( unsigned k = 0 ; k < Order ; ++k ) {
    for ( unsigned j = 0 ; j < Order ; ++j ) {
    for ( unsigned i = 0 ; i < Order ; ++i ) {
      const unsigned n = ( k * Order + j ) * Order + i ;
      pts[n][0] = line.pts[i] ;
      pts[n][1] = line.pts[j] ;
      pts[n][2] = line.pts[k] ;
      wts[n]    = line.wts[i] * line.wts[j] * line.wts[k] ;
    }}}
  }
};
//----------------------------------------------------------------------------
// Precomputed values and parametric-coordinate gradients of the eight
// trilinear shape functions of the 8-node hexahedron, evaluated at the
// 2x2x2 Gauss integration points.
//   value[p][f]              : shape function f at integration point p
//   gradient[p][f*3 + d]     : d(shape f)/d(xi_d) at point p (d = 0,1,2)
//   weight[p]                : integration weight of point p
template< typename Scalar >
struct ShapeFunctionEvaluation {
static const unsigned FunctionCount = 8 ;
static const unsigned SpatialDimension = 3 ;
static const unsigned IntegrationOrder = 2 ;
typedef TensorIntegration< Scalar , SpatialDimension , IntegrationOrder >
TensorIntegrationType ;
static const unsigned PointCount = TensorIntegrationType::N ;
Scalar value [ PointCount ][ FunctionCount ] ;
Scalar gradient[ PointCount ][ FunctionCount * SpatialDimension ];
Scalar weight [ PointCount ];
// Evaluate all shape functions and gradients once at construction.
ShapeFunctionEvaluation()
{
const TensorIntegration< Scalar , SpatialDimension , IntegrationOrder >
integration ;
const Scalar ONE8TH = 0.125 ;
for ( unsigned i = 0 ; i < PointCount ; ++i ) {
// u,v,w = 1 - xi ; up1,vp1,wp1 = 1 + xi  (per parametric direction).
const Scalar u = 1.0 - integration.pts[i][0];
const Scalar v = 1.0 - integration.pts[i][1];
const Scalar w = 1.0 - integration.pts[i][2];
const Scalar up1 = 1.0 + integration.pts[i][0];
const Scalar vp1 = 1.0 + integration.pts[i][1];
const Scalar wp1 = 1.0 + integration.pts[i][2];
weight[i] = integration.wts[i] ;
// Values:
value[i][0] = ONE8TH * u * v * w ;
value[i][1] = ONE8TH * up1 * v * w ;
value[i][2] = ONE8TH * up1 * vp1 * w ;
value[i][3] = ONE8TH * u * vp1 * w ;
value[i][4] = ONE8TH * u * v * wp1 ;
value[i][5] = ONE8TH * up1 * v * wp1 ;
value[i][6] = ONE8TH * up1 * vp1 * wp1 ;
value[i][7] = ONE8TH * u * vp1 * wp1 ;
// Gradients: analytic derivatives of the products above, where
// d(u)/d(xi) = -1 and d(up1)/d(xi) = +1 (likewise for v,w).
//fn 0 = u * v * w
gradient[i][ 0] = ONE8TH * -1 * v * w ;
gradient[i][ 1] = ONE8TH * u * -1 * w ;
gradient[i][ 2] = ONE8TH * u * v * -1 ;
//fn 1 = up1 * v * w
gradient[i][ 3] = ONE8TH * 1 * v * w ;
gradient[i][ 4] = ONE8TH * up1 * -1 * w ;
gradient[i][ 5] = ONE8TH * up1 * v * -1 ;
//fn 2 = up1 * vp1 * w
gradient[i][ 6] = ONE8TH * 1 * vp1 * w ;
gradient[i][ 7] = ONE8TH * up1 * 1 * w ;
gradient[i][ 8] = ONE8TH * up1 * vp1 * -1 ;
//fn 3 = u * vp1 * w
gradient[i][ 9] = ONE8TH * -1 * vp1 * w ;
gradient[i][10] = ONE8TH * u * 1 * w ;
gradient[i][11] = ONE8TH * u * vp1 * -1 ;
//fn 4 = u * v * wp1
gradient[i][12] = ONE8TH * -1 * v * wp1 ;
gradient[i][13] = ONE8TH * u * -1 * wp1 ;
gradient[i][14] = ONE8TH * u * v * 1 ;
//fn 5 = up1 * v * wp1
gradient[i][15] = ONE8TH * 1 * v * wp1 ;
gradient[i][16] = ONE8TH * up1 * -1 * wp1 ;
gradient[i][17] = ONE8TH * up1 * v * 1 ;
//fn 6 = up1 * vp1 * wp1
gradient[i][18] = ONE8TH * 1 * vp1 * wp1 ;
gradient[i][19] = ONE8TH * up1 * 1 * wp1 ;
gradient[i][20] = ONE8TH * up1 * vp1 * 1 ;
//fn 7 = u * vp1 * wp1
gradient[i][21] = ONE8TH * -1 * vp1 * wp1 ;
gradient[i][22] = ONE8TH * u * 1 * wp1 ;
gradient[i][23] = ONE8TH * u * vp1 * 1 ;
}
}
};
//----------------------------------------------------------------------------
template< typename ScalarType , typename ScalarCoordType , class DeviceType >
struct ElementComputation
{
typedef DeviceType execution_space;
typedef ScalarType scalar_type ;
typedef typename execution_space::size_type size_type ;
static const size_type ElementNodeCount = 8 ;
typedef FEMesh< ScalarCoordType , ElementNodeCount , execution_space > mesh_type ;
typedef Kokkos::View< scalar_type[][ElementNodeCount][ElementNodeCount] , execution_space > elem_matrices_type ;
typedef Kokkos::View< scalar_type[][ElementNodeCount] , execution_space > elem_vectors_type ;
typedef ShapeFunctionEvaluation< scalar_type > shape_function_data ;
static const unsigned SpatialDim = shape_function_data::SpatialDimension ;
static const unsigned FunctionCount = shape_function_data::FunctionCount ;
private:
const shape_function_data shape_eval ;
typename mesh_type::elem_node_ids_type elem_node_ids ;
typename mesh_type::node_coords_type node_coords ;
elem_matrices_type element_matrices ;
elem_vectors_type element_vectors ;
scalar_type coeff_K ;
scalar_type coeff_Q ;
// Construct the per-element functor, capturing the mesh connectivity and
// coordinates, the output element matrices/vectors, and the PDE
// coefficients (diffusion K, load Q).
// NOTE(review): declared in the private section -- instances are created
// only through the static apply() entry point.
ElementComputation( const mesh_type & arg_mesh ,
const elem_matrices_type & arg_element_matrices ,
const elem_vectors_type & arg_element_vectors ,
const scalar_type arg_coeff_K ,
const scalar_type arg_coeff_Q )
: shape_eval()
, elem_node_ids( arg_mesh.elem_node_ids )
, node_coords( arg_mesh.node_coords )
, element_matrices( arg_element_matrices )
, element_vectors( arg_element_vectors )
, coeff_K( arg_coeff_K )
, coeff_Q( arg_coeff_Q )
{}
public:
// Entry point: build one functor and launch it in parallel over every
// element of the mesh, filling 'elem_matrices' and 'elem_vectors'.
static void apply( const mesh_type & mesh ,
const elem_matrices_type & elem_matrices ,
const elem_vectors_type & elem_vectors ,
const scalar_type elem_coeff_K ,
const scalar_type elem_coeff_Q )
{
// One functor instance is shared (by copy) across all parallel iterations.
ElementComputation comp( mesh , elem_matrices , elem_vectors , elem_coeff_K , elem_coeff_Q );
const size_t elem_count = mesh.elem_node_ids.dimension_0();
parallel_for( elem_count , comp );
}
//------------------------------------
static const unsigned FLOPS_jacobian =
FunctionCount * SpatialDim * SpatialDim * 2 ;
KOKKOS_INLINE_FUNCTION
void jacobian( const ScalarCoordType * x,
const ScalarCoordType * y,
const ScalarCoordType * z,
const scalar_type * grad_vals,
scalar_type * J) const
{
int i_grad = 0 ;
for( unsigned i = 0; i < ElementNodeCount ; ++i , i_grad += SpatialDim ) {
const scalar_type g0 = grad_vals[ i_grad ];
const scalar_type g1 = grad_vals[ i_grad + 1 ];
const scalar_type g2 = grad_vals[ i_grad + 2 ];
const scalar_type x0 = x[i] ;
const scalar_type x1 = y[i] ;
const scalar_type x2 = z[i] ;
J[0] += g0 * x0 ;
J[1] += g0 * x1 ;
J[2] += g0 * x2 ;
J[3] += g1 * x0 ;
J[4] += g1 * x1 ;
J[5] += g1 * x2 ;
J[6] += g2 * x0 ;
J[7] += g2 * x1 ;
J[8] += g2 * x2 ;
}
}
//------------------------------------
static const unsigned FLOPS_inverse_and_det = 46 ;
KOKKOS_INLINE_FUNCTION
scalar_type inverse_and_determinant3x3( scalar_type * const J ) const
{
const scalar_type J00 = J[0];
const scalar_type J01 = J[1];
const scalar_type J02 = J[2];
const scalar_type J10 = J[3];
const scalar_type J11 = J[4];
const scalar_type J12 = J[5];
const scalar_type J20 = J[6];
const scalar_type J21 = J[7];
const scalar_type J22 = J[8];
const scalar_type term0 = J22*J11 - J21*J12;
const scalar_type term1 = J22*J01 - J21*J02;
const scalar_type term2 = J12*J01 - J11*J02;
const scalar_type detJ = J00*term0 - J10*term1 + J20*term2;
const scalar_type inv_detJ = 1.0/detJ;
J[0] = term0*inv_detJ;
J[1] = -term1*inv_detJ;
J[2] = term2*inv_detJ;
J[3] = -(J22*J10 - J20*J12)*inv_detJ;
J[4] = (J22*J00 - J20*J02)*inv_detJ;
J[5] = -(J12*J00 - J10*J02)*inv_detJ;
J[6] = (J21*J10 - J20*J11)*inv_detJ;
J[7] = -(J21*J00 - J20*J01)*inv_detJ;
J[8] = (J11*J00 - J10*J01)*inv_detJ;
return detJ ;
}
//------------------------------------
KOKKOS_INLINE_FUNCTION
void matTransMat3x3_X_3xn( const scalar_type * A, int n,
const scalar_type * B,
scalar_type * C ) const
{
//A is 3x3, B is 3xn. So C is also 3xn.
//A,B,C are all assumed to be ordered such that columns are contiguous.
scalar_type * Cj = C;
const scalar_type * Bj = B;
for(int j=0; j<n; ++j) {
Cj[0] = A[0]*Bj[0] + A[1]*Bj[1] + A[2]*Bj[2];
Cj[1] = A[3]*Bj[0] + A[4]*Bj[1] + A[5]*Bj[2];
Cj[2] = A[6]*Bj[0] + A[7]*Bj[1] + A[8]*Bj[2];
Bj += 3;
Cj += 3;
}
}
//------------------------------------
static const unsigned FLOPS_contributeDiffusionMatrix = FunctionCount * ( 3 * 5 + FunctionCount * 7 ) ;
KOKKOS_INLINE_FUNCTION
void contributeDiffusionMatrix(
const scalar_type weight ,
const scalar_type grad_vals[] ,
const scalar_type invJ[] ,
scalar_type elem_mat[][8] ) const
{
scalar_type dpsidx[8], dpsidy[8], dpsidz[8];
int i_grad = 0 ;
for( unsigned i = 0; i < FunctionCount ; ++i , i_grad += 3 ) {
const scalar_type g0 = grad_vals[i_grad+0];
const scalar_type g1 = grad_vals[i_grad+1];
const scalar_type g2 = grad_vals[i_grad+2];
dpsidx[i] = g0 * invJ[0] + g1 * invJ[1] + g2 * invJ[2];
dpsidy[i] = g0 * invJ[3] + g1 * invJ[4] + g2 * invJ[5];
dpsidz[i] = g0 * invJ[6] + g1 * invJ[7] + g2 * invJ[8];
}
for( unsigned m = 0; m < FunctionCount; m++) {
for( unsigned n = 0; n < FunctionCount; n++) {
elem_mat[m][n] += weight *
((dpsidx[m] * dpsidx[n]) +
(dpsidy[m] * dpsidy[n]) +
(dpsidz[m] * dpsidz[n]));
}
}
}
//------------------------------------
static const unsigned FLOPS_contributeSourceVector = FunctionCount * 2 ;
KOKKOS_INLINE_FUNCTION
void contributeSourceVector( const scalar_type term ,
const scalar_type psi[] ,
scalar_type elem_vec[] ) const
{
for( unsigned i=0; i< FunctionCount ; ++i) {
elem_vec[i] += psi[i] * term ;
}
}
static const unsigned FLOPS_operator =
shape_function_data::PointCount * ( 3
+ FLOPS_jacobian
+ FLOPS_inverse_and_det
+ FLOPS_contributeDiffusionMatrix
+ FLOPS_contributeSourceVector ) ;
KOKKOS_INLINE_FUNCTION
void operator()( int ielem )const {
scalar_type elem_vec[8] = { 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 };
scalar_type elem_mat[8][8] =
{ { 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 } ,
{ 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 } ,
{ 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 } ,
{ 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 } ,
{ 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 } ,
{ 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 } ,
{ 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 } ,
{ 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 } };
ScalarCoordType x[8], y[8], z[8];
for ( int i = 0 ; i < 8 ; ++i ) {
const int node_index = elem_node_ids( ielem , i );
x[i] = node_coords( node_index , 0 );
y[i] = node_coords( node_index , 1 );
z[i] = node_coords( node_index , 2 );
}
// This loop could be parallelized; however,
// it would require additional per-thread temporaries
// of 'elem_vec' and 'elem_mat' which would
// consume more local memory and have to be reduced.
for ( unsigned i = 0 ; i < shape_function_data::PointCount ; ++i ) {
scalar_type J[SpatialDim*SpatialDim] = { 0, 0, 0, 0, 0, 0, 0, 0, 0 };
jacobian( x, y, z, shape_eval.gradient[i] , J );
// Overwrite J with its inverse to save scratch memory space.
const scalar_type detJ_w = shape_eval.weight[i] * inverse_and_determinant3x3(J);
const scalar_type k_detJ_w = coeff_K * detJ_w ;
const scalar_type Q_detJ_w = coeff_Q * detJ_w ;
contributeDiffusionMatrix( k_detJ_w , shape_eval.gradient[i] , J , elem_mat );
contributeSourceVector( Q_detJ_w , shape_eval.value[i] , elem_vec );
}
for( size_type i=0; i< ElementNodeCount ; ++i) {
element_vectors(ielem, i) = elem_vec[i] ;
}
for( size_type i = 0; i < ElementNodeCount ; i++){
for( size_type j = 0; j < ElementNodeCount ; j++){
element_matrices(ielem, i, j) = elem_mat[i][j] ;
}
}
}
}; /* ElementComputation */
//----------------------------------------------------------------------------
// Imposes Dirichlet boundary conditions on the z-extreme faces of the
// mesh while keeping the assembled linear system symmetric: boundary
// rows become identity rows with a prescribed right-hand side, and
// boundary columns of interior rows are folded into the load vector.
template< typename ScalarType , typename ScalarCoordType , class DeviceType >
struct DirichletBoundary
{
  typedef DeviceType execution_space;
  typedef typename execution_space::size_type size_type ;

  static const size_type ElementNodeCount = 8 ;

  typedef Kokkos::CrsMatrix< ScalarType , execution_space > matrix_type ;
  typedef Kokkos::View< ScalarType[] , execution_space > vector_type ;
  typedef FEMesh< ScalarCoordType , ElementNodeCount , execution_space > mesh_type ;

  typename mesh_type::node_coords_type node_coords ;
  matrix_type matrix ;
  vector_type rhs ;
  ScalarCoordType bc_lower_z ;
  ScalarCoordType bc_upper_z ;
  ScalarType bc_lower_value ;
  ScalarType bc_upper_value ;

  // Process one matrix row (one mesh node).
  KOKKOS_INLINE_FUNCTION
  void operator()( size_type inode ) const
  {
    const size_type row_begin = matrix.graph.row_map[inode];
    const size_type row_end   = matrix.graph.row_map[inode+1];

    const ScalarCoordType z = node_coords(inode,2);

    const bool on_lower = z <= bc_lower_z ;
    const bool on_upper = bc_upper_z <= z ;

    if ( on_lower || on_upper ) {
      // Boundary row: prescribe the solution value in the rhs and
      // replace the row with the matching row of the identity matrix.
      rhs(inode) = on_lower ? bc_lower_value : bc_upper_value ;

      for ( size_type i = row_begin ; i < row_end ; ++i ) {
        matrix.coefficients(i) =
          ( (int) inode == matrix.graph.entries(i) ) ? 1 : 0 ;
      }
    }
    else {
      // Interior row: zero out each column belonging to a boundary node
      // and move its known contribution into the load vector.
      // (Lower-face test takes precedence, matching the row treatment.)
      for ( size_type i = row_begin ; i < row_end ; ++i ) {
        const size_type cnode = matrix.graph.entries(i) ;
        const ScalarCoordType col_z = node_coords(cnode,2);

        if ( col_z <= bc_lower_z ) {
          rhs( inode ) -= bc_lower_value * matrix.coefficients(i);
          matrix.coefficients(i) = 0 ;
        }
        else if ( bc_upper_z <= col_z ) {
          rhs( inode ) -= bc_upper_value * matrix.coefficients(i);
          matrix.coefficients(i) = 0 ;
        }
      }
    }
  }

  // Populate a functor and run it over every row of the matrix.
  static void apply( const matrix_type & linsys_matrix ,
                     const vector_type & linsys_rhs ,
                     const mesh_type & mesh ,
                     const ScalarCoordType bc_lower_z ,
                     const ScalarCoordType bc_upper_z ,
                     const ScalarType bc_lower_value ,
                     const ScalarType bc_upper_value )
  {
    DirichletBoundary functor ;

    functor.node_coords    = mesh.node_coords ;
    functor.matrix         = linsys_matrix ;
    functor.rhs            = linsys_rhs ;
    functor.bc_lower_z     = bc_lower_z ;
    functor.bc_upper_z     = bc_upper_z ;
    functor.bc_lower_value = bc_lower_value ;
    functor.bc_upper_value = bc_upper_value ;

    // One row per node; row_map has (row_count + 1) entries.
    const size_t row_count = linsys_matrix.graph.row_map.dimension_0() - 1 ;

    parallel_for( row_count , functor );
  }
};
//----------------------------------------------------------------------------
} /* namespace Implicit */
} /* namespace HybridFEM */

View File

@ -1,567 +0,0 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef USESCASES_LINALG_BLAS_HPP
#define USESCASES_LINALG_BLAS_HPP
#include <cmath>
#include <utility>
#include <ParallelComm.hpp>
#include <Kokkos_Core.hpp>
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
template< class Scalar , class Layout , class DeviceType > struct Dot ;
template< class Scalar , class Layout , class DeviceType > struct Dot1 ;
template< typename ScalarA ,
typename ScalarY ,
class Layout , class Device >
struct Scale ;
template< typename ScalarA ,
typename ScalarY ,
class Layout , class Device >
struct Fill ;
template< typename ScalarA ,
typename ScalarX ,
typename ScalarY ,
class Layout , class Device >
struct AXPY ;
template< typename ScalarX ,
typename ScalarB ,
typename ScalarY ,
class Layout , class Device >
struct XPBY ;
template< typename ScalarA ,
typename ScalarX ,
typename ScalarB ,
typename ScalarY ,
typename ScalarW ,
class Layout , class Device >
struct WAXPBY ;
}
}
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
//----------------------------------------------------------------------------
#if defined( KOKKOS_HAVE_MPI )
// Global dot product of two distributed vectors: the local contribution
// is reduced on-device, then summed across all MPI ranks.
template< typename ScalarX /* const or non-const */ ,
          typename ScalarY /* const or non-const */ ,
          class L , class D ,
          class MX /* any management type */ ,
          class MY /* any management type */ >
inline
double dot( const size_t n ,
            const View< ScalarX * , L , D , MX > & x ,
            const View< ScalarY * , L , D , MY > & y ,
            comm::Machine machine )
{
  double local_result = 0 ;

  Impl::Dot< ScalarX , L , D >( n , x , y , local_result );

  double global_result = 0 ;

  MPI_Allreduce( & local_result , & global_result , 1 ,
                 MPI_DOUBLE , MPI_SUM , machine.mpi_comm );

  return global_result ;
}
#else
// Serial (no-MPI) build: the local device reduction is the global result.
template< typename ScalarX /* const or non-const */ ,
          typename ScalarY /* const or non-const */ ,
          class L , class D ,
          class MX /* any management type */ ,
          class MY /* any management type */ >
inline
double dot( const size_t n ,
            const View< ScalarX * , L , D , MX > & x ,
            const View< ScalarY * , L , D , MY > & y ,
            comm::Machine )
{
  double result = 0 ;

  Impl::Dot< ScalarX , L , D >( n , x , y , result );

  return result ;
}
#endif
//----------------------------------------------------------------------------
#if defined( KOKKOS_HAVE_MPI )
// Global self-dot (sum of squares) of a distributed vector.
template< typename ScalarX /* const or non-const */ ,
          class L , class D ,
          class MX /* any management type */ >
inline
double dot( const size_t n ,
            const View< ScalarX * , L , D , MX > & x ,
            comm::Machine machine )
{
  double local_result = 0 ;

  Impl::Dot1< ScalarX , L , D >( n , x , local_result );

  double global_result = 0 ;

  MPI_Allreduce( & local_result , & global_result , 1 ,
                 MPI_DOUBLE , MPI_SUM , machine.mpi_comm );

  return global_result ;
}
#else
// Serial (no-MPI) build: self-dot reduces on-device only.
template< typename ScalarX /* const or non-const */ ,
          class L , class D ,
          class MX /* any management type */ >
inline
double dot( const size_t n ,
            const View< ScalarX * , L , D , MX > & x ,
            comm::Machine )
{
  double result = 0 ;

  Impl::Dot1< ScalarX , L , D >( n , x , result );

  return result ;
}
#endif
//----------------------------------------------------------------------------
// Euclidean norm: sqrt of the (globally reduced) self-dot product.
template< typename ScalarX /* const or non-const */ ,
          class L , class D ,
          class MX /* any management type */ >
inline
double norm2( const size_t n ,
              const View< ScalarX * , L , D , MX > & x ,
              comm::Machine machine )
{
  const double sum_of_squares = dot( n , x , machine );

  return std::sqrt( sum_of_squares );
}
//----------------------------------------------------------------------------
template< typename ScalarA ,
typename ScalarX ,
class L ,
class D ,
class MX >
void scale( const size_t n ,
const ScalarA & alpha ,
const View< ScalarX * , L , D , MX > & x )
{
Impl::Scale< ScalarA , ScalarX , L , D >( n , alpha , x );
}
template< typename ScalarA ,
typename ScalarX ,
class L ,
class D ,
class MX >
void fill( const size_t n ,
const ScalarA & alpha ,
const View< ScalarX * , L , D , MX > & x )
{
Impl::Fill< ScalarA , ScalarX , L , D >( n , alpha , x );
}
//----------------------------------------------------------------------------
template< typename ScalarA ,
typename ScalarX ,
typename ScalarY ,
class L ,
class D ,
class MX ,
class MY >
void axpy( const size_t n ,
const ScalarA & alpha ,
const View< ScalarX *, L , D , MX > & x ,
const View< ScalarY *, L , D , MY > & y )
{
Impl::AXPY< ScalarA, ScalarX, ScalarY , L , D >( n, alpha, x, y );
}
//----------------------------------------------------------------------------
template< typename ScalarX ,
typename ScalarB ,
typename ScalarY ,
class L ,
class D ,
class MX ,
class MY >
void xpby( const size_t n ,
const View< ScalarX *, L , D , MX > & x ,
const ScalarB & beta ,
const View< ScalarY *, L , D , MY > & y )
{
Impl::XPBY< ScalarX, ScalarB, ScalarY , L , D >( n, x, beta, y );
}
//----------------------------------------------------------------------------
// w = alpha * x + beta * y
template< typename ScalarA ,
typename ScalarX ,
typename ScalarB ,
typename ScalarY ,
typename ScalarW ,
class L , class D ,
class MX , class MY , class MW >
void waxpby( const size_t n ,
const ScalarA & alpha ,
const View< ScalarX * , L , D , MX > & x ,
const ScalarB & beta ,
const View< ScalarY * , L , D , MY > & y ,
const View< ScalarW * , L , D , MW > & w )
{
Impl::WAXPBY<ScalarA,ScalarX,ScalarB,ScalarY,ScalarW,L,D>
( n , alpha , x , beta , y , w );
}
}
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
// Reduction functor: result = sum_i x(i) * y(i).
// The constructor launches the parallel_reduce immediately; the object
// exists only to run the kernel. 'join'/'init' implement the Kokkos
// reduction interface. Note both views use the same element type Scalar;
// a ScalarY argument is converted on construction.
template< typename Scalar , class L , class D >
struct Dot
{
private:

  typedef View< const Scalar*, L, D, MemoryUnmanaged > vector_const_type ;

  const vector_const_type x ;
  const vector_const_type y ;

public:

  typedef typename vector_const_type::execution_space execution_space ; // Manycore device
  typedef double value_type ; // Reduction value

  // Construct and immediately reduce over [0,n) into 'result'.
  template< class ArgX , class ArgY >
  inline
  Dot( const size_t n , const ArgX & arg_x , const ArgY & arg_y , double & result )
    : x( arg_x ), y( arg_y )
  {
    parallel_reduce( n , *this , result );
  }

  // Per-index contribution to the partial sum.
  template< typename iType >
  KOKKOS_INLINE_FUNCTION
  void operator()( const iType & i , value_type & update ) const
  { update += x(i) * y(i); }

  // Combine two partial sums.
  KOKKOS_INLINE_FUNCTION
  static void join( volatile value_type & update ,
                    const volatile value_type & source )
  { update += source; }

  // Identity value for the sum reduction.
  KOKKOS_INLINE_FUNCTION
  static void init( value_type & update )
  { update = 0 ; }
}; // Dot
//----------------------------------------------------------------------------
// Reduction functor: result = sum_i x(i) * x(i) (self dot product).
// The constructor launches the parallel_reduce immediately.
template< typename Scalar , class L , class D >
struct Dot1
{
private:

  typedef View< const Scalar*, L, D , MemoryUnmanaged > vector_const_type ;

  const vector_const_type x ;

public:

  typedef typename vector_const_type::execution_space execution_space ; // Manycore device
  typedef double value_type ; // Reduction value

  // Construct and immediately reduce over [0,n) into 'result'.
  template< class ArgX >
  inline
  Dot1( const size_t n , const ArgX & arg_x , double & result )
    : x( arg_x )
  {
    parallel_reduce( n , *this , result );
  }

  // Per-index contribution: the square of x(i).
  template< typename iType >
  KOKKOS_INLINE_FUNCTION
  void operator()( const iType & i , value_type & update ) const
  { update += x(i) * x(i) ; }

  // Combine two partial sums.
  KOKKOS_INLINE_FUNCTION
  static void join( volatile value_type & update ,
                    const volatile value_type & source )
  { update += source ; }

  // Identity value for the sum reduction.
  KOKKOS_INLINE_FUNCTION
  static void init( value_type & update )
  { update = 0 ; }
}; // Dot1
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
// Functor: w(i) = alpha * x(i) + beta * y(i) for i in [0,n).
// The constructor launches the parallel_for immediately.
template < typename ScalarA ,
           typename ScalarX ,
           typename ScalarB ,
           typename ScalarY ,
           typename ScalarW ,
           class L , class D >
struct WAXPBY
{
private:

  typedef View< ScalarW *, L , D , MemoryUnmanaged > ViewW ;
  typedef View< const ScalarX *, L , D , MemoryUnmanaged > ViewX ;
  typedef View< const ScalarY *, L , D , MemoryUnmanaged > ViewY ;

  const ViewW w ;
  const ViewX x ;
  const ViewY y ;
  const ScalarA alpha ;
  const ScalarB beta ;

public:

  typedef typename ViewW::execution_space execution_space ;

  // Elementwise update of the output vector.
  template< typename iType >
  KOKKOS_INLINE_FUNCTION
  void operator()( const iType inode ) const
  {
    w(inode) = alpha * x(inode) + beta * y(inode);
  }

  // Construct and immediately run the kernel over [0,n).
  template< class ArgX , class ArgY , class ArgW >
  inline
  WAXPBY( const size_t n ,
          const ScalarA & arg_alpha ,
          const ArgX & arg_x ,
          const ScalarB & arg_beta ,
          const ArgY & arg_y ,
          const ArgW & arg_w )
    : w( arg_w ), x( arg_x ), y( arg_y )
    , alpha( arg_alpha ), beta( arg_beta )
  {
    parallel_for( n , *this );
  }
}; // WAXPBY
//----------------------------------------------------------------------------
// Functor: w(i) *= beta for i in [0,n).
// The constructor launches the parallel_for immediately.
template < typename ScalarB ,
           typename ScalarW ,
           class L , class D >
struct Scale
{
private:

  typedef View< ScalarW *, L , D , MemoryUnmanaged > ViewW ;

  const ViewW w ;
  const ScalarB beta ;

public:

  typedef typename ViewW::execution_space execution_space ;

  // In-place elementwise scaling.
  template< typename iType >
  KOKKOS_INLINE_FUNCTION
  void operator()( const iType & i ) const
  { w(i) *= beta ; }

  // Construct and immediately run the kernel over [0,n).
  template< class ArgW >
  inline
  Scale( const size_t n , const ScalarB & arg_beta , const ArgW & arg_w )
    : w( arg_w )
    , beta( arg_beta )
  {
    parallel_for( n , *this );
  }
};
// Functor: w(i) = beta for i in [0,n).
// The constructor launches the parallel_for immediately.
template < typename ScalarB ,
           typename ScalarW ,
           class L , class D >
struct Fill
{
private:

  typedef View< ScalarW *, L , D , MemoryUnmanaged > ViewW ;

  const ViewW w ;
  const ScalarB beta ;

public:

  typedef typename ViewW::execution_space execution_space ;

  // Elementwise assignment of the fill value.
  template< typename iType >
  KOKKOS_INLINE_FUNCTION
  void operator()( const iType & i ) const
  { w(i) = beta ; }

  // Construct and immediately run the kernel over [0,n).
  template< class ArgW >
  inline
  Fill( const size_t n , const ScalarB & arg_beta , const ArgW & arg_w )
    : w( arg_w )
    , beta( arg_beta )
  {
    parallel_for( n , *this );
  }
};
//----------------------------------------------------------------------------
// Functor: w(i) += alpha * x(i) for i in [0,n).
// The constructor launches the parallel_for immediately.
template < typename ScalarA ,
           typename ScalarX ,
           typename ScalarW ,
           class L , class D >
struct AXPY
{
private:

  typedef View< ScalarW *, L , D , MemoryUnmanaged > ViewW ;
  typedef View< const ScalarX *, L , D , MemoryUnmanaged > ViewX ;

  const ViewW w ;
  const ViewX x ;
  const ScalarA alpha ;

public:

  typedef typename ViewW::execution_space execution_space ;

  // Elementwise accumulate into w.
  template< typename iType >
  KOKKOS_INLINE_FUNCTION
  void operator()( const iType & i ) const
  { w(i) += alpha * x(i); }

  // Construct and immediately run the kernel over [0,n).
  template< class ArgX , class ArgW >
  inline
  AXPY( const size_t n ,
        const ScalarA & arg_alpha ,
        const ArgX & arg_x ,
        const ArgW & arg_w )
    : w( arg_w ), x( arg_x )
    , alpha( arg_alpha )
  {
    parallel_for( n , *this );
  }
}; // AXPY
// Functor: w(i) = x(i) + beta * w(i) for i in [0,n) (in-place on w).
// The constructor launches the parallel_for immediately.
template< typename ScalarX ,
          typename ScalarB ,
          typename ScalarW ,
          class L , class D >
struct XPBY
{
private:

  typedef View< ScalarW *, L , D , MemoryUnmanaged > ViewW ;
  typedef View< const ScalarX *, L , D , MemoryUnmanaged > ViewX ;

  const ViewW w ;
  const ViewX x ;
  const ScalarB beta ;

public:

  typedef typename ViewW::execution_space execution_space ;

  // Elementwise update; reads and writes w(i).
  template< typename iType >
  KOKKOS_INLINE_FUNCTION
  void operator()( const iType & i ) const
  { w(i) = x(i) + beta * w(i); }

  // Construct and immediately run the kernel over [0,n).
  template< class ArgX , class ArgW >
  inline
  XPBY( const size_t n ,
        const ArgX & arg_x ,
        const ScalarB & arg_beta ,
        const ArgW & arg_w )
    : w( arg_w ), x( arg_x )
    , beta( arg_beta )
  {
    parallel_for( n , *this );
  }
}; // XPBY
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
} // namespace Impl
} // namespace Kokkos
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
#endif /* #ifndef USESCASES_LINALG_BLAS_HPP */

View File

@ -1,53 +0,0 @@
# Build one executable from every .cpp beside this Makefile against an
# installed Kokkos. KOKKOS_PATH points at the Kokkos tree (default: two
# directories up, i.e. building from within the Kokkos source tree).
KOKKOS_PATH ?= ../..
# Resolve the directory containing this Makefile so the build also works
# when invoked from a different working directory.
MAKEFILE_PATH := $(abspath $(lastword $(MAKEFILE_LIST)))
SRC_DIR := $(dir $(MAKEFILE_PATH))
SRC = $(wildcard $(SRC_DIR)/*.cpp)
OBJ = $(SRC:$(SRC_DIR)/%.cpp=%.o)
#SRC = $(wildcard *.cpp)
#OBJ = $(SRC:%.cpp=%.o)
default: build
echo "Start Build"
# use installed Makefile.kokkos
include $(KOKKOS_PATH)/Makefile.kokkos
# Toolchain selection: when the Cuda backend is enabled use the nvcc
# wrapper and link cusparse; otherwise plain g++. The executable suffix
# (.cuda / .host) records which backend was built.
ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES)))
CXX = $(NVCC_WRAPPER)
CXXFLAGS = -I$(SRC_DIR) -I$(CUDA_PATH) -O3
LINK = $(CXX)
LINKFLAGS = -L$(CUDA_PATH)/lib64 -lcusparse
EXE = $(addsuffix .cuda, $(shell basename $(SRC_DIR)))
#KOKKOS_DEVICES = "Cuda,OpenMP"
#KOKKOS_ARCH = "SNB,Kepler35"
else
CXX = g++
CXXFLAGS = -I$(SRC_DIR) -O3
LINK = $(CXX)
LINKFLAGS =
EXE = $(addsuffix .host, $(shell basename $(SRC_DIR)))
#KOKKOS_DEVICES = "OpenMP"
#KOKKOS_ARCH = "SNB"
endif
DEPFLAGS = -M
LIB =
# Link step: combine all objects with the Kokkos libraries and flags.
build: $(EXE)
$(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS)
$(LINK) $(KOKKOS_LDFLAGS) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE)
# Remove objects, archives, and both backend executables.
clean:
rm -f *.a *.o *.cuda *.host
# Compilation rules
%.o:$(SRC_DIR)/%.cpp $(KOKKOS_CPP_DEPENDS)
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $<

View File

@ -1,573 +0,0 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef HYBRIDFEM_NONLINEAR_HPP
#define HYBRIDFEM_NONLINEAR_HPP
#include <cmath>
#include <utility>
#include <iostream>
#include <iomanip>
#include <Kokkos_Core.hpp>
#include <SparseLinearSystem.hpp>
#include <SparseLinearSystemFill.hpp>
#include <NonlinearFunctors.hpp>
#include <FEMesh.hpp>
#include <HexElement.hpp>
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace HybridFEM {
namespace Nonlinear {
// Timing and quality metrics gathered from one run of the nonlinear solve.
// All metrics start at zero; 'best' keeps the minimum of two runs.
struct PerformanceData {
  double mesh_time ;
  double graph_time ;
  double elem_time ;
  double matrix_gather_fill_time ;
  double matrix_boundary_condition_time ;
  double cg_iteration_time ;
  size_t cg_iteration_count ;
  size_t newton_iteration_count ;
  double error_max ;

  // Zero every metric.
  PerformanceData()
  {
    mesh_time = 0 ;
    graph_time = 0 ;
    elem_time = 0 ;
    matrix_gather_fill_time = 0 ;
    matrix_boundary_condition_time = 0 ;
    cg_iteration_time = 0 ;
    cg_iteration_count = 0 ;
    newton_iteration_count = 0 ;
    error_max = 0 ;
  }

  // Keep, field by field, the smaller of this run's metric and rhs's.
  void best( const PerformanceData & rhs )
  {
    if ( rhs.mesh_time < mesh_time ) { mesh_time = rhs.mesh_time ; }
    if ( rhs.graph_time < graph_time ) { graph_time = rhs.graph_time ; }
    if ( rhs.elem_time < elem_time ) { elem_time = rhs.elem_time ; }
    if ( rhs.matrix_gather_fill_time < matrix_gather_fill_time ) {
      matrix_gather_fill_time = rhs.matrix_gather_fill_time ;
    }
    if ( rhs.matrix_boundary_condition_time < matrix_boundary_condition_time ) {
      matrix_boundary_condition_time = rhs.matrix_boundary_condition_time ;
    }
    if ( rhs.cg_iteration_time < cg_iteration_time ) {
      cg_iteration_time = rhs.cg_iteration_time ;
    }
    if ( rhs.cg_iteration_count < cg_iteration_count ) {
      cg_iteration_count = rhs.cg_iteration_count ;
    }
    if ( rhs.newton_iteration_count < newton_iteration_count ) {
      newton_iteration_count = rhs.newton_iteration_count ;
    }
    if ( rhs.error_max < error_max ) { error_max = rhs.error_max ; }
  }
};
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
class ManufacturedSolution {
public:

  // Manufactured solution for the one dimensional nonlinear PDE
  //
  //   -K T_zz + T^2 = 0 ; T(zmin) = T_zmin ; T(zmax) = T_zmax
  //
  // having an analytic solution of the form:
  //
  //   T(z) = ( a ( z - zmin ) + b )^(-2)  where  K = 1 / ( 6 a^2 )
  //
  // Given T(zmin) and T(zmax), compute K for this analytic solution.
  //
  // Two coefficient families satisfy the boundary conditions:
  //
  // Solution with singularity:
  //   a = ( 1.0 / sqrt(T_zmax) + 1.0 / sqrt(T_zmin) ) / ( zmax - zmin )
  //   b = -1.0 / sqrt(T_zmin)
  //
  // Solution without singularity (the one constructed here):
  //   a = ( 1.0 / sqrt(T_zmax) - 1.0 / sqrt(T_zmin) ) / ( zmax - zmin )
  //   b = 1.0 / sqrt(T_zmin)

  const double zmin ;    // lower z bound of the domain
  const double zmax ;    // upper z bound of the domain
  const double T_zmin ;  // prescribed solution value at zmin (must be > 0)
  const double T_zmax ;  // prescribed solution value at zmax (must be > 0)
  const double a ;       // coefficient of the analytic solution
  const double b ;       // coefficient of the analytic solution
  const double K ;       // diffusion coefficient consistent with (a,b)

  ManufacturedSolution( const double arg_zmin ,
                        const double arg_zmax ,
                        const double arg_T_zmin ,
                        const double arg_T_zmax )
    : zmin( arg_zmin )
    , zmax( arg_zmax )
    , T_zmin( arg_T_zmin )
    , T_zmax( arg_T_zmax )
      // Use std::sqrt explicitly rather than the unqualified C 'sqrt',
      // so the code does not depend on a transitively included <math.h>.
    , a( ( 1.0 / std::sqrt(T_zmax) - 1.0 / std::sqrt(T_zmin) ) / ( zmax - zmin ) )
    , b( 1.0 / std::sqrt(T_zmin) )
    , K( 1.0 / ( 6.0 * a * a ) )
    {}

  // Evaluate the analytic solution T at coordinate z.
  double operator()( const double z ) const
  {
    const double tmp = a * ( z - zmin ) + b ;
    return 1.0 / ( tmp * tmp );
  }
};
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
template< typename Scalar , class FixtureType >
PerformanceData run( const typename FixtureType::FEMeshType & mesh ,
const int , // global_max_x ,
const int , // global_max_y ,
const int global_max_z ,
const bool print_error )
{
typedef Scalar scalar_type ;
typedef FixtureType fixture_type ;
typedef typename fixture_type::execution_space execution_space;
//typedef typename execution_space::size_type size_type ; // unused
typedef typename fixture_type::FEMeshType mesh_type ;
typedef typename fixture_type::coordinate_scalar_type coordinate_scalar_type ;
enum { ElementNodeCount = fixture_type::element_node_count };
const comm::Machine machine = mesh.parallel_data_map.machine ;
const size_t element_count = mesh.elem_node_ids.dimension_0();
//------------------------------------
// The amount of nonlinearity is proportional to the ratio
// between T(zmax) and T(zmin). For the manufactured solution
// 0 < T(zmin) and 0 < T(zmax)
const ManufacturedSolution
exact_solution( /* zmin */ 0 ,
/* zmax */ global_max_z ,
/* T(zmin) */ 1 ,
/* T(zmax) */ 20 );
//-----------------------------------
// Convergence Criteria and perf data:
const size_t cg_iteration_limit = 200 ;
const double cg_tolerance = 1e-14 ;
const size_t newton_iteration_limit = 150 ;
const double newton_tolerance = 1e-14 ;
size_t cg_iteration_count_total = 0 ;
double cg_iteration_time = 0 ;
size_t newton_iteration_count = 0 ;
double residual_norm_init = 0 ;
double residual_norm = 0 ;
PerformanceData perf_data ;
//------------------------------------
// Sparse linear system types:
typedef Kokkos::View< scalar_type* , execution_space > vector_type ;
typedef Kokkos::CrsMatrix< scalar_type , execution_space > matrix_type ;
typedef typename matrix_type::graph_type matrix_graph_type ;
typedef typename matrix_type::coefficients_type matrix_coefficients_type ;
typedef GraphFactory< matrix_graph_type , mesh_type > graph_factory ;
//------------------------------------
// Problem setup types:
typedef ElementComputation < mesh_type , scalar_type > ElementFunctor ;
typedef DirichletSolution < mesh_type , scalar_type > DirichletSolutionFunctor ;
typedef DirichletResidual < mesh_type , scalar_type > DirichletResidualFunctor ;
typedef typename ElementFunctor::elem_matrices_type elem_matrices_type ;
typedef typename ElementFunctor::elem_vectors_type elem_vectors_type ;
typedef GatherFill< matrix_type ,
mesh_type ,
elem_matrices_type ,
elem_vectors_type > GatherFillFunctor ;
//------------------------------------
matrix_type jacobian ;
vector_type residual ;
vector_type delta ;
vector_type nodal_solution ;
typename graph_factory::element_map_type element_map ;
//------------------------------------
// Generate mesh and corresponding sparse matrix graph
Kokkos::Impl::Timer wall_clock ;
//------------------------------------
// Generate sparse matrix graph and element->graph map.
wall_clock.reset();
graph_factory::create( mesh , jacobian.graph , element_map );
execution_space::fence();
perf_data.graph_time = comm::max( machine , wall_clock.seconds() );
//------------------------------------
// Allocate linear system coefficients and rhs:
const size_t local_owned_length = jacobian.graph.row_map.dimension_0() - 1 ;
const size_t local_total_length = mesh.node_coords.dimension_0();
jacobian.coefficients =
matrix_coefficients_type( "jacobian_coeff" , jacobian.graph.entries.dimension_0() );
// Nonlinear residual for owned nodes:
residual = vector_type( "residual" , local_owned_length );
// Nonlinear solution for owned and ghosted nodes:
nodal_solution = vector_type( "solution" , local_total_length );
// Nonlinear solution update for owned nodes:
delta = vector_type( "delta" , local_owned_length );
//------------------------------------
// Allocation of arrays to fill the linear system
elem_matrices_type elem_matrices ; // Jacobian matrices
elem_vectors_type elem_vectors ; // Residual vectors
if ( element_count ) {
elem_matrices = elem_matrices_type( std::string("elem_matrices"), element_count );
elem_vectors = elem_vectors_type( std::string("elem_vectors"), element_count );
}
//------------------------------------
// For boundary condition set the correct values in the solution vector
// The 'zmin' face is assigned to 'T_zmin'.
// The 'zmax' face is assigned to 'T_zmax'.
// The resulting solution is one dimensional along the 'Z' axis.
DirichletSolutionFunctor::apply( nodal_solution , mesh ,
exact_solution.zmin ,
exact_solution.zmax ,
exact_solution.T_zmin ,
exact_solution.T_zmax );
for(;;) { // Nonlinear loop
#if defined( KOKKOS_HAVE_MPI )
{ //------------------------------------
// Import off-processor nodal solution values
// for residual and jacobian computations
Kokkos::AsyncExchange< typename vector_type::value_type , execution_space ,
Kokkos::ParallelDataMap >
exchange( mesh.parallel_data_map , 1 );
Kokkos::PackArray< vector_type >
::pack( exchange.buffer() ,
mesh.parallel_data_map.count_interior ,
mesh.parallel_data_map.count_send ,
nodal_solution );
exchange.setup();
exchange.send_receive();
Kokkos::UnpackArray< vector_type >
::unpack( nodal_solution , exchange.buffer() ,
mesh.parallel_data_map.count_owned ,
mesh.parallel_data_map.count_receive );
}
#endif
//------------------------------------
// Compute element matrices and vectors:
wall_clock.reset();
ElementFunctor( mesh ,
elem_matrices ,
elem_vectors ,
nodal_solution ,
exact_solution.K );
execution_space::fence();
perf_data.elem_time += comm::max( machine , wall_clock.seconds() );
//------------------------------------
// Fill linear system coefficients:
wall_clock.reset();
fill( jacobian.coefficients.dimension_0(), 0 , jacobian.coefficients );
fill( residual.dimension_0() , 0 , residual );
GatherFillFunctor::apply( jacobian ,
residual ,
mesh ,
element_map ,
elem_matrices ,
elem_vectors );
execution_space::fence();
perf_data.matrix_gather_fill_time += comm::max( machine , wall_clock.seconds() );
// Apply boundary conditions:
wall_clock.reset();
// Updates jacobian matrix to 1 on the diagonal, zero elsewhere,
// and 0 in the residual due to the solution vector having the correct value
DirichletResidualFunctor::apply( jacobian, residual, mesh ,
exact_solution.zmin ,
exact_solution.zmax );
execution_space::fence();
perf_data.matrix_boundary_condition_time +=
comm::max( machine , wall_clock.seconds() );
//------------------------------------
// Has the residual converged?
residual_norm = norm2( mesh.parallel_data_map.count_owned,
residual,
mesh.parallel_data_map.machine );
if ( 0 == newton_iteration_count ) {
residual_norm_init = residual_norm ;
}
if ( residual_norm / residual_norm_init < newton_tolerance ) {
break ;
}
//------------------------------------
// Solve linear sytem
size_t cg_iteration_count = 0 ;
double cg_residual_norm = 0 ;
cgsolve( mesh.parallel_data_map ,
jacobian , residual , delta ,
cg_iteration_count ,
cg_residual_norm ,
cg_iteration_time ,
cg_iteration_limit , cg_tolerance ) ;
perf_data.cg_iteration_time += cg_iteration_time ;
cg_iteration_count_total += cg_iteration_count ;
// Update non-linear solution with delta...
// delta is : - Dx = [Jacobian]^1 * Residual which is the negative update
// LaTeX:
// \vec {x}_{n+1} = \vec {x}_{n} - ( - \Delta \vec{x}_{n} )
// text:
// x[n+1] = x[n] + Dx
axpy( mesh.parallel_data_map.count_owned ,
-1.0, delta, nodal_solution);
++newton_iteration_count ;
if ( newton_iteration_limit < newton_iteration_count ) {
break ;
}
};
if ( newton_iteration_count ) {
perf_data.elem_time /= newton_iteration_count ;
perf_data.matrix_gather_fill_time /= newton_iteration_count ;
perf_data.matrix_boundary_condition_time /= newton_iteration_count ;
}
if ( cg_iteration_count_total ) {
perf_data.cg_iteration_time /= cg_iteration_count_total ;
}
perf_data.newton_iteration_count = newton_iteration_count ;
perf_data.cg_iteration_count = cg_iteration_count_total ;
//------------------------------------
{
// For extracting the nodal solution and its coordinates:
typename mesh_type::node_coords_type::HostMirror node_coords_host =
Kokkos::create_mirror( mesh.node_coords );
typename vector_type::HostMirror nodal_solution_host =
Kokkos::create_mirror( nodal_solution );
Kokkos::deep_copy( node_coords_host , mesh.node_coords );
Kokkos::deep_copy( nodal_solution_host , nodal_solution );
double tmp = 0 ;
for ( size_t i = 0 ; i < mesh.parallel_data_map.count_owned ; ++i ) {
const coordinate_scalar_type x = node_coords_host(i,0);
const coordinate_scalar_type y = node_coords_host(i,1);
const coordinate_scalar_type z = node_coords_host(i,2);
const double Tx = exact_solution(z);
const double Ts = nodal_solution_host(i);
const double Te = std::abs( Tx - Ts ) / std::abs( Tx );
tmp = std::max( tmp , Te );
if ( print_error && 0.02 < Te ) {
std::cout << " node( " << x << " " << y << " " << z << " ) = "
<< Ts << " != exact_solution " << Tx
<< std::endl ;
}
}
perf_data.error_max = comm::max( machine , tmp );
}
return perf_data ;
}
//----------------------------------------------------------------------------
// Benchmark driver: sweep problem sizes from 'elem_count_beg' up to
// (but excluding) 'elem_count_end', doubling each step.  For every size,
// build a box-mesh fixture, run the nonlinear solve 'runs' times, keep the
// best timings, and print one CSV row per size on rank 0.
//
// Parameters:
//   label           - human-readable tag printed in the CSV header
//   machine         - communicator handle (rank 0 does all printing)
//   gang_count      - gang (thread-team) count forwarded to the fixture
//   elem_count_beg  - first target element count (must be positive)
//   elem_count_end  - exclusive upper bound on the target element count
//   runs            - repetitions per size; best-of-'runs' is reported
template< typename Scalar , class Device , class FixtureElement >
void driver( const char * const label ,
             comm::Machine machine ,
             const int gang_count ,
             const int elem_count_beg ,
             const int elem_count_end ,
             const int runs )
{
  typedef Scalar         scalar_type ;
  typedef Device         execution_space ;
  typedef double         coordinate_scalar_type ;
  typedef FixtureElement fixture_element_type ;

  typedef BoxMeshFixture< coordinate_scalar_type ,
                          execution_space ,
                          fixture_element_type > fixture_type ;

  typedef typename fixture_type::FEMeshType mesh_type ;

  const size_t proc_count = comm::size( machine );
  const size_t proc_rank  = comm::rank( machine );

  // Nothing to do without a positive size range and run count.
  // (A non-positive 'elem_count_beg' would also make the doubling
  // loop below never terminate, so reject it here.)
  if ( elem_count_beg <= 0 || elem_count_end <= 0 || runs <= 0 ) return ;

  if ( proc_rank == 0 ) {
    std::cout << std::endl ;
    std::cout << "\"Kokkos::HybridFE::Nonlinear " << label << "\"" << std::endl;
    std::cout
      << "\"Size\" , \"Size\" , \"Graphing\" , \"Element\" , \"Fill\" , \"Boundary\" , \"CG-Iter\" , \"CG-Iter\" , \"Newton-Iter\" , \"Max-node-error\""
      << std::endl
      << "\"elems\" , \"nodes\" , \"millisec\" , \"millisec\" , \"millisec\" , \"millisec\" , \"millisec\" , \"total-count\" , \"total-count\" , \"ratio\""
      << std::endl ;
  }

  const bool   print_sample = false ;
  const double x_curve = 1.0 ;
  const double y_curve = 1.0 ;
  const double z_curve = 0.8 ;

  for ( int i = elem_count_beg ; i < elem_count_end ; i *= 2 )
  {
    // Choose a box ix < iy < iz whose element count tracks the target 'i'.
    const int ix = std::max( 1 , (int) cbrt( ((double) i) / 2.0 ) );
    const int iy = 1 + ix ;
    const int iz = 2 * iy ;
    const int global_elem_count = ix * iy * iz ;
    // Quadratic (27-node) elements: (2n+1) nodes per direction.
    const int global_node_count = ( 2 * ix + 1 ) *
                                  ( 2 * iy + 1 ) *
                                  ( 2 * iz + 1 );

    mesh_type mesh =
      fixture_type::create( proc_count , proc_rank , gang_count ,
                            ix , iy , iz ,
                            x_curve , y_curve , z_curve );

    mesh.parallel_data_map.machine = machine ;

    PerformanceData perf_data , perf_best ;

    // Best-of-'runs' timing: keep the minimum of each phase.
    for ( int j = 0 ; j < runs ; j++ ) {

      perf_data = run<scalar_type,fixture_type>( mesh , ix , iy , iz , print_sample );

      if ( j == 0 ) {
        perf_best = perf_data ;
      }
      else {
        perf_best.best( perf_data );
      }
    }

    if ( proc_rank == 0 ) {
      std::cout << std::setw(8) << global_elem_count << " , "
                << std::setw(8) << global_node_count << " , "
                << std::setw(10) << perf_best.graph_time * 1000 << " , "
                << std::setw(10) << perf_best.elem_time * 1000 << " , "
                << std::setw(10) << perf_best.matrix_gather_fill_time * 1000 << " , "
                << std::setw(10) << perf_best.matrix_boundary_condition_time * 1000 << " , "
                << std::setw(10) << perf_best.cg_iteration_time * 1000 << " , "
                << std::setw(7) << perf_best.cg_iteration_count << " , "
                << std::setw(3) << perf_best.newton_iteration_count << " , "
                << std::setw(10) << perf_best.error_max
                << std::endl ;
    }
  }
}
//----------------------------------------------------------------------------
} /* namespace Nonlinear */
} /* namespace HybridFEM */
#endif /* #ifndef HYBRIDFEM_IMPLICIT_HPP */

View File

@ -1,390 +0,0 @@
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
#include <stdio.h>
#include <iostream>
#include <fstream>
#include <iomanip>
#include <cstdlib>
#include <cmath>
#include <Kokkos_Core.hpp>
#include <HexElement.hpp>
#include <FEMesh.hpp>
namespace HybridFEM {
namespace Nonlinear {
template< class MeshType , typename ScalarType > struct ElementComputation ;
//----------------------------------------------------------------------------
// Cuda specialization of the nonlinear thermal element computation for the
// 27-node hexahedral element.  One CUDA block iterates over elements;
// within a block, threadIdx.x indexes a basis function (or a reduction
// lane) and threadIdx.y indexes a slice of integration points.
template<>
struct ElementComputation< FEMesh< double , 27 , Kokkos::Cuda > , double >
{
  typedef Kokkos::Cuda execution_space ;

  static const unsigned ElementNodeCount = 27 ;

  typedef HexElement_Data< ElementNodeCount > element_data_type ;
  typedef FEMesh< double , ElementNodeCount , execution_space > mesh_type ;

  // Sizes provided by the master-element data:
  static const unsigned SpatialDim       = element_data_type::spatial_dimension ;
  static const unsigned FunctionCount    = element_data_type::function_count ;
  static const unsigned IntegrationCount = element_data_type::integration_count ;
  static const unsigned TensorDim        = SpatialDim * SpatialDim ;

  // Outputs: one dense FunctionCount x FunctionCount jacobian and one
  // FunctionCount residual vector per element.
  typedef Kokkos::View< double[][FunctionCount][FunctionCount] , execution_space > elem_matrices_type ;
  typedef Kokkos::View< double[][FunctionCount] , execution_space > elem_vectors_type ;
  typedef Kokkos::View< double[] , execution_space > value_vector_type ;

private:

  const element_data_type elem_data ;                          // master-element bases, gradients, weights
  const typename mesh_type::elem_node_ids_type elem_node_ids ; // element -> node connectivity
  const typename mesh_type::node_coords_type node_coords ;     // nodal coordinates
  const value_vector_type nodal_values ;                       // current nodal solution
  const elem_matrices_type element_matrices ;                  // output element jacobians
  const elem_vectors_type element_vectors ;                    // output element residuals
  const float coeff_K ;                                        // diffusion coefficient 'k'
  const unsigned elem_count ;                                  // number of elements to process

  // invJacIndex[i] = the four flat jacobian indices whose 2x2 cofactor
  // yields (unscaled) inverse-jacobian entry i; filled in the constructor.
  unsigned invJacIndex[9][4] ;

  // Flat indexing of a 3x3 tensor stored row-major:
  static const unsigned j11 = 0 , j12 = 1 , j13 = 2 ,
                        j21 = 3 , j22 = 4 , j23 = 5 ,
                        j31 = 6 , j32 = 7 , j33 = 8 ;

  // Can only handle up to 16 warps:
  static const unsigned BlockDimX = 32 ;
  static const unsigned BlockDimY = 7 ;

  // Dynamically-sized shared-memory scratch; a single WorkSpace instance
  // is shared by the whole block (launched with shmem = sizeof(WorkSpace)).
  struct WorkSpace {
    double sum[ BlockDimY ][ BlockDimX ];       // per-warp reduction scratch
    double value_at_integ[ IntegrationCount ];  // T at each integration point
    double gradx_at_integ[ IntegrationCount ];  // grad(T).x at each integration point
    double grady_at_integ[ IntegrationCount ];
    double gradz_at_integ[ IntegrationCount ];
    float  spaceJac[    BlockDimY ][ 9 ];       // per-warp element jacobian
    float  spaceInvJac[ BlockDimY ][ 9 ];       // per-warp inverse jacobian
    float  detJweight[ IntegrationCount ];      // detJ * quadrature weight
    float  dpsidx[ FunctionCount ][ IntegrationCount ]; // physical-space basis gradients
    float  dpsidy[ FunctionCount ][ IntegrationCount ];
    float  dpsidz[ FunctionCount ][ IntegrationCount ];
  };

public:

  // Captures mesh / solution / output views and immediately launches the
  // kernel over all elements (no deferred dispatch).
  ElementComputation ( const mesh_type          & arg_mesh ,
                       const elem_matrices_type & arg_element_matrices ,
                       const elem_vectors_type  & arg_element_vectors ,
                       const value_vector_type  & arg_nodal_values ,
                       const float arg_coeff_K )
  : elem_data()
  , elem_node_ids( arg_mesh.elem_node_ids )
  , node_coords(   arg_mesh.node_coords )
  , nodal_values(  arg_nodal_values )
  , element_matrices( arg_element_matrices )
  , element_vectors(  arg_element_vectors )
  , coeff_K( arg_coeff_K )
  , elem_count( arg_mesh.elem_node_ids.dimension_0() )
  {
    // Cofactor index table for the 3x3 inverse: entry i is computed as
    // J[a]*J[b] - J[c]*J[d] with {a,b,c,d} = jInvJ[i].
    const unsigned jInvJ[9][4] =
      { { j22 , j33 , j23 , j32 } ,
        { j13 , j32 , j12 , j33 } ,
        { j12 , j23 , j13 , j22 } ,

        { j23 , j31 , j21 , j33 } ,
        { j11 , j33 , j13 , j31 } ,
        { j13 , j21 , j11 , j23 } ,

        { j21 , j32 , j22 , j31 } ,
        { j12 , j31 , j11 , j32 } ,
        { j11 , j22 , j12 , j21 } };

    for ( unsigned i = 0 ; i < 9 ; ++i ) {
      for ( unsigned j = 0 ; j < 4 ; ++j ) {
        invJacIndex[i][j] = jInvJ[i][j] ;
      }
    }

    const unsigned shmem = sizeof(WorkSpace);
    const unsigned grid_max = 65535 ;   // max gridDim.x for compute capability 2.x
    const unsigned grid_count = std::min( grid_max , elem_count );

    // For compute capability 2.x up to 1024 threads per block
    const dim3 block( BlockDimX , BlockDimY , 1 );
    const dim3 grid( grid_count , 1 , 1 );

    Kokkos::Impl::CudaParallelLaunch< ElementComputation >( *this , grid , block , shmem );
  }

public:

  //------------------------------------
  // Sum among the threadIdx.x
  // Tree reduction of 'value' across the 32 lanes of the calling warp;
  // lane 0 stores the total into 'result'.
  // NOTE(review): relies on warp-synchronous execution of the 32 x-lanes
  // (no __syncthreads between steps) plus 'volatile' to defeat register
  // caching -- valid on the pre-Volta architectures this targets; would
  // need __syncwarp or shuffles on newer hardware.  TODO confirm.
  template< typename Type >
  __device__ inline static
  void sum_x( Type & result , const double value )
  {
    extern __shared__ WorkSpace work_data[] ;

    volatile double * const base_sum =
      & work_data->sum[ threadIdx.y ][ threadIdx.x ] ;

    base_sum[ 0] = value ;

    if ( threadIdx.x < 16 ) {
      base_sum[0] += base_sum[16];
      base_sum[0] += base_sum[ 8];
      base_sum[0] += base_sum[ 4];
      base_sum[0] += base_sum[ 2];
      base_sum[0] += base_sum[ 1];
    }

    if ( 0 == threadIdx.x ) {
      result = base_sum[0] ;
    }
  }

  // Zero this thread's slot of the reduction scratch (every thread calls).
  __device__ inline static
  void sum_x_clear()
  {
    extern __shared__ WorkSpace work_data[] ;

    work_data->sum[ threadIdx.y ][ threadIdx.x ] = 0 ;
  }

  //------------------------------------
  //------------------------------------
  // Phase 1 for element 'ielem': gather nodal data, build the jacobian at
  // each integration point, and populate the WorkSpace with transformed
  // basis gradients, detJ*weight, and the interpolated value/gradient.
  __device__ inline
  void evaluateFunctions( const unsigned ielem ) const
  {
    extern __shared__ WorkSpace work_data[] ;

    // Each warp (threadIdx.y) computes an integration point
    // Each thread is responsible for a node / function.
    const unsigned iFunc = threadIdx.x ;
    const bool hasFunc = iFunc < FunctionCount ;

    //------------------------------------
    // Each warp gathers a different variable into 'elem_mat' shared memory.
    // (x, y, z coordinates and the solution value, one row of 'sum' each.)
    if ( hasFunc ) {
      const unsigned node = elem_node_ids( ielem , iFunc );

      for ( unsigned iy = threadIdx.y ; iy < 4 ; iy += blockDim.y ) {
        switch( iy ) {
        case 0 : work_data->sum[0][iFunc] = node_coords(node,0); break ;
        case 1 : work_data->sum[1][iFunc] = node_coords(node,1); break ;
        case 2 : work_data->sum[2][iFunc] = node_coords(node,2); break ;
        case 3 : work_data->sum[3][iFunc] = nodal_values(node); break ;
        default: break ;
        }
      }
    }

    __syncthreads(); // Wait for all warps to finish gathering

    // now get local 'const' copies in register space:
    const double x       = work_data->sum[0][ iFunc ];
    const double y       = work_data->sum[1][ iFunc ];
    const double z       = work_data->sum[2][ iFunc ];
    const double dof_val = work_data->sum[3][ iFunc ];

    __syncthreads(); // Wait for all warps to finish extracting

    sum_x_clear(); // Make sure summation scratch is zero

    //------------------------------------
    // Each warp is now on its own computing an integration point
    // so no further explicit synchronizations are required.

    if ( hasFunc ) {

      float * const J    = work_data->spaceJac[    threadIdx.y ];
      float * const invJ = work_data->spaceInvJac[ threadIdx.y ];

      for ( unsigned iInt = threadIdx.y ;
                     iInt < IntegrationCount ; iInt += blockDim.y ) {

        const float val = elem_data.values[iInt][iFunc] ;
        const float gx  = elem_data.gradients[iInt][0][iFunc] ;
        const float gy  = elem_data.gradients[iInt][1][iFunc] ;
        const float gz  = elem_data.gradients[iInt][2][iFunc] ;

        // Accumulate the 3x3 jacobian: row = master-element direction,
        // column = physical coordinate; each entry is a warp reduction.
        sum_x( J[j11], gx * x );
        sum_x( J[j12], gx * y );
        sum_x( J[j13], gx * z );

        sum_x( J[j21], gy * x );
        sum_x( J[j22], gy * y );
        sum_x( J[j23], gy * z );

        sum_x( J[j31], gz * x );
        sum_x( J[j32], gz * y );
        sum_x( J[j33], gz * z );

        // Inverse jacobian, only enough parallel work for 9 threads in the warp

        if ( iFunc < TensorDim ) {

          invJ[ iFunc ] =
            J[ invJacIndex[iFunc][0] ] * J[ invJacIndex[iFunc][1] ] -
            J[ invJacIndex[iFunc][2] ] * J[ invJacIndex[iFunc][3] ] ;

          // Let all threads in the warp compute determinant into a register

          const float detJ = J[j11] * invJ[j11] +
                             J[j21] * invJ[j12] +
                             J[j31] * invJ[j13] ;

          invJ[ iFunc ] /= detJ ;

          if ( 0 == iFunc ) {
            work_data->detJweight[ iInt ] = detJ * elem_data.weights[ iInt ] ;
          }
        }

        // Transform bases gradients and compute value and gradient

        const float dx = gx * invJ[j11] + gy * invJ[j12] + gz * invJ[j13];
        const float dy = gx * invJ[j21] + gy * invJ[j22] + gz * invJ[j23];
        const float dz = gx * invJ[j31] + gy * invJ[j32] + gz * invJ[j33];

        work_data->dpsidx[iFunc][iInt] = dx ;
        work_data->dpsidy[iFunc][iInt] = dy ;
        work_data->dpsidz[iFunc][iInt] = dz ;

        // Interpolated solution value and gradient at this integration point:
        sum_x( work_data->gradx_at_integ[iInt] , dof_val * dx );
        sum_x( work_data->grady_at_integ[iInt] , dof_val * dy );
        sum_x( work_data->gradz_at_integ[iInt] , dof_val * dz );
        sum_x( work_data->value_at_integ[iInt] , dof_val * val );
      }
    }

    __syncthreads(); // All shared data must be populated at return.
  }

  // Phase 2 for element 'ielem': integrate the residual and jacobian
  // contributions from the WorkSpace into the per-element output views.
  __device__ inline
  void contributeResidualJacobian( const unsigned ielem ) const
  {
    extern __shared__ WorkSpace work_data[] ;

    sum_x_clear(); // Make sure summation scratch is zero

    // $$ R_i = \int_{\Omega} \nabla \phi_i \cdot (k \nabla T) + \phi_i T^2 d \Omega $$
    // $$ J_{i,j} = \frac{\partial R_i}{\partial T_j} = \int_{\Omega} k \nabla \phi_i \cdot \nabla \phi_j + 2 \phi_i \phi_j T d \Omega $$

    // Here the lane index selects the integration point (transposed
    // relative to evaluateFunctions, where lanes were basis functions).
    const unsigned iInt = threadIdx.x ;

    if ( iInt < IntegrationCount ) {
      const double value_at_integ = work_data->value_at_integ[ iInt ] ;
      const double gradx_at_integ = work_data->gradx_at_integ[ iInt ] ;
      const double grady_at_integ = work_data->grady_at_integ[ iInt ] ;
      const double gradz_at_integ = work_data->gradz_at_integ[ iInt ] ;

      const float detJweight = work_data->detJweight[ iInt ] ;
      const float coeff_K_detJweight = coeff_K * detJweight ;

      for ( unsigned iRow = threadIdx.y ;
                     iRow < FunctionCount ; iRow += blockDim.y ) {
        const float value_row  = elem_data.values[ iInt ][ iRow ] * detJweight ;
        const float dpsidx_row = work_data->dpsidx[ iRow ][ iInt ] * coeff_K_detJweight ;
        const float dpsidy_row = work_data->dpsidy[ iRow ][ iInt ] * coeff_K_detJweight ;
        const float dpsidz_row = work_data->dpsidz[ iRow ][ iInt ] * coeff_K_detJweight ;

        // Diffusion and source terms of the residual at this point:
        const double res_del = dpsidx_row * gradx_at_integ +
                               dpsidy_row * grady_at_integ +
                               dpsidz_row * gradz_at_integ ;

        const double res_val = value_at_integ * value_at_integ * value_row ;
        const double jac_val_row = 2 * value_at_integ * value_row ;

        // Reduce across integration points (lanes) into the output views:
        sum_x( element_vectors( ielem , iRow ) , res_del + res_val );

        for ( unsigned iCol = 0 ; iCol < FunctionCount ; ++iCol ) {
          const float jac_del =
            dpsidx_row * work_data->dpsidx[iCol][iInt] +
            dpsidy_row * work_data->dpsidy[iCol][iInt] +
            dpsidz_row * work_data->dpsidz[iCol][iInt] ;

          const double jac_val =
            jac_val_row * elem_data.values[ iInt ][ iCol ] ;

          sum_x( element_matrices( ielem , iRow , iCol ) , jac_del + jac_val );
        }
      }
    }

    __syncthreads(); // All warps finish before refilling shared data
  }

  // Kernel body: grid-stride loop over elements -- each block handles
  // elements blockIdx.x, blockIdx.x + gridDim.x, ... (gridDim.x is capped
  // at 65535 in the constructor).
  __device__ inline
  void operator()(void) const
  {
    extern __shared__ WorkSpace work_data[] ;

    for ( unsigned ielem = blockIdx.x ; ielem < elem_count ; ielem += gridDim.x ) {
      evaluateFunctions( ielem );
      contributeResidualJacobian( ielem );
    }
  }

}; /* ElementComputation */
} /* namespace Nonlinear */
} /* namespace HybridFEM */

View File

@ -1,482 +0,0 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_NONLINEARFUNCTORS_HPP
#define KOKKOS_NONLINEARFUNCTORS_HPP
#include <iostream>
#include <fstream>
#include <iomanip>
#include <cstdlib>
#include <cmath>
namespace HybridFEM {
namespace Nonlinear {
template< class MeshType , typename ScalarType > struct ElementComputation ;
template< class MeshType , typename ScalarType > struct DirichletSolution ;
template< class MeshType , typename ScalarType > struct DirichletResidual ;
}
}
/* A Cuda-specific specialization for the element computation functor. */
#if defined( __CUDACC__ )
#include <NonlinearElement_Cuda.hpp>
#endif
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace HybridFEM {
namespace Nonlinear {
// Generic (non-Cuda-specialized) nonlinear thermal element computation:
// for each element, gather nodal coordinates and solution values, integrate
// the residual vector and jacobian matrix over the quadrature points, and
// scatter them to per-element output views.  Dispatched from the
// constructor via parallel_for, one work-item per element.
template< typename ScalarCoordType , unsigned ElemNode , class DeviceType ,
          typename ScalarType >
struct ElementComputation<
  FEMesh< ScalarCoordType , ElemNode , DeviceType > , ScalarType >
{
  typedef DeviceType  execution_space;
  typedef ScalarType  scalar_type ;

  static const unsigned ElementNodeCount = ElemNode ;

  typedef FEMesh< ScalarCoordType , ElementNodeCount , execution_space > mesh_type ;
  typedef HexElement_Data< ElementNodeCount > element_data_type ;

  // Sizes provided by the master-element data:
  static const unsigned SpatialDim       = element_data_type::spatial_dimension ;
  static const unsigned FunctionCount    = element_data_type::function_count ;
  static const unsigned IntegrationCount = element_data_type::integration_count ;
  static const unsigned TensorDim        = SpatialDim * SpatialDim ;

  // Outputs: one dense jacobian matrix and one residual vector per element.
  typedef Kokkos::View< scalar_type[][FunctionCount][FunctionCount] , execution_space > elem_matrices_type ;
  typedef Kokkos::View< scalar_type[][FunctionCount] , execution_space > elem_vectors_type ;
  typedef Kokkos::View< scalar_type[] , execution_space > value_vector_type ;

private:

  const element_data_type                       elem_data ;        // bases, gradients, weights
  typename mesh_type::elem_node_ids_type        elem_node_ids ;    // element -> node connectivity
  typename mesh_type::node_coords_type          node_coords ;      // nodal coordinates
  value_vector_type                             nodal_values ;     // current nodal solution
  elem_matrices_type                            element_matrices ; // output element jacobians
  elem_vectors_type                             element_vectors ;  // output element residuals
  scalar_type                                   coeff_K ;          // diffusion coefficient 'k'

public:

  // Captures mesh / solution / output views and immediately dispatches
  // a parallel_for over all elements (no deferred launch).
  ElementComputation( const mesh_type          & arg_mesh ,
                      const elem_matrices_type & arg_element_matrices ,
                      const elem_vectors_type  & arg_element_vectors ,
                      const value_vector_type  & arg_nodal_values ,
                      const scalar_type  arg_coeff_K )
  : elem_data()
  , elem_node_ids( arg_mesh.elem_node_ids )
  , node_coords(   arg_mesh.node_coords )
  , nodal_values(  arg_nodal_values )
  , element_matrices( arg_element_matrices )
  , element_vectors(  arg_element_vectors )
  , coeff_K( arg_coeff_K )
  {
    const size_t elem_count = arg_mesh.elem_node_ids.dimension_0();

    parallel_for( elem_count , *this );
  }

  //------------------------------------

  // Floating-point operation count of transform_gradients (for reporting).
  static const unsigned FLOPS_transform_gradients =
    /* Jacobian */           FunctionCount * TensorDim * 2 +
    /* Inverse jacobian */   TensorDim * 6 + 6 +
    /* Gradient transform */ FunctionCount * 15 ;

  // Build the element jacobian from the master-element basis gradients and
  // the nodal coordinates, invert it, and transform the gradients to
  // physical space (outputs dpsidx/dpsidy/dpsidz).  Returns detJ.
  // Mixed float/double precision is deliberate: coordinates and jacobian
  // accumulation in double, stored gradients in float.
  KOKKOS_INLINE_FUNCTION
  float transform_gradients(
    const float grad[][ FunctionCount ] , // Gradient of bases master element
    const double x[] ,
    const double y[] ,
    const double z[] ,
    float dpsidx[] ,
    float dpsidy[] ,
    float dpsidz[] ) const
  {
    // Flat indexing of the 3x3 jacobian stored row-major:
    enum { j11 = 0 , j12 = 1 , j13 = 2 ,
           j21 = 3 , j22 = 4 , j23 = 5 ,
           j31 = 6 , j32 = 7 , j33 = 8 };

    // Jacobian accumulation:

    double J[ TensorDim ] = { 0, 0, 0, 0, 0, 0, 0, 0, 0 };

    for( unsigned i = 0; i < FunctionCount ; ++i ) {
      const double x1 = x[i] ;
      const double x2 = y[i] ;
      const double x3 = z[i] ;

      const float g1 = grad[0][i] ;
      const float g2 = grad[1][i] ;
      const float g3 = grad[2][i] ;

      J[j11] += g1 * x1 ;
      J[j12] += g1 * x2 ;
      J[j13] += g1 * x3 ;

      J[j21] += g2 * x1 ;
      J[j22] += g2 * x2 ;
      J[j23] += g2 * x3 ;

      J[j31] += g3 * x1 ;
      J[j32] += g3 * x2 ;
      J[j33] += g3 * x3 ;
    }

    // Inverse jacobian:
    // cofactor (adjugate) entries, scaled by 1/detJ below.

    float invJ[ TensorDim ] = {
      static_cast<float>( J[j22] * J[j33] - J[j23] * J[j32] ) ,
      static_cast<float>( J[j13] * J[j32] - J[j12] * J[j33] ) ,
      static_cast<float>( J[j12] * J[j23] - J[j13] * J[j22] ) ,

      static_cast<float>( J[j23] * J[j31] - J[j21] * J[j33] ) ,
      static_cast<float>( J[j11] * J[j33] - J[j13] * J[j31] ) ,
      static_cast<float>( J[j13] * J[j21] - J[j11] * J[j23] ) ,

      static_cast<float>( J[j21] * J[j32] - J[j22] * J[j31] ) ,
      static_cast<float>( J[j12] * J[j31] - J[j11] * J[j32] ) ,
      static_cast<float>( J[j11] * J[j22] - J[j12] * J[j21] ) };

    // Expansion of detJ along the first column:
    const float detJ = J[j11] * invJ[j11] +
                       J[j21] * invJ[j12] +
                       J[j31] * invJ[j13] ;

    const float detJinv = 1.0 / detJ ;

    for ( unsigned i = 0 ; i < TensorDim ; ++i ) { invJ[i] *= detJinv ; }

    // Transform gradients:

    for( unsigned i = 0; i < FunctionCount ; ++i ) {
      const float g0 = grad[0][i];
      const float g1 = grad[1][i];
      const float g2 = grad[2][i];

      dpsidx[i] = g0 * invJ[j11] + g1 * invJ[j12] + g2 * invJ[j13];
      dpsidy[i] = g0 * invJ[j21] + g1 * invJ[j22] + g2 * invJ[j23];
      dpsidz[i] = g0 * invJ[j31] + g1 * invJ[j32] + g2 * invJ[j33];
    }

    return detJ ;
  }

  // Accumulate this integration point's contribution into the element
  // residual 'elem_res' and jacobian 'elem_mat' (both accumulated across
  // all integration points by the caller).
  KOKKOS_INLINE_FUNCTION
  void contributeResidualJacobian(
    const float coeff_k ,
    const double dof_values[] ,
    const float dpsidx[] ,
    const float dpsidy[] ,
    const float dpsidz[] ,
    const float detJ ,
    const float integ_weight ,
    const float bases_vals[] ,
    double elem_res[] ,
    double elem_mat[][ FunctionCount ] ) const
  {
    // Interpolate solution value and gradient at the integration point:
    double value_at_pt = 0 ;
    double gradx_at_pt = 0 ;
    double grady_at_pt = 0 ;
    double gradz_at_pt = 0 ;

    for ( unsigned m = 0 ; m < FunctionCount ; m++ ) {
      value_at_pt += dof_values[m] * bases_vals[m] ;
      gradx_at_pt += dof_values[m] * dpsidx[m] ;
      grady_at_pt += dof_values[m] * dpsidy[m] ;
      gradz_at_pt += dof_values[m] * dpsidz[m] ;
    }

    const scalar_type k_detJ_weight = coeff_k        * detJ * integ_weight ;
    const double res_val = value_at_pt * value_at_pt * detJ * integ_weight ;
    const double mat_val = 2.0 * value_at_pt         * detJ * integ_weight ;

    //  $$ R_i = \int_{\Omega} \nabla \phi_i \cdot (k \nabla T) + \phi_i T^2 d \Omega $$
    //  $$ J_{i,j} = \frac{\partial R_i}{\partial T_j} = \int_{\Omega} k \nabla \phi_i \cdot \nabla \phi_j + 2 \phi_i \phi_j T d \Omega $$

    for ( unsigned m = 0; m < FunctionCount; m++) {
      double * const mat = elem_mat[m] ;
      const float bases_val_m = bases_vals[m];
      const float dpsidx_m    = dpsidx[m] ;
      const float dpsidy_m    = dpsidy[m] ;
      const float dpsidz_m    = dpsidz[m] ;

      elem_res[m] += k_detJ_weight * ( dpsidx_m * gradx_at_pt +
                                       dpsidy_m * grady_at_pt +
                                       dpsidz_m * gradz_at_pt ) +
                     res_val * bases_val_m ;

      for( unsigned n = 0; n < FunctionCount; n++) {

        mat[n] += k_detJ_weight * ( dpsidx_m * dpsidx[n] +
                                    dpsidy_m * dpsidy[n] +
                                    dpsidz_m * dpsidz[n] ) +
                  mat_val * bases_val_m * bases_vals[n];
      }
    }
  }

  // Per-element work-item: gather, integrate, scatter.
  KOKKOS_INLINE_FUNCTION
  void operator()( const unsigned ielem ) const
  {
    // Gather nodal coordinates and solution vector:

    double x[ FunctionCount ] ;
    double y[ FunctionCount ] ;
    double z[ FunctionCount ] ;
    double val[ FunctionCount ] ;

    for ( unsigned i = 0 ; i < ElementNodeCount ; ++i ) {
      const unsigned node_index = elem_node_ids( ielem , i );

      x[i] = node_coords( node_index , 0 );
      y[i] = node_coords( node_index , 1 );
      z[i] = node_coords( node_index , 2 );

      val[i] = nodal_values( node_index );
    }

    // Local accumulators, zero-initialized before quadrature:
    double elem_vec[ FunctionCount ] ;
    double elem_mat[ FunctionCount ][ FunctionCount ] ;

    for( unsigned i = 0; i < FunctionCount ; i++ ) {
      elem_vec[i] = 0 ;
      for( unsigned j = 0; j < FunctionCount ; j++){
        elem_mat[i][j] = 0 ;
      }
    }

    // Quadrature loop:
    for ( unsigned i = 0 ; i < IntegrationCount ; ++i ) {
      float dpsidx[ FunctionCount ] ;
      float dpsidy[ FunctionCount ] ;
      float dpsidz[ FunctionCount ] ;

      const float detJ =
        transform_gradients( elem_data.gradients[i] , x , y , z ,
                             dpsidx , dpsidy , dpsidz );

      contributeResidualJacobian( coeff_K ,
                                  val , dpsidx , dpsidy , dpsidz ,
                                  detJ ,
                                  elem_data.weights[i] ,
                                  elem_data.values[i] ,
                                  elem_vec , elem_mat );
    }

    // Scatter to the per-element output views:
    for( unsigned i = 0; i < FunctionCount ; i++){
      element_vectors(ielem, i) = elem_vec[i] ;
      for( unsigned j = 0; j < FunctionCount ; j++){
        element_matrices(ielem, i, j) = elem_mat[i][j] ;
      }
    }
  }

}; /* ElementComputation */
//----------------------------------------------------------------------------
// Imposes Dirichlet boundary values on the solution vector: every node on
// the lower-Z face (z <= bc_lower_z) receives bc_lower_value, every node on
// the upper-Z face (bc_upper_z <= z) receives bc_upper_value, and all other
// nodes are left untouched.  Dispatched over all nodes via parallel_for.
template< typename ScalarCoordType , unsigned ElemNode , class DeviceType ,
          typename ScalarType >
struct DirichletSolution<
  FEMesh< ScalarCoordType , ElemNode , DeviceType > ,
  ScalarType >
{
  typedef DeviceType execution_space;

  static const unsigned ElementNodeCount = ElemNode ;

  typedef Kokkos::View< ScalarType[] , execution_space > vector_type ;

  typedef FEMesh< ScalarCoordType , ElementNodeCount , execution_space > mesh_type ;

  typename mesh_type::node_coords_type node_coords ;
  vector_type     solution ;
  ScalarCoordType bc_lower_z ;
  ScalarCoordType bc_upper_z ;
  ScalarType      bc_lower_value ;
  ScalarType      bc_upper_value ;

  // Per-node work-item: overwrite the solution only on the two Z faces.
  KOKKOS_INLINE_FUNCTION
  void operator()( const unsigned inode ) const
  {
    const ScalarCoordType z_node = node_coords(inode,2);

    const bool on_lower_face = ( z_node <= bc_lower_z );
    const bool on_upper_face = ( bc_upper_z <= z_node );

    // Interior node: leave the current solution value in place.
    if ( ! on_lower_face && ! on_upper_face ) return ;

    // Lower face takes precedence should both tests ever hold.
    if ( on_lower_face ) {
      solution(inode) = bc_lower_value ;
    }
    else {
      solution(inode) = bc_upper_value ;
    }
  }

  // Populate a functor with the boundary description and run it over
  // every entry of 'solution'.
  static void apply( const vector_type & solution ,
                     const mesh_type & mesh ,
                     const ScalarCoordType  bc_lower_z ,
                     const ScalarCoordType  bc_upper_z ,
                     const ScalarType  bc_lower_value ,
                     const ScalarType  bc_upper_value )
  {
    DirichletSolution op ;

    op.solution       = solution ;
    op.node_coords    = mesh.node_coords ;
    op.bc_lower_value = bc_lower_value ;
    op.bc_upper_value = bc_upper_value ;
    op.bc_lower_z     = bc_lower_z ;
    op.bc_upper_z     = bc_upper_z ;

    parallel_for( solution.dimension_0() , op );
  }
};
//----------------------------------------------------------------------------
// Apply Dirichlet boundary conditions along the lower and upper z faces
// of the mesh to an assembled linear system.  For a boundary row the
// rhs entry is zeroed and the row is replaced by a row of the identity;
// for an interior row, coefficients in columns belonging to boundary
// nodes are zeroed.  Together this keeps the matrix symmetric.
template< typename ScalarCoordType , unsigned ElemNode , class DeviceType ,
typename ScalarType >
struct DirichletResidual<
FEMesh< ScalarCoordType , ElemNode , DeviceType > , ScalarType >
{
typedef DeviceType execution_space;
typedef typename execution_space::size_type size_type ;
static const unsigned ElementNodeCount = ElemNode ;
typedef Kokkos::CrsMatrix< ScalarType , execution_space > matrix_type ;
typedef Kokkos::View< ScalarType[] , execution_space > vector_type ;
typedef FEMesh< ScalarCoordType , ElementNodeCount , execution_space > mesh_type ;
// Node coordinates; a node is a boundary node when its z coordinate
// satisfies z <= bc_lower_z or bc_upper_z <= z.
typename mesh_type::node_coords_type node_coords ;
matrix_type matrix ; // assembled CRS matrix of the linear system
vector_type rhs ; // residual / right-hand-side vector
ScalarCoordType bc_lower_z ;
ScalarCoordType bc_upper_z ;
// Process one matrix row; 'inode' is both the row index and the
// index of the node the row belongs to.
KOKKOS_INLINE_FUNCTION
void operator()( const unsigned inode ) const
{
// Apply a dirichlet boundary condition to 'irow'
// to maintain the symmetry of the original
// global stiffness matrix, zero out the columns
// that correspond to boundary conditions, and
// adjust the load vector accordingly
const size_type iBeg = matrix.graph.row_map[inode];
const size_type iEnd = matrix.graph.row_map[inode+1];
const ScalarCoordType z = node_coords(inode,2);
const bool bc_lower = z <= bc_lower_z ;
const bool bc_upper = bc_upper_z <= z ;
if ( bc_lower || bc_upper ) {
rhs(inode) = 0 ; // set the residual vector
// zero each value on the row, and leave a one
// on the diagonal
for( size_type i = iBeg ; i < iEnd ; i++) {
matrix.coefficients(i) =
(int) inode == matrix.graph.entries(i) ? 1 : 0 ;
}
}
else {
// Find any columns that are boundary conditions.
// Clear them and adjust the load vector
for( size_type i = iBeg ; i < iEnd ; i++ ) {
const size_type cnode = matrix.graph.entries(i) ;
const ScalarCoordType zc = node_coords(cnode,2);
const bool c_bc_lower = zc <= bc_lower_z ;
const bool c_bc_upper = bc_upper_z <= zc ;
if ( c_bc_lower || c_bc_upper ) {
matrix.coefficients(i) = 0 ;
}
}
}
}
// Populate a functor and run it in parallel over every row of the
// matrix.  The row count is recovered from the CRS row map, whose
// length is (number of rows + 1).
static void apply( const matrix_type & linsys_matrix ,
const vector_type & linsys_rhs ,
const mesh_type & mesh ,
const ScalarCoordType bc_lower_z ,
const ScalarCoordType bc_upper_z)
{
const size_t row_count = linsys_matrix.graph.row_map.dimension_0() - 1 ;
DirichletResidual op ;
op.node_coords = mesh.node_coords ;
op.matrix = linsys_matrix ;
op.rhs = linsys_rhs ;
op.bc_lower_z = bc_lower_z ;
op.bc_upper_z = bc_upper_z ;
parallel_for( row_count , op );
}
};
//----------------------------------------------------------------------------
} /* namespace Nonlinear */
} /* namespace HybridFEM */
#endif /* #ifndef KOKKOS_NONLINEARFUNCTORS_HPP */

View File

@ -1,167 +0,0 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef PARALLELCOMM_HPP
#define PARALLELCOMM_HPP
//------------------------------------------------------------------------
#include <Kokkos_Macros.hpp>
//------------------------------------------------------------------------
#if defined( KOKKOS_HAVE_MPI )
#include <mpi.h>
#include <string>
namespace comm {

// Thin copyable wrapper around an MPI communicator.
struct Machine {
  MPI_Comm mpi_comm ;

  Machine() : mpi_comm( MPI_COMM_NULL ) {}

  Machine( const Machine & rhs )
    : mpi_comm( rhs.mpi_comm ) {}

  Machine( MPI_Comm c ) : mpi_comm( c ) {}

  // Initialize MPI and return a Machine wrapping MPI_COMM_WORLD.
  static Machine init( int * argc , char *** argv )
  {
    MPI_Init( argc , argv );
    return Machine( MPI_COMM_WORLD );
  }

  static void finalize() { MPI_Finalize(); }
};

// Number of processes in the machine's communicator.
inline
unsigned size( Machine machine )
{
  int np ; MPI_Comm_size( machine.mpi_comm , & np ); return np ;
}

// Rank of this process within the machine's communicator.
inline
unsigned rank( Machine machine )
{
  int ip ; MPI_Comm_rank( machine.mpi_comm , & ip ); return ip ;
}

// Global maximum of 'local' over all processes (MPI_Allreduce).
inline
double max( Machine machine , double local )
{
  double global = 0;
  MPI_Allreduce( & local , & global , 1 , MPI_DOUBLE , MPI_MAX , machine.mpi_comm );
  return global ;
}

// Concatenate the command line arguments (after argv[0]) on rank 0
// into a single space-prefixed string, then broadcast it so every
// process returns the same value.
inline
std::string command_line( Machine machine , const int argc , const char * const * const argv )
{
  std::string argline ;
  if ( 0 == rank( machine ) ) {
    for ( int i = 1 ; i < argc ; ++i ) {
      argline.append(" ").append( argv[i] );
    }
  }
  // Broadcast the length first so receivers can size their buffers.
  // Explicit cast: std::string::length() is size_t, MPI wants int.
  int length = static_cast<int>( argline.length() );
  MPI_Bcast( & length , 1 , MPI_INT , 0 , machine.mpi_comm );
  argline.resize( length , ' ' );
  // Broadcast into the string's own storage.  Use &argline[0] rather
  // than casting away the const of data(): writing through data() is
  // undefined behavior prior to C++17.  All ranks share 'length', so
  // skipping the zero-length broadcast is consistent everywhere.
  if ( length ) {
    MPI_Bcast( & argline[0] , length , MPI_CHAR , 0 , machine.mpi_comm );
  }
  return argline ;
}

}
#else /* ! defined( KOKKOS_HAVE_MPI ) */
#include <string>
namespace comm {

// Serial stand-in for the MPI-based Machine: a "machine" consisting
// of exactly one process, with the same interface as the MPI version.
struct Machine {
  static Machine init( int * , char *** ) { return Machine(); }
  static void finalize() {}
};

// A serial machine always has exactly one process, rank zero.
inline unsigned size( Machine ) { return 1 ; }

inline unsigned rank( Machine ) { return 0 ; }

// With a single process the local value is already the global maximum.
inline double max( Machine , double local ) { return local ; }

// Concatenate the command line arguments (after argv[0]) into a single
// string, each preceded by a space.  No broadcast is needed serially.
inline
std::string command_line( Machine machine , const int argc , const char * const * const argv )
{
  std::string argline ;
  if ( rank( machine ) == 0 ) {
    for ( int i = 1 ; i < argc ; ++i ) {
      argline += ' ' ;
      argline += argv[i] ;
    }
  }
  return argline ;
}

}
#endif /* ! defined( KOKKOS_HAVE_MPI ) */
//------------------------------------------------------------------------
#endif /* #ifndef PARALLELCOMM_HPP */

View File

@ -1,517 +0,0 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_PARALLELDATAMAP_HPP
#define KOKKOS_PARALLELDATAMAP_HPP
#include <utility>
#include <limits>
#include <iostream>
#include <sstream>
#include <stdexcept>
#include <Kokkos_Core.hpp>
#include <ParallelComm.hpp>
namespace Kokkos {
//----------------------------------------------------------------------------
/** \brief Parallel distributed data mapping
*
* ordering { interior : { owned items not sent elsewhere }
* send : { owned items sent }
* receive : { not-owned items received } }
*
* recv { { N ghosted items from process P : ( P , N ) } }
*
* send { { N send items to process P : ( P , N ) } }
*
* send_item { send item offsets within 'send' range }
*/
struct ParallelDataMap {
typedef View< unsigned*[2], HostSpace > host_recv_type ;
typedef View< unsigned*[2], HostSpace > host_send_type ;
typedef View< unsigned* , HostSpace > host_send_item_type ;
comm::Machine machine ; // communicator wrapper for this process
host_recv_type host_recv ; // per message: ( sending process P , item count N )
host_send_type host_send ; // per message: ( receiving process P , item count N )
host_send_item_type host_send_item ; // offsets of owned items within the 'send' range
unsigned count_interior ; // owned items not sent to any other process
unsigned count_send ; // owned items sent to other processes
unsigned count_owned ; // = count_interior + count_send
unsigned count_receive ; // not-owned (ghosted) items received
// Record the partition counts and (re)allocate the host-side message
// maps.  count_send and count_receive are derived:
//   count_send    = arg_count_owned - arg_count_interior
//   count_receive = arg_count_total - arg_count_owned
void assign( const unsigned arg_count_interior ,
const unsigned arg_count_owned ,
const unsigned arg_count_total ,
const unsigned arg_recv_msg ,
const unsigned arg_send_msg ,
const unsigned arg_send_count )
{
const std::string label("Kokkos::ParallelDataMap buffer");
count_interior = arg_count_interior ;
count_owned = arg_count_owned ;
count_send = arg_count_owned - arg_count_interior ;
count_receive = arg_count_total - arg_count_owned ;
host_recv = host_recv_type( label , arg_recv_msg );
host_send = host_send_type( label , arg_send_msg );
host_send_item = host_send_item_type( label , arg_send_count );
}
};
//----------------------------------------------------------------------------
//PackArray
//----------------------------------------------------------------------------
template< class ArrayType , class Rank = void >
struct PackArray ;
template< typename DeviceType, typename ValueType >
struct PackArray< View< ValueType* , DeviceType > , void >
{
  typedef DeviceType execution_space ;
  typedef typename DeviceType::size_type size_type ;
  typedef View< ValueType* , execution_space > array_type ;
  typedef View< ValueType* , execution_space > buffer_type ;

private:

  buffer_type output ;
  array_type  input ;
  size_type   base ;

public:

  // Copy one entry from the source array (offset by 'base') into the
  // contiguous output buffer.
  KOKKOS_INLINE_FUNCTION
  void operator()( const size_type i ) const
  { output[i] = input(base+i); }

  // Pack arg_input[ arg_begin .. arg_begin + arg_count ) into
  // arg_output[ 0 .. arg_count ).
  inline
  static
  void pack( const buffer_type & arg_output ,
             const size_type arg_begin ,
             const size_type arg_count ,
             const array_type & arg_input )
  {
    // Guard against an empty range, consistent with the rank-two
    // specialization: no functor construction or dispatch needed.
    if ( arg_count ) {
      PackArray op ;
      op.output = arg_output ;
      op.input  = arg_input ;
      op.base   = arg_begin ;
      parallel_for( arg_count , op );
    }
  }
};
template< typename DeviceType, typename ValueType , unsigned N1 >
struct PackArray< View< ValueType*[N1] , DeviceType > , void >
{
  typedef DeviceType execution_space ;
  typedef typename DeviceType::size_type size_type ;
  typedef View< ValueType*[N1] , execution_space > array_type ;
  typedef View< ValueType* , execution_space > buffer_type ;

private:

  buffer_type output ;
  array_type  input ;
  size_type   base ;

public:

  // Copy the N1 components of source row 'base+i' into consecutive
  // slots of the output buffer beginning at i*N1.
  KOKKOS_INLINE_FUNCTION
  void operator()( const size_type i ) const
  {
    const size_type offset = i * N1 ;
    for ( size_type j = 0 ; j < N1 ; ++j ) {
      output[ offset + j ] = input( base + i , j );
    }
  }

  // Pack rows [ arg_begin .. arg_begin + arg_count ) of arg_input into
  // the contiguous buffer arg_output.  A no-op for an empty range.
  inline static
  void pack( const buffer_type & arg_output ,
             const size_type arg_begin ,
             const size_type arg_count ,
             const array_type & arg_input )
  {
    if ( 0 == arg_count ) return ;

    PackArray op ;
    op.output = arg_output ;
    op.input  = arg_input ;
    op.base   = arg_begin ;
    parallel_for( arg_count , op );
  }
};
//----------------------------------------------------------------------------
//UnpackArray
//----------------------------------------------------------------------------
template< class ArrayType , class Rank = void > struct UnpackArray ;
template< typename DeviceType, typename ValueType >
struct UnpackArray< View< ValueType* , DeviceType > , void >
{
  typedef DeviceType execution_space ;
  typedef typename DeviceType::size_type size_type ;
  typedef View< ValueType* , execution_space > array_type ;
  typedef View< ValueType* , execution_space > buffer_type ;

private:

  array_type  output ;
  buffer_type input ;
  size_type   base ;

public:

  // Copy one entry from the contiguous input buffer into the
  // destination array at offset 'base'.
  KOKKOS_INLINE_FUNCTION
  void operator()( const size_type i ) const
  { output(base+i) = input[i]; }

  // Unpack arg_input[ 0 .. arg_count ) into
  // arg_output[ arg_begin .. arg_begin + arg_count ).
  inline
  static
  void unpack( const array_type & arg_output ,
               const buffer_type & arg_input ,
               const size_type arg_begin ,
               const size_type arg_count )
  {
    // Guard against an empty range, consistent with the rank-two
    // specialization: no functor construction or dispatch needed.
    if ( arg_count ) {
      UnpackArray op ;
      op.output = arg_output ;
      op.input  = arg_input ;
      op.base   = arg_begin ;
      parallel_for( arg_count , op );
    }
  }
};
template< typename DeviceType, typename ValueType , unsigned N1 >
struct UnpackArray< View< ValueType*[N1] , DeviceType > , void >
{
  typedef DeviceType execution_space ;
  typedef typename DeviceType::size_type size_type ;
  typedef View< ValueType* , execution_space > buffer_type ;
  typedef View< ValueType*[N1] , execution_space > array_type ;

private:

  array_type  output ;
  buffer_type input ;
  size_type   base ;

public:

  // Scatter N1 consecutive buffer entries (beginning at i*N1) into the
  // components of destination row 'base+i'.
  KOKKOS_INLINE_FUNCTION
  void operator()( const size_type i ) const
  {
    const size_type offset = i * N1 ;
    for ( size_type j = 0 ; j < N1 ; ++j ) {
      output( base + i , j ) = input[ offset + j ];
    }
  }

  // Unpack rows [ arg_begin .. arg_begin + arg_count ) of arg_output
  // from the contiguous buffer arg_input.  A no-op for an empty range.
  inline
  static
  void unpack( const array_type & arg_output ,
               const buffer_type & arg_input ,
               const size_type arg_begin ,
               const size_type arg_count )
  {
    if ( 0 == arg_count ) return ;

    UnpackArray op ;
    op.output = arg_output ;
    op.input  = arg_input ;
    op.base   = arg_begin ;
    parallel_for( arg_count , op );
  }
};
//----------------------------------------------------------------------------
template< class ValueType , class Device , class DataMap >
class AsyncExchange ;
} // namespace Kokkos
//----------------------------------------------------------------------------
// Application call procedure:
//
// construct: AsyncExchange object
// * pack send buffer on device
// initiate: copy send buffer from device to host
// * dispatch asynchronous local work
// complete: send/receive on host, copy receive buffer to device
// * unpack receive buffer on device
// destroy: AsyncExchange object
//
//----------------------------------------------------------------------------
#ifdef KOKKOS_HAVE_MPI
namespace Kokkos {
// Asynchronous device <-> host <-> MPI exchange of ghosted data.
// Protocol (see the comment block above): construct once, then per
// exchange call setup() to post receives and stage the packed send
// data on the host, optionally dispatch local device work, and finally
// call send_receive() to complete communication and copy the received
// data back to the device.
template< class ValueType , class Device >
class AsyncExchange< ValueType, Device , Kokkos::ParallelDataMap > {
public:
typedef Device execution_space ;
typedef Kokkos::ParallelDataMap data_map_type ;
typedef Kokkos::View< ValueType* , execution_space > buffer_dev_type ;
typedef typename buffer_dev_type::HostMirror buffer_host_type ;
private:
static const int mpi_tag = 11 ; // tag shared by all exchange messages
const data_map_type data_map ;
unsigned chunk_size ; // ValueType entries per exchanged item
unsigned send_count_max ; // largest item count over all send messages
buffer_host_type host_recv_buffer ; // all receive messages, contiguous
buffer_host_type host_send_buffer ; // all send data, contiguous
buffer_host_type send_msg_buffer ; // staging area for one send message
buffer_dev_type dev_buffer ; // shared device buffer for send + receive
buffer_dev_type dev_send_buffer ; // Subview for send
buffer_dev_type dev_recv_buffer ; // Subview for receive
std::vector< MPI_Request > recv_request ; // one request per posted receive
public:
const buffer_dev_type & buffer() const { return dev_buffer ; }
// Allocate all host/device buffers and MPI request handles for
// exchanging items of 'arg_chunk' ValueType entries according to
// the message layout described by 'arg_data_map'.
AsyncExchange( const data_map_type & arg_data_map ,
const size_t arg_chunk )
: data_map( arg_data_map )
, chunk_size( arg_chunk )
, send_count_max( 0 )
, host_recv_buffer()
, host_send_buffer()
, send_msg_buffer()
, dev_buffer()
, dev_send_buffer()
, dev_recv_buffer()
, recv_request()
{
const size_t send_msg_count = arg_data_map.host_send.dimension_0();
const size_t recv_msg_count = arg_data_map.host_recv.dimension_0();
const size_t send_msg_length = arg_chunk * arg_data_map.count_send ;
const size_t recv_msg_length = arg_chunk * arg_data_map.count_receive ;
// Send messages are staged one at a time, so the staging buffer
// must hold the largest individual message.
for ( size_t i = 0 ; i < send_msg_count ; ++i ) {
send_count_max = std::max( send_count_max ,
(unsigned) arg_data_map.host_send(i,1) );
}
// A single shared buffer on the device can be used for
// send and receive message buffers.
dev_buffer = buffer_dev_type(
std::string("AsyncExchange dev_buffer") ,
std::max( send_msg_length , recv_msg_length ) );
// Total send subview of the device buffer
dev_send_buffer =
Kokkos::subview( dev_buffer , std::pair<size_t,size_t>( 0 , send_msg_length ) );
// Total receive subview of the device buffer
dev_recv_buffer =
Kokkos::subview( dev_buffer , std::pair<size_t,size_t>( 0 , recv_msg_length ) );
// Total receive message buffer on the host:
host_recv_buffer = buffer_host_type(
std::string("AsyncExchange host_recv_buffer") ,
recv_msg_length );
// Total send message buffer on the host:
host_send_buffer = buffer_host_type(
std::string("AsyncExchange host_send_buffer") ,
send_msg_length );
// Individual send message buffer on the host:
send_msg_buffer = buffer_host_type(
std::string("AsyncExchange send_msg_buffer") ,
arg_chunk * send_count_max );
// MPI asynchronous receive request handles:
recv_request.assign( recv_msg_count , MPI_REQUEST_NULL );
}
//------------------------------------------------------------------------
// Post asynchronous receives and copy the (already packed) send data
// from the device to the host.  After this returns the device is free
// for application work until send_receive() is called.
void setup()
{
{ // Post receives:
const size_t recv_msg_count = data_map.host_recv.dimension_0();
ValueType * ptr = host_recv_buffer.ptr_on_device();
// Each message lands in its own contiguous slice of the
// host receive buffer.
for ( size_t i = 0 ; i < recv_msg_count ; ++i ) {
const int proc = data_map.host_recv(i,0);
const int count = data_map.host_recv(i,1) * chunk_size ;
MPI_Irecv( ptr , count * sizeof(ValueType) , MPI_BYTE ,
proc , mpi_tag , data_map.machine.mpi_comm ,
& recv_request[i] );
ptr += count ;
}
}
// Copy send buffer from the device to host memory for sending
Kokkos::deep_copy( host_send_buffer , dev_send_buffer );
// Done with the device until communication is complete.
// Application can dispatch asynchronous work on the device.
}
// Application can dispatch local work to device ...
// No communication progress until main thread calls 'send_receive'
// Stage and send each outgoing message, wait for and verify all posted
// receives, then copy the received data to the device.  Throws
// std::runtime_error if a received message does not match the expected
// source process or byte size.
void send_receive()
{
const size_t recv_msg_count = data_map.host_recv.dimension_0();
const size_t send_msg_count = data_map.host_send.dimension_0();
// Pack and send:
for ( size_t i = 0 , j = 0 ; i < send_msg_count ; ++i ) {
const int proc = data_map.host_send(i,0);
const int count = data_map.host_send(i,1);
// Gather this message's items (chunk_size entries each) from the
// scattered offsets listed in host_send_item into the staging buffer.
for ( int k = 0 , km = 0 ; k < count ; ++k , ++j ) {
const int km_end = km + chunk_size ;
int ki = chunk_size * data_map.host_send_item(j);
for ( ; km < km_end ; ++km , ++ki ) {
send_msg_buffer[km] = host_send_buffer[ki];
}
}
// MPI_Ssend blocks until
// (1) a receive is matched for the message and
// (2) the send buffer can be re-used.
//
// It is suggested that MPI_Ssend will have the best performance:
// http://www.mcs.anl.gov/research/projects/mpi/sendmode.html .
MPI_Ssend( send_msg_buffer.ptr_on_device(),
count * chunk_size * sizeof(ValueType) , MPI_BYTE ,
proc , mpi_tag , data_map.machine.mpi_comm );
}
// Wait for receives and verify:
for ( size_t i = 0 ; i < recv_msg_count ; ++i ) {
MPI_Status recv_status ;
int recv_which = 0 ;
int recv_size = 0 ;
MPI_Waitany( recv_msg_count , & recv_request[0] ,
& recv_which , & recv_status );
const int recv_proc = recv_status.MPI_SOURCE ;
MPI_Get_count( & recv_status , MPI_BYTE , & recv_size );
// Verify message properly received:
const int expected_proc = data_map.host_recv(recv_which,0);
const int expected_size = data_map.host_recv(recv_which,1) *
chunk_size * sizeof(ValueType);
if ( ( expected_proc != recv_proc ) ||
( expected_size != recv_size ) ) {
std::ostringstream msg ;
msg << "AsyncExchange error:"
<< " P" << comm::rank( data_map.machine )
<< " received from P" << recv_proc
<< " size " << recv_size
<< " expected " << expected_size
<< " from P" << expected_proc ;
throw std::runtime_error( msg.str() );
}
}
// Copy received data to device memory.
Kokkos::deep_copy( dev_recv_buffer , host_recv_buffer );
}
};
} // namespace Kokkos
#else /* ! #ifdef KOKKOS_HAVE_MPI */
namespace Kokkos {
// Serial (non-MPI) stand-in for AsyncExchange: with a single process
// there is nothing to exchange, so setup() and send_receive() are
// no-ops and the device buffer is left default-constructed.
template< class ValueType , class Device >
class AsyncExchange< ValueType, Device , Kokkos::ParallelDataMap > {
public:
typedef Device execution_space ;
typedef Kokkos::ParallelDataMap data_map_type ;
typedef Kokkos::View< ValueType* , execution_space > buffer_dev_type ;
typedef typename buffer_dev_type::HostMirror buffer_host_type ;
buffer_dev_type dev_buffer ;
public:
const buffer_dev_type & buffer() const { return dev_buffer ; }
// Arguments are accepted (and ignored) to match the MPI interface.
AsyncExchange( const data_map_type & , const size_t )
: dev_buffer()
{ }
//------------------------------------------------------------------------
void setup() { }
void send_receive() { }
};
} // namespace Kokkos
#endif /* ! #ifdef KOKKOS_HAVE_MPI */
//----------------------------------------------------------------------------
#endif /* #ifndef KOKKOS_PARALLELDATAMAP_HPP */

View File

@ -1,178 +0,0 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#if 0
#include <stdlib.h>
#include <string.h>
#include <ParallelMachine.hpp>
#include <Kokkos_Core.hpp>
#if ! defined( KOKKOS_HAVE_MPI )
#define MPI_COMM_NULL 0
#endif
//------------------------------------------------------------------------
namespace Parallel {
// Coordinated initialization of MPI and the Kokkos devices based on
// command line arguments ("mpi_cuda", "cuda", "host [#gang #worker]").
// Ordering is deliberate: a CUDA-aware MPI ("mpi_cuda") requires the
// Cuda device to be selected before MPI_Init, while hwloc is queried
// only after MPI initialization (see inline comments).
Machine::Machine( int * argc , char *** argv )
: m_mpi_comm( MPI_COMM_NULL )
, m_mpi_size(0)
, m_mpi_rank(0)
, m_mpi_gpu(0)
{
#if defined( KOKKOS_HAVE_CUDA )
//------------------------------------
// Might be using a Cuda aware version of MPI.
// Must select Cuda device before initializing MPI.
{
// Scan argv for the "mpi_cuda" argument.
int i = 1 ;
for ( ; i < *argc && strcmp((*argv)[i],"mpi_cuda") ; ++i );
if ( i < *argc ) {
// Determine, if possible, what will be the node-local
// rank of the MPI process once MPI has been initialized.
// This rank is needed to set the Cuda device before 'mvapich'
// is initialized.
const char * const mvapich_local_rank = getenv("MV2_COMM_WORLD_LOCAL_RANK");
const char * const slurm_local_rank = getenv("SLURM_LOCALID");
const int pre_mpi_local_rank =
0 != mvapich_local_rank ? atoi( mvapich_local_rank ) : (
0 != slurm_local_rank ? atoi( slurm_local_rank ) : (
-1 ) );
if ( 0 <= pre_mpi_local_rank ) {
// Round-robin local MPI ranks over the available devices.
const int ngpu = Kokkos::Cuda::detect_device_count();
const int cuda_device_rank = pre_mpi_local_rank % ngpu ;
Kokkos::Cuda::initialize( Kokkos::Cuda::SelectDevice( cuda_device_rank ) );
m_mpi_gpu = 1 ;
}
}
}
#endif
//------------------------------------
#if defined( KOKKOS_HAVE_MPI )
MPI_Init( argc , argv );
m_mpi_comm = MPI_COMM_WORLD ;
MPI_Comm_size( m_mpi_comm , & m_mpi_size );
MPI_Comm_rank( m_mpi_comm , & m_mpi_rank );
#endif
// Query hwloc after MPI initialization to allow MPI binding:
//------------------------------------
// Request to use host device:
{
// Scan argv for the "host" argument; optional gang/worker counts
// may follow it, otherwise hwloc-detected topology is used.
int i = 1 ;
for ( ; i < *argc && strcmp((*argv)[i],"host") ; ++i );
if ( i < *argc ) {
unsigned team_count = Kokkos::hwloc::get_available_numa_count();
unsigned threads_per_team = Kokkos::hwloc::get_available_cores_per_numa() *
Kokkos::hwloc::get_available_threads_per_core();
if ( i + 2 < *argc ) {
team_count = atoi( (*argv)[i+1] );
threads_per_team = atoi( (*argv)[i+2] );
}
Kokkos::Threads::initialize( team_count * threads_per_team );
}
}
#if defined( KOKKOS_HAVE_CUDA )
//------------------------------------
// Request to use Cuda device and not already initialized.
if ( ! m_mpi_gpu ) {
int i = 1 ;
for ( ; i < *argc && strcmp((*argv)[i],"mpi_cuda") && strcmp((*argv)[i],"cuda") ; ++i );
if ( i < *argc ) {
// Round-robin global MPI ranks over the available devices.
const int ngpu = Kokkos::Cuda::detect_device_count();
const int cuda_device_rank = m_mpi_rank % ngpu ;
Kokkos::Cuda::initialize( Kokkos::Cuda::SelectDevice( cuda_device_rank ) );
}
}
#endif
}
// Shut down the Kokkos devices and, when enabled, finalize MPI last.
Machine::~Machine()
{
Kokkos::Threads::finalize();
#if defined( KOKKOS_HAVE_CUDA )
Kokkos::Cuda::finalize();
#endif
#if defined( KOKKOS_HAVE_MPI )
MPI_Finalize();
#endif
}
// Write this process's MPI rank/size and the configuration of each
// enabled Kokkos device to the given output stream.
void Machine::print_configuration( std::ostream & msg ) const
{
msg << "MPI [ " << m_mpi_rank << " / " << m_mpi_size << " ]" << std::endl ;
Kokkos::Threads::print_configuration( msg );
#if defined( KOKKOS_HAVE_CUDA )
Kokkos::Cuda::print_configuration( msg );
#endif
}
}
#endif /* #if 0 */

View File

@ -1,118 +0,0 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#error "ParallelMachine"
#ifndef PARALLELMACHINE_HPP
#define PARALLELMACHINE_HPP
//------------------------------------------------------------------------
#include <iosfwd>
#include <Kokkos_Core.hpp>
//------------------------------------------------------------------------
#if defined( KOKKOS_HAVE_MPI )
#include <mpi.h>
#else
typedef int MPI_Comm ;
#endif
//------------------------------------------------------------------------
//------------------------------------------------------------------------
namespace Parallel {
/** \brief Hybrid parallel machine with MPI+Kokkos::Threads or MPI+Kokkos::Cuda.
*
* Initialization of MPI and Kokkos device has interdependencies which this
* class manages. The command line and environment variables are queried to initialize
* the Threads or Cuda device:
*
* 1) cuda : initializes Cuda device
* 2) host : initializes Threads device with all hwloc detected cores.
* 3) host #gang #worker : initializes Threads with specified
*/
class Machine {
private:
MPI_Comm m_mpi_comm ; // MPI communicator (stub int when MPI disabled)
int m_mpi_size ; // number of MPI processes
int m_mpi_rank ; // rank of this process
unsigned m_mpi_gpu ; // nonzero if MPI can operate on GPU memory
unsigned m_gpu_arch ; // GPU architecture identifier, if a GPU is used
// Non-default-constructible and non-copyable: a Machine owns the
// process-wide MPI / device initialization.
Machine();
Machine( const Machine & );
Machine & operator = ( const Machine & );
public:
/** \brief Coordinated initialize MPI, Cuda, or Threads devices from 'main'. */
Machine( int * argc , char *** argv );
~Machine();
MPI_Comm mpi_comm() const { return m_mpi_comm ; }
int mpi_size() const { return m_mpi_size ; }
int mpi_rank() const { return m_mpi_rank ; }
/** \brief If using MPI that can directly operate on GPU memory */
bool mpi_gpu() const { return m_mpi_gpu ; }
/** \brief If using GPU then what architecture */
unsigned gpu_arch() const { return m_gpu_arch ; }
void print_configuration( std::ostream & ) const ;
};
}
//------------------------------------------------------------------------
#endif /* #ifndef PARALLELMACHINE_HPP */

View File

@ -1,400 +0,0 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef SPARSELINEARSYSTEM_HPP
#define SPARSELINEARSYSTEM_HPP
#include <cmath>
#include <limits>
#include <impl/Kokkos_Timer.hpp>
#include <Kokkos_Core.hpp>
#include <Kokkos_StaticCrsGraph.hpp>
#include <LinAlgBLAS.hpp>
namespace Kokkos {
// Compressed-row-storage (CRS) sparse matrix over a Kokkos execution space.
// The sparsity pattern (row offsets + column indices) lives in 'graph';
// the nonzero values live in 'coefficients', indexed 1:1 with graph.entries.
template< typename ScalarType , class Device >
struct CrsMatrix {
typedef Device execution_space ;
typedef ScalarType value_type ;
// 'int' ordinal type for both row map and entries.
typedef StaticCrsGraph< int , execution_space , void , int > graph_type ;
// One coefficient per graph entry.
typedef View< value_type* , execution_space > coefficients_type ;
graph_type graph ;
coefficients_type coefficients ;
};
//----------------------------------------------------------------------------
namespace Impl {
// Primary template for sparse matrix-vector multiply, y = A * x.
// Specialized below for host devices and (under CUDA) for cuSPARSE.
template< class Matrix , class OutputVector , class InputVector >
struct Multiply ;
}
}
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
// Host-device specialization of sparse matrix-vector multiply:
// y = A * x, one parallel work item per matrix row.
template< typename AScalarType ,
typename VScalarType ,
class DeviceType >
struct Multiply< CrsMatrix<AScalarType,DeviceType> ,
View<VScalarType*,DeviceType > ,
View<VScalarType*,DeviceType > >
{
typedef DeviceType execution_space ;
typedef typename execution_space::size_type size_type ;
typedef View< VScalarType*, execution_space, MemoryUnmanaged > vector_type ;
typedef View< const VScalarType*, execution_space, MemoryUnmanaged > vector_const_type ;
typedef CrsMatrix< AScalarType , execution_space > matrix_type ;
private:
matrix_type m_A ;
vector_const_type m_x ;
vector_type m_y ;
public:
//--------------------------------------------------------------------------
// Compute one row of the product: y(iRow) = sum_j A(iRow,j) * x(j).
// The accumulator is 'double' regardless of the scalar types to reduce
// round-off in the row sum.
KOKKOS_INLINE_FUNCTION
void operator()( const size_type iRow ) const
{
const size_type iEntryBegin = m_A.graph.row_map[iRow];
const size_type iEntryEnd = m_A.graph.row_map[iRow+1];
double sum = 0 ;
#if defined( __INTEL_COMPILER )
// Intel compiler: explicitly request vectorized reduction of the row sum.
#pragma simd reduction(+:sum)
#pragma ivdep
for ( size_type iEntry = iEntryBegin ; iEntry < iEntryEnd ; ++iEntry ) {
sum += m_A.coefficients(iEntry) * m_x( m_A.graph.entries(iEntry) );
}
#else
// Same loop without Intel-specific pragmas.
for ( size_type iEntry = iEntryBegin ; iEntry < iEntryEnd ; ++iEntry ) {
sum += m_A.coefficients(iEntry) * m_x( m_A.graph.entries(iEntry) );
}
#endif
m_y(iRow) = sum ;
}
// Constructing the functor immediately launches the multiply over all
// 'nrow' rows.  The column count is unused on the host path (the graph
// entries already index into x).
Multiply( const matrix_type & A ,
const size_type nrow ,
const size_type , // ncol ,
const vector_type & x ,
const vector_type & y )
: m_A( A ), m_x( x ), m_y( y )
{
parallel_for( nrow , *this );
}
};
//----------------------------------------------------------------------------
} // namespace Impl
} // namespace Kokkos
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
//----------------------------------------------------------------------------
// Distributed sparse matrix-vector operator.  Wraps a local CrsMatrix plus
// the inter-process exchange needed to gather off-process entries of the
// input vector before the local multiply.
template< typename AScalarType ,
typename VScalarType ,
class Device >
class Operator {
typedef CrsMatrix<AScalarType,Device> matrix_type ;
typedef View<VScalarType*,Device> vector_type ;
private:
const CrsMatrix<AScalarType,Device> A ;
ParallelDataMap data_map ;
AsyncExchange< VScalarType , Device , ParallelDataMap > exchange ;
public:
// chunk size 1: one scalar exchanged per shared node.
Operator( const ParallelDataMap & arg_data_map ,
const CrsMatrix<AScalarType,Device> & arg_A )
: A( arg_A )
, data_map( arg_data_map )
, exchange( arg_data_map , 1 )
{}
// y = A * x.  NOTE(review): 'x' is also written to — received off-process
// values are unpacked into its [count_owned, count_owned+count_receive)
// tail, so 'x' must be sized owned + received (see cgsolve's pAll).
void apply( const View<VScalarType*,Device> & x ,
const View<VScalarType*,Device> & y )
{
// Gather off-processor data for 'x'
PackArray< vector_type >::pack( exchange.buffer() ,
data_map.count_interior ,
data_map.count_send , x );
exchange.setup();
// If interior & boundary matrices then could launch interior multiply
exchange.send_receive();
UnpackArray< vector_type >::unpack( x , exchange.buffer() ,
data_map.count_owned ,
data_map.count_receive );
const typename Device::size_type nrow = data_map.count_owned ;
const typename Device::size_type ncol = data_map.count_owned +
data_map.count_receive ;
// Constructor of Impl::Multiply performs the local multiply.
Impl::Multiply<matrix_type,vector_type,vector_type>( A, nrow, ncol, x, y);
}
};
//----------------------------------------------------------------------------
// Unpreconditioned conjugate-gradient solve of A * x = b on a distributed
// CRS matrix.  On return: 'iteration' holds the iteration count, 'normr'
// the final residual norm, 'iter_time' the wall-clock seconds spent in the
// iteration loop.  'x' must hold the initial guess on entry.
// NOTE(review): the default tolerance uses std::numeric_limits, which
// requires <limits>; only <cmath> is included here — confirm include set.
template< typename AScalarType , typename VScalarType , class Device >
void cgsolve(
const ParallelDataMap data_map ,
const CrsMatrix<AScalarType,Device> A ,
const View<VScalarType*,Device> b ,
const View<VScalarType*,Device> x ,
size_t & iteration ,
double & normr ,
double & iter_time ,
const size_t maximum_iteration = 200 ,
const double tolerance = std::numeric_limits<VScalarType>::epsilon() )
{
typedef View<VScalarType*,Device> vector_type ;
//typedef View<VScalarType, Device> value_type ; // unused
const size_t count_owned = data_map.count_owned ;
const size_t count_total = data_map.count_owned + data_map.count_receive ;
Operator<AScalarType,VScalarType,Device> matrix_operator( data_map , A );
// Need input vector to matvec to be owned + received
vector_type pAll ( "cg::p" , count_total );
// 'p' is the owned prefix of pAll; the matvec unpacks received values
// into pAll's tail.
vector_type p = Kokkos::subview( pAll , std::pair<size_t,size_t>(0,count_owned) );
vector_type r ( "cg::r" , count_owned );
vector_type Ap( "cg::Ap", count_owned );
/* r = b - A * x ; */
/* p = x */ deep_copy( p , x );
/* Ap = A * p */ matrix_operator.apply( pAll , Ap );
/* r = b - Ap */ waxpby( count_owned , 1.0 , b , -1.0 , Ap , r );
/* p = r */ deep_copy( p , r );
// old_rdot = r . r  (global dot product across ranks)
double old_rdot = dot( count_owned , r , data_map.machine );
normr = sqrt( old_rdot );
iteration = 0 ;
Kokkos::Impl::Timer wall_clock ;
while ( tolerance < normr && iteration < maximum_iteration ) {
/* pAp_dot = dot( p , Ap = A * p ) */
/* Ap = A * p */ matrix_operator.apply( pAll , Ap );
const double pAp_dot = dot( count_owned , p , Ap , data_map.machine );
const double alpha = old_rdot / pAp_dot ;
/* x += alpha * p ; */ axpy( count_owned, alpha, p , x );
/* r -= alpha * Ap ; */ axpy( count_owned, -alpha, Ap, r );
const double r_dot = dot( count_owned , r , data_map.machine );
const double beta = r_dot / old_rdot ;
/* p = r + beta * p ; */ xpby( count_owned , r , beta , p );
// Record the new residual norm and carry r_dot into the next iteration.
normr = sqrt( old_rdot = r_dot );
++iteration ;
}
iter_time = wall_clock.seconds();
}
//----------------------------------------------------------------------------
} // namespace Kokkos
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
#if defined( KOKKOS_HAVE_CUDA )
#if ( CUDA_VERSION < 6000 )
#pragma message "cusparse_v2.h"
#include <cusparse_v2.h>
#else
#pragma message "cusparse.h"
#include <cusparse.h>
#endif
namespace Kokkos {
namespace Impl {
// Process-wide cuSPARSE context: one library handle and one matrix
// descriptor (general matrix, zero-based indexing) shared by all
// cuSPARSE-backed Multiply specializations.
// NOTE(review): the cusparse* return statuses are not checked here —
// a failed initialization would only surface later in csrmv calls.
struct CudaSparseSingleton {
cusparseHandle_t handle;
cusparseMatDescr_t descra;
CudaSparseSingleton()
{
cusparseCreate( & handle );
cusparseCreateMatDescr( & descra );
cusparseSetMatType( descra , CUSPARSE_MATRIX_TYPE_GENERAL );
cusparseSetMatIndexBase( descra , CUSPARSE_INDEX_BASE_ZERO );
}
// Accessor for the lazily-constructed singleton (defined elsewhere).
static CudaSparseSingleton & singleton();
};
// cuSPARSE-backed specialization of y = A * x for double-precision
// matrices/vectors on the Cuda device.
template<>
struct Multiply< CrsMatrix<double,Cuda> ,
View<double*,Cuda > ,
View<double*,Cuda > >
{
typedef Cuda execution_space ;
typedef execution_space::size_type size_type ;
typedef double scalar_type ;
typedef View< scalar_type* , execution_space > vector_type ;
typedef CrsMatrix< scalar_type , execution_space > matrix_type ;
public:
// Performs y = A * x immediately upon construction.
// nrow/ncol: owned row count and total (owned + received) column count.
Multiply( const matrix_type & A ,
const size_type nrow ,
const size_type ncol ,
const vector_type & x ,
const vector_type & y )
{
CudaSparseSingleton & s = CudaSparseSingleton::singleton();
// alpha = 1, beta = 0  =>  plain product, y is overwritten.
const scalar_type alpha = 1 , beta = 0 ;
cusparseStatus_t status =
cusparseDcsrmv( s.handle ,
CUSPARSE_OPERATION_NON_TRANSPOSE ,
nrow , ncol , A.coefficients.dimension_0() ,
&alpha ,
s.descra ,
A.coefficients.ptr_on_device() ,
A.graph.row_map.ptr_on_device() ,
A.graph.entries.ptr_on_device() ,
x.ptr_on_device() ,
&beta ,
y.ptr_on_device() );
if ( CUSPARSE_STATUS_SUCCESS != status ) {
throw std::runtime_error( std::string("ERROR - cusparseDcsrmv " ) );
}
}
};
// cuSPARSE-backed specialization of y = A * x for single-precision
// matrices/vectors on the Cuda device.
template<>
struct Multiply< CrsMatrix<float,Cuda> ,
View<float*,Cuda > ,
View<float*,Cuda > >
{
typedef Cuda execution_space ;
typedef execution_space::size_type size_type ;
typedef float scalar_type ;
typedef View< scalar_type* , execution_space > vector_type ;
typedef CrsMatrix< scalar_type , execution_space > matrix_type ;
public:
// Performs y = A * x immediately upon construction.
// nrow/ncol: owned row count and total (owned + received) column count.
Multiply( const matrix_type & A ,
const size_type nrow ,
const size_type ncol ,
const vector_type & x ,
const vector_type & y )
{
CudaSparseSingleton & s = CudaSparseSingleton::singleton();
// alpha = 1, beta = 0  =>  plain product, y is overwritten.
const scalar_type alpha = 1 , beta = 0 ;
cusparseStatus_t status =
cusparseScsrmv( s.handle ,
CUSPARSE_OPERATION_NON_TRANSPOSE ,
nrow , ncol , A.coefficients.dimension_0() ,
&alpha ,
s.descra ,
A.coefficients.ptr_on_device() ,
A.graph.row_map.ptr_on_device() ,
A.graph.entries.ptr_on_device() ,
x.ptr_on_device() ,
&beta ,
y.ptr_on_device() );
if ( CUSPARSE_STATUS_SUCCESS != status ) {
// Bug fix: the diagnostic previously named the double-precision
// routine (cusparseDcsrmv); this path calls cusparseScsrmv.
throw std::runtime_error( std::string("ERROR - cusparseScsrmv " ) );
}
}
};
} /* namespace Impl */
} /* namespace Kokkos */
#endif /* #if defined( KOKKOS_HAVE_CUDA ) */
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
#endif /* #ifndef SPARSELINEARSYSTEM_HPP */

View File

@ -1,276 +0,0 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef SPARSELINEARSYSTEMFILL_HPP
#define SPARSELINEARSYSTEMFILL_HPP
#include <vector>
#include <algorithm>
#include <limits>
#include <FEMesh.hpp>
#include <SparseLinearSystem.hpp>
//----------------------------------------------------------------------------
namespace HybridFEM {
// Primary template: gather element contributions into the global linear
// system.  Specialized below for Kokkos::CrsMatrix + FEMesh.
template< class MatrixType , class MeshType ,
class elem_matrices_type ,
class elem_vectors_type > struct GatherFill ;
// Gather-based assembly of the global sparse matrix and right-hand side
// from per-element stiffness matrices and load vectors.  Each parallel
// work item owns one matrix row (= one node), so no atomics are needed:
// a row's coefficients are written only by that row's work item.
template< typename ScalarType ,
class DeviceType ,
unsigned ElemNode ,
typename CoordScalarType ,
class elem_matrices_type ,
class elem_vectors_type >
struct GatherFill<
Kokkos::CrsMatrix< ScalarType , DeviceType > ,
FEMesh< CoordScalarType , ElemNode , DeviceType > ,
elem_matrices_type , elem_vectors_type >
{
typedef DeviceType execution_space ;
typedef typename execution_space::size_type size_type ;
static const size_type ElemNodeCount = ElemNode ;
typedef Kokkos::CrsMatrix< ScalarType , execution_space > matrix_type ;
typedef typename matrix_type::coefficients_type coefficients_type ;
typedef Kokkos::View< ScalarType[] , execution_space > vector_type ;
// (element, element-row, element-col) -> index into the CRS coefficients.
typedef Kokkos::View< size_type[][ElemNodeCount][ElemNodeCount] , execution_space > elem_graph_type ;
typedef FEMesh< CoordScalarType , ElemNodeCount , execution_space > mesh_type ;
typedef typename mesh_type::node_elem_ids_type node_elem_ids_type ;
private:
node_elem_ids_type node_elem_ids ;
elem_graph_type elem_graph ;
elem_matrices_type elem_matrices ;
elem_vectors_type elem_vectors ;
coefficients_type system_coeff ;
vector_type system_rhs ;
public:
// Accumulate all element contributions for matrix row / rhs entry 'irow'.
KOKKOS_INLINE_FUNCTION
void operator()( size_type irow ) const
{
const size_type node_elem_begin = node_elem_ids.row_map[irow];
const size_type node_elem_end = node_elem_ids.row_map[irow+1];
// for each element that a node belongs to
for ( size_type i = node_elem_begin ; i < node_elem_end ; i++ ) {
// entries(i,0) = element id, entries(i,1) = this node's local row
// within that element.
const size_type elem_id = node_elem_ids.entries( i, 0);
const size_type row_index = node_elem_ids.entries( i, 1);
system_rhs(irow) += elem_vectors(elem_id, row_index);
// for each node in a particular related element
// gather the contents of the element stiffness
// matrix that belong in irow
for ( size_type j = 0 ; j < ElemNodeCount ; ++j ){
const size_type A_index = elem_graph( elem_id , row_index , j );
system_coeff( A_index ) += elem_matrices( elem_id, row_index, j );
}
}
}
// Launch the assembly: one work item per matrix row.
static void apply( const matrix_type & matrix ,
const vector_type & rhs ,
const mesh_type & mesh ,
const elem_graph_type & elem_graph ,
const elem_matrices_type & elem_matrices ,
const elem_vectors_type & elem_vectors )
{
// row_map has (row_count + 1) offsets.
const size_t row_count = matrix.graph.row_map.dimension_0() - 1 ;
GatherFill op ;
op.node_elem_ids = mesh.node_elem_ids ;
op.elem_graph = elem_graph ;
op.elem_matrices = elem_matrices ;
op.elem_vectors = elem_vectors ;
op.system_coeff = matrix.coefficients ;
op.system_rhs = rhs ;
parallel_for( row_count , op );
}
};
} /* namespace HybridFEM */
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace HybridFEM {
// Builds, on the host, the node-node sparsity graph of the finite-element
// matrix plus the element map used by GatherFill: for every
// (element, local row, local column) it records the index of the matching
// entry in the CRS coefficient array.
template< class GraphType , class MeshType >
struct GraphFactory {
typedef GraphType graph_type ;
typedef MeshType mesh_type ;
typedef typename graph_type::execution_space execution_space ;
typedef typename execution_space::size_type size_type ;
static const unsigned ElemNodeCount = mesh_type::element_node_count ;
typedef Kokkos::View< size_type[][ElemNodeCount][ElemNodeCount] , execution_space > element_map_type ;
// Populate 'graph' and 'elem_map' from the mesh connectivity.
static
void
create( const mesh_type & mesh ,
graph_type & graph ,
element_map_type & elem_map )
{
// Host mirrors of the device-resident connectivity.
typename mesh_type::node_elem_ids_type::HostMirror
node_elem_ids = create_mirror( mesh.node_elem_ids );
typename mesh_type::elem_node_ids_type::HostMirror
elem_node_ids = create_mirror( mesh.elem_node_ids );
typedef typename element_map_type::HostMirror element_map_host_type ;
deep_copy( elem_node_ids , mesh.elem_node_ids );
// NOTE(review): only 'entries' is copied here; row_map is presumably
// filled when the mirror is created — confirm against StaticCrsGraph.
deep_copy( node_elem_ids.entries , mesh.node_elem_ids.entries );
const size_t owned_node = mesh.parallel_data_map.count_owned ;
const size_t total_elem = mesh.elem_node_ids.dimension_0();
if ( total_elem ) {
elem_map = element_map_type( std::string("element_map"), total_elem );
}
element_map_host_type elem_map_host = create_mirror( elem_map );
//------------------------------------
// Node->node mapping for the CrsMatrix graph
std::vector< std::vector< unsigned > > node_node_ids( owned_node );
std::vector< unsigned > node_node_begin( owned_node );
size_t offset = 0 ;
// For each owned node: collect, sort, and de-duplicate the nodes of
// every element containing it.  'offset' accumulates the CRS row start.
for ( size_t i = 0 ; i < owned_node ; ++i ) {
const size_t j_end = node_elem_ids.row_map[i+1];
size_t j = node_elem_ids.row_map[i];
node_node_begin[i] = offset ;
std::vector< unsigned > & work = node_node_ids[i] ;
for ( ; j < j_end ; ++j ) {
const size_t elem_id = node_elem_ids.entries(j,0);
for ( size_t k = 0 ; k < ElemNodeCount ; ++k ) {
work.push_back( elem_node_ids( elem_id , k ) );
}
}
std::sort( work.begin() , work.end() );
work.erase( std::unique( work.begin() , work.end() ) , work.end() );
offset += work.size();
}
graph = Kokkos::create_staticcrsgraph< graph_type >( "node_node_ids" , node_node_ids );
//------------------------------------
// ( element , node_row , node_column ) -> matrix_crs_column
for ( size_t elem_id = 0 ; elem_id < total_elem ; ++elem_id ) {
for ( size_t i = 0 ; i < ElemNodeCount ; ++i ) {
const size_t node_row = elem_node_ids( elem_id , i );
const size_t node_row_begin = node_node_begin[ node_row ];
const std::vector< unsigned > & column = node_node_ids[ node_row ] ;
if ( owned_node <= node_row ) {
// Row owned by another process: mark entries as unmapped.
for ( unsigned j = 0 ; j < ElemNodeCount ; ++j ) {
elem_map_host( elem_id , i , j ) = std::numeric_limits<size_type>::max();
}
}
else {
for ( unsigned j = 0 ; j < ElemNodeCount ; ++j ) {
const size_type node_col = elem_node_ids( elem_id , j );
// Binary search (lower-bound) for node_col in the sorted row.
int col_search = 0 ;
for ( int len = column.size() ; 0 < len ; ) {
const int half = len >> 1;
const int middle = col_search + half ;
if ( column[middle] < node_col ){
col_search = middle + 1 ;
len -= half + 1 ;
}
else {
len = half ;
}
}
// The column must exist: every element node was inserted above.
if ( node_col != column[col_search] ) {
throw std::runtime_error(std::string("Failed"));
}
elem_map_host( elem_id , i , j ) = col_search + node_row_begin ;
}
}
}
}
deep_copy( elem_map , elem_map_host );
}
};
} // namespace HybridFEM
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
#endif /* #ifndef SPARSELINEARSYSTEMFILL_HPP */

View File

@ -1,164 +0,0 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef SPARSELINEARSYSTEM_CUDA_HPP
#define SPARSELINEARSYSTEM_CUDA_HPP
#if defined( BUILD_FROM_CU_FILE )
#include <cusparse_v2.h>
#include <Kokkos_Core.hpp>
namespace Kokkos {
namespace Impl {
// Process-wide cuSPARSE context: one library handle and one matrix
// descriptor (general matrix, zero-based indexing) shared by the
// cuSPARSE-backed Multiply specializations below.
// NOTE(review): the cusparse* return statuses are not checked here —
// a failed initialization would only surface later in csrmv calls.
struct CudaSparseSingleton {
cusparseHandle_t handle;
cusparseMatDescr_t descra;
CudaSparseSingleton()
{
cusparseCreate( & handle );
cusparseCreateMatDescr( & descra );
cusparseSetMatType( descra , CUSPARSE_MATRIX_TYPE_GENERAL );
cusparseSetMatIndexBase( descra , CUSPARSE_INDEX_BASE_ZERO );
}
static CudaSparseSingleton & singleton();
};
// Meyers singleton: the instance is constructed on first use and the same
// object is returned on every subsequent call.
CudaSparseSingleton & CudaSparseSingleton::singleton()
{
  static CudaSparseSingleton instance ;
  return instance ;
}
// cuSPARSE-backed specialization of y = A * x for double-precision
// matrices/vectors on the Cuda device.
template<>
struct Multiply< CrsMatrix<double,Cuda> ,
View<double*,Cuda > ,
View<double*,Cuda > >
{
typedef Cuda execution_space ;
typedef execution_space::size_type size_type ;
typedef double scalar_type ;
typedef View< scalar_type* , execution_space > vector_type ;
typedef CrsMatrix< scalar_type , execution_space > matrix_type ;
public:
// Performs y = A * x immediately upon construction.
// nrow/ncol: owned row count and total (owned + received) column count.
Multiply( const matrix_type & A ,
const size_type nrow ,
const size_type ncol ,
const vector_type & x ,
const vector_type & y )
{
CudaSparseSingleton & s = CudaSparseSingleton::singleton();
// alpha = 1, beta = 0  =>  plain product, y is overwritten.
const scalar_type alpha = 1 , beta = 0 ;
cusparseStatus_t status =
cusparseDcsrmv( s.handle ,
CUSPARSE_OPERATION_NON_TRANSPOSE ,
nrow , ncol , A.coefficients.dimension_0() ,
&alpha ,
s.descra ,
A.coefficients.ptr_on_device() ,
A.graph.row_map.ptr_on_device() ,
A.graph.entries.ptr_on_device() ,
x.ptr_on_device() ,
&beta ,
y.ptr_on_device() );
if ( CUSPARSE_STATUS_SUCCESS != status ) {
throw std::runtime_error( std::string("ERROR - cusparseDcsrmv " ) );
}
}
};
// cuSPARSE-backed specialization of y = A * x for single-precision
// matrices/vectors on the Cuda device.
template<>
struct Multiply< CrsMatrix<float,Cuda> ,
View<float*,Cuda > ,
View<float*,Cuda > >
{
typedef Cuda execution_space ;
typedef execution_space::size_type size_type ;
typedef float scalar_type ;
typedef View< scalar_type* , execution_space > vector_type ;
typedef CrsMatrix< scalar_type , execution_space > matrix_type ;
public:
// Performs y = A * x immediately upon construction.
// nrow/ncol: owned row count and total (owned + received) column count.
Multiply( const matrix_type & A ,
const size_type nrow ,
const size_type ncol ,
const vector_type & x ,
const vector_type & y )
{
CudaSparseSingleton & s = CudaSparseSingleton::singleton();
// alpha = 1, beta = 0  =>  plain product, y is overwritten.
const scalar_type alpha = 1 , beta = 0 ;
cusparseStatus_t status =
cusparseScsrmv( s.handle ,
CUSPARSE_OPERATION_NON_TRANSPOSE ,
nrow , ncol , A.coefficients.dimension_0() ,
&alpha ,
s.descra ,
A.coefficients.ptr_on_device() ,
A.graph.row_map.ptr_on_device() ,
A.graph.entries.ptr_on_device() ,
x.ptr_on_device() ,
&beta ,
y.ptr_on_device() );
if ( CUSPARSE_STATUS_SUCCESS != status ) {
// Bug fix: the diagnostic previously named the double-precision
// routine (cusparseDcsrmv); this path calls cusparseScsrmv.
throw std::runtime_error( std::string("ERROR - cusparseScsrmv " ) );
}
}
};
} /* namespace Impl */
} /* namespace Kokkos */
#endif /* #if defined( BUILD_FROM_CU_FILE ) */
#endif /* #ifndef SPARSELINEARSYSTEM_CUDA_HPP */

View File

@ -1,242 +0,0 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef TESTFEMESHBOXFIXTURE_HPP
#define TESTFEMESHBOXFIXTURE_HPP
#include <stdio.h>
#include <iostream>
#include <stdexcept>
#include <limits>
#include <utility>
#include <BoxMeshFixture.hpp>
#include <ParallelComm.hpp>
//----------------------------------------------------------------------------
namespace TestFEMesh {
// Primary template: verification functor comparing exchanged coordinate
// data against local node coordinates.  Specialized below for T*[3] views.
template< class ViewType >
struct VerifyUnpack ;
// Parallel-reduce functor: compares received 3-component node coordinates
// in 'buffer' against the locally stored 'node_coords' and counts the
// mismatches.  The reduction value is the number of failed nodes.
template< typename DeviceType, typename T >
struct VerifyUnpack< Kokkos::View< T*[3] , DeviceType > >
{
typedef DeviceType execution_space ;
typedef typename execution_space::size_type size_type ;
typedef size_type value_type ;
typedef Kokkos::View< T* , execution_space > buffer_type ;
typedef Kokkos::View< T*[3] , execution_space > array_type ;
private:
array_type node_coords ;
buffer_type buffer ;
size_type node_begin ;
public:
// Reduction identity: zero mismatches.
KOKKOS_INLINE_FUNCTION
static void init( value_type & update )
{ update = 0 ; }
// Reduction join: mismatch counts add.
KOKKOS_INLINE_FUNCTION
static void join( volatile value_type & update ,
const volatile value_type & source )
{ update += source ; }
// Compare the i-th received node (3 packed values) against the stored
// coordinates of node (node_begin + i); count a mismatch.
KOKKOS_INLINE_FUNCTION
void operator()( const size_type i , value_type & update ) const
{
const size_type node_id = i + node_begin ;
const size_type k = i * 3 ;
const long xb = buffer[k];
const long yb = buffer[k+1];
const long zb = buffer[k+2];
const long xn = node_coords(node_id,0);
const long yn = node_coords(node_id,1);
const long zn = node_coords(node_id,2);
if ( xb != xn || yb != yn || zb != zn ) {
printf("TestFEMesh::VerifyUnpack failed at %d : node %d : { %ld %ld %ld } != { %ld %ld %ld }\n",
(int)i,(int)node_id, xb,yb,zb, xn, yn, zn );
++update ;
}
}
// Run the verification over 'arg_node_count' received nodes; returns the
// number of mismatched nodes (0 == success).
static inline
size_type unpack( const array_type & arg_node_coords ,
const size_type arg_node_begin ,
const size_type arg_node_count ,
const buffer_type & arg_buffer )
{
VerifyUnpack op ;
op.node_coords = arg_node_coords ;
op.buffer = arg_buffer ;
op.node_begin = arg_node_begin ;
size_type count = 0 ;
Kokkos::parallel_reduce( arg_node_count , op , count );
return count ;
}
};
}
//----------------------------------------------------------------------------
#ifdef KOKKOS_HAVE_MPI
namespace TestFEMesh {
// MPI build: exchange node coordinates between ranks and verify that every
// received coordinate matches the locally stored copy.  Rank 0 prints a
// PASSED/FAILED summary (failure == any mismatched node on any rank).
template< typename coordinate_scalar_type ,
unsigned ElemNodeCount ,
class Device >
void verify_parallel(
const HybridFEM::FEMesh< coordinate_scalar_type ,
ElemNodeCount ,
Device > & mesh )
{
typedef HybridFEM::FEMesh< coordinate_scalar_type, ElemNodeCount, Device > femesh_type ;
typedef typename femesh_type::node_coords_type node_coords_type ;
comm::Machine machine = mesh.parallel_data_map.machine ;
// Communicate node coordinates to verify communication and setup.
const size_t chunk_size = 3 ;
Kokkos::AsyncExchange< coordinate_scalar_type, Device, Kokkos::ParallelDataMap >
exchange( mesh.parallel_data_map , chunk_size );
// Send range: [count_interior, count_interior + count_send);
// receive range: [count_owned, count_owned + count_receive).
const size_t send_begin = mesh.parallel_data_map.count_interior ;
const size_t send_count = mesh.parallel_data_map.count_send ;
const size_t recv_begin = mesh.parallel_data_map.count_owned ;
const size_t recv_count = mesh.parallel_data_map.count_receive ;
typedef Kokkos::PackArray< node_coords_type > pack_type ;
pack_type::pack( exchange.buffer(), send_begin, send_count, mesh.node_coords );
exchange.setup();
// Launch local-action device kernels
exchange.send_receive();
// local[0]=owned nodes, local[1]=received nodes, local[2]=mismatches.
unsigned long local[3] ;
local[0] = mesh.parallel_data_map.count_owned ;
local[1] = mesh.parallel_data_map.count_receive ;
local[2] = TestFEMesh::VerifyUnpack< node_coords_type >::unpack( mesh.node_coords, recv_begin, recv_count, exchange.buffer() );
unsigned long global[3] = { 0 , 0 , 0 };
MPI_Allreduce( local , global ,
3 , MPI_UNSIGNED_LONG , MPI_SUM , machine.mpi_comm );
if ( 0 == comm::rank( machine ) ) {
// NOTE(review): "verified_nodes" prints global[1], the summed receive
// count — i.e. nodes that were exchanged and checked, not all nodes.
std::cout << ( global[2] ? "FAILED" : "PASSED" )
<< ": TestFEMesh::verify_parallel "
<< "NP(" << comm::size( machine )
<< ") total_node(" << global[0]
<< ") verified_nodes(" << global[1]
<< ") failed_nodes(" << global[2]
<< ")" << std::endl ;
}
}
} // namespace TestFEMesh
#else /* ! #ifdef KOKKOS_HAVE_MPI */
namespace TestFEMesh {
// Non-MPI build: nothing to exchange, so verification is a no-op.
template< typename coordinate_scalar_type ,
unsigned ElemNodeCount ,
class Device >
void verify_parallel(
const HybridFEM::FEMesh< coordinate_scalar_type ,
ElemNodeCount ,
Device > & )
{}
} // namespace TestFEMesh
#endif /* ! #ifdef KOKKOS_HAVE_MPI */
//----------------------------------------------------------------------------
// Build a partitioned hex-8 box mesh of (nodes_nx x nodes_ny x nodes_nz)
// nodes for this rank and verify its parallel data exchange.
template< class Device >
void test_box_fixture( comm::Machine machine ,
const size_t gang_count ,
const size_t nodes_nx ,
const size_t nodes_ny ,
const size_t nodes_nz )
{
typedef long coordinate_scalar_type ;
typedef FixtureElementHex8 fixture_element_type ;
typedef BoxMeshFixture< coordinate_scalar_type ,
Device ,
fixture_element_type > fixture_type ;
typedef typename fixture_type::FEMeshType mesh_type ;
const size_t proc_count = comm::size( machine );
const size_t proc_local = comm::rank( machine ) ;
// create() takes element counts, hence the (nodes - 1) per axis.
mesh_type mesh =
fixture_type::create( proc_count , proc_local , gang_count ,
nodes_nx - 1 , nodes_ny - 1 , nodes_nz - 1 );
mesh.parallel_data_map.machine = machine ;
TestFEMesh::verify_parallel( mesh );
}
#endif /* #ifndef TESTFEMESHBOXFIXTURE_HPP */

View File

@ -1,172 +0,0 @@
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
#include <iostream>
#include <stdexcept>
#include <limits>
#include <utility>
#include <BoxMeshPartition.hpp>
//----------------------------------------------------------------------------
// Exercise recursive-coordinate-bisection box partitioning for a range of
// process counts: checks load balance (printed when 'print' is set) and,
// for a sample of partitions, cross-checks each partition's receive counts
// against the matching partitions' send maps.  Throws on inconsistency.
void test_box_partition( bool print )
{
const size_t np_max = 10000 ;
const BoxBoundsLinear use_box ;
// Global 100 x 200 x 300 cell box.
BoxType root_box ;
root_box[0][0] = 0 ; root_box[0][1] = 100 ;
root_box[1][0] = 0 ; root_box[1][1] = 200 ;
root_box[2][0] = 0 ; root_box[2][1] = 300 ;
const size_t cell_total =
( root_box[0][1] - root_box[0][0] ) *
( root_box[1][1] - root_box[1][0] ) *
( root_box[2][1] - root_box[2][0] );
// Sweep process counts: 2, 6, 14, ... (np = 2*(np+1)).
for ( size_t np = 2 ; np < np_max ; np = 2 * ( np + 1 ) ) {
std::vector<BoxType> part_boxes( np );
box_partition_rcb( root_box , part_boxes );
// Perfectly balanced partition size, rounded up.
size_t cell_goal = ( cell_total + np - 1 ) / np ;
size_t cell_max = 0 ;
for ( size_t i = 0 ; i < np ; ++i ) {
cell_max = std::max( cell_max , count( part_boxes[i] ) );
}
if ( print ) {
std::cout << std::endl
<< "box_part( " << np
<< " ) max( " << cell_max
<< " ) goal( " << cell_goal
<< " ) ratio( " << double(cell_max) / double(cell_goal)
<< " )" << std::endl ;
}
// Check at most 4 sample partitions, evenly strided across [0, np).
const size_t nsample = std::min(np,(size_t)4);
const size_t stride = ( np + nsample - 1 ) / nsample ;
for ( size_t my_part = 0 ; my_part < np ; my_part += stride ) {
BoxType my_use_box ;
std::vector<size_t> my_use_id_map ;
size_t my_count_interior ;
size_t my_count_owned ;
size_t my_count_uses ;
std::vector<size_t> my_recv_counts ;
std::vector<std::vector<size_t> > my_send_map ;
size_t count_verify = 0 ;
box_partition_maps( root_box , part_boxes ,
use_box , my_part ,
my_use_box , my_use_id_map ,
my_count_interior ,
my_count_owned ,
my_count_uses ,
my_recv_counts ,
my_send_map );
count_verify = my_count_owned ;
if ( print ) {
std::cout << "  my_part(" << my_part << ") layout { "
<< "P" << my_part
<< "(" << my_count_interior
<< "," << ( my_count_owned - my_count_interior )
<< ")" ;
}
// Walk the other partitions this one receives from.
for ( size_t i = 1 ; i < np ; ++i ) {
if ( my_recv_counts[i] ) {
count_verify += my_recv_counts[i] ;
const size_t ip = ( my_part + i ) % np ;
if ( print ) {
std::cout << " P" << ip << "(" << my_recv_counts[i] << ")" ;
}
// Compare recv & send lists
BoxType ip_use_box ;
std::vector<size_t> ip_use_id_map ;
size_t ip_count_interior ;
size_t ip_count_owned ;
size_t ip_count_uses ;
std::vector<size_t> ip_recv_counts ;
std::vector<std::vector<size_t> > ip_send_map ;
box_partition_maps( root_box , part_boxes ,
use_box , ip ,
ip_use_box , ip_use_id_map ,
ip_count_interior ,
ip_count_owned ,
ip_count_uses ,
ip_recv_counts ,
ip_send_map );
// Sent by ip, received by my_part:
const BoxType recv_send = intersect( part_boxes[ip] , my_use_box );
const size_t recv_send_count = count( recv_send );
// 'j' is my_part's offset in ip's circular send ordering.
const size_t j = ( my_part + np - ip ) % np ;
if ( recv_send_count != my_recv_counts[i] ||
recv_send_count != ip_send_map[j].size() ) {
throw std::runtime_error( std::string("bad recv/send map") );
}
}
}
if ( print ) { std::cout << " }" << std::endl ; }
// Owned plus all received cells must equal the total used-cell count.
if ( count_verify != my_count_uses ) {
throw std::runtime_error( std::string("bad partition map") );
}
}
}
}

View File

@ -1,192 +0,0 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#include <Kokkos_Core.hpp>
#include <TestBoxMeshFixture.hpp>
#include <Implicit.hpp>
#include <Nonlinear.hpp>
#include <Explicit.hpp>
#include <SparseLinearSystem.hpp>
#if defined( KOKKOS_HAVE_CUDA )
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {

// Process-wide CudaSparseSingleton instance, constructed on first use
// (function-local static / Meyers singleton).
CudaSparseSingleton & CudaSparseSingleton::singleton()
{ static CudaSparseSingleton s ; return s ; }

}
}
//----------------------------------------------------------------------------
// Report how many Cuda devices this process can detect, tagged with the
// process's communicator rank.
void test_cuda_query( comm::Machine machine )
{
  const size_t rank    = comm::rank( machine );
  const size_t ndevice = Kokkos::Cuda::detect_device_count();

  std::cout << "P" << rank
            << ": Cuda device_count = " << ndevice
            << std::endl ;
}
//----------------------------------------------------------------------------
// Run the box-mesh fixture test on Cuda, choosing the device round-robin
// by communicator rank.
void test_cuda_fixture( comm::Machine machine ,
                        size_t nx , size_t ny , size_t nz )
{
  const size_t rank    = comm::rank( machine );
  const size_t nproc   = comm::size( machine );
  const size_t ndevice = Kokkos::Cuda::detect_device_count();

  // Assign a distinct device only when every process can get one;
  // otherwise all processes share device zero.
  size_t device = 0 ;
  if ( ndevice && ndevice <= nproc ) { device = rank % ndevice ; }

  Kokkos::HostSpace::execution_space::initialize();
  Kokkos::Cuda::initialize( Kokkos::Cuda::SelectDevice( device ) );

  test_box_fixture<Kokkos::Cuda>( machine , 0 /* gang_count */ , nx , ny , nz );

  Kokkos::Cuda::finalize();
  Kokkos::HostSpace::execution_space::finalize();
}
//----------------------------------------------------------------------------
// Sweep the implicit-FEM driver over [elem_count_begin, elem_count_end)
// on Cuda, repeating each size 'count_run' times.
void test_cuda_implicit( comm::Machine machine ,
                         size_t elem_count_begin ,
                         size_t elem_count_end ,
                         size_t count_run )
{
  const size_t rank    = comm::rank( machine );
  const size_t nproc   = comm::size( machine );
  const size_t ndevice = Kokkos::Cuda::detect_device_count();

  // Assign a distinct device only when every process can get one.
  size_t device = 0 ;
  if ( ndevice && ndevice <= nproc ) { device = rank % ndevice ; }

  Kokkos::HostSpace::execution_space::initialize();
  Kokkos::Cuda::initialize( Kokkos::Cuda::SelectDevice( device ) );

  HybridFEM::Implicit::driver<double,Kokkos::Cuda>( "Cuda" , machine , 0 /* gang_count */ , elem_count_begin , elem_count_end , count_run );

  Kokkos::Cuda::finalize();
  Kokkos::HostSpace::execution_space::finalize();
}
//----------------------------------------------------------------------------
// Sweep the explicit-FEM driver over [elem_count_begin, elem_count_end)
// on Cuda, repeating each size 'count_run' times.
void test_cuda_explicit( comm::Machine machine ,
                         size_t elem_count_begin ,
                         size_t elem_count_end ,
                         size_t count_run )
{
  const size_t rank    = comm::rank( machine );
  const size_t nproc   = comm::size( machine );
  const size_t ndevice = Kokkos::Cuda::detect_device_count();

  // Assign a distinct device only when every process can get one.
  size_t device = 0 ;
  if ( ndevice && ndevice <= nproc ) { device = rank % ndevice ; }

  Kokkos::HostSpace::execution_space::initialize();
  Kokkos::Cuda::initialize( Kokkos::Cuda::SelectDevice( device ) );

  Explicit::driver<double,Kokkos::Cuda>( "Cuda" , machine , 0 /* gang_count */ , elem_count_begin , elem_count_end , count_run );

  Kokkos::Cuda::finalize();
  Kokkos::HostSpace::execution_space::finalize();
}
//----------------------------------------------------------------------------
// Sweep the nonlinear-FEM driver with linear hex-8 elements over
// [elem_count_begin, elem_count_end) on Cuda, 'count_run' times per size.
void test_cuda_nonlinear( comm::Machine machine ,
                          size_t elem_count_begin ,
                          size_t elem_count_end ,
                          size_t count_run )
{
  const size_t rank    = comm::rank( machine );
  const size_t nproc   = comm::size( machine );
  const size_t ndevice = Kokkos::Cuda::detect_device_count();

  // Assign a distinct device only when every process can get one.
  size_t device = 0 ;
  if ( ndevice && ndevice <= nproc ) { device = rank % ndevice ; }

  Kokkos::HostSpace::execution_space::initialize();
  Kokkos::Cuda::initialize( Kokkos::Cuda::SelectDevice( device ) );

  HybridFEM::Nonlinear::driver<double,Kokkos::Cuda,FixtureElementHex8>( "Cuda" , machine , 0 /* gang_count */ , elem_count_begin , elem_count_end , count_run );

  Kokkos::Cuda::finalize();
  Kokkos::HostSpace::execution_space::finalize();
}
// Sweep the nonlinear-FEM driver with quadratic hex-27 elements over
// [elem_count_begin, elem_count_end) on Cuda, 'count_run' times per size.
void test_cuda_nonlinear_quadratic( comm::Machine machine ,
                                    size_t elem_count_begin ,
                                    size_t elem_count_end ,
                                    size_t count_run )
{
  const size_t rank    = comm::rank( machine );
  const size_t nproc   = comm::size( machine );
  const size_t ndevice = Kokkos::Cuda::detect_device_count();

  // Assign a distinct device only when every process can get one.
  size_t device = 0 ;
  if ( ndevice && ndevice <= nproc ) { device = rank % ndevice ; }

  Kokkos::HostSpace::execution_space::initialize();
  Kokkos::Cuda::initialize( Kokkos::Cuda::SelectDevice( device ) );

  HybridFEM::Nonlinear::driver<double,Kokkos::Cuda,FixtureElementHex27>( "Cuda" , machine , 0 /* gang_count */ , elem_count_begin , elem_count_end , count_run );

  Kokkos::Cuda::finalize();
  Kokkos::HostSpace::execution_space::finalize();
}
//----------------------------------------------------------------------------
#endif /* #if defined( KOKKOS_HAVE_CUDA ) */

View File

@ -1,137 +0,0 @@
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
// Must be included first on Intel-Phi systems due to
// redefinition of SEEK_SET in <mpi.h>.
#include <ParallelComm.hpp>
#include <iostream>
#include <stdexcept>
#include <limits>
#include <utility>
//----------------------------------------------------------------------------
#include <Kokkos_Core.hpp>
#include <BoxMeshFixture.hpp>
#include <TestBoxMeshFixture.hpp>
#include <Implicit.hpp>
#include <Nonlinear.hpp>
#include <Explicit.hpp>
#include <SparseLinearSystem.hpp>
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
// Run the box-mesh fixture test on the host execution space with
// gang_count * gang_worker_count threads.
void test_host_fixture( comm::Machine machine ,
                        size_t gang_count ,
                        size_t gang_worker_count ,
                        size_t nx , size_t ny , size_t nz )
{
  typedef Kokkos::HostSpace::execution_space host ;

  host::initialize( gang_count * gang_worker_count );
  test_box_fixture<host>( machine , gang_count , nx , ny , nz );
  host::finalize();
}
//----------------------------------------------------------------------------
// Sweep the implicit-FEM driver over [elem_count_begin, elem_count_end)
// on the host execution space, 'count_run' times per size.
void test_host_implicit( comm::Machine machine ,
                         size_t gang_count ,
                         size_t gang_worker_count ,
                         size_t elem_count_begin ,
                         size_t elem_count_end ,
                         size_t count_run )
{
  typedef Kokkos::HostSpace::execution_space host ;

  host::initialize( gang_count * gang_worker_count );
  HybridFEM::Implicit::driver<double,host>( "Threads" , machine , gang_count , elem_count_begin , elem_count_end , count_run );
  host::finalize();
}
//----------------------------------------------------------------------------
// Sweep the explicit-FEM driver over [elem_count_begin, elem_count_end)
// on the host execution space, 'count_run' times per size.
void test_host_explicit( comm::Machine machine ,
                         size_t gang_count ,
                         size_t gang_worker_count ,
                         size_t elem_count_begin ,
                         size_t elem_count_end ,
                         size_t count_run )
{
  typedef Kokkos::HostSpace::execution_space host ;

  host::initialize( gang_count * gang_worker_count );
  Explicit::driver<double,host>( "Threads" , machine , gang_count , elem_count_begin , elem_count_end , count_run );
  host::finalize();
}
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
// Sweep the nonlinear-FEM driver with linear hex-8 elements on the host
// execution space, 'count_run' times per size.
void test_host_nonlinear( comm::Machine machine ,
                          size_t gang_count ,
                          size_t gang_worker_count ,
                          size_t elem_count_begin ,
                          size_t elem_count_end ,
                          size_t count_run )
{
  typedef Kokkos::HostSpace::execution_space host ;

  host::initialize( gang_count * gang_worker_count );
  HybridFEM::Nonlinear::driver<double,host,FixtureElementHex8>( "Threads" , machine , gang_count , elem_count_begin , elem_count_end , count_run );
  host::finalize();
}
// Sweep the nonlinear-FEM driver with quadratic hex-27 elements on the
// host execution space, 'count_run' times per size.
void test_host_nonlinear_quadratic( comm::Machine machine ,
                                    size_t gang_count ,
                                    size_t gang_worker_count ,
                                    size_t elem_count_begin ,
                                    size_t elem_count_end ,
                                    size_t count_run )
{
  typedef Kokkos::HostSpace::execution_space host ;

  host::initialize( gang_count * gang_worker_count );
  HybridFEM::Nonlinear::driver<double,host,FixtureElementHex27>( "Threads" , machine , gang_count , elem_count_begin , elem_count_end , count_run );
  host::finalize();
}
//----------------------------------------------------------------------------

View File

@ -1,348 +0,0 @@
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
// Must be included first on Intel-Phi systems due to
// redefinition of SEEK_SET in <mpi.h>.
#include <ParallelComm.hpp>
#include <string>
#include <sstream>
#include <iostream>
#include <Kokkos_hwloc.hpp>
//----------------------------------------------------------------------------
// Serial box-partition consistency check.
void test_box_partition( bool print );
//----------------------------------------------------------------------------
// Host execution-space test drivers.
void test_host_fixture( comm::Machine machine ,
                        size_t gang_count ,
                        size_t gang_worker_count ,
                        size_t nx , size_t ny , size_t nz );

void test_host_implicit( comm::Machine machine ,
                         size_t gang_count ,
                         size_t gang_worker_count ,
                         size_t elem_count_begin ,
                         size_t elem_count_end ,
                         size_t count_run );

void test_host_explicit( comm::Machine machine ,
                         size_t gang_count ,
                         size_t gang_worker_count ,
                         size_t elem_count_begin ,
                         size_t elem_count_end ,
                         size_t count_run );

void test_host_nonlinear( comm::Machine machine ,
                          size_t gang_count ,
                          size_t gang_worker_count ,
                          size_t elem_count_begin ,
                          size_t elem_count_end ,
                          size_t count_run );

void test_host_nonlinear_quadratic( comm::Machine machine ,
                                    size_t gang_count ,
                                    size_t gang_worker_count ,
                                    size_t elem_count_begin ,
                                    size_t elem_count_end ,
                                    size_t count_run );
//----------------------------------------------------------------------------
// Cuda test drivers.
void test_cuda_query( comm::Machine );

void test_cuda_fixture( comm::Machine machine ,
                        size_t nx , size_t ny , size_t nz );

void test_cuda_implicit( comm::Machine machine ,
                         size_t elem_count_begin ,
                         size_t elem_count_end ,
                         size_t count_run );

void test_cuda_explicit( comm::Machine machine ,
                         size_t elem_count_begin ,
                         size_t elem_count_end ,
                         size_t count_run );

void test_cuda_nonlinear( comm::Machine machine ,
                          size_t elem_count_begin ,
                          size_t elem_count_end ,
                          size_t count_run );

void test_cuda_nonlinear_quadratic( comm::Machine machine ,
                                    size_t elem_count_begin ,
                                    size_t elem_count_end ,
                                    size_t count_run );
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace {
bool run_host( std::istream & input ,
comm::Machine machine ,
const size_t host_gang_count ,
const size_t host_gang_worker_count )
{
bool cmd_error = false ;
std::string which ; input >> which ;
if ( which == std::string("fixture") ) {
size_t nx = 0 , ny = 0 , nz = 0 ;
input >> nx >> ny >> nz ;
test_host_fixture( machine , host_gang_count , host_gang_worker_count , nx , ny , nz );
}
else if ( which == std::string("explicit") ) {
size_t mesh_node_begin = 100 ;
size_t mesh_node_end = 300 ;
size_t run = 1 ;
input >> mesh_node_begin >> mesh_node_end >> run ;
test_host_explicit( machine , host_gang_count , host_gang_worker_count , mesh_node_begin , mesh_node_end , run );
}
else if ( which == std::string("implicit") ) {
size_t mesh_node_begin = 100 ;
size_t mesh_node_end = 300 ;
size_t run = 1 ;
input >> mesh_node_begin >> mesh_node_end >> run ;
test_host_implicit( machine , host_gang_count , host_gang_worker_count , mesh_node_begin , mesh_node_end , run );
}
else if ( which == std::string("nonlinear") ) {
size_t mesh_node_begin = 100 ;
size_t mesh_node_end = 300 ;
size_t run = 1 ;
input >> mesh_node_begin >> mesh_node_end >> run ;
test_host_nonlinear( machine , host_gang_count , host_gang_worker_count , mesh_node_begin , mesh_node_end , run );
}
else if ( which == std::string("nonlinear_quadratic") ) {
size_t mesh_node_begin = 100 ;
size_t mesh_node_end = 300 ;
size_t run = 1 ;
input >> mesh_node_begin >> mesh_node_end >> run ;
test_host_nonlinear_quadratic( machine , host_gang_count , host_gang_worker_count , mesh_node_begin , mesh_node_end , run );
}
else {
cmd_error = true ;
}
return cmd_error ;
}
#if defined( KOKKOS_HAVE_CUDA )
bool run_cuda( std::istream & input , comm::Machine machine )
{
bool cmd_error = false ;
std::string which ; input >> which ;
if ( which == std::string("fixture") ) {
size_t nx = 0 , ny = 0 , nz = 0 ;
input >> nx >> ny >> nz ;
test_cuda_fixture( machine , nx , ny , nz );
}
else if ( which == std::string("explicit") ) {
size_t mesh_node_begin = 100 ;
size_t mesh_node_end = 300 ;
size_t run = 1 ;
input >> mesh_node_begin >> mesh_node_end >> run ;
test_cuda_explicit( machine , mesh_node_begin , mesh_node_end , run );
}
else if ( which == std::string("implicit") ) {
size_t mesh_node_begin = 100 ;
size_t mesh_node_end = 300 ;
size_t run = 1 ;
input >> mesh_node_begin >> mesh_node_end >> run ;
test_cuda_implicit( machine , mesh_node_begin , mesh_node_end , run );
}
else if ( which == std::string("nonlinear") ) {
size_t mesh_node_begin = 100 ;
size_t mesh_node_end = 300 ;
size_t run = 1 ;
input >> mesh_node_begin >> mesh_node_end >> run ;
test_cuda_nonlinear( machine , mesh_node_begin , mesh_node_end , run );
}
else if ( which == std::string("nonlinear_quadratic") ) {
size_t mesh_node_begin = 100 ;
size_t mesh_node_end = 300 ;
size_t run = 1 ;
input >> mesh_node_begin >> mesh_node_end >> run ;
test_cuda_nonlinear_quadratic( machine , mesh_node_begin , mesh_node_end , run );
}
else {
cmd_error = true ;
}
return cmd_error ;
}
#endif
// Parse one command line ('argline') and dispatch to the selected test:
//   query | partition | host ... | host-all ... | host-most ... | cuda ...
// Prints a usage summary on rank zero when the command is unrecognized.
void run( const std::string & argline , comm::Machine machine )
{
  // Hardware topology, used to size the host-all / host-most thread pools.
  const unsigned numa_count = Kokkos::hwloc::get_available_numa_count();
  const unsigned cores_per_numa = Kokkos::hwloc::get_available_cores_per_numa();
  const unsigned threads_per_core = Kokkos::hwloc::get_available_threads_per_core();

  std::istringstream input( argline );

  bool cmd_error = false ;

  std::string which ; input >> which ;

  if ( which == std::string("query") ) {
    std::cout << "P" << comm::rank( machine )
              << ": hwloc { NUMA[" << numa_count << "]"
              << " CORE[" << cores_per_numa << "]"
              << " PU[" << threads_per_core << "] }"
              << std::endl ;
#if defined( KOKKOS_HAVE_CUDA )
    test_cuda_query( machine );
#endif
  }
  else if ( which == std::string("partition") ) {
    // Serial consistency check; only rank zero runs it.
    if ( 0 == comm::rank( machine ) ) {
      test_box_partition( false /* print flag */ );
    }
  }
  else {
    if ( which == std::string("host") ) {
      // Explicit thread counts from the command line.
      size_t host_gang_count = 0 ;
      size_t host_gang_worker_count = 1 ;
      input >> host_gang_count ;
      input >> host_gang_worker_count ;
      cmd_error = run_host( input , machine , host_gang_count , host_gang_worker_count );
    }
    else if ( which == std::string("host-all") ) {
      // Use every available hardware thread.
      size_t host_gang_count = numa_count ;
      size_t host_gang_worker_count = cores_per_numa * threads_per_core ;
      cmd_error = run_host( input , machine , host_gang_count , host_gang_worker_count );
    }
    else if ( which == std::string("host-most") ) {
      // Leave one core per NUMA node unused.
      size_t host_gang_count = numa_count ;
      size_t host_gang_worker_count = ( cores_per_numa - 1 ) * threads_per_core ;
      cmd_error = run_host( input , machine , host_gang_count , host_gang_worker_count );
    }
#if defined( KOKKOS_HAVE_CUDA )
    else if ( which == std::string("cuda") ) {
      cmd_error = run_cuda( input , machine );
    }
#endif
    else {
      cmd_error = true ;
    }
  }

  if ( cmd_error && 0 == comm::rank( machine ) ) {
    std::cout << "Expecting command line with" << std::endl
              << " query" << std::endl
              << " partition" << std::endl
              << " host NumNumaNode NumThreadPerNode <test>" << std::endl
              << " host-all <test>" << std::endl
              << " host-most <test>" << std::endl
              << " cuda <test>" << std::endl
              << "where <test> is" << std::endl
              << " fixture NumElemX NumElemY NumElemZ" << std::endl
              << " implicit NumElemBegin NumElemEnd NumRun" << std::endl
              << " explicit NumElemBegin NumElemEnd NumRun" << std::endl
              << " nonlinear NumElemBegin NumElemEnd NumRun" << std::endl
              << " nonlinear_quadratic NumElemBegin NumElemEnd NumRun" << std::endl ;
  }
}
} // namespace
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
// Initialize the communication machine, broadcast the command line,
// run the selected test, and report (but do not propagate) exceptions.
int main( int argc , char ** argv )
{
  comm::Machine machine = comm::Machine::init( & argc , & argv );

  const unsigned rank = comm::rank( machine );

  const std::string argline = comm::command_line( machine , argc , argv );

  try {
    run( argline , machine );
  }
  catch ( const std::exception & x ) {
    std::cerr << "P" << rank << " throw: " << x.what() << std::endl ;
  }
  catch ( ... ) {
    std::cerr << "P" << rank << " throw: unknown exception" << std::endl ;
  }

  comm::Machine::finalize();

  return 0 ;
}