Updating kokkos lib

git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@14918 f3b2605a-c512-4ea7-a41b-209d697bcdaa
This commit is contained in:
stamoor
2016-05-02 22:06:50 +00:00
parent c5d0c55bee
commit 0a1b765248
411 changed files with 0 additions and 133424 deletions

View File

@ -1,610 +0,0 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_BOXMESHFIXTURE_HPP
#define KOKKOS_BOXMESHFIXTURE_HPP
#include <cmath>
#include <stdexcept>
#include <sstream>
#include <Kokkos_Core.hpp>
#include <BoxMeshPartition.hpp>
#include <FEMesh.hpp>
#include <HexElement.hpp>
//----------------------------------------------------------------------------
/** \brief  Element fixture for the eight-node (linear) hexahedron.
 *
 *  For the linear element the node grid coincides with the vertex grid,
 *  so node boxes are straight copies of the vertex boxes.
 */
struct FixtureElementHex8 {
  static const unsigned element_node_count = 8 ;

  HybridFEM::HexElement_TensorData< element_node_count > elem_data ;
  BoxBoundsLinear box_bounds ;

  FixtureElementHex8() : elem_data(), box_bounds() {}

  /** \brief  Node grid equals vertex grid: copy global box and partitions. */
  static void create_node_boxes_from_vertex_boxes(
    const BoxType                & vertex_box_global ,
    const std::vector< BoxType > & vertex_box_parts ,
    BoxType                      & node_box_global ,
    std::vector< BoxType >       & node_box_parts )
  {
    node_box_global = vertex_box_global ;
    node_box_parts  = vertex_box_parts ;
  }

  /** \brief  Offset an element's grid coordinate to one of its nodes
   *          via the tensor-product evaluation map.
   */
  void elem_to_node( const unsigned node_local , unsigned coord[] ) const
  {
    for ( unsigned axis = 0 ; axis < 3 ; ++axis ) {
      coord[axis] += elem_data.eval_map[ node_local ][axis] ;
    }
  }
};
/** \brief  Element fixture for the 27-node (quadratic) hexahedron.
 *
 *  The node grid is twice as fine as the vertex grid, hence the
 *  doubling of box bounds below.
 */
struct FixtureElementHex27 {
  static const unsigned element_node_count = 27 ;

  HybridFEM::HexElement_TensorData< element_node_count > elem_data ;
  BoxBoundsQuadratic box_bounds ;

  FixtureElementHex27() : elem_data(), box_bounds() {}

  /** \brief  Refine the vertex boxes onto the twice-as-fine node grid. */
  static void create_node_boxes_from_vertex_boxes(
    const BoxType                & vertex_box_global ,
    const std::vector< BoxType > & vertex_box_parts ,
    BoxType                      & node_box_global ,
    std::vector< BoxType >       & node_box_parts )
  {
    node_box_global = vertex_box_global ;
    node_box_parts  = vertex_box_parts ;

    // Global node grid: double the vertex extent minus the shared endpoint.
    for ( unsigned axis = 0 ; axis < 3 ; ++axis ) {
      node_box_global[axis][1] = 2 * node_box_global[axis][1] - 1 ;
    }

    // Each partition's bounds are doubled, with the upper bound clipped
    // to the global node box.
    for ( unsigned i = 0 ; i < vertex_box_parts.size() ; ++i ) {
      for ( unsigned axis = 0 ; axis < 3 ; ++axis ) {
        node_box_parts[i][axis][0] = 2 * node_box_parts[i][axis][0] ;
        node_box_parts[i][axis][1] =
          std::min( node_box_global[axis][1] , 2 * node_box_parts[i][axis][1] );
      }
    }
  }

  /** \brief  Map an element's grid coordinate to one of its nodes on
   *          the refined node grid.
   */
  void elem_to_node( const unsigned node_local , unsigned coord[] ) const
  {
    for ( unsigned axis = 0 ; axis < 3 ; ++axis ) {
      coord[axis] = 2 * coord[axis] + elem_data.eval_map[ node_local ][axis] ;
    }
  }
};
//----------------------------------------------------------------------------
// BoxMeshFixture: builds a structured hexahedral "box" finite element mesh,
// partitioned across 'proc_count' processes via recursive coordinate
// bisection, and returns a HybridFEM::FEMesh with node coordinates,
// element->node and node->element connectivity plus parallel
// communication maps.  ElementSpec selects the linear (Hex8) or
// quadratic (Hex27) node layout.
template< typename Scalar , class Device , class ElementSpec >
struct BoxMeshFixture {
typedef Scalar coordinate_scalar_type ;
typedef Device execution_space ;
static const unsigned element_node_count = ElementSpec::element_node_count ;
typedef HybridFEM::FEMesh< coordinate_scalar_type ,
element_node_count ,
execution_space > FEMeshType ;
typedef typename FEMeshType::node_coords_type node_coords_type ;
typedef typename FEMeshType::elem_node_ids_type elem_node_ids_type ;
typedef typename FEMeshType::node_elem_ids_type node_elem_ids_type ;
// Consistency check of the constructed connectivity.  Assumes node
// coordinates are still the raw grid indices (set in create() before
// the nonlinear scaling).  Throws std::runtime_error on any mismatch.
static void verify(
const typename FEMeshType::node_coords_type::HostMirror & node_coords ,
const typename FEMeshType::elem_node_ids_type::HostMirror & elem_node_ids ,
const typename FEMeshType::node_elem_ids_type::HostMirror & node_elem_ids )
{
typedef typename FEMeshType::size_type size_type ;
//typedef typename node_coords_type::value_type coords_type ; // unused
const size_type node_count_total = node_coords.dimension_0();
const size_type elem_count_total = elem_node_ids.dimension_0();
const ElementSpec element ;
// Check node->element CRS map: every (element, local-node) entry listed
// for a node must map back to that node through elem_node_ids.
for ( size_type node_index = 0 ;
node_index < node_count_total ; ++node_index ) {
for ( size_type
j = node_elem_ids.row_map[ node_index ] ;
j < node_elem_ids.row_map[ node_index + 1 ] ; ++j ) {
const size_type elem_index = node_elem_ids.entries(j,0);
const size_type node_local = node_elem_ids.entries(j,1);
const size_type en_id = elem_node_ids(elem_index,node_local);
if ( node_index != en_id ) {
std::ostringstream msg ;
msg << "BoxMeshFixture node_elem_ids error"
<< " : node_index(" << node_index
<< ") entry(" << j
<< ") elem_index(" << elem_index
<< ") node_local(" << node_local
<< ") elem_node_id(" << en_id
<< ")" ;
throw std::runtime_error( msg.str() );
}
}
}
// Check element->node coordinates: each node of an element must sit at
// the element-origin node (local node 0) plus the eval_map offset.
for ( size_type elem_index = 0 ;
elem_index < elem_count_total; ++elem_index ) {
coordinate_scalar_type elem_node_coord[ element_node_count ][3] ;
for ( size_type nn = 0 ; nn < element_node_count ; ++nn ) {
const size_type node_index = elem_node_ids( elem_index , nn );
for ( size_type nc = 0 ; nc < 3 ; ++nc ) {
elem_node_coord[nn][nc] = node_coords( node_index , nc );
}
}
for ( size_type nn = 0 ; nn < element_node_count ; ++nn ) {
const unsigned ix = element.elem_data.eval_map[nn][0] ;
const unsigned iy = element.elem_data.eval_map[nn][1] ;
const unsigned iz = element.elem_data.eval_map[nn][2] ;
if ( elem_node_coord[nn][0] != elem_node_coord[0][0] + ix ||
elem_node_coord[nn][1] != elem_node_coord[0][1] + iy ||
elem_node_coord[nn][2] != elem_node_coord[0][2] + iz ) {
std::ostringstream msg ;
msg << "BoxMeshFixture elem_node_coord mapping failure { "
<< elem_node_coord[nn][0] << " "
<< elem_node_coord[nn][1] << " "
<< elem_node_coord[nn][2] << " } != { "
<< elem_node_coord[ 0][0] + ix << " "
<< elem_node_coord[ 0][1] + iy << " "
<< elem_node_coord[ 0][2] + iz
<< " }" ;
throw std::runtime_error( msg.str() );
}
}
}
}
//------------------------------------
// Initialize element-node connectivity:
// Order elements that only depend on owned nodes first.
// These elements could be computed while waiting for
// received node data.
// Interior elements receive indices [0, elem_count_interior);
// boundary elements (touching non-owned vertices) follow.
static void layout_elements_interior_exterior(
const BoxType vertex_box_local_used ,
const BoxType vertex_box_local_owned ,
const BoxType node_box_local_used ,
const std::vector<size_t> & node_used_id_map ,
const ElementSpec element_fixture ,
const size_t elem_count_interior ,
const typename elem_node_ids_type::HostMirror elem_node_ids )
{
size_t elem_index_interior = 0 ;
size_t elem_index_boundary = elem_count_interior ;
// One element per vertex cell of the locally-used vertex box:
for ( size_t iz = vertex_box_local_used[2][0] ;
iz < vertex_box_local_used[2][1] - 1 ; ++iz ) {
for ( size_t iy = vertex_box_local_used[1][0] ;
iy < vertex_box_local_used[1][1] - 1 ; ++iy ) {
for ( size_t ix = vertex_box_local_used[0][0] ;
ix < vertex_box_local_used[0][1] - 1 ; ++ix ) {
size_t elem_index ;
// If lower and upper vertices are owned then element is interior
if ( contain( vertex_box_local_owned, ix, iy, iz ) &&
contain( vertex_box_local_owned, ix+1, iy+1, iz+1 ) ) {
elem_index = elem_index_interior++ ;
}
else {
elem_index = elem_index_boundary++ ;
}
// Map each local node of the element to its local node id:
for ( size_t nn = 0 ; nn < element_node_count ; ++nn ) {
unsigned coord[3] = { static_cast<unsigned>(ix) , static_cast<unsigned>(iy) , static_cast<unsigned>(iz) };
element_fixture.elem_to_node( nn , coord );
const size_t node_local_id =
box_map_id( node_box_local_used ,
node_used_id_map ,
coord[0] , coord[1] , coord[2] );
elem_node_ids( elem_index , nn ) = node_local_id ;
}
}}}
}
//------------------------------------
// Nested partitioning of elements by number of thread 'gangs'
// Elements are numbered contiguously gang-by-gang so that each gang
// owns a contiguous index range.
static void layout_elements_partitioned(
const BoxType vertex_box_local_used ,
const BoxType /*vertex_box_local_owned*/ ,
const BoxType node_box_local_used ,
const std::vector<size_t> & node_used_id_map ,
const ElementSpec element_fixture ,
const size_t thread_gang_count ,
const typename elem_node_ids_type::HostMirror elem_node_ids )
{
std::vector< BoxType > element_box_gangs( thread_gang_count );
// The element box has one fewer cell than vertices along each axis:
BoxType element_box_local_used = vertex_box_local_used ;
element_box_local_used[0][1] -= 1 ;
element_box_local_used[1][1] -= 1 ;
element_box_local_used[2][1] -= 1 ;
box_partition_rcb( element_box_local_used , element_box_gangs );
size_t elem_index = 0 ;
for ( size_t ig = 0 ; ig < thread_gang_count ; ++ig ) {
const BoxType box = element_box_gangs[ig] ;
for ( size_t iz = box[2][0] ; iz < box[2][1] ; ++iz ) {
for ( size_t iy = box[1][0] ; iy < box[1][1] ; ++iy ) {
for ( size_t ix = box[0][0] ; ix < box[0][1] ; ++ix , ++elem_index ) {
for ( size_t nn = 0 ; nn < element_node_count ; ++nn ) {
unsigned coord[3] = { static_cast<unsigned>(ix) , static_cast<unsigned>(iy) , static_cast<unsigned>(iz) };
element_fixture.elem_to_node( nn , coord );
const size_t node_local_id =
box_map_id( node_box_local_used ,
node_used_id_map ,
coord[0] , coord[1] , coord[2] );
elem_node_ids( elem_index , nn ) = node_local_id ;
}
}}}
}
}
//------------------------------------
// Build and return the distributed mesh for this process.
//   proc_count / proc_local : number of partitions and this process' part
//   gang_count              : >1 selects gang-partitioned element layout,
//                             otherwise interior/exterior ordering
//   elems_x/y/z             : global element counts per axis
//   *_coord_curve           : exponent of the nonlinear coordinate map
//                             (1 keeps a uniform grid)
// Throws std::runtime_error (via verify) on inconsistent connectivity.
static FEMeshType create( const size_t proc_count ,
const size_t proc_local ,
const size_t gang_count ,
const size_t elems_x ,
const size_t elems_y ,
const size_t elems_z ,
const double x_coord_curve = 1 ,
const double y_coord_curve = 1 ,
const double z_coord_curve = 1 )
{
const size_t vertices_x = elems_x + 1 ;
const size_t vertices_y = elems_y + 1 ;
const size_t vertices_z = elems_z + 1 ;
const BoxBoundsLinear vertex_box_bounds ;
const ElementSpec element ;
// Partition based upon vertices:
BoxType vertex_box_global ;
std::vector< BoxType > vertex_box_parts( proc_count );
vertex_box_global[0][0] = 0 ; vertex_box_global[0][1] = vertices_x ;
vertex_box_global[1][0] = 0 ; vertex_box_global[1][1] = vertices_y ;
vertex_box_global[2][0] = 0 ; vertex_box_global[2][1] = vertices_z ;
box_partition_rcb( vertex_box_global , vertex_box_parts );
const BoxType vertex_box_local_owned = vertex_box_parts[ proc_local ];
// Determine interior and used vertices:
BoxType vertex_box_local_interior ;
BoxType vertex_box_local_used ;
vertex_box_bounds.apply( vertex_box_global ,
vertex_box_local_owned ,
vertex_box_local_interior ,
vertex_box_local_used );
// Element counts:
// Counts may be negative for degenerate partitions, hence the
// signed intermediate and the max() clamp to zero below.
const long local_elems_x =
( vertex_box_local_used[0][1] - vertex_box_local_used[0][0] ) - 1 ;
const long local_elems_y =
( vertex_box_local_used[1][1] - vertex_box_local_used[1][0] ) - 1 ;
const long local_elems_z =
( vertex_box_local_used[2][1] - vertex_box_local_used[2][0] ) - 1 ;
const size_t elem_count_total = std::max( long(0) , local_elems_x ) *
std::max( long(0) , local_elems_y ) *
std::max( long(0) , local_elems_z );
const long interior_elems_x =
( vertex_box_local_owned[0][1] - vertex_box_local_owned[0][0] ) - 1 ;
const long interior_elems_y =
( vertex_box_local_owned[1][1] - vertex_box_local_owned[1][0] ) - 1 ;
const long interior_elems_z =
( vertex_box_local_owned[2][1] - vertex_box_local_owned[2][0] ) - 1 ;
const size_t elem_count_interior = std::max( long(0) , interior_elems_x ) *
std::max( long(0) , interior_elems_y ) *
std::max( long(0) , interior_elems_z );
// Expand vertex boxes to node boxes:
// (identity for Hex8, doubled grid for Hex27; static member invoked
// through the instance)
BoxType node_box_global ;
BoxType node_box_local_used ;
std::vector< BoxType > node_box_parts ;
element.create_node_boxes_from_vertex_boxes(
vertex_box_global , vertex_box_parts ,
node_box_global , node_box_parts );
// Node communication maps:
size_t node_count_interior = 0 ;
size_t node_count_owned = 0 ;
size_t node_count_total = 0 ;
std::vector<size_t> node_used_id_map ;
std::vector<size_t> node_part_counts ;
std::vector< std::vector<size_t> > node_send_map ;
box_partition_maps( node_box_global ,
node_box_parts ,
element.box_bounds ,
proc_local ,
node_box_local_used ,
node_used_id_map ,
node_count_interior ,
node_count_owned ,
node_count_total ,
node_part_counts ,
node_send_map );
size_t node_count_send = 0 ;
for ( size_t i = 0 ; i < node_send_map.size() ; ++i ) {
node_count_send += node_send_map[i].size();
}
// Count messages; index i is the rank offset, not the rank itself:
size_t recv_msg_count = 0 ;
size_t send_msg_count = 0 ;
size_t send_count = 0 ;
for ( size_t i = 1 ; i < proc_count ; ++i ) {
if ( node_part_counts[i] ) ++recv_msg_count ;
if ( node_send_map[i].size() ) {
++send_msg_count ;
send_count += node_send_map[i].size();
}
}
// Finite element mesh:
FEMeshType mesh ;
if ( node_count_total ) {
mesh.node_coords = node_coords_type( "node_coords", node_count_total );
}
if ( elem_count_total ) {
mesh.elem_node_ids =
elem_node_ids_type( "elem_node_ids", elem_count_total );
}
mesh.parallel_data_map.assign( node_count_interior ,
node_count_owned ,
node_count_total ,
recv_msg_count ,
send_msg_count ,
send_count );
// Host mirrors are populated here, verified, then deep-copied to the
// device views at the end.
typename node_coords_type::HostMirror node_coords =
Kokkos::create_mirror( mesh.node_coords );
typename elem_node_ids_type::HostMirror elem_node_ids =
Kokkos::create_mirror( mesh.elem_node_ids );
//------------------------------------
// set node coordinates to grid location for subsequent verification
for ( size_t iz = node_box_local_used[2][0] ;
iz < node_box_local_used[2][1] ; ++iz ) {
for ( size_t iy = node_box_local_used[1][0] ;
iy < node_box_local_used[1][1] ; ++iy ) {
for ( size_t ix = node_box_local_used[0][0] ;
ix < node_box_local_used[0][1] ; ++ix ) {
const size_t node_local_id =
box_map_id( node_box_local_used , node_used_id_map , ix , iy , iz );
node_coords( node_local_id , 0 ) = ix ;
node_coords( node_local_id , 1 ) = iy ;
node_coords( node_local_id , 2 ) = iz ;
}}}
//------------------------------------
// Initialize element-node connectivity:
if ( 1 < gang_count ) {
layout_elements_partitioned( vertex_box_local_used ,
vertex_box_local_owned ,
node_box_local_used ,
node_used_id_map ,
element ,
gang_count ,
elem_node_ids );
}
else {
layout_elements_interior_exterior( vertex_box_local_used ,
vertex_box_local_owned ,
node_box_local_used ,
node_used_id_map ,
element ,
elem_count_interior ,
elem_node_ids );
}
//------------------------------------
// Populate node->element connectivity:
// First pass counts elements per node to size the CRS graph.
std::vector<size_t> node_elem_work( node_count_total , (size_t) 0 );
for ( size_t i = 0 ; i < elem_count_total ; ++i ) {
for ( size_t n = 0 ; n < element_node_count ; ++n ) {
++node_elem_work[ elem_node_ids(i,n) ];
}
}
mesh.node_elem_ids =
Kokkos::create_staticcrsgraph< node_elem_ids_type >( "node_elem_ids" , node_elem_work );
typename node_elem_ids_type::HostMirror
node_elem_ids = Kokkos::create_mirror( mesh.node_elem_ids );
// Reuse the work array as per-node insertion cursors into the CRS rows:
for ( size_t i = 0 ; i < node_count_total ; ++i ) {
node_elem_work[i] = node_elem_ids.row_map[i];
}
// Looping in element order insures the list of elements
// is sorted by element index.
for ( size_t i = 0 ; i < elem_count_total ; ++i ) {
for ( size_t n = 0 ; n < element_node_count ; ++n ) {
const unsigned nid = elem_node_ids(i, n);
const unsigned j = node_elem_work[nid] ; ++node_elem_work[nid] ;
node_elem_ids.entries( j , 0 ) = i ;
node_elem_ids.entries( j , 1 ) = n ;
}
}
//------------------------------------
// Verify setup with node coordinates matching grid indices.
verify( node_coords , elem_node_ids , node_elem_ids );
//------------------------------------
// Scale node coordinates to problem extent with
// nonlinear mapping.
{
const double problem_extent[3] =
{ static_cast<double>( vertex_box_global[0][1] - 1 ) ,
static_cast<double>( vertex_box_global[1][1] - 1 ) ,
static_cast<double>( vertex_box_global[2][1] - 1 ) };
const double grid_extent[3] =
{ static_cast<double>( node_box_global[0][1] - 1 ) ,
static_cast<double>( node_box_global[1][1] - 1 ) ,
static_cast<double>( node_box_global[2][1] - 1 ) };
// Map grid index -> [0,1] -> pow(.,curve) -> problem extent:
for ( size_t i = 0 ; i < node_count_total ; ++i ) {
const double x_unit = node_coords(i,0) / grid_extent[0] ;
const double y_unit = node_coords(i,1) / grid_extent[1] ;
const double z_unit = node_coords(i,2) / grid_extent[2] ;
node_coords(i,0) = coordinate_scalar_type( problem_extent[0] * std::pow( x_unit , x_coord_curve ) );
node_coords(i,1) = coordinate_scalar_type( problem_extent[1] * std::pow( y_unit , y_coord_curve ) );
node_coords(i,2) = coordinate_scalar_type( problem_extent[2] * std::pow( z_unit , z_coord_curve ) );
}
}
Kokkos::deep_copy( mesh.node_coords , node_coords );
Kokkos::deep_copy( mesh.elem_node_ids , elem_node_ids );
Kokkos::deep_copy( mesh.node_elem_ids.entries , node_elem_ids.entries );
//------------------------------------
// Communication lists:
{
recv_msg_count = 0 ;
send_msg_count = 0 ;
send_count = 0 ;
for ( size_t i = 1 ; i < proc_count ; ++i ) {
// Order sending starting with the local processor rank
// to try to smooth out the amount of messages simultaneously
// send to a particular processor.
const int proc = ( proc_local + i ) % proc_count ;
if ( node_part_counts[i] ) {
mesh.parallel_data_map.host_recv(recv_msg_count,0) = proc ;
mesh.parallel_data_map.host_recv(recv_msg_count,1) = node_part_counts[i] ;
++recv_msg_count ;
}
if ( node_send_map[i].size() ) {
mesh.parallel_data_map.host_send(send_msg_count,0) = proc ;
mesh.parallel_data_map.host_send(send_msg_count,1) = node_send_map[i].size() ;
// Send items are offsets relative to the owned-parallel section:
for ( size_t j = 0 ; j < node_send_map[i].size() ; ++j , ++send_count ) {
mesh.parallel_data_map.host_send_item(send_count) = node_send_map[i][j] - node_count_interior ;
}
++send_msg_count ;
}
}
}
return mesh ;
}
};
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
#endif /* #ifndef KOKKOS_BOXMESHFIXTURE_HPP */

View File

@ -1,381 +0,0 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#include <iostream>
#include <sstream>
#include <stdexcept>
#include <limits>
#include <BoxMeshPartition.hpp>
//----------------------------------------------------------------------------
namespace {
// Recursive coordinate bisection: split 'box' among processor ranks
// [ip,up) and store each rank's sub-box in p_box[rank].  Splits the
// axis with the largest extent; a rank count divisible by three is
// split three ways, otherwise two ways.
void box_partition( size_t ip , size_t up ,
const BoxType & box ,
BoxType * const p_box )
{
const size_t np = up - ip ;
if ( 1 == np ) {
// Single rank remaining: it owns the whole box.
p_box[ip] = box ;
}
else {
// Choose axis with largest count:
const size_t n0 = box[0][1] - box[0][0] ;
const size_t n1 = box[1][1] - box[1][0] ;
const size_t n2 = box[2][1] - box[2][0] ;
const size_t axis = n2 > n1 ? ( n2 > n0 ? 2 : ( n1 > n0 ? 1 : 0 ) ) :
( n1 > n0 ? 1 : 0 );
const size_t n = box[ axis ][1] - box[ axis ][0] ;
if ( 0 == np % 3 ) {
// Three-way split at roughly 1/3 and 2/3 of the chosen axis:
const size_t np_part = np / 3 ; // exact
const size_t nbox_low = (size_t)(( (double) n ) * ( 1.0 / 3.0 ));
const size_t nbox_mid = (size_t)(( (double) n ) * ( 2.0 / 3.0 ));
BoxType dbox_low = box ; // P = [ip,ip+np/3)
BoxType dbox_mid = box ; // P = [ip+np/3,ip+2*np/3)
BoxType dbox_upp = box ; // P = [ip+2*np/3,ip+np)
dbox_low[ axis ][1] = box[ axis ][0] + nbox_low ;
dbox_mid[ axis ][1] = box[ axis ][0] + nbox_mid ;
dbox_mid[ axis ][0] = dbox_low[ axis ][1];
dbox_upp[ axis ][0] = dbox_mid[ axis ][1];
box_partition( ip, ip + np_part, dbox_low , p_box );
box_partition( ip+ np_part, ip + 2*np_part, dbox_mid , p_box );
box_partition( ip+2*np_part, up, dbox_upp , p_box );
}
else {
// Two-way split, proportional to the rank counts on each side:
const size_t np_low = np / 2 ; /* Rounded down */
const size_t nbox_low = (size_t)
(((double)n) * ( ((double) np_low ) / ((double) np ) ));
BoxType dbox_low = box ;
BoxType dbox_upp = box ;
dbox_low[ axis ][1] = dbox_low[ axis ][0] + nbox_low ;
dbox_upp[ axis ][0] = dbox_low[ axis ][1];
box_partition( ip, ip + np_low, dbox_low , p_box );
box_partition( ip + np_low, up, dbox_upp , p_box );
}
}
}
// Map global grid coordinate (global_i, global_j, global_k) to a linear
// offset within 'local_use', with i varying fastest.  Throws
// std::runtime_error if the coordinate lies outside the box.
size_t box_map_offset( const BoxType & local_use ,
const size_t global_i ,
const size_t global_j ,
const size_t global_k )
{
// 'max' is the out-of-range sentinel; unsigned subtraction below is
// guarded so an out-of-box coordinate maps to the sentinel.
const size_t max = std::numeric_limits<size_t>::max();
const size_t n[3] =
{ local_use[0][1] - local_use[0][0] ,
local_use[1][1] - local_use[1][0] ,
local_use[2][1] - local_use[2][0] };
const size_t use[3] = {
( global_i >= local_use[0][0] ? global_i - local_use[0][0] : max ) ,
( global_j >= local_use[1][0] ? global_j - local_use[1][0] : max ) ,
( global_k >= local_use[2][0] ? global_k - local_use[2][0] : max ) };
const size_t offset =
( use[0] < n[0] && use[1] < n[1] && use[2] < n[2] ) ?
( use[0] + n[0] * ( use[1] + n[1] * use[2] ) ) : max ;
if ( offset == max ) {
std::ostringstream msg ;
msg << "box_map_offset ERROR: "
<< " use " << local_use
<< " ( " << global_i
<< " , " << global_j
<< " , " << global_k
<< " )" ;
throw std::runtime_error( msg.str() );
}
return offset ;
}
} // namespace
//----------------------------------------------------------------------------
/** \brief  Linear-element ghosting: compute the strictly-interior box
 *          and the "used" box (owned plus one ghost layer) for a
 *          partition, both clipped to the global box.
 */
void BoxBoundsLinear::apply( const BoxType & box_global ,
                             const BoxType & box_part ,
                             BoxType & box_interior ,
                             BoxType & box_use ) const
{
  const unsigned ghost = 1 ;

  if ( 0 == count( box_part ) ) {
    // Empty partition: both derived boxes are the same empty box.
    box_interior = box_part ;
    box_use      = box_part ;
    return ;
  }

  for ( size_t axis = 0 ; axis < 3 ; ++axis ) {
    const size_t global_begin = box_global[axis][0] ;
    const size_t global_end   = box_global[axis][1] ;
    const size_t part_begin   = box_part[axis][0] ;
    const size_t part_end     = box_part[axis][1] ;

    // Interior shrinks inward at partition faces that are not global
    // faces, without collapsing past the opposite side.
    if ( part_begin == global_begin ) {
      box_interior[axis][0] = part_begin ;
    }
    else if ( part_begin + ghost < part_end ) {
      box_interior[axis][0] = part_begin + ghost ;
    }
    else {
      box_interior[axis][0] = part_end ;
    }

    if ( part_end == global_end ) {
      box_interior[axis][1] = part_end ;
    }
    else if ( part_begin + ghost < part_end ) {
      box_interior[axis][1] = part_end - ghost ;
    }
    else {
      box_interior[axis][1] = part_begin ;
    }

    // Used box grows by the ghost layer, clipped to the global box.
    box_use[axis][0] = ( part_begin > ghost + global_begin )
                     ? part_begin - ghost : global_begin ;
    box_use[axis][1] = ( part_end + ghost < global_end )
                     ? part_end + ghost : global_end ;
  }
}
/** \brief  Quadratic-element ghosting: same construction as the linear
 *          case, but the ghost depth per axis depends on the parity of
 *          the partition's offset from the global origin.
 */
void BoxBoundsQuadratic::apply( const BoxType & box_global ,
                                const BoxType & box_part ,
                                BoxType & box_interior ,
                                BoxType & box_use ) const
{
  if ( 0 == count( box_part ) ) {
    // Empty partition: both derived boxes are the same empty box.
    box_interior = box_part ;
    box_use      = box_part ;
    return ;
  }

  for ( size_t axis = 0 ; axis < 3 ; ++axis ) {
    // Odd offset from the global origin => ghost depth 1, else 2.
    // NOTE(review): presumably this keeps ghosting aligned with the
    // quadratic node pattern — confirm against the Hex27 fixture.
    const bool odd = ( box_part[axis][0] - box_global[axis][0] ) & 01 ;
    const unsigned ghost = odd ? 1 : 2 ;

    const size_t global_begin = box_global[axis][0] ;
    const size_t global_end   = box_global[axis][1] ;
    const size_t part_begin   = box_part[axis][0] ;
    const size_t part_end     = box_part[axis][1] ;

    if ( part_begin == global_begin ) {
      box_interior[axis][0] = part_begin ;
    }
    else if ( part_begin + ghost < part_end ) {
      box_interior[axis][0] = part_begin + ghost ;
    }
    else {
      box_interior[axis][0] = part_end ;
    }

    if ( part_end == global_end ) {
      box_interior[axis][1] = part_end ;
    }
    else if ( part_begin + ghost < part_end ) {
      box_interior[axis][1] = part_end - ghost ;
    }
    else {
      box_interior[axis][1] = part_begin ;
    }

    box_use[axis][0] = ( part_begin > ghost + global_begin )
                     ? part_begin - ghost : global_begin ;
    box_use[axis][1] = ( part_end + ghost < global_end )
                     ? part_end + ghost : global_end ;
  }
}
//----------------------------------------------------------------------------
void box_partition_rcb( const BoxType & root_box ,
std::vector<BoxType> & part_boxes )
{
const BoxBoundsLinear use_boxes ;
const size_t part_count = part_boxes.size();
box_partition( 0 , part_count , root_box , & part_boxes[0] );
// Verify partitioning
size_t total_cell = 0 ;
for ( size_t i = 0 ; i < part_count ; ++i ) {
total_cell += count( part_boxes[i] );
BoxType box_interior , box_use ;
use_boxes.apply( root_box , part_boxes[i] , box_interior , box_use );
if ( count( box_use ) < count( part_boxes[i] ) ||
count( part_boxes[i] ) < count( box_interior ) ||
part_boxes[i] != intersect( part_boxes[i] , box_use ) ||
box_interior != intersect( part_boxes[i] , box_interior )) {
std::ostringstream msg ;
msg << "box_partition_rcb ERROR : "
<< "part_boxes[" << i << "] = "
<< part_boxes[i]
<< " use " << box_use
<< " interior " << box_interior
<< std::endl
<< " part ^ use " << intersect( part_boxes[i] , box_use )
<< " part ^ interior " << intersect( part_boxes[i] , box_interior );
throw std::runtime_error( msg.str() );
}
for ( size_t j = i + 1 ; j < part_count ; ++j ) {
const BoxType tmp = intersect( part_boxes[i] , part_boxes[j] );
if ( count( tmp ) ) {
throw std::runtime_error( std::string("box partition intersection") );
}
}
}
if ( total_cell != count( root_box ) ) {
throw std::runtime_error( std::string("box partition count") );
}
}
//----------------------------------------------------------------------------
/** \brief  Translate a global grid coordinate to its local node id:
 *          linear offset within 'local_use', then through the id map.
 */
size_t box_map_id( const BoxType & local_use ,
                   const std::vector<size_t> & local_use_id_map ,
                   const size_t global_i ,
                   const size_t global_j ,
                   const size_t global_k )
{
  return local_use_id_map[
    box_map_offset( local_use , global_i , global_j , global_k ) ];
}
//----------------------------------------------------------------------------
// Build the local node numbering and parallel communication maps for
// partition 'my_part'.  Outputs:
//   my_use_box          : owned box expanded by ghosting
//   my_use_id_map       : linear-offset-in-use-box -> local node id
//   my_count_interior/owned/uses : node counts for each category
//   my_part_counts[i]   : nodes received from rank (my_part+i)%np
//                         (indexed by rank OFFSET i, not by rank)
//   my_send_map[i]      : local ids of nodes sent to rank (my_part+i)%np
// Throws std::runtime_error if my_part is out of range.
void box_partition_maps( const BoxType & root_box ,
const std::vector<BoxType> & part_boxes ,
const BoxBounds & use_boxes ,
const size_t my_part ,
BoxType & my_use_box ,
std::vector<size_t> & my_use_id_map ,
size_t & my_count_interior ,
size_t & my_count_owned ,
size_t & my_count_uses ,
std::vector<size_t> & my_part_counts ,
std::vector<std::vector<size_t> > & my_send_map )
{
const size_t np = part_boxes.size();
if ( np <= my_part ) {
std::ostringstream msg ;
msg << "box_partition_maps ERROR : "
<< " np(" << np << ") <= my_part(" << my_part << ")" ;
throw std::runtime_error( msg.str() );
}
const BoxType my_owned_box = part_boxes[my_part];
BoxType my_interior_box ;
use_boxes.apply( root_box, my_owned_box, my_interior_box, my_use_box );
my_count_interior = count( my_interior_box );
my_count_owned = count( my_owned_box );
my_count_uses = count( my_use_box );
// Initialize the id map to an out-of-range sentinel:
my_use_id_map.assign( my_count_uses , std::numeric_limits<size_t>::max() );
// Order ids as { owned-interior , owned-parallel , received_{(p+i)%np} }
size_t offset_interior = 0 ;
size_t offset_parallel = my_count_interior ;
// Number the owned nodes: interior nodes first, then owned nodes that
// other ranks may use ("owned-parallel"):
for ( size_t iz = my_owned_box[2][0] ; iz < my_owned_box[2][1] ; ++iz ) {
for ( size_t iy = my_owned_box[1][0] ; iy < my_owned_box[1][1] ; ++iy ) {
for ( size_t ix = my_owned_box[0][0] ; ix < my_owned_box[0][1] ; ++ix ) {
const size_t offset = box_map_offset( my_use_box , ix , iy , iz );
if ( contain( my_interior_box , ix , iy , iz ) ) {
my_use_id_map[ offset ] = offset_interior++ ;
}
else {
my_use_id_map[ offset ] = offset_parallel++ ;
}
}}}
my_part_counts.assign( np , (size_t) 0 );
my_send_map.assign( np , std::vector<size_t>() );
my_part_counts[0] = my_count_owned ;
// For every other rank, in (my_part+i)%np order: number the nodes
// received from that rank, and record the local ids of nodes sent
// to it.
for ( size_t i = 1 ; i < np ; ++i ) {
const size_t ip = ( my_part + i ) % np ;
const BoxType p_owned_box = part_boxes[ip];
BoxType p_use_box , p_interior_box ;
use_boxes.apply( root_box, p_owned_box, p_interior_box, p_use_box );
// Received nodes: my used box overlapping rank ip's owned box.
// Sent nodes: my owned box overlapping rank ip's used box.
const BoxType recv_box = intersect( my_use_box , p_owned_box );
const BoxType send_box = intersect( my_owned_box , p_use_box );
if ( 0 != ( my_part_counts[i] = count( recv_box ) ) ) {
for ( size_t iz = recv_box[2][0] ; iz < recv_box[2][1] ; ++iz ) {
for ( size_t iy = recv_box[1][0] ; iy < recv_box[1][1] ; ++iy ) {
for ( size_t ix = recv_box[0][0] ; ix < recv_box[0][1] ; ++ix ) {
const size_t offset = box_map_offset( my_use_box , ix , iy , iz );
my_use_id_map[ offset ] = offset_parallel++ ;
}}}
}
if ( 0 != count( send_box ) ) {
for ( size_t iz = send_box[2][0] ; iz < send_box[2][1] ; ++iz ) {
for ( size_t iy = send_box[1][0] ; iy < send_box[1][1] ; ++iy ) {
for ( size_t ix = send_box[0][0] ; ix < send_box[0][1] ; ++ix ) {
const size_t offset = box_map_offset( my_use_box , ix , iy , iz );
my_send_map[ i ].push_back( my_use_id_map[ offset ] );
}}}
}
}
}

View File

@ -1,210 +0,0 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef BOXMESHPARTITION_HPP
#define BOXMESHPARTITION_HPP
#include <cstddef>
#include <utility>
#include <vector>
#include <iostream>
//----------------------------------------------------------------------------
/** \brief Axis-aligned box of global grid indices.
 *
 *  data[dim] = { begin , end } is a half-open index range for
 *  dim = 0 (X), 1 (Y), 2 (Z).
 */
struct BoxType {
  size_t data[3][2] ;

  typedef size_t range_type[2] ;

  inline
  const range_type & operator[]( size_t i ) const { return data[i]; }

  inline
  range_type & operator[]( size_t i ) { return data[i]; }

  /** \brief True when all six bounds match.
   *
   *  Bug fix: the original compared data[1][1] against rhs.data[2][1],
   *  so boxes that were actually equal (but whose Y and Z upper bounds
   *  differed from each other) compared unequal.
   */
  inline
  bool operator == ( const BoxType & rhs ) const
  {
    return data[0][0] == rhs.data[0][0] && data[0][1] == rhs.data[0][1] &&
           data[1][0] == rhs.data[1][0] && data[1][1] == rhs.data[1][1] &&
           data[2][0] == rhs.data[2][0] && data[2][1] == rhs.data[2][1] ;
  }

  /** \brief Logical negation of operator==, kept consistent by construction. */
  inline
  bool operator != ( const BoxType & rhs ) const
  {
    return ! ( *this == rhs );
  }
};
/** \brief Number of grid points contained in a box.
 *         A dimension with an empty or inverted range yields zero.
 */
inline
size_t count( const BoxType & b )
{
  size_t total = 1 ;
  for ( size_t dim = 0 ; dim < 3 ; ++dim ) {
    const size_t lo = b[dim][0] ;
    const size_t hi = b[dim][1] ;
    total *= ( lo < hi ) ? ( hi - lo ) : 0 ;
  }
  return total ;
}
inline
bool contain( const BoxType & b , size_t i , size_t j , size_t k )
{
return b[0][0] <= i && i < b[0][1] &&
b[1][0] <= j && j < b[1][1] &&
b[2][0] <= k && k < b[2][1] ;
}
/** \brief Intersection of two boxes.
 *         A dimension with no overlap produces an empty (possibly
 *         inverted) range, which count() treats as zero points.
 */
inline
BoxType intersect( const BoxType & x , const BoxType & y )
{
  BoxType result ;
  for ( size_t dim = 0 ; dim < 3 ; ++dim ) {
    result[dim][0] = std::max( x[dim][0] , y[dim][0] );
    result[dim][1] = std::min( x[dim][1] , y[dim][1] );
  }
  return result ;
}
inline
std::ostream & operator << ( std::ostream & s , const BoxType & box )
{
s << "{ "
<< box[0][0] << " " << box[0][1] << " , "
<< box[1][0] << " " << box[1][1] << " , "
<< box[2][0] << " " << box[2][1] << " }" ;
return s ;
}
//----------------------------------------------------------------------------
/** \brief Abstract policy for expanding a partitioned box with ghosting.
 *
 *  Given the global box and one part's box, an implementation computes
 *  box_interior (the part's cells that are not communicated) and
 *  box_use (the part's owned cells plus its ghosted cells).
 */
class BoxBounds {
public:
/** \brief Default bounds to one layer of ghosting */
virtual
void apply( const BoxType & box_global ,
const BoxType & box_part ,
BoxType & box_interior ,
BoxType & box_use ) const = 0 ;
virtual ~BoxBounds() {}
BoxBounds() {}
};
/** \brief Ghosting policy with a single ghost layer in each direction
 *  (implementation of apply() lives in the corresponding .cpp file).
 */
class BoxBoundsLinear : public BoxBounds
{
public:
/** \brief Default bounds to one layer of ghosting */
virtual
void apply( const BoxType & box_global ,
const BoxType & box_part ,
BoxType & box_interior ,
BoxType & box_use ) const ;
virtual ~BoxBoundsLinear() {}
BoxBoundsLinear() {}
};
/** \brief Ghosting policy for quadratic meshes
 *  (implementation of apply() lives in the corresponding .cpp file).
 */
class BoxBoundsQuadratic : public BoxBounds {
public:
/** \brief Quadratic mesh: even ordinates have two layers,
* odd ordinates have one layer.
*/
virtual
void apply( const BoxType & box_global ,
const BoxType & box_part ,
BoxType & box_interior ,
BoxType & box_use ) const ;
virtual ~BoxBoundsQuadratic() {}
BoxBoundsQuadratic() {}
};
//----------------------------------------------------------------------------
/* Partition box into part_boxes.size() sub-boxes */
void box_partition_rcb( const BoxType & root_box ,
std::vector<BoxType> & part_boxes );
//----------------------------------------------------------------------------
/* Determine local id layout and communication maps for partitioned boxes.
*
* Local ids are layed out as follows:
* { [ owned-interior ids not sent ] ,
* [ owned-boundary ids to be sent to other processes ] ,
* [ received ids from processor ( my_part + 1 ) % part_count ]
* [ received ids from processor ( my_part + 2 ) % part_count ]
* [ received ids from processor ( my_part + 3 ) % part_count ]
* ... };
*
* This layout allows
* (1) received data to be copied into a contiguous block of memory
* (2) send data to be extracted from a contiguous block of memory.
*/
void box_partition_maps(
const BoxType & root_box , // [in] Global box
const std::vector<BoxType> & part_boxes , // [in] Partitioned boxes
const BoxBounds & use_boxes , // [in] Ghost boundaries
const size_t my_part , // [in] My local part
BoxType & my_use_box , // [out] My used box with ghost
std::vector<size_t> & my_use_id_map , // [out] Local ordering map
size_t & my_count_interior , // [out] How many interior
size_t & my_count_owned , // [out] How many owned
size_t & my_count_uses , // [out] How may used
std::vector<size_t> & my_part_counts , // [out] Partitioning of my_use_id_map
std::vector<std::vector<size_t> > & my_send_map ); // [out] Send id map
/* Mapping of cartesian coordinate to local id */
size_t box_map_id( const BoxType & my_use_box ,
const std::vector<size_t> & my_use_id_map ,
const size_t global_i ,
const size_t global_j ,
const size_t global_k );
//----------------------------------------------------------------------------
#endif /* #ifndef BOXMESHPARTITION_HPP */

View File

@ -1,16 +0,0 @@
# Search generated headers (binary dir) before the source tree.
INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR})
INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
# NOTE(review): SOURCES is immediately overwritten by the glob below,
# so the empty SET is redundant but harmless.
SET(SOURCES "")
FILE(GLOB SOURCES *.cpp)
SET(LIBRARIES kokkoscore)
# Build the multi_fem example for both serial and MPI configurations.
TRIBITS_ADD_EXECUTABLE(
multi_fem
SOURCES ${SOURCES}
COMM serial mpi
)

View File

@ -1,452 +0,0 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef EXPLICIT_DRIVER_HPP
#define EXPLICIT_DRIVER_HPP
#include <sys/time.h>
#include <iostream>
#include <iomanip>
#include <cstdlib>
#include <cmath>
#include <impl/Kokkos_Timer.hpp>
#include <ExplicitFunctors.hpp>
//----------------------------------------------------------------------------
namespace Explicit {
/** \brief Wall-clock timings (seconds) gathered by one run of the
 *         explicit-dynamics mini-app, plus the number of time steps.
 */
struct PerformanceData {
  double mesh_time ;
  double init_time ;
  double internal_force_time ;
  double central_diff ;
  double comm_time ;
  size_t number_of_steps ;

  /** \brief All timings start at zero. */
  PerformanceData()
    : mesh_time(0)
    , init_time(0)
    , internal_force_time(0)
    , central_diff(0)
    , comm_time(0)
    , number_of_steps(0)
    {}

  /** \brief Keep the minimum of each timing over repeated runs;
   *         number_of_steps is left unchanged.
   */
  void best( const PerformanceData & rhs )
  {
    if ( rhs.mesh_time           < mesh_time )           { mesh_time           = rhs.mesh_time ; }
    if ( rhs.init_time           < init_time )           { init_time           = rhs.init_time ; }
    if ( rhs.internal_force_time < internal_force_time ) { internal_force_time = rhs.internal_force_time ; }
    if ( rhs.central_diff        < central_diff )        { central_diff        = rhs.central_diff ; }
    if ( rhs.comm_time           < comm_time )           { comm_time           = rhs.comm_time ; }
  }
};
/** \brief Execute the explicit-dynamics time-step loop on an existing mesh.
 *
 *  \param mesh          Distributed finite-element mesh fixture.
 *  \param global_max_x  Global X extent; also used as the X displacement
 *                       boundary condition value (x_bc).
 *  \param global_max_y  Global Y extent (used only for sampled output).
 *  \param global_max_z  Global Z extent (used only for sampled output).
 *  \param steps         Number of time steps to run.
 *  \param print_sample  0 = silent, 1 = sample along two X edges,
 *                       2 = sample the domain-center node.
 *  \return Per-phase timings reduced with comm::max over all processes.
 */
template< typename Scalar , class FixtureType >
PerformanceData run( const typename FixtureType::FEMeshType & mesh ,
const int global_max_x ,
const int global_max_y ,
const int global_max_z ,
const int steps ,
const int print_sample )
{
typedef Scalar scalar_type ;
typedef FixtureType fixture_type ;
typedef typename fixture_type::execution_space execution_space ;
//typedef typename fixture_type::FEMeshType mesh_type ; // unused
enum { ElementNodeCount = fixture_type::element_node_count };
// Double-buffered state: "current" and "next" alternate every step.
const int NumStates = 2;
const int total_num_steps = steps ;
const Scalar user_dt = 5.0e-6;
//const Scalar end_time = 0.0050;
// element block parameters
const Scalar lin_bulk_visc = 0.0;
const Scalar quad_bulk_visc = 0.0;
// const Scalar lin_bulk_visc = 0.06;
// const Scalar quad_bulk_visc = 1.2;
// const Scalar hg_stiffness = 0.0;
// const Scalar hg_viscosity = 0.0;
// const Scalar hg_stiffness = 0.03;
// const Scalar hg_viscosity = 0.001;
// material properties
const Scalar youngs_modulus=1.0e6;
const Scalar poissons_ratio=0.0;
const Scalar density = 8.0e-4;
const comm::Machine machine = mesh.parallel_data_map.machine ;
PerformanceData perf_data ;
Kokkos::Impl::Timer wall_clock ;
//------------------------------------
// Generate fields
typedef Fields< scalar_type , execution_space > fields_type ;
fields_type mesh_fields( mesh ,
lin_bulk_visc ,
quad_bulk_visc ,
youngs_modulus ,
poissons_ratio ,
density );
// Host mirrors used for initialization and for sampled output below.
typename fields_type::node_coords_type::HostMirror
model_coords_h = Kokkos::create_mirror( mesh_fields.model_coords );
typename fields_type::geom_state_array_type::HostMirror
displacement_h = Kokkos::create_mirror( mesh_fields.displacement );
typename fields_type::geom_state_array_type::HostMirror
velocity_h = Kokkos::create_mirror( mesh_fields.velocity );
Kokkos::deep_copy( model_coords_h , mesh_fields.model_coords );
//------------------------------------
// Initialization
initialize_element<Scalar,execution_space>::apply( mesh_fields );
initialize_node< Scalar,execution_space>::apply( mesh_fields );
const Scalar x_bc = global_max_x ;
// Initial condition on velocity to initiate a pulse along the X axis
{
const unsigned X = 0;
for (int inode = 0; inode< mesh_fields.num_nodes; ++inode) {
if ( model_coords_h(inode,X) == 0) {
velocity_h(inode,X,0) = 1.0e3;
velocity_h(inode,X,1) = 1.0e3;
}
}
}
Kokkos::deep_copy( mesh_fields.velocity , velocity_h );
//--------------------------------------------------------------------------
// We will call a sequence of functions. These functions have been
// grouped into several functors to balance the number of global memory
// accesses versus requiring too many registers or too much L1 cache.
// Global memory accees have read/write cost and memory subsystem contention cost.
//--------------------------------------------------------------------------
perf_data.init_time = comm::max( machine , wall_clock.seconds() );
// Parameters required for the internal force computations.
int current_state = 0;
int previous_state = 0;
int next_state = 0;
perf_data.number_of_steps = total_num_steps ;
#if defined( KOKKOS_HAVE_MPI )
typedef typename
fields_type::geom_state_array_type::value_type comm_value_type ;
// Six values per node: 3 displacement + 3 velocity components.
const unsigned comm_value_count = 6 ;
Kokkos::AsyncExchange< comm_value_type , execution_space ,
Kokkos::ParallelDataMap >
comm_exchange( mesh.parallel_data_map , comm_value_count );
#endif
for (int step = 0; step < total_num_steps; ++step) {
wall_clock.reset();
//------------------------------------------------------------------------
#if defined( KOKKOS_HAVE_MPI )
{
// Communicate "send" nodes' displacement and velocity next_state
// to the ghosted nodes.
// buffer packages: { { dx , dy , dz , vx , vy , vz }_node }
pack_state< Scalar , execution_space >
::apply( comm_exchange.buffer() ,
mesh.parallel_data_map.count_interior ,
mesh.parallel_data_map.count_send ,
mesh_fields , next_state );
comm_exchange.setup();
comm_exchange.send_receive();
unpack_state< Scalar , execution_space >
::apply( mesh_fields , next_state ,
comm_exchange.buffer() ,
mesh.parallel_data_map.count_owned ,
mesh.parallel_data_map.count_receive );
execution_space::fence();
}
#endif
perf_data.comm_time += comm::max( machine , wall_clock.seconds() );
//------------------------------------------------------------------------
// rotate the states
previous_state = current_state;
current_state = next_state;
++next_state;
next_state %= NumStates;
wall_clock.reset();
// First kernel 'grad_hgop' combines two functions:
// gradient, velocity gradient
grad< Scalar , execution_space >::apply( mesh_fields ,
current_state ,
previous_state );
// Combine tensor decomposition and rotation functions.
decomp_rotate< Scalar , execution_space >::apply( mesh_fields ,
current_state ,
previous_state );
internal_force< Scalar , execution_space >::apply( mesh_fields ,
user_dt ,
current_state );
execution_space::fence();
perf_data.internal_force_time +=
comm::max( machine , wall_clock.seconds() );
wall_clock.reset();
// Assembly of elements' contributions to nodal force into
// a nodal force vector. Update the accelerations, velocities,
// displacements.
// The same pattern can be used for matrix-free residual computations.
nodal_step< Scalar , execution_space >::apply( mesh_fields ,
x_bc ,
current_state,
next_state );
execution_space::fence();
perf_data.central_diff +=
comm::max( machine , wall_clock.seconds() );
// Optional sampled output every 100 steps (timed outside the phases above).
if ( print_sample && 0 == step % 100 ) {
Kokkos::deep_copy( displacement_h , mesh_fields.displacement );
Kokkos::deep_copy( velocity_h , mesh_fields.velocity );
if ( 1 == print_sample ) {
std::cout << "step " << step
<< " : displacement(*,0,0) =" ;
for ( int i = 0 ; i < mesh_fields.num_nodes_owned ; ++i ) {
if ( model_coords_h(i,1) == 0 && model_coords_h(i,2) == 0 ) {
std::cout << " " << displacement_h(i,0,next_state);
}
}
std::cout << std::endl ;
const float tol = 1.0e-6 ;
const int yb = global_max_y ;
const int zb = global_max_z ;
std::cout << "step " << step
<< " : displacement(*," << yb << "," << zb << ") =" ;
for ( int i = 0 ; i < mesh_fields.num_nodes_owned ; ++i ) {
if ( fabs( model_coords_h(i,1) - yb ) < tol &&
fabs( model_coords_h(i,2) - zb ) < tol ) {
std::cout << " " << displacement_h(i,0,next_state);
}
}
std::cout << std::endl ;
}
else if ( 2 == print_sample ) {
const float tol = 1.0e-6 ;
const int xb = global_max_x / 2 ;
const int yb = global_max_y / 2 ;
const int zb = global_max_z / 2 ;
for ( int i = 0 ; i < mesh_fields.num_nodes_owned ; ++i ) {
if ( fabs( model_coords_h(i,0) - xb ) < tol &&
fabs( model_coords_h(i,1) - yb ) < tol &&
fabs( model_coords_h(i,2) - zb ) < tol ) {
std::cout << "step " << step
<< " : displacement("
<< xb << "," << yb << "," << zb << ") = {"
<< std::setprecision(6)
<< " " << displacement_h(i,0,next_state)
<< std::setprecision(2)
<< " " << displacement_h(i,1,next_state)
<< std::setprecision(2)
<< " " << displacement_h(i,2,next_state)
<< " }" << std::endl ;
}
}
}
}
}
return perf_data ;
}
/** \brief Scaling-study driver: runs the explicit mini-app over a doubling
 *  sequence of problem sizes and reports the best timings for each size.
 *
 *  \param label           Human-readable device/config label for the header.
 *  \param machine         Parallel machine handle (MPI communicator wrapper).
 *  \param gang_count      Passed through to the mesh fixture creation.
 *  \param elem_count_beg  First (approximate) element count.
 *  \param elem_count_end  Exclusive upper bound; sizes double each iteration.
 *  \param runs            Repetitions per size; minimum timing is reported.
 */
template <typename Scalar, typename Device>
static void driver( const char * const label ,
comm::Machine machine ,
const int gang_count ,
const int elem_count_beg ,
const int elem_count_end ,
const int runs )
{
typedef Scalar scalar_type ;
typedef Device execution_space ;
typedef double coordinate_scalar_type ;
typedef FixtureElementHex8 fixture_element_type ;
typedef BoxMeshFixture< coordinate_scalar_type ,
execution_space ,
fixture_element_type > fixture_type ;
typedef typename fixture_type::FEMeshType mesh_type ;
const size_t proc_count = comm::size( machine );
const size_t proc_rank = comm::rank( machine );
const int space = 15 ;
const int steps = 1000 ;
const int print_sample = 0 ;
// Rank 0 prints the CSV-style table header.
// NOTE(review): the header lists 8 columns but the data rows below emit
// 9 values (the step count has no matching header) -- confirm the
// intended output format before relying on column alignment.
if ( comm::rank( machine ) == 0 ) {
std::cout << std::endl ;
std::cout << "\"MiniExplicitDynamics with Kokkos " << label
<< " time_steps(" << steps << ")"
<< "\"" << std::endl;
std::cout << std::left << std::setw(space) << "\"Element\" , ";
std::cout << std::left << std::setw(space) << "\"Node\" , ";
std::cout << std::left << std::setw(space) << "\"Initialize\" , ";
std::cout << std::left << std::setw(space) << "\"ElemForce\" , ";
std::cout << std::left << std::setw(space) << "\"NodeUpdate\" , ";
std::cout << std::left << std::setw(space) << "\"NodeComm\" , ";
std::cout << std::left << std::setw(space) << "\"Time/Elem\" , ";
std::cout << std::left << std::setw(space) << "\"Time/Node\"";
std::cout << std::endl;
std::cout << std::left << std::setw(space) << "\"count\" , ";
std::cout << std::left << std::setw(space) << "\"count\" , ";
std::cout << std::left << std::setw(space) << "\"microsec\" , ";
std::cout << std::left << std::setw(space) << "\"microsec\" , ";
std::cout << std::left << std::setw(space) << "\"microsec\" , ";
std::cout << std::left << std::setw(space) << "\"microsec\" , ";
std::cout << std::left << std::setw(space) << "\"microsec\" , ";
std::cout << std::left << std::setw(space) << "\"microsec\"";
std::cout << std::endl;
}
// Problem size doubles each iteration; the box has roughly 2:1:1 aspect.
for(int i = elem_count_beg ; i < elem_count_end ; i *= 2 )
{
const int iz = std::max( 1 , (int) cbrt( ((double) i) / 2.0 ) );
const int iy = iz + 1 ;
const int ix = 2 * iy ;
const int nelem = ix * iy * iz ;
const int nnode = ( ix + 1 ) * ( iy + 1 ) * ( iz + 1 );
mesh_type mesh =
fixture_type::create( proc_count , proc_rank , gang_count ,
ix , iy , iz );
mesh.parallel_data_map.machine = machine ;
// Keep the minimum of each phase timing over 'runs' repetitions.
PerformanceData perf , best ;
for(int j = 0; j < runs; j++){
perf = run<scalar_type,fixture_type>(mesh,ix,iy,iz,steps,print_sample);
if( j == 0 ) {
best = perf ;
}
else {
best.best( perf );
}
}
if ( comm::rank( machine ) == 0 ) {
double time_per_element =
( best.internal_force_time ) / ( nelem * perf.number_of_steps );
double time_per_node =
( best.comm_time + best.central_diff ) / ( nnode * perf.number_of_steps );
std::cout << std::setw(space-3) << nelem << " , "
<< std::setw(space-3) << nnode << " , "
<< std::setw(space-3) << best.number_of_steps << " , "
<< std::setw(space-3) << best.init_time * 1000000 << " , "
<< std::setw(space-3)
<< ( best.internal_force_time * 1000000 ) / best.number_of_steps << " , "
<< std::setw(space-3)
<< ( best.central_diff * 1000000 ) / best.number_of_steps << " , "
<< std::setw(space-3)
<< ( best.comm_time * 1000000 ) / best.number_of_steps << " , "
<< std::setw(space-3) << time_per_element * 1000000 << " , "
<< std::setw(space-3) << time_per_node * 1000000
<< std::endl ;
}
}
}
} // namespace Explicit
#endif /* #ifndef EXPLICIT_DRIVER_HPP */

File diff suppressed because it is too large Load Diff

View File

@ -1,86 +0,0 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_FEMESH_HPP
#define KOKKOS_FEMESH_HPP
#include <utility>
#include <limits>
#include <iostream>
#include <sstream>
#include <stdexcept>
#include <Kokkos_Core.hpp>
#include <Kokkos_StaticCrsGraph.hpp>
#include <ParallelComm.hpp>
#include <ParallelDataMap.hpp>
namespace HybridFEM {
//----------------------------------------------------------------------------
/** \brief Finite element mesh fixture for hybrid parallel performance tests.
*/
/** \brief Plain-data mesh fixture: device views of geometry/connectivity
 *  plus the parallel distribution map consumed by the drivers.
 */
template< typename CoordScalarType , unsigned ElemNodeCount , class Device >
struct FEMesh {
typedef typename Device::size_type size_type ;
static const size_type element_node_count = ElemNodeCount ;
// Per-node (x,y,z) coordinates.
typedef Kokkos::View< CoordScalarType*[3] , Device > node_coords_type ;
// Per-element list of ElemNodeCount node ids.
typedef Kokkos::View< size_type*[ElemNodeCount], Device > elem_node_ids_type ;
// CRS graph with size_type[2] entries per node -- presumably the
// (element, local-node) pairs touching each node; confirm with callers.
typedef Kokkos::StaticCrsGraph< size_type[2] , Device > node_elem_ids_type ;
node_coords_type node_coords ;
elem_node_ids_type elem_node_ids ;
node_elem_ids_type node_elem_ids ;
// Owned/send/receive node partitioning used for MPI exchange
// (see count_interior / count_send / count_owned / count_receive users).
Kokkos::ParallelDataMap parallel_data_map ;
};
//----------------------------------------------------------------------------
} /* namespace HybridFEM */
#endif /* #ifndef KOKKOS_FEMESH_HPP */

View File

@ -1,268 +0,0 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef ELEMENTHEX_HPP
#define ELEMENTHEX_HPP
namespace HybridFEM {
template< unsigned NodeCount >
class HexElement_TensorData ;

template< unsigned NodeCount , class Device >
class HexElement_TensorEval ;

//----------------------------------------------------------------------------
/** \brief Evaluate Hex element on interval [-1,1]^3 :
 *         1d tables for the 8-node (trilinear) hexahedron.
 */
template<>
class HexElement_TensorData< 8 > {
public:

  static const unsigned element_node_count = 8 ;
  static const unsigned spatial_dimension = 3 ;
  static const unsigned integration_count_1d = 2 ;
  static const unsigned function_count_1d = 2 ;

  float values_1d [ function_count_1d ][ integration_count_1d ];
  float derivs_1d [ function_count_1d ][ integration_count_1d ];
  float weights_1d[ integration_count_1d ];

  unsigned char eval_map[ element_node_count ][4] ;

  /** \brief Linear 1d shape function value on [-1,1]. */
  static float eval_value_1d( const unsigned jf , const float x )
  {
    if ( 0 == jf ) { return 0.5 * ( 1.0 - x ); }
    if ( 1 == jf ) { return 0.5 * ( 1.0 + x ); }
    return 0 ;
  }

  /** \brief Linear 1d shape function derivative (constant in x). */
  static float eval_deriv_1d( const unsigned jf , const float )
  {
    if ( 0 == jf ) { return -0.5 ; }
    if ( 1 == jf ) { return 0.5 ; }
    return 0 ;
  }

  HexElement_TensorData()
  {
    // Tensor index (i,j,k in {0,1}) of each of the eight corner nodes.
    const unsigned char tmp_map[ element_node_count ][ spatial_dimension ] =
      { { 0 , 0 , 0 },
        { 1 , 0 , 0 },
        { 1 , 1 , 0 },
        { 0 , 1 , 0 },
        { 0 , 0 , 1 },
        { 1 , 0 , 1 },
        { 1 , 1 , 1 },
        { 0 , 1 , 1 } };

    // Two-point Gauss rule on [-1,1]: unit weights at +/- 1/sqrt(3).
    weights_1d[0] = 1 ;
    weights_1d[1] = 1 ;

    const float points_1d[ integration_count_1d ] =
      { -0.577350269 , 0.577350269 };

    for ( unsigned node = 0 ; node < element_node_count ; ++node ) {
      for ( unsigned dim = 0 ; dim < spatial_dimension ; ++dim ) {
        eval_map[node][dim] = tmp_map[node][dim];
      }
    }

    for ( unsigned ip = 0 ; ip < integration_count_1d ; ++ip ) {
      for ( unsigned jf = 0 ; jf < function_count_1d ; ++jf ) {
        values_1d[ip][jf] = eval_value_1d( jf , points_1d[ip] );
        derivs_1d[ip][jf] = eval_deriv_1d( jf , points_1d[ip] );
      }
    }
  }
};
//----------------------------------------------------------------------------
/** \brief 1d tables for the 27-node (triquadratic) hexahedral element.
 *  3d values/weights are formed as tensor products of these 1d tables
 *  (see HexElement_Data).
 */
template<>
class HexElement_TensorData< 27 > {
public:
static const unsigned element_node_count = 27 ;
static const unsigned spatial_dimension = 3 ;
static const unsigned integration_count_1d = 3 ;
static const unsigned function_count_1d = 3 ;
// NOTE(review): declared [function][integration] but filled as
// [integration point][function] in the constructor; correct only because
// both dimensions are 3 here.
float values_1d [ function_count_1d ][ integration_count_1d ];
float derivs_1d [ function_count_1d ][ integration_count_1d ];
float weights_1d[ integration_count_1d ];
unsigned char eval_map[ element_node_count ][4] ;
// sizeof(EvaluateElementHex) = 111 bytes =
// sizeof(float) * 9 +
// sizeof(float) * 9 +
// sizeof(float) * 3 +
// sizeof(char) * 27
/** \brief Quadratic 1d shape function value at p in [-1,1]. */
static float eval_value_1d( const unsigned jf , const float p )
{
return 0 == jf ? 0.5 * p * ( p - 1 ) : (
1 == jf ? 1.0 - p * p : (
2 == jf ? 0.5 * p * ( p + 1 ) : 0 ));
}
/** \brief Quadratic 1d shape function derivative at p. */
static float eval_deriv_1d( const unsigned jf , const float p )
{
return 0 == jf ? p - 0.5 : (
1 == jf ? -2.0 * p : (
2 == jf ? p + 0.5 : 0 ));
}
HexElement_TensorData()
{
// Tensor index (0,1,2 per axis) of each of the 27 nodes; ordering
// presumably matches the mesh fixture's node numbering -- confirm there.
const unsigned char tmp_map[ element_node_count ][ spatial_dimension ] =
{ { 0 , 0 , 0 },
{ 2 , 0 , 0 },
{ 2 , 2 , 0 },
{ 0 , 2 , 0 },
{ 0 , 0 , 2 },
{ 2 , 0 , 2 },
{ 2 , 2 , 2 },
{ 0 , 2 , 2 },
{ 1 , 0 , 0 },
{ 2 , 1 , 0 },
{ 1 , 2 , 0 },
{ 0 , 1 , 0 },
{ 0 , 0 , 1 },
{ 2 , 0 , 1 },
{ 2 , 2 , 1 },
{ 0 , 2 , 1 },
{ 1 , 0 , 2 },
{ 2 , 1 , 2 },
{ 1 , 2 , 2 },
{ 0 , 1 , 2 },
{ 1 , 1 , 1 },
{ 1 , 1 , 0 },
{ 1 , 1 , 2 },
{ 0 , 1 , 1 },
{ 2 , 1 , 1 },
{ 1 , 0 , 1 },
{ 1 , 2 , 1 } };
// Interval [-1,1]
// Three-point Gauss-Legendre rule: weights {5/9, 8/9, 5/9}.
weights_1d[0] = 0.555555556 ;
weights_1d[1] = 0.888888889 ;
weights_1d[2] = 0.555555556 ;
const float points_1d[3] = { -0.774596669 ,
0.000000000 ,
0.774596669 };
for ( unsigned i = 0 ; i < element_node_count ; ++i ) {
eval_map[i][0] = tmp_map[i][0];
eval_map[i][1] = tmp_map[i][1];
eval_map[i][2] = tmp_map[i][2];
}
for ( unsigned xp = 0 ; xp < integration_count_1d ; ++xp ) {
for ( unsigned xf = 0 ; xf < function_count_1d ; ++xf ) {
values_1d[xp][xf] = eval_value_1d( xf , points_1d[xp] );
derivs_1d[xp][xf] = eval_deriv_1d( xf , points_1d[xp] );
}}
}
};
//----------------------------------------------------------------------------
/** \brief Full 3d quadrature tables for a NodeCount-node hex element,
 *  assembled as tensor products of the 1d HexElement_TensorData tables.
 */
template< unsigned NodeCount >
class HexElement_Data {
public:
static const unsigned spatial_dimension = 3 ;
static const unsigned element_node_count = NodeCount ;
// One integration point and one basis function per node (tensor rule).
static const unsigned integration_count = NodeCount ;
static const unsigned function_count = NodeCount ;
float weights[ integration_count ] ;
// values[ip][jf]: basis function jf evaluated at integration point ip.
float values[ integration_count ][ function_count ];
// gradients[ip][dim][jf]: d(basis jf)/d(dim) at integration point ip.
float gradients[ integration_count ][ spatial_dimension ][ function_count ];
HexElement_Data()
{
HexElement_TensorData< NodeCount > tensor_data ;
for ( unsigned ip = 0 ; ip < integration_count ; ++ip ) {
// 1d tensor indices of this integration point.
const unsigned ipx = tensor_data.eval_map[ip][0] ;
const unsigned ipy = tensor_data.eval_map[ip][1] ;
const unsigned ipz = tensor_data.eval_map[ip][2] ;
// 3d weight is the product of the three 1d weights.
weights[ip] = tensor_data.weights_1d[ ipx ] *
tensor_data.weights_1d[ ipy ] *
tensor_data.weights_1d[ ipz ] ;
for ( unsigned jf = 0 ; jf < function_count ; ++jf ) {
const unsigned jfx = tensor_data.eval_map[jf][0] ;
const unsigned jfy = tensor_data.eval_map[jf][1] ;
const unsigned jfz = tensor_data.eval_map[jf][2] ;
values[ip][jf] = tensor_data.values_1d[ ipx ][ jfx ] *
tensor_data.values_1d[ ipy ][ jfy ] *
tensor_data.values_1d[ ipz ][ jfz ] ;
// Each gradient component differentiates in exactly one direction.
gradients[ip][0][jf] = tensor_data.derivs_1d[ ipx ][ jfx ] *
tensor_data.values_1d[ ipy ][ jfy ] *
tensor_data.values_1d[ ipz ][ jfz ] ;
gradients[ip][1][jf] = tensor_data.values_1d[ ipx ][ jfx ] *
tensor_data.derivs_1d[ ipy ][ jfy ] *
tensor_data.values_1d[ ipz ][ jfz ] ;
gradients[ip][2][jf] = tensor_data.values_1d[ ipx ][ jfx ] *
tensor_data.values_1d[ ipy ][ jfy ] *
tensor_data.derivs_1d[ ipz ][ jfz ] ;
}
}
}
};
//----------------------------------------------------------------------------
} /* namespace HybridFEM */
#endif /* #ifndef ELEMENTHEX_HPP */

View File

@ -1,443 +0,0 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_HEXEXPLICITFUNCTIONS_HPP
#define KOKKOS_HEXEXPLICITFUNCTIONS_HPP
#include <math.h>
namespace Explicit {
//----------------------------------------------------------------------------
// Hex8Functions: device-callable kernels for explicit dynamics on 8-node
// hexahedral elements: nodal gradient operators, incremental polar
// decomposition of the velocity gradient, tensor rotations, hypoelastic
// stress update, and nodal internal-force / energy accumulation.
//
// Tensor storage conventions (index constants below):
//   K_F_* : full (nonsymmetric) 3x3 tensor stored as a length-9 vector
//   K_S_* : symmetric 3x3 tensor stored as a length-6 vector
//           (K_S_YX / K_S_ZY / K_S_XZ alias the symmetric entries)
//   K_V_* : skew-symmetric 3x3 tensor stored as its length-3 dual vector
struct Hex8Functions
{
static const unsigned SpatialDim = 3 ;
static const unsigned ElemNodeCount = 8 ;
// Indices for full 3x3 tensor:
static const unsigned K_F_XX = 0 ;
static const unsigned K_F_YY = 1 ;
static const unsigned K_F_ZZ = 2 ;
static const unsigned K_F_XY = 3 ;
static const unsigned K_F_YZ = 4 ;
static const unsigned K_F_ZX = 5 ;
static const unsigned K_F_YX = 6 ;
static const unsigned K_F_ZY = 7 ;
static const unsigned K_F_XZ = 8 ;
static const unsigned K_F_SIZE = 9 ;
// Indexes into a 3 by 3 symmetric tensor stored as a length 6 vector
static const unsigned K_S_XX = 0 ;
static const unsigned K_S_YY = 1 ;
static const unsigned K_S_ZZ = 2 ;
static const unsigned K_S_XY = 3 ;
static const unsigned K_S_YZ = 4 ;
static const unsigned K_S_ZX = 5 ;
// The transposed entries alias their symmetric counterparts:
static const unsigned K_S_YX = 3 ;
static const unsigned K_S_ZY = 4 ;
static const unsigned K_S_XZ = 5 ;
static const unsigned K_S_SIZE = 6 ;
// Indexes into a 3 by 3 skew symmetric tensor stored as a length 3 vector
static const unsigned K_V_XY = 0 ;
static const unsigned K_V_YZ = 1 ;
static const unsigned K_V_ZX = 2 ;
static const unsigned K_V_SIZE = 3 ;
//--------------------------------------------------------------------------
// Dot product of two length-8 arrays (one entry per element node).
template< typename ScalarA , typename ScalarB >
KOKKOS_INLINE_FUNCTION static
double dot8( const ScalarA * const a , const ScalarB * const b )
{ return a[0] * b[0] + a[1] * b[1] + a[2] * b[2] + a[3] * b[3] +
a[4] * b[4] + a[5] * b[5] + a[6] * b[6] + a[7] * b[7] ; }
//--------------------------------------------------------------------------
// Gradient operator for the 8-node hexahedron: given nodal x- and
// z-coordinates, compute the 8 nodal coefficients grad_y[] of the
// y-gradient.  The Rab temporaries are nodal coordinate differences,
// Rab = x[a-1] - x[b-1] in 1-based node numbering.
// NOTE(review): the result presumably carries an element-volume scaling
// handled by callers -- confirm against call sites.
template< class ScalarPrecise ,
class ScalarCompact >
KOKKOS_INLINE_FUNCTION static
void grad( const ScalarPrecise x[] ,
const ScalarPrecise z[] ,
ScalarCompact grad_y[] )
{
const ScalarCompact R42=(x[3] - x[1]);
const ScalarCompact R52=(x[4] - x[1]);
const ScalarCompact R54=(x[4] - x[3]);
const ScalarCompact R63=(x[5] - x[2]);
const ScalarCompact R83=(x[7] - x[2]);
const ScalarCompact R86=(x[7] - x[5]);
const ScalarCompact R31=(x[2] - x[0]);
const ScalarCompact R61=(x[5] - x[0]);
const ScalarCompact R74=(x[6] - x[3]);
const ScalarCompact R72=(x[6] - x[1]);
const ScalarCompact R75=(x[6] - x[4]);
const ScalarCompact R81=(x[7] - x[0]);
// Common subexpressions shared by several nodes:
const ScalarCompact t1=(R63 + R54);
const ScalarCompact t2=(R61 + R74);
const ScalarCompact t3=(R72 + R81);
const ScalarCompact t4 =(R86 + R42);
const ScalarCompact t5 =(R83 + R52);
const ScalarCompact t6 =(R75 + R31);
// Calculate Y gradient from X and Z data
grad_y[0] = (z[1] * t1) - (z[2] * R42) - (z[3] * t5) + (z[4] * t4) + (z[5] * R52) - (z[7] * R54);
grad_y[1] = (z[2] * t2) + (z[3] * R31) - (z[0] * t1) - (z[5] * t6) + (z[6] * R63) - (z[4] * R61);
grad_y[2] = (z[3] * t3) + (z[0] * R42) - (z[1] * t2) - (z[6] * t4) + (z[7] * R74) - (z[5] * R72);
grad_y[3] = (z[0] * t5) - (z[1] * R31) - (z[2] * t3) + (z[7] * t6) + (z[4] * R81) - (z[6] * R83);
grad_y[4] = (z[5] * t3) + (z[6] * R86) - (z[7] * t2) - (z[0] * t4) - (z[3] * R81) + (z[1] * R61);
grad_y[5] = (z[6] * t5) - (z[4] * t3) - (z[7] * R75) + (z[1] * t6) - (z[0] * R52) + (z[2] * R72);
grad_y[6] = (z[7] * t1) - (z[5] * t5) - (z[4] * R86) + (z[2] * t4) - (z[1] * R63) + (z[3] * R83);
grad_y[7] = (z[4] * t2) - (z[6] * t1) + (z[5] * R75) - (z[3] * t6) - (z[2] * R74) + (z[0] * R54);
}
// Compute all three gradient components by cyclic permutation of the
// coordinate arrays through the two-argument grad() above.
template< class ScalarPrecise ,
class ScalarCompact >
static KOKKOS_INLINE_FUNCTION
void grad( const ScalarPrecise x[] ,
const ScalarPrecise y[] ,
const ScalarPrecise z[] ,
ScalarCompact grad_x[] ,
ScalarCompact grad_y[] ,
ScalarCompact grad_z[] )
{
grad( x , z , grad_y );
grad( z , y , grad_x );
grad( y , x , grad_z );
}
//--------------------------------------------------------------------------
// Incremental polar decomposition update.
// Inputs : dt    -- time step
//          v_gr  -- velocity gradient, full 3x3 (K_F_* layout)
// In/out : stretch -- symmetric stretch tensor (K_S_* layout)
//          rot     -- rotation tensor (K_F_* layout).  NOTE(review): rot is
//                     read at entry as the previous rotation and then
//                     overwritten, so it is effectively INOUT despite the
//                     original OUT annotation.
// Output : str_ten -- symmetric part of v_gr (rate of deformation).
template< class ScalarPrecise ,
class ScalarCompact >
KOKKOS_INLINE_FUNCTION static
void polar_decomp( const float dt ,
const ScalarCompact v_gr[] ,
ScalarPrecise stretch[] /* INOUT */ ,
ScalarCompact str_ten[] /* OUT */ ,
ScalarCompact rot[] /* INOUT: previous rotation in, updated rotation out */ )
{
const float dt_half = 0.5 * dt;
ScalarCompact vort[ K_V_SIZE ]; // Vorticity
// Symmetric part
str_ten[K_S_XX] = v_gr[K_F_XX];
str_ten[K_S_YY] = v_gr[K_F_YY];
str_ten[K_S_ZZ] = v_gr[K_F_ZZ];
str_ten[K_S_XY] = 0.5 * ( v_gr[K_F_XY] + v_gr[K_F_YX] );
str_ten[K_S_YZ] = 0.5 * ( v_gr[K_F_YZ] + v_gr[K_F_ZY] );
str_ten[K_S_ZX] = 0.5 * ( v_gr[K_F_ZX] + v_gr[K_F_XZ] );
// Skew Symmetric part
vort[K_V_XY] = 0.5 * ( v_gr[K_F_XY] - v_gr[K_F_YX] );
vort[K_V_YZ] = 0.5 * ( v_gr[K_F_YZ] - v_gr[K_F_ZY] );
vort[K_V_ZX] = 0.5 * ( v_gr[K_F_ZX] - v_gr[K_F_XZ] );
// calculate the rates of rotation via gauss elimination.
// z1..z3 form the right-hand side built from str_ten and stretch.
ScalarCompact z1 = str_ten[K_S_XY] * stretch[K_S_ZX] -
str_ten[K_S_ZX] * stretch[K_S_XY] +
str_ten[K_S_YY] * stretch[K_S_YZ] -
str_ten[K_S_YZ] * stretch[K_S_YY] +
str_ten[K_S_YZ] * stretch[K_S_ZZ] -
str_ten[K_S_ZZ] * stretch[K_S_YZ];
ScalarCompact z2 = str_ten[K_S_ZX] * stretch[K_S_XX] -
str_ten[K_S_XX] * stretch[K_S_ZX] +
str_ten[K_S_YZ] * stretch[K_S_XY] -
str_ten[K_S_XY] * stretch[K_S_YZ] +
str_ten[K_S_ZZ] * stretch[K_S_ZX] -
str_ten[K_S_ZX] * stretch[K_S_ZZ];
ScalarCompact z3 = str_ten[K_S_XX] * stretch[K_S_XY] -
str_ten[K_S_XY] * stretch[K_S_XX] +
str_ten[K_S_XY] * stretch[K_S_YY] -
str_ten[K_S_YY] * stretch[K_S_XY] +
str_ten[K_S_ZX] * stretch[K_S_YZ] -
str_ten[K_S_YZ] * stretch[K_S_ZX];
{
// forward elimination
const ScalarCompact a1inv = 1.0 / (stretch[K_S_YY] + stretch[K_S_ZZ]);
const ScalarCompact a4BYa1 = -1 * stretch[K_S_XY] * a1inv;
const ScalarCompact a2inv = 1.0 / (stretch[K_S_ZZ] + stretch[K_S_XX] + stretch[K_S_XY] * a4BYa1);
const ScalarCompact a5 = -stretch[K_S_YZ] + stretch[K_S_ZX] * a4BYa1;
z2 -= z1 * a4BYa1;
const ScalarCompact a6BYa1 = -1 * stretch[K_S_ZX] * a1inv;
const ScalarCompact a5BYa2 = a5 * a2inv;
z3 -= z1 * a6BYa1 - z2 * a5BYa2;
// backward substitution -
z3 /= (stretch[K_S_XX] + stretch[K_S_YY] + stretch[K_S_ZX] * a6BYa1 + a5 * a5BYa2);
z2 = (z2 - a5 * z3) * a2inv;
z1 = (z1*a1inv - a6BYa1 * z3 -a4BYa1 * z2);
}
// calculate rotation rates - recall that spin_rate is an asymmetric tensor,
// so compute spin rate vector as dual of spin rate tensor,
// i.e w_i = e_ijk * spin_rate_jk
z1 += vort[K_V_YZ];
z2 += vort[K_V_ZX];
z3 += vort[K_V_XY];
{
// update rotation tensor:
// 1) premultiply old rotation tensor to get right-hand side.
ScalarCompact r_XX = rot[K_F_XX] + dt_half*( z3 * rot[K_F_YX] - z2 * rot[K_F_ZX] );
ScalarCompact r_YX = rot[K_F_YX] + dt_half*( z1 * rot[K_F_ZX] - z3 * rot[K_F_XX] );
ScalarCompact r_ZX = rot[K_F_ZX] + dt_half*( z2 * rot[K_F_XX] - z1 * rot[K_F_YX] );
ScalarCompact r_XY = rot[K_F_XY] + dt_half*( z3 * rot[K_F_YY] - z2 * rot[K_F_ZY] );
ScalarCompact r_YY = rot[K_F_YY] + dt_half*( z1 * rot[K_F_ZY] - z3 * rot[K_F_XY] );
ScalarCompact r_ZY = rot[K_F_ZY] + dt_half*( z2 * rot[K_F_XY] - z1 * rot[K_F_YY] );
ScalarCompact r_XZ = rot[K_F_XZ] + dt_half*( z3 * rot[K_F_YZ] - z2 * rot[K_F_ZZ] );
ScalarCompact r_YZ = rot[K_F_YZ] + dt_half*( z1 * rot[K_F_ZZ] - z3 * rot[K_F_XZ] );
ScalarCompact r_ZZ = rot[K_F_ZZ] + dt_half*( z2 * rot[K_F_XZ] - z1 * rot[K_F_YZ] );
// 2) solve for new rotation tensor via gauss elimination.
// forward elimination -
const ScalarCompact a12 = - dt_half * z3;
const ScalarCompact a13 = dt_half * z2;
ScalarCompact b32 = - dt_half * z1;
const ScalarCompact a22inv = 1.0 / (1.0 + a12 * a12);
const ScalarCompact a13a12 = a13*a12;
const ScalarCompact a23 = b32 + a13a12;
r_YX += r_XX * a12;
r_YY += r_XY * a12;
r_YZ += r_XZ * a12;
b32 = (b32 - a13a12) * a22inv;
r_ZX += r_XX * a13 + r_YX * b32;
r_ZY += r_XY * a13 + r_YY * b32;
r_ZZ += r_XZ * a13 + r_YZ * b32;
// backward substitution -
const ScalarCompact a33inv = 1.0 / (1.0 + a13 * a13 + a23 * b32);
rot[K_F_ZX] = r_ZX * a33inv;
rot[K_F_ZY] = r_ZY * a33inv;
rot[K_F_ZZ] = r_ZZ * a33inv;
rot[K_F_YX] = ( r_YX - rot[K_F_ZX] * a23 ) * a22inv;
rot[K_F_YY] = ( r_YY - rot[K_F_ZY] * a23 ) * a22inv;
rot[K_F_YZ] = ( r_YZ - rot[K_F_ZZ] * a23 ) * a22inv;
rot[K_F_XX] = r_XX - rot[K_F_ZX] * a13 - rot[K_F_YX] * a12;
rot[K_F_XY] = r_XY - rot[K_F_ZY] * a13 - rot[K_F_YY] * a12;
rot[K_F_XZ] = r_XZ - rot[K_F_ZZ] * a13 - rot[K_F_YZ] * a12;
}
// update stretch tensor in the new configuration -
const ScalarCompact a1 = str_ten[K_S_XY] + vort[K_V_XY];
const ScalarCompact a2 = str_ten[K_S_YZ] + vort[K_V_YZ];
const ScalarCompact a3 = str_ten[K_S_ZX] + vort[K_V_ZX];
const ScalarCompact b1 = str_ten[K_S_ZX] - vort[K_V_ZX];
const ScalarCompact b2 = str_ten[K_S_XY] - vort[K_V_XY];
const ScalarCompact b3 = str_ten[K_S_YZ] - vort[K_V_YZ];
// Snapshot the current stretch so the update below uses consistent values.
const ScalarCompact s_XX = stretch[K_S_XX];
const ScalarCompact s_YY = stretch[K_S_YY];
const ScalarCompact s_ZZ = stretch[K_S_ZZ];
const ScalarCompact s_XY = stretch[K_S_XY];
const ScalarCompact s_YZ = stretch[K_S_YZ];
const ScalarCompact s_ZX = stretch[K_S_ZX];
stretch[K_S_XX] += dt * (str_ten[K_S_XX] * s_XX + ( a1 + z3 ) * s_XY + ( b1 - z2 ) * s_ZX);
stretch[K_S_YY] += dt * (str_ten[K_S_YY] * s_YY + ( a2 + z1 ) * s_YZ + ( b2 - z3 ) * s_XY);
stretch[K_S_ZZ] += dt * (str_ten[K_S_ZZ] * s_ZZ + ( a3 + z2 ) * s_ZX + ( b3 - z1 ) * s_YZ);
stretch[K_S_XY] += dt * (str_ten[K_S_XX] * s_XY + ( a1 ) * s_YY + ( b1 ) * s_YZ - z3 * s_XX + z1 * s_ZX);
stretch[K_S_YZ] += dt * (str_ten[K_S_YY] * s_YZ + ( a2 ) * s_ZZ + ( b2 ) * s_ZX - z1 * s_YY + z2 * s_XY);
stretch[K_S_ZX] += dt * (str_ten[K_S_ZZ] * s_ZX + ( a3 ) * s_XX + ( b3 ) * s_XY - z2 * s_ZZ + z3 * s_YZ);
}
//--------------------------------------------------------------------------
// Rotate a symmetric tensor forward: rot_str = R^T * str_ten * R,
// where R is the full 3x3 rotation 'rot' (t[] holds str_ten * R).
template< typename ScalarCompact >
static KOKKOS_INLINE_FUNCTION
void rotate_tensor( const ScalarCompact str_ten[] ,
const ScalarCompact rot[] ,
ScalarCompact rot_str[] )
{
ScalarCompact t[9];
t[0] = str_ten[K_S_XX]*rot[K_F_XX] + str_ten[K_S_XY]*rot[K_F_YX] + str_ten[K_S_XZ]*rot[K_F_ZX];
t[1] = str_ten[K_S_YX]*rot[K_F_XX] + str_ten[K_S_YY]*rot[K_F_YX] + str_ten[K_S_YZ]*rot[K_F_ZX];
t[2] = str_ten[K_S_ZX]*rot[K_F_XX] + str_ten[K_S_ZY]*rot[K_F_YX] + str_ten[K_S_ZZ]*rot[K_F_ZX];
t[3] = str_ten[K_S_XX]*rot[K_F_XY] + str_ten[K_S_XY]*rot[K_F_YY] + str_ten[K_S_XZ]*rot[K_F_ZY];
t[4] = str_ten[K_S_YX]*rot[K_F_XY] + str_ten[K_S_YY]*rot[K_F_YY] + str_ten[K_S_YZ]*rot[K_F_ZY];
t[5] = str_ten[K_S_ZX]*rot[K_F_XY] + str_ten[K_S_ZY]*rot[K_F_YY] + str_ten[K_S_ZZ]*rot[K_F_ZY];
t[6] = str_ten[K_S_XX]*rot[K_F_XZ] + str_ten[K_S_XY]*rot[K_F_YZ] + str_ten[K_S_XZ]*rot[K_F_ZZ];
t[7] = str_ten[K_S_YX]*rot[K_F_XZ] + str_ten[K_S_YY]*rot[K_F_YZ] + str_ten[K_S_YZ]*rot[K_F_ZZ];
t[8] = str_ten[K_S_ZX]*rot[K_F_XZ] + str_ten[K_S_ZY]*rot[K_F_YZ] + str_ten[K_S_ZZ]*rot[K_F_ZZ];
// Only the six symmetric entries of the result are stored.
rot_str[ K_S_XX ] = rot[K_F_XX] * t[0] + rot[K_F_YX] * t[1] + rot[K_F_ZX] * t[2];
rot_str[ K_S_YY ] = rot[K_F_XY] * t[3] + rot[K_F_YY] * t[4] + rot[K_F_ZY] * t[5];
rot_str[ K_S_ZZ ] = rot[K_F_XZ] * t[6] + rot[K_F_YZ] * t[7] + rot[K_F_ZZ] * t[8];
rot_str[ K_S_XY ] = rot[K_F_XX] * t[3] + rot[K_F_YX] * t[4] + rot[K_F_ZX] * t[5];
rot_str[ K_S_YZ ] = rot[K_F_XY] * t[6] + rot[K_F_YY] * t[7] + rot[K_F_ZY] * t[8];
rot_str[ K_S_ZX ] = rot[K_F_XZ] * t[0] + rot[K_F_YZ] * t[1] + rot[K_F_ZZ] * t[2];
}
//--------------------------------------------------------------------------
// Rotate a symmetric tensor backward: rot_stress = R * stress * R^T
// (the inverse transformation of rotate_tensor above; t[] = stress * R^T).
template< class ScalarPrecise ,
class ScalarCompact >
static KOKKOS_INLINE_FUNCTION
void rotate_tensor_backward( const ScalarPrecise stress[] ,
const ScalarCompact rot[] ,
ScalarCompact rot_stress[] )
{
ScalarCompact t[9] ;
t[0] = stress[K_S_XX]*rot[K_F_XX]+ stress[K_S_XY]*rot[K_F_XY]+ stress[K_S_XZ]*rot[K_F_XZ];
t[1] = stress[K_S_YX]*rot[K_F_XX]+ stress[K_S_YY]*rot[K_F_XY]+ stress[K_S_YZ]*rot[K_F_XZ];
t[2] = stress[K_S_ZX]*rot[K_F_XX]+ stress[K_S_ZY]*rot[K_F_XY]+ stress[K_S_ZZ]*rot[K_F_XZ];
t[3] = stress[K_S_XX]*rot[K_F_YX]+ stress[K_S_XY]*rot[K_F_YY]+ stress[K_S_XZ]*rot[K_F_YZ];
t[4] = stress[K_S_YX]*rot[K_F_YX]+ stress[K_S_YY]*rot[K_F_YY]+ stress[K_S_YZ]*rot[K_F_YZ];
t[5] = stress[K_S_ZX]*rot[K_F_YX]+ stress[K_S_ZY]*rot[K_F_YY]+ stress[K_S_ZZ]*rot[K_F_YZ];
t[6] = stress[K_S_XX]*rot[K_F_ZX]+ stress[K_S_XY]*rot[K_F_ZY]+ stress[K_S_XZ]*rot[K_F_ZZ];
t[7] = stress[K_S_YX]*rot[K_F_ZX]+ stress[K_S_YY]*rot[K_F_ZY]+ stress[K_S_YZ]*rot[K_F_ZZ];
t[8] = stress[K_S_ZX]*rot[K_F_ZX]+ stress[K_S_ZY]*rot[K_F_ZY]+ stress[K_S_ZZ]*rot[K_F_ZZ];
rot_stress[ K_S_XX ] = rot[K_F_XX]*t[0] + rot[K_F_XY]*t[1] + rot[K_F_XZ]*t[2];
rot_stress[ K_S_YY ] = rot[K_F_YX]*t[3] + rot[K_F_YY]*t[4] + rot[K_F_YZ]*t[5];
rot_stress[ K_S_ZZ ] = rot[K_F_ZX]*t[6] + rot[K_F_ZY]*t[7] + rot[K_F_ZZ]*t[8];
rot_stress[ K_S_XY ] = rot[K_F_XX]*t[3] + rot[K_F_XY]*t[4] + rot[K_F_XZ]*t[5];
rot_stress[ K_S_YZ ] = rot[K_F_YX]*t[6] + rot[K_F_YY]*t[7] + rot[K_F_YZ]*t[8];
rot_stress[ K_S_ZX ] = rot[K_F_ZX]*t[0] + rot[K_F_ZY]*t[1] + rot[K_F_ZZ]*t[2];
}
//--------------------------------------------------------------------------
// Hypoelastic stress update over one step dt: the deviatoric part of the
// rotated strain rate is scaled by two_mu, the volumetric part (trace e)
// by bulk_modulus.
template< class ScalarPrecise ,
class ScalarCompact >
KOKKOS_INLINE_FUNCTION static
void update_stress( const float dt ,
const float two_mu ,
const float bulk_modulus ,
const ScalarCompact rot_str[] ,
ScalarPrecise stress[] )
{
const ScalarCompact e = rot_str[ K_S_XX ] + rot_str[ K_S_YY ] + rot_str[ K_S_ZZ ] ;
const ScalarCompact eb = e * bulk_modulus ;
const ScalarCompact e3 = e / 3.0 ;
stress[K_S_XX] += dt * ( two_mu * ( rot_str[K_S_XX] - e3 ) + eb );
stress[K_S_YY] += dt * ( two_mu * ( rot_str[K_S_YY] - e3 ) + eb );
stress[K_S_ZZ] += dt * ( two_mu * ( rot_str[K_S_ZZ] - e3 ) + eb );
stress[K_S_XY] += dt * two_mu * rot_str[K_S_XY];
stress[K_S_YZ] += dt * two_mu * rot_str[K_S_YZ];
stress[K_S_ZX] += dt * two_mu * rot_str[K_S_ZX];
}
//--------------------------------------------------------------------------
// Per-node internal force: force[node] = stress . grad[node], accumulating
// the element internal energy as sum of force . velocity over the nodes.
// NOTE(review): 'total_stress12th' is presumably the stress pre-scaled by
// 1/12 (per the name) -- confirm against the caller that supplies it.
template< class ScalarPrecise ,
class ScalarCompact >
static KOKKOS_INLINE_FUNCTION
void comp_force( const ScalarPrecise vx[] ,
const ScalarPrecise vy[] ,
const ScalarPrecise vz[] ,
const ScalarCompact grad_x[] ,
const ScalarCompact grad_y[] ,
const ScalarCompact grad_z[] ,
const ScalarCompact total_stress12th[] ,
ScalarCompact force[][ SpatialDim ] ,
ScalarCompact & energy )
{
ScalarPrecise internal_energy = 0 ;
for ( unsigned inode = 0; inode < ElemNodeCount ; ++inode ) {
force[inode][0] = total_stress12th[K_S_XX] * grad_x[inode] +
total_stress12th[K_S_XY] * grad_y[inode] +
total_stress12th[K_S_XZ] * grad_z[inode] ;
force[inode][1] = total_stress12th[K_S_YX] * grad_x[inode] +
total_stress12th[K_S_YY] * grad_y[inode] +
total_stress12th[K_S_YZ] * grad_z[inode] ;
force[inode][2] = total_stress12th[K_S_ZX] * grad_x[inode] +
total_stress12th[K_S_ZY] * grad_y[inode] +
total_stress12th[K_S_ZZ] * grad_z[inode] ;
internal_energy += force[inode][0] * vx[inode] +
force[inode][1] * vy[inode] +
force[inode][2] * vz[inode] ;
}
energy = internal_energy ;
}
//--------------------------------------------------------------------------
};
} // namespace Explicit
#endif /* #ifndef KOKKOS_HEXEXPLICITFUNCTIONS_HPP */

View File

@ -1,341 +0,0 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef HYBRIDFEM_IMPLICIT_HPP
#define HYBRIDFEM_IMPLICIT_HPP
#include <utility>
#include <iostream>
#include <iomanip>
#include <Kokkos_Core.hpp>
#include <SparseLinearSystem.hpp>
#include <SparseLinearSystemFill.hpp>
#include <ImplicitFunctors.hpp>
#include <FEMesh.hpp>
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace HybridFEM {
namespace Implicit {
// Wall-clock timings (seconds) for each phase of one implicit FEM solve.
struct PerformanceData {
  double mesh_time ;                      // mesh generation
  double graph_time ;                     // sparse-graph construction
  double elem_time ;                      // element matrix/vector computation
  double matrix_gather_fill_time ;        // scatter of element data into the matrix
  double matrix_boundary_condition_time ; // boundary-condition application
  double cg_iteration_time ;              // conjugate-gradient solve

  // Every phase starts at zero.
  PerformanceData()
    : mesh_time(0) , graph_time(0) , elem_time(0)
    , matrix_gather_fill_time(0)
    , matrix_boundary_condition_time(0)
    , cg_iteration_time(0)
  {}

  // Fold 'rhs' into *this, keeping the fastest (minimum) time per phase.
  void best( const PerformanceData & rhs )
  {
    if ( rhs.mesh_time                      < mesh_time                      ) mesh_time                      = rhs.mesh_time ;
    if ( rhs.graph_time                     < graph_time                     ) graph_time                     = rhs.graph_time ;
    if ( rhs.elem_time                      < elem_time                      ) elem_time                      = rhs.elem_time ;
    if ( rhs.matrix_gather_fill_time        < matrix_gather_fill_time        ) matrix_gather_fill_time        = rhs.matrix_gather_fill_time ;
    if ( rhs.matrix_boundary_condition_time < matrix_boundary_condition_time ) matrix_boundary_condition_time = rhs.matrix_boundary_condition_time ;
    if ( rhs.cg_iteration_time              < cg_iteration_time              ) cg_iteration_time              = rhs.cg_iteration_time ;
  }
};
//----------------------------------------------------------------------------
// Assemble and solve one implicit FEM problem on the given mesh, returning
// per-phase wall-clock timings.  The first two size arguments are unused
// (kept for interface compatibility); global_max_z bounds the Dirichlet
// boundary.  When print_sample is set, rank 0-local sample solution values
// are written to stdout.
template< typename Scalar , class FixtureType >
PerformanceData run( const typename FixtureType::FEMeshType & mesh ,
const int , // global_max_x ,
const int , // global_max_y ,
const int global_max_z ,
const bool print_sample )
{
typedef Scalar scalar_type ;
typedef FixtureType fixture_type ;
typedef typename fixture_type::execution_space execution_space;
//typedef typename execution_space::size_type size_type ; // unused
typedef typename fixture_type::FEMeshType mesh_type ;
typedef typename fixture_type::coordinate_scalar_type coordinate_scalar_type ;
enum { ElementNodeCount = fixture_type::element_node_count };
const comm::Machine machine = mesh.parallel_data_map.machine ;
const size_t element_count = mesh.elem_node_ids.dimension_0();
// CG solver controls:
const size_t iteration_limit = 200 ;
const double residual_tolerance = 1e-14 ;
size_t iteration_count = 0 ;
double residual_norm = 0 ;
PerformanceData perf_data ;
//------------------------------------
// Sparse linear system types:
typedef Kokkos::View< scalar_type* , execution_space > vector_type ;
typedef Kokkos::CrsMatrix< scalar_type , execution_space > matrix_type ;
typedef typename matrix_type::graph_type matrix_graph_type ;
typedef typename matrix_type::coefficients_type matrix_coefficients_type ;
typedef GraphFactory< matrix_graph_type , mesh_type > graph_factory ;
//------------------------------------
// Problem setup types:
typedef ElementComputation< scalar_type , scalar_type , execution_space > ElementFunctor ;
typedef DirichletBoundary< scalar_type , scalar_type , execution_space > BoundaryFunctor ;
typedef typename ElementFunctor::elem_matrices_type elem_matrices_type ;
typedef typename ElementFunctor::elem_vectors_type elem_vectors_type ;
typedef GatherFill< matrix_type ,
mesh_type ,
elem_matrices_type ,
elem_vectors_type > GatherFillFunctor ;
//------------------------------------
// PDE coefficients: diffusion K and load Q.
const scalar_type elem_coeff_K = 2 ;
const scalar_type elem_load_Q = 1 ;
matrix_type linsys_matrix ;
vector_type linsys_rhs ;
vector_type linsys_solution ;
typename graph_factory::element_map_type element_map ;
// Timer starts at construction, so graph_time below covers graph creation.
Kokkos::Impl::Timer wall_clock ;
//------------------------------------
// Generate sparse matrix graph and element->graph map.
graph_factory::create( mesh , linsys_matrix.graph , element_map );
execution_space::fence();
// Reduce to the slowest rank so timings reflect the whole machine.
perf_data.graph_time = comm::max( machine , wall_clock.seconds() );
//------------------------------------
// Allocate linear system coefficients and rhs:
// (row_map has one extra entry, hence the -1 for the owned row count.)
const size_t local_owned_length =
linsys_matrix.graph.row_map.dimension_0() - 1 ;
linsys_matrix.coefficients =
matrix_coefficients_type( "coeff" , linsys_matrix.graph.entries.dimension_0() );
linsys_rhs = vector_type( "rhs" , local_owned_length );
linsys_solution = vector_type( "solution" , local_owned_length );
//------------------------------------
// Fill linear system
{
elem_matrices_type elem_matrices ;
elem_vectors_type elem_vectors ;
if ( element_count ) {
elem_matrices = elem_matrices_type( std::string("elem_matrices"), element_count );
elem_vectors = elem_vectors_type ( std::string("elem_vectors"), element_count );
}
//------------------------------------
// Compute element matrices and vectors:
wall_clock.reset();
ElementFunctor::apply( mesh ,
elem_matrices , elem_vectors ,
elem_coeff_K , elem_load_Q );
execution_space::fence();
perf_data.elem_time = comm::max( machine , wall_clock.seconds() );
//------------------------------------
// Fill linear system coefficients:
wall_clock.reset();
GatherFillFunctor::apply( linsys_matrix , linsys_rhs ,
mesh , element_map , elem_matrices , elem_vectors );
execution_space::fence();
perf_data.matrix_gather_fill_time = comm::max( machine , wall_clock.seconds() );
// Apply boundary conditions:
// NOTE(review): both coordinate ranges are passed as (0, global_max_z)
// while global_max_x/global_max_y are ignored above -- confirm intended.
wall_clock.reset();
BoundaryFunctor::apply( linsys_matrix , linsys_rhs , mesh ,
0 , global_max_z , 0 , global_max_z );
execution_space::fence();
perf_data.matrix_boundary_condition_time = comm::max( machine , wall_clock.seconds() );
}
//------------------------------------
// Solve linear system
cgsolve( mesh.parallel_data_map ,
linsys_matrix , linsys_rhs , linsys_solution ,
iteration_count , residual_norm ,
perf_data.cg_iteration_time ,
iteration_limit , residual_tolerance );
//------------------------------------
// Optionally print solution values at nodes with x <= 0 and y <= 0.
if ( print_sample ) {
typename mesh_type::node_coords_type::HostMirror coords_h =
Kokkos::create_mirror( mesh.node_coords );
typename vector_type::HostMirror X_h =
Kokkos::create_mirror( linsys_solution );
Kokkos::deep_copy( coords_h , mesh.node_coords );
Kokkos::deep_copy( X_h , linsys_solution );
for ( size_t i = 0 ; i < mesh.parallel_data_map.count_owned ; ++i ) {
const coordinate_scalar_type x = coords_h(i,0);
const coordinate_scalar_type y = coords_h(i,1);
const coordinate_scalar_type z = coords_h(i,2);
if ( x <= 0 && y <= 0 ) {
std::cout << " node( " << x << " " << y << " " << z << " ) = "
<< X_h(i) << std::endl ;
}
}
}
return perf_data ;
}
//----------------------------------------------------------------------------
template< typename Scalar , class Device >
void driver( const char * const label ,
comm::Machine machine ,
const int gang_count ,
const int elem_count_beg ,
const int elem_count_end ,
const int runs )
{
typedef Scalar scalar_type ;
typedef Device execution_space ;
typedef double coordinate_scalar_type ;
typedef FixtureElementHex8 fixture_element_type ;
typedef BoxMeshFixture< coordinate_scalar_type ,
execution_space ,
fixture_element_type > fixture_type ;
typedef typename fixture_type::FEMeshType mesh_type ;
const size_t proc_count = comm::size( machine );
const size_t proc_rank = comm::rank( machine );
if ( elem_count_beg == 0 || elem_count_end == 0 || runs == 0 ) return ;
if ( comm::rank( machine ) == 0 ) {
std::cout << std::endl ;
std::cout << "\"Kokkos::HybridFE::Implicit " << label << "\"" << std::endl;
std::cout << "\"Size\" , \"Graphing\" , \"Element\" , \"Fill\" , \"Boundary\" , \"CG-Iter\"" << std::endl
<< "\"elems\" , \"millisec\" , \"millisec\" , \"millisec\" , \"millisec\" , \"millisec\"" << std::endl ;
}
for(int i = elem_count_beg ; i < elem_count_end ; i *= 2 )
{
const int ix = std::max( 1 , (int) cbrt( ((double) i) / 2.0 ) );
const int iy = ix + 1 ;
const int iz = 2 * iy ;
const int n = ix * iy * iz ;
mesh_type mesh =
fixture_type::create( proc_count , proc_rank , gang_count ,
ix , iy , iz );
mesh.parallel_data_map.machine = machine ;
PerformanceData perf_data , perf_best ;
for(int j = 0; j < runs; j++){
perf_data = run<scalar_type,fixture_type>(mesh,ix,iy,iz, false );
if( j == 0 ) {
perf_best = perf_data ;
}
else {
perf_best.best( perf_data );
}
}
if ( comm::rank( machine ) == 0 ) {
std::cout << std::setw(8) << n << " , "
<< std::setw(10) << perf_best.graph_time * 1000 << " , "
<< std::setw(10) << perf_best.elem_time * 1000 << " , "
<< std::setw(10) << perf_best.matrix_gather_fill_time * 1000 << " , "
<< std::setw(10) << perf_best.matrix_boundary_condition_time * 1000 << " , "
<< std::setw(10) << perf_best.cg_iteration_time * 1000
<< std::endl ;
}
}
}
//----------------------------------------------------------------------------
} /* namespace Implicit */
} /* namespace HybridFEM */
#endif /* #ifndef HYBRIDFEM_IMPLICIT_HPP */

View File

@ -1,585 +0,0 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#include <iostream>
#include <fstream>
#include <iomanip>
#include <cstdlib>
#include <cmath>
namespace HybridFEM {
namespace Implicit {
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
// Gauss-Legendre integration rules on [-1,1]^Dim with N points per
// direction.  The 1-D rules are specialized explicitly; the 3-D rule is
// their tensor product.
template< typename Scalar , unsigned Dim , unsigned N >
struct TensorIntegration ;

// 1-D one-point rule (midpoint): exact for linear polynomials.
template<typename Scalar >
struct TensorIntegration<Scalar,1,1> {
  Scalar pts[1] ;
  Scalar wts[1] ;
  TensorIntegration() { pts[0] = 0 ; wts[0] = 2 ; }
};

// 1-D two-point Gauss rule: nodes at +/- 1/sqrt(3), unit weights.
template<typename Scalar >
struct TensorIntegration<Scalar,1,2>
{
  Scalar pts[2] ;
  Scalar wts[2] ;

  TensorIntegration()
  {
    const Scalar x2 = 0.577350269 ;
    pts[0] = -x2; wts[0] = 1.0;
    pts[1] =  x2; wts[1] = 1.0;
  }
};

// 1-D three-point Gauss rule: nodes at 0 and +/- sqrt(3/5).
template<typename Scalar >
struct TensorIntegration<Scalar,1,3>
{
  Scalar pts[3] ;
  Scalar wts[3] ;

  TensorIntegration()
  {
    const Scalar x3 = 0.774596669 ;
    const Scalar w1 = 0.555555556 ;
    const Scalar w2 = 0.888888889 ;
    pts[0] = -x3 ; wts[0] = w1 ;
    pts[1] =  0  ; wts[1] = w2 ;
    pts[2] =  x3 ; wts[2] = w1 ;
  }
};

// 3-D tensor-product rule: Order^3 points; point n = (k*Order + j)*Order + i
// combines the i-th, j-th, and k-th 1-D abscissae, and its weight is the
// product of the corresponding 1-D weights.
template< typename Scalar , unsigned Order >
struct TensorIntegration<Scalar,3,Order>
{
  static const unsigned N = Order * Order * Order ;

  Scalar pts[N][3] ;
  Scalar wts[N];

  TensorIntegration()
  {
    const TensorIntegration<Scalar,1,Order> line ;

    for ( unsigned k = 0 ; k < Order ; ++k ) {
    for ( unsigned j = 0 ; j < Order ; ++j ) {
    for ( unsigned i = 0 ; i < Order ; ++i ) {
      const unsigned n = ( k * Order + j ) * Order + i ;
      pts[n][0] = line.pts[i] ;
      pts[n][1] = line.pts[j] ;
      pts[n][2] = line.pts[k] ;
      wts[n]    = line.wts[i] * line.wts[j] * line.wts[k] ;
    }}}
  }
};
//----------------------------------------------------------------------------
// Precomputed values and parametric-coordinate gradients of the eight
// trilinear shape functions of the 8-node hexahedron, evaluated at the
// 2x2x2 Gauss integration points.
//   value[p][f]              : shape function f at integration point p
//   gradient[p][f*3 + d]     : d(shape f)/d(xi_d) at point p (d = 0,1,2)
//   weight[p]                : integration weight of point p
template< typename Scalar >
struct ShapeFunctionEvaluation {
static const unsigned FunctionCount = 8 ;
static const unsigned SpatialDimension = 3 ;
static const unsigned IntegrationOrder = 2 ;
typedef TensorIntegration< Scalar , SpatialDimension , IntegrationOrder >
TensorIntegrationType ;
static const unsigned PointCount = TensorIntegrationType::N ;
Scalar value [ PointCount ][ FunctionCount ] ;
Scalar gradient[ PointCount ][ FunctionCount * SpatialDimension ];
Scalar weight [ PointCount ];
// Evaluate all shape functions and gradients once at construction.
ShapeFunctionEvaluation()
{
const TensorIntegration< Scalar , SpatialDimension , IntegrationOrder >
integration ;
const Scalar ONE8TH = 0.125 ;
for ( unsigned i = 0 ; i < PointCount ; ++i ) {
// u,v,w = 1 - xi ; up1,vp1,wp1 = 1 + xi  (per parametric direction).
const Scalar u = 1.0 - integration.pts[i][0];
const Scalar v = 1.0 - integration.pts[i][1];
const Scalar w = 1.0 - integration.pts[i][2];
const Scalar up1 = 1.0 + integration.pts[i][0];
const Scalar vp1 = 1.0 + integration.pts[i][1];
const Scalar wp1 = 1.0 + integration.pts[i][2];
weight[i] = integration.wts[i] ;
// Values:
value[i][0] = ONE8TH * u * v * w ;
value[i][1] = ONE8TH * up1 * v * w ;
value[i][2] = ONE8TH * up1 * vp1 * w ;
value[i][3] = ONE8TH * u * vp1 * w ;
value[i][4] = ONE8TH * u * v * wp1 ;
value[i][5] = ONE8TH * up1 * v * wp1 ;
value[i][6] = ONE8TH * up1 * vp1 * wp1 ;
value[i][7] = ONE8TH * u * vp1 * wp1 ;
// Gradients: analytic derivatives of the products above, where
// d(u)/d(xi) = -1 and d(up1)/d(xi) = +1 (likewise for v,w).
//fn 0 = u * v * w
gradient[i][ 0] = ONE8TH * -1 * v * w ;
gradient[i][ 1] = ONE8TH * u * -1 * w ;
gradient[i][ 2] = ONE8TH * u * v * -1 ;
//fn 1 = up1 * v * w
gradient[i][ 3] = ONE8TH * 1 * v * w ;
gradient[i][ 4] = ONE8TH * up1 * -1 * w ;
gradient[i][ 5] = ONE8TH * up1 * v * -1 ;
//fn 2 = up1 * vp1 * w
gradient[i][ 6] = ONE8TH * 1 * vp1 * w ;
gradient[i][ 7] = ONE8TH * up1 * 1 * w ;
gradient[i][ 8] = ONE8TH * up1 * vp1 * -1 ;
//fn 3 = u * vp1 * w
gradient[i][ 9] = ONE8TH * -1 * vp1 * w ;
gradient[i][10] = ONE8TH * u * 1 * w ;
gradient[i][11] = ONE8TH * u * vp1 * -1 ;
//fn 4 = u * v * wp1
gradient[i][12] = ONE8TH * -1 * v * wp1 ;
gradient[i][13] = ONE8TH * u * -1 * wp1 ;
gradient[i][14] = ONE8TH * u * v * 1 ;
//fn 5 = up1 * v * wp1
gradient[i][15] = ONE8TH * 1 * v * wp1 ;
gradient[i][16] = ONE8TH * up1 * -1 * wp1 ;
gradient[i][17] = ONE8TH * up1 * v * 1 ;
//fn 6 = up1 * vp1 * wp1
gradient[i][18] = ONE8TH * 1 * vp1 * wp1 ;
gradient[i][19] = ONE8TH * up1 * 1 * wp1 ;
gradient[i][20] = ONE8TH * up1 * vp1 * 1 ;
//fn 7 = u * vp1 * wp1
gradient[i][21] = ONE8TH * -1 * vp1 * wp1 ;
gradient[i][22] = ONE8TH * u * 1 * wp1 ;
gradient[i][23] = ONE8TH * u * vp1 * 1 ;
}
}
};
//----------------------------------------------------------------------------
template< typename ScalarType , typename ScalarCoordType , class DeviceType >
struct ElementComputation
{
typedef DeviceType execution_space;
typedef ScalarType scalar_type ;
typedef typename execution_space::size_type size_type ;
static const size_type ElementNodeCount = 8 ;
typedef FEMesh< ScalarCoordType , ElementNodeCount , execution_space > mesh_type ;
typedef Kokkos::View< scalar_type[][ElementNodeCount][ElementNodeCount] , execution_space > elem_matrices_type ;
typedef Kokkos::View< scalar_type[][ElementNodeCount] , execution_space > elem_vectors_type ;
typedef ShapeFunctionEvaluation< scalar_type > shape_function_data ;
static const unsigned SpatialDim = shape_function_data::SpatialDimension ;
static const unsigned FunctionCount = shape_function_data::FunctionCount ;
private:
const shape_function_data shape_eval ;
typename mesh_type::elem_node_ids_type elem_node_ids ;
typename mesh_type::node_coords_type node_coords ;
elem_matrices_type element_matrices ;
elem_vectors_type element_vectors ;
scalar_type coeff_K ;
scalar_type coeff_Q ;
// Construct the per-element functor, capturing the mesh connectivity and
// coordinates, the output element matrices/vectors, and the PDE
// coefficients (diffusion K, load Q).
// NOTE(review): declared in the private section -- instances are created
// only through the static apply() entry point.
ElementComputation( const mesh_type & arg_mesh ,
const elem_matrices_type & arg_element_matrices ,
const elem_vectors_type & arg_element_vectors ,
const scalar_type arg_coeff_K ,
const scalar_type arg_coeff_Q )
: shape_eval()
, elem_node_ids( arg_mesh.elem_node_ids )
, node_coords( arg_mesh.node_coords )
, element_matrices( arg_element_matrices )
, element_vectors( arg_element_vectors )
, coeff_K( arg_coeff_K )
, coeff_Q( arg_coeff_Q )
{}
public:
// Entry point: build one functor and launch it in parallel over every
// element of the mesh, filling 'elem_matrices' and 'elem_vectors'.
static void apply( const mesh_type & mesh ,
const elem_matrices_type & elem_matrices ,
const elem_vectors_type & elem_vectors ,
const scalar_type elem_coeff_K ,
const scalar_type elem_coeff_Q )
{
// One functor instance is shared (by copy) across all parallel iterations.
ElementComputation comp( mesh , elem_matrices , elem_vectors , elem_coeff_K , elem_coeff_Q );
const size_t elem_count = mesh.elem_node_ids.dimension_0();
parallel_for( elem_count , comp );
}
//------------------------------------
static const unsigned FLOPS_jacobian =
FunctionCount * SpatialDim * SpatialDim * 2 ;
KOKKOS_INLINE_FUNCTION
void jacobian( const ScalarCoordType * x,
const ScalarCoordType * y,
const ScalarCoordType * z,
const scalar_type * grad_vals,
scalar_type * J) const
{
int i_grad = 0 ;
for( unsigned i = 0; i < ElementNodeCount ; ++i , i_grad += SpatialDim ) {
const scalar_type g0 = grad_vals[ i_grad ];
const scalar_type g1 = grad_vals[ i_grad + 1 ];
const scalar_type g2 = grad_vals[ i_grad + 2 ];
const scalar_type x0 = x[i] ;
const scalar_type x1 = y[i] ;
const scalar_type x2 = z[i] ;
J[0] += g0 * x0 ;
J[1] += g0 * x1 ;
J[2] += g0 * x2 ;
J[3] += g1 * x0 ;
J[4] += g1 * x1 ;
J[5] += g1 * x2 ;
J[6] += g2 * x0 ;
J[7] += g2 * x1 ;
J[8] += g2 * x2 ;
}
}
//------------------------------------
static const unsigned FLOPS_inverse_and_det = 46 ;
KOKKOS_INLINE_FUNCTION
scalar_type inverse_and_determinant3x3( scalar_type * const J ) const
{
const scalar_type J00 = J[0];
const scalar_type J01 = J[1];
const scalar_type J02 = J[2];
const scalar_type J10 = J[3];
const scalar_type J11 = J[4];
const scalar_type J12 = J[5];
const scalar_type J20 = J[6];
const scalar_type J21 = J[7];
const scalar_type J22 = J[8];
const scalar_type term0 = J22*J11 - J21*J12;
const scalar_type term1 = J22*J01 - J21*J02;
const scalar_type term2 = J12*J01 - J11*J02;
const scalar_type detJ = J00*term0 - J10*term1 + J20*term2;
const scalar_type inv_detJ = 1.0/detJ;
J[0] = term0*inv_detJ;
J[1] = -term1*inv_detJ;
J[2] = term2*inv_detJ;
J[3] = -(J22*J10 - J20*J12)*inv_detJ;
J[4] = (J22*J00 - J20*J02)*inv_detJ;
J[5] = -(J12*J00 - J10*J02)*inv_detJ;
J[6] = (J21*J10 - J20*J11)*inv_detJ;
J[7] = -(J21*J00 - J20*J01)*inv_detJ;
J[8] = (J11*J00 - J10*J01)*inv_detJ;
return detJ ;
}
//------------------------------------
KOKKOS_INLINE_FUNCTION
void matTransMat3x3_X_3xn( const scalar_type * A, int n,
const scalar_type * B,
scalar_type * C ) const
{
//A is 3x3, B is 3xn. So C is also 3xn.
//A,B,C are all assumed to be ordered such that columns are contiguous.
scalar_type * Cj = C;
const scalar_type * Bj = B;
for(int j=0; j<n; ++j) {
Cj[0] = A[0]*Bj[0] + A[1]*Bj[1] + A[2]*Bj[2];
Cj[1] = A[3]*Bj[0] + A[4]*Bj[1] + A[5]*Bj[2];
Cj[2] = A[6]*Bj[0] + A[7]*Bj[1] + A[8]*Bj[2];
Bj += 3;
Cj += 3;
}
}
//------------------------------------
static const unsigned FLOPS_contributeDiffusionMatrix = FunctionCount * ( 3 * 5 + FunctionCount * 7 ) ;
KOKKOS_INLINE_FUNCTION
void contributeDiffusionMatrix(
const scalar_type weight ,
const scalar_type grad_vals[] ,
const scalar_type invJ[] ,
scalar_type elem_mat[][8] ) const
{
scalar_type dpsidx[8], dpsidy[8], dpsidz[8];
int i_grad = 0 ;
for( unsigned i = 0; i < FunctionCount ; ++i , i_grad += 3 ) {
const scalar_type g0 = grad_vals[i_grad+0];
const scalar_type g1 = grad_vals[i_grad+1];
const scalar_type g2 = grad_vals[i_grad+2];
dpsidx[i] = g0 * invJ[0] + g1 * invJ[1] + g2 * invJ[2];
dpsidy[i] = g0 * invJ[3] + g1 * invJ[4] + g2 * invJ[5];
dpsidz[i] = g0 * invJ[6] + g1 * invJ[7] + g2 * invJ[8];
}
for( unsigned m = 0; m < FunctionCount; m++) {
for( unsigned n = 0; n < FunctionCount; n++) {
elem_mat[m][n] += weight *
((dpsidx[m] * dpsidx[n]) +
(dpsidy[m] * dpsidy[n]) +
(dpsidz[m] * dpsidz[n]));
}
}
}
//------------------------------------
static const unsigned FLOPS_contributeSourceVector = FunctionCount * 2 ;
KOKKOS_INLINE_FUNCTION
void contributeSourceVector( const scalar_type term ,
const scalar_type psi[] ,
scalar_type elem_vec[] ) const
{
for( unsigned i=0; i< FunctionCount ; ++i) {
elem_vec[i] += psi[i] * term ;
}
}
static const unsigned FLOPS_operator =
shape_function_data::PointCount * ( 3
+ FLOPS_jacobian
+ FLOPS_inverse_and_det
+ FLOPS_contributeDiffusionMatrix
+ FLOPS_contributeSourceVector ) ;
KOKKOS_INLINE_FUNCTION
void operator()( int ielem )const {
scalar_type elem_vec[8] = { 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 };
scalar_type elem_mat[8][8] =
{ { 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 } ,
{ 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 } ,
{ 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 } ,
{ 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 } ,
{ 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 } ,
{ 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 } ,
{ 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 } ,
{ 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 } };
ScalarCoordType x[8], y[8], z[8];
for ( int i = 0 ; i < 8 ; ++i ) {
const int node_index = elem_node_ids( ielem , i );
x[i] = node_coords( node_index , 0 );
y[i] = node_coords( node_index , 1 );
z[i] = node_coords( node_index , 2 );
}
// This loop could be parallelized; however,
// it would require additional per-thread temporaries
// of 'elem_vec' and 'elem_mat' which would
// consume more local memory and have to be reduced.
for ( unsigned i = 0 ; i < shape_function_data::PointCount ; ++i ) {
scalar_type J[SpatialDim*SpatialDim] = { 0, 0, 0, 0, 0, 0, 0, 0, 0 };
jacobian( x, y, z, shape_eval.gradient[i] , J );
// Overwrite J with its inverse to save scratch memory space.
const scalar_type detJ_w = shape_eval.weight[i] * inverse_and_determinant3x3(J);
const scalar_type k_detJ_w = coeff_K * detJ_w ;
const scalar_type Q_detJ_w = coeff_Q * detJ_w ;
contributeDiffusionMatrix( k_detJ_w , shape_eval.gradient[i] , J , elem_mat );
contributeSourceVector( Q_detJ_w , shape_eval.value[i] , elem_vec );
}
for( size_type i=0; i< ElementNodeCount ; ++i) {
element_vectors(ielem, i) = elem_vec[i] ;
}
for( size_type i = 0; i < ElementNodeCount ; i++){
for( size_type j = 0; j < ElementNodeCount ; j++){
element_matrices(ielem, i, j) = elem_mat[i][j] ;
}
}
}
}; /* ElementComputation */
//----------------------------------------------------------------------------
// Imposes Dirichlet boundary conditions on the z-extreme faces of the
// mesh while keeping the assembled linear system symmetric: boundary
// rows become identity rows with a prescribed right-hand side, and
// boundary columns of interior rows are folded into the load vector.
template< typename ScalarType , typename ScalarCoordType , class DeviceType >
struct DirichletBoundary
{
  typedef DeviceType execution_space;
  typedef typename execution_space::size_type size_type ;

  static const size_type ElementNodeCount = 8 ;

  typedef Kokkos::CrsMatrix< ScalarType , execution_space > matrix_type ;
  typedef Kokkos::View< ScalarType[] , execution_space > vector_type ;
  typedef FEMesh< ScalarCoordType , ElementNodeCount , execution_space > mesh_type ;

  typename mesh_type::node_coords_type node_coords ;
  matrix_type matrix ;
  vector_type rhs ;
  ScalarCoordType bc_lower_z ;
  ScalarCoordType bc_upper_z ;
  ScalarType bc_lower_value ;
  ScalarType bc_upper_value ;

  // Process one matrix row (one mesh node).
  KOKKOS_INLINE_FUNCTION
  void operator()( size_type inode ) const
  {
    const size_type row_begin = matrix.graph.row_map[inode];
    const size_type row_end   = matrix.graph.row_map[inode+1];

    const ScalarCoordType z = node_coords(inode,2);

    const bool on_lower = z <= bc_lower_z ;
    const bool on_upper = bc_upper_z <= z ;

    if ( on_lower || on_upper ) {
      // Boundary row: prescribe the solution value in the rhs and
      // replace the row with the matching row of the identity matrix.
      rhs(inode) = on_lower ? bc_lower_value : bc_upper_value ;

      for ( size_type i = row_begin ; i < row_end ; ++i ) {
        matrix.coefficients(i) =
          ( (int) inode == matrix.graph.entries(i) ) ? 1 : 0 ;
      }
    }
    else {
      // Interior row: zero out each column belonging to a boundary node
      // and move its known contribution into the load vector.
      // (Lower-face test takes precedence, matching the row treatment.)
      for ( size_type i = row_begin ; i < row_end ; ++i ) {
        const size_type cnode = matrix.graph.entries(i) ;
        const ScalarCoordType col_z = node_coords(cnode,2);

        if ( col_z <= bc_lower_z ) {
          rhs( inode ) -= bc_lower_value * matrix.coefficients(i);
          matrix.coefficients(i) = 0 ;
        }
        else if ( bc_upper_z <= col_z ) {
          rhs( inode ) -= bc_upper_value * matrix.coefficients(i);
          matrix.coefficients(i) = 0 ;
        }
      }
    }
  }

  // Populate a functor and run it over every row of the matrix.
  static void apply( const matrix_type & linsys_matrix ,
                     const vector_type & linsys_rhs ,
                     const mesh_type & mesh ,
                     const ScalarCoordType bc_lower_z ,
                     const ScalarCoordType bc_upper_z ,
                     const ScalarType bc_lower_value ,
                     const ScalarType bc_upper_value )
  {
    DirichletBoundary functor ;

    functor.node_coords    = mesh.node_coords ;
    functor.matrix         = linsys_matrix ;
    functor.rhs            = linsys_rhs ;
    functor.bc_lower_z     = bc_lower_z ;
    functor.bc_upper_z     = bc_upper_z ;
    functor.bc_lower_value = bc_lower_value ;
    functor.bc_upper_value = bc_upper_value ;

    // One row per node; row_map has (row_count + 1) entries.
    const size_t row_count = linsys_matrix.graph.row_map.dimension_0() - 1 ;

    parallel_for( row_count , functor );
  }
};
//----------------------------------------------------------------------------
} /* namespace Implicit */
} /* namespace HybridFEM */

View File

@ -1,567 +0,0 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef USESCASES_LINALG_BLAS_HPP
#define USESCASES_LINALG_BLAS_HPP
#include <cmath>
#include <utility>
#include <ParallelComm.hpp>
#include <Kokkos_Core.hpp>
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
template< class Scalar , class Layout , class DeviceType > struct Dot ;
template< class Scalar , class Layout , class DeviceType > struct Dot1 ;
template< typename ScalarA ,
typename ScalarY ,
class Layout , class Device >
struct Scale ;
template< typename ScalarA ,
typename ScalarY ,
class Layout , class Device >
struct Fill ;
template< typename ScalarA ,
typename ScalarX ,
typename ScalarY ,
class Layout , class Device >
struct AXPY ;
template< typename ScalarX ,
typename ScalarB ,
typename ScalarY ,
class Layout , class Device >
struct XPBY ;
template< typename ScalarA ,
typename ScalarX ,
typename ScalarB ,
typename ScalarY ,
typename ScalarW ,
class Layout , class Device >
struct WAXPBY ;
}
}
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
//----------------------------------------------------------------------------
#if defined( KOKKOS_HAVE_MPI )
// Global dot product of two distributed vectors: the local contribution
// is reduced on-device, then summed across all MPI ranks.
template< typename ScalarX /* const or non-const */ ,
          typename ScalarY /* const or non-const */ ,
          class L , class D ,
          class MX /* any management type */ ,
          class MY /* any management type */ >
inline
double dot( const size_t n ,
            const View< ScalarX * , L , D , MX > & x ,
            const View< ScalarY * , L , D , MY > & y ,
            comm::Machine machine )
{
  double local_result = 0 ;

  Impl::Dot< ScalarX , L , D >( n , x , y , local_result );

  double global_result = 0 ;

  MPI_Allreduce( & local_result , & global_result , 1 ,
                 MPI_DOUBLE , MPI_SUM , machine.mpi_comm );

  return global_result ;
}
#else
// Serial (no-MPI) build: the local device reduction is the global result.
template< typename ScalarX /* const or non-const */ ,
          typename ScalarY /* const or non-const */ ,
          class L , class D ,
          class MX /* any management type */ ,
          class MY /* any management type */ >
inline
double dot( const size_t n ,
            const View< ScalarX * , L , D , MX > & x ,
            const View< ScalarY * , L , D , MY > & y ,
            comm::Machine )
{
  double result = 0 ;

  Impl::Dot< ScalarX , L , D >( n , x , y , result );

  return result ;
}
#endif
//----------------------------------------------------------------------------
#if defined( KOKKOS_HAVE_MPI )
// Global self-dot (sum of squares) of a distributed vector.
template< typename ScalarX /* const or non-const */ ,
          class L , class D ,
          class MX /* any management type */ >
inline
double dot( const size_t n ,
            const View< ScalarX * , L , D , MX > & x ,
            comm::Machine machine )
{
  double local_result = 0 ;

  Impl::Dot1< ScalarX , L , D >( n , x , local_result );

  double global_result = 0 ;

  MPI_Allreduce( & local_result , & global_result , 1 ,
                 MPI_DOUBLE , MPI_SUM , machine.mpi_comm );

  return global_result ;
}
#else
// Serial (no-MPI) build: self-dot reduces on-device only.
template< typename ScalarX /* const or non-const */ ,
          class L , class D ,
          class MX /* any management type */ >
inline
double dot( const size_t n ,
            const View< ScalarX * , L , D , MX > & x ,
            comm::Machine )
{
  double result = 0 ;

  Impl::Dot1< ScalarX , L , D >( n , x , result );

  return result ;
}
#endif
//----------------------------------------------------------------------------
// Euclidean norm: sqrt of the (globally reduced) self-dot product.
template< typename ScalarX /* const or non-const */ ,
          class L , class D ,
          class MX /* any management type */ >
inline
double norm2( const size_t n ,
              const View< ScalarX * , L , D , MX > & x ,
              comm::Machine machine )
{
  const double sum_of_squares = dot( n , x , machine );

  return std::sqrt( sum_of_squares );
}
//----------------------------------------------------------------------------
template< typename ScalarA ,
typename ScalarX ,
class L ,
class D ,
class MX >
void scale( const size_t n ,
const ScalarA & alpha ,
const View< ScalarX * , L , D , MX > & x )
{
Impl::Scale< ScalarA , ScalarX , L , D >( n , alpha , x );
}
template< typename ScalarA ,
typename ScalarX ,
class L ,
class D ,
class MX >
void fill( const size_t n ,
const ScalarA & alpha ,
const View< ScalarX * , L , D , MX > & x )
{
Impl::Fill< ScalarA , ScalarX , L , D >( n , alpha , x );
}
//----------------------------------------------------------------------------
template< typename ScalarA ,
typename ScalarX ,
typename ScalarY ,
class L ,
class D ,
class MX ,
class MY >
void axpy( const size_t n ,
const ScalarA & alpha ,
const View< ScalarX *, L , D , MX > & x ,
const View< ScalarY *, L , D , MY > & y )
{
Impl::AXPY< ScalarA, ScalarX, ScalarY , L , D >( n, alpha, x, y );
}
//----------------------------------------------------------------------------
template< typename ScalarX ,
typename ScalarB ,
typename ScalarY ,
class L ,
class D ,
class MX ,
class MY >
void xpby( const size_t n ,
const View< ScalarX *, L , D , MX > & x ,
const ScalarB & beta ,
const View< ScalarY *, L , D , MY > & y )
{
Impl::XPBY< ScalarX, ScalarB, ScalarY , L , D >( n, x, beta, y );
}
//----------------------------------------------------------------------------
// w = alpha * x + beta * y
template< typename ScalarA ,
typename ScalarX ,
typename ScalarB ,
typename ScalarY ,
typename ScalarW ,
class L , class D ,
class MX , class MY , class MW >
void waxpby( const size_t n ,
const ScalarA & alpha ,
const View< ScalarX * , L , D , MX > & x ,
const ScalarB & beta ,
const View< ScalarY * , L , D , MY > & y ,
const View< ScalarW * , L , D , MW > & w )
{
Impl::WAXPBY<ScalarA,ScalarX,ScalarB,ScalarY,ScalarW,L,D>
( n , alpha , x , beta , y , w );
}
}
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
// Reduction functor: result = sum_i x(i) * y(i).
// The constructor launches the parallel_reduce immediately; the object
// exists only to run the kernel. 'join'/'init' implement the Kokkos
// reduction interface. Note both views use the same element type Scalar;
// a ScalarY argument is converted on construction.
template< typename Scalar , class L , class D >
struct Dot
{
private:

  typedef View< const Scalar*, L, D, MemoryUnmanaged > vector_const_type ;

  const vector_const_type x ;
  const vector_const_type y ;

public:

  typedef typename vector_const_type::execution_space execution_space ; // Manycore device
  typedef double value_type ; // Reduction value

  // Construct and immediately reduce over [0,n) into 'result'.
  template< class ArgX , class ArgY >
  inline
  Dot( const size_t n , const ArgX & arg_x , const ArgY & arg_y , double & result )
    : x( arg_x ), y( arg_y )
  {
    parallel_reduce( n , *this , result );
  }

  // Per-index contribution to the partial sum.
  template< typename iType >
  KOKKOS_INLINE_FUNCTION
  void operator()( const iType & i , value_type & update ) const
  { update += x(i) * y(i); }

  // Combine two partial sums.
  KOKKOS_INLINE_FUNCTION
  static void join( volatile value_type & update ,
                    const volatile value_type & source )
  { update += source; }

  // Identity value for the sum reduction.
  KOKKOS_INLINE_FUNCTION
  static void init( value_type & update )
  { update = 0 ; }
}; // Dot
//----------------------------------------------------------------------------
// Reduction functor: result = sum_i x(i) * x(i) (self dot product).
// The constructor launches the parallel_reduce immediately.
template< typename Scalar , class L , class D >
struct Dot1
{
private:

  typedef View< const Scalar*, L, D , MemoryUnmanaged > vector_const_type ;

  const vector_const_type x ;

public:

  typedef typename vector_const_type::execution_space execution_space ; // Manycore device
  typedef double value_type ; // Reduction value

  // Construct and immediately reduce over [0,n) into 'result'.
  template< class ArgX >
  inline
  Dot1( const size_t n , const ArgX & arg_x , double & result )
    : x( arg_x )
  {
    parallel_reduce( n , *this , result );
  }

  // Per-index contribution: the square of x(i).
  template< typename iType >
  KOKKOS_INLINE_FUNCTION
  void operator()( const iType & i , value_type & update ) const
  { update += x(i) * x(i) ; }

  // Combine two partial sums.
  KOKKOS_INLINE_FUNCTION
  static void join( volatile value_type & update ,
                    const volatile value_type & source )
  { update += source ; }

  // Identity value for the sum reduction.
  KOKKOS_INLINE_FUNCTION
  static void init( value_type & update )
  { update = 0 ; }
}; // Dot1
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
// Functor: w(i) = alpha * x(i) + beta * y(i) for i in [0,n).
// The constructor launches the parallel_for immediately.
template < typename ScalarA ,
           typename ScalarX ,
           typename ScalarB ,
           typename ScalarY ,
           typename ScalarW ,
           class L , class D >
struct WAXPBY
{
private:

  typedef View< ScalarW *, L , D , MemoryUnmanaged > ViewW ;
  typedef View< const ScalarX *, L , D , MemoryUnmanaged > ViewX ;
  typedef View< const ScalarY *, L , D , MemoryUnmanaged > ViewY ;

  const ViewW w ;
  const ViewX x ;
  const ViewY y ;
  const ScalarA alpha ;
  const ScalarB beta ;

public:

  typedef typename ViewW::execution_space execution_space ;

  // Elementwise update of the output vector.
  template< typename iType >
  KOKKOS_INLINE_FUNCTION
  void operator()( const iType inode ) const
  {
    w(inode) = alpha * x(inode) + beta * y(inode);
  }

  // Construct and immediately run the kernel over [0,n).
  template< class ArgX , class ArgY , class ArgW >
  inline
  WAXPBY( const size_t n ,
          const ScalarA & arg_alpha ,
          const ArgX & arg_x ,
          const ScalarB & arg_beta ,
          const ArgY & arg_y ,
          const ArgW & arg_w )
    : w( arg_w ), x( arg_x ), y( arg_y )
    , alpha( arg_alpha ), beta( arg_beta )
  {
    parallel_for( n , *this );
  }
}; // WAXPBY
//----------------------------------------------------------------------------
// Functor: w(i) *= beta for i in [0,n).
// The constructor launches the parallel_for immediately.
template < typename ScalarB ,
           typename ScalarW ,
           class L , class D >
struct Scale
{
private:

  typedef View< ScalarW *, L , D , MemoryUnmanaged > ViewW ;

  const ViewW w ;
  const ScalarB beta ;

public:

  typedef typename ViewW::execution_space execution_space ;

  // In-place elementwise scaling.
  template< typename iType >
  KOKKOS_INLINE_FUNCTION
  void operator()( const iType & i ) const
  { w(i) *= beta ; }

  // Construct and immediately run the kernel over [0,n).
  template< class ArgW >
  inline
  Scale( const size_t n , const ScalarB & arg_beta , const ArgW & arg_w )
    : w( arg_w )
    , beta( arg_beta )
  {
    parallel_for( n , *this );
  }
};
// Functor: w(i) = beta for i in [0,n).
// The constructor launches the parallel_for immediately.
template < typename ScalarB ,
           typename ScalarW ,
           class L , class D >
struct Fill
{
private:

  typedef View< ScalarW *, L , D , MemoryUnmanaged > ViewW ;

  const ViewW w ;
  const ScalarB beta ;

public:

  typedef typename ViewW::execution_space execution_space ;

  // Elementwise assignment of the fill value.
  template< typename iType >
  KOKKOS_INLINE_FUNCTION
  void operator()( const iType & i ) const
  { w(i) = beta ; }

  // Construct and immediately run the kernel over [0,n).
  template< class ArgW >
  inline
  Fill( const size_t n , const ScalarB & arg_beta , const ArgW & arg_w )
    : w( arg_w )
    , beta( arg_beta )
  {
    parallel_for( n , *this );
  }
};
//----------------------------------------------------------------------------
// Functor: w(i) += alpha * x(i) for i in [0,n).
// The constructor launches the parallel_for immediately.
template < typename ScalarA ,
           typename ScalarX ,
           typename ScalarW ,
           class L , class D >
struct AXPY
{
private:

  typedef View< ScalarW *, L , D , MemoryUnmanaged > ViewW ;
  typedef View< const ScalarX *, L , D , MemoryUnmanaged > ViewX ;

  const ViewW w ;
  const ViewX x ;
  const ScalarA alpha ;

public:

  typedef typename ViewW::execution_space execution_space ;

  // Elementwise accumulate into w.
  template< typename iType >
  KOKKOS_INLINE_FUNCTION
  void operator()( const iType & i ) const
  { w(i) += alpha * x(i); }

  // Construct and immediately run the kernel over [0,n).
  template< class ArgX , class ArgW >
  inline
  AXPY( const size_t n ,
        const ScalarA & arg_alpha ,
        const ArgX & arg_x ,
        const ArgW & arg_w )
    : w( arg_w ), x( arg_x )
    , alpha( arg_alpha )
  {
    parallel_for( n , *this );
  }
}; // AXPY
// Functor: w(i) = x(i) + beta * w(i) for i in [0,n) (in-place on w).
// The constructor launches the parallel_for immediately.
template< typename ScalarX ,
          typename ScalarB ,
          typename ScalarW ,
          class L , class D >
struct XPBY
{
private:

  typedef View< ScalarW *, L , D , MemoryUnmanaged > ViewW ;
  typedef View< const ScalarX *, L , D , MemoryUnmanaged > ViewX ;

  const ViewW w ;
  const ViewX x ;
  const ScalarB beta ;

public:

  typedef typename ViewW::execution_space execution_space ;

  // Elementwise update; reads and writes w(i).
  template< typename iType >
  KOKKOS_INLINE_FUNCTION
  void operator()( const iType & i ) const
  { w(i) = x(i) + beta * w(i); }

  // Construct and immediately run the kernel over [0,n).
  template< class ArgX , class ArgW >
  inline
  XPBY( const size_t n ,
        const ArgX & arg_x ,
        const ScalarB & arg_beta ,
        const ArgW & arg_w )
    : w( arg_w ), x( arg_x )
    , beta( arg_beta )
  {
    parallel_for( n , *this );
  }
}; // XPBY
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
} // namespace Impl
} // namespace Kokkos
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
#endif /* #ifndef USESCASES_LINALG_BLAS_HPP */

View File

@ -1,53 +0,0 @@
# Build one executable from every .cpp beside this Makefile against an
# installed Kokkos. KOKKOS_PATH points at the Kokkos tree (default: two
# directories up, i.e. building from within the Kokkos source tree).
KOKKOS_PATH ?= ../..
# Resolve the directory containing this Makefile so the build also works
# when invoked from a different working directory.
MAKEFILE_PATH := $(abspath $(lastword $(MAKEFILE_LIST)))
SRC_DIR := $(dir $(MAKEFILE_PATH))
SRC = $(wildcard $(SRC_DIR)/*.cpp)
OBJ = $(SRC:$(SRC_DIR)/%.cpp=%.o)
#SRC = $(wildcard *.cpp)
#OBJ = $(SRC:%.cpp=%.o)
default: build
echo "Start Build"
# use installed Makefile.kokkos
include $(KOKKOS_PATH)/Makefile.kokkos
# Toolchain selection: when the Cuda backend is enabled use the nvcc
# wrapper and link cusparse; otherwise plain g++. The executable suffix
# (.cuda / .host) records which backend was built.
ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES)))
CXX = $(NVCC_WRAPPER)
CXXFLAGS = -I$(SRC_DIR) -I$(CUDA_PATH) -O3
LINK = $(CXX)
LINKFLAGS = -L$(CUDA_PATH)/lib64 -lcusparse
EXE = $(addsuffix .cuda, $(shell basename $(SRC_DIR)))
#KOKKOS_DEVICES = "Cuda,OpenMP"
#KOKKOS_ARCH = "SNB,Kepler35"
else
CXX = g++
CXXFLAGS = -I$(SRC_DIR) -O3
LINK = $(CXX)
LINKFLAGS =
EXE = $(addsuffix .host, $(shell basename $(SRC_DIR)))
#KOKKOS_DEVICES = "OpenMP"
#KOKKOS_ARCH = "SNB"
endif
DEPFLAGS = -M
LIB =
# Link step: combine all objects with the Kokkos libraries and flags.
build: $(EXE)
$(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS)
$(LINK) $(KOKKOS_LDFLAGS) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE)
# Remove objects, archives, and both backend executables.
clean:
rm -f *.a *.o *.cuda *.host
# Compilation rules
%.o:$(SRC_DIR)/%.cpp $(KOKKOS_CPP_DEPENDS)
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $<

View File

@ -1,573 +0,0 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef HYBRIDFEM_NONLINEAR_HPP
#define HYBRIDFEM_NONLINEAR_HPP
#include <cmath>
#include <utility>
#include <iostream>
#include <iomanip>
#include <Kokkos_Core.hpp>
#include <SparseLinearSystem.hpp>
#include <SparseLinearSystemFill.hpp>
#include <NonlinearFunctors.hpp>
#include <FEMesh.hpp>
#include <HexElement.hpp>
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace HybridFEM {
namespace Nonlinear {
// Timing and quality metrics gathered from one run of the nonlinear solve.
// All metrics start at zero; 'best' keeps the minimum of two runs.
struct PerformanceData {
  double mesh_time ;
  double graph_time ;
  double elem_time ;
  double matrix_gather_fill_time ;
  double matrix_boundary_condition_time ;
  double cg_iteration_time ;
  size_t cg_iteration_count ;
  size_t newton_iteration_count ;
  double error_max ;

  // Zero every metric.
  PerformanceData()
  {
    mesh_time = 0 ;
    graph_time = 0 ;
    elem_time = 0 ;
    matrix_gather_fill_time = 0 ;
    matrix_boundary_condition_time = 0 ;
    cg_iteration_time = 0 ;
    cg_iteration_count = 0 ;
    newton_iteration_count = 0 ;
    error_max = 0 ;
  }

  // Keep, field by field, the smaller of this run's metric and rhs's.
  void best( const PerformanceData & rhs )
  {
    if ( rhs.mesh_time < mesh_time ) { mesh_time = rhs.mesh_time ; }
    if ( rhs.graph_time < graph_time ) { graph_time = rhs.graph_time ; }
    if ( rhs.elem_time < elem_time ) { elem_time = rhs.elem_time ; }
    if ( rhs.matrix_gather_fill_time < matrix_gather_fill_time ) {
      matrix_gather_fill_time = rhs.matrix_gather_fill_time ;
    }
    if ( rhs.matrix_boundary_condition_time < matrix_boundary_condition_time ) {
      matrix_boundary_condition_time = rhs.matrix_boundary_condition_time ;
    }
    if ( rhs.cg_iteration_time < cg_iteration_time ) {
      cg_iteration_time = rhs.cg_iteration_time ;
    }
    if ( rhs.cg_iteration_count < cg_iteration_count ) {
      cg_iteration_count = rhs.cg_iteration_count ;
    }
    if ( rhs.newton_iteration_count < newton_iteration_count ) {
      newton_iteration_count = rhs.newton_iteration_count ;
    }
    if ( rhs.error_max < error_max ) { error_max = rhs.error_max ; }
  }
};
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
class ManufacturedSolution {
public:

  // Manufactured solution for the one dimensional nonlinear PDE
  //
  //   -K T_zz + T^2 = 0 ; T(zmin) = T_zmin ; T(zmax) = T_zmax
  //
  // having an analytic solution of the form:
  //
  //   T(z) = ( a ( z - zmin ) + b )^(-2)  where  K = 1 / ( 6 a^2 )
  //
  // Given T(zmin) and T(zmax), compute K for this analytic solution.
  //
  // Two coefficient families satisfy the boundary conditions:
  //
  // Solution with singularity:
  //   a = ( 1.0 / sqrt(T_zmax) + 1.0 / sqrt(T_zmin) ) / ( zmax - zmin )
  //   b = -1.0 / sqrt(T_zmin)
  //
  // Solution without singularity (the one constructed here):
  //   a = ( 1.0 / sqrt(T_zmax) - 1.0 / sqrt(T_zmin) ) / ( zmax - zmin )
  //   b = 1.0 / sqrt(T_zmin)

  const double zmin ;    // lower z bound of the domain
  const double zmax ;    // upper z bound of the domain
  const double T_zmin ;  // prescribed solution value at zmin (must be > 0)
  const double T_zmax ;  // prescribed solution value at zmax (must be > 0)
  const double a ;       // coefficient of the analytic solution
  const double b ;       // coefficient of the analytic solution
  const double K ;       // diffusion coefficient consistent with (a,b)

  ManufacturedSolution( const double arg_zmin ,
                        const double arg_zmax ,
                        const double arg_T_zmin ,
                        const double arg_T_zmax )
    : zmin( arg_zmin )
    , zmax( arg_zmax )
    , T_zmin( arg_T_zmin )
    , T_zmax( arg_T_zmax )
      // Use std::sqrt explicitly rather than the unqualified C 'sqrt',
      // so the code does not depend on a transitively included <math.h>.
    , a( ( 1.0 / std::sqrt(T_zmax) - 1.0 / std::sqrt(T_zmin) ) / ( zmax - zmin ) )
    , b( 1.0 / std::sqrt(T_zmin) )
    , K( 1.0 / ( 6.0 * a * a ) )
    {}

  // Evaluate the analytic solution T at coordinate z.
  double operator()( const double z ) const
  {
    const double tmp = a * ( z - zmin ) + b ;
    return 1.0 / ( tmp * tmp );
  }
};
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
template< typename Scalar , class FixtureType >
PerformanceData run( const typename FixtureType::FEMeshType & mesh ,
const int , // global_max_x ,
const int , // global_max_y ,
const int global_max_z ,
const bool print_error )
{
typedef Scalar scalar_type ;
typedef FixtureType fixture_type ;
typedef typename fixture_type::execution_space execution_space;
//typedef typename execution_space::size_type size_type ; // unused
typedef typename fixture_type::FEMeshType mesh_type ;
typedef typename fixture_type::coordinate_scalar_type coordinate_scalar_type ;
enum { ElementNodeCount = fixture_type::element_node_count };
const comm::Machine machine = mesh.parallel_data_map.machine ;
const size_t element_count = mesh.elem_node_ids.dimension_0();
//------------------------------------
// The amount of nonlinearity is proportional to the ratio
// between T(zmax) and T(zmin). For the manufactured solution
// 0 < T(zmin) and 0 < T(zmax)
const ManufacturedSolution
exact_solution( /* zmin */ 0 ,
/* zmax */ global_max_z ,
/* T(zmin) */ 1 ,
/* T(zmax) */ 20 );
//-----------------------------------
// Convergence Criteria and perf data:
const size_t cg_iteration_limit = 200 ;
const double cg_tolerance = 1e-14 ;
const size_t newton_iteration_limit = 150 ;
const double newton_tolerance = 1e-14 ;
size_t cg_iteration_count_total = 0 ;
double cg_iteration_time = 0 ;
size_t newton_iteration_count = 0 ;
double residual_norm_init = 0 ;
double residual_norm = 0 ;
PerformanceData perf_data ;
//------------------------------------
// Sparse linear system types:
typedef Kokkos::View< scalar_type* , execution_space > vector_type ;
typedef Kokkos::CrsMatrix< scalar_type , execution_space > matrix_type ;
typedef typename matrix_type::graph_type matrix_graph_type ;
typedef typename matrix_type::coefficients_type matrix_coefficients_type ;
typedef GraphFactory< matrix_graph_type , mesh_type > graph_factory ;
//------------------------------------
// Problem setup types:
typedef ElementComputation < mesh_type , scalar_type > ElementFunctor ;
typedef DirichletSolution < mesh_type , scalar_type > DirichletSolutionFunctor ;
typedef DirichletResidual < mesh_type , scalar_type > DirichletResidualFunctor ;
typedef typename ElementFunctor::elem_matrices_type elem_matrices_type ;
typedef typename ElementFunctor::elem_vectors_type elem_vectors_type ;
typedef GatherFill< matrix_type ,
mesh_type ,
elem_matrices_type ,
elem_vectors_type > GatherFillFunctor ;
//------------------------------------
matrix_type jacobian ;
vector_type residual ;
vector_type delta ;
vector_type nodal_solution ;
typename graph_factory::element_map_type element_map ;
//------------------------------------
// Generate mesh and corresponding sparse matrix graph
Kokkos::Impl::Timer wall_clock ;
//------------------------------------
// Generate sparse matrix graph and element->graph map.
wall_clock.reset();
graph_factory::create( mesh , jacobian.graph , element_map );
execution_space::fence();
perf_data.graph_time = comm::max( machine , wall_clock.seconds() );
//------------------------------------
// Allocate linear system coefficients and rhs:
const size_t local_owned_length = jacobian.graph.row_map.dimension_0() - 1 ;
const size_t local_total_length = mesh.node_coords.dimension_0();
jacobian.coefficients =
matrix_coefficients_type( "jacobian_coeff" , jacobian.graph.entries.dimension_0() );
// Nonlinear residual for owned nodes:
residual = vector_type( "residual" , local_owned_length );
// Nonlinear solution for owned and ghosted nodes:
nodal_solution = vector_type( "solution" , local_total_length );
// Nonlinear solution update for owned nodes:
delta = vector_type( "delta" , local_owned_length );
//------------------------------------
// Allocation of arrays to fill the linear system
elem_matrices_type elem_matrices ; // Jacobian matrices
elem_vectors_type elem_vectors ; // Residual vectors
if ( element_count ) {
elem_matrices = elem_matrices_type( std::string("elem_matrices"), element_count );
elem_vectors = elem_vectors_type( std::string("elem_vectors"), element_count );
}
//------------------------------------
// For boundary condition set the correct values in the solution vector
// The 'zmin' face is assigned to 'T_zmin'.
// The 'zmax' face is assigned to 'T_zmax'.
// The resulting solution is one dimensional along the 'Z' axis.
DirichletSolutionFunctor::apply( nodal_solution , mesh ,
exact_solution.zmin ,
exact_solution.zmax ,
exact_solution.T_zmin ,
exact_solution.T_zmax );
for(;;) { // Nonlinear loop
#if defined( KOKKOS_HAVE_MPI )
{ //------------------------------------
// Import off-processor nodal solution values
// for residual and jacobian computations
Kokkos::AsyncExchange< typename vector_type::value_type , execution_space ,
Kokkos::ParallelDataMap >
exchange( mesh.parallel_data_map , 1 );
Kokkos::PackArray< vector_type >
::pack( exchange.buffer() ,
mesh.parallel_data_map.count_interior ,
mesh.parallel_data_map.count_send ,
nodal_solution );
exchange.setup();
exchange.send_receive();
Kokkos::UnpackArray< vector_type >
::unpack( nodal_solution , exchange.buffer() ,
mesh.parallel_data_map.count_owned ,
mesh.parallel_data_map.count_receive );
}
#endif
//------------------------------------
// Compute element matrices and vectors:
wall_clock.reset();
ElementFunctor( mesh ,
elem_matrices ,
elem_vectors ,
nodal_solution ,
exact_solution.K );
execution_space::fence();
perf_data.elem_time += comm::max( machine , wall_clock.seconds() );
//------------------------------------
// Fill linear system coefficients:
wall_clock.reset();
fill( jacobian.coefficients.dimension_0(), 0 , jacobian.coefficients );
fill( residual.dimension_0() , 0 , residual );
GatherFillFunctor::apply( jacobian ,
residual ,
mesh ,
element_map ,
elem_matrices ,
elem_vectors );
execution_space::fence();
perf_data.matrix_gather_fill_time += comm::max( machine , wall_clock.seconds() );
// Apply boundary conditions:
wall_clock.reset();
// Updates jacobian matrix to 1 on the diagonal, zero elsewhere,
// and 0 in the residual due to the solution vector having the correct value
DirichletResidualFunctor::apply( jacobian, residual, mesh ,
exact_solution.zmin ,
exact_solution.zmax );
execution_space::fence();
perf_data.matrix_boundary_condition_time +=
comm::max( machine , wall_clock.seconds() );
//------------------------------------
// Has the residual converged?
residual_norm = norm2( mesh.parallel_data_map.count_owned,
residual,
mesh.parallel_data_map.machine );
if ( 0 == newton_iteration_count ) {
residual_norm_init = residual_norm ;
}
if ( residual_norm / residual_norm_init < newton_tolerance ) {
break ;
}
//------------------------------------
// Solve linear sytem
size_t cg_iteration_count = 0 ;
double cg_residual_norm = 0 ;
cgsolve( mesh.parallel_data_map ,
jacobian , residual , delta ,
cg_iteration_count ,
cg_residual_norm ,
cg_iteration_time ,
cg_iteration_limit , cg_tolerance ) ;
perf_data.cg_iteration_time += cg_iteration_time ;
cg_iteration_count_total += cg_iteration_count ;
// Update non-linear solution with delta...
// delta is : - Dx = [Jacobian]^1 * Residual which is the negative update
// LaTeX:
// \vec {x}_{n+1} = \vec {x}_{n} - ( - \Delta \vec{x}_{n} )
// text:
// x[n+1] = x[n] + Dx
axpy( mesh.parallel_data_map.count_owned ,
-1.0, delta, nodal_solution);
++newton_iteration_count ;
if ( newton_iteration_limit < newton_iteration_count ) {
break ;
}
};
if ( newton_iteration_count ) {
perf_data.elem_time /= newton_iteration_count ;
perf_data.matrix_gather_fill_time /= newton_iteration_count ;
perf_data.matrix_boundary_condition_time /= newton_iteration_count ;
}
if ( cg_iteration_count_total ) {
perf_data.cg_iteration_time /= cg_iteration_count_total ;
}
perf_data.newton_iteration_count = newton_iteration_count ;
perf_data.cg_iteration_count = cg_iteration_count_total ;
//------------------------------------
{
// For extracting the nodal solution and its coordinates:
typename mesh_type::node_coords_type::HostMirror node_coords_host =
Kokkos::create_mirror( mesh.node_coords );
typename vector_type::HostMirror nodal_solution_host =
Kokkos::create_mirror( nodal_solution );
Kokkos::deep_copy( node_coords_host , mesh.node_coords );
Kokkos::deep_copy( nodal_solution_host , nodal_solution );
double tmp = 0 ;
for ( size_t i = 0 ; i < mesh.parallel_data_map.count_owned ; ++i ) {
const coordinate_scalar_type x = node_coords_host(i,0);
const coordinate_scalar_type y = node_coords_host(i,1);
const coordinate_scalar_type z = node_coords_host(i,2);
const double Tx = exact_solution(z);
const double Ts = nodal_solution_host(i);
const double Te = std::abs( Tx - Ts ) / std::abs( Tx );
tmp = std::max( tmp , Te );
if ( print_error && 0.02 < Te ) {
std::cout << " node( " << x << " " << y << " " << z << " ) = "
<< Ts << " != exact_solution " << Tx
<< std::endl ;
}
}
perf_data.error_max = comm::max( machine , tmp );
}
return perf_data ;
}
//----------------------------------------------------------------------------
// Benchmark driver: sweep problem sizes from 'elem_count_beg' up to
// (but excluding) 'elem_count_end', doubling each step.  For every size,
// build a box-mesh fixture, run the nonlinear solve 'runs' times, keep the
// best timings, and print one CSV row per size on rank 0.
//
// Parameters:
//   label           - human-readable tag printed in the CSV header
//   machine         - communicator handle (rank 0 does all printing)
//   gang_count      - gang (thread-team) count forwarded to the fixture
//   elem_count_beg  - first target element count (must be positive)
//   elem_count_end  - exclusive upper bound on the target element count
//   runs            - repetitions per size; best-of-'runs' is reported
template< typename Scalar , class Device , class FixtureElement >
void driver( const char * const label ,
             comm::Machine machine ,
             const int gang_count ,
             const int elem_count_beg ,
             const int elem_count_end ,
             const int runs )
{
  typedef Scalar         scalar_type ;
  typedef Device         execution_space ;
  typedef double         coordinate_scalar_type ;
  typedef FixtureElement fixture_element_type ;

  typedef BoxMeshFixture< coordinate_scalar_type ,
                          execution_space ,
                          fixture_element_type > fixture_type ;

  typedef typename fixture_type::FEMeshType mesh_type ;

  const size_t proc_count = comm::size( machine );
  const size_t proc_rank  = comm::rank( machine );

  // Nothing to do without a positive size range and run count.
  // (A non-positive 'elem_count_beg' would also make the doubling
  // loop below never terminate, so reject it here.)
  if ( elem_count_beg <= 0 || elem_count_end <= 0 || runs <= 0 ) return ;

  if ( proc_rank == 0 ) {
    std::cout << std::endl ;
    std::cout << "\"Kokkos::HybridFE::Nonlinear " << label << "\"" << std::endl;
    std::cout
      << "\"Size\" , \"Size\" , \"Graphing\" , \"Element\" , \"Fill\" , \"Boundary\" , \"CG-Iter\" , \"CG-Iter\" , \"Newton-Iter\" , \"Max-node-error\""
      << std::endl
      << "\"elems\" , \"nodes\" , \"millisec\" , \"millisec\" , \"millisec\" , \"millisec\" , \"millisec\" , \"total-count\" , \"total-count\" , \"ratio\""
      << std::endl ;
  }

  const bool   print_sample = false ;
  const double x_curve = 1.0 ;
  const double y_curve = 1.0 ;
  const double z_curve = 0.8 ;

  for ( int i = elem_count_beg ; i < elem_count_end ; i *= 2 )
  {
    // Choose a box ix < iy < iz whose element count tracks the target 'i'.
    const int ix = std::max( 1 , (int) cbrt( ((double) i) / 2.0 ) );
    const int iy = 1 + ix ;
    const int iz = 2 * iy ;
    const int global_elem_count = ix * iy * iz ;
    // Quadratic (27-node) elements: (2n+1) nodes per direction.
    const int global_node_count = ( 2 * ix + 1 ) *
                                  ( 2 * iy + 1 ) *
                                  ( 2 * iz + 1 );

    mesh_type mesh =
      fixture_type::create( proc_count , proc_rank , gang_count ,
                            ix , iy , iz ,
                            x_curve , y_curve , z_curve );

    mesh.parallel_data_map.machine = machine ;

    PerformanceData perf_data , perf_best ;

    // Best-of-'runs' timing: keep the minimum of each phase.
    for ( int j = 0 ; j < runs ; j++ ) {

      perf_data = run<scalar_type,fixture_type>( mesh , ix , iy , iz , print_sample );

      if ( j == 0 ) {
        perf_best = perf_data ;
      }
      else {
        perf_best.best( perf_data );
      }
    }

    if ( proc_rank == 0 ) {
      std::cout << std::setw(8) << global_elem_count << " , "
                << std::setw(8) << global_node_count << " , "
                << std::setw(10) << perf_best.graph_time * 1000 << " , "
                << std::setw(10) << perf_best.elem_time * 1000 << " , "
                << std::setw(10) << perf_best.matrix_gather_fill_time * 1000 << " , "
                << std::setw(10) << perf_best.matrix_boundary_condition_time * 1000 << " , "
                << std::setw(10) << perf_best.cg_iteration_time * 1000 << " , "
                << std::setw(7) << perf_best.cg_iteration_count << " , "
                << std::setw(3) << perf_best.newton_iteration_count << " , "
                << std::setw(10) << perf_best.error_max
                << std::endl ;
    }
  }
}
//----------------------------------------------------------------------------
} /* namespace Nonlinear */
} /* namespace HybridFEM */
#endif /* #ifndef HYBRIDFEM_IMPLICIT_HPP */

View File

@ -1,390 +0,0 @@
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
#include <stdio.h>
#include <iostream>
#include <fstream>
#include <iomanip>
#include <cstdlib>
#include <cmath>
#include <Kokkos_Core.hpp>
#include <HexElement.hpp>
#include <FEMesh.hpp>
namespace HybridFEM {
namespace Nonlinear {
template< class MeshType , typename ScalarType > struct ElementComputation ;
//----------------------------------------------------------------------------
// Cuda specialization of the nonlinear thermal element computation for the
// 27-node hexahedral element.  One CUDA block iterates over elements;
// within a block, threadIdx.x indexes a basis function (or a reduction
// lane) and threadIdx.y indexes a slice of integration points.
template<>
struct ElementComputation< FEMesh< double , 27 , Kokkos::Cuda > , double >
{
  typedef Kokkos::Cuda execution_space ;

  static const unsigned ElementNodeCount = 27 ;

  typedef HexElement_Data< ElementNodeCount > element_data_type ;
  typedef FEMesh< double , ElementNodeCount , execution_space > mesh_type ;

  // Sizes provided by the master-element data:
  static const unsigned SpatialDim       = element_data_type::spatial_dimension ;
  static const unsigned FunctionCount    = element_data_type::function_count ;
  static const unsigned IntegrationCount = element_data_type::integration_count ;
  static const unsigned TensorDim        = SpatialDim * SpatialDim ;

  // Outputs: one dense FunctionCount x FunctionCount jacobian and one
  // FunctionCount residual vector per element.
  typedef Kokkos::View< double[][FunctionCount][FunctionCount] , execution_space > elem_matrices_type ;
  typedef Kokkos::View< double[][FunctionCount] , execution_space > elem_vectors_type ;
  typedef Kokkos::View< double[] , execution_space > value_vector_type ;

private:

  const element_data_type elem_data ;                          // master-element bases, gradients, weights
  const typename mesh_type::elem_node_ids_type elem_node_ids ; // element -> node connectivity
  const typename mesh_type::node_coords_type node_coords ;     // nodal coordinates
  const value_vector_type nodal_values ;                       // current nodal solution
  const elem_matrices_type element_matrices ;                  // output element jacobians
  const elem_vectors_type element_vectors ;                    // output element residuals
  const float coeff_K ;                                        // diffusion coefficient 'k'
  const unsigned elem_count ;                                  // number of elements to process

  // invJacIndex[i] = the four flat jacobian indices whose 2x2 cofactor
  // yields (unscaled) inverse-jacobian entry i; filled in the constructor.
  unsigned invJacIndex[9][4] ;

  // Flat indexing of a 3x3 tensor stored row-major:
  static const unsigned j11 = 0 , j12 = 1 , j13 = 2 ,
                        j21 = 3 , j22 = 4 , j23 = 5 ,
                        j31 = 6 , j32 = 7 , j33 = 8 ;

  // Can only handle up to 16 warps:
  static const unsigned BlockDimX = 32 ;
  static const unsigned BlockDimY = 7 ;

  // Dynamically-sized shared-memory scratch; a single WorkSpace instance
  // is shared by the whole block (launched with shmem = sizeof(WorkSpace)).
  struct WorkSpace {
    double sum[ BlockDimY ][ BlockDimX ];       // per-warp reduction scratch
    double value_at_integ[ IntegrationCount ];  // T at each integration point
    double gradx_at_integ[ IntegrationCount ];  // grad(T).x at each integration point
    double grady_at_integ[ IntegrationCount ];
    double gradz_at_integ[ IntegrationCount ];
    float  spaceJac[    BlockDimY ][ 9 ];       // per-warp element jacobian
    float  spaceInvJac[ BlockDimY ][ 9 ];       // per-warp inverse jacobian
    float  detJweight[ IntegrationCount ];      // detJ * quadrature weight
    float  dpsidx[ FunctionCount ][ IntegrationCount ]; // physical-space basis gradients
    float  dpsidy[ FunctionCount ][ IntegrationCount ];
    float  dpsidz[ FunctionCount ][ IntegrationCount ];
  };

public:

  // Captures mesh / solution / output views and immediately launches the
  // kernel over all elements (no deferred dispatch).
  ElementComputation ( const mesh_type          & arg_mesh ,
                       const elem_matrices_type & arg_element_matrices ,
                       const elem_vectors_type  & arg_element_vectors ,
                       const value_vector_type  & arg_nodal_values ,
                       const float arg_coeff_K )
  : elem_data()
  , elem_node_ids( arg_mesh.elem_node_ids )
  , node_coords(   arg_mesh.node_coords )
  , nodal_values(  arg_nodal_values )
  , element_matrices( arg_element_matrices )
  , element_vectors(  arg_element_vectors )
  , coeff_K( arg_coeff_K )
  , elem_count( arg_mesh.elem_node_ids.dimension_0() )
  {
    // Cofactor index table for the 3x3 inverse: entry i is computed as
    // J[a]*J[b] - J[c]*J[d] with {a,b,c,d} = jInvJ[i].
    const unsigned jInvJ[9][4] =
      { { j22 , j33 , j23 , j32 } ,
        { j13 , j32 , j12 , j33 } ,
        { j12 , j23 , j13 , j22 } ,

        { j23 , j31 , j21 , j33 } ,
        { j11 , j33 , j13 , j31 } ,
        { j13 , j21 , j11 , j23 } ,

        { j21 , j32 , j22 , j31 } ,
        { j12 , j31 , j11 , j32 } ,
        { j11 , j22 , j12 , j21 } };

    for ( unsigned i = 0 ; i < 9 ; ++i ) {
      for ( unsigned j = 0 ; j < 4 ; ++j ) {
        invJacIndex[i][j] = jInvJ[i][j] ;
      }
    }

    const unsigned shmem = sizeof(WorkSpace);
    const unsigned grid_max = 65535 ;   // max gridDim.x for compute capability 2.x
    const unsigned grid_count = std::min( grid_max , elem_count );

    // For compute capability 2.x up to 1024 threads per block
    const dim3 block( BlockDimX , BlockDimY , 1 );
    const dim3 grid( grid_count , 1 , 1 );

    Kokkos::Impl::CudaParallelLaunch< ElementComputation >( *this , grid , block , shmem );
  }

public:

  //------------------------------------
  // Sum among the threadIdx.x
  // Tree reduction of 'value' across the 32 lanes of the calling warp;
  // lane 0 stores the total into 'result'.
  // NOTE(review): relies on warp-synchronous execution of the 32 x-lanes
  // (no __syncthreads between steps) plus 'volatile' to defeat register
  // caching -- valid on the pre-Volta architectures this targets; would
  // need __syncwarp or shuffles on newer hardware.  TODO confirm.
  template< typename Type >
  __device__ inline static
  void sum_x( Type & result , const double value )
  {
    extern __shared__ WorkSpace work_data[] ;

    volatile double * const base_sum =
      & work_data->sum[ threadIdx.y ][ threadIdx.x ] ;

    base_sum[ 0] = value ;

    if ( threadIdx.x < 16 ) {
      base_sum[0] += base_sum[16];
      base_sum[0] += base_sum[ 8];
      base_sum[0] += base_sum[ 4];
      base_sum[0] += base_sum[ 2];
      base_sum[0] += base_sum[ 1];
    }

    if ( 0 == threadIdx.x ) {
      result = base_sum[0] ;
    }
  }

  // Zero this thread's slot of the reduction scratch (every thread calls).
  __device__ inline static
  void sum_x_clear()
  {
    extern __shared__ WorkSpace work_data[] ;

    work_data->sum[ threadIdx.y ][ threadIdx.x ] = 0 ;
  }

  //------------------------------------
  //------------------------------------
  // Phase 1 for element 'ielem': gather nodal data, build the jacobian at
  // each integration point, and populate the WorkSpace with transformed
  // basis gradients, detJ*weight, and the interpolated value/gradient.
  __device__ inline
  void evaluateFunctions( const unsigned ielem ) const
  {
    extern __shared__ WorkSpace work_data[] ;

    // Each warp (threadIdx.y) computes an integration point
    // Each thread is responsible for a node / function.
    const unsigned iFunc = threadIdx.x ;
    const bool hasFunc = iFunc < FunctionCount ;

    //------------------------------------
    // Each warp gathers a different variable into 'elem_mat' shared memory.
    // (x, y, z coordinates and the solution value, one row of 'sum' each.)
    if ( hasFunc ) {
      const unsigned node = elem_node_ids( ielem , iFunc );

      for ( unsigned iy = threadIdx.y ; iy < 4 ; iy += blockDim.y ) {
        switch( iy ) {
        case 0 : work_data->sum[0][iFunc] = node_coords(node,0); break ;
        case 1 : work_data->sum[1][iFunc] = node_coords(node,1); break ;
        case 2 : work_data->sum[2][iFunc] = node_coords(node,2); break ;
        case 3 : work_data->sum[3][iFunc] = nodal_values(node); break ;
        default: break ;
        }
      }
    }

    __syncthreads(); // Wait for all warps to finish gathering

    // now get local 'const' copies in register space:
    const double x       = work_data->sum[0][ iFunc ];
    const double y       = work_data->sum[1][ iFunc ];
    const double z       = work_data->sum[2][ iFunc ];
    const double dof_val = work_data->sum[3][ iFunc ];

    __syncthreads(); // Wait for all warps to finish extracting

    sum_x_clear(); // Make sure summation scratch is zero

    //------------------------------------
    // Each warp is now on its own computing an integration point
    // so no further explicit synchronizations are required.

    if ( hasFunc ) {

      float * const J    = work_data->spaceJac[    threadIdx.y ];
      float * const invJ = work_data->spaceInvJac[ threadIdx.y ];

      for ( unsigned iInt = threadIdx.y ;
                     iInt < IntegrationCount ; iInt += blockDim.y ) {

        const float val = elem_data.values[iInt][iFunc] ;
        const float gx  = elem_data.gradients[iInt][0][iFunc] ;
        const float gy  = elem_data.gradients[iInt][1][iFunc] ;
        const float gz  = elem_data.gradients[iInt][2][iFunc] ;

        // Accumulate the 3x3 jacobian: row = master-element direction,
        // column = physical coordinate; each entry is a warp reduction.
        sum_x( J[j11], gx * x );
        sum_x( J[j12], gx * y );
        sum_x( J[j13], gx * z );

        sum_x( J[j21], gy * x );
        sum_x( J[j22], gy * y );
        sum_x( J[j23], gy * z );

        sum_x( J[j31], gz * x );
        sum_x( J[j32], gz * y );
        sum_x( J[j33], gz * z );

        // Inverse jacobian, only enough parallel work for 9 threads in the warp

        if ( iFunc < TensorDim ) {

          invJ[ iFunc ] =
            J[ invJacIndex[iFunc][0] ] * J[ invJacIndex[iFunc][1] ] -
            J[ invJacIndex[iFunc][2] ] * J[ invJacIndex[iFunc][3] ] ;

          // Let all threads in the warp compute determinant into a register

          const float detJ = J[j11] * invJ[j11] +
                             J[j21] * invJ[j12] +
                             J[j31] * invJ[j13] ;

          invJ[ iFunc ] /= detJ ;

          if ( 0 == iFunc ) {
            work_data->detJweight[ iInt ] = detJ * elem_data.weights[ iInt ] ;
          }
        }

        // Transform bases gradients and compute value and gradient

        const float dx = gx * invJ[j11] + gy * invJ[j12] + gz * invJ[j13];
        const float dy = gx * invJ[j21] + gy * invJ[j22] + gz * invJ[j23];
        const float dz = gx * invJ[j31] + gy * invJ[j32] + gz * invJ[j33];

        work_data->dpsidx[iFunc][iInt] = dx ;
        work_data->dpsidy[iFunc][iInt] = dy ;
        work_data->dpsidz[iFunc][iInt] = dz ;

        // Interpolated solution value and gradient at this integration point:
        sum_x( work_data->gradx_at_integ[iInt] , dof_val * dx );
        sum_x( work_data->grady_at_integ[iInt] , dof_val * dy );
        sum_x( work_data->gradz_at_integ[iInt] , dof_val * dz );
        sum_x( work_data->value_at_integ[iInt] , dof_val * val );
      }
    }

    __syncthreads(); // All shared data must be populated at return.
  }

  // Phase 2 for element 'ielem': integrate the residual and jacobian
  // contributions from the WorkSpace into the per-element output views.
  __device__ inline
  void contributeResidualJacobian( const unsigned ielem ) const
  {
    extern __shared__ WorkSpace work_data[] ;

    sum_x_clear(); // Make sure summation scratch is zero

    // $$ R_i = \int_{\Omega} \nabla \phi_i \cdot (k \nabla T) + \phi_i T^2 d \Omega $$
    // $$ J_{i,j} = \frac{\partial R_i}{\partial T_j} = \int_{\Omega} k \nabla \phi_i \cdot \nabla \phi_j + 2 \phi_i \phi_j T d \Omega $$

    // Here the lane index selects the integration point (transposed
    // relative to evaluateFunctions, where lanes were basis functions).
    const unsigned iInt = threadIdx.x ;

    if ( iInt < IntegrationCount ) {
      const double value_at_integ = work_data->value_at_integ[ iInt ] ;
      const double gradx_at_integ = work_data->gradx_at_integ[ iInt ] ;
      const double grady_at_integ = work_data->grady_at_integ[ iInt ] ;
      const double gradz_at_integ = work_data->gradz_at_integ[ iInt ] ;

      const float detJweight = work_data->detJweight[ iInt ] ;
      const float coeff_K_detJweight = coeff_K * detJweight ;

      for ( unsigned iRow = threadIdx.y ;
                     iRow < FunctionCount ; iRow += blockDim.y ) {
        const float value_row  = elem_data.values[ iInt ][ iRow ] * detJweight ;
        const float dpsidx_row = work_data->dpsidx[ iRow ][ iInt ] * coeff_K_detJweight ;
        const float dpsidy_row = work_data->dpsidy[ iRow ][ iInt ] * coeff_K_detJweight ;
        const float dpsidz_row = work_data->dpsidz[ iRow ][ iInt ] * coeff_K_detJweight ;

        // Diffusion and source terms of the residual at this point:
        const double res_del = dpsidx_row * gradx_at_integ +
                               dpsidy_row * grady_at_integ +
                               dpsidz_row * gradz_at_integ ;

        const double res_val = value_at_integ * value_at_integ * value_row ;
        const double jac_val_row = 2 * value_at_integ * value_row ;

        // Reduce across integration points (lanes) into the output views:
        sum_x( element_vectors( ielem , iRow ) , res_del + res_val );

        for ( unsigned iCol = 0 ; iCol < FunctionCount ; ++iCol ) {
          const float jac_del =
            dpsidx_row * work_data->dpsidx[iCol][iInt] +
            dpsidy_row * work_data->dpsidy[iCol][iInt] +
            dpsidz_row * work_data->dpsidz[iCol][iInt] ;

          const double jac_val =
            jac_val_row * elem_data.values[ iInt ][ iCol ] ;

          sum_x( element_matrices( ielem , iRow , iCol ) , jac_del + jac_val );
        }
      }
    }

    __syncthreads(); // All warps finish before refilling shared data
  }

  // Kernel body: grid-stride loop over elements -- each block handles
  // elements blockIdx.x, blockIdx.x + gridDim.x, ... (gridDim.x is capped
  // at 65535 in the constructor).
  __device__ inline
  void operator()(void) const
  {
    extern __shared__ WorkSpace work_data[] ;

    for ( unsigned ielem = blockIdx.x ; ielem < elem_count ; ielem += gridDim.x ) {
      evaluateFunctions( ielem );
      contributeResidualJacobian( ielem );
    }
  }

}; /* ElementComputation */
} /* namespace Nonlinear */
} /* namespace HybridFEM */

View File

@ -1,482 +0,0 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_NONLINEARFUNCTORS_HPP
#define KOKKOS_NONLINEARFUNCTORS_HPP
#include <iostream>
#include <fstream>
#include <iomanip>
#include <cstdlib>
#include <cmath>
namespace HybridFEM {
namespace Nonlinear {
template< class MeshType , typename ScalarType > struct ElementComputation ;
template< class MeshType , typename ScalarType > struct DirichletSolution ;
template< class MeshType , typename ScalarType > struct DirichletResidual ;
}
}
/* A Cuda-specific specialization for the element computation functor. */
#if defined( __CUDACC__ )
#include <NonlinearElement_Cuda.hpp>
#endif
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace HybridFEM {
namespace Nonlinear {
// Generic (non-Cuda-specialized) nonlinear thermal element computation:
// for each element, gather nodal coordinates and solution values, integrate
// the residual vector and jacobian matrix over the quadrature points, and
// scatter them to per-element output views.  Dispatched from the
// constructor via parallel_for, one work-item per element.
template< typename ScalarCoordType , unsigned ElemNode , class DeviceType ,
          typename ScalarType >
struct ElementComputation<
  FEMesh< ScalarCoordType , ElemNode , DeviceType > , ScalarType >
{
  typedef DeviceType  execution_space;
  typedef ScalarType  scalar_type ;

  static const unsigned ElementNodeCount = ElemNode ;

  typedef FEMesh< ScalarCoordType , ElementNodeCount , execution_space > mesh_type ;
  typedef HexElement_Data< ElementNodeCount > element_data_type ;

  // Sizes provided by the master-element data:
  static const unsigned SpatialDim       = element_data_type::spatial_dimension ;
  static const unsigned FunctionCount    = element_data_type::function_count ;
  static const unsigned IntegrationCount = element_data_type::integration_count ;
  static const unsigned TensorDim        = SpatialDim * SpatialDim ;

  // Outputs: one dense jacobian matrix and one residual vector per element.
  typedef Kokkos::View< scalar_type[][FunctionCount][FunctionCount] , execution_space > elem_matrices_type ;
  typedef Kokkos::View< scalar_type[][FunctionCount] , execution_space > elem_vectors_type ;
  typedef Kokkos::View< scalar_type[] , execution_space > value_vector_type ;

private:

  const element_data_type                       elem_data ;        // bases, gradients, weights
  typename mesh_type::elem_node_ids_type        elem_node_ids ;    // element -> node connectivity
  typename mesh_type::node_coords_type          node_coords ;      // nodal coordinates
  value_vector_type                             nodal_values ;     // current nodal solution
  elem_matrices_type                            element_matrices ; // output element jacobians
  elem_vectors_type                             element_vectors ;  // output element residuals
  scalar_type                                   coeff_K ;          // diffusion coefficient 'k'

public:

  // Captures mesh / solution / output views and immediately dispatches
  // a parallel_for over all elements (no deferred launch).
  ElementComputation( const mesh_type          & arg_mesh ,
                      const elem_matrices_type & arg_element_matrices ,
                      const elem_vectors_type  & arg_element_vectors ,
                      const value_vector_type  & arg_nodal_values ,
                      const scalar_type  arg_coeff_K )
  : elem_data()
  , elem_node_ids( arg_mesh.elem_node_ids )
  , node_coords(   arg_mesh.node_coords )
  , nodal_values(  arg_nodal_values )
  , element_matrices( arg_element_matrices )
  , element_vectors(  arg_element_vectors )
  , coeff_K( arg_coeff_K )
  {
    const size_t elem_count = arg_mesh.elem_node_ids.dimension_0();

    parallel_for( elem_count , *this );
  }

  //------------------------------------

  // Floating-point operation count of transform_gradients (for reporting).
  static const unsigned FLOPS_transform_gradients =
    /* Jacobian */           FunctionCount * TensorDim * 2 +
    /* Inverse jacobian */   TensorDim * 6 + 6 +
    /* Gradient transform */ FunctionCount * 15 ;

  // Build the element jacobian from the master-element basis gradients and
  // the nodal coordinates, invert it, and transform the gradients to
  // physical space (outputs dpsidx/dpsidy/dpsidz).  Returns detJ.
  // Mixed float/double precision is deliberate: coordinates and jacobian
  // accumulation in double, stored gradients in float.
  KOKKOS_INLINE_FUNCTION
  float transform_gradients(
    const float grad[][ FunctionCount ] , // Gradient of bases master element
    const double x[] ,
    const double y[] ,
    const double z[] ,
    float dpsidx[] ,
    float dpsidy[] ,
    float dpsidz[] ) const
  {
    // Flat indexing of the 3x3 jacobian stored row-major:
    enum { j11 = 0 , j12 = 1 , j13 = 2 ,
           j21 = 3 , j22 = 4 , j23 = 5 ,
           j31 = 6 , j32 = 7 , j33 = 8 };

    // Jacobian accumulation:

    double J[ TensorDim ] = { 0, 0, 0, 0, 0, 0, 0, 0, 0 };

    for( unsigned i = 0; i < FunctionCount ; ++i ) {
      const double x1 = x[i] ;
      const double x2 = y[i] ;
      const double x3 = z[i] ;

      const float g1 = grad[0][i] ;
      const float g2 = grad[1][i] ;
      const float g3 = grad[2][i] ;

      J[j11] += g1 * x1 ;
      J[j12] += g1 * x2 ;
      J[j13] += g1 * x3 ;

      J[j21] += g2 * x1 ;
      J[j22] += g2 * x2 ;
      J[j23] += g2 * x3 ;

      J[j31] += g3 * x1 ;
      J[j32] += g3 * x2 ;
      J[j33] += g3 * x3 ;
    }

    // Inverse jacobian:
    // cofactor (adjugate) entries, scaled by 1/detJ below.

    float invJ[ TensorDim ] = {
      static_cast<float>( J[j22] * J[j33] - J[j23] * J[j32] ) ,
      static_cast<float>( J[j13] * J[j32] - J[j12] * J[j33] ) ,
      static_cast<float>( J[j12] * J[j23] - J[j13] * J[j22] ) ,

      static_cast<float>( J[j23] * J[j31] - J[j21] * J[j33] ) ,
      static_cast<float>( J[j11] * J[j33] - J[j13] * J[j31] ) ,
      static_cast<float>( J[j13] * J[j21] - J[j11] * J[j23] ) ,

      static_cast<float>( J[j21] * J[j32] - J[j22] * J[j31] ) ,
      static_cast<float>( J[j12] * J[j31] - J[j11] * J[j32] ) ,
      static_cast<float>( J[j11] * J[j22] - J[j12] * J[j21] ) };

    // Expansion of detJ along the first column:
    const float detJ = J[j11] * invJ[j11] +
                       J[j21] * invJ[j12] +
                       J[j31] * invJ[j13] ;

    const float detJinv = 1.0 / detJ ;

    for ( unsigned i = 0 ; i < TensorDim ; ++i ) { invJ[i] *= detJinv ; }

    // Transform gradients:

    for( unsigned i = 0; i < FunctionCount ; ++i ) {
      const float g0 = grad[0][i];
      const float g1 = grad[1][i];
      const float g2 = grad[2][i];

      dpsidx[i] = g0 * invJ[j11] + g1 * invJ[j12] + g2 * invJ[j13];
      dpsidy[i] = g0 * invJ[j21] + g1 * invJ[j22] + g2 * invJ[j23];
      dpsidz[i] = g0 * invJ[j31] + g1 * invJ[j32] + g2 * invJ[j33];
    }

    return detJ ;
  }

  // Accumulate this integration point's contribution into the element
  // residual 'elem_res' and jacobian 'elem_mat' (both accumulated across
  // all integration points by the caller).
  KOKKOS_INLINE_FUNCTION
  void contributeResidualJacobian(
    const float coeff_k ,
    const double dof_values[] ,
    const float dpsidx[] ,
    const float dpsidy[] ,
    const float dpsidz[] ,
    const float detJ ,
    const float integ_weight ,
    const float bases_vals[] ,
    double elem_res[] ,
    double elem_mat[][ FunctionCount ] ) const
  {
    // Interpolate solution value and gradient at the integration point:
    double value_at_pt = 0 ;
    double gradx_at_pt = 0 ;
    double grady_at_pt = 0 ;
    double gradz_at_pt = 0 ;

    for ( unsigned m = 0 ; m < FunctionCount ; m++ ) {
      value_at_pt += dof_values[m] * bases_vals[m] ;
      gradx_at_pt += dof_values[m] * dpsidx[m] ;
      grady_at_pt += dof_values[m] * dpsidy[m] ;
      gradz_at_pt += dof_values[m] * dpsidz[m] ;
    }

    const scalar_type k_detJ_weight = coeff_k        * detJ * integ_weight ;
    const double res_val = value_at_pt * value_at_pt * detJ * integ_weight ;
    const double mat_val = 2.0 * value_at_pt         * detJ * integ_weight ;

    //  $$ R_i = \int_{\Omega} \nabla \phi_i \cdot (k \nabla T) + \phi_i T^2 d \Omega $$
    //  $$ J_{i,j} = \frac{\partial R_i}{\partial T_j} = \int_{\Omega} k \nabla \phi_i \cdot \nabla \phi_j + 2 \phi_i \phi_j T d \Omega $$

    for ( unsigned m = 0; m < FunctionCount; m++) {
      double * const mat = elem_mat[m] ;
      const float bases_val_m = bases_vals[m];
      const float dpsidx_m    = dpsidx[m] ;
      const float dpsidy_m    = dpsidy[m] ;
      const float dpsidz_m    = dpsidz[m] ;

      elem_res[m] += k_detJ_weight * ( dpsidx_m * gradx_at_pt +
                                       dpsidy_m * grady_at_pt +
                                       dpsidz_m * gradz_at_pt ) +
                     res_val * bases_val_m ;

      for( unsigned n = 0; n < FunctionCount; n++) {

        mat[n] += k_detJ_weight * ( dpsidx_m * dpsidx[n] +
                                    dpsidy_m * dpsidy[n] +
                                    dpsidz_m * dpsidz[n] ) +
                  mat_val * bases_val_m * bases_vals[n];
      }
    }
  }

  // Per-element work-item: gather, integrate, scatter.
  KOKKOS_INLINE_FUNCTION
  void operator()( const unsigned ielem ) const
  {
    // Gather nodal coordinates and solution vector:

    double x[ FunctionCount ] ;
    double y[ FunctionCount ] ;
    double z[ FunctionCount ] ;
    double val[ FunctionCount ] ;

    for ( unsigned i = 0 ; i < ElementNodeCount ; ++i ) {
      const unsigned node_index = elem_node_ids( ielem , i );

      x[i] = node_coords( node_index , 0 );
      y[i] = node_coords( node_index , 1 );
      z[i] = node_coords( node_index , 2 );

      val[i] = nodal_values( node_index );
    }

    // Local accumulators, zero-initialized before quadrature:
    double elem_vec[ FunctionCount ] ;
    double elem_mat[ FunctionCount ][ FunctionCount ] ;

    for( unsigned i = 0; i < FunctionCount ; i++ ) {
      elem_vec[i] = 0 ;
      for( unsigned j = 0; j < FunctionCount ; j++){
        elem_mat[i][j] = 0 ;
      }
    }

    // Quadrature loop:
    for ( unsigned i = 0 ; i < IntegrationCount ; ++i ) {
      float dpsidx[ FunctionCount ] ;
      float dpsidy[ FunctionCount ] ;
      float dpsidz[ FunctionCount ] ;

      const float detJ =
        transform_gradients( elem_data.gradients[i] , x , y , z ,
                             dpsidx , dpsidy , dpsidz );

      contributeResidualJacobian( coeff_K ,
                                  val , dpsidx , dpsidy , dpsidz ,
                                  detJ ,
                                  elem_data.weights[i] ,
                                  elem_data.values[i] ,
                                  elem_vec , elem_mat );
    }

    // Scatter to the per-element output views:
    for( unsigned i = 0; i < FunctionCount ; i++){
      element_vectors(ielem, i) = elem_vec[i] ;
      for( unsigned j = 0; j < FunctionCount ; j++){
        element_matrices(ielem, i, j) = elem_mat[i][j] ;
      }
    }
  }

}; /* ElementComputation */
//----------------------------------------------------------------------------
// Imposes Dirichlet boundary values on the solution vector: every node on
// the lower-Z face (z <= bc_lower_z) receives bc_lower_value, every node on
// the upper-Z face (bc_upper_z <= z) receives bc_upper_value, and all other
// nodes are left untouched.  Dispatched over all nodes via parallel_for.
template< typename ScalarCoordType , unsigned ElemNode , class DeviceType ,
          typename ScalarType >
struct DirichletSolution<
  FEMesh< ScalarCoordType , ElemNode , DeviceType > ,
  ScalarType >
{
  typedef DeviceType execution_space;

  static const unsigned ElementNodeCount = ElemNode ;

  typedef Kokkos::View< ScalarType[] , execution_space > vector_type ;

  typedef FEMesh< ScalarCoordType , ElementNodeCount , execution_space > mesh_type ;

  typename mesh_type::node_coords_type node_coords ;
  vector_type     solution ;
  ScalarCoordType bc_lower_z ;
  ScalarCoordType bc_upper_z ;
  ScalarType      bc_lower_value ;
  ScalarType      bc_upper_value ;

  // Per-node work-item: overwrite the solution only on the two Z faces.
  KOKKOS_INLINE_FUNCTION
  void operator()( const unsigned inode ) const
  {
    const ScalarCoordType z_node = node_coords(inode,2);

    const bool on_lower_face = ( z_node <= bc_lower_z );
    const bool on_upper_face = ( bc_upper_z <= z_node );

    // Interior node: leave the current solution value in place.
    if ( ! on_lower_face && ! on_upper_face ) return ;

    // Lower face takes precedence should both tests ever hold.
    if ( on_lower_face ) {
      solution(inode) = bc_lower_value ;
    }
    else {
      solution(inode) = bc_upper_value ;
    }
  }

  // Populate a functor with the boundary description and run it over
  // every entry of 'solution'.
  static void apply( const vector_type & solution ,
                     const mesh_type & mesh ,
                     const ScalarCoordType  bc_lower_z ,
                     const ScalarCoordType  bc_upper_z ,
                     const ScalarType  bc_lower_value ,
                     const ScalarType  bc_upper_value )
  {
    DirichletSolution op ;

    op.solution       = solution ;
    op.node_coords    = mesh.node_coords ;
    op.bc_lower_value = bc_lower_value ;
    op.bc_upper_value = bc_upper_value ;
    op.bc_lower_z     = bc_lower_z ;
    op.bc_upper_z     = bc_upper_z ;

    parallel_for( solution.dimension_0() , op );
  }
};
//----------------------------------------------------------------------------
// Apply Dirichlet boundary conditions along the lower and upper z faces
// of the mesh to an assembled linear system.  For a boundary row the
// rhs entry is zeroed and the row is replaced by a row of the identity;
// for an interior row, coefficients in columns belonging to boundary
// nodes are zeroed.  Together this keeps the matrix symmetric.
template< typename ScalarCoordType , unsigned ElemNode , class DeviceType ,
typename ScalarType >
struct DirichletResidual<
FEMesh< ScalarCoordType , ElemNode , DeviceType > , ScalarType >
{
typedef DeviceType execution_space;
typedef typename execution_space::size_type size_type ;
static const unsigned ElementNodeCount = ElemNode ;
typedef Kokkos::CrsMatrix< ScalarType , execution_space > matrix_type ;
typedef Kokkos::View< ScalarType[] , execution_space > vector_type ;
typedef FEMesh< ScalarCoordType , ElementNodeCount , execution_space > mesh_type ;
// Node coordinates; a node is a boundary node when its z coordinate
// satisfies z <= bc_lower_z or bc_upper_z <= z.
typename mesh_type::node_coords_type node_coords ;
matrix_type matrix ; // assembled CRS matrix of the linear system
vector_type rhs ; // residual / right-hand-side vector
ScalarCoordType bc_lower_z ;
ScalarCoordType bc_upper_z ;
// Process one matrix row; 'inode' is both the row index and the
// index of the node the row belongs to.
KOKKOS_INLINE_FUNCTION
void operator()( const unsigned inode ) const
{
// Apply a dirichlet boundary condition to 'irow'
// to maintain the symmetry of the original
// global stiffness matrix, zero out the columns
// that correspond to boundary conditions, and
// adjust the load vector accordingly
const size_type iBeg = matrix.graph.row_map[inode];
const size_type iEnd = matrix.graph.row_map[inode+1];
const ScalarCoordType z = node_coords(inode,2);
const bool bc_lower = z <= bc_lower_z ;
const bool bc_upper = bc_upper_z <= z ;
if ( bc_lower || bc_upper ) {
rhs(inode) = 0 ; // set the residual vector
// zero each value on the row, and leave a one
// on the diagonal
for( size_type i = iBeg ; i < iEnd ; i++) {
matrix.coefficients(i) =
(int) inode == matrix.graph.entries(i) ? 1 : 0 ;
}
}
else {
// Find any columns that are boundary conditions.
// Clear them and adjust the load vector
for( size_type i = iBeg ; i < iEnd ; i++ ) {
const size_type cnode = matrix.graph.entries(i) ;
const ScalarCoordType zc = node_coords(cnode,2);
const bool c_bc_lower = zc <= bc_lower_z ;
const bool c_bc_upper = bc_upper_z <= zc ;
if ( c_bc_lower || c_bc_upper ) {
matrix.coefficients(i) = 0 ;
}
}
}
}
// Populate a functor and run it in parallel over every row of the
// matrix.  The row count is recovered from the CRS row map, whose
// length is (number of rows + 1).
static void apply( const matrix_type & linsys_matrix ,
const vector_type & linsys_rhs ,
const mesh_type & mesh ,
const ScalarCoordType bc_lower_z ,
const ScalarCoordType bc_upper_z)
{
const size_t row_count = linsys_matrix.graph.row_map.dimension_0() - 1 ;
DirichletResidual op ;
op.node_coords = mesh.node_coords ;
op.matrix = linsys_matrix ;
op.rhs = linsys_rhs ;
op.bc_lower_z = bc_lower_z ;
op.bc_upper_z = bc_upper_z ;
parallel_for( row_count , op );
}
};
//----------------------------------------------------------------------------
} /* namespace Nonlinear */
} /* namespace HybridFEM */
#endif /* #ifndef KOKKOS_NONLINEARFUNCTORS_HPP */

View File

@ -1,167 +0,0 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef PARALLELCOMM_HPP
#define PARALLELCOMM_HPP
//------------------------------------------------------------------------
#include <Kokkos_Macros.hpp>
//------------------------------------------------------------------------
#if defined( KOKKOS_HAVE_MPI )
#include <mpi.h>
#include <string>
namespace comm {

// Thin copyable wrapper around an MPI communicator.
struct Machine {
  MPI_Comm mpi_comm ;

  Machine() : mpi_comm( MPI_COMM_NULL ) {}

  Machine( const Machine & rhs )
    : mpi_comm( rhs.mpi_comm ) {}

  Machine( MPI_Comm c ) : mpi_comm( c ) {}

  // Initialize MPI and return a Machine wrapping MPI_COMM_WORLD.
  static Machine init( int * argc , char *** argv )
  {
    MPI_Init( argc , argv );
    return Machine( MPI_COMM_WORLD );
  }

  static void finalize() { MPI_Finalize(); }
};

// Number of processes in the machine's communicator.
inline
unsigned size( Machine machine )
{
  int np ; MPI_Comm_size( machine.mpi_comm , & np ); return np ;
}

// Rank of this process within the machine's communicator.
inline
unsigned rank( Machine machine )
{
  int ip ; MPI_Comm_rank( machine.mpi_comm , & ip ); return ip ;
}

// Global maximum of 'local' over all processes (MPI_Allreduce).
inline
double max( Machine machine , double local )
{
  double global = 0;
  MPI_Allreduce( & local , & global , 1 , MPI_DOUBLE , MPI_MAX , machine.mpi_comm );
  return global ;
}

// Concatenate the command line arguments (after argv[0]) on rank 0
// into a single space-prefixed string, then broadcast it so every
// process returns the same value.
inline
std::string command_line( Machine machine , const int argc , const char * const * const argv )
{
  std::string argline ;
  if ( 0 == rank( machine ) ) {
    for ( int i = 1 ; i < argc ; ++i ) {
      argline.append(" ").append( argv[i] );
    }
  }
  // Broadcast the length first so receivers can size their buffers.
  // Explicit cast: std::string::length() is size_t, MPI wants int.
  int length = static_cast<int>( argline.length() );
  MPI_Bcast( & length , 1 , MPI_INT , 0 , machine.mpi_comm );
  argline.resize( length , ' ' );
  // Broadcast into the string's own storage.  Use &argline[0] rather
  // than casting away the const of data(): writing through data() is
  // undefined behavior prior to C++17.  All ranks share 'length', so
  // skipping the zero-length broadcast is consistent everywhere.
  if ( length ) {
    MPI_Bcast( & argline[0] , length , MPI_CHAR , 0 , machine.mpi_comm );
  }
  return argline ;
}

}
#else /* ! defined( KOKKOS_HAVE_MPI ) */
#include <string>
namespace comm {

// Serial stand-in for the MPI-based Machine: a "machine" consisting
// of exactly one process, with the same interface as the MPI version.
struct Machine {
  static Machine init( int * , char *** ) { return Machine(); }
  static void finalize() {}
};

// A serial machine always has exactly one process, rank zero.
inline unsigned size( Machine ) { return 1 ; }

inline unsigned rank( Machine ) { return 0 ; }

// With a single process the local value is already the global maximum.
inline double max( Machine , double local ) { return local ; }

// Concatenate the command line arguments (after argv[0]) into a single
// string, each preceded by a space.  No broadcast is needed serially.
inline
std::string command_line( Machine machine , const int argc , const char * const * const argv )
{
  std::string argline ;
  if ( rank( machine ) == 0 ) {
    for ( int i = 1 ; i < argc ; ++i ) {
      argline += ' ' ;
      argline += argv[i] ;
    }
  }
  return argline ;
}

}
#endif /* ! defined( KOKKOS_HAVE_MPI ) */
//------------------------------------------------------------------------
#endif /* #ifndef PARALLELCOMM_HPP */

View File

@ -1,517 +0,0 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_PARALLELDATAMAP_HPP
#define KOKKOS_PARALLELDATAMAP_HPP
#include <utility>
#include <limits>
#include <iostream>
#include <sstream>
#include <stdexcept>
#include <Kokkos_Core.hpp>
#include <ParallelComm.hpp>
namespace Kokkos {
//----------------------------------------------------------------------------
/** \brief Parallel distributed data mapping
*
* ordering { interior : { owned items not sent elsewhere }
* send : { owned items sent }
* receive : { not-owned items received } }
*
* recv { { N ghosted items from process P : ( P , N ) } }
*
* send { { N send items to process P : ( P , N ) } }
*
* send_item { send item offsets within 'send' range }
*/
struct ParallelDataMap {
typedef View< unsigned*[2], HostSpace > host_recv_type ;
typedef View< unsigned*[2], HostSpace > host_send_type ;
typedef View< unsigned* , HostSpace > host_send_item_type ;
comm::Machine machine ; // communicator wrapper for this process
host_recv_type host_recv ; // per message: ( sending process P , item count N )
host_send_type host_send ; // per message: ( receiving process P , item count N )
host_send_item_type host_send_item ; // offsets of owned items within the 'send' range
unsigned count_interior ; // owned items not sent to any other process
unsigned count_send ; // owned items sent to other processes
unsigned count_owned ; // = count_interior + count_send
unsigned count_receive ; // not-owned (ghosted) items received
// Record the partition counts and (re)allocate the host-side message
// maps.  count_send and count_receive are derived:
//   count_send    = arg_count_owned - arg_count_interior
//   count_receive = arg_count_total - arg_count_owned
void assign( const unsigned arg_count_interior ,
const unsigned arg_count_owned ,
const unsigned arg_count_total ,
const unsigned arg_recv_msg ,
const unsigned arg_send_msg ,
const unsigned arg_send_count )
{
const std::string label("Kokkos::ParallelDataMap buffer");
count_interior = arg_count_interior ;
count_owned = arg_count_owned ;
count_send = arg_count_owned - arg_count_interior ;
count_receive = arg_count_total - arg_count_owned ;
host_recv = host_recv_type( label , arg_recv_msg );
host_send = host_send_type( label , arg_send_msg );
host_send_item = host_send_item_type( label , arg_send_count );
}
};
//----------------------------------------------------------------------------
//PackArray
//----------------------------------------------------------------------------
template< class ArrayType , class Rank = void >
struct PackArray ;
template< typename DeviceType, typename ValueType >
struct PackArray< View< ValueType* , DeviceType > , void >
{
  typedef DeviceType execution_space ;
  typedef typename DeviceType::size_type size_type ;
  typedef View< ValueType* , execution_space > array_type ;
  typedef View< ValueType* , execution_space > buffer_type ;

private:

  buffer_type output ;
  array_type  input ;
  size_type   base ;

public:

  // Copy one entry from the source array (offset by 'base') into the
  // contiguous output buffer.
  KOKKOS_INLINE_FUNCTION
  void operator()( const size_type i ) const
  { output[i] = input(base+i); }

  // Pack arg_input[ arg_begin .. arg_begin + arg_count ) into
  // arg_output[ 0 .. arg_count ).
  inline
  static
  void pack( const buffer_type & arg_output ,
             const size_type arg_begin ,
             const size_type arg_count ,
             const array_type & arg_input )
  {
    // Guard against an empty range, consistent with the rank-two
    // specialization: no functor construction or dispatch needed.
    if ( arg_count ) {
      PackArray op ;
      op.output = arg_output ;
      op.input  = arg_input ;
      op.base   = arg_begin ;
      parallel_for( arg_count , op );
    }
  }
};
template< typename DeviceType, typename ValueType , unsigned N1 >
struct PackArray< View< ValueType*[N1] , DeviceType > , void >
{
  typedef DeviceType execution_space ;
  typedef typename DeviceType::size_type size_type ;
  typedef View< ValueType*[N1] , execution_space > array_type ;
  typedef View< ValueType* , execution_space > buffer_type ;

private:

  buffer_type output ;
  array_type  input ;
  size_type   base ;

public:

  // Copy the N1 components of source row 'base+i' into consecutive
  // slots of the output buffer beginning at i*N1.
  KOKKOS_INLINE_FUNCTION
  void operator()( const size_type i ) const
  {
    const size_type offset = i * N1 ;
    for ( size_type j = 0 ; j < N1 ; ++j ) {
      output[ offset + j ] = input( base + i , j );
    }
  }

  // Pack rows [ arg_begin .. arg_begin + arg_count ) of arg_input into
  // the contiguous buffer arg_output.  A no-op for an empty range.
  inline static
  void pack( const buffer_type & arg_output ,
             const size_type arg_begin ,
             const size_type arg_count ,
             const array_type & arg_input )
  {
    if ( 0 == arg_count ) return ;

    PackArray op ;
    op.output = arg_output ;
    op.input  = arg_input ;
    op.base   = arg_begin ;
    parallel_for( arg_count , op );
  }
};
//----------------------------------------------------------------------------
//UnpackArray
//----------------------------------------------------------------------------
template< class ArrayType , class Rank = void > struct UnpackArray ;
template< typename DeviceType, typename ValueType >
struct UnpackArray< View< ValueType* , DeviceType > , void >
{
  typedef DeviceType execution_space ;
  typedef typename DeviceType::size_type size_type ;
  typedef View< ValueType* , execution_space > array_type ;
  typedef View< ValueType* , execution_space > buffer_type ;

private:

  array_type  output ;
  buffer_type input ;
  size_type   base ;

public:

  // Copy one entry from the contiguous input buffer into the
  // destination array at offset 'base'.
  KOKKOS_INLINE_FUNCTION
  void operator()( const size_type i ) const
  { output(base+i) = input[i]; }

  // Unpack arg_input[ 0 .. arg_count ) into
  // arg_output[ arg_begin .. arg_begin + arg_count ).
  inline
  static
  void unpack( const array_type & arg_output ,
               const buffer_type & arg_input ,
               const size_type arg_begin ,
               const size_type arg_count )
  {
    // Guard against an empty range, consistent with the rank-two
    // specialization: no functor construction or dispatch needed.
    if ( arg_count ) {
      UnpackArray op ;
      op.output = arg_output ;
      op.input  = arg_input ;
      op.base   = arg_begin ;
      parallel_for( arg_count , op );
    }
  }
};
template< typename DeviceType, typename ValueType , unsigned N1 >
struct UnpackArray< View< ValueType*[N1] , DeviceType > , void >
{
  typedef DeviceType execution_space ;
  typedef typename DeviceType::size_type size_type ;
  typedef View< ValueType* , execution_space > buffer_type ;
  typedef View< ValueType*[N1] , execution_space > array_type ;

private:

  array_type  output ;
  buffer_type input ;
  size_type   base ;

public:

  // Scatter N1 consecutive buffer entries (beginning at i*N1) into the
  // components of destination row 'base+i'.
  KOKKOS_INLINE_FUNCTION
  void operator()( const size_type i ) const
  {
    const size_type offset = i * N1 ;
    for ( size_type j = 0 ; j < N1 ; ++j ) {
      output( base + i , j ) = input[ offset + j ];
    }
  }

  // Unpack rows [ arg_begin .. arg_begin + arg_count ) of arg_output
  // from the contiguous buffer arg_input.  A no-op for an empty range.
  inline
  static
  void unpack( const array_type & arg_output ,
               const buffer_type & arg_input ,
               const size_type arg_begin ,
               const size_type arg_count )
  {
    if ( 0 == arg_count ) return ;

    UnpackArray op ;
    op.output = arg_output ;
    op.input  = arg_input ;
    op.base   = arg_begin ;
    parallel_for( arg_count , op );
  }
};
//----------------------------------------------------------------------------
template< class ValueType , class Device , class DataMap >
class AsyncExchange ;
} // namespace Kokkos
//----------------------------------------------------------------------------
// Application call procedure:
//
// construct: AsyncExchange object
// * pack send buffer on device
// initiate: copy send buffer from device to host
// * dispatch asynchronous local work
// complete: send/receive on host, copy receive buffer to device
// * unpack receive buffer on device
// destroy: AsyncExchange object
//
//----------------------------------------------------------------------------
#ifdef KOKKOS_HAVE_MPI
namespace Kokkos {
// Asynchronous device <-> host <-> MPI exchange of ghosted data.
// Protocol (see the comment block above): construct once, then per
// exchange call setup() to post receives and stage the packed send
// data on the host, optionally dispatch local device work, and finally
// call send_receive() to complete communication and copy the received
// data back to the device.
template< class ValueType , class Device >
class AsyncExchange< ValueType, Device , Kokkos::ParallelDataMap > {
public:
typedef Device execution_space ;
typedef Kokkos::ParallelDataMap data_map_type ;
typedef Kokkos::View< ValueType* , execution_space > buffer_dev_type ;
typedef typename buffer_dev_type::HostMirror buffer_host_type ;
private:
static const int mpi_tag = 11 ; // tag shared by all exchange messages
const data_map_type data_map ;
unsigned chunk_size ; // ValueType entries per exchanged item
unsigned send_count_max ; // largest item count over all send messages
buffer_host_type host_recv_buffer ; // all receive messages, contiguous
buffer_host_type host_send_buffer ; // all send data, contiguous
buffer_host_type send_msg_buffer ; // staging area for one send message
buffer_dev_type dev_buffer ; // shared device buffer for send + receive
buffer_dev_type dev_send_buffer ; // Subview for send
buffer_dev_type dev_recv_buffer ; // Subview for receive
std::vector< MPI_Request > recv_request ; // one request per posted receive
public:
const buffer_dev_type & buffer() const { return dev_buffer ; }
// Allocate all host/device buffers and MPI request handles for
// exchanging items of 'arg_chunk' ValueType entries according to
// the message layout described by 'arg_data_map'.
AsyncExchange( const data_map_type & arg_data_map ,
const size_t arg_chunk )
: data_map( arg_data_map )
, chunk_size( arg_chunk )
, send_count_max( 0 )
, host_recv_buffer()
, host_send_buffer()
, send_msg_buffer()
, dev_buffer()
, dev_send_buffer()
, dev_recv_buffer()
, recv_request()
{
const size_t send_msg_count = arg_data_map.host_send.dimension_0();
const size_t recv_msg_count = arg_data_map.host_recv.dimension_0();
const size_t send_msg_length = arg_chunk * arg_data_map.count_send ;
const size_t recv_msg_length = arg_chunk * arg_data_map.count_receive ;
// Send messages are staged one at a time, so the staging buffer
// must hold the largest individual message.
for ( size_t i = 0 ; i < send_msg_count ; ++i ) {
send_count_max = std::max( send_count_max ,
(unsigned) arg_data_map.host_send(i,1) );
}
// A single shared buffer on the device can be used for
// send and receive message buffers.
dev_buffer = buffer_dev_type(
std::string("AsyncExchange dev_buffer") ,
std::max( send_msg_length , recv_msg_length ) );
// Total send subview of the device buffer
dev_send_buffer =
Kokkos::subview( dev_buffer , std::pair<size_t,size_t>( 0 , send_msg_length ) );
// Total receive subview of the device buffer
dev_recv_buffer =
Kokkos::subview( dev_buffer , std::pair<size_t,size_t>( 0 , recv_msg_length ) );
// Total receive message buffer on the host:
host_recv_buffer = buffer_host_type(
std::string("AsyncExchange host_recv_buffer") ,
recv_msg_length );
// Total send message buffer on the host:
host_send_buffer = buffer_host_type(
std::string("AsyncExchange host_send_buffer") ,
send_msg_length );
// Individual send message buffer on the host:
send_msg_buffer = buffer_host_type(
std::string("AsyncExchange send_msg_buffer") ,
arg_chunk * send_count_max );
// MPI asynchronous receive request handles:
recv_request.assign( recv_msg_count , MPI_REQUEST_NULL );
}
//------------------------------------------------------------------------
// Post asynchronous receives and copy the (already packed) send data
// from the device to the host.  After this returns the device is free
// for application work until send_receive() is called.
void setup()
{
{ // Post receives:
const size_t recv_msg_count = data_map.host_recv.dimension_0();
ValueType * ptr = host_recv_buffer.ptr_on_device();
// Each message lands in its own contiguous slice of the
// host receive buffer.
for ( size_t i = 0 ; i < recv_msg_count ; ++i ) {
const int proc = data_map.host_recv(i,0);
const int count = data_map.host_recv(i,1) * chunk_size ;
MPI_Irecv( ptr , count * sizeof(ValueType) , MPI_BYTE ,
proc , mpi_tag , data_map.machine.mpi_comm ,
& recv_request[i] );
ptr += count ;
}
}
// Copy send buffer from the device to host memory for sending
Kokkos::deep_copy( host_send_buffer , dev_send_buffer );
// Done with the device until communication is complete.
// Application can dispatch asynchronous work on the device.
}
// Application can dispatch local work to device ...
// No communication progress until main thread calls 'send_receive'
// Stage and send each outgoing message, wait for and verify all posted
// receives, then copy the received data to the device.  Throws
// std::runtime_error if a received message does not match the expected
// source process or byte size.
void send_receive()
{
const size_t recv_msg_count = data_map.host_recv.dimension_0();
const size_t send_msg_count = data_map.host_send.dimension_0();
// Pack and send:
for ( size_t i = 0 , j = 0 ; i < send_msg_count ; ++i ) {
const int proc = data_map.host_send(i,0);
const int count = data_map.host_send(i,1);
// Gather this message's items (chunk_size entries each) from the
// scattered offsets listed in host_send_item into the staging buffer.
for ( int k = 0 , km = 0 ; k < count ; ++k , ++j ) {
const int km_end = km + chunk_size ;
int ki = chunk_size * data_map.host_send_item(j);
for ( ; km < km_end ; ++km , ++ki ) {
send_msg_buffer[km] = host_send_buffer[ki];
}
}
// MPI_Ssend blocks until
// (1) a receive is matched for the message and
// (2) the send buffer can be re-used.
//
// It is suggested that MPI_Ssend will have the best performance:
// http://www.mcs.anl.gov/research/projects/mpi/sendmode.html .
MPI_Ssend( send_msg_buffer.ptr_on_device(),
count * chunk_size * sizeof(ValueType) , MPI_BYTE ,
proc , mpi_tag , data_map.machine.mpi_comm );
}
// Wait for receives and verify:
for ( size_t i = 0 ; i < recv_msg_count ; ++i ) {
MPI_Status recv_status ;
int recv_which = 0 ;
int recv_size = 0 ;
MPI_Waitany( recv_msg_count , & recv_request[0] ,
& recv_which , & recv_status );
const int recv_proc = recv_status.MPI_SOURCE ;
MPI_Get_count( & recv_status , MPI_BYTE , & recv_size );
// Verify message properly received:
const int expected_proc = data_map.host_recv(recv_which,0);
const int expected_size = data_map.host_recv(recv_which,1) *
chunk_size * sizeof(ValueType);
if ( ( expected_proc != recv_proc ) ||
( expected_size != recv_size ) ) {
std::ostringstream msg ;
msg << "AsyncExchange error:"
<< " P" << comm::rank( data_map.machine )
<< " received from P" << recv_proc
<< " size " << recv_size
<< " expected " << expected_size
<< " from P" << expected_proc ;
throw std::runtime_error( msg.str() );
}
}
// Copy received data to device memory.
Kokkos::deep_copy( dev_recv_buffer , host_recv_buffer );
}
};
} // namespace Kokkos
#else /* ! #ifdef KOKKOS_HAVE_MPI */
namespace Kokkos {
// Serial (non-MPI) stand-in for AsyncExchange: with a single process
// there is nothing to exchange, so setup() and send_receive() are
// no-ops and the device buffer is left default-constructed.
template< class ValueType , class Device >
class AsyncExchange< ValueType, Device , Kokkos::ParallelDataMap > {
public:
typedef Device execution_space ;
typedef Kokkos::ParallelDataMap data_map_type ;
typedef Kokkos::View< ValueType* , execution_space > buffer_dev_type ;
typedef typename buffer_dev_type::HostMirror buffer_host_type ;
buffer_dev_type dev_buffer ;
public:
const buffer_dev_type & buffer() const { return dev_buffer ; }
// Arguments are accepted (and ignored) to match the MPI interface.
AsyncExchange( const data_map_type & , const size_t )
: dev_buffer()
{ }
//------------------------------------------------------------------------
void setup() { }
void send_receive() { }
};
} // namespace Kokkos
#endif /* ! #ifdef KOKKOS_HAVE_MPI */
//----------------------------------------------------------------------------
#endif /* #ifndef KOKKOS_PARALLELDATAMAP_HPP */

View File

@ -1,178 +0,0 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#if 0
#include <stdlib.h>
#include <string.h>
#include <ParallelMachine.hpp>
#include <Kokkos_Core.hpp>
#if ! defined( KOKKOS_HAVE_MPI )
#define MPI_COMM_NULL 0
#endif
//------------------------------------------------------------------------
namespace Parallel {
// Coordinated initialization of MPI and the Kokkos devices based on
// command line arguments ("mpi_cuda", "cuda", "host [#gang #worker]").
// Ordering is deliberate: a CUDA-aware MPI ("mpi_cuda") requires the
// Cuda device to be selected before MPI_Init, while hwloc is queried
// only after MPI initialization (see inline comments).
Machine::Machine( int * argc , char *** argv )
: m_mpi_comm( MPI_COMM_NULL )
, m_mpi_size(0)
, m_mpi_rank(0)
, m_mpi_gpu(0)
{
#if defined( KOKKOS_HAVE_CUDA )
//------------------------------------
// Might be using a Cuda aware version of MPI.
// Must select Cuda device before initializing MPI.
{
// Scan argv for the "mpi_cuda" argument.
int i = 1 ;
for ( ; i < *argc && strcmp((*argv)[i],"mpi_cuda") ; ++i );
if ( i < *argc ) {
// Determine, if possible, what will be the node-local
// rank of the MPI process once MPI has been initialized.
// This rank is needed to set the Cuda device before 'mvapich'
// is initialized.
const char * const mvapich_local_rank = getenv("MV2_COMM_WORLD_LOCAL_RANK");
const char * const slurm_local_rank = getenv("SLURM_LOCALID");
const int pre_mpi_local_rank =
0 != mvapich_local_rank ? atoi( mvapich_local_rank ) : (
0 != slurm_local_rank ? atoi( slurm_local_rank ) : (
-1 ) );
if ( 0 <= pre_mpi_local_rank ) {
// Round-robin local MPI ranks over the available devices.
const int ngpu = Kokkos::Cuda::detect_device_count();
const int cuda_device_rank = pre_mpi_local_rank % ngpu ;
Kokkos::Cuda::initialize( Kokkos::Cuda::SelectDevice( cuda_device_rank ) );
m_mpi_gpu = 1 ;
}
}
}
#endif
//------------------------------------
#if defined( KOKKOS_HAVE_MPI )
MPI_Init( argc , argv );
m_mpi_comm = MPI_COMM_WORLD ;
MPI_Comm_size( m_mpi_comm , & m_mpi_size );
MPI_Comm_rank( m_mpi_comm , & m_mpi_rank );
#endif
// Query hwloc after MPI initialization to allow MPI binding:
//------------------------------------
// Request to use host device:
{
// Scan argv for the "host" argument; optional gang/worker counts
// may follow it, otherwise hwloc-detected topology is used.
int i = 1 ;
for ( ; i < *argc && strcmp((*argv)[i],"host") ; ++i );
if ( i < *argc ) {
unsigned team_count = Kokkos::hwloc::get_available_numa_count();
unsigned threads_per_team = Kokkos::hwloc::get_available_cores_per_numa() *
Kokkos::hwloc::get_available_threads_per_core();
if ( i + 2 < *argc ) {
team_count = atoi( (*argv)[i+1] );
threads_per_team = atoi( (*argv)[i+2] );
}
Kokkos::Threads::initialize( team_count * threads_per_team );
}
}
#if defined( KOKKOS_HAVE_CUDA )
//------------------------------------
// Request to use Cuda device and not already initialized.
if ( ! m_mpi_gpu ) {
int i = 1 ;
for ( ; i < *argc && strcmp((*argv)[i],"mpi_cuda") && strcmp((*argv)[i],"cuda") ; ++i );
if ( i < *argc ) {
// Round-robin global MPI ranks over the available devices.
const int ngpu = Kokkos::Cuda::detect_device_count();
const int cuda_device_rank = m_mpi_rank % ngpu ;
Kokkos::Cuda::initialize( Kokkos::Cuda::SelectDevice( cuda_device_rank ) );
}
}
#endif
}
// Shut down the Kokkos devices and, when enabled, finalize MPI last.
Machine::~Machine()
{
Kokkos::Threads::finalize();
#if defined( KOKKOS_HAVE_CUDA )
Kokkos::Cuda::finalize();
#endif
#if defined( KOKKOS_HAVE_MPI )
MPI_Finalize();
#endif
}
// Write this process's MPI rank/size and the configuration of each
// enabled Kokkos device to the given output stream.
void Machine::print_configuration( std::ostream & msg ) const
{
msg << "MPI [ " << m_mpi_rank << " / " << m_mpi_size << " ]" << std::endl ;
Kokkos::Threads::print_configuration( msg );
#if defined( KOKKOS_HAVE_CUDA )
Kokkos::Cuda::print_configuration( msg );
#endif
}
}
#endif /* #if 0 */

View File

@ -1,118 +0,0 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#error "ParallelMachine"
#ifndef PARALLELMACHINE_HPP
#define PARALLELMACHINE_HPP
//------------------------------------------------------------------------
#include <iosfwd>
#include <Kokkos_Core.hpp>
//------------------------------------------------------------------------
#if defined( KOKKOS_HAVE_MPI )
#include <mpi.h>
#else
typedef int MPI_Comm ;
#endif
//------------------------------------------------------------------------
//------------------------------------------------------------------------
namespace Parallel {
/** \brief Hybrid parallel machine with MPI+Kokkos::Threads or MPI+Kokkos::Cuda.
*
* Initialization of MPI and Kokkos device has interdependencies which this
* class manages. The command line and environment variables are queried to initialize
* the Threads or Cuda device:
*
* 1) cuda : initializes Cuda device
* 2) host : initializes Threads device with all hwloc detected cores.
* 3) host #gang #worker : initializes Threads with specified
*/
class Machine {
private:
MPI_Comm m_mpi_comm ; // MPI communicator (stub int when MPI disabled)
int m_mpi_size ; // number of MPI processes
int m_mpi_rank ; // rank of this process
unsigned m_mpi_gpu ; // nonzero if MPI can operate on GPU memory
unsigned m_gpu_arch ; // GPU architecture identifier, if a GPU is used
// Non-default-constructible and non-copyable: a Machine owns the
// process-wide MPI / device initialization.
Machine();
Machine( const Machine & );
Machine & operator = ( const Machine & );
public:
/** \brief Coordinated initialize MPI, Cuda, or Threads devices from 'main'. */
Machine( int * argc , char *** argv );
~Machine();
MPI_Comm mpi_comm() const { return m_mpi_comm ; }
int mpi_size() const { return m_mpi_size ; }
int mpi_rank() const { return m_mpi_rank ; }
/** \brief If using MPI that can directly operate on GPU memory */
bool mpi_gpu() const { return m_mpi_gpu ; }
/** \brief If using GPU then what architecture */
unsigned gpu_arch() const { return m_gpu_arch ; }
void print_configuration( std::ostream & ) const ;
};
}
//------------------------------------------------------------------------
#endif /* #ifndef PARALLELMACHINE_HPP */

View File

@ -1,400 +0,0 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef SPARSELINEARSYSTEM_HPP
#define SPARSELINEARSYSTEM_HPP
#include <cmath>
#include <limits>
#include <impl/Kokkos_Timer.hpp>
#include <Kokkos_Core.hpp>
#include <Kokkos_StaticCrsGraph.hpp>
#include <LinAlgBLAS.hpp>
namespace Kokkos {
// Compressed-row-storage (CRS) sparse matrix over a Kokkos execution space.
// The sparsity pattern (row offsets + column indices) lives in 'graph';
// the nonzero values live in 'coefficients', indexed 1:1 with graph.entries.
template< typename ScalarType , class Device >
struct CrsMatrix {
typedef Device execution_space ;
typedef ScalarType value_type ;
// 'int' ordinal type for both row map and entries.
typedef StaticCrsGraph< int , execution_space , void , int > graph_type ;
// One coefficient per graph entry.
typedef View< value_type* , execution_space > coefficients_type ;
graph_type graph ;
coefficients_type coefficients ;
};
//----------------------------------------------------------------------------
namespace Impl {
// Primary template for sparse matrix-vector multiply, y = A * x.
// Specialized below for host devices and (under CUDA) for cuSPARSE.
template< class Matrix , class OutputVector , class InputVector >
struct Multiply ;
}
}
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
// Host-device specialization of sparse matrix-vector multiply:
// y = A * x, one parallel work item per matrix row.
template< typename AScalarType ,
typename VScalarType ,
class DeviceType >
struct Multiply< CrsMatrix<AScalarType,DeviceType> ,
View<VScalarType*,DeviceType > ,
View<VScalarType*,DeviceType > >
{
typedef DeviceType execution_space ;
typedef typename execution_space::size_type size_type ;
typedef View< VScalarType*, execution_space, MemoryUnmanaged > vector_type ;
typedef View< const VScalarType*, execution_space, MemoryUnmanaged > vector_const_type ;
typedef CrsMatrix< AScalarType , execution_space > matrix_type ;
private:
matrix_type m_A ;
vector_const_type m_x ;
vector_type m_y ;
public:
//--------------------------------------------------------------------------
// Compute one row of the product: y(iRow) = sum_j A(iRow,j) * x(j).
// The accumulator is 'double' regardless of the scalar types to reduce
// round-off in the row sum.
KOKKOS_INLINE_FUNCTION
void operator()( const size_type iRow ) const
{
const size_type iEntryBegin = m_A.graph.row_map[iRow];
const size_type iEntryEnd = m_A.graph.row_map[iRow+1];
double sum = 0 ;
#if defined( __INTEL_COMPILER )
// Intel compiler: explicitly request vectorized reduction of the row sum.
#pragma simd reduction(+:sum)
#pragma ivdep
for ( size_type iEntry = iEntryBegin ; iEntry < iEntryEnd ; ++iEntry ) {
sum += m_A.coefficients(iEntry) * m_x( m_A.graph.entries(iEntry) );
}
#else
// Same loop without Intel-specific pragmas.
for ( size_type iEntry = iEntryBegin ; iEntry < iEntryEnd ; ++iEntry ) {
sum += m_A.coefficients(iEntry) * m_x( m_A.graph.entries(iEntry) );
}
#endif
m_y(iRow) = sum ;
}
// Constructing the functor immediately launches the multiply over all
// 'nrow' rows.  The column count is unused on the host path (the graph
// entries already index into x).
Multiply( const matrix_type & A ,
const size_type nrow ,
const size_type , // ncol ,
const vector_type & x ,
const vector_type & y )
: m_A( A ), m_x( x ), m_y( y )
{
parallel_for( nrow , *this );
}
};
//----------------------------------------------------------------------------
} // namespace Impl
} // namespace Kokkos
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
//----------------------------------------------------------------------------
// Distributed sparse matrix-vector operator.  Wraps a local CrsMatrix plus
// the inter-process exchange needed to gather off-process entries of the
// input vector before the local multiply.
template< typename AScalarType ,
typename VScalarType ,
class Device >
class Operator {
typedef CrsMatrix<AScalarType,Device> matrix_type ;
typedef View<VScalarType*,Device> vector_type ;
private:
const CrsMatrix<AScalarType,Device> A ;
ParallelDataMap data_map ;
AsyncExchange< VScalarType , Device , ParallelDataMap > exchange ;
public:
// chunk size 1: one scalar exchanged per shared node.
Operator( const ParallelDataMap & arg_data_map ,
const CrsMatrix<AScalarType,Device> & arg_A )
: A( arg_A )
, data_map( arg_data_map )
, exchange( arg_data_map , 1 )
{}
// y = A * x.  NOTE(review): 'x' is also written to — received off-process
// values are unpacked into its [count_owned, count_owned+count_receive)
// tail, so 'x' must be sized owned + received (see cgsolve's pAll).
void apply( const View<VScalarType*,Device> & x ,
const View<VScalarType*,Device> & y )
{
// Gather off-processor data for 'x'
PackArray< vector_type >::pack( exchange.buffer() ,
data_map.count_interior ,
data_map.count_send , x );
exchange.setup();
// If interior & boundary matrices then could launch interior multiply
exchange.send_receive();
UnpackArray< vector_type >::unpack( x , exchange.buffer() ,
data_map.count_owned ,
data_map.count_receive );
const typename Device::size_type nrow = data_map.count_owned ;
const typename Device::size_type ncol = data_map.count_owned +
data_map.count_receive ;
// Constructor of Impl::Multiply performs the local multiply.
Impl::Multiply<matrix_type,vector_type,vector_type>( A, nrow, ncol, x, y);
}
};
//----------------------------------------------------------------------------
// Unpreconditioned conjugate-gradient solve of A * x = b on a distributed
// CRS matrix.  On return: 'iteration' holds the iteration count, 'normr'
// the final residual norm, 'iter_time' the wall-clock seconds spent in the
// iteration loop.  'x' must hold the initial guess on entry.
// NOTE(review): the default tolerance uses std::numeric_limits, which
// requires <limits>; only <cmath> is included here — confirm include set.
template< typename AScalarType , typename VScalarType , class Device >
void cgsolve(
const ParallelDataMap data_map ,
const CrsMatrix<AScalarType,Device> A ,
const View<VScalarType*,Device> b ,
const View<VScalarType*,Device> x ,
size_t & iteration ,
double & normr ,
double & iter_time ,
const size_t maximum_iteration = 200 ,
const double tolerance = std::numeric_limits<VScalarType>::epsilon() )
{
typedef View<VScalarType*,Device> vector_type ;
//typedef View<VScalarType, Device> value_type ; // unused
const size_t count_owned = data_map.count_owned ;
const size_t count_total = data_map.count_owned + data_map.count_receive ;
Operator<AScalarType,VScalarType,Device> matrix_operator( data_map , A );
// Need input vector to matvec to be owned + received
vector_type pAll ( "cg::p" , count_total );
// 'p' is the owned prefix of pAll; the matvec unpacks received values
// into pAll's tail.
vector_type p = Kokkos::subview( pAll , std::pair<size_t,size_t>(0,count_owned) );
vector_type r ( "cg::r" , count_owned );
vector_type Ap( "cg::Ap", count_owned );
/* r = b - A * x ; */
/* p = x */ deep_copy( p , x );
/* Ap = A * p */ matrix_operator.apply( pAll , Ap );
/* r = b - Ap */ waxpby( count_owned , 1.0 , b , -1.0 , Ap , r );
/* p = r */ deep_copy( p , r );
// old_rdot = r . r  (global dot product across ranks)
double old_rdot = dot( count_owned , r , data_map.machine );
normr = sqrt( old_rdot );
iteration = 0 ;
Kokkos::Impl::Timer wall_clock ;
while ( tolerance < normr && iteration < maximum_iteration ) {
/* pAp_dot = dot( p , Ap = A * p ) */
/* Ap = A * p */ matrix_operator.apply( pAll , Ap );
const double pAp_dot = dot( count_owned , p , Ap , data_map.machine );
const double alpha = old_rdot / pAp_dot ;
/* x += alpha * p ; */ axpy( count_owned, alpha, p , x );
/* r -= alpha * Ap ; */ axpy( count_owned, -alpha, Ap, r );
const double r_dot = dot( count_owned , r , data_map.machine );
const double beta = r_dot / old_rdot ;
/* p = r + beta * p ; */ xpby( count_owned , r , beta , p );
// Record the new residual norm and carry r_dot into the next iteration.
normr = sqrt( old_rdot = r_dot );
++iteration ;
}
iter_time = wall_clock.seconds();
}
//----------------------------------------------------------------------------
} // namespace Kokkos
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
#if defined( KOKKOS_HAVE_CUDA )
#if ( CUDA_VERSION < 6000 )
#pragma message "cusparse_v2.h"
#include <cusparse_v2.h>
#else
#pragma message "cusparse.h"
#include <cusparse.h>
#endif
namespace Kokkos {
namespace Impl {
// Process-wide cuSPARSE context: one library handle and one matrix
// descriptor (general matrix, zero-based indexing) shared by all
// cuSPARSE-backed Multiply specializations.
// NOTE(review): the cusparse* return statuses are not checked here —
// a failed initialization would only surface later in csrmv calls.
struct CudaSparseSingleton {
cusparseHandle_t handle;
cusparseMatDescr_t descra;
CudaSparseSingleton()
{
cusparseCreate( & handle );
cusparseCreateMatDescr( & descra );
cusparseSetMatType( descra , CUSPARSE_MATRIX_TYPE_GENERAL );
cusparseSetMatIndexBase( descra , CUSPARSE_INDEX_BASE_ZERO );
}
// Accessor for the lazily-constructed singleton (defined elsewhere).
static CudaSparseSingleton & singleton();
};
// cuSPARSE-backed specialization of y = A * x for double-precision
// matrices/vectors on the Cuda device.
template<>
struct Multiply< CrsMatrix<double,Cuda> ,
View<double*,Cuda > ,
View<double*,Cuda > >
{
typedef Cuda execution_space ;
typedef execution_space::size_type size_type ;
typedef double scalar_type ;
typedef View< scalar_type* , execution_space > vector_type ;
typedef CrsMatrix< scalar_type , execution_space > matrix_type ;
public:
// Performs y = A * x immediately upon construction.
// nrow/ncol: owned row count and total (owned + received) column count.
Multiply( const matrix_type & A ,
const size_type nrow ,
const size_type ncol ,
const vector_type & x ,
const vector_type & y )
{
CudaSparseSingleton & s = CudaSparseSingleton::singleton();
// alpha = 1, beta = 0  =>  plain product, y is overwritten.
const scalar_type alpha = 1 , beta = 0 ;
cusparseStatus_t status =
cusparseDcsrmv( s.handle ,
CUSPARSE_OPERATION_NON_TRANSPOSE ,
nrow , ncol , A.coefficients.dimension_0() ,
&alpha ,
s.descra ,
A.coefficients.ptr_on_device() ,
A.graph.row_map.ptr_on_device() ,
A.graph.entries.ptr_on_device() ,
x.ptr_on_device() ,
&beta ,
y.ptr_on_device() );
if ( CUSPARSE_STATUS_SUCCESS != status ) {
throw std::runtime_error( std::string("ERROR - cusparseDcsrmv " ) );
}
}
};
// cuSPARSE-backed specialization of y = A * x for single-precision
// matrices/vectors on the Cuda device.
template<>
struct Multiply< CrsMatrix<float,Cuda> ,
View<float*,Cuda > ,
View<float*,Cuda > >
{
typedef Cuda execution_space ;
typedef execution_space::size_type size_type ;
typedef float scalar_type ;
typedef View< scalar_type* , execution_space > vector_type ;
typedef CrsMatrix< scalar_type , execution_space > matrix_type ;
public:
// Performs y = A * x immediately upon construction.
// nrow/ncol: owned row count and total (owned + received) column count.
Multiply( const matrix_type & A ,
const size_type nrow ,
const size_type ncol ,
const vector_type & x ,
const vector_type & y )
{
CudaSparseSingleton & s = CudaSparseSingleton::singleton();
// alpha = 1, beta = 0  =>  plain product, y is overwritten.
const scalar_type alpha = 1 , beta = 0 ;
cusparseStatus_t status =
cusparseScsrmv( s.handle ,
CUSPARSE_OPERATION_NON_TRANSPOSE ,
nrow , ncol , A.coefficients.dimension_0() ,
&alpha ,
s.descra ,
A.coefficients.ptr_on_device() ,
A.graph.row_map.ptr_on_device() ,
A.graph.entries.ptr_on_device() ,
x.ptr_on_device() ,
&beta ,
y.ptr_on_device() );
if ( CUSPARSE_STATUS_SUCCESS != status ) {
// Bug fix: the diagnostic previously named the double-precision
// routine (cusparseDcsrmv); this path calls cusparseScsrmv.
throw std::runtime_error( std::string("ERROR - cusparseScsrmv " ) );
}
}
};
} /* namespace Impl */
} /* namespace Kokkos */
#endif /* #if defined( KOKKOS_HAVE_CUDA ) */
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
#endif /* #ifndef SPARSELINEARSYSTEM_HPP */

View File

@ -1,276 +0,0 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef SPARSELINEARSYSTEMFILL_HPP
#define SPARSELINEARSYSTEMFILL_HPP
#include <vector>
#include <algorithm>
#include <limits>
#include <FEMesh.hpp>
#include <SparseLinearSystem.hpp>
//----------------------------------------------------------------------------
namespace HybridFEM {
// Primary template: gather element contributions into the global linear
// system.  Specialized below for Kokkos::CrsMatrix + FEMesh.
template< class MatrixType , class MeshType ,
class elem_matrices_type ,
class elem_vectors_type > struct GatherFill ;
// Gather-based assembly of the global sparse matrix and right-hand side
// from per-element stiffness matrices and load vectors.  Each parallel
// work item owns one matrix row (= one node), so no atomics are needed:
// a row's coefficients are written only by that row's work item.
template< typename ScalarType ,
class DeviceType ,
unsigned ElemNode ,
typename CoordScalarType ,
class elem_matrices_type ,
class elem_vectors_type >
struct GatherFill<
Kokkos::CrsMatrix< ScalarType , DeviceType > ,
FEMesh< CoordScalarType , ElemNode , DeviceType > ,
elem_matrices_type , elem_vectors_type >
{
typedef DeviceType execution_space ;
typedef typename execution_space::size_type size_type ;
static const size_type ElemNodeCount = ElemNode ;
typedef Kokkos::CrsMatrix< ScalarType , execution_space > matrix_type ;
typedef typename matrix_type::coefficients_type coefficients_type ;
typedef Kokkos::View< ScalarType[] , execution_space > vector_type ;
// (element, element-row, element-col) -> index into the CRS coefficients.
typedef Kokkos::View< size_type[][ElemNodeCount][ElemNodeCount] , execution_space > elem_graph_type ;
typedef FEMesh< CoordScalarType , ElemNodeCount , execution_space > mesh_type ;
typedef typename mesh_type::node_elem_ids_type node_elem_ids_type ;
private:
node_elem_ids_type node_elem_ids ;
elem_graph_type elem_graph ;
elem_matrices_type elem_matrices ;
elem_vectors_type elem_vectors ;
coefficients_type system_coeff ;
vector_type system_rhs ;
public:
// Accumulate all element contributions for matrix row / rhs entry 'irow'.
KOKKOS_INLINE_FUNCTION
void operator()( size_type irow ) const
{
const size_type node_elem_begin = node_elem_ids.row_map[irow];
const size_type node_elem_end = node_elem_ids.row_map[irow+1];
// for each element that a node belongs to
for ( size_type i = node_elem_begin ; i < node_elem_end ; i++ ) {
// entries(i,0) = element id, entries(i,1) = this node's local row
// within that element.
const size_type elem_id = node_elem_ids.entries( i, 0);
const size_type row_index = node_elem_ids.entries( i, 1);
system_rhs(irow) += elem_vectors(elem_id, row_index);
// for each node in a particular related element
// gather the contents of the element stiffness
// matrix that belong in irow
for ( size_type j = 0 ; j < ElemNodeCount ; ++j ){
const size_type A_index = elem_graph( elem_id , row_index , j );
system_coeff( A_index ) += elem_matrices( elem_id, row_index, j );
}
}
}
// Launch the assembly: one work item per matrix row.
static void apply( const matrix_type & matrix ,
const vector_type & rhs ,
const mesh_type & mesh ,
const elem_graph_type & elem_graph ,
const elem_matrices_type & elem_matrices ,
const elem_vectors_type & elem_vectors )
{
// row_map has (row_count + 1) offsets.
const size_t row_count = matrix.graph.row_map.dimension_0() - 1 ;
GatherFill op ;
op.node_elem_ids = mesh.node_elem_ids ;
op.elem_graph = elem_graph ;
op.elem_matrices = elem_matrices ;
op.elem_vectors = elem_vectors ;
op.system_coeff = matrix.coefficients ;
op.system_rhs = rhs ;
parallel_for( row_count , op );
}
};
} /* namespace HybridFEM */
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace HybridFEM {
// Builds, on the host, the node-node sparsity graph of the finite-element
// matrix plus the element map used by GatherFill: for every
// (element, local row, local column) it records the index of the matching
// entry in the CRS coefficient array.
template< class GraphType , class MeshType >
struct GraphFactory {
typedef GraphType graph_type ;
typedef MeshType mesh_type ;
typedef typename graph_type::execution_space execution_space ;
typedef typename execution_space::size_type size_type ;
static const unsigned ElemNodeCount = mesh_type::element_node_count ;
typedef Kokkos::View< size_type[][ElemNodeCount][ElemNodeCount] , execution_space > element_map_type ;
// Populate 'graph' and 'elem_map' from the mesh connectivity.
static
void
create( const mesh_type & mesh ,
graph_type & graph ,
element_map_type & elem_map )
{
// Host mirrors of the device-resident connectivity.
typename mesh_type::node_elem_ids_type::HostMirror
node_elem_ids = create_mirror( mesh.node_elem_ids );
typename mesh_type::elem_node_ids_type::HostMirror
elem_node_ids = create_mirror( mesh.elem_node_ids );
typedef typename element_map_type::HostMirror element_map_host_type ;
deep_copy( elem_node_ids , mesh.elem_node_ids );
// NOTE(review): only 'entries' is copied here; row_map is presumably
// filled when the mirror is created — confirm against StaticCrsGraph.
deep_copy( node_elem_ids.entries , mesh.node_elem_ids.entries );
const size_t owned_node = mesh.parallel_data_map.count_owned ;
const size_t total_elem = mesh.elem_node_ids.dimension_0();
if ( total_elem ) {
elem_map = element_map_type( std::string("element_map"), total_elem );
}
element_map_host_type elem_map_host = create_mirror( elem_map );
//------------------------------------
// Node->node mapping for the CrsMatrix graph
std::vector< std::vector< unsigned > > node_node_ids( owned_node );
std::vector< unsigned > node_node_begin( owned_node );
size_t offset = 0 ;
// For each owned node: collect, sort, and de-duplicate the nodes of
// every element containing it.  'offset' accumulates the CRS row start.
for ( size_t i = 0 ; i < owned_node ; ++i ) {
const size_t j_end = node_elem_ids.row_map[i+1];
size_t j = node_elem_ids.row_map[i];
node_node_begin[i] = offset ;
std::vector< unsigned > & work = node_node_ids[i] ;
for ( ; j < j_end ; ++j ) {
const size_t elem_id = node_elem_ids.entries(j,0);
for ( size_t k = 0 ; k < ElemNodeCount ; ++k ) {
work.push_back( elem_node_ids( elem_id , k ) );
}
}
std::sort( work.begin() , work.end() );
work.erase( std::unique( work.begin() , work.end() ) , work.end() );
offset += work.size();
}
graph = Kokkos::create_staticcrsgraph< graph_type >( "node_node_ids" , node_node_ids );
//------------------------------------
// ( element , node_row , node_column ) -> matrix_crs_column
for ( size_t elem_id = 0 ; elem_id < total_elem ; ++elem_id ) {
for ( size_t i = 0 ; i < ElemNodeCount ; ++i ) {
const size_t node_row = elem_node_ids( elem_id , i );
const size_t node_row_begin = node_node_begin[ node_row ];
const std::vector< unsigned > & column = node_node_ids[ node_row ] ;
if ( owned_node <= node_row ) {
// Row owned by another process: mark entries as unmapped.
for ( unsigned j = 0 ; j < ElemNodeCount ; ++j ) {
elem_map_host( elem_id , i , j ) = std::numeric_limits<size_type>::max();
}
}
else {
for ( unsigned j = 0 ; j < ElemNodeCount ; ++j ) {
const size_type node_col = elem_node_ids( elem_id , j );
// Binary search (lower-bound) for node_col in the sorted row.
int col_search = 0 ;
for ( int len = column.size() ; 0 < len ; ) {
const int half = len >> 1;
const int middle = col_search + half ;
if ( column[middle] < node_col ){
col_search = middle + 1 ;
len -= half + 1 ;
}
else {
len = half ;
}
}
// The column must exist: every element node was inserted above.
if ( node_col != column[col_search] ) {
throw std::runtime_error(std::string("Failed"));
}
elem_map_host( elem_id , i , j ) = col_search + node_row_begin ;
}
}
}
}
deep_copy( elem_map , elem_map_host );
}
};
} // namespace HybridFEM
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
#endif /* #ifndef SPARSELINEARSYSTEMFILL_HPP */

View File

@ -1,164 +0,0 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef SPARSELINEARSYSTEM_CUDA_HPP
#define SPARSELINEARSYSTEM_CUDA_HPP
#if defined( BUILD_FROM_CU_FILE )
#include <cusparse_v2.h>
#include <Kokkos_Core.hpp>
namespace Kokkos {
namespace Impl {
// Process-wide cuSPARSE context: one library handle and one matrix
// descriptor (general matrix, zero-based indexing) shared by the
// cuSPARSE-backed Multiply specializations below.
// NOTE(review): the cusparse* return statuses are not checked here —
// a failed initialization would only surface later in csrmv calls.
struct CudaSparseSingleton {
cusparseHandle_t handle;
cusparseMatDescr_t descra;
CudaSparseSingleton()
{
cusparseCreate( & handle );
cusparseCreateMatDescr( & descra );
cusparseSetMatType( descra , CUSPARSE_MATRIX_TYPE_GENERAL );
cusparseSetMatIndexBase( descra , CUSPARSE_INDEX_BASE_ZERO );
}
static CudaSparseSingleton & singleton();
};
// Meyers singleton: the instance is constructed on first use and the same
// object is returned on every subsequent call.
CudaSparseSingleton & CudaSparseSingleton::singleton()
{
  static CudaSparseSingleton instance ;
  return instance ;
}
// cuSPARSE-backed specialization of y = A * x for double-precision
// matrices/vectors on the Cuda device.
template<>
struct Multiply< CrsMatrix<double,Cuda> ,
View<double*,Cuda > ,
View<double*,Cuda > >
{
typedef Cuda execution_space ;
typedef execution_space::size_type size_type ;
typedef double scalar_type ;
typedef View< scalar_type* , execution_space > vector_type ;
typedef CrsMatrix< scalar_type , execution_space > matrix_type ;
public:
// Performs y = A * x immediately upon construction.
// nrow/ncol: owned row count and total (owned + received) column count.
Multiply( const matrix_type & A ,
const size_type nrow ,
const size_type ncol ,
const vector_type & x ,
const vector_type & y )
{
CudaSparseSingleton & s = CudaSparseSingleton::singleton();
// alpha = 1, beta = 0  =>  plain product, y is overwritten.
const scalar_type alpha = 1 , beta = 0 ;
cusparseStatus_t status =
cusparseDcsrmv( s.handle ,
CUSPARSE_OPERATION_NON_TRANSPOSE ,
nrow , ncol , A.coefficients.dimension_0() ,
&alpha ,
s.descra ,
A.coefficients.ptr_on_device() ,
A.graph.row_map.ptr_on_device() ,
A.graph.entries.ptr_on_device() ,
x.ptr_on_device() ,
&beta ,
y.ptr_on_device() );
if ( CUSPARSE_STATUS_SUCCESS != status ) {
throw std::runtime_error( std::string("ERROR - cusparseDcsrmv " ) );
}
}
};
// cuSPARSE-backed specialization of y = A * x for single-precision
// matrices/vectors on the Cuda device.
template<>
struct Multiply< CrsMatrix<float,Cuda> ,
View<float*,Cuda > ,
View<float*,Cuda > >
{
typedef Cuda execution_space ;
typedef execution_space::size_type size_type ;
typedef float scalar_type ;
typedef View< scalar_type* , execution_space > vector_type ;
typedef CrsMatrix< scalar_type , execution_space > matrix_type ;
public:
// Performs y = A * x immediately upon construction.
// nrow/ncol: owned row count and total (owned + received) column count.
Multiply( const matrix_type & A ,
const size_type nrow ,
const size_type ncol ,
const vector_type & x ,
const vector_type & y )
{
CudaSparseSingleton & s = CudaSparseSingleton::singleton();
// alpha = 1, beta = 0  =>  plain product, y is overwritten.
const scalar_type alpha = 1 , beta = 0 ;
cusparseStatus_t status =
cusparseScsrmv( s.handle ,
CUSPARSE_OPERATION_NON_TRANSPOSE ,
nrow , ncol , A.coefficients.dimension_0() ,
&alpha ,
s.descra ,
A.coefficients.ptr_on_device() ,
A.graph.row_map.ptr_on_device() ,
A.graph.entries.ptr_on_device() ,
x.ptr_on_device() ,
&beta ,
y.ptr_on_device() );
if ( CUSPARSE_STATUS_SUCCESS != status ) {
// Bug fix: the diagnostic previously named the double-precision
// routine (cusparseDcsrmv); this path calls cusparseScsrmv.
throw std::runtime_error( std::string("ERROR - cusparseScsrmv " ) );
}
}
};
} /* namespace Impl */
} /* namespace Kokkos */
#endif /* #if defined( BUILD_FROM_CU_FILE ) */
#endif /* #ifndef SPARSELINEARSYSTEM_CUDA_HPP */

View File

@ -1,242 +0,0 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef TESTFEMESHBOXFIXTURE_HPP
#define TESTFEMESHBOXFIXTURE_HPP
#include <stdio.h>
#include <iostream>
#include <stdexcept>
#include <limits>
#include <utility>
#include <BoxMeshFixture.hpp>
#include <ParallelComm.hpp>
//----------------------------------------------------------------------------
namespace TestFEMesh {
// Primary template: verification functor comparing exchanged coordinate
// data against local node coordinates.  Specialized below for T*[3] views.
template< class ViewType >
struct VerifyUnpack ;
// Parallel-reduce functor: compares received 3-component node coordinates
// in 'buffer' against the locally stored 'node_coords' and counts the
// mismatches.  The reduction value is the number of failed nodes.
template< typename DeviceType, typename T >
struct VerifyUnpack< Kokkos::View< T*[3] , DeviceType > >
{
typedef DeviceType execution_space ;
typedef typename execution_space::size_type size_type ;
typedef size_type value_type ;
typedef Kokkos::View< T* , execution_space > buffer_type ;
typedef Kokkos::View< T*[3] , execution_space > array_type ;
private:
array_type node_coords ;
buffer_type buffer ;
size_type node_begin ;
public:
// Reduction identity: zero mismatches.
KOKKOS_INLINE_FUNCTION
static void init( value_type & update )
{ update = 0 ; }
// Reduction join: mismatch counts add.
KOKKOS_INLINE_FUNCTION
static void join( volatile value_type & update ,
const volatile value_type & source )
{ update += source ; }
// Compare the i-th received node (3 packed values) against the stored
// coordinates of node (node_begin + i); count a mismatch.
KOKKOS_INLINE_FUNCTION
void operator()( const size_type i , value_type & update ) const
{
const size_type node_id = i + node_begin ;
const size_type k = i * 3 ;
const long xb = buffer[k];
const long yb = buffer[k+1];
const long zb = buffer[k+2];
const long xn = node_coords(node_id,0);
const long yn = node_coords(node_id,1);
const long zn = node_coords(node_id,2);
if ( xb != xn || yb != yn || zb != zn ) {
printf("TestFEMesh::VerifyUnpack failed at %d : node %d : { %ld %ld %ld } != { %ld %ld %ld }\n",
(int)i,(int)node_id, xb,yb,zb, xn, yn, zn );
++update ;
}
}
// Run the verification over 'arg_node_count' received nodes; returns the
// number of mismatched nodes (0 == success).
static inline
size_type unpack( const array_type & arg_node_coords ,
const size_type arg_node_begin ,
const size_type arg_node_count ,
const buffer_type & arg_buffer )
{
VerifyUnpack op ;
op.node_coords = arg_node_coords ;
op.buffer = arg_buffer ;
op.node_begin = arg_node_begin ;
size_type count = 0 ;
Kokkos::parallel_reduce( arg_node_count , op , count );
return count ;
}
};
}
//----------------------------------------------------------------------------
#ifdef KOKKOS_HAVE_MPI
namespace TestFEMesh {
// MPI build: exchange node coordinates between ranks and verify that every
// received coordinate matches the locally stored copy.  Rank 0 prints a
// PASSED/FAILED summary (failure == any mismatched node on any rank).
template< typename coordinate_scalar_type ,
unsigned ElemNodeCount ,
class Device >
void verify_parallel(
const HybridFEM::FEMesh< coordinate_scalar_type ,
ElemNodeCount ,
Device > & mesh )
{
typedef HybridFEM::FEMesh< coordinate_scalar_type, ElemNodeCount, Device > femesh_type ;
typedef typename femesh_type::node_coords_type node_coords_type ;
comm::Machine machine = mesh.parallel_data_map.machine ;
// Communicate node coordinates to verify communication and setup.
const size_t chunk_size = 3 ;
Kokkos::AsyncExchange< coordinate_scalar_type, Device, Kokkos::ParallelDataMap >
exchange( mesh.parallel_data_map , chunk_size );
// Send range: [count_interior, count_interior + count_send);
// receive range: [count_owned, count_owned + count_receive).
const size_t send_begin = mesh.parallel_data_map.count_interior ;
const size_t send_count = mesh.parallel_data_map.count_send ;
const size_t recv_begin = mesh.parallel_data_map.count_owned ;
const size_t recv_count = mesh.parallel_data_map.count_receive ;
typedef Kokkos::PackArray< node_coords_type > pack_type ;
pack_type::pack( exchange.buffer(), send_begin, send_count, mesh.node_coords );
exchange.setup();
// Launch local-action device kernels
exchange.send_receive();
// local[0]=owned nodes, local[1]=received nodes, local[2]=mismatches.
unsigned long local[3] ;
local[0] = mesh.parallel_data_map.count_owned ;
local[1] = mesh.parallel_data_map.count_receive ;
local[2] = TestFEMesh::VerifyUnpack< node_coords_type >::unpack( mesh.node_coords, recv_begin, recv_count, exchange.buffer() );
unsigned long global[3] = { 0 , 0 , 0 };
MPI_Allreduce( local , global ,
3 , MPI_UNSIGNED_LONG , MPI_SUM , machine.mpi_comm );
if ( 0 == comm::rank( machine ) ) {
// NOTE(review): "verified_nodes" prints global[1], the summed receive
// count — i.e. nodes that were exchanged and checked, not all nodes.
std::cout << ( global[2] ? "FAILED" : "PASSED" )
<< ": TestFEMesh::verify_parallel "
<< "NP(" << comm::size( machine )
<< ") total_node(" << global[0]
<< ") verified_nodes(" << global[1]
<< ") failed_nodes(" << global[2]
<< ")" << std::endl ;
}
}
} // namespace TestFEMesh
#else /* ! #ifdef KOKKOS_HAVE_MPI */
namespace TestFEMesh {
// Non-MPI build: nothing to exchange, so verification is a no-op.
template< typename coordinate_scalar_type ,
unsigned ElemNodeCount ,
class Device >
void verify_parallel(
const HybridFEM::FEMesh< coordinate_scalar_type ,
ElemNodeCount ,
Device > & )
{}
} // namespace TestFEMesh
#endif /* ! #ifdef KOKKOS_HAVE_MPI */
//----------------------------------------------------------------------------
// Build a partitioned hex-8 box mesh of (nodes_nx x nodes_ny x nodes_nz)
// nodes for this rank and verify its parallel data exchange.
template< class Device >
void test_box_fixture( comm::Machine machine ,
const size_t gang_count ,
const size_t nodes_nx ,
const size_t nodes_ny ,
const size_t nodes_nz )
{
typedef long coordinate_scalar_type ;
typedef FixtureElementHex8 fixture_element_type ;
typedef BoxMeshFixture< coordinate_scalar_type ,
Device ,
fixture_element_type > fixture_type ;
typedef typename fixture_type::FEMeshType mesh_type ;
const size_t proc_count = comm::size( machine );
const size_t proc_local = comm::rank( machine ) ;
// create() takes element counts, hence the (nodes - 1) per axis.
mesh_type mesh =
fixture_type::create( proc_count , proc_local , gang_count ,
nodes_nx - 1 , nodes_ny - 1 , nodes_nz - 1 );
mesh.parallel_data_map.machine = machine ;
TestFEMesh::verify_parallel( mesh );
}
#endif /* #ifndef TESTFEMESHBOXFIXTURE_HPP */

View File

@ -1,172 +0,0 @@
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
#include <iostream>
#include <stdexcept>
#include <limits>
#include <utility>
#include <BoxMeshPartition.hpp>
//----------------------------------------------------------------------------
// Exercise recursive-coordinate-bisection box partitioning for a range of
// process counts: checks load balance (printed when 'print' is set) and,
// for a sample of partitions, cross-checks each partition's receive counts
// against the matching partitions' send maps.  Throws on inconsistency.
void test_box_partition( bool print )
{
const size_t np_max = 10000 ;
const BoxBoundsLinear use_box ;
// Global 100 x 200 x 300 cell box.
BoxType root_box ;
root_box[0][0] = 0 ; root_box[0][1] = 100 ;
root_box[1][0] = 0 ; root_box[1][1] = 200 ;
root_box[2][0] = 0 ; root_box[2][1] = 300 ;
const size_t cell_total =
( root_box[0][1] - root_box[0][0] ) *
( root_box[1][1] - root_box[1][0] ) *
( root_box[2][1] - root_box[2][0] );
// Sweep process counts: 2, 6, 14, ... (np = 2*(np+1)).
for ( size_t np = 2 ; np < np_max ; np = 2 * ( np + 1 ) ) {
std::vector<BoxType> part_boxes( np );
box_partition_rcb( root_box , part_boxes );
// Perfectly balanced partition size, rounded up.
size_t cell_goal = ( cell_total + np - 1 ) / np ;
size_t cell_max = 0 ;
for ( size_t i = 0 ; i < np ; ++i ) {
cell_max = std::max( cell_max , count( part_boxes[i] ) );
}
if ( print ) {
std::cout << std::endl
<< "box_part( " << np
<< " ) max( " << cell_max
<< " ) goal( " << cell_goal
<< " ) ratio( " << double(cell_max) / double(cell_goal)
<< " )" << std::endl ;
}
// Check at most 4 sample partitions, evenly strided across [0, np).
const size_t nsample = std::min(np,(size_t)4);
const size_t stride = ( np + nsample - 1 ) / nsample ;
for ( size_t my_part = 0 ; my_part < np ; my_part += stride ) {
BoxType my_use_box ;
std::vector<size_t> my_use_id_map ;
size_t my_count_interior ;
size_t my_count_owned ;
size_t my_count_uses ;
std::vector<size_t> my_recv_counts ;
std::vector<std::vector<size_t> > my_send_map ;
size_t count_verify = 0 ;
box_partition_maps( root_box , part_boxes ,
use_box , my_part ,
my_use_box , my_use_id_map ,
my_count_interior ,
my_count_owned ,
my_count_uses ,
my_recv_counts ,
my_send_map );
count_verify = my_count_owned ;
if ( print ) {
std::cout << "  my_part(" << my_part << ") layout { "
<< "P" << my_part
<< "(" << my_count_interior
<< "," << ( my_count_owned - my_count_interior )
<< ")" ;
}
// Walk the other partitions this one receives from.
for ( size_t i = 1 ; i < np ; ++i ) {
if ( my_recv_counts[i] ) {
count_verify += my_recv_counts[i] ;
const size_t ip = ( my_part + i ) % np ;
if ( print ) {
std::cout << " P" << ip << "(" << my_recv_counts[i] << ")" ;
}
// Compare recv & send lists
BoxType ip_use_box ;
std::vector<size_t> ip_use_id_map ;
size_t ip_count_interior ;
size_t ip_count_owned ;
size_t ip_count_uses ;
std::vector<size_t> ip_recv_counts ;
std::vector<std::vector<size_t> > ip_send_map ;
box_partition_maps( root_box , part_boxes ,
use_box , ip ,
ip_use_box , ip_use_id_map ,
ip_count_interior ,
ip_count_owned ,
ip_count_uses ,
ip_recv_counts ,
ip_send_map );
// Sent by ip, received by my_part:
const BoxType recv_send = intersect( part_boxes[ip] , my_use_box );
const size_t recv_send_count = count( recv_send );
// 'j' is my_part's offset in ip's circular send ordering.
const size_t j = ( my_part + np - ip ) % np ;
if ( recv_send_count != my_recv_counts[i] ||
recv_send_count != ip_send_map[j].size() ) {
throw std::runtime_error( std::string("bad recv/send map") );
}
}
}
if ( print ) { std::cout << " }" << std::endl ; }
// Owned plus all received cells must equal the total used-cell count.
if ( count_verify != my_count_uses ) {
throw std::runtime_error( std::string("bad partition map") );
}
}
}
}

View File

@ -1,192 +0,0 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#include <Kokkos_Core.hpp>
#include <TestBoxMeshFixture.hpp>
#include <Implicit.hpp>
#include <Nonlinear.hpp>
#include <Explicit.hpp>
#include <SparseLinearSystem.hpp>
#if defined( KOKKOS_HAVE_CUDA )
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {

// Process-wide CudaSparseSingleton instance, constructed on first use
// (function-local static / Meyers singleton).
CudaSparseSingleton & CudaSparseSingleton::singleton()
{ static CudaSparseSingleton s ; return s ; }

}
}
//----------------------------------------------------------------------------
// Report how many Cuda devices this process can detect, tagged with the
// process's communicator rank.
void test_cuda_query( comm::Machine machine )
{
  const size_t rank    = comm::rank( machine );
  const size_t ndevice = Kokkos::Cuda::detect_device_count();

  std::cout << "P" << rank
            << ": Cuda device_count = " << ndevice
            << std::endl ;
}
//----------------------------------------------------------------------------
// Run the box-mesh fixture test on Cuda, choosing the device round-robin
// by communicator rank.
void test_cuda_fixture( comm::Machine machine ,
                        size_t nx , size_t ny , size_t nz )
{
  const size_t rank    = comm::rank( machine );
  const size_t nproc   = comm::size( machine );
  const size_t ndevice = Kokkos::Cuda::detect_device_count();

  // Assign a distinct device only when every process can get one;
  // otherwise all processes share device zero.
  size_t device = 0 ;
  if ( ndevice && ndevice <= nproc ) { device = rank % ndevice ; }

  Kokkos::HostSpace::execution_space::initialize();
  Kokkos::Cuda::initialize( Kokkos::Cuda::SelectDevice( device ) );

  test_box_fixture<Kokkos::Cuda>( machine , 0 /* gang_count */ , nx , ny , nz );

  Kokkos::Cuda::finalize();
  Kokkos::HostSpace::execution_space::finalize();
}
//----------------------------------------------------------------------------
// Sweep the implicit-FEM driver over [elem_count_begin, elem_count_end)
// on Cuda, repeating each size 'count_run' times.
void test_cuda_implicit( comm::Machine machine ,
                         size_t elem_count_begin ,
                         size_t elem_count_end ,
                         size_t count_run )
{
  const size_t rank    = comm::rank( machine );
  const size_t nproc   = comm::size( machine );
  const size_t ndevice = Kokkos::Cuda::detect_device_count();

  // Assign a distinct device only when every process can get one.
  size_t device = 0 ;
  if ( ndevice && ndevice <= nproc ) { device = rank % ndevice ; }

  Kokkos::HostSpace::execution_space::initialize();
  Kokkos::Cuda::initialize( Kokkos::Cuda::SelectDevice( device ) );

  HybridFEM::Implicit::driver<double,Kokkos::Cuda>( "Cuda" , machine , 0 /* gang_count */ , elem_count_begin , elem_count_end , count_run );

  Kokkos::Cuda::finalize();
  Kokkos::HostSpace::execution_space::finalize();
}
//----------------------------------------------------------------------------
// Sweep the explicit-FEM driver over [elem_count_begin, elem_count_end)
// on Cuda, repeating each size 'count_run' times.
void test_cuda_explicit( comm::Machine machine ,
                         size_t elem_count_begin ,
                         size_t elem_count_end ,
                         size_t count_run )
{
  const size_t rank    = comm::rank( machine );
  const size_t nproc   = comm::size( machine );
  const size_t ndevice = Kokkos::Cuda::detect_device_count();

  // Assign a distinct device only when every process can get one.
  size_t device = 0 ;
  if ( ndevice && ndevice <= nproc ) { device = rank % ndevice ; }

  Kokkos::HostSpace::execution_space::initialize();
  Kokkos::Cuda::initialize( Kokkos::Cuda::SelectDevice( device ) );

  Explicit::driver<double,Kokkos::Cuda>( "Cuda" , machine , 0 /* gang_count */ , elem_count_begin , elem_count_end , count_run );

  Kokkos::Cuda::finalize();
  Kokkos::HostSpace::execution_space::finalize();
}
//----------------------------------------------------------------------------
// Sweep the nonlinear-FEM driver with linear hex-8 elements over
// [elem_count_begin, elem_count_end) on Cuda, 'count_run' times per size.
void test_cuda_nonlinear( comm::Machine machine ,
                          size_t elem_count_begin ,
                          size_t elem_count_end ,
                          size_t count_run )
{
  const size_t rank    = comm::rank( machine );
  const size_t nproc   = comm::size( machine );
  const size_t ndevice = Kokkos::Cuda::detect_device_count();

  // Assign a distinct device only when every process can get one.
  size_t device = 0 ;
  if ( ndevice && ndevice <= nproc ) { device = rank % ndevice ; }

  Kokkos::HostSpace::execution_space::initialize();
  Kokkos::Cuda::initialize( Kokkos::Cuda::SelectDevice( device ) );

  HybridFEM::Nonlinear::driver<double,Kokkos::Cuda,FixtureElementHex8>( "Cuda" , machine , 0 /* gang_count */ , elem_count_begin , elem_count_end , count_run );

  Kokkos::Cuda::finalize();
  Kokkos::HostSpace::execution_space::finalize();
}
// Sweep the nonlinear-FEM driver with quadratic hex-27 elements over
// [elem_count_begin, elem_count_end) on Cuda, 'count_run' times per size.
void test_cuda_nonlinear_quadratic( comm::Machine machine ,
                                    size_t elem_count_begin ,
                                    size_t elem_count_end ,
                                    size_t count_run )
{
  const size_t rank    = comm::rank( machine );
  const size_t nproc   = comm::size( machine );
  const size_t ndevice = Kokkos::Cuda::detect_device_count();

  // Assign a distinct device only when every process can get one.
  size_t device = 0 ;
  if ( ndevice && ndevice <= nproc ) { device = rank % ndevice ; }

  Kokkos::HostSpace::execution_space::initialize();
  Kokkos::Cuda::initialize( Kokkos::Cuda::SelectDevice( device ) );

  HybridFEM::Nonlinear::driver<double,Kokkos::Cuda,FixtureElementHex27>( "Cuda" , machine , 0 /* gang_count */ , elem_count_begin , elem_count_end , count_run );

  Kokkos::Cuda::finalize();
  Kokkos::HostSpace::execution_space::finalize();
}
//----------------------------------------------------------------------------
#endif /* #if defined( KOKKOS_HAVE_CUDA ) */

View File

@ -1,137 +0,0 @@
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
// Must be included first on Intel-Phi systems due to
// redefinition of SEEK_SET in <mpi.h>.
#include <ParallelComm.hpp>
#include <iostream>
#include <stdexcept>
#include <limits>
#include <utility>
//----------------------------------------------------------------------------
#include <Kokkos_Core.hpp>
#include <BoxMeshFixture.hpp>
#include <TestBoxMeshFixture.hpp>
#include <Implicit.hpp>
#include <Nonlinear.hpp>
#include <Explicit.hpp>
#include <SparseLinearSystem.hpp>
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
// Run the box-mesh fixture test on the host execution space with
// gang_count * gang_worker_count threads.
void test_host_fixture( comm::Machine machine ,
                        size_t gang_count ,
                        size_t gang_worker_count ,
                        size_t nx , size_t ny , size_t nz )
{
  typedef Kokkos::HostSpace::execution_space host ;

  host::initialize( gang_count * gang_worker_count );
  test_box_fixture<host>( machine , gang_count , nx , ny , nz );
  host::finalize();
}
//----------------------------------------------------------------------------
// Sweep the implicit-FEM driver over [elem_count_begin, elem_count_end)
// on the host execution space, 'count_run' times per size.
void test_host_implicit( comm::Machine machine ,
                         size_t gang_count ,
                         size_t gang_worker_count ,
                         size_t elem_count_begin ,
                         size_t elem_count_end ,
                         size_t count_run )
{
  typedef Kokkos::HostSpace::execution_space host ;

  host::initialize( gang_count * gang_worker_count );
  HybridFEM::Implicit::driver<double,host>( "Threads" , machine , gang_count , elem_count_begin , elem_count_end , count_run );
  host::finalize();
}
//----------------------------------------------------------------------------
// Sweep the explicit-FEM driver over [elem_count_begin, elem_count_end)
// on the host execution space, 'count_run' times per size.
void test_host_explicit( comm::Machine machine ,
                         size_t gang_count ,
                         size_t gang_worker_count ,
                         size_t elem_count_begin ,
                         size_t elem_count_end ,
                         size_t count_run )
{
  typedef Kokkos::HostSpace::execution_space host ;

  host::initialize( gang_count * gang_worker_count );
  Explicit::driver<double,host>( "Threads" , machine , gang_count , elem_count_begin , elem_count_end , count_run );
  host::finalize();
}
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
// Sweep the nonlinear-FEM driver with linear hex-8 elements on the host
// execution space, 'count_run' times per size.
void test_host_nonlinear( comm::Machine machine ,
                          size_t gang_count ,
                          size_t gang_worker_count ,
                          size_t elem_count_begin ,
                          size_t elem_count_end ,
                          size_t count_run )
{
  typedef Kokkos::HostSpace::execution_space host ;

  host::initialize( gang_count * gang_worker_count );
  HybridFEM::Nonlinear::driver<double,host,FixtureElementHex8>( "Threads" , machine , gang_count , elem_count_begin , elem_count_end , count_run );
  host::finalize();
}
// Sweep the nonlinear-FEM driver with quadratic hex-27 elements on the
// host execution space, 'count_run' times per size.
void test_host_nonlinear_quadratic( comm::Machine machine ,
                                    size_t gang_count ,
                                    size_t gang_worker_count ,
                                    size_t elem_count_begin ,
                                    size_t elem_count_end ,
                                    size_t count_run )
{
  typedef Kokkos::HostSpace::execution_space host ;

  host::initialize( gang_count * gang_worker_count );
  HybridFEM::Nonlinear::driver<double,host,FixtureElementHex27>( "Threads" , machine , gang_count , elem_count_begin , elem_count_end , count_run );
  host::finalize();
}
//----------------------------------------------------------------------------

View File

@ -1,348 +0,0 @@
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
// Must be included first on Intel-Phi systems due to
// redefinition of SEEK_SET in <mpi.h>.
#include <ParallelComm.hpp>
#include <string>
#include <sstream>
#include <iostream>
#include <Kokkos_hwloc.hpp>
//----------------------------------------------------------------------------
// Serial box-partition consistency check.
void test_box_partition( bool print );
//----------------------------------------------------------------------------
// Host execution-space test drivers.
void test_host_fixture( comm::Machine machine ,
                        size_t gang_count ,
                        size_t gang_worker_count ,
                        size_t nx , size_t ny , size_t nz );

void test_host_implicit( comm::Machine machine ,
                         size_t gang_count ,
                         size_t gang_worker_count ,
                         size_t elem_count_begin ,
                         size_t elem_count_end ,
                         size_t count_run );

void test_host_explicit( comm::Machine machine ,
                         size_t gang_count ,
                         size_t gang_worker_count ,
                         size_t elem_count_begin ,
                         size_t elem_count_end ,
                         size_t count_run );

void test_host_nonlinear( comm::Machine machine ,
                          size_t gang_count ,
                          size_t gang_worker_count ,
                          size_t elem_count_begin ,
                          size_t elem_count_end ,
                          size_t count_run );

void test_host_nonlinear_quadratic( comm::Machine machine ,
                                    size_t gang_count ,
                                    size_t gang_worker_count ,
                                    size_t elem_count_begin ,
                                    size_t elem_count_end ,
                                    size_t count_run );
//----------------------------------------------------------------------------
// Cuda test drivers.
void test_cuda_query( comm::Machine );

void test_cuda_fixture( comm::Machine machine ,
                        size_t nx , size_t ny , size_t nz );

void test_cuda_implicit( comm::Machine machine ,
                         size_t elem_count_begin ,
                         size_t elem_count_end ,
                         size_t count_run );

void test_cuda_explicit( comm::Machine machine ,
                         size_t elem_count_begin ,
                         size_t elem_count_end ,
                         size_t count_run );

void test_cuda_nonlinear( comm::Machine machine ,
                          size_t elem_count_begin ,
                          size_t elem_count_end ,
                          size_t count_run );

void test_cuda_nonlinear_quadratic( comm::Machine machine ,
                                    size_t elem_count_begin ,
                                    size_t elem_count_end ,
                                    size_t count_run );
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace {
bool run_host( std::istream & input ,
comm::Machine machine ,
const size_t host_gang_count ,
const size_t host_gang_worker_count )
{
bool cmd_error = false ;
std::string which ; input >> which ;
if ( which == std::string("fixture") ) {
size_t nx = 0 , ny = 0 , nz = 0 ;
input >> nx >> ny >> nz ;
test_host_fixture( machine , host_gang_count , host_gang_worker_count , nx , ny , nz );
}
else if ( which == std::string("explicit") ) {
size_t mesh_node_begin = 100 ;
size_t mesh_node_end = 300 ;
size_t run = 1 ;
input >> mesh_node_begin >> mesh_node_end >> run ;
test_host_explicit( machine , host_gang_count , host_gang_worker_count , mesh_node_begin , mesh_node_end , run );
}
else if ( which == std::string("implicit") ) {
size_t mesh_node_begin = 100 ;
size_t mesh_node_end = 300 ;
size_t run = 1 ;
input >> mesh_node_begin >> mesh_node_end >> run ;
test_host_implicit( machine , host_gang_count , host_gang_worker_count , mesh_node_begin , mesh_node_end , run );
}
else if ( which == std::string("nonlinear") ) {
size_t mesh_node_begin = 100 ;
size_t mesh_node_end = 300 ;
size_t run = 1 ;
input >> mesh_node_begin >> mesh_node_end >> run ;
test_host_nonlinear( machine , host_gang_count , host_gang_worker_count , mesh_node_begin , mesh_node_end , run );
}
else if ( which == std::string("nonlinear_quadratic") ) {
size_t mesh_node_begin = 100 ;
size_t mesh_node_end = 300 ;
size_t run = 1 ;
input >> mesh_node_begin >> mesh_node_end >> run ;
test_host_nonlinear_quadratic( machine , host_gang_count , host_gang_worker_count , mesh_node_begin , mesh_node_end , run );
}
else {
cmd_error = true ;
}
return cmd_error ;
}
#if defined( KOKKOS_HAVE_CUDA )
bool run_cuda( std::istream & input , comm::Machine machine )
{
bool cmd_error = false ;
std::string which ; input >> which ;
if ( which == std::string("fixture") ) {
size_t nx = 0 , ny = 0 , nz = 0 ;
input >> nx >> ny >> nz ;
test_cuda_fixture( machine , nx , ny , nz );
}
else if ( which == std::string("explicit") ) {
size_t mesh_node_begin = 100 ;
size_t mesh_node_end = 300 ;
size_t run = 1 ;
input >> mesh_node_begin >> mesh_node_end >> run ;
test_cuda_explicit( machine , mesh_node_begin , mesh_node_end , run );
}
else if ( which == std::string("implicit") ) {
size_t mesh_node_begin = 100 ;
size_t mesh_node_end = 300 ;
size_t run = 1 ;
input >> mesh_node_begin >> mesh_node_end >> run ;
test_cuda_implicit( machine , mesh_node_begin , mesh_node_end , run );
}
else if ( which == std::string("nonlinear") ) {
size_t mesh_node_begin = 100 ;
size_t mesh_node_end = 300 ;
size_t run = 1 ;
input >> mesh_node_begin >> mesh_node_end >> run ;
test_cuda_nonlinear( machine , mesh_node_begin , mesh_node_end , run );
}
else if ( which == std::string("nonlinear_quadratic") ) {
size_t mesh_node_begin = 100 ;
size_t mesh_node_end = 300 ;
size_t run = 1 ;
input >> mesh_node_begin >> mesh_node_end >> run ;
test_cuda_nonlinear_quadratic( machine , mesh_node_begin , mesh_node_end , run );
}
else {
cmd_error = true ;
}
return cmd_error ;
}
#endif
// Parse one command line ('argline') and dispatch to the selected test:
//   query | partition | host ... | host-all ... | host-most ... | cuda ...
// Prints a usage summary on rank zero when the command is unrecognized.
void run( const std::string & argline , comm::Machine machine )
{
  // Hardware topology, used to size the host-all / host-most thread pools.
  const unsigned numa_count = Kokkos::hwloc::get_available_numa_count();
  const unsigned cores_per_numa = Kokkos::hwloc::get_available_cores_per_numa();
  const unsigned threads_per_core = Kokkos::hwloc::get_available_threads_per_core();

  std::istringstream input( argline );

  bool cmd_error = false ;

  std::string which ; input >> which ;

  if ( which == std::string("query") ) {
    std::cout << "P" << comm::rank( machine )
              << ": hwloc { NUMA[" << numa_count << "]"
              << " CORE[" << cores_per_numa << "]"
              << " PU[" << threads_per_core << "] }"
              << std::endl ;
#if defined( KOKKOS_HAVE_CUDA )
    test_cuda_query( machine );
#endif
  }
  else if ( which == std::string("partition") ) {
    // Serial consistency check; only rank zero runs it.
    if ( 0 == comm::rank( machine ) ) {
      test_box_partition( false /* print flag */ );
    }
  }
  else {
    if ( which == std::string("host") ) {
      // Explicit thread counts from the command line.
      size_t host_gang_count = 0 ;
      size_t host_gang_worker_count = 1 ;
      input >> host_gang_count ;
      input >> host_gang_worker_count ;
      cmd_error = run_host( input , machine , host_gang_count , host_gang_worker_count );
    }
    else if ( which == std::string("host-all") ) {
      // Use every available hardware thread.
      size_t host_gang_count = numa_count ;
      size_t host_gang_worker_count = cores_per_numa * threads_per_core ;
      cmd_error = run_host( input , machine , host_gang_count , host_gang_worker_count );
    }
    else if ( which == std::string("host-most") ) {
      // Leave one core per NUMA node unused.
      size_t host_gang_count = numa_count ;
      size_t host_gang_worker_count = ( cores_per_numa - 1 ) * threads_per_core ;
      cmd_error = run_host( input , machine , host_gang_count , host_gang_worker_count );
    }
#if defined( KOKKOS_HAVE_CUDA )
    else if ( which == std::string("cuda") ) {
      cmd_error = run_cuda( input , machine );
    }
#endif
    else {
      cmd_error = true ;
    }
  }

  if ( cmd_error && 0 == comm::rank( machine ) ) {
    std::cout << "Expecting command line with" << std::endl
              << " query" << std::endl
              << " partition" << std::endl
              << " host NumNumaNode NumThreadPerNode <test>" << std::endl
              << " host-all <test>" << std::endl
              << " host-most <test>" << std::endl
              << " cuda <test>" << std::endl
              << "where <test> is" << std::endl
              << " fixture NumElemX NumElemY NumElemZ" << std::endl
              << " implicit NumElemBegin NumElemEnd NumRun" << std::endl
              << " explicit NumElemBegin NumElemEnd NumRun" << std::endl
              << " nonlinear NumElemBegin NumElemEnd NumRun" << std::endl
              << " nonlinear_quadratic NumElemBegin NumElemEnd NumRun" << std::endl ;
  }
}
} // namespace
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
// Initialize the communication machine, broadcast the command line,
// run the selected test, and report (but do not propagate) exceptions.
int main( int argc , char ** argv )
{
  comm::Machine machine = comm::Machine::init( & argc , & argv );

  const unsigned rank = comm::rank( machine );

  const std::string argline = comm::command_line( machine , argc , argv );

  try {
    run( argline , machine );
  }
  catch ( const std::exception & x ) {
    std::cerr << "P" << rank << " throw: " << x.what() << std::endl ;
  }
  catch ( ... ) {
    std::cerr << "P" << rank << " throw: unknown exception" << std::endl ;
  }

  comm::Machine::finalize();

  return 0 ;
}