Updating kokkos lib

git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@14918 f3b2605a-c512-4ea7-a41b-209d697bcdaa
2016-05-02 22:06:50 +00:00
parent c5d0c55bee
commit 0a1b765248
411 changed files with 0 additions and 133424 deletions
--- a/lib/kokkos/example/CMakeLists.txt
+++ b/lib/kokkos/example/CMakeLists.txt
@ -1,20 +0,0 @@
-
-
-# TriBITS subpackage that registers the Kokkos example directories.
-# Subpackage name must match what appears in kokkos/cmake/Dependencies.cmake
-#
-tribits_subpackage(Example)
-
-# Example directories that are registered unconditionally, in this order.
-foreach(example_dir
-    query_device
-    fixture
-    feint
-    fenl
-    multi_fem
-    md_skeleton
-    global_2_local_ids
-    grow_array
-    sort_array)
-  tribits_add_example_directories(${example_dir})
-endforeach()
-
-# The tutorial directory is registered only when Kokkos_ENABLE_Cuda is false.
-if(NOT Kokkos_ENABLE_Cuda)
-  tribits_add_example_directories(tutorial)
-endif()
-
-tribits_subpackage_postprocess()
-
--- a/lib/kokkos/example/README
+++ b/lib/kokkos/example/README
@ -1,16 +0,0 @@
-This directory contains example application proxies that use different
-parts of Kokkos.  If you are looking for the FENL ("finite element
-nonlinear" solve) example, it has moved into the LinAlg subpackage of
-Tpetra.
-
-MANIFEST:
-
-  - common:  Header files used by different examples
-  - feint:   Unstructured finite-element method
-  - fixture: Some other finite-element method example
-  - global_2_local_ids: Example of global-to-local index lookup
-  - grow_array:   Parallel dynamic memory allocation
-  - md_skeleton:  Molecular dynamics
-  - query_device: Kokkos' HWLOC wrapper for querying device topology
-  - sort_array:   Parallel sort
-  - tutorial:     Kokkos tutorial (START HERE)
--- a/lib/kokkos/example/cmake/Dependencies.cmake
+++ b/lib/kokkos/example/cmake/Dependencies.cmake
@ -1,4 +0,0 @@
-# Package and TPL dependencies declared for the example subpackage.
-tribits_package_define_dependencies(
-  LIB_REQUIRED_DEP_PACKAGES
-    KokkosCore
-    KokkosContainers
-    KokkosAlgorithms
-  TEST_OPTIONAL_DEP_TPLS
-    CUSPARSE
-    MKL
-)
--- a/lib/kokkos/example/common/VectorImport.hpp
+++ b/lib/kokkos/example/common/VectorImport.hpp
@ -1,294 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-// 
-//                        Kokkos v. 2.0
-//              Copyright (2014) Sandia Corporation
-// 
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
-// the U.S. Government retains certain rights in this software.
-// 
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
-// ************************************************************************
-//@HEADER
-*/
-
-#ifndef KOKKOS_VECTORIMPORT_HPP
-#define KOKKOS_VECTORIMPORT_HPP
-
-#include <utility>
-#include <limits>
-#include <iostream>
-#include <sstream>
-#include <stdexcept>
-#include <vector>
-
-#include <Kokkos_Core.hpp>
-
-#include <WrapMPI.hpp>
-
-namespace Kokkos {
-namespace Example {
-// Forward declaration; one of the two definitions below is selected by KOKKOS_HAVE_MPI.
-template< class CommMessageType , class CommIdentType , class VectorType >
-struct VectorImport ;
-
-} // namespace Example
-} // namespace Kokkos
-
-#if ! defined( KOKKOS_HAVE_MPI )
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-namespace Kokkos {
-namespace Example {
-
-/** \brief  Stand-in for the vector import used when KOKKOS_HAVE_MPI
- *          is not defined.
- *
- *  Only the communicator handle and the two counts are stored.
- *  The message and identifier arguments are accepted and ignored,
- *  and applying the functor to a vector does nothing.
- */
-template< class CommMessageType , class CommIdentType , class VectorType >
-struct VectorImport {
-
-  const MPI_Comm comm ;          ///< Communicator handle given to the constructor
-  const unsigned count_owned ;   ///< Owned count given to the constructor
-  const unsigned count_receive ; ///< Receive count given to the constructor
-
-  VectorImport( MPI_Comm communicator ,
-                const CommMessageType & /* receive messages : unused */ ,
-                const CommMessageType & /* send messages    : unused */ ,
-                const CommIdentType   & /* send identifiers : unused */ ,
-                const unsigned owned ,
-                const unsigned receive )
-    : comm( communicator )
-    , count_owned( owned )
-    , count_receive( receive )
-  {
-  }
-
-  /// Nothing is exchanged in this variant.
-  void operator()( const VectorType & ) const
-  {
-  }
-};
-
-} // namespace Example
-} // namespace Kokkos
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-#else /* defined( KOKKOS_HAVE_MPI ) */
-
-namespace Kokkos {
-namespace Example {
-
-/** \brief  MPI implementation of the vector import operation.
- *
- *  The constructor stores the receive and send message tables and the
- *  identifiers of the entries to send, and allocates the send buffers
- *  (plus a host receive buffer when receives cannot be done in place).
- *  Applying the functor to a vector posts the receives, packs and sends
- *  the requested entries, waits for and verifies every receive, and
- *  finally copies the received data into the vector if it was staged
- *  in the host receive buffer.
- *
- *  Declared with the 'struct' class-key so that it agrees with the
- *  forward declaration of VectorImport.
- */
-template< class CommMessageType , class CommIdentType , class VectorType >
-struct VectorImport {
-private:
-
-  // rank == 1 or array_layout == LayoutRight
-  enum { OK = Kokkos::Impl::StaticAssert<
-           ( VectorType::rank == 1 ) ||
-           Kokkos::Impl::is_same< typename VectorType::array_layout , Kokkos::LayoutRight >::value
-         >::value };
-
-  typedef typename VectorType::HostMirror HostVectorType ;
-
-  // Receive straight into the vector when the vector and its host mirror
-  // share the same memory space; otherwise stage in 'host_recv_buffer'.
-  enum { ReceiveInPlace =
-    Kokkos::Impl::is_same< typename VectorType::memory_space ,
-                           typename HostVectorType::memory_space >::value };
-
-  const CommMessageType  recv_msg ;         ///< (i,0) source rank , (i,1) entry count
-  const CommMessageType  send_msg ;         ///< (i,0) destination rank , (i,1) entry count
-  const CommIdentType    send_nodeid ;      ///< Indices of the entries packed for sending
-  VectorType             send_buffer ;      ///< Packed entries to send
-  HostVectorType         host_send_buffer ; ///< Host mirror of 'send_buffer'
-  HostVectorType         host_recv_buffer ; ///< Receive staging, allocated if ! ReceiveInPlace
-
-public:
-
-  const MPI_Comm         comm ;
-  const unsigned         count_owned ;
-  const unsigned         count_receive ;
-
-  /** \brief  Functor performing buffer(i) = source( index(i) ).
-   *          Constructing a Pack runs the copy over all of 'index'
-   *          and fences the execution space.
-   */
-  struct Pack {
-    typedef typename VectorType::execution_space execution_space ;
-    const CommIdentType  index ;
-    const VectorType     source ;
-    const VectorType     buffer ;
-
-    KOKKOS_INLINE_FUNCTION
-    void operator()( const unsigned i ) const
-      { buffer( i ) = source( index(i) ); }
-
-    Pack( const CommIdentType  & arg_index ,
-          const VectorType     & arg_source ,
-          const VectorType     & arg_buffer )
-      : index( arg_index )
-      , source( arg_source )
-      , buffer( arg_buffer )
-    {
-      Kokkos::parallel_for( index.dimension_0() , *this );
-      execution_space::fence();
-    }
-  };
-
-  /** \brief  Store the communication plan and allocate the buffers.
-   *
-   *  \param arg_comm           Communicator used for every message.
-   *  \param arg_recv_msg       Receive table: (i,0) rank , (i,1) count.
-   *  \param arg_send_msg       Send table:    (i,0) rank , (i,1) count.
-   *  \param arg_send_nodeid    Indices of the entries to pack and send.
-   *  \param arg_count_owned    Offset of the first received entry.
-   *  \param arg_count_receive  Number of received entries.
-   */
-  VectorImport( MPI_Comm arg_comm ,
-                const CommMessageType & arg_recv_msg ,
-                const CommMessageType & arg_send_msg ,
-                const CommIdentType   & arg_send_nodeid ,
-                const unsigned arg_count_owned ,
-                const unsigned arg_count_receive )
-    : recv_msg( arg_recv_msg )
-    , send_msg( arg_send_msg )
-    , send_nodeid( arg_send_nodeid )
-    , send_buffer()
-    , host_send_buffer()
-    , host_recv_buffer()
-    , comm( arg_comm )
-    , count_owned( arg_count_owned )
-    , count_receive( arg_count_receive )
-    {
-      // NOTE(review): both buffers are sized by entry counts only, while
-      // operator() transfers count * v.dimension_1() values per message;
-      // confirm this is sufficient for vectors with dimension_1() > 1.
-      if ( ! ReceiveInPlace ) {
-        host_recv_buffer = HostVectorType("recv_buffer",count_receive);
-      }
-
-      // Total number of entries sent to all destinations.
-      unsigned send_count = 0 ;
-      for ( unsigned i = 0 ; i < send_msg.dimension_0() ; ++i ) { send_count += send_msg(i,1); }
-      send_buffer      = VectorType("send_buffer",send_count);
-      host_send_buffer = Kokkos::create_mirror_view( send_buffer );
-    }
-
-  /** \brief  Import into 'v' the entries [ count_owned , count_owned + count_receive ).
-   *
-   *  Throws std::runtime_error if a received message does not come from
-   *  the expected rank or does not have the expected size.
-   */
-  inline
-  void operator()( const VectorType & v ) const
-  {
-    typedef typename VectorType::value_type  scalar_type ;
-
-    const int mpi_tag = 42 ;
-    const unsigned chunk = v.dimension_1(); // values per entry
-
-    // Subvector for receives
-    const std::pair<unsigned,unsigned> recv_range( count_owned , count_owned + count_receive );
-    const VectorType recv_vector = Kokkos::subview( v , recv_range );
-
-    std::vector< MPI_Request > recv_request( recv_msg.dimension_0() , MPI_REQUEST_NULL );
-
-    { // Post receives
-      scalar_type * ptr =
-        ReceiveInPlace ? recv_vector.ptr_on_device() : host_recv_buffer.ptr_on_device();
-
-      for ( size_t i = 0 ; i < recv_msg.dimension_0() ; ++i ) {
-        const int proc  = recv_msg(i,0);
-        const int count = recv_msg(i,1) * chunk ;
-
-        MPI_Irecv( ptr , count * sizeof(scalar_type) , MPI_BYTE ,
-                   proc , mpi_tag , comm , & recv_request[i] );
-
-        // Messages are laid out back to back in table order.
-        ptr += count ;
-      }
-    }
-
-    // Every rank has posted its receives before any synchronous send starts.
-    MPI_Barrier( comm );
-
-    { // Pack and send 
-      const Pack pack( send_nodeid , v , send_buffer );
-
-      Kokkos::deep_copy( host_send_buffer , send_buffer );
-
-      scalar_type * ptr = host_send_buffer.ptr_on_device();
-
-      for ( size_t i = 0 ; i < send_msg.dimension_0() ; ++i ) {
-        const int proc  = send_msg(i,0);
-        const int count = send_msg(i,1) * chunk ;
-
-        // MPI_Ssend blocks until
-        // (1) a receive is matched for the message and
-        // (2) the send buffer can be re-used.
-        //
-        // It is suggested that MPI_Ssend will have the best performance:
-        // http://www.mcs.anl.gov/research/projects/mpi/sendmode.html .
-
-        MPI_Ssend( ptr ,
-                   count * sizeof(scalar_type) , MPI_BYTE ,
-                   proc , mpi_tag , comm );
-
-        ptr += count ;
-      }
-    }
-
-    // Wait for receives and verify:
-
-    for ( size_t i = 0 ; i < recv_msg.dimension_0() ; ++i ) {
-      MPI_Status recv_status ;
-      int recv_which = 0 ;
-      int recv_size  = 0 ;
-
-      MPI_Waitany( recv_msg.dimension_0() , & recv_request[0] , & recv_which , & recv_status );
-
-      const int recv_proc = recv_status.MPI_SOURCE ;
-
-      MPI_Get_count( & recv_status , MPI_BYTE , & recv_size );
-
-      // Verify message properly received:
-
-      const int  expected_proc = recv_msg(recv_which,0);
-      const int  expected_size = recv_msg(recv_which,1) * chunk * sizeof(scalar_type);
-
-      if ( ( expected_proc != recv_proc ) ||
-           ( expected_size != recv_size ) ) {
-
-        int local_rank  = 0 ;
-
-        MPI_Comm_rank( comm , & local_rank );
-
-        std::ostringstream msg ;
-        msg << "VectorImport error:"
-            << " P" << local_rank
-            << " received from P" << recv_proc
-            << " size "     << recv_size
-            << " expected " << expected_size
-            << " from P"    << expected_proc ;
-        throw std::runtime_error( msg.str() );
-      }
-    }
-
-    // Copy received data to device memory.
-
-    if ( ! ReceiveInPlace ) { Kokkos::deep_copy( recv_vector , host_recv_buffer ); }
-  }
-};
-
-} // namespace Example
-} // namespace Kokkos
-
-#endif
-
-//----------------------------------------------------------------------------
-
-#endif /* #ifndef KOKKOS_VECTORIMPORT_HPP */
-
-
--- a/lib/kokkos/example/common/WrapMPI.hpp
+++ b/lib/kokkos/example/common/WrapMPI.hpp
@ -1,103 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-// 
-//                        Kokkos v. 2.0
-//              Copyright (2014) Sandia Corporation
-// 
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
-// the U.S. Government retains certain rights in this software.
-// 
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
-// ************************************************************************
-//@HEADER
-*/
-
-#ifndef KOKKOS_EXAMPLE_WRAP_MPI
-#define KOKKOS_EXAMPLE_WRAP_MPI
-
-#include <Kokkos_Macros.hpp>
-#include <string>
-
-#if defined( KOKKOS_HAVE_MPI )
-
-#include <mpi.h>
-
-namespace Kokkos {
-namespace Example {
-
-/// Reduce 'value' over 'comm' with MPI_SUM and return the result.
-inline
-double all_reduce( double value , MPI_Comm comm )
-{
-  double reduced = value ;
-  MPI_Allreduce( & value , & reduced , 1 , MPI_DOUBLE , MPI_SUM , comm );
-  return reduced ;
-}
-
-/// Reduce 'value' over 'comm' with MPI_MAX and return the result.
-inline
-double all_reduce_max( double value , MPI_Comm comm )
-{
-  double reduced = value ;
-  MPI_Allreduce( & value , & reduced , 1 , MPI_DOUBLE , MPI_MAX , comm );
-  return reduced ;
-}
-
-} // namespace Example
-} // namespace Kokkos
-
-#elif ! defined( KOKKOS_HAVE_MPI )
-
-/* Wrap the MPI_Comm type and heavily used MPI functions
- * to reduce the number of '#if defined( KOKKOS_HAVE_MPI )'
- * blocks which have to be sprinkled throughout the examples.
- */
-
-/// Placeholder communicator type.
-typedef int MPI_Comm ;
-
-/// Reports a communicator size of one; always returns 0.
-inline int MPI_Comm_size( MPI_Comm , int * size )
-{
-  *size = 1 ;
-  return 0 ;
-}
-
-/// Reports rank zero; always returns 0.
-inline int MPI_Comm_rank( MPI_Comm , int * rank )
-{
-  *rank = 0 ;
-  return 0 ;
-}
-
-/// Does nothing; always returns 0.
-inline int MPI_Barrier( MPI_Comm )
-{
-  return 0 ;
-}
-
-namespace Kokkos {
-namespace Example {
-
-/// Returns 'value' unchanged.
-inline
-double all_reduce( double value , MPI_Comm )
-{
-  return value ;
-}
-
-/// Returns 'value' unchanged.
-inline
-double all_reduce_max( double value , MPI_Comm )
-{
-  return value ;
-}
-
-} // namespace Example
-} // namespace Kokkos
-
-#endif /* ! defined( KOKKOS_HAVE_MPI ) */
-#endif /* #ifndef KOKKOS_EXAMPLE_WRAP_MPI */
-
--- a/lib/kokkos/example/feint/CMakeLists.txt
+++ b/lib/kokkos/example/feint/CMakeLists.txt
@ -1,18 +0,0 @@
-
-# Header search path: the current binary and source directories, then
-# the sibling 'common' and 'fixture' example directories.
-include_directories(
-  ${CMAKE_CURRENT_BINARY_DIR}
-  ${CMAKE_CURRENT_SOURCE_DIR}
-  ${CMAKE_CURRENT_SOURCE_DIR}/../common
-  ${CMAKE_CURRENT_SOURCE_DIR}/../fixture
-)
-
-# Every .cpp file matched in this directory, plus BoxElemPart.cpp
-# from the fixture example.
-file(GLOB SOURCES *.cpp)
-list(APPEND SOURCES ../fixture/BoxElemPart.cpp)
-
-tribits_add_executable(
-  feint
-  SOURCES ${SOURCES}
-  COMM serial mpi
-)
-
--- a/lib/kokkos/example/feint/ElemFunctor.hpp
+++ b/lib/kokkos/example/feint/ElemFunctor.hpp
@ -1,489 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-// 
-//                        Kokkos v. 2.0
-//              Copyright (2014) Sandia Corporation
-// 
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
-// the U.S. Government retains certain rights in this software.
-// 
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
-// ************************************************************************
-//@HEADER
-*/
-
-#ifndef KOKKOS_EXAMPLE_FEINT_FUNCTORS_HPP
-#define KOKKOS_EXAMPLE_FEINT_FUNCTORS_HPP
-
-#include <stdio.h>
-#include <Kokkos_Core.hpp>
-#include <BoxElemFixture.hpp>
-
-namespace Kokkos {
-namespace Example {
-
-/** \brief  Numerically integrate a function on a finite element mesh and
- *          project the integrated values to nodes.
- */
-template< class FixtureType ,
-          class FunctionType ,
-          bool PerformScatterAddWithAtomic >
-struct FiniteElementIntegration ;
-
-// Specialized for an 'Example::BoxElemFixture' finite element mesh
-template< class Device , BoxElemPart::ElemOrder ElemOrder , class GridMap ,
-          class FunctionType ,
-          bool PerformScatterAddWithAtomic >
-struct FiniteElementIntegration<
-  Kokkos::Example::BoxElemFixture< Device , ElemOrder , GridMap > ,
-  FunctionType ,
-  PerformScatterAddWithAtomic >
-{
-  // Element mesh types:
-  // The fixture type carries this specialization's GridMap argument so that
-  // it names exactly the fixture type the specialization was selected for.
-  typedef Kokkos::Example::BoxElemFixture< Device , ElemOrder , GridMap >
-    BoxFixtureType ;
-
-  typedef Kokkos::Example::HexElement_Data< BoxFixtureType::ElemNode >
-    HexElemDataType ;
-
-  enum { ElemNodeCount    = HexElemDataType::element_node_count  };
-  enum { IntegrationCount = HexElemDataType::integration_count };
-  enum { ValueCount       = FunctionType::value_count };
-
-  // Dictionary of view types:
-  typedef View<int*,                              Device> ElemErrorType ;
-  typedef View<double*[ElemNodeCount][ValueCount],Device> ElemValueType ;
-  typedef View<double*[ValueCount],               Device> NodeValueType ;
-
-  // Data members for this Functor:
-  const HexElemDataType  m_hex_elem_data ; ///< Master element
-  const BoxFixtureType   m_box_fixture ;   ///< Unstructured mesh data
-  const FunctionType     m_function ;      ///< Function to integrate
-  const ElemErrorType    m_elem_error ;    ///< Flags for element errors
-  const ElemValueType    m_elem_integral ; ///< Per-element quantities
-  const NodeValueType    m_node_lumped ;   ///< Quantities lumped to nodes
-
-  //----------------------------------------
-
-  FiniteElementIntegration(
-    const BoxFixtureType & box_fixture ,
-    const FunctionType   & function )
-    : m_hex_elem_data()
-    , m_box_fixture( box_fixture ) // Shallow copy of the mesh fixture
-    , m_function( function )
-    , m_elem_error(    "elem_error"    , box_fixture.elem_count() )
-    , m_elem_integral( "elem_integral" , box_fixture.elem_count() )
-    , m_node_lumped(   "node_lumped"   , box_fixture.node_count() )
-    {}
-
-  //----------------------------------------
-  // Device for parallel dispatch.
-  typedef typename Device::execution_space execution_space;
-
-  // Value type for global parallel reduction.
-  struct value_type {
-    double value[ ValueCount ]; ///< Integrated quantities
-    int    error ;              ///< Element inversion flag
-  };
-
-  //----------------------------------------
-  // Transform element interpolation function gradients ('grad' -> 'dpsi') and
-  // compute determinant of spatial jacobian, which is the return value.
-  KOKKOS_INLINE_FUNCTION
-  float transform_gradients(
-    const float  grad[][  ElemNodeCount ] , // Gradient of bases master element
-    const double coord[][ ElemNodeCount ] , // Nodal coordinates, indexed [dimension][node]
-          float  dpsi[][  ElemNodeCount ] ) const // Output, indexed [dimension][node]
-  {
-    enum { TensorDim = 9 };
-    enum { j11 = 0 , j12 = 1 , j13 = 2 ,
-           j21 = 3 , j22 = 4 , j23 = 5 ,
-           j31 = 6 , j32 = 7 , j33 = 8 };
-
-    // Temporary for jacobian accumulation is double for summation accuracy.
-    double J[ TensorDim ] = { 0, 0, 0,  0, 0, 0,  0, 0, 0 };
-
-    for( int i = 0; i < ElemNodeCount ; ++i ) {
-      J[j11] += grad[0][i] * coord[0][i] ;
-      J[j12] += grad[0][i] * coord[1][i] ;
-      J[j13] += grad[0][i] * coord[2][i] ;
-
-      J[j21] += grad[1][i] * coord[0][i] ;
-      J[j22] += grad[1][i] * coord[1][i] ;
-      J[j23] += grad[1][i] * coord[2][i] ;
-
-      J[j31] += grad[2][i] * coord[0][i] ;
-      J[j32] += grad[2][i] * coord[1][i] ;
-      J[j33] += grad[2][i] * coord[2][i] ;
-    }
-
-    // Cofactor products of J, computed as double and stored as float; scaled by 1/detJ below to give the inverse jacobian.
-    float invJ[ TensorDim ] = {
-      float( J[j22] * J[j33] - J[j23] * J[j32] ) ,
-      float( J[j13] * J[j32] - J[j12] * J[j33] ) ,
-      float( J[j12] * J[j23] - J[j13] * J[j22] ) ,
-
-      float( J[j23] * J[j31] - J[j21] * J[j33] ) ,
-      float( J[j11] * J[j33] - J[j13] * J[j31] ) ,
-      float( J[j13] * J[j21] - J[j11] * J[j23] ) ,
-
-      float( J[j21] * J[j32] - J[j22] * J[j31] ) ,
-      float( J[j12] * J[j31] - J[j11] * J[j32] ) ,
-      float( J[j11] * J[j22] - J[j12] * J[j21] ) };
-    // Determinant: first column of J times the first three entries of invJ.
-    const float detJ = J[j11] * invJ[j11] +
-                       J[j21] * invJ[j12] +
-                       J[j31] * invJ[j13] ;
-
-    {
-      const float detJinv = 1.0 / detJ ; // NOTE(review): detJ == 0 is not guarded here
-      for ( int i = 0 ; i < TensorDim ; ++i ) { invJ[i] *= detJinv ; }
-    }
-
-    // Transform gradients:
-    for ( int i = 0; i < ElemNodeCount ; ++i ) {
-      dpsi[0][i] = grad[0][i] * invJ[j11] +
-                   grad[1][i] * invJ[j12] +
-                   grad[2][i] * invJ[j13];
-      dpsi[1][i] = grad[0][i] * invJ[j21] +
-                   grad[1][i] * invJ[j22] +
-                   grad[2][i] * invJ[j23];
-      dpsi[2][i] = grad[0][i] * invJ[j31] +
-                   grad[1][i] * invJ[j32] +
-                   grad[2][i] * invJ[j33];
-    }
-
-    return detJ ;
-  }
-
-  // Functor's function called for each element 'ielem' in the mesh
-  // to numerically integrate the function and add element quantities
-  // to the global integral accumulated in 'update'.
-  KOKKOS_INLINE_FUNCTION
-  void operator()( const int ielem , value_type & update ) const
-  {
-    // Local temporaries for gathering nodal data.
-    double node_coord[3][ ElemNodeCount ];
-
-    int inode[ ElemNodeCount ] ;
-
-    // Gather indices of element's node from global memory to local memory.
-    for ( int i = 0 ; i < ElemNodeCount ; ++i ) {
-      inode[i] = m_box_fixture.elem_node( ielem , i );
-    }
-
-    // Gather coordinates of element's nodes from global memory to local memory.
-    for ( int i = 0 ; i < ElemNodeCount ; ++i ) {
-      node_coord[0][i] = m_box_fixture.node_coord( inode[i] , 0 );
-      node_coord[1][i] = m_box_fixture.node_coord( inode[i] , 1 );
-      node_coord[2][i] = m_box_fixture.node_coord( inode[i] , 2 );
-    }
-
-    // Local temporary to accumulate numerical integration
-    // of vector valued function.
-    double accum[ ValueCount ];
-
-    for ( int j = 0 ; j < ValueCount ; ++j ) { accum[j] = 0 ; }
-
-    int error = 0 ;
-
-    // Numerical integration loop for this element:
-    for ( int k = 0 ; k < IntegrationCount ; ++k ) {
-
-      // Integration point in space as interpolated from nodal coordinates:
-      double point[3] = { 0 , 0 , 0 };
-      for ( int i = 0 ; i < ElemNodeCount ; ++i ) {
-        point[0] += node_coord[0][i] * m_hex_elem_data.values[k][i] ;
-        point[1] += node_coord[1][i] * m_hex_elem_data.values[k][i] ;
-        point[2] += node_coord[2][i] * m_hex_elem_data.values[k][i] ;
-      }
-
-      // Example function vector value at cubature point:
-      double val_at_pt[ ValueCount ];
-      m_function( point , val_at_pt );
-
-      // Temporary array for transformed element basis functions' gradient.
-      // Not used in this example, but computed anyway by the more general
-      // deformation function.
-      float dpsi[3][ ElemNodeCount ];
-
-      // Compute deformation jacobian, transform basis function gradient,
-      // and return determinant of deformation jacobian.
-      float detJ = transform_gradients( m_hex_elem_data.gradients[k] ,
-                                        node_coord , dpsi );
-
-      // Check for inverted spatial jacobian; such a point then gets zero weight.
-      if ( detJ <= 0 ) { error = 1 ; detJ = 0 ; }
-
-      // Integration weight.
-      const float w = m_hex_elem_data.weights[k] * detJ ;
-
-      // Cubature of function.
-      for ( int j = 0 ; j < ValueCount ; ++j ) {
-        accum[j] += val_at_pt[j] * w ;
-      }
-    }
-
-    m_elem_error(ielem) = error ;
-
-
-    // Element contribution to global integral:
-
-    if ( error ) { update.error = 1 ; }
-
-    for ( int j = 0 ; j < ValueCount ; ++j ) { update.value[j] += accum[j] ; }
-
-    // Element-node quantity for lumping to nodes:
-    for ( int i = 0 ; i < ElemNodeCount ; ++i ) {
-      for ( int j = 0 ; j < ValueCount ; ++j ) {
-        // Save element's integral apportionment (equal share per node) to global memory
-        m_elem_integral( ielem , i , j ) = accum[j] / ElemNodeCount ;
-      }
-    }
-
-    if ( PerformScatterAddWithAtomic ) {
-      // Option to immediately scatter-add the integrated quantities to nodes.
-      // A plain update would be a race condition, as two or more threads
-      // could attempt concurrent update of nodal values.  The atomic_fetch_add
-      // (+=) function guarantees that the summation will occur correctly;
-      // however, there can be no guarantee for the order of summation.
-      // Due to non-associativity of floating point arithmetic the result
-      // is non-deterministic within bounds of floating point round-off.
-
-      for ( int i = 0 ; i < ElemNodeCount ; ++i ) {
-        for ( int j = 0 ; j < ValueCount ; ++j ) {
-          Kokkos::atomic_fetch_add( & m_node_lumped( inode[i] , j ) ,
-                                    m_elem_integral( ielem , i , j ) );
-        }
-      }
-    }
-  }
-  //--------------------------------------------------------------------------
-
-  // Reset a reduction value: every integrated quantity becomes zero
-  // and the error flag is cleared.
-  KOKKOS_INLINE_FUNCTION
-  void init( value_type & update ) const
-  {
-    for ( int k = 0 ; k < ValueCount ; ++k ) {
-      update.value[k] = 0 ;
-    }
-
-    update.error = 0 ;
-  }
-
-  // Merge 'input' into 'update': the integrated quantities are summed
-  // and an error flagged in 'input' is carried over to 'update'.
-  KOKKOS_INLINE_FUNCTION
-  void join( volatile       value_type & update ,
-             volatile const value_type & input ) const
-  {
-    for ( int k = 0 ; k < ValueCount ; ++k ) {
-      update.value[k] += input.value[k] ;
-    }
-
-    if ( input.error ) {
-      update.error = 1 ;
-    }
-  }
-};
-
-} /* namespace Example */
-} /* namespace Kokkos */
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-namespace Kokkos {
-namespace Example {
-
-template< class ViewElemNode ,
-          class ViewNodeScan ,
-          class ViewNodeElem >
-void map_node_to_elem( const ViewElemNode & elem_node , // input:  element -> node
-                       const ViewNodeScan & node_scan , // output: per-node offsets
-                       const ViewNodeElem & node_elem ); // output: node -> element ; defined later in this header
-
-/** \brief  Reduction functor that sums the per-element-node quantities
- *          of all elements attached to a node into that node's value
- *          and accumulates the total over all nodes.
- *
- *  When AlreadyUsedAtomic is true the nodal values are only read and
- *  summed, and no node->element connectivity is built.  Otherwise each
- *  node gathers its own contributions, which needs no atomic updates.
- */
-template< class ViewNodeValue ,
-          class ViewElemValue ,
-          bool  AlreadyUsedAtomic >
-struct LumpElemToNode {
-
-  typedef typename ViewElemValue::execution_space execution_space ;
-
-  // In this example we know that the ViewElemValue
-  // array specification is < double*[nNode][nValue] >
-
-#if defined( KOKKOS_USING_EXPERIMENTAL_VIEW )
-  enum { value_count = ViewElemValue::dimension::N2 };
-#else
-  enum { value_count = ViewElemValue::shape_type::N2 };
-#endif
-
-  ViewNodeValue                 m_node_value ; ///< Integrated values at nodes
-  ViewElemValue                 m_elem_value ; ///< Values apportioned to nodes
-  View<int*,   execution_space> m_node_scan ;  ///< Offsets for nodes->element
-  View<int*[2],execution_space> m_node_elem ;  ///< Node->element connectivity
-
-  // The connectivity views are given zero length and left unfilled
-  // when atomic updates were already used for the nodes.
-  template< class ViewElemNode >
-  LumpElemToNode( const ViewNodeValue & node_value ,
-                  const ViewElemValue & elem_value ,
-                  const ViewElemNode  & elem_node )
-    : m_node_value( node_value )
-    , m_elem_value( elem_value )
-    , m_node_scan( "node_scan" ,
-                   AlreadyUsedAtomic ? 0 : node_value.dimension_0() + 1 )
-    , m_node_elem( "node_elem" ,
-                   AlreadyUsedAtomic ? 0 : elem_node.dimension_0() *
-                                           elem_node.dimension_1() )
-  {
-    if ( ! AlreadyUsedAtomic ) {
-      map_node_to_elem( elem_node , m_node_scan , m_node_elem );
-    }
-  }
-
-  //----------------------------------------
-
-  // Value type of the reduction: one sum per value component.
-  struct value_type { double value[ value_count ]; };
-
-  KOKKOS_INLINE_FUNCTION
-  void operator()( const int inode , value_type & update ) const
-  {
-    if ( AlreadyUsedAtomic ) {
-      // The nodal quantity was already summed with atomic updates:
-      // only add it to the global value.
-      for ( int k = 0 ; k < value_count ; ++k ) {
-        update.value[k] += m_node_value( inode , k );
-      }
-      return ;
-    }
-
-    // Node-local sum of the element quantities.
-    value_type sum ;
-    for ( int k = 0 ; k < value_count ; ++k ) {
-      sum.value[k] = 0 ;
-    }
-
-    // Entries [first,last) of the connectivity belong to this node.
-    const int first = m_node_scan(inode);
-    const int last  = m_node_scan(inode+1);
-
-    for ( int entry = first ; entry < last ; ++entry ) {
-      // Local node #ielem_node of element #ielem is this node.
-      const int ielem      = m_node_elem(entry,0);
-      const int ielem_node = m_node_elem(entry,1);
-
-      for ( int k = 0 ; k < value_count ; ++k ) {
-        sum.value[k] += m_elem_value( ielem , ielem_node , k );
-      }
-    }
-
-    // Store the nodal quantity (each node is written by one work item)
-    // and add it to the global value.
-    for ( int k = 0 ; k < value_count ; ++k ) {
-      m_node_value( inode , k ) = sum.value[k] ;
-      update.value[k] += sum.value[k] ;
-    }
-  }
-
-  // Zero every component of a reduction value.
-  KOKKOS_INLINE_FUNCTION
-  void init( value_type & update ) const
-  {
-    for ( int k = 0 ; k < value_count ; ++k ) {
-      update.value[k] = 0 ;
-    }
-  }
-
-  // Add every component of 'input' to 'update'.
-  KOKKOS_INLINE_FUNCTION
-  void join( volatile       value_type & update ,
-             volatile const value_type & input ) const
-  {
-    for ( int k = 0 ; k < value_count ; ++k ) {
-      update.value[k] += input.value[k] ;
-    }
-  }
-};
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-/** \brief  Build the node -> element connectivity from the
- *          element -> node connectivity.
- *
- *  \param elem_node  Input: (element , local node) -> node index.
- *  \param node_scan  Output: entries [ node_scan(n) , node_scan(n+1) )
- *                    of 'node_elem' belong to node n.  Its length is
- *                    taken to be the node count plus one.
- *  \param node_elem  Output: (entry,0) is an element and (entry,1) the
- *                    local node of that element which is the node.
- *
- *  The work is done on host mirrors that are copied back at the end.
- *  The offsets are written outright, starting from node_scan(0) = 0,
- *  so the result does not depend on the previous content of 'node_scan'.
- */
-template< class ViewElemNode ,
-          class ViewNodeScan ,
-          class ViewNodeElem >
-void map_node_to_elem( const ViewElemNode & elem_node ,
-                       const ViewNodeScan & node_scan ,
-                       const ViewNodeElem & node_elem )
-{
-  typedef typename ViewElemNode::host_mirror_space host_mirror_space ;
-
-  const typename ViewElemNode::HostMirror host_elem_node =
-    Kokkos::create_mirror_view(elem_node);
-
-  const typename ViewNodeScan::HostMirror host_node_scan =
-    Kokkos::create_mirror_view(node_scan);
-
-  const typename ViewNodeElem::HostMirror host_node_elem =
-    Kokkos::create_mirror_view(node_elem);
-
-  const int elem_count      = host_elem_node.dimension_0();
-  const int elem_node_count = host_elem_node.dimension_1();
-  const int node_count      = host_node_scan.dimension_0() - 1 ;
-
-  // Per-node counter.
-  // NOTE(review): used without an explicit fill, so this presumably relies
-  // on the view being zero-filled when it is allocated -- confirm.
-  const View<int*, host_mirror_space >
-    node_elem_count( "node_elem_count" , node_count );
-
-  Kokkos::deep_copy( host_elem_node , elem_node );
-
-  // Count the element-node entries that refer to each node.
-  for ( int i = 0 ; i < elem_count ; ++i ) {
-    for ( int j = 0 ; j < elem_node_count ; ++j ) {
-      ++node_elem_count( host_elem_node(i,j) );
-    }
-  }
-
-  // Turn the counts into offsets and reset the counters, which are
-  // reused below as per-node fill positions.
-  host_node_scan(0) = 0 ;
-
-  for ( int i = 0 ; i < node_count ; ++i ) {
-    host_node_scan(i+1) = host_node_scan(i) + node_elem_count(i);
-    node_elem_count(i) = 0 ;
-  }
-
-  // Record each (element , local node) pair in its node's range.
-  for ( int i = 0 ; i < elem_count ; ++i ) {
-    for ( int j = 0 ; j < elem_node_count ; ++j ) {
-      const int inode  = host_elem_node(i,j);
-      const int offset = host_node_scan(inode) + node_elem_count(inode);
-
-      host_node_elem( offset , 0 ) = i ;
-      host_node_elem( offset , 1 ) = j ;
-
-      ++node_elem_count(inode);
-    }
-  }
-
-  Kokkos::deep_copy( node_scan , host_node_scan );
-  Kokkos::deep_copy( node_elem , host_node_elem );
-}
-
-} /* namespace Example */
-} /* namespace Kokkos */
-
-#endif /* #ifndef KOKKOS_EXAMPLE_FEINT_FUNCTORS_HPP */
-
--- a/lib/kokkos/example/feint/Makefile
+++ b/lib/kokkos/example/feint/Makefile
@ -1,62 +0,0 @@
-# GNU Makefile for the Kokkos "feint" example.
-# Builds KokkosExample_Feint from the fixture and feint sources, adding one
-# explicit-instantiation object per enabled Kokkos backend.
-KOKKOS_PATH = ../..
-
-vpath %.cpp ${KOKKOS_PATH}/example/fixture ${KOKKOS_PATH}/example/feint
-
-EXAMPLE_HEADERS = $(wildcard $(KOKKOS_PATH)/example/common/*.hpp ${KOKKOS_PATH}/example/fixture/*.hpp ${KOKKOS_PATH}/example/feint/*.hpp)
-
-# None of these targets names a file; keep stray files from masking them.
-.PHONY: default build_all test clean
-
-default: build_all
-	echo "End Build"
-
-include $(KOKKOS_PATH)/Makefile.kokkos
-
-# Compiler and linker selection: nvcc_wrapper is forced for CUDA builds,
-# otherwise user-supplied values are respected.
-ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
-  CXX = nvcc_wrapper
-  CXXFLAGS ?= -O3
-  LINK = $(CXX)
-  LDFLAGS ?= -lpthread
-else
-  CXX ?= g++
-  CXXFLAGS ?= -O3
-  LINK ?= $(CXX)
-  LDFLAGS ?= -lpthread
-endif
-
-KOKKOS_CXXFLAGS += \
-  -I${KOKKOS_PATH}/example/common \
-  -I${KOKKOS_PATH}/example/fixture \
-  -I${KOKKOS_PATH}/example/feint
-
-
-EXE_EXAMPLE_FEINT = KokkosExample_Feint
-OBJ_EXAMPLE_FEINT = BoxElemPart.o main.o
-
-# One explicit-instantiation translation unit per enabled backend.
-ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
-  OBJ_EXAMPLE_FEINT += feint_cuda.o
-endif
-
-ifeq ($(KOKKOS_INTERNAL_USE_PTHREADS), 1)
-  OBJ_EXAMPLE_FEINT += feint_threads.o
-endif
-
-ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1)
-  OBJ_EXAMPLE_FEINT += feint_openmp.o
-endif
-
-TARGETS = $(EXE_EXAMPLE_FEINT)
-
-#TEST_TARGETS =
-
-$(EXE_EXAMPLE_FEINT) : $(OBJ_EXAMPLE_FEINT) $(KOKKOS_LINK_DEPENDS)
-	$(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ_EXAMPLE_FEINT) $(KOKKOS_LIBS) $(LIB) -o $(EXE_EXAMPLE_FEINT)
-
-build_all : $(TARGETS)
-
-
-test : build_all
-
-# Same cleanup as the sibling fenl example Makefile.
-clean:
-	rm -f *.o $(EXE_EXAMPLE_FEINT) KokkosCore_config.*
-
-# Compilation rules
-
-%.o:%.cpp $(KOKKOS_CPP_DEPENDS) $(EXAMPLE_HEADERS)
-	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $<
-
-
--- a/lib/kokkos/example/feint/feint.hpp
+++ b/lib/kokkos/example/feint/feint.hpp
@ -1,165 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-// 
-//                        Kokkos v. 2.0
-//              Copyright (2014) Sandia Corporation
-// 
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
-// the U.S. Government retains certain rights in this software.
-// 
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
-// ************************************************************************
-//@HEADER
-*/
-
-#ifndef KOKKOS_EXAMPLE_FEINT_HPP
-#define KOKKOS_EXAMPLE_FEINT_HPP
-
-#include <iostream>
-#include <BoxElemFixture.hpp>
-#include <ElemFunctor.hpp>
-#include <feint_fwd.hpp>
-
-namespace Kokkos {
-namespace Example {
-
-/** \brief  Vector valued function to numerically integrate.
- *
- *  F(X) = { 1 , x , y , z , x*y , y*z , z*x , x*y*z }
- *
- *  Integrates on a unit cube to:
- *    { 1 , 1/2 , 1/2 , 1/2 , 1/4 , 1/4 , 1/4 , 1/8 }
- */
-struct MyFunctionType {
-
-  // Number of components produced per evaluation.
-  enum { value_count = 8 };
-
-  // Write the eight components of F into value[0..7] for the
-  // coordinate stored in point[0..2].
-  template< typename CoordType , typename ValueType >
-  KOKKOS_INLINE_FUNCTION
-  void operator()( const CoordType point[] , ValueType value[] ) const
-    {
-      // References, not copies: every use still reads the caller's array.
-      const CoordType & x = point[0] ;
-      const CoordType & y = point[1] ;
-      const CoordType & z = point[2] ;
-
-      value[0] = 1 ;
-      value[1] = x ;
-      value[2] = y ;
-      value[3] = z ;
-      value[4] = x * y ;
-      value[5] = y * z ;
-      value[6] = z * x ;
-      value[7] = x * y * z ;
-    }
-};
-
-/** \brief  Integrate MyFunctionType over a box mesh of the given element
- *          counts on Device, then lump the element results onto the nodes.
- *
- *  Prints the reduced element integral and the reduced nodal sum to std::cout.
- *  Returns early, after printing a message, if the element reduction flags
- *  an error.
- */
-template < class Device , bool UseAtomic >
-void feint(
-  const unsigned global_elem_nx ,
-  const unsigned global_elem_ny ,
-  const unsigned global_elem_nz )
-{
-  //----------------------------------------
-  // Mesh fixture type: linear elements
-  // (Kokkos::Example::BoxElemPart::ElemQuadratic is the alternative).
-
-  typedef Kokkos::Example::BoxElemFixture<
-            Device , Kokkos::Example::BoxElemPart::ElemLinear > FixtureType ;
-
-  // Domain decomposition by element
-  // (Kokkos::Example::BoxElemPart::DecomposeNode is the alternative).
-
-  static const Kokkos::Example::BoxElemPart::Decompose
-    decomposition = Kokkos::Example::BoxElemPart::DecomposeElem ;
-
-  // Single-process run: MPI is not used by this example.
-  const unsigned rank_count = 1 ;
-  const unsigned this_rank  = 0 ;
-
-  const FixtureType mesh( decomposition , rank_count , this_rank ,
-                          global_elem_nx , global_elem_ny , global_elem_nz );
-
-  //----------------------------------------
-  // Element integration functor and its global reduction.
-
-  typedef Kokkos::Example::FiniteElementIntegration<
-            FixtureType , MyFunctionType , UseAtomic > IntegrationType ;
-
-  const IntegrationType integrator( mesh , MyFunctionType() );
-
-  typename IntegrationType::value_type elem_total ;
-
-  Kokkos::parallel_reduce( mesh.elem_count() , integrator , elem_total );
-
-  if ( elem_total.error ) {
-    std::cout << "An element had a spatial jacobian error" << std::endl ;
-    return ;
-  }
-
-  std::cout << "Elem integral =" ;
-  for ( int k = 0 ; k < MyFunctionType::value_count ; ++k ) {
-    std::cout << " " << elem_total.value[k] ;
-  }
-  std::cout << std::endl ;
-
-  //----------------------------------------
-  // Nodal lumping functor and its global reduction.
-
-  typedef Kokkos::Example::LumpElemToNode<
-            typename IntegrationType::NodeValueType ,
-            typename IntegrationType::ElemValueType ,
-            UseAtomic > LumpFunctor ;
-
-  const LumpFunctor lumper( integrator.m_node_lumped ,
-                            integrator.m_elem_integral ,
-                            mesh.elem_node() );
-
-  typename LumpFunctor::value_type node_total ;
-
-  Kokkos::parallel_reduce( mesh.node_count() , lumper , node_total );
-
-  std::cout << "Node lumped sum =" ;
-  for ( int k = 0 ; k < MyFunctionType::value_count ; ++k ) {
-    std::cout << " " << node_total.value[k] ;
-  }
-  std::cout << std::endl ;
-}
-
-} /* namespace Example */
-} /* namespace Kokkos */
-
-#endif /* #ifndef KOKKOS_EXAMPLE_FEINT_HPP */
-
--- a/lib/kokkos/example/feint/feint_cuda.cpp
+++ b/lib/kokkos/example/feint/feint_cuda.cpp
@ -1,67 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-// 
-//                        Kokkos v. 2.0
-//              Copyright (2014) Sandia Corporation
-// 
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
-// the U.S. Government retains certain rights in this software.
-// 
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
-// ************************************************************************
-//@HEADER
-*/
-
-#include <Kokkos_Core.hpp>
-
-#if defined( KOKKOS_HAVE_CUDA )
-
-#include <feint.hpp>
-
-namespace Kokkos {
-namespace Example {
-
-// Explicit instantiations of feint() for the Cuda device:
-// first without atomics, then with atomics.
-
-template void feint< Kokkos::Cuda , false >( unsigned , unsigned , unsigned );
-
-template void feint< Kokkos::Cuda , true >( unsigned , unsigned , unsigned );
-
-} /* namespace Example */
-} /* namespace Kokkos */
-
-#endif /* #if defined( KOKKOS_HAVE_CUDA ) */
-
-
--- a/lib/kokkos/example/feint/feint_fwd.hpp
+++ b/lib/kokkos/example/feint/feint_fwd.hpp
@ -1,60 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-// 
-//                        Kokkos v. 2.0
-//              Copyright (2014) Sandia Corporation
-// 
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
-// the U.S. Government retains certain rights in this software.
-// 
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
-// ************************************************************************
-//@HEADER
-*/
-
-#ifndef KOKKOS_EXAMPLE_FEINT_FWD_HPP
-#define KOKKOS_EXAMPLE_FEINT_FWD_HPP
-
-namespace Kokkos {
-namespace Example {
-
-/** \brief  Forward declaration of the feint example driver.
- *
- *  The three arguments are the global element counts along x, y and z;
- *  they default to a 100 x 115 x 130 box.
- */
-template < class Device , bool UseAtomic >
-void feint( unsigned global_elem_nx = 100 ,
-            unsigned global_elem_ny = 115 ,
-            unsigned global_elem_nz = 130 );
-
-} /* namespace Example */
-} /* namespace Kokkos */
-
-#endif /* #ifndef KOKKOS_EXAMPLE_FEINT_FWD_HPP */
-
--- a/lib/kokkos/example/feint/feint_openmp.cpp
+++ b/lib/kokkos/example/feint/feint_openmp.cpp
@ -1,67 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-// 
-//                        Kokkos v. 2.0
-//              Copyright (2014) Sandia Corporation
-// 
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
-// the U.S. Government retains certain rights in this software.
-// 
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
-// ************************************************************************
-//@HEADER
-*/
-
-#include <Kokkos_Core.hpp>
-
-#if defined( KOKKOS_HAVE_OPENMP )
-
-#include <feint.hpp>
-
-namespace Kokkos {
-namespace Example {
-
-// Explicit instantiations of feint() for the OpenMP device:
-// first without atomics, then with atomics.
-
-template void feint< Kokkos::OpenMP , false >( unsigned , unsigned , unsigned );
-
-template void feint< Kokkos::OpenMP , true >( unsigned , unsigned , unsigned );
-
-} /* namespace Example */
-} /* namespace Kokkos */
-
-#endif /* #if defined( KOKKOS_HAVE_OPENMP ) */
-
-
--- a/lib/kokkos/example/feint/feint_threads.cpp
+++ b/lib/kokkos/example/feint/feint_threads.cpp
@ -1,66 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-// 
-//                        Kokkos v. 2.0
-//              Copyright (2014) Sandia Corporation
-// 
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
-// the U.S. Government retains certain rights in this software.
-// 
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
-// ************************************************************************
-//@HEADER
-*/
-
-#include <Kokkos_Core.hpp>
-
-#if defined( KOKKOS_HAVE_PTHREAD )
-
-#include <feint.hpp>
-
-namespace Kokkos {
-namespace Example {
-
-// Explicit instantiations of feint() for the Threads device:
-// first without atomics, then with atomics.
-
-template void feint< Kokkos::Threads , false >( unsigned , unsigned , unsigned );
-
-template void feint< Kokkos::Threads , true >( unsigned , unsigned , unsigned );
-
-} /* namespace Example */
-} /* namespace Kokkos */
-
-#endif /* #if defined( KOKKOS_HAVE_PTHREAD ) */
--- a/lib/kokkos/example/feint/main.cpp
+++ b/lib/kokkos/example/feint/main.cpp
@ -1,110 +0,0 @@
-//@HEADER
-// ************************************************************************
-// 
-//                        Kokkos v. 2.0
-//              Copyright (2014) Sandia Corporation
-// 
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
-// the U.S. Government retains certain rights in this software.
-// 
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
-// ************************************************************************
-//@HEADER
-
-#include <algorithm>
-#include <iostream>
-#include <utility>
-
-#include <Kokkos_Core.hpp>
-
-#include <feint_fwd.hpp>
-
-// Runs the feint example once without and once with atomics on every
-// Kokkos backend that was enabled at compile time.  Each backend is
-// initialized immediately before its runs and finalized right after them.
-int main()
-{
-#if defined( KOKKOS_HAVE_PTHREAD )
-  {
-    // Use 4 cores per NUMA region, unless fewer available
-
-    const unsigned use_numa_count     = Kokkos::hwloc::get_available_numa_count();
-    const unsigned use_cores_per_numa = std::min( 4u , Kokkos::hwloc::get_available_cores_per_numa() );
-
-    Kokkos::Threads::initialize( use_numa_count * use_cores_per_numa );
-
-    std::cout << "feint< Threads , NotUsingAtomic >" << std::endl ;
-    Kokkos::Example::feint< Kokkos::Threads , false >();
-
-    std::cout << "feint< Threads , UsingAtomic >" << std::endl ;
-    Kokkos::Example::feint< Kokkos::Threads , true  >();
-
-    Kokkos::Threads::finalize();
-  }
-#endif
-
-#if defined( KOKKOS_HAVE_OPENMP )
-  {
-    // Use 4 cores per NUMA region, unless fewer available
-
-    const unsigned use_numa_count     = Kokkos::hwloc::get_available_numa_count();
-    const unsigned use_cores_per_numa = std::min( 4u , Kokkos::hwloc::get_available_cores_per_numa() );
-
-    Kokkos::OpenMP::initialize( use_numa_count * use_cores_per_numa );
-
-    std::cout << "feint< OpenMP , NotUsingAtomic >" << std::endl ;
-    Kokkos::Example::feint< Kokkos::OpenMP , false >();
-
-    std::cout << "feint< OpenMP , UsingAtomic >" << std::endl ;
-    Kokkos::Example::feint< Kokkos::OpenMP , true  >();
-
-    Kokkos::OpenMP::finalize();
-  }
-#endif
-
-#if defined( KOKKOS_HAVE_CUDA )
-  {
-    // Initialize Host mirror device
-    Kokkos::HostSpace::execution_space::initialize(1);
-    const unsigned device_count = Kokkos::Cuda::detect_device_count();
-
-    // Guard the "last device" selection: device_count-1 would wrap
-    // around for an unsigned zero.
-    if ( 0 < device_count ) {
-      // Use the last device:
-      Kokkos::Cuda::initialize( Kokkos::Cuda::SelectDevice(device_count-1) );
-
-      std::cout << "feint< Cuda , NotUsingAtomic >" << std::endl ;
-      Kokkos::Example::feint< Kokkos::Cuda , false >();
-
-      std::cout << "feint< Cuda , UsingAtomic >" << std::endl ;
-      Kokkos::Example::feint< Kokkos::Cuda , true  >();
-
-      Kokkos::Cuda::finalize();
-    }
-    else {
-      std::cout << "feint< Cuda > skipped: no CUDA device detected" << std::endl ;
-    }
-
-    Kokkos::HostSpace::execution_space::finalize();
-
-  }
-#endif
-
-  return 0 ;
-}
-
--- a/lib/kokkos/example/fenl/CGSolve.hpp
+++ b/lib/kokkos/example/fenl/CGSolve.hpp
@ -1,296 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-// 
-//                        Kokkos v. 2.0
-//              Copyright (2014) Sandia Corporation
-// 
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
-// the U.S. Government retains certain rights in this software.
-// 
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
-// ************************************************************************
-//@HEADER
-*/
-
-#ifndef KOKKOS_EXAMPLE_CG_SOLVE
-#define KOKKOS_EXAMPLE_CG_SOLVE
-
-#include <cmath>
-#include <limits>
-#include <Kokkos_Core.hpp>
-#include <impl/Kokkos_Timer.hpp>
-
-#include <WrapMPI.hpp>
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-namespace Kokkos {
-namespace Example {
-
-/** \brief  Compressed-row sparse matrix: a static CRS graph plus one
- *          coefficient per graph entry.
- */
-template< typename ValueType , class Space >
-struct CrsMatrix {
-
-  typedef Kokkos::StaticCrsGraph< unsigned , Space , void , unsigned >
-    StaticCrsGraphType ;
-
-  typedef View< ValueType * , Space >
-    coeff_type ;
-
-  StaticCrsGraphType  graph ;  // sparsity pattern
-  coeff_type          coeff ;  // one value per entry of graph.entries
-
-  // Empty matrix.
-  CrsMatrix()
-    : graph()
-    , coeff()
-    {}
-
-  // Adopt the given graph and allocate a coefficient for each of its entries.
-  // 'graph' is declared first, so it is already set when 'coeff' is sized.
-  CrsMatrix( const StaticCrsGraphType & arg_graph )
-    : graph( arg_graph )
-    , coeff( "crs_matrix_coeff" , graph.entries.dimension_0() )
-    {}
-};
-
-/** \brief  Functor computing one row of y = A * x per call. */
-template< typename MScalar
-        , typename VScalar
-        , class Space >
-struct Multiply {
-
-  const Example::CrsMatrix< MScalar , Space >    m_A ;
-  const Kokkos::View< const VScalar * , Space > m_x ;
-  const Kokkos::View<       VScalar * , Space > m_y ;
-
-  // y(iRow) = sum over the row's entries of coeff * x(column).
-  KOKKOS_INLINE_FUNCTION
-  void operator()( const int iRow ) const
-    {
-      const int row_end = m_A.graph.row_map[iRow+1];
-
-      double row_sum = 0 ;
-
-      for ( int k = m_A.graph.row_map[iRow] ; k < row_end ; ++k ) {
-        row_sum += m_A.coeff(k) * m_x( m_A.graph.entries(k) );
-      }
-
-      m_y(iRow) = row_sum ;
-    }
-
-  // Arguments are given in (output, matrix, input) order.
-  Multiply( const View<       VScalar * , Space > & y
-          , const CrsMatrix< MScalar , Space >    & A
-          , const View< const VScalar * , Space > & x
-          )
-    : m_A( A )
-    , m_x( x )
-    , m_y( y )
-    {}
-};
-
-/** \brief  y = A * x for the first nrow rows, dispatched over Space. */
-template< typename MScalar
-        , typename VScalar
-        , class Space >
-inline
-void multiply( const int nrow
-             , const Kokkos::View< VScalar * , Space >    & y
-             , const Example::CrsMatrix< MScalar , Space > & A
-             , const Kokkos::View< VScalar * , Space >    & x
-             )
-{
-  typedef Multiply< MScalar , VScalar , Space > functor_type ;
-
-  const functor_type row_product( y , A , x );
-
-  Kokkos::parallel_for( Kokkos::RangePolicy<Space>(0,nrow) , row_product );
-}
-
-/** \brief  Functor computing w(i) = alpha * x(i) + beta * y(i). */
-template< typename ValueType , class Space >
-struct WAXPBY {
-
-  typedef Kokkos::View< const ValueType * , Space >  input_view ;
-  typedef Kokkos::View<       ValueType * , Space >  output_view ;
-
-  const input_view   m_x ;
-  const input_view   m_y ;
-  const output_view  m_w ;
-  const double       m_alpha ;
-  const double       m_beta ;
-
-  KOKKOS_INLINE_FUNCTION
-  void operator()( const int i ) const
-    {
-      m_w(i) = m_alpha * m_x(i) + m_beta * m_y(i);
-    }
-
-  // Arguments are given in (w, alpha, x, beta, y) order.
-  WAXPBY( const View< ValueType * , Space >  & arg_w
-        , const double arg_alpha
-        , const View< ValueType * , Space >  & arg_x
-        , const double arg_beta
-        , const View< ValueType * , Space >  & arg_y
-        )
-    : m_x( arg_x ) , m_y( arg_y ) , m_w( arg_w )
-    , m_alpha( arg_alpha ) , m_beta( arg_beta )
-    {}
-};
-
-/** \brief  w = alpha * x + beta * y over the first n entries, dispatched over Space. */
-template< typename VScalar , class Space >
-void waxpby( const int n
-           , const Kokkos::View< VScalar * , Space > & arg_w
-           , const double                      arg_alpha
-           , const Kokkos::View< VScalar * , Space > & arg_x
-           , const double                      arg_beta
-           , const Kokkos::View< VScalar * , Space > & arg_y
-           )
-{
-  const WAXPBY< VScalar , Space >
-    update( arg_w , arg_alpha , arg_x , arg_beta , arg_y );
-
-  Kokkos::parallel_for( Kokkos::RangePolicy<Space>(0,n) , update );
-}
-
-/** \brief  Reduction functor accumulating x(i) * y(i) into a double. */
-template< typename VScalar , class Space >
-struct Dot {
-
-  typedef double value_type ;
-
-  typedef Kokkos::View< const VScalar * , Space >  input_view ;
-
-  const input_view  m_x ;
-  const input_view  m_y ;
-
-  KOKKOS_INLINE_FUNCTION
-  void operator()( const int i , value_type & update ) const
-    {
-      update += m_x(i) * m_y(i);
-    }
-
-  Dot( const Kokkos::View< VScalar * , Space >  & arg_x
-     , const Kokkos::View< VScalar * , Space >  & arg_y
-     )
-    : m_x( arg_x )
-    , m_y( arg_y )
-    {}
-};
-
-/** \brief  Dot product of the first n entries of x and y, reduced over Space. */
-template< typename VScalar , class Space >
-double dot( const int n
-          , const Kokkos::View< VScalar * , Space > & arg_x
-          , const Kokkos::View< VScalar * , Space > & arg_y
-          )
-{
-  const Dot< VScalar , Space > product( arg_x , arg_y );
-
-  double sum = 0 ;
-  Kokkos::parallel_reduce( Kokkos::RangePolicy<Space>(0,n) , product , sum );
-  return sum ;
-}
-
-} // namespace Example
-} // namespace Kokkos
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-namespace Kokkos {
-namespace Example {
-
-struct CGSolveResult {  // statistics that cgsolve() writes when given a non-null result pointer
-  size_t  iteration ;  // number of CG iterations performed
-  double  iter_time ;  // seconds on the wall-clock timer started just before the iteration loop
-  double  matvec_time ;  // accumulated seconds of the timed import + multiply + fence section
-  double  norm_res ;  // square root of the last reduced r . r
-};
-
-/** \brief  Conjugate-gradient iteration for A * x = b.
- *
- *  \param import             callable applied to the owned+received vector
- *                            before every matrix-vector product; it also
- *                            supplies count_owned, count_receive and comm.
- *  \param A                  system matrix.
- *  \param b                  right-hand side.
- *  \param x                  initial guess on entry, updated in place.
- *  \param maximum_iteration  iteration cap.
- *  \param tolerance          the loop stops once the residual norm is no
- *                            longer greater than this value.
- *  \param result             optional; filled with iteration statistics
- *                            when non-null.
- */
-template< class ImportType
-        , typename MScalar
-        , typename VScalar
-        , class Space
-        >
-inline
-void cgsolve( const ImportType & import
-            , const CrsMatrix< MScalar , Space >      & A
-            , const Kokkos::View< VScalar * , Space > & b
-            , const Kokkos::View< VScalar * , Space > & x
-            , const size_t  maximum_iteration = 200
-            , const double  tolerance = std::numeric_limits<double>::epsilon()
-            , CGSolveResult * result = 0
-            )
-{
-  typedef View< VScalar * , Space >  VectorType ;
-
-  const size_t count_owned = import.count_owned ;
-  const size_t count_total = import.count_owned + import.count_receive;
-
-  size_t  iteration = 0 ;
-  double  iter_time = 0 ;
-  double  matvec_time = 0 ;
-  double  norm_res = 0 ;
-
-  // Need input vector to matvec to be owned + received
-  VectorType pAll ( "cg::p" , count_total );
-
-  // p is the owned leading part of pAll.
-  VectorType p = Kokkos::subview( pAll , std::pair<size_t,size_t>(0,count_owned) );
-  VectorType r ( "cg::r" , count_owned );
-  VectorType Ap( "cg::Ap", count_owned );
-
-  /* r = b - A * x ; */
-
-  /* p  = x       */  Kokkos::deep_copy( p , x );
-  /* import p     */  import( pAll );
-  /* Ap = A * p   */  multiply( count_owned , Ap , A , pAll );
-  /* r = b - Ap   */  waxpby( count_owned , r , 1.0 , b , -1.0 , Ap );
-  /* p  = r       */  Kokkos::deep_copy( p , r );
-
-  double old_rdot = Kokkos::Example::all_reduce( dot( count_owned , r , r ) , import.comm );
-
-  // <cmath> only guarantees the std:: overloads, so qualify the call.
-  norm_res = std::sqrt( old_rdot );
-
-  Kokkos::Impl::Timer wall_clock ;
-  Kokkos::Impl::Timer timer;
-
-  while ( tolerance < norm_res && iteration < maximum_iteration ) {
-
-    /* pAp_dot = dot( p , Ap = A * p ) */
-
-    // Only the import + matvec (fenced) section counts toward matvec_time.
-    timer.reset();
-    /* import p    */  import( pAll );
-    /* Ap = A * p  */  multiply( count_owned , Ap , A , pAll );
-    Space::fence();
-    matvec_time += timer.seconds();
-
-    const double pAp_dot = Kokkos::Example::all_reduce( dot( count_owned , p , Ap ) , import.comm );
-    const double alpha   = old_rdot / pAp_dot ;
-
-    /* x +=  alpha * p ;  */ waxpby( count_owned , x ,  alpha, p  , 1.0 , x );
-    /* r += -alpha * Ap ; */ waxpby( count_owned , r , -alpha, Ap , 1.0 , r );
-
-    const double r_dot = Kokkos::Example::all_reduce( dot( count_owned , r , r ) , import.comm );
-    const double beta  = r_dot / old_rdot ;
-
-    /* p = r + beta * p ; */ waxpby( count_owned , p , 1.0 , r , beta , p );
-
-    old_rdot = r_dot ;
-    norm_res = std::sqrt( old_rdot );
-
-    ++iteration ;
-  }
-
-  Space::fence();
-  iter_time = wall_clock.seconds();
-
-  if ( 0 != result ) {
-    result->iteration   = iteration ;
-    result->iter_time   = iter_time ;
-    result->matvec_time = matvec_time ;
-    result->norm_res    = norm_res ;
-  }
-}
-
-} // namespace Example
-} // namespace Kokkos
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-#endif /* #ifndef KOKKOS_EXAMPLE_CG_SOLVE */
-
-
--- a/lib/kokkos/example/fenl/CMakeLists.txt
+++ b/lib/kokkos/example/fenl/CMakeLists.txt
@ -1,17 +0,0 @@
-
-# Header search paths: the build tree, this example's own headers, and the
-# sibling ../common and ../fixture example directories.
-INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR})
-INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
-INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}/../common)
-INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}/../fixture)
-
-# Sources are listed explicitly instead of FILE(GLOB) so that adding or
-# removing a file changes this list and triggers a reconfigure.
-# BoxElemPart.cpp is shared with the ../fixture example.
-SET(SOURCES
-  fenl.cpp
-  main.cpp
-  ../fixture/BoxElemPart.cpp
-  )
-
-TRIBITS_ADD_EXECUTABLE(
-  fenl
-  SOURCES ${SOURCES}
-  COMM serial mpi
-  )
--- a/lib/kokkos/example/fenl/Makefile
+++ b/lib/kokkos/example/fenl/Makefile
@ -1,57 +0,0 @@
-# Root of the Kokkos tree that provides Makefile.kokkos; may be overridden.
-KOKKOS_PATH ?= ../..
-
-# Directory containing this Makefile.  This must be evaluated before the
-# 'include' below, because $(lastword $(MAKEFILE_LIST)) names the makefile
-# that was read most recently.
-MAKEFILE_PATH := $(abspath $(lastword $(MAKEFILE_LIST)))
-SRC_DIR := $(dir $(MAKEFILE_PATH))
-
-# Sources are searched for in ../fixture (BoxElemPart.cpp) and in this directory.
-vpath %.cpp ${SRC_DIR}/../fixture ${SRC_DIR}
-
-# Every object depends on every example header (see the pattern rule below).
-EXAMPLE_HEADERS = $(wildcard $(SRC_DIR)/../common/*.hpp ${SRC_DIR}/../fixture/*.hpp ${SRC_DIR}/*.hpp)
-
-# These targets name actions, not files; without this declaration a stray
-# file called e.g. 'clean' or 'test' would make the target look up to date.
-.PHONY: default build_all test clean
-
-default: build_all
-	echo "End Build"
-
-include $(KOKKOS_PATH)/Makefile.kokkos
-
-# KOKKOS_INTERNAL_USE_CUDA is not exported to installed Makefile.kokkos
-# use KOKKOS_DEVICES here
-ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES)))
-	CXX = nvcc_wrapper
-	CXXFLAGS ?= -O3
-	LINK = $(CXX)
-	LDFLAGS ?= -lpthread
-else
-	CXX ?= g++
-	CXXFLAGS ?= -O3
-	LINK ?= $(CXX)
-	LDFLAGS ?= -lpthread
-endif
-
-# Include paths for the shared example headers and this example's own headers.
-KOKKOS_CXXFLAGS +=	\
-	-I${SRC_DIR}/../common	\
-	-I${SRC_DIR}/../fixture	\
-	-I${SRC_DIR}
-
-
-EXE_EXAMPLE_FENL = KokkosExample_Fenl
-OBJ_EXAMPLE_FENL = BoxElemPart.o main.o fenl.o
-
-TARGETS = $(EXE_EXAMPLE_FENL)
-
-#TEST_TARGETS =
-
-# Link the example executable from its three objects.
-$(EXE_EXAMPLE_FENL) : $(OBJ_EXAMPLE_FENL) $(KOKKOS_LINK_DEPENDS)
-	$(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ_EXAMPLE_FENL) $(KOKKOS_LIBS) $(LIB) -o $(EXE_EXAMPLE_FENL)
-
-build_all : $(TARGETS)
-
-
-# 'test' only builds; no test targets are defined for this example.
-test : build_all
-
-clean:
-	rm -f *.o $(EXE_EXAMPLE_FENL) KokkosCore_config.*
-
-# Compilation rules
-
-%.o:%.cpp $(KOKKOS_CPP_DEPENDS) $(EXAMPLE_HEADERS)
-	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $<
-
--- a/lib/kokkos/example/fenl/fenl.cpp
+++ b/lib/kokkos/example/fenl/fenl.cpp
@ -1,117 +0,0 @@
-/*
-// ************************************************************************
-//
-//   Kokkos: Manycore Performance-Portable Multidimensional Arrays
-//              Copyright (2012) Sandia Corporation
-//
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-//
-// ************************************************************************
-*/
-
-#include <HexElement.hpp>
-#include <fenl_impl.hpp>
-
-namespace Kokkos {
-namespace Example {
-namespace FENL {
-
-// Explicit instantiations of the fenl< Space , ElemOrder > driver template
-// (its definition comes from fenl_impl.hpp) for each enabled Kokkos
-// execution space, once per element order (ElemLinear, ElemQuadratic).
-//
-// Parameter names are omitted; in order the arguments are
-//   comm , use_print , use_trials , use_atomic , global_elems[]
-
-#ifdef KOKKOS_HAVE_PTHREAD
-
-template Perf fenl< Kokkos::Threads , BoxElemPart::ElemLinear >(
-  MPI_Comm , const int , const int , const int , const int [] );
-
-template Perf fenl< Kokkos::Threads , BoxElemPart::ElemQuadratic >(
-  MPI_Comm , const int , const int , const int , const int [] );
-
-#endif /* KOKKOS_HAVE_PTHREAD */
-
-#ifdef KOKKOS_HAVE_OPENMP
-
-template Perf fenl< Kokkos::OpenMP , BoxElemPart::ElemLinear >(
-  MPI_Comm , const int , const int , const int , const int [] );
-
-template Perf fenl< Kokkos::OpenMP , BoxElemPart::ElemQuadratic >(
-  MPI_Comm , const int , const int , const int , const int [] );
-
-#endif /* KOKKOS_HAVE_OPENMP */
-
-#ifdef KOKKOS_HAVE_CUDA
-
-template Perf fenl< Kokkos::Cuda , BoxElemPart::ElemLinear >(
-  MPI_Comm , const int , const int , const int , const int [] );
-
-template Perf fenl< Kokkos::Cuda , BoxElemPart::ElemQuadratic >(
-  MPI_Comm , const int , const int , const int , const int [] );
-
-#endif /* KOKKOS_HAVE_CUDA */
-
-} /* namespace FENL */
-} /* namespace Example */
-} /* namespace Kokkos */
-
--- a/lib/kokkos/example/fenl/fenl.hpp
+++ b/lib/kokkos/example/fenl/fenl.hpp
@ -1,89 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-// 
-//                        Kokkos v. 2.0
-//              Copyright (2014) Sandia Corporation
-// 
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
-// the U.S. Government retains certain rights in this software.
-// 
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
-// ************************************************************************
-//@HEADER
-*/
-
-#ifndef KOKKOS_EXAMPLE_FENL_HPP
-#define KOKKOS_EXAMPLE_FENL_HPP
-
-#include <stdlib.h>
-#include <BoxElemPart.hpp>
-#include <WrapMPI.hpp>
-
-namespace Kokkos {
-namespace Example {
-namespace FENL {
-
-struct Perf {  // Counters and timings (seconds) filled in and returned by fenl()
-  size_t global_elem_count ;  // fixture.elem_count_global()
-  size_t global_node_count ;  // fixture.node_count_global()
-  size_t newton_iter_count ;  // Newton loop counter when the loop ended
-  size_t cg_iter_count ;  // CG iterations summed over all Newton iterations
-  double map_ratio ;  // max over ranks of the graph construction's 'ratio' value
-  double fill_node_set ;  // graph construction phase time, max over ranks
-  double scan_node_count ;  // graph construction phase time, max over ranks
-  double fill_graph_entries ;  // graph construction phase time, max over ranks
-  double sort_graph_entries ;  // graph construction phase time, max over ranks
-  double fill_element_graph ;  // graph construction phase time, max over ranks
-  double create_sparse_matrix ;  // time to construct the matrix from the graph, max over ranks
-  double fill_time ;  // element residual/Jacobian fill time, max over ranks
-  double bc_time ;  // boundary condition application time, max over ranks
-  double matvec_time ;  // CG matrix-vector time summed over Newton iterations
-  double cg_time ;  // CG iteration-loop time summed over Newton iterations
-  double newton_residual ;  // residual norm evaluated in the most recent Newton iteration
-  double error_max ;  // derived from the max relative error against the manufactured solution
-
-};
-
-template < class Device , BoxElemPart::ElemOrder ElemOrder >  // Device: Kokkos space to run on; ElemOrder: element order of the box fixture
-Perf fenl(  // Runs the FENL example and returns its statistics; defined in fenl_impl.hpp
-  MPI_Comm comm ,  // communicator shared by all participating ranks
-  const int use_print ,  // nonzero enables diagnostic output on std::cout
-  const int use_trials ,  // number of repetitions of the assemble-and-solve pass
-  const int use_atomic ,  // nonzero selects the fill variant that skips the separate gather-fill pass
-  const int global_elems[] );  // element counts; entries [0], [1] and [2] are read
-
-} /* namespace FENL */
-} /* namespace Example */
-} /* namespace Kokkos */
-
-#endif /* #ifndef KOKKOS_EXAMPLE_FENL_HPP */
-
--- a/lib/kokkos/example/fenl/fenl_functors.hpp
+++ b/lib/kokkos/example/fenl/fenl_functors.hpp
--- a/lib/kokkos/example/fenl/fenl_impl.hpp
+++ b/lib/kokkos/example/fenl/fenl_impl.hpp
@ -1,598 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-// 
-//                        Kokkos v. 2.0
-//              Copyright (2014) Sandia Corporation
-// 
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
-// the U.S. Government retains certain rights in this software.
-// 
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
-// ************************************************************************
-//@HEADER
-*/
-
-#ifndef KOKKOS_EXAMPLE_FENL_IMPL_HPP
-#define KOKKOS_EXAMPLE_FENL_IMPL_HPP
-
-#include <math.h>
-
-// Kokkos libraries' headers:
-
-#include <Kokkos_UnorderedMap.hpp>
-#include <Kokkos_StaticCrsGraph.hpp>
-#include <impl/Kokkos_Timer.hpp>
-
-// Examples headers:
-
-#include <BoxElemFixture.hpp>
-#include <VectorImport.hpp>
-#include <CGSolve.hpp>
-
-#include <fenl.hpp>
-#include <fenl_functors.hpp>
-
-//----------------------------------------------------------------------------
-
-namespace Kokkos {
-namespace Example {
-namespace FENL {
-
-// Maximum of 'local' over every rank of 'comm'.
-// Built without KOKKOS_HAVE_MPI this is simply 'local'.
-inline
-double maximum( MPI_Comm comm , double local )
-{
-#if defined( KOKKOS_HAVE_MPI )
-  double reduced = local ;
-  MPI_Allreduce( & local , & reduced , 1 , MPI_DOUBLE , MPI_MAX , comm );
-  return reduced ;
-#else
-  return local ;
-#endif
-}
-
-} /* namespace FENL */
-} /* namespace Example */
-} /* namespace Kokkos */
-
-//----------------------------------------------------------------------------
-
-namespace Kokkos {
-namespace Example {
-namespace FENL {
-
-class ManufacturedSolution {
-
-  // 1 / sqrt( value ) ; used to turn a boundary temperature into a coefficient.
-  static double inv_sqrt( const double value ) { return 1.0 / sqrt( value ); }
-
-public:
-
-  // Analytic ("manufactured") solution of the one-dimensional nonlinear
-  // boundary value problem
-  //
-  //    -K T_zz + T^2 = 0 ,   T(zmin) = T_zmin ,   T(zmax) = T_zmax
-  //
-  // which admits solutions of the form
-  //
-  //    T(z) = ( a ( z - zmin ) + b )^(-2)   provided   K = 1 / ( 6 a^2 )
-  //
-  // The two boundary values determine a and b, and a determines K.
-  // Two coefficient choices match the boundary values:
-  //
-  //    solution with a singularity:
-  //      a = ( 1 / sqrt(T_zmax) + 1 / sqrt(T_zmin) ) / ( zmax - zmin )
-  //      b = -1 / sqrt(T_zmin)
-  //
-  //    solution without a singularity (the one constructed here):
-  //      a = ( 1 / sqrt(T_zmax) - 1 / sqrt(T_zmin) ) / ( zmax - zmin )
-  //      b =  1 / sqrt(T_zmin)
-  //
-  // No argument checking is done: equal end points or equal boundary
-  // values lead to a division by zero when computing a or K.
-
-  const double zmin ;    // lower end of the interval
-  const double zmax ;    // upper end of the interval
-  const double T_zmin ;  // prescribed value at zmin
-  const double T_zmax ;  // prescribed value at zmax
-  const double a ;       // slope coefficient of the solution
-  const double b ;       // offset coefficient of the solution
-  const double K ;       // coefficient of the PDE matching a
-
-  ManufacturedSolution( const double arg_zmin ,
-                        const double arg_zmax ,
-                        const double arg_T_zmin ,
-                        const double arg_T_zmax )
-    : zmin( arg_zmin )
-    , zmax( arg_zmax )
-    , T_zmin( arg_T_zmin )
-    , T_zmax( arg_T_zmax )
-    , a( ( inv_sqrt( arg_T_zmax ) - inv_sqrt( arg_T_zmin ) ) / ( arg_zmax - arg_zmin ) )
-    , b( inv_sqrt( arg_T_zmin ) )
-    , K( 1.0 / ( 6.0 * a * a ) )
-    {}
-
-  // Evaluate T(z).
-  double operator()( const double z ) const
-  {
-    const double inv_sqrt_T = a * ( z - zmin ) + b ;
-    return 1.0 / ( inv_sqrt_T * inv_sqrt_T );
-  }
-};
-
-} /* namespace FENL */
-} /* namespace Example */
-} /* namespace Kokkos */
-
-//----------------------------------------------------------------------------
-
-namespace Kokkos {
-namespace Example {
-namespace FENL {
-
-template < class Space , BoxElemPart::ElemOrder ElemOrder >  // Space: Kokkos space for all views and kernels; ElemOrder: element order of the fixture
-Perf fenl(  // Builds the box fixture, then runs use_trials assemble / Newton / CG passes and returns their statistics
-  MPI_Comm comm ,  // communicator shared by all participating ranks
-  const int use_print ,  // nonzero: print the fixture; matrices and vectors are printed only for host memory spaces (see print_flag)
-  const int use_trials ,  // number of repetitions of the pass below
-  const int use_atomic ,  // nonzero: element kernel receives the Jacobian and residual itself; zero: a separate gather-fill pass is run
-  const int use_elems[] )  // element counts; entries [0], [1] and [2] are handed to the fixture
-{
-  typedef Kokkos::Example::BoxElemFixture< Space , ElemOrder > FixtureType ;
-
-  typedef Kokkos::Example::CrsMatrix< double , Space >
-    SparseMatrixType ;
-
-  typedef typename SparseMatrixType::StaticCrsGraphType
-    SparseGraphType ;
-
-  typedef Kokkos::Example::FENL::NodeNodeGraph< typename FixtureType::elem_node_type , SparseGraphType , FixtureType::ElemNode >
-     NodeNodeGraphType ;
-
-  typedef Kokkos::Example::FENL::ElementComputation< FixtureType , SparseMatrixType >
-    ElementComputationType ;
-
-  typedef Kokkos::Example::FENL::DirichletComputation< FixtureType , SparseMatrixType >
-    DirichletComputationType ;
-
-  typedef NodeElemGatherFill< ElementComputationType >
-    NodeElemGatherFillType ;
-
-  typedef typename ElementComputationType::vector_type VectorType ;
-
-  typedef Kokkos::Example::VectorImport<
-     typename FixtureType::comm_list_type ,
-     typename FixtureType::send_nodeid_type ,
-     VectorType > ImportType ;
-
-  //------------------------------------
-  // Fixed solver limits and tolerances
-  const unsigned newton_iteration_limit     = 10 ;
-  const double   newton_iteration_tolerance = 1e-7 ;  // relative to the residual norm of the first Newton iteration
-  const unsigned cg_iteration_limit         = 200 ;
-  const double   cg_iteration_tolerance     = 1e-7 ;  // handed to cgsolve unchanged
-
-  //------------------------------------
-  // The matrix/vector printing below indexes views directly, so it is enabled only when Space's memory space is HostSpace
-  const int print_flag = use_print && Kokkos::Impl::is_same< Kokkos::HostSpace , typename Space::memory_space >::value ;
-
-  int comm_rank ;
-  int comm_size ;
-
-  MPI_Comm_rank( comm , & comm_rank );
-  MPI_Comm_size( comm , & comm_size );
-
-  // Decompose by node to avoid mpi-communication for assembly
-
-  const float bubble_x = 1.0 ;
-  const float bubble_y = 1.0 ;
-  const float bubble_z = 1.0 ;
-
-  const FixtureType fixture( BoxElemPart::DecomposeNode , comm_size , comm_rank ,
-                             use_elems[0] , use_elems[1] , use_elems[2] ,
-                             bubble_x , bubble_y , bubble_z );
-
-
-  {
-    int global_error = ! fixture.ok();
-    // With MPI the per-rank flags are summed, so every rank sees the same value and throws together
-#if defined( KOKKOS_HAVE_MPI )
-    int local_error = global_error ;
-    global_error = 0 ;
-    MPI_Allreduce( & local_error , & global_error , 1 , MPI_INT , MPI_SUM , comm );
-#endif
-
-    if ( global_error ) {
-      throw std::runtime_error(std::string("Error generating finite element fixture"));
-    }
-  }
-
-  //------------------------------------
-  // Import object for nodal vectors: owned node count plus the count of remaining (non-owned) nodes
-  const ImportType comm_nodal_import(
-    comm ,
-    fixture.recv_node() ,
-    fixture.send_node() ,
-    fixture.send_nodeid() ,
-    fixture.node_count_owned() ,
-    fixture.node_count() - fixture.node_count_owned() );
-
-  //------------------------------------
-  // Manufactured solution on z in [0,1] with boundary values 1 and 2
-  const double bc_lower_value = 1 ;
-  const double bc_upper_value = 2 ;
-
-  const Kokkos::Example::FENL::ManufacturedSolution
-    manufactured_solution( 0 , 1 , bc_lower_value , bc_upper_value  );
-
-  //------------------------------------
-  // Fixture dump through host mirrors: pass k prints rank k, and every rank joins the barrier ending each pass
-  for ( int k = 0 ; k < comm_size && use_print ; ++k ) {
-    if ( k == comm_rank ) {
-      typename FixtureType::node_grid_type::HostMirror
-        h_node_grid = Kokkos::create_mirror_view( fixture.node_grid() );
-
-      typename FixtureType::node_coord_type::HostMirror
-        h_node_coord = Kokkos::create_mirror_view( fixture.node_coord() );
-
-      typename FixtureType::elem_node_type::HostMirror
-        h_elem_node = Kokkos::create_mirror_view( fixture.elem_node() );
-
-      Kokkos::deep_copy( h_node_grid , fixture.node_grid() );
-      Kokkos::deep_copy( h_node_coord , fixture.node_coord() );
-      Kokkos::deep_copy( h_elem_node , fixture.elem_node() );
-
-      std::cout << "MPI[" << comm_rank << "]" << std::endl ;
-      std::cout << "Node grid {" ;
-      for ( unsigned inode = 0 ; inode < fixture.node_count() ; ++inode ) {
-        std::cout << " (" << h_node_grid(inode,0)
-                  << "," << h_node_grid(inode,1)
-                  << "," << h_node_grid(inode,2)
-                  << ")" ;
-      }
-      std::cout << " }" << std::endl ;
-  
-      std::cout << "Node coord {" ;
-      for ( unsigned inode = 0 ; inode < fixture.node_count() ; ++inode ) {
-        std::cout << " (" << h_node_coord(inode,0)
-                  << "," << h_node_coord(inode,1)
-                  << "," << h_node_coord(inode,2)
-                  << ")" ;
-      }
-      std::cout << " }" << std::endl ;
-
-      std::cout << "Manufactured solution"
-                << " a[" << manufactured_solution.a << "]"
-                << " b[" << manufactured_solution.b << "]"
-                << " K[" << manufactured_solution.K << "]"
-                << " {" ;
-      for ( unsigned inode = 0 ; inode < fixture.node_count() ; ++inode ) {
-        std::cout << " " << manufactured_solution( h_node_coord( inode , 2 ) );  // evaluated at the node's third coordinate
-      }
-      std::cout << " }" << std::endl ;
-
-      std::cout << "ElemNode {" << std::endl ;
-      for ( unsigned ielem = 0 ; ielem < fixture.elem_count() ; ++ielem ) {
-        std::cout << "  elem[" << ielem << "]{" ;
-        for ( unsigned inode = 0 ; inode < FixtureType::ElemNode ; ++inode ) {
-          std::cout << " " << h_elem_node(ielem,inode);
-        }
-        std::cout << " }{" ;
-        for ( unsigned inode = 0 ; inode < FixtureType::ElemNode ; ++inode ) {
-          std::cout << " (" << h_node_grid(h_elem_node(ielem,inode),0)
-                    << "," << h_node_grid(h_elem_node(ielem,inode),1)
-                    << "," << h_node_grid(h_elem_node(ielem,inode),2)
-                    << ")" ;
-        }
-        std::cout << " }" << std::endl ;
-      }
-      std::cout << "}" << std::endl ;
-    }
-    std::cout.flush();
-    MPI_Barrier( comm );
-  }
-
-  //------------------------------------
-  // One timer, reset before each timed phase; perf_stats is the value returned to the caller
-  Kokkos::Impl::Timer wall_clock ;
-
-  Perf perf_stats = Perf() ;
-
-  for ( int itrial = 0 ; itrial < use_trials ; ++itrial ) {
-
-    Perf perf = Perf() ;  // statistics of this trial only, zero-initialized
-
-    perf.global_elem_count = fixture.elem_count_global();
-    perf.global_node_count = fixture.node_count_global();
-
-    //----------------------------------
-    // Create the sparse matrix graph and element-to-graph map
-    // from the element->to->node identifier array.
-    // The graph only has rows for the owned nodes.
-
-    typename NodeNodeGraphType::Times graph_times;
-
-    const NodeNodeGraphType
-      mesh_to_graph( fixture.elem_node() , fixture.node_count_owned(), graph_times );
-    // Each graph construction statistic is reported as its maximum over all ranks
-    perf.map_ratio          = maximum(comm, graph_times.ratio);
-    perf.fill_node_set      = maximum(comm, graph_times.fill_node_set);
-    perf.scan_node_count    = maximum(comm, graph_times.scan_node_count);
-    perf.fill_graph_entries = maximum(comm, graph_times.fill_graph_entries);
-    perf.sort_graph_entries = maximum(comm, graph_times.sort_graph_entries);
-    perf.fill_element_graph = maximum(comm, graph_times.fill_element_graph);
-
-    wall_clock.reset();
-    // Create the sparse matrix from the graph:
-
-    SparseMatrixType jacobian( mesh_to_graph.graph );
-
-    Space::fence();  // fence before reading the timer
-
-    perf.create_sparse_matrix = maximum( comm , wall_clock.seconds() );
-
-    //----------------------------------
-    // Rank-ordered dump of the Jacobian graph and element graph (host memory spaces only)
-    for ( int k = 0 ; k < comm_size && print_flag ; ++k ) {
-      if ( k == comm_rank ) {
-        const unsigned nrow = jacobian.graph.numRows();
-        std::cout << "MPI[" << comm_rank << "]" << std::endl ;
-        std::cout << "JacobianGraph {" << std::endl ;
-        for ( unsigned irow = 0 ; irow < nrow ; ++irow ) {
-          std::cout << "  row[" << irow << "]{" ;
-          const unsigned entry_end = jacobian.graph.row_map(irow+1);
-          for ( unsigned entry = jacobian.graph.row_map(irow) ; entry < entry_end ; ++entry ) {
-            std::cout << " " << jacobian.graph.entries(entry);
-          }
-          std::cout << " }" << std::endl ;
-        }
-        std::cout << "}" << std::endl ;
-
-        std::cout << "ElemGraph {" << std::endl ;
-        for ( unsigned ielem = 0 ; ielem < mesh_to_graph.elem_graph.dimension_0() ; ++ielem ) {
-          std::cout << "  elem[" << ielem << "]{" ;
-          for ( unsigned irow = 0 ; irow < mesh_to_graph.elem_graph.dimension_1() ; ++irow ) {
-            std::cout << " {" ;
-            for ( unsigned icol = 0 ; icol < mesh_to_graph.elem_graph.dimension_2() ; ++icol ) {
-              std::cout << " " << mesh_to_graph.elem_graph(ielem,irow,icol);
-            }
-            std::cout << " }" ;
-          }
-          std::cout << " }" << std::endl ;
-        }
-        std::cout << "}" << std::endl ;
-      }
-      std::cout.flush();
-      MPI_Barrier( comm );
-    }
-
-    //----------------------------------
-
-    // Allocate solution vector for each node in the mesh and residual vector for each owned node
-    const VectorType nodal_solution( "nodal_solution" , fixture.node_count() );
-    const VectorType nodal_residual( "nodal_residual" , fixture.node_count_owned() );
-    const VectorType nodal_delta(    "nodal_delta" ,    fixture.node_count_owned() );
-
-    // Create element computation functor; with use_atomic it is also given the element graph, Jacobian and residual
-    const ElementComputationType elemcomp(
-      use_atomic ? ElementComputationType( fixture , manufactured_solution.K , nodal_solution ,
-                                           mesh_to_graph.elem_graph , jacobian , nodal_residual )
-                 : ElementComputationType( fixture , manufactured_solution.K , nodal_solution ) );
-    // Gather-fill functor; default-constructed, and never applied, when use_atomic is set
-    const NodeElemGatherFillType gatherfill(
-      use_atomic ? NodeElemGatherFillType()
-                 : NodeElemGatherFillType( fixture.elem_node() ,
-                                           mesh_to_graph.elem_graph ,
-                                           nodal_residual ,
-                                           jacobian ,
-                                           elemcomp.elem_residuals ,
-                                           elemcomp.elem_jacobians ) );
-
-    // Create boundary condition functor
-    const DirichletComputationType dirichlet(
-      fixture , nodal_solution , jacobian , nodal_residual ,
-      2 /* apply at 'z' ends */ ,
-      manufactured_solution.T_zmin ,
-      manufactured_solution.T_zmax );
-
-    //----------------------------------
-    // Nonlinear Newton iteration:
-
-    double residual_norm_init = 0 ;  // residual norm of iteration 0, reference for the relative convergence test
-
-    for ( perf.newton_iter_count = 0 ;
-          perf.newton_iter_count < newton_iteration_limit ;
-          ++perf.newton_iter_count ) {
-
-      //--------------------------------
-      // Import the current solution through the nodal import object before assembling
-      comm_nodal_import( nodal_solution );
-
-      //--------------------------------
-      // Element contributions to residual and jacobian
-
-      wall_clock.reset();
-
-      Kokkos::deep_copy( nodal_residual , double(0) );
-      Kokkos::deep_copy( jacobian.coeff , double(0) );
-
-      elemcomp.apply();
-
-      if ( ! use_atomic ) {
-        gatherfill.apply();
-      }
-
-      Space::fence();
-      perf.fill_time = maximum( comm , wall_clock.seconds() );  // assigned, not accumulated: holds the last Newton iteration's value
-
-      //--------------------------------
-      // Apply boundary conditions
-
-      wall_clock.reset();
-
-      dirichlet.apply();
-
-      Space::fence();
-      perf.bc_time = maximum( comm , wall_clock.seconds() );  // assigned, not accumulated: holds the last Newton iteration's value
-
-      //--------------------------------
-      // Evaluate convergence
-
-      const double residual_norm =
-        std::sqrt(
-          Kokkos::Example::all_reduce(
-            Kokkos::Example::dot( fixture.node_count_owned() , nodal_residual, nodal_residual ) , comm ) );
-
-      perf.newton_residual = residual_norm ;
-
-      if ( 0 == perf.newton_iter_count ) { residual_norm_init = residual_norm ; }
-
-      if ( residual_norm < residual_norm_init * newton_iteration_tolerance ) { break ; }  // converged: leave before solving again
-
-      //--------------------------------
-      // Solve for nonlinear update
-
-      CGSolveResult cg_result ;
-
-      Kokkos::Example::cgsolve( comm_nodal_import
-                              , jacobian
-                              , nodal_residual
-                              , nodal_delta
-                              , cg_iteration_limit
-                              , cg_iteration_tolerance
-                              , & cg_result
-                              );
-
-      // Update solution vector: nodal_solution = nodal_solution - nodal_delta on the owned nodes
-
-      Kokkos::Example::waxpby( fixture.node_count_owned() , nodal_solution , -1.0 , nodal_delta , 1.0 , nodal_solution );
-
-      perf.cg_iter_count += cg_result.iteration ;
-      perf.matvec_time   += cg_result.matvec_time ;
-      perf.cg_time       += cg_result.iter_time ;
-
-      //--------------------------------
-
-      if ( print_flag ) {
-        const double delta_norm =
-          std::sqrt(
-            Kokkos::Example::all_reduce(
-              Kokkos::Example::dot( fixture.node_count_owned() , nodal_delta, nodal_delta ) , comm ) );
-
-        if ( 0 == comm_rank ) {
-          std::cout << "Newton iteration[" << perf.newton_iter_count << "]"
-                    << " residual[" << perf.newton_residual << "]"
-                    << " update[" << delta_norm << "]"
-                    << " cg_iteration[" << cg_result.iteration << "]"
-                    << " cg_residual[" << cg_result.norm_res << "]"
-                    << std::endl ;
-        }
-        // Rank-ordered dump of residual, update, solution and Jacobian for the owned rows
-        for ( int k = 0 ; k < comm_size ; ++k ) {
-          if ( k == comm_rank ) {
-            const unsigned nrow = jacobian.graph.numRows();
-
-            std::cout << "MPI[" << comm_rank << "]" << std::endl ;
-            std::cout << "Residual {" ;
-            for ( unsigned irow = 0 ; irow < nrow ; ++irow ) {
-              std::cout << " " << nodal_residual(irow);
-            }
-            std::cout << " }" << std::endl ;
-
-            std::cout << "Delta {" ;
-            for ( unsigned irow = 0 ; irow < nrow ; ++irow ) {
-              std::cout << " " << nodal_delta(irow);
-            }
-            std::cout << " }" << std::endl ;
-
-            std::cout << "Solution {" ;
-            for ( unsigned irow = 0 ; irow < nrow ; ++irow ) {
-              std::cout << " " << nodal_solution(irow);
-            }
-            std::cout << " }" << std::endl ;
-
-            std::cout << "Jacobian[ "
-                      << jacobian.graph.numRows() << " x " << Kokkos::maximum_entry( jacobian.graph )
-                      << " ] {" << std::endl ;
-            for ( unsigned irow = 0 ; irow < nrow ; ++irow ) {
-              std::cout << "  {" ;
-              const unsigned entry_end = jacobian.graph.row_map(irow+1);
-              for ( unsigned entry = jacobian.graph.row_map(irow) ; entry < entry_end ; ++entry ) {
-                std::cout << " (" << jacobian.graph.entries(entry)
-                          << "," << jacobian.coeff(entry)
-                          << ")" ;
-              }
-              std::cout << " }" << std::endl ;
-            }
-            std::cout << "}" << std::endl ;
-          }
-          std::cout.flush();
-          MPI_Barrier( comm );
-        }
-      }
-      //--------------------------------
-    }
-
-    // Evaluate solution error
-    // (first trial only; later trials only lower the timing minima in the else branch)
-    if ( 0 == itrial ) {
-      const typename FixtureType::node_coord_type::HostMirror
-        h_node_coord = Kokkos::create_mirror_view( fixture.node_coord() );
-
-      const typename VectorType::HostMirror
-        h_nodal_solution = Kokkos::create_mirror_view( nodal_solution );
-
-      Kokkos::deep_copy( h_node_coord , fixture.node_coord() );
-      Kokkos::deep_copy( h_nodal_solution , nodal_solution );
-
-      double error_max = 0 ;  // largest relative error magnitude over this rank's owned nodes
-      for ( unsigned inode = 0 ; inode < fixture.node_count_owned() ; ++inode ) {
-        const double answer = manufactured_solution( h_node_coord( inode , 2 ) );
-        const double error = ( h_nodal_solution(inode) - answer ) / answer ;
-        if ( error_max < fabs( error ) ) { error_max = fabs( error ); }
-      }
-
-      perf.error_max = std::sqrt( Kokkos::Example::all_reduce_max( error_max , comm ) );  // NOTE(review): stores the square root of the max relative error - confirm the sqrt is intended
-
-      perf_stats = perf ;
-    }
-    else {
-      perf_stats.fill_node_set = std::min( perf_stats.fill_node_set , perf.fill_node_set );
-      perf_stats.scan_node_count = std::min( perf_stats.scan_node_count , perf.scan_node_count );
-      perf_stats.fill_graph_entries = std::min( perf_stats.fill_graph_entries , perf.fill_graph_entries );
-      perf_stats.sort_graph_entries = std::min( perf_stats.sort_graph_entries , perf.sort_graph_entries );
-      perf_stats.fill_element_graph = std::min( perf_stats.fill_element_graph , perf.fill_element_graph );
-      perf_stats.create_sparse_matrix = std::min( perf_stats.create_sparse_matrix , perf.create_sparse_matrix );
-      perf_stats.fill_time = std::min( perf_stats.fill_time , perf.fill_time );
-      perf_stats.bc_time = std::min( perf_stats.bc_time , perf.bc_time );
-      perf_stats.cg_time = std::min( perf_stats.cg_time , perf.cg_time );  // NOTE(review): matvec_time, map_ratio and all other fields keep their first-trial values - confirm intended
-    }
-  }
-
-  return perf_stats ;  // all-zero Perf when use_trials <= 0
-}
-
-} /* namespace FENL */
-} /* namespace Example */
-} /* namespace Kokkos */
-
-#endif /* #ifndef KOKKOS_EXAMPLE_FENL_IMPL_HPP */
-
--- a/lib/kokkos/example/fenl/main.cpp
+++ b/lib/kokkos/example/fenl/main.cpp
@ -1,422 +0,0 @@
-//@HEADER
-// ************************************************************************
-// 
-//                        Kokkos v. 2.0
-//              Copyright (2014) Sandia Corporation
-// 
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
-// the U.S. Government retains certain rights in this software.
-// 
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
-// ************************************************************************
-//@HEADER
-
-#include <math.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <strings.h>
-
-#include <utility>
-#include <string>
-#include <vector>
-#include <sstream>
-#include <iostream>
-#include <iomanip>
-
-#include <Kokkos_Core.hpp>
-
-#include <WrapMPI.hpp>
-#include <fenl.hpp>
-
-// For vtune
-#include <sys/types.h>
-#include <unistd.h>
-
-//----------------------------------------------------------------------------
-
-// Slots of the integer array that holds the parsed command line.
-// main() fills the array on rank 0; CMD_COUNT is the number of slots.
-enum {
-  CMD_USE_THREADS = 0 ,        // "threads <n>"
-  CMD_USE_NUMA ,               // "cores <a>x<b>" : first value
-  CMD_USE_CORE_PER_NUMA ,      // "cores <a>x<b>" : second value
-  CMD_USE_CUDA ,               // "cuda" or "cuda-dev <id>"
-  CMD_USE_OPENMP ,             // "openmp <n>"
-  CMD_USE_CUDA_DEV ,           // "cuda-dev <id>"
-  CMD_USE_FIXTURE_X ,          // "fixture <x>x<y>x<z>" : x
-  CMD_USE_FIXTURE_Y ,          // "fixture <x>x<y>x<z>" : y
-  CMD_USE_FIXTURE_Z ,          // "fixture <x>x<y>x<z>" : z
-  CMD_USE_FIXTURE_BEGIN ,      // "fixture-range <begin>..<end>" : begin
-  CMD_USE_FIXTURE_END ,        // "fixture-range <begin>..<end>" : end
-  CMD_USE_FIXTURE_QUADRATIC ,  // "fixture-quadratic"
-  CMD_USE_ATOMIC ,             // "atomic"
-  CMD_USE_TRIALS ,             // "trials <n>"
-  CMD_VTUNE ,                  // "vtune"
-  CMD_PRINT ,                  // "print"
-  CMD_ECHO ,                   // "echo"
-  CMD_ERROR ,                  // set when an argument is not recognized
-  CMD_COUNT
-};
-
-// Echo the parsed command line 'cmd' to stream 's' as a single line.
-// A setting is printed only when its controlling slot is non-zero;
-// the line is always terminated with std::endl.
-void print_cmdline( std::ostream & s , const int cmd[] )
-{
-  // Both host back ends report the same NUMA topology slots.
-  const int numa          = cmd[ CMD_USE_NUMA ] ;
-  const int core_per_numa = cmd[ CMD_USE_CORE_PER_NUMA ] ;
-
-  if ( 0 != cmd[ CMD_USE_THREADS ] ) {
-    s << " Threads(" << cmd[ CMD_USE_THREADS ] << ") NUMA(" << numa
-      << ") CORE_PER_NUMA(" << core_per_numa << ")" ;
-  }
-
-  if ( 0 != cmd[ CMD_USE_OPENMP ] ) {
-    s << " OpenMP(" << cmd[ CMD_USE_OPENMP ] << ") NUMA(" << numa
-      << ") CORE_PER_NUMA(" << core_per_numa << ")" ;
-  }
-
-  // Explicit X x Y x Z fixture.
-  if ( 0 != cmd[ CMD_USE_FIXTURE_X ] ) {
-    s << " Fixture(" ;
-    s << cmd[ CMD_USE_FIXTURE_X ] << "x" ;
-    s << cmd[ CMD_USE_FIXTURE_Y ] << "x" ;
-    s << cmd[ CMD_USE_FIXTURE_Z ] << ")" ;
-  }
-
-  // Range of fixture sizes.
-  if ( 0 != cmd[ CMD_USE_FIXTURE_BEGIN ] ) {
-    s << " Fixture( " ;
-    s << cmd[ CMD_USE_FIXTURE_BEGIN ] << " .. " ;
-    s << cmd[ CMD_USE_FIXTURE_END ] << " )" ;
-  }
-
-  if ( 0 != cmd[ CMD_USE_FIXTURE_QUADRATIC ] ) { s << " Quadratic-Element" ; }
-
-  if ( 0 != cmd[ CMD_USE_CUDA ] ) { s << " CUDA(" << cmd[ CMD_USE_CUDA_DEV ] << ")" ; }
-
-  if ( 0 != cmd[ CMD_USE_ATOMIC ] ) { s << " ATOMIC" ; }
-
-  if ( 0 != cmd[ CMD_USE_TRIALS ] ) { s << " TRIALS(" << cmd[ CMD_USE_TRIALS ] << ")" ; }
-
-  if ( 0 != cmd[ CMD_VTUNE ] ) { s << " VTUNE" ; }
-
-  if ( 0 != cmd[ CMD_PRINT ] ) { s << " PRINT" ; }
-
-  s << std::endl ;
-}
-
-// Write one comma-separated result row for 'perf' to stream 's'.
-// Column k is right-justified in a field of widths[k] characters; the
-// sixteen columns appear in the same order as the headers built in run().
-// Timing columns are scaled by 1000 and divided by the global node count
-// (the two solver columns additionally by the CG iteration count).
-void print_perf_value( std::ostream & s , const std::vector<size_t> & widths,  const Kokkos::Example::FENL::Perf & perf )
-{
-  const char * const sep = " ," ;
-
-  std::vector<size_t>::const_iterator w = widths.begin();
-
-  // Problem size and iteration counts.
-  s << std::setw( *w++ ) << perf.global_elem_count << sep ;
-  s << std::setw( *w++ ) << perf.global_node_count << sep ;
-  s << std::setw( *w++ ) << perf.newton_iter_count << sep ;
-  s << std::setw( *w++ ) << perf.cg_iter_count << sep ;
-  s << std::setw( *w++ ) << perf.map_ratio << sep ;
-
-  // Graph and matrix construction phases, per node.
-  s << std::setw( *w++ ) << ( perf.fill_node_set * 1000.0 ) / perf.global_node_count << sep ;
-  s << std::setw( *w++ ) << ( perf.scan_node_count * 1000.0 ) / perf.global_node_count << sep ;
-  s << std::setw( *w++ ) << ( perf.fill_graph_entries * 1000.0 ) / perf.global_node_count << sep ;
-  s << std::setw( *w++ ) << ( perf.sort_graph_entries * 1000.0 ) / perf.global_node_count << sep ;
-  s << std::setw( *w++ ) << ( perf.fill_element_graph * 1000.0 ) / perf.global_node_count << sep ;
-  s << std::setw( *w++ ) << ( perf.create_sparse_matrix * 1000.0 ) / perf.global_node_count << sep ;
-  s << std::setw( *w++ ) << ( perf.fill_time * 1000.0 ) / perf.global_node_count << sep ;
-  s << std::setw( *w++ ) << ( perf.bc_time * 1000.0 ) / perf.global_node_count << sep ;
-
-  // Solver phases, per CG iteration and per node.
-  s << std::setw( *w++ ) << ( ( perf.matvec_time * 1000.0 ) / perf.cg_iter_count ) / perf.global_node_count << sep ;
-  s << std::setw( *w++ ) << ( ( perf.cg_time * 1000.0 ) / perf.cg_iter_count ) / perf.global_node_count << sep ;
-
-  // Last column has no trailing separator.
-  s << std::setw( *w ) << perf.error_max ;
-  s << std::endl ;
-}
-
-// Solve one FENL problem on a box of nelem[0] x nelem[1] x nelem[2] elements
-// and return its performance record.  The quadratic or linear element
-// instantiation of fenl() is chosen at run time from
-// cmd[ CMD_USE_FIXTURE_QUADRATIC ]; the print, trials and atomic slots of
-// 'cmd' are forwarded unchanged.
-template< class Device >
-Kokkos::Example::FENL::Perf
-run_fenl_case( MPI_Comm comm , const int cmd[] , int nelem[] )
-{
-  if ( cmd[ CMD_USE_FIXTURE_QUADRATIC ] ) {
-    return Kokkos::Example::FENL::fenl< Device , Kokkos::Example::BoxElemPart::ElemQuadratic >
-             ( comm , cmd[CMD_PRINT], cmd[CMD_USE_TRIALS], cmd[CMD_USE_ATOMIC], nelem );
-  }
-
-  return Kokkos::Example::FENL::fenl< Device , Kokkos::Example::BoxElemPart::ElemLinear >
-           ( comm , cmd[CMD_PRINT], cmd[CMD_USE_TRIALS], cmd[CMD_USE_ATOMIC], nelem );
-}
-
-// Run the FENL example on 'Device' for the problem(s) selected by 'cmd' and
-// print a table of results on rank 0.
-//
-//   comm  communicator handed to fenl(); reset to 0 when MPI is not enabled.
-//   cmd   parsed command line, indexed by the CMD_* enumerators.
-//
-// When cmd[ CMD_USE_FIXTURE_BEGIN ] is non-zero a sequence of problems is run,
-// doubling the target size from BEGIN while it is below twice END; otherwise
-// the single X x Y x Z fixture is run.  The ElemOrder template argument is
-// not consulted: the element order comes from cmd[ CMD_USE_FIXTURE_QUADRATIC ].
-template< class Device , Kokkos::Example::BoxElemPart::ElemOrder ElemOrder >
-void run( MPI_Comm comm , const int cmd[] )
-{
-  int comm_rank = 0 ;
-
-#if defined( KOKKOS_HAVE_MPI )
-  MPI_Comm_rank( comm , & comm_rank );
-#else
-  comm = 0 ;
-#endif
-
-
-  // Banner: back end, element order and atomic mode (no newline yet).
-  if ( 0 == comm_rank ) {
-    if ( cmd[ CMD_USE_THREADS ] ) { std::cout << "THREADS , " << cmd[ CMD_USE_THREADS ] ; }
-    else if ( cmd[ CMD_USE_OPENMP ] ) { std::cout << "OPENMP , " << cmd[ CMD_USE_OPENMP ] ; }
-    else if ( cmd[ CMD_USE_CUDA ] ) { std::cout << "CUDA" ; }
-
-    if ( cmd[ CMD_USE_FIXTURE_QUADRATIC ] ) { std::cout << " , QUADRATIC-ELEMENT" ; }
-    else { std::cout << " , LINEAR-ELEMENT" ; }
-
-    if ( cmd[ CMD_USE_ATOMIC ] ) { std::cout << " , USING ATOMICS" ; }
-  }
-
-  // Column title and unit, in the order print_perf_value() emits values.
-  std::vector< std::pair<std::string,std::string> > headers;
-
-
-  headers.push_back(std::make_pair("ELEMS","count"));
-  headers.push_back(std::make_pair("NODES","count"));
-  headers.push_back(std::make_pair("NEWTON","iter"));
-  headers.push_back(std::make_pair("CG","iter"));
-  headers.push_back(std::make_pair("MAP_RATIO","ratio"));
-  headers.push_back(std::make_pair("SET_FILL/NODE","millisec"));
-  headers.push_back(std::make_pair("SCAN/NODE","millisec"));
-  headers.push_back(std::make_pair("GRAPH_FILL/NODE","millisec"));
-  headers.push_back(std::make_pair("SORT/NODE","millisec"));
-  headers.push_back(std::make_pair("ELEM_GRAPH_FILL/NODE","millisec"));
-  headers.push_back(std::make_pair("MATRIX_CREATE/NODE","millisec"));
-  headers.push_back(std::make_pair("MATRIX_FILL/NODE","millisec"));
-  headers.push_back(std::make_pair("BOUNDARY/NODE","millisec"));
-  headers.push_back(std::make_pair("MAT_VEC/ITER/ROW","millisec"));
-  headers.push_back(std::make_pair("CG/ITER/ROW","millisec"));
-  headers.push_back(std::make_pair("ERROR","ratio"));
-
-  // find print widths: at least min_width, and one wider than the title
-  size_t min_width = 10;
-  std::vector< size_t > widths(headers.size());
-  for (size_t i=0, ie=headers.size(); i<ie; ++i)
-    widths[i] = std::max(min_width, headers[i].first.size()+1);
-
-  // print column headers
-  if ( 0 == comm_rank ) {
-    std::cout << std::endl ;
-    for (size_t i=0; i<headers.size(); ++i)
-      std::cout << std::setw(widths[i]) << headers[i].first << " ,";
-    // Two backspaces followed by two blanks overwrite the trailing " ,".
-    std::cout << "\b\b  " << std::endl;
-    for (size_t i=0; i<headers.size(); ++i)
-      std::cout << std::setw(widths[i]) << headers[i].second << " ,";
-    std::cout << "\b\b  " << std::endl;
-
-    std::cout << std::scientific;
-    std::cout.precision(3);
-  }
-
-  if ( cmd[ CMD_USE_FIXTURE_BEGIN ] ) {
-    // The '0 < i' guard stops a negative range start from looping forever:
-    // doubling a negative value never reaches the upper bound.
-    for ( int i = cmd[CMD_USE_FIXTURE_BEGIN] ; 0 < i && i < cmd[CMD_USE_FIXTURE_END] * 2 ; i *= 2 ) {
-      // n x (n+1) x 2n elements with n = cbrt(i/2), i.e. approximately i elements.
-      int nelem[3] ;
-      nelem[0] = std::max( 1 , (int) cbrt( ((double) i) / 2.0 ) );
-      nelem[1] = 1 + nelem[0] ;
-      nelem[2] = 2 * nelem[0] ;
-
-      const Kokkos::Example::FENL::Perf perf = run_fenl_case< Device >( comm , cmd , nelem );
-
-      if ( 0 == comm_rank ) print_perf_value( std::cout , widths, perf );
-    }
-  }
-  else {
-    int nelem[3] = { cmd[ CMD_USE_FIXTURE_X ] ,
-                     cmd[ CMD_USE_FIXTURE_Y ] ,
-                     cmd[ CMD_USE_FIXTURE_Z ] };
-
-    const Kokkos::Example::FENL::Perf perf = run_fenl_case< Device >( comm , cmd , nelem );
-
-    if ( 0 == comm_rank ) print_perf_value( std::cout , widths, perf );
-  }
-}
-
-//----------------------------------------------------------------------------
-
-// Fetch the value that must follow the option argv[i].
-// On success 'i' is advanced onto the value and the value is returned.
-// When the option is the last argument, cmd[ CMD_ERROR ] is set, a
-// diagnostic is written to std::cerr and a null pointer is returned, so
-// that the caller never reads argv[argc] (which is a null pointer).
-static const char * next_option_value( int argc , char ** argv , int & i , int cmd[] )
-{
-  if ( i + 1 < argc ) { return argv[ ++i ] ; }
-
-  cmd[ CMD_ERROR ] = 1 ;
-
-  std::cerr << "Missing value for command line argument #" << i << ": " << argv[i] << std::endl ;
-
-  return 0 ;
-}
-
-// Driver of the FENL example.
-//
-// Rank 0 parses the command line into an int array indexed by the CMD_*
-// enumerators; with MPI enabled the array is broadcast to all ranks.  Unless
-// an error was flagged or "echo" was requested, each enabled back end
-// (Threads, OpenMP, Cuda) whose option was given is initialized, run and
-// finalized in turn.
-//
-// Returns -1 when a command line error was flagged, otherwise 0.
-int main( int argc , char ** argv )
-{
-  int comm_rank = 0 ;
-
-#if defined( KOKKOS_HAVE_MPI )
-  MPI_Init( & argc , & argv );
-  MPI_Comm comm = MPI_COMM_WORLD ;
-  MPI_Comm_rank( comm , & comm_rank );
-#else
-  MPI_Comm comm = 0 ;
-  (void) comm ; // suppress warning
-#endif
-
-  int cmdline[ CMD_COUNT ] ;
-
-  for ( int i = 0 ; i < CMD_COUNT ; ++i ) cmdline[i] = 0 ;
-
-  if ( 0 == comm_rank ) {
-    for ( int i = 1 ; i < argc ; ++i ) {
-      if ( 0 == strcasecmp( argv[i] , "threads" ) ) {
-        const char * const value = next_option_value( argc , argv , i , cmdline );
-        if ( value ) { cmdline[ CMD_USE_THREADS ] = atoi( value ); }
-      }
-      else if ( 0 == strcasecmp( argv[i] , "openmp" ) ) {
-        const char * const value = next_option_value( argc , argv , i , cmdline );
-        if ( value ) { cmdline[ CMD_USE_OPENMP ] = atoi( value ); }
-      }
-      else if ( 0 == strcasecmp( argv[i] , "cores" ) ) {
-        const char * const value = next_option_value( argc , argv , i , cmdline );
-        if ( value ) {
-          sscanf( value , "%dx%d" ,
-                  cmdline + CMD_USE_NUMA ,
-                  cmdline + CMD_USE_CORE_PER_NUMA );
-        }
-      }
-      else if ( 0 == strcasecmp( argv[i] , "cuda" ) ) {
-        cmdline[ CMD_USE_CUDA ] = 1 ;
-      }
-      else if ( 0 == strcasecmp( argv[i] , "cuda-dev" ) ) {
-        cmdline[ CMD_USE_CUDA ] = 1 ;
-        const char * const value = next_option_value( argc , argv , i , cmdline );
-        if ( value ) { cmdline[ CMD_USE_CUDA_DEV ] = atoi( value ) ; }
-      }
-      else if ( 0 == strcasecmp( argv[i] , "fixture" ) ) {
-        const char * const value = next_option_value( argc , argv , i , cmdline );
-        if ( value ) {
-          sscanf( value , "%dx%dx%d" ,
-                  cmdline + CMD_USE_FIXTURE_X ,
-                  cmdline + CMD_USE_FIXTURE_Y ,
-                  cmdline + CMD_USE_FIXTURE_Z );
-        }
-      }
-      else if ( 0 == strcasecmp( argv[i] , "fixture-range" ) ) {
-        const char * const value = next_option_value( argc , argv , i , cmdline );
-        if ( value ) {
-          sscanf( value , "%d..%d" ,
-                  cmdline + CMD_USE_FIXTURE_BEGIN ,
-                  cmdline + CMD_USE_FIXTURE_END );
-        }
-      }
-      else if ( 0 == strcasecmp( argv[i] , "fixture-quadratic" ) ) {
-        cmdline[ CMD_USE_FIXTURE_QUADRATIC ] = 1 ;
-      }
-      else if ( 0 == strcasecmp( argv[i] , "atomic" ) ) {
-        cmdline[ CMD_USE_ATOMIC ] = 1 ;
-      }
-      else if ( 0 == strcasecmp( argv[i] , "trials" ) ) {
-        const char * const value = next_option_value( argc , argv , i , cmdline );
-        if ( value ) { cmdline[ CMD_USE_TRIALS ] = atoi( value ) ; }
-      }
-      else if ( 0 == strcasecmp( argv[i] , "vtune" ) ) {
-        cmdline[ CMD_VTUNE ] = 1 ;
-      }
-      else if ( 0 == strcasecmp( argv[i] , "print" ) ) {
-        cmdline[ CMD_PRINT ] = 1 ;
-      }
-      else if ( 0 == strcasecmp( argv[i] , "echo" ) ) {
-        cmdline[ CMD_ECHO ] = 1 ;
-      }
-      else {
-        cmdline[ CMD_ERROR ] = 1 ;
-
-        std::cerr << "Unrecognized command line argument #" << i << ": " << argv[i] << std::endl ;
-      }
-    }
-
-    if ( cmdline[ CMD_ECHO ] && 0 == comm_rank ) { print_cmdline( std::cout , cmdline ); }
-  }
-
-#if defined( KOKKOS_HAVE_MPI )
-  // Only rank 0 parsed the arguments; share the result with every rank.
-  MPI_Bcast( cmdline , CMD_COUNT , MPI_INT , 0 , comm );
-#endif
-
-  if ( cmdline[ CMD_VTUNE ] ) {
-    // Launch the VTune collector in the background, attached to this
-    // process, then wait ten seconds before continuing.
-    std::stringstream cmd;
-    pid_t my_os_pid=getpid();
-    const std::string vtune_loc =
-      "/usr/local/intel/vtune_amplifier_xe_2013/bin64/amplxe-cl";
-    const std::string output_dir = "./vtune/vtune.";
-    const int p_rank = comm_rank;
-    cmd << vtune_loc
-        << " -collect hotspots -result-dir " << output_dir << p_rank
-        << " -target-pid " << my_os_pid << " &";
-    if (p_rank == 0)
-      std::cout << cmd.str() << std::endl;
-    system(cmd.str().c_str());
-    system("sleep 10");
-  }
-
-  if ( ! cmdline[ CMD_ERROR ] && ! cmdline[ CMD_ECHO ] ) {
-
-    // Defaults: one trial, and a 2x2x2 fixture when no size was requested.
-    if ( ! cmdline[ CMD_USE_TRIALS ] ) { cmdline[ CMD_USE_TRIALS ] = 1 ; }
-
-    if ( ! cmdline[ CMD_USE_FIXTURE_X ] && ! cmdline[ CMD_USE_FIXTURE_BEGIN ] ) {
-      cmdline[ CMD_USE_FIXTURE_X ] = 2 ;
-      cmdline[ CMD_USE_FIXTURE_Y ] = 2 ;
-      cmdline[ CMD_USE_FIXTURE_Z ] = 2 ;
-    }
-
-#if defined( KOKKOS_HAVE_PTHREAD )
-
-    if ( cmdline[ CMD_USE_THREADS ] ) {
-
-      if ( cmdline[ CMD_USE_NUMA ] && cmdline[ CMD_USE_CORE_PER_NUMA ] ) {
-        Kokkos::Threads::initialize( cmdline[ CMD_USE_THREADS ] ,
-                                     cmdline[ CMD_USE_NUMA ] ,
-                                     cmdline[ CMD_USE_CORE_PER_NUMA ] );
-      }
-      else {
-        Kokkos::Threads::initialize( cmdline[ CMD_USE_THREADS ] );
-      }
-
-      run< Kokkos::Threads , Kokkos::Example::BoxElemPart::ElemLinear >( comm , cmdline );
-
-      Kokkos::Threads::finalize();
-    }
-
-#endif
-
-#if defined( KOKKOS_HAVE_OPENMP )
-
-    if ( cmdline[ CMD_USE_OPENMP ] ) {
-
-      if ( cmdline[ CMD_USE_NUMA ] && cmdline[ CMD_USE_CORE_PER_NUMA ] ) {
-        Kokkos::OpenMP::initialize( cmdline[ CMD_USE_OPENMP ] ,
-                                     cmdline[ CMD_USE_NUMA ] ,
-                                     cmdline[ CMD_USE_CORE_PER_NUMA ] );
-      }
-      else {
-        Kokkos::OpenMP::initialize( cmdline[ CMD_USE_OPENMP ] );
-      }
-
-      run< Kokkos::OpenMP , Kokkos::Example::BoxElemPart::ElemLinear >( comm , cmdline );
-
-      Kokkos::OpenMP::finalize();
-    }
-
-#endif
-
-#if defined( KOKKOS_HAVE_CUDA )
-    if ( cmdline[ CMD_USE_CUDA ] ) {
-      // Use the device given by "cuda-dev"; the slot is 0 when not given.
-
-      Kokkos::HostSpace::execution_space::initialize();
-      Kokkos::Cuda::initialize( Kokkos::Cuda::SelectDevice( cmdline[ CMD_USE_CUDA_DEV ] ) );
-
-      run< Kokkos::Cuda , Kokkos::Example::BoxElemPart::ElemLinear >( comm , cmdline );
-
-      Kokkos::Cuda::finalize();
-      Kokkos::HostSpace::execution_space::finalize();
-    }
-
-#endif
-
-  }
-
-#if defined( KOKKOS_HAVE_MPI )
-  MPI_Finalize();
-#endif
-
-  return cmdline[ CMD_ERROR ] ? -1 : 0 ;
-}
-
--- a/lib/kokkos/example/fixture/BoxElemFixture.hpp
+++ b/lib/kokkos/example/fixture/BoxElemFixture.hpp
@ -1,355 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-// 
-//                        Kokkos v. 2.0
-//              Copyright (2014) Sandia Corporation
-// 
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
-// the U.S. Government retains certain rights in this software.
-// 
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
-// ************************************************************************
-//@HEADER
-*/
-
-#ifndef KOKKOS_EXAMPLE_BOXELEMFIXTURE_HPP
-#define KOKKOS_EXAMPLE_BOXELEMFIXTURE_HPP
-
-#include <stdio.h>
-#include <utility>
-
-#include <Kokkos_Core.hpp>
-
-#include <HexElement.hpp>
-#include <BoxElemPart.hpp>
-
-//----------------------------------------------------------------------------
-
-namespace Kokkos {
-namespace Example {
-
-/** \brief  Map a grid onto a unit cube with smooth nonlinear grading
- *          of the map.
- *
- *  On each axis a grid index g is first scaled to t = g / grid_max and
- *  then mapped to  t + bubble * t^2 * (t-1)^2 .  The grading term vanishes
- *  at t = 0 and t = 1, so the end points of the axis are left in place.
- *
- *  The data members are not 'const' so that the map is copy-assignable;
- *  BoxElemFixture::operator= assigns its coordinate map member.
- */
-struct MapGridUnitCube {
-
-  float m_a ;       // grading ("bubble") coefficient of the x axis
-  float m_b ;       // grading ("bubble") coefficient of the y axis
-  float m_c ;       // grading ("bubble") coefficient of the z axis
-  size_t m_max_x ;  // grid index that maps to x = 1
-  size_t m_max_y ;  // grid index that maps to y = 1
-  size_t m_max_z ;  // grid index that maps to z = 1
-
-  /** \brief  Store the largest grid index and the grading coefficient
-   *          of each axis.
-   */
-  MapGridUnitCube( const size_t grid_max_x ,
-                   const size_t grid_max_y ,
-                   const size_t grid_max_z ,
-                   const float bubble_x ,
-                   const float bubble_y ,
-                   const float bubble_z )
-    : m_a( bubble_x )
-    , m_b( bubble_y )
-    , m_c( bubble_z )
-    , m_max_x( grid_max_x )
-    , m_max_y( grid_max_y )
-    , m_max_z( grid_max_z )
-    {}
-
-  /** \brief  Convert grid indices (grid_x,grid_y,grid_z) into coordinates,
-   *          written to coord_x, coord_y and coord_z.
-   *
-   *  The arithmetic is carried out in double precision and converted to
-   *  Scalar on assignment.
-   */
-  template< typename Scalar >
-  KOKKOS_INLINE_FUNCTION
-  void operator()( int grid_x ,
-                   int grid_y ,
-                   int grid_z ,
-                   Scalar & coord_x ,
-                   Scalar & coord_y ,
-                   Scalar & coord_z ) const
-    {
-      // Map to a unit cube [0,1]^3
-
-      const double x = double(grid_x) / double(m_max_x);
-      const double y = double(grid_y) / double(m_max_y);
-      const double z = double(grid_z) / double(m_max_z);
-
-      // Add the grading term, which is zero at both ends of each axis.
-      coord_x = x + x * x * ( x - 1 ) * ( x - 1 ) * m_a ;
-      coord_y = y + y * y * ( y - 1 ) * ( y - 1 ) * m_b ;
-      coord_z = z + z * z * ( z - 1 ) * ( z - 1 ) * m_c ;
-    }
-};
-
-} // namespace Example
-} // namespace Kokkos
-
-//----------------------------------------------------------------------------
-
-namespace Kokkos {
-namespace Example {
-
-/** \brief  Generate a distributed unstructured finite element mesh
- *          from a partitioned NX*NY*NZ box of elements.
- *
- *  Order owned nodes first followed by off-process nodes
- *  grouped by owning process.
- *
- *  Template parameters:
- *    Device         template argument of every Kokkos::View member; also
- *                   exported as execution_space.
- *    Order          element order; fixes ElemNode (8 for ElemLinear,
- *                   27 for ElemQuadratic, 0 for anything else).
- *    CoordinateMap  functor called as map(gx,gy,gz,x,y,z) to turn the grid
- *                   indices of a node into its three coordinates.
- *
- *  The partitioning is delegated to the BoxElemPart member.  The
- *  constructor sizes the views from the partition's counts and then hands
- *  *this to Kokkos::parallel_for; operator()(size_t) is the body that
- *  fills every view.  The fixture is thus its own initialization functor,
- *  which is presumably why the copy constructor is marked
- *  KOKKOS_INLINE_FUNCTION.
- */
-template< class Device ,
-          BoxElemPart::ElemOrder Order ,
-          class CoordinateMap = MapGridUnitCube >
-class BoxElemFixture {
-public:
-
-  typedef Device execution_space ;
-
-  enum { SpaceDim = 3 }; // number of grid axes / coordinates per node
-  enum { ElemNode = Order == BoxElemPart::ElemLinear ? 8 : // nodes per element
-                    Order == BoxElemPart::ElemQuadratic ? 27 : 0 };
-
-private:
-
-  typedef Kokkos::Example::HexElement_TensorData< ElemNode > hex_data ; // source of eval_map, read in the constructor
-
-  Kokkos::Example::BoxElemPart m_box_part ; // partition of the box; answers every count and id query
-  CoordinateMap                m_coord_map ; // grid indices -> coordinates
-
-  Kokkos::View< double *[SpaceDim] , Device > m_node_coord ; // (node,axis) coordinate written by m_coord_map
-  Kokkos::View< size_t *[SpaceDim] , Device > m_node_grid ; // (node,axis) grid index of the node
-  Kokkos::View< size_t *[ElemNode] , Device > m_elem_node ; // (elem,k) local node id of the element's k-th node
-  Kokkos::View< size_t *[2] ,        Device > m_recv_node ; // (msg,0) rank and (msg,1) node count, receive side
-  Kokkos::View< size_t *[2] ,        Device > m_send_node ; // (msg,0) rank and (msg,1) node count, send side
-  Kokkos::View< size_t * ,           Device > m_send_node_id ; // node ids from m_box_part.send_node_id(i)
-
-  unsigned char m_elem_node_local[ ElemNode ][4] ; // per element-local node: grid offsets on axes 0..2; entry [3] is always set to 0
-
-public:
-
-  // Read-only view types returned by the accessors below.
-  typedef Kokkos::View< const size_t  * [ElemNode], Device > elem_node_type ;
-  typedef Kokkos::View< const double  * [SpaceDim], Device > node_coord_type ;
-  typedef Kokkos::View< const size_t  * [SpaceDim], Device > node_grid_type ;
-  typedef Kokkos::View< const size_t  * [2] , Device > comm_list_type ;
-  typedef Kokkos::View< const size_t  *     , Device > send_nodeid_type ;
-
-  inline bool ok() const { return m_box_part.ok(); } // forwards the partition's own validity check
-
-  KOKKOS_INLINE_FUNCTION
-  size_t node_count() const { return m_node_grid.dimension_0(); } // extent of the node arrays (sized by uses_node_count())
-
-  KOKKOS_INLINE_FUNCTION
-  size_t node_count_owned() const { return m_box_part.owns_node_count(); }
-
-  KOKKOS_INLINE_FUNCTION
-  size_t node_count_global() const { return m_box_part.global_node_count(); }
-
-  KOKKOS_INLINE_FUNCTION
-  size_t elem_count() const { return m_elem_node.dimension_0(); } // extent of the element array (sized by uses_elem_count())
-
-  KOKKOS_INLINE_FUNCTION
-  size_t elem_count_global() const { return m_box_part.global_elem_count(); }
-
-  KOKKOS_INLINE_FUNCTION
-  size_t elem_node_local( size_t inode , int k ) const // grid offset of element-local node 'inode' on axis 'k'
-    { return m_elem_node_local[inode][k] ; }
-
-  KOKKOS_INLINE_FUNCTION
-  size_t node_grid( size_t inode , int iaxis ) const // grid index of node 'inode' on axis 'iaxis'
-    { return m_node_grid(inode,iaxis); }
-
-  KOKKOS_INLINE_FUNCTION
-  size_t node_global_index( size_t local ) const // global node id, looked up from the node's grid indices
-    {
-      const size_t tmp_node_grid[SpaceDim] =
-        { m_node_grid(local,0) , m_node_grid(local,1) , m_node_grid(local,2) };
-      return m_box_part.global_node_id( tmp_node_grid );
-    }
-
-  KOKKOS_INLINE_FUNCTION
-  double node_coord( size_t inode , int iaxis ) const // coordinate of node 'inode' on axis 'iaxis'
-    { return m_node_coord(inode,iaxis); }
-
-  KOKKOS_INLINE_FUNCTION
-  size_t node_grid_max( int iaxis ) const // largest grid index on axis 'iaxis', from the partition
-    { return m_box_part.global_coord_max(iaxis); }
-
-  KOKKOS_INLINE_FUNCTION
-  size_t elem_node( size_t ielem , size_t inode ) const // local node id of the 'inode'-th node of element 'ielem'
-    { return m_elem_node(ielem,inode); }
-
-  // Whole-array accessors: each returns the member view as its const-valued type.
-  elem_node_type   elem_node()   const { return m_elem_node ; }
-  node_coord_type  node_coord()  const { return m_node_coord ; }
-  node_grid_type   node_grid()   const { return m_node_grid ; }
-  comm_list_type   recv_node()   const { return m_recv_node ; }
-  comm_list_type   send_node()   const { return m_send_node ; }
-  send_nodeid_type send_nodeid() const { return m_send_node_id ; }
-
-  KOKKOS_INLINE_FUNCTION
-  BoxElemFixture( const BoxElemFixture & rhs ) // member-wise copy; the local-node table is copied entry by entry
-    : m_box_part(   rhs.m_box_part )
-    , m_coord_map(  rhs.m_coord_map )
-    , m_node_coord( rhs.m_node_coord )
-    , m_node_grid(  rhs.m_node_grid )
-    , m_elem_node(  rhs.m_elem_node )
-    , m_recv_node(  rhs.m_recv_node )
-    , m_send_node(  rhs.m_send_node )
-    , m_send_node_id( rhs.m_send_node_id )
-    {
-      for ( int i = 0 ; i < ElemNode ; ++i ) {
-        m_elem_node_local[i][0] = rhs.m_elem_node_local[i][0] ;
-        m_elem_node_local[i][1] = rhs.m_elem_node_local[i][1] ;
-        m_elem_node_local[i][2] = rhs.m_elem_node_local[i][2] ;
-        m_elem_node_local[i][3] = 0 ; // fourth entry is reset, not copied
-      }
-    }
-
-  BoxElemFixture & operator = ( const BoxElemFixture & rhs ) // member-wise assignment; requires CoordinateMap to be copy-assignable
-    {
-      m_box_part      = rhs.m_box_part ;
-      m_coord_map     = rhs.m_coord_map ;
-      m_node_coord    = rhs.m_node_coord ;
-      m_node_grid     = rhs.m_node_grid ;
-      m_elem_node     = rhs.m_elem_node ;
-      m_recv_node     = rhs.m_recv_node ;
-      m_send_node     = rhs.m_send_node ;
-      m_send_node_id  = rhs.m_send_node_id ;
-     
-      for ( int i = 0 ; i < ElemNode ; ++i ) {
-        m_elem_node_local[i][0] = rhs.m_elem_node_local[i][0] ;
-        m_elem_node_local[i][1] = rhs.m_elem_node_local[i][1] ;
-        m_elem_node_local[i][2] = rhs.m_elem_node_local[i][2] ;
-        m_elem_node_local[i][3] = 0 ; // fourth entry is reset, not copied
-      }
-      return *this ;
-    }
-
-  BoxElemFixture( const BoxElemPart::Decompose decompose , // forwarded to BoxElemPart
-                  const size_t global_size , // forwarded to BoxElemPart; presumably the number of processes - confirm
-                  const size_t global_rank , // forwarded to BoxElemPart; presumably this process's rank - confirm
-                  const size_t elem_nx , // element counts of the box, forwarded to BoxElemPart
-                  const size_t elem_ny ,
-                  const size_t elem_nz ,
-                  const float bubble_x = 1.1f , // bubble_* are forwarded to the CoordinateMap constructor
-                  const float bubble_y = 1.2f ,
-                  const float bubble_z = 1.3f )
-  : m_box_part( Order , decompose , global_size , global_rank , elem_nx , elem_ny , elem_nz )
-  , m_coord_map( m_box_part.global_coord_max(0) , // relies on m_box_part being declared, and so initialized, first
-                 m_box_part.global_coord_max(1) ,
-                 m_box_part.global_coord_max(2) ,
-                 bubble_x ,
-                 bubble_y ,
-                 bubble_z )
-  , m_node_coord( "fixture_node_coord" , m_box_part.uses_node_count() )
-  , m_node_grid(  "fixture_node_grid" , m_box_part.uses_node_count() )
-  , m_elem_node(  "fixture_elem_node" , m_box_part.uses_elem_count() )
-  , m_recv_node(  "fixture_recv_node" , m_box_part.recv_node_msg_count() )
-  , m_send_node(  "fixture_send_node" , m_box_part.send_node_msg_count() )
-  , m_send_node_id( "fixture_send_node_id" , m_box_part.send_node_id_count() )
-  {
-    { // copy the element-local node offsets out of the hex element data
-      const hex_data elem_data ;
-
-      for ( int i = 0 ; i < ElemNode ; ++i ) {
-        m_elem_node_local[i][0] = elem_data.eval_map[i][0] ;
-        m_elem_node_local[i][1] = elem_data.eval_map[i][1] ;
-        m_elem_node_local[i][2] = elem_data.eval_map[i][2] ;
-        m_elem_node_local[i][3] = 0 ;
-      }
-    }
-
-    const size_t nwork = 
-      std::max( m_recv_node.dimension_0() , // largest extent of any array, so one index range covers them all
-      std::max( m_send_node.dimension_0() ,
-      std::max( m_send_node_id.dimension_0() ,
-      std::max( m_node_grid.dimension_0() ,
-                m_elem_node.dimension_0() * m_elem_node.dimension_1() ))));
-
-    Kokkos::parallel_for( nwork , *this ); // runs operator()(i) below for i in [0,nwork)
-  }
-
-
-  // Initialization: body of the parallel_for issued by the constructor.  Index i performs every fill whose array range contains i.
-
-  KOKKOS_INLINE_FUNCTION
-  void operator()( size_t i ) const
-  {
-    if ( i < m_elem_node.dimension_0() * m_elem_node.dimension_1() ) { // element -> node connectivity
-
-      const size_t ielem = i / ElemNode ; // i enumerates (element, element-local node) pairs
-      const size_t inode = i % ElemNode ;
-
-      size_t elem_grid[SpaceDim] ;
-      size_t tmp_node_grid[SpaceDim] ;
-
-      m_box_part.uses_elem_coord( ielem , elem_grid );
-
-      enum { elem_node_scale = Order == BoxElemPart::ElemLinear ? 1 : // node-grid steps per element along an axis
-                               Order == BoxElemPart::ElemQuadratic ? 2 : 0 };
-
-      tmp_node_grid[0] = elem_node_scale * elem_grid[0] + m_elem_node_local[inode][0] ;
-      tmp_node_grid[1] = elem_node_scale * elem_grid[1] + m_elem_node_local[inode][1] ;
-      tmp_node_grid[2] = elem_node_scale * elem_grid[2] + m_elem_node_local[inode][2] ;
-
-      m_elem_node(ielem,inode) = m_box_part.local_node_id( tmp_node_grid );
-    }
-
-    if ( i < m_node_grid.dimension_0() ) { // node grid indices and coordinates
-      size_t tmp_node_grid[SpaceDim] ;
-      m_box_part.local_node_coord( i , tmp_node_grid );
-      m_node_grid(i,0) = tmp_node_grid[0] ;
-      m_node_grid(i,1) = tmp_node_grid[1] ;
-      m_node_grid(i,2) = tmp_node_grid[2] ;
-
-      m_coord_map( tmp_node_grid[0] , // writes the three coordinates of node i
-                   tmp_node_grid[1] ,
-                   tmp_node_grid[2] ,
-                   m_node_coord(i,0) ,
-                   m_node_coord(i,1) ,
-                   m_node_coord(i,2) );
-    }
-
-    if ( i < m_recv_node.dimension_0() ) { // receive list: rank and node count of message i
-      m_recv_node(i,0) = m_box_part.recv_node_rank(i);
-      m_recv_node(i,1) = m_box_part.recv_node_count(i);
-    }
-
-    if ( i < m_send_node.dimension_0() ) { // send list: rank and node count of message i
-      m_send_node(i,0) = m_box_part.send_node_rank(i);
-      m_send_node(i,1) = m_box_part.send_node_count(i);
-    }
-
-    if ( i < m_send_node_id.dimension_0() ) { // ids of the nodes to send
-      m_send_node_id(i) = m_box_part.send_node_id(i);
-    }
-  }
-};
-
-} // namespace Example
-} // namespace Kokkos
-
-//----------------------------------------------------------------------------
-
-#endif /* #ifndef KOKKOS_EXAMPLE_BOXELEMFIXTURE_HPP */
-
--- a/lib/kokkos/example/fixture/BoxElemPart.cpp
+++ b/lib/kokkos/example/fixture/BoxElemPart.cpp
@ -1,413 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-// 
-//                        Kokkos v. 2.0
-//              Copyright (2014) Sandia Corporation
-// 
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
-// the U.S. Government retains certain rights in this software.
-// 
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
-// ************************************************************************
-//@HEADER
-*/
-
-#include <utility>
-#include <iostream>
-#include <sstream>
-#include <stdexcept>
-#include <limits>
-#include <BoxElemPart.hpp>
-
-//----------------------------------------------------------------------------
-
-namespace Kokkos {
-namespace Example {
-
-/** \brief Recursively split 'global_box' among 'global_size' ranks and
- *         return in 'box' the half-open sub-box belonging to 'global_rank'.
- *
- *  Each pass divides the current group of ranks into 5, 3 or 2 parts
- *  (first divisor that applies, 2 as the fallback), keeps the part that
- *  contains 'global_rank', and cuts the current box along its longest axis
- *  in proportion to the ranks kept.  The loop ends when a single rank is left.
- */
-void box_partition( const size_t global_size ,
-                    const size_t global_rank ,
-                    const size_t global_box[][2] ,
-                          size_t box[][2] )
-{
-  // Begin with the complete global box.
-  for ( int d = 0 ; d < 3 ; ++d ) {
-    box[d][0] = global_box[d][0] ;
-    box[d][1] = global_box[d][1] ;
-  }
-
-  size_t first_rank = 0 ;           // first rank of the current group
-  size_t group_size = global_size ; // number of ranks in the current group
-
-  while ( 1 < group_size ) {
-
-    // Number of parts for this pass.
-    size_t split = 2 ;
-    if      ( 0 == ( group_size % 5 ) ) { split = 5 ; }
-    else if ( 0 == ( group_size % 3 ) ) { split = 3 ; }
-
-    const size_t chunk = group_size / split ;
-
-    // Sub-range [ sub_begin , sub_end ) of the group, relative to first_rank,
-    // that holds global_rank.  The default is the trailing part of a
-    // two-way split, which also absorbs the remainder of an odd group.
-    size_t sub_begin = chunk ;
-    size_t sub_end   = group_size ;
-
-    if ( 2 < split || global_rank < first_rank + chunk ) {
-      sub_begin = chunk * size_t( double( global_rank - first_rank ) / double(chunk) );
-      sub_end   = sub_begin + chunk ;
-    }
-
-    // Pick the axis with the largest extent.
-    const size_t extent[3] = {
-      box[0][1] - box[0][0] ,
-      box[1][1] - box[1][0] ,
-      box[2][1] - box[2][0] };
-
-    int axis = 0 ;
-    if ( extent[2] > extent[1] ) {
-      if ( extent[2] > extent[0] ) { axis = 2 ; }
-    }
-    else if ( extent[1] > extent[0] ) {
-      axis = 1 ;
-    }
-
-    // Both new bounds are offsets from the old lower bound.
-    const size_t lower = box[ axis ][0] ;
-
-    box[ axis ][1] = lower + size_t( double(extent[axis]) * ( double(sub_end) / double(group_size) ));
-    box[ axis ][0] = lower + size_t( double(extent[axis]) * ( double(sub_begin) / double(group_size) ));
-
-    group_size = sub_end - sub_begin ;
-    first_rank = first_rank + sub_begin ;
-  }
-}
-
-} /* namespace Example */
-} /* namespace Kokkos */
-
-//----------------------------------------------------------------------------
-
-namespace Kokkos {
-namespace Example {
-
-/** \brief Compute, for process 'rank', the half-open boxes of the elements it
- *         uses, the nodes it owns and the nodes it uses.
- *
- *  With element decomposition the element box is partitioned and the owned
- *  nodes follow from it; otherwise the vertex box is partitioned and the
- *  used elements follow from the owned vertices.  For quadratic elements
- *  the node boxes are rescaled at the end.
- */
-void BoxElemPart::local( const size_t  rank ,
-                               size_t  uses_elem[][2] ,
-                               size_t  owns_node[][2] ,
-                               size_t  uses_node[][2] ) const
-{
-  if ( BoxElemPart::DecomposeElem != m_decompose ) {
-
-    // Node decomposition: partition the vertices, one more than the
-    // elements along every axis.
-    const size_t global_vert[3][2] =
-      { { 0 , m_global_elem_box[0][1] + 1 },
-        { 0 , m_global_elem_box[1][1] + 1 },
-        { 0 , m_global_elem_box[2][1] + 1 } };
-
-    Kokkos::Example::box_partition( m_global_size , rank , global_vert , owns_node );
-
-    for ( int d = 0 ; d < 3 ; ++d ) {
-      // Reach one element below the owned vertices unless already at the
-      // global lower bound.
-      if ( global_vert[d][0] == owns_node[d][0] ) {
-        uses_elem[d][0] = owns_node[d][0] ;
-      }
-      else {
-        uses_elem[d][0] = owns_node[d][0] - 1 ;
-      }
-
-      // At the global upper bound there is one element fewer than vertices.
-      if ( global_vert[d][1] == owns_node[d][1] ) {
-        uses_elem[d][1] = owns_node[d][1] - 1 ;
-      }
-      else {
-        uses_elem[d][1] = owns_node[d][1] ;
-      }
-    }
-  }
-  else {
-
-    // Element decomposition: partition the elements directly.
-    Kokkos::Example::box_partition( m_global_size , rank , m_global_elem_box , uses_elem );
-
-    for ( int d = 0 ; d < 3 ; ++d ) {
-      owns_node[d][0] = uses_elem[d][0] ;
-      owns_node[d][1] = uses_elem[d][1] ;
-      // The process touching the global upper bound also owns the last node layer.
-      if ( m_global_elem_box[d][1] == uses_elem[d][1] ) { owns_node[d][1] += 1 ; }
-    }
-  }
-
-  const bool quadratic = ( BoxElemPart::ElemQuadratic == m_elem_order );
-
-  for ( int d = 0 ; d < 3 ; ++d ) {
-    // Used nodes: one more than the used elements along each axis.
-    uses_node[d][0] = uses_elem[d][0] ;
-    uses_node[d][1] = uses_elem[d][1] + 1 ;
-
-    if ( quadratic ) {
-      // Rescale linear node bounds [b,e) to quadratic bounds [2b,2e-1).
-      // NOTE(review): with element decomposition an interior owned upper
-      // bound u becomes 2*u-1 while the neighbour's lower bound becomes 2*u,
-      // which appears to leave node 2*u-1 unowned -- confirm intended.
-      owns_node[d][0] = 2 * owns_node[d][0] ;
-      uses_node[d][0] = 2 * uses_node[d][0] ;
-      owns_node[d][1] = 2 * owns_node[d][1] - 1 ;
-      uses_node[d][1] = 2 * uses_node[d][1] - 1 ;
-    }
-  }
-}
-
-/** \brief Build the partition data of process 'global_rank' out of
- *         'global_size' for a box of elem_nx * elem_ny * elem_nz elements.
- *
- *  Steps:
- *    1. Record the arguments and set the global element and node boxes.
- *    2. Compute this process' element / owned-node / used-node boxes.
- *    3. Visit every other rank once to collect the node blocks this process
- *       uses but the other rank owns (the 'owns' list beyond entry 0, i.e.
- *       receives) and the owned blocks the other rank uses (sends).
- *    4. Terminate both lists with sentinel entries.
- *    5. Cross-check node / element totals; on any error print a message,
- *       clear m_ok, zero the boxes and reset both list counts.
- *
- *  Query ok() after construction.
- */
-BoxElemPart::BoxElemPart(
-  const BoxElemPart::ElemOrder elem_order ,
-  const BoxElemPart::Decompose decompose ,
-  const size_t global_size ,
-  const size_t global_rank ,
-  const size_t elem_nx ,
-  const size_t elem_ny ,
-  const size_t elem_nz )
-{
-  m_global_size = global_size ;
-  m_global_rank = global_rank ;
-  m_decompose   = decompose ;
-  m_elem_order  = elem_order ;
-
-  m_global_elem_box[0][0] = 0 ; m_global_elem_box[0][1] = elem_nx ;
-  m_global_elem_box[1][0] = 0 ; m_global_elem_box[1][1] = elem_ny ;
-  m_global_elem_box[2][0] = 0 ; m_global_elem_box[2][1] = elem_nz ;
-
-  m_global_node_box[0][0] = 0 ; m_global_node_box[0][1] = 0 ;
-  m_global_node_box[1][0] = 0 ; m_global_node_box[1][1] = 0 ;
-  m_global_node_box[2][0] = 0 ; m_global_node_box[2][1] = 0 ;
-
-  m_owns_node_count = 0 ;
-  m_send_node_count = 0 ;
-
-  m_ok = true ;
-
-  //----------------------------------------
-  // Global node extents: n+1 nodes per axis for linear elements,
-  // 2n+1 for quadratic elements.
-
-  if ( ElemLinear == elem_order ) {
-    m_global_node_box[0][1] = elem_nx + 1 ;
-    m_global_node_box[1][1] = elem_ny + 1 ;
-    m_global_node_box[2][1] = elem_nz + 1 ;
-  }
-  else if ( ElemQuadratic == elem_order ) {
-    m_global_node_box[0][1] = 2 * elem_nx + 1 ;
-    m_global_node_box[1][1] = 2 * elem_ny + 1 ;
-    m_global_node_box[2][1] = 2 * elem_nz + 1 ;
-  }
-
-  //----------------------------------------
-  // Boxes of this process; entry 0 of the 'owns' list is this process itself.
-
-  local( m_global_rank , m_uses_elem_box , m_owns_node_box[0] , m_uses_node_box );
-
-  const size_t global_node_count_ = Kokkos::Example::box_count( m_global_node_box );
-  const size_t global_elem_count_ = Kokkos::Example::box_count( m_global_elem_box );
-
-  //----------------------------------------
-  // Running totals over all ranks, compared with the global counts below.
-
-  size_t elem_count = Kokkos::Example::box_count( m_uses_elem_box );
-  size_t node_count = Kokkos::Example::box_count( m_owns_node_box[0] );
-
-  m_owns_node[0][0] = global_rank ;
-  m_owns_node[0][1] = node_count ;
-  m_owns_node_count = 1 ;
-  m_send_node_count = 0 ;
-
-  for ( size_t rr = 1 ; rr < m_global_size && m_ok ; ++rr ) {
-
-    // Visit the other ranks starting with the one after this process.
-    const size_t rank = ( m_global_rank + rr ) % m_global_size ;
-
-    size_t elem_box[3][2] , o_node_box[3][2] , u_node_box[3][2] ;
-
-    // Boxes for process 'rank'
-    local( rank , elem_box , o_node_box , u_node_box );
-
-    // Box that this process uses but is owned by process 'rank'
-    Kokkos::Example::box_intersect( m_owns_node_box[ m_owns_node_count ] , m_uses_node_box , o_node_box );
-
-    m_owns_node[ m_owns_node_count ][1] = Kokkos::Example::box_count( m_owns_node_box[ m_owns_node_count ] );
-
-    if ( m_owns_node[ m_owns_node_count ][1] ) {
-
-      // The last slot is reserved for the sentinel entry.
-      if ( ( PROC_NEIGH_MAX - 1 ) <= m_owns_node_count ) {
-        std::cout << "BoxElemPart exceeded maximum neighbor count" << std::endl ;
-        m_ok = false ;
-        break ;
-      }
-
-      m_owns_node[ m_owns_node_count ][0] = rank ;
-
-      ++m_owns_node_count ;
-    }
-
-    // Box that this process owns and is used by process 'rank'
-    Kokkos::Example::box_intersect( m_send_node_box[ m_send_node_count ] , m_owns_node_box[0] , u_node_box );
-
-    m_send_node[ m_send_node_count ][1] = Kokkos::Example::box_count( m_send_node_box[ m_send_node_count ] );
-
-    if ( m_send_node[ m_send_node_count ][1] ) {
-
-      // The last slot is reserved for the sentinel entry.
-      if ( ( PROC_NEIGH_MAX - 1 ) <= m_send_node_count ) {
-        std::cout << "BoxElemPart exceeded maximum neighbor count" << std::endl ;
-        m_ok = false ;
-        break ;
-      }
-
-      m_send_node[ m_send_node_count ][0] = rank ;
-      ++m_send_node_count ;
-    }
-
-    // Error checking:
-
-    size_t test_box[3][2] ;
-
-    elem_count += Kokkos::Example::box_count( elem_box );
-    node_count += Kokkos::Example::box_count( o_node_box );
-
-    {
-      // Owned node boxes of two different ranks must not overlap.
-      Kokkos::Example::box_intersect( test_box , m_owns_node_box[0] , o_node_box );
-
-      if ( Kokkos::Example::box_count( test_box ) ) {
-        std::cout << "Box partitioning error" << std::endl ;
-        std::cout << "owns_node[" << m_global_rank << "]{"
-                  << " [" << m_owns_node_box[0][0][0] << "," << m_owns_node_box[0][0][1] << ")"
-                  << " [" << m_owns_node_box[0][1][0] << "," << m_owns_node_box[0][1][1] << ")"
-                  << " [" << m_owns_node_box[0][2][0] << "," << m_owns_node_box[0][2][1] << ")"
-                  << "} intersects"
-                  << " owns_node[" << rank << "]{"
-                  << " [" << o_node_box[0][0] << "," << o_node_box[0][1] << ")"
-                  << " [" << o_node_box[1][0] << "," << o_node_box[1][1] << ")"
-                  << " [" << o_node_box[2][0] << "," << o_node_box[2][1] << ")"
-                  << "}" << std::endl ;
-        m_ok = false ;
-        break ;
-      }
-    }
-
-    if ( DecomposeElem == decompose ) {
-
-      // With element decomposition the element boxes must not overlap either.
-      Kokkos::Example::box_intersect( test_box , m_uses_elem_box , elem_box );
-
-      if ( Kokkos::Example::box_count( test_box ) ) {
-        std::cout << "Box partitioning error" << std::endl ;
-        std::cout << "ElemBox[" << m_global_rank << "]{"
-                  << " [" << m_uses_elem_box[0][0] << "," << m_uses_elem_box[0][1] << ")"
-                  << " [" << m_uses_elem_box[1][0] << "," << m_uses_elem_box[1][1] << ")"
-                  << " [" << m_uses_elem_box[2][0] << "," << m_uses_elem_box[2][1] << ")"
-                  << "} intersects"
-                  << " ElemBox[" << rank << "]{"
-                  << " [" << elem_box[0][0] << "," << elem_box[0][1] << ")"
-                  << " [" << elem_box[1][0] << "," << elem_box[1][1] << ")"
-                  << " [" << elem_box[2][0] << "," << elem_box[2][1] << ")"
-                  << "}" << std::endl ;
-        m_ok = false ;
-        break ;
-      }
-    }
-  }
-
-  // Sentinel values at the end of the owns and send lists.
-  // The sentinel box spans [0,~0u) on every axis so that a search over the
-  // blocks always stops at the sentinel at the latest.  (The previous code
-  // assigned the lower bound twice and never set the upper bound.)
-
-  m_owns_node[ m_owns_node_count ][0] = ~0u ;
-  m_owns_node[ m_owns_node_count ][1] = ~0u ;
-  m_owns_node_box[ m_owns_node_count ][0][0] = 0u ; m_owns_node_box[ m_owns_node_count ][0][1] = ~0u ;
-  m_owns_node_box[ m_owns_node_count ][1][0] = 0u ; m_owns_node_box[ m_owns_node_count ][1][1] = ~0u ;
-  m_owns_node_box[ m_owns_node_count ][2][0] = 0u ; m_owns_node_box[ m_owns_node_count ][2][1] = ~0u ;
-
-  m_send_node[ m_send_node_count ][0] = ~0u ;
-  m_send_node[ m_send_node_count ][1] = ~0u ;
-  m_send_node_box[ m_send_node_count ][0][0] = 0u ; m_send_node_box[ m_send_node_count ][0][1] = ~0u ;
-  m_send_node_box[ m_send_node_count ][1][0] = 0u ; m_send_node_box[ m_send_node_count ][1][1] = ~0u ;
-  m_send_node_box[ m_send_node_count ][2][0] = 0u ; m_send_node_box[ m_send_node_count ][2][1] = ~0u ;
-
-  {
-    // Every used node must be covered by exactly the collected 'owns' blocks.
-    size_t count = 0 ;
-    for ( size_t i = 0 ; i < m_owns_node_count ; ++i ) {
-      count += m_owns_node[i][1] ;
-    }
-    if ( count != Kokkos::Example::box_count( m_uses_node_box ) ) {
-      std::cout << "Node uses count = " << Kokkos::Example::box_count( m_uses_node_box )
-                << " error count = " << count << std::endl ;
-      m_ok = false ;
-    }
-  }
-
-  // The owned nodes of all ranks must add up to the global node count.
-  if ( global_node_count_ != node_count ) {
-    std::cout << "Node count = " << global_node_count_ << " overlap error count = " << node_count << std::endl ;
-    m_ok = false ;
-  }
-
-  if ( DecomposeElem == decompose && global_elem_count_ != elem_count ) {
-    std::cout << "Elem count = " << global_elem_count_ << " overlap error count = " << elem_count << std::endl ;
-    m_ok = false ;
-  }
-
-  if ( ! m_ok ) {
-    // Failed: zero the boxes and reset the list counts.
-    for ( int i = 0 ; i < 3 ; ++i ) { for ( int j = 0 ; j < 2 ; ++j ) {
-      m_global_elem_box[i][j] = 0 ;
-      m_global_node_box[i][j] = 0 ;
-      m_uses_elem_box[i][j] = 0 ;
-      m_uses_node_box[i][j] = 0 ;
-    }}
-    m_owns_node_count = 0 ;
-    m_send_node_count = 0 ;
-  }
-}
-
-/** \brief Write a readable summary of this partition to 's'.
- *
- *  Prints rank and size, the used / global element boxes, the owned / used /
- *  global node boxes, then one line per receive block (owns entries 1..)
- *  and one line per send block.  Each box is printed as three "[begin,end)" ranges.
- */
-void BoxElemPart::print( std::ostream & s ) const
-{
-  s << "BoxElemPart P[" << m_global_rank << ":" << m_global_size << "]"
-    << std::endl ;
-
-  // Element boxes: used / global
-  s << "  elem_box {" ;
-  for ( int d = 0 ; d < 3 ; ++d ) {
-    s << " [" << m_uses_elem_box[d][0] << "," << m_uses_elem_box[d][1] << ")" ;
-  }
-  s << " } / {" ;
-  for ( int d = 0 ; d < 3 ; ++d ) {
-    s << " [" << m_global_elem_box[d][0] << "," << m_global_elem_box[d][1] << ")" ;
-  }
-  s << " }" << std::endl ;
-
-  // Node boxes: owned / used / global
-  s << "  node_box {" ;
-  for ( int d = 0 ; d < 3 ; ++d ) {
-    s << " [" << m_owns_node_box[0][d][0] << "," << m_owns_node_box[0][d][1] << ")" ;
-  }
-  s << " } / {" ;
-  for ( int d = 0 ; d < 3 ; ++d ) {
-    s << " [" << m_uses_node_box[d][0] << "," << m_uses_node_box[d][1] << ")" ;
-  }
-  s << " } / {" ;
-  for ( int d = 0 ; d < 3 ; ++d ) {
-    s << " [" << m_global_node_box[d][0] << "," << m_global_node_box[d][1] << ")" ;
-  }
-  s << " }" << std::endl ;
-
-  // Receive blocks; entry 0 is this process and is skipped.
-  for ( size_t k = 1 ; k < m_owns_node_count ; ++k ) {
-    s << "  P[" << m_owns_node[k][0] << "]" << " recv node_box {" ;
-    for ( int d = 0 ; d < 3 ; ++d ) {
-      s << " [" << m_owns_node_box[k][d][0] << "," << m_owns_node_box[k][d][1] << ")" ;
-    }
-    s << " }" << std::endl ;
-  }
-
-  // Send blocks
-  for ( size_t k = 0 ; k < m_send_node_count ; ++k ) {
-    s << "  P[" << m_send_node[k][0] << "]" << " send node_box {" ;
-    for ( int d = 0 ; d < 3 ; ++d ) {
-      s << " [" << m_send_node_box[k][d][0] << "," << m_send_node_box[k][d][1] << ")" ;
-    }
-    s << " }" << std::endl ;
-  }
-}
-
-} /* namespace Example */
-} /* namespace Kokkos */
-
-//----------------------------------------------------------------------------
-
-
--- a/lib/kokkos/example/fixture/BoxElemPart.hpp
+++ b/lib/kokkos/example/fixture/BoxElemPart.hpp
@ -1,320 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-// 
-//                        Kokkos v. 2.0
-//              Copyright (2014) Sandia Corporation
-// 
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
-// the U.S. Government retains certain rights in this software.
-// 
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
-// ************************************************************************
-//@HEADER
-*/
-
-#ifndef KOKKOS_BOXELEMPART_HPP
-#define KOKKOS_BOXELEMPART_HPP
-
-#include <utility>
-#include <ostream>
-#include <Kokkos_Macros.hpp>
-
-//----------------------------------------------------------------------------
-
-namespace Kokkos {
-namespace Example {
-
-/** \brief Store in 'box' the intersection of the half-open boxes 'boxA' and
- *         'boxB'.  An axis on which they do not overlap is returned empty
- *         (end == begin).
- */
-KOKKOS_INLINE_FUNCTION
-void box_intersect( size_t box[][2] ,
-                    const size_t boxA[][2] ,
-                    const size_t boxB[][2] )
-{
-  for ( int d = 0 ; d < 3 ; ++d ) {
-    // Larger of the two begins
-    size_t lo = boxB[d][0] ;
-    if ( boxA[d][0] > lo ) { lo = boxA[d][0] ; }
-
-    // Smaller of the two ends
-    size_t hi = boxB[d][1] ;
-    if ( boxA[d][1] < hi ) { hi = boxA[d][1] ; }
-
-    // Clamp an inverted range to an empty one
-    if ( lo > hi ) { hi = lo ; }
-
-    box[d][0] = lo ;
-    box[d][1] = hi ;
-  }
-}
-
-/** \brief Number of entries in the half-open box: product of (end - begin)
- *         over the three axes.
- */
-KOKKOS_INLINE_FUNCTION
-size_t box_count( const size_t box[][2] )
-{
-  size_t total = 1 ;
-  for ( int d = 0 ; d < 3 ; ++d ) {
-    total *= size_t( box[d][1] - box[d][0] );
-  }
-  return total ;
-}
-
-/** \brief Grow 'local_box' by 'ghost_layer' entries on both sides of every
- *         axis, clipped to 'global_box', and store the result in 'ghost_box'.
- */
-KOKKOS_INLINE_FUNCTION
-void box_ghost_layer( const size_t global_box[][2] ,
-                      const size_t local_box[][2] ,
-                      const size_t ghost_layer ,
-                            size_t ghost_box[][2] )
-{
-  for ( int d = 0 ; d < 3 ; ++d ) {
-    // Lower side: compare before subtracting so the unsigned value cannot wrap.
-    if ( global_box[d][0] + ghost_layer > local_box[d][0] ) {
-      ghost_box[d][0] = global_box[d][0] ;
-    }
-    else {
-      ghost_box[d][0] = local_box[d][0] - ghost_layer ;
-    }
-
-    // Upper side: clip to the global end.
-    if ( global_box[d][1] < local_box[d][1] + ghost_layer ) {
-      ghost_box[d][1] = global_box[d][1] ;
-    }
-    else {
-      ghost_box[d][1] = local_box[d][1] + ghost_layer ;
-    }
-  }
-}
-
-/** \brief Partition 'global_box' among 'global_size' ranks and return the
- *         half-open sub-box of 'global_rank' in 'box'.
- *         Declared here; the definition is not inline.
- */
-void box_partition( const size_t global_size ,
-                    const size_t global_rank ,
-                    const size_t global_box[][2] ,
-                          size_t box[][2] );
-
-} // namespace Example
-} // namespace Kokkos
-
-//----------------------------------------------------------------------------
-
-namespace Kokkos {
-namespace Example {
-
-/** \brief Partition a box of hexahedral elements among subdomains.
- *
- *  Nodes are ordered locally as follows:
- *    { owned_by[ this_process ] ,
- *      owned_by[ neighbor_process[0] ] ,
- *      owned_by[ neighbor_process[1] ] ,
- *      owned_by[ neighbor_process[2] ] ,
- *      ... };
- *
- *  All boxes are half-open index ranges [begin,end) per axis, stored as
- *  box[axis][0] = begin and box[axis][1] = end.
- */
-class BoxElemPart {
-public:
-
-  enum Decompose { DecomposeNode , DecomposeElem };
-  enum ElemOrder { ElemLinear , ElemQuadratic };
-
-  /** \brief  False if the constructor detected an error. */
-  bool ok() const { return m_ok ; }
-
-  BoxElemPart( const ElemOrder elem_order ,
-               const Decompose decompose ,
-               const size_t global_size ,
-               const size_t global_rank ,
-               const size_t elem_nx ,
-               const size_t elem_ny ,
-               const size_t elem_nz );
-
-  /** \brief  Number of elements in the global element box. */
-  KOKKOS_INLINE_FUNCTION
-  size_t global_elem_count() const
-    { return Kokkos::Example::box_count( m_global_elem_box ); }
-
-  /** \brief  Number of nodes in the global node box. */
-  KOKKOS_INLINE_FUNCTION
-  size_t global_node_count() const
-    { return Kokkos::Example::box_count( m_global_node_box ); }
-
-  /** \brief  Number of elements used by this process. */
-  KOKKOS_INLINE_FUNCTION
-  size_t uses_elem_count() const
-    { return Kokkos::Example::box_count( m_uses_elem_box ); }
-
-  /** \brief  Number of nodes owned by this process (owns block 0). */
-  KOKKOS_INLINE_FUNCTION
-  size_t owns_node_count() const
-    { return Kokkos::Example::box_count( m_owns_node_box[0] ); }
-
-  /** \brief  Number of nodes used by this process. */
-  KOKKOS_INLINE_FUNCTION
-  size_t uses_node_count() const
-    { return Kokkos::Example::box_count( m_uses_node_box ); }
-
-  //----------------------------------------
-
-  /** \brief  Linear offset of element (ix,iy,iz) within the used element
-   *          box; x varies fastest, then y, then z.
-   */
-  KOKKOS_INLINE_FUNCTION
-  size_t uses_elem_offset( const size_t ix ,
-                           const size_t iy ,
-                           const size_t iz ) const
-  {
-    return size_t( ix - m_uses_elem_box[0][0] ) + size_t( m_uses_elem_box[0][1] - m_uses_elem_box[0][0] ) * (
-           size_t( iy - m_uses_elem_box[1][0] ) + size_t( m_uses_elem_box[1][1] - m_uses_elem_box[1][0] ) * (
-           size_t( iz - m_uses_elem_box[2][0] ) ) );
-  }
-
-  /** \brief  Inverse of uses_elem_offset: decode offset 'lid' into the
-   *          element grid coordinates c[0..2].
-   */
-  KOKKOS_INLINE_FUNCTION
-  void uses_elem_coord( size_t lid , size_t c[] ) const
-  {
-    const size_t nx = m_uses_elem_box[0][1] - m_uses_elem_box[0][0] ;
-    const size_t ny = m_uses_elem_box[1][1] - m_uses_elem_box[1][0] ;
-
-    c[0] = m_uses_elem_box[0][0] + lid % nx ; lid /= nx ;
-    c[1] = m_uses_elem_box[1][0] + lid % ny ; lid /= ny ;
-    c[2] = m_uses_elem_box[2][0] + lid ;
-  }
-
-  //----------------------------------------
-
-  /** \brief  Largest global node coordinate along 'axis'. */
-  KOKKOS_INLINE_FUNCTION
-  size_t global_coord_max( size_t axis ) const
-  { return m_global_node_box[axis][1] - 1 ; }
-
-  //----------------------------------------
-
-  /** \brief  Decode local node id 'lid' into its node grid coordinates. */
-  KOKKOS_INLINE_FUNCTION
-  void local_node_coord( size_t lid , size_t coord[] ) const
-  {
-    // Local id within an 'owns' block (has sentinel)
-    size_t j = 0 ;
-    while ( m_owns_node[j][1] <= lid ) { lid -= m_owns_node[j][1] ; ++j ; }
-
-    // Map to global coordinates:
-    const size_t nx = m_owns_node_box[j][0][1] - m_owns_node_box[j][0][0] ;
-    const size_t ny = m_owns_node_box[j][1][1] - m_owns_node_box[j][1][0] ;
-
-    coord[0] = m_owns_node_box[j][0][0] + lid % nx ; lid /= nx ;
-    coord[1] = m_owns_node_box[j][1][0] + lid % ny ; lid /= ny ;
-    coord[2] = m_owns_node_box[j][2][0] + lid ;
-  }
-
-  /** \brief  Local node id of the node at grid coordinates c[0..2]. */
-  KOKKOS_INLINE_FUNCTION
-  size_t local_node_id( const size_t c[] ) const
-  {
-    // Find which 'owns' block and accumulate the offset of this block:
-    size_t lid = 0 ;
-    size_t j = 0 ;
-    while ( ! ( m_owns_node_box[j][0][0] <= c[0] && c[0] < m_owns_node_box[j][0][1] &&
-                m_owns_node_box[j][1][0] <= c[1] && c[1] < m_owns_node_box[j][1][1] &&
-                m_owns_node_box[j][2][0] <= c[2] && c[2] < m_owns_node_box[j][2][1] ) ) {
-
-      lid += m_owns_node[j][1] ;
-      ++j ;
-    }
-
-    // Map offset to the block plus offset within the block:
-    return lid +
-           size_t( c[0] - m_owns_node_box[j][0][0] ) + size_t( m_owns_node_box[j][0][1] - m_owns_node_box[j][0][0] ) * (
-           size_t( c[1] - m_owns_node_box[j][1][0] ) + size_t( m_owns_node_box[j][1][1] - m_owns_node_box[j][1][0] ) * (
-           size_t( c[2] - m_owns_node_box[j][2][0] ) ) );
-  }
-
-  /** \brief  Linear id of the node at c[0..2] within the global node box. */
-  KOKKOS_INLINE_FUNCTION
-  size_t global_node_id( const size_t c[] ) const
-  {
-    return size_t( c[0] - m_global_node_box[0][0] ) + size_t( m_global_node_box[0][1] - m_global_node_box[0][0] ) * (
-           size_t( c[1] - m_global_node_box[1][0] ) + size_t( m_global_node_box[1][1] - m_global_node_box[1][0] ) * (
-           size_t( c[2] - m_global_node_box[2][0] ) ) );
-  }
-
-  //----------------------------------------
-  // Receive messages: 'owns' entries after entry 0 (entry 0 is this process).
-
-  /** \brief  Number of receive messages.
-   *
-   *  Returns 0 when the 'owns' list is empty; the unguarded subtraction
-   *  would wrap around to a huge unsigned value.
-   */
-  KOKKOS_INLINE_FUNCTION
-  size_t recv_node_msg_count() const { return m_owns_node_count ? m_owns_node_count - 1 : 0 ; }
-
-  /** \brief  Rank stored for receive message 'msg'. */
-  KOKKOS_INLINE_FUNCTION
-  size_t recv_node_rank(  size_t msg ) const { return m_owns_node[msg+1][0] ; }
-
-  /** \brief  Node count stored for receive message 'msg'. */
-  KOKKOS_INLINE_FUNCTION
-  size_t recv_node_count( size_t msg ) const { return m_owns_node[msg+1][1] ; }
-
-  //----------------------------------------
-
-  /** \brief  Number of send messages. */
-  KOKKOS_INLINE_FUNCTION
-  size_t send_node_msg_count() const { return m_send_node_count ; }
-
-  /** \brief  Rank stored for send message 'msg'. */
-  KOKKOS_INLINE_FUNCTION
-  size_t send_node_rank(  size_t msg ) const { return m_send_node[msg][0] ; }
-
-  /** \brief  Node count stored for send message 'msg'. */
-  KOKKOS_INLINE_FUNCTION
-  size_t send_node_count( size_t msg ) const { return m_send_node[msg][1] ; }
-
-  /** \brief  Total number of nodes over all send messages. */
-  KOKKOS_INLINE_FUNCTION
-  size_t send_node_id_count() const
-  {
-    size_t count = 0 ;
-    for ( size_t i = 0 ; i < m_send_node_count ; ++i ) {
-      count += m_send_node[i][1] ;
-    }
-    return count ;
-  }
-
-  /** \brief  Local id (offset within owns block 0) of the 'item'-th node
-   *          of the concatenated send lists.
-   */
-  KOKKOS_INLINE_FUNCTION
-  size_t send_node_id( size_t item ) const
-  {
-    // Find which send list this send item is in:
-    size_t j = 0 ;
-    while ( m_send_node[j][1] <= item ) { item -= m_send_node[j][1] ; ++j ; }
-
-    // Map to global coordinate:
-    const size_t nx = m_send_node_box[j][0][1] - m_send_node_box[j][0][0] ;
-    const size_t ny = m_send_node_box[j][1][1] - m_send_node_box[j][1][0] ;
-
-    size_t c[3] ;
-
-    c[0] = m_send_node_box[j][0][0] + item % nx ; item /= nx ;
-    c[1] = m_send_node_box[j][1][0] + item % ny ; item /= ny ;
-    c[2] = m_send_node_box[j][2][0] + item ;
-
-    // Map to local id:
-    return size_t( c[0] - m_owns_node_box[0][0][0] ) + size_t( m_owns_node_box[0][0][1] - m_owns_node_box[0][0][0] ) * (
-           size_t( c[1] - m_owns_node_box[0][1][0] ) + size_t( m_owns_node_box[0][1][1] - m_owns_node_box[0][1][0] ) * (
-           size_t( c[2] - m_owns_node_box[0][2][0] ) ) );
-  }
-
-  //----------------------------------------
-
-  /** \brief  Write a readable summary of the partition to 's'. */
-  void print( std::ostream & s ) const ;
-
-private:
-
-  // Maximum number of processes in a neighborhood, including this process
-  enum { PROC_NEIGH_MAX = 64 };
-
-  // Compute the used-element, owned-node and used-node boxes of process 'rank'.
-  void local( const size_t  rank ,
-                    size_t  uses_elem[][2] ,
-                    size_t  owns_node[][2] ,
-                    size_t  uses_node[][2] ) const ;
-
-  size_t  m_global_size ;
-  size_t  m_global_rank ;
-
-  Decompose m_decompose ;
-  ElemOrder m_elem_order ;
-
-  size_t m_global_elem_box[3][2] ;
-  size_t m_global_node_box[3][2] ;
-  size_t m_uses_elem_box[3][2] ;
-  size_t m_uses_node_box[3][2] ;
-
-  // Blocks of used nodes grouped by owning process; entry 0 is this process.
-  // m_owns_node[i] = [ processor rank , count ], m_owns_node_box[i] = its box.
-  size_t m_owns_node_box[ PROC_NEIGH_MAX ][3][2] ;
-  size_t m_owns_node[     PROC_NEIGH_MAX ][2] ;
-  size_t m_owns_node_count ;
-
-  // Blocks of owned nodes used by other processes, same layout as above.
-  size_t m_send_node_box[ PROC_NEIGH_MAX ][3][2] ;
-  size_t m_send_node[     PROC_NEIGH_MAX ][2] ;
-  size_t m_send_node_count ;
-
-  bool   m_ok ;
-};
-
-} // namespace Example
-} // namespace Kokkos
-
-//----------------------------------------------------------------------------
-
-#endif /* #ifndef KOKKOS_BOXELEMPART_HPP */
-
--- a/lib/kokkos/example/fixture/CMakeLists.txt
+++ b/lib/kokkos/example/fixture/CMakeLists.txt
@ -1,13 +0,0 @@
-
-# Header search path: the build directory, this source directory, and the
-# headers shared between examples in ../common.
-include_directories(
-  ${CMAKE_CURRENT_BINARY_DIR}
-  ${CMAKE_CURRENT_SOURCE_DIR}
-  ${CMAKE_CURRENT_SOURCE_DIR}/../common
-  )
-
-# Sources of the fixture test executable.
-set(SOURCES_TEST
-  Main.cpp
-  TestFixture.cpp
-  BoxElemPart.cpp
-  )
-
-# Automatically picks up 'kokkosexample_fixture'
-tribits_add_executable_and_test(
-  TestFixture
-  SOURCES ${SOURCES_TEST}
-  )
-
--- a/lib/kokkos/example/fixture/HexElement.hpp
+++ b/lib/kokkos/example/fixture/HexElement.hpp
@ -1,270 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-// 
-//                        Kokkos v. 2.0
-//              Copyright (2014) Sandia Corporation
-// 
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
-// the U.S. Government retains certain rights in this software.
-// 
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
-// ************************************************************************
-//@HEADER
-*/
-
-#ifndef KOKKOS_HEXELEMENT_HPP
-#define KOKKOS_HEXELEMENT_HPP
-
-namespace Kokkos {
-namespace Example {
-
-template< unsigned NodeCount >
-class HexElement_TensorData ;
-
-template< unsigned NodeCount , class Device >
-class HexElement_TensorEval ;
-
-//----------------------------------------------------------------------------
-/** \brief  Tensor-product data for the 8-node hex element on the
- *          interval [-1,1]^3.
- *
- *  Holds the two linear 1D basis functions and their derivatives evaluated
- *  at the two 1D integration points, the 1D integration weights, and a map
- *  from each element node to its ( x , y , z ) triple of 1D indices.
- *
- *  The 1D tables are indexed [ integration point ][ basis function ]; this
- *  is the order in which the constructor fills them and in which
- *  HexElement_Data reads them.
- */
-template<>
-class HexElement_TensorData< 8 > {
-public:
-
-  static const unsigned element_node_count    = 8 ;
-  static const unsigned spatial_dimension     = 3 ;
-  static const unsigned integration_count_1d  = 2 ;
-  static const unsigned function_count_1d     = 2 ;
-
-  // Extents are declared in the same order the tables are indexed:
-  // [ integration point ][ basis function ].
-  float values_1d [ integration_count_1d ][ function_count_1d ];
-  float derivs_1d [ integration_count_1d ][ function_count_1d ];
-  float weights_1d[ integration_count_1d ];
-
-  // Per-node 1D index triple; the fourth entry is padding and is set to zero.
-  unsigned char eval_map[ element_node_count ][4] ;
-
-  /** \brief  Value of 1D basis function 'jf' at 'x'; zero for jf > 1. */
-  static float eval_value_1d( const unsigned jf , const float x )
-  {
-    return 0 == jf ? 0.5 * ( 1.0 - x ) : (
-           1 == jf ? 0.5 * ( 1.0 + x ) : 0 );
-  }
-
-  /** \brief  Derivative of 1D basis function 'jf'; constant because the
-   *          basis is linear, so the evaluation point is unused.
-   */
-  static float eval_deriv_1d( const unsigned jf , const float )
-  {
-    return 0 == jf ? -0.5 : (
-           1 == jf ?  0.5 : 0 );
-  }
-
-  /** \brief  Fill the node map, the 1D weights and the 1D tables. */
-  HexElement_TensorData()
-  {
-    const unsigned char tmp_map[ element_node_count ][ spatial_dimension ] =
-      { { 0 , 0 , 0 },
-        { 1 , 0 , 0 },
-        { 1 , 1 , 0 },
-        { 0 , 1 , 0 },
-        { 0 , 0 , 1 },
-        { 1 , 0 , 1 },
-        { 1 , 1 , 1 },
-        { 0 , 1 , 1 } };
-
-    weights_1d[0] = 1 ;
-    weights_1d[1] = 1 ;
-
-    // 1D integration points on [-1,1] ( +/- 1/sqrt(3) ).
-    const float points_1d[ integration_count_1d ] =
-      { -0.577350269 , 0.577350269 };
-
-    for ( unsigned i = 0 ; i < element_node_count ; ++i ) {
-      eval_map[i][0] = tmp_map[i][0];
-      eval_map[i][1] = tmp_map[i][1];
-      eval_map[i][2] = tmp_map[i][2];
-      eval_map[i][3] = 0 ; // padding: never left indeterminate
-    }
-
-    for ( unsigned xp = 0 ; xp < integration_count_1d ; ++xp ) {
-    for ( unsigned xf = 0 ; xf < function_count_1d ; ++xf ) {
-      values_1d[xp][xf] = eval_value_1d( xf , points_1d[xp] );
-      derivs_1d[xp][xf] = eval_deriv_1d( xf , points_1d[xp] );
-    }}
-  }
-};
-
-//----------------------------------------------------------------------------
-
-/** \brief  Tensor-product data for the 27-node hex element on the
- *          interval [-1,1]^3.
- *
- *  Holds the three quadratic 1D basis functions and their derivatives
- *  evaluated at the three 1D integration points, the 1D integration
- *  weights, and a map from each element node to its ( x , y , z ) triple
- *  of 1D indices.
- *
- *  The 1D tables are indexed [ integration point ][ basis function ]; this
- *  is the order in which the constructor fills them and in which
- *  HexElement_Data reads them.
- */
-template<>
-class HexElement_TensorData< 27 > {
-public:
-
-  static const unsigned element_node_count    = 27 ;
-  static const unsigned spatial_dimension     = 3 ;
-  static const unsigned integration_count_1d  = 3 ;
-  static const unsigned function_count_1d     = 3 ;
-
-  // Extents are declared in the same order the tables are indexed:
-  // [ integration point ][ basis function ].
-  float values_1d [ integration_count_1d ][ function_count_1d ];
-  float derivs_1d [ integration_count_1d ][ function_count_1d ];
-  float weights_1d[ integration_count_1d ];
-
-  // Per-node 1D index triple; the fourth entry is padding and is set to zero.
-  unsigned char eval_map[ element_node_count ][4] ;
-
-  // Storage held by this class:
-  //   sizeof(float) * 9      ( values_1d )  +
-  //   sizeof(float) * 9      ( derivs_1d )  +
-  //   sizeof(float) * 3      ( weights_1d ) +
-  //   sizeof(char) * 27 * 4  ( eval_map , four entries per node )
-
-  /** \brief  Value of 1D basis function 'jf' at 'p'; zero for jf > 2. */
-  static float eval_value_1d( const unsigned jf , const float p )
-  {
-    return 0 == jf ? 0.5 * p * ( p - 1 ) : (
-           1 == jf ? 1.0 - p * p : (
-           2 == jf ? 0.5 * p * ( p + 1 ) : 0 ));
-  }
-
-  /** \brief  Derivative of 1D basis function 'jf' at 'p'; zero for jf > 2. */
-  static float eval_deriv_1d( const unsigned jf , const float p )
-  {
-    return 0 == jf ? p - 0.5 : (
-           1 == jf ? -2.0 * p : (
-           2 == jf ? p + 0.5 : 0 ));
-  }
-
-  /** \brief  Fill the node map, the 1D weights and the 1D tables. */
-  HexElement_TensorData()
-  {
-    const unsigned char tmp_map[ element_node_count ][ spatial_dimension ] =
-      { { 0 , 0 , 0 },
-        { 2 , 0 , 0 },
-        { 2 , 2 , 0 },
-        { 0 , 2 , 0 },
-        { 0 , 0 , 2 },
-        { 2 , 0 , 2 },
-        { 2 , 2 , 2 },
-        { 0 , 2 , 2 },
-        { 1 , 0 , 0 },
-        { 2 , 1 , 0 },
-        { 1 , 2 , 0 },
-        { 0 , 1 , 0 },
-        { 0 , 0 , 1 },
-        { 2 , 0 , 1 },
-        { 2 , 2 , 1 },
-        { 0 , 2 , 1 },
-        { 1 , 0 , 2 },
-        { 2 , 1 , 2 },
-        { 1 , 2 , 2 },
-        { 0 , 1 , 2 },
-        { 1 , 1 , 1 },
-        { 1 , 1 , 0 },
-        { 1 , 1 , 2 },
-        { 0 , 1 , 1 },
-        { 2 , 1 , 1 },
-        { 1 , 0 , 1 },
-        { 1 , 2 , 1 } };
-
-    // Interval [-1,1]
-
-    weights_1d[0] = 0.555555556 ;
-    weights_1d[1] = 0.888888889 ;
-    weights_1d[2] = 0.555555556 ;
-
-    const float points_1d[ integration_count_1d ] = { -0.774596669 ,
-                                                       0.000000000 ,
-                                                       0.774596669 };
-
-    for ( unsigned i = 0 ; i < element_node_count ; ++i ) {
-      eval_map[i][0] = tmp_map[i][0];
-      eval_map[i][1] = tmp_map[i][1];
-      eval_map[i][2] = tmp_map[i][2];
-      eval_map[i][3] = 0 ; // padding: never left indeterminate
-    }
-
-    for ( unsigned xp = 0 ; xp < integration_count_1d ; ++xp ) {
-    for ( unsigned xf = 0 ; xf < function_count_1d ; ++xf ) {
-      values_1d[xp][xf] = eval_value_1d( xf , points_1d[xp] );
-      derivs_1d[xp][xf] = eval_deriv_1d( xf , points_1d[xp] );
-    }}
-  }
-};
-
-//----------------------------------------------------------------------------
-
-/** \brief  Expanded (non-tensor) integration data for a hex element.
- *
- *  Built once from HexElement_TensorData< NodeCount >: for every
- *  integration point it stores the 3D weight, and for every basis function
- *  the value and the three gradient components, each formed as a product
- *  of the corresponding 1D table entries.  The tensor data's eval_map is
- *  used to decode both the integration point index and the function index
- *  into ( x , y , z ) triples of 1D indices.
- */
-template< unsigned NodeCount >
-class HexElement_Data {
-public:
-  static const unsigned spatial_dimension   = 3 ;
-  static const unsigned element_node_count  = NodeCount ;
-  static const unsigned integration_count   = NodeCount ;
-  static const unsigned function_count      = NodeCount ;
-
-  float weights[   integration_count ] ;
-  float values[    integration_count ][ function_count ];
-  float gradients[ integration_count ][ spatial_dimension ][ function_count ];
-
-  HexElement_Data()
-  {
-    HexElement_TensorData< NodeCount > tensor ;
-
-    for ( unsigned point = 0 ; point < integration_count ; ++point ) {
-
-      // 1D integration point indices of this 3D integration point.
-      const unsigned px = tensor.eval_map[point][0] ;
-      const unsigned py = tensor.eval_map[point][1] ;
-      const unsigned pz = tensor.eval_map[point][2] ;
-
-      const float wx = tensor.weights_1d[ px ] ;
-      const float wy = tensor.weights_1d[ py ] ;
-      const float wz = tensor.weights_1d[ pz ] ;
-
-      weights[point] = wx * wy * wz ;
-
-      for ( unsigned func = 0 ; func < function_count ; ++func ) {
-
-        // 1D basis function indices of this 3D basis function.
-        const unsigned fx = tensor.eval_map[func][0] ;
-        const unsigned fy = tensor.eval_map[func][1] ;
-        const unsigned fz = tensor.eval_map[func][2] ;
-
-        // 1D values and derivatives along each axis.
-        const float vx = tensor.values_1d[ px ][ fx ] ;
-        const float vy = tensor.values_1d[ py ][ fy ] ;
-        const float vz = tensor.values_1d[ pz ][ fz ] ;
-
-        const float dx = tensor.derivs_1d[ px ][ fx ] ;
-        const float dy = tensor.derivs_1d[ py ][ fy ] ;
-        const float dz = tensor.derivs_1d[ pz ][ fz ] ;
-
-        values[point][func]       = vx * vy * vz ;
-        gradients[point][0][func] = dx * vy * vz ;
-        gradients[point][1][func] = vx * dy * vz ;
-        gradients[point][2][func] = vx * vy * dz ;
-      }
-    }
-  }
-};
-
-//----------------------------------------------------------------------------
-
-} /* namespace Example */
-} /* namespace Kokkos */
-
-#endif /* #ifndef KOKKOS_HEXELEMENT_HPP */
-
-
--- a/lib/kokkos/example/fixture/Main.cpp
+++ b/lib/kokkos/example/fixture/Main.cpp
@ -1,304 +0,0 @@
-//@HEADER
-// ************************************************************************
-// 
-//                        Kokkos v. 2.0
-//              Copyright (2014) Sandia Corporation
-// 
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
-// the U.S. Government retains certain rights in this software.
-// 
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
-// ************************************************************************
-//@HEADER
-
-#include <algorithm>
-#include <iostream>
-#include <utility>
-
-#include <Kokkos_Core.hpp>
-
-#include <BoxElemPart.hpp>
-
-namespace Kokkos {
-namespace Example {
-template< class > void test_fixture();
-}
-}
-
-// Partition 'global_box' into 'global_size' parts and check the result.
-//
-// For every rank the part is computed together with its one-layer ghosted
-// box; the part is intersected with every other rank's part (any overlap is
-// an error: it is reported and 0 is returned immediately) and the ghosted
-// box is intersected with every other part to count neighbors.  When
-// 'print_verbose' is set, summary statistics are printed.  Returns the
-// largest neighbor count found over all ranks.
-int test_box( const size_t global_size
-            , const size_t global_box[][2]
-            , const bool print_verbose )
-{
-  const char axis_label[3] = { 'X' , 'Y' , 'Z' };
-
-  size_t part_sum = 0 ;
-  size_t part_max = 0 ;
-  size_t part_min = Kokkos::Example::box_count( global_box );
-  size_t widest[3][2]    = { { 0 , 0 } , { 0 , 0 } , { 0 , 0 } };
-  size_t narrowest[3][2] = { { 0 , global_box[0][1] } , { 0 , global_box[1][1] } , { 0 , global_box[2][1] } };
-  size_t overlap_sum = 0 ;
-  size_t neighbor_max = 0 ;
-
-  for ( size_t rank = 0 ; rank < global_size ; ++rank ) {
-    size_t mine[3][2] = { { 0 , global_box[0][1] } , { 0 , global_box[1][1] } , { 0 , global_box[2][1] } };
-    size_t mine_ghosted[3][2] ;
-    size_t neighbors = 0 ;
-
-    Kokkos::Example::box_partition( global_size , rank , global_box , mine );
-
-    Kokkos::Example::box_ghost_layer( global_box , mine , 1 , mine_ghosted );
-
-    // Track the smallest / largest part, overall and per axis.
-    const size_t mine_count = Kokkos::Example::box_count( mine );
-
-    for ( int axis = 0 ; axis < 3 ; ++axis ) {
-      const size_t extent = mine[axis][1] - mine[axis][0] ;
-
-      if ( extent < ( narrowest[axis][1] - narrowest[axis][0] ) ) {
-        narrowest[axis][0] = mine[axis][0] ;
-        narrowest[axis][1] = mine[axis][1] ;
-      }
-      if ( extent > ( widest[axis][1] - widest[axis][0] ) ) {
-        widest[axis][0] = mine[axis][0] ;
-        widest[axis][1] = mine[axis][1] ;
-      }
-    }
-
-    if ( part_max < mine_count ) { part_max = mine_count ; }
-    if ( mine_count < part_min ) { part_min = mine_count ; }
-    part_sum += mine_count ;
-
-    for ( size_t other = 0 ; other < global_size ; ++other ) {
-
-      if ( other != rank ) {
-
-        size_t theirs[3][2] = { { 0 , global_box[0][1] } , { 0 , global_box[1][1] } , { 0 , global_box[2][1] } };
-        size_t common[3][2] ;
-
-        Kokkos::Example::box_partition( global_size , other , global_box , theirs );
-
-        // Parts must be disjoint: any shared count is an error.
-        Kokkos::Example::box_intersect( common , mine , theirs );
-
-        const size_t overlap = Kokkos::Example::box_count( common );
-
-        overlap_sum += overlap ;
-
-        // A part touching the ghosted box is a neighbor.
-        Kokkos::Example::box_intersect( common , mine_ghosted , theirs );
-
-        if ( Kokkos::Example::box_count( common ) ) { ++neighbors ; }
-
-        if ( overlap ) {
-          std::cout << "box partition intersection error" << std::endl ;
-
-          std::cout << "box = {" ;
-          for ( int axis = 0 ; axis < 3 ; ++axis ) {
-            std::cout << " [ " << mine[axis][0] << " , " << mine[axis][1] << " )" ;
-          }
-          std::cout << " }" << std::endl ;
-
-          std::cout << "other_box = {" ;
-          for ( int axis = 0 ; axis < 3 ; ++axis ) {
-            std::cout << " [ " << theirs[axis][0] << " , " << theirs[axis][1] << " )" ;
-          }
-          std::cout << " }" << std::endl ;
-
-          return 0 ;
-        }
-      }
-    }
-
-    if ( neighbor_max < neighbors ) { neighbor_max = neighbors ; }
-  }
-
-  if ( print_verbose ) {
-
-    std::cout << "global_part = " << global_size << std::endl ;
-
-    std::cout << "global_box  = { " ;
-    for ( int axis = 0 ; axis < 3 ; ++axis ) {
-      std::cout << " [ " << global_box[axis][0] << " .. " << global_box[axis][1]
-                << ( axis < 2 ? " ) X" : " )" );
-    }
-    std::cout << " }" << std::endl ;
-
-    std::cout << "count( global_box ) = " << Kokkos::Example::box_count( global_box ) << std::endl ;
-    std::cout << "sum partition( global_box ) = " << part_sum << std::endl ;
-    std::cout << "avg partition( global_box ) = " << size_t( double(part_sum) / double(global_size)) << std::endl ;
-
-    std::cout << "min partition( global_box ) = " << part_min << std::endl ;
-    for ( int axis = 0 ; axis < 3 ; ++axis ) {
-      std::cout << "min part " << axis_label[axis] << "   ( global_box ) = [ "
-                << narrowest[axis][0] << " .. " << narrowest[axis][1] << " )" << std::endl ;
-    }
-
-    std::cout << "max partition( global_box ) = " << part_max << std::endl ;
-    for ( int axis = 0 ; axis < 3 ; ++axis ) {
-      std::cout << "max part " << axis_label[axis] << "   ( global_box ) = [ "
-                << widest[axis][0] << " .. " << widest[axis][1] << " )" << std::endl ;
-    }
-
-    std::cout << "sum intersect( global_box ) = " << overlap_sum << std::endl ;
-    std::cout << "max neighbor = " << neighbor_max << std::endl ;
-  }
-
-  return neighbor_max ;
-}
-
-// Exercise BoxElemPart for every rank of a 256-way decomposition of a
-// 100 x 120 x 140 element box.
-//
-// For each rank, every receive message is matched against the send message
-// that the sending rank addresses to this rank: the node counts must agree
-// and each received node must have the same coordinates as the node sent.
-// Mismatches are printed.  Finally min / avg / max of the per-rank element,
-// owned-node, receive-message and send-message counts are printed.
-void test_elem()
-{
-  const Kokkos::Example::BoxElemPart::Decompose
-    decompose = Kokkos::Example::BoxElemPart:: DecomposeElem ; // DecomposeElem | DecomposeNode ;
-  const size_t global_size = 256 ;
-  const size_t global_nx = 100 ;
-  const size_t global_ny = 120 ;
-  const size_t global_nz = 140 ;
-
-  // Minima start at an upper bound, maxima and sums at zero.
-  double node_count_avg = 0 ;
-  size_t node_count_max = 0 ;
-  size_t node_count_min = ( global_nx + 1 ) * ( global_ny + 1 ) * ( global_nz + 1 );
-  double elem_count_avg = 0 ;
-  size_t elem_count_max = 0 ;
-  size_t elem_count_min = global_nx * global_ny * global_nz ;
-  double recv_count_avg = 0 ;
-  size_t recv_count_max = 0 ;
-  size_t recv_count_min = global_size ;
-  double send_count_avg = 0 ;
-  size_t send_count_max = 0 ;
-  size_t send_count_min = global_size ;
-
-  for ( size_t r = 0 ; r < global_size ; ++r ) {
-    const Kokkos::Example::BoxElemPart
-       fixture( Kokkos::Example::BoxElemPart::ElemLinear ,
-                decompose , global_size , r , global_nx , global_ny , global_nz );
-
-    // Print a sample:
-
-    // if ( r == global_size * 2 / 3 ) fixture.print( std::cout );
-
-    // Verify recv/send alignment:
-
-    {
-      // Local id of the next received node; received nodes follow the
-      // owned nodes and are laid out message after message.
-      size_t recv_lid = fixture.owns_node_count();
-
-      for ( size_t i = 0 ; i < fixture.recv_node_msg_count() ; ++i ) {
-        const size_t recv_rank  = fixture.recv_node_rank( i );
-        const size_t recv_count = fixture.recv_node_count( i );
-
-        const Kokkos::Example::BoxElemPart other_fixture(
-           Kokkos::Example::BoxElemPart::ElemLinear ,
-           decompose , global_size , recv_rank , global_nx , global_ny , global_nz );
-
-        const size_t send_msg_count = other_fixture.send_node_msg_count();
-
-        // Offset of the first item of the matching send message.
-        size_t send_item = 0 ;
-
-        // Find the sender's message addressed to rank 'r'.
-        size_t j = 0 ;
-        while ( j < send_msg_count && other_fixture.send_node_rank(j) != r ) {
-          send_item += other_fixture.send_node_count( j );
-          ++j ;
-        }
-
-        if ( j == send_msg_count ) {
-          // No message found: 'j' is not a valid message index, so it must
-          // not be used to query the sender.
-          std::cout << "Error P[" << r << "].recv(" << recv_count << ") has no matching "
-                    << "P[" << recv_rank << "].send"
-                    << std::endl ;
-
-          // Skip this message's nodes so later messages stay aligned.
-          recv_lid += recv_count ;
-        }
-        else if ( recv_count != other_fixture.send_node_count(j) ) {
-          std::cout << "Error P[" << r << "].recv(" << recv_count << ") != "
-                    << "P[" << recv_rank << "].send(" << other_fixture.send_node_count(j) << ")"
-                    << std::endl ;
-
-          // Skip this message's nodes so later messages stay aligned.
-          recv_lid += recv_count ;
-        }
-        else {
-
-          for ( size_t k = 0 ; k < recv_count ; ++k , ++send_item , ++recv_lid ) {
-
-            const size_t send_lid = other_fixture.send_node_id( send_item );
-
-            size_t recv_coord[3] , send_coord[3] ;
-
-            fixture.local_node_coord( recv_lid , recv_coord );
-
-            other_fixture.local_node_coord( send_lid , send_coord );
-
-            if ( recv_coord[0] != send_coord[0] ||
-                 recv_coord[1] != send_coord[1] ||
-                 recv_coord[2] != send_coord[2] ) {
-              std::cout << "Error P[" << r << "].recv[" << recv_lid << "]{ "
-                        << recv_coord[0] << " , "
-                        << recv_coord[1] << " , "
-                        << recv_coord[2] << " } != "
-                        << "P[" << recv_rank << "].send[" << send_lid << "]{ "
-                        << send_coord[0] << " , "
-                        << send_coord[1] << " , "
-                        << send_coord[2] << " }"
-                        << std::endl ;
-            }
-          }
-        }
-      }
-    }
-
-    node_count_avg += fixture.owns_node_count();
-    elem_count_avg += fixture.uses_elem_count();
-    recv_count_avg += fixture.recv_node_msg_count();
-    send_count_avg += fixture.send_node_msg_count();
-
-    elem_count_min = std::min( (size_t) fixture.uses_elem_count() , elem_count_min );
-    elem_count_max = std::max( (size_t) fixture.uses_elem_count() , elem_count_max );
-    node_count_min = std::min( (size_t) fixture.owns_node_count() , node_count_min );
-    node_count_max = std::max( (size_t) fixture.owns_node_count() , node_count_max );
-
-    recv_count_max = std::max( (size_t) fixture.recv_node_msg_count() , recv_count_max );
-    recv_count_min = std::min( (size_t) fixture.recv_node_msg_count() , recv_count_min );
-    send_count_max = std::max( (size_t) fixture.send_node_msg_count() , send_count_max );
-    send_count_min = std::min( (size_t) fixture.send_node_msg_count() , send_count_min );
-  }
-
-  // Sums become averages over all ranks.
-  node_count_avg /= double(global_size);
-  elem_count_avg /= double(global_size);
-  recv_count_avg /= double(global_size);
-  send_count_avg /= double(global_size);
-
-  std::cout << "Elem min(" << elem_count_min << ") avg(" << elem_count_avg << ") max(" << elem_count_max << ") " << std::endl
-            << "Node min(" << node_count_min << ") avg(" << node_count_avg << ") max(" << node_count_max << ") " << std::endl
-            << "Recv min(" << recv_count_min << ") avg(" << recv_count_avg << ") max(" << recv_count_max << ") " << std::endl
-            << "Send min(" << send_count_min << ") avg(" << send_count_avg << ") max(" << send_count_max << ") " << std::endl
-            ;
-}
-
-// Driver: run the box partition check for 16, 32, ... , 512 parts of a
-// 65 x 65 x 65 box, re-running verbosely whenever more than 30 neighbors
-// are reported, then run test_fixture on the host execution space and,
-// when KOKKOS_HAVE_CUDA is defined, on Cuda.
-int main()
-{
-  const size_t global_box[3][2] = { { 0 , 65 } , { 0 , 65 } , { 0 , 65 } };
-
-  for ( int i = 1 ; i <= 32 ; ++i ) {
-    const size_t global_size = 16 * i ;
-
-    const int neighbor_max = test_box( global_size , global_box , false );
-
-    if ( 30 < neighbor_max ) {
-      // Unexpectedly many neighbors: repeat with the statistics printed.
-      test_box( global_size , global_box , true );
-    }
-  }
-
-//  test_elem();
-
-  {
-    typedef Kokkos::HostSpace::execution_space host_space ;
-
-    std::cout << "test_fixture< Host >" << std::endl ;
-    host_space::initialize( 1 );
-    Kokkos::Example::test_fixture< host_space >();
-    host_space::finalize();
-  }
-
-#if defined( KOKKOS_HAVE_CUDA )
-  {
-    typedef Kokkos::HostSpace::execution_space host_space ;
-
-    std::cout << "test_fixture< Cuda >" << std::endl ;
-    host_space::initialize();
-    Kokkos::Cuda::initialize( Kokkos::Cuda::SelectDevice(0) );
-    Kokkos::Example::test_fixture< Kokkos::Cuda >();
-    Kokkos::Cuda::finalize();
-    host_space::finalize();
-  }
-#endif
-
-  return 0 ;
-}
-
--- a/lib/kokkos/example/fixture/Makefile
+++ b/lib/kokkos/example/fixture/Makefile
@ -1,49 +0,0 @@
-# Stand-alone Makefile for the 'fixture' example.
-
-# Location of the Kokkos tree, two directories above this one.
-KOKKOS_PATH = ../..
-
-# Look for .cpp sources in the fixture example directory.
-vpath %.cpp ${KOKKOS_PATH}/example/fixture
-
-# Every header of the common and fixture example directories; used below as
-# a prerequisite of each object file.
-EXAMPLE_HEADERS = $(wildcard $(KOKKOS_PATH)/example/common/*.hpp ${KOKKOS_PATH}/example/fixture/*.hpp )
-
-# First rule in the file: build everything, then report completion.
-default: build_all
-	echo "End Build"
-        
-# NOTE(review): presumably defines the KOKKOS_* variables used below -- confirm
-# against Makefile.kokkos.
-include $(KOKKOS_PATH)/Makefile.kokkos
-
-# With Cuda enabled the compiler and linker are forced to nvcc_wrapper;
-# otherwise they are only given defaults.  Flags are defaults in both cases.
-ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
-	CXX = nvcc_wrapper
-	CXXFLAGS ?= -O3
-	LINK = $(CXX)
-	LDFLAGS ?= -lpthread
-else
-	CXX ?= g++
-	CXXFLAGS ?= -O3
-	LINK ?= $(CXX)
-	LDFLAGS ?= -lpthread
-endif
-
-# Add the common and fixture example directories to the include path.
-KOKKOS_CXXFLAGS +=	\
-	-I${KOKKOS_PATH}/example/common	\
-	-I${KOKKOS_PATH}/example/fixture
-
-# Object files and name of the example executable.
-OBJ_EXAMPLE_FIXTURE = Main.o TestFixture.o BoxElemPart.o
-EXE_EXAMPLE_FIXTURE = KokkosExample_Fixture
-
-TARGETS = $(EXE_EXAMPLE_FIXTURE)
-
-#TEST_TARGETS =
-
-# Link the executable from its objects.
-$(EXE_EXAMPLE_FIXTURE) : $(OBJ_EXAMPLE_FIXTURE) $(KOKKOS_LINK_DEPENDS)
-	$(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ_EXAMPLE_FIXTURE) $(KOKKOS_LIBS) $(LIB) -o $(EXE_EXAMPLE_FIXTURE)
-
-
-build_all : $(TARGETS)
-
-
-# Has no recipe: 'test' only triggers the build and runs nothing.
-test : build_all
-
-
-# Compilation rules
-
-# Each object is rebuilt when its source or any example header changes.
-%.o:%.cpp $(KOKKOS_CPP_DEPENDS) $(EXAMPLE_HEADERS)
-	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $<
-
--- a/lib/kokkos/example/fixture/TestFixture.cpp
+++ b/lib/kokkos/example/fixture/TestFixture.cpp
@ -1,58 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-// 
-//                        Kokkos v. 2.0
-//              Copyright (2014) Sandia Corporation
-// 
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
-// the U.S. Government retains certain rights in this software.
-// 
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
-// ************************************************************************
-//@HEADER
-*/
-
-#include <Kokkos_Core.hpp>
-#include <TestFixture.hpp>
-
-// Explicit instantiations of the test_fixture template from TestFixture.hpp.
-// Main.cpp only declares the template and calls these instantiations, so
-// they must be emitted here.
-namespace Kokkos {
-namespace Example {
-
-// Host execution space: always instantiated.
-template void test_fixture< Kokkos::HostSpace::execution_space >();
-
-// Cuda: instantiated only when KOKKOS_HAVE_CUDA is defined.
-#if defined( KOKKOS_HAVE_CUDA )
-template void test_fixture<Kokkos::Cuda>();
-#endif
-
-} /* namespace Example */
-} /* namespace Kokkos */
-
--- a/lib/kokkos/example/fixture/TestFixture.hpp
+++ b/lib/kokkos/example/fixture/TestFixture.hpp
@ -1,156 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-// 
-//                        Kokkos v. 2.0
-//              Copyright (2014) Sandia Corporation
-// 
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
-// the U.S. Government retains certain rights in this software.
-// 
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
-// ************************************************************************
-//@HEADER
-*/
-
-#ifndef KOKKOS_EXAMPLE_TESTFIXTURE_HPP
-#define KOKKOS_EXAMPLE_TESTFIXTURE_HPP
-
-#include <utility>
-#include <iostream>
-
-#include <Kokkos_Core.hpp>
-
-#include <BoxElemPart.hpp>
-#include <BoxElemFixture.hpp>
-
-namespace Kokkos {
-namespace Example {
-
-/** \brief  Reduction functor counting elements whose node grid coordinates
- *          are consistent with the fixture's local node offsets.
- *
- *  For each element, node i ( i > 0 ) is expected to lie at the grid
- *  coordinates of node 0 plus elem_node_local(i,*).  Elements meeting this
- *  for every node are counted in 'success', all others in 'error'.
- */
-template< class Device >
-struct FixtureVerifyElemNodeCoord
-{
-  typedef Device execution_space ;
-
-  typedef struct { size_t success , error ; } value_type ;
-
-  typedef Kokkos::Example::BoxElemFixture< Device , Kokkos::Example::BoxElemPart::ElemLinear > FixtureType ;
-
-  FixtureType m_fixture ;
-
-  FixtureVerifyElemNodeCoord( const FixtureType & f ) : m_fixture(f) {}
-
-  /** \brief  Start a reduction with both counters at zero. */
-  KOKKOS_INLINE_FUNCTION
-  void init( value_type & update ) const
-  {
-    update.error   = 0 ;
-    update.success = 0 ;
-  }
-
-  /** \brief  Accumulate the counters of 'input' into 'update'. */
-  KOKKOS_INLINE_FUNCTION
-  void join( volatile       value_type & update ,
-             volatile const value_type & input ) const
-  {
-    update.success += input.success ;
-    update.error += input.error ;
-  }
-
-  /** \brief  Check element 'ielem' and count it as success or error. */
-  KOKKOS_INLINE_FUNCTION
-  void operator()( size_t ielem , value_type & update ) const
-  {
-    // Grid coordinates of every node of this element.
-    unsigned grid[ FixtureType::ElemNode ][3] ;
-
-    for ( unsigned n = 0 ; n < FixtureType::ElemNode ; ++n ) {
-      const unsigned id = m_fixture.elem_node(ielem,n);
-      grid[n][0] = m_fixture.node_grid(id,0);
-      grid[n][1] = m_fixture.node_grid(id,1);
-      grid[n][2] = m_fixture.node_grid(id,2);
-    }
-
-    // Compare every other node against node 0 shifted by its local offset.
-    bool mismatch = false ;
-
-    for ( unsigned n = 1 ; n < FixtureType::ElemNode ; ++n ) {
-      const bool same =
-        grid[0][0] + m_fixture.elem_node_local(n,0) == grid[n][0] &&
-        grid[0][1] + m_fixture.elem_node_local(n,1) == grid[n][1] &&
-        grid[0][2] + m_fixture.elem_node_local(n,2) == grid[n][2] ;
-
-      if ( ! same ) { mismatch = true ; }
-    }
-
-    if ( mismatch ) {
-      ++update.error ;
-    }
-    else {
-      ++update.success ;
-    }
-  }
-};
-
-
-/** \brief  Build the linear-element box fixture for every rank of a
- *          256-way element decomposition of a 400 x 400 x 400 box and
- *          verify the grid coordinates of each element's nodes on 'Device'.
- *
- *  A line is printed for every rank on which the verification functor
- *  reports at least one error.
- */
-template< class Device >
-void test_fixture()
-{
-  typedef Kokkos::Example::BoxElemFixture< Device , Kokkos::Example::BoxElemPart::ElemLinear > FixtureType ;
-  typedef FixtureVerifyElemNodeCoord<Device> VerifyType ;
-
-  const Kokkos::Example::BoxElemPart::Decompose
-    decompose = Kokkos::Example::BoxElemPart:: DecomposeElem ; // DecomposeElem | DecomposeNode ;
-
-  const unsigned global_size = 256 ;
-  const unsigned global_nx = 400 ;
-  const unsigned global_ny = 400 ;
-  const unsigned global_nz = 400 ;
-
-  for ( unsigned my_rank = 0 ; my_rank < global_size ; ++my_rank ) {
-
-    const FixtureType fixture( decompose , global_size , my_rank , global_nx , global_ny , global_nz );
-
-    // Verify grid coordinates of element's nodes, one work item per element.
-
-    typename VerifyType::value_type result = { 0 , 0 };
-
-    Kokkos::parallel_reduce( fixture.elem_node().dimension_0() , VerifyType( fixture ) , result );
-
-    if ( 0 != result.error ) {
-      std::cout << "P[" << my_rank << ":" << global_size
-                << "] Fixture elem_node_coord"
-                << " success(" << result.success << ")"
-                << " error(" << result.error << ")"
-                << std::endl ;
-    }
-
-    // Send/recv alignment is not checked here.
-  }
-}
-
-
-} /* namespace Example */
-} /* namespace Kokkos */
-
-#endif /* #ifndef KOKKOS_EXAMPLE_TESTFIXTURE_HPP */
-
--- a/lib/kokkos/example/global_2_local_ids/CMakeLists.txt
+++ b/lib/kokkos/example/global_2_local_ids/CMakeLists.txt
@ -1,17 +0,0 @@
-
-# Header search path: the current binary directory, then the current
-# source directory.
-include_directories(
-  ${CMAKE_CURRENT_BINARY_DIR}
-  ${CMAKE_CURRENT_SOURCE_DIR}
-  )
-
-# Sources of the example executable.
-set(SOURCES
-  G2L_Main.cpp
-  )
-
-# Executable only (no test is registered), for serial and MPI builds.
-tribits_add_executable(
-  global_2_local_ids
-  SOURCES ${SOURCES}
-  COMM serial mpi
-  )
-
-
--- a/lib/kokkos/example/global_2_local_ids/G2L.hpp
+++ b/lib/kokkos/example/global_2_local_ids/G2L.hpp
@ -1,266 +0,0 @@
-//@HEADER
-// ************************************************************************
-// 
-//                        Kokkos v. 2.0
-//              Copyright (2014) Sandia Corporation
-// 
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
-// the U.S. Government retains certain rights in this software.
-// 
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
-// ************************************************************************
-//@HEADER
-
-#ifndef KOKKOS_GLOBAL_TO_LOCAL_IDS_HPP
-#define KOKKOS_GLOBAL_TO_LOCAL_IDS_HPP
-
-#include <Kokkos_Core.hpp>
-
-#include <Kokkos_UnorderedMap.hpp>
-
-#include <vector>
-#include <algorithm>
-#include <iomanip>
-
-#include <impl/Kokkos_Timer.hpp>
-
-// This test will simulate global ids
-
-namespace G2L {
-
-// Size constants: 256, 2^25 and a step of 2.
-// NOTE(review): none of these is referenced elsewhere in this header;
-// presumably they bound a sweep over id counts in a driver -- confirm.
-static const unsigned begin_id_size = 256u;
-static const unsigned end_id_size = 1u << 25;
-static const unsigned id_step = 2u;
-
-// Overlays one 32-bit word with its four constituent bytes so the bytes of
-// an id can be addressed individually when generating global ids.
-union helper
-{
-  uint32_t word;
-  uint8_t byte[4];
-};
-
-
-// Functor that fills local_2_global[i] with a unique, scrambled 32-bit
-// global id derived from the local index i.
-//
-// The constructor launches the parallel fill over local_2_global.size()
-// entries itself, so constructing an object performs the work.
-template <typename Device>
-struct generate_ids
-{
-  typedef Device execution_space;
-  typedef typename execution_space::size_type size_type;
-  typedef Kokkos::View<uint32_t*,execution_space> local_id_view;
-
-  local_id_view local_2_global;
-
-  generate_ids( local_id_view & ids)
-    : local_2_global(ids)
-  {
-    Kokkos::parallel_for(local_2_global.size(), *this);
-  }
-
-
-  KOKKOS_INLINE_FUNCTION
-  void operator()(size_type i) const
-  {
-    // Complement the index, then exchange its upper and lower 16-bit halves.
-    // Swapping bytes 3<->1 and 2<->0 of the word in memory is exactly this
-    // rotation on either byte order, and expressing it with shifts avoids
-    // reading a union member other than the one last written.
-    // Both steps are bijections on 32-bit values, so distinct indices that
-    // fit in 32 bits yield distinct ids.
-    const uint32_t flipped = ~static_cast<uint32_t>(i);
-
-    local_2_global[i] = (flipped << 16) | (flipped >> 16);
-  }
-
-};
-
-// Functor that inserts the pair (local_2_global[i], i) into an unordered map
-// for every local index i, building the global_id -> local_id lookup.
-// The parallel insertion is launched from the constructor.
-template <typename Device>
-struct fill_map
-{
-  typedef Device execution_space;
-  typedef typename execution_space::size_type size_type;
-  typedef Kokkos::View<const uint32_t*,execution_space, Kokkos::MemoryRandomAccess> local_id_view;
-  typedef Kokkos::UnorderedMap<uint32_t,size_type,execution_space> global_id_view;
-
-  global_id_view global_2_local;
-  local_id_view local_2_global;
-
-  fill_map( global_id_view map_in, local_id_view ids_in)
-    : global_2_local(map_in)
-    , local_2_global(ids_in)
-  {
-    Kokkos::parallel_for(local_2_global.size(), *this);
-  }
-
-  // Insert the global id stored at position i, mapped to i itself.
-  KOKKOS_INLINE_FUNCTION
-  void operator()(size_type i) const
-  {
-    global_2_local.insert( local_2_global[i], i);
-  }
-
-};
-
-// Reduction functor that counts lookup failures: for each local index i the
-// global id local_2_global[i] must be found in the map, the stored key must
-// equal that id, and the stored value must equal i.
-// The reduction is launched from the constructor and writes its result into
-// the 'num_errors' argument.
-template <typename Device>
-struct find_test
-{
-  typedef Device execution_space;
-  typedef typename execution_space::size_type size_type;
-  typedef Kokkos::View<const uint32_t*,execution_space, Kokkos::MemoryRandomAccess> local_id_view;
-  typedef Kokkos::UnorderedMap<const uint32_t, const size_type,execution_space> global_id_view;
-
-  global_id_view global_2_local;
-  local_id_view local_2_global;
-
-  typedef size_t value_type;
-
-  find_test( global_id_view map_in, local_id_view ids_in, value_type & num_errors)
-    : global_2_local(map_in)
-    , local_2_global(ids_in)
-  {
-    Kokkos::parallel_reduce(local_2_global.size(), *this, num_errors);
-  }
-
-  // Reduction identity: no errors yet.
-  KOKKOS_INLINE_FUNCTION
-  void init(value_type & v) const
-  {
-    v = 0;
-  }
-
-  // Combine two partial error counts.
-  KOKKOS_INLINE_FUNCTION
-  void join(volatile value_type & dst, volatile value_type const & src) const
-  {
-    dst += src;
-  }
-
-  KOKKOS_INLINE_FUNCTION
-  void operator()(size_type i, value_type & num_errors) const
-  {
-    const uint32_t slot = global_2_local.find( local_2_global[i] );
-
-    // Evaluated left to right, so key_at/value_at are only consulted for a
-    // slot that valid_at accepted.
-    const bool found_and_matches =
-         global_2_local.valid_at(slot)
-      && global_2_local.key_at(slot) == local_2_global[i]
-      && global_2_local.value_at(slot) == i;
-
-    if ( !found_and_matches ) {
-      ++num_errors;
-    }
-  }
-
-};
-
-// Run one allocate / generate / fill / lookup cycle for 'num_ids' ids and a
-// map created with the given 'capacity', printing the elapsed seconds of
-// each phase.  Returns the map's failed-insert count when it is non-zero,
-// otherwise the error count left by the last of the 'num_find_iterations'
-// lookup passes.
-template <typename Device>
-size_t test_global_to_local_ids(unsigned num_ids, unsigned capacity, unsigned num_find_iterations)
-{
-  typedef Device execution_space;
-  typedef typename execution_space::size_type size_type;
-
-  typedef Kokkos::View<uint32_t*,execution_space> local_id_view;
-  typedef Kokkos::UnorderedMap<uint32_t,size_type,execution_space> global_id_view;
-
-  const int label_width = 15;
-
-  double elapsed_time = 0;
-  Kokkos::Impl::Timer timer;
-
-  // Phase 1: allocate the id array and the map.
-  local_id_view local_2_global("local_ids", num_ids);
-  global_id_view global_2_local(capacity);
-
-  elapsed_time = timer.seconds();
-  std::cout << std::setw(label_width) <<  "allocate: " <<  elapsed_time << std::endl;
-  timer.reset();
-
-  // Phase 2: generate unique ids (the functor runs in its constructor).
-  {
-    generate_ids<Device> generator(local_2_global);
-  }
-
-  elapsed_time = timer.seconds();
-  std::cout << std::setw(label_width) << "generate: " <<  elapsed_time << std::endl;
-  timer.reset();
-
-  // Phase 3: fill the map.
-  {
-    fill_map<Device> filler(global_2_local, local_2_global);
-  }
-
-  elapsed_time = timer.seconds();
-  std::cout << std::setw(label_width) << "fill: " <<  elapsed_time << std::endl;
-  timer.reset();
-
-  size_t num_errors = global_2_local.failed_insert();
-
-  if (num_errors != 0u) {
-    std::cout << "    !!! Fill Failed !!!" << std::endl;
-  }
-  else {
-    // Phase 4: repeated lookups; each pass overwrites num_errors.
-    for (unsigned pass = 0; pass != num_find_iterations; ++pass)
-    {
-      find_test<Device> finder(global_2_local, local_2_global, num_errors);
-    }
-
-    elapsed_time = timer.seconds();
-    std::cout << std::setw(label_width) << "lookup: " <<  elapsed_time << std::endl;
-  }
-
-  return num_errors;
-}
-
-// Run the id test at three map capacities: two thirds of num_ids (expected
-// to fail, result ignored), exactly num_ids, and one and a half times
-// num_ids.  Returns the summed error counts of the last two runs.
-template <typename Device>
-size_t run_test(unsigned num_ids, unsigned num_find_iterations)
-{
-  const unsigned undersized = (num_ids*2u)/3u;
-  const unsigned exact      = num_ids;
-  const unsigned oversized  = (num_ids*3u)/2u;
-
-  // expect to fail
-  std::cout << " 66% of needed capacity (should fail)" << std::endl;
-  test_global_to_local_ids<Device>(num_ids, undersized, num_find_iterations);
-
-  size_t total_errors = 0;
-
-  // should not fail
-  std::cout << " 100% of needed capacity" << std::endl;
-  total_errors = test_global_to_local_ids<Device>(num_ids, exact, num_find_iterations);
-
-  // should not fail
-  std::cout << " 150% of needed capacity" << std::endl;
-  total_errors += test_global_to_local_ids<Device>(num_ids, oversized, num_find_iterations);
-
-  return total_errors;
-}
-
-
-} // namespace G2L
-
-
-#endif //KOKKOS_GLOBAL_TO_LOCAL_IDS_HPP
-
--- a/lib/kokkos/example/global_2_local_ids/G2L_Main.cpp
+++ b/lib/kokkos/example/global_2_local_ids/G2L_Main.cpp
@ -1,149 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-// 
-//                        Kokkos v. 2.0
-//              Copyright (2014) Sandia Corporation
-// 
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
-// the U.S. Government retains certain rights in this software.
-// 
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
-// ************************************************************************
-//@HEADER
-*/
-
-#include <Kokkos_Core.hpp>
-
-#include <G2L.hpp>
-
-namespace G2L {
-
-// Each wrapper below runs run_test on one Kokkos backend when that backend
-// is compiled in, and otherwise returns 0 without doing anything.
-
-size_t run_serial(unsigned num_ids, unsigned num_find_iterations)
-{
-#if defined( KOKKOS_HAVE_SERIAL )
-  std::cout << "Serial" << std::endl;
-  return run_test<Kokkos::Serial>(num_ids,num_find_iterations);
-#else
-  return 0;
-#endif // defined( KOKKOS_HAVE_SERIAL )
-}
-
-size_t run_threads(unsigned num_ids, unsigned num_find_iterations)
-{
-#if defined( KOKKOS_HAVE_PTHREAD )
-  std::cout << "Threads" << std::endl;
-  return run_test<Kokkos::Threads>(num_ids,num_find_iterations);
-#else
-  return 0;
-#endif // defined( KOKKOS_HAVE_PTHREAD )
-}
-
-size_t run_openmp(unsigned num_ids, unsigned num_find_iterations)
-{
-#if defined( KOKKOS_HAVE_OPENMP )
-  std::cout << "OpenMP" << std::endl;
-  return run_test<Kokkos::OpenMP>(num_ids,num_find_iterations);
-#else
-  return 0;
-#endif // defined( KOKKOS_HAVE_OPENMP )
-}
-
-size_t run_cuda(unsigned num_ids, unsigned num_find_iterations)
-{
-#if defined( KOKKOS_HAVE_CUDA )
-  std::cout << "Cuda" << std::endl;
-  return run_test<Kokkos::Cuda>(num_ids,num_find_iterations);
-#else
-  return 0;
-#endif // defined( KOKKOS_HAVE_CUDA )
-}
-
-} // namespace G2L
-
-
-// Driver: runs the global-to-local id test on every compiled-in backend.
-//
-// Usage: <exe> [num_ids num_find_iterations]
-//   With no arguments the defaults 100000 and 1000 are used.  Any other
-//   argument count prints the usage line and exits with status 0.
-//
-// Exit status: 0 when no errors were counted, 1 when at least one error was
-// counted or a negative argument was given.
-int main(int argc, char *argv[])
-{
-  unsigned num_ids = 100000;
-  unsigned num_find_iterations = 1000;
-
-  if (argc == 3) {
-    const int ids_arg  = atoi(argv[1]);
-    const int iter_arg = atoi(argv[2]);
-
-    // A negative value would wrap to a huge unsigned count; reject it.
-    if (ids_arg < 0 || iter_arg < 0) {
-      std::cout << argv[0] << " num_ids num_find_iterations" << std::endl;
-      return 1;
-    }
-
-    num_ids = static_cast<unsigned>(ids_arg);
-    num_find_iterations = static_cast<unsigned>(iter_arg);
-  }
-  else if (argc != 1) {
-    std::cout << argv[0] << " num_ids num_find_iterations" << std::endl;
-    return 0;
-  }
-
-
-  // query the topology of the host
-  unsigned threads_count = 4 ;
-
-  if (Kokkos::hwloc::available()) {
-    threads_count = Kokkos::hwloc::get_available_numa_count() *
-                    Kokkos::hwloc::get_available_cores_per_numa() *
-                    Kokkos::hwloc::get_available_threads_per_core();
-
-  }
-
-  std::cout << "Threads: " << threads_count << std::endl;
-  std::cout << "Number of ids: " << num_ids << std::endl;
-  std::cout << "Number of find iterations: " << num_find_iterations << std::endl;
-
-  size_t num_errors = 0;
-
-  // NOTE(review): the serial run happens before any initialize() call in
-  // this function -- confirm Kokkos::Serial needs none in this version.
-  num_errors += G2L::run_serial(num_ids,num_find_iterations);
-
-#ifdef KOKKOS_HAVE_CUDA
-  Kokkos::HostSpace::execution_space::initialize(threads_count);
-  Kokkos::Cuda::initialize( Kokkos::Cuda::SelectDevice(0) );
-  num_errors += G2L::run_cuda(num_ids,num_find_iterations);
-  Kokkos::Cuda::finalize();
-  Kokkos::HostSpace::execution_space::finalize();
-#endif
-
-#ifdef KOKKOS_HAVE_PTHREAD
-  Kokkos::Threads::initialize( threads_count );
-  num_errors += G2L::run_threads(num_ids,num_find_iterations);
-  Kokkos::Threads::finalize();
-#endif
-
-#ifdef KOKKOS_HAVE_OPENMP
-  Kokkos::OpenMP::initialize( threads_count );
-  num_errors += G2L::run_openmp(num_ids,num_find_iterations);
-  Kokkos::OpenMP::finalize();
-#endif
-
-
-  // Returning the raw count would narrow it to the exit status, so a count
-  // that is a multiple of 256 would be reported as success on POSIX systems.
-  return num_errors == 0 ? 0 : 1;
-}
-
--- a/lib/kokkos/example/global_2_local_ids/Makefile
+++ b/lib/kokkos/example/global_2_local_ids/Makefile
@ -1,53 +0,0 @@
-# Builds this example against Kokkos using the variables provided by
-# $(KOKKOS_PATH)/Makefile.kokkos.  The executable is named after the
-# directory that holds this Makefile, with a .cuda or .host suffix.
-KOKKOS_PATH ?= ../..
-
-MAKEFILE_PATH := $(abspath $(lastword $(MAKEFILE_LIST)))
-SRC_DIR := $(dir $(MAKEFILE_PATH))
-
-SRC = $(wildcard $(SRC_DIR)/*.cpp)
-OBJ = $(SRC:$(SRC_DIR)/%.cpp=%.o)
-
-#SRC = $(wildcard *.cpp)
-#OBJ = $(SRC:%.cpp=%.o)
-
-# First rule in the file, hence the default goal.  The echo is a recipe
-# line, so it runs after 'build' has completed.
-default: build
-	echo "Start Build"
-
-# These goals name actions rather than files; declaring them phony keeps a
-# stray file called 'default', 'build' or 'clean' from suppressing them.
-.PHONY: default build clean
-
-# use installed Makefile.kokkos
-include $(KOKKOS_PATH)/Makefile.kokkos
-
-# Pick the compiler and the executable suffix from the enabled devices.
-ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES)))
-CXX = $(NVCC_WRAPPER)
-CXXFLAGS = -I$(SRC_DIR) -O3
-LINK = $(CXX)
-LINKFLAGS =
-EXE = $(addsuffix .cuda, $(shell basename $(SRC_DIR)))
-#KOKKOS_DEVICES = "Cuda,OpenMP"
-#KOKKOS_ARCH = "SNB,Kepler35"
-else
-CXX = g++
-CXXFLAGS = -I$(SRC_DIR) -O3
-LINK = $(CXX)
-LINKFLAGS =
-EXE = $(addsuffix .host, $(shell basename $(SRC_DIR)))
-#KOKKOS_DEVICES = "OpenMP"
-#KOKKOS_ARCH = "SNB"
-endif
-
-DEPFLAGS = -M
-
-LIB =
-
-
-build: $(EXE)
-
-$(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS)
-	$(LINK) $(KOKKOS_LDFLAGS) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE)
-
-clean:
-	rm -f *.a *.o *.cuda *.host
-
-# Compilation rules
-
-%.o:$(SRC_DIR)/%.cpp $(KOKKOS_CPP_DEPENDS)
-	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $<
-
--- a/lib/kokkos/example/grow_array/CMakeLists.txt
+++ b/lib/kokkos/example/grow_array/CMakeLists.txt
@ -1,14 +0,0 @@
-
-# Example executable: parallel dynamic growth of an array.
-INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR})
-INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
-
-# List the sources explicitly instead of globbing: a glob is evaluated only
-# when CMake configures, so files added later are silently missed until the
-# next reconfigure.  main.cpp is the only translation unit of this example.
-SET(SOURCES
-  main.cpp
-  )
-
-TRIBITS_ADD_EXECUTABLE(
-  grow_array
-  SOURCES ${SOURCES}
-  COMM serial mpi
-  )
-
--- a/lib/kokkos/example/grow_array/Makefile
+++ b/lib/kokkos/example/grow_array/Makefile
@ -1,53 +0,0 @@
-# Builds this example against Kokkos using the variables provided by
-# $(KOKKOS_PATH)/Makefile.kokkos.  The executable is named after the
-# directory that holds this Makefile, with a .cuda or .host suffix.
-KOKKOS_PATH ?= ../..
-
-MAKEFILE_PATH := $(abspath $(lastword $(MAKEFILE_LIST)))
-SRC_DIR := $(dir $(MAKEFILE_PATH))
-
-SRC = $(wildcard $(SRC_DIR)/*.cpp)
-OBJ = $(SRC:$(SRC_DIR)/%.cpp=%.o)
-
-#SRC = $(wildcard *.cpp)
-#OBJ = $(SRC:%.cpp=%.o)
-
-# First rule in the file, hence the default goal.  The echo is a recipe
-# line, so it runs after 'build' has completed.
-default: build
-	echo "Start Build"
-
-# These goals name actions rather than files; declaring them phony keeps a
-# stray file called 'default', 'build' or 'clean' from suppressing them.
-.PHONY: default build clean
-
-# use installed Makefile.kokkos
-include $(KOKKOS_PATH)/Makefile.kokkos
-
-# Pick the compiler and the executable suffix from the enabled devices.
-ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES)))
-CXX = $(NVCC_WRAPPER)
-CXXFLAGS = -I$(SRC_DIR) -O3
-LINK = $(CXX)
-LINKFLAGS =
-EXE = $(addsuffix .cuda, $(shell basename $(SRC_DIR)))
-#KOKKOS_DEVICES = "Cuda,OpenMP"
-#KOKKOS_ARCH = "SNB,Kepler35"
-else
-CXX = g++
-CXXFLAGS = -I$(SRC_DIR) -O3
-LINK = $(CXX)
-LINKFLAGS =
-EXE = $(addsuffix .host, $(shell basename $(SRC_DIR)))
-#KOKKOS_DEVICES = "OpenMP"
-#KOKKOS_ARCH = "SNB"
-endif
-
-DEPFLAGS = -M
-
-LIB =
-
-
-build: $(EXE)
-
-$(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS)
-	$(LINK) $(KOKKOS_LDFLAGS) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE)
-
-clean:
-	rm -f *.a *.o *.cuda *.host
-
-# Compilation rules
-
-%.o:$(SRC_DIR)/%.cpp $(KOKKOS_CPP_DEPENDS)
-	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $<
-
--- a/lib/kokkos/example/grow_array/grow_array.hpp
+++ b/lib/kokkos/example/grow_array/grow_array.hpp
@ -1,257 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-// 
-//                        Kokkos v. 2.0
-//              Copyright (2014) Sandia Corporation
-// 
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
-// the U.S. Government retains certain rights in this software.
-// 
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
-// ************************************************************************
-//@HEADER
-*/
-
-#ifndef EXAMPLE_GROW_ARRAY
-#define EXAMPLE_GROW_ARRAY
-
-#include <stdlib.h>
-
-#include <Kokkos_Core.hpp>
-
-#include <algorithm>
-
-#if defined(KOKKOS_HAVE_CUDA)
-#include <thrust/device_ptr.h>
-#include <thrust/sort.h>
-#endif
-
-namespace Example {
-
-//----------------------------------------------------------------------------
-
-// Constructing a SortView sorts entries [begin, end) of a rank-one view in
-// place.  The primary template sorts through the view's raw pointer with
-// std::sort.
-template< class ExecSpace >
-struct SortView {
-
-  template< typename ValueType >
-  SortView( const Kokkos::View<ValueType*,ExecSpace> v , int begin , int end )
-    {
-      ValueType * const first = v.ptr_on_device() + begin ;
-      ValueType * const last  = v.ptr_on_device() + end ;
-
-      std::sort( first , last );
-    }
-};
-
-#if defined(KOKKOS_HAVE_CUDA)
-// Cuda specialization: the same range is sorted with thrust::sort through
-// thrust::device_ptr wrappers.
-template<>
-struct SortView< Kokkos::Cuda > {
-  template< typename ValueType >
-  SortView( const Kokkos::View<ValueType*,Kokkos::Cuda> v , int begin , int end )
-    {
-      const thrust::device_ptr<ValueType> first( v.ptr_on_device() + begin );
-      const thrust::device_ptr<ValueType> last( v.ptr_on_device() + end );
-
-      thrust::sort( first , last );
-    }
-};
-#endif
-
-
-
-//----------------------------------------------------------------------------
-
-// Team functor that scans a bit-flag array and appends the index of every
-// set flag to a shared output array, using a team scan against a global
-// counter to reserve output positions.
-//
-//   array_length  : length of the output array 'm_search_array'
-//   search_length : number of flag bits that are searched
-//   print         : accepted by the constructor but not used by it
-template< class ExecSpace >
-struct GrowArrayFunctor {
-
-  typedef ExecSpace  execution_space ;
-
-  // SHIFT is log2 of the number of bits in an int, MASK selects the bit
-  // position within one int of flags.
-  enum { SHIFT = sizeof(int) == 8 ? 6 : 5 }; // 8 or 4 byte int
-  enum { MASK  = ( 1 << SHIFT ) - 1 };
-
-  const Kokkos::View<int*,ExecSpace>  m_search_flags ; // bit flags for values to append
-  const Kokkos::View<int*,ExecSpace>  m_search_array ; // array to append values
-  const Kokkos::View<int,ExecSpace>   m_search_count ; // offset
-  const int m_search_total ;      // number of flag bits to search
-  const int m_search_team_chunk ; // indices searched by one team (2048)
-
-  GrowArrayFunctor( int array_length , int search_length , int print = 1 )
-    : m_search_flags( "flags" , ( search_length + MASK ) >> SHIFT ) // One bit per search entry
-    , m_search_array( "array" , array_length )
-    , m_search_count( "count" )
-    , m_search_total( search_length )
-    , m_search_team_chunk( 2048 )
-    {}
-
-  // True when bit 'index' of the flag array is set; an index whose word lies
-  // past the end of the flag array reads as not set.
-  KOKKOS_INLINE_FUNCTION
-  bool flag_is_set( const int index ) const
-    {
-      // 64 or 32 bit integer:
-
-      const int j = index >> SHIFT ; // which integer flag
-      const int k = 1 << ( index & MASK ); // which bit in that integer
-      const int s = ( j < int(m_search_flags.dimension_0()) ) && ( 0 != ( m_search_flags(j) & k ) );
-
-      return s ;
-    }
-
-  typedef typename Kokkos::TeamPolicy<ExecSpace>::member_type team_member ;
-
-  KOKKOS_INLINE_FUNCTION
-  void operator()( const team_member & member ) const
-    {
-      // Per-thread staging buffer; it is flushed at the latest after
-      // LOCAL_BUFFER_LENGTH iterations, so it cannot overflow.
-      enum { LOCAL_BUFFER_LENGTH = 16 };
-
-      int local_buffer[ LOCAL_BUFFER_LENGTH ] ;
-      int local_count = 0 ;
-
-      // Each team searches 'm_search_team_chunk' indices.
-      // The threads of a team must iterate together because all
-      // threads in the team must call 'team_scan' to prevent deadlock in the team.
-
-            int search_team_begin = member.league_rank() * m_search_team_chunk ;
-      const int search_team_end   = search_team_begin + m_search_team_chunk ;
-
-      int k = 0 ;
-
-      while ( search_team_begin < search_team_end ) {
-
-        // This iteration searches [ search_team_begin .. search_team_begin + member.team_size() )
-        const int thread_search_index = search_team_begin + member.team_rank();
-
-        // If this thread's search index is in the range
-        // and the flag is set, push into this thread's local buffer.
-        if ( thread_search_index < m_search_total && flag_is_set(thread_search_index) ) {
-          local_buffer[ local_count ] = thread_search_index ;
-          ++local_count ;
-        }
-
-        // Move the team's search range forward
-        search_team_begin += member.team_size(); // Striding team by team size
-
-        // Count number of times a thread's buffer might have grown:
-        ++k ;
-
-        // Write buffer if end of search or a thread might have filled its buffer.
-        if ( k == LOCAL_BUFFER_LENGTH /* A thread in my team might have filled its buffer */ ||
-             ! ( search_team_begin < search_team_end ) /* Team is at the end of its search */ ) {
-
-          // Team's exclusive scan of threads' contributions, with global offset.
-          // This thread writes its buffer into [ team_offset .. team_offset + local_count )
-          const int team_offset = member.team_scan( local_count , & *m_search_count );
-
-          // Copy locally buffered entries into global array:
-          for ( int i = 0 ; i < local_count ; ++i ) {
-            m_search_array( team_offset + i ) = local_buffer[i] ;
-          }
-
-          // Start a fresh buffering round.
-          k = 0 ;
-          local_count = 0 ;
-        }
-      }
-    }
-};
-
-
-// Exercise GrowArrayFunctor on 'ExecSpace' and verify its output.
-//
-// Sets up to 'array_length' random bits among 'search_length' flag bits,
-// runs the functor to collect the indices of the set bits, sorts them, and
-// checks on the host that every collected index had its flag set and that
-// every set flag was collected.
-//
-//   array_length  : number of random bits to set / capacity of the result
-//   search_length : number of flag bits searched
-//   print         : when non-zero, each individual mismatch is written to
-//                   std::cerr; the summary line is written on any failure
-template< class ExecSpace >
-void grow_array( int array_length , int search_length , int print = 1 )
-{
-  typedef GrowArrayFunctor< ExecSpace > FunctorType ;
-
-  FunctorType functor( array_length , search_length , print );
-
-  typename Kokkos::View<int,ExecSpace>::HostMirror  count = Kokkos::create_mirror_view( functor.m_search_count );
-  typename Kokkos::View<int*,ExecSpace>::HostMirror flags = Kokkos::create_mirror_view( functor.m_search_flags );
-
-  // Set at most 'array_length' random bits over the search length.
-  for ( int i = 0 ; i < array_length ; ++i ) {
-    // 'lrand48()' returns a value in [0 .. 2^31), so
-    //   index = ( lrand48() * search_length ) / 2^31
-    // lies in [0 .. search_length).  The product is formed in 64 bits so
-    // that it cannot overflow where 'long' is only 32 bits wide.
-    const long int index =
-      static_cast<long int>( ( static_cast<long long>( lrand48() ) * search_length ) >> 31 );
-    // set the bit within the flags:
-    // an unsigned one is shifted because '1 << 31' overflows a signed int.
-    flags( index >> FunctorType::SHIFT ) |= static_cast<int>( 1u << ( index & FunctorType::MASK ) );
-  }
-
-  Kokkos::deep_copy( functor.m_search_flags , flags );
-
-  // Each team works on 'functor.m_search_team_chunk' span of the search_length
-  Kokkos::TeamPolicy< ExecSpace >
-    work( /* #teams */ ( search_length + functor.m_search_team_chunk - 1 ) / functor.m_search_team_chunk
-        , /* threads/team */ Kokkos::TeamPolicy< ExecSpace >::team_size_max( functor ) );
-
-  // Fill array:
-  Kokkos::parallel_for( work , functor );
-
-  // How much was filled:
-  Kokkos::deep_copy( count , functor.m_search_count );
-
-  // Sort array:
-  SortView< ExecSpace >( functor.m_search_array , 0 , *count );
-
-  // Mirror the results:
-  typename Kokkos::View<int*,ExecSpace>::HostMirror results = Kokkos::create_mirror_view( functor.m_search_array );
-  Kokkos::deep_copy( results , functor.m_search_array );
-
-  // Verify results:
-  int result_error_count = 0 ;
-  int flags_error_count = 0 ;
-  for ( int i = 0 ; i < *count ; ++i ) {
-    const int index = results(i);
-    const int entry = index >> FunctorType::SHIFT ;
-    const int bit   = static_cast<int>( 1u << ( index & FunctorType::MASK ) );
-    const bool flag = 0 != ( flags( entry ) & bit );
-    if ( ! flag ) {
-      if ( print ) std::cerr << "result( " << i << " : " << index << " )";
-      ++result_error_count ;
-    }
-    flags( entry ) &= ~bit ; // Clear that verified bit
-  }
-
-  for ( int i = 0 ; i < int(flags.dimension_0()) ; ++i ) {
-    // If any uncleared bits then an error
-    if ( flags(i) ) {
-      if ( print ) std::cerr << "flags( " << i << " : " << flags(i) << " )" ;
-      ++flags_error_count ;
-    }
-  }
-
-  if ( result_error_count || flags_error_count ) {
-    // Each count is printed as 'name( value )' with balanced parentheses.
-    std::cerr << std::endl << "Example::GrowArrayFunctor( " << array_length
-              << " , " << search_length
-              << " ) result_error_count( " << result_error_count << " )"
-              << " flags_error_count( " << flags_error_count << " )"
-              << std::endl ;
-  }
-}
-
-
-} // namespace Example
-
-//----------------------------------------------------------------------------
-
-#endif /* #ifndef EXAMPLE_GROW_ARRAY */
-
--- a/lib/kokkos/example/grow_array/main.cpp
+++ b/lib/kokkos/example/grow_array/main.cpp
@ -1,110 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-// 
-//                        Kokkos v. 2.0
-//              Copyright (2014) Sandia Corporation
-// 
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
-// the U.S. Government retains certain rights in this software.
-// 
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
-// ************************************************************************
-//@HEADER
-*/
-
-#include <iostream>
-#include <sstream>
-
-#include <Kokkos_Core.hpp>
-
-#include <grow_array.hpp>
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-// Driver: runs Example::grow_array once on every compiled-in backend with a
-// result array of 1000000 entries and 100000000 searched flag bits.
-// Command-line arguments are ignored; the return value is always 0.
-int main( int argc , char ** argv )
-{
-  const int array_length  = 1000000 ;
-  const int search_length = 100000000 ;
-
-  // Defaults used when hwloc cannot report the topology.
-  int numa_count     = 1 ;
-  int cores_per_numa = 1 ;
-  int thread_count   = 4 ;
-
-  if ( Kokkos::hwloc::available() ) {
-    numa_count     = Kokkos::hwloc::get_available_numa_count();
-    // One core per NUMA region is deliberately left out.
-    // NOTE(review): this yields zero threads when a region has a single
-    // core -- confirm the backends accept that.
-    cores_per_numa = Kokkos::hwloc::get_available_cores_per_numa() - 1 ;
-    thread_count   = numa_count * cores_per_numa * Kokkos::hwloc::get_available_threads_per_core();
-  }
-
-#ifdef KOKKOS_HAVE_SERIAL
-  {
-    std::cout << "Kokkos::Serial" << std::endl ;
-    // The Serial device accepts these arguments, though it may ignore them.
-    Kokkos::Serial::initialize( thread_count , numa_count , cores_per_numa );
-    Example::grow_array< Kokkos::Serial >( array_length , search_length );
-    Kokkos::Serial::finalize ();
-  }
-#endif // KOKKOS_HAVE_SERIAL
-
-#ifdef KOKKOS_HAVE_PTHREAD
-  {
-    std::cout << "Kokkos::Threads" << std::endl ;
-    Kokkos::Threads::initialize( thread_count , numa_count , cores_per_numa );
-    Example::grow_array< Kokkos::Threads >( array_length , search_length );
-    Kokkos::Threads::finalize();
-  }
-#endif // KOKKOS_HAVE_PTHREAD
-
-#ifdef KOKKOS_HAVE_OPENMP
-  {
-    std::cout << "Kokkos::OpenMP" << std::endl ;
-    Kokkos::OpenMP::initialize( thread_count , numa_count , cores_per_numa );
-    Example::grow_array< Kokkos::OpenMP >( array_length , search_length );
-    Kokkos::OpenMP::finalize();
-  }
-#endif // KOKKOS_HAVE_OPENMP
-
-#ifdef KOKKOS_HAVE_CUDA
-  {
-    std::cout << "Kokkos::Cuda" << std::endl ;
-    // The host execution space is brought up first and torn down last.
-    Kokkos::HostSpace::execution_space::initialize(1);
-    Kokkos::Cuda::initialize();
-    Example::grow_array< Kokkos::Cuda >( array_length , search_length );
-    Kokkos::Cuda::finalize();
-    Kokkos::HostSpace::execution_space::finalize();
-  }
-#endif // KOKKOS_HAVE_CUDA
-
-  return 0 ;
-}
-
--- a/lib/kokkos/example/md_skeleton/CMakeLists.txt
+++ b/lib/kokkos/example/md_skeleton/CMakeLists.txt
@ -1,16 +0,0 @@
-
-# Example executable: molecular-dynamics skeleton, built from every .cpp
-# file found in this directory when CMake configures.
-include_directories(
-  ${CMAKE_CURRENT_BINARY_DIR}
-  ${CMAKE_CURRENT_SOURCE_DIR}
-  )
-
-set(LIBRARIES "")
-
-file(GLOB SOURCES *.cpp)
-
-tribits_add_executable(
-  md_skeleton
-  SOURCES ${SOURCES}
-  COMM serial mpi
-  DEPLIBS ${LIBRARIES}
-  )
-
--- a/lib/kokkos/example/md_skeleton/Makefile
+++ b/lib/kokkos/example/md_skeleton/Makefile
@ -1,53 +0,0 @@
-# Builds this example against Kokkos using the variables provided by
-# $(KOKKOS_PATH)/Makefile.kokkos.  The executable is named after the
-# directory that holds this Makefile, with a .cuda or .host suffix.
-KOKKOS_PATH ?= ../..
-
-MAKEFILE_PATH := $(abspath $(lastword $(MAKEFILE_LIST)))
-SRC_DIR := $(dir $(MAKEFILE_PATH))
-
-SRC = $(wildcard $(SRC_DIR)/*.cpp)
-OBJ = $(SRC:$(SRC_DIR)/%.cpp=%.o)
-
-#SRC = $(wildcard *.cpp)
-#OBJ = $(SRC:%.cpp=%.o)
-
-# First rule in the file, hence the default goal.  The echo is a recipe
-# line, so it runs after 'build' has completed.
-default: build
-	echo "Start Build"
-
-# These goals name actions rather than files; declaring them phony keeps a
-# stray file called 'default', 'build' or 'clean' from suppressing them.
-.PHONY: default build clean
-
-# use installed Makefile.kokkos
-include $(KOKKOS_PATH)/Makefile.kokkos
-
-# Pick the compiler and the executable suffix from the enabled devices.
-ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES)))
-CXX = $(NVCC_WRAPPER)
-CXXFLAGS = -I$(SRC_DIR) -O3
-LINK = $(CXX)
-LINKFLAGS =
-EXE = $(addsuffix .cuda, $(shell basename $(SRC_DIR)))
-#KOKKOS_DEVICES = "Cuda,OpenMP"
-#KOKKOS_ARCH = "SNB,Kepler35"
-else
-CXX = g++
-CXXFLAGS = -I$(SRC_DIR) -O3
-LINK = $(CXX)
-LINKFLAGS =
-EXE = $(addsuffix .host, $(shell basename $(SRC_DIR)))
-#KOKKOS_DEVICES = "OpenMP"
-#KOKKOS_ARCH = "SNB"
-endif
-
-DEPFLAGS = -M
-
-LIB =
-
-
-build: $(EXE)
-
-$(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS)
-	$(LINK) $(KOKKOS_LDFLAGS) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE)
-
-clean:
-	rm -f *.a *.o *.cuda *.host
-
-# Compilation rules
-
-%.o:$(SRC_DIR)/%.cpp $(KOKKOS_CPP_DEPENDS)
-	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $<
-
--- a/lib/kokkos/example/md_skeleton/README
+++ b/lib/kokkos/example/md_skeleton/README
@ -1,3 +0,0 @@
-To build this example on a 2012-model Macbook Pro with NVIDIA Kepler GPU:
-
-./build.cuda_std g++_osx cuda_osx 30 opt
--- a/lib/kokkos/example/md_skeleton/force.cpp
+++ b/lib/kokkos/example/md_skeleton/force.cpp
@ -1,192 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-// 
-//                        Kokkos v. 2.0
-//              Copyright (2014) Sandia Corporation
-// 
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
-// the U.S. Government retains certain rights in this software.
-// 
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
-// ************************************************************************
-//@HEADER
-*/
-
-/* Define values which set the maximum number of registers used by the force kernel.
- * The limit is 32 * 2048 / (KOKKOS_CUDA_MAX_THREADS * KOKKOS_CUDA_MIN_BLOCKS).
- * They have to be set before including the Kokkos header files.
- */
-
-#define KOKKOS_CUDA_MAX_THREADS 512
-#define KOKKOS_CUDA_MIN_BLOCKS 3
-
-#include <system.h>
-#include <cstdio>
-
-
-/* Simple Lennard-Jones force kernel using neighbor lists.
- * For every pair of atoms (i,j) with a distance smaller than r_cut it uses
- * the pair potential
- * e_ij = 4*epsilon * ( (sigma/r_ij)^12 - (sigma/r_ij)^6 )
- * where r_ij is the distance of atoms (i,j); f_ij is the force derived from it.
- * The force on atom i is the sum over f_ij:
- * f_i = sum_j (f_ij)
- * Neighbor lists are used in order to precalculate which atoms j are
- * close enough to i to be able to contribute. By choosing a larger neighbor
- * cutoff than the force cutoff, the neighbor list can be reused several times
- * (typically 10 - 100).
- */
-
-/* Kokkos functor that evaluates the pair force on one atom per call.
- * It can be dispatched either with parallel_for (forces only) or with
- * parallel_reduce (forces plus an energy/virial reduction of type double2).
- */
-struct ForceFunctor {
-
-  typedef t_x_array::execution_space execution_space; //Device Type for running the kernel
-  typedef double2 value_type; // Reduction type: x holds the energy, y the virial
-
-  t_x_array_randomread x;       //atom positions
-  t_f_array f;                  //atom forces (accumulated into by the kernel)
-  t_int_1d_const numneigh;      //number of neighbors per atom
-  t_neighbors_const neighbors;  //neighborlist, indexed as (atom, neighbor slot)
-  double cutforcesq;            //squared force cutoff (compared against r^2)
-  double epsilon;               //Potential parameter
-  double sigma6;                //Potential parameter (multiplies 1/r^6)
-
-
-  /* Copy the view handles and the squared cutoff out of the System;
-   * both potential parameters are fixed to 1.0. */
-  ForceFunctor(System s) {
-    x = s.d_x;
-    f = s.f;
-    numneigh = s.numneigh;
-    neighbors = s.neighbors;
-    cutforcesq = s.force_cutsq;
-    epsilon = 1.0;
-    sigma6 = 1.0;
-  }
-
-  /* Operator for not calculating energy and virial */
-
-  KOKKOS_INLINE_FUNCTION
-  void operator() (const int &i) const {
-    force<0>(i);
-  }
-
-  /* Operator for calculating energy and virial */
-
-  KOKKOS_INLINE_FUNCTION
-  void operator() (const int &i, double2 &energy_virial) const {
-    double2 ev = force<1>(i);
-    energy_virial.x += ev.x;
-    energy_virial.y += ev.y;
-  }
-
-  /* Accumulate the force on atom i from all of its listed neighbors and add
-   * it to f(i, 0..2). When EVFLAG is non-zero the energy and virial sums of
-   * atom i are accumulated as well. Returns (4 * energy sum, 0.5 * virial sum);
-   * both components are zero when EVFLAG is 0.
-   */
-  template<int EVFLAG>
-  KOKKOS_INLINE_FUNCTION
-  double2 force(const int &i) const
-  {
-    const int numneighs = numneigh[i];
-    const double xtmp = x(i, 0);
-    const double ytmp = x(i, 1);
-    const double ztmp = x(i, 2);
-    double fix = 0;
-    double fiy = 0;
-    double fiz = 0;
-    double energy = 0;
-    double virial = 0;
-
-    //pragma simd forces vectorization (ignoring the performance objections of the compiler)
-    //give hint to compiler that fix, fiy and fiz are used for reduction only
-
-  #ifdef USE_SIMD
-    #pragma simd reduction (+: fix,fiy,fiz,energy,virial)
-  #endif
-    for(int k = 0; k < numneighs; k++) {
-      const int j = neighbors(i, k);
-      const double delx = xtmp - x(j, 0);
-      const double dely = ytmp - x(j, 1);
-      const double delz = ztmp - x(j, 2);
-      const double rsq = delx * delx + dely * dely + delz * delz;
-
-      //if(i==0) printf("%i %i %lf %lf\n",i,j,rsq,cutforcesq);
-      // Only pairs inside the force cutoff contribute.
-      if(rsq < cutforcesq) {
-        const double sr2 = 1.0 / rsq;
-        const double sr6 = sr2 * sr2 * sr2  * sigma6;
-        // Force magnitude divided by the distance; multiplied by the
-        // displacement components below.
-        const double force = 48.0 * sr6 * (sr6 - 0.5) * sr2 * epsilon;
-        fix += delx * force;
-        fiy += dely * force;
-        fiz += delz * force;
-
-        // EVFLAG is a template constant, so this test is resolved at compile time.
-        if(EVFLAG) {
-          energy += sr6 * (sr6 - 1.0) * epsilon;
-          virial += delx * delx * force + dely * dely * force + delz * delz * force;
-        }
-      }
-    }
-
-    // Each call writes only row i of f.
-    f(i, 0) += fix;
-    f(i, 1) += fiy;
-    f(i, 2) += fiz;
-
-    double2 energy_virial ;
-    energy_virial.x = 4.0 * energy ;
-    energy_virial.y = 0.5 * virial ;
-    return energy_virial;
-  }
-
-  /* init and join functions when doing the reduction to obtain energy and virial */
-
-  // Reset a reduction value to (0, 0).
-  KOKKOS_FUNCTION
-  static void init(volatile value_type &update) {
-    update.x = update.y = 0;
-  }
-  // Combine two partial reduction values component-wise.
-  KOKKOS_FUNCTION
-  static void join(volatile value_type &update ,
-                   const volatile value_type &source) {
-    update.x += source.x ;
-    update.y += source.y ;
-  }
-
-};
-
-
-/* Calling function */
-
-/* Run the force kernel over all local atoms of s.
- * With evflag == 0 only the forces are updated and (0, 0) is returned;
- * otherwise the kernel is run as a reduction and the summed
- * (energy, virial) pair is returned. The call fences before returning.
- */
-double2 force(System &s,int evflag) {
-  ForceFunctor kernel(s);
-
-  // Stays (0, 0) unless the reduction below fills it in.
-  double2 energy_virial;
-  energy_virial.x = 0;
-  energy_virial.y = 0;
-
-  if(evflag) {
-    Kokkos::parallel_reduce(s.nlocal, kernel, energy_virial);
-  } else {
-    Kokkos::parallel_for(s.nlocal, kernel);
-  }
-
-  execution_space::fence();
-  return energy_virial;
-}
-
--- a/lib/kokkos/example/md_skeleton/main.cpp
+++ b/lib/kokkos/example/md_skeleton/main.cpp
@ -1,205 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-// 
-//                        Kokkos v. 2.0
-//              Copyright (2014) Sandia Corporation
-// 
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
-// the U.S. Government retains certain rights in this software.
-// 
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
-// ************************************************************************
-//@HEADER
-*/
-
-#include <cstdio>
-#include <cstring>
-#include <cstdlib>
-#include "system.h"
-
-/* Entry points implemented in setup.cpp, neighbor.cpp and force.cpp.
- * The return types must match the definitions: neigh_setup and neigh_build
- * are defined as returning void in neighbor.cpp.
- */
-int create_system(System &system, int nx, int ny, int nz, double rho);
-void neigh_setup(System &system);
-void neigh_build(System &system);
-double2 force(System &system,int evflag);
-
-/* simple MD Skeleton which
- *   - constructs a simple FCC lattice,
- *   - computes a neighborlist
- *   - compute LJ-Force kernel a number of times
- */
-
-/* Return the value that follows the option at argv[i] and advance i to it.
- * Terminates the program with an error message when the option is the last
- * argument, instead of reading past the end of argv.
- */
-static const char* option_value(int argc, char** argv, int &i) {
-  if(i + 1 >= argc) {
-    fprintf(stderr, "Error: option '%s' requires a value\n", argv[i]);
-    exit(EXIT_FAILURE);
-  }
-  return argv[++i];
-}
-
-/* Driver: parse the command line, initialize Kokkos, build the system and its
- * neighbor list, compute energy/virial once and then time `iter` force
- * evaluations. Returns 0 on success and a non-zero value on invalid input or
- * when the system could not be created.
- */
-int main(int argc, char** argv) {
-
-  printf("Running MD Skeleton\n");
-  /* Thread numbers for Host */
-
-  int num_threads = 1;
-  int teams = 1;
-  int device = 0; // Default device for GPU runs
-
-  /* avoid unused variable warnings */
-  (void)num_threads;
-  (void)teams;
-  (void)device;
-
-  /* Default value for number of force calculations */
-
-  int iter = 100;
-
-  /* Default value for system size (4*nx*ny*nz atoms)
-   * nx, ny and nz are set to system_size if not specified on commandline */
-
-  int system_size = 20;
-  int nx = -1;
-  int ny = -1;
-  int nz = -1;
-
-  int neighbor_size = 1; // Default bin size for neighbor list construction
-
-  double rho = 0.8442; // Number density of the system
-  double delta = 0; // Scaling factor for random offsets of atom positions
-
-
-  /* read in command-line arguments (argv[0] is the program name) */
-
-  for(int i = 1; i < argc; i++) {
-    if((strcmp(argv[i], "-t") == 0) || (strcmp(argv[i], "--num_threads") == 0)) {
-      num_threads = atoi(option_value(argc, argv, i));
-      continue;
-    }
-
-    if((strcmp(argv[i], "--teams") == 0)) {
-      teams = atoi(option_value(argc, argv, i));
-      continue;
-    }
-
-    if((strcmp(argv[i], "-d") == 0) || (strcmp(argv[i], "--device") == 0))  {
-      device = atoi(option_value(argc, argv, i));
-      continue;
-    }
-
-    if((strcmp(argv[i], "--delta") == 0)) {
-      delta = atof(option_value(argc, argv, i));
-      continue;
-    }
-
-    if((strcmp(argv[i], "-i") == 0) || (strcmp(argv[i], "--iter") == 0))  {
-      iter = atoi(option_value(argc, argv, i));
-      continue;
-    }
-
-    if((strcmp(argv[i], "-rho") == 0)) {
-      // rho is a floating-point density; atoi would truncate e.g. 0.8442 to 0.
-      rho = atof(option_value(argc, argv, i));
-      continue;
-    }
-
-    if((strcmp(argv[i], "-s") == 0) || (strcmp(argv[i], "--size") == 0)) {
-      system_size = atoi(option_value(argc, argv, i));
-      continue;
-    }
-
-    if((strcmp(argv[i], "-nx") == 0)) {
-      nx = atoi(option_value(argc, argv, i));
-      continue;
-    }
-
-    if((strcmp(argv[i], "-ny") == 0)) {
-      ny = atoi(option_value(argc, argv, i));
-      continue;
-    }
-
-    if((strcmp(argv[i], "-nz") == 0)) {
-      nz = atoi(option_value(argc, argv, i));
-      continue;
-    }
-
-    if((strcmp(argv[i], "-b") == 0) || (strcmp(argv[i], "--neigh_bins") == 0))  {
-      neighbor_size = atoi(option_value(argc, argv, i));
-      continue;
-    }
-  }
-
-  if( nx < 0 ) nx = system_size;
-  if( ny < 0 ) ny = system_size;
-  if( nz < 0 ) nz = system_size;
-
-  /* neighbor_size is used as a divisor for the bin counts below */
-  if( neighbor_size < 1 ) {
-    fprintf(stderr, "Error: neighbor bin size must be positive (got %i)\n", neighbor_size);
-    return EXIT_FAILURE;
-  }
-
-  printf("-> Init Device\n");
-
-#if defined( KOKKOS_HAVE_CUDA )
-  Kokkos::HostSpace::execution_space::initialize(teams*num_threads);
-  Kokkos::Cuda::SelectDevice select_device(device);
-  Kokkos::Cuda::initialize(select_device);
-#elif defined( KOKKOS_HAVE_OPENMP )
-  Kokkos::OpenMP::initialize(teams*num_threads);
-#elif defined( KOKKOS_HAVE_PTHREAD )
-  Kokkos::Threads::initialize(teams*num_threads);
-#endif
-
-  System system;
-  system.neigh_cut = 2.8;
-  system.force_cut = 2.5;
-  system.force_cutsq = system.force_cut*system.force_cut;
-  system.delta = delta;
-
-  printf("-> Build system\n");
-  /* create_system reports failure through a non-zero return value */
-  if(create_system(system,nx,ny,nz,rho) != 0) {
-    fprintf(stderr, "Error: failed to create the system\n");
-    execution_space::finalize();
-    return EXIT_FAILURE;
-  }
-
-  printf("-> Created %i atoms and %i ghost atoms\n",system.nlocal,system.nghost);
-
-  system.nbinx = system.box.xprd/neighbor_size+1;
-  system.nbiny = system.box.yprd/neighbor_size+1;
-  system.nbinz = system.box.zprd/neighbor_size+1;
-
-
-  printf("-> Building Neighborlist\n");
-
-  neigh_setup(system);
-  neigh_build(system);
-
-  /* one evaluation with energy/virial output, outside of the timed loop */
-  double2 ev = force(system,1);
-
-  printf("-> Calculate Energy: %f Virial: %f\n",ev.x,ev.y);
-
-  printf("-> Running %i force calculations\n",iter);
-
-  Kokkos::Impl::Timer timer;
-
-  for(int i=0;i<iter;i++) {
-    force(system,0);
-  }
-
-
-  double time = timer.seconds();
-  printf("Time: %e s for %i iterations with %i atoms\n",time,iter,system.nlocal);
-
-  execution_space::finalize();
-  return 0;
-}
--- a/lib/kokkos/example/md_skeleton/neighbor.cpp
+++ b/lib/kokkos/example/md_skeleton/neighbor.cpp
@ -1,430 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-// 
-//                        Kokkos v. 2.0
-//              Copyright (2014) Sandia Corporation
-// 
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
-// the U.S. Government retains certain rights in this software.
-// 
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
-// ************************************************************************
-//@HEADER
-*/
-
-#include <system.h>
-#include <cstdio>
-#include <Kokkos_Core.hpp>
-
-#define SMALL 1.0e-6
-#define FACTOR 0.999
-
-/* BinningFunctor puts atoms into bins of the simulation box.
- * Neighbor lists are then created by checking only the distances of atoms
- * in adjacent bins. That makes neighbor list construction an O(N) operation.
- */
-
-/* Functor that assigns atom i to its spatial bin. The capacity of a bin is
- * the second extent of s.bins; when a bin overflows, the largest overflowing
- * slot index seen is recorded in s.d_resize(0) so the caller can reallocate.
- */
-struct BinningFunctor {
-  typedef t_int_2d::execution_space execution_space;
-
-  System s;
-
-  int atoms_per_bin;
-
-  BinningFunctor(System _s): s(_s), atoms_per_bin(s.bins.dimension_1()) {}
-
-  KOKKOS_INLINE_FUNCTION
-  void operator() (const int &i) const
-  {
-    const int bin = coord2bin(s.d_x(i, 0), s.d_x(i, 1), s.d_x(i, 2));
-
-    // Reserve the next free slot of the bin.
-    const int slot = Kokkos::atomic_fetch_add(&s.bincount[bin], 1);
-
-    if(slot >= atoms_per_bin) {
-      // Bin is full: remember the slot index instead of storing the atom.
-      if(s.d_resize(0) < slot) {
-        s.d_resize(0) = slot;
-      }
-      return;
-    }
-
-    s.bins(bin, slot) = i;
-  }
-
-  /* Map a position to the linear index of the bin that contains it. */
-  KOKKOS_INLINE_FUNCTION
-  int coord2bin(double x, double y, double z) const
-  {
-    // Per axis: positions at or beyond the box length, inside the box, and
-    // below zero are handled separately.
-    const int bx = (x >= s.box.xprd) ? (int)((x - s.box.xprd) * s.bininvx) + s.nbinx - s.mbinxlo
-                 : (x >= 0.0)        ? (int)(x * s.bininvx) - s.mbinxlo
-                                     : (int)(x * s.bininvx) - s.mbinxlo - 1;
-
-    const int by = (y >= s.box.yprd) ? (int)((y - s.box.yprd) * s.bininvy) + s.nbiny - s.mbinylo
-                 : (y >= 0.0)        ? (int)(y * s.bininvy) - s.mbinylo
-                                     : (int)(y * s.bininvy) - s.mbinylo - 1;
-
-    const int bz = (z >= s.box.zprd) ? (int)((z - s.box.zprd) * s.bininvz) + s.nbinz - s.mbinzlo
-                 : (z >= 0.0)        ? (int)(z * s.bininvz) - s.mbinzlo
-                                     : (int)(z * s.bininvz) - s.mbinzlo - 1;
-
-    return (bz * s.mbiny * s.mbinx + by * s.mbinx + bx + 1);
-  }
-};
-
-/* Build the actual neighborlist*/
-
-/* Functor that fills the neighbor list of atom i from the atoms stored in
- * the bins selected by the stencil. At most maxneighs entries (the second
- * extent of s.neighbors) are stored; when that limit is reached the count is
- * reported through s.d_resize(0).
- */
-struct BuildFunctor {
-
-  typedef t_int_2d::execution_space execution_space;
-
-  System s;
-
-  int maxneighs;
-  BuildFunctor(System _s): s(_s), maxneighs(s.neighbors.dimension_1()) {}
-
-  KOKKOS_INLINE_FUNCTION
-  void operator() (const int &i) const
-  {
-    const t_int_1d_const_um bin_sizes = s.bincount;
-
-    const double xi = s.d_x(i, 0);
-    const double yi = s.d_x(i, 1);
-    const double zi = s.d_x(i, 2);
-
-    const int home_bin = coord2bin(xi, yi, zi);
-
-    int count = 0;
-
-    // visit every bin of the stencil (the home bin is one of them)
-    for(int k = 0; k < s.nstencil; k++) {
-      const int jbin = home_bin + s.d_stencil[k];
-      const bool same_bin = (jbin == home_bin);
-
-      // atoms stored in bin jbin
-      const t_int_1d_const_um members =
-          Kokkos::subview(s.bins,jbin,Kokkos::ALL());
-
-      for(int m = 0; m < bin_sizes[jbin]; m++) {
-        const int j = members[m];
-
-        // within the home bin, atom i is not its own neighbor
-        if(same_bin && j == i) continue;
-
-        const double dx = xi - s.d_x(j, 0);
-        const double dy = yi - s.d_x(j, 1);
-        const double dz = zi - s.d_x(j, 2);
-        const double rsq = dx * dx + dy * dy + dz * dz;
-
-        if(rsq <= s.neigh_cutsq && count<maxneighs) s.neighbors(i,count++) = j;
-      }
-    }
-
-    s.numneigh[i] = count;
-
-    // list is full: request a larger neighbors view
-    if(count >= maxneighs && count >= s.d_resize(0)) {
-      s.d_resize(0) = count;
-    }
-  }
-
-  /* Map a position to the linear index of the bin that contains it. */
-  KOKKOS_INLINE_FUNCTION
-  int coord2bin(double x, double y, double z) const
-  {
-    const int bx = (x >= s.box.xprd) ? (int)((x - s.box.xprd) * s.bininvx) + s.nbinx - s.mbinxlo
-                 : (x >= 0.0)        ? (int)(x * s.bininvx) - s.mbinxlo
-                                     : (int)(x * s.bininvx) - s.mbinxlo - 1;
-
-    const int by = (y >= s.box.yprd) ? (int)((y - s.box.yprd) * s.bininvy) + s.nbiny - s.mbinylo
-                 : (y >= 0.0)        ? (int)(y * s.bininvy) - s.mbinylo
-                                     : (int)(y * s.bininvy) - s.mbinylo - 1;
-
-    const int bz = (z >= s.box.zprd) ? (int)((z - s.box.zprd) * s.bininvz) + s.nbinz - s.mbinzlo
-                 : (z >= 0.0)        ? (int)(z * s.bininvz) - s.mbinzlo
-                                     : (int)(z * s.bininvz) - s.mbinzlo - 1;
-
-    return (bz * s.mbiny * s.mbinx + by * s.mbinx + bx + 1);
-  }
-};
-
-/* Reset an array to zero */
-
-/* Functor that writes 0 to element i of the int array that ptr points to. */
-struct MemsetZeroFunctor {
-  typedef t_x_array::execution_space  execution_space ;
-  void* ptr;
-  KOKKOS_INLINE_FUNCTION void operator()(const int i) const {
-    int* const values = static_cast<int*>(ptr);
-    values[i] = 0;
-  }
-};
-
-/* Calculate distance of two bins */
-
-/* Squared distance between the central bin and the bin at offset (i, j, k).
- * Along each axis an offset of magnitude n contributes (n - 1) bin widths,
- * and an offset of 0 contributes nothing.
- */
-double bindist(System &s, int i, int j, int k)
-{
-  const double dx = (i > 0) ? (i - 1) * s.binsizex
-                  : (i < 0) ? (i + 1) * s.binsizex
-                            : 0.0;
-
-  const double dy = (j > 0) ? (j - 1) * s.binsizey
-                  : (j < 0) ? (j + 1) * s.binsizey
-                            : 0.0;
-
-  const double dz = (k > 0) ? (k - 1) * s.binsizez
-                  : (k < 0) ? (k + 1) * s.binsizez
-                            : 0.0;
-
-  return (dx * dx + dy * dy + dz * dz);
-}
-
-/* Setup the neighborlist construction
- * Determine binsizes, a stencil for defining adjacency, etc.
- */
-
-/* Prepare all data needed to build neighbor lists for s:
- *   - derive bin sizes from the box lengths and the bin counts s.nbinx/y/z,
- *     which must have been set by the caller,
- *   - compute the extended bin grid (s.mbin*lo, s.mbin*) that also covers
- *     the region within neigh_cut outside the box,
- *   - build the stencil of bin offsets that lie within the neighbor cutoff,
- *   - allocate the bin, neighbor and resize-flag views.
- */
-void neigh_setup(System &s) {
-
-  s.neigh_cutsq = s.neigh_cut * s.neigh_cut;
-
-  /*
-  c bins must evenly divide into box size,
-  c   becoming larger than cutneigh if necessary
-  c binsize = 1/2 of cutoff is near optimal
-
-  if (flag == 0) {
-    nbinx = 2.0 * xprd / cutneigh;
-    nbiny = 2.0 * yprd / cutneigh;
-    nbinz = 2.0 * zprd / cutneigh;
-    if (nbinx == 0) nbinx = 1;
-    if (nbiny == 0) nbiny = 1;
-    if (nbinz == 0) nbinz = 1;
-  }
-  */
-
-  // Bin width per axis and its inverse.
-  s.binsizex = s.box.xprd / s.nbinx;
-  s.binsizey = s.box.yprd / s.nbiny;
-  s.binsizez = s.box.zprd / s.nbinz;
-  s.bininvx = 1.0 / s.binsizex;
-  s.bininvy = 1.0 / s.binsizey;
-  s.bininvz = 1.0 / s.binsizez;
-
-  // Lowest and highest bin index per axis for the box extended by the
-  // neighbor cutoff plus a small safety margin. The cast truncates toward
-  // zero, so negative coordinates are shifted down by one bin.
-  double coord = s.box.xlo - s.neigh_cut - SMALL * s.box.xprd;
-  s.mbinxlo = static_cast<int>(coord * s.bininvx);
-
-  if(coord < 0.0) s.mbinxlo = s.mbinxlo - 1;
-
-  coord = s.box.xhi + s.neigh_cut + SMALL * s.box.xprd;
-  int mbinxhi = static_cast<int>(coord * s.bininvx);
-
-  coord = s.box.ylo - s.neigh_cut - SMALL * s.box.yprd;
-  s.mbinylo = static_cast<int>(coord * s.bininvy);
-
-  if(coord < 0.0) s.mbinylo = s.mbinylo - 1;
-
-  coord = s.box.yhi + s.neigh_cut + SMALL * s.box.yprd;
-  int mbinyhi = static_cast<int>(coord * s.bininvy);
-
-  coord = s.box.zlo - s.neigh_cut - SMALL * s.box.zprd;
-  s.mbinzlo = static_cast<int>(coord * s.bininvz);
-
-  if(coord < 0.0) s.mbinzlo = s.mbinzlo - 1;
-
-  coord = s.box.zhi + s.neigh_cut + SMALL * s.box.zprd;
-  int mbinzhi = static_cast<int>(coord * s.bininvz);
-
-  /* extend bins by 1 in each direction to insure stencil coverage */
-
-  s.mbinxlo = s.mbinxlo - 1;
-  mbinxhi = mbinxhi + 1;
-  s.mbinx = mbinxhi - s.mbinxlo + 1;
-
-  s.mbinylo = s.mbinylo - 1;
-  mbinyhi = mbinyhi + 1;
-  s.mbiny = mbinyhi - s.mbinylo + 1;
-
-  s.mbinzlo = s.mbinzlo - 1;
-  mbinzhi = mbinzhi + 1;
-  s.mbinz = mbinzhi - s.mbinzlo + 1;
-
-  /*
-  compute bin stencil of all bins whose closest corner to central bin
-  is within neighbor cutoff
-  for partial Newton (newton = 0),
-  stencil is all surrounding bins including self
-  for full Newton (newton = 1),
-  stencil is bins to the "upper right" of central bin, does NOT include self
-  next(xyz) = how far the stencil could possibly extend
-  factor < 1.0 for special case of LJ benchmark so code will create
-  correct-size stencil when there are 3 bins for every 5 lattice spacings
-  */
-
-  int nextx = static_cast<int>(s.neigh_cut * s.bininvx);
-
-  if(nextx * s.binsizex < FACTOR * s.neigh_cut) nextx++;
-
-  int nexty = static_cast<int>(s.neigh_cut * s.bininvy);
-
-  if(nexty * s.binsizey < FACTOR * s.neigh_cut) nexty++;
-
-  int nextz = static_cast<int>(s.neigh_cut * s.bininvz);
-
-  if(nextz * s.binsizez < FACTOR * s.neigh_cut) nextz++;
-
-  // Upper bound on the stencil size: every offset in the scanned cube.
-  int nmax = (2 * nextz + 1) * (2 * nexty + 1) * (2 * nextx + 1);
-  s.d_stencil = t_int_1d("stencil", nmax);
-  s.h_stencil = Kokkos::create_mirror_view(s.d_stencil);
-  s.nstencil = 0;
-  int kstart = -nextz;
-
-  // Keep the offsets whose bin distance is within the cutoff, stored as
-  // linear index offsets in the extended bin grid.
-  for(int k = kstart; k <= nextz; k++) {
-    for(int j = -nexty; j <= nexty; j++) {
-      for(int i = -nextx; i <= nextx; i++) {
-        if(bindist(s,i, j, k) < s.neigh_cutsq) {
-          s.h_stencil(s.nstencil++) = k * s.mbiny * s.mbinx + j * s.mbinx + i;
-        }
-      }
-    }
-  }
-
-  /* Allocate neighbor arrays */
-
-  Kokkos::deep_copy(s.d_stencil, s.h_stencil);
-  s.mbins = s.mbinx * s.mbiny * s.mbinz;
-  s.bincount = t_int_1d("bincount", s.mbins);
-  // Initial capacities: 8 atoms per bin and 80 neighbors per atom;
-  // neigh_build reallocates both views when they turn out to be too small.
-  s.bins = t_int_2d("bins", s.mbins, 8);
-
-  s.neighbors = t_neighbors("neighbors",s.natoms,80);
-  s.numneigh = t_int_1d("numneigh",s.natoms);
-  s.d_resize = t_int_scalar("resize");
-  s.h_resize = Kokkos::create_mirror_view(s.d_resize);
-}
-
-
-/* Build the neighbor list.
- * This is a try-and-rerun algorithm for handling the case where the bins array
- * or the neighbors array is not big enough. If one of them is too small, it is
- * reallocated and the binning algorithm or the neighbor list construction is rerun.
- */
-
-void neigh_build(System &s) {
-
-  /* Stage 1: sort all atoms into bins; repeat with a larger bins view
-   * for as long as the kernel reports an overflow. */
-
-  do {
-    // Clear the overflow indicator on host and device.
-    s.h_resize(0) = 0;
-    Kokkos::deep_copy(s.d_resize, s.h_resize);
-
-    // Reset the per-bin atom counters.
-    MemsetZeroFunctor zero_counts;
-    zero_counts.ptr = (void*) s.bincount.ptr_on_device();
-    Kokkos::parallel_for(s.mbins, zero_counts);
-    execution_space::fence();
-
-    BinningFunctor binning(s);
-    Kokkos::parallel_for(s.natoms, binning);
-    execution_space::fence();
-
-    /* Fetch the indicator; a non-zero value means the bins were too small */
-
-    Kokkos::deep_copy(s.h_resize, s.d_resize);
-
-    if(s.h_resize(0)) {
-      const int atoms_per_bin = s.h_resize(0)+2;
-      s.bins = t_int_2d("bins", s.mbins, atoms_per_bin);
-    }
-  } while(s.h_resize(0) > 0);
-
-  /* Stage 2: build the neighbor list of every local atom; repeat with a
-   * larger neighbors view for as long as a list ran full. */
-
-  do {
-    s.h_resize(0) = 0;
-    Kokkos::deep_copy(s.d_resize, s.h_resize);
-
-    BuildFunctor build(s);
-    Kokkos::parallel_for(s.nlocal, build);
-    execution_space::fence();
-
-    /* Fetch the indicator; a non-zero value means neighbors was too small */
-
-    Kokkos::deep_copy(s.h_resize, s.d_resize);
-
-    if(s.h_resize(0)) {
-      const int maxneighs = s.h_resize(0) * 1.2;
-      s.neighbors = t_neighbors("neighbors", s.natoms, maxneighs);
-    }
-  } while(s.h_resize(0));
-}
--- a/lib/kokkos/example/md_skeleton/setup.cpp
+++ b/lib/kokkos/example/md_skeleton/setup.cpp
@ -1,271 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-// 
-//                        Kokkos v. 2.0
-//              Copyright (2014) Sandia Corporation
-// 
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
-// the U.S. Government retains certain rights in this software.
-// 
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
-// ************************************************************************
-//@HEADER
-*/
-
-#include <system.h>
-#include <cmath>
-#include <cstdio>
-#include <cstdlib>
-/* initialize atoms on fcc lattice in parallel fashion */
-
-/* Larger / smaller of two values. Arguments are parenthesized so that
- * expressions containing lower-precedence operators expand correctly.
- * Each argument may be evaluated twice; do not pass expressions with
- * side effects. */
-#define MAX(a,b) ((a)>(b)?(a):(b))
-#define MIN(a,b) ((a)<(b)?(a):(b))
-
-
-/* Create an fcc lattice of nx * ny * nz unit cells with number density rho.
- *
- * Sets the box dimensions, the atom counts (nlocal, nghost, natoms) and
- * allocates the position and force views of `system`. Local atoms are
- * generated first, ghost atoms in a shell of ghost_dist unit cells around
- * the box afterwards; each coordinate gets a random offset scaled by
- * system.delta. system.neigh_cut and system.delta must be set by the caller.
- *
- * Returns 0 on success. Returns 1 if the number of generated local atoms
- * differs from nlocal; in that case no ghost atoms are created and the
- * positions are not copied to the device view.
- */
-int create_system(System &system, int nx, int ny, int nz, double rho)
-{
-  /* Box Setup */
-
-  double lattice = pow((4.0 / rho), (1.0 / 3.0));
-  system.box.xprd = nx * lattice;
-  system.box.yprd = ny * lattice;
-  system.box.zprd = nz * lattice;
-  system.box.xlo = 0;
-  system.box.ylo = 0;
-  system.box.zlo = 0;
-  system.box.xhi = system.box.xprd;
-  system.box.yhi = system.box.yprd;
-  system.box.zhi = system.box.zprd;
-
-
-  /* width of the ghost shell in unit cells */
-  int ghost_dist = int(system.neigh_cut/lattice) + 1;
-
-  /* total # of atoms */
-
-  system.nlocal = 4 * nx * ny * nz;
-  system.nghost = 4 * (nx + 2 * ghost_dist) *
-                      (ny + 2 * ghost_dist) *
-                      (nz + 2 * ghost_dist) -
-                      system.nlocal;
-  system.natoms = system.nlocal + system.nghost;
-
-  system.d_x = t_x_array("X",system.natoms);
-  system.h_x = Kokkos::create_mirror_view(system.d_x);
-  system.f = t_f_array("F",system.natoms);
-
-  /* determine loop bounds of lattice subsection that overlaps my sub-box
-     insure loop bounds do not exceed nx,ny,nz */
-
-  /* lattice constant; lattice points sit at multiples of alat / 2 */
-  const double alat = lattice;
-  int ilo = static_cast<int>(system.box.xlo / (0.5 * alat) - 1);
-  int ihi = static_cast<int>(system.box.xhi / (0.5 * alat) + 1);
-  int jlo = static_cast<int>(system.box.ylo / (0.5 * alat) - 1);
-  int jhi = static_cast<int>(system.box.yhi / (0.5 * alat) + 1);
-  int klo = static_cast<int>(system.box.zlo / (0.5 * alat) - 1);
-  int khi = static_cast<int>(system.box.zhi / (0.5 * alat) + 1);
-
-  ilo = MAX(ilo, 0);
-  ihi = MIN(ihi, 2 * nx - 1);
-  jlo = MAX(jlo, 0);
-  jhi = MIN(jhi, 2 * ny - 1);
-  klo = MAX(klo, 0);
-  khi = MIN(khi, 2 * nz - 1);
-
-
-
-  /* generates positions of atoms on fcc sublattice*/
-
-  /* fixed seed: the random offsets are reproducible between runs */
-  srand(3718273);
-  /* create non-ghost atoms */
-  {
-    double xtmp, ytmp, ztmp;
-    /* (sx, sy, sz) is the position inside the current sub-box,
-       (ox, oy, oz) is the index of the sub-box itself */
-    int sx = 0;
-    int sy = 0;
-    int sz = 0;
-    int ox = 0;
-    int oy = 0;
-    int oz = 0;
-    int subboxdim = 8;
-
-    int n = 0;
-
-    while(oz * subboxdim <= khi) {
-      const int k = oz * subboxdim + sz;
-      const int j = oy * subboxdim + sy;
-      const int i = ox * subboxdim + sx;
-
-      /* fcc sites are the lattice points with an even index sum */
-      if(((i + j + k) % 2 == 0) &&
-          (i >= ilo) && (i <= ihi) &&
-          (j >= jlo) && (j <= jhi) &&
-          (k >= klo) && (k <= khi)) {
-
-        /* redraw the random offsets until the atom lies inside the box */
-        const int nold = n;
-        while(nold == n) {
-          xtmp = 0.5 * alat * i + system.delta/1000*(rand()%1000-500);
-          ytmp = 0.5 * alat * j + system.delta/1000*(rand()%1000-500);
-          ztmp = 0.5 * alat * k + system.delta/1000*(rand()%1000-500);
-
-          if(xtmp >= system.box.xlo && xtmp < system.box.xhi &&
-              ytmp >= system.box.ylo && ytmp < system.box.yhi &&
-              ztmp >= system.box.zlo && ztmp < system.box.zhi) {
-            system.h_x(n,0) = xtmp;
-            system.h_x(n,1) = ytmp;
-            system.h_x(n,2) = ztmp;
-            n++;
-          }
-        }
-      }
-
-      /* advance to the next point: x fastest inside a sub-box, then y, z,
-         then on to the next sub-box in x, y, z */
-      sx++;
-
-      if(sx == subboxdim) {
-        sx = 0;
-        sy++;
-      }
-
-      if(sy == subboxdim) {
-        sy = 0;
-        sz++;
-      }
-
-      if(sz == subboxdim) {
-        sz = 0;
-        ox++;
-      }
-
-      if(ox * subboxdim > ihi) {
-        ox = 0;
-        oy++;
-      }
-
-      if(oy * subboxdim > jhi) {
-        oy = 0;
-        oz++;
-      }
-    }
-
-    /* check that correct # of atoms were created */
-
-    if(system.nlocal != n) {
-      printf("Created incorrect # of atoms\n");
-
-      return 1;
-    }
-  }
-
-  /* create ghost atoms */
-
-  {
-    double xtmp, ytmp, ztmp;
-
-    /* index range of the local region extended by the ghost shell */
-    int ilo_g = ilo - 2 * ghost_dist;
-    int jlo_g = jlo - 2 * ghost_dist;
-    int klo_g = klo - 2 * ghost_dist;
-    int ihi_g = ihi + 2 * ghost_dist;
-    int jhi_g = jhi + 2 * ghost_dist;
-    int khi_g = khi + 2 * ghost_dist;
-
-    int subboxdim = 8;
-    int sx = 0;
-    int sy = 0;
-    int sz = 0;
-    /* NOTE(review): the sub-box indices start at subboxdim * ilo_g and are
-       multiplied by subboxdim again when forming i/j/k, so the scan starts
-       at subboxdim^2 * ilo_g rather than at ilo_g. The range test below
-       discards the extra points; confirm whether the larger scan is intended. */
-    int ox = subboxdim * ilo_g;
-    int oy = subboxdim * jlo_g;
-    int oz = subboxdim * klo_g;
-
-    /* ghost atoms are stored behind the local atoms */
-    int n = system.nlocal;
-
-
-    while(oz * subboxdim <= khi_g) {
-      const int k = oz * subboxdim + sz;
-      const int j = oy * subboxdim + sy;
-      const int i = ox * subboxdim + sx;
-
-      /* fcc site inside the extended range but outside the local range */
-      if(((i + j + k) % 2 == 0) &&
-          (i >= ilo_g) && (i <= ihi_g) &&
-          (j >= jlo_g) && (j <= jhi_g) &&
-          (k >= klo_g) && (k <= khi_g) &&
-          ((i < ilo) || (i > ihi) ||
-           (j < jlo) || (j > jhi) ||
-           (k < klo) || (k > khi))
-          ) {
-
-        xtmp = 0.5 * alat * i;
-        ytmp = 0.5 * alat * j;
-        ztmp = 0.5 * alat * k;
-
-        system.h_x(n,0) = xtmp + system.delta/1000*(rand()%1000-500);
-        system.h_x(n,1) = ytmp + system.delta/1000*(rand()%1000-500);
-        system.h_x(n,2) = ztmp + system.delta/1000*(rand()%1000-500);
-        n++;
-      }
-
-      sx++;
-
-      if(sx == subboxdim) {
-        sx = 0;
-        sy++;
-      }
-
-      if(sy == subboxdim) {
-        sy = 0;
-        sz++;
-      }
-
-      if(sz == subboxdim) {
-        sz = 0;
-        ox++;
-        //printf("%i %i %i // %i %i %i\n",ox,oy,oz,i,j,k);
-      }
-
-      if(ox * subboxdim > ihi_g) {
-        ox = subboxdim * ilo_g;
-        oy++;
-      }
-
-      if(oy * subboxdim > jhi_g) {
-        oy = subboxdim * jlo_g;
-        oz++;
-      }
-    }
-  }
-
-  /* make the generated positions available on the device */
-  Kokkos::deep_copy(system.d_x,system.h_x);
-  return 0;
-}
-
--- a/lib/kokkos/example/md_skeleton/system.h
+++ b/lib/kokkos/example/md_skeleton/system.h
@ -1,92 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-// 
-//                        Kokkos v. 2.0
-//              Copyright (2014) Sandia Corporation
-// 
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
-// the U.S. Government retains certain rights in this software.
-// 
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
-// ************************************************************************
-//@HEADER
-*/
-
-#ifndef SYSTEM_H_
-#define SYSTEM_H_
-
-#include <types.h>
-
-struct Box {  // Plain aggregate of nine doubles describing a box; nothing is validated here.
-  double xprd, yprd, zprd;  // presumably the box extent along x/y/z -- TODO confirm against the setup code
-  double xlo, xhi;  // presumably lower/upper x bound -- TODO confirm
-  double ylo, yhi;  // presumably lower/upper y bound -- TODO confirm
-  double zlo, zhi;  // presumably lower/upper z bound -- TODO confirm
-};
-
-struct System {  // Aggregate of the md_skeleton example's state: box, atom counts, positions, forces, neighbor data, bin/stencil fields.
-  Box box;
-
-  int natoms;  // presumably the total atom count -- TODO confirm
-  int nlocal;  // setup code appends ghost atoms to h_x starting at row nlocal
-  int nghost;  // presumably the ghost-atom count -- TODO confirm
-
-  t_x_array d_x;  // positions in execution_space memory; setup fills it with deep_copy(d_x, h_x)
-  t_x_array_host h_x;  // HostMirror of the position array; row n holds (x, y, z) in columns 0..2
-
-  t_f_array f;  // per-atom 3-component double array (types.h labels it the force array)
-
-  t_neighbors neighbors;  // 2D int view (types.h labels it the neighbor list)
-  t_int_1d numneigh;  // presumably the neighbor count per atom -- TODO confirm
-
-  double delta;  // scale of the random displacement added to ghost-atom coordinates during setup
-
-  double neigh_cut,neigh_cutsq;  // presumably the neighbor cutoff and its square -- TODO confirm
-
-  int mbins;  // NOTE(review): the bin fields below are not touched by the visible setup code; meanings are inferred from names only
-  int nbinx,nbiny,nbinz;
-  int mbinx,mbiny,mbinz;
-  int mbinxlo,mbinylo,mbinzlo;
-  double binsizex,binsizey,binsizez;
-  double bininvx,bininvy,bininvz;  // presumably 1/binsize per axis -- TODO confirm
-
-  t_int_1d bincount;
-  t_int_2d bins;
-  t_int_scalar d_resize;  // one-element int view (int[1])
-  t_int_scalar_host h_resize;  // HostMirror type of the one-element int view
-  t_int_1d d_stencil;
-  t_int_1d_host h_stencil;  // HostMirror type of the 1D int view
-  int nstencil;
-
-  double force_cut,force_cutsq;  // presumably the force cutoff and its square -- TODO confirm
-};
-#endif
--- a/lib/kokkos/example/md_skeleton/types.h
+++ b/lib/kokkos/example/md_skeleton/types.h
@ -1,118 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-// 
-//                        Kokkos v. 2.0
-//              Copyright (2014) Sandia Corporation
-// 
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
-// the U.S. Government retains certain rights in this software.
-// 
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
-// ************************************************************************
-//@HEADER
-*/
-
-#ifndef TYPES_H_
-#define TYPES_H_
-
-/* Determine default device type and necessary includes */
-
-#include <Kokkos_Core.hpp>
-
-typedef Kokkos::DefaultExecutionSpace execution_space ;
-
-#if ! defined( KOKKOS_HAVE_CUDA )
-  struct double2 {  // Pair of doubles with +=; compiled only when KOKKOS_HAVE_CUDA is undefined (presumably CUDA headers supply their own double2 -- confirm)
-    double x, y;
-    KOKKOS_INLINE_FUNCTION
-    double2(double xinit, double yinit) {  // initialize both components from the arguments
-      x = xinit;
-      y = yinit;
-    }
-    KOKKOS_INLINE_FUNCTION
-    double2() {  // default: both components zero
-      x = 0.0;
-      y = 0.0;
-    }
-    KOKKOS_INLINE_FUNCTION
-    double2& operator += (const double2& src) {  // component-wise accumulate; returns *this
-      x+=src.x;
-      y+=src.y;
-      return *this;
-    }
-
-    KOKKOS_INLINE_FUNCTION
-    volatile double2& operator += (const volatile double2& src) volatile {  // same accumulate for volatile objects; presumably required by Kokkos reductions -- confirm
-      x+=src.x;
-      y+=src.y;
-      return *this;
-    }
-
-  };
-#endif
-
-#include <impl/Kokkos_Timer.hpp>
-
-/* Define types used throughout the code */
-
-// Position arrays: N x 3 doubles, LayoutRight; host mirror, const view, and const view with MemoryRandomAccess
-typedef Kokkos::View<double*[3], Kokkos::LayoutRight, execution_space>                                   t_x_array ;
-typedef t_x_array::HostMirror                                                                        t_x_array_host ;
-typedef Kokkos::View<const double*[3], Kokkos::LayoutRight, execution_space>                             t_x_array_const ;
-typedef Kokkos::View<const double*[3], Kokkos::LayoutRight, execution_space, Kokkos::MemoryRandomAccess >  t_x_array_randomread ;
-
-// Force array: N x 3 doubles, no explicit layout argument
-typedef Kokkos::View<double*[3],  execution_space>                                                       t_f_array ;
-
-
-// Neighbor list: 2D int view, its const form, and 1D MemoryUnmanaged views (mutable and const)
-typedef Kokkos::View<int**, execution_space >                                                            t_neighbors ;
-typedef Kokkos::View<const int**, execution_space >                                                      t_neighbors_const ;
-typedef Kokkos::View<int*, execution_space, Kokkos::MemoryUnmanaged >                                    t_neighbors_sub ;
-typedef Kokkos::View<const int*, execution_space, Kokkos::MemoryUnmanaged >                              t_neighbors_const_sub ;
-
-// 1D int array: view, host mirror, const form, and MemoryUnmanaged forms (mutable and const)
-typedef Kokkos::View<int*, execution_space >                                                             t_int_1d ;
-typedef t_int_1d::HostMirror                                                                         t_int_1d_host ;
-typedef Kokkos::View<const int*, execution_space >                                                       t_int_1d_const ;
-typedef Kokkos::View<int*, execution_space , Kokkos::MemoryUnmanaged>                                    t_int_1d_um ;
-typedef Kokkos::View<const int* , execution_space , Kokkos::MemoryUnmanaged>                             t_int_1d_const_um ;
-
-// 2D int array: LayoutRight view and its host mirror
-typedef Kokkos::View<int**, Kokkos::LayoutRight, execution_space >                                       t_int_2d ;
-typedef t_int_2d::HostMirror                                                                         t_int_2d_host ;
-
-// Scalar int: one-element view (int[1], LayoutLeft) and its host mirror
-typedef Kokkos::View<int[1], Kokkos::LayoutLeft, execution_space>                                        t_int_scalar ;
-typedef t_int_scalar::HostMirror                                                                     t_int_scalar_host ;
-
-#endif /* TYPES_H_ */
--- a/lib/kokkos/example/multi_fem/BoxMeshFixture.hpp
+++ b/lib/kokkos/example/multi_fem/BoxMeshFixture.hpp
@ -1,610 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-// 
-//                        Kokkos v. 2.0
-//              Copyright (2014) Sandia Corporation
-// 
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
-// the U.S. Government retains certain rights in this software.
-// 
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
-// ************************************************************************
-//@HEADER
-*/
-
-#ifndef KOKKOS_BOXMESHFIXTURE_HPP
-#define KOKKOS_BOXMESHFIXTURE_HPP
-
-#include <cmath>
-#include <stdexcept>
-#include <sstream>
-
-#include <Kokkos_Core.hpp>
-#include <BoxMeshPartition.hpp>
-#include <FEMesh.hpp>
-#include <HexElement.hpp>
-
-//----------------------------------------------------------------------------
-
-struct FixtureElementHex8 {  // ElementSpec for 8-node hex elements: node grid coincides with the vertex grid.
-
-  static const unsigned element_node_count = 8 ;
-
-  HybridFEM::HexElement_TensorData< element_node_count > elem_data ;  // supplies eval_map[node][axis], the per-node grid offset used below
-  BoxBoundsLinear box_bounds ;
-
-  FixtureElementHex8() : elem_data(), box_bounds() {}
-
-  static void create_node_boxes_from_vertex_boxes(  // node boxes are plain copies of the vertex boxes
-    const BoxType                & vertex_box_global ,
-    const std::vector< BoxType > & vertex_box_parts ,
-          BoxType                & node_box_global ,
-          std::vector< BoxType > & node_box_parts )
-  {
-    node_box_global = vertex_box_global ;
-    node_box_parts  = vertex_box_parts  ;
-  }
-
-  void elem_to_node( const unsigned node_local , unsigned coord[] ) const  // in/out: coord enters as the element's grid index and leaves as the grid index of node node_local
-  {
-    coord[0] += elem_data.eval_map[ node_local ][0] ;
-    coord[1] += elem_data.eval_map[ node_local ][1] ;
-    coord[2] += elem_data.eval_map[ node_local ][2] ;
-  }
-};
-
-struct FixtureElementHex27 {  // ElementSpec for 27-node hex elements: node grid is the vertex grid refined by a factor of two.
-  static const unsigned element_node_count = 27 ;
-
-  HybridFEM::HexElement_TensorData< element_node_count > elem_data ;  // supplies eval_map[node][axis], the per-node grid offset used below
-  BoxBoundsQuadratic box_bounds ;
-
-  FixtureElementHex27() : elem_data(), box_bounds() {}
-
-  static void create_node_boxes_from_vertex_boxes(  // derive node boxes from vertex boxes; box[axis][0] is the lower bound, box[axis][1] the upper bound
-    const BoxType                & vertex_box_global ,
-    const std::vector< BoxType > & vertex_box_parts ,
-          BoxType                & node_box_global ,
-          std::vector< BoxType > & node_box_parts )
-  {
-    node_box_global = vertex_box_global ;
-    node_box_parts  = vertex_box_parts  ;
-
-    node_box_global[0][1] = 2 * node_box_global[0][1] - 1 ;  // V vertices per axis become 2*V - 1 nodes
-    node_box_global[1][1] = 2 * node_box_global[1][1] - 1 ;
-    node_box_global[2][1] = 2 * node_box_global[2][1] - 1 ;
-
-    for ( unsigned i = 0 ; i < vertex_box_parts.size() ; ++i ) {
-      node_box_parts[i][0][0] = 2 * node_box_parts[i][0][0] ;  // lower bounds are doubled
-      node_box_parts[i][1][0] = 2 * node_box_parts[i][1][0] ;
-      node_box_parts[i][2][0] = 2 * node_box_parts[i][2][0] ;
-
-      node_box_parts[i][0][1] =  // upper bounds are doubled, then clamped to the global node upper bound
-        std::min( node_box_global[0][1] , 2 * node_box_parts[i][0][1] );
-      node_box_parts[i][1][1] =
-        std::min( node_box_global[1][1] , 2 * node_box_parts[i][1][1] );
-      node_box_parts[i][2][1] =
-        std::min( node_box_global[2][1] , 2 * node_box_parts[i][2][1] );
-    }
-  }
-
-  void elem_to_node( const unsigned node_local , unsigned coord[] ) const  // in/out: coord enters as the element's grid index and leaves as the node-grid index of node node_local
-  {
-    coord[0] = 2 * coord[0] + elem_data.eval_map[ node_local ][0] ;
-    coord[1] = 2 * coord[1] + elem_data.eval_map[ node_local ][1] ;
-    coord[2] = 2 * coord[2] + elem_data.eval_map[ node_local ][2] ;
-  }
-};
-
-//----------------------------------------------------------------------------
-
-template< typename Scalar , class Device , class ElementSpec >
-struct BoxMeshFixture {
-
-  typedef Scalar  coordinate_scalar_type ;
-  typedef Device  execution_space ;
-
-  static const unsigned element_node_count = ElementSpec::element_node_count ;
-
-  typedef HybridFEM::FEMesh< coordinate_scalar_type ,
-                             element_node_count ,
-                             execution_space > FEMeshType ;
-
-  typedef typename FEMeshType::node_coords_type    node_coords_type ;
-  typedef typename FEMeshType::elem_node_ids_type  elem_node_ids_type ;
-  typedef typename FEMeshType::node_elem_ids_type  node_elem_ids_type ;
-
-
-  static void verify(  // Host-side consistency check of the mesh tables; throws std::runtime_error on the first mismatch found.
-    const typename FEMeshType::node_coords_type::HostMirror   & node_coords ,
-    const typename FEMeshType::elem_node_ids_type::HostMirror & elem_node_ids ,
-    const typename FEMeshType::node_elem_ids_type::HostMirror & node_elem_ids )
-  {
-    typedef typename FEMeshType::size_type         size_type ;
-    //typedef typename node_coords_type::value_type  coords_type ; // unused
-
-    const size_type node_count_total = node_coords.dimension_0();  // one row per node
-    const size_type elem_count_total = elem_node_ids.dimension_0();  // one row per element
-
-    const ElementSpec element ;
-
-    for ( size_type node_index = 0 ;  // Pass 1: each (element, local node) entry listed for a node must map back to that node
-                    node_index < node_count_total ; ++node_index ) {
-
-      for ( size_type
-              j = node_elem_ids.row_map[ node_index ] ;
-              j < node_elem_ids.row_map[ node_index + 1 ] ; ++j ) {
-
-        const size_type elem_index = node_elem_ids.entries(j,0);
-        const size_type node_local = node_elem_ids.entries(j,1);
-        const size_type en_id      = elem_node_ids(elem_index,node_local);
-
-        if ( node_index != en_id ) {
-          std::ostringstream msg ;
-          msg << "BoxMeshFixture node_elem_ids error"
-              << " : node_index(" << node_index
-              << ") entry(" << j
-              << ") elem_index(" << elem_index
-              << ") node_local(" << node_local
-              << ") elem_node_id(" << en_id
-              << ")" ;
-          throw std::runtime_error( msg.str() );
-        }
-      }
-    }
-
-    for ( size_type elem_index = 0 ;  // Pass 2: each element's node coordinates must equal node 0's coordinates plus the eval_map offset
-                    elem_index < elem_count_total; ++elem_index ) {
-
-      coordinate_scalar_type elem_node_coord[ element_node_count ][3] ;
-
-      for ( size_type nn = 0 ; nn < element_node_count ; ++nn ) {  // gather this element's node coordinates
-        const size_type node_index = elem_node_ids( elem_index , nn );
-
-        for ( size_type nc = 0 ; nc < 3 ; ++nc ) {
-          elem_node_coord[nn][nc] = node_coords( node_index , nc );
-        }
-      }
-
-
-      for ( size_type nn = 0 ; nn < element_node_count ; ++nn ) {
-
-        const unsigned ix = element.elem_data.eval_map[nn][0] ;
-        const unsigned iy = element.elem_data.eval_map[nn][1] ;
-        const unsigned iz = element.elem_data.eval_map[nn][2] ;
-
-        if ( elem_node_coord[nn][0] != elem_node_coord[0][0] + ix ||  // exact comparison: create() calls verify while coordinates still hold grid indices
-             elem_node_coord[nn][1] != elem_node_coord[0][1] + iy ||
-             elem_node_coord[nn][2] != elem_node_coord[0][2] + iz ) {
-
-          std::ostringstream msg ;
-          msg << "BoxMeshFixture elem_node_coord mapping failure { "
-              << elem_node_coord[nn][0] << " "
-              << elem_node_coord[nn][1] << " "
-              << elem_node_coord[nn][2] << " } != { "
-              << elem_node_coord[ 0][0] + ix << " "
-              << elem_node_coord[ 0][1] + iy << " "
-              << elem_node_coord[ 0][2] + iz
-              << " }" ;
-          throw std::runtime_error( msg.str() );
-        }
-      }
-    }
-  }
-
-  //------------------------------------
-  // Initialize element-node connectivity:
-  // Order elements that only depend on owned nodes first.
-  // These elements could be computed while waiting for
-  // received node data.
-
-  static void layout_elements_interior_exterior(  // Fill elem_node_ids so interior elements occupy rows [0, elem_count_interior) and the remaining elements follow.
-    const BoxType                vertex_box_local_used ,
-    const BoxType                vertex_box_local_owned ,
-    const BoxType                node_box_local_used ,
-    const std::vector<size_t> &  node_used_id_map ,
-    const ElementSpec            element_fixture ,
-    const size_t                 elem_count_interior ,
-    const typename elem_node_ids_type::HostMirror elem_node_ids )
-  {
-    size_t elem_index_interior = 0 ;  // next free row in the interior range
-    size_t elem_index_boundary = elem_count_interior ;  // next free row after the interior range
-
-    for ( size_t iz = vertex_box_local_used[2][0] ;  // (ix,iy,iz) is an element's lower vertex; upper bound is -1 since the element also uses vertex +1
-                 iz < vertex_box_local_used[2][1] - 1 ; ++iz ) {
-    for ( size_t iy = vertex_box_local_used[1][0] ;
-                 iy < vertex_box_local_used[1][1] - 1 ; ++iy ) {
-    for ( size_t ix = vertex_box_local_used[0][0] ;
-                 ix < vertex_box_local_used[0][1] - 1 ; ++ix ) {
-
-      size_t elem_index ;
-
-      // If lower and upper vertices are owned then element is interior
-      if ( contain( vertex_box_local_owned, ix,   iy,   iz ) &&
-           contain( vertex_box_local_owned, ix+1, iy+1, iz+1 ) ) {
-        elem_index = elem_index_interior++ ;
-      }
-      else {
-        elem_index = elem_index_boundary++ ;
-      }
-
-      for ( size_t nn = 0 ; nn < element_node_count ; ++nn ) {
-        unsigned coord[3] = { static_cast<unsigned>(ix) , static_cast<unsigned>(iy) , static_cast<unsigned>(iz) };  // reset per node: elem_to_node modifies coord in place
-
-        element_fixture.elem_to_node( nn , coord );
-
-        const size_t node_local_id =  // node grid index -> local node id
-          box_map_id( node_box_local_used ,
-                      node_used_id_map ,
-                      coord[0] , coord[1] , coord[2] );
-
-        elem_node_ids( elem_index , nn ) = node_local_id ;
-      }
-    }}}
-  }
-
-  //------------------------------------
-  // Nested partitioning of elements by number of thread 'gangs'
-
-  static void layout_elements_partitioned(  // Fill elem_node_ids with elements numbered consecutively, one partition box ("gang") after another.
-    const BoxType                vertex_box_local_used ,
-    const BoxType                /*vertex_box_local_owned*/ ,
-    const BoxType                node_box_local_used ,
-    const std::vector<size_t> &  node_used_id_map ,
-    const ElementSpec            element_fixture ,
-    const size_t                 thread_gang_count ,
-    const typename elem_node_ids_type::HostMirror elem_node_ids )
-  {
-    std::vector< BoxType > element_box_gangs( thread_gang_count );
-
-    BoxType element_box_local_used = vertex_box_local_used ;
-
-    element_box_local_used[0][1] -= 1 ;  // element box = vertex box with each upper bound reduced by one
-    element_box_local_used[1][1] -= 1 ;
-    element_box_local_used[2][1] -= 1 ;
-
-    box_partition_rcb( element_box_local_used , element_box_gangs );  // split the element box into thread_gang_count boxes
-
-    size_t elem_index = 0 ;  // running row index, continues across gangs
-
-    for ( size_t ig = 0 ; ig < thread_gang_count ; ++ig ) {
-
-      const BoxType box = element_box_gangs[ig] ;
-
-      for ( size_t iz = box[2][0] ; iz < box[2][1] ; ++iz ) {
-      for ( size_t iy = box[1][0] ; iy < box[1][1] ; ++iy ) {
-      for ( size_t ix = box[0][0] ; ix < box[0][1] ; ++ix , ++elem_index ) {
-
-        for ( size_t nn = 0 ; nn < element_node_count ; ++nn ) {
-          unsigned coord[3] = { static_cast<unsigned>(ix) , static_cast<unsigned>(iy) , static_cast<unsigned>(iz) };  // reset per node: elem_to_node modifies coord in place
-
-          element_fixture.elem_to_node( nn , coord );
-
-          const size_t node_local_id =  // node grid index -> local node id
-            box_map_id( node_box_local_used ,
-                        node_used_id_map ,
-                        coord[0] , coord[1] , coord[2] );
-
-          elem_node_ids( elem_index , nn ) = node_local_id ;
-        }
-      }}}
-    }
-  }
-
-  //------------------------------------
-
-  static FEMeshType create( const size_t proc_count ,  // Build this rank's part of an elems_x * elems_y * elems_z box mesh and return it as an FEMeshType.
-                            const size_t proc_local ,  // index into the proc_count partition boxes selecting this rank's part
-                            const size_t gang_count ,  // > 1 selects the gang-partitioned element order, otherwise interior-first order
-                            const size_t elems_x ,
-                            const size_t elems_y ,
-                            const size_t elems_z ,
-                            const double x_coord_curve = 1 ,  // exponent applied to the unit x coordinate; 1 keeps the spacing linear
-                            const double y_coord_curve = 1 ,
-                            const double z_coord_curve = 1 )
-  {
-    const size_t vertices_x = elems_x + 1 ;
-    const size_t vertices_y = elems_y + 1 ;
-    const size_t vertices_z = elems_z + 1 ;
-
-    const BoxBoundsLinear vertex_box_bounds ;
-    const ElementSpec element ;
-
-    // Partition based upon vertices:
-
-    BoxType vertex_box_global ;
-    std::vector< BoxType > vertex_box_parts( proc_count );
-
-    vertex_box_global[0][0] = 0 ; vertex_box_global[0][1] = vertices_x ;  // box[axis][0] = lower bound, box[axis][1] = upper bound
-    vertex_box_global[1][0] = 0 ; vertex_box_global[1][1] = vertices_y ;
-    vertex_box_global[2][0] = 0 ; vertex_box_global[2][1] = vertices_z ;
-
-    box_partition_rcb( vertex_box_global , vertex_box_parts );
-
-    const BoxType vertex_box_local_owned = vertex_box_parts[ proc_local ];
-
-    // Determine interior and used vertices:
-
-    BoxType vertex_box_local_interior ;
-    BoxType vertex_box_local_used ;
-
-    vertex_box_bounds.apply( vertex_box_global ,
-                             vertex_box_local_owned ,
-                             vertex_box_local_interior ,
-                             vertex_box_local_used );
-
-    // Element counts:
-
-    const long local_elems_x =  // elements per axis = used vertex extent - 1
-      ( vertex_box_local_used[0][1] - vertex_box_local_used[0][0] ) - 1 ;
-    const long local_elems_y =
-      ( vertex_box_local_used[1][1] - vertex_box_local_used[1][0] ) - 1 ;
-    const long local_elems_z =
-      ( vertex_box_local_used[2][1] - vertex_box_local_used[2][0] ) - 1 ;
-
-    const size_t elem_count_total = std::max( long(0) , local_elems_x ) *  // negative per-axis counts are clamped to zero
-                                    std::max( long(0) , local_elems_y ) *
-                                    std::max( long(0) , local_elems_z );
-
-    const long interior_elems_x =  // same computation on the owned vertex box
-      ( vertex_box_local_owned[0][1] - vertex_box_local_owned[0][0] ) - 1 ;
-    const long interior_elems_y =
-      ( vertex_box_local_owned[1][1] - vertex_box_local_owned[1][0] ) - 1 ;
-    const long interior_elems_z =
-      ( vertex_box_local_owned[2][1] - vertex_box_local_owned[2][0] ) - 1 ;
-
-    const size_t elem_count_interior = std::max( long(0) , interior_elems_x ) *
-                                       std::max( long(0) , interior_elems_y ) *
-                                       std::max( long(0) , interior_elems_z );
-
-    // Expand vertex boxes to node boxes:
-
-    BoxType node_box_global ;
-    BoxType node_box_local_used ;
-    std::vector< BoxType > node_box_parts ;
-
-    element.create_node_boxes_from_vertex_boxes(
-      vertex_box_global , vertex_box_parts ,
-      node_box_global , node_box_parts );
-
-    // Node communication maps:
-
-    size_t node_count_interior = 0 ;
-    size_t node_count_owned    = 0 ;
-    size_t node_count_total    = 0 ;
-    std::vector<size_t>                 node_used_id_map ;
-    std::vector<size_t>                 node_part_counts ;
-    std::vector< std::vector<size_t> >  node_send_map ;
-
-    box_partition_maps( node_box_global ,  // fills node_box_local_used, the id map, the three counts, and the per-part count/send tables
-                        node_box_parts ,
-                        element.box_bounds ,
-                        proc_local ,
-                        node_box_local_used ,
-                        node_used_id_map ,
-                        node_count_interior ,
-                        node_count_owned ,
-                        node_count_total ,
-                        node_part_counts ,
-                        node_send_map );
-
-    size_t node_count_send = 0 ;  // NOTE(review): accumulated below but never read again in this function
-    for ( size_t i = 0 ; i < node_send_map.size() ; ++i ) {
-      node_count_send += node_send_map[i].size();
-    }
-
-    size_t recv_msg_count = 0 ;
-    size_t send_msg_count = 0 ;
-    size_t send_count = 0 ;
-
-    for ( size_t i = 1 ; i < proc_count ; ++i ) {  // first pass only counts messages/items to size parallel_data_map; entry 0 is skipped
-      if ( node_part_counts[i] ) ++recv_msg_count ;
-      if ( node_send_map[i].size() ) {
-        ++send_msg_count ;
-        send_count += node_send_map[i].size();
-      }
-    }
-
-    // Finite element mesh:
-
-    FEMeshType mesh ;
-
-    if ( node_count_total ) {
-      mesh.node_coords = node_coords_type( "node_coords", node_count_total );
-    }
-
-    if ( elem_count_total ) {
-      mesh.elem_node_ids =
-        elem_node_ids_type( "elem_node_ids", elem_count_total );
-    }
-
-    mesh.parallel_data_map.assign( node_count_interior ,
-                                   node_count_owned ,
-                                   node_count_total ,
-                                   recv_msg_count ,
-                                   send_msg_count ,
-                                   send_count );
-
-    typename node_coords_type::HostMirror node_coords =  // host-side copies are filled below and deep-copied to the mesh at the end
-      Kokkos::create_mirror( mesh.node_coords );
-
-    typename elem_node_ids_type::HostMirror elem_node_ids =
-      Kokkos::create_mirror( mesh.elem_node_ids );
-
-    //------------------------------------
-    // set node coordinates to grid location for subsequent verification
-
-    for ( size_t iz = node_box_local_used[2][0] ;
-                 iz < node_box_local_used[2][1] ; ++iz ) {
-
-    for ( size_t iy = node_box_local_used[1][0] ;
-                 iy < node_box_local_used[1][1] ; ++iy ) {
-
-    for ( size_t ix = node_box_local_used[0][0] ;
-                 ix < node_box_local_used[0][1] ; ++ix ) {
-
-      const size_t node_local_id =
-        box_map_id( node_box_local_used , node_used_id_map , ix , iy , iz );
-
-      node_coords( node_local_id , 0 ) = ix ;
-      node_coords( node_local_id , 1 ) = iy ;
-      node_coords( node_local_id , 2 ) = iz ;
-    }}}
-
-    //------------------------------------
-    // Initialize element-node connectivity:
-
-    if ( 1 < gang_count ) {
-      layout_elements_partitioned( vertex_box_local_used ,
-                                   vertex_box_local_owned ,
-                                   node_box_local_used ,
-                                   node_used_id_map ,
-                                   element ,
-                                   gang_count ,
-                                   elem_node_ids );
-    }
-    else {
-      layout_elements_interior_exterior( vertex_box_local_used ,
-                                         vertex_box_local_owned ,
-                                         node_box_local_used ,
-                                         node_used_id_map ,
-                                         element ,
-                                         elem_count_interior ,
-                                         elem_node_ids );
-    }
-
-    //------------------------------------
-    // Populate node->element connectivity:
-
-    std::vector<size_t> node_elem_work( node_count_total , (size_t) 0 );
-
-    for ( size_t i = 0 ; i < elem_count_total ; ++i ) {  // count how many (element, local node) entries reference each node
-      for ( size_t n = 0 ; n < element_node_count  ; ++n ) {
-        ++node_elem_work[ elem_node_ids(i,n) ];
-      }
-    }
-
-    mesh.node_elem_ids =  // graph is sized from the per-node counts
-      Kokkos::create_staticcrsgraph< node_elem_ids_type >( "node_elem_ids" , node_elem_work );
-
-    typename node_elem_ids_type::HostMirror
-      node_elem_ids = Kokkos::create_mirror( mesh.node_elem_ids );
-
-    for ( size_t i = 0 ; i < node_count_total ; ++i ) {  // reuse the work array as each node's next free entry slot
-      node_elem_work[i] = node_elem_ids.row_map[i];
-    }
-
-    // Looping in element order ensures that each node's list of
-    // elements is sorted by element index.
-
-    for ( size_t i = 0 ; i < elem_count_total ; ++i ) {
-      for ( size_t n = 0 ; n < element_node_count ; ++n ) {
-        const unsigned nid = elem_node_ids(i, n);
-        const unsigned j = node_elem_work[nid] ; ++node_elem_work[nid] ;  // take the node's next slot, then advance it
-
-        node_elem_ids.entries( j , 0 ) = i ;  // column 0: element index
-        node_elem_ids.entries( j , 1 ) = n ;  // column 1: local node index within that element
-      }
-    }
-    //------------------------------------
-    // Verify setup with node coordinates matching grid indices.
-    verify( node_coords , elem_node_ids , node_elem_ids );
-
-    //------------------------------------
-    // Scale node coordinates to problem extent with
-    // nonlinear mapping.
-    {
-      const double problem_extent[3] =
-        { static_cast<double>( vertex_box_global[0][1] - 1 ) ,
-          static_cast<double>( vertex_box_global[1][1] - 1 ) ,
-          static_cast<double>( vertex_box_global[2][1] - 1 ) };
-
-      const double grid_extent[3] =
-        { static_cast<double>( node_box_global[0][1] - 1 ) ,
-          static_cast<double>( node_box_global[1][1] - 1 ) ,
-          static_cast<double>( node_box_global[2][1] - 1 ) };
-
-      for ( size_t i = 0 ; i < node_count_total ; ++i ) {
-        const double x_unit = node_coords(i,0) / grid_extent[0] ;  // grid index divided by the largest node grid index on that axis
-        const double y_unit = node_coords(i,1) / grid_extent[1] ;
-        const double z_unit = node_coords(i,2) / grid_extent[2] ;
-
-        node_coords(i,0) = coordinate_scalar_type( problem_extent[0] * std::pow( x_unit , x_coord_curve ) );
-        node_coords(i,1) = coordinate_scalar_type( problem_extent[1] * std::pow( y_unit , y_coord_curve ) );
-        node_coords(i,2) = coordinate_scalar_type( problem_extent[2] * std::pow( z_unit , z_coord_curve ) );
-      }
-    }
-
-    Kokkos::deep_copy( mesh.node_coords ,   node_coords );
-    Kokkos::deep_copy( mesh.elem_node_ids , elem_node_ids );
-    Kokkos::deep_copy( mesh.node_elem_ids.entries , node_elem_ids.entries );  // only entries are copied; presumably row_map was already set by create_staticcrsgraph -- confirm
-
-    //------------------------------------
-    // Communication lists:
-    {
-      recv_msg_count = 0 ;  // counters are reused as write cursors for the second pass
-      send_msg_count = 0 ;
-      send_count = 0 ;
-
-      for ( size_t i = 1 ; i < proc_count ; ++i ) {
-
-        // Order sending starting with the local processor rank
-        // to try to smooth out the number of messages simultaneously
-        // sent to a particular processor.
-
-        const int proc = ( proc_local + i ) % proc_count ;  // table index i is treated as an offset from proc_local
-        if ( node_part_counts[i] ) {
-          mesh.parallel_data_map.host_recv(recv_msg_count,0) = proc ;
-          mesh.parallel_data_map.host_recv(recv_msg_count,1) = node_part_counts[i] ;
-          ++recv_msg_count ;
-        }
-        if ( node_send_map[i].size() ) {
-          mesh.parallel_data_map.host_send(send_msg_count,0) = proc ;
-          mesh.parallel_data_map.host_send(send_msg_count,1) = node_send_map[i].size() ;
-          for ( size_t j = 0 ; j < node_send_map[i].size() ; ++j , ++send_count ) {
-            mesh.parallel_data_map.host_send_item(send_count) = node_send_map[i][j] - node_count_interior ;  // stored relative to node_count_interior
-          }
-          ++send_msg_count ;
-        }
-      }
-    }
-
-    return mesh ;
-  }
-};
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-#endif /* #ifndef KOKKOS_BOXMESHFIXTURE_HPP */
-
-
--- a/lib/kokkos/example/multi_fem/BoxMeshPartition.cpp
+++ b/lib/kokkos/example/multi_fem/BoxMeshPartition.cpp
@ -1,381 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-// 
-//                        Kokkos v. 2.0
-//              Copyright (2014) Sandia Corporation
-// 
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
-// the U.S. Government retains certain rights in this software.
-// 
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
-// ************************************************************************
-//@HEADER
-*/
-
-#include <iostream>
-#include <sstream>
-#include <stdexcept>
-#include <limits>
-#include <BoxMeshPartition.hpp>
-
-//----------------------------------------------------------------------------
-
-namespace {
-
-// Recursively split 'box' among the parts [ip,up), writing one sub-box
-// per part into p_box[ip] .. p_box[up-1].
-//
-//   ip , up : half-open range of part indices to fill
-//   box     : region to divide, each axis being [ box[a][0] , box[a][1] )
-//   p_box   : output array indexed by part; must hold at least 'up' entries
-//
-// Each cut is made along the axis with the largest extent (ties go to the
-// lower axis).  A part count divisible by three is cut into thirds; any
-// other count is cut in two, sized in proportion to the parts on each side.
-void box_partition( size_t ip , size_t up ,
-                    const BoxType & box ,
-                    BoxType * const p_box )
-{
-  // An empty (or inverted) part range has nothing to fill.  Without this
-  // guard np == 0 satisfies 0 == np % 3 and the function would recurse on
-  // the same empty range forever.
-  if ( up <= ip ) { return ; }
-
-  const size_t np = up - ip ;
-
-  if ( 1 == np ) {
-    p_box[ip] = box ;
-  }
-  else {
-    // Choose axis with largest count:
-
-    const size_t n0 = box[0][1] - box[0][0] ;
-    const size_t n1 = box[1][1] - box[1][0] ;
-    const size_t n2 = box[2][1] - box[2][0] ;
-
-    const size_t axis = n2 > n1 ? ( n2 > n0 ? 2 : ( n1 > n0 ? 1 : 0 ) ) :
-                                  ( n1 > n0 ? 1 : 0 );
-
-    const size_t n = box[ axis ][1] - box[ axis ][0] ;
-
-    if ( 0 == np % 3 ) {
-      const size_t np_part = np / 3 ; // exact
-
-      // Cut positions at one third and two thirds of the axis extent,
-      // truncated toward zero.
-      const size_t nbox_low = (size_t)(( (double) n ) * ( 1.0 / 3.0 ));
-      const size_t nbox_mid = (size_t)(( (double) n ) * ( 2.0 / 3.0 ));
-
-      BoxType dbox_low = box ; // P = [ip,ip+np/3)
-      BoxType dbox_mid = box ; // P = [ip+np/3,ip+2*np/3)
-      BoxType dbox_upp = box ; // P = [ip+2*np/3,ip+np)
-
-      dbox_low[ axis ][1] = box[ axis ][0] + nbox_low ;
-      dbox_mid[ axis ][1] = box[ axis ][0] + nbox_mid ;
-
-      // Each piece begins where the previous one ends.
-      dbox_mid[ axis ][0] = dbox_low[ axis ][1];
-      dbox_upp[ axis ][0] = dbox_mid[ axis ][1];
-
-      box_partition( ip,           ip +   np_part, dbox_low , p_box );
-      box_partition( ip+  np_part, ip + 2*np_part, dbox_mid , p_box );
-      box_partition( ip+2*np_part, up,             dbox_upp , p_box );
-    }
-    else {
-      const size_t np_low = np / 2 ; /* Rounded down */
-
-      // The low piece gets the fraction np_low / np of the axis extent,
-      // truncated toward zero.
-      const size_t nbox_low = (size_t)
-        (((double)n) * ( ((double) np_low ) / ((double) np ) ));
-
-      BoxType dbox_low = box ;
-      BoxType dbox_upp = box ;
-
-      dbox_low[ axis ][1] = dbox_low[ axis ][0] + nbox_low ;
-      dbox_upp[ axis ][0] = dbox_low[ axis ][1];
-
-      box_partition( ip, ip + np_low, dbox_low , p_box );
-      box_partition( ip + np_low, up, dbox_upp , p_box );
-    }
-  }
-}
-
-// Map a global (i,j,k) index to its linear offset inside 'local_use',
-// with the first axis varying fastest.
-// Throws std::runtime_error when the index lies outside the box.
-size_t box_map_offset( const BoxType & local_use ,
-                       const size_t global_i ,
-                       const size_t global_j ,
-                       const size_t global_k )
-
-{
-  const size_t invalid = std::numeric_limits<size_t>::max();
-
-  const size_t global[3] = { global_i , global_j , global_k };
-
-  size_t extent[3] ;
-  size_t local[3] ;
-  bool inside = true ;
-
-  for ( size_t d = 0 ; d < 3 ; ++d ) {
-    extent[d] = local_use[d][1] - local_use[d][0] ;
-
-    // An index below the lower bound gets the sentinel, which is never
-    // less than the extent and therefore always fails the range test.
-    local[d] = global[d] >= local_use[d][0] ? global[d] - local_use[d][0]
-                                            : invalid ;
-
-    if ( ! ( local[d] < extent[d] ) ) { inside = false ; }
-  }
-
-  size_t offset = invalid ;
-
-  if ( inside ) {
-    offset = local[0] + extent[0] * ( local[1] + extent[1] * local[2] );
-  }
-
-  if ( offset == invalid ) {
-    std::ostringstream msg ;
-    msg << "box_map_offset ERROR: "
-        << " use " << local_use
-        << " ( " << global_i
-        << " , " << global_j
-        << " , " << global_k
-        << " )" ;
-    throw std::runtime_error( msg.str() );
-  }
-
-  return offset ;
-}
-
-} // namespace
-
-//----------------------------------------------------------------------------
-
-// Compute the interior and use boxes of one part with a single ghost layer.
-//
-//   box_global   : [in]  the whole domain
-//   box_part     : [in]  the part's own box
-//   box_interior : [out] box_part shrunk by the ghost width on every side
-//                        that does not coincide with the global boundary
-//   box_use      : [out] box_part grown by the ghost width on every side,
-//                        clamped to box_global
-//
-// An empty box_part is copied unchanged into both outputs.
-void BoxBoundsLinear::apply(  const BoxType & box_global ,
-                              const BoxType & box_part ,
-                                    BoxType & box_interior ,
-                                    BoxType & box_use ) const
-{
-  const unsigned ghost = 1 ;
-
-  if ( 0 == count( box_part ) ) {
-    box_interior = box_part ;
-    box_use      = box_part ;
-  }
-  else {
-    for ( size_t i = 0 ; i < 3 ; ++i ) {
-
-      // Interior lower bound: unchanged on the global boundary, otherwise
-      // moved inward by 'ghost'; a part no wider than 'ghost' collapses
-      // to its upper bound.
-      box_interior[i][0] =
-        ( box_part[i][0] == box_global[i][0] )      ? box_part[i][0] : (
-        ( box_part[i][0] + ghost < box_part[i][1] ) ? box_part[i][0] + ghost : 
-                                                      box_part[i][1] );
-
-      // Interior upper bound: unchanged on the global boundary, otherwise
-      // moved inward by 'ghost'; a part no wider than 'ghost' collapses
-      // to its lower bound.
-      box_interior[i][1] =
-        ( box_part[i][1] == box_global[i][1] )      ? box_part[i][1] : (
-        ( box_part[i][0] + ghost < box_part[i][1] ) ? box_part[i][1] - ghost :
-                                                      box_part[i][0] );
-
-      // Use box: extend outward by 'ghost' without crossing the global
-      // bounds.  The comparison is arranged so no subtraction can wrap.
-      box_use[i][0] = 
-        ( box_part[i][0] > ghost + box_global[i][0] ) ? box_part[i][0] - ghost :
-                                                        box_global[i][0] ;
-      box_use[i][1] = 
-        ( box_part[i][1] + ghost < box_global[i][1] ) ? box_part[i][1] + ghost :
-                                                        box_global[i][1] ;
-    }
-  }
-}
-
-// Compute the interior and use boxes of one part, with a ghost width that
-// depends on the part's position along each axis.
-//
-//   box_global   : [in]  the whole domain
-//   box_part     : [in]  the part's own box
-//   box_interior : [out] box_part shrunk by the ghost width on every side
-//                        that does not coincide with the global boundary
-//   box_use      : [out] box_part grown by the ghost width on every side,
-//                        clamped to box_global
-//
-// An empty box_part is copied unchanged into both outputs.
-void BoxBoundsQuadratic::apply( const BoxType & box_global ,
-                                const BoxType & box_part ,
-                                      BoxType & box_interior ,
-                                      BoxType & box_use ) const
-{
-  if ( 0 == count( box_part ) ) {
-    box_interior = box_part ;
-    box_use      = box_part ;
-  }
-  else {
-    for ( size_t i = 0 ; i < 3 ; ++i ) {
-      // Parity of the part's lower bound relative to the global lower bound.
-      const bool odd = ( box_part[i][0] - box_global[i][0] ) & 01 ;
-
-      // One layer when that offset is odd, two layers when it is even.
-      // The same width is applied to both ends of this axis.
-      const unsigned ghost = odd ? 1 : 2 ;
-
-      // Interior lower bound: unchanged on the global boundary, otherwise
-      // moved inward by 'ghost'; a part no wider than 'ghost' collapses
-      // to its upper bound.
-      box_interior[i][0] =
-        ( box_part[i][0] == box_global[i][0] )      ? box_part[i][0] : (
-        ( box_part[i][0] + ghost < box_part[i][1] ) ? box_part[i][0] + ghost : 
-                                                      box_part[i][1] );
-
-      // Interior upper bound: unchanged on the global boundary, otherwise
-      // moved inward by 'ghost'; a part no wider than 'ghost' collapses
-      // to its lower bound.
-      box_interior[i][1] =
-        ( box_part[i][1] == box_global[i][1] )      ? box_part[i][1] : (
-        ( box_part[i][0] + ghost < box_part[i][1] ) ? box_part[i][1] - ghost :
-                                                      box_part[i][0] );
-
-      // Use box: extend outward by 'ghost' without crossing the global
-      // bounds.
-      box_use[i][0] = 
-        ( box_part[i][0] > ghost + box_global[i][0] ) ? box_part[i][0] - ghost :
-                                                        box_global[i][0] ;
-      box_use[i][1] = 
-        ( box_part[i][1] + ghost < box_global[i][1] ) ? box_part[i][1] + ghost :
-                                                        box_global[i][1] ;
-    }
-  }
-}
-
-//----------------------------------------------------------------------------
-
-// Partition root_box into part_boxes.size() sub-boxes, one per entry of
-// part_boxes, and verify the result.
-//
-// Throws std::runtime_error when
-//   - a part is inconsistent with its one-ghost-layer use / interior boxes,
-//   - two parts overlap, or
-//   - the parts' cell counts do not add up to the cell count of root_box.
-void box_partition_rcb( const BoxType        & root_box ,
-                        std::vector<BoxType> & part_boxes )
-{
-  const BoxBoundsLinear use_boxes ;
-
-  const size_t part_count = part_boxes.size();
-
-  // Taking & part_boxes[0] of an empty vector is undefined, so only
-  // partition when there is at least one part to fill.  With no parts the
-  // cell-count check at the end rejects any non-empty root_box.
-  if ( 0 < part_count ) {
-    box_partition( 0 , part_count , root_box , & part_boxes[0] );
-  }
-
-  // Verify partitioning
-
-  size_t total_cell = 0 ;
-
-  for ( size_t i = 0 ; i < part_count ; ++i ) {
-
-    total_cell += count( part_boxes[i] );
-
-    BoxType box_interior , box_use ;
-
-    use_boxes.apply( root_box , part_boxes[i] , box_interior , box_use );
-
-    // Expect interior <= part <= use by cell count, with the part lying
-    // inside its use box and the interior lying inside the part.
-    if ( count( box_use ) < count( part_boxes[i] ) ||
-         count( part_boxes[i] ) < count( box_interior ) ||
-         part_boxes[i] != intersect( part_boxes[i] , box_use ) ||
-         box_interior  != intersect( part_boxes[i] , box_interior )) {
-
-      std::ostringstream msg ;
-
-      msg << "box_partition_rcb ERROR : "
-          << "part_boxes[" << i << "] = "
-          << part_boxes[i]
-          << " use " << box_use
-          << " interior " << box_interior
-          << std::endl 
-          << "  part ^ use " << intersect( part_boxes[i] , box_use )
-          << "  part ^ interior " << intersect( part_boxes[i] , box_interior );
-
-      throw std::runtime_error( msg.str() );
-    }
-
-    // No two parts may share a cell.
-    for ( size_t j = i + 1 ; j < part_count ; ++j ) {
-      const BoxType tmp = intersect( part_boxes[i] , part_boxes[j] );
-
-      if ( count( tmp ) ) {
-        throw std::runtime_error( std::string("box partition intersection") );
-      }
-    }
-  }
-
-  // Together the parts must cover exactly the cells of the root box.
-  if ( total_cell != count( root_box ) ) {
-    throw std::runtime_error( std::string("box partition count") );
-  }
-}
-
-//----------------------------------------------------------------------------
-         
-// Look up the local id stored for global index (i,j,k): the index is
-// converted to an offset within local_use, which then selects the entry
-// of local_use_id_map.
-size_t box_map_id( const BoxType & local_use ,
-                   const std::vector<size_t> & local_use_id_map ,
-                   const size_t global_i ,
-                   const size_t global_j ,
-                   const size_t global_k )
-
-{
-  return local_use_id_map[
-    box_map_offset( local_use , global_i , global_j , global_k ) ];
-}
-         
-//----------------------------------------------------------------------------
-
-// Build the local id numbering and the send / receive maps of part
-// 'my_part' from the partitioned boxes.
-//
-// Outputs indexed by i (my_part_counts, my_send_map) use the relative
-// offset i, which refers to part ( my_part + i ) % np; entry 0 is this
-// part itself.
-//
-// Throws std::runtime_error when my_part is not a valid index into
-// part_boxes.
-void box_partition_maps( const BoxType              & root_box ,
-                         const std::vector<BoxType> & part_boxes ,
-                         const BoxBounds            & use_boxes ,
-                         const size_t          my_part ,
-                         BoxType             & my_use_box ,
-                         std::vector<size_t> & my_use_id_map ,
-                         size_t              & my_count_interior ,
-                         size_t              & my_count_owned ,
-                         size_t              & my_count_uses ,
-                         std::vector<size_t> & my_part_counts ,
-                         std::vector<std::vector<size_t> > & my_send_map )
-{
-  const size_t np = part_boxes.size();
-
-  if ( np <= my_part ) {
-    std::ostringstream msg ;
-    msg << "box_partition_maps ERROR : "
-        << " np(" << np << ") <= my_part(" << my_part << ")" ;
-    throw std::runtime_error( msg.str() );
-  }
-
-  const BoxType my_owned_box = part_boxes[my_part];
-  BoxType my_interior_box ;
-
-
-  // Derive this part's interior and use (owned + ghost) boxes.
-  use_boxes.apply( root_box, my_owned_box, my_interior_box, my_use_box );
-
-  my_count_interior = count( my_interior_box );
-  my_count_owned    = count( my_owned_box );
-  my_count_uses     = count( my_use_box );
-
-  // One entry per cell of the use box; the maximum size_t value marks an
-  // entry that has not been assigned an id.
-  my_use_id_map.assign( my_count_uses , std::numeric_limits<size_t>::max() );
-
-  // Order ids as { owned-interior , owned-parallel , received_{(p+i)%np} }
-
-  size_t offset_interior = 0 ;
-  size_t offset_parallel = my_count_interior ;
-
-  // Number the owned cells: interior cells count up from 0, the remaining
-  // owned cells count up from my_count_interior.
-  for ( size_t iz = my_owned_box[2][0] ; iz < my_owned_box[2][1] ; ++iz ) {
-  for ( size_t iy = my_owned_box[1][0] ; iy < my_owned_box[1][1] ; ++iy ) {
-  for ( size_t ix = my_owned_box[0][0] ; ix < my_owned_box[0][1] ; ++ix ) {
-    const size_t offset = box_map_offset( my_use_box , ix , iy , iz );
-    if ( contain( my_interior_box , ix , iy , iz ) ) {
-      my_use_id_map[ offset ] = offset_interior++ ;
-    }
-    else {
-      my_use_id_map[ offset ] = offset_parallel++ ;
-    }
-  }}}
-
-
-  my_part_counts.assign( np , (size_t) 0 );
-  my_send_map.assign( np , std::vector<size_t>() );
-
-  my_part_counts[0] = my_count_owned ;
-
-  // Visit the other parts in the order my_part+1, my_part+2, ... (mod np).
-  for ( size_t i = 1 ; i < np ; ++i ) {
-
-    const size_t ip = ( my_part + i ) % np ;
-
-    const BoxType p_owned_box = part_boxes[ip];
-    BoxType p_use_box , p_interior_box ;
-    use_boxes.apply( root_box, p_owned_box, p_interior_box, p_use_box );
-
-    // recv_box: cells this part uses that part ip owns.
-    // send_box: cells this part owns that part ip uses.
-    const BoxType recv_box = intersect( my_use_box , p_owned_box );
-    const BoxType send_box = intersect( my_owned_box , p_use_box );
-
-    // Received cells continue the id sequence after the owned cells, so
-    // the ids received from each part form one consecutive run.
-    if ( 0 != ( my_part_counts[i] = count( recv_box ) ) ) {
-      for ( size_t iz = recv_box[2][0] ; iz < recv_box[2][1] ; ++iz ) {
-      for ( size_t iy = recv_box[1][0] ; iy < recv_box[1][1] ; ++iy ) {
-      for ( size_t ix = recv_box[0][0] ; ix < recv_box[0][1] ; ++ix ) {
-        const size_t offset = box_map_offset( my_use_box , ix , iy , iz );
-        my_use_id_map[ offset ] = offset_parallel++ ;
-      }}}
-    }
-
-    // Record the local ids (assigned above) of the owned cells to send.
-    if ( 0 != count( send_box ) ) {
-      for ( size_t iz = send_box[2][0] ; iz < send_box[2][1] ; ++iz ) {
-      for ( size_t iy = send_box[1][0] ; iy < send_box[1][1] ; ++iy ) {
-      for ( size_t ix = send_box[0][0] ; ix < send_box[0][1] ; ++ix ) {
-        const size_t offset = box_map_offset( my_use_box , ix , iy , iz );
-
-        my_send_map[ i ].push_back( my_use_id_map[ offset ] );
-      }}}
-    }
-  }
-}
-
-
--- a/lib/kokkos/example/multi_fem/BoxMeshPartition.hpp
+++ b/lib/kokkos/example/multi_fem/BoxMeshPartition.hpp
@ -1,210 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-// 
-//                        Kokkos v. 2.0
-//              Copyright (2014) Sandia Corporation
-// 
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
-// the U.S. Government retains certain rights in this software.
-// 
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
-// ************************************************************************
-//@HEADER
-*/
-
-#ifndef BOXMESHPARTITION_HPP
-#define BOXMESHPARTITION_HPP
-
-#include <cstddef>
-#include <utility>
-#include <vector>
-#include <iostream>
-
-//----------------------------------------------------------------------------
-
-// Axis-aligned index box over three axes.
-// data[axis][0] is the lower bound and data[axis][1] the upper bound.
-struct BoxType {
-  size_t data[3][2] ;
-
-  typedef size_t range_type[2] ;
-
-  // Access the { lower , upper } pair of axis i.
-  inline
-  const range_type & operator[]( size_t i ) const { return data[i]; }
-
-  inline
-  range_type & operator[]( size_t i ) { return data[i]; }
-
-  // True when all six bounds match.  Every bound is compared against the
-  // same axis of rhs.
-  inline
-  bool operator == ( const BoxType & rhs ) const
-  {
-    return data[0][0] == rhs.data[0][0] && data[0][1] == rhs.data[0][1] &&
-           data[1][0] == rhs.data[1][0] && data[1][1] == rhs.data[1][1] &&
-           data[2][0] == rhs.data[2][0] && data[2][1] == rhs.data[2][1] ;
-  }
-
-  // Exact negation of operator ==, so the two can never disagree.
-  inline
-  bool operator != ( const BoxType & rhs ) const
-  {
-    return ! ( *this == rhs );
-  }
-};
-
-// Number of cells in box b: the product of the three axis extents.
-// Returns 0 when any axis has an upper bound not above its lower bound.
-inline
-size_t count( const BoxType & b )
-{
-  size_t cells = 1 ;
-  for ( size_t axis = 0 ; axis < 3 ; ++axis ) {
-    const size_t lower = b[axis][0] ;
-    const size_t upper = b[axis][1] ;
-    if ( upper <= lower ) { return 0 ; }
-    cells *= upper - lower ;
-  }
-  return cells ;
-}
-
-// True when (i,j,k) lies inside box b, where each axis spans
-// [ lower , upper ).
-inline
-bool contain( const BoxType & b , size_t i , size_t j , size_t k )
-{
-  const size_t index[3] = { i , j , k };
-  for ( size_t axis = 0 ; axis < 3 ; ++axis ) {
-    if ( index[axis] < b[axis][0] || b[axis][1] <= index[axis] ) {
-      return false ;
-    }
-  }
-  return true ;
-}
-
-// Intersection of boxes x and y: per axis, the larger of the two lower
-// bounds and the smaller of the two upper bounds.
-// For boxes that do not overlap the result can have a lower bound at or
-// above its upper bound on some axis.
-inline
-BoxType intersect( const BoxType & x , const BoxType & y )
-{
-  BoxType z ;
-  for ( size_t i = 0 ; i < 3 ; ++i ) {
-    // Plain comparisons are used so this header does not depend on
-    // <algorithm>, which it does not include.
-    z[i][0] = x[i][0] < y[i][0] ? y[i][0] : x[i][0] ;
-    z[i][1] = y[i][1] < x[i][1] ? y[i][1] : x[i][1] ;
-  }
-
-  return z ;
-}
-
-// Write a box as "{ l0 u0 , l1 u1 , l2 u2 }" and return the stream.
-inline
-std::ostream & operator << ( std::ostream & s , const BoxType & box )
-{
-  s << "{ " ;
-  for ( size_t axis = 0 ; axis < 3 ; ++axis ) {
-    if ( 0 != axis ) { s << " , " ; }
-    s << box[axis][0] << " " << box[axis][1] ;
-  }
-  s << " }" ;
-  return s ;
-}
-
-//----------------------------------------------------------------------------
-
-// Abstract interface for deriving a part's interior and use boxes.
-class BoxBounds {
-public:
-  /** \brief  Compute the interior and use boxes of one part.
-   *
-   *  \param box_global   [in]  Global box
-   *  \param box_part     [in]  Box of the part
-   *  \param box_interior [out] Interior box of the part
-   *  \param box_use      [out] Use box of the part
-   *
-   *  The ghosting policy is defined by the derived class.
-   */
-  virtual
-  void apply( const BoxType & box_global ,
-              const BoxType & box_part ,
-                    BoxType & box_interior ,
-                    BoxType & box_use ) const = 0 ;
-
-  virtual ~BoxBounds() {}
-  BoxBounds() {}
-};
-
-// BoxBounds policy that uses one layer of ghosting.
-class BoxBoundsLinear : public BoxBounds
-{
-public:
-  /** \brief  Default bounds to one layer of ghosting.
-   *
-   *  \param box_global   [in]  Global box
-   *  \param box_part     [in]  Box of the part
-   *  \param box_interior [out] Interior box of the part
-   *  \param box_use      [out] Use box of the part
-   */
-  virtual
-  void apply( const BoxType & box_global ,
-              const BoxType & box_part ,
-                    BoxType & box_interior ,
-                    BoxType & box_use ) const ;
-
-  virtual ~BoxBoundsLinear() {}
-  BoxBoundsLinear() {}
-};
-
-// BoxBounds policy whose ghost width alternates between one and two layers.
-class BoxBoundsQuadratic : public BoxBounds {
-public:
-  /** \brief  Quadratic mesh: even ordinates have two layers,
-   *          odd ordinates have one layer.
-   *
-   *  \param box_global   [in]  Global box
-   *  \param box_part     [in]  Box of the part
-   *  \param box_interior [out] Interior box of the part
-   *  \param box_use      [out] Use box of the part
-   */
-  virtual
-  void apply( const BoxType & box_global ,
-              const BoxType & box_part ,
-                    BoxType & box_interior ,
-                    BoxType & box_use ) const ;
-
-  virtual ~BoxBoundsQuadratic() {}
-  BoxBoundsQuadratic() {}
-};
-
-//----------------------------------------------------------------------------
-/* Partition root_box into part_boxes.size() sub-boxes, stored in part_boxes.
- * The definition checks the resulting partition and throws
- * std::runtime_error when a check fails.
- */
-
-void box_partition_rcb( const BoxType        & root_box ,
-                        std::vector<BoxType> & part_boxes );
-
-//----------------------------------------------------------------------------
-/* Determine local id layout and communication maps for partitioned boxes.
- *
- *  Local ids are laid out as follows:
- *    { [ owned-interior ids not sent ] ,
- *      [ owned-boundary ids to be sent to other processes ] ,
- *      [ received ids from processor ( my_part + 1 ) % part_count ]
- *      [ received ids from processor ( my_part + 2 ) % part_count ]
- *      [ received ids from processor ( my_part + 3 ) % part_count ]
- *      ... };
- *
- *  This layout allows
- *  (1) received data to be copied into a contiguous block of memory
- *  (2) send data to be extracted from a contiguous block of memory.
- *
- *  my_part_counts[i] and my_send_map[i] are indexed by the relative
- *  offset i, i.e. they refer to processor ( my_part + i ) % part_count.
- */
-void box_partition_maps(
-  const BoxType              & root_box ,   // [in] Global box
-  const std::vector<BoxType> & part_boxes , // [in] Partitioned boxes
-  const BoxBounds            & use_boxes ,  // [in] Ghost boundaries
-  const size_t          my_part ,           // [in] My local part
-  BoxType             & my_use_box ,        // [out] My used box with ghost
-  std::vector<size_t> & my_use_id_map ,     // [out] Local ordering map
-  size_t              & my_count_interior , // [out] How many interior
-  size_t              & my_count_owned ,    // [out] How many owned
-  size_t              & my_count_uses ,     // [out] How many used
-  std::vector<size_t> & my_part_counts ,    // [out] Partitioning of my_use_id_map
-  std::vector<std::vector<size_t> > & my_send_map ); // [out] Send id map
-
-/*  Mapping of cartesian coordinate to local id.
- *  The definition throws std::runtime_error when the coordinate lies
- *  outside my_use_box.
- */
-size_t box_map_id( const BoxType             & my_use_box ,
-                   const std::vector<size_t> & my_use_id_map ,
-                   const size_t global_i ,
-                   const size_t global_j ,
-                   const size_t global_k );
-
-//----------------------------------------------------------------------------
-
-#endif /* #ifndef BOXMESHPARTITION_HPP */
-
--- a/lib/kokkos/example/multi_fem/CMakeLists.txt
+++ b/lib/kokkos/example/multi_fem/CMakeLists.txt
@ -1,16 +0,0 @@
-
-# Search the build tree first, then this source directory, for headers.
-include_directories(
-  ${CMAKE_CURRENT_BINARY_DIR}
-  ${CMAKE_CURRENT_SOURCE_DIR}
-  )
-
-# Start from an empty list, then collect every .cpp file in this directory.
-set(SOURCES "")
-
-file(GLOB SOURCES *.cpp)
-
-set(LIBRARIES kokkoscore)
-
-# Register the multi_fem example executable with TriBITS.
-tribits_add_executable(
-  multi_fem
-  SOURCES ${SOURCES}
-  COMM serial mpi
-  )
-
--- a/lib/kokkos/example/multi_fem/Explicit.hpp
+++ b/lib/kokkos/example/multi_fem/Explicit.hpp
@ -1,452 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-// 
-//                        Kokkos v. 2.0
-//              Copyright (2014) Sandia Corporation
-// 
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
-// the U.S. Government retains certain rights in this software.
-// 
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
-// ************************************************************************
-//@HEADER
-*/
-
-#ifndef EXPLICIT_DRIVER_HPP
-#define EXPLICIT_DRIVER_HPP
-
-#include <sys/time.h>
-#include <iostream>
-#include <iomanip>
-#include <cstdlib>
-#include <cmath>
-
-#include <impl/Kokkos_Timer.hpp>
-
-#include <ExplicitFunctors.hpp>
-
-//----------------------------------------------------------------------------
-
-namespace Explicit {
-
-// Timing totals and step count for one run; every field starts at zero.
-struct PerformanceData {
-  double mesh_time ;
-  double init_time ;
-  double internal_force_time ;
-  double central_diff ;
-  double comm_time ;
-  size_t number_of_steps ;
-
-  PerformanceData()
-  : mesh_time(0)
-  , init_time(0)
-  , internal_force_time(0)
-  , central_diff(0)
-  , comm_time(0)
-  , number_of_steps(0)
-  {}
-
-  // Keep, for each timer, the smaller of this record's value and rhs's
-  // value.  number_of_steps is left untouched.
-  void best( const PerformanceData & rhs )
-  {
-    mesh_time =
-      rhs.mesh_time < mesh_time ? rhs.mesh_time : mesh_time ;
-    init_time =
-      rhs.init_time < init_time ? rhs.init_time : init_time ;
-    internal_force_time =
-      rhs.internal_force_time < internal_force_time ? rhs.internal_force_time
-                                                    : internal_force_time ;
-    central_diff =
-      rhs.central_diff < central_diff ? rhs.central_diff : central_diff ;
-    comm_time =
-      rhs.comm_time < comm_time ? rhs.comm_time : comm_time ;
-  }
-};
-
-template< typename Scalar , class FixtureType >
-PerformanceData run( const typename FixtureType::FEMeshType & mesh ,
-                     const int global_max_x ,
-                     const int global_max_y ,
-                     const int global_max_z ,
-                     const int steps ,
-                     const int print_sample )
-{
-  typedef Scalar                              scalar_type ;
-  typedef FixtureType                         fixture_type ;
-  typedef typename fixture_type::execution_space  execution_space ;
-  //typedef typename fixture_type::FEMeshType   mesh_type ; // unused
-
-  enum { ElementNodeCount = fixture_type::element_node_count };
-
-  const int NumStates = 2;
-
-  const int total_num_steps = steps ;
-
-  const Scalar user_dt = 5.0e-6;
-  //const Scalar  end_time = 0.0050;
-
-  // element block parameters
-  const Scalar  lin_bulk_visc = 0.0;
-  const Scalar  quad_bulk_visc = 0.0;
-
-  // const Scalar  lin_bulk_visc = 0.06;
-  // const Scalar  quad_bulk_visc = 1.2;
-  // const Scalar  hg_stiffness = 0.0;
-  // const Scalar  hg_viscosity = 0.0;
-  // const Scalar  hg_stiffness = 0.03;
-  // const Scalar  hg_viscosity = 0.001;
-
-  // material properties
-  const Scalar youngs_modulus=1.0e6;
-  const Scalar poissons_ratio=0.0;
-  const Scalar  density = 8.0e-4;
-
-  const comm::Machine machine = mesh.parallel_data_map.machine ;
-
-  PerformanceData perf_data ;
-
-  Kokkos::Impl::Timer wall_clock ;
-
-  //------------------------------------
-  // Generate fields
-
-  typedef Fields< scalar_type , execution_space > fields_type ;
-
-  fields_type mesh_fields( mesh ,
-                           lin_bulk_visc ,
-                           quad_bulk_visc ,
-                           youngs_modulus ,
-                           poissons_ratio ,
-                           density );
-
-  typename fields_type::node_coords_type::HostMirror
-    model_coords_h = Kokkos::create_mirror( mesh_fields.model_coords );
-
-  typename fields_type::geom_state_array_type::HostMirror
-    displacement_h = Kokkos::create_mirror( mesh_fields.displacement );
-
-  typename fields_type::geom_state_array_type::HostMirror
-    velocity_h = Kokkos::create_mirror( mesh_fields.velocity );
-
-  Kokkos::deep_copy( model_coords_h , mesh_fields.model_coords );
-
-  //------------------------------------
-  // Initialization
-
-  initialize_element<Scalar,execution_space>::apply( mesh_fields );
-  initialize_node<   Scalar,execution_space>::apply( mesh_fields );
-
-  const Scalar x_bc = global_max_x ;
-
-  // Initial condition on velocity to initiate a pulse along the X axis
-  {
-    const unsigned X = 0;
-    for (int inode = 0; inode< mesh_fields.num_nodes; ++inode) {
-      if ( model_coords_h(inode,X) == 0) {
-        velocity_h(inode,X,0) = 1.0e3;
-        velocity_h(inode,X,1) = 1.0e3;
-      }
-    }
-  }
-
-  Kokkos::deep_copy( mesh_fields.velocity , velocity_h );
-
-  //--------------------------------------------------------------------------
-  // We will call a sequence of functions.  These functions have been
-  // grouped into several functors to balance the number of global memory
-  // accesses versus requiring too many registers or too much L1 cache.
-  // Global memory accesses have read/write cost and memory subsystem contention cost.
-  //--------------------------------------------------------------------------
-
-  perf_data.init_time = comm::max( machine , wall_clock.seconds() );
-
-  // Parameters required for the internal force computations.
-
-  int current_state = 0;
-  int previous_state = 0;
-  int next_state = 0;
-
-  perf_data.number_of_steps = total_num_steps ;
-
-#if defined( KOKKOS_HAVE_MPI )
-
-  typedef typename
-    fields_type::geom_state_array_type::value_type  comm_value_type ;
-
-  const unsigned comm_value_count = 6 ;
-
-  Kokkos::AsyncExchange< comm_value_type , execution_space ,
-                              Kokkos::ParallelDataMap >
-    comm_exchange( mesh.parallel_data_map , comm_value_count );
-
-#endif
-
-  for (int step = 0; step < total_num_steps; ++step) {
-
-    wall_clock.reset();
-
-    //------------------------------------------------------------------------
-#if defined( KOKKOS_HAVE_MPI )
-    {
-      // Communicate "send" nodes' displacement and velocity next_state
-      // to the ghosted nodes.
-      // buffer packages: { { dx , dy , dz , vx , vy , vz }_node }
-
-      pack_state< Scalar , execution_space >
-        ::apply( comm_exchange.buffer() ,
-                 mesh.parallel_data_map.count_interior ,
-                 mesh.parallel_data_map.count_send ,
-                 mesh_fields , next_state );
-
-      comm_exchange.setup();
-
-      comm_exchange.send_receive();
-
-      unpack_state< Scalar , execution_space >
-        ::apply( mesh_fields , next_state ,
-                 comm_exchange.buffer() ,
-                 mesh.parallel_data_map.count_owned ,
-                 mesh.parallel_data_map.count_receive );
-
-      execution_space::fence();
-    }
-#endif
-
-    perf_data.comm_time += comm::max( machine , wall_clock.seconds() );
-
-    //------------------------------------------------------------------------
-    // rotate the states
-
-    previous_state = current_state;
-    current_state = next_state;
-    ++next_state;
-    next_state %= NumStates;
-
-    wall_clock.reset();
-
-    // First kernel 'grad_hgop' combines two functions:
-    // gradient, velocity gradient
-    grad< Scalar , execution_space >::apply( mesh_fields ,
-                                         current_state ,
-                                         previous_state );
-
-    // Combine tensor decomposition and rotation functions.
-    decomp_rotate< Scalar , execution_space >::apply( mesh_fields ,
-                                                  current_state ,
-                                                  previous_state );
-
-    internal_force< Scalar , execution_space >::apply( mesh_fields ,
-                                                   user_dt ,
-                                                   current_state );
-
-    execution_space::fence();
-
-    perf_data.internal_force_time +=
-      comm::max( machine , wall_clock.seconds() );
-
-    wall_clock.reset();
-
-    // Assembly of elements' contributions to nodal force into
-    // a nodal force vector.  Update the accelerations, velocities,
-    // displacements.
-    // The same pattern can be used for matrix-free residual computations.
-    nodal_step< Scalar , execution_space >::apply( mesh_fields ,
-                                               x_bc ,
-                                               current_state,
-                                               next_state );
-    execution_space::fence();
-
-    perf_data.central_diff +=
-      comm::max( machine , wall_clock.seconds() );
-
-    if ( print_sample && 0 == step % 100 ) {
-      Kokkos::deep_copy( displacement_h , mesh_fields.displacement );
-      Kokkos::deep_copy( velocity_h ,     mesh_fields.velocity );
-
-      if ( 1 == print_sample ) {
-
-        std::cout << "step " << step
-                  << " : displacement(*,0,0) =" ;
-        for ( int i = 0 ; i < mesh_fields.num_nodes_owned ; ++i ) {
-          if ( model_coords_h(i,1) == 0 && model_coords_h(i,2) == 0 ) {
-            std::cout << " " << displacement_h(i,0,next_state);
-          }
-        }
-        std::cout << std::endl ;
-
-        const float tol = 1.0e-6 ;
-        const int yb = global_max_y ;
-        const int zb = global_max_z ;
-        std::cout << "step " << step
-                  << " : displacement(*," << yb << "," << zb << ") =" ;
-        for ( int i = 0 ; i < mesh_fields.num_nodes_owned ; ++i ) {
-          if ( fabs( model_coords_h(i,1) - yb ) < tol &&
-               fabs( model_coords_h(i,2) - zb ) < tol ) {
-            std::cout << " " << displacement_h(i,0,next_state);
-          }
-        }
-        std::cout << std::endl ;
-      }
-      else if ( 2 == print_sample ) {
-
-        const float tol = 1.0e-6 ;
-        const int xb = global_max_x / 2 ;
-        const int yb = global_max_y / 2 ;
-        const int zb = global_max_z / 2 ;
-
-        for ( int i = 0 ; i < mesh_fields.num_nodes_owned ; ++i ) {
-          if ( fabs( model_coords_h(i,0) - xb ) < tol &&
-               fabs( model_coords_h(i,1) - yb ) < tol &&
-               fabs( model_coords_h(i,2) - zb ) < tol ) {
-            std::cout << "step " << step
-                      << " : displacement("
-                      << xb << "," << yb << "," << zb << ") = {"
-                      << std::setprecision(6)
-                      << " " << displacement_h(i,0,next_state)
-                      << std::setprecision(2)
-                      << " " << displacement_h(i,1,next_state)
-                      << std::setprecision(2)
-                      << " " << displacement_h(i,2,next_state)
-                      << " }" << std::endl ;
-          }
-        }
-      }
-    }
-  }
-
-  return perf_data ;
-}
-
-
-template <typename Scalar, typename Device>
-static void driver( const char * const label ,
-                    comm::Machine machine ,
-                    const int gang_count ,
-                    const int elem_count_beg ,
-                    const int elem_count_end ,
-                    const int runs )
-{
-  typedef Scalar              scalar_type ;
-  typedef Device              execution_space ;
-  typedef double              coordinate_scalar_type ;
-  typedef FixtureElementHex8  fixture_element_type ;
-
-  typedef BoxMeshFixture< coordinate_scalar_type ,
-                          execution_space ,
-                          fixture_element_type > fixture_type ;
-
-  typedef typename fixture_type::FEMeshType mesh_type ;
-
-  const size_t proc_count = comm::size( machine );
-  const size_t proc_rank  = comm::rank( machine );
-
-  const int space = 15 ;
-  const int steps = 1000 ;
-  const int print_sample = 0 ;
-
-  if ( comm::rank( machine ) == 0 ) {
-
-    std::cout << std::endl ;
-    std::cout << "\"MiniExplicitDynamics with Kokkos " << label
-              << " time_steps(" << steps << ")"
-              << "\"" << std::endl;
-    std::cout << std::left << std::setw(space) << "\"Element\" , ";
-    std::cout << std::left << std::setw(space) << "\"Node\" , ";
-    std::cout << std::left << std::setw(space) << "\"Initialize\" , ";
-    std::cout << std::left << std::setw(space) << "\"ElemForce\" , ";
-    std::cout << std::left << std::setw(space) << "\"NodeUpdate\" , ";
-    std::cout << std::left << std::setw(space) << "\"NodeComm\" , ";
-    std::cout << std::left << std::setw(space) << "\"Time/Elem\" , ";
-    std::cout << std::left << std::setw(space) << "\"Time/Node\"";
-
-    std::cout << std::endl;
-
-    std::cout << std::left << std::setw(space) << "\"count\" , ";
-    std::cout << std::left << std::setw(space) << "\"count\" , ";
-    std::cout << std::left << std::setw(space) << "\"microsec\" , ";
-    std::cout << std::left << std::setw(space) << "\"microsec\" , ";
-    std::cout << std::left << std::setw(space) << "\"microsec\" , ";
-    std::cout << std::left << std::setw(space) << "\"microsec\" , ";
-    std::cout << std::left << std::setw(space) << "\"microsec\" , ";
-    std::cout << std::left << std::setw(space) << "\"microsec\"";
-
-    std::cout << std::endl;
-  }
-
-  for(int i = elem_count_beg ; i < elem_count_end ; i *= 2 )
-  {
-    const int iz = std::max( 1 , (int) cbrt( ((double) i) / 2.0 ) );
-    const int iy = iz + 1 ;
-    const int ix = 2 * iy ;
-    const int nelem = ix * iy * iz ;
-    const int nnode = ( ix + 1 ) * ( iy + 1 ) * ( iz + 1 );
-
-    mesh_type mesh =
-      fixture_type::create( proc_count , proc_rank , gang_count ,
-                            ix , iy , iz );
-
-    mesh.parallel_data_map.machine = machine ;
-
-    PerformanceData perf , best ;
-
-    for(int j = 0; j < runs; j++){
-
-     perf = run<scalar_type,fixture_type>(mesh,ix,iy,iz,steps,print_sample);
-
-     if( j == 0 ) {
-       best = perf ;
-     }
-     else {
-       best.best( perf );
-     }
-   }
-
-   if ( comm::rank( machine ) == 0 ) {
-     double time_per_element =
-       ( best.internal_force_time ) / ( nelem * perf.number_of_steps );
-     double time_per_node =
-       ( best.comm_time + best.central_diff ) / ( nnode * perf.number_of_steps );
-
-   std::cout << std::setw(space-3) << nelem << " , "
-             << std::setw(space-3) << nnode << " , "
-             << std::setw(space-3) << best.number_of_steps << " , "
-             << std::setw(space-3) << best.init_time * 1000000 << " , "
-             << std::setw(space-3)
-             << ( best.internal_force_time * 1000000 ) / best.number_of_steps << " , "
-             << std::setw(space-3)
-             << ( best.central_diff * 1000000 ) / best.number_of_steps << " , "
-             << std::setw(space-3)
-             << ( best.comm_time * 1000000 ) / best.number_of_steps << " , "
-             << std::setw(space-3) << time_per_element * 1000000 << " , "
-             << std::setw(space-3) << time_per_node * 1000000
-             << std::endl ;
-    }
-  }
-}
-
-
-} // namespace Explicit
-
-#endif /* #ifndef EXPLICIT_DRIVER_HPP */
--- a/lib/kokkos/example/multi_fem/ExplicitFunctors.hpp
+++ b/lib/kokkos/example/multi_fem/ExplicitFunctors.hpp
--- a/lib/kokkos/example/multi_fem/FEMesh.hpp
+++ b/lib/kokkos/example/multi_fem/FEMesh.hpp
@ -1,86 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-// 
-//                        Kokkos v. 2.0
-//              Copyright (2014) Sandia Corporation
-// 
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
-// the U.S. Government retains certain rights in this software.
-// 
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
-// ************************************************************************
-//@HEADER
-*/
-
-#ifndef KOKKOS_FEMESH_HPP
-#define KOKKOS_FEMESH_HPP
-
-#include <utility>
-#include <limits>
-#include <iostream>
-#include <sstream>
-#include <stdexcept>
-
-#include <Kokkos_Core.hpp>
-#include <Kokkos_StaticCrsGraph.hpp>
-
-#include <ParallelComm.hpp>
-#include <ParallelDataMap.hpp>
-
-namespace HybridFEM {
-
-//----------------------------------------------------------------------------
-/** \brief  Finite element mesh fixture for hybrid parallel performance tests.
- */
-template< typename CoordScalarType , unsigned ElemNodeCount , class Device >
-struct FEMesh {
-
-  typedef typename Device::size_type size_type ;
-
-  static const size_type element_node_count = ElemNodeCount ;
-
-  typedef Kokkos::View< CoordScalarType*[3] , Device >       node_coords_type ;
-  typedef Kokkos::View< size_type*[ElemNodeCount], Device >  elem_node_ids_type ;
-  typedef Kokkos::StaticCrsGraph< size_type[2] ,  Device >   node_elem_ids_type ;
-
-  node_coords_type         node_coords ;
-  elem_node_ids_type       elem_node_ids ;
-  node_elem_ids_type       node_elem_ids ;
-  Kokkos::ParallelDataMap  parallel_data_map ;
-};
-
-//----------------------------------------------------------------------------
-
-} /* namespace HybridFEM */
-
-#endif /* #ifndef KOKKOS_FEMESH_HPP */
-
--- a/lib/kokkos/example/multi_fem/HexElement.hpp
+++ b/lib/kokkos/example/multi_fem/HexElement.hpp
@ -1,268 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-// 
-//                        Kokkos v. 2.0
-//              Copyright (2014) Sandia Corporation
-// 
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
-// the U.S. Government retains certain rights in this software.
-// 
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
-// ************************************************************************
-//@HEADER
-*/
-
-#ifndef ELEMENTHEX_HPP
-#define ELEMENTHEX_HPP
-
-namespace HybridFEM {
-
-template< unsigned NodeCount >
-class HexElement_TensorData ;
-
-template< unsigned NodeCount , class Device >
-class HexElement_TensorEval ;
-
-//----------------------------------------------------------------------------
-/** \brief  Evaluate Hex element on interval [-1,1]^3 */
-template<>
-class HexElement_TensorData< 8 > {
-public:
-
-  static const unsigned element_node_count    = 8 ;
-  static const unsigned spatial_dimension     = 3 ;
-  static const unsigned integration_count_1d  = 2 ;
-  static const unsigned function_count_1d     = 2 ;
-
-  float values_1d [ function_count_1d ][ integration_count_1d ];
-  float derivs_1d [ function_count_1d ][ integration_count_1d ];
-  float weights_1d[ integration_count_1d ];
-
-  unsigned char eval_map[ element_node_count ][4] ;
-
-  static float eval_value_1d( const unsigned jf , const float x )
-  {
-    return 0 == jf ? 0.5 * ( 1.0 - x ) : (
-           1 == jf ? 0.5 * ( 1.0 + x ) : 0 );
-  }
-
-  static float eval_deriv_1d( const unsigned jf , const float )
-  {
-    return 0 == jf ? -0.5 : (
-           1 == jf ?  0.5 : 0 );
-  }
-
-  HexElement_TensorData()
-  {
-    const unsigned char tmp_map[ element_node_count ][ spatial_dimension ] =
-      { { 0 , 0 , 0 },
-        { 1 , 0 , 0 },
-        { 1 , 1 , 0 },
-        { 0 , 1 , 0 },
-        { 0 , 0 , 1 },
-        { 1 , 0 , 1 },
-        { 1 , 1 , 1 },
-        { 0 , 1 , 1 } };
-
-    weights_1d[0] = 1 ;
-    weights_1d[1] = 1 ;
-
-    const float points_1d[ integration_count_1d ] =
-      { -0.577350269 , 0.577350269 };
-
-    for ( unsigned i = 0 ; i < element_node_count ; ++i ) {
-      eval_map[i][0] = tmp_map[i][0];
-      eval_map[i][1] = tmp_map[i][1];
-      eval_map[i][2] = tmp_map[i][2];
-    }
-
-    for ( unsigned xp = 0 ; xp < integration_count_1d ; ++xp ) {
-    for ( unsigned xf = 0 ; xf < function_count_1d ; ++xf ) {
-      values_1d[xp][xf] = eval_value_1d( xf , points_1d[xp] );
-      derivs_1d[xp][xf] = eval_deriv_1d( xf , points_1d[xp] );
-    }}
-  }
-};
-
-//----------------------------------------------------------------------------
-
-template<>
-class HexElement_TensorData< 27 > {
-public:
-
-  static const unsigned element_node_count    = 27 ;
-  static const unsigned spatial_dimension     = 3 ;
-  static const unsigned integration_count_1d  = 3 ;
-  static const unsigned function_count_1d     = 3 ;
-
-  float values_1d [ function_count_1d ][ integration_count_1d ];
-  float derivs_1d [ function_count_1d ][ integration_count_1d ];
-  float weights_1d[ integration_count_1d ];
-
-  unsigned char eval_map[ element_node_count ][4] ;
-
-  // sizeof(EvaluateElementHex) = 111 bytes =
-  //   sizeof(float) * 9 +
-  //   sizeof(float) * 9 +
-  //   sizeof(float) * 3 +
-  //   sizeof(char) * 27 
-
-  static float eval_value_1d( const unsigned jf , const float p )
-  {
-    return 0 == jf ? 0.5 * p * ( p - 1 ) : (
-           1 == jf ? 1.0 - p * p : (
-           2 == jf ? 0.5 * p * ( p + 1 ) : 0 ));
-  }
-
-  static float eval_deriv_1d( const unsigned jf , const float p )
-  {
-    return 0 == jf ? p - 0.5 : (
-           1 == jf ? -2.0 * p : (
-           2 == jf ? p + 0.5 : 0 ));
-  }
-
-  HexElement_TensorData()
-  {
-    const unsigned char tmp_map[ element_node_count ][ spatial_dimension ] =
-      { { 0 , 0 , 0 },
-        { 2 , 0 , 0 },
-        { 2 , 2 , 0 },
-        { 0 , 2 , 0 },
-        { 0 , 0 , 2 },
-        { 2 , 0 , 2 },
-        { 2 , 2 , 2 },
-        { 0 , 2 , 2 },
-        { 1 , 0 , 0 },
-        { 2 , 1 , 0 },
-        { 1 , 2 , 0 },
-        { 0 , 1 , 0 },
-        { 0 , 0 , 1 },
-        { 2 , 0 , 1 },
-        { 2 , 2 , 1 },
-        { 0 , 2 , 1 },
-        { 1 , 0 , 2 },
-        { 2 , 1 , 2 },
-        { 1 , 2 , 2 },
-        { 0 , 1 , 2 },
-        { 1 , 1 , 1 },
-        { 1 , 1 , 0 },
-        { 1 , 1 , 2 },
-        { 0 , 1 , 1 },
-        { 2 , 1 , 1 },
-        { 1 , 0 , 1 },
-        { 1 , 2 , 1 } };
-
-    // Interval [-1,1]
-
-    weights_1d[0] = 0.555555556 ;
-    weights_1d[1] = 0.888888889 ;
-    weights_1d[2] = 0.555555556 ;
-
-    const float points_1d[3] = { -0.774596669 ,
-                                  0.000000000 ,
-                                  0.774596669 };
-
-    for ( unsigned i = 0 ; i < element_node_count ; ++i ) {
-      eval_map[i][0] = tmp_map[i][0];
-      eval_map[i][1] = tmp_map[i][1];
-      eval_map[i][2] = tmp_map[i][2];
-    }
-
-    for ( unsigned xp = 0 ; xp < integration_count_1d ; ++xp ) {
-    for ( unsigned xf = 0 ; xf < function_count_1d ; ++xf ) {
-      values_1d[xp][xf] = eval_value_1d( xf , points_1d[xp] );
-      derivs_1d[xp][xf] = eval_deriv_1d( xf , points_1d[xp] );
-    }}
-  }
-};
-
-//----------------------------------------------------------------------------
-
-template< unsigned NodeCount >
-class HexElement_Data {
-public:
-  static const unsigned spatial_dimension   = 3 ;
-  static const unsigned element_node_count  = NodeCount ;
-  static const unsigned integration_count   = NodeCount ;
-  static const unsigned function_count      = NodeCount ;
-
-  float weights[   integration_count ] ;
-  float values[    integration_count ][ function_count ];
-  float gradients[ integration_count ][ spatial_dimension ][ function_count ];
-
-  HexElement_Data()
-  {
-    HexElement_TensorData< NodeCount > tensor_data ;
-
-    for ( unsigned ip = 0 ; ip < integration_count ; ++ip ) {
-
-      const unsigned ipx = tensor_data.eval_map[ip][0] ;
-      const unsigned ipy = tensor_data.eval_map[ip][1] ;
-      const unsigned ipz = tensor_data.eval_map[ip][2] ;
-
-      weights[ip] = tensor_data.weights_1d[ ipx ] *
-                    tensor_data.weights_1d[ ipy ] *
-                    tensor_data.weights_1d[ ipz ] ;
-
-      for ( unsigned jf = 0 ; jf < function_count ; ++jf ) {
-
-        const unsigned jfx = tensor_data.eval_map[jf][0] ;
-        const unsigned jfy = tensor_data.eval_map[jf][1] ;
-        const unsigned jfz = tensor_data.eval_map[jf][2] ;
-
-        values[ip][jf] = tensor_data.values_1d[ ipx ][ jfx ] *
-                         tensor_data.values_1d[ ipy ][ jfy ] *
-                         tensor_data.values_1d[ ipz ][ jfz ] ;
-
-        gradients[ip][0][jf] = tensor_data.derivs_1d[ ipx ][ jfx ] *
-                               tensor_data.values_1d[ ipy ][ jfy ] *
-                               tensor_data.values_1d[ ipz ][ jfz ] ;
-
-        gradients[ip][1][jf] = tensor_data.values_1d[ ipx ][ jfx ] *
-                               tensor_data.derivs_1d[ ipy ][ jfy ] *
-                               tensor_data.values_1d[ ipz ][ jfz ] ;
-
-        gradients[ip][2][jf] = tensor_data.values_1d[ ipx ][ jfx ] *
-                               tensor_data.values_1d[ ipy ][ jfy ] *
-                               tensor_data.derivs_1d[ ipz ][ jfz ] ;
-      }
-    }
-  }
-};
-
-//----------------------------------------------------------------------------
-
-} /* namespace HybridFEM */
-
-#endif /* #ifndef ELEMENTHEX_HPP */
-
-
--- a/lib/kokkos/example/multi_fem/HexExplicitFunctions.hpp
+++ b/lib/kokkos/example/multi_fem/HexExplicitFunctions.hpp
@ -1,443 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-// 
-//                        Kokkos v. 2.0
-//              Copyright (2014) Sandia Corporation
-// 
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
-// the U.S. Government retains certain rights in this software.
-// 
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
-// ************************************************************************
-//@HEADER
-*/
-
-#ifndef KOKKOS_HEXEXPLICITFUNCTIONS_HPP
-#define KOKKOS_HEXEXPLICITFUNCTIONS_HPP
-
-#include <math.h>
-
-namespace Explicit {
-
-struct Hex8Functions
-{
-  static const unsigned SpatialDim    = 3 ;
-  static const unsigned ElemNodeCount = 8 ;
-
-  // Indices for full 3x3 tensor:
-
-  static const unsigned K_F_XX = 0 ;
-  static const unsigned K_F_YY = 1 ;
-  static const unsigned K_F_ZZ = 2 ;
-  static const unsigned K_F_XY = 3 ;
-  static const unsigned K_F_YZ = 4 ;
-  static const unsigned K_F_ZX = 5 ;
-  static const unsigned K_F_YX = 6 ;
-  static const unsigned K_F_ZY = 7 ;
-  static const unsigned K_F_XZ = 8 ;
-  static const unsigned K_F_SIZE = 9 ;
-
-  //  Indexes into a 3 by 3 symmetric tensor stored as a length 6 vector
-
-  static const unsigned K_S_XX = 0 ;
-  static const unsigned K_S_YY = 1 ;
-  static const unsigned K_S_ZZ = 2 ;
-  static const unsigned K_S_XY = 3 ;
-  static const unsigned K_S_YZ = 4 ;
-  static const unsigned K_S_ZX = 5 ;
-  static const unsigned K_S_YX = 3 ;
-  static const unsigned K_S_ZY = 4 ;
-  static const unsigned K_S_XZ = 5 ;
-  static const unsigned K_S_SIZE = 6 ;
-
-  //  Indexes into a 3 by 3 skew symmetric tensor stored as a length 3 vector
-
-  static const unsigned K_V_XY = 0 ;
-  static const unsigned K_V_YZ = 1 ;
-  static const unsigned K_V_ZX = 2 ;
-  static const unsigned K_V_SIZE = 3 ;
-
-  //--------------------------------------------------------------------------
-
-  template< typename ScalarA , typename ScalarB >
-  KOKKOS_INLINE_FUNCTION static
-  double dot8( const ScalarA * const a , const ScalarB * const b )
-  { return a[0] * b[0] + a[1] * b[1] + a[2] * b[2] + a[3] * b[3] +
-           a[4] * b[4] + a[5] * b[5] + a[6] * b[6] + a[7] * b[7] ; }
-
-  //--------------------------------------------------------------------------
-
-  template< class ScalarPrecise ,
-            class ScalarCompact >
-  KOKKOS_INLINE_FUNCTION static
-  void grad( const ScalarPrecise x[] ,
-             const ScalarPrecise z[] ,
-                   ScalarCompact grad_y[] )
-  {
-    const ScalarCompact R42=(x[3] - x[1]);
-    const ScalarCompact R52=(x[4] - x[1]);
-    const ScalarCompact R54=(x[4] - x[3]);
-
-    const ScalarCompact R63=(x[5] - x[2]);
-    const ScalarCompact R83=(x[7] - x[2]);
-    const ScalarCompact R86=(x[7] - x[5]);
-
-    const ScalarCompact R31=(x[2] - x[0]);
-    const ScalarCompact R61=(x[5] - x[0]);
-    const ScalarCompact R74=(x[6] - x[3]);
-
-    const ScalarCompact R72=(x[6] - x[1]);
-    const ScalarCompact R75=(x[6] - x[4]);
-    const ScalarCompact R81=(x[7] - x[0]);
-
-    const ScalarCompact t1=(R63 + R54);
-    const ScalarCompact t2=(R61 + R74);
-    const ScalarCompact t3=(R72 + R81);
-
-    const ScalarCompact t4 =(R86 + R42);
-    const ScalarCompact t5 =(R83 + R52);
-    const ScalarCompact t6 =(R75 + R31);
-
-    //  Calculate Y gradient from X and Z data
-
-    grad_y[0] = (z[1] *  t1) - (z[2] * R42) - (z[3] *  t5)  + (z[4] *  t4) + (z[5] * R52) - (z[7] * R54);
-    grad_y[1] = (z[2] *  t2) + (z[3] * R31) - (z[0] *  t1)  - (z[5] *  t6) + (z[6] * R63) - (z[4] * R61);
-    grad_y[2] = (z[3] *  t3) + (z[0] * R42) - (z[1] *  t2)  - (z[6] *  t4) + (z[7] * R74) - (z[5] * R72);
-    grad_y[3] = (z[0] *  t5) - (z[1] * R31) - (z[2] *  t3)  + (z[7] *  t6) + (z[4] * R81) - (z[6] * R83);
-    grad_y[4] = (z[5] *  t3) + (z[6] * R86) - (z[7] *  t2)  - (z[0] *  t4) - (z[3] * R81) + (z[1] * R61);
-    grad_y[5] = (z[6] *  t5) - (z[4] *  t3)  - (z[7] * R75) + (z[1] *  t6) - (z[0] * R52) + (z[2] * R72);
-    grad_y[6] = (z[7] *  t1) - (z[5] *  t5)  - (z[4] * R86) + (z[2] *  t4) - (z[1] * R63) + (z[3] * R83);
-    grad_y[7] = (z[4] *  t2) - (z[6] *  t1)  + (z[5] * R75) - (z[3] *  t6) - (z[2] * R74) + (z[0] * R54);
-  }
-
-  template< class ScalarPrecise ,
-            class ScalarCompact >
-  static KOKKOS_INLINE_FUNCTION
-  void grad( const ScalarPrecise x[] ,
-             const ScalarPrecise y[] ,
-             const ScalarPrecise z[] ,
-                   ScalarCompact grad_x[] ,
-                   ScalarCompact grad_y[] ,
-                   ScalarCompact grad_z[] )
-  {
-    grad( x , z , grad_y );
-    grad( z , y , grad_x );
-    grad( y , x , grad_z );
-  }
-
-  //--------------------------------------------------------------------------
-
-  template< class ScalarPrecise ,
-            class ScalarCompact >
-  KOKKOS_INLINE_FUNCTION static
-  void polar_decomp( const float dt ,
-                     const ScalarCompact v_gr[] ,
-                           ScalarPrecise stretch[] /* INOUT */ ,
-                           ScalarCompact str_ten[] /* OUT */ ,
-                           ScalarCompact rot[]     /* OUT */ )
-  {
-    const float dt_half = 0.5 * dt;
-
-    ScalarCompact vort[ K_V_SIZE ];  // Vorticity
-
-    //  Symmetric part
-    str_ten[K_S_XX] = v_gr[K_F_XX];
-    str_ten[K_S_YY] = v_gr[K_F_YY];
-    str_ten[K_S_ZZ] = v_gr[K_F_ZZ];
-    str_ten[K_S_XY] = 0.5 * ( v_gr[K_F_XY] + v_gr[K_F_YX] );
-    str_ten[K_S_YZ] = 0.5 * ( v_gr[K_F_YZ] + v_gr[K_F_ZY] );
-    str_ten[K_S_ZX] = 0.5 * ( v_gr[K_F_ZX] + v_gr[K_F_XZ] );
-
-    //  Skew Symmetric part
-    vort[K_V_XY] = 0.5 * ( v_gr[K_F_XY] - v_gr[K_F_YX] );
-    vort[K_V_YZ] = 0.5 * ( v_gr[K_F_YZ] - v_gr[K_F_ZY] );
-    vort[K_V_ZX] = 0.5 * ( v_gr[K_F_ZX] - v_gr[K_F_XZ] );
-
-    //   calculate the rates of rotation via gauss elimination.
-
-    ScalarCompact z1 = str_ten[K_S_XY] * stretch[K_S_ZX] -
-                       str_ten[K_S_ZX] * stretch[K_S_XY] +
-                       str_ten[K_S_YY] * stretch[K_S_YZ] -
-                       str_ten[K_S_YZ] * stretch[K_S_YY] +
-                       str_ten[K_S_YZ] * stretch[K_S_ZZ] -
-                       str_ten[K_S_ZZ] * stretch[K_S_YZ];
-
-    ScalarCompact z2 = str_ten[K_S_ZX] * stretch[K_S_XX] -
-                       str_ten[K_S_XX] * stretch[K_S_ZX] +
-                       str_ten[K_S_YZ] * stretch[K_S_XY] -
-                       str_ten[K_S_XY] * stretch[K_S_YZ] +
-                       str_ten[K_S_ZZ] * stretch[K_S_ZX] -
-                       str_ten[K_S_ZX] * stretch[K_S_ZZ];
-
-    ScalarCompact z3 = str_ten[K_S_XX] * stretch[K_S_XY] -
-                       str_ten[K_S_XY] * stretch[K_S_XX] +
-                       str_ten[K_S_XY] * stretch[K_S_YY] -
-                       str_ten[K_S_YY] * stretch[K_S_XY] +
-                       str_ten[K_S_ZX] * stretch[K_S_YZ] -
-                       str_ten[K_S_YZ] * stretch[K_S_ZX];
-
-    {
-      //   forward elimination
-
-      const ScalarCompact a1inv  = 1.0 / (stretch[K_S_YY] + stretch[K_S_ZZ]);
-      const ScalarCompact a4BYa1 = -1 * stretch[K_S_XY] * a1inv;
-      const ScalarCompact a2inv  = 1.0 / (stretch[K_S_ZZ] + stretch[K_S_XX] + stretch[K_S_XY] * a4BYa1);
-
-     const ScalarCompact a5 =  -stretch[K_S_YZ] + stretch[K_S_ZX] * a4BYa1;
-
-      z2 -= z1 * a4BYa1;
-      const ScalarCompact a6BYa1 = -1 * stretch[K_S_ZX] * a1inv;
-      const ScalarCompact a5BYa2 = a5 * a2inv;
-      z3 -= z1 * a6BYa1 - z2 * a5BYa2;
-
-      //   backward substitution -
-
-      z3 /= (stretch[K_S_XX] + stretch[K_S_YY] + stretch[K_S_ZX] * a6BYa1 + a5 * a5BYa2);
-      z2 = (z2 - a5 * z3) * a2inv;
-      z1 = (z1*a1inv - a6BYa1 * z3 -a4BYa1 * z2);
-    }
-
-    //   calculate rotation rates - recall that spin_rate is an asymmetric tensor,
-    //   so compute spin rate vector as dual of spin rate tensor,
-    //   i.e   w_i = e_ijk * spin_rate_jk
-
-    z1 += vort[K_V_YZ];
-    z2 += vort[K_V_ZX];
-    z3 += vort[K_V_XY];
-
-    {
-      //   update rotation tensor:
-      //  1) premultiply old rotation tensor to get right-hand side.
-
-      ScalarCompact r_XX = rot[K_F_XX] + dt_half*( z3 * rot[K_F_YX] - z2 * rot[K_F_ZX] );
-      ScalarCompact r_YX = rot[K_F_YX] + dt_half*( z1 * rot[K_F_ZX] - z3 * rot[K_F_XX] );
-      ScalarCompact r_ZX = rot[K_F_ZX] + dt_half*( z2 * rot[K_F_XX] - z1 * rot[K_F_YX] );
-      ScalarCompact r_XY = rot[K_F_XY] + dt_half*( z3 * rot[K_F_YY] - z2 * rot[K_F_ZY] );
-      ScalarCompact r_YY = rot[K_F_YY] + dt_half*( z1 * rot[K_F_ZY] - z3 * rot[K_F_XY] );
-      ScalarCompact r_ZY = rot[K_F_ZY] + dt_half*( z2 * rot[K_F_XY] - z1 * rot[K_F_YY] );
-      ScalarCompact r_XZ = rot[K_F_XZ] + dt_half*( z3 * rot[K_F_YZ] - z2 * rot[K_F_ZZ] );
-      ScalarCompact r_YZ = rot[K_F_YZ] + dt_half*( z1 * rot[K_F_ZZ] - z3 * rot[K_F_XZ] );
-      ScalarCompact r_ZZ = rot[K_F_ZZ] + dt_half*( z2 * rot[K_F_XZ] - z1 * rot[K_F_YZ] );
-
-
-      //  2) solve for new rotation tensor via gauss elimination.
-      //   forward elimination -
-
-      const ScalarCompact a12 = - dt_half * z3;
-      const ScalarCompact a13 =   dt_half * z2;
-            ScalarCompact b32 = - dt_half * z1;
-      const ScalarCompact a22inv = 1.0 / (1.0 + a12 * a12);
-
-      const ScalarCompact a13a12 = a13*a12;
-      const ScalarCompact a23 = b32 + a13a12;
-
-      r_YX += r_XX * a12;
-      r_YY += r_XY * a12;
-      r_YZ += r_XZ * a12;
-
-      b32 = (b32 - a13a12) * a22inv;
-
-      r_ZX += r_XX * a13 + r_YX * b32;
-      r_ZY += r_XY * a13 + r_YY * b32;
-      r_ZZ += r_XZ * a13 + r_YZ * b32;
-
-      //   backward substitution -
-
-      const ScalarCompact a33inv = 1.0 / (1.0 + a13 * a13 + a23 * b32);
-
-      rot[K_F_ZX] = r_ZX * a33inv;
-      rot[K_F_ZY] = r_ZY * a33inv;
-      rot[K_F_ZZ] = r_ZZ * a33inv;
-      rot[K_F_YX] = ( r_YX - rot[K_F_ZX] * a23 ) * a22inv;
-      rot[K_F_YY] = ( r_YY - rot[K_F_ZY] * a23 ) * a22inv;
-      rot[K_F_YZ] = ( r_YZ - rot[K_F_ZZ] * a23 ) * a22inv;
-      rot[K_F_XX] = r_XX - rot[K_F_ZX] * a13 - rot[K_F_YX] * a12;
-      rot[K_F_XY] = r_XY - rot[K_F_ZY] * a13 - rot[K_F_YY] * a12;
-      rot[K_F_XZ] = r_XZ - rot[K_F_ZZ] * a13 - rot[K_F_YZ] * a12;
-    }
-
-    //   update stretch tensor in the new configuration -
-
-    const ScalarCompact a1 = str_ten[K_S_XY] + vort[K_V_XY];
-    const ScalarCompact a2 = str_ten[K_S_YZ] + vort[K_V_YZ];
-    const ScalarCompact a3 = str_ten[K_S_ZX] + vort[K_V_ZX];
-    const ScalarCompact b1 = str_ten[K_S_ZX] - vort[K_V_ZX];
-    const ScalarCompact b2 = str_ten[K_S_XY] - vort[K_V_XY];
-    const ScalarCompact b3 = str_ten[K_S_YZ] - vort[K_V_YZ];
-
-    const ScalarCompact s_XX = stretch[K_S_XX];
-    const ScalarCompact s_YY = stretch[K_S_YY];
-    const ScalarCompact s_ZZ = stretch[K_S_ZZ];
-    const ScalarCompact s_XY = stretch[K_S_XY];
-    const ScalarCompact s_YZ = stretch[K_S_YZ];
-    const ScalarCompact s_ZX = stretch[K_S_ZX];
-
-    stretch[K_S_XX] += dt * (str_ten[K_S_XX] * s_XX + ( a1 + z3 ) * s_XY + ( b1 - z2 ) * s_ZX);
-    stretch[K_S_YY] += dt * (str_ten[K_S_YY] * s_YY + ( a2 + z1 ) * s_YZ + ( b2 - z3 ) * s_XY);
-    stretch[K_S_ZZ] += dt * (str_ten[K_S_ZZ] * s_ZZ + ( a3 + z2 ) * s_ZX + ( b3 - z1 ) * s_YZ);
-    stretch[K_S_XY] += dt * (str_ten[K_S_XX] * s_XY + ( a1 )      * s_YY + ( b1      ) * s_YZ - z3 * s_XX + z1 * s_ZX);
-    stretch[K_S_YZ] += dt * (str_ten[K_S_YY] * s_YZ + ( a2 )      * s_ZZ + ( b2      ) * s_ZX - z1 * s_YY + z2 * s_XY);
-    stretch[K_S_ZX] += dt * (str_ten[K_S_ZZ] * s_ZX + ( a3 )      * s_XX + ( b3      ) * s_XY - z2 * s_ZZ + z3 * s_YZ);
-  }
-
-  //--------------------------------------------------------------------------
-
-  template< typename ScalarCompact >
-  static KOKKOS_INLINE_FUNCTION
-  void rotate_tensor( const ScalarCompact str_ten[] ,
-                      const ScalarCompact rot[] ,
-                            ScalarCompact rot_str[] )
-  {
-    // Forward rotation:  rot_str = R^T * S * R
-    // with S(a,b) = str_ten[K_S_ab] and R(a,b) = rot[K_F_ab].
-    // Only the XX, YY, ZZ, XY, YZ and ZX entries of the result are written.
-
-    // Intermediate product S * R, one column at a time; sr_ab = (S*R)(a,b).
-    const ScalarCompact sr_XX = str_ten[K_S_XX]*rot[K_F_XX] + str_ten[K_S_XY]*rot[K_F_YX] + str_ten[K_S_XZ]*rot[K_F_ZX];
-    const ScalarCompact sr_YX = str_ten[K_S_YX]*rot[K_F_XX] + str_ten[K_S_YY]*rot[K_F_YX] + str_ten[K_S_YZ]*rot[K_F_ZX];
-    const ScalarCompact sr_ZX = str_ten[K_S_ZX]*rot[K_F_XX] + str_ten[K_S_ZY]*rot[K_F_YX] + str_ten[K_S_ZZ]*rot[K_F_ZX];
-
-    const ScalarCompact sr_XY = str_ten[K_S_XX]*rot[K_F_XY] + str_ten[K_S_XY]*rot[K_F_YY] + str_ten[K_S_XZ]*rot[K_F_ZY];
-    const ScalarCompact sr_YY = str_ten[K_S_YX]*rot[K_F_XY] + str_ten[K_S_YY]*rot[K_F_YY] + str_ten[K_S_YZ]*rot[K_F_ZY];
-    const ScalarCompact sr_ZY = str_ten[K_S_ZX]*rot[K_F_XY] + str_ten[K_S_ZY]*rot[K_F_YY] + str_ten[K_S_ZZ]*rot[K_F_ZY];
-
-    const ScalarCompact sr_XZ = str_ten[K_S_XX]*rot[K_F_XZ] + str_ten[K_S_XY]*rot[K_F_YZ] + str_ten[K_S_XZ]*rot[K_F_ZZ];
-    const ScalarCompact sr_YZ = str_ten[K_S_YX]*rot[K_F_XZ] + str_ten[K_S_YY]*rot[K_F_YZ] + str_ten[K_S_YZ]*rot[K_F_ZZ];
-    const ScalarCompact sr_ZZ = str_ten[K_S_ZX]*rot[K_F_XZ] + str_ten[K_S_ZY]*rot[K_F_YZ] + str_ten[K_S_ZZ]*rot[K_F_ZZ];
-
-    // Diagonal of R^T * (S*R): column a of R dotted with column a of S*R.
-    rot_str[ K_S_XX ] = rot[K_F_XX] * sr_XX + rot[K_F_YX] * sr_YX + rot[K_F_ZX] * sr_ZX;
-    rot_str[ K_S_YY ] = rot[K_F_XY] * sr_XY + rot[K_F_YY] * sr_YY + rot[K_F_ZY] * sr_ZY;
-    rot_str[ K_S_ZZ ] = rot[K_F_XZ] * sr_XZ + rot[K_F_YZ] * sr_YZ + rot[K_F_ZZ] * sr_ZZ;
-
-    // Off-diagonal entries XY, YZ and ZX.
-    rot_str[ K_S_XY ] = rot[K_F_XX] * sr_XY + rot[K_F_YX] * sr_YY + rot[K_F_ZX] * sr_ZY;
-    rot_str[ K_S_YZ ] = rot[K_F_XY] * sr_XZ + rot[K_F_YY] * sr_YZ + rot[K_F_ZY] * sr_ZZ;
-    rot_str[ K_S_ZX ] = rot[K_F_XZ] * sr_XX + rot[K_F_YZ] * sr_YX + rot[K_F_ZZ] * sr_ZX;
-  }
-
-  //--------------------------------------------------------------------------
-
-  template< class ScalarPrecise ,
-            class ScalarCompact >
-  static KOKKOS_INLINE_FUNCTION
-  void rotate_tensor_backward( const ScalarPrecise stress[] ,
-                               const ScalarCompact rot[] ,
-                                     ScalarCompact rot_stress[] )
-  {
-    // Backward rotation:  rot_stress = R * S * R^T
-    // with S(a,b) = stress[K_S_ab] and R(a,b) = rot[K_F_ab].
-    // Only the XX, YY, ZZ, XY, YZ and ZX entries of the result are written.
-
-    // Intermediate product S * R^T; srt_ab = (S*R^T)(a,b), stored in ScalarCompact.
-    const ScalarCompact srt_XX = stress[K_S_XX]*rot[K_F_XX]+ stress[K_S_XY]*rot[K_F_XY]+ stress[K_S_XZ]*rot[K_F_XZ];
-    const ScalarCompact srt_YX = stress[K_S_YX]*rot[K_F_XX]+ stress[K_S_YY]*rot[K_F_XY]+ stress[K_S_YZ]*rot[K_F_XZ];
-    const ScalarCompact srt_ZX = stress[K_S_ZX]*rot[K_F_XX]+ stress[K_S_ZY]*rot[K_F_XY]+ stress[K_S_ZZ]*rot[K_F_XZ];
-
-    const ScalarCompact srt_XY = stress[K_S_XX]*rot[K_F_YX]+ stress[K_S_XY]*rot[K_F_YY]+ stress[K_S_XZ]*rot[K_F_YZ];
-    const ScalarCompact srt_YY = stress[K_S_YX]*rot[K_F_YX]+ stress[K_S_YY]*rot[K_F_YY]+ stress[K_S_YZ]*rot[K_F_YZ];
-    const ScalarCompact srt_ZY = stress[K_S_ZX]*rot[K_F_YX]+ stress[K_S_ZY]*rot[K_F_YY]+ stress[K_S_ZZ]*rot[K_F_YZ];
-
-    const ScalarCompact srt_XZ = stress[K_S_XX]*rot[K_F_ZX]+ stress[K_S_XY]*rot[K_F_ZY]+ stress[K_S_XZ]*rot[K_F_ZZ];
-    const ScalarCompact srt_YZ = stress[K_S_YX]*rot[K_F_ZX]+ stress[K_S_YY]*rot[K_F_ZY]+ stress[K_S_YZ]*rot[K_F_ZZ];
-    const ScalarCompact srt_ZZ = stress[K_S_ZX]*rot[K_F_ZX]+ stress[K_S_ZY]*rot[K_F_ZY]+ stress[K_S_ZZ]*rot[K_F_ZZ];
-
-    // Diagonal of R * (S*R^T): row a of R dotted with column a of S*R^T.
-    rot_stress[ K_S_XX ] = rot[K_F_XX]*srt_XX + rot[K_F_XY]*srt_YX + rot[K_F_XZ]*srt_ZX;
-    rot_stress[ K_S_YY ] = rot[K_F_YX]*srt_XY + rot[K_F_YY]*srt_YY + rot[K_F_YZ]*srt_ZY;
-    rot_stress[ K_S_ZZ ] = rot[K_F_ZX]*srt_XZ + rot[K_F_ZY]*srt_YZ + rot[K_F_ZZ]*srt_ZZ;
-
-    // Off-diagonal entries XY, YZ and ZX.
-    rot_stress[ K_S_XY ] = rot[K_F_XX]*srt_XY + rot[K_F_XY]*srt_YY + rot[K_F_XZ]*srt_ZY;
-    rot_stress[ K_S_YZ ] = rot[K_F_YX]*srt_XZ + rot[K_F_YY]*srt_YZ + rot[K_F_YZ]*srt_ZZ;
-    rot_stress[ K_S_ZX ] = rot[K_F_ZX]*srt_XX + rot[K_F_ZY]*srt_YX + rot[K_F_ZZ]*srt_ZX;
-  }
-
-  //--------------------------------------------------------------------------
-
-  template< class ScalarPrecise ,
-            class ScalarCompact >
-  KOKKOS_INLINE_FUNCTION static
-  void update_stress( const float dt ,
-                      const float two_mu ,
-                      const float bulk_modulus ,
-                      const ScalarCompact rot_str[] ,
-                            ScalarPrecise stress[] )
-  {
-    // Increment 'stress' in place over the step 'dt' from the tensor 'rot_str'.
-    // Diagonal entries receive a deviatoric part scaled by two_mu plus a
-    // volumetric part scaled by bulk_modulus; off-diagonal entries receive
-    // only the two_mu part.
-
-    const ScalarCompact trace       = rot_str[ K_S_XX ] + rot_str[ K_S_YY ] + rot_str[ K_S_ZZ ] ;
-    const ScalarCompact bulk_term   = trace * bulk_modulus ;
-    const ScalarCompact third_trace = trace / 3.0 ;
-
-    // Diagonal (normal) components.
-    stress[K_S_XX] += dt * ( two_mu * ( rot_str[K_S_XX] - third_trace ) + bulk_term );
-    stress[K_S_YY] += dt * ( two_mu * ( rot_str[K_S_YY] - third_trace ) + bulk_term );
-    stress[K_S_ZZ] += dt * ( two_mu * ( rot_str[K_S_ZZ] - third_trace ) + bulk_term );
-
-    // Off-diagonal (shear) components.
-    stress[K_S_XY] += dt * two_mu * rot_str[K_S_XY];
-    stress[K_S_YZ] += dt * two_mu * rot_str[K_S_YZ];
-    stress[K_S_ZX] += dt * two_mu * rot_str[K_S_ZX];
-  }
-
-  //--------------------------------------------------------------------------
-
-  template< class ScalarPrecise ,
-            class ScalarCompact >
-  static KOKKOS_INLINE_FUNCTION
-  void comp_force( const ScalarPrecise vx[] ,
-                   const ScalarPrecise vy[] ,
-                   const ScalarPrecise vz[] ,
-                   const ScalarCompact grad_x[] ,
-                   const ScalarCompact grad_y[] ,
-                   const ScalarCompact grad_z[] ,
-                   const ScalarCompact total_stress12th[] ,
-                         ScalarCompact force[][ SpatialDim ] ,
-                         ScalarCompact & energy )
-  {
-    // For every element node: force = (stress tensor) * (gradient at node).
-    // 'energy' receives the sum over nodes of force . velocity, accumulated
-    // in ScalarPrecise and converted to ScalarCompact on output.
-
-    ScalarPrecise force_dot_velocity = 0 ;
-
-    for ( unsigned node = 0; node < ElemNodeCount ; ++node ) {
-
-      ScalarCompact * const f = force[node] ;
-
-      f[0] = total_stress12th[K_S_XX] * grad_x[node] +
-             total_stress12th[K_S_XY] * grad_y[node] +
-             total_stress12th[K_S_XZ] * grad_z[node] ;
-
-      f[1] = total_stress12th[K_S_YX] * grad_x[node] +
-             total_stress12th[K_S_YY] * grad_y[node] +
-             total_stress12th[K_S_YZ] * grad_z[node] ;
-
-      f[2] = total_stress12th[K_S_ZX] * grad_x[node] +
-             total_stress12th[K_S_ZY] * grad_y[node] +
-             total_stress12th[K_S_ZZ] * grad_z[node] ;
-
-      force_dot_velocity += f[0] * vx[node] +
-                            f[1] * vy[node] +
-                            f[2] * vz[node] ;
-    }
-
-    energy = force_dot_velocity ;
-  }
-
-  //--------------------------------------------------------------------------
-};
-
-} // namespace Explicit
-
-#endif /* #ifndef KOKKOS_HEXEXPLICITFUNCTIONS_HPP */
-
--- a/lib/kokkos/example/multi_fem/Implicit.hpp
+++ b/lib/kokkos/example/multi_fem/Implicit.hpp
@ -1,341 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-// 
-//                        Kokkos v. 2.0
-//              Copyright (2014) Sandia Corporation
-// 
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
-// the U.S. Government retains certain rights in this software.
-// 
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
-// ************************************************************************
-//@HEADER
-*/
-
-#ifndef HYBRIDFEM_IMPLICIT_HPP
-#define HYBRIDFEM_IMPLICIT_HPP
-
-#include <utility>
-#include <iostream>
-#include <iomanip>
-
-#include <Kokkos_Core.hpp>
-#include <SparseLinearSystem.hpp>
-#include <SparseLinearSystemFill.hpp>
-#include <ImplicitFunctors.hpp>
-#include <FEMesh.hpp>
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-namespace HybridFEM {
-namespace Implicit {
-
-// Wall-clock timings collected for the phases of one implicit solve.
-struct PerformanceData {
-  double mesh_time ;
-  double graph_time ;
-  double elem_time ;
-  double matrix_gather_fill_time ;
-  double matrix_boundary_condition_time ;
-  double cg_iteration_time ;
-
-  // All timings start at zero.
-  PerformanceData()
-    : mesh_time(0)
-    , graph_time(0)
-    , elem_time(0)
-    , matrix_gather_fill_time(0)
-    , matrix_boundary_condition_time(0)
-    , cg_iteration_time(0)
-    {}
-
-  // Keep, field by field, the smaller of this object's timing and rhs's.
-  void best( const PerformanceData & rhs )
-  {
-    if ( rhs.mesh_time < mesh_time ) { mesh_time = rhs.mesh_time ; }
-    if ( rhs.graph_time < graph_time ) { graph_time = rhs.graph_time ; }
-    if ( rhs.elem_time < elem_time ) { elem_time = rhs.elem_time ; }
-    if ( rhs.matrix_gather_fill_time < matrix_gather_fill_time ) {
-      matrix_gather_fill_time = rhs.matrix_gather_fill_time ;
-    }
-    if ( rhs.matrix_boundary_condition_time < matrix_boundary_condition_time ) {
-      matrix_boundary_condition_time = rhs.matrix_boundary_condition_time ;
-    }
-    if ( rhs.cg_iteration_time < cg_iteration_time ) { cg_iteration_time = rhs.cg_iteration_time ; }
-  }
-};
-
-//----------------------------------------------------------------------------
-
-// Build the sparse linear system for 'mesh', apply boundary conditions,
-// solve it with cgsolve, and return the time spent in each phase.
-// The first two integer arguments (global_max_x, global_max_y) are unused.
-// When 'print_sample' is true, the solution at owned nodes with x <= 0 and
-// y <= 0 is written to std::cout.
-template< typename Scalar , class FixtureType >
-PerformanceData run( const typename FixtureType::FEMeshType & mesh ,
-                     const int , // global_max_x ,
-                     const int , // global_max_y ,
-                     const int global_max_z ,
-                     const bool print_sample )
-{
-  //------------------------------------
-  // Fixture and mesh types:
-
-  typedef Scalar                                        scalar_type ;
-  typedef FixtureType                                   fixture_type ;
-  typedef typename fixture_type::execution_space        execution_space ;
-  typedef typename fixture_type::FEMeshType             mesh_type ;
-  typedef typename fixture_type::coordinate_scalar_type coordinate_scalar_type ;
-
-  enum { ElementNodeCount = fixture_type::element_node_count };
-
-  //------------------------------------
-  // Sparse linear system types:
-
-  typedef Kokkos::View< scalar_type* , execution_space >      vector_type ;
-  typedef Kokkos::CrsMatrix< scalar_type , execution_space >  matrix_type ;
-  typedef typename matrix_type::graph_type                    matrix_graph_type ;
-  typedef typename matrix_type::coefficients_type             matrix_coefficients_type ;
-
-  typedef GraphFactory< matrix_graph_type , mesh_type > graph_factory ;
-
-  //------------------------------------
-  // Problem setup types:
-
-  typedef ElementComputation< scalar_type , scalar_type , execution_space > ElementFunctor ;
-  typedef DirichletBoundary< scalar_type , scalar_type , execution_space >  BoundaryFunctor ;
-
-  typedef typename ElementFunctor::elem_matrices_type elem_matrices_type ;
-  typedef typename ElementFunctor::elem_vectors_type  elem_vectors_type ;
-
-  typedef GatherFill< matrix_type ,
-                      mesh_type ,
-                      elem_matrices_type ,
-                      elem_vectors_type > GatherFillFunctor ;
-
-  //------------------------------------
-  // Run parameters and result accumulators:
-
-  const comm::Machine machine = mesh.parallel_data_map.machine ;
-
-  const size_t num_elements = mesh.elem_node_ids.dimension_0();
-
-  const size_t max_cg_iterations = 200 ;
-  const double cg_tolerance      = 1e-14 ;
-
-  size_t cg_iterations = 0 ;
-  double cg_residual   = 0 ;
-
-  PerformanceData timings ;
-
-  const scalar_type coeff_K = 2 ;
-  const scalar_type load_Q  = 1 ;
-
-  matrix_type A ;
-  vector_type b ;
-  vector_type x_solution ;
-
-  typename graph_factory::element_map_type element_map ;
-
-  Kokkos::Impl::Timer timer ;
-
-  //------------------------------------
-  // Generate sparse matrix graph and element->graph map.
-  // Each phase is fenced before its time is read, and the reported time
-  // is the comm::max over the machine.
-
-  graph_factory::create( mesh , A.graph , element_map );
-
-  execution_space::fence();
-  timings.graph_time = comm::max( machine , timer.seconds() );
-
-  //------------------------------------
-  // Allocate linear system coefficients, right-hand side and solution:
-
-  const size_t owned_row_count = A.graph.row_map.dimension_0() - 1 ;
-
-  A.coefficients =
-    matrix_coefficients_type( "coeff" , A.graph.entries.dimension_0() );
-
-  b          = vector_type( "rhs" , owned_row_count );
-  x_solution = vector_type( "solution" , owned_row_count );
-
-  //------------------------------------
-  // Fill linear system; element arrays live only for this scope.
-  {
-    elem_matrices_type elem_matrices ;
-    elem_vectors_type  elem_vectors ;
-
-    // Leave the views unallocated when there are no elements.
-    if ( num_elements ) {
-      elem_matrices = elem_matrices_type( std::string("elem_matrices"), num_elements );
-      elem_vectors  = elem_vectors_type ( std::string("elem_vectors"), num_elements );
-    }
-
-    // Compute element matrices and vectors:
-
-    timer.reset();
-
-    ElementFunctor::apply( mesh , elem_matrices , elem_vectors , coeff_K , load_Q );
-
-    execution_space::fence();
-    timings.elem_time = comm::max( machine , timer.seconds() );
-
-    // Gather element contributions into the matrix and right-hand side:
-
-    timer.reset();
-
-    GatherFillFunctor::apply( A , b , mesh , element_map , elem_matrices , elem_vectors );
-
-    execution_space::fence();
-    timings.matrix_gather_fill_time = comm::max( machine , timer.seconds() );
-
-    // Apply boundary conditions:
-
-    timer.reset();
-
-    BoundaryFunctor::apply( A , b , mesh , 0 , global_max_z , 0 , global_max_z );
-
-    execution_space::fence();
-    timings.matrix_boundary_condition_time = comm::max( machine , timer.seconds() );
-  }
-
-  //------------------------------------
-  // Solve linear system
-
-  cgsolve( mesh.parallel_data_map ,
-           A , b , x_solution ,
-           cg_iterations , cg_residual ,
-           timings.cg_iteration_time ,
-           max_cg_iterations , cg_tolerance );
-
-  //------------------------------------
-  // Optionally print a sample of the solution from host copies.
-
-  if ( print_sample ) {
-
-    typename mesh_type::node_coords_type::HostMirror host_coords =
-      Kokkos::create_mirror( mesh.node_coords );
-
-    typename vector_type::HostMirror host_solution =
-      Kokkos::create_mirror( x_solution );
-
-    Kokkos::deep_copy( host_coords , mesh.node_coords );
-    Kokkos::deep_copy( host_solution , x_solution );
-
-    const size_t owned_count = mesh.parallel_data_map.count_owned ;
-
-    for ( size_t node = 0 ; node < owned_count ; ++node ) {
-      const coordinate_scalar_type x = host_coords(node,0);
-      const coordinate_scalar_type y = host_coords(node,1);
-      const coordinate_scalar_type z = host_coords(node,2);
-
-      if ( !( x <= 0 && y <= 0 ) ) continue ;
-
-      std::cout << "  node( " << x << " " << y << " " << z << " ) = "
-                << host_solution(node) << std::endl ;
-    }
-  }
-
-  return timings ;
-}
-
-//----------------------------------------------------------------------------
-
-// Benchmark driver: for problem sizes elem_count_beg, 2*elem_count_beg, ...
-// (while smaller than elem_count_end) build a box mesh, call run() 'runs'
-// times, and have rank 0 print the best (smallest) time of each phase in
-// milliseconds as one comma-separated row.
-//
-//   label           text printed in the table title
-//   machine         parallel machine handle stored into the mesh's data map
-//   gang_count      forwarded to the mesh fixture
-//   elem_count_beg  first requested element count (must be > 0)
-//   elem_count_end  exclusive upper bound on the requested count (must be > 0)
-//   runs            repetitions per size (must be > 0)
-//
-// Returns immediately, printing nothing, if any of the three counts is not
-// positive.
-template< typename Scalar , class Device >
-void driver( const char * const label ,
-             comm::Machine machine ,
-             const int gang_count ,
-             const int elem_count_beg ,
-             const int elem_count_end ,
-             const int runs )
-{
-  typedef Scalar              scalar_type ;
-  typedef Device              execution_space ;
-  typedef double              coordinate_scalar_type ;
-  typedef FixtureElementHex8  fixture_element_type ;
-
-  typedef BoxMeshFixture< coordinate_scalar_type ,
-                          execution_space ,
-                          fixture_element_type > fixture_type ;
-
-  typedef typename fixture_type::FEMeshType mesh_type ;
-
-  const size_t proc_count = comm::size( machine );
-  const size_t proc_rank  = comm::rank( machine );
-
-  // A negative starting size can never reach elem_count_end by doubling,
-  // so reject every non-positive count rather than only exact zeros.
-  if ( elem_count_beg <= 0 || elem_count_end <= 0 || runs <= 0 ) return ;
-
-  if ( proc_rank == 0 ) {
-    std::cout << std::endl ;
-    std::cout << "\"Kokkos::HybridFE::Implicit " << label << "\"" << std::endl;
-    std::cout << "\"Size\" ,  \"Graphing\" , \"Element\" , \"Fill\" ,   \"Boundary\" ,  \"CG-Iter\"" << std::endl
-              << "\"elems\" , \"millisec\" , \"millisec\" , \"millisec\" , \"millisec\" , \"millisec\"" << std::endl ;
-  }
-
-  for ( int i = elem_count_beg ; i < elem_count_end ; i *= 2 )
-  {
-    // Box dimensions derived from the requested size; 'n' is the element
-    // count actually reported in the table.
-    const int ix = std::max( 1 , (int) cbrt( ((double) i) / 2.0 ) );
-    const int iy = ix + 1 ;
-    const int iz = 2 * iy ;
-    const int n  = ix * iy * iz ;
-
-    mesh_type mesh =
-      fixture_type::create( proc_count , proc_rank , gang_count ,
-                            ix , iy , iz );
-
-    mesh.parallel_data_map.machine = machine ;
-
-    PerformanceData perf_data , perf_best ;
-
-    for ( int j = 0 ; j < runs ; j++ ) {
-
-      perf_data = run<scalar_type,fixture_type>(mesh,ix,iy,iz, false );
-
-      // The first run seeds the best-so-far; later runs can only lower it.
-      if ( j == 0 ) {
-        perf_best = perf_data ;
-      }
-      else {
-        perf_best.best( perf_data );
-      }
-    }
-
-    if ( proc_rank == 0 ) {
-
-      std::cout << std::setw(8) << n << " , "
-                << std::setw(10) << perf_best.graph_time * 1000 << " , "
-                << std::setw(10) << perf_best.elem_time * 1000 << " , "
-                << std::setw(10) << perf_best.matrix_gather_fill_time * 1000 << " , "
-                << std::setw(10) << perf_best.matrix_boundary_condition_time * 1000 << " , "
-                << std::setw(10) << perf_best.cg_iteration_time * 1000
-                << std::endl ;
-    }
-
-    // If i exceeds half of elem_count_end, doubling it would reach or pass
-    // the bound (ending the loop anyway) and could overflow a signed int.
-    // Leave here so 'i *= 2' only runs when 2*i <= elem_count_end.
-    if ( i > elem_count_end / 2 ) break ;
-  }
-}
-
-//----------------------------------------------------------------------------
-
-} /* namespace Implicit */
-} /* namespace HybridFEM */
-
-
-#endif /* #ifndef HYBRIDFEM_IMPLICIT_HPP */
-
--- a/lib/kokkos/example/multi_fem/ImplicitFunctors.hpp
+++ b/lib/kokkos/example/multi_fem/ImplicitFunctors.hpp
@ -1,585 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-// 
-//                        Kokkos v. 2.0
-//              Copyright (2014) Sandia Corporation
-// 
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
-// the U.S. Government retains certain rights in this software.
-// 
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
-// ************************************************************************
-//@HEADER
-*/
-
-#include <iostream>
-#include <fstream>
-#include <iomanip>
-#include <cstdlib>
-#include <cmath>
-
-namespace HybridFEM {
-namespace Implicit {
-
-//----------------------------------------------------------------------------
-
-// Gauss-Legendre quadrature rules on [-1,1] (Dim == 1) and their tensor
-// product on the cube [-1,1]^3 (Dim == 3).  N is the number of points per
-// direction.  'pts' holds the abscissae and 'wts' the matching weights.
-// Constants are written to full double precision so the rule is as exact
-// as the Scalar type allows.
-template< typename Scalar , unsigned Dim , unsigned N >
-struct TensorIntegration ;
-
-// One-point rule: midpoint with weight 2.
-template<typename Scalar >
-struct TensorIntegration<Scalar,1,1> {
-  Scalar pts[1] ;
-  Scalar wts[1] ;
-
-  TensorIntegration() { pts[0] = 0 ; wts[0] = 2 ; }
-};
-
-// Two-point rule: abscissae -/+ 1/sqrt(3), unit weights.
-template<typename Scalar >
-struct TensorIntegration<Scalar,1,2>
-{
-  Scalar pts[2] ;
-  Scalar wts[2] ;
-
-  TensorIntegration()
-  {
-    const Scalar x2 = 0.57735026918962576451 ; // 1 / sqrt(3)
-    pts[0] = -x2; wts[0] = 1.0;
-    pts[1] =  x2; wts[1] = 1.0;
-  }
-};
-
-// Three-point rule: abscissae -sqrt(3/5), 0, +sqrt(3/5); weights 5/9, 8/9, 5/9.
-template<typename Scalar >
-struct TensorIntegration<Scalar,1,3>
-{
-  Scalar pts[3] ;
-  Scalar wts[3] ;
-
-  TensorIntegration()
-  {
-    const Scalar x3 = 0.77459666924148337704 ; // sqrt(3/5)
-    const Scalar w1 = 0.55555555555555555556 ; // 5/9
-    const Scalar w2 = 0.88888888888888888889 ; // 8/9
-    pts[0] =  -x3 ;  wts[0] = w1 ;
-    pts[1] =    0 ;  wts[1] = w2 ;
-    pts[2] =   x3 ;  wts[2] = w1 ;
-  }
-};
-
-// Three-dimensional tensor-product rule built from the one-dimensional rule
-// of the same order.  Point n = i + Order * ( j + Order * k ) has coordinates
-// ( pts1d[i] , pts1d[j] , pts1d[k] ) and weight wts1d[i]*wts1d[j]*wts1d[k].
-template< typename Scalar , unsigned Order >
-struct TensorIntegration<Scalar,3,Order>
-{
-  static const unsigned N = Order * Order * Order ;
-
-  Scalar pts[N][3] ;
-  Scalar wts[N];
-
-  TensorIntegration()
-  {
-    TensorIntegration<Scalar,1,Order> oneD ;
-
-    unsigned n = 0 ;
-    for ( unsigned k = 0 ; k < Order ; ++k ) {
-    for ( unsigned j = 0 ; j < Order ; ++j ) {
-    for ( unsigned i = 0 ; i < Order ; ++i , ++n ) {
-      pts[n][0] = oneD.pts[i] ;
-      pts[n][1] = oneD.pts[j] ;
-      pts[n][2] = oneD.pts[k] ;
-      wts[n] = oneD.wts[i] * oneD.wts[j] * oneD.wts[k] ;
-    }}}
-  }
-};
-
-//----------------------------------------------------------------------------
-
-// Tabulates the eight shape functions of the form
-// (1/8) * (1 -/+ x) * (1 -/+ y) * (1 -/+ z), and their gradients, at every
-// point of the order-2 three-dimensional integration rule.
-//
-//   value[p][f]          function f at point p
-//   gradient[p][3*f + d] derivative of function f in direction d at point p
-//   weight[p]            integration weight of point p
-template< typename Scalar >
-struct ShapeFunctionEvaluation {
-
-  static const unsigned FunctionCount = 8 ;
-  static const unsigned SpatialDimension = 3 ;
-  static const unsigned IntegrationOrder = 2 ;
-
-  typedef TensorIntegration< Scalar , SpatialDimension , IntegrationOrder >
-    TensorIntegrationType ;
-
-  static const unsigned PointCount = TensorIntegrationType::N ;
-
-  Scalar value   [ PointCount ][ FunctionCount ] ;
-  Scalar gradient[ PointCount ][ FunctionCount * SpatialDimension ];
-  Scalar weight  [ PointCount ];
-
-  ShapeFunctionEvaluation()
-  {
-    const TensorIntegrationType quadrature ;
-
-    const Scalar ONE8TH = 0.125 ;
-
-    for ( unsigned ip = 0 ; ip < PointCount ; ++ip ) {
-
-      // "Minus" factors (1 - coordinate) ...
-      const Scalar u = 1.0 - quadrature.pts[ip][0];
-      const Scalar v = 1.0 - quadrature.pts[ip][1];
-      const Scalar w = 1.0 - quadrature.pts[ip][2];
-
-      // ... and "plus" factors (1 + coordinate).
-      const Scalar up1 = 1.0 + quadrature.pts[ip][0];
-      const Scalar vp1 = 1.0 + quadrature.pts[ip][1];
-      const Scalar wp1 = 1.0 + quadrature.pts[ip][2];
-
-      weight[ip] = quadrature.wts[ip] ;
-
-      Scalar * const phi  = value[ip] ;     // FunctionCount entries
-      Scalar * const dphi = gradient[ip] ;  // FunctionCount * 3 entries
-
-      // Values: functions 0-3 use the w factor, functions 4-7 use wp1.
-      phi[0] = ONE8TH *   u *   v *  w ;
-      phi[1] = ONE8TH * up1 *   v *  w ;
-      phi[2] = ONE8TH * up1 * vp1 *  w ;
-      phi[3] = ONE8TH *   u * vp1 *  w ;
-
-      phi[4] = ONE8TH *   u *   v *  wp1 ;
-      phi[5] = ONE8TH * up1 *   v *  wp1 ;
-      phi[6] = ONE8TH * up1 * vp1 *  wp1 ;
-      phi[7] = ONE8TH *   u * vp1 *  wp1 ;
-
-      // Gradients: the factor being differentiated is replaced by -1
-      // (for u, v, w) or by 1 (for up1, vp1, wp1).
-
-      // fn 0 = u * v * w
-      dphi[ 0] = ONE8TH * -1  *  v  *  w  ;
-      dphi[ 1] = ONE8TH *  u  * -1  *  w  ;
-      dphi[ 2] = ONE8TH *  u  *  v  * -1  ;
-
-      // fn 1 = up1 * v * w
-      dphi[ 3] = ONE8TH *  1  *  v  *  w  ;
-      dphi[ 4] = ONE8TH * up1 * -1  *  w  ;
-      dphi[ 5] = ONE8TH * up1 *  v  * -1  ;
-
-      // fn 2 = up1 * vp1 * w
-      dphi[ 6] = ONE8TH *  1  * vp1 *  w ;
-      dphi[ 7] = ONE8TH * up1 *  1  *  w ;
-      dphi[ 8] = ONE8TH * up1 * vp1 * -1 ;
-
-      // fn 3 = u * vp1 * w
-      dphi[ 9] = ONE8TH * -1 * vp1 *  w ;
-      dphi[10] = ONE8TH *  u *  1  *  w ;
-      dphi[11] = ONE8TH *  u * vp1 * -1 ;
-
-      // fn 4 = u * v * wp1
-      dphi[12] = ONE8TH * -1  *  v  * wp1 ;
-      dphi[13] = ONE8TH *  u  * -1  * wp1 ;
-      dphi[14] = ONE8TH *  u  *  v  *  1  ;
-
-      // fn 5 = up1 * v * wp1
-      dphi[15] = ONE8TH *  1  *  v  * wp1 ;
-      dphi[16] = ONE8TH * up1 * -1  * wp1 ;
-      dphi[17] = ONE8TH * up1 *  v  *  1  ;
-
-      // fn 6 = up1 * vp1 * wp1
-      dphi[18] = ONE8TH *  1  * vp1 * wp1 ;
-      dphi[19] = ONE8TH * up1 *  1  * wp1 ;
-      dphi[20] = ONE8TH * up1 * vp1 *  1 ;
-
-      // fn 7 = u * vp1 * wp1
-      dphi[21] = ONE8TH * -1 * vp1 * wp1 ;
-      dphi[22] = ONE8TH *  u *  1  * wp1 ;
-      dphi[23] = ONE8TH *  u * vp1 *  1 ;
-    }
-  }
-};
-
-//----------------------------------------------------------------------------
-
-template< typename ScalarType , typename ScalarCoordType , class DeviceType >
-struct ElementComputation
-{
-  typedef DeviceType     execution_space;
-  typedef ScalarType              scalar_type ;
-  typedef typename execution_space::size_type  size_type ;
-
-  static const size_type ElementNodeCount = 8 ;
-
-  typedef FEMesh< ScalarCoordType , ElementNodeCount , execution_space > mesh_type ;
-  typedef Kokkos::View< scalar_type[][ElementNodeCount][ElementNodeCount] , execution_space > elem_matrices_type ;
-  typedef Kokkos::View< scalar_type[][ElementNodeCount] , execution_space > elem_vectors_type ;
-
-  typedef ShapeFunctionEvaluation< scalar_type > shape_function_data ;
-
-  static const unsigned SpatialDim    = shape_function_data::SpatialDimension ;
-  static const unsigned FunctionCount = shape_function_data::FunctionCount ;
-
-private:
-
-  const shape_function_data               shape_eval ;
-  typename mesh_type::elem_node_ids_type  elem_node_ids ;
-  typename mesh_type::node_coords_type    node_coords ;
-  elem_matrices_type                      element_matrices ;
-  elem_vectors_type                       element_vectors ;
-  scalar_type                             coeff_K ;
-  scalar_type                             coeff_Q ;
-
-  // Private constructor: within this struct, instances are created by the
-  // static apply() entry point.  Copies the mesh's node-id and coordinate
-  // views, the output views, and the two coefficients into the functor;
-  // shape_eval is default-constructed.
-  ElementComputation( const mesh_type   & arg_mesh ,
-                      const elem_matrices_type  & arg_element_matrices , 
-                      const elem_vectors_type   & arg_element_vectors ,
-                      const scalar_type   arg_coeff_K ,
-                      const scalar_type   arg_coeff_Q )
-  : shape_eval()
-  , elem_node_ids( arg_mesh.elem_node_ids )
-  , node_coords(   arg_mesh.node_coords )
-  , element_matrices( arg_element_matrices )
-  , element_vectors( arg_element_vectors )
-  , coeff_K( arg_coeff_K )
-  , coeff_Q( arg_coeff_Q )
-  {}
-
-public:
-
-  // Build the functor for 'mesh' and launch it once per element, where the
-  // element count is the leading dimension of mesh.elem_node_ids.
-  static void apply( const mesh_type  & mesh ,
-                     const elem_matrices_type & elem_matrices ,
-                     const elem_vectors_type  & elem_vectors ,
-                     const scalar_type  elem_coeff_K ,
-                     const scalar_type  elem_coeff_Q )
-  {
-    ElementComputation functor( mesh ,
-                                elem_matrices ,
-                                elem_vectors ,
-                                elem_coeff_K ,
-                                elem_coeff_Q );
-
-    const size_t num_elements = mesh.elem_node_ids.dimension_0();
-
-    parallel_for( num_elements , functor );
-  }
-
-  //------------------------------------
-
-  static const unsigned FLOPS_jacobian =
-    FunctionCount * SpatialDim * SpatialDim * 2 ;
-
-  // Accumulate J[3*d + c] += sum over nodes of grad_vals[3*node + d] * coord_c(node),
-  // with coord_0 = x, coord_1 = y, coord_2 = z.
-  // J is added to, not overwritten, so the caller must zero it beforehand.
-  KOKKOS_INLINE_FUNCTION
-  void jacobian( const ScalarCoordType * x, 
-                 const ScalarCoordType * y, 
-                 const ScalarCoordType * z, 
-                 const scalar_type * grad_vals, 
-                 scalar_type * J) const
-  {
-    for( unsigned node = 0; node < ElementNodeCount ; ++node ) {
-
-      // Gradient entries of this node's shape function.
-      const scalar_type * const grad = grad_vals + node * SpatialDim ;
-      const scalar_type d0 = grad[0];
-      const scalar_type d1 = grad[1];
-      const scalar_type d2 = grad[2];
-
-      // Node coordinates, converted to scalar_type.
-      const scalar_type cx = x[node] ;
-      const scalar_type cy = y[node] ;
-      const scalar_type cz = z[node] ;
-
-      J[0] += d0 * cx ;
-      J[1] += d0 * cy ;
-      J[2] += d0 * cz ;
-
-      J[3] += d1 * cx ;
-      J[4] += d1 * cy ;
-      J[5] += d1 * cz ;
-
-      J[6] += d2 * cx ;
-      J[7] += d2 * cy ;
-      J[8] += d2 * cz ;
-    }
-  }
-
-  //------------------------------------
-
-  static const unsigned FLOPS_inverse_and_det = 46 ;
-
-  // Replace the 3x3 matrix stored in J (entry (r,c) at J[3*r + c]) by its
-  // inverse, computed from cofactors, and return the determinant of the
-  // original matrix.  A zero determinant is not checked for.
-  KOKKOS_INLINE_FUNCTION
-  scalar_type inverse_and_determinant3x3( scalar_type * const J ) const
-  {
-    // Copy all nine entries first because J is overwritten below.
-    const scalar_type m00 = J[0] , m01 = J[1] , m02 = J[2] ;
-    const scalar_type m10 = J[3] , m11 = J[4] , m12 = J[5] ;
-    const scalar_type m20 = J[6] , m21 = J[7] , m22 = J[8] ;
-
-    // 2x2 minors shared by the determinant and the first row of the inverse.
-    const scalar_type minor0 = m22*m11 - m21*m12;
-    const scalar_type minor1 = m22*m01 - m21*m02;
-    const scalar_type minor2 = m12*m01 - m11*m02;
-
-    // Expansion along the first column.
-    const scalar_type det = m00*minor0 - m10*minor1 + m20*minor2;
-    const scalar_type inv_det = 1.0/det;
-
-    J[0] =  minor0*inv_det;
-    J[1] = -minor1*inv_det;
-    J[2] =  minor2*inv_det;
-
-    J[3] = -(m22*m10 - m20*m12)*inv_det;
-    J[4] =  (m22*m00 - m20*m02)*inv_det;
-    J[5] = -(m12*m00 - m10*m02)*inv_det;
-
-    J[6] =  (m21*m10 - m20*m11)*inv_det;
-    J[7] = -(m21*m00 - m20*m01)*inv_det;
-    J[8] =  (m11*m00 - m10*m01)*inv_det;
-
-    return det ;
-  }
-
-  //------------------------------------
-
-  // C = transpose(A) * B, where A is 3x3 and B, C are 3xn.
-  // All three matrices are stored with contiguous columns, so column j of
-  // B and C occupies entries [3*j , 3*j + 2].
-  KOKKOS_INLINE_FUNCTION
-  void matTransMat3x3_X_3xn( const scalar_type * A, int n,
-                             const scalar_type * B,
-                             scalar_type * C ) const
-  {
-    for(int col=0; col<n; ++col) {
-      const scalar_type * const b = B + 3 * col ;
-      scalar_type       * const c = C + 3 * col ;
-
-      // Row r of transpose(A) is column r of A, i.e. A[3*r .. 3*r + 2].
-      c[0] = A[0]*b[0] + A[1]*b[1] + A[2]*b[2];
-      c[1] = A[3]*b[0] + A[4]*b[1] + A[5]*b[2];
-      c[2] = A[6]*b[0] + A[7]*b[1] + A[8]*b[2];
-    }
-  }
-  //------------------------------------
-
-  static const unsigned FLOPS_contributeDiffusionMatrix = FunctionCount * ( 3 * 5 + FunctionCount * 7 ) ;
-
-  // Add one integration point's contribution to the element matrix:
-  //   elem_mat[m][n] += weight * ( g_m . g_n )
-  // where g_f is the gradient of function f (three entries of grad_vals)
-  // multiplied by the rows of invJ.
-  KOKKOS_INLINE_FUNCTION
-  void contributeDiffusionMatrix(
-    const scalar_type weight ,
-    const scalar_type grad_vals[] ,
-    const scalar_type invJ[] ,
-    scalar_type elem_mat[][8] ) const
-  {
-    scalar_type gx[8], gy[8], gz[8];
-
-    // Transform every function's gradient with invJ.
-    for( unsigned f = 0; f < FunctionCount ; ++f ) {
-      const scalar_type * const ref = grad_vals + 3 * f ;
-      const scalar_type r0 = ref[0];
-      const scalar_type r1 = ref[1];
-      const scalar_type r2 = ref[2];
-
-      gx[f] = r0 * invJ[0] + r1 * invJ[1] + r2 * invJ[2];
-      gy[f] = r0 * invJ[3] + r1 * invJ[4] + r2 * invJ[5];
-      gz[f] = r0 * invJ[6] + r1 * invJ[7] + r2 * invJ[8];
-    }
-
-    // Accumulate the weighted dot product for every (row, column) pair.
-    for( unsigned row = 0; row < FunctionCount; row++) {
-      scalar_type * const out = elem_mat[row] ;
-
-      for( unsigned col = 0; col < FunctionCount; col++) {
-        out[col] += weight *
-          ((gx[row] * gx[col]) +
-           (gy[row] * gy[col]) +
-           (gz[row] * gz[col]));
-      }
-    }
-  }
-
-  //------------------------------------
-
-  // Flop count of contributeSourceVector: one multiply and one add per entry.
-  static const unsigned FLOPS_contributeSourceVector = FunctionCount * 2 ;
-
-  // Accumulates the source term: elem_vec[k] += psi[k] * term
-  // for each of the FunctionCount basis functions.
-  KOKKOS_INLINE_FUNCTION
-  void contributeSourceVector( const scalar_type term ,
-                               const scalar_type psi[] ,
-                               scalar_type elem_vec[] ) const
-  {
-    for ( unsigned k = 0 ; k < FunctionCount ; ++k ) {
-      elem_vec[k] += psi[k] * term ;
-    }
-  }
-
-
-  // Flop count of operator(): per integration point, the three scalar
-  // multiplies forming detJ_w, k_detJ_w and Q_detJ_w plus the helper costs.
-  static const unsigned FLOPS_operator =
-           shape_function_data::PointCount * ( 3
-             + FLOPS_jacobian
-             + FLOPS_inverse_and_det
-             + FLOPS_contributeDiffusionMatrix
-             + FLOPS_contributeSourceVector ) ;
-
-  // Computes the element vector and element matrix of element 'ielem' and
-  // stores them in element_vectors(ielem,:) and element_matrices(ielem,:,:).
-  KOKKOS_INLINE_FUNCTION
-  void operator()( int ielem )const {
-
-    // Local accumulators, written back to the views once at the end.
-    scalar_type elem_vec[8] = { 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 };
-    scalar_type elem_mat[8][8] =
-      { { 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 } ,
-        { 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 } ,
-        { 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 } ,
-        { 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 } ,
-        { 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 } ,
-        { 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 } ,
-        { 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 } ,
-        { 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 } };
-
-    ScalarCoordType x[8], y[8], z[8];
-
-    // Gather the coordinates of the element's eight nodes.
-    for ( int i = 0 ; i < 8 ; ++i ) {
-      const int node_index = elem_node_ids( ielem , i );
-      x[i] = node_coords( node_index , 0 );
-      y[i] = node_coords( node_index , 1 );
-      z[i] = node_coords( node_index , 2 );
-    }
-
-    // This loop could be parallelized; however,
-    // it would require additional per-thread temporaries
-    // of 'elem_vec' and 'elem_mat' which would
-    // consume more local memory and have to be reduced.
-
-    // Accumulate the contribution of every integration point.
-    for ( unsigned i = 0 ; i < shape_function_data::PointCount ; ++i ) {
-
-      scalar_type J[SpatialDim*SpatialDim] = { 0, 0, 0,  0, 0, 0,  0, 0, 0 };
-
-      jacobian( x, y, z, shape_eval.gradient[i] , J );
-
-      // Overwrite J with its inverse to save scratch memory space.
-      // The returned determinant is scaled by the point weight and then by
-      // the diffusion (coeff_K) and source (coeff_Q) coefficients.
-      const scalar_type detJ_w   = shape_eval.weight[i] * inverse_and_determinant3x3(J);
-      const scalar_type k_detJ_w = coeff_K * detJ_w ;
-      const scalar_type Q_detJ_w = coeff_Q * detJ_w ;
-
-      contributeDiffusionMatrix( k_detJ_w , shape_eval.gradient[i] , J , elem_mat );
-
-      contributeSourceVector( Q_detJ_w , shape_eval.value[i] , elem_vec );
-    }
-
-    // Write the accumulated element vector to the output view.
-    for( size_type i=0; i< ElementNodeCount ; ++i) {
-      element_vectors(ielem, i) = elem_vec[i] ;
-    }
-
-    // Write the accumulated element matrix to the output view.
-    for( size_type i = 0; i < ElementNodeCount ; i++){
-      for( size_type j = 0; j < ElementNodeCount ; j++){
-        element_matrices(ielem, i, j) = elem_mat[i][j] ;
-      }
-    }
-  }
-}; /* ElementComputation */
-
-//----------------------------------------------------------------------------
-
-// Functor that imposes Dirichlet boundary conditions on a sparse linear
-// system.  A node is a boundary node when its z coordinate is at or below
-// bc_lower_z, or at or above bc_upper_z.  Use the static apply() to run it
-// over every row of the matrix.
-template< typename ScalarType , typename ScalarCoordType , class DeviceType >
-struct DirichletBoundary
-{
-  typedef DeviceType     execution_space;
-  typedef typename execution_space::size_type  size_type ;
-
-  static const size_type ElementNodeCount = 8 ;
-
-  typedef Kokkos::CrsMatrix< ScalarType , execution_space >    matrix_type ;
-  typedef Kokkos::View< ScalarType[] , execution_space >  vector_type ;
-
-  typedef FEMesh< ScalarCoordType , ElementNodeCount , execution_space > mesh_type ;
-
-  typename mesh_type::node_coords_type node_coords ;
-  matrix_type     matrix ;          // system matrix, modified in place
-  vector_type     rhs ;             // load vector, modified in place
-  ScalarCoordType bc_lower_z ;      // nodes with z <= bc_lower_z are constrained
-  ScalarCoordType bc_upper_z ;      // nodes with z >= bc_upper_z are constrained
-  ScalarType      bc_lower_value ;  // value imposed on the lower boundary
-  ScalarType      bc_upper_value ;  // value imposed on the upper boundary
-
-  // Processes matrix row 'inode'.
-  KOKKOS_INLINE_FUNCTION
-  void operator()( size_type inode ) const
-  {
-    //  Apply a dirichlet boundary condition to row 'inode'.
-    //  To maintain the symmetry of the original
-    //  global stiffness matrix, zero out the columns
-    //  that correspond to boundary conditions, and
-    //  adjust the load vector accordingly
-
-    const size_type iBeg = matrix.graph.row_map[inode];
-    const size_type iEnd = matrix.graph.row_map[inode+1];
-
-    const ScalarCoordType z = node_coords(inode,2);
-    const bool bc_lower = z <= bc_lower_z ;
-    const bool bc_upper = bc_upper_z <= z ;
-
-    if ( bc_lower || bc_upper ) {
-      // The lower boundary takes precedence if a node satisfies both tests.
-      const ScalarType bc_value = bc_lower ? bc_lower_value
-                                           : bc_upper_value ;
-
-      rhs(inode) = bc_value ; //  set the rhs vector
-
-      //  zero each value on the row, and leave a one
-      //  on the diagonal
-
-      for( size_type i = iBeg ; i < iEnd ; i++) {
-        // Convert the column index to size_type, as the branch below does,
-        // instead of narrowing the row index to int.
-        const size_type cnode = matrix.graph.entries(i) ;
-
-        matrix.coefficients(i) = ( inode == cnode ) ? 1 : 0 ;
-      }
-    }
-    else {
-      //  Find any columns that are boundary conditions.
-      //  Clear them and adjust the load vector
-
-      for( size_type i = iBeg ; i < iEnd ; i++ ) {
-        const size_type cnode = matrix.graph.entries(i) ;
-
-        const ScalarCoordType zc = node_coords(cnode,2);
-        const bool c_bc_lower = zc <= bc_lower_z ;
-        const bool c_bc_upper = bc_upper_z <= zc ;
-
-        if ( c_bc_lower || c_bc_upper ) {
-
-          const ScalarType c_bc_value = c_bc_lower ? bc_lower_value
-                                                   : bc_upper_value ;
-
-          // Move the known boundary contribution to the right-hand side
-          // before clearing the coefficient.
-          rhs( inode ) -= c_bc_value * matrix.coefficients(i);
-
-          matrix.coefficients(i) = 0 ;
-        }
-      }
-    }
-  }
-
-
-  // Applies the boundary conditions to every row of 'linsys_matrix' and the
-  // matching entries of 'linsys_rhs', using the node coordinates of 'mesh'.
-  // Does nothing when the matrix has no rows.
-  static void apply( const matrix_type & linsys_matrix ,
-                     const vector_type & linsys_rhs ,
-                     const mesh_type   & mesh ,
-                     const ScalarCoordType  bc_lower_z ,
-                     const ScalarCoordType  bc_upper_z ,
-                     const ScalarType       bc_lower_value ,
-                     const ScalarType       bc_upper_value )
-  {
-    // The row map holds (row count + 1) offsets.  Guard the subtraction so
-    // an empty row map cannot wrap around to a huge unsigned row count.
-    const size_t row_map_length = linsys_matrix.graph.row_map.dimension_0();
-    if ( row_map_length < 2 ) { return ; }
-
-    const size_t row_count = row_map_length - 1 ;
-    DirichletBoundary op ;
-    op.node_coords    = mesh.node_coords ;
-    op.matrix         = linsys_matrix ;
-    op.rhs            = linsys_rhs ;
-    op.bc_lower_z     = bc_lower_z ;
-    op.bc_upper_z     = bc_upper_z ;
-    op.bc_lower_value = bc_lower_value ;
-    op.bc_upper_value = bc_upper_value ;
-    parallel_for( row_count , op );
-  }
-};
-
-//----------------------------------------------------------------------------
-
-} /* namespace Implicit */
-} /* namespace HybridFEM */
-
--- a/lib/kokkos/example/multi_fem/LinAlgBLAS.hpp
+++ b/lib/kokkos/example/multi_fem/LinAlgBLAS.hpp
@ -1,567 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-// 
-//                        Kokkos v. 2.0
-//              Copyright (2014) Sandia Corporation
-// 
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
-// the U.S. Government retains certain rights in this software.
-// 
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
-// ************************************************************************
-//@HEADER
-*/
-
-#ifndef USESCASES_LINALG_BLAS_HPP
-#define USESCASES_LINALG_BLAS_HPP
-
-#include <cmath>
-#include <utility>
-#include <ParallelComm.hpp>
-#include <Kokkos_Core.hpp>
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-namespace Kokkos {
-namespace Impl {
-
-// Forward declarations of the functors behind the free functions below;
-// each one is defined later in this header.
-
-// Two-vector inner product reduction.
-template< class Scalar , class Layout , class DeviceType > struct Dot ;
-
-// Single-vector inner product (sum of squares) reduction.
-template< class Scalar , class Layout , class DeviceType > struct Dot1 ;
-
-// In-place scaling of a vector by a scalar.
-template< typename ScalarA ,
-          typename ScalarY ,
-          class Layout , class Device >
-struct Scale ;
-
-// Assignment of a scalar to every entry of a vector.
-template< typename ScalarA ,
-          typename ScalarY ,
-          class Layout , class Device >
-struct Fill ;
-
-// y += alpha * x
-template< typename ScalarA ,
-          typename ScalarX ,
-          typename ScalarY ,
-          class Layout , class Device >
-struct AXPY ;
-
-// y = x + beta * y
-template< typename ScalarX ,
-          typename ScalarB ,
-          typename ScalarY ,
-          class Layout , class Device >
-struct XPBY ;
-
-// w = alpha * x + beta * y
-template< typename ScalarA ,
-          typename ScalarX ,
-          typename ScalarB ,
-          typename ScalarY ,
-          typename ScalarW ,
-          class Layout , class Device >
-struct WAXPBY ;
-
-}
-}
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-namespace Kokkos {
-
-//----------------------------------------------------------------------------
-
-#if defined( KOKKOS_HAVE_MPI )
-
-// Inner product of the first n entries of x and y, summed over all
-// processes of 'machine' (MPI build).
-template< typename ScalarX /* Allow mix of const and non-const */ ,
-          typename ScalarY /* Allow mix of const and non-const */ ,
-          class L , class D ,
-          class MX /* Allow any management type */ ,
-          class MY /* Allow any management type */ >
-inline
-double dot( const size_t n ,
-            const View< ScalarX * , L , D , MX > & x ,
-            const View< ScalarY * , L , D , MY > & y ,
-            comm::Machine machine )
-{
-  // This process' share of the sum; the functor reduces on construction.
-  double partial_sum = 0 ;
-  Impl::Dot< ScalarX , L , D >( n , x , y , partial_sum );
-
-  // Combine the per-process partial sums.
-  double total_sum = 0 ;
-  MPI_Allreduce( & partial_sum , & total_sum , 1 ,
-                 MPI_DOUBLE , MPI_SUM , machine.mpi_comm );
-
-  return total_sum ;
-}
-
-#else
-
-// Inner product of the first n entries of x and y (build without MPI).
-// The machine argument is accepted for interface parity and is unused.
-template< typename ScalarX /* Allow mix of const and non-const */ ,
-          typename ScalarY /* Allow mix of const and non-const */ ,
-          class L , class D ,
-          class MX /* Allow any management type */ ,
-          class MY /* Allow any management type */ >
-inline
-double dot( const size_t n ,
-            const View< ScalarX * , L , D , MX > & x ,
-            const View< ScalarY * , L , D , MY > & y ,
-            comm::Machine )
-{
-  // The local reduction is the whole answer; the functor reduces on construction.
-  double sum = 0 ;
-  Impl::Dot< ScalarX , L , D >( n , x , y , sum );
-  return sum ;
-}
-
-#endif
-
-//----------------------------------------------------------------------------
-
-#if defined( KOKKOS_HAVE_MPI )
-
-// Inner product of the first n entries of x with itself, summed over all
-// processes of 'machine' (MPI build).
-template< typename ScalarX /* Allow mix of const and non-const */ ,
-          class L , class D ,
-          class MX /* Allow any management type */ >
-inline
-double dot( const size_t n ,
-            const View< ScalarX * , L , D , MX > & x ,
-            comm::Machine machine )
-{
-  // This process' share of the sum; the functor reduces on construction.
-  double partial_sum = 0 ;
-  Impl::Dot1< ScalarX , L , D >( n , x , partial_sum );
-
-  // Combine the per-process partial sums.
-  double total_sum = 0 ;
-  MPI_Allreduce( & partial_sum , & total_sum , 1 ,
-                 MPI_DOUBLE , MPI_SUM , machine.mpi_comm );
-
-  return total_sum ;
-}
-
-#else
-
-// Inner product of the first n entries of x with itself (build without MPI).
-// The machine argument is accepted for interface parity and is unused.
-template< typename ScalarX /* Allow mix of const and non-const */ ,
-          class L , class D ,
-          class MX /* Allow any management type */ >
-inline
-double dot( const size_t n ,
-            const View< ScalarX * , L , D , MX > & x ,
-            comm::Machine )
-{
-  // The local reduction is the whole answer; the functor reduces on construction.
-  double sum = 0 ;
-  Impl::Dot1< ScalarX , L , D >( n , x , sum );
-  return sum ;
-}
-
-#endif
-
-//----------------------------------------------------------------------------
-
-// Euclidean norm of the first n entries of x: the square root of the
-// single-vector dot() taken over 'machine'.
-template< typename ScalarX /* Allow mix of const and non-const */ ,
-          class L , class D ,
-          class MX /* Allow any management type */ >
-inline
-double norm2( const size_t n ,
-              const View< ScalarX * , L , D , MX > & x ,
-              comm::Machine machine )
-{
-  return std::sqrt( dot( n , x , machine ) );
-}
-
-//----------------------------------------------------------------------------
-
-// x(i) *= alpha for the first n entries of x.
-// Constructing the Impl::Scale temporary runs the parallel loop.
-template< typename ScalarA ,
-          typename ScalarX ,
-          class L ,
-          class D ,
-          class MX >
-void scale( const size_t n ,
-            const ScalarA & alpha ,
-            const View< ScalarX * , L , D , MX > & x )
-{
-  Impl::Scale< ScalarA , ScalarX , L , D >( n , alpha , x );
-}
-
-// x(i) = alpha for the first n entries of x.
-// Constructing the Impl::Fill temporary runs the parallel loop.
-template< typename ScalarA ,
-          typename ScalarX ,
-          class L ,
-          class D ,
-          class MX >
-void fill( const size_t n ,
-           const ScalarA & alpha ,
-           const View< ScalarX * , L , D , MX > & x )
-{
-  Impl::Fill< ScalarA , ScalarX , L , D >( n , alpha , x );
-}
-
-//----------------------------------------------------------------------------
-
-// y(i) += alpha * x(i) for the first n entries.
-// Constructing the Impl::AXPY temporary runs the parallel loop.
-template< typename ScalarA ,
-          typename ScalarX ,
-          typename ScalarY ,
-          class L ,
-          class D ,
-          class MX ,
-          class MY >
-void axpy( const size_t n ,
-           const ScalarA & alpha ,
-           const View< ScalarX *, L , D , MX > & x ,
-           const View< ScalarY *, L , D , MY > & y )
-{
-  Impl::AXPY< ScalarA, ScalarX, ScalarY , L , D >( n, alpha, x, y );
-}
-
-//----------------------------------------------------------------------------
-
-// y(i) = x(i) + beta * y(i) for the first n entries.
-// Constructing the Impl::XPBY temporary runs the parallel loop.
-template< typename ScalarX ,
-          typename ScalarB ,
-          typename ScalarY ,
-          class L ,
-          class D ,
-          class MX ,
-          class MY >
-void xpby( const size_t n ,
-           const View< ScalarX *, L , D , MX > & x ,
-           const ScalarB & beta ,
-           const View< ScalarY *, L , D , MY > & y )
-{
-  Impl::XPBY< ScalarX, ScalarB, ScalarY , L , D >( n, x, beta, y );
-}
-
-//----------------------------------------------------------------------------
-// w = alpha * x + beta * y
-
-// w(i) = alpha * x(i) + beta * y(i) for the first n entries.
-// Constructing the Impl::WAXPBY temporary runs the parallel loop.
-template< typename ScalarA ,
-          typename ScalarX ,
-          typename ScalarB ,
-          typename ScalarY ,
-          typename ScalarW ,
-          class L , class D ,
-          class MX , class MY , class MW >
-void waxpby( const size_t n ,
-             const ScalarA & alpha ,
-             const View< ScalarX * , L , D , MX > & x ,
-             const ScalarB & beta ,
-             const View< ScalarY * , L , D , MY > & y ,
-             const View< ScalarW * , L , D , MW > & w )
-{
-  Impl::WAXPBY<ScalarA,ScalarX,ScalarB,ScalarY,ScalarW,L,D>
-    ( n , alpha , x , beta , y , w );
-}
-
-}
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-namespace Kokkos {
-namespace Impl {
-
-// Reduction functor accumulating x(i) * y(i).  Its constructor launches
-// parallel_reduce over n entries and passes 'result' to receive the sum.
-template< typename Scalar , class L , class D >
-struct Dot
-{
-private:
-
-  typedef View< const Scalar*, L, D, MemoryUnmanaged >  vector_const_type ;
-
-  const vector_const_type x ;
-  const vector_const_type y ;
-
-public:
-
-  typedef typename vector_const_type::execution_space  execution_space ; // Manycore device
-  typedef double      value_type ;  // Reduction value
-
-  // Initial value of every partial sum.
-  KOKKOS_INLINE_FUNCTION
-  static void init( value_type & update )
-  {
-    update = 0 ;
-  }
-
-  // Merge one partial sum into another.
-  KOKKOS_INLINE_FUNCTION
-  static void join( volatile value_type & update ,
-                    const volatile value_type & source )
-  {
-    update += source ;
-  }
-
-  // Contribution of entry i.
-  template< typename iType >
-  KOKKOS_INLINE_FUNCTION
-  void operator()( const iType & i , value_type & update ) const
-  {
-    update += x(i) * y(i);
-  }
-
-  // Wrap the two vectors as unmanaged const views, then reduce.
-  template< class ArgX , class ArgY >
-  inline
-  Dot( const size_t n , const ArgX & vec_x , const ArgY & vec_y , double & result )
-    : x( vec_x )
-    , y( vec_y )
-  {
-    parallel_reduce( n , *this , result );
-  }
-}; // Dot
-
-//----------------------------------------------------------------------------
-
-// Reduction functor accumulating x(i) * x(i).  Its constructor launches
-// parallel_reduce over n entries and passes 'result' to receive the sum.
-template< typename Scalar , class L , class D >
-struct Dot1
-{
-private:
-
-  typedef View< const Scalar*, L, D , MemoryUnmanaged >  vector_const_type ;
-
-  const vector_const_type x ;
-
-public:
-
-  typedef typename vector_const_type::execution_space  execution_space ; // Manycore device
-  typedef double      value_type ;  // Reduction value
-
-  // Initial value of every partial sum.
-  KOKKOS_INLINE_FUNCTION
-  static void init( value_type & update )
-  {
-    update = 0 ;
-  }
-
-  // Merge one partial sum into another.
-  KOKKOS_INLINE_FUNCTION
-  static void join( volatile value_type & update ,
-                    const volatile value_type & source )
-  {
-    update += source ;
-  }
-
-  // Contribution of entry i.
-  template< typename iType >
-  KOKKOS_INLINE_FUNCTION
-  void operator()( const iType & i , value_type & update ) const
-  {
-    update += x(i) * x(i) ;
-  }
-
-  // Wrap the vector as an unmanaged const view, then reduce.
-  template< class ArgX >
-  inline
-  Dot1( const size_t n , const ArgX & vec_x , double & result )
-    : x( vec_x )
-  {
-    parallel_reduce( n , *this , result );
-  }
-}; // Dot1
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-// Functor computing w(i) = alpha * x(i) + beta * y(i).
-// Its constructor launches parallel_for over n entries.
-template < typename ScalarA ,
-           typename ScalarX ,
-           typename ScalarB ,
-           typename ScalarY ,
-           typename ScalarW ,
-           class L , class D >
-struct WAXPBY
-{
-private:
-
-  typedef View<       ScalarW *, L , D , MemoryUnmanaged > ViewW ;
-  typedef View< const ScalarX *, L , D , MemoryUnmanaged > ViewX ;
-  typedef View< const ScalarY *, L , D , MemoryUnmanaged > ViewY ;
-
-  const ViewW    w ;
-  const ViewX    x ;
-  const ViewY    y ;
-  const ScalarA  alpha ;
-  const ScalarB  beta ;
-
-public:
-
-  typedef typename ViewW::execution_space  execution_space ;
-
-  // Capture the operands as unmanaged views, then run the loop.
-  template< class ArgX , class ArgY , class ArgW >
-  inline
-  WAXPBY( const size_t  n ,
-          const ScalarA & scale_x ,
-          const ArgX    & vec_x ,
-          const ScalarB & scale_y ,
-          const ArgY    & vec_y ,
-          const ArgW    & vec_w )
-    : w( vec_w )
-    , x( vec_x )
-    , y( vec_y )
-    , alpha( scale_x )
-    , beta( scale_y )
-  {
-    parallel_for( n , *this );
-  }
-
-  // Update of entry inode.
-  template< typename iType >
-  KOKKOS_INLINE_FUNCTION
-  void operator()( const iType inode ) const
-  {
-    w(inode) = alpha * x(inode) + beta * y(inode);
-  }
-}; // WAXPBY
-
-//----------------------------------------------------------------------------
-
-// Functor computing w(i) *= beta.
-// Its constructor launches parallel_for over n entries.
-template < typename ScalarB ,
-           typename ScalarW ,
-           class L , class D >
-struct Scale
-{
-private:
-
-  typedef View< ScalarW *, L , D , MemoryUnmanaged >  ViewW ;
-  const ViewW    w ;
-  const ScalarB  beta ;
-
-public:
-
-  typedef typename ViewW::execution_space  execution_space ;
-
-  // Capture the vector as an unmanaged view, then run the loop.
-  template< class ArgW >
-  inline
-  Scale( const size_t n , const ScalarB & factor , const ArgW & vec_w )
-    : w( vec_w )
-    , beta( factor )
-  {
-    parallel_for( n , *this );
-  }
-
-  // Update of entry i.
-  template< typename iType >
-  KOKKOS_INLINE_FUNCTION
-  void operator()( const iType & i ) const
-  {
-    w(i) *= beta ;
-  }
-};
-
-// Functor computing w(i) = beta.
-// Its constructor launches parallel_for over n entries.
-template < typename ScalarB ,
-           typename ScalarW ,
-           class L , class D >
-struct Fill
-{
-private:
-
-  typedef View< ScalarW *, L , D , MemoryUnmanaged >  ViewW ;
-  const ViewW    w ;
-  const ScalarB  beta ;
-
-public:
-
-  typedef typename ViewW::execution_space  execution_space ;
-
-  // Capture the vector as an unmanaged view, then run the loop.
-  template< class ArgW >
-  inline
-  Fill( const size_t n , const ScalarB & value , const ArgW & vec_w )
-    : w( vec_w )
-    , beta( value )
-  {
-    parallel_for( n , *this );
-  }
-
-  // Update of entry i.
-  template< typename iType >
-  KOKKOS_INLINE_FUNCTION
-  void operator()( const iType & i ) const
-  {
-    w(i) = beta ;
-  }
-};
-
-//----------------------------------------------------------------------------
-
-// Functor computing w(i) += alpha * x(i).
-// Its constructor launches parallel_for over n entries.
-template < typename ScalarA ,
-           typename ScalarX ,
-           typename ScalarW ,
-           class L , class D >
-struct AXPY
-{
-private:
-
-  typedef View<       ScalarW *, L , D , MemoryUnmanaged >  ViewW ;
-  typedef View< const ScalarX *, L , D , MemoryUnmanaged >  ViewX ;
-
-  const ViewW    w ;
-  const ViewX    x ;
-  const ScalarA  alpha ;
-
-public:
-
-  typedef typename ViewW::execution_space  execution_space ;
-
-  // Capture the operands as unmanaged views, then run the loop.
-  template< class ArgX , class ArgW >
-  inline
-  AXPY( const size_t  n ,
-        const ScalarA & scale_x ,
-        const ArgX    & vec_x ,
-        const ArgW    & vec_w )
-    : w( vec_w )
-    , x( vec_x )
-    , alpha( scale_x )
-  {
-    parallel_for( n , *this );
-  }
-
-  // Update of entry i.
-  template< typename iType >
-  KOKKOS_INLINE_FUNCTION
-  void operator()( const iType & i ) const
-  {
-    w(i) += alpha * x(i);
-  }
-}; // AXPY
-
-// Functor computing w(i) = x(i) + beta * w(i).
-// Its constructor launches parallel_for over n entries.
-template< typename ScalarX ,
-          typename ScalarB ,
-          typename ScalarW ,
-          class L , class D >
-struct XPBY
-{
-private:
-
-  typedef View<       ScalarW *, L , D , MemoryUnmanaged >  ViewW ;
-  typedef View< const ScalarX *, L , D , MemoryUnmanaged >  ViewX ;
-
-  const ViewW    w ;
-  const ViewX    x ;
-  const ScalarB  beta ;
-
-public:
-
-  typedef typename ViewW::execution_space  execution_space ;
-
-  // Capture the operands as unmanaged views, then run the loop.
-  template< class ArgX , class ArgW >
-  inline
-  XPBY( const size_t  n ,
-        const ArgX    & vec_x ,
-        const ScalarB & scale_w ,
-        const ArgW    & vec_w )
-    : w( vec_w )
-    , x( vec_x )
-    , beta( scale_w )
-  {
-    parallel_for( n , *this );
-  }
-
-  // Update of entry i.
-  template< typename iType >
-  KOKKOS_INLINE_FUNCTION
-  void operator()( const iType & i ) const
-  {
-    w(i) = x(i) + beta * w(i);
-  }
-}; // XPBY
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-} // namespace Impl
-} // namespace Kokkos
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-#endif /* #ifndef USESCASES_LINALG_BLAS_HPP */
-
-
--- a/lib/kokkos/example/multi_fem/Makefile
+++ b/lib/kokkos/example/multi_fem/Makefile
@ -1,53 +0,0 @@
-# Builds the multi_fem example against a Kokkos tree at KOKKOS_PATH.
-# The executable is named after this directory, with a .cuda suffix when
-# KOKKOS_DEVICES contains Cuda and a .host suffix otherwise.
-KOKKOS_PATH ?= ../..
-
-# Locate the sources relative to this Makefile so the build can be run
-# from another directory.  $(dir ...) keeps a trailing slash, so the
-# patterns below contain a doubled slash; it is used consistently.
-MAKEFILE_PATH := $(abspath $(lastword $(MAKEFILE_LIST)))
-SRC_DIR := $(dir $(MAKEFILE_PATH))
-
-SRC = $(wildcard $(SRC_DIR)/*.cpp)
-OBJ = $(SRC:$(SRC_DIR)/%.cpp=%.o)
-
-#SRC = $(wildcard *.cpp)
-#OBJ = $(SRC:%.cpp=%.o)
-
-# These targets name actions, not files; without this declaration a file
-# called 'build' or 'clean' in the build directory would disable them.
-.PHONY: default build clean
-
-# The recipe runs after the 'build' prerequisite has completed.
-default: build
-	echo "Start Build"
-
-# use installed Makefile.kokkos
-include $(KOKKOS_PATH)/Makefile.kokkos
-
-ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES)))
-CXX = $(NVCC_WRAPPER)
-CXXFLAGS = -I$(SRC_DIR) -I$(CUDA_PATH) -O3
-LINK = $(CXX)
-LINKFLAGS = -L$(CUDA_PATH)/lib64 -lcusparse
-EXE = $(addsuffix .cuda, $(shell basename $(SRC_DIR)))
-#KOKKOS_DEVICES = "Cuda,OpenMP"
-#KOKKOS_ARCH = "SNB,Kepler35"
-else
-CXX = g++
-CXXFLAGS = -I$(SRC_DIR) -O3
-LINK = $(CXX)
-LINKFLAGS =
-EXE = $(addsuffix .host, $(shell basename $(SRC_DIR)))
-#KOKKOS_DEVICES = "OpenMP"
-#KOKKOS_ARCH = "SNB"
-endif
-
-DEPFLAGS = -M
-
-LIB =
-
-
-build: $(EXE)
-
-# Link step; relinks when the objects or the Kokkos link inputs change.
-$(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS)
-	$(LINK) $(KOKKOS_LDFLAGS) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE)
-
-clean:
-	rm -f *.a *.o *.cuda *.host
-
-# Compilation rules
-
-%.o:$(SRC_DIR)/%.cpp $(KOKKOS_CPP_DEPENDS)
-	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $<
-
-
--- a/lib/kokkos/example/multi_fem/Nonlinear.hpp
+++ b/lib/kokkos/example/multi_fem/Nonlinear.hpp
@ -1,573 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-// 
-//                        Kokkos v. 2.0
-//              Copyright (2014) Sandia Corporation
-// 
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
-// the U.S. Government retains certain rights in this software.
-// 
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
-// ************************************************************************
-//@HEADER
-*/
-
-#ifndef HYBRIDFEM_NONLINEAR_HPP
-#define HYBRIDFEM_NONLINEAR_HPP
-
-#include <algorithm>
-#include <cmath>
-#include <iomanip>
-#include <iostream>
-#include <utility>
-
-#include <Kokkos_Core.hpp>
-#include <SparseLinearSystem.hpp>
-#include <SparseLinearSystemFill.hpp>
-#include <NonlinearFunctors.hpp>
-
-#include <FEMesh.hpp>
-#include <HexElement.hpp>
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-namespace HybridFEM {
-namespace Nonlinear {
-
-struct PerformanceData {
-  double mesh_time ;
-  double graph_time ;
-  double elem_time ;
-  double matrix_gather_fill_time ;
-  double matrix_boundary_condition_time ;
-  double cg_iteration_time ;
-  size_t cg_iteration_count ;
-  size_t newton_iteration_count ;
-  double error_max ;
-
-  PerformanceData()
-    : mesh_time(0)
-    , graph_time(0)
-    , elem_time(0)
-    , matrix_gather_fill_time(0)
-    , matrix_boundary_condition_time(0)
-    , cg_iteration_time(0)
-    , cg_iteration_count(0)
-    , newton_iteration_count(0)
-    , error_max(0)
-    {}
-
-  void best( const PerformanceData & rhs )
-  {
-    mesh_time = std::min( mesh_time , rhs.mesh_time );
-    graph_time = std::min( graph_time , rhs.graph_time );
-    elem_time = std::min( elem_time , rhs.elem_time );
-    matrix_gather_fill_time = std::min( matrix_gather_fill_time , rhs.matrix_gather_fill_time );
-    matrix_boundary_condition_time = std::min( matrix_boundary_condition_time , rhs.matrix_boundary_condition_time );
-    cg_iteration_time = std::min( cg_iteration_time , rhs.cg_iteration_time );
-    cg_iteration_count = std::min( cg_iteration_count , rhs.cg_iteration_count );
-    newton_iteration_count = std::min( newton_iteration_count , rhs.newton_iteration_count );
-    error_max = std::min( error_max , rhs.error_max );
-  }
-};
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-class ManufacturedSolution {
-public:
-
-  // Manufactured solution for one dimensional nonlinear PDE
-  //
-  //  -K T_zz + T^2 = 0 ; T(zmin) = T_zmin ; T(zmax) = T_zmax
-  //
-  //  Has an analytic solution of the form:
-  //
-  //    T(z) = ( a ( z - zmin ) + b )^(-2) where K = 1 / ( 6 a^2 )
-  //
-  //  Given T(zmin) and T(zmax) compute K for this analytic solution.
-  //
-  //  Two analytic solutions:
-  //
-  //    Solution with singularity:
-  //    , a( ( 1.0 / sqrt(T_zmax) + 1.0 / sqrt(T_zmin) ) / ( zmax - zmin ) )
-  //    , b( -1.0 / sqrt(T_zmin) )
-  //
-  //    Solution without singularity:
-  //    , a( ( 1.0 / sqrt(T_zmax) - 1.0 / sqrt(T_zmin) ) / ( zmax - zmin ) )
-  //    , b( 1.0 / sqrt(T_zmin) )
-  //
-  //  The solution without singularity is the one implemented below.
-
-  const double zmin ;    // lower end of the z interval
-  const double zmax ;    // upper end of the z interval
-  const double T_zmin ;  // boundary value T(zmin)
-  const double T_zmax ;  // boundary value T(zmax)
-  const double a ;       // slope coefficient of the analytic solution
-  const double b ;       // offset coefficient of the analytic solution
-  const double K ;       // diffusion coefficient, 1 / ( 6 a^2 )
-
-  // Derives a, b and K from the interval and the two boundary values.
-  // Both boundary values must be positive, zmax must differ from zmin, and
-  // the boundary values must differ from each other: equal values give
-  // a == 0 and therefore an infinite K.  None of this is checked here.
-  ManufacturedSolution( const double arg_zmin ,
-                        const double arg_zmax ,
-                        const double arg_T_zmin ,
-                        const double arg_T_zmax )
-    : zmin( arg_zmin )
-    , zmax( arg_zmax )
-    , T_zmin( arg_T_zmin )
-    , T_zmax( arg_T_zmax )
-    // a and b are computed from the constructor arguments rather than the
-    // members, so they do not depend on the member declaration order.
-    , a( ( 1.0 / std::sqrt(arg_T_zmax) - 1.0 / std::sqrt(arg_T_zmin) )
-         / ( arg_zmax - arg_zmin ) )
-    , b( 1.0 / std::sqrt(arg_T_zmin) )
-    // K reads the member 'a', which is declared, and so initialized, before K.
-    , K( 1.0 / ( 6.0 * a * a ) )
-    {}
-
-  // Evaluates the analytic solution T(z) = ( a ( z - zmin ) + b )^(-2).
-  double operator()( const double z ) const
-  {
-    const double tmp = a * ( z - zmin ) + b ;
-    return 1.0 / ( tmp * tmp );
-  }
-};
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-template< typename Scalar , class FixtureType >
-PerformanceData run( const typename FixtureType::FEMeshType & mesh ,
-                     const int , // global_max_x ,
-                     const int , // global_max_y ,
-                     const int global_max_z ,
-                     const bool print_error )
-{
-  typedef Scalar                              scalar_type ;
-  typedef FixtureType                         fixture_type ;
-  typedef typename fixture_type::execution_space  execution_space;
-  //typedef typename execution_space::size_type     size_type ; // unused
-
-  typedef typename fixture_type::FEMeshType mesh_type ;
-  typedef typename fixture_type::coordinate_scalar_type coordinate_scalar_type ;
-
-  enum { ElementNodeCount = fixture_type::element_node_count };
-
-  const comm::Machine machine = mesh.parallel_data_map.machine ;
-
-  const size_t element_count = mesh.elem_node_ids.dimension_0();
-
-  //------------------------------------
-  // The amount of nonlinearity is proportional to the ratio
-  // between T(zmax) and T(zmin).  For the manufactured solution
-  // 0 < T(zmin) and 0 < T(zmax)
-
-  const ManufacturedSolution
-    exact_solution( /* zmin */ 0 ,
-                    /* zmax */ global_max_z ,
-                    /* T(zmin) */ 1 ,
-                    /* T(zmax) */ 20 );
-
-  //-----------------------------------
-  // Convergence Criteria and perf data:
-
-  const size_t cg_iteration_limit = 200 ;
-  const double cg_tolerance = 1e-14 ;
-
-  const size_t newton_iteration_limit = 150 ;
-  const double newton_tolerance = 1e-14 ;
-
-  size_t cg_iteration_count_total = 0 ;
-  double cg_iteration_time = 0 ;
-
-  size_t newton_iteration_count = 0 ;
-  double residual_norm_init = 0 ;
-  double residual_norm = 0 ;
-
-  PerformanceData perf_data ;
-
-  //------------------------------------
-  // Sparse linear system types:
-
-  typedef Kokkos::View< scalar_type* , execution_space >     vector_type ;
-  typedef Kokkos::CrsMatrix< scalar_type , execution_space >  matrix_type ;
-  typedef typename matrix_type::graph_type                matrix_graph_type ;
-  typedef typename matrix_type::coefficients_type         matrix_coefficients_type ;
-
-  typedef GraphFactory< matrix_graph_type , mesh_type > graph_factory ;
-
-  //------------------------------------
-  // Problem setup types:
-
-  typedef ElementComputation < mesh_type , scalar_type > ElementFunctor ;
-  typedef DirichletSolution  < mesh_type , scalar_type > DirichletSolutionFunctor ;
-  typedef DirichletResidual  < mesh_type , scalar_type > DirichletResidualFunctor ;
-
-  typedef typename ElementFunctor::elem_matrices_type elem_matrices_type ;
-  typedef typename ElementFunctor::elem_vectors_type  elem_vectors_type ;
-
-  typedef GatherFill< matrix_type ,
-                      mesh_type ,
-                      elem_matrices_type ,
-                      elem_vectors_type > GatherFillFunctor ;
-
-  //------------------------------------
-
-  matrix_type jacobian ;
-  vector_type residual ;
-  vector_type delta ;
-  vector_type nodal_solution ;
-
-  typename graph_factory::element_map_type element_map ;
-
-  //------------------------------------
-  // Generate mesh and corresponding sparse matrix graph
-
-  Kokkos::Impl::Timer wall_clock ;
-
-  //------------------------------------
-  // Generate sparse matrix graph and element->graph map.
-
-  wall_clock.reset();
-
-  graph_factory::create( mesh , jacobian.graph , element_map );
-
-  execution_space::fence();
-
-  perf_data.graph_time = comm::max( machine , wall_clock.seconds() );
-
-  //------------------------------------
-  // Allocate linear system coefficients and rhs:
-
-  const size_t local_owned_length = jacobian.graph.row_map.dimension_0() - 1 ;
-  const size_t local_total_length = mesh.node_coords.dimension_0();
-
-  jacobian.coefficients =
-    matrix_coefficients_type( "jacobian_coeff" , jacobian.graph.entries.dimension_0() );
-
-  // Nonlinear residual for owned nodes:
-  residual = vector_type( "residual" , local_owned_length );
-
-  // Nonlinear solution for owned and ghosted nodes:
-  nodal_solution = vector_type( "solution" , local_total_length );
-
-  // Nonlinear solution update for owned nodes:
-  delta = vector_type( "delta" , local_owned_length );
-
-  //------------------------------------
-  // Allocation of arrays to fill the linear system
-
-  elem_matrices_type elem_matrices ; // Jacobian matrices
-  elem_vectors_type  elem_vectors ;  // Residual vectors
-
-  if ( element_count ) {
-    elem_matrices = elem_matrices_type( std::string("elem_matrices"), element_count );
-    elem_vectors = elem_vectors_type( std::string("elem_vectors"), element_count );
-  }
-
-  //------------------------------------
-  // For boundary condition set the correct values in the solution vector
-  //   The 'zmin' face is assigned to 'T_zmin'.
-  //   The 'zmax' face is assigned to 'T_zmax'.
-  //   The resulting solution is one dimensional along the 'Z' axis.
-
-  DirichletSolutionFunctor::apply( nodal_solution , mesh ,
-                                   exact_solution.zmin ,
-                                   exact_solution.zmax ,
-                                   exact_solution.T_zmin ,
-                                   exact_solution.T_zmax );
-
-  for(;;) { // Nonlinear loop
-
-#if defined( KOKKOS_HAVE_MPI )
-
-    { //------------------------------------
-      // Import off-processor nodal solution values
-      // for residual and jacobian computations
-
-      Kokkos::AsyncExchange< typename vector_type::value_type , execution_space ,
-                                  Kokkos::ParallelDataMap >
-        exchange( mesh.parallel_data_map , 1 );
-
-      Kokkos::PackArray< vector_type >
-        ::pack( exchange.buffer() ,
-                mesh.parallel_data_map.count_interior ,
-                mesh.parallel_data_map.count_send ,
-                nodal_solution );
-
-      exchange.setup();
-
-      exchange.send_receive();
-
-      Kokkos::UnpackArray< vector_type >
-        ::unpack( nodal_solution , exchange.buffer() ,
-                  mesh.parallel_data_map.count_owned ,
-                  mesh.parallel_data_map.count_receive );
-    }
-
-#endif
-
-    //------------------------------------
-    // Compute element matrices and vectors:
-
-    wall_clock.reset();
-
-    ElementFunctor( mesh ,
-                    elem_matrices ,
-                    elem_vectors ,
-                    nodal_solution ,
-                    exact_solution.K );
-
-    execution_space::fence();
-    perf_data.elem_time += comm::max( machine , wall_clock.seconds() );
-
-    //------------------------------------
-    // Fill linear system coefficients:
-
-    wall_clock.reset();
-
-    fill( jacobian.coefficients.dimension_0(), 0 , jacobian.coefficients );
-    fill( residual.dimension_0() , 0 , residual );
-
-    GatherFillFunctor::apply( jacobian ,
-                              residual ,
-                              mesh ,
-                              element_map ,
-                              elem_matrices ,
-                              elem_vectors );
-
-    execution_space::fence();
-    perf_data.matrix_gather_fill_time += comm::max( machine , wall_clock.seconds() );
-
-    // Apply boundary conditions:
-
-    wall_clock.reset();
-
-    // Updates jacobian matrix to 1 on the diagonal, zero elsewhere,
-    // and 0 in the residual due to the solution vector having the correct value
-    DirichletResidualFunctor::apply( jacobian, residual, mesh ,
-                                     exact_solution.zmin ,
-                                     exact_solution.zmax );
-
-    execution_space::fence();
-    perf_data.matrix_boundary_condition_time +=
-      comm::max( machine , wall_clock.seconds() );
-
-    //------------------------------------
-    // Has the residual converged?
-
-    residual_norm = norm2( mesh.parallel_data_map.count_owned,
-                           residual,
-                           mesh.parallel_data_map.machine );
-
-    if ( 0 == newton_iteration_count ) {
-      residual_norm_init = residual_norm ;
-    }
-
-    if ( residual_norm / residual_norm_init < newton_tolerance ) {
-      break ;
-    }
-
-    //------------------------------------
-    // Solve linear system
-
-    size_t cg_iteration_count = 0 ;
-    double cg_residual_norm = 0 ;
-
-    cgsolve( mesh.parallel_data_map ,
-             jacobian , residual , delta ,
-             cg_iteration_count ,
-             cg_residual_norm ,
-             cg_iteration_time ,
-             cg_iteration_limit , cg_tolerance ) ;
-
-    perf_data.cg_iteration_time += cg_iteration_time ;
-    cg_iteration_count_total += cg_iteration_count ;
-
-    // Update non-linear solution with delta...
-    // delta is : - Dx = [Jacobian]^-1 * Residual which is the negative update
-    // LaTeX:
-    // \vec {x}_{n+1} = \vec {x}_{n} - ( - \Delta \vec{x}_{n} )
-    // text:
-    // x[n+1] = x[n] + Dx
-
-    axpy( mesh.parallel_data_map.count_owned ,
-          -1.0, delta, nodal_solution);
-
-    ++newton_iteration_count ;
-
-    if ( newton_iteration_limit < newton_iteration_count ) {
-      break ;
-    }
-  };
-
-  if ( newton_iteration_count ) {
-    perf_data.elem_time /= newton_iteration_count ;
-    perf_data.matrix_gather_fill_time /= newton_iteration_count ;
-    perf_data.matrix_boundary_condition_time /= newton_iteration_count ;
-  }
-
-  if ( cg_iteration_count_total ) {
-    perf_data.cg_iteration_time /= cg_iteration_count_total ;
-  }
-
-  perf_data.newton_iteration_count = newton_iteration_count ;
-  perf_data.cg_iteration_count = cg_iteration_count_total ;
-
-  //------------------------------------
-
-  {
-    // For extracting the nodal solution and its coordinates:
-
-    typename mesh_type::node_coords_type::HostMirror node_coords_host =
-      Kokkos::create_mirror( mesh.node_coords );
-
-    typename vector_type::HostMirror nodal_solution_host =
-      Kokkos::create_mirror( nodal_solution );
-
-    Kokkos::deep_copy( node_coords_host , mesh.node_coords );
-    Kokkos::deep_copy( nodal_solution_host , nodal_solution );
-
-    double tmp = 0 ;
-
-    for ( size_t i = 0 ; i < mesh.parallel_data_map.count_owned ; ++i ) {
-      const coordinate_scalar_type x = node_coords_host(i,0);
-      const coordinate_scalar_type y = node_coords_host(i,1);
-      const coordinate_scalar_type z = node_coords_host(i,2);
-
-      const double Tx = exact_solution(z);
-      const double Ts = nodal_solution_host(i);
-      const double Te = std::abs( Tx - Ts ) / std::abs( Tx );
-
-      tmp = std::max( tmp , Te );
-
-      if ( print_error && 0.02 < Te ) {
-        std::cout << "  node( " << x << " " << y << " " << z << " ) = "
-                  << Ts << " != exact_solution " << Tx
-                  << std::endl ;
-      }
-    }
-    perf_data.error_max = comm::max( machine , tmp );
-  }
-
-  return perf_data ;
-}
-
-//----------------------------------------------------------------------------
-
-// Benchmark driver for the nonlinear example.
-//
-// Starting at 'elem_count_beg' and doubling while the value stays below
-// 'elem_count_end', this builds a box mesh for each problem size, calls
-// run() 'runs' times on it, keeps the best timings via
-// PerformanceData::best(), and has rank 0 print one CSV-style row per size.
-//
-//   label           text appended to the table title printed by rank 0
-//   machine         communicator handle, stored into the mesh's data map
-//   gang_count      forwarded unchanged to fixture_type::create()
-//   elem_count_beg  first requested element count
-//   elem_count_end  exclusive upper bound on the requested element count
-//   runs            number of repetitions of run() per problem size
-//
-// Nothing is printed and nothing is run when any of the three counts is
-// not strictly positive.
-template< typename Scalar , class Device , class FixtureElement >
-void driver( const char * const label ,
-             comm::Machine machine ,
-             const int gang_count ,
-             const int elem_count_beg ,
-             const int elem_count_end ,
-             const int runs )
-{
-  typedef Scalar          scalar_type ;
-  typedef Device          execution_space ;
-  typedef double          coordinate_scalar_type ;
-  typedef FixtureElement  fixture_element_type ;
-
-  typedef BoxMeshFixture< coordinate_scalar_type ,
-                          execution_space ,
-                          fixture_element_type > fixture_type ;
-
-  typedef typename fixture_type::FEMeshType mesh_type ;
-
-  const size_t proc_count = comm::size( machine );
-  const size_t proc_rank  = comm::rank( machine );
-
-  // Reject non-positive counts, not just zero: a negative 'elem_count_beg'
-  // would be doubled until the signed loop counter overflows, and a negative
-  // 'runs' would report a PerformanceData that run() never filled in.
-  if ( elem_count_beg <= 0 || elem_count_end <= 0 || runs <= 0 ) return ;
-
-  if ( proc_rank == 0 ) {
-    std::cout << std::endl ;
-    std::cout << "\"Kokkos::HybridFE::Nonlinear " << label << "\"" << std::endl;
-    std::cout
-      << "\"Size\" ,  \"Size\" ,  \"Graphing\" , \"Element\" ,  \"Fill\" ,     \"Boundary\" , \"CG-Iter\" , \"CG-Iter\" ,      \"Newton-Iter\" , \"Max-node-error\""
-      << std::endl
-      << "\"elems\" , \"nodes\" , \"millisec\" , \"millisec\" , \"millisec\" , \"millisec\" , \"millisec\" , \"total-count\" , \"total-count\" , \"ratio\""
-      << std::endl ;
-  }
-
-  const bool print_sample = false ;
-  const double x_curve = 1.0 ;
-  const double y_curve = 1.0 ;
-  const double z_curve = 0.8 ;
-
-  // Double 'i' each pass.  The step is written so that '2 * i' is only
-  // evaluated when it is known to stay below 'elem_count_end'; otherwise the
-  // counter jumps straight to the bound, which ends the loop without ever
-  // overflowing 'int'.
-  for ( int i = elem_count_beg ; i < elem_count_end ;
-        i = ( i < elem_count_end - i ) ? 2 * i : elem_count_end )
-  {
-    // Derive box dimensions from the requested element count.
-    const int ix = std::max( 1 , (int) cbrt( ((double) i) / 2.0 ) );
-    const int iy = 1 + ix ;
-    const int iz = 2 * iy ;
-    const int global_elem_count = ix * iy * iz ;
-    const int global_node_count = ( 2 * ix + 1 ) *
-                                  ( 2 * iy + 1 ) *
-                                  ( 2 * iz + 1 );
-
-    mesh_type mesh =
-      fixture_type::create( proc_count , proc_rank , gang_count ,
-                            ix , iy , iz ,
-                            x_curve , y_curve , z_curve );
-
-    mesh.parallel_data_map.machine = machine ;
-
-
-    PerformanceData perf_data , perf_best ;
-
-    for(int j = 0; j < runs; j++){
-
-      perf_data = run<scalar_type,fixture_type>(mesh,ix,iy,iz, print_sample );
-
-      // The first run seeds the best-so-far record; later runs are merged in.
-      if( j == 0 ) {
-        perf_best = perf_data ;
-      }
-      else {
-        perf_best.best( perf_data );
-      }
-    }
-
-    if ( proc_rank == 0 ) {
-
-      // Timings are scaled by 1000 to match the "millisec" column headings.
-      std::cout << std::setw(8) << global_elem_count << " , "
-                << std::setw(8) << global_node_count << " , "
-                << std::setw(10) << perf_best.graph_time * 1000 << " , "
-                << std::setw(10) << perf_best.elem_time * 1000 << " , "
-                << std::setw(10) << perf_best.matrix_gather_fill_time * 1000 << " , "
-                << std::setw(10) << perf_best.matrix_boundary_condition_time * 1000 << " , "
-                << std::setw(10) << perf_best.cg_iteration_time * 1000 << " , "
-                << std::setw(7) << perf_best.cg_iteration_count << " , "
-                << std::setw(3) << perf_best.newton_iteration_count << " , "
-                << std::setw(10) << perf_best.error_max
-                << std::endl ;
-    }
-  }
-}
-
-//----------------------------------------------------------------------------
-
-} /* namespace Nonlinear */
-} /* namespace HybridFEM */
-
-
-#endif /* #ifndef HYBRIDFEM_IMPLICIT_HPP */
-
--- a/lib/kokkos/example/multi_fem/NonlinearElement_Cuda.hpp
+++ b/lib/kokkos/example/multi_fem/NonlinearElement_Cuda.hpp
@ -1,390 +0,0 @@
-//@HEADER
-// ************************************************************************
-// 
-//                        Kokkos v. 2.0
-//              Copyright (2014) Sandia Corporation
-// 
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
-// the U.S. Government retains certain rights in this software.
-// 
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
-// ************************************************************************
-//@HEADER
-
-#include <stdio.h>
-
-#include <iostream>
-#include <fstream>
-#include <iomanip>
-#include <cstdlib>
-#include <cmath>
-
-#include <Kokkos_Core.hpp>
-#include <HexElement.hpp>
-#include <FEMesh.hpp>
-
-namespace HybridFEM {
-namespace Nonlinear {
-
-template< class MeshType , typename ScalarType > struct ElementComputation ;
-
-//----------------------------------------------------------------------------
-
-// CUDA specialization of ElementComputation for a double-precision mesh with
-// 27 nodes per element.
-//
-// Constructing an instance copies the mesh and result views, builds a small
-// index table used for the Jacobian inverse, and immediately launches the
-// functor through Kokkos::Impl::CudaParallelLaunch.  Each thread block then
-// walks over elements and fills, per element, one row-block of
-// 'element_matrices' and 'element_vectors'.
-template<>
-struct ElementComputation< FEMesh< double , 27 , Kokkos::Cuda > , double >
-{
-  typedef Kokkos::Cuda    execution_space ;
-
-  static const unsigned ElementNodeCount = 27 ;
-
-  typedef HexElement_Data< ElementNodeCount >                element_data_type ;
-  typedef FEMesh< double , ElementNodeCount , execution_space >  mesh_type ;
-
-  static const unsigned SpatialDim       = element_data_type::spatial_dimension ;
-  static const unsigned FunctionCount    = element_data_type::function_count ;
-  static const unsigned IntegrationCount = element_data_type::integration_count ;
-  static const unsigned TensorDim        = SpatialDim * SpatialDim ;
-
-  // Per-element output matrices, per-element output vectors, and the nodal
-  // input values; the leading (runtime) dimension is indexed by element or node.
-  typedef Kokkos::View< double[][FunctionCount][FunctionCount] , execution_space > elem_matrices_type ;
-  typedef Kokkos::View< double[][FunctionCount] , execution_space > elem_vectors_type ;
-  typedef Kokkos::View< double[] , execution_space > value_vector_type ;
-
-private:
-
-  const element_data_type                       elem_data ;
-  const typename mesh_type::elem_node_ids_type  elem_node_ids ;
-  const typename mesh_type::node_coords_type    node_coords ;
-  const value_vector_type                       nodal_values ;
-  const elem_matrices_type                      element_matrices ;
-  const elem_vectors_type                       element_vectors ;
-  const float                                   coeff_K ;
-  const unsigned                                elem_count ;
-  // Row i holds the four Jacobian entries { a , b , c , d } combined as
-  // J[a]*J[b] - J[c]*J[d] to form entry i of the (unscaled) inverse.
-        unsigned                                invJacIndex[9][4] ;
-
-  // Flat indices of the 3x3 Jacobian entries, row-major.
-  static const unsigned j11 = 0 , j12 = 1 , j13 = 2 ,
-                        j21 = 3 , j22 = 4 , j23 = 5 ,
-                        j31 = 6 , j32 = 7 , j33 = 8 ;
-
-  // Can only handle up to 16 warps:
-  static const unsigned BlockDimX = 32 ;
-  static const unsigned BlockDimY = 7 ;
-
-  // Layout of the dynamic shared memory used by one thread block; the launch
-  // in the constructor requests sizeof(WorkSpace) bytes.
-  struct WorkSpace {
-    // Reduction scratch, one slot per thread; rows 0..3 are also reused as a
-    // staging area for gathered nodal data in evaluateFunctions().
-    double sum[ BlockDimY ][ BlockDimX ];
-
-    // Field value and gradient components at each integration point.
-    double  value_at_integ[ IntegrationCount ];
-    double  gradx_at_integ[ IntegrationCount ];
-    double  grady_at_integ[ IntegrationCount ];
-    double  gradz_at_integ[ IntegrationCount ];
-
-    // Per-threadIdx.y scratch for the 3x3 Jacobian and its inverse.
-    float  spaceJac[    BlockDimY ][ 9 ];
-    float  spaceInvJac[ BlockDimY ][ 9 ];
-
-    // Jacobian determinant times integration weight, per integration point.
-    float  detJweight[ IntegrationCount ];
-
-    // Transformed basis-function gradients, [function][integration point].
-    float  dpsidx[ FunctionCount ][ IntegrationCount ];
-    float  dpsidy[ FunctionCount ][ IntegrationCount ];
-    float  dpsidz[ FunctionCount ][ IntegrationCount ];
-  };
-
-public:
-
-  // Captures the mesh connectivity/coordinates, the nodal input values and
-  // the output views, then launches the computation.
-  //
-  //   arg_mesh              supplies elem_node_ids and node_coords
-  //   arg_element_matrices  output, written at ( ielem , iRow , iCol )
-  //   arg_element_vectors   output, written at ( ielem , iRow )
-  //   arg_nodal_values      input value per node
-  //   arg_coeff_K           scalar coefficient applied to the gradient terms
-  ElementComputation ( const mesh_type          & arg_mesh ,
-                       const elem_matrices_type & arg_element_matrices ,
-                       const elem_vectors_type  & arg_element_vectors ,
-                       const value_vector_type  & arg_nodal_values ,
-                       const float                arg_coeff_K )
-  : elem_data()
-  , elem_node_ids(    arg_mesh.elem_node_ids )
-  , node_coords(      arg_mesh.node_coords )
-  , nodal_values(     arg_nodal_values )
-  , element_matrices( arg_element_matrices )
-  , element_vectors(  arg_element_vectors )
-  , coeff_K(          arg_coeff_K )
-  , elem_count(       arg_mesh.elem_node_ids.dimension_0() )
-  {
-    // Index quadruples for the nine 2x2 sub-determinants; copied into the
-    // member table below so the device code can read them.
-    const unsigned jInvJ[9][4] = 
-     { { j22 , j33 , j23 , j32 } ,
-       { j13 , j32 , j12 , j33 } ,
-       { j12 , j23 , j13 , j22 } ,
-
-       { j23 , j31 , j21 , j33 } ,
-       { j11 , j33 , j13 , j31 } ,
-       { j13 , j21 , j11 , j23 } ,
-
-       { j21 , j32 , j22 , j31 } ,
-       { j12 , j31 , j11 , j32 } ,
-       { j11 , j22 , j12 , j21 } };
-
-    for ( unsigned i = 0 ; i < 9 ; ++i ) {
-    for ( unsigned j = 0 ; j < 4 ; ++j ) {
-      invJacIndex[i][j] = jInvJ[i][j] ;
-    }
-    }
-
-    // One block per element, capped at 65535 blocks; operator() strides by
-    // gridDim.x so elements beyond the cap are still visited.
-    // NOTE(review): an empty mesh yields grid_count == 0 -- confirm that
-    // CudaParallelLaunch tolerates a zero-sized grid.
-    const unsigned shmem = sizeof(WorkSpace);
-    const unsigned grid_max = 65535 ;
-    const unsigned grid_count = std::min( grid_max , elem_count );
-
-    // For compute capability 2.x up to 1024 threads per block
-    const dim3 block( BlockDimX , BlockDimY , 1 );
-    const dim3 grid( grid_count , 1 , 1 );
-
-    Kokkos::Impl::CudaParallelLaunch< ElementComputation >( *this , grid , block , shmem );
-  }
-
-public:
-
-  //------------------------------------
-  // Sum 'value' across threadIdx.x within one threadIdx.y row and have the
-  // thread with threadIdx.x == 0 store the total into 'result' (converted
-  // to 'Type'; callers pass both float and double destinations).
-  //
-  // Threads that do not call this function contribute whatever their slot
-  // already holds, which is why callers zero the scratch with sum_x_clear()
-  // beforehand.
-  //
-  // NOTE(review): there is no explicit synchronization between the steps
-  // below; the reduction appears to rely on the 32 threads of a row
-  // executing in lockstep together with the 'volatile' accesses -- confirm
-  // this holds on the targeted architectures.
-
-  template< typename Type >
-  __device__ inline static
-  void sum_x( Type & result , const double value )
-  {
-    extern __shared__ WorkSpace work_data[] ;
-
-    volatile double * const base_sum =
-      & work_data->sum[ threadIdx.y ][ threadIdx.x ] ;
-
-    base_sum[ 0] = value ;
-
-    // Fold the row in halves: offsets 16, 8, 4, 2, 1 relative to this slot.
-    if ( threadIdx.x < 16 ) {
-      base_sum[0] += base_sum[16];
-      base_sum[0] += base_sum[ 8];
-      base_sum[0] += base_sum[ 4];
-      base_sum[0] += base_sum[ 2];
-      base_sum[0] += base_sum[ 1];
-    }
-
-    if ( 0 == threadIdx.x ) {
-      result = base_sum[0] ;
-    }
-  }
-
-  // Zero the calling thread's slot of the reduction scratch.
-  __device__ inline static
-  void sum_x_clear()
-  {
-    extern __shared__ WorkSpace work_data[] ;
-
-    work_data->sum[ threadIdx.y ][ threadIdx.x ] = 0 ;
-  }
-
-  //------------------------------------
-  // For element 'ielem': gather its nodal coordinates and values, then for
-  // every integration point compute the Jacobian, its inverse, the
-  // determinant-weighted integration weight, the transformed basis
-  // gradients, and the field value/gradient.  Results are left in the
-  // shared WorkSpace for contributeResidualJacobian().
-  //------------------------------------
-
-  __device__ inline
-  void evaluateFunctions( const unsigned ielem ) const
-  {
-    extern __shared__ WorkSpace work_data[] ;
-
-    // Each warp (threadIdx.y) computes an integration point
-    // Each thread is responsible for a node / function.
-
-    const unsigned iFunc = threadIdx.x ;
-    const bool     hasFunc = iFunc < FunctionCount ;
-
-    //------------------------------------
-    // Gather x, y, z and the nodal value into rows 0..3 of the shared 'sum'
-    // scratch; the four rows are distributed over threadIdx.y.
-
-    if ( hasFunc ) {
-
-      const unsigned node = elem_node_ids( ielem , iFunc );
-
-      for ( unsigned iy = threadIdx.y ; iy < 4 ; iy += blockDim.y ) {
-      switch( iy ) {
-      case 0 : work_data->sum[0][iFunc] = node_coords(node,0); break ;
-      case 1 : work_data->sum[1][iFunc] = node_coords(node,1); break ;
-      case 2 : work_data->sum[2][iFunc] = node_coords(node,2); break ;
-      case 3 : work_data->sum[3][iFunc] = nodal_values(node); break ;
-      default: break ;
-      }
-      }
-    }
-
-    __syncthreads(); // Wait for all warps to finish gathering
-
-    // now get local 'const' copies in register space:
-    // (threads with iFunc >= FunctionCount also read here, but never use
-    //  the values because all later uses are guarded by 'hasFunc')
-
-    const double x       = work_data->sum[0][ iFunc ];
-    const double y       = work_data->sum[1][ iFunc ];
-    const double z       = work_data->sum[2][ iFunc ];
-    const double dof_val = work_data->sum[3][ iFunc ];
-
-    __syncthreads(); // Wait for all warps to finish extracting
-
-    sum_x_clear(); // Make sure summation scratch is zero
-
-    //------------------------------------
-    // Each warp is now on its own computing an integration point
-    // so no further explicit synchronizations are required.
-
-    if ( hasFunc ) {
-
-      float * const J    = work_data->spaceJac[    threadIdx.y ];
-      float * const invJ = work_data->spaceInvJac[ threadIdx.y ];
-
-      for ( unsigned iInt = threadIdx.y ;
-                     iInt < IntegrationCount ; iInt += blockDim.y ) {
-
-        const float val = elem_data.values[iInt][iFunc] ;
-        const float gx  = elem_data.gradients[iInt][0][iFunc] ;
-        const float gy  = elem_data.gradients[iInt][1][iFunc] ;
-        const float gz  = elem_data.gradients[iInt][2][iFunc] ;
-
-        // Jacobian entries: reference gradients contracted with the nodal
-        // coordinates, summed over the element's functions.
-        sum_x( J[j11], gx * x );
-        sum_x( J[j12], gx * y );
-        sum_x( J[j13], gx * z );
-
-        sum_x( J[j21], gy * x );
-        sum_x( J[j22], gy * y );
-        sum_x( J[j23], gy * z );
-
-        sum_x( J[j31], gz * x );
-        sum_x( J[j32], gz * y );
-        sum_x( J[j33], gz * z );
-
-        // Inverse jacobian, only enough parallel work for 9 threads in the warp
-
-        if ( iFunc < TensorDim ) {
-
-          invJ[ iFunc ] =
-            J[ invJacIndex[iFunc][0] ] * J[ invJacIndex[iFunc][1] ] -
-            J[ invJacIndex[iFunc][2] ] * J[ invJacIndex[iFunc][3] ] ;
-
-          // Each of these threads computes the determinant into a register
-          // NOTE(review): this reads invJ entries written by other threads
-          // just above, and the division below overwrites entries that
-          // other threads read here, all without a barrier -- presumably
-          // relying on lockstep execution; confirm.
-
-          const float detJ = J[j11] * invJ[j11] +
-                             J[j21] * invJ[j12] +
-                             J[j31] * invJ[j13] ;
-
-          invJ[ iFunc ] /= detJ ;
-
-          if ( 0 == iFunc ) {
-            work_data->detJweight[ iInt ] = detJ * elem_data.weights[ iInt ] ;
-          }
-        }
-
-        // Transform bases gradients and compute value and gradient
-
-        const float dx = gx * invJ[j11] + gy * invJ[j12] + gz * invJ[j13];
-        const float dy = gx * invJ[j21] + gy * invJ[j22] + gz * invJ[j23];
-        const float dz = gx * invJ[j31] + gy * invJ[j32] + gz * invJ[j33];
-
-        work_data->dpsidx[iFunc][iInt] = dx ;
-        work_data->dpsidy[iFunc][iInt] = dy ;
-        work_data->dpsidz[iFunc][iInt] = dz ;
-
-        // Field gradient and value at this integration point: nodal values
-        // weighted by the transformed gradients / basis values.
-        sum_x( work_data->gradx_at_integ[iInt] , dof_val * dx );
-        sum_x( work_data->grady_at_integ[iInt] , dof_val * dy );
-        sum_x( work_data->gradz_at_integ[iInt] , dof_val * dz );
-        sum_x( work_data->value_at_integ[iInt] , dof_val * val );
-      }
-    }
-
-    __syncthreads(); // All shared data must be populated at return.
-  }
-
-  // For element 'ielem': integrate the residual vector and Jacobian matrix
-  // from the quantities evaluateFunctions() left in shared memory.  Here
-  // threadIdx.x selects the integration point and threadIdx.y strides over
-  // matrix rows; sum_x() performs the sum over integration points.
-  __device__ inline
-  void contributeResidualJacobian( const unsigned ielem ) const
-  {
-    extern __shared__ WorkSpace work_data[] ;
-
-    sum_x_clear(); // Make sure summation scratch is zero
-
-    // $$ R_i = \int_{\Omega} \nabla \phi_i \cdot (k \nabla T) + \phi_i T^2 d \Omega $$ 
-    // $$ J_{i,j} = \frac{\partial R_i}{\partial T_j} = \int_{\Omega} k \nabla \phi_i \cdot \nabla \phi_j + 2 \phi_i \phi_j T d \Omega $$ 
-
-    const unsigned iInt = threadIdx.x ;
-
-    if ( iInt < IntegrationCount ) {
-
-      const double value_at_integ = work_data->value_at_integ[ iInt ] ;
-      const double gradx_at_integ = work_data->gradx_at_integ[ iInt ] ;
-      const double grady_at_integ = work_data->grady_at_integ[ iInt ] ;
-      const double gradz_at_integ = work_data->gradz_at_integ[ iInt ] ;
-
-      const float detJweight     = work_data->detJweight[ iInt ] ;
-      const float coeff_K_detJweight = coeff_K * detJweight ;
-
-      for ( unsigned iRow = threadIdx.y ;
-                     iRow < FunctionCount ; iRow += blockDim.y ) {
-
-        // Row-side factors, pre-multiplied by the integration weight (and
-        // by coeff_K for the gradient terms).
-        const float value_row  = elem_data.values[ iInt ][ iRow ] * detJweight ;
-        const float dpsidx_row = work_data->dpsidx[ iRow ][ iInt ] * coeff_K_detJweight ;
-        const float dpsidy_row = work_data->dpsidy[ iRow ][ iInt ] * coeff_K_detJweight ;
-        const float dpsidz_row = work_data->dpsidz[ iRow ][ iInt ] * coeff_K_detJweight ;
-
-        // Gradient (diffusion) part of the residual integrand.
-        const double res_del = dpsidx_row * gradx_at_integ +
-                               dpsidy_row * grady_at_integ +
-                               dpsidz_row * gradz_at_integ ;
-
-        // Quadratic (T^2) part of the residual and its derivative factor.
-        const double res_val = value_at_integ * value_at_integ * value_row ;
-        const double jac_val_row = 2 * value_at_integ * value_row ;
-
-        sum_x( element_vectors( ielem , iRow ) , res_del + res_val );
-
-        for ( unsigned iCol = 0 ; iCol < FunctionCount ; ++iCol ) {
-
-          const float jac_del = 
-            dpsidx_row * work_data->dpsidx[iCol][iInt] +
-            dpsidy_row * work_data->dpsidy[iCol][iInt] +
-            dpsidz_row * work_data->dpsidz[iCol][iInt] ;
-
-          const double jac_val =
-            jac_val_row * elem_data.values[ iInt ][ iCol ] ;
-
-          sum_x( element_matrices( ielem , iRow , iCol ) , jac_del + jac_val );
-        }
-      }
-    }
-
-    __syncthreads(); // All warps finish before refilling shared data 
-  }
-
-  // Kernel body: each block starts at element blockIdx.x and advances by
-  // gridDim.x until all 'elem_count' elements have been processed.
-  __device__ inline
-  void operator()(void) const
-  {
-    extern __shared__ WorkSpace work_data[] ;
-
-    for ( unsigned ielem = blockIdx.x ; ielem < elem_count ; ielem += gridDim.x ) {
-
-      evaluateFunctions( ielem );
-
-      contributeResidualJacobian( ielem );
-    }
-  }
-
-}; /* ElementComputation */
-
-} /* namespace Nonlinear */
-} /* namespace HybridFEM */
-
--- a/lib/kokkos/example/multi_fem/NonlinearFunctors.hpp
+++ b/lib/kokkos/example/multi_fem/NonlinearFunctors.hpp
@ -1,482 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-// 
-//                        Kokkos v. 2.0
-//              Copyright (2014) Sandia Corporation
-// 
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
-// the U.S. Government retains certain rights in this software.
-// 
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
-// ************************************************************************
-//@HEADER
-*/
-
-#ifndef KOKKOS_NONLINEARFUNCTORS_HPP
-#define KOKKOS_NONLINEARFUNCTORS_HPP
-
-#include <iostream>
-#include <fstream>
-#include <iomanip>
-#include <cstdlib>
-#include <cmath>
-
-namespace HybridFEM {
-namespace Nonlinear {
-
-template< class MeshType , typename ScalarType > struct ElementComputation ;
-template< class MeshType , typename ScalarType > struct DirichletSolution ;
-template< class MeshType , typename ScalarType > struct DirichletResidual ;
-
-}
-}
-
-/* A Cuda-specific specialization for the element computation functor. */
-#if defined( __CUDACC__ )
-#include <NonlinearElement_Cuda.hpp>
-#endif
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-namespace HybridFEM {
-namespace Nonlinear {
-
-/** \brief  Per-element residual and Jacobian functor for an FEMesh whose
- *          element data comes from HexElement_Data.
- *
- *  The constructor stores the mesh and output views and immediately
- *  dispatches the functor through parallel_for with one work item per
- *  element.  Work item 'ielem' overwrites element_vectors(ielem,:) and
- *  element_matrices(ielem,:,:).
- */
-template< typename ScalarCoordType , unsigned ElemNode , class DeviceType ,
-          typename ScalarType >
-struct ElementComputation<
-  FEMesh< ScalarCoordType , ElemNode , DeviceType > , ScalarType >
-{
-  typedef DeviceType  execution_space ;
-  typedef ScalarType  scalar_type ;
-
-  static const unsigned ElementNodeCount = ElemNode ;
-
-  typedef FEMesh< ScalarCoordType , ElementNodeCount , execution_space > mesh_type ;
-  typedef HexElement_Data< ElementNodeCount >                            element_data_type ;
-
-  static const unsigned SpatialDim       = element_data_type::spatial_dimension ;
-  static const unsigned FunctionCount    = element_data_type::function_count ;
-  static const unsigned IntegrationCount = element_data_type::integration_count ;
-  static const unsigned TensorDim        = SpatialDim * SpatialDim ;
-
-  typedef Kokkos::View< scalar_type[][FunctionCount][FunctionCount] , execution_space > elem_matrices_type ;
-  typedef Kokkos::View< scalar_type[][FunctionCount] , execution_space >                elem_vectors_type ;
-  typedef Kokkos::View< scalar_type[] , execution_space >                               value_vector_type ;
-
-private:
-
-  // Declaration order below is also the initialization order used by
-  // the constructor.
-  const element_data_type                 elem_data ;
-  typename mesh_type::elem_node_ids_type  elem_node_ids ;
-  typename mesh_type::node_coords_type    node_coords ;
-  value_vector_type                       nodal_values ;
-  elem_matrices_type                      element_matrices ;
-  elem_vectors_type                       element_vectors ;
-  scalar_type                             coeff_K ;
-
-public:
-
-  /** \brief  Store the inputs and run the element computation.
-   *
-   *  The work count is the leading dimension of the mesh's
-   *  element-to-node table.
-   */
-  ElementComputation( const mesh_type           & arg_mesh ,
-                      const elem_matrices_type  & arg_element_matrices ,
-                      const elem_vectors_type   & arg_element_vectors ,
-                      const value_vector_type   & arg_nodal_values ,
-                      const scalar_type           arg_coeff_K )
-  : elem_data()
-  , elem_node_ids( arg_mesh.elem_node_ids )
-  , node_coords( arg_mesh.node_coords )
-  , nodal_values( arg_nodal_values )
-  , element_matrices( arg_element_matrices )
-  , element_vectors( arg_element_vectors )
-  , coeff_K( arg_coeff_K )
-  {
-    const size_t work_count = arg_mesh.elem_node_ids.dimension_0();
-
-    parallel_for( work_count , *this );
-  }
-
-  //------------------------------------
-
-  // Operation-count estimate for one call to transform_gradients.
-  static const unsigned FLOPS_transform_gradients =
-     /* Jacobian */           FunctionCount * TensorDim * 2 +
-     /* Inverse jacobian */   TensorDim * 6 + 6 +
-     /* Gradient transform */ FunctionCount * 15 ;
-
-  /** \brief  Transform master-element basis gradients using the
-   *          element's nodal coordinates.
-   *
-   *  Accumulates the 3x3 Jacobian from 'grad' and the coordinates,
-   *  inverts it, and writes the transformed gradient components of every
-   *  basis function to dpsidx / dpsidy / dpsidz.
-   *
-   *  \return the Jacobian determinant, narrowed to float.
-   */
-  KOKKOS_INLINE_FUNCTION
-  float transform_gradients(
-    const float grad[][ FunctionCount ] , // Gradient of bases master element
-    const double x[] ,
-    const double y[] ,
-    const double z[] ,
-    float dpsidx[] ,
-    float dpsidy[] ,
-    float dpsidz[] ) const
-  {
-    // Row-major positions inside the flattened 3x3 tensors.
-    enum { j11 = 0 , j12 = 1 , j13 = 2 ,
-           j21 = 3 , j22 = 4 , j23 = 5 ,
-           j31 = 6 , j32 = 7 , j33 = 8 };
-
-    // Jacobian, accumulated in double precision.
-    double J[ TensorDim ] = { 0, 0, 0,  0, 0, 0,  0, 0, 0 };
-
-    for ( unsigned n = 0 ; n < FunctionCount ; ++n ) {
-      const double cx = x[n] ;
-      const double cy = y[n] ;
-      const double cz = z[n] ;
-
-      const float ga = grad[0][n] ;
-      const float gb = grad[1][n] ;
-      const float gc = grad[2][n] ;
-
-      J[j11] += ga * cx ;  J[j12] += ga * cy ;  J[j13] += ga * cz ;
-      J[j21] += gb * cx ;  J[j22] += gb * cy ;  J[j23] += gb * cz ;
-      J[j31] += gc * cx ;  J[j32] += gc * cy ;  J[j33] += gc * cz ;
-    }
-
-    // Cofactor products first (narrowed to float); they are scaled by
-    // 1/det further down to become the inverse.
-    float invJ[ TensorDim ] ;
-
-    invJ[j11] = static_cast<float>( J[j22] * J[j33] - J[j23] * J[j32] );
-    invJ[j12] = static_cast<float>( J[j13] * J[j32] - J[j12] * J[j33] );
-    invJ[j13] = static_cast<float>( J[j12] * J[j23] - J[j13] * J[j22] );
-
-    invJ[j21] = static_cast<float>( J[j23] * J[j31] - J[j21] * J[j33] );
-    invJ[j22] = static_cast<float>( J[j11] * J[j33] - J[j13] * J[j31] );
-    invJ[j23] = static_cast<float>( J[j13] * J[j21] - J[j11] * J[j23] );
-
-    invJ[j31] = static_cast<float>( J[j21] * J[j32] - J[j22] * J[j31] );
-    invJ[j32] = static_cast<float>( J[j12] * J[j31] - J[j11] * J[j32] );
-    invJ[j33] = static_cast<float>( J[j11] * J[j22] - J[j12] * J[j21] );
-
-    // Determinant from the first column of J and the first row of the
-    // cofactor products.
-    const float detJ = J[j11] * invJ[j11] +
-                       J[j21] * invJ[j12] +
-                       J[j31] * invJ[j13] ;
-
-    const float detJinv = 1.0 / detJ ;
-
-    for ( unsigned t = 0 ; t < TensorDim ; ++t ) {
-      invJ[t] *= detJinv ;
-    }
-
-    // Apply the inverse to the gradient of every basis function.
-    for ( unsigned n = 0 ; n < FunctionCount ; ++n ) {
-      const float ga = grad[0][n] ;
-      const float gb = grad[1][n] ;
-      const float gc = grad[2][n] ;
-
-      dpsidx[n] = ga * invJ[j11] + gb * invJ[j12] + gc * invJ[j13];
-      dpsidy[n] = ga * invJ[j21] + gb * invJ[j22] + gc * invJ[j23];
-      dpsidz[n] = ga * invJ[j31] + gb * invJ[j32] + gc * invJ[j33];
-    }
-
-    return detJ ;
-  }
-
-  /** \brief  Add one integration point's contribution to the element
-   *          residual 'elem_res' and element Jacobian 'elem_mat'.
-   *
-   *  The solution value and gradient at the point are interpolated from
-   *  'dof_values' using 'bases_vals' and the transformed gradients.
-   */
-  KOKKOS_INLINE_FUNCTION
-  void contributeResidualJacobian(
-    const float coeff_k ,
-    const double dof_values[] ,
-    const float dpsidx[] ,
-    const float dpsidy[] ,
-    const float dpsidz[] ,
-    const float detJ ,
-    const float integ_weight ,
-    const float bases_vals[] ,
-    double elem_res[] ,
-    double elem_mat[][ FunctionCount ] ) const
-  {
-    // Interpolated solution value and gradient at this point.
-    double u_pt    = 0 ;
-    double dudx_pt = 0 ;
-    double dudy_pt = 0 ;
-    double dudz_pt = 0 ;
-
-    for ( unsigned k = 0 ; k < FunctionCount ; ++k ) {
-      u_pt    += dof_values[k] * bases_vals[k] ;
-      dudx_pt += dof_values[k] * dpsidx[k] ;
-      dudy_pt += dof_values[k] * dpsidy[k] ;
-      dudz_pt += dof_values[k] * dpsidz[k] ;
-    }
-
-    const scalar_type k_detJ_weight = coeff_k * detJ * integ_weight ;
-    const double res_val = u_pt * u_pt * detJ * integ_weight ;
-    const double mat_val = 2.0 * u_pt  * detJ * integ_weight ;
-
-    // $$ R_i = \int_{\Omega} \nabla \phi_i \cdot (k \nabla T) + \phi_i T^2 d \Omega $$
-    // $$ J_{i,j} = \frac{\partial R_i}{\partial T_j} = \int_{\Omega} k \nabla \phi_i \cdot \nabla \phi_j + 2 \phi_i \phi_j T d \Omega $$
-
-    for ( unsigned row = 0 ; row < FunctionCount ; ++row ) {
-      const float phi_row = bases_vals[row] ;
-      const float dx_row  = dpsidx[row] ;
-      const float dy_row  = dpsidy[row] ;
-      const float dz_row  = dpsidz[row] ;
-
-      elem_res[row] += k_detJ_weight * ( dx_row * dudx_pt +
-                                         dy_row * dudy_pt +
-                                         dz_row * dudz_pt ) +
-                       res_val * phi_row ;
-
-      double * const mat_row = elem_mat[row] ;
-
-      for ( unsigned col = 0 ; col < FunctionCount ; ++col ) {
-        mat_row[col] += k_detJ_weight * ( dx_row * dpsidx[col] +
-                                          dy_row * dpsidy[col] +
-                                          dz_row * dpsidz[col] ) +
-                        mat_val * phi_row * bases_vals[col];
-      }
-    }
-  }
-
-  /** \brief  Work item for element 'ielem': gather nodal data, sum the
-   *          contributions of all integration points, store the result.
-   */
-  KOKKOS_INLINE_FUNCTION
-  void operator()( const unsigned ielem ) const
-  {
-    // Nodal coordinates and solution values of this element.
-    double x[ FunctionCount ] ;
-    double y[ FunctionCount ] ;
-    double z[ FunctionCount ] ;
-    double val[ FunctionCount ] ;
-
-    for ( unsigned n = 0 ; n < ElementNodeCount ; ++n ) {
-      const unsigned node = elem_node_ids( ielem , n );
-
-      x[n]   = node_coords( node , 0 );
-      y[n]   = node_coords( node , 1 );
-      z[n]   = node_coords( node , 2 );
-      val[n] = nodal_values( node );
-    }
-
-    // Local accumulators, cleared before integration.
-    double elem_vec[ FunctionCount ] ;
-    double elem_mat[ FunctionCount ][ FunctionCount ] ;
-
-    for ( unsigned r = 0 ; r < FunctionCount ; ++r ) {
-      elem_vec[r] = 0 ;
-      for ( unsigned c = 0 ; c < FunctionCount ; ++c ) {
-        elem_mat[r][c] = 0 ;
-      }
-    }
-
-    for ( unsigned q = 0 ; q < IntegrationCount ; ++q ) {
-      float dpsidx[ FunctionCount ] ;
-      float dpsidy[ FunctionCount ] ;
-      float dpsidz[ FunctionCount ] ;
-
-      const float detJ =
-        transform_gradients( elem_data.gradients[q] , x , y , z ,
-                             dpsidx , dpsidy , dpsidz );
-
-      contributeResidualJacobian( coeff_K ,
-                                  val , dpsidx , dpsidy , dpsidz ,
-                                  detJ ,
-                                  elem_data.weights[q] ,
-                                  elem_data.values[q] ,
-                                  elem_vec , elem_mat );
-    }
-
-    // Store the accumulated element vector and matrix.
-    for ( unsigned r = 0 ; r < FunctionCount ; ++r ) {
-      element_vectors( ielem , r ) = elem_vec[r] ;
-      for ( unsigned c = 0 ; c < FunctionCount ; ++c ) {
-        element_matrices( ielem , r , c ) = elem_mat[r][c] ;
-      }
-    }
-  }
-
-}; /* ElementComputation */
-
-//----------------------------------------------------------------------------
-
-/** \brief  Imposes Dirichlet values on a solution vector.
- *
- *  A node whose z coordinate is at or below bc_lower_z receives
- *  bc_lower_value; otherwise a node whose z coordinate is at or above
- *  bc_upper_z receives bc_upper_value.  All other nodes are left untouched.
- */
-template< typename ScalarCoordType , unsigned ElemNode , class DeviceType ,
-          typename ScalarType >
-struct DirichletSolution<
-  FEMesh< ScalarCoordType , ElemNode , DeviceType > ,
-  ScalarType >
-{
-  typedef DeviceType  execution_space ;
-
-  static const unsigned ElementNodeCount = ElemNode ;
-
-  typedef Kokkos::View< ScalarType[] , execution_space >                 vector_type ;
-  typedef FEMesh< ScalarCoordType , ElementNodeCount , execution_space > mesh_type ;
-
-  typename mesh_type::node_coords_type node_coords ;
-
-  vector_type     solution ;
-  ScalarCoordType bc_lower_z ;
-  ScalarCoordType bc_upper_z ;
-  ScalarType      bc_lower_value ;
-  ScalarType      bc_upper_value ;
-
-  // Work item for node 'inode'; the lower test takes precedence when a
-  // node satisfies both.
-  KOKKOS_INLINE_FUNCTION
-  void operator()( const unsigned inode ) const
-  {
-    const ScalarCoordType z = node_coords(inode,2);
-
-    if ( z <= bc_lower_z ) {
-      solution(inode) = bc_lower_value ;
-    }
-    else if ( bc_upper_z <= z ) {
-      solution(inode) = bc_upper_value ;
-    }
-  }
-
-  /** \brief  Fill a functor from the arguments and run it over every
-   *          entry of 'solution'.
-   */
-  static void apply( const vector_type    & solution ,
-                     const mesh_type      & mesh ,
-                     const ScalarCoordType  bc_lower_z ,
-                     const ScalarCoordType  bc_upper_z ,
-                     const ScalarType       bc_lower_value ,
-                     const ScalarType       bc_upper_value )
-  {
-    DirichletSolution functor ;
-
-    functor.node_coords    = mesh.node_coords ;
-    functor.solution       = solution ;
-    functor.bc_lower_z     = bc_lower_z ;
-    functor.bc_upper_z     = bc_upper_z ;
-    functor.bc_lower_value = bc_lower_value ;
-    functor.bc_upper_value = bc_upper_value ;
-
-    parallel_for( solution.dimension_0() , functor );
-  }
-};
-
-//----------------------------------------------------------------------------
-
-/** \brief  Applies Dirichlet conditions to a sparse linear system.
- *
- *  A node is on the boundary when its z coordinate is at or below
- *  bc_lower_z or at or above bc_upper_z.
- *    - Boundary row: rhs is set to zero and the row becomes zero except
- *      for a one where the column index equals the row index.
- *    - Any other row: every coefficient whose column node is on the
- *      boundary is set to zero.
- */
-template< typename ScalarCoordType , unsigned ElemNode , class DeviceType ,
-          typename ScalarType >
-struct DirichletResidual<
-  FEMesh< ScalarCoordType , ElemNode , DeviceType > , ScalarType >
-{
-  typedef DeviceType                           execution_space ;
-  typedef typename execution_space::size_type  size_type ;
-
-  static const unsigned ElementNodeCount = ElemNode ;
-
-  typedef Kokkos::CrsMatrix< ScalarType , execution_space >  matrix_type ;
-  typedef Kokkos::View< ScalarType[] , execution_space >     vector_type ;
-
-  typedef FEMesh< ScalarCoordType , ElementNodeCount , execution_space > mesh_type ;
-
-  typename mesh_type::node_coords_type node_coords ;
-  matrix_type     matrix ;
-  vector_type     rhs ;
-  ScalarCoordType bc_lower_z ;
-  ScalarCoordType bc_upper_z ;
-
-  // Work item for matrix row 'inode'.
-  KOKKOS_INLINE_FUNCTION
-  void operator()( const unsigned inode ) const
-  {
-    // Half-open range of this row's entries in the sparse storage.
-    const size_type row_begin = matrix.graph.row_map[inode];
-    const size_type row_end   = matrix.graph.row_map[inode+1];
-
-    const ScalarCoordType z = node_coords(inode,2);
-    const bool row_on_boundary = ( z <= bc_lower_z ) || ( bc_upper_z <= z );
-
-    if ( ! row_on_boundary ) {
-
-      // Clear the coefficients that couple this row to boundary nodes.
-      for ( size_type k = row_begin ; k < row_end ; ++k ) {
-        const size_type cnode = matrix.graph.entries(k) ;
-
-        const ScalarCoordType zc = node_coords(cnode,2);
-
-        if ( ( zc <= bc_lower_z ) || ( bc_upper_z <= zc ) ) {
-          matrix.coefficients(k) = 0 ;
-        }
-      }
-      return ;
-    }
-
-    rhs(inode) = 0 ; //  set the residual vector
-
-    // Replace the row by a one on the diagonal and zeros elsewhere.
-    for ( size_type k = row_begin ; k < row_end ; ++k ) {
-      const bool on_diagonal = ( (int) inode == matrix.graph.entries(k) );
-
-      matrix.coefficients(k) = on_diagonal ? 1 : 0 ;
-    }
-  }
-
-  /** \brief  Fill a functor from the arguments and run it over every
-   *          matrix row (row_map length minus one).
-   */
-  static void apply( const matrix_type & linsys_matrix ,
-                     const vector_type & linsys_rhs ,
-                     const mesh_type   & mesh ,
-                     const ScalarCoordType  bc_lower_z ,
-                     const ScalarCoordType  bc_upper_z)
-  {
-    const size_t row_count = linsys_matrix.graph.row_map.dimension_0() - 1 ;
-
-    DirichletResidual functor ;
-
-    functor.node_coords = mesh.node_coords ;
-    functor.matrix      = linsys_matrix ;
-    functor.rhs         = linsys_rhs ;
-    functor.bc_lower_z  = bc_lower_z ;
-    functor.bc_upper_z  = bc_upper_z ;
-
-    parallel_for( row_count , functor );
-  }
-};
-
-//----------------------------------------------------------------------------
-
-} /* namespace Nonlinear */
-} /* namespace HybridFEM */
-
-#endif /* #ifndef KOKKOS_NONLINEARFUNCTORS_HPP */
-
--- a/lib/kokkos/example/multi_fem/ParallelComm.hpp
+++ b/lib/kokkos/example/multi_fem/ParallelComm.hpp
@ -1,167 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-// 
-//                        Kokkos v. 2.0
-//              Copyright (2014) Sandia Corporation
-// 
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
-// the U.S. Government retains certain rights in this software.
-// 
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
-// ************************************************************************
-//@HEADER
-*/
-
-#ifndef PARALLELCOMM_HPP
-#define PARALLELCOMM_HPP
-
-//------------------------------------------------------------------------
-
-#include <Kokkos_Macros.hpp>
-
-//------------------------------------------------------------------------
-
-#if defined( KOKKOS_HAVE_MPI )
-
-#include <mpi.h>
-#include <string>
-
-namespace comm {
-
-/** \brief  Value wrapper around an MPI communicator handle. */
-struct Machine {
-  MPI_Comm mpi_comm ;
-
-  /** Default construction refers to no communicator (MPI_COMM_NULL). */
-  Machine() : mpi_comm( MPI_COMM_NULL ) {}
-
-  Machine( const Machine & rhs )
-    : mpi_comm( rhs.mpi_comm ) {}
-
-  Machine( MPI_Comm c ) : mpi_comm( c ) {}
-
-  /** Call MPI_Init and return a Machine bound to MPI_COMM_WORLD. */
-  static Machine init( int * argc , char *** argv )
-  {
-    MPI_Init( argc , argv );
-    return Machine( MPI_COMM_WORLD );
-  }
-
-  /** Call MPI_Finalize. */
-  static void finalize() { MPI_Finalize(); }
-};
-
-/** \brief  Number of processes in the machine's communicator. */
-inline
-unsigned  size( Machine machine )
-{
-  int np ; MPI_Comm_size( machine.mpi_comm , & np ); return np ;
-}
-
-/** \brief  Rank of the calling process in the machine's communicator. */
-inline
-unsigned  rank( Machine machine )
-{
-  int ip ; MPI_Comm_rank( machine.mpi_comm , & ip ); return ip ;
-}
-
-/** \brief  Maximum of 'local' over all processes (MPI_Allreduce, MPI_MAX). */
-inline
-double max( Machine machine , double local )
-{
-  double global = 0;
-  MPI_Allreduce( & local , & global , 1 , MPI_DOUBLE , MPI_MAX , machine.mpi_comm );
-  return global ;
-}
-
-/** \brief  Command line as seen by rank 0, returned on every process.
- *
- *  Rank 0 joins argv[1..argc) with a leading blank before each argument;
- *  the length and then the characters are broadcast from rank 0.
- */
-inline
-std::string command_line( Machine machine , const int argc , const char * const * const argv )
-{
-  std::string argline ;
-
-  if ( 0 == rank( machine ) ) {
-    for ( int i = 1 ; i < argc ; ++i ) {
-      argline.append(" ").append( argv[i] );
-    }
-  }
-
-  int length = static_cast<int>( argline.length() );
-  MPI_Bcast( & length , 1 , MPI_INT , 0 , machine.mpi_comm );
-  argline.resize( length , ' ' );
-
-  // Receive into the string's own storage through a non-const reference.
-  // Writing through data() with the const cast away is not a sanctioned
-  // way to modify a std::string.  Every rank holds the same 'length'
-  // after the first broadcast, so skipping the payload broadcast for an
-  // empty command line is consistent across the communicator.
-  if ( 0 < length ) {
-    MPI_Bcast( & argline[0] , length , MPI_CHAR , 0 , machine.mpi_comm );
-  }
-
-  return argline ;
-}
-
-}
-
-#else /* ! defined( KOKKOS_HAVE_MPI ) */
-
-#include <string>
-
-namespace comm {
-
-// Serial stand-ins used when MPI is not available: exactly one process,
-// whose rank is zero.
-
-struct Machine {
-  // Arguments are ignored; there is nothing to initialize.
-  static Machine init( int * , char *** )
-  {
-    return Machine();
-  }
-
-  // Nothing to shut down.
-  static void finalize()
-  {
-  }
-};
-
-// Process count of the serial machine.
-inline
-unsigned  size( Machine )
-{
-  return 1 ;
-}
-
-// Rank of the only process.
-inline
-unsigned  rank( Machine )
-{
-  return 0 ;
-}
-
-// With a single process the global maximum is the local value.
-inline
-double max( Machine , double local )
-{
-  return local ;
-}
-
-// Joins argv[1..argc) into one string, each argument preceded by a blank.
-inline
-std::string command_line( Machine machine , const int argc , const char * const * const argv )
-{
-  std::string joined ;
-
-  if ( 0 == rank( machine ) ) {
-    int arg = 1 ;
-    while ( arg < argc ) {
-      joined += " " ;
-      joined += argv[arg] ;
-      ++arg ;
-    }
-  }
-
-  return joined ;
-}
-
-}
-
-#endif /* ! defined( KOKKOS_HAVE_MPI ) */
-
-//------------------------------------------------------------------------
-
-#endif /* #ifndef PARALLELCOMM_HPP */
-
-
--- a/lib/kokkos/example/multi_fem/ParallelDataMap.hpp
+++ b/lib/kokkos/example/multi_fem/ParallelDataMap.hpp
@ -1,517 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-// 
-//                        Kokkos v. 2.0
-//              Copyright (2014) Sandia Corporation
-// 
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
-// the U.S. Government retains certain rights in this software.
-// 
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
-// ************************************************************************
-//@HEADER
-*/
-
-#ifndef KOKKOS_PARALLELDATAMAP_HPP
-#define KOKKOS_PARALLELDATAMAP_HPP
-
-#include <utility>
-#include <limits>
-#include <iostream>
-#include <sstream>
-#include <stdexcept>
-
-#include <Kokkos_Core.hpp>
-#include <ParallelComm.hpp>
-
-namespace Kokkos {
-
-//----------------------------------------------------------------------------
-/** \brief  Parallel distributed data mapping
- *
- *  Items are ordered as
- *    interior : owned items that are not sent elsewhere
- *    send     : owned items that are sent
- *    receive  : not-owned items that are received
- *
- *  host_recv      : one ( P , N ) pair per message, N ghosted items
- *                   received from process P
- *  host_send      : one ( P , N ) pair per message, N items sent to
- *                   process P
- *  host_send_item : offsets of the sent items within the 'send' range
- */
-struct ParallelDataMap {
-  typedef View< unsigned*[2], HostSpace >  host_recv_type ;
-  typedef View< unsigned*[2], HostSpace >  host_send_type ;
-  typedef View< unsigned* ,   HostSpace >  host_send_item_type ;
-
-  comm::Machine        machine ;
-  host_recv_type       host_recv ;
-  host_send_type       host_send ;
-  host_send_item_type  host_send_item ;
-  unsigned             count_interior ;
-  unsigned             count_send ;
-  unsigned             count_owned ; // = count_interior + count_send
-  unsigned             count_receive ;
-
-  /** \brief  Set the item counts and allocate the three host views.
-   *
-   *  count_send and count_receive are derived by subtraction from the
-   *  interior / owned / total counts.  All three views are created with
-   *  the same label.
-   */
-  void assign( const unsigned arg_count_interior ,
-               const unsigned arg_count_owned ,
-               const unsigned arg_count_total ,
-               const unsigned arg_recv_msg ,
-               const unsigned arg_send_msg ,
-               const unsigned arg_send_count )
-  {
-    const std::string view_label( "Kokkos::ParallelDataMap buffer" );
-
-    // Counts first ...
-    count_interior = arg_count_interior ;
-    count_owned    = arg_count_owned ;
-    count_send     = arg_count_owned - arg_count_interior ;
-    count_receive  = arg_count_total - arg_count_owned ;
-
-    // ... then the message tables and the send-item list.
-    host_recv      = host_recv_type(      view_label , arg_recv_msg );
-    host_send      = host_send_type(      view_label , arg_send_msg );
-    host_send_item = host_send_item_type( view_label , arg_send_count );
-  }
-};
-
-//----------------------------------------------------------------------------
-//PackArray
-//----------------------------------------------------------------------------
-template< class ArrayType , class Rank = void >
-struct PackArray ;
-
-/** \brief  Packs a contiguous range of a rank-one View into a buffer.
- *
- *  pack() copies arg_input( arg_begin + i ) to arg_output[ i ] for every
- *  i in [ 0 , arg_count ).
- */
-template< typename DeviceType, typename ValueType >
-struct PackArray< View< ValueType* , DeviceType > , void >
-{
-  typedef DeviceType                         execution_space ;
-  typedef typename DeviceType::size_type     size_type ;
-  typedef View< ValueType* , execution_space >  array_type ;
-  typedef View< ValueType* , execution_space >  buffer_type ;
-
-private:
-
-  buffer_type  output ;
-  array_type   input ;
-  size_type    base ;   // offset of the first packed item within 'input'
-
-public:
-
-  /** Work item: copy one value from the array into the buffer. */
-  KOKKOS_INLINE_FUNCTION
-  void operator()( const size_type i ) const
-  { output[i] = input(base+i); }
-
-  /** \brief  Pack 'arg_count' values starting at 'arg_begin'.
-   *
-   *  \param arg_output  destination buffer
-   *  \param arg_begin   index of the first value taken from arg_input
-   *  \param arg_count   number of values to copy; zero is a no-op
-   *  \param arg_input   source array
-   */
-  inline
-  static
-  void pack( const buffer_type & arg_output ,
-             const size_type     arg_begin ,
-             const size_type     arg_count ,
-             const array_type  & arg_input )
-  {
-    // Nothing to copy: skip the dispatch, as the ValueType*[N1]
-    // specialization does, instead of issuing an empty parallel_for.
-    if ( arg_count ) {
-      PackArray op ;
-      op.output = arg_output ;
-      op.input  = arg_input ;
-      op.base   = arg_begin ;
-      parallel_for( arg_count , op );
-    }
-  }
-};
-
-/** \brief  Packs a range of rows of a ValueType*[N1] View into a flat
- *          buffer.
- *
- *  Row ( arg_begin + i ) of the array is written to buffer positions
- *  [ i * N1 , i * N1 + N1 ).
- */
-template< typename DeviceType, typename ValueType , unsigned N1 >
-struct PackArray< View< ValueType*[N1] , DeviceType > , void >
-{
-  typedef DeviceType                                execution_space ;
-  typedef typename DeviceType::size_type            size_type ;
-  typedef View< ValueType*[N1] , execution_space >  array_type ;
-  typedef View< ValueType* , execution_space >      buffer_type ;
-
-private:
-
-  buffer_type  output ;
-  array_type   input ;
-  size_type    base ;
-
-public:
-
-  // Work item: copy the N1 values of one row.
-  KOKKOS_INLINE_FUNCTION
-  void operator()( const size_type i ) const
-  {
-    const size_type first = i * N1 ;
-
-    for ( size_type col = 0 ; col < N1 ; ++col ) {
-      output[ first + col ] = input( base + i , col );
-    }
-  }
-
-  // Pack 'arg_count' rows starting at row 'arg_begin'; a zero count
-  // returns without dispatching.
-  inline static
-  void pack( const buffer_type & arg_output ,
-             const size_type     arg_begin ,
-             const size_type     arg_count ,
-             const array_type  & arg_input )
-  {
-    if ( ! arg_count ) { return ; }
-
-    PackArray functor ;
-    functor.output = arg_output ;
-    functor.input  = arg_input ;
-    functor.base   = arg_begin ;
-
-    parallel_for( arg_count , functor );
-  }
-};
-
-//----------------------------------------------------------------------------
-//UnpackArray
-//----------------------------------------------------------------------------
-template< class ArrayType , class Rank = void > struct UnpackArray ;
-
-/** \brief  Unpacks a buffer into a contiguous range of a rank-one View.
- *
- *  unpack() copies arg_input[ i ] to arg_output( arg_begin + i ) for every
- *  i in [ 0 , arg_count ).
- */
-template< typename DeviceType, typename ValueType >
-struct UnpackArray< View< ValueType* , DeviceType > , void >
-{
-  typedef DeviceType                         execution_space ;
-  typedef typename DeviceType::size_type     size_type ;
-  typedef View< ValueType* , execution_space >  array_type ;
-  typedef View< ValueType* , execution_space >  buffer_type ;
-
-private:
-
-  array_type   output ;
-  buffer_type  input ;
-  size_type    base ;   // offset of the first unpacked item within 'output'
-
-public:
-
-  /** Work item: copy one value from the buffer into the array. */
-  KOKKOS_INLINE_FUNCTION
-  void operator()( const size_type i ) const
-  { output(base+i) = input[i]; }
-
-  /** \brief  Unpack 'arg_count' values starting at 'arg_begin'.
-   *
-   *  \param arg_output  destination array
-   *  \param arg_input   source buffer
-   *  \param arg_begin   index of the first value written in arg_output
-   *  \param arg_count   number of values to copy; zero is a no-op
-   */
-  inline
-  static
-  void unpack( const array_type  & arg_output ,
-               const buffer_type & arg_input ,
-               const size_type     arg_begin ,
-               const size_type     arg_count )
-  {
-    // Nothing to copy: skip the dispatch, as the ValueType*[N1]
-    // specialization does, instead of issuing an empty parallel_for.
-    if ( arg_count ) {
-      UnpackArray op ;
-      op.output = arg_output ;
-      op.input  = arg_input ;
-      op.base   = arg_begin ;
-      parallel_for( arg_count , op );
-    }
-  }
-};
-
-/** \brief  Unpacks a flat buffer into a range of rows of a
- *          ValueType*[N1] View.
- *
- *  Buffer positions [ i * N1 , i * N1 + N1 ) are written to row
- *  ( arg_begin + i ) of the array.
- */
-template< typename DeviceType, typename ValueType , unsigned N1 >
-struct UnpackArray< View< ValueType*[N1] , DeviceType > , void >
-{
-  typedef DeviceType                                execution_space ;
-  typedef typename DeviceType::size_type            size_type ;
-  typedef View< ValueType* , execution_space >      buffer_type ;
-  typedef View< ValueType*[N1] , execution_space >  array_type ;
-
-private:
-
-  array_type   output ;
-  buffer_type  input ;
-  size_type    base ;
-
-public:
-
-  // Work item: copy the N1 values of one row.
-  KOKKOS_INLINE_FUNCTION
-  void operator()( const size_type i ) const
-  {
-    const size_type first = i * N1 ;
-
-    for ( size_type col = 0 ; col < N1 ; ++col ) {
-      output( base + i , col ) = input( first + col );
-    }
-  }
-
-  // Unpack 'arg_count' rows starting at row 'arg_begin'; a zero count
-  // returns without dispatching.
-  inline
-  static
-  void unpack( const array_type  & arg_output ,
-               const buffer_type & arg_input ,
-               const size_type     arg_begin ,
-               const size_type     arg_count )
-  {
-    if ( ! arg_count ) { return ; }
-
-    UnpackArray functor ;
-    functor.output = arg_output ;
-    functor.input  = arg_input ;
-    functor.base   = arg_begin ;
-
-    parallel_for( arg_count , functor );
-  }
-};
-//----------------------------------------------------------------------------
-template< class ValueType , class Device , class DataMap >
-class AsyncExchange ;
-
-} // namespace Kokkos
-
-//----------------------------------------------------------------------------
-// Application call procedure:
-//
-// construct: AsyncExchange object
-// * pack send buffer on device
-// initiate: copy send buffer from device to host
-// * dispatch asynchronous local work
-// complete: send/receive on host, copy receive buffer to device
-// * unpack receive buffer on device
-// destroy: AsyncExchange object
-//
-//----------------------------------------------------------------------------
-
-#ifdef KOKKOS_HAVE_MPI
-
-namespace Kokkos {
-
-/** \brief  MPI exchange of ParallelDataMap send/receive items, staged through host buffers.
- *
- *  Usage, in order:
- *    construct       : allocates the device and host buffers
- *    (application)   : packs the send items into 'buffer()' on the device
- *    setup()         : posts the receives and copies the send data to the host
- *    (application)   : may dispatch local work on the device
- *    send_receive()  : sends, waits for the receives, copies them to the device
- *    (application)   : unpacks the received items from 'buffer()' on the device
- */
-template< class ValueType , class Device >
-class AsyncExchange< ValueType, Device , Kokkos::ParallelDataMap > {
-public:
-
-  typedef Device                                        execution_space ;
-  typedef Kokkos::ParallelDataMap                       data_map_type ;
-  typedef Kokkos::View< ValueType* , execution_space >  buffer_dev_type ;
-  typedef typename buffer_dev_type::HostMirror          buffer_host_type ;
-
-private:
-
-  // Tag shared by every message of this exchange.
-  static const int mpi_tag = 11 ;
-
-  const data_map_type  data_map ;
-  unsigned             chunk_size ;       // values per exchanged item
-  unsigned             send_count_max ;   // largest item count of one send message
-  buffer_host_type     host_recv_buffer ; // all received messages, back to back
-  buffer_host_type     host_send_buffer ; // host copy of the device send data
-  buffer_host_type     send_msg_buffer ;  // staging area for one outgoing message
-  buffer_dev_type      dev_buffer ;       // shared by send and receive
-  buffer_dev_type      dev_send_buffer ;  // Subview for send
-  buffer_dev_type      dev_recv_buffer ;  // Subview for receive
-  std::vector< MPI_Request > recv_request ;
-
-public:
-
-  /** \brief  Device buffer the application packs into and unpacks from. */
-  const buffer_dev_type & buffer() const { return dev_buffer ; }
-
-  /** \brief  Allocate all buffers for exchanging 'arg_chunk' values per item
-   *          according to 'arg_data_map'.
-   */
-  AsyncExchange( const data_map_type & arg_data_map ,
-                 const size_t          arg_chunk )
-  : data_map( arg_data_map )
-  , chunk_size( arg_chunk )
-  , send_count_max( 0 )
-  , host_recv_buffer()
-  , host_send_buffer()
-  , send_msg_buffer()
-  , dev_buffer()
-  , dev_send_buffer()
-  , dev_recv_buffer()
-  , recv_request()
-  {
-    typedef std::pair<size_t,size_t> range_type ;
-
-    const size_t n_send_msg = arg_data_map.host_send.dimension_0();
-    const size_t n_recv_msg = arg_data_map.host_recv.dimension_0();
-
-    const size_t send_length = arg_chunk * arg_data_map.count_send ;
-    const size_t recv_length = arg_chunk * arg_data_map.count_receive ;
-
-    // Largest item count among the outgoing messages.
-    for ( size_t i = 0 ; i < n_send_msg ; ++i ) {
-      const unsigned n_item = (unsigned) arg_data_map.host_send(i,1) ;
-      if ( send_count_max < n_item ) { send_count_max = n_item ; }
-    }
-
-    // One device allocation, long enough for the larger of the
-    // send and receive data, serves both directions.
-    const size_t dev_length =
-      send_length < recv_length ? recv_length : send_length ;
-
-    dev_buffer =
-      buffer_dev_type( std::string("AsyncExchange dev_buffer") , dev_length );
-
-    // Send and receive views both start at the front of the device buffer.
-    dev_send_buffer = Kokkos::subview( dev_buffer , range_type( 0 , send_length ) );
-    dev_recv_buffer = Kokkos::subview( dev_buffer , range_type( 0 , recv_length ) );
-
-    // Host buffer holding every received message:
-    host_recv_buffer =
-      buffer_host_type( std::string("AsyncExchange host_recv_buffer") , recv_length );
-
-    // Host buffer holding the complete send data:
-    host_send_buffer =
-      buffer_host_type( std::string("AsyncExchange host_send_buffer") , send_length );
-
-    // Host buffer for a single packed message, sized for the largest one:
-    send_msg_buffer =
-      buffer_host_type( std::string("AsyncExchange send_msg_buffer") ,
-                        arg_chunk * send_count_max );
-
-    // One asynchronous receive request handle per incoming message:
-    recv_request.assign( n_recv_msg , MPI_REQUEST_NULL );
-  }
-
-  //------------------------------------------------------------------------
-
-  /** \brief  Post the non-blocking receives, then copy the packed
-   *          send data from the device to the host.
-   */
-  void setup()
-  {
-    const size_t n_recv_msg = data_map.host_recv.dimension_0();
-
-    // Each message lands directly after the previous one in the host buffer.
-    ValueType * recv_ptr = host_recv_buffer.ptr_on_device();
-
-    for ( size_t msg = 0 ; msg < n_recv_msg ; ++msg ) {
-      const int proc  = data_map.host_recv(msg,0);
-      const int count = data_map.host_recv(msg,1) * chunk_size ;
-
-      // Messages are transferred as raw bytes.
-      MPI_Irecv( recv_ptr , count * sizeof(ValueType) , MPI_BYTE ,
-                 proc , mpi_tag , data_map.machine.mpi_comm ,
-                 & recv_request[msg] );
-
-      recv_ptr += count ;
-    }
-
-    // Bring the send data to host memory for sending.
-    Kokkos::deep_copy( host_send_buffer , dev_send_buffer );
-
-    // The device is not needed again until communication completes,
-    // so the application can now dispatch asynchronous device work.
-  }
-
-  // Application can dispatch local work to device ...
-  // No communication progress until main thread calls 'send_receive'
-
-  /** \brief  Pack and send each message, wait for and verify every receive,
-   *          then copy the received data to the device.
-   *
-   *  Throws std::runtime_error if a message arrives from an unexpected
-   *  process or with an unexpected size.
-   */
-  void send_receive()
-  {
-    const size_t n_recv_msg = data_map.host_recv.dimension_0();
-    const size_t n_send_msg = data_map.host_send.dimension_0();
-
-    // Pack and send, one message at a time.
-    // 'item' runs over the send items of all messages consecutively.
-
-    size_t item = 0 ;
-
-    for ( size_t msg = 0 ; msg < n_send_msg ; ++msg ) {
-      const int proc  = data_map.host_send(msg,0);
-      const int count = data_map.host_send(msg,1);
-
-      int dst = 0 ; // next free entry of the single-message buffer
-
-      for ( int k = 0 ; k < count ; ++k , ++item ) {
-        const int dst_end = dst + chunk_size ;
-        int src = chunk_size * data_map.host_send_item(item);
-
-        while ( dst < dst_end ) {
-          send_msg_buffer[dst] = host_send_buffer[src];
-          ++dst ;
-          ++src ;
-        }
-      }
-
-      // MPI_Ssend blocks until
-      // (1) a receive is matched for the message and
-      // (2) the send buffer can be re-used,
-      // so the single-message buffer is safe to overwrite on the next pass.
-      //
-      // It is suggested that MPI_Ssend will have the best performance:
-      // http://www.mcs.anl.gov/research/projects/mpi/sendmode.html .
-
-      MPI_Ssend( send_msg_buffer.ptr_on_device(),
-                 count * chunk_size * sizeof(ValueType) , MPI_BYTE ,
-                 proc , mpi_tag , data_map.machine.mpi_comm );
-    }
-
-    // Wait for the receives, in whatever order they complete, and verify each.
-
-    for ( size_t n = 0 ; n < n_recv_msg ; ++n ) {
-      MPI_Status status ;
-      int which = 0 ;
-      int size  = 0 ;
-
-      MPI_Waitany( n_recv_msg , & recv_request[0] , & which , & status );
-
-      const int proc = status.MPI_SOURCE ;
-
-      MPI_Get_count( & status , MPI_BYTE , & size );
-
-      // What the data map says this request should have delivered:
-
-      const int expected_proc = data_map.host_recv(which,0);
-      const int expected_size = data_map.host_recv(which,1) *
-                                chunk_size * sizeof(ValueType);
-
-      const bool verified = ( expected_proc == proc ) &&
-                            ( expected_size == size );
-
-      if ( ! verified ) {
-        std::ostringstream msg ;
-        msg << "AsyncExchange error:"
-            << " P" << comm::rank( data_map.machine )
-            << " received from P" << proc
-            << " size "     << size
-            << " expected " << expected_size
-            << " from P"    << expected_proc ;
-        throw std::runtime_error( msg.str() );
-      }
-    }
-
-    // Make the received data available on the device.
-
-    Kokkos::deep_copy( dev_recv_buffer , host_recv_buffer );
-  }
-};
-
-} // namespace Kokkos
-
-#else /* ! #ifdef KOKKOS_HAVE_MPI */
-
-namespace Kokkos {
-
-/** \brief  Stand-in used when MPI is not available: same interface,
- *          but 'setup' and 'send_receive' do nothing and the device
- *          buffer is left default constructed.
- */
-template< class ValueType , class Device >
-class AsyncExchange< ValueType, Device , Kokkos::ParallelDataMap > {
-public:
-
-  typedef Device                                        execution_space ;
-  typedef Kokkos::ParallelDataMap                       data_map_type ;
-  typedef Kokkos::View< ValueType* , execution_space >  buffer_dev_type ;
-  typedef typename buffer_dev_type::HostMirror          buffer_host_type ;
-
-  buffer_dev_type  dev_buffer ;
-
-  /** \brief  Both arguments are ignored. */
-  AsyncExchange( const data_map_type & , const size_t ) : dev_buffer() {}
-
-  const buffer_dev_type & buffer() const { return dev_buffer ; }
-
-  //------------------------------------------------------------------------
-
-  /** \brief  Nothing to post or copy without MPI. */
-  void setup() {}
-
-  /** \brief  Nothing to send or receive without MPI. */
-  void send_receive() {}
-};
-
-} // namespace Kokkos
-
-#endif /* ! #ifdef KOKKOS_HAVE_MPI */
-
-//----------------------------------------------------------------------------
-
-#endif /* #ifndef KOKKOS_PARALLELDATAMAP_HPP */
-
-
--- a/lib/kokkos/example/multi_fem/ParallelMachine.cpp
+++ b/lib/kokkos/example/multi_fem/ParallelMachine.cpp
@ -1,178 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-// 
-//                        Kokkos v. 2.0
-//              Copyright (2014) Sandia Corporation
-// 
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
-// the U.S. Government retains certain rights in this software.
-// 
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
-// ************************************************************************
-//@HEADER
-*/
-
-#if 0
-
-#include <stdlib.h>
-#include <string.h>
-
-#include <ParallelMachine.hpp>
-
-#include <Kokkos_Core.hpp>
-
-#if ! defined( KOKKOS_HAVE_MPI )
-#define MPI_COMM_NULL 0
-#endif
-
-//------------------------------------------------------------------------
-
-namespace Parallel {
-
-/** Initialize Cuda, MPI, and Threads in a coordinated order, driven by
- *  the command line arguments:
- *
- *    "mpi_cuda" : select the Cuda device before MPI_Init, using the
- *                 node-local rank read from the environment; if that rank
- *                 is unavailable this is handled like "cuda"
- *    "host"     : initialize Threads, optionally followed by two counts
- *                 whose product is the number of threads
- *    "cuda"     : select the Cuda device from the MPI rank after MPI_Init
- *
- *  \param argc  pointer to the argument count, forwarded to MPI_Init
- *  \param argv  pointer to the argument vector, forwarded to MPI_Init
- */
-Machine::Machine( int * argc , char *** argv )
-  : m_mpi_comm( MPI_COMM_NULL )
-  , m_mpi_size(0)
-  , m_mpi_rank(0)
-  , m_mpi_gpu(0)
-  , m_gpu_arch(0) // never assigned below; give 'gpu_arch()' a defined value
-{
-
-#if defined( KOKKOS_HAVE_CUDA )
-  //------------------------------------
-  // Might be using a Cuda aware version of MPI.
-  // Must select Cuda device before initializing MPI.
-  {
-    int i = 1 ;
-    for ( ; i < *argc && strcmp((*argv)[i],"mpi_cuda") ; ++i );
-
-    if ( i < *argc ) {
-      // Determine, if possible, what will be the node-local
-      // rank of the MPI process once MPI has been initialized.
-      // This rank is needed to set the Cuda device before 'mvapich'
-      // is initialized.
-
-      const char * const mvapich_local_rank = getenv("MV2_COMM_WORLD_LOCAL_RANK");
-      const char * const slurm_local_rank   = getenv("SLURM_LOCALID");
-
-      // Prefer the mvapich variable, then the slurm variable; -1 if neither is set.
-      const int pre_mpi_local_rank =
-        0 != mvapich_local_rank ? atoi( mvapich_local_rank ) : (
-        0 != slurm_local_rank   ? atoi( slurm_local_rank ) : (
-        -1 ) );
-
-      if ( 0 <= pre_mpi_local_rank ) {
-
-        const int ngpu = Kokkos::Cuda::detect_device_count();
-
-        // Guard the modulo: with no detected device it would divide by zero.
-        // Device 0 is requested instead so that the failure is left to
-        // Kokkos::Cuda::initialize.
-        const int cuda_device_rank =
-          0 < ngpu ? pre_mpi_local_rank % ngpu : 0 ;
-
-        Kokkos::Cuda::initialize( Kokkos::Cuda::SelectDevice( cuda_device_rank ) );
-
-        m_mpi_gpu = 1 ;
-      }
-    }
-  }
-#endif
-
-  //------------------------------------
-
-#if defined( KOKKOS_HAVE_MPI )
-  MPI_Init( argc , argv );
-  m_mpi_comm = MPI_COMM_WORLD ;
-  MPI_Comm_size( m_mpi_comm , & m_mpi_size );
-  MPI_Comm_rank( m_mpi_comm , & m_mpi_rank );
-#endif
-
-  // Query hwloc after MPI initialization to allow MPI binding:
-  //------------------------------------
-  // Request to use host device:
-  {
-    int i = 1 ;
-    for ( ; i < *argc && strcmp((*argv)[i],"host") ; ++i );
-
-    if ( i < *argc ) {
-
-      // Default: everything hwloc reports as available.
-      unsigned team_count       = Kokkos::hwloc::get_available_numa_count();
-      unsigned threads_per_team = Kokkos::hwloc::get_available_cores_per_numa() *
-                                  Kokkos::hwloc::get_available_threads_per_core();
-
-      // Override only when two more arguments follow "host".
-      if ( i + 2 < *argc ) {
-        team_count       = atoi( (*argv)[i+1] );
-        threads_per_team = atoi( (*argv)[i+2] );
-      }
-
-      Kokkos::Threads::initialize( team_count * threads_per_team );
-    }
-  }
-
-#if defined( KOKKOS_HAVE_CUDA )
-  //------------------------------------
-  // Request to use Cuda device and not already initialized.
-  if ( ! m_mpi_gpu ) {
-    int i = 1 ;
-    for ( ; i < *argc && strcmp((*argv)[i],"mpi_cuda") && strcmp((*argv)[i],"cuda") ; ++i );
-
-    if ( i < *argc ) {
-
-      const int ngpu = Kokkos::Cuda::detect_device_count();
-
-      // Same guard as above against a modulo by zero.
-      const int cuda_device_rank = 0 < ngpu ? m_mpi_rank % ngpu : 0 ;
-
-      Kokkos::Cuda::initialize( Kokkos::Cuda::SelectDevice( cuda_device_rank ) );
-    }
-  }
-#endif
-
-}
-
-/** Finalize Threads, then Cuda and MPI when they are compiled in. */
-Machine::~Machine()
-{
-  Kokkos::Threads::finalize();
-
-#ifdef KOKKOS_HAVE_CUDA
-  Kokkos::Cuda::finalize();
-#endif
-
-#ifdef KOKKOS_HAVE_MPI
-  // MPI goes last, after the Kokkos devices.
-  MPI_Finalize();
-#endif
-}
-
-/** Write the MPI rank and size, followed by the configuration of the
- *  Threads device and, when compiled in, of the Cuda device, to 'msg'.
- */
-void Machine::print_configuration( std::ostream & msg ) const
-{
-  msg << "MPI [ " << m_mpi_rank ;
-  msg << " / " << m_mpi_size << " ]" ;
-  msg << std::endl ;
-
-  Kokkos::Threads::print_configuration( msg );
-
-#ifdef KOKKOS_HAVE_CUDA
-  Kokkos::Cuda::print_configuration( msg );
-#endif
-}
-
-}
-
-#endif /* #if 0 */
-
--- a/lib/kokkos/example/multi_fem/ParallelMachine.hpp
+++ b/lib/kokkos/example/multi_fem/ParallelMachine.hpp
@ -1,118 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-// 
-//                        Kokkos v. 2.0
-//              Copyright (2014) Sandia Corporation
-// 
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
-// the U.S. Government retains certain rights in this software.
-// 
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
-// ************************************************************************
-//@HEADER
-*/
-
-#error "ParallelMachine"
-
-#ifndef PARALLELMACHINE_HPP
-#define PARALLELMACHINE_HPP
-
-//------------------------------------------------------------------------
-
-#include <iosfwd>
-
-#include <Kokkos_Core.hpp>
-
-//------------------------------------------------------------------------
-
-#if defined( KOKKOS_HAVE_MPI )
-#include <mpi.h>
-#else
-  typedef int MPI_Comm ;
-#endif
-
-//------------------------------------------------------------------------
-//------------------------------------------------------------------------
-
-namespace Parallel {
-
-/** \brief  Hybrid parallel machine with MPI+Kokkos::Threads or MPI+Kokkos::Cuda.
- *
- *  Initialization of MPI and of the Kokkos devices has interdependencies
- *  which this class manages.  The command line and environment variables
- *  are queried to decide which device to initialize:
- *
- *    1)  cuda               : initializes the Cuda device
- *    2)  mpi_cuda           : initializes the Cuda device before MPI when
- *                             the node-local rank can be determined
- *    3)  host               : initializes Threads with all hwloc detected cores
- *    4)  host #gang #worker : initializes Threads with the specified
- *                             gang and worker counts
- */
-class Machine {
-private:
-
-  MPI_Comm m_mpi_comm ;
-  int      m_mpi_size ;
-  int      m_mpi_rank ;
-  unsigned m_mpi_gpu ;
-  unsigned m_gpu_arch ;
-
-  // Private and without a definition here; presumably to forbid
-  // default construction and copying.
-  Machine();
-  Machine( const Machine & );
-  Machine & operator = ( const Machine & );
-
-public:
-
-  /** \brief  Coordinated initialization of the MPI, Cuda, or Threads devices from 'main'. */
-  Machine( int * argc , char *** argv );
-
-  ~Machine();
-
-  /** \brief  Communicator of this machine */
-  MPI_Comm mpi_comm() const { return m_mpi_comm ; }
-
-  /** \brief  Number of processes in the communicator */
-  int mpi_size() const { return m_mpi_size ; }
-
-  /** \brief  Rank of this process in the communicator */
-  int mpi_rank() const { return m_mpi_rank ; }
-
-  /** \brief  If using MPI that can directly operate on GPU memory */
-  bool mpi_gpu() const { return 0 != m_mpi_gpu ; }
-
-  /** \brief  If using GPU then what architecture */
-  unsigned gpu_arch() const { return m_gpu_arch ; }
-
-  /** \brief  Write the MPI and device configuration to the stream */
-  void print_configuration( std::ostream & ) const ;
-};
-
-}
-
-//------------------------------------------------------------------------
-
-#endif /* #ifndef PARALLELMACHINE_HPP */
-
-
--- a/lib/kokkos/example/multi_fem/SparseLinearSystem.hpp
+++ b/lib/kokkos/example/multi_fem/SparseLinearSystem.hpp
@ -1,400 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-// 
-//                        Kokkos v. 2.0
-//              Copyright (2014) Sandia Corporation
-// 
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
-// the U.S. Government retains certain rights in this software.
-// 
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
-// ************************************************************************
-//@HEADER
-*/
-
-#ifndef SPARSELINEARSYSTEM_HPP
-#define SPARSELINEARSYSTEM_HPP
-
-#include <cmath>
-#include <impl/Kokkos_Timer.hpp>
-
-#include <Kokkos_Core.hpp>
-#include <Kokkos_StaticCrsGraph.hpp>
-
-#include <LinAlgBLAS.hpp>
-
-namespace Kokkos {
-
-/** \brief  Sparse matrix in compressed row storage: a static graph
- *          plus one coefficient per graph entry.
- */
-template< typename ScalarType , class Device >
-struct CrsMatrix {
-
-  typedef Device      execution_space ;
-  typedef ScalarType  value_type ;
-
-  // Row map and column entries, both indexed with 'int'.
-  typedef StaticCrsGraph< int , execution_space , void , int >  graph_type ;
-
-  // Matrix values, addressed by graph entry.
-  typedef View< value_type* , execution_space >  coefficients_type ;
-
-  graph_type         graph ;
-  coefficients_type  coefficients ;
-};
-
-//----------------------------------------------------------------------------
-
-namespace Impl {
-
-template< class Matrix , class OutputVector , class InputVector >
-struct Multiply ;
-
-}
-}
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-namespace Kokkos {
-namespace Impl {
-
-/** \brief  Sparse matrix-vector product y = A * x, one matrix row per
- *          parallel work item.  Constructing the object runs the product.
- */
-template< typename AScalarType ,
-          typename VScalarType ,
-          class DeviceType >
-struct Multiply< CrsMatrix<AScalarType,DeviceType> ,
-                 View<VScalarType*,DeviceType > ,
-                 View<VScalarType*,DeviceType > >
-{
-  typedef DeviceType                           execution_space ;
-  typedef typename execution_space::size_type  size_type ;
-
-  typedef View<       VScalarType*, execution_space, MemoryUnmanaged >  vector_type ;
-  typedef View< const VScalarType*, execution_space, MemoryUnmanaged >  vector_const_type ;
-
-  typedef CrsMatrix< AScalarType , execution_space >  matrix_type ;
-
-private:
-
-  matrix_type        m_A ;
-  vector_const_type  m_x ;
-  vector_type        m_y ;
-
-public:
-
-  //--------------------------------------------------------------------------
-
-  /** \brief  y(iRow) = sum over the entries of row 'iRow' of
-   *          coefficient * x(column of the entry).
-   */
-  KOKKOS_INLINE_FUNCTION
-  void operator()( const size_type iRow ) const
-  {
-    const size_type row_begin = m_A.graph.row_map[iRow];
-    const size_type row_end   = m_A.graph.row_map[iRow+1];
-
-    // The row sum is accumulated in double whatever the scalar types are.
-    double sum = 0 ;
-
-    // Only the vectorization hints depend on the compiler, the loop does not.
-#if defined( __INTEL_COMPILER )
-#pragma simd reduction(+:sum)
-#pragma ivdep
-#endif
-    for ( size_type k = row_begin ; k < row_end ; ++k ) {
-      sum += m_A.coefficients(k) * m_x( m_A.graph.entries(k) );
-    }
-
-    m_y(iRow) = sum ;
-  }
-
-  /** \brief  Compute y = A * x over the first 'nrow' rows.
-   *          The column count argument is unused.
-   */
-  Multiply( const matrix_type & A ,
-            const size_type nrow ,
-            const size_type , // ncol ,
-            const vector_type & x ,
-            const vector_type & y )
-    : m_A( A ), m_x( x ), m_y( y )
-  {
-    parallel_for( nrow , *this );
-  }
-};
-
-//----------------------------------------------------------------------------
-
-} // namespace Impl
-} // namespace Kokkos
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-namespace Kokkos {
-
-//----------------------------------------------------------------------------
-
-/** \brief  Distributed matrix operator: gathers the off-processor entries
- *          of the input vector, then multiplies by the local matrix.
- */
-template< typename AScalarType ,
-          typename VScalarType ,
-          class Device >
-class Operator {
-  typedef CrsMatrix<AScalarType,Device>  matrix_type ;
-  typedef View<VScalarType*,Device>      vector_type ;
-
-private:
-  const matrix_type A ;
-
-  ParallelDataMap                                         data_map ;
-  AsyncExchange< VScalarType , Device , ParallelDataMap > exchange ;
-
-public:
-
-  /** \brief  The exchange is set up for one value per exchanged item. */
-  Operator( const ParallelDataMap & arg_data_map ,
-            const matrix_type     & arg_A )
-    : A( arg_A )
-    , data_map( arg_data_map )
-    , exchange( arg_data_map , 1 )
-    {}
-
-  /** \brief  y = A * x.
-   *
-   *  'x' must have room for the received entries after the owned ones;
-   *  they are written into 'x' before the multiply.
-   */
-  void apply( const vector_type & x ,
-              const vector_type & y )
-  {
-    typedef typename Device::size_type size_type ;
-
-    // Gather off-processor data for 'x':
-    // pack the entries to send, starting after the interior entries.
-
-    PackArray< vector_type >::pack( exchange.buffer() ,
-                                    data_map.count_interior ,
-                                    data_map.count_send , x );
-
-    exchange.setup();
-
-    // If interior & boundary matrices then could launch interior multiply
-
-    exchange.send_receive();
-
-    // Received entries are stored after the owned entries of 'x'.
-
-    UnpackArray< vector_type >::unpack( x , exchange.buffer() ,
-                                        data_map.count_owned ,
-                                        data_map.count_receive );
-
-    const size_type nrow = data_map.count_owned ;
-    const size_type ncol = data_map.count_owned +
-                           data_map.count_receive ;
-
-    // Constructing the temporary performs the multiply.
-    Impl::Multiply<matrix_type,vector_type,vector_type>( A, nrow, ncol, x, y);
-  }
-};
-
-//----------------------------------------------------------------------------
-
-/** \brief  Conjugate gradient iteration for A * x = b.
- *
- *  \param data_map           parallel distribution of the vectors
- *  \param A                  matrix
- *  \param b                  right hand side
- *  \param x                  initial guess on entry, updated in place
- *  \param iteration          [out] number of iterations performed
- *  \param normr              [out] square root of the last residual dot product
- *  \param iter_time          [out] wall clock seconds spent in the iteration loop
- *  \param maximum_iteration  iteration limit
- *  \param tolerance          iterate while 'tolerance < normr'
- */
-template< typename AScalarType , typename VScalarType , class Device >
-void cgsolve(
-  const ParallelDataMap                 data_map ,
-  const CrsMatrix<AScalarType,Device>   A ,
-  const View<VScalarType*,Device> b ,
-  const View<VScalarType*,Device> x ,
-  size_t & iteration ,
-  double & normr ,
-  double & iter_time ,
-  const size_t maximum_iteration = 200 ,
-  const double tolerance = std::numeric_limits<VScalarType>::epsilon() )
-{
-  typedef View<VScalarType*,Device>  vector_type ;
-  typedef std::pair<size_t,size_t>   range_type ;
-
-  const size_t count_owned = data_map.count_owned ;
-  const size_t count_total = data_map.count_owned + data_map.count_receive ;
-
-  Operator<AScalarType,VScalarType,Device> matrix_operator( data_map , A );
-
-  // The matvec input must hold owned + received entries;
-  // 'p' is the owned part of that storage.
-  vector_type pAll ( "cg::p" , count_total );
-
-  vector_type p = Kokkos::subview( pAll , range_type( 0 , count_owned ) );
-  vector_type r ( "cg::r" , count_owned );
-  vector_type Ap( "cg::Ap", count_owned );
-
-  // Initial residual: r = b - A * x , computed through 'p'.
-
-  deep_copy( p , x );                                   // p  = x
-  matrix_operator.apply( pAll , Ap );                   // Ap = A * p
-  waxpby( count_owned , 1.0 , b , -1.0 , Ap , r );      // r  = b - Ap
-  deep_copy( p , r );                                   // p  = r
-
-  double old_rdot = dot( count_owned , r , data_map.machine );
-
-  normr     = sqrt( old_rdot );
-  iteration = 0 ;
-
-  // Only the iteration loop is timed.
-  Kokkos::Impl::Timer wall_clock ;
-
-  while ( tolerance < normr && iteration < maximum_iteration ) {
-
-    matrix_operator.apply( pAll , Ap );                 // Ap = A * p
-
-    const double pAp_dot = dot( count_owned , p , Ap , data_map.machine );
-    const double alpha   = old_rdot / pAp_dot ;
-
-    axpy( count_owned ,  alpha , p  , x );              // x += alpha * p
-    axpy( count_owned , -alpha , Ap , r );              // r -= alpha * Ap
-
-    const double r_dot = dot( count_owned , r , data_map.machine );
-    const double beta  = r_dot / old_rdot ;
-
-    xpby( count_owned , r , beta , p );                 // p = r + beta * p
-
-    old_rdot = r_dot ;
-    normr    = sqrt( old_rdot );
-    ++iteration ;
-  }
-
-  iter_time = wall_clock.seconds();
-}
-
-//----------------------------------------------------------------------------
-
-} // namespace Kokkos
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-#if defined( KOKKOS_HAVE_CUDA )
-
-#if ( CUDA_VERSION < 6000 )
-#pragma message "cusparse_v2.h"
-#include <cusparse_v2.h>
-#else
-#pragma message "cusparse.h"
-#include <cusparse.h>
-#endif
-
-namespace Kokkos {
-namespace Impl {
-
-/** \brief  Holder of the cuSPARSE library handle and of the matrix
- *          descriptor (general matrix, zero based indices) used by the
- *          Cuda sparse matrix-vector multiply.
- */
-struct CudaSparseSingleton {
-  cusparseHandle_t   handle;
-  cusparseMatDescr_t descra;
-
-  /** \brief  Create the handle and the descriptor.
-   *
-   *  Throws std::runtime_error naming the failing call if any cuSPARSE
-   *  call does not return CUSPARSE_STATUS_SUCCESS, rather than leaving
-   *  an unusable handle or descriptor behind.
-   */
-  CudaSparseSingleton()
-  {
-    if ( CUSPARSE_STATUS_SUCCESS != cusparseCreate( & handle ) ) {
-      throw std::runtime_error( std::string("ERROR - cusparseCreate") );
-    }
-    if ( CUSPARSE_STATUS_SUCCESS != cusparseCreateMatDescr( & descra ) ) {
-      throw std::runtime_error( std::string("ERROR - cusparseCreateMatDescr") );
-    }
-    if ( CUSPARSE_STATUS_SUCCESS !=
-         cusparseSetMatType( descra , CUSPARSE_MATRIX_TYPE_GENERAL ) ) {
-      throw std::runtime_error( std::string("ERROR - cusparseSetMatType") );
-    }
-    if ( CUSPARSE_STATUS_SUCCESS !=
-         cusparseSetMatIndexBase( descra , CUSPARSE_INDEX_BASE_ZERO ) ) {
-      throw std::runtime_error( std::string("ERROR - cusparseSetMatIndexBase") );
-    }
-  }
-
-  /** \brief  Access to the shared instance; defined elsewhere. */
-  static CudaSparseSingleton & singleton();
-
-};
-
-/** \brief  Double precision y = A * x on Cuda, delegated to cuSPARSE.
- *          Constructing the object runs the product.
- */
-template<>
-struct Multiply< CrsMatrix<double,Cuda> ,
-                 View<double*,Cuda > ,
-                 View<double*,Cuda > >
-{
-  typedef Cuda                                        execution_space ;
-  typedef execution_space::size_type                  size_type ;
-  typedef double                                      scalar_type ;
-  typedef View< scalar_type* , execution_space >      vector_type ;
-  typedef CrsMatrix< scalar_type , execution_space >  matrix_type ;
-
-  /** \brief  Compute y = 1 * A * x + 0 * y.
-   *          Throws std::runtime_error if cuSPARSE reports a failure.
-   */
-  Multiply( const matrix_type & A ,
-            const size_type nrow ,
-            const size_type ncol ,
-            const vector_type & x ,
-            const vector_type & y )
-  {
-    CudaSparseSingleton & sparse = CudaSparseSingleton::singleton();
-
-    // Scaling factors of the product and of the previous content of 'y'.
-    const scalar_type alpha = 1 ;
-    const scalar_type beta  = 0 ;
-
-    const cusparseStatus_t status =
-      cusparseDcsrmv( sparse.handle ,
-                      CUSPARSE_OPERATION_NON_TRANSPOSE ,
-                      nrow ,
-                      ncol ,
-                      A.coefficients.dimension_0() , // number of stored entries
-                      &alpha ,
-                      sparse.descra ,
-                      A.coefficients.ptr_on_device() ,
-                      A.graph.row_map.ptr_on_device() ,
-                      A.graph.entries.ptr_on_device() ,
-                      x.ptr_on_device() ,
-                      &beta ,
-                      y.ptr_on_device() );
-
-    if ( status != CUSPARSE_STATUS_SUCCESS ) {
-      throw std::runtime_error( std::string("ERROR - cusparseDcsrmv " ) );
-    }
-  }
-};
-
-
-/** \brief  Single precision y = A * x on Cuda, delegated to cuSPARSE.
- *          Constructing the object runs the product.
- */
-template<>
-struct Multiply< CrsMatrix<float,Cuda> ,
-                 View<float*,Cuda > ,
-                 View<float*,Cuda > >
-{
-  typedef Cuda                                      execution_space ;
-  typedef execution_space::size_type                    size_type ;
-  typedef float                                     scalar_type ;
-  typedef View< scalar_type* , execution_space >        vector_type ;
-  typedef CrsMatrix< scalar_type , execution_space >    matrix_type ;
-
-public:
-
-  /** \brief  Compute y = 1 * A * x + 0 * y.
-   *          Throws std::runtime_error if cuSPARSE reports a failure.
-   */
-  Multiply( const matrix_type & A ,
-            const size_type nrow ,
-            const size_type ncol ,
-            const vector_type & x ,
-            const vector_type & y )
-  {
-    CudaSparseSingleton & s = CudaSparseSingleton::singleton();
-    const scalar_type alpha = 1 , beta = 0 ;
-
-    cusparseStatus_t status =
-      cusparseScsrmv( s.handle ,
-                      CUSPARSE_OPERATION_NON_TRANSPOSE ,
-                      nrow , ncol , A.coefficients.dimension_0() ,
-                      &alpha ,
-                      s.descra ,
-                      A.coefficients.ptr_on_device() ,
-                      A.graph.row_map.ptr_on_device() ,
-                      A.graph.entries.ptr_on_device() ,
-                      x.ptr_on_device() ,
-                      &beta ,
-                      y.ptr_on_device() );
-
-    if ( CUSPARSE_STATUS_SUCCESS != status ) {
-      // Name the single precision routine that was actually called.
-      throw std::runtime_error( std::string("ERROR - cusparseScsrmv " ) );
-    }
-  }
-};
-
-} /* namespace Impl */
-} /* namespace Kokkos */
-
-#endif /* #if defined( KOKKOS_HAVE_CUDA ) */
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-#endif /* #ifndef SPARSELINEARSYSTEM_HPP */
-
--- a/lib/kokkos/example/multi_fem/SparseLinearSystemFill.hpp
+++ b/lib/kokkos/example/multi_fem/SparseLinearSystemFill.hpp
@ -1,276 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-// 
-//                        Kokkos v. 2.0
-//              Copyright (2014) Sandia Corporation
-// 
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
-// the U.S. Government retains certain rights in this software.
-// 
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
-// ************************************************************************
-//@HEADER
-*/
-
-#ifndef SPARSELINEARSYSTEMFILL_HPP
-#define SPARSELINEARSYSTEMFILL_HPP
-
-#include <vector>
-#include <algorithm>
-#include <limits>
-
-#include <FEMesh.hpp>
-#include <SparseLinearSystem.hpp>
-
-//----------------------------------------------------------------------------
-
-namespace HybridFEM {
-
-// Primary template is only declared; the supported combination of matrix
-// and mesh types is provided by the partial specialization below.
-template< class MatrixType , class MeshType ,
-          class elem_matrices_type ,
-          class elem_vectors_type > struct GatherFill ;
-
-
-/** \brief  Gather per-element matrices and vectors into the global
- *          CRS coefficients and right-hand side, one matrix row per
- *          parallel work item.
- */
-template< typename ScalarType ,
-          class    DeviceType ,
-          unsigned ElemNode ,
-          typename CoordScalarType ,
-          class elem_matrices_type ,
-          class elem_vectors_type >
-struct GatherFill< 
-  Kokkos::CrsMatrix< ScalarType , DeviceType > ,
-  FEMesh< CoordScalarType , ElemNode , DeviceType > ,
-  elem_matrices_type , elem_vectors_type >
-{
-  typedef DeviceType     execution_space ;
-  typedef typename execution_space::size_type  size_type ;
-
-  static const size_type ElemNodeCount = ElemNode ;
-
-  typedef Kokkos::CrsMatrix< ScalarType , execution_space >    matrix_type ;
-  typedef typename matrix_type::coefficients_type   coefficients_type ;
-  typedef Kokkos::View< ScalarType[] , execution_space >  vector_type ;
-  typedef Kokkos::View< size_type[][ElemNodeCount][ElemNodeCount] , execution_space >       elem_graph_type ;
-
-  typedef FEMesh< CoordScalarType , ElemNodeCount , execution_space > mesh_type ;
-  typedef typename mesh_type::node_elem_ids_type node_elem_ids_type ;
-
-private:
-
-  node_elem_ids_type  node_elem_ids ;
-  elem_graph_type     elem_graph ;
-  elem_matrices_type  elem_matrices ;
-  elem_vectors_type   elem_vectors ;
-  coefficients_type   system_coeff ;
-  vector_type         system_rhs ;
-
-public:
-
-  /** \brief  Accumulate every element contribution that belongs to row 'irow'. */
-  KOKKOS_INLINE_FUNCTION
-  void operator()( size_type irow ) const
-  {
-    const size_type node_elem_begin = node_elem_ids.row_map[irow];
-    const size_type node_elem_end   = node_elem_ids.row_map[irow+1];
-
-    //  for each element that a node belongs to
-
-    for ( size_type i = node_elem_begin ; i < node_elem_end ; i++ ) {
-
-      // Entry columns: 0 = element id, 1 = index of this node within the element
-      const size_type elem_id   = node_elem_ids.entries( i, 0);
-      const size_type row_index = node_elem_ids.entries( i, 1);
-
-      system_rhs(irow) += elem_vectors(elem_id, row_index);
-
-      //  for each node in a particular related element
-      //  gather the contents of the element stiffness
-      //  matrix that belong in irow
-
-      for ( size_type j = 0 ; j < ElemNodeCount ; ++j ){
-        const size_type A_index = elem_graph( elem_id , row_index , j );
-
-        system_coeff( A_index ) += elem_matrices( elem_id, row_index, j );
-      }
-    }
-  }
-
-
-  /** \brief  Populate a functor from the arguments and run it over all
-   *          rows of 'matrix'.
-   *
-   *  The row count is the length of the matrix graph's row map minus one.
-   *  Nothing is done when the row map is empty.
-   */
-  static void apply( const matrix_type & matrix ,
-                     const vector_type & rhs ,
-                     const mesh_type   & mesh ,
-                     const elem_graph_type    & elem_graph ,
-                     const elem_matrices_type & elem_matrices ,
-                     const elem_vectors_type  & elem_vectors )
-  {
-    const size_t row_map_length = matrix.graph.row_map.dimension_0();
-
-    // Guard the unsigned subtraction below: with an empty row map it would
-    // wrap around and request an enormous parallel range.
-    if ( 0 == row_map_length ) { return ; }
-
-    const size_t row_count = row_map_length - 1 ;
-    GatherFill op ;
-    op.node_elem_ids = mesh.node_elem_ids ;
-    op.elem_graph    = elem_graph ;
-    op.elem_matrices = elem_matrices ;
-    op.elem_vectors  = elem_vectors ;
-    op.system_coeff  = matrix.coefficients ;
-    op.system_rhs    = rhs ;
-
-    parallel_for( row_count , op );
-  }
-};
-
-} /* namespace HybridFEM */
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-namespace HybridFEM {
-
-/** \brief  Build the node-to-node CRS graph of a mesh together with the
- *          map ( element , node_row , node_column ) -> CRS entry index.
- */
-template< class GraphType , class MeshType >
-struct GraphFactory {
-  typedef GraphType                         graph_type ;
-  typedef MeshType                          mesh_type ;
-  typedef typename graph_type::execution_space  execution_space ;
-  typedef typename execution_space::size_type   size_type  ;
-
-  static const unsigned ElemNodeCount = mesh_type::element_node_count ;
-
-  typedef Kokkos::View< size_type[][ElemNodeCount][ElemNodeCount] , execution_space >         element_map_type ;
-
-  /** \brief  Create 'graph' and 'elem_map' from 'mesh'.
-   *
-   *  \param mesh      source of the node->element and element->node relations
-   *  \param graph     output: node-to-node graph for the owned nodes
-   *  \param elem_map  output: for each ( element , i , j ) the index of the
-   *                   graph entry for ( node i , node j ); rows whose node is
-   *                   not owned are filled with the maximum size_type value
-   *
-   *  \throws std::runtime_error if a column node cannot be found in the
-   *          sorted column list of its row.
-   */
-  static
-  void
-  create( const mesh_type & mesh ,
-          graph_type & graph ,
-          element_map_type & elem_map )
-  {
-    typename mesh_type::node_elem_ids_type::HostMirror
-      node_elem_ids = create_mirror( mesh.node_elem_ids );
-
-    typename mesh_type::elem_node_ids_type::HostMirror
-      elem_node_ids = create_mirror( mesh.elem_node_ids );
-
-    typedef typename element_map_type::HostMirror element_map_host_type ;
-
-    deep_copy( elem_node_ids , mesh.elem_node_ids );
-    deep_copy( node_elem_ids.entries , mesh.node_elem_ids.entries );
-
-    const size_t owned_node = mesh.parallel_data_map.count_owned ;
-    const size_t total_elem = mesh.elem_node_ids.dimension_0();
-
-    if ( total_elem ) {
-      elem_map = element_map_type( std::string("element_map"), total_elem );
-    }
-
-    element_map_host_type elem_map_host = create_mirror( elem_map );
-
-    //------------------------------------
-    //  Node->node mapping for the CrsMatrix graph
-
-    std::vector< std::vector< unsigned > > node_node_ids( owned_node );
-    std::vector< unsigned > node_node_begin( owned_node );
-
-    size_t offset = 0 ;
-    for ( size_t i = 0 ; i < owned_node ; ++i ) {
-      const size_t j_end = node_elem_ids.row_map[i+1];
-            size_t j     = node_elem_ids.row_map[i];
-
-      // Offset of this row's first entry in the flattened graph
-      node_node_begin[i] = offset ;
-
-      std::vector< unsigned > & work = node_node_ids[i] ;
-
-      // Collect every node of every element that touches node i ...
-      for ( ; j < j_end ; ++j ) {
-        const size_t elem_id = node_elem_ids.entries(j,0);
-        for ( size_t k = 0 ; k < ElemNodeCount ; ++k ) {
-          work.push_back( elem_node_ids( elem_id , k ) );
-        }
-      }
-
-      // ... then reduce to a sorted list of unique column nodes.
-      std::sort( work.begin() , work.end() );
-
-      work.erase( std::unique( work.begin() , work.end() ) , work.end() );
-
-      offset += work.size();
-    }
-
-    graph = Kokkos::create_staticcrsgraph< graph_type >( "node_node_ids" , node_node_ids );
-
-    //------------------------------------
-    // ( element , node_row , node_column ) -> matrix_crs_column
-
-    for ( size_t elem_id = 0 ; elem_id < total_elem ; ++elem_id ) {
-      for ( size_t i = 0 ; i < ElemNodeCount ; ++i ) {
-
-        const size_t node_row = elem_node_ids( elem_id , i );
-
-        if ( owned_node <= node_row ) {
-          // Row is not owned: there is no graph row for it, mark as invalid.
-          for ( unsigned j = 0 ; j < ElemNodeCount ; ++j ) {
-            elem_map_host( elem_id , i , j ) = std::numeric_limits<size_type>::max();
-          }
-          continue ;
-        }
-
-        // Only index the per-row tables once the row is known to be owned;
-        // they hold exactly 'owned_node' entries.
-        const size_t node_row_begin = node_node_begin[ node_row ];
-        const std::vector< unsigned > & column = node_node_ids[ node_row ] ;
-
-        for ( unsigned j = 0 ; j < ElemNodeCount ; ++j ) {
-          const size_type node_col = elem_node_ids( elem_id , j );
-
-          // 'column' is sorted and unique, so a lower-bound search finds
-          // the position of 'node_col' if it is present.
-          const std::vector< unsigned >::const_iterator found =
-            std::lower_bound( column.begin() , column.end() , node_col );
-
-          // Check for one-past-the-end before dereferencing.
-          if ( found == column.end() || node_col != *found ) {
-            throw std::runtime_error(std::string("Failed"));
-          }
-
-          elem_map_host( elem_id , i , j ) =
-            size_t( found - column.begin() ) + node_row_begin ;
-        }
-      }
-    }
-
-    deep_copy( elem_map , elem_map_host );
-  }
-};
-
-} // namespace HybridFEM
-
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-#endif /* #ifndef SPARSELINEARSYSTEMFILL_HPP */
-
--- a/lib/kokkos/example/multi_fem/SparseLinearSystem_Cuda.hpp
+++ b/lib/kokkos/example/multi_fem/SparseLinearSystem_Cuda.hpp
@ -1,164 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-// 
-//                        Kokkos v. 2.0
-//              Copyright (2014) Sandia Corporation
-// 
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
-// the U.S. Government retains certain rights in this software.
-// 
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
-// ************************************************************************
-//@HEADER
-*/
-
-#ifndef SPARSELINEARSYSTEM_CUDA_HPP
-#define SPARSELINEARSYSTEM_CUDA_HPP
-
-#if defined( BUILD_FROM_CU_FILE )
-
-#include <cusparse_v2.h>
-#include <Kokkos_Core.hpp>
-
-namespace Kokkos {
-namespace Impl {
-
-
-/** \brief  Holder of the cuSPARSE handle and matrix descriptor that the
- *          Multiply specializations in this header share.
- *
- *  Access the single instance through singleton().
- */
-struct CudaSparseSingleton {
-  cusparseHandle_t   handle;
-  cusparseMatDescr_t descra;
-
-  // Creates the handle and a descriptor for a general matrix with
-  // zero-based indices.
-  // NOTE(review): the status codes returned by these cuSPARSE calls are
-  // not inspected here.
-  CudaSparseSingleton()
-  {
-    cusparseCreate( & handle );
-    cusparseCreateMatDescr( & descra );
-    cusparseSetMatType(       descra , CUSPARSE_MATRIX_TYPE_GENERAL );
-    cusparseSetMatIndexBase(  descra , CUSPARSE_INDEX_BASE_ZERO );
-  }
-
-  static CudaSparseSingleton & singleton();
-
-};
-
-// This out-of-class definition lives in a header, so it is declared
-// 'inline' to avoid multiple-definition link errors when the header is
-// included from more than one translation unit.  The function-local static
-// is constructed on first use.
-inline
-CudaSparseSingleton & CudaSparseSingleton::singleton()
-{ static CudaSparseSingleton s ; return s ; }
-
-
-/** \brief  Sparse matrix-vector multiply for a double precision
- *          CrsMatrix on the Cuda execution space, delegated to cuSPARSE.
- */
-template<>
-struct Multiply< CrsMatrix<double,Cuda> ,
-                 View<double*,Cuda > ,
-                 View<double*,Cuda > >
-{
-  typedef Cuda                                       execution_space ;
-  typedef double                                     scalar_type ;
-  typedef execution_space::size_type                 size_type ;
-  typedef CrsMatrix< scalar_type , execution_space > matrix_type ;
-  typedef View< scalar_type* , execution_space >     vector_type ;
-
-public:
-
-  /** \brief  Call cusparseDcsrmv on A, x and y with alpha = 1 and beta = 0.
-   *
-   *  \throws std::runtime_error when cuSPARSE does not report success.
-   */
-  Multiply( const matrix_type & A ,
-            const size_type nrow ,
-            const size_type ncol ,
-            const vector_type & x ,
-            const vector_type & y )
-  {
-    // Scaling factors handed to cuSPARSE by address.
-    const scalar_type alpha = 1 ;
-    const scalar_type beta  = 0 ;
-
-    CudaSparseSingleton & sparse = CudaSparseSingleton::singleton();
-
-    const cusparseStatus_t rc =
-      cusparseDcsrmv( sparse.handle , CUSPARSE_OPERATION_NON_TRANSPOSE ,
-                      nrow , ncol , A.coefficients.dimension_0() ,
-                      &alpha , sparse.descra ,
-                      A.coefficients.ptr_on_device() ,
-                      A.graph.row_map.ptr_on_device() ,
-                      A.graph.entries.ptr_on_device() ,
-                      x.ptr_on_device() ,
-                      &beta , y.ptr_on_device() );
-
-    if ( rc == CUSPARSE_STATUS_SUCCESS ) { return ; }
-
-    throw std::runtime_error( std::string("ERROR - cusparseDcsrmv " ) );
-  }
-};
-
-
-/** \brief  Sparse matrix-vector multiply for a single precision
- *          CrsMatrix on the Cuda execution space.
- *
- *  Constructing the object performs the multiply by calling cuSPARSE's
- *  cusparseScsrmv with alpha = 1 and beta = 0.
- */
-template<>
-struct Multiply< CrsMatrix<float,Cuda> ,
-                 View<float*,Cuda > ,
-                 View<float*,Cuda > >
-{
-  typedef Cuda                                      execution_space ;
-  typedef execution_space::size_type                    size_type ;
-  typedef float                                     scalar_type ;
-  typedef View< scalar_type* , execution_space >        vector_type ;
-  typedef CrsMatrix< scalar_type , execution_space >    matrix_type ;
-
-public:
-
-  /** \brief  Run the multiply.
-   *
-   *  \param A     matrix whose coefficients and graph are handed to cuSPARSE
-   *  \param nrow  row count passed to cusparseScsrmv
-   *  \param ncol  column count passed to cusparseScsrmv
-   *  \param x     input vector (device pointer is passed through)
-   *  \param y     output vector (device pointer is passed through)
-   *
-   *  \throws std::runtime_error if cuSPARSE does not report success.
-   */
-  Multiply( const matrix_type & A ,
-            const size_type nrow ,
-            const size_type ncol ,
-            const vector_type & x ,
-            const vector_type & y )
-  {
-    CudaSparseSingleton & s = CudaSparseSingleton::singleton();
-    const scalar_type alpha = 1 , beta = 0 ;
-
-    // The number of stored coefficients is passed as the non-zero count.
-    cusparseStatus_t status =
-      cusparseScsrmv( s.handle ,
-                      CUSPARSE_OPERATION_NON_TRANSPOSE ,
-                      nrow , ncol , A.coefficients.dimension_0() ,
-                      &alpha ,
-                      s.descra ,
-                      A.coefficients.ptr_on_device() ,
-                      A.graph.row_map.ptr_on_device() ,
-                      A.graph.entries.ptr_on_device() ,
-                      x.ptr_on_device() ,
-                      &beta ,
-                      y.ptr_on_device() );
-
-    if ( CUSPARSE_STATUS_SUCCESS != status ) {
-      // Report the routine that was actually called (single precision).
-      throw std::runtime_error( std::string("ERROR - cusparseScsrmv " ) );
-    }
-  }
-};
-
-} /* namespace Impl */
-} /* namespace Kokkos */
-
-#endif /* #if defined( BUILD_FROM_CU_FILE ) */
-#endif /* #ifndef SPARSELINEARSYSTEM_CUDA_HPP */
-
--- a/lib/kokkos/example/multi_fem/TestBoxMeshFixture.hpp
+++ b/lib/kokkos/example/multi_fem/TestBoxMeshFixture.hpp
@ -1,242 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-// 
-//                        Kokkos v. 2.0
-//              Copyright (2014) Sandia Corporation
-// 
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
-// the U.S. Government retains certain rights in this software.
-// 
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
-// ************************************************************************
-//@HEADER
-*/
-
-#ifndef TESTFEMESHBOXFIXTURE_HPP
-#define TESTFEMESHBOXFIXTURE_HPP
-
-#include <stdio.h>
-#include <iostream>
-#include <stdexcept>
-#include <limits>
-#include <utility>
-#include <BoxMeshFixture.hpp>
-
-#include <ParallelComm.hpp>
-
-//----------------------------------------------------------------------------
-
-namespace TestFEMesh {
-
-// Primary template is only declared; the supported view shape is handled
-// by the partial specialization below.
-template< class ViewType >
-struct VerifyUnpack ;
-
-/** \brief  Reduction functor that counts nodes whose three coordinates in
- *          'node_coords' differ from the values stored in a flat buffer.
- */
-template< typename DeviceType, typename T >
-struct VerifyUnpack< Kokkos::View< T*[3] , DeviceType > >
-{
-  typedef DeviceType                            execution_space ;
-  typedef typename execution_space::size_type   size_type ;
-  typedef size_type                             value_type ;
-
-  typedef Kokkos::View< T* ,    execution_space > buffer_type ;
-  typedef Kokkos::View< T*[3] , execution_space > array_type ;
-
-private:
-
-  array_type  node_coords ;
-  buffer_type buffer ;
-  size_type   node_begin ;
-
-public:
-
-  // Reduction identity: no mismatches seen yet.
-  KOKKOS_INLINE_FUNCTION
-  static void init( value_type & update )
-  { update = 0 ; }
-
-  // Combine two partial mismatch counts.
-  KOKKOS_INLINE_FUNCTION
-  static void join( volatile value_type & update ,
-                    const volatile value_type & source )
-  { update += source ; }
-
-  // Compare buffer entries [3*i , 3*i+2] with node (i + node_begin);
-  // print a diagnostic and bump the count when they differ.
-  KOKKOS_INLINE_FUNCTION
-  void operator()( const size_type i , value_type & update ) const
-  {
-    const size_type base = i * 3 ;
-    const size_type node = i + node_begin ;
-
-    const long got_x = buffer[base];
-    const long got_y = buffer[base+1];
-    const long got_z = buffer[base+2];
-
-    const long want_x = node_coords(node,0);
-    const long want_y = node_coords(node,1);
-    const long want_z = node_coords(node,2);
-
-    const bool matches =
-      ( got_x == want_x ) && ( got_y == want_y ) && ( got_z == want_z );
-
-    if ( matches ) { return ; }
-
-    printf("TestFEMesh::VerifyUnpack failed at %d : node %d : { %ld %ld %ld } != { %ld %ld %ld }\n",
-           (int)i,(int)node, got_x,got_y,got_z, want_x, want_y, want_z );
-    ++update ;
-  }
-
-  // Run the comparison over 'arg_node_count' nodes starting at
-  // 'arg_node_begin' and return the number of mismatching nodes.
-  static inline
-  size_type unpack( const array_type  & arg_node_coords ,
-                    const size_type     arg_node_begin ,
-                    const size_type     arg_node_count ,
-                    const buffer_type & arg_buffer )
-  {
-    VerifyUnpack functor ;
-    functor.node_coords = arg_node_coords ;
-    functor.buffer      = arg_buffer ;
-    functor.node_begin  = arg_node_begin ;
-
-    size_type mismatches = 0 ;
-    Kokkos::parallel_reduce( arg_node_count , functor , mismatches );
-    return mismatches ;
-  }
-};
-
-}
-
-//----------------------------------------------------------------------------
-
-#ifdef KOKKOS_HAVE_MPI
-
-namespace TestFEMesh {
-
-// MPI build: exchange node coordinates through the mesh's parallel data
-// map and check that the received values match the coordinates already
-// stored for the received nodes.  Rank 0 prints a PASSED / FAILED summary
-// based on the globally summed mismatch count.
-template< typename coordinate_scalar_type ,
-          unsigned ElemNodeCount ,
-          class Device >
-void verify_parallel(
-  const HybridFEM::FEMesh< coordinate_scalar_type ,
-                           ElemNodeCount ,
-                           Device > & mesh )
-{
-  typedef HybridFEM::FEMesh< coordinate_scalar_type, ElemNodeCount, Device > femesh_type ;
-  typedef typename femesh_type::node_coords_type node_coords_type ;
-
-  comm::Machine machine = mesh.parallel_data_map.machine ;
-
-  // Communicate node coordinates to verify communication and setup.
-
-  // Three values (one per coordinate) are exchanged for each node.
-  const size_t chunk_size = 3 ;
-
-  Kokkos::AsyncExchange< coordinate_scalar_type, Device, Kokkos::ParallelDataMap >
-    exchange( mesh.parallel_data_map , chunk_size );
-
-  // Nodes to send start after the interior nodes.
-  const size_t send_begin = mesh.parallel_data_map.count_interior ;
-  const size_t send_count = mesh.parallel_data_map.count_send ;
-
-  // Received nodes start after the owned nodes.
-  const size_t recv_begin = mesh.parallel_data_map.count_owned ;
-  const size_t recv_count = mesh.parallel_data_map.count_receive ;
-
-  typedef Kokkos::PackArray< node_coords_type > pack_type ;
-
-  pack_type::pack( exchange.buffer(), send_begin, send_count, mesh.node_coords );
-
-  exchange.setup();
-
-  // Launch local-action device kernels
-
-  exchange.send_receive();
-
-  // Per-process tallies: owned nodes, received nodes, mismatching nodes.
-  unsigned long local[3] ;
-  local[0] = mesh.parallel_data_map.count_owned ;
-  local[1] = mesh.parallel_data_map.count_receive ;
-  local[2] = TestFEMesh::VerifyUnpack< node_coords_type >::unpack( mesh.node_coords, recv_begin, recv_count, exchange.buffer() );
-
-  unsigned long global[3] = { 0 , 0 , 0 };
-
-  // Sum the three tallies over all processes.
-  MPI_Allreduce( local , global ,
-                 3 , MPI_UNSIGNED_LONG , MPI_SUM , machine.mpi_comm );
-
-  // Only rank 0 reports; any mismatch anywhere means FAILED.
-  if ( 0 == comm::rank( machine ) ) {
-    std::cout << ( global[2] ? "FAILED" : "PASSED" )
-              << ": TestFEMesh::verify_parallel "
-              << "NP(" << comm::size( machine )
-              << ") total_node(" << global[0]
-              << ") verified_nodes(" << global[1]
-              << ") failed_nodes(" << global[2]
-              << ")" << std::endl ;
-  }
-}
-
-} // namespace TestFEMesh
-
-#else /* ! #ifdef KOKKOS_HAVE_MPI */
-
-namespace TestFEMesh {
-
-// Build without KOKKOS_HAVE_MPI: same signature as the MPI version, but
-// the mesh argument is ignored and nothing is verified.
-template< typename coordinate_scalar_type ,
-          unsigned ElemNodeCount ,
-          class Device >
-void verify_parallel(
-  const HybridFEM::FEMesh< coordinate_scalar_type ,
-                           ElemNodeCount ,
-                           Device > & )
-{}
-
-} // namespace TestFEMesh
-
-#endif /* ! #ifdef KOKKOS_HAVE_MPI */
-
-//----------------------------------------------------------------------------
-
-// Create a Hex8 box mesh fixture with 'long' coordinates for this process,
-// attach 'machine' to its parallel data map, and run
-// TestFEMesh::verify_parallel on it.
-template< class Device >
-void test_box_fixture( comm::Machine machine ,
-                       const size_t gang_count ,
-                       const size_t nodes_nx ,
-                       const size_t nodes_ny ,
-                       const size_t nodes_nz )
-{
-  typedef BoxMeshFixture< long , Device , FixtureElementHex8 > box_fixture ;
-  typedef typename box_fixture::FEMeshType                     femesh ;
-
-  const size_t rank_count = comm::size( machine );
-  const size_t rank_local = comm::rank( machine );
-
-  // The fixture is given one less than each node count.
-  const size_t nx_m1 = nodes_nx - 1 ;
-  const size_t ny_m1 = nodes_ny - 1 ;
-  const size_t nz_m1 = nodes_nz - 1 ;
-
-  femesh mesh = box_fixture::create( rank_count , rank_local , gang_count ,
-                                     nx_m1 , ny_m1 , nz_m1 );
-
-  mesh.parallel_data_map.machine = machine ;
-
-  TestFEMesh::verify_parallel( mesh );
-}
-
-#endif /* #ifndef TESTFEMESHBOXFIXTURE_HPP */
-
-
--- a/lib/kokkos/example/multi_fem/TestBoxMeshPartition.cpp
+++ b/lib/kokkos/example/multi_fem/TestBoxMeshPartition.cpp
@ -1,172 +0,0 @@
-//@HEADER
-// ************************************************************************
-// 
-//                        Kokkos v. 2.0
-//              Copyright (2014) Sandia Corporation
-// 
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
-// the U.S. Government retains certain rights in this software.
-// 
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
-// ************************************************************************
-//@HEADER
-
-#include <iostream>
-#include <stdexcept>
-#include <limits>
-#include <utility>
-#include <BoxMeshPartition.hpp>
-
-//----------------------------------------------------------------------------
-
-// Partition a 100 x 200 x 300 box into 'np' parts for a series of part
-// counts and check that the communication maps produced for sampled parts
-// are mutually consistent.
-//
-//   print : when true, write the per-partition statistics and the layout
-//           of each sampled part to std::cout.
-//
-// Throws std::runtime_error on the first inconsistency found.
-void test_box_partition( bool print )
-{
-  const size_t np_max = 10000 ;
-
-  const BoxBoundsLinear use_box ;
-
-  BoxType root_box ;
-
-  // [axis][0] = lower bound, [axis][1] = upper bound
-  root_box[0][0] = 0 ; root_box[0][1] = 100 ;
-  root_box[1][0] = 0 ; root_box[1][1] = 200 ;
-  root_box[2][0] = 0 ; root_box[2][1] = 300 ;
-
-  const size_t cell_total =
-    ( root_box[0][1] - root_box[0][0] ) *
-    ( root_box[1][1] - root_box[1][0] ) *
-    ( root_box[2][1] - root_box[2][0] );
-
-  // Part counts visited: 2, 6, 14, 30, ... ( np -> 2 * ( np + 1 ) )
-  for ( size_t np = 2 ; np < np_max ; np = 2 * ( np + 1 ) ) {
-
-    std::vector<BoxType> part_boxes( np );
-
-    box_partition_rcb( root_box , part_boxes );
-
-    // Ideal cells per part (rounded up) versus the largest part produced.
-    size_t cell_goal = ( cell_total + np - 1 ) / np ;
-    size_t cell_max = 0 ;
-
-    for ( size_t i = 0 ; i < np ; ++i ) {
-      cell_max = std::max( cell_max , count( part_boxes[i] ) );
-    }
-
-    if ( print ) {
-      std::cout << std::endl
-                << "box_part( " << np 
-                << " ) max( " << cell_max
-                << " ) goal( " << cell_goal
-                << " ) ratio( " << double(cell_max) / double(cell_goal)
-                << " )" << std::endl ;
-    }
-
-    // Check at most four evenly strided parts rather than all of them.
-    const size_t nsample = std::min(np,(size_t)4);
-    const size_t stride = ( np + nsample - 1 ) / nsample ;
-
-    for ( size_t my_part = 0 ; my_part < np ; my_part += stride ) {
-      BoxType             my_use_box ;
-      std::vector<size_t> my_use_id_map ;
-      size_t              my_count_interior ;
-      size_t              my_count_owned ;
-      size_t              my_count_uses ;
-      std::vector<size_t> my_recv_counts ;
-      std::vector<std::vector<size_t> > my_send_map ;
-
-      size_t count_verify = 0 ;
-
-      box_partition_maps( root_box , part_boxes ,
-                          use_box , my_part ,
-                          my_use_box , my_use_id_map ,
-                          my_count_interior ,
-                          my_count_owned ,
-                          my_count_uses ,
-                          my_recv_counts ,
-                          my_send_map );
-
-      // Owned count plus all receive counts must add up to the use count.
-      count_verify = my_count_owned ;
-
-      if ( print ) {
-        std::cout << "  my_part(" << my_part << ") layout { "
-                  << "P" << my_part
-                  << "(" << my_count_interior
-                  << "," << ( my_count_owned - my_count_interior )
-                  << ")" ;
-      }
-
-      for ( size_t i = 1 ; i < np ; ++i ) {
-        if ( my_recv_counts[i] ) {
-          count_verify += my_recv_counts[i] ;
-          // Receive slot i corresponds to the part i steps after my_part (cyclic).
-          const size_t ip = ( my_part + i ) % np ;
-
-          if ( print ) {
-            std::cout << " P" << ip << "(" << my_recv_counts[i] << ")" ;
-          }
-
-          // Compare recv & send lists
-
-          BoxType             ip_use_box ;
-          std::vector<size_t> ip_use_id_map ;
-          size_t              ip_count_interior ;
-          size_t              ip_count_owned ;
-          size_t              ip_count_uses ;
-          std::vector<size_t> ip_recv_counts ;
-          std::vector<std::vector<size_t> > ip_send_map ;
-
-          box_partition_maps( root_box , part_boxes ,
-                              use_box , ip ,
-                              ip_use_box , ip_use_id_map ,
-                              ip_count_interior ,
-                              ip_count_owned ,
-                              ip_count_uses ,
-                              ip_recv_counts ,
-                              ip_send_map );
-
-          // Sent by ip, received by my_part:
-
-          const BoxType recv_send = intersect( part_boxes[ip] , my_use_box );
-          const size_t recv_send_count = count( recv_send );
-
-          // Slot of my_part as seen from ip (cyclic offset from ip to my_part).
-          const size_t j = ( my_part + np - ip ) % np ;
-
-          if ( recv_send_count != my_recv_counts[i] ||
-               recv_send_count != ip_send_map[j].size() ) {
-            throw std::runtime_error( std::string("bad recv/send map") );
-          }
-        }
-      }
-      if ( print ) { std::cout << " }" << std::endl ; }
-
-      if ( count_verify != my_count_uses ) {
-        throw std::runtime_error( std::string("bad partition map") );
-      }
-    }
-  }
-}
-
-
--- a/lib/kokkos/example/multi_fem/TestCuda.cpp
+++ b/lib/kokkos/example/multi_fem/TestCuda.cpp
@ -1,192 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-// 
-//                        Kokkos v. 2.0
-//              Copyright (2014) Sandia Corporation
-// 
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
-// the U.S. Government retains certain rights in this software.
-// 
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
-// ************************************************************************
-//@HEADER
-*/
-
-#include <Kokkos_Core.hpp>
-
-#include <TestBoxMeshFixture.hpp>
-#include <Implicit.hpp>
-#include <Nonlinear.hpp>
-#include <Explicit.hpp>
-
-#include <SparseLinearSystem.hpp>
-
-#if defined( KOKKOS_HAVE_CUDA )
-
-//----------------------------------------------------------------------------
-
-namespace Kokkos {
-namespace Impl {
-
-// Accessor for the one CudaSparseSingleton object.  The object is a
-// function-local static, so it is constructed the first time this
-// accessor is called and the same instance is returned on every call.
-CudaSparseSingleton & CudaSparseSingleton::singleton()
-{
-  static CudaSparseSingleton instance ;
-  return instance ;
-}
-
-} // namespace Impl
-} // namespace Kokkos
-
-//----------------------------------------------------------------------------
-
-// Print one line, prefixed with this process' rank, reporting the value
-// returned by Kokkos::Cuda::detect_device_count().
-void test_cuda_query( comm::Machine machine )
-{
-  const size_t rank         = comm::rank( machine );
-  const size_t device_count = Kokkos::Cuda::detect_device_count();
-
-  std::cout << "P" << rank << ": Cuda device_count = " << device_count << std::endl ;
-}
-
-//----------------------------------------------------------------------------
-
-namespace {
-
-// Gang count handed to every driver launched from this file.
-const size_t cuda_gang_count = 0 ;
-
-// Initialize the host execution space and then the Cuda device selected
-// for this process.
-//
-// Device selection: when at least one device is detected and there are no
-// more devices than processes, the device index is comm_rank % dev_count;
-// in every other case device 0 is selected.
-// NOTE(review): when devices outnumber processes every process is sent to
-// device 0 - confirm that this is intended.
-void initialize_host_and_cuda( comm::Machine machine )
-{
-  const size_t comm_rank = comm::rank( machine );
-  const size_t comm_size = comm::size( machine );
-  const size_t dev_count = Kokkos::Cuda::detect_device_count();
-  const size_t dev_rank =
-    dev_count && dev_count <= comm_size ? comm_rank % dev_count : 0 ;
-
-  Kokkos::HostSpace::execution_space::initialize();
-  Kokkos::Cuda::SelectDevice select_device( dev_rank );
-  Kokkos::Cuda::initialize( select_device );
-}
-
-// Finalize in the reverse order of initialize_host_and_cuda():
-// Cuda first, host execution space second.
-void finalize_cuda_and_host()
-{
-  Kokkos::Cuda::finalize();
-  Kokkos::HostSpace::execution_space::finalize();
-}
-
-} // namespace
-
-//----------------------------------------------------------------------------
-
-// Run the box-mesh fixture test on Cuda, forwarding nx, ny and nz.
-void test_cuda_fixture( comm::Machine machine ,
-                        size_t nx , size_t ny , size_t nz )
-{
-  initialize_host_and_cuda( machine );
-  test_box_fixture<Kokkos::Cuda>( machine , cuda_gang_count , nx , ny , nz );
-  finalize_cuda_and_host();
-}
-
-//----------------------------------------------------------------------------
-
-// Run the implicit driver on Cuda with double precision scalars.
-// elem_count_begin, elem_count_end and count_run are forwarded unchanged.
-void test_cuda_implicit( comm::Machine machine , 
-                         size_t elem_count_begin ,
-                         size_t elem_count_end ,
-                         size_t count_run )
-{
-  initialize_host_and_cuda( machine );
-  HybridFEM::Implicit::driver<double,Kokkos::Cuda>( "Cuda" , machine , cuda_gang_count , elem_count_begin , elem_count_end , count_run );
-  finalize_cuda_and_host();
-}
-
-//----------------------------------------------------------------------------
-
-// Run the explicit driver on Cuda with double precision scalars.
-// elem_count_begin, elem_count_end and count_run are forwarded unchanged.
-void test_cuda_explicit( comm::Machine machine , 
-                         size_t elem_count_begin ,
-                         size_t elem_count_end ,
-                         size_t count_run )
-{
-  initialize_host_and_cuda( machine );
-  Explicit::driver<double,Kokkos::Cuda>( "Cuda" , machine , cuda_gang_count , elem_count_begin , elem_count_end , count_run );
-  finalize_cuda_and_host();
-}
-
-//----------------------------------------------------------------------------
-
-// Run the nonlinear driver on Cuda using the FixtureElementHex8 element.
-void test_cuda_nonlinear( comm::Machine machine , 
-                          size_t elem_count_begin ,
-                          size_t elem_count_end ,
-                          size_t count_run )
-{
-  typedef Kokkos::Cuda device ;
-  typedef FixtureElementHex8 hex8 ;
-
-  initialize_host_and_cuda( machine );
-  HybridFEM::Nonlinear::driver<double,device,hex8>( "Cuda" , machine , cuda_gang_count , elem_count_begin , elem_count_end , count_run );
-  finalize_cuda_and_host();
-}
-
-// Run the nonlinear driver on Cuda using the FixtureElementHex27 element.
-void test_cuda_nonlinear_quadratic( comm::Machine machine , 
-                                    size_t elem_count_begin ,
-                                    size_t elem_count_end ,
-                                    size_t count_run )
-{
-  typedef Kokkos::Cuda device ;
-  typedef FixtureElementHex27 hex27 ;
-
-  initialize_host_and_cuda( machine );
-  HybridFEM::Nonlinear::driver<double,device,hex27>( "Cuda" , machine , cuda_gang_count , elem_count_begin , elem_count_end , count_run );
-  finalize_cuda_and_host();
-}
-
-//----------------------------------------------------------------------------
-
-#endif  /* #if defined( KOKKOS_HAVE_CUDA ) */
-
--- a/lib/kokkos/example/multi_fem/TestHost.cpp
+++ b/lib/kokkos/example/multi_fem/TestHost.cpp
@ -1,137 +0,0 @@
-//@HEADER
-// ************************************************************************
-// 
-//                        Kokkos v. 2.0
-//              Copyright (2014) Sandia Corporation
-// 
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
-// the U.S. Government retains certain rights in this software.
-// 
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
-// ************************************************************************
-//@HEADER
-
-// Must be included first on Intel-Phi systems due to
-// redefinition of SEEK_SET in <mpi.h>.
-
-#include <ParallelComm.hpp>
-
-#include <iostream>
-#include <stdexcept>
-#include <limits>
-#include <utility>
-
-//----------------------------------------------------------------------------
-
-#include <Kokkos_Core.hpp>
-
-#include <BoxMeshFixture.hpp>
-#include <TestBoxMeshFixture.hpp>
-#include <Implicit.hpp>
-#include <Nonlinear.hpp>
-#include <Explicit.hpp>
-#include <SparseLinearSystem.hpp>
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-// Initialize the host execution space with gang_count * gang_worker_count,
-// run the box-mesh fixture test (forwarding nx, ny, nz), then finalize.
-void test_host_fixture( comm::Machine machine ,
-                        size_t gang_count ,
-                        size_t gang_worker_count ,
-                        size_t nx , size_t ny , size_t nz )
-{
-  typedef Kokkos::HostSpace::execution_space host_space ;
-
-  const size_t init_arg = gang_count * gang_worker_count ;
-
-  host_space::initialize( init_arg );
-  test_box_fixture< host_space >( machine , gang_count , nx , ny , nz );
-  host_space::finalize();
-}
-
-//----------------------------------------------------------------------------
-
-// Initialize the host execution space with gang_count * gang_worker_count,
-// run the implicit driver (double precision, label "Threads"), then finalize.
-void test_host_implicit( comm::Machine machine ,
-                         size_t gang_count ,
-                         size_t gang_worker_count ,
-                         size_t elem_count_begin ,
-                         size_t elem_count_end ,
-                         size_t count_run )
-{
-  typedef Kokkos::HostSpace::execution_space host_space ;
-
-  const size_t init_arg = gang_count * gang_worker_count ;
-
-  host_space::initialize( init_arg );
-  HybridFEM::Implicit::driver< double , host_space >(
-    "Threads" , machine , gang_count ,
-    elem_count_begin , elem_count_end , count_run );
-  host_space::finalize();
-}
-
-//----------------------------------------------------------------------------
-
-// Initialize the host execution space with gang_count * gang_worker_count,
-// run the explicit driver (double precision, label "Threads"), then finalize.
-void test_host_explicit( comm::Machine machine ,
-                         size_t gang_count ,
-                         size_t gang_worker_count ,
-                         size_t elem_count_begin ,
-                         size_t elem_count_end ,
-                         size_t count_run )
-{
-  typedef Kokkos::HostSpace::execution_space host_space ;
-
-  const size_t init_arg = gang_count * gang_worker_count ;
-
-  host_space::initialize( init_arg );
-  Explicit::driver< double , host_space >(
-    "Threads" , machine , gang_count ,
-    elem_count_begin , elem_count_end , count_run );
-  host_space::finalize();
-}
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-// Initialize the host execution space with gang_count * gang_worker_count,
-// run the nonlinear driver with the FixtureElementHex8 element
-// (double precision, label "Threads"), then finalize.
-void test_host_nonlinear( comm::Machine machine ,
-                          size_t gang_count ,
-                          size_t gang_worker_count ,
-                          size_t elem_count_begin ,
-                          size_t elem_count_end ,
-                          size_t count_run )
-{
-  typedef Kokkos::HostSpace::execution_space host_space ;
-
-  const size_t init_arg = gang_count * gang_worker_count ;
-
-  host_space::initialize( init_arg );
-  HybridFEM::Nonlinear::driver< double , host_space , FixtureElementHex8 >(
-    "Threads" , machine , gang_count ,
-    elem_count_begin , elem_count_end , count_run );
-  host_space::finalize();
-}
-
-// Initialize the host execution space with gang_count * gang_worker_count,
-// run the nonlinear driver with the FixtureElementHex27 element
-// (double precision, label "Threads"), then finalize.
-void test_host_nonlinear_quadratic( comm::Machine machine ,
-                                    size_t gang_count ,
-                                    size_t gang_worker_count ,
-                                    size_t elem_count_begin ,
-                                    size_t elem_count_end ,
-                                    size_t count_run )
-{
-  typedef Kokkos::HostSpace::execution_space host_space ;
-
-  const size_t init_arg = gang_count * gang_worker_count ;
-
-  host_space::initialize( init_arg );
-  HybridFEM::Nonlinear::driver< double , host_space , FixtureElementHex27 >(
-    "Threads" , machine , gang_count ,
-    elem_count_begin , elem_count_end , count_run );
-  host_space::finalize();
-}
-
-//----------------------------------------------------------------------------
-
-
--- a/lib/kokkos/example/multi_fem/TestHybridFEM.cpp
+++ b/lib/kokkos/example/multi_fem/TestHybridFEM.cpp
@ -1,348 +0,0 @@
-//@HEADER
-// ************************************************************************
-// 
-//                        Kokkos v. 2.0
-//              Copyright (2014) Sandia Corporation
-// 
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
-// the U.S. Government retains certain rights in this software.
-// 
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
-// ************************************************************************
-//@HEADER
-
-// Must be included first on Intel-Phi systems due to
-// redefinition of SEEK_SET in <mpi.h>.
-
-#include <ParallelComm.hpp>
-
-#include <string>
-#include <sstream>
-#include <iostream>
-#include <Kokkos_hwloc.hpp>
-
-//----------------------------------------------------------------------------
-
-// Forward declarations of the test entry points called from this file.
-
-// Box partition self test; 'print' is forwarded as its print flag.
-void test_box_partition( bool print );
-
-//----------------------------------------------------------------------------
-// Host tests: each takes the gang count and the per-gang worker count,
-// followed by the test-specific arguments.
-
-void test_host_fixture( comm::Machine machine ,
-                        size_t gang_count , size_t gang_worker_count ,
-                        size_t nx , size_t ny , size_t nz );
-
-void test_host_implicit( comm::Machine machine ,
-                         size_t gang_count , size_t gang_worker_count ,
-                         size_t elem_count_begin , size_t elem_count_end ,
-                         size_t count_run );
-
-void test_host_explicit( comm::Machine machine ,
-                         size_t gang_count , size_t gang_worker_count ,
-                         size_t elem_count_begin , size_t elem_count_end ,
-                         size_t count_run );
-
-void test_host_nonlinear( comm::Machine machine ,
-                          size_t gang_count , size_t gang_worker_count ,
-                          size_t elem_count_begin , size_t elem_count_end ,
-                          size_t count_run );
-
-void test_host_nonlinear_quadratic( comm::Machine machine ,
-                                    size_t gang_count , size_t gang_worker_count ,
-                                    size_t elem_count_begin , size_t elem_count_end ,
-                                    size_t count_run );
-
-//----------------------------------------------------------------------------
-// Cuda tests: same test-specific arguments as the host tests, without the
-// gang parameters.  They are only called when KOKKOS_HAVE_CUDA is defined.
-
-void test_cuda_query( comm::Machine );
-
-void test_cuda_fixture( comm::Machine machine ,
-                        size_t nx , size_t ny , size_t nz );
-
-void test_cuda_implicit( comm::Machine machine ,
-                         size_t elem_count_begin , size_t elem_count_end ,
-                         size_t count_run );
-
-void test_cuda_explicit( comm::Machine machine ,
-                         size_t elem_count_begin , size_t elem_count_end ,
-                         size_t count_run );
-
-void test_cuda_nonlinear( comm::Machine machine ,
-                          size_t elem_count_begin , size_t elem_count_end ,
-                          size_t count_run );
-
-void test_cuda_nonlinear_quadratic( comm::Machine machine ,
-                                    size_t elem_count_begin , size_t elem_count_end ,
-                                    size_t count_run );
-
-
-//----------------------------------------------------------------------------
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-namespace {
-
-// Read a test name and its arguments from 'input' and run that test on the
-// host with the given gang configuration.
-//
-//   fixture              reads nx ny nz (each defaults to 0)
-//   explicit | implicit | nonlinear | nonlinear_quadratic
-//                        reads begin end run (defaults 100, 300, 1)
-//
-// Returns true when the test name is not recognized (command error),
-// false otherwise.
-bool run_host( std::istream & input ,
-               comm::Machine machine ,
-               const size_t host_gang_count ,
-               const size_t host_gang_worker_count )
-{
-  // Signature shared by every test that takes a mesh range and a run count.
-  typedef void (*range_test_type)( comm::Machine ,
-                                   size_t , size_t ,
-                                   size_t , size_t , size_t );
-
-  std::string test_name ;
-  input >> test_name ;
-
-  if ( test_name == "fixture" ) {
-    size_t nx = 0 , ny = 0 , nz = 0 ;
-    input >> nx >> ny >> nz ;
-    test_host_fixture( machine , host_gang_count , host_gang_worker_count , nx , ny , nz );
-    return false ;
-  }
-
-  range_test_type range_test = 0 ;
-
-  if      ( test_name == "explicit" )            { range_test = & test_host_explicit ; }
-  else if ( test_name == "implicit" )            { range_test = & test_host_implicit ; }
-  else if ( test_name == "nonlinear" )           { range_test = & test_host_nonlinear ; }
-  else if ( test_name == "nonlinear_quadratic" ) { range_test = & test_host_nonlinear_quadratic ; }
-
-  if ( ! range_test ) {
-    // Unknown test name: nothing further is read from the stream.
-    return true ;
-  }
-
-  size_t range_begin = 100 ;
-  size_t range_end   = 300 ;
-  size_t run_count   =   1 ;
-  input >> range_begin >> range_end >> run_count ;
-
-  range_test( machine , host_gang_count , host_gang_worker_count ,
-              range_begin , range_end , run_count );
-
-  return false ;
-}
-
-#if defined( KOKKOS_HAVE_CUDA )
-// Read a test name and its arguments from 'input' and run that test on Cuda.
-//
-//   fixture              reads nx ny nz (each defaults to 0)
-//   explicit | implicit | nonlinear | nonlinear_quadratic
-//                        reads begin end run (defaults 100, 300, 1)
-//
-// Returns true when the test name is not recognized (command error),
-// false otherwise.
-bool run_cuda( std::istream & input , comm::Machine machine )
-{
-  // Signature shared by every test that takes a mesh range and a run count.
-  typedef void (*range_test_type)( comm::Machine , size_t , size_t , size_t );
-
-  std::string test_name ;
-  input >> test_name ;
-
-  if ( test_name == "fixture" ) {
-    size_t nx = 0 , ny = 0 , nz = 0 ;
-    input >> nx >> ny >> nz ;
-    test_cuda_fixture( machine , nx , ny , nz );
-    return false ;
-  }
-
-  range_test_type range_test = 0 ;
-
-  if      ( test_name == "explicit" )            { range_test = & test_cuda_explicit ; }
-  else if ( test_name == "implicit" )            { range_test = & test_cuda_implicit ; }
-  else if ( test_name == "nonlinear" )           { range_test = & test_cuda_nonlinear ; }
-  else if ( test_name == "nonlinear_quadratic" ) { range_test = & test_cuda_nonlinear_quadratic ; }
-
-  if ( ! range_test ) {
-    // Unknown test name: nothing further is read from the stream.
-    return true ;
-  }
-
-  size_t range_begin = 100 ;
-  size_t range_end   = 300 ;
-  size_t run_count   =   1 ;
-  input >> range_begin >> range_end >> run_count ;
-
-  range_test( machine , range_begin , range_end , run_count );
-
-  return false ;
-}
-#endif
-
-// Interpret the command line 'argline' and dispatch to the requested mode:
-//
-//   query      print the hwloc topology counts (and the Cuda device count
-//              when KOKKOS_HAVE_CUDA is defined)
-//   partition  run test_box_partition on rank 0 only
-//   host       read gang count and worker count, then run a host test
-//   host-all   host test with gang count = NUMA count and
-//              worker count = cores_per_numa * threads_per_core
-//   host-most  as host-all, but with one fewer core per NUMA region
-//   cuda       run a Cuda test (only when KOKKOS_HAVE_CUDA is defined)
-//
-// On an unrecognized mode or test name, rank 0 prints a usage message.
-void run( const std::string & argline , comm::Machine machine )
-{
-  const unsigned numa_count       = Kokkos::hwloc::get_available_numa_count();
-  const unsigned cores_per_numa   = Kokkos::hwloc::get_available_cores_per_numa();
-  const unsigned threads_per_core = Kokkos::hwloc::get_available_threads_per_core();
-
-  std::istringstream input( argline );
-
-  std::string mode ;
-  input >> mode ;
-
-  bool cmd_error = false ;
-
-  if ( mode == "query" ) {
-    std::cout << "P" << comm::rank( machine )
-              << ": hwloc { NUMA[" << numa_count << "]"
-              << " CORE[" << cores_per_numa << "]"
-              << " PU[" << threads_per_core << "] }"
-              << std::endl ;
-#if defined( KOKKOS_HAVE_CUDA )
-    test_cuda_query( machine );
-#endif
-  }
-  else if ( mode == "partition" ) {
-    if ( 0 == comm::rank( machine ) ) {
-      test_box_partition( false /* print flag */ );
-    }
-  }
-  else if ( mode == "host" ) {
-    size_t gang_count   = 0 ;
-    size_t worker_count = 1 ;
-
-    input >> gang_count ;
-    input >> worker_count ;
-
-    cmd_error = run_host( input , machine , gang_count , worker_count );
-  }
-  else if ( mode == "host-all" ) {
-    const size_t gang_count   = numa_count ;
-    const size_t worker_count = cores_per_numa * threads_per_core ;
-
-    cmd_error = run_host( input , machine , gang_count , worker_count );
-  }
-  else if ( mode == "host-most" ) {
-    // NOTE(review): evaluates to 0 workers when cores_per_numa is 1.
-    const size_t gang_count   = numa_count ;
-    const size_t worker_count = ( cores_per_numa - 1 ) * threads_per_core ;
-
-    cmd_error = run_host( input , machine , gang_count , worker_count );
-  }
-#if defined( KOKKOS_HAVE_CUDA )
-  else if ( mode == "cuda" ) {
-    cmd_error = run_cuda( input , machine );
-  }
-#endif
-  else {
-    cmd_error = true ;
-  }
-
-  if ( ! cmd_error || 0 != comm::rank( machine ) ) { return ; }
-
-  std::cout << "Expecting command line with" << std::endl
-            << "    query" << std::endl
-            << "    partition" << std::endl
-            << "    host NumNumaNode NumThreadPerNode <test>" << std::endl
-            << "    host-all <test>" << std::endl
-            << "    host-most <test>" << std::endl
-            << "    cuda <test>" << std::endl
-            << "where <test> is" << std::endl
-            << "    fixture   NumElemX NumElemY NumElemZ" << std::endl
-            << "    implicit  NumElemBegin NumElemEnd NumRun" << std::endl
-            << "    explicit  NumElemBegin NumElemEnd NumRun" << std::endl
-            << "    nonlinear NumElemBegin NumElemEnd NumRun" << std::endl
-            << "    nonlinear_quadratic NumElemBegin NumElemEnd NumRun" << std::endl ;
-}
-
-} // namespace
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-// Program entry point: initialize the communication layer, gather the
-// command line through comm::command_line, run the requested test and
-// finalize the communication layer.
-//
-// Returns 0 when run() completes normally and 1 when it throws, so that a
-// failed test is visible in the process exit status instead of only in
-// the text written to std::cerr.
-int main( int argc , char ** argv )
-{
-  comm::Machine machine = comm::Machine::init( & argc , & argv );
-
-  const unsigned comm_rank = comm::rank( machine );
-
-  const std::string argline = comm::command_line( machine , argc , argv );
-
-  int exit_status = 0 ;
-
-  try {
-    run( argline , machine );
-  }
-  catch( const std::exception & x ) {
-    std::cerr << "P" << comm_rank << " throw: " << x.what() << std::endl ;
-    exit_status = 1 ;
-  }
-  catch( ... ) {
-    std::cerr << "P" << comm_rank << " throw: unknown exception" << std::endl ;
-    exit_status = 1 ;
-  }
-
-  // Always finalize the communication layer, even after a failure.
-  comm::Machine::finalize();
-
-  return exit_status ;
-}
-
--- a/lib/kokkos/example/query_device/CMakeLists.txt
+++ b/lib/kokkos/example/query_device/CMakeLists.txt
@ -1,14 +0,0 @@
-
-# Add this example's build and source directories to the include path.
-include_directories(
-  ${CMAKE_CURRENT_BINARY_DIR}
-  ${CMAKE_CURRENT_SOURCE_DIR}
-  )
-
-# Every .cpp file found in this directory becomes a source of the executable.
-set(SOURCES "")
-file(GLOB SOURCES *.cpp)
-
-# query_device executable, registered for the serial and mpi COMM settings.
-TRIBITS_ADD_EXECUTABLE(
-  query_device
-  SOURCES ${SOURCES}
-  COMM serial mpi
-  )
-
--- a/lib/kokkos/example/query_device/Makefile
+++ b/lib/kokkos/example/query_device/Makefile
@ -1,53 +0,0 @@
-# Stand-alone build of this example against Makefile.kokkos.
-# KOKKOS_PATH may be overridden on the command line or in the environment.
-KOKKOS_PATH ?= ../..
-
-# Directory that contains this Makefile; sources are looked up there so the
-# build can be run from a different working directory.
-MAKEFILE_PATH := $(abspath $(lastword $(MAKEFILE_LIST)))
-SRC_DIR := $(dir $(MAKEFILE_PATH))
-
-SRC = $(wildcard $(SRC_DIR)/*.cpp)
-OBJ = $(SRC:$(SRC_DIR)/%.cpp=%.o)
-
-#SRC = $(wildcard *.cpp)
-#OBJ = $(SRC:%.cpp=%.o)
-
-# These targets do not name files: declare them phony so that a stray file
-# called 'default', 'build' or 'clean' cannot suppress the rule.
-.PHONY: default build clean
-
-default: build
-	echo "Start Build"
-
-# use installed Makefile.kokkos
-include $(KOKKOS_PATH)/Makefile.kokkos
-
-# Choose compiler and executable suffix depending on whether Cuda is listed
-# in KOKKOS_DEVICES.
-ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES)))
-CXX = $(NVCC_WRAPPER)
-CXXFLAGS = -I$(SRC_DIR) -O3
-LINK = $(CXX)
-LINKFLAGS =
-EXE = $(addsuffix .cuda, $(shell basename $(SRC_DIR)))
-#KOKKOS_DEVICES = "Cuda,OpenMP"
-#KOKKOS_ARCH = "SNB,Kepler35"
-else
-CXX = g++
-CXXFLAGS = -I$(SRC_DIR) -O3
-LINK = $(CXX)
-LINKFLAGS =
-EXE = $(addsuffix .host, $(shell basename $(SRC_DIR)))
-#KOKKOS_DEVICES = "OpenMP"
-#KOKKOS_ARCH = "SNB"
-endif
-
-DEPFLAGS = -M
-
-LIB =
-
-
-build: $(EXE)
-
-# Link step: objects of this example plus the Kokkos libraries.
-$(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS)
-	$(LINK) $(KOKKOS_LDFLAGS) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE)
-
-clean:
-	rm -f *.a *.o *.cuda *.host
-
-# Compilation rules
-
-%.o:$(SRC_DIR)/%.cpp $(KOKKOS_CPP_DEPENDS)
-	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $<
-
--- a/lib/kokkos/example/query_device/query_device.cpp
+++ b/lib/kokkos/example/query_device/query_device.cpp
@ -1,100 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-// 
-//                        Kokkos v. 2.0
-//              Copyright (2014) Sandia Corporation
-// 
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
-// the U.S. Government retains certain rights in this software.
-// 
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
-// ************************************************************************
-//@HEADER
-*/
-
-#include <iostream>
-#include <sstream>
-
-#include <Kokkos_Macros.hpp>
-
-#if defined( KOKKOS_HAVE_MPI )
-#include <mpi.h>
-#endif
-
-#include <Kokkos_Core.hpp>
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-// Build a short report into a string stream and write it to std::cout in
-// one piece.  The report contains, in order: the MPI rank (only when
-// KOKKOS_HAVE_MPI is defined), the hwloc topology counts (only when
-// Kokkos::hwloc::available() is true) and the Cuda configuration (only
-// when KOKKOS_HAVE_CUDA is defined).
-int main( int argc , char ** argv )
-{
-  std::ostringstream report ;
-
-#if defined( KOKKOS_HAVE_MPI )
-
-  MPI_Init( & argc , & argv );
-
-  int rank = 0 ;
-  MPI_Comm_rank( MPI_COMM_WORLD , & rank );
-
-  report << "MPI rank(" << rank << ") " ;
-
-#endif
-
-  report << "{" << std::endl ;
-
-  const bool have_hwloc = Kokkos::hwloc::available();
-
-  if ( have_hwloc ) {
-    report << "hwloc( NUMA[" << Kokkos::hwloc::get_available_numa_count() ;
-    report << "] x CORE["    << Kokkos::hwloc::get_available_cores_per_numa() ;
-    report << "] x HT["      << Kokkos::hwloc::get_available_threads_per_core() ;
-    report << "] )" << std::endl ;
-  }
-
-#if defined( KOKKOS_HAVE_CUDA )
-  Kokkos::Cuda::print_configuration( report );
-#endif
-
-  report << "}" << std::endl ;
-
-  // Emit the whole report with a single insertion.
-  const std::string text = report.str();
-  std::cout << text ;
-
-#if defined( KOKKOS_HAVE_MPI )
-
-  MPI_Finalize();
-
-#endif
-
-  return 0 ;
-}
-
--- a/lib/kokkos/example/sort_array/CMakeLists.txt
+++ b/lib/kokkos/example/sort_array/CMakeLists.txt
@ -1,15 +0,0 @@
-# Pull in the TriBITS module named TribitsAddExecutableAndTest.
-include(TribitsAddExecutableAndTest)
-
-# Add this example's build and source directories to the include path.
-include_directories(
-  ${CMAKE_CURRENT_BINARY_DIR}
-  ${CMAKE_CURRENT_SOURCE_DIR}
-  )
-
-# Every .cpp file found in this directory becomes a source of the executable.
-set(SOURCES "")
-file(GLOB SOURCES *.cpp)
-
-# sort_array executable, registered for the serial and mpi COMM settings.
-TRIBITS_ADD_EXECUTABLE(
-  sort_array
-  SOURCES ${SOURCES}
-  COMM serial mpi
-  )
-
--- a/lib/kokkos/example/sort_array/Makefile
+++ b/lib/kokkos/example/sort_array/Makefile
@ -1,53 +0,0 @@
-# Stand-alone GNU make build of the sort_array example.
-# KOKKOS_PATH may be overridden from the command line or the environment.
-KOKKOS_PATH ?= ../..
-
-# Locate the sources relative to this Makefile rather than the current
-# directory, so the example can be built from another directory.
-MAKEFILE_PATH := $(abspath $(lastword $(MAKEFILE_LIST)))
-SRC_DIR := $(dir $(MAKEFILE_PATH))
-
-# One object file (created in the current directory) per .cpp in SRC_DIR.
-SRC = $(wildcard $(SRC_DIR)/*.cpp)
-OBJ = $(SRC:$(SRC_DIR)/%.cpp=%.o)
-
-#SRC = $(wildcard *.cpp)
-#OBJ = $(SRC:%.cpp=%.o)
-
-# First target in the file, hence the default goal.  Because `build` is a
-# prerequisite, the echo below runs after the build has finished.
-default: build
-	echo "Start Build"
-
-# use installed Makefile.kokkos
-include $(KOKKOS_PATH)/Makefile.kokkos
-
-# Pick the toolchain and executable suffix depending on whether "Cuda"
-# appears in KOKKOS_DEVICES.
-ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES)))
-CXX = $(NVCC_WRAPPER)
-CXXFLAGS = -I$(SRC_DIR) -O3
-LINK = $(CXX)
-LINKFLAGS = 
-EXE = $(addsuffix .cuda, $(shell basename $(SRC_DIR)))
-#KOKKOS_DEVICES = "Cuda,OpenMP"
-#KOKKOS_ARCH = "SNB,Kepler35"
-else
-CXX = g++
-CXXFLAGS = -I$(SRC_DIR) -O3
-LINK = $(CXX)
-LINKFLAGS =  
-EXE = $(addsuffix .host, $(shell basename $(SRC_DIR)))
-#KOKKOS_DEVICES = "OpenMP"
-#KOKKOS_ARCH = "SNB"
-endif
-
-# NOTE(review): DEPFLAGS is not referenced by any rule in this Makefile.
-DEPFLAGS = -M
-
-# Extra libraries appended to the link line; empty by default.
-LIB =
-
-
-build: $(EXE)
-
-# Link the objects against the Kokkos libraries.
-$(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS)
-	$(LINK) $(KOKKOS_LDFLAGS) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE)
-
-# Removes archives, objects and both executable flavours from the current
-# directory.
-clean: 
-	rm -f *.a *.o *.cuda *.host
-
-# Compilation rules
-
-# Compile each source found in SRC_DIR into an object in the current directory.
-%.o:$(SRC_DIR)/%.cpp $(KOKKOS_CPP_DEPENDS)
-	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $<
-
--- a/lib/kokkos/example/sort_array/main.cpp
+++ b/lib/kokkos/example/sort_array/main.cpp
@ -1,95 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-// 
-//                        Kokkos v. 2.0
-//              Copyright (2014) Sandia Corporation
-// 
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
-// the U.S. Government retains certain rights in this software.
-// 
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
-// ************************************************************************
-//@HEADER
-*/
-
-#include <string.h>
-#include <stdlib.h>
-#include <iostream>
-#include <sstream>
-
-#include <Kokkos_Core.hpp>
-
-#include <sort_array.hpp>
-
-
-// Runs the sort_array example once for every enabled Kokkos backend that
-// reports itself as initialized.
-//
-// Command line:  <exe> [length_array <N>]
-//   length_array <N>  length of each chunk that is sorted (default 100000);
-//                     the total array is 100 times that length.
-//
-// Does nothing (and returns 0) when none of the CUDA, Pthread or OpenMP
-// backends is compiled in.
-int main( int argc , char ** argv )
-{
-#if defined( KOKKOS_HAVE_CUDA ) || defined( KOKKOS_HAVE_PTHREAD ) || defined( KOKKOS_HAVE_OPENMP )
-  Kokkos::initialize( argc , argv );
-
-  int length_array = 100000 ;
-
-  // Only look at arguments that are followed by a value.  Without the
-  // "i + 1 < argc" bound a trailing "length_array" would hand argv[argc]
-  // (a null pointer) to atoi().
-  for ( int i = 0 ; i + 1 < argc ; ++i ) {
-    if ( 0 == strcmp( argv[i] , "length_array" ) ) {
-      length_array = atoi( argv[i+1] );
-    }
-  }
-
-  // NOTE(review): computed in int, so very large length_array values would
-  // overflow here -- confirm whether that range matters for this example.
-  int length_total_array  = length_array * 100;
-
-#if defined( KOKKOS_HAVE_CUDA )
-  if ( Kokkos::Cuda::is_initialized() ) {
-    std::cout << "Kokkos::Cuda" << std::endl ;
-    Example::sort_array< Kokkos::Cuda >( length_array , length_total_array );
-  }
-#endif
-
-#if defined( KOKKOS_HAVE_PTHREAD )
-  if ( Kokkos::Threads::is_initialized() ) {
-    std::cout << "Kokkos::Threads" << std::endl ;
-    Example::sort_array< Kokkos::Threads >( length_array , length_total_array );
-  }
-#endif
-
-#if defined( KOKKOS_HAVE_OPENMP )
-  if ( Kokkos::OpenMP::is_initialized() ) {
-    std::cout << "Kokkos::OpenMP" << std::endl ;
-    Example::sort_array< Kokkos::OpenMP >( length_array , length_total_array );
-  }
-#endif
-
-  Kokkos::finalize();
-#endif
-
-  return 0 ;
-}
-
--- a/lib/kokkos/example/sort_array/sort_array.hpp
+++ b/lib/kokkos/example/sort_array/sort_array.hpp
@ -1,190 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-// 
-//                        Kokkos v. 2.0
-//              Copyright (2014) Sandia Corporation
-// 
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
-// the U.S. Government retains certain rights in this software.
-// 
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
-// ************************************************************************
-//@HEADER
-*/
-
-#ifndef EXAMPLE_SORT_ARRAY
-#define EXAMPLE_SORT_ARRAY
-
-#include <stdlib.h>
-#include <algorithm>
-
-#include <Kokkos_Core.hpp>
-
-#include <impl/Kokkos_Timer.hpp>
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-namespace Example {
-
-// Generic sorter: constructing a SortView sorts elements [begin,end) of the
-// given rank-1 view in place.
-template< class Device >
-struct SortView {
-
-  // Sorts v[begin..end) with std::sort through the view's raw pointer.
-  // NOTE(review): std::sort runs on the host, so this presumably requires the
-  // view's memory to be host-accessible -- confirm for each Device used.
-  template< typename ValueType >
-  SortView( const Kokkos::View<ValueType*,Device> v , int begin , int end )
-    {
-      std::sort( v.ptr_on_device() + begin , v.ptr_on_device() + end );
-    }
-};
-
-}
-
-#if defined(KOKKOS_HAVE_CUDA)
-
-#include <thrust/device_ptr.h>
-#include <thrust/sort.h>
-
-namespace Example {
-
-// Kokkos::Cuda specialization: sorts with thrust::sort instead of std::sort.
-template<>
-struct SortView< Kokkos::Cuda > {
-  // Sorts v[begin..end) in place; the view's raw pointer is wrapped in
-  // thrust::device_ptr to form the begin/end iterators handed to thrust.
-  template< typename ValueType >
-  SortView( const Kokkos::View<ValueType*,Kokkos::Cuda> v , int begin , int end )
-    {
-      thrust::sort( thrust::device_ptr<ValueType>( v.ptr_on_device() + begin )
-                  , thrust::device_ptr<ValueType>( v.ptr_on_device() + end ) );
-    }
-};
-
-}
-
-#endif
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-namespace Example {
-
-// Fills a host array of total_length random ints, then sorts it chunk by
-// chunk: each chunk is copied into a work array of array_length elements,
-// sorted there via SortView<Device>, copied back and checked for ordering.
-// Accumulated timings for each phase and the number of out-of-order pairs
-// are printed to std::cout.
-//
-// NOTE(review): the `print` parameter is never read in this function.
-template< class Device >
-void sort_array( const size_t array_length /* length of spans of array to sort */
-               , const size_t total_length /* total length of array */
-               , const int print = 1 )
-{
-  typedef Device execution_space ;
-  typedef Kokkos::View<int*,Device>  device_array_type ;
-
-#if defined( KOKKOS_HAVE_CUDA )
-
-  // For the Cuda device the host-side array is a view in CudaHostPinnedSpace;
-  // every other device uses the device view's HostMirror type.
-  typedef typename
-    Kokkos::Impl::if_c< Kokkos::Impl::is_same< Device , Kokkos::Cuda >::value
-                      , Kokkos::View<int*,Kokkos::Cuda::array_layout,Kokkos::CudaHostPinnedSpace>
-                      , typename device_array_type::HostMirror
-                      >::type  host_array_type ;
-
-#else
-
-  typedef typename device_array_type::HostMirror  host_array_type ;
-
-#endif
-
-  Kokkos::Impl::Timer timer;
-
-  const device_array_type  work_array("work_array" , array_length );
-  const host_array_type    host_array("host_array" , total_length );
-
-  std::cout << "sort_array length( " << total_length << " )"
-            << " in chunks( " << array_length << " )"
-            << std::endl ;
-
-  double sec = timer.seconds();
-  std::cout << "declaring Views took "
-            << sec << " seconds" << std::endl;
-  timer.reset();
-
-  // Scale each lrand48() result by total_length and shift right by 31 bits.
-  // lrand48() is specified to return values below 2^31, so the stored value
-  // should fall in [0,total_length).
-  for ( size_t i = 0 ; i < total_length ; ++i ) {
-    host_array(i) = ( lrand48() * total_length ) >> 31 ;
-  }
-
-  sec = timer.seconds();
-  std::cout << "initializing " << total_length << " elements on host took "
-            << sec << " seconds" << std::endl;
-  timer.reset();
-
-  // Per-phase timings, accumulated over all chunks.
-  double sec_copy_in  = 0 ;
-  double sec_sort     = 0 ;
-  double sec_copy_out = 0 ;
-  double sec_error    = 0 ;
-  size_t error_count  = 0 ;
-
-  for ( size_t begin = 0 ; begin < total_length ; begin += array_length ) {
-
-    // Clamp the last chunk to the end of the array.
-    const size_t end = begin + array_length < total_length
-                     ? begin + array_length : total_length ;
-
-    const std::pair<size_t,size_t> host_range(begin,end);
-
-    const host_array_type host_subarray = Kokkos::subview( host_array , host_range );
-
-    timer.reset();
-
-    // NOTE(review): when total_length is not a multiple of array_length the
-    // final host_subarray is shorter than work_array -- confirm deep_copy
-    // accepts views of different lengths.
-    Kokkos::deep_copy( work_array , host_subarray );
-
-    sec_copy_in += timer.seconds(); timer.reset();
-
-    // Constructing the temporary performs the sort of work_array[0..end-begin).
-    SortView< execution_space >( work_array , 0 , end - begin );
-
-    sec_sort += timer.seconds(); timer.reset();
-
-    Kokkos::deep_copy( host_subarray , work_array );
-
-    sec_copy_out += timer.seconds(); timer.reset();
-
-    // Verify on the host: count adjacent pairs within this chunk that are
-    // out of order.
-    for ( size_t i = begin + 1 ; i < end ; ++i ) {
-      if ( host_array(i) < host_array(i-1) ) ++error_count ;
-    }
-
-    sec_error += timer.seconds(); timer.reset();
-  }
-
-  std::cout << "copy to   device " << sec_copy_in  << " seconds" << std::endl
-            << "sort on   device " << sec_sort     << " seconds" << std::endl
-            << "copy from device " << sec_copy_out << " seconds" << std::endl
-            << "errors " << error_count << " took " << sec_error << " seconds" << std::endl
-            ;
-}
-
-} // namespace Example
-
-//----------------------------------------------------------------------------
-
-#endif /* #ifndef EXAMPLE_SORT_ARRAY */
-
--- a/lib/kokkos/example/tutorial/01_hello_world/CMakeLists.txt
+++ b/lib/kokkos/example/tutorial/01_hello_world/CMakeLists.txt
@ -1,11 +0,0 @@
-
-# Executable for the functor-based "hello world" tutorial.
-# Both the build and the source directory are put on the include path.
-INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR})
-INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
-
-# This is a tutorial, not a test, so we don't ask CTest to run it.
-TRIBITS_ADD_EXECUTABLE(
-  tutorial_01_hello_world
-  SOURCES hello_world.cpp
-  COMM serial mpi
-  )
-
--- a/lib/kokkos/example/tutorial/01_hello_world/Makefile
+++ b/lib/kokkos/example/tutorial/01_hello_world/Makefile
@ -1,43 +0,0 @@
-# Stand-alone GNU make build of the 01_hello_world tutorial.
-# Every .cpp file in the current directory is part of the build.
-KOKKOS_PATH = ../../..
-SRC = $(wildcard *.cpp)
-
-# First target in the file, hence the default goal.  Because `build` is a
-# prerequisite, the echo below runs after the build has finished.
-default: build
-	echo "Start Build"
-
-# The branch is chosen from the value KOKKOS_DEVICES has when this line is
-# read (e.g. given on the make command line); each branch then assigns the
-# toolchain, the executable suffix and its own KOKKOS_DEVICES/KOKKOS_ARCH.
-ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES)))
-CXX = nvcc_wrapper
-CXXFLAGS = -O3
-LINK = ${CXX}
-LINKFLAGS = 
-EXE = $(SRC:.cpp=.cuda)
-KOKKOS_DEVICES = "Cuda,OpenMP"
-KOKKOS_ARCH = "SNB,Kepler35"
-else
-CXX = g++
-CXXFLAGS = -O3
-LINK = ${CXX}
-LINKFLAGS =  
-EXE = $(SRC:.cpp=.host)
-KOKKOS_DEVICES = "OpenMP"
-KOKKOS_ARCH = "SNB"
-endif
-
-# NOTE(review): DEPFLAGS is not referenced by any rule in this Makefile.
-DEPFLAGS = -M
-
-OBJ = $(SRC:.cpp=.o)
-# Extra libraries appended to the link line; empty by default.
-LIB =
-
-# Included after the settings above have been made.
-include $(KOKKOS_PATH)/Makefile.kokkos
-
-build: $(EXE)
-
-# Link the objects against the Kokkos libraries.
-$(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS)
-	$(LINK) $(KOKKOS_LDFLAGS) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE)
-
-# Runs the kokkos-clean prerequisite, then removes the local build products.
-clean: kokkos-clean 
-	rm -f *.o *.cuda *.host
-
-# Compilation rules
-
-%.o:%.cpp $(KOKKOS_CPP_DEPENDS)
-	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $<
--- a/lib/kokkos/example/tutorial/01_hello_world/hello_world.cpp
+++ b/lib/kokkos/example/tutorial/01_hello_world/hello_world.cpp
@ -1,130 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-// 
-//                        Kokkos v. 2.0
-//              Copyright (2014) Sandia Corporation
-// 
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
-// the U.S. Government retains certain rights in this software.
-// 
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
-// ************************************************************************
-//@HEADER
-*/
-
-#include <Kokkos_Core.hpp>
-#include <cstdio>
-#include <typeinfo>
-
-//
-// "Hello world" parallel_for example:
-//   1. Start up Kokkos
-//   2. Execute a parallel for loop in the default execution space,
-//      using a functor to define the loop body
-//   3. Shut down Kokkos
-//
-// If Kokkos was built with C++11 enabled, try comparing this example
-// to 01_hello_world_lambda.  The latter uses C++11 lambdas (anonymous
-// functions) to define the loop body of the parallel_for.  That makes
-// the code much more concise and readable.  On the other hand,
-// breaking out the loop body into an explicit functor makes it easier
-// to test the loop independently of the parallel pattern.
-//
-
-// Functor that defines the parallel_for's loop body.
-//
-// A "functor" is just a class or struct with a public operator()
-// instance method.
-struct hello_world {
-  // If a functor has an "execution_space" public typedef, parallel_*
-  // will only run the functor in that execution space.  That's a good
-  // way to mark a functor as specific to an execution space.  If the
-  // functor lacks this typedef, parallel_for will run it in the
-  // default execution space, unless you tell it otherwise (that's an
-  // advanced topic; see "execution policies").
-  //
-  // NOTE(review): this comment used to read '"execution_space" (or
-  // "execution_space", for backwards compatibility)'.  The second
-  // name was presumably meant to be a legacy alias -- confirm which.
-
-  // The functor's operator() defines the loop body.  It takes an
-  // integer argument which is the parallel for loop index.  Other
-  // arguments are possible; see the "hierarchical parallelism" part
-  // of the tutorial.
-  //
-  // The operator() method must be const, and must be marked with the
-  // KOKKOS_INLINE_FUNCTION macro.  If building with CUDA, this macro
-  // will mark your method as suitable for running on the CUDA device
-  // (as well as on the host).  If not building with CUDA, the macro
-  // is unnecessary but harmless.
-  KOKKOS_INLINE_FUNCTION
-  void operator() (const int i) const {
-    printf ("Hello from i = %i\n", i);
-  }
-};
-
-// Initializes Kokkos, prints the default execution space's type name, runs
-// the hello_world functor for 15 loop indices, then finalizes Kokkos.
-int main (int argc, char* argv[]) {
-  // You must call initialize() before you may call Kokkos.
-  //
-  // With no arguments, this initializes the default execution space
-  // (and potentially its host execution space) with default
-  // parameters.  You may also pass in argc and argv, analogously to
-  // MPI_Init().  It reads and removes command-line arguments that
-  // start with "--kokkos-".
-  Kokkos::initialize (argc, argv);
-
-  // Print the name of Kokkos' default execution space.  We're using
-  // typeid here, so the name might be mangled by the compiler (its
-  // format is implementation-defined), but you should still be able
-  // to figure out what it is.
-  printf ("Hello World on Kokkos execution space %s\n",
-          typeid (Kokkos::DefaultExecutionSpace).name ());
-
-  // Run the above functor on the default Kokkos execution space in
-  // parallel, with a parallel for loop count of 15.  The leading
-  // string argument is presumably a label for the kernel.
-  //
-  // The Kokkos::DefaultExecutionSpace typedef gives the default
-  // execution space.  Depending on how Kokkos was configured, this
-  // could be OpenMP, Threads, Cuda, Serial, or even some other
-  // execution space.
-  //
-  // The following line of code would look like this in OpenMP:
-  //
-  // #pragma omp parallel for
-  // for (int i = 0; i < 15; ++i) {
-  //   printf ("Hello from i = %i\n", i);
-  // }
-  //
-  // You may notice that the printed numbers do not print out in
-  // order.  Parallel for loops may execute in any order.
-  Kokkos::parallel_for ("HelloWorld",15, hello_world ());
-
-  // You must call finalize() after you are done using Kokkos.
-  Kokkos::finalize ();
-}
-
--- a/lib/kokkos/example/tutorial/01_hello_world_lambda/CMakeLists.txt
+++ b/lib/kokkos/example/tutorial/01_hello_world_lambda/CMakeLists.txt
@ -1,13 +0,0 @@
-
-# Executable for the lambda-based "hello world" tutorial.
-# Both the build and the source directory are put on the include path.
-INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR})
-INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
-
-# The executable is only added when Kokkos_ENABLE_CXX11 is set.
-IF (Kokkos_ENABLE_CXX11)
-  # This is a tutorial, not a test, so we don't ask CTest to run it.
-  TRIBITS_ADD_EXECUTABLE(
-    tutorial_01_hello_world_lambda
-    SOURCES hello_world_lambda.cpp
-    COMM serial mpi
-    )
-ENDIF ()
-
--- a/lib/kokkos/example/tutorial/01_hello_world_lambda/Makefile
+++ b/lib/kokkos/example/tutorial/01_hello_world_lambda/Makefile
@ -1,44 +0,0 @@
-# Stand-alone GNU make build of the 01_hello_world_lambda tutorial.
-# Every .cpp file in the current directory is part of the build.
-KOKKOS_PATH = ../../..
-SRC = $(wildcard *.cpp)
-
-# First target in the file, hence the default goal.  Because `build` is a
-# prerequisite, the echo below runs after the build has finished.
-default: build
-	echo "Start Build"
-
-# The branch is chosen from the value KOKKOS_DEVICES has when this line is
-# read (e.g. given on the make command line); each branch then assigns the
-# toolchain, the executable suffix and its own KOKKOS_DEVICES/KOKKOS_ARCH.
-# The CUDA branch additionally sets KOKKOS_CUDA_OPTIONS to "enable_lambda".
-ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES)))
-CXX = nvcc_wrapper
-CXXFLAGS = -O3
-LINK = ${CXX}
-LINKFLAGS = 
-EXE = $(SRC:.cpp=.cuda)
-KOKKOS_DEVICES = "Cuda,OpenMP"
-KOKKOS_ARCH = "SNB,Kepler35"
-KOKKOS_CUDA_OPTIONS = "enable_lambda"
-else
-CXX = g++
-CXXFLAGS = -O3
-LINK = ${CXX}
-LINKFLAGS =  
-EXE = $(SRC:.cpp=.host)
-KOKKOS_DEVICES = "OpenMP"
-KOKKOS_ARCH = "SNB"
-endif
-
-# NOTE(review): DEPFLAGS is not referenced by any rule in this Makefile.
-DEPFLAGS = -M
-
-OBJ = $(SRC:.cpp=.o)
-# Extra libraries appended to the link line; empty by default.
-LIB =
-
-# Included after the settings above have been made.
-include $(KOKKOS_PATH)/Makefile.kokkos
-
-build: $(EXE)
-
-# Link the objects against the Kokkos libraries.
-$(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS)
-	$(LINK) $(KOKKOS_LDFLAGS) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE)
-
-# Runs the kokkos-clean prerequisite, then removes the local build products.
-clean: kokkos-clean 
-	rm -f *.o *.cuda *.host
-
-# Compilation rules
-
-%.o:%.cpp $(KOKKOS_CPP_DEPENDS)
-	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $<
--- a/lib/kokkos/example/tutorial/01_hello_world_lambda/hello_world_lambda.cpp
+++ b/lib/kokkos/example/tutorial/01_hello_world_lambda/hello_world_lambda.cpp
@ -1,109 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-// 
-//                        Kokkos v. 2.0
-//              Copyright (2014) Sandia Corporation
-// 
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
-// the U.S. Government retains certain rights in this software.
-// 
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
-// ************************************************************************
-//@HEADER
-*/
-
-#include <Kokkos_Core.hpp>
-#include <cstdio>
-#include <typeinfo>
-
-//
-// "Hello world" parallel_for example:
-//   1. Start up Kokkos
-//   2. Execute a parallel for loop in the default execution space,
-//      using a C++11 lambda to define the loop body
-//   3. Shut down Kokkos
-//
-// This example only builds if C++11 is enabled.  Compare this example
-// to 01_hello_world, which uses functors (explicitly defined classes)
-// to define the loop body of the parallel_for.  Both functors and
-// lambdas have their places.
-//
-
-// Initializes Kokkos, prints the default execution space's type name, runs a
-// lambda for 15 loop indices with parallel_for, then finalizes Kokkos.
-int main (int argc, char* argv[]) {
-  // You must call initialize() before you may call Kokkos.
-  //
-  // With no arguments, this initializes the default execution space
-  // (and potentially its host execution space) with default
-  // parameters.  You may also pass in argc and argv, analogously to
-  // MPI_Init().  It reads and removes command-line arguments that
-  // start with "--kokkos-".
-  Kokkos::initialize (argc, argv);
-
-  // Print the name of Kokkos' default execution space.  We're using
-  // typeid here, so the name might be mangled by the compiler (its
-  // format is implementation-defined), but you should still be able
-  // to figure out what it is.
-  printf ("Hello World on Kokkos execution space %s\n",
-          typeid (Kokkos::DefaultExecutionSpace).name ());
-
-  // Run lambda on the default Kokkos execution space in parallel,
-  // with a parallel for loop count of 15.  The lambda's argument is
-  // an integer which is the parallel for's loop index.  As you learn
-  // about different kinds of parallelism, you will find out that
-  // there are other valid argument types as well.
-  //
-  // For a single level of parallelism, we prefer that you use the
-  // KOKKOS_LAMBDA macro.  If CUDA is disabled, this just turns into
-  // [=].  That captures variables from the surrounding scope by
-  // value.  Do NOT capture them by reference!  If CUDA is enabled,
-  // this macro may have a special definition that makes the lambda
-  // work correctly with CUDA.  Compare to the KOKKOS_INLINE_FUNCTION
-  // macro, which has a special meaning if CUDA is enabled.
-  //
-  // The following parallel_for would look like this if we were using
-  // OpenMP by itself, instead of Kokkos:
-  //
-  // #pragma omp parallel for
-  // for (int i = 0; i < 15; ++i) {
-  //   printf ("Hello from i = %i\n", i);
-  // }
-  //
-  // You may notice that the printed numbers do not print out in
-  // order.  Parallel for loops may execute in any order.
-  Kokkos::parallel_for (15, KOKKOS_LAMBDA (const int i) {
-      // printf works in a CUDA parallel kernel; std::ostream does not.
-      printf ("Hello from i = %i\n", i);
-    });
-
-  // You must call finalize() after you are done using Kokkos.
-  Kokkos::finalize ();
-}
-
--- a/lib/kokkos/example/tutorial/02_simple_reduce/CMakeLists.txt
+++ b/lib/kokkos/example/tutorial/02_simple_reduce/CMakeLists.txt
@ -1,10 +0,0 @@
-
-# Executable for the functor-based parallel_reduce tutorial.
-# Both the build and the source directory are put on the include path.
-INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR})
-INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
-
-# This is a tutorial, not a test, so we don't ask CTest to run it.
-TRIBITS_ADD_EXECUTABLE(
-  tutorial_02_simple_reduce
-  SOURCES simple_reduce.cpp
-  COMM serial mpi
-  )
--- a/lib/kokkos/example/tutorial/02_simple_reduce/Makefile
+++ b/lib/kokkos/example/tutorial/02_simple_reduce/Makefile
@ -1,43 +0,0 @@
-# Stand-alone GNU make build of the 02_simple_reduce tutorial.
-# Every .cpp file in the current directory is part of the build.
-KOKKOS_PATH = ../../..
-SRC = $(wildcard *.cpp)
-
-# First target in the file, hence the default goal.  Because `build` is a
-# prerequisite, the echo below runs after the build has finished.
-default: build
-	echo "Start Build"
-
-# The branch is chosen from the value KOKKOS_DEVICES has when this line is
-# read (e.g. given on the make command line); each branch then assigns the
-# toolchain, the executable suffix and its own KOKKOS_DEVICES/KOKKOS_ARCH.
-ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES)))
-CXX = nvcc_wrapper
-CXXFLAGS = -O3
-LINK = ${CXX}
-LINKFLAGS = 
-EXE = $(SRC:.cpp=.cuda)
-KOKKOS_DEVICES = "Cuda,OpenMP"
-KOKKOS_ARCH = "SNB,Kepler35"
-else
-CXX = g++
-CXXFLAGS = -O3
-LINK = ${CXX}
-LINKFLAGS =  
-EXE = $(SRC:.cpp=.host)
-KOKKOS_DEVICES = "OpenMP"
-KOKKOS_ARCH = "SNB"
-endif
-
-# NOTE(review): DEPFLAGS is not referenced by any rule in this Makefile.
-DEPFLAGS = -M
-
-OBJ = $(SRC:.cpp=.o)
-# Extra libraries appended to the link line; empty by default.
-LIB =
-
-# Included after the settings above have been made.
-include $(KOKKOS_PATH)/Makefile.kokkos
-
-build: $(EXE)
-
-# Link the objects against the Kokkos libraries.
-$(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS)
-	$(LINK) $(KOKKOS_LDFLAGS) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE)
-
-# Runs the kokkos-clean prerequisite, then removes the local build products.
-clean: kokkos-clean 
-	rm -f *.o *.cuda *.host
-
-# Compilation rules
-
-%.o:%.cpp $(KOKKOS_CPP_DEPENDS)
-	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $<
--- a/lib/kokkos/example/tutorial/02_simple_reduce/simple_reduce.cpp
+++ b/lib/kokkos/example/tutorial/02_simple_reduce/simple_reduce.cpp
@ -1,101 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-// 
-//                        Kokkos v. 2.0
-//              Copyright (2014) Sandia Corporation
-// 
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
-// the U.S. Government retains certain rights in this software.
-// 
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
-// ************************************************************************
-//@HEADER
-*/
-
-#include <Kokkos_Core.hpp>
-#include <cstdio>
-
-//
-// First reduction (parallel_reduce) example:
-//   1. Start up Kokkos
-//   2. Execute a parallel_reduce loop in the default execution space,
-//      using a functor to define the loop body
-//   3. Shut down Kokkos
-//
-// Compare this example to 02_simple_reduce_lambda, which uses a C++11
-// lambda to define the loop body of the parallel_reduce.
-//
-
-// Reduction functor for computing the sum of squares.
-//
-// More advanced reduction examples will show how to control the
-// reduction's "join" operator.  If the join operator is not provided,
-// it defaults to binary operator+ (adding numbers together).
-struct squaresum {
-  // Specify the type of the reduction value with a "value_type"
-  // typedef.  In this case, the reduction value has type int.
-  typedef int value_type;
-
-  // The reduction functor's operator() looks a little different than
-  // the parallel_for functor's operator().  For the reduction, we
-  // pass in both the loop index i, and the intermediate reduction
-  // value lsum.  The latter MUST be passed in by nonconst reference.
-  // (If the reduction type is an array like int[], indicating an
-  // array reduction result, then the second argument is just int[].)
-  //
-  // Each call adds this index's contribution, i*i, to lsum.
-  KOKKOS_INLINE_FUNCTION
-  void operator () (const int i, int& lsum) const {
-    lsum += i*i; // compute the sum of squares
-  }
-};
-
-// Computes the sum of squares of 0..n-1 twice -- once with
-// Kokkos::parallel_reduce and once with a plain loop -- and prints both.
-// Returns 0 when the two results agree and -1 otherwise.
-int main (int argc, char* argv[]) {
-  Kokkos::initialize (argc, argv);
-  const int n = 10;
-
-  // Compute the sum of squares of integers from 0 to n-1, in
-  // parallel, using Kokkos.
-  int sum = 0;
-  Kokkos::parallel_reduce (n, squaresum (), sum);
-  printf ("Sum of squares of integers from 0 to %i, "
-          "computed in parallel, is %i\n", n - 1, sum);
-
-  // Compare to a sequential loop.
-  int seqSum = 0;
-  for (int i = 0; i < n; ++i) {
-    seqSum += i*i;
-  }
-  printf ("Sum of squares of integers from 0 to %i, "
-          "computed sequentially, is %i\n", n - 1, seqSum);
-  Kokkos::finalize ();
-  // The exit status reports whether the parallel result matched.
-  return (sum == seqSum) ? 0 : -1;
-}
-
--- a/lib/kokkos/example/tutorial/02_simple_reduce_lambda/CMakeLists.txt
+++ b/lib/kokkos/example/tutorial/02_simple_reduce_lambda/CMakeLists.txt
@ -1,12 +0,0 @@
-
-# Executable for the lambda-based parallel_reduce tutorial.
-# Both the build and the source directory are put on the include path.
-INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR})
-INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
-
-# The executable is only added when Kokkos_ENABLE_CXX11 is set.
-IF (Kokkos_ENABLE_CXX11)
-  # This is a tutorial, not a test, so we don't ask CTest to run it.
-  TRIBITS_ADD_EXECUTABLE(
-    tutorial_02_simple_reduce_lambda
-    SOURCES simple_reduce_lambda.cpp
-    COMM serial mpi
-    )
-ENDIF ()
--- a/lib/kokkos/example/tutorial/02_simple_reduce_lambda/Makefile
+++ b/lib/kokkos/example/tutorial/02_simple_reduce_lambda/Makefile
@ -1,44 +0,0 @@
-# Stand-alone GNU make build of the 02_simple_reduce_lambda tutorial.
-# Every .cpp file in the current directory is part of the build.
-KOKKOS_PATH = ../../..
-SRC = $(wildcard *.cpp)
-
-# First target in the file, hence the default goal.  Because `build` is a
-# prerequisite, the echo below runs after the build has finished.
-default: build
-	echo "Start Build"
-
-# The branch is chosen from the value KOKKOS_DEVICES has when this line is
-# read (e.g. given on the make command line); each branch then assigns the
-# toolchain, the executable suffix and its own KOKKOS_DEVICES/KOKKOS_ARCH.
-# The CUDA branch additionally sets KOKKOS_CUDA_OPTIONS to "enable_lambda".
-ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES)))
-CXX = nvcc_wrapper
-CXXFLAGS = -O3
-LINK = ${CXX}
-LINKFLAGS = 
-EXE = $(SRC:.cpp=.cuda)
-KOKKOS_DEVICES = "Cuda,OpenMP"
-KOKKOS_ARCH = "SNB,Kepler35"
-KOKKOS_CUDA_OPTIONS = "enable_lambda"
-else
-CXX = g++
-CXXFLAGS = -O3
-LINK = ${CXX}
-LINKFLAGS =  
-EXE = $(SRC:.cpp=.host)
-KOKKOS_DEVICES = "OpenMP"
-KOKKOS_ARCH = "SNB"
-endif
-
-# NOTE(review): DEPFLAGS is not referenced by any rule in this Makefile.
-DEPFLAGS = -M
-
-OBJ = $(SRC:.cpp=.o)
-# Extra libraries appended to the link line; empty by default.
-LIB =
-
-# Included after the settings above have been made.
-include $(KOKKOS_PATH)/Makefile.kokkos
-
-build: $(EXE)
-
-# Link the objects against the Kokkos libraries.
-$(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS)
-	$(LINK) $(KOKKOS_LDFLAGS) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE)
-
-# Runs the kokkos-clean prerequisite, then removes the local build products.
-clean: kokkos-clean 
-	rm -f *.o *.cuda *.host
-
-# Compilation rules
-
-%.o:%.cpp $(KOKKOS_CPP_DEPENDS)
-	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $<
--- a/lib/kokkos/example/tutorial/02_simple_reduce_lambda/simple_reduce_lambda.cpp
+++ b/lib/kokkos/example/tutorial/02_simple_reduce_lambda/simple_reduce_lambda.cpp
@ -1,86 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-// 
-//                        Kokkos v. 2.0
-//              Copyright (2014) Sandia Corporation
-// 
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
-// the U.S. Government retains certain rights in this software.
-// 
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
-// ************************************************************************
-//@HEADER
-*/
-
-#include <Kokkos_Core.hpp>
-#include <cstdio>
-
-//
-// First reduction (parallel_reduce) example:
-//   1. Start up Kokkos
-//   2. Execute a parallel_reduce loop in the default execution space,
-//      using a C++11 lambda to define the loop body
-//   3. Shut down Kokkos
-//
-// This example only builds if C++11 is enabled.  Compare this example
-// to 02_simple_reduce, which uses a functor to define the loop body
-// of the parallel_reduce.
-//
-
-int main (int argc, char* argv[]) {
-  Kokkos::initialize (argc, argv);
-  const int n = 10;
-
-  // Compute the sum of squares of integers from 0 to n-1, in
-  // parallel, using Kokkos.  This time, use a lambda instead of a
-  // functor.  The lambda takes the same arguments as the functor's
-  // operator().
-  int sum = 0;
-  // The KOKKOS_LAMBDA macro replaces the capture-by-value clause [=].
-  // It also handles any other syntax needed for CUDA.
-  Kokkos::parallel_reduce (n, KOKKOS_LAMBDA (const int i, int& lsum) {
-      lsum += i*i;
-    }, sum);
-  printf ("Sum of squares of integers from 0 to %i, "
-          "computed in parallel, is %i\n", n - 1, sum);
-
-  // Compare to a sequential loop.
-  int seqSum = 0;
-  for (int i = 0; i < n; ++i) {
-    seqSum += i*i;
-  }
-  printf ("Sum of squares of integers from 0 to %i, "
-          "computed sequentially, is %i\n", n - 1, seqSum);
-  Kokkos::finalize ();
-  return (sum == seqSum) ? 0 : -1;
-}
-
--- a/lib/kokkos/example/tutorial/03_simple_view/CMakeLists.txt
+++ b/lib/kokkos/example/tutorial/03_simple_view/CMakeLists.txt
@ -1,10 +0,0 @@
-
-INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR})
-INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
-
-# This is a tutorial, not a test, so we don't ask CTest to run it.
-TRIBITS_ADD_EXECUTABLE(
-  tutorial_03_simple_view
-  SOURCES simple_view.cpp
-  COMM serial mpi
-  )
--- a/lib/kokkos/example/tutorial/03_simple_view/Makefile
+++ b/lib/kokkos/example/tutorial/03_simple_view/Makefile
@ -1,43 +0,0 @@
-KOKKOS_PATH = ../../..
-SRC = $(wildcard *.cpp)
-
-default: build
-	echo "Start Build"
-
-ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES)))
-CXX = nvcc_wrapper
-CXXFLAGS = -O3
-LINK = ${CXX}
-LINKFLAGS = 
-EXE = $(SRC:.cpp=.cuda)
-KOKKOS_DEVICES = "Cuda,OpenMP"
-KOKKOS_ARCH = "SNB,Kepler35"
-else
-CXX = g++
-CXXFLAGS = -O3
-LINK = ${CXX}
-LINKFLAGS =  
-EXE = $(SRC:.cpp=.host)
-KOKKOS_DEVICES = "OpenMP"
-KOKKOS_ARCH = "SNB"
-endif
-
-DEPFLAGS = -M
-
-OBJ = $(SRC:.cpp=.o)
-LIB =
-
-include $(KOKKOS_PATH)/Makefile.kokkos
-
-build: $(EXE)
-
-$(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS)
-	$(LINK) $(KOKKOS_LDFLAGS) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE)
-
-clean: kokkos-clean 
-	rm -f *.o *.cuda *.host
-
-# Compilation rules
-
-%.o:%.cpp $(KOKKOS_CPP_DEPENDS)
-	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $<
--- a/lib/kokkos/example/tutorial/03_simple_view/simple_view.cpp
+++ b/lib/kokkos/example/tutorial/03_simple_view/simple_view.cpp
@ -1,142 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-// 
-//                        Kokkos v. 2.0
-//              Copyright (2014) Sandia Corporation
-// 
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
-// the U.S. Government retains certain rights in this software.
-// 
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
-// ************************************************************************
-//@HEADER
-*/
-
-//
-// First Kokkos::View (multidimensional array) example:
-//   1. Start up Kokkos
-//   2. Allocate a Kokkos::View
-//   3. Execute a parallel_for and a parallel_reduce over that View's data
-//   4. Shut down Kokkos
-//
-// Compare this example to 03_simple_view_lambda, which uses C++11
-// lambdas to define the loop bodies of the parallel_for and
-// parallel_reduce.
-//
-
-#include <Kokkos_Core.hpp>
-#include <cstdio>
-
-// A Kokkos::View is an array of zero or more dimensions.  The number
-// of dimensions is specified at compile time, as part of the type of
-// the View.  This array has two dimensions.  The first one
-// (represented by the asterisk) is a run-time dimension, and the
-// second (represented by [3]) is a compile-time dimension.  Thus,
-// this View type is an N x 3 array of type double, where N is
-// specified at run time in the View's constructor.
-//
-// The first dimension of the View is the dimension over which it is
-// efficient for Kokkos to parallelize.
-typedef Kokkos::View<double*[3]> view_type;
-
-// parallel_for functor that fills the View given to its constructor.
-// The View must already have been allocated.
-struct InitView {
-  view_type a;
-
-  // Views have "view semantics."  This means that they behave like
-  // pointers, not like std::vector.  Their copy constructor and
-  // operator= only do shallow copies.  Thus, you can pass View
-  // objects around by "value"; they won't do a deep copy unless you
-  // explicitly ask for a deep copy.
-  InitView (view_type a_) :
-    a (a_)
-  {}
-
-  // Fill the View with some data.  The parallel_for loop will iterate
-  // over the View's first dimension N.
-  KOKKOS_INLINE_FUNCTION
-  void operator () (const int i) const {
-    // Access the View just like a Fortran array.  The layout depends
-    // on the View's memory space, so don't rely on the View's
-    // physical memory layout unless you know what you're doing.
-    a(i,0) = 1.0*i;
-    a(i,1) = 1.0*i*i;
-    a(i,2) = 1.0*i*i*i;
-  }
-};
-
-// Reduction functor that reads the View given to its constructor.
-struct ReduceFunctor {
-  view_type a;
-
-  // Constructor takes View by "value"; this does a shallow copy.
-  ReduceFunctor (view_type a_) : a (a_) {}
-
-  // If you write a functor to do a reduction, you must specify the
-  // type of the reduction result via a public 'value_type' typedef.
-  typedef double value_type;
-
-  KOKKOS_INLINE_FUNCTION
-  void operator() (int i, double &lsum) const {
-    lsum += a(i,0)*a(i,1)/(a(i,2)+0.1);
-  }
-};
-
-int main (int argc, char* argv[]) {
-  Kokkos::initialize (argc, argv);
-  const int N = 10;
-
-  // Allocate the View.  The first dimension is a run-time parameter
-  // N.  We set N = 10 here.  The second dimension is a compile-time
-  // parameter, 3.  We don't specify it here because we already set it
-  // by declaring the type of the View.
-  //
-  // Views get initialized to zero by default.  This happens in
-  // parallel, using the View's memory space's default execution
-  // space.  Parallel initialization ensures first-touch allocation.
-  // Default initialization can be shut off by constructing the View with
-  // Kokkos::ViewAllocateWithoutInitializing ("A") in place of the plain label.
-  //
-  // You may NOT allocate a View inside of a parallel_{for, reduce,
-  // scan}.  Treat View allocation as a "thread collective."
-  //
-  // The string "A" is just the label; it only matters for debugging.
-  // Different Views may have the same label.
-  view_type a ("A", N);
-
-  Kokkos::parallel_for (N, InitView (a));
-  double sum = 0;
-  Kokkos::parallel_reduce (N, ReduceFunctor (a), sum);
-  printf ("Result: %f\n", sum);
-  Kokkos::finalize ();
-}
-
--- a/lib/kokkos/example/tutorial/03_simple_view_lambda/CMakeLists.txt
+++ b/lib/kokkos/example/tutorial/03_simple_view_lambda/CMakeLists.txt
@ -1,12 +0,0 @@
-
-INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR})
-INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
-
-IF (Kokkos_ENABLE_CXX11)
-  # This is a tutorial, not a test, so we don't ask CTest to run it.
-  TRIBITS_ADD_EXECUTABLE(
-    tutorial_03_simple_view_lambda
-    SOURCES simple_view_lambda.cpp
-    COMM serial mpi
-    )
-ENDIF ()
--- a/lib/kokkos/example/tutorial/03_simple_view_lambda/Makefile
+++ b/lib/kokkos/example/tutorial/03_simple_view_lambda/Makefile
@ -1,44 +0,0 @@
-KOKKOS_PATH = ../../..
-SRC = $(wildcard *.cpp)
-
-default: build
-	echo "Start Build"
-
-ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES)))
-CXX = nvcc_wrapper
-CXXFLAGS = -O3
-LINK = ${CXX}
-LINKFLAGS = 
-EXE = $(SRC:.cpp=.cuda)
-KOKKOS_DEVICES = "Cuda,OpenMP"
-KOKKOS_ARCH = "SNB,Kepler35"
-KOKKOS_CUDA_OPTIONS = "enable_lambda"
-else
-CXX = g++
-CXXFLAGS = -O3
-LINK = ${CXX}
-LINKFLAGS =  
-EXE = $(SRC:.cpp=.host)
-KOKKOS_DEVICES = "OpenMP"
-KOKKOS_ARCH = "SNB"
-endif
-
-DEPFLAGS = -M
-
-OBJ = $(SRC:.cpp=.o)
-LIB =
-
-include $(KOKKOS_PATH)/Makefile.kokkos
-
-build: $(EXE)
-
-$(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS)
-	$(LINK) $(KOKKOS_LDFLAGS) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE)
-
-clean: kokkos-clean 
-	rm -f *.o *.cuda *.host
-
-# Compilation rules
-
-%.o:%.cpp $(KOKKOS_CPP_DEPENDS)
-	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $<
--- a/Show More
+++ b/Show More