lammps/lib/kokkos/example/multi_fem/ExplicitFunctors.hpp

/*
//@HEADER
// ************************************************************************
//
//                        Kokkos v. 2.0
//              Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
//
// ************************************************************************
//@HEADER
*/

#ifndef KOKKOS_EXPLICITFUNCTORS_HPP
#define KOKKOS_EXPLICITFUNCTORS_HPP

#include <cmath>
#include <Kokkos_Core.hpp>
#include <FEMesh.hpp>

namespace Explicit {

template<typename Scalar , class Device >
struct Fields {

  static const int NumStates     = 2 ;
  static const int SpatialDim    = 3 ;
  static const int ElemNodeCount = 8 ;

  // Indices for full 3x3 tensor:

  static const int K_F_XX = 0 ;
  static const int K_F_YY = 1 ;
  static const int K_F_ZZ = 2 ;
  static const int K_F_XY = 3 ;
  static const int K_F_YZ = 4 ;
  static const int K_F_ZX = 5 ;
  static const int K_F_YX = 6 ;
  static const int K_F_ZY = 7 ;
  static const int K_F_XZ = 8 ;

  //  Indexes into a 3 by 3 symmetric tensor stored as a length 6 vector

  static const int K_S_XX = 0 ;
  static const int K_S_YY = 1 ;
  static const int K_S_ZZ = 2 ;
  static const int K_S_XY = 3 ;
  static const int K_S_YZ = 4 ;
  static const int K_S_ZX = 5 ;
  static const int K_S_YX = 3 ;
  static const int K_S_ZY = 4 ;
  static const int K_S_XZ = 5 ;

  //  Indexes into a 3 by 3 skew symmetric tensor stored as a length 3 vector

  static const int K_V_XY = 0 ;
  static const int K_V_YZ = 1 ;
  static const int K_V_ZX = 2 ;


  typedef Device                           execution_space ;
  typedef typename execution_space::size_type  size_type ;

  typedef HybridFEM::FEMesh<double,ElemNodeCount,execution_space>  FEMesh ;

  typedef typename FEMesh::node_coords_type    node_coords_type ;
  typedef typename FEMesh::elem_node_ids_type  elem_node_ids_type ;
  typedef typename FEMesh::node_elem_ids_type  node_elem_ids_type ;
  typedef typename Kokkos::ParallelDataMap   parallel_data_map ;

  typedef Kokkos::View< double[][ SpatialDim ][ NumStates ] , execution_space > geom_state_array_type ;
  typedef Kokkos::View< Scalar[][ SpatialDim ] , execution_space > geom_array_type ;
  typedef Kokkos::View< Scalar[] ,               execution_space > array_type ;
  typedef Kokkos::View< Scalar ,                 execution_space >  scalar_type ;

  typedef Kokkos::View< Scalar[][  6 ] ,    execution_space >  elem_sym_tensor_type ;
  typedef Kokkos::View< Scalar[][  9 ] ,    execution_space >  elem_tensor_type ;
  typedef Kokkos::View< Scalar[][  9 ][ NumStates ] , execution_space >  elem_tensor_state_type ;
  typedef Kokkos::View< Scalar[][ SpatialDim ][ ElemNodeCount ] , execution_space > elem_node_geom_type ;

  // Parameters:
  const int num_nodes ;
  const int num_nodes_owned ;
  const int num_elements ;

  const Scalar  lin_bulk_visc;
  const Scalar  quad_bulk_visc;
  const Scalar  two_mu;
  const Scalar  bulk_modulus;
  const Scalar  density;

  // Mesh:
  const elem_node_ids_type  elem_node_connectivity ;
  const node_elem_ids_type  node_elem_connectivity ;
  const node_coords_type    model_coords ;

  // Compute:
  const scalar_type                dt ;
  const scalar_type                prev_dt ;
  const geom_state_array_type      displacement ;
  const geom_state_array_type      velocity ;
  const geom_array_type            acceleration ;
  const geom_array_type            internal_force ;
  const array_type                 nodal_mass ;
  const array_type                 elem_mass ;
  const array_type                 internal_energy ;
  const elem_sym_tensor_type       stress_new ;
  const elem_tensor_state_type     rotation ;
  const elem_node_geom_type        element_force ;
  const elem_tensor_type           vel_grad ;
  const elem_sym_tensor_type       stretch ;
  const elem_sym_tensor_type       rot_stretch ;

  Fields(
      const FEMesh & mesh,
      Scalar arg_lin_bulk_visc,
      Scalar arg_quad_bulk_visc,
      Scalar youngs_modulus,
      Scalar poissons_ratio,
      Scalar arg_density )
    : num_nodes(       mesh.parallel_data_map.count_owned +
                       mesh.parallel_data_map.count_receive )
    , num_nodes_owned( mesh.parallel_data_map.count_owned )
    , num_elements(    mesh.elem_node_ids.dimension_0() )
    , lin_bulk_visc(  arg_lin_bulk_visc )
    , quad_bulk_visc( arg_quad_bulk_visc )
    , two_mu(youngs_modulus/(1.0+poissons_ratio))
    , bulk_modulus(youngs_modulus/(3*(1.0-2.0*poissons_ratio)))
    , density(arg_density)

    // mesh

    , elem_node_connectivity( mesh.elem_node_ids ) // ( num_elements , ElemNodeCount )
    , node_elem_connectivity( mesh.node_elem_ids ) // ( num_nodes , ... )
    , model_coords(  mesh.node_coords )            // ( num_nodes , 3 )

    // compute with input/output

    , dt(              "dt" )
    , prev_dt(         "prev_dt" )
    , displacement(    "displacement" ,   num_nodes )
    , velocity(        "velocity" ,       num_nodes )
    , acceleration(    "acceleration" ,   num_nodes_owned )
    , internal_force(  "internal_force" , num_nodes_owned )
    , nodal_mass(      "nodal_mass" ,     num_nodes_owned )
    , elem_mass(       "elem_mass" ,       num_elements )
    , internal_energy( "internal_energy" , num_elements )
    , stress_new(      "stress_new" ,      num_elements )

    // temporary arrays

    , rotation(      "rotation" ,  num_elements )
    , element_force( "element_force" ,  num_elements )
    , vel_grad(      "vel_grad" , num_elements )
    , stretch(       "stretch" , num_elements )
    , rot_stretch(   "rot_stretch" , num_elements )
  { }
};


//----------------------------------------------------------------------------

template< typename Scalar , class DeviceType >
KOKKOS_INLINE_FUNCTION
Scalar dot8( const Scalar * a , const Scalar * b )
{ return a[0] * b[0] + a[1] * b[1] + a[2] * b[2] + a[3] * b[3] +
         a[4] * b[4] + a[5] * b[5] + a[6] * b[6] + a[7] * b[7] ; }

template< typename Scalar , class DeviceType >
KOKKOS_INLINE_FUNCTION
void comp_grad( const Scalar * const x ,
                const Scalar * const y ,
                const Scalar * const z,
                Scalar * const grad_x ,
                Scalar * const grad_y ,
                Scalar * const grad_z )
{
  //  calc X difference vectors

  Scalar R42=(x[3] - x[1]);
  Scalar R52=(x[4] - x[1]);
  Scalar R54=(x[4] - x[3]);

  Scalar R63=(x[5] - x[2]);
  Scalar R83=(x[7] - x[2]);
  Scalar R86=(x[7] - x[5]);

  Scalar R31=(x[2] - x[0]);
  Scalar R61=(x[5] - x[0]);
  Scalar R74=(x[6] - x[3]);

  Scalar R72=(x[6] - x[1]);
  Scalar R75=(x[6] - x[4]);
  Scalar R81=(x[7] - x[0]);

  Scalar t1=(R63 + R54);
  Scalar t2=(R61 + R74);
  Scalar t3=(R72 + R81);

  Scalar t4 =(R86 + R42);
  Scalar t5 =(R83 + R52);
  Scalar t6 =(R75 + R31);

  //  Calculate Y gradient from X and Z data

  grad_y[0] = (z[1] *  t1) - (z[2] * R42) - (z[3] *  t5)  + (z[4] *  t4) + (z[5] * R52) - (z[7] * R54);
  grad_y[1] = (z[2] *  t2) + (z[3] * R31) - (z[0] *  t1)  - (z[5] *  t6) + (z[6] * R63) - (z[4] * R61);
  grad_y[2] = (z[3] *  t3) + (z[0] * R42) - (z[1] *  t2)  - (z[6] *  t4) + (z[7] * R74) - (z[5] * R72);
  grad_y[3] = (z[0] *  t5) - (z[1] * R31) - (z[2] *  t3)  + (z[7] *  t6) + (z[4] * R81) - (z[6] * R83);
  grad_y[4] = (z[5] *  t3) + (z[6] * R86) - (z[7] *  t2)  - (z[0] *  t4) - (z[3] * R81) + (z[1] * R61);
  grad_y[5] = (z[6] *  t5) - (z[4] *  t3)  - (z[7] * R75) + (z[1] *  t6) - (z[0] * R52) + (z[2] * R72);
  grad_y[6] = (z[7] *  t1) - (z[5] *  t5)  - (z[4] * R86) + (z[2] *  t4) - (z[1] * R63) + (z[3] * R83);
  grad_y[7] = (z[4] *  t2) - (z[6] *  t1)  + (z[5] * R75) - (z[3] *  t6) - (z[2] * R74) + (z[0] * R54);

  //   calc Z difference vectors

  R42=(z[3] - z[1]);
  R52=(z[4] - z[1]);
  R54=(z[4] - z[3]);

  R63=(z[5] - z[2]);
  R83=(z[7] - z[2]);
  R86=(z[7] - z[5]);

  R31=(z[2] - z[0]);
  R61=(z[5] - z[0]);
  R74=(z[6] - z[3]);

  R72=(z[6] - z[1]);
  R75=(z[6] - z[4]);
  R81=(z[7] - z[0]);

  t1=(R63 + R54);
  t2=(R61 + R74);
  t3=(R72 + R81);

  t4 =(R86 + R42);
  t5 =(R83 + R52);
  t6 =(R75 + R31);

  //  Calculate X gradient from Y and Z data

  grad_x[0] = (y[1] *  t1) - (y[2] * R42) - (y[3] *  t5) + (y[4] *  t4) + (y[5] * R52) - (y[7] * R54);
  grad_x[1] = (y[2] *  t2) + (y[3] * R31) - (y[0] *  t1) - (y[5] *  t6) + (y[6] * R63) - (y[4] * R61);
  grad_x[2] = (y[3] *  t3) + (y[0] * R42) - (y[1] *  t2) - (y[6] *  t4) + (y[7] * R74) - (y[5] * R72);
  grad_x[3] = (y[0] *  t5) - (y[1] * R31) - (y[2] *  t3) + (y[7] *  t6) + (y[4] * R81) - (y[6] * R83);
  grad_x[4] = (y[5] *  t3) + (y[6] * R86) - (y[7] *  t2) - (y[0] *  t4) - (y[3] * R81) + (y[1] * R61);
  grad_x[5] = (y[6] *  t5) - (y[4] *  t3) - (y[7] * R75) + (y[1] *  t6) - (y[0] * R52) + (y[2] * R72);
  grad_x[6] = (y[7] *  t1) - (y[5] *  t5) - (y[4] * R86) + (y[2] *  t4) - (y[1] * R63) + (y[3] * R83);
  grad_x[7] = (y[4] *  t2) - (y[6] *  t1) + (y[5] * R75) - (y[3] *  t6) - (y[2] * R74) + (y[0] * R54);

  //  calc Y difference vectors

  R42=(y[3] - y[1]);
  R52=(y[4] - y[1]);
  R54=(y[4] - y[3]);

  R63=(y[5] - y[2]);
  R83=(y[7] - y[2]);
  R86=(y[7] - y[5]);

  R31=(y[2] - y[0]);
  R61=(y[5] - y[0]);
  R74=(y[6] - y[3]);

  R72=(y[6] - y[1]);
  R75=(y[6] - y[4]);
  R81=(y[7] - y[0]);

  t1=(R63 + R54);
  t2=(R61 + R74);
  t3=(R72 + R81);

  t4 =(R86 + R42);
  t5 =(R83 + R52);
  t6 =(R75 + R31);

  //  Calculate Z gradient from X and Y data

  grad_z[0] = (x[1] *  t1) - (x[2] * R42) - (x[3] *  t5)  + (x[4] *  t4) + (x[5] * R52) - (x[7] * R54);
  grad_z[1] = (x[2] *  t2) + (x[3] * R31) - (x[0] *  t1)  - (x[5] *  t6) + (x[6] * R63) - (x[4] * R61);
  grad_z[2] = (x[3] *  t3) + (x[0] * R42) - (x[1] *  t2)  - (x[6] *  t4) + (x[7] * R74) - (x[5] * R72);
  grad_z[3] = (x[0] *  t5) - (x[1] * R31) - (x[2] *  t3)  + (x[7] *  t6) + (x[4] * R81) - (x[6] * R83);
  grad_z[4] = (x[5] *  t3) + (x[6] * R86) - (x[7] *  t2)  - (x[0] *  t4) - (x[3] * R81) + (x[1] * R61);
  grad_z[5] = (x[6] *  t5) - (x[4] *  t3)  - (x[7] * R75) + (x[1] *  t6) - (x[0] * R52) + (x[2] * R72);
  grad_z[6] = (x[7] *  t1) - (x[5] *  t5)  - (x[4] * R86) + (x[2] *  t4) - (x[1] * R63) + (x[3] * R83);
  grad_z[7] = (x[4] *  t2) - (x[6] *  t1)  + (x[5] * R75) - (x[3] *  t6) - (x[2] * R74) + (x[0] * R54);
}

//----------------------------------------------------------------------------

template< typename Scalar , class DeviceType >
struct initialize_element
{
  typedef DeviceType     execution_space ;

  typedef Explicit::Fields< Scalar , execution_space > Fields ;

  typename Fields::elem_node_ids_type      elem_node_connectivity ;
  typename Fields::node_coords_type        model_coords ;
  typename Fields::elem_sym_tensor_type    stretch ;
  typename Fields::elem_tensor_state_type  rotation ;
  typename Fields::array_type              elem_mass ;

  const Scalar density ;

  initialize_element( const Fields & mesh_fields )
    : elem_node_connectivity( mesh_fields.elem_node_connectivity )
    , model_coords(           mesh_fields.model_coords )
    , stretch(                mesh_fields.stretch )
    , rotation(               mesh_fields.rotation )
    , elem_mass(              mesh_fields.elem_mass )
    , density(                mesh_fields.density )
    {}

  KOKKOS_INLINE_FUNCTION
  void operator()( int ielem )const
  {
    const int K_XX = 0 ;
    const int K_YY = 1 ;
    const int K_ZZ = 2 ;
    const Scalar ONE12TH = 1.0 / 12.0 ;

    Scalar x[ Fields::ElemNodeCount ];
    Scalar y[ Fields::ElemNodeCount ];
    Scalar z[ Fields::ElemNodeCount ];
    Scalar grad_x[ Fields::ElemNodeCount ];
    Scalar grad_y[ Fields::ElemNodeCount ];
    Scalar grad_z[ Fields::ElemNodeCount ];

    for ( int i = 0 ; i < Fields::ElemNodeCount ; ++i ) {
      const int n = elem_node_connectivity( ielem , i );

      x[i]  = model_coords( n , 0 );
      y[i]  = model_coords( n , 1 );
      z[i]  = model_coords( n , 2 );
    }

    comp_grad<Scalar,execution_space>( x, y, z, grad_x, grad_y, grad_z);

    stretch(ielem,K_XX) = 1 ;
    stretch(ielem,K_YY) = 1 ;
    stretch(ielem,K_ZZ) = 1 ;

    rotation(ielem,K_XX,0) = 1 ;
    rotation(ielem,K_YY,0) = 1 ;
    rotation(ielem,K_ZZ,0) = 1 ;

    rotation(ielem,K_XX,1) = 1 ;
    rotation(ielem,K_YY,1) = 1 ;
    rotation(ielem,K_ZZ,1) = 1 ;

    elem_mass(ielem) = ONE12TH * density *
                                 dot8<Scalar,execution_space>( x , grad_x );
  }

  static void apply( const Fields & mesh_fields )
  {
    initialize_element op( mesh_fields );
    Kokkos::parallel_for( mesh_fields.num_elements , op );
  }
};


template<typename Scalar , class DeviceType >
struct initialize_node
{
  typedef DeviceType     execution_space ;

  typedef Explicit::Fields< Scalar , execution_space > Fields ;

  typename Fields::node_elem_ids_type      node_elem_connectivity ;
  typename Fields::array_type              nodal_mass ;
  typename Fields::array_type              elem_mass ;

  static const int ElemNodeCount = Fields::ElemNodeCount ;

  initialize_node( const Fields & mesh_fields )
    : node_elem_connectivity( mesh_fields.node_elem_connectivity )
    , nodal_mass(             mesh_fields.nodal_mass )
    , elem_mass(              mesh_fields.elem_mass )
    {}


  KOKKOS_INLINE_FUNCTION
  void operator()( int inode )const
  {
    const int begin = node_elem_connectivity.row_map[inode];
    const int end   = node_elem_connectivity.row_map[inode+1];

    Scalar node_mass = 0;

    for(int i = begin; i != end; ++i) {
      const int elem_id = node_elem_connectivity.entries( i , 0 );
      node_mass += elem_mass(elem_id);
    }

    nodal_mass(inode) = node_mass / ElemNodeCount ;
  }

  static void apply( const Fields & mesh_fields )
  {
    initialize_node op( mesh_fields );
    Kokkos::parallel_for( mesh_fields.num_nodes_owned , op );
  }
};

//----------------------------------------------------------------------------


template<typename Scalar, class DeviceType >
struct grad
{
  typedef DeviceType execution_space ;

  typedef Explicit::Fields< Scalar , execution_space >  Fields ;

  static const int ElemNodeCount = Fields::ElemNodeCount ;

  static const int K_F_XX = Fields::K_F_XX ;
  static const int K_F_YY = Fields::K_F_YY ;
  static const int K_F_ZZ = Fields::K_F_ZZ ;
  static const int K_F_XY = Fields::K_F_XY ;
  static const int K_F_YZ = Fields::K_F_YZ ;
  static const int K_F_ZX = Fields::K_F_ZX ;
  static const int K_F_YX = Fields::K_F_YX ;
  static const int K_F_ZY = Fields::K_F_ZY ;
  static const int K_F_XZ = Fields::K_F_XZ ;

  // Global arrays used by this functor.

  const typename Fields::elem_node_ids_type     elem_node_connectivity ;
  const typename Fields::node_coords_type       model_coords ;
  const typename Fields::geom_state_array_type  displacement ;
  const typename Fields::geom_state_array_type  velocity ;
  const typename Fields::elem_tensor_type       vel_grad ;
  const typename Fields::scalar_type            dt ;

  const int  current_state;
  const int  previous_state;

  // Constructor on the Host to populate this device functor.
  // All array view copies are shallow.
  grad( const Fields &  fields,
        const int arg_current_state,
        const int arg_previous_state)
    : elem_node_connectivity( fields.elem_node_connectivity)
    , model_coords( fields.model_coords)
    , displacement( fields.displacement)
    , velocity( fields.velocity)
    , vel_grad( fields.vel_grad)
    , dt(  fields.dt)
    , current_state(arg_current_state)
    , previous_state(arg_previous_state)
    { }

  //--------------------------------------------------------------------------

    //   Calculate Velocity Gradients
    KOKKOS_INLINE_FUNCTION
    void v_grad(  int ielem,
      Scalar * vx,       Scalar * vy,       Scalar * vz,
      Scalar * grad_x,     Scalar * grad_y,     Scalar * grad_z,
      Scalar inv_vol) const
    {
      const int K_F_XX = Fields::K_F_XX ;
      const int K_F_YY = Fields::K_F_YY ;
      const int K_F_ZZ = Fields::K_F_ZZ ;
      const int K_F_XY = Fields::K_F_XY ;
      const int K_F_YZ = Fields::K_F_YZ ;
      const int K_F_ZX = Fields::K_F_ZX ;
      const int K_F_YX = Fields::K_F_YX ;
      const int K_F_ZY = Fields::K_F_ZY ;
      const int K_F_XZ = Fields::K_F_XZ ;

      vel_grad(ielem, K_F_XX) = inv_vol * dot8<Scalar,execution_space>( vx , grad_x );
      vel_grad(ielem, K_F_YX) = inv_vol * dot8<Scalar,execution_space>( vy , grad_x );
      vel_grad(ielem, K_F_ZX) = inv_vol * dot8<Scalar,execution_space>( vz , grad_x );

      vel_grad(ielem, K_F_XY) = inv_vol * dot8<Scalar,execution_space>( vx , grad_y );
      vel_grad(ielem, K_F_YY) = inv_vol * dot8<Scalar,execution_space>( vy , grad_y );
      vel_grad(ielem, K_F_ZY) = inv_vol * dot8<Scalar,execution_space>( vz , grad_y );

      vel_grad(ielem, K_F_XZ) = inv_vol * dot8<Scalar,execution_space>( vx , grad_z );
      vel_grad(ielem, K_F_YZ) = inv_vol * dot8<Scalar,execution_space>( vy , grad_z );
      vel_grad(ielem, K_F_ZZ) = inv_vol * dot8<Scalar,execution_space>( vz , grad_z );
    }

  //--------------------------------------------------------------------------
  // Functor operator() which calls the three member functions.


  KOKKOS_INLINE_FUNCTION
  void operator()( int ielem )const
  {
    const int X = 0 ;
    const int Y = 1 ;
    const int Z = 2 ;
    const Scalar dt_scale = -0.5 * dt();

    //  declare and reuse local data for frequently accessed data to
    //  reduce global memory reads and writes.

    Scalar      x[8],      y[8],      z[8];
    Scalar     vx[8],     vy[8],     vz[8];
    Scalar grad_x[8], grad_y[8], grad_z[8];

    // Read global velocity once and use many times
    // via local registers / L1 cache.
    //  store the velocity information in local memory before using,
    //  so it can be returned for other functions to use

    // Read global coordinates and velocity once and use many times
    // via local registers / L1 cache.
    // load X coordinate information and move by half time step

    for ( int i = 0 ; i < ElemNodeCount ; ++i ) {
      const int n = elem_node_connectivity( ielem , i );

      vx[i] = velocity( n , X , current_state );
      vy[i] = velocity( n , Y , current_state );
      vz[i] = velocity( n , Z , current_state );

      x[i]  = model_coords( n , X ) +
              displacement( n , X , current_state ) +
              dt_scale * vx[i];

      y[i]  = model_coords( n , Y ) +
              displacement( n , Y , current_state ) +
              dt_scale * vy[i];

      z[i]  = model_coords( n , Z ) +
              displacement( n , Z , current_state ) +
              dt_scale * vz[i];
    }

    comp_grad<Scalar,execution_space>( x, y, z, grad_x, grad_y, grad_z);

    //  Calculate hexahedral volume from x model_coords and gradient information

    const Scalar inv_vol = 1.0 / dot8<Scalar,execution_space>( x , grad_x );

    v_grad(ielem, vx, vy, vz, grad_x, grad_y, grad_z, inv_vol);
  }

  static void apply( const Fields & fields ,
                     const int arg_current_state ,
                     const int arg_previous_state )
  {
    grad op( fields, arg_current_state , arg_previous_state );
    Kokkos::parallel_for( fields.num_elements , op );
  }
};

//----------------------------------------------------------------------------

template<typename Scalar, class DeviceType >
struct decomp_rotate
{
  typedef DeviceType execution_space ;

  typedef Explicit::Fields< Scalar , execution_space >  Fields ;

  static const int ElemNodeCount = Fields::ElemNodeCount ;

  static const int K_F_XX = Fields::K_F_XX ;
  static const int K_F_YY = Fields::K_F_YY ;
  static const int K_F_ZZ = Fields::K_F_ZZ ;
  static const int K_F_XY = Fields::K_F_XY ;
  static const int K_F_YZ = Fields::K_F_YZ ;
  static const int K_F_ZX = Fields::K_F_ZX ;
  static const int K_F_YX = Fields::K_F_YX ;
  static const int K_F_ZY = Fields::K_F_ZY ;
  static const int K_F_XZ = Fields::K_F_XZ ;

  static const int K_S_XX = Fields::K_S_XX ;
  static const int K_S_YY = Fields::K_S_YY ;
  static const int K_S_ZZ = Fields::K_S_ZZ ;
  static const int K_S_XY = Fields::K_S_XY ;
  static const int K_S_YZ = Fields::K_S_YZ ;
  static const int K_S_ZX = Fields::K_S_ZX ;
  static const int K_S_YX = Fields::K_S_YX ;
  static const int K_S_ZY = Fields::K_S_ZY ;
  static const int K_S_XZ = Fields::K_S_XZ ;

  static const int K_V_XY = Fields::K_V_XY ;
  static const int K_V_YZ = Fields::K_V_YZ ;
  static const int K_V_ZX = Fields::K_V_ZX ;

  // Global arrays used by this functor.

  const typename Fields::elem_tensor_state_type     rotation ;
  const typename Fields::elem_tensor_type           vel_grad ;
  const typename Fields::elem_sym_tensor_type       stretch ;
  const typename Fields::elem_sym_tensor_type       rot_stretch ;
  const typename Fields::scalar_type                dt_value ;

  const int  current_state;
  const int  previous_state;

  decomp_rotate( const Fields & mesh_fields ,
                 const int arg_current_state,
                 const int arg_previous_state)
    : rotation(    mesh_fields.rotation )
    , vel_grad(    mesh_fields.vel_grad )
    , stretch(     mesh_fields.stretch )
    , rot_stretch( mesh_fields.rot_stretch )
    , dt_value(    mesh_fields.dt)
    , current_state( arg_current_state)
    , previous_state(arg_previous_state)
    {}

  static void apply( const Fields & mesh_fields ,
                     const int arg_current_state ,
                     const int arg_previous_state )
  {
    decomp_rotate op( mesh_fields , arg_current_state , arg_previous_state );
    Kokkos::parallel_for( mesh_fields.num_elements , op );
  }


  KOKKOS_INLINE_FUNCTION
  void additive_decomp(int ielem, Scalar * v_gr, Scalar * str_ten) const
  {
    //  In addition to calculating stretching_tensor,
    //  use this as an opportunity to load global
    //  variables into a local space

    for ( int i = 0 ; i < 9 ; ++i ) {
      v_gr[i] = vel_grad( ielem , i );
    }

    //
    //  Symmetric part
    //
    str_ten[K_S_XX] = v_gr[K_F_XX];
    str_ten[K_S_YY] = v_gr[K_F_YY];
    str_ten[K_S_ZZ] = v_gr[K_F_ZZ];
    str_ten[K_S_XY] = 0.5*(v_gr[K_F_XY] + v_gr[K_F_YX]);
    str_ten[K_S_YZ] = 0.5*(v_gr[K_F_YZ] + v_gr[K_F_ZY]);
    str_ten[K_S_ZX] = 0.5*(v_gr[K_F_ZX] + v_gr[K_F_XZ]);
  }

  KOKKOS_INLINE_FUNCTION
  void polar_decomp(int ielem, Scalar * v_gr, Scalar * str_ten, Scalar * str, Scalar * vort, Scalar * rot_old, Scalar * rot_new)const
  {
    const Scalar dt = dt_value();
    const Scalar dt_half = 0.5 * dt;

    //  Skew Symmetric part
    vort[K_V_XY] = 0.5*(v_gr[K_F_XY] - v_gr[K_F_YX]);
    vort[K_V_YZ] = 0.5*(v_gr[K_F_YZ] - v_gr[K_F_ZY]);
    vort[K_V_ZX] = 0.5*(v_gr[K_F_ZX] - v_gr[K_F_XZ]);

    //   calculate the rates of rotation via gauss elimination.
    for ( int i = 0 ; i < 6 ; ++i ) {
      str[i] = stretch(ielem, i);
    }

    Scalar z1 = str_ten[K_S_XY] * str[K_S_ZX] -
                str_ten[K_S_ZX] * str[K_S_XY] +
                str_ten[K_S_YY] * str[K_S_YZ] -
                str_ten[K_S_YZ] * str[K_S_YY] +
                str_ten[K_S_YZ] * str[K_S_ZZ] -
                str_ten[K_S_ZZ] * str[K_S_YZ];

    Scalar z2 = str_ten[K_S_ZX] * str[K_S_XX] -
                str_ten[K_S_XX] * str[K_S_ZX] +
                str_ten[K_S_YZ] * str[K_S_XY] -
                str_ten[K_S_XY] * str[K_S_YZ] +
                str_ten[K_S_ZZ] * str[K_S_ZX] -
                str_ten[K_S_ZX] * str[K_S_ZZ];

    Scalar z3 = str_ten[K_S_XX] * str[K_S_XY] -
                str_ten[K_S_XY] * str[K_S_XX] +
                str_ten[K_S_XY] * str[K_S_YY] -
                str_ten[K_S_YY] * str[K_S_XY] +
                str_ten[K_S_ZX] * str[K_S_YZ] -
                str_ten[K_S_YZ] * str[K_S_ZX];

  //   forward elimination
    const Scalar a1inv = 1.0 / (str[K_S_YY] + str[K_S_ZZ]);

    const Scalar a4BYa1 = -1 * str[K_S_XY] * a1inv;

    const Scalar a2inv = 1.0 / (str[K_S_ZZ] + str[K_S_XX] + str[K_S_XY] * a4BYa1);

    const Scalar a5 =  -str[K_S_YZ] + str[K_S_ZX] * a4BYa1;

    z2 -= z1 * a4BYa1;
    Scalar a6BYa1 = -1 * str[K_S_ZX] * a1inv;
    const Scalar a5BYa2 = a5 * a2inv;
    z3 -= z1 * a6BYa1 - z2 * a5BYa2;

  //   backward substitution -
    z3 /= (str[K_S_XX] + str[K_S_YY] + str[K_S_ZX] * a6BYa1 + a5 * a5BYa2);
    z2 = (z2 - a5 * z3) * a2inv;
    z1 = (z1*a1inv - a6BYa1 * z3 -a4BYa1 * z2);

  //   calculate rotation rates - recall that spin_rate is an asymmetric tensor,
  //   so compute spin rate vector as dual of spin rate tensor,
  //   i.e   w_i = e_ijk * spin_rate_jk
    z1 += vort[K_V_YZ];
    z2 += vort[K_V_ZX];
    z3 += vort[K_V_XY];

  //   update rotation tensor:
  //  1) premultiply old rotation tensor to get right-hand side.

    for ( int i = 0 ; i < 9 ; ++i ) {
      rot_old[i] = rotation(ielem, i, previous_state);
    }

    Scalar r_XX = rot_old[K_F_XX] + dt_half*( z3 * rot_old[K_F_YX] - z2 * rot_old[K_F_ZX] );
    Scalar r_YX = rot_old[K_F_YX] + dt_half*( z1 * rot_old[K_F_ZX] - z3 * rot_old[K_F_XX] );
    Scalar r_ZX = rot_old[K_F_ZX] + dt_half*( z2 * rot_old[K_F_XX] - z1 * rot_old[K_F_YX] );
    Scalar r_XY = rot_old[K_F_XY] + dt_half*( z3 * rot_old[K_F_YY] - z2 * rot_old[K_F_ZY] );
    Scalar r_YY = rot_old[K_F_YY] + dt_half*( z1 * rot_old[K_F_ZY] - z3 * rot_old[K_F_XY] );
    Scalar r_ZY = rot_old[K_F_ZY] + dt_half*( z2 * rot_old[K_F_XY] - z1 * rot_old[K_F_YY] );
    Scalar r_XZ = rot_old[K_F_XZ] + dt_half*( z3 * rot_old[K_F_YZ] - z2 * rot_old[K_F_ZZ] );
    Scalar r_YZ = rot_old[K_F_YZ] + dt_half*( z1 * rot_old[K_F_ZZ] - z3 * rot_old[K_F_XZ] );
    Scalar r_ZZ = rot_old[K_F_ZZ] + dt_half*( z2 * rot_old[K_F_XZ] - z1 * rot_old[K_F_YZ] );


  //  2) solve for new rotation tensor via gauss elimination.
  //   forward elimination -
    Scalar a12 = - dt_half * z3;
    Scalar a13 =   dt_half * z2;
    Scalar b32 = - dt_half * z1;
    Scalar a22inv = 1.0 / (1.0 + a12 * a12);

    Scalar a13a12 = a13*a12;
    Scalar a23 = b32 + a13a12;
    r_YX += r_XX * a12;
    r_YY += r_XY * a12;
    r_YZ += r_XZ * a12;


    b32 = (b32 - a13a12) * a22inv;
    r_ZX += r_XX * a13 + r_YX * b32;
    r_ZY += r_XY * a13 + r_YY * b32;
    r_ZZ += r_XZ * a13 + r_YZ * b32;


  //   backward substitution -
    const Scalar a33inv = 1.0 / (1.0 + a13 * a13 + a23 * b32);

    rot_new[K_F_ZX] = r_ZX * a33inv;
    rot_new[K_F_ZY] = r_ZY * a33inv;
    rot_new[K_F_ZZ] = r_ZZ * a33inv;
    rot_new[K_F_YX] = ( r_YX - rot_new[K_F_ZX] * a23 ) * a22inv;
    rot_new[K_F_YY] = ( r_YY - rot_new[K_F_ZY] * a23 ) * a22inv;
    rot_new[K_F_YZ] = ( r_YZ - rot_new[K_F_ZZ] * a23 ) * a22inv;
    rot_new[K_F_XX] = r_XX - rot_new[K_F_ZX] * a13 - rot_new[K_F_YX] * a12;
    rot_new[K_F_XY] = r_XY - rot_new[K_F_ZY] * a13 - rot_new[K_F_YY] * a12;
    rot_new[K_F_XZ] = r_XZ - rot_new[K_F_ZZ] * a13 - rot_new[K_F_YZ] * a12;

    for ( int i = 0 ; i < 9 ; ++i ) {
      rotation(ielem, i, current_state) = rot_new[i] ;
    }

  //   update stretch tensor in the new configuration -
    const Scalar a1 = str_ten[K_S_XY] + vort[K_V_XY];
    const Scalar a2 = str_ten[K_S_YZ] + vort[K_V_YZ];
    const Scalar a3 = str_ten[K_S_ZX] + vort[K_V_ZX];
    const Scalar b1 = str_ten[K_S_ZX] - vort[K_V_ZX];
    const Scalar b2 = str_ten[K_S_XY] - vort[K_V_XY];
    const Scalar b3 = str_ten[K_S_YZ] - vort[K_V_YZ];

    const Scalar s_XX = str[K_S_XX];
    const Scalar s_YY = str[K_S_YY];
    const Scalar s_ZZ = str[K_S_ZZ];
    const Scalar s_XY = str[K_S_XY];
    const Scalar s_YZ = str[K_S_YZ];
    const Scalar s_ZX = str[K_S_ZX];

    str[K_S_XX] += dt * (str_ten[K_S_XX] * s_XX + ( a1 + z3 ) * s_XY + ( b1 - z2 ) * s_ZX);
    str[K_S_YY] += dt * (str_ten[K_S_YY] * s_YY + ( a2 + z1 ) * s_YZ + ( b2 - z3 ) * s_XY);
    str[K_S_ZZ] += dt * (str_ten[K_S_ZZ] * s_ZZ + ( a3 + z2 ) * s_ZX + ( b3 - z1 ) * s_YZ);
    str[K_S_XY] += dt * (str_ten[K_S_XX] * s_XY + ( a1 )      * s_YY + ( b1      ) * s_YZ - z3 * s_XX + z1 * s_ZX);
    str[K_S_YZ] += dt * (str_ten[K_S_YY] * s_YZ + ( a2 )      * s_ZZ + ( b2      ) * s_ZX - z1 * s_YY + z2 * s_XY);
    str[K_S_ZX] += dt * (str_ten[K_S_ZZ] * s_ZX + ( a3 )      * s_XX + ( b3      ) * s_XY - z2 * s_ZZ + z3 * s_YZ);

  }


  KOKKOS_INLINE_FUNCTION
  void rotate_tensor(int ielem, Scalar * str_ten, Scalar * str, Scalar * rot_new)const {

    Scalar t[9];
    Scalar rot_str[6]; // Rotated stretch

    t[0] = str_ten[K_S_XX]*rot_new[K_F_XX] +
           str_ten[K_S_XY]*rot_new[K_F_YX] +
           str_ten[K_S_XZ]*rot_new[K_F_ZX];

    t[1] = str_ten[K_S_YX]*rot_new[K_F_XX] +
           str_ten[K_S_YY]*rot_new[K_F_YX] +
           str_ten[K_S_YZ]*rot_new[K_F_ZX];

    t[2] = str_ten[K_S_ZX]*rot_new[K_F_XX] +
           str_ten[K_S_ZY]*rot_new[K_F_YX] +
           str_ten[K_S_ZZ]*rot_new[K_F_ZX];

    t[3] = str_ten[K_S_XX]*rot_new[K_F_XY] +
           str_ten[K_S_XY]*rot_new[K_F_YY] +
           str_ten[K_S_XZ]*rot_new[K_F_ZY];

    t[4] = str_ten[K_S_YX]*rot_new[K_F_XY] +
           str_ten[K_S_YY]*rot_new[K_F_YY] +
           str_ten[K_S_YZ]*rot_new[K_F_ZY];

    t[5] = str_ten[K_S_ZX]*rot_new[K_F_XY] +
           str_ten[K_S_ZY]*rot_new[K_F_YY] +
           str_ten[K_S_ZZ]*rot_new[K_F_ZY];

    t[6] = str_ten[K_S_XX]*rot_new[K_F_XZ] +
           str_ten[K_S_XY]*rot_new[K_F_YZ] +
           str_ten[K_S_XZ]*rot_new[K_F_ZZ];

    t[7] = str_ten[K_S_YX]*rot_new[K_F_XZ] +
           str_ten[K_S_YY]*rot_new[K_F_YZ] +
           str_ten[K_S_YZ]*rot_new[K_F_ZZ];

    t[8] = str_ten[K_S_ZX]*rot_new[K_F_XZ] +
           str_ten[K_S_ZY]*rot_new[K_F_YZ] +
           str_ten[K_S_ZZ]*rot_new[K_F_ZZ];


    rot_str[ K_S_XX ] = rot_new[K_F_XX] * t[0] +
                        rot_new[K_F_YX] * t[1] +
                        rot_new[K_F_ZX] * t[2];
    rot_str[ K_S_YY ] = rot_new[K_F_XY] * t[3] +
                        rot_new[K_F_YY] * t[4] +
                        rot_new[K_F_ZY] * t[5];
    rot_str[ K_S_ZZ ] = rot_new[K_F_XZ] * t[6] +
                        rot_new[K_F_YZ] * t[7] +
                        rot_new[K_F_ZZ] * t[8];

    rot_str[ K_S_XY ] = rot_new[K_F_XX] * t[3] +
                        rot_new[K_F_YX] * t[4] +
                        rot_new[K_F_ZX] * t[5];
    rot_str[ K_S_YZ ] = rot_new[K_F_XY] * t[6] +
                        rot_new[K_F_YY] * t[7] +
                        rot_new[K_F_ZY] * t[8];
    rot_str[ K_S_ZX ] = rot_new[K_F_XZ] * t[0] +
                        rot_new[K_F_YZ] * t[1] +
                        rot_new[K_F_ZZ] * t[2];

    for ( int i = 0 ; i < 6 ; ++i ) {
      rot_stretch(ielem, i) = rot_str[i] ;
    }

    for ( int i = 0 ; i < 6 ; ++i ) {
      stretch(ielem, i) = str[i] ;
    }
  }

  KOKKOS_INLINE_FUNCTION
  void operator()( int ielem )const {

    //   Local scratch space to avoid multiple
    //   accesses to global memory.
    Scalar str_ten[6]; // Stretching tensor
    Scalar str[6];     // Stretch
    Scalar rot_old[9]; // Rotation old
    Scalar rot_new[9]; // Rotation new
    Scalar vort[3];    // Vorticity
    Scalar v_gr[9];    // Velocity gradient

    additive_decomp(ielem, v_gr, str_ten);

    polar_decomp(ielem, v_gr, str_ten, str, vort, rot_old, rot_new);

    rotate_tensor(ielem, str_ten, str, rot_new);
  }
};

//----------------------------------------------------------------------------

template<typename Scalar, class DeviceType >
struct internal_force
{
  typedef DeviceType execution_space ;

  typedef Explicit::Fields< Scalar , execution_space >  Fields ;

  static const int ElemNodeCount = Fields::ElemNodeCount ;

  static const int K_F_XX = Fields::K_F_XX ;
  static const int K_F_YY = Fields::K_F_YY ;
  static const int K_F_ZZ = Fields::K_F_ZZ ;
  static const int K_F_XY = Fields::K_F_XY ;
  static const int K_F_YZ = Fields::K_F_YZ ;
  static const int K_F_ZX = Fields::K_F_ZX ;
  static const int K_F_YX = Fields::K_F_YX ;
  static const int K_F_ZY = Fields::K_F_ZY ;
  static const int K_F_XZ = Fields::K_F_XZ ;

  static const int K_S_XX = Fields::K_S_XX ;
  static const int K_S_YY = Fields::K_S_YY ;
  static const int K_S_ZZ = Fields::K_S_ZZ ;
  static const int K_S_XY = Fields::K_S_XY ;
  static const int K_S_YZ = Fields::K_S_YZ ;
  static const int K_S_ZX = Fields::K_S_ZX ;
  static const int K_S_YX = Fields::K_S_YX ;
  static const int K_S_ZY = Fields::K_S_ZY ;
  static const int K_S_XZ = Fields::K_S_XZ ;

  //--------------------------------------------------------------------------
  // Reduction:

  typedef Scalar value_type;

  KOKKOS_INLINE_FUNCTION
  static void init(value_type &update) {
    update = 1.0e32;
  }

  KOKKOS_INLINE_FUNCTION
  static void join( volatile value_type & update,
                    const volatile value_type & source )
  {
    update = update < source ? update : source;
  }

  // Final serial processing of reduction value:
  KOKKOS_INLINE_FUNCTION
  void final( value_type & result ) const
  {
    prev_dt() = dt() ;
    dt() = result ;
  };

  //--------------------------------------------------------------------------

  // Global arrays used by this functor.

  const typename Fields::elem_node_ids_type      elem_node_connectivity ;
  const typename Fields::node_coords_type        model_coords ;
  const typename Fields::scalar_type             dt ;
  const typename Fields::scalar_type             prev_dt ;
  const typename Fields::geom_state_array_type   displacement ;
  const typename Fields::geom_state_array_type   velocity ;
  const typename Fields::array_type              elem_mass ;
  const typename Fields::array_type              internal_energy ;
  const typename Fields::elem_sym_tensor_type    stress_new ;
  const typename Fields::elem_node_geom_type     element_force ;
  const typename Fields::elem_tensor_state_type  rotation ;
  const typename Fields::elem_sym_tensor_type    rot_stretch ;

  const Scalar     two_mu;
  const Scalar     bulk_modulus;
  const Scalar     lin_bulk_visc;
  const Scalar     quad_bulk_visc;
  const Scalar     user_dt;
  const int        current_state;

  internal_force( const Fields & mesh_fields,
                  const Scalar arg_user_dt,
                  const int arg_current_state )
    : elem_node_connectivity( mesh_fields.elem_node_connectivity )
    , model_coords(           mesh_fields.model_coords )
    , dt(                     mesh_fields.dt )
    , prev_dt(                mesh_fields.prev_dt )
    , displacement(           mesh_fields.displacement )
    , velocity(               mesh_fields.velocity )
    , elem_mass(              mesh_fields.elem_mass )
    , internal_energy(        mesh_fields.internal_energy )
    , stress_new(             mesh_fields.stress_new )
    , element_force(          mesh_fields.element_force )
    , rotation(               mesh_fields.rotation )
    , rot_stretch(            mesh_fields.rot_stretch )
    , two_mu(                 mesh_fields.two_mu )
    , bulk_modulus(           mesh_fields.bulk_modulus )
    , lin_bulk_visc(          mesh_fields.lin_bulk_visc )
    , quad_bulk_visc(         mesh_fields.quad_bulk_visc )
    , user_dt(       arg_user_dt )
    , current_state( arg_current_state )
  {}

  static void apply( const Fields & mesh_fields ,
                     const Scalar arg_user_dt,
                     const int arg_current_state )
  {
    internal_force  op_force( mesh_fields , arg_user_dt , arg_current_state );

    Kokkos::parallel_reduce( mesh_fields.num_elements, op_force );
  }

  //--------------------------------------------------------------------------

  KOKKOS_INLINE_FUNCTION
  void rotate_tensor_backward(int ielem ,
    const Scalar * const s_n ,
    Scalar * const rot_stress )const
  {
    const int rot_state = current_state ; // 1 ;

    //   t : temporary variables
    //   s_n : stress_new in local memory space
    //   r_n : rotation_new in local memory space
    Scalar t[9], r_n[9];

    r_n[0] = rotation(ielem, 0, rot_state );
    r_n[1] = rotation(ielem, 1, rot_state );
    r_n[2] = rotation(ielem, 2, rot_state );
    r_n[3] = rotation(ielem, 3, rot_state );
    r_n[4] = rotation(ielem, 4, rot_state );
    r_n[5] = rotation(ielem, 5, rot_state );
    r_n[6] = rotation(ielem, 6, rot_state );
    r_n[7] = rotation(ielem, 7, rot_state );
    r_n[8] = rotation(ielem, 8, rot_state );

    t[0] = s_n[K_S_XX]*r_n[K_F_XX]+ s_n[K_S_XY]*r_n[K_F_XY]+ s_n[K_S_XZ]*r_n[K_F_XZ];
    t[1] = s_n[K_S_YX]*r_n[K_F_XX]+ s_n[K_S_YY]*r_n[K_F_XY]+ s_n[K_S_YZ]*r_n[K_F_XZ];
    t[2] = s_n[K_S_ZX]*r_n[K_F_XX]+ s_n[K_S_ZY]*r_n[K_F_XY]+ s_n[K_S_ZZ]*r_n[K_F_XZ];
    t[3] = s_n[K_S_XX]*r_n[K_F_YX]+ s_n[K_S_XY]*r_n[K_F_YY]+ s_n[K_S_XZ]*r_n[K_F_YZ];
    t[4] = s_n[K_S_YX]*r_n[K_F_YX]+ s_n[K_S_YY]*r_n[K_F_YY]+ s_n[K_S_YZ]*r_n[K_F_YZ];
    t[5] = s_n[K_S_ZX]*r_n[K_F_YX]+ s_n[K_S_ZY]*r_n[K_F_YY]+ s_n[K_S_ZZ]*r_n[K_F_YZ];
    t[6] = s_n[K_S_XX]*r_n[K_F_ZX]+ s_n[K_S_XY]*r_n[K_F_ZY]+ s_n[K_S_XZ]*r_n[K_F_ZZ];
    t[7] = s_n[K_S_YX]*r_n[K_F_ZX]+ s_n[K_S_YY]*r_n[K_F_ZY]+ s_n[K_S_YZ]*r_n[K_F_ZZ];
    t[8] = s_n[K_S_ZX]*r_n[K_F_ZX]+ s_n[K_S_ZY]*r_n[K_F_ZY]+ s_n[K_S_ZZ]*r_n[K_F_ZZ];

    rot_stress[ K_S_XX ] = r_n[K_F_XX]*t[0] + r_n[K_F_XY]*t[1] + r_n[K_F_XZ]*t[2];
    rot_stress[ K_S_YY ] = r_n[K_F_YX]*t[3] + r_n[K_F_YY]*t[4] + r_n[K_F_YZ]*t[5];
    rot_stress[ K_S_ZZ ] = r_n[K_F_ZX]*t[6] + r_n[K_F_ZY]*t[7] + r_n[K_F_ZZ]*t[8];

    rot_stress[ K_S_XY ] = r_n[K_F_XX]*t[3] + r_n[K_F_XY]*t[4] + r_n[K_F_XZ]*t[5];
    rot_stress[ K_S_YZ ] = r_n[K_F_YX]*t[6] + r_n[K_F_YY]*t[7] + r_n[K_F_YZ]*t[8];
    rot_stress[ K_S_ZX ] = r_n[K_F_ZX]*t[0] + r_n[K_F_ZY]*t[1] + r_n[K_F_ZZ]*t[2];
  }

  //--------------------------------------------------------------------------

  KOKKOS_INLINE_FUNCTION
  void comp_force(int ielem,
     const Scalar * const vx ,
     const Scalar * const vy ,
     const Scalar * const vz ,
     const Scalar * const grad_x ,
     const Scalar * const grad_y ,
     const Scalar * const grad_z ,
     Scalar * total_stress12th ) const
  {
    Scalar internal_energy_inc = 0 ;

    for(int inode = 0; inode < 8; ++inode) {

      const Scalar fx =
        total_stress12th[K_S_XX] * grad_x[inode] +
        total_stress12th[K_S_XY] * grad_y[inode] +
        total_stress12th[K_S_XZ] * grad_z[inode] ;

      element_force(ielem, 0, inode) = fx ;

      const Scalar fy =
        total_stress12th[K_S_YX] * grad_x[inode] +
        total_stress12th[K_S_YY] * grad_y[inode] +
        total_stress12th[K_S_YZ] * grad_z[inode] ;

      element_force(ielem, 1, inode) = fy ;

      const Scalar fz =
        total_stress12th[K_S_ZX] * grad_x[inode] +
        total_stress12th[K_S_ZY] * grad_y[inode] +
        total_stress12th[K_S_ZZ] * grad_z[inode] ;

      element_force(ielem, 2, inode) = fz ;

      internal_energy_inc +=
        fx * vx[inode] +
        fy * vy[inode] +
        fz * vz[inode] ;
    }

    internal_energy(ielem) = internal_energy_inc ;
  }

  //----------------------------------------------------------------------------

  KOKKOS_INLINE_FUNCTION
  void get_stress(int ielem , Scalar * const s_n ) const
    {
      const int kxx = 0;
      const int kyy = 1;
      const int kzz = 2;
      const int kxy = 3;
      const int kyz = 4;
      const int kzx = 5;

      const Scalar e = (rot_stretch(ielem,kxx)+rot_stretch(ielem,kyy)+rot_stretch(ielem,kzz))/3.0;

      s_n[kxx] = stress_new(ielem,kxx) += dt() * (two_mu * (rot_stretch(ielem,kxx)-e)+3*bulk_modulus*e);
      s_n[kyy] = stress_new(ielem,kyy) += dt() * (two_mu * (rot_stretch(ielem,kyy)-e)+3*bulk_modulus*e);
      s_n[kzz] = stress_new(ielem,kzz) += dt() * (two_mu * (rot_stretch(ielem,kzz)-e)+3*bulk_modulus*e);

      s_n[kxy] = stress_new(ielem,kxy) += dt() * two_mu * rot_stretch(ielem,kxy);
      s_n[kyz] = stress_new(ielem,kyz) += dt() * two_mu * rot_stretch(ielem,kyz);
      s_n[kzx] = stress_new(ielem,kzx) += dt() * two_mu * rot_stretch(ielem,kzx);
    }

  //----------------------------------------------------------------------------


  KOKKOS_INLINE_FUNCTION
  void operator()( int ielem, value_type & update )const
  {
    const Scalar ONE12TH = 1.0 / 12.0 ;

    Scalar x[8], y[8], z[8] ;
    Scalar vx[8], vy[8], vz[8];
    Scalar grad_x[8], grad_y[8], grad_z[8];

    // Position and velocity:

    for ( int i = 0 ; i < ElemNodeCount ; ++i ) {
      const int n = elem_node_connectivity(ielem,i);

      x[i] = model_coords(n, 0) + displacement(n, 0, current_state) ;
      y[i] = model_coords(n, 1) + displacement(n, 1, current_state) ;
      z[i] = model_coords(n, 2) + displacement(n, 2, current_state) ;

      vx[i] = velocity(n, 0, current_state);
      vy[i] = velocity(n, 1, current_state);
      vz[i] = velocity(n, 2, current_state);
    }

    // Gradient:

    comp_grad<Scalar,execution_space>( x , y , z , grad_x , grad_y , grad_z );


    const Scalar mid_vol = dot8<Scalar,execution_space>( x , grad_x );

    const Scalar shr = two_mu ;
    const Scalar dil = bulk_modulus + ((2.0*shr)/3.0);

    const Scalar aspect = 6.0 * mid_vol /
                          ( dot8<Scalar,execution_space>( grad_x , grad_x ) +
                            dot8<Scalar,execution_space>( grad_y , grad_y ) +
                            dot8<Scalar,execution_space>( grad_z , grad_z ) );

    const Scalar dtrial = std::sqrt(elem_mass(ielem) * aspect / dil);
    const Scalar traced = (rot_stretch(ielem, 0) + rot_stretch(ielem, 1) + rot_stretch(ielem, 2));

    const Scalar eps = traced < 0 ? (lin_bulk_visc - quad_bulk_visc * traced * dtrial) : lin_bulk_visc ;

    const Scalar bulkq = eps * dil * dtrial * traced;

    Scalar cur_time_step = dtrial * ( std::sqrt( 1.0 + eps * eps) - eps);

    // force fixed time step if input

    cur_time_step = user_dt > 0 ? user_dt : cur_time_step;

    update = update < cur_time_step ? update : cur_time_step;


    Scalar s_n[ 6 ];

    get_stress( ielem, s_n );

    Scalar total_stress12th[6];

    // Get rotated stress:

    rotate_tensor_backward(ielem, s_n , total_stress12th );

    total_stress12th[0] = ONE12TH*( total_stress12th[ 0 ] + bulkq );
    total_stress12th[1] = ONE12TH*( total_stress12th[ 1 ] + bulkq );
    total_stress12th[2] = ONE12TH*( total_stress12th[ 2 ] + bulkq );
    total_stress12th[3] = ONE12TH*( total_stress12th[ 3 ] );
    total_stress12th[4] = ONE12TH*( total_stress12th[ 4 ] );
    total_stress12th[5] = ONE12TH*( total_stress12th[ 5 ] );

    comp_force(ielem, vx, vy, vz,
                      grad_x, grad_y, grad_z, total_stress12th);
  }
};

//----------------------------------------------------------------------------

template<typename Scalar, class DeviceType >
struct nodal_step
{
  typedef DeviceType     execution_space ;
  typedef typename execution_space::size_type  size_type;

  typedef Explicit::Fields< Scalar , execution_space >  Fields ;

  const typename Fields::scalar_type            dt ;
  const typename Fields::scalar_type            prev_dt ;
  const typename Fields::node_elem_ids_type     node_elem_connectivity ;
  const typename Fields::node_coords_type       model_coords ;
  const typename Fields::array_type             nodal_mass ;
  const typename Fields::geom_state_array_type  displacement ;
  const typename Fields::geom_state_array_type  velocity ;
  const typename Fields::geom_array_type        acceleration ;
  const typename Fields::geom_array_type        internal_force ;
  const typename Fields::elem_node_geom_type    element_force ;

  const Scalar   x_bc;
  const int      current_state;
  const int      next_state;


  nodal_step( const Fields  & mesh_fields ,
              const Scalar    arg_x_bc,
              const int       arg_current_state,
              const int       arg_next_state)
   : dt(       mesh_fields.dt )
   , prev_dt(  mesh_fields.prev_dt )
   , node_elem_connectivity( mesh_fields.node_elem_connectivity )
   , model_coords(   mesh_fields.model_coords )
   , nodal_mass(     mesh_fields.nodal_mass )
   , displacement(   mesh_fields.displacement )
   , velocity(       mesh_fields.velocity )
   , acceleration(   mesh_fields.acceleration )
   , internal_force( mesh_fields.internal_force )
   , element_force(  mesh_fields.element_force )
   , x_bc(          arg_x_bc )
   , current_state( arg_current_state )
   , next_state(    arg_next_state )
   {
        //std::cout << "finish_step dt: " << dt << std::endl;
        //std::cout << "finish_step prev_dt: " << prev_dt << std::endl;
   }

  static void apply( const Fields  & mesh_fields ,
                     const Scalar    arg_x_bc ,
                     const int       arg_current_state ,
                     const int       arg_next_state )
  {
    nodal_step op( mesh_fields, arg_x_bc, arg_current_state, arg_next_state );

    // Only update the owned nodes:

    Kokkos::parallel_for( mesh_fields.num_nodes_owned , op );
  }

  KOKKOS_INLINE_FUNCTION
  void operator()(int inode) const
    {
      // Getting count as per 'CSR-like' data structure
      const int begin = node_elem_connectivity.row_map[inode];
      const int end   = node_elem_connectivity.row_map[inode+1];

      double local_force[] = {0.0, 0.0, 0.0};

      // Gather-sum internal force from
      // each element that a node is attached to.

      for ( int i = begin; i < end ; ++i ){

        //  node_elem_offset is a cumulative structure, so
        //  node_elem_offset(inode) should be the index where
        //  a particular row's elem_IDs begin
        const int nelem = node_elem_connectivity.entries( i, 0);

        //  find the row in an element's stiffness matrix
        //  that corresponds to inode
        const int elem_node_index = node_elem_connectivity.entries( i, 1);

        local_force[0] += element_force(nelem, 0, elem_node_index);
        local_force[1] += element_force(nelem, 1, elem_node_index);
        local_force[2] += element_force(nelem, 2, elem_node_index);
      }

      internal_force(inode, 0) = local_force[0];
      internal_force(inode, 1) = local_force[1];
      internal_force(inode, 2) = local_force[2];

      // Acceleration:

      Scalar v_new[3];
      Scalar a_current[3];

      const Scalar tol = 1.0e-7;

      // If not on the boundary then: a = F / m
      if ( tol < fabs(model_coords(inode,0)-x_bc) ) {

        const Scalar m = nodal_mass( inode );

        acceleration(inode,0) = a_current[0] = -local_force[0] / m ;
        acceleration(inode,1) = a_current[1] = -local_force[1] / m ;
        acceleration(inode,2) = a_current[2] = -local_force[2] / m ;
      }
      else { //enforce fixed BC
        acceleration(inode,0) = a_current[0] = 0;
        acceleration(inode,1) = a_current[1] = 0;
        acceleration(inode,2) = a_current[2] = 0;
      }

      // Central difference time integration:

      const Scalar dt_disp = dt() ;
      const Scalar dt_vel = ( dt() + prev_dt() ) / 2.0 ;

      velocity(inode,0,next_state) = v_new[0] =
        velocity(inode,0,current_state) + dt_vel * a_current[0];

      velocity(inode,1,next_state) = v_new[1] =
        velocity(inode,1,current_state) + dt_vel * a_current[1];

      velocity(inode,2,next_state) = v_new[2] =
        velocity(inode,2,current_state) + dt_vel * a_current[2];

      displacement(inode,0,next_state) =
        displacement(inode,0,current_state) + dt_disp * v_new[0];

      displacement(inode,1,next_state) =
        displacement(inode,1,current_state) + dt_disp * v_new[1];

      displacement(inode,2,next_state) =
        displacement(inode,2,current_state) + dt_disp * v_new[2];
    }
};

//----------------------------------------------------------------------------

template< typename Scalar , class DeviceType >
struct pack_state
{
  typedef DeviceType     execution_space ;
  typedef typename execution_space::size_type  size_type ;

  typedef Explicit::Fields< Scalar , execution_space >  Fields ;

  typedef typename Fields::geom_state_array_type::value_type  value_type ;
  typedef Kokkos::View< value_type* , execution_space >     buffer_type ;

  static const unsigned value_count = 6 ;

  const typename Fields::geom_state_array_type  displacement ;
  const typename Fields::geom_state_array_type  velocity ;
  const buffer_type  output ;
  const size_type    inode_base ;
  const size_type    state_next ;

  pack_state( const buffer_type & arg_output ,
              const Fields      & mesh_fields ,
              const size_type     arg_begin ,
              const size_type     arg_state )
   : displacement( mesh_fields.displacement )
   , velocity(     mesh_fields.velocity )
   , output(       arg_output )
   , inode_base(   arg_begin )
   , state_next(   arg_state )
   {}

  static void apply( const buffer_type & arg_output ,
                     const size_type     arg_begin ,
                     const size_type     arg_count ,
                     const Fields      & mesh_fields ,
                     const size_type     arg_state )
  {
    pack_state op( arg_output , mesh_fields , arg_begin , arg_state );

    Kokkos::parallel_for( arg_count , op );
  }

  KOKKOS_INLINE_FUNCTION
  void operator()( const size_type i ) const
  {
    const size_type inode = inode_base + i ;

    size_type j = i * value_count ;

    output[j++] = displacement( inode , 0 , state_next );
    output[j++] = displacement( inode , 1 , state_next );
    output[j++] = displacement( inode , 2 , state_next );
    output[j++] = velocity( inode , 0 , state_next );
    output[j++] = velocity( inode , 1 , state_next );
    output[j++] = velocity( inode , 2 , state_next );
  }
};

template< typename Scalar , class DeviceType >
struct unpack_state
{
  typedef DeviceType     execution_space ;
  typedef typename execution_space::size_type  size_type ;

  typedef Explicit::Fields< Scalar , execution_space >  Fields ;

  typedef typename Fields::geom_state_array_type::value_type  value_type ;
  typedef Kokkos::View< value_type* , execution_space >     buffer_type ;

  static const unsigned value_count = 6 ;

  const typename Fields::geom_state_array_type  displacement ;
  const typename Fields::geom_state_array_type  velocity ;
  const buffer_type  input ;
  const size_type    inode_base ;
  const size_type    state_next ;

  unpack_state( const buffer_type & arg_input ,
                const Fields      & mesh_fields ,
                const size_type     arg_begin ,
                const size_type     arg_state )
   : displacement( mesh_fields.displacement )
   , velocity(     mesh_fields.velocity )
   , input(        arg_input )
   , inode_base(   arg_begin )
   , state_next(   arg_state )
   {}

  static void apply( const Fields      & mesh_fields ,
                     const size_type     arg_state ,
                     const buffer_type & arg_input ,
                     const size_type     arg_begin ,
                     const size_type     arg_count )
  {
    unpack_state op( arg_input , mesh_fields , arg_begin , arg_state );

    Kokkos::parallel_for( arg_count , op );
  }

  KOKKOS_INLINE_FUNCTION
  void operator()( const size_type i ) const
  {
    const size_type inode = inode_base + i ;

    size_type j = i * value_count ;

    displacement( inode , 0 , state_next ) = input[j++] ;
    displacement( inode , 1 , state_next ) = input[j++] ;
    displacement( inode , 2 , state_next ) = input[j++] ;
    velocity( inode , 0 , state_next ) = input[j++] ;
    velocity( inode , 1 , state_next ) = input[j++] ;
    velocity( inode , 2 , state_next ) = input[j++] ;
  }
};

} /* namespace Explicit */

#endif /* #ifndef KOKKOS_EXPLICITFUNCTORS_HPP */