Merge branch 'lammps-kokkos' into lammps-icms

2014-06-04 14:14:13 +02:00
parent 06af7cfd27 895ca766ee
commit bb338aaccd
212 changed files with 65095 additions and 634 deletions
--- a/bench/GPU/README
+++ b/bench/GPU/README
@ -39,9 +39,10 @@ mpirun -np 8 ../lmp_linux_mixed -sf gpu -c off -v g 2 -v x 32 -v y 32 -v z 64 -v

 The "xyz" settings determine the problem size.  The "t" setting
 determines the number of timesteps.  The "np" setting determines how
-many CPUs the problem will be run on, and the "g" settings determines
-how many GPUs the problem will run on, i.e. 1 or 2 in this case.  You
-can use more CPUs than GPUs with the GPU package.
+many MPI tasks per compute node the problem will run on, and the "g"
+setting determines how many GPUs per compute node the problem will run
+on, i.e. 1 or 2 in this case.  Note that you can use more MPI tasks
+than GPUs (both per compute node) with the GPU package.

 ------------------------------------------------------------------------

@ -54,7 +55,7 @@ mpirun -np 2 ../lmp_linux_double -sf cuda -v g 2 -v x 32 -v y 64 -v z 64 -v t 10

 The "xyz" settings determine the problem size.  The "t" setting
 determines the number of timesteps.  The "np" setting determines how
-many CPUs the problem will be run on, and the "g" setting determines
-how many GPUs the problem will run on, i.e. 1 or 2 in this case.  You
-should make the number of CPUs and number of GPUs equal for the
-USER-CUDA package.
+many MPI tasks per compute node the problem will run on, and the "g"
+setting determines how many GPUs per compute node the problem will run
+on, i.e. 1 or 2 in this case.  For the USER-CUDA package, the number
+of MPI tasks and GPUs (both per compute node) must be equal.
--- a/doc/Manual.html
+++ b/doc/Manual.html
@ -1,7 +1,7 @@
 <HTML>
 <HEAD>
 <TITLE>LAMMPS-ICMS Users Manual</TITLE>
-<META NAME="docnumber" CONTENT="27 May 2014 version">
+<META NAME="docnumber" CONTENT="29 May 2014 version">
 <META NAME="author" CONTENT="http://lammps.sandia.gov - Sandia National Laboratories">
 <META NAME="copyright" CONTENT="Copyright (2003) Sandia Corporation.  This software and manual is distributed under the GNU General Public License.">
 </HEAD>
@ -22,7 +22,7 @@

 <CENTER><H3>LAMMPS-ICMS Documentation 
 </H3></CENTER>
-<CENTER><H4>27 May 2014 version 
+<CENTER><H4>29 May 2014 version 
 </H4></CENTER>
 <H4>Version info: 
 </H4>
--- a/doc/Manual.txt
+++ b/doc/Manual.txt
@ -1,6 +1,6 @@
 <HEAD>
 <TITLE>LAMMPS-ICMS Users Manual</TITLE>
-<META NAME="docnumber" CONTENT="27 May 2014 version">
+<META NAME="docnumber" CONTENT="29 May 2014 version">
 <META NAME="author" CONTENT="http://lammps.sandia.gov - Sandia National Laboratories">
 <META NAME="copyright" CONTENT="Copyright (2003) Sandia Corporation.  This software and manual is distributed under the GNU General Public License.">
 </HEAD>
@ -18,7 +18,7 @@
 <H1></H1>

 LAMMPS-ICMS Documentation :c,h3
-27 May 2014 version :c,h4
+29 May 2014 version :c,h4

 Version info: :h4

--- a/doc/Section_accelerate.html
+++ b/doc/Section_accelerate.html
@ -24,7 +24,8 @@ kinds of machines.
 5.5 <A HREF = "#acc_5">USER-OMP package</A><BR>
 5.6 <A HREF = "#acc_6">GPU package</A><BR>
 5.7 <A HREF = "#acc_7">USER-CUDA package</A><BR>
-5.8 <A HREF = "#acc_8">Comparison of GPU and USER-CUDA packages</A> <BR>
+5.8 <A HREF = "#acc_8">KOKKOS package</A><BR>
+5.9 <A HREF = "#acc_9">Comparison of GPU and USER-CUDA packages</A> <BR>

 <HR>

@ -146,14 +147,14 @@ command is identical, their functionality is the same, and the
 numerical results it produces should also be identical, except for
 precision and round-off issues.
 </P>
-<P>For example, all of these variants of the basic Lennard-Jones pair
-style exist in LAMMPS:
+<P>For example, all of these styles are variants of the basic
+Lennard-Jones pair style <A HREF = "pair_lj.html">pair_style lj/cut</A>:
 </P>
-<UL><LI><A HREF = "pair_lj.html">pair_style lj/cut</A>
-<LI><A HREF = "pair_lj.html">pair_style lj/cut/opt</A>
-<LI><A HREF = "pair_lj.html">pair_style lj/cut/omp</A>
+<UL><LI><A HREF = "pair_lj.html">pair_style lj/cut/cuda</A>
 <LI><A HREF = "pair_lj.html">pair_style lj/cut/gpu</A>
-<LI><A HREF = "pair_lj.html">pair_style lj/cut/cuda</A> 
+<LI><A HREF = "pair_lj.html">pair_style lj/cut/kk</A>
+<LI><A HREF = "pair_lj.html">pair_style lj/cut/omp</A>
+<LI><A HREF = "pair_lj.html">pair_style lj/cut/opt</A> 
 </UL>
 <P>Assuming you have built LAMMPS with the appropriate package, these
 styles can be invoked by specifying them explicitly in your input
@ -161,11 +162,17 @@ script.  Or you can use the <A HREF = "Section_start.html#start_7">-suffix comma
 switch</A> to invoke the accelerated versions
 automatically, without changing your input script.  The
 <A HREF = "suffix.html">suffix</A> command allows you to set a suffix explicitly and
-to turn off/on the comand-line switch setting, both from within your
-input script.
+to turn off and back on the command-line switch setting, both from
+within your input script.
 </P>
-<P>Styles with an "opt" suffix are part of the OPT package and typically
-speed-up the pairwise calculations of your simulation by 5-25%.
+<P>Styles with a "cuda" or "gpu" suffix are part of the USER-CUDA or GPU
+packages, and can be run on NVIDIA GPUs associated with your CPUs.
+The speed-up due to GPU usage depends on a variety of factors, as
+discussed below.
+</P>
+<P>Styles with a "kk" suffix are part of the KOKKOS package, and can be
+run using OpenMP, pthreads, or on an NVIDIA GPU.  The speed-up depends
+on a variety of factors, as discussed below.
 </P>
 <P>Styles with an "omp" suffix are part of the USER-OMP package and allow
 a pair-style to be run in multi-threaded mode using OpenMP.  This can
@ -174,26 +181,26 @@ than cores is advantageous, e.g. when running with PPPM so that FFTs
 are run on fewer MPI processors or when the many MPI tasks would
 overload the available bandwidth for communication.
 </P>
-<P>Styles with a "gpu" or "cuda" suffix are part of the GPU or USER-CUDA
-packages, and can be run on NVIDIA GPUs associated with your CPUs.
-The speed-up due to GPU usage depends on a variety of factors, as
-discussed below.
+<P>Styles with an "opt" suffix are part of the OPT package and typically
+speed-up the pairwise calculations of your simulation by 5-25%.
 </P>
 <P>To see what styles are currently available in each of the accelerated
 packages, see <A HREF = "Section_commands.html#cmd_5">Section_commands 5</A> of the
 manual.  A list of accelerated styles is included in the pair, fix,
-compute, and kspace sections.
+compute, and kspace sections.  The doc page for each individual style
+(e.g. <A HREF = "pair_lj.html">pair lj/cut</A> or <A HREF = "fix_nve.html">fix nve</A>) will also
+list any accelerated variants available for that style.
 </P>
 <P>The following sections explain:
 </P>
 <UL><LI>what hardware and software the accelerated styles require
-<LI>how to build LAMMPS with the accelerated packages in place
+<LI>how to build LAMMPS with the accelerated package in place
 <LI>what changes (if any) are needed in your input scripts
 <LI>guidelines for best performance
 <LI>speed-ups you can expect 
 </UL>
 <P>The final section compares and contrasts the GPU and USER-CUDA
-packages, since they are both designed to use NVIDIA GPU hardware.
+packages, since they are both designed to use NVIDIA hardware.
 </P>
 <HR>

@ -212,8 +219,8 @@ dependencies:
 <PRE>make yes-opt
 make machine 
 </PRE>
-<P>If your input script uses one of the OPT pair styles,
-you can run it as follows:
+<P>If your input script uses one of the OPT pair styles, you can run it
+as follows:
 </P>
 <PRE>lmp_machine -sf opt < in.script
 mpirun -np 4 lmp_machine -sf opt < in.script 
@ -226,12 +233,13 @@ to 20% savings.

 <H4><A NAME = "acc_5"></A>5.5 USER-OMP package 
 </H4>
-<P>The USER-OMP package was developed by Axel Kohlmeyer at Temple University.
-It provides multi-threaded versions of most pair styles, all dihedral
-styles and a few fixes in LAMMPS. The package currently uses the OpenMP
-interface which requires using a specific compiler flag in the makefile
-to enable multiple threads; without this flag the corresponding pair
-styles will still be compiled and work, but do not support multi-threading.
+<P>The USER-OMP package was developed by Axel Kohlmeyer at Temple
+University.  It provides multi-threaded versions of most pair styles,
+all dihedral styles, and a few fixes in LAMMPS. The package currently
+uses the OpenMP interface which requires using a specific compiler
+flag in the makefile to enable multiple threads; without this flag the
+corresponding pair styles will still be compiled and work, but do not
+support multi-threading.
 </P>
 <P><B>Building LAMMPS with the USER-OMP package:</B>
 </P>
@ -264,18 +272,19 @@ env OMP_NUM_THREADS=2 mpirun -np 2 lmp_machine -sf omp -in in.script
 mpirun -x OMP_NUM_THREADS=2 -np 2 lmp_machine -sf omp -in in.script 
 </PRE>
 <P>The value of the environment variable OMP_NUM_THREADS determines how
-many threads per MPI task are launched. All three examples above use
-a total of 4 CPU cores.  For different MPI implementations the method
-to pass the OMP_NUM_THREADS environment variable to all processes is
-different.  Two different variants, one for MPICH and OpenMPI, respectively
-are shown above.  Please check the documentation of your MPI installation
-for additional details.  Alternatively, the value provided by OMP_NUM_THREADS
-can be overridded with the <A HREF = "package.html">package omp</A> command.
-Depending on which styles are accelerated in your input, you should
-see a reduction in the "Pair time" and/or "Bond time" and "Loop time"
-printed out at the end of the run. The optimal ratio of MPI to OpenMP
-can vary a lot and should always be confirmed through some benchmark
-runs for the current system and on the current machine.
+many threads per MPI task are launched. All three examples above use a
+total of 4 CPU cores.  For different MPI implementations the method to
+pass the OMP_NUM_THREADS environment variable to all processes is
+different.  Two different variants, one for MPICH and one for OpenMPI,
+respectively, are shown above.  Please check the documentation of your
+MPI installation for additional details.  Alternatively, the value
+provided by OMP_NUM_THREADS can be overridden with the <A HREF = "package.html">package
+omp</A> command.  Depending on which styles are accelerated
+in your input, you should see a reduction in the "Pair time" and/or
+"Bond time" and "Loop time" printed out at the end of the run. The
+optimal ratio of MPI to OpenMP can vary a lot and should always be
+confirmed through some benchmark runs for the current system and on
+the current machine.
 </P>
 <P><B>Restrictions:</B>
 </P>
@ -293,53 +302,55 @@ On the other hand, in many cases you still want to use the <I>omp</I> version
 all contain optimizations similar to those in the OPT package, which
 can result in serial speedup.
 </P>
-<P>Using multi-threading is most effective under the following circumstances:
+<P>Using multi-threading is most effective under the following
+circumstances:
 </P>
-<UL><LI>Individual compute nodes have a significant number of CPU cores
-but the CPU itself has limited memory bandwidth, e.g. Intel Xeon 53xx
-(Clovertown) and 54xx (Harpertown) quad core processors. Running
-one MPI task per CPU core will result in significant performance
-degradation, so that running with 4 or even only 2 MPI tasks per
-nodes is faster. Running in hybrid MPI+OpenMP mode will reduce the
-inter-node communication bandwidth contention in the same way,
-but offers and additional speedup from utilizing the otherwise
-idle CPU cores. 
+<UL><LI>Individual compute nodes have a significant number of CPU cores but
+the CPU itself has limited memory bandwidth, e.g. Intel Xeon 53xx
+(Clovertown) and 54xx (Harpertown) quad core processors. Running one
+MPI task per CPU core will result in significant performance
+degradation, so that running with 4 or even only 2 MPI tasks per node
+is faster. Running in hybrid MPI+OpenMP mode will reduce the
+inter-node communication bandwidth contention in the same way, but
+offers an additional speedup from utilizing the otherwise idle CPU
+cores. 

 <LI>The interconnect used for MPI communication is not able to provide
-sufficient bandwidth for a large number of MPI tasks per node.
-This applies for example to running over gigabit ethernet or
-on Cray XT4 or XT5 series supercomputers. Same as in the aforementioned
-case this effect worsens with using an increasing number of nodes. 
+sufficient bandwidth for a large number of MPI tasks per node.  This
+applies for example to running over gigabit ethernet or on Cray XT4 or
+XT5 series supercomputers. Same as in the aforementioned case this
+effect worsens with using an increasing number of nodes. 

-<LI>The input is a system that has an inhomogeneous particle density
-which cannot be mapped well to the domain decomposition scheme
-that LAMMPS employs. While this can be to some degree alleviated
-through using the <A HREF = "processors.html">processors</A> keyword, multi-threading
-provides a parallelism that parallelizes over the number of particles
-not their distribution in space. 
+<LI>The input is a system that has an inhomogeneous particle density which
+cannot be mapped well to the domain decomposition scheme that LAMMPS
+employs. While this can be to some degree alleviated through using the
+<A HREF = "processors.html">processors</A> keyword, multi-threading provides a
+parallelism that parallelizes over the number of particles not their
+distribution in space. 

 <LI>Finally, multi-threaded styles can improve performance when running
 LAMMPS in "capability mode", i.e. near the point where the MPI
-parallelism scales out. This can happen in particular when using
-as kspace style for long-range electrostatics. Here the scaling
-of the kspace style is the performance limiting factor and using
-multi-threaded styles allows to operate the kspace style at the
-limit of scaling and then increase performance parallelizing
-the real space calculations with hybrid MPI+OpenMP. Sometimes
-additional speedup can be achived by increasing the real-space
-coulomb cutoff and thus reducing the work in the kspace part. 
+parallelism scales out. This can happen in particular when using a
+kspace style for long-range electrostatics. Here the scaling of the
+kspace style is the performance limiting factor and using
+multi-threaded styles allows one to operate the kspace style at the limit
+of scaling and then increase performance by parallelizing the real space
+calculations with hybrid MPI+OpenMP. Sometimes additional speedup can
+be achieved by increasing the real-space coulomb cutoff and thus
+reducing the work in the kspace part. 
 </UL>
-<P>The best parallel efficiency from <I>omp</I> styles is typically 
-achieved when there is at least one MPI task per physical 
-processor, i.e. socket or die.
+<P>The best parallel efficiency from <I>omp</I> styles is typically achieved
+when there is at least one MPI task per physical processor,
+i.e. socket or die.
 </P>
 <P>Using threads on hyper-threading enabled cores is usually
 counterproductive, as the cost in additional memory bandwidth
-requirements is not offset by the gain in CPU utilization
-through hyper-threading.
+requirements is not offset by the gain in CPU utilization through
+hyper-threading.
 </P>
 <P>A description of the multi-threading strategy and some performance
-examples are <A HREF = "http://sites.google.com/site/akohlmey/software/lammps-icms/lammps-icms-tms2011-talk.pdf?attredirects=0&d=1">presented here</A>
+examples are <A HREF = "http://sites.google.com/site/akohlmey/software/lammps-icms/lammps-icms-tms2011-talk.pdf?attredirects=0&d=1">presented
+here</A>
 </P>
 <HR>

@ -366,32 +377,23 @@ between processors, runs on the CPU.
 <LI>Asynchronous force computations can be performed simultaneously on the
 CPU(s) and GPU. 

+<LI>It allows for GPU computations to be performed in single or double
+precision, or in mixed-mode precision, where pairwise forces are
+computed in single precision, but accumulated into double-precision
+force vectors. 
+
 <LI>LAMMPS-specific code is in the GPU package.  It makes calls to a
 generic GPU library in the lib/gpu directory.  This library provides
 NVIDIA support as well as more general OpenCL support, so that the
 same functionality can eventually be supported on a variety of GPU
 hardware. 
 </UL>
-<P>NOTE:
-  discuss 3 precisions
-    if change, also have to re-link with LAMMPS
-  always use newton off
-  expt with differing numbers of CPUs vs GPU - can't tell what is fastest
-  give command line switches in examples
-</P>
-<P>I am not very clear to the meaning of  "Max Mem / Proc"
-in the "GPU Time Info (average)".
-Is it the maximal of GPU memory used by one CPU core?
-</P>
-<P>It is the maximum memory used at one time on the GPU for data storage by
-a single MPI process. - Mike
-</P>
 <P><B>Hardware and software requirements:</B>
 </P>
-<P>To use this package, you currently need to have specific NVIDIA
-hardware and install specific NVIDIA CUDA software on your system:
+<P>To use this package, you currently need to have an NVIDIA GPU and
+install the NVIDIA CUDA software on your system:
 </P>
-<UL><LI>Check if you have an NVIDIA card: cat /proc/driver/nvidia/cards/0
+<UL><LI>Check if you have an NVIDIA GPU: cat /proc/driver/nvidia/cards/0
 <LI>Go to http://www.nvidia.com/object/cuda_get.html
 <LI>Install a driver and toolkit appropriate for your system (SDK is not necessary)
 <LI>Follow the instructions in lammps/lib/gpu/README to build the library (see below)
@ -403,8 +405,21 @@ hardware and install specific NVIDIA CUDA software on your system:
 need to first build the GPU library, before building LAMMPS itself.
 General instructions for doing this are in <A HREF = "Section_start.html#start_3">this
 section</A> of the manual.  For this package,
-do the following, using a Makefile in lib/gpu appropriate for your
-system:
+use a Makefile in lib/gpu appropriate for your system.
+</P>
+<P>Before building the library, you can set the precision it will use by
+editing the CUDA_PREC setting in the Makefile you are using, as
+follows:
+</P>
+<PRE>CUDA_PREC = -D_SINGLE_SINGLE  # Single precision for all calculations
+CUDA_PREC = -D_DOUBLE_DOUBLE  # Double precision for all calculations
+CUDA_PREC = -D_SINGLE_DOUBLE  # Accumulation of forces, etc, in double 
+</PRE>
+<P>The last setting is the mixed mode referred to above.  Note that your
+GPU must support double precision to use either the 2nd or 3rd of
+these settings.
+</P>
+<P>To build the library, then type:
 </P>
 <PRE>cd lammps/lib/gpu
 make -f Makefile.linux
@ -424,41 +439,60 @@ set appropriately to include the paths and settings for the CUDA
 system software on your machine.  See src/MAKE/Makefile.g++ for an
 example.
 </P>
-<P><B>GPU configuration</B>
+<P>Also note that if you change the GPU library precision, you need to
+re-build the entire library.  You should do a "clean" first,
+e.g. "make -f Makefile.linux clean".  Then you must also re-build
+LAMMPS if the library precision has changed, so that it re-links with
+the new library.
 </P>
-<P>When using GPUs, you are restricted to one physical GPU per LAMMPS
-process, which is an MPI process running on a single core or
-processor.  Multiple MPI processes (CPU cores) can share a single GPU,
-and in many cases it will be more efficient to run this way.
+<P><B>Running an input script:</B>
 </P>
-<P><B>Input script requirements:</B>
+<P>The examples/gpu and bench/GPU directories have scripts that can be
+run with the GPU package, as well as detailed instructions on how to
+run them.
 </P>
-<P>Additional input script requirements to run pair or PPPM styles with a
+<P>The total number of MPI tasks used by LAMMPS (one or multiple per
+compute node) is set in the usual manner via the mpirun or mpiexec
+commands, and is independent of the GPU package.
+</P>
+<P>When using the GPU package, you cannot assign more than one physical
+GPU to an MPI task.  However multiple MPI tasks can share the same
+GPU, and in many cases it will be more efficient to run this way.
+</P>
+<P>Input script requirements to run using pair or PPPM styles with a
 <I>gpu</I> suffix are as follows:
 </P>
-<UL><LI>To invoke specific styles from the GPU package, you can either append
-"gpu" to the style name (e.g. pair_style lj/cut/gpu), or use the
-<A HREF = "Section_start.html#start_7">-suffix command-line switch</A>, or use the
-<A HREF = "suffix.html">suffix</A> command. 
+<UL><LI>To invoke specific styles from the GPU package, either append "gpu" to
+the style name (e.g. pair_style lj/cut/gpu), or use the <A HREF = "Section_start.html#start_7">-suffix
+command-line switch</A>, or use the
+<A HREF = "suffix.html">suffix</A> command in the input script. 

-<LI>The <A HREF = "newton.html">newton pair</A> setting must be <I>off</I>. 
+<LI>The <A HREF = "newton.html">newton pair</A> setting in the input script must be
+<I>off</I>. 

-<LI>The <A HREF = "package.html">package gpu</A> command must be used near the beginning
-of your script to control the GPU selection and initialization
-settings.  It also has an option to enable asynchronous splitting of
-force computations between the CPUs and GPUs. 
+<LI>Unless the <A HREF = "Section_start.html#start_7">-suffix gpu command-line
+switch</A> is used, the <A HREF = "package.html">package
+gpu</A> command must be used near the beginning of the
+script to control the GPU selection and initialization settings.  It
+also has an option to enable asynchronous splitting of force
+computations between the CPUs and GPUs. 
 </UL>
-<P>As an example, if you have two GPUs per node and 8 CPU cores per node,
-and would like to run on 4 nodes (32 cores) with dynamic balancing of
-force calculation across CPU and GPU cores, you could specify
+<P>The default for the <A HREF = "package.html">package gpu</A> command is to have all
+the MPI tasks on the compute node use a single GPU.  If you have
+multiple GPUs per node, then be sure to create one or more MPI tasks
+per GPU, and use the first/last settings in the <A HREF = "package.html">package
+gpu</A> command to include all the GPU IDs on the node.
+E.g. first = 0, last = 1, for 2 GPUs.  For example, on an 8-core 2-GPU
+compute node, if you assign 8 MPI tasks to the node, the following
+command in the input script
 </P>
-<PRE>package gpu force/neigh 0 1 -1 
-</PRE>
-<P>In this case, all CPU cores and GPU devices on the nodes would be
-utilized.  Each GPU device would be shared by 4 CPU cores. The CPU
-cores would perform force calculations for some fraction of the
-particles at the same time the GPUs performed force calculation for
-the other particles.
+<P>package gpu force/neigh 0 1 -1
+</P>
+<P>would specify that each GPU is shared by 4 MPI tasks.  The final -1 will
+dynamically balance force calculations across the CPU cores and GPUs.
+I.e. each CPU core will perform force calculations for some small
+fraction of the particles, at the same time the GPUs perform force
+calculations for the majority of the particles.
 </P>
 <P><B>Timing output:</B>
 </P>
@ -482,19 +516,30 @@ screen output (not in the log file) at the end of each run.  These
 timings represent total time spent on the GPU for each routine,
 regardless of asynchronous CPU calculations.
 </P>
+<P>The output section "GPU Time Info (average)" reports "Max Mem / Proc".
+This is the maximum memory used at one time on the GPU for data
+storage by a single MPI process.
+</P>
 <P><B>Performance tips:</B>
 </P>
-<P>Generally speaking, for best performance, you should use multiple CPUs
-per GPU, as provided my most multi-core CPU/GPU configurations.
+<P>You should experiment with how many MPI tasks per GPU to use to see
+what gives the best performance for your problem.  This is a function
+of your problem size and what pair style you are using.  Likewise, you
+should also experiment with the precision setting for the GPU library
+to see if single or mixed precision will give accurate results, since
+they will typically be faster.
 </P>
-<P>Because of the large number of cores within each GPU device, it may be
-more efficient to run on fewer processes per GPU when the number of
-particles per MPI process is small (100's of particles); this can be
-necessary to keep the GPU cores busy.
+<P>Using multiple MPI tasks per GPU will often give the best performance,
+as allowed by most multi-core CPU/GPU configurations.
 </P>
-<P>See the lammps/lib/gpu/README file for instructions on how to build
-the GPU library for single, mixed, or double precision.  The latter
-requires that your GPU card support double precision.
+<P>If the number of particles per MPI task is small (e.g. 100s of
+particles), it can be more efficient to run with fewer MPI tasks per
+GPU, even if you do not use all the cores on the compute node.
+</P>
+<P>The <A HREF = "http://lammps.sandia.gov/bench.html">Benchmark page</A> of the LAMMPS
+web site gives GPU performance on a desktop machine and the Titan HPC
+platform at ORNL for several of the LAMMPS benchmarks, as a function
+of problem size and number of compute nodes.
 </P>
 <HR>

@ -632,9 +677,302 @@ occurs, the faster your simulation will run.
 </P>
 <HR>

+<H4><A NAME = "acc_8"></A>5.8 KOKKOS package 
+</H4>
+<P>The KOKKOS package contains versions of pair, fix, and atom styles
+that use data structures and methods and macros provided by the Kokkos
+library, which is included with LAMMPS in lib/kokkos.
+</P>
+<P><A HREF = "http://trilinos.sandia.gov/packages/kokkos">Kokkos</A> is a C++ library
+that provides two key abstractions for an application like LAMMPS.
+First, it allows a single implementation of an application kernel
+(e.g. a pair style) to run efficiently on different kinds of hardware
+(GPU, Intel Phi, many-core chip).
+</P>
+<P>Second, it provides data abstractions to adjust (at compile time) the
+memory layout of basic data structures like 2d and 3d arrays and allow
+the transparent utilization of special hardware load and store units.
+Such data structures are used in LAMMPS to store atom coordinates or
+forces or neighbor lists.  The layout is chosen to optimize
+performance on different platforms.  Again this operation is hidden
+from the developer, and does not affect how the single implementation
+of the kernel is coded.
+</P>
+<P>These abstractions are set at build time, when LAMMPS is compiled with
+the KOKKOS package installed.  This is done by selecting a "host" and
+"device" to build for, compatible with the compute nodes in your
+machine.  Note that if you are running on a desktop machine, you
+typically have one compute node.  On a cluster or supercomputer there
+may be dozens or 1000s of compute nodes.  The procedure for building
+and running with the Kokkos library is the same, no matter how many
+nodes you run on.
+</P>
+<P>All Kokkos operations occur within the context of an individual MPI
+task running on a single node of the machine.  The total number of MPI
+tasks used by LAMMPS (one or multiple per compute node) is set in the
+usual manner via the mpirun or mpiexec commands, and is independent of
+Kokkos.
+</P>
+<P>Kokkos provides support for one or two modes of execution per MPI
+task.  This means that some computational tasks (pairwise
+interactions, neighbor list builds, time integration, etc) are
+parallelized in one or the other of the two modes.  The first mode is
+called the "host" and is one or more threads running on one or more
+physical CPUs (within the node).  Currently, both multi-core CPUs and
+an Intel Phi processor (running in native mode) are supported.  The
+second mode is called the "device" and is an accelerator chip of some
+kind.  Currently only an NVIDIA GPU is supported.  If your compute
+node does not have a GPU, then there is only one mode of execution,
+i.e. the host and device are the same.
+</P>
+<P>IMPORTANT NOTE: Currently, if using GPUs, you should set the number
+of MPI tasks per compute node to be equal to the number of GPUs per
+compute node.  In the future Kokkos will support assigning one GPU to
+multiple MPI tasks or using multiple GPUs per MPI task.  Currently
+Kokkos does not support AMD GPUs due to limits in the available
+backend programming models (in particular relatively extensive C++
+support is required for the Kernel language).  This is expected to
+change in the future.
+</P>
+<P>Here are several examples of how to build LAMMPS and run a simulation
+using the KOKKOS package for typical compute node configurations.
+Note that the -np setting for the mpirun command in these examples is
+for a run on a single node.  To scale these examples up to run on a
+system with N compute nodes, simply multiply the -np setting by N.
+</P>
+<P>All the build steps are performed from within the src directory.  All
+the run steps are performed in the bench directory using the in.lj
+input script.  It is assumed the LAMMPS executable has been copied to
+that directory or whatever directory the runs are being performed in.
+Details of the various options are discussed below.
+</P>
+<P><B>Compute node(s) = dual hex-core CPUs and no GPU:</B>
+</P>
+<PRE>make yes-kokkos                           # install the KOKKOS package
+make g++ OMP=yes                          # build with OpenMP, no CUDA 
+</PRE>
+<PRE>mpirun -np 12 lmp_g++ -k off < in.lj      # MPI-only mode with no Kokkos
+mpirun -np 12 lmp_g++ -sf kk < in.lj      # MPI-only mode with Kokkos
+mpirun -np 1 lmp_g++ -k on t 12 -sf kk < in.lj     # one MPI task, 12 threads
+mpirun -np 2 lmp_g++ -k on t 6 -sf kk < in.lj      # two MPI tasks, 6 threads/task 
+</PRE>
+<P><B>Compute node(s) = Intel Phi with 61 cores:</B>
+</P>
+<PRE>make yes-kokkos
+make g++ OMP=yes MIC=yes                  # build with OpenMP for Phi 
+</PRE>
+<PRE>mpirun -np 12 lmp_g++ -k on t 20 -sf kk < in.lj      # 12*20 = 240 total cores
+mpirun -np 15 lmp_g++ -k on t 16 -sf kk < in.lj
+mpirun -np 30 lmp_g++ -k on t 8 -sf kk < in.lj
+mpirun -np 1 lmp_g++ -k on t 240 -sf kk < in.lj 
+</PRE>
+<P><B>Compute node(s) = dual hex-core CPUs and a single GPU:</B>
+</P>
+<PRE>make yes-kokkos
+make cuda CUDA=yes             # build for GPU, use src/MAKE/Makefile.cuda 
+</PRE>
+<PRE>mpirun -np 1 lmp_cuda -k on t 6 -sf kk < in.lj 
+</PRE>
+<P><B>Compute node(s) = dual 8-core CPUs and 2 GPUs:</B>
+</P>
+<PRE>make yes-kokkos
+make cuda CUDA=yes 
+</PRE>
+<PRE>mpirun -np 2 lmp_cuda -k on t 8 g 2 -sf kk < in.lj     # use both GPUs, one per MPI task 
+</PRE>
+<P><B>Building LAMMPS with the KOKKOS package:</B>
+</P>
+<P>A summary of the build process is given here.  More details and all
+the available make variable options are given in <A HREF = "Section_start.html#start_3_4">this
+section</A> of the manual.
+</P>
+<P>From the src directory, type
+</P>
+<PRE>make yes-kokkos 
+</PRE>
+<P>to include the KOKKOS package.  Then perform a normal LAMMPS build,
+with additional make variable specifications to choose the host and
+device you will run the resulting executable on, e.g.
+</P>
+<PRE>make g++ OMP=yes
+make cuda CUDA=yes 
+</PRE>
+<P>As illustrated above, the most important variables to set are OMP,
+CUDA, and MIC.  The default settings are OMP=yes, CUDA=no, MIC=no.
+Setting OMP to <I>yes</I> will use OpenMP for threading on the host, as
+well as on the device (if no GPU is present).  Setting CUDA to <I>yes</I>
+will use one or more GPUs as the device.  Setting MIC=yes is necessary
+when building for an Intel Phi processor.
+</P>
+<P>Note that to use a GPU, you must use a low-level Makefile,
+e.g. src/MAKE/Makefile.cuda as included in the LAMMPS distro, which
+uses the NVIDIA "nvcc" compiler.  You must check that the CCFLAGS -arch
+setting is appropriate for your NVIDIA hardware and installed
+software.  Typical values for -arch are given in <A HREF = "Section_start.html#start_3_4">this
+section</A> of the manual, as well as other
+settings that must be included in the low-level Makefile, if you create
+your own.
+</P>
+<P><B>Input scripts and use of command-line switches -kokkos and -suffix:</B>
+</P>
+<P>To use any Kokkos-enabled style provided in the KOKKOS package, you
+must use a Kokkos-enabled atom style.  LAMMPS will give an error if
+you do not do this.
+</P>
+<P>There are two command-line switches relevant to using Kokkos, -k or
+-kokkos, and -sf or -suffix.  They are described in detail in <A HREF = "Section_start.html#start_7">this
+section</A> of the manual.
+</P>
+<P>Here are common options to use:
+</P>
+<UL><LI>-k off : runs an executable built with the KOKKOS package, as
+ if Kokkos were not installed. 
+
+<LI>-sf kk : enables automatic use of Kokkos versions of atom, pair,
+fix, compute styles if they exist.  This can also be done with more
+precise control by using the <A HREF = "suffix.html">suffix</A> command or appending
+"kk" to styles within the input script, e.g. "pair_style lj/cut/kk". 
+
+<LI>-k on t Nt : specifies how many threads per MPI task to use within a
+ compute node.  For good performance, the product of MPI tasks *
+ threads/task should not exceed the number of physical CPU or Intel
+ Phi cores. 
+
+<LI>-k on g Ng : specifies how many GPUs per compute node are available.
+The default is 1, so this should be specified if you have 2 or more
+GPUs per compute node. 
+</UL>
+<P><B>Use of package command options:</B>
+</P>
+<P>Using the <A HREF = "package.html">package kokkos</A> command in an input script
+allows choice of options for neighbor lists and communication.  See
+the <A HREF = "package.html">package</A> command doc page for details and default
+settings.
+</P>
+<P>Experimenting with different styles of neighbor lists or inter-node
+communication can provide a speed-up for specific calculations.
+</P>
+<P><B>Running on a multi-core CPU:</B>
+</P>
+<P>Build with OMP=yes (the default) and CUDA=no (the default).
+</P>
+<P>If N is the number of physical cores/node, then the number of MPI
+tasks/node * number of threads/task should not exceed N, and should
+typically equal N.  Note that the default threads/task is 1, as set by
+the "t" keyword of the -k <A HREF = "Section_start.html#start_7">command-line
+switch</A>.  If you do not change this, no
+additional parallelism (beyond MPI) will be invoked on the host
+CPU(s).
+</P>
+<P>You can compare the performance running in different modes:
+</P>
+<UL><LI>run with 1 MPI task/node and N threads/task
+<LI>run with N MPI tasks/node and 1 thread/task
+<LI>run with settings in between these extremes 
+</UL>
+<P>Examples of mpirun commands in these modes, for nodes with dual
+hex-core CPUs and no GPU, are shown above.
+</P>
+<P><B>Running on GPUs:</B>
+</P>
+<P>Build with CUDA=yes, using src/MAKE/Makefile.cuda.  Insure the setting
+for CUDA_PATH in lib/kokkos/Makefile.lammps is correct for your Cuda
+software installation.  Insure the -arch setting in
+src/MAKE/Makefile.cuda is correct for your GPU hardware/software (see
+<A HREF = "Section_start.html#start_3_4">this section</A> of the manual for details).
+</P>
+<P>The -np setting of the mpirun command should set the number of MPI
+tasks/node to be equal to the # of physical GPUs on the node. 
+</P>
+<P>Use the <A HREF = "Section_start.html#start_7">-kokkos command-line switch</A> to
+specify the number of GPUs per node, and the number of threads per MPI
+task.  As above for multi-core CPUs (and no GPU), if N is the number
+of physical cores/node, then the number of MPI tasks/node * number of
+threads/task should not exceed N.  With one GPU (and one MPI task) it
+may be faster to use less than all the available cores, by setting
+threads/task to a smaller value.  This is because using all the cores
+on a dual-socket node will incur extra cost to copy memory from the
+2nd socket to the GPU.
+</P>
+<P>Examples of mpirun commands that follow these rules, for nodes with
+dual hex-core CPUs and one or two GPUs, are shown above.
+</P>
+<P><B>Running on an Intel Phi:</B>
+</P>
+<P>Kokkos only uses Intel Phi processors in their "native" mode, i.e.
+not hosted by a CPU.
+</P>
+<P>Build with OMP=yes (the default) and MIC=yes.  The latter
+insures code is correctly compiled for the Intel Phi.  The
+OMP setting means OpenMP will be used for parallelization
+on the Phi, which is currently the best option within
+Kokkos.  In the future, other options may be added.
+</P>
+<P>Current-generation Intel Phi chips have either 61 or 57 cores.  One
+core should be excluded to run the OS, leaving 60 or 56 cores.  Each
+core is hyperthreaded, so there are effectively N = 240 (4*60) or N =
+224 (4*56) cores to run on.
+</P>
+<P>The -np setting of the mpirun command sets the number of MPI
+tasks/node.  The "-k on t Nt" command-line switch sets the number of
+threads/task as Nt.  The product of these 2 values should be N, i.e.
+240 or 224.  Also, the number of threads/task should be a multiple of
+4 so that logical threads from more than one MPI task do not run on
+the same physical core.
+</P>
+<P>Examples of mpirun commands that follow these rules, for Intel Phi
+nodes with 61 cores, are shown above.
+</P>
+<P><B>Examples and benchmarks:</B>
+</P>
+<P>The examples/kokkos and bench/KOKKOS directories have scripts that can
+be run with the KOKKOS package, as well as detailed instructions on
+how to run them.
+</P>
+<P>IMPORTANT NOTE: the bench/KOKKOS directory does not yet exist.  It
+will be added later.
+</P>
+<P><B>Additional performance issues:</B>
+</P>
+<P>When using threads (OpenMP or pthreads), it is important for
+performance to bind the threads to physical cores, so they do not
+migrate during a simulation.  The same is true for MPI tasks, but the
+default binding rules implemented for various MPI versions do not
+account for thread binding.  
+</P>
+<P>Thus if you use more than one thread per MPI task, you should insure
+MPI tasks are bound to CPU sockets.  Furthermore, use thread affinity
+environment variables from the OpenMP runtime when using OpenMP and
+compile with hwloc support when using pthreads.  With OpenMP 3.1 (gcc
+4.7 or later, intel 12 or later) setting the environment variable
+OMP_PROC_BIND=true should be sufficient.  A typical mpirun command
+should set these flags:
+</P>
+<PRE>OpenMPI 1.8: mpirun -np 2 -bind-to socket -map-by socket ./lmp_openmpi ...
+Mvapich2 2.0: mpiexec -np 2 -bind-to socket -map-by socket ./lmp_mvapich ... 
+</PRE>
+<P>When using a GPU, you will achieve the best performance if your input
+script does not use any fix or compute styles which are not yet
+Kokkos-enabled.  This allows data to stay on the GPU for multiple
+timesteps, without being copied back to the host CPU.  Invoking a
+non-Kokkos fix or compute, or performing I/O for
+<A HREF = "thermo_style.html">thermo</A> or <A HREF = "dump.html">dump</A> output will cause data
+to be copied back to the CPU.
+</P>
+<P>You cannot yet assign multiple MPI tasks to the same GPU with the
+KOKKOS package.  We plan to support this in the future, similar to the
+GPU package in LAMMPS.
+</P>
+<P>You cannot yet use both the host (multi-threaded) and device (GPU)
+together to compute pairwise interactions with the KOKKOS package.  We
+hope to support this in the future, similar to the GPU package in
+LAMMPS.
+</P>
 <HR>

-<H4><A NAME = "acc_8"></A>5.8 Comparison of GPU and USER-CUDA packages 
+<HR>
+
+<H4><A NAME = "acc_9"></A>5.9 Comparison of GPU and USER-CUDA packages 
 </H4>
 <P>Both the GPU and USER-CUDA packages accelerate a LAMMPS calculation
 using NVIDIA hardware, but they do it in different ways.
--- a/doc/Section_accelerate.txt
+++ b/doc/Section_accelerate.txt
@ -21,7 +21,8 @@ kinds of machines.
 5.5 "USER-OMP package"_#acc_5
 5.6 "GPU package"_#acc_6
 5.7 "USER-CUDA package"_#acc_7
-5.8 "Comparison of GPU and USER-CUDA packages"_#acc_8 :all(b)
+5.8 "KOKKOS package"_#acc_8
+5.9 "Comparison of GPU and USER-CUDA packages"_#acc_9 :all(b)

 :line
 :line
@ -142,14 +143,14 @@ command is identical, their functionality is the same, and the
 numerical results it produces should also be identical, except for
 precision and round-off issues.

-For example, all of these variants of the basic Lennard-Jones pair
-style exist in LAMMPS:
+For example, all of these styles are variants of the basic
+Lennard-Jones pair style "pair_style lj/cut"_pair_lj.html:

-"pair_style lj/cut"_pair_lj.html
-"pair_style lj/cut/opt"_pair_lj.html
-"pair_style lj/cut/omp"_pair_lj.html
+"pair_style lj/cut/cuda"_pair_lj.html
 "pair_style lj/cut/gpu"_pair_lj.html
-"pair_style lj/cut/cuda"_pair_lj.html :ul
+"pair_style lj/cut/kk"_pair_lj.html
+"pair_style lj/cut/omp"_pair_lj.html
+"pair_style lj/cut/opt"_pair_lj.html :ul

 Assuming you have built LAMMPS with the appropriate package, these
 styles can be invoked by specifying them explicitly in your input
@ -157,11 +158,17 @@ script.  Or you can use the "-suffix command-line
 switch"_Section_start.html#start_7 to invoke the accelerated versions
 automatically, without changing your input script.  The
 "suffix"_suffix.html command allows you to set a suffix explicitly and
-to turn off/on the comand-line switch setting, both from within your
-input script.
+to turn off and back on the command-line switch setting, both from
+within your input script.

-Styles with an "opt" suffix are part of the OPT package and typically
-speed-up the pairwise calculations of your simulation by 5-25%.
+Styles with a "cuda" or "gpu" suffix are part of the USER-CUDA or GPU
+packages, and can be run on NVIDIA GPUs associated with your CPUs.
+The speed-up due to GPU usage depends on a variety of factors, as
+discussed below.
+
+Styles with a "kk" suffix are part of the KOKKOS package, and can be
+run using OpenMP, pthreads, or on an NVIDIA GPU.  The speed-up depends
+on a variety of factors, as discussed below.

 Styles with an "omp" suffix are part of the USER-OMP package and allow
 a pair-style to be run in multi-threaded mode using OpenMP.  This can
@ -170,26 +177,26 @@ than cores is advantageous, e.g. when running with PPPM so that FFTs
 are run on fewer MPI processors or when the many MPI tasks would
 overload the available bandwidth for communication.

-Styles with a "gpu" or "cuda" suffix are part of the GPU or USER-CUDA
-packages, and can be run on NVIDIA GPUs associated with your CPUs.
-The speed-up due to GPU usage depends on a variety of factors, as
-discussed below.
+Styles with an "opt" suffix are part of the OPT package and typically
+speed-up the pairwise calculations of your simulation by 5-25%.

 To see what styles are currently available in each of the accelerated
 packages, see "Section_commands 5"_Section_commands.html#cmd_5 of the
 manual.  A list of accelerated styles is included in the pair, fix,
-compute, and kspace sections.
+compute, and kspace sections.  The doc page for each individual style
+(e.g. "pair lj/cut"_pair_lj.html or "fix nve"_fix_nve.html) will also
+list any accelerated variants available for that style.

 The following sections explain:

 what hardware and software the accelerated styles require
-how to build LAMMPS with the accelerated packages in place
+how to build LAMMPS with the accelerated package in place
 what changes (if any) are needed in your input scripts
 guidelines for best performance
 speed-ups you can expect :ul

 The final section compares and contrasts the GPU and USER-CUDA
-packages, since they are both designed to use NVIDIA GPU hardware.
+packages, since they are both designed to use NVIDIA hardware.

 :line

@ -208,8 +215,8 @@ dependencies:
 make yes-opt
 make machine :pre

-If your input script uses one of the OPT pair styles,
-you can run it as follows:
+If your input script uses one of the OPT pair styles, you can run it
+as follows:

 lmp_machine -sf opt -in in.script
 mpirun -np 4 lmp_machine -sf opt -in in.script :pre
@ -222,12 +229,13 @@ to 20% savings.

 5.5 USER-OMP package :h4,link(acc_5)

-The USER-OMP package was developed by Axel Kohlmeyer at Temple University.
-It provides multi-threaded versions of most pair styles, all dihedral
-styles and a few fixes in LAMMPS. The package currently uses the OpenMP
-interface which requires using a specific compiler flag in the makefile
-to enable multiple threads; without this flag the corresponding pair
-styles will still be compiled and work, but do not support multi-threading.
+The USER-OMP package was developed by Axel Kohlmeyer at Temple
+University.  It provides multi-threaded versions of most pair styles,
+all dihedral styles, and a few fixes in LAMMPS. The package currently
+uses the OpenMP interface which requires using a specific compiler
+flag in the makefile to enable multiple threads; without this flag the
+corresponding pair styles will still be compiled and work, but do not
+support multi-threading.

 [Building LAMMPS with the USER-OMP package:]

@ -260,18 +268,19 @@ env OMP_NUM_THREADS=2 mpirun -np 2 lmp_machine -sf omp -in in.script
 mpirun -x OMP_NUM_THREADS=2 -np 2 lmp_machine -sf omp -in in.script :pre

 The value of the environment variable OMP_NUM_THREADS determines how
-many threads per MPI task are launched. All three examples above use
-a total of 4 CPU cores.  For different MPI implementations the method
-to pass the OMP_NUM_THREADS environment variable to all processes is
-different.  Two different variants, one for MPICH and OpenMPI, respectively
-are shown above.  Please check the documentation of your MPI installation
-for additional details.  Alternatively, the value provided by OMP_NUM_THREADS
-can be overridded with the "package omp"_package.html command.
-Depending on which styles are accelerated in your input, you should
-see a reduction in the "Pair time" and/or "Bond time" and "Loop time"
-printed out at the end of the run. The optimal ratio of MPI to OpenMP
-can vary a lot and should always be confirmed through some benchmark
-runs for the current system and on the current machine.
+many threads per MPI task are launched. All three examples above use a
+total of 4 CPU cores.  For different MPI implementations the method to
+pass the OMP_NUM_THREADS environment variable to all processes is
+different.  Two different variants, one for MPICH and one for OpenMPI,
+are shown above.  Please check the documentation of your
+MPI installation for additional details.  Alternatively, the value
+provided by OMP_NUM_THREADS can be overridden with the "package
+omp"_package.html command.  Depending on which styles are accelerated
+in your input, you should see a reduction in the "Pair time" and/or
+"Bond time" and "Loop time" printed out at the end of the run. The
+optimal ratio of MPI to OpenMP can vary a lot and should always be
+confirmed through some benchmark runs for the current system and on
+the current machine.

 [Restrictions:]

@ -292,53 +301,55 @@ On the other hand, in many cases you still want to use the {omp} version
 all contain optimizations similar to those in the OPT package, which
 can result in serial speedup.

-Using multi-threading is most effective under the following circumstances:
+Using multi-threading is most effective under the following
+circumstances:

-Individual compute nodes have a significant number of CPU cores
-but the CPU itself has limited memory bandwidth, e.g. Intel Xeon 53xx
-(Clovertown) and 54xx (Harpertown) quad core processors. Running
-one MPI task per CPU core will result in significant performance
-degradation, so that running with 4 or even only 2 MPI tasks per
-nodes is faster. Running in hybrid MPI+OpenMP mode will reduce the
-inter-node communication bandwidth contention in the same way,
-but offers and additional speedup from utilizing the otherwise
-idle CPU cores. :ulb,l
+Individual compute nodes have a significant number of CPU cores but
+the CPU itself has limited memory bandwidth, e.g. Intel Xeon 53xx
+(Clovertown) and 54xx (Harpertown) quad core processors. Running one
+MPI task per CPU core will result in significant performance
+degradation, so that running with 4 or even only 2 MPI tasks per node
+is faster. Running in hybrid MPI+OpenMP mode will reduce the
+inter-node communication bandwidth contention in the same way, but
+offers an additional speedup from utilizing the otherwise idle CPU
+cores. :ulb,l

 The interconnect used for MPI communication is not able to provide
-sufficient bandwidth for a large number of MPI tasks per node.
-This applies for example to running over gigabit ethernet or
-on Cray XT4 or XT5 series supercomputers. Same as in the aforementioned
-case this effect worsens with using an increasing number of nodes. :l
+sufficient bandwidth for a large number of MPI tasks per node.  This
+applies for example to running over gigabit ethernet or on Cray XT4 or
+XT5 series supercomputers. Same as in the aforementioned case this
+effect worsens with using an increasing number of nodes. :l

-The input is a system that has an inhomogeneous particle density
-which cannot be mapped well to the domain decomposition scheme
-that LAMMPS employs. While this can be to some degree alleviated
-through using the "processors"_processors.html keyword, multi-threading
-provides a parallelism that parallelizes over the number of particles
-not their distribution in space. :l
+The input is a system that has an inhomogeneous particle density which
+cannot be mapped well to the domain decomposition scheme that LAMMPS
+employs. While this can be to some degree alleviated through using the
+"processors"_processors.html keyword, multi-threading provides a
+parallelism that parallelizes over the number of particles not their
+distribution in space. :l

 Finally, multi-threaded styles can improve performance when running
 LAMMPS in "capability mode", i.e. near the point where the MPI
-parallelism scales out. This can happen in particular when using
-as kspace style for long-range electrostatics. Here the scaling
-of the kspace style is the performance limiting factor and using
-multi-threaded styles allows to operate the kspace style at the
-limit of scaling and then increase performance parallelizing
-the real space calculations with hybrid MPI+OpenMP. Sometimes
-additional speedup can be achived by increasing the real-space
-coulomb cutoff and thus reducing the work in the kspace part. :l,ule
+parallelism scales out. This can happen in particular when using a
+kspace style for long-range electrostatics. Here the scaling of the
+kspace style is the performance limiting factor and using
+multi-threaded styles allows operating the kspace style at the limit
+of scaling and then increasing performance by parallelizing the real space
+calculations with hybrid MPI+OpenMP. Sometimes additional speedup can
+be achieved by increasing the real-space coulomb cutoff and thus
+reducing the work in the kspace part. :l,ule

-The best parallel efficiency from {omp} styles is typically 
-achieved when there is at least one MPI task per physical 
-processor, i.e. socket or die.
+The best parallel efficiency from {omp} styles is typically achieved
+when there is at least one MPI task per physical processor,
+i.e. socket or die.

 Using threads on hyper-threading enabled cores is usually
 counterproductive, as the cost in additional memory bandwidth
-requirements is not offset by the gain in CPU utilization
-through hyper-threading.
+requirements is not offset by the gain in CPU utilization through
+hyper-threading.

 A description of the multi-threading strategy and some performance
-examples are "presented here"_http://sites.google.com/site/akohlmey/software/lammps-icms/lammps-icms-tms2011-talk.pdf?attredirects=0&d=1
+examples are "presented
+here"_http://sites.google.com/site/akohlmey/software/lammps-icms/lammps-icms-tms2011-talk.pdf?attredirects=0&d=1

 :line

@ -365,36 +376,23 @@ between processors, runs on the CPU. :l
 Asynchronous force computations can be performed simultaneously on the
 CPU(s) and GPU. :l

+It allows for GPU computations to be performed in single or double
+precision, or in mixed-mode precision, where pairwise forces are
+computed in single precision, but accumulated into double-precision
+force vectors. :l
+
 LAMMPS-specific code is in the GPU package.  It makes calls to a
 generic GPU library in the lib/gpu directory.  This library provides
 NVIDIA support as well as more general OpenCL support, so that the
 same functionality can eventually be supported on a variety of GPU
 hardware. :l,ule

-
-
-NOTE:
-  discuss 3 precisions
-    if change, also have to re-link with LAMMPS
-  always use newton off
-  expt with differing numbers of CPUs vs GPU - can't tell what is fastest
-  give command line switches in examples
-
-
-I am not very clear to the meaning of  "Max Mem / Proc"
-in the "GPU Time Info (average)".
-Is it the maximal of GPU memory used by one CPU core?
-
-It is the maximum memory used at one time on the GPU for data storage by
-a single MPI process. - Mike
-
-
 [Hardware and software requirements:]

-To use this package, you currently need to have specific NVIDIA
-hardware and install specific NVIDIA CUDA software on your system:
+To use this package, you currently need to have an NVIDIA GPU and
+install the NVIDIA Cuda software on your system:

-Check if you have an NVIDIA card: cat /proc/driver/nvidia/cards/0
+Check if you have an NVIDIA GPU: cat /proc/driver/nvidia/cards/0
 Go to http://www.nvidia.com/object/cuda_get.html
 Install a driver and toolkit appropriate for your system (SDK is not necessary)
 Follow the instructions in lammps/lib/gpu/README to build the library (see below)
@ -406,8 +404,21 @@ As with other packages that include a separately compiled library, you
 need to first build the GPU library, before building LAMMPS itself.
 General instructions for doing this are in "this
 section"_Section_start.html#start_3 of the manual.  For this package,
-do the following, using a Makefile in lib/gpu appropriate for your
-system:
+use a Makefile in lib/gpu appropriate for your system.
+
+Before building the library, you can set the precision it will use by
+editing the CUDA_PREC setting in the Makefile you are using, as
+follows:
+
+CUDA_PREC = -D_SINGLE_SINGLE  # Single precision for all calculations
+CUDA_PREC = -D_DOUBLE_DOUBLE  # Double precision for all calculations
+CUDA_PREC = -D_SINGLE_DOUBLE  # Accumulation of forces, etc, in double :pre
+
+The last setting is the mixed mode referred to above.  Note that your
+GPU must support double precision to use either the 2nd or 3rd of
+these settings.
+
+To build the library, then type:

 cd lammps/lib/gpu
 make -f Makefile.linux
@ -427,41 +438,60 @@ set appropriately to include the paths and settings for the CUDA
 system software on your machine.  See src/MAKE/Makefile.g++ for an
 example.

-[GPU configuration]
+Also note that if you change the GPU library precision, you need to
+re-build the entire library.  You should do a "clean" first,
+e.g. "make -f Makefile.linux clean".  Then you must also re-build
+LAMMPS if the library precision has changed, so that it re-links with
+the new library.

-When using GPUs, you are restricted to one physical GPU per LAMMPS
-process, which is an MPI process running on a single core or
-processor.  Multiple MPI processes (CPU cores) can share a single GPU,
-and in many cases it will be more efficient to run this way.
+[Running an input script:]

-[Input script requirements:]
+The examples/gpu and bench/GPU directories have scripts that can be
+run with the GPU package, as well as detailed instructions on how to
+run them.

-Additional input script requirements to run pair or PPPM styles with a
+The total number of MPI tasks used by LAMMPS (one or multiple per
+compute node) is set in the usual manner via the mpirun or mpiexec
+commands, and is independent of the GPU package.
+
+When using the GPU package, you cannot assign more than one physical
+GPU to an MPI task.  However multiple MPI tasks can share the same
+GPU, and in many cases it will be more efficient to run this way.
+
+Input script requirements to run using pair or PPPM styles with a
 {gpu} suffix are as follows:

-To invoke specific styles from the GPU package, you can either append
-"gpu" to the style name (e.g. pair_style lj/cut/gpu), or use the
-"-suffix command-line switch"_Section_start.html#start_7, or use the
-"suffix"_suffix.html command. :ulb,l
+To invoke specific styles from the GPU package, either append "gpu" to
+the style name (e.g. pair_style lj/cut/gpu), or use the "-suffix
+command-line switch"_Section_start.html#start_7, or use the
+"suffix"_suffix.html command in the input script. :ulb,l

-The "newton pair"_newton.html setting must be {off}. :l
+The "newton pair"_newton.html setting in the input script must be
+{off}. :l

-The "package gpu"_package.html command must be used near the beginning
-of your script to control the GPU selection and initialization
-settings.  It also has an option to enable asynchronous splitting of
-force computations between the CPUs and GPUs. :l,ule
+Unless the "-suffix gpu command-line
+switch"_Section_start.html#start_7 is used, the "package
+gpu"_package.html command must be used near the beginning of the
+script to control the GPU selection and initialization settings.  It
+also has an option to enable asynchronous splitting of force
+computations between the CPUs and GPUs. :l,ule

-As an example, if you have two GPUs per node and 8 CPU cores per node,
-and would like to run on 4 nodes (32 cores) with dynamic balancing of
-force calculation across CPU and GPU cores, you could specify
+The default for the "package gpu"_package.html command is to have all
+the MPI tasks on the compute node use a single GPU.  If you have
+multiple GPUs per node, then be sure to create one or more MPI tasks
+per GPU, and use the first/last settings in the "package
+gpu"_package.html command to include all the GPU IDs on the node.
+E.g. first = 0, last = 1, for 2 GPUs.  For example, on an 8-core 2-GPU
+compute node, if you assign 8 MPI tasks to the node, the following
+command in the input script

-package gpu force/neigh 0 1 -1 :pre
+package gpu force/neigh 0 1 -1 :pre

-In this case, all CPU cores and GPU devices on the nodes would be
-utilized.  Each GPU device would be shared by 4 CPU cores. The CPU
-cores would perform force calculations for some fraction of the
-particles at the same time the GPUs performed force calculation for
-the other particles.
+would specify that each GPU is shared by 4 MPI tasks.  The final -1 will
+dynamically balance force calculations across the CPU cores and GPUs.
+I.e. each CPU core will perform force calculations for some small
+fraction of the particles, at the same time the GPUs perform force
+calculations for the majority of the particles.

 [Timing output:]

@ -485,19 +515,30 @@ screen output (not in the log file) at the end of each run.  These
 timings represent total time spent on the GPU for each routine,
 regardless of asynchronous CPU calculations.

+The output section "GPU Time Info (average)" reports "Max Mem / Proc".
+This is the maximum memory used at one time on the GPU for data
+storage by a single MPI process.
+
 [Performance tips:]

-Generally speaking, for best performance, you should use multiple CPUs
-per GPU, as provided my most multi-core CPU/GPU configurations.
+You should experiment with how many MPI tasks per GPU to use to see
+what gives the best performance for your problem.  This is a function
+of your problem size and what pair style you are using.  Likewise, you
+should also experiment with the precision setting for the GPU library
+to see if single or mixed precision will give accurate results, since
+they will typically be faster.

-Because of the large number of cores within each GPU device, it may be
-more efficient to run on fewer processes per GPU when the number of
-particles per MPI process is small (100's of particles); this can be
-necessary to keep the GPU cores busy.
+Using multiple MPI tasks per GPU will often give the best performance,
+as allowed by most multi-core CPU/GPU configurations.

-See the lammps/lib/gpu/README file for instructions on how to build
-the GPU library for single, mixed, or double precision.  The latter
-requires that your GPU card support double precision.
+If the number of particles per MPI task is small (e.g. 100s of
+particles), it can be more efficient to run with fewer MPI tasks per
+GPU, even if you do not use all the cores on the compute node.
+
+The "Benchmark page"_http://lammps.sandia.gov/bench.html of the LAMMPS
+web site gives GPU performance on a desktop machine and the Titan HPC
+platform at ORNL for several of the LAMMPS benchmarks, as a function
+of problem size and number of compute nodes.

 :line

@ -633,10 +674,303 @@ a fix or compute that is non-GPU-ized, or until output is performed
 (thermo or dump snapshot or restart file).  The less often this
 occurs, the faster your simulation will run.

+:line
+
+5.8 KOKKOS package :h4,link(acc_8)
+
+The KOKKOS package contains versions of pair, fix, and atom styles
+that use data structures and methods and macros provided by the Kokkos
+library, which is included with LAMMPS in lib/kokkos.
+
+"Kokkos"_http://trilinos.sandia.gov/packages/kokkos is a C++ library
+that provides two key abstractions for an application like LAMMPS.
+First, it allows a single implementation of an application kernel
+(e.g. a pair style) to run efficiently on different kinds of hardware
+(GPU, Intel Phi, many-core chip).
+
+Second, it provides data abstractions to adjust (at compile time) the
+memory layout of basic data structures like 2d and 3d arrays and allow
+the transparent utilization of special hardware load and store units.
+Such data structures are used in LAMMPS to store atom coordinates or
+forces or neighbor lists.  The layout is chosen to optimize
+performance on different platforms.  Again this operation is hidden
+from the developer, and does not affect how the single implementation
+of the kernel is coded.
+
+These abstractions are set at build time, when LAMMPS is compiled with
+the KOKKOS package installed.  This is done by selecting a "host" and
+"device" to build for, compatible with the compute nodes in your
+machine.  Note that if you are running on a desktop machine, you
+typically have one compute node.  On a cluster or supercomputer there
+may be dozens or 1000s of compute nodes.  The procedure for building
+and running with the Kokkos library is the same, no matter how many
+nodes you run on.
+
+All Kokkos operations occur within the context of an individual MPI
+task running on a single node of the machine.  The total number of MPI
+tasks used by LAMMPS (one or multiple per compute node) is set in the
+usual manner via the mpirun or mpiexec commands, and is independent of
+Kokkos.
+
+Kokkos provides support for one or two modes of execution per MPI
+task.  This means that some computational tasks (pairwise
+interactions, neighbor list builds, time integration, etc) are
+parallelized in one or the other of the two modes.  The first mode is
+called the "host" and is one or more threads running on one or more
+physical CPUs (within the node).  Currently, both multi-core CPUs and
+an Intel Phi processor (running in native mode) are supported.  The
+second mode is called the "device" and is an accelerator chip of some
+kind.  Currently only an NVIDIA GPU is supported.  If your compute
+node does not have a GPU, then there is only one mode of execution,
+i.e. the host and device are the same.
+
+IMPORTANT NOTE: Currently, if using GPUs, you should set the number
+of MPI tasks per compute node to be equal to the number of GPUs per
+compute node.  In the future Kokkos will support assigning one GPU to
+multiple MPI tasks or using multiple GPUs per MPI task.  Currently
+Kokkos does not support AMD GPUs due to limits in the available
+backend programming models (in particular relatively extensive C++
+support is required for the Kernel language).  This is expected to
+change in the future.
+
+Here are several examples of how to build LAMMPS and run a simulation
+using the KOKKOS package for typical compute node configurations.
+Note that the -np setting for the mpirun command in these examples is
+for a run on a single node.  To scale these examples up to run on a
+system with N compute nodes, simply multiply the -np setting by N.
+
+All the build steps are performed from within the src directory.  All
+the run steps are performed in the bench directory using the in.lj
+input script.  It is assumed the LAMMPS executable has been copied to
+that directory or whatever directory the runs are being performed in.
+Details of the various options are discussed below.
+
+[Compute node(s) = dual hex-core CPUs and no GPU:]
+
+make yes-kokkos                           # install the KOKKOS package
+make g++ OMP=yes                          # build with OpenMP, no CUDA :pre
+
+mpirun -np 12 lmp_g++ -k off < in.lj      # MPI-only mode with no Kokkos
+mpirun -np 12 lmp_g++ -sf kk < in.lj      # MPI-only mode with Kokkos
+mpirun -np 1 lmp_g++ -k on t 12 -sf kk < in.lj     # one MPI task, 12 threads
+mpirun -np 2 lmp_g++ -k on t 6 -sf kk < in.lj      # two MPI tasks, 6 threads/task :pre
+
+[Compute node(s) = Intel Phi with 61 cores:]
+
+make yes-kokkos
+make g++ OMP=yes MIC=yes                  # build with OpenMP for Phi :pre
+
+mpirun -np 12 lmp_g++ -k on t 20 -sf kk < in.lj      # 12*20 = 240 total cores
+mpirun -np 15 lmp_g++ -k on t 16 -sf kk < in.lj
+mpirun -np 30 lmp_g++ -k on t 8 -sf kk < in.lj
+mpirun -np 1 lmp_g++ -k on t 240 -sf kk < in.lj :pre
+
+[Compute node(s) = dual hex-core CPUs and a single GPU:]
+
+make yes-kokkos
+make cuda CUDA=yes             # build for GPU, use src/MAKE/Makefile.cuda :pre
+
+mpirun -np 1 lmp_cuda -k on t 6 -sf kk < in.lj :pre
+
+[Compute node(s) = dual 8-core CPUs and 2 GPUs:]
+
+make yes-kokkos
+make cuda CUDA=yes :pre
+
+mpirun -np 2 lmp_cuda -k on t 8 g 2 -sf kk < in.lj     # use both GPUs, one per MPI task :pre
+
+[Building LAMMPS with the KOKKOS package:]
+
+A summary of the build process is given here.  More details and all
+the available make variable options are given in "this
+section"_Section_start.html#start_3_4 of the manual.
+
+From the src directory, type
+
+make yes-kokkos :pre
+
+to include the KOKKOS package.  Then perform a normal LAMMPS build,
+with additional make variable specifications to choose the host and
+device you will run the resulting executable on, e.g.
+
+make g++ OMP=yes
+make cuda CUDA=yes :pre
+
+As illustrated above, the most important variables to set are OMP,
+CUDA, and MIC.  The default settings are OMP=yes, CUDA=no, MIC=no.
+Setting OMP to {yes} will use OpenMP for threading on the host, as
+well as on the device (if no GPU is present).  Setting CUDA to {yes}
+will use one or more GPUs as the device.  Setting MIC=yes is necessary
+when building for an Intel Phi processor.
+
+Note that to use a GPU, you must use a low-level Makefile,
+e.g. src/MAKE/Makefile.cuda as included in the LAMMPS distro, which
+uses the NVIDIA "nvcc" compiler.  You must check that the CCFLAGS -arch
+setting is appropriate for your NVIDIA hardware and installed
+software.  Typical values for -arch are given in "this
+section"_Section_start.html#start_3_4 of the manual, as well as other
+settings that must be included in the low-level Makefile, if you create
+your own.
+
+[Input scripts and use of command-line switches -kokkos and -suffix:]
+
+To use any Kokkos-enabled style provided in the KOKKOS package, you
+must use a Kokkos-enabled atom style.  LAMMPS will give an error if
+you do not do this.
+
+There are two command-line switches relevant to using Kokkos, -k or
+-kokkos, and -sf or -suffix.  They are described in detail in "this
+section"_Section_start.html#start_7 of the manual.
+
+Here are common options to use:
+
+-k off : runs an executable built with the KOKKOS package, as
+ if Kokkos were not installed. :ulb,l
+
+-sf kk : enables automatic use of Kokkos versions of atom, pair,
+fix, compute styles if they exist.  This can also be done with more
+precise control by using the "suffix"_suffix.html command or appending
+"kk" to styles within the input script, e.g. "pair_style lj/cut/kk". :l
+
+-k on t Nt : specifies how many threads per MPI task to use within a
+ compute node.  For good performance, the product of MPI tasks *
+ threads/task should not exceed the number of physical CPU or Intel
+ Phi cores. :l
+
+-k on g Ng : specifies how many GPUs per compute node are available.
+The default is 1, so this should be specified if you have 2 or more
+GPUs per compute node. :ule,l
+
+[Use of package command options:]
+
+Using the "package kokkos"_package.html command in an input script
+allows choice of options for neighbor lists and communication.  See
+the "package"_package.html command doc page for details and default
+settings.
+
+Experimenting with different styles of neighbor lists or inter-node
+communication can provide a speed-up for specific calculations.
+
+[Running on a multi-core CPU:]
+
+Build with OMP=yes (the default) and CUDA=no (the default).
+
+If N is the number of physical cores/node, then the number of MPI
+tasks/node * number of threads/task should not exceed N, and should
+typically equal N.  Note that the default threads/task is 1, as set by
+the "t" keyword of the -k "command-line
+switch"_Section_start.html#start_7.  If you do not change this, no
+additional parallelism (beyond MPI) will be invoked on the host
+CPU(s).
+
+You can compare the performance running in different modes:
+  
+run with 1 MPI task/node and N threads/task
+run with N MPI tasks/node and 1 thread/task
+run with settings in between these extremes :ul
+
+Examples of mpirun commands in these modes, for nodes with dual
+hex-core CPUs and no GPU, are shown above.
+
+[Running on GPUs:]
+
+Build with CUDA=yes, using src/MAKE/Makefile.cuda.  Insure the setting
+for CUDA_PATH in lib/kokkos/Makefile.lammps is correct for your Cuda
+software installation.  Insure the -arch setting in
+src/MAKE/Makefile.cuda is correct for your GPU hardware/software (see
+"this section"_Section_start.html#start_3_4 of the manual for details).
+
+The -np setting of the mpirun command should set the number of MPI
+tasks/node to be equal to the # of physical GPUs on the node. 
+
+Use the "-kokkos command-line switch"_Section_start.html#start_7 to
+specify the number of GPUs per node, and the number of threads per MPI
+task.  As above for multi-core CPUs (and no GPU), if N is the number
+of physical cores/node, then the number of MPI tasks/node * number of
+threads/task should not exceed N.  With one GPU (and one MPI task) it
+may be faster to use less than all the available cores, by setting
+threads/task to a smaller value.  This is because using all the cores
+on a dual-socket node will incur extra cost to copy memory from the
+2nd socket to the GPU.
+
+Examples of mpirun commands that follow these rules, for nodes with
+dual hex-core CPUs and one or two GPUs, are shown above.
+
+[Running on an Intel Phi:]
+
+Kokkos only uses Intel Phi processors in their "native" mode, i.e.
+not hosted by a CPU.
+
+Build with OMP=yes (the default) and MIC=yes.  The latter
+insures code is correctly compiled for the Intel Phi.  The
+OMP setting means OpenMP will be used for parallelization
+on the Phi, which is currently the best option within
+Kokkos.  In the future, other options may be added.
+
+Current-generation Intel Phi chips have either 61 or 57 cores.  One
+core should be excluded to run the OS, leaving 60 or 56 cores.  Each
+core is hyperthreaded, so there are effectively N = 240 (4*60) or N =
+224 (4*56) cores to run on.
+
+The -np setting of the mpirun command sets the number of MPI
+tasks/node.  The "-k on t Nt" command-line switch sets the number of
+threads/task as Nt.  The product of these 2 values should be N, i.e.
+240 or 224.  Also, the number of threads/task should be a multiple of
+4 so that logical threads from more than one MPI task do not run on
+the same physical core.
+
+Examples of mpirun commands that follow these rules, for Intel Phi
+nodes with 61 cores, are shown above.
+
+[Examples and benchmarks:]
+
+The examples/kokkos and bench/KOKKOS directories have scripts that can
+be run with the KOKKOS package, as well as detailed instructions on
+how to run them.
+
+IMPORTANT NOTE: the bench/KOKKOS directory does not yet exist.  It
+will be added later.
+
+[Additional performance issues:]
+
+When using threads (OpenMP or pthreads), it is important for
+performance to bind the threads to physical cores, so they do not
+migrate during a simulation.  The same is true for MPI tasks, but the
+default binding rules implemented for various MPI versions do not
+account for thread binding.  
+
+Thus if you use more than one thread per MPI task, you should insure
+MPI tasks are bound to CPU sockets.  Furthermore, use thread affinity
+environment variables from the OpenMP runtime when using OpenMP and
+compile with hwloc support when using pthreads.  With OpenMP 3.1 (gcc
+4.7 or later, intel 12 or later) setting the environment variable
+OMP_PROC_BIND=true should be sufficient.  A typical mpirun command
+should set these flags:
+
+OpenMPI 1.8: mpirun -np 2 -bind-to socket -map-by socket ./lmp_openmpi ...
+Mvapich2 2.0: mpiexec -np 2 -bind-to socket -map-by socket ./lmp_mvapich ... :pre
+
+When using a GPU, you will achieve the best performance if your input
+script does not use any fix or compute styles which are not yet
+Kokkos-enabled.  This allows data to stay on the GPU for multiple
+timesteps, without being copied back to the host CPU.  Invoking a
+non-Kokkos fix or compute, or performing I/O for
+"thermo"_thermo_style.html or "dump"_dump.html output will cause data
+to be copied back to the CPU.
+
+You cannot yet assign multiple MPI tasks to the same GPU with the
+KOKKOS package.  We plan to support this in the future, similar to the
+GPU package in LAMMPS.
+
+You cannot yet use both the host (multi-threaded) and device (GPU)
+together to compute pairwise interactions with the KOKKOS package.  We
+hope to support this in the future, similar to the GPU package in
+LAMMPS.
+
 :line
 :line

-5.8 Comparison of GPU and USER-CUDA packages :h4,link(acc_8)
+5.9 Comparison of GPU and USER-CUDA packages :h4,link(acc_9)

 Both the GPU and USER-CUDA packages accelerate a LAMMPS calculation
 using NVIDIA hardware, but they do it in different ways.
--- a/doc/Section_commands.html
+++ b/doc/Section_commands.html
@ -428,10 +428,10 @@ package</A>.
 <DIV ALIGN=center><TABLE  BORDER=1 >
 <TR ALIGN="center"><TD ><A HREF = "fix_freeze.html">freeze/cuda</A></TD><TD ><A HREF = "fix_addforce.html">addforce/cuda</A></TD><TD ><A HREF = "fix_aveforce.html">aveforce/cuda</A></TD><TD ><A HREF = "fix_enforce2d.html">enforce2d/cuda</A></TD><TD ><A HREF = "fix_gravity.html">gravity/cuda</A></TD><TD ><A HREF = "fix_gravity.html">gravity/omp</A></TD></TR>
 <TR ALIGN="center"><TD ><A HREF = "fix_nh.html">nph/omp</A></TD><TD ><A HREF = "fix_nphug.html">nphug/omp</A></TD><TD ><A HREF = "fix_nph_asphere.html">nph/asphere/omp</A></TD><TD ><A HREF = "fix_nph_sphere.html">nph/sphere/omp</A></TD><TD ><A HREF = "fix_nh.html">npt/cuda</A></TD><TD ><A HREF = "fix_nh.html">npt/omp</A></TD></TR>
-<TR ALIGN="center"><TD ><A HREF = "fix_npt_asphere.html">npt/asphere/omp</A></TD><TD ><A HREF = "fix_npt_sphere.html">npt/sphere/omp</A></TD><TD ><A HREF = "fix_nh.html">nve/cuda</A></TD><TD ><A HREF = "fix_nve.html">nve/omp</A></TD><TD ><A HREF = "fix_nve_sphere.html">nve/sphere/omp</A></TD><TD ><A HREF = "fix_nh.html">nvt/cuda</A></TD></TR>
-<TR ALIGN="center"><TD ><A HREF = "fix_nh.html">nvt/omp</A></TD><TD ><A HREF = "fix_nvt_asphere.html">nvt/asphere/omp</A></TD><TD ><A HREF = "fix_nvt_sllod.html">nvt/sllod/omp</A></TD><TD ><A HREF = "fix_nvt_sphere.html">nvt/sphere/omp</A></TD><TD ><A HREF = "fix_qeq_comb.html">qeq/comb/omp</A></TD><TD ><A HREF = "fix_rigid.html">rigid/omp</A></TD></TR>
-<TR ALIGN="center"><TD ><A HREF = "fix_rigid.html">rigid/nph/omp</A></TD><TD ><A HREF = "fix_rigid.html">rigid/npt/omp</A></TD><TD ><A HREF = "fix_rigid.html">rigid/nve/omp</A></TD><TD ><A HREF = "fix_rigid.html">rigid/nvt/omp</A></TD><TD ><A HREF = "fix_rigid.html">rigid/small/omp</A></TD><TD ><A HREF = "fix_setforce.html">setforce/cuda</A></TD></TR>
-<TR ALIGN="center"><TD ><A HREF = "fix_shake.html">shake/cuda</A></TD><TD ><A HREF = "fix_temp_berendsen.html">temp/berendsen/cuda</A></TD><TD ><A HREF = "fix_temp_rescale.html">temp/rescale/cuda</A></TD><TD ><A HREF = "fix_temp_rescale.html">temp/rescale/limit/cuda</A></TD><TD ><A HREF = "fix_viscous.html">viscous/cuda</A> 
+<TR ALIGN="center"><TD ><A HREF = "fix_npt_asphere.html">npt/asphere/omp</A></TD><TD ><A HREF = "fix_npt_sphere.html">npt/sphere/omp</A></TD><TD ><A HREF = "fix_nve.html">nve/cuda</A></TD><TD ><A HREF = "fix_nve.html">nve/kk</A></TD><TD ><A HREF = "fix_nve.html">nve/omp</A></TD><TD ><A HREF = "fix_nve_sphere.html">nve/sphere/omp</A></TD></TR>
+<TR ALIGN="center"><TD ><A HREF = "fix_nh.html">nvt/cuda</A></TD><TD ><A HREF = "fix_nh.html">nvt/omp</A></TD><TD ><A HREF = "fix_nvt_asphere.html">nvt/asphere/omp</A></TD><TD ><A HREF = "fix_nvt_sllod.html">nvt/sllod/omp</A></TD><TD ><A HREF = "fix_nvt_sphere.html">nvt/sphere/omp</A></TD><TD ><A HREF = "fix_qeq_comb.html">qeq/comb/omp</A></TD></TR>
+<TR ALIGN="center"><TD ><A HREF = "fix_rigid.html">rigid/omp</A></TD><TD ><A HREF = "fix_rigid.html">rigid/nph/omp</A></TD><TD ><A HREF = "fix_rigid.html">rigid/npt/omp</A></TD><TD ><A HREF = "fix_rigid.html">rigid/nve/omp</A></TD><TD ><A HREF = "fix_rigid.html">rigid/nvt/omp</A></TD><TD ><A HREF = "fix_rigid.html">rigid/small/omp</A></TD></TR>
+<TR ALIGN="center"><TD ><A HREF = "fix_setforce.html">setforce/cuda</A></TD><TD ><A HREF = "fix_shake.html">shake/cuda</A></TD><TD ><A HREF = "fix_temp_berendsen.html">temp/berendsen/cuda</A></TD><TD ><A HREF = "fix_temp_rescale.html">temp/rescale/cuda</A></TD><TD ><A HREF = "fix_temp_rescale.html">temp/rescale/limit/cuda</A></TD><TD ><A HREF = "fix_viscous.html">viscous/cuda</A> 
 </TD></TR></TABLE></DIV>

 <HR>
@ -552,25 +552,25 @@ package</A>.
 <TR ALIGN="center"><TD ><A HREF = "pair_lj_soft.html">lj/cut/coul/cut/soft/omp</A></TD><TD ><A HREF = "pair_lj.html">lj/cut/coul/debye/cuda</A></TD><TD ><A HREF = "pair_lj.html">lj/cut/coul/debye/gpu</A></TD><TD ><A HREF = "pair_lj.html">lj/cut/coul/debye/omp</A></TD></TR>
 <TR ALIGN="center"><TD ><A HREF = "pair_lj.html">lj/cut/coul/dsf/gpu</A></TD><TD ><A HREF = "pair_lj.html">lj/cut/coul/dsf/omp</A></TD><TD ><A HREF = "pair_lj.html">lj/cut/coul/long/cuda</A></TD><TD ><A HREF = "pair_lj.html">lj/cut/coul/long/gpu</A></TD></TR>
 <TR ALIGN="center"><TD ><A HREF = "pair_lj.html">lj/cut/coul/long/omp</A></TD><TD ><A HREF = "pair_lj.html">lj/cut/coul/long/opt</A></TD><TD ><A HREF = "pair_lj.html">lj/cut/coul/msm/gpu</A></TD><TD ><A HREF = "pair_lj.html">lj/cut/coul/msm/opt</A></TD></TR>
-<TR ALIGN="center"><TD ><A HREF = "pair_lj_soft.html">lj/cut/coul/long/soft/omp</A></TD><TD ><A HREF = "pair_lj.html">lj/cut/cuda</A></TD><TD ><A HREF = "pair_dipole.html">lj/cut/dipole/cut/gpu</A></TD><TD ><A HREF = "pair_dipole.html">lj/cut/dipole/cut/omp</A></TD></TR>
-<TR ALIGN="center"><TD ><A HREF = "pair_dipole.html">lj/cut/dipole/sf/gpu</A></TD><TD ><A HREF = "pair_dipole.html">lj/cut/dipole/sf/omp</A></TD><TD ><A HREF = "pair_lj.html">lj/cut/experimental/cuda</A></TD><TD ><A HREF = "pair_lj.html">lj/cut/gpu</A></TD></TR>
-<TR ALIGN="center"><TD ><A HREF = "pair_lj.html">lj/cut/omp</A></TD><TD ><A HREF = "pair_lj.html">lj/cut/opt</A></TD><TD ><A HREF = "pair_lj_soft.html">lj/cut/soft/omp</A></TD><TD ><A HREF = "pair_lj.html">lj/cut/tip4p/cut/omp</A></TD></TR>
-<TR ALIGN="center"><TD ><A HREF = "pair_lj.html">lj/cut/tip4p/long/omp</A></TD><TD ><A HREF = "pair_lj.html">lj/cut/tip4p/long/opt</A></TD><TD ><A HREF = "pair_lj_soft.html">lj/cut/tip4p/long/soft/omp</A></TD><TD ><A HREF = "pair_lj_expand.html">lj/expand/cuda</A></TD></TR>
-<TR ALIGN="center"><TD ><A HREF = "pair_lj_expand.html">lj/expand/gpu</A></TD><TD ><A HREF = "pair_lj_expand.html">lj/expand/omp</A></TD><TD ><A HREF = "pair_gromacs.html">lj/gromacs/coul/gromacs/cuda</A></TD><TD ><A HREF = "pair_gromacs.html">lj/gromacs/coul/gromacs/omp</A></TD></TR>
-<TR ALIGN="center"><TD ><A HREF = "pair_gromacs.html">lj/gromacs/cuda</A></TD><TD ><A HREF = "pair_gromacs.html">lj/gromacs/gpu</A></TD><TD ><A HREF = "pair_gromacs.html">lj/gromacs/omp</A></TD><TD ><A HREF = "pair_lj_long.html">lj/long/coul/long/opt</A></TD></TR>
-<TR ALIGN="center"><TD ><A HREF = "pair_sdk.html">lj/sdk/gpu</A></TD><TD ><A HREF = "pair_sdk.html">lj/sdk/omp</A></TD><TD ><A HREF = "pair_sdk.html">lj/sdk/coul/long/gpu</A></TD><TD ><A HREF = "pair_sdk.html">lj/sdk/coul/long/omp</A></TD></TR>
-<TR ALIGN="center"><TD ><A HREF = "pair_sdk.html">lj/sdk/coul/msm/omp</A></TD><TD ><A HREF = "pair_lj_sf.html">lj/sf/omp</A></TD><TD ><A HREF = "pair_lj_smooth.html">lj/smooth/cuda</A></TD><TD ><A HREF = "pair_lj_smooth.html">lj/smooth/omp</A></TD></TR>
-<TR ALIGN="center"><TD ><A HREF = "pair_lj_smooth_linear.html">lj/smooth/linear/omp</A></TD><TD ><A HREF = "pair_lj96.html">lj96/cut/cuda</A></TD><TD ><A HREF = "pair_lj96.html">lj96/cut/gpu</A></TD><TD ><A HREF = "pair_lj96.html">lj96/cut/omp</A></TD></TR>
-<TR ALIGN="center"><TD ><A HREF = "pair_lubricate.html">lubricate/omp</A></TD><TD ><A HREF = "pair_lubricate.html">lubricate/poly/omp</A></TD><TD ><A HREF = "pair_meam_spline.html">meam/spline/omp</A></TD><TD ><A HREF = "pair_mie.html">mie/cut/gpu</A></TD></TR>
-<TR ALIGN="center"><TD ><A HREF = "pair_morse.html">morse/cuda</A></TD><TD ><A HREF = "pair_morse.html">morse/gpu</A></TD><TD ><A HREF = "pair_morse.html">morse/omp</A></TD><TD ><A HREF = "pair_morse.html">morse/opt</A></TD></TR>
-<TR ALIGN="center"><TD ><A HREF = "pair_nb3b_harmonic.html">nb3b/harmonic/omp</A></TD><TD ><A HREF = "pair_nm.html">nm/cut/omp</A></TD><TD ><A HREF = "pair_nm.html">nm/cut/coul/cut/omp</A></TD><TD ><A HREF = "pair_nm.html">nm/cut/coul/long/omp</A></TD></TR>
-<TR ALIGN="center"><TD ><A HREF = "pair_peri.html">peri/lps/omp</A></TD><TD ><A HREF = "pair_peri.html">peri/pmb/omp</A></TD><TD ><A HREF = "pair_airebo.html">rebo/omp</A></TD><TD ><A HREF = "pair_resquared.html">resquared/gpu</A></TD></TR>
-<TR ALIGN="center"><TD ><A HREF = "pair_resquared.html">resquared/omp</A></TD><TD ><A HREF = "pair_soft.html">soft/gpu</A></TD><TD ><A HREF = "pair_soft.html">soft/omp</A></TD><TD ><A HREF = "pair_sw.html">sw/cuda</A></TD></TR>
-<TR ALIGN="center"><TD ><A HREF = "pair_sw.html">sw/gpu</A></TD><TD ><A HREF = "pair_sw.html">sw/omp</A></TD><TD ><A HREF = "pair_table.html">table/gpu</A></TD><TD ><A HREF = "pair_table.html">table/omp</A></TD></TR>
-<TR ALIGN="center"><TD ><A HREF = "pair_tersoff.html">tersoff/cuda</A></TD><TD ><A HREF = "pair_tersoff.html">tersoff/omp</A></TD><TD ><A HREF = "pair_tersoff_mod.html">tersoff/mod/omp</A></TD><TD ><A HREF = "pair_tersoff.html">tersoff/table/omp</A></TD></TR>
-<TR ALIGN="center"><TD ><A HREF = "pair_tersoff_zbl.html">tersoff/zbl/omp</A></TD><TD ><A HREF = "pair_coul.html">tip4p/cut/omp</A></TD><TD ><A HREF = "pair_coul.html">tip4p/long/omp</A></TD><TD ><A HREF = "pair_lj_soft.html">tip4p/long/soft/omp</A></TD></TR>
-<TR ALIGN="center"><TD ><A HREF = "pair_tri_lj.html">tri/lj/omp</A></TD><TD ><A HREF = "pair_yukawa.html">yukawa/gpu</A></TD><TD ><A HREF = "pair_yukawa.html">yukawa/omp</A></TD><TD ><A HREF = "pair_yukawa_colloid.html">yukawa/colloid/gpu</A></TD></TR>
-<TR ALIGN="center"><TD ><A HREF = "pair_yukawa_colloid.html">yukawa/colloid/omp</A></TD><TD ><A HREF = "pair_zbl.html">zbl/omp</A> 
+<TR ALIGN="center"><TD ><A HREF = "pair_lj_soft.html">lj/cut/coul/long/soft/omp</A></TD><TD ><A HREF = "pair_lj.html">lj/cut/cuda</A></TD><TD ><A HREF = "pair_lj.html">lj/cut/kk</A></TD><TD ><A HREF = "pair_dipole.html">lj/cut/dipole/cut/gpu</A></TD></TR>
+<TR ALIGN="center"><TD ><A HREF = "pair_dipole.html">lj/cut/dipole/cut/omp</A></TD><TD ><A HREF = "pair_dipole.html">lj/cut/dipole/sf/gpu</A></TD><TD ><A HREF = "pair_dipole.html">lj/cut/dipole/sf/omp</A></TD><TD ><A HREF = "pair_lj.html">lj/cut/experimental/cuda</A></TD></TR>
+<TR ALIGN="center"><TD ><A HREF = "pair_lj.html">lj/cut/gpu</A></TD><TD ><A HREF = "pair_lj.html">lj/cut/omp</A></TD><TD ><A HREF = "pair_lj.html">lj/cut/opt</A></TD><TD ><A HREF = "pair_lj_soft.html">lj/cut/soft/omp</A></TD></TR>
+<TR ALIGN="center"><TD ><A HREF = "pair_lj.html">lj/cut/tip4p/cut/omp</A></TD><TD ><A HREF = "pair_lj.html">lj/cut/tip4p/long/omp</A></TD><TD ><A HREF = "pair_lj.html">lj/cut/tip4p/long/opt</A></TD><TD ><A HREF = "pair_lj_soft.html">lj/cut/tip4p/long/soft/omp</A></TD></TR>
+<TR ALIGN="center"><TD ><A HREF = "pair_lj_expand.html">lj/expand/cuda</A></TD><TD ><A HREF = "pair_lj_expand.html">lj/expand/gpu</A></TD><TD ><A HREF = "pair_lj_expand.html">lj/expand/omp</A></TD><TD ><A HREF = "pair_gromacs.html">lj/gromacs/coul/gromacs/cuda</A></TD></TR>
+<TR ALIGN="center"><TD ><A HREF = "pair_gromacs.html">lj/gromacs/coul/gromacs/omp</A></TD><TD ><A HREF = "pair_gromacs.html">lj/gromacs/cuda</A></TD><TD ><A HREF = "pair_gromacs.html">lj/gromacs/gpu</A></TD><TD ><A HREF = "pair_gromacs.html">lj/gromacs/omp</A></TD></TR>
+<TR ALIGN="center"><TD ><A HREF = "pair_lj_long.html">lj/long/coul/long/opt</A></TD><TD ><A HREF = "pair_sdk.html">lj/sdk/gpu</A></TD><TD ><A HREF = "pair_sdk.html">lj/sdk/omp</A></TD><TD ><A HREF = "pair_sdk.html">lj/sdk/coul/long/gpu</A></TD></TR>
+<TR ALIGN="center"><TD ><A HREF = "pair_sdk.html">lj/sdk/coul/long/omp</A></TD><TD ><A HREF = "pair_sdk.html">lj/sdk/coul/msm/omp</A></TD><TD ><A HREF = "pair_lj_sf.html">lj/sf/omp</A></TD><TD ><A HREF = "pair_lj_smooth.html">lj/smooth/cuda</A></TD></TR>
+<TR ALIGN="center"><TD ><A HREF = "pair_lj_smooth.html">lj/smooth/omp</A></TD><TD ><A HREF = "pair_lj_smooth_linear.html">lj/smooth/linear/omp</A></TD><TD ><A HREF = "pair_lj96.html">lj96/cut/cuda</A></TD><TD ><A HREF = "pair_lj96.html">lj96/cut/gpu</A></TD></TR>
+<TR ALIGN="center"><TD ><A HREF = "pair_lj96.html">lj96/cut/omp</A></TD><TD ><A HREF = "pair_lubricate.html">lubricate/omp</A></TD><TD ><A HREF = "pair_lubricate.html">lubricate/poly/omp</A></TD><TD ><A HREF = "pair_meam_spline.html">meam/spline/omp</A></TD></TR>
+<TR ALIGN="center"><TD ><A HREF = "pair_mie.html">mie/cut/gpu</A></TD><TD ><A HREF = "pair_morse.html">morse/cuda</A></TD><TD ><A HREF = "pair_morse.html">morse/gpu</A></TD><TD ><A HREF = "pair_morse.html">morse/omp</A></TD></TR>
+<TR ALIGN="center"><TD ><A HREF = "pair_morse.html">morse/opt</A></TD><TD ><A HREF = "pair_nb3b_harmonic.html">nb3b/harmonic/omp</A></TD><TD ><A HREF = "pair_nm.html">nm/cut/omp</A></TD><TD ><A HREF = "pair_nm.html">nm/cut/coul/cut/omp</A></TD></TR>
+<TR ALIGN="center"><TD ><A HREF = "pair_nm.html">nm/cut/coul/long/omp</A></TD><TD ><A HREF = "pair_peri.html">peri/lps/omp</A></TD><TD ><A HREF = "pair_peri.html">peri/pmb/omp</A></TD><TD ><A HREF = "pair_airebo.html">rebo/omp</A></TD></TR>
+<TR ALIGN="center"><TD ><A HREF = "pair_resquared.html">resquared/gpu</A></TD><TD ><A HREF = "pair_resquared.html">resquared/omp</A></TD><TD ><A HREF = "pair_soft.html">soft/gpu</A></TD><TD ><A HREF = "pair_soft.html">soft/omp</A></TD></TR>
+<TR ALIGN="center"><TD ><A HREF = "pair_sw.html">sw/cuda</A></TD><TD ><A HREF = "pair_sw.html">sw/gpu</A></TD><TD ><A HREF = "pair_sw.html">sw/omp</A></TD><TD ><A HREF = "pair_table.html">table/gpu</A></TD></TR>
+<TR ALIGN="center"><TD ><A HREF = "pair_table.html">table/kk</A></TD><TD ><A HREF = "pair_table.html">table/omp</A></TD><TD ><A HREF = "pair_tersoff.html">tersoff/cuda</A></TD><TD ><A HREF = "pair_tersoff.html">tersoff/omp</A></TD></TR>
+<TR ALIGN="center"><TD ><A HREF = "pair_tersoff_mod.html">tersoff/mod/omp</A></TD><TD ><A HREF = "pair_tersoff.html">tersoff/table/omp</A></TD><TD ><A HREF = "pair_tersoff_zbl.html">tersoff/zbl/omp</A></TD><TD ><A HREF = "pair_coul.html">tip4p/cut/omp</A></TD></TR>
+<TR ALIGN="center"><TD ><A HREF = "pair_coul.html">tip4p/long/omp</A></TD><TD ><A HREF = "pair_lj_soft.html">tip4p/long/soft/omp</A></TD><TD ><A HREF = "pair_tri_lj.html">tri/lj/omp</A></TD><TD ><A HREF = "pair_yukawa.html">yukawa/gpu</A></TD></TR>
+<TR ALIGN="center"><TD ><A HREF = "pair_yukawa.html">yukawa/omp</A></TD><TD ><A HREF = "pair_yukawa_colloid.html">yukawa/colloid/gpu</A></TD><TD ><A HREF = "pair_yukawa_colloid.html">yukawa/colloid/omp</A></TD><TD ><A HREF = "pair_zbl.html">zbl/omp</A> 
 </TD></TR></TABLE></DIV>

 <HR>
--- a/doc/Section_commands.txt
+++ b/doc/Section_commands.txt
@ -615,7 +615,8 @@ package"_Section_accelerate.html.
 "npt/omp"_fix_nh.html,
 "npt/asphere/omp"_fix_npt_asphere.html,
 "npt/sphere/omp"_fix_npt_sphere.html,
-"nve/cuda"_fix_nh.html,
+"nve/cuda"_fix_nve.html,
+"nve/kk"_fix_nve.html,
 "nve/omp"_fix_nve.html,
 "nve/sphere/omp"_fix_nve_sphere.html,
 "nvt/cuda"_fix_nh.html,
@ -981,6 +982,7 @@ package"_Section_accelerate.html.
 "lj/cut/coul/msm/opt"_pair_lj.html,
 "lj/cut/coul/long/soft/omp"_pair_lj_soft.html,
 "lj/cut/cuda"_pair_lj.html,
+"lj/cut/kk"_pair_lj.html,
 "lj/cut/dipole/cut/gpu"_pair_dipole.html,
 "lj/cut/dipole/cut/omp"_pair_dipole.html,
 "lj/cut/dipole/sf/gpu"_pair_dipole.html,
@ -1038,6 +1040,7 @@ package"_Section_accelerate.html.
 "sw/gpu"_pair_sw.html,
 "sw/omp"_pair_sw.html,
 "table/gpu"_pair_table.html,
+"table/kk"_pair_table.html,
 "table/omp"_pair_table.html,
 "tersoff/cuda"_pair_tersoff.html,
 "tersoff/omp"_pair_tersoff.html,
--- a/doc/Section_packages.html
+++ b/doc/Section_packages.html
@ -50,15 +50,16 @@ packages, more details are provided.
 <TR ALIGN="center"><TD >COLLOID</TD><TD > colloidal particles</TD><TD > -</TD><TD > <A HREF = "atom_style.html">atom_style colloid</A></TD><TD > colloid</TD><TD > -</TD></TR>
 <TR ALIGN="center"><TD >DIPOLE</TD><TD > point dipole particles</TD><TD > -</TD><TD > <A HREF = "pair_dipole.html">pair_style dipole/cut</A></TD><TD > dipole</TD><TD > -</TD></TR>
 <TR ALIGN="center"><TD >FLD</TD><TD > Fast Lubrication Dynamics</TD><TD > Kumar & Bybee & Higdon (1)</TD><TD > <A HREF = "pair_lubricateU.html">pair_style lubricateU</A></TD><TD > -</TD><TD > -</TD></TR>
-<TR ALIGN="center"><TD >GPU</TD><TD > GPU-enabled potentials</TD><TD > Mike Brown (ORNL)</TD><TD > <A HREF = "Section_accelerate.html#acc_6">Section accelerate</A></TD><TD > gpu</TD><TD > lib/gpu</TD></TR>
+<TR ALIGN="center"><TD >GPU</TD><TD > GPU-enabled styles</TD><TD > Mike Brown (ORNL)</TD><TD > <A HREF = "Section_accelerate.html#acc_6">Section accelerate</A></TD><TD > gpu</TD><TD > lib/gpu</TD></TR>
 <TR ALIGN="center"><TD >GRANULAR</TD><TD > granular systems</TD><TD > -</TD><TD > <A HREF = "Section_howto.html#howto_6">Section_howto</A></TD><TD > pour</TD><TD > -</TD></TR>
 <TR ALIGN="center"><TD >KIM</TD><TD > openKIM potentials</TD><TD > Smirichinski & Elliot & Tadmor (3)</TD><TD > <A HREF = "pair_kim.html">pair_style kim</A></TD><TD > kim</TD><TD > KIM</TD></TR>
+<TR ALIGN="center"><TD >KOKKOS</TD><TD > Kokkos-enabled styles</TD><TD > Trott & Edwards (4)</TD><TD > <A HREF = "Section_accelerate.html#acc_8">Section_accelerate</A></TD><TD > kokkos</TD><TD > lib/kokkos</TD></TR>
 <TR ALIGN="center"><TD >KSPACE</TD><TD > long-range Coulombic solvers</TD><TD > -</TD><TD > <A HREF = "kspace_style.html">kspace_style</A></TD><TD > peptide</TD><TD > -</TD></TR>
 <TR ALIGN="center"><TD >MANYBODY</TD><TD > many-body potentials</TD><TD > -</TD><TD > <A HREF = "pair_tersoff.html">pair_style tersoff</A></TD><TD > shear</TD><TD > -</TD></TR>
 <TR ALIGN="center"><TD >MEAM</TD><TD > modified EAM potential</TD><TD > Greg Wagner (Sandia)</TD><TD > <A HREF = "pair_meam.html">pair_style meam</A></TD><TD > meam</TD><TD > lib/meam</TD></TR>
 <TR ALIGN="center"><TD >MC</TD><TD > Monte Carlo options</TD><TD > -</TD><TD > <A HREF = "fix_gcmc.html">fix gcmc</A></TD><TD > -</TD><TD > -</TD></TR>
 <TR ALIGN="center"><TD >MOLECULE</TD><TD > molecular system force fields</TD><TD > -</TD><TD > <A HREF = "Section_howto.html#howto_3">Section_howto</A></TD><TD > peptide</TD><TD > -</TD></TR>
-<TR ALIGN="center"><TD >OPT</TD><TD > optimized pair potentials</TD><TD > Fischer & Richie & Natoli (2)</TD><TD > <A HREF = "Section_accelerate.html#acc_4">Section accelerate</A></TD><TD > -</TD><TD > -</TD></TR>
+<TR ALIGN="center"><TD >OPT</TD><TD > optimized pair styles</TD><TD > Fischer & Richie & Natoli (2)</TD><TD > <A HREF = "Section_accelerate.html#acc_4">Section accelerate</A></TD><TD > -</TD><TD > -</TD></TR>
 <TR ALIGN="center"><TD >PERI</TD><TD > Peridynamics models</TD><TD > Mike Parks (Sandia)</TD><TD > <A HREF = "pair_peri.html">pair_style peri</A></TD><TD > peri</TD><TD > -</TD></TR>
 <TR ALIGN="center"><TD >POEMS</TD><TD > coupled rigid body motion</TD><TD > Rudra Mukherjee (JPL)</TD><TD > <A HREF = "fix_poems.html">fix poems</A></TD><TD > rigid</TD><TD > lib/poems</TD></TR>
 <TR ALIGN="center"><TD >REAX</TD><TD > ReaxFF potential</TD><TD > Aidan Thompson (Sandia)</TD><TD > <A HREF = "pair_reax.html">pair_style reax</A></TD><TD > reax</TD><TD >  lib/reax</TD></TR>
@ -84,6 +85,10 @@ Technolgy).
 <P>(3) The KIM package was created by Valeriu Smirichinski, Ryan Elliott,
 and Ellad Tadmor (U Minn).
 </P>
+<P>(4) The KOKKOS package was created primarily by Christian Trott
+(Sandia).  It uses the Kokkos library which was developed by Carter
+Edwards, Christian, and collaborators at Sandia.
+</P>
 <P>The "Doc page" column links to either a portion of the
 <A HREF = "Section_howto.html">Section_howto</A> of the manual, or an input script
 command implemented as part of the package.
--- a/doc/Section_packages.txt
+++ b/doc/Section_packages.txt
@ -45,15 +45,16 @@ CLASS2, class 2 force fields, -, "pair_style lj/class2"_pair_class2.html, -, -
 COLLOID, colloidal particles, -, "atom_style colloid"_atom_style.html, colloid, -
 DIPOLE, point dipole particles, -, "pair_style dipole/cut"_pair_dipole.html, dipole, -
 FLD, Fast Lubrication Dynamics, Kumar & Bybee & Higdon (1), "pair_style lubricateU"_pair_lubricateU.html, -, -
-GPU, GPU-enabled potentials, Mike Brown (ORNL), "Section accelerate"_Section_accelerate.html#acc_6, gpu, lib/gpu
+GPU, GPU-enabled styles, Mike Brown (ORNL), "Section accelerate"_Section_accelerate.html#acc_6, gpu, lib/gpu
 GRANULAR, granular systems, -, "Section_howto"_Section_howto.html#howto_6, pour, -
 KIM, openKIM potentials, Smirichinski & Elliot & Tadmor (3), "pair_style kim"_pair_kim.html, kim, KIM
+KOKKOS, Kokkos-enabled styles, Trott & Edwards (4), "Section_accelerate"_Section_accelerate.html#acc_8, kokkos, lib/kokkos
 KSPACE, long-range Coulombic solvers, -, "kspace_style"_kspace_style.html, peptide, -
 MANYBODY, many-body potentials, -, "pair_style tersoff"_pair_tersoff.html, shear, -
 MEAM, modified EAM potential, Greg Wagner (Sandia), "pair_style meam"_pair_meam.html, meam, lib/meam
 MC, Monte Carlo options, -, "fix gcmc"_fix_gcmc.html, -, -
 MOLECULE, molecular system force fields, -, "Section_howto"_Section_howto.html#howto_3, peptide, -
-OPT, optimized pair potentials, Fischer & Richie & Natoli (2), "Section accelerate"_Section_accelerate.html#acc_4, -, -
+OPT, optimized pair styles, Fischer & Richie & Natoli (2), "Section accelerate"_Section_accelerate.html#acc_4, -, -
 PERI, Peridynamics models, Mike Parks (Sandia), "pair_style peri"_pair_peri.html, peri, -
 POEMS, coupled rigid body motion, Rudra Mukherjee (JPL), "fix poems"_fix_poems.html, rigid, lib/poems
 REAX, ReaxFF potential, Aidan Thompson (Sandia), "pair_style reax"_pair_reax.html, reax,  lib/reax
@ -78,6 +79,10 @@ Technolgy).
 (3) The KIM package was created by Valeriu Smirichinski, Ryan Elliott,
 and Ellad Tadmor (U Minn).

+(4) The KOKKOS package was created primarily by Christian Trott
+(Sandia).  It uses the Kokkos library which was developed by Carter
+Edwards, Christian, and collaborators at Sandia.
+
 The "Doc page" column links to either a portion of the
 "Section_howto"_Section_howto.html of the manual, or an input script
 command implemented as part of the package.
--- a/doc/Section_start.html
+++ b/doc/Section_start.html
@ -555,7 +555,7 @@ on both a basic build and a customized build with pacakges you select.
 <UL><LI><A HREF = "#start_3_1">Package basics</A>
 <LI><A HREF = "#start_3_2">Including/excluding packages</A>
 <LI><A HREF = "#start_3_3">Packages that require extra libraries</A>
-<LI><A HREF = "#start_3_4">Additional Makefile settings for extra libraries</A> 
+<LI><A HREF = "#start_3_4">Packages that use make variable settings</A> 
 </UL>
 <HR>

@ -688,7 +688,7 @@ for a list of packages that have auxiliary libraries.
 </P>
 <P>Code for some of these auxiliary libraries is included in the LAMMPS
 distribution under the lib directory.  Examples are the USER-ATC and
-MEAM packages.  Some auxiliary libraries are not included with LAMMPS;
+MEAM packages.  Some auxiliary libraries are NOT included with LAMMPS;
 to use the associated package you must download and install the
 auxiliary library yourself.  Examples are the KIM and VORONOI and
 USER-MOLFILE packages.
@ -699,31 +699,33 @@ that library.  Typically this is done by typing something like:
 </P>
 <PRE>make -f Makefile.g++ 
 </PRE>
-<P>If one of the provided Makefiles is not
-appropriate for your system you will need to edit or add one.
-Note that all the Makefiles have a setting for EXTRAMAKE at
-the top that names a Makefile.lammps.* file.
+<P>If one of the provided Makefiles is not appropriate for your system
+you will need to edit or add one.  Note that all the Makefiles have a
+setting for EXTRAMAKE at the top that specifies a Makefile.lammps.*
+file.
 </P>
-<P>If successful, this will produce 2 files in the lib directory:
+<P>If the library build is successful, it will produce 2 files in the lib
+directory:
 </P>
 <PRE>libpackage.a
 Makefile.lammps 
 </PRE>
-<P>The Makefile.lammps file is a copy of the EXTRAMAKE file specified
-in the Makefile you used.
+<P>The Makefile.lammps file will be a copy of the EXTRAMAKE file setting
+specified in the library Makefile.* you used.
 </P>
-<P>You MUST insure that the settings in Makefile.lammps are appropriate
-for your system.  If they are not, the LAMMPS build will fail.
+<P>Note that you must insure that the settings in Makefile.lammps are
+appropriate for your system.  If they are not, the LAMMPS build will
+fail.
 </P>
-<P>As explained in the lib/package/README files, they are used to specify
-additional system libraries and their locations so that LAMMPS can
-build with the auxiliary library.  For example, if the MEAM or REAX
-packages are used, the auxiliary libraries consist of F90 code, build
-with a F90 complier.  To link that library with LAMMPS (a C++ code)
-via whatever C++ compiler LAMMPS is built with, typically requires
-additional Fortran-to-C libraries be included in the link.  Another
-example are the BLAS and LAPACK libraries needed to use the USER-ATC
-or USER-AWPMD packages.
+<P>As explained in the lib/package/README files, the settings in
+Makefile.lammps are used to specify additional system libraries and
+their locations so that LAMMPS can build with the auxiliary library.
+For example, if the MEAM or REAX packages are used, the auxiliary
+libraries consist of F90 code, built with a Fortran compiler.  To link
+that library with LAMMPS (a C++ code) via whatever C++ compiler LAMMPS
+is built with, typically requires additional Fortran-to-C libraries be
+included in the link.  Another example are the BLAS and LAPACK
+libraries needed to use the USER-ATC or USER-AWPMD packages.
 </P>
 <P>For libraries without provided source code, see the
 src/package/Makefile.lammps file for information on where to find the
@ -731,10 +733,105 @@ library and how to build it.  E.g. the file src/KIM/Makefile.lammps or
 src/VORONOI/Makefile.lammps or src/UESR-MOLFILE/Makefile.lammps.
 These files serve the same purpose as the lib/package/Makefile.lammps
 files described above.  The files have settings needed when LAMMPS is
-built to link with the corresponding auxiliary library.  Again, you
-MUST insure that the settings in src/package/Makefile.lammps are
-appropriate for your system and where you installed the auxiliary
-library.  If they are not, the LAMMPS build will fail.
+built to link with the corresponding auxiliary library.
+</P>
+<P>Again, you must insure that the settings in
+src/package/Makefile.lammps are appropriate for your system and where
+you installed the auxiliary library.  If they are not, the LAMMPS
+build will fail.
+</P>
+<HR>
+
+<A NAME = "start_3_4"></A><B><I>Packages that use make variable settings</I></B> 
+
+<P>One package, the KOKKOS package, allows its build options to be
+specified by setting variables via the "make" command, rather than by
+first building an auxiliary library and editing a Makefile.lammps
+file, as discussed in the previous sub-section for other packages.
+This is for convenience since it is common to want to experiment with
+different Kokkos library options.  Using variables enables a direct
+re-build of LAMMPS and its Kokkos dependencies, so that a benchmark
+test with different Kokkos options can be quickly performed.
+</P>
+<P>The syntax for setting make variables is as follows.  You must
+use a GNU-compatible make command for this to work.  Try "gmake"
+if your system's standard make complains.
+</P>
+<PRE>make yes-kokkos
+make g++ VAR1=value VAR2=value ... 
+</PRE>
+<P>The first line installs the KOKKOS package, which only needs to be
+done once.  The second line builds LAMMPS with src/MAKE/Makefile.g++
+and optionally sets one or more variables that affect the build.  Each
+variable is specified in upper-case; its value follows an equal sign
+with no spaces.  The second line can be repeated with different
+variable settings, though a "clean" must be done before the rebuild.
+Type "make clean" to see options for this operation.
+</P>
+<P>These are the variables that can be specified.  Each takes a value of
+<I>yes</I> or <I>no</I>.  The default value is listed, which is set in the
+lib/kokkos/Makefile.lammps file.  See <A HREF = "Section_accelerate.html#acc_8">this
+section</A> for a discussion of what is
+meant by "host" and "device" in the Kokkos context.
+</P>
+<UL><LI>OMP, default = <I>yes</I>
+<LI>CUDA, default = <I>no</I>
+<LI>HWLOC, default = <I>no</I>
+<LI>AVX, default = <I>no</I>
+<LI>MIC, default = <I>no</I>
+<LI>LIBRT, default = <I>no</I>
+<LI>DEBUG, default = <I>no</I> 
+</UL>
+<P>OMP sets the parallelization method used for Kokkos code (within
+LAMMPS) that runs on the host.  OMP=yes means that OpenMP will be
+used.  OMP=no means that pthreads will be used.
+</P>
+<P>CUDA sets the parallelization method used for Kokkos code (within
+LAMMPS) that runs on the device.  CUDA=yes means an NVIDIA GPU running
+CUDA will be used.  CUDA=no means that the OMP=yes or OMP=no setting
+will be used for the device as well as the host.
+</P>
+<P>If CUDA=yes, then the low-level Makefile in the src/MAKE directory must
+use "nvcc" as its compiler, via its CC setting.  For best performance
+its CCFLAGS setting should use -O3 and have an -arch setting that
+matches the compute capability of your NVIDIA hardware and software
+installation, e.g. -arch=sm_20.  Generally Fermi Generation GPUs are
+sm_20, while Kepler generation GPUs are sm_30 or sm_35 and Maxwell
+cards are sm_50.  A complete list can be found on
+<A HREF = "http://en.wikipedia.org/wiki/CUDA#Supported_GPUs">wikipedia</A>. You can
+also use the deviceQuery tool that comes with the CUDA samples.  Note
+the minimal required compute capability is 2.0, but this will give
+significantly reduced performance compared to Kepler generation GPUs
+with compute capability 3.x.  For the LINK setting, "nvcc" should not
+be used; instead use g++ or another compiler suitable for linking C++
+applications.  Often you will want to use your MPI compiler wrapper
+for this setting (i.e. mpicxx).  Finally, the low-level Makefile must
+also have a "Compilation rule" for creating *.o files from *.cu files.
+See src/MAKE/Makefile.cuda for an example of a low-level Makefile with all
+of these settings.
+</P>
+<P>HWLOC binds threads to hardware cores, so they do not migrate during a
+simulation.  HWLOC=yes should always be used if running with OMP=no
+for pthreads.  It is not necessary for OMP=yes for OpenMP, because
+OpenMP provides alternative methods via environment variables for
+binding threads to hardware cores.  More info on binding threads to
+cores is given in <A HREF = "Section_accelerate.html#acc_8">this section</A>.
+</P>
+<P>AVX enables Intel advanced vector extensions when compiling for an
+Intel-compatible chip.  AVX=yes should only be set if your host
+hardware supports AVX.  If it does not support it, this will cause a
+run-time crash.
+</P>
+<P>MIC enables compiler switches needed when compiling for an Intel Phi
+processor.
+</P>
+<P>LIBRT enables use of a more accurate timer mechanism on most Unix
+platforms.  This library is not available on all platforms.
+</P>
+<P>DEBUG is only useful when developing a Kokkos-enabled style within
+LAMMPS.  DEBUG=yes enables printing of run-time debugging information
+that can be useful.  It also enables runtime bounds checking on Kokkos
+data structures.
 </P>
 <HR>

@ -1044,6 +1141,7 @@ letter abbreviation can be used:
 <LI>-e or -echo
 <LI>-i or -in
 <LI>-h or -help
+<LI>-k or -kokkos
 <LI>-l or -log
 <LI>-nc or -nocite
 <LI>-p or -partition
@ -1104,6 +1202,93 @@ want to use was included via the appropriate package at compile time.
 LAMMPS will print the info and immediately exit if this switch is
 used.
 </P>
+<PRE>-kokkos on/off keyword/value ... 
+</PRE>
+<P>Explicitly enable or disable Kokkos support, as provided by the KOKKOS
+package.  If LAMMPS is built with this package, as described above in
+<A HREF = "#start_3">Section 2.3</A>, then by default LAMMPS will run in Kokkos
+mode.  If this switch is set to "off", then it will not, even if it
+was built with the KOKKOS package, which means you can run standard
+LAMMPS styles or use styles enhanced by other acceleration packages,
+such as the GPU or USER-CUDA or USER-OMP packages, for testing or
+benchmarking purposes.  The only reason to set the switch to "on", is
+to check if LAMMPS was built with the KOKKOS package, since an error
+will be generated if it was not.
+</P>
+<P>Additional optional keyword/value pairs can be specified which
+determine how Kokkos will use the underlying hardware on your
+platform.  These settings apply to each MPI task you launch via the
+"mpirun" or "mpiexec" command.  You may choose to run one or more MPI
+tasks per physical node.  Note that if you are running on a desktop
+machine, you typically have one physical node.  On a cluster or
+supercomputer there may be dozens or 1000s of physical nodes.
+</P>
+<P>Either the full word or an abbreviation can be used for the keywords.
+Note that the keywords do not use a leading minus sign.  I.e. the
+keyword is "t", not "-t".  Also note that each of the keywords has a
+default setting.  More explanation as to when to use these options and
+what settings to use on different platforms is given in <A HREF = "Section_accelerate.html#acc_8">this
+section</A>.
+</P>
+<UL><LI>d or device
+<LI>g or gpus
+<LI>t or threads
+<LI>n or numa 
+</UL>
+<PRE>device Nd 
+</PRE>
+<P>This option is only relevant if you built LAMMPS with CUDA=yes, you
+have more than one GPU per node, and if you are running with only one
+MPI task per node.  The Nd setting is the ID of the GPU on the node to
+run on.  By default Nd = 0.  If you have multiple GPUs per node, they
+have consecutive IDs numbered as 0,1,2,etc.  This setting allows you
+to launch multiple independent jobs on the node, each with a single
+MPI task per node, and assign each job to run on a different GPU.
+</P>
+<PRE>gpus Ng Ns 
+</PRE>
+<P>This option is only relevant if you built LAMMPS with CUDA=yes, you
+have more than one GPU per node, and you are running with multiple MPI
+tasks per node (up to one per GPU).  The Ng setting is how many GPUs
+you will use.  The Ns setting is optional.  If set, it is the ID of a
+GPU to skip when assigning MPI tasks to GPUs.  This may be useful if
+your desktop system reserves one GPU to drive the screen and the rest
+are intended for computational work like running LAMMPS.  By default
+Ng = 1 and Ns is not set.
+</P>
+<P>Depending on which flavor of MPI you are running, LAMMPS will look for
+one of these 3 environment variables
+</P>
+<PRE>SLURM_LOCALID (various MPI variants compiled with SLURM support)
+MV2_COMM_WORLD_LOCAL_RANK (Mvapich)
+OMPI_COMM_WORLD_LOCAL_RANK (OpenMPI) 
+</PRE>
+<P>which are initialized by the "srun", "mpirun" or "mpiexec" commands.
+The environment variable setting for each MPI rank is used to assign a
+unique GPU ID to the MPI task.
+</P>
+<PRE>threads Nt 
+</PRE>
+<P>This option assigns Nt number of threads to each MPI task for
+performing work when Kokkos is executing in OpenMP or pthreads mode.
+The default is Nt = 1, which essentially runs in MPI-only mode.  If
+there are Np MPI tasks per physical node, you generally want Np*Nt =
+the number of physical cores per node, to use your available hardware
+optimally.  This also sets the number of threads used by the host when
+LAMMPS is compiled with CUDA=yes.
+</P>
+<PRE>numa Nm 
+</PRE>
+<P>This option is only relevant when using pthreads with hwloc support.
+In this case Nm defines the number of NUMA regions (typically sockets)
+on a node which will be utilized by a single MPI rank.  By default Nm
+= 1.  If this option is used the total number of worker-threads per
+MPI rank is threads*numa.  Currently it is almost always better to
+assign at least one MPI rank per NUMA region, and leave numa set to
+its default value of 1. This is because letting a single process span
+multiple NUMA regions induces a significant amount of cross NUMA data
+traffic which is slow.
+</P>
 <PRE>-log file 
 </PRE>
 <P>Specify a log file for LAMMPS to write status information to.  In
@ -1277,23 +1462,24 @@ multi-partition mode, if the specified file is "none", then no screen
 output is performed. Option -pscreen will override the name of the
 partition screen files file.N.
 </P>
-<PRE>-suffix style 
+<PRE>-suffix style args 
 </PRE>
 <P>Use variants of various styles if they exist.  The specified style can
-be <I>opt</I>, <I>omp</I>, <I>gpu</I>, or <I>cuda</I>.  These refer to optional packages that
-LAMMPS can be built with, as described above in <A HREF = "#start_3">Section
-2.3</A>.  The "opt" style corrsponds to the OPT package, the
-"omp" style to the USER-OMP package, the "gpu" style to the GPU 
-package, and the "cuda" style to the USER-CUDA package.
+be <I>cuda</I>, <I>gpu</I>, <I>kk</I>, <I>omp</I>, or <I>opt</I>.  These refer to optional
+packages that LAMMPS can be built with, as described above in <A HREF = "#start_3">Section
+2.3</A>.  The "cuda" style corresponds to the USER-CUDA package,
+the "gpu" style to the GPU package, the "kk" style to the KOKKOS
+package, the "opt" style to the OPT package, and the "omp" style to
+the USER-OMP package.
 </P>
 <P>As an example, all of the packages provide a <A HREF = "pair_lj.html">pair_style
-lj/cut</A> variant, with style names lj/cut/opt, lj/cut/omp,
-lj/cut/gpu, or lj/cut/cuda.  A variant styles can be specified
-explicitly in your input script, e.g. pair_style lj/cut/gpu.  If the
-suffix switch is used, you do not need to modify your input script.
-The specified suffix (opt,omp,gpu,cuda) is automatically appended
-whenever your input script command creates a new
-<A HREF = "atom_style.html">atom</A>, <A HREF = "pair_style.html">pair</A>, <A HREF = "fix.html">fix</A>,
+lj/cut</A> variant, with style names lj/cut/cuda,
+lj/cut/gpu, lj/cut/kk, lj/cut/omp, or lj/cut/opt.  A variant style
+can be specified explicitly in your input script, e.g. pair_style
+lj/cut/gpu.  If the -suffix switch is used, you do not need to modify
+your input script.  The specified suffix (cuda,gpu,kk,omp,opt) is
+automatically appended whenever your input script command creates a
+new <A HREF = "atom_style.html">atom</A>, <A HREF = "pair_style.html">pair</A>, <A HREF = "fix.html">fix</A>,
 <A HREF = "compute.html">compute</A>, or <A HREF = "run_style.html">run</A> style.  If the variant
 version does not exist, the standard version is created.
 </P>
@ -1303,13 +1489,20 @@ default GPU settings, as if the command "package gpu force/neigh 0 0
 changed by using the <A HREF = "package.html">package gpu</A> command in your script
 if desired.
 </P>
+<P>For the KOKKOS package, using this command-line switch also invokes
+the default KOKKOS settings, as if the command "package kokkos neigh
+full comm/exchange host comm/forward host" were used at the top of
+your input script.  These settings can be changed by using the
+<A HREF = "package.html">package kokkos</A> command in your script if desired.
+</P>
 <P>For the OMP package, using this command-line switch also invokes the
 default OMP settings, as if the command "package omp *" were used at
 the top of your input script.  These settings can be changed by using
 the <A HREF = "package.html">package omp</A> command in your script if desired.
 </P>
-<P>The <A HREF = "suffix.html">suffix</A> command can also set a suffix and it can also
-turn off/on any suffix setting made via the command line.
+<P>The <A HREF = "suffix.html">suffix</A> command can also be used to set a suffix and it
+can also turn off or back on any suffix setting made via the command
+line.
 </P>
 <PRE>-var name value1 value2 ... 
 </PRE>
--- a/doc/Section_start.txt
+++ b/doc/Section_start.txt
@ -549,7 +549,7 @@ This section has the following sub-sections:
 "Package basics"_#start_3_1
 "Including/excluding packages"_#start_3_2
 "Packages that require extra libraries"_#start_3_3
-"Additional Makefile settings for extra libraries"_#start_3_4 :ul
+"Packages that use make variable settings"_#start_3_4 :ul

 :line

@ -682,7 +682,7 @@ for a list of packages that have auxiliary libraries.

 Code for some of these auxiliary libraries is included in the LAMMPS
 distribution under the lib directory.  Examples are the USER-ATC and
-MEAM packages.  Some auxiliary libraries are not included with LAMMPS;
+MEAM packages.  Some auxiliary libraries are NOT included with LAMMPS;
 to use the associated package you must download and install the
 auxiliary library yourself.  Examples are the KIM and VORONOI and
 USER-MOLFILE packages.
@ -693,31 +693,33 @@ that library.  Typically this is done by typing something like:

 make -f Makefile.g++ :pre

-If one of the provided Makefiles is not
-appropriate for your system you will need to edit or add one.
-Note that all the Makefiles have a setting for EXTRAMAKE at
-the top that names a Makefile.lammps.* file.
+If one of the provided Makefiles is not appropriate for your system
+you will need to edit or add one.  Note that all the Makefiles have a
+setting for EXTRAMAKE at the top that specifies a Makefile.lammps.*
+file.

-If successful, this will produce 2 files in the lib directory:
+If the library build is successful, it will produce 2 files in the lib
+directory:

 libpackage.a
 Makefile.lammps :pre

-The Makefile.lammps file is a copy of the EXTRAMAKE file specified
-in the Makefile you used.
+The Makefile.lammps file will be a copy of the EXTRAMAKE file setting
+specified in the library Makefile.* you used.

-You MUST insure that the settings in Makefile.lammps are appropriate
-for your system.  If they are not, the LAMMPS build will fail.
+Note that you must insure that the settings in Makefile.lammps are
+appropriate for your system.  If they are not, the LAMMPS build will
+fail.

-As explained in the lib/package/README files, they are used to specify
-additional system libraries and their locations so that LAMMPS can
-build with the auxiliary library.  For example, if the MEAM or REAX
-packages are used, the auxiliary libraries consist of F90 code, build
-with a F90 complier.  To link that library with LAMMPS (a C++ code)
-via whatever C++ compiler LAMMPS is built with, typically requires
-additional Fortran-to-C libraries be included in the link.  Another
-example are the BLAS and LAPACK libraries needed to use the USER-ATC
-or USER-AWPMD packages.
+As explained in the lib/package/README files, the settings in
+Makefile.lammps are used to specify additional system libraries and
+their locations so that LAMMPS can build with the auxiliary library.
+For example, if the MEAM or REAX packages are used, the auxiliary
+libraries consist of F90 code, built with a Fortran compiler.  To link
+that library with LAMMPS (a C++ code) via whatever C++ compiler LAMMPS
+is built with, typically requires additional Fortran-to-C libraries be
+included in the link.  Another example are the BLAS and LAPACK
+libraries needed to use the USER-ATC or USER-AWPMD packages.

 For libraries without provided source code, see the
 src/package/Makefile.lammps file for information on where to find the
@ -725,10 +727,105 @@ library and how to build it.  E.g. the file src/KIM/Makefile.lammps or
 src/VORONOI/Makefile.lammps or src/UESR-MOLFILE/Makefile.lammps.
 These files serve the same purpose as the lib/package/Makefile.lammps
 files described above.  The files have settings needed when LAMMPS is
-built to link with the corresponding auxiliary library.  Again, you
-MUST insure that the settings in src/package/Makefile.lammps are
-appropriate for your system and where you installed the auxiliary
-library.  If they are not, the LAMMPS build will fail.
+built to link with the corresponding auxiliary library.
+
+Again, you must insure that the settings in
+src/package/Makefile.lammps are appropriate for your system and where
+you installed the auxiliary library.  If they are not, the LAMMPS
+build will fail.
+
+:line
+
+[{Packages that use make variable settings}] :link(start_3_4)
+
+One package, the KOKKOS package, allows its build options to be
+specified by setting variables via the "make" command, rather than by
+first building an auxiliary library and editing a Makefile.lammps
+file, as discussed in the previous sub-section for other packages.
+This is for convenience since it is common to want to experiment with
+different Kokkos library options.  Using variables enables a direct
+re-build of LAMMPS and its Kokkos dependencies, so that a benchmark
+test with different Kokkos options can be quickly performed.
+
+The syntax for setting make variables is as follows.  You must
+use a GNU-compatible make command for this to work.  Try "gmake"
+if your system's standard make complains.
+
+make yes-kokkos
+make g++ VAR1=value VAR2=value ... :pre
+
+The first line installs the KOKKOS package, which only needs to be
+done once.  The second line builds LAMMPS with src/MAKE/Makefile.g++
+and optionally sets one or more variables that affect the build.  Each
+variable is specified in upper-case; its value follows an equal sign
+with no spaces.  The second line can be repeated with different
+variable settings, though a "clean" must be done before the rebuild.
+Type "make clean" to see options for this operation.
+
+These are the variables that can be specified.  Each takes a value of
+{yes} or {no}.  The default value is listed, which is set in the
+lib/kokkos/Makefile.lammps file.  See "this
+section"_Section_accelerate.html#acc_8 for a discussion of what is
+meant by "host" and "device" in the Kokkos context.
+
+OMP, default = {yes}
+CUDA, default = {no}
+HWLOC, default = {no}
+AVX, default = {no}
+MIC, default = {no}
+LIBRT, default = {no}
+DEBUG, default = {no} :ul
+
+OMP sets the parallelization method used for Kokkos code (within
+LAMMPS) that runs on the host.  OMP=yes means that OpenMP will be
+used.  OMP=no means that pthreads will be used.
+
+CUDA sets the parallelization method used for Kokkos code (within
+LAMMPS) that runs on the device.  CUDA=yes means an NVIDIA GPU running
+CUDA will be used.  CUDA=no means that the OMP=yes or OMP=no setting
+will be used for the device as well as the host.
+
+If CUDA=yes, then the low-level Makefile in the src/MAKE directory must
+use "nvcc" as its compiler, via its CC setting.  For best performance
+its CCFLAGS setting should use -O3 and have an -arch setting that
+matches the compute capability of your NVIDIA hardware and software
+installation, e.g. -arch=sm_20.  Generally Fermi Generation GPUs are
+sm_20, while Kepler generation GPUs are sm_30 or sm_35 and Maxwell
+cards are sm_50.  A complete list can be found on
+"wikipedia"_http://en.wikipedia.org/wiki/CUDA#Supported_GPUs. You can
+also use the deviceQuery tool that comes with the CUDA samples.  Note
+the minimal required compute capability is 2.0, but this will give
+significantly reduced performance compared to Kepler generation GPUs
+with compute capability 3.x.  For the LINK setting, "nvcc" should not
+be used; instead use g++ or another compiler suitable for linking C++
+applications.  Often you will want to use your MPI compiler wrapper
+for this setting (i.e. mpicxx).  Finally, the low-level Makefile must
+also have a "Compilation rule" for creating *.o files from *.cu files.
+See src/MAKE/Makefile.cuda for an example of a low-level Makefile with all
+of these settings.
+
+HWLOC binds threads to hardware cores, so they do not migrate during a
+simulation.  HWLOC=yes should always be used if running with OMP=no
+for pthreads.  It is not necessary for OMP=yes for OpenMP, because
+OpenMP provides alternative methods via environment variables for
+binding threads to hardware cores.  More info on binding threads to
+cores is given in "this section"_Section_accelerate.html#acc_8.
+
+AVX enables Intel advanced vector extensions when compiling for an
+Intel-compatible chip.  AVX=yes should only be set if your host
+hardware supports AVX.  If it does not support it, this will cause a
+run-time crash.
+
+MIC enables compiler switches needed when compiling for an Intel Phi
+processor.
+
+LIBRT enables use of a more accurate timer mechanism on most Unix
+platforms.  This library is not available on all platforms.
+
+DEBUG is only useful when developing a Kokkos-enabled style within
+LAMMPS.  DEBUG=yes enables printing of run-time debugging information
+that can be useful.  It also enables runtime bounds checking on Kokkos
+data structures.

 :line

@ -1038,6 +1135,7 @@ letter abbreviation can be used:
 -e or -echo
 -i or -in
 -h or -help
+-k or -kokkos
 -l or -log
 -nc or -nocite
 -p or -partition
@ -1098,6 +1196,93 @@ want to use was included via the appropriate package at compile time.
 LAMMPS will print the info and immediately exit if this switch is
 used.

+-kokkos on/off keyword/value ... :pre
+
+Explicitly enable or disable Kokkos support, as provided by the KOKKOS
+package.  If LAMMPS is built with this package, as described above in
+"Section 2.3"_#start_3, then by default LAMMPS will run in Kokkos
+mode.  If this switch is set to "off", then it will not, even if it
+was built with the KOKKOS package, which means you can run standard
+LAMMPS styles or use styles enhanced by other acceleration packages,
+such as the GPU or USER-CUDA or USER-OMP packages, for testing or
+benchmarking purposes.  The only reason to set the switch to "on", is
+to check if LAMMPS was built with the KOKKOS package, since an error
+will be generated if it was not.
+
+Additional optional keyword/value pairs can be specified which
+determine how Kokkos will use the underlying hardware on your
+platform.  These settings apply to each MPI task you launch via the
+"mpirun" or "mpiexec" command.  You may choose to run one or more MPI
+tasks per physical node.  Note that if you are running on a desktop
+machine, you typically have one physical node.  On a cluster or
+supercomputer there may be dozens or 1000s of physical nodes.
+
+Either the full word or an abbreviation can be used for the keywords.
+Note that the keywords do not use a leading minus sign.  I.e. the
+keyword is "t", not "-t".  Also note that each of the keywords has a
+default setting.  More explanation as to when to use these options and
+what settings to use on different platforms is given in "this
+section"_Section_accelerate.html#acc_8.
+
+d or device
+g or gpus
+t or threads
+n or numa :ul
+
+device Nd :pre
+
+This option is only relevant if you built LAMMPS with CUDA=yes, you
+have more than one GPU per node, and if you are running with only one
+MPI task per node.  The Nd setting is the ID of the GPU on the node to
+run on.  By default Nd = 0.  If you have multiple GPUs per node, they
+have consecutive IDs numbered as 0,1,2,etc.  This setting allows you
+to launch multiple independent jobs on the node, each with a single
+MPI task per node, and assign each job to run on a different GPU.
+
+gpus Ng Ns :pre
+
+This option is only relevant if you built LAMMPS with CUDA=yes, you
+have more than one GPU per node, and you are running with multiple MPI
+tasks per node (up to one per GPU).  The Ng setting is how many GPUs
+you will use.  The Ns setting is optional.  If set, it is the ID of a
+GPU to skip when assigning MPI tasks to GPUs.  This may be useful if
+your desktop system reserves one GPU to drive the screen and the rest
+are intended for computational work like running LAMMPS.  By default
+Ng = 1 and Ns is not set.
+
+Depending on which flavor of MPI you are running, LAMMPS will look for
+one of these 3 environment variables
+
+SLURM_LOCALID (various MPI variants compiled with SLURM support)
+MV2_COMM_WORLD_LOCAL_RANK (Mvapich)
+OMPI_COMM_WORLD_LOCAL_RANK (OpenMPI) :pre
+
+which are initialized by the "srun", "mpirun" or "mpiexec" commands.
+The environment variable setting for each MPI rank is used to assign a
+unique GPU ID to the MPI task.
+
+threads Nt :pre
+
+This option assigns Nt number of threads to each MPI task for
+performing work when Kokkos is executing in OpenMP or pthreads mode.
+The default is Nt = 1, which essentially runs in MPI-only mode.  If
+there are Np MPI tasks per physical node, you generally want Np*Nt =
+the number of physical cores per node, to use your available hardware
+optimally.  This also sets the number of threads used by the host when
+LAMMPS is compiled with CUDA=yes.
+
+numa Nm :pre
+
+This option is only relevant when using pthreads with hwloc support.
+In this case Nm defines the number of NUMA regions (typically sockets)
+on a node which will be utilized by a single MPI rank.  By default Nm
+= 1.  If this option is used the total number of worker-threads per
+MPI rank is threads*numa.  Currently it is almost always better to
+assign at least one MPI rank per NUMA region, and leave numa set to
+its default value of 1. This is because letting a single process span
+multiple NUMA regions induces a significant amount of cross NUMA data
+traffic which is slow.
+
 -log file :pre

 Specify a log file for LAMMPS to write status information to.  In
@ -1271,23 +1456,24 @@ multi-partition mode, if the specified file is "none", then no screen
 output is performed. Option -pscreen will override the name of the
 partition screen files file.N.

-suffix style :pre
+-suffix style args :pre

 Use variants of various styles if they exist.  The specified style can
-be {opt}, {omp}, {gpu}, or {cuda}.  These refer to optional packages that
-LAMMPS can be built with, as described above in "Section
-2.3"_#start_3.  The "opt" style corrsponds to the OPT package, the
-"omp" style to the USER-OMP package, the "gpu" style to the GPU 
-package, and the "cuda" style to the USER-CUDA package.
+be {cuda}, {gpu}, {kk}, {omp}, or {opt}.  These refer to optional
+packages that LAMMPS can be built with, as described above in "Section
+2.3"_#start_3.  The "cuda" style corresponds to the USER-CUDA package,
+the "gpu" style to the GPU package, the "kk" style to the KOKKOS
+package, the "opt" style to the OPT package, and the "omp" style to
+the USER-OMP package.

 As an example, all of the packages provide a "pair_style
-lj/cut"_pair_lj.html variant, with style names lj/cut/opt, lj/cut/omp,
-lj/cut/gpu, or lj/cut/cuda.  A variant styles can be specified
-explicitly in your input script, e.g. pair_style lj/cut/gpu.  If the
-suffix switch is used, you do not need to modify your input script.
-The specified suffix (opt,omp,gpu,cuda) is automatically appended
-whenever your input script command creates a new
-"atom"_atom_style.html, "pair"_pair_style.html, "fix"_fix.html,
+lj/cut"_pair_lj.html variant, with style names lj/cut/cuda,
+lj/cut/gpu, lj/cut/kk, lj/cut/omp, or lj/cut/opt.  A variant style
+can be specified explicitly in your input script, e.g. pair_style
+lj/cut/gpu.  If the -suffix switch is used, you do not need to modify
+your input script.  The specified suffix (cuda,gpu,kk,omp,opt) is
+automatically appended whenever your input script command creates a
+new "atom"_atom_style.html, "pair"_pair_style.html, "fix"_fix.html,
 "compute"_compute.html, or "run"_run_style.html style.  If the variant
 version does not exist, the standard version is created.

@ -1297,13 +1483,20 @@ default GPU settings, as if the command "package gpu force/neigh 0 0
 changed by using the "package gpu"_package.html command in your script
 if desired.

+For the KOKKOS package, using this command-line switch also invokes
+the default KOKKOS settings, as if the command "package kokkos neigh
+full comm/exchange host comm/forward host" were used at the top of
+your input script.  These settings can be changed by using the
+"package kokkos"_package.html command in your script if desired.
+
 For the OMP package, using this command-line switch also invokes the
 default OMP settings, as if the command "package omp *" were used at
 the top of your input script.  These settings can be changed by using
 the "package omp"_package.html command in your script if desired.

-The "suffix"_suffix.html command can also set a suffix and it can also
-turn off/on any suffix setting made via the command line.
+The "suffix"_suffix.html command can also be used to set a suffix and it
+can also turn off or back on any suffix setting made via the command
+line.

 -var name value1 value2 ... :pre

--- a/doc/atom_style.html
+++ b/doc/atom_style.html
@ -26,11 +26,16 @@
    template-ID = ID of molecule template specified in a separate <A HREF = "molecule.html">molecule</A> command
  <I>hybrid</I> args = list of one or more sub-styles, each with their args 
 </PRE>
+<P>accelerated styles (with same args):
+</P>
+<UL><LI>style = <I>angle/cuda</I> or <I>atomic/cuda</I> or <I>atomic/kokkos</I> or <I>charge/cuda</I> or <I>full/cuda</I> 
+</UL>
 <P><B>Examples:</B>
 </P>
 <PRE>atom_style atomic
 atom_style bond
 atom_style full
+atom_style full/cuda
 atom_style body nparticle 2 10
 atom_style hybrid charge bond
 atom_style hybrid charge body nparticle 2 5
@ -200,6 +205,31 @@ per-atom basis.
 <P>LAMMPS can be extended with new atom styles as well as new body
 styles; see <A HREF = "Section_modify.html">this section</A>.
 </P>
+<HR>
+
+<P>Styles with a <I>cuda</I> or <I>kk</I> suffix are functionally the same as the
+corresponding style without the suffix.  They have been optimized to
+run faster, depending on your available hardware, as discussed in
+<A HREF = "Section_accelerate.html">Section_accelerate</A> of the manual.  The
+accelerated styles take the same arguments and should produce the same
+results, except for round-off and precision issues.
+</P>
+<P>Note that other acceleration packages in LAMMPS, specifically the GPU,
+USER-OMP, and OPT packages do not make use of accelerated atom styles.
+</P>
+<P>These accelerated styles are part of the USER-CUDA and KOKKOS packages
+respectively.  They are only enabled if LAMMPS was built with those
+packages.  See the <A HREF = "Section_start.html#start_3">Making LAMMPS</A> section
+for more info.
+</P>
+<P>You can specify the accelerated styles explicitly in your input script
+by including their suffix, or you can use the <A HREF = "Section_start.html#start_7">-suffix command-line
+switch</A> when you invoke LAMMPS, or you can
+use the <A HREF = "suffix.html">suffix</A> command in your input script.
+</P>
+<P>See <A HREF = "Section_accelerate.html">Section_accelerate</A> of the manual for
+more instructions on how to use the accelerated styles effectively.
+</P>
 <P><B>Restrictions:</B>
 </P>
 <P>This command cannot be used after the simulation box is defined by a
--- a/doc/atom_style.txt
+++ b/doc/atom_style.txt
@ -24,11 +24,16 @@ style = {angle} or {atomic} or {body} or {bond} or {charge} or {dipole} or \
    template-ID = ID of molecule template specified in a separate "molecule"_molecule.html command
  {hybrid} args = list of one or more sub-styles, each with their args :pre

+accelerated styles (with same args):
+
+style = {angle/cuda} or {atomic/cuda} or {atomic/kokkos} or {charge/cuda} or {full/cuda} :ul
+
 [Examples:]

 atom_style atomic
 atom_style bond
 atom_style full
+atom_style full/cuda
 atom_style body nparticle 2 10
 atom_style hybrid charge bond
 atom_style hybrid charge body nparticle 2 5
@ -196,6 +201,31 @@ per-atom basis.
 LAMMPS can be extended with new atom styles as well as new body
 styles; see "this section"_Section_modify.html.

+:line
+
+Styles with a {cuda} or {kk} suffix are functionally the same as the
+corresponding style without the suffix.  They have been optimized to
+run faster, depending on your available hardware, as discussed in
+"Section_accelerate"_Section_accelerate.html of the manual.  The
+accelerated styles take the same arguments and should produce the same
+results, except for round-off and precision issues.
+
+Note that other acceleration packages in LAMMPS, specifically the GPU,
+USER-OMP, and OPT packages do not make use of accelerated atom styles.
+
+These accelerated styles are part of the USER-CUDA and KOKKOS packages
+respectively.  They are only enabled if LAMMPS was built with those
+packages.  See the "Making LAMMPS"_Section_start.html#start_3 section
+for more info.
+
+You can specify the accelerated styles explicitly in your input script
+by including their suffix, or you can use the "-suffix command-line
+switch"_Section_start.html#start_7 when you invoke LAMMPS, or you can
+use the "suffix"_suffix.html command in your input script.
+
+See "Section_accelerate"_Section_accelerate.html of the manual for
+more instructions on how to use the accelerated styles effectively.
+
 [Restrictions:]

 This command cannot be used after the simulation box is defined by a
--- a/doc/fix_nve.html
+++ b/doc/fix_nve.html
@ -13,6 +13,8 @@
 </H3>
 <H3>fix nve/cuda command 
 </H3>
+<H3>fix nve/kk command 
+</H3>
 <H3>fix nve/omp command 
 </H3>
 <P><B>Syntax:</B>
@ -35,17 +37,18 @@ ensemble.
 </P>
 <HR>

-<P>Styles with a <I>cuda</I>, <I>gpu</I>, <I>omp</I>, or <I>opt</I> suffix are functionally
-the same as the corresponding style without the suffix.  They have
-been optimized to run faster, depending on your available hardware, as
-discussed in <A HREF = "Section_accelerate.html">Section_accelerate</A> of the
-manual.  The accelerated styles take the same arguments and should
-produce the same results, except for round-off and precision issues.
+<P>Styles with a <I>cuda</I>, <I>gpu</I>, <I>kk</I>, <I>omp</I>, or <I>opt</I> suffix are
+functionally the same as the corresponding style without the suffix.
+They have been optimized to run faster, depending on your available
+hardware, as discussed in <A HREF = "Section_accelerate.html">Section_accelerate</A>
+of the manual.  The accelerated styles take the same arguments and
+should produce the same results, except for round-off and precision
+issues.
 </P>
-<P>These accelerated styles are part of the USER-CUDA, GPU, USER-OMP and OPT
-packages, respectively.  They are only enabled if LAMMPS was built with
-those packages.  See the <A HREF = "Section_start.html#start_3">Making LAMMPS</A>
-section for more info.
+<P>These accelerated styles are part of the USER-CUDA, GPU, KOKKOS,
+USER-OMP and OPT packages, respectively.  They are only enabled if
+LAMMPS was built with those packages.  See the <A HREF = "Section_start.html#start_3">Making
+LAMMPS</A> section for more info.
 </P>
 <P>You can specify the accelerated styles explicitly in your input script
 by including their suffix, or you can use the <A HREF = "Section_start.html#start_7">-suffix command-line
--- a/doc/fix_nve.txt
+++ b/doc/fix_nve.txt
@ -8,6 +8,7 @@

 fix nve command :h3
 fix nve/cuda command :h3
+fix nve/kk command :h3
 fix nve/omp command :h3

 [Syntax:]
@ -30,17 +31,18 @@ ensemble.

 :line

-Styles with a {cuda}, {gpu}, {omp}, or {opt} suffix are functionally
-the same as the corresponding style without the suffix.  They have
-been optimized to run faster, depending on your available hardware, as
-discussed in "Section_accelerate"_Section_accelerate.html of the
-manual.  The accelerated styles take the same arguments and should
-produce the same results, except for round-off and precision issues.
+Styles with a {cuda}, {gpu}, {kk}, {omp}, or {opt} suffix are
+functionally the same as the corresponding style without the suffix.
+They have been optimized to run faster, depending on your available
+hardware, as discussed in "Section_accelerate"_Section_accelerate.html
+of the manual.  The accelerated styles take the same arguments and
+should produce the same results, except for round-off and precision
+issues.

-These accelerated styles are part of the USER-CUDA, GPU, USER-OMP and OPT
-packages, respectively.  They are only enabled if LAMMPS was built with
-those packages.  See the "Making LAMMPS"_Section_start.html#start_3
-section for more info.
+These accelerated styles are part of the USER-CUDA, GPU, KOKKOS,
+USER-OMP and OPT packages, respectively.  They are only enabled if
+LAMMPS was built with those packages.  See the "Making
+LAMMPS"_Section_start.html#start_3 section for more info.

 You can specify the accelerated styles explicitly in your input script
 by including their suffix, or you can use the "-suffix command-line
--- a/doc/fix_rigid.html
+++ b/doc/fix_rigid.html
@ -764,10 +764,27 @@ of the <A HREF = "run.html">run</A> command.  These fixes are not invoked during
 LAMMPS was built with that package.  See the <A HREF = "Section_start.html#start_3">Making
 LAMMPS</A> section for more info.
 </P>
+<P>Assigning a temperature via the <A HREF = "velocity.html">velocity create</A>
+command to a system with <A HREF = "fix_rigid.html">rigid bodies</A> may not have
+the desired outcome for two reasons.  First, the velocity command can
+be invoked before the rigid-body fix is invoked or initialized and the
+number of adjusted degrees of freedom (DOFs) is known.  Thus it is not
+possible to compute the target temperature correctly.  Second, the
+assigned velocities may be partially canceled when constraints are
+first enforced, leading to a different temperature than desired.  A
+workaround for this is to perform a <A HREF = "run.html">run 0</A> command, which
+insures all DOFs are accounted for properly, and then rescale the
+temperature to the desired value before performing a simulation.  For
+example:
+</P>
+<PRE>velocity all create 300.0 12345
+run 0                             # temperature may not be 300K
+velocity all scale 300.0          # now it should be 
+</PRE>
 <P><B>Related commands:</B>
 </P>
 <P><A HREF = "delete_bonds.html">delete_bonds</A>, <A HREF = "neigh_modify.html">neigh_modify</A>
-exclude
+exclude, <A HREF = "fix_shake.html">fix shake</A>
 </P>
 <P><B>Default:</B>
 </P>
--- a/doc/fix_rigid.txt
+++ b/doc/fix_rigid.txt
@ -746,10 +746,27 @@ These fixes are all part of the RIGID package.  It is only enabled if
 LAMMPS was built with that package.  See the "Making
 LAMMPS"_Section_start.html#start_3 section for more info.

+Assigning a temperature via the "velocity create"_velocity.html
+command to a system with "rigid bodies"_fix_rigid.html may not have
+the desired outcome for two reasons.  First, the velocity command can
+be invoked before the rigid-body fix is invoked or initialized and the
+number of adjusted degrees of freedom (DOFs) is known.  Thus it is not
+possible to compute the target temperature correctly.  Second, the
+assigned velocities may be partially canceled when constraints are
+first enforced, leading to a different temperature than desired.  A
+workaround for this is to perform a "run 0"_run.html command, which
+insures all DOFs are accounted for properly, and then rescale the
+temperature to the desired value before performing a simulation.  For
+example:
+
+velocity all create 300.0 12345
+run 0                             # temperature may not be 300K
+velocity all scale 300.0          # now it should be :pre
+
 [Related commands:]

 "delete_bonds"_delete_bonds.html, "neigh_modify"_neigh_modify.html
-exclude
+exclude, "fix shake"_fix_shake.html

 [Default:]

--- a/doc/package.html
+++ b/doc/package.html
@ -15,24 +15,11 @@
 </P>
 <PRE>package style args 
 </PRE>
-<UL><LI>style = <I>gpu</I> or <I>cuda</I> or <I>omp</I> 
+<UL><LI>style = <I>cuda</I> or <I>gpu</I> or <I>kokkos</I> or <I>omp</I> 

 <LI>args = arguments specific to the style 

-<PRE>  <I>gpu</I> args = mode first last split keyword value ...
-    mode = force or force/neigh
-    first = ID of first GPU to be used on each node
-    last = ID of last GPU to be used on each node
-    split = fraction of particles assigned to the GPU
-    zero or more keyword/value pairs may be appended
-    keywords = <I>threads_per_atom</I> or <I>cellsize</I> or <I>device</I>
-      <I>threads_per_atom</I> value = Nthreads
-        Nthreads = # of GPU threads used per atom
-      <I>cellsize</I> value = dist
-        dist = length (distance units) in each dimension for neighbor bins
-      <I>device</I> value = device_type
-        device_type = <I>kepler</I> or <I>fermi</I> or <I>cypress</I> or <I>generic</I>
-  <I>cuda</I> args = keyword value ...
+<PRE>  <I>cuda</I> args = keyword value ...
    one or more keyword/value pairs may be appended
    keywords = <I>gpu/node</I> or <I>gpu/node/special</I> or <I>timing</I> or <I>test</I> or <I>override/bpa</I>
      <I>gpu/node</I> value = N
@ -45,6 +32,25 @@
        id = atom-ID of a test particle
      <I>override/bpa</I> values = flag
        flag = 0 for TpA algorithm, 1 for BpA algorithm 
+  <I>gpu</I> args = mode first last split keyword value ...
+    mode = force or force/neigh
+    first = ID of first GPU to be used on each node
+    last = ID of last GPU to be used on each node
+    split = fraction of particles assigned to the GPU
+    zero or more keyword/value pairs may be appended
+    keywords = <I>threads_per_atom</I> or <I>cellsize</I> or <I>device</I>
+      <I>threads_per_atom</I> value = Nthreads
+        Nthreads = # of GPU threads used per atom
+      <I>cellsize</I> value = dist
+        dist = length (distance units) in each dimension for neighbor bins
+      <I>device</I> value = device_type
+        device_type = <I>kepler</I> or <I>fermi</I> or <I>cypress</I> or <I>generic</I>
+  <I>kokkos</I> args = keyword value ...
+    one or more keyword/value pairs may be appended
+    keywords = <I>neigh</I> or <I>comm/exchange</I> or <I>comm/forward</I>
+      <I>neigh</I> value = <I>full</I> or <I>half/thread</I> or <I>half</I> or <I>n2</I> or <I>full/cluster</I>
+      <I>comm/exchange</I> value = <I>no</I> or <I>host</I> or <I>device</I>
+      <I>comm/forward</I> value = <I>no</I> or <I>host</I> or <I>device</I>
  <I>omp</I> args = Nthreads mode
    Nthreads = # of OpenMP threads to associate with each MPI process
    mode = force or force/neigh (optional) 
@ -59,13 +65,14 @@ package gpu force/neigh 0 0 1.0
 package gpu force/neigh 0 1 -1.0
 package cuda gpu/node/special 2 0 2
 package cuda test 3948
+package kokkos neigh half/thread comm/forward device
 package omp * force/neigh
 package omp 4 force 
 </PRE>
 <P><B>Description:</B>
 </P>
 <P>This command invokes package-specific settings.  Currently the
-following packages use it: GPU, USER-CUDA, and USER-OMP.
+following packages use it: USER-CUDA, GPU, KOKKOS, and USER-OMP.
 </P>
 <P>To use the accelerated GPU and USER-OMP styles, the use of the package
 command is required.  However, as described in the "Defaults" section
@ -74,9 +81,9 @@ options</A> to enable use of these styles,
 then default package settings are enabled.  In that case you only need
 to use the package command if you want to change the defaults.
 </P>
-<P>To use the accelerate USER-CUDA styles, the package command is not
-required as defaults are assigned internally.  You only need to use
-the package command if you want to change the defaults.
+<P>To use the accelerated USER-CUDA and KOKKOS styles, the package
+command is not required as defaults are assigned internally.  You only
+need to use the package command if you want to change the defaults.
 </P>
 <P>See <A HREF = "Section_accelerate.html">Section_accelerate</A> of the manual for
 more details about using these various packages for accelerating
@ -84,6 +91,58 @@ LAMMPS calculations.
 </P>
 <HR>

+<P>The <I>cuda</I> style invokes options associated with the use of the
+USER-CUDA package.  
+</P>
+<P>The <I>gpu/node</I> keyword specifies the number <I>N</I> of GPUs to be used on
+each node.  An MPI process with rank <I>K</I> will use the GPU (K mod N).
+This implies that processes should be assigned with successive ranks
+on each node, which is the default with most (or even all) MPI
+implementations. The default value for <I>N</I> is 2.
+</P>
+<P>The <I>gpu/node/special</I> keyword also specifies the number (N) of GPUs
+to be used on each node, but allows more control over their
+specification.  An MPI process with rank <I>K</I> will use the GPU <I>gpuI</I>
+with l = (K mod N) + 1. This implies that processes should be assigned
+with successive ranks on each node, which is the default with most (or
+even all) MPI implementations.  For example if you have three GPUs on
+a machine, one of which is used for the X-Server (the GPU with the ID
+1) while the others (with IDs 0 and 2) are used for computations you
+would specify:
+</P>
+<PRE>package cuda gpu/node/special 2 0 2 
+</PRE>
+<P>A main purpose of the <I>gpu/node/special</I> option is to allow two (or
+more) simulations to be run on one workstation.  In that case one
+would set the first simulation to use GPU 0 and the second to use GPU
+1. This is not necessary though, if the GPUs are in what is called
+<I>compute exclusive</I> mode.  Using that setting, every process will get
+its own GPU automatically.  This <I>compute exclusive</I> mode can be set
+as root using the <I>nvidia-smi</I> tool which is part of the CUDA
+installation.
+</P>
+<P>Note that if the <I>gpu/node/special</I> keyword is not used, the USER-CUDA
+package sorts existing GPUs on each node according to their number of
+multiprocessors.  This way, compute GPUs will be prioritized over
+X-Server GPUs.
+</P>
+<P>Use of the <I>timing</I> keyword will output detailed timing information
+for various subroutines.
+</P>
+<P>The <I>test</I> keyword will output info for the specified atom at
+several points during each time step.  This is mainly useful for
+debugging purposes.  Note that the simulation will be severely slowed
+down if this option is used.
+</P>
+<P>The <I>override/bpa</I> keyword can be used to specify which mode is used
+for pair-force evaluation.  TpA = one thread per atom; BpA = one block
+per atom.  If this keyword is not used, a short test at the beginning of
+each run will determine which method is more effective (the result of
+this test is part of the LAMMPS output).  Therefore it is usually not
+necessary to use this keyword.
+</P>
+<HR>
+
 <P>The <I>gpu</I> style invokes options associated with the use of the GPU
 package. 
 </P>
@ -157,55 +216,59 @@ device type can be specified when building LAMMPS with the GPU library.
 </P>
 <HR>

-<P>The <I>cuda</I> style invokes options associated with the use of the
-USER-CUDA package.  
+<P>The <I>kokkos</I> style invokes options associated with the use of the
+KOKKOS package.
 </P>
-<P>The <I>gpu/node</I> keyword specifies the number <I>N</I> of GPUs to be used on
-each node.  An MPI process with rank <I>K</I> will use the GPU (K mod N).
-This implies that processes should be assigned with successive ranks
-on each node, which is the default with most (or even all) MPI
-implementations. The default value for <I>N</I> is 2.
+<P>The <I>neigh</I> keyword determines what kinds of neighbor lists are built.
+A value of <I>half</I> uses half-neighbor lists, the same as used by most
+pair styles in LAMMPS.  A value of <I>half/thread</I> uses a threadsafe
+variant of the half-neighbor list.  It should be used instead of
+<I>half</I> when running with threads on a CPU.  A value of <I>full</I> uses a
+full-neighborlist, i.e. f_ij and f_ji are both calculated.  This
+performs twice as much computation as the <I>half</I> option, however that
+can be a win because it is threadsafe and doesn't require atomic
+operations.  A value of <I>full/cluster</I> is an experimental neighbor
+style, where particles interact with all particles within a small
+cluster, if at least one of the cluster's particles is within the
+neighbor cutoff range.  This potentially allows for better
+vectorization on architectures such as the Intel Phi.  It also reduces
+the size of the neighbor list by roughly a factor of the cluster size,
+thus reducing the total memory footprint considerably.
 </P>
-<P>The <I>gpu/node/special</I> keyword also specifies the number (N) of GPUs
-to be used on each node, but allows more control over their
-specification.  An MPI process with rank <I>K</I> will use the GPU <I>gpuI</I>
-with l = (K mod N) + 1. This implies that processes should be assigned
-with successive ranks on each node, which is the default with most (or
-even all) MPI implementations.  For example if you have three GPUs on
-a machine, one of which is used for the X-Server (the GPU with the ID
-1) while the others (with IDs 0 and 2) are used for computations you
-would specify:
+<P>The <I>comm/exchange</I> and <I>comm/forward</I> keywords determine whether the
+host or device performs the packing and unpacking of data when
+communicating information between processors.  "Exchange"
+communication happens only on timesteps that neighbor lists are
+rebuilt.  The data is only for atoms that migrate to new processors.
+"Forward" communication happens every timestep.  The data is for atom
+coordinates and any other atom properties that need to be updated for
+ghost atoms owned by each processor.
 </P>
-<PRE>package cuda gpu/node/special 2 0 2 
-</PRE>
-<P>A main purpose of the <I>gpu/node/special</I> optoin is to allow two (or
-more) simulations to be run on one workstation.  In that case one
-would set the first simulation to use GPU 0 and the second to use GPU
-1. This is not necessary though, if the GPUs are in what is called
-<I>compute exclusive</I> mode.  Using that setting, every process will get
-its own GPU automatically.  This <I>compute exclusive</I> mode can be set
-as root using the <I>nvidia-smi</I> tool which is part of the CUDA
-installation.
+<P>The value options for these keywords are <I>no</I> or <I>host</I> or <I>device</I>.
+A value of <I>no</I> means to use the standard non-KOKKOS method of
+packing/unpacking data for the communication.  A value of <I>host</I> means
+to use the host, typically a multi-core CPU, and perform the
+packing/unpacking in parallel with threads.  A value of <I>device</I> means
+to use the device, typically a GPU, to perform the packing/unpacking
+operation.
 </P>
-<P>Note that if the <I>gpu/node/special</I> keyword is not used, the USER-CUDA
-package sorts existing GPUs on each node according to their number of
-multiprocessors.  This way, compute GPUs will be priorized over
-X-Server GPUs.
-</P>
-<P>Use of the <I>timing</I> keyword will output detailed timing information
-for various subroutines.
-</P>
-<P>The <I>test</I> keyword will output info for the the specified atom at
-several points during each time step.  This is mainly usefull for
-debugging purposes.  Note that the simulation will be severly slowed
-down if this option is used.
-</P>
-<P>The <I>override/bpa</I> keyword can be used to specify which mode is used
-for pair-force evaluation.  TpA = one thread per atom; BpA = one block
-per atom.  If this keyword is not used, a short test at the begin of
-each run will determine which method is more effective (the result of
-this test is part of the LAMMPS output).  Therefore it is usually not
-necessary to use this keyword.
+<P>The optimal choice for these keywords depends on the input script and
+the hardware used.  The <I>no</I> value is useful for verifying that Kokkos
+code is working correctly.  It may also be the fastest choice when
+using Kokkos styles in MPI-only mode (i.e. with a thread count of 1).
+When running on CPUs or Xeon Phi, the <I>host</I> and <I>device</I> values work
+identically.  When using GPUs, the <I>device</I> value will typically be
+optimal if all of your styles used in your input script are supported
+by the KOKKOS package.  In this case data can stay on the GPU for many
+timesteps without being moved between the host and GPU, if you use the
+<I>device</I> value.  This requires that your MPI is able to access GPU
+memory directly.  Currently that is true for OpenMPI 1.8 (or later
+versions), Mvapich2 1.9 (or later), and CrayMPI.  If your script uses
+styles (e.g. fixes) which are not yet supported by the KOKKOS package,
+then data has to be moved between the host and device anyway, so it is
+typically faster to let the host handle communication, by using the
+<I>host</I> value.  Using <I>host</I> instead of <I>no</I> will enable use of
+multiple threads to pack/unpack communicated data.
 </P>
 <HR>

@ -262,6 +325,10 @@ LAMMPS</A> section for more info.
 with the GPU package.  See the <A HREF = "Section_start.html#start_3">Making
 LAMMPS</A> section for more info.
 </P>
+<P>The kokkos style of this command can only be invoked if LAMMPS was built
+with the KOKKOS package.  See the <A HREF = "Section_start.html#start_3">Making
+LAMMPS</A> section for more info.
+</P>
 <P>The omp style of this command can only be invoked if LAMMPS was built
 with the USER-OMP package.  See the <A HREF = "Section_start.html#start_3">Making
 LAMMPS</A> section for more info.
@ -272,15 +339,20 @@ LAMMPS</A> section for more info.
 </P>
 <P><B>Default:</B>
 </P>
+<P>The default settings for the USER-CUDA package are "package cuda gpu
+2".  This is the case whether the "-sf cuda" <A HREF = "Section_start.html#start_7">command-line
+switch</A> is used or not.
+</P>
 <P>If the "-sf gpu" <A HREF = "Section_start.html#start_7">command-line switch</A> is
 used then it is as if the command "package gpu force/neigh 0 0 1" were
 invoked, to specify default settings for the GPU package.  If the
 command-line switch is not used, then no defaults are set, and you
 must specify the appropriate package command in your input script.
 </P>
-<P>The default settings for the USER CUDA package are "package cuda gpu
-2".  This is the case whether the "-sf cuda" <A HREF = "Section_start.html#start_7">command-line
-switch</A> is used or not.
+<P>The default settings for the KOKKOS package are "package kokkos neigh full 
+comm/exchange host comm/forward host".  This is the case whether the
+"-sf kk" <A HREF = "Section_start.html#start_7">command-line switch</A> is used or
+not.
 </P>
 <P>If the "-sf omp" <A HREF = "Section_start.html#start_7">command-line switch</A> is
 used then it is as if the command "package omp *" were invoked, to
--- a/doc/package.txt
+++ b/doc/package.txt
@ -12,21 +12,8 @@ package command :h3

 package style args :pre

-style = {gpu} or {cuda} or {omp} :ulb,l
+style = {cuda} or {gpu} or {kokkos} or {omp} :ulb,l
 args = arguments specific to the style :l
-  {gpu} args = mode first last split keyword value ...
-    mode = force or force/neigh
-    first = ID of first GPU to be used on each node
-    last = ID of last GPU to be used on each node
-    split = fraction of particles assigned to the GPU
-    zero or more keyword/value pairs may be appended
-    keywords = {threads_per_atom} or {cellsize} or {device}
-      {threads_per_atom} value = Nthreads
-        Nthreads = # of GPU threads used per atom
-      {cellsize} value = dist
-        dist = length (distance units) in each dimension for neighbor bins
-      {device} value = device_type
-        device_type = {kepler} or {fermi} or {cypress} or {phi} or {intel} or {generic}
  {cuda} args = keyword value ...
    one or more keyword/value pairs may be appended
    keywords = {gpu/node} or {gpu/node/special} or {timing} or {test} or {override/bpa}
@ -40,6 +27,25 @@ args = arguments specific to the style :l
        id = atom-ID of a test particle
      {override/bpa} values = flag
        flag = 0 for TpA algorithm, 1 for BpA algorithm 
+  {gpu} args = mode first last split keyword value ...
+    mode = force or force/neigh
+    first = ID of first GPU to be used on each node
+    last = ID of last GPU to be used on each node
+    split = fraction of particles assigned to the GPU
+    zero or more keyword/value pairs may be appended
+    keywords = {threads_per_atom} or {cellsize} or {device}
+      {threads_per_atom} value = Nthreads
+        Nthreads = # of GPU threads used per atom
+      {cellsize} value = dist
+        dist = length (distance units) in each dimension for neighbor bins
+      {device} value = device_type
+        device_type = {kepler} or {fermi} or {cypress} or {phi} or {intel} or {generic}
+  {kokkos} args = keyword value ...
+    one or more keyword/value pairs may be appended
+    keywords = {neigh} or {comm/exchange} or {comm/forward}
+      {neigh} value = {full} or {half/thread} or {half} or {n2} or {full/cluster}
+      {comm/exchange} value = {no} or {host} or {device}
+      {comm/forward} value = {no} or {host} or {device}
  {omp} args = Nthreads mode
    Nthreads = # of OpenMP threads to associate with each MPI process
    mode = force or force/neigh (optional) :pre
@ -53,13 +59,14 @@ package gpu force/neigh 0 0 1.0
 package gpu force/neigh 0 1 -1.0
 package cuda gpu/node/special 2 0 2
 package cuda test 3948
+package kokkos neigh half/thread comm/forward device
 package omp * force/neigh
 package omp 4 force :pre

 [Description:]

 This command invokes package-specific settings.  Currently the
-following packages use it: GPU, USER-CUDA, and USER-OMP.
+following packages use it: USER-CUDA, GPU, KOKKOS, and USER-OMP.

 To use the accelerated GPU and USER-OMP styles, the use of the package
 command is required.  However, as described in the "Defaults" section
@ -68,9 +75,9 @@ options"_Section_start.html#start_7 to enable use of these styles,
 then default package settings are enabled.  In that case you only need
 to use the package command if you want to change the defaults.

-To use the accelerate USER-CUDA styles, the package command is not
-required as defaults are assigned internally.  You only need to use
-the package command if you want to change the defaults.
+To use the accelerated USER-CUDA and KOKKOS styles, the package
+command is not required as defaults are assigned internally.  You only
+need to use the package command if you want to change the defaults.

 See "Section_accelerate"_Section_accelerate.html of the manual for
 more details about using these various packages for accelerating
@ -78,6 +85,58 @@ LAMMPS calculations.

 :line

+The {cuda} style invokes options associated with the use of the
+USER-CUDA package.  
+
+The {gpu/node} keyword specifies the number {N} of GPUs to be used on
+each node.  An MPI process with rank {K} will use the GPU (K mod N).
+This implies that processes should be assigned with successive ranks
+on each node, which is the default with most (or even all) MPI
+implementations. The default value for {N} is 2.
+
+The {gpu/node/special} keyword also specifies the number (N) of GPUs
+to be used on each node, but allows more control over their
+specification.  An MPI process with rank {K} will use the GPU {gpuI}
+with I = (K mod N) + 1. This implies that processes should be assigned
+with successive ranks on each node, which is the default with most (or
+even all) MPI implementations.  For example if you have three GPUs on
+a machine, one of which is used for the X-Server (the GPU with the ID
+1) while the others (with IDs 0 and 2) are used for computations you
+would specify:
+
+package cuda gpu/node/special 2 0 2 :pre
+
+A main purpose of the {gpu/node/special} option is to allow two (or
+more) simulations to be run on one workstation.  In that case one
+would set the first simulation to use GPU 0 and the second to use GPU
+1. This is not necessary though, if the GPUs are in what is called
+{compute exclusive} mode.  Using that setting, every process will get
+its own GPU automatically.  This {compute exclusive} mode can be set
+as root using the {nvidia-smi} tool which is part of the CUDA
+installation.
+
+Note that if the {gpu/node/special} keyword is not used, the USER-CUDA
+package sorts existing GPUs on each node according to their number of
+multiprocessors.  This way, compute GPUs will be prioritized over
+X-Server GPUs.
+ 
+Use of the {timing} keyword will output detailed timing information
+for various subroutines.
+
+The {test} keyword will output info for the specified atom at
+several points during each time step.  This is mainly useful for
+debugging purposes.  Note that the simulation will be severely slowed
+down if this option is used.
+
+The {override/bpa} keyword can be used to specify which mode is used
+for pair-force evaluation.  TpA = one thread per atom; BpA = one block
+per atom.  If this keyword is not used, a short test at the beginning of
+each run will determine which method is more effective (the result of
+this test is part of the LAMMPS output).  Therefore it is usually not
+necessary to use this keyword.
+
+:line
+
 The {gpu} style invokes options associated with the use of the GPU
 package. 

@ -152,55 +211,59 @@ the GPU library.

 :line

-The {cuda} style invokes options associated with the use of the
-USER-CUDA package.  
+The {kokkos} style invokes options associated with the use of the
+KOKKOS package.

-The {gpu/node} keyword specifies the number {N} of GPUs to be used on
-each node.  An MPI process with rank {K} will use the GPU (K mod N).
-This implies that processes should be assigned with successive ranks
-on each node, which is the default with most (or even all) MPI
-implementations. The default value for {N} is 2.
+The {neigh} keyword determines what kinds of neighbor lists are built.
+A value of {half} uses half-neighbor lists, the same as used by most
+pair styles in LAMMPS.  A value of {half/thread} uses a threadsafe
+variant of the half-neighbor list.  It should be used instead of
+{half} when running with threads on a CPU.  A value of {full} uses a
+full neighbor list, i.e. f_ij and f_ji are both calculated.  This
+performs twice as much computation as the {half} option, however that
+can be a win because it is threadsafe and doesn't require atomic
+operations.  A value of {full/cluster} is an experimental neighbor
+style, where particles interact with all particles within a small
+cluster, if at least one of the cluster's particles is within the
+neighbor cutoff range.  This potentially allows for better
+vectorization on architectures such as the Intel Phi.  It also reduces
+the size of the neighbor list by roughly a factor of the cluster size,
+thus reducing the total memory footprint considerably.

-The {gpu/node/special} keyword also specifies the number (N) of GPUs
-to be used on each node, but allows more control over their
-specification.  An MPI process with rank {K} will use the GPU {gpuI}
-with l = (K mod N) + 1. This implies that processes should be assigned
-with successive ranks on each node, which is the default with most (or
-even all) MPI implementations.  For example if you have three GPUs on
-a machine, one of which is used for the X-Server (the GPU with the ID
-1) while the others (with IDs 0 and 2) are used for computations you
-would specify:
+The {comm/exchange} and {comm/forward} keywords determine whether the
+host or device performs the packing and unpacking of data when
+communicating information between processors.  "Exchange"
+communication happens only on timesteps that neighbor lists are
+rebuilt.  The data is only for atoms that migrate to new processors.
+"Forward" communication happens every timestep.  The data is for atom
+coordinates and any other atom properties that need to be updated for
+ghost atoms owned by each processor.

-package cuda gpu/node/special 2 0 2 :pre
+The value options for these keywords are {no} or {host} or {device}.
+A value of {no} means to use the standard non-KOKKOS method of
+packing/unpacking data for the communication.  A value of {host} means
+to use the host, typically a multi-core CPU, and perform the
+packing/unpacking in parallel with threads.  A value of {device} means
+to use the device, typically a GPU, to perform the packing/unpacking
+operation.

-A main purpose of the {gpu/node/special} optoin is to allow two (or
-more) simulations to be run on one workstation.  In that case one
-would set the first simulation to use GPU 0 and the second to use GPU
-1. This is not necessary though, if the GPUs are in what is called
-{compute exclusive} mode.  Using that setting, every process will get
-its own GPU automatically.  This {compute exclusive} mode can be set
-as root using the {nvidia-smi} tool which is part of the CUDA
-installation.
-
-Note that if the {gpu/node/special} keyword is not used, the USER-CUDA
-package sorts existing GPUs on each node according to their number of
-multiprocessors.  This way, compute GPUs will be priorized over
-X-Server GPUs.
- 
-Use of the {timing} keyword will output detailed timing information
-for various subroutines.
-
-The {test} keyword will output info for the the specified atom at
-several points during each time step.  This is mainly usefull for
-debugging purposes.  Note that the simulation will be severly slowed
-down if this option is used.
-
-The {override/bpa} keyword can be used to specify which mode is used
-for pair-force evaluation.  TpA = one thread per atom; BpA = one block
-per atom.  If this keyword is not used, a short test at the begin of
-each run will determine which method is more effective (the result of
-this test is part of the LAMMPS output).  Therefore it is usually not
-necessary to use this keyword.
+The optimal choice for these keywords depends on the input script and
+the hardware used.  The {no} value is useful for verifying that Kokkos
+code is working correctly.  It may also be the fastest choice when
+using Kokkos styles in MPI-only mode (i.e. with a thread count of 1).
+When running on CPUs or Xeon Phi, the {host} and {device} values work
+identically.  When using GPUs, the {device} value will typically be
+optimal if all of your styles used in your input script are supported
+by the KOKKOS package.  In this case data can stay on the GPU for many
+timesteps without being moved between the host and GPU, if you use the
+{device} value.  This requires that your MPI is able to access GPU
+memory directly.  Currently that is true for OpenMPI 1.8 (or later
+versions), Mvapich2 1.9 (or later), and CrayMPI.  If your script uses
+styles (e.g. fixes) which are not yet supported by the KOKKOS package,
+then data has to be moved between the host and device anyway, so it is
+typically faster to let the host handle communication, by using the
+{host} value.  Using {host} instead of {no} will enable use of
+multiple threads to pack/unpack communicated data.

 :line

@ -256,8 +319,10 @@ LAMMPS"_Section_start.html#start_3 section for more info.
 The gpu style of this command can only be invoked if LAMMPS was built
 with the GPU package.  See the "Making
 LAMMPS"_Section_start.html#start_3 section for more info.
-When using the "r-RESPA run style"_run_style.html, GPU accelerated
-styles can only be used on the outermost RESPA level.
+
+The kokkos style of this command can only be invoked if LAMMPS was built
+with the KOKKOS package.  See the "Making
+LAMMPS"_Section_start.html#start_3 section for more info.

 The omp style of this command can only be invoked if LAMMPS was built
 with the USER-OMP package.  See the "Making
@ -269,15 +334,20 @@ LAMMPS"_Section_start.html#start_3 section for more info.

 [Default:]

+The default settings for the USER-CUDA package are "package cuda gpu
+2".  This is the case whether the "-sf cuda" "command-line
+switch"_Section_start.html#start_7 is used or not.
+
 If the "-sf gpu" "command-line switch"_Section_start.html#start_7 is
 used then it is as if the command "package gpu force/neigh 0 0 1" were
 invoked, to specify default settings for the GPU package.  If the
 command-line switch is not used, then no defaults are set, and you
 must specify the appropriate package command in your input script.

-The default settings for the USER CUDA package are "package cuda gpu
-2".  This is the case whether the "-sf cuda" "command-line
-switch"_Section_start.html#start_7 is used or not.
+The default settings for the KOKKOS package are "package kokkos neigh full 
+comm/exchange host comm/forward host".  This is the case whether the
+"-sf kk" "command-line switch"_Section_start.html#start_7 is used or
+not.

 If the "-sf omp" "command-line switch"_Section_start.html#start_7 is
 used then it is as if the command "package omp *" were invoked, to
--- a/doc/pair_lj.html
+++ b/doc/pair_lj.html
@ -17,6 +17,8 @@
 </H3>
 <H3>pair_style lj/cut/gpu command 
 </H3>
+<H3>pair_style lj/cut/kk command 
+</H3>
 <H3>pair_style lj/cut/opt command 
 </H3>
 <H3>pair_style lj/cut/omp command 
@ -263,17 +265,18 @@ pair_style command.
 </P>
 <HR>

-<P>Styles with a <I>cuda</I>, <I>gpu</I>, <I>omp</I>, or <I>opt</I> suffix are functionally
-the same as the corresponding style without the suffix.  They have
-been optimized to run faster, depending on your available hardware, as
-discussed in <A HREF = "Section_accelerate.html">Section_accelerate</A> of the
-manual.  The accelerated styles take the same arguments and should
-produce the same results, except for round-off and precision issues.
+<P>Styles with a <I>cuda</I>, <I>gpu</I>, <I>kk</I>, <I>omp</I>, or <I>opt</I> suffix are
+functionally the same as the corresponding style without the suffix.
+They have been optimized to run faster, depending on your available
+hardware, as discussed in <A HREF = "Section_accelerate.html">Section_accelerate</A>
+of the manual.  The accelerated styles take the same arguments and
+should produce the same results, except for round-off and precision
+issues.
 </P>
-<P>These accelerated styles are part of the USER-CUDA, GPU, USER-OMP and OPT
-packages, respectively.  They are only enabled if LAMMPS was built with
-those packages.  See the <A HREF = "Section_start.html#start_3">Making LAMMPS</A>
-section for more info.
+<P>These accelerated styles are part of the USER-CUDA, GPU, KOKKOS,
+USER-OMP and OPT packages, respectively.  They are only enabled if
+LAMMPS was built with those packages.  See the <A HREF = "Section_start.html#start_3">Making
+LAMMPS</A> section for more info.
 </P>
 <P>You can specify the accelerated styles explicitly in your input script
 by including their suffix, or you can use the <A HREF = "Section_start.html#start_7">-suffix command-line
--- a/doc/pair_lj.txt
+++ b/doc/pair_lj.txt
@ -10,6 +10,7 @@ pair_style lj/cut command :h3
 pair_style lj/cut/cuda command :h3
 pair_style lj/cut/experimental/cuda command :h3
 pair_style lj/cut/gpu command :h3
+pair_style lj/cut/kk command :h3
 pair_style lj/cut/opt command :h3
 pair_style lj/cut/omp command :h3
 pair_style lj/cut/coul/cut command :h3
@ -230,17 +231,18 @@ pair_style command.

 :line

-Styles with a {cuda}, {gpu}, {omp}, or {opt} suffix are functionally
-the same as the corresponding style without the suffix.  They have
-been optimized to run faster, depending on your available hardware, as
-discussed in "Section_accelerate"_Section_accelerate.html of the
-manual.  The accelerated styles take the same arguments and should
-produce the same results, except for round-off and precision issues.
+Styles with a {cuda}, {gpu}, {kk}, {omp}, or {opt} suffix are
+functionally the same as the corresponding style without the suffix.
+They have been optimized to run faster, depending on your available
+hardware, as discussed in "Section_accelerate"_Section_accelerate.html
+of the manual.  The accelerated styles take the same arguments and
+should produce the same results, except for round-off and precision
+issues.

-These accelerated styles are part of the USER-CUDA, GPU, USER-OMP and OPT
-packages, respectively.  They are only enabled if LAMMPS was built with
-those packages.  See the "Making LAMMPS"_Section_start.html#start_3
-section for more info.
+These accelerated styles are part of the USER-CUDA, GPU, KOKKOS,
+USER-OMP and OPT packages, respectively.  They are only enabled if
+LAMMPS was built with those packages.  See the "Making
+LAMMPS"_Section_start.html#start_3 section for more info.

 You can specify the accelerated styles explicitly in your input script
 by including their suffix, or you can use the "-suffix command-line
--- a/doc/pair_table.html
+++ b/doc/pair_table.html
@ -13,6 +13,8 @@
 </H3>
 <H3>pair_style table/gpu command 
 </H3>
+<H3>pair_style table/kk command 
+</H3>
 <H3>pair_style table/omp command 
 </H3>
 <P><B>Syntax:</B>
@ -200,17 +202,18 @@ one that matches the specified keyword.
 </P>
 <HR>

-<P>Styles with a <I>cuda</I>, <I>gpu</I>, <I>omp</I>, or <I>opt</I> suffix are functionally
-the same as the corresponding style without the suffix.  They have
-been optimized to run faster, depending on your available hardware, as
-discussed in <A HREF = "Section_accelerate.html">Section_accelerate</A> of the
-manual.  The accelerated styles take the same arguments and should
-produce the same results, except for round-off and precision issues.
+<P>Styles with a <I>cuda</I>, <I>gpu</I>, <I>kk</I>, <I>omp</I>, or <I>opt</I> suffix are
+functionally the same as the corresponding style without the suffix.
+They have been optimized to run faster, depending on your available
+hardware, as discussed in <A HREF = "Section_accelerate.html">Section_accelerate</A>
+of the manual.  The accelerated styles take the same arguments and
+should produce the same results, except for round-off and precision
+issues.
 </P>
-<P>These accelerated styles are part of the USER-CUDA, GPU, USER-OMP and OPT
-packages, respectively.  They are only enabled if LAMMPS was built with
-those packages.  See the <A HREF = "Section_start.html#start_3">Making LAMMPS</A>
-section for more info.
+<P>These accelerated styles are part of the USER-CUDA, GPU, KOKKOS,
+USER-OMP and OPT packages, respectively.  They are only enabled if
+LAMMPS was built with those packages.  See the <A HREF = "Section_start.html#start_3">Making
+LAMMPS</A> section for more info.
 </P>
 <P>You can specify the accelerated styles explicitly in your input script
 by including their suffix, or you can use the <A HREF = "Section_start.html#start_7">-suffix command-line
--- a/doc/pair_table.txt
+++ b/doc/pair_table.txt
@ -8,6 +8,7 @@

 pair_style table command :h3
 pair_style table/gpu command :h3
+pair_style table/kk command :h3
 pair_style table/omp command :h3

 [Syntax:]
@ -195,17 +196,18 @@ one that matches the specified keyword.

 :line

-Styles with a {cuda}, {gpu}, {omp}, or {opt} suffix are functionally
-the same as the corresponding style without the suffix.  They have
-been optimized to run faster, depending on your available hardware, as
-discussed in "Section_accelerate"_Section_accelerate.html of the
-manual.  The accelerated styles take the same arguments and should
-produce the same results, except for round-off and precision issues.
+Styles with a {cuda}, {gpu}, {kk}, {omp}, or {opt} suffix are
+functionally the same as the corresponding style without the suffix.
+They have been optimized to run faster, depending on your available
+hardware, as discussed in "Section_accelerate"_Section_accelerate.html
+of the manual.  The accelerated styles take the same arguments and
+should produce the same results, except for round-off and precision
+issues.

-These accelerated styles are part of the USER-CUDA, GPU, USER-OMP and OPT
-packages, respectively.  They are only enabled if LAMMPS was built with
-those packages.  See the "Making LAMMPS"_Section_start.html#start_3
-section for more info.
+These accelerated styles are part of the USER-CUDA, GPU, KOKKOS,
+USER-OMP and OPT packages, respectively.  They are only enabled if
+LAMMPS was built with those packages.  See the "Making
+LAMMPS"_Section_start.html#start_3 section for more info.

 You can specify the accelerated styles explicitly in your input script
 by including their suffix, or you can use the "-suffix command-line
--- a/doc/suffix.html
+++ b/doc/suffix.html
@ -15,56 +15,62 @@
 </P>
 <PRE>suffix style 
 </PRE>
-<UL><LI>style = <I>off</I> or <I>on</I> or <I>opt</I> or <I>omp</I> or <I>gpu</I> or <I>cuda</I> 
+<UL><LI>style = <I>off</I> or <I>on</I> or <I>cuda</I> or <I>gpu</I> or <I>kk</I> or <I>omp</I> or <I>opt</I> 
 </UL>
 <P><B>Examples:</B>
 </P>
 <PRE>suffix off
 suffix on
-suffix gpu 
+suffix gpu
+suffix kk 
 </PRE>
 <P><B>Description:</B>
 </P>
 <P>This command allows you to use variants of various styles if they
 exist.  In that respect it operates the same as the <A HREF = "Section_start.html#start_7">-suffix
 command-line switch</A>.  It also has options
-to turn off/on any suffix setting made via the command line.
+to turn off or back on any suffix setting made via the command line.
 </P>
-<P>The specified style can be <I>opt</I>, <I>omp</I>, <I>gpu</I>, or <I>cuda</I>.  These refer to
-optional packages that LAMMPS can be built with, as described in <A HREF = "Section_start.html#start_3">this
-section of the manual</A>.  The "opt" style
-corrsponds to the OPT package, the "omp" style to the USER-OMP package, 
-the "gpu" style to the GPU package, and the "cuda" style to the
-USER-CUDA package.
+<P>The specified style can be <I>cuda</I>, <I>gpu</I>, <I>kk</I>, <I>omp</I>, or <I>opt</I>.
+These refer to optional packages that LAMMPS can be built with, as
+described in <A HREF = "Section_start.html#start_3">this section of the manual</A>.
+The "cuda" style corresponds to the USER-CUDA package, the "gpu" style
+to the GPU package, the "kk" style to the KOKKOS package, the "omp"
+style to the USER-OMP package, and the "opt" style to the OPT package.
 </P>
 <P>These are the variants these packages provide:
 </P>
-<UL><LI>OPT = a handful of pair styles, cache-optimized for faster CPU
-performance 
+<UL><LI>USER-CUDA = a collection of atom, pair, fix, compute, and integrate
+styles, optimized to run on one or more NVIDIA GPUs 
+
+<LI>GPU = a handful of pair styles and the PPPM kspace_style, optimized to
+run on one or more GPUs or multicore CPU/GPU nodes 
+
+<LI>KOKKOS = a collection of atom, pair, and fix styles optimized to run
+using the Kokkos library on various kinds of hardware, including GPUs
+via Cuda and many-core chips via OpenMP or threading. 

 <LI>USER-OMP = a collection of pair, bond, angle, dihedral, improper,
 kspace, compute, and fix styles with support for OpenMP
 multi-threading 

-<LI>GPU = a handful of pair styles and the PPPM kspace_style, optimized to
-run on one or more GPUs or multicore CPU/GPU nodes 
-
-<LI>USER-CUDA = a collection of atom, pair, fix, compute, and intergrate
-styles, optimized to run on one or more NVIDIA GPUs 
+<LI>OPT = a handful of pair styles, cache-optimized for faster CPU
+performance 
 </UL>
 <P>As an example, all of the packages provide a <A HREF = "pair_lj.html">pair_style
 lj/cut</A> variant, with style names lj/cut/opt, lj/cut/omp,
-lj/cut/gpu, or lj/cut/cuda.  A variant styles can be specified
-explicitly in your input script, e.g. pair_style lj/cut/gpu.  If the
-suffix command is used with the appropriate style, you do not need to
-modify your input script.  The specified suffix (opt,omp,gpu,cuda) is
-automatically appended whenever your input script command creates a
-new <A HREF = "atom_style.html">atom</A>, <A HREF = "pair_style.html">pair</A>,
-<A HREF = "bond_style.html">bond</A>, <A HREF = "angle_style.html">angle</A>,
-<A HREF = "dihedral_style.html">dihedral</A>, <A HREF = "improper_style.html">improper</A>,
-<A HREF = "kspace_style.html">kspace</A>, <A HREF = "fix.html">fix</A>, <A HREF = "compute.html">compute</A>, or
-<A HREF = "run_style.html">run</A> style.  If the variant version does not exist,
-the standard version is created.
+lj/cut/gpu, lj/cut/cuda, or lj/cut/kk.  A variant style can be
+specified explicitly in your input script, e.g. pair_style lj/cut/gpu.
+If the suffix command is used with the appropriate style, you do not
+need to modify your input script.  The specified suffix
+(opt,omp,gpu,cuda,kk) is automatically appended whenever your input
+script command creates a new <A HREF = "atom_style.html">atom</A>,
+<A HREF = "pair_style.html">pair</A>, <A HREF = "bond_style.html">bond</A>,
+<A HREF = "angle_style.html">angle</A>, <A HREF = "dihedral_style.html">dihedral</A>,
+<A HREF = "improper_style.html">improper</A>, <A HREF = "kspace_style.html">kspace</A>,
+<A HREF = "fix.html">fix</A>, <A HREF = "compute.html">compute</A>, or <A HREF = "run_style.html">run</A> style.
+If the variant version does not exist, the standard version is
+created.
 </P>
 <P>If the specified style is <I>off</I>, then any previously specified suffix
 is temporarily disabled, whether it was specified by a command-line
--- a/doc/suffix.txt
+++ b/doc/suffix.txt
@ -12,56 +12,62 @@ suffix command :h3

 suffix style :pre

-style = {off} or {on} or {opt} or {omp} or {gpu} or {cuda} :ul
+style = {off} or {on} or {cuda} or {gpu} or {kk} or {omp} or {opt} :ul

 [Examples:]

 suffix off
 suffix on
-suffix gpu :pre
+suffix gpu
+suffix kk :pre

 [Description:]

 This command allows you to use variants of various styles if they
 exist.  In that respect it operates the same as the "-suffix
 command-line switch"_Section_start.html#start_7.  It also has options
-to turn off/on any suffix setting made via the command line.
+to turn off or back on any suffix setting made via the command line.

-The specified style can be {opt}, {omp}, {gpu}, or {cuda}.  These refer to
-optional packages that LAMMPS can be built with, as described in "this
-section of the manual"_Section_start.html#start_3.  The "opt" style
-corrsponds to the OPT package, the "omp" style to the USER-OMP package, 
-the "gpu" style to the GPU package, and the "cuda" style to the
-USER-CUDA package.
+The specified style can be {cuda}, {gpu}, {kk}, {omp}, or {opt}.
+These refer to optional packages that LAMMPS can be built with, as
+described in "this section of the manual"_Section_start.html#start_3.
+The "cuda" style corresponds to the USER-CUDA package, the "gpu" style
+to the GPU package, the "kk" style to the KOKKOS package, the "omp"
+style to the USER-OMP package, and the "opt" style to the OPT package.

 These are the variants these packages provide:

-OPT = a handful of pair styles, cache-optimized for faster CPU
-performance :ulb,l
+USER-CUDA = a collection of atom, pair, fix, compute, and integrate
+styles, optimized to run on one or more NVIDIA GPUs :ulb,l
+
+GPU = a handful of pair styles and the PPPM kspace_style, optimized to
+run on one or more GPUs or multicore CPU/GPU nodes :l
+
+KOKKOS = a collection of atom, pair, and fix styles optimized to run
+using the Kokkos library on various kinds of hardware, including GPUs
+via Cuda and many-core chips via OpenMP or threading. :l

 USER-OMP = a collection of pair, bond, angle, dihedral, improper,
 kspace, compute, and fix styles with support for OpenMP
 multi-threading :l

-GPU = a handful of pair styles and the PPPM kspace_style, optimized to
-run on one or more GPUs or multicore CPU/GPU nodes :l
-
-USER-CUDA = a collection of atom, pair, fix, compute, and intergrate
-styles, optimized to run on one or more NVIDIA GPUs :l,ule
+OPT = a handful of pair styles, cache-optimized for faster CPU
+performance :ule,l

 As an example, all of the packages provide a "pair_style
 lj/cut"_pair_lj.html variant, with style names lj/cut/opt, lj/cut/omp,
-lj/cut/gpu, or lj/cut/cuda.  A variant styles can be specified
-explicitly in your input script, e.g. pair_style lj/cut/gpu.  If the
-suffix command is used with the appropriate style, you do not need to
-modify your input script.  The specified suffix (opt,omp,gpu,cuda) is
-automatically appended whenever your input script command creates a
-new "atom"_atom_style.html, "pair"_pair_style.html,
-"bond"_bond_style.html, "angle"_angle_style.html,
-"dihedral"_dihedral_style.html, "improper"_improper_style.html,
-"kspace"_kspace_style.html, "fix"_fix.html, "compute"_compute.html, or
-"run"_run_style.html style.  If the variant version does not exist,
-the standard version is created.
+lj/cut/gpu, lj/cut/cuda, or lj/cut/kk.  A variant style can be
+specified explicitly in your input script, e.g. pair_style lj/cut/gpu.
+If the suffix command is used with the appropriate style, you do not
+need to modify your input script.  The specified suffix
+(opt,omp,gpu,cuda,kk) is automatically appended whenever your input
+script command creates a new "atom"_atom_style.html,
+"pair"_pair_style.html, "bond"_bond_style.html,
+"angle"_angle_style.html, "dihedral"_dihedral_style.html,
+"improper"_improper_style.html, "kspace"_kspace_style.html,
+"fix"_fix.html, "compute"_compute.html, or "run"_run_style.html style.
+If the variant version does not exist, the standard version is
+created.

 If the specified style is {off}, then any previously specified suffix
 is temporarily disabled, whether it was specified by a command-line
--- a/doc/velocity.html
+++ b/doc/velocity.html
@ -207,11 +207,28 @@ are in units of lattice spacings per time (e.g. spacings/fmsec) and
 coordinates are in lattice spacings.  The <A HREF = "lattice.html">lattice</A>
 command must have been previously used to define the lattice spacing.
 </P>
-<P><B>Restrictions:</B> none
+<P><B>Restrictions:</B>
 </P>
+<P>Assigning a temperature via the <I>create</I> option to a system with
+<A HREF = "fix_rigid.html">rigid bodies</A> or <A HREF = "fix_shake.html">SHAKE constraints</A>
+may not have the desired outcome for two reasons.  First, the velocity
+command can be invoked before all of the relevant fixes are created
+and initialized and the number of adjusted degrees of freedom (DOFs)
+is known.  Thus it is not possible to compute the target temperature
+correctly.  Second, the assigned velocities may be partially canceled
+when constraints are first enforced, leading to a different
+temperature than desired.  A workaround for this is to perform a <A HREF = "run.html">run
+0</A> command, which ensures all DOFs are accounted for
+properly, and then rescale the temperature to the desired value before
+performing a simulation.  For example:
+</P>
+<PRE>velocity all create 300.0 12345
+run 0                             # temperature may not be 300K
+velocity all scale 300.0          # now it should be 
+</PRE>
 <P><B>Related commands:</B>
 </P>
-<P><A HREF = "fix_shake.html">fix shake</A>, <A HREF = "lattice.html">lattice</A>
+<P><A HREF = "fix_rigid.html">fix rigid</A>, <A HREF = "fix_shake.html">fix shake</A>, <A HREF = "lattice.html">lattice</A>
 </P>
 <P><B>Default:</B>
 </P>
--- a/doc/velocity.txt
+++ b/doc/velocity.txt
@ -199,11 +199,28 @@ are in units of lattice spacings per time (e.g. spacings/fmsec) and
 coordinates are in lattice spacings.  The "lattice"_lattice.html
 command must have been previously used to define the lattice spacing.

-[Restrictions:] none
+[Restrictions:]
+
+Assigning a temperature via the {create} option to a system with
+"rigid bodies"_fix_rigid.html or "SHAKE constraints"_fix_shake.html
+may not have the desired outcome for two reasons.  First, the velocity
+command can be invoked before all of the relevant fixes are created
+and initialized and the number of adjusted degrees of freedom (DOFs)
+is known.  Thus it is not possible to compute the target temperature
+correctly.  Second, the assigned velocities may be partially canceled
+when constraints are first enforced, leading to a different
+temperature than desired.  A workaround for this is to perform a "run
+0"_run.html command, which ensures all DOFs are accounted for
+properly, and then rescale the temperature to the desired value before
+performing a simulation.  For example:
+
+velocity all create 300.0 12345
+run 0                             # temperature may not be 300K
+velocity all scale 300.0          # now it should be :pre

 [Related commands:]

-"fix shake"_fix_shake.html, "lattice"_lattice.html
+"fix rigid"_fix_rigid.html, "fix shake"_fix_shake.html, "lattice"_lattice.html

 [Default:]

--- a/examples/README
+++ b/examples/README
@ -73,6 +73,7 @@ gpu:      use of the GPU package for GPU acceleration
 hugoniostat: Hugoniostat shock dynamics
 indent:	  spherical indenter into a 2d solid
 kim:      use of potentials in Knowledge Base for Interatomic Models (KIM)
+kokkos:   use of the KOKKOS package for multi-threading and GPU acceleration
 meam:	  MEAM test for SiC and shear (same as shear examples)
 melt:	  rapid melt of 3d LJ system
 micelle:  self-assembly of small lipid-like molecules into 2d bilayers
--- a/examples/gpu/README
+++ b/examples/gpu/README
@ -0,0 +1,35 @@
+These are input scripts designed for use with the GPU package.
+
+To run them, you must first build LAMMPS with the GPU package
+installed, following the steps explained in Section 2.3 of
+doc/Section_start.html and lib/gpu/README.  An overview of building
+and running LAMMPS with the GPU package is given in Section 5.6 of
+doc/Section_accelerate.html.  Note that you can choose the precision
+at which computations are performed on the GPU in the build process.
+
+Note that lines such as this in each of the input scripts:
+
+package 	gpu force/neigh 0 1 1
+
+are set for running on a compute node with 2 GPUs.  If you
+have a single GPU, you should comment out the line, since
+the default is 1 GPU per compute node.
+
+The scripts can be run in the usual manner:
+
+lmp_g++ < in.gpu.melt.2.5
+lmp_g++ < in.gpu.melt.5.0
+lmp_g++ < in.gpu.phosphate
+lmp_g++ < in.gpu.rhodo
+
+mpirun -np 4 lmp_g++ < in.gpu.melt.2.5
+mpirun -np 4 lmp_g++ < in.gpu.melt.5.0
+mpirun -np 4 lmp_g++ < in.gpu.phosphate
+mpirun -np 4 lmp_g++ < in.gpu.rhodo
+
+The first set of commands will run a single MPI task using a single
+GPU (even if you have 2 GPUs).
+
+The second set of commands will run 4 MPI tasks, with 2 MPI tasks per
+GPU (if you have 2 GPUs), or 4 MPI tasks per GPU (if you have a single
+GPU).
--- a/examples/kokkos/README
+++ b/examples/kokkos/README
@ -0,0 +1,42 @@
+The in.kokkos input script is a copy of the bench/in.lj script,
+but can be run with the KOKKOS package.
+
+To run it, you must first build LAMMPS with the KOKKOS package
+installed, following the steps explained in Section 2.3.4 of
+doc/Section_start.html.  An overview of building and running LAMMPS
+with the KOKKOS package, for different compute-node hardware on your
+machine, is given in Section 5.8 of doc/Section_accelerate.html.
+
+The example log files included in this directory are for a desktop box
+with dual hex-core CPUs and 2 GPUs.
+
+Two executables were built in the following manner:
+
+make yes-kokkos
+make g++ OMP=yes -> lmp_cpu
+make cuda CUDA=yes -> lmp_cuda
+
+Then the following runs were made.  The "->" means that the run
+produced log.lammps which was then copied to the named log file.
+
+* MPI-only runs
+
+lmp_cpu -k off < in.kokkos -> log.kokkos.date.mpionly.1
+mpirun -np 4 lmp_cpu -k off < in.kokkos -> log.kokkos.date.mpionly.4
+
+* OpenMP threaded runs on CPUs only
+
+lmp_cpu -k on t 1 -sf kk < in.kokkos.half -> log.kokkos.date.cpu.1
+lmp_cpu -k on t 4 -sf kk < in.kokkos -> log.kokkos.date.cpu.4
+
+Note that in.kokkos.half was used for one of the runs, which uses the
+package command to force the use of half neighbor lists which are
+faster when running on just 1 thread.
+
+* GPU runs on 1 or 2 GPUs
+
+lmp_cuda -k on t 6 -sf kk < in.kokkos -> log.kokkos.date.gpu.1
+mpirun -np 2 lmp_cuda -k on t 6 -sf kk < in.kokkos -> log.kokkos.date.gpu.2
+
+Note that this is a very small problem (32K atoms) to run
+on 1 or 2 GPUs.
--- a/examples/kokkos/in.kokkos
+++ b/examples/kokkos/in.kokkos
@ -0,0 +1,30 @@
+# 3d Lennard-Jones melt
+
+variable	x index 1
+variable	y index 1
+variable	z index 1
+
+variable	xx equal 20*$x
+variable	yy equal 20*$y
+variable	zz equal 20*$z
+
+units		lj
+atom_style	atomic
+
+lattice		fcc 0.8442
+region		box block 0 ${xx} 0 ${yy} 0 ${zz}
+create_box	1 box
+create_atoms	1 box
+mass		1 1.0
+
+velocity	all create 1.44 87287 loop geom
+
+pair_style	lj/cut 2.5
+pair_coeff	1 1 1.0 1.0 2.5
+
+neighbor	0.3 bin
+neigh_modify	delay 0 every 20 check no
+
+fix		1 all nve
+
+run		100
--- a/examples/kokkos/in.kokkos.half
+++ b/examples/kokkos/in.kokkos.half
@ -0,0 +1,32 @@
+# 3d Lennard-Jones melt
+
+variable	x index 1
+variable	y index 1
+variable	z index 1
+
+variable	xx equal 20*$x
+variable	yy equal 20*$y
+variable	zz equal 20*$z
+
+package         kokkos neigh half
+
+units		lj
+atom_style	atomic
+
+lattice		fcc 0.8442
+region		box block 0 ${xx} 0 ${yy} 0 ${zz}
+create_box	1 box
+create_atoms	1 box
+mass		1 1.0
+
+velocity	all create 1.44 87287 loop geom
+
+pair_style	lj/cut 2.5
+pair_coeff	1 1 1.0 1.0 2.5
+
+neighbor	0.3 bin
+neigh_modify	delay 0 every 20 check no
+
+fix		1 all nve
+
+run		100
--- a/examples/kokkos/log.kokkos.1Feb14.cpu.1
+++ b/examples/kokkos/log.kokkos.1Feb14.cpu.1
@ -0,0 +1,68 @@
+LAMMPS (27 May 2014)
+KOKKOS mode is enabled (../lammps.cpp:468)
+  using 1 OpenMP thread(s) per MPI task
+# 3d Lennard-Jones melt
+
+variable	x index 1
+variable	y index 1
+variable	z index 1
+
+variable	xx equal 20*$x
+variable	xx equal 20*1
+variable	yy equal 20*$y
+variable	yy equal 20*1
+variable	zz equal 20*$z
+variable	zz equal 20*1
+
+package         kokkos neigh half
+
+units		lj
+atom_style	atomic
+
+lattice		fcc 0.8442
+Lattice spacing in x,y,z = 1.6796 1.6796 1.6796
+region		box block 0 ${xx} 0 ${yy} 0 ${zz}
+region		box block 0 20 0 ${yy} 0 ${zz}
+region		box block 0 20 0 20 0 ${zz}
+region		box block 0 20 0 20 0 20
+create_box	1 box
+Created orthogonal box = (0 0 0) to (33.5919 33.5919 33.5919)
+  1 by 1 by 1 MPI processor grid
+create_atoms	1 box
+Created 32000 atoms
+mass		1 1.0
+
+velocity	all create 1.44 87287 loop geom
+
+pair_style	lj/cut 2.5
+pair_coeff	1 1 1.0 1.0 2.5
+
+neighbor	0.3 bin
+neigh_modify	delay 0 every 20 check no
+
+fix		1 all nve
+
+run		100
+Memory usage per processor = 7.79551 Mbytes
+Step Temp E_pair E_mol TotEng Press 
+       0         1.44   -6.7733681            0   -4.6134356   -5.0197073 
+     100    0.7574531   -5.7585055            0   -4.6223613   0.20726105 
+Loop time of 2.29105 on 1 procs (1 MPI x 1 OpenMP) for 100 steps with 32000 atoms
+
+Pair  time (%) = 1.82425 (79.6249)
+Neigh time (%) = 0.338632 (14.7806)
+Comm  time (%) = 0.0366232 (1.59853)
+Outpt time (%) = 0.000144005 (0.00628553)
+Other time (%) = 0.0914049 (3.98965)
+
+Nlocal:    32000 ave 32000 max 32000 min
+Histogram: 1 0 0 0 0 0 0 0 0 0
+Nghost:    19657 ave 19657 max 19657 min
+Histogram: 1 0 0 0 0 0 0 0 0 0
+Neighs:    1.20283e+06 ave 1.20283e+06 max 1.20283e+06 min
+Histogram: 1 0 0 0 0 0 0 0 0 0
+
+Total # of neighbors = 1202833
+Ave neighs/atom = 37.5885
+Neighbor list builds = 5
+Dangerous builds = 0
--- a/examples/kokkos/log.kokkos.1Feb14.cpu.4
+++ b/examples/kokkos/log.kokkos.1Feb14.cpu.4
@ -0,0 +1,68 @@
+LAMMPS (27 May 2014)
+KOKKOS mode is enabled (../lammps.cpp:468)
+  using 4 OpenMP thread(s) per MPI task
+# 3d Lennard-Jones melt
+
+variable	x index 1
+variable	y index 1
+variable	z index 1
+
+variable	xx equal 20*$x
+variable	xx equal 20*1
+variable	yy equal 20*$y
+variable	yy equal 20*1
+variable	zz equal 20*$z
+variable	zz equal 20*1
+
+units		lj
+atom_style	atomic
+
+lattice		fcc 0.8442
+Lattice spacing in x,y,z = 1.6796 1.6796 1.6796
+region		box block 0 ${xx} 0 ${yy} 0 ${zz}
+region		box block 0 20 0 ${yy} 0 ${zz}
+region		box block 0 20 0 20 0 ${zz}
+region		box block 0 20 0 20 0 20
+create_box	1 box
+Created orthogonal box = (0 0 0) to (33.5919 33.5919 33.5919)
+  1 by 1 by 1 MPI processor grid
+create_atoms	1 box
+Created 32000 atoms
+mass		1 1.0
+
+velocity	all create 1.44 87287 loop geom
+
+pair_style	lj/cut 2.5
+pair_coeff	1 1 1.0 1.0 2.5
+
+neighbor	0.3 bin
+neigh_modify	delay 0 every 20 check no
+
+fix		1 all nve
+
+run		100
+Memory usage per processor = 13.2888 Mbytes
+Step Temp E_pair E_mol TotEng Press 
+       0         1.44   -6.7733681            0   -4.6134356   -5.0197073 
+     100    0.7574531   -5.7585055            0   -4.6223613   0.20726105 
+Loop time of 0.983697 on 4 procs (1 MPI x 4 OpenMP) for 100 steps with 32000 atoms
+
+Pair  time (%) = 0.767155 (77.9869)
+Neigh time (%) = 0.14734 (14.9782)
+Comm  time (%) = 0.041466 (4.21532)
+Outpt time (%) = 0.000172138 (0.0174991)
+Other time (%) = 0.0275636 (2.80204)
+
+Nlocal:    32000 ave 32000 max 32000 min
+Histogram: 1 0 0 0 0 0 0 0 0 0
+Nghost:    19657 ave 19657 max 19657 min
+Histogram: 1 0 0 0 0 0 0 0 0 0
+Neighs:    0 ave 0 max 0 min
+Histogram: 1 0 0 0 0 0 0 0 0 0
+FullNghs:  2.40567e+06 ave 2.40567e+06 max 2.40567e+06 min
+Histogram: 1 0 0 0 0 0 0 0 0 0
+
+Total # of neighbors = 2405666
+Ave neighs/atom = 75.1771
+Neighbor list builds = 5
+Dangerous builds = 0
--- a/examples/kokkos/log.kokkos.1Feb14.gpu.1
+++ b/examples/kokkos/log.kokkos.1Feb14.gpu.1
@ -0,0 +1,68 @@
+LAMMPS (27 May 2014)
+KOKKOS mode is enabled (../lammps.cpp:468)
+  using 6 OpenMP thread(s) per MPI task
+# 3d Lennard-Jones melt
+
+variable	x index 1
+variable	y index 1
+variable	z index 1
+
+variable	xx equal 20*$x
+variable	xx equal 20*1
+variable	yy equal 20*$y
+variable	yy equal 20*1
+variable	zz equal 20*$z
+variable	zz equal 20*1
+
+units		lj
+atom_style	atomic
+
+lattice		fcc 0.8442
+Lattice spacing in x,y,z = 1.6796 1.6796 1.6796
+region		box block 0 ${xx} 0 ${yy} 0 ${zz}
+region		box block 0 20 0 ${yy} 0 ${zz}
+region		box block 0 20 0 20 0 ${zz}
+region		box block 0 20 0 20 0 20
+create_box	1 box
+Created orthogonal box = (0 0 0) to (33.5919 33.5919 33.5919)
+  1 by 1 by 1 MPI processor grid
+create_atoms	1 box
+Created 32000 atoms
+mass		1 1.0
+
+velocity	all create 1.44 87287 loop geom
+
+pair_style	lj/cut 2.5
+pair_coeff	1 1 1.0 1.0 2.5
+
+neighbor	0.3 bin
+neigh_modify	delay 0 every 20 check no
+
+fix		1 all nve
+
+run		100
+Memory usage per processor = 16.9509 Mbytes
+Step Temp E_pair E_mol TotEng Press 
+       0         1.44   -6.7733681            0   -4.6134356   -5.0197073 
+     100    0.7574531   -5.7585055            0   -4.6223613   0.20726105 
+Loop time of 0.57192 on 6 procs (1 MPI x 6 OpenMP) for 100 steps with 32000 atoms
+
+Pair  time (%) = 0.205416 (35.917)
+Neigh time (%) = 0.112468 (19.665)
+Comm  time (%) = 0.174223 (30.4629)
+Outpt time (%) = 0.000159025 (0.0278055)
+Other time (%) = 0.0796535 (13.9274)
+
+Nlocal:    32000 ave 32000 max 32000 min
+Histogram: 1 0 0 0 0 0 0 0 0 0
+Nghost:    19657 ave 19657 max 19657 min
+Histogram: 1 0 0 0 0 0 0 0 0 0
+Neighs:    0 ave 0 max 0 min
+Histogram: 1 0 0 0 0 0 0 0 0 0
+FullNghs:  2.40567e+06 ave 2.40567e+06 max 2.40567e+06 min
+Histogram: 1 0 0 0 0 0 0 0 0 0
+
+Total # of neighbors = 2405666
+Ave neighs/atom = 75.1771
+Neighbor list builds = 5
+Dangerous builds = 0
--- a/examples/kokkos/log.kokkos.1Feb14.gpu.2
+++ b/examples/kokkos/log.kokkos.1Feb14.gpu.2
@ -0,0 +1,68 @@
+LAMMPS (27 May 2014)
+KOKKOS mode is enabled (../lammps.cpp:468)
+  using 6 OpenMP thread(s) per MPI task
+# 3d Lennard-Jones melt
+
+variable	x index 1
+variable	y index 1
+variable	z index 1
+
+variable	xx equal 20*$x
+variable	xx equal 20*1
+variable	yy equal 20*$y
+variable	yy equal 20*1
+variable	zz equal 20*$z
+variable	zz equal 20*1
+
+units		lj
+atom_style	atomic
+
+lattice		fcc 0.8442
+Lattice spacing in x,y,z = 1.6796 1.6796 1.6796
+region		box block 0 ${xx} 0 ${yy} 0 ${zz}
+region		box block 0 20 0 ${yy} 0 ${zz}
+region		box block 0 20 0 20 0 ${zz}
+region		box block 0 20 0 20 0 20
+create_box	1 box
+Created orthogonal box = (0 0 0) to (33.5919 33.5919 33.5919)
+  1 by 1 by 2 MPI processor grid
+create_atoms	1 box
+Created 32000 atoms
+mass		1 1.0
+
+velocity	all create 1.44 87287 loop geom
+
+pair_style	lj/cut 2.5
+pair_coeff	1 1 1.0 1.0 2.5
+
+neighbor	0.3 bin
+neigh_modify	delay 0 every 20 check no
+
+fix		1 all nve
+
+run		100
+Memory usage per processor = 8.95027 Mbytes
+Step Temp E_pair E_mol TotEng Press 
+       0         1.44   -6.7733681            0   -4.6134356   -5.0197073 
+     100    0.7574531   -5.7585055            0   -4.6223613   0.20726105 
+Loop time of 0.689608 on 12 procs (2 MPI x 6 OpenMP) for 100 steps with 32000 atoms
+
+Pair  time (%) = 0.210953 (30.5903)
+Neigh time (%) = 0.122991 (17.8349)
+Comm  time (%) = 0.25264 (36.6353)
+Outpt time (%) = 0.000259042 (0.0375636)
+Other time (%) = 0.102765 (14.9019)
+
+Nlocal:    16000 ave 16001 max 15999 min
+Histogram: 1 0 0 0 0 0 0 0 0 1
+Nghost:    13632.5 ave 13635 max 13630 min
+Histogram: 1 0 0 0 0 0 0 0 0 1
+Neighs:    0 ave 0 max 0 min
+Histogram: 2 0 0 0 0 0 0 0 0 0
+FullNghs:  1.20283e+06 ave 1.20347e+06 max 1.2022e+06 min
+Histogram: 1 0 0 0 0 0 0 0 0 1
+
+Total # of neighbors = 2405666
+Ave neighs/atom = 75.1771
+Neighbor list builds = 5
+Dangerous builds = 0
--- a/examples/kokkos/log.kokkos.1Feb14.mpionly.1
+++ b/examples/kokkos/log.kokkos.1Feb14.mpionly.1
@ -0,0 +1,65 @@
+LAMMPS (27 May 2014)
+  using 1 OpenMP thread(s) per MPI task
+# 3d Lennard-Jones melt
+
+variable	x index 1
+variable	y index 1
+variable	z index 1
+
+variable	xx equal 20*$x
+variable	xx equal 20*1
+variable	yy equal 20*$y
+variable	yy equal 20*1
+variable	zz equal 20*$z
+variable	zz equal 20*1
+
+units		lj
+atom_style	atomic
+
+lattice		fcc 0.8442
+Lattice spacing in x,y,z = 1.6796 1.6796 1.6796
+region		box block 0 ${xx} 0 ${yy} 0 ${zz}
+region		box block 0 20 0 ${yy} 0 ${zz}
+region		box block 0 20 0 20 0 ${zz}
+region		box block 0 20 0 20 0 20
+create_box	1 box
+Created orthogonal box = (0 0 0) to (33.5919 33.5919 33.5919)
+  1 by 1 by 1 MPI processor grid
+create_atoms	1 box
+Created 32000 atoms
+mass		1 1.0
+
+velocity	all create 1.44 87287 loop geom
+
+pair_style	lj/cut 2.5
+pair_coeff	1 1 1.0 1.0 2.5
+
+neighbor	0.3 bin
+neigh_modify	delay 0 every 20 check no
+
+fix		1 all nve
+
+run		100
+Memory usage per processor = 8.21387 Mbytes
+Step Temp E_pair E_mol TotEng Press 
+       0         1.44   -6.7733681            0   -4.6134356   -5.0197073 
+     100    0.7574531   -5.7585055            0   -4.6223613   0.20726105 
+Loop time of 2.57975 on 1 procs (1 MPI x 1 OpenMP) for 100 steps with 32000 atoms
+
+Pair  time (%) = 2.20959 (85.6512)
+Neigh time (%) = 0.269136 (10.4326)
+Comm  time (%) = 0.0252256 (0.977833)
+Outpt time (%) = 0.000126123 (0.00488898)
+Other time (%) = 0.0756752 (2.93343)
+
+Nlocal:    32000 ave 32000 max 32000 min
+Histogram: 1 0 0 0 0 0 0 0 0 0
+Nghost:    19657 ave 19657 max 19657 min
+Histogram: 1 0 0 0 0 0 0 0 0 0
+Neighs:    1.20283e+06 ave 1.20283e+06 max 1.20283e+06 min
+Histogram: 1 0 0 0 0 0 0 0 0 0
+
+Total # of neighbors = 1202833
+Ave neighs/atom = 37.5885
+Neighbor list builds = 5
+Dangerous builds = 0
--- a/examples/kokkos/log.kokkos.1Feb14.mpionly.4
+++ b/examples/kokkos/log.kokkos.1Feb14.mpionly.4
@ -0,0 +1,65 @@
+LAMMPS (27 May 2014)
+  using 1 OpenMP thread(s) per MPI task
+# 3d Lennard-Jones melt
+
+variable	x index 1
+variable	y index 1
+variable	z index 1
+
+variable	xx equal 20*$x
+variable	xx equal 20*1
+variable	yy equal 20*$y
+variable	yy equal 20*1
+variable	zz equal 20*$z
+variable	zz equal 20*1
+
+units		lj
+atom_style	atomic
+
+lattice		fcc 0.8442
+Lattice spacing in x,y,z = 1.6796 1.6796 1.6796
+region		box block 0 ${xx} 0 ${yy} 0 ${zz}
+region		box block 0 20 0 ${yy} 0 ${zz}
+region		box block 0 20 0 20 0 ${zz}
+region		box block 0 20 0 20 0 20
+create_box	1 box
+Created orthogonal box = (0 0 0) to (33.5919 33.5919 33.5919)
+  1 by 2 by 2 MPI processor grid
+create_atoms	1 box
+Created 32000 atoms
+mass		1 1.0
+
+velocity	all create 1.44 87287 loop geom
+
+pair_style	lj/cut 2.5
+pair_coeff	1 1 1.0 1.0 2.5
+
+neighbor	0.3 bin
+neigh_modify	delay 0 every 20 check no
+
+fix		1 all nve
+
+run		100
+Memory usage per processor = 4.09506 Mbytes
+Step Temp E_pair E_mol TotEng Press 
+       0         1.44   -6.7733681            0   -4.6134356   -5.0197073 
+     100    0.7574531   -5.7585055            0   -4.6223613   0.20726105 
+Loop time of 0.709072 on 4 procs (4 MPI x 1 OpenMP) for 100 steps with 32000 atoms
+
+Pair  time (%) = 0.574495 (81.0206)
+Neigh time (%) = 0.0709588 (10.0073)
+Comm  time (%) = 0.0474771 (6.69567)
+Outpt time (%) = 6.62804e-05 (0.00934748)
+Other time (%) = 0.0160753 (2.26708)
+
+Nlocal:    8000 ave 8037 max 7964 min
+Histogram: 2 0 0 0 0 0 0 0 1 1
+Nghost:    9007.5 ave 9050 max 8968 min
+Histogram: 1 1 0 0 0 0 0 1 0 1
+Neighs:    300708 ave 305113 max 297203 min
+Histogram: 1 0 0 1 1 0 0 0 0 1
+
+Total # of neighbors = 1202833
+Ave neighs/atom = 37.5885
+Neighbor list builds = 5
+Dangerous builds = 0
--- a/lib/README
+++ b/lib/README
@ -19,6 +19,8 @@ cuda	      NVIDIA GPU routines, USER-CUDA package
                from Christian Trott (U Tech Ilmenau)
 gpu	      general GPU routines, GPU package
 	        from Mike Brown (ORNL)
+kokkos        Kokkos package for GPU and many-core acceleration
+                from Kokkos development team (Sandia)
 linalg        set of BLAS and LAPACK routines needed by USER-ATC package
 	        from Axel Kohlmeyer (Temple U)
 poems	      POEMS rigid-body integration package, POEMS package
--- a/lib/kokkos/Makefile.lammps
+++ b/lib/kokkos/Makefile.lammps
@ -0,0 +1,104 @@
+# Settings that the LAMMPS build will import when this package library is used
+ 
+OMP = yes
+CUDA = no
+HWLOC = no
+AVX = no
+MIC = no
+LIBRT = no
+DEBUG = no
+
+CUDA_PATH = /usr/local/cuda
+
+KOKKOS_PATH = ../../lib/kokkos
+kokkos_SYSINC = -I$(KOKKOS_PATH)/core/src -I$(KOKKOS_PATH)/containers/src -I../ 
+SRC_KOKKOS = $(wildcard $(KOKKOS_PATH)/core/src/impl/*.cpp)
+
+ifeq ($(CUDA), yes)
+kokkos_SYSINC += -x cu -DDEVICE=2 -DKOKKOS_HAVE_CUDA
+SRC_KOKKOS += $(wildcard $(KOKKOS_PATH)/core/src/Cuda/*.cpp)
+SRC_KOKKOS += $(wildcard $(KOKKOS_PATH)/core/src/Cuda/*.cu)
+USRLIB += -L$(CUDA_PATH)/lib64 -lcudart -lcuda 
+ifeq ($(UVM), yes)
+kokkos_SYSINC += -DKOKKOS_USE_UVM
+endif
+else
+kokkos_SYSINC += -DDEVICE=1
+endif
+
+ifeq ($(CUSPARSE), yes)
+kokkos_SYSINC += -DKOKKOS_USE_CUSPARSE
+USRLIB += -lcusparse
+endif
+
+ifeq ($(CUBLAS), yes)
+kokkos_SYSINC += -DKOKKOS_USE_CUBLAS
+USRLIB += -lcublas
+endif
+
+ifeq ($(AVX), yes)
+ifeq ($(CUDA), yes) 
+kokkos_SYSINC += -Xcompiler -mavx
+else
+kokkos_SYSINC += -mavx
+endif
+LINKFLAGS += -mavx
+endif
+
+ifeq ($(MIC), yes)
+kokkos_SYSINC += -mmic
+LINKFLAGS += -mmic
+endif
+
+ifeq ($(OMP),yes)
+kokkos_SYSINC += -DKOKKOS_HAVE_OPENMP 
+SRC_KOKKOS += $(wildcard $(KOKKOS_PATH)/core/src/OpenMP/*.cpp)
+ifeq ($(CUDA), yes) 
+kokkos_SYSINC += -Xcompiler -fopenmp
+else
+kokkos_SYSINC += -fopenmp
+endif
+LINKFLAGS += -fopenmp
+else
+kokkos_SYSINC += -DKOKKOS_HAVE_PTHREAD
+USRLIB += -lpthread
+SRC_KOKKOS += $(wildcard $(KOKKOS_PATH)/core/src/Threads/*.cpp)
+endif
+
+ifeq ($(HWLOC),yes)
+kokkos_SYSINC += -DKOKKOS_HAVE_HWLOC -I$(HWLOCPATH)/include
+USRLIB += -L$(HWLOCPATH)/lib -lhwloc
+endif
+
+ifeq ($(RED_PREC), yes)
+kokkos_SYSINC += --use_fast_math
+endif
+
+ifeq ($(DEBUG), yes)
+kokkos_SYSINC += -g -G -DKOKKOS_EXPRESSION_CHECK -DENABLE_TRACEBACK
+LINKFLAGS += -g
+endif
+
+ifeq ($(LIBRT),yes)
+kokkos_SYSINC += -DKOKKOS_USE_LIBRT -DPREC_TIMER
+USRLIB += -lrt
+endif
+
+ifeq ($(CUDALDG), yes)
+kokkos_SYSINC += -DKOKKOS_USE_LDG_INTRINSIC
+endif
+
+OBJ_KOKKOS_TMP = $(SRC_KOKKOS:.cpp=.o)
+OBJ_KOKKOS = $(OBJ_KOKKOS_TMP:.cu=.o)
+OBJ_KOKKOS_LINK = $(notdir $(OBJ_KOKKOS))
+
+override OBJ += kokkos_depend.o
+
+libkokkoscore.a: $(OBJ_KOKKOS)
+	ar cr libkokkoscore.a $(OBJ_KOKKOS_LINK)	
+
+kokkos_depend.o: libkokkoscore.a
+	touch kokkos_depend.cpp
+	$(CC) $(CCFLAGS) $(SHFLAGS) $(EXTRA_INC) -c kokkos_depend.cpp
+
+kokkos_SYSLIB = -L./ $(LINKFLAGS) $(USRLIB)
--- a/lib/kokkos/README
+++ b/lib/kokkos/README
@ -0,0 +1,44 @@
+Kokkos library
+
+Carter Edwards, Christian Trott, Daniel Sunderland
+Sandia National Labs
+
+29 May 2014
+http://trilinos.sandia.gov/packages/kokkos/
+
+-------------------------
+
+This directory has source files from the Kokkos library that LAMMPS
+uses when building with its KOKKOS package.  The package contains
+versions of pair, fix, and atom styles written with Kokkos data
+structures and calls to the Kokkos library that should run efficiently
+on various kinds of accelerated nodes, including GPU and many-core
+chips.
+
+Kokkos is a C++ library that provides two key abstractions for an
+application like LAMMPS.  First, it allows a single implementation of
+an application kernel (e.g. a pair style) to run efficiently on
+different kinds of hardware (GPU, Intel Phi, many-core chip).
+
+Second, it provides data abstractions to adjust (at compile time) the
+memory layout of basic data structures like 2d and 3d arrays and allow
+the transparent utilization of special hardware load and store units.
+Such data structures are used in LAMMPS to store atom coordinates or
+forces or neighbor lists.  The layout is chosen to optimize
+performance on different platforms.  Again this operation is hidden
+from the developer, and does not affect how the single implementation
+of the kernel is coded.
+
+To build LAMMPS with Kokkos, you should not need to make any changes
+to files in this directory.  You can override defaults that are set
+in Makefile.lammps when building LAMMPS, by defining variables as part
+of the make command.  Details of the build process with Kokkos are
+explained in Section 2.3 of doc/Section_start.html and in Section 5.9
+of doc/Section_accelerate.html.
+
+The one exception is that when using Kokkos with NVIDIA GPUs, the
+CUDA_PATH setting in Makefile.lammps needs to point to the
+installation of the Cuda software on your machine.  The normal default
+location is /usr/local/cuda.  If this is not correct, you need to edit
+Makefile.lammps.
+
--- a/lib/kokkos/TPL/cub/block/block_discontinuity.cuh
+++ b/lib/kokkos/TPL/cub/block/block_discontinuity.cuh
@ -0,0 +1,587 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2013, NVIDIA CORPORATION.  All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * The cub::BlockDiscontinuity class provides [<em>collective</em>](index.html#sec0) methods for flagging discontinuities within an ordered set of items partitioned across a CUDA thread block.
+ */
+
+#pragma once
+
+#include "../util_type.cuh"
+#include "../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+/**
+ * \brief The BlockDiscontinuity class provides [<em>collective</em>](index.html#sec0) methods for flagging discontinuities within an ordered set of items partitioned across a CUDA thread block. ![](discont_logo.png)
+ * \ingroup BlockModule
+ *
+ * \par Overview
+ * A set of "head flags" (or "tail flags") is often used to indicate corresponding items
+ * that differ from their predecessors (or successors).  For example, head flags are convenient
+ * for demarcating disjoint data segments as part of a segmented scan or reduction.
+ *
+ * \tparam T                    The data type to be flagged.
+ * \tparam BLOCK_THREADS        The thread block size in threads.
+ *
+ * \par A Simple Example
+ * \blockcollective{BlockDiscontinuity}
+ * \par
+ * The code snippet below illustrates the head flagging of 512 integer items that
+ * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec4) across 128 threads
+ * where each thread owns 4 consecutive items.
+ * \par
+ * \code
+ * #include <cub/cub.cuh>
+ *
+ * __global__ void ExampleKernel(...)
+ * {
+ *     // Specialize BlockDiscontinuity for 128 threads on type int
+ *     typedef cub::BlockDiscontinuity<int, 128> BlockDiscontinuity;
+ *
+ *     // Allocate shared memory for BlockDiscontinuity
+ *     __shared__ typename BlockDiscontinuity::TempStorage temp_storage;
+ *
+ *     // Obtain a segment of consecutive items that are blocked across threads
+ *     int thread_data[4];
+ *     ...
+ *
+ *     // Collectively compute head flags for discontinuities in the segment
+ *     int head_flags[4];
+ *     BlockDiscontinuity(temp_storage).FlagHeads(head_flags, thread_data, cub::Inequality());
+ *
+ * \endcode
+ * \par
+ * Suppose the set of input \p thread_data across the block of threads is
+ * <tt>{ [0,0,1,1], [1,1,1,1], [2,3,3,3], [3,4,4,4], ... }</tt>.
+ * The corresponding output \p head_flags in those threads will be
+ * <tt>{ [1,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }</tt>.
+ *
+ * \par Performance Considerations
+ * - Zero bank conflicts for most types.
+ *
+ */
+template <
+    typename    T,
+    int         BLOCK_THREADS>
+class BlockDiscontinuity
+{
+private:
+
+    /******************************************************************************
+     * Type definitions
+     ******************************************************************************/
+
+    /// Shared memory storage layout type (last element from each thread's input)
+    typedef T _TempStorage[BLOCK_THREADS];
+
+
+    /******************************************************************************
+     * Utility methods
+     ******************************************************************************/
+
+    /// Internal storage allocator
+    __device__ __forceinline__ _TempStorage& PrivateStorage()
+    {
+        __shared__ _TempStorage private_storage;
+        return private_storage;
+    }
+
+
+    /// Specialization for when FlagOp has third index param
+    template <typename FlagOp, bool HAS_PARAM = BinaryOpHasIdxParam<T, FlagOp>::HAS_PARAM>
+    struct ApplyOp
+    {
+        // Apply flag operator
+        static __device__ __forceinline__ bool Flag(FlagOp flag_op, const T &a, const T &b, int idx)
+        {
+            return flag_op(a, b, idx);
+        }
+    };
+
+    /// Specialization for when FlagOp does not have a third index param
+    template <typename FlagOp>
+    struct ApplyOp<FlagOp, false>
+    {
+        // Apply flag operator
+        static __device__ __forceinline__ bool Flag(FlagOp flag_op, const T &a, const T &b, int idx)
+        {
+            return flag_op(a, b);
+        }
+    };
+
+
+    /******************************************************************************
+     * Thread fields
+     ******************************************************************************/
+
+    /// Shared storage reference
+    _TempStorage &temp_storage;
+
+    /// Linear thread-id
+    int linear_tid;
+
+
+public:
+
+    /// \smemstorage{BlockDiscontinuity}
+    struct TempStorage : Uninitialized<_TempStorage> {};
+
+
+    /******************************************************************//**
+     * \name Collective constructors
+     *********************************************************************/
+    //@{
+
+    /**
+     * \brief Collective constructor for 1D thread blocks using a private static allocation of shared memory as temporary storage.  Threads are identified using <tt>threadIdx.x</tt>.
+     */
+    __device__ __forceinline__ BlockDiscontinuity()
+    :
+        temp_storage(PrivateStorage()),
+        linear_tid(threadIdx.x)
+    {}
+
+
+    /**
+     * \brief Collective constructor for 1D thread blocks using the specified memory allocation as temporary storage.  Threads are identified using <tt>threadIdx.x</tt>.
+     */
+    __device__ __forceinline__ BlockDiscontinuity(
+        TempStorage &temp_storage)  ///< [in] Reference to memory allocation having layout type TempStorage
+    :
+        temp_storage(temp_storage.Alias()),
+        linear_tid(threadIdx.x)
+    {}
+
+
+    /**
+     * \brief Collective constructor using a private static allocation of shared memory as temporary storage.  Each thread is identified using the supplied linear thread identifier
+     */
+    __device__ __forceinline__ BlockDiscontinuity(
+        int linear_tid)             ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + threadIdx.x</tt> for 2D thread blocks)
+        : temp_storage(PrivateStorage()),   // storage comes from PrivateStorage()
+          linear_tid(linear_tid)            // parameter shadows the member; caller-supplied rank is stored as-is
+    {
+    }
+
+
+    /**
+     * \brief Collective constructor using the specified memory allocation as temporary storage.  Each thread is identified using the supplied linear thread identifier.
+     */
+    __device__ __forceinline__ BlockDiscontinuity(
+        TempStorage &temp_storage,  ///< [in] Reference to memory allocation having layout type TempStorage
+        int linear_tid)             ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + threadIdx.x</tt> for 2D thread blocks)
+        : temp_storage(temp_storage.Alias()),   // parameter shadows the member; Alias() supplies the _TempStorage reference
+          linear_tid(linear_tid)                // caller-supplied rank is stored as-is
+    {
+    }
+
+
+
+    //@}  end member group
+    /******************************************************************//**
+     * \name Head flag operations
+     *********************************************************************/
+    //@{
+
+
+    /**
+     * \brief Sets head flags indicating discontinuities between items partitioned across the thread block, for which the first item has no reference and is always flagged.
+     *
+     * The flag <tt>head_flags<sub><em>i</em></sub></tt> is set for item
+     * <tt>input<sub><em>i</em></sub></tt> when
+     * <tt>flag_op(</tt><em>previous-item</em><tt>, input<sub><em>i</em></sub>)</tt>
+     * returns \p true (where <em>previous-item</em> is either the preceding item
+     * in the same thread or the last item in the previous thread).
+     * Furthermore, <tt>head_flags<sub><em>i</em></sub></tt> is always set for
+     * <tt>input<sub>0</sub></tt> in <em>thread</em><sub>0</sub>.
+     *
+     * \blocked
+     *
+     * \smemreuse
+     *
+     * The code snippet below illustrates the head-flagging of 512 integer items that
+     * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec4) across 128 threads
+     * where each thread owns 4 consecutive items.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize BlockDiscontinuity for 128 threads on type int
+     *     typedef cub::BlockDiscontinuity<int, 128> BlockDiscontinuity;
+     *
+     *     // Allocate shared memory for BlockDiscontinuity
+     *     __shared__ typename BlockDiscontinuity::TempStorage temp_storage;
+     *
+     *     // Obtain a segment of consecutive items that are blocked across threads
+     *     int thread_data[4];
+     *     ...
+     *
+     *     // Collectively compute head flags for discontinuities in the segment
+     *     int head_flags[4];
+     *     BlockDiscontinuity(temp_storage).FlagHeads(head_flags, thread_data, cub::Inequality());
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_data across the block of threads is
+     * <tt>{ [0,0,1,1], [1,1,1,1], [2,3,3,3], [3,4,4,4], ... }</tt>.
+     * The corresponding output \p head_flags in those threads will be
+     * <tt>{ [1,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }</tt>.
+     *
+     * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
+     * \tparam FlagT                <b>[inferred]</b> The flag type (must be an integer type)
+     * \tparam FlagOp               <b>[inferred]</b> Binary predicate functor type having member <tt>bool operator()(const T &a, const T &b)</tt> or member <tt>bool operator()(const T &a, const T &b, unsigned int b_index)</tt>, and returning \p true if a discontinuity exists between \p a and \p b, otherwise \p false.  \p b_index is the rank of b in the aggregate tile of data.
+     */
+    template <
+        int             ITEMS_PER_THREAD,
+        typename        FlagT,
+        typename        FlagOp>
+    __device__ __forceinline__ void FlagHeads(
+        FlagT           (&head_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity head_flags
+        T               (&input)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
+        FlagOp          flag_op)                            ///< [in] Binary boolean flag predicate
+    {
+        // Rank of this thread's first item within the block-wide tile
+        const int base_rank = linear_tid * ITEMS_PER_THREAD;
+
+        // Publish this thread's last item so that the next thread can compare against it
+        temp_storage[linear_tid] = input[ITEMS_PER_THREAD - 1];
+
+        // Every thread's boundary item must be written before any neighbor reads it
+        __syncthreads();
+
+        // First item: unconditionally flagged in thread 0 (it has no predecessor);
+        // in every other thread it is compared with the previous thread's last item.
+        // The conditional also keeps thread 0 from reading temp_storage[linear_tid - 1].
+        head_flags[0] = (linear_tid != 0) ?
+            ApplyOp<FlagOp>::Flag(flag_op, temp_storage[linear_tid - 1], input[0], base_rank) :
+            1;
+
+        // Remaining items: each is compared with its predecessor held by the same thread
+        #pragma unroll
+        for (int ITEM = 1; ITEM < ITEMS_PER_THREAD; ++ITEM)
+        {
+            head_flags[ITEM] = ApplyOp<FlagOp>::Flag(flag_op, input[ITEM - 1], input[ITEM], base_rank + ITEM);
+        }
+    }
+
+
+    /**
+     * \brief Sets head flags indicating discontinuities between items partitioned across the thread block.
+     *
+     * The flag <tt>head_flags<sub><em>i</em></sub></tt> is set for item
+     * <tt>input<sub><em>i</em></sub></tt> when
+     * <tt>flag_op(</tt><em>previous-item</em><tt>, input<sub><em>i</em></sub>)</tt>
+     * returns \p true (where <em>previous-item</em> is either the preceding item
+     * in the same thread or the last item in the previous thread).
+     * For <em>thread</em><sub>0</sub>, item <tt>input<sub>0</sub></tt> is compared
+     * against \p tile_predecessor_item.
+     *
+     * \blocked
+     *
+     * \smemreuse
+     *
+     * The code snippet below illustrates the head-flagging of 512 integer items that
+     * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec4) across 128 threads
+     * where each thread owns 4 consecutive items.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize BlockDiscontinuity for 128 threads on type int
+     *     typedef cub::BlockDiscontinuity<int, 128> BlockDiscontinuity;
+     *
+     *     // Allocate shared memory for BlockDiscontinuity
+     *     __shared__ typename BlockDiscontinuity::TempStorage temp_storage;
+     *
+     *     // Obtain a segment of consecutive items that are blocked across threads
+     *     int thread_data[4];
+     *     ...
+     *
+     *     // Have thread0 obtain the predecessor item for the entire tile
+     *     int tile_predecessor_item;
+     *     if (threadIdx.x == 0) tile_predecessor_item = ...
+     *
+     *     // Collectively compute head flags for discontinuities in the segment
+     *     int head_flags[4];
+     *     BlockDiscontinuity(temp_storage).FlagHeads(
+     *         head_flags, thread_data, cub::Inequality(), tile_predecessor_item);
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_data across the block of threads is
+     * <tt>{ [0,0,1,1], [1,1,1,1], [2,3,3,3], [3,4,4,4], ... }</tt>,
+     * and that \p tile_predecessor_item is \p 0.  The corresponding output \p head_flags in those threads will be
+     * <tt>{ [0,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }</tt>.
+     *
+     * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
+     * \tparam FlagT                <b>[inferred]</b> The flag type (must be an integer type)
+     * \tparam FlagOp               <b>[inferred]</b> Binary predicate functor type having member <tt>bool operator()(const T &a, const T &b)</tt> or member <tt>bool operator()(const T &a, const T &b, unsigned int b_index)</tt>, and returning \p true if a discontinuity exists between \p a and \p b, otherwise \p false.  \p b_index is the rank of b in the aggregate tile of data.
+     */
+    template <
+        int             ITEMS_PER_THREAD,
+        typename        FlagT,
+        typename        FlagOp>
+    __device__ __forceinline__ void FlagHeads(
+        FlagT           (&head_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity head_flags
+        T               (&input)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
+        FlagOp          flag_op,                            ///< [in] Binary boolean flag predicate
+        T               tile_predecessor_item)                   ///< [in] <b>[<em>thread</em><sub>0</sub> only]</b> Item with which to compare the first tile item (<tt>input<sub>0</sub></tt> from <em>thread</em><sub>0</sub>).
+    {
+        // Share last item so that the next thread can use it as its predecessor
+        temp_storage[linear_tid] = input[ITEMS_PER_THREAD - 1];
+
+        // Every thread's boundary item must be written before any neighbor reads it
+        __syncthreads();
+
+        // Predecessor of this thread's first item.  It is held as T (not int) so that
+        // floating-point, 64-bit and user-defined item types reach flag_op unconverted.
+        // The conditional keeps thread 0 from reading temp_storage[linear_tid - 1].
+        T predecessor_item = (linear_tid == 0) ?
+            tile_predecessor_item :              // First thread
+            temp_storage[linear_tid - 1];
+
+        // Set flag for first item
+        head_flags[0] = ApplyOp<FlagOp>::Flag(
+            flag_op,
+            predecessor_item,
+            input[0],
+            linear_tid * ITEMS_PER_THREAD);
+
+        // Set flag for remaining items (each compared with its predecessor in the same thread)
+        #pragma unroll
+        for (int ITEM = 1; ITEM < ITEMS_PER_THREAD; ITEM++)
+        {
+            head_flags[ITEM] = ApplyOp<FlagOp>::Flag(
+                flag_op,
+                input[ITEM - 1],
+                input[ITEM],
+                (linear_tid * ITEMS_PER_THREAD) + ITEM);
+        }
+    }
+
+
+    //@}  end member group
+    /******************************************************************//**
+     * \name Tail flag operations
+     *********************************************************************/
+    //@{
+
+
+    /**
+     * \brief Sets tail flags indicating discontinuities between items partitioned across the thread block, for which the last item has no reference and is always flagged.
+     *
+     * The flag <tt>tail_flags<sub><em>i</em></sub></tt> is set for item
+     * <tt>input<sub><em>i</em></sub></tt> when
+     * <tt>flag_op(input<sub><em>i</em></sub>, </tt><em>next-item</em><tt>)</tt>
+     * returns \p true (where <em>next-item</em> is either the next item
+     * in the same thread or the first item in the next thread).
+     * Furthermore, <tt>tail_flags<sub>ITEMS_PER_THREAD-1</sub></tt> is always
+     * set for <em>thread</em><sub><tt>BLOCK_THREADS</tt>-1</sub>.
+     *
+     * \blocked
+     *
+     * \smemreuse
+     *
+     * The code snippet below illustrates the tail-flagging of 512 integer items that
+     * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec4) across 128 threads
+     * where each thread owns 4 consecutive items.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize BlockDiscontinuity for 128 threads on type int
+     *     typedef cub::BlockDiscontinuity<int, 128> BlockDiscontinuity;
+     *
+     *     // Allocate shared memory for BlockDiscontinuity
+     *     __shared__ typename BlockDiscontinuity::TempStorage temp_storage;
+     *
+     *     // Obtain a segment of consecutive items that are blocked across threads
+     *     int thread_data[4];
+     *     ...
+     *
+     *     // Collectively compute tail flags for discontinuities in the segment
+     *     int tail_flags[4];
+     *     BlockDiscontinuity(temp_storage).FlagTails(tail_flags, thread_data, cub::Inequality());
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_data across the block of threads is
+     * <tt>{ [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] }</tt>.
+     * The corresponding output \p tail_flags in those threads will be
+     * <tt>{ [0,1,0,0], [0,0,0,1], [1,0,0,...], ..., [1,0,0,1] }</tt>.
+     *
+     * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
+     * \tparam FlagT                <b>[inferred]</b> The flag type (must be an integer type)
+     * \tparam FlagOp               <b>[inferred]</b> Binary predicate functor type having member <tt>bool operator()(const T &a, const T &b)</tt> or member <tt>bool operator()(const T &a, const T &b, unsigned int b_index)</tt>, and returning \p true if a discontinuity exists between \p a and \p b, otherwise \p false.  Note that for tail flags the index argument passed is the rank of \p a (the item being flagged) in the aggregate tile of data.
+     */
+    template <
+        int             ITEMS_PER_THREAD,
+        typename        FlagT,
+        typename        FlagOp>
+    __device__ __forceinline__ void FlagTails(
+        FlagT           (&tail_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity tail_flags
+        T               (&input)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
+        FlagOp          flag_op)                            ///< [in] Binary boolean flag predicate
+    {
+        // Rank of this thread's first item within the block-wide tile
+        const int base_rank = linear_tid * ITEMS_PER_THREAD;
+
+        // Publish this thread's first item so that the previous thread can compare against it
+        temp_storage[linear_tid] = input[0];
+
+        // Every thread's boundary item must be written before any neighbor reads it
+        __syncthreads();
+
+        // Last item: unconditionally flagged in the last thread (it has no successor);
+        // in every other thread it is compared with the next thread's first item.
+        // The conditional also keeps the last thread from reading temp_storage[linear_tid + 1].
+        // The index passed to the predicate is the rank of the item being flagged.
+        tail_flags[ITEMS_PER_THREAD - 1] = (linear_tid != BLOCK_THREADS - 1) ?
+            ApplyOp<FlagOp>::Flag(
+                flag_op,
+                input[ITEMS_PER_THREAD - 1],
+                temp_storage[linear_tid + 1],
+                base_rank + (ITEMS_PER_THREAD - 1)) :
+            1;
+
+        // Remaining items: each is compared with its successor held by the same thread
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD - 1; ++ITEM)
+        {
+            tail_flags[ITEM] = ApplyOp<FlagOp>::Flag(flag_op, input[ITEM], input[ITEM + 1], base_rank + ITEM);
+        }
+    }
+
+
+    /**
+     * \brief Sets tail flags indicating discontinuities between items partitioned across the thread block.
+     *
+     * The flag <tt>tail_flags<sub><em>i</em></sub></tt> is set for item
+     * <tt>input<sub><em>i</em></sub></tt> when
+     * <tt>flag_op(input<sub><em>i</em></sub>, </tt><em>next-item</em><tt>)</tt>
+     * returns \p true (where <em>next-item</em> is either the next item
+     * in the same thread or the first item in the next thread).
+     * For <em>thread</em><sub><em>BLOCK_THREADS</em>-1</sub>, item
+     * <tt>input</tt><sub><em>ITEMS_PER_THREAD</em>-1</sub> is compared
+     * against \p tile_successor_item.
+     *
+     * \blocked
+     *
+     * \smemreuse
+     *
+     * The code snippet below illustrates the tail-flagging of 512 integer items that
+     * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec4) across 128 threads
+     * where each thread owns 4 consecutive items.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize BlockDiscontinuity for 128 threads on type int
+     *     typedef cub::BlockDiscontinuity<int, 128> BlockDiscontinuity;
+     *
+     *     // Allocate shared memory for BlockDiscontinuity
+     *     __shared__ typename BlockDiscontinuity::TempStorage temp_storage;
+     *
+     *     // Obtain a segment of consecutive items that are blocked across threads
+     *     int thread_data[4];
+     *     ...
+     *
+     *     // Have thread127 obtain the successor item for the entire tile
+     *     int tile_successor_item;
+     *     if (threadIdx.x == 127) tile_successor_item = ...
+     *
+     *     // Collectively compute tail flags for discontinuities in the segment
+     *     int tail_flags[4];
+     *     BlockDiscontinuity(temp_storage).FlagTails(
+     *         tail_flags, thread_data, cub::Inequality(), tile_successor_item);
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_data across the block of threads is
+     * <tt>{ [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] }</tt>
+     * and that \p tile_successor_item is \p 125.  The corresponding output \p tail_flags in those threads will be
+     * <tt>{ [0,1,0,0], [0,0,0,1], [1,0,0,...], ..., [1,0,0,0] }</tt>.
+     *
+     * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
+     * \tparam FlagT                <b>[inferred]</b> The flag type (must be an integer type)
+     * \tparam FlagOp               <b>[inferred]</b> Binary predicate functor type having member <tt>bool operator()(const T &a, const T &b)</tt> or member <tt>bool operator()(const T &a, const T &b, unsigned int b_index)</tt>, and returning \p true if a discontinuity exists between \p a and \p b, otherwise \p false.  Note that for tail flags the index argument passed is the rank of \p a (the item being flagged) in the aggregate tile of data.
+     */
+    template <
+        int             ITEMS_PER_THREAD,
+        typename        FlagT,
+        typename        FlagOp>
+    __device__ __forceinline__ void FlagTails(
+        FlagT           (&tail_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity tail_flags
+        T               (&input)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
+        FlagOp          flag_op,                            ///< [in] Binary boolean flag predicate
+        T               tile_successor_item)                   ///< [in] <b>[<em>thread</em><sub><tt>BLOCK_THREADS</tt>-1</sub> only]</b> Item with which to compare the last tile item (<tt>input</tt><sub><em>ITEMS_PER_THREAD</em>-1</sub> from <em>thread</em><sub><em>BLOCK_THREADS</em>-1</sub>).
+    {
+        // Share first item so that the previous thread can use it as its successor
+        temp_storage[linear_tid] = input[0];
+
+        // Every thread's boundary item must be written before any neighbor reads it
+        __syncthreads();
+
+        // Successor of this thread's last item.  It is held as T (not int) so that
+        // floating-point, 64-bit and user-defined item types reach flag_op unconverted.
+        // The conditional keeps the last thread from reading temp_storage[linear_tid + 1].
+        T successor_item = (linear_tid == BLOCK_THREADS - 1) ?
+            tile_successor_item :              // Last thread
+            temp_storage[linear_tid + 1];
+
+        // Set flag for last item (the index passed is the rank of the item being flagged)
+        tail_flags[ITEMS_PER_THREAD - 1] = ApplyOp<FlagOp>::Flag(
+            flag_op,
+            input[ITEMS_PER_THREAD - 1],
+            successor_item,
+            (linear_tid * ITEMS_PER_THREAD) + (ITEMS_PER_THREAD - 1));
+
+        // Set flags for remaining items (each compared with its successor in the same thread)
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD - 1; ITEM++)
+        {
+            tail_flags[ITEM] = ApplyOp<FlagOp>::Flag(
+                flag_op,
+                input[ITEM],
+                input[ITEM + 1],
+                (linear_tid * ITEMS_PER_THREAD) + ITEM);
+        }
+    }
+
+    //@}  end member group
+
+};
+
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
--- a/lib/kokkos/TPL/cub/block/block_exchange.cuh
+++ b/lib/kokkos/TPL/cub/block/block_exchange.cuh
@ -0,0 +1,918 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2013, NVIDIA CORPORATION.  All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * The cub::BlockExchange class provides [<em>collective</em>](index.html#sec0) methods for rearranging data partitioned across a CUDA thread block.
+ */
+
+#pragma once
+
+#include "../util_arch.cuh"
+#include "../util_macro.cuh"
+#include "../util_type.cuh"
+#include "../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+/**
+ * \brief The BlockExchange class provides [<em>collective</em>](index.html#sec0) methods for rearranging data partitioned across a CUDA thread block. ![](transpose_logo.png)
+ * \ingroup BlockModule
+ *
+ * \par Overview
+ * It is commonplace for blocks of threads to rearrange data items between
+ * threads.  For example, the global memory subsystem prefers access patterns
+ * where data items are "striped" across threads (where consecutive threads access consecutive items),
+ * yet most block-wide operations prefer a "blocked" partitioning of items across threads
+ * (where consecutive items belong to a single thread).
+ *
+ * \par
+ * BlockExchange supports the following types of data exchanges:
+ * - Transposing between [<em>blocked</em>](index.html#sec5sec4) and [<em>striped</em>](index.html#sec5sec4) arrangements
+ * - Transposing between [<em>blocked</em>](index.html#sec5sec4) and [<em>warp-striped</em>](index.html#sec5sec4) arrangements
+ * - Scattering ranked items to a [<em>blocked arrangement</em>](index.html#sec5sec4)
+ * - Scattering ranked items to a [<em>striped arrangement</em>](index.html#sec5sec4)
+ *
+ * \tparam T                    The data type to be exchanged.
+ * \tparam BLOCK_THREADS        The thread block size in threads.
+ * \tparam ITEMS_PER_THREAD     The number of items partitioned onto each thread.
+ * \tparam WARP_TIME_SLICING    <b>[optional]</b> When \p true, only use enough shared memory for a single warp's worth of tile data, time-slicing the block-wide exchange over multiple synchronized rounds.  Yields a smaller memory footprint at the expense of decreased parallelism.  (Default: false)
+ *
+ * \par A Simple Example
+ * \blockcollective{BlockExchange}
+ * \par
+ * The code snippet below illustrates the conversion from a "striped" to a "blocked" arrangement
+ * of 512 integer items partitioned across 128 threads where each thread owns 4 items.
+ * \par
+ * \code
+ * #include <cub/cub.cuh>
+ *
+ * __global__ void ExampleKernel(int *d_data, ...)
+ * {
+ *     // Specialize BlockExchange for 128 threads owning 4 integer items each
+ *     typedef cub::BlockExchange<int, 128, 4> BlockExchange;
+ *
+ *     // Allocate shared memory for BlockExchange
+ *     __shared__ typename BlockExchange::TempStorage temp_storage;
+ *
+ *     // Load a tile of data striped across threads
+ *     int thread_data[4];
+ *     cub::LoadStriped<LOAD_DEFAULT, 128>(threadIdx.x, d_data, thread_data);
+ *
+ *     // Collectively exchange data into a blocked arrangement across threads
+ *     BlockExchange(temp_storage).StripedToBlocked(thread_data);
+ *
+ * \endcode
+ * \par
+ * Suppose the set of striped input \p thread_data across the block of threads is
+ * <tt>{ [0,128,256,384], [1,129,257,385], ..., [127,255,383,511] }</tt>.
+ * The corresponding output \p thread_data in those threads will be
+ * <tt>{ [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }</tt>.
+ *
+ * \par Performance Considerations
+ * - Proper device-specific padding ensures zero bank conflicts for most types.
+ *
+ */
+template <
+    typename        T,
+    int             BLOCK_THREADS,
+    int             ITEMS_PER_THREAD,
+    bool            WARP_TIME_SLICING = false>
+class BlockExchange
+{
+private:
+
+    /******************************************************************************
+     * Constants
+     ******************************************************************************/
+
+    enum
+    {
+        LOG_WARP_THREADS            = PtxArchProps::LOG_WARP_THREADS,   // log2 of the warp width, taken from PtxArchProps
+        WARP_THREADS                = 1 << LOG_WARP_THREADS,            // Threads per warp
+        WARPS                       = (BLOCK_THREADS + PtxArchProps::WARP_THREADS - 1) / PtxArchProps::WARP_THREADS,    // Warps per block (rounded up)
+
+        LOG_SMEM_BANKS              = PtxArchProps::LOG_SMEM_BANKS,     // log2 of the shared-memory bank count, taken from PtxArchProps
+        SMEM_BANKS                  = 1 << LOG_SMEM_BANKS,              // Shared-memory bank count
+
+        TILE_ITEMS                  = BLOCK_THREADS * ITEMS_PER_THREAD, // Items in the block-wide tile
+
+        TIME_SLICES                 = (WARP_TIME_SLICING) ? WARPS : 1,  // Exchange rounds: one per warp when time-slicing, otherwise one
+
+        TIME_SLICED_THREADS         = (WARP_TIME_SLICING) ? CUB_MIN(BLOCK_THREADS, WARP_THREADS) : BLOCK_THREADS,    // Threads whose items are staged per round
+        TIME_SLICED_ITEMS           = TIME_SLICED_THREADS * ITEMS_PER_THREAD,    // Items staged per round (unpadded size of the storage array)
+
+        WARP_TIME_SLICED_THREADS    = CUB_MIN(BLOCK_THREADS, WARP_THREADS),      // Threads in one warp's slice, independent of WARP_TIME_SLICING
+        WARP_TIME_SLICED_ITEMS      = WARP_TIME_SLICED_THREADS * ITEMS_PER_THREAD,   // Items in one warp's slice
+
+        // Insert padding if the number of items per thread is a power of two
+        INSERT_PADDING              = ((ITEMS_PER_THREAD & (ITEMS_PER_THREAD - 1)) == 0),
+        PADDING_ITEMS               = (INSERT_PADDING) ? (TIME_SLICED_ITEMS >> LOG_SMEM_BANKS) : 0,    // One extra slot per SMEM_BANKS staged items when padding
+    };
+
+    /******************************************************************************
+     * Type definitions
+     ******************************************************************************/
+
+    /// Shared memory storage layout type: an array of T holding one round of staged items plus padding slots
+    typedef T _TempStorage[TIME_SLICED_ITEMS + PADDING_ITEMS];
+
+public:
+
+    /// \smemstorage{BlockExchange}
+    struct TempStorage : Uninitialized<_TempStorage> {};    // Public wrapper type around the _TempStorage layout
+
+private:
+
+
+    /******************************************************************************
+     * Thread fields
+     ******************************************************************************/
+
+    /// Reference to the staging array used by the exchange methods
+    _TempStorage &temp_storage;
+
+    /// Linear thread-id
+    int linear_tid;
+    int warp_lane;      ///< Used as the thread's index within its warp's slice when computing item offsets (value is set outside this view)
+    int warp_id;        ///< Compared against the round number to choose which warp stages its items (value is set outside this view)
+    int warp_offset;    ///< Base offset added to item offsets in the non-time-sliced warp-striped exchange (value is set outside this view)
+
+
+    /******************************************************************************
+     * Utility methods
+     ******************************************************************************/
+
+    /// Returns a reference to a statically-declared <tt>__shared__</tt> instance of the storage layout
+    __device__ __forceinline__ _TempStorage& PrivateStorage()
+    {
+        __shared__ _TempStorage block_private_storage;
+        return block_private_storage;
+    }
+
+
+    /**
+     * Transposes data items from <em>blocked</em> arrangement to <em>striped</em> arrangement.  Specialized for no timeslicing.
+     */
+    __device__ __forceinline__ void BlockedToStriped(
+        T               items[ITEMS_PER_THREAD],    ///< [in-out] Items to exchange, converting between <em>blocked</em> and <em>striped</em> arrangements.
+        Int2Type<false> time_slicing)
+    {
+        // Rank of this thread's first item in the blocked layout
+        const int blocked_base = linear_tid * ITEMS_PER_THREAD;
+
+        // Phase 1: deposit each item at its blocked rank (shifted by one slot per SMEM_BANKS ranks when padding)
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+        {
+            const int rank = blocked_base + ITEM;
+            const int slot = (INSERT_PADDING) ? (rank + (rank >> LOG_SMEM_BANKS)) : rank;
+            temp_storage[slot] = items[ITEM];
+        }
+
+        // All deposits must be complete before any thread collects
+        __syncthreads();
+
+        // Phase 2: collect the item found at each of this thread's striped ranks
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+        {
+            const int rank = int(ITEM * BLOCK_THREADS) + linear_tid;
+            const int slot = (INSERT_PADDING) ? (rank + (rank >> LOG_SMEM_BANKS)) : rank;
+            items[ITEM] = temp_storage[slot];
+        }
+    }
+
+
+    /**
+     * Transposes data items from <em>blocked</em> arrangement to <em>striped</em> arrangement.  Specialized for warp-timeslicing.
+     */
+    __device__ __forceinline__ void BlockedToStriped(
+        T               items[ITEMS_PER_THREAD],    ///< [in-out] Items to exchange, converting between <em>blocked</em> and <em>striped</em> arrangements.
+        Int2Type<true>  time_slicing)               ///< Overload-selection tag (not read in the body)
+    {
+        T temp_items[ITEMS_PER_THREAD];             // Striped results gathered over all rounds
+
+        #pragma unroll
+        for (int SLICE = 0; SLICE < TIME_SLICES; SLICE++)   // One round per time slice
+        {
+            const int SLICE_OFFSET  = SLICE * TIME_SLICED_ITEMS;            // Tile rank of the first item staged in this round
+            const int SLICE_OOB     = SLICE_OFFSET + TIME_SLICED_ITEMS;     // One past the last tile rank staged in this round
+
+            __syncthreads();    // Reads of the previous round must finish before the staging array is overwritten
+
+            if (warp_id == SLICE)   // Only the warp whose id equals the round number stages its items
+            {
+                #pragma unroll
+                for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+                {
+                    int item_offset = (warp_lane * ITEMS_PER_THREAD) + ITEM;    // Blocked position relative to the slice
+                    if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;   // Shift by one slot per SMEM_BANKS items
+                    temp_storage[item_offset] = items[ITEM];
+                }
+            }
+
+            __syncthreads();    // Staged items must be written before any thread reads them
+
+            #pragma unroll
+            for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+            {
+                // Tile-rank range [STRIP_OFFSET, STRIP_OOB) covered by the ITEM-th strip
+                const int STRIP_OFFSET  = ITEM * BLOCK_THREADS;
+                const int STRIP_OOB     = STRIP_OFFSET + BLOCK_THREADS;
+
+                if ((SLICE_OFFSET < STRIP_OOB) && (SLICE_OOB > STRIP_OFFSET))   // Strip overlaps the range staged in this round
+                {
+                    int item_offset = STRIP_OFFSET + linear_tid - SLICE_OFFSET;     // This thread's striped rank, relative to the slice
+                    if ((item_offset >= 0) && (item_offset < TIME_SLICED_ITEMS))    // That rank is currently staged
+                    {
+                        if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;   // Same padding shift as on the write side
+                        temp_items[ITEM] = temp_storage[item_offset];
+                    }
+                }
+            }
+        }
+
+        // Copy the gathered striped items back into the caller's array
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+        {
+            items[ITEM] = temp_items[ITEM];
+        }
+    }
+
+
+    /**
+     * Transposes data items from <em>blocked</em> arrangement to <em>warp-striped</em> arrangement. Specialized for no timeslicing
+     */
+    __device__ __forceinline__ void BlockedToWarpStriped(
+        T               items[ITEMS_PER_THREAD],   ///< [in-out] Items to exchange, converting between <em>blocked</em> and <em>warp-striped</em> arrangements.
+        Int2Type<false> time_slicing)              ///< Overload-selection tag (not read in the body)
+    {
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+        {
+            int item_offset = warp_offset + ITEM + (warp_lane * ITEMS_PER_THREAD);     // Blocked position, relative to warp_offset
+            if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;          // Shift by one slot per SMEM_BANKS items
+            temp_storage[item_offset] = items[ITEM];
+        }
+        // NOTE(review): no barrier separates the writes above from the reads below; presumably warp_offset is the base of the calling warp's own region and the exchange relies on warp-synchronous execution -- confirm
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+        {
+            int item_offset = warp_offset + (ITEM * WARP_TIME_SLICED_THREADS) + warp_lane;     // Striped position, relative to warp_offset
+            if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;                  // Same padding shift as on the write side
+            items[ITEM] = temp_storage[item_offset];
+        }
+    }
+
+    /**
+     * Transposes data items from <em>blocked</em> arrangement to <em>warp-striped</em> arrangement. Specialized for warp-timeslicing
+     */
+    __device__ __forceinline__ void BlockedToWarpStriped(
+        T               items[ITEMS_PER_THREAD],   ///< [in-out] Items to exchange, converting between <em>blocked</em> and <em>warp-striped</em> arrangements.
+        Int2Type<true>  time_slicing)              ///< Overload-selection tag (not read in the body)
+    {
+        #pragma unroll
+        for (int SLICE = 0; SLICE < TIME_SLICES; ++SLICE)   // One round per time slice
+        {
+            __syncthreads();    // Reached by every thread each round, before the staging array is reused
+
+            if (warp_id == SLICE)   // Only the warp whose id equals the round number exchanges in this round
+            {
+                #pragma unroll
+                for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+                {
+                    int item_offset = ITEM + (warp_lane * ITEMS_PER_THREAD);            // Blocked position within the slice
+                    if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;   // Shift by one slot per SMEM_BANKS items
+                    temp_storage[item_offset] = items[ITEM];
+                }
+                // NOTE(review): no barrier separates these writes from the reads below; presumably relies on warp-synchronous execution within the active warp -- confirm
+                #pragma unroll
+                for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+                {
+                    int item_offset = (ITEM * WARP_TIME_SLICED_THREADS) + warp_lane;    // Striped position within the slice
+                    if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;   // Same padding shift as on the write side
+                    items[ITEM] = temp_storage[item_offset];
+                }
+            }
+        }
+    }
+
+
+    /**
+     * Converts \p items from <em>striped</em> to <em>blocked</em> arrangement in a single
+     * pass through shared memory.  Specialized for no timeslicing.
+     */
+    __device__ __forceinline__ void StripedToBlocked(
+        T               items[ITEMS_PER_THREAD],   ///< [in-out] Items to exchange, converting between <em>striped</em> and <em>blocked</em> arrangements.
+        Int2Type<false> time_slicing)
+    {
+        // Deposit: the i-th item of this thread goes to striped slot (i * BLOCK_THREADS + linear_tid)
+        #pragma unroll
+        for (int i = 0; i < ITEMS_PER_THREAD; ++i)
+        {
+            int slot = int(i * BLOCK_THREADS) + linear_tid;
+            if (INSERT_PADDING)
+            {
+                slot += slot >> LOG_SMEM_BANKS;
+            }
+            temp_storage[slot] = items[i];
+        }
+
+        // Every thread's stores must be complete before anyone collects
+        __syncthreads();
+
+        // Collect: this thread owns the ITEMS_PER_THREAD consecutive slots starting at
+        // (linear_tid * ITEMS_PER_THREAD)
+        #pragma unroll
+        for (int i = 0; i < ITEMS_PER_THREAD; ++i)
+        {
+            int slot = (linear_tid * ITEMS_PER_THREAD) + i;
+            if (INSERT_PADDING)
+            {
+                slot += slot >> LOG_SMEM_BANKS;
+            }
+            items[i] = temp_storage[slot];
+        }
+    }
+
+
+    /**
+     * Transposes data items from <em>striped</em> arrangement to <em>blocked</em> arrangement.  Specialized for warp-timeslicing.
+     *
+     * The tile is processed in TIME_SLICES rounds.  In round SLICE every thread
+     * deposits those of its striped items whose tile offset lies inside the
+     * window [SLICE_OFFSET, SLICE_OOB), and the warp whose \p warp_id equals
+     * SLICE then collects its blocked items from that window.  Results are
+     * staged in \p temp_items so that \p items remains the unmodified source
+     * for later rounds, and are copied back at the end.
+     */
+    __device__ __forceinline__ void StripedToBlocked(
+        T               items[ITEMS_PER_THREAD],   ///< [in-out] Items to exchange, converting between <em>striped</em> and <em>blocked</em> arrangements.
+        Int2Type<true>  time_slicing)
+    {
+        // Warp time-slicing
+        T temp_items[ITEMS_PER_THREAD];
+
+        #pragma unroll
+        for (int SLICE = 0; SLICE < TIME_SLICES; SLICE++)
+        {
+            // Tile-offset window [SLICE_OFFSET, SLICE_OOB) handled in this round
+            const int SLICE_OFFSET  = SLICE * TIME_SLICED_ITEMS;
+            const int SLICE_OOB     = SLICE_OFFSET + TIME_SLICED_ITEMS;
+
+            // Keep this round's stores from racing with the previous round's loads
+            __syncthreads();
+
+            #pragma unroll
+            for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+            {
+                // Write a strip of items
+                // Strip ITEM spans tile offsets [STRIP_OFFSET, STRIP_OOB)
+                const int STRIP_OFFSET  = ITEM * BLOCK_THREADS;
+                const int STRIP_OOB     = STRIP_OFFSET + BLOCK_THREADS;
+
+                // Only strips that overlap the current window can contribute;
+                // all four operands are loop constants of the unrolled loops.
+                if ((SLICE_OFFSET < STRIP_OOB) && (SLICE_OOB > STRIP_OFFSET))
+                {
+                    // Position of this thread's item relative to the window start
+                    int item_offset = STRIP_OFFSET + linear_tid - SLICE_OFFSET;
+                    if ((item_offset >= 0) && (item_offset < TIME_SLICED_ITEMS))
+                    {
+                        if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
+                        temp_storage[item_offset] = items[ITEM];
+                    }
+                }
+            }
+
+            // All stores for this window must land before the owning warp reads
+            __syncthreads();
+
+            // Only the warp assigned to this round collects; it reads
+            // ITEMS_PER_THREAD consecutive slots per lane (blocked layout).
+            if (warp_id == SLICE)
+            {
+                #pragma unroll
+                for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+                {
+                    int item_offset = (warp_lane * ITEMS_PER_THREAD) + ITEM;
+                    if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
+                    temp_items[ITEM] = temp_storage[item_offset];
+                }
+            }
+        }
+
+        // Copy
+        // (publish the staged results back into the caller's array)
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+        {
+            items[ITEM] = temp_items[ITEM];
+        }
+    }
+
+
+    /**
+     * Transposes data items from <em>warp-striped</em> arrangement to <em>blocked</em> arrangement.  Specialized for no timeslicing
+     *
+     * Every shared-memory offset used here is relative to \p warp_offset and no
+     * block-wide barrier is issued, so the exchange is ordered only among the
+     * lanes of the calling warp.
+     */
+    __device__ __forceinline__ void WarpStripedToBlocked(
+        T               items[ITEMS_PER_THREAD],   ///< [in-out] Items to exchange, converting between <em>warp-striped</em> and <em>blocked</em> arrangements.
+        Int2Type<false> time_slicing)              ///< [in] Overload-selection tag; its value is not read
+    {
+        // Write phase: item ITEM of each lane goes to the ITEM-th run of
+        // WARP_TIME_SLICED_THREADS consecutive slots, at position warp_lane.
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+        {
+            int item_offset = warp_offset + (ITEM * WARP_TIME_SLICED_THREADS) + warp_lane;
+            if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
+            temp_storage[item_offset] = items[ITEM];
+        }
+
+        // The loads below read slots that were stored by other lanes of the same
+        // warp.  Devices with independent thread scheduling (sm_70 and newer) do
+        // not guarantee lock-step execution of a warp, so the stores must be
+        // separated from the loads by an explicit warp-level barrier.  Any
+        // toolkit able to target sm_70 provides __syncwarp(), hence the guard.
+        #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 700)
+        __syncwarp();
+        #endif
+
+        // Read phase: each lane loads ITEMS_PER_THREAD consecutive slots starting
+        // at (warp_offset + warp_lane * ITEMS_PER_THREAD).
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+        {
+            int item_offset = warp_offset + ITEM + (warp_lane * ITEMS_PER_THREAD);
+            if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
+            items[ITEM] = temp_storage[item_offset];
+        }
+    }
+
+
+    /**
+     * Transposes data items from <em>warp-striped</em> arrangement to <em>blocked</em> arrangement.  Specialized for warp-timeslicing
+     *
+     * The warps take turns: in round SLICE only the warp whose \p warp_id equals
+     * SLICE exchanges its items, using offsets that start at zero (no
+     * \p warp_offset is applied).  A block-wide barrier opens every round.
+     */
+    __device__ __forceinline__ void WarpStripedToBlocked(
+        T               items[ITEMS_PER_THREAD],   ///< [in-out] Items to exchange, converting between <em>warp-striped</em> and <em>blocked</em> arrangements.
+        Int2Type<true>  time_slicing)              ///< [in] Overload-selection tag; its value is not read
+    {
+        #pragma unroll
+        for (int SLICE = 0; SLICE < TIME_SLICES; ++SLICE)
+        {
+            // The previous round's warp must be done reading before this
+            // round's warp overwrites the same slots.
+            __syncthreads();
+
+            if (warp_id == SLICE)
+            {
+                // Write phase: warp-striped layout, stride WARP_TIME_SLICED_THREADS
+                #pragma unroll
+                for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+                {
+                    int item_offset = (ITEM * WARP_TIME_SLICED_THREADS) + warp_lane;
+                    if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
+                    temp_storage[item_offset] = items[ITEM];
+                }
+
+                // The loads below read slots stored by other lanes of this warp.
+                // On devices with independent thread scheduling (sm_70 and newer)
+                // lock-step execution is not guaranteed, so a warp-level barrier
+                // must separate the stores from the loads.  warp_id is derived
+                // from linear_tid by a shift, so (assuming linear_tid follows the
+                // hardware warp layout) the whole warp takes this branch together.
+                #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 700)
+                __syncwarp();
+                #endif
+
+                // Read phase: blocked layout, ITEMS_PER_THREAD consecutive slots per lane
+                #pragma unroll
+                for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+                {
+                    int item_offset = ITEM + (warp_lane * ITEMS_PER_THREAD);
+                    if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
+                    items[ITEM] = temp_storage[item_offset];
+                }
+            }
+        }
+    }
+
+
+    /**
+     * Scatters each item to the slot named by its rank, then reads the tile back in
+     * <em>blocked</em> arrangement.  Specialized for no timeslicing.
+     */
+    __device__ __forceinline__ void ScatterToBlocked(
+        T               items[ITEMS_PER_THREAD],    ///< [in-out] Items to exchange
+        int             ranks[ITEMS_PER_THREAD],    ///< [in] Corresponding scatter ranks
+        Int2Type<false> time_slicing)
+    {
+        // Scatter: the destination slot of each item is its rank
+        #pragma unroll
+        for (int i = 0; i < ITEMS_PER_THREAD; ++i)
+        {
+            int slot = ranks[i];
+            if (INSERT_PADDING)
+            {
+                slot = SHR_ADD(slot, LOG_SMEM_BANKS, slot);
+            }
+            temp_storage[slot] = items[i];
+        }
+
+        // Every thread's stores must be complete before anyone gathers
+        __syncthreads();
+
+        // Gather: this thread owns the ITEMS_PER_THREAD consecutive slots starting at
+        // (linear_tid * ITEMS_PER_THREAD)
+        #pragma unroll
+        for (int i = 0; i < ITEMS_PER_THREAD; ++i)
+        {
+            int slot = (linear_tid * ITEMS_PER_THREAD) + i;
+            if (INSERT_PADDING)
+            {
+                slot = SHR_ADD(slot, LOG_SMEM_BANKS, slot);
+            }
+            items[i] = temp_storage[slot];
+        }
+    }
+
+    /**
+     * Exchanges data items annotated by rank into <em>blocked</em> arrangement.  Specialized for warp-timeslicing.
+     *
+     * The tile is processed in TIME_SLICES rounds.  In round SLICE every thread
+     * deposits the items whose rank falls inside the round's window, and the
+     * warp whose \p warp_id equals SLICE then collects its blocked items.
+     * Results are staged in \p temp_items so that \p items remains the
+     * unmodified source for later rounds, and are copied back at the end.
+     */
+    __device__ __forceinline__ void ScatterToBlocked(
+        T               items[ITEMS_PER_THREAD],    ///< [in-out] Items to exchange
+        int             ranks[ITEMS_PER_THREAD],    ///< [in] Corresponding scatter ranks
+        Int2Type<true>  time_slicing)
+    {
+        T temp_items[ITEMS_PER_THREAD];
+
+        #pragma unroll
+        for (int SLICE = 0; SLICE < TIME_SLICES; SLICE++)
+        {
+            // Keep this round's stores from racing with the previous round's loads
+            __syncthreads();
+
+            // First rank covered by this round's window
+            const int SLICE_OFFSET = TIME_SLICED_ITEMS * SLICE;
+
+            #pragma unroll
+            for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+            {
+                // Rank relative to the window start; negative or too large means
+                // the item belongs to a different round.
+                // NOTE(review): the window advances by TIME_SLICED_ITEMS but its
+                // width is tested against WARP_TIME_SLICED_ITEMS -- presumably the
+                // two are equal when timeslicing is enabled; confirm against the
+                // constant definitions.
+                int item_offset = ranks[ITEM] - SLICE_OFFSET;
+                if ((item_offset >= 0) && (item_offset < WARP_TIME_SLICED_ITEMS))
+                {
+                    if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
+                    temp_storage[item_offset] = items[ITEM];
+                }
+            }
+
+            // All stores for this window must land before the owning warp reads
+            __syncthreads();
+
+            // Only the warp assigned to this round collects; it reads
+            // ITEMS_PER_THREAD consecutive slots per lane (blocked layout).
+            if (warp_id == SLICE)
+            {
+                #pragma unroll
+                for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+                {
+                    int item_offset = (warp_lane * ITEMS_PER_THREAD) + ITEM;
+                    if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
+                    temp_items[ITEM] = temp_storage[item_offset];
+                }
+            }
+        }
+
+        // Copy
+        // (publish the staged results back into the caller's array)
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+        {
+            items[ITEM] = temp_items[ITEM];
+        }
+    }
+
+
+    /**
+     * Scatters each item to the slot named by its rank, then reads the tile back in
+     * <em>striped</em> arrangement.  Specialized for no timeslicing.
+     */
+    __device__ __forceinline__ void ScatterToStriped(
+        T               items[ITEMS_PER_THREAD],    ///< [in-out] Items to exchange
+        int             ranks[ITEMS_PER_THREAD],    ///< [in] Corresponding scatter ranks
+        Int2Type<false> time_slicing)
+    {
+        // Scatter: the destination slot of each item is its rank
+        #pragma unroll
+        for (int i = 0; i < ITEMS_PER_THREAD; ++i)
+        {
+            int slot = ranks[i];
+            if (INSERT_PADDING)
+            {
+                slot = SHR_ADD(slot, LOG_SMEM_BANKS, slot);
+            }
+            temp_storage[slot] = items[i];
+        }
+
+        // Every thread's stores must be complete before anyone gathers
+        __syncthreads();
+
+        // Gather: the i-th item of this thread comes from striped slot
+        // (i * BLOCK_THREADS + linear_tid)
+        #pragma unroll
+        for (int i = 0; i < ITEMS_PER_THREAD; ++i)
+        {
+            int slot = int(i * BLOCK_THREADS) + linear_tid;
+            if (INSERT_PADDING)
+            {
+                slot = SHR_ADD(slot, LOG_SMEM_BANKS, slot);
+            }
+            items[i] = temp_storage[slot];
+        }
+    }
+
+
+    /**
+     * Exchanges data items annotated by rank into <em>striped</em> arrangement.  Specialized for warp-timeslicing.
+     *
+     * The tile is processed in TIME_SLICES rounds.  In round SLICE every thread
+     * deposits the items whose rank falls inside the round's window, and then
+     * every thread picks up whichever of its striped output items lie inside
+     * that same window.  Results are staged in \p temp_items so that \p items
+     * remains the unmodified source for later rounds, and are copied back at
+     * the end.
+     */
+    __device__ __forceinline__ void ScatterToStriped(
+        T               items[ITEMS_PER_THREAD],    ///< [in-out] Items to exchange
+        int             ranks[ITEMS_PER_THREAD],    ///< [in] Corresponding scatter ranks
+        Int2Type<true> time_slicing)
+    {
+        T temp_items[ITEMS_PER_THREAD];
+
+        #pragma unroll
+        for (int SLICE = 0; SLICE < TIME_SLICES; SLICE++)
+        {
+            // Tile-offset window [SLICE_OFFSET, SLICE_OOB) handled in this round
+            const int SLICE_OFFSET  = SLICE * TIME_SLICED_ITEMS;
+            const int SLICE_OOB     = SLICE_OFFSET + TIME_SLICED_ITEMS;
+
+            // Keep this round's stores from racing with the previous round's loads
+            __syncthreads();
+
+            #pragma unroll
+            for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+            {
+                // Rank relative to the window start; negative or too large means
+                // the item belongs to a different round.
+                // NOTE(review): the window advances by TIME_SLICED_ITEMS but its
+                // width is tested against WARP_TIME_SLICED_ITEMS here (and against
+                // TIME_SLICED_ITEMS in the read loop below) -- presumably the two
+                // are equal when timeslicing is enabled; confirm against the
+                // constant definitions.
+                int item_offset = ranks[ITEM] - SLICE_OFFSET;
+                if ((item_offset >= 0) && (item_offset < WARP_TIME_SLICED_ITEMS))
+                {
+                    if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
+                    temp_storage[item_offset] = items[ITEM];
+                }
+            }
+
+            // All stores for this window must land before anyone reads
+            __syncthreads();
+
+            #pragma unroll
+            for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+            {
+                // Read a strip of items
+                // Strip ITEM spans tile offsets [STRIP_OFFSET, STRIP_OOB)
+                const int STRIP_OFFSET  = ITEM * BLOCK_THREADS;
+                const int STRIP_OOB     = STRIP_OFFSET + BLOCK_THREADS;
+
+                // Only strips that overlap the current window can be served;
+                // all four operands are loop constants of the unrolled loops.
+                if ((SLICE_OFFSET < STRIP_OOB) && (SLICE_OOB > STRIP_OFFSET))
+                {
+                    // Position of this thread's output item relative to the window start
+                    int item_offset = STRIP_OFFSET + linear_tid - SLICE_OFFSET;
+                    if ((item_offset >= 0) && (item_offset < TIME_SLICED_ITEMS))
+                    {
+                        if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
+                        temp_items[ITEM] = temp_storage[item_offset];
+                    }
+                }
+            }
+        }
+
+        // Copy
+        // (publish the staged results back into the caller's array)
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+        {
+            items[ITEM] = temp_items[ITEM];
+        }
+    }
+
+
+public:
+
+    /******************************************************************//**
+     * \name Collective constructors
+     *********************************************************************/
+    //@{
+
+    /**
+     * \brief Collective constructor for 1D thread blocks using a private static allocation of shared memory as temporary storage.  Threads are identified using <tt>threadIdx.x</tt>.
+     *
+     * The warp lane, warp id and per-warp storage offset are all derived from
+     * the thread identifier.
+     */
+    __device__ __forceinline__ BlockExchange()
+    :
+        temp_storage(PrivateStorage()),                 // storage obtained from the class's own allocator
+        linear_tid(threadIdx.x),
+        warp_lane(linear_tid & (WARP_THREADS - 1)),     // lane within the warp (mask form; presumably WARP_THREADS is a power of two -- confirm)
+        warp_id(linear_tid >> LOG_WARP_THREADS),        // index of this thread's warp
+        warp_offset(warp_id * WARP_TIME_SLICED_ITEMS)   // base slot used by the non-timesliced warp-striped exchanges
+    {}
+
+
+    /**
+     * \brief Collective constructor for 1D thread blocks using the specified memory allocation as temporary storage.  Threads are identified using <tt>threadIdx.x</tt>.
+     *
+     * The warp lane, warp id and per-warp storage offset are all derived from
+     * the thread identifier.
+     */
+    __device__ __forceinline__ BlockExchange(
+        TempStorage &temp_storage)             ///< [in] Reference to memory allocation having layout type TempStorage
+    :
+        temp_storage(temp_storage.Alias()),             // member binds to the aliased view of the caller's allocation
+        linear_tid(threadIdx.x),
+        warp_lane(linear_tid & (WARP_THREADS - 1)),     // lane within the warp (mask form; presumably WARP_THREADS is a power of two -- confirm)
+        warp_id(linear_tid >> LOG_WARP_THREADS),        // index of this thread's warp
+        warp_offset(warp_id * WARP_TIME_SLICED_ITEMS)   // base slot used by the non-timesliced warp-striped exchanges
+    {}
+
+
+    /**
+     * \brief Collective constructor using a private static allocation of shared memory as temporary storage.  Each thread is identified using the supplied linear thread identifier
+     *
+     * The warp lane, warp id and per-warp storage offset are all derived from
+     * \p linear_tid.
+     */
+    __device__ __forceinline__ BlockExchange(
+        int linear_tid)                        ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + threadIdx.x</tt> for 2D thread blocks)
+    :
+        temp_storage(PrivateStorage()),                 // storage obtained from the class's own allocator
+        linear_tid(linear_tid),
+        warp_lane(linear_tid & (WARP_THREADS - 1)),     // lane within the warp (mask form; presumably WARP_THREADS is a power of two -- confirm)
+        warp_id(linear_tid >> LOG_WARP_THREADS),        // index of this thread's warp
+        warp_offset(warp_id * WARP_TIME_SLICED_ITEMS)   // base slot used by the non-timesliced warp-striped exchanges
+    {}
+
+
+    /**
+     * \brief Collective constructor using the specified memory allocation as temporary storage.  Each thread is identified using the supplied linear thread identifier.
+     *
+     * The warp lane, warp id and per-warp storage offset are all derived from
+     * \p linear_tid.
+     */
+    __device__ __forceinline__ BlockExchange(
+        TempStorage &temp_storage,              ///< [in] Reference to memory allocation having layout type TempStorage
+        int         linear_tid)                 ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + threadIdx.x</tt> for 2D thread blocks)
+    :
+        temp_storage(temp_storage.Alias()),             // member binds to the aliased view of the caller's allocation
+        linear_tid(linear_tid),
+        warp_lane(linear_tid & (WARP_THREADS - 1)),     // lane within the warp (mask form; presumably WARP_THREADS is a power of two -- confirm)
+        warp_id(linear_tid >> LOG_WARP_THREADS),        // index of this thread's warp
+        warp_offset(warp_id * WARP_TIME_SLICED_ITEMS)   // base slot used by the non-timesliced warp-striped exchanges
+    {}
+
+
+    //@}  end member group
+    /******************************************************************//**
+     * \name Structured exchanges
+     *********************************************************************/
+    //@{
+
+    /**
+     * \brief Transposes data items from <em>striped</em> arrangement to <em>blocked</em> arrangement.
+     *
+     * \smemreuse
+     *
+     * The code snippet below illustrates the conversion from a "striped" to a "blocked" arrangement
+     * of 512 integer items partitioned across 128 threads where each thread owns 4 items.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>
+     *
+     * __global__ void ExampleKernel(int *d_data, ...)
+     * {
+     *     // Specialize BlockExchange for 128 threads owning 4 integer items each
+     *     typedef cub::BlockExchange<int, 128, 4> BlockExchange;
+     *
+     *     // Allocate shared memory for BlockExchange
+     *     __shared__ typename BlockExchange::TempStorage temp_storage;
+     *
+     *     // Load a tile of ordered data into a striped arrangement across block threads
+     *     int thread_data[4];
+     *     cub::LoadStriped<LOAD_DEFAULT, 128>(threadIdx.x, d_data, thread_data);
+     *
+     *     // Collectively exchange data into a blocked arrangement across threads
+     *     BlockExchange(temp_storage).StripedToBlocked(thread_data);
+     *
+     * \endcode
+     * \par
+     * Suppose the set of striped input \p thread_data across the block of threads is
+     * <tt>{ [0,128,256,384], [1,129,257,385], ..., [127,255,383,511] }</tt> after loading from global memory.
+     * The corresponding output \p thread_data in those threads will be
+     * <tt>{ [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }</tt>.
+     *
+     */
+    __device__ __forceinline__ void StripedToBlocked(
+        T                items[ITEMS_PER_THREAD])   ///< [in-out] Items to exchange, converting between <em>striped</em> and <em>blocked</em> arrangements.
+    {
+        // The tag type selects the single-pass or the timesliced overload at compile time
+        typedef Int2Type<WARP_TIME_SLICING> SlicingTag;
+        StripedToBlocked(items, SlicingTag());
+    }
+
+    /**
+     * \brief Transposes data items from <em>blocked</em> arrangement to <em>striped</em> arrangement.
+     *
+     * \smemreuse
+     *
+     * The code snippet below illustrates the conversion from a "blocked" to a "striped" arrangement
+     * of 512 integer items partitioned across 128 threads where each thread owns 4 items.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>
+     *
+     * __global__ void ExampleKernel(int *d_data, ...)
+     * {
+     *     // Specialize BlockExchange for 128 threads owning 4 integer items each
+     *     typedef cub::BlockExchange<int, 128, 4> BlockExchange;
+     *
+     *     // Allocate shared memory for BlockExchange
+     *     __shared__ typename BlockExchange::TempStorage temp_storage;
+     *
+     *     // Obtain a segment of consecutive items that are blocked across threads
+     *     int thread_data[4];
+     *     ...
+     *
+     *     // Collectively exchange data into a striped arrangement across threads
+     *     BlockExchange(temp_storage).BlockedToStriped(thread_data);
+     *
+     *     // Store data striped across block threads into an ordered tile
+     *     cub::StoreStriped<STORE_DEFAULT, 128>(threadIdx.x, d_data, thread_data);
+     *
+     * \endcode
+     * \par
+     * Suppose the set of blocked input \p thread_data across the block of threads is
+     * <tt>{ [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }</tt>.
+     * The corresponding output \p thread_data in those threads will be
+     * <tt>{ [0,128,256,384], [1,129,257,385], ..., [127,255,383,511] }</tt> in
+     * preparation for storing to global memory.
+     *
+     */
+    __device__ __forceinline__ void BlockedToStriped(
+        T               items[ITEMS_PER_THREAD])    ///< [in-out] Items to exchange, converting between <em>blocked</em> and <em>striped</em> arrangements.
+    {
+        // Forward to the overload selected by the WARP_TIME_SLICING tag type
+        typedef Int2Type<WARP_TIME_SLICING> SlicingTag;
+        BlockedToStriped(items, SlicingTag());
+    }
+
+
+    /**
+     * \brief Transposes data items from <em>warp-striped</em> arrangement to <em>blocked</em> arrangement.
+     *
+     * \smemreuse
+     *
+     * The code snippet below illustrates the conversion from a "warp-striped" to a "blocked" arrangement
+     * of 512 integer items partitioned across 128 threads where each thread owns 4 items.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>
+     *
+     * __global__ void ExampleKernel(int *d_data, ...)
+     * {
+     *     // Specialize BlockExchange for 128 threads owning 4 integer items each
+     *     typedef cub::BlockExchange<int, 128, 4> BlockExchange;
+     *
+     *     // Allocate shared memory for BlockExchange
+     *     __shared__ typename BlockExchange::TempStorage temp_storage;
+     *
+     *     // Load a tile of ordered data into a warp-striped arrangement across warp threads
+     *     int thread_data[4];
+     *     cub::LoadWarpStriped<LOAD_DEFAULT>(threadIdx.x, d_data, thread_data);
+     *
+     *     // Collectively exchange data into a blocked arrangement across threads
+     *     BlockExchange(temp_storage).WarpStripedToBlocked(thread_data);
+     *
+     * \endcode
+     * \par
+     * Suppose the set of warp-striped input \p thread_data across the block of threads is
+     * <tt>{ [0,32,64,96], [1,33,65,97], [2,34,66,98], ..., [415,447,479,511] }</tt>
+     * after loading from global memory.  (The first 128 items are striped across
+     * the first warp of 32 threads, the second 128 items are striped across the second warp, etc.)
+     * The corresponding output \p thread_data in those threads will be
+     * <tt>{ [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }</tt>.
+     *
+     */
+    __device__ __forceinline__ void WarpStripedToBlocked(
+        T                items[ITEMS_PER_THREAD])   ///< [in-out] Items to exchange, converting between <em>warp-striped</em> and <em>blocked</em> arrangements.
+    {
+        // The tag type selects the per-warp or the timesliced overload at compile time
+        typedef Int2Type<WARP_TIME_SLICING> SlicingTag;
+        WarpStripedToBlocked(items, SlicingTag());
+    }
+
+    /**
+     * \brief Transposes data items from <em>blocked</em> arrangement to <em>warp-striped</em> arrangement.
+     *
+     * \smemreuse
+     *
+     * The code snippet below illustrates the conversion from a "blocked" to a "warp-striped" arrangement
+     * of 512 integer items partitioned across 128 threads where each thread owns 4 items.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>
+     *
+     * __global__ void ExampleKernel(int *d_data, ...)
+     * {
+     *     // Specialize BlockExchange for 128 threads owning 4 integer items each
+     *     typedef cub::BlockExchange<int, 128, 4> BlockExchange;
+     *
+     *     // Allocate shared memory for BlockExchange
+     *     __shared__ typename BlockExchange::TempStorage temp_storage;
+     *
+     *     // Obtain a segment of consecutive items that are blocked across threads
+     *     int thread_data[4];
+     *     ...
+     *
+     *     // Collectively exchange data into a warp-striped arrangement across threads
+     *     BlockExchange(temp_storage).BlockedToWarpStriped(thread_data);
+     *
+     *     // Store data striped across warp threads into an ordered tile
+     *     cub::StoreWarpStriped<STORE_DEFAULT>(threadIdx.x, d_data, thread_data);
+     *
+     * \endcode
+     * \par
+     * Suppose the set of blocked input \p thread_data across the block of threads is
+     * <tt>{ [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }</tt>.
+     * The corresponding output \p thread_data in those threads will be
+     * <tt>{ [0,32,64,96], [1,33,65,97], [2,34,66,98], ..., [415,447,479,511] }</tt>
+     * in preparation for storing to global memory. (The first 128 items are striped across
+     * the first warp of 32 threads, the second 128 items are striped across the second warp, etc.)
+     *
+     */
+    __device__ __forceinline__ void BlockedToWarpStriped(
+        T                items[ITEMS_PER_THREAD])   ///< [in-out] Items to exchange, converting between <em>blocked</em> and <em>warp-striped</em> arrangements.
+    {
+        // The tag type selects the per-warp or the timesliced overload at compile time
+        typedef Int2Type<WARP_TIME_SLICING> SlicingTag;
+        BlockedToWarpStriped(items, SlicingTag());
+    }
+
+
+    //@}  end member group
+    /******************************************************************//**
+     * \name Scatter exchanges
+     *********************************************************************/
+    //@{
+
+
+    /**
+     * \brief Exchanges data items annotated by rank into <em>blocked</em> arrangement.
+     *
+     * \smemreuse
+     */
+    __device__ __forceinline__ void ScatterToBlocked(
+        T               items[ITEMS_PER_THREAD],    ///< [in-out] Items to exchange
+        int             ranks[ITEMS_PER_THREAD])    ///< [in] Corresponding scatter ranks
+    {
+        // The tag type selects the single-pass or the timesliced overload at compile time
+        typedef Int2Type<WARP_TIME_SLICING> SlicingTag;
+        ScatterToBlocked(items, ranks, SlicingTag());
+    }
+
+
+    /**
+     * \brief Exchanges data items annotated by rank into <em>striped</em> arrangement.
+     *
+     * \smemreuse
+     */
+    __device__ __forceinline__ void ScatterToStriped(
+        T               items[ITEMS_PER_THREAD],    ///< [in-out] Items to exchange
+        int             ranks[ITEMS_PER_THREAD])    ///< [in] Corresponding scatter ranks
+    {
+        // The tag type selects the single-pass or the timesliced overload at compile time
+        typedef Int2Type<WARP_TIME_SLICING> SlicingTag;
+        ScatterToStriped(items, ranks, SlicingTag());
+    }
+
+    //@}  end member group
+
+
+};
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
--- a/lib/kokkos/TPL/cub/block/block_histogram.cuh
+++ b/lib/kokkos/TPL/cub/block/block_histogram.cuh
@ -0,0 +1,414 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2013, NVIDIA CORPORATION.  All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * The cub::BlockHistogram class provides [<em>collective</em>](index.html#sec0) methods for constructing block-wide histograms from data samples partitioned across a CUDA thread block.
+ */
+
+#pragma once
+
+#include "specializations/block_histogram_sort.cuh"
+#include "specializations/block_histogram_atomic.cuh"
+#include "../util_arch.cuh"
+#include "../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/******************************************************************************
+ * Algorithmic variants
+ ******************************************************************************/
+
+/**
+ * \brief BlockHistogramAlgorithm enumerates alternative algorithms for the parallel construction of block-wide histograms.
+ */
+enum BlockHistogramAlgorithm
+{
+
+    /**
+     * \par Overview
+     * Sorting followed by differentiation.  Execution is comprised of two phases:
+     * -# Sort the data using efficient radix sort
+     * -# Look for "runs" of same-valued keys by detecting discontinuities; the run-lengths are histogram bin counts.
+     *
+     * \par Performance Considerations
+     * Delivers consistent throughput regardless of sample bin distribution.
+     */
+    BLOCK_HISTO_SORT,
+
+
+    /**
+     * \par Overview
+     * Use atomic addition to update bin counts directly
+     *
+     * \par Performance Considerations
+     * Performance is strongly tied to the hardware implementation of atomic
+     * addition, and may be significantly degraded for non uniformly-random
+     * input distributions where many concurrent updates are likely to be
+     * made to the same bin counter.
+     */
+    BLOCK_HISTO_ATOMIC,
+};
+
+
+
+/******************************************************************************
+ * Block histogram
+ ******************************************************************************/
+
+
+/**
+ * \brief The BlockHistogram class provides [<em>collective</em>](index.html#sec0) methods for constructing block-wide histograms from data samples partitioned across a CUDA thread block. ![](histogram_logo.png)
+ * \ingroup BlockModule
+ *
+ * \par Overview
+ * A <a href="http://en.wikipedia.org/wiki/Histogram"><em>histogram</em></a>
+ * counts the number of observations that fall into each of the disjoint categories (known as <em>bins</em>).
+ *
+ * \par
+ * Optionally, BlockHistogram can be specialized to use different algorithms:
+ *   -# <b>cub::BLOCK_HISTO_SORT</b>.  Sorting followed by differentiation. [More...](\ref cub::BlockHistogramAlgorithm)
+ *   -# <b>cub::BLOCK_HISTO_ATOMIC</b>.  Use atomic addition to update bin counts directly. [More...](\ref cub::BlockHistogramAlgorithm)
+ *
+ * \tparam T                    The sample type being histogrammed (must be castable to an integer bin identifier)
+ * \tparam BLOCK_THREADS        The thread block size in threads
+ * \tparam ITEMS_PER_THREAD     The number of items per thread
+ * \tparam BINS                 The number of bins within the histogram
+ * \tparam ALGORITHM            <b>[optional]</b> cub::BlockHistogramAlgorithm enumerator specifying the underlying algorithm to use (default: cub::BLOCK_HISTO_SORT)
+ *
+ * \par A Simple Example
+ * \blockcollective{BlockHistogram}
+ * \par
+ * The code snippet below illustrates a 256-bin histogram of 512 integer samples that
+ * are partitioned across 128 threads where each thread owns 4 samples.
+ * \par
+ * \code
+ * #include <cub/cub.cuh>
+ *
+ * __global__ void ExampleKernel(...)
+ * {
+ *     // Specialize a 256-bin BlockHistogram type for 128 threads having 4 character samples each
+ *     typedef cub::BlockHistogram<unsigned char, 128, 4, 256> BlockHistogram;
+ *
+ *     // Allocate shared memory for BlockHistogram
+ *     __shared__ typename BlockHistogram::TempStorage temp_storage;
+ *
+ *     // Allocate shared memory for block-wide histogram bin counts
+ *     __shared__ unsigned int smem_histogram[256];
+ *
+ *     // Obtain input samples per thread
+ *     unsigned char data[4];
+ *     ...
+ *
+ *     // Compute the block-wide histogram
+ *     BlockHistogram(temp_storage).Histogram(data, smem_histogram);
+ *
+ * \endcode
+ *
+ * \par Performance and Usage Considerations
+ * - The histogram output can be constructed in shared or global memory
+ * - See cub::BlockHistogramAlgorithm for performance details regarding algorithmic alternatives
+ *
+ */
+template <
+    typename                T,
+    int                     BLOCK_THREADS,
+    int                     ITEMS_PER_THREAD,
+    int                     BINS,
+    BlockHistogramAlgorithm ALGORITHM = BLOCK_HISTO_SORT>
+class BlockHistogram
+{
+private:
+
+    /******************************************************************************
+     * Constants and type definitions
+     ******************************************************************************/
+
+    /**
+     * Ensure the template parameterization meets the requirements of the
+     * targeted device architecture.  BLOCK_HISTO_ATOMIC is only honored when
+     * CUB_PTX_ARCH is at least 120; for lower values BLOCK_HISTO_SORT is
+     * substituted regardless of the requested ALGORITHM.
+     */
+    static const BlockHistogramAlgorithm SAFE_ALGORITHM =
+        ((ALGORITHM == BLOCK_HISTO_ATOMIC) && (CUB_PTX_ARCH < 120)) ?
+            BLOCK_HISTO_SORT :
+            ALGORITHM;
+
+    /// Internal specialization.
+    typedef typename If<(SAFE_ALGORITHM == BLOCK_HISTO_SORT),
+        BlockHistogramSort<T, BLOCK_THREADS, ITEMS_PER_THREAD, BINS>,
+        BlockHistogramAtomic<T, BLOCK_THREADS, ITEMS_PER_THREAD, BINS> >::Type InternalBlockHistogram;
+
+    /// Shared memory storage layout type for BlockHistogram
+    typedef typename InternalBlockHistogram::TempStorage _TempStorage;
+
+
+    /******************************************************************************
+     * Thread fields
+     ******************************************************************************/
+
+    /// Shared storage reference
+    _TempStorage &temp_storage;
+
+    /// Linear thread-id
+    int linear_tid;
+
+
+    /******************************************************************************
+     * Utility methods
+     ******************************************************************************/
+
+    /// Internal storage allocator
+    __device__ __forceinline__ _TempStorage& PrivateStorage()
+    {
+        __shared__ _TempStorage private_storage;
+        return private_storage;
+    }
+
+
+public:
+
+    /// \smemstorage{BlockHistogram}
+    struct TempStorage : Uninitialized<_TempStorage> {};
+
+
+    /******************************************************************//**
+     * \name Collective constructors
+     *********************************************************************/
+    //@{
+
+    /**
+     * \brief Collective constructor for 1D thread blocks using a private static allocation of shared memory as temporary storage.  Threads are identified using <tt>threadIdx.x</tt>.
+     */
+    __device__ __forceinline__ BlockHistogram()
+    :
+        temp_storage(PrivateStorage()),
+        linear_tid(threadIdx.x)
+    {}
+
+
+    /**
+     * \brief Collective constructor for 1D thread blocks using the specified memory allocation as temporary storage.  Threads are identified using <tt>threadIdx.x</tt>.
+     */
+    __device__ __forceinline__ BlockHistogram(
+        TempStorage &temp_storage)             ///< [in] Reference to memory allocation having layout type TempStorage
+    :
+        temp_storage(temp_storage.Alias()),
+        linear_tid(threadIdx.x)
+    {}
+
+
+    /**
+     * \brief Collective constructor using a private static allocation of shared memory as temporary storage.  Each thread is identified using the supplied linear thread identifier
+     */
+    __device__ __forceinline__ BlockHistogram(
+        int linear_tid)                        ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + linear_tid</tt> for 2D thread blocks)
+    :
+        temp_storage(PrivateStorage()),
+        linear_tid(linear_tid)
+    {}
+
+
+    /**
+     * \brief Collective constructor using the specified memory allocation as temporary storage.  Each thread is identified using the supplied linear thread identifier.
+     */
+    __device__ __forceinline__ BlockHistogram(
+        TempStorage &temp_storage,             ///< [in] Reference to memory allocation having layout type TempStorage
+        int linear_tid)                        ///< [in] <b>[optional]</b> A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + linear_tid</tt> for 2D thread blocks)
+    :
+        temp_storage(temp_storage.Alias()),
+        linear_tid(linear_tid)
+    {}
+
+
+    //@}  end member group
+    /******************************************************************//**
+     * \name Histogram operations
+     *********************************************************************/
+    //@{
+
+
+    /**
+     * \brief Initialize the shared histogram counters to zero.
+     *
+     * The code snippet below illustrates a the initialization and update of a
+     * histogram of 512 integer samples that are partitioned across 128 threads
+     * where each thread owns 4 samples.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize a 256-bin BlockHistogram type for 128 threads having 4 character samples each
+     *     typedef cub::BlockHistogram<unsigned char, 128, 4, 256> BlockHistogram;
+     *
+     *     // Allocate shared memory for BlockHistogram
+     *     __shared__ typename BlockHistogram::TempStorage temp_storage;
+     *
+     *     // Allocate shared memory for block-wide histogram bin counts
+     *     __shared__ unsigned int smem_histogram[256];
+     *
+     *     // Obtain input samples per thread
+     *     unsigned char thread_samples[4];
+     *     ...
+     *
+     *     // Initialize the block-wide histogram
+     *     BlockHistogram(temp_storage).InitHistogram(smem_histogram);
+     *
+     *     // Update the block-wide histogram
+     *     BlockHistogram(temp_storage).Composite(thread_samples, smem_histogram);
+     *
+     * \endcode
+     *
+     * \tparam HistoCounter         <b>[inferred]</b> Histogram counter type
+     */
+    template <typename HistoCounter>
+    __device__ __forceinline__ void InitHistogram(HistoCounter histogram[BINS])
+    {
+        // Initialize histogram bin counts to zeros
+        int histo_offset = 0;
+
+        #pragma unroll
+        for(; histo_offset + BLOCK_THREADS <= BINS; histo_offset += BLOCK_THREADS)
+        {
+            histogram[histo_offset + linear_tid] = 0;
+        }
+        // Finish up with guarded initialization if necessary
+        if ((BINS % BLOCK_THREADS != 0) && (histo_offset + linear_tid < BINS))
+        {
+            histogram[histo_offset + linear_tid] = 0;
+        }
+    }
+
+
+    /**
+     * \brief Constructs a block-wide histogram in shared/global memory.  Each thread contributes an array of input elements.
+     *
+     * \smemreuse
+     *
+     * The code snippet below illustrates a 256-bin histogram of 512 integer samples that
+     * are partitioned across 128 threads where each thread owns 4 samples.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize a 256-bin BlockHistogram type for 128 threads having 4 character samples each
+     *     typedef cub::BlockHistogram<unsigned char, 128, 4, 256> BlockHistogram;
+     *
+     *     // Allocate shared memory for BlockHistogram
+     *     __shared__ typename BlockHistogram::TempStorage temp_storage;
+     *
+     *     // Allocate shared memory for block-wide histogram bin counts
+     *     __shared__ unsigned int smem_histogram[256];
+     *
+     *     // Obtain input samples per thread
+     *     unsigned char thread_samples[4];
+     *     ...
+     *
+     *     // Compute the block-wide histogram
+     *     BlockHistogram(temp_storage).Histogram(thread_samples, smem_histogram);
+     *
+     * \endcode
+     *
+     * \tparam HistoCounter         <b>[inferred]</b> Histogram counter type
+     */
+    template <
+        typename            HistoCounter>
+    __device__ __forceinline__ void Histogram(
+        T                   (&items)[ITEMS_PER_THREAD],     ///< [in] Calling thread's input values to histogram
+        HistoCounter        histogram[BINS])                ///< [out] Reference to shared/global memory histogram
+    {
+        // Initialize histogram bin counts to zeros
+        InitHistogram(histogram);
+
+        // Composite the histogram
+        InternalBlockHistogram(temp_storage, linear_tid).Composite(items, histogram);
+    }
+
+
+
+    /**
+     * \brief Updates an existing block-wide histogram in shared/global memory.  Each thread composites an array of input elements.
+     *
+     * \smemreuse
+     *
+     * The code snippet below illustrates a the initialization and update of a
+     * histogram of 512 integer samples that are partitioned across 128 threads
+     * where each thread owns 4 samples.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize a 256-bin BlockHistogram type for 128 threads having 4 character samples each
+     *     typedef cub::BlockHistogram<unsigned char, 128, 4, 256> BlockHistogram;
+     *
+     *     // Allocate shared memory for BlockHistogram
+     *     __shared__ typename BlockHistogram::TempStorage temp_storage;
+     *
+     *     // Allocate shared memory for block-wide histogram bin counts
+     *     __shared__ unsigned int smem_histogram[256];
+     *
+     *     // Obtain input samples per thread
+     *     unsigned char thread_samples[4];
+     *     ...
+     *
+     *     // Initialize the block-wide histogram
+     *     BlockHistogram(temp_storage).InitHistogram(smem_histogram);
+     *
+     *     // Update the block-wide histogram
+     *     BlockHistogram(temp_storage).Composite(thread_samples, smem_histogram);
+     *
+     * \endcode
+     *
+     * \tparam HistoCounter         <b>[inferred]</b> Histogram counter type
+     */
+    template <
+        typename            HistoCounter>
+    __device__ __forceinline__ void Composite(
+        T                   (&items)[ITEMS_PER_THREAD],     ///< [in] Calling thread's input values to histogram
+        HistoCounter        histogram[BINS])                 ///< [out] Reference to shared/global memory histogram
+    {
+        InternalBlockHistogram(temp_storage, linear_tid).Composite(items, histogram);
+    }
+
+};
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
--- a/lib/kokkos/TPL/cub/block/block_load.cuh
+++ b/lib/kokkos/TPL/cub/block/block_load.cuh
--- a/lib/kokkos/TPL/cub/block/block_radix_rank.cuh
+++ b/lib/kokkos/TPL/cub/block/block_radix_rank.cuh
@ -0,0 +1,479 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2013, NVIDIA CORPORATION.  All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::BlockRadixRank provides operations for ranking unsigned integer types within a CUDA threadblock
+ */
+
+#pragma once
+
+#include "../util_arch.cuh"
+#include "../util_type.cuh"
+#include "../thread/thread_reduce.cuh"
+#include "../thread/thread_scan.cuh"
+#include "../block/block_scan.cuh"
+#include "../util_namespace.cuh"
+
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+/**
+ * \brief BlockRadixRank provides operations for ranking unsigned integer types within a CUDA threadblock.
+ * \ingroup BlockModule
+ *
+ * \par Overview
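+ * For every key held by the thread block, BlockRadixRank computes the key's
+ * local rank within the tile with respect to the radix digit located at a
+ * caller-supplied bit position.  Digit occurrences are tallied in per-thread
+ * shared-memory counters, which are then prefix-scanned across the thread block.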
+ *
+ * \tparam BLOCK_THREADS        The thread block size in threads
+ * \tparam RADIX_BITS           The number of radix bits per digit place
+ * \tparam MEMOIZE_OUTER_SCAN   <b>[optional]</b> Whether or not to buffer outer raking scan partials to incur fewer shared memory reads at the expense of higher register pressure (default: true for architectures SM35 and newer, false otherwise).  See BlockScanAlgorithm::BLOCK_SCAN_RAKING_MEMOIZE for more details.
+ * \tparam INNER_SCAN_ALGORITHM <b>[optional]</b> The cub::BlockScanAlgorithm algorithm to use (default: cub::BLOCK_SCAN_WARP_SCANS)
+ * \tparam SMEM_CONFIG          <b>[optional]</b> Shared memory bank mode (default: \p cudaSharedMemBankSizeFourByte)
+ *
+ * \par Usage Considerations
+ * - Keys must be in a form suitable for radix ranking (i.e., unsigned bits).
+ * - Assumes a [<em>blocked arrangement</em>](index.html#sec5sec4) of elements across threads
+ * - \smemreuse{BlockRadixRank::TempStorage}
+ *
+ * \par Performance Considerations
+ *
+ * \par Algorithm
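+ * These parallel radix ranking variants have <em>O</em>(<em>n</em>) work complexity and are implemented in four phases:
+ * -# Each thread resets its private digit counters in shared memory.
+ * -# Each thread decodes the current digit of each of its keys, records the counter's prior value (the thread-local prefix for that key), and increments the counter.
+ * -# The packed counters are prefix-scanned across the thread block (raking upsweep reduction, block-wide scan of the per-thread partials, raking downsweep).
+ * -# Each key's rank is formed by adding its thread-local prefix to the scanned value of its digit counter.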
+ *
+ * \par Examples
+ * \par
+ * - <b>Example 1:</b> Simple radix rank of 32-bit integer keys
+ *      \code
+ *      #include <cub/cub.cuh>
+ *
+ *      template <int BLOCK_THREADS>
+ *      __global__ void ExampleKernel(...)
+ *      {
+ *
+ *      \endcode
+ */
+template <
+    int                     BLOCK_THREADS,
+    int                     RADIX_BITS,
+    bool                    MEMOIZE_OUTER_SCAN      = (CUB_PTX_ARCH >= 350) ? true : false,
+    BlockScanAlgorithm      INNER_SCAN_ALGORITHM    = BLOCK_SCAN_WARP_SCANS,
+    cudaSharedMemConfig     SMEM_CONFIG             = cudaSharedMemBankSizeFourByte>
+class BlockRadixRank
+{
+private:
+
+    /******************************************************************************
+     * Type definitions and constants
+     ******************************************************************************/
+
+    // Integer type for digit counters (to be packed into words of type PackedCounter)
+    typedef unsigned short DigitCounter;
+
+    // Integer type for packing DigitCounters into columns of shared memory banks
+    typedef typename If<(SMEM_CONFIG == cudaSharedMemBankSizeEightByte),
+        unsigned long long,
+        unsigned int>::Type PackedCounter;
+
+    enum
+    {
+        RADIX_DIGITS                 = 1 << RADIX_BITS,
+
+        LOG_WARP_THREADS             = PtxArchProps::LOG_WARP_THREADS,
+        WARP_THREADS                 = 1 << LOG_WARP_THREADS,
+        WARPS                        = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS,
+
+        BYTES_PER_COUNTER            = sizeof(DigitCounter),
+        LOG_BYTES_PER_COUNTER        = Log2<BYTES_PER_COUNTER>::VALUE,
+
+        // Number of DigitCounters that fit into one PackedCounter word
+        PACKING_RATIO                = sizeof(PackedCounter) / sizeof(DigitCounter),
+        LOG_PACKING_RATIO            = Log2<PACKING_RATIO>::VALUE,
+
+        LOG_COUNTER_LANES            = CUB_MAX((RADIX_BITS - LOG_PACKING_RATIO), 0),                // Always at least one lane
+        COUNTER_LANES                = 1 << LOG_COUNTER_LANES,
+
+        // The number of packed counters per thread (plus one for padding)
+        RAKING_SEGMENT               = COUNTER_LANES + 1,
+
+        LOG_SMEM_BANKS               = PtxArchProps::LOG_SMEM_BANKS,
+        SMEM_BANKS                   = 1 << LOG_SMEM_BANKS,
+    };
+
+
+    /// BlockScan type.  The template is named with explicit qualification so that
+    /// the member typedef does not re-declare a name it has just used unqualified
+    /// (which some host compilers reject as "changes meaning of 'BlockScan'").
+    typedef cub::BlockScan<PackedCounter, BLOCK_THREADS, INNER_SCAN_ALGORITHM> BlockScan;
+
+
+    /// Shared memory storage layout type for BlockRadixRank
+    struct _TempStorage
+    {
+        // Storage for scanning local ranks
+        typename BlockScan::TempStorage block_scan;
+
+        // Two views of the same counter memory: individually addressable digit
+        // counters (for decoding) and packed words per thread (for raking)
+        union
+        {
+            DigitCounter            digit_counters[COUNTER_LANES + 1][BLOCK_THREADS][PACKING_RATIO];
+            PackedCounter           raking_grid[BLOCK_THREADS][RAKING_SEGMENT];
+        };
+    };
+
+
+    /******************************************************************************
+     * Thread fields
+     ******************************************************************************/
+
+    /// Shared storage reference
+    _TempStorage &temp_storage;
+
+    /// Linear thread-id
+    int linear_tid;
+
+    /// Copy of raking segment, promoted to registers
+    PackedCounter cached_segment[RAKING_SEGMENT];
+
+
+    /******************************************************************************
+     * Templated iteration
+     ******************************************************************************/
+
+    // General template iteration (processes key COUNT, then recurses on COUNT + 1)
+    template <int COUNT, int MAX>
+    struct Iterate
+    {
+        /**
+         * Decode keys.  Decodes the radix digit from the current digit place
+         * and increments the thread's corresponding counter in shared
+         * memory for that digit.
+         *
+         * Saves both (1) the prior value of that counter (the key's
+         * thread-local exclusive prefix sum for that digit), and (2) the shared
+         * memory offset of the counter (for later use).
+         */
+        template <typename UnsignedBits, int KEYS_PER_THREAD>
+        static __device__ __forceinline__ void DecodeKeys(
+            BlockRadixRank  &cta,                                   // BlockRadixRank instance
+            UnsignedBits    (&keys)[KEYS_PER_THREAD],               // Key to decode
+            DigitCounter    (&thread_prefixes)[KEYS_PER_THREAD],    // Prefix counter value (out parameter)
+            DigitCounter*   (&digit_counters)[KEYS_PER_THREAD],     // Counter smem offset (out parameter)
+            int             current_bit)                            // The least-significant bit position of the current digit to extract
+        {
+            // Extract the complete digit first so that no key bits beyond RADIX_BITS can
+            // leak into the counter indices.  (Extracting LOG_PACKING_RATIO bits for the
+            // sub-counter directly would over-read when RADIX_BITS < LOG_PACKING_RATIO,
+            // i.e. when LOG_COUNTER_LANES has been clamped to zero.)
+            UnsignedBits digit = BFE(keys[COUNT], current_bit, RADIX_BITS);
+
+            // Upper digit bits select the sub-counter within a packed word
+            UnsignedBits sub_counter = digit >> LOG_COUNTER_LANES;
+
+            // Lower digit bits select the counter lane (row)
+            UnsignedBits row_offset = digit & (COUNTER_LANES - 1);
+
+            // Pointer to smem digit counter
+            digit_counters[COUNT] = &cta.temp_storage.digit_counters[row_offset][cta.linear_tid][sub_counter];
+
+            // Load thread-exclusive prefix
+            thread_prefixes[COUNT] = *digit_counters[COUNT];
+
+            // Store inclusive prefix
+            *digit_counters[COUNT] = thread_prefixes[COUNT] + 1;
+
+            // Iterate next key
+            Iterate<COUNT + 1, MAX>::DecodeKeys(cta, keys, thread_prefixes, digit_counters, current_bit);
+        }
+
+
+        /**
+         * Update ranks.  Combines each key's thread-local prefix with the
+         * (by now scanned) value of its digit counter.
+         */
+        template <int KEYS_PER_THREAD>
+        static __device__ __forceinline__ void UpdateRanks(
+            int             (&ranks)[KEYS_PER_THREAD],              // Local ranks (out parameter)
+            DigitCounter    (&thread_prefixes)[KEYS_PER_THREAD],    // Prefix counter value
+            DigitCounter*   (&digit_counters)[KEYS_PER_THREAD])     // Counter smem offset
+        {
+            // Add in threadblock exclusive prefix
+            ranks[COUNT] = thread_prefixes[COUNT] + *digit_counters[COUNT];
+
+            // Iterate next key
+            Iterate<COUNT + 1, MAX>::UpdateRanks(ranks, thread_prefixes, digit_counters);
+        }
+    };
+
+
+    // Termination (COUNT == MAX): both operations are no-ops
+    template <int MAX>
+    struct Iterate<MAX, MAX>
+    {
+        // DecodeKeys
+        template <typename UnsignedBits, int KEYS_PER_THREAD>
+        static __device__ __forceinline__ void DecodeKeys(
+            BlockRadixRank  &cta,
+            UnsignedBits    (&keys)[KEYS_PER_THREAD],
+            DigitCounter    (&thread_prefixes)[KEYS_PER_THREAD],
+            DigitCounter*   (&digit_counters)[KEYS_PER_THREAD],
+            int             current_bit) {}
+
+
+        // UpdateRanks
+        template <int KEYS_PER_THREAD>
+        static __device__ __forceinline__ void UpdateRanks(
+            int             (&ranks)[KEYS_PER_THREAD],
+            DigitCounter    (&thread_prefixes)[KEYS_PER_THREAD],
+            DigitCounter    *(&digit_counters)[KEYS_PER_THREAD]) {}
+    };
+
+
+    /******************************************************************************
+     * Utility methods
+     ******************************************************************************/
+
+    /**
+     * Internal storage allocator: returns a reference to a function-static
+     * \p __shared__ allocation
+     */
+    __device__ __forceinline__ _TempStorage& PrivateStorage()
+    {
+        __shared__ _TempStorage private_storage;
+        return private_storage;
+    }
+
+
+    /**
+     * Performs upsweep raking reduction, returning the aggregate of the
+     * calling thread's raking segment.  When MEMOIZE_OUTER_SCAN is set, the
+     * segment is also copied into \p cached_segment for reuse by the downsweep.
+     */
+    __device__ __forceinline__ PackedCounter Upsweep()
+    {
+        PackedCounter *smem_raking_ptr = temp_storage.raking_grid[linear_tid];
+        PackedCounter *raking_ptr;
+
+        if (MEMOIZE_OUTER_SCAN)
+        {
+            // Copy data into registers
+            #pragma unroll
+            for (int i = 0; i < RAKING_SEGMENT; i++)
+            {
+                cached_segment[i] = smem_raking_ptr[i];
+            }
+            raking_ptr = cached_segment;
+        }
+        else
+        {
+            raking_ptr = smem_raking_ptr;
+        }
+
+        return ThreadReduce<RAKING_SEGMENT>(raking_ptr, Sum());
+    }
+
+
+    /// Performs exclusive downsweep raking scan, seeded with the supplied partial
+    __device__ __forceinline__ void ExclusiveDownsweep(
+        PackedCounter raking_partial)
+    {
+        PackedCounter *smem_raking_ptr = temp_storage.raking_grid[linear_tid];
+
+        PackedCounter *raking_ptr = (MEMOIZE_OUTER_SCAN) ?
+            cached_segment :
+            smem_raking_ptr;
+
+        // Exclusive raking downsweep scan (in place)
+        ThreadScanExclusive<RAKING_SEGMENT>(raking_ptr, raking_ptr, Sum(), raking_partial);
+
+        if (MEMOIZE_OUTER_SCAN)
+        {
+            // Copy data back to smem
+            #pragma unroll
+            for (int i = 0; i < RAKING_SEGMENT; i++)
+            {
+                smem_raking_ptr[i] = cached_segment[i];
+            }
+        }
+    }
+
+
+    /**
+     * Reset shared memory digit counters
+     */
+    __device__ __forceinline__ void ResetCounters()
+    {
+        // Reset shared memory digit counters.  All PACKING_RATIO sub-counters of a
+        // lane are cleared with a single store of one PackedCounter word.
+        #pragma unroll
+        for (int LANE = 0; LANE < COUNTER_LANES + 1; LANE++)
+        {
+            *((PackedCounter*) temp_storage.digit_counters[LANE][linear_tid]) = 0;
+        }
+    }
+
+
+    /**
+     * Scan shared memory digit counters.
+     */
+    __device__ __forceinline__ void ScanCounters()
+    {
+        // Upsweep scan
+        PackedCounter raking_partial = Upsweep();
+
+        // Compute inclusive sum
+        PackedCounter inclusive_partial;
+        PackedCounter packed_aggregate;
+        BlockScan(temp_storage.block_scan, linear_tid).InclusiveSum(raking_partial, inclusive_partial, packed_aggregate);
+
+        // Propagate totals in packed fields: the block-wide total of each
+        // lower-order field is added into every higher-order field
+        #pragma unroll
+        for (int PACKED = 1; PACKED < PACKING_RATIO; PACKED++)
+        {
+            inclusive_partial += packed_aggregate << (sizeof(DigitCounter) * 8 * PACKED);
+        }
+
+        // Downsweep scan with exclusive partial
+        PackedCounter exclusive_partial = inclusive_partial - raking_partial;
+        ExclusiveDownsweep(exclusive_partial);
+    }
+
+public:
+
+    /// \smemstorage{BlockRadixRank}
+    struct TempStorage : Uninitialized<_TempStorage> {};
+
+
+    /******************************************************************//**
+     * \name Collective constructors
+     *********************************************************************/
+    //@{
+
+    /**
+     * \brief Collective constructor for 1D thread blocks using a private static allocation of shared memory as temporary storage.  Threads are identified using <tt>threadIdx.x</tt>.
+     */
+    __device__ __forceinline__ BlockRadixRank()
+    :
+        temp_storage(PrivateStorage()),
+        linear_tid(threadIdx.x)
+    {}
+
+
+    /**
+     * \brief Collective constructor for 1D thread blocks using the specified memory allocation as temporary storage.  Threads are identified using <tt>threadIdx.x</tt>.
+     */
+    __device__ __forceinline__ BlockRadixRank(
+        TempStorage &temp_storage)             ///< [in] Reference to memory allocation having layout type TempStorage
+    :
+        temp_storage(temp_storage.Alias()),
+        linear_tid(threadIdx.x)
+    {}
+
+
+    /**
+     * \brief Collective constructor using a private static allocation of shared memory as temporary storage.  Each thread is identified using the supplied linear thread identifier.
+     */
+    __device__ __forceinline__ BlockRadixRank(
+        int linear_tid)                        ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + threadIdx.x</tt> for 2D thread blocks)
+    :
+        temp_storage(PrivateStorage()),
+        linear_tid(linear_tid)
+    {}
+
+
+    /**
+     * \brief Collective constructor using the specified memory allocation as temporary storage.  Each thread is identified using the supplied linear thread identifier.
+     */
+    __device__ __forceinline__ BlockRadixRank(
+        TempStorage &temp_storage,             ///< [in] Reference to memory allocation having layout type TempStorage
+        int linear_tid)                        ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + threadIdx.x</tt> for 2D thread blocks)
+    :
+        temp_storage(temp_storage.Alias()),
+        linear_tid(linear_tid)
+    {}
+
+
+
+    //@}  end member group
+    /******************************************************************//**
+     * \name Ranking
+     *********************************************************************/
+    //@{
+
+    /**
+     * \brief Rank keys.
+     *
+     * Collective operation containing block-wide barriers: it must be called
+     * by all threads of the thread block.
+     */
+    template <
+        typename        UnsignedBits,
+        int             KEYS_PER_THREAD>
+    __device__ __forceinline__ void RankKeys(
+        UnsignedBits    (&keys)[KEYS_PER_THREAD],           ///< [in] Keys for this tile
+        int             (&ranks)[KEYS_PER_THREAD],          ///< [out] For each key, the local rank within the tile
+        int             current_bit)                        ///< [in] The least-significant bit position of the current digit to extract
+    {
+        DigitCounter    thread_prefixes[KEYS_PER_THREAD];   // For each key, the count of previous keys in this tile having the same digit
+        DigitCounter*   digit_counters[KEYS_PER_THREAD];    // For each key, the address of its corresponding digit counter in smem
+
+        // Reset shared memory digit counters
+        ResetCounters();
+
+        // Decode keys and update digit counters
+        Iterate<0, KEYS_PER_THREAD>::DecodeKeys(*this, keys, thread_prefixes, digit_counters, current_bit);
+
+        // All counters must be final before they are raked
+        __syncthreads();
+
+        // Scan shared memory counters
+        ScanCounters();
+
+        // All scanned counters must be written back before they are read
+        __syncthreads();
+
+        // Extract the local ranks of each key
+        Iterate<0, KEYS_PER_THREAD>::UpdateRanks(ranks, thread_prefixes, digit_counters);
+    }
+
+
+    /**
+     * \brief Rank keys.  For the lower \p RADIX_DIGITS threads, digit counts for each digit are provided for the corresponding thread.
+     */
+    template <
+        typename        UnsignedBits,
+        int             KEYS_PER_THREAD>
+    __device__ __forceinline__ void RankKeys(
+        UnsignedBits    (&keys)[KEYS_PER_THREAD],           ///< [in] Keys for this tile
+        int             (&ranks)[KEYS_PER_THREAD],          ///< [out] For each key, the local rank within the tile (out parameter)
+        int             current_bit,                        ///< [in] The least-significant bit position of the current digit to extract
+        int             &inclusive_digit_prefix)            ///< [out] The inclusive prefix sum for the digit <tt>linear_tid</tt> (only written by threads having <tt>linear_tid < RADIX_DIGITS</tt>)
+    {
+        // Rank keys
+        RankKeys(keys, ranks, current_bit);
+
+        // Get the inclusive digit total corresponding to the calling thread.
+        if ((BLOCK_THREADS == RADIX_DIGITS) || (linear_tid < RADIX_DIGITS))
+        {
+            // Obtain inclusive digit counts.  (Unfortunately these all reside in the
+            // first counter column, resulting in unavoidable bank conflicts.)
+            // The digit -> (lane, sub-counter) mapping mirrors the one used when decoding keys.
+            int counter_lane = (linear_tid & (COUNTER_LANES - 1));
+            int sub_counter = linear_tid >> (LOG_COUNTER_LANES);
+            inclusive_digit_prefix = temp_storage.digit_counters[counter_lane + 1][0][sub_counter];
+        }
+    }
+};
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
+
--- a/lib/kokkos/TPL/cub/block/block_radix_sort.cuh
+++ b/lib/kokkos/TPL/cub/block/block_radix_sort.cuh
@ -0,0 +1,608 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2013, NVIDIA CORPORATION.  All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * The cub::BlockRadixSort class provides [<em>collective</em>](index.html#sec0) methods for radix sorting of items partitioned across a CUDA thread block.
+ */
+
+
+#pragma once
+
+#include "../util_namespace.cuh"
+#include "../util_arch.cuh"
+#include "../util_type.cuh"
+#include "block_exchange.cuh"
+#include "block_radix_rank.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+/**
+ * \brief The cub::BlockRadixSort class provides [<em>collective</em>](index.html#sec0) methods for sorting items partitioned across a CUDA thread block using a radix sorting method.  ![](sorting_logo.png)
+ * \ingroup BlockModule
+ *
+ * \par Overview
+ * The [<em>radix sorting method</em>](http://en.wikipedia.org/wiki/Radix_sort) arranges
+ * items into ascending order.  It relies upon a positional representation for
+ * keys, i.e., each key is comprised of an ordered sequence of symbols (e.g., digits,
+ * characters, etc.) specified from least-significant to most-significant.  For a
+ * given input sequence of keys and a set of rules specifying a total ordering
+ * of the symbolic alphabet, the radix sorting method produces a lexicographic
+ * ordering of those keys.
+ *
+ * \par
+ * BlockRadixSort can sort all of the built-in C++ numeric primitive types, e.g.:
+ * <tt>unsigned char</tt>, \p int, \p double, etc.  Within each key, the implementation treats fixed-length
+ * bit-sequences of \p RADIX_BITS as radix digit places.  Although the direct radix sorting
+ * method can only be applied to unsigned integral types, BlockRadixSort
+ * is able to sort signed and floating-point types via simple bit-wise transformations
+ * that ensure lexicographic key ordering.
+ *
+ * \tparam Key                  Key type
+ * \tparam BLOCK_THREADS        The thread block size in threads
+ * \tparam ITEMS_PER_THREAD     The number of items per thread
+ * \tparam Value                <b>[optional]</b> Value type (default: cub::NullType)
+ * \tparam RADIX_BITS           <b>[optional]</b> The number of radix bits per digit place (default: 4 bits)
+ * \tparam MEMOIZE_OUTER_SCAN   <b>[optional]</b> Whether or not to buffer outer raking scan partials to incur fewer shared memory reads at the expense of higher register pressure (default: true for architectures SM35 and newer, false otherwise).
+ * \tparam INNER_SCAN_ALGORITHM <b>[optional]</b> The cub::BlockScanAlgorithm algorithm to use (default: cub::BLOCK_SCAN_WARP_SCANS)
+ * \tparam SMEM_CONFIG          <b>[optional]</b> Shared memory bank mode (default: \p cudaSharedMemBankSizeFourByte)
+ *
+ * \par A Simple Example
+ * \blockcollective{BlockRadixSort}
+ * \par
+ * The code snippet below illustrates a sort of 512 integer keys that
+ * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec4) across 128 threads
+ * where each thread owns 4 consecutive items.
+ * \par
+ * \code
+ * #include <cub/cub.cuh>
+ *
+ * __global__ void ExampleKernel(...)
+ * {
+ *     // Specialize BlockRadixSort for 128 threads owning 4 integer items each
+ *     typedef cub::BlockRadixSort<int, 128, 4> BlockRadixSort;
+ *
+ *     // Allocate shared memory for BlockRadixSort
+ *     __shared__ typename BlockRadixSort::TempStorage temp_storage;
+ *
+ *     // Obtain a segment of consecutive items that are blocked across threads
+ *     int thread_keys[4];
+ *     ...
+ *
+ *     // Collectively sort the keys
+ *     BlockRadixSort(temp_storage).Sort(thread_keys);
+ *
+ *     ...
+ * \endcode
+ * \par
+ * Suppose the set of input \p thread_keys across the block of threads is
+ * <tt>{ [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }</tt>.  The
+ * corresponding output \p thread_keys in those threads will be
+ * <tt>{ [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }</tt>.
+ *
+ */
+template <
+    typename                Key,
+    int                     BLOCK_THREADS,
+    int                     ITEMS_PER_THREAD,
+    typename                Value                   = NullType,
+    int                     RADIX_BITS              = 4,
+    bool                    MEMOIZE_OUTER_SCAN      = (CUB_PTX_ARCH >= 350) ? true : false,
+    BlockScanAlgorithm      INNER_SCAN_ALGORITHM    = BLOCK_SCAN_WARP_SCANS,
+    cudaSharedMemConfig     SMEM_CONFIG             = cudaSharedMemBankSizeFourByte>
+class BlockRadixSort
+{
+private:
+
+    /******************************************************************************
+     * Constants and type definitions
+     ******************************************************************************/
+
+    // Key traits and unsigned bits type.  UnsignedBits is the type the sorting
+    // methods reinterpret the caller's keys as before twiddling and ranking.
+    typedef NumericTraits<Key>                  KeyTraits;
+    typedef typename KeyTraits::UnsignedBits    UnsignedBits;
+
+    /// BlockRadixRank utility type (computes the per-digit rank of each key).
+    /// Note that inside this class the name refers to this specialization.
+    typedef BlockRadixRank<BLOCK_THREADS, RADIX_BITS, MEMOIZE_OUTER_SCAN, INNER_SCAN_ALGORITHM, SMEM_CONFIG> BlockRadixRank;
+
+    /// BlockExchange utility type for keys (scatters keys to their ranked positions)
+    typedef BlockExchange<Key, BLOCK_THREADS, ITEMS_PER_THREAD> BlockExchangeKeys;
+
+    /// BlockExchange utility type for values (instantiated for whatever Value is, including the NullType default)
+    typedef BlockExchange<Value, BLOCK_THREADS, ITEMS_PER_THREAD> BlockExchangeValues;
+
+    /// Shared memory storage layout type.  The ranking and exchange storages
+    /// are members of one anonymous union, so they occupy the same memory;
+    /// the sorting methods place a __syncthreads() between uses of different
+    /// members.
+    struct _TempStorage
+    {
+        union
+        {
+            typename BlockRadixRank::TempStorage          ranking_storage;   // used while ranking keys
+            typename BlockExchangeKeys::TempStorage        exchange_keys;     // used while scattering keys
+            typename BlockExchangeValues::TempStorage      exchange_values;   // used while scattering values
+        };
+    };
+
+    /******************************************************************************
+     * Utility methods
+     ******************************************************************************/
+
+    /// Internal storage allocator: returns a reference to a function-scope
+    /// \p __shared__ instance of the storage layout.  Used by the constructors
+    /// that are not handed a TempStorage by the caller.
+    __device__ __forceinline__ _TempStorage& PrivateStorage()
+    {
+        __shared__ _TempStorage private_storage;
+        return private_storage;
+    }
+
+
+    /******************************************************************************
+     * Thread fields
+     ******************************************************************************/
+
+    /// Shared storage reference
+    _TempStorage &temp_storage;
+
+    /// Linear thread-id
+    int linear_tid;
+
+
+public:
+
+    /// \smemstorage{BlockRadixSort}
+    struct TempStorage : Uninitialized<_TempStorage> {};
+
+
+    /******************************************************************//**
+     * \name Collective constructors
+     *********************************************************************/
+    //@{
+
+    /**
+     * \brief Collective constructor for 1D thread blocks using a private static allocation of shared memory as temporary storage.  Threads are identified using <tt>threadIdx.x</tt>.
+     */
+    __device__ __forceinline__ BlockRadixSort()
+    :
+        // Bind to the function-scope __shared__ instance returned by PrivateStorage()
+        temp_storage(PrivateStorage()),
+        // 1D thread blocks: the x-index serves as the linear thread id
+        linear_tid(threadIdx.x)
+    {}
+
+
+    /**
+     * \brief Collective constructor for 1D thread blocks using the specified memory allocation as temporary storage.  Threads are identified using <tt>threadIdx.x</tt>.
+     */
+    __device__ __forceinline__ BlockRadixSort(
+        TempStorage &temp_storage)             ///< [in] Reference to memory allocation having layout type TempStorage
+    :
+        // Alias() yields the _TempStorage view of the caller-provided allocation
+        // (the member being initialized is a _TempStorage reference)
+        temp_storage(temp_storage.Alias()),
+        // 1D thread blocks: the x-index serves as the linear thread id
+        linear_tid(threadIdx.x)
+    {}
+
+
+    /**
+     * \brief Collective constructor using a private static allocation of shared memory as temporary storage.  Each thread is identified using the supplied linear thread identifier
+     */
+    __device__ __forceinline__ BlockRadixSort(
+        int linear_tid)                        ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + threadIdx.x</tt> for 2D thread blocks)
+    :
+        // Bind to the function-scope __shared__ instance returned by PrivateStorage()
+        temp_storage(PrivateStorage()),
+        // Member initialized from the same-named parameter
+        linear_tid(linear_tid)
+    {}
+
+
+    /**
+     * \brief Collective constructor using the specified memory allocation as temporary storage.  Each thread is identified using the supplied linear thread identifier.
+     */
+    __device__ __forceinline__ BlockRadixSort(
+        TempStorage &temp_storage,             ///< [in] Reference to memory allocation having layout type TempStorage
+        int linear_tid)                        ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + threadIdx.x</tt> for 2D thread blocks)
+    :
+        // Alias() yields the _TempStorage view of the caller-provided allocation
+        // (the member being initialized is a _TempStorage reference)
+        temp_storage(temp_storage.Alias()),
+        // Member initialized from the same-named parameter
+        linear_tid(linear_tid)
+    {}
+
+
+
+    //@}  end member group
+    /******************************************************************//**
+     * \name Sorting (blocked arrangements)
+     *********************************************************************/
+    //@{
+
+    /**
+     * \brief Performs a block-wide radix sort over a [<em>blocked arrangement</em>](index.html#sec5sec4) of keys.
+     *
+     * \smemreuse
+     *
+     * The code snippet below illustrates a sort of 512 integer keys that
+     * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec4) across 128 threads
+     * where each thread owns 4 consecutive keys.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize BlockRadixSort for 128 threads owning 4 integer keys each
+     *     typedef cub::BlockRadixSort<int, 128, 4> BlockRadixSort;
+     *
+     *     // Allocate shared memory for BlockRadixSort
+     *     __shared__ typename BlockRadixSort::TempStorage temp_storage;
+     *
+     *     // Obtain a segment of consecutive items that are blocked across threads
+     *     int thread_keys[4];
+     *     ...
+     *
+     *     // Collectively sort the keys
+     *     BlockRadixSort(temp_storage).Sort(thread_keys);
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_keys across the block of threads is
+     * <tt>{ [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }</tt>.
+     * The corresponding output \p thread_keys in those threads will be
+     * <tt>{ [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }</tt>.
+     */
+    __device__ __forceinline__ void Sort(
+        Key     (&keys)[ITEMS_PER_THREAD],          ///< [in-out] Keys to sort
+        int     begin_bit   = 0,                    ///< [in] <b>[optional]</b> The beginning (least-significant) bit index needed for key comparison
+        int     end_bit     = sizeof(Key) * 8)      ///< [in] <b>[optional]</b> The past-the-end (most-significant) bit index needed for key comparison
+    {
+        // Alias the caller's keys as raw unsigned bit patterns; ranking works on this view
+        UnsignedBits (&key_bits)[ITEMS_PER_THREAD] =
+            reinterpret_cast<UnsignedBits (&)[ITEMS_PER_THREAD]>(keys);
+
+        // Convert each key to its radix-sortable bit encoding
+        #pragma unroll
+        for (int item = 0; item < ITEMS_PER_THREAD; ++item)
+        {
+            key_bits[item] = KeyTraits::TwiddleIn(key_bits[item]);
+        }
+
+        // One pass per RADIX_BITS-wide digit, starting at begin_bit.  The exit
+        // test follows the exchange, so at least one pass is always performed.
+        // NOTE(review): each pass advances by a full RADIX_BITS; if
+        // (end_bit - begin_bit) is not a multiple of RADIX_BITS the last pass
+        // may rank on bits at or above end_bit -- confirm against BlockRadixRank.
+        for (;;)
+        {
+            // Tile-local rank of every key for the current digit
+            int ranks[ITEMS_PER_THREAD];
+            BlockRadixRank(temp_storage.ranking_storage, linear_tid).RankKeys(key_bits, ranks, begin_bit);
+            begin_bit += RADIX_BITS;
+
+            // ranking_storage and exchange_keys share a union: finish ranking first
+            __syncthreads();
+
+            // Scatter keys to their ranked slots, keeping a blocked arrangement
+            BlockExchangeKeys(temp_storage.exchange_keys, linear_tid).ScatterToBlocked(keys, ranks);
+
+            // Stop once every requested bit has been consumed
+            if (begin_bit >= end_bit) break;
+
+            // Finish the exchange before the next pass reuses the storage for ranking
+            __syncthreads();
+        }
+
+        // Convert the keys back to their original encoding
+        #pragma unroll
+        for (int item = 0; item < ITEMS_PER_THREAD; ++item)
+        {
+            key_bits[item] = KeyTraits::TwiddleOut(key_bits[item]);
+        }
+    }
+
+
+    /**
+     * \brief Performs a block-wide radix sort across a [<em>blocked arrangement</em>](index.html#sec5sec4) of keys and values.
+     *
+     * BlockRadixSort can only accommodate one associated tile of values. To "truck along"
+     * more than one tile of values, simply perform a key-value sort of the keys paired
+     * with a temporary value array that enumerates the key indices.  The reordered indices
+     * can then be used as a gather-vector for exchanging other associated tile data through
+     * shared memory.
+     *
+     * \smemreuse
+     *
+     * The code snippet below illustrates a sort of 512 integer keys and values that
+     * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec4) across 128 threads
+     * where each thread owns 4 consecutive pairs.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize BlockRadixSort for 128 threads owning 4 integer keys and values each
+     *     typedef cub::BlockRadixSort<int, 128, 4, int> BlockRadixSort;
+     *
+     *     // Allocate shared memory for BlockRadixSort
+     *     __shared__ typename BlockRadixSort::TempStorage temp_storage;
+     *
+     *     // Obtain a segment of consecutive items that are blocked across threads
+     *     int thread_keys[4];
+     *     int thread_values[4];
+     *     ...
+     *
+     *     // Collectively sort the keys and values among block threads
+     *     BlockRadixSort(temp_storage).Sort(thread_keys, thread_values);
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_keys across the block of threads is
+     * <tt>{ [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }</tt>.  The
+     * corresponding output \p thread_keys in those threads will be
+     * <tt>{ [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }</tt>.
+     *
+     */
+    __device__ __forceinline__ void Sort(
+        Key     (&keys)[ITEMS_PER_THREAD],          ///< [in-out] Keys to sort
+        Value   (&values)[ITEMS_PER_THREAD],        ///< [in-out] Values to sort
+        int     begin_bit   = 0,                    ///< [in] <b>[optional]</b> The beginning (least-significant) bit index needed for key comparison
+        int     end_bit     = sizeof(Key) * 8)      ///< [in] <b>[optional]</b> The past-the-end (most-significant) bit index needed for key comparison
+    {
+        // Alias the caller's keys as raw unsigned bit patterns; ranking works on this view
+        UnsignedBits (&key_bits)[ITEMS_PER_THREAD] =
+            reinterpret_cast<UnsignedBits (&)[ITEMS_PER_THREAD]>(keys);
+
+        // Convert each key to its radix-sortable bit encoding
+        #pragma unroll
+        for (int item = 0; item < ITEMS_PER_THREAD; ++item)
+        {
+            key_bits[item] = KeyTraits::TwiddleIn(key_bits[item]);
+        }
+
+        // One pass per RADIX_BITS-wide digit, starting at begin_bit.  The exit
+        // test follows the exchanges, so at least one pass is always performed.
+        for (;;)
+        {
+            // Tile-local rank of every key for the current digit
+            int ranks[ITEMS_PER_THREAD];
+            BlockRadixRank(temp_storage.ranking_storage, linear_tid).RankKeys(key_bits, ranks, begin_bit);
+            begin_bit += RADIX_BITS;
+
+            // ranking_storage and exchange_keys share a union: finish ranking first
+            __syncthreads();
+
+            // Scatter keys to their ranked slots, keeping a blocked arrangement
+            BlockExchangeKeys(temp_storage.exchange_keys, linear_tid).ScatterToBlocked(keys, ranks);
+
+            // exchange_keys and exchange_values share a union: finish the key exchange first
+            __syncthreads();
+
+            // Scatter values with the same ranks so each value follows its key
+            BlockExchangeValues(temp_storage.exchange_values, linear_tid).ScatterToBlocked(values, ranks);
+
+            // Stop once every requested bit has been consumed
+            if (begin_bit >= end_bit) break;
+
+            // Finish the exchange before the next pass reuses the storage for ranking
+            __syncthreads();
+        }
+
+        // Convert the keys back to their original encoding
+        #pragma unroll
+        for (int item = 0; item < ITEMS_PER_THREAD; ++item)
+        {
+            key_bits[item] = KeyTraits::TwiddleOut(key_bits[item]);
+        }
+    }
+
+
+    //@}  end member group
+    /******************************************************************//**
+     * \name Sorting (blocked arrangement -> striped arrangement)
+     *********************************************************************/
+    //@{
+
+
+    /**
+     * \brief Performs a radix sort across a [<em>blocked arrangement</em>](index.html#sec5sec4) of keys, leaving them in a [<em>striped arrangement</em>](index.html#sec5sec4).
+     *
+     * \smemreuse
+     *
+     * The code snippet below illustrates a sort of 512 integer keys that
+     * are initially partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec4) across 128 threads
+     * where each thread owns 4 consecutive keys.  The final partitioning is striped.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize BlockRadixSort for 128 threads owning 4 integer keys each
+     *     typedef cub::BlockRadixSort<int, 128, 4> BlockRadixSort;
+     *
+     *     // Allocate shared memory for BlockRadixSort
+     *     __shared__ typename BlockRadixSort::TempStorage temp_storage;
+     *
+     *     // Obtain a segment of consecutive items that are blocked across threads
+     *     int thread_keys[4];
+     *     ...
+     *
+     *     // Collectively sort the keys
+     *     BlockRadixSort(temp_storage).SortBlockedToStriped(thread_keys);
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_keys across the block of threads is
+     * <tt>{ [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }</tt>.  The
+     * corresponding output \p thread_keys in those threads will be
+     * <tt>{ [0,128,256,384], [1,129,257,385], [2,130,258,386], ..., [127,255,383,511] }</tt>.
+     *
+     */
+    __device__ __forceinline__ void SortBlockedToStriped(
+        Key     (&keys)[ITEMS_PER_THREAD],          ///< [in-out] Keys to sort
+        int     begin_bit   = 0,                    ///< [in] <b>[optional]</b> The beginning (least-significant) bit index needed for key comparison
+        int     end_bit     = sizeof(Key) * 8)      ///< [in] <b>[optional]</b> The past-the-end (most-significant) bit index needed for key comparison
+    {
+        // Alias the caller's keys as raw unsigned bit patterns; ranking works on this view
+        UnsignedBits (&key_bits)[ITEMS_PER_THREAD] =
+            reinterpret_cast<UnsignedBits (&)[ITEMS_PER_THREAD]>(keys);
+
+        // Convert each key to its radix-sortable bit encoding
+        #pragma unroll
+        for (int item = 0; item < ITEMS_PER_THREAD; ++item)
+        {
+            key_bits[item] = KeyTraits::TwiddleIn(key_bits[item]);
+        }
+
+        // One pass per RADIX_BITS-wide digit, starting at begin_bit.  Intermediate
+        // passes keep the blocked arrangement; only the final pass emits striped output.
+        for (;;)
+        {
+            // Tile-local rank of every key for the current digit
+            int ranks[ITEMS_PER_THREAD];
+            BlockRadixRank(temp_storage.ranking_storage, linear_tid).RankKeys(key_bits, ranks, begin_bit);
+            begin_bit += RADIX_BITS;
+
+            // ranking_storage and exchange_keys share a union: finish ranking first
+            __syncthreads();
+
+            const bool final_pass = (begin_bit >= end_bit);
+            if (final_pass)
+            {
+                // Final pass: scatter keys directly into a striped arrangement, then stop
+                BlockExchangeKeys(temp_storage.exchange_keys, linear_tid).ScatterToStriped(keys, ranks);
+                break;
+            }
+
+            // Intermediate pass: scatter keys, keeping a blocked arrangement
+            BlockExchangeKeys(temp_storage.exchange_keys, linear_tid).ScatterToBlocked(keys, ranks);
+
+            // Finish the exchange before the next pass reuses the storage for ranking
+            __syncthreads();
+        }
+
+        // Convert the keys back to their original encoding
+        #pragma unroll
+        for (int item = 0; item < ITEMS_PER_THREAD; ++item)
+        {
+            key_bits[item] = KeyTraits::TwiddleOut(key_bits[item]);
+        }
+    }
+
+
+    /**
+     * \brief Performs a radix sort across a [<em>blocked arrangement</em>](index.html#sec5sec4) of keys and values, leaving them in a [<em>striped arrangement</em>](index.html#sec5sec4).
+     *
+     * BlockRadixSort can only accommodate one associated tile of values. To "truck along"
+     * more than one tile of values, simply perform a key-value sort of the keys paired
+     * with a temporary value array that enumerates the key indices.  The reordered indices
+     * can then be used as a gather-vector for exchanging other associated tile data through
+     * shared memory.
+     *
+     * \smemreuse
+     *
+     * The code snippet below illustrates a sort of 512 integer keys and values that
+     * are initially partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec4) across 128 threads
+     * where each thread owns 4 consecutive pairs.  The final partitioning is striped.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize BlockRadixSort for 128 threads owning 4 integer keys and values each
+     *     typedef cub::BlockRadixSort<int, 128, 4, int> BlockRadixSort;
+     *
+     *     // Allocate shared memory for BlockRadixSort
+     *     __shared__ typename BlockRadixSort::TempStorage temp_storage;
+     *
+     *     // Obtain a segment of consecutive items that are blocked across threads
+     *     int thread_keys[4];
+     *     int thread_values[4];
+     *     ...
+     *
+     *     // Collectively sort the keys and values among block threads
+     *     BlockRadixSort(temp_storage).SortBlockedToStriped(thread_keys, thread_values);
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_keys across the block of threads is
+     * <tt>{ [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }</tt>.  The
+     * corresponding output \p thread_keys in those threads will be
+     * <tt>{ [0,128,256,384], [1,129,257,385], [2,130,258,386], ..., [127,255,383,511] }</tt>.
+     *
+     */
+    __device__ __forceinline__ void SortBlockedToStriped(
+        Key     (&keys)[ITEMS_PER_THREAD],          ///< [in-out] Keys to sort
+        Value   (&values)[ITEMS_PER_THREAD],        ///< [in-out] Values to sort
+        int     begin_bit   = 0,                    ///< [in] <b>[optional]</b> The beginning (least-significant) bit index needed for key comparison
+        int     end_bit     = sizeof(Key) * 8)      ///< [in] <b>[optional]</b> The past-the-end (most-significant) bit index needed for key comparison
+    {
+        // Alias the caller's keys as raw unsigned bit patterns; ranking works on this view
+        UnsignedBits (&key_bits)[ITEMS_PER_THREAD] =
+            reinterpret_cast<UnsignedBits (&)[ITEMS_PER_THREAD]>(keys);
+
+        // Convert each key to its radix-sortable bit encoding
+        #pragma unroll
+        for (int item = 0; item < ITEMS_PER_THREAD; ++item)
+        {
+            key_bits[item] = KeyTraits::TwiddleIn(key_bits[item]);
+        }
+
+        // One pass per RADIX_BITS-wide digit, starting at begin_bit.  Intermediate
+        // passes keep the blocked arrangement; only the final pass emits striped output.
+        for (;;)
+        {
+            // Tile-local rank of every key for the current digit
+            int ranks[ITEMS_PER_THREAD];
+            BlockRadixRank(temp_storage.ranking_storage, linear_tid).RankKeys(key_bits, ranks, begin_bit);
+            begin_bit += RADIX_BITS;
+
+            // ranking_storage and exchange_keys share a union: finish ranking first
+            __syncthreads();
+
+            const bool final_pass = (begin_bit >= end_bit);
+            if (final_pass)
+            {
+                // Final pass: scatter keys directly into a striped arrangement
+                BlockExchangeKeys(temp_storage.exchange_keys, linear_tid).ScatterToStriped(keys, ranks);
+
+                // exchange_keys and exchange_values share a union: finish the key exchange first
+                __syncthreads();
+
+                // Final pass: scatter values into a striped arrangement with the same ranks, then stop
+                BlockExchangeValues(temp_storage.exchange_values, linear_tid).ScatterToStriped(values, ranks);
+                break;
+            }
+
+            // Intermediate pass: scatter keys, keeping a blocked arrangement
+            BlockExchangeKeys(temp_storage.exchange_keys, linear_tid).ScatterToBlocked(keys, ranks);
+
+            // exchange_keys and exchange_values share a union: finish the key exchange first
+            __syncthreads();
+
+            // Intermediate pass: scatter values with the same ranks so each value follows its key
+            BlockExchangeValues(temp_storage.exchange_values, linear_tid).ScatterToBlocked(values, ranks);
+
+            // Finish the exchange before the next pass reuses the storage for ranking
+            __syncthreads();
+        }
+
+        // Convert the keys back to their original encoding
+        #pragma unroll
+        for (int item = 0; item < ITEMS_PER_THREAD; ++item)
+        {
+            key_bits[item] = KeyTraits::TwiddleOut(key_bits[item]);
+        }
+    }
+
+
+    //@}  end member group
+
+};
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
--- a/lib/kokkos/TPL/cub/block/block_raking_layout.cuh
+++ b/lib/kokkos/TPL/cub/block/block_raking_layout.cuh
@ -0,0 +1,145 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2013, NVIDIA CORPORATION.  All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::BlockRakingLayout provides a conflict-free shared memory layout abstraction for warp-raking across thread block data.
+ */
+
+
+#pragma once
+
+#include "../util_macro.cuh"
+#include "../util_arch.cuh"
+#include "../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+/**
+ * \brief BlockRakingLayout provides a conflict-free shared memory layout abstraction for raking across thread block data.    ![](raking.png)
+ * \ingroup BlockModule
+ *
+ * \par Overview
+ * This type facilitates a shared memory usage pattern where a block of CUDA
+ * threads places elements into shared memory and then reduces the active
+ * parallelism to one "raking" warp of threads for serially aggregating consecutive
+ * sequences of shared items.  Padding is inserted to eliminate bank conflicts
+ * (for most data types).
+ *
+ * \tparam T                    The data type to be exchanged.
+ * \tparam BLOCK_THREADS        The thread block size in threads.
+ * \tparam BLOCK_STRIPS         When strip-mining, the number of threadblock-strips per tile
+ */
+template <
+    typename    T,
+    int         BLOCK_THREADS,
+    int         BLOCK_STRIPS = 1>
+struct BlockRakingLayout
+{
+    //---------------------------------------------------------------------
+    // Compile-time layout parameters
+    //---------------------------------------------------------------------
+
+    enum
+    {
+        /// Number of elements deposited into the grid (one per thread per strip)
+        SHARED_ELEMENTS     = BLOCK_THREADS * BLOCK_STRIPS,
+
+        /// Upper bound on the number of warp-synchronous raking threads
+        MAX_RAKING_THREADS  = CUB_MIN(BLOCK_THREADS, PtxArchProps::WARP_THREADS),
+
+        /// Elements raked serially by each raking thread (ceiling division)
+        SEGMENT_LENGTH      = (SHARED_ELEMENTS + MAX_RAKING_THREADS - 1) / MAX_RAKING_THREADS,
+
+        /// Raking threads actually used: just enough segments to cover every element, so no raking thread is left without valid data (e.g., 62 elements with a SEGMENT_LENGTH of 2 use only 31 raking threads)
+        RAKING_THREADS      = (SHARED_ELEMENTS + SEGMENT_LENGTH - 1) / SEGMENT_LENGTH,
+
+        /// One padding element per segment when SEGMENT_LENGTH evenly divides the number of banks, zero otherwise
+        SEGMENT_PADDING     = (PtxArchProps::SMEM_BANKS % SEGMENT_LENGTH == 0) ? 1 : 0,
+
+        /// Total number of elements in the raking grid, padding included
+        GRID_ELEMENTS       = RAKING_THREADS * (SEGMENT_LENGTH + SEGMENT_PADDING),
+
+        /// True when SHARED_ELEMENTS divides evenly among RAKING_THREADS, in which case raking does not need bounds checking
+        UNGUARDED           = (SHARED_ELEMENTS % RAKING_THREADS == 0),
+    };
+
+
+    /**
+     * \brief Shared memory storage type: a flat array of GRID_ELEMENTS items
+     */
+    typedef T TempStorage[BlockRakingLayout::GRID_ELEMENTS];
+
+
+    /**
+     * \brief Returns the location for the calling thread to place data into the grid
+     *
+     * The unpadded slot is <tt>block_strip * BLOCK_THREADS + linear_tid</tt>; when
+     * padding is enabled, one extra element is skipped for every complete
+     * segment that precedes the slot.
+     */
+    static __device__ __forceinline__ T* PlacementPtr(
+        TempStorage &temp_storage,
+        int linear_tid,
+        int block_strip = 0)
+    {
+        // Unpadded slot of this thread's partial
+        const unsigned int slot = (block_strip * BLOCK_THREADS) + linear_tid;
+
+        // Padding elements to skip: one per complete segment ahead of the slot
+        const unsigned int skipped = (SEGMENT_PADDING > 0) ? (slot / SEGMENT_LENGTH) : 0;
+
+        return temp_storage + (slot + skipped);
+    }
+
+
+    /**
+     * \brief Returns the location for the calling thread to begin sequential raking
+     *
+     * Consecutive raking threads start one padded segment apart.
+     */
+    static __device__ __forceinline__ T* RakingPtr(
+        TempStorage &temp_storage,
+        int linear_tid)
+    {
+        // Distance between the starts of adjacent segments, padding included
+        const int padded_segment = SEGMENT_LENGTH + SEGMENT_PADDING;
+
+        return temp_storage + (linear_tid * padded_segment);
+    }
+};
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
--- a/lib/kokkos/TPL/cub/block/block_reduce.cuh
+++ b/lib/kokkos/TPL/cub/block/block_reduce.cuh
@ -0,0 +1,563 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2013, NVIDIA CORPORATION.  All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * The cub::BlockReduce class provides [<em>collective</em>](index.html#sec0) methods for computing a parallel reduction of items partitioned across a CUDA thread block.
+ */
+
+#pragma once
+
+#include "specializations/block_reduce_raking.cuh"
+#include "specializations/block_reduce_warp_reductions.cuh"
+#include "../util_type.cuh"
+#include "../thread/thread_operators.cuh"
+#include "../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+
+/******************************************************************************
+ * Algorithmic variants
+ ******************************************************************************/
+
+/**
+ * BlockReduceAlgorithm enumerates alternative algorithms for parallel
+ * reduction across a CUDA threadblock.
+ */
+enum BlockReduceAlgorithm
+{
+
+    /**
+     * \par Overview
+     * An efficient "raking" reduction algorithm.  Execution is comprised of
+     * three phases:
+     * -# Upsweep sequential reduction in registers (if threads contribute more
+     *    than one input each).  Each thread then places the partial reduction
+     *    of its item(s) into shared memory.
+     * -# Upsweep sequential reduction in shared memory.  Threads within a
+     *    single warp rake across segments of shared partial reductions.
+     * -# A warp-synchronous Kogge-Stone style reduction within the raking warp.
+     *
+     * \par
+     * \image html block_reduce.png
+     * <div class="centercaption">\p BLOCK_REDUCE_RAKING data flow for a hypothetical 16-thread threadblock and 4-thread raking warp.</div>
+     *
+     * \par Performance Considerations
+     * - Although this variant may suffer longer turnaround latencies when the
+     *   GPU is under-occupied, it can often provide higher overall throughput
+     *   across the GPU when suitably occupied.
+     */
+    BLOCK_REDUCE_RAKING,
+
+
+    /**
+     * \par Overview
+     * A quick "tiled warp-reductions" reduction algorithm.  Execution is
+     * comprised of three phases:
+     * -# Upsweep sequential reduction in registers (if threads contribute more
+     *    than one input each).  Each thread then places the partial reduction
+     *    of its item(s) into shared memory.
+     * -# Compute a shallow, but inefficient warp-synchronous Kogge-Stone style
+     *    reduction within each warp.
+     * -# A propagation phase where the warp reduction outputs in each warp are
+     *    updated with the aggregate from each preceding warp.
+     *
+     * \par
+     * \image html block_scan_warpscans.png
+     * <div class="centercaption">\p BLOCK_REDUCE_WARP_REDUCTIONS data flow for a hypothetical 16-thread threadblock and 4-thread raking warp.</div>
+     *
+     * \par Performance Considerations
+     * - Although this variant may suffer lower overall throughput across the
+     *   GPU due to a heavy reliance on inefficient warp-reductions, it
+     *   can often provide lower turnaround latencies when the GPU is
+     *   under-occupied.
+     */
+    BLOCK_REDUCE_WARP_REDUCTIONS,
+};
+
+
+/******************************************************************************
+ * Block reduce
+ ******************************************************************************/
+
+/**
+ * \brief The BlockReduce class provides [<em>collective</em>](index.html#sec0) methods for computing a parallel reduction of items partitioned across a CUDA thread block. ![](reduce_logo.png)
+ * \ingroup BlockModule
+ *
+ * \par Overview
+ * A <a href="http://en.wikipedia.org/wiki/Reduce_(higher-order_function)"><em>reduction</em></a> (or <em>fold</em>)
+ * uses a binary combining operator to compute a single aggregate from a list of input elements.
+ *
+ * \par
+ * Optionally, BlockReduce can be specialized by algorithm to accommodate different latency/throughput workload profiles:
+ *   -# <b>cub::BLOCK_REDUCE_RAKING</b>.  An efficient "raking" reduction algorithm. [More...](\ref cub::BlockReduceAlgorithm)
+ *   -# <b>cub::BLOCK_REDUCE_WARP_REDUCTIONS</b>.  A quick "tiled warp-reductions" reduction algorithm. [More...](\ref cub::BlockReduceAlgorithm)
+ *
+ * \tparam T                Data type being reduced
+ * \tparam BLOCK_THREADS    The thread block size in threads
+ * \tparam ALGORITHM        <b>[optional]</b> cub::BlockReduceAlgorithm enumerator specifying the underlying algorithm to use (default: cub::BLOCK_REDUCE_RAKING)
+ *
+ * \par Performance Considerations
+ * - Very efficient (only one synchronization barrier).
+ * - Zero bank conflicts for most types.
+ * - Computation is slightly more efficient (i.e., having lower instruction overhead) for:
+ *   - Summation (<b><em>vs.</em></b> generic reduction)
+ *   - \p BLOCK_THREADS is a multiple of the architecture's warp size
+ *   - Every thread has a valid input (i.e., full <b><em>vs.</em></b> partial-tiles)
+ * - See cub::BlockReduceAlgorithm for performance details regarding algorithmic alternatives
+ *
+ * \par A Simple Example
+ * \blockcollective{BlockReduce}
+ * \par
+ * The code snippet below illustrates a sum reduction of 512 integer items that
+ * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec4) across 128 threads
+ * where each thread owns 4 consecutive items.
+ * \par
+ * \code
+ * #include <cub/cub.cuh>
+ *
+ * __global__ void ExampleKernel(...)
+ * {
+ *     // Specialize BlockReduce for 128 threads on type int
+ *     typedef cub::BlockReduce<int, 128> BlockReduce;
+ *
+ *     // Allocate shared memory for BlockReduce
+ *     __shared__ typename BlockReduce::TempStorage temp_storage;
+ *
+ *     // Obtain a segment of consecutive items that are blocked across threads
+ *     int thread_data[4];
+ *     ...
+ *
+ *     // Compute the block-wide sum for thread0
+ *     int aggregate = BlockReduce(temp_storage).Sum(thread_data);
+ *
+ * \endcode
+ *
+ */
+template <
+    typename                T,
+    int                     BLOCK_THREADS,
+    BlockReduceAlgorithm    ALGORITHM = BLOCK_REDUCE_RAKING>
+class BlockReduce
+{
+private:
+
+    /******************************************************************************
+     * Constants and typedefs
+     ******************************************************************************/
+
+    /// Internal specialization: BlockReduceWarpReductions when ALGORITHM is BLOCK_REDUCE_WARP_REDUCTIONS, BlockReduceRaking otherwise
+    typedef typename If<(ALGORITHM == BLOCK_REDUCE_WARP_REDUCTIONS),
+        BlockReduceWarpReductions<T, BLOCK_THREADS>,
+        BlockReduceRaking<T, BLOCK_THREADS> >::Type InternalBlockReduce;
+
+    /// Shared memory storage layout type for BlockReduce
+    typedef typename InternalBlockReduce::TempStorage _TempStorage;
+
+
+    /******************************************************************************
+     * Utility methods
+     ******************************************************************************/
+
+    /// Internal storage allocator: returns a reference to a function-local __shared__ instance of _TempStorage
+    __device__ __forceinline__ _TempStorage& PrivateStorage()
+    {
+        __shared__ _TempStorage private_storage;
+        return private_storage;
+    }
+
+
+    /******************************************************************************
+     * Thread fields
+     ******************************************************************************/
+
+    /// Shared storage reference
+    _TempStorage &temp_storage;
+
+    /// Linear thread-id
+    int linear_tid;
+
+
+public:
+
+    /// \smemstorage{BlockReduce}
+    struct TempStorage : Uninitialized<_TempStorage> {};
+
+
+    /******************************************************************//**
+     * \name Collective constructors
+     *********************************************************************/
+    //@{
+
+    /**
+     * \brief Collective constructor for 1D thread blocks using a private static allocation of shared memory as temporary storage.  Threads are identified using <tt>threadIdx.x</tt>.
+     */
+    __device__ __forceinline__ BlockReduce()
+    :
+        temp_storage(PrivateStorage()),
+        linear_tid(threadIdx.x)
+    {}
+
+
+    /**
+     * \brief Collective constructor for 1D thread blocks using the specified memory allocation as temporary storage.  Threads are identified using <tt>threadIdx.x</tt>.
+     */
+    __device__ __forceinline__ BlockReduce(
+        TempStorage &temp_storage)             ///< [in] Reference to memory allocation having layout type TempStorage
+    :
+        temp_storage(temp_storage.Alias()),
+        linear_tid(threadIdx.x)
+    {}
+
+
+    /**
+     * \brief Collective constructor using a private static allocation of shared memory as temporary storage.  Each thread is identified using the supplied linear thread identifier
+     */
+    __device__ __forceinline__ BlockReduce(
+        int linear_tid)                        ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + threadIdx.x</tt> for 2D thread blocks)
+    :
+        temp_storage(PrivateStorage()),
+        linear_tid(linear_tid)
+    {}
+
+
+    /**
+     * \brief Collective constructor using the specified memory allocation as temporary storage.  Each thread is identified using the supplied linear thread identifier.
+     */
+    __device__ __forceinline__ BlockReduce(
+        TempStorage &temp_storage,             ///< [in] Reference to memory allocation having layout type TempStorage
+        int linear_tid)                        ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + threadIdx.x</tt> for 2D thread blocks)
+    :
+        temp_storage(temp_storage.Alias()),
+        linear_tid(linear_tid)
+    {}
+
+
+
+    //@}  end member group
+    /******************************************************************//**
+     * \name Generic reductions
+     *********************************************************************/
+    //@{
+
+
+    /**
+     * \brief Computes a block-wide reduction for thread<sub>0</sub> using the specified binary reduction functor.  Each thread contributes one input element.
+     *
+     * The return value is undefined in threads other than thread<sub>0</sub>.
+     *
+     * Supports non-commutative reduction operators.
+     *
+     * \smemreuse
+     *
+     * The code snippet below illustrates a max reduction of 128 integer items that
+     * are partitioned across 128 threads.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize BlockReduce for 128 threads on type int
+     *     typedef cub::BlockReduce<int, 128> BlockReduce;
+     *
+     *     // Allocate shared memory for BlockReduce
+     *     __shared__ typename BlockReduce::TempStorage temp_storage;
+     *
+     *     // Each thread obtains an input item
+     *     int thread_data;
+     *     ...
+     *
+     *     // Compute the block-wide max for thread0
+     *     int aggregate = BlockReduce(temp_storage).Reduce(thread_data, cub::Max());
+     *
+     * \endcode
+     *
+     * \tparam ReductionOp          <b>[inferred]</b> Binary reduction operator type having member <tt>T operator()(const T &a, const T &b)</tt>
+     */
+    template <typename ReductionOp>
+    __device__ __forceinline__ T Reduce(
+        T               input,                      ///< [in] Calling thread's input
+        ReductionOp     reduction_op)               ///< [in] Binary reduction operator
+    {
+        return InternalBlockReduce(temp_storage, linear_tid).template Reduce<true>(input, BLOCK_THREADS, reduction_op);
+    }
+
+
+    /**
+     * \brief Computes a block-wide reduction for thread<sub>0</sub> using the specified binary reduction functor.  Each thread contributes an array of consecutive input elements.
+     *
+     * The return value is undefined in threads other than thread<sub>0</sub>.
+     *
+     * Supports non-commutative reduction operators.
+     *
+     * \blocked
+     *
+     * \smemreuse
+     *
+     * The code snippet below illustrates a max reduction of 512 integer items that
+     * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec4) across 128 threads
+     * where each thread owns 4 consecutive items.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize BlockReduce for 128 threads on type int
+     *     typedef cub::BlockReduce<int, 128> BlockReduce;
+     *
+     *     // Allocate shared memory for BlockReduce
+     *     __shared__ typename BlockReduce::TempStorage temp_storage;
+     *
+     *     // Obtain a segment of consecutive items that are blocked across threads
+     *     int thread_data[4];
+     *     ...
+     *
+     *     // Compute the block-wide max for thread0
+     *     int aggregate = BlockReduce(temp_storage).Reduce(thread_data, cub::Max());
+     *
+     * \endcode
+     *
+     * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
+     * \tparam ReductionOp          <b>[inferred]</b> Binary reduction operator type having member <tt>T operator()(const T &a, const T &b)</tt>
+     */
+    template <
+        int ITEMS_PER_THREAD,
+        typename ReductionOp>
+    __device__ __forceinline__ T Reduce(
+        T               (&inputs)[ITEMS_PER_THREAD],    ///< [in] Calling thread's input segment
+        ReductionOp     reduction_op)                   ///< [in] Binary reduction operator
+    {
+        // Collapse this thread's items into one partial, then reduce the partials across the block
+        T partial = ThreadReduce(inputs, reduction_op);
+        return Reduce(partial, reduction_op);
+    }
+
+
+    /**
+     * \brief Computes a block-wide reduction for thread<sub>0</sub> using the specified binary reduction functor.  The first \p num_valid threads each contribute one input element.
+     *
+     * The return value is undefined in threads other than thread<sub>0</sub>.
+     *
+     * Supports non-commutative reduction operators.
+     *
+     * \blocked
+     *
+     * \smemreuse
+     *
+     * The code snippet below illustrates a max reduction of a partially-full tile of integer items that
+     * are partitioned across 128 threads.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>
+     *
+     * __global__ void ExampleKernel(int num_valid, ...)
+     * {
+     *     // Specialize BlockReduce for 128 threads on type int
+     *     typedef cub::BlockReduce<int, 128> BlockReduce;
+     *
+     *     // Allocate shared memory for BlockReduce
+     *     __shared__ typename BlockReduce::TempStorage temp_storage;
+     *
+     *     // Each thread obtains an input item
+     *     int thread_data;
+     *     if (threadIdx.x < num_valid) thread_data = ...
+     *
+     *     // Compute the block-wide max for thread0
+     *     int aggregate = BlockReduce(temp_storage).Reduce(thread_data, cub::Max(), num_valid);
+     *
+     * \endcode
+     *
+     * \tparam ReductionOp          <b>[inferred]</b> Binary reduction operator type having member <tt>T operator()(const T &a, const T &b)</tt>
+     */
+    template <typename ReductionOp>
+    __device__ __forceinline__ T Reduce(
+        T                   input,                  ///< [in] Calling thread's input
+        ReductionOp         reduction_op,           ///< [in] Binary reduction operator
+        int                 num_valid)              ///< [in] Number of threads containing valid elements (may be less than BLOCK_THREADS)
+    {
+        // Determine if we can skip bounds checking (num_valid covers every thread in the block)
+        if (num_valid >= BLOCK_THREADS)
+        {
+            return InternalBlockReduce(temp_storage, linear_tid).template Reduce<true>(input, num_valid, reduction_op);
+        }
+        else
+        {
+            return InternalBlockReduce(temp_storage, linear_tid).template Reduce<false>(input, num_valid, reduction_op);
+        }
+    }
+
+
+    //@}  end member group
+    /******************************************************************//**
+     * \name Summation reductions
+     *********************************************************************/
+    //@{
+
+
+    /**
+     * \brief Computes a block-wide reduction for thread<sub>0</sub> using addition (+) as the reduction operator.  Each thread contributes one input element.
+     *
+     * The return value is undefined in threads other than thread<sub>0</sub>.
+     *
+     * \smemreuse
+     *
+     * The code snippet below illustrates a sum reduction of 128 integer items that
+     * are partitioned across 128 threads.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize BlockReduce for 128 threads on type int
+     *     typedef cub::BlockReduce<int, 128> BlockReduce;
+     *
+     *     // Allocate shared memory for BlockReduce
+     *     __shared__ typename BlockReduce::TempStorage temp_storage;
+     *
+     *     // Each thread obtains an input item
+     *     int thread_data;
+     *     ...
+     *
+     *     // Compute the block-wide sum for thread0
+     *     int aggregate = BlockReduce(temp_storage).Sum(thread_data);
+     *
+     * \endcode
+     *
+     */
+    __device__ __forceinline__ T Sum(
+        T   input)                      ///< [in] Calling thread's input
+    {
+        return InternalBlockReduce(temp_storage, linear_tid).template Sum<true>(input, BLOCK_THREADS);
+    }
+
+    /**
+     * \brief Computes a block-wide reduction for thread<sub>0</sub> using addition (+) as the reduction operator.  Each thread contributes an array of consecutive input elements.
+     *
+     * The return value is undefined in threads other than thread<sub>0</sub>.
+     *
+     * \smemreuse
+     *
+     * The code snippet below illustrates a sum reduction of 512 integer items that
+     * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec4) across 128 threads
+     * where each thread owns 4 consecutive items.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize BlockReduce for 128 threads on type int
+     *     typedef cub::BlockReduce<int, 128> BlockReduce;
+     *
+     *     // Allocate shared memory for BlockReduce
+     *     __shared__ typename BlockReduce::TempStorage temp_storage;
+     *
+     *     // Obtain a segment of consecutive items that are blocked across threads
+     *     int thread_data[4];
+     *     ...
+     *
+     *     // Compute the block-wide sum for thread0
+     *     int aggregate = BlockReduce(temp_storage).Sum(thread_data);
+     *
+     * \endcode
+     *
+     * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
+     */
+    template <int ITEMS_PER_THREAD>
+    __device__ __forceinline__ T Sum(
+        T   (&inputs)[ITEMS_PER_THREAD])    ///< [in] Calling thread's input segment
+    {
+        // Collapse this thread's items into one partial, then sum the partials across the block
+        T partial = ThreadReduce(inputs, cub::Sum());
+        return Sum(partial);
+    }
+
+
+    /**
+     * \brief Computes a block-wide reduction for thread<sub>0</sub> using addition (+) as the reduction operator.  The first \p num_valid threads each contribute one input element.
+     *
+     * The return value is undefined in threads other than thread<sub>0</sub>.
+     *
+     * \smemreuse
+     *
+     * The code snippet below illustrates a sum reduction of a partially-full tile of integer items that
+     * are partitioned across 128 threads.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>
+     *
+     * __global__ void ExampleKernel(int num_valid, ...)
+     * {
+     *     // Specialize BlockReduce for 128 threads on type int
+     *     typedef cub::BlockReduce<int, 128> BlockReduce;
+     *
+     *     // Allocate shared memory for BlockReduce
+     *     __shared__ typename BlockReduce::TempStorage temp_storage;
+     *
+     *     // Each thread obtains an input item (up to num_valid)
+     *     int thread_data;
+     *     if (threadIdx.x < num_valid)
+     *         thread_data = ...
+     *
+     *     // Compute the block-wide sum for thread0
+     *     int aggregate = BlockReduce(temp_storage).Sum(thread_data, num_valid);
+     *
+     * \endcode
+     *
+     */
+    __device__ __forceinline__ T Sum(
+        T   input,                  ///< [in] Calling thread's input
+        int num_valid)              ///< [in] Number of threads containing valid elements (may be less than BLOCK_THREADS)
+    {
+        // Determine if we can skip bounds checking (num_valid covers every thread in the block)
+        if (num_valid >= BLOCK_THREADS)
+        {
+            return InternalBlockReduce(temp_storage, linear_tid).template Sum<true>(input, num_valid);
+        }
+        else
+        {
+            return InternalBlockReduce(temp_storage, linear_tid).template Sum<false>(input, num_valid);
+        }
+    }
+
+
+    //@}  end member group
+};
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
--- a/lib/kokkos/TPL/cub/block/block_scan.cuh
+++ b/lib/kokkos/TPL/cub/block/block_scan.cuh
--- a/lib/kokkos/TPL/cub/block/block_store.cuh
+++ b/lib/kokkos/TPL/cub/block/block_store.cuh
@ -0,0 +1,926 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2013, NVIDIA CORPORATION.  All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * Operations for writing linear segments of data from the CUDA thread block
+ */
+
+#pragma once
+
+#include <iterator>
+
+#include "../util_namespace.cuh"
+#include "../util_macro.cuh"
+#include "../util_type.cuh"
+#include "../util_vector.cuh"
+#include "../thread/thread_store.cuh"
+#include "block_exchange.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+/**
+ * \addtogroup IoModule
+ * @{
+ */
+
+
+/******************************************************************//**
+ * \name Blocked I/O
+ *********************************************************************/
+//@{
+
+/**
+ * \brief Writes the calling thread's items, held in a blocked arrangement, to a linear output segment.  Every store is issued through ThreadStore with the specified cache modifier.
+ *
+ * \blocked
+ *
+ * \par
+ * Item \p i of the calling thread is stored through the iterator <tt>block_itr + (linear_tid * ITEMS_PER_THREAD) + i</tt>.
+ *
+ * \tparam MODIFIER             cub::PtxStoreModifier cache modifier.
+ * \tparam T                    <b>[inferred]</b> The data type to store.
+ * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
+ * \tparam OutputIteratorRA     <b>[inferred]</b> The random-access iterator type for output (may be a simple pointer type).
+ */
+template <
+    PtxStoreModifier    MODIFIER,
+    typename            T,
+    int                 ITEMS_PER_THREAD,
+    typename            OutputIteratorRA>
+__device__ __forceinline__ void StoreBlocked(
+    int                 linear_tid,                 ///< [in] Linear (1D) identifier of the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + threadIdx.x</tt> for 2D thread blocks)
+    OutputIteratorRA    block_itr,                  ///< [in] Base output iterator of the thread block's segment
+    T                   (&items)[ITEMS_PER_THREAD]) ///< [in] Items to write
+{
+    // Offset of the first output slot owned by the calling thread
+    const int thread_offset = linear_tid * ITEMS_PER_THREAD;
+
+    // Emit the thread's items back to back
+    #pragma unroll
+    for (int item = 0; item < ITEMS_PER_THREAD; ++item)
+    {
+        ThreadStore<MODIFIER>(block_itr + thread_offset + item, items[item]);
+    }
+}
+
+
+/**
+ * \brief Range-guarded variant: writes the calling thread's items, held in a blocked arrangement, to a linear output segment, skipping every item whose output offset is not below \p valid_items.  Every store is issued through ThreadStore with the specified cache modifier.
+ *
+ * \blocked
+ *
+ * \par
+ * Item \p i of the calling thread has output offset <tt>(linear_tid * ITEMS_PER_THREAD) + i</tt>.
+ *
+ * \tparam MODIFIER             cub::PtxStoreModifier cache modifier.
+ * \tparam T                    <b>[inferred]</b> The data type to store.
+ * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
+ * \tparam OutputIteratorRA     <b>[inferred]</b> The random-access iterator type for output (may be a simple pointer type).
+ */
+template <
+    PtxStoreModifier    MODIFIER,
+    typename            T,
+    int                 ITEMS_PER_THREAD,
+    typename            OutputIteratorRA>
+__device__ __forceinline__ void StoreBlocked(
+    int                 linear_tid,                 ///< [in] Linear (1D) identifier of the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + threadIdx.x</tt> for 2D thread blocks)
+    OutputIteratorRA    block_itr,                  ///< [in] Base output iterator of the thread block's segment
+    T                   (&items)[ITEMS_PER_THREAD], ///< [in] Items to write
+    int                 valid_items)                ///< [in] Number of output slots that may be written
+{
+    // Offset of the first output slot owned by the calling thread
+    const int thread_offset = linear_tid * ITEMS_PER_THREAD;
+
+    #pragma unroll
+    for (int item = 0; item < ITEMS_PER_THREAD; ++item)
+    {
+        // Only slots inside the valid range are written
+        const bool in_range = (item + thread_offset < valid_items);
+        if (in_range)
+        {
+            ThreadStore<MODIFIER>(block_itr + thread_offset + item, items[item]);
+        }
+    }
+}
+
+
+
+//@}  end member group
+/******************************************************************//**
+ * \name Striped I/O
+ *********************************************************************/
+//@{
+
+
+/**
+ * \brief Writes the calling thread's items, held in a striped arrangement, to a linear output segment.  Every store is issued through ThreadStore with the specified cache modifier.
+ *
+ * \striped
+ *
+ * \par
+ * Item \p i of the calling thread is stored through the iterator <tt>block_itr + (i * BLOCK_THREADS) + linear_tid</tt>.
+ *
+ * \tparam MODIFIER             cub::PtxStoreModifier cache modifier.
+ * \tparam BLOCK_THREADS        The thread block size in threads
+ * \tparam T                    <b>[inferred]</b> The data type to store.
+ * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of items held by each thread.
+ * \tparam OutputIteratorRA     <b>[inferred]</b> The random-access iterator type for output (may be a simple pointer type).
+ */
+template <
+    PtxStoreModifier    MODIFIER,
+    int                 BLOCK_THREADS,
+    typename            T,
+    int                 ITEMS_PER_THREAD,
+    typename            OutputIteratorRA>
+__device__ __forceinline__ void StoreStriped(
+    int                 linear_tid,                 ///< [in] Linear (1D) identifier of the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + threadIdx.x</tt> for 2D thread blocks)
+    OutputIteratorRA    block_itr,                  ///< [in] Base output iterator of the thread block's segment
+    T                   (&items)[ITEMS_PER_THREAD]) ///< [in] Items to write
+{
+    #pragma unroll
+    for (int item = 0; item < ITEMS_PER_THREAD; ++item)
+    {
+        // Start of the stripe that receives this item; the thread's slot within it is linear_tid
+        const int stripe_base = item * BLOCK_THREADS;
+        ThreadStore<MODIFIER>(block_itr + stripe_base + linear_tid, items[item]);
+    }
+}
+
+
+/**
+ * \brief Range-guarded variant: writes the calling thread's items, held in a striped arrangement, to a linear output segment, skipping every item whose output offset is not below \p valid_items.  Every store is issued through ThreadStore with the specified cache modifier.
+ *
+ * \striped
+ *
+ * \par
+ * Item \p i of the calling thread has output offset <tt>(i * BLOCK_THREADS) + linear_tid</tt>.
+ *
+ * \tparam MODIFIER             cub::PtxStoreModifier cache modifier.
+ * \tparam BLOCK_THREADS        The thread block size in threads
+ * \tparam T                    <b>[inferred]</b> The data type to store.
+ * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of items held by each thread.
+ * \tparam OutputIteratorRA     <b>[inferred]</b> The random-access iterator type for output (may be a simple pointer type).
+ */
+template <
+    PtxStoreModifier    MODIFIER,
+    int                 BLOCK_THREADS,
+    typename            T,
+    int                 ITEMS_PER_THREAD,
+    typename            OutputIteratorRA>
+__device__ __forceinline__ void StoreStriped(
+    int                 linear_tid,                 ///< [in] Linear (1D) identifier of the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + threadIdx.x</tt> for 2D thread blocks)
+    OutputIteratorRA    block_itr,                  ///< [in] Base output iterator of the thread block's segment
+    T                   (&items)[ITEMS_PER_THREAD], ///< [in] Items to write
+    int                 valid_items)                ///< [in] Number of output slots that may be written
+{
+    #pragma unroll
+    for (int item = 0; item < ITEMS_PER_THREAD; ++item)
+    {
+        // Start of the stripe that receives this item; the thread's slot within it is linear_tid
+        const int stripe_base = item * BLOCK_THREADS;
+
+        // Only slots inside the valid range are written
+        const bool in_range = (stripe_base + linear_tid < valid_items);
+        if (in_range)
+        {
+            ThreadStore<MODIFIER>(block_itr + stripe_base + linear_tid, items[item]);
+        }
+    }
+}
+
+
+
+//@}  end member group
+/******************************************************************//**
+ * \name Warp-striped I/O
+ *********************************************************************/
+//@{
+
+
+/**
+ * \brief Warp-striped store: every warp writes its own segment of the output, with each lane's items placed \p WARP_THREADS apart, using the specified cache modifier.
+ *
+ * \warpstriped
+ *
+ * \par Usage Considerations
+ * The thread block size must be a whole multiple of the architecture's warp size.
+ *
+ * \tparam MODIFIER             cub::PtxStoreModifier cache modifier applied to every store.
+ * \tparam T                    <b>[inferred]</b> The element type being stored.
+ * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of items held by each thread.
+ * \tparam OutputIteratorRA     <b>[inferred]</b> The random-access output iterator type (a plain pointer also works).
+ */
+template <
+    PtxStoreModifier    MODIFIER,
+    typename            T,
+    int                 ITEMS_PER_THREAD,
+    typename            OutputIteratorRA>
+__device__ __forceinline__ void StoreWarpStriped(
+    int                 linear_tid,                 ///< [in] 1D identifier of the calling thread within the block (e.g., <tt>(threadIdx.y * blockDim.x) + threadIdx.x</tt> for 2D thread blocks)
+    OutputIteratorRA    block_itr,                  ///< [in] Base output iterator of the thread block
+    T                   (&items)[ITEMS_PER_THREAD]) ///< [in] Items to write
+{
+    // Split the linear id into lane and warp indices, then locate this lane's first slot inside its warp's segment
+    const int lane       = linear_tid & (PtxArchProps::WARP_THREADS - 1);
+    const int warp       = linear_tid >> PtxArchProps::LOG_WARP_THREADS;
+    const int lane_start = (warp * PtxArchProps::WARP_THREADS * ITEMS_PER_THREAD) + lane;
+
+    #pragma unroll
+    for (int i = 0; i < ITEMS_PER_THREAD; ++i)
+    {
+        ThreadStore<MODIFIER>(block_itr + (lane_start + (i * PtxArchProps::WARP_THREADS)), items[i]);
+    }
+}
+
+
+/**
+ * \brief Range-guarded warp-striped store: every warp writes its own segment of the output, with each lane's items placed \p WARP_THREADS apart, skipping every output rank that is not below \p valid_items.
+ *
+ * \warpstriped
+ *
+ * \par Usage Considerations
+ * The thread block size must be a whole multiple of the architecture's warp size.
+ *
+ * \tparam MODIFIER             cub::PtxStoreModifier cache modifier applied to every store.
+ * \tparam T                    <b>[inferred]</b> The element type being stored.
+ * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of items held by each thread.
+ * \tparam OutputIteratorRA     <b>[inferred]</b> The random-access output iterator type (a plain pointer also works).
+ */
+template <
+    PtxStoreModifier    MODIFIER,
+    typename            T,
+    int                 ITEMS_PER_THREAD,
+    typename            OutputIteratorRA>
+__device__ __forceinline__ void StoreWarpStriped(
+    int                 linear_tid,                 ///< [in] 1D identifier of the calling thread within the block (e.g., <tt>(threadIdx.y * blockDim.x) + threadIdx.x</tt> for 2D thread blocks)
+    OutputIteratorRA    block_itr,                  ///< [in] Base output iterator of the thread block
+    T                   (&items)[ITEMS_PER_THREAD], ///< [in] Items to write
+    int                 valid_items)                ///< [in] Count of output positions that may be written
+{
+    // Split the linear id into lane and warp indices, then locate this lane's first slot inside its warp's segment
+    const int lane       = linear_tid & (PtxArchProps::WARP_THREADS - 1);
+    const int warp       = linear_tid >> PtxArchProps::LOG_WARP_THREADS;
+    const int lane_start = (warp * PtxArchProps::WARP_THREADS * ITEMS_PER_THREAD) + lane;
+
+    #pragma unroll
+    for (int i = 0; i < ITEMS_PER_THREAD; ++i)
+    {
+        const int rank = lane_start + (i * PtxArchProps::WARP_THREADS);
+        // Ranks at or past valid_items are left untouched
+        if (rank < valid_items)
+            ThreadStore<MODIFIER>(block_itr + rank, items[i]);
+    }
+}
+
+
+
+//@}  end member group
+/******************************************************************//**
+ * \name Blocked, vectorized I/O
+ *********************************************************************/
+//@{
+
+/**
+ * \brief Store a blocked arrangement of items across a thread block into a linear segment of items using the specified cache modifier, packing each thread's items into vector-typed elements before storing.
+ *
+ * \blocked
+ *
+ * The output pointer \p block_ptr must be quad-item aligned, which holds for
+ * the base address returned by \p cudaMalloc()
+ *
+ * \par
+ * The following conditions will prevent vectorization and storing will fall back to cub::BLOCK_STORE_DIRECT:
+ *   - \p ITEMS_PER_THREAD is neither 2 nor a multiple of 4 (the vector width then collapses to 1, see \p VEC_SIZE in the body)
+ *   - The data type \p T is not a built-in primitive or CUDA vector type (e.g., \p short, \p int2, \p double, \p float2, etc.)
+ *
+ * \tparam MODIFIER             cub::PtxStoreModifier cache modifier.
+ * \tparam T                    <b>[inferred]</b> The data type to store.
+ * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
+ *
+ */
+template <
+    PtxStoreModifier    MODIFIER,
+    typename            T,
+    int                 ITEMS_PER_THREAD>
+__device__ __forceinline__ void StoreBlockedVectorized(
+    int                 linear_tid,                 ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + threadIdx.x</tt> for 2D thread blocks)
+    T                   *block_ptr,                 ///< [in] Output pointer for storing to
+    T                   (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store
+{
+    enum
+    {
+        // Maximum CUDA vector size is 4 elements
+        MAX_VEC_SIZE = CUB_MIN(4, ITEMS_PER_THREAD),
+
+        // Vector size must be a power of two and an even divisor of the items per thread (otherwise 1, i.e. no widening)
+        VEC_SIZE = ((((MAX_VEC_SIZE - 1) & MAX_VEC_SIZE) == 0) && ((ITEMS_PER_THREAD % MAX_VEC_SIZE) == 0)) ?
+            MAX_VEC_SIZE :
+            1,
+
+        VECTORS_PER_THREAD = ITEMS_PER_THREAD / VEC_SIZE,     // Number of Vector elements held per thread
+    };
+
+    // Vector type holding VEC_SIZE items of type T
+    typedef typename VectorHelper<T, VEC_SIZE>::Type Vector;
+
+    // View the output pointer as a pointer to Vector elements
+    Vector *block_ptr_vectors = reinterpret_cast<Vector *>(block_ptr);
+
+    // Alias pointers (use "raw" array here which should get optimized away to prevent conservative PTXAS lmem spilling)
+    Vector raw_vector[VECTORS_PER_THREAD];
+    T *raw_items = reinterpret_cast<T*>(raw_vector);
+
+    // Copy the caller's items, one by one, into the vector-typed staging array
+    #pragma unroll
+    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+    {
+        raw_items[ITEM] = items[ITEM];
+    }
+
+    // Direct-store using vector types: the blocked store now moves VECTORS_PER_THREAD Vector elements per thread
+    StoreBlocked<MODIFIER>(linear_tid, block_ptr_vectors, raw_vector);
+}
+
+
+//@}  end member group
+
+
+/** @} */       // end group IoModule
+
+
+//-----------------------------------------------------------------------------
+// Generic BlockStore abstraction
+//-----------------------------------------------------------------------------
+
+/**
+ * \brief cub::BlockStoreAlgorithm enumerates alternative algorithms for cub::BlockStore to write a blocked arrangement of items across a CUDA thread block to a linear segment of memory.
+ */
+enum BlockStoreAlgorithm
+{
+    /**
+     * \par Overview
+     *
+     * A [<em>blocked arrangement</em>](index.html#sec5sec4) of data is written
+     * directly to memory.  The thread block writes items in a parallel "raking" fashion:
+     * thread<sub><em>i</em></sub> writes the <em>i</em><sup>th</sup> segment of consecutive elements.
+     *
+     * \par Performance Considerations
+     * - The utilization of memory transactions (coalescing) decreases as the
+     *   access stride between threads increases (i.e., the number of items per thread).
+     */
+    BLOCK_STORE_DIRECT,
+
+    /**
+     * \par Overview
+     *
+     * A [<em>blocked arrangement</em>](index.html#sec5sec4) of data is written directly
+     * to memory using CUDA's built-in vectorized stores as a coalescing optimization.
+     * The thread block writes items in a parallel "raking" fashion: thread<sub><em>i</em></sub> uses vector stores to
+     * write the <em>i</em><sup>th</sup> segment of consecutive elements.
+     *
+     * For example, <tt>st.global.v4.s32</tt> instructions will be generated when \p T = \p int and \p ITEMS_PER_THREAD % 4 == 0.
+     *
+     * \par Performance Considerations
+     * - The utilization of memory transactions (coalescing) remains high until the
+     *   access stride between threads (i.e., the number of items per thread) exceeds the
+     *   maximum vector store width (typically 4 items or 64B, whichever is lower).
+     * - The following conditions will prevent vectorization and writing will fall back to cub::BLOCK_STORE_DIRECT:
+     *   - \p ITEMS_PER_THREAD is neither 2 nor a multiple of 4
+     *   - The \p OutputIteratorRA is not a simple pointer type
+     *   - The data type \p T is not a built-in primitive or CUDA vector type (e.g., \p short, \p int2, \p double, \p float2, etc.)
+     * - The block output pointer must be quadword-aligned (NOTE(review): the vectorized store path visible in this file neither checks alignment nor falls back on it -- confirm)
+     */
+    BLOCK_STORE_VECTORIZE,
+
+    /**
+     * \par Overview
+     * A [<em>blocked arrangement</em>](index.html#sec5sec4) is locally
+     * transposed into a [<em>striped arrangement</em>](index.html#sec5sec4)
+     * which is then written to memory.  More specifically, cub::BlockExchange
+     * is used to locally reorder the items into a
+     * [<em>striped arrangement</em>](index.html#sec5sec4), after which the
+     * thread block writes items in a parallel "strip-mining" fashion: consecutive
+     * items owned by thread<sub><em>i</em></sub> are written to memory with
+     * stride \p BLOCK_THREADS between them.
+     *
+     * \par Performance Considerations
+     * - The utilization of memory transactions (coalescing) remains high regardless
+     *   of items written per thread.
+     * - The local reordering incurs slightly longer latencies and lower throughput than the
+     *   direct cub::BLOCK_STORE_DIRECT and cub::BLOCK_STORE_VECTORIZE alternatives.
+     */
+    BLOCK_STORE_TRANSPOSE,
+
+    /**
+     * \par Overview
+     * A [<em>blocked arrangement</em>](index.html#sec5sec4) is locally
+     * transposed into a [<em>warp-striped arrangement</em>](index.html#sec5sec4)
+     * which is then written to memory.  More specifically, cub::BlockExchange is used
+     * to locally reorder the items into a
+     * [<em>warp-striped arrangement</em>](index.html#sec5sec4), after which
+     * each warp writes its own contiguous segment in a parallel "strip-mining" fashion:
+     * consecutive items owned by lane<sub><em>i</em></sub> are written to memory
+     * with stride \p WARP_THREADS between them.
+     *
+     * \par Performance Considerations
+     * - The utilization of memory transactions (coalescing) remains high regardless
+     *   of items written per thread.
+     * - The local reordering incurs slightly longer latencies and lower throughput than the
+     *   direct cub::BLOCK_STORE_DIRECT and cub::BLOCK_STORE_VECTORIZE alternatives.
+     */
+    BLOCK_STORE_WARP_TRANSPOSE,
+};
+
+
+
+/**
+ * \addtogroup BlockModule
+ * @{
+ */
+
+
+/**
+ * \brief The BlockStore class provides [<em>collective</em>](index.html#sec0) data movement methods for writing a [<em>blocked arrangement</em>](index.html#sec5sec4) of items partitioned across a CUDA thread block to a linear segment of memory.  ![](block_store_logo.png)
+ *
+ * \par Overview
+ * The BlockStore class provides a single data movement abstraction that can be specialized
+ * to implement different cub::BlockStoreAlgorithm strategies.  This facilitates different
+ * performance policies for different architectures, data types, granularity sizes, etc.
+ *
+ * \par Optionally, BlockStore can be specialized by different data movement strategies:
+ *   -# <b>cub::BLOCK_STORE_DIRECT</b>.  A [<em>blocked arrangement</em>](index.html#sec5sec4) of data is written
+ *      directly to memory. [More...](\ref cub::BlockStoreAlgorithm)
+ *   -# <b>cub::BLOCK_STORE_VECTORIZE</b>.  A [<em>blocked arrangement</em>](index.html#sec5sec4)
+ *      of data is written directly to memory using CUDA's built-in vectorized stores as a
+ *      coalescing optimization.  [More...](\ref cub::BlockStoreAlgorithm)
+ *   -# <b>cub::BLOCK_STORE_TRANSPOSE</b>.  A [<em>blocked arrangement</em>](index.html#sec5sec4)
+ *      is locally transposed into a [<em>striped arrangement</em>](index.html#sec5sec4) which is
+ *      then written to memory.  [More...](\ref cub::BlockStoreAlgorithm)
+ *   -# <b>cub::BLOCK_STORE_WARP_TRANSPOSE</b>.  A [<em>blocked arrangement</em>](index.html#sec5sec4)
+ *      is locally transposed into a [<em>warp-striped arrangement</em>](index.html#sec5sec4) which is
+ *      then written to memory.  [More...](\ref cub::BlockStoreAlgorithm)
+ *
+ * \tparam OutputIteratorRA     The output iterator type (may be a simple pointer type).
+ * \tparam BLOCK_THREADS        The thread block size in threads.
+ * \tparam ITEMS_PER_THREAD     The number of consecutive items partitioned onto each thread.
+ * \tparam ALGORITHM            <b>[optional]</b> cub::BlockStoreAlgorithm tuning policy enumeration.  default: cub::BLOCK_STORE_DIRECT.
+ * \tparam MODIFIER             <b>[optional]</b> cub::PtxStoreModifier cache modifier.  default: cub::STORE_DEFAULT.
+ * \tparam WARP_TIME_SLICING    <b>[optional]</b> For transposition-based cub::BlockStoreAlgorithm parameterizations that utilize shared memory: When \p true, only use enough shared memory for a single warp's worth of data, time-slicing the block-wide exchange over multiple synchronized rounds (default: false)
+ *
+ * \par A Simple Example
+ * \blockcollective{BlockStore}
+ * \par
+ * The code snippet below illustrates the storing of a "blocked" arrangement
+ * of 512 integers across 128 threads (where each thread owns 4 consecutive items)
+ * into a linear segment of memory.  The store is specialized for \p BLOCK_STORE_WARP_TRANSPOSE,
+ * meaning items are locally reordered among threads so that memory references will be
+ * efficiently coalesced using a warp-striped access pattern.
+ * \par
+ * \code
+ * #include <cub/cub.cuh>
+ *
+ * __global__ void ExampleKernel(int *d_data, ...)
+ * {
+ *     // Specialize BlockStore for 128 threads owning 4 integer items each
+ *     typedef cub::BlockStore<int*, 128, 4, cub::BLOCK_STORE_WARP_TRANSPOSE> BlockStore;
+ *
+ *     // Allocate shared memory for BlockStore
+ *     __shared__ typename BlockStore::TempStorage temp_storage;
+ *
+ *     // Obtain a segment of consecutive items that are blocked across threads
+ *     int thread_data[4];
+ *     ...
+ *
+ *     // Store items to linear memory
+ *     BlockStore(temp_storage).Store(d_data, thread_data);
+ * }
+ *
+ * \endcode
+ * \par
+ * Suppose the set of \p thread_data across the block of threads is
+ * <tt>{ [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }</tt>.
+ * The output \p d_data will be <tt>0, 1, 2, 3, 4, 5, ...</tt>.
+ *
+ */
+template <
+    typename                OutputIteratorRA,
+    int                     BLOCK_THREADS,
+    int                     ITEMS_PER_THREAD,
+    BlockStoreAlgorithm     ALGORITHM           = BLOCK_STORE_DIRECT,
+    PtxStoreModifier        MODIFIER            = STORE_DEFAULT,
+    bool                    WARP_TIME_SLICING   = false>
+class BlockStore
+{
+private:
+    /******************************************************************************
+     * Constants and typed definitions
+     ******************************************************************************/
+
+    // Data type written through the output iterator
+    typedef typename std::iterator_traits<OutputIteratorRA>::value_type T;
+
+
+    /******************************************************************************
+     * Algorithmic variants
+     ******************************************************************************/
+
+    /// Store helper, specialized below once per cub::BlockStoreAlgorithm value (the extra DUMMY parameter keeps those specializations partial)
+    template <BlockStoreAlgorithm _POLICY, int DUMMY = 0>
+    struct StoreInternal;
+
+
+    /**
+     * BLOCK_STORE_DIRECT specialization of store helper
+     */
+    template <int DUMMY>
+    struct StoreInternal<BLOCK_STORE_DIRECT, DUMMY>
+    {
+        /// Shared memory storage layout type (NullType for this variant)
+        typedef NullType TempStorage;
+
+        /// Linear thread-id
+        int linear_tid;
+
+        /// Constructor (the \p temp_storage argument is accepted for interface uniformity and not used)
+        __device__ __forceinline__ StoreInternal(
+            TempStorage &temp_storage,
+            int linear_tid)
+        :
+            linear_tid(linear_tid)
+        {}
+
+        /// Store items into a linear segment of memory (forwards to the blocked store)
+        __device__ __forceinline__ void Store(
+            OutputIteratorRA    block_itr,                  ///< [in] The thread block's base output iterator for storing to
+            T                   (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store
+        {
+            StoreBlocked<MODIFIER>(linear_tid, block_itr, items);
+        }
+
+        /// Store items into a linear segment of memory, guarded by range (forwards to the guarded blocked store)
+        __device__ __forceinline__ void Store(
+            OutputIteratorRA    block_itr,                  ///< [in] The thread block's base output iterator for storing to
+            T                   (&items)[ITEMS_PER_THREAD], ///< [in] Data to store
+            int                 valid_items)                ///< [in] Number of valid items to write
+        {
+            StoreBlocked<MODIFIER>(linear_tid, block_itr, items, valid_items);
+        }
+    };
+
+
+    /**
+     * BLOCK_STORE_VECTORIZE specialization of store helper
+     */
+    template <int DUMMY>
+    struct StoreInternal<BLOCK_STORE_VECTORIZE, DUMMY>
+    {
+        /// Shared memory storage layout type (NullType for this variant)
+        typedef NullType TempStorage;
+
+        /// Linear thread-id
+        int linear_tid;
+
+        /// Constructor (the \p temp_storage argument is accepted for interface uniformity and not used)
+        __device__ __forceinline__ StoreInternal(
+            TempStorage &temp_storage,
+            int linear_tid)
+        :
+            linear_tid(linear_tid)
+        {}
+
+        /// Store items into a linear segment of memory, specialized for native pointer types (attempts vectorization)
+        __device__ __forceinline__ void Store(
+            T                   *block_ptr,                 ///< [in] The thread block's base output pointer for storing to
+            T                   (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store
+        {
+            StoreBlockedVectorized<MODIFIER>(linear_tid, block_ptr, items);
+        }
+
+        /// Store items into a linear segment of memory, specialized for opaque output iterators (skips vectorization)
+        template <typename _OutputIteratorRA>
+        __device__ __forceinline__ void Store(
+            _OutputIteratorRA   block_itr,                  ///< [in] The thread block's base output iterator for storing to
+            T                   (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store
+        {
+            StoreBlocked<MODIFIER>(linear_tid, block_itr, items);
+        }
+
+        /// Store items into a linear segment of memory, guarded by range (always the non-vectorized blocked store)
+        __device__ __forceinline__ void Store(
+            OutputIteratorRA    block_itr,                  ///< [in] The thread block's base output iterator for storing to
+            T                   (&items)[ITEMS_PER_THREAD], ///< [in] Data to store
+            int                 valid_items)                ///< [in] Number of valid items to write
+        {
+            StoreBlocked<MODIFIER>(linear_tid, block_itr, items, valid_items);
+        }
+    };
+
+
+    /**
+     * BLOCK_STORE_TRANSPOSE specialization of store helper
+     */
+    template <int DUMMY>
+    struct StoreInternal<BLOCK_STORE_TRANSPOSE, DUMMY>
+    {
+        // BlockExchange utility type for the items being stored
+        typedef BlockExchange<T, BLOCK_THREADS, ITEMS_PER_THREAD, WARP_TIME_SLICING> BlockExchange;
+
+        /// Shared memory storage layout type (that of the BlockExchange utility)
+        typedef typename BlockExchange::TempStorage _TempStorage;
+
+        /// Alias wrapper allowing storage to be unioned
+        struct TempStorage : Uninitialized<_TempStorage> {};
+
+        /// Thread reference to shared storage
+        _TempStorage &temp_storage;
+
+        /// Linear thread-id
+        int linear_tid;
+
+        /// Constructor
+        __device__ __forceinline__ StoreInternal(
+            TempStorage &temp_storage,
+            int linear_tid)
+        :
+            temp_storage(temp_storage.Alias()),
+            linear_tid(linear_tid)
+        {}
+
+        /// Store items into a linear segment of memory (blocked-to-striped exchange, then striped store)
+        __device__ __forceinline__ void Store(
+            OutputIteratorRA    block_itr,                  ///< [in] The thread block's base output iterator for storing to
+            T                   (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store
+        {
+            BlockExchange(temp_storage).BlockedToStriped(items);
+            StoreStriped<MODIFIER, BLOCK_THREADS>(linear_tid, block_itr, items);
+        }
+
+        /// Store items into a linear segment of memory, guarded by range (blocked-to-striped exchange, then guarded striped store)
+        __device__ __forceinline__ void Store(
+            OutputIteratorRA    block_itr,                  ///< [in] The thread block's base output iterator for storing to
+            T                   (&items)[ITEMS_PER_THREAD], ///< [in] Data to store
+            int                 valid_items)                ///< [in] Number of valid items to write
+        {
+            BlockExchange(temp_storage).BlockedToStriped(items);
+            StoreStriped<MODIFIER, BLOCK_THREADS>(linear_tid, block_itr, items, valid_items);
+        }
+    };
+
+
+    /**
+     * BLOCK_STORE_WARP_TRANSPOSE specialization of store helper
+     */
+    template <int DUMMY>
+    struct StoreInternal<BLOCK_STORE_WARP_TRANSPOSE, DUMMY>
+    {
+        enum
+        {
+            WARP_THREADS = PtxArchProps::WARP_THREADS
+        };
+
+        // Assert BLOCK_THREADS must be a multiple of WARP_THREADS
+        CUB_STATIC_ASSERT((BLOCK_THREADS % WARP_THREADS == 0), "BLOCK_THREADS must be a multiple of WARP_THREADS");
+
+        // BlockExchange utility type for the items being stored
+        typedef BlockExchange<T, BLOCK_THREADS, ITEMS_PER_THREAD, WARP_TIME_SLICING> BlockExchange;
+
+        /// Shared memory storage layout type (that of the BlockExchange utility)
+        typedef typename BlockExchange::TempStorage _TempStorage;
+
+        /// Alias wrapper allowing storage to be unioned
+        struct TempStorage : Uninitialized<_TempStorage> {};
+
+        /// Thread reference to shared storage
+        _TempStorage &temp_storage;
+
+        /// Linear thread-id
+        int linear_tid;
+
+        /// Constructor
+        __device__ __forceinline__ StoreInternal(
+            TempStorage &temp_storage,
+            int linear_tid)
+        :
+            temp_storage(temp_storage.Alias()),
+            linear_tid(linear_tid)
+        {}
+
+        /// Store items into a linear segment of memory (blocked-to-warp-striped exchange, then warp-striped store)
+        __device__ __forceinline__ void Store(
+            OutputIteratorRA    block_itr,                  ///< [in] The thread block's base output iterator for storing to
+            T                   (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store
+        {
+            BlockExchange(temp_storage).BlockedToWarpStriped(items);
+            StoreWarpStriped<MODIFIER>(linear_tid, block_itr, items);
+        }
+
+        /// Store items into a linear segment of memory, guarded by range (blocked-to-warp-striped exchange, then guarded warp-striped store)
+        __device__ __forceinline__ void Store(
+            OutputIteratorRA    block_itr,                  ///< [in] The thread block's base output iterator for storing to
+            T                   (&items)[ITEMS_PER_THREAD], ///< [in] Data to store
+            int                 valid_items)                ///< [in] Number of valid items to write
+        {
+            BlockExchange(temp_storage).BlockedToWarpStriped(items);
+            StoreWarpStriped<MODIFIER>(linear_tid, block_itr, items, valid_items);
+        }
+    };
+
+    /******************************************************************************
+     * Type definitions
+     ******************************************************************************/
+
+    /// Internal store implementation to use (selected by the ALGORITHM template parameter)
+    typedef StoreInternal<ALGORITHM> InternalStore;
+
+
+    /// Shared memory storage layout type
+    typedef typename InternalStore::TempStorage _TempStorage;
+
+
+    /******************************************************************************
+     * Utility methods
+     ******************************************************************************/
+
+    /// Internal storage allocator: returns a reference to a function-local \p __shared__ instance
+    __device__ __forceinline__ _TempStorage& PrivateStorage()
+    {
+        __shared__ _TempStorage private_storage;
+        return private_storage;
+    }
+
+
+    /******************************************************************************
+     * Thread fields
+     ******************************************************************************/
+
+    /// Thread reference to shared storage
+    _TempStorage &temp_storage;
+
+    /// Linear thread-id
+    int linear_tid;
+
+public:
+
+
+    /// \smemstorage{BlockStore}
+    struct TempStorage : Uninitialized<_TempStorage> {};
+
+
+    /******************************************************************//**
+     * \name Collective constructors
+     *********************************************************************/
+    //@{
+
+    /**
+     * \brief Collective constructor for 1D thread blocks using a private static allocation of shared memory as temporary storage.  Threads are identified using <tt>threadIdx.x</tt>.
+     */
+    __device__ __forceinline__ BlockStore()
+    :
+        temp_storage(PrivateStorage()),
+        linear_tid(threadIdx.x)
+    {}
+
+
+    /**
+     * \brief Collective constructor for 1D thread blocks using the specified memory allocation as temporary storage.  Threads are identified using <tt>threadIdx.x</tt>.
+     */
+    __device__ __forceinline__ BlockStore(
+        TempStorage &temp_storage)             ///< [in] Reference to memory allocation having layout type TempStorage
+    :
+        temp_storage(temp_storage.Alias()),
+        linear_tid(threadIdx.x)
+    {}
+
+
+    /**
+     * \brief Collective constructor using a private static allocation of shared memory as temporary storage.  Each thread is identified using the supplied linear thread identifier
+     */
+    __device__ __forceinline__ BlockStore(
+        int linear_tid)                        ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + threadIdx.x</tt> for 2D thread blocks)
+    :
+        temp_storage(PrivateStorage()),
+        linear_tid(linear_tid)
+    {}
+
+
+    /**
+     * \brief Collective constructor using the specified memory allocation as temporary storage.  Each thread is identified using the supplied linear thread identifier.
+     */
+    __device__ __forceinline__ BlockStore(
+        TempStorage &temp_storage,             ///< [in] Reference to memory allocation having layout type TempStorage
+        int linear_tid)                        ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + threadIdx.x</tt> for 2D thread blocks)
+    :
+        temp_storage(temp_storage.Alias()),
+        linear_tid(linear_tid)
+    {}
+
+
+    //@}  end member group
+    /******************************************************************//**
+     * \name Data movement
+     *********************************************************************/
+    //@{
+
+
+    /**
+     * \brief Store items into a linear segment of memory.
+     *
+     * \blocked
+     *
+     * The code snippet below illustrates the storing of a "blocked" arrangement
+     * of 512 integers across 128 threads (where each thread owns 4 consecutive items)
+     * into a linear segment of memory.  The store is specialized for \p BLOCK_STORE_WARP_TRANSPOSE,
+     * meaning items are locally reordered among threads so that memory references will be
+     * efficiently coalesced using a warp-striped access pattern.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>
+     *
+     * __global__ void ExampleKernel(int *d_data, ...)
+     * {
+     *     // Specialize BlockStore for 128 threads owning 4 integer items each
+     *     typedef cub::BlockStore<int*, 128, 4, cub::BLOCK_STORE_WARP_TRANSPOSE> BlockStore;
+     *
+     *     // Allocate shared memory for BlockStore
+     *     __shared__ typename BlockStore::TempStorage temp_storage;
+     *
+     *     // Obtain a segment of consecutive items that are blocked across threads
+     *     int thread_data[4];
+     *     ...
+     *
+     *     // Store items to linear memory
+     *     BlockStore(temp_storage).Store(d_data, thread_data);
+     * }
+     *
+     * \endcode
+     * \par
+     * Suppose the set of \p thread_data across the block of threads is
+     * <tt>{ [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }</tt>.
+     * The output \p d_data will be <tt>0, 1, 2, 3, 4, 5, ...</tt>.
+     *
+     */
+    __device__ __forceinline__ void Store(
+        OutputIteratorRA    block_itr,                  ///< [in] The thread block's base output iterator for storing to
+        T                   (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store
+    {
+        InternalStore(temp_storage, linear_tid).Store(block_itr, items);
+    }
+
+    /**
+     * \brief Store items into a linear segment of memory, guarded by range.
+     *
+     * \blocked
+     *
+     * The code snippet below illustrates the guarded storing of a "blocked" arrangement
+     * of 512 integers across 128 threads (where each thread owns 4 consecutive items)
+     * into a linear segment of memory.  The store is specialized for \p BLOCK_STORE_WARP_TRANSPOSE,
+     * meaning items are locally reordered among threads so that memory references will be
+     * efficiently coalesced using a warp-striped access pattern.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>
+     *
+     * __global__ void ExampleKernel(int *d_data, int valid_items, ...)
+     * {
+     *     // Specialize BlockStore for 128 threads owning 4 integer items each
+     *     typedef cub::BlockStore<int*, 128, 4, cub::BLOCK_STORE_WARP_TRANSPOSE> BlockStore;
+     *
+     *     // Allocate shared memory for BlockStore
+     *     __shared__ typename BlockStore::TempStorage temp_storage;
+     *
+     *     // Obtain a segment of consecutive items that are blocked across threads
+     *     int thread_data[4];
+     *     ...
+     *
+     *     // Store items to linear memory
+     *     BlockStore(temp_storage).Store(d_data, thread_data, valid_items);
+     * }
+     *
+     * \endcode
+     * \par
+     * Suppose the set of \p thread_data across the block of threads is
+     * <tt>{ [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }</tt> and \p valid_items is \p 5.
+     * The output \p d_data will be <tt>0, 1, 2, 3, 4, ?, ?, ?, ...</tt>, with
+     * only the first two threads being unmasked to store portions of valid data.
+     *
+     */
+    __device__ __forceinline__ void Store(
+        OutputIteratorRA    block_itr,                  ///< [in] The thread block's base output iterator for storing to
+        T                   (&items)[ITEMS_PER_THREAD], ///< [in] Data to store
+        int                 valid_items)                ///< [in] Number of valid items to write
+    {
+        InternalStore(temp_storage, linear_tid).Store(block_itr, items, valid_items);
+    }
+};
+
+/** @} */       // end group BlockModule
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
--- a/lib/kokkos/TPL/cub/block/specializations/block_histogram_atomic.cuh
+++ b/lib/kokkos/TPL/cub/block/specializations/block_histogram_atomic.cuh
@ -0,0 +1,85 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2013, NVIDIA CORPORATION.  All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * The cub::BlockHistogramAtomic class provides atomic-based methods for constructing block-wide histograms from data samples partitioned across a CUDA thread block.
+ */
+
+#pragma once
+
+#include "../../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/**
+ * \brief BlockHistogramAtomic composites a thread block's data samples onto a histogram by issuing one atomic increment per sample.
+ */
+template <
+    typename                T,                  ///< Sample type (each sample is used directly as a bin index)
+    int                     BLOCK_THREADS,      ///< The thread block size in threads
+    int                     ITEMS_PER_THREAD,   ///< Number of samples owned by each thread
+    int                     BINS>               ///< Number of histogram bins
+struct BlockHistogramAtomic
+{
+    /// Shared memory storage layout type (empty for this specialization)
+    struct TempStorage {};
+
+
+    /// Constructor.  Neither argument is used by this specialization.
+    __device__ __forceinline__ BlockHistogramAtomic(
+        TempStorage     &temp_storage,
+        int             linear_tid)
+    {}
+
+
+    /// Adds the calling thread's samples onto an existing histogram
+    template <
+        typename            HistoCounter>
+    __device__ __forceinline__ void Composite(
+        T                   (&items)[ITEMS_PER_THREAD],     ///< [in] Calling thread's input values to histogram
+        HistoCounter        histogram[BINS])                 ///< [in,out] Histogram whose counters are atomically incremented
+    {
+        // One atomic increment per sample, on the counter selected by that sample
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+        {
+            HistoCounter *bin_counter = &histogram[items[ITEM]];
+            atomicAdd(bin_counter, 1);
+        }
+    }
+
+};
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
--- a/lib/kokkos/TPL/cub/block/specializations/block_histogram_sort.cuh
+++ b/lib/kokkos/TPL/cub/block/specializations/block_histogram_sort.cuh
@ -0,0 +1,197 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2013, NVIDIA CORPORATION.  All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * The cub::BlockHistogramSort class provides sorting-based methods for constructing block-wide histograms from data samples partitioned across a CUDA thread block.
+ */
+
+#pragma once
+
+#include "../../block/block_radix_sort.cuh"
+#include "../../block/block_discontinuity.cuh"
+#include "../../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+
+/**
+ * \brief The BlockHistogramSort class provides sorting-based methods for constructing block-wide histograms from data samples partitioned across a CUDA thread block.  Each bin's count is derived as (run_end - run_begin) of that bin's run within the sorted tile.
+ */
+template <
+    typename                T,                  ///< Sample type (samples are used directly as indices into the BINS-sized run arrays)
+    int                     BLOCK_THREADS,      ///< The thread block size in threads
+    int                     ITEMS_PER_THREAD,   ///< Number of samples owned by each thread
+    int                     BINS>               ///< Number of histogram bins
+struct BlockHistogramSort
+{
+    // Parameterize BlockRadixSort type for our thread block
+    typedef BlockRadixSort<T, BLOCK_THREADS, ITEMS_PER_THREAD> BlockRadixSortT;
+
+    // Parameterize BlockDiscontinuity type for our thread block
+    typedef BlockDiscontinuity<T, BLOCK_THREADS> BlockDiscontinuityT;
+
+    // Shared memory layout.  The sort storage is unioned with the flag/run storage, so the two phases of Composite() must be separated by a barrier.
+    union _TempStorage
+    {
+        // Storage for sorting bin values
+        typename BlockRadixSortT::TempStorage sort;
+
+        struct
+        {
+            // Storage for detecting discontinuities in the tile of sorted bin values
+            typename BlockDiscontinuityT::TempStorage flag;
+
+            // Storage for noting begin/end offsets of bin runs in the tile of sorted bin values (one slot per bin)
+            unsigned int run_begin[BINS];
+            unsigned int run_end[BINS];
+        };
+    };
+
+
+    /// Alias wrapper allowing storage to be unioned
+    struct TempStorage : Uninitialized<_TempStorage> {};
+
+
+    // Thread fields: reference to the aliased shared storage and the caller-supplied linear thread id
+    _TempStorage &temp_storage;
+    int linear_tid;
+
+
+    /// Constructor: binds the aliased shared storage and records the calling thread's linear id
+    __device__ __forceinline__ BlockHistogramSort(
+        TempStorage     &temp_storage,
+        int             linear_tid)
+    :
+        temp_storage(temp_storage.Alias()),
+        linear_tid(linear_tid)
+    {}
+
+
+    // Discontinuity functor: besides reporting whether two neighbouring items differ, it records run boundaries in shared storage
+    struct DiscontinuityOp
+    {
+        // Reference to temp_storage
+        _TempStorage &temp_storage;
+
+        // Constructor
+        __device__ __forceinline__ DiscontinuityOp(_TempStorage &temp_storage) :
+            temp_storage(temp_storage)
+        {}
+
+        // Discontinuity predicate: true when a and b differ.  b_index is presumably the tile-wide offset of b (TODO confirm against BlockDiscontinuity).
+        __device__ __forceinline__ bool operator()(const T &a, const T &b, unsigned int b_index)
+        {
+            if (a != b)
+            {
+                // Note the begin/end offsets in shared storage: b's run starts at b_index, which is also where a's run ends
+                temp_storage.run_begin[b] = b_index;
+                temp_storage.run_end[a] = b_index;
+
+                return true;
+            }
+            else
+            {
+                return false;
+            }
+        }
+    };
+
+
+    // Composite data onto an existing histogram.  Contains block-wide barriers, so every thread of the block must call it.
+    template <
+        typename            HistoCounter>
+    __device__ __forceinline__ void Composite(
+        T                   (&items)[ITEMS_PER_THREAD],     ///< [in] Calling thread's input values to histogram (handed to the block sort, so presumably reordered on return)
+        HistoCounter        histogram[BINS])                 ///< [in,out] Reference to shared/global memory histogram; run lengths are added to it with +=
+    {
+        enum { TILE_SIZE = BLOCK_THREADS * ITEMS_PER_THREAD };
+
+        // Sort the tile's samples so that equal bin values form contiguous runs
+        BlockRadixSortT(temp_storage.sort, linear_tid).Sort(items);
+
+        __syncthreads();    // sort storage is about to be overwritten by the unioned run_begin/run_end arrays
+
+        // Initialize the shared memory's run_begin and run_end for each bin to TILE_SIZE, so a bin that never appears yields end - begin == 0
+        int histo_offset = 0;
+
+        #pragma unroll
+        for(; histo_offset + BLOCK_THREADS <= BINS; histo_offset += BLOCK_THREADS)
+        {
+            temp_storage.run_begin[histo_offset + linear_tid] = TILE_SIZE;
+            temp_storage.run_end[histo_offset + linear_tid] = TILE_SIZE;
+        }
+        // Finish up with guarded initialization if necessary (BINS is not a multiple of BLOCK_THREADS)
+        if ((BINS % BLOCK_THREADS != 0) && (histo_offset + linear_tid < BINS))
+        {
+            temp_storage.run_begin[histo_offset + linear_tid] = TILE_SIZE;
+            temp_storage.run_end[histo_offset + linear_tid] = TILE_SIZE;
+        }
+
+        __syncthreads();    // initialization must be visible before flag_op starts recording run boundaries
+
+        int flags[ITEMS_PER_THREAD];    // required by FlagHeads but never read here
+
+        // Compute head flags to demarcate contiguous runs of the same bin in the sorted tile; flag_op records the run offsets as a side effect
+        DiscontinuityOp flag_op(temp_storage);
+        BlockDiscontinuityT(temp_storage.flag, linear_tid).FlagHeads(flags, items, flag_op);
+
+        // Update begin for first item: thread 0 records offset 0 for its first item's bin (presumably flag_op is not invoked for the tile's first item)
+        if (linear_tid == 0) temp_storage.run_begin[items[0]] = 0;
+
+        __syncthreads();    // all run boundaries must be written before they are read below
+
+        // Composite into histogram: each thread handles bins linear_tid, linear_tid + BLOCK_THREADS, ...
+        histo_offset = 0;
+
+        #pragma unroll
+        for(; histo_offset + BLOCK_THREADS <= BINS; histo_offset += BLOCK_THREADS)
+        {
+            int thread_offset = histo_offset + linear_tid;
+            HistoCounter count = temp_storage.run_end[thread_offset] - temp_storage.run_begin[thread_offset];
+            histogram[thread_offset] += count;
+        }
+        // Finish up with guarded composition if necessary (BINS is not a multiple of BLOCK_THREADS)
+        if ((BINS % BLOCK_THREADS != 0) && (histo_offset + linear_tid < BINS))
+        {
+            int thread_offset = histo_offset + linear_tid;
+            HistoCounter count = temp_storage.run_end[thread_offset] - temp_storage.run_begin[thread_offset];
+            histogram[thread_offset] += count;
+        }
+    }
+
+};
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
--- a/lib/kokkos/TPL/cub/block/specializations/block_reduce_raking.cuh
+++ b/lib/kokkos/TPL/cub/block/specializations/block_reduce_raking.cuh
@ -0,0 +1,214 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2013, NVIDIA CORPORATION.  All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::BlockReduceRaking provides raking-based methods of parallel reduction across a CUDA threadblock
+ */
+
+#pragma once
+
+#include "../../block/block_raking_layout.cuh"
+#include "../../warp/warp_reduce.cuh"
+#include "../../thread/thread_reduce.cuh"
+#include "../../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/**
+ * \brief BlockReduceRaking provides raking-based methods of parallel reduction across a CUDA threadblock
+ *
+ * \par
+ * Each thread places its partial into a shared-memory raking grid.  The first
+ * RAKING_THREADS threads then serially reduce one grid segment apiece and finish
+ * with a warp-synchronous reduction.  When every thread of the block is a raking
+ * thread (WARP_SYNCHRONOUS), the grid is bypassed and the warp-synchronous
+ * reduction is applied to the partials directly.
+ */
+template <
+    typename    T,              ///< Data type being reduced
+    int         BLOCK_THREADS>  ///< The thread block size in threads
+struct BlockReduceRaking
+{
+    /// Layout type for padded threadblock raking grid
+    typedef BlockRakingLayout<T, BLOCK_THREADS, 1> BlockRakingLayout;
+
+    ///  WarpReduce utility type
+    typedef typename WarpReduce<T, 1, BlockRakingLayout::RAKING_THREADS>::InternalWarpReduce WarpReduce;
+
+    /// Constants
+    enum
+    {
+        /// Number of raking threads
+        RAKING_THREADS = BlockRakingLayout::RAKING_THREADS,
+
+        /// Number of raking elements per warp synchronous raking thread
+        SEGMENT_LENGTH = BlockRakingLayout::SEGMENT_LENGTH,
+
+        /// Cooperative work can be entirely warp synchronous
+        WARP_SYNCHRONOUS = (RAKING_THREADS == BLOCK_THREADS),
+
+        /// Whether or not warp-synchronous reduction should be unguarded (i.e., the number of warp-reduction elements is a power of two)
+        WARP_SYNCHRONOUS_UNGUARDED = ((RAKING_THREADS & (RAKING_THREADS - 1)) == 0),
+
+        /// Whether or not accesses into smem are unguarded
+        RAKING_UNGUARDED = BlockRakingLayout::UNGUARDED
+    };
+
+
+    /// Shared memory storage layout type
+    struct _TempStorage
+    {
+        typename WarpReduce::TempStorage            warp_storage;        ///< Storage for warp-synchronous reduction
+        typename BlockRakingLayout::TempStorage     raking_grid;         ///< Padded threadblock raking grid
+    };
+
+
+    /// Alias wrapper allowing storage to be unioned
+    struct TempStorage : Uninitialized<_TempStorage> {};
+
+
+    // Thread fields
+    _TempStorage &temp_storage;     ///< Aliased shared storage
+    int linear_tid;                 ///< Caller-supplied linear thread id
+
+
+    /// Constructor
+    __device__ __forceinline__ BlockReduceRaking(
+        TempStorage &temp_storage,
+        int linear_tid)
+    :
+        temp_storage(temp_storage.Alias()),
+        linear_tid(linear_tid)
+    {}
+
+
+    /// Serially reduces the calling raking thread's segment of the shared raking grid and returns the segment partial.  Must only be called by threads with linear_tid < RAKING_THREADS, after the grid has been populated and a barrier has been passed.
+    template <
+        bool                FULL_TILE,
+        typename            ReductionOp>
+    __device__ __forceinline__ T RakeSegment(
+        ReductionOp         &reduction_op,      ///< [in] Binary reduction operator (taken by reference so the caller's functor instance is the one invoked)
+        int                 num_valid)          ///< [in] Number of valid elements (may be less than BLOCK_THREADS)
+    {
+        T *raking_segment = BlockRakingLayout::RakingPtr(temp_storage.raking_grid, linear_tid);
+
+        // The first element seeds the partial without a range check
+        T partial = raking_segment[0];
+
+        #pragma unroll
+        for (int ITEM = 1; ITEM < SEGMENT_LENGTH; ITEM++)
+        {
+            // Update partial if addend is in range (the check is skipped for full, unguarded tiles)
+            if ((FULL_TILE && RAKING_UNGUARDED) || ((linear_tid * SEGMENT_LENGTH) + ITEM < num_valid))
+            {
+                partial = reduction_op(partial, raking_segment[ITEM]);
+            }
+        }
+
+        return partial;
+    }
+
+
+    /// Computes a threadblock-wide reduction using addition (+) as the reduction operator. The first num_valid threads each contribute one reduction partial.  The return value is only valid for thread<sub>0</sub>.
+    template <bool FULL_TILE>
+    __device__ __forceinline__ T Sum(
+        T                   partial,            ///< [in] Calling thread's input partial reductions
+        int                 num_valid)          ///< [in] Number of valid elements (may be less than BLOCK_THREADS)
+    {
+        cub::Sum reduction_op;
+
+        if (WARP_SYNCHRONOUS)
+        {
+            // Short-circuit directly to warp synchronous reduction (unguarded if active threads is a power-of-two)
+            partial = WarpReduce(temp_storage.warp_storage, 0, linear_tid).template Sum<FULL_TILE, SEGMENT_LENGTH>(
+                partial,
+                num_valid);
+        }
+        else
+        {
+            // Place partial into shared memory grid.
+            *BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid) = partial;
+
+            // Every partial must be in the grid before any raking thread reads its segment
+            __syncthreads();
+
+            // Reduce parallelism to one warp
+            if (linear_tid < RAKING_THREADS)
+            {
+                // Raking reduction in grid
+                partial = RakeSegment<FULL_TILE>(reduction_op, num_valid);
+
+                partial = WarpReduce(temp_storage.warp_storage, 0, linear_tid).template Sum<FULL_TILE && RAKING_UNGUARDED, SEGMENT_LENGTH>(
+                    partial,
+                    num_valid);
+            }
+        }
+
+        return partial;
+    }
+
+
+    /// Computes a threadblock-wide reduction using the specified reduction operator. The first num_valid threads each contribute one reduction partial.  The return value is only valid for thread<sub>0</sub>.
+    template <
+        bool                FULL_TILE,
+        typename            ReductionOp>
+    __device__ __forceinline__ T Reduce(
+        T                   partial,            ///< [in] Calling thread's input partial reductions
+        int                 num_valid,          ///< [in] Number of valid elements (may be less than BLOCK_THREADS)
+        ReductionOp         reduction_op)       ///< [in] Binary reduction operator
+    {
+        if (WARP_SYNCHRONOUS)
+        {
+            // Short-circuit directly to warp synchronous reduction (unguarded if active threads is a power-of-two)
+            partial = WarpReduce(temp_storage.warp_storage, 0, linear_tid).template Reduce<FULL_TILE, SEGMENT_LENGTH>(
+                partial,
+                num_valid,
+                reduction_op);
+        }
+        else
+        {
+            // Place partial into shared memory grid.
+            *BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid) = partial;
+
+            // Every partial must be in the grid before any raking thread reads its segment
+            __syncthreads();
+
+            // Reduce parallelism to one warp
+            if (linear_tid < RAKING_THREADS)
+            {
+                // Raking reduction in grid
+                partial = RakeSegment<FULL_TILE>(reduction_op, num_valid);
+
+                partial = WarpReduce(temp_storage.warp_storage, 0, linear_tid).template Reduce<FULL_TILE && RAKING_UNGUARDED, SEGMENT_LENGTH>(
+                    partial,
+                    num_valid,
+                    reduction_op);
+            }
+        }
+
+        return partial;
+    }
+
+};
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
--- a/lib/kokkos/TPL/cub/block/specializations/block_reduce_warp_reductions.cuh
+++ b/lib/kokkos/TPL/cub/block/specializations/block_reduce_warp_reductions.cuh
@ -0,0 +1,198 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2013, NVIDIA CORPORATION.  All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::BlockReduceWarpReductions provides variants of warp-reduction-based parallel reduction across a CUDA threadblock
+ */
+
+#pragma once
+
+#include "../../warp/warp_reduce.cuh"
+#include "../../util_arch.cuh"
+#include "../../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/**
+ * \brief BlockReduceWarpReductions provides variants of warp-reduction-based parallel reduction across a CUDA threadblock
+ *
+ * \par
+ * Every warp first reduces its own inputs.  Lane 0 of each warp then publishes
+ * the warp aggregate to shared memory, and thread 0 folds the published
+ * aggregates together serially.
+ */
+template <
+    typename    T,              ///< Data type being reduced
+    int         BLOCK_THREADS>  ///< The thread block size in threads
+struct BlockReduceWarpReductions
+{
+    /// Constants
+    enum
+    {
+        /// Number of active warps
+        WARPS = (BLOCK_THREADS + PtxArchProps::WARP_THREADS - 1) / PtxArchProps::WARP_THREADS,
+
+        /// The logical warp size for warp reductions
+        LOGICAL_WARP_SIZE = CUB_MIN(BLOCK_THREADS, PtxArchProps::WARP_THREADS),
+
+        /// Whether or not the logical warp size evenly divides the threadblock size
+        EVEN_WARP_MULTIPLE = (BLOCK_THREADS % LOGICAL_WARP_SIZE == 0)
+    };
+
+
+    ///  WarpReduce utility type
+    typedef typename WarpReduce<T, WARPS, LOGICAL_WARP_SIZE>::InternalWarpReduce WarpReduce;
+
+
+    /// Shared memory storage layout type
+    struct _TempStorage
+    {
+        typename WarpReduce::TempStorage    warp_reduce;                ///< Buffer for warp-synchronous reduction
+        T                                   warp_aggregates[WARPS];     ///< Shared totals from each warp-synchronous reduction
+        T                                   block_prefix;               ///< Not referenced by any method of this struct; kept so the storage layout is unchanged
+    };
+
+    /// Alias wrapper allowing storage to be unioned
+    struct TempStorage : Uninitialized<_TempStorage> {};
+
+
+    // Thread fields
+    _TempStorage &temp_storage;     ///< Aliased shared storage
+    int linear_tid;                 ///< Caller-supplied linear thread id
+    int warp_id;                    ///< Index of the calling thread's warp (0 when the block fits in one warp)
+    int lane_id;                    ///< Index of the calling thread within its warp
+
+
+    /// Constructor
+    __device__ __forceinline__ BlockReduceWarpReductions(
+        TempStorage &temp_storage,
+        int linear_tid)
+    :
+        temp_storage(temp_storage.Alias()),
+        linear_tid(linear_tid),
+        warp_id((BLOCK_THREADS <= PtxArchProps::WARP_THREADS) ?
+            0 :
+            linear_tid / PtxArchProps::WARP_THREADS),
+        lane_id((BLOCK_THREADS <= PtxArchProps::WARP_THREADS) ?
+            linear_tid :
+            linear_tid % PtxArchProps::WARP_THREADS)
+    {}
+
+
+    /// Returns block-wide aggregate in <em>thread</em><sub>0</sub>.  Contains a block-wide barrier, so every thread of the block must call it.
+    template <
+        bool                FULL_TILE,
+        typename            ReductionOp>
+    __device__ __forceinline__ T ApplyWarpAggregates(
+        ReductionOp         reduction_op,       ///< [in] Binary reduction operator
+        T                   warp_aggregate,     ///< [in] <b>[<em>lane</em><sub>0</sub>s only]</b> Warp-wide aggregate reduction of input items
+        int                 num_valid)          ///< [in] Number of valid elements (may be less than BLOCK_THREADS)
+    {
+        // Share lane aggregates
+        if (lane_id == 0)
+        {
+            temp_storage.warp_aggregates[warp_id] = warp_aggregate;
+        }
+
+        // All warp aggregates must be published before thread 0 reads them
+        __syncthreads();
+
+        // Update total aggregate in warp 0, lane 0
+        if (linear_tid == 0)
+        {
+            #pragma unroll
+            for (int SUCCESSOR_WARP = 1; SUCCESSOR_WARP < WARPS; SUCCESSOR_WARP++)
+            {
+                // Skip warps whose first element lies beyond the valid range
+                if (FULL_TILE || (SUCCESSOR_WARP * LOGICAL_WARP_SIZE < num_valid))
+                {
+                    warp_aggregate = reduction_op(warp_aggregate, temp_storage.warp_aggregates[SUCCESSOR_WARP]);
+                }
+            }
+        }
+
+        return warp_aggregate;
+    }
+
+
+    /// Computes a threadblock-wide reduction using addition (+) as the reduction operator. The first num_valid threads each contribute one reduction partial.  The return value is only valid for thread<sub>0</sub>.
+    template <bool FULL_TILE>
+    __device__ __forceinline__ T Sum(
+        T                   input,          ///< [in] Calling thread's input partial reductions
+        int                 num_valid)      ///< [in] Number of valid elements (may be less than BLOCK_THREADS)
+    {
+        cub::Sum     reduction_op;
+
+        // Number of valid elements that fall inside the calling thread's warp
+        unsigned int    warp_offset = warp_id * LOGICAL_WARP_SIZE;
+        unsigned int    warp_num_valid = (FULL_TILE && EVEN_WARP_MULTIPLE) ?
+                            LOGICAL_WARP_SIZE :
+                            (warp_offset < num_valid) ?
+                                num_valid - warp_offset :
+                                0;
+
+        // Warp reduction in every warp
+        T warp_aggregate = WarpReduce(temp_storage.warp_reduce, warp_id, lane_id).template Sum<(FULL_TILE && EVEN_WARP_MULTIPLE), 1>(
+            input,
+            warp_num_valid);
+
+        // Combine the warp-wide aggregates from lane-0s into the block-wide aggregate
+        return ApplyWarpAggregates<FULL_TILE>(reduction_op, warp_aggregate, num_valid);
+    }
+
+
+    /// Computes a threadblock-wide reduction using the specified reduction operator. The first num_valid threads each contribute one reduction partial.  The return value is only valid for thread<sub>0</sub>.
+    template <
+        bool                FULL_TILE,
+        typename            ReductionOp>
+    __device__ __forceinline__ T Reduce(
+        T                   input,              ///< [in] Calling thread's input partial reductions
+        int                 num_valid,          ///< [in] Number of valid elements (may be less than BLOCK_THREADS)
+        ReductionOp         reduction_op)       ///< [in] Binary reduction operator
+    {
+        // Use the warp_id member computed in the constructor, exactly as Sum() and
+        // ApplyWarpAggregates() do, rather than a shadowing local recomputation.
+
+        // Number of valid elements that fall inside the calling thread's warp
+        unsigned int    warp_offset = warp_id * LOGICAL_WARP_SIZE;
+        unsigned int    warp_num_valid = (FULL_TILE && EVEN_WARP_MULTIPLE) ?
+                            LOGICAL_WARP_SIZE :
+                            (warp_offset < num_valid) ?
+                                num_valid - warp_offset :
+                                0;
+
+        // Warp reduction in every warp
+        T warp_aggregate = WarpReduce(temp_storage.warp_reduce, warp_id, lane_id).template Reduce<(FULL_TILE && EVEN_WARP_MULTIPLE), 1>(
+            input,
+            warp_num_valid,
+            reduction_op);
+
+        // Combine the warp-wide aggregates from lane-0s into the block-wide aggregate
+        return ApplyWarpAggregates<FULL_TILE>(reduction_op, warp_aggregate, num_valid);
+    }
+
+};
+
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
--- a/lib/kokkos/TPL/cub/block/specializations/block_scan_raking.cuh
+++ b/lib/kokkos/TPL/cub/block/specializations/block_scan_raking.cuh
@ -0,0 +1,761 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2013, NVIDIA CORPORATION.  All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+
+/**
+ * \file
+ * cub::BlockScanRaking provides variants of raking-based parallel prefix scan across a CUDA threadblock.
+ */
+
+#pragma once
+
+#include "../../util_arch.cuh"
+#include "../../block/block_raking_layout.cuh"
+#include "../../thread/thread_reduce.cuh"
+#include "../../thread/thread_scan.cuh"
+#include "../../warp/warp_scan.cuh"
+#include "../../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/**
+ * \brief BlockScanRaking provides variants of raking-based parallel prefix scan across a CUDA threadblock.
+ */
+template <
+    typename            T,              ///< Data type being scanned
+    int                 BLOCK_THREADS,  ///< The thread block size in threads
+    bool                MEMOIZE>        ///< Whether or not to buffer outer raking scan partials to incur fewer shared memory reads at the expense of higher register pressure
+struct BlockScanRaking
+{
+    /// Layout type for the padded threadblock raking grid (supplies RakingPtr / PlacementPtr and the grid's TempStorage)
+    typedef BlockRakingLayout<T, BLOCK_THREADS> BlockRakingLayout;
+
+    /// Compile-time constants
+    enum
+    {
+        /// Number of active warps (BLOCK_THREADS divided by the warp width, rounded up)
+        WARPS = (BLOCK_THREADS + PtxArchProps::WARP_THREADS - 1) / PtxArchProps::WARP_THREADS,
+
+        /// Number of raking threads (taken from the raking layout)
+        RAKING_THREADS = BlockRakingLayout::RAKING_THREADS,
+
+        /// Number of raking elements per warp synchronous raking thread (taken from the raking layout)
+        SEGMENT_LENGTH = BlockRakingLayout::SEGMENT_LENGTH,
+
+        /// Cooperative work can be entirely warp synchronous (every thread is a raking thread), so scans forward directly to WarpScan
+        WARP_SYNCHRONOUS = (BLOCK_THREADS == RAKING_THREADS),
+    };
+
+    /// WarpScan utility type used to scan across the raking threads
+    typedef WarpScan<T, 1, RAKING_THREADS> WarpScan;
+
+    /// Shared memory storage layout type
+    struct _TempStorage
+    {
+        typename WarpScan::TempStorage              warp_scan;          ///< Buffer for warp-synchronous scan
+        typename BlockRakingLayout::TempStorage     raking_grid;        ///< Padded threadblock raking grid
+        T                                           block_aggregate;    ///< Block aggregate (written during the raking phase, read by all threads after the barrier)
+    };
+
+
+    /// Alias wrapper allowing storage to be unioned
+    struct TempStorage : Uninitialized<_TempStorage> {};
+
+
+    // Per-thread fields
+    _TempStorage    &temp_storage;                      // Reference to the aliased temporary storage
+    int             linear_tid;                         // Linear thread identifier supplied to the constructor
+    T               cached_segment[SEGMENT_LENGTH];     // Per-thread copy of this thread's raking segment (only used when MEMOIZE)
+
+
+    /// Constructor.  Binds the calling thread to the given temporary storage and records its linear thread id; \p cached_segment is left uninitialized.
+    __device__ __forceinline__ BlockScanRaking(
+        TempStorage &temp_storage,      ///< [in] Temporary storage; accessed through its Alias()
+        int linear_tid)                 ///< [in] Calling thread's linear identifier
+    :
+        temp_storage(temp_storage.Alias()),
+        linear_tid(linear_tid)
+    {}
+
+    /// Raking upsweep: serially reduces the calling raking thread's segment of the raking grid with \p scan_op and returns that segment's partial.  When MEMOIZE is set, the segment is also copied into \p cached_segment.
+    template <typename ScanOp>
+    __device__ __forceinline__ T Upsweep(
+        ScanOp scan_op)
+    {
+        T *smem_segment = BlockRakingLayout::RakingPtr(temp_storage.raking_grid, linear_tid);
+
+        if (MEMOIZE)
+        {
+            // Snapshot the whole segment into the per-thread cache
+            #pragma unroll
+            for (int item = 0; item < SEGMENT_LENGTH; item++)
+            {
+                cached_segment[item] = smem_segment[item];
+            }
+        }
+
+        // Reduce from the cached copy when memoizing, otherwise straight from the raking grid
+        T *segment = (MEMOIZE) ?
+            cached_segment :
+            smem_segment;
+
+        T segment_total = segment[0];
+
+        #pragma unroll
+        for (int item = 1; item < SEGMENT_LENGTH; item++)
+        {
+            // Unless the layout is UNGUARDED, ignore grid slots at or beyond BLOCK_THREADS
+            if ((BlockRakingLayout::UNGUARDED) || (((linear_tid * SEGMENT_LENGTH) + item) < BLOCK_THREADS))
+            {
+                segment_total = scan_op(segment_total, segment[item]);
+            }
+        }
+
+        return segment_total;
+    }
+
+
+    /// Raking downsweep (exclusive): runs ThreadScanExclusive in place over the calling raking thread's segment, forwarding \p raking_partial and \p apply_prefix, and leaves the results in the raking grid
+    template <typename ScanOp>
+    __device__ __forceinline__ void ExclusiveDownsweep(
+        ScanOp          scan_op,
+        T               raking_partial,
+        bool            apply_prefix = true)
+    {
+        T *smem_segment = BlockRakingLayout::RakingPtr(temp_storage.raking_grid, linear_tid);
+        T *segment = smem_segment;
+        if (MEMOIZE)
+        {
+            segment = cached_segment;   // Scan the copy cached by the upsweep
+        }
+        ThreadScanExclusive<SEGMENT_LENGTH>(segment, segment, scan_op, raking_partial, apply_prefix);
+
+        if (MEMOIZE)
+        {
+            // Write the scanned cache back to the raking grid
+            #pragma unroll
+            for (int item = 0; item < SEGMENT_LENGTH; item++)
+            {
+                smem_segment[item] = cached_segment[item];
+            }
+        }
+    }
+
+
+    /// Raking downsweep (inclusive): runs ThreadScanInclusive in place over the calling raking thread's segment, forwarding \p raking_partial and \p apply_prefix, and leaves the results in the raking grid
+    template <typename ScanOp>
+    __device__ __forceinline__ void InclusiveDownsweep(
+        ScanOp          scan_op,
+        T               raking_partial,
+        bool            apply_prefix = true)
+    {
+        T *grid_segment = BlockRakingLayout::RakingPtr(temp_storage.raking_grid, linear_tid);
+        T *work_segment = grid_segment;
+        if (MEMOIZE)
+        {
+            work_segment = cached_segment;  // Operate on the copy cached by the upsweep
+        }
+        ThreadScanInclusive<SEGMENT_LENGTH>(work_segment, work_segment, scan_op, raking_partial, apply_prefix);
+
+        if (MEMOIZE)
+        {
+            // Store the scanned cache back into the raking grid
+            #pragma unroll
+            for (int slot = 0; slot < SEGMENT_LENGTH; slot++)
+            {
+                grid_segment[slot] = cached_segment[slot];
+            }
+        }
+    }
+
+
+    /// Computes an exclusive threadblock-wide prefix scan using the specified binary \p scan_op functor and the supplied \p identity value.  Each thread contributes one input element.  Also provides every thread with the block-wide \p block_aggregate of all inputs.  Unless WARP_SYNCHRONOUS, this executes __syncthreads() and must therefore be reached by every thread in the block.
+    template <typename ScanOp>
+    __device__ __forceinline__ void ExclusiveScan(
+        T               input,              ///< [in] Calling thread's input item
+        T               &output,            ///< [out] Calling thread's output item (may be aliased to \p input)
+        const T         &identity,          ///< [in] Identity value
+        ScanOp          scan_op,            ///< [in] Binary scan operator
+        T               &block_aggregate)   ///< [out] Threadblock-wide aggregate reduction of input items
+    {
+        if (WARP_SYNCHRONOUS)
+        {
+            // Every thread is a raking thread: short-circuit directly to warp scan
+            WarpScan(temp_storage.warp_scan, 0, linear_tid).ExclusiveScan(
+                input,
+                output,
+                identity,
+                scan_op,
+                block_aggregate);
+        }
+        else
+        {
+            // Place thread partial into shared memory raking grid
+            T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid);
+            *placement_ptr = input;
+
+            __syncthreads();    // All partials must be in the grid before raking starts
+
+            // Reduce parallelism down to just raking threads
+            if (linear_tid < RAKING_THREADS)
+            {
+                // Raking upsweep reduction in grid
+                T raking_partial = Upsweep(scan_op);
+
+                // Exclusive warp synchronous scan of the raking partials; the aggregate is stored in shared memory for the whole block
+                WarpScan(temp_storage.warp_scan, 0, linear_tid).ExclusiveScan(
+                    raking_partial,
+                    raking_partial,
+                    identity,
+                    scan_op,
+                    temp_storage.block_aggregate);
+
+                // Exclusive raking downsweep scan
+                ExclusiveDownsweep(scan_op, raking_partial);
+            }
+
+            __syncthreads();    // Scanned grid and aggregate must be complete before they are read back
+
+            // Grab thread prefix from shared memory
+            output = *placement_ptr;
+
+            // Retrieve block aggregate
+            block_aggregate = temp_storage.block_aggregate;
+        }
+    }
+
+
+    /// Computes an exclusive threadblock-wide prefix scan using the specified binary \p scan_op functor and the supplied \p identity value.  Each thread contributes one input element.  The call-back functor \p block_prefix_op is invoked by the first warp in the block, and the value returned by <em>lane</em><sub>0</sub> in that warp is used as the "seed" value that logically prefixes the threadblock's scan inputs.  Also provides every thread with the block-wide \p block_aggregate of all inputs.  Unless WARP_SYNCHRONOUS, this executes __syncthreads() and must therefore be reached by every thread in the block.
+    template <
+        typename        ScanOp,
+        typename        BlockPrefixOp>
+    __device__ __forceinline__ void ExclusiveScan(
+        T               input,                          ///< [in] Calling thread's input item
+        T               &output,                        ///< [out] Calling thread's output item (may be aliased to \p input)
+        T               identity,                       ///< [in] Identity value
+        ScanOp          scan_op,                        ///< [in] Binary scan operator
+        T               &block_aggregate,               ///< [out] Threadblock-wide aggregate reduction of input items (exclusive of the \p block_prefix_op value)
+        BlockPrefixOp   &block_prefix_op)               ///< [in-out] <b>[<em>warp</em><sub>0</sub> only]</b> Call-back functor for specifying a threadblock-wide prefix to be applied to all inputs.
+    {
+        if (WARP_SYNCHRONOUS)
+        {
+            // Every thread is a raking thread: short-circuit directly to warp scan
+            WarpScan(temp_storage.warp_scan, 0, linear_tid).ExclusiveScan(
+                input,
+                output,
+                identity,
+                scan_op,
+                block_aggregate,
+                block_prefix_op);
+        }
+        else
+        {
+            // Place thread partial into shared memory raking grid
+            T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid);
+            *placement_ptr = input;
+
+            __syncthreads();    // All partials must be in the grid before raking starts
+
+            // Reduce parallelism down to just raking threads
+            if (linear_tid < RAKING_THREADS)
+            {
+                // Raking upsweep reduction in grid
+                T raking_partial = Upsweep(scan_op);
+
+                // Exclusive warp synchronous scan of the raking partials (the prefix functor is handed to the warp scan); the aggregate is stored in shared memory
+                WarpScan(temp_storage.warp_scan, 0, linear_tid).ExclusiveScan(
+                    raking_partial,
+                    raking_partial,
+                    identity,
+                    scan_op,
+                    temp_storage.block_aggregate,
+                    block_prefix_op);
+
+                // Exclusive raking downsweep scan
+                ExclusiveDownsweep(scan_op, raking_partial);
+            }
+
+            __syncthreads();    // Scanned grid and aggregate must be complete before they are read back
+
+            // Grab thread prefix from shared memory
+            output = *placement_ptr;
+
+            // Retrieve block aggregate
+            block_aggregate = temp_storage.block_aggregate;
+        }
+    }
+
+
+    /// Computes an exclusive threadblock-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.  Also provides every thread with the block-wide \p block_aggregate of all inputs.  With no identity value, the output computed for <em>thread</em><sub>0</sub> is undefined.  Unless WARP_SYNCHRONOUS, this executes __syncthreads() and must therefore be reached by every thread in the block.
+    template <typename ScanOp>
+    __device__ __forceinline__ void ExclusiveScan(
+        T               input,                          ///< [in] Calling thread's input item
+        T               &output,                        ///< [out] Calling thread's output item (may be aliased to \p input)
+        ScanOp          scan_op,                        ///< [in] Binary scan operator
+        T               &block_aggregate)               ///< [out] Threadblock-wide aggregate reduction of input items
+    {
+        if (WARP_SYNCHRONOUS)
+        {
+            // Every thread is a raking thread: short-circuit directly to warp scan
+            WarpScan(temp_storage.warp_scan, 0, linear_tid).ExclusiveScan(
+                input,
+                output,
+                scan_op,
+                block_aggregate);
+        }
+        else
+        {
+            // Place thread partial into shared memory raking grid
+            T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid);
+            *placement_ptr = input;
+
+            __syncthreads();    // All partials must be in the grid before raking starts
+
+            // Reduce parallelism down to just raking threads
+            if (linear_tid < RAKING_THREADS)
+            {
+                // Raking upsweep reduction in grid
+                T raking_partial = Upsweep(scan_op);
+
+                // Exclusive warp synchronous scan of the raking partials; the aggregate is stored in shared memory for the whole block
+                WarpScan(temp_storage.warp_scan, 0, linear_tid).ExclusiveScan(
+                    raking_partial,
+                    raking_partial,
+                    scan_op,
+                    temp_storage.block_aggregate);
+
+                // Exclusive raking downsweep scan (thread 0 is told not to apply a prefix: without an identity it has none)
+                ExclusiveDownsweep(scan_op, raking_partial, (linear_tid != 0));
+            }
+
+            __syncthreads();    // Scanned grid and aggregate must be complete before they are read back
+
+            // Grab thread prefix from shared memory
+            output = *placement_ptr;
+
+            // Retrieve block aggregate
+            block_aggregate = temp_storage.block_aggregate;
+        }
+    }
+
+
+    /// Computes an exclusive threadblock-wide prefix scan using the specified binary \p scan_op functor (no identity value is required).  Each thread contributes one input element.  The call-back functor \p block_prefix_op is invoked by the first warp in the block, and the value returned by <em>lane</em><sub>0</sub> in that warp is used as the "seed" value that logically prefixes the threadblock's scan inputs.  Also provides every thread with the block-wide \p block_aggregate of all inputs.  Unless WARP_SYNCHRONOUS, this executes __syncthreads() and must therefore be reached by every thread in the block.
+    template <
+        typename ScanOp,
+        typename BlockPrefixOp>
+    __device__ __forceinline__ void ExclusiveScan(
+        T               input,                          ///< [in] Calling thread's input item
+        T               &output,                        ///< [out] Calling thread's output item (may be aliased to \p input)
+        ScanOp          scan_op,                        ///< [in] Binary scan operator
+        T               &block_aggregate,               ///< [out] Threadblock-wide aggregate reduction of input items (exclusive of the \p block_prefix_op value)
+        BlockPrefixOp   &block_prefix_op)               ///< [in-out] <b>[<em>warp</em><sub>0</sub> only]</b> Call-back functor for specifying a threadblock-wide prefix to be applied to all inputs.
+    {
+        if (WARP_SYNCHRONOUS)
+        {
+            // Every thread is a raking thread: short-circuit directly to warp scan
+            WarpScan(temp_storage.warp_scan, 0, linear_tid).ExclusiveScan(
+                input,
+                output,
+                scan_op,
+                block_aggregate,
+                block_prefix_op);
+        }
+        else
+        {
+            // Place thread partial into shared memory raking grid
+            T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid);
+            *placement_ptr = input;
+
+            __syncthreads();    // All partials must be in the grid before raking starts
+
+            // Reduce parallelism down to just raking threads
+            if (linear_tid < RAKING_THREADS)
+            {
+                // Raking upsweep reduction in grid
+                T raking_partial = Upsweep(scan_op);
+
+                // Exclusive warp synchronous scan of the raking partials (the prefix functor is handed to the warp scan); the aggregate is stored in shared memory
+                WarpScan(temp_storage.warp_scan, 0, linear_tid).ExclusiveScan(
+                    raking_partial,
+                    raking_partial,
+                    scan_op,
+                    temp_storage.block_aggregate,
+                    block_prefix_op);
+
+                // Exclusive raking downsweep scan (all raking threads apply their prefix, unlike the variant without a prefix functor)
+                ExclusiveDownsweep(scan_op, raking_partial);
+            }
+
+            __syncthreads();    // Scanned grid and aggregate must be complete before they are read back
+
+            // Grab thread prefix from shared memory
+            output = *placement_ptr;
+
+            // Retrieve block aggregate
+            block_aggregate = temp_storage.block_aggregate;
+        }
+    }
+
+
+    /// Computes an exclusive threadblock-wide prefix scan using addition (+) as the scan operator.  Each thread contributes one input element.  Also provides every thread with the block-wide \p block_aggregate of all inputs.  Unless WARP_SYNCHRONOUS, this executes __syncthreads() and must therefore be reached by every thread in the block.
+    __device__ __forceinline__ void ExclusiveSum(
+        T               input,                          ///< [in] Calling thread's input item
+        T               &output,                        ///< [out] Calling thread's output item (may be aliased to \p input)
+        T               &block_aggregate)               ///< [out] Threadblock-wide aggregate reduction of input items
+    {
+        if (WARP_SYNCHRONOUS)
+        {
+            // Every thread is a raking thread: short-circuit directly to warp scan
+            WarpScan(temp_storage.warp_scan, 0, linear_tid).ExclusiveSum(
+                input,
+                output,
+                block_aggregate);
+        }
+        else
+        {
+            // Raking scan; the Sum functor drives the upsweep and downsweep
+            Sum scan_op;
+
+            // Place thread partial into shared memory raking grid
+            T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid);
+            *placement_ptr = input;
+
+            __syncthreads();    // All partials must be in the grid before raking starts
+
+            // Reduce parallelism down to just raking threads
+            if (linear_tid < RAKING_THREADS)
+            {
+                // Raking upsweep reduction in grid
+                T raking_partial = Upsweep(scan_op);
+
+                // Exclusive warp synchronous prefix sum of the raking partials; the aggregate is stored in shared memory for the whole block
+                WarpScan(temp_storage.warp_scan, 0, linear_tid).ExclusiveSum(
+                    raking_partial,
+                    raking_partial,
+                    temp_storage.block_aggregate);
+
+                // Exclusive raking downsweep scan
+                ExclusiveDownsweep(scan_op, raking_partial);
+            }
+
+            __syncthreads();    // Scanned grid and aggregate must be complete before they are read back
+
+            // Grab thread prefix from shared memory
+            output = *placement_ptr;
+
+            // Retrieve block aggregate
+            block_aggregate = temp_storage.block_aggregate;
+        }
+    }
+
+
+    /// Computes an exclusive threadblock-wide prefix scan using addition (+) as the scan operator.  Each thread contributes one input element.  Instead of using 0 as the threadblock-wide prefix, the call-back functor \p block_prefix_op is invoked by the first warp in the block, and the value returned by <em>lane</em><sub>0</sub> in that warp is used as the "seed" value that logically prefixes the threadblock's scan inputs.  Also provides every thread with the block-wide \p block_aggregate of all inputs.  Unless WARP_SYNCHRONOUS, this executes __syncthreads() and must therefore be reached by every thread in the block.
+    template <typename BlockPrefixOp>
+    __device__ __forceinline__ void ExclusiveSum(
+        T               input,                          ///< [in] Calling thread's input item
+        T               &output,                        ///< [out] Calling thread's output item (may be aliased to \p input)
+        T               &block_aggregate,               ///< [out] Threadblock-wide aggregate reduction of input items (exclusive of the \p block_prefix_op value)
+        BlockPrefixOp   &block_prefix_op)               ///< [in-out] <b>[<em>warp</em><sub>0</sub> only]</b> Call-back functor for specifying a threadblock-wide prefix to be applied to all inputs.
+    {
+        if (WARP_SYNCHRONOUS)
+        {
+            // Every thread is a raking thread: short-circuit directly to warp scan
+            WarpScan(temp_storage.warp_scan, 0, linear_tid).ExclusiveSum(
+                input,
+                output,
+                block_aggregate,
+                block_prefix_op);
+        }
+        else
+        {
+            // Raking scan; the Sum functor drives the upsweep and downsweep
+            Sum scan_op;
+
+            // Place thread partial into shared memory raking grid
+            T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid);
+            *placement_ptr = input;
+
+            __syncthreads();    // All partials must be in the grid before raking starts
+
+            // Reduce parallelism down to just raking threads
+            if (linear_tid < RAKING_THREADS)
+            {
+                // Raking upsweep reduction in grid
+                T raking_partial = Upsweep(scan_op);
+
+                // Exclusive warp synchronous prefix sum of the raking partials (the prefix functor is handed to the warp scan); the aggregate is stored in shared memory
+                WarpScan(temp_storage.warp_scan, 0, linear_tid).ExclusiveSum(
+                    raking_partial,
+                    raking_partial,
+                    temp_storage.block_aggregate,
+                    block_prefix_op);
+
+                // Exclusive raking downsweep scan
+                ExclusiveDownsweep(scan_op, raking_partial);
+            }
+
+            __syncthreads();    // Scanned grid and aggregate must be complete before they are read back
+
+            // Grab thread prefix from shared memory
+            output = *placement_ptr;
+
+            // Retrieve block aggregate
+            block_aggregate = temp_storage.block_aggregate;
+        }
+    }
+
+
+    /// Computes an inclusive threadblock-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.  Also provides every thread with the block-wide \p block_aggregate of all inputs.  Unless WARP_SYNCHRONOUS, this executes __syncthreads() and must therefore be reached by every thread in the block.
+    template <typename ScanOp>
+    __device__ __forceinline__ void InclusiveScan(
+        T               input,                          ///< [in] Calling thread's input item
+        T               &output,                        ///< [out] Calling thread's output item (may be aliased to \p input)
+        ScanOp          scan_op,                        ///< [in] Binary scan operator
+        T               &block_aggregate)               ///< [out] Threadblock-wide aggregate reduction of input items
+    {
+        if (WARP_SYNCHRONOUS)
+        {
+            // Every thread is a raking thread: short-circuit directly to warp scan
+            WarpScan(temp_storage.warp_scan, 0, linear_tid).InclusiveScan(
+                input,
+                output,
+                scan_op,
+                block_aggregate);
+        }
+        else
+        {
+            // Place thread partial into shared memory raking grid
+            T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid);
+            *placement_ptr = input;
+
+            __syncthreads();    // All partials must be in the grid before raking starts
+
+            // Reduce parallelism down to just raking threads
+            if (linear_tid < RAKING_THREADS)
+            {
+                // Raking upsweep reduction in grid
+                T raking_partial = Upsweep(scan_op);
+
+                // Exclusive warp synchronous scan of the raking partials (each becomes the prefix for its segment); the aggregate is stored in shared memory
+                WarpScan(temp_storage.warp_scan, 0, linear_tid).ExclusiveScan(
+                    raking_partial,
+                    raking_partial,
+                    scan_op,
+                    temp_storage.block_aggregate);
+
+                // Inclusive raking downsweep scan (thread 0 is told not to apply a prefix: without an identity it has none)
+                InclusiveDownsweep(scan_op, raking_partial, (linear_tid != 0));
+            }
+
+            __syncthreads();    // Scanned grid and aggregate must be complete before they are read back
+
+            // Grab thread's inclusive result from shared memory
+            output = *placement_ptr;
+
+            // Retrieve block aggregate
+            block_aggregate = temp_storage.block_aggregate;
+        }
+    }
+
+
+    /// Computes an inclusive threadblock-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.  The call-back functor \p block_prefix_op is invoked by the first warp in the block, and the value returned by <em>lane</em><sub>0</sub> in that warp is used as the "seed" value that logically prefixes the threadblock's scan inputs.  Also provides every thread with the block-wide \p block_aggregate of all inputs.  Unless WARP_SYNCHRONOUS, this executes __syncthreads() and must therefore be reached by every thread in the block.
+    template <
+        typename ScanOp,
+        typename BlockPrefixOp>
+    __device__ __forceinline__ void InclusiveScan(
+        T               input,                          ///< [in] Calling thread's input item
+        T               &output,                        ///< [out] Calling thread's output item (may be aliased to \p input)
+        ScanOp          scan_op,                        ///< [in] Binary scan operator
+        T               &block_aggregate,               ///< [out] Threadblock-wide aggregate reduction of input items (exclusive of the \p block_prefix_op value)
+        BlockPrefixOp   &block_prefix_op)               ///< [in-out] <b>[<em>warp</em><sub>0</sub> only]</b> Call-back functor for specifying a threadblock-wide prefix to be applied to all inputs.
+    {
+        if (WARP_SYNCHRONOUS)
+        {
+            // Every thread is a raking thread: short-circuit directly to warp scan
+            WarpScan(temp_storage.warp_scan, 0, linear_tid).InclusiveScan(
+                input,
+                output,
+                scan_op,
+                block_aggregate,
+                block_prefix_op);
+        }
+        else
+        {
+            // Place thread partial into shared memory raking grid
+            T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid);
+            *placement_ptr = input;
+
+            __syncthreads();    // All partials must be in the grid before raking starts
+
+            // Reduce parallelism down to just raking threads
+            if (linear_tid < RAKING_THREADS)
+            {
+                // Raking upsweep reduction in grid
+                T raking_partial = Upsweep(scan_op);
+
+                // Exclusive warp synchronous scan of the raking partials (the prefix functor is handed to the warp scan); the aggregate is stored in shared memory
+                WarpScan(temp_storage.warp_scan, 0, linear_tid).ExclusiveScan(
+                    raking_partial,
+                    raking_partial,
+                    scan_op,
+                    temp_storage.block_aggregate,
+                    block_prefix_op);
+
+                // Inclusive raking downsweep scan (all raking threads apply their prefix)
+                InclusiveDownsweep(scan_op, raking_partial);
+            }
+
+            __syncthreads();    // Scanned grid and aggregate must be complete before they are read back
+
+            // Grab thread's inclusive result from shared memory
+            output = *placement_ptr;
+
+            // Retrieve block aggregate
+            block_aggregate = temp_storage.block_aggregate;
+        }
+    }
+
+
+    /// Computes an inclusive threadblock-wide prefix scan using addition (+) as the scan operator.  Each thread contributes one input element.  Also provides every thread with the block-wide \p block_aggregate of all inputs.  Unless WARP_SYNCHRONOUS, this executes __syncthreads() and must therefore be reached by every thread in the block.
+    __device__ __forceinline__ void InclusiveSum(
+        T               input,                          ///< [in] Calling thread's input item
+        T               &output,                        ///< [out] Calling thread's output item (may be aliased to \p input)
+        T               &block_aggregate)               ///< [out] Threadblock-wide aggregate reduction of input items
+    {
+        if (WARP_SYNCHRONOUS)
+        {
+            // Every thread is a raking thread: short-circuit directly to warp scan
+            WarpScan(temp_storage.warp_scan, 0, linear_tid).InclusiveSum(
+                input,
+                output,
+                block_aggregate);
+        }
+        else
+        {
+            // Raking scan; the Sum functor drives the upsweep and downsweep
+            Sum scan_op;
+
+            // Place thread partial into shared memory raking grid
+            T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid);
+            *placement_ptr = input;
+
+            __syncthreads();    // All partials must be in the grid before raking starts
+
+            // Reduce parallelism down to just raking threads
+            if (linear_tid < RAKING_THREADS)
+            {
+                // Raking upsweep reduction in grid
+                T raking_partial = Upsweep(scan_op);
+
+                // Exclusive warp synchronous prefix sum of the raking partials (each becomes the prefix for its segment); the aggregate is stored in shared memory
+                WarpScan(temp_storage.warp_scan, 0, linear_tid).ExclusiveSum(
+                    raking_partial,
+                    raking_partial,
+                    temp_storage.block_aggregate);
+
+                // Inclusive raking downsweep scan (thread 0 is told not to apply a prefix)
+                InclusiveDownsweep(scan_op, raking_partial, (linear_tid != 0));
+            }
+
+            __syncthreads();    // Scanned grid and aggregate must be complete before they are read back
+
+            // Grab thread's inclusive result from shared memory
+            output = *placement_ptr;
+
+            // Retrieve block aggregate
+            block_aggregate = temp_storage.block_aggregate;
+        }
+    }
+
+
+    /// Computes an inclusive threadblock-wide prefix scan using addition (+) as the scan operator.  Each thread contributes one input element.  Instead of using 0 as the threadblock-wide prefix, the call-back functor \p block_prefix_op is invoked by the first warp in the block, and the value returned by <em>lane</em><sub>0</sub> in that warp is used as the "seed" value that logically prefixes the threadblock's scan inputs.  Also provides every thread with the block-wide \p block_aggregate of all inputs.  Unless WARP_SYNCHRONOUS, this executes __syncthreads() and must therefore be reached by every thread in the block.
+    template <typename BlockPrefixOp>
+    __device__ __forceinline__ void InclusiveSum(
+        T               input,                          ///< [in] Calling thread's input item
+        T               &output,                        ///< [out] Calling thread's output item (may be aliased to \p input)
+        T               &block_aggregate,               ///< [out] Threadblock-wide aggregate reduction of input items (exclusive of the \p block_prefix_op value)
+        BlockPrefixOp   &block_prefix_op)               ///< [in-out] <b>[<em>warp</em><sub>0</sub> only]</b> Call-back functor for specifying a threadblock-wide prefix to be applied to all inputs.
+    {
+        if (WARP_SYNCHRONOUS)
+        {
+            // Every thread is a raking thread: short-circuit directly to warp scan
+            WarpScan(temp_storage.warp_scan, 0, linear_tid).InclusiveSum(
+                input,
+                output,
+                block_aggregate,
+                block_prefix_op);
+        }
+        else
+        {
+            // Raking scan; the Sum functor drives the upsweep and downsweep
+            Sum scan_op;
+
+            // Place thread partial into shared memory raking grid
+            T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid);
+            *placement_ptr = input;
+
+            __syncthreads();    // All partials must be in the grid before raking starts
+
+            // Reduce parallelism down to just raking threads
+            if (linear_tid < RAKING_THREADS)
+            {
+                // Raking upsweep reduction in grid
+                T raking_partial = Upsweep(scan_op);
+
+                // Exclusive warp synchronous prefix sum of the raking partials (the prefix functor is handed to the warp scan); the aggregate is stored in shared memory
+                WarpScan(temp_storage.warp_scan, 0, linear_tid).ExclusiveSum(
+                    raking_partial,
+                    raking_partial,
+                    temp_storage.block_aggregate,
+                    block_prefix_op);
+
+                // Inclusive raking downsweep scan (all raking threads apply their prefix)
+                InclusiveDownsweep(scan_op, raking_partial);
+            }
+
+            __syncthreads();    // Scanned grid and aggregate must be complete before they are read back
+
+            // Grab thread's inclusive result from shared memory
+            output = *placement_ptr;
+
+            // Retrieve block aggregate
+            block_aggregate = temp_storage.block_aggregate;
+        }
+    }
+
+};
+
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
--- a/lib/kokkos/TPL/cub/block/specializations/block_scan_warp_scans.cuh
+++ b/lib/kokkos/TPL/cub/block/specializations/block_scan_warp_scans.cuh
@ -0,0 +1,342 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2013, NVIDIA CORPORATION.  All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::BlockScanWarpScans provides warpscan-based variants of parallel prefix scan across a CUDA threadblock.
+ */
+
+#pragma once
+
+#include "../../util_arch.cuh"
+#include "../../warp/warp_scan.cuh"
+#include "../../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+/**
+ * \brief BlockScanWarpScans provides warpscan-based variants of parallel prefix scan across a CUDA threadblock.
+ *
+ * \par Overview
+ * Every warp first scans its own inputs with WarpScan.  The per-warp aggregates
+ * are then exchanged through shared memory (one \p __syncthreads()) and each thread
+ * folds in the aggregates of the warps that precede its own.  The overloads that
+ * take a \p block_prefix_op additionally broadcast a threadblock-wide prefix
+ * through shared memory (a second \p __syncthreads()).
+ *
+ * \par
+ * Because every scan member executes at least one \p __syncthreads(), each call
+ * must be reached by all threads of the threadblock.
+ */
+template <
+    typename            T,                  ///< Data type being scanned
+    int                 BLOCK_THREADS>      ///< Number of threads in the threadblock
+struct BlockScanWarpScans
+{
+    /// Constants
+    enum
+    {
+        /// Number of active warps (BLOCK_THREADS divided by the warp width, rounded up)
+        WARPS = (BLOCK_THREADS + PtxArchProps::WARP_THREADS - 1) / PtxArchProps::WARP_THREADS,
+    };
+
+    /// WarpScan utility type.  The template is named with its namespace qualifier on purpose:
+    /// this member typedef re-uses the identifier \p WarpScan, and an unqualified use of a name
+    /// followed by a redeclaration of that name at class scope is ill-formed
+    /// (GCC reports "declaration ... changes meaning of 'WarpScan'").
+    typedef cub::WarpScan<T, WARPS, PtxArchProps::WARP_THREADS> WarpScan;
+
+    /// Shared memory storage layout type
+    struct _TempStorage
+    {
+        typename WarpScan::TempStorage      warp_scan;                  ///< Buffer for warp-synchronous scan
+        T                                   warp_aggregates[WARPS];     ///< Shared totals from each warp-synchronous scan
+        T                                   block_prefix;               ///< Shared prefix for the entire threadblock
+    };
+
+
+    /// Alias wrapper allowing storage to be unioned
+    struct TempStorage : Uninitialized<_TempStorage> {};
+
+
+    // Thread fields
+    _TempStorage &temp_storage;     ///< Reference to the aliased temporary storage
+    int linear_tid;                 ///< Linear thread identifier supplied by the caller
+    int warp_id;                    ///< Index of the calling thread's warp (always 0 when the block fits in a single warp)
+    int lane_id;                    ///< Index of the calling thread within its warp
+
+
+    /// Constructor.  Derives \p warp_id and \p lane_id from \p linear_tid; when the whole block fits in one warp the division/modulo is skipped.
+    __device__ __forceinline__ BlockScanWarpScans(
+        TempStorage &temp_storage,
+        int linear_tid)
+    :
+        temp_storage(temp_storage.Alias()),
+        linear_tid(linear_tid),
+        warp_id((BLOCK_THREADS <= PtxArchProps::WARP_THREADS) ?
+            0 :
+            linear_tid / PtxArchProps::WARP_THREADS),
+        lane_id((BLOCK_THREADS <= PtxArchProps::WARP_THREADS) ?
+            linear_tid :
+            linear_tid % PtxArchProps::WARP_THREADS)
+    {}
+
+
+    /// Update the calling thread's partial reduction with the warp-wide aggregates from preceding warps.  Also provides every thread with the block-wide aggregate.
+    template <typename ScanOp>
+    __device__ __forceinline__ void ApplyWarpAggregates(
+        T               &partial,           ///< [in,out] The calling thread's partial reduction
+        ScanOp          scan_op,            ///< [in] Binary scan operator
+        T               warp_aggregate,     ///< [in] <b>[<em>lane</em><sub>0</sub>s only]</b> Warp-wide aggregate reduction of input items
+        T               &block_aggregate,   ///< [out] Threadblock-wide aggregate reduction of input items
+        bool            lane_valid = true)  ///< [in] Whether or not the partial belonging to the current thread is valid
+    {
+        // Share lane aggregates
+        temp_storage.warp_aggregates[warp_id] = warp_aggregate;
+
+        // All warp aggregates must be visible before any thread starts accumulating them
+        __syncthreads();
+
+        block_aggregate = temp_storage.warp_aggregates[0];
+
+        // Walk the warps in order.  On entry to iteration WARP, block_aggregate holds the
+        // reduction of warps [0, WARP), which is exactly the prefix that warp WARP needs.
+        #pragma unroll
+        for (int WARP = 1; WARP < WARPS; WARP++)
+        {
+            if (warp_id == WARP)
+            {
+                partial = (lane_valid) ?
+                    scan_op(block_aggregate, partial) :     // fold it in our valid partial
+                    block_aggregate;                        // replace our invalid partial with the aggregate
+            }
+
+            block_aggregate = scan_op(block_aggregate, temp_storage.warp_aggregates[WARP]);
+        }
+    }
+
+
+    /// Invokes \p block_prefix_op in every thread of <em>warp</em><sub>0</sub>, publishes the value returned by <em>lane</em><sub>0</sub> through shared memory, and returns that value to every thread of the block.  Only <em>lane</em><sub>0</sub> writes the shared slot, so the published prefix is well defined even when the functor returns different values in different lanes.
+    template <typename BlockPrefixOp>
+    __device__ __forceinline__ T ShareBlockPrefix(
+        T               &block_aggregate,               ///< [in] Threadblock-wide aggregate handed to the call-back functor
+        BlockPrefixOp   &block_prefix_op)               ///< [in-out] <b>[<em>warp</em><sub>0</sub> only]</b> Call-back functor for specifying a threadblock-wide prefix
+    {
+        if (warp_id == 0)
+        {
+            // The whole first warp runs the functor, but a single lane publishes the result
+            T block_prefix = block_prefix_op(block_aggregate);
+            if (lane_id == 0)
+            {
+                temp_storage.block_prefix = block_prefix;
+            }
+        }
+
+        // Make the published prefix visible to the entire threadblock
+        __syncthreads();
+
+        return temp_storage.block_prefix;
+    }
+
+
+    /// Computes an exclusive threadblock-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.  Also provides every thread with the block-wide \p block_aggregate of all inputs.
+    template <typename ScanOp>
+    __device__ __forceinline__ void ExclusiveScan(
+        T               input,              ///< [in] Calling thread's input items
+        T               &output,            ///< [out] Calling thread's output items (may be aliased to \p input)
+        const T         &identity,          ///< [in] Identity value
+        ScanOp          scan_op,            ///< [in] Binary scan operator
+        T               &block_aggregate)   ///< [out] Threadblock-wide aggregate reduction of input items
+    {
+        T warp_aggregate;
+        WarpScan(temp_storage.warp_scan, warp_id, lane_id).ExclusiveScan(input, output, identity, scan_op, warp_aggregate);
+
+        // Update outputs and block_aggregate with warp-wide aggregates
+        ApplyWarpAggregates(output, scan_op, warp_aggregate, block_aggregate);
+    }
+
+
+    /// Computes an exclusive threadblock-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.  The call-back functor \p block_prefix_op is invoked by the first warp in the block, and the value returned by <em>lane</em><sub>0</sub> in that warp is used as the "seed" value that logically prefixes the threadblock's scan inputs.  Also provides every thread with the block-wide \p block_aggregate of all inputs.
+    template <
+        typename ScanOp,
+        typename BlockPrefixOp>
+    __device__ __forceinline__ void ExclusiveScan(
+        T               input,                          ///< [in] Calling thread's input item
+        T               &output,                        ///< [out] Calling thread's output item (may be aliased to \p input)
+        T               identity,                       ///< [in] Identity value
+        ScanOp          scan_op,                        ///< [in] Binary scan operator
+        T               &block_aggregate,               ///< [out] Threadblock-wide aggregate reduction of input items (exclusive of the \p block_prefix_op value)
+        BlockPrefixOp   &block_prefix_op)               ///< [in-out] <b>[<em>warp</em><sub>0</sub> only]</b> Call-back functor for specifying a threadblock-wide prefix to be applied to all inputs.
+    {
+        ExclusiveScan(input, output, identity, scan_op, block_aggregate);
+
+        // Compute and share threadblock prefix
+        T block_prefix = ShareBlockPrefix(block_aggregate, block_prefix_op);
+
+        // Incorporate threadblock prefix into outputs
+        output = scan_op(block_prefix, output);
+    }
+
+
+    /// Computes an exclusive threadblock-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.  Also provides every thread with the block-wide \p block_aggregate of all inputs.  With no identity value, the output computed for <em>thread</em><sub>0</sub> is undefined.
+    template <typename ScanOp>
+    __device__ __forceinline__ void ExclusiveScan(
+        T               input,                          ///< [in] Calling thread's input item
+        T               &output,                        ///< [out] Calling thread's output item (may be aliased to \p input)
+        ScanOp          scan_op,                        ///< [in] Binary scan operator
+        T               &block_aggregate)               ///< [out] Threadblock-wide aggregate reduction of input items
+    {
+        T warp_aggregate;
+        WarpScan(temp_storage.warp_scan, warp_id, lane_id).ExclusiveScan(input, output, scan_op, warp_aggregate);
+
+        // Update outputs and block_aggregate with warp-wide aggregates.  Lane 0 of each warp has
+        // no valid exclusive partial of its own, so it takes the preceding warps' aggregate verbatim.
+        ApplyWarpAggregates(output, scan_op, warp_aggregate, block_aggregate, (lane_id > 0));
+    }
+
+
+    /// Computes an exclusive threadblock-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.  The call-back functor \p block_prefix_op is invoked by the first warp in the block, and the value returned by <em>lane</em><sub>0</sub> in that warp is used as the "seed" value that logically prefixes the threadblock's scan inputs.  Also provides every thread with the block-wide \p block_aggregate of all inputs.
+    template <
+        typename ScanOp,
+        typename BlockPrefixOp>
+    __device__ __forceinline__ void ExclusiveScan(
+        T               input,                          ///< [in] Calling thread's input item
+        T               &output,                        ///< [out] Calling thread's output item (may be aliased to \p input)
+        ScanOp          scan_op,                        ///< [in] Binary scan operator
+        T               &block_aggregate,               ///< [out] Threadblock-wide aggregate reduction of input items (exclusive of the \p block_prefix_op value)
+        BlockPrefixOp   &block_prefix_op)               ///< [in-out] <b>[<em>warp</em><sub>0</sub> only]</b> Call-back functor for specifying a threadblock-wide prefix to be applied to all inputs.
+    {
+        ExclusiveScan(input, output, scan_op, block_aggregate);
+
+        // Compute and share threadblock prefix
+        T block_prefix = ShareBlockPrefix(block_aggregate, block_prefix_op);
+
+        // Incorporate threadblock prefix into outputs.  Thread 0 has no valid partial from the
+        // identity-less scan, so its output is the prefix itself.
+        output = (linear_tid == 0) ?
+            block_prefix :
+            scan_op(block_prefix, output);
+    }
+
+
+    /// Computes an exclusive threadblock-wide prefix scan using addition (+) as the scan operator.  Each thread contributes one input element.  Also provides every thread with the block-wide \p block_aggregate of all inputs.
+    __device__ __forceinline__ void ExclusiveSum(
+        T               input,                          ///< [in] Calling thread's input item
+        T               &output,                        ///< [out] Calling thread's output item (may be aliased to \p input)
+        T               &block_aggregate)               ///< [out] Threadblock-wide aggregate reduction of input items
+    {
+        T warp_aggregate;
+        WarpScan(temp_storage.warp_scan, warp_id, lane_id).ExclusiveSum(input, output, warp_aggregate);
+
+        // Update outputs and block_aggregate with warp-wide aggregates from lane-0s
+        ApplyWarpAggregates(output, Sum(), warp_aggregate, block_aggregate);
+    }
+
+
+    /// Computes an exclusive threadblock-wide prefix scan using addition (+) as the scan operator.  Each thread contributes one input element.  Instead of using 0 as the threadblock-wide prefix, the call-back functor \p block_prefix_op is invoked by the first warp in the block, and the value returned by <em>lane</em><sub>0</sub> in that warp is used as the "seed" value that logically prefixes the threadblock's scan inputs.  Also provides every thread with the block-wide \p block_aggregate of all inputs.
+    template <typename BlockPrefixOp>
+    __device__ __forceinline__ void ExclusiveSum(
+        T               input,                          ///< [in] Calling thread's input item
+        T               &output,                        ///< [out] Calling thread's output item (may be aliased to \p input)
+        T               &block_aggregate,               ///< [out] Threadblock-wide aggregate reduction of input items (exclusive of the \p block_prefix_op value)
+        BlockPrefixOp   &block_prefix_op)               ///< [in-out] <b>[<em>warp</em><sub>0</sub> only]</b> Call-back functor for specifying a threadblock-wide prefix to be applied to all inputs.
+    {
+        ExclusiveSum(input, output, block_aggregate);
+
+        // Compute and share threadblock prefix
+        T block_prefix = ShareBlockPrefix(block_aggregate, block_prefix_op);
+
+        // Incorporate threadblock prefix into outputs
+        Sum scan_op;
+        output = scan_op(block_prefix, output);
+    }
+
+
+    /// Computes an inclusive threadblock-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.  Also provides every thread with the block-wide \p block_aggregate of all inputs.
+    template <typename ScanOp>
+    __device__ __forceinline__ void InclusiveScan(
+        T               input,                          ///< [in] Calling thread's input item
+        T               &output,                        ///< [out] Calling thread's output item (may be aliased to \p input)
+        ScanOp          scan_op,                        ///< [in] Binary scan operator
+        T               &block_aggregate)               ///< [out] Threadblock-wide aggregate reduction of input items
+    {
+        T warp_aggregate;
+        WarpScan(temp_storage.warp_scan, warp_id, lane_id).InclusiveScan(input, output, scan_op, warp_aggregate);
+
+        // Update outputs and block_aggregate with warp-wide aggregates from lane-0s
+        ApplyWarpAggregates(output, scan_op, warp_aggregate, block_aggregate);
+
+    }
+
+
+    /// Computes an inclusive threadblock-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.  The call-back functor \p block_prefix_op is invoked by the first warp in the block, and the value returned by <em>lane</em><sub>0</sub> in that warp is used as the "seed" value that logically prefixes the threadblock's scan inputs.  Also provides every thread with the block-wide \p block_aggregate of all inputs.
+    template <
+        typename ScanOp,
+        typename BlockPrefixOp>
+    __device__ __forceinline__ void InclusiveScan(
+        T               input,                          ///< [in] Calling thread's input item
+        T               &output,                        ///< [out] Calling thread's output item (may be aliased to \p input)
+        ScanOp          scan_op,                        ///< [in] Binary scan operator
+        T               &block_aggregate,               ///< [out] Threadblock-wide aggregate reduction of input items (exclusive of the \p block_prefix_op value)
+        BlockPrefixOp   &block_prefix_op)               ///< [in-out] <b>[<em>warp</em><sub>0</sub> only]</b> Call-back functor for specifying a threadblock-wide prefix to be applied to all inputs.
+    {
+        InclusiveScan(input, output, scan_op, block_aggregate);
+
+        // Compute and share threadblock prefix
+        T block_prefix = ShareBlockPrefix(block_aggregate, block_prefix_op);
+
+        // Incorporate threadblock prefix into outputs
+        output = scan_op(block_prefix, output);
+    }
+
+
+    /// Computes an inclusive threadblock-wide prefix scan using addition (+) as the scan operator.  Each thread contributes one input element.  Also provides every thread with the block-wide \p block_aggregate of all inputs.
+    __device__ __forceinline__ void InclusiveSum(
+        T               input,                          ///< [in] Calling thread's input item
+        T               &output,                        ///< [out] Calling thread's output item (may be aliased to \p input)
+        T               &block_aggregate)               ///< [out] Threadblock-wide aggregate reduction of input items
+    {
+        T warp_aggregate;
+        WarpScan(temp_storage.warp_scan, warp_id, lane_id).InclusiveSum(input, output, warp_aggregate);
+
+        // Update outputs and block_aggregate with warp-wide aggregates from lane-0s
+        ApplyWarpAggregates(output, Sum(), warp_aggregate, block_aggregate);
+    }
+
+
+    /// Computes an inclusive threadblock-wide prefix scan using addition (+) as the scan operator.  Each thread contributes one input element.  Instead of using 0 as the threadblock-wide prefix, the call-back functor \p block_prefix_op is invoked by the first warp in the block, and the value returned by <em>lane</em><sub>0</sub> in that warp is used as the "seed" value that logically prefixes the threadblock's scan inputs.  Also provides every thread with the block-wide \p block_aggregate of all inputs.
+    template <typename BlockPrefixOp>
+    __device__ __forceinline__ void InclusiveSum(
+        T               input,                          ///< [in] Calling thread's input item
+        T               &output,                        ///< [out] Calling thread's output item (may be aliased to \p input)
+        T               &block_aggregate,               ///< [out] Threadblock-wide aggregate reduction of input items (exclusive of the \p block_prefix_op value)
+        BlockPrefixOp   &block_prefix_op)               ///< [in-out] <b>[<em>warp</em><sub>0</sub> only]</b> Call-back functor for specifying a threadblock-wide prefix to be applied to all inputs.
+    {
+        InclusiveSum(input, output, block_aggregate);
+
+        // Compute and share threadblock prefix
+        T block_prefix = ShareBlockPrefix(block_aggregate, block_prefix_op);
+
+        // Incorporate threadblock prefix into outputs
+        Sum scan_op;
+        output = scan_op(block_prefix, output);
+    }
+
+};
+
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
--- a/lib/kokkos/TPL/cub/cub.cuh
+++ b/lib/kokkos/TPL/cub/cub.cuh
@ -0,0 +1,84 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2013, NVIDIA CORPORATION.  All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * CUB umbrella include file
+ */
+
+#pragma once
+
+
+// Block
+#include "block/block_histogram.cuh"
+#include "block/block_discontinuity.cuh"
+#include "block/block_exchange.cuh"
+#include "block/block_load.cuh"
+#include "block/block_radix_rank.cuh"
+#include "block/block_radix_sort.cuh"
+#include "block/block_reduce.cuh"
+#include "block/block_scan.cuh"
+#include "block/block_store.cuh"
+
+// Device
+#include "device/device_histogram.cuh"
+#include "device/device_radix_sort.cuh"
+#include "device/device_reduce.cuh"
+#include "device/device_scan.cuh"
+
+// Grid
+//#include "grid/grid_barrier.cuh"
+#include "grid/grid_even_share.cuh"
+#include "grid/grid_mapping.cuh"
+#include "grid/grid_queue.cuh"
+
+// Host
+#include "host/spinlock.cuh"
+
+// Thread
+#include "thread/thread_load.cuh"
+#include "thread/thread_operators.cuh"
+#include "thread/thread_reduce.cuh"
+#include "thread/thread_scan.cuh"
+#include "thread/thread_store.cuh"
+
+// Warp
+#include "warp/warp_reduce.cuh"
+#include "warp/warp_scan.cuh"
+
+// Util
+#include "util_allocator.cuh"
+#include "util_arch.cuh"
+#include "util_debug.cuh"
+#include "util_device.cuh"
+#include "util_macro.cuh"
+#include "util_ptx.cuh"
+#include "util_type.cuh"
+#include "util_iterator.cuh"
+#include "util_vector.cuh"
+
--- a/lib/kokkos/TPL/cub/device/block/block_histo_tiles.cuh
+++ b/lib/kokkos/TPL/cub/device/block/block_histo_tiles.cuh
@ -0,0 +1,322 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2013, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::BlockHistogramTiles implements a stateful abstraction of CUDA thread blocks for histogramming multiple tiles as part of device-wide histogram.
+ */
+
+#pragma once
+
+#include <iterator>
+
+#include "specializations/block_histo_tiles_gatomic.cuh"
+#include "specializations/block_histo_tiles_satomic.cuh"
+#include "specializations/block_histo_tiles_sort.cuh"
+#include "../../util_type.cuh"
+#include "../../grid/grid_mapping.cuh"
+#include "../../grid/grid_even_share.cuh"
+#include "../../grid/grid_queue.cuh"
+#include "../../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/******************************************************************************
+ * Algorithmic variants
+ ******************************************************************************/
+
+
+/**
+ * \brief BlockHistogramTilesAlgorithm enumerates alternative algorithms for BlockHistogramTiles.
+ *
+ * The enumerator chosen through the tuning policy's \p GRID_ALGORITHM member
+ * decides which internal implementation BlockHistogramTiles delegates to.
+ */
+enum BlockHistogramTilesAlgorithm
+{
+
+    /**
+     * \par Overview
+     * A two-kernel approach in which:
+     * -# Thread blocks in the first kernel aggregate their own privatized
+     *    histograms using block-wide sorting (see BlockHistogramAlgorithm::BLOCK_HISTO_SORT).
+     * -# A single thread block in the second kernel reduces them into the output histogram(s).
+     *
+     * \par Performance Considerations
+     * Delivers consistent throughput regardless of sample bin distribution.
+     *
+     * However, because histograms are privatized in shared memory, a large
+     * number of bins (e.g., thousands) may adversely affect occupancy and
+     * performance (or even the ability to launch).
+     */
+    GRID_HISTO_SORT,
+
+
+    /**
+     * \par Overview
+     * A two-kernel approach in which:
+     * -# Thread blocks in the first kernel aggregate their own privatized
+     *    histograms using shared-memory \p atomicAdd().
+     * -# A single thread block in the second kernel reduces them into the
+     *    output histogram(s).
+     *
+     * \par Performance Considerations
+     * Performance is strongly tied to the hardware implementation of atomic
+     * addition, and may be significantly degraded for input distributions
+     * that are not uniformly random, where many concurrent updates are
+     * likely to be made to the same bin counter.
+     *
+     * In addition, because histograms are privatized in shared memory, a large
+     * number of bins (e.g., thousands) may adversely affect occupancy and
+     * performance (or even the ability to launch).
+     */
+    GRID_HISTO_SHARED_ATOMIC,
+
+
+    /**
+     * \par Overview
+     * A single-kernel approach in which thread blocks update the output histogram(s) directly
+     * using global-memory \p atomicAdd().
+     *
+     * \par Performance Considerations
+     * Performance is strongly tied to the hardware implementation of atomic
+     * addition, and may be significantly degraded for input distributions
+     * that are not uniformly random, where many concurrent updates are
+     * likely to be made to the same bin counter.
+     *
+     * Performance is not significantly impacted when computing histograms having large
+     * numbers of bins (e.g., thousands).
+     */
+    GRID_HISTO_GLOBAL_ATOMIC,
+
+};
+
+
+/******************************************************************************
+ * Tuning policy
+ ******************************************************************************/
+
+/**
+ * Tuning policy for BlockHistogramTiles
+ *
+ * A compile-time parameter bundle: every template argument is re-exported
+ * unchanged as a nested constant carrying the same name without the leading
+ * underscore.
+ */
+template <
+    int                             _BLOCK_THREADS,         ///< Exported as BLOCK_THREADS (presumably threads per thread block -- confirm against the kernels consuming this policy)
+    int                             _ITEMS_PER_THREAD,      ///< Exported as ITEMS_PER_THREAD (presumably input items each thread handles per tile -- confirm)
+    BlockHistogramTilesAlgorithm    _GRID_ALGORITHM,        ///< Exported as GRID_ALGORITHM; BlockHistogramTiles uses it to pick its internal implementation
+    GridMappingStrategy             _GRID_MAPPING,          ///< Exported as GRID_MAPPING (strategy for assigning tiles of input to thread blocks)
+    int                             _SM_OCCUPANCY>          ///< Exported as SM_OCCUPANCY (presumably the targeted number of resident thread blocks per SM -- confirm)
+struct BlockHistogramTilesPolicy
+{
+    // Integral tuning parameters
+    enum
+    {
+        BLOCK_THREADS       = _BLOCK_THREADS,
+        ITEMS_PER_THREAD    = _ITEMS_PER_THREAD,
+        SM_OCCUPANCY        = _SM_OCCUPANCY,
+    };
+
+    // Enumeration-typed tuning parameters (kept as typed constants rather than anonymous enum values)
+    static const BlockHistogramTilesAlgorithm   GRID_ALGORITHM      = _GRID_ALGORITHM;
+    static const GridMappingStrategy            GRID_MAPPING        = _GRID_MAPPING;
+};
+
+
+
+/******************************************************************************
+ * Thread block abstractions
+ ******************************************************************************/
+
+
+/**
+ * Implements a stateful abstraction of CUDA thread blocks for histogramming multiple tiles as part of device-wide histogram.
+ *
+ * All work is forwarded to one of three internal implementations (block-wide sorting,
+ * shared-memory atomics, or global-memory atomics).  The implementation is selected at
+ * compile time from BlockHistogramTilesPolicy::GRID_ALGORITHM.
+ */
+template <
+    typename    BlockHistogramTilesPolicy,          ///< Tuning policy
+    int         BINS,                           ///< Number of histogram bins per channel
+    int         CHANNELS,                       ///< Number of channels interleaved in the input data (may be greater than the number of active channels being histogrammed)
+    int         ACTIVE_CHANNELS,                ///< Number of channels actively being histogrammed
+    typename    InputIteratorRA,                ///< The input iterator type (may be a simple pointer type).  Must have a value type that can be cast as an integer in the range [0..BINS-1]
+    typename    HistoCounter,                   ///< Integral type for counting sample occurrences per histogram bin
+    typename    SizeT>                          ///< Integer type for offsets
+struct BlockHistogramTiles
+{
+    //---------------------------------------------------------------------
+    // Types and constants
+    //---------------------------------------------------------------------
+
+    // Histogram grid algorithm
+    static const BlockHistogramTilesAlgorithm GRID_ALGORITHM = BlockHistogramTilesPolicy::GRID_ALGORITHM;
+
+    // Alternative internal implementation types
+    typedef BlockHistogramTilesSort<            BlockHistogramTilesPolicy, BINS, CHANNELS, ACTIVE_CHANNELS, InputIteratorRA, HistoCounter, SizeT>   BlockHistogramTilesSortT;
+    typedef BlockHistogramTilesSharedAtomic<    BlockHistogramTilesPolicy, BINS, CHANNELS, ACTIVE_CHANNELS, InputIteratorRA, HistoCounter, SizeT>   BlockHistogramTilesSharedAtomicT;
+    typedef BlockHistogramTilesGlobalAtomic<    BlockHistogramTilesPolicy, BINS, CHANNELS, ACTIVE_CHANNELS, InputIteratorRA, HistoCounter, SizeT>   BlockHistogramTilesGlobalAtomicT;
+
+    // Internal block sweep histogram type (global atomics is the fall-through choice)
+    typedef typename If<(GRID_ALGORITHM == GRID_HISTO_SORT),
+        BlockHistogramTilesSortT,
+        typename If<(GRID_ALGORITHM == GRID_HISTO_SHARED_ATOMIC),
+            BlockHistogramTilesSharedAtomicT,
+            BlockHistogramTilesGlobalAtomicT>::Type>::Type InternalBlockDelegate;
+
+    enum
+    {
+        /// Number of input items in one tile, as defined by the selected implementation
+        TILE_ITEMS = InternalBlockDelegate::TILE_ITEMS,
+    };
+
+
+    // Temporary storage type
+    typedef typename InternalBlockDelegate::TempStorage TempStorage;
+
+    //---------------------------------------------------------------------
+    // Per-thread fields
+    //---------------------------------------------------------------------
+
+    // Internal block delegate
+    InternalBlockDelegate internal_delegate;
+
+
+    //---------------------------------------------------------------------
+    // Interface
+    //---------------------------------------------------------------------
+
+    /**
+     * Constructor.  All arguments are forwarded unchanged to the internal delegate.
+     */
+    __device__ __forceinline__ BlockHistogramTiles(
+        TempStorage         &temp_storage,                                  ///< Reference to temp_storage
+        InputIteratorRA     d_in,                                           ///< Input data to histogram
+        HistoCounter*       (&d_out_histograms)[ACTIVE_CHANNELS])           ///< Reference to output histograms
+    :
+        internal_delegate(temp_storage, d_in, d_out_histograms)
+    {}
+
+
+    /**
+     * \brief Consume a consecutive segment of input tiles, then aggregate the output.
+     *
+     * Full tiles are consumed while at least TILE_ITEMS items remain; a trailing
+     * partially-full tile (if any) is consumed with an explicit valid-item count.
+     *
+     * The \p template keyword in the calls below is required: \p internal_delegate has a
+     * dependent type, so without the disambiguator \p ConsumeTile<true> would be parsed
+     * as a less-than comparison.
+     */
+    __device__ __forceinline__ void ConsumeTiles(
+        SizeT   block_offset,                       ///< [in] Threadblock begin offset (inclusive)
+        SizeT   block_oob)                          ///< [in] Threadblock end offset (exclusive)
+    {
+        // Consume subsequent full tiles of input
+        while (block_offset + TILE_ITEMS <= block_oob)
+        {
+            internal_delegate.template ConsumeTile<true>(block_offset);
+            block_offset += TILE_ITEMS;
+        }
+
+        // Consume a partially-full tile
+        if (block_offset < block_oob)
+        {
+            int valid_items = block_oob - block_offset;
+            internal_delegate.template ConsumeTile<false>(block_offset, valid_items);
+        }
+
+        // Aggregate output
+        internal_delegate.AggregateOutput();
+    }
+
+
+    /**
+     * Consume the segment of input tiles assigned to this thread block by an even-share mapping.
+     *
+     * \p num_items, \p queue and \p is_even_share are not read; the trailing marker
+     * parameter only selects this overload.
+     */
+    __device__ __forceinline__ void ConsumeTiles(
+        SizeT                               num_items,          ///< [in] Total number of global input items
+        GridEvenShare<SizeT>                &even_share,        ///< [in] GridEvenShare descriptor
+        GridQueue<SizeT>                    &queue,             ///< [in,out] GridQueue descriptor
+        Int2Type<GRID_MAPPING_EVEN_SHARE>   is_even_share)      ///< [in] Marker type indicating this is an even-share mapping
+    {
+        even_share.BlockInit();
+        ConsumeTiles(even_share.block_offset, even_share.block_oob);
+    }
+
+
+    /**
+     * Dequeue and consume tiles of items under a dynamic (queue-based) mapping, then aggregate the output.
+     *
+     * Each thread block starts with the tile at index \p blockIdx.x and afterwards obtains
+     * further tile offsets from \p queue.  Must be called by every thread of the block,
+     * since the loop contains \p __syncthreads().
+     *
+     * NOTE(review): \p num_items is an \p int here while the dynamic-mapping overload passes a
+     * \p SizeT, so a wider \p SizeT value is narrowed at that call -- confirm this is intended.
+     */
+    __device__ __forceinline__ void ConsumeTiles(
+        int                 num_items,          ///< Total number of input items
+        GridQueue<SizeT>    queue)              ///< Queue descriptor for assigning tiles of work to thread blocks
+    {
+        // Shared block offset
+        __shared__ SizeT shared_block_offset;
+
+        // We give each thread block at least one tile of input.
+        SizeT block_offset      = blockIdx.x * TILE_ITEMS;
+        SizeT even_share_base   = gridDim.x * TILE_ITEMS;
+
+        // Process full tiles of input
+        while (block_offset + TILE_ITEMS <= num_items)
+        {
+            internal_delegate.template ConsumeTile<true>(block_offset);
+
+            // Dequeue up to TILE_ITEMS.  The dequeued value is offset past the first
+            // gridDim.x tiles, which were handed out statically above.
+            if (threadIdx.x == 0)
+                shared_block_offset = queue.Drain(TILE_ITEMS) + even_share_base;
+
+            // Publish thread 0's write to the rest of the block
+            __syncthreads();
+
+            block_offset = shared_block_offset;
+
+            // Keep thread 0 from overwriting the slot in the next iteration before everyone has read it
+            __syncthreads();
+        }
+
+        // Consume a partially-full tile
+        if (block_offset < num_items)
+        {
+            int valid_items = num_items - block_offset;
+            internal_delegate.template ConsumeTile<false>(block_offset, valid_items);
+        }
+
+        // Aggregate output
+        internal_delegate.AggregateOutput();
+    }
+
+
+    /**
+     * Dequeue and consume tiles of items under a dynamic mapping.
+     *
+     * \p even_share and \p is_dynamic are not read; the trailing marker parameter only
+     * selects this overload.
+     */
+    __device__ __forceinline__ void ConsumeTiles(
+        SizeT                               num_items,          ///< [in] Total number of global input items
+        GridEvenShare<SizeT>                &even_share,        ///< [in] GridEvenShare descriptor
+        GridQueue<SizeT>                    &queue,             ///< [in,out] GridQueue descriptor
+        Int2Type<GRID_MAPPING_DYNAMIC>      is_dynamic)         ///< [in] Marker type indicating this is a dynamic mapping
+    {
+        ConsumeTiles(num_items, queue);
+    }
+
+
+};
+
+
+
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
--- a/lib/kokkos/TPL/cub/device/block/block_partition_tiles.cuh
+++ b/lib/kokkos/TPL/cub/device/block/block_partition_tiles.cuh
@ -0,0 +1,381 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2013, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::BlockPartitionTiles implements a stateful abstraction of CUDA thread blocks for participating in device-wide list partitioning.
+ */
+
+#pragma once
+
+#include <iterator>
+
+#include "scan_tiles_types.cuh"
+#include "../../thread/thread_operators.cuh"
+#include "../../block/block_load.cuh"
+#include "../../block/block_store.cuh"
+#include "../../block/block_scan.cuh"
+#include "../../grid/grid_queue.cuh"
+#include "../../util_vector.cuh"
+#include "../../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/******************************************************************************
+ * Tuning policy types
+ ******************************************************************************/
+
+/**
+ * Tuning policy for BlockPartitionTiles.
+ *
+ * A compile-time bundle only: each template argument is republished as a
+ * nested constant of the same name (minus the leading underscore).
+ */
+template <
+    int                         _PARTITIONS,            ///< Number of output partitions
+    int                         _BLOCK_THREADS,         ///< Threads per thread block
+    int                         _ITEMS_PER_THREAD,      ///< Items processed per thread, per tile
+    PtxLoadModifier             _LOAD_MODIFIER,         ///< PTX load modifier used when reading input items
+    BlockScanAlgorithm          _SCAN_ALGORITHM>        ///< BlockScan algorithm used for the flag scan
+struct BlockPartitionTilesPolicy
+{
+    // Enumeration-typed settings
+    static const PtxLoadModifier        LOAD_MODIFIER       = _LOAD_MODIFIER;
+    static const BlockScanAlgorithm     SCAN_ALGORITHM      = _SCAN_ALGORITHM;
+
+    // Integral settings
+    enum
+    {
+        PARTITIONS              = _PARTITIONS,
+        BLOCK_THREADS           = _BLOCK_THREADS,
+        ITEMS_PER_THREAD        = _ITEMS_PER_THREAD
+    };
+};
+
+
+
+/**
+ * Tuple type for scanning partition membership flags
+ *
+ * Primary template: declared only.  Usable definitions are provided by the
+ * partial specializations for PARTITIONS == 1 and PARTITIONS == 2.
+ */
+template <
+    typename    SizeT,          ///< Offset integer type held in each tuple component
+    int         PARTITIONS>     ///< Number of output partitions
+struct PartitionScanTuple;
+
+
+/**
+ * Tuple type for scanning partition membership flags (specialized for 1 output partition)
+ *
+ * The single component \p x carries the membership flag of an item before the
+ * scan and, after an inclusive scan, the 1-based output position of a
+ * selected item.
+ */
+template <typename SizeT>
+struct PartitionScanTuple<SizeT, 1> : VectorHelper<SizeT, 1>::Type
+{
+    /**
+     * Component-wise sum.
+     *
+     * Declared const so it can be applied to const operands.  The inherited
+     * member is reached through \p this because it lives in a dependent base
+     * class and is not found by unqualified lookup.
+     */
+    __device__ __forceinline__ PartitionScanTuple operator+(const PartitionScanTuple &other) const
+    {
+        PartitionScanTuple retval;
+        retval.x = this->x + other.x;
+        return retval;
+    }
+
+    /**
+     * Set the membership flag from the predicate's result for \p val
+     */
+    template <typename PredicateOp, typename T>
+    __device__ __forceinline__ void SetFlags(PredicateOp pred_op, T val)
+    {
+        this->x = pred_op(val);
+    }
+
+    /**
+     * Write \p val to its output slot if the predicate selects it.
+     *
+     * Expects \p x to hold the inclusive-scanned flag count, hence the -1 to
+     * obtain a 0-based index.  \p num_items is not needed with a single
+     * partition; it is kept so both specializations share one call signature.
+     */
+    template <typename PredicateOp, typename T, typename OutputIteratorRA>
+    __device__ __forceinline__ void Scatter(PredicateOp pred_op, T val, OutputIteratorRA d_out, SizeT num_items)
+    {
+        if (pred_op(val))
+            d_out[this->x - 1] = val;
+    }
+
+};
+
+
+/**
+ * Tuple type for scanning partition membership flags (specialized for 2 output partitions)
+ *
+ * \p x counts items for which the predicate holds and \p y counts items for
+ * which it does not.  After an inclusive scan, selected items are placed
+ * from the front of the output (using \p x) and the remaining items from the
+ * back (using \p y).
+ */
+template <typename SizeT>
+struct PartitionScanTuple<SizeT, 2> : VectorHelper<SizeT, 2>::Type
+{
+    /**
+     * Component-wise sum.
+     *
+     * Declared const so it can be applied to const operands.  The inherited
+     * members are reached through \p this because they live in a dependent
+     * base class and are not found by unqualified lookup.
+     */
+    __device__ __forceinline__ PartitionScanTuple operator+(const PartitionScanTuple &other) const
+    {
+        PartitionScanTuple retval;
+        retval.x = this->x + other.x;
+        retval.y = this->y + other.y;
+        return retval;
+    }
+
+    /**
+     * Set the membership flags for \p val: exactly one of \p x / \p y becomes 1
+     */
+    template <typename PredicateOp, typename T>
+    __device__ __forceinline__ void SetFlags(PredicateOp pred_op, T val)
+    {
+        bool pred = pred_op(val);
+        this->x = pred;
+        this->y = !pred;
+    }
+
+    /**
+     * Write \p val to its output slot.
+     *
+     * Expects \p x and \p y to hold inclusive-scanned counts.  Selected items
+     * go to index x - 1; all other items go to index num_items - y, i.e. they
+     * fill the output from its end towards the front.
+     */
+    template <typename PredicateOp, typename T, typename OutputIteratorRA>
+    __device__ __forceinline__ void Scatter(PredicateOp pred_op, T val, OutputIteratorRA d_out, SizeT num_items)
+    {
+        SizeT scatter_offset = (pred_op(val)) ?
+            this->x - 1 :
+            num_items - this->y;
+
+        d_out[scatter_offset] = val;
+    }
+};
+
+
+
+
+/******************************************************************************
+ * Thread block abstractions
+ ******************************************************************************/
+
+/**
+ * \brief BlockPartitionTiles implements a stateful abstraction of CUDA thread blocks for participating in device-wide list partitioning.
+ *
+ * Implements a single-pass "domino" strategy with adaptive prefix lookback.
+ *
+ * Per tile: items are loaded in striped order, each item is flagged by the
+ * partition predicate, the flags are inclusive-scanned across the thread
+ * block (tiles other than the first obtain their starting prefix through the
+ * tile-status list), and every in-bounds item is scattered to the output
+ * offset derived from its scanned flags.
+ */
+template <
+    typename BlockPartitionTilesPolicy, ///< Tuning policy
+    typename InputIteratorRA,           ///< Input iterator type
+    typename OutputIteratorRA,          ///< Output iterator type
+    typename PredicateOp,               ///< Partition predicate functor type
+    typename SizeT>                     ///< Offset integer type
+struct BlockPartitionTiles
+{
+    //---------------------------------------------------------------------
+    // Types and constants
+    //---------------------------------------------------------------------
+
+    // Constants
+    enum
+    {
+        PARTITIONS          = BlockPartitionTilesPolicy::PARTITIONS,
+        BLOCK_THREADS       = BlockPartitionTilesPolicy::BLOCK_THREADS,
+        ITEMS_PER_THREAD    = BlockPartitionTilesPolicy::ITEMS_PER_THREAD,
+        TILE_ITEMS          = BLOCK_THREADS * ITEMS_PER_THREAD,
+    };
+
+    // Load modifier
+    static const PtxLoadModifier LOAD_MODIFIER = BlockPartitionTilesPolicy::LOAD_MODIFIER;
+
+    // Data type of input iterator
+    typedef typename std::iterator_traits<InputIteratorRA>::value_type T;
+
+    // Tuple type for scanning partition membership flags
+    typedef PartitionScanTuple<SizeT, PARTITIONS> PartitionScanTuple;
+
+    // Tile status descriptor type
+    typedef ScanTileDescriptor<PartitionScanTuple> ScanTileDescriptorT;
+
+    // Block scan type for scanning membership flag scan_tuples
+    typedef BlockScan<
+        PartitionScanTuple,
+        BlockPartitionTilesPolicy::BLOCK_THREADS,
+        BlockPartitionTilesPolicy::SCAN_ALGORITHM> BlockScanT;
+
+    // Callback type for obtaining inter-tile prefix during block scan
+    typedef DeviceScanBlockPrefixOp<PartitionScanTuple, Sum> InterblockPrefixOp;
+
+    // Shared memory type for this threadblock
+    struct TempStorage
+    {
+        typename InterblockPrefixOp::TempStorage    prefix;         // Smem needed for cooperative prefix callback
+        typename BlockScanT::TempStorage            scan;           // Smem needed for tile scanning
+        SizeT                                       tile_idx;       // Shared tile index
+    };
+
+
+    //---------------------------------------------------------------------
+    // Per-thread fields
+    //---------------------------------------------------------------------
+
+    TempStorage                 &temp_storage;      ///< Reference to temp_storage
+    InputIteratorRA             d_in;               ///< Input data
+    OutputIteratorRA            d_out;              ///< Output data
+    ScanTileDescriptorT         *d_tile_status;     ///< Global list of tile status
+    PredicateOp                 pred_op;            ///< Unary predicate operator indicating membership in the first partition
+    SizeT                       num_items;          ///< Total number of input items
+
+
+    //---------------------------------------------------------------------
+    // Constructor
+    //---------------------------------------------------------------------
+
+    /**
+     * Constructor.  Binds the caller-provided shared storage and records the
+     * problem description; performs no other work.
+     */
+    __device__ __forceinline__
+    BlockPartitionTiles(
+        TempStorage                 &temp_storage,      ///< Reference to temp_storage
+        InputIteratorRA             d_in,               ///< Input data
+        OutputIteratorRA            d_out,              ///< Output data
+        ScanTileDescriptorT         *d_tile_status,     ///< Global list of tile status
+        PredicateOp                 pred_op,            ///< Unary predicate operator indicating membership in the first partition
+        SizeT                       num_items)          ///< Total number of input items
+    :
+        // TempStorage is a plain struct here (it declares no Alias() member),
+        // so the reference is bound directly.
+        temp_storage(temp_storage),
+        d_in(d_in),
+        d_out(d_out),
+        d_tile_status(d_tile_status),
+        pred_op(pred_op),
+        num_items(num_items)
+    {}
+
+
+    //---------------------------------------------------------------------
+    // Domino scan
+    //---------------------------------------------------------------------
+
+    /**
+     * Process a tile of input
+     *
+     * \tparam FULL_TILE    Whether all TILE_ITEMS items starting at
+     *                      \p block_offset are in bounds.  When false, only the
+     *                      first (num_items - block_offset) items are treated
+     *                      as valid.
+     */
+    template <bool FULL_TILE>
+    __device__ __forceinline__ void ConsumeTile(
+        int                 tile_idx,           ///< Tile index
+        SizeT               block_offset,       ///< Tile offset
+        PartitionScanTuple  &partition_ends)    ///< Running total
+    {
+        T                   items[ITEMS_PER_THREAD];
+        PartitionScanTuple  scan_tuples[ITEMS_PER_THREAD];
+
+        // Load items (striped: slot ITEM of this thread is tile item
+        // threadIdx.x + ITEM * BLOCK_THREADS)
+        int valid_items = num_items - block_offset;
+        if (FULL_TILE)
+            LoadStriped<LOAD_MODIFIER, BLOCK_THREADS>(threadIdx.x, d_in + block_offset, items);
+        else
+            LoadStriped<LOAD_MODIFIER, BLOCK_THREADS>(threadIdx.x, d_in + block_offset, items, valid_items);
+
+        // Prevent hoisting
+//        __syncthreads();
+//        __threadfence_block();
+
+        // Set partition membership flags in scan scan_tuples
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+        {
+            if (FULL_TILE || (threadIdx.x + (ITEM * BLOCK_THREADS) < valid_items))
+            {
+                scan_tuples[ITEM].SetFlags(pred_op, items[ITEM]);
+            }
+            else
+            {
+                // Out-of-bounds slot of a partial tile: it must not count
+                // towards any partition, otherwise it would shift the scatter
+                // offsets of valid items and inflate partition_ends.
+                scan_tuples[ITEM] = PartitionScanTuple();
+            }
+        }
+
+        // Perform inclusive scan over scan scan_tuples
+        PartitionScanTuple block_aggregate;
+        if (tile_idx == 0)
+        {
+            // First tile: nothing precedes it, so no prefix callback is needed
+            BlockScanT(temp_storage.scan).InclusiveScan(scan_tuples, scan_tuples, Sum(), block_aggregate);
+            partition_ends = block_aggregate;
+
+            // Update tile status if there are successor tiles
+            if (FULL_TILE && (threadIdx.x == 0))
+                ScanTileDescriptorT::SetPrefix(d_tile_status, block_aggregate);
+        }
+        else
+        {
+            // Later tiles: the prefix callback supplies the totals of the preceding tiles
+            InterblockPrefixOp prefix_op(d_tile_status, temp_storage.prefix, Sum(), tile_idx);
+            BlockScanT(temp_storage.scan).InclusiveScan(scan_tuples, scan_tuples, Sum(), block_aggregate, prefix_op);
+            partition_ends = prefix_op.inclusive_prefix;
+        }
+
+        // Scatter items
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+        {
+            // Scatter if not out-of-bounds
+            if (FULL_TILE || (threadIdx.x + (ITEM * BLOCK_THREADS) < valid_items))
+            {
+                scan_tuples[ITEM].Scatter(pred_op, items[ITEM], d_out, num_items);
+            }
+        }
+    }
+
+
+    /**
+     * Dequeue and scan tiles of items as part of a domino scan
+     *
+     * When compiled for CUB_PTX_ARCH < 200 each block handles the single tile
+     * whose index equals blockIdx.x; otherwise tile indices are drained from
+     * \p queue one at a time until an index at or past the end of the input
+     * is drawn.
+     */
+    __device__ __forceinline__ void ConsumeTiles(
+        GridQueue<int>      queue,              ///< [in] Queue descriptor for assigning tiles of work to thread blocks
+        SizeT               num_tiles,          ///< [in] Total number of input tiles
+        PartitionScanTuple  &partition_ends,    ///< [out] Running partition end offsets
+        bool                &is_last_tile)      ///< [out] Whether or not this block handled the last tile (i.e., partition_ends is valid for the entire input)
+    {
+        // A block that ends up with no in-range tile must still report a
+        // defined value through this output parameter.
+        is_last_tile = false;
+
+#if CUB_PTX_ARCH < 200
+
+        // No concurrent kernels allowed and blocks are launched in increasing order, so just assign one tile per block (up to 65K blocks)
+        int     tile_idx        = blockIdx.x;
+        SizeT   block_offset    = SizeT(TILE_ITEMS) * tile_idx;
+
+        if (block_offset + TILE_ITEMS <= num_items)
+        {
+            ConsumeTile<true>(tile_idx, block_offset, partition_ends);
+        }
+        else if (block_offset < num_items)
+        {
+            ConsumeTile<false>(tile_idx, block_offset, partition_ends);
+        }
+        is_last_tile = (tile_idx == num_tiles - 1);
+
+#else
+
+        // Get first tile (one thread drains, the result is broadcast through shared memory)
+        if (threadIdx.x == 0)
+            temp_storage.tile_idx = queue.Drain(1);
+
+        __syncthreads();
+
+        int tile_idx = temp_storage.tile_idx;
+        SizeT block_offset = SizeT(TILE_ITEMS) * tile_idx;
+
+        while (block_offset + TILE_ITEMS <= num_items)
+        {
+            // Consume full tile
+            ConsumeTile<true>(tile_idx, block_offset, partition_ends);
+            is_last_tile = (tile_idx == num_tiles - 1);
+
+            // Get next tile
+            if (threadIdx.x == 0)
+                temp_storage.tile_idx = queue.Drain(1);
+
+            __syncthreads();
+
+            tile_idx = temp_storage.tile_idx;
+            block_offset = SizeT(TILE_ITEMS) * tile_idx;
+        }
+
+        // Consume a partially-full tile
+        if (block_offset < num_items)
+        {
+            ConsumeTile<false>(tile_idx, block_offset, partition_ends);
+            is_last_tile = (tile_idx == num_tiles - 1);
+        }
+#endif
+    }
+};
+
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
--- a/lib/kokkos/TPL/cub/device/block/block_radix_sort_downsweep_tiles.cuh
+++ b/lib/kokkos/TPL/cub/device/block/block_radix_sort_downsweep_tiles.cuh
@ -0,0 +1,713 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2013, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * BlockRadixSortDownsweepTiles implements a stateful abstraction of CUDA thread blocks for participating in device-wide radix sort downsweep.
+ */
+
+
+#pragma once
+
+#include "../../thread/thread_load.cuh"
+#include "../../block/block_load.cuh"
+#include "../../block/block_store.cuh"
+#include "../../block/block_radix_rank.cuh"
+#include "../../block/block_exchange.cuh"
+#include "../../util_type.cuh"
+#include "../../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/******************************************************************************
+ * Tuning policy types
+ ******************************************************************************/
+
+/**
+ * Strategies for scattering ranked items to their global bins
+ */
+enum RadixSortScatterAlgorithm
+{
+    /// Scatter directly from registers to global bins
+    RADIX_SORT_SCATTER_DIRECT       = 0,
+
+    /// First scatter from registers into shared memory bins, then into global bins
+    RADIX_SORT_SCATTER_TWO_PHASE    = 1
+};
+
+
+/**
+ * Tuning policy for BlockRadixSortDownsweepTiles.
+ *
+ * A compile-time bundle: every template argument is republished as a nested
+ * constant, together with the derived TILE_ITEMS and an alternate policy
+ * (AltPolicy) that differs only in using one fewer radix bit (never below 1).
+ */
+template <
+    int                         _BLOCK_THREADS,             ///< The number of threads per CTA
+    int                         _ITEMS_PER_THREAD,          ///< The number of consecutive downsweep keys to process per thread
+    BlockLoadAlgorithm          _LOAD_ALGORITHM,            ///< The BlockLoad algorithm to use
+    PtxLoadModifier             _LOAD_MODIFIER,             ///< The PTX cache-modifier to use for loads
+    bool                        _EXCHANGE_TIME_SLICING,     ///< Whether or not to time-slice key/value exchanges through shared memory to lower shared memory pressure
+    bool                        _MEMOIZE_OUTER_SCAN,        ///< Whether or not to buffer outer raking scan partials to incur fewer shared memory reads at the expense of higher register pressure.  See BlockScanAlgorithm::BLOCK_SCAN_RAKING_MEMOIZE for more details.
+    BlockScanAlgorithm          _INNER_SCAN_ALGORITHM,      ///< The cub::BlockScanAlgorithm algorithm to use
+    RadixSortScatterAlgorithm   _SCATTER_ALGORITHM,         ///< The scattering strategy to use
+    cudaSharedMemConfig         _SMEM_CONFIG,               ///< Shared memory bank mode (default: \p cudaSharedMemBankSizeFourByte)
+    int                         _RADIX_BITS>                ///< The number of radix bits, i.e., log2(bins)
+struct BlockRadixSortDownsweepTilesPolicy
+{
+    // Enumeration-typed settings
+    static const BlockLoadAlgorithm         LOAD_ALGORITHM          = _LOAD_ALGORITHM;
+    static const PtxLoadModifier            LOAD_MODIFIER           = _LOAD_MODIFIER;
+    static const BlockScanAlgorithm         INNER_SCAN_ALGORITHM    = _INNER_SCAN_ALGORITHM;
+    static const RadixSortScatterAlgorithm  SCATTER_ALGORITHM       = _SCATTER_ALGORITHM;
+    static const cudaSharedMemConfig        SMEM_CONFIG             = _SMEM_CONFIG;
+
+    // Integral and boolean settings, plus the derived tile size
+    enum
+    {
+        BLOCK_THREADS           = _BLOCK_THREADS,
+        ITEMS_PER_THREAD        = _ITEMS_PER_THREAD,
+        EXCHANGE_TIME_SLICING   = _EXCHANGE_TIME_SLICING,
+        RADIX_BITS              = _RADIX_BITS,
+        MEMOIZE_OUTER_SCAN      = _MEMOIZE_OUTER_SCAN,
+        TILE_ITEMS              = BLOCK_THREADS * ITEMS_PER_THREAD
+    };
+
+    // Same policy with one fewer radix bit, clamped to at least one bit
+    typedef BlockRadixSortDownsweepTilesPolicy<
+        _BLOCK_THREADS,
+        _ITEMS_PER_THREAD,
+        _LOAD_ALGORITHM,
+        _LOAD_MODIFIER,
+        _EXCHANGE_TIME_SLICING,
+        _MEMOIZE_OUTER_SCAN,
+        _INNER_SCAN_ALGORITHM,
+        _SCATTER_ALGORITHM,
+        _SMEM_CONFIG,
+        CUB_MAX(1, _RADIX_BITS - 1)> AltPolicy;
+};
+
+
+/******************************************************************************
+ * Thread block abstractions
+ ******************************************************************************/
+
+/**
+ * CTA-wide "downsweep" abstraction for distributing keys from
+ * a range of input tiles.
+ */
+template <
+    typename BlockRadixSortDownsweepTilesPolicy,
+    typename Key,
+    typename Value,
+    typename SizeT>
+struct BlockRadixSortDownsweepTiles
+{
+    //---------------------------------------------------------------------
+    // Type definitions and constants
+    //---------------------------------------------------------------------
+
+    // Appropriate unsigned-bits representation of Key
+    typedef typename Traits<Key>::UnsignedBits UnsignedBits;
+
+    // Key-range constants taken from Traits<Key>, expressed in the unsigned-bits representation
+    static const UnsignedBits MIN_KEY = Traits<Key>::MIN_KEY;
+    static const UnsignedBits MAX_KEY = Traits<Key>::MAX_KEY;
+
+    // Enumeration-typed settings re-exported from the tuning policy
+    static const BlockLoadAlgorithm         LOAD_ALGORITHM          = BlockRadixSortDownsweepTilesPolicy::LOAD_ALGORITHM;
+    static const PtxLoadModifier            LOAD_MODIFIER           = BlockRadixSortDownsweepTilesPolicy::LOAD_MODIFIER;
+    static const BlockScanAlgorithm         INNER_SCAN_ALGORITHM    = BlockRadixSortDownsweepTilesPolicy::INNER_SCAN_ALGORITHM;
+    static const RadixSortScatterAlgorithm  SCATTER_ALGORITHM       = BlockRadixSortDownsweepTilesPolicy::SCATTER_ALGORITHM;
+    static const cudaSharedMemConfig        SMEM_CONFIG             = BlockRadixSortDownsweepTilesPolicy::SMEM_CONFIG;
+
+    // Integral constants: policy settings plus values derived from them and
+    // from the target architecture's properties
+    enum
+    {
+        BLOCK_THREADS           = BlockRadixSortDownsweepTilesPolicy::BLOCK_THREADS,
+        ITEMS_PER_THREAD        = BlockRadixSortDownsweepTilesPolicy::ITEMS_PER_THREAD,
+        EXCHANGE_TIME_SLICING   = BlockRadixSortDownsweepTilesPolicy::EXCHANGE_TIME_SLICING,
+        RADIX_BITS              = BlockRadixSortDownsweepTilesPolicy::RADIX_BITS,
+        MEMOIZE_OUTER_SCAN      = BlockRadixSortDownsweepTilesPolicy::MEMOIZE_OUTER_SCAN,
+        TILE_ITEMS              = BLOCK_THREADS * ITEMS_PER_THREAD,
+
+        RADIX_DIGITS            = 1 << RADIX_BITS,
+        KEYS_ONLY               = Equals<Value, NullType>::VALUE,
+
+        // Threads per warp, expanded from its log2 (same derivation as
+        // SMEM_BANKS below).  Using the log2 value directly would make WARPS
+        // count groups of LOG_WARP_THREADS threads instead of whole warps.
+        WARP_THREADS            = 1 << PtxArchProps::LOG_WARP_THREADS,
+        WARPS                   = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS,
+
+        BYTES_PER_SIZET         = sizeof(SizeT),
+        LOG_BYTES_PER_SIZET     = Log2<BYTES_PER_SIZET>::VALUE,
+
+        LOG_SMEM_BANKS          = PtxArchProps::LOG_SMEM_BANKS,
+        SMEM_BANKS              = 1 << LOG_SMEM_BANKS,
+
+        DIGITS_PER_SCATTER_PASS = BLOCK_THREADS / SMEM_BANKS,
+        SCATTER_PASSES          = RADIX_DIGITS / DIGITS_PER_SCATTER_PASS,
+
+        LOG_STORE_TXN_THREADS   = LOG_SMEM_BANKS,
+        STORE_TXN_THREADS       = 1 << LOG_STORE_TXN_THREADS,
+    };
+
+    // BlockRadixRank type
+    // (the typedef reuses the template's own name, so inside this struct
+    // "BlockRadixRank" denotes this particular instantiation)
+    typedef BlockRadixRank<
+        BLOCK_THREADS,
+        RADIX_BITS,
+        MEMOIZE_OUTER_SCAN,
+        INNER_SCAN_ALGORITHM,
+        SMEM_CONFIG> BlockRadixRank;
+
+    // BlockLoad type (keys)
+    // Keys are loaded through their unsigned-bits representation
+    typedef BlockLoad<
+        UnsignedBits*,
+        BLOCK_THREADS,
+        ITEMS_PER_THREAD,
+        LOAD_ALGORITHM,
+        LOAD_MODIFIER,
+        EXCHANGE_TIME_SLICING> BlockLoadKeys;
+
+    // BlockLoad type (values)
+    typedef BlockLoad<
+        Value*,
+        BLOCK_THREADS,
+        ITEMS_PER_THREAD,
+        LOAD_ALGORITHM,
+        LOAD_MODIFIER,
+        EXCHANGE_TIME_SLICING> BlockLoadValues;
+
+    // BlockExchange type (keys)
+    // Used by the two-phase key scatter (ScatterToStriped)
+    typedef BlockExchange<
+        UnsignedBits,
+        BLOCK_THREADS,
+        ITEMS_PER_THREAD,
+        EXCHANGE_TIME_SLICING> BlockExchangeKeys;
+
+    // BlockExchange type (values)
+    // Used by the two-phase value scatter (ScatterToStriped)
+    typedef BlockExchange<
+        Value,
+        BLOCK_THREADS,
+        ITEMS_PER_THREAD,
+        EXCHANGE_TIME_SLICING> BlockExchangeValues;
+
+
+    /**
+     * Shared memory storage layout
+     *
+     * The per-digit offset table and the short-circuit flag have their own
+     * storage.  The scratch areas of the ranking, load and exchange helpers
+     * are overlaid in an anonymous union, i.e. they occupy the same bytes and
+     * only one of them can hold live data at a time.
+     */
+    struct _TempStorage
+    {
+        // Per-digit offsets, indexed by digit in DecodeRelativeBinOffsets
+        // (sized RADIX_DIGITS + 1)
+        SizeT   relative_bin_offsets[RADIX_DIGITS + 1];
+        bool    short_circuit;
+
+        // Overlaid scratch space for the cooperative helpers
+        union
+        {
+            typename BlockRadixRank::TempStorage        ranking;
+            typename BlockLoadKeys::TempStorage         load_keys;
+            typename BlockLoadValues::TempStorage       load_values;
+            typename BlockExchangeKeys::TempStorage     exchange_keys;
+            typename BlockExchangeValues::TempStorage   exchange_values;
+        };
+    };
+
+
+    /// Alias wrapper allowing storage to be unioned
+    struct TempStorage : Uninitialized<_TempStorage> {};
+
+
+    //---------------------------------------------------------------------
+    // Thread fields
+    //---------------------------------------------------------------------
+
+    // Shared storage for this CTA
+    _TempStorage    &temp_storage;
+
+    // Input and output device pointers
+    UnsignedBits    *d_keys_in;
+    UnsignedBits    *d_keys_out;
+    Value           *d_values_in;
+    Value           *d_values_out;
+
+    // The global scatter base offset for each digit (valid in the first RADIX_DIGITS threads)
+    SizeT           bin_offset;
+
+    // The least-significant bit position of the current digit to extract
+    int             current_bit;
+
+    // Whether to short-circuit
+    bool            short_circuit;
+
+
+
+    //---------------------------------------------------------------------
+    // Utility methods
+    //---------------------------------------------------------------------
+
+    /**
+     * For each key held by this thread, extract the key's digit for the
+     * current pass and fetch that digit's offset from the shared
+     * relative_bin_offsets table
+     */
+    __device__ __forceinline__ void DecodeRelativeBinOffsets(
+        UnsignedBits    (&twiddled_keys)[ITEMS_PER_THREAD],
+        SizeT           (&relative_bin_offsets)[ITEMS_PER_THREAD])
+    {
+        #pragma unroll
+        for (int slot = 0; slot < ITEMS_PER_THREAD; ++slot)
+        {
+            // Digit = RADIX_BITS-wide field of the key starting at current_bit
+            const UnsignedBits key_digit = BFE(twiddled_keys[slot], current_bit, RADIX_BITS);
+
+            // The shared table maps a digit to its offset
+            relative_bin_offsets[slot] = temp_storage.relative_bin_offsets[key_digit];
+        }
+    }
+
+
+    /**
+     * Write each item to d_out at (its digit's offset + its local rank).
+     *
+     * For a partial tile (FULL_TILE == false), items whose local rank is not
+     * below valid_items are skipped.
+     */
+    template <bool FULL_TILE, typename T>
+    __device__ __forceinline__ void ScatterItems(
+        T       (&items)[ITEMS_PER_THREAD],
+        int     (&local_ranks)[ITEMS_PER_THREAD],
+        SizeT   (&relative_bin_offsets)[ITEMS_PER_THREAD],
+        T       *d_out,
+        SizeT   valid_items)
+    {
+        for (int slot = 0; slot < ITEMS_PER_THREAD; ++slot)
+        {
+            // Skip out-of-bounds slots of a partial tile
+            if (!FULL_TILE && !(local_ranks[slot] < valid_items))
+                continue;
+
+            d_out[relative_bin_offsets[slot] + local_ranks[slot]] = items[slot];
+        }
+    }
+
+
+    /**
+     * Scatter ranked keys directly to global memory.
+     *
+     * Looks up the per-key digit offsets (written to relative_bin_offsets),
+     * converts the keys back out of their twiddled form, and writes them to
+     * d_keys_out.
+     */
+    template <bool FULL_TILE>
+    __device__ __forceinline__ void ScatterKeys(
+        UnsignedBits                            (&twiddled_keys)[ITEMS_PER_THREAD],
+        SizeT                                   (&relative_bin_offsets)[ITEMS_PER_THREAD],
+        int                                     (&ranks)[ITEMS_PER_THREAD],
+        SizeT                                   valid_items,
+        Int2Type<RADIX_SORT_SCATTER_DIRECT>     scatter_algorithm)
+    {
+        // Output holds untwiddled keys
+        UnsignedBits untwiddled[ITEMS_PER_THREAD];
+
+        #pragma unroll
+        for (int slot = 0; slot < ITEMS_PER_THREAD; ++slot)
+        {
+            untwiddled[slot] = Traits<Key>::TwiddleOut(twiddled_keys[slot]);
+        }
+
+        // Digit offsets are decoded from the twiddled form
+        DecodeRelativeBinOffsets(twiddled_keys, relative_bin_offsets);
+
+        // Write out
+        ScatterItems<FULL_TILE>(untwiddled, ranks, relative_bin_offsets, d_keys_out, valid_items);
+    }
+
+
+    /**
+     * Scatter ranked keys through shared memory, then to global memory.
+     *
+     * The keys are first exchanged by rank into a striped arrangement; the
+     * direct scatter is then run with each slot's striped position as its rank.
+     */
+    template <bool FULL_TILE>
+    __device__ __forceinline__ void ScatterKeys(
+        UnsignedBits                            (&twiddled_keys)[ITEMS_PER_THREAD],
+        SizeT                                   (&relative_bin_offsets)[ITEMS_PER_THREAD],
+        int                                     (&ranks)[ITEMS_PER_THREAD],
+        SizeT                                   valid_items,
+        Int2Type<RADIX_SORT_SCATTER_TWO_PHASE>  scatter_algorithm)
+    {
+        // Phase 1: exchange keys by rank through shared memory
+        BlockExchangeKeys(temp_storage.exchange_keys).ScatterToStriped(twiddled_keys, ranks);
+
+        // Striped position of each slot held by this thread
+        int striped_ranks[ITEMS_PER_THREAD];
+        for (int slot = 0; slot < ITEMS_PER_THREAD; ++slot)
+        {
+            striped_ranks[slot] = threadIdx.x + (slot * BLOCK_THREADS);
+        }
+
+        // Phase 2: reuse the direct scatter
+        ScatterKeys<FULL_TILE>(
+            twiddled_keys,
+            relative_bin_offsets,
+            striped_ranks,
+            valid_items,
+            Int2Type<RADIX_SORT_SCATTER_DIRECT>());
+    }
+
+
+    /**
+     * Scatter ranked values directly to global memory (d_values_out), using
+     * the digit offsets already present in relative_bin_offsets
+     */
+    template <bool FULL_TILE>
+    __device__ __forceinline__ void ScatterValues(
+        Value                                   (&values)[ITEMS_PER_THREAD],
+        SizeT                                   (&relative_bin_offsets)[ITEMS_PER_THREAD],
+        int                                     (&ranks)[ITEMS_PER_THREAD],
+        SizeT                                   valid_items,
+        Int2Type<RADIX_SORT_SCATTER_DIRECT>     scatter_algorithm)
+    {
+        // Values need no decoding; write them out as they are
+        ScatterItems<FULL_TILE>(
+            values,
+            ranks,
+            relative_bin_offsets,
+            d_values_out,
+            valid_items);
+    }
+
+
+    /**
+     * Scatter ranked values through shared memory, then to global memory.
+     *
+     * The values are first exchanged by rank into a striped arrangement; the
+     * direct scatter is then run with each slot's striped position as its rank.
+     */
+    template <bool FULL_TILE>
+    __device__ __forceinline__ void ScatterValues(
+        Value                                   (&values)[ITEMS_PER_THREAD],
+        SizeT                                   (&relative_bin_offsets)[ITEMS_PER_THREAD],
+        int                                     (&ranks)[ITEMS_PER_THREAD],
+        SizeT                                   valid_items,
+        Int2Type<RADIX_SORT_SCATTER_TWO_PHASE>  scatter_algorithm)
+    {
+        // Barrier before the exchange scratch area is (re)used
+        __syncthreads();
+
+        // Phase 1: exchange values by rank through shared memory
+        BlockExchangeValues(temp_storage.exchange_values).ScatterToStriped(values, ranks);
+
+        // Striped position of each slot held by this thread
+        int striped_ranks[ITEMS_PER_THREAD];
+        for (int slot = 0; slot < ITEMS_PER_THREAD; ++slot)
+        {
+            striped_ranks[slot] = threadIdx.x + (slot * BLOCK_THREADS);
+        }
+
+        // Phase 2: reuse the direct scatter
+        ScatterValues<FULL_TILE>(
+            values,
+            relative_bin_offsets,
+            striped_ranks,
+            valid_items,
+            Int2Type<RADIX_SORT_SCATTER_DIRECT>());
+    }
+
+
+    /**
+     * Load a tile of items (full-tile overload, selected by Int2Type<true>).
+     *
+     * Issues the unguarded load; valid_items is accepted only so that both
+     * overloads share one call shape.
+     */
+    template <typename BlockLoadT, typename T>
+    __device__ __forceinline__ void LoadItems(
+        BlockLoadT      &block_loader,
+        T               (&items)[ITEMS_PER_THREAD],
+        T               *d_in,
+        SizeT           valid_items,
+        Int2Type<true>  is_full_tile)
+    {
+        // Every slot of the tile is in bounds
+        block_loader.Load(d_in, items);
+    }
+
+
+    /**
+     * Load a tile of items (partial-tile overload, selected by Int2Type<false>).
+     *
+     * Issues the guarded load, passing valid_items through to the loader.
+     */
+    template <typename BlockLoadT, typename T>
+    __device__ __forceinline__ void LoadItems(
+        BlockLoadT      &block_loader,
+        T               (&items)[ITEMS_PER_THREAD],
+        T               *d_in,
+        SizeT           valid_items,
+        Int2Type<false> is_full_tile)
+    {
+        // Guarded load: the loader receives the valid-item count
+        block_loader.Load(d_in, items, valid_items);
+    }
+
+
+    /**
+     * Truck along associated values: load this tile's values from
+     * d_values_in and scatter them using the ranks and digit offsets that
+     * were computed for the keys
+     */
+    template <bool FULL_TILE, typename _Value>
+    __device__ __forceinline__ void GatherScatterValues(
+        _Value      (&values)[ITEMS_PER_THREAD],
+        SizeT       (&relative_bin_offsets)[ITEMS_PER_THREAD],
+        int         (&ranks)[ITEMS_PER_THREAD],
+        SizeT       block_offset,
+        SizeT       valid_items)
+    {
+        // Gather: full or guarded load, chosen by the FULL_TILE tag
+        BlockLoadValues value_loader(temp_storage.load_values);
+        LoadItems(value_loader, values, d_values_in + block_offset, valid_items, Int2Type<FULL_TILE>());
+
+        // Scatter: direct or two-phase, chosen by the policy's SCATTER_ALGORITHM
+        ScatterValues<FULL_TILE>(values, relative_bin_offsets, ranks, valid_items, Int2Type<SCATTER_ALGORITHM>());
+    }
+
+
+    /**
+     * Truck along associated values (specialized for key-only sorting)
+     *
+     * Selected when the value array's element type is NullType.  The body is
+     * empty, so nothing is loaded or stored and every argument is ignored.
+     */
+    template <bool FULL_TILE>
+    __device__ __forceinline__ void GatherScatterValues(
+        NullType    (&values)[ITEMS_PER_THREAD],                ///< Ignored
+        SizeT       (&relative_bin_offsets)[ITEMS_PER_THREAD],  ///< Ignored
+        int         (&ranks)[ITEMS_PER_THREAD],                 ///< Ignored
+        SizeT       block_offset,                               ///< Ignored
+        SizeT       valid_items)                                ///< Ignored
+    {}
+
+
+    /**
+     * Process tile
+     *
+     * Loads one tile of keys starting at block_offset, twiddles them, ranks
+     * them by the digit at current_bit, updates the per-digit scatter base
+     * offsets, then scatters the keys and gathers/scatters the associated
+     * values.
+     *
+     * FULL_TILE selects the LoadItems overload (true: unguarded, false: passes
+     * valid_items to the loader) and is forwarded to the scatter routines.
+     */
+    template <bool FULL_TILE>
+    __device__ __forceinline__ void ProcessTile(
+        SizeT block_offset,                         ///< [in] Offset of this tile's first item within the input
+        const SizeT &valid_items = TILE_ITEMS)      ///< [in] Item count for this tile (defaults to a full tile)
+    {
+        // Per-thread tile data
+        UnsignedBits    keys[ITEMS_PER_THREAD];                     // Keys
+        UnsignedBits    twiddled_keys[ITEMS_PER_THREAD];            // Twiddled keys
+        int             ranks[ITEMS_PER_THREAD];                    // For each key, the local rank within the CTA
+        SizeT           relative_bin_offsets[ITEMS_PER_THREAD];     // For each key, the global scatter base offset of the corresponding digit
+
+        if (LOAD_ALGORITHM != BLOCK_LOAD_DIRECT) __syncthreads();   // Barrier is skipped only for the direct load algorithm (presumably the others reuse shared temp_storage -- confirm)
+
+        // Assign max-key to all keys, so slots a guarded load leaves untouched hold MAX_KEY
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+        {
+            keys[ITEM] = MAX_KEY;
+        }
+
+        // Load tile of keys
+        BlockLoadKeys loader(temp_storage.load_keys);
+        LoadItems(
+            loader,
+            keys,
+            d_keys_in + block_offset,
+            valid_items, 
+            Int2Type<FULL_TILE>());
+
+        __syncthreads();
+
+        // Twiddle key bits if necessary
+        #pragma unroll
+        for (int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++)
+        {
+            twiddled_keys[KEY] = Traits<Key>::TwiddleIn(keys[KEY]);
+        }
+
+        // Rank the twiddled keys; RankKeys also hands back inclusive_digit_prefix
+        int inclusive_digit_prefix;
+        BlockRadixRank(temp_storage.ranking).RankKeys(
+            twiddled_keys,
+            ranks,
+            current_bit,
+            inclusive_digit_prefix);
+
+        // Update global scatter base offsets for each digit (thread d handles digit d)
+        if ((BLOCK_THREADS == RADIX_DIGITS) || (threadIdx.x < RADIX_DIGITS))
+        {
+            int exclusive_digit_prefix;
+
+            // Get exclusive digit prefix from inclusive prefix (the previous thread's inclusive value; zero for thread 0)
+#if CUB_PTX_ARCH >= 300
+            exclusive_digit_prefix = ShuffleUp(inclusive_digit_prefix, 1);
+            if (threadIdx.x == 0)
+                exclusive_digit_prefix = 0;
+#else
+            volatile int* exchange = reinterpret_cast<int *>(temp_storage.relative_bin_offsets);   // Shared scratch reused as an int exchange buffer
+            exchange[threadIdx.x] = 0;
+            exchange[threadIdx.x + 1] = inclusive_digit_prefix;
+            exclusive_digit_prefix = exchange[threadIdx.x];         // NOTE(review): no barrier before this read; appears to rely on warp-synchronous execution -- confirm
+#endif
+
+            bin_offset -= exclusive_digit_prefix;
+            temp_storage.relative_bin_offsets[threadIdx.x] = bin_offset;   // Published for the scatter phase: running offset minus the exclusive prefix
+            bin_offset += inclusive_digit_prefix;                           // Net effect: bin_offset advances by (inclusive - exclusive) for this digit
+        }
+
+        __syncthreads();
+
+        // Scatter keys
+        ScatterKeys<FULL_TILE>(twiddled_keys, relative_bin_offsets, ranks, valid_items, Int2Type<SCATTER_ALGORITHM>());
+
+        // Gather/scatter values (resolves to the empty overload when Value is NullType)
+        Value values[ITEMS_PER_THREAD];
+        GatherScatterValues<FULL_TILE>(values, relative_bin_offsets, ranks, block_offset, valid_items);
+    }
+
+
+    /**
+     * Copy the items in [block_offset, block_oob) from d_in to d_out, one tile
+     * at a time, using striped loads and stores.
+     *
+     * Whole tiles are moved unguarded; a trailing remainder shorter than
+     * TILE_ITEMS is moved with the item-count-guarded load/store variants.
+     */
+    template <typename T>
+    __device__ __forceinline__ void Copy(
+        T       *d_in,              ///< [in] Source pointer
+        T       *d_out,             ///< [out] Destination pointer
+        SizeT   block_offset,       ///< [in] Offset of the first item to copy
+        SizeT   block_oob)          ///< [in] Offset one past the last item to copy
+    {
+        // Whole tiles
+        for (; block_offset + TILE_ITEMS <= block_oob; block_offset += TILE_ITEMS)
+        {
+            T tile[ITEMS_PER_THREAD];
+
+            LoadStriped<LOAD_DEFAULT, BLOCK_THREADS>(threadIdx.x, d_in + block_offset, tile);
+            __syncthreads();
+            StoreStriped<STORE_DEFAULT, BLOCK_THREADS>(threadIdx.x, d_out + block_offset, tile);
+        }
+
+        // Nothing left over
+        if (!(block_offset < block_oob))
+        {
+            return;
+        }
+
+        // Trailing partial tile: guarded I/O limited to the remaining item count
+        SizeT remaining = block_oob - block_offset;
+        T tail[ITEMS_PER_THREAD];
+
+        LoadStriped<LOAD_DEFAULT, BLOCK_THREADS>(threadIdx.x, d_in + block_offset, tail, remaining);
+        __syncthreads();
+        StoreStriped<STORE_DEFAULT, BLOCK_THREADS>(threadIdx.x, d_out + block_offset, tail, remaining);
+    }
+
+
+    /**
+     * Copy tiles within the range of input (specialized for NullType)
+     *
+     * Non-template overload taken when both pointers are NullType*.  The body
+     * is empty, so no memory is touched and all arguments are ignored.
+     */
+    __device__ __forceinline__ void Copy(
+        NullType    *d_in,              ///< Ignored
+        NullType    *d_out,             ///< Ignored
+        SizeT       block_offset,       ///< Ignored
+        SizeT       block_oob)          ///< Ignored
+    {}
+
+
+    //---------------------------------------------------------------------
+    // Interface
+    //---------------------------------------------------------------------
+
+    /**
+     * Constructor
+     *
+     * Takes the calling thread's bin offset directly from the caller and sets
+     * short_circuit to false.  Unlike the spine-based constructor, it reads no
+     * global memory and executes no barrier.
+     */
+    __device__ __forceinline__ BlockRadixSortDownsweepTiles(
+        TempStorage &temp_storage,      ///< [in] Shared storage wrapper; the object keeps the reference returned by Alias()
+        SizeT       bin_offset,         ///< [in] Initial value of this thread's bin_offset field
+        Key         *d_keys_in,         ///< [in] Input keys (stored reinterpreted as UnsignedBits*)
+        Key         *d_keys_out,        ///< [in] Output keys (stored reinterpreted as UnsignedBits*)
+        Value       *d_values_in,       ///< [in] Input values
+        Value       *d_values_out,      ///< [in] Output values
+        int         current_bit)        ///< [in] Bit position stored in current_bit and later passed to RankKeys
+    :
+        temp_storage(temp_storage.Alias()),
+        bin_offset(bin_offset),
+        d_keys_in(reinterpret_cast<UnsignedBits*>(d_keys_in)),
+        d_keys_out(reinterpret_cast<UnsignedBits*>(d_keys_out)),
+        d_values_in(d_values_in),
+        d_values_out(d_values_out),
+        current_bit(current_bit),
+        short_circuit(false)
+    {}
+
+
+    /**
+     * Constructor
+     *
+     * Reads this block's per-digit bin offsets from the spine and decides
+     * whether the pass can be short-circuited into a plain copy.
+     *
+     * The spine is indexed as d_spine[(gridDim.x * digit) + block].  Only
+     * threads with threadIdx.x < RADIX_DIGITS assign bin_offset; it is left
+     * unassigned in the remaining threads.  Contains a __syncthreads(), so
+     * every thread of the block must construct the object.
+     *
+     * NOTE(review): the vote is a WarpAll issued inside the
+     * threadIdx.x < RADIX_DIGITS branch and each participating thread stores
+     * the result to the single shared flag.  If RADIX_DIGITS covers more than
+     * one warp, the warps may store different votes -- confirm this is intended.
+     */
+    __device__ __forceinline__ BlockRadixSortDownsweepTiles(
+        TempStorage &temp_storage,      ///< [in] Shared storage wrapper; the object keeps the reference returned by Alias()
+        SizeT       num_items,          ///< [in] Problem size that the first block's bin offsets are compared against
+        SizeT       *d_spine,           ///< [in] Spine of bin offsets, read at (gridDim.x * digit) + block
+        Key         *d_keys_in,         ///< [in] Input keys (stored reinterpreted as UnsignedBits*)
+        Key         *d_keys_out,        ///< [in] Output keys (stored reinterpreted as UnsignedBits*)
+        Value       *d_values_in,       ///< [in] Input values
+        Value       *d_values_out,      ///< [in] Output values
+        int         current_bit)        ///< [in] Bit position stored in current_bit and later passed to RankKeys
+    :
+        temp_storage(temp_storage.Alias()),
+        d_keys_in(reinterpret_cast<UnsignedBits*>(d_keys_in)),
+        d_keys_out(reinterpret_cast<UnsignedBits*>(d_keys_out)),
+        d_values_in(d_values_in),
+        d_values_out(d_values_out),
+        current_bit(current_bit)
+    {
+        // Load digit bin offsets (each of the first RADIX_DIGITS threads will load an offset for that digit)
+        if (threadIdx.x < RADIX_DIGITS)
+        {
+            // Vote to short circuit if each bin offset of the first block (block 0) is either zero or the problem size
+            SizeT first_block_bin_offset = d_spine[gridDim.x * threadIdx.x];
+            int predicate = ((first_block_bin_offset == 0) || (first_block_bin_offset == num_items));
+            this->temp_storage.short_circuit = WarpAll(predicate);
+
+            // Load my block's bin offset for my bin
+            bin_offset = d_spine[(gridDim.x * threadIdx.x) + blockIdx.x];
+        }
+
+        __syncthreads();
+
+        short_circuit = this->temp_storage.short_circuit;   // Every thread takes a private copy of the shared flag after the barrier
+    }
+
+
+    /**
+     * Distribute the keys (and any values) found in [block_offset, block_oob).
+     *
+     * When the short-circuit flag is clear, the range is walked as full tiles
+     * followed by at most one partial tile, each handled by ProcessTile.
+     * When it is set, keys and values are simply copied from input to output.
+     */
+    __device__ __forceinline__ void ProcessTiles(
+        SizeT           block_offset,       ///< [in] Offset of the first item assigned to this block
+        const SizeT     &block_oob)         ///< [in] Offset one past the last item assigned to this block
+    {
+        if (!short_circuit)
+        {
+            // Full tiles
+            for (; block_offset + TILE_ITEMS <= block_oob; block_offset += TILE_ITEMS)
+            {
+                ProcessTile<true>(block_offset);
+            }
+
+            // Trailing partial tile, handled with guarded I/O
+            if (block_offset < block_oob)
+            {
+                ProcessTile<false>(block_offset, block_oob - block_offset);
+            }
+
+            return;
+        }
+
+        // Short-circuit path: keys first, then values
+        Copy(d_keys_in, d_keys_out, block_offset, block_oob);
+        Copy(d_values_in, d_values_out, block_offset, block_oob);
+    }
+};
+
+
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
--- a/lib/kokkos/TPL/cub/device/block/block_radix_sort_upsweep_tiles.cuh
+++ b/lib/kokkos/TPL/cub/device/block/block_radix_sort_upsweep_tiles.cuh
@ -0,0 +1,464 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2013, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * BlockRadixSortUpsweepTiles implements a stateful abstraction of CUDA thread blocks for participating in device-wide radix sort upsweep.
+ */
+
+#pragma once
+
+#include "../../thread/thread_reduce.cuh"
+#include "../../thread/thread_load.cuh"
+#include "../../block/block_load.cuh"
+#include "../../util_type.cuh"
+#include "../../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+/******************************************************************************
+ * Tuning policy types
+ ******************************************************************************/
+
+/**
+ * Tuning policy for BlockRadixSortUpsweepTiles
+ *
+ * Re-exports its template arguments as nested compile-time constants and
+ * derives TILE_ITEMS from them.  AltPolicy names the same policy with one
+ * fewer radix bit, never dropping below one bit.
+ */
+template <
+    int                 _BLOCK_THREADS,     ///< The number of threads per CTA
+    int                 _ITEMS_PER_THREAD,  ///< The number of items to load per thread per tile
+    PtxLoadModifier     _LOAD_MODIFIER,     ///< Load cache-modifier
+    int                 _RADIX_BITS>        ///< The number of radix bits, i.e., log2(bins)
+struct BlockRadixSortUpsweepTilesPolicy
+{
+    enum
+    {
+        BLOCK_THREADS       = _BLOCK_THREADS,
+        ITEMS_PER_THREAD    = _ITEMS_PER_THREAD,
+        RADIX_BITS          = _RADIX_BITS,
+        TILE_ITEMS          = BLOCK_THREADS * ITEMS_PER_THREAD,     // Items covered by the whole CTA per tile
+    };
+
+    static const PtxLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER;
+
+    typedef BlockRadixSortUpsweepTilesPolicy<
+        BLOCK_THREADS,
+        ITEMS_PER_THREAD,
+        LOAD_MODIFIER,
+        CUB_MAX(1, RADIX_BITS - 1)> AltPolicy;                      // Same tuning with RADIX_BITS - 1 bits, clamped to at least 1
+};
+
+
+/******************************************************************************
+ * Thread block abstractions
+ ******************************************************************************/
+
+/**
+ * \brief BlockRadixSortUpsweepTiles implements a stateful abstraction of CUDA thread blocks for participating in device-wide radix sort upsweep.
+ *
+ * Computes radix digit histograms over a range of input tiles.
+ *
+ * Each key's digit (RADIX_BITS bits starting at current_bit) is tallied in a
+ * per-thread, byte-wide counter held in shared memory.  Because those
+ * counters are unsigned char, they are drained into the SizeT registers
+ * local_counts after each batch of at most UNROLL_COUNT tiles, and the
+ * per-thread partial counts are finally reduced to one count per digit.
+ *
+ * ProcessTiles() contains __syncthreads() calls, so it must be called by
+ * every thread of the block.
+ */
+template <
+    typename BlockRadixSortUpsweepTilesPolicy,      ///< Tuning policy supplying BLOCK_THREADS, ITEMS_PER_THREAD, RADIX_BITS and LOAD_MODIFIER
+    typename Key,                                   ///< Key type; keys are handled as Traits<Key>::UnsignedBits
+    typename SizeT>                                 ///< Integer type used for offsets and digit counts
+struct BlockRadixSortUpsweepTiles
+{
+
+    //---------------------------------------------------------------------
+    // Type definitions and constants
+    //---------------------------------------------------------------------
+
+    typedef typename Traits<Key>::UnsignedBits UnsignedBits;
+
+    // Integer type for digit counters (to be packed into words of PackedCounters)
+    typedef unsigned char DigitCounter;
+
+    // Integer type for packing DigitCounters into columns of shared memory banks
+    typedef unsigned int PackedCounter;
+
+    static const PtxLoadModifier LOAD_MODIFIER = BlockRadixSortUpsweepTilesPolicy::LOAD_MODIFIER;
+
+    enum
+    {
+        RADIX_BITS              = BlockRadixSortUpsweepTilesPolicy::RADIX_BITS,
+        BLOCK_THREADS           = BlockRadixSortUpsweepTilesPolicy::BLOCK_THREADS,
+        KEYS_PER_THREAD         = BlockRadixSortUpsweepTilesPolicy::ITEMS_PER_THREAD,
+
+        RADIX_DIGITS            = 1 << RADIX_BITS,                                      // Number of distinct digit values (histogram bins)
+
+        LOG_WARP_THREADS        = PtxArchProps::LOG_WARP_THREADS,
+        WARP_THREADS            = 1 << LOG_WARP_THREADS,
+        WARPS                   = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS,    // Warps per CTA, rounded up
+
+        TILE_ITEMS              = BLOCK_THREADS * KEYS_PER_THREAD,
+
+        BYTES_PER_COUNTER       = sizeof(DigitCounter),
+        LOG_BYTES_PER_COUNTER   = Log2<BYTES_PER_COUNTER>::VALUE,
+
+        PACKING_RATIO           = sizeof(PackedCounter) / sizeof(DigitCounter),         // DigitCounters per PackedCounter word
+        LOG_PACKING_RATIO       = Log2<PACKING_RATIO>::VALUE,
+
+        LOG_COUNTER_LANES       = CUB_MAX(0, RADIX_BITS - LOG_PACKING_RATIO),           // Digit bits left over after the sub-counter bits (never negative)
+        COUNTER_LANES           = 1 << LOG_COUNTER_LANES,                               // Rows of packed counters
+
+        // To prevent counter overflow, we must periodically unpack and aggregate the
+        // digit counters back into registers.  Each counter lane is assigned to a
+        // warp for aggregation.
+
+        LANES_PER_WARP          = CUB_MAX(1, (COUNTER_LANES + WARPS - 1) / WARPS),      // Counter lanes each warp aggregates (rounded up, at least 1)
+
+        // Unroll tiles in batches without risk of counter overflow
+        UNROLL_COUNT            = CUB_MIN(64, 255 / KEYS_PER_THREAD),                   // Keeps UNROLL_COUNT * KEYS_PER_THREAD <= 255 increments per thread per batch
+        UNROLLED_ELEMENTS       = UNROLL_COUNT * TILE_ITEMS,                            // Items consumed by one unrolled batch
+    };
+
+
+
+    /**
+     * Shared memory storage layout
+     *
+     * All three members alias the same storage.  ProcessTiles() places a
+     * barrier before the final reduction switches from the counter views to
+     * digit_partials.
+     */
+    struct _TempStorage
+    {
+        union
+        {
+            DigitCounter    digit_counters[COUNTER_LANES][BLOCK_THREADS][PACKING_RATIO];   // Byte view: [lane][thread][sub-counter]
+            PackedCounter   packed_counters[COUNTER_LANES][BLOCK_THREADS];                 // Word view of the same counters, used for zeroing
+            SizeT           digit_partials[RADIX_DIGITS][WARP_THREADS + 1];                // [digit][warp_tid] partial sums (NOTE(review): the +1 column looks like padding -- confirm)
+        };
+    };
+
+
+    /// Alias wrapper allowing storage to be unioned
+    struct TempStorage : Uninitialized<_TempStorage> {};
+
+
+    //---------------------------------------------------------------------
+    // Thread fields (aggregate state bundle)
+    //---------------------------------------------------------------------
+
+    // Shared storage for this CTA
+    _TempStorage    &temp_storage;
+
+    // Thread-local counters for periodically aggregating composite-counter lanes
+    SizeT           local_counts[LANES_PER_WARP][PACKING_RATIO];
+
+    // Input device pointer (keys viewed as UnsignedBits)
+    UnsignedBits    *d_keys_in;
+
+    // The least-significant bit position of the current digit to extract
+    int             current_bit;
+
+
+
+    //---------------------------------------------------------------------
+    // Helper structure for templated iteration
+    //---------------------------------------------------------------------
+
+    // Iterate: compile-time recursion helper.  BucketKeys steps COUNT up to MAX; ProcessTiles ignores COUNT and splits MAX in half.
+    template <int COUNT, int MAX>
+    struct Iterate
+    {
+        enum {
+            HALF = (MAX / 2),
+        };
+
+        // BucketKeys: bucket keys[COUNT], then recurse on COUNT + 1
+        static __device__ __forceinline__ void BucketKeys(
+            BlockRadixSortUpsweepTiles &cta,
+            UnsignedBits keys[KEYS_PER_THREAD])
+        {
+            cta.Bucket(keys[COUNT]);
+
+            // Next
+            Iterate<COUNT + 1, MAX>::BucketKeys(cta, keys);
+        }
+
+        // ProcessTiles: process MAX consecutive tiles as HALF tiles followed by the remaining MAX - HALF tiles
+        static __device__ __forceinline__ void ProcessTiles(BlockRadixSortUpsweepTiles &cta, SizeT block_offset)
+        {
+            // Next
+            Iterate<1, HALF>::ProcessTiles(cta, block_offset);
+            Iterate<1, MAX - HALF>::ProcessTiles(cta, block_offset + (HALF * TILE_ITEMS));
+        }
+    };
+
+    // Terminate (matches when COUNT == MAX, e.g. Iterate<1, 1> for ProcessTiles)
+    template <int MAX>
+    struct Iterate<MAX, MAX>
+    {
+        // BucketKeys: end of recursion, nothing left to bucket
+        static __device__ __forceinline__ void BucketKeys(BlockRadixSortUpsweepTiles &cta, UnsignedBits keys[KEYS_PER_THREAD]) {}
+
+        // ProcessTiles: end of recursion, process exactly one full tile
+        static __device__ __forceinline__ void ProcessTiles(BlockRadixSortUpsweepTiles &cta, SizeT block_offset)
+        {
+            cta.ProcessFullTile(block_offset);
+        }
+    };
+
+
+    //---------------------------------------------------------------------
+    // Utility methods
+    //---------------------------------------------------------------------
+
+    /**
+     * Decode a key and increment corresponding smem digit counter
+     *
+     * The digit's low LOG_PACKING_RATIO bits pick the byte inside the calling
+     * thread's packed word; the next LOG_COUNTER_LANES bits pick the counter
+     * lane (row).  Only the calling thread's own column is touched.
+     */
+    __device__ __forceinline__ void Bucket(UnsignedBits key)
+    {
+        // Perform transform op
+        UnsignedBits converted_key = Traits<Key>::TwiddleIn(key);
+
+        // Sub-counter index: LOG_PACKING_RATIO bits starting at current_bit
+        UnsignedBits sub_counter = BFE(converted_key, current_bit, LOG_PACKING_RATIO);
+
+        // Row (counter lane) index: the next LOG_COUNTER_LANES bits above those
+        UnsignedBits row_offset = BFE(converted_key, current_bit + LOG_PACKING_RATIO, LOG_COUNTER_LANES);
+
+        // Increment counter
+        temp_storage.digit_counters[row_offset][threadIdx.x][sub_counter]++;
+
+    }
+
+
+    /**
+     * Reset composite counters
+     *
+     * Each thread zeroes its own packed word in every counter lane, which
+     * clears all PACKING_RATIO byte counters of that word at once.
+     */
+    __device__ __forceinline__ void ResetDigitCounters()
+    {
+        #pragma unroll
+        for (int LANE = 0; LANE < COUNTER_LANES; LANE++)
+        {
+            temp_storage.packed_counters[LANE][threadIdx.x] = 0;
+        }
+    }
+
+
+    /**
+     * Reset the unpacked counters in each thread
+     *
+     * Zeroes every entry of the per-thread local_counts array.
+     */
+    __device__ __forceinline__ void ResetUnpackedCounters()
+    {
+        #pragma unroll
+        for (int LANE = 0; LANE < LANES_PER_WARP; LANE++)
+        {
+            #pragma unroll
+            for (int UNPACKED_COUNTER = 0; UNPACKED_COUNTER < PACKING_RATIO; UNPACKED_COUNTER++)
+            {
+                local_counts[LANE][UNPACKED_COUNTER] = 0;
+            }
+        }
+    }
+
+
+    /**
+     * Extracts and aggregates the digit counters for each counter lane
+     * owned by this warp
+     *
+     * Warp w owns lanes w, w + WARPS, w + 2 * WARPS, ... (those below
+     * COUNTER_LANES).  Within an owned lane, a thread adds into local_counts
+     * the byte counters of threads warp_tid, warp_tid + WARP_THREADS, ...
+     * up to BLOCK_THREADS.  It reads counters written by other threads, so
+     * callers synchronize the block before and after (see ProcessTiles).
+     */
+    __device__ __forceinline__ void UnpackDigitCounts()
+    {
+        unsigned int warp_id = threadIdx.x >> LOG_WARP_THREADS;
+        unsigned int warp_tid = threadIdx.x & (WARP_THREADS - 1);
+
+        #pragma unroll
+        for (int LANE = 0; LANE < LANES_PER_WARP; LANE++)
+        {
+            const int counter_lane = (LANE * WARPS) + warp_id;
+            if (counter_lane < COUNTER_LANES)
+            {
+                #pragma unroll
+                for (int PACKED_COUNTER = 0; PACKED_COUNTER < BLOCK_THREADS; PACKED_COUNTER += WARP_THREADS)
+                {
+                    #pragma unroll
+                    for (int UNPACKED_COUNTER = 0; UNPACKED_COUNTER < PACKING_RATIO; UNPACKED_COUNTER++)
+                    {
+                        SizeT counter = temp_storage.digit_counters[counter_lane][warp_tid + PACKED_COUNTER][UNPACKED_COUNTER];
+                        local_counts[LANE][UNPACKED_COUNTER] += counter;
+                    }
+                }
+            }
+        }
+    }
+
+
+    /**
+     * Places unpacked counters into smem for final digit reduction
+     *
+     * Each thread writes its local_counts into digit_partials[digit][warp_tid],
+     * where digit = (counter_lane << LOG_PACKING_RATIO) + sub-counter.  After a
+     * barrier, thread d (for d < RADIX_DIGITS) reduces WARP_THREADS entries of
+     * row d with Sum() into bin_count.  bin_count is left untouched in all
+     * other threads.
+     */
+    __device__ __forceinline__ void ReduceUnpackedCounts(SizeT &bin_count)
+    {
+        unsigned int warp_id = threadIdx.x >> LOG_WARP_THREADS;
+        unsigned int warp_tid = threadIdx.x & (WARP_THREADS - 1);
+
+        // Place unpacked digit counters in shared memory
+        #pragma unroll
+        for (int LANE = 0; LANE < LANES_PER_WARP; LANE++)
+        {
+            int counter_lane = (LANE * WARPS) + warp_id;
+            if (counter_lane < COUNTER_LANES)
+            {
+                int digit_row = counter_lane << LOG_PACKING_RATIO;
+
+                #pragma unroll
+                for (int UNPACKED_COUNTER = 0; UNPACKED_COUNTER < PACKING_RATIO; UNPACKED_COUNTER++)
+                {
+                    temp_storage.digit_partials[digit_row + UNPACKED_COUNTER][warp_tid] =
+                        local_counts[LANE][UNPACKED_COUNTER];
+                }
+            }
+        }
+
+        __syncthreads();
+
+        // Rake-reduce bin_count reductions
+        if (threadIdx.x < RADIX_DIGITS)
+        {
+            bin_count = ThreadReduce<WARP_THREADS>(
+                temp_storage.digit_partials[threadIdx.x],
+                Sum());
+        }
+    }
+
+
+    /**
+     * Processes a single, full tile
+     *
+     * Loads KEYS_PER_THREAD keys per thread with an unguarded striped load,
+     * then buckets each of them.
+     */
+    __device__ __forceinline__ void ProcessFullTile(SizeT block_offset)
+    {
+        // Tile of keys
+        UnsignedBits keys[KEYS_PER_THREAD];
+
+        LoadStriped<LOAD_MODIFIER, BLOCK_THREADS>(threadIdx.x, d_keys_in + block_offset, keys);
+
+        // Prevent hoisting (both statements below are commented out, i.e. currently disabled)
+//        __threadfence_block();
+//        __syncthreads();
+
+        // Bucket tile of keys
+        Iterate<0, KEYS_PER_THREAD>::BucketKeys(*this, keys);
+    }
+
+
+    /**
+     * Processes a single load (may have some threads masked off)
+     *
+     * Each thread starts at block_offset + threadIdx.x and advances by
+     * BLOCK_THREADS, loading and bucketing one key at a time while its
+     * offset stays below block_oob.
+     */
+    __device__ __forceinline__ void ProcessPartialTile(
+        SizeT block_offset,
+        const SizeT &block_oob)
+    {
+        // Process partial tile if necessary using single loads
+        block_offset += threadIdx.x;
+        while (block_offset < block_oob)
+        {
+            // Load and bucket key
+            UnsignedBits key = ThreadLoad<LOAD_MODIFIER>(d_keys_in + block_offset);
+            Bucket(key);
+            block_offset += BLOCK_THREADS;
+        }
+    }
+
+
+    //---------------------------------------------------------------------
+    // Interface
+    //---------------------------------------------------------------------
+
+    /**
+     * Constructor
+     *
+     * Stores the aliased shared storage reference, the key pointer viewed as
+     * UnsignedBits*, and the digit's bit position.  Counters are not reset
+     * here; ProcessTiles() does that.
+     */
+    __device__ __forceinline__ BlockRadixSortUpsweepTiles(
+        TempStorage &temp_storage,
+        Key         *d_keys_in,
+        int         current_bit)
+    :
+        temp_storage(temp_storage.Alias()),
+        d_keys_in(reinterpret_cast<UnsignedBits*>(d_keys_in)),
+        current_bit(current_bit)
+    {}
+
+
+    /**
+     * Compute radix digit histograms from a segment of input tiles.
+     *
+     * Walks [block_offset, block_oob) as unrolled batches of UNROLL_COUNT
+     * tiles, then single full tiles, then a trailing partial tile.  The byte
+     * counters are drained into registers after every unrolled batch and once
+     * more at the end.
+     */
+    __device__ __forceinline__ void ProcessTiles(
+        SizeT           block_offset,
+        const SizeT     &block_oob,
+        SizeT           &bin_count)                ///< [out] The digit count for tid'th bin (output param, valid in the first RADIX_DIGITS threads)
+    {
+        // Reset digit counters in smem and unpacked counters in registers
+        ResetDigitCounters();
+        ResetUnpackedCounters();
+
+        // Unroll batches of full tiles
+        while (block_offset + UNROLLED_ELEMENTS <= block_oob)
+        {
+            Iterate<0, UNROLL_COUNT>::ProcessTiles(*this, block_offset);
+            block_offset += UNROLLED_ELEMENTS;
+
+            __syncthreads();
+
+            // Aggregate back into local_count registers to prevent overflow
+            UnpackDigitCounts();
+
+            __syncthreads();
+
+            // Reset composite counters in lanes
+            ResetDigitCounters();
+        }
+
+        // Unroll single full tiles
+        while (block_offset + TILE_ITEMS <= block_oob)
+        {
+            ProcessFullTile(block_offset);
+            block_offset += TILE_ITEMS;
+        }
+
+        // Process partial tile if necessary
+        ProcessPartialTile(
+            block_offset,
+            block_oob);
+
+        __syncthreads();
+
+        // Aggregate back into local_count registers
+        UnpackDigitCounts();
+
+        __syncthreads();
+
+        // Final raking reduction of counts by bin
+        ReduceUnpackedCounts(bin_count);
+    }
+
+};
+
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
--- a/lib/kokkos/TPL/cub/device/block/block_reduce_by_key_tiles.cuh
+++ b/lib/kokkos/TPL/cub/device/block/block_reduce_by_key_tiles.cuh
@ -0,0 +1,399 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2013, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::BlockReduceByKeyiles implements a stateful abstraction of CUDA thread blocks for participating in device-wide reduce-value-by-key.
+ */
+
+#pragma once
+
+#include <iterator>
+
+#include "scan_tiles_types.cuh"
+#include "../../block/block_load.cuh"
+#include "../../block/block_discontinuity.cuh"
+#include "../../block/block_scan.cuh"
+#include "../../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/******************************************************************************
+ * Utility data types
+ ******************************************************************************/
+
+/// Scan tuple data type for reduce-value-by-key: pairs a value with an integer flag/offset so both can be carried through one scan
+template <typename Value, typename SizeT>
+struct ReduceByKeyuple
+{   // NOTE(review): the name looks like a truncated "ReduceByKeyTuple"; confirm before renaming, since other types refer to it by this spelling
+    Value   value;      // Initially set as value, contains segment aggregate after prefix scan
+    SizeT   flag;       // Initially set as a tail flag, contains scatter offset after prefix scan
+};
+
+
+/// Binary reduce-by-key scan operator.  Combines two scan tuples (a `value` and a `flag` field each) so a single prefix scan produces both running aggregates and flag sums.
+template <typename ReductionOp>
+struct ReduceByKeyScanOp
+{
+    /// Reduction functor applied to the two `value` fields when they are combined
+    ReductionOp reduction_op;
+
+    /// Constructor.  Callable from host and device code, so the functor can be built inside kernels that use its __device__ operator().
+    __host__ __device__ __forceinline__ ReduceByKeyScanOp(ReductionOp reduction_op) : reduction_op(reduction_op)
+    {}
+
+    /**
+     * Binary scan operator
+     *
+     * Returns a tuple whose flag is first.flag + second.flag and whose value
+     * is second.value when second.flag is non-zero, otherwise
+     * reduction_op(first.value, second.value).
+     *
+     * The fields are accessed as `value` and `flag`, matching the members
+     * declared by the scan tuple type in this file.
+     */
+    template <typename ScanTupleT>
+    __device__ __forceinline__ ScanTupleT operator()(
+        const ScanTupleT &first,        ///< [in] Left-hand (earlier) operand
+        const ScanTupleT &second)       ///< [in] Right-hand (later) operand
+    {
+        ScanTupleT retval;
+        retval.value = (second.flag) ? second.value : reduction_op(first.value, second.value);
+        retval.flag = first.flag + second.flag;
+        return retval;
+    }
+};
+
+
+
+/******************************************************************************
+ * Tuning policy types
+ ******************************************************************************/
+
+/**
+ * Tuning policy for BlockReduceByKeyiles
+ *
+ * Re-exports its template arguments as nested compile-time constants so
+ * the thread block abstraction can read them from a single policy type.
+ */
+template <
+    int                         _BLOCK_THREADS,             ///< Exported as BLOCK_THREADS
+    int                         _ITEMS_PER_THREAD,          ///< Exported as ITEMS_PER_THREAD
+    BlockLoadAlgorithm          _LOAD_ALGORITHM,            ///< Exported as LOAD_ALGORITHM
+    bool                        _LOAD_WARP_TIME_SLICING,    ///< Exported as LOAD_WARP_TIME_SLICING
+    PtxLoadModifier             _LOAD_MODIFIER,             ///< Exported as LOAD_MODIFIER
+    BlockScanAlgorithm          _SCAN_ALGORITHM>            ///< Exported as SCAN_ALGORITHM
+struct BlockReduceByKeyilesPolicy
+{
+    enum
+    {
+        BLOCK_THREADS           = _BLOCK_THREADS,
+        ITEMS_PER_THREAD        = _ITEMS_PER_THREAD,
+        LOAD_WARP_TIME_SLICING  = _LOAD_WARP_TIME_SLICING,
+    };
+
+    static const BlockLoadAlgorithm     LOAD_ALGORITHM      = _LOAD_ALGORITHM;
+    static const PtxLoadModifier        LOAD_MODIFIER       = _LOAD_MODIFIER;
+    static const BlockScanAlgorithm     SCAN_ALGORITHM      = _SCAN_ALGORITHM;
+};
+
+
+/******************************************************************************
+ * Thread block abstractions
+ ******************************************************************************/
+
+/**
+ * \brief BlockReduceByKeyiles implements a stateful abstraction of CUDA thread blocks for participating in device-wide reduce-value-by-key.
+ */
+template <
+    typename BlockReduceByKeyilesPolicy,   ///< Tuning policy
+    typename KeyInputIteratorRA,            ///< Random-access input iterator type for keys
+    typename KeyOutputIteratorRA,           ///< Random-access output iterator type for keys
+    typename ValueInputIteratorRA,          ///< Random-access input iterator type for values
+    typename ValueOutputIteratorRA,         ///< Random-access output iterator type for values
+    typename ReductionOp,                   ///< Reduction functor type
+    typename SizeT>                         ///< Offset integer type
+struct BlockReduceByKeyiles
+{
+    //---------------------------------------------------------------------
+    // Types and constants
+    //---------------------------------------------------------------------
+
+    // Data types of input iterators
+    typedef typename std::iterator_traits<KeyInputIteratorRA>::value_type   Key;    // Key data type
+    typedef typename std::iterator_traits<ValueInputIteratorRA>::value_type Value;  // Value data type
+
+    // Constants
+    enum
+    {
+        BLOCK_THREADS       = BlockReduceByKeyilesPolicy::BLOCK_THREADS,
+        ITEMS_PER_THREAD    = BlockReduceByKeyilesPolicy::ITEMS_PER_THREAD,
+        TILE_ITEMS          = BLOCK_THREADS * ITEMS_PER_THREAD,
+        STATUS_PADDING      = PtxArchProps::WARP_THREADS,
+    };
+
+    // Block load type for keys
+    typedef BlockLoad<
+        KeyInputIteratorRA,
+        BlockReduceByKeyilesPolicy::BLOCK_THREADS,
+        BlockReduceByKeyilesPolicy::ITEMS_PER_THREAD,
+        BlockReduceByKeyilesPolicy::LOAD_ALGORITHM,
+        BlockReduceByKeyilesPolicy::LOAD_MODIFIER,
+        BlockReduceByKeyilesPolicy::LOAD_WARP_TIME_SLICING>    BlockLoadKeys;
+
+    // Block load type for values
+    typedef BlockLoad<
+        ValueInputIteratorRA,
+        BlockReduceByKeyilesPolicy::BLOCK_THREADS,
+        BlockReduceByKeyilesPolicy::ITEMS_PER_THREAD,
+        BlockReduceByKeyilesPolicy::LOAD_ALGORITHM,
+        BlockReduceByKeyilesPolicy::LOAD_MODIFIER,
+        BlockReduceByKeyilesPolicy::LOAD_WARP_TIME_SLICING>    BlockLoadValues;
+
+    // Block discontinuity type for setting tail flags
+    typedef BlockDiscontinuity<Key, BLOCK_THREADS>              BlockDiscontinuityKeys;
+
+    // Scan tuple type
+    typedef ReduceByKeyuple<Value, SizeT>                      ScanTuple;
+
+    // Tile status descriptor type
+    typedef ScanTileDescriptor<ScanTuple>                 ScanTileDescriptorT;
+
+    // Block scan functor type
+    typedef ReduceByKeyScanOp<ReductionOp>                      ScanOp;
+
+    // Block scan prefix callback type
+    typedef DeviceScanBlockPrefixOp<ScanTuple, ScanOp>          PrefixCallback;
+
+    // Block scan type
+    typedef BlockScan<
+        ScanTuple,
+        BlockReduceByKeyilesPolicy::BLOCK_THREADS,
+        BlockReduceByKeyilesPolicy::SCAN_ALGORITHM>            BlockScanT;
+
+    /// Shared memory type for this threadblock
+    struct _TempStorage
+    {
+        union
+        {
+            typename BlockLoadKeys::TempStorage         load_keys;      // Smem needed for loading tiles of keys
+            typename BlockLoadValues::TempStorage       load_values;    // Smem needed for loading tiles of values
+            struct
+            {
+                typename BlockScanT::TempStorage        scan;           // Smem needed for tile scanning
+                typename PrefixCallback::TempStorage    prefix;         // Smem needed for cooperative prefix callback
+            };
+        };
+
+        typename BlockDiscontinuityKeys::TempStorage    flagging;       // Smem needed for tile scanning
+        SizeT                                           tile_idx;       // Shared tile index
+    };
+
+    /// Alias wrapper allowing storage to be unioned
+    struct TempStorage : Uninitialized<_TempStorage> {};
+
+
+
+    //---------------------------------------------------------------------
+    // Per-thread fields
+    //---------------------------------------------------------------------
+
+    _TempStorage                &temp_storage;      ///< Reference to temp_storage
+    KeyInputIteratorRA          d_keys_in;          ///< Key input data
+    KeyOutputIteratorRA         d_keys_out;         ///< Key output data
+    ValueInputIteratorRA        d_values_in;        ///< Value input data
+    ValueOutputIteratorRA       d_values_out;       ///< Value output data
+    ScanTileDescriptorT         *d_tile_status;     ///< Global list of tile status
+    ScanOp                      scan_op;            ///< Binary scan operator
+    int                         num_tiles;          ///< Total number of input tiles for the entire problem
+    SizeT                       num_items;          ///< Total number of scan items for the entire problem
+
+
+    //---------------------------------------------------------------------
+    // Interface
+    //---------------------------------------------------------------------
+
+    // Constructor
+    __device__ __forceinline__
+    BlockReduceByKeyiles(
+        TempStorage                 &temp_storage,      ///< Reference to temp_storage
+        KeyInputIteratorRA          d_keys_in,          ///< Key input data
+        KeyOutputIteratorRA         d_keys_out,         ///< Key output data
+        ValueInputIteratorRA        d_values_in,        ///< Value input data
+        ValueOutputIteratorRA       d_values_out,       ///< Value output data
+        ScanTileDescriptorT       *d_tile_status,     ///< Global list of tile status
+        ReductionOp                 reduction_op,       ///< Binary scan operator
+        int                         num_tiles,          ///< Total number of input tiles for the entire problem
+        SizeT                       num_items)          ///< Total number of scan items for the entire problem
+    :
+        temp_storage(temp_storage.Alias()),
+        d_keys_in(d_keys_in),
+        d_keys_out(d_keys_out),
+        d_values_in(d_values_in),
+        d_values_out(d_values_out),
+        d_tile_status(d_tile_status),
+        scan_op(reduction_op),
+        num_tiles(num_tiles),
+        num_items(num_items)
+    {}
+
+
+    /**
+     * Process a tile of input
+     */
+    template <bool FULL_TILE>
+    __device__ __forceinline__ void ConsumeTile(
+        int     tile_idx,                   ///< Tile index
+        SizeT   block_offset,               ///< Tile offset
+        int     valid_items = TILE_ITEMS)   ///< Number of valid items in the tile
+    {
+        Key         keys[ITEMS_PER_THREAD];
+        Value       values[ITEMS_PER_THREAD];
+        int         tail_flags[ITEMS_PER_THREAD];
+        ScanTuple   scan_tuples[ITEMS_PER_THREAD];
+
+        // Load keys
+        if (FULL_TILE)
+            BlockLoadKeys(temp_storage.load_keys).Load(d_keys_in + block_offset, keys);
+        else
+            BlockLoadKeys(temp_storage.load_keys).Load(d_keys_in + block_offset, keys, valid_items);
+
+        // Set tail flags
+        if (tile_idx == num_tiles - 1)
+        {
+            // Last tile
+            BlockDiscontinuityKeys(temp_storage.flagging).FlagTails(tail_flags, keys, Equality());
+        }
+        else
+        {
+            // Preceding tiles require the first element of the next tile
+            Key tile_suffix_item;
+            if (threadIdx.x == 0)
+                tile_suffix_item = d_keys_in[block_offset + TILE_ITEMS];
+
+            BlockDiscontinuityKeys(temp_storage.flagging).FlagTails(tail_flags, keys, Equality(), tile_suffix_item);
+        }
+
+        __syncthreads();
+
+        // Load values
+        if (FULL_TILE)
+            BlockLoadValues(temp_storage.load_values).Load(d_values_in + block_offset, values);
+        else
+            BlockLoadValues(temp_storage.load_values).Load(d_values_in + block_offset, values, valid_items);
+
+        // Assemble scan tuples
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+        {
+            scan_tuples[ITEM].value     = values[ITEM];
+            scan_tuples[ITEM].flag      = tail_flags[ITEM];
+        }
+
+        __syncthreads();
+
+        // Perform inclusive prefix scan
+        ScanTuple block_aggregate;
+        if (tile_idx == 0)
+        {
+            // Without prefix callback
+            BlockScanT(temp_storage.scan).InclusiveScan(scan_tuples, scan_tuples, scan_op, block_aggregate);
+
+            // Update tile status
+            if (threadIdx.x == 0)
+                ScanTileDescriptorT::SetPrefix(d_tile_status, block_aggregate);
+        }
+        else
+        {
+            // With prefix callback
+            PrefixCallback prefix_op(d_tile_status, temp_storage.prefix, scan_op, tile_idx);
+            BlockScanT(temp_storage.scan).InclusiveScan(scan_tuples, scan_tuples, scan_op, block_aggregate, prefix_op);
+        }
+
+        // Scatter flagged keys and values to output
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+        {
+            int tile_item = (threadIdx.x * ITEMS_PER_THREAD) + ITEM;
+
+            // Set the head flag on the last item in a partially-full tile
+            if (!FULL_TILE && (tile_item == valid_items - 1))
+                tail_flags[ITEM] = 1;
+
+            // Decrement scatter offset
+            scan_tuples[ITEM].flag--;
+
+            // Scatter key and aggregate value if flagged and in range
+            if ((FULL_TILE || (tile_item < valid_items)) && (tail_flags[ITEM]))
+            {
+                d_keys_out[scan_tuples[ITEM].flag]      = keys[ITEM];
+                d_values_out[scan_tuples[ITEM].flag]    = scan_tuples[ITEM].value;
+            }
+        }
+    }
+
+
+
+    /**
+     * Dequeue and scan tiles of elements
+     */
+    __device__ __forceinline__ void ProcessTiles(GridQueue<int> queue)          ///< Queue descriptor for assigning tiles of work to thread blocks
+    {
+        // We give each thread block at least one tile of input
+        int tile_idx = blockIdx.x;
+
+        // Consume full tiles of input
+        SizeT block_offset = SizeT(TILE_ITEMS) * tile_idx;
+        while (block_offset + TILE_ITEMS <= num_items)
+        {
+            ConsumeTile<true>(tile_idx, block_offset);
+
+            // Get next tile
+#if CUB_PTX_ARCH < 200
+            // No concurrent kernels allowed, so just stripe tiles
+            tile_idx += gridDim.x;
+#else
+            // Concurrent kernels are allowed, so we must only use active blocks to dequeue tile indices
+            if (threadIdx.x == 0)
+                temp_storage.tile_idx = queue.Drain(1) + gridDim.x;
+
+            __syncthreads();
+
+            tile_idx = temp_storage.tile_idx;
+#endif
+            block_offset = SizeT(TILE_ITEMS) * tile_idx;
+        }
+
+        // Consume a partially-full tile
+        if (block_offset < num_items)
+        {
+            // Consume a partially-full tile
+            int valid_items = num_items - block_offset;
+            ConsumeTile<false>(tile_idx, block_offset, valid_items);
+        }
+    }
+
+};
+
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
--- a/lib/kokkos/TPL/cub/device/block/block_reduce_tiles.cuh
+++ b/lib/kokkos/TPL/cub/device/block/block_reduce_tiles.cuh
@ -0,0 +1,375 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2013, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::BlockReduceTiles implements a stateful abstraction of CUDA thread blocks for participating in device-wide reduction.
+ */
+
+#pragma once
+
+#include <iterator>
+
+#include "../../block/block_load.cuh"
+#include "../../block/block_reduce.cuh"
+#include "../../grid/grid_mapping.cuh"
+#include "../../grid/grid_queue.cuh"
+#include "../../grid/grid_even_share.cuh"
+#include "../../util_vector.cuh"
+#include "../../util_namespace.cuh"
+
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/******************************************************************************
+ * Tuning policy types
+ ******************************************************************************/
+
+/**
+ * Compile-time tuning knobs consumed by BlockReduceTiles.
+ *
+ * The integral parameters are re-exported as enumerators and the
+ * enumeration-typed parameters as static constants, so that client code can
+ * read them as <tt>Policy::NAME</tt>.
+ */
+template <
+    int                     _BLOCK_THREADS,         ///< Threads per thread block
+    int                     _ITEMS_PER_THREAD,      ///< Items per thread per tile of input
+    int                     _VECTOR_LOAD_LENGTH,    ///< Number of items per vectorized load
+    BlockReduceAlgorithm    _BLOCK_ALGORITHM,       ///< Cooperative block-wide reduction algorithm to use
+    PtxLoadModifier         _LOAD_MODIFIER,         ///< PTX load modifier
+    GridMappingStrategy     _GRID_MAPPING>          ///< How to map tiles of input onto thread blocks
+struct BlockReduceTilesPolicy
+{
+    // Integral tuning parameters
+    enum
+    {
+        BLOCK_THREADS       = _BLOCK_THREADS,
+        ITEMS_PER_THREAD    = _ITEMS_PER_THREAD,
+        VECTOR_LOAD_LENGTH  = _VECTOR_LOAD_LENGTH
+    };
+
+    // Enumeration-typed tuning parameters (same order as the template parameter list)
+    static const BlockReduceAlgorithm   BLOCK_ALGORITHM = _BLOCK_ALGORITHM;
+    static const PtxLoadModifier        LOAD_MODIFIER   = _LOAD_MODIFIER;
+    static const GridMappingStrategy    GRID_MAPPING    = _GRID_MAPPING;
+};
+
+
+
+/******************************************************************************
+ * Thread block abstractions
+ ******************************************************************************/
+
+/**
+ * \brief BlockReduceTiles implements a stateful abstraction of CUDA thread blocks for participating in device-wide reduction.
+ *
+ * Each thread reduces only the values it loads.  For the first tile consumed
+ * by the thread block (i.e. while \p first_tile_size is still zero) this
+ * partial reduction is stored into \p thread_aggregate.  Otherwise it is
+ * accumulated into \p thread_aggregate.  The per-thread aggregates are
+ * combined with a single block-wide reduction at the end of ConsumeTiles.
+ */
+template <
+    typename BlockReduceTilesPolicy,        ///< Tuning policy
+    typename InputIteratorRA,               ///< Random-access input iterator type
+    typename SizeT,                         ///< Offset integer type
+    typename ReductionOp>                   ///< Binary reduction functor type
+struct BlockReduceTiles
+{
+
+    //---------------------------------------------------------------------
+    // Types and constants
+    //---------------------------------------------------------------------
+
+    typedef typename std::iterator_traits<InputIteratorRA>::value_type  T;              // Type of input iterator
+    typedef VectorHelper<T, BlockReduceTilesPolicy::VECTOR_LOAD_LENGTH> VecHelper;      // Helper type for vectorizing loads of T
+    typedef typename VecHelper::Type                                    VectorT;        // Vector of T
+
+    // Constants
+    enum
+    {
+        BLOCK_THREADS       = BlockReduceTilesPolicy::BLOCK_THREADS,
+        ITEMS_PER_THREAD    = BlockReduceTilesPolicy::ITEMS_PER_THREAD,
+        TILE_ITEMS          = BLOCK_THREADS * ITEMS_PER_THREAD,
+        VECTOR_LOAD_LENGTH  = BlockReduceTilesPolicy::VECTOR_LOAD_LENGTH,
+
+        // Can vectorize according to the policy if the input iterator is a native pointer to a built-in primitive
+        CAN_VECTORIZE       = (BlockReduceTilesPolicy::VECTOR_LOAD_LENGTH > 1) &&
+                                (IsPointer<InputIteratorRA>::VALUE) &&
+                                (VecHelper::BUILT_IN),
+
+    };
+
+    static const PtxLoadModifier      LOAD_MODIFIER   = BlockReduceTilesPolicy::LOAD_MODIFIER;
+    static const BlockReduceAlgorithm BLOCK_ALGORITHM = BlockReduceTilesPolicy::BLOCK_ALGORITHM;
+
+    // Parameterized BlockReduce primitive
+    typedef BlockReduce<T, BLOCK_THREADS, BlockReduceTilesPolicy::BLOCK_ALGORITHM> BlockReduceT;
+
+    /// Shared memory type required by this thread block
+    typedef typename BlockReduceT::TempStorage _TempStorage;
+
+    /// Alias wrapper allowing storage to be unioned
+    struct TempStorage : Uninitialized<_TempStorage> {};
+
+
+    //---------------------------------------------------------------------
+    // Per-thread fields
+    //---------------------------------------------------------------------
+
+    T                       thread_aggregate;   ///< Each thread's partial reduction
+    _TempStorage&           temp_storage;       ///< Reference to temp_storage
+    InputIteratorRA         d_in;               ///< Input data to reduce
+    ReductionOp             reduction_op;       ///< Binary reduction operator
+    int                     first_tile_size;    ///< Size of first tile consumed (zero until a tile has been consumed)
+    bool                    input_aligned;      ///< Whether or not input is vector-aligned
+
+
+    //---------------------------------------------------------------------
+    // Interface
+    //---------------------------------------------------------------------
+
+    /**
+     * Constructor.  \p thread_aggregate is intentionally left uninitialized;
+     * it is assigned by the first call to ConsumeTile.
+     */
+    __device__ __forceinline__ BlockReduceTiles(
+        TempStorage&            temp_storage,       ///< Reference to temp_storage
+        InputIteratorRA         d_in,               ///< Input data to reduce
+        ReductionOp             reduction_op)       ///< Binary reduction operator
+    :
+        temp_storage(temp_storage.Alias()),
+        d_in(d_in),
+        reduction_op(reduction_op),
+        first_tile_size(0),
+        input_aligned(CAN_VECTORIZE && ((size_t(d_in) & (sizeof(VectorT) - 1)) == 0))
+    {}
+
+
+    /**
+     * Process a single tile of input.
+     *
+     * \tparam FULL_TILE  Whether the tile holds TILE_ITEMS valid items.  Full
+     *                    tiles are loaded either with vector loads (when
+     *                    \p input_aligned) or with LoadStriped; partial tiles
+     *                    are loaded item by item in steps of BLOCK_THREADS,
+     *                    guarded by \p valid_items.
+     */
+    template <bool FULL_TILE>
+    __device__ __forceinline__ void ConsumeTile(
+        SizeT   block_offset,                   ///< The offset of the tile to consume
+        int     valid_items = TILE_ITEMS)       ///< The number of valid items in the tile
+    {
+        if (FULL_TILE)
+        {
+            T stripe_partial;
+
+            // Load full tile
+            if (input_aligned)
+            {
+                // Alias items as an array of VectorT and load it in striped fashion
+                enum { WORDS =  ITEMS_PER_THREAD / VECTOR_LOAD_LENGTH };
+
+                VectorT vec_items[WORDS];
+
+                // Load striped into vec items
+                VectorT* alias_ptr = reinterpret_cast<VectorT*>(d_in + block_offset + (threadIdx.x * VECTOR_LOAD_LENGTH));
+
+                #pragma unroll
+                for (int i = 0; i < WORDS; ++i)
+                    vec_items[i] = alias_ptr[BLOCK_THREADS * i];
+
+                // Reduce items within each thread stripe
+                stripe_partial = ThreadReduce<ITEMS_PER_THREAD>(
+                    reinterpret_cast<T*>(vec_items),
+                    reduction_op);
+            }
+            else
+            {
+                T items[ITEMS_PER_THREAD];
+
+                // Load items in striped fashion
+                LoadStriped<LOAD_MODIFIER, BLOCK_THREADS>(threadIdx.x, d_in + block_offset, items);
+
+                // Reduce items within each thread stripe
+                stripe_partial = ThreadReduce(items, reduction_op);
+            }
+
+            // Update running thread aggregate
+            thread_aggregate = (first_tile_size) ?
+                reduction_op(thread_aggregate, stripe_partial) :       // Update
+                stripe_partial;                                        // Assign
+        }
+        else
+        {
+
+            // Partial tile
+            int thread_offset = threadIdx.x;
+
+            if (!first_tile_size && (thread_offset < valid_items))
+            {
+                // Assign thread_aggregate (threads at or beyond valid_items keep it
+                // unassigned; ConsumeTiles passes first_tile_size as the valid count
+                // to the block-wide reduction in that case)
+                thread_aggregate = ThreadLoad<LOAD_MODIFIER>(d_in + block_offset + thread_offset);
+                thread_offset += BLOCK_THREADS;
+            }
+
+            while (thread_offset < valid_items)
+            {
+                // Update thread aggregate
+                T item = ThreadLoad<LOAD_MODIFIER>(d_in + block_offset + thread_offset);
+                thread_aggregate = reduction_op(thread_aggregate, item);
+                thread_offset += BLOCK_THREADS;
+            }
+        }
+
+        // Set first tile size if necessary
+        if (!first_tile_size)
+            first_tile_size = valid_items;
+    }
+
+
+    //---------------------------------------------------------------------
+    // Consume a contiguous segment of tiles
+    //---------------------------------------------------------------------
+
+    /**
+     * \brief Reduce a contiguous segment of input tiles
+     *
+     * Consumes all full tiles in [\p block_offset, \p block_oob), then an
+     * optional trailing partial tile, and finally reduces the per-thread
+     * aggregates across the thread block.
+     */
+    __device__ __forceinline__ void ConsumeTiles(
+        SizeT   block_offset,                       ///< [in] Threadblock begin offset (inclusive)
+        SizeT   block_oob,                          ///< [in] Threadblock end offset (exclusive)
+        T       &block_aggregate)                   ///< [out] Running total
+    {
+        // Consume subsequent full tiles of input
+        while (block_offset + TILE_ITEMS <= block_oob)
+        {
+            ConsumeTile<true>(block_offset);
+            block_offset += TILE_ITEMS;
+        }
+
+        // Consume a partially-full tile
+        if (block_offset < block_oob)
+        {
+            int valid_items = block_oob - block_offset;
+            ConsumeTile<false>(block_offset, valid_items);
+        }
+
+        // Compute block-wide reduction (only the first first_tile_size threads
+        // hold a valid aggregate when the first tile was a partial one)
+        block_aggregate = (first_tile_size < TILE_ITEMS) ?
+            BlockReduceT(temp_storage).Reduce(thread_aggregate, reduction_op, first_tile_size) :
+            BlockReduceT(temp_storage).Reduce(thread_aggregate, reduction_op);
+    }
+
+
+    /**
+     * Reduce a contiguous segment of input tiles (even-share mapping).
+     *
+     * \p num_items and \p queue are unused by this overload; the segment is
+     * taken from \p even_share after BlockInit().
+     */
+    __device__ __forceinline__ void ConsumeTiles(
+        SizeT                               num_items,          ///< [in] Total number of global input items
+        GridEvenShare<SizeT>                &even_share,        ///< [in] GridEvenShare descriptor
+        GridQueue<SizeT>                    &queue,             ///< [in,out] GridQueue descriptor
+        T                                   &block_aggregate,   ///< [out] Running total
+        Int2Type<GRID_MAPPING_EVEN_SHARE>   is_even_share)      ///< [in] Marker type indicating this is an even-share mapping
+    {
+        // Initialize even-share descriptor for this thread block
+        even_share.BlockInit();
+
+        // Consume input tiles
+        ConsumeTiles(even_share.block_offset, even_share.block_oob, block_aggregate);
+    }
+
+
+    //---------------------------------------------------------------------
+    // Dynamically consume tiles
+    //---------------------------------------------------------------------
+
+    /**
+     * Dequeue and reduce tiles of items as part of an inter-block reduction.
+     *
+     * Every thread block first consumes the tile at blockIdx.x * TILE_ITEMS and
+     * then keeps draining TILE_ITEMS-sized chunks from \p queue (offset by
+     * gridDim.x * TILE_ITEMS) until the dequeued offset no longer denotes a
+     * full tile.  A trailing partial tile, if any, is consumed last.
+     *
+     * \p num_items is taken as SizeT (rather than int) so that problem sizes
+     * passed by the dispatching overload are not truncated when SizeT is
+     * wider than int.
+     */
+    __device__ __forceinline__ void ConsumeTiles(
+        SizeT               num_items,          ///< Total number of input items
+        GridQueue<SizeT>    queue,              ///< Queue descriptor for assigning tiles of work to thread blocks
+        T                   &block_aggregate)   ///< [out] Running total
+    {
+        // Shared dequeue offset
+        __shared__ SizeT dequeue_offset;
+
+        // We give each thread block at least one tile of input.
+        // Widen to SizeT before multiplying: blockIdx.x / gridDim.x are 32-bit
+        // unsigned, so the product would otherwise wrap before being stored.
+        SizeT block_offset = SizeT(blockIdx.x) * TILE_ITEMS;
+        SizeT even_share_base = SizeT(gridDim.x) * TILE_ITEMS;
+
+        if (block_offset + TILE_ITEMS <= num_items)
+        {
+            // Consume full tile of input
+            ConsumeTile<true>(block_offset);
+
+            // Dequeue more tiles
+            while (true)
+            {
+                 // Dequeue a tile of items
+                if (threadIdx.x == 0)
+                    dequeue_offset = queue.Drain(TILE_ITEMS) + even_share_base;
+
+                __syncthreads();
+
+                // Grab tile offset and check if we're done with full tiles
+                block_offset = dequeue_offset;
+
+                // Keep thread 0 from overwriting dequeue_offset before every thread has read it
+                __syncthreads();
+
+                if (block_offset + TILE_ITEMS > num_items)
+                    break;
+
+                // Consume a full tile
+                ConsumeTile<true>(block_offset);
+            }
+        }
+
+        if (block_offset < num_items)
+        {
+            // Fewer than TILE_ITEMS items remain, so the count fits in an int
+            int valid_items = int(num_items - block_offset);
+            ConsumeTile<false>(block_offset, valid_items);
+        }
+
+        // Compute block-wide reduction
+        block_aggregate = (first_tile_size < TILE_ITEMS) ?
+            BlockReduceT(temp_storage).Reduce(thread_aggregate, reduction_op, first_tile_size) :
+            BlockReduceT(temp_storage).Reduce(thread_aggregate, reduction_op);
+    }
+
+
+    /**
+     * Dequeue and reduce tiles of items as part of an inter-block reduction
+     * (dynamic mapping).  \p even_share is unused by this overload.
+     */
+    __device__ __forceinline__ void ConsumeTiles(
+        SizeT                               num_items,          ///< [in] Total number of global input items
+        GridEvenShare<SizeT>                &even_share,        ///< [in] GridEvenShare descriptor
+        GridQueue<SizeT>                    &queue,             ///< [in,out] GridQueue descriptor
+        T                                   &block_aggregate,   ///< [out] Running total
+        Int2Type<GRID_MAPPING_DYNAMIC>      is_dynamic)         ///< [in] Marker type indicating this is a dynamic mapping
+    {
+        ConsumeTiles(num_items, queue, block_aggregate);
+    }
+
+};
+
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
--- a/lib/kokkos/TPL/cub/device/block/block_scan_tiles.cuh
+++ b/lib/kokkos/TPL/cub/device/block/block_scan_tiles.cuh
@ -0,0 +1,509 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2013, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::BlockScanTiles implements a stateful abstraction of CUDA thread blocks for participating in device-wide prefix scan.
+ */
+
+#pragma once
+
+#include <iterator>
+
+#include "scan_tiles_types.cuh"
+#include "../../block/block_load.cuh"
+#include "../../block/block_store.cuh"
+#include "../../block/block_scan.cuh"
+#include "../../grid/grid_queue.cuh"
+#include "../../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/******************************************************************************
+ * Tuning policy types
+ ******************************************************************************/
+
+/**
+ * Compile-time tuning knobs consumed by BlockScanTiles.
+ *
+ * Integral and boolean parameters are re-exported as enumerators, the
+ * enumeration-typed parameters as static constants, so that client code can
+ * read them as <tt>Policy::NAME</tt>.
+ */
+template <
+    int                         _BLOCK_THREADS,             ///< Threads per thread block
+    int                         _ITEMS_PER_THREAD,          ///< Items per thread per tile of input
+    BlockLoadAlgorithm          _LOAD_ALGORITHM,            ///< Block load algorithm
+    bool                        _LOAD_WARP_TIME_SLICING,    ///< Warp time-slicing setting forwarded to BlockLoad
+    PtxLoadModifier             _LOAD_MODIFIER,             ///< PTX load modifier
+    BlockStoreAlgorithm         _STORE_ALGORITHM,           ///< Block store algorithm
+    bool                        _STORE_WARP_TIME_SLICING,   ///< Warp time-slicing setting forwarded to BlockStore
+    BlockScanAlgorithm          _SCAN_ALGORITHM>            ///< Block scan algorithm
+struct BlockScanTilesPolicy
+{
+    // Integral and boolean tuning parameters
+    enum
+    {
+        BLOCK_THREADS           = _BLOCK_THREADS,
+        ITEMS_PER_THREAD        = _ITEMS_PER_THREAD,
+        LOAD_WARP_TIME_SLICING  = _LOAD_WARP_TIME_SLICING,
+        STORE_WARP_TIME_SLICING = _STORE_WARP_TIME_SLICING
+    };
+
+    // Enumeration-typed tuning parameters
+    static const BlockLoadAlgorithm     LOAD_ALGORITHM  = _LOAD_ALGORITHM;
+    static const PtxLoadModifier        LOAD_MODIFIER   = _LOAD_MODIFIER;
+    static const BlockStoreAlgorithm    STORE_ALGORITHM = _STORE_ALGORITHM;
+    static const BlockScanAlgorithm     SCAN_ALGORITHM  = _SCAN_ALGORITHM;
+};
+
+
+/******************************************************************************
+ * Thread block abstractions
+ ******************************************************************************/
+
+/**
+ * \brief BlockScanTiles implements a stateful abstraction of CUDA thread blocks for participating in device-wide prefix scan.
+ *
+ * Implements a single-pass "domino" strategy with adaptive prefix lookback.
+ */
+template <
+    typename BlockScanTilesPolicy,     ///< Tuning policy
+    typename InputIteratorRA,               ///< Input iterator type
+    typename OutputIteratorRA,              ///< Output iterator type
+    typename ScanOp,                        ///< Scan functor type
+    typename Identity,                      ///< Identity element type (cub::NullType for inclusive scan)
+    typename SizeT>                         ///< Offset integer type
+struct BlockScanTiles
+{
+    //---------------------------------------------------------------------
+    // Types and constants
+    //---------------------------------------------------------------------
+
+    // Data type of input iterator
+    typedef typename std::iterator_traits<InputIteratorRA>::value_type T;
+
+    // Constants
+    enum
+    {
+        INCLUSIVE           = Equals<Identity, NullType>::VALUE,            // Inclusive scan if no identity type is provided
+        BLOCK_THREADS       = BlockScanTilesPolicy::BLOCK_THREADS,
+        ITEMS_PER_THREAD    = BlockScanTilesPolicy::ITEMS_PER_THREAD,
+        TILE_ITEMS          = BLOCK_THREADS * ITEMS_PER_THREAD,
+    };
+
+    // Block load type
+    typedef BlockLoad<
+        InputIteratorRA,
+        BlockScanTilesPolicy::BLOCK_THREADS,
+        BlockScanTilesPolicy::ITEMS_PER_THREAD,
+        BlockScanTilesPolicy::LOAD_ALGORITHM,
+        BlockScanTilesPolicy::LOAD_MODIFIER,
+        BlockScanTilesPolicy::LOAD_WARP_TIME_SLICING>   BlockLoadT;
+
+    // Block store type
+    typedef BlockStore<
+        OutputIteratorRA,
+        BlockScanTilesPolicy::BLOCK_THREADS,
+        BlockScanTilesPolicy::ITEMS_PER_THREAD,
+        BlockScanTilesPolicy::STORE_ALGORITHM,
+        STORE_DEFAULT,
+        BlockScanTilesPolicy::STORE_WARP_TIME_SLICING>  BlockStoreT;
+
+    // Tile status descriptor type
+    typedef ScanTileDescriptor<T>                 ScanTileDescriptorT;
+
+    // Block scan type
+    typedef BlockScan<
+        T,
+        BlockScanTilesPolicy::BLOCK_THREADS,
+        BlockScanTilesPolicy::SCAN_ALGORITHM> BlockScanT;
+
+    // Callback type for obtaining inter-tile prefix during block scan
+    typedef DeviceScanBlockPrefixOp<T, ScanOp> InterblockPrefixOp;
+
+    // Shared memory type for this threadblock.
+    //
+    // The load, store and (prefix + scan) scratch areas overlay one another in
+    // a union, so a block-wide barrier is required whenever execution moves
+    // from one of those phases to the next.  tile_idx lives outside the union;
+    // the domino ConsumeTiles() uses it to hand the tile index dequeued by
+    // thread 0 to the rest of the block.
+    struct _TempStorage
+    {
+        union
+        {
+            typename BlockLoadT::TempStorage            load;               // Smem needed for tile loading
+            typename BlockStoreT::TempStorage           store;              // Smem needed for tile storing
+            struct
+            {
+                typename InterblockPrefixOp::TempStorage    prefix;         // Smem needed for cooperative prefix callback
+                typename BlockScanT::TempStorage            scan;           // Smem needed for tile scanning
+            };
+        };
+
+        SizeT                                           tile_idx;           // Shared tile index
+    };
+
+    // Alias wrapper allowing storage to be unioned
+    struct TempStorage : Uninitialized<_TempStorage> {};
+
+
+    //---------------------------------------------------------------------
+    // Per-thread fields
+    //---------------------------------------------------------------------
+
+    _TempStorage                &temp_storage;      ///< Reference to temp_storage
+    InputIteratorRA             d_in;               ///< Input data
+    OutputIteratorRA            d_out;              ///< Output data
+    ScanOp                      scan_op;            ///< Binary scan operator
+    Identity                    identity;           ///< Identity element
+
+
+
+    //---------------------------------------------------------------------
+    // Block scan utility methods (first tile)
+    //---------------------------------------------------------------------
+
+    /**
+     * First-tile exclusive scan with a generic operator.  Scans \p items in
+     * place, seeded with \p identity; \p block_aggregate receives the
+     * aggregate reported by BlockScanT.
+     */
+    template <typename _ScanOp, typename _Identity>
+    __device__ __forceinline__
+    void ScanBlock(T (&items)[ITEMS_PER_THREAD], _ScanOp scan_op, _Identity identity, T& block_aggregate)
+    {
+        BlockScanT block_scan(temp_storage.scan);
+        block_scan.ExclusiveScan(items, items, identity, scan_op, block_aggregate);
+    }
+
+    /**
+     * First-tile exclusive scan, selected when the operator is Sum.  The
+     * operator and identity arguments only steer overload resolution; the
+     * dedicated ExclusiveSum entry point is used instead.
+     */
+    template <typename _Identity>
+    __device__ __forceinline__
+    void ScanBlock(T (&items)[ITEMS_PER_THREAD], Sum scan_op, _Identity identity, T& block_aggregate)
+    {
+        BlockScanT block_scan(temp_storage.scan);
+        block_scan.ExclusiveSum(items, items, block_aggregate);
+    }
+
+    /**
+     * First-tile inclusive scan with a generic operator, selected when the
+     * identity type is NullType (i.e., no identity was supplied).
+     */
+    template <typename _ScanOp>
+    __device__ __forceinline__
+    void ScanBlock(T (&items)[ITEMS_PER_THREAD], _ScanOp scan_op, NullType identity, T& block_aggregate)
+    {
+        BlockScanT block_scan(temp_storage.scan);
+        block_scan.InclusiveScan(items, items, scan_op, block_aggregate);
+    }
+
+    /**
+     * First-tile inclusive scan, selected when the operator is Sum and the
+     * identity type is NullType.
+     */
+    __device__ __forceinline__
+    void ScanBlock(T (&items)[ITEMS_PER_THREAD], Sum scan_op, NullType identity, T& block_aggregate)
+    {
+        BlockScanT block_scan(temp_storage.scan);
+        block_scan.InclusiveSum(items, items, block_aggregate);
+    }
+
+    //---------------------------------------------------------------------
+    // Block scan utility methods (subsequent tiles)
+    //---------------------------------------------------------------------
+
+    /**
+     * Exclusive scan with a generic operator for a tile that has
+     * predecessors.  \p prefix_op is forwarded to BlockScanT as the callback
+     * that supplies the prefix to apply to this tile.
+     */
+    template <typename _ScanOp, typename _Identity, typename PrefixCallback>
+    __device__ __forceinline__
+    void ScanBlock(T (&items)[ITEMS_PER_THREAD], _ScanOp scan_op, _Identity identity, T& block_aggregate, PrefixCallback &prefix_op)
+    {
+        BlockScanT block_scan(temp_storage.scan);
+        block_scan.ExclusiveScan(items, items, identity, scan_op, block_aggregate, prefix_op);
+    }
+
+    /**
+     * Exclusive scan for a tile that has predecessors, selected when the
+     * operator is Sum (uses the dedicated ExclusiveSum entry point).
+     */
+    template <typename _Identity, typename PrefixCallback>
+    __device__ __forceinline__
+    void ScanBlock(T (&items)[ITEMS_PER_THREAD], Sum scan_op, _Identity identity, T& block_aggregate, PrefixCallback &prefix_op)
+    {
+        BlockScanT block_scan(temp_storage.scan);
+        block_scan.ExclusiveSum(items, items, block_aggregate, prefix_op);
+    }
+
+    /**
+     * Inclusive scan with a generic operator for a tile that has
+     * predecessors, selected when the identity type is NullType.
+     */
+    template <typename _ScanOp, typename PrefixCallback>
+    __device__ __forceinline__
+    void ScanBlock(T (&items)[ITEMS_PER_THREAD], _ScanOp scan_op, NullType identity, T& block_aggregate, PrefixCallback &prefix_op)
+    {
+        BlockScanT block_scan(temp_storage.scan);
+        block_scan.InclusiveScan(items, items, scan_op, block_aggregate, prefix_op);
+    }
+
+    /**
+     * Inclusive scan for a tile that has predecessors, selected when the
+     * operator is Sum and the identity type is NullType.
+     */
+    template <typename PrefixCallback>
+    __device__ __forceinline__
+    void ScanBlock(T (&items)[ITEMS_PER_THREAD], Sum scan_op, NullType identity, T& block_aggregate, PrefixCallback &prefix_op)
+    {
+        BlockScanT block_scan(temp_storage.scan);
+        block_scan.InclusiveSum(items, items, block_aggregate, prefix_op);
+    }
+
+    //---------------------------------------------------------------------
+    // Constructor
+    //---------------------------------------------------------------------
+
+    /**
+     * Constructor.  Binds this instance to the block's temporary storage
+     * (through its Alias()) and captures the input/output iterators, the scan
+     * operator and the identity element by value.
+     */
+    __device__ __forceinline__
+    BlockScanTiles(
+        TempStorage         &temp_storage,  ///< Reference to temp_storage
+        InputIteratorRA     d_in,           ///< Input data
+        OutputIteratorRA    d_out,          ///< Output data
+        ScanOp              scan_op,        ///< Binary scan operator
+        Identity            identity)       ///< Identity element
+    :   temp_storage(temp_storage.Alias()),
+        d_in(d_in),
+        d_out(d_out),
+        scan_op(scan_op),
+        identity(identity)
+    {
+    }
+
+
+    //---------------------------------------------------------------------
+    // Domino scan
+    //---------------------------------------------------------------------
+
+    /**
+     * Load, scan and store one tile as part of a domino scan.
+     *
+     * Tile 0 is scanned without a predecessor prefix; when it is a full tile,
+     * thread 0 publishes its aggregate as the prefix in \p d_tile_status.
+     * Every other tile obtains its prefix through an InterblockPrefixOp
+     * callback built over \p d_tile_status.  Block-wide barriers separate the
+     * load, scan and store phases because their scratch storage is unioned.
+     */
+    template <bool FULL_TILE>
+    __device__ __forceinline__ void ConsumeTile(
+        SizeT                 num_items,          ///< Total number of input items
+        int                   tile_idx,           ///< Tile index
+        SizeT                 block_offset,       ///< Tile offset
+        ScanTileDescriptorT   *d_tile_status)     ///< Global list of tile status
+    {
+        T thread_items[ITEMS_PER_THREAD];
+
+        // Load phase (a partial tile is guarded by the number of items left)
+        BlockLoadT tile_loader(temp_storage.load);
+        if (!FULL_TILE)
+            tile_loader.Load(d_in + block_offset, thread_items, num_items - block_offset);
+        else
+            tile_loader.Load(d_in + block_offset, thread_items);
+
+        __syncthreads();
+
+        // Scan phase
+        T tile_aggregate;
+        if (tile_idx != 0)
+        {
+            // Prefix comes from predecessor tiles via the status array
+            InterblockPrefixOp predecessor_prefix(d_tile_status, temp_storage.prefix, scan_op, tile_idx);
+            ScanBlock(thread_items, scan_op, identity, tile_aggregate, predecessor_prefix);
+        }
+        else
+        {
+            ScanBlock(thread_items, scan_op, identity, tile_aggregate);
+
+            // Update tile status if there are successor tiles
+            if (FULL_TILE && (threadIdx.x == 0))
+                ScanTileDescriptorT::SetPrefix(d_tile_status, tile_aggregate);
+        }
+
+        __syncthreads();
+
+        // Store phase
+        BlockStoreT tile_writer(temp_storage.store);
+        if (!FULL_TILE)
+            tile_writer.Store(d_out + block_offset, thread_items, num_items - block_offset);
+        else
+            tile_writer.Store(d_out + block_offset, thread_items);
+    }
+
+    /**
+     * Dequeue and scan tiles of items as part of a domino scan.
+     *
+     * When CUB_PTX_ARCH < 200 each thread block handles exactly the tile whose
+     * index equals blockIdx.x.  Otherwise the block repeatedly obtains a tile
+     * index from \p queue (thread 0 calls queue.Drain(1) and shares the result
+     * through temp_storage.tile_idx) and keeps consuming full tiles until the
+     * index it receives no longer denotes a full tile; a trailing partial
+     * tile, if that index denotes one, is consumed last.
+     *
+     * NOTE(review): \p num_items is declared int here while ConsumeTile()
+     * takes SizeT, and temp_storage.tile_idx (SizeT) is read back into an
+     * int -- confirm these widths agree with the callers' problem sizes.
+     */
+    __device__ __forceinline__ void ConsumeTiles(
+        int                   num_items,          ///< Total number of input items
+        GridQueue<int>        queue,              ///< Queue descriptor for assigning tiles of work to thread blocks
+        ScanTileDescriptorT   *d_tile_status)     ///< Global list of tile status
+    {
+#if CUB_PTX_ARCH < 200
+
+        // No concurrent kernels allowed and blocks are launched in increasing order, so just assign one tile per block (up to 65K blocks)
+        int     tile_idx        = blockIdx.x;
+        SizeT   block_offset    = SizeT(TILE_ITEMS) * tile_idx;
+
+        // Full tile, partial tile, or nothing at all if the block lies past the end of the input
+        if (block_offset + TILE_ITEMS <= num_items)
+            ConsumeTile<true>(num_items, tile_idx, block_offset, d_tile_status);
+        else if (block_offset < num_items)
+            ConsumeTile<false>(num_items, tile_idx, block_offset, d_tile_status);
+
+#else
+
+        // Get first tile (only thread 0 touches the queue)
+        if (threadIdx.x == 0)
+            temp_storage.tile_idx = queue.Drain(1);
+
+        // Make the dequeued index visible to every thread in the block
+        __syncthreads();
+
+        int tile_idx = temp_storage.tile_idx;
+        SizeT block_offset = SizeT(TILE_ITEMS) * tile_idx;
+
+        while (block_offset + TILE_ITEMS <= num_items)
+        {
+            // Consume full tile
+            ConsumeTile<true>(num_items, tile_idx, block_offset, d_tile_status);
+
+            // Get next tile
+            if (threadIdx.x == 0)
+                temp_storage.tile_idx = queue.Drain(1);
+
+            // Publishes the new index and also separates the store that ended
+            // the tile above from the load that begins the next one
+            __syncthreads();
+
+            tile_idx = temp_storage.tile_idx;
+            block_offset = SizeT(TILE_ITEMS) * tile_idx;
+        }
+
+        // Consume a partially-full tile
+        if (block_offset < num_items)
+        {
+            ConsumeTile<false>(num_items, tile_idx, block_offset, d_tile_status);
+        }
+#endif
+
+    }
+
+
+    //---------------------------------------------------------------------
+    // Even-share scan
+    //---------------------------------------------------------------------
+
+    /**
+     * Load, scan and store one tile as part of an even-share scan.
+     *
+     * \tparam FULL_TILE   Whether the tile holds TILE_ITEMS items (unguarded
+     *                     load/store) or only \p valid_items items (guarded)
+     * \tparam FIRST_TILE  Whether this is the first tile of the block's share.
+     *                     The first tile is scanned without a prefix and its
+     *                     aggregate seeds \p prefix_op.running_total; later
+     *                     tiles pass \p prefix_op to the scan as the prefix
+     *                     callback.
+     *
+     * Must be called by every thread of the block: it contains block-wide
+     * barriers.
+     */
+    template <
+        bool FULL_TILE,
+        bool FIRST_TILE>
+    __device__ __forceinline__ void ConsumeTile(
+        SizeT                   block_offset,               ///< Tile offset
+        RunningBlockPrefixOp<T> &prefix_op,                 ///< Running prefix operator
+        int                     valid_items = TILE_ITEMS)   ///< Number of valid items in the tile
+    {
+        // The load scratch below overlays the store scratch used at the end of
+        // the previous tile (they share a union).  Without a barrier a fast
+        // thread could start overwriting that storage while a slower thread
+        // is still reading it for its store.  FIRST_TILE is a compile-time
+        // constant, so all threads agree on whether this barrier is executed.
+        // (It is redundant, but harmless, when a non-first tile happens to be
+        // the first one a block processes.)
+        if (!FIRST_TILE)
+            __syncthreads();
+
+        // Load items
+        T items[ITEMS_PER_THREAD];
+
+        if (FULL_TILE)
+            BlockLoadT(temp_storage.load).Load(d_in + block_offset, items);
+        else
+            BlockLoadT(temp_storage.load).Load(d_in + block_offset, items, valid_items);
+
+        // Load scratch is about to be reused as scan scratch
+        __syncthreads();
+
+        // Block scan
+        T block_aggregate;
+        if (FIRST_TILE)
+        {
+            ScanBlock(items, scan_op, identity, block_aggregate);
+            prefix_op.running_total = block_aggregate;
+        }
+        else
+        {
+            ScanBlock(items, scan_op, identity, block_aggregate, prefix_op);
+        }
+
+        // Scan scratch is about to be reused as store scratch
+        __syncthreads();
+
+        // Store items
+        if (FULL_TILE)
+            BlockStoreT(temp_storage.store).Store(d_out + block_offset, items);
+        else
+            BlockStoreT(temp_storage.store).Store(d_out + block_offset, items, valid_items);
+    }
+
+
+    /**
+     * Scan the consecutive tiles in [\p block_offset, \p block_oob) with no
+     * incoming prefix.  The first tile seeds the running prefix that is then
+     * carried through the remaining tiles of the share.
+     */
+    __device__ __forceinline__ void ConsumeTiles(
+        SizeT   block_offset,                       ///< [in] Threadblock begin offset (inclusive)
+        SizeT   block_oob)                          ///< [in] Threadblock end offset (exclusive)
+    {
+        RunningBlockPrefixOp<T> running_prefix;
+
+        // Share is shorter than one tile: the first tile is also the only one
+        if (block_offset + TILE_ITEMS > block_oob)
+        {
+            int remaining = block_oob - block_offset;
+            ConsumeTile<false, true>(block_offset, running_prefix, remaining);
+            return;
+        }
+
+        // First tile (full)
+        ConsumeTile<true, true>(block_offset, running_prefix);
+
+        // Subsequent full tiles
+        for (block_offset += TILE_ITEMS; block_offset + TILE_ITEMS <= block_oob; block_offset += TILE_ITEMS)
+        {
+            ConsumeTile<true, false>(block_offset, running_prefix);
+        }
+
+        // Trailing partially-full tile, if any
+        if (block_offset < block_oob)
+        {
+            int remaining = block_oob - block_offset;
+            ConsumeTile<false, false>(block_offset, running_prefix, remaining);
+        }
+    }
+
+
+    /**
+     * Scan the consecutive tiles in [\p block_offset, \p block_oob), starting
+     * the running prefix at \p prefix.  Because a prefix is supplied, no tile
+     * is treated as a first tile.
+     */
+    __device__ __forceinline__ void ConsumeTiles(
+        SizeT   block_offset,                       ///< [in] Threadblock begin offset (inclusive)
+        SizeT   block_oob,                          ///< [in] Threadblock end offset (exclusive)
+        T       prefix)                             ///< [in] The prefix to apply to the scan segment
+    {
+        RunningBlockPrefixOp<T> running_prefix;
+        running_prefix.running_total = prefix;
+
+        // Full tiles
+        for (; block_offset + TILE_ITEMS <= block_oob; block_offset += TILE_ITEMS)
+        {
+            ConsumeTile<true, false>(block_offset, running_prefix);
+        }
+
+        // Trailing partially-full tile, if any
+        if (block_offset < block_oob)
+        {
+            int remaining = block_oob - block_offset;
+            ConsumeTile<false, false>(block_offset, running_prefix, remaining);
+        }
+    }
+
+};
+
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
--- a/lib/kokkos/TPL/cub/device/block/scan_tiles_types.cuh
+++ b/lib/kokkos/TPL/cub/device/block/scan_tiles_types.cuh
@ -0,0 +1,318 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2013, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * Utility types for device-wide scan
+ */
+
+#pragma once
+
+#include <iterator>
+
+#include "../../thread/thread_load.cuh"
+#include "../../thread/thread_store.cuh"
+#include "../../warp/warp_reduce.cuh"
+#include "../../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/**
+ * Enumerations of tile status.
+ *
+ * A tile descriptor's status tells readers which value (if any) it currently
+ * carries: SetPartial() publishes SCAN_TILE_PARTIAL, SetPrefix() publishes
+ * SCAN_TILE_PREFIX, and WaitForValid() spins for as long as it observes
+ * SCAN_TILE_INVALID.
+ */
+enum ScanTileStatus
+{
+    SCAN_TILE_OOB,          // Out-of-bounds (e.g., padding)
+    SCAN_TILE_INVALID,      // Not yet processed
+    SCAN_TILE_PARTIAL,      // Tile aggregate is available
+    SCAN_TILE_PREFIX,       // Inclusive tile prefix is available
+};
+
+
+/**
+ * Data type of tile status descriptor.
+ *
+ * Specialized for scan status and value types that can be combined into the same
+ * machine word that can be read/written coherently in a single access.  The
+ * value and a status word of the same width as T are moved to and from global
+ * memory together as one VectorWord.
+ */
+template <
+    typename    T,
+    bool        SINGLE_WORD = (PowerOfTwo<sizeof(T)>::VALUE && (sizeof(T) <= 8))>
+struct ScanTileDescriptor
+{
+    // Status word type (an integer with the same size as T)
+    typedef typename If<(sizeof(T) == 8),
+        long long,
+        typename If<(sizeof(T) == 4),
+            int,
+            typename If<(sizeof(T) == 2),
+                short,
+                char>::Type>::Type>::Type StatusWord;
+
+    // Vector word type (wide enough to carry the value and the status word together)
+    typedef typename If<(sizeof(T) == 8),
+        longlong2,
+        typename If<(sizeof(T) == 4),
+            int2,
+            typename If<(sizeof(T) == 2),
+                int,
+                short>::Type>::Type>::Type VectorWord;
+
+    T           value;
+    StatusWord  status;
+
+    /// Pack a (status, value) pair into a VectorWord and write it to \p ptr with one STORE_CG store
+    static __device__ __forceinline__ void Publish(ScanTileDescriptor *ptr, StatusWord new_status, T new_value)
+    {
+        ScanTileDescriptor packed;
+        packed.status = new_status;
+        packed.value = new_value;
+
+        VectorWord word;
+        *reinterpret_cast<ScanTileDescriptor*>(&word) = packed;
+        ThreadStore<STORE_CG>(reinterpret_cast<VectorWord*>(ptr), word);
+    }
+
+    /// Publish \p prefix with status SCAN_TILE_PREFIX
+    static __device__ __forceinline__ void SetPrefix(ScanTileDescriptor *ptr, T prefix)
+    {
+        Publish(ptr, SCAN_TILE_PREFIX, prefix);
+    }
+
+    /// Publish \p partial with status SCAN_TILE_PARTIAL
+    static __device__ __forceinline__ void SetPartial(ScanTileDescriptor *ptr, T partial)
+    {
+        Publish(ptr, SCAN_TILE_PARTIAL, partial);
+    }
+
+    /// Spin until the descriptor at \p ptr is no longer SCAN_TILE_INVALID, then return its status and value
+    static __device__ __forceinline__ void WaitForValid(
+        ScanTileDescriptor    *ptr,
+        int                     &status,
+        T                       &value)
+    {
+        ScanTileDescriptor snapshot;
+        for (;;)
+        {
+            // Status and value arrive together in a single LOAD_CG load
+            VectorWord word = ThreadLoad<LOAD_CG>(reinterpret_cast<VectorWord*>(ptr));
+            snapshot = *reinterpret_cast<ScanTileDescriptor*>(&word);
+
+            if (snapshot.status != SCAN_TILE_INVALID)
+                break;
+
+            __threadfence_block();
+        }
+
+        status = snapshot.status;
+        value = snapshot.value;
+    }
+
+};
+
+
+/**
+ * Data type of tile status descriptor.
+ *
+ * Specialized for scan status and value types that cannot be fused into
+ * the same machine word.  The value and the status are written with two
+ * separate stores, so writers and readers must order those accesses with
+ * memory fences: the value has to be visible before the status that
+ * announces it, and the status has to be read before the value it guards.
+ */
+template <typename T>
+struct ScanTileDescriptor<T, false>
+{
+    T       prefix_value;
+    T       partial_value;
+
+    /// Workaround for the fact that win32 doesn't guarantee 16B alignment 16B values of T
+    union
+    {
+        int                     status;
+        Uninitialized<T>        padding;
+    };
+
+    /// Publish \p prefix, then mark the descriptor SCAN_TILE_PREFIX
+    static __device__ __forceinline__ void SetPrefix(ScanTileDescriptor *ptr, T prefix)
+    {
+        ThreadStore<STORE_CG>(&ptr->prefix_value, prefix);
+
+        // The descriptor is consumed by threads of other thread blocks, so the
+        // ordering of the two stores must hold device-wide; a block-scope
+        // fence only orders them as observed from within this block.
+        __threadfence();
+
+        ThreadStore<STORE_CG>(&ptr->status, (int) SCAN_TILE_PREFIX);
+    }
+
+    /// Publish \p partial, then mark the descriptor SCAN_TILE_PARTIAL
+    static __device__ __forceinline__ void SetPartial(ScanTileDescriptor *ptr, T partial)
+    {
+        ThreadStore<STORE_CG>(&ptr->partial_value, partial);
+
+        // Device-wide ordering of value-before-status (see SetPrefix)
+        __threadfence();
+
+        ThreadStore<STORE_CG>(&ptr->status, (int) SCAN_TILE_PARTIAL);
+    }
+
+    /// Spin until the descriptor at \p ptr is no longer SCAN_TILE_INVALID, then return its status and the matching value
+    static __device__ __forceinline__ void WaitForValid(
+        ScanTileDescriptor    *ptr,
+        int                         &status,
+        T                           &value)
+    {
+        do
+        {
+            status = ThreadLoad<LOAD_CG>(&ptr->status);
+
+            // Fence after every status load, including the one that succeeds:
+            // it keeps the load from being hoisted out of the loop and keeps
+            // the value load below from being ordered ahead of the status
+            // load that validates it.
+            __threadfence();
+        }
+        while (status == SCAN_TILE_INVALID);
+
+        value = (status == SCAN_TILE_PARTIAL) ?
+            ThreadLoad<LOAD_CG>(&ptr->partial_value) :
+            ThreadLoad<LOAD_CG>(&ptr->prefix_value);
+    }
+};
+
+
+/**
+ * Stateful prefix functor that provides the running prefix for
+ * the current tile by using the callback warp to wait for
+ * aggregates/prefixes from predecessor tiles to become available
+ */
+template <
+    typename T,
+    typename ScanOp>
+struct DeviceScanBlockPrefixOp
+{
+    // Parameterized warp reduce
+    typedef WarpReduce<T>                       WarpReduceT;
+
+    // Storage type
+    typedef typename WarpReduceT::TempStorage   _TempStorage;
+
+    // Alias wrapper allowing storage to be unioned
+    typedef Uninitialized<_TempStorage>         TempStorage;
+
+    // Tile status descriptor type
+    typedef ScanTileDescriptor<T>               ScanTileDescriptorT;
+
+    // Fields
+    ScanTileDescriptorT         *d_tile_status;     ///< Pointer to array of tile status
+    _TempStorage                &temp_storage;      ///< Reference to a warp-reduction instance
+    ScanOp                      scan_op;            ///< Binary scan operator
+    int                         tile_idx;           ///< The current tile index
+    T                           inclusive_prefix;   ///< Inclusive prefix for the tile (only assigned by thread 0, inside operator())
+
+    // Constructor.  inclusive_prefix is left unset until operator() runs.
+    __device__ __forceinline__
+    DeviceScanBlockPrefixOp(
+        ScanTileDescriptorT     *d_tile_status,
+        TempStorage             &temp_storage,
+        ScanOp                  scan_op,
+        int                     tile_idx) :
+            d_tile_status(d_tile_status),
+            temp_storage(temp_storage.Alias()),
+            scan_op(scan_op),
+            tile_idx(tile_idx) {}
+
+
+    // Block until all predecessors within the specified window have non-invalid status.
+    // Each calling thread waits on its own predecessor descriptor; the values are then
+    // combined across the warp into window_aggregate.
+    __device__ __forceinline__
+    void ProcessWindow(
+        int                         predecessor_idx,
+        int                         &predecessor_status,
+        T                           &window_aggregate)
+    {
+        T value;
+        ScanTileDescriptorT::WaitForValid(d_tile_status + predecessor_idx, predecessor_status, value);
+
+        // Perform a segmented reduction to get the prefix for the current window.
+        // A predecessor whose status is anything other than SCAN_TILE_PARTIAL is
+        // flagged as the tail of a segment.
+        int flag = (predecessor_status != SCAN_TILE_PARTIAL);
+        window_aggregate = WarpReduceT(temp_storage).TailSegmentedReduce(value, flag, scan_op);
+    }
+
+
+    // Prefix functor (called by the first warp).
+    // Publishes this tile's aggregate, looks back over predecessor tiles one
+    // warp-sized window at a time until a tile with a known inclusive prefix is
+    // found, publishes this tile's inclusive prefix, and returns the exclusive
+    // prefix to apply to the tile.
+    __device__ __forceinline__
+    T operator()(T block_aggregate)
+    {
+        // Update our status with our tile-aggregate
+        if (threadIdx.x == 0)
+        {
+            ScanTileDescriptorT::SetPartial(d_tile_status + tile_idx, block_aggregate);
+        }
+
+        // Wait for the window of predecessor tiles to become valid.
+        // Thread k inspects the tile k+1 positions before this one.
+        // NOTE(review): predecessor_idx becomes negative once the window reaches
+        // past tile 0; presumably the status array is padded in front with
+        // SCAN_TILE_OOB entries -- confirm against the allocation code.
+        int predecessor_idx = tile_idx - threadIdx.x - 1;
+        int predecessor_status;
+        T window_aggregate;
+        ProcessWindow(predecessor_idx, predecessor_status, window_aggregate);
+
+        // The exclusive tile prefix starts out as the current window aggregate
+        T exclusive_prefix = window_aggregate;
+
+        // Keep sliding the window back until we come across a tile whose inclusive prefix is known
+        while (WarpAll(predecessor_status != SCAN_TILE_PREFIX))
+        {
+            predecessor_idx -= PtxArchProps::WARP_THREADS;
+
+            // Update exclusive tile prefix with the window prefix
+            // (the older window goes on the left-hand side of the operator)
+            ProcessWindow(predecessor_idx, predecessor_status, window_aggregate);
+            exclusive_prefix = scan_op(window_aggregate, exclusive_prefix);
+        }
+
+        // Compute the inclusive tile prefix and update the status for this tile
+        if (threadIdx.x == 0)
+        {
+            inclusive_prefix = scan_op(exclusive_prefix, block_aggregate);
+            ScanTileDescriptorT::SetPrefix(
+                d_tile_status + tile_idx,
+                inclusive_prefix);
+        }
+
+        // Return exclusive_prefix
+        return exclusive_prefix;
+    }
+};
+
+
+// Prefix callback for scans in which one thread block walks several
+// consecutive tiles: it carries the prefix from one tile's scan into the
+// next.
+//
+// NOTE(review): the running total is always advanced with operator+=,
+// whatever scan operator the surrounding scan uses -- confirm this type is
+// only paired with addition-compatible operators.
+template <typename T>
+struct RunningBlockPrefixOp
+{
+    // Prefix accumulated over the tiles consumed so far
+    T running_total;
+
+    // Callback operator: hands back the prefix in effect before this tile and
+    // folds the tile's aggregate into the running total.
+    __device__ T operator()(T block_aggregate)
+    {
+        T prefix_before_tile = running_total;
+        running_total += block_aggregate;
+        return prefix_before_tile;
+    }
+};
+
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
--- a/lib/kokkos/TPL/cub/device/block/specializations/block_histo_tiles_gatomic.cuh
+++ b/lib/kokkos/TPL/cub/device/block/specializations/block_histo_tiles_gatomic.cuh
@ -0,0 +1,184 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2013, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::BlockHistogramTilesGlobalAtomic implements a stateful abstraction of CUDA thread blocks for histogramming multiple tiles as part of device-wide histogram.
+ */
+
+#pragma once
+
+#include <iterator>
+
+#include "../../../util_type.cuh"
+#include "../../../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+
+/**
+ * BlockHistogramTilesGlobalAtomic implements a stateful abstraction of CUDA thread blocks for histogramming multiple tiles as part of device-wide histogram using global atomics.
+ *
+ * Every sample is counted with an atomicAdd issued directly on the output
+ * histogram of its channel, so this specialization needs no temporary storage
+ * and has nothing left to do in AggregateOutput().
+ */
+template <
+    typename    BlockHistogramTilesPolicy,      ///< Tuning policy
+    int         BINS,                           ///< Number of histogram bins per channel
+    int         CHANNELS,                       ///< Number of channels interleaved in the input data (may be greater than the number of active channels being histogrammed)
+    int         ACTIVE_CHANNELS,                ///< Number of channels actively being histogrammed
+    typename    InputIteratorRA,                ///< The input iterator type (may be a simple pointer type).  Must have a value type that can be cast as an integer in the range [0..BINS-1]
+    typename    HistoCounter,                   ///< Integral type for counting sample occurrences per histogram bin
+    typename    SizeT>                          ///< Integer type for offsets
+struct BlockHistogramTilesGlobalAtomic
+{
+    //---------------------------------------------------------------------
+    // Types and constants
+    //---------------------------------------------------------------------
+
+    // Sample type
+    typedef typename std::iterator_traits<InputIteratorRA>::value_type SampleT;
+
+    // Constants
+    enum
+    {
+        BLOCK_THREADS       = BlockHistogramTilesPolicy::BLOCK_THREADS,
+        ITEMS_PER_THREAD    = BlockHistogramTilesPolicy::ITEMS_PER_THREAD,
+        TILE_CHANNEL_ITEMS  = BLOCK_THREADS * ITEMS_PER_THREAD,         // Samples per channel in one tile
+        TILE_ITEMS          = TILE_CHANNEL_ITEMS * CHANNELS,            // Samples in one tile across all interleaved channels
+    };
+
+    // Shared memory type required by this thread block (none)
+    typedef NullType TempStorage;
+
+
+    //---------------------------------------------------------------------
+    // Per-thread fields
+    //---------------------------------------------------------------------
+
+    /// Reference to output histograms
+    HistoCounter* (&d_out_histograms)[ACTIVE_CHANNELS];
+
+    /// Input data to reduce
+    InputIteratorRA d_in;
+
+
+    //---------------------------------------------------------------------
+    // Interface
+    //---------------------------------------------------------------------
+
+    /**
+     * Constructor.  \p temp_storage is accepted for interface compatibility
+     * and is not used.
+     */
+    __device__ __forceinline__ BlockHistogramTilesGlobalAtomic(
+        TempStorage         &temp_storage,                                  ///< Reference to temp_storage
+        InputIteratorRA     d_in,                                           ///< Input data to reduce
+        HistoCounter*       (&d_out_histograms)[ACTIVE_CHANNELS])           ///< Reference to output histograms
+    :
+        // Initializers are listed in member declaration order, which is the
+        // order in which members are actually initialized
+        d_out_histograms(d_out_histograms),
+        d_in(d_in)
+    {}
+
+
+    /**
+     * Process a single tile of input.
+     *
+     * In both paths, thread t reads, for each ITEM, the CHANNELS interleaved
+     * samples starting at
+     * block_offset + (ITEM * BLOCK_THREADS * CHANNELS) + (t * CHANNELS),
+     * and counts only the first ACTIVE_CHANNELS of them.
+     */
+    template <bool FULL_TILE>
+    __device__ __forceinline__ void ConsumeTile(
+        SizeT   block_offset,               ///< The offset the tile to consume
+        int     valid_items = TILE_ITEMS)   ///< The number of valid items in the tile
+    {
+        if (FULL_TILE)
+        {
+            // Full tile of samples to read and composite
+            SampleT items[ITEMS_PER_THREAD][CHANNELS];
+
+            // Read all of this thread's active-channel samples first ...
+            #pragma unroll
+            for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+            {
+                #pragma unroll
+                for (int CHANNEL = 0; CHANNEL < CHANNELS; ++CHANNEL)
+                {
+                    if (CHANNEL < ACTIVE_CHANNELS)
+                    {
+                        items[ITEM][CHANNEL] = d_in[block_offset + (ITEM * BLOCK_THREADS * CHANNELS) + (threadIdx.x * CHANNELS) + CHANNEL];
+                    }
+                }
+            }
+
+            // ... with a fence between the batch of reads and the batch of atomic updates
+            __threadfence_block();
+
+            // ... then count each sample in its channel's histogram
+            #pragma unroll
+            for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+            {
+                #pragma unroll
+                for (int CHANNEL = 0; CHANNEL < CHANNELS; ++CHANNEL)
+                {
+                    if (CHANNEL < ACTIVE_CHANNELS)
+                    {
+                        atomicAdd(d_out_histograms[CHANNEL] + items[ITEM][CHANNEL], 1);
+                    }
+                }
+            }
+        }
+        else
+        {
+            // Only a partially-full tile of samples to read and composite.
+            // bounds is the number of valid items at or after this thread's
+            // first sample, so (tile-relative index - thread offset) < bounds
+            // is the in-range test below.
+            int bounds = valid_items - (threadIdx.x * CHANNELS);
+
+            #pragma unroll
+            for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+            {
+                #pragma unroll
+                for (int CHANNEL = 0; CHANNEL < CHANNELS; ++CHANNEL)
+                {
+                    if (((ACTIVE_CHANNELS == CHANNELS) || (CHANNEL < ACTIVE_CHANNELS)) && ((ITEM * BLOCK_THREADS * CHANNELS) + CHANNEL < bounds))
+                    {
+                        SampleT item  = d_in[block_offset + (ITEM * BLOCK_THREADS * CHANNELS) + (threadIdx.x * CHANNELS) + CHANNEL];
+                        atomicAdd(d_out_histograms[CHANNEL] + item, 1);
+                    }
+                }
+            }
+
+        }
+    }
+
+
+    /**
+     * Aggregate results into output (a no-op: counts were already added to
+     * the output histograms by ConsumeTile)
+     */
+    __device__ __forceinline__ void AggregateOutput()
+    {}
+};
+
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
--- a/lib/kokkos/TPL/cub/device/block/specializations/block_histo_tiles_satomic.cuh
+++ b/lib/kokkos/TPL/cub/device/block/specializations/block_histo_tiles_satomic.cuh
@ -0,0 +1,237 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2013, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::BlockHistogramTilesSharedAtomic implements a stateful abstraction of CUDA thread blocks for histogramming multiple tiles as part of device-wide histogram using shared atomics
+ */
+
+#pragma once
+
+#include <iterator>
+
+#include "../../../util_type.cuh"
+#include "../../../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/**
+ * BlockHistogramTilesSharedAtomic implements a stateful abstraction of CUDA thread blocks for histogramming multiple tiles as part of device-wide histogram using shared atomics
+ *
+ * Usage pattern implied by the members below: every thread of the block
+ * constructs the object (the constructor cooperatively zeroes the shared
+ * histograms and contains a block-wide barrier), calls ConsumeTile() once per
+ * tile, and finally calls AggregateOutput(), which copies the block's
+ * privatized histograms to d_out_histograms[CHANNEL][blockIdx.x * BINS + bin].
+ */
+template <
+    typename    BlockHistogramTilesPolicy,          ///< Tuning policy
+    int         BINS,                           ///< Number of histogram bins
+    int         CHANNELS,                       ///< Number of channels interleaved in the input data (may be greater than the number of active channels being histogrammed)
+    int         ACTIVE_CHANNELS,                ///< Number of channels actively being histogrammed
+    typename    InputIteratorRA,                ///< The input iterator type (may be a simple pointer type).  Must have a value type that can be cast as an integer in the range [0..BINS-1]
+    typename    HistoCounter,                   ///< Integral type for counting sample occurrences per histogram bin
+    typename    SizeT>                          ///< Integer type for offsets
+struct BlockHistogramTilesSharedAtomic
+{
+    //---------------------------------------------------------------------
+    // Types and constants
+    //---------------------------------------------------------------------
+
+    // Sample type
+    typedef typename std::iterator_traits<InputIteratorRA>::value_type SampleT;
+
+    // Constants
+    enum
+    {
+        BLOCK_THREADS       = BlockHistogramTilesPolicy::BLOCK_THREADS,
+        ITEMS_PER_THREAD    = BlockHistogramTilesPolicy::ITEMS_PER_THREAD,
+        TILE_CHANNEL_ITEMS  = BLOCK_THREADS * ITEMS_PER_THREAD,     // Samples per channel in one tile
+        TILE_ITEMS          = TILE_CHANNEL_ITEMS * CHANNELS,        // Interleaved samples (all channels) in one tile
+    };
+
+    /// Shared memory type required by this thread block
+    struct _TempStorage
+    {
+        HistoCounter histograms[ACTIVE_CHANNELS][BINS + 1];  // One word of padding between channel histograms to prevent warps working on different histograms from hammering on the same bank
+    };
+
+
+    /// Alias wrapper allowing storage to be unioned
+    struct TempStorage : Uninitialized<_TempStorage> {};
+
+
+    //---------------------------------------------------------------------
+    // Per-thread fields
+    //---------------------------------------------------------------------
+
+    /// Reference to temp_storage
+    _TempStorage &temp_storage;
+
+    /// Reference to output histograms
+    HistoCounter* (&d_out_histograms)[ACTIVE_CHANNELS];
+
+    /// Input data to reduce
+    InputIteratorRA d_in;
+
+
+    //---------------------------------------------------------------------
+    // Interface
+    //---------------------------------------------------------------------
+
+    /**
+     * Constructor.  Must be executed by every thread of the block: the bins are
+     * zeroed cooperatively (thread t clears bins t, t + BLOCK_THREADS, ...) and
+     * the constructor ends with a block-wide barrier.
+     */
+    __device__ __forceinline__ BlockHistogramTilesSharedAtomic(
+        TempStorage         &temp_storage,                                  ///< Reference to temp_storage
+        InputIteratorRA     d_in,                                           ///< Input data to reduce
+        HistoCounter*       (&d_out_histograms)[ACTIVE_CHANNELS])           ///< Reference to output histograms
+    :
+        // Listed in declaration order, which is the order members are actually initialized in
+        temp_storage(temp_storage.Alias()),
+        d_out_histograms(d_out_histograms),
+        d_in(d_in)
+    {
+        // Initialize histogram bin counts to zeros
+        #pragma unroll
+        for (int CHANNEL = 0; CHANNEL < ACTIVE_CHANNELS; ++CHANNEL)
+        {
+            int histo_offset = 0;
+
+            // Unguarded passes: each covers BLOCK_THREADS consecutive bins
+            #pragma unroll
+            for(; histo_offset + BLOCK_THREADS <= BINS; histo_offset += BLOCK_THREADS)
+            {
+                this->temp_storage.histograms[CHANNEL][histo_offset + threadIdx.x] = 0;
+            }
+            // Finish up with guarded initialization if necessary
+            if ((BINS % BLOCK_THREADS != 0) && (histo_offset + threadIdx.x < BINS))
+            {
+                this->temp_storage.histograms[CHANNEL][histo_offset + threadIdx.x] = 0;
+            }
+        }
+
+        // Barrier: a bin is zeroed by one thread but may be incremented by any
+        // other thread in ConsumeTile().  Without this, an atomicAdd could land
+        // on a bin before its owner has cleared it, and the later zero-store
+        // would discard that count.
+        __syncthreads();
+    }
+
+
+    /**
+     * Process a single tile of input.
+     *
+     * Samples are read channel-interleaved: thread t's ITEM-th sample of channel
+     * c is d_in[block_offset + ITEM * BLOCK_THREADS * CHANNELS + t * CHANNELS + c].
+     * Each active-channel sample value is used directly as the bin index for an
+     * atomicAdd on the shared histogram of that channel.
+     */
+    template <bool FULL_TILE>
+    __device__ __forceinline__ void ConsumeTile(
+        SizeT   block_offset,               ///< The offset of the tile to consume
+        int     valid_items = TILE_ITEMS)   ///< The number of valid items in the tile
+    {
+        if (FULL_TILE)
+        {
+            // Full tile of samples to read and composite
+            SampleT items[ITEMS_PER_THREAD][CHANNELS];
+
+            // Unguarded loads of every active-channel sample owned by this thread
+            #pragma unroll
+            for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+            {
+                #pragma unroll
+                for (int CHANNEL = 0; CHANNEL < CHANNELS; ++CHANNEL)
+                {
+                    if (CHANNEL < ACTIVE_CHANNELS)
+                    {
+                        items[ITEM][CHANNEL] = d_in[block_offset + (ITEM * BLOCK_THREADS * CHANNELS) + (threadIdx.x * CHANNELS) + CHANNEL];
+                    }
+                }
+            }
+
+            // Fence between the batch of loads and the batch of atomics
+            // (presumably to keep the compiler from interleaving them)
+            __threadfence_block();
+
+            #pragma unroll
+            for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+            {
+                #pragma unroll
+                for (int CHANNEL = 0; CHANNEL < CHANNELS; ++CHANNEL)
+                {
+                    if (CHANNEL < ACTIVE_CHANNELS)
+                    {
+                        atomicAdd(temp_storage.histograms[CHANNEL] + items[ITEM][CHANNEL], 1);
+                    }
+                }
+            }
+
+            __threadfence_block();
+        }
+        else
+        {
+            // Only a partially-full tile of samples to read and composite.
+            // bounds = number of valid interleaved items at or beyond this
+            // thread's first slot in the tile.
+            int bounds = valid_items - (threadIdx.x * CHANNELS);
+
+            #pragma unroll
+            for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+            {
+                #pragma unroll
+                for (int CHANNEL = 0; CHANNEL < CHANNELS; ++CHANNEL)
+                {
+                    // Active channel and in-bounds sample only
+                    if (((ACTIVE_CHANNELS == CHANNELS) || (CHANNEL < ACTIVE_CHANNELS)) && ((ITEM * BLOCK_THREADS * CHANNELS) + CHANNEL < bounds))
+                    {
+                        SampleT item  = d_in[block_offset + (ITEM * BLOCK_THREADS * CHANNELS) + (threadIdx.x * CHANNELS) + CHANNEL];
+                        atomicAdd(temp_storage.histograms[CHANNEL] + item, 1);
+                    }
+                }
+            }
+
+        }
+    }
+
+
+    /**
+     * Aggregate results into output.
+     *
+     * Copies this block's shared-memory histograms to
+     * d_out_histograms[CHANNEL][blockIdx.x * BINS + bin].  Contains a
+     * block-wide barrier, so every thread of the block must call it.
+     */
+    __device__ __forceinline__ void AggregateOutput()
+    {
+        // Barrier to ensure shared memory histograms are coherent
+        __syncthreads();
+
+        // Copy shared memory histograms to output
+        #pragma unroll
+        for (int CHANNEL = 0; CHANNEL < ACTIVE_CHANNELS; ++CHANNEL)
+        {
+            int channel_offset  = (blockIdx.x * BINS);
+            int histo_offset    = 0;
+
+            #pragma unroll
+            for(; histo_offset + BLOCK_THREADS <= BINS; histo_offset += BLOCK_THREADS)
+            {
+                d_out_histograms[CHANNEL][channel_offset + histo_offset + threadIdx.x] = temp_storage.histograms[CHANNEL][histo_offset + threadIdx.x];
+            }
+            // Finish up with a guarded copy of the remaining bins if necessary
+            if ((BINS % BLOCK_THREADS != 0) && (histo_offset + threadIdx.x < BINS))
+            {
+                d_out_histograms[CHANNEL][channel_offset + histo_offset + threadIdx.x] = temp_storage.histograms[CHANNEL][histo_offset + threadIdx.x];
+            }
+        }
+    }
+};
+
+
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
--- a/lib/kokkos/TPL/cub/device/block/specializations/block_histo_tiles_sort.cuh
+++ b/lib/kokkos/TPL/cub/device/block/specializations/block_histo_tiles_sort.cuh
@ -0,0 +1,364 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2013, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::BlockHistogramTilesSort implements a stateful abstraction of CUDA thread blocks for histogramming multiple tiles as part of device-wide histogram using local sorting
+ */
+
+#pragma once
+
+#include <iterator>
+
+#include "../../../block/block_radix_sort.cuh"
+#include "../../../block/block_discontinuity.cuh"
+#include "../../../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/**
+ * BlockHistogramTilesSort implements a stateful abstraction of CUDA thread blocks for histogramming multiple tiles as part of device-wide histogram using local sorting
+ *
+ * For each active channel of a tile the block radix-sorts the samples, records
+ * where each run of equal values begins and ends, and adds the run lengths to
+ * per-thread counters (bin b is held by thread b % BLOCK_THREADS in counter
+ * b / BLOCK_THREADS).  AggregateOutput() writes the counters to
+ * d_out_histograms[CHANNEL][blockIdx.x * BINS + bin].
+ */
+template <
+    typename    BlockHistogramTilesPolicy,          ///< Tuning policy
+    int         BINS,                           ///< Number of histogram bins per channel
+    int         CHANNELS,                       ///< Number of channels interleaved in the input data (may be greater than the number of active channels being histogrammed)
+    int         ACTIVE_CHANNELS,                ///< Number of channels actively being histogrammed
+    typename    InputIteratorRA,                ///< The input iterator type (may be a simple pointer type).  Must have a value type that can be cast as an integer in the range [0..BINS-1]
+    typename    HistoCounter,                   ///< Integral type for counting sample occurrences per histogram bin
+    typename    SizeT>                          ///< Integer type for offsets
+struct BlockHistogramTilesSort
+{
+    //---------------------------------------------------------------------
+    // Types and constants
+    //---------------------------------------------------------------------
+
+    // Sample type
+    typedef typename std::iterator_traits<InputIteratorRA>::value_type SampleT;
+
+    // Constants
+    enum
+    {
+        BLOCK_THREADS               = BlockHistogramTilesPolicy::BLOCK_THREADS,
+        ITEMS_PER_THREAD            = BlockHistogramTilesPolicy::ITEMS_PER_THREAD,
+        TILE_CHANNEL_ITEMS          = BLOCK_THREADS * ITEMS_PER_THREAD,     // Samples per channel in one tile
+        TILE_ITEMS                  = TILE_CHANNEL_ITEMS * CHANNELS,        // Interleaved samples (all channels) in one tile
+
+        // Bins held by each thread, rounded up so that BINS need not be a multiple of BLOCK_THREADS
+        STRIPED_COUNTERS_PER_THREAD = (BINS + BLOCK_THREADS - 1) / BLOCK_THREADS,
+    };
+
+    // Parameterize BlockRadixSort type for our thread block
+    typedef BlockRadixSort<SampleT, BLOCK_THREADS, ITEMS_PER_THREAD> BlockRadixSortT;
+
+    // Parameterize BlockDiscontinuity type for our thread block
+    typedef BlockDiscontinuity<SampleT, BLOCK_THREADS> BlockDiscontinuityT;
+
+    /// Shared memory type required by this thread block.  NOTE: the sort
+    /// storage aliases the flag/run_begin/run_end storage, so a barrier is
+    /// required whenever use switches from one to the other.
+    union _TempStorage
+    {
+        // Storage for sorting bin values
+        typename BlockRadixSortT::TempStorage sort;
+
+        struct
+        {
+            // Storage for detecting discontinuities in the tile of sorted bin values
+            typename BlockDiscontinuityT::TempStorage flag;
+
+            // Storage for noting begin/end offsets of bin runs in the tile of sorted bin values
+            int run_begin[BLOCK_THREADS * STRIPED_COUNTERS_PER_THREAD];
+            int run_end[BLOCK_THREADS * STRIPED_COUNTERS_PER_THREAD];
+        };
+    };
+
+
+    /// Alias wrapper allowing storage to be unioned
+    struct TempStorage : Uninitialized<_TempStorage> {};
+
+
+    // Discontinuity functor.  Besides reporting whether two neighbouring sorted
+    // samples differ, it records the boundary position as a side effect.
+    struct DiscontinuityOp
+    {
+        // Reference to temp_storage
+        _TempStorage &temp_storage;
+
+        // Constructor
+        __device__ __forceinline__ DiscontinuityOp(_TempStorage &temp_storage) :
+            temp_storage(temp_storage)
+        {}
+
+        // Discontinuity predicate: a is the preceding sample, b the sample at b_index
+        __device__ __forceinline__ bool operator()(const SampleT &a, const SampleT &b, int b_index)
+        {
+            if (a != b)
+            {
+                // Note the begin/end offsets in shared storage: the run of b
+                // starts here and the run of a stops here
+                temp_storage.run_begin[b] = b_index;
+                temp_storage.run_end[a] = b_index;
+
+                return true;
+            }
+            else
+            {
+                return false;
+            }
+        }
+    };
+
+
+    //---------------------------------------------------------------------
+    // Per-thread fields
+    //---------------------------------------------------------------------
+
+    /// Reference to temp_storage
+    _TempStorage &temp_storage;
+
+    /// Histogram counters striped across threads
+    HistoCounter thread_counters[ACTIVE_CHANNELS][STRIPED_COUNTERS_PER_THREAD];
+
+    /// Reference to output histograms
+    HistoCounter* (&d_out_histograms)[ACTIVE_CHANNELS];
+
+    /// Input data to reduce
+    InputIteratorRA d_in;
+
+
+    //---------------------------------------------------------------------
+    // Interface
+    //---------------------------------------------------------------------
+
+    /**
+     * Constructor.  Zeroes this thread's striped counters; touches no shared memory.
+     */
+    __device__ __forceinline__ BlockHistogramTilesSort(
+        TempStorage         &temp_storage,                                  ///< Reference to temp_storage
+        InputIteratorRA     d_in,                                           ///< Input data to reduce
+        HistoCounter*       (&d_out_histograms)[ACTIVE_CHANNELS])           ///< Reference to output histograms
+    :
+        // Listed in declaration order, which is the order members are actually initialized in
+        temp_storage(temp_storage.Alias()),
+        d_out_histograms(d_out_histograms),
+        d_in(d_in)
+    {
+        // Initialize histogram counters striped across threads
+        #pragma unroll
+        for (int CHANNEL = 0; CHANNEL < ACTIVE_CHANNELS; ++CHANNEL)
+        {
+            #pragma unroll
+            for (int COUNTER = 0; COUNTER < STRIPED_COUNTERS_PER_THREAD; ++COUNTER)
+            {
+                thread_counters[CHANNEL][COUNTER] = 0;
+            }
+        }
+    }
+
+
+    /**
+     * Composite a tile of input items into one channel's striped counters.
+     *
+     * Contains block-wide barriers, so all threads of the block must call it.
+     * On return other threads may still be reading run_begin/run_end; the
+     * caller must synchronize before the (aliased) sort storage is reused.
+     */
+    __device__ __forceinline__ void Composite(
+        SampleT   (&items)[ITEMS_PER_THREAD],                     ///< Tile of samples
+        HistoCounter    thread_counters[STRIPED_COUNTERS_PER_THREAD])   ///< Histogram counters striped across threads
+    {
+        // Sort bytes in blocked arrangement
+        BlockRadixSortT(temp_storage.sort).Sort(items);
+
+        // Sort storage is about to be reused as run_begin/run_end
+        __syncthreads();
+
+        // Initialize the shared memory's run_begin and run_end for each bin.
+        // Both default to TILE_CHANNEL_ITEMS so that an absent bin has length
+        // zero and the last run ends at the end of the tile.
+        #pragma unroll
+        for (int COUNTER = 0; COUNTER < STRIPED_COUNTERS_PER_THREAD; ++COUNTER)
+        {
+            temp_storage.run_begin[(COUNTER * BLOCK_THREADS) + threadIdx.x] = TILE_CHANNEL_ITEMS;
+            temp_storage.run_end[(COUNTER * BLOCK_THREADS) + threadIdx.x] = TILE_CHANNEL_ITEMS;
+        }
+
+        __syncthreads();
+
+        // Note the begin/end run offsets of bin runs in the sorted tile
+        int flags[ITEMS_PER_THREAD];                // unused
+        DiscontinuityOp flag_op(temp_storage);
+        BlockDiscontinuityT(temp_storage.flag).FlagHeads(flags, items, flag_op);
+
+        // Update begin for first item (it has no predecessor, so flag_op never records it)
+        if (threadIdx.x == 0) temp_storage.run_begin[items[0]] = 0;
+
+        __syncthreads();
+
+        // Composite into histogram: add each owned bin's run length to its counter
+        #pragma unroll
+        for (int COUNTER = 0; COUNTER < STRIPED_COUNTERS_PER_THREAD; ++COUNTER)
+        {
+            int          bin            = (COUNTER * BLOCK_THREADS) + threadIdx.x;
+            HistoCounter run_length     = temp_storage.run_end[bin] - temp_storage.run_begin[bin];
+
+            thread_counters[COUNTER] += run_length;
+        }
+    }
+
+
+    /**
+     * Process one channel within a tile.
+     *
+     * Thread t's ITEM-th sample of the channel is
+     * d_in[channel + block_offset + ITEM * BLOCK_THREADS * CHANNELS + t * CHANNELS].
+     */
+    template <bool FULL_TILE>
+    __device__ __forceinline__ void ConsumeTileChannel(
+        int     channel,                    ///< Channel to histogram
+        SizeT   block_offset,               ///< The offset of the tile to consume
+        int     valid_items)                ///< The number of valid (interleaved) items in the tile
+    {
+        // Load items in striped fashion
+        if (FULL_TILE)
+        {
+            // Full tile of samples to read and composite
+            SampleT items[ITEMS_PER_THREAD];
+
+            // Unguarded loads
+            #pragma unroll
+            for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+            {
+                items[ITEM] = d_in[channel + block_offset + (ITEM * BLOCK_THREADS * CHANNELS) + (threadIdx.x * CHANNELS)];
+            }
+
+            // Composite our histogram data
+            Composite(items, thread_counters[channel]);
+        }
+        else
+        {
+            // Only a partially-full tile of samples to read and composite
+            SampleT items[ITEMS_PER_THREAD];
+
+            // Out-of-bounds items are assigned to bin 0; the resulting
+            // overcount in bin 0 is subtracted out again below.
+            // NOTE(review): the guard ignores the channel offset, so this
+            // appears to assume valid_items is a multiple of CHANNELS -- confirm.
+            int bounds = (valid_items - (threadIdx.x * CHANNELS));
+
+            #pragma unroll
+            for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+            {
+                items[ITEM] = ((ITEM * BLOCK_THREADS * CHANNELS) < bounds) ?
+                    d_in[channel + block_offset + (ITEM * BLOCK_THREADS * CHANNELS) + (threadIdx.x * CHANNELS)] :
+                    0;
+            }
+
+            // Composite our histogram data
+            Composite(items, thread_counters[channel]);
+
+            __syncthreads();
+
+            // Correct the overcounting in the zero-bin from invalid (out-of-bounds) items.
+            // Bin 0 lives in thread 0, counter 0.
+            if (threadIdx.x == 0)
+            {
+                int extra = (TILE_ITEMS - valid_items) / CHANNELS;
+                thread_counters[channel][0] -= extra;
+            }
+        }
+    }
+
+
+    /**
+     * Template iteration over channels (to silence not-unrolled warnings for SM10-13).  Inductive step.
+     */
+    template <bool FULL_TILE, int CHANNEL, int END>
+    struct IterateChannels
+    {
+        /**
+         * Process one channel within a tile.
+         */
+        static __device__ __forceinline__ void ConsumeTileChannel(
+            BlockHistogramTilesSort *cta,
+            SizeT               block_offset,
+            int                 valid_items)
+        {
+            // The previous channel's Composite() may still be reading shared storage
+            __syncthreads();
+
+            cta->ConsumeTileChannel<FULL_TILE>(CHANNEL, block_offset, valid_items);
+
+            IterateChannels<FULL_TILE, CHANNEL + 1, END>::ConsumeTileChannel(cta, block_offset, valid_items);
+        }
+    };
+
+
+    /**
+     * Template iteration over channels (to silence not-unrolled warnings for SM10-13).  Base step.
+     */
+    template <bool FULL_TILE, int END>
+    struct IterateChannels<FULL_TILE, END, END>
+    {
+        static __device__ __forceinline__ void ConsumeTileChannel(BlockHistogramTilesSort *cta, SizeT block_offset, int valid_items) {}
+    };
+
+
+    /**
+     * Process a single tile of input.  Contains block-wide barriers, so all
+     * threads of the block must call it.
+     */
+    template <bool FULL_TILE>
+    __device__ __forceinline__ void ConsumeTile(
+        SizeT   block_offset,               ///< The offset of the tile to consume
+        int     valid_items = TILE_ITEMS)   ///< The number of valid items in the tile
+    {
+        // Barrier: the previous tile's last Composite() finishes by reading
+        // run_begin/run_end, which share storage with the sort scratch space
+        // that the first channel below starts overwriting.  Nothing in this
+        // class otherwise separates the two when tiles are consumed back to
+        // back (only channels >= 1 are preceded by a barrier).
+        __syncthreads();
+
+        // First channel
+        ConsumeTileChannel<FULL_TILE>(0, block_offset, valid_items);
+
+        // Iterate through remaining channels
+        IterateChannels<FULL_TILE, 1, ACTIVE_CHANNELS>::ConsumeTileChannel(this, block_offset, valid_items);
+    }
+
+
+    /**
+     * Aggregate results into output.
+     *
+     * Writes this thread's counters to
+     * d_out_histograms[CHANNEL][blockIdx.x * BINS + bin] for every bin it holds.
+     */
+    __device__ __forceinline__ void AggregateOutput()
+    {
+        // Copy counters striped across threads into the histogram output
+        #pragma unroll
+        for (int CHANNEL = 0; CHANNEL < ACTIVE_CHANNELS; ++CHANNEL)
+        {
+            int channel_offset  = (blockIdx.x * BINS);
+
+            #pragma unroll
+            for (int COUNTER = 0; COUNTER < STRIPED_COUNTERS_PER_THREAD; ++COUNTER)
+            {
+                int bin = (COUNTER * BLOCK_THREADS) + threadIdx.x;
+
+                // Skip the padding counters that exist only because of round-up
+                if ((STRIPED_COUNTERS_PER_THREAD * BLOCK_THREADS == BINS) || (bin < BINS))
+                {
+                    d_out_histograms[CHANNEL][channel_offset + bin] = thread_counters[CHANNEL][COUNTER];
+                }
+            }
+        }
+    }
+};
+
+
+
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
--- a/lib/kokkos/TPL/cub/device/device_histogram.cuh
+++ b/lib/kokkos/TPL/cub/device/device_histogram.cuh
--- a/lib/kokkos/TPL/cub/device/device_radix_sort.cuh
+++ b/lib/kokkos/TPL/cub/device/device_radix_sort.cuh
@ -0,0 +1,890 @@
+
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2013, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::DeviceRadixSort provides operations for computing a device-wide, parallel radix sort across data items residing within global memory.
+ */
+
+#pragma once
+
+#include <stdio.h>
+#include <iterator>
+
+#include "block/block_radix_sort_upsweep_tiles.cuh"
+#include "block/block_radix_sort_downsweep_tiles.cuh"
+#include "block/block_scan_tiles.cuh"
+#include "../grid/grid_even_share.cuh"
+#include "../util_debug.cuh"
+#include "../util_device.cuh"
+#include "../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
+
+
+
+
+/******************************************************************************
+ * Kernel entry points
+ *****************************************************************************/
+
+/**
+ * Upsweep pass kernel entry point (multi-block).  Each block counts the digits
+ * found in its share of the keys and stores one count per digit into the spine.
+ *
+ * Spine layout is digit-major: the count of digit d from block b is written to
+ * d_spine[gridDim.x * d + b] by thread d.  The parameters num_items and
+ * first_pass are not referenced by this kernel body.
+ */
+template <
+    typename                BlockRadixSortUpsweepTilesPolicy, ///< Tuning policy for cub::BlockRadixSortUpsweepTiles abstraction
+    typename                Key,                            ///< Key type
+    typename                SizeT>                          ///< Integer type used for global array indexing
+__launch_bounds__ (int(BlockRadixSortUpsweepTilesPolicy::BLOCK_THREADS), 1)
+__global__ void RadixSortUpsweepKernel(
+    Key                     *d_keys,                        ///< [in] Input keys buffer
+    SizeT                   *d_spine,                       ///< [out] Privatized (per block) digit histograms (striped, i.e., 0s counts from each block, then 1s counts from each block, etc.)
+    SizeT                   num_items,                      ///< [in] Total number of input data items
+    int                     current_bit,                    ///< [in] Bit position of current radix digit
+    bool                    use_primary_bit_granularity,    ///< [in] Whether or not to use the primary policy (or the embedded alternate policy for smaller bit granularity)
+    bool                    first_pass,                     ///< [in] Whether this is the first digit pass
+    GridEvenShare<SizeT>    even_share)                     ///< [in] Descriptor for how to map an even-share of tiles across thread blocks
+{
+    // Tile processors for the two supported digit widths: the primary policy,
+    // and the alternate policy it embeds for when fewer bits remain
+    typedef BlockRadixSortUpsweepTiles<BlockRadixSortUpsweepTilesPolicy, Key, SizeT>                        PrimaryUpsweepT;
+    typedef BlockRadixSortUpsweepTiles<typename BlockRadixSortUpsweepTilesPolicy::AltPolicy, Key, SizeT>    AlternateUpsweepT;
+
+    // Only one flavor runs per launch, so their shared storage can overlap
+    __shared__ union
+    {
+        typename PrimaryUpsweepT::TempStorage     pass_storage;
+        typename AlternateUpsweepT::TempStorage   alt_pass_storage;
+    } temp_storage;
+
+    // Work out which range of the input belongs to this thread block
+    even_share.BlockInit();
+
+    if (!use_primary_bit_granularity)
+    {
+        // Alternate (smaller) granularity: the first RADIX_DIGITS threads each
+        // receive the count for one digit
+        SizeT digit_count;
+        AlternateUpsweepT(temp_storage.alt_pass_storage, d_keys, current_bit).ProcessTiles(
+            even_share.block_offset,
+            even_share.block_oob,
+            digit_count);
+
+        // Striped store of the digit counts
+        if (threadIdx.x < AlternateUpsweepT::RADIX_DIGITS)
+        {
+            d_spine[(gridDim.x * threadIdx.x) + blockIdx.x] = digit_count;
+        }
+    }
+    else
+    {
+        // Primary granularity: the first RADIX_DIGITS threads each receive the
+        // count for one digit
+        SizeT digit_count;
+        PrimaryUpsweepT(temp_storage.pass_storage, d_keys, current_bit).ProcessTiles(
+            even_share.block_offset,
+            even_share.block_oob,
+            digit_count);
+
+        // Striped store of the digit counts
+        if (threadIdx.x < PrimaryUpsweepT::RADIX_DIGITS)
+        {
+            d_spine[(gridDim.x * threadIdx.x) + blockIdx.x] = digit_count;
+        }
+    }
+}
+
+
+/**
+ * Spine scan kernel entry point (single-block).  Computes an exclusive prefix sum over the privatized digit histograms
+ *
+ * The scan is performed in place on d_spine, one tile of
+ * BlockScanTilesT::TILE_ITEMS counts at a time, with the running total carried
+ * from tile to tile.  Every tile goes through the same
+ * ConsumeTile<true, false> path, so num_counts is presumably expected to be a
+ * multiple of the tile size (i.e. the spine is padded) -- confirm against the
+ * host-side dispatch.
+ */
+template <
+    typename    BlockScanTilesPolicy,   ///< Tuning policy for cub::BlockScanTiles abstraction
+    typename    SizeT>                  ///< Integer type used for global array indexing
+__launch_bounds__ (int(BlockScanTilesPolicy::BLOCK_THREADS), 1)
+__global__ void RadixSortScanKernel(
+    SizeT       *d_spine,               ///< [in,out] Privatized (per block) digit histograms (striped, i.e., 0s counts from each block, then 1s counts from each block, etc.)
+    int         num_counts)             ///< [in] Total number of bin-counts
+{
+    // Parameterize the BlockScanTiles type for the current configuration
+    typedef BlockScanTiles<BlockScanTilesPolicy, SizeT*, SizeT*, cub::Sum, SizeT, SizeT> BlockScanTilesT;
+
+    // Shared memory storage
+    __shared__ typename BlockScanTilesT::TempStorage temp_storage;
+
+    // Block scan instance (input and output are both the spine)
+    BlockScanTilesT block_scan(temp_storage, d_spine, d_spine, cub::Sum(), SizeT(0)) ;
+
+    // Process full input tiles, seeding each with the total of all previous tiles
+    int block_offset = 0;
+    RunningBlockPrefixOp<SizeT> prefix_op;
+    prefix_op.running_total = 0;
+    while (block_offset < num_counts)
+    {
+        // block_scan has a dependent type, so the "template" disambiguator is
+        // required for the member-template call to be well-formed C++
+        block_scan.template ConsumeTile<true, false>(block_offset, prefix_op);
+        block_offset += BlockScanTilesT::TILE_ITEMS;
+    }
+}
+
+
+/**
+ * Downsweep pass kernel entry point (multi-block).  Scatters keys (and values) from the
+ * input buffers into the output buffers according to the current digit place.
+ *
+ * One of two tile-processing parameterizations is run, chosen by
+ * \p use_primary_bit_granularity: the one built from the kernel's policy, or the one built
+ * from that policy's embedded AltPolicy.  Both share a single shared-memory allocation.
+ * \p first_pass and \p last_pass are accepted but not referenced by this kernel body.
+ */
+template <
+    typename                BlockRadixSortDownsweepTilesPolicy,   ///< Tuning policy for cub::BlockRadixSortDownsweepTiles abstraction
+    typename                Key,                                ///< Key type
+    typename                Value,                              ///< Value type
+    typename                SizeT>                              ///< Integer type used for global array indexing
+__launch_bounds__ (int(BlockRadixSortDownsweepTilesPolicy::BLOCK_THREADS))
+__global__ void RadixSortDownsweepKernel(
+    Key                     *d_keys_in,                     ///< [in] Input keys ping buffer
+    Key                     *d_keys_out,                    ///< [out] Output keys pong buffer
+    Value                   *d_values_in,                   ///< [in] Input values ping buffer
+    Value                   *d_values_out,                  ///< [out] Output values pong buffer
+    SizeT                   *d_spine,                       ///< [in] Scan of privatized (per block) digit histograms (striped, i.e., 0s counts from each block, then 1s counts from each block, etc.)
+    SizeT                   num_items,                      ///< [in] Total number of input data items
+    int                     current_bit,                    ///< [in] Bit position of current radix digit
+    bool                    use_primary_bit_granularity,    ///< [in] Whether or not to use the primary policy (or the embedded alternate policy for smaller bit granularity)
+    bool                    first_pass,                     ///< [in] Whether this is the first digit pass
+    bool                    last_pass,                      ///< [in] Whether this is the last digit pass
+    GridEvenShare<SizeT>    even_share)                     ///< [in] Descriptor for how to map an even-share of tiles across thread blocks
+{
+    // Tile processors for the primary policy and for its embedded alternate policy
+    typedef BlockRadixSortDownsweepTiles<
+            BlockRadixSortDownsweepTilesPolicy, Key, Value, SizeT>
+        PrimaryTilesT;
+    typedef BlockRadixSortDownsweepTiles<
+            typename BlockRadixSortDownsweepTilesPolicy::AltPolicy, Key, Value, SizeT>
+        AlternateTilesT;
+
+    // Only one of the two processors runs per launch, so their scratch space is overlaid
+    __shared__ union
+    {
+        typename PrimaryTilesT::TempStorage      primary;
+        typename AlternateTilesT::TempStorage    alternate;
+
+    } smem;
+
+    // Resolve this thread block's share of the input
+    even_share.BlockInit();
+
+    if (!use_primary_bit_granularity)
+    {
+        // Alternate digit width
+        AlternateTilesT alternate_tiles(
+            smem.alternate,
+            num_items,
+            d_spine,
+            d_keys_in,
+            d_keys_out,
+            d_values_in,
+            d_values_out,
+            current_bit);
+
+        alternate_tiles.ProcessTiles(even_share.block_offset, even_share.block_oob);
+    }
+    else
+    {
+        // Primary digit width
+        PrimaryTilesT primary_tiles(
+            smem.primary,
+            num_items,
+            d_spine,
+            d_keys_in,
+            d_keys_out,
+            d_values_in,
+            d_values_out,
+            current_bit);
+
+        primary_tiles.ProcessTiles(even_share.block_offset, even_share.block_oob);
+    }
+}
+
+
+#endif // DOXYGEN_SHOULD_SKIP_THIS
+
+
+
+
+
+/******************************************************************************
+ * DeviceRadixSort
+ *****************************************************************************/
+
+/**
+ * \brief DeviceRadixSort provides operations for computing a device-wide, parallel radix sort across data items residing within global memory. ![](sorting_logo.png)
+ * \ingroup DeviceModule
+ *
+ * \par Overview
+ * The [<em>radix sorting method</em>](http://en.wikipedia.org/wiki/Radix_sort) arranges
+ * items into ascending order.  It relies upon a positional representation for
+ * keys, i.e., each key is comprised of an ordered sequence of symbols (e.g., digits,
+ * characters, etc.) specified from least-significant to most-significant.  For a
+ * given input sequence of keys and a set of rules specifying a total ordering
+ * of the symbolic alphabet, the radix sorting method produces a lexicographic
+ * ordering of those keys.
+ *
+ * \par
+ * DeviceRadixSort can sort all of the built-in C++ numeric primitive types, e.g.:
+ * <tt>unsigned char</tt>, \p int, \p double, etc.  Although the direct radix sorting
+ * method can only be applied to unsigned integral types, DeviceRadixSort
+ * is able to sort signed and floating-point types via simple bit-wise transformations
+ * that ensure lexicographic key ordering.
+ *
+ * \par Usage Considerations
+ * \cdp_class{DeviceRadixSort}
+ *
+ * \par Performance
+ *
+ * \image html lsd_sort_perf.png
+ *
+ */
+struct DeviceRadixSort
+{
+    #ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
+
+
+    /******************************************************************************
+     * Constants and typedefs
+     ******************************************************************************/
+
+    /// Run-time copy of the launch-relevant constants that a block policy defines at compile time.
+    struct KernelDispachParams
+    {
+        int                     block_threads;          // Policy's BLOCK_THREADS
+        int                     items_per_thread;       // Policy's ITEMS_PER_THREAD
+        cudaSharedMemConfig     smem_config;            // Shared memory bank configuration recorded for the kernel
+        int                     radix_bits;             // Policy's RADIX_BITS (zero for the scan kernel)
+        int                     alt_radix_bits;         // RADIX_BITS of the policy's AltPolicy (zero for the scan kernel)
+        int                     subscription_factor;    // Caller-supplied subscription factor (zero for the scan kernel)
+        int                     tile_size;              // block_threads * items_per_thread
+
+        /// Populate from an upsweep policy.  The bank configuration is always four-byte.
+        template <typename SortBlockPolicy>
+        __host__ __device__ __forceinline__
+        void InitUpsweepPolicy(int subscription_factor = 1)
+        {
+            this->subscription_factor   = subscription_factor;
+            this->smem_config           = cudaSharedMemBankSizeFourByte;
+
+            this->radix_bits            = SortBlockPolicy::RADIX_BITS;
+            this->alt_radix_bits        = SortBlockPolicy::AltPolicy::RADIX_BITS;
+
+            this->block_threads         = SortBlockPolicy::BLOCK_THREADS;
+            this->items_per_thread      = SortBlockPolicy::ITEMS_PER_THREAD;
+            this->tile_size             = this->block_threads * this->items_per_thread;
+        }
+
+        /// Populate from a scan policy.  Radix widths and subscription factor are zeroed.
+        template <typename ScanBlockPolicy>
+        __host__ __device__ __forceinline__
+        void InitScanPolicy()
+        {
+            this->subscription_factor   = 0;
+            this->smem_config           = cudaSharedMemBankSizeFourByte;
+
+            this->radix_bits            = 0;
+            this->alt_radix_bits        = 0;
+
+            this->block_threads         = ScanBlockPolicy::BLOCK_THREADS;
+            this->items_per_thread      = ScanBlockPolicy::ITEMS_PER_THREAD;
+            this->tile_size             = this->block_threads * this->items_per_thread;
+        }
+
+        /// Populate from a downsweep policy.  The bank configuration is taken from the policy's SMEM_CONFIG.
+        template <typename SortBlockPolicy>
+        __host__ __device__ __forceinline__
+        void InitDownsweepPolicy(int subscription_factor = 1)
+        {
+            this->subscription_factor   = subscription_factor;
+            this->smem_config           = SortBlockPolicy::SMEM_CONFIG;
+
+            this->radix_bits            = SortBlockPolicy::RADIX_BITS;
+            this->alt_radix_bits        = SortBlockPolicy::AltPolicy::RADIX_BITS;
+
+            this->block_threads         = SortBlockPolicy::BLOCK_THREADS;
+            this->items_per_thread      = SortBlockPolicy::ITEMS_PER_THREAD;
+            this->tile_size             = this->block_threads * this->items_per_thread;
+        }
+    };
+
+
+
+    /******************************************************************************
+     * Tuning policies
+     ******************************************************************************/
+
+    /// Tuned policy types for different PTX architectures.  This primary template is
+    /// declared but not defined here; the policies themselves come from partial
+    /// specializations on \p ARCH.
+    template <typename Key, typename Value, typename SizeT, int ARCH>
+    struct TunedPolicies;
+
+    /// SM35 tune
+    template <typename Key, typename Value, typename SizeT>
+    struct TunedPolicies<Key, Value, SizeT, 350>
+    {
+        enum
+        {
+            // Whether Value is NullType (result of Equals<Value, NullType>)
+            KEYS_ONLY       = (Equals<Value, NullType>::VALUE),
+
+            // Larger of sizeof(Key) and sizeof(Value), in 4-byte units rounded up
+            SCALE_FACTOR    = (CUB_MAX(sizeof(Key), sizeof(Value)) + 3) / 4,
+
+            // Digit width handed to the upsweep and downsweep policies
+            RADIX_BITS      = 5
+        };
+
+        // UpsweepPolicy: keys-only and key-value variants, then the one selected by KEYS_ONLY
+        typedef BlockRadixSortUpsweepTilesPolicy<
+                64,
+                CUB_MAX(1, 18 / SCALE_FACTOR),
+                LOAD_LDG,
+                RADIX_BITS>
+            UpsweepPolicyKeys;
+
+        typedef BlockRadixSortUpsweepTilesPolicy<
+                128,
+                CUB_MAX(1, 15 / SCALE_FACTOR),
+                LOAD_LDG,
+                RADIX_BITS>
+            UpsweepPolicyPairs;
+
+        typedef typename If<KEYS_ONLY, UpsweepPolicyKeys, UpsweepPolicyPairs>::Type
+            UpsweepPolicy;
+
+        // Disabled 4bit alternative, kept for reference:
+        //   typedef BlockRadixSortUpsweepTilesPolicy <128, 15, LOAD_LDG, RADIX_BITS> UpsweepPolicyKeys;
+        //   typedef BlockRadixSortUpsweepTilesPolicy <256, 13, LOAD_LDG, RADIX_BITS> UpsweepPolicyPairs;
+
+        // ScanPolicy
+        typedef BlockScanTilesPolicy<
+                1024,
+                4,
+                BLOCK_LOAD_VECTORIZE,
+                false,
+                LOAD_DEFAULT,
+                BLOCK_STORE_VECTORIZE,
+                false,
+                BLOCK_SCAN_RAKING_MEMOIZE>
+            ScanPolicy;
+
+        // DownsweepPolicy: keys-only and key-value variants, then the one selected by KEYS_ONLY
+        typedef BlockRadixSortDownsweepTilesPolicy<
+                64,
+                CUB_MAX(1, 18 / SCALE_FACTOR),
+                BLOCK_LOAD_DIRECT,
+                LOAD_LDG,
+                false,
+                true,
+                BLOCK_SCAN_WARP_SCANS,
+                RADIX_SORT_SCATTER_TWO_PHASE,
+                cudaSharedMemBankSizeEightByte,
+                RADIX_BITS>
+            DownsweepPolicyKeys;
+
+        typedef BlockRadixSortDownsweepTilesPolicy<
+                128,
+                CUB_MAX(1, 15 / SCALE_FACTOR),
+                BLOCK_LOAD_DIRECT,
+                LOAD_LDG,
+                false,
+                true,
+                BLOCK_SCAN_WARP_SCANS,
+                RADIX_SORT_SCATTER_TWO_PHASE,
+                cudaSharedMemBankSizeEightByte,
+                RADIX_BITS>
+            DownsweepPolicyPairs;
+
+        typedef typename If<KEYS_ONLY, DownsweepPolicyKeys, DownsweepPolicyPairs>::Type
+            DownsweepPolicy;
+
+        // Disabled 4bit alternative, kept for reference:
+        //   typedef BlockRadixSortDownsweepTilesPolicy <128, 15, BLOCK_LOAD_DIRECT, LOAD_LDG, false, true, BLOCK_SCAN_WARP_SCANS, RADIX_SORT_SCATTER_TWO_PHASE, cudaSharedMemBankSizeEightByte, RADIX_BITS> DownsweepPolicyKeys;
+        //   typedef BlockRadixSortDownsweepTilesPolicy <256, 13, BLOCK_LOAD_DIRECT, LOAD_LDG, false, true, BLOCK_SCAN_WARP_SCANS, RADIX_SORT_SCATTER_TWO_PHASE, cudaSharedMemBankSizeEightByte, RADIX_BITS> DownsweepPolicyPairs;
+
+        // Subscription factor passed to the upsweep/downsweep dispatch parameters
+        enum { SUBSCRIPTION_FACTOR = 7 };
+    };
+
+
+    /// SM20 tune
+    template <typename Key, typename Value, typename SizeT>
+    struct TunedPolicies<Key, Value, SizeT, 200>
+    {
+        enum
+        {
+            // Whether Value is NullType (result of Equals<Value, NullType>)
+            KEYS_ONLY       = (Equals<Value, NullType>::VALUE),
+
+            // Larger of sizeof(Key) and sizeof(Value), in 4-byte units rounded up
+            SCALE_FACTOR    = (CUB_MAX(sizeof(Key), sizeof(Value)) + 3) / 4,
+
+            // Digit width handed to the upsweep and downsweep policies
+            RADIX_BITS      = 5
+        };
+
+        // UpsweepPolicy: keys-only and key-value variants, then the one selected by KEYS_ONLY
+        typedef BlockRadixSortUpsweepTilesPolicy<
+                64,
+                CUB_MAX(1, 18 / SCALE_FACTOR),
+                LOAD_DEFAULT,
+                RADIX_BITS>
+            UpsweepPolicyKeys;
+
+        typedef BlockRadixSortUpsweepTilesPolicy<
+                128,
+                CUB_MAX(1, 13 / SCALE_FACTOR),
+                LOAD_DEFAULT,
+                RADIX_BITS>
+            UpsweepPolicyPairs;
+
+        typedef typename If<KEYS_ONLY, UpsweepPolicyKeys, UpsweepPolicyPairs>::Type
+            UpsweepPolicy;
+
+        // ScanPolicy
+        typedef BlockScanTilesPolicy<
+                512,
+                4,
+                BLOCK_LOAD_VECTORIZE,
+                false,
+                LOAD_DEFAULT,
+                BLOCK_STORE_VECTORIZE,
+                false,
+                BLOCK_SCAN_RAKING_MEMOIZE>
+            ScanPolicy;
+
+        // DownsweepPolicy: keys-only and key-value variants, then the one selected by KEYS_ONLY
+        typedef BlockRadixSortDownsweepTilesPolicy<
+                64,
+                CUB_MAX(1, 18 / SCALE_FACTOR),
+                BLOCK_LOAD_WARP_TRANSPOSE,
+                LOAD_DEFAULT,
+                false,
+                false,
+                BLOCK_SCAN_WARP_SCANS,
+                RADIX_SORT_SCATTER_TWO_PHASE,
+                cudaSharedMemBankSizeFourByte,
+                RADIX_BITS>
+            DownsweepPolicyKeys;
+
+        typedef BlockRadixSortDownsweepTilesPolicy<
+                128,
+                CUB_MAX(1, 13 / SCALE_FACTOR),
+                BLOCK_LOAD_WARP_TRANSPOSE,
+                LOAD_DEFAULT,
+                false,
+                false,
+                BLOCK_SCAN_WARP_SCANS,
+                RADIX_SORT_SCATTER_TWO_PHASE,
+                cudaSharedMemBankSizeFourByte,
+                RADIX_BITS>
+            DownsweepPolicyPairs;
+
+        typedef typename If<KEYS_ONLY, DownsweepPolicyKeys, DownsweepPolicyPairs>::Type
+            DownsweepPolicy;
+
+        // Subscription factor passed to the upsweep/downsweep dispatch parameters
+        enum { SUBSCRIPTION_FACTOR = 3 };
+    };
+
+
+    /// SM10 tune
+    template <typename Key, typename Value, typename SizeT>
+    struct TunedPolicies<Key, Value, SizeT, 100>
+    {
+        enum
+        {
+            // Digit width handed to the upsweep and downsweep policies
+            RADIX_BITS = 4
+        };
+
+        // UpsweepPolicy (one variant for both keys-only and key-value sorting)
+        typedef BlockRadixSortUpsweepTilesPolicy<
+                64,
+                9,
+                LOAD_DEFAULT,
+                RADIX_BITS>
+            UpsweepPolicy;
+
+        // ScanPolicy
+        typedef BlockScanTilesPolicy<
+                256,
+                4,
+                BLOCK_LOAD_VECTORIZE,
+                false,
+                LOAD_DEFAULT,
+                BLOCK_STORE_VECTORIZE,
+                false,
+                BLOCK_SCAN_RAKING_MEMOIZE>
+            ScanPolicy;
+
+        // DownsweepPolicy (one variant for both keys-only and key-value sorting)
+        typedef BlockRadixSortDownsweepTilesPolicy<
+                64,
+                9,
+                BLOCK_LOAD_WARP_TRANSPOSE,
+                LOAD_DEFAULT,
+                false,
+                false,
+                BLOCK_SCAN_WARP_SCANS,
+                RADIX_SORT_SCATTER_TWO_PHASE,
+                cudaSharedMemBankSizeFourByte,
+                RADIX_BITS>
+            DownsweepPolicy;
+
+        // Subscription factor passed to the upsweep/downsweep dispatch parameters
+        enum { SUBSCRIPTION_FACTOR = 3 };
+    };
+
+
+
+    /******************************************************************************
+     * Default policy initializer
+     ******************************************************************************/
+
+    /// Binds the tuned policy set matching the PTX architecture of the current compiler
+    /// pass, and provides the run-time equivalent for a PTX version queried on the host.
+    template <typename Key, typename Value, typename SizeT>
+    struct PtxDefaultPolicies
+    {
+        // Tuned architecture for this compiler pass: 100 below SM20, 200 below SM35, otherwise 350
+        static const int PTX_TUNE_ARCH =   (CUB_PTX_ARCH < 200) ?
+                                                100 :
+                                                ((CUB_PTX_ARCH < 350) ?
+                                                    200 :
+                                                    350);
+
+        // Tuned policy set for the current PTX compiler pass
+        typedef TunedPolicies<Key, Value, SizeT, PTX_TUNE_ARCH> PtxTunedPolicies;
+
+        // The three kernel policies are wrapped in derived structs so that they opaquely
+        // derive from the specialization corresponding to the current PTX compiler pass
+        struct UpsweepPolicy    : PtxTunedPolicies::UpsweepPolicy {};
+        struct ScanPolicy       : PtxTunedPolicies::ScanPolicy {};
+        struct DownsweepPolicy  : PtxTunedPolicies::DownsweepPolicy {};
+
+        // Subscription factor for the current PTX compiler pass
+        enum { SUBSCRIPTION_FACTOR = PtxTunedPolicies::SUBSCRIPTION_FACTOR };
+
+
+        /**
+         * Fill the three dispatch-parameter records from the tuned policy set selected by
+         * \p ptx_version, using the same 350 / 200 / 100 thresholds as PTX_TUNE_ARCH.
+         */
+        static void InitDispatchParams(
+            int                    ptx_version,
+            KernelDispachParams    &upsweep_dispatch_params,
+            KernelDispachParams    &scan_dispatch_params,
+            KernelDispachParams    &downsweep_dispatch_params)
+        {
+            if (ptx_version < 200)
+            {
+                // SM10 tune
+                typedef TunedPolicies<Key, Value, SizeT, 100> Sm10Tune;
+                upsweep_dispatch_params.InitUpsweepPolicy<typename Sm10Tune::UpsweepPolicy>(Sm10Tune::SUBSCRIPTION_FACTOR);
+                scan_dispatch_params.InitScanPolicy<typename Sm10Tune::ScanPolicy>();
+                downsweep_dispatch_params.InitDownsweepPolicy<typename Sm10Tune::DownsweepPolicy>(Sm10Tune::SUBSCRIPTION_FACTOR);
+                return;
+            }
+
+            if (ptx_version < 350)
+            {
+                // SM20 tune
+                typedef TunedPolicies<Key, Value, SizeT, 200> Sm20Tune;
+                upsweep_dispatch_params.InitUpsweepPolicy<typename Sm20Tune::UpsweepPolicy>(Sm20Tune::SUBSCRIPTION_FACTOR);
+                scan_dispatch_params.InitScanPolicy<typename Sm20Tune::ScanPolicy>();
+                downsweep_dispatch_params.InitDownsweepPolicy<typename Sm20Tune::DownsweepPolicy>(Sm20Tune::SUBSCRIPTION_FACTOR);
+                return;
+            }
+
+            // SM35 tune
+            typedef TunedPolicies<Key, Value, SizeT, 350> Sm35Tune;
+            upsweep_dispatch_params.InitUpsweepPolicy<typename Sm35Tune::UpsweepPolicy>(Sm35Tune::SUBSCRIPTION_FACTOR);
+            scan_dispatch_params.InitScanPolicy<typename Sm35Tune::ScanPolicy>();
+            downsweep_dispatch_params.InitDownsweepPolicy<typename Sm35Tune::DownsweepPolicy>(Sm35Tune::SUBSCRIPTION_FACTOR);
+        }
+    };
+
+
+
+    /******************************************************************************
+     * Utility methods
+     ******************************************************************************/
+
+    /**
+     * Internal dispatch routine for computing a device-wide radix sort.  Every digit place is
+     * handled by three kernel invocations issued into \p stream: an upsweep that writes
+     * per-block digit counts into the spine, a single-block scan of the spine, and a downsweep
+     * that scatters from the current buffers into the alternate buffers.
+     *
+     * When \p d_temp_storage is NULL only \p temp_storage_bytes is produced and no kernel is
+     * launched.  Otherwise the selectors of \p d_keys and \p d_values are flipped once for
+     * every digit pass that was issued without error.
+     *
+     * Returns the first error encountered (cudaSuccess if none).  When compiled without
+     * CUB_RUNTIME_ENABLED, cudaErrorNotSupported is returned unconditionally.
+     */
+    template <
+        typename            UpsweepKernelPtr,                       ///< Function type of cub::RadixSortUpsweepKernel
+        typename            SpineKernelPtr,                         ///< Function type of cub::RadixSortScanKernel
+        typename            DownsweepKernelPtr,                     ///< Function type of cub::RadixSortDownsweepKernel
+        typename            Key,                                    ///< Key type
+        typename            Value,                                  ///< Value type
+        typename            SizeT>                                  ///< Integer type used for global array indexing
+    __host__ __device__ __forceinline__
+    static cudaError_t Dispatch(
+        void                *d_temp_storage,                        ///< [in] %Device allocation of temporary storage.  When NULL, the required allocation size is returned in \p temp_storage_bytes and no work is done.
+        size_t              &temp_storage_bytes,                    ///< [in,out] Size in bytes of \p d_temp_storage allocation.
+        UpsweepKernelPtr    upsweep_kernel,                         ///< [in] Kernel function pointer to parameterization of cub::RadixSortUpsweepKernel
+        SpineKernelPtr      scan_kernel,                            ///< [in] Kernel function pointer to parameterization of cub::RadixSortScanKernel
+        DownsweepKernelPtr  downsweep_kernel,                       ///< [in] Kernel function pointer to parameterization of cub::RadixSortDownsweepKernel
+        KernelDispachParams &upsweep_dispatch_params,               ///< [in] Dispatch parameters that match the policy that \p upsweep_kernel was compiled for
+        KernelDispachParams &scan_dispatch_params,                  ///< [in] Dispatch parameters that match the policy that \p scan_kernel was compiled for
+        KernelDispachParams &downsweep_dispatch_params,             ///< [in] Dispatch parameters that match the policy that \p downsweep_kernel was compiled for
+        DoubleBuffer<Key>   &d_keys,                                ///< [in,out] Double-buffer whose current buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys
+        DoubleBuffer<Value> &d_values,                              ///< [in,out] Double-buffer whose current buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values
+        SizeT               num_items,                              ///< [in] Number of items to sort
+        int                 begin_bit           = 0,                ///< [in] <b>[optional]</b> The beginning (least-significant) bit index needed for key comparison
+        int                 end_bit             = sizeof(Key) * 8,  ///< [in] <b>[optional]</b> The past-the-end (most-significant) bit index needed for key comparison
+        cudaStream_t        stream              = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                stream_synchronous  = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Default is \p false.
+    {
+#ifndef CUB_RUNTIME_ENABLED
+
+        // Kernel launch not supported from this device
+        return CubDebug(cudaErrorNotSupported );
+
+#else
+
+        cudaError error = cudaSuccess;
+        do
+        {
+            // Get device ordinal
+            int device_ordinal;
+            if (CubDebug(error = cudaGetDevice(&device_ordinal))) break;
+
+            // Get SM count
+            int sm_count;
+            if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break;
+
+            // Get a rough estimate of downsweep_kernel SM occupancy based upon the maximum SM occupancy of the targeted PTX architecture
+            int downsweep_sm_occupancy = CUB_MIN(
+                ArchProps<CUB_PTX_ARCH>::MAX_SM_THREADBLOCKS,
+                ArchProps<CUB_PTX_ARCH>::MAX_SM_THREADS / downsweep_dispatch_params.block_threads);
+            int upsweep_sm_occupancy = downsweep_sm_occupancy;
+
+#ifndef __CUDA_ARCH__
+            // We're on the host, so come up with more accurate estimates of SM occupancy from actual device properties
+            Device device_props;
+            if (CubDebug(error = device_props.Init(device_ordinal))) break;
+
+            if (CubDebug(error = device_props.MaxSmOccupancy(
+                downsweep_sm_occupancy,
+                downsweep_kernel,
+                downsweep_dispatch_params.block_threads))) break;
+
+            if (CubDebug(error = device_props.MaxSmOccupancy(
+                upsweep_sm_occupancy,
+                upsweep_kernel,
+                upsweep_dispatch_params.block_threads))) break;
+#endif
+            // Get device occupancies
+            int downsweep_occupancy = downsweep_sm_occupancy * sm_count;
+
+            // Get even-share work distribution descriptor
+            GridEvenShare<SizeT> even_share;
+            int max_downsweep_grid_size = downsweep_occupancy * downsweep_dispatch_params.subscription_factor;
+            int downsweep_grid_size;
+            even_share.GridInit(num_items, max_downsweep_grid_size, downsweep_dispatch_params.tile_size);
+            downsweep_grid_size = even_share.grid_size;
+
+            // Get number of spine elements (round up to nearest spine scan kernel tile size)
+            int bins            = 1 << downsweep_dispatch_params.radix_bits;
+            int spine_size      = downsweep_grid_size * bins;
+            int spine_tiles     = (spine_size + scan_dispatch_params.tile_size - 1) / scan_dispatch_params.tile_size;
+            spine_size          = spine_tiles * scan_dispatch_params.tile_size;
+
+            // Same computation for the alternate (fewer-bits) digit width
+            int alt_bins            = 1 << downsweep_dispatch_params.alt_radix_bits;
+            int alt_spine_size      = downsweep_grid_size * alt_bins;
+            int alt_spine_tiles     = (alt_spine_size + scan_dispatch_params.tile_size - 1) / scan_dispatch_params.tile_size;
+            alt_spine_size          = alt_spine_tiles * scan_dispatch_params.tile_size;
+
+            // Temporary storage allocation requirements.  The one spine buffer serves both the
+            // primary and the alternate passes, so it is sized for whichever needs more room.
+            void* allocations[1];
+            size_t allocation_sizes[1] =
+            {
+                CUB_MAX(spine_size, alt_spine_size) * sizeof(SizeT),    // bytes needed for privatized block digit histograms
+            };
+
+            // Alias temporaries (or set the necessary size of the storage allocation)
+            if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break;
+
+            // Return if the caller is simply requesting the size of the storage allocation
+            if (d_temp_storage == NULL)
+                return cudaSuccess;
+
+            // Privatized per-block digit histograms
+            SizeT *d_spine = (SizeT*) allocations[0];
+
+#ifndef __CUDA_ARCH__
+            // Get current smem bank configuration
+            cudaSharedMemConfig original_smem_config;
+            if (CubDebug(error = cudaDeviceGetSharedMemConfig(&original_smem_config))) break;
+            cudaSharedMemConfig current_smem_config = original_smem_config;
+#endif
+            // Iterate over digit places.  A "break" inside this loop leaves only the loop;
+            // the smem configuration is still restored afterwards.
+            int current_bit = begin_bit;
+            while (current_bit < end_bit)
+            {
+                // Use primary bit granularity if bits remaining is a whole multiple of bit primary granularity
+                int bits_remaining = end_bit - current_bit;
+                bool use_primary_bit_granularity = (bits_remaining % downsweep_dispatch_params.radix_bits == 0);
+                int radix_bits = (use_primary_bit_granularity) ?
+                    downsweep_dispatch_params.radix_bits :
+                    downsweep_dispatch_params.alt_radix_bits;
+
+#ifndef __CUDA_ARCH__
+                // Update smem config if necessary
+                if (current_smem_config != upsweep_dispatch_params.smem_config)
+                {
+                    if (CubDebug(error = cudaDeviceSetSharedMemConfig(upsweep_dispatch_params.smem_config))) break;
+                    current_smem_config = upsweep_dispatch_params.smem_config;
+                }
+#endif
+
+                // Log upsweep_kernel configuration
+                if (stream_synchronous)
+                    CubLog("Invoking upsweep_kernel<<<%d, %d, 0, %lld>>>(), %d smem config, %d items per thread, %d SM occupancy, selector %d, current bit %d, bit_grain %d\n",
+                    downsweep_grid_size, upsweep_dispatch_params.block_threads, (long long) stream, upsweep_dispatch_params.smem_config, upsweep_dispatch_params.items_per_thread, upsweep_sm_occupancy, d_keys.selector, current_bit, radix_bits);
+
+                // Invoke upsweep_kernel with same grid size as downsweep_kernel
+                upsweep_kernel<<<downsweep_grid_size, upsweep_dispatch_params.block_threads, 0, stream>>>(
+                    d_keys.d_buffers[d_keys.selector],
+                    d_spine,
+                    num_items,
+                    current_bit,
+                    use_primary_bit_granularity,
+                    (current_bit == begin_bit),
+                    even_share);
+
+                // Check for failure to launch (a launch reports nothing by itself)
+                if (CubDebug(error = cudaPeekAtLastError())) break;
+
+                // Sync the stream if specified
+                if (stream_synchronous && (CubDebug(error = SyncStream(stream)))) break;
+
+                // Log scan_kernel configuration
+                if (stream_synchronous) CubLog("Invoking scan_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread\n",
+                    1, scan_dispatch_params.block_threads, (long long) stream, scan_dispatch_params.items_per_thread);
+
+                // Invoke scan_kernel
+                scan_kernel<<<1, scan_dispatch_params.block_threads, 0, stream>>>(
+                    d_spine,
+                    (use_primary_bit_granularity) ? spine_size : alt_spine_size);
+
+                // Check for failure to launch
+                if (CubDebug(error = cudaPeekAtLastError())) break;
+
+                // Sync the stream if specified
+                if (stream_synchronous && (CubDebug(error = SyncStream(stream)))) break;
+
+#ifndef __CUDA_ARCH__
+                // Update smem config if necessary
+                if (current_smem_config != downsweep_dispatch_params.smem_config)
+                {
+                    if (CubDebug(error = cudaDeviceSetSharedMemConfig(downsweep_dispatch_params.smem_config))) break;
+                    current_smem_config = downsweep_dispatch_params.smem_config;
+                }
+#endif
+
+                // Log downsweep_kernel configuration
+                if (stream_synchronous) CubLog("Invoking downsweep_kernel<<<%d, %d, 0, %lld>>>(), %d smem config, %d items per thread, %d SM occupancy\n",
+                    downsweep_grid_size, downsweep_dispatch_params.block_threads, (long long) stream, downsweep_dispatch_params.smem_config, downsweep_dispatch_params.items_per_thread, downsweep_sm_occupancy);
+
+                // Invoke downsweep_kernel.  The last-pass flag is derived from the digit
+                // width actually used by this pass, which may be the alternate width.
+                downsweep_kernel<<<downsweep_grid_size, downsweep_dispatch_params.block_threads, 0, stream>>>(
+                    d_keys.d_buffers[d_keys.selector],
+                    d_keys.d_buffers[d_keys.selector ^ 1],
+                    d_values.d_buffers[d_values.selector],
+                    d_values.d_buffers[d_values.selector ^ 1],
+                    d_spine,
+                    num_items,
+                    current_bit,
+                    use_primary_bit_granularity,
+                    (current_bit == begin_bit),
+                    (current_bit + radix_bits >= end_bit),
+                    even_share);
+
+                // Check for failure to launch
+                if (CubDebug(error = cudaPeekAtLastError())) break;
+
+                // Sync the stream if specified
+                if (stream_synchronous && (CubDebug(error = SyncStream(stream)))) break;
+
+                // Invert selectors
+                d_keys.selector ^= 1;
+                d_values.selector ^= 1;
+
+                // Update current bit position
+                current_bit += radix_bits;
+            }
+
+#ifndef __CUDA_ARCH__
+            // Reset smem config if necessary.  The outcome goes into a separate variable so
+            // that a successful reset cannot overwrite an error recorded by the loop above.
+            if (current_smem_config != original_smem_config)
+            {
+                cudaError reset_error = CubDebug(cudaDeviceSetSharedMemConfig(original_smem_config));
+                if (error == cudaSuccess)
+                    error = reset_error;
+            }
+#endif
+
+        }
+        while (0);
+
+        return error;
+
+#endif // CUB_RUNTIME_ENABLED
+    }
+
+
+
+    #endif // DOXYGEN_SHOULD_SKIP_THIS
+
+    /******************************************************************************
+     * Interface
+     ******************************************************************************/
+
+
+    /**
+     * \brief Sorts key-value pairs.
+     *
+     * \par
+     * The sorting operation requires a pair of key buffers and a pair of value
+     * buffers.  Each pair is wrapped in a DoubleBuffer structure whose member
+     * DoubleBuffer::Current() references the active buffer.  The currently-active
+     * buffer may be changed by the sorting operation.
+     *
+     * \devicestorage
+     *
+     * \cdp
+     *
+     * \par
+     * The code snippet below illustrates the sorting of a device vector of \p int keys
+     * with associated vector of \p int values.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>
+     * ...
+     *
+     * // Create a set of DoubleBuffers to wrap pairs of device pointers for
+     * // sorting data (keys, values, and equivalently-sized alternate buffers)
+     * int num_items = ...
+     * cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf);
+     * cub::DoubleBuffer<int> d_values(d_value_buf, d_value_alt_buf);
+     *
+     * // Determine temporary device storage requirements for sorting operation
+     * void *d_temp_storage = NULL;
+     * size_t temp_storage_bytes = 0;
+     * cub::DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items);
+     *
+     * // Allocate temporary storage for sorting operation
+     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+     *
+     * // Run sorting operation
+     * cub::DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items);
+     *
+     * // Sorted keys and values are referenced by d_keys.Current() and d_values.Current()
+     *
+     * \endcode
+     *
+     * \tparam Key      <b>[inferred]</b> Key type
+     * \tparam Value    <b>[inferred]</b> Value type
+     */
+    template <
+        typename            Key,
+        typename            Value>
+    __host__ __device__ __forceinline__
+    static cudaError_t SortPairs(
+        void                *d_temp_storage,                        ///< [in] %Device allocation of temporary storage.  When NULL, the required allocation size is returned in \p temp_storage_bytes and no work is done.
+        size_t              &temp_storage_bytes,                    ///< [in,out] Size in bytes of \p d_temp_storage allocation.
+        DoubleBuffer<Key>   &d_keys,                                ///< [in,out] Double-buffer of keys whose current buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys
+        DoubleBuffer<Value> &d_values,                              ///< [in,out] Double-buffer of values whose current buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values
+        int                 num_items,                              ///< [in] Number of items to sort
+        int                 begin_bit           = 0,                ///< [in] <b>[optional]</b> The first (least-significant) bit index needed for key comparison
+        int                 end_bit             = sizeof(Key) * 8,  ///< [in] <b>[optional]</b> The past-the-end (most-significant) bit index needed for key comparison
+        cudaStream_t        stream              = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                stream_synchronous  = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Default is \p false.
+    {
+        // Integer type used for array indexing
+        typedef int SizeT;
+
+        // Tuning policies
+        typedef PtxDefaultPolicies<Key, Value, SizeT>       DefaultPolicies;    // Wrapper of default kernel policies
+        typedef typename DefaultPolicies::UpsweepPolicy     UpsweepPolicy;      // Upsweep kernel policy
+        typedef typename DefaultPolicies::ScanPolicy        ScanPolicy;         // Scan kernel policy
+        typedef typename DefaultPolicies::DownsweepPolicy   DownsweepPolicy;    // Downsweep kernel policy
+
+        // One set of dispatch parameters per kernel
+        KernelDispachParams upsweep_dispatch_params;
+        KernelDispachParams scan_dispatch_params;
+        KernelDispachParams downsweep_dispatch_params;
+
+        cudaError_t status = cudaSuccess;
+
+#ifdef __CUDA_ARCH__
+        // Device-side call: the policies of the current compiler pass are the ones that will run,
+        // so the dispatch parameters are taken from them directly
+        upsweep_dispatch_params.InitUpsweepPolicy<UpsweepPolicy>(DefaultPolicies::SUBSCRIPTION_FACTOR);
+        scan_dispatch_params.InitScanPolicy<ScanPolicy>();
+        downsweep_dispatch_params.InitDownsweepPolicy<DownsweepPolicy>(DefaultPolicies::SUBSCRIPTION_FACTOR);
+#else
+        // Host-side call: query the PTX version and initialize the dispatch parameters
+        // with the policies that match it.  A failed query ends the call immediately.
+        int ptx_version;
+        if (CubDebug(status = PtxVersion(ptx_version))) return status;
+
+        DefaultPolicies::InitDispatchParams(
+            ptx_version,
+            upsweep_dispatch_params,
+            scan_dispatch_params,
+            downsweep_dispatch_params);
+#endif
+
+        // Hand everything to the internal dispatch routine; its result (success or failure)
+        // is what the caller receives
+        (void) CubDebug(status = Dispatch(
+            d_temp_storage,
+            temp_storage_bytes,
+            RadixSortUpsweepKernel<UpsweepPolicy, Key, SizeT>,
+            RadixSortScanKernel<ScanPolicy, SizeT>,
+            RadixSortDownsweepKernel<DownsweepPolicy, Key, Value, SizeT>,
+            upsweep_dispatch_params,
+            scan_dispatch_params,
+            downsweep_dispatch_params,
+            d_keys,
+            d_values,
+            num_items,
+            begin_bit,
+            end_bit,
+            stream,
+            stream_synchronous));
+
+        return status;
+    }
+
+
+    /**
+     * \brief Sorts keys
+     *
+     * \par
+     * The sorting operation requires a pair of key buffers.  The pair is
+     * wrapped in a DoubleBuffer structure whose member DoubleBuffer::Current()
+     * references the active buffer.  The currently-active buffer may be changed
+     * by the sorting operation.
+     *
+     * \devicestorage
+     *
+     * \cdp
+     *
+     * \par
+     * The code snippet below illustrates the sorting of a device vector of \p int keys.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>
+     * ...
+     *
+     * // Create a set of DoubleBuffers to wrap pairs of device pointers for
+     * // sorting data (keys and equivalently-sized alternate buffer)
+     * int num_items = ...
+     * cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf);
+     *
+     * // Determine temporary device storage requirements for sorting operation
+     * void *d_temp_storage = NULL;
+     * size_t temp_storage_bytes = 0;
+     * cub::DeviceRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, d_keys, num_items);
+     *
+     * // Allocate temporary storage for sorting operation
+     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+     *
+     * // Run sorting operation
+     * cub::DeviceRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, d_keys, num_items);
+     *
+     * // Sorted keys are referenced by d_keys.Current()
+     *
+     * \endcode
+     *
+     * \tparam Key      <b>[inferred]</b> Key type
+     */
+    template <typename Key>
+    __host__ __device__ __forceinline__
+    static cudaError_t SortKeys(
+        void                *d_temp_storage,                        ///< [in] %Device allocation of temporary storage.  When NULL, the required allocation size is returned in \p temp_storage_bytes and no work is done.
+        size_t              &temp_storage_bytes,                    ///< [in,out] Size in bytes of \p d_temp_storage allocation.
+        DoubleBuffer<Key>   &d_keys,                                ///< [in,out] Double-buffer of keys whose current buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys
+        int                 num_items,                              ///< [in] Number of items to sort
+        int                 begin_bit           = 0,                ///< [in] <b>[optional]</b> The first (least-significant) bit index needed for key comparison
+        int                 end_bit             = sizeof(Key) * 8,  ///< [in] <b>[optional]</b> The past-the-end (most-significant) bit index needed for key comparison
+        cudaStream_t        stream              = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                stream_synchronous  = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Default is \p false.
+    {
+        // A keys-only sort is forwarded to SortPairs with a default-constructed
+        // double-buffer of NullType standing in for the values
+        DoubleBuffer<NullType> null_values;
+
+        return SortPairs(
+            d_temp_storage,
+            temp_storage_bytes,
+            d_keys,
+            null_values,
+            num_items,
+            begin_bit,
+            end_bit,
+            stream,
+            stream_synchronous);
+    }
+
+};
+
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
+
--- a/lib/kokkos/TPL/cub/device/device_reduce.cuh
+++ b/lib/kokkos/TPL/cub/device/device_reduce.cuh
@ -0,0 +1,775 @@
+
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2013, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::DeviceReduce provides operations for computing a device-wide, parallel reduction across data items residing within global memory.
+ */
+
+#pragma once
+
+#include <stdio.h>
+#include <iterator>
+
+#include "block/block_reduce_tiles.cuh"
+#include "../thread/thread_operators.cuh"
+#include "../grid/grid_even_share.cuh"
+#include "../grid/grid_queue.cuh"
+#include "../util_debug.cuh"
+#include "../util_device.cuh"
+#include "../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
+
+
+
+
+
+
+/******************************************************************************
+ * Kernel entry points
+ *****************************************************************************/
+
+/**
+ * Reduction pass kernel entry point (multi-block).  Every thread block computes its own
+ * (privatized) aggregate, and thread 0 of the block stores it at <tt>d_out[blockIdx.x]</tt>.
+ */
+template <
+    typename                BlockReduceTilesPolicy, ///< Tuning policy for cub::BlockReduceTiles abstraction
+    typename                InputIteratorRA,        ///< Random-access iterator type for input (may be a simple pointer type)
+    typename                OutputIteratorRA,       ///< Random-access iterator type for output (may be a simple pointer type)
+    typename                SizeT,                  ///< Integer type used for global array indexing
+    typename                ReductionOp>            ///< Binary reduction operator type having member <tt>T operator()(const T &a, const T &b)</tt>
+__launch_bounds__ (int(BlockReduceTilesPolicy::BLOCK_THREADS), 1)
+__global__ void ReducePrivatizedKernel(
+    InputIteratorRA         d_in,                   ///< [in] Input data to reduce
+    OutputIteratorRA        d_out,                  ///< [out] Output location for the per-block results (indexed by \p blockIdx.x)
+    SizeT                   num_items,              ///< [in] Total number of input data items
+    GridEvenShare<SizeT>    even_share,             ///< [in] Descriptor for how to map an even-share of tiles across thread blocks
+    GridQueue<SizeT>        queue,                  ///< [in] Descriptor for performing dynamic mapping of tile data to thread blocks
+    ReductionOp             reduction_op)           ///< [in] Binary reduction operator
+{
+    // Element type yielded by the input iterator
+    typedef typename std::iterator_traits<InputIteratorRA>::value_type ValueT;
+
+    // Tile-reducing thread block abstraction, parameterized by this kernel's policy
+    typedef BlockReduceTiles<BlockReduceTilesPolicy, InputIteratorRA, SizeT, ReductionOp> TileReducerT;
+
+    // Shared memory scratch space required by the tile reducer
+    __shared__ typename TileReducerT::TempStorage temp_storage;
+
+    // Receives the block-wide aggregate
+    ValueT block_aggregate;
+
+    // Consume input tiles; the policy's GRID_MAPPING is passed as a type tag to select the ConsumeTiles overload
+    TileReducerT(temp_storage, d_in, reduction_op).ConsumeTiles(
+        num_items,
+        even_share,
+        queue,
+        block_aggregate,
+        Int2Type<BlockReduceTilesPolicy::GRID_MAPPING>());
+
+    // Only thread 0 writes the block's result
+    if (threadIdx.x == 0)
+        d_out[blockIdx.x] = block_aggregate;
+}
+
+
+/**
+ * Reduction pass kernel entry point (single-block).  Aggregates privatized threadblock reductions
+ * from a previous multi-block reduction pass; thread 0 stores the result at <tt>d_out[blockIdx.x]</tt>.
+ */
+template <
+    typename                BlockReduceTilesPolicy,  ///< Tuning policy for cub::BlockReduceTiles abstraction
+    typename                InputIteratorRA,        ///< Random-access iterator type for input (may be a simple pointer type)
+    typename                OutputIteratorRA,       ///< Random-access iterator type for output (may be a simple pointer type)
+    typename                SizeT,                  ///< Integer type used for global array indexing
+    typename                ReductionOp>            ///< Binary reduction operator type having member <tt>T operator()(const T &a, const T &b)</tt>
+__launch_bounds__ (int(BlockReduceTilesPolicy::BLOCK_THREADS), 1)
+__global__ void ReduceSingleKernel(
+    InputIteratorRA         d_in,                   ///< [in] Input data to reduce
+    OutputIteratorRA        d_out,                  ///< [out] Output location for result
+    SizeT                   num_items,              ///< [in] Total number of input data items
+    ReductionOp             reduction_op)           ///< [in] Binary reduction operator
+{
+    // Element type yielded by the input iterator
+    typedef typename std::iterator_traits<InputIteratorRA>::value_type ValueT;
+
+    // Tile-reducing thread block abstraction, parameterized by this kernel's policy
+    typedef BlockReduceTiles<BlockReduceTilesPolicy, InputIteratorRA, SizeT, ReductionOp> TileReducerT;
+
+    // Shared memory scratch space required by the tile reducer
+    __shared__ typename TileReducerT::TempStorage temp_storage;
+
+    // Receives the block-wide aggregate
+    ValueT block_aggregate;
+
+    // Consume input tiles, passing offset 0 and num_items as the bounds
+    TileReducerT(temp_storage, d_in, reduction_op).ConsumeTiles(
+        SizeT(0),
+        SizeT(num_items),
+        block_aggregate);
+
+    // Only thread 0 writes the result
+    if (threadIdx.x == 0)
+        d_out[blockIdx.x] = block_aggregate;
+}
+
+#endif // DOXYGEN_SHOULD_SKIP_THIS
+
+
+/******************************************************************************
+ * DeviceReduce
+ *****************************************************************************/
+
+/**
+ * \brief DeviceReduce provides operations for computing a device-wide, parallel reduction across data items residing within global memory. ![](reduce_logo.png)
+ * \ingroup DeviceModule
+ *
+ * \par Overview
+ * A <a href="http://en.wikipedia.org/wiki/Reduce_(higher-order_function)"><em>reduction</em></a> (or <em>fold</em>)
+ * uses a binary combining operator to compute a single aggregate from a list of input elements.
+ *
+ * \par Usage Considerations
+ * \cdp_class{DeviceReduce}
+ *
+ * \par Performance
+ *
+ * \image html reduction_perf.png
+ *
+ */
+struct DeviceReduce
+{
+#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
+
+
+    /******************************************************************************
+     * Constants and typedefs
+     ******************************************************************************/
+
+    /// Run-time record of the dispatch properties that a block policy encodes at compile time.
+    struct KernelDispachParams
+    {
+        int                     block_threads;          // BlockPolicy::BLOCK_THREADS
+        int                     items_per_thread;       // BlockPolicy::ITEMS_PER_THREAD
+        int                     vector_load_length;     // BlockPolicy::VECTOR_LOAD_LENGTH
+        BlockReduceAlgorithm    block_algorithm;        // BlockPolicy::BLOCK_ALGORITHM
+        PtxLoadModifier         load_modifier;          // BlockPolicy::LOAD_MODIFIER
+        GridMappingStrategy     grid_mapping;           // BlockPolicy::GRID_MAPPING
+        int                     subscription_factor;    // Value passed to Init() (1 unless specified)
+        int                     tile_size;              // block_threads * items_per_thread
+
+        /// Copies the constants of \p BlockPolicy into this record and derives \p tile_size from them.
+        template <typename BlockPolicy>
+        __host__ __device__ __forceinline__
+        void Init(int subscription_factor = 1)
+        {
+            // The parameter shadows the member of the same name, so the member is reached through this->
+            this->subscription_factor   = subscription_factor;
+
+            this->block_threads         = BlockPolicy::BLOCK_THREADS;
+            this->items_per_thread      = BlockPolicy::ITEMS_PER_THREAD;
+            this->vector_load_length    = BlockPolicy::VECTOR_LOAD_LENGTH;
+            this->block_algorithm       = BlockPolicy::BLOCK_ALGORITHM;
+            this->load_modifier         = BlockPolicy::LOAD_MODIFIER;
+            this->grid_mapping          = BlockPolicy::GRID_MAPPING;
+
+            // Number of items one thread block consumes per tile
+            this->tile_size             = this->block_threads * this->items_per_thread;
+        }
+
+        /// Prints every field except \p tile_size on one line (no trailing newline is emitted).
+        __host__ __device__ __forceinline__
+        void Print()
+        {
+            printf(
+                "%d threads, %d per thread, %d veclen, %d algo, %d loadmod, %d mapping, %d subscription",
+                block_threads, items_per_thread, vector_load_length,
+                block_algorithm, load_modifier, grid_mapping,
+                subscription_factor);
+        }
+
+    };
+
+
+    /******************************************************************************
+     * Tuning policies
+     ******************************************************************************/
+
+    /// Specializations of tuned policy types for different PTX architectures.
+    ///
+    /// The primary template is declared but never defined, so only the ARCH values that have a
+    /// specialization below (350, 300, 200, 130, 100) can be used.  Each specialization exposes:
+    ///   - PrivatizedPolicy:     policy used to initialize the privatized (multi-block) dispatch parameters
+    ///   - SinglePolicy:         policy used to initialize the single-block dispatch parameters
+    ///   - SUBSCRIPTION_FACTOR:  passed to KernelDispachParams::Init() for the privatized policy
+    ///
+    /// NOTE(review): the BlockReduceTilesPolicy arguments presumably follow the order
+    /// <BLOCK_THREADS, ITEMS_PER_THREAD, VECTOR_LOAD_LENGTH, BLOCK_ALGORITHM, LOAD_MODIFIER, GRID_MAPPING>
+    /// -- confirm against block_reduce_tiles.cuh.
+    template <
+        typename    T,
+        typename    SizeT,
+        int         ARCH>
+    struct TunedPolicies;
+
+    /// SM35 tune
+    template <typename T, typename SizeT>
+    struct TunedPolicies<T, SizeT, 350>
+    {
+        // PrivatizedPolicy (1B): GTX Titan: 206.0 GB/s @ 192M 1B items
+        typedef BlockReduceTilesPolicy<128, 12,  1, BLOCK_REDUCE_RAKING, LOAD_LDG, GRID_MAPPING_DYNAMIC>                PrivatizedPolicy1B;
+
+        // PrivatizedPolicy (4B): GTX Titan: 254.2 GB/s @ 48M 4B items
+        typedef BlockReduceTilesPolicy<512, 20,  1, BLOCK_REDUCE_RAKING, LOAD_DEFAULT, GRID_MAPPING_EVEN_SHARE>         PrivatizedPolicy4B;
+
+        // PrivatizedPolicy: the 1B tuning is chosen for element types smaller than 4 bytes, the 4B tuning otherwise
+        typedef typename If<(sizeof(T) < 4),
+            PrivatizedPolicy1B,
+            PrivatizedPolicy4B>::Type PrivatizedPolicy;
+
+        // SinglePolicy
+        typedef BlockReduceTilesPolicy<256, 8, 1, BLOCK_REDUCE_WARP_REDUCTIONS, LOAD_DEFAULT, GRID_MAPPING_EVEN_SHARE>  SinglePolicy;
+
+        // Subscription factor handed to the privatized dispatch parameters
+        enum { SUBSCRIPTION_FACTOR = 7 };
+
+    };
+
+    /// SM30 tune
+    template <typename T, typename SizeT>
+    struct TunedPolicies<T, SizeT, 300>
+    {
+        // PrivatizedPolicy: GTX670: 154.0 @ 48M 32-bit T
+        // (a single tuning regardless of sizeof(T), unlike the SM35 and SM20 tunes)
+        typedef BlockReduceTilesPolicy<256, 2,  1, BLOCK_REDUCE_WARP_REDUCTIONS,  LOAD_DEFAULT, GRID_MAPPING_EVEN_SHARE>    PrivatizedPolicy;
+
+        // SinglePolicy
+        typedef BlockReduceTilesPolicy<256, 24, 4, BLOCK_REDUCE_WARP_REDUCTIONS,  LOAD_DEFAULT, GRID_MAPPING_EVEN_SHARE>    SinglePolicy;
+
+        // Subscription factor handed to the privatized dispatch parameters
+        enum { SUBSCRIPTION_FACTOR = 1 };
+    };
+
+    /// SM20 tune
+    template <typename T, typename SizeT>
+    struct TunedPolicies<T, SizeT, 200>
+    {
+        // PrivatizedPolicy (1B): GTX 580: 158.1 GB/s @ 192M 1B items
+        typedef BlockReduceTilesPolicy<192, 24,  4, BLOCK_REDUCE_RAKING, LOAD_DEFAULT, GRID_MAPPING_EVEN_SHARE>            PrivatizedPolicy1B;
+
+        // PrivatizedPolicy (4B): GTX 580: 178.9 GB/s @ 48M 4B items
+        typedef BlockReduceTilesPolicy<128, 8,  4, BLOCK_REDUCE_RAKING, LOAD_DEFAULT, GRID_MAPPING_DYNAMIC>                PrivatizedPolicy4B;
+
+        // PrivatizedPolicy: the 1B tuning is chosen for element types smaller than 4 bytes, the 4B tuning otherwise
+        typedef typename If<(sizeof(T) < 4),
+            PrivatizedPolicy1B,
+            PrivatizedPolicy4B>::Type PrivatizedPolicy;
+
+        // SinglePolicy
+        typedef BlockReduceTilesPolicy<192, 7,  1, BLOCK_REDUCE_RAKING, LOAD_DEFAULT, GRID_MAPPING_EVEN_SHARE>             SinglePolicy;
+
+        // Subscription factor handed to the privatized dispatch parameters
+        enum { SUBSCRIPTION_FACTOR = 2 };
+    };
+
+    /// SM13 tune
+    template <typename T, typename SizeT>
+    struct TunedPolicies<T, SizeT, 130>
+    {
+        // PrivatizedPolicy
+        typedef BlockReduceTilesPolicy<128, 8,  2,  BLOCK_REDUCE_RAKING, LOAD_DEFAULT, GRID_MAPPING_EVEN_SHARE>            PrivatizedPolicy;
+
+        // SinglePolicy
+        typedef BlockReduceTilesPolicy<32,  4,  4,  BLOCK_REDUCE_RAKING, LOAD_DEFAULT, GRID_MAPPING_EVEN_SHARE>            SinglePolicy;
+
+        // Subscription factor handed to the privatized dispatch parameters
+        enum { SUBSCRIPTION_FACTOR = 1 };
+    };
+
+    /// SM10 tune (currently the same values as the SM13 tune)
+    template <typename T, typename SizeT>
+    struct TunedPolicies<T, SizeT, 100>
+    {
+        // PrivatizedPolicy
+        typedef BlockReduceTilesPolicy<128, 8,  2,  BLOCK_REDUCE_RAKING, LOAD_DEFAULT, GRID_MAPPING_EVEN_SHARE>            PrivatizedPolicy;
+
+        // SinglePolicy
+        typedef BlockReduceTilesPolicy<32,  4,  4,  BLOCK_REDUCE_RAKING, LOAD_DEFAULT, GRID_MAPPING_EVEN_SHARE>            SinglePolicy;
+
+        // Subscription factor handed to the privatized dispatch parameters
+        enum { SUBSCRIPTION_FACTOR = 1 };
+    };
+
+
+
+    /******************************************************************************
+     * Default policy initializer
+     ******************************************************************************/
+
+    /// Tuning policy for the PTX architecture that DeviceReduce operations will get dispatched to
+    template <typename T, typename SizeT>
+    struct PtxDefaultPolicies
+    {
+        // CUB_PTX_ARCH snapped down to the nearest architecture that has a TunedPolicies specialization
+        static const int PTX_TUNE_ARCH =
+            (CUB_PTX_ARCH >= 350) ? 350 :
+            (CUB_PTX_ARCH >= 300) ? 300 :
+            (CUB_PTX_ARCH >= 200) ? 200 :
+            (CUB_PTX_ARCH >= 130) ? 130 :
+                                    100;
+
+        // Tuned policy set for the current PTX compiler pass
+        typedef TunedPolicies<T, SizeT, PTX_TUNE_ARCH> PtxTunedPolicies;
+
+        // Subscription factor for the current PTX compiler pass
+        static const int SUBSCRIPTION_FACTOR = PtxTunedPolicies::SUBSCRIPTION_FACTOR;
+
+        // PrivatizedPolicy that opaquely derives from the specialization corresponding to the current PTX compiler pass
+        struct PrivatizedPolicy : PtxTunedPolicies::PrivatizedPolicy {};
+
+        // SinglePolicy that opaquely derives from the specialization corresponding to the current PTX compiler pass
+        struct SinglePolicy : PtxTunedPolicies::SinglePolicy {};
+
+
+        /**
+         * Initialize both sets of dispatch params from the TunedPolicies specialization for \p ARCH
+         */
+        template <int ARCH>
+        static void InitDispatchParamsForArch(
+            KernelDispachParams    &privatized_dispatch_params,
+            KernelDispachParams    &single_dispatch_params)
+        {
+            typedef TunedPolicies<T, SizeT, ARCH> ArchPolicies;
+
+            // Only the privatized parameters receive the tuned subscription factor;
+            // the single-block parameters keep Init()'s default
+            privatized_dispatch_params.Init<typename ArchPolicies::PrivatizedPolicy>(ArchPolicies::SUBSCRIPTION_FACTOR);
+            single_dispatch_params.Init<typename ArchPolicies::SinglePolicy >();
+        }
+
+
+        /**
+         * Initialize dispatch params with the policies corresponding to the PTX assembly we will use
+         */
+        static void InitDispatchParams(
+            int                    ptx_version,
+            KernelDispachParams    &privatized_dispatch_params,
+            KernelDispachParams    &single_dispatch_params)
+        {
+            // Pick the highest tuned architecture that does not exceed ptx_version
+            if (ptx_version >= 350)
+                InitDispatchParamsForArch<350>(privatized_dispatch_params, single_dispatch_params);
+            else if (ptx_version >= 300)
+                InitDispatchParamsForArch<300>(privatized_dispatch_params, single_dispatch_params);
+            else if (ptx_version >= 200)
+                InitDispatchParamsForArch<200>(privatized_dispatch_params, single_dispatch_params);
+            else if (ptx_version >= 130)
+                InitDispatchParamsForArch<130>(privatized_dispatch_params, single_dispatch_params);
+            else
+                InitDispatchParamsForArch<100>(privatized_dispatch_params, single_dispatch_params);
+        }
+    };
+
+
+
+    /******************************************************************************
+     * Utility methods
+     ******************************************************************************/
+
+    /**
+     * Internal dispatch routine for computing a device-wide reduction using two stages of kernel invocations.
+     */
+    template <
+        typename                    ReducePrivatizedKernelPtr,          ///< Function type of cub::ReducePrivatizedKernel
+        typename                    ReduceSingleKernelPtr,              ///< Function type of cub::ReduceSingleKernel
+        typename                    ResetDrainKernelPtr,                ///< Function type of cub::ResetDrainKernel
+        typename                    InputIteratorRA,                    ///< Random-access iterator type for input (may be a simple pointer type)
+        typename                    OutputIteratorRA,                   ///< Random-access iterator type for output (may be a simple pointer type)
+        typename                    SizeT,                              ///< Integer type used for global array indexing
+        typename                    ReductionOp>                        ///< Binary reduction operator type having member <tt>T operator()(const T &a, const T &b)</tt>
+    __host__ __device__ __forceinline__
+    static cudaError_t Dispatch(
+        void                        *d_temp_storage,                    ///< [in] %Device allocation of temporary storage.  When NULL, the required allocation size is returned in \p temp_storage_bytes and no work is done.
+        size_t                      &temp_storage_bytes,                ///< [in,out] Size in bytes of \p d_temp_storage allocation.
+        ReducePrivatizedKernelPtr   privatized_kernel,                  ///< [in] Kernel function pointer to parameterization of cub::ReducePrivatizedKernel
+        ReduceSingleKernelPtr       single_kernel,                      ///< [in] Kernel function pointer to parameterization of cub::ReduceSingleKernel
+        ResetDrainKernelPtr         prepare_drain_kernel,               ///< [in] Kernel function pointer to parameterization of cub::ResetDrainKernel
+        KernelDispachParams         &privatized_dispatch_params,        ///< [in] Dispatch parameters that match the policy that \p privatized_kernel_ptr was compiled for
+        KernelDispachParams         &single_dispatch_params,            ///< [in] Dispatch parameters that match the policy that \p single_kernel was compiled for
+        InputIteratorRA             d_in,                               ///< [in] Input data to reduce
+        OutputIteratorRA            d_out,                              ///< [out] Output location for result
+        SizeT                       num_items,                          ///< [in] Number of items to reduce
+        ReductionOp                 reduction_op,                       ///< [in] Binary reduction operator
+        cudaStream_t                stream              = 0,            ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                        stream_synchronous  = false)        ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Default is \p false.
+    {
+#ifndef CUB_RUNTIME_ENABLED
+
+        // Kernel launch not supported from this device
+        return CubDebug(cudaErrorNotSupported );
+
+#else
+
+        // Data type of input iterator
+        typedef typename std::iterator_traits<InputIteratorRA>::value_type T;
+
+        cudaError error = cudaSuccess;
+        do
+        {
+            if ((privatized_kernel == NULL) || (num_items <= (single_dispatch_params.tile_size)))
+            {
+                // Dispatch a single-block reduction kernel
+
+                // Return if the caller is simply requesting the size of the storage allocation
+                if (d_temp_storage == NULL)
+                {
+                    temp_storage_bytes = 1;
+                    return cudaSuccess;
+                }
+
+                // Log single_kernel configuration
+                if (stream_synchronous) CubLog("Invoking ReduceSingle<<<1, %d, 0, %lld>>>(), %d items per thread\n",
+                    single_dispatch_params.block_threads, (long long) stream, single_dispatch_params.items_per_thread);
+
+                // Invoke single_kernel
+                single_kernel<<<1, single_dispatch_params.block_threads>>>(
+                    d_in,
+                    d_out,
+                    num_items,
+                    reduction_op);
+
+                // Sync the stream if specified
+                if (stream_synchronous && (CubDebug(error = SyncStream(stream)))) break;
+
+            }
+            else
+            {
+                // Dispatch two kernels: a multi-block kernel to compute
+                // privatized per-block reductions, and then a single-block
+                // to reduce those
+
+                // Get device ordinal
+                int device_ordinal;
+                if (CubDebug(error = cudaGetDevice(&device_ordinal))) break;
+
+                // Get SM count
+                int sm_count;
+                if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break;
+
+                // Get a rough estimate of privatized_kernel SM occupancy based upon the maximum SM occupancy of the targeted PTX architecture
+                int privatized_sm_occupancy = CUB_MIN(
+                    ArchProps<CUB_PTX_ARCH>::MAX_SM_THREADBLOCKS,
+                    ArchProps<CUB_PTX_ARCH>::MAX_SM_THREADS / privatized_dispatch_params.block_threads);
+
+#ifndef __CUDA_ARCH__
+                // We're on the host, so come up with a more accurate estimate of privatized_kernel SM occupancy from actual device properties
+                Device device_props;
+                if (CubDebug(error = device_props.Init(device_ordinal))) break;
+
+                if (CubDebug(error = device_props.MaxSmOccupancy(
+                    privatized_sm_occupancy,
+                    privatized_kernel,
+                    privatized_dispatch_params.block_threads))) break;
+#endif
+
+                // Get device occupancy for privatized_kernel
+                int privatized_occupancy = privatized_sm_occupancy * sm_count;
+
+                // Even-share work distribution
+                GridEvenShare<SizeT> even_share;
+
+                // Get grid size for privatized_kernel
+                int privatized_grid_size;
+                switch (privatized_dispatch_params.grid_mapping)
+                {
+                case GRID_MAPPING_EVEN_SHARE:
+
+                    // Work is distributed evenly
+                    even_share.GridInit(
+                        num_items,
+                        privatized_occupancy * privatized_dispatch_params.subscription_factor,
+                        privatized_dispatch_params.tile_size);
+                    privatized_grid_size = even_share.grid_size;
+                    break;
+
+                case GRID_MAPPING_DYNAMIC:
+
+                    // Work is distributed dynamically
+                    int num_tiles = (num_items + privatized_dispatch_params.tile_size - 1) / privatized_dispatch_params.tile_size;
+                    privatized_grid_size   = (num_tiles < privatized_occupancy) ?
+                        num_tiles :                 // Not enough to fill the device with threadblocks
+                        privatized_occupancy;      // Fill the device with threadblocks
+                    break;
+                };
+
+                // Temporary storage allocation requirements
+                void* allocations[2];
+                size_t allocation_sizes[2] =
+                {
+                    privatized_grid_size * sizeof(T),      // bytes needed for privatized block reductions
+                    GridQueue<int>::AllocationSize()        // bytes needed for grid queue descriptor
+                };
+
+                // Alias temporaries (or set the necessary size of the storage allocation)
+                if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break;
+
+                // Return if the caller is simply requesting the size of the storage allocation
+                if (d_temp_storage == NULL)
+                    return cudaSuccess;
+
+                // Privatized per-block reductions
+                T *d_block_reductions = (T*) allocations[0];
+
+                // Grid queue descriptor
+                GridQueue<SizeT> queue(allocations[1]);
+
+                // Prepare the dynamic queue descriptor if necessary
+                if (privatized_dispatch_params.grid_mapping == GRID_MAPPING_DYNAMIC)
+                {
+                    // Prepare queue using a kernel so we know it gets prepared once per operation
+                    if (stream_synchronous) CubLog("Invoking prepare_drain_kernel<<<1, 1, 0, %lld>>>()\n", (long long) stream);
+
+                    // Invoke prepare_drain_kernel
+                    prepare_drain_kernel<<<1, 1, 0, stream>>>(queue, num_items);
+
+                    // Sync the stream if specified
+                    if (stream_synchronous && (CubDebug(error = SyncStream(stream)))) break;
+                }
+
+                // Log privatized_kernel configuration
+                if (stream_synchronous) CubLog("Invoking privatized_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n",
+                    privatized_grid_size, privatized_dispatch_params.block_threads, (long long) stream, privatized_dispatch_params.items_per_thread, privatized_sm_occupancy);
+
+                // Invoke privatized_kernel
+                privatized_kernel<<<privatized_grid_size, privatized_dispatch_params.block_threads, 0, stream>>>(
+                    d_in,
+                    d_block_reductions,
+                    num_items,
+                    even_share,
+                    queue,
+                    reduction_op);
+
+                // Sync the stream if specified
+                if (stream_synchronous && (CubDebug(error = SyncStream(stream)))) break;
+
+                // Log single_kernel configuration
+                if (stream_synchronous) CubLog("Invoking single_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread\n",
+                    1, single_dispatch_params.block_threads, (long long) stream, single_dispatch_params.items_per_thread);
+
+                // Invoke single_kernel
+                single_kernel<<<1, single_dispatch_params.block_threads, 0, stream>>>(
+                    d_block_reductions,
+                    d_out,
+                    privatized_grid_size,
+                    reduction_op);
+
+                // Sync the stream if specified
+                if (stream_synchronous && (CubDebug(error = SyncStream(stream)))) break;
+            }
+        }
+        while (0);
+
+        return error;
+
+#endif // CUB_RUNTIME_ENABLED
+    }
+
+
+
+#endif // DOXYGEN_SHOULD_SKIP_THIS
+
+    /******************************************************************************
+     * Interface
+     ******************************************************************************/
+
+    /**
+     * \brief Reduces a sequence of items to a single value using the caller-supplied binary \p reduction_op functor.
+     *
+     * \par
+     * Does not support non-commutative reduction operators.
+     *
+     * \devicestorage
+     *
+     * \cdp
+     *
+     * \iterator
+     *
+     * \par
+     * The code snippet below illustrates the max reduction of a device vector of \p int items.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>
+     * ...
+     *
+     * // Declare and initialize device pointers for input and output
+     * int *d_reduce_input, *d_aggregate;
+     * int num_items = ...
+     * ...
+     *
+     * // Determine temporary device storage requirements for reduction
+     * void *d_temp_storage = NULL;
+     * size_t temp_storage_bytes = 0;
+     * cub::DeviceReduce::Reduce(d_temp_storage, temp_storage_bytes, d_reduce_input, d_aggregate, num_items, cub::Max());
+     *
+     * // Allocate temporary storage for reduction
+     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+     *
+     * // Run reduction (max)
+     * cub::DeviceReduce::Reduce(d_temp_storage, temp_storage_bytes, d_reduce_input, d_aggregate, num_items, cub::Max());
+     *
+     * \endcode
+     *
+     * \tparam InputIteratorRA      <b>[inferred]</b> Random-access iterator type for input (may be a simple pointer type)
+     * \tparam OutputIteratorRA     <b>[inferred]</b> Random-access iterator type for output (may be a simple pointer type)
+     * \tparam ReductionOp          <b>[inferred]</b> Binary reduction operator type having member <tt>T operator()(const T &a, const T &b)</tt>
+     */
+    template <
+        typename                    InputIteratorRA,
+        typename                    OutputIteratorRA,
+        typename                    ReductionOp>
+    __host__ __device__ __forceinline__
+    static cudaError_t Reduce(
+        void                        *d_temp_storage,                    ///< [in] %Device allocation of temporary storage.  When NULL, the required allocation size is returned in \p temp_storage_bytes and no work is done.
+        size_t                      &temp_storage_bytes,                ///< [in,out] Size in bytes of \p d_temp_storage allocation.
+        InputIteratorRA             d_in,                               ///< [in] Input data to reduce
+        OutputIteratorRA            d_out,                              ///< [out] Output location for result
+        int                         num_items,                          ///< [in] Number of items to reduce
+        ReductionOp                 reduction_op,                       ///< [in] Binary reduction operator
+        cudaStream_t                stream              = 0,            ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                        stream_synchronous  = false)        ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Default is \p false.
+    {
+        // Integer type used for array indexing
+        typedef int SizeT;
+
+        // Value type of the input iterator
+        typedef typename std::iterator_traits<InputIteratorRA>::value_type T;
+
+        // Tuning policies
+        typedef PtxDefaultPolicies<T, SizeT>                DefaultPolicies;        // Wrapper of default kernel policies
+        typedef typename DefaultPolicies::PrivatizedPolicy  PrivatizedPolicy;       // Multi-block kernel policy
+        typedef typename DefaultPolicies::SinglePolicy      SinglePolicy;           // Single-block kernel policy
+
+        // Dispatch parameters for the multi-block and single-block kernels
+        KernelDispachParams privatized_params;
+        KernelDispachParams single_params;
+
+#ifdef __CUDA_ARCH__
+        // Device-side call: the default policies of the current PTX compiler pass apply directly
+        privatized_params.Init<PrivatizedPolicy>(DefaultPolicies::SUBSCRIPTION_FACTOR);
+        single_params.Init<SinglePolicy>();
+#else
+        // Host-side call: query the PTX version and select the matching policies
+        int ptx_version;
+        cudaError version_error = PtxVersion(ptx_version);
+        if (CubDebug(version_error)) return version_error;
+        DefaultPolicies::InitDispatchParams(ptx_version, privatized_params, single_params);
+#endif
+
+        // Hand off to the internal dispatch routine; its status (logged by CubDebug) is the result
+        return CubDebug(Dispatch(
+            d_temp_storage,
+            temp_storage_bytes,
+            ReducePrivatizedKernel<PrivatizedPolicy, InputIteratorRA, T*, SizeT, ReductionOp>,
+            ReduceSingleKernel<SinglePolicy, T*, OutputIteratorRA, SizeT, ReductionOp>,
+            ResetDrainKernel<SizeT>,
+            privatized_params,
+            single_params,
+            d_in,
+            d_out,
+            num_items,
+            reduction_op,
+            stream,
+            stream_synchronous));
+    }
+
+
+    /**
+     * \brief Computes a device-wide sum by forwarding to Reduce() with a cub::Sum functor.
+     *
+     * \par
+     * Does not support non-commutative reduction operators.
+     *
+     * \devicestorage
+     *
+     * \cdp
+     *
+     * \iterator
+     *
+     * \par
+     * The code snippet below illustrates the sum reduction of a device vector of \p int items.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>
+     * ...
+     *
+     * // Declare and initialize device pointers for input and output
+     * int *d_reduce_input, *d_aggregate;
+     * int num_items = ...
+     * ...
+     *
+     * // Determine temporary device storage requirements for summation
+     * void *d_temp_storage = NULL;
+     * size_t temp_storage_bytes = 0;
+     * cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_reduce_input, d_aggregate, num_items);
+     *
+     * // Allocate temporary storage for summation
+     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+     *
+     * // Run reduction summation
+     * cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_reduce_input, d_aggregate, num_items);
+     *
+     * \endcode
+     *
+     * \tparam InputIteratorRA      <b>[inferred]</b> Random-access iterator type for input (may be a simple pointer type)
+     * \tparam OutputIteratorRA     <b>[inferred]</b> Random-access iterator type for output (may be a simple pointer type)
+     */
+    template <
+        typename                    InputIteratorRA,
+        typename                    OutputIteratorRA>
+    __host__ __device__ __forceinline__
+    static cudaError_t Sum(
+        void                        *d_temp_storage,                    ///< [in] %Device allocation of temporary storage.  When NULL, the required allocation size is returned in \p temp_storage_bytes and no work is done.
+        size_t                      &temp_storage_bytes,                ///< [in,out] Size in bytes of \p d_temp_storage allocation.
+        InputIteratorRA             d_in,                               ///< [in] Input data to reduce
+        OutputIteratorRA            d_out,                              ///< [out] Output location for result
+        int                         num_items,                          ///< [in] Number of items to reduce
+        cudaStream_t                stream              = 0,            ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                        stream_synchronous  = false)        ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Default is \p false.
+    {
+        // Addition functor handed to the generic reduction
+        cub::Sum sum_op = cub::Sum();
+
+        return Reduce(
+            d_temp_storage,
+            temp_storage_bytes,
+            d_in,
+            d_out,
+            num_items,
+            sum_op,
+            stream,
+            stream_synchronous);
+    }
+
+
+};
+
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
+
--- a/lib/kokkos/TPL/cub/device/device_reduce_by_key.cuh
+++ b/lib/kokkos/TPL/cub/device/device_reduce_by_key.cuh
@ -0,0 +1,633 @@
+
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2013, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::DeviceReduceByKey provides operations for computing a device-wide, parallel prefix scan across data items residing within global memory.
+ */
+
+#pragma once
+
+#include <stdio.h>
+#include <iterator>
+
+#include "block/block_reduce_by_key_tiles.cuh"
+#include "device_scan.cuh"
+#include "../thread/thread_operators.cuh"
+#include "../grid/grid_queue.cuh"
+#include "../util_iterator.cuh"
+#include "../util_debug.cuh"
+#include "../util_device.cuh"
+#include "../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/******************************************************************************
+ * Kernel entry points
+ *****************************************************************************/
+
+#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
+
+
+/**
+ * Reduce-by-key kernel entry point (multi-block)
+ *
+ * Each thread block constructs a BlockSweepScan over the input and output
+ * iterators and lets it consume tiles.  The kernel is launch-bounded to
+ * BlockSweepScanPolicy::BLOCK_THREADS threads per block.  The tile status
+ * pointer handed to ConsumeTiles is advanced past TILE_STATUS_PADDING
+ * (PtxArchProps::WARP_THREADS) leading descriptors.
+ *
+ * The first template parameter must be named BlockSweepScanPolicy: the launch
+ * bounds and the BlockSweepScan typedef below refer to the tuning policy by
+ * that name.
+ */
+template <
+    typename    BlockSweepScanPolicy,           ///< Tuning policy for cub::BlockSweepScan abstraction
+    typename    InputIteratorRA,                ///< Random-access iterator type for input (may be a simple pointer type)
+    typename    OutputIteratorRA,               ///< Random-access iterator type for output (may be a simple pointer type)
+    typename    T,                              ///< The scan data type
+    typename    ReductionOp,                    ///< Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt>
+    typename    Identity,                       ///< Identity value type (cub::NullType for inclusive scans)
+    typename    SizeT>                          ///< Integer type used for global array indexing
+__launch_bounds__ (int(BlockSweepScanPolicy::BLOCK_THREADS))
+__global__ void MultiBlockScanKernel(
+    InputIteratorRA             d_in,           ///< Input data
+    OutputIteratorRA            d_out,          ///< Output data
+    ScanTileDescriptor<T>       *d_tile_status, ///< Global list of tile status
+    ReductionOp                 reduction_op,   ///< Binary scan operator
+    Identity                    identity,       ///< Identity element
+    SizeT                       num_items,      ///< Total number of scan items for the entire problem
+    GridQueue<int>              queue)          ///< Descriptor for performing dynamic mapping of tile data to thread blocks
+{
+    enum
+    {
+        // Number of leading tile descriptors skipped before the first real tile
+        TILE_STATUS_PADDING = PtxArchProps::WARP_THREADS,
+    };
+
+    // Thread block type for scanning input tiles
+    typedef BlockSweepScan<
+        BlockSweepScanPolicy,
+        InputIteratorRA,
+        OutputIteratorRA,
+        ReductionOp,
+        Identity,
+        SizeT> BlockSweepScanT;
+
+    // Shared memory for BlockSweepScan
+    __shared__ typename BlockSweepScanT::TempStorage temp_storage;
+
+    // Process tiles
+    BlockSweepScanT(temp_storage, d_in, d_out, reduction_op, identity).ConsumeTiles(
+        num_items,
+        queue,
+        d_tile_status + TILE_STATUS_PADDING);
+}
+
+
+#endif // DOXYGEN_SHOULD_SKIP_THIS
+
+
+
+/******************************************************************************
+ * DeviceReduceByKey
+ *****************************************************************************/
+
+/**
+ * \addtogroup DeviceModule
+ * @{
+ */
+
+/**
+ * \brief DeviceReduceByKey provides operations for computing a device-wide, parallel prefix scan across data items residing within global memory. ![](scan_logo.png)
+ */
+struct DeviceReduceByKey
+{
+#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
+
+    /******************************************************************************
+     * Constants and typedefs
+     ******************************************************************************/
+
+    /// Runtime copy of the compile-time constants of a BlockSweepScanPolicy, used when configuring kernel launches.
+    struct KernelDispachParams
+    {
+        // Values copied from the tuning policy
+        int                     block_threads;
+        int                     items_per_thread;
+        BlockLoadAlgorithm      load_policy;
+        BlockStoreAlgorithm     store_policy;
+        BlockScanAlgorithm      scan_algorithm;
+
+        // Derived value: block_threads * items_per_thread
+        int                     tile_size;
+
+        /// Fill every field from the constants exposed by \p Policy
+        template <typename Policy>
+        __host__ __device__ __forceinline__
+        void Init()
+        {
+            // Straight copies of the policy constants
+            block_threads       = Policy::BLOCK_THREADS;
+            items_per_thread    = Policy::ITEMS_PER_THREAD;
+            load_policy         = Policy::LOAD_ALGORITHM;
+            store_policy        = Policy::STORE_ALGORITHM;
+            scan_algorithm      = Policy::SCAN_ALGORITHM;
+
+            // Items covered by one thread block per tile
+            tile_size           = block_threads * items_per_thread;
+        }
+
+        /// Print the policy fields (not tile_size) as a comma-separated list, without a trailing newline
+        __host__ __device__ __forceinline__
+        void Print()
+        {
+            printf("%d, %d, %d, %d, %d", block_threads, items_per_thread, load_policy, store_policy, scan_algorithm);
+        }
+
+    };
+
+
+    /******************************************************************************
+     * Tuning policies
+     ******************************************************************************/
+
+
+    /// Tuned policy sets keyed by PTX architecture.  The primary template is only declared; the specializations below supply the policies.
+    template <
+        typename    T,
+        typename    SizeT,
+        int         ARCH>
+    struct TunedPolicies;
+
+    /// Tuning for ARCH 350 (SM35)
+    template <
+        typename    T,
+        typename    SizeT>
+    struct TunedPolicies<T, SizeT, 350>
+    {
+        typedef BlockSweepScanPolicy<
+                128,
+                16,
+                BLOCK_LOAD_DIRECT,
+                false,
+                LOAD_LDG,
+                BLOCK_STORE_WARP_TRANSPOSE,
+                true,
+                BLOCK_SCAN_RAKING_MEMOIZE>
+            MultiBlockPolicy;
+    };
+
+    /// Tuning for ARCH 300 (SM30)
+    template <
+        typename    T,
+        typename    SizeT>
+    struct TunedPolicies<T, SizeT, 300>
+    {
+        typedef BlockSweepScanPolicy<
+                256,
+                9,
+                BLOCK_LOAD_WARP_TRANSPOSE,
+                false,
+                LOAD_DEFAULT,
+                BLOCK_STORE_WARP_TRANSPOSE,
+                false,
+                BLOCK_SCAN_RAKING_MEMOIZE>
+            MultiBlockPolicy;
+    };
+
+    /// Tuning for ARCH 200 (SM20)
+    template <
+        typename    T,
+        typename    SizeT>
+    struct TunedPolicies<T, SizeT, 200>
+    {
+        typedef BlockSweepScanPolicy<
+                128,
+                15,
+                BLOCK_LOAD_WARP_TRANSPOSE,
+                false,
+                LOAD_DEFAULT,
+                BLOCK_STORE_WARP_TRANSPOSE,
+                false,
+                BLOCK_SCAN_RAKING_MEMOIZE>
+            MultiBlockPolicy;
+    };
+
+    /// Tuning for ARCH 100 (SM10)
+    template <
+        typename    T,
+        typename    SizeT>
+    struct TunedPolicies<T, SizeT, 100>
+    {
+        typedef BlockSweepScanPolicy<
+                128,
+                7,
+                BLOCK_LOAD_TRANSPOSE,
+                false,
+                LOAD_DEFAULT,
+                BLOCK_STORE_TRANSPOSE,
+                false,
+                BLOCK_SCAN_RAKING>
+            MultiBlockPolicy;
+    };
+
+
+    /// Selects the tuned policy set for the PTX architecture that DeviceReduceByKey operations are dispatched to
+    template <typename T, typename SizeT>
+    struct PtxDefaultPolicies
+    {
+        // Tuned architecture for the current compiler pass: the first of 350, 300, 200
+        // that CUB_PTX_ARCH reaches, otherwise 100
+        static const int PTX_TUNE_ARCH =
+            (CUB_PTX_ARCH >= 350) ? 350 :
+            (CUB_PTX_ARCH >= 300) ? 300 :
+            (CUB_PTX_ARCH >= 200) ? 200 :
+                                    100;
+
+        // Policy set tuned for the current PTX compiler pass
+        typedef TunedPolicies<T, SizeT, PTX_TUNE_ARCH> PtxTunedPolicies;
+
+        // Opaque wrapper deriving from the MultiBlockPolicy of the current PTX compiler pass
+        struct MultiBlockPolicy : PtxTunedPolicies::MultiBlockPolicy {};
+
+        /**
+         * Fill \p multi_block_dispatch_params from the tuned policy that matches
+         * the runtime \p ptx_version (thresholds 350, 300, 200; anything lower
+         * uses the 100 tuning).
+         */
+        static void InitDispatchParams(int ptx_version, KernelDispachParams &multi_block_dispatch_params)
+        {
+            typedef typename TunedPolicies<T, SizeT, 350>::MultiBlockPolicy Policy350;
+            typedef typename TunedPolicies<T, SizeT, 300>::MultiBlockPolicy Policy300;
+            typedef typename TunedPolicies<T, SizeT, 200>::MultiBlockPolicy Policy200;
+            typedef typename TunedPolicies<T, SizeT, 100>::MultiBlockPolicy Policy100;
+
+            if (ptx_version >= 350)
+            {
+                multi_block_dispatch_params.Init<Policy350>();
+                return;
+            }
+
+            if (ptx_version >= 300)
+            {
+                multi_block_dispatch_params.Init<Policy300>();
+                return;
+            }
+
+            if (ptx_version >= 200)
+            {
+                multi_block_dispatch_params.Init<Policy200>();
+                return;
+            }
+
+            // Everything older falls back to the SM10 tuning
+            multi_block_dispatch_params.Init<Policy100>();
+        }
+    };
+
+
+    /******************************************************************************
+     * Utility methods
+     ******************************************************************************/
+
+    /**
+     * Internal dispatch routine
+     *
+     * Sizes and aliases the temporary storage (tile status descriptors plus a
+     * grid queue descriptor), launches \p init_kernel to initialize them, and
+     * then launches \p multi_block_kernel with a grid no larger than the
+     * estimated device occupancy.
+     *
+     * When \p d_temp_storage is NULL only \p temp_storage_bytes is written and
+     * cudaSuccess is returned.  When \p num_items is zero no kernel is launched
+     * (a zero-sized grid is not a valid launch configuration) and cudaSuccess is
+     * returned.  Launch failures are reported through cudaPeekAtLastError()
+     * even when \p stream_synchronous is false.
+     *
+     * Returns cudaErrorNotSupported when CUB_RUNTIME_ENABLED is not defined.
+     */
+    template <
+        typename                    InitScanKernelPtr,              ///< Function type of cub::InitScanKernel
+        typename                    MultiBlockScanKernelPtr,        ///< Function type of cub::MultiBlockScanKernel
+        typename                    InputIteratorRA,                ///< Random-access iterator type for input (may be a simple pointer type)
+        typename                    OutputIteratorRA,               ///< Random-access iterator type for output (may be a simple pointer type)
+        typename                    ReductionOp,                         ///< Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt>
+        typename                    Identity,                       ///< Identity value type (cub::NullType for inclusive scans)
+        typename                    SizeT>                          ///< Integer type used for global array indexing
+    __host__ __device__ __forceinline__
+    static cudaError_t Dispatch(
+        void                        *d_temp_storage,                ///< [in] %Device allocation of temporary storage.  When NULL, the required allocation size is returned in \p temp_storage_bytes and no work is done.
+        size_t                      &temp_storage_bytes,            ///< [in,out] Size in bytes of \p d_temp_storage allocation.
+        InitScanKernelPtr           init_kernel,                    ///< [in] Kernel function pointer to parameterization of cub::InitScanKernel
+        MultiBlockScanKernelPtr     multi_block_kernel,             ///< [in] Kernel function pointer to parameterization of cub::MultiBlockScanKernel
+        KernelDispachParams         &multi_block_dispatch_params,   ///< [in] Dispatch parameters that match the policy that \p multi_block_kernel was compiled for
+        InputIteratorRA             d_in,                           ///< [in] Iterator pointing to scan input
+        OutputIteratorRA            d_out,                          ///< [in] Iterator pointing to scan output
+        ReductionOp                      reduction_op,                        ///< [in] Binary scan operator
+        Identity                    identity,                       ///< [in] Identity element
+        SizeT                       num_items,                      ///< [in] Total number of items to scan
+        cudaStream_t                stream              = 0,        ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                        stream_synchronous  = false)    ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Default is \p false.
+    {
+
+#ifndef CUB_RUNTIME_ENABLED
+
+        // Kernel launch not supported from this device
+        return CubDebug(cudaErrorNotSupported );
+
+#else
+
+        enum
+        {
+            // Extra tile descriptors allocated in addition to one per tile
+            TILE_STATUS_PADDING = 32,
+        };
+
+        // Data type
+        typedef typename std::iterator_traits<InputIteratorRA>::value_type T;
+
+        cudaError error = cudaSuccess;
+        do
+        {
+            // Number of input tiles
+            int num_tiles = (num_items + multi_block_dispatch_params.tile_size - 1) / multi_block_dispatch_params.tile_size;
+
+            // Temporary storage allocation requirements
+            void* allocations[2];
+            size_t allocation_sizes[2] =
+            {
+                (num_tiles + TILE_STATUS_PADDING) * sizeof(ScanTileDescriptor<T>),        // bytes needed for tile status descriptors
+                GridQueue<int>::AllocationSize()                                            // bytes needed for grid queue descriptor
+            };
+
+            // Alias temporaries (or set the necessary size of the storage allocation)
+            if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break;
+
+            // Return if the caller is simply requesting the size of the storage allocation
+            if (d_temp_storage == NULL)
+                return cudaSuccess;
+
+            // Return if the problem is empty: with zero tiles both grid sizes below
+            // would be zero, which is not a valid kernel launch configuration
+            if (num_items == 0)
+                break;
+
+            // Global list of tile status
+            ScanTileDescriptor<T> *d_tile_status = (ScanTileDescriptor<T>*) allocations[0];
+
+            // Grid queue descriptor
+            GridQueue<int> queue(allocations[1]);
+
+            // Get GPU id
+            int device_ordinal;
+            if (CubDebug(error = cudaGetDevice(&device_ordinal))) break;
+
+            // Get SM count
+            int sm_count;
+            if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break;
+
+            // Log init_kernel configuration
+            int init_kernel_threads = 128;
+            int init_grid_size = (num_tiles + init_kernel_threads - 1) / init_kernel_threads;
+            if (stream_synchronous) CubLog("Invoking init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, init_kernel_threads, (long long) stream);
+
+            // Invoke init_kernel to initialize tile descriptors and queue descriptors
+            init_kernel<<<init_grid_size, init_kernel_threads, 0, stream>>>(
+                queue,
+                d_tile_status,
+                num_tiles);
+
+            // Check for failure to launch (launches do not report errors by themselves)
+            if (CubDebug(error = cudaPeekAtLastError())) break;
+
+            // Sync the stream if specified
+#ifndef __CUDA_ARCH__
+            if (stream_synchronous && CubDebug(error = cudaStreamSynchronize(stream))) break;
+#else
+            if (stream_synchronous && CubDebug(error = cudaDeviceSynchronize())) break;
+#endif
+
+            // Get a rough estimate of multi_block_kernel SM occupancy based upon the maximum SM occupancy of the targeted PTX architecture
+            int multi_sm_occupancy = CUB_MIN(
+                ArchProps<CUB_PTX_ARCH>::MAX_SM_THREADBLOCKS,
+                ArchProps<CUB_PTX_ARCH>::MAX_SM_THREADS / multi_block_dispatch_params.block_threads);
+
+#ifndef __CUDA_ARCH__
+
+            // We're on the host, so come up with a more accurate estimate of multi_block_kernel SM occupancy from actual device properties
+            Device device_props;
+            if (CubDebug(error = device_props.Init(device_ordinal))) break;
+
+            if (CubDebug(error = device_props.MaxSmOccupancy(
+                multi_sm_occupancy,
+                multi_block_kernel,
+                multi_block_dispatch_params.block_threads))) break;
+
+#endif
+            // Get device occupancy for multi_block_kernel
+            int multi_block_occupancy = multi_sm_occupancy * sm_count;
+
+            // Get grid size for multi_block_kernel
+            int multi_block_grid_size = (num_tiles < multi_block_occupancy) ?
+                num_tiles :                 // Not enough to fill the device with threadblocks
+                multi_block_occupancy;            // Fill the device with threadblocks
+
+            // Log multi_block_kernel configuration
+            if (stream_synchronous) CubLog("Invoking multi_block_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n",
+                multi_block_grid_size, multi_block_dispatch_params.block_threads, (long long) stream, multi_block_dispatch_params.items_per_thread, multi_sm_occupancy);
+
+            // Invoke multi_block_kernel
+            multi_block_kernel<<<multi_block_grid_size, multi_block_dispatch_params.block_threads, 0, stream>>>(
+                d_in,
+                d_out,
+                d_tile_status,
+                reduction_op,
+                identity,
+                num_items,
+                queue);
+
+            // Check for failure to launch (launches do not report errors by themselves)
+            if (CubDebug(error = cudaPeekAtLastError())) break;
+
+            // Sync the stream if specified
+#ifndef __CUDA_ARCH__
+            if (stream_synchronous && CubDebug(error = cudaStreamSynchronize(stream))) break;
+#else
+            if (stream_synchronous && CubDebug(error = cudaDeviceSynchronize())) break;
+#endif
+        }
+        while (0);
+
+        return error;
+
+#endif  // CUB_RUNTIME_ENABLED
+    }
+
+
+
+    /**
+     * Internal scan dispatch routine for using default tuning policies.
+     *
+     * Initializes the dispatch parameters (directly from the PTX default
+     * policies when compiled for the device, or through a PTX-version lookup
+     * when compiled for the host) and forwards to the kernel-pointer overload
+     * of Dispatch.
+     *
+     * \return The error reported by the PTX version query or by the forwarded
+     * Dispatch call, otherwise \p cudaSuccess.
+     */
+    template <
+        typename                    InputIteratorRA,                ///< Random-access iterator type for input (may be a simple pointer type)
+        typename                    OutputIteratorRA,               ///< Random-access iterator type for output (may be a simple pointer type)
+        typename                    ReductionOp,                    ///< Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt>
+        typename                    Identity,                       ///< Identity value type (cub::NullType for inclusive scans)
+        typename                    SizeT>                          ///< Integer type used for global array indexing
+    __host__ __device__ __forceinline__
+    static cudaError_t Dispatch(
+        void                        *d_temp_storage,                ///< [in] %Device allocation of temporary storage.  When NULL, the required allocation size is returned in \p temp_storage_bytes and no work is done.
+        size_t                      &temp_storage_bytes,            ///< [in,out] Size in bytes of \p d_temp_storage allocation.
+        InputIteratorRA             d_in,                           ///< [in] Iterator pointing to scan input
+        OutputIteratorRA            d_out,                          ///< [in] Iterator pointing to scan output
+        ReductionOp                 reduction_op,                   ///< [in] Binary scan operator
+        Identity                    identity,                       ///< [in] Identity element
+        SizeT                       num_items,                      ///< [in] Total number of items to scan
+        cudaStream_t                stream              = 0,        ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                        stream_synchronous  = false)    ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Default is \p false.
+    {
+        // Data type
+        typedef typename std::iterator_traits<InputIteratorRA>::value_type T;
+
+        // Tuning policies for the PTX architecture that will get dispatched to
+        typedef PtxDefaultPolicies<T, SizeT> PtxDefaultPolicies;
+        typedef typename PtxDefaultPolicies::MultiBlockPolicy MultiBlockPolicy;
+
+        cudaError error = cudaSuccess;
+        do
+        {
+            // Declare dispatch parameters
+            KernelDispachParams multi_block_dispatch_params;
+
+#ifdef __CUDA_ARCH__
+            // We're on the device, so initialize the dispatch parameters with the PtxDefaultPolicies directly
+            multi_block_dispatch_params.Init<MultiBlockPolicy>();
+#else
+            // We're on the host, so lookup and initialize the dispatch parameters with the policies that match the device's PTX version
+            int ptx_version;
+            if (CubDebug(error = PtxVersion(ptx_version))) break;
+            PtxDefaultPolicies::InitDispatchParams(ptx_version, multi_block_dispatch_params);
+#endif
+
+            // Keep the status of the forwarded dispatch; without the assignment
+            // the check below would always see cudaSuccess and hide failures.
+            error = Dispatch(
+                d_temp_storage,
+                temp_storage_bytes,
+                InitScanKernel<T, SizeT>,
+                MultiBlockScanKernel<MultiBlockPolicy, InputIteratorRA, OutputIteratorRA, T, ReductionOp, Identity, SizeT>,
+                multi_block_dispatch_params,
+                d_in,
+                d_out,
+                reduction_op,
+                identity,
+                num_items,
+                stream,
+                stream_synchronous);
+
+            if (CubDebug(error)) break;
+        }
+        while (0);
+
+        return error;
+    }
+
+    #endif // DOXYGEN_SHOULD_SKIP_THIS
+
+
+    /******************************************************************//**
+     * Interface
+     *********************************************************************/
+
+
+    /**
+     * \brief Reduces, device-wide, each run of consecutive values whose keys compare equal.
+     *
+     * Both the per-run value aggregates and the keys they belong to are written out in compacted form.
+     *
+     * \devicestorage
+     *
+     * \tparam KeyInputIteratorRA       <b>[inferred]</b> Random-access input iterator type for keys input (may be a simple pointer type)
+     * \tparam KeyOutputIteratorRA      <b>[inferred]</b> Random-access output iterator type for keys output (may be a simple pointer type)
+     * \tparam ValueInputIteratorRA     <b>[inferred]</b> Random-access input iterator type for values input (may be a simple pointer type)
+     * \tparam ValueOutputIteratorRA    <b>[inferred]</b> Random-access output iterator type for values output (may be a simple pointer type)
+     * \tparam ReductionOp              <b>[inferred]</b> Binary reduction operator type having member <tt>T operator()(const T &a, const T &b)</tt>, where \p T is the value type of \p ValueInputIteratorRA
+     */
+    template <
+        typename                KeyInputIteratorRA,
+        typename                KeyOutputIteratorRA,
+        typename                ValueInputIteratorRA,
+        typename                ValueOutputIteratorRA,
+        typename                ReductionOp>
+    __host__ __device__ __forceinline__
+    static cudaError_t ReduceValues(
+        void                    *d_temp_storage,                ///< [in] %Device allocation of temporary storage.  When NULL, the required allocation size is returned in \p temp_storage_bytes and no work is done.
+        size_t                  &temp_storage_bytes,            ///< [in,out] Size in bytes of \p d_temp_storage allocation.
+        KeyInputIteratorRA      d_keys_in,                      ///< [in] Key input data
+        KeyOutputIteratorRA     d_keys_out,                     ///< [out] Key output data (compacted)
+        ValueInputIteratorRA    d_values_in,                    ///< [in] Value input data
+        ValueOutputIteratorRA   d_values_out,                   ///< [out] Value output data (compacted)
+        int                     num_items,                      ///< [in] Total number of input pairs
+        ReductionOp             reduction_op,                   ///< [in] Binary value reduction operator
+        cudaStream_t            stream              = 0,        ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                    stream_synchronous  = false)    ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
+    {
+        // Dispatch takes the operator ahead of the item count
+        cudaError_t status = Dispatch(
+            d_temp_storage,
+            temp_storage_bytes,
+            d_keys_in,
+            d_keys_out,
+            d_values_in,
+            d_values_out,
+            reduction_op,
+            num_items,
+            stream,
+            stream_synchronous);
+
+        return status;
+    }
+
+
+    /**
+     * \brief Computes device-wide sums of consecutive values whose corresponding keys are equal.
+     *
+     * The resulting output lists of value-aggregates and their corresponding keys are compacted.
+     * Implemented by forwarding to ReduceValues with a cub::Sum operator.
+     *
+     * \devicestorage
+     *
+     * \tparam KeyInputIteratorRA       <b>[inferred]</b> Random-access input iterator type for keys input (may be a simple pointer type)
+     * \tparam KeyOutputIteratorRA      <b>[inferred]</b> Random-access output iterator type for keys output (may be a simple pointer type)
+     * \tparam ValueInputIteratorRA     <b>[inferred]</b> Random-access input iterator type for values input (may be a simple pointer type)
+     * \tparam ValueOutputIteratorRA    <b>[inferred]</b> Random-access output iterator type for values output (may be a simple pointer type)
+     */
+    template <
+        typename                KeyInputIteratorRA,
+        typename                KeyOutputIteratorRA,
+        typename                ValueInputIteratorRA,
+        typename                ValueOutputIteratorRA>
+    __host__ __device__ __forceinline__
+    static cudaError_t SumValues(
+        void                    *d_temp_storage,                ///< [in] %Device allocation of temporary storage.  When NULL, the required allocation size is returned in \p temp_storage_bytes and no work is done.
+        size_t                  &temp_storage_bytes,            ///< [in,out] Size in bytes of \p d_temp_storage allocation.
+        KeyInputIteratorRA      d_keys_in,                      ///< [in] Key input data
+        KeyOutputIteratorRA     d_keys_out,                     ///< [out] Key output data (compacted)
+        ValueInputIteratorRA    d_values_in,                    ///< [in] Value input data
+        ValueOutputIteratorRA   d_values_out,                   ///< [out] Value output data (compacted)
+        int                     num_items,                      ///< [in] Total number of input pairs
+        cudaStream_t            stream              = 0,        ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                    stream_synchronous  = false)    ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
+    {
+        // ReduceValues expects the item count before the reduction operator
+        return ReduceValues(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, d_values_in, d_values_out, num_items, cub::Sum(), stream, stream_synchronous);
+    }
+
+
+    /**
+     * \brief Counts the length of every run of consecutive, equal-valued keys.
+     *
+     * The run-length counts and the keys they belong to are written out in compacted form.
+     * Each key is paired with a constant value of 1 and the pairs are handed to SumValues.
+     *
+     * \devicestorage
+     *
+     * \tparam KeyInputIteratorRA       <b>[inferred]</b> Random-access input iterator type for keys input (may be a simple pointer type)
+     * \tparam KeyOutputIteratorRA      <b>[inferred]</b> Random-access output iterator type for keys output (may be a simple pointer type)
+     * \tparam CountOutputIteratorRA    <b>[inferred]</b> Random-access output iterator type for output of key-counts whose value type must be convertible to an integer type (may be a simple pointer type)
+     */
+    template <
+        typename                KeyInputIteratorRA,
+        typename                KeyOutputIteratorRA,
+        typename                CountOutputIteratorRA>
+    __host__ __device__ __forceinline__
+    static cudaError_t RunLengths(
+        void                    *d_temp_storage,                ///< [in] %Device allocation of temporary storage.  When NULL, the required allocation size is returned in \p temp_storage_bytes and no work is done.
+        size_t                  &temp_storage_bytes,            ///< [in,out] Size in bytes of \p d_temp_storage allocation.
+        KeyInputIteratorRA      d_keys_in,                      ///< [in] Key input data
+        KeyOutputIteratorRA     d_keys_out,                     ///< [out] Key output data (compacted)
+        CountOutputIteratorRA   d_counts_out,                   ///< [out] Run-length counts output data (compacted)
+        int                     num_items,                      ///< [in] Total number of keys
+        cudaStream_t            stream              = 0,        ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                    stream_synchronous  = false)    ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
+    {
+        // Value type of the count output
+        typedef typename std::iterator_traits<CountOutputIteratorRA>::value_type CountValue;
+
+        // Value input that yields 1 for every key
+        ConstantIteratorRA<CountValue> ones(1);
+
+        return SumValues(
+            d_temp_storage,
+            temp_storage_bytes,
+            d_keys_in,
+            d_keys_out,
+            ones,
+            d_counts_out,
+            num_items,
+            stream,
+            stream_synchronous);
+    }
+
+
+    /**
+     * \brief Removes duplicates within each group of consecutive, equal-valued keys.  Only the first key from each group (and corresponding value) is kept.
+     *
+     * The resulting keys are compacted.
+     *
+     * No reduction operator is accepted as a function argument, so \p ReductionOp
+     * cannot be inferred: callers must name it explicitly and it must be
+     * default-constructible.  A default-constructed instance is forwarded to Dispatch.
+     * NOTE(review): to retain the first value of each group the operator presumably
+     * has to return its first operand -- confirm against the Dispatch implementation.
+     *
+     * \devicestorage
+     *
+     * \tparam KeyInputIteratorRA       <b>[inferred]</b> Random-access input iterator type for keys input (may be a simple pointer type)
+     * \tparam KeyOutputIteratorRA      <b>[inferred]</b> Random-access output iterator type for keys output (may be a simple pointer type)
+     * \tparam ValueInputIteratorRA     <b>[inferred]</b> Random-access input iterator type for values input (may be a simple pointer type)
+     * \tparam ValueOutputIteratorRA    <b>[inferred]</b> Random-access output iterator type for values output (may be a simple pointer type)
+     * \tparam ReductionOp              Binary operator type (default-constructible) having member <tt>T operator()(const T &a, const T &b)</tt>, where \p T is the value type of \p ValueInputIteratorRA.  Not inferable from the arguments.
+     */
+    template <
+        typename                KeyInputIteratorRA,
+        typename                KeyOutputIteratorRA,
+        typename                ValueInputIteratorRA,
+        typename                ValueOutputIteratorRA,
+        typename                ReductionOp>
+    __host__ __device__ __forceinline__
+    static cudaError_t Unique(
+        void                    *d_temp_storage,                ///< [in] %Device allocation of temporary storage.  When NULL, the required allocation size is returned in \p temp_storage_bytes and no work is done.
+        size_t                  &temp_storage_bytes,            ///< [in,out] Size in bytes of \p d_temp_storage allocation.
+        KeyInputIteratorRA      d_keys_in,                      ///< [in] Key input data
+        KeyOutputIteratorRA     d_keys_out,                     ///< [out] Key output data (compacted)
+        ValueInputIteratorRA    d_values_in,                    ///< [in] Value input data
+        ValueOutputIteratorRA   d_values_out,                   ///< [out] Value output data (compacted)
+        int                     num_items,                      ///< [in] Total number of input pairs
+        cudaStream_t            stream              = 0,        ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                    stream_synchronous  = false)    ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
+    {
+        // There is no operator argument to forward, so hand Dispatch a default-constructed ReductionOp
+        return Dispatch(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, d_values_in, d_values_out, ReductionOp(), num_items, stream, stream_synchronous);
+    }
+
+
+
+};
+
+
+/** @} */       // DeviceModule
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
+
--- a/lib/kokkos/TPL/cub/device/device_reorder.cuh
+++ b/lib/kokkos/TPL/cub/device/device_reorder.cuh
@ -0,0 +1,550 @@
+
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2013, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::DeviceReorder provides device-wide operations for partitioning and filtering lists of items residing within global memory.
+ */
+
+#pragma once
+
+#include <stdio.h>
+#include <iterator>
+
+#include "device_scan.cuh"
+#include "block/block_partition_tiles.cuh"
+#include "../grid/grid_queue.cuh"
+#include "../util_debug.cuh"
+#include "../util_device.cuh"
+#include "../util_vector.cuh"
+#include "../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/******************************************************************************
+ * Kernel entry points
+ *****************************************************************************/
+
+#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
+
+/**
+ * Multi-block kernel entry point for partitioning.
+ *
+ * Each thread block consumes tiles through a BlockPartitionTiles instance; the
+ * block that reports having handled the last tile writes the length of the
+ * first partition to \p d_partition_length (thread 0 only).
+ */
+template <
+    typename    BlockPartitionTilesPolicy,  ///< Tuning policy for cub::BlockPartitionTiles abstraction
+    typename    InputIteratorRA,            ///< Random-access iterator type for input (may be a simple pointer type)
+    typename    OutputIteratorRA,           ///< Random-access iterator type for output (may be a simple pointer type)
+    typename    LengthOutputIterator,       ///< Output iterator type for recording the length of the first partition (may be a simple pointer type)
+    typename    PredicateOp,                ///< Unary predicate operator indicating membership in the first partition type having member <tt>bool operator()(const T &val)</tt>
+    typename    SizeT>                      ///< Integer type used for global array indexing
+__launch_bounds__ (int(BlockPartitionTilesPolicy::BLOCK_THREADS))
+__global__ void PartitionKernel(
+    InputIteratorRA                                                                         d_in,               ///< Input data
+    OutputIteratorRA                                                                        d_out,              ///< Output data
+    LengthOutputIterator                                                                    d_partition_length, ///< Number of items in the first partition
+    ScanTileDescriptor<PartitionScanTuple<SizeT, BlockPartitionTilesPolicy::PARTITOINS> >   *d_tile_status,     ///< Global list of tile status
+    PredicateOp                                                                             pred_op,            ///< Unary predicate operator indicating membership in the first partition
+    SizeT                                                                                   num_items,          ///< Total number of input items for the entire problem
+    int                                                                                     num_tiles,          ///< Total number of input tiles for the entire problem
+    GridQueue<int>                                                                          queue)              ///< Descriptor for performing dynamic mapping of tile data to thread blocks
+{
+    // Number of leading descriptors skipped in d_tile_status
+    enum { TILE_STATUS_PADDING = PtxArchProps::WARP_THREADS };
+
+    // Tuple of per-partition offsets
+    typedef PartitionScanTuple<SizeT, BlockPartitionTilesPolicy::PARTITOINS> ScanTupleT;
+
+    // Thread block abstraction that processes the input tiles
+    typedef BlockPartitionTiles<
+            BlockPartitionTilesPolicy,
+            InputIteratorRA,
+            OutputIteratorRA,
+            PredicateOp,
+            SizeT>
+        TileProcessorT;
+
+    // Shared memory required by the tile processor
+    __shared__ typename TileProcessorT::TempStorage smem_storage;
+
+    // Outputs of tile consumption
+    ScanTupleT  ends;                   // Ending offsets for partitions (one-after)
+    bool        handled_last_tile;      // Whether this block handled the last tile (i.e., ends is valid for the entire input)
+
+    // Consume tiles
+    TileProcessorT tile_processor(
+        smem_storage,
+        d_in,
+        d_out,
+        d_tile_status + TILE_STATUS_PADDING,
+        pred_op,
+        num_items);
+
+    tile_processor.ConsumeTiles(queue, num_tiles, ends, handled_last_tile);
+
+    // One thread of the block that handled the last tile publishes the first partition's length
+    if (handled_last_tile && (threadIdx.x == 0))
+    {
+        *d_partition_length = ends.x;
+    }
+}
+
+
+#endif // DOXYGEN_SHOULD_SKIP_THIS
+
+
+
+/******************************************************************************
+ * DeviceReorder
+ *****************************************************************************/
+
+/**
+ * \addtogroup DeviceModule
+ * @{
+ */
+
+/**
+ * \brief DeviceReorder provides device-wide operations for partitioning and filtering lists of items residing within global memory
+ */
+struct DeviceReorder
+{
+#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
+
+    /******************************************************************************
+     * Constants and typedefs
+     ******************************************************************************/
+
+    /// Runtime record of the dispatch-relevant constants of a BlockPartitionTilesPolicy.
+    struct KernelDispachParams
+    {
+        int                     block_threads;      // Copied from the policy's BLOCK_THREADS
+        int                     items_per_thread;   // Copied from the policy's ITEMS_PER_THREAD
+        BlockScanAlgorithm      scan_algorithm;     // Copied from the policy's SCAN_ALGORITHM
+        int                     tile_size;          // block_threads * items_per_thread
+
+        /// Fills every field from the compile-time constants of \p BlockPartitionTilesPolicy
+        template <typename BlockPartitionTilesPolicy>
+        __host__ __device__ __forceinline__
+        void Init()
+        {
+            typedef BlockPartitionTilesPolicy Policy;
+
+            this->block_threads     = Policy::BLOCK_THREADS;
+            this->items_per_thread  = Policy::ITEMS_PER_THREAD;
+            this->scan_algorithm    = Policy::SCAN_ALGORITHM;
+
+            // Derived from the two fields assigned above
+            this->tile_size         = this->block_threads * this->items_per_thread;
+        }
+    };
+
+
+    /******************************************************************************
+     * Tuning policies
+     ******************************************************************************/
+
+
+    /// Tuned policy types, specialized below per PTX architecture (selected by \p ARCH)
+    template <int PARTITIONS, typename T, typename SizeT, int ARCH>
+    struct TunedPolicies;
+
+    /// Tuned policies for SM35
+    template <int PARTITIONS, typename T, typename SizeT>
+    struct TunedPolicies<PARTITIONS, T, SizeT, 350>
+    {
+        enum
+        {
+            // Items per thread for 4-byte items
+            NOMINAL_4B_ITEMS_PER_THREAD = 16,
+
+            // Nominal count scaled by 4 / sizeof(T), bounded via CUB_MAX / CUB_MIN to [1, nominal]
+            ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T))))
+        };
+
+        typedef BlockPartitionTilesPolicy<
+                PARTITIONS,
+                128,
+                ITEMS_PER_THREAD,
+                LOAD_LDG,
+                BLOCK_SCAN_RAKING_MEMOIZE>
+            PartitionPolicy;
+    };
+
+    /// Tuned policies for SM30
+    template <int PARTITIONS, typename T, typename SizeT>
+    struct TunedPolicies<PARTITIONS, T, SizeT, 300>
+    {
+        enum
+        {
+            // Items per thread for 4-byte items
+            NOMINAL_4B_ITEMS_PER_THREAD = 9,
+
+            // Nominal count scaled by 4 / sizeof(T), bounded via CUB_MAX / CUB_MIN to [1, nominal]
+            ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T))))
+        };
+
+        typedef BlockPartitionTilesPolicy<
+                PARTITIONS,
+                256,
+                ITEMS_PER_THREAD,
+                LOAD_DEFAULT,
+                BLOCK_SCAN_RAKING_MEMOIZE>
+            PartitionPolicy;
+    };
+
+    /// Tuned policies for SM20
+    template <int PARTITIONS, typename T, typename SizeT>
+    struct TunedPolicies<PARTITIONS, T, SizeT, 200>
+    {
+        enum
+        {
+            // Items per thread for 4-byte items
+            NOMINAL_4B_ITEMS_PER_THREAD = 15,
+
+            // Nominal count scaled by 4 / sizeof(T), bounded via CUB_MAX / CUB_MIN to [1, nominal]
+            ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T))))
+        };
+
+        typedef BlockPartitionTilesPolicy<
+                PARTITIONS,
+                128,
+                ITEMS_PER_THREAD,
+                LOAD_DEFAULT,
+                BLOCK_SCAN_RAKING_MEMOIZE>
+            PartitionPolicy;
+    };
+
+    /// Tuned policies for SM10
+    template <int PARTITIONS, typename T, typename SizeT>
+    struct TunedPolicies<PARTITIONS, T, SizeT, 100>
+    {
+        enum
+        {
+            // Items per thread for 4-byte items
+            NOMINAL_4B_ITEMS_PER_THREAD = 7,
+
+            // Nominal count scaled by 4 / sizeof(T), bounded via CUB_MAX / CUB_MIN to [1, nominal]
+            ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T))))
+        };
+
+        typedef BlockPartitionTilesPolicy<
+                PARTITIONS,
+                128,
+                ITEMS_PER_THREAD,
+                LOAD_DEFAULT,
+                BLOCK_SCAN_RAKING>
+            PartitionPolicy;
+    };
+
+
+    /// Tuning policy for the PTX architecture that DeviceReorder operations will get dispatched to
+    template <int PARTITIONS, typename T, typename SizeT>
+    struct PtxDefaultPolicies
+    {
+        // Highest tuned architecture (350, 300, 200 or 100) not exceeding CUB_PTX_ARCH
+        static const int PTX_TUNE_ARCH =
+            (CUB_PTX_ARCH >= 350) ? 350 :
+            (CUB_PTX_ARCH >= 300) ? 300 :
+            (CUB_PTX_ARCH >= 200) ? 200 :
+                                    100;
+
+        // Tuned policy set for the current PTX compiler pass
+        typedef TunedPolicies<PARTITIONS, T, SizeT, PTX_TUNE_ARCH> PtxTunedPolicies;
+
+        // PartitionPolicy that opaquely derives from the specialization corresponding to the current PTX compiler pass
+        struct PartitionPolicy : PtxTunedPolicies::PartitionPolicy {};
+
+        /**
+         * Fills \p scan_dispatch_params from the tuned partition policy that matches \p ptx_version
+         */
+        static void InitDispatchParams(int ptx_version, KernelDispachParams &scan_dispatch_params)
+        {
+            // Partition policy of each tuned architecture
+            typedef typename TunedPolicies<PARTITIONS, T, SizeT, 350>::PartitionPolicy Sm35Policy;
+            typedef typename TunedPolicies<PARTITIONS, T, SizeT, 300>::PartitionPolicy Sm30Policy;
+            typedef typename TunedPolicies<PARTITIONS, T, SizeT, 200>::PartitionPolicy Sm20Policy;
+            typedef typename TunedPolicies<PARTITIONS, T, SizeT, 100>::PartitionPolicy Sm10Policy;
+
+            // Test thresholds from newest to oldest; the first match wins
+            if (ptx_version >= 350)
+            {
+                scan_dispatch_params.Init<Sm35Policy>();
+                return;
+            }
+
+            if (ptx_version >= 300)
+            {
+                scan_dispatch_params.Init<Sm30Policy>();
+                return;
+            }
+
+            if (ptx_version >= 200)
+            {
+                scan_dispatch_params.Init<Sm20Policy>();
+                return;
+            }
+
+            // Everything older uses the SM10 tuning
+            scan_dispatch_params.Init<Sm10Policy>();
+        }
+    };
+
+
+    /******************************************************************************
+     * Utility methods
+     ******************************************************************************/
+
+    /**
+     * Internal dispatch routine
+     */
+    template <
+        typename                    ScanInitKernelPtr,              ///< Function type of cub::ScanInitKernel
+        typename                    PartitionKernelPtr,             ///< Function type of cub::PartitionKernel
+        typename                    InputIteratorRA,                ///< Random-access iterator type for input (may be a simple pointer type)
+        typename                    OutputIteratorRA,               ///< Random-access iterator type for output (may be a simple pointer type)
+        typename                    LengthOutputIterator,           ///< Output iterator type for recording the length of the first partition (may be a simple pointer type)
+        typename                    PredicateOp,                    ///< Unary predicate operator indicating membership in the first partition type having member <tt>bool operator()(const T &val)</tt>
+        typename                    SizeT>                          ///< Integer type used for global array indexing
+    __host__ __device__ __forceinline__
+    static cudaError_t Dispatch(
+        int                         ptx_version,                    ///< [in] PTX version
+        void                        *d_temp_storage,                ///< [in] %Device allocation of temporary storage.  When NULL, the required allocation size is returned in \p temp_storage_bytes and no work is done.
+        size_t                      &temp_storage_bytes,            ///< [in,out] Size in bytes of \p d_temp_storage allocation.
+        ScanInitKernelPtr           init_kernel,                    ///< [in] Kernel function pointer to parameterization of cub::PartitionInitKernel
+        PartitionKernelPtr          partition_kernel,               ///< [in] Kernel function pointer to parameterization of cub::PartitionKernel
+        KernelDispachParams         &scan_dispatch_params,          ///< [in] Dispatch parameters that match the policy that \p partition_kernel was compiled for
+        InputIteratorRA             d_in,                           ///< [in] Iterator pointing to scan input
+        OutputIteratorRA            d_out,                          ///< [in] Iterator pointing to scan output
+        LengthOutputIterator        d_partition_length,                 ///< [out] Output iterator referencing the location where the pivot offset (i.e., the length of the first partition) is to be recorded
+        PredicateOp                 pred_op,                        ///< [in] Unary predicate operator indicating membership in the first partition
+        SizeT                       num_items,                      ///< [in] Total number of items to partition
+        cudaStream_t                stream              = 0,        ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                        stream_synchronous  = false)    ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Default is \p false.
+    {
+
+#ifndef CUB_RUNTIME_ENABLED
+
+        // Kernel launch not supported from this device
+        return CubDebug(cudaErrorNotSupported);
+
+#else
+
+        enum
+        {
+            TILE_STATUS_PADDING = 32,
+        };
+
+        // Data type
+        typedef typename std::iterator_traits<InputIteratorRA>::value_type T;
+
+        // Scan tuple type and tile status descriptor type
+        typedef typename VectorHelper<SizeT, 2>::Type ScanTuple;
+        typedef ScanTileDescriptor<ScanTuple> ScanTileDescriptorT;
+
+        cudaError error = cudaSuccess;
+        do
+        {
+            // Number of input tiles
+            int num_tiles = (num_items + scan_dispatch_params.tile_size - 1) / scan_dispatch_params.tile_size;
+
+            // Temporary storage allocation requirements
+            void* allocations[2];
+            size_t allocation_sizes[2] =
+            {
+                (num_tiles + TILE_STATUS_PADDING) * sizeof(ScanTileDescriptorT),      // bytes needed for tile status descriptors
+                GridQueue<int>::AllocationSize()                                            // bytes needed for grid queue descriptor
+            };
+
+            // Alias temporaries (or set the necessary size of the storage allocation)
+            if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break;
+
+            // Return if the caller is simply requesting the size of the storage allocation
+            if (d_temp_storage == NULL)
+                return cudaSuccess;
+
+            // Global list of tile status
+            ScanTileDescriptorT *d_tile_status = (ScanTileDescriptorT*) allocations[0];
+
+            // Grid queue descriptor
+            GridQueue<int> queue(allocations[1]);
+
+            // Log init_kernel configuration
+            int init_kernel_threads = 128;
+            int init_grid_size = (num_tiles + init_kernel_threads - 1) / init_kernel_threads;
+            if (stream_synchronous) CubLog("Invoking init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, init_kernel_threads, (long long) stream);
+
+            // Invoke init_kernel to initialize tile descriptors and queue descriptors
+            init_kernel<<<init_grid_size, init_kernel_threads, 0, stream>>>(
+                queue,
+                d_tile_status,
+                num_tiles);
+
+            // Sync the stream if specified
+            if (stream_synchronous && (CubDebug(error = SyncStream(stream)))) break;
+
+            // Get grid size for multi-block kernel
+            int scan_grid_size;
+            int multi_sm_occupancy = -1;
+            if (ptx_version < 200)
+            {
+                // We don't have atomics (or don't have fast ones), so just assign one
+                // block per tile (limited to 65K tiles)
+                scan_grid_size = num_tiles;
+            }
+            else
+            {
+                // We have atomics and can thus reuse blocks across multiple tiles using a queue descriptor.
+                // Get GPU id
+                int device_ordinal;
+                if (CubDebug(error = cudaGetDevice(&device_ordinal))) break;
+
+                // Get SM count
+                int sm_count;
+                if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break;
+
+                // Get a rough estimate of partition_kernel SM occupancy based upon the maximum SM occupancy of the targeted PTX architecture
+                multi_sm_occupancy = CUB_MIN(
+                    ArchProps<CUB_PTX_ARCH>::MAX_SM_THREADBLOCKS,
+                    ArchProps<CUB_PTX_ARCH>::MAX_SM_THREADS / scan_dispatch_params.block_threads);
+
+#ifndef __CUDA_ARCH__
+                // We're on the host, so replace the rough estimate above with the occupancy reported by the device properties for partition_kernel
+                Device device_props;
+                if (CubDebug(error = device_props.Init(device_ordinal))) break;
+
+                if (CubDebug(error = device_props.MaxSmOccupancy(
+                    multi_sm_occupancy,
+                    partition_kernel,
+                    scan_dispatch_params.block_threads))) break;
+#endif
+                // Get device occupancy for partition_kernel
+                int scan_occupancy = multi_sm_occupancy * sm_count;
+
+                // Get grid size for partition_kernel
+                scan_grid_size = (num_tiles < scan_occupancy) ?
+                    num_tiles :                 // Not enough to fill the device with threadblocks
+                    scan_occupancy;      // Fill the device with threadblocks
+            }
+
+            // Log partition_kernel configuration
+            if (stream_synchronous) CubLog("Invoking partition_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n",
+                scan_grid_size, scan_dispatch_params.block_threads, (long long) stream, scan_dispatch_params.items_per_thread, multi_sm_occupancy);
+
+            // Invoke partition_kernel
+            partition_kernel<<<scan_grid_size, scan_dispatch_params.block_threads, 0, stream>>>(
+                d_in,
+                d_out,
+                d_partition_length,
+                d_tile_status,
+                pred_op,
+                num_items,
+                num_tiles,
+                queue);
+
+            // Sync the stream if specified
+            if (stream_synchronous && (CubDebug(error = SyncStream(stream)))) break;
+        }
+        while (0);
+
+        return error;
+
+#endif  // CUB_RUNTIME_ENABLED
+    }
+
+
+
+    /**
+     * Internal partition dispatch routine for using default tuning policies.
+     *
+     * Picks the dispatch parameters (straight from the compiled PartitionPolicy
+     * when compiled for the device, or from the queried PTX version when running
+     * on the host) and forwards to the kernel-pointer Dispatch overload.  The
+     * status returned by that overload is propagated to the caller.
+     */
+    template <
+        typename                    PARTITIONS,                     ///< Number of partitions we are keeping
+        typename                    InputIteratorRA,                ///< Random-access iterator type for input (may be a simple pointer type)
+        typename                    OutputIteratorRA,               ///< Random-access iterator type for output (may be a simple pointer type)
+        typename                    LengthOutputIterator,           ///< Output iterator type for recording the length of the first partition (may be a simple pointer type)
+        typename                    PredicateOp,                    ///< Unary predicate operator indicating membership in the first partition type having member <tt>bool operator()(const T &val)</tt>
+        typename                    SizeT>                          ///< Integer type used for global array indexing
+    __host__ __device__ __forceinline__
+    static cudaError_t Dispatch(
+        void                        *d_temp_storage,                ///< [in] %Device allocation of temporary storage.  When NULL, the required allocation size is returned in \p temp_storage_bytes and no work is done.
+        size_t                      &temp_storage_bytes,            ///< [in,out] Size in bytes of \p d_temp_storage allocation.
+        InputIteratorRA             d_in,                           ///< [in] Iterator pointing to input items
+        OutputIteratorRA            d_out,                          ///< [in] Iterator pointing to output items
+        LengthOutputIterator        d_partition_length,             ///< [out] Output iterator referencing the location where the pivot offset (i.e., the length of the first partition) is to be recorded
+        PredicateOp                 pred_op,                        ///< [in] Unary predicate operator indicating membership in the first partition
+        SizeT                       num_items,                      ///< [in] Total number of items to partition
+        cudaStream_t                stream              = 0,        ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                        stream_synchronous  = false)    ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Default is \p false.
+    {
+        // Data type
+        typedef typename std::iterator_traits<InputIteratorRA>::value_type T;
+
+        // Tuning policies
+        typedef PtxDefaultPolicies<PARTITIONS, T, SizeT>        PtxDefaultPolicies;     // Wrapper of default kernel policies
+        typedef typename PtxDefaultPolicies::PartitionPolicy    PartitionPolicy;        // Partition kernel policy
+
+        cudaError error = cudaSuccess;
+        do
+        {
+            // Declare dispatch parameters
+            KernelDispachParams scan_dispatch_params;
+
+            int ptx_version;
+#ifdef __CUDA_ARCH__
+            // We're on the device, so initialize the dispatch parameters with the PtxDefaultPolicies directly
+            scan_dispatch_params.Init<PartitionPolicy>();
+            ptx_version = CUB_PTX_ARCH;
+#else
+            // We're on the host, so lookup and initialize the dispatch parameters with the policies that match the device's PTX version
+            if (CubDebug(error = PtxVersion(ptx_version))) break;
+            PtxDefaultPolicies::InitDispatchParams(ptx_version, scan_dispatch_params);
+#endif
+
+            // Capture the status of the kernel-pointer dispatch.  Previously the
+            // return value was dropped, so failures were reported as cudaSuccess.
+            error = Dispatch(
+                ptx_version,
+                d_temp_storage,
+                temp_storage_bytes,
+                ScanInitKernel<T, SizeT>,
+                PartitionKernel<PartitionPolicy, InputIteratorRA, OutputIteratorRA, LengthOutputIterator, PredicateOp, SizeT>,
+                scan_dispatch_params,
+                d_in,
+                d_out,
+                d_partition_length,
+                pred_op,
+                num_items,
+                stream,
+                stream_synchronous);
+
+            if (CubDebug(error)) break;
+        }
+        while (0);
+
+        return error;
+    }
+
+    #endif // DOXYGEN_SHOULD_SKIP_THIS
+
+
+    /**
+     * \brief Splits a list of input items into two partitions within the given output list using the specified predicate.  The relative ordering of inputs is not necessarily preserved.
+     *
+     * An item \p val is placed in the first partition if <tt>pred_op(val) == true</tt>, otherwise
+     * it is placed in the second partition.  The offset of the partitioning pivot (equivalent to
+     * the total length of the first partition as well as the starting offset of the second), is
+     * recorded to \p d_pivot_offset.
+     *
+     * The length of the output referenced by \p d_out is assumed to be the same as that of \p d_in.
+     *
+     * \devicestorage
+     *
+     * \tparam InputIteratorRA      <b>[inferred]</b> Random-access iterator type for input (may be a simple pointer type)
+     * \tparam OutputIteratorRA     <b>[inferred]</b> Random-access iterator type for output (may be a simple pointer type)
+     * \tparam LengthOutputIterator <b>[inferred]</b> Output iterator type for recording the length of the first partition (may be a simple pointer type)
+     * \tparam PredicateOp          <b>[inferred]</b> Unary predicate operator indicating membership in the first partition type having member <tt>bool operator()(const T &val)</tt>
+     */
+    template <
+        typename                InputIteratorRA,
+        typename                OutputIteratorRA,
+        typename                LengthOutputIterator,
+        typename                PredicateOp>
+    __host__ __device__ __forceinline__
+    static cudaError_t Partition(
+        void                    *d_temp_storage,                ///< [in] %Device allocation of temporary storage.  When NULL, the required allocation size is returned in \p temp_storage_bytes and no work is done.
+        size_t                  &temp_storage_bytes,            ///< [in,out] Size in bytes of \p d_temp_storage allocation.
+        InputIteratorRA         d_in,                           ///< [in] Iterator pointing to input items
+        OutputIteratorRA        d_out,                          ///< [in] Iterator pointing to output items
+        LengthOutputIterator    d_pivot_offset,                 ///< [out] Output iterator referencing the location where the pivot offset is to be recorded
+        PredicateOp             pred_op,                        ///< [in] Unary predicate operator indicating membership in the first partition
+        int                     num_items,                      ///< [in] Total number of items to partition
+        cudaStream_t            stream              = 0,        ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                    stream_synchronous  = false)    ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
+    {
+        // Forward the caller's pivot-offset iterator and predicate.  The previous
+        // call passed Sum() and T() in these positions, so d_pivot_offset and
+        // pred_op were silently ignored.
+        //
+        // NOTE(review): the default-policy Dispatch overload declares a leading
+        // template parameter (PARTITIONS) that does not appear in its argument
+        // list, so it cannot be deduced from this call -- confirm how it is
+        // meant to be supplied before relying on this entry point.
+        return Dispatch(d_temp_storage, temp_storage_bytes, d_in, d_out, d_pivot_offset, pred_op, num_items, stream, stream_synchronous);
+    }
+
+
+};
+
+
+/** @} */       // DeviceModule
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
+
--- a/lib/kokkos/TPL/cub/device/device_scan.cuh
+++ b/lib/kokkos/TPL/cub/device/device_scan.cuh
@ -0,0 +1,812 @@
+
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2013, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::DeviceScan provides operations for computing a device-wide, parallel prefix scan across data items residing within global memory.
+ */
+
+#pragma once
+
+#include <stdio.h>
+#include <iterator>
+
+#include "block/block_scan_tiles.cuh"
+#include "../thread/thread_operators.cuh"
+#include "../grid/grid_queue.cuh"
+#include "../util_debug.cuh"
+#include "../util_device.cuh"
+#include "../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/******************************************************************************
+ * Kernel entry points
+ *****************************************************************************/
+
+#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
+
+
+/**
+ * Tile-status initialization kernel (multi-block).
+ *
+ * - Thread 0 of block 0 calls grid_queue.ResetDrain(num_tiles).
+ * - Every thread whose global index is below num_tiles marks the descriptor
+ *   at [TILE_STATUS_PADDING + index] as SCAN_TILE_INVALID.
+ * - The first TILE_STATUS_PADDING threads of block 0 mark the leading padding
+ *   descriptors as SCAN_TILE_OOB.
+ */
+template <
+    typename T,                                     ///< Scan value type
+    typename SizeT>                                 ///< Integer type used for global array indexing
+__global__ void ScanInitKernel(
+    GridQueue<SizeT>            grid_queue,         ///< [in] Descriptor for performing dynamic mapping of input tiles to thread blocks
+    ScanTileDescriptor<T>       *d_tile_status,     ///< [out] Tile status words
+    int                         num_tiles)          ///< [in] Number of tiles
+{
+    typedef ScanTileDescriptor<T> ScanTileDescriptorT;
+
+    // Number of padding descriptors that precede the per-tile descriptors
+    enum { TILE_STATUS_PADDING = PtxArchProps::WARP_THREADS };
+
+    const bool in_first_block = (blockIdx.x == 0);
+
+    // A single thread resets the queue descriptor
+    if (in_first_block && (threadIdx.x == 0))
+    {
+        grid_queue.ResetDrain(num_tiles);
+    }
+
+    // One thread per tile: flag the tile as not-yet-set
+    const int global_tid = (blockIdx.x * blockDim.x) + threadIdx.x;
+    if (global_tid < num_tiles)
+    {
+        d_tile_status[TILE_STATUS_PADDING + global_tid].status = SCAN_TILE_INVALID;
+    }
+
+    // Leading threads of the first block: flag the padding entries as out-of-bounds
+    if (in_first_block && (threadIdx.x < TILE_STATUS_PADDING))
+    {
+        d_tile_status[threadIdx.x].status = SCAN_TILE_OOB;
+    }
+}
+
+
+/**
+ * Scan kernel entry point (multi-block).
+ *
+ * Builds a BlockScanTiles instance over block-shared temporary storage and has
+ * it consume tiles, handing it the tile-status array offset past the
+ * TILE_STATUS_PADDING leading entries.
+ */
+template <
+    typename    BlockScanTilesPolicy,           ///< Tuning policy for cub::BlockScanTiles abstraction
+    typename    InputIteratorRA,                ///< Random-access iterator type for input (may be a simple pointer type)
+    typename    OutputIteratorRA,               ///< Random-access iterator type for output (may be a simple pointer type)
+    typename    T,                              ///< The scan data type
+    typename    ScanOp,                         ///< Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt>
+    typename    Identity,                       ///< Identity value type (cub::NullType for inclusive scans)
+    typename    SizeT>                          ///< Integer type used for global array indexing
+__launch_bounds__ (int(BlockScanTilesPolicy::BLOCK_THREADS))
+__global__ void ScanKernel(
+    InputIteratorRA             d_in,           ///< Input data
+    OutputIteratorRA            d_out,          ///< Output data
+    ScanTileDescriptor<T>       *d_tile_status, ///< Global list of tile status
+    ScanOp                      scan_op,        ///< Binary scan operator
+    Identity                    identity,       ///< Identity element
+    SizeT                       num_items,      ///< Total number of scan items for the entire problem
+    GridQueue<int>              queue)          ///< Descriptor for performing dynamic mapping of tile data to thread blocks
+{
+    // Number of padding descriptors that precede the per-tile descriptors
+    enum { TILE_STATUS_PADDING = PtxArchProps::WARP_THREADS };
+
+    // Thread block type for scanning input tiles
+    typedef BlockScanTiles<BlockScanTilesPolicy, InputIteratorRA, OutputIteratorRA, ScanOp, Identity, SizeT> BlockScanTilesT;
+
+    // Shared memory for BlockScanTiles
+    __shared__ typename BlockScanTilesT::TempStorage temp_storage;
+
+    // Skip the padding entries at the front of the tile-status array
+    ScanTileDescriptor<T> *d_first_tile_status = d_tile_status + TILE_STATUS_PADDING;
+
+    // Process tiles
+    BlockScanTilesT tile_scanner(temp_storage, d_in, d_out, scan_op, identity);
+    tile_scanner.ConsumeTiles(num_items, queue, d_first_tile_status);
+}
+
+
+#endif // DOXYGEN_SHOULD_SKIP_THIS
+
+
+
+/******************************************************************************
+ * DeviceScan
+ *****************************************************************************/
+
+/**
+ * \brief DeviceScan provides operations for computing a device-wide, parallel prefix scan across data items residing within global memory. ![](device_scan.png)
+ * \ingroup DeviceModule
+ *
+ * \par Overview
+ * Given a list of input elements and a binary reduction operator, a [<em>prefix scan</em>](http://en.wikipedia.org/wiki/Prefix_sum)
+ * produces an output list where each element is computed to be the reduction
+ * of the elements occurring earlier in the input list.  <em>Prefix sum</em>
+ * connotes a prefix scan with the addition operator. The term \em inclusive indicates
+ * that the <em>i</em><sup>th</sup> output reduction incorporates the <em>i</em><sup>th</sup> input.
+ * The term \em exclusive indicates the <em>i</em><sup>th</sup> input is not incorporated into
+ * the <em>i</em><sup>th</sup> output reduction.
+ *
+ * \par Usage Considerations
+ * \cdp_class{DeviceScan}
+ *
+ * \par Performance
+ *
+ * \image html scan_perf.png
+ *
+ */
+struct DeviceScan
+{
+#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
+
+    /******************************************************************************
+     * Constants and typedefs
+     ******************************************************************************/
+
+    /// Run-time copy of the compile-time constants of a BlockScanTilesPolicy, used to configure kernel dispatch.
+    struct KernelDispachParams
+    {
+        // Values copied from the policy
+        int                     block_threads;
+        int                     items_per_thread;
+        BlockLoadAlgorithm      load_policy;
+        BlockStoreAlgorithm     store_policy;
+        BlockScanAlgorithm      scan_algorithm;
+
+        // Derived: block_threads * items_per_thread
+        int                     tile_size;
+
+        /// Populate every field from the given policy type
+        template <typename BlockScanTilesPolicy>
+        __host__ __device__ __forceinline__
+        void Init()
+        {
+            block_threads    = BlockScanTilesPolicy::BLOCK_THREADS;
+            items_per_thread = BlockScanTilesPolicy::ITEMS_PER_THREAD;
+            tile_size        = block_threads * items_per_thread;
+
+            load_policy      = BlockScanTilesPolicy::LOAD_ALGORITHM;
+            store_policy     = BlockScanTilesPolicy::STORE_ALGORITHM;
+            scan_algorithm   = BlockScanTilesPolicy::SCAN_ALGORITHM;
+        }
+
+        /// Print the policy fields (tile_size excluded) as a comma-separated list
+        __host__ __device__ __forceinline__
+        void Print()
+        {
+            printf("%d, %d, %d, %d, %d",
+                block_threads, items_per_thread, load_policy, store_policy, scan_algorithm);
+        }
+
+    };
+
+
+    /******************************************************************************
+     * Tuning policies
+     ******************************************************************************/
+
+
+    /// Specializations of tuned policy types for different PTX architectures.
+    /// Only the primary template is declared here; each supported ARCH value
+    /// is provided as a partial specialization that defines a ScanPolicy typedef.
+    template <
+        typename    T,          // Scan value type (its size scales ITEMS_PER_THREAD in the specializations)
+        typename    SizeT,      // Integer type used for global array indexing
+        int         ARCH>       // PTX architecture tier the policy is tuned for (e.g. 350, 300, 200, 100)
+    struct TunedPolicies;
+
+    /// Tuned scan policy for the SM35 tier
+    template <typename T, typename SizeT>
+    struct TunedPolicies<T, SizeT, 350>
+    {
+        enum
+        {
+            // Items per thread when T is 4 bytes wide
+            NOMINAL_4B_ITEMS_PER_THREAD = 16,
+
+            // Nominal count scaled by 4 / sizeof(T), kept within [1, nominal]
+            ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T))))
+        };
+
+        // ScanPolicy: GTX Titan: 29.1B items/s (232.4 GB/s) @ 48M 32-bit T
+        typedef BlockScanTilesPolicy<
+            128,
+            ITEMS_PER_THREAD,
+            BLOCK_LOAD_DIRECT,
+            false,
+            LOAD_LDG,
+            BLOCK_STORE_WARP_TRANSPOSE,
+            true,
+            BLOCK_SCAN_RAKING_MEMOIZE> ScanPolicy;
+    };
+
+    /// Tuned scan policy for the SM30 tier
+    template <typename T, typename SizeT>
+    struct TunedPolicies<T, SizeT, 300>
+    {
+        enum
+        {
+            // Items per thread when T is 4 bytes wide
+            NOMINAL_4B_ITEMS_PER_THREAD = 9,
+
+            // Nominal count scaled by 4 / sizeof(T), kept within [1, nominal]
+            ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T))))
+        };
+
+        typedef BlockScanTilesPolicy<
+            256,
+            ITEMS_PER_THREAD,
+            BLOCK_LOAD_WARP_TRANSPOSE,
+            false,
+            LOAD_DEFAULT,
+            BLOCK_STORE_WARP_TRANSPOSE,
+            false,
+            BLOCK_SCAN_RAKING_MEMOIZE> ScanPolicy;
+    };
+
+    /// Tuned scan policy for the SM20 tier
+    template <typename T, typename SizeT>
+    struct TunedPolicies<T, SizeT, 200>
+    {
+        enum
+        {
+            // Items per thread when T is 4 bytes wide
+            NOMINAL_4B_ITEMS_PER_THREAD = 15,
+
+            // Nominal count scaled by 4 / sizeof(T), kept within [1, nominal]
+            ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T))))
+        };
+
+        // ScanPolicy: GTX 580: 20.3B items/s (162.3 GB/s) @ 48M 32-bit T
+        typedef BlockScanTilesPolicy<
+            128,
+            ITEMS_PER_THREAD,
+            BLOCK_LOAD_WARP_TRANSPOSE,
+            false,
+            LOAD_DEFAULT,
+            BLOCK_STORE_WARP_TRANSPOSE,
+            false,
+            BLOCK_SCAN_RAKING_MEMOIZE> ScanPolicy;
+    };
+
+    /// Tuned scan policy for the SM10 tier
+    template <typename T, typename SizeT>
+    struct TunedPolicies<T, SizeT, 100>
+    {
+        enum
+        {
+            // Items per thread when T is 4 bytes wide
+            NOMINAL_4B_ITEMS_PER_THREAD = 7,
+
+            // Nominal count scaled by 4 / sizeof(T), kept within [1, nominal]
+            ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T))))
+        };
+
+        typedef BlockScanTilesPolicy<
+            128,
+            ITEMS_PER_THREAD,
+            BLOCK_LOAD_TRANSPOSE,
+            false,
+            LOAD_DEFAULT,
+            BLOCK_STORE_TRANSPOSE,
+            false,
+            BLOCK_SCAN_RAKING> ScanPolicy;
+    };
+
+
+    /// Selects the tuned policy for the PTX architecture that DeviceScan operations will get dispatched to
+    template <typename T, typename SizeT>
+    struct PtxDefaultPolicies
+    {
+        // Tuning tier for the current compiler pass: the highest of 350 / 300 / 200
+        // that CUB_PTX_ARCH reaches, otherwise 100
+        static const int PTX_TUNE_ARCH =
+            (CUB_PTX_ARCH >= 350) ? 350 :
+            (CUB_PTX_ARCH >= 300) ? 300 :
+            (CUB_PTX_ARCH >= 200) ? 200 :
+                                    100;
+
+        // Tuned policy set for the current PTX compiler pass
+        typedef TunedPolicies<T, SizeT, PTX_TUNE_ARCH> PtxTunedPolicies;
+
+        // ScanPolicy that opaquely derives from the specialization corresponding to the current PTX compiler pass
+        struct ScanPolicy : PtxTunedPolicies::ScanPolicy {};
+
+        /**
+         * Fill \p scan_dispatch_params from the tuned ScanPolicy of the tier
+         * that \p ptx_version falls into (350, 300, 200, otherwise 100).
+         */
+        static void InitDispatchParams(int ptx_version, KernelDispachParams &scan_dispatch_params)
+        {
+            if (ptx_version >= 350)
+            {
+                typedef TunedPolicies<T, SizeT, 350> Sm35Policies;
+                scan_dispatch_params.Init<typename Sm35Policies::ScanPolicy>();
+                return;
+            }
+
+            if (ptx_version >= 300)
+            {
+                typedef TunedPolicies<T, SizeT, 300> Sm30Policies;
+                scan_dispatch_params.Init<typename Sm30Policies::ScanPolicy>();
+                return;
+            }
+
+            if (ptx_version >= 200)
+            {
+                typedef TunedPolicies<T, SizeT, 200> Sm20Policies;
+                scan_dispatch_params.Init<typename Sm20Policies::ScanPolicy>();
+                return;
+            }
+
+            // Everything older falls back to the SM10 tune
+            typedef TunedPolicies<T, SizeT, 100> Sm10Policies;
+            scan_dispatch_params.Init<typename Sm10Policies::ScanPolicy>();
+        }
+    };
+
+
+    /******************************************************************************
+     * Utility methods
+     ******************************************************************************/
+
+    /**
+     * Internal dispatch routine.
+     *
+     * Steps:
+     *   1. Compute the number of tiles and carve the tile-status array and the
+     *      grid-queue descriptor out of \p d_temp_storage (or, when
+     *      \p d_temp_storage is NULL, only report the required size).
+     *   2. Return immediately for an empty problem.
+     *   3. Launch \p init_kernel to initialize the tile status and queue.
+     *   4. Pick a grid size for \p scan_kernel (one block per tile below
+     *      PTX 200, otherwise at most occupancy * SM count) and launch it.
+     *
+     * Each launch is followed by a cudaPeekAtLastError() check so that launch
+     * failures are returned to the caller instead of being dropped.
+     *
+     * \return cudaSuccess, or the first error reported by the runtime;
+     *         cudaErrorNotSupported when CUB_RUNTIME_ENABLED is not defined.
+     */
+    template <
+        typename                    ScanInitKernelPtr,              ///< Function type of cub::ScanInitKernel
+        typename                    ScanKernelPtr,                  ///< Function type of cub::ScanKernel
+        typename                    InputIteratorRA,                ///< Random-access iterator type for input (may be a simple pointer type)
+        typename                    OutputIteratorRA,               ///< Random-access iterator type for output (may be a simple pointer type)
+        typename                    ScanOp,                         ///< Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt>
+        typename                    Identity,                       ///< Identity value type (cub::NullType for inclusive scans)
+        typename                    SizeT>                          ///< Integer type used for global array indexing
+    __host__ __device__ __forceinline__
+    static cudaError_t Dispatch(
+        int                         ptx_version,                    ///< [in] PTX version
+        void                        *d_temp_storage,                ///< [in] %Device allocation of temporary storage.  When NULL, the required allocation size is returned in \p temp_storage_bytes and no work is done.
+        size_t                      &temp_storage_bytes,            ///< [in,out] Size in bytes of \p d_temp_storage allocation.
+        ScanInitKernelPtr           init_kernel,                    ///< [in] Kernel function pointer to parameterization of cub::ScanInitKernel
+        ScanKernelPtr               scan_kernel,                    ///< [in] Kernel function pointer to parameterization of cub::ScanKernel
+        KernelDispachParams         &scan_dispatch_params,          ///< [in] Dispatch parameters that match the policy that \p scan_kernel was compiled for
+        InputIteratorRA             d_in,                           ///< [in] Iterator pointing to scan input
+        OutputIteratorRA            d_out,                          ///< [in] Iterator pointing to scan output
+        ScanOp                      scan_op,                        ///< [in] Binary scan operator
+        Identity                    identity,                       ///< [in] Identity element
+        SizeT                       num_items,                      ///< [in] Total number of items to scan
+        cudaStream_t                stream              = 0,        ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                        stream_synchronous  = false)    ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Default is \p false.
+    {
+
+#ifndef CUB_RUNTIME_ENABLED
+
+        // Kernel launch not supported from this device
+        return CubDebug(cudaErrorNotSupported);
+
+#else
+
+        enum
+        {
+            // Number of padding descriptors in front of the per-tile descriptors.
+            // The kernels skip PtxArchProps::WARP_THREADS entries, so this value
+            // has to agree with that constant.
+            TILE_STATUS_PADDING     = 32,
+            INIT_KERNEL_THREADS     = 128
+        };
+
+        // Data type
+        typedef typename std::iterator_traits<InputIteratorRA>::value_type T;
+
+        // Tile status descriptor type
+        typedef ScanTileDescriptor<T> ScanTileDescriptorT;
+
+        cudaError error = cudaSuccess;
+        do
+        {
+            // Number of input tiles
+            int num_tiles = (num_items + scan_dispatch_params.tile_size - 1) / scan_dispatch_params.tile_size;
+
+            // Temporary storage allocation requirements
+            void* allocations[2];
+            size_t allocation_sizes[2] =
+            {
+                (num_tiles + TILE_STATUS_PADDING) * sizeof(ScanTileDescriptorT),      // bytes needed for tile status descriptors
+                GridQueue<int>::AllocationSize()                                      // bytes needed for grid queue descriptor
+            };
+
+            // Alias temporaries (or set the necessary size of the storage allocation)
+            if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break;
+
+            // Return if the caller is simply requesting the size of the storage allocation
+            if (d_temp_storage == NULL)
+                return cudaSuccess;
+
+            // Return if the problem is empty: zero tiles would otherwise produce
+            // zero-block kernel launches, which are invalid launch configurations
+            if (num_items == 0)
+                break;
+
+            // Global list of tile status
+            ScanTileDescriptorT *d_tile_status = (ScanTileDescriptorT*) allocations[0];
+
+            // Grid queue descriptor
+            GridQueue<int> queue(allocations[1]);
+
+            // Log init_kernel configuration
+            int init_grid_size = (num_tiles + INIT_KERNEL_THREADS - 1) / INIT_KERNEL_THREADS;
+            if (stream_synchronous) CubLog("Invoking init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (long long) stream);
+
+            // Invoke init_kernel to initialize tile descriptors and queue descriptors
+            init_kernel<<<init_grid_size, INIT_KERNEL_THREADS, 0, stream>>>(
+                queue,
+                d_tile_status,
+                num_tiles);
+
+            // Check for failure to launch
+            if (CubDebug(error = cudaPeekAtLastError())) break;
+
+            // Sync the stream if specified
+            if (stream_synchronous && (CubDebug(error = SyncStream(stream)))) break;
+
+            // Get grid size for multi-block kernel
+            int scan_grid_size;
+            int multi_sm_occupancy = -1;
+            if (ptx_version < 200)
+            {
+                // We don't have atomics (or don't have fast ones), so just assign one
+                // block per tile (limited to 65K tiles)
+                scan_grid_size = num_tiles;
+            }
+            else
+            {
+                // We have atomics and can thus reuse blocks across multiple tiles using a queue descriptor.
+                // Get GPU id
+                int device_ordinal;
+                if (CubDebug(error = cudaGetDevice(&device_ordinal))) break;
+
+                // Get SM count
+                int sm_count;
+                if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break;
+
+                // Get a rough estimate of scan_kernel SM occupancy based upon the maximum SM occupancy of the targeted PTX architecture
+                multi_sm_occupancy = CUB_MIN(
+                    ArchProps<CUB_PTX_ARCH>::MAX_SM_THREADBLOCKS,
+                    ArchProps<CUB_PTX_ARCH>::MAX_SM_THREADS / scan_dispatch_params.block_threads);
+
+#ifndef __CUDA_ARCH__
+                // We're on the host, so replace the rough estimate above with the
+                // occupancy reported by the device properties for scan_kernel
+                Device device_props;
+                if (CubDebug(error = device_props.Init(device_ordinal))) break;
+
+                if (CubDebug(error = device_props.MaxSmOccupancy(
+                    multi_sm_occupancy,
+                    scan_kernel,
+                    scan_dispatch_params.block_threads))) break;
+#endif
+                // Get device occupancy for scan_kernel
+                int scan_occupancy = multi_sm_occupancy * sm_count;
+
+                // Get grid size for scan_kernel
+                scan_grid_size = (num_tiles < scan_occupancy) ?
+                    num_tiles :                 // Not enough to fill the device with threadblocks
+                    scan_occupancy;      // Fill the device with threadblocks
+            }
+
+            // Log scan_kernel configuration
+            if (stream_synchronous) CubLog("Invoking scan_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n",
+                scan_grid_size, scan_dispatch_params.block_threads, (long long) stream, scan_dispatch_params.items_per_thread, multi_sm_occupancy);
+
+            // Invoke scan_kernel
+            scan_kernel<<<scan_grid_size, scan_dispatch_params.block_threads, 0, stream>>>(
+                d_in,
+                d_out,
+                d_tile_status,
+                scan_op,
+                identity,
+                num_items,
+                queue);
+
+            // Check for failure to launch
+            if (CubDebug(error = cudaPeekAtLastError())) break;
+
+            // Sync the stream if specified
+            if (stream_synchronous && (CubDebug(error = SyncStream(stream)))) break;
+        }
+        while (0);
+
+        return error;
+
+#endif  // CUB_RUNTIME_ENABLED
+    }
+
+
+
+    /**
+     * Internal scan dispatch routine for using default tuning policies
+     */
+    template <
+        typename                    InputIteratorRA,                ///< Random-access iterator type for input (may be a simple pointer type)
+        typename                    OutputIteratorRA,               ///< Random-access iterator type for output (may be a simple pointer type)
+        typename                    ScanOp,                         ///< Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt>
+        typename                    Identity,                       ///< Identity value type (cub::NullType for inclusive scans)
+        typename                    SizeT>                          ///< Integer type used for global array indexing
+    __host__ __device__ __forceinline__
+    static cudaError_t Dispatch(
+        void                        *d_temp_storage,                ///< [in] %Device allocation of temporary storage.  When NULL, the required allocation size is returned in \p temp_storage_bytes and no work is done.
+        size_t                      &temp_storage_bytes,            ///< [in,out] Size in bytes of \p d_temp_storage allocation.
+        InputIteratorRA             d_in,                           ///< [in] Iterator pointing to scan input
+        OutputIteratorRA            d_out,                          ///< [in] Iterator pointing to scan output
+        ScanOp                      scan_op,                        ///< [in] Binary scan operator
+        Identity                    identity,                       ///< [in] Identity element
+        SizeT                       num_items,                      ///< [in] Total number of items to scan
+        cudaStream_t                stream              = 0,        ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                        stream_synchronous  = false)    ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Default is \p false.
+    {
+        // Data type
+        typedef typename std::iterator_traits<InputIteratorRA>::value_type T;
+
+        // Tuning policies
+        typedef PtxDefaultPolicies<T, SizeT>                    PtxDefaultPolicies;     // Wrapper of default kernel policies
+        typedef typename PtxDefaultPolicies::ScanPolicy   ScanPolicy;       // Scan kernel policy
+
+        cudaError error = cudaSuccess;
+        do
+        {
+            // Declare dispatch parameters
+            KernelDispachParams scan_dispatch_params;
+
+            int ptx_version;
+#ifdef __CUDA_ARCH__
+            // We're on the device, so initialize the dispatch parameters with the PtxDefaultPolicies directly
+            scan_dispatch_params.Init<ScanPolicy>();
+            ptx_version = CUB_PTX_ARCH;
+#else
+            // We're on the host, so lookup and initialize the dispatch parameters with the policies that match the device's PTX version
+            if (CubDebug(error = PtxVersion(ptx_version))) break;
+            PtxDefaultPolicies::InitDispatchParams(ptx_version, scan_dispatch_params);
+#endif
+
+            // Forward to the fully-specified dispatch routine.  Its status must be
+            // captured: if the return value is dropped, the check below only ever
+            // sees the stale value of 'error' and failures in the real dispatch
+            // are reported to the caller as cudaSuccess.
+            error = Dispatch(
+                ptx_version,
+                d_temp_storage,
+                temp_storage_bytes,
+                ScanInitKernel<T, SizeT>,
+                ScanKernel<ScanPolicy, InputIteratorRA, OutputIteratorRA, T, ScanOp, Identity, SizeT>,
+                scan_dispatch_params,
+                d_in,
+                d_out,
+                scan_op,
+                identity,
+                num_items,
+                stream,
+                stream_synchronous);
+
+            if (CubDebug(error)) break;
+        }
+        while (0);
+
+        return error;
+    }
+
+    #endif // DOXYGEN_SHOULD_SKIP_THIS
+
+
+    /******************************************************************//**
+     * \name Exclusive scans
+     *********************************************************************/
+    //@{
+
+    /**
+     * \brief Computes a device-wide exclusive prefix sum.
+     *
+     * \devicestorage
+     *
+     * \cdp
+     *
+     * \iterator
+     *
+     * \par
+     * The code snippet below illustrates the exclusive prefix sum of a device vector of \p int items.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>
+     * ...
+     *
+     * // Declare and initialize device pointers for input and output
+     * int *d_scan_input, *d_scan_output;
+     * int num_items = ...
+     *
+     * ...
+     *
+     * // Determine temporary device storage requirements for exclusive prefix sum
+     * void *d_temp_storage = NULL;
+     * size_t temp_storage_bytes = 0;
+     * cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, d_scan_input, d_scan_output, num_items);
+     *
+     * // Allocate temporary storage for exclusive prefix sum
+     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+     *
+     * // Run exclusive prefix sum
+     * cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, d_scan_input, d_scan_output, num_items);
+     *
+     * \endcode
+     *
+     * \tparam InputIteratorRA      <b>[inferred]</b> Random-access iterator type for input (may be a simple pointer type)
+     * \tparam OutputIteratorRA     <b>[inferred]</b> Random-access iterator type for output (may be a simple pointer type)
+     */
+    template <
+        typename            InputIteratorRA,
+        typename            OutputIteratorRA>
+    __host__ __device__ __forceinline__
+    static cudaError_t ExclusiveSum(
+        void                *d_temp_storage,                    ///< [in] %Device temporary storage.  Pass NULL to have the required size written to \p temp_storage_bytes without doing any work.
+        size_t              &temp_storage_bytes,                ///< [in,out] Size in bytes of the \p d_temp_storage allocation.
+        InputIteratorRA     d_in,                               ///< [in] Iterator to the beginning of the scan input
+        OutputIteratorRA    d_out,                              ///< [in] Iterator to the beginning of the scan output
+        int                 num_items,                          ///< [in] Total number of items to scan
+        cudaStream_t        stream              = 0,            ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                stream_synchronous  = false)        ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
+    {
+        // Element type obtained from the input iterator
+        typedef typename std::iterator_traits<InputIteratorRA>::value_type ValueType;
+
+        // An exclusive sum is dispatched as a scan with the Sum operator and a
+        // value-initialized element as the identity.
+        return Dispatch(
+            d_temp_storage,
+            temp_storage_bytes,
+            d_in,
+            d_out,
+            Sum(),
+            ValueType(),
+            num_items,
+            stream,
+            stream_synchronous);
+    }
+
+
+    /**
+     * \brief Computes a device-wide exclusive prefix scan using the specified binary \p scan_op functor.
+     *
+     * \par
+     * Supports non-commutative scan operators.
+     *
+     * \devicestorage
+     *
+     * \cdp
+     *
+     * \iterator
+     *
+     * \par
+     * The code snippet below illustrates the exclusive prefix scan of a device vector of \p int items.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>
+     * ...
+     *
+     * // Declare and initialize device pointers for input and output
+     * int *d_scan_input, *d_scan_output;
+     * int num_items = ...
+     *
+     * ...
+     *
+     * // Determine temporary device storage requirements for exclusive prefix scan
+     * void *d_temp_storage = NULL;
+     * size_t temp_storage_bytes = 0;
+     * cub::DeviceScan::ExclusiveScan(d_temp_storage, temp_storage_bytes, d_scan_input, d_scan_output, cub::Max(), (int) INT_MIN, num_items);
+     *
+     * // Allocate temporary storage for exclusive prefix scan
+     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+     *
+     * // Run exclusive prefix scan (max)
+     * cub::DeviceScan::ExclusiveScan(d_temp_storage, temp_storage_bytes, d_scan_input, d_scan_output, cub::Max(), (int) INT_MIN, num_items);
+     *
+     * \endcode
+     *
+     * \tparam InputIteratorRA      <b>[inferred]</b> Random-access iterator type for input (may be a simple pointer type)
+     * \tparam OutputIteratorRA     <b>[inferred]</b> Random-access iterator type for output (may be a simple pointer type)
+     * \tparam ScanOp               <b>[inferred]</b> Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt>
+     * \tparam Identity             <b>[inferred]</b> Type of the \p identity value (the identity element for \p scan_op)
+     */
+    template <
+        typename            InputIteratorRA,
+        typename            OutputIteratorRA,
+        typename            ScanOp,
+        typename            Identity>
+    __host__ __device__ __forceinline__
+    static cudaError_t ExclusiveScan(
+        void                *d_temp_storage,                    ///< [in] %Device temporary storage.  Pass NULL to have the required size written to \p temp_storage_bytes without doing any work.
+        size_t              &temp_storage_bytes,                ///< [in,out] Size in bytes of the \p d_temp_storage allocation.
+        InputIteratorRA     d_in,                               ///< [in] Iterator to the beginning of the scan input
+        OutputIteratorRA    d_out,                              ///< [in] Iterator to the beginning of the scan output
+        ScanOp              scan_op,                            ///< [in] Binary scan operator
+        Identity            identity,                           ///< [in] Identity element
+        int                 num_items,                          ///< [in] Total number of items to scan
+        cudaStream_t        stream              = 0,            ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                stream_synchronous  = false)        ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
+    {
+        // Pure pass-through: the caller supplies both the operator and its identity
+        return Dispatch(
+            d_temp_storage,
+            temp_storage_bytes,
+            d_in,
+            d_out,
+            scan_op,
+            identity,
+            num_items,
+            stream,
+            stream_synchronous);
+    }
+
+
+    //@}  end member group
+    /******************************************************************//**
+     * \name Inclusive scans
+     *********************************************************************/
+    //@{
+
+
+    /**
+     * \brief Computes a device-wide inclusive prefix sum.
+     *
+     * \devicestorage
+     *
+     * \cdp
+     *
+     * \iterator
+     *
+     * \par
+     * The code snippet below illustrates the inclusive prefix sum of a device vector of \p int items.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>
+     * ...
+     *
+     * // Declare and initialize device pointers for input and output
+     * int *d_scan_input, *d_scan_output;
+     * int num_items = ...
+     * ...
+     *
+     * // Determine temporary device storage requirements for inclusive prefix sum
+     * void *d_temp_storage = NULL;
+     * size_t temp_storage_bytes = 0;
+     * cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, d_scan_input, d_scan_output, num_items);
+     *
+     * // Allocate temporary storage for inclusive prefix sum
+     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+     *
+     * // Run inclusive prefix sum
+     * cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, d_scan_input, d_scan_output, num_items);
+     *
+     * \endcode
+     *
+     * \tparam InputIteratorRA      <b>[inferred]</b> Random-access iterator type for input (may be a simple pointer type)
+     * \tparam OutputIteratorRA     <b>[inferred]</b> Random-access iterator type for output (may be a simple pointer type)
+     */
+    template <
+        typename            InputIteratorRA,
+        typename            OutputIteratorRA>
+    __host__ __device__ __forceinline__
+    static cudaError_t InclusiveSum(
+        void                *d_temp_storage,                    ///< [in] %Device temporary storage.  Pass NULL to have the required size written to \p temp_storage_bytes without doing any work.
+        size_t              &temp_storage_bytes,                ///< [in,out] Size in bytes of the \p d_temp_storage allocation.
+        InputIteratorRA     d_in,                               ///< [in] Iterator to the beginning of the scan input
+        OutputIteratorRA    d_out,                              ///< [in] Iterator to the beginning of the scan output
+        int                 num_items,                          ///< [in] Total number of items to scan
+        cudaStream_t        stream              = 0,            ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                stream_synchronous  = false)        ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
+    {
+        // Inclusive scans carry no identity element; NullType is passed in its place
+        return Dispatch(
+            d_temp_storage,
+            temp_storage_bytes,
+            d_in,
+            d_out,
+            Sum(),
+            NullType(),
+            num_items,
+            stream,
+            stream_synchronous);
+    }
+
+
+    /**
+     * \brief Computes a device-wide inclusive prefix scan using the specified binary \p scan_op functor.
+     *
+     * \par
+     * Supports non-commutative scan operators.
+     *
+     * \devicestorage
+     *
+     * \cdp
+     *
+     * \iterator
+     *
+     * \par
+     * The code snippet below illustrates the inclusive prefix scan of a device vector of \p int items.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>
+     * ...
+     *
+     * // Declare and initialize device pointers for input and output
+     * int *d_scan_input, *d_scan_output;
+     * int num_items = ...
+     * ...
+     *
+     * // Determine temporary device storage requirements for inclusive prefix scan
+     * void *d_temp_storage = NULL;
+     * size_t temp_storage_bytes = 0;
+     * cub::DeviceScan::InclusiveScan(d_temp_storage, temp_storage_bytes, d_scan_input, d_scan_output, cub::Max(), num_items);
+     *
+     * // Allocate temporary storage for inclusive prefix scan
+     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+     *
+     * // Run inclusive prefix scan (max)
+     * cub::DeviceScan::InclusiveScan(d_temp_storage, temp_storage_bytes, d_scan_input, d_scan_output, cub::Max(), num_items);
+     *
+     * \endcode
+     *
+     * \tparam InputIteratorRA      <b>[inferred]</b> Random-access iterator type for input (may be a simple pointer type)
+     * \tparam OutputIteratorRA     <b>[inferred]</b> Random-access iterator type for output (may be a simple pointer type)
+     * \tparam ScanOp               <b>[inferred]</b> Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt>
+     */
+    template <
+        typename            InputIteratorRA,
+        typename            OutputIteratorRA,
+        typename            ScanOp>
+    __host__ __device__ __forceinline__
+    static cudaError_t InclusiveScan(
+        void                *d_temp_storage,                    ///< [in] %Device temporary storage.  Pass NULL to have the required size written to \p temp_storage_bytes without doing any work.
+        size_t              &temp_storage_bytes,                ///< [in,out] Size in bytes of the \p d_temp_storage allocation.
+        InputIteratorRA     d_in,                               ///< [in] Iterator to the beginning of the scan input
+        OutputIteratorRA    d_out,                              ///< [in] Iterator to the beginning of the scan output
+        ScanOp              scan_op,                            ///< [in] Binary scan operator
+        int                 num_items,                          ///< [in] Total number of items to scan
+        cudaStream_t        stream              = 0,            ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                stream_synchronous  = false)        ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
+    {
+        // Inclusive scans carry no identity element; NullType is passed in its place
+        return Dispatch(
+            d_temp_storage,
+            temp_storage_bytes,
+            d_in,
+            d_out,
+            scan_op,
+            NullType(),
+            num_items,
+            stream,
+            stream_synchronous);
+    }
+
+};
+
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
+
--- a/lib/kokkos/TPL/cub/grid/grid_barrier.cuh
+++ b/lib/kokkos/TPL/cub/grid/grid_barrier.cuh
@ -0,0 +1,211 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2013, NVIDIA CORPORATION.  All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::GridBarrier implements a software global barrier among thread blocks within a CUDA grid
+ */
+
+#pragma once
+
+#include "../util_debug.cuh"
+#include "../util_namespace.cuh"
+#include "../thread/thread_load.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/**
+ * \addtogroup GridModule
+ * @{
+ */
+
+
+/**
+ * \brief GridBarrier implements a software global barrier among thread blocks within a CUDA grid
+ */
+class GridBarrier
+{
+protected :
+
+    // Type of a single per-block synchronization flag
+    typedef unsigned int SyncFlag;
+
+    // Counters in global device memory
+    // (one flag per thread block, indexed by blockIdx.x; Sync() expects every
+    // flag to be zero on entry and leaves every flag zero once block 0 has
+    // released its peers)
+    SyncFlag* d_sync;
+
+public:
+
+    /**
+     * Constructor.  Leaves \p d_sync NULL; this class never allocates it
+     * (GridBarrierLifetime::Setup() in this file does).
+     */
+    GridBarrier() : d_sync(NULL) {}
+
+
+    /**
+     * Synchronize all thread blocks of the grid.
+     *
+     * Must be called by every thread of every block.  Only the x-dimension
+     * of the block/grid indices is used, and \p d_sync must reference at
+     * least \p gridDim.x flags.
+     *
+     * NOTE(review): blocks spin-wait on flags written by other blocks, so this
+     * presumably requires all blocks of the grid to be resident at the same
+     * time -- confirm against the launch configuration used by callers.
+     */
+    __device__ __forceinline__ void Sync() const
+    {
+        // Volatile alias used for the flag stores below
+        volatile SyncFlag *d_vol_sync = d_sync;
+
+        // Threadfence and syncthreads to make sure global writes are visible before
+        // thread-0 reports in with its sync counter
+        __threadfence();
+        __syncthreads();
+
+        if (blockIdx.x == 0)
+        {
+            // Block 0 acts as the coordinator: it collects every arrival and
+            // then releases all blocks.
+
+            // Report in ourselves
+            if (threadIdx.x == 0)
+            {
+                d_vol_sync[blockIdx.x] = 1;
+            }
+
+            // Make sure our own flag is written before any thread polls it
+            __syncthreads();
+
+            // Wait for everyone else to report in
+            // (the flags are divided among the threads of block 0, each thread
+            // polling flags threadIdx.x, threadIdx.x + blockDim.x, ...)
+            for (int peer_block = threadIdx.x; peer_block < gridDim.x; peer_block += blockDim.x)
+            {
+                while (ThreadLoad<LOAD_CG>(d_sync + peer_block) == 0)
+                {
+                    __threadfence_block();
+                }
+            }
+
+            // All threads must have observed their share of arrivals before
+            // any flag is cleared
+            __syncthreads();
+
+            // Let everyone know it's safe to proceed
+            // (clearing a flag both releases the waiting block and restores
+            // the all-zero state for the next barrier)
+            for (int peer_block = threadIdx.x; peer_block < gridDim.x; peer_block += blockDim.x)
+            {
+                d_vol_sync[peer_block] = 0;
+            }
+        }
+        else
+        {
+            // Every other block: thread 0 signals arrival and spins until
+            // block 0 clears the flag; the rest of the block waits at the
+            // __syncthreads() below.
+            if (threadIdx.x == 0)
+            {
+                // Report in
+                d_vol_sync[blockIdx.x] = 1;
+
+                // Wait for acknowledgment
+                while (ThreadLoad<LOAD_CG>(d_sync + blockIdx.x) == 1)
+                {
+                    __threadfence_block();
+                }
+            }
+
+            __syncthreads();
+        }
+    }
+};
+
+
+/**
+ * \brief GridBarrierLifetime extends GridBarrier to provide lifetime management of the temporary device storage needed for cooperation.
+ *
+ * Uses RAII for lifetime, i.e., device resources are reclaimed when
+ * the destructor is called.
+ */
+class GridBarrierLifetime : public GridBarrier
+{
+protected:
+
+    // Number of bytes backed by d_sync (0 whenever d_sync is not a usable allocation)
+    size_t sync_bytes;
+
+public:
+
+    /**
+     * Constructor.  Starts without any device allocation.
+     */
+    GridBarrierLifetime() : GridBarrier(), sync_bytes(0) {}
+
+
+    /**
+     * Frees the device progress counters (if allocated) and resets the
+     * bookkeeping, so that the next Setup() call allocates afresh.
+     *
+     * \return The status of \p cudaFree, or \p cudaSuccess if nothing was allocated.
+     */
+    cudaError_t HostReset()
+    {
+        cudaError_t retval = cudaSuccess;
+        if (d_sync)
+        {
+            CubDebug(retval = cudaFree(d_sync));
+            d_sync = NULL;
+        }
+        sync_bytes = 0;
+        return retval;
+    }
+
+
+    /**
+     * Destructor.  Releases the device progress counters.
+     */
+    virtual ~GridBarrierLifetime()
+    {
+        HostReset();
+    }
+
+
+    /**
+     * Sets up the progress counters for the next kernel launch (lazily
+     * allocating and initializing them if necessary).
+     *
+     * The existing allocation is reused when it is already large enough for
+     * \p sweep_grid_size flags; otherwise it is released and a larger,
+     * zero-initialized one is created.
+     *
+     * \param sweep_grid_size  Number of thread blocks that will take part in the barrier
+     * \return \p cudaSuccess, or the first CUDA error encountered.  After an
+     *         error no stale allocation is recorded, so a later Setup() call
+     *         starts again from a clean state.
+     */
+    cudaError_t Setup(int sweep_grid_size)
+    {
+        cudaError_t retval = cudaSuccess;
+        do {
+            size_t new_sync_bytes = sweep_grid_size * sizeof(SyncFlag);
+            if (new_sync_bytes > sync_bytes)
+            {
+                // Release the old counters.  HostReset() also clears d_sync and
+                // sync_bytes, so a failure further down can neither leave a
+                // dangling pointer behind (double free in the destructor) nor a
+                // capacity that claims more bytes than are actually usable.
+                if ((retval = HostReset()) != cudaSuccess) break;
+
+                // Allocate and initialize to zero
+                if (CubDebug(retval = cudaMalloc((void**) &d_sync, new_sync_bytes)))
+                {
+                    // Do not rely on cudaMalloc leaving the pointer untouched on failure
+                    d_sync = NULL;
+                    break;
+                }
+                if (CubDebug(retval = cudaMemset(d_sync, 0, new_sync_bytes))) break;
+
+                // Record the capacity only once the counters are allocated and zeroed
+                sync_bytes = new_sync_bytes;
+            }
+        } while (0);
+
+        return retval;
+    }
+};
+
+
+/** @} */       // end group GridModule
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
--- a/lib/kokkos/TPL/cub/grid/grid_even_share.cuh
+++ b/lib/kokkos/TPL/cub/grid/grid_even_share.cuh
@ -0,0 +1,197 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2013, NVIDIA CORPORATION.  All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::GridEvenShare is a descriptor utility for distributing input among CUDA threadblocks in an "even-share" fashion.  Each threadblock gets roughly the same number of fixed-size work units (grains).
+ */
+
+
+#pragma once
+
+#include "../util_namespace.cuh"
+#include "../util_macro.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/**
+ * \addtogroup GridModule
+ * @{
+ */
+
+
+/**
+ * \brief GridEvenShare is a descriptor utility for distributing input among CUDA threadblocks in an "even-share" fashion.  Each threadblock gets roughly the same number of fixed-size work units (grains).
+ *
+ * \par Overview
+ * GridEvenShare indicates which sections of input are to be mapped onto which threadblocks.
+ * Threadblocks may receive one of three different amounts of work: "big", "normal",
+ * and "last".  The "big" workloads are one scheduling grain larger than "normal".  The "last" work unit
+ * for the last threadblock may be partially-full if the input is not an even multiple of
+ * the scheduling grain size.
+ *
+ * \par
+ * Before invoking a child grid, a parent thread will typically construct and initialize an instance of
+ * GridEvenShare using \p GridInit().  The instance can be passed to child threadblocks which can
+ * initialize their per-threadblock offsets using \p BlockInit().
+ *
+ * \tparam SizeT Integer type for array indexing
+ */
+template <typename SizeT>
+class GridEvenShare
+{
+private:
+
+    // Grid-wide distribution parameters, computed by GridInit()
+    SizeT   total_grains;           // Number of scheduling grains covering the input (rounded up)
+    int     big_blocks;             // Number of leading threadblocks that receive one extra grain
+    SizeT   big_share;              // Number of items assigned to a "big" threadblock
+    SizeT   normal_share;           // Number of items assigned to a "normal" threadblock
+    SizeT   normal_base_offset;     // Extra items (one grain per big block) preceding the normal blocks
+
+
+public:
+
+    /// Total number of input items
+    SizeT   num_items;
+
+    /// Grid size in threadblocks
+    int     grid_size;
+
+    /// Offset into input marking the beginning of the owning thread block's segment of input tiles
+    SizeT   block_offset;
+
+    /// Offset into input of marking the end (one-past) of the owning thread block's segment of input tiles
+    SizeT   block_oob;
+
+    /**
+     * \brief Block-based constructor for single-block grids.
+     *
+     * The single block owns the whole range [0, num_items).  The grid-wide
+     * distribution fields are zeroed so that they are never read uninitialized
+     * (e.g., by Print()).
+     */
+    __device__ __forceinline__ GridEvenShare(SizeT num_items) :
+        total_grains(0),
+        big_blocks(0),
+        big_share(0),
+        normal_share(0),
+        normal_base_offset(0),
+        num_items(num_items),
+        grid_size(1),
+        block_offset(0),
+        block_oob(num_items) {}
+
+
+    /**
+     * \brief Default constructor.  Zero-initializes all fields.
+     */
+    __host__ __device__ __forceinline__ GridEvenShare() :
+        total_grains(0),
+        big_blocks(0),
+        big_share(0),
+        normal_share(0),
+        normal_base_offset(0),
+        num_items(0),
+        grid_size(0),
+        block_offset(0),
+        block_oob(0) {}
+
+
+    /**
+     * \brief Initializes the grid-specific members \p num_items and \p grid_size (to be called prior to kernel launch).
+     *
+     * An empty input (\p num_items == 0) or \p max_grid_size == 0 yields
+     * \p grid_size == 0, i.e., there is nothing to launch.
+     */
+    __host__ __device__ __forceinline__ void GridInit(
+        SizeT   num_items,                  ///< Total number of input items
+        int     max_grid_size,              ///< Maximum grid size allowable (actual grid size may be less if not warranted by the number of input items)
+        int     schedule_granularity)       ///< Granularity by which the input can be parcelled into and distributed among threadblocks.  Usually the thread block's native tile size (or a multiple thereof).
+    {
+        this->num_items             = num_items;
+        this->block_offset          = 0;
+        this->block_oob             = 0;
+        this->total_grains          = (num_items + schedule_granularity - 1) / schedule_granularity;
+        this->grid_size             = CUB_MIN(total_grains, max_grid_size);
+
+        // grid_size is zero for empty input; guard the division so that case
+        // produces an empty distribution instead of dividing by zero.
+        SizeT grains_per_block      = (grid_size != 0) ? (total_grains / grid_size) : 0;
+        this->big_blocks            = total_grains - (grains_per_block * grid_size);        // leftover grains go to big blocks
+        this->normal_share          = grains_per_block * schedule_granularity;
+        this->normal_base_offset    = big_blocks * schedule_granularity;
+        this->big_share             = normal_share + schedule_granularity;
+    }
+
+
+    /**
+     * \brief Initializes the threadblock-specific details (e.g., to be called by each threadblock after startup)
+     *
+     * Sets \p block_offset and \p block_oob for the calling threadblock from
+     * \p blockIdx.x and the distribution computed by GridInit().
+     */
+    __device__ __forceinline__ void BlockInit()
+    {
+        if (blockIdx.x < big_blocks)
+        {
+            // This threadblock gets a big share of grains (grains_per_block + 1)
+            block_offset = (blockIdx.x * big_share);
+            block_oob = block_offset + big_share;
+        }
+        else if (blockIdx.x < total_grains)
+        {
+            // This threadblock gets a normal share of grains (grains_per_block)
+            block_offset = normal_base_offset + (blockIdx.x * normal_share);
+            block_oob = block_offset + normal_share;
+        }
+
+        // Last threadblock: clamp to the true end of the input, since the
+        // final grain may be only partially full
+        if (blockIdx.x == grid_size - 1)
+        {
+            block_oob = num_items;
+        }
+    }
+
+
+    /**
+     * Print to stdout.  In device code the per-threadblock fields are printed as well.
+     */
+    __host__ __device__ __forceinline__ void Print()
+    {
+        printf(
+#ifdef __CUDA_ARCH__
+            "\tthreadblock(%d) "
+            "block_offset(%lu) "
+            "block_oob(%lu) "
+#endif
+            "num_items(%lu)  "
+            "total_grains(%lu)  "
+            "big_blocks(%lu)  "
+            "big_share(%lu)  "
+            "normal_share(%lu)\n",
+#ifdef __CUDA_ARCH__
+                blockIdx.x,
+                (unsigned long) block_offset,
+                (unsigned long) block_oob,
+#endif
+                (unsigned long) num_items,
+                (unsigned long) total_grains,
+                (unsigned long) big_blocks,
+                (unsigned long) big_share,
+                (unsigned long) normal_share);
+    }
+};
+
+
+
+/** @} */       // end group GridModule
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
--- a/lib/kokkos/TPL/cub/grid/grid_mapping.cuh
+++ b/lib/kokkos/TPL/cub/grid/grid_mapping.cuh
@ -0,0 +1,95 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2013, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::GridMappingStrategy enumerates alternative strategies for mapping constant-sized tiles of device-wide data onto a grid of CUDA thread blocks.
+ */
+
+#pragma once
+
+#include "../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/**
+ * \addtogroup GridModule
+ * @{
+ */
+
+
+/******************************************************************************
+ * Mapping policies
+ *****************************************************************************/
+
+
+/**
+ * \brief cub::GridMappingStrategy enumerates alternative strategies for mapping constant-sized tiles of device-wide data onto a grid of CUDA thread blocks.
+ *
+ * The enumerators carry no explicit values, so they are numbered 0 and 1 in
+ * declaration order.
+ */
+enum GridMappingStrategy
+{
+    /**
+     * \brief An "even-share" strategy for assigning input tiles to thread blocks.
+     *
+     * \par Overview
+     * The input is evenly partitioned into \p p segments, where \p p is
+     * constant and corresponds loosely to the number of thread blocks that may
+     * actively reside on the target device. Each segment consists of
+     * consecutive tiles, where a tile is a small, constant-sized unit of input
+     * to be processed to completion before the thread block terminates or
+     * obtains more work.  The kernel invokes \p p thread blocks, each
+     * of which iteratively consumes a segment of <em>n</em>/<em>p</em> elements
+     * in tile-size increments.
+     */
+    GRID_MAPPING_EVEN_SHARE,
+
+    /**
+     * \brief A dynamic "queue-based" strategy for assigning input tiles to thread blocks.
+     *
+     * \par Overview
+     * The input is treated as a queue to be dynamically consumed by a grid of
+     * thread blocks.  Work is atomically dequeued in tiles, where a tile is a
+     * unit of input to be processed to completion before the thread block
+     * terminates or obtains more work.  The grid size \p p is constant,
+     * loosely corresponding to the number of thread blocks that may actively
+     * reside on the target device.
+     */
+    GRID_MAPPING_DYNAMIC,
+};
+
+
+/** @} */       // end group GridModule
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
--- a/lib/kokkos/TPL/cub/grid/grid_queue.cuh
+++ b/lib/kokkos/TPL/cub/grid/grid_queue.cuh
@ -0,0 +1,207 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2013, NVIDIA CORPORATION.  All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::GridQueue is a descriptor utility for dynamic queue management.
+ */
+
+#pragma once
+
+#include "../util_namespace.cuh"
+#include "../util_debug.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/**
+ * \addtogroup GridModule
+ * @{
+ */
+
+
+/**
+ * \brief GridQueue is a descriptor utility for dynamic queue management.
+ *
+ * \par Overview
+ * GridQueue descriptors provide abstractions for "filling" or
+ * "draining" globally-shared vectors.
+ *
+ * \par
+ * A "filling" GridQueue works by atomically-adding to a zero-initialized counter,
+ * returning a unique offset for the calling thread to write its items.
+ * The GridQueue maintains the total "fill-size".  The fill counter must be reset
+ * using GridQueue::ResetFill by the host or kernel instance prior to the kernel instance that
+ * will be filling.
+ *
+ * \par
+ * Similarly, a "draining" GridQueue works by atomically-incrementing a
+ * zero-initialized counter, returning a unique offset for the calling thread to
+ * read its items. Threads can safely drain until the array's logical fill-size is
+ * exceeded.  The drain counter must be reset using GridQueue::ResetDrain or
+ * GridQueue::ResetDrainAfterFill by the host or kernel instance prior to the kernel instance that
+ * will be draining.  (For dynamic work distribution of existing data, the corresponding fill-size
+ * is simply the number of elements in the array.)
+ *
+ * \par
+ * Iterative work management can be implemented simply with a pair of flip-flopping
+ * work buffers, each with an associated set of fill and drain GridQueue descriptors.
+ *
+ * \tparam SizeT Integer type for array indexing
+ */
+template <typename SizeT>
+class GridQueue
+{
+private:
+
+    /// Indices of the two counters inside the d_counters allocation
+    enum
+    {
+        FILL    = 0,
+        DRAIN   = 1,
+    };
+
+    /// Device pointer to the pair of counters (d_counters[FILL], d_counters[DRAIN])
+    SizeT *d_counters;
+
+public:
+
+    /// Returns the device allocation size in bytes needed to construct a GridQueue instance (two SizeT counters)
+    __host__ __device__ __forceinline__
+    static size_t AllocationSize()
+    {
+        return sizeof(SizeT) * 2;
+    }
+
+
+    /// Constructs an invalid GridQueue descriptor around the device storage allocation.  The storage is only aliased (not allocated, initialized, or owned); a Reset method must run before the counters are meaningful.
+    __host__ __device__ __forceinline__ GridQueue(
+        void *d_storage)                    ///< Device allocation to back the GridQueue.  Must be at least as big as <tt>AllocationSize()</tt>.
+    :
+        d_counters((SizeT*) d_storage)
+    {}
+
+
+    /**
+     * \brief Resets the drain counter so that it may advance to meet the existing fill-size.
+     *
+     * To be called by the host or by a kernel prior to that which will be
+     * draining.  The fill counter is left untouched.  Device callers write the
+     * counter directly; host callers enqueue an asynchronous memset on \p stream.
+     *
+     * \return cudaSuccess on the device path; the (CubDebug-wrapped) result of cudaMemsetAsync on the host path.
+     */
+    __host__ __device__ __forceinline__ cudaError_t ResetDrainAfterFill(cudaStream_t stream = 0)
+    {
+#ifdef __CUDA_ARCH__
+        d_counters[DRAIN] = 0;
+        return cudaSuccess;
+#else
+        // Touch only the drain counter: routing through ResetDrain(0, stream)
+        // would also overwrite the fill-size this method has to preserve.
+        return CubDebug(cudaMemsetAsync(d_counters + DRAIN, 0, sizeof(SizeT), stream));
+#endif
+    }
+
+    /**
+     * \brief Sets the fill-size and resets the drain counter, preparing the GridQueue for draining in the next kernel instance.
+     *
+     * To be called by the host or by a kernel prior to that which will be
+     * draining.  Host callers copy both counters to the device with a single
+     * cudaMemcpyAsync on \p stream.
+     *
+     * \return cudaSuccess on the device path; the (CubDebug-wrapped) result of cudaMemcpyAsync on the host path.
+     */
+    __host__ __device__ __forceinline__ cudaError_t ResetDrain(
+        SizeT fill_size,
+        cudaStream_t stream = 0)
+    {
+#ifdef __CUDA_ARCH__
+        d_counters[FILL] = fill_size;
+        d_counters[DRAIN] = 0;
+        return cudaSuccess;
+#else
+        SizeT counters[2];
+        counters[FILL] = fill_size;
+        counters[DRAIN] = 0;
+        return CubDebug(cudaMemcpyAsync(d_counters, counters, sizeof(SizeT) * 2, cudaMemcpyHostToDevice, stream));
+#endif
+    }
+
+
+    /**
+     * \brief Resets the fill counter to zero.
+     *
+     * To be called by the host or by a kernel prior to that which will be
+     * filling.  Host callers enqueue an asynchronous memset on \p stream
+     * (the default stream when omitted).
+     *
+     * \return cudaSuccess on the device path; the (CubDebug-wrapped) result of cudaMemsetAsync on the host path.
+     */
+    __host__ __device__ __forceinline__ cudaError_t ResetFill(cudaStream_t stream = 0)
+    {
+#ifdef __CUDA_ARCH__
+        d_counters[FILL] = 0;
+        return cudaSuccess;
+#else
+        return CubDebug(cudaMemsetAsync(d_counters + FILL, 0, sizeof(SizeT), stream));
+#endif
+    }
+
+
+    /**
+     * \brief Retrieves the fill-size established by the parent or by the previous kernel.
+     *
+     * Device callers read the counter directly.  Host callers issue a
+     * cudaMemcpyAsync into \p fill_size on \p stream.
+     * NOTE(review): if \p fill_size lives in page-locked host memory the copy
+     * may still be in flight on return -- synchronize \p stream before reading it.
+     *
+     * \return cudaSuccess on the device path; the (CubDebug-wrapped) result of cudaMemcpyAsync on the host path.
+     */
+    __host__ __device__ __forceinline__ cudaError_t FillSize(
+        SizeT &fill_size,
+        cudaStream_t stream = 0)
+    {
+#ifdef __CUDA_ARCH__
+        fill_size = d_counters[FILL];
+        return cudaSuccess;     // every path of a non-void function must return a value
+#else
+        return CubDebug(cudaMemcpyAsync(&fill_size, d_counters + FILL, sizeof(SizeT), cudaMemcpyDeviceToHost, stream));
+#endif
+    }
+
+
+    /// Drain num_items.  Atomically advances the drain counter and returns its previous value, i.e. the offset from which to read items.
+    __device__ __forceinline__ SizeT Drain(SizeT num_items)
+    {
+        return atomicAdd(d_counters + DRAIN, num_items);
+    }
+
+
+    /// Fill num_items.  Atomically advances the fill counter and returns its previous value, i.e. the offset from which to write items.
+    __device__ __forceinline__ SizeT Fill(SizeT num_items)
+    {
+        return atomicAdd(d_counters + FILL, num_items);
+    }
+};
+
+
+#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
+
+
+/**
+ * Kernel wrapper that prepares a grid queue for draining by calling
+ * GridQueue::ResetDrain with the given fill-size.
+ *
+ * Launch with 1 block of 1 thread: every launched thread performs the same
+ * pair of counter writes.
+ */
+template <typename SizeT>
+__global__ void ResetDrainKernel(
+    GridQueue<SizeT>    grid_queue,
+    SizeT               num_items)
+{
+    // The returned status is not used by the kernel
+    (void) grid_queue.ResetDrain(num_items);
+}
+
+
+
+#endif // DOXYGEN_SHOULD_SKIP_THIS
+
+
+/** @} */       // end group GridModule
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
+
--- a/lib/kokkos/TPL/cub/host/spinlock.cuh
+++ b/lib/kokkos/TPL/cub/host/spinlock.cuh
@ -0,0 +1,123 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2013, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * Simple x86/x64 atomic spinlock, portable across MS Windows (cl.exe) & Linux (g++)
+ */
+
+
+#pragma once
+
+#if defined(_WIN32) || defined(_WIN64)
+    #include <intrin.h>
+    #include <windows.h>
+    #undef small            // Windows is terrible for polluting macro namespace
+
+    /**
+     * Compiler read/write barrier
+     */
+    #pragma intrinsic(_ReadWriteBarrier)
+
+#endif
+
+#include "../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+#if defined(_MSC_VER)
+
+    // Microsoft VC++
+    typedef long Spinlock;
+
+#else
+
+    // GNU g++
+    typedef int Spinlock;
+
+    /**
+     * Read/write barrier for non-MSVC compilers.
+     *
+     * Mirrors the name of the MSVC intrinsic used on the other branch of this
+     * conditional, but is implemented with the GCC builtin
+     * __sync_synchronize(), which is documented as a full memory barrier
+     * rather than a compiler-only one.
+     */
+    __forceinline__ void _ReadWriteBarrier()
+    {
+        __sync_synchronize();
+    }
+
+    /**
+     * Atomic exchange for non-MSVC compilers.
+     *
+     * Stores \p Value into \p *Target and returns the value that was there
+     * before (widened from int to the long return type).
+     */
+    __forceinline__ long _InterlockedExchange(volatile int * const Target, const int Value)
+    {
+        // __sync_lock_test_and_set alone is only an acquire barrier, so a
+        // full barrier is issued first
+        _ReadWriteBarrier();
+
+        const int previous = __sync_lock_test_and_set(Target, Value);
+        return previous;
+    }
+
+    /**
+     * Pause instruction to prevent excess processor bus usage while spinning.
+     *
+     * "pause" is an x86 instruction, so it is only emitted when compiling for
+     * x86/x86-64.  On every other architecture (ARM, AArch64, POWER, ...)
+     * this function is a no-op, which keeps the header compilable there.
+     */
+    __forceinline__ void YieldProcessor()
+    {
+#if defined(__i386__) || defined(__x86_64__)
+        asm volatile("pause\n": : :"memory");
+#endif  // x86 / x86-64
+    }
+
+#endif  // defined(_MSC_VER)
+
+/**
+ * Return when the specified spinlock has been acquired.
+ *
+ * The lock is taken by atomically exchanging 1 into \p *lock; a previous
+ * value of zero means the caller now owns it.
+ */
+__forceinline__ void Lock(volatile Spinlock *lock)
+{
+    // Non-zero previous value: somebody else holds the lock
+    while (_InterlockedExchange(lock, 1))
+    {
+        // Wait on plain reads until the lock looks free, then retry the exchange
+        while (*lock) YieldProcessor();
+    }
+}
+
+
+/**
+ * Release the specified spinlock.
+ *
+ * Writes zero to \p *lock, the value Lock() treats as "free".
+ */
+__forceinline__ void Unlock(volatile Spinlock *lock)
+{
+    // The barrier is issued before the releasing store so that accesses made
+    // while holding the lock are ordered ahead of it
+    _ReadWriteBarrier();
+    *lock = 0;
+}
+
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
--- a/lib/kokkos/TPL/cub/thread/thread_load.cuh
+++ b/lib/kokkos/TPL/cub/thread/thread_load.cuh
@ -0,0 +1,429 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2013, NVIDIA CORPORATION.  All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * Thread utilities for reading memory using PTX cache modifiers.
+ */
+
+#pragma once
+
+#include <cuda.h>
+
+#include <iterator>
+
+#include "../util_ptx.cuh"
+#include "../util_type.cuh"
+#include "../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+/**
+ * \addtogroup IoModule
+ * @{
+ */
+
+//-----------------------------------------------------------------------------
+// Tags and constants
+//-----------------------------------------------------------------------------
+
+/**
+ * \brief Enumeration of PTX cache-modifiers for memory load operations.
+ *
+ * Used as the first template argument of cub::ThreadLoad.
+ */
+enum PtxLoadModifier
+{
+    LOAD_DEFAULT,       ///< Default (no modifier): plain dereference
+    LOAD_CA,            ///< Cache at all levels
+    LOAD_CG,            ///< Cache at global level
+    LOAD_CS,            ///< Cache streaming (likely to be accessed once)
+    LOAD_CV,            ///< Cache as volatile (including cached system lines)
+    LOAD_LDG,           ///< Cache as texture ("ld.global.nc" when CUB_PTX_ARCH >= 350)
+    LOAD_VOLATILE,      ///< Volatile (any memory space)
+};
+
+
+/**
+ * \name Simple I/O
+ * @{
+ */
+
+/**
+ * \brief Thread utility for reading memory using cub::PtxLoadModifier cache modifiers.
+ *
+ * Cache modifiers will only be effected for built-in types (i.e., C++
+ * primitives and CUDA vector-types).
+ *
+ * For example:
+ * \par
+ * \code
+ * #include <cub/cub.cuh>
+ *
+ * // 32-bit load using cache-at-all-levels modifier:
+ * int *d_in;
+ * int val = cub::ThreadLoad<cub::LOAD_CA>(d_in + threadIdx.x);
+ *
+ * // 16-bit load using default modifier
+ * short *d_in;
+ * short val = cub::ThreadLoad<cub::LOAD_DEFAULT>(d_in + threadIdx.x);
+ *
+ * // 256-bit load using cache-volatile modifier
+ * double4 *d_in;
+ * double4 val = cub::ThreadLoad<cub::LOAD_CV>(d_in + threadIdx.x);
+ *
+ * // Load of a user-defined struct using default cache modifier (ignoring LOAD_CS)
+ * struct TestFoo { bool a; short b; };
+ * TestFoo *d_struct;
+ * TestFoo val = cub::ThreadLoad<cub::LOAD_CS>(d_struct + threadIdx.x);
+ * \endcode
+ *
+ * \tparam MODIFIER         cub::PtxLoadModifier selecting the load variant
+ * \tparam InputIteratorRA  Pointer or iterator type of \p itr; the return type is its std::iterator_traits value_type
+ */
+template <
+    PtxLoadModifier MODIFIER,
+    typename InputIteratorRA>
+__device__ __forceinline__ typename std::iterator_traits<InputIteratorRA>::value_type ThreadLoad(InputIteratorRA itr);
+
+
+//@}  end member group
+
+
+
+#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
+
+
+/**
+ * Define the 16-byte ThreadLoad specializations (int4* and longlong2*) for the
+ * given PTX load modifier.
+ *
+ * \p cub_modifier is the cub::PtxLoadModifier enumerator being specialized;
+ * \p ptx_modifier is stringized and pasted directly after "ld." in the inline
+ * PTX.  Each specialization issues one vector load ("v4.s32" for int4,
+ * "v2.s64" for longlong2).
+ */
+#define CUB_LOAD_16(cub_modifier, ptx_modifier)                                             \
+    template<>                                                                              \
+    __device__ __forceinline__ int4 ThreadLoad<cub_modifier, int4*>(int4* ptr)              \
+    {                                                                                       \
+        int4 retval;                                                                        \
+        asm volatile ("ld."#ptx_modifier".v4.s32 {%0, %1, %2, %3}, [%4];" :                 \
+            "=r"(retval.x),                                                                 \
+            "=r"(retval.y),                                                                 \
+            "=r"(retval.z),                                                                 \
+            "=r"(retval.w) :                                                                \
+            _CUB_ASM_PTR_(ptr));                                                            \
+        return retval;                                                                      \
+    }                                                                                       \
+    template<>                                                                              \
+    __device__ __forceinline__ longlong2 ThreadLoad<cub_modifier, longlong2*>(longlong2* ptr)              \
+    {                                                                                       \
+        longlong2 retval;                                                                   \
+        asm volatile ("ld."#ptx_modifier".v2.s64 {%0, %1}, [%2];" :                         \
+            "=l"(retval.x),                                                                 \
+            "=l"(retval.y) :                                                                \
+            _CUB_ASM_PTR_(ptr));                                                            \
+        return retval;                                                                      \
+    }
+
+/**
+ * Define the 8-byte ThreadLoad specializations (short4*, int2* and long long*)
+ * for the given PTX load modifier.
+ *
+ * \p cub_modifier is the cub::PtxLoadModifier enumerator being specialized;
+ * \p ptx_modifier is stringized and pasted directly after "ld." in the inline
+ * PTX.  The loads emitted are "v4.s16", "v2.s32" and scalar "s64" respectively.
+ */
+#define CUB_LOAD_8(cub_modifier, ptx_modifier)                                              \
+    template<>                                                                              \
+    __device__ __forceinline__ short4 ThreadLoad<cub_modifier, short4*>(short4* ptr)        \
+    {                                                                                       \
+        short4 retval;                                                                      \
+        asm volatile ("ld."#ptx_modifier".v4.s16 {%0, %1, %2, %3}, [%4];" :                 \
+            "=h"(retval.x),                                                                 \
+            "=h"(retval.y),                                                                 \
+            "=h"(retval.z),                                                                 \
+            "=h"(retval.w) :                                                                \
+            _CUB_ASM_PTR_(ptr));                                                            \
+        return retval;                                                                      \
+    }                                                                                       \
+    template<>                                                                              \
+    __device__ __forceinline__ int2 ThreadLoad<cub_modifier, int2*>(int2* ptr)              \
+    {                                                                                       \
+        int2 retval;                                                                        \
+        asm volatile ("ld."#ptx_modifier".v2.s32 {%0, %1}, [%2];" :                         \
+            "=r"(retval.x),                                                                 \
+            "=r"(retval.y) :                                                                \
+            _CUB_ASM_PTR_(ptr));                                                            \
+        return retval;                                                                      \
+    }                                                                                       \
+    template<>                                                                              \
+    __device__ __forceinline__ long long ThreadLoad<cub_modifier, long long*>(long long* ptr)                 \
+    {                                                                                       \
+        long long retval;                                                                   \
+        asm volatile ("ld."#ptx_modifier".s64 %0, [%1];" :                                  \
+            "=l"(retval) :                                                                  \
+            _CUB_ASM_PTR_(ptr));                                                            \
+        return retval;                                                                      \
+    }
+
+/**
+ * Define the int (4B) ThreadLoad specialization for the given PTX load modifier.
+ *
+ * \p cub_modifier is the cub::PtxLoadModifier enumerator being specialized;
+ * \p ptx_modifier is stringized and pasted directly after "ld." in the inline
+ * PTX, which performs a single "s32" load.
+ */
+#define CUB_LOAD_4(cub_modifier, ptx_modifier)                                              \
+    template<>                                                                              \
+    __device__ __forceinline__ int ThreadLoad<cub_modifier, int*>(int* ptr)                 \
+    {                                                                                       \
+        int retval;                                                                         \
+        asm volatile ("ld."#ptx_modifier".s32 %0, [%1];" :                                  \
+            "=r"(retval) :                                                                  \
+            _CUB_ASM_PTR_(ptr));                                                            \
+        return retval;                                                                      \
+    }
+
+
+/**
+ * Define the short (2B) ThreadLoad specialization for the given PTX load modifier.
+ *
+ * \p cub_modifier is the cub::PtxLoadModifier enumerator being specialized;
+ * \p ptx_modifier is stringized and pasted directly after "ld." in the inline
+ * PTX, which performs a single "s16" load.
+ */
+#define CUB_LOAD_2(cub_modifier, ptx_modifier)                                              \
+    template<>                                                                              \
+    __device__ __forceinline__ short ThreadLoad<cub_modifier, short*>(short* ptr)           \
+    {                                                                                       \
+        short retval;                                                                       \
+        asm volatile ("ld."#ptx_modifier".s16 %0, [%1];" :                                  \
+            "=h"(retval) :                                                                  \
+            _CUB_ASM_PTR_(ptr));                                                            \
+        return retval;                                                                      \
+    }
+
+
+/**
+ * Define the char (1B) ThreadLoad specialization for the given PTX load modifier.
+ *
+ * \p cub_modifier is the cub::PtxLoadModifier enumerator being specialized;
+ * \p ptx_modifier is stringized and pasted directly after "ld." in the inline
+ * PTX.  The byte is loaded into a PTX-local ".s8" register, sign-extended to
+ * 16 bits with "cvt.s16.s8", returned through a 16-bit ("h") output operand
+ * and finally cast back to char.
+ * NOTE(review): the detour is presumably needed because inline PTX has no
+ * 8-bit operand constraint -- confirm against the inline PTX documentation.
+ */
+#define CUB_LOAD_1(cub_modifier, ptx_modifier)                                              \
+    template<>                                                                              \
+    __device__ __forceinline__ char ThreadLoad<cub_modifier, char*>(char* ptr)              \
+    {                                                                                       \
+        short retval;                                                                       \
+        asm volatile (                                                                      \
+        "{"                                                                                 \
+        "   .reg .s8 datum;"                                                                \
+        "    ld."#ptx_modifier".s8 datum, [%1];"                                            \
+        "    cvt.s16.s8 %0, datum;"                                                         \
+        "}" :                                                                               \
+            "=h"(retval) :                                                                  \
+            _CUB_ASM_PTR_(ptr));                                                            \
+        return (char) retval;                                                               \
+    }
+
+
+/**
+ * Define powers-of-two ThreadLoad specializations for the given PTX load modifier.
+ *
+ * Expands to CUB_LOAD_16, CUB_LOAD_8, CUB_LOAD_4, CUB_LOAD_2 and CUB_LOAD_1
+ * with the same (cub_modifier, ptx_modifier) pair, i.e. specializations for
+ * int4, longlong2, short4, int2, long long, int, short and char pointers.
+ */
+#define CUB_LOAD_ALL(cub_modifier, ptx_modifier)                                            \
+    CUB_LOAD_16(cub_modifier, ptx_modifier)                                                 \
+    CUB_LOAD_8(cub_modifier, ptx_modifier)                                                  \
+    CUB_LOAD_4(cub_modifier, ptx_modifier)                                                  \
+    CUB_LOAD_2(cub_modifier, ptx_modifier)                                                  \
+    CUB_LOAD_1(cub_modifier, ptx_modifier)                                                  \
+
+
+/**
+ * Define ThreadLoad specializations for the various PTX load modifiers.
+ *
+ * Every modifier that can reach the word-wise generic ThreadLoad overload
+ * needs word specializations on every target architecture: without them the
+ * per-word ThreadLoad<MODIFIER>(word_ptr) call resolves to the primary
+ * template, which dispatches straight back to the generic overload.
+ * Architectures lacking a given cache operator therefore fall back to a
+ * plain (or volatile) "ld.global".
+ */
+#if CUB_PTX_ARCH >= 200
+    CUB_LOAD_ALL(LOAD_CA, ca)
+    CUB_LOAD_ALL(LOAD_CG, cg)
+    CUB_LOAD_ALL(LOAD_CS, cs)
+    CUB_LOAD_ALL(LOAD_CV, cv)
+#else
+    // No cache operators before SM20: LOAD_CA / LOAD_CS degrade to plain global loads
+    CUB_LOAD_ALL(LOAD_CA, global)
+    CUB_LOAD_ALL(LOAD_CS, global)
+    // LOAD_CV on SM10-13 uses "volatile.global" to ensure reads from last level
+    CUB_LOAD_ALL(LOAD_CV, volatile.global)
+#endif
+#if CUB_PTX_ARCH >= 350
+    CUB_LOAD_ALL(LOAD_LDG, global.nc)
+#else
+    // "ld.global.nc" requires SM35+: older targets use a plain global load
+    CUB_LOAD_ALL(LOAD_LDG, global)
+#endif
+
+
+/// Helper structure for templated load iteration (inductive case): loads word COUNT, then recurses on COUNT + 1 until MAX is reached
+template <PtxLoadModifier MODIFIER, int COUNT, int MAX>
+struct IterateThreadLoad
+{
+    template <typename WordT>
+    static __device__ __forceinline__ void Load(WordT *ptr, WordT *vals)
+    {
+        typedef IterateThreadLoad<MODIFIER, COUNT + 1, MAX> Next;
+
+        vals[COUNT] = ThreadLoad<MODIFIER>(ptr + COUNT);
+        Next::Load(ptr, vals);
+    }
+};
+
+/// Helper structure for templated load iteration (termination case): COUNT has reached MAX, so nothing is left to load
+template <PtxLoadModifier MODIFIER, int MAX>
+struct IterateThreadLoad<MODIFIER, MAX, MAX>
+{
+    template <typename WordT>
+    static __device__ __forceinline__ void Load(WordT * /*ptr*/, WordT * /*vals*/)
+    {
+    }
+};
+
+
+
+/**
+ * Load with LOAD_DEFAULT on iterator types.
+ *
+ * Selected by tag dispatch (LOAD_DEFAULT, not a pointer); simply
+ * dereferences the iterator.
+ */
+template <typename InputIteratorRA>
+__device__ __forceinline__ typename std::iterator_traits<InputIteratorRA>::value_type ThreadLoad(
+    InputIteratorRA         itr,
+    Int2Type<LOAD_DEFAULT>  modifier,
+    Int2Type<false>         is_pointer)
+{
+    typedef typename std::iterator_traits<InputIteratorRA>::value_type ValueT;
+
+    ValueT item = *itr;
+    return item;
+}
+
+
+/**
+ * Load with LOAD_DEFAULT on pointer types.
+ *
+ * Selected by tag dispatch (LOAD_DEFAULT, raw pointer); simply dereferences
+ * the pointer.
+ */
+template <typename T>
+__device__ __forceinline__ T ThreadLoad(
+    T                       *ptr,
+    Int2Type<LOAD_DEFAULT>  modifier,
+    Int2Type<true>          is_pointer)
+{
+    T item = *ptr;
+    return item;
+}
+
+
+/**
+ * Load with LOAD_VOLATILE on primitive pointer types.
+ *
+ * Reads the value through a volatile-qualified alias of \p ptr.  When
+ * CUB_PTX_ARCH <= 130, one-byte loads are followed by __threadfence_block().
+ */
+template <typename T>
+__device__ __forceinline__ T ThreadLoadVolatile(
+    T                       *ptr,
+    Int2Type<true>          is_primitive)
+{
+    volatile T *volatile_ptr = reinterpret_cast<volatile T*>(ptr);
+    T loaded = *volatile_ptr;
+
+#if (CUB_PTX_ARCH <= 130)
+    if (sizeof(T) == 1) __threadfence_block();
+#endif
+
+    return loaded;
+}
+
+
+/**
+ * Load with LOAD_VOLATILE on non-primitive pointer types.
+ *
+ * Copies the object word by word through a volatile-qualified alias of
+ * \p ptr into an uninitialized word buffer, then reinterprets that buffer as
+ * a T.  The word count is sizeof(T) / sizeof(VolatileWord) (integer division).
+ */
+template <typename T>
+__device__ __forceinline__ T ThreadLoadVolatile(
+    T                       *ptr,
+    Int2Type<false>          is_primitive)
+{
+    typedef WordAlignment<T>                                    Alignment;
+    typedef typename Alignment::VolatileWord                    VolatileWord;       // Word type for memcopying
+    typedef typename Alignment::UninitializedVolatileWords      WordBuffer;
+
+    enum { NUM_WORDS = sizeof(T) / sizeof(VolatileWord) };
+
+    // Destination: array of uninitialized words
+    WordBuffer words;
+
+    // Source: the object viewed as volatile words
+    volatile VolatileWord *src = reinterpret_cast<volatile VolatileWord*>(ptr);
+
+    #pragma unroll
+    for (int word = 0; word < NUM_WORDS; ++word)
+        words.buf[word] = src[word];
+
+    // Reassemble the object from its words
+    return *reinterpret_cast<T*>(words.buf);
+}
+
+
+/**
+ * Load with LOAD_VOLATILE on pointer types.
+ *
+ * Forwards to the ThreadLoadVolatile overload chosen by whether
+ * Traits<T>::PRIMITIVE is set.
+ */
+template <typename T>
+__device__ __forceinline__ T ThreadLoad(
+    T                       *ptr,
+    Int2Type<LOAD_VOLATILE> modifier,
+    Int2Type<true>          is_pointer)
+{
+    typedef Int2Type<Traits<T>::PRIMITIVE> PrimitiveTag;
+
+    return ThreadLoadVolatile(ptr, PrimitiveTag());
+}
+
+
+#if (CUB_PTX_ARCH <= 130)
+
+/**
+ * Load with LOAD_CG on pointer types, pre-SM20 only.
+ *
+ * Redirects to LOAD_CV to ensure coherent reads when this PTX is run on
+ * newer architectures with L1.
+ */
+template <typename T>
+__device__ __forceinline__ T ThreadLoad(
+    T                       *ptr,
+    Int2Type<LOAD_CG>       modifier,
+    Int2Type<true>          is_pointer)
+{
+    T item = ThreadLoad<LOAD_CV>(ptr);
+    return item;
+}
+
+#endif  // (CUB_PTX_ARCH <= 130)
+
+
+/**
+ * Load with arbitrary MODIFIER on pointer types.
+ *
+ * Views the object as an array of WordAlignment<T>::DeviceWord, loads each
+ * word with ThreadLoad<MODIFIER> via IterateThreadLoad, and reinterprets the
+ * resulting word buffer as a T.  The word count is
+ * sizeof(T) / sizeof(DeviceWord) (integer division).
+ */
+template <typename T, int MODIFIER>
+__device__ __forceinline__ T ThreadLoad(
+    T                       *ptr,
+    Int2Type<MODIFIER>      modifier,
+    Int2Type<true>          is_pointer)
+{
+    typedef WordAlignment<T>                                    Alignment;
+    typedef typename Alignment::DeviceWord                      DeviceWord;
+    typedef typename Alignment::UninitializedDeviceWords        WordBuffer;
+
+    enum { NUM_WORDS = sizeof(T) / sizeof(DeviceWord) };
+
+    typedef IterateThreadLoad<PtxLoadModifier(MODIFIER), 0, NUM_WORDS> WordLoader;
+
+    // Destination: array of uninitialized words
+    WordBuffer words;
+
+    // Source: the object viewed as device words
+    DeviceWord *src = reinterpret_cast<DeviceWord*>(ptr);
+
+    WordLoader::Load(src, words.buf);
+
+    // Reassemble the object from its words
+    return *reinterpret_cast<T*>(words.buf);
+}
+
+
+/**
+ * Generic ThreadLoad definition.
+ *
+ * Dispatches to one of the three-argument ThreadLoad overloads using two
+ * tags: the requested modifier and whether InputIteratorRA is a raw pointer.
+ */
+template <
+    PtxLoadModifier MODIFIER,
+    typename InputIteratorRA>
+__device__ __forceinline__ typename std::iterator_traits<InputIteratorRA>::value_type ThreadLoad(InputIteratorRA itr)
+{
+    typedef Int2Type<MODIFIER>                              ModifierTag;
+    typedef Int2Type<IsPointer<InputIteratorRA>::VALUE>     PointerTag;
+
+    return ThreadLoad(itr, ModifierTag(), PointerTag());
+}
+
+
+
+#endif // DOXYGEN_SHOULD_SKIP_THIS
+
+
+/** @} */       // end group IoModule
+
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
--- a/lib/kokkos/TPL/cub/thread/thread_operators.cuh
+++ b/lib/kokkos/TPL/cub/thread/thread_operators.cuh
@ -0,0 +1,145 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2013, NVIDIA CORPORATION.  All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * Simple binary operator functor types
+ */
+
+/******************************************************************************
+ * Simple functor operators
+ ******************************************************************************/
+
+#pragma once
+
+#include "../util_macro.cuh"
+#include "../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/**
+ * \addtogroup ThreadModule
+ * @{
+ */
+
+/**
+ * \brief Default equality functor
+ */
+struct Equality
+{
+    /**
+     * Boolean equality operator, returns <tt>(a == b)</tt>.
+     *
+     * Declared \c const so the functor can also be invoked through a const
+     * object or a const reference.
+     */
+    template <typename T>
+    __host__ __device__ __forceinline__ bool operator()(const T &a, const T &b) const
+    {
+        return a == b;
+    }
+};
+
+
+/**
+ * \brief Default inequality functor
+ */
+struct Inequality
+{
+    /**
+     * Boolean inequality operator, returns <tt>(a != b)</tt>.
+     *
+     * Declared \c const so the functor can also be invoked through a const
+     * object or a const reference.
+     */
+    template <typename T>
+    __host__ __device__ __forceinline__ bool operator()(const T &a, const T &b) const
+    {
+        return a != b;
+    }
+};
+
+
+/**
+ * \brief Default sum functor
+ */
+struct Sum
+{
+    /**
+     * Sum operator, returns <tt>a + b</tt>.
+     *
+     * Declared \c const so the functor can also be invoked through a const
+     * object or a const reference.
+     */
+    template <typename T>
+    __host__ __device__ __forceinline__ T operator()(const T &a, const T &b) const
+    {
+        return a + b;
+    }
+};
+
+
+/**
+ * \brief Default max functor
+ */
+struct Max
+{
+    /**
+     * Max operator, returns <tt>(a > b) ? a : b</tt>.
+     *
+     * Declared \c const so the functor can also be invoked through a const
+     * object or a const reference.
+     */
+    template <typename T>
+    __host__ __device__ __forceinline__ T operator()(const T &a, const T &b) const
+    {
+        return CUB_MAX(a, b);
+    }
+};
+
+
+/**
+ * \brief Default min functor
+ */
+struct Min
+{
+    /**
+     * Min operator, returns <tt>(a < b) ? a : b</tt>.
+     *
+     * Declared \c const so the functor can also be invoked through a const
+     * object or a const reference.
+     */
+    template <typename T>
+    __host__ __device__ __forceinline__ T operator()(const T &a, const T &b) const
+    {
+        return CUB_MIN(a, b);
+    }
+};
+
+
+/**
+ * \brief Default cast functor
+ */
+template <typename B>
+struct Cast
+{
+    /**
+     * Cast operator, returns <tt>(B) a</tt>.
+     *
+     * Declared \c const so the functor can also be invoked through a const
+     * object or a const reference.
+     */
+    template <typename A>
+    __host__ __device__ __forceinline__ B operator()(const A &a) const
+    {
+        return (B) a;
+    }
+};
+
+
+
+/** @} */       // end group ThreadModule
+
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
--- a/lib/kokkos/TPL/cub/thread/thread_reduce.cuh
+++ b/lib/kokkos/TPL/cub/thread/thread_reduce.cuh
@ -0,0 +1,145 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2013, NVIDIA CORPORATION.  All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * Thread utilities for sequential reduction over statically-sized array types
+ */
+
+#pragma once
+
+#include "../thread/thread_operators.cuh"
+#include "../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+/**
+ * \addtogroup ThreadModule
+ * @{
+ */
+
+/**
+ * \name Sequential reduction over statically-sized array types
+ * @{
+ */
+
+/**
+ * \brief Sequentially reduce \p LENGTH elements of the \p input array, starting from the seed value \p prefix.  The aggregate is returned.
+ *
+ * \tparam LENGTH        Number of elements of \p input to reduce
+ * \tparam T             <b>[inferred]</b> The data type to be reduced.
+ * \tparam ReductionOp   <b>[inferred]</b> Binary reduction operator type having member <tt>T operator()(const T &a, const T &b)</tt>
+ */
+template <
+    int         LENGTH,
+    typename    T,
+    typename    ReductionOp>
+__device__ __forceinline__ T ThreadReduce(
+    T*          input,                  ///< [in] Input array
+    ReductionOp reduction_op,           ///< [in] Binary reduction operator
+    T           prefix)                 ///< [in] Prefix to seed reduction with
+{
+    // \p prefix doubles as the running aggregate; when LENGTH is not positive
+    // the loop body never runs and the seed is returned unchanged.
+    #pragma unroll
+    for (int item = 0; item < LENGTH; ++item)
+        prefix = reduction_op(prefix, input[item]);
+
+    return prefix;
+}
+
+
+/**
+ * \brief Sequentially reduce \p LENGTH elements of the \p input array without an external seed.  The aggregate is returned.
+ *
+ * The first element acts as the seed and the remaining <tt>LENGTH - 1</tt>
+ * elements are folded in by the seeded pointer overload.
+ *
+ * \tparam LENGTH        Length of input array
+ * \tparam T             <b>[inferred]</b> The data type to be reduced.
+ * \tparam ReductionOp   <b>[inferred]</b> Binary reduction operator type having member <tt>T operator()(const T &a, const T &b)</tt>
+ */
+template <
+    int         LENGTH,
+    typename    T,
+    typename    ReductionOp>
+__device__ __forceinline__ T ThreadReduce(
+    T*          input,                  ///< [in] Input array
+    ReductionOp reduction_op)           ///< [in] Binary reduction operator
+{
+    // Peel off element 0 as the seed, then reduce what is left
+    T seed          = input[0];
+    T *remaining    = input + 1;
+
+    return ThreadReduce<LENGTH - 1>(remaining, reduction_op, seed);
+}
+
+
+/**
+ * \brief Perform a sequential reduction over the statically-sized \p input array, seeded with the specified \p prefix.  The aggregate is returned.
+ *
+ * Thin wrapper that forwards to the pointer-based overload with the same
+ * \p LENGTH.
+ *
+ * \tparam LENGTH        <b>[inferred]</b> Length of \p input array
+ * \tparam T             <b>[inferred]</b> The data type to be reduced.
+ * \tparam ReductionOp   <b>[inferred]</b> Binary reduction operator type having member <tt>T operator()(const T &a, const T &b)</tt>
+ */
+template <
+    int         LENGTH,
+    typename    T,
+    typename    ReductionOp>
+__device__ __forceinline__ T ThreadReduce(
+    T           (&input)[LENGTH],       ///< [in] Input array
+    ReductionOp reduction_op,           ///< [in] Binary reduction operator
+    T           prefix)                 ///< [in] Prefix to seed reduction with
+{
+    // Decay the array explicitly so the call can only bind to the pointer
+    // overload.  Passing the array reference itself would make this very
+    // overload a candidate again (ambiguous call or self-recursion).
+    return ThreadReduce<LENGTH>((T*) input, reduction_op, prefix);
+}
+
+
+/**
+ * \brief Perform a sequential reduction over the statically-sized \p input array.  The aggregate is returned.
+ *
+ * Thin wrapper that forwards to the pointer-based, unseeded overload with the
+ * same \p LENGTH.
+ *
+ * \tparam LENGTH        <b>[inferred]</b> Length of \p input array
+ * \tparam T             <b>[inferred]</b> The data type to be reduced.
+ * \tparam ReductionOp   <b>[inferred]</b> Binary reduction operator type having member <tt>T operator()(const T &a, const T &b)</tt>
+ */
+template <
+    int         LENGTH,
+    typename    T,
+    typename    ReductionOp>
+__device__ __forceinline__ T ThreadReduce(
+    T           (&input)[LENGTH],       ///< [in] Input array
+    ReductionOp reduction_op)           ///< [in] Binary reduction operator
+{
+    // Hand the pointer overload the address of the first element
+    T *first = input;
+    return ThreadReduce<LENGTH>(first, reduction_op);
+}
+
+
+//@}  end member group
+
+/** @} */       // end group ThreadModule
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
--- a/lib/kokkos/TPL/cub/thread/thread_scan.cuh
+++ b/lib/kokkos/TPL/cub/thread/thread_scan.cuh
@ -0,0 +1,231 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2013, NVIDIA CORPORATION.  All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * Thread utilities for sequential prefix scan over statically-sized array types
+ */
+
+#pragma once
+
+#include "../thread/thread_operators.cuh"
+#include "../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+/**
+ * \addtogroup ThreadModule
+ * @{
+ */
+
+/**
+ * \name Sequential prefix scan over statically-sized array types
+ * @{
+ */
+
+/**
+ * \brief Sequential exclusive prefix scan over \p LENGTH elements of the \p input array, seeded with \p prefix.  The aggregate is returned.
+ *
+ * Each input element is read before the output element at the same index is
+ * written, so \p output may alias \p input.
+ *
+ * \tparam LENGTH     Length of \p input and \p output arrays
+ * \tparam T          <b>[inferred]</b> The data type to be scanned.
+ * \tparam ScanOp     <b>[inferred]</b> Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt>
+ */
+template <
+    int         LENGTH,
+    typename    T,
+    typename    ScanOp>
+__device__ __forceinline__ T ThreadScanExclusive(
+    T           *input,                 ///< [in] Input array
+    T           *output,                ///< [out] Output array (may be aliased to \p input)
+    ScanOp      scan_op,                ///< [in] Binary scan operator
+    T           prefix,                 ///< [in] Prefix to seed scan with
+    bool        apply_prefix = true)    ///< [in] Whether or not the calling thread should apply its prefix.  If not, the first output element is undefined.  (Handy for preventing thread-0 from applying a prefix.)
+{
+    // Running total through the current element (seeded only on request)
+    T running = input[0];
+    if (apply_prefix)
+    {
+        running = scan_op(prefix, running);
+    }
+
+    // The first exclusive result is the seed itself
+    output[0] = prefix;
+
+    #pragma unroll
+    for (int idx = 1; idx < LENGTH; ++idx)
+    {
+        // Consume input[idx] before overwriting output[idx] (buffers may alias)
+        T next      = scan_op(running, input[idx]);
+        output[idx] = running;
+        running     = next;
+    }
+
+    return running;
+}
+
+
+/**
+ * \brief Perform a sequential exclusive prefix scan over the statically-sized \p input array, seeded with the specified \p prefix.  The aggregate is returned.
+ *
+ * Thin wrapper that decays both arrays to pointers and forwards every
+ * argument, including \p apply_prefix, to the pointer-based overload.
+ *
+ * \tparam LENGTH     <b>[inferred]</b> Length of \p input and \p output arrays
+ * \tparam T          <b>[inferred]</b> The data type to be scanned.
+ * \tparam ScanOp     <b>[inferred]</b> Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt>
+ */
+template <
+    int         LENGTH,
+    typename    T,
+    typename    ScanOp>
+__device__ __forceinline__ T ThreadScanExclusive(
+    T           (&input)[LENGTH],       ///< [in] Input array
+    T           (&output)[LENGTH],      ///< [out] Output array (may be aliased to \p input)
+    ScanOp      scan_op,                ///< [in] Binary scan operator
+    T           prefix,                 ///< [in] Prefix to seed scan with
+    bool        apply_prefix = true)    ///< [in] Whether or not the calling thread should apply its prefix.  (Handy for preventing thread-0 from applying a prefix.)
+{
+    // apply_prefix must be passed along; otherwise the callee's default of
+    // true silently overrides a caller's request not to apply the prefix.
+    return ThreadScanExclusive<LENGTH>((T*) input, (T*) output, scan_op, prefix, apply_prefix);
+}
+
+
+/**
+ * \brief Perform a sequential inclusive prefix scan over \p LENGTH elements of the \p input array.  The aggregate is returned.
+ *
+ * <tt>output[i]</tt> receives the reduction of <tt>input[0..i]</tt>.  Each
+ * input element is read before the output element at the same index is
+ * written, so \p output may alias \p input.
+ *
+ * \tparam LENGTH     Length of \p input and \p output arrays
+ * \tparam T          <b>[inferred]</b> The data type to be scanned.
+ * \tparam ScanOp     <b>[inferred]</b> Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt>
+ */
+template <
+    int         LENGTH,
+    typename    T,
+    typename    ScanOp>
+__device__ __forceinline__ T ThreadScanInclusive(
+    T           *input,                 ///< [in] Input array
+    T           *output,                ///< [out] Output array (may be aliased to \p input)
+    ScanOp      scan_op)                ///< [in] Binary scan operator
+{
+    // Element 0 is its own inclusive result
+    T inclusive = input[0];
+    output[0] = inclusive;
+
+    // Continue scan from element 1: element 0 has already been accounted for,
+    // so starting at 0 would fold it into the running total a second time.
+    #pragma unroll
+    for (int i = 1; i < LENGTH; ++i)
+    {
+        inclusive = scan_op(inclusive, input[i]);
+        output[i] = inclusive;
+    }
+
+    return inclusive;
+}
+
+
+/**
+ * \brief Perform a sequential inclusive prefix scan over the statically-sized \p input array.  The aggregate is returned.
+ *
+ * Thin wrapper: both arrays are cast to element pointers and the work is done
+ * by the pointer-based ThreadScanInclusive overload with the same \p LENGTH.
+ *
+ * \tparam LENGTH     <b>[inferred]</b> Length of \p input and \p output arrays
+ * \tparam T          <b>[inferred]</b> The data type to be scanned.
+ * \tparam ScanOp     <b>[inferred]</b> Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt>
+ */
+template <
+    int         LENGTH,
+    typename    T,
+    typename    ScanOp>
+__device__ __forceinline__ T ThreadScanInclusive(
+    T           (&input)[LENGTH],       ///< [in] Input array
+    T           (&output)[LENGTH],      ///< [out] Output array (may be aliased to \p input)
+    ScanOp      scan_op)                ///< [in] Binary scan operator
+{
+    // Explicit casts select the pointer overload rather than this one
+    return ThreadScanInclusive<LENGTH>((T*) input, (T*) output, scan_op);
+}
+
+
+/**
+ * \brief Sequential inclusive prefix scan over \p LENGTH elements of the \p input array, seeded with \p prefix.  The aggregate is returned.
+ *
+ * Each input element is read before the output element at the same index is
+ * written, so \p output may alias \p input.
+ *
+ * \tparam LENGTH     Length of \p input and \p output arrays
+ * \tparam T          <b>[inferred]</b> The data type to be scanned.
+ * \tparam ScanOp     <b>[inferred]</b> Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt>
+ */
+template <
+    int         LENGTH,
+    typename    T,
+    typename    ScanOp>
+__device__ __forceinline__ T ThreadScanInclusive(
+    T           *input,                 ///< [in] Input array
+    T           *output,                ///< [out] Output array (may be aliased to \p input)
+    ScanOp      scan_op,                ///< [in] Binary scan operator
+    T           prefix,                 ///< [in] Prefix to seed scan with
+    bool        apply_prefix = true)    ///< [in] Whether or not the calling thread should apply its prefix.  (Handy for preventing thread-0 from applying a prefix.)
+{
+    // Running total; the seed is folded into the first element only on request
+    T running = input[0];
+    if (apply_prefix)
+    {
+        running = scan_op(prefix, running);
+    }
+    output[0] = running;
+
+    // Fold in the remaining elements one at a time
+    #pragma unroll
+    for (int idx = 1; idx < LENGTH; ++idx)
+    {
+        running     = scan_op(running, input[idx]);
+        output[idx] = running;
+    }
+
+    return running;
+}
+
+
+/**
+ * \brief Sequential inclusive prefix scan over the statically-sized \p input array, seeded with \p prefix.  The aggregate is returned.
+ *
+ * Thin wrapper that forwards every argument to the pointer-based overload
+ * with the same \p LENGTH.
+ *
+ * \tparam LENGTH     <b>[inferred]</b> Length of \p input and \p output arrays
+ * \tparam T          <b>[inferred]</b> The data type to be scanned.
+ * \tparam ScanOp     <b>[inferred]</b> Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt>
+ */
+template <
+    int         LENGTH,
+    typename    T,
+    typename    ScanOp>
+__device__ __forceinline__ T ThreadScanInclusive(
+    T           (&input)[LENGTH],       ///< [in] Input array
+    T           (&output)[LENGTH],      ///< [out] Output array (may be aliased to \p input)
+    ScanOp      scan_op,                ///< [in] Binary scan operator
+    T           prefix,                 ///< [in] Prefix to seed scan with
+    bool        apply_prefix = true)    ///< [in] Whether or not the calling thread should apply its prefix.  (Handy for preventing thread-0 from applying a prefix.)
+{
+    // Hand the pointer overload the address of each array's first element
+    T *in_ptr   = input;
+    T *out_ptr  = output;
+
+    return ThreadScanInclusive<LENGTH>(in_ptr, out_ptr, scan_op, prefix, apply_prefix);
+}
+
+
+//@}  end member group
+
+/** @} */       // end group ThreadModule
+
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
--- a/lib/kokkos/TPL/cub/thread/thread_store.cuh
+++ b/lib/kokkos/TPL/cub/thread/thread_store.cuh
@ -0,0 +1,412 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2013, NVIDIA CORPORATION.  All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * Thread utilities for writing memory using PTX cache modifiers.
+ */
+
+#pragma once
+
+#include <cuda.h>
+
+#include "../util_ptx.cuh"
+#include "../util_type.cuh"
+#include "../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+/**
+ * \addtogroup IoModule
+ * @{
+ */
+
+
+//-----------------------------------------------------------------------------
+// Tags and constants
+//-----------------------------------------------------------------------------
+
+/**
+ * \brief Enumeration of PTX cache-modifiers for memory store operations.
+ *
+ * An enumerator is supplied as the \p MODIFIER template argument of
+ * cub::ThreadStore to choose how the store is issued.
+ */
+enum PtxStoreModifier
+{
+    STORE_DEFAULT,              ///< Default (no modifier)
+    STORE_WB,                   ///< Cache write-back all coherent levels
+    STORE_CG,                   ///< Cache at global level
+    STORE_CS,                   ///< Cache streaming (likely to be accessed once)
+    STORE_WT,                   ///< Cache write-through (to system memory)
+    STORE_VOLATILE,             ///< Volatile shared (any memory space)
+};
+
+
+/**
+ * \name Simple I/O
+ * @{
+ */
+
+/**
+ * \brief Thread utility for writing memory using cub::PtxStoreModifier cache modifiers.
+ *
+ * Cache modifiers only take effect for built-in types (i.e., C++
+ * primitives and CUDA vector-types).
+ *
+ * For example:
+ * \par
+ * \code
+ * #include <cub/cub.cuh>
+ *
+ * // 32-bit store using cache-global modifier:
+ * int *d_out;
+ * int val;
+ * cub::ThreadStore<cub::STORE_CG>(d_out + threadIdx.x, val);
+ *
+ * // 16-bit store using default modifier
+ * short *d_out;
+ * short val;
+ * cub::ThreadStore<cub::STORE_DEFAULT>(d_out + threadIdx.x, val);
+ *
+ * // 256-bit store using write-through modifier
+ * double4 *d_out;
+ * double4 val;
+ * cub::ThreadStore<cub::STORE_WT>(d_out + threadIdx.x, val);
+ *
+ * // Store of a user-defined struct using default cache modifier (ignoring STORE_CS)
+ * struct TestFoo { bool a; short b; };
+ * TestFoo *d_struct;
+ * TestFoo val;
+ * cub::ThreadStore<cub::STORE_CS>(d_struct + threadIdx.x, val);
+ * \endcode
+ *
+ * \tparam MODIFIER            Cache modifier to apply to the store
+ * \tparam OutputIteratorRA    <b>[inferred]</b> Type of the destination iterator or pointer
+ * \tparam T                   <b>[inferred]</b> Type of the value being written
+ */
+template <
+    PtxStoreModifier MODIFIER,
+    typename OutputIteratorRA,
+    typename T>
+__device__ __forceinline__ void ThreadStore(OutputIteratorRA itr, T val);
+
+
+//@}  end member group
+
+
+#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
+
+
+/**
+ * Define int4 and longlong2 (16B) ThreadStore specializations for the given PTX store modifier
+ *
+ * \p cub_modifier is the PtxStoreModifier enumerator used as the template
+ * argument; \p ptx_modifier is stringized and spliced into the "st."
+ * instruction mnemonic of the inline assembly.
+ */
+#define CUB_STORE_16(cub_modifier, ptx_modifier)                                            \
+    template<>                                                                              \
+    __device__ __forceinline__ void ThreadStore<cub_modifier, int4*, int4>(int4* ptr, int4 val)              \
+    {                                                                                       \
+        asm volatile ("st."#ptx_modifier".v4.s32 [%0], {%1, %2, %3, %4};" : :               \
+            _CUB_ASM_PTR_(ptr),                                                             \
+            "r"(val.x),                                                                     \
+            "r"(val.y),                                                                     \
+            "r"(val.z),                                                                     \
+            "r"(val.w));                                                                    \
+    }                                                                                       \
+    template<>                                                                              \
+    __device__ __forceinline__ void ThreadStore<cub_modifier, longlong2*, longlong2>(longlong2* ptr, longlong2 val)              \
+    {                                                                                       \
+        asm volatile ("st."#ptx_modifier".v2.s64 [%0], {%1, %2};" : :                       \
+            _CUB_ASM_PTR_(ptr),                                                             \
+            "l"(val.x),                                                                     \
+            "l"(val.y));                                                                    \
+    }
+
+
+/**
+ * Define short4, int2 and long long (8B) ThreadStore specializations for the given PTX store modifier
+ *
+ * \p cub_modifier is the PtxStoreModifier enumerator used as the template
+ * argument; \p ptx_modifier is stringized and spliced into the "st."
+ * instruction mnemonic of the inline assembly.
+ */
+#define CUB_STORE_8(cub_modifier, ptx_modifier)                                             \
+    template<>                                                                              \
+    __device__ __forceinline__ void ThreadStore<cub_modifier, short4*, short4>(short4* ptr, short4 val)              \
+    {                                                                                       \
+        asm volatile ("st."#ptx_modifier".v4.s16 [%0], {%1, %2, %3, %4};" : :               \
+            _CUB_ASM_PTR_(ptr),                                                             \
+            "h"(val.x),                                                                     \
+            "h"(val.y),                                                                     \
+            "h"(val.z),                                                                     \
+            "h"(val.w));                                                                    \
+    }                                                                                       \
+    template<>                                                                              \
+    __device__ __forceinline__ void ThreadStore<cub_modifier, int2*, int2>(int2* ptr, int2 val)              \
+    {                                                                                       \
+        asm volatile ("st."#ptx_modifier".v2.s32 [%0], {%1, %2};" : :                       \
+            _CUB_ASM_PTR_(ptr),                                                             \
+            "r"(val.x),                                                                     \
+            "r"(val.y));                                                                    \
+    }                                                                                       \
+    template<>                                                                              \
+    __device__ __forceinline__ void ThreadStore<cub_modifier, long long*, long long>(long long* ptr, long long val)                 \
+    {                                                                                       \
+        asm volatile ("st."#ptx_modifier".s64 [%0], %1;" : :                                \
+            _CUB_ASM_PTR_(ptr),                                                             \
+            "l"(val));                                                                      \
+    }
+
+/**
+ * Define an int (4B) ThreadStore specialization for the given PTX store modifier
+ *
+ * \p cub_modifier is the PtxStoreModifier enumerator used as the template
+ * argument; \p ptx_modifier is stringized and spliced into the "st."
+ * instruction mnemonic of the inline assembly.
+ */
+#define CUB_STORE_4(cub_modifier, ptx_modifier)                                             \
+    template<>                                                                              \
+    __device__ __forceinline__ void ThreadStore<cub_modifier, int*, int>(int* ptr, int val)                 \
+    {                                                                                       \
+        asm volatile ("st."#ptx_modifier".s32 [%0], %1;" : :                                \
+            _CUB_ASM_PTR_(ptr),                                                             \
+            "r"(val));                                                                      \
+    }
+
+
+/**
+ * Define a short (2B) ThreadStore specialization for the given PTX store modifier
+ *
+ * \p cub_modifier is the PtxStoreModifier enumerator used as the template
+ * argument; \p ptx_modifier is stringized and spliced into the "st."
+ * instruction mnemonic of the inline assembly.
+ */
+#define CUB_STORE_2(cub_modifier, ptx_modifier)                                             \
+    template<>                                                                              \
+    __device__ __forceinline__ void ThreadStore<cub_modifier, short*, short>(short* ptr, short val)           \
+    {                                                                                       \
+        asm volatile ("st."#ptx_modifier".s16 [%0], %1;" : :                                \
+            _CUB_ASM_PTR_(ptr),                                                             \
+            "h"(val));                                                                      \
+    }
+
+
+/**
+ * Define a char (1B) ThreadStore specialization for the given PTX store modifier
+ *
+ * The value is passed to the assembly as a 16-bit operand (\c short(val)),
+ * converted into a scratch .s8 register with cvt.s8.s16, and that register is
+ * what gets stored.  \p cub_modifier is the PtxStoreModifier enumerator used
+ * as the template argument; \p ptx_modifier is stringized and spliced into
+ * the "st." instruction mnemonic.
+ */
+#define CUB_STORE_1(cub_modifier, ptx_modifier)                                             \
+    template<>                                                                              \
+    __device__ __forceinline__ void ThreadStore<cub_modifier, char*, char>(char* ptr, char val)              \
+    {                                                                                       \
+        asm volatile (                                                                      \
+        "{"                                                                                 \
+        "   .reg .s8 datum;"                                                                \
+        "   cvt.s8.s16 datum, %1;"                                                          \
+        "   st."#ptx_modifier".s8 [%0], datum;"                                             \
+        "}" : :                                                                             \
+            _CUB_ASM_PTR_(ptr),                                                             \
+            "h"(short(val)));                                                               \
+    }
+
+/**
+ * Define powers-of-two ThreadStore specializations for the given PTX store modifier
+ *
+ * Expands the 16B, 8B, 4B, 2B and 1B specialization macros in turn with the
+ * same pair of arguments.
+ */
+#define CUB_STORE_ALL(cub_modifier, ptx_modifier)                                           \
+    CUB_STORE_16(cub_modifier, ptx_modifier)                                                \
+    CUB_STORE_8(cub_modifier, ptx_modifier)                                                 \
+    CUB_STORE_4(cub_modifier, ptx_modifier)                                                 \
+    CUB_STORE_2(cub_modifier, ptx_modifier)                                                 \
+    CUB_STORE_1(cub_modifier, ptx_modifier)                                                 \
+
+
+/**
+ * Define ThreadStore specializations for the various PTX store modifiers
+ *
+ * The second argument is spliced verbatim into the "st." mnemonic, so it must
+ * be a cache operator that is valid for PTX stores (wb, cg, cs, wt); "ca" and
+ * "cv" are load-only cache operators.
+ */
+#if CUB_PTX_ARCH >= 200
+    CUB_STORE_ALL(STORE_WB, wb)
+    CUB_STORE_ALL(STORE_CG, cg)
+    CUB_STORE_ALL(STORE_CS, cs)
+    CUB_STORE_ALL(STORE_WT, wt)
+#else
+    // STORE_WT on SM10-13 uses "volatile.global" to ensure writes to last level
+    CUB_STORE_ALL(STORE_WT, volatile.global)
+#endif
+
+
+
+/// Helper structure for templated store iteration (inductive case): stores element COUNT, then recurses with COUNT + 1
+template <PtxStoreModifier MODIFIER, int COUNT, int MAX>
+struct IterateThreadStore
+{
+    template <typename T>
+    static __device__ __forceinline__ void Store(T *ptr, T *vals)
+    {
+        ThreadStore<MODIFIER>(ptr + COUNT, vals[COUNT]);                    // write vals[COUNT] to ptr + COUNT using MODIFIER
+        IterateThreadStore<MODIFIER, COUNT + 1, MAX>::Store(ptr, vals);     // continue with the next index
+    }
+};
+
+/// Helper structure for templated store iteration (termination case): selected when COUNT equals MAX, performs no store
+template <PtxStoreModifier MODIFIER, int MAX>
+struct IterateThreadStore<MODIFIER, MAX, MAX>
+{
+    template <typename T>
+    static __device__ __forceinline__ void Store(T *ptr, T *vals) {}   // intentionally empty: ends the recursion
+};
+
+
+
+
+/**
+ * Store with STORE_DEFAULT on iterator types
+ *
+ * Overload chosen when the modifier tag is Int2Type<STORE_DEFAULT> and the
+ * is_pointer tag is Int2Type<false>.  Writes \p val through \p itr with a
+ * plain assignment.  The two tag parameters exist only for overload
+ * selection; their values are never read.
+ */
+template <typename OutputIteratorRA, typename T>
+__device__ __forceinline__ void ThreadStore(
+    OutputIteratorRA            itr,
+    T                           val,
+    Int2Type<STORE_DEFAULT>     modifier,
+    Int2Type<false>             is_pointer)
+{
+    *itr = val;
+}
+
+
+/**
+ * Store with STORE_DEFAULT on pointer types
+ *
+ * Overload chosen when the modifier tag is Int2Type<STORE_DEFAULT> and the
+ * is_pointer tag is Int2Type<true>.  Writes \p val to \p ptr with a plain
+ * assignment.  The two tag parameters exist only for overload selection;
+ * their values are never read.
+ */
+template <typename T>
+__device__ __forceinline__ void ThreadStore(
+    T                           *ptr,
+    T                           val,
+    Int2Type<STORE_DEFAULT>     modifier,
+    Int2Type<true>              is_pointer)
+{
+    *ptr = val;
+}
+
+
+/**
+ * Store with STORE_VOLATILE on primitive pointer types
+ *
+ * Overload chosen when the is_primitive tag is Int2Type<true>.  The
+ * destination is accessed through a volatile-qualified T pointer, so the
+ * value is written with a single volatile assignment.  The tag parameter is
+ * used only for overload selection.
+ */
+template <typename T>
+__device__ __forceinline__ void ThreadStoreVolatile(
+    T                           *ptr,
+    T                           val,
+    Int2Type<true>              is_primitive)
+{
+    *reinterpret_cast<volatile T*>(ptr) = val;
+}
+
+
+/**
+ * Store with STORE_VOLATILE on non-primitive pointer types
+ *
+ * The value is first copied into a local staging buffer of
+ * WordAlignment<T>::VolatileWord elements and is then written to the
+ * destination one word at a time through a volatile word pointer.  The
+ * is_primitive tag is used only for overload selection.
+ */
+template <typename T>
+__device__ __forceinline__ void ThreadStoreVolatile(
+    T                           *ptr,
+    T                           val,
+    Int2Type<false>             is_primitive)
+{
+    typedef WordAlignment<T>                                Alignment;
+    typedef typename Alignment::VolatileWord                VolatileWord;   // Word type for memcopying
+    typedef typename Alignment::UninitializedVolatileWords  StagingWords;   // Raw word storage for one T
+    enum { NUM_WORDS = sizeof(T) / sizeof(VolatileWord) };
+
+    // Place the value into the (uninitialized) staging words
+    StagingWords staging;
+    *reinterpret_cast<T*>(staging.buf) = val;
+
+    // View the destination as volatile words and copy word by word
+    volatile VolatileWord *dst = reinterpret_cast<volatile VolatileWord*>(ptr);
+
+    #pragma unroll
+    for (int word = 0; word < NUM_WORDS; ++word)
+    {
+        dst[word] = staging.buf[word];
+    }
+}
+
+
+/**
+ * Store with STORE_VOLATILE on pointer types
+ *
+ * Overload chosen when the modifier tag is Int2Type<STORE_VOLATILE> and the
+ * is_pointer tag is Int2Type<true>.  Forwards to the ThreadStoreVolatile
+ * overload selected by Traits<T>::PRIMITIVE.
+ */
+template <typename T>
+__device__ __forceinline__ void ThreadStore(
+    T                           *ptr,
+    T                           val,
+    Int2Type<STORE_VOLATILE>    modifier,
+    Int2Type<true>              is_pointer)
+{
+    ThreadStoreVolatile(ptr, val, Int2Type<Traits<T>::PRIMITIVE>());
+}
+
+
+#if (CUB_PTX_ARCH <= 350)
+
+/**
+ * Store with STORE_CG on pointer types (uses STORE_DEFAULT on current architectures)
+ *
+ * Overload chosen when the modifier tag is Int2Type<STORE_CG> and the
+ * is_pointer tag is Int2Type<true>.  It simply re-dispatches the store as
+ * ThreadStore<STORE_DEFAULT>.  This overload is only compiled when
+ * CUB_PTX_ARCH <= 350 (see the enclosing preprocessor guard).
+ */
+template <typename T>
+__device__ __forceinline__ void ThreadStore(
+    T                           *ptr,
+    T                           val,
+    Int2Type<STORE_CG>          modifier,
+    Int2Type<true>              is_pointer)
+{
+    ThreadStore<STORE_DEFAULT>(ptr, val);
+}
+
+#endif  // (CUB_PTX_ARCH <= 350)
+
+
+/**
+ * Store with arbitrary MODIFIER on pointer types
+ *
+ * The value is copied into a local staging buffer of
+ * WordAlignment<T>::DeviceWord elements, and each word is then written to
+ * the destination with the requested modifier via IterateThreadStore.  The
+ * two tag parameters are used only for overload selection.
+ */
+template <typename T, int MODIFIER>
+__device__ __forceinline__ void ThreadStore(
+    T                           *ptr,
+    T                           val,
+    Int2Type<MODIFIER>          modifier,
+    Int2Type<true>              is_pointer)
+{
+    typedef WordAlignment<T>                                Alignment;
+    typedef typename Alignment::DeviceWord                  DeviceWord;     // Word type for memcopying
+    typedef typename Alignment::UninitializedDeviceWords    StagingWords;   // Raw word storage for one T
+    enum { NUM_WORDS = sizeof(T) / sizeof(DeviceWord) };
+
+    // Place the value into the (uninitialized) staging words
+    StagingWords staging;
+    *reinterpret_cast<T*>(staging.buf) = val;
+
+    // View the destination as device words and store them one by one
+    DeviceWord *dst = reinterpret_cast<DeviceWord*>(ptr);
+    IterateThreadStore<PtxStoreModifier(MODIFIER), 0, NUM_WORDS>::Store(dst, staging.buf);
+}
+
+
+/**
+ * Generic ThreadStore definition
+ *
+ * Entry point taking the store modifier as a template argument.  It
+ * dispatches to one of the tagged ThreadStore overloads by passing
+ * Int2Type<MODIFIER> and Int2Type<IsPointer<OutputIteratorRA>::VALUE> as
+ * extra arguments.
+ */
+template <PtxStoreModifier MODIFIER, typename OutputIteratorRA, typename T>
+__device__ __forceinline__ void ThreadStore(OutputIteratorRA itr, T val)
+{
+    ThreadStore(
+        itr,
+        val,
+        Int2Type<MODIFIER>(),
+        Int2Type<IsPointer<OutputIteratorRA>::VALUE>());
+}
+
+
+
+#endif // DOXYGEN_SHOULD_SKIP_THIS
+
+
+/** @} */       // end group IoModule
+
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
--- a/lib/kokkos/TPL/cub/util_allocator.cuh
+++ b/lib/kokkos/TPL/cub/util_allocator.cuh
@ -0,0 +1,661 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2013, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/******************************************************************************
+ * Simple caching allocator for device memory allocations. The allocator is
+ * thread-safe and capable of managing device allocations on multiple devices.
+ ******************************************************************************/
+
+#pragma once
+
+#ifndef __CUDA_ARCH__
+    #include <set>              // NVCC (EDG, really) takes FOREVER to compile std::map
+    #include <map>
+#endif
+
+#include <math.h>
+
+#include "util_namespace.cuh"
+#include "util_debug.cuh"
+
+#include "host/spinlock.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/**
+ * \addtogroup UtilModule
+ * @{
+ */
+
+
+/******************************************************************************
+ * CachingDeviceAllocator (host use)
+ ******************************************************************************/
+
+/**
+ * \brief A simple caching allocator for device memory allocations.
+ *
+ * \par Overview
+ * The allocator is thread-safe and is capable of managing cached device allocations
+ * on multiple devices.  It behaves as follows:
+ *
+ * \par
+ * - Allocations categorized by bin size.
+ * - Bin sizes progress geometrically in accordance with the growth factor
+ *   \p bin_growth provided during construction.  Unused device allocations within
+ *   a larger bin cache are not reused for allocation requests that categorize to
+ *   smaller bin sizes.
+ * - Allocation requests below (\p bin_growth ^ \p min_bin) are rounded up to
+ *   (\p bin_growth ^ \p min_bin).
+ * - Allocations above (\p bin_growth ^ \p max_bin) are not rounded up to the nearest
+ *   bin and are simply freed when they are deallocated instead of being returned
+ *   to a bin-cache.
+ * - %If the total storage of cached allocations on a given device will exceed
+ *   \p max_cached_bytes, allocations for that device are simply freed when they are
+ *   deallocated instead of being returned to their bin-cache.
+ *
+ * \par
+ * For example, the default-constructed CachingDeviceAllocator is configured with:
+ * - \p bin_growth = 8
+ * - \p min_bin = 3
+ * - \p max_bin = 7
+ * - \p max_cached_bytes = 6MB - 1B
+ *
+ * \par
+ * which delineates five bin-sizes: 512B, 4KB, 32KB, 256KB, and 2MB
+ * and sets a maximum of 6,291,455 cached bytes per device
+ *
+ */
+struct CachingDeviceAllocator
+{
+#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
+
+
+    //---------------------------------------------------------------------
+    // Type definitions and constants
+    //---------------------------------------------------------------------
+
+    enum
+    {
+        /// Invalid device ordinal
+        INVALID_DEVICE_ORDINAL = -1,
+    };
+
+    /**
+     * Integer power for unsigned base and exponent, computed by repeated
+     * squaring.  Arithmetic is plain unsigned int arithmetic, so results
+     * that do not fit wrap around.
+     */
+    static unsigned int IntPow(
+        unsigned int base,
+        unsigned int exp)
+    {
+        unsigned int result = 1;
+
+        // Walk the exponent bit by bit, squaring the base after each bit
+        for (; exp != 0; exp >>= 1, base *= base)
+        {
+            if ((exp & 1u) != 0)
+                result *= base;         // this bit is set: fold the current base into the result
+        }
+
+        return result;
+    }
+
+
+    /**
+     * Round \p value up to the nearest integral power of \p base.
+     *
+     * On return \p rounded_bytes is the smallest power of \p base that is
+     * greater than or equal to \p value, and \p power is the corresponding
+     * exponent (a \p value of 0 or 1 yields power 0 and rounded_bytes 1).
+     *
+     * If no such power can be reached -- \p base is smaller than 2, or the
+     * next power would not fit in size_t -- the search stops instead of
+     * looping forever: \p power is set to (unsigned int) -1 and
+     * \p rounded_bytes is set to \p value itself.
+     */
+    static void NearestPowerOf(
+        unsigned int &power,
+        size_t &rounded_bytes,
+        unsigned int base,
+        size_t value)
+    {
+        power = 0;
+        rounded_bytes = 1;
+
+        while (rounded_bytes < value)
+        {
+            // A base below 2 never grows, and a product beyond the size_t
+            // range would wrap around; in both cases the loop could not end.
+            if ((base < 2) || (rounded_bytes > (~size_t(0)) / base))
+            {
+                power = (unsigned int) -1;
+                rounded_bytes = value;
+                return;
+            }
+
+            rounded_bytes *= base;
+            power++;
+        }
+    }
+
+    /**
+     * Descriptor for device memory allocations: records which device a
+     * block belongs to, its device pointer, its size and its bin.
+     */
+    struct BlockDescriptor
+    {
+        int             device;     // Device ordinal
+        void*           d_ptr;      // Device pointer
+        size_t          bytes;      // Size of allocation in bytes
+        unsigned int    bin;        // Bin enumeration
+
+        // Build a descriptor that identifies a block by pointer (size and bin zeroed)
+        BlockDescriptor(void *d_ptr, int device) :
+            device(device),
+            d_ptr(d_ptr),
+            bytes(0),
+            bin(0)
+        {}
+
+        // Build a descriptor that identifies a block by size and bin (pointer is NULL)
+        BlockDescriptor(size_t bytes, unsigned int bin, int device) :
+            device(device),
+            d_ptr(NULL),
+            bytes(bytes),
+            bin(bin)
+        {}
+
+        // Strict ordering by device ordinal first, then by device pointer
+        static bool PtrCompare(const BlockDescriptor &a, const BlockDescriptor &b)
+        {
+            if (a.device != b.device)
+                return (a.device < b.device);
+
+            return (a.d_ptr < b.d_ptr);
+        }
+
+        // Strict ordering by device ordinal first, then by allocation size
+        static bool SizeCompare(const BlockDescriptor &a, const BlockDescriptor &b)
+        {
+            if (a.device != b.device)
+                return (a.device < b.device);
+
+            return (a.bytes < b.bytes);
+        }
+    };
+
+    /// BlockDescriptor comparator function interface
+    typedef bool (*Compare)(const BlockDescriptor &, const BlockDescriptor &);
+
+#ifndef __CUDA_ARCH__   // Only define STL container members in host code
+
+    /// Set type for cached blocks (ordered by size)
+    typedef std::multiset<BlockDescriptor, Compare> CachedBlocks;
+
+    /// Set type for live blocks (ordered by ptr)
+    typedef std::multiset<BlockDescriptor, Compare> BusyBlocks;
+
+    /// Map type of device ordinals to the number of cached bytes cached by each device
+    typedef std::map<int, size_t> GpuCachedBytes;
+
+#endif // __CUDA_ARCH__
+
+    //---------------------------------------------------------------------
+    // Fields
+    //---------------------------------------------------------------------
+
+    Spinlock        spin_lock;          /// Spinlock for thread-safety
+
+    unsigned int    bin_growth;         /// Geometric growth factor for bin-sizes
+    unsigned int    min_bin;            /// Minimum bin enumeration
+    unsigned int    max_bin;            /// Maximum bin enumeration
+
+    size_t          min_bin_bytes;      /// Minimum bin size
+    size_t          max_bin_bytes;      /// Maximum bin size
+    size_t          max_cached_bytes;   /// Maximum aggregate cached bytes per device
+
+    bool            debug;              /// Whether or not to print (de)allocation events to stdout
+    bool            skip_cleanup;       /// Whether or not to skip a call to FreeAllCached() when destructor is called.  (The CUDA runtime may have already shut down for statically declared allocators)
+
+#ifndef __CUDA_ARCH__   // Only define STL container members in host code
+
+    GpuCachedBytes  cached_bytes;       /// Map of device ordinal to aggregate cached bytes on that device
+    CachedBlocks    cached_blocks;      /// Set of cached device allocations available for reuse
+    BusyBlocks      live_blocks;        /// Set of live device allocations currently in use
+
+#endif // __CUDA_ARCH__
+
+#endif // DOXYGEN_SHOULD_SKIP_THIS
+
+    //---------------------------------------------------------------------
+    // Methods
+    //---------------------------------------------------------------------
+
+    /**
+     * \brief Constructor.
+     *
+     * The minimum and maximum bin sizes are derived as
+     * IntPow(bin_growth, min_bin) and IntPow(bin_growth, max_bin).  Debug
+     * logging starts disabled, and \p skip_cleanup is initialized to false
+     * so that the destructor releases the cached blocks.
+     *
+     * Members are listed in their declaration order, which is the order in
+     * which they are actually initialized.
+     */
+    CachingDeviceAllocator(
+        unsigned int bin_growth,    ///< Geometric growth factor for bin-sizes
+        unsigned int min_bin,       ///< Minimum bin
+        unsigned int max_bin,       ///< Maximum bin
+        size_t max_cached_bytes)    ///< Maximum aggregate cached bytes per device
+    :
+            spin_lock(0),
+            bin_growth(bin_growth),
+            min_bin(min_bin),
+            max_bin(max_bin),
+            min_bin_bytes(IntPow(bin_growth, min_bin)),
+            max_bin_bytes(IntPow(bin_growth, max_bin)),
+            max_cached_bytes(max_cached_bytes),
+            debug(false),
+            skip_cleanup(false)     // previously left uninitialized although the destructor reads it
+    #ifndef __CUDA_ARCH__   // Only define STL container members in host code
+            , cached_blocks(BlockDescriptor::SizeCompare)
+            , live_blocks(BlockDescriptor::PtrCompare)
+    #endif
+    {}
+
+
+    /**
+     * \brief Default constructor.
+     *
+     * Configured with:
+     * \par
+     * - \p bin_growth = 8
+     * - \p min_bin = 3
+     * - \p max_bin = 7
+     * - \p max_cached_bytes = ((\p bin_growth ^ \p max_bin) * 3) - 1 = 6,291,455 bytes
+     *
+     * which delineates five bin-sizes: 512B, 4KB, 32KB, 256KB, and 2MB and
+     * sets a maximum of 6,291,455 cached bytes per device.
+     *
+     * \p skip_cleanup controls whether the destructor skips FreeAllCached().
+     * The initializers below appear in member declaration order.
+     */
+    CachingDeviceAllocator(bool skip_cleanup = false) :
+        spin_lock(0),
+        bin_growth(8),
+        min_bin(3),
+        max_bin(7),
+        min_bin_bytes(IntPow(bin_growth, min_bin)),
+        max_bin_bytes(IntPow(bin_growth, max_bin)),
+        max_cached_bytes((max_bin_bytes * 3) - 1),
+        debug(false),
+        skip_cleanup(skip_cleanup)
+    #ifndef __CUDA_ARCH__   // Only define STL container members in host code
+        , cached_blocks(BlockDescriptor::SizeCompare)
+        , live_blocks(BlockDescriptor::PtrCompare)
+    #endif
+    {}
+
+
+    /**
+     * \brief Sets the limit on the number of bytes this allocator is allowed to cache per device.
+     *
+     * The new limit is stored while holding the spinlock.  Only the limit
+     * itself is updated here; blocks that are already cached are not
+     * touched by this call.  When compiled for device code the function
+     * returns cudaErrorInvalidConfiguration instead; otherwise it returns
+     * cudaSuccess.
+     */
+    cudaError_t SetMaxCachedBytes(
+        size_t max_cached_bytes)
+    {
+    #ifdef __CUDA_ARCH__
+        // Caching functionality only defined on host
+        return CubDebug(cudaErrorInvalidConfiguration);
+    #else
+
+        // Lock
+        Lock(&spin_lock);
+
+        this->max_cached_bytes = max_cached_bytes;  // the parameter shadows the member, hence this->
+
+        if (debug) CubLog("New max_cached_bytes(%lld)\n", (long long) max_cached_bytes);
+
+        // Unlock
+        Unlock(&spin_lock);
+
+        return cudaSuccess;
+
+    #endif  // __CUDA_ARCH__
+    }
+
+
+    /**
+     * \brief Provides a suitable allocation of device memory for the given size on the specified device
+     *
+     * The request is rounded up to its bin size (at least the minimum bin).
+     * Requests whose bin exceeds \p max_bin are allocated with their exact
+     * size and tagged with the out-of-range bin (unsigned int) -1.  A cached
+     * block of the same device and bin is reused when available; otherwise
+     * a new block is obtained with cudaMalloc on \p device (the spinlock is
+     * released around the CUDA calls) and the previous current device is
+     * restored afterwards.
+     *
+     * \param[out] d_ptr   Receives the device pointer (NULL on error)
+     * \param[in]  bytes   Minimum number of bytes requested
+     * \param[in]  device  Device ordinal on which to allocate
+     *
+     * \return cudaSuccess, or the first CUDA error encountered.  A failure
+     * of the allocation itself is never hidden by a successful restore of
+     * the previous device.
+     */
+    cudaError_t DeviceAllocate(
+        void** d_ptr,
+        size_t bytes,
+        int device)
+    {
+    #ifdef __CUDA_ARCH__
+        // Caching functionality only defined on host
+        return CubDebug(cudaErrorInvalidConfiguration);
+    #else
+
+        bool locked                     = false;
+        int entrypoint_device           = INVALID_DEVICE_ORDINAL;
+        cudaError_t error               = cudaSuccess;
+
+        // Round up to nearest bin size
+        unsigned int bin;
+        size_t bin_bytes;
+        NearestPowerOf(bin, bin_bytes, bin_growth, bytes);
+        if (bin < min_bin) {
+            bin = min_bin;
+            bin_bytes = min_bin_bytes;
+        }
+
+        // Check if bin is greater than our maximum bin
+        if (bin > max_bin)
+        {
+            // Allocate the request exactly and give out-of-range bin
+            bin = (unsigned int) -1;
+            bin_bytes = bytes;
+        }
+
+        BlockDescriptor search_key(bin_bytes, bin, device);
+
+        // Lock
+        Lock(&spin_lock);
+        locked = true;
+
+        do {
+            // Find a free block big enough within the same bin on the same device
+            CachedBlocks::iterator block_itr = cached_blocks.lower_bound(search_key);
+            if ((block_itr != cached_blocks.end()) &&
+                (block_itr->device == device) &&
+                (block_itr->bin == search_key.bin))
+            {
+                // Reuse existing cache block.  Insert into live blocks.
+                search_key = *block_itr;
+                live_blocks.insert(search_key);
+
+                // Remove from free blocks
+                cached_blocks.erase(block_itr);
+                cached_bytes[device] -= search_key.bytes;
+
+                if (debug) CubLog("\tdevice %d reused cached block (%lld bytes). %lld available blocks cached (%lld bytes), %lld live blocks outstanding.\n",
+                    device, (long long) search_key.bytes, (long long) cached_blocks.size(), (long long) cached_bytes[device], (long long) live_blocks.size());
+            }
+            else
+            {
+                // Need to allocate a new cache block. Unlock so that other
+                // threads are not stalled behind the CUDA runtime calls.
+                Unlock(&spin_lock);
+                locked = false;
+
+                // Set to specified device
+                if (CubDebug(error = cudaGetDevice(&entrypoint_device))) break;
+                if (CubDebug(error = cudaSetDevice(device))) break;
+
+                // Allocate
+                if (CubDebug(error = cudaMalloc(&search_key.d_ptr, search_key.bytes))) break;
+
+                // Lock
+                Lock(&spin_lock);
+                locked = true;
+
+                // Insert into live blocks
+                live_blocks.insert(search_key);
+
+                if (debug) CubLog("\tdevice %d allocating new device block %lld bytes. %lld available blocks cached (%lld bytes), %lld live blocks outstanding.\n",
+                    device, (long long) search_key.bytes, (long long) cached_blocks.size(), (long long) cached_bytes[device], (long long) live_blocks.size());
+            }
+        } while(0);
+
+        // Unlock
+        if (locked) {
+            Unlock(&spin_lock);
+            locked = false;
+        }
+
+        // Copy device pointer to output parameter (NULL on error)
+        *d_ptr = search_key.d_ptr;
+
+        // Attempt to revert back to previous device if necessary.  The result
+        // goes into a separate variable: assigning it to 'error' directly
+        // would replace an earlier failure (e.g. from cudaMalloc) with
+        // cudaSuccess whenever the restore itself succeeds.
+        if (entrypoint_device != INVALID_DEVICE_ORDINAL)
+        {
+            cudaError_t restore_error = cudaSuccess;
+            if (CubDebug(restore_error = cudaSetDevice(entrypoint_device)))
+            {
+                if (error == cudaSuccess) error = restore_error;
+            }
+        }
+
+        return error;
+
+    #endif  // __CUDA_ARCH__
+    }
+
+
+    /**
+     * \brief Provides a suitable allocation of device memory for the given size on the current device
+     *
+     * Looks up the current device with cudaGetDevice() and forwards the
+     * request to the three-argument DeviceAllocate() overload.  Returns the
+     * first error reported by either step, or cudaSuccess.
+     */
+    cudaError_t DeviceAllocate(
+        void** d_ptr,
+        size_t bytes)
+    {
+    #ifdef __CUDA_ARCH__
+        // Caching functionality only defined on host
+        return CubDebug(cudaErrorInvalidConfiguration);
+    #else
+        int device_ordinal;
+        cudaError_t status = cudaSuccess;
+
+        // Without a device ordinal there is nothing to forward to
+        if (CubDebug(status = cudaGetDevice(&device_ordinal)))
+            return status;
+
+        // Delegate to the device-specific overload; CubDebug reports any failure
+        CubDebug(status = DeviceAllocate(d_ptr, bytes, device_ordinal));
+        return status;
+
+    #endif  // __CUDA_ARCH__
+    }
+
+
+    /**
+     * \brief Frees a live allocation of device memory on the specified device, returning it to the allocator
+     *
+     * The block is looked up among the live blocks by (device, pointer).
+     * If keeping it would not push the cached bytes of \p device beyond
+     * \p max_cached_bytes, it is moved to the cache for later reuse;
+     * otherwise it is released with cudaFree on \p device (the spinlock is
+     * released around the CUDA calls) and the previous current device is
+     * restored afterwards.
+     *
+     * \param[in] d_ptr   Device pointer previously returned by DeviceAllocate
+     * \param[in] device  Device ordinal the pointer was allocated on
+     *
+     * \return cudaSuccess, cudaErrorUnknown if the pointer is not a live
+     * block of this allocator, or the first CUDA error encountered.  A
+     * failure of the free itself is never hidden by a successful restore of
+     * the previous device.
+     */
+    cudaError_t DeviceFree(
+        void* d_ptr,
+        int device)
+    {
+    #ifdef __CUDA_ARCH__
+        // Caching functionality only defined on host
+        return CubDebug(cudaErrorInvalidConfiguration);
+    #else
+
+        bool locked                     = false;
+        int entrypoint_device           = INVALID_DEVICE_ORDINAL;
+        cudaError_t error               = cudaSuccess;
+
+        BlockDescriptor search_key(d_ptr, device);
+
+        // Lock
+        Lock(&spin_lock);
+        locked = true;
+
+        do {
+            // Find corresponding block descriptor
+            BusyBlocks::iterator block_itr = live_blocks.find(search_key);
+            if (block_itr == live_blocks.end())
+            {
+                // Cannot find pointer
+                if (CubDebug(error = cudaErrorUnknown)) break;
+            }
+            else
+            {
+                // Remove from live blocks
+                search_key = *block_itr;
+                live_blocks.erase(block_itr);
+
+                // Check if we should keep the returned allocation
+                if (cached_bytes[device] + search_key.bytes <= max_cached_bytes)
+                {
+                    // Insert returned allocation into free blocks
+                    cached_blocks.insert(search_key);
+                    cached_bytes[device] += search_key.bytes;
+
+                    if (debug) CubLog("\tdevice %d returned %lld bytes. %lld available blocks cached (%lld bytes), %lld live blocks outstanding.\n",
+                        device, (long long) search_key.bytes, (long long) cached_blocks.size(), (long long) cached_bytes[device], (long long) live_blocks.size());
+                }
+                else
+                {
+                    // Capture the statistics for the debug message while the
+                    // lock is still held; the containers must not be read
+                    // once the lock has been released.
+                    const long long num_cached_blocks   = (long long) cached_blocks.size();
+                    const long long num_cached_bytes    = (long long) cached_bytes[device];
+                    const long long num_live_blocks     = (long long) live_blocks.size();
+
+                    // Free the returned allocation.  Unlock.
+                    Unlock(&spin_lock);
+                    locked = false;
+
+                    // Set to specified device
+                    if (CubDebug(error = cudaGetDevice(&entrypoint_device))) break;
+                    if (CubDebug(error = cudaSetDevice(device))) break;
+
+                    // Free device memory
+                    if (CubDebug(error = cudaFree(d_ptr))) break;
+
+                    if (debug) CubLog("\tdevice %d freed %lld bytes.  %lld available blocks cached (%lld bytes), %lld live blocks outstanding.\n",
+                        device, (long long) search_key.bytes, num_cached_blocks, num_cached_bytes, num_live_blocks);
+                }
+            }
+        } while (0);
+
+        // Unlock
+        if (locked) {
+            Unlock(&spin_lock);
+            locked = false;
+        }
+
+        // Attempt to revert back to entry-point device if necessary.  The
+        // result goes into a separate variable: assigning it to 'error'
+        // directly would replace an earlier failure (e.g. from cudaFree)
+        // with cudaSuccess whenever the restore itself succeeds.
+        if (entrypoint_device != INVALID_DEVICE_ORDINAL)
+        {
+            cudaError_t restore_error = cudaSuccess;
+            if (CubDebug(restore_error = cudaSetDevice(entrypoint_device)))
+            {
+                if (error == cudaSuccess) error = restore_error;
+            }
+        }
+
+        return error;
+
+    #endif  // __CUDA_ARCH__
+    }
+
+
+    /**
+     * \brief Frees a live allocation of device memory on the current device, returning it to the allocator
+     *
+     * Looks up the current device with cudaGetDevice() and forwards the
+     * request to the two-argument DeviceFree() overload.  Returns the first
+     * error reported by either step, or cudaSuccess.
+     */
+    cudaError_t DeviceFree(
+        void* d_ptr)
+    {
+    #ifdef __CUDA_ARCH__
+        // Caching functionality only defined on host
+        return CubDebug(cudaErrorInvalidConfiguration);
+    #else
+
+        int device_ordinal;
+        cudaError_t status = cudaSuccess;
+
+        // Without a device ordinal there is nothing to forward to
+        if (CubDebug(status = cudaGetDevice(&device_ordinal)))
+            return status;
+
+        // Delegate to the device-specific overload; CubDebug reports any failure
+        CubDebug(status = DeviceFree(d_ptr, device_ordinal));
+        return status;
+
+    #endif  // __CUDA_ARCH__
+    }
+
+
+    /**
+     * \brief Frees all cached device allocations on all devices
+     *
+     * Cached blocks are released one at a time with cudaFree while the
+     * spinlock is held, switching the current device whenever the next
+     * block belongs to a different one.  Live blocks are not affected.
+     * Processing stops at the first CUDA error; the device that was current
+     * on entry is restored before returning.
+     *
+     * \return cudaSuccess, or the first CUDA error encountered.  A failure
+     * while freeing is never hidden by a successful restore of the previous
+     * device.
+     */
+    cudaError_t FreeAllCached()
+    {
+    #ifdef __CUDA_ARCH__
+        // Caching functionality only defined on host
+        return CubDebug(cudaErrorInvalidConfiguration);
+    #else
+
+        cudaError_t error         = cudaSuccess;
+        bool locked               = false;
+        int entrypoint_device     = INVALID_DEVICE_ORDINAL;
+        int current_device        = INVALID_DEVICE_ORDINAL;
+
+        // Lock
+        Lock(&spin_lock);
+        locked = true;
+
+        while (!cached_blocks.empty())
+        {
+            // Get first block
+            CachedBlocks::iterator begin = cached_blocks.begin();
+
+            // Get entry-point device ordinal if necessary
+            if (entrypoint_device == INVALID_DEVICE_ORDINAL)
+            {
+                if (CubDebug(error = cudaGetDevice(&entrypoint_device))) break;
+            }
+
+            // Set current device ordinal if necessary
+            if (begin->device != current_device)
+            {
+                if (CubDebug(error = cudaSetDevice(begin->device))) break;
+                current_device = begin->device;
+            }
+
+            // Free device memory
+            if (CubDebug(error = cudaFree(begin->d_ptr))) break;
+
+            // Copy the size out first: erase() invalidates the iterator, so
+            // it must not be dereferenced afterwards (not even for logging).
+            const size_t freed_bytes = begin->bytes;
+
+            // Reduce balance and erase entry
+            cached_bytes[current_device] -= freed_bytes;
+            cached_blocks.erase(begin);
+
+            if (debug) CubLog("\tdevice %d freed %lld bytes.  %lld available blocks cached (%lld bytes), %lld live blocks outstanding.\n",
+                current_device, (long long) freed_bytes, (long long) cached_blocks.size(), (long long) cached_bytes[current_device], (long long) live_blocks.size());
+        }
+
+        // Unlock
+        if (locked) {
+            Unlock(&spin_lock);
+            locked = false;
+        }
+
+        // Attempt to revert back to entry-point device if necessary.  The
+        // result goes into a separate variable: assigning it to 'error'
+        // directly would replace an earlier failure with cudaSuccess
+        // whenever the restore itself succeeds.
+        if (entrypoint_device != INVALID_DEVICE_ORDINAL)
+        {
+            cudaError_t restore_error = cudaSuccess;
+            if (CubDebug(restore_error = cudaSetDevice(entrypoint_device)))
+            {
+                if (error == cudaSuccess) error = restore_error;
+            }
+        }
+
+        return error;
+
+    #endif  // __CUDA_ARCH__
+    }
+
+
+    /**
+     * \brief Destructor
+     *
+     * Calls FreeAllCached() unless \p skip_cleanup is set.  The error code
+     * returned by FreeAllCached() is not checked.
+     */
+    virtual ~CachingDeviceAllocator()
+    {
+        if (!skip_cleanup)
+            FreeAllCached();
+    }
+
+};
+
+
+
+
+/** @} */       // end group UtilModule
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
--- a/lib/kokkos/TPL/cub/util_arch.cuh
+++ b/lib/kokkos/TPL/cub/util_arch.cuh
@ -0,0 +1,295 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2013, NVIDIA CORPORATION.  All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * Static architectural properties by SM version.
+ */
+
+
+/******************************************************************************
+ * Static architectural properties by SM version.
+ *
+ * "PtxArchProps" reflects the PTX architecture targeted by the active compiler
+ * pass.  It provides useful compile-time statics within device code.  E.g.:
+ *
+ *     __shared__ int smem[PtxArchProps::WARP_THREADS];
+ *
+ *     int padded_offset = threadIdx.x + (threadIdx.x >> PtxArchProps::LOG_SMEM_BANKS);
+ *
+ ******************************************************************************/
+
+#pragma once
+
+#include "util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/**
+ * \addtogroup UtilModule
+ * @{
+ */
+
+
+/// CUB_PTX_ARCH reflects the PTX version targeted by the active compiler pass (or zero during the host pass).
+/// It mirrors __CUDA_ARCH__ but is always defined, so it can be used directly in \#if expressions.
+#ifndef __CUDA_ARCH__
+    #define CUB_PTX_ARCH 0
+#else
+    #define CUB_PTX_ARCH __CUDA_ARCH__
+#endif
+
+
+/// Whether or not the source targeted by the active compiler pass is allowed to invoke device kernels or methods from the CUDA runtime API.
+/// Defined during the host pass (no __CUDA_ARCH__), and during device passes only when CUB_CDP is defined.
+#if !defined(__CUDA_ARCH__) || defined(CUB_CDP)
+#define CUB_RUNTIME_ENABLED
+#endif
+
+
+/// Execution space for destructors:
+/// host-only when compiling device code for a PTX architecture below 200, host and device otherwise (including the host pass).
+#if ((CUB_PTX_ARCH > 0) && (CUB_PTX_ARCH < 200))
+    #define CUB_DESTRUCTOR __host__
+#else
+    #define CUB_DESTRUCTOR __host__ __device__
+#endif
+
+
+/**
+ * \brief Compile-time table of CUDA device properties, selected by SM architecture.
+ *
+ * This primary template carries the SM10 values.  It is what any architecture
+ * value without an explicit specialization resolves to.
+ */
+template <int SM_ARCH>
+struct ArchProps
+{
+    enum
+    {
+        LOG_WARP_THREADS    = 5,                        ///< Log of the number of threads per warp
+        WARP_THREADS        = 1 << LOG_WARP_THREADS,    ///< Number of threads per warp
+        LOG_SMEM_BANKS      = 4,                        ///< Log of the number of smem banks
+        SMEM_BANKS          = 1 << LOG_SMEM_BANKS,      ///< The number of smem banks
+        SMEM_BANK_BYTES     = 4,                        ///< Size of smem bank words
+        SMEM_BYTES          = 16 * 1024,                ///< Maximum SM shared memory
+        SMEM_ALLOC_UNIT     = 512,                      ///< Smem allocation size in bytes
+        REGS_BY_BLOCK       = true,                     ///< Whether or not the architecture allocates registers by block (or by warp)
+        REG_ALLOC_UNIT      = 256,                      ///< Number of registers allocated at a time per block (or by warp)
+        WARP_ALLOC_UNIT     = 2,                        ///< Granularity of warps for which registers are allocated
+        MAX_SM_THREADS      = 768,                      ///< Maximum number of threads per SM
+        MAX_SM_THREADBLOCKS = 8,                        ///< Maximum number of thread blocks per SM
+        MAX_BLOCK_THREADS   = 512,                      ///< Maximum number of threads per thread block
+        MAX_SM_REGISTERS    = 8 * 1024,                 ///< Maximum number of registers per SM
+    };
+};
+
+
+
+
+#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
+
+/**
+ * SM30 architecture properties.
+ *
+ * Callback() here never forwards to another specialization: it always hands
+ * this table to the target, whatever \p sm_version is.
+ */
+template <>
+struct ArchProps<300>
+{
+    enum
+    {
+        // Warp geometry
+        LOG_WARP_THREADS    = 5,                        // log2(32 threads per warp)
+        WARP_THREADS        = 1 << LOG_WARP_THREADS,
+
+        // Shared memory
+        LOG_SMEM_BANKS      = 5,                        // log2(32 banks)
+        SMEM_BANKS          = 1 << LOG_SMEM_BANKS,
+        SMEM_BANK_BYTES     = 4,                        // bank words are 4 bytes wide
+        SMEM_BYTES          = 48 * 1024,                // 48KB of shared memory
+        SMEM_ALLOC_UNIT     = 256,                      // smem is allocated in 256B segments
+
+        // Register allocation
+        REGS_BY_BLOCK       = false,                    // registers are allocated per warp
+        REG_ALLOC_UNIT      = 256,                      // 256 registers at a time per warp
+        WARP_ALLOC_UNIT     = 4,                        // granularity of 4 warps per threadblock
+
+        // Occupancy limits
+        MAX_SM_THREADS      = 2048,                     // at most 2K threads per SM
+        MAX_SM_THREADBLOCKS = 16,                       // at most 16 threadblocks per SM
+        MAX_BLOCK_THREADS   = 1024,                     // at most 1024 threads per threadblock
+        MAX_SM_REGISTERS    = 64 * 1024,                // at most 64K registers per SM
+    };
+
+    // Callback utility: invokes target.Callback<>() with this SM30 table
+    template <typename T>
+    static __host__ __device__ __forceinline__ void Callback(T &target, int sm_version)
+    {
+        target.template Callback< ArchProps<300> >();
+    }
+};
+
+
+/**
+ * Architecture properties for SM20
+ *
+ * Callback() applies this table for SM versions below 300 (e.g., 200 and
+ * 210) and forwards versions of 300 and above to ArchProps<300>.
+ */
+template <>
+struct ArchProps<200>
+{
+    enum
+    {
+        LOG_WARP_THREADS    = 5,                        // 32 threads per warp
+        WARP_THREADS        = 1 << LOG_WARP_THREADS,
+        LOG_SMEM_BANKS      = 5,                        // 32 banks
+        SMEM_BANKS          = 1 << LOG_SMEM_BANKS,
+        SMEM_BANK_BYTES     = 4,                        // 4 byte bank words
+        SMEM_BYTES          = 48 * 1024,                // 48KB shared memory
+        SMEM_ALLOC_UNIT     = 128,                      // 128B smem allocation segment size
+        REGS_BY_BLOCK       = false,                    // Allocates registers by warp
+        REG_ALLOC_UNIT      = 64,                       // 64 registers allocated at a time per warp
+        WARP_ALLOC_UNIT     = 2,                        // Registers are allocated at a granularity of every 2 warps per threadblock
+        MAX_SM_THREADS      = 1536,                     // 1536 max threads per SM
+        MAX_SM_THREADBLOCKS = 8,                        // 8 max threadblocks per SM
+        MAX_BLOCK_THREADS   = 1024,                     // 1024 max threads per threadblock
+        MAX_SM_REGISTERS    = 32 * 1024,                // 32K max registers per SM
+    };
+
+    /**
+     * Callback utility.
+     *
+     * \param target      Object whose Callback<Props>() member template receives the selected table
+     * \param sm_version  SM version in XYZ integer form (e.g., 210 for SM2.1)
+     */
+    template <typename T>
+    static __host__ __device__ __forceinline__ void Callback(T &target, int sm_version)
+    {
+        // Forward only once the next table's architecture is reached.  A test
+        // of (sm_version > 200) would send SM21 to the SM30 table, although
+        // SM21 shares the SM20 properties.
+        if (sm_version >= 300) {
+            ArchProps<300>::Callback(target, sm_version);
+        } else {
+            target.template Callback<ArchProps>();
+        }
+    }
+};
+
+
+/**
+ * Architecture properties for SM12
+ *
+ * Callback() applies this table for SM versions below 200 (e.g., 120 and
+ * 130) and forwards versions of 200 and above to ArchProps<200>.
+ */
+template <>
+struct ArchProps<120>
+{
+    enum
+    {
+        LOG_WARP_THREADS    = 5,                        // 32 threads per warp
+        WARP_THREADS        = 1 << LOG_WARP_THREADS,
+        LOG_SMEM_BANKS      = 4,                        // 16 banks
+        SMEM_BANKS          = 1 << LOG_SMEM_BANKS,
+        SMEM_BANK_BYTES     = 4,                        // 4 byte bank words
+        SMEM_BYTES          = 16 * 1024,                // 16KB shared memory
+        SMEM_ALLOC_UNIT     = 512,                      // 512B smem allocation segment size
+        REGS_BY_BLOCK       = true,                     // Allocates registers by threadblock
+        REG_ALLOC_UNIT      = 512,                      // 512 registers allocated at time per threadblock
+        WARP_ALLOC_UNIT     = 2,                        // Registers are allocated at a granularity of every 2 warps per threadblock
+        MAX_SM_THREADS      = 1024,                     // 1024 max threads per SM
+        MAX_SM_THREADBLOCKS = 8,                        // 8 max threadblocks per SM
+        MAX_BLOCK_THREADS   = 512,                      // 512 max threads per threadblock
+        MAX_SM_REGISTERS    = 16 * 1024,                // 16K max registers per SM
+    };
+
+    /**
+     * Callback utility.
+     *
+     * \param target      Object whose Callback<Props>() member template receives the selected table
+     * \param sm_version  SM version in XYZ integer form (e.g., 130 for SM1.3)
+     */
+    template <typename T>
+    static __host__ __device__ __forceinline__ void Callback(T &target, int sm_version)
+    {
+        // Forward only once the next table's architecture is reached.  A test
+        // of (sm_version > 120) would send SM13 to the SM20 table, although
+        // SM13 shares the SM12 properties.
+        if (sm_version >= 200) {
+            ArchProps<200>::Callback(target, sm_version);
+        } else {
+            target.template Callback<ArchProps>();
+        }
+    }
+};
+
+
+/**
+ * Architecture properties for SM10.  Derives from the default ArchProps specialization.
+ *
+ * Callback() applies this table for SM versions below 120 (e.g., 100 and
+ * 110) and forwards versions of 120 and above to ArchProps<120>.
+ */
+template <>
+struct ArchProps<100> : ArchProps<0>
+{
+    /**
+     * Callback utility.
+     *
+     * \param target      Object whose Callback<Props>() member template receives the selected table
+     * \param sm_version  SM version in XYZ integer form (e.g., 110 for SM1.1)
+     */
+    template <typename T>
+    static __host__ __device__ __forceinline__ void Callback(T &target, int sm_version)
+    {
+        // Forward only once the next table's architecture is reached.  A test
+        // of (sm_version > 100) would send SM11 to the SM12 table, although
+        // SM11 shares the SM10 properties.
+        if (sm_version >= 120) {
+            ArchProps<120>::Callback(target, sm_version);
+        } else {
+            target.template Callback<ArchProps>();
+        }
+    }
+};
+
+
+/**
+ * SM35 architecture properties (same table as SM30)
+ */
+template <>
+struct ArchProps<350> : public ArchProps<300>
+{
+};
+
+/**
+ * SM21 architecture properties (same table as SM20)
+ */
+template <>
+struct ArchProps<210> : public ArchProps<200>
+{
+};
+
+/**
+ * SM13 architecture properties (same table as SM12)
+ */
+template <>
+struct ArchProps<130> : public ArchProps<120>
+{
+};
+
+/**
+ * SM11 architecture properties (same table as SM10)
+ */
+template <>
+struct ArchProps<110> : public ArchProps<100>
+{
+};
+
+
+#endif // DOXYGEN_SHOULD_SKIP_THIS
+
+
+/**
+ * \brief Architectural properties of the PTX version that the active compiler
+ * pass is targeting, i.e. ArchProps instantiated with CUB_PTX_ARCH.
+ */
+struct PtxArchProps : public ArchProps<CUB_PTX_ARCH>
+{
+};
+
+
+/** @} */       // end group UtilModule
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
--- a/lib/kokkos/TPL/cub/util_debug.cuh
+++ b/lib/kokkos/TPL/cub/util_debug.cuh
@ -0,0 +1,115 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2013, NVIDIA CORPORATION.  All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * Error and event logging routines.
+ *
+ * The following macro definitions are supported:
+ * - \p CUB_LOG.  Simple event messages are printed to \p stdout.
+ */
+
+#pragma once
+
+#include <stdio.h>
+#include "util_namespace.cuh"
+#include "util_arch.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/**
+ * \addtogroup UtilModule
+ * @{
+ */
+
+
+/// CUB error reporting macro (prints error messages to stderr).
+/// Switched on automatically when either DEBUG or _DEBUG is defined.
+#if (defined(DEBUG) || defined(_DEBUG))
+    #define CUB_STDERR
+#endif
+
+
+
+/**
+ * \brief Reports a CUDA error together with its source location and passes the error through.
+ *
+ * Nothing is printed unless \p CUB_STDERR is defined and \p error differs from
+ * \p cudaSuccess.  The host pass writes the message (including the error
+ * string) to \p stderr; device passes for PTX 200 and above use device-side
+ * \p printf and include the block and thread index.  Device passes for older
+ * PTX versions print nothing.
+ *
+ * \param error     CUDA error code to inspect
+ * \param filename  Source file name to include in the message
+ * \param line      Source line number to include in the message
+ *
+ * \return The CUDA error, unchanged.
+ */
+__host__ __device__ __forceinline__ cudaError_t Debug(
+    cudaError_t     error,
+    const char*     filename,
+    int             line)
+{
+#ifdef CUB_STDERR
+    if (error != cudaSuccess)
+    {
+    #if (CUB_PTX_ARCH >= 200)
+        // Device pass with printf support
+        printf("CUDA error %d [block %d, thread %d, %s, %d]\n", error, blockIdx.x, threadIdx.x, filename, line);
+    #elif (CUB_PTX_ARCH == 0)
+        // Host pass
+        fprintf(stderr, "CUDA error %d [%s, %d]: %s\n", error, filename, line, cudaGetErrorString(error));
+        fflush(stderr);
+    #endif
+    }
+#endif
+    return error;
+}
+
+
+/**
+ * \brief Debug macro
+ *
+ * Expands to a call of cub::Debug() with the current file and line, and
+ * evaluates to the cudaError_t that call returns, so it can be used directly
+ * as a condition.
+ */
+#define CubDebug(e) cub::Debug((e), __FILE__, __LINE__)
+
+
+/**
+ * \brief Debug macro with exit
+ *
+ * Calls exit(1) when the error is non-zero.  The expansion is a bare
+ * \p if statement (not wrapped in do/while), so an \p else written directly
+ * after a use of this macro binds to the macro's \p if.
+ * NOTE(review): exit() is declared in <stdlib.h>, which this header does not
+ * include directly -- confirm it is available wherever the macro is used.
+ */
+#define CubDebugExit(e) if (cub::Debug((e), __FILE__, __LINE__)) { exit(1); }
+
+
+/**
+ * \brief Log macro for printf statements.
+ *
+ * The host pass forwards straight to printf; device passes for PTX 200 and
+ * above prefix the message with the block and thread index.  The macro is
+ * left undefined for device passes below PTX 200.  The expansion already ends
+ * with a semicolon, and \p format is followed by a comma in the expansion, so
+ * at least one variadic argument is expected.
+ */
+#if (CUB_PTX_ARCH == 0)
+    #define CubLog(format, ...) printf(format,__VA_ARGS__);
+#elif (CUB_PTX_ARCH >= 200)
+    #define CubLog(format, ...) printf("[block %d, thread %d]: " format, blockIdx.x, threadIdx.x, __VA_ARGS__);
+#endif
+
+
+
+
+/** @} */       // end group UtilModule
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
--- a/lib/kokkos/TPL/cub/util_device.cuh
+++ b/lib/kokkos/TPL/cub/util_device.cuh
@ -0,0 +1,378 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2013, NVIDIA CORPORATION.  All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * Properties of a given CUDA device and the corresponding PTX bundle
+ */
+
+#pragma once
+
+#include "util_arch.cuh"
+#include "util_debug.cuh"
+#include "util_namespace.cuh"
+#include "util_macro.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/**
+ * \addtogroup UtilModule
+ * @{
+ */
+
+#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
+
+
+/**
+ * Kernel with an empty body.  Its function attributes are queried (see
+ * PtxVersion) to obtain PTX manifest metadata such as the PTX version.
+ */
+template <typename T>
+__global__ void EmptyKernel()
+{
+}
+
+
+/**
+ * Carve a set of sub-allocations out of one externally-allocated device buffer
+ * (or simply report how large that buffer has to be).
+ *
+ * Every requested size is rounded up to a multiple of 256 bytes and the
+ * requests are laid out back to back.  Note that \p allocation_sizes is
+ * overwritten: on return each entry holds the byte offset of the
+ * corresponding allocation instead of its size, including when only the size
+ * is being queried.
+ *
+ * \return \p cudaSuccess on success (or after a size query), or
+ *         \p cudaErrorMemoryAllocation when \p temp_storage_bytes is smaller
+ *         than the total required.
+ */
+template <int ALLOCATIONS>
+__host__ __device__ __forceinline__
+cudaError_t AliasTemporaries(
+    void    *d_temp_storage,                    ///< [in] %Device allocation of temporary storage.  When NULL, the required allocation size is returned in \p temp_storage_bytes and no work is done.
+    size_t  &temp_storage_bytes,                ///< [in,out] Size in bytes of \p d_temp_storage allocation
+    void*   (&allocations)[ALLOCATIONS],        ///< [out] Pointers to the aliased device allocations (written only when \p d_temp_storage is non-NULL and large enough)
+    size_t  (&allocation_sizes)[ALLOCATIONS])   ///< [in,out] Sizes in bytes of device allocations needed; replaced by the byte offset of each allocation
+{
+    const int ALIGN_BYTES   = 256;
+    const int ALIGN_MASK    = ~(ALIGN_BYTES - 1);
+
+    // Turn the sizes into running offsets, padding each request to the alignment
+    size_t total_bytes = 0;
+    for (int idx = 0; idx < ALLOCATIONS; idx++)
+    {
+        const size_t padded_bytes = (allocation_sizes[idx] + ALIGN_BYTES - 1) & ALIGN_MASK;
+        allocation_sizes[idx] = total_bytes;
+        total_bytes += padded_bytes;
+    }
+
+    // Size query only: report the total and leave the pointers untouched
+    if (!d_temp_storage)
+    {
+        temp_storage_bytes = total_bytes;
+        return cudaSuccess;
+    }
+
+    // The provided buffer must hold every padded allocation
+    if (total_bytes > temp_storage_bytes)
+    {
+        return CubDebug(cudaErrorMemoryAllocation);
+    }
+
+    // Hand out pointers at the computed offsets
+    char *base = static_cast<char*>(d_temp_storage);
+    for (int idx = 0; idx < ALLOCATIONS; idx++)
+    {
+        allocations[idx] = base + allocation_sizes[idx];
+    }
+
+    return cudaSuccess;
+}
+
+
+
+#endif  // DOXYGEN_SHOULD_SKIP_THIS
+
+
+
+/**
+ * \brief Retrieves the PTX version (major * 100 + minor * 10)
+ *
+ * The version is taken from the function attributes of EmptyKernel<void> and
+ * scaled by ten.  \p ptx_version is left untouched when the query fails.
+ *
+ * \return \p cudaErrorInvalidConfiguration when CUB_RUNTIME_ENABLED is not
+ *         defined for this compiler pass, otherwise the result of the
+ *         attribute query.
+ */
+__host__ __device__ __forceinline__ cudaError_t PtxVersion(int &ptx_version)
+{
+#ifdef CUB_RUNTIME_ENABLED
+
+    cudaFuncAttributes attrs;
+    cudaError_t error = CubDebug(cudaFuncGetAttributes(&attrs, EmptyKernel<void>));
+    if (error == cudaSuccess)
+    {
+        ptx_version = attrs.ptxVersion * 10;
+    }
+    return error;
+
+#else
+
+    // CUDA API calls not supported from this device
+    return cudaErrorInvalidConfiguration;
+
+#endif
+}
+
+
+/**
+ * Synchronize the stream if specified.
+ *
+ * The host pass waits on \p stream itself.  Device passes ignore \p stream
+ * and synchronize the whole device instead.
+ */
+__host__ __device__ __forceinline__
+static cudaError_t SyncStream(cudaStream_t stream)
+{
+#ifdef __CUDA_ARCH__
+    // Device can't yet sync on a specific stream
+    return cudaDeviceSynchronize();
+#else
+    return cudaStreamSynchronize(stream);
+#endif
+}
+
+
+
+/**
+ * \brief Properties of a given CUDA device and the corresponding PTX bundle
+ *
+ * All data members are plain public fields with no initializers; they are
+ * filled in by Init().  MaxSmOccupancy() reads those fields, so it is only
+ * meaningful after a successful Init().
+ */
+class Device
+{
+private:
+
+    /// Type definition of the EmptyKernel kernel entry point
+    typedef void (*EmptyKernelPtr)();
+
+    /// Force EmptyKernel<void> to be generated if this class is used
+    __host__ __device__ __forceinline__
+    EmptyKernelPtr Empty()
+    {
+        return EmptyKernel<void>;
+    }
+
+public:
+
+    // Version information
+    int     sm_version;             ///< SM version of target device (SM version X.YZ in XYZ integer form)
+    int     ptx_version;            ///< Bundled PTX version for target device (PTX version X.YZ in XYZ integer form)
+
+    // Target device properties
+    int     sm_count;               ///< Number of SMs
+    int     warp_threads;           ///< Number of threads per warp
+    int     smem_bank_bytes;        ///< Number of bytes per SM bank
+    int     smem_banks;             ///< Number of smem banks
+    int     smem_bytes;             ///< Smem bytes per SM
+    int     smem_alloc_unit;        ///< Smem segment size
+    bool    regs_by_block;          ///< Whether registers are allocated by threadblock (or by warp)
+    int     reg_alloc_unit;         ///< Granularity of register allocation within the SM
+    int     warp_alloc_unit;        ///< Granularity of warp allocation within the SM
+    int     max_sm_threads;         ///< Maximum number of threads per SM
+    int     max_sm_blocks;          ///< Maximum number of threadblocks per SM
+    int     max_block_threads;      ///< Maximum number of threads per threadblock
+    int     max_sm_registers;       ///< Maximum number of registers per SM
+    int     max_sm_warps;           ///< Maximum number of warps per SM
+
+    /**
+     * Callback for initializing device properties
+     *
+     * Copies the static enumerators of \p ArchProps into the corresponding
+     * members and derives max_sm_warps from them.  Invoked through
+     * ArchProps<...>::Callback(), which selects the table for the device's SM
+     * version.
+     */
+    template <typename ArchProps>
+    __host__ __device__ __forceinline__ void Callback()
+    {
+        warp_threads        = ArchProps::WARP_THREADS;
+        smem_bank_bytes     = ArchProps::SMEM_BANK_BYTES;
+        smem_banks          = ArchProps::SMEM_BANKS;
+        smem_bytes          = ArchProps::SMEM_BYTES;
+        smem_alloc_unit     = ArchProps::SMEM_ALLOC_UNIT;
+        regs_by_block       = ArchProps::REGS_BY_BLOCK;
+        reg_alloc_unit      = ArchProps::REG_ALLOC_UNIT;
+        warp_alloc_unit     = ArchProps::WARP_ALLOC_UNIT;
+        max_sm_threads      = ArchProps::MAX_SM_THREADS;
+        max_sm_blocks       = ArchProps::MAX_SM_THREADBLOCKS;
+        max_block_threads   = ArchProps::MAX_BLOCK_THREADS;
+        max_sm_registers    = ArchProps::MAX_SM_REGISTERS;
+        // Derived value: whole warps that fit in the per-SM thread limit
+        max_sm_warps        = max_sm_threads / warp_threads;
+    }
+
+
+public:
+
+    /**
+     * Initializer.  Properties are retrieved for the specified GPU ordinal.
+     *
+     * \param device_ordinal  Ordinal of the device to query
+     *
+     * \return \p cudaErrorInvalidConfiguration when CUB_RUNTIME_ENABLED is not
+     *         defined for this compiler pass; otherwise the first error
+     *         returned by a CUDA query, or \p cudaSuccess.  Members assigned
+     *         before a failing query keep their new values.
+     */
+    __host__ __device__ __forceinline__
+    cudaError_t Init(int device_ordinal)
+    {
+    #ifndef CUB_RUNTIME_ENABLED
+
+        // CUDA API calls not supported from this device
+        return CubDebug(cudaErrorInvalidConfiguration);
+
+    #else
+
+        cudaError_t error = cudaSuccess;
+        // Single-pass do/while so that any failing query can bail out with "break"
+        do
+        {
+            // Fill in SM version
+            int major, minor;
+            if (CubDebug(error = cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, device_ordinal))) break;
+            if (CubDebug(error = cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, device_ordinal))) break;
+            sm_version = major * 100 + minor * 10;
+
+            // Fill in static SM properties
+            // Initialize our device properties via callback from static device properties
+            // (the chain starts at the SM10 table and forwards according to sm_version)
+            ArchProps<100>::Callback(*this, sm_version);
+
+            // Fill in SM count
+            if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break;
+
+            // Fill in PTX version
+            // (known statically in a device pass; queried through PtxVersion() in the host pass)
+        #if CUB_PTX_ARCH > 0
+            ptx_version = CUB_PTX_ARCH;
+        #else
+            if (CubDebug(error = PtxVersion(ptx_version))) break;
+        #endif
+
+        }
+        while (0);
+
+        return error;
+
+    #endif
+    }
+
+
+    /**
+     * Initializer.  Properties are retrieved for the current GPU ordinal.
+     *
+     * Looks up the current device with cudaGetDevice() and delegates to
+     * Init(int).
+     *
+     * \return \p cudaErrorInvalidConfiguration when CUB_RUNTIME_ENABLED is not
+     *         defined for this compiler pass; otherwise the error from
+     *         cudaGetDevice() or from Init(int), or \p cudaSuccess.
+     */
+    __host__ __device__ __forceinline__
+    cudaError_t Init()
+    {
+    #ifndef CUB_RUNTIME_ENABLED
+
+        // CUDA API calls not supported from this device
+        return CubDebug(cudaErrorInvalidConfiguration);
+
+    #else
+
+        cudaError_t error = cudaSuccess;
+        do
+        {
+            int device_ordinal;
+            if ((error = CubDebug(cudaGetDevice(&device_ordinal)))) break;
+            if ((error = Init(device_ordinal))) break;
+        }
+        while (0);
+        return error;
+
+    #endif
+    }
+
+
+    /**
+     * Computes maximum SM occupancy in thread blocks for the given kernel
+     *
+     * The result is the minimum of four limits: the per-SM block limit, the
+     * warp limit, the register limit and the shared-memory limit.  The
+     * register and shared-memory usage of the kernel is read with
+     * cudaFuncGetAttributes(); the shared-memory figure used is the
+     * \p sharedSizeBytes attribute only.  \p max_sm_occupancy is left
+     * untouched when that query fails.
+     *
+     * \return \p cudaErrorInvalidConfiguration when CUB_RUNTIME_ENABLED is not
+     *         defined for this compiler pass; otherwise the result of the
+     *         attribute query.
+     */
+    template <typename KernelPtr>
+    __host__ __device__ __forceinline__
+    cudaError_t MaxSmOccupancy(
+        int                 &max_sm_occupancy,          ///< [out] maximum number of thread blocks that can reside on a single SM
+        KernelPtr           kernel_ptr,                 ///< [in] Kernel pointer for which to compute SM occupancy
+        int                 block_threads)              ///< [in] Number of threads per thread block
+    {
+    #ifndef CUB_RUNTIME_ENABLED
+
+        // CUDA API calls not supported from this device
+        return CubDebug(cudaErrorInvalidConfiguration);
+
+    #else
+
+        cudaError_t error = cudaSuccess;
+        do
+        {
+            // Get kernel attributes
+            cudaFuncAttributes kernel_attrs;
+            if (CubDebug(error = cudaFuncGetAttributes(&kernel_attrs, kernel_ptr))) break;
+
+            // Number of warps per threadblock (rounded up to whole warps)
+            int block_warps = (block_threads +  warp_threads - 1) / warp_threads;
+
+            // Max warp occupancy (falls back to the block limit when the block has no warps)
+            int max_warp_occupancy = (block_warps > 0) ?
+                max_sm_warps / block_warps :
+                max_sm_blocks;
+
+            // Maximum register occupancy
+            int max_reg_occupancy;
+            if ((block_threads == 0) || (kernel_attrs.numRegs == 0))
+            {
+                // Prevent divide-by-zero
+                max_reg_occupancy = max_sm_blocks;
+            }
+            else if (regs_by_block)
+            {
+                // Allocates registers by threadblock
+                // (the block's register demand is rounded up to the allocation unit)
+                int block_regs = CUB_ROUND_UP_NEAREST(kernel_attrs.numRegs * warp_threads * block_warps, reg_alloc_unit);
+                max_reg_occupancy = max_sm_registers / block_regs;
+            }
+            else
+            {
+                // Allocates registers by warp
+                // (the register file is split into warp_alloc_unit equal parts and
+                // whole warps are counted per part before scaling back up)
+                int sm_sides                = warp_alloc_unit;
+                int sm_registers_per_side   = max_sm_registers / sm_sides;
+                int regs_per_warp           = CUB_ROUND_UP_NEAREST(kernel_attrs.numRegs * warp_threads, reg_alloc_unit);
+                int warps_per_side          = sm_registers_per_side / regs_per_warp;
+                int warps                   = warps_per_side * sm_sides;
+                max_reg_occupancy           = warps / block_warps;
+            }
+
+            // Shared memory per threadblock (rounded up to the smem allocation unit)
+            int block_allocated_smem = CUB_ROUND_UP_NEAREST(
+                kernel_attrs.sharedSizeBytes,
+                smem_alloc_unit);
+
+            // Max shared memory occupancy (falls back to the block limit when no smem is used)
+            int max_smem_occupancy = (block_allocated_smem > 0) ?
+                (smem_bytes / block_allocated_smem) :
+                max_sm_blocks;
+
+            // Max occupancy: the tightest of the four limits
+            max_sm_occupancy = CUB_MIN(
+                CUB_MIN(max_sm_blocks, max_warp_occupancy),
+                CUB_MIN(max_smem_occupancy, max_reg_occupancy));
+
+//            printf("max_smem_occupancy(%d), max_warp_occupancy(%d), max_reg_occupancy(%d)", max_smem_occupancy, max_warp_occupancy, max_reg_occupancy);
+
+        } while (0);
+
+        return error;
+
+    #endif
+    }
+
+};
+
+
+/** @} */       // end group UtilModule
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
--- a/lib/kokkos/TPL/cub/util_iterator.cuh
+++ b/lib/kokkos/TPL/cub/util_iterator.cuh
@ -0,0 +1,718 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2013, NVIDIA CORPORATION.  All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * Random-access iterator types
+ */
+
+#pragma once
+
+#include "thread/thread_load.cuh"
+#include "util_device.cuh"
+#include "util_debug.cuh"
+#include "util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/******************************************************************************
+ * Texture references
+ *****************************************************************************/
+
+#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
+
+// Anonymous namespace
+namespace {
+
+/// Holder for the single texture reference associated with element type \p T
+template <typename T>
+struct TexIteratorRef
+{
+    /// One-dimensional texture reference over \p T, read in element-type mode
+    typedef texture<T, cudaTextureType1D, cudaReadModeElementType> TexRef;
+
+    /// The texture reference itself (one per instantiation of this struct)
+    static TexRef ref;
+
+    /**
+     * Bind \p d_in to the texture reference.  A NULL \p d_in binds nothing
+     * and reports cudaSuccess.
+     */
+    static cudaError_t BindTexture(void *d_in)
+    {
+        cudaChannelFormatDesc channel_desc = cudaCreateChannelDesc<T>();
+        if (!d_in)
+            return cudaSuccess;
+
+        return CubDebug(cudaBindTexture(NULL, ref, d_in, channel_desc));
+    }
+
+    /**
+     * Release whatever is currently bound to the texture reference
+     */
+    static cudaError_t UnbindTexture()
+    {
+        cudaError_t unbind_error = CubDebug(cudaUnbindTexture(ref));
+        return unbind_error;
+    }
+};
+
+// Out-of-class definition of the static texture reference member
+template <typename T>
+typename TexIteratorRef<T>::TexRef TexIteratorRef<T>::ref = 0;
+
+} // Anonymous namespace
+
+
+#endif // DOXYGEN_SHOULD_SKIP_THIS
+
+
+
+
+
+
+
+/**
+ * \addtogroup UtilModule
+ * @{
+ */
+
+
+/******************************************************************************
+ * Iterators
+ *****************************************************************************/
+
+/**
+ * \brief A simple random-access iterator pointing to a range of constant values
+ *
+ * \par Overview
+ * ConstantIteratorRA is a random-access iterator that, when dereferenced or
+ * indexed, always returns the supplied constant of type \p OutputType.
+ * The iterator carries no position, so incrementing or offsetting it yields
+ * an iterator that reports the same constant.
+ *
+ * \tparam OutputType           The value type of this iterator
+ */
+template <typename OutputType>
+class ConstantIteratorRA
+{
+public:
+
+#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
+
+    typedef ConstantIteratorRA                  self_type;
+    typedef OutputType                          value_type;
+    typedef OutputType                          reference;
+    typedef OutputType*                         pointer;
+    typedef std::random_access_iterator_tag     iterator_category;
+    typedef int                                 difference_type;
+
+#endif  // DOXYGEN_SHOULD_SKIP_THIS
+
+private:
+
+    OutputType    val;      ///< The constant reported at every position
+
+public:
+
+    /// Constructor
+    __host__ __device__ __forceinline__ ConstantIteratorRA(
+        const OutputType &val)          ///< Constant value for the iterator instance to report
+    :
+        val(val)
+    {}
+
+#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
+
+    /// Prefix increment (there is no position to advance)
+    __host__ __device__ __forceinline__ self_type operator++()
+    {
+        return *this;
+    }
+
+    /// Postfix increment (there is no position to advance)
+    __host__ __device__ __forceinline__ self_type operator++(int junk)
+    {
+        return *this;
+    }
+
+    /// Dereference: returns the constant by value
+    __host__ __device__ __forceinline__ reference operator*()
+    {
+        return val;
+    }
+
+    /// Offset forward: the result reports the same constant
+    template <typename SizeT>
+    __host__ __device__ __forceinline__ self_type operator+(SizeT n)
+    {
+        return ConstantIteratorRA(val);
+    }
+
+    /// Offset backward: the result reports the same constant
+    template <typename SizeT>
+    __host__ __device__ __forceinline__ self_type operator-(SizeT n)
+    {
+        return ConstantIteratorRA(val);
+    }
+
+    /// Indexing: returns the constant by value regardless of \p n.
+    /// (Previously this returned an iterator object, which does not match
+    /// the declared \p reference return type.)
+    template <typename SizeT>
+    __host__ __device__ __forceinline__ reference operator[](SizeT n)
+    {
+        return val;
+    }
+
+    /// Member access: address of the stored constant
+    __host__ __device__ __forceinline__ pointer operator->()
+    {
+        return &val;
+    }
+
+    /// Two constant iterators compare equal when their constants compare equal
+    __host__ __device__ __forceinline__ bool operator==(const self_type& rhs)
+    {
+        return (val == rhs.val);
+    }
+
+    /// Two constant iterators compare unequal when their constants differ
+    __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs)
+    {
+        return (val != rhs.val);
+    }
+
+#endif // DOXYGEN_SHOULD_SKIP_THIS
+
+};
+
+
+
+/**
+ * \brief A simple random-access transform iterator for applying a transformation operator.
+ *
+ * \par Overview
+ * TransformIteratorRA is a random-access iterator that wraps both a native
+ * device pointer of type <tt>InputType*</tt> and a unary conversion functor of
+ * type \p ConversionOp. \p OutputType references are made by pulling \p InputType
+ * values through the \p ConversionOp instance.
+ *
+ * \tparam OutputType           The value type of this iterator
+ * \tparam ConversionOp         Unary functor type for mapping objects of type \p InputType to type \p OutputType.  Must have member <tt>OutputType operator()(const InputType &datum)</tt>.
+ * \tparam InputType            The value type of the pointer being wrapped
+ */
+template <typename OutputType, typename ConversionOp, typename InputType>
+class TransformIteratorRA
+{
+public:
+
+#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
+
+    typedef TransformIteratorRA                 self_type;
+    typedef OutputType                          value_type;
+    typedef OutputType                          reference;
+    typedef OutputType*                         pointer;
+    typedef std::random_access_iterator_tag     iterator_category;
+    typedef int                                 difference_type;
+
+#endif  // DOXYGEN_SHOULD_SKIP_THIS
+
+private:
+
+    ConversionOp    conversion_op;  ///< Functor applied to every value read through the iterator
+    InputType*      ptr;            ///< Current position in the wrapped range
+
+public:
+
+    /**
+     * \brief Constructor
+     * @param ptr Native pointer to wrap
+     * @param conversion_op Unary transformation functor
+     */
+    __host__ __device__ __forceinline__ TransformIteratorRA(InputType* ptr, ConversionOp conversion_op) :
+        conversion_op(conversion_op),
+        ptr(ptr) {}
+
+#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
+
+    /// Prefix increment: advance, then return the advanced iterator
+    __host__ __device__ __forceinline__ self_type operator++()
+    {
+        ptr++;
+        return *this;
+    }
+
+    /// Postfix increment: advance, but return the iterator as it was before advancing
+    __host__ __device__ __forceinline__ self_type operator++(int junk)
+    {
+        self_type previous = *this;
+        ptr++;
+        return previous;
+    }
+
+    /// Dereference: the converted value at the current position
+    __host__ __device__ __forceinline__ reference operator*()
+    {
+        return conversion_op(*ptr);
+    }
+
+    /// Iterator positioned \p n items after this one
+    template <typename SizeT>
+    __host__ __device__ __forceinline__ self_type operator+(SizeT n)
+    {
+        TransformIteratorRA retval(ptr + n, conversion_op);
+        return retval;
+    }
+
+    /// Iterator positioned \p n items before this one
+    template <typename SizeT>
+    __host__ __device__ __forceinline__ self_type operator-(SizeT n)
+    {
+        TransformIteratorRA retval(ptr - n, conversion_op);
+        return retval;
+    }
+
+    /// Indexing: the converted value \p n items after the current position
+    template <typename SizeT>
+    __host__ __device__ __forceinline__ reference operator[](SizeT n)
+    {
+        return conversion_op(ptr[n]);
+    }
+
+    /// Member access.
+    /// NOTE(review): this takes the address of the functor's result, which is
+    /// only well-formed when ConversionOp returns an lvalue; confirm no caller
+    /// instantiates it with a by-value functor.
+    __host__ __device__ __forceinline__ pointer operator->()
+    {
+        return &conversion_op(*ptr);
+    }
+
+    /// Iterators compare equal when they wrap the same position (the functor is not compared)
+    __host__ __device__ __forceinline__ bool operator==(const self_type& rhs)
+    {
+        return (ptr == rhs.ptr);
+    }
+
+    /// Iterators compare unequal when they wrap different positions
+    __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs)
+    {
+        return (ptr != rhs.ptr);
+    }
+
+#endif // DOXYGEN_SHOULD_SKIP_THIS
+
+};
+
+
+
+/**
+ * \brief A simple random-access iterator for loading primitive values through texture cache.
+ *
+ * \par Overview
+ * TexIteratorRA is a random-access iterator that wraps a native
+ * device pointer of type <tt>T*</tt>. References made through TexIteratorRA
+ * cause values to be pulled through texture cache.
+ *
+ * \par Usage Considerations
+ * - Can only be used with primitive types (e.g., \p char, \p int, \p float), with the exception of \p double
+ * - Only one TexIteratorRA or TexTransformIteratorRA of a certain \p T can be bound at any given time (per host thread)
+ *
+ * \tparam T                    The value type of the pointer being wrapped (and of this iterator)
+ */
+template <typename T>
+class TexIteratorRA
+{
+public:
+#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
+
+    typedef TexIteratorRA                       self_type;
+    typedef T                                   value_type;
+    typedef T                                   reference;
+    typedef T*                                  pointer;
+    typedef std::random_access_iterator_tag     iterator_category;
+    typedef int                                 difference_type;
+
+#endif // DOXYGEN_SHOULD_SKIP_THIS
+
+    /// Tag identifying iterator type as being texture-bindable
+    typedef void TexBindingTag;
+
+private:
+
+    T*                  ptr;                ///< Native pointer, advanced in lockstep with tex_align_offset
+    size_t              tex_align_offset;   ///< Item offset used for texture fetches
+    cudaTextureObject_t tex_obj;            ///< Texture object (only created when the PTX version is >= 300)
+
+public:
+
+    /**
+     * \brief Constructor
+     */
+    __host__ __device__ __forceinline__ TexIteratorRA()
+    :
+        ptr(NULL),
+        tex_align_offset(0),
+        tex_obj(0)
+    {}
+
+    /// \brief Bind iterator to texture (a texture object when the PTX version is >= 300, otherwise the per-type texture reference)
+    cudaError_t BindTexture(
+        T               *ptr,                   ///< Native pointer to wrap that is aligned to cudaDeviceProp::textureAlignment
+        size_t          bytes,                  ///< Size in bytes of the linear memory range to bind
+        size_t          tex_align_offset = 0)   ///< Offset (in items) from ptr denoting the position of the iterator
+    {
+        this->ptr = ptr;
+        this->tex_align_offset = tex_align_offset;
+
+        int ptx_version;
+        cudaError_t error = cudaSuccess;
+        if (CubDebug(error = PtxVersion(ptx_version))) return error;
+        if (ptx_version >= 300)
+        {
+            // Use texture object
+            cudaChannelFormatDesc   channel_desc = cudaCreateChannelDesc<T>();
+            cudaResourceDesc        res_desc;
+            cudaTextureDesc         tex_desc;
+            memset(&res_desc, 0, sizeof(cudaResourceDesc));
+            memset(&tex_desc, 0, sizeof(cudaTextureDesc));
+            res_desc.resType                = cudaResourceTypeLinear;
+            res_desc.res.linear.devPtr      = ptr;
+            res_desc.res.linear.desc        = channel_desc;
+            res_desc.res.linear.sizeInBytes = bytes;
+            tex_desc.readMode               = cudaReadModeElementType;
+            return CubDebug(cudaCreateTextureObject(&tex_obj, &res_desc, &tex_desc, NULL));
+        }
+        else
+        {
+            // Use texture reference
+            return TexIteratorRef<T>::BindTexture(ptr);
+        }
+    }
+
+    /// \brief Unbind iterator from texture (destroys the texture object, or unbinds the texture reference)
+    cudaError_t UnbindTexture()
+    {
+        int ptx_version;
+        cudaError_t error = cudaSuccess;
+        if (CubDebug(error = PtxVersion(ptx_version))) return error;
+        if (ptx_version < 300)
+        {
+            // Use texture reference
+            return TexIteratorRef<T>::UnbindTexture();
+        }
+        else
+        {
+            // Use texture object
+            return CubDebug(cudaDestroyTextureObject(tex_obj));
+        }
+    }
+
+#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
+
+    /// Prefix increment: advance, then return the advanced iterator
+    __host__ __device__ __forceinline__ self_type operator++()
+    {
+        ptr++;
+        tex_align_offset++;
+        return *this;
+    }
+
+    /// Postfix increment: advance, but return the iterator as it was before advancing
+    __host__ __device__ __forceinline__ self_type operator++(int junk)
+    {
+        self_type previous = *this;
+        ptr++;
+        tex_align_offset++;
+        return previous;
+    }
+
+    /// Dereference: the value at the current position
+    __host__ __device__ __forceinline__ reference operator*()
+    {
+#if (CUB_PTX_ARCH == 0)
+        // Simply dereference the pointer on the host
+        return *ptr;
+#elif (CUB_PTX_ARCH < 300)
+        // Use the texture reference
+        return tex1Dfetch(TexIteratorRef<T>::ref, tex_align_offset);
+#else
+        // Use the texture object (this class has no conversion functor; fetch T directly)
+        return tex1Dfetch<T>(tex_obj, tex_align_offset);
+#endif
+    }
+
+    /// Iterator positioned \p n items after this one
+    template <typename SizeT>
+    __host__ __device__ __forceinline__ self_type operator+(SizeT n)
+    {
+        // Start from a copy so the bound texture object travels with the result
+        self_type retval = *this;
+        retval.ptr = ptr + n;
+        retval.tex_align_offset = tex_align_offset + n;
+        return retval;
+    }
+
+    /// Iterator positioned \p n items before this one
+    template <typename SizeT>
+    __host__ __device__ __forceinline__ self_type operator-(SizeT n)
+    {
+        // Start from a copy so the bound texture object travels with the result
+        self_type retval = *this;
+        retval.ptr = ptr - n;
+        retval.tex_align_offset = tex_align_offset - n;
+        return retval;
+    }
+
+    /// Indexing: the value \p n items after the current position
+    template <typename SizeT>
+    __host__ __device__ __forceinline__ reference operator[](SizeT n)
+    {
+#if (CUB_PTX_ARCH == 0)
+        // Simply dereference the pointer on the host
+        return ptr[n];
+#elif (CUB_PTX_ARCH < 300)
+        // Use the texture reference
+        return tex1Dfetch(TexIteratorRef<T>::ref, tex_align_offset + n);
+#else
+        // Use the texture object (this class has no conversion functor; fetch T directly)
+        return tex1Dfetch<T>(tex_obj, tex_align_offset + n);
+#endif
+    }
+
+    /// Member access: the native pointer.
+    /// A texture fetch yields a temporary whose address cannot be taken, so
+    /// every compilation path returns the wrapped pointer, exactly as the
+    /// host path always did.
+    /// NOTE(review): this matches the texture-fetched element only when the
+    /// iterator was bound with tex_align_offset == 0 -- confirm with callers.
+    __host__ __device__ __forceinline__ pointer operator->()
+    {
+        return ptr;
+    }
+
+    /// Iterators compare equal when their native pointers are equal
+    __host__ __device__ __forceinline__ bool operator==(const self_type& rhs)
+    {
+        return (ptr == rhs.ptr);
+    }
+
+    /// Iterators compare unequal when their native pointers differ
+    __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs)
+    {
+        return (ptr != rhs.ptr);
+    }
+
+#endif // DOXYGEN_SHOULD_SKIP_THIS
+
+};
+
+
+/**
+ * \brief A simple random-access transform iterator for loading primitive values through texture cache and subsequently applying a transformation operator.
+ *
+ * \par Overview
+ * TexTransformIteratorRA is a random-access iterator that wraps both a native
+ * device pointer of type <tt>InputType*</tt> and a unary conversion functor of
+ * type \p ConversionOp. \p OutputType references are made by pulling \p InputType
+ * values through the texture cache and then transforming them using the
+ * \p ConversionOp instance.
+ *
+ * \par Usage Considerations
+ * - Can only be used with primitive types (e.g., \p char, \p int, \p float), with the exception of \p double
+ * - Only one TexIteratorRA or TexTransformIteratorRA of a certain \p InputType can be bound at any given time (per host thread)
+ *
+ * \tparam OutputType           The value type of this iterator
+ * \tparam ConversionOp         Unary functor type for mapping objects of type \p InputType to type \p OutputType.  Must have member <tt>OutputType operator()(const InputType &datum)</tt>.
+ * \tparam InputType            The value type of the pointer being wrapped
+ */
+template <typename OutputType, typename ConversionOp, typename InputType>
+class TexTransformIteratorRA
+{
+public:
+
+#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
+
+    typedef TexTransformIteratorRA              self_type;
+    typedef OutputType                          value_type;
+    typedef OutputType                          reference;
+    typedef OutputType*                         pointer;
+    typedef std::random_access_iterator_tag     iterator_category;
+    typedef int                                 difference_type;
+
+#endif  // DOXYGEN_SHOULD_SKIP_THIS
+
+    /// Tag identifying iterator type as being texture-bindable
+    typedef void TexBindingTag;
+
+private:
+
+    ConversionOp        conversion_op;      ///< Functor applied to every value read through the iterator
+    InputType*          ptr;                ///< Native pointer, advanced in lockstep with tex_align_offset
+    size_t              tex_align_offset;   ///< Item offset used for texture fetches
+    cudaTextureObject_t tex_obj;            ///< Texture object (only created when the PTX version is >= 300)
+
+public:
+
+    /**
+     * \brief Constructor
+     *
+     * Marked __host__ __device__ because the __host__ __device__ offset
+     * operators below construct iterators.
+     */
+    __host__ __device__ __forceinline__ TexTransformIteratorRA(
+        ConversionOp    conversion_op)          ///< Unary transformation functor
+    :
+        conversion_op(conversion_op),
+        ptr(NULL),
+        tex_align_offset(0),
+        tex_obj(0)
+    {}
+
+    /// \brief Bind iterator to texture (a texture object when the PTX version is >= 300, otherwise the per-type texture reference)
+    cudaError_t BindTexture(
+        InputType*      ptr,                    ///< Native pointer to wrap that is aligned to cudaDeviceProp::textureAlignment
+        size_t          bytes,                  ///< Size in bytes of the linear memory range to bind
+        size_t          tex_align_offset = 0)   ///< Offset (in items) from ptr denoting the position of the iterator
+    {
+        this->ptr = ptr;
+        this->tex_align_offset = tex_align_offset;
+
+        int ptx_version;
+        cudaError_t error = cudaSuccess;
+        if (CubDebug(error = PtxVersion(ptx_version))) return error;
+        if (ptx_version >= 300)
+        {
+            // Use texture object
+            cudaChannelFormatDesc   channel_desc = cudaCreateChannelDesc<InputType>();
+            cudaResourceDesc        res_desc;
+            cudaTextureDesc         tex_desc;
+            memset(&res_desc, 0, sizeof(cudaResourceDesc));
+            memset(&tex_desc, 0, sizeof(cudaTextureDesc));
+            res_desc.resType                = cudaResourceTypeLinear;
+            res_desc.res.linear.devPtr      = ptr;
+            res_desc.res.linear.desc        = channel_desc;
+            res_desc.res.linear.sizeInBytes = bytes;
+            tex_desc.readMode               = cudaReadModeElementType;
+            return CubDebug(cudaCreateTextureObject(&tex_obj, &res_desc, &tex_desc, NULL));
+        }
+        else
+        {
+            // Use texture reference
+            return TexIteratorRef<InputType>::BindTexture(ptr);
+        }
+    }
+
+    /// \brief Unbind iterator from texture (destroys the texture object, or unbinds the texture reference)
+    cudaError_t UnbindTexture()
+    {
+        int ptx_version;
+        cudaError_t error = cudaSuccess;
+        if (CubDebug(error = PtxVersion(ptx_version))) return error;
+        if (ptx_version >= 300)
+        {
+            // Use texture object
+            return CubDebug(cudaDestroyTextureObject(tex_obj));
+        }
+        else
+        {
+            // Use texture reference
+            return TexIteratorRef<InputType>::UnbindTexture();
+        }
+    }
+
+#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
+
+    /// Prefix increment: advance, then return the advanced iterator
+    __host__ __device__ __forceinline__ self_type operator++()
+    {
+        ptr++;
+        tex_align_offset++;
+        return *this;
+    }
+
+    /// Postfix increment: advance, but return the iterator as it was before advancing
+    __host__ __device__ __forceinline__ self_type operator++(int junk)
+    {
+        self_type previous = *this;
+        ptr++;
+        tex_align_offset++;
+        return previous;
+    }
+
+    /// Dereference: the converted value at the current position
+    __host__ __device__ __forceinline__ reference operator*()
+    {
+#if (CUB_PTX_ARCH == 0)
+        // Simply dereference the pointer on the host
+        return conversion_op(*ptr);
+#elif (CUB_PTX_ARCH < 300)
+        // Use the texture reference
+        return conversion_op(tex1Dfetch(TexIteratorRef<InputType>::ref, tex_align_offset));
+#else
+        // Use the texture object
+        return conversion_op(tex1Dfetch<InputType>(tex_obj, tex_align_offset));
+#endif
+    }
+
+    /// Iterator positioned \p n items after this one
+    template <typename SizeT>
+    __host__ __device__ __forceinline__ self_type operator+(SizeT n)
+    {
+        // Start from a copy so the bound texture object travels with the result
+        self_type retval = *this;
+        retval.ptr = ptr + n;
+        retval.tex_align_offset = tex_align_offset + n;
+        return retval;
+    }
+
+    /// Iterator positioned \p n items before this one
+    template <typename SizeT>
+    __host__ __device__ __forceinline__ self_type operator-(SizeT n)
+    {
+        // Start from a copy so the bound texture object travels with the result
+        self_type retval = *this;
+        retval.ptr = ptr - n;
+        retval.tex_align_offset = tex_align_offset - n;
+        return retval;
+    }
+
+    /// Indexing: the converted value \p n items after the current position
+    template <typename SizeT>
+    __host__ __device__ __forceinline__ reference operator[](SizeT n)
+    {
+#if (CUB_PTX_ARCH == 0)
+        // Simply dereference the pointer on the host
+        return conversion_op(ptr[n]);
+#elif (CUB_PTX_ARCH < 300)
+        // Use the texture reference
+        return conversion_op(tex1Dfetch(TexIteratorRef<InputType>::ref, tex_align_offset + n));
+#else
+        // Use the texture object
+        return conversion_op(tex1Dfetch<InputType>(tex_obj, tex_align_offset + n));
+#endif
+    }
+
+    /// Member access.
+    /// NOTE(review): every path takes the address of the functor's result,
+    /// which is only well-formed when ConversionOp returns an lvalue; confirm
+    /// no caller instantiates it with a by-value functor.
+    __host__ __device__ __forceinline__ pointer operator->()
+    {
+#if (CUB_PTX_ARCH == 0)
+        // Simply dereference the pointer on the host
+        return &conversion_op(*ptr);
+#elif (CUB_PTX_ARCH < 300)
+        // Use the texture reference
+        return &conversion_op(tex1Dfetch(TexIteratorRef<InputType>::ref, tex_align_offset));
+#else
+        // Use the texture object
+        return &conversion_op(tex1Dfetch<InputType>(tex_obj, tex_align_offset));
+#endif
+    }
+
+    /// Iterators compare equal when their native pointers are equal (the functor is not compared)
+    __host__ __device__ __forceinline__ bool operator==(const self_type& rhs)
+    {
+        return (ptr == rhs.ptr);
+    }
+
+    /// Iterators compare unequal when their native pointers differ
+    __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs)
+    {
+        return (ptr != rhs.ptr);
+    }
+
+#endif // DOXYGEN_SHOULD_SKIP_THIS
+
+};
+
+
+
+
+/** @} */       // end group UtilModule
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
--- a/lib/kokkos/TPL/cub/util_macro.cuh
+++ b/lib/kokkos/TPL/cub/util_macro.cuh
@ -0,0 +1,107 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2013, NVIDIA CORPORATION.  All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/******************************************************************************
+ * Common C/C++ macro utilities
+ ******************************************************************************/
+
+#pragma once
+
+#include "util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/**
+ * \addtogroup UtilModule
+ * @{
+ */
+
+/**
+ * Align struct to the requested number of bytes.
+ *
+ * Both branches forward \p bytes to the compiler-specific alignment
+ * attribute (the Windows branch previously ignored it and always used 32).
+ */
+#if defined(_WIN32) || defined(_WIN64)
+    #define CUB_ALIGN(bytes) __declspec(align(bytes))
+#else
+    #define CUB_ALIGN(bytes) __attribute__((aligned(bytes)))
+#endif
+
+/**
+ * Select maximum(a, b).
+ *
+ * Each argument appears twice in the expansion, so an argument with side
+ * effects may be evaluated more than once.
+ */
+#define CUB_MAX(a, b) (((a) > (b)) ? (a) : (b))
+
+/**
+ * Select minimum(a, b).
+ *
+ * Each argument appears twice in the expansion, so an argument with side
+ * effects may be evaluated more than once.
+ */
+#define CUB_MIN(a, b) (((a) < (b)) ? (a) : (b))
+
+/**
+ * Quotient of x/y rounded down to nearest integer
+ * (expands to a plain division of the two operands)
+ */
+#define CUB_QUOTIENT_FLOOR(x, y) ((x) / (y))
+
+/**
+ * Quotient of x/y rounded up to nearest integer.
+ *
+ * \p y appears twice in the expansion.
+ */
+#define CUB_QUOTIENT_CEILING(x, y) (((x) + (y) - 1) / (y))
+
+/**
+ * x rounded up to the nearest multiple of y.
+ *
+ * Every use of \p y is parenthesized so that compound arguments such as
+ * <tt>a + b</tt> multiply as a whole.  \p y appears more than once in the
+ * expansion.
+ */
+#define CUB_ROUND_UP_NEAREST(x, y) ((((x) + (y) - 1) / (y)) * (y))
+
+/**
+ * x rounded down to the nearest multiple of y.
+ *
+ * Every use of \p y is parenthesized so that compound arguments such as
+ * <tt>a + b</tt> multiply as a whole.  \p y appears more than once in the
+ * expansion.
+ */
+#define CUB_ROUND_DOWN_NEAREST(x, y) (((x) / (y)) * (y))
+
+/**
+ * Return character string for given type
+ * (stringizes the macro argument and prepends an empty string literal)
+ */
+#define CUB_TYPE_STRING(type) ""#type
+
+#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
+    // Token pasting in two steps: CUB_CAT forwards to CUB_CAT_ so that macro
+    // arguments (e.g. __LINE__) are expanded before being pasted together.
+    #define CUB_CAT_(a, b) a ## b
+    #define CUB_CAT(a, b) CUB_CAT_(a, b)
+#endif // DOXYGEN_SHOULD_SKIP_THIS
+
+/**
+ * Static assert.
+ *
+ * Declares a typedef of an int array named cub_static_assert<line number>
+ * whose extent is 1 when \p cond holds and -1 (a compile error) otherwise.
+ * \p msg does not appear in the expansion.
+ */
+#define CUB_STATIC_ASSERT(cond, msg) typedef int CUB_CAT(cub_static_assert, __LINE__)[(cond) ? 1 : -1]
+
+
+/** @} */       // end group UtilModule
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
--- a/lib/kokkos/TPL/cub/util_namespace.cuh
+++ b/lib/kokkos/TPL/cub/util_namespace.cuh
@ -0,0 +1,41 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2013, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * Place-holder for prefixing the cub namespace
+ */
+
+#pragma once
+
+// For example:
+//#define CUB_NS_PREFIX namespace thrust{ namespace detail {
+//#define CUB_NS_POSTFIX } }
+
+// Both macros are defined empty here, so they expand to nothing where used.
+#define CUB_NS_PREFIX
+#define CUB_NS_POSTFIX
--- a/lib/kokkos/TPL/cub/util_ptx.cuh
+++ b/lib/kokkos/TPL/cub/util_ptx.cuh
@ -0,0 +1,380 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2013, NVIDIA CORPORATION.  All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * PTX intrinsics
+ */
+
+
+#pragma once
+
+#include "util_type.cuh"
+#include "util_arch.cuh"
+#include "util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/**
+ * \addtogroup UtilModule
+ * @{
+ */
+
+
+/******************************************************************************
+ * PTX helper macros
+ ******************************************************************************/
+
+/**
+ * Register modifier for pointer-types (for inlining PTX assembly)
+ *
+ * When _WIN64 or __LP64__ is defined, __CUB_LP64__ is 1 and the pointer
+ * constraint / size strings are "l" / "u64"; otherwise __CUB_LP64__ is 0 and
+ * they are "r" / "u32".
+ */
+#if defined(_WIN64) || defined(__LP64__)
+    #define __CUB_LP64__ 1
+    // 64-bit register modifier for inlined asm
+    #define _CUB_ASM_PTR_ "l"
+    #define _CUB_ASM_PTR_SIZE_ "u64"
+#else
+    #define __CUB_LP64__ 0
+    // 32-bit register modifier for inlined asm
+    #define _CUB_ASM_PTR_ "r"
+    #define _CUB_ASM_PTR_SIZE_ "u32"
+#endif
+
+
+/******************************************************************************
+ * Inlined PTX intrinsics
+ ******************************************************************************/
+
+/**
+ * Shift-right-then-add.  Computes (x >> shift) + addend.
+ *
+ * When __CUDA_ARCH__ >= 200 the result is produced by a single
+ * vshr.u32.u32.u32.clamp.add instruction; otherwise the plain C++
+ * expression is evaluated.
+ */
+__device__ __forceinline__ unsigned int SHR_ADD(
+    unsigned int x,
+    unsigned int shift,
+    unsigned int addend)
+{
+#if __CUDA_ARCH__ >= 200
+    unsigned int result;
+    asm("vshr.u32.u32.u32.clamp.add %0, %1, %2, %3;"
+        : "=r"(result)
+        : "r"(x), "r"(shift), "r"(addend));
+    return result;
+#else
+    return (x >> shift) + addend;
+#endif
+}
+
+
+/**
+ * Shift-left-then-add.  Computes (x << shift) + addend.
+ *
+ * When __CUDA_ARCH__ >= 200 the result is produced by a single
+ * vshl.u32.u32.u32.clamp.add instruction; otherwise the plain C++
+ * expression is evaluated.
+ */
+__device__ __forceinline__ unsigned int SHL_ADD(
+    unsigned int x,
+    unsigned int shift,
+    unsigned int addend)
+{
+#if __CUDA_ARCH__ >= 200
+    unsigned int result;
+    asm("vshl.u32.u32.u32.clamp.add %0, %1, %2, %3;"
+        : "=r"(result)
+        : "r"(x), "r"(shift), "r"(addend));
+    return result;
+#else
+    return (x << shift) + addend;
+#endif
+}
+
+
+/**
+ * Bitfield-extract.  Returns the \p num_bits wide field of \p source that
+ * starts at bit \p bit_start, right-justified in an unsigned int.
+ *
+ * When __CUDA_ARCH__ >= 200 the value is narrowed to 32 bits and extracted
+ * with bfe.u32.  The software fallback used otherwise yields zero when
+ * \p bit_start lies beyond the width of \p source and applies no mask when
+ * \p num_bits is 32 or more, instead of performing an undefined shift.
+ */
+template <typename UnsignedBits>
+__device__ __forceinline__ unsigned int BFE(
+    UnsignedBits source,
+    unsigned int bit_start,
+    unsigned int num_bits)
+{
+    unsigned int bits;
+#if __CUDA_ARCH__ >= 200
+    asm("bfe.u32 %0, %1, %2, %3;" : "=r"(bits) : "r"((unsigned int) source), "r"(bit_start), "r"(num_bits));
+#else
+    // Shifting by the operand width or more is undefined in C++, so guard both shifts
+    const unsigned int SOURCE_BITS = sizeof(UnsignedBits) * 8;
+    bits = (bit_start < SOURCE_BITS) ? (unsigned int) (source >> bit_start) : 0u;
+    if (num_bits < 32)
+        bits &= (1u << num_bits) - 1u;      // unsigned literal: no signed overflow at num_bits == 31
+#endif
+    return bits;
+}
+
+
+/**
+ * Bitfield-extract for 64-bit types.  Returns the \p num_bits wide field of
+ * \p source that starts at bit \p bit_start.
+ *
+ * The return type is unsigned int, so only the low 32 bits of the extracted
+ * field are returned.  A \p bit_start of 64 or more yields zero and a
+ * \p num_bits of 64 or more applies no mask (both would otherwise be
+ * undefined shifts).
+ */
+__device__ __forceinline__ unsigned int BFE(
+    unsigned long long source,
+    unsigned int bit_start,
+    unsigned int num_bits)
+{
+    if (bit_start >= 64)
+        return 0;
+
+    unsigned long long field = source >> bit_start;
+    if (num_bits < 64)
+        field &= (1ull << num_bits) - 1;
+
+    return (unsigned int) field;
+}
+
+
+/**
+ * Bitfield insert.  Inserts the first num_bits of y into x starting at bit_start
+ *
+ * \p ret receives a copy of \p x in which bits
+ * [bit_start, bit_start + num_bits) are replaced by the low \p num_bits bits
+ * of \p y.  Field bits that would land above bit 31 are dropped.
+ *
+ * When __CUDA_ARCH__ >= 200 this is a single bfi.b32 instruction; otherwise
+ * an equivalent mask-and-merge is computed in software so that \p ret is
+ * always written.
+ */
+__device__ __forceinline__ void BFI(
+    unsigned int &ret,
+    unsigned int x,
+    unsigned int y,
+    unsigned int bit_start,
+    unsigned int num_bits)
+{
+#if __CUDA_ARCH__ >= 200
+    asm("bfi.b32 %0, %1, %2, %3, %4;" :
+        "=r"(ret) : "r"(y), "r"(x), "r"(bit_start), "r"(num_bits));
+#else
+    if ((bit_start >= 32) || (num_bits == 0))
+    {
+        // Nothing to insert (also avoids an undefined shift by >= 32)
+        ret = x;
+    }
+    else
+    {
+        // Low num_bits ones, saturating at a full word to avoid shifting by 32
+        const unsigned int field = (num_bits >= 32) ? 0xffffffffu : ((1u << num_bits) - 1u);
+        // Field positioned at bit_start; bits shifted past bit 31 fall off
+        const unsigned int mask = field << bit_start;
+        ret = (x & ~mask) | ((y << bit_start) & mask);
+    }
+#endif
+}
+
+
+/**
+ * Three-operand add.  Returns x + y + z.
+ *
+ * When __CUDA_ARCH__ >= 200 the sum is produced by a single
+ * vadd.u32.u32.u32.add instruction; otherwise by ordinary addition.
+ */
+__device__ __forceinline__ unsigned int IADD3(unsigned int x, unsigned int y, unsigned int z)
+{
+#if __CUDA_ARCH__ >= 200
+    unsigned int sum;
+    asm("vadd.u32.u32.u32.add %0, %1, %2, %3;" : "=r"(sum) : "r"(x), "r"(y), "r"(z));
+    return sum;
+#else
+    return x + y + z;
+#endif
+}
+
+
+/**
+ * Byte-permute.  Selects four bytes from the register pair (a, b) as
+ * directed by \p index and packs them into one 32-bit result
+ * (prmt.b32 instruction).
+ */
+__device__ __forceinline__ int PRMT(unsigned int a, unsigned int b, unsigned int index)
+{
+    int permuted;
+    asm("prmt.b32 %0, %1, %2, %3;"
+        : "=r"(permuted)
+        : "r"(a), "r"(b), "r"(index));
+    return permuted;
+}
+
+
+/**
+ * Sync-threads barrier.
+ *
+ * Emits "bar.sync 1, count".
+ * NOTE(review): going by the PTX bar.sync operand order this waits on
+ * barrier resource 1 (not barrier 0) for \p count participating threads --
+ * confirm against the PTX ISA before relying on it.
+ */
+__device__ __forceinline__ void BAR(int count)
+{
+    asm volatile("bar.sync 1, %0;" : : "r"(count));
+}
+
+
+/**
+ * Single-precision multiply with round-towards-zero (mul.rz.f32).
+ * Returns a * b.
+ */
+__device__ __forceinline__ float FMUL_RZ(float a, float b)
+{
+    float product;
+    asm("mul.rz.f32 %0, %1, %2;"
+        : "=f"(product)
+        : "f"(a), "f"(b));
+    return product;
+}
+
+
+/**
+ * Single-precision fused multiply-add with round-towards-zero (fma.rz.f32).
+ * Returns a * b + c.
+ */
+__device__ __forceinline__ float FFMA_RZ(float a, float b, float c)
+{
+    float result;
+    asm("fma.rz.f32 %0, %1, %2, %3;"
+        : "=f"(result)
+        : "f"(a), "f"(b), "f"(c));
+    return result;
+}
+
+
+/**
+ * Terminates the calling thread
+ *
+ * Emits the PTX "exit" instruction.  The statement is marked volatile
+ * because it has no output operands: without it the compiler is allowed to
+ * treat the asm as side-effect free and delete or move it.
+ */
+__device__ __forceinline__ void ThreadExit() {
+    asm volatile("exit;");
+}
+
+
+/**
+ * Returns the warp lane ID of the calling thread
+ *
+ * Reads the %laneid special register.  The percent sign is written as "%%"
+ * because the asm template has operands, so a single '%' introduces an
+ * operand reference.
+ */
+__device__ __forceinline__ unsigned int LaneId()
+{
+    unsigned int ret;
+    asm("mov.u32 %0, %%laneid;" : "=r"(ret) );
+    return ret;
+}
+
+
+/**
+ * Returns the warp ID of the calling thread
+ *
+ * Reads the %warpid special register.  The percent sign is written as "%%"
+ * because the asm template has operands, so a single '%' introduces an
+ * operand reference.
+ */
+__device__ __forceinline__ unsigned int WarpId()
+{
+    unsigned int ret;
+    asm("mov.u32 %0, %%warpid;" : "=r"(ret) );
+    return ret;
+}
+
+/**
+ * Returns the warp lane mask of all lanes less than the calling thread
+ *
+ * Reads the %lanemask_lt special register ("%%" escapes the literal percent
+ * sign in an asm template that has operands).
+ */
+__device__ __forceinline__ unsigned int LaneMaskLt()
+{
+    unsigned int ret;
+    asm("mov.u32 %0, %%lanemask_lt;" : "=r"(ret) );
+    return ret;
+}
+
+/**
+ * Returns the warp lane mask of all lanes less than or equal to the calling thread
+ *
+ * Reads the %lanemask_le special register ("%%" escapes the literal percent
+ * sign in an asm template that has operands).
+ */
+__device__ __forceinline__ unsigned int LaneMaskLe()
+{
+    unsigned int ret;
+    asm("mov.u32 %0, %%lanemask_le;" : "=r"(ret) );
+    return ret;
+}
+
+/**
+ * Returns the warp lane mask of all lanes greater than the calling thread
+ *
+ * Reads the %lanemask_gt special register ("%%" escapes the literal percent
+ * sign in an asm template that has operands).
+ */
+__device__ __forceinline__ unsigned int LaneMaskGt()
+{
+    unsigned int ret;
+    asm("mov.u32 %0, %%lanemask_gt;" : "=r"(ret) );
+    return ret;
+}
+
+/**
+ * Returns the warp lane mask of all lanes greater than or equal to the calling thread
+ *
+ * Reads the %lanemask_ge special register ("%%" escapes the literal percent
+ * sign in an asm template that has operands).
+ */
+__device__ __forceinline__ unsigned int LaneMaskGe()
+{
+    unsigned int ret;
+    asm("mov.u32 %0, %%lanemask_ge;" : "=r"(ret) );
+    return ret;
+}
+
+/**
+ * Portable implementation of __all
+ *
+ * Returns the result of __all(cond) when CUB_PTX_ARCH >= 120.  For older
+ * targets a shared-memory emulation is used: one flag per warp is set to 1
+ * by lane 0, cleared by any lane whose \p cond is zero, and then read back
+ * by every lane.
+ *
+ * NOTE(review): the emulation contains no barrier or fence between the
+ * writes and the read; it appears to rely on the lanes of a warp executing
+ * these statements in order -- confirm before reusing on other targets.
+ */
+__device__ __forceinline__ int WarpAll(int cond)
+{
+#if CUB_PTX_ARCH < 120
+
+    // One flag per warp, indexed by WarpId()
+    __shared__ volatile int warp_signals[PtxArchProps::MAX_SM_THREADS / PtxArchProps::WARP_THREADS];
+
+    // Lane 0 starts from "all true"
+    if (LaneId() == 0)
+        warp_signals[WarpId()] = 1;
+
+    // Any lane with a false predicate clears the flag
+    if (cond == 0)
+        warp_signals[WarpId()] = 0;
+
+    return warp_signals[WarpId()];
+
+#else
+
+    return __all(cond);
+
+#endif
+}
+
+
+/**
+ * Portable implementation of __any
+ *
+ * Returns the result of __any(cond) when CUB_PTX_ARCH >= 120.  For older
+ * targets a shared-memory emulation is used: one flag per warp is cleared
+ * by lane 0, set to 1 by any lane whose \p cond is non-zero, and then read
+ * back by every lane.
+ *
+ * NOTE(review): the emulation contains no barrier or fence between the
+ * writes and the read; it appears to rely on the lanes of a warp executing
+ * these statements in order -- confirm before reusing on other targets.
+ */
+__device__ __forceinline__ int WarpAny(int cond)
+{
+#if CUB_PTX_ARCH < 120
+
+    // One flag per warp, indexed by WarpId()
+    __shared__ volatile int warp_signals[PtxArchProps::MAX_SM_THREADS / PtxArchProps::WARP_THREADS];
+
+    // Lane 0 starts from "none true"
+    if (LaneId() == 0)
+        warp_signals[WarpId()] = 0;
+
+    // Any lane with a true predicate sets the flag
+    if (cond)
+        warp_signals[WarpId()] = 1;
+
+    return warp_signals[WarpId()];
+
+#else
+
+    return __any(cond);
+
+#endif
+}
+
+
+/// Generic shuffle-up
+///
+/// Moves a value of arbitrary type \p T between lanes by viewing it as an
+/// array of WordAlignment<T>::ShuffleWord elements and issuing one
+/// shfl.up.b32 per element.  Returns the value assembled from the shuffled
+/// words.
+///
+/// NOTE(review): there is no architecture guard around the shfl instruction
+/// here; callers presumably only instantiate this on targets that support
+/// it -- confirm.
+template <typename T>
+__device__ __forceinline__ T ShuffleUp(
+    T               input,              ///< [in] The calling thread's value to pass up the warp
+    int             src_offset)         ///< [in] The up-offset of the peer to read from
+{
+    enum
+    {
+        SHFL_C = 0,                     // Fourth operand passed to shfl.up.b32
+    };
+
+    typedef typename WordAlignment<T>::ShuffleWord ShuffleWord;
+
+    // Number of shuffle words needed to cover T (rounded up)
+    const int       WORDS           = (sizeof(T) + sizeof(ShuffleWord) - 1) / sizeof(ShuffleWord);
+    T               output;
+    // Word-wise views of the result and of the input
+    ShuffleWord     *output_alias   = reinterpret_cast<ShuffleWord *>(&output);
+    ShuffleWord     *input_alias    = reinterpret_cast<ShuffleWord *>(&input);
+
+    #pragma unroll
+    for (int WORD = 0; WORD < WORDS; ++WORD)
+    {
+        // Widen to a 32-bit register, shuffle, then narrow back to ShuffleWord
+        unsigned int shuffle_word = input_alias[WORD];
+        asm(
+            "  shfl.up.b32 %0, %1, %2, %3;"
+            : "=r"(shuffle_word) : "r"(shuffle_word), "r"(src_offset), "r"(SHFL_C));
+        output_alias[WORD] = (ShuffleWord) shuffle_word;
+    }
+
+    return output;
+}
+
+
+
+/** @} */       // end group UtilModule
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
--- a/lib/kokkos/TPL/cub/util_type.cuh
+++ b/lib/kokkos/TPL/cub/util_type.cuh
@ -0,0 +1,685 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2013, NVIDIA CORPORATION.  All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * Common type manipulation (metaprogramming) utilities
+ */
+
+#pragma once
+
+#include <iostream>
+#include <limits>
+
+#include "util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/**
+ * \addtogroup UtilModule
+ * @{
+ */
+
+
+
+/******************************************************************************
+ * Conditional types
+ ******************************************************************************/
+
+/**
+ * \brief Compile-time type selection: \p Type is \p ThenType when \p IF is true, otherwise \p ElseType
+ */
+template <bool IF, typename ThenType, typename ElseType>
+struct If
+{
+    /// Selected type (the primary template covers <tt>IF == true</tt>)
+    typedef ThenType Type;
+};
+
+#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
+
+/// Specialization covering <tt>IF == false</tt>
+template <typename TrueT, typename FalseT>
+struct If<false, TrueT, FalseT>
+{
+    typedef FalseT Type;
+};
+
+#endif // DOXYGEN_SHOULD_SKIP_THIS
+
+
+/******************************************************************************
+ * Type equality
+ ******************************************************************************/
+
+
+/**
+ * \brief Compile-time test of whether two types are identical
+ *
+ * \p VALUE is 1 when \p A and \p B are the same type and 0 otherwise;
+ * \p NEGATE is its logical complement.
+ */
+template <typename A, typename B>
+struct Equals
+{
+    enum
+    {
+        VALUE   = 0,        // distinct types
+        NEGATE  = 1
+    };
+};
+
+#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
+
+/// Specialization selected when both arguments are the same type
+template <typename Same>
+struct Equals <Same, Same>
+{
+    enum
+    {
+        VALUE   = 1,
+        NEGATE  = 0
+    };
+};
+
+#endif // DOXYGEN_SHOULD_SKIP_THIS
+
+
+/******************************************************************************
+ * Marker types
+ ******************************************************************************/
+
+/**
+ * \brief Empty marker type standing in for "no type"
+ *
+ * Assignment from a value of any type is accepted and ignored.
+ */
+struct NullType
+{
+#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
+    /// Discards the right-hand side and returns this object
+    template <typename T>
+    __host__ __device__ __forceinline__ NullType& operator =(const T&)
+    {
+        return *this;
+    }
+#endif // DOXYGEN_SHOULD_SKIP_THIS
+};
+
+
+/**
+ * \brief Wraps an integral constant in a distinct type so that it can drive compile-time dispatch (e.g., overload selection on a constant value)
+ */
+template <int A>
+struct Int2Type
+{
+    enum
+    {
+        /// The wrapped constant
+        VALUE = A
+    };
+};
+
+
+/******************************************************************************
+ * Size and alignment
+ ******************************************************************************/
+
+#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
+
+/// Derives word types for moving a \p T around in pieces, based on the
+/// alignment of \p T, plus raw-storage structs built from those words
+template <typename T>
+struct WordAlignment
+{
+    /// Helper whose size exceeds sizeof(T) by the alignment padding of T
+    struct Pad
+    {
+        T       val;
+        char    byte;
+    };
+
+    enum
+    {
+        /// The alignment of T in bytes
+        ALIGN_BYTES = sizeof(Pad) - sizeof(T)
+    };
+
+    /// Biggest shuffle word that T is a whole multiple of and is not larger than the alignment of T
+    /// (int for 4-byte-multiple alignment, else short for 2-byte-multiple alignment, else char)
+    typedef typename If<(ALIGN_BYTES % 4 == 0),
+        int,
+        typename If<(ALIGN_BYTES % 2 == 0),
+            short,
+            char>::Type>::Type                  ShuffleWord;
+
+    /// Biggest volatile word that T is a whole multiple of and is not larger than the alignment of T
+    /// (long long for 8-byte-multiple alignment, else ShuffleWord)
+    typedef typename If<(ALIGN_BYTES % 8 == 0),
+        long long,
+        ShuffleWord>::Type                      VolatileWord;
+
+    /// Biggest memory-access word that T is a whole multiple of and is not larger than the alignment of T
+    /// (longlong2 for 16-byte-multiple alignment, else long long for 8, else ShuffleWord)
+    typedef typename If<(ALIGN_BYTES % 16 == 0),
+        longlong2,
+        typename If<(ALIGN_BYTES % 8 == 0),
+            long long,                                 // needed to get heterogeneous PODs to work on all platforms
+            ShuffleWord>::Type>::Type           DeviceWord;
+
+    enum
+    {
+        /// sizeof(DeviceWord) / sizeof(T), using integer division
+        DEVICE_MULTIPLE = sizeof(DeviceWord) / sizeof(T)
+    };
+
+    /// Raw storage for a T as bytes
+    struct UninitializedBytes
+    {
+        char buf[sizeof(T)];
+    };
+
+    /// Raw storage for a T as shuffle words
+    struct UninitializedShuffleWords
+    {
+        ShuffleWord buf[sizeof(T) / sizeof(ShuffleWord)];
+    };
+
+    /// Raw storage for a T as volatile words
+    struct UninitializedVolatileWords
+    {
+        VolatileWord buf[sizeof(T) / sizeof(VolatileWord)];
+    };
+
+    /// Raw storage for a T as device words
+    struct UninitializedDeviceWords
+    {
+        DeviceWord buf[sizeof(T) / sizeof(DeviceWord)];
+    };
+
+
+};
+
+
+#endif // DOXYGEN_SHOULD_SKIP_THIS
+
+
+/******************************************************************************
+ * Wrapper types
+ ******************************************************************************/
+
+/**
+ * \brief Raw backing storage for a \p T, so that types with non-trivial constructors can be placed in unions
+ */
+template <typename T>
+struct Uninitialized
+{
+    /// Word type used for the backing store (WordAlignment<T>::DeviceWord)
+    typedef typename WordAlignment<T>::DeviceWord DeviceWord;
+
+    enum
+    {
+        /// Number of DeviceWord elements making up the store
+        WORDS = sizeof(T) / sizeof(DeviceWord)
+    };
+
+    /// Backing storage
+    DeviceWord storage[WORDS];
+
+    /// View this object as a \p T
+    __host__ __device__ __forceinline__ T& Alias()
+    {
+        T *aliased = reinterpret_cast<T *>(this);
+        return *aliased;
+    }
+};
+
+
+/**
+ * \brief A wrapper for passing simple static arrays as kernel parameters
+ *
+ * Holds \p COUNT elements of type \p T in a plain member array, so the whole
+ * array is carried by value wherever the wrapper is.
+ */
+template <typename T, int COUNT>
+struct ArrayWrapper
+{
+    /// Static array of type \p T
+    T array[COUNT];
+};
+
+
+/**
+ * \brief Pair of "ping-pong" buffers plus a selector naming the one that currently holds valid data.
+ *
+ * Multi-pass computations frequently read from one buffer while writing to
+ * another and then swap roles for the next pass.  This structure bundles the
+ * two buffer pointers with a \p selector index that identifies the
+ * "current" one.
+ */
+template <typename T>
+struct DoubleBuffer
+{
+    /// The two buffer pointers
+    T *d_buffers[2];
+
+    /// Index into \p d_buffers of the active/valid buffer
+    int selector;
+
+    /// \brief Constructs an empty pair: both pointers NULL, selector 0
+    __host__ __device__ __forceinline__ DoubleBuffer()
+        : selector(0)
+    {
+        d_buffers[0] = NULL;
+        d_buffers[1] = NULL;
+    }
+
+    /// \brief Constructs from two buffers; \p d_current becomes the selected one
+    __host__ __device__ __forceinline__ DoubleBuffer(
+        T *d_current,         ///< The currently valid buffer
+        T *d_alternate)       ///< Alternate storage buffer of the same size as \p d_current
+        : selector(0)
+    {
+        d_buffers[0] = d_current;
+        d_buffers[1] = d_alternate;
+    }
+
+    /// \brief Returns the pointer selected by \p selector
+    __host__ __device__ __forceinline__ T* Current()
+    {
+        return d_buffers[selector];
+    }
+};
+
+
+
+/******************************************************************************
+ * Static math
+ ******************************************************************************/
+
+/**
+ * \brief Compile-time log2(N), rounded up.
+ *
+ * For example:
+ *     Log2<8>::VALUE   // 3
+ *     Log2<3>::VALUE   // 2
+ *
+ * The recursion halves \p CURRENT_VAL until it reaches zero while counting
+ * the steps in \p COUNT; the terminating specialization then corrects the
+ * count for exact powers of two.
+ */
+template <int N, int CURRENT_VAL = N, int COUNT = 0>
+struct Log2
+{
+    /// Static logarithm value
+    enum
+    {
+        VALUE = Log2<N, (CURRENT_VAL >> 1), (COUNT + 1)>::VALUE            // Inductive case
+    };
+};
+
+#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
+template <int N, int STEPS>
+struct Log2<N, 0, STEPS>
+{
+    enum
+    {
+        // Base case: STEPS is the bit length of N
+        VALUE = ((1 << (STEPS - 1)) < N) ? STEPS : (STEPS - 1)
+    };
+};
+#endif // DOXYGEN_SHOULD_SKIP_THIS
+
+
+/**
+ * \brief Compile-time test of whether N has at most one bit set
+ *
+ * \p VALUE is 1 for powers of two (and also for N == 0, which passes the
+ * same bit test) and 0 otherwise.
+ */
+template <int N>
+struct PowerOfTwo
+{
+    enum
+    {
+        VALUE = !(N & (N - 1))
+    };
+};
+
+
+
+/******************************************************************************
+ * Pointer vs. iterator detection
+ ******************************************************************************/
+
+
+/**
+ * \brief Compile-time test of whether a type is a raw pointer
+ *
+ * \p VALUE is 1 for types of the form <tt>T*</tt> and 0 for everything else.
+ */
+template <typename T>
+struct IsPointer
+{
+    enum
+    {
+        VALUE = 0
+    };
+};
+
+#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
+
+/// Specialization matching raw pointers
+template <typename Pointee>
+struct IsPointer<Pointee*>
+{
+    enum
+    {
+        VALUE = 1
+    };
+};
+
+#endif // DOXYGEN_SHOULD_SKIP_THIS
+
+
+
+/******************************************************************************
+ * Qualifier detection
+ ******************************************************************************/
+
+/**
+ * \brief Compile-time test of whether a type is \p volatile qualified
+ *
+ * \p VALUE is 1 when the type carries a top-level \p volatile and 0 otherwise.
+ */
+template <typename T>
+struct IsVolatile
+{
+    enum
+    {
+        VALUE = 0
+    };
+};
+
+#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
+
+/// Specialization matching volatile-qualified types
+template <typename Unqualified>
+struct IsVolatile<Unqualified volatile>
+{
+    enum
+    {
+        VALUE = 1
+    };
+};
+
+#endif // DOXYGEN_SHOULD_SKIP_THIS
+
+
+/******************************************************************************
+ * Qualifier removal
+ ******************************************************************************/
+
+/**
+ * \brief Strips top-level \p const and \p volatile from type \p Tp.
+ *
+ * For example:
+ *     <tt>typename RemoveQualifiers<volatile int>::Type         // int;</tt>
+ *
+ * The second template parameter defaults to \p Tp and exists only so the
+ * partial specializations below can pattern-match the qualifiers.
+ */
+template <typename Tp, typename Up = Tp>
+struct RemoveQualifiers
+{
+    /// Type without \p const and \p volatile qualifiers
+    typedef Up Type;
+};
+
+#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
+
+/// Matches <tt>const volatile Bare</tt>
+template <typename Tp, typename Bare>
+struct RemoveQualifiers<Tp, const volatile Bare>
+{
+    typedef Bare Type;
+};
+
+/// Matches <tt>const Bare</tt>
+template <typename Tp, typename Bare>
+struct RemoveQualifiers<Tp, const Bare>
+{
+    typedef Bare Type;
+};
+
+/// Matches <tt>volatile Bare</tt>
+template <typename Tp, typename Bare>
+struct RemoveQualifiers<Tp, volatile Bare>
+{
+    typedef Bare Type;
+};
+
+#endif // DOXYGEN_SHOULD_SKIP_THIS
+
+
+
+/******************************************************************************
+ * Typedef-detection
+ ******************************************************************************/
+
+
+/**
+ * \brief Defines a structure \p detector_name that is templated on type \p T.  The \p detector_name struct exposes a constant member \p VALUE indicating whether or not parameter \p T exposes a nested type \p nested_type_name
+ *
+ * Detection works by overload resolution: the <tt>test</tt> overload taking
+ * <tt>typename C::nested_type_name*</tt> returns <tt>char&</tt> and is only
+ * viable when the nested type exists, while the variadic overload returns
+ * <tt>int&</tt>.  \p VALUE compares the size of the selected overload's
+ * result against <tt>sizeof(int)</tt>.
+ */
+#define CUB_DEFINE_DETECT_NESTED_TYPE(detector_name, nested_type_name)  \
+    template <typename T>                                               \
+    struct detector_name                                                \
+    {                                                                   \
+        template <typename C>                                           \
+        static char& test(typename C::nested_type_name*);               \
+        template <typename>                                             \
+        static int& test(...);                                          \
+        enum                                                            \
+        {                                                               \
+            VALUE = sizeof(test<T>(0)) < sizeof(int)                    \
+        };                                                              \
+    };
+
+
+
+/******************************************************************************
+ * Simple enable-if (similar to Boost)
+ ******************************************************************************/
+
+/**
+ * \brief Minimal enable-if (in the style of Boost): exposes \p Type only when \p Condition is true
+ */
+template <bool Condition, typename T = void>
+struct EnableIf
+{
+    /// Enable-if type for SFINAE dummy variables
+    typedef T Type;
+};
+
+#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
+
+/// Specialization for a false condition: deliberately has no \p Type member
+template <typename T>
+struct EnableIf<false, T>
+{
+};
+
+#endif // DOXYGEN_SHOULD_SKIP_THIS
+
+
+/******************************************************************************
+ * Functor signature detection
+ ******************************************************************************/
+
+/**
+ * \brief Determine whether or not BinaryOp's functor is of the form <tt>bool operator()(const T& a, const T&b)</tt> or <tt>bool operator()(const T& a, const T&b, unsigned int idx)</tt>
+ *
+ * \p HAS_PARAM is true when <tt>BinaryOp::operator()</tt> matches one of
+ * eight three-argument member-function signatures returning \p bool: the
+ * first two arguments are either <tt>const T&</tt> or <tt>T</tt>, the third
+ * is either <tt>unsigned int</tt> or <tt>int</tt>, and the operator is
+ * either const or non-const.
+ */
+template <typename T, typename BinaryOp>
+struct BinaryOpHasIdxParam
+{
+private:
+    // One helper per accepted signature; each can only be instantiated with a
+    // member-function pointer of exactly that type
+    template <typename BinaryOpT, bool (BinaryOpT::*)(const T &a, const T &b, unsigned int idx) const>  struct SFINAE1 {};
+    template <typename BinaryOpT, bool (BinaryOpT::*)(const T &a, const T &b, unsigned int idx)>        struct SFINAE2 {};
+    template <typename BinaryOpT, bool (BinaryOpT::*)(T a, T b, unsigned int idx) const>                struct SFINAE3 {};
+    template <typename BinaryOpT, bool (BinaryOpT::*)(T a, T b, unsigned int idx)>                      struct SFINAE4 {};
+
+    template <typename BinaryOpT, bool (BinaryOpT::*)(const T &a, const T &b, int idx) const>           struct SFINAE5 {};
+    template <typename BinaryOpT, bool (BinaryOpT::*)(const T &a, const T &b, int idx)>                 struct SFINAE6 {};
+    template <typename BinaryOpT, bool (BinaryOpT::*)(T a, T b, int idx) const>                         struct SFINAE7 {};
+    template <typename BinaryOpT, bool (BinaryOpT::*)(T a, T b, int idx)>                               struct SFINAE8 {};
+
+    // Overloads returning char are viable only when the matching helper can be formed
+    template <typename BinaryOpT> static char Test(SFINAE1<BinaryOpT, &BinaryOpT::operator()> *);
+    template <typename BinaryOpT> static char Test(SFINAE2<BinaryOpT, &BinaryOpT::operator()> *);
+    template <typename BinaryOpT> static char Test(SFINAE3<BinaryOpT, &BinaryOpT::operator()> *);
+    template <typename BinaryOpT> static char Test(SFINAE4<BinaryOpT, &BinaryOpT::operator()> *);
+
+    template <typename BinaryOpT> static char Test(SFINAE5<BinaryOpT, &BinaryOpT::operator()> *);
+    template <typename BinaryOpT> static char Test(SFINAE6<BinaryOpT, &BinaryOpT::operator()> *);
+    template <typename BinaryOpT> static char Test(SFINAE7<BinaryOpT, &BinaryOpT::operator()> *);
+    template <typename BinaryOpT> static char Test(SFINAE8<BinaryOpT, &BinaryOpT::operator()> *);
+
+    // Catch-all returning int, chosen when none of the signatures match
+    template <typename BinaryOpT> static int Test(...);
+
+public:
+
+    /// Whether the functor BinaryOp has a third index param (<tt>unsigned int</tt> or <tt>int</tt>)
+    static const bool HAS_PARAM = sizeof(Test<BinaryOp>(NULL)) == sizeof(char);
+};
+
+
+
+/******************************************************************************
+ * Simple type traits utilities.
+ *
+ * For example:
+ *     Traits<int>::CATEGORY             // SIGNED_INTEGER
+ *     Traits<NullType>::NULL_TYPE       // true
+ *     Traits<uint4>::CATEGORY           // NOT_A_NUMBER
+ *     Traits<uint4>::PRIMITIVE;         // false
+ *
+ ******************************************************************************/
+
+/**
+ * \brief Basic type traits categories
+ */
+enum Category
+{
+    NOT_A_NUMBER,           // Category given to types without a NumericTraits specialization
+    SIGNED_INTEGER,         // Signed integral primitives
+    UNSIGNED_INTEGER,       // Unsigned integral primitives
+    FLOATING_POINT          // float and double
+};
+
+
+/**
+ * \brief Basic type traits
+ *
+ * The primary template simply republishes its template arguments as
+ * \p CATEGORY, \p PRIMITIVE and \p NULL_TYPE.  The last template parameter
+ * is not used here; the primitive specializations expose it as
+ * \p UnsignedBits.
+ */
+template <Category CATEGORY_ARG, bool PRIMITIVE_ARG, bool NULL_TYPE_ARG, typename UnsignedBitsT>
+struct BaseTraits
+{
+    /// Category
+    static const Category CATEGORY      = CATEGORY_ARG;
+
+    enum
+    {
+        PRIMITIVE       = PRIMITIVE_ARG,
+        NULL_TYPE       = NULL_TYPE_ARG
+    };
+};
+
+#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
+
+/**
+ * Basic type traits (unsigned primitive specialization)
+ *
+ * MIN_KEY is all zero bits and MAX_KEY all one bits.  The twiddle functions
+ * are the identity for unsigned keys.
+ */
+template <typename UnsignedBitsT>
+struct BaseTraits<UNSIGNED_INTEGER, true, false, UnsignedBitsT>
+{
+    typedef UnsignedBitsT       UnsignedBits;
+
+    static const Category       CATEGORY    = UNSIGNED_INTEGER;
+    static const UnsignedBits   MIN_KEY     = UnsignedBits(0);
+    static const UnsignedBits   MAX_KEY     = UnsignedBits(-1);
+
+    enum
+    {
+        PRIMITIVE       = true,
+        NULL_TYPE       = false
+    };
+
+    /// Returns \p key unchanged
+    static __device__ __forceinline__ UnsignedBits TwiddleIn(UnsignedBits key)
+    {
+        return key;
+    }
+
+    /// Returns \p key unchanged
+    static __device__ __forceinline__ UnsignedBits TwiddleOut(UnsignedBits key)
+    {
+        return key;
+    }
+};
+
+
+/**
+ * Basic type traits (signed primitive specialization)
+ *
+ * HIGH_BIT is the most significant bit of \p UnsignedBits.  Both twiddle
+ * functions toggle that bit, so applying one after the other restores the
+ * original key.
+ */
+template <typename UnsignedBitsT>
+struct BaseTraits<SIGNED_INTEGER, true, false, UnsignedBitsT>
+{
+    typedef UnsignedBitsT       UnsignedBits;
+
+    static const Category       CATEGORY    = SIGNED_INTEGER;
+    static const UnsignedBits   HIGH_BIT    = UnsignedBits(1) << ((sizeof(UnsignedBits) * 8) - 1);
+    static const UnsignedBits   MIN_KEY     = HIGH_BIT;
+    static const UnsignedBits   MAX_KEY     = UnsignedBits(-1) ^ HIGH_BIT;
+
+    enum
+    {
+        PRIMITIVE       = true,
+        NULL_TYPE       = false
+    };
+
+    /// Toggles the high bit of \p key
+    static __device__ __forceinline__ UnsignedBits TwiddleIn(UnsignedBits key)
+    {
+        return key ^ HIGH_BIT;
+    }
+
+    /// Toggles the high bit of \p key
+    static __device__ __forceinline__ UnsignedBits TwiddleOut(UnsignedBits key)
+    {
+        return key ^ HIGH_BIT;
+    }
+};
+
+
+/**
+ * Basic type traits (fp primitive specialization)
+ *
+ * HIGH_BIT is the most significant bit of \p UnsignedBits.  TwiddleIn flips
+ * every bit of a key whose high bit is set and only the high bit otherwise;
+ * TwiddleOut applies the inverse mapping.
+ */
+template <typename UnsignedBitsT>
+struct BaseTraits<FLOATING_POINT, true, false, UnsignedBitsT>
+{
+    typedef UnsignedBitsT       UnsignedBits;
+
+    static const Category       CATEGORY    = FLOATING_POINT;
+    static const UnsignedBits   HIGH_BIT    = UnsignedBits(1) << ((sizeof(UnsignedBits) * 8) - 1);
+    static const UnsignedBits   MIN_KEY     = UnsignedBits(-1);
+    static const UnsignedBits   MAX_KEY     = UnsignedBits(-1) ^ HIGH_BIT;
+
+    enum
+    {
+        PRIMITIVE       = true,
+        NULL_TYPE       = false
+    };
+
+    /// High bit set: invert all bits.  High bit clear: set the high bit.
+    static __device__ __forceinline__ UnsignedBits TwiddleIn(UnsignedBits key)
+    {
+        UnsignedBits flip = HIGH_BIT;
+        if (key & HIGH_BIT)
+            flip = UnsignedBits(-1);
+        return key ^ flip;
+    }
+
+    /// High bit set: clear the high bit.  High bit clear: invert all bits.
+    static __device__ __forceinline__ UnsignedBits TwiddleOut(UnsignedBits key)
+    {
+        UnsignedBits flip = UnsignedBits(-1);
+        if (key & HIGH_BIT)
+            flip = HIGH_BIT;
+        return key ^ flip;
+    }
+};
+
+#endif // DOXYGEN_SHOULD_SKIP_THIS
+
+
+/**
+ * \brief Numeric type traits
+ *
+ * Maps a type to the BaseTraits instantiation describing it.  Types without
+ * a specialization below are categorized as NOT_A_NUMBER and non-primitive.
+ */
+template <typename T> struct NumericTraits :            BaseTraits<NOT_A_NUMBER, false, false, T> {};
+
+#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
+
+// NullType: not a number, but flagged as the null type
+template <> struct NumericTraits<NullType> :            BaseTraits<NOT_A_NUMBER, false, true, NullType> {};
+
+// Signed integers (plain char follows the platform's signedness), each paired
+// with the unsigned type used as its bit representation
+template <> struct NumericTraits<char> :                BaseTraits<(std::numeric_limits<char>::is_signed) ? SIGNED_INTEGER : UNSIGNED_INTEGER, true, false, unsigned char> {};
+template <> struct NumericTraits<signed char> :         BaseTraits<SIGNED_INTEGER, true, false, unsigned char> {};
+template <> struct NumericTraits<short> :               BaseTraits<SIGNED_INTEGER, true, false, unsigned short> {};
+template <> struct NumericTraits<int> :                 BaseTraits<SIGNED_INTEGER, true, false, unsigned int> {};
+template <> struct NumericTraits<long> :                BaseTraits<SIGNED_INTEGER, true, false, unsigned long> {};
+template <> struct NumericTraits<long long> :           BaseTraits<SIGNED_INTEGER, true, false, unsigned long long> {};
+
+// Unsigned integers are their own bit representation
+template <> struct NumericTraits<unsigned char> :       BaseTraits<UNSIGNED_INTEGER, true, false, unsigned char> {};
+template <> struct NumericTraits<unsigned short> :      BaseTraits<UNSIGNED_INTEGER, true, false, unsigned short> {};
+template <> struct NumericTraits<unsigned int> :        BaseTraits<UNSIGNED_INTEGER, true, false, unsigned int> {};
+template <> struct NumericTraits<unsigned long> :       BaseTraits<UNSIGNED_INTEGER, true, false, unsigned long> {};
+template <> struct NumericTraits<unsigned long long> :  BaseTraits<UNSIGNED_INTEGER, true, false, unsigned long long> {};
+
+// Floating point: float uses unsigned int bits, double uses unsigned long long bits
+template <> struct NumericTraits<float> :               BaseTraits<FLOATING_POINT, true, false, unsigned int> {};
+template <> struct NumericTraits<double> :              BaseTraits<FLOATING_POINT, true, false, unsigned long long> {};
+
+#endif // DOXYGEN_SHOULD_SKIP_THIS
+
+
+/**
+ * \brief Type traits
+ *
+ * Equivalent to NumericTraits of \p T after RemoveQualifiers has been
+ * applied, so qualified types report the same traits as the unqualified
+ * type.
+ */
+template <typename T>
+struct Traits : NumericTraits<typename RemoveQualifiers<T>::Type> {};
+
+
+
+/** @} */       // end group UtilModule
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
--- a/lib/kokkos/TPL/cub/util_vector.cuh
+++ b/lib/kokkos/TPL/cub/util_vector.cuh
@ -0,0 +1,166 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2013, NVIDIA CORPORATION.  All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * Vector type inference utilities
+ */
+
+#pragma once
+
+#include <iostream>
+
+#include "util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/**
+ * \addtogroup UtilModule
+ * @{
+ */
+
+
+/******************************************************************************
+ * Vector type inference utilities.  For example:
+ *
+ * typename VectorHelper<unsigned int, 2>::Type    // Aliases uint2
+ *
+ ******************************************************************************/
+
+/**
+ * \brief Exposes a member typedef \p Type that names the corresponding CUDA vector type if one exists.  Otherwise \p Type refers to the VectorHelper structure itself, which will wrap the corresponding \p x, \p y, etc. vector fields.
+ *
+ * The primary template is only declared here; definitions are supplied by
+ * the specializations that follow, for \p vec_elements of 1, 2, 3 and 4.
+ */
+template <typename T, int vec_elements> struct VectorHelper;
+
+#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
+
+enum
+{
+    /// The maximum number of elements in CUDA vector types (VectorHelper is specialized for 1 through 4 elements)
+    MAX_VEC_ELEMENTS = 4,
+};
+
+
+/**
+ * Fallback one-component vector, used for element types that have no
+ * explicit (built-in) VectorHelper specialization.  \p Type names this
+ * wrapper itself.
+ */
+template <typename T>
+struct VectorHelper<T, 1>
+{
+    /// The vector type is this wrapper
+    typedef VectorHelper<T, 1> Type;
+
+    /// Not backed by a CUDA built-in vector type
+    enum { BUILT_IN = false };
+
+    /// Vector component
+    T x;
+};
+
+/**
+ * Fallback two-component vector, used for element types that have no
+ * explicit (built-in) VectorHelper specialization.  \p Type names this
+ * wrapper itself.
+ */
+template <typename T>
+struct VectorHelper<T, 2>
+{
+    /// The vector type is this wrapper
+    typedef VectorHelper<T, 2> Type;
+
+    /// Not backed by a CUDA built-in vector type
+    enum { BUILT_IN = false };
+
+    /// Vector components, in declaration order x, y
+    T x, y;
+};
+
+/**
+ * Fallback three-component vector, used for element types that have no
+ * explicit (built-in) VectorHelper specialization.  \p Type names this
+ * wrapper itself.
+ */
+template <typename T>
+struct VectorHelper<T, 3>
+{
+    /// The vector type is this wrapper
+    typedef VectorHelper<T, 3> Type;
+
+    /// Not backed by a CUDA built-in vector type
+    enum { BUILT_IN = false };
+
+    /// Vector components, in declaration order x, y, z
+    T x, y, z;
+};
+
+/**
+ * Fallback four-component vector, used for element types that have no
+ * explicit (built-in) VectorHelper specialization.  \p Type names this
+ * wrapper itself.
+ */
+template <typename T>
+struct VectorHelper<T, 4>
+{
+    /// The vector type is this wrapper
+    typedef VectorHelper<T, 4> Type;
+
+    /// Not backed by a CUDA built-in vector type
+    enum { BUILT_IN = false };
+
+    /// Vector components, in declaration order x, y, z, w
+    T x, y, z, w;
+};
+
+/**
+ * Macro emitting the explicit (fully-specialized) VectorHelper
+ * specializations for one built-in element type.
+ *
+ * For N = 1..4 it defines VectorHelper<base_type, N> with \p Type aliasing
+ * the vector type named short_type##N (e.g. short_type "uint" gives uint1
+ * through uint4) and BUILT_IN set to true.  Unlike the generic wrappers,
+ * these specializations declare no x/y/z/w members of their own.
+ */
+#define CUB_DEFINE_VECTOR_TYPE(base_type,short_type)                                                            \
+  template<> struct VectorHelper<base_type, 1> { typedef short_type##1 Type; enum { BUILT_IN = true }; };         \
+  template<> struct VectorHelper<base_type, 2> { typedef short_type##2 Type; enum { BUILT_IN = true }; };         \
+  template<> struct VectorHelper<base_type, 3> { typedef short_type##3 Type; enum { BUILT_IN = true }; };         \
+  template<> struct VectorHelper<base_type, 4> { typedef short_type##4 Type; enum { BUILT_IN = true }; };
+
+// Expand CUDA vector types for built-in primitives
+CUB_DEFINE_VECTOR_TYPE(char,               char)
+CUB_DEFINE_VECTOR_TYPE(signed char,        char)        // shares char1..char4 with plain char
+CUB_DEFINE_VECTOR_TYPE(short,              short)
+CUB_DEFINE_VECTOR_TYPE(int,                int)
+CUB_DEFINE_VECTOR_TYPE(long,               long)
+CUB_DEFINE_VECTOR_TYPE(long long,          longlong)
+CUB_DEFINE_VECTOR_TYPE(unsigned char,      uchar)
+CUB_DEFINE_VECTOR_TYPE(unsigned short,     ushort)
+CUB_DEFINE_VECTOR_TYPE(unsigned int,       uint)
+CUB_DEFINE_VECTOR_TYPE(unsigned long,      ulong)
+CUB_DEFINE_VECTOR_TYPE(unsigned long long, ulonglong)
+CUB_DEFINE_VECTOR_TYPE(float,              float)
+CUB_DEFINE_VECTOR_TYPE(double,             double)
+CUB_DEFINE_VECTOR_TYPE(bool,               uchar)       // bool is mapped onto uchar1..uchar4
+
+// Undefine macros
+#undef CUB_DEFINE_VECTOR_TYPE
+
+#endif // DOXYGEN_SHOULD_SKIP_THIS
+
+
+/** @} */       // end group UtilModule
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
--- a/lib/kokkos/TPL/cub/warp/specializations/warp_reduce_shfl.cuh
+++ b/lib/kokkos/TPL/cub/warp/specializations/warp_reduce_shfl.cuh
@ -0,0 +1,358 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2013, NVIDIA CORPORATION.  All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::WarpReduceShfl provides SHFL-based variants of parallel reduction across CUDA warps.
+ */
+
+#pragma once
+
+#include "../../thread/thread_operators.cuh"
+#include "../../util_ptx.cuh"
+#include "../../util_type.cuh"
+#include "../../util_macro.cuh"
+#include "../../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/**
+ * \brief WarpReduceShfl provides SHFL-based variants of parallel reduction across CUDA warps.
+ *
+ * Lanes exchange partial results with the PTX \p shfl.down.b32 instruction
+ * (issued through inline assembly), so no shared memory is used and
+ * \p TempStorage is simply \p NullType.
+ *
+ * NOTE(review): the shuffle and \p __ballot forms used here are the legacy
+ * mask-less ones; Volta and later architectures require the \p *_sync
+ * variants -- confirm the set of targeted architectures.
+ */
+template <
+    typename    T,                      ///< Data type being reduced
+    int         LOGICAL_WARPS,          ///< Number of logical warps entrant
+    int         LOGICAL_WARP_THREADS>   ///< Number of threads per logical warp
+struct WarpReduceShfl
+{
+    /******************************************************************************
+     * Constants and typedefs
+     ******************************************************************************/
+
+    enum
+    {
+        /// The number of warp reduction steps
+        STEPS = Log2<LOGICAL_WARP_THREADS>::VALUE,
+
+        // The 5-bit SHFL mask for logically splitting warps into sub-segments.
+        // Built from an unsigned all-ones pattern because left-shifting the
+        // negative constant -1 is undefined in a constant expression; the
+        // resulting value is the same.
+        SHFL_MASK = (int) ((~0u << STEPS) & 31u),
+
+        // The 5-bit SHFL clamp
+        SHFL_CLAMP = LOGICAL_WARP_THREADS - 1,
+
+        // The packed C argument (mask starts 8 bits up)
+        SHFL_C = (SHFL_MASK << 8) | SHFL_CLAMP,
+    };
+
+
+    /// Shared memory storage layout type (nothing is stored: shuffles need no shared memory)
+    typedef NullType TempStorage;
+
+
+    /******************************************************************************
+     * Thread fields
+     ******************************************************************************/
+
+    int     warp_id;    ///< Logical warp index supplied by the caller
+    int     lane_id;    ///< Lane index within the logical warp supplied by the caller
+
+
+    /******************************************************************************
+     * Construction
+     ******************************************************************************/
+
+    /// Constructor.  \p temp_storage is accepted for interface parity but is not used.
+    __device__ __forceinline__ WarpReduceShfl(
+        TempStorage &temp_storage,
+        int warp_id,
+        int lane_id)
+    :
+        warp_id(warp_id),
+        lane_id(lane_id)
+    {}
+
+
+    /******************************************************************************
+     * Operation
+     ******************************************************************************/
+
+    /**
+     * Summation (single-SHFL): integer addition performed on a 32-bit word,
+     * with the shuffle and the guarded add fused into one asm block per step.
+     * The accumulated word is converted back to \p T on return.
+     */
+    template <
+        bool                FULL_WARPS,             ///< Whether all lanes in each warp are contributing a valid fold of items
+        int                 FOLDED_ITEMS_PER_LANE>  ///< Number of items folded into each lane
+    __device__ __forceinline__ T Sum(
+        T                   input,                  ///< [in] Calling thread's input
+        int                 folded_items_per_warp,  ///< [in] Total number of valid items folded into each logical warp
+        Int2Type<true>      single_shfl)            ///< [in] Marker type indicating whether only one SHFL instruction is required
+    {
+        // Widen by value instead of reinterpreting the parameter's storage:
+        // T may be narrower than 32 bits, in which case a 32-bit read through
+        // a reinterpret_cast would run past the object.  Modular 32-bit
+        // addition followed by the narrowing conversion on return yields the
+        // same low-order bits.
+        unsigned int output = static_cast<unsigned int>(input);
+
+        // Iterate reduction steps
+        #pragma unroll
+        for (int STEP = 0; STEP < STEPS; STEP++)
+        {
+            const int OFFSET = 1 << STEP;
+
+            // The asm blocks are volatile: a shuffle reads another lane's
+            // register, so it is not a pure function of this thread's operands
+            // and must not be merged, hoisted or dropped by the compiler.
+            if (FULL_WARPS)
+            {
+                // Use predicate set from SHFL to guard against invalid peers
+                asm volatile(
+                    "{"
+                    "  .reg .u32 r0;"
+                    "  .reg .pred p;"
+                    "  shfl.down.b32 r0|p, %1, %2, %3;"
+                    "  @p add.u32 r0, r0, %4;"
+                    "  mov.u32 %0, r0;"
+                    "}"
+                    : "=r"(output) : "r"(output), "r"(OFFSET), "r"(SHFL_C), "r"(output));
+            }
+            else
+            {
+                // Set range predicate to guard against invalid peers
+                asm volatile(
+                    "{"
+                    "  .reg .u32 r0;"
+                    "  .reg .pred p;"
+                    "  shfl.down.b32 r0, %1, %2, %3;"
+                    "  setp.lt.u32 p, %5, %6;"
+                    "  mov.u32 %0, %1;"
+                    "  @p add.u32 %0, %1, r0;"
+                    "}"
+                    : "=r"(output) : "r"(output), "r"(OFFSET), "r"(SHFL_C), "r"(output), "r"((lane_id + OFFSET) * FOLDED_ITEMS_PER_LANE), "r"(folded_items_per_warp));
+            }
+        }
+
+        return output;
+    }
+
+
+    /// Summation (multi-SHFL): forwards to the generic Reduce with cub::Sum
+    template <
+        bool                FULL_WARPS,             ///< Whether all lanes in each warp are contributing a valid fold of items
+        int                 FOLDED_ITEMS_PER_LANE>  ///< Number of items folded into each lane
+    __device__ __forceinline__ T Sum(
+        T                   input,              ///< [in] Calling thread's input
+        int                 folded_items_per_warp,        ///< [in] Total number of valid items folded into each logical warp
+        Int2Type<false>     single_shfl)        ///< [in] Marker type indicating whether only one SHFL instruction is required
+    {
+        // Delegate to generic reduce
+        return Reduce<FULL_WARPS, FOLDED_ITEMS_PER_LANE>(input, folded_items_per_warp, cub::Sum());
+    }
+
+
+    /**
+     * Summation (float): same structure as the single-SHFL integer path, but
+     * using f32 registers and add.f32.
+     */
+    template <
+        bool                FULL_WARPS,             ///< Whether all lanes in each warp are contributing a valid fold of items
+        int                 FOLDED_ITEMS_PER_LANE>  ///< Number of items folded into each lane
+    __device__ __forceinline__ float Sum(
+        float               input,              ///< [in] Calling thread's input
+        int                 folded_items_per_warp)        ///< [in] Total number of valid items folded into each logical warp
+    {
+        // The accumulator is bound to an "=f" operand and returned as float,
+        // so it is declared as float rather than as the class parameter T.
+        float output = input;
+
+        // Iterate reduction steps
+        #pragma unroll
+        for (int STEP = 0; STEP < STEPS; STEP++)
+        {
+            const int OFFSET = 1 << STEP;
+
+            if (FULL_WARPS)
+            {
+                // Use predicate set from SHFL to guard against invalid peers
+                asm volatile(
+                    "{"
+                    "  .reg .f32 r0;"
+                    "  .reg .pred p;"
+                    "  shfl.down.b32 r0|p, %1, %2, %3;"
+                    "  @p add.f32 r0, r0, %4;"
+                    "  mov.f32 %0, r0;"
+                    "}"
+                    : "=f"(output) : "f"(output), "r"(OFFSET), "r"(SHFL_C), "f"(output));
+            }
+            else
+            {
+                // Set range predicate to guard against invalid peers
+                asm volatile(
+                    "{"
+                    "  .reg .f32 r0;"
+                    "  .reg .pred p;"
+                    "  shfl.down.b32 r0, %1, %2, %3;"
+                    "  setp.lt.u32 p, %5, %6;"
+                    "  mov.f32 %0, %1;"
+                    "  @p add.f32 %0, %0, r0;"
+                    "}"
+                    : "=f"(output) : "f"(output), "r"(OFFSET), "r"(SHFL_C), "f"(output), "r"((lane_id + OFFSET) * FOLDED_ITEMS_PER_LANE), "r"(folded_items_per_warp));
+            }
+        }
+
+        return output;
+    }
+
+    /**
+     * Summation (generic): picks the single-SHFL path when \p _T is a
+     * primitive type no wider than one 32-bit word, and the multi-SHFL path
+     * otherwise.
+     */
+    template <
+        bool                FULL_WARPS,             ///< Whether all lanes in each warp are contributing a valid fold of items
+        int                 FOLDED_ITEMS_PER_LANE,  ///< Number of items folded into each lane
+        typename            _T>
+    __device__ __forceinline__ _T Sum(
+        _T                  input,                  ///< [in] Calling thread's input
+        int                 folded_items_per_warp)  ///< [in] Total number of valid items folded into each logical warp
+    {
+        // Whether sharing can be done with a single SHFL instruction (vs multiple SHFL instructions)
+        Int2Type<(Traits<_T>::PRIMITIVE) && (sizeof(_T) <= sizeof(unsigned int))> single_shfl;
+
+        return Sum<FULL_WARPS, FOLDED_ITEMS_PER_LANE>(input, folded_items_per_warp, single_shfl);
+    }
+
+
+    /**
+     * Reduction with an arbitrary binary operator.  Each step shuffles the
+     * running value down by a power-of-two offset, one ShuffleWord at a time,
+     * and applies \p reduction_op only when the peer lane is in range.
+     */
+    template <
+        bool            FULL_WARPS,             ///< Whether all lanes in each warp are contributing a valid fold of items
+        int             FOLDED_ITEMS_PER_LANE,  ///< Number of items folded into each lane
+        typename        ReductionOp>
+    __device__ __forceinline__ T Reduce(
+        T               input,                  ///< [in] Calling thread's input
+        int             folded_items_per_warp,  ///< [in] Total number of valid items folded into each logical warp
+        ReductionOp     reduction_op)           ///< [in] Binary reduction operator
+    {
+        typedef typename WordAlignment<T>::ShuffleWord ShuffleWord;
+
+        // Number of ShuffleWord-sized pieces needed to cover one T (rounded up)
+        const int       WORDS           = (sizeof(T) + sizeof(ShuffleWord) - 1) / sizeof(ShuffleWord);
+        T               output          = input;
+        T               temp;
+        ShuffleWord     *temp_alias     = reinterpret_cast<ShuffleWord *>(&temp);
+        ShuffleWord     *output_alias   = reinterpret_cast<ShuffleWord *>(&output);
+
+        // Iterate reduction steps
+        #pragma unroll
+        for (int STEP = 0; STEP < STEPS; STEP++)
+        {
+            // Grab addend from peer
+            const int OFFSET = 1 << STEP;
+
+            #pragma unroll
+            for (int WORD = 0; WORD < WORDS; ++WORD)
+            {
+                unsigned int shuffle_word = output_alias[WORD];
+
+                // volatile: the result depends on the peer lane's register,
+                // not only on this thread's operands
+                asm volatile(
+                    "  shfl.down.b32 %0, %1, %2, %3;"
+                    : "=r"(shuffle_word) : "r"(shuffle_word), "r"(OFFSET), "r"(SHFL_C));
+                temp_alias[WORD] = (ShuffleWord) shuffle_word;
+            }
+
+            // Perform reduction op if from a valid peer
+            if (FULL_WARPS)
+            {
+                if (lane_id < LOGICAL_WARP_THREADS - OFFSET)
+                    output = reduction_op(output, temp);
+            }
+            else
+            {
+                if (((lane_id + OFFSET) * FOLDED_ITEMS_PER_LANE) < folded_items_per_warp)
+                    output = reduction_op(output, temp);
+            }
+        }
+
+        return output;
+    }
+
+
+    /**
+     * Segmented reduction.  A warp ballot of \p flag locates, for every lane,
+     * the start of the next segment; peers at or beyond that position are
+     * not folded in.
+     */
+    template <
+        bool            HEAD_SEGMENTED,     ///< Whether flags indicate a segment-head or a segment-tail
+        typename        Flag,
+        typename        ReductionOp>
+    __device__ __forceinline__ T SegmentedReduce(
+        T               input,              ///< [in] Calling thread's input
+        Flag            flag,               ///< [in] Whether or not the current lane is a segment head/tail
+        ReductionOp     reduction_op)       ///< [in] Binary reduction operator
+    {
+        typedef typename WordAlignment<T>::ShuffleWord ShuffleWord;
+
+        T output = input;
+
+        // Number of ShuffleWord-sized pieces needed to cover one T (rounded up)
+        const int       WORDS           = (sizeof(T) + sizeof(ShuffleWord) - 1) / sizeof(ShuffleWord);
+        T               temp;
+        ShuffleWord     *temp_alias     = reinterpret_cast<ShuffleWord *>(&temp);
+        ShuffleWord     *output_alias   = reinterpret_cast<ShuffleWord *>(&output);
+
+        // Get the start flags for each thread in the warp.  Kept unsigned so
+        // the shifts below are well-defined when bit 31 is set.
+        unsigned int warp_flags = __ballot(flag);
+
+        // A tail flag on lane i means the next segment starts at lane i + 1
+        if (!HEAD_SEGMENTED)
+            warp_flags <<= 1;
+
+        // Keep bits above the current thread.
+        warp_flags &= LaneMaskGt();
+
+        // Accommodate packing of multiple logical warps in a single physical warp
+        if ((LOGICAL_WARPS > 1) && (LOGICAL_WARP_THREADS < 32))
+            warp_flags >>= (warp_id * LOGICAL_WARP_THREADS);
+
+        // Find next flag (index of the lowest set bit)
+        int next_flag = __clz(__brev(warp_flags));
+
+        // Clip the next segment at the warp boundary if necessary
+        if (LOGICAL_WARP_THREADS != 32)
+            next_flag = CUB_MIN(next_flag, LOGICAL_WARP_THREADS);
+
+        // Iterate reduction steps
+        #pragma unroll
+        for (int STEP = 0; STEP < STEPS; STEP++)
+        {
+            const int OFFSET = 1 << STEP;
+
+            // Grab addend from peer
+            #pragma unroll
+            for (int WORD = 0; WORD < WORDS; ++WORD)
+            {
+                unsigned int shuffle_word = output_alias[WORD];
+
+                // volatile: the result depends on the peer lane's register,
+                // not only on this thread's operands
+                asm volatile(
+                    "  shfl.down.b32 %0, %1, %2, %3;"
+                    : "=r"(shuffle_word) : "r"(shuffle_word), "r"(OFFSET), "r"(SHFL_C));
+                temp_alias[WORD] = (ShuffleWord) shuffle_word;
+
+            }
+
+            // Perform reduction op if the peer lies before the next segment
+            if (OFFSET < next_flag - lane_id)
+                output = reduction_op(output, temp);
+        }
+
+        return output;
+    }
+};
+
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
--- a/lib/kokkos/TPL/cub/warp/specializations/warp_reduce_smem.cuh
+++ b/lib/kokkos/TPL/cub/warp/specializations/warp_reduce_smem.cuh
@ -0,0 +1,291 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2013, NVIDIA CORPORATION.  All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::WarpReduceSmem provides smem-based variants of parallel reduction across CUDA warps.
+ */
+
+#pragma once
+
+#include "../../thread/thread_operators.cuh"
+#include "../../thread/thread_load.cuh"
+#include "../../thread/thread_store.cuh"
+#include "../../util_type.cuh"
+#include "../../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+/**
+ * \brief WarpReduceSmem provides smem-based variants of parallel reduction across CUDA warps.
+ *
+ * Each step publishes the running value into a per-warp row of the storage
+ * array with a volatile store and reads the peer's value back with a
+ * volatile load.
+ *
+ * NOTE(review): there is no barrier between the store and the peer load, so
+ * this presumably relies on the lanes of a warp executing in lockstep --
+ * confirm for the targeted architectures.
+ */
+template <
+    typename    T,                      ///< Data type being reduced
+    int         LOGICAL_WARPS,          ///< Number of logical warps entrant
+    int         LOGICAL_WARP_THREADS>   ///< Number of threads per logical warp
+struct WarpReduceSmem
+{
+    /******************************************************************************
+     * Constants and typedefs
+     ******************************************************************************/
+
+    enum
+    {
+        /// Whether the logical warp size is a power-of-two
+        POW_OF_TWO = ((LOGICAL_WARP_THREADS & (LOGICAL_WARP_THREADS - 1)) == 0),
+
+        /// The number of warp reduction steps
+        STEPS = Log2<LOGICAL_WARP_THREADS>::VALUE,
+
+        /// The number of threads in half a warp.  Written as a shift up then
+        /// down so that STEPS == 0 yields 0 instead of the undefined 1 << -1;
+        /// for STEPS >= 1 the value equals 1 << (STEPS - 1).
+        HALF_WARP_THREADS = (1 << STEPS) >> 1,
+
+        /// The number of shared memory elements per warp.  The extra half
+        /// warp keeps index lane_id + OFFSET inside the row for every step,
+        /// given lane_id < LOGICAL_WARP_THREADS.
+        WARP_SMEM_ELEMENTS =  LOGICAL_WARP_THREADS + HALF_WARP_THREADS,
+    };
+
+    /// Shared memory flag type
+    typedef unsigned char SmemFlag;
+
+    /// Shared memory storage layout type (1.5 warps-worth of elements for each warp)
+    typedef T _TempStorage[LOGICAL_WARPS][WARP_SMEM_ELEMENTS];
+
+    // Alias wrapper allowing storage to be unioned
+    struct TempStorage : Uninitialized<_TempStorage> {};
+
+
+    /******************************************************************************
+     * Thread fields
+     ******************************************************************************/
+
+    _TempStorage     &temp_storage;     ///< Reference to the caller-provided storage
+    int             warp_id;            ///< Logical warp index; selects the storage row
+    int             lane_id;            ///< Lane index within the logical warp; selects the slot in the row
+
+
+    /******************************************************************************
+     * Construction
+     ******************************************************************************/
+
+    /// Constructor
+    __device__ __forceinline__ WarpReduceSmem(
+        TempStorage     &temp_storage,
+        int             warp_id,
+        int             lane_id)
+    :
+        temp_storage(temp_storage.Alias()),
+        warp_id(warp_id),
+        lane_id(lane_id)
+    {}
+
+
+    /******************************************************************************
+     * Operation
+     ******************************************************************************/
+
+    /**
+     * Reduction
+     *
+     * When FULL_WARPS and POW_OF_TWO both hold, the range check is skipped
+     * and every lane folds in the slot at lane_id + OFFSET, including slots
+     * in the padding region that this struct never writes.  NOTE(review):
+     * presumably only the lanes whose peers are all in range (lane 0 in
+     * particular) are expected to hold a meaningful result -- confirm
+     * against the callers.
+     */
+    template <
+        bool                FULL_WARPS,             ///< Whether all lanes in each warp are contributing a valid fold of items
+        int                 FOLDED_ITEMS_PER_LANE,  ///< Number of items folded into each lane
+        typename            ReductionOp>
+    __device__ __forceinline__ T Reduce(
+        T                   input,                  ///< [in] Calling thread's input
+        int                 folded_items_per_warp,  ///< [in] Total number of valid items folded into each logical warp
+        ReductionOp         reduction_op)           ///< [in] Reduction operator
+    {
+        for (int STEP = 0; STEP < STEPS; STEP++)
+        {
+            const int OFFSET = 1 << STEP;
+
+            // Share input through buffer
+            ThreadStore<STORE_VOLATILE>(&temp_storage[warp_id][lane_id], input);
+
+            // Update input if peer_addend is in range
+            if ((FULL_WARPS && POW_OF_TWO) || ((lane_id + OFFSET) * FOLDED_ITEMS_PER_LANE < folded_items_per_warp))
+            {
+                T peer_addend = ThreadLoad<LOAD_VOLATILE>(&temp_storage[warp_id][lane_id + OFFSET]);
+                input = reduction_op(input, peer_addend);
+            }
+        }
+
+        return input;
+    }
+
+
+    /**
+     * Segmented reduction
+     *
+     * Two implementations are selected at compile time on CUB_PTX_ARCH: a
+     * ballot-based one (>= 200) that computes the position of the next
+     * segment up front, and a fallback that propagates flag status through
+     * the same storage row as the data.
+     */
+    template <
+        bool            HEAD_SEGMENTED,     ///< Whether flags indicate a segment-head or a segment-tail
+        typename        Flag,
+        typename        ReductionOp>
+    __device__ __forceinline__ T SegmentedReduce(
+        T               input,              ///< [in] Calling thread's input
+        Flag            flag,               ///< [in] Whether or not the current lane is a segment head/tail
+        ReductionOp     reduction_op)       ///< [in] Reduction operator
+    {
+    #if CUB_PTX_ARCH >= 200
+
+        // Ballot-based segmented reduce
+
+        // Get the start flags for each thread in the warp.  Kept unsigned so
+        // the shifts below are well-defined when bit 31 is set.
+        unsigned int warp_flags = __ballot(flag);
+
+        // A tail flag on lane i means the next segment starts at lane i + 1
+        if (!HEAD_SEGMENTED)
+            warp_flags <<= 1;
+
+        // Keep bits above the current thread.
+        warp_flags &= LaneMaskGt();
+
+        // Accommodate packing of multiple logical warps in a single physical warp
+        if ((LOGICAL_WARPS > 1) && (LOGICAL_WARP_THREADS < 32))
+            warp_flags >>= (warp_id * LOGICAL_WARP_THREADS);
+
+        // Find next flag (index of the lowest set bit)
+        int next_flag = __clz(__brev(warp_flags));
+
+        // Clip the next segment at the warp boundary if necessary
+        if (LOGICAL_WARP_THREADS != 32)
+            next_flag = CUB_MIN(next_flag, LOGICAL_WARP_THREADS);
+
+        for (int STEP = 0; STEP < STEPS; STEP++)
+        {
+            const int OFFSET = 1 << STEP;
+
+            // Share input into buffer
+            ThreadStore<STORE_VOLATILE>(&temp_storage[warp_id][lane_id], input);
+
+            // Update input if peer_addend lies before the next segment
+            if (OFFSET < next_flag - lane_id)
+            {
+                T peer_addend = ThreadLoad<LOAD_VOLATILE>(&temp_storage[warp_id][lane_id + OFFSET]);
+                input = reduction_op(input, peer_addend);
+            }
+        }
+
+        return input;
+
+    #else
+
+        // Smem-based segmented reduce
+
+        enum
+        {
+            UNSET   = 0x0,  // Is initially unset
+            SET     = 0x1,  // Is initially set
+            SEEN    = 0x2,  // Has seen another head flag from a successor peer
+        };
+
+        // Alias flags onto shared data storage.  The flag bytes overlay the
+        // data slots, so within each step the data exchange must complete
+        // before the flags are written.
+        volatile SmemFlag *flag_storage = reinterpret_cast<SmemFlag*>(temp_storage[warp_id]);
+
+        SmemFlag flag_status = (flag) ? SET : UNSET;
+
+        for (int STEP = 0; STEP < STEPS; STEP++)
+        {
+            const int OFFSET = 1 << STEP;
+
+            // Share input through buffer
+            ThreadStore<STORE_VOLATILE>(&temp_storage[warp_id][lane_id], input);
+
+            // Get peer from buffer
+            T peer_addend = ThreadLoad<LOAD_VOLATILE>(&temp_storage[warp_id][lane_id + OFFSET]);
+
+            // Share flag through buffer
+            flag_storage[lane_id] = flag_status;
+
+            // Get peer flag from buffer
+            SmemFlag peer_flag_status = flag_storage[lane_id + OFFSET];
+
+            // Update input if peer was in range
+            if (lane_id < LOGICAL_WARP_THREADS - OFFSET)
+            {
+                if (HEAD_SEGMENTED)
+                {
+                    // Head-segmented
+                    if ((flag_status & SEEN) == 0)
+                    {
+                        // Has not seen a more distant head flag
+                        if (peer_flag_status & SET)
+                        {
+                            // Has now seen a head flag
+                            flag_status |= SEEN;
+                        }
+                        else
+                        {
+                            // Peer is not a head flag: grab its count
+                            input = reduction_op(input, peer_addend);
+                        }
+
+                        // Update seen status to include that of peer
+                        flag_status |= (peer_flag_status & SEEN);
+                    }
+                }
+                else
+                {
+                    // Tail-segmented.  Simply propagate flag status
+                    if (!flag_status)
+                    {
+                        input = reduction_op(input, peer_addend);
+                        flag_status |= peer_flag_status;
+                    }
+
+                }
+            }
+        }
+
+        return input;
+
+    #endif
+    }
+
+
+    /**
+     * Summation: Reduce specialized with cub::Sum
+     */
+    template <
+        bool            FULL_WARPS,             ///< Whether all lanes in each warp are contributing a valid fold of items
+        int             FOLDED_ITEMS_PER_LANE>  ///< Number of items folded into each lane
+    __device__ __forceinline__ T Sum(
+        T               input,                  ///< [in] Calling thread's input
+        int             folded_items_per_warp)  ///< [in] Total number of valid items folded into each logical warp
+    {
+        return Reduce<FULL_WARPS, FOLDED_ITEMS_PER_LANE>(input, folded_items_per_warp, cub::Sum());
+    }
+
+};
+
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
--- a/lib/kokkos/TPL/cub/warp/specializations/warp_scan_shfl.cuh
+++ b/lib/kokkos/TPL/cub/warp/specializations/warp_scan_shfl.cuh
@ -0,0 +1,371 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2013, NVIDIA CORPORATION.  All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::WarpScanShfl provides SHFL-based variants of parallel prefix scan across CUDA warps.
+ */
+
+#pragma once
+
+#include "../../thread/thread_operators.cuh"
+#include "../../util_type.cuh"
+#include "../../util_ptx.cuh"
+#include "../../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+/**
+ * \brief WarpScanShfl provides SHFL-based variants of parallel prefix scan across CUDA warps.
+ */
+template <
+    typename    T,                      ///< Data type being scanned
+    int         LOGICAL_WARPS,          ///< Number of logical warps entrant
+    int         LOGICAL_WARP_THREADS>   ///< Number of threads per logical warp
+struct WarpScanShfl
+{
+
+    /******************************************************************************
+     * Constants and typedefs
+     ******************************************************************************/
+
+    enum
+    {
+        /// The number of warp scan steps
+        STEPS = Log2<LOGICAL_WARP_THREADS>::VALUE,
+
+        /// SHFL "c" operand used by the scan steps.  The 5-bit mask that logically splits a
+        /// warp into sub-segments sits 8 bits up.  It is derived from an unsigned all-ones
+        /// word: left-shifting the signed value -1 is not a well-defined constant expression.
+        SHFL_C = ((0xFFFFFFFFu << STEPS) & 31) << 8,
+    };
+
+    /// Shared memory storage layout type (NullType: no storage is declared for this variant)
+    typedef NullType TempStorage;
+
+
+    /******************************************************************************
+     * Thread fields
+     ******************************************************************************/
+
+    int             warp_id;    ///< Warp index handed to the constructor
+    int             lane_id;    ///< Lane index handed to the constructor; the scans compare it against shuffle offsets
+
+    /******************************************************************************
+     * Construction
+     ******************************************************************************/
+
+    /// Constructor: records the caller's warp and lane indices (the storage argument is not used)
+    __device__ __forceinline__ WarpScanShfl(
+        TempStorage &temp_storage,      ///< [in] Ignored here; TempStorage is a typedef of NullType
+        int warp_id,                    ///< [in] Stored in the warp_id field
+        int lane_id)                    ///< [in] Stored in the lane_id field
+    :
+        warp_id(warp_id),
+        lane_id(lane_id)
+    {}
+
+
+    /******************************************************************************
+     * Operation
+     ******************************************************************************/
+
+    /// Broadcast: returns to every caller the value held by lane \p src_lane of its own logical warp
+    __device__ __forceinline__ T Broadcast(
+        T               input,              ///< [in] The value to broadcast
+        int             src_lane)           ///< [in] Which warp lane is to do the broadcasting (index within the logical warp)
+    {
+        typedef typename WordAlignment<T>::ShuffleWord ShuffleWord;
+
+        // Number of shuffle words needed to cover T (rounded up)
+        const int       WORDS           = (sizeof(T) + sizeof(ShuffleWord) - 1) / sizeof(ShuffleWord);
+
+        // SHFL "c" operand.  The low bits clamp the source to the last lane of a logical warp;
+        // the segment mask from SHFL_C keeps the lookup inside the caller's own logical warp.
+        // With the clamp alone, src_lane is an absolute lane of the physical warp, so a
+        // logical warp that does not start at physical lane 0 would receive a value that
+        // belongs to a different logical warp.
+        // NOTE(review): relies on Log2 rounding up for non-power-of-two widths, which the
+        // scan steps using SHFL_C already depend on -- confirm.
+        const int       SHFL_IDX_C      = SHFL_C | (LOGICAL_WARP_THREADS - 1);
+
+        T               output;
+        ShuffleWord     *output_alias   = reinterpret_cast<ShuffleWord *>(&output);
+        ShuffleWord     *input_alias    = reinterpret_cast<ShuffleWord *>(&input);
+
+        // Move the item one shuffle word at a time
+        #pragma unroll
+        for (int WORD = 0; WORD < WORDS; ++WORD)
+        {
+            unsigned int shuffle_word = input_alias[WORD];
+            asm("shfl.idx.b32 %0, %1, %2, %3;"
+                : "=r"(shuffle_word) : "r"(shuffle_word), "r"(src_lane), "r"(SHFL_IDX_C));
+            output_alias[WORD] = (ShuffleWord) shuffle_word;
+        }
+
+        return output;
+    }
+
+
+    //---------------------------------------------------------------------
+    // Inclusive operations
+    //---------------------------------------------------------------------
+
+    /// Inclusive prefix sum with aggregate (single-SHFL): the item is moved and added as one 32-bit word
+    __device__ __forceinline__ void InclusiveSum(
+        T               input,              ///< [in] Calling thread's input item.
+        T               &output,            ///< [out] Calling thread's output item.  May be aliased with \p input.
+        T               &warp_aggregate,    ///< [out] Warp-wide aggregate reduction of input items.
+        Int2Type<true>  single_shfl)        ///< [in] Marker type indicating that a single SHFL instruction suffices
+    {
+        unsigned int temp = reinterpret_cast<unsigned int &>(input);    // NOTE(review): reads a full 32-bit word from input; confirm this is intended when sizeof(T) < 4
+
+        // Iterate scan steps; step STEP shuffles the partial up by (1 << STEP) lanes
+        #pragma unroll
+        for (int STEP = 0; STEP < STEPS; STEP++)
+        {
+            // Predicate p is set by the SHFL; the caller's own partial (%4) is added to the shuffled word only when p is set
+            asm(
+                "{"
+                "  .reg .u32 r0;"
+                "  .reg .pred p;"
+                "  shfl.up.b32 r0|p, %1, %2, %3;"
+                "  @p add.u32 r0, r0, %4;"
+                "  mov.u32 %0, r0;"
+                "}"
+                : "=r"(temp) : "r"(temp), "r"(1 << STEP), "r"(SHFL_C), "r"(temp));
+        }
+
+        output = temp;      // unsigned int converted back to T
+
+        // Grab aggregate from last warp lane
+        warp_aggregate = Broadcast(output, LOGICAL_WARP_THREADS - 1);
+    }
+
+
+    /// Inclusive prefix sum with aggregate (multi-SHFL): handled by the generic inclusive scan
+    __device__ __forceinline__ void InclusiveSum(
+        T               input,              ///< [in] Item contributed by the calling thread
+        T               &output,            ///< [out] Receives the calling thread's inclusive sum.  May be aliased with \p input.
+        T               &warp_aggregate,    ///< [out] Receives the warp-wide total of the input items
+        Int2Type<false> single_shfl)        ///< [in] Marker selecting this overload (a single SHFL instruction is not sufficient)
+    {
+        // A prefix sum is an inclusive scan under the addition functor
+        Sum addition_op;
+
+        InclusiveScan(input, output, addition_op, warp_aggregate);
+    }
+
+
+    /// Inclusive prefix sum with aggregate (non-template overload taking float arguments; adds with add.f32)
+    __device__ __forceinline__ void InclusiveSum(
+        float           input,              ///< [in] Calling thread's input item.
+        float           &output,            ///< [out] Calling thread's output item.  May be aliased with \p input.
+        float           &warp_aggregate)    ///< [out] Warp-wide aggregate reduction of input items.
+    {
+        output = input;     // the running partial lives directly in output
+
+        // Iterate scan steps; step STEP shuffles the partial up by (1 << STEP) lanes
+        #pragma unroll
+        for (int STEP = 0; STEP < STEPS; STEP++)
+        {
+            // Predicate p is set by the SHFL; the caller's own partial (%4) is added to the shuffled value only when p is set
+            asm(
+                "{"
+                "  .reg .f32 r0;"
+                "  .reg .pred p;"
+                "  shfl.up.b32 r0|p, %1, %2, %3;"
+                "  @p add.f32 r0, r0, %4;"
+                "  mov.f32 %0, r0;"
+                "}"
+                : "=f"(output) : "f"(output), "r"(1 << STEP), "r"(SHFL_C), "f"(output));
+        }
+
+        // Grab aggregate from last warp lane (Broadcast takes and returns T)
+        warp_aggregate = Broadcast(output, LOGICAL_WARP_THREADS - 1);
+    }
+
+
+    /// Inclusive prefix sum with aggregate (non-template overload taking unsigned long long arguments; shuffles two 32-bit halves)
+    __device__ __forceinline__ void InclusiveSum(
+        unsigned long long  input,              ///< [in] Calling thread's input item.
+        unsigned long long  &output,            ///< [out] Calling thread's output item.  May be aliased with \p input.
+        unsigned long long  &warp_aggregate)    ///< [out] Warp-wide aggregate reduction of input items.
+    {
+        output = input;     // the running partial lives directly in output
+
+        // Iterate scan steps; step STEP shuffles the partial up by (1 << STEP) lanes
+        #pragma unroll
+        for (int STEP = 0; STEP < STEPS; STEP++)
+        {
+            // Predicate p is set by the SHFLs; the caller's own halves are added to the shuffled halves only when p is set
+            asm(
+                "{"
+                "  .reg .u32 r0;"
+                "  .reg .u32 r1;"
+                "  .reg .u32 lo;"
+                "  .reg .u32 hi;"
+                "  .reg .pred p;"
+                "  mov.b64 {lo, hi}, %1;"                   // split the 64-bit partial into two 32-bit halves
+                "  shfl.up.b32 r0|p, lo, %2, %3;"           // shuffle the low half
+                "  shfl.up.b32 r1|p, hi, %2, %3;"           // shuffle the high half (writes p again)
+                "  @p add.cc.u32 r0, r0, lo;"               // low halves, carry-out recorded
+                "  @p addc.u32 r1, r1, hi;"                 // high halves plus carry-in
+                "  mov.b64 %0, {r0, r1};"                   // reassemble the 64-bit result
+                "}"
+                : "=l"(output) : "l"(output), "r"(1 << STEP), "r"(SHFL_C));
+        }
+
+        // Grab aggregate from last warp lane (Broadcast takes and returns T)
+        warp_aggregate = Broadcast(output, LOGICAL_WARP_THREADS - 1);
+    }
+
+
+    /// Inclusive prefix sum with aggregate (generic dispatcher)
+    template <typename _T>
+    __device__ __forceinline__ void InclusiveSum(
+        _T               input,             ///< [in] Item contributed by the calling thread
+        _T               &output,           ///< [out] Receives the calling thread's inclusive sum.  May be aliased with \p input.
+        _T               &warp_aggregate)   ///< [out] Receives the warp-wide total of the input items
+    {
+        // A single SHFL instruction is enough only for primitive types no wider than
+        // unsigned int; everything else is routed to the multi-SHFL overload
+        typedef Int2Type<(Traits<_T>::PRIMITIVE) && (sizeof(_T) <= sizeof(unsigned int))> SingleShflTag;
+
+        InclusiveSum(input, output, warp_aggregate, SingleShflTag());
+    }
+
+
+    /// Inclusive prefix sum (the warp-wide total is computed but not returned)
+    __device__ __forceinline__ void InclusiveSum(
+        T               input,              ///< [in] Item contributed by the calling thread
+        T               &output)            ///< [out] Receives the calling thread's inclusive sum.  May be aliased with \p input.
+    {
+        // Scratch destination for the aggregate the three-argument form produces
+        T discarded_aggregate;
+
+        InclusiveSum(input, output, discarded_aggregate);
+    }
+
+
+    /// Inclusive scan with aggregate, built from ShuffleUp exchanges
+    template <typename ScanOp>
+    __device__ __forceinline__ void InclusiveScan(
+        T               input,              ///< [in] Item contributed by the calling thread
+        T               &output,            ///< [out] Receives the calling thread's inclusive result.  May be aliased with \p input.
+        ScanOp          scan_op,            ///< [in] Binary scan functor, applied as scan_op(peer, own)
+        T               &warp_aggregate)    ///< [out] Receives the result held by the last lane of the logical warp
+    {
+        // The running partial starts out as the caller's own item
+        output = input;
+
+        // The exchange distance doubles with every step
+        #pragma unroll
+        for (int STEP = 0; STEP < STEPS; STEP++)
+        {
+            const int   PEER_DISTANCE   = 1 << STEP;
+            T           peer_partial    = ShuffleUp(output, PEER_DISTANCE);
+
+            // Lanes closer than PEER_DISTANCE to lane 0 have no such peer and keep their partial
+            if (lane_id >= PEER_DISTANCE)
+            {
+                output = scan_op(peer_partial, output);
+            }
+        }
+
+        // The last lane of the logical warp holds the reduction of all items
+        T last_lane_partial = Broadcast(output, LOGICAL_WARP_THREADS - 1);
+        warp_aggregate      = last_lane_partial;
+    }
+
+
+    /// Inclusive scan (the warp-wide aggregate is computed but not returned)
+    template <typename ScanOp>
+    __device__ __forceinline__ void InclusiveScan(
+        T               input,              ///< [in] Item contributed by the calling thread
+        T               &output,            ///< [out] Receives the calling thread's inclusive result.  May be aliased with \p input.
+        ScanOp          scan_op)            ///< [in] Binary scan functor
+    {
+        // Scratch destination for the aggregate the four-argument form produces
+        T discarded_aggregate;
+
+        InclusiveScan(input, output, scan_op, discarded_aggregate);
+    }
+
+
+    //---------------------------------------------------------------------
+    // Exclusive operations
+    //---------------------------------------------------------------------
+
+    /// Exclusive scan with aggregate: inclusive scan shifted up by one lane, lane 0 receives the identity
+    template <typename ScanOp>
+    __device__ __forceinline__ void ExclusiveScan(
+        T               input,              ///< [in] Item contributed by the calling thread
+        T               &output,            ///< [out] Receives the calling thread's exclusive result.  May be aliased with \p input.
+        T               identity,           ///< [in] Value handed to lane 0
+        ScanOp          scan_op,            ///< [in] Binary scan functor
+        T               &warp_aggregate)    ///< [out] Receives the warp-wide aggregate of the input items
+    {
+        // Inclusive result first
+        T inclusive_partial;
+        InclusiveScan(input, inclusive_partial, scan_op, warp_aggregate);
+
+        // Every lane takes part in the shuffle; lane 0 is special-cased afterwards
+        T predecessor_partial = ShuffleUp(inclusive_partial, 1);
+
+        if (lane_id == 0)
+        {
+            output = identity;
+        }
+        else
+        {
+            output = predecessor_partial;
+        }
+    }
+
+
+    /// Exclusive scan with identity (the warp-wide aggregate is computed but not returned)
+    template <typename ScanOp>
+    __device__ __forceinline__ void ExclusiveScan(
+        T               input,              ///< [in] Item contributed by the calling thread
+        T               &output,            ///< [out] Receives the calling thread's exclusive result.  May be aliased with \p input.
+        T               identity,           ///< [in] Value handed to lane 0
+        ScanOp          scan_op)            ///< [in] Binary scan functor
+    {
+        // Scratch destination for the aggregate the five-argument form produces
+        T discarded_aggregate;
+
+        ExclusiveScan(input, output, identity, scan_op, discarded_aggregate);
+    }
+
+
+    /// Exclusive scan with aggregate, without identity: inclusive scan shifted up by one lane.
+    /// Lane 0 has no predecessor; its output is whatever ShuffleUp yields for it.
+    template <typename ScanOp>
+    __device__ __forceinline__ void ExclusiveScan(
+        T               input,              ///< [in] Item contributed by the calling thread
+        T               &output,            ///< [out] Receives the calling thread's exclusive result.  May be aliased with \p input.
+        ScanOp          scan_op,            ///< [in] Binary scan functor
+        T               &warp_aggregate)    ///< [out] Receives the warp-wide aggregate of the input items
+    {
+        // Inclusive result first
+        T inclusive_partial;
+        InclusiveScan(input, inclusive_partial, scan_op, warp_aggregate);
+
+        // Then take the inclusive result of the lane one below
+        T predecessor_partial = ShuffleUp(inclusive_partial, 1);
+        output = predecessor_partial;
+    }
+
+
+    /// Exclusive scan without identity (the warp-wide aggregate is computed but not returned)
+    template <typename ScanOp>
+    __device__ __forceinline__ void ExclusiveScan(
+        T               input,              ///< [in] Item contributed by the calling thread
+        T               &output,            ///< [out] Receives the calling thread's exclusive result.  May be aliased with \p input.
+        ScanOp          scan_op)            ///< [in] Binary scan functor
+    {
+        // Scratch destination for the aggregate the four-argument form produces
+        T discarded_aggregate;
+
+        ExclusiveScan(input, output, scan_op, discarded_aggregate);
+    }
+};
+
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
--- a/lib/kokkos/TPL/cub/warp/specializations/warp_scan_smem.cuh
+++ b/lib/kokkos/TPL/cub/warp/specializations/warp_scan_smem.cuh
@ -0,0 +1,327 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2013, NVIDIA CORPORATION.  All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::WarpScanSmem provides smem-based variants of parallel prefix scan across CUDA warps.
+ */
+
+#pragma once
+
+#include "../../thread/thread_operators.cuh"
+#include "../../thread/thread_load.cuh"
+#include "../../thread/thread_store.cuh"
+#include "../../util_type.cuh"
+#include "../../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+/**
+ * \brief WarpScanSmem provides smem-based variants of parallel prefix scan across CUDA warps.
+ */
+template <
+    typename    T,                      ///< Data type being scanned
+    int         LOGICAL_WARPS,          ///< Number of logical warps entrant
+    int         LOGICAL_WARP_THREADS>   ///< Number of threads per logical warp
+struct WarpScanSmem
+{
+    /******************************************************************************
+     * Constants and typedefs
+     ******************************************************************************/
+
+    enum
+    {
+        /// The number of warp scan steps (also the step value that ends the ScanStep recursion)
+        STEPS = Log2<LOGICAL_WARP_THREADS>::VALUE,
+
+        /// The number of threads in half a warp; per-lane partials are stored starting at this index of a warp's row
+        HALF_WARP_THREADS = 1 << (STEPS - 1),
+
+        /// The number of shared memory elements per warp: HALF_WARP_THREADS leading slots plus one slot per lane
+        WARP_SMEM_ELEMENTS =  LOGICAL_WARP_THREADS + HALF_WARP_THREADS,
+    };
+
+
+    /// Shared memory storage layout type (1.5 warps-worth of elements for each warp), one row per logical warp
+    typedef T _TempStorage[LOGICAL_WARPS][WARP_SMEM_ELEMENTS];
+
+    // Alias wrapper allowing storage to be unioned; the constructor unwraps it through Alias()
+    struct TempStorage : Uninitialized<_TempStorage> {};
+
+
+    /******************************************************************************
+     * Thread fields
+     ******************************************************************************/
+
+    _TempStorage     &temp_storage;     ///< Reference to the storage array obtained from TempStorage::Alias()
+    unsigned int    warp_id;            ///< Row of temp_storage used by the calling thread
+    unsigned int    lane_id;            ///< Caller's lane; offsets the slot it writes within the row
+
+
+    /******************************************************************************
+     * Construction
+     ******************************************************************************/
+
+    /// Constructor: binds the storage and records the caller's warp and lane indices
+    __device__ __forceinline__ WarpScanSmem(
+        TempStorage     &temp_storage,  ///< [in] Wrapped storage; unwrapped with Alias()
+        int             warp_id,        ///< [in] Stored (as unsigned int) in the warp_id field
+        int             lane_id)        ///< [in] Stored (as unsigned int) in the lane_id field
+    :
+        temp_storage(temp_storage.Alias()),
+        warp_id(warp_id),
+        lane_id(lane_id)
+    {}
+
+
+    /******************************************************************************
+     * Operation
+     ******************************************************************************/
+
+    /// Identity padding setup for operations that have an identity: every lane writes a
+    /// value-initialized T into slot [lane_id] of its warp's row
+    __device__ __forceinline__ void InitIdentity(Int2Type<true> has_identity)
+    {
+        T *padding_slot = &temp_storage[warp_id][lane_id];
+
+        ThreadStore<STORE_VOLATILE>(padding_slot, T());
+    }
+
+
+    /// Identity padding setup for operations without an identity: there is nothing to write
+    __device__ __forceinline__ void InitIdentity(Int2Type<false> has_identity)
+    {
+    }
+
+
+    /// Scan step recursion terminator: selected once the step marker equals STEPS, does nothing
+    template <
+        bool HAS_IDENTITY,
+        typename ScanOp>
+    __device__ __forceinline__ void ScanStep(
+        T               &partial,
+        ScanOp          scan_op,
+        Int2Type<STEPS>  step)
+    {
+    }
+
+
+    /// One scan step (template-unrolled): publish the partial, combine it with the partial
+    /// stored (1 << STEP) slots below, then recurse into step STEP + 1
+    template <
+        bool        HAS_IDENTITY,
+        int         STEP,
+        typename    ScanOp>
+    __device__ __forceinline__ void ScanStep(
+        T               &partial,
+        ScanOp          scan_op,
+        Int2Type<STEP>  step)
+    {
+        const int PEER_DISTANCE = 1 << STEP;
+
+        // Slot owned by this lane, located past the leading HALF_WARP_THREADS slots
+        T *own_slot = &temp_storage[warp_id][HALF_WARP_THREADS + lane_id];
+
+        // Publish the current partial before any peer slot is read
+        ThreadStore<STORE_VOLATILE>(own_slot, partial);
+
+        // With an identity every lane reads (low lanes land in the leading slots);
+        // otherwise only lanes that have a peer PEER_DISTANCE below take part
+        if (HAS_IDENTITY || (lane_id >= PEER_DISTANCE))
+        {
+            T peer_partial = ThreadLoad<LOAD_VOLATILE>(own_slot - PEER_DISTANCE);
+            partial = scan_op(peer_partial, partial);
+        }
+
+        ScanStep<HAS_IDENTITY>(partial, scan_op, Int2Type<STEP + 1>());
+    }
+
+
+    /// Broadcast: lane \p src_lane writes its value to the first slot of the warp's row and every caller reads it back
+    __device__ __forceinline__ T Broadcast(
+        T               input,              ///< [in] Value offered by the calling thread
+        unsigned int    src_lane)           ///< [in] Lane whose value is returned to all callers
+    {
+        // First element of this warp's row serves as the exchange slot
+        T *exchange_slot = temp_storage[warp_id];
+
+        if (lane_id == src_lane)
+        {
+            ThreadStore<STORE_VOLATILE>(exchange_slot, input);
+        }
+
+        return ThreadLoad<LOAD_VOLATILE>(exchange_slot);
+    }
+
+
+    /// Basic inclusive scan: runs all scan steps and optionally leaves the final partials in shared memory
+    template <
+        bool        HAS_IDENTITY,
+        bool        SHARE_FINAL,
+        typename    ScanOp>
+    __device__ __forceinline__ T BasicScan(
+        T               partial,            ///< Partial reduction contributed by the calling thread
+        ScanOp          scan_op)            ///< Binary associative scan functor
+    {
+        // Kick off the template-unrolled step sequence at step 0
+        Int2Type<0> first_step;
+        ScanStep<HAS_IDENTITY>(partial, scan_op, first_step);
+
+        if (!SHARE_FINAL)
+        {
+            return partial;
+        }
+
+        // Leave the final partial in this lane's slot for the caller to read back
+        ThreadStore<STORE_VOLATILE>(&temp_storage[warp_id][HALF_WARP_THREADS + lane_id], partial);
+
+        return partial;
+    }
+
+
+    /// Inclusive prefix sum
+    __device__ __forceinline__ void InclusiveSum(
+        T               input,              ///< [in] Item contributed by the calling thread
+        T               &output)            ///< [out] Receives the calling thread's inclusive sum.  May be aliased with \p input.
+    {
+        // Primitive types are treated as having an identity for addition
+        const bool IDENTITY_AVAILABLE = Traits<T>::PRIMITIVE;
+
+        // Fill the identity region (no-op when there is no identity)
+        Int2Type<IDENTITY_AVAILABLE> identity_tag;
+        InitIdentity(identity_tag);
+
+        // Inclusive warp scan under addition; final partials are not shared
+        Sum addition_op;
+        output = BasicScan<IDENTITY_AVAILABLE, false>(input, addition_op);
+    }
+
+
+    /// Inclusive prefix sum with aggregate
+    __device__ __forceinline__ void InclusiveSum(
+        T               input,              ///< [in] Item contributed by the calling thread
+        T               &output,            ///< [out] Receives the calling thread's inclusive sum.  May be aliased with \p input.
+        T               &warp_aggregate)    ///< [out] Receives the warp-wide total of the input items
+    {
+        // Primitive types are treated as having an identity for addition
+        const bool IDENTITY_AVAILABLE = Traits<T>::PRIMITIVE;
+
+        // Fill the identity region (no-op when there is no identity)
+        Int2Type<IDENTITY_AVAILABLE> identity_tag;
+        InitIdentity(identity_tag);
+
+        // Inclusive warp scan under addition; final partials are left in shared memory
+        Sum addition_op;
+        output = BasicScan<IDENTITY_AVAILABLE, true>(input, addition_op);
+
+        // Every caller reads the total from the last slot of the warp's row
+        T *total_slot   = &temp_storage[warp_id][WARP_SMEM_ELEMENTS - 1];
+        warp_aggregate  = ThreadLoad<LOAD_VOLATILE>(total_slot);
+    }
+
+
+    /// Inclusive scan under a caller-supplied functor
+    template <typename ScanOp>
+    __device__ __forceinline__ void InclusiveScan(
+        T               input,              ///< [in] Item contributed by the calling thread
+        T               &output,            ///< [out] Receives the calling thread's inclusive result.  May be aliased with \p input.
+        ScanOp          scan_op)            ///< [in] Binary scan functor
+    {
+        // No identity is assumed and the final partials are not shared
+        T inclusive_partial = BasicScan<false, false>(input, scan_op);
+
+        output = inclusive_partial;
+    }
+
+
+    /// Inclusive scan with aggregate under a caller-supplied functor
+    template <typename ScanOp>
+    __device__ __forceinline__ void InclusiveScan(
+        T               input,              ///< [in] Item contributed by the calling thread
+        T               &output,            ///< [out] Receives the calling thread's inclusive result.  May be aliased with \p input.
+        ScanOp          scan_op,            ///< [in] Binary scan functor
+        T               &warp_aggregate)    ///< [out] Receives the warp-wide aggregate of the input items
+    {
+        // No identity is assumed; final partials are left in shared memory
+        output = BasicScan<false, true>(input, scan_op);
+
+        // Every caller reads the aggregate from the last slot of the warp's row
+        T *aggregate_slot   = &temp_storage[warp_id][WARP_SMEM_ELEMENTS - 1];
+        warp_aggregate      = ThreadLoad<LOAD_VOLATILE>(aggregate_slot);
+    }
+
+    /// Exclusive scan with identity: each lane returns the inclusive partial stored one slot below its own
+    template <typename ScanOp>
+    __device__ __forceinline__ void ExclusiveScan(
+        T               input,              ///< [in] Item contributed by the calling thread
+        T               &output,            ///< [out] Receives the calling thread's exclusive result.  May be aliased with \p input.
+        T               identity,           ///< [in] Identity value written into the leading slots
+        ScanOp          scan_op)            ///< [in] Binary scan functor
+    {
+        // Seed slot [lane_id] of the warp's row with the identity
+        ThreadStore<STORE_VOLATILE>(&temp_storage[warp_id][lane_id], identity);
+
+        // Inclusive warp scan with identity; only the partials it leaves in shared memory are needed
+        BasicScan<true, true>(input, scan_op);
+
+        // The slot just below this lane's own holds the exclusive result
+        T *predecessor_slot = &temp_storage[warp_id][HALF_WARP_THREADS + lane_id - 1];
+        output = ThreadLoad<LOAD_VOLATILE>(predecessor_slot);
+    }
+
+
+    /// Exclusive scan with identity and aggregate
+    template <typename ScanOp>
+    __device__ __forceinline__ void ExclusiveScan(
+        T               input,              ///< [in] Item contributed by the calling thread
+        T               &output,            ///< [out] Receives the calling thread's exclusive result.  May be aliased with \p input.
+        T               identity,           ///< [in] Identity value
+        ScanOp          scan_op,            ///< [in] Binary scan functor
+        T               &warp_aggregate)    ///< [out] Receives the warp-wide aggregate of the input items
+    {
+        // The four-argument form leaves the final inclusive partials in shared memory
+        ExclusiveScan(input, output, identity, scan_op);
+
+        // Every caller reads the aggregate from the last slot of the warp's row
+        T *aggregate_slot   = &temp_storage[warp_id][WARP_SMEM_ELEMENTS - 1];
+        warp_aggregate      = ThreadLoad<LOAD_VOLATILE>(aggregate_slot);
+    }
+
+
+    /// Exclusive scan without identity: each lane returns the inclusive partial stored one slot below its own.
+    /// For lane 0 that slot lies below the partials and is not written by this method.
+    template <typename ScanOp>
+    __device__ __forceinline__ void ExclusiveScan(
+        T               input,              ///< [in] Item contributed by the calling thread
+        T               &output,            ///< [out] Receives the calling thread's exclusive result.  May be aliased with \p input.
+        ScanOp          scan_op)            ///< [in] Binary scan functor
+    {
+        // Inclusive warp scan without identity; only the partials it leaves in shared memory are needed
+        BasicScan<false, true>(input, scan_op);
+
+        // The slot just below this lane's own holds the exclusive result
+        T *predecessor_slot = &temp_storage[warp_id][HALF_WARP_THREADS + lane_id - 1];
+        output = ThreadLoad<LOAD_VOLATILE>(predecessor_slot);
+    }
+
+
+    /// Exclusive scan with aggregate, without identity
+    template <typename ScanOp>
+    __device__ __forceinline__ void ExclusiveScan(
+        T               input,              ///< [in] Item contributed by the calling thread
+        T               &output,            ///< [out] Receives the calling thread's exclusive result.  May be aliased with \p input.
+        ScanOp          scan_op,            ///< [in] Binary scan functor
+        T               &warp_aggregate)    ///< [out] Receives the warp-wide aggregate of the input items
+    {
+        // The three-argument form leaves the final inclusive partials in shared memory
+        ExclusiveScan(input, output, scan_op);
+
+        // Every caller reads the aggregate from the last slot of the warp's row
+        T *aggregate_slot   = &temp_storage[warp_id][WARP_SMEM_ELEMENTS - 1];
+        warp_aggregate      = ThreadLoad<LOAD_VOLATILE>(aggregate_slot);
+    }
+
+};
+
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
--- a/lib/kokkos/TPL/cub/warp/warp_reduce.cuh
+++ b/lib/kokkos/TPL/cub/warp/warp_reduce.cuh
@ -0,0 +1,677 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2013, NVIDIA CORPORATION.  All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * The cub::WarpReduce class provides [<em>collective</em>](index.html#sec0) methods for computing a parallel reduction of items partitioned across CUDA warp threads.
+ */
+
+#pragma once
+
+#include "specializations/warp_reduce_shfl.cuh"
+#include "specializations/warp_reduce_smem.cuh"
+#include "../thread/thread_operators.cuh"
+#include "../util_arch.cuh"
+#include "../util_type.cuh"
+#include "../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/**
+ * \addtogroup WarpModule
+ * @{
+ */
+
+/**
+ * \brief The WarpReduce class provides [<em>collective</em>](index.html#sec0) methods for computing a parallel reduction of items partitioned across CUDA warp threads. ![](warp_reduce_logo.png)
+ *
+ * \par Overview
+ * A <a href="http://en.wikipedia.org/wiki/Reduce_(higher-order_function)"><em>reduction</em></a> (or <em>fold</em>)
+ * uses a binary combining operator to compute a single aggregate from a list of input elements.
+ *
+ * \tparam T                        The reduction input/output element type
+ * \tparam LOGICAL_WARPS            <b>[optional]</b> The number of entrant "logical" warps performing concurrent warp reductions.  Default is 1.
+ * \tparam LOGICAL_WARP_THREADS     <b>[optional]</b> The number of threads per "logical" warp (may be less than the number of hardware warp threads).  Default is the warp size of the targeted CUDA compute-capability (e.g., 32 threads for SM20).
+ *
+ * \par Simple Examples
+ * \warpcollective{WarpReduce}
+ * \par
+ * The code snippet below illustrates four concurrent warp sum reductions within a block of
+ * 128 threads (one per each of the 32-thread warps).
+ * \par
+ * \code
+ * #include <cub/cub.cuh>
+ *
+ * __global__ void ExampleKernel(...)
+ * {
+ *     // Specialize WarpReduce for 4 warps on type int
+ *     typedef cub::WarpReduce<int, 4> WarpReduce;
+ *
+ *     // Allocate shared memory for WarpReduce
+ *     __shared__ typename WarpReduce::TempStorage temp_storage;
+ *
+ *     // Obtain one input item per thread
+ *     int thread_data = ...
+ *
+ *     // Return the warp-wide sums to each lane0 (threads 0, 32, 64, and 96)
+ *     int aggregate = WarpReduce(temp_storage).Sum(thread_data);
+ *
+ * \endcode
+ * \par
+ * Suppose the set of input \p thread_data across the block of threads is <tt>0, 1, 2, 3, ..., 127</tt>.
+ * The corresponding output \p aggregate in threads 0, 32, 64, and 96 will be \p 496, \p 1520,
+ * \p 2544, and \p 3568, respectively (and is undefined in other threads).
+ *
+ * \par
+ * The code snippet below illustrates a single warp sum reduction within a block of
+ * 128 threads.
+ * \par
+ * \code
+ * #include <cub/cub.cuh>
+ *
+ * __global__ void ExampleKernel(...)
+ * {
+ *     // Specialize WarpReduce for one warp on type int
+ *     typedef cub::WarpReduce<int, 1> WarpReduce;
+ *
+ *     // Allocate shared memory for WarpReduce
+ *     __shared__ typename WarpReduce::TempStorage temp_storage;
+ *     ...
+ *
+ *     // Only the first warp performs a reduction
+ *     if (threadIdx.x < 32)
+ *     {
+ *         // Obtain one input item per thread
+ *         int thread_data = ...
+ *
+ *         // Return the warp-wide sum to lane0
+ *         int aggregate = WarpReduce(temp_storage).Sum(thread_data);
+ *
+ * \endcode
+ * \par
+ * Suppose the set of input \p thread_data across the warp of threads is <tt>0, 1, 2, 3, ..., 31</tt>.
+ * The corresponding output \p aggregate in thread0 will be \p 496 (and is undefined in other threads).
+ *
+ * \par Usage and Performance Considerations
+ * - Supports "logical" warps smaller than the physical warp size (e.g., logical warps of 8 threads)
+ * - The number of entrant threads must be a multiple of \p LOGICAL_WARP_THREADS
+ * - Warp reductions are concurrent if more than one logical warp is participating
+ * - Uses special instructions when applicable (e.g., warp \p SHFL instructions)
+ * - Uses synchronization-free communication between warp lanes when applicable
+ * - Zero bank conflicts for most types
+ * - Computation is slightly more efficient (i.e., having lower instruction overhead) for:
+ *     - Summation (<b><em>vs.</em></b> generic reduction)
+ *     - The architecture's warp size is a whole multiple of \p LOGICAL_WARP_THREADS
+ *
+ */
+template <
+    typename    T,
+    int         LOGICAL_WARPS           = 1,
+    int         LOGICAL_WARP_THREADS    = PtxArchProps::WARP_THREADS>
+class WarpReduce
+{
+private:
+
+    /******************************************************************************
+     * Constants and typedefs
+     ******************************************************************************/
+
+    enum
+    {
+        /// Nonzero when LOGICAL_WARP_THREADS has at most one bit set (i.e., is a power of two)
+        POW_OF_TWO = ((LOGICAL_WARP_THREADS & (LOGICAL_WARP_THREADS - 1)) == 0),
+    };
+
+public:
+
+    #ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
+
+    /// Internal specialization.  Use SHFL-based reduction if (architecture is >= SM30) and ((only one logical warp) or (LOGICAL_WARP_THREADS is a power-of-two))
+    typedef typename If<(CUB_PTX_ARCH >= 300) && ((LOGICAL_WARPS == 1) || POW_OF_TWO),
+        WarpReduceShfl<T, LOGICAL_WARPS, LOGICAL_WARP_THREADS>,
+        WarpReduceSmem<T, LOGICAL_WARPS, LOGICAL_WARP_THREADS> >::Type InternalWarpReduce;
+
+    #endif // DOXYGEN_SHOULD_SKIP_THIS
+
+
+private:
+
+    /// Shared memory storage layout type for WarpReduce
+    typedef typename InternalWarpReduce::TempStorage _TempStorage;
+
+
+    /******************************************************************************
+     * Thread fields
+     ******************************************************************************/
+
+    /// Shared storage reference
+    _TempStorage &temp_storage;
+
+    /// Warp ID
+    int warp_id;
+
+    /// Lane ID
+    int lane_id;
+
+
+    /******************************************************************************
+     * Utility methods
+     ******************************************************************************/
+
+    /// Internal storage allocator.  Returns a reference to a function-static \p __shared__ allocation.
+    __device__ __forceinline__ _TempStorage& PrivateStorage()
+    {
+        __shared__ TempStorage private_storage;
+
+        // TempStorage is the Uninitialized<> wrapper, not _TempStorage itself:
+        // go through Alias() exactly as the TempStorage-taking constructors do.
+        return private_storage.Alias();
+    }
+
+
+public:
+
+    /// \smemstorage{WarpReduce}
+    struct TempStorage : Uninitialized<_TempStorage> {};
+
+
+    /******************************************************************//**
+     * \name Collective constructors
+     *********************************************************************/
+    //@{
+
+
+    /**
+     * \brief Collective constructor for 1D thread blocks using a private static allocation of shared memory as temporary storage.  Logical warp and lane identifiers are constructed from <tt>threadIdx.x</tt>.
+     *
+     */
+    __device__ __forceinline__ WarpReduce()
+    :
+        temp_storage(PrivateStorage()),
+        // A single logical warp is always warp 0
+        warp_id((LOGICAL_WARPS == 1) ?
+            0 :
+            threadIdx.x / LOGICAL_WARP_THREADS),
+        // Use LaneId() for a single logical warp or for full-width logical warps
+        lane_id(((LOGICAL_WARPS == 1) || (LOGICAL_WARP_THREADS == PtxArchProps::WARP_THREADS)) ?
+            LaneId() :
+            threadIdx.x % LOGICAL_WARP_THREADS)
+    {}
+
+
+    /**
+     * \brief Collective constructor for 1D thread blocks using the specified memory allocation as temporary storage.  Logical warp and lane identifiers are constructed from <tt>threadIdx.x</tt>.
+     */
+    __device__ __forceinline__ WarpReduce(
+        TempStorage &temp_storage)             ///< [in] Reference to memory allocation having layout type TempStorage
+    :
+        temp_storage(temp_storage.Alias()),
+        // A single logical warp is always warp 0
+        warp_id((LOGICAL_WARPS == 1) ?
+            0 :
+            threadIdx.x / LOGICAL_WARP_THREADS),
+        // Use LaneId() for a single logical warp or for full-width logical warps
+        lane_id(((LOGICAL_WARPS == 1) || (LOGICAL_WARP_THREADS == PtxArchProps::WARP_THREADS)) ?
+            LaneId() :
+            threadIdx.x % LOGICAL_WARP_THREADS)
+    {}
+
+
+    /**
+     * \brief Collective constructor using a private static allocation of shared memory as temporary storage.  Threads are identified using the given warp and lane identifiers.
+     */
+    __device__ __forceinline__ WarpReduce(
+        int warp_id,                           ///< [in] A suitable warp membership identifier
+        int lane_id)                           ///< [in] A lane identifier within the warp
+    :
+        temp_storage(PrivateStorage()),
+        warp_id(warp_id),
+        lane_id(lane_id)
+    {}
+
+
+    /**
+     * \brief Collective constructor using the specified memory allocation as temporary storage.  Threads are identified using the given warp and lane identifiers.
+     */
+    __device__ __forceinline__ WarpReduce(
+        TempStorage &temp_storage,             ///< [in] Reference to memory allocation having layout type TempStorage
+        int warp_id,                           ///< [in] A suitable warp membership identifier
+        int lane_id)                           ///< [in] A lane identifier within the warp
+    :
+        temp_storage(temp_storage.Alias()),
+        warp_id(warp_id),
+        lane_id(lane_id)
+    {}
+
+
+
+    //@}  end member group
+    /******************************************************************//**
+     * \name Summation reductions
+     *********************************************************************/
+    //@{
+
+
+    /**
+     * \brief Computes a warp-wide sum in each active warp.  The output is valid in warp <em>lane</em><sub>0</sub>.
+     *
+     * \smemreuse
+     *
+     * The code snippet below illustrates four concurrent warp sum reductions within a block of
+     * 128 threads (one per each of the 32-thread warps).
+     * \par
+     * \code
+     * #include <cub/cub.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize WarpReduce for 4 warps on type int
+     *     typedef cub::WarpReduce<int, 4> WarpReduce;
+     *
+     *     // Allocate shared memory for WarpReduce
+     *     __shared__ typename WarpReduce::TempStorage temp_storage;
+     *
+     *     // Obtain one input item per thread
+     *     int thread_data = ...
+     *
+     *     // Return the warp-wide sums to each lane0
+     *     int aggregate = WarpReduce(temp_storage).Sum(thread_data);
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_data across the block of threads is <tt>0, 1, 2, 3, ..., 127</tt>.
+     * The corresponding output \p aggregate in threads 0, 32, 64, and 96 will be \p 496, \p 1520,
+     * \p 2544, and \p 3568, respectively (and is undefined in other threads).
+     *
+     */
+    __device__ __forceinline__ T Sum(
+        T                   input)              ///< [in] Calling thread's input
+    {
+        // InternalWarpReduce is a dependent type, so its member template must be
+        // named with the "template" disambiguator.
+        return InternalWarpReduce(temp_storage, warp_id, lane_id).template Sum<true, 1>(input, LOGICAL_WARP_THREADS);
+    }
+
+    /**
+     * \brief Computes a partially-full warp-wide sum in each active warp.  The output is valid in warp <em>lane</em><sub>0</sub>.
+     *
+     * All threads in each logical warp must agree on the same value for \p valid_items.  Otherwise the result is undefined.
+     *
+     * \smemreuse
+     *
+     * The code snippet below illustrates a sum reduction within a single, partially-full
+     * block of 32 threads (one warp).
+     * \par
+     * \code
+     * #include <cub/cub.cuh>
+     *
+     * __global__ void ExampleKernel(int *d_data, int valid_items)
+     * {
+     *     // Specialize WarpReduce for a single warp on type int
+     *     typedef cub::WarpReduce<int, 1> WarpReduce;
+     *
+     *     // Allocate shared memory for WarpReduce
+     *     __shared__ typename WarpReduce::TempStorage temp_storage;
+     *
+     *     // Obtain one input item per thread if in range
+     *     int thread_data;
+     *     if (threadIdx.x < valid_items)
+     *         thread_data = d_data[threadIdx.x];
+     *
+     *     // Return the warp-wide sums to each lane0
+     *     int aggregate = WarpReduce(temp_storage).Sum(
+     *         thread_data, valid_items);
+     *
+     * \endcode
+     * \par
+     * Suppose the input \p d_data is <tt>0, 1, 2, 3, 4, ...</tt> and \p valid_items
+     * is \p 4.  The corresponding output \p aggregate in thread0 is \p 6 (and is
+     * undefined in other threads).
+     *
+     */
+    __device__ __forceinline__ T Sum(
+        T                   input,              ///< [in] Calling thread's input
+        int                 valid_items)        ///< [in] Total number of valid items in the calling thread's logical warp (may be less than \p LOGICAL_WARP_THREADS)
+    {
+        // Determine if we don't need bounds checking
+        if (valid_items >= LOGICAL_WARP_THREADS)
+        {
+            // Full warp: first template argument is true
+            return InternalWarpReduce(temp_storage, warp_id, lane_id).template Sum<true, 1>(input, valid_items);
+        }
+        else
+        {
+            // Partially-full warp: first template argument is false
+            return InternalWarpReduce(temp_storage, warp_id, lane_id).template Sum<false, 1>(input, valid_items);
+        }
+    }
+
+
+    /**
+     * \brief Computes a segmented sum in each active warp where segments are defined by head-flags.  The sum of each segment is returned to the first lane in that segment (which always includes <em>lane</em><sub>0</sub>).
+     *
+     * \smemreuse
+     *
+     * The code snippet below illustrates a head-segmented warp sum
+     * reduction within a block of 32 threads (one warp).
+     * \par
+     * \code
+     * #include <cub/cub.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize WarpReduce for a single warp on type int
+     *     typedef cub::WarpReduce<int, 1> WarpReduce;
+     *
+     *     // Allocate shared memory for WarpReduce
+     *     __shared__ typename WarpReduce::TempStorage temp_storage;
+     *
+     *     // Obtain one input item and flag per thread
+     *     int thread_data = ...
+     *     int head_flag = ...
+     *
+     *     // Return each segment's sum to the first lane of that segment
+     *     int aggregate = WarpReduce(temp_storage).HeadSegmentedSum(
+     *         thread_data, head_flag);
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_data and \p head_flag across the block of threads
+     * is <tt>0, 1, 2, 3, ..., 31</tt> and is <tt>1, 0, 0, 0, 1, 0, 0, 0, ..., 1, 0, 0, 0</tt>,
+     * respectively.  The corresponding output \p aggregate in threads 0, 4, 8, etc. will be
+     * \p 6, \p 22, \p 38, etc. (and is undefined in other threads).
+     *
+     * \tparam Flag            <b>[inferred]</b> Type of the head flag
+     *
+     */
+    template <
+        typename            Flag>
+    __device__ __forceinline__ T HeadSegmentedSum(
+        T                   input,              ///< [in] Calling thread's input
+        Flag                head_flag)          ///< [in] Head flag denoting whether or not \p input is the start of a new segment
+    {
+        return HeadSegmentedReduce(input, head_flag, cub::Sum());
+    }
+
+
+    /**
+     * \brief Computes a segmented sum in each active warp where segments are defined by tail-flags.  The sum of each segment is returned to the first lane in that segment (which always includes <em>lane</em><sub>0</sub>).
+     *
+     * \smemreuse
+     *
+     * The code snippet below illustrates a tail-segmented warp sum
+     * reduction within a block of 32 threads (one warp).
+     * \par
+     * \code
+     * #include <cub/cub.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize WarpReduce for a single warp on type int
+     *     typedef cub::WarpReduce<int, 1> WarpReduce;
+     *
+     *     // Allocate shared memory for WarpReduce
+     *     __shared__ typename WarpReduce::TempStorage temp_storage;
+     *
+     *     // Obtain one input item and flag per thread
+     *     int thread_data = ...
+     *     int tail_flag = ...
+     *
+     *     // Return each segment's sum to the first lane of that segment
+     *     int aggregate = WarpReduce(temp_storage).TailSegmentedSum(
+     *         thread_data, tail_flag);
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_data and \p tail_flag across the block of threads
+     * is <tt>0, 1, 2, 3, ..., 31</tt> and is <tt>0, 0, 0, 1, 0, 0, 0, 1, ..., 0, 0, 0, 1</tt>,
+     * respectively.  The corresponding output \p aggregate in threads 0, 4, 8, etc. will be
+     * \p 6, \p 22, \p 38, etc. (and is undefined in other threads).
+     *
+     * \tparam Flag            <b>[inferred]</b> Type of the tail flag
+     */
+    template <
+        typename            Flag>
+    __device__ __forceinline__ T TailSegmentedSum(
+        T                   input,              ///< [in] Calling thread's input
+        Flag                tail_flag)          ///< [in] Tail flag denoting whether or not \p input is the end of the current segment
+    {
+        return TailSegmentedReduce(input, tail_flag, cub::Sum());
+    }
+
+
+
+    //@}  end member group
+    /******************************************************************//**
+     * \name Generic reductions
+     *********************************************************************/
+    //@{
+
+    /**
+     * \brief Computes a warp-wide reduction in each active warp using the specified binary reduction functor.  The output is valid in warp <em>lane</em><sub>0</sub>.
+     *
+     * Supports non-commutative reduction operators
+     *
+     * \smemreuse
+     *
+     * The code snippet below illustrates four concurrent warp max reductions within a block of
+     * 128 threads (one per each of the 32-thread warps).
+     * \par
+     * \code
+     * #include <cub/cub.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize WarpReduce for 4 warps on type int
+     *     typedef cub::WarpReduce<int, 4> WarpReduce;
+     *
+     *     // Allocate shared memory for WarpReduce
+     *     __shared__ typename WarpReduce::TempStorage temp_storage;
+     *
+     *     // Obtain one input item per thread
+     *     int thread_data = ...
+     *
+     *     // Return the warp-wide reductions to each lane0
+     *     int aggregate = WarpReduce(temp_storage).Reduce(
+     *         thread_data, cub::Max());
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_data across the block of threads is <tt>0, 1, 2, 3, ..., 127</tt>.
+     * The corresponding output \p aggregate in threads 0, 32, 64, and 96 will be \p 31, \p 63,
+     * \p 95, and \p 127, respectively  (and is undefined in other threads).
+     *
+     * \tparam ReductionOp     <b>[inferred]</b> Binary reduction operator type having member <tt>T operator()(const T &a, const T &b)</tt>
+     */
+    template <typename ReductionOp>
+    __device__ __forceinline__ T Reduce(
+        T                   input,              ///< [in] Calling thread's input
+        ReductionOp         reduction_op)       ///< [in] Binary reduction operator
+    {
+        // InternalWarpReduce is a dependent type, so its member template must be
+        // named with the "template" disambiguator.
+        return InternalWarpReduce(temp_storage, warp_id, lane_id).template Reduce<true, 1>(input, LOGICAL_WARP_THREADS, reduction_op);
+    }
+
+    /**
+     * \brief Computes a partially-full warp-wide reduction in each active warp using the specified binary reduction functor.  The output is valid in warp <em>lane</em><sub>0</sub>.
+     *
+     * All threads in each logical warp must agree on the same value for \p valid_items.  Otherwise the result is undefined.
+     *
+     * Supports non-commutative reduction operators
+     *
+     * \smemreuse
+     *
+     * The code snippet below illustrates a max reduction within a single, partially-full
+     * block of 32 threads (one warp).
+     * \par
+     * \code
+     * #include <cub/cub.cuh>
+     *
+     * __global__ void ExampleKernel(int *d_data, int valid_items)
+     * {
+     *     // Specialize WarpReduce for a single warp on type int
+     *     typedef cub::WarpReduce<int, 1> WarpReduce;
+     *
+     *     // Allocate shared memory for WarpReduce
+     *     __shared__ typename WarpReduce::TempStorage temp_storage;
+     *
+     *     // Obtain one input item per thread if in range
+     *     int thread_data;
+     *     if (threadIdx.x < valid_items)
+     *         thread_data = d_data[threadIdx.x];
+     *
+     *     // Return the warp-wide reductions to each lane0
+     *     int aggregate = WarpReduce(temp_storage).Reduce(
+     *         thread_data, cub::Max(), valid_items);
+     *
+     * \endcode
+     * \par
+     * Suppose the input \p d_data is <tt>0, 1, 2, 3, 4, ...</tt> and \p valid_items
+     * is \p 4.  The corresponding output \p aggregate in thread0 is \p 3 (and is
+     * undefined in other threads).
+     *
+     * \tparam ReductionOp     <b>[inferred]</b> Binary reduction operator type having member <tt>T operator()(const T &a, const T &b)</tt>
+     */
+    template <typename ReductionOp>
+    __device__ __forceinline__ T Reduce(
+        T                   input,              ///< [in] Calling thread's input
+        ReductionOp         reduction_op,       ///< [in] Binary reduction operator
+        int                 valid_items)        ///< [in] Total number of valid items in the calling thread's logical warp (may be less than \p LOGICAL_WARP_THREADS)
+    {
+        // Determine if we don't need bounds checking
+        if (valid_items >= LOGICAL_WARP_THREADS)
+        {
+            // Full warp: first template argument is true
+            return InternalWarpReduce(temp_storage, warp_id, lane_id).template Reduce<true, 1>(input, valid_items, reduction_op);
+        }
+        else
+        {
+            // Partially-full warp: first template argument is false
+            return InternalWarpReduce(temp_storage, warp_id, lane_id).template Reduce<false, 1>(input, valid_items, reduction_op);
+        }
+    }
+
+
+    /**
+     * \brief Computes a segmented reduction in each active warp where segments are defined by head-flags.  The reduction of each segment is returned to the first lane in that segment (which always includes <em>lane</em><sub>0</sub>).
+     *
+     * Supports non-commutative reduction operators
+     *
+     * \smemreuse
+     *
+     * The code snippet below illustrates a head-segmented warp max
+     * reduction within a block of 32 threads (one warp).
+     * \par
+     * \code
+     * #include <cub/cub.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize WarpReduce for a single warp on type int
+     *     typedef cub::WarpReduce<int, 1> WarpReduce;
+     *
+     *     // Allocate shared memory for WarpReduce
+     *     __shared__ typename WarpReduce::TempStorage temp_storage;
+     *
+     *     // Obtain one input item and flag per thread
+     *     int thread_data = ...
+     *     int head_flag = ...
+     *
+     *     // Return each segment's reduction to the first lane of that segment
+     *     int aggregate = WarpReduce(temp_storage).HeadSegmentedReduce(
+     *         thread_data, head_flag, cub::Max());
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_data and \p head_flag across the block of threads
+     * is <tt>0, 1, 2, 3, ..., 31</tt> and is <tt>1, 0, 0, 0, 1, 0, 0, 0, ..., 1, 0, 0, 0</tt>,
+     * respectively.  The corresponding output \p aggregate in threads 0, 4, 8, etc. will be
+     * \p 3, \p 7, \p 11, etc. (and is undefined in other threads).
+     *
+     * \tparam ReductionOp     <b>[inferred]</b> Binary reduction operator type having member <tt>T operator()(const T &a, const T &b)</tt>
+     * \tparam Flag            <b>[inferred]</b> Type of the head flag
+     */
+    template <
+        typename            ReductionOp,
+        typename            Flag>
+    __device__ __forceinline__ T HeadSegmentedReduce(
+        T                   input,              ///< [in] Calling thread's input
+        Flag                head_flag,          ///< [in] Head flag denoting whether or not \p input is the start of a new segment
+        ReductionOp         reduction_op)       ///< [in] Reduction operator
+    {
+        // SegmentedReduce<true>: flags are interpreted as head flags
+        return InternalWarpReduce(temp_storage, warp_id, lane_id).template SegmentedReduce<true>(input, head_flag, reduction_op);
+    }
+
+
+    /**
+     * \brief Computes a segmented reduction in each active warp where segments are defined by tail-flags.  The reduction of each segment is returned to the first lane in that segment (which always includes <em>lane</em><sub>0</sub>).
+     *
+     * Supports non-commutative reduction operators
+     *
+     * \smemreuse
+     *
+     * The code snippet below illustrates a tail-segmented warp max
+     * reduction within a block of 32 threads (one warp).
+     * \par
+     * \code
+     * #include <cub/cub.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize WarpReduce for a single warp on type int
+     *     typedef cub::WarpReduce<int, 1> WarpReduce;
+     *
+     *     // Allocate shared memory for WarpReduce
+     *     __shared__ typename WarpReduce::TempStorage temp_storage;
+     *
+     *     // Obtain one input item and flag per thread
+     *     int thread_data = ...
+     *     int tail_flag = ...
+     *
+     *     // Return each segment's reduction to the first lane of that segment
+     *     int aggregate = WarpReduce(temp_storage).TailSegmentedReduce(
+     *         thread_data, tail_flag, cub::Max());
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_data and \p tail_flag across the block of threads
+     * is <tt>0, 1, 2, 3, ..., 31</tt> and is <tt>0, 0, 0, 1, 0, 0, 0, 1, ..., 0, 0, 0, 1</tt>,
+     * respectively.  The corresponding output \p aggregate in threads 0, 4, 8, etc. will be
+     * \p 3, \p 7, \p 11, etc. (and is undefined in other threads).
+     *
+     * \tparam ReductionOp     <b>[inferred]</b> Binary reduction operator type having member <tt>T operator()(const T &a, const T &b)</tt>
+     * \tparam Flag            <b>[inferred]</b> Type of the tail flag
+     */
+    template <
+        typename            ReductionOp,
+        typename            Flag>
+    __device__ __forceinline__ T TailSegmentedReduce(
+        T                   input,              ///< [in] Calling thread's input
+        Flag                tail_flag,          ///< [in] Tail flag denoting whether or not \p input is the end of the current segment
+        ReductionOp         reduction_op)       ///< [in] Reduction operator
+    {
+        // SegmentedReduce<false>: flags are interpreted as tail flags
+        return InternalWarpReduce(temp_storage, warp_id, lane_id).template SegmentedReduce<false>(input, tail_flag, reduction_op);
+    }
+
+
+
+    //@}  end member group
+};
+
+/** @} */       // end group WarpModule
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
--- a/Show More
+++ b/Show More