From 63072faa6c581c5e8a6b590e57ee7c490c88b718 Mon Sep 17 00:00:00 2001 From: fbusato Date: Tue, 18 Feb 2025 15:20:16 -0800 Subject: [PATCH 01/12] add missing overloadings --- cub/cub/warp/warp_reduce.cuh | 284 ++++++++++++++++++++++------------- 1 file changed, 182 insertions(+), 102 deletions(-) diff --git a/cub/cub/warp/warp_reduce.cuh b/cub/cub/warp/warp_reduce.cuh index 7992432a1c3..183d3454ba3 100644 --- a/cub/cub/warp/warp_reduce.cuh +++ b/cub/cub/warp/warp_reduce.cuh @@ -1,30 +1,25 @@ -/****************************************************************************** +/*********************************************************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2011-2025, NVIDIA CORPORATION. All rights reserved. * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. + * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the + * following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of conditions and the + * following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the + * following disclaimer in the documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used to endorse or promote + * products derived from this software without specific prior written permission. * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, + * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, + * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * - ******************************************************************************/ + **********************************************************************************************************************/ //! @file //! @rst @@ -44,29 +39,32 @@ # pragma system_header #endif // no system header +#include #include +#include #include #include #include +#include +#include #include CUB_NAMESPACE_BEGIN //! @rst -//! The ``WarpReduce`` class provides :ref:`collective ` methods for -//! computing a parallel reduction of items partitioned across a CUDA thread warp. +//! The ``WarpReduce`` class provides :ref:`collective ` methods for computing a parallel +//! reduction of items partitioned across a CUDA thread warp. //! //! .. image:: ../../img/warp_reduce_logo.png //! :align: center //! //! Overview -//! ++++++++++++++++++++++++++ +//! ++++++++ //! -//! - A `reduction `__ (or *fold*) -//! uses a binary combining operator to compute a single aggregate from a list of input elements. -//! - Supports "logical" warps smaller than the physical warp size (e.g., logical warps of 8 -//! threads) +//! - A `reduction `__ (or *fold*) uses a binary combining +//! operator to compute a single aggregate from a list of input elements. +//! - Supports "logical" warps smaller than the physical warp size (e.g., logical warps of 8 threads) //! - The number of entrant threads must be an multiple of ``LOGICAL_WARP_THREADS`` //! //! Performance Considerations @@ -81,12 +79,12 @@ CUB_NAMESPACE_BEGIN //! - The architecture's warp size is a whole multiple of ``LOGICAL_WARP_THREADS`` //! //! Simple Examples -//! ++++++++++++++++++++++++++ +//! +++++++++++++++ //! //! @warpcollective{WarpReduce} //! -//! The code snippet below illustrates four concurrent warp sum reductions within a block of -//! 128 threads (one per each of the 32-thread warps). +//! The code snippet below illustrates four concurrent warp sum reductions within a block of 128 threads (one per each +//! of the 32-thread warps). //! //! .. code-block:: c++ //! @@ -96,24 +94,19 @@ CUB_NAMESPACE_BEGIN //! { //! // Specialize WarpReduce for type int //! using WarpReduce = cub::WarpReduce; -//! //! // Allocate WarpReduce shared memory for 4 warps //! __shared__ typename WarpReduce::TempStorage temp_storage[4]; -//! //! // Obtain one input item per thread //! int thread_data = ... -//! //! // Return the warp-wide sums to each lane0 (threads 0, 32, 64, and 96) -//! int warp_id = threadIdx.x / 32; +//! int warp_id = threadIdx.x / 32; //! int aggregate = WarpReduce(temp_storage[warp_id]).Sum(thread_data); //! -//! Suppose the set of input ``thread_data`` across the block of threads is -//! ``{0, 1, 2, 3, ..., 127}``. The corresponding output ``aggregate`` in threads 0, 32, 64, and 96 -//! will be ``496``, ``1520``, ``2544``, and ``3568``, respectively -//! (and is undefined in other threads). +//! Suppose the set of input ``thread_data`` across the block of threads is ``{0, 1, 2, 3, ..., 127}``. +//! 
The corresponding output ``aggregate`` in threads 0, 32, 64, and 96 will be +//! ``496``, ``1520``, ``2544``, and ``3568``, respectively (and is undefined in other threads). //! -//! The code snippet below illustrates a single warp sum reduction within a block of -//! 128 threads. +//! The code snippet below illustrates a single warp sum reduction within a block of 128 threads. //! //! .. code-block:: c++ //! @@ -123,23 +116,19 @@ CUB_NAMESPACE_BEGIN //! { //! // Specialize WarpReduce for type int //! using WarpReduce = cub::WarpReduce; -//! //! // Allocate WarpReduce shared memory for one warp //! __shared__ typename WarpReduce::TempStorage temp_storage; //! ... -//! //! // Only the first warp performs a reduction //! if (threadIdx.x < 32) //! { //! // Obtain one input item per thread //! int thread_data = ... -//! //! // Return the warp-wide sum to lane0 //! int aggregate = WarpReduce(temp_storage).Sum(thread_data); //! -//! Suppose the set of input ``thread_data`` across the warp of threads is -//! ``{0, 1, 2, 3, ..., 31}``. The corresponding output ``aggregate`` in thread0 will be ``496`` -//! (and is undefined in other threads). +//! Suppose the set of input ``thread_data`` across the warp of threads is ``{0, 1, 2, 3, ..., 31}``. +//! The corresponding output ``aggregate`` in thread0 will be ``496`` (and is undefined in other threads). //! @endrst //! //! @tparam T @@ -153,27 +142,21 @@ CUB_NAMESPACE_BEGIN template class WarpReduce { -private: - /****************************************************************************** - * Constants and type definitions - ******************************************************************************/ - - enum - { - /// Whether the logical warp size and the PTX warp size coincide - IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(0)), + static_assert(LOGICAL_WARP_THREADS >= 1 && LOGICAL_WARP_THREADS <= CUB_WARP_THREADS(0), + "LOGICAL_WARP_THREADS must be in the range [1, 32]"); - /// Whether the logical warp size is a power-of-two - IS_POW_OF_TWO = PowerOfTwo::VALUE, - }; + static constexpr bool is_full_warp = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(0)); + static constexpr bool is_power_of_two = ::cuda::std::has_single_bit(uint32_t{LOGICAL_WARP_THREADS}); public: #ifndef _CCCL_DOXYGEN_INVOKED // Do not document /// Internal specialization. /// Use SHFL-based reduction if LOGICAL_WARP_THREADS is a power-of-two - using InternalWarpReduce = ::cuda::std:: - _If, detail::WarpReduceSmem>; + using InternalWarpReduce = + ::cuda::std::_If, + detail::WarpReduceSmem>; #endif // _CCCL_DOXYGEN_INVOKED @@ -181,17 +164,9 @@ private: /// Shared memory storage layout type for WarpReduce using _TempStorage = typename InternalWarpReduce::TempStorage; - /****************************************************************************** - * Thread fields - ******************************************************************************/ - /// Shared storage reference _TempStorage& temp_storage; - /****************************************************************************** - * Utility methods - ******************************************************************************/ - public: /// \smemstorage{WarpReduce} struct TempStorage : Uninitialized<_TempStorage> @@ -207,7 +182,7 @@ public: //! //! @param[in] temp_storage Reference to memory allocation having layout type TempStorage _CCCL_DEVICE _CCCL_FORCEINLINE WarpReduce(TempStorage& temp_storage) - : temp_storage(temp_storage.Alias()) + : temp_storage{temp_storage.Alias()} {} //! 
@} end member group @@ -223,8 +198,8 @@ public: //! Snippet //! +++++++ //! - //! The code snippet below illustrates four concurrent warp sum reductions within a block of - //! 128 threads (one per each of the 32-thread warps). + //! The code snippet below illustrates four concurrent warp sum reductions within a block of 128 threads + //! (one per each of the 32-thread warps). //! //! .. code-block:: c++ //! @@ -234,27 +209,62 @@ public: //! { //! // Specialize WarpReduce for type int //! using WarpReduce = cub::WarpReduce; - //! //! // Allocate WarpReduce shared memory for 4 warps //! __shared__ typename WarpReduce::TempStorage temp_storage[4]; - //! //! // Obtain one input item per thread //! int thread_data = ... - //! //! // Return the warp-wide sums to each lane0 //! int warp_id = threadIdx.x / 32; //! int aggregate = WarpReduce(temp_storage[warp_id]).Sum(thread_data); //! - //! Suppose the set of input ``thread_data`` across the block of threads is - //! ``{0, 1, 2, 3, ..., 127}``. - //! The corresponding output ``aggregate`` in threads 0, 32, 64, and 96 will ``496``, ``1520``, - //! ``2544``, and ``3568``, respectively (and is undefined in other threads). + //! Suppose the set of input ``thread_data`` across the block of threads is ``{0, 1, 2, 3, ..., 127}``. + //! The corresponding output ``aggregate`` in threads 0, 32, 64, and 96 will ``496``, ``1520``, ``2544``, and + //! ``3568``, respectively (and is undefined in other threads). //! @endrst //! - //! @param[in] input Calling thread's input - _CCCL_DEVICE _CCCL_FORCEINLINE T Sum(T input) + _CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE T Sum(T input) + { + return InternalWarpReduce{temp_storage}.template Reduce(input, LOGICAL_WARP_THREADS, ::cuda::std::plus<>{}); + } + + template + _CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE // + ::cuda::std::enable_if_t{}, T> // + Sum(const InputType& input) + { + auto thread_reduction = cub::ThreadReduce(input, ::cuda::std::plus<>{}); + return InternalWarpReduce{temp_storage}.template Reduce( + thread_reduction, LOGICAL_WARP_THREADS, ::cuda::std::plus<>{}); + } + + _CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE T Max(T input) + { + return InternalWarpReduce{temp_storage}.template Reduce(input, LOGICAL_WARP_THREADS, ::cuda::maximum<>{}); + } + + template + _CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE // + ::cuda::std::enable_if_t{}, T> // + Max(const InputType& input) + { + auto thread_reduction = cub::ThreadReduce(input, ::cuda::maximum<>{}); + return InternalWarpReduce{temp_storage}.template Reduce( + thread_reduction, LOGICAL_WARP_THREADS, ::cuda::maximum<>{}); + } + + _CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE T Min(T input) + { + return InternalWarpReduce{temp_storage}.template Reduce(input, LOGICAL_WARP_THREADS, ::cuda::minimum<>{}); + } + + template + _CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE // + ::cuda::std::enable_if_t{}, T> // + Min(const InputType& input) { - return InternalWarpReduce(temp_storage).template Reduce(input, LOGICAL_WARP_THREADS, ::cuda::std::plus<>{}); + auto thread_reduction = cub::ThreadReduce(input, ::cuda::minimum<>{}); + return InternalWarpReduce{temp_storage}.template Reduce( + thread_reduction, LOGICAL_WARP_THREADS, ::cuda::minimum<>{}); } //! @rst @@ -304,7 +314,7 @@ public: //! @param[in] valid_items //! Total number of valid items in the calling thread's logical warp //! 
(may be less than ``LOGICAL_WARP_THREADS``) - _CCCL_DEVICE _CCCL_FORCEINLINE T Sum(T input, int valid_items) + _CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE T Sum(T input, int valid_items) { // Determine if we don't need bounds checking return InternalWarpReduce(temp_storage).template Reduce(input, valid_items, ::cuda::std::plus<>{}); @@ -359,7 +369,7 @@ public: //! @param[in] head_flag //! Head flag denoting whether or not `input` is the start of a new segment template - _CCCL_DEVICE _CCCL_FORCEINLINE T HeadSegmentedSum(T input, FlagT head_flag) + _CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE T HeadSegmentedSum(T input, FlagT head_flag) { return HeadSegmentedReduce(input, head_flag, ::cuda::std::plus<>{}); } @@ -413,7 +423,7 @@ public: //! @param[in] tail_flag //! Head flag denoting whether or not `input` is the start of a new segment template - _CCCL_DEVICE _CCCL_FORCEINLINE T TailSegmentedSum(T input, FlagT tail_flag) + _CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE T TailSegmentedSum(T input, FlagT tail_flag) { return TailSegmentedReduce(input, tail_flag, ::cuda::std::plus<>{}); } @@ -472,11 +482,19 @@ public: //! @param[in] reduction_op //! Binary reduction operator template - _CCCL_DEVICE _CCCL_FORCEINLINE T Reduce(T input, ReductionOp reduction_op) + _CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE T Reduce(T input, ReductionOp reduction_op) { return InternalWarpReduce(temp_storage).template Reduce(input, LOGICAL_WARP_THREADS, reduction_op); } + template + _CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE // + ::cuda::std::enable_if_t{}, T> // + Reduce(const InputType& input, ReductionOp reduction_op) + { + auto thread_reduction = cub::ThreadReduce(input, reduction_op); + return WarpReduce::Reduce(thread_reduction, LOGICAL_WARP_THREADS, reduction_op); + } //! @rst //! Computes a partially-full warp-wide reduction in the calling warp using the specified binary //! reduction functor. The output is valid in warp *lane*\ :sub:`0`. @@ -534,7 +552,7 @@ public: //! Total number of valid items in the calling thread's logical warp //! (may be less than ``LOGICAL_WARP_THREADS``) template - _CCCL_DEVICE _CCCL_FORCEINLINE T Reduce(T input, ReductionOp reduction_op, int valid_items) + _CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE T Reduce(T input, ReductionOp reduction_op, int valid_items) { return InternalWarpReduce(temp_storage).template Reduce(input, valid_items, reduction_op); } @@ -593,7 +611,8 @@ public: //! @param[in] reduction_op //! Reduction operator template - _CCCL_DEVICE _CCCL_FORCEINLINE T HeadSegmentedReduce(T input, FlagT head_flag, ReductionOp reduction_op) + _CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE T + HeadSegmentedReduce(T input, FlagT head_flag, ReductionOp reduction_op) { return InternalWarpReduce(temp_storage).template SegmentedReduce(input, head_flag, reduction_op); } @@ -652,7 +671,8 @@ public: //! @param[in] reduction_op //! 
Reduction operator template - _CCCL_DEVICE _CCCL_FORCEINLINE T TailSegmentedReduce(T input, FlagT tail_flag, ReductionOp reduction_op) + _CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE T + TailSegmentedReduce(T input, FlagT tail_flag, ReductionOp reduction_op) { return InternalWarpReduce(temp_storage).template SegmentedReduce(input, tail_flag, reduction_op); } @@ -673,16 +693,18 @@ public: struct TempStorage : Uninitialized<_TempStorage> {}; - _CCCL_DEVICE _CCCL_FORCEINLINE InternalWarpReduce(TempStorage& /*temp_storage */) {} + _CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE InternalWarpReduce(TempStorage& /*temp_storage */) {} template - _CCCL_DEVICE _CCCL_FORCEINLINE T Reduce(T input, int /* valid_items */, ReductionOp /* reduction_op */) + _CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE T + Reduce(T input, int /* valid_items */, ReductionOp /* reduction_op */) { return input; } template - _CCCL_DEVICE _CCCL_FORCEINLINE T SegmentedReduce(T input, FlagT /* flag */, ReductionOp /* reduction_op */) + _CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE T + SegmentedReduce(T input, FlagT /* flag */, ReductionOp /* reduction_op */) { return input; } @@ -690,54 +712,112 @@ public: using TempStorage = typename InternalWarpReduce::TempStorage; - _CCCL_DEVICE _CCCL_FORCEINLINE WarpReduce(TempStorage& /*temp_storage */) {} + _CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE WarpReduce(TempStorage& /*temp_storage */) {} - _CCCL_DEVICE _CCCL_FORCEINLINE T Sum(T input) + _CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE T Sum(T input) { return input; } - _CCCL_DEVICE _CCCL_FORCEINLINE T Sum(T input, int /* valid_items */) + template + _CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE // + ::cuda::std::enable_if_t{}, T> // + Sum(const InputType& input) + { + return cub::ThreadReduce(input, ::cuda::std::plus<>{}); + } + + template + _CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE T Sum(T input, int /* valid_items */) + { + return input; + } + + _CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE T Max(T input) + { + return input; + } + + template + _CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE // + ::cuda::std::enable_if_t{}, T> // + Max(const InputType& input) + { + return cub::ThreadReduce(input, ::cuda::maximum<>{}); + } + + template + _CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE T Max(T input, int /* valid_items */) + { + return input; + } + + _CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE T Min(T input) + { + return input; + } + + template + _CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE // + ::cuda::std::enable_if_t{}, T> // + Min(const InputType& input) + { + return cub::ThreadReduce(input, ::cuda::minimum<>{}); + } + + _CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE T Min(T input, int /* valid_items */) { return input; } template - _CCCL_DEVICE _CCCL_FORCEINLINE T HeadSegmentedSum(T input, FlagT /* head_flag */) + _CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE T HeadSegmentedSum(T input, FlagT /* head_flag */) { return input; } template - _CCCL_DEVICE _CCCL_FORCEINLINE T TailSegmentedSum(T input, FlagT /* tail_flag */) + _CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE T TailSegmentedSum(T input, FlagT /* tail_flag */) { return input; } template - _CCCL_DEVICE _CCCL_FORCEINLINE T Reduce(T input, ReductionOp /* reduction_op */) + _CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE T Reduce(T input, ReductionOp /* reduction_op */) { return input; } + template + _CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE // + ::cuda::std::enable_if_t{}, T> // + Reduce(const InputType& input, ReductionOp reduction_op) + { + return 
cub::ThreadReduce(input, reduction_op); + } + template - _CCCL_DEVICE _CCCL_FORCEINLINE T Reduce(T input, ReductionOp /* reduction_op */, int /* valid_items */) + _CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE T + Reduce(T input, ReductionOp /* reduction_op */, int /* valid_items */) { return input; } template - _CCCL_DEVICE _CCCL_FORCEINLINE T HeadSegmentedReduce(T input, FlagT /* head_flag */, ReductionOp /* reduction_op */) + _CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE T + HeadSegmentedReduce(T input, FlagT /* head_flag */, ReductionOp /* reduction_op */) { return input; } template - _CCCL_DEVICE _CCCL_FORCEINLINE T TailSegmentedReduce(T input, FlagT /* tail_flag */, ReductionOp /* reduction_op */) + _CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE T + TailSegmentedReduce(T input, FlagT /* tail_flag */, ReductionOp /* reduction_op */) { return input; } }; + #endif // _CCCL_DOXYGEN_INVOKED CUB_NAMESPACE_END From 51160da33ce863cbe674c29646569d8c4410d8f6 Mon Sep 17 00:00:00 2001 From: fbusato Date: Wed, 19 Feb 2025 11:38:35 -0800 Subject: [PATCH 02/12] remove useless volatile load/store --- .../warp/specializations/warp_reduce_smem.cuh | 20 +++++++------------ 1 file changed, 7 insertions(+), 13 deletions(-) diff --git a/cub/cub/warp/specializations/warp_reduce_smem.cuh b/cub/cub/warp/specializations/warp_reduce_smem.cuh index 02b8295460a..1a373204b63 100644 --- a/cub/cub/warp/specializations/warp_reduce_smem.cuh +++ b/cub/cub/warp/specializations/warp_reduce_smem.cuh @@ -44,9 +44,8 @@ # pragma system_header #endif // no system header -#include #include -#include +#include #include #include @@ -152,21 +151,16 @@ struct WarpReduceSmem ReduceStep(T input, int valid_items, ReductionOp reduction_op, constant_t /*step*/) { constexpr int OFFSET = 1 << STEP; - // Share input through buffer - ThreadStore(&temp_storage.reduce[lane_id], input); - + temp_storage.reduce[lane_id] = input; __syncwarp(member_mask); - // Update input if peer_addend is in range if ((ALL_LANES_VALID && IS_POW_OF_TWO) || ((lane_id + OFFSET) < valid_items)) { - T peer_addend = ThreadLoad(&temp_storage.reduce[lane_id + OFFSET]); + T peer_addend = temp_storage.reduce[lane_id + OFFSET]; input = reduction_op(input, peer_addend); } - __syncwarp(member_mask); - return ReduceStep(input, valid_items, reduction_op, constant_v); } @@ -250,14 +244,14 @@ struct WarpReduceSmem const int OFFSET = 1 << STEP; // Share input into buffer - ThreadStore(&temp_storage.reduce[lane_id], input); + temp_storage.reduce[lane_id] = input; __syncwarp(member_mask); // Update input if peer_addend is in range if (OFFSET + lane_id < next_flag) { - T peer_addend = ThreadLoad(&temp_storage.reduce[lane_id + OFFSET]); + T peer_addend = temp_storage.reduce[lane_id + OFFSET]; input = reduction_op(input, peer_addend); } @@ -306,12 +300,12 @@ struct WarpReduceSmem const int OFFSET = 1 << STEP; // Share input through buffer - ThreadStore(&temp_storage.reduce[lane_id], input); + temp_storage.reduce[lane_id] = input; __syncwarp(member_mask); // Get peer from buffer - T peer_addend = ThreadLoad(&temp_storage.reduce[lane_id + OFFSET]); + T peer_addend = temp_storage.reduce[lane_id + OFFSET]; __syncwarp(member_mask); From 43ff91eb8fd6b7e15c0615fd49cf272f2bd20e6d Mon Sep 17 00:00:00 2001 From: fbusato Date: Thu, 20 Feb 2025 09:28:39 -0800 Subject: [PATCH 03/12] improve warp reduce test coverage --- cub/test/utils/check_results.cuh | 45 ++ cub/test/utils/operator.cuh | 83 +++ cub/test/warp/catch2_test_warp_reduce.cu | 591 ++++++++++++++++++ 
.../catch2_test_warp_segmented_reduce.cu} | 0 4 files changed, 719 insertions(+) create mode 100644 cub/test/utils/check_results.cuh create mode 100644 cub/test/utils/operator.cuh create mode 100644 cub/test/warp/catch2_test_warp_reduce.cu rename cub/test/{catch2_test_warp_reduce.cu => warp/catch2_test_warp_segmented_reduce.cu} (100%) diff --git a/cub/test/utils/check_results.cuh b/cub/test/utils/check_results.cuh new file mode 100644 index 00000000000..f73ec9b26a1 --- /dev/null +++ b/cub/test/utils/check_results.cuh @@ -0,0 +1,45 @@ +/****************************************************************************** + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ +#pragma once + +/** + * @brief Compares the results returned from system under test against the expected results. + */ +template , int> = 0> +void verify_results(const c2h::host_vector& expected_data, const c2h::device_vector& test_results) +{ + REQUIRE_APPROX_EQ(expected_data, test_results); +} + +/** + * @brief Compares the results returned from system under test against the expected results. + */ +template , int> = 0> +void verify_results(const c2h::host_vector& expected_data, const c2h::device_vector& test_results) +{ + REQUIRE(expected_data == test_results); +} diff --git a/cub/test/utils/operator.cuh b/cub/test/utils/operator.cuh new file mode 100644 index 00000000000..182d434a92f --- /dev/null +++ b/cub/test/utils/operator.cuh @@ -0,0 +1,83 @@ +/****************************************************************************** + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ +#pragma once + +#include +#include +#include + +/*********************************************************************************************************************** + * CUB operator to identity + **********************************************************************************************************************/ + +template +constexpr T operator_identity_v; + +template +constexpr T operator_identity_v> = T{0}; + +template +constexpr T operator_identity_v> = T{0}; + +template +constexpr T operator_identity_v> = T{1}; + +template +constexpr T operator_identity_v> = T{1}; + +template +constexpr T operator_identity_v> = T{~T{0}}; + +template +constexpr T operator_identity_v> = T{~T{0}}; + +template +constexpr T operator_identity_v> = T{0}; + +template +constexpr T operator_identity_v> = T{0}; + +template +constexpr T operator_identity_v> = T{0}; + +template +constexpr T operator_identity_v> = T{0}; + +template +constexpr T operator_identity_v> = ::std::numeric_limits::min(); + +template +constexpr T operator_identity_v> = ::std::numeric_limits::min(); + +template +constexpr T operator_identity_v> = ::std::numeric_limits::max(); + +template +constexpr T operator_identity_v> = ::std::numeric_limits::max(); + +struct custom_plus : cuda::std::plus<> +{}; diff --git a/cub/test/warp/catch2_test_warp_reduce.cu b/cub/test/warp/catch2_test_warp_reduce.cu new file mode 100644 index 00000000000..78bf960e745 --- /dev/null +++ b/cub/test/warp/catch2_test_warp_reduce.cu @@ -0,0 +1,591 @@ +/*********************************************************************************************************************** + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use input source and binary forms, with or without modification, are permitted provided that the + * following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of conditions and the + * following disclaimer. + * * Redistributions input binary form must reproduce the above copyright notice, this list of conditions and the + * following disclaimer input the documentation and/or other materials provided with the distribution. 
+ * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used to endorse or promote + * products derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, + * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, + * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **********************************************************************************************************************/ + +#include +#include + +#include + +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include + +constexpr int warp_size = 32; + +template +__global__ void warp_reduce_kernel(T* input, T* output, ReductionOp reduction_op) +{ + using warp_reduce_t = cub::WarpReduce; + using storage_t = typename warp_reduce_t::TempStorage; + __shared__ storage_t storage[TotalWarps]; + constexpr bool is_power_of_two = cuda::std::has_single_bit(LogicalWarpThreads); + auto lane = cuda::ptx::get_sreg_laneid(); + auto logical_warp = is_power_of_two ? threadIdx.x / LogicalWarpThreads : threadIdx.x / warp_size; + auto logical_lane = is_power_of_two ? threadIdx.x % LogicalWarpThreads : lane; + if (!is_power_of_two && lane >= LogicalWarpThreads) + { + return; + } + auto thread_data = input[threadIdx.x]; + warp_reduce_t warp_reduce{storage[logical_warp]}; + auto result = reduction_op(warp_reduce, thread_data); + if (logical_lane == 0) + { + output[logical_warp] = result; + } +} + +template +__global__ void warp_reduce_kernel(T* input, T* output, ReductionOp reduction_op, int num_items) +{ + using warp_reduce_t = cub::WarpReduce; + using storage_t = typename warp_reduce_t::TempStorage; + __shared__ storage_t storage[TotalWarps]; + constexpr bool is_power_of_two = cuda::std::has_single_bit(LogicalWarpThreads); + auto lane = cuda::ptx::get_sreg_laneid(); + auto logical_warp = is_power_of_two ? threadIdx.x / LogicalWarpThreads : threadIdx.x / warp_size; + auto logical_lane = is_power_of_two ? threadIdx.x % LogicalWarpThreads : lane; + if (!is_power_of_two && lane >= num_items) + { + return; + } + auto thread_data = input[threadIdx.x]; + warp_reduce_t warp_reduce{storage[logical_warp]}; + auto result = reduction_op(warp_reduce, thread_data, num_items); + if (logical_lane == 0) + { + output[logical_warp] = result; + } +} +inline constexpr int items_per_thread = 4; + +template +__global__ void warp_reduce_multiple_items_kernel(T* input, T* output, ReductionOp reduction_op) +{ + using warp_reduce_t = cub::WarpReduce; + using storage_t = typename warp_reduce_t::TempStorage; + __shared__ storage_t storage[TotalWarps]; + constexpr bool is_power_of_two = cuda::std::has_single_bit(LogicalWarpThreads); + auto lane = cuda::ptx::get_sreg_laneid(); + auto logical_warp = is_power_of_two ? threadIdx.x / LogicalWarpThreads : threadIdx.x / warp_size; + auto logical_lane = is_power_of_two ? 
threadIdx.x % LogicalWarpThreads : lane; + if (!is_power_of_two && lane >= LogicalWarpThreads) + { + return; + } + T thread_data[items_per_thread]; + for (int i = 0; i < items_per_thread; ++i) + { + thread_data[i] = input[threadIdx.x * items_per_thread + i]; + } + warp_reduce_t warp_reduce{storage[logical_warp]}; + static_assert(cub::detail::is_fixed_size_random_access_range_t{}); + auto result = reduction_op(warp_reduce, thread_data); + if (logical_lane == 0) + { + output[logical_warp] = result; + } +} + +template +struct warp_reduce_t +{ + template + __device__ auto operator()(cub::WarpReduce warp_reduce, T& data) const + { + return warp_reduce.Reduce(data, Op{}); + } + + template + __device__ auto operator()(cub::WarpReduce warp_reduce, T& data, int num_items) const + { + return warp_reduce.Reduce(data, Op{}, num_items); + } +}; + +template +struct warp_reduce_t, T> +{ + template + __device__ auto operator()(cub::WarpReduce warp_reduce, TArgs&&... args) const + { + return warp_reduce.Sum(args...); + } +}; + +template +struct warp_reduce_t, T> +{ + template + __device__ auto operator()(cub::WarpReduce warp_reduce, TArgs&&... args) const + { + return warp_reduce.Max(args...); + } +}; + +template +struct warp_reduce_t, T> +{ + template + __device__ auto operator()(cub::WarpReduce warp_reduce, TArgs&&... args) const + { + return warp_reduce.Min(args...); + } +}; + +/** + * @brief Delegate wrapper for WarpReduce::Sum + */ +// template +// struct warp_sum_t +//{ +// template +// __device__ auto operator()(cub::WarpReduce warp_reduce, const T& thread_data) const +// { +// return warp_reduce.Sum(thread_data); +// } +// }; +// +///** +// * @brief Delegate wrapper for partial WarpReduce::Sum +// */ +// template +// struct warp_sum_partial_t +//{ +// int num_valid; +// template +// __device__ __forceinline__ T +// operator()(int linear_tid, cub::WarpReduce& warp_reduce, T& thread_data) const +// { +// auto result = warp_reduce.Sum(thread_data, num_valid); +// return ((linear_tid % LogicalWarpThreads) == 0) ? result : thread_data; +// } +//}; + +/** + * @brief Delegate wrapper for WarpReduce::Reduce + */ +// template +// struct warp_reduce_t +//{ +// ReductionOpT reduction_op; +// template +// __device__ __forceinline__ T +// operator()(int linear_tid, cub::WarpReduce& warp_reduce, T& thread_data) const +// { +// auto result = warp_reduce.Reduce(thread_data, reduction_op); +// return ((linear_tid % LogicalWarpThreads) == 0) ? result : thread_data; +// } +// }; +// +///** +// * @brief Delegate wrapper for partial WarpReduce::Reduce +// */ +// template +// struct warp_reduce_partial_t +//{ +// int num_valid; +// ReductionOpT reduction_op; +// template +// __device__ T operator()(int linear_tid, cub::WarpReduce& warp_reduce, T& thread_data) const +// { +// auto result = warp_reduce.Reduce(thread_data, reduction_op, num_valid); +// return ((linear_tid % LogicalWarpThreads) == 0) ? result : thread_data; +// } +//}; + +template +void warp_reduce_launch(c2h::device_vector& input, c2h::device_vector& output, TArgs... args) +{ + warp_reduce_kernel<<<1, TotalWarps * warp_size>>>( + thrust::raw_pointer_cast(input.data()), thrust::raw_pointer_cast(output.data()), args...); + + REQUIRE(cudaSuccess == cudaPeekAtLastError()); + REQUIRE(cudaSuccess == cudaDeviceSynchronize()); +} + +template +void warp_reduce_multiple_items_launch(c2h::device_vector& input, c2h::device_vector& output, TArgs... 
args) +{ + warp_reduce_multiple_items_kernel<<<1, TotalWarps * warp_size>>>( + thrust::raw_pointer_cast(input.data()), thrust::raw_pointer_cast(output.data()), args...); + + REQUIRE(cudaSuccess == cudaPeekAtLastError()); + REQUIRE(cudaSuccess == cudaDeviceSynchronize()); +} + +/*********************************************************************************************************************** + * Types + **********************************************************************************************************************/ +#if 0 +// List of types to test +using custom_t = + c2h::custom_type_t; + +using full_type_list = c2h::type_list; + +using builtin_type_list = c2h::type_list; + +using predefined_op_list = c2h::type_list<::cuda::std::plus<>, ::cuda::maximum<>, ::cuda::minimum<>>; + +using logical_warp_threads = c2h::enum_type_list; + +#else +using builtin_type_list = c2h::type_list; + +using predefined_op_list = c2h::type_list<::cuda::std::plus<>>; + +using logical_warp_threads = c2h::enum_type_list; +#endif + +/*********************************************************************************************************************** + * Reference + **********************************************************************************************************************/ + +template +void compute_host_reference( + const c2h::host_vector& h_in, + c2h::host_vector& h_out, + int total_warps, + int items_per_warp, + int logical_warps, + int logical_warp_stride, + int items_per_logical_warp = 0) +{ + constexpr auto identity = operator_identity_v; + items_per_logical_warp = items_per_logical_warp == 0 ? logical_warp_stride : items_per_logical_warp; + for (int i = 0; i < total_warps; ++i) + { + for (int j = 0; j < logical_warps; ++j) + { + auto start = h_in.begin() + i * items_per_warp + j * logical_warp_stride; + auto end = start + items_per_logical_warp; + h_out[i * logical_warps + j] = std::accumulate(start, end, identity, predefined_op{}); + } + } +} + +/*********************************************************************************************************************** + * Test cases + **********************************************************************************************************************/ +/* +C2H_TEST("WarpReduce::Sum", "[reduce][warp][predefined][full]", full_type_list, logical_warp_threads) +{ + using type = c2h::get<0, TestType>; + constexpr auto logical_warp_threads = c2h::get<1, TestType>::value; + constexpr auto total_warps = 4u; + constexpr bool is_power_of_two = cuda::std::has_single_bit(logical_warp_threads); + constexpr auto logical_warps = is_power_of_two ? 
warp_size / logical_warp_threads : 1; + constexpr auto input_size = total_warps * warp_size; + constexpr auto output_size = total_warps * logical_warps; + CAPTURE(c2h::type_name(), logical_warp_threads); + c2h::device_vector d_in(input_size); + c2h::device_vector d_out(output_size); + c2h::gen(C2H_SEED(10), d_in); + // Run test + warp_reduce_launch(d_in, d_out, warp_reduce_t, type>{}); + + c2h::host_vector h_in = d_in; + c2h::host_vector h_out(output_size); + compute_host_reference>(h_in, h_out, total_warps, warp_size, logical_warps, logical_warp_threads); + verify_results(h_out, d_out); +} + +C2H_TEST("WarpReduce::Sum/Max/Min", + "[reduce][warp][predefined][full]", + builtin_type_list, + predefined_op_list, + logical_warp_threads) +{ + using type = c2h::get<0, TestType>; + using predefined_op = c2h::get<1, TestType>; + constexpr auto logical_warp_threads = c2h::get<2, TestType>::value; + constexpr auto total_warps = 4u; + constexpr bool is_power_of_two = cuda::std::has_single_bit(logical_warp_threads); + constexpr auto logical_warps = is_power_of_two ? warp_size / logical_warp_threads : 1; + constexpr auto input_size = total_warps * warp_size; + constexpr auto output_size = total_warps * logical_warps; + CAPTURE(c2h::type_name(), c2h::type_name(), logical_warp_threads); + c2h::device_vector d_in(input_size); + c2h::device_vector d_out(output_size); + c2h::gen(C2H_SEED(10), d_in); + // Run test + warp_reduce_launch(d_in, d_out, warp_reduce_t{}); + + c2h::host_vector h_in = d_in; + c2h::host_vector h_out(output_size); + compute_host_reference(h_in, h_out, total_warps, warp_size, logical_warps, logical_warp_threads); + verify_results(h_out, d_out); +} + +C2H_TEST("WarpReduce::Sum", "[reduce][warp][generic][full]", full_type_list, logical_warp_threads) +{ + using type = c2h::get<0, TestType>; + constexpr auto logical_warp_threads = c2h::get<1, TestType>::value; + constexpr auto total_warps = 4u; + constexpr bool is_power_of_two = cuda::std::has_single_bit(logical_warp_threads); + constexpr auto logical_warps = is_power_of_two ? warp_size / logical_warp_threads : 1; + constexpr auto input_size = total_warps * warp_size; + constexpr auto output_size = total_warps * logical_warps; + CAPTURE(c2h::type_name(), logical_warp_threads); + c2h::device_vector d_in(input_size); + c2h::device_vector d_out(output_size); + c2h::gen(C2H_SEED(1), d_in); + // Run test + warp_reduce_launch(d_in, d_out, warp_reduce_t{}); + + c2h::host_vector h_in = d_in; + c2h::host_vector h_out(output_size); + compute_host_reference>(h_in, h_out, total_warps, warp_size, logical_warps, logical_warp_threads); + verify_results(h_out, d_out); +} + +//---------------------------------------------------------------------------------------------------------------------- +// partial + +C2H_TEST("WarpReduce::Sum/Max/Min Partial", + "[reduce][warp][predefined][partial]", + builtin_type_list, + predefined_op_list, + logical_warp_threads) +{ + using type = c2h::get<0, TestType>; + using predefined_op = c2h::get<1, TestType>; + constexpr auto logical_warp_threads = c2h::get<2, TestType>::value; + constexpr auto total_warps = 4u; + constexpr bool is_power_of_two = cuda::std::has_single_bit(logical_warp_threads); + constexpr auto logical_warps = is_power_of_two ? 
warp_size / logical_warp_threads : 1; + constexpr auto input_size = total_warps * warp_size; + constexpr auto output_size = total_warps * logical_warps; + const int valid_items = GENERATE_COPY(take(2, random(1u, logical_warp_threads))); + CAPTURE(c2h::type_name(), c2h::type_name(), logical_warp_threads, valid_items); + c2h::device_vector d_in(input_size); + c2h::device_vector d_out(output_size); + c2h::gen(C2H_SEED(10), d_in); + // Run test + warp_reduce_launch(d_in, d_out, warp_reduce_t{}, valid_items); + + c2h::host_vector h_in = d_in; + c2h::host_vector h_out(output_size); + compute_host_reference( + h_in, h_out, total_warps, warp_size, logical_warps, logical_warp_threads, valid_items); + verify_results(h_out, d_out); +} + +C2H_TEST("WarpReduce::Sum", "[reduce][warp][generic][partial]", full_type_list, logical_warp_threads) +{ + using type = c2h::get<0, TestType>; + constexpr auto logical_warp_threads = c2h::get<1, TestType>::value; + constexpr auto total_warps = 4u; + constexpr bool is_power_of_two = cuda::std::has_single_bit(logical_warp_threads); + constexpr auto logical_warps = is_power_of_two ? warp_size / logical_warp_threads : 1; + constexpr auto input_size = total_warps * warp_size; + constexpr auto output_size = total_warps * logical_warps; + const int valid_items = GENERATE_COPY(take(2, random(1u, logical_warp_threads))); + CAPTURE(c2h::type_name(), logical_warp_threads); + c2h::device_vector d_in(input_size); + c2h::device_vector d_out(output_size); + c2h::gen(C2H_SEED(1), d_in); + // Run test + warp_reduce_launch(d_in, d_out, warp_reduce_t{}, valid_items); + + c2h::host_vector h_in = d_in; + c2h::host_vector h_out(output_size); + compute_host_reference>( + h_in, h_out, total_warps, warp_size, logical_warps, logical_warp_threads, valid_items); + verify_results(h_out, d_out); +} +*/ +//---------------------------------------------------------------------------------------------------------------------- +// multiple items per thread + +C2H_TEST("WarpReduce::Sum/Max/Min Multiple Items Per Thread", + "[reduce][warp][predefined][full]", + builtin_type_list, + predefined_op_list, + logical_warp_threads) +{ + using type = c2h::get<0, TestType>; + using predefined_op = c2h::get<1, TestType>; + constexpr auto logical_warp_threads = c2h::get<2, TestType>::value; + constexpr auto total_warps = 4u; + constexpr bool is_power_of_two = cuda::std::has_single_bit(logical_warp_threads); + constexpr auto logical_warps = is_power_of_two ? 
warp_size / logical_warp_threads : 1; + constexpr auto input_size = total_warps * warp_size * items_per_thread; + constexpr auto output_size = total_warps * logical_warps; + CAPTURE(c2h::type_name(), c2h::type_name(), logical_warp_threads); + c2h::device_vector d_in(input_size, 1); + c2h::device_vector d_out(output_size); + // c2h::gen(C2H_SEED(1), d_in); + // Run test + warp_reduce_multiple_items_launch( + d_in, d_out, warp_reduce_t{}); + + c2h::host_vector h_in = d_in; + c2h::host_vector h_out(output_size); + compute_host_reference( + h_in, + h_out, + total_warps, + warp_size * items_per_thread, + logical_warps, + logical_warp_threads, + logical_warp_threads * items_per_thread); + verify_results(h_out, d_out); +} + +#if 0 +C2H_TEST("Warp sum works", "[reduce][warp][predefined]", builtin_type_list, logical_warp_threads, predefined_op_list) +{ + using params = params_t; + constexpr auto logical_warp_threads = params::logical_warp_threads; + constexpr auto total_warps = params::total_warps; + using type = typename params::type; + // Prepare test data + c2h::device_vector d_in(params::input_size); + c2h::device_vector d_out(total_warps); + constexpr auto valid_items = logical_warp_threads; + c2h::gen(C2H_SEED(10), d_in); + // Run test + warp_reduce(d_in, d_out, warp_sum_t{}); + + c2h::host_vector h_in = d_in; + c2h::host_vector h_out(total_warps); + for (int i = 0; i < total_warps; ++i) + { + auto start = h_in.begin() + i * logical_warp_threads; + auto end = h_in.begin() + (i + 1) * logical_warp_threads; + h_out[i] = std::accumulate(start, end, type{}); + } + verify_results(h_out, d_out); +} + +C2H_TEST("Warp reduce works", "[reduce][warp]", builtin_type_list, logical_warp_threads) +{ + using params = params_t; + using type = typename params::type; + using red_op_t = ::cuda::minimum<>; + + // Prepare test data + c2h::device_vector d_in(params::input_size); + c2h::device_vector d_out(params::input_size); + constexpr auto valid_items = params::logical_warp_threads; + c2h::gen(C2H_SEED(10), d_in); + + // Run test + warp_reduce(d_in, d_out, warp_reduce_t{red_op_t{}}); + + // Prepare verification data + c2h::host_vector h_in = d_in; + c2h::host_vector h_out = h_in; + auto h_flags = thrust::make_constant_iterator(false); + compute_host_reference( + reduce_mode::all, + h_in, + h_flags, + params::total_warps, + params::logical_warp_threads, + valid_items, + red_op_t{}, + h_out.begin()); + + // Verify results + verify_results(h_out, d_out); +} + +C2H_TEST("Warp sum on partial warp works", "[reduce][warp]", full_type_list, logical_warp_threads) +{ + using params = params_t; + using type = typename params::type; + + // Prepare test data + c2h::device_vector d_in(params::input_size); + c2h::device_vector d_out(params::input_size); + const int valid_items = GENERATE_COPY(take(2, random(1, params::logical_warp_threads))); + c2h::gen(C2H_SEED(10), d_in); + + // Run test + warp_reduce(d_in, d_out, warp_sum_partial_t{valid_items}); + + // Prepare verification data + c2h::host_vector h_in = d_in; + c2h::host_vector h_out = h_in; + auto h_flags = thrust::make_constant_iterator(false); + compute_host_reference( + reduce_mode::all, + h_in, + h_flags, + params::total_warps, + params::logical_warp_threads, + valid_items, + ::cuda::std::plus{}, + h_out.begin()); + + // Verify results + verify_results(h_out, d_out); +} + +C2H_TEST("Warp reduce on partial warp works", "[reduce][warp]", builtin_type_list, logical_warp_threads) +{ + using params = params_t; + using type = typename params::type; + using red_op_t = 
::cuda::minimum<>; + + // Prepare test data + c2h::device_vector d_in(params::input_size); + c2h::device_vector d_out(params::input_size); + const int valid_items = GENERATE_COPY(take(2, random(1, params::logical_warp_threads))); + c2h::gen(C2H_SEED(10), d_in); + + // Run test + warp_reduce( + d_in, d_out, warp_reduce_partial_t{valid_items, red_op_t{}}); + + // Prepare verification data + c2h::host_vector h_in = d_in; + c2h::host_vector h_out = h_in; + auto h_flags = thrust::make_constant_iterator(false); + compute_host_reference( + reduce_mode::all, + h_in, + h_flags, + params::total_warps, + params::logical_warp_threads, + valid_items, + red_op_t{}, + h_out.begin()); + + // Verify results + verify_results(h_out, d_out); +} +#endif diff --git a/cub/test/catch2_test_warp_reduce.cu b/cub/test/warp/catch2_test_warp_segmented_reduce.cu similarity index 100% rename from cub/test/catch2_test_warp_reduce.cu rename to cub/test/warp/catch2_test_warp_segmented_reduce.cu From 4fd702d48f81d1ce83814eb4c711ffdecd828101 Mon Sep 17 00:00:00 2001 From: fbusato Date: Thu, 20 Feb 2025 09:29:01 -0800 Subject: [PATCH 04/12] remove volatile --- cub/cub/warp/specializations/warp_reduce_smem.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cub/cub/warp/specializations/warp_reduce_smem.cuh b/cub/cub/warp/specializations/warp_reduce_smem.cuh index 1a373204b63..8e7ae3bd9b1 100644 --- a/cub/cub/warp/specializations/warp_reduce_smem.cuh +++ b/cub/cub/warp/specializations/warp_reduce_smem.cuh @@ -291,7 +291,7 @@ struct WarpReduceSmem }; // Alias flags onto shared data storage - volatile SmemFlag* flag_storage = temp_storage.flags; + SmemFlag* flag_storage = temp_storage.flags; SmemFlag flag_status = (flag) ? SET : UNSET; From 3b5e5d490384f1f7d0db9d2799e49a22fc797977 Mon Sep 17 00:00:00 2001 From: fbusato Date: Thu, 20 Feb 2025 09:29:13 -0800 Subject: [PATCH 05/12] add missing overloadings --- cub/cub/warp/warp_reduce.cuh | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/cub/cub/warp/warp_reduce.cuh b/cub/cub/warp/warp_reduce.cuh index 183d3454ba3..678c0892480 100644 --- a/cub/cub/warp/warp_reduce.cuh +++ b/cub/cub/warp/warp_reduce.cuh @@ -320,6 +320,18 @@ public: return InternalWarpReduce(temp_storage).template Reduce(input, valid_items, ::cuda::std::plus<>{}); } + _CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE T Max(T input, int valid_items) + { + // Determine if we don't need bounds checking + return InternalWarpReduce(temp_storage).template Reduce(input, valid_items, ::cuda::maximum<>{}); + } + + _CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE T Min(T input, int valid_items) + { + // Determine if we don't need bounds checking + return InternalWarpReduce(temp_storage).template Reduce(input, valid_items, ::cuda::minimum<>{}); + } + //! @rst //! Computes a segmented sum in the calling warp where segments are defined by head-flags. //! 
The sum of each segment is returned to the first lane in that segment @@ -727,7 +739,6 @@ public: return cub::ThreadReduce(input, ::cuda::std::plus<>{}); } - template _CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE T Sum(T input, int /* valid_items */) { return input; @@ -746,7 +757,6 @@ public: return cub::ThreadReduce(input, ::cuda::maximum<>{}); } - template _CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE T Max(T input, int /* valid_items */) { return input; From 0da4c572bfa8f4a26e5593cf8636d6493df56819 Mon Sep 17 00:00:00 2001 From: fbusato Date: Thu, 20 Feb 2025 11:06:16 -0800 Subject: [PATCH 06/12] fix type traits --- cub/cub/warp/warp_reduce.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cub/cub/warp/warp_reduce.cuh b/cub/cub/warp/warp_reduce.cuh index 678c0892480..37966696807 100644 --- a/cub/cub/warp/warp_reduce.cuh +++ b/cub/cub/warp/warp_reduce.cuh @@ -229,7 +229,7 @@ public: template _CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE // - ::cuda::std::enable_if_t{}, T> // + ::cuda::std::enable_if_t::value, T> // Sum(const InputType& input) { auto thread_reduction = cub::ThreadReduce(input, ::cuda::std::plus<>{}); From bab0a3b4d90237c960b05e107fbce83d0856b02b Mon Sep 17 00:00:00 2001 From: fbusato Date: Thu, 20 Feb 2025 11:24:17 -0800 Subject: [PATCH 07/12] fix overloadings --- cub/cub/warp/warp_reduce.cuh | 14 +-- cub/test/warp/catch2_test_warp_reduce.cu | 137 +---------------------- 2 files changed, 12 insertions(+), 139 deletions(-) diff --git a/cub/cub/warp/warp_reduce.cuh b/cub/cub/warp/warp_reduce.cuh index 37966696807..3776496959b 100644 --- a/cub/cub/warp/warp_reduce.cuh +++ b/cub/cub/warp/warp_reduce.cuh @@ -244,7 +244,7 @@ public: template _CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE // - ::cuda::std::enable_if_t{}, T> // + ::cuda::std::enable_if_t::value, T> // Max(const InputType& input) { auto thread_reduction = cub::ThreadReduce(input, ::cuda::maximum<>{}); @@ -259,7 +259,7 @@ public: template _CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE // - ::cuda::std::enable_if_t{}, T> // + ::cuda::std::enable_if_t::value, T> // Min(const InputType& input) { auto thread_reduction = cub::ThreadReduce(input, ::cuda::minimum<>{}); @@ -501,7 +501,7 @@ public: template _CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE // - ::cuda::std::enable_if_t{}, T> // + ::cuda::std::enable_if_t::value, T> // Reduce(const InputType& input, ReductionOp reduction_op) { auto thread_reduction = cub::ThreadReduce(input, reduction_op); @@ -733,7 +733,7 @@ public: template _CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE // - ::cuda::std::enable_if_t{}, T> // + ::cuda::std::enable_if_t::value, T> // Sum(const InputType& input) { return cub::ThreadReduce(input, ::cuda::std::plus<>{}); @@ -751,7 +751,7 @@ public: template _CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE // - ::cuda::std::enable_if_t{}, T> // + ::cuda::std::enable_if_t::value, T> // Max(const InputType& input) { return cub::ThreadReduce(input, ::cuda::maximum<>{}); @@ -769,7 +769,7 @@ public: template _CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE // - ::cuda::std::enable_if_t{}, T> // + ::cuda::std::enable_if_t::value, T> // Min(const InputType& input) { return cub::ThreadReduce(input, ::cuda::minimum<>{}); @@ -800,7 +800,7 @@ public: template _CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE // - ::cuda::std::enable_if_t{}, T> // + ::cuda::std::enable_if_t::value, T> // Reduce(const InputType& input, ReductionOp reduction_op) { return cub::ThreadReduce(input, reduction_op); diff --git 
a/cub/test/warp/catch2_test_warp_reduce.cu b/cub/test/warp/catch2_test_warp_reduce.cu index 78bf960e745..ea103d7c315 100644 --- a/cub/test/warp/catch2_test_warp_reduce.cu +++ b/cub/test/warp/catch2_test_warp_reduce.cu @@ -109,7 +109,6 @@ __global__ void warp_reduce_multiple_items_kernel(T* input, T* output, Reduction thread_data[i] = input[threadIdx.x * items_per_thread + i]; } warp_reduce_t warp_reduce{storage[logical_warp]}; - static_assert(cub::detail::is_fixed_size_random_access_range_t{}); auto result = reduction_op(warp_reduce, thread_data); if (logical_lane == 0) { @@ -247,7 +246,7 @@ void warp_reduce_multiple_items_launch(c2h::device_vector& input, c2h::device /*********************************************************************************************************************** * Types **********************************************************************************************************************/ -#if 0 +#if 1 // List of types to test using custom_t = c2h::custom_type_t; @@ -299,7 +298,7 @@ void compute_host_reference( * Test cases **********************************************************************************************************************/ /* -C2H_TEST("WarpReduce::Sum", "[reduce][warp][predefined][full]", full_type_list, logical_warp_threads) +C2H_TEST("WarpReduce::Sum", "[reduce][warp][predefined_op][full]", full_type_list, logical_warp_threads) { using type = c2h::get<0, TestType>; constexpr auto logical_warp_threads = c2h::get<1, TestType>::value; @@ -322,7 +321,7 @@ C2H_TEST("WarpReduce::Sum", "[reduce][warp][predefined][full]", full_type_list, } C2H_TEST("WarpReduce::Sum/Max/Min", - "[reduce][warp][predefined][full]", + "[reduce][warp][predefined_op][full]", builtin_type_list, predefined_op_list, logical_warp_threads) @@ -374,7 +373,7 @@ C2H_TEST("WarpReduce::Sum", "[reduce][warp][generic][full]", full_type_list, log // partial C2H_TEST("WarpReduce::Sum/Max/Min Partial", - "[reduce][warp][predefined][partial]", + "[reduce][warp][predefined_op][partial]", builtin_type_list, predefined_op_list, logical_warp_threads) @@ -430,7 +429,7 @@ C2H_TEST("WarpReduce::Sum", "[reduce][warp][generic][partial]", full_type_list, // multiple items per thread C2H_TEST("WarpReduce::Sum/Max/Min Multiple Items Per Thread", - "[reduce][warp][predefined][full]", + "[reduce][warp][predefined_op][full]", builtin_type_list, predefined_op_list, logical_warp_threads) @@ -463,129 +462,3 @@ C2H_TEST("WarpReduce::Sum/Max/Min Multiple Items Per Thread", logical_warp_threads * items_per_thread); verify_results(h_out, d_out); } - -#if 0 -C2H_TEST("Warp sum works", "[reduce][warp][predefined]", builtin_type_list, logical_warp_threads, predefined_op_list) -{ - using params = params_t; - constexpr auto logical_warp_threads = params::logical_warp_threads; - constexpr auto total_warps = params::total_warps; - using type = typename params::type; - // Prepare test data - c2h::device_vector d_in(params::input_size); - c2h::device_vector d_out(total_warps); - constexpr auto valid_items = logical_warp_threads; - c2h::gen(C2H_SEED(10), d_in); - // Run test - warp_reduce(d_in, d_out, warp_sum_t{}); - - c2h::host_vector h_in = d_in; - c2h::host_vector h_out(total_warps); - for (int i = 0; i < total_warps; ++i) - { - auto start = h_in.begin() + i * logical_warp_threads; - auto end = h_in.begin() + (i + 1) * logical_warp_threads; - h_out[i] = std::accumulate(start, end, type{}); - } - verify_results(h_out, d_out); -} - -C2H_TEST("Warp reduce works", "[reduce][warp]", builtin_type_list, logical_warp_threads) 
-{ - using params = params_t; - using type = typename params::type; - using red_op_t = ::cuda::minimum<>; - - // Prepare test data - c2h::device_vector d_in(params::input_size); - c2h::device_vector d_out(params::input_size); - constexpr auto valid_items = params::logical_warp_threads; - c2h::gen(C2H_SEED(10), d_in); - - // Run test - warp_reduce(d_in, d_out, warp_reduce_t{red_op_t{}}); - - // Prepare verification data - c2h::host_vector h_in = d_in; - c2h::host_vector h_out = h_in; - auto h_flags = thrust::make_constant_iterator(false); - compute_host_reference( - reduce_mode::all, - h_in, - h_flags, - params::total_warps, - params::logical_warp_threads, - valid_items, - red_op_t{}, - h_out.begin()); - - // Verify results - verify_results(h_out, d_out); -} - -C2H_TEST("Warp sum on partial warp works", "[reduce][warp]", full_type_list, logical_warp_threads) -{ - using params = params_t; - using type = typename params::type; - - // Prepare test data - c2h::device_vector d_in(params::input_size); - c2h::device_vector d_out(params::input_size); - const int valid_items = GENERATE_COPY(take(2, random(1, params::logical_warp_threads))); - c2h::gen(C2H_SEED(10), d_in); - - // Run test - warp_reduce(d_in, d_out, warp_sum_partial_t{valid_items}); - - // Prepare verification data - c2h::host_vector h_in = d_in; - c2h::host_vector h_out = h_in; - auto h_flags = thrust::make_constant_iterator(false); - compute_host_reference( - reduce_mode::all, - h_in, - h_flags, - params::total_warps, - params::logical_warp_threads, - valid_items, - ::cuda::std::plus{}, - h_out.begin()); - - // Verify results - verify_results(h_out, d_out); -} - -C2H_TEST("Warp reduce on partial warp works", "[reduce][warp]", builtin_type_list, logical_warp_threads) -{ - using params = params_t; - using type = typename params::type; - using red_op_t = ::cuda::minimum<>; - - // Prepare test data - c2h::device_vector d_in(params::input_size); - c2h::device_vector d_out(params::input_size); - const int valid_items = GENERATE_COPY(take(2, random(1, params::logical_warp_threads))); - c2h::gen(C2H_SEED(10), d_in); - - // Run test - warp_reduce( - d_in, d_out, warp_reduce_partial_t{valid_items, red_op_t{}}); - - // Prepare verification data - c2h::host_vector h_in = d_in; - c2h::host_vector h_out = h_in; - auto h_flags = thrust::make_constant_iterator(false); - compute_host_reference( - reduce_mode::all, - h_in, - h_flags, - params::total_warps, - params::logical_warp_threads, - valid_items, - red_op_t{}, - h_out.begin()); - - // Verify results - verify_results(h_out, d_out); -} -#endif From 3ed706605fd8ccf53fb835d925979e9ca2503117 Mon Sep 17 00:00:00 2001 From: fbusato Date: Thu, 20 Feb 2025 12:49:48 -0800 Subject: [PATCH 08/12] simplify unit test --- cub/test/warp/catch2_test_warp_reduce.cu | 260 +++++++---------------- 1 file changed, 76 insertions(+), 184 deletions(-) diff --git a/cub/test/warp/catch2_test_warp_reduce.cu b/cub/test/warp/catch2_test_warp_reduce.cu index ea103d7c315..c8c5fd3c1bd 100644 --- a/cub/test/warp/catch2_test_warp_reduce.cu +++ b/cub/test/warp/catch2_test_warp_reduce.cu @@ -40,61 +40,49 @@ #include #include -constexpr int warp_size = 32; +/*********************************************************************************************************************** + * Constants + **********************************************************************************************************************/ -template -__global__ void warp_reduce_kernel(T* input, T* output, ReductionOp reduction_op) -{ - using warp_reduce_t = 
cub::WarpReduce; - using storage_t = typename warp_reduce_t::TempStorage; - __shared__ storage_t storage[TotalWarps]; - constexpr bool is_power_of_two = cuda::std::has_single_bit(LogicalWarpThreads); - auto lane = cuda::ptx::get_sreg_laneid(); - auto logical_warp = is_power_of_two ? threadIdx.x / LogicalWarpThreads : threadIdx.x / warp_size; - auto logical_lane = is_power_of_two ? threadIdx.x % LogicalWarpThreads : lane; - if (!is_power_of_two && lane >= LogicalWarpThreads) - { - return; - } - auto thread_data = input[threadIdx.x]; - warp_reduce_t warp_reduce{storage[logical_warp]}; - auto result = reduction_op(warp_reduce, thread_data); - if (logical_lane == 0) - { - output[logical_warp] = result; - } -} +inline constexpr int warp_size = 32; +inline constexpr auto total_warps = 4u; +inline constexpr int items_per_thread = 4; + +/*********************************************************************************************************************** + * Kernel + **********************************************************************************************************************/ -template -__global__ void warp_reduce_kernel(T* input, T* output, ReductionOp reduction_op, int num_items) +template +__global__ void warp_reduce_kernel(T* input, T* output, ReductionOp reduction_op, int num_items = 0) { using warp_reduce_t = cub::WarpReduce; using storage_t = typename warp_reduce_t::TempStorage; - __shared__ storage_t storage[TotalWarps]; + __shared__ storage_t storage[total_warps]; constexpr bool is_power_of_two = cuda::std::has_single_bit(LogicalWarpThreads); auto lane = cuda::ptx::get_sreg_laneid(); auto logical_warp = is_power_of_two ? threadIdx.x / LogicalWarpThreads : threadIdx.x / warp_size; auto logical_lane = is_power_of_two ? threadIdx.x % LogicalWarpThreads : lane; - if (!is_power_of_two && lane >= num_items) + auto limit = EnableNumItems ? num_items : LogicalWarpThreads; + if (!is_power_of_two && lane >= limit) { return; } auto thread_data = input[threadIdx.x]; warp_reduce_t warp_reduce{storage[logical_warp]}; - auto result = reduction_op(warp_reduce, thread_data, num_items); + auto result = EnableNumItems ? reduction_op(warp_reduce, thread_data, num_items) // + : reduction_op(warp_reduce, thread_data); if (logical_lane == 0) { output[logical_warp] = result; } } -inline constexpr int items_per_thread = 4; -template +template __global__ void warp_reduce_multiple_items_kernel(T* input, T* output, ReductionOp reduction_op) { using warp_reduce_t = cub::WarpReduce; using storage_t = typename warp_reduce_t::TempStorage; - __shared__ storage_t storage[TotalWarps]; + __shared__ storage_t storage[total_warps]; constexpr bool is_power_of_two = cuda::std::has_single_bit(LogicalWarpThreads); auto lane = cuda::ptx::get_sreg_laneid(); auto logical_warp = is_power_of_two ? 
threadIdx.x / LogicalWarpThreads : threadIdx.x / warp_size; @@ -162,81 +150,20 @@ struct warp_reduce_t, T> } }; -/** - * @brief Delegate wrapper for WarpReduce::Sum - */ -// template -// struct warp_sum_t -//{ -// template -// __device__ auto operator()(cub::WarpReduce warp_reduce, const T& thread_data) const -// { -// return warp_reduce.Sum(thread_data); -// } -// }; -// -///** -// * @brief Delegate wrapper for partial WarpReduce::Sum -// */ -// template -// struct warp_sum_partial_t -//{ -// int num_valid; -// template -// __device__ __forceinline__ T -// operator()(int linear_tid, cub::WarpReduce& warp_reduce, T& thread_data) const -// { -// auto result = warp_reduce.Sum(thread_data, num_valid); -// return ((linear_tid % LogicalWarpThreads) == 0) ? result : thread_data; -// } -//}; - -/** - * @brief Delegate wrapper for WarpReduce::Reduce - */ -// template -// struct warp_reduce_t -//{ -// ReductionOpT reduction_op; -// template -// __device__ __forceinline__ T -// operator()(int linear_tid, cub::WarpReduce& warp_reduce, T& thread_data) const -// { -// auto result = warp_reduce.Reduce(thread_data, reduction_op); -// return ((linear_tid % LogicalWarpThreads) == 0) ? result : thread_data; -// } -// }; -// -///** -// * @brief Delegate wrapper for partial WarpReduce::Reduce -// */ -// template -// struct warp_reduce_partial_t -//{ -// int num_valid; -// ReductionOpT reduction_op; -// template -// __device__ T operator()(int linear_tid, cub::WarpReduce& warp_reduce, T& thread_data) const -// { -// auto result = warp_reduce.Reduce(thread_data, reduction_op, num_valid); -// return ((linear_tid % LogicalWarpThreads) == 0) ? result : thread_data; -// } -//}; - -template +template void warp_reduce_launch(c2h::device_vector& input, c2h::device_vector& output, TArgs... args) { - warp_reduce_kernel<<<1, TotalWarps * warp_size>>>( + warp_reduce_kernel<<<1, total_warps * warp_size>>>( thrust::raw_pointer_cast(input.data()), thrust::raw_pointer_cast(output.data()), args...); REQUIRE(cudaSuccess == cudaPeekAtLastError()); REQUIRE(cudaSuccess == cudaDeviceSynchronize()); } -template +template void warp_reduce_multiple_items_launch(c2h::device_vector& input, c2h::device_vector& output, TArgs... 
args) { - warp_reduce_multiple_items_kernel<<<1, TotalWarps * warp_size>>>( + warp_reduce_multiple_items_kernel<<<1, total_warps * warp_size>>>( thrust::raw_pointer_cast(input.data()), thrust::raw_pointer_cast(output.data()), args...); REQUIRE(cudaSuccess == cudaPeekAtLastError()); @@ -246,8 +173,7 @@ void warp_reduce_multiple_items_launch(c2h::device_vector& input, c2h::device /*********************************************************************************************************************** * Types **********************************************************************************************************************/ -#if 1 -// List of types to test + using custom_t = c2h::custom_type_t; @@ -259,14 +185,6 @@ using predefined_op_list = c2h::type_list<::cuda::std::plus<>, ::cuda::maximum<> using logical_warp_threads = c2h::enum_type_list; -#else -using builtin_type_list = c2h::type_list; - -using predefined_op_list = c2h::type_list<::cuda::std::plus<>>; - -using logical_warp_threads = c2h::enum_type_list; -#endif - /*********************************************************************************************************************** * Reference **********************************************************************************************************************/ @@ -275,7 +193,6 @@ template void compute_host_reference( const c2h::host_vector& h_in, c2h::host_vector& h_out, - int total_warps, int items_per_warp, int logical_warps, int logical_warp_stride, @@ -283,7 +200,7 @@ void compute_host_reference( { constexpr auto identity = operator_identity_v; items_per_logical_warp = items_per_logical_warp == 0 ? logical_warp_stride : items_per_logical_warp; - for (int i = 0; i < total_warps; ++i) + for (unsigned i = 0; i < total_warps; ++i) { for (int j = 0; j < logical_warps; ++j) { @@ -294,78 +211,72 @@ void compute_host_reference( } } +std::array get_test_config(unsigned logical_warp_threads, unsigned items_per_thread1 = 1) +{ + bool is_power_of_two = cuda::std::has_single_bit(logical_warp_threads); + auto logical_warps = is_power_of_two ? warp_size / logical_warp_threads : 1; + auto input_size = total_warps * warp_size * items_per_thread1; + auto output_size = total_warps * logical_warps; + return {input_size, output_size, logical_warps}; +} + /*********************************************************************************************************************** * Test cases **********************************************************************************************************************/ -/* -C2H_TEST("WarpReduce::Sum", "[reduce][warp][predefined_op][full]", full_type_list, logical_warp_threads) + +C2H_TEST("WarpReduce::Sum, full_type_list", "[reduce][warp][predefined_op][full]", full_type_list, logical_warp_threads) { - using type = c2h::get<0, TestType>; - constexpr auto logical_warp_threads = c2h::get<1, TestType>::value; - constexpr auto total_warps = 4u; - constexpr bool is_power_of_two = cuda::std::has_single_bit(logical_warp_threads); - constexpr auto logical_warps = is_power_of_two ? 
warp_size / logical_warp_threads : 1; - constexpr auto input_size = total_warps * warp_size; - constexpr auto output_size = total_warps * logical_warps; + using type = c2h::get<0, TestType>; + constexpr auto logical_warp_threads = c2h::get<1, TestType>::value; + auto [input_size, output_size, logical_warps] = get_test_config(logical_warp_threads); CAPTURE(c2h::type_name(), logical_warp_threads); c2h::device_vector d_in(input_size); c2h::device_vector d_out(output_size); c2h::gen(C2H_SEED(10), d_in); - // Run test - warp_reduce_launch(d_in, d_out, warp_reduce_t, type>{}); + warp_reduce_launch(d_in, d_out, warp_reduce_t, type>{}); c2h::host_vector h_in = d_in; c2h::host_vector h_out(output_size); - compute_host_reference>(h_in, h_out, total_warps, warp_size, logical_warps, logical_warp_threads); + compute_host_reference>(h_in, h_out, warp_size, logical_warps, logical_warp_threads); verify_results(h_out, d_out); } -C2H_TEST("WarpReduce::Sum/Max/Min", +C2H_TEST("WarpReduce::Sum/Max/Min, builtin types", "[reduce][warp][predefined_op][full]", builtin_type_list, predefined_op_list, logical_warp_threads) { - using type = c2h::get<0, TestType>; - using predefined_op = c2h::get<1, TestType>; - constexpr auto logical_warp_threads = c2h::get<2, TestType>::value; - constexpr auto total_warps = 4u; - constexpr bool is_power_of_two = cuda::std::has_single_bit(logical_warp_threads); - constexpr auto logical_warps = is_power_of_two ? warp_size / logical_warp_threads : 1; - constexpr auto input_size = total_warps * warp_size; - constexpr auto output_size = total_warps * logical_warps; + using type = c2h::get<0, TestType>; + using predefined_op = c2h::get<1, TestType>; + constexpr auto logical_warp_threads = c2h::get<2, TestType>::value; + auto [input_size, output_size, logical_warps] = get_test_config(logical_warp_threads); CAPTURE(c2h::type_name(), c2h::type_name(), logical_warp_threads); c2h::device_vector d_in(input_size); c2h::device_vector d_out(output_size); c2h::gen(C2H_SEED(10), d_in); - // Run test - warp_reduce_launch(d_in, d_out, warp_reduce_t{}); + warp_reduce_launch(d_in, d_out, warp_reduce_t{}); c2h::host_vector h_in = d_in; c2h::host_vector h_out(output_size); - compute_host_reference(h_in, h_out, total_warps, warp_size, logical_warps, logical_warp_threads); + compute_host_reference(h_in, h_out, warp_size, logical_warps, logical_warp_threads); verify_results(h_out, d_out); } C2H_TEST("WarpReduce::Sum", "[reduce][warp][generic][full]", full_type_list, logical_warp_threads) { - using type = c2h::get<0, TestType>; - constexpr auto logical_warp_threads = c2h::get<1, TestType>::value; - constexpr auto total_warps = 4u; - constexpr bool is_power_of_two = cuda::std::has_single_bit(logical_warp_threads); - constexpr auto logical_warps = is_power_of_two ? 
warp_size / logical_warp_threads : 1; - constexpr auto input_size = total_warps * warp_size; - constexpr auto output_size = total_warps * logical_warps; + using type = c2h::get<0, TestType>; + constexpr auto logical_warp_threads = c2h::get<1, TestType>::value; + auto [input_size, output_size, logical_warps] = get_test_config(logical_warp_threads); CAPTURE(c2h::type_name(), logical_warp_threads); c2h::device_vector d_in(input_size); c2h::device_vector d_out(output_size); c2h::gen(C2H_SEED(1), d_in); - // Run test - warp_reduce_launch(d_in, d_out, warp_reduce_t{}); + warp_reduce_launch(d_in, d_out, warp_reduce_t{}); c2h::host_vector h_in = d_in; c2h::host_vector h_out(output_size); - compute_host_reference>(h_in, h_out, total_warps, warp_size, logical_warps, logical_warp_threads); + compute_host_reference>(h_in, h_out, warp_size, logical_warps, logical_warp_threads); verify_results(h_out, d_out); } @@ -378,53 +289,41 @@ C2H_TEST("WarpReduce::Sum/Max/Min Partial", predefined_op_list, logical_warp_threads) { - using type = c2h::get<0, TestType>; - using predefined_op = c2h::get<1, TestType>; - constexpr auto logical_warp_threads = c2h::get<2, TestType>::value; - constexpr auto total_warps = 4u; - constexpr bool is_power_of_two = cuda::std::has_single_bit(logical_warp_threads); - constexpr auto logical_warps = is_power_of_two ? warp_size / logical_warp_threads : 1; - constexpr auto input_size = total_warps * warp_size; - constexpr auto output_size = total_warps * logical_warps; - const int valid_items = GENERATE_COPY(take(2, random(1u, logical_warp_threads))); + using type = c2h::get<0, TestType>; + using predefined_op = c2h::get<1, TestType>; + constexpr auto logical_warp_threads = c2h::get<2, TestType>::value; + auto [input_size, output_size, logical_warps] = get_test_config(logical_warp_threads); + const int valid_items = GENERATE_COPY(take(2, random(1u, logical_warp_threads))); CAPTURE(c2h::type_name(), c2h::type_name(), logical_warp_threads, valid_items); c2h::device_vector d_in(input_size); c2h::device_vector d_out(output_size); c2h::gen(C2H_SEED(10), d_in); - // Run test - warp_reduce_launch(d_in, d_out, warp_reduce_t{}, valid_items); + warp_reduce_launch(d_in, d_out, warp_reduce_t{}, valid_items); c2h::host_vector h_in = d_in; c2h::host_vector h_out(output_size); - compute_host_reference( - h_in, h_out, total_warps, warp_size, logical_warps, logical_warp_threads, valid_items); + compute_host_reference(h_in, h_out, warp_size, logical_warps, logical_warp_threads, valid_items); verify_results(h_out, d_out); } C2H_TEST("WarpReduce::Sum", "[reduce][warp][generic][partial]", full_type_list, logical_warp_threads) { - using type = c2h::get<0, TestType>; - constexpr auto logical_warp_threads = c2h::get<1, TestType>::value; - constexpr auto total_warps = 4u; - constexpr bool is_power_of_two = cuda::std::has_single_bit(logical_warp_threads); - constexpr auto logical_warps = is_power_of_two ? 
warp_size / logical_warp_threads : 1; - constexpr auto input_size = total_warps * warp_size; - constexpr auto output_size = total_warps * logical_warps; - const int valid_items = GENERATE_COPY(take(2, random(1u, logical_warp_threads))); + using type = c2h::get<0, TestType>; + constexpr auto logical_warp_threads = c2h::get<1, TestType>::value; + auto [input_size, output_size, logical_warps] = get_test_config(logical_warp_threads); + const int valid_items = GENERATE_COPY(take(2, random(1u, logical_warp_threads))); CAPTURE(c2h::type_name(), logical_warp_threads); c2h::device_vector d_in(input_size); c2h::device_vector d_out(output_size); - c2h::gen(C2H_SEED(1), d_in); - // Run test - warp_reduce_launch(d_in, d_out, warp_reduce_t{}, valid_items); + c2h::gen(C2H_SEED(10), d_in); + warp_reduce_launch(d_in, d_out, warp_reduce_t{}, valid_items); c2h::host_vector h_in = d_in; c2h::host_vector h_out(output_size); - compute_host_reference>( - h_in, h_out, total_warps, warp_size, logical_warps, logical_warp_threads, valid_items); + compute_host_reference>(h_in, h_out, warp_size, logical_warps, logical_warp_threads, valid_items); verify_results(h_out, d_out); } -*/ + //---------------------------------------------------------------------------------------------------------------------- // multiple items per thread @@ -434,31 +333,24 @@ C2H_TEST("WarpReduce::Sum/Max/Min Multiple Items Per Thread", predefined_op_list, logical_warp_threads) { - using type = c2h::get<0, TestType>; - using predefined_op = c2h::get<1, TestType>; - constexpr auto logical_warp_threads = c2h::get<2, TestType>::value; - constexpr auto total_warps = 4u; - constexpr bool is_power_of_two = cuda::std::has_single_bit(logical_warp_threads); - constexpr auto logical_warps = is_power_of_two ? warp_size / logical_warp_threads : 1; - constexpr auto input_size = total_warps * warp_size * items_per_thread; - constexpr auto output_size = total_warps * logical_warps; + using type = c2h::get<0, TestType>; + using predefined_op = c2h::get<1, TestType>; + constexpr auto logical_warp_threads = c2h::get<2, TestType>::value; + auto [input_size, output_size, logical_warps] = get_test_config(logical_warp_threads, items_per_thread); CAPTURE(c2h::type_name(), c2h::type_name(), logical_warp_threads); - c2h::device_vector d_in(input_size, 1); + c2h::device_vector d_in(input_size); c2h::device_vector d_out(output_size); - // c2h::gen(C2H_SEED(1), d_in); - // Run test - warp_reduce_multiple_items_launch( - d_in, d_out, warp_reduce_t{}); + c2h::gen(C2H_SEED(10), d_in); + warp_reduce_multiple_items_launch(d_in, d_out, warp_reduce_t{}); c2h::host_vector h_in = d_in; c2h::host_vector h_out(output_size); compute_host_reference( h_in, h_out, - total_warps, warp_size * items_per_thread, logical_warps, - logical_warp_threads, + logical_warp_threads * items_per_thread, logical_warp_threads * items_per_thread); verify_results(h_out, d_out); } From 27e45d916152a094239d6873ac214fb3800c9e0c Mon Sep 17 00:00:00 2001 From: fbusato Date: Thu, 20 Feb 2025 14:36:26 -0800 Subject: [PATCH 09/12] simplify test cases --- cub/test/warp/catch2_test_warp_reduce.cu | 30 ++++++++++-------------- 1 file changed, 13 insertions(+), 17 deletions(-) diff --git a/cub/test/warp/catch2_test_warp_reduce.cu b/cub/test/warp/catch2_test_warp_reduce.cu index c8c5fd3c1bd..cb3496a5003 100644 --- a/cub/test/warp/catch2_test_warp_reduce.cu +++ b/cub/test/warp/catch2_test_warp_reduce.cu @@ -193,18 +193,20 @@ template void compute_host_reference( const c2h::host_vector& h_in, c2h::host_vector& h_out, 
- int items_per_warp, int logical_warps, - int logical_warp_stride, - int items_per_logical_warp = 0) + int logical_warp_threads, + int items_per_logical_warp = 0, + int items_per_thread1 = 1) { constexpr auto identity = operator_identity_v; - items_per_logical_warp = items_per_logical_warp == 0 ? logical_warp_stride : items_per_logical_warp; + int items_per_warp = warp_size * items_per_thread1; + items_per_logical_warp = + items_per_logical_warp == 0 ? logical_warp_threads : items_per_logical_warp * items_per_thread1; for (unsigned i = 0; i < total_warps; ++i) { for (int j = 0; j < logical_warps; ++j) { - auto start = h_in.begin() + i * items_per_warp + j * logical_warp_stride; + auto start = h_in.begin() + i * items_per_warp + j * logical_warp_threads * items_per_thread1; auto end = start + items_per_logical_warp; h_out[i * logical_warps + j] = std::accumulate(start, end, identity, predefined_op{}); } @@ -237,7 +239,7 @@ C2H_TEST("WarpReduce::Sum, full_type_list", "[reduce][warp][predefined_op][full] c2h::host_vector h_in = d_in; c2h::host_vector h_out(output_size); - compute_host_reference>(h_in, h_out, warp_size, logical_warps, logical_warp_threads); + compute_host_reference>(h_in, h_out, logical_warps, logical_warp_threads); verify_results(h_out, d_out); } @@ -259,7 +261,7 @@ C2H_TEST("WarpReduce::Sum/Max/Min, builtin types", c2h::host_vector h_in = d_in; c2h::host_vector h_out(output_size); - compute_host_reference(h_in, h_out, warp_size, logical_warps, logical_warp_threads); + compute_host_reference(h_in, h_out, logical_warps, logical_warp_threads); verify_results(h_out, d_out); } @@ -276,7 +278,7 @@ C2H_TEST("WarpReduce::Sum", "[reduce][warp][generic][full]", full_type_list, log c2h::host_vector h_in = d_in; c2h::host_vector h_out(output_size); - compute_host_reference>(h_in, h_out, warp_size, logical_warps, logical_warp_threads); + compute_host_reference>(h_in, h_out, logical_warps, logical_warp_threads); verify_results(h_out, d_out); } @@ -302,7 +304,7 @@ C2H_TEST("WarpReduce::Sum/Max/Min Partial", c2h::host_vector h_in = d_in; c2h::host_vector h_out(output_size); - compute_host_reference(h_in, h_out, warp_size, logical_warps, logical_warp_threads, valid_items); + compute_host_reference(h_in, h_out, logical_warps, logical_warp_threads, valid_items); verify_results(h_out, d_out); } @@ -320,7 +322,7 @@ C2H_TEST("WarpReduce::Sum", "[reduce][warp][generic][partial]", full_type_list, c2h::host_vector h_in = d_in; c2h::host_vector h_out(output_size); - compute_host_reference>(h_in, h_out, warp_size, logical_warps, logical_warp_threads, valid_items); + compute_host_reference>(h_in, h_out, logical_warps, logical_warp_threads, valid_items); verify_results(h_out, d_out); } @@ -345,12 +347,6 @@ C2H_TEST("WarpReduce::Sum/Max/Min Multiple Items Per Thread", c2h::host_vector h_in = d_in; c2h::host_vector h_out(output_size); - compute_host_reference( - h_in, - h_out, - warp_size * items_per_thread, - logical_warps, - logical_warp_threads * items_per_thread, - logical_warp_threads * items_per_thread); + compute_host_reference(h_in, h_out, logical_warps, logical_warp_threads, 0, items_per_thread); verify_results(h_out, d_out); } From 9da77699b5c1d1f209cb8e36980dc8162a8fda99 Mon Sep 17 00:00:00 2001 From: fbusato Date: Thu, 20 Feb 2025 17:02:27 -0800 Subject: [PATCH 10/12] unify common functionalities --- cub/test/warp/catch2_test_warp_reduce.cu | 54 ++++++++++++------------ 1 file changed, 27 insertions(+), 27 deletions(-) diff --git a/cub/test/warp/catch2_test_warp_reduce.cu 
b/cub/test/warp/catch2_test_warp_reduce.cu index cb3496a5003..ba049c9097f 100644 --- a/cub/test/warp/catch2_test_warp_reduce.cu +++ b/cub/test/warp/catch2_test_warp_reduce.cu @@ -52,8 +52,12 @@ inline constexpr int items_per_thread = 4; * Kernel **********************************************************************************************************************/ -template -__global__ void warp_reduce_kernel(T* input, T* output, ReductionOp reduction_op, int num_items = 0) +template +__device__ void warp_reduce_function(ThreadDataType& thread_data, T* output, ReductionOp reduction_op, int num_items = 0) { using warp_reduce_t = cub::WarpReduce; using storage_t = typename warp_reduce_t::TempStorage; @@ -67,41 +71,39 @@ __global__ void warp_reduce_kernel(T* input, T* output, ReductionOp reduction_op { return; } - auto thread_data = input[threadIdx.x]; warp_reduce_t warp_reduce{storage[logical_warp]}; - auto result = EnableNumItems ? reduction_op(warp_reduce, thread_data, num_items) // - : reduction_op(warp_reduce, thread_data); + using result_t = decltype(reduction_op(warp_reduce, thread_data)); + result_t result; + if constexpr (EnableNumItems) + { + result = reduction_op(warp_reduce, thread_data, num_items); + } + else + { + result = reduction_op(warp_reduce, thread_data); + } if (logical_lane == 0) { output[logical_warp] = result; } } +template +__global__ void warp_reduce_kernel(T* input, T* output, ReductionOp reduction_op, int num_items = 0) +{ + auto thread_data = input[threadIdx.x]; + warp_reduce_function(thread_data, output, reduction_op, num_items); +} + template __global__ void warp_reduce_multiple_items_kernel(T* input, T* output, ReductionOp reduction_op) { - using warp_reduce_t = cub::WarpReduce; - using storage_t = typename warp_reduce_t::TempStorage; - __shared__ storage_t storage[total_warps]; - constexpr bool is_power_of_two = cuda::std::has_single_bit(LogicalWarpThreads); - auto lane = cuda::ptx::get_sreg_laneid(); - auto logical_warp = is_power_of_two ? threadIdx.x / LogicalWarpThreads : threadIdx.x / warp_size; - auto logical_lane = is_power_of_two ? threadIdx.x % LogicalWarpThreads : lane; - if (!is_power_of_two && lane >= LogicalWarpThreads) - { - return; - } T thread_data[items_per_thread]; for (int i = 0; i < items_per_thread; ++i) { thread_data[i] = input[threadIdx.x * items_per_thread + i]; } - warp_reduce_t warp_reduce{storage[logical_warp]}; - auto result = reduction_op(warp_reduce, thread_data); - if (logical_lane == 0) - { - output[logical_warp] = result; - } + warp_reduce_function(thread_data, output, reduction_op); } template @@ -199,15 +201,13 @@ void compute_host_reference( int items_per_thread1 = 1) { constexpr auto identity = operator_identity_v; - int items_per_warp = warp_size * items_per_thread1; - items_per_logical_warp = - items_per_logical_warp == 0 ? logical_warp_threads : items_per_logical_warp * items_per_thread1; + items_per_logical_warp = items_per_logical_warp == 0 ? 
logical_warp_threads : items_per_logical_warp; for (unsigned i = 0; i < total_warps; ++i) { for (int j = 0; j < logical_warps; ++j) { - auto start = h_in.begin() + i * items_per_warp + j * logical_warp_threads * items_per_thread1; - auto end = start + items_per_logical_warp; + auto start = h_in.begin() + (i * warp_size + j * logical_warp_threads) * items_per_thread1; + auto end = start + items_per_logical_warp * items_per_thread1; h_out[i * logical_warps + j] = std::accumulate(start, end, identity, predefined_op{}); } } From 74622473a3dfc8bdf4c5559c3820fb057f0667f7 Mon Sep 17 00:00:00 2001 From: fbusato Date: Thu, 20 Feb 2025 17:02:41 -0800 Subject: [PATCH 11/12] remove warp_reduce tests from warp_segmented_reduce --- .../warp/catch2_test_warp_segmented_reduce.cu | 193 ------------------ 1 file changed, 193 deletions(-) diff --git a/cub/test/warp/catch2_test_warp_segmented_reduce.cu b/cub/test/warp/catch2_test_warp_segmented_reduce.cu index 58f46ac7633..e036f893761 100644 --- a/cub/test/warp/catch2_test_warp_segmented_reduce.cu +++ b/cub/test/warp/catch2_test_warp_segmented_reduce.cu @@ -60,68 +60,6 @@ __global__ void warp_reduce_kernel(T* in, T* out, ActionT action) out[tid] = result; } -/** - * @brief Delegate wrapper for WarpReduce::Sum - */ -template -struct warp_sum_t -{ - template - __device__ T operator()(int linear_tid, cub::WarpReduce& warp_reduce, T& thread_data) const - { - auto result = warp_reduce.Sum(thread_data); - return ((linear_tid % LOGICAL_WARP_THREADS) == 0) ? result : thread_data; - } -}; - -/** - * @brief Delegate wrapper for partial WarpReduce::Sum - */ -template -struct warp_sum_partial_t -{ - int num_valid; - template - __device__ __forceinline__ T - operator()(int linear_tid, cub::WarpReduce& warp_reduce, T& thread_data) const - { - auto result = warp_reduce.Sum(thread_data, num_valid); - return ((linear_tid % LOGICAL_WARP_THREADS) == 0) ? result : thread_data; - } -}; - -/** - * @brief Delegate wrapper for WarpReduce::Reduce - */ -template -struct warp_reduce_t -{ - ReductionOpT reduction_op; - template - __device__ __forceinline__ T - operator()(int linear_tid, cub::WarpReduce& warp_reduce, T& thread_data) const - { - auto result = warp_reduce.Reduce(thread_data, reduction_op); - return ((linear_tid % LOGICAL_WARP_THREADS) == 0) ? result : thread_data; - } -}; - -/** - * @brief Delegate wrapper for partial WarpReduce::Reduce - */ -template -struct warp_reduce_partial_t -{ - int num_valid; - ReductionOpT reduction_op; - template - __device__ T operator()(int linear_tid, cub::WarpReduce& warp_reduce, T& thread_data) const - { - auto result = warp_reduce.Reduce(thread_data, reduction_op, num_valid); - return ((linear_tid % LOGICAL_WARP_THREADS) == 0) ? 
result : thread_data; - } -}; - /** * @brief Delegate wrapper for WarpReduce::TailSegmentedSum */ @@ -337,137 +275,6 @@ struct params_t static constexpr int tile_size = total_warps * logical_warp_threads; }; -C2H_TEST("Warp sum works", "[reduce][warp]", full_type_list, logical_warp_threads) -{ - using params = params_t; - using type = typename params::type; - - // Prepare test data - c2h::device_vector d_in(params::tile_size); - c2h::device_vector d_out(params::tile_size); - constexpr auto valid_items = params::logical_warp_threads; - c2h::gen(C2H_SEED(10), d_in); - - // Run test - warp_reduce(d_in, d_out, warp_sum_t{}); - - // Prepare verification data - c2h::host_vector h_in = d_in; - c2h::host_vector h_out = h_in; - auto h_flags = thrust::make_constant_iterator(false); - compute_host_reference( - reduce_mode::all, - h_in, - h_flags, - params::total_warps, - params::logical_warp_threads, - valid_items, - ::cuda::std::plus{}, - h_out.begin()); - - // Verify results - verify_results(h_out, d_out); -} - -C2H_TEST("Warp reduce works", "[reduce][warp]", builtin_type_list, logical_warp_threads) -{ - using params = params_t; - using type = typename params::type; - using red_op_t = ::cuda::minimum<>; - - // Prepare test data - c2h::device_vector d_in(params::tile_size); - c2h::device_vector d_out(params::tile_size); - constexpr auto valid_items = params::logical_warp_threads; - c2h::gen(C2H_SEED(10), d_in); - - // Run test - warp_reduce(d_in, d_out, warp_reduce_t{red_op_t{}}); - - // Prepare verification data - c2h::host_vector h_in = d_in; - c2h::host_vector h_out = h_in; - auto h_flags = thrust::make_constant_iterator(false); - compute_host_reference( - reduce_mode::all, - h_in, - h_flags, - params::total_warps, - params::logical_warp_threads, - valid_items, - red_op_t{}, - h_out.begin()); - - // Verify results - verify_results(h_out, d_out); -} - -C2H_TEST("Warp sum on partial warp works", "[reduce][warp]", full_type_list, logical_warp_threads) -{ - using params = params_t; - using type = typename params::type; - - // Prepare test data - c2h::device_vector d_in(params::tile_size); - c2h::device_vector d_out(params::tile_size); - const int valid_items = GENERATE_COPY(take(2, random(1, params::logical_warp_threads))); - c2h::gen(C2H_SEED(10), d_in); - - // Run test - warp_reduce(d_in, d_out, warp_sum_partial_t{valid_items}); - - // Prepare verification data - c2h::host_vector h_in = d_in; - c2h::host_vector h_out = h_in; - auto h_flags = thrust::make_constant_iterator(false); - compute_host_reference( - reduce_mode::all, - h_in, - h_flags, - params::total_warps, - params::logical_warp_threads, - valid_items, - ::cuda::std::plus{}, - h_out.begin()); - - // Verify results - verify_results(h_out, d_out); -} - -C2H_TEST("Warp reduce on partial warp works", "[reduce][warp]", builtin_type_list, logical_warp_threads) -{ - using params = params_t; - using type = typename params::type; - using red_op_t = ::cuda::minimum<>; - - // Prepare test data - c2h::device_vector d_in(params::tile_size); - c2h::device_vector d_out(params::tile_size); - const int valid_items = GENERATE_COPY(take(2, random(1, params::logical_warp_threads))); - c2h::gen(C2H_SEED(10), d_in); - - // Run test - warp_reduce( - d_in, d_out, warp_reduce_partial_t{valid_items, red_op_t{}}); - - // Prepare verification data - c2h::host_vector h_in = d_in; - c2h::host_vector h_out = h_in; - auto h_flags = thrust::make_constant_iterator(false); - compute_host_reference( - reduce_mode::all, - h_in, - h_flags, - params::total_warps, - 
params::logical_warp_threads, - valid_items, - red_op_t{}, - h_out.begin()); - - // Verify results - verify_results(h_out, d_out); -} - C2H_TEST("Warp segmented sum works", "[reduce][warp]", full_type_list, logical_warp_threads, segmented_modes) { using params = params_t; From e5da8b67c0b963f7b5bd1215dcfbb03bef87d4a7 Mon Sep 17 00:00:00 2001 From: fbusato Date: Fri, 21 Feb 2025 10:16:37 -0800 Subject: [PATCH 12/12] use Pascal style --- cub/cub/warp/warp_reduce.cuh | 44 +++++++++++++++++------------------- 1 file changed, 21 insertions(+), 23 deletions(-) diff --git a/cub/cub/warp/warp_reduce.cuh b/cub/cub/warp/warp_reduce.cuh index 3776496959b..98149072a15 100644 --- a/cub/cub/warp/warp_reduce.cuh +++ b/cub/cub/warp/warp_reduce.cuh @@ -65,7 +65,7 @@ CUB_NAMESPACE_BEGIN //! - A `reduction `__ (or *fold*) uses a binary combining //! operator to compute a single aggregate from a list of input elements. //! - Supports "logical" warps smaller than the physical warp size (e.g., logical warps of 8 threads) -//! - The number of entrant threads must be an multiple of ``LOGICAL_WARP_THREADS`` +//! - The number of entrant threads must be an multiple of ``LogicalWarpThreads`` //! //! Performance Considerations //! ++++++++++++++++++++++++++ @@ -76,7 +76,7 @@ CUB_NAMESPACE_BEGIN //! - Computation is slightly more efficient (i.e., having lower instruction overhead) for: //! //! - Summation (**vs.** generic reduction) -//! - The architecture's warp size is a whole multiple of ``LOGICAL_WARP_THREADS`` +//! - The architecture's warp size is a whole multiple of ``LogicalWarpThreads`` //! //! Simple Examples //! +++++++++++++++ @@ -134,29 +134,27 @@ CUB_NAMESPACE_BEGIN //! @tparam T //! The reduction input/output element type //! -//! @tparam LOGICAL_WARP_THREADS +//! @tparam LogicalWarpThreads //! [optional] The number of threads per "logical" warp (may be less than the number of //! hardware warp threads). Default is the warp size of the targeted CUDA compute-capability //! (e.g., 32 threads for SM20). //! -template +template class WarpReduce { - static_assert(LOGICAL_WARP_THREADS >= 1 && LOGICAL_WARP_THREADS <= CUB_WARP_THREADS(0), - "LOGICAL_WARP_THREADS must be in the range [1, 32]"); + static_assert(LogicalWarpThreads >= 1 && LogicalWarpThreads <= CUB_WARP_THREADS(0), + "LogicalWarpThreads must be in the range [1, 32]"); - static constexpr bool is_full_warp = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(0)); - static constexpr bool is_power_of_two = ::cuda::std::has_single_bit(uint32_t{LOGICAL_WARP_THREADS}); + static constexpr bool is_full_warp = (LogicalWarpThreads == CUB_WARP_THREADS(0)); + static constexpr bool is_power_of_two = ::cuda::std::has_single_bit(uint32_t{LogicalWarpThreads}); public: #ifndef _CCCL_DOXYGEN_INVOKED // Do not document /// Internal specialization. - /// Use SHFL-based reduction if LOGICAL_WARP_THREADS is a power-of-two - using InternalWarpReduce = - ::cuda::std::_If, - detail::WarpReduceSmem>; + /// Use SHFL-based reduction if LogicalWarpThreads is a power-of-two + using InternalWarpReduce = ::cuda::std:: + _If, detail::WarpReduceSmem>; #endif // _CCCL_DOXYGEN_INVOKED @@ -224,7 +222,7 @@ public: //! 
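// Illustrative caller-side sketch (hypothetical kernel, not part of this patch): with the renamed
// LogicalWarpThreads parameter and the fixed-size-array overloads added earlier in this series,
// a thread can hand its per-thread items directly to Sum(). A single 32-thread logical warp per
// block is assumed here.

#include <cub/warp_reduce.cuh>

__global__ void multi_item_sum_sketch(const int* d_in, int* d_out)
{
  using warp_reduce_t = cub::WarpReduce<int>; // LogicalWarpThreads defaults to the warp size
  __shared__ typename warp_reduce_t::TempStorage temp_storage;
  int thread_data[4];
  for (int i = 0; i < 4; ++i)
  {
    thread_data[i] = d_in[threadIdx.x * 4 + i];
  }
  // The array overload first reduces the items thread-locally (cub::ThreadReduce), then warp-wide
  int aggregate = warp_reduce_t(temp_storage).Sum(thread_data);
  if (threadIdx.x == 0)
  {
    d_out[0] = aggregate;
  }
}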
_CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE T Sum(T input) { - return InternalWarpReduce{temp_storage}.template Reduce(input, LOGICAL_WARP_THREADS, ::cuda::std::plus<>{}); + return InternalWarpReduce{temp_storage}.template Reduce(input, LogicalWarpThreads, ::cuda::std::plus<>{}); } template @@ -234,12 +232,12 @@ public: { auto thread_reduction = cub::ThreadReduce(input, ::cuda::std::plus<>{}); return InternalWarpReduce{temp_storage}.template Reduce( - thread_reduction, LOGICAL_WARP_THREADS, ::cuda::std::plus<>{}); + thread_reduction, LogicalWarpThreads, ::cuda::std::plus<>{}); } _CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE T Max(T input) { - return InternalWarpReduce{temp_storage}.template Reduce(input, LOGICAL_WARP_THREADS, ::cuda::maximum<>{}); + return InternalWarpReduce{temp_storage}.template Reduce(input, LogicalWarpThreads, ::cuda::maximum<>{}); } template @@ -249,12 +247,12 @@ public: { auto thread_reduction = cub::ThreadReduce(input, ::cuda::maximum<>{}); return InternalWarpReduce{temp_storage}.template Reduce( - thread_reduction, LOGICAL_WARP_THREADS, ::cuda::maximum<>{}); + thread_reduction, LogicalWarpThreads, ::cuda::maximum<>{}); } _CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE T Min(T input) { - return InternalWarpReduce{temp_storage}.template Reduce(input, LOGICAL_WARP_THREADS, ::cuda::minimum<>{}); + return InternalWarpReduce{temp_storage}.template Reduce(input, LogicalWarpThreads, ::cuda::minimum<>{}); } template @@ -264,7 +262,7 @@ public: { auto thread_reduction = cub::ThreadReduce(input, ::cuda::minimum<>{}); return InternalWarpReduce{temp_storage}.template Reduce( - thread_reduction, LOGICAL_WARP_THREADS, ::cuda::minimum<>{}); + thread_reduction, LogicalWarpThreads, ::cuda::minimum<>{}); } //! @rst @@ -313,7 +311,7 @@ public: //! //! @param[in] valid_items //! Total number of valid items in the calling thread's logical warp - //! (may be less than ``LOGICAL_WARP_THREADS``) + //! (may be less than ``LogicalWarpThreads``) _CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE T Sum(T input, int valid_items) { // Determine if we don't need bounds checking @@ -496,7 +494,7 @@ public: template _CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE T Reduce(T input, ReductionOp reduction_op) { - return InternalWarpReduce(temp_storage).template Reduce(input, LOGICAL_WARP_THREADS, reduction_op); + return InternalWarpReduce(temp_storage).template Reduce(input, LogicalWarpThreads, reduction_op); } template @@ -505,7 +503,7 @@ public: Reduce(const InputType& input, ReductionOp reduction_op) { auto thread_reduction = cub::ThreadReduce(input, reduction_op); - return WarpReduce::Reduce(thread_reduction, LOGICAL_WARP_THREADS, reduction_op); + return WarpReduce::Reduce(thread_reduction, LogicalWarpThreads, reduction_op); } //! @rst //! Computes a partially-full warp-wide reduction in the calling warp using the specified binary @@ -562,7 +560,7 @@ public: //! //! @param[in] valid_items //! Total number of valid items in the calling thread's logical warp - //! (may be less than ``LOGICAL_WARP_THREADS``) + //! (may be less than ``LogicalWarpThreads``) template _CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE T Reduce(T input, ReductionOp reduction_op, int valid_items) {
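// Minimal caller-side sketch of the partially-full reduction declared above (hypothetical kernel;
// a single 32-thread logical warp and the <cuda/functional> functors are assumed). Only the first
// valid_items inputs contribute to the result, and lane 0 receives the aggregate.

#include <cub/warp_reduce.cuh>
#include <cuda/functional>

__global__ void partial_min_sketch(const int* d_in, int* d_out, int valid_items)
{
  using warp_reduce_t = cub::WarpReduce<int>;
  __shared__ typename warp_reduce_t::TempStorage temp_storage;
  int thread_data = d_in[threadIdx.x];
  // All lanes of the logical warp call Reduce; inputs beyond valid_items are ignored
  int aggregate = warp_reduce_t(temp_storage).Reduce(thread_data, ::cuda::minimum<>{}, valid_items);
  if (threadIdx.x == 0)
  {
    d_out[0] = aggregate;
  }
}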