From 63072faa6c581c5e8a6b590e57ee7c490c88b718 Mon Sep 17 00:00:00 2001 From: fbusato Date: Tue, 18 Feb 2025 15:20:16 -0800 Subject: [PATCH 01/12] add missing overloadings --- cub/cub/warp/warp_reduce.cuh | 284 ++++++++++++++++++++++------------- 1 file changed, 182 insertions(+), 102 deletions(-) diff --git a/cub/cub/warp/warp_reduce.cuh b/cub/cub/warp/warp_reduce.cuh index 7992432a1c3..183d3454ba3 100644 --- a/cub/cub/warp/warp_reduce.cuh +++ b/cub/cub/warp/warp_reduce.cuh @@ -1,30 +1,25 @@ -/****************************************************************************** +/*********************************************************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2011-2025, NVIDIA CORPORATION. All rights reserved. * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. + * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the + * following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of conditions and the + * following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the + * following disclaimer in the documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used to endorse or promote + * products derived from this software without specific prior written permission. * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, + * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, + * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * - ******************************************************************************/ + **********************************************************************************************************************/ //! @file //! @rst @@ -44,29 +39,32 @@ # pragma system_header #endif // no system header +#include #include +#include #include #include #include +#include +#include #include CUB_NAMESPACE_BEGIN //! @rst -//! The ``WarpReduce`` class provides :ref:`collective ` methods for -//! computing a parallel reduction of items partitioned across a CUDA thread warp. +//! The ``WarpReduce`` class provides :ref:`collective ` methods for computing a parallel +//! reduction of items partitioned across a CUDA thread warp. //! //! .. image:: ../../img/warp_reduce_logo.png //! :align: center //! //! Overview -//! ++++++++++++++++++++++++++ +//! ++++++++ //! -//! - A `reduction `__ (or *fold*) -//! uses a binary combining operator to compute a single aggregate from a list of input elements. -//! - Supports "logical" warps smaller than the physical warp size (e.g., logical warps of 8 -//! threads) +//! - A `reduction `__ (or *fold*) uses a binary combining +//! operator to compute a single aggregate from a list of input elements. +//! - Supports "logical" warps smaller than the physical warp size (e.g., logical warps of 8 threads) //! - The number of entrant threads must be an multiple of ``LOGICAL_WARP_THREADS`` //! //! Performance Considerations @@ -81,12 +79,12 @@ CUB_NAMESPACE_BEGIN //! - The architecture's warp size is a whole multiple of ``LOGICAL_WARP_THREADS`` //! //! Simple Examples -//! ++++++++++++++++++++++++++ +//! +++++++++++++++ //! //! @warpcollective{WarpReduce} //! -//! The code snippet below illustrates four concurrent warp sum reductions within a block of -//! 128 threads (one per each of the 32-thread warps). +//! The code snippet below illustrates four concurrent warp sum reductions within a block of 128 threads (one per each +//! of the 32-thread warps). //! //! .. code-block:: c++ //! @@ -96,24 +94,19 @@ CUB_NAMESPACE_BEGIN //! { //! // Specialize WarpReduce for type int //! using WarpReduce = cub::WarpReduce; -//! //! // Allocate WarpReduce shared memory for 4 warps //! __shared__ typename WarpReduce::TempStorage temp_storage[4]; -//! //! // Obtain one input item per thread //! int thread_data = ... -//! //! // Return the warp-wide sums to each lane0 (threads 0, 32, 64, and 96) -//! int warp_id = threadIdx.x / 32; +//! int warp_id = threadIdx.x / 32; //! int aggregate = WarpReduce(temp_storage[warp_id]).Sum(thread_data); //! -//! Suppose the set of input ``thread_data`` across the block of threads is -//! ``{0, 1, 2, 3, ..., 127}``. The corresponding output ``aggregate`` in threads 0, 32, 64, and 96 -//! will be ``496``, ``1520``, ``2544``, and ``3568``, respectively -//! (and is undefined in other threads). +//! Suppose the set of input ``thread_data`` across the block of threads is ``{0, 1, 2, 3, ..., 127}``. +//! 
The corresponding output ``aggregate`` in threads 0, 32, 64, and 96 will be +//! ``496``, ``1520``, ``2544``, and ``3568``, respectively (and is undefined in other threads). //! -//! The code snippet below illustrates a single warp sum reduction within a block of -//! 128 threads. +//! The code snippet below illustrates a single warp sum reduction within a block of 128 threads. //! //! .. code-block:: c++ //! @@ -123,23 +116,19 @@ CUB_NAMESPACE_BEGIN //! { //! // Specialize WarpReduce for type int //! using WarpReduce = cub::WarpReduce; -//! //! // Allocate WarpReduce shared memory for one warp //! __shared__ typename WarpReduce::TempStorage temp_storage; //! ... -//! //! // Only the first warp performs a reduction //! if (threadIdx.x < 32) //! { //! // Obtain one input item per thread //! int thread_data = ... -//! //! // Return the warp-wide sum to lane0 //! int aggregate = WarpReduce(temp_storage).Sum(thread_data); //! -//! Suppose the set of input ``thread_data`` across the warp of threads is -//! ``{0, 1, 2, 3, ..., 31}``. The corresponding output ``aggregate`` in thread0 will be ``496`` -//! (and is undefined in other threads). +//! Suppose the set of input ``thread_data`` across the warp of threads is ``{0, 1, 2, 3, ..., 31}``. +//! The corresponding output ``aggregate`` in thread0 will be ``496`` (and is undefined in other threads). //! @endrst //! //! @tparam T @@ -153,27 +142,21 @@ CUB_NAMESPACE_BEGIN template class WarpReduce { -private: - /****************************************************************************** - * Constants and type definitions - ******************************************************************************/ - - enum - { - /// Whether the logical warp size and the PTX warp size coincide - IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(0)), + static_assert(LOGICAL_WARP_THREADS >= 1 && LOGICAL_WARP_THREADS <= CUB_WARP_THREADS(0), + "LOGICAL_WARP_THREADS must be in the range [1, 32]"); - /// Whether the logical warp size is a power-of-two - IS_POW_OF_TWO = PowerOfTwo::VALUE, - }; + static constexpr bool is_full_warp = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(0)); + static constexpr bool is_power_of_two = ::cuda::std::has_single_bit(uint32_t{LOGICAL_WARP_THREADS}); public: #ifndef _CCCL_DOXYGEN_INVOKED // Do not document /// Internal specialization. /// Use SHFL-based reduction if LOGICAL_WARP_THREADS is a power-of-two - using InternalWarpReduce = ::cuda::std:: - _If, detail::WarpReduceSmem>; + using InternalWarpReduce = + ::cuda::std::_If, + detail::WarpReduceSmem>; #endif // _CCCL_DOXYGEN_INVOKED @@ -181,17 +164,9 @@ private: /// Shared memory storage layout type for WarpReduce using _TempStorage = typename InternalWarpReduce::TempStorage; - /****************************************************************************** - * Thread fields - ******************************************************************************/ - /// Shared storage reference _TempStorage& temp_storage; - /****************************************************************************** - * Utility methods - ******************************************************************************/ - public: /// \smemstorage{WarpReduce} struct TempStorage : Uninitialized<_TempStorage> @@ -207,7 +182,7 @@ public: //! //! @param[in] temp_storage Reference to memory allocation having layout type TempStorage _CCCL_DEVICE _CCCL_FORCEINLINE WarpReduce(TempStorage& temp_storage) - : temp_storage(temp_storage.Alias()) + : temp_storage{temp_storage.Alias()} {} //! 
@} end member group @@ -223,8 +198,8 @@ public: //! Snippet //! +++++++ //! - //! The code snippet below illustrates four concurrent warp sum reductions within a block of - //! 128 threads (one per each of the 32-thread warps). + //! The code snippet below illustrates four concurrent warp sum reductions within a block of 128 threads + //! (one per each of the 32-thread warps). //! //! .. code-block:: c++ //! @@ -234,27 +209,62 @@ public: //! { //! // Specialize WarpReduce for type int //! using WarpReduce = cub::WarpReduce; - //! //! // Allocate WarpReduce shared memory for 4 warps //! __shared__ typename WarpReduce::TempStorage temp_storage[4]; - //! //! // Obtain one input item per thread //! int thread_data = ... - //! //! // Return the warp-wide sums to each lane0 //! int warp_id = threadIdx.x / 32; //! int aggregate = WarpReduce(temp_storage[warp_id]).Sum(thread_data); //! - //! Suppose the set of input ``thread_data`` across the block of threads is - //! ``{0, 1, 2, 3, ..., 127}``. - //! The corresponding output ``aggregate`` in threads 0, 32, 64, and 96 will ``496``, ``1520``, - //! ``2544``, and ``3568``, respectively (and is undefined in other threads). + //! Suppose the set of input ``thread_data`` across the block of threads is ``{0, 1, 2, 3, ..., 127}``. + //! The corresponding output ``aggregate`` in threads 0, 32, 64, and 96 will ``496``, ``1520``, ``2544``, and + //! ``3568``, respectively (and is undefined in other threads). //! @endrst //! - //! @param[in] input Calling thread's input - _CCCL_DEVICE _CCCL_FORCEINLINE T Sum(T input) + _CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE T Sum(T input) + { + return InternalWarpReduce{temp_storage}.template Reduce(input, LOGICAL_WARP_THREADS, ::cuda::std::plus<>{}); + } + + template + _CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE // + ::cuda::std::enable_if_t{}, T> // + Sum(const InputType& input) + { + auto thread_reduction = cub::ThreadReduce(input, ::cuda::std::plus<>{}); + return InternalWarpReduce{temp_storage}.template Reduce( + thread_reduction, LOGICAL_WARP_THREADS, ::cuda::std::plus<>{}); + } + + _CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE T Max(T input) + { + return InternalWarpReduce{temp_storage}.template Reduce(input, LOGICAL_WARP_THREADS, ::cuda::maximum<>{}); + } + + template + _CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE // + ::cuda::std::enable_if_t{}, T> // + Max(const InputType& input) + { + auto thread_reduction = cub::ThreadReduce(input, ::cuda::maximum<>{}); + return InternalWarpReduce{temp_storage}.template Reduce( + thread_reduction, LOGICAL_WARP_THREADS, ::cuda::maximum<>{}); + } + + _CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE T Min(T input) + { + return InternalWarpReduce{temp_storage}.template Reduce(input, LOGICAL_WARP_THREADS, ::cuda::minimum<>{}); + } + + template + _CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE // + ::cuda::std::enable_if_t{}, T> // + Min(const InputType& input) { - return InternalWarpReduce(temp_storage).template Reduce(input, LOGICAL_WARP_THREADS, ::cuda::std::plus<>{}); + auto thread_reduction = cub::ThreadReduce(input, ::cuda::minimum<>{}); + return InternalWarpReduce{temp_storage}.template Reduce( + thread_reduction, LOGICAL_WARP_THREADS, ::cuda::minimum<>{}); } //! @rst @@ -304,7 +314,7 @@ public: //! @param[in] valid_items //! Total number of valid items in the calling thread's logical warp //! 
(may be less than ``LOGICAL_WARP_THREADS``) - _CCCL_DEVICE _CCCL_FORCEINLINE T Sum(T input, int valid_items) + _CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE T Sum(T input, int valid_items) { // Determine if we don't need bounds checking return InternalWarpReduce(temp_storage).template Reduce(input, valid_items, ::cuda::std::plus<>{}); @@ -359,7 +369,7 @@ public: //! @param[in] head_flag //! Head flag denoting whether or not `input` is the start of a new segment template - _CCCL_DEVICE _CCCL_FORCEINLINE T HeadSegmentedSum(T input, FlagT head_flag) + _CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE T HeadSegmentedSum(T input, FlagT head_flag) { return HeadSegmentedReduce(input, head_flag, ::cuda::std::plus<>{}); } @@ -413,7 +423,7 @@ public: //! @param[in] tail_flag //! Head flag denoting whether or not `input` is the start of a new segment template - _CCCL_DEVICE _CCCL_FORCEINLINE T TailSegmentedSum(T input, FlagT tail_flag) + _CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE T TailSegmentedSum(T input, FlagT tail_flag) { return TailSegmentedReduce(input, tail_flag, ::cuda::std::plus<>{}); } @@ -472,11 +482,19 @@ public: //! @param[in] reduction_op //! Binary reduction operator template - _CCCL_DEVICE _CCCL_FORCEINLINE T Reduce(T input, ReductionOp reduction_op) + _CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE T Reduce(T input, ReductionOp reduction_op) { return InternalWarpReduce(temp_storage).template Reduce(input, LOGICAL_WARP_THREADS, reduction_op); } + template + _CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE // + ::cuda::std::enable_if_t{}, T> // + Reduce(const InputType& input, ReductionOp reduction_op) + { + auto thread_reduction = cub::ThreadReduce(input, reduction_op); + return WarpReduce::Reduce(thread_reduction, LOGICAL_WARP_THREADS, reduction_op); + } //! @rst //! Computes a partially-full warp-wide reduction in the calling warp using the specified binary //! reduction functor. The output is valid in warp *lane*\ :sub:`0`. @@ -534,7 +552,7 @@ public: //! Total number of valid items in the calling thread's logical warp //! (may be less than ``LOGICAL_WARP_THREADS``) template - _CCCL_DEVICE _CCCL_FORCEINLINE T Reduce(T input, ReductionOp reduction_op, int valid_items) + _CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE T Reduce(T input, ReductionOp reduction_op, int valid_items) { return InternalWarpReduce(temp_storage).template Reduce(input, valid_items, reduction_op); } @@ -593,7 +611,8 @@ public: //! @param[in] reduction_op //! Reduction operator template - _CCCL_DEVICE _CCCL_FORCEINLINE T HeadSegmentedReduce(T input, FlagT head_flag, ReductionOp reduction_op) + _CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE T + HeadSegmentedReduce(T input, FlagT head_flag, ReductionOp reduction_op) { return InternalWarpReduce(temp_storage).template SegmentedReduce(input, head_flag, reduction_op); } @@ -652,7 +671,8 @@ public: //! @param[in] reduction_op //! 
Reduction operator template - _CCCL_DEVICE _CCCL_FORCEINLINE T TailSegmentedReduce(T input, FlagT tail_flag, ReductionOp reduction_op) + _CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE T + TailSegmentedReduce(T input, FlagT tail_flag, ReductionOp reduction_op) { return InternalWarpReduce(temp_storage).template SegmentedReduce(input, tail_flag, reduction_op); } @@ -673,16 +693,18 @@ public: struct TempStorage : Uninitialized<_TempStorage> {}; - _CCCL_DEVICE _CCCL_FORCEINLINE InternalWarpReduce(TempStorage& /*temp_storage */) {} + _CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE InternalWarpReduce(TempStorage& /*temp_storage */) {} template - _CCCL_DEVICE _CCCL_FORCEINLINE T Reduce(T input, int /* valid_items */, ReductionOp /* reduction_op */) + _CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE T + Reduce(T input, int /* valid_items */, ReductionOp /* reduction_op */) { return input; } template - _CCCL_DEVICE _CCCL_FORCEINLINE T SegmentedReduce(T input, FlagT /* flag */, ReductionOp /* reduction_op */) + _CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE T + SegmentedReduce(T input, FlagT /* flag */, ReductionOp /* reduction_op */) { return input; } @@ -690,54 +712,112 @@ public: using TempStorage = typename InternalWarpReduce::TempStorage; - _CCCL_DEVICE _CCCL_FORCEINLINE WarpReduce(TempStorage& /*temp_storage */) {} + _CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE WarpReduce(TempStorage& /*temp_storage */) {} - _CCCL_DEVICE _CCCL_FORCEINLINE T Sum(T input) + _CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE T Sum(T input) { return input; } - _CCCL_DEVICE _CCCL_FORCEINLINE T Sum(T input, int /* valid_items */) + template + _CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE // + ::cuda::std::enable_if_t{}, T> // + Sum(const InputType& input) + { + return cub::ThreadReduce(input, ::cuda::std::plus<>{}); + } + + template + _CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE T Sum(T input, int /* valid_items */) + { + return input; + } + + _CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE T Max(T input) + { + return input; + } + + template + _CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE // + ::cuda::std::enable_if_t{}, T> // + Max(const InputType& input) + { + return cub::ThreadReduce(input, ::cuda::maximum<>{}); + } + + template + _CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE T Max(T input, int /* valid_items */) + { + return input; + } + + _CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE T Min(T input) + { + return input; + } + + template + _CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE // + ::cuda::std::enable_if_t{}, T> // + Min(const InputType& input) + { + return cub::ThreadReduce(input, ::cuda::minimum<>{}); + } + + _CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE T Min(T input, int /* valid_items */) { return input; } template - _CCCL_DEVICE _CCCL_FORCEINLINE T HeadSegmentedSum(T input, FlagT /* head_flag */) + _CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE T HeadSegmentedSum(T input, FlagT /* head_flag */) { return input; } template - _CCCL_DEVICE _CCCL_FORCEINLINE T TailSegmentedSum(T input, FlagT /* tail_flag */) + _CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE T TailSegmentedSum(T input, FlagT /* tail_flag */) { return input; } template - _CCCL_DEVICE _CCCL_FORCEINLINE T Reduce(T input, ReductionOp /* reduction_op */) + _CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE T Reduce(T input, ReductionOp /* reduction_op */) { return input; } + template + _CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE // + ::cuda::std::enable_if_t{}, T> // + Reduce(const InputType& input, ReductionOp reduction_op) + { + return 
cub::ThreadReduce(input, reduction_op); + } + template - _CCCL_DEVICE _CCCL_FORCEINLINE T Reduce(T input, ReductionOp /* reduction_op */, int /* valid_items */) + _CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE T + Reduce(T input, ReductionOp /* reduction_op */, int /* valid_items */) { return input; } template - _CCCL_DEVICE _CCCL_FORCEINLINE T HeadSegmentedReduce(T input, FlagT /* head_flag */, ReductionOp /* reduction_op */) + _CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE T + HeadSegmentedReduce(T input, FlagT /* head_flag */, ReductionOp /* reduction_op */) { return input; } template - _CCCL_DEVICE _CCCL_FORCEINLINE T TailSegmentedReduce(T input, FlagT /* tail_flag */, ReductionOp /* reduction_op */) + _CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE T + TailSegmentedReduce(T input, FlagT /* tail_flag */, ReductionOp /* reduction_op */) { return input; } }; + #endif // _CCCL_DOXYGEN_INVOKED CUB_NAMESPACE_END From 51160da33ce863cbe674c29646569d8c4410d8f6 Mon Sep 17 00:00:00 2001 From: fbusato Date: Wed, 19 Feb 2025 11:38:35 -0800 Subject: [PATCH 02/12] remove useless volatile load/store --- .../warp/specializations/warp_reduce_smem.cuh | 20 +++++++------------ 1 file changed, 7 insertions(+), 13 deletions(-) diff --git a/cub/cub/warp/specializations/warp_reduce_smem.cuh b/cub/cub/warp/specializations/warp_reduce_smem.cuh index 02b8295460a..1a373204b63 100644 --- a/cub/cub/warp/specializations/warp_reduce_smem.cuh +++ b/cub/cub/warp/specializations/warp_reduce_smem.cuh @@ -44,9 +44,8 @@ # pragma system_header #endif // no system header -#include #include -#include +#include #include #include @@ -152,21 +151,16 @@ struct WarpReduceSmem ReduceStep(T input, int valid_items, ReductionOp reduction_op, constant_t /*step*/) { constexpr int OFFSET = 1 << STEP; - // Share input through buffer - ThreadStore(&temp_storage.reduce[lane_id], input); - + temp_storage.reduce[lane_id] = input; __syncwarp(member_mask); - // Update input if peer_addend is in range if ((ALL_LANES_VALID && IS_POW_OF_TWO) || ((lane_id + OFFSET) < valid_items)) { - T peer_addend = ThreadLoad(&temp_storage.reduce[lane_id + OFFSET]); + T peer_addend = temp_storage.reduce[lane_id + OFFSET]; input = reduction_op(input, peer_addend); } - __syncwarp(member_mask); - return ReduceStep(input, valid_items, reduction_op, constant_v); } @@ -250,14 +244,14 @@ struct WarpReduceSmem const int OFFSET = 1 << STEP; // Share input into buffer - ThreadStore(&temp_storage.reduce[lane_id], input); + temp_storage.reduce[lane_id] = input; __syncwarp(member_mask); // Update input if peer_addend is in range if (OFFSET + lane_id < next_flag) { - T peer_addend = ThreadLoad(&temp_storage.reduce[lane_id + OFFSET]); + T peer_addend = temp_storage.reduce[lane_id + OFFSET]; input = reduction_op(input, peer_addend); } @@ -306,12 +300,12 @@ struct WarpReduceSmem const int OFFSET = 1 << STEP; // Share input through buffer - ThreadStore(&temp_storage.reduce[lane_id], input); + temp_storage.reduce[lane_id] = input; __syncwarp(member_mask); // Get peer from buffer - T peer_addend = ThreadLoad(&temp_storage.reduce[lane_id + OFFSET]); + T peer_addend = temp_storage.reduce[lane_id + OFFSET]; __syncwarp(member_mask); From 43ff91eb8fd6b7e15c0615fd49cf272f2bd20e6d Mon Sep 17 00:00:00 2001 From: fbusato Date: Thu, 20 Feb 2025 09:28:39 -0800 Subject: [PATCH 03/12] improve warp reduce test coverage --- cub/test/utils/check_results.cuh | 45 ++ cub/test/utils/operator.cuh | 83 +++ cub/test/warp/catch2_test_warp_reduce.cu | 591 ++++++++++++++++++ 
.../catch2_test_warp_segmented_reduce.cu} | 0 4 files changed, 719 insertions(+) create mode 100644 cub/test/utils/check_results.cuh create mode 100644 cub/test/utils/operator.cuh create mode 100644 cub/test/warp/catch2_test_warp_reduce.cu rename cub/test/{catch2_test_warp_reduce.cu => warp/catch2_test_warp_segmented_reduce.cu} (100%) diff --git a/cub/test/utils/check_results.cuh b/cub/test/utils/check_results.cuh new file mode 100644 index 00000000000..f73ec9b26a1 --- /dev/null +++ b/cub/test/utils/check_results.cuh @@ -0,0 +1,45 @@ +/****************************************************************************** + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ +#pragma once + +/** + * @brief Compares the results returned from system under test against the expected results. + */ +template , int> = 0> +void verify_results(const c2h::host_vector& expected_data, const c2h::device_vector& test_results) +{ + REQUIRE_APPROX_EQ(expected_data, test_results); +} + +/** + * @brief Compares the results returned from system under test against the expected results. + */ +template , int> = 0> +void verify_results(const c2h::host_vector& expected_data, const c2h::device_vector& test_results) +{ + REQUIRE(expected_data == test_results); +} diff --git a/cub/test/utils/operator.cuh b/cub/test/utils/operator.cuh new file mode 100644 index 00000000000..182d434a92f --- /dev/null +++ b/cub/test/utils/operator.cuh @@ -0,0 +1,83 @@ +/****************************************************************************** + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ +#pragma once + +#include +#include +#include + +/*********************************************************************************************************************** + * CUB operator to identity + **********************************************************************************************************************/ + +template +constexpr T operator_identity_v; + +template +constexpr T operator_identity_v> = T{0}; + +template +constexpr T operator_identity_v> = T{0}; + +template +constexpr T operator_identity_v> = T{1}; + +template +constexpr T operator_identity_v> = T{1}; + +template +constexpr T operator_identity_v> = T{~T{0}}; + +template +constexpr T operator_identity_v> = T{~T{0}}; + +template +constexpr T operator_identity_v> = T{0}; + +template +constexpr T operator_identity_v> = T{0}; + +template +constexpr T operator_identity_v> = T{0}; + +template +constexpr T operator_identity_v> = T{0}; + +template +constexpr T operator_identity_v> = ::std::numeric_limits::min(); + +template +constexpr T operator_identity_v> = ::std::numeric_limits::min(); + +template +constexpr T operator_identity_v> = ::std::numeric_limits::max(); + +template +constexpr T operator_identity_v> = ::std::numeric_limits::max(); + +struct custom_plus : cuda::std::plus<> +{}; diff --git a/cub/test/warp/catch2_test_warp_reduce.cu b/cub/test/warp/catch2_test_warp_reduce.cu new file mode 100644 index 00000000000..78bf960e745 --- /dev/null +++ b/cub/test/warp/catch2_test_warp_reduce.cu @@ -0,0 +1,591 @@ +/*********************************************************************************************************************** + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use input source and binary forms, with or without modification, are permitted provided that the + * following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of conditions and the + * following disclaimer. + * * Redistributions input binary form must reproduce the above copyright notice, this list of conditions and the + * following disclaimer input the documentation and/or other materials provided with the distribution. 
+ * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used to endorse or promote + * products derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, + * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, + * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **********************************************************************************************************************/ + +#include +#include + +#include + +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include + +constexpr int warp_size = 32; + +template +__global__ void warp_reduce_kernel(T* input, T* output, ReductionOp reduction_op) +{ + using warp_reduce_t = cub::WarpReduce; + using storage_t = typename warp_reduce_t::TempStorage; + __shared__ storage_t storage[TotalWarps]; + constexpr bool is_power_of_two = cuda::std::has_single_bit(LogicalWarpThreads); + auto lane = cuda::ptx::get_sreg_laneid(); + auto logical_warp = is_power_of_two ? threadIdx.x / LogicalWarpThreads : threadIdx.x / warp_size; + auto logical_lane = is_power_of_two ? threadIdx.x % LogicalWarpThreads : lane; + if (!is_power_of_two && lane >= LogicalWarpThreads) + { + return; + } + auto thread_data = input[threadIdx.x]; + warp_reduce_t warp_reduce{storage[logical_warp]}; + auto result = reduction_op(warp_reduce, thread_data); + if (logical_lane == 0) + { + output[logical_warp] = result; + } +} + +template +__global__ void warp_reduce_kernel(T* input, T* output, ReductionOp reduction_op, int num_items) +{ + using warp_reduce_t = cub::WarpReduce; + using storage_t = typename warp_reduce_t::TempStorage; + __shared__ storage_t storage[TotalWarps]; + constexpr bool is_power_of_two = cuda::std::has_single_bit(LogicalWarpThreads); + auto lane = cuda::ptx::get_sreg_laneid(); + auto logical_warp = is_power_of_two ? threadIdx.x / LogicalWarpThreads : threadIdx.x / warp_size; + auto logical_lane = is_power_of_two ? threadIdx.x % LogicalWarpThreads : lane; + if (!is_power_of_two && lane >= num_items) + { + return; + } + auto thread_data = input[threadIdx.x]; + warp_reduce_t warp_reduce{storage[logical_warp]}; + auto result = reduction_op(warp_reduce, thread_data, num_items); + if (logical_lane == 0) + { + output[logical_warp] = result; + } +} +inline constexpr int items_per_thread = 4; + +template +__global__ void warp_reduce_multiple_items_kernel(T* input, T* output, ReductionOp reduction_op) +{ + using warp_reduce_t = cub::WarpReduce; + using storage_t = typename warp_reduce_t::TempStorage; + __shared__ storage_t storage[TotalWarps]; + constexpr bool is_power_of_two = cuda::std::has_single_bit(LogicalWarpThreads); + auto lane = cuda::ptx::get_sreg_laneid(); + auto logical_warp = is_power_of_two ? threadIdx.x / LogicalWarpThreads : threadIdx.x / warp_size; + auto logical_lane = is_power_of_two ? 
threadIdx.x % LogicalWarpThreads : lane; + if (!is_power_of_two && lane >= LogicalWarpThreads) + { + return; + } + T thread_data[items_per_thread]; + for (int i = 0; i < items_per_thread; ++i) + { + thread_data[i] = input[threadIdx.x * items_per_thread + i]; + } + warp_reduce_t warp_reduce{storage[logical_warp]}; + static_assert(cub::detail::is_fixed_size_random_access_range_t{}); + auto result = reduction_op(warp_reduce, thread_data); + if (logical_lane == 0) + { + output[logical_warp] = result; + } +} + +template +struct warp_reduce_t +{ + template + __device__ auto operator()(cub::WarpReduce warp_reduce, T& data) const + { + return warp_reduce.Reduce(data, Op{}); + } + + template + __device__ auto operator()(cub::WarpReduce warp_reduce, T& data, int num_items) const + { + return warp_reduce.Reduce(data, Op{}, num_items); + } +}; + +template +struct warp_reduce_t, T> +{ + template + __device__ auto operator()(cub::WarpReduce warp_reduce, TArgs&&... args) const + { + return warp_reduce.Sum(args...); + } +}; + +template +struct warp_reduce_t, T> +{ + template + __device__ auto operator()(cub::WarpReduce warp_reduce, TArgs&&... args) const + { + return warp_reduce.Max(args...); + } +}; + +template +struct warp_reduce_t, T> +{ + template + __device__ auto operator()(cub::WarpReduce warp_reduce, TArgs&&... args) const + { + return warp_reduce.Min(args...); + } +}; + +/** + * @brief Delegate wrapper for WarpReduce::Sum + */ +// template +// struct warp_sum_t +//{ +// template +// __device__ auto operator()(cub::WarpReduce warp_reduce, const T& thread_data) const +// { +// return warp_reduce.Sum(thread_data); +// } +// }; +// +///** +// * @brief Delegate wrapper for partial WarpReduce::Sum +// */ +// template +// struct warp_sum_partial_t +//{ +// int num_valid; +// template +// __device__ __forceinline__ T +// operator()(int linear_tid, cub::WarpReduce& warp_reduce, T& thread_data) const +// { +// auto result = warp_reduce.Sum(thread_data, num_valid); +// return ((linear_tid % LogicalWarpThreads) == 0) ? result : thread_data; +// } +//}; + +/** + * @brief Delegate wrapper for WarpReduce::Reduce + */ +// template +// struct warp_reduce_t +//{ +// ReductionOpT reduction_op; +// template +// __device__ __forceinline__ T +// operator()(int linear_tid, cub::WarpReduce& warp_reduce, T& thread_data) const +// { +// auto result = warp_reduce.Reduce(thread_data, reduction_op); +// return ((linear_tid % LogicalWarpThreads) == 0) ? result : thread_data; +// } +// }; +// +///** +// * @brief Delegate wrapper for partial WarpReduce::Reduce +// */ +// template +// struct warp_reduce_partial_t +//{ +// int num_valid; +// ReductionOpT reduction_op; +// template +// __device__ T operator()(int linear_tid, cub::WarpReduce& warp_reduce, T& thread_data) const +// { +// auto result = warp_reduce.Reduce(thread_data, reduction_op, num_valid); +// return ((linear_tid % LogicalWarpThreads) == 0) ? result : thread_data; +// } +//}; + +template +void warp_reduce_launch(c2h::device_vector& input, c2h::device_vector& output, TArgs... args) +{ + warp_reduce_kernel<<<1, TotalWarps * warp_size>>>( + thrust::raw_pointer_cast(input.data()), thrust::raw_pointer_cast(output.data()), args...); + + REQUIRE(cudaSuccess == cudaPeekAtLastError()); + REQUIRE(cudaSuccess == cudaDeviceSynchronize()); +} + +template +void warp_reduce_multiple_items_launch(c2h::device_vector& input, c2h::device_vector& output, TArgs... 
args) +{ + warp_reduce_multiple_items_kernel<<<1, TotalWarps * warp_size>>>( + thrust::raw_pointer_cast(input.data()), thrust::raw_pointer_cast(output.data()), args...); + + REQUIRE(cudaSuccess == cudaPeekAtLastError()); + REQUIRE(cudaSuccess == cudaDeviceSynchronize()); +} + +/*********************************************************************************************************************** + * Types + **********************************************************************************************************************/ +#if 0 +// List of types to test +using custom_t = + c2h::custom_type_t; + +using full_type_list = c2h::type_list; + +using builtin_type_list = c2h::type_list; + +using predefined_op_list = c2h::type_list<::cuda::std::plus<>, ::cuda::maximum<>, ::cuda::minimum<>>; + +using logical_warp_threads = c2h::enum_type_list; + +#else +using builtin_type_list = c2h::type_list; + +using predefined_op_list = c2h::type_list<::cuda::std::plus<>>; + +using logical_warp_threads = c2h::enum_type_list; +#endif + +/*********************************************************************************************************************** + * Reference + **********************************************************************************************************************/ + +template +void compute_host_reference( + const c2h::host_vector& h_in, + c2h::host_vector& h_out, + int total_warps, + int items_per_warp, + int logical_warps, + int logical_warp_stride, + int items_per_logical_warp = 0) +{ + constexpr auto identity = operator_identity_v; + items_per_logical_warp = items_per_logical_warp == 0 ? logical_warp_stride : items_per_logical_warp; + for (int i = 0; i < total_warps; ++i) + { + for (int j = 0; j < logical_warps; ++j) + { + auto start = h_in.begin() + i * items_per_warp + j * logical_warp_stride; + auto end = start + items_per_logical_warp; + h_out[i * logical_warps + j] = std::accumulate(start, end, identity, predefined_op{}); + } + } +} + +/*********************************************************************************************************************** + * Test cases + **********************************************************************************************************************/ +/* +C2H_TEST("WarpReduce::Sum", "[reduce][warp][predefined][full]", full_type_list, logical_warp_threads) +{ + using type = c2h::get<0, TestType>; + constexpr auto logical_warp_threads = c2h::get<1, TestType>::value; + constexpr auto total_warps = 4u; + constexpr bool is_power_of_two = cuda::std::has_single_bit(logical_warp_threads); + constexpr auto logical_warps = is_power_of_two ? 
warp_size / logical_warp_threads : 1; + constexpr auto input_size = total_warps * warp_size; + constexpr auto output_size = total_warps * logical_warps; + CAPTURE(c2h::type_name(), logical_warp_threads); + c2h::device_vector d_in(input_size); + c2h::device_vector d_out(output_size); + c2h::gen(C2H_SEED(10), d_in); + // Run test + warp_reduce_launch(d_in, d_out, warp_reduce_t, type>{}); + + c2h::host_vector h_in = d_in; + c2h::host_vector h_out(output_size); + compute_host_reference>(h_in, h_out, total_warps, warp_size, logical_warps, logical_warp_threads); + verify_results(h_out, d_out); +} + +C2H_TEST("WarpReduce::Sum/Max/Min", + "[reduce][warp][predefined][full]", + builtin_type_list, + predefined_op_list, + logical_warp_threads) +{ + using type = c2h::get<0, TestType>; + using predefined_op = c2h::get<1, TestType>; + constexpr auto logical_warp_threads = c2h::get<2, TestType>::value; + constexpr auto total_warps = 4u; + constexpr bool is_power_of_two = cuda::std::has_single_bit(logical_warp_threads); + constexpr auto logical_warps = is_power_of_two ? warp_size / logical_warp_threads : 1; + constexpr auto input_size = total_warps * warp_size; + constexpr auto output_size = total_warps * logical_warps; + CAPTURE(c2h::type_name(), c2h::type_name(), logical_warp_threads); + c2h::device_vector d_in(input_size); + c2h::device_vector d_out(output_size); + c2h::gen(C2H_SEED(10), d_in); + // Run test + warp_reduce_launch(d_in, d_out, warp_reduce_t{}); + + c2h::host_vector h_in = d_in; + c2h::host_vector h_out(output_size); + compute_host_reference(h_in, h_out, total_warps, warp_size, logical_warps, logical_warp_threads); + verify_results(h_out, d_out); +} + +C2H_TEST("WarpReduce::Sum", "[reduce][warp][generic][full]", full_type_list, logical_warp_threads) +{ + using type = c2h::get<0, TestType>; + constexpr auto logical_warp_threads = c2h::get<1, TestType>::value; + constexpr auto total_warps = 4u; + constexpr bool is_power_of_two = cuda::std::has_single_bit(logical_warp_threads); + constexpr auto logical_warps = is_power_of_two ? warp_size / logical_warp_threads : 1; + constexpr auto input_size = total_warps * warp_size; + constexpr auto output_size = total_warps * logical_warps; + CAPTURE(c2h::type_name(), logical_warp_threads); + c2h::device_vector d_in(input_size); + c2h::device_vector d_out(output_size); + c2h::gen(C2H_SEED(1), d_in); + // Run test + warp_reduce_launch(d_in, d_out, warp_reduce_t{}); + + c2h::host_vector h_in = d_in; + c2h::host_vector h_out(output_size); + compute_host_reference>(h_in, h_out, total_warps, warp_size, logical_warps, logical_warp_threads); + verify_results(h_out, d_out); +} + +//---------------------------------------------------------------------------------------------------------------------- +// partial + +C2H_TEST("WarpReduce::Sum/Max/Min Partial", + "[reduce][warp][predefined][partial]", + builtin_type_list, + predefined_op_list, + logical_warp_threads) +{ + using type = c2h::get<0, TestType>; + using predefined_op = c2h::get<1, TestType>; + constexpr auto logical_warp_threads = c2h::get<2, TestType>::value; + constexpr auto total_warps = 4u; + constexpr bool is_power_of_two = cuda::std::has_single_bit(logical_warp_threads); + constexpr auto logical_warps = is_power_of_two ? 
warp_size / logical_warp_threads : 1; + constexpr auto input_size = total_warps * warp_size; + constexpr auto output_size = total_warps * logical_warps; + const int valid_items = GENERATE_COPY(take(2, random(1u, logical_warp_threads))); + CAPTURE(c2h::type_name(), c2h::type_name(), logical_warp_threads, valid_items); + c2h::device_vector d_in(input_size); + c2h::device_vector d_out(output_size); + c2h::gen(C2H_SEED(10), d_in); + // Run test + warp_reduce_launch(d_in, d_out, warp_reduce_t{}, valid_items); + + c2h::host_vector h_in = d_in; + c2h::host_vector h_out(output_size); + compute_host_reference( + h_in, h_out, total_warps, warp_size, logical_warps, logical_warp_threads, valid_items); + verify_results(h_out, d_out); +} + +C2H_TEST("WarpReduce::Sum", "[reduce][warp][generic][partial]", full_type_list, logical_warp_threads) +{ + using type = c2h::get<0, TestType>; + constexpr auto logical_warp_threads = c2h::get<1, TestType>::value; + constexpr auto total_warps = 4u; + constexpr bool is_power_of_two = cuda::std::has_single_bit(logical_warp_threads); + constexpr auto logical_warps = is_power_of_two ? warp_size / logical_warp_threads : 1; + constexpr auto input_size = total_warps * warp_size; + constexpr auto output_size = total_warps * logical_warps; + const int valid_items = GENERATE_COPY(take(2, random(1u, logical_warp_threads))); + CAPTURE(c2h::type_name(), logical_warp_threads); + c2h::device_vector d_in(input_size); + c2h::device_vector d_out(output_size); + c2h::gen(C2H_SEED(1), d_in); + // Run test + warp_reduce_launch(d_in, d_out, warp_reduce_t{}, valid_items); + + c2h::host_vector h_in = d_in; + c2h::host_vector h_out(output_size); + compute_host_reference>( + h_in, h_out, total_warps, warp_size, logical_warps, logical_warp_threads, valid_items); + verify_results(h_out, d_out); +} +*/ +//---------------------------------------------------------------------------------------------------------------------- +// multiple items per thread + +C2H_TEST("WarpReduce::Sum/Max/Min Multiple Items Per Thread", + "[reduce][warp][predefined][full]", + builtin_type_list, + predefined_op_list, + logical_warp_threads) +{ + using type = c2h::get<0, TestType>; + using predefined_op = c2h::get<1, TestType>; + constexpr auto logical_warp_threads = c2h::get<2, TestType>::value; + constexpr auto total_warps = 4u; + constexpr bool is_power_of_two = cuda::std::has_single_bit(logical_warp_threads); + constexpr auto logical_warps = is_power_of_two ? 
warp_size / logical_warp_threads : 1; + constexpr auto input_size = total_warps * warp_size * items_per_thread; + constexpr auto output_size = total_warps * logical_warps; + CAPTURE(c2h::type_name(), c2h::type_name(), logical_warp_threads); + c2h::device_vector d_in(input_size, 1); + c2h::device_vector d_out(output_size); + // c2h::gen(C2H_SEED(1), d_in); + // Run test + warp_reduce_multiple_items_launch( + d_in, d_out, warp_reduce_t{}); + + c2h::host_vector h_in = d_in; + c2h::host_vector h_out(output_size); + compute_host_reference( + h_in, + h_out, + total_warps, + warp_size * items_per_thread, + logical_warps, + logical_warp_threads, + logical_warp_threads * items_per_thread); + verify_results(h_out, d_out); +} + +#if 0 +C2H_TEST("Warp sum works", "[reduce][warp][predefined]", builtin_type_list, logical_warp_threads, predefined_op_list) +{ + using params = params_t; + constexpr auto logical_warp_threads = params::logical_warp_threads; + constexpr auto total_warps = params::total_warps; + using type = typename params::type; + // Prepare test data + c2h::device_vector d_in(params::input_size); + c2h::device_vector d_out(total_warps); + constexpr auto valid_items = logical_warp_threads; + c2h::gen(C2H_SEED(10), d_in); + // Run test + warp_reduce(d_in, d_out, warp_sum_t{}); + + c2h::host_vector h_in = d_in; + c2h::host_vector h_out(total_warps); + for (int i = 0; i < total_warps; ++i) + { + auto start = h_in.begin() + i * logical_warp_threads; + auto end = h_in.begin() + (i + 1) * logical_warp_threads; + h_out[i] = std::accumulate(start, end, type{}); + } + verify_results(h_out, d_out); +} + +C2H_TEST("Warp reduce works", "[reduce][warp]", builtin_type_list, logical_warp_threads) +{ + using params = params_t; + using type = typename params::type; + using red_op_t = ::cuda::minimum<>; + + // Prepare test data + c2h::device_vector d_in(params::input_size); + c2h::device_vector d_out(params::input_size); + constexpr auto valid_items = params::logical_warp_threads; + c2h::gen(C2H_SEED(10), d_in); + + // Run test + warp_reduce(d_in, d_out, warp_reduce_t{red_op_t{}}); + + // Prepare verification data + c2h::host_vector h_in = d_in; + c2h::host_vector h_out = h_in; + auto h_flags = thrust::make_constant_iterator(false); + compute_host_reference( + reduce_mode::all, + h_in, + h_flags, + params::total_warps, + params::logical_warp_threads, + valid_items, + red_op_t{}, + h_out.begin()); + + // Verify results + verify_results(h_out, d_out); +} + +C2H_TEST("Warp sum on partial warp works", "[reduce][warp]", full_type_list, logical_warp_threads) +{ + using params = params_t; + using type = typename params::type; + + // Prepare test data + c2h::device_vector d_in(params::input_size); + c2h::device_vector d_out(params::input_size); + const int valid_items = GENERATE_COPY(take(2, random(1, params::logical_warp_threads))); + c2h::gen(C2H_SEED(10), d_in); + + // Run test + warp_reduce(d_in, d_out, warp_sum_partial_t{valid_items}); + + // Prepare verification data + c2h::host_vector h_in = d_in; + c2h::host_vector h_out = h_in; + auto h_flags = thrust::make_constant_iterator(false); + compute_host_reference( + reduce_mode::all, + h_in, + h_flags, + params::total_warps, + params::logical_warp_threads, + valid_items, + ::cuda::std::plus{}, + h_out.begin()); + + // Verify results + verify_results(h_out, d_out); +} + +C2H_TEST("Warp reduce on partial warp works", "[reduce][warp]", builtin_type_list, logical_warp_threads) +{ + using params = params_t; + using type = typename params::type; + using red_op_t = 
::cuda::minimum<>; + + // Prepare test data + c2h::device_vector d_in(params::input_size); + c2h::device_vector d_out(params::input_size); + const int valid_items = GENERATE_COPY(take(2, random(1, params::logical_warp_threads))); + c2h::gen(C2H_SEED(10), d_in); + + // Run test + warp_reduce( + d_in, d_out, warp_reduce_partial_t{valid_items, red_op_t{}}); + + // Prepare verification data + c2h::host_vector h_in = d_in; + c2h::host_vector h_out = h_in; + auto h_flags = thrust::make_constant_iterator(false); + compute_host_reference( + reduce_mode::all, + h_in, + h_flags, + params::total_warps, + params::logical_warp_threads, + valid_items, + red_op_t{}, + h_out.begin()); + + // Verify results + verify_results(h_out, d_out); +} +#endif diff --git a/cub/test/catch2_test_warp_reduce.cu b/cub/test/warp/catch2_test_warp_segmented_reduce.cu similarity index 100% rename from cub/test/catch2_test_warp_reduce.cu rename to cub/test/warp/catch2_test_warp_segmented_reduce.cu From 4fd702d48f81d1ce83814eb4c711ffdecd828101 Mon Sep 17 00:00:00 2001 From: fbusato Date: Thu, 20 Feb 2025 09:29:01 -0800 Subject: [PATCH 04/12] remove volatile --- cub/cub/warp/specializations/warp_reduce_smem.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cub/cub/warp/specializations/warp_reduce_smem.cuh b/cub/cub/warp/specializations/warp_reduce_smem.cuh index 1a373204b63..8e7ae3bd9b1 100644 --- a/cub/cub/warp/specializations/warp_reduce_smem.cuh +++ b/cub/cub/warp/specializations/warp_reduce_smem.cuh @@ -291,7 +291,7 @@ struct WarpReduceSmem }; // Alias flags onto shared data storage - volatile SmemFlag* flag_storage = temp_storage.flags; + SmemFlag* flag_storage = temp_storage.flags; SmemFlag flag_status = (flag) ? SET : UNSET; From 3b5e5d490384f1f7d0db9d2799e49a22fc797977 Mon Sep 17 00:00:00 2001 From: fbusato Date: Thu, 20 Feb 2025 09:29:13 -0800 Subject: [PATCH 05/12] add missing overloadings --- cub/cub/warp/warp_reduce.cuh | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/cub/cub/warp/warp_reduce.cuh b/cub/cub/warp/warp_reduce.cuh index 183d3454ba3..678c0892480 100644 --- a/cub/cub/warp/warp_reduce.cuh +++ b/cub/cub/warp/warp_reduce.cuh @@ -320,6 +320,18 @@ public: return InternalWarpReduce(temp_storage).template Reduce(input, valid_items, ::cuda::std::plus<>{}); } + _CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE T Max(T input, int valid_items) + { + // Determine if we don't need bounds checking + return InternalWarpReduce(temp_storage).template Reduce(input, valid_items, ::cuda::maximum<>{}); + } + + _CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE T Min(T input, int valid_items) + { + // Determine if we don't need bounds checking + return InternalWarpReduce(temp_storage).template Reduce(input, valid_items, ::cuda::minimum<>{}); + } + //! @rst //! Computes a segmented sum in the calling warp where segments are defined by head-flags. //! 
The sum of each segment is returned to the first lane in that segment @@ -727,7 +739,6 @@ public: return cub::ThreadReduce(input, ::cuda::std::plus<>{}); } - template _CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE T Sum(T input, int /* valid_items */) { return input; @@ -746,7 +757,6 @@ public: return cub::ThreadReduce(input, ::cuda::maximum<>{}); } - template _CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE T Max(T input, int /* valid_items */) { return input; From 0da4c572bfa8f4a26e5593cf8636d6493df56819 Mon Sep 17 00:00:00 2001 From: fbusato Date: Thu, 20 Feb 2025 11:06:16 -0800 Subject: [PATCH 06/12] fix type traits --- cub/cub/warp/warp_reduce.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cub/cub/warp/warp_reduce.cuh b/cub/cub/warp/warp_reduce.cuh index 678c0892480..37966696807 100644 --- a/cub/cub/warp/warp_reduce.cuh +++ b/cub/cub/warp/warp_reduce.cuh @@ -229,7 +229,7 @@ public: template _CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE // - ::cuda::std::enable_if_t{}, T> // + ::cuda::std::enable_if_t::value, T> // Sum(const InputType& input) { auto thread_reduction = cub::ThreadReduce(input, ::cuda::std::plus<>{}); From bab0a3b4d90237c960b05e107fbce83d0856b02b Mon Sep 17 00:00:00 2001 From: fbusato Date: Thu, 20 Feb 2025 11:24:17 -0800 Subject: [PATCH 07/12] fix overloadings --- cub/cub/warp/warp_reduce.cuh | 14 +-- cub/test/warp/catch2_test_warp_reduce.cu | 137 +---------------------- 2 files changed, 12 insertions(+), 139 deletions(-) diff --git a/cub/cub/warp/warp_reduce.cuh b/cub/cub/warp/warp_reduce.cuh index 37966696807..3776496959b 100644 --- a/cub/cub/warp/warp_reduce.cuh +++ b/cub/cub/warp/warp_reduce.cuh @@ -244,7 +244,7 @@ public: template _CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE // - ::cuda::std::enable_if_t{}, T> // + ::cuda::std::enable_if_t::value, T> // Max(const InputType& input) { auto thread_reduction = cub::ThreadReduce(input, ::cuda::maximum<>{}); @@ -259,7 +259,7 @@ public: template _CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE // - ::cuda::std::enable_if_t{}, T> // + ::cuda::std::enable_if_t::value, T> // Min(const InputType& input) { auto thread_reduction = cub::ThreadReduce(input, ::cuda::minimum<>{}); @@ -501,7 +501,7 @@ public: template _CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE // - ::cuda::std::enable_if_t{}, T> // + ::cuda::std::enable_if_t::value, T> // Reduce(const InputType& input, ReductionOp reduction_op) { auto thread_reduction = cub::ThreadReduce(input, reduction_op); @@ -733,7 +733,7 @@ public: template _CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE // - ::cuda::std::enable_if_t{}, T> // + ::cuda::std::enable_if_t::value, T> // Sum(const InputType& input) { return cub::ThreadReduce(input, ::cuda::std::plus<>{}); @@ -751,7 +751,7 @@ public: template _CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE // - ::cuda::std::enable_if_t{}, T> // + ::cuda::std::enable_if_t::value, T> // Max(const InputType& input) { return cub::ThreadReduce(input, ::cuda::maximum<>{}); @@ -769,7 +769,7 @@ public: template _CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE // - ::cuda::std::enable_if_t{}, T> // + ::cuda::std::enable_if_t::value, T> // Min(const InputType& input) { return cub::ThreadReduce(input, ::cuda::minimum<>{}); @@ -800,7 +800,7 @@ public: template _CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE // - ::cuda::std::enable_if_t{}, T> // + ::cuda::std::enable_if_t::value, T> // Reduce(const InputType& input, ReductionOp reduction_op) { return cub::ThreadReduce(input, reduction_op); diff --git 
a/cub/test/warp/catch2_test_warp_reduce.cu b/cub/test/warp/catch2_test_warp_reduce.cu index 78bf960e745..ea103d7c315 100644 --- a/cub/test/warp/catch2_test_warp_reduce.cu +++ b/cub/test/warp/catch2_test_warp_reduce.cu @@ -109,7 +109,6 @@ __global__ void warp_reduce_multiple_items_kernel(T* input, T* output, Reduction thread_data[i] = input[threadIdx.x * items_per_thread + i]; } warp_reduce_t warp_reduce{storage[logical_warp]}; - static_assert(cub::detail::is_fixed_size_random_access_range_t{}); auto result = reduction_op(warp_reduce, thread_data); if (logical_lane == 0) { @@ -247,7 +246,7 @@ void warp_reduce_multiple_items_launch(c2h::device_vector& input, c2h::device /*********************************************************************************************************************** * Types **********************************************************************************************************************/ -#if 0 +#if 1 // List of types to test using custom_t = c2h::custom_type_t; @@ -299,7 +298,7 @@ void compute_host_reference( * Test cases **********************************************************************************************************************/ /* -C2H_TEST("WarpReduce::Sum", "[reduce][warp][predefined][full]", full_type_list, logical_warp_threads) +C2H_TEST("WarpReduce::Sum", "[reduce][warp][predefined_op][full]", full_type_list, logical_warp_threads) { using type = c2h::get<0, TestType>; constexpr auto logical_warp_threads = c2h::get<1, TestType>::value; @@ -322,7 +321,7 @@ C2H_TEST("WarpReduce::Sum", "[reduce][warp][predefined][full]", full_type_list, } C2H_TEST("WarpReduce::Sum/Max/Min", - "[reduce][warp][predefined][full]", + "[reduce][warp][predefined_op][full]", builtin_type_list, predefined_op_list, logical_warp_threads) @@ -374,7 +373,7 @@ C2H_TEST("WarpReduce::Sum", "[reduce][warp][generic][full]", full_type_list, log // partial C2H_TEST("WarpReduce::Sum/Max/Min Partial", - "[reduce][warp][predefined][partial]", + "[reduce][warp][predefined_op][partial]", builtin_type_list, predefined_op_list, logical_warp_threads) @@ -430,7 +429,7 @@ C2H_TEST("WarpReduce::Sum", "[reduce][warp][generic][partial]", full_type_list, // multiple items per thread C2H_TEST("WarpReduce::Sum/Max/Min Multiple Items Per Thread", - "[reduce][warp][predefined][full]", + "[reduce][warp][predefined_op][full]", builtin_type_list, predefined_op_list, logical_warp_threads) @@ -463,129 +462,3 @@ C2H_TEST("WarpReduce::Sum/Max/Min Multiple Items Per Thread", logical_warp_threads * items_per_thread); verify_results(h_out, d_out); } - -#if 0 -C2H_TEST("Warp sum works", "[reduce][warp][predefined]", builtin_type_list, logical_warp_threads, predefined_op_list) -{ - using params = params_t; - constexpr auto logical_warp_threads = params::logical_warp_threads; - constexpr auto total_warps = params::total_warps; - using type = typename params::type; - // Prepare test data - c2h::device_vector d_in(params::input_size); - c2h::device_vector d_out(total_warps); - constexpr auto valid_items = logical_warp_threads; - c2h::gen(C2H_SEED(10), d_in); - // Run test - warp_reduce(d_in, d_out, warp_sum_t{}); - - c2h::host_vector h_in = d_in; - c2h::host_vector h_out(total_warps); - for (int i = 0; i < total_warps; ++i) - { - auto start = h_in.begin() + i * logical_warp_threads; - auto end = h_in.begin() + (i + 1) * logical_warp_threads; - h_out[i] = std::accumulate(start, end, type{}); - } - verify_results(h_out, d_out); -} - -C2H_TEST("Warp reduce works", "[reduce][warp]", builtin_type_list, logical_warp_threads) 
-{ - using params = params_t; - using type = typename params::type; - using red_op_t = ::cuda::minimum<>; - - // Prepare test data - c2h::device_vector d_in(params::input_size); - c2h::device_vector d_out(params::input_size); - constexpr auto valid_items = params::logical_warp_threads; - c2h::gen(C2H_SEED(10), d_in); - - // Run test - warp_reduce(d_in, d_out, warp_reduce_t{red_op_t{}}); - - // Prepare verification data - c2h::host_vector h_in = d_in; - c2h::host_vector h_out = h_in; - auto h_flags = thrust::make_constant_iterator(false); - compute_host_reference( - reduce_mode::all, - h_in, - h_flags, - params::total_warps, - params::logical_warp_threads, - valid_items, - red_op_t{}, - h_out.begin()); - - // Verify results - verify_results(h_out, d_out); -} - -C2H_TEST("Warp sum on partial warp works", "[reduce][warp]", full_type_list, logical_warp_threads) -{ - using params = params_t; - using type = typename params::type; - - // Prepare test data - c2h::device_vector d_in(params::input_size); - c2h::device_vector d_out(params::input_size); - const int valid_items = GENERATE_COPY(take(2, random(1, params::logical_warp_threads))); - c2h::gen(C2H_SEED(10), d_in); - - // Run test - warp_reduce(d_in, d_out, warp_sum_partial_t{valid_items}); - - // Prepare verification data - c2h::host_vector h_in = d_in; - c2h::host_vector h_out = h_in; - auto h_flags = thrust::make_constant_iterator(false); - compute_host_reference( - reduce_mode::all, - h_in, - h_flags, - params::total_warps, - params::logical_warp_threads, - valid_items, - ::cuda::std::plus{}, - h_out.begin()); - - // Verify results - verify_results(h_out, d_out); -} - -C2H_TEST("Warp reduce on partial warp works", "[reduce][warp]", builtin_type_list, logical_warp_threads) -{ - using params = params_t; - using type = typename params::type; - using red_op_t = ::cuda::minimum<>; - - // Prepare test data - c2h::device_vector d_in(params::input_size); - c2h::device_vector d_out(params::input_size); - const int valid_items = GENERATE_COPY(take(2, random(1, params::logical_warp_threads))); - c2h::gen(C2H_SEED(10), d_in); - - // Run test - warp_reduce( - d_in, d_out, warp_reduce_partial_t{valid_items, red_op_t{}}); - - // Prepare verification data - c2h::host_vector h_in = d_in; - c2h::host_vector h_out = h_in; - auto h_flags = thrust::make_constant_iterator(false); - compute_host_reference( - reduce_mode::all, - h_in, - h_flags, - params::total_warps, - params::logical_warp_threads, - valid_items, - red_op_t{}, - h_out.begin()); - - // Verify results - verify_results(h_out, d_out); -} -#endif From 3ed706605fd8ccf53fb835d925979e9ca2503117 Mon Sep 17 00:00:00 2001 From: fbusato Date: Thu, 20 Feb 2025 12:49:48 -0800 Subject: [PATCH 08/12] simplify unit test --- cub/test/warp/catch2_test_warp_reduce.cu | 260 +++++++---------------- 1 file changed, 76 insertions(+), 184 deletions(-) diff --git a/cub/test/warp/catch2_test_warp_reduce.cu b/cub/test/warp/catch2_test_warp_reduce.cu index ea103d7c315..c8c5fd3c1bd 100644 --- a/cub/test/warp/catch2_test_warp_reduce.cu +++ b/cub/test/warp/catch2_test_warp_reduce.cu @@ -40,61 +40,49 @@ #include #include -constexpr int warp_size = 32; +/*********************************************************************************************************************** + * Constants + **********************************************************************************************************************/ -template -__global__ void warp_reduce_kernel(T* input, T* output, ReductionOp reduction_op) -{ - using warp_reduce_t = 
cub::WarpReduce; - using storage_t = typename warp_reduce_t::TempStorage; - __shared__ storage_t storage[TotalWarps]; - constexpr bool is_power_of_two = cuda::std::has_single_bit(LogicalWarpThreads); - auto lane = cuda::ptx::get_sreg_laneid(); - auto logical_warp = is_power_of_two ? threadIdx.x / LogicalWarpThreads : threadIdx.x / warp_size; - auto logical_lane = is_power_of_two ? threadIdx.x % LogicalWarpThreads : lane; - if (!is_power_of_two && lane >= LogicalWarpThreads) - { - return; - } - auto thread_data = input[threadIdx.x]; - warp_reduce_t warp_reduce{storage[logical_warp]}; - auto result = reduction_op(warp_reduce, thread_data); - if (logical_lane == 0) - { - output[logical_warp] = result; - } -} +inline constexpr int warp_size = 32; +inline constexpr auto total_warps = 4u; +inline constexpr int items_per_thread = 4; + +/*********************************************************************************************************************** + * Kernel + **********************************************************************************************************************/ -template -__global__ void warp_reduce_kernel(T* input, T* output, ReductionOp reduction_op, int num_items) +template +__global__ void warp_reduce_kernel(T* input, T* output, ReductionOp reduction_op, int num_items = 0) { using warp_reduce_t = cub::WarpReduce; using storage_t = typename warp_reduce_t::TempStorage; - __shared__ storage_t storage[TotalWarps]; + __shared__ storage_t storage[total_warps]; constexpr bool is_power_of_two = cuda::std::has_single_bit(LogicalWarpThreads); auto lane = cuda::ptx::get_sreg_laneid(); auto logical_warp = is_power_of_two ? threadIdx.x / LogicalWarpThreads : threadIdx.x / warp_size; auto logical_lane = is_power_of_two ? threadIdx.x % LogicalWarpThreads : lane; - if (!is_power_of_two && lane >= num_items) + auto limit = EnableNumItems ? num_items : LogicalWarpThreads; + if (!is_power_of_two && lane >= limit) { return; } auto thread_data = input[threadIdx.x]; warp_reduce_t warp_reduce{storage[logical_warp]}; - auto result = reduction_op(warp_reduce, thread_data, num_items); + auto result = EnableNumItems ? reduction_op(warp_reduce, thread_data, num_items) // + : reduction_op(warp_reduce, thread_data); if (logical_lane == 0) { output[logical_warp] = result; } } -inline constexpr int items_per_thread = 4; -template +template __global__ void warp_reduce_multiple_items_kernel(T* input, T* output, ReductionOp reduction_op) { using warp_reduce_t = cub::WarpReduce; using storage_t = typename warp_reduce_t::TempStorage; - __shared__ storage_t storage[TotalWarps]; + __shared__ storage_t storage[total_warps]; constexpr bool is_power_of_two = cuda::std::has_single_bit(LogicalWarpThreads); auto lane = cuda::ptx::get_sreg_laneid(); auto logical_warp = is_power_of_two ? 
threadIdx.x / LogicalWarpThreads : threadIdx.x / warp_size; @@ -162,81 +150,20 @@ struct warp_reduce_t, T> } }; -/** - * @brief Delegate wrapper for WarpReduce::Sum - */ -// template -// struct warp_sum_t -//{ -// template -// __device__ auto operator()(cub::WarpReduce warp_reduce, const T& thread_data) const -// { -// return warp_reduce.Sum(thread_data); -// } -// }; -// -///** -// * @brief Delegate wrapper for partial WarpReduce::Sum -// */ -// template -// struct warp_sum_partial_t -//{ -// int num_valid; -// template -// __device__ __forceinline__ T -// operator()(int linear_tid, cub::WarpReduce& warp_reduce, T& thread_data) const -// { -// auto result = warp_reduce.Sum(thread_data, num_valid); -// return ((linear_tid % LogicalWarpThreads) == 0) ? result : thread_data; -// } -//}; - -/** - * @brief Delegate wrapper for WarpReduce::Reduce - */ -// template -// struct warp_reduce_t -//{ -// ReductionOpT reduction_op; -// template -// __device__ __forceinline__ T -// operator()(int linear_tid, cub::WarpReduce& warp_reduce, T& thread_data) const -// { -// auto result = warp_reduce.Reduce(thread_data, reduction_op); -// return ((linear_tid % LogicalWarpThreads) == 0) ? result : thread_data; -// } -// }; -// -///** -// * @brief Delegate wrapper for partial WarpReduce::Reduce -// */ -// template -// struct warp_reduce_partial_t -//{ -// int num_valid; -// ReductionOpT reduction_op; -// template -// __device__ T operator()(int linear_tid, cub::WarpReduce& warp_reduce, T& thread_data) const -// { -// auto result = warp_reduce.Reduce(thread_data, reduction_op, num_valid); -// return ((linear_tid % LogicalWarpThreads) == 0) ? result : thread_data; -// } -//}; - -template +template void warp_reduce_launch(c2h::device_vector& input, c2h::device_vector& output, TArgs... args) { - warp_reduce_kernel<<<1, TotalWarps * warp_size>>>( + warp_reduce_kernel<<<1, total_warps * warp_size>>>( thrust::raw_pointer_cast(input.data()), thrust::raw_pointer_cast(output.data()), args...); REQUIRE(cudaSuccess == cudaPeekAtLastError()); REQUIRE(cudaSuccess == cudaDeviceSynchronize()); } -template +template void warp_reduce_multiple_items_launch(c2h::device_vector& input, c2h::device_vector& output, TArgs... 
args) { - warp_reduce_multiple_items_kernel<<<1, TotalWarps * warp_size>>>( + warp_reduce_multiple_items_kernel<<<1, total_warps * warp_size>>>( thrust::raw_pointer_cast(input.data()), thrust::raw_pointer_cast(output.data()), args...); REQUIRE(cudaSuccess == cudaPeekAtLastError()); @@ -246,8 +173,7 @@ void warp_reduce_multiple_items_launch(c2h::device_vector& input, c2h::device /*********************************************************************************************************************** * Types **********************************************************************************************************************/ -#if 1 -// List of types to test + using custom_t = c2h::custom_type_t; @@ -259,14 +185,6 @@ using predefined_op_list = c2h::type_list<::cuda::std::plus<>, ::cuda::maximum<> using logical_warp_threads = c2h::enum_type_list; -#else -using builtin_type_list = c2h::type_list; - -using predefined_op_list = c2h::type_list<::cuda::std::plus<>>; - -using logical_warp_threads = c2h::enum_type_list; -#endif - /*********************************************************************************************************************** * Reference **********************************************************************************************************************/ @@ -275,7 +193,6 @@ template void compute_host_reference( const c2h::host_vector& h_in, c2h::host_vector& h_out, - int total_warps, int items_per_warp, int logical_warps, int logical_warp_stride, @@ -283,7 +200,7 @@ void compute_host_reference( { constexpr auto identity = operator_identity_v; items_per_logical_warp = items_per_logical_warp == 0 ? logical_warp_stride : items_per_logical_warp; - for (int i = 0; i < total_warps; ++i) + for (unsigned i = 0; i < total_warps; ++i) { for (int j = 0; j < logical_warps; ++j) { @@ -294,78 +211,72 @@ void compute_host_reference( } } +std::array get_test_config(unsigned logical_warp_threads, unsigned items_per_thread1 = 1) +{ + bool is_power_of_two = cuda::std::has_single_bit(logical_warp_threads); + auto logical_warps = is_power_of_two ? warp_size / logical_warp_threads : 1; + auto input_size = total_warps * warp_size * items_per_thread1; + auto output_size = total_warps * logical_warps; + return {input_size, output_size, logical_warps}; +} + /*********************************************************************************************************************** * Test cases **********************************************************************************************************************/ -/* -C2H_TEST("WarpReduce::Sum", "[reduce][warp][predefined_op][full]", full_type_list, logical_warp_threads) + +C2H_TEST("WarpReduce::Sum, full_type_list", "[reduce][warp][predefined_op][full]", full_type_list, logical_warp_threads) { - using type = c2h::get<0, TestType>; - constexpr auto logical_warp_threads = c2h::get<1, TestType>::value; - constexpr auto total_warps = 4u; - constexpr bool is_power_of_two = cuda::std::has_single_bit(logical_warp_threads); - constexpr auto logical_warps = is_power_of_two ? 
warp_size / logical_warp_threads : 1; - constexpr auto input_size = total_warps * warp_size; - constexpr auto output_size = total_warps * logical_warps; + using type = c2h::get<0, TestType>; + constexpr auto logical_warp_threads = c2h::get<1, TestType>::value; + auto [input_size, output_size, logical_warps] = get_test_config(logical_warp_threads); CAPTURE(c2h::type_name(), logical_warp_threads); c2h::device_vector d_in(input_size); c2h::device_vector d_out(output_size); c2h::gen(C2H_SEED(10), d_in); - // Run test - warp_reduce_launch(d_in, d_out, warp_reduce_t, type>{}); + warp_reduce_launch(d_in, d_out, warp_reduce_t, type>{}); c2h::host_vector h_in = d_in; c2h::host_vector h_out(output_size); - compute_host_reference>(h_in, h_out, total_warps, warp_size, logical_warps, logical_warp_threads); + compute_host_reference>(h_in, h_out, warp_size, logical_warps, logical_warp_threads); verify_results(h_out, d_out); } -C2H_TEST("WarpReduce::Sum/Max/Min", +C2H_TEST("WarpReduce::Sum/Max/Min, builtin types", "[reduce][warp][predefined_op][full]", builtin_type_list, predefined_op_list, logical_warp_threads) { - using type = c2h::get<0, TestType>; - using predefined_op = c2h::get<1, TestType>; - constexpr auto logical_warp_threads = c2h::get<2, TestType>::value; - constexpr auto total_warps = 4u; - constexpr bool is_power_of_two = cuda::std::has_single_bit(logical_warp_threads); - constexpr auto logical_warps = is_power_of_two ? warp_size / logical_warp_threads : 1; - constexpr auto input_size = total_warps * warp_size; - constexpr auto output_size = total_warps * logical_warps; + using type = c2h::get<0, TestType>; + using predefined_op = c2h::get<1, TestType>; + constexpr auto logical_warp_threads = c2h::get<2, TestType>::value; + auto [input_size, output_size, logical_warps] = get_test_config(logical_warp_threads); CAPTURE(c2h::type_name(), c2h::type_name(), logical_warp_threads); c2h::device_vector d_in(input_size); c2h::device_vector d_out(output_size); c2h::gen(C2H_SEED(10), d_in); - // Run test - warp_reduce_launch(d_in, d_out, warp_reduce_t{}); + warp_reduce_launch(d_in, d_out, warp_reduce_t{}); c2h::host_vector h_in = d_in; c2h::host_vector h_out(output_size); - compute_host_reference(h_in, h_out, total_warps, warp_size, logical_warps, logical_warp_threads); + compute_host_reference(h_in, h_out, warp_size, logical_warps, logical_warp_threads); verify_results(h_out, d_out); } C2H_TEST("WarpReduce::Sum", "[reduce][warp][generic][full]", full_type_list, logical_warp_threads) { - using type = c2h::get<0, TestType>; - constexpr auto logical_warp_threads = c2h::get<1, TestType>::value; - constexpr auto total_warps = 4u; - constexpr bool is_power_of_two = cuda::std::has_single_bit(logical_warp_threads); - constexpr auto logical_warps = is_power_of_two ? 
warp_size / logical_warp_threads : 1; - constexpr auto input_size = total_warps * warp_size; - constexpr auto output_size = total_warps * logical_warps; + using type = c2h::get<0, TestType>; + constexpr auto logical_warp_threads = c2h::get<1, TestType>::value; + auto [input_size, output_size, logical_warps] = get_test_config(logical_warp_threads); CAPTURE(c2h::type_name(), logical_warp_threads); c2h::device_vector d_in(input_size); c2h::device_vector d_out(output_size); c2h::gen(C2H_SEED(1), d_in); - // Run test - warp_reduce_launch(d_in, d_out, warp_reduce_t{}); + warp_reduce_launch(d_in, d_out, warp_reduce_t{}); c2h::host_vector h_in = d_in; c2h::host_vector h_out(output_size); - compute_host_reference>(h_in, h_out, total_warps, warp_size, logical_warps, logical_warp_threads); + compute_host_reference>(h_in, h_out, warp_size, logical_warps, logical_warp_threads); verify_results(h_out, d_out); } @@ -378,53 +289,41 @@ C2H_TEST("WarpReduce::Sum/Max/Min Partial", predefined_op_list, logical_warp_threads) { - using type = c2h::get<0, TestType>; - using predefined_op = c2h::get<1, TestType>; - constexpr auto logical_warp_threads = c2h::get<2, TestType>::value; - constexpr auto total_warps = 4u; - constexpr bool is_power_of_two = cuda::std::has_single_bit(logical_warp_threads); - constexpr auto logical_warps = is_power_of_two ? warp_size / logical_warp_threads : 1; - constexpr auto input_size = total_warps * warp_size; - constexpr auto output_size = total_warps * logical_warps; - const int valid_items = GENERATE_COPY(take(2, random(1u, logical_warp_threads))); + using type = c2h::get<0, TestType>; + using predefined_op = c2h::get<1, TestType>; + constexpr auto logical_warp_threads = c2h::get<2, TestType>::value; + auto [input_size, output_size, logical_warps] = get_test_config(logical_warp_threads); + const int valid_items = GENERATE_COPY(take(2, random(1u, logical_warp_threads))); CAPTURE(c2h::type_name(), c2h::type_name(), logical_warp_threads, valid_items); c2h::device_vector d_in(input_size); c2h::device_vector d_out(output_size); c2h::gen(C2H_SEED(10), d_in); - // Run test - warp_reduce_launch(d_in, d_out, warp_reduce_t{}, valid_items); + warp_reduce_launch(d_in, d_out, warp_reduce_t{}, valid_items); c2h::host_vector h_in = d_in; c2h::host_vector h_out(output_size); - compute_host_reference( - h_in, h_out, total_warps, warp_size, logical_warps, logical_warp_threads, valid_items); + compute_host_reference(h_in, h_out, warp_size, logical_warps, logical_warp_threads, valid_items); verify_results(h_out, d_out); } C2H_TEST("WarpReduce::Sum", "[reduce][warp][generic][partial]", full_type_list, logical_warp_threads) { - using type = c2h::get<0, TestType>; - constexpr auto logical_warp_threads = c2h::get<1, TestType>::value; - constexpr auto total_warps = 4u; - constexpr bool is_power_of_two = cuda::std::has_single_bit(logical_warp_threads); - constexpr auto logical_warps = is_power_of_two ? 
warp_size / logical_warp_threads : 1; - constexpr auto input_size = total_warps * warp_size; - constexpr auto output_size = total_warps * logical_warps; - const int valid_items = GENERATE_COPY(take(2, random(1u, logical_warp_threads))); + using type = c2h::get<0, TestType>; + constexpr auto logical_warp_threads = c2h::get<1, TestType>::value; + auto [input_size, output_size, logical_warps] = get_test_config(logical_warp_threads); + const int valid_items = GENERATE_COPY(take(2, random(1u, logical_warp_threads))); CAPTURE(c2h::type_name(), logical_warp_threads); c2h::device_vector d_in(input_size); c2h::device_vector d_out(output_size); - c2h::gen(C2H_SEED(1), d_in); - // Run test - warp_reduce_launch(d_in, d_out, warp_reduce_t{}, valid_items); + c2h::gen(C2H_SEED(10), d_in); + warp_reduce_launch(d_in, d_out, warp_reduce_t{}, valid_items); c2h::host_vector h_in = d_in; c2h::host_vector h_out(output_size); - compute_host_reference>( - h_in, h_out, total_warps, warp_size, logical_warps, logical_warp_threads, valid_items); + compute_host_reference>(h_in, h_out, warp_size, logical_warps, logical_warp_threads, valid_items); verify_results(h_out, d_out); } -*/ + //---------------------------------------------------------------------------------------------------------------------- // multiple items per thread @@ -434,31 +333,24 @@ C2H_TEST("WarpReduce::Sum/Max/Min Multiple Items Per Thread", predefined_op_list, logical_warp_threads) { - using type = c2h::get<0, TestType>; - using predefined_op = c2h::get<1, TestType>; - constexpr auto logical_warp_threads = c2h::get<2, TestType>::value; - constexpr auto total_warps = 4u; - constexpr bool is_power_of_two = cuda::std::has_single_bit(logical_warp_threads); - constexpr auto logical_warps = is_power_of_two ? warp_size / logical_warp_threads : 1; - constexpr auto input_size = total_warps * warp_size * items_per_thread; - constexpr auto output_size = total_warps * logical_warps; + using type = c2h::get<0, TestType>; + using predefined_op = c2h::get<1, TestType>; + constexpr auto logical_warp_threads = c2h::get<2, TestType>::value; + auto [input_size, output_size, logical_warps] = get_test_config(logical_warp_threads, items_per_thread); CAPTURE(c2h::type_name(), c2h::type_name(), logical_warp_threads); - c2h::device_vector d_in(input_size, 1); + c2h::device_vector d_in(input_size); c2h::device_vector d_out(output_size); - // c2h::gen(C2H_SEED(1), d_in); - // Run test - warp_reduce_multiple_items_launch( - d_in, d_out, warp_reduce_t{}); + c2h::gen(C2H_SEED(10), d_in); + warp_reduce_multiple_items_launch(d_in, d_out, warp_reduce_t{}); c2h::host_vector h_in = d_in; c2h::host_vector h_out(output_size); compute_host_reference( h_in, h_out, - total_warps, warp_size * items_per_thread, logical_warps, - logical_warp_threads, + logical_warp_threads * items_per_thread, logical_warp_threads * items_per_thread); verify_results(h_out, d_out); } From 27e45d916152a094239d6873ac214fb3800c9e0c Mon Sep 17 00:00:00 2001 From: fbusato Date: Thu, 20 Feb 2025 14:36:26 -0800 Subject: [PATCH 09/12] simplify test cases --- cub/test/warp/catch2_test_warp_reduce.cu | 30 ++++++++++-------------- 1 file changed, 13 insertions(+), 17 deletions(-) diff --git a/cub/test/warp/catch2_test_warp_reduce.cu b/cub/test/warp/catch2_test_warp_reduce.cu index c8c5fd3c1bd..cb3496a5003 100644 --- a/cub/test/warp/catch2_test_warp_reduce.cu +++ b/cub/test/warp/catch2_test_warp_reduce.cu @@ -193,18 +193,20 @@ template void compute_host_reference( const c2h::host_vector& h_in, c2h::host_vector& h_out, 
- int items_per_warp, int logical_warps, - int logical_warp_stride, - int items_per_logical_warp = 0) + int logical_warp_threads, + int items_per_logical_warp = 0, + int items_per_thread1 = 1) { constexpr auto identity = operator_identity_v; - items_per_logical_warp = items_per_logical_warp == 0 ? logical_warp_stride : items_per_logical_warp; + int items_per_warp = warp_size * items_per_thread1; + items_per_logical_warp = + items_per_logical_warp == 0 ? logical_warp_threads : items_per_logical_warp * items_per_thread1; for (unsigned i = 0; i < total_warps; ++i) { for (int j = 0; j < logical_warps; ++j) { - auto start = h_in.begin() + i * items_per_warp + j * logical_warp_stride; + auto start = h_in.begin() + i * items_per_warp + j * logical_warp_threads * items_per_thread1; auto end = start + items_per_logical_warp; h_out[i * logical_warps + j] = std::accumulate(start, end, identity, predefined_op{}); } @@ -237,7 +239,7 @@ C2H_TEST("WarpReduce::Sum, full_type_list", "[reduce][warp][predefined_op][full] c2h::host_vector h_in = d_in; c2h::host_vector h_out(output_size); - compute_host_reference>(h_in, h_out, warp_size, logical_warps, logical_warp_threads); + compute_host_reference>(h_in, h_out, logical_warps, logical_warp_threads); verify_results(h_out, d_out); } @@ -259,7 +261,7 @@ C2H_TEST("WarpReduce::Sum/Max/Min, builtin types", c2h::host_vector h_in = d_in; c2h::host_vector h_out(output_size); - compute_host_reference(h_in, h_out, warp_size, logical_warps, logical_warp_threads); + compute_host_reference(h_in, h_out, logical_warps, logical_warp_threads); verify_results(h_out, d_out); } @@ -276,7 +278,7 @@ C2H_TEST("WarpReduce::Sum", "[reduce][warp][generic][full]", full_type_list, log c2h::host_vector h_in = d_in; c2h::host_vector h_out(output_size); - compute_host_reference>(h_in, h_out, warp_size, logical_warps, logical_warp_threads); + compute_host_reference>(h_in, h_out, logical_warps, logical_warp_threads); verify_results(h_out, d_out); } @@ -302,7 +304,7 @@ C2H_TEST("WarpReduce::Sum/Max/Min Partial", c2h::host_vector h_in = d_in; c2h::host_vector h_out(output_size); - compute_host_reference(h_in, h_out, warp_size, logical_warps, logical_warp_threads, valid_items); + compute_host_reference(h_in, h_out, logical_warps, logical_warp_threads, valid_items); verify_results(h_out, d_out); } @@ -320,7 +322,7 @@ C2H_TEST("WarpReduce::Sum", "[reduce][warp][generic][partial]", full_type_list, c2h::host_vector h_in = d_in; c2h::host_vector h_out(output_size); - compute_host_reference>(h_in, h_out, warp_size, logical_warps, logical_warp_threads, valid_items); + compute_host_reference>(h_in, h_out, logical_warps, logical_warp_threads, valid_items); verify_results(h_out, d_out); } @@ -345,12 +347,6 @@ C2H_TEST("WarpReduce::Sum/Max/Min Multiple Items Per Thread", c2h::host_vector h_in = d_in; c2h::host_vector h_out(output_size); - compute_host_reference( - h_in, - h_out, - warp_size * items_per_thread, - logical_warps, - logical_warp_threads * items_per_thread, - logical_warp_threads * items_per_thread); + compute_host_reference(h_in, h_out, logical_warps, logical_warp_threads, 0, items_per_thread); verify_results(h_out, d_out); } From 9da77699b5c1d1f209cb8e36980dc8162a8fda99 Mon Sep 17 00:00:00 2001 From: fbusato Date: Thu, 20 Feb 2025 17:02:27 -0800 Subject: [PATCH 10/12] unify common functionalities --- cub/test/warp/catch2_test_warp_reduce.cu | 54 ++++++++++++------------ 1 file changed, 27 insertions(+), 27 deletions(-) diff --git a/cub/test/warp/catch2_test_warp_reduce.cu 
b/cub/test/warp/catch2_test_warp_reduce.cu index cb3496a5003..ba049c9097f 100644 --- a/cub/test/warp/catch2_test_warp_reduce.cu +++ b/cub/test/warp/catch2_test_warp_reduce.cu @@ -52,8 +52,12 @@ inline constexpr int items_per_thread = 4; * Kernel **********************************************************************************************************************/ -template -__global__ void warp_reduce_kernel(T* input, T* output, ReductionOp reduction_op, int num_items = 0) +template +__device__ void warp_reduce_function(ThreadDataType& thread_data, T* output, ReductionOp reduction_op, int num_items = 0) { using warp_reduce_t = cub::WarpReduce; using storage_t = typename warp_reduce_t::TempStorage; @@ -67,41 +71,39 @@ __global__ void warp_reduce_kernel(T* input, T* output, ReductionOp reduction_op { return; } - auto thread_data = input[threadIdx.x]; warp_reduce_t warp_reduce{storage[logical_warp]}; - auto result = EnableNumItems ? reduction_op(warp_reduce, thread_data, num_items) // - : reduction_op(warp_reduce, thread_data); + using result_t = decltype(reduction_op(warp_reduce, thread_data)); + result_t result; + if constexpr (EnableNumItems) + { + result = reduction_op(warp_reduce, thread_data, num_items); + } + else + { + result = reduction_op(warp_reduce, thread_data); + } if (logical_lane == 0) { output[logical_warp] = result; } } +template +__global__ void warp_reduce_kernel(T* input, T* output, ReductionOp reduction_op, int num_items = 0) +{ + auto thread_data = input[threadIdx.x]; + warp_reduce_function(thread_data, output, reduction_op, num_items); +} + template __global__ void warp_reduce_multiple_items_kernel(T* input, T* output, ReductionOp reduction_op) { - using warp_reduce_t = cub::WarpReduce; - using storage_t = typename warp_reduce_t::TempStorage; - __shared__ storage_t storage[total_warps]; - constexpr bool is_power_of_two = cuda::std::has_single_bit(LogicalWarpThreads); - auto lane = cuda::ptx::get_sreg_laneid(); - auto logical_warp = is_power_of_two ? threadIdx.x / LogicalWarpThreads : threadIdx.x / warp_size; - auto logical_lane = is_power_of_two ? threadIdx.x % LogicalWarpThreads : lane; - if (!is_power_of_two && lane >= LogicalWarpThreads) - { - return; - } T thread_data[items_per_thread]; for (int i = 0; i < items_per_thread; ++i) { thread_data[i] = input[threadIdx.x * items_per_thread + i]; } - warp_reduce_t warp_reduce{storage[logical_warp]}; - auto result = reduction_op(warp_reduce, thread_data); - if (logical_lane == 0) - { - output[logical_warp] = result; - } + warp_reduce_function(thread_data, output, reduction_op); } template @@ -199,15 +201,13 @@ void compute_host_reference( int items_per_thread1 = 1) { constexpr auto identity = operator_identity_v; - int items_per_warp = warp_size * items_per_thread1; - items_per_logical_warp = - items_per_logical_warp == 0 ? logical_warp_threads : items_per_logical_warp * items_per_thread1; + items_per_logical_warp = items_per_logical_warp == 0 ? 
logical_warp_threads : items_per_logical_warp; for (unsigned i = 0; i < total_warps; ++i) { for (int j = 0; j < logical_warps; ++j) { - auto start = h_in.begin() + i * items_per_warp + j * logical_warp_threads * items_per_thread1; - auto end = start + items_per_logical_warp; + auto start = h_in.begin() + (i * warp_size + j * logical_warp_threads) * items_per_thread1; + auto end = start + items_per_logical_warp * items_per_thread1; h_out[i * logical_warps + j] = std::accumulate(start, end, identity, predefined_op{}); } } From 74622473a3dfc8bdf4c5559c3820fb057f0667f7 Mon Sep 17 00:00:00 2001 From: fbusato Date: Thu, 20 Feb 2025 17:02:41 -0800 Subject: [PATCH 11/12] remove warp_reduce tests from warp_segmented_reduce --- .../warp/catch2_test_warp_segmented_reduce.cu | 193 ------------------ 1 file changed, 193 deletions(-) diff --git a/cub/test/warp/catch2_test_warp_segmented_reduce.cu b/cub/test/warp/catch2_test_warp_segmented_reduce.cu index 58f46ac7633..e036f893761 100644 --- a/cub/test/warp/catch2_test_warp_segmented_reduce.cu +++ b/cub/test/warp/catch2_test_warp_segmented_reduce.cu @@ -60,68 +60,6 @@ __global__ void warp_reduce_kernel(T* in, T* out, ActionT action) out[tid] = result; } -/** - * @brief Delegate wrapper for WarpReduce::Sum - */ -template -struct warp_sum_t -{ - template - __device__ T operator()(int linear_tid, cub::WarpReduce& warp_reduce, T& thread_data) const - { - auto result = warp_reduce.Sum(thread_data); - return ((linear_tid % LOGICAL_WARP_THREADS) == 0) ? result : thread_data; - } -}; - -/** - * @brief Delegate wrapper for partial WarpReduce::Sum - */ -template -struct warp_sum_partial_t -{ - int num_valid; - template - __device__ __forceinline__ T - operator()(int linear_tid, cub::WarpReduce& warp_reduce, T& thread_data) const - { - auto result = warp_reduce.Sum(thread_data, num_valid); - return ((linear_tid % LOGICAL_WARP_THREADS) == 0) ? result : thread_data; - } -}; - -/** - * @brief Delegate wrapper for WarpReduce::Reduce - */ -template -struct warp_reduce_t -{ - ReductionOpT reduction_op; - template - __device__ __forceinline__ T - operator()(int linear_tid, cub::WarpReduce& warp_reduce, T& thread_data) const - { - auto result = warp_reduce.Reduce(thread_data, reduction_op); - return ((linear_tid % LOGICAL_WARP_THREADS) == 0) ? result : thread_data; - } -}; - -/** - * @brief Delegate wrapper for partial WarpReduce::Reduce - */ -template -struct warp_reduce_partial_t -{ - int num_valid; - ReductionOpT reduction_op; - template - __device__ T operator()(int linear_tid, cub::WarpReduce& warp_reduce, T& thread_data) const - { - auto result = warp_reduce.Reduce(thread_data, reduction_op, num_valid); - return ((linear_tid % LOGICAL_WARP_THREADS) == 0) ? 
result : thread_data; - } -}; - /** * @brief Delegate wrapper for WarpReduce::TailSegmentedSum */ @@ -337,137 +275,6 @@ struct params_t static constexpr int tile_size = total_warps * logical_warp_threads; }; -C2H_TEST("Warp sum works", "[reduce][warp]", full_type_list, logical_warp_threads) -{ - using params = params_t; - using type = typename params::type; - - // Prepare test data - c2h::device_vector d_in(params::tile_size); - c2h::device_vector d_out(params::tile_size); - constexpr auto valid_items = params::logical_warp_threads; - c2h::gen(C2H_SEED(10), d_in); - - // Run test - warp_reduce(d_in, d_out, warp_sum_t{}); - - // Prepare verification data - c2h::host_vector h_in = d_in; - c2h::host_vector h_out = h_in; - auto h_flags = thrust::make_constant_iterator(false); - compute_host_reference( - reduce_mode::all, - h_in, - h_flags, - params::total_warps, - params::logical_warp_threads, - valid_items, - ::cuda::std::plus{}, - h_out.begin()); - - // Verify results - verify_results(h_out, d_out); -} - -C2H_TEST("Warp reduce works", "[reduce][warp]", builtin_type_list, logical_warp_threads) -{ - using params = params_t; - using type = typename params::type; - using red_op_t = ::cuda::minimum<>; - - // Prepare test data - c2h::device_vector d_in(params::tile_size); - c2h::device_vector d_out(params::tile_size); - constexpr auto valid_items = params::logical_warp_threads; - c2h::gen(C2H_SEED(10), d_in); - - // Run test - warp_reduce(d_in, d_out, warp_reduce_t{red_op_t{}}); - - // Prepare verification data - c2h::host_vector h_in = d_in; - c2h::host_vector h_out = h_in; - auto h_flags = thrust::make_constant_iterator(false); - compute_host_reference( - reduce_mode::all, - h_in, - h_flags, - params::total_warps, - params::logical_warp_threads, - valid_items, - red_op_t{}, - h_out.begin()); - - // Verify results - verify_results(h_out, d_out); -} - -C2H_TEST("Warp sum on partial warp works", "[reduce][warp]", full_type_list, logical_warp_threads) -{ - using params = params_t; - using type = typename params::type; - - // Prepare test data - c2h::device_vector d_in(params::tile_size); - c2h::device_vector d_out(params::tile_size); - const int valid_items = GENERATE_COPY(take(2, random(1, params::logical_warp_threads))); - c2h::gen(C2H_SEED(10), d_in); - - // Run test - warp_reduce(d_in, d_out, warp_sum_partial_t{valid_items}); - - // Prepare verification data - c2h::host_vector h_in = d_in; - c2h::host_vector h_out = h_in; - auto h_flags = thrust::make_constant_iterator(false); - compute_host_reference( - reduce_mode::all, - h_in, - h_flags, - params::total_warps, - params::logical_warp_threads, - valid_items, - ::cuda::std::plus{}, - h_out.begin()); - - // Verify results - verify_results(h_out, d_out); -} - -C2H_TEST("Warp reduce on partial warp works", "[reduce][warp]", builtin_type_list, logical_warp_threads) -{ - using params = params_t; - using type = typename params::type; - using red_op_t = ::cuda::minimum<>; - - // Prepare test data - c2h::device_vector d_in(params::tile_size); - c2h::device_vector d_out(params::tile_size); - const int valid_items = GENERATE_COPY(take(2, random(1, params::logical_warp_threads))); - c2h::gen(C2H_SEED(10), d_in); - - // Run test - warp_reduce( - d_in, d_out, warp_reduce_partial_t{valid_items, red_op_t{}}); - - // Prepare verification data - c2h::host_vector h_in = d_in; - c2h::host_vector h_out = h_in; - auto h_flags = thrust::make_constant_iterator(false); - compute_host_reference( - reduce_mode::all, - h_in, - h_flags, - params::total_warps, - 
params::logical_warp_threads, - valid_items, - red_op_t{}, - h_out.begin()); - - // Verify results - verify_results(h_out, d_out); -} - C2H_TEST("Warp segmented sum works", "[reduce][warp]", full_type_list, logical_warp_threads, segmented_modes) { using params = params_t; From e5da8b67c0b963f7b5bd1215dcfbb03bef87d4a7 Mon Sep 17 00:00:00 2001 From: fbusato Date: Fri, 21 Feb 2025 10:16:37 -0800 Subject: [PATCH 12/12] use Pascal style --- cub/cub/warp/warp_reduce.cuh | 44 +++++++++++++++++------------------- 1 file changed, 21 insertions(+), 23 deletions(-) diff --git a/cub/cub/warp/warp_reduce.cuh b/cub/cub/warp/warp_reduce.cuh index 3776496959b..98149072a15 100644 --- a/cub/cub/warp/warp_reduce.cuh +++ b/cub/cub/warp/warp_reduce.cuh @@ -65,7 +65,7 @@ CUB_NAMESPACE_BEGIN //! - A `reduction `__ (or *fold*) uses a binary combining //! operator to compute a single aggregate from a list of input elements. //! - Supports "logical" warps smaller than the physical warp size (e.g., logical warps of 8 threads) -//! - The number of entrant threads must be an multiple of ``LOGICAL_WARP_THREADS`` +//! - The number of entrant threads must be an multiple of ``LogicalWarpThreads`` //! //! Performance Considerations //! ++++++++++++++++++++++++++ @@ -76,7 +76,7 @@ CUB_NAMESPACE_BEGIN //! - Computation is slightly more efficient (i.e., having lower instruction overhead) for: //! //! - Summation (**vs.** generic reduction) -//! - The architecture's warp size is a whole multiple of ``LOGICAL_WARP_THREADS`` +//! - The architecture's warp size is a whole multiple of ``LogicalWarpThreads`` //! //! Simple Examples //! +++++++++++++++ @@ -134,29 +134,27 @@ CUB_NAMESPACE_BEGIN //! @tparam T //! The reduction input/output element type //! -//! @tparam LOGICAL_WARP_THREADS +//! @tparam LogicalWarpThreads //! [optional] The number of threads per "logical" warp (may be less than the number of //! hardware warp threads). Default is the warp size of the targeted CUDA compute-capability //! (e.g., 32 threads for SM20). //! -template +template class WarpReduce { - static_assert(LOGICAL_WARP_THREADS >= 1 && LOGICAL_WARP_THREADS <= CUB_WARP_THREADS(0), - "LOGICAL_WARP_THREADS must be in the range [1, 32]"); + static_assert(LogicalWarpThreads >= 1 && LogicalWarpThreads <= CUB_WARP_THREADS(0), + "LogicalWarpThreads must be in the range [1, 32]"); - static constexpr bool is_full_warp = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(0)); - static constexpr bool is_power_of_two = ::cuda::std::has_single_bit(uint32_t{LOGICAL_WARP_THREADS}); + static constexpr bool is_full_warp = (LogicalWarpThreads == CUB_WARP_THREADS(0)); + static constexpr bool is_power_of_two = ::cuda::std::has_single_bit(uint32_t{LogicalWarpThreads}); public: #ifndef _CCCL_DOXYGEN_INVOKED // Do not document /// Internal specialization. - /// Use SHFL-based reduction if LOGICAL_WARP_THREADS is a power-of-two - using InternalWarpReduce = - ::cuda::std::_If, - detail::WarpReduceSmem>; + /// Use SHFL-based reduction if LogicalWarpThreads is a power-of-two + using InternalWarpReduce = ::cuda::std:: + _If, detail::WarpReduceSmem>; #endif // _CCCL_DOXYGEN_INVOKED @@ -224,7 +222,7 @@ public: //! 
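// Illustrative caller-side sketch (hypothetical kernel, not part of this patch): with the renamed
// LogicalWarpThreads parameter and the fixed-size-array overloads added earlier in this series,
// a thread can hand its per-thread items directly to Sum(). A single 32-thread logical warp per
// block is assumed here.

#include <cub/warp_reduce.cuh>

__global__ void multi_item_sum_sketch(const int* d_in, int* d_out)
{
  using warp_reduce_t = cub::WarpReduce<int>; // LogicalWarpThreads defaults to the warp size
  __shared__ typename warp_reduce_t::TempStorage temp_storage;
  int thread_data[4];
  for (int i = 0; i < 4; ++i)
  {
    thread_data[i] = d_in[threadIdx.x * 4 + i];
  }
  // The array overload first reduces the items thread-locally (cub::ThreadReduce), then warp-wide
  int aggregate = warp_reduce_t(temp_storage).Sum(thread_data);
  if (threadIdx.x == 0)
  {
    d_out[0] = aggregate;
  }
}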
_CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE T Sum(T input) { - return InternalWarpReduce{temp_storage}.template Reduce(input, LOGICAL_WARP_THREADS, ::cuda::std::plus<>{}); + return InternalWarpReduce{temp_storage}.template Reduce(input, LogicalWarpThreads, ::cuda::std::plus<>{}); } template @@ -234,12 +232,12 @@ public: { auto thread_reduction = cub::ThreadReduce(input, ::cuda::std::plus<>{}); return InternalWarpReduce{temp_storage}.template Reduce( - thread_reduction, LOGICAL_WARP_THREADS, ::cuda::std::plus<>{}); + thread_reduction, LogicalWarpThreads, ::cuda::std::plus<>{}); } _CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE T Max(T input) { - return InternalWarpReduce{temp_storage}.template Reduce(input, LOGICAL_WARP_THREADS, ::cuda::maximum<>{}); + return InternalWarpReduce{temp_storage}.template Reduce(input, LogicalWarpThreads, ::cuda::maximum<>{}); } template @@ -249,12 +247,12 @@ public: { auto thread_reduction = cub::ThreadReduce(input, ::cuda::maximum<>{}); return InternalWarpReduce{temp_storage}.template Reduce( - thread_reduction, LOGICAL_WARP_THREADS, ::cuda::maximum<>{}); + thread_reduction, LogicalWarpThreads, ::cuda::maximum<>{}); } _CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE T Min(T input) { - return InternalWarpReduce{temp_storage}.template Reduce(input, LOGICAL_WARP_THREADS, ::cuda::minimum<>{}); + return InternalWarpReduce{temp_storage}.template Reduce(input, LogicalWarpThreads, ::cuda::minimum<>{}); } template @@ -264,7 +262,7 @@ public: { auto thread_reduction = cub::ThreadReduce(input, ::cuda::minimum<>{}); return InternalWarpReduce{temp_storage}.template Reduce( - thread_reduction, LOGICAL_WARP_THREADS, ::cuda::minimum<>{}); + thread_reduction, LogicalWarpThreads, ::cuda::minimum<>{}); } //! @rst @@ -313,7 +311,7 @@ public: //! //! @param[in] valid_items //! Total number of valid items in the calling thread's logical warp - //! (may be less than ``LOGICAL_WARP_THREADS``) + //! (may be less than ``LogicalWarpThreads``) _CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE T Sum(T input, int valid_items) { // Determine if we don't need bounds checking @@ -496,7 +494,7 @@ public: template _CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE T Reduce(T input, ReductionOp reduction_op) { - return InternalWarpReduce(temp_storage).template Reduce(input, LOGICAL_WARP_THREADS, reduction_op); + return InternalWarpReduce(temp_storage).template Reduce(input, LogicalWarpThreads, reduction_op); } template @@ -505,7 +503,7 @@ public: Reduce(const InputType& input, ReductionOp reduction_op) { auto thread_reduction = cub::ThreadReduce(input, reduction_op); - return WarpReduce::Reduce(thread_reduction, LOGICAL_WARP_THREADS, reduction_op); + return WarpReduce::Reduce(thread_reduction, LogicalWarpThreads, reduction_op); } //! @rst //! Computes a partially-full warp-wide reduction in the calling warp using the specified binary @@ -562,7 +560,7 @@ public: //! //! @param[in] valid_items //! Total number of valid items in the calling thread's logical warp - //! (may be less than ``LOGICAL_WARP_THREADS``) + //! (may be less than ``LogicalWarpThreads``) template _CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE T Reduce(T input, ReductionOp reduction_op, int valid_items) {
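// Minimal caller-side sketch of the partially-full reduction declared above (hypothetical kernel;
// a single 32-thread logical warp and the <cuda/functional> functors are assumed). Only the first
// valid_items inputs contribute to the result, and lane 0 receives the aggregate.

#include <cub/warp_reduce.cuh>
#include <cuda/functional>

__global__ void partial_min_sketch(const int* d_in, int* d_out, int valid_items)
{
  using warp_reduce_t = cub::WarpReduce<int>;
  __shared__ typename warp_reduce_t::TempStorage temp_storage;
  int thread_data = d_in[threadIdx.x];
  // All lanes of the logical warp call Reduce; inputs beyond valid_items are ignored
  int aggregate = warp_reduce_t(temp_storage).Reduce(thread_data, ::cuda::minimum<>{}, valid_items);
  if (threadIdx.x == 0)
  {
    d_out[0] = aggregate;
  }
}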