diff --git a/libcudacxx/include/cuda/__warp_communication/warp_shuffle.h b/libcudacxx/include/cuda/__warp_communication/warp_shuffle.h new file mode 100644 index 00000000000..8c517019ca9 --- /dev/null +++ b/libcudacxx/include/cuda/__warp_communication/warp_shuffle.h @@ -0,0 +1,169 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPO__RATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA___WARP_COMMUNICATION_SHFL_H +#define _CUDA___WARP_COMMUNICATION_SHFL_H + +#include + +#include "cuda/__cccl_config" + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include +#include +#include +#include +#include + +_LIBCUDACXX_BEGIN_NAMESPACE_CUDA + +template +struct WarpShuffleResult +{ + _Tp data; + bool pred; + + _CCCL_NODISCARD _CCCL_HIDE_FROM_ABI _CCCL_DEVICE operator _Tp() const + { + return data; + } +}; + +template +_CCCL_NODISCARD _CCCL_HIDE_FROM_ABI _CCCL_DEVICE WarpShuffleResult<_Tp> warp_shuffle( + _Tp __data, int __src_lane, uint32_t __lane_mask = 0xFFFFFFFF, _CUDA_VSTD::integral_constant = {}) +{ + constexpr int __warp_size = 32; + static_assert(_CUDA_VSTD::has_single_bit(static_cast(_Width)) && _Width >= 1 && _Width <= __warp_size, + "_Width must be a power of 2 and less or equal to the warp size"); + if constexpr (_Width == 1) + { + return WarpShuffleResult<_Tp>{__data, true}; + } + else + { + constexpr auto __ratio = ::cuda::ceil_div(sizeof(_Tp), sizeof(uint32_t)); + auto __clamp_segmask = (_Width - 1) | ((__warp_size - _Width) << 8); + bool __pred; + uint32_t __array[__ratio]; + ::memcpy(static_cast(__array), static_cast(&__data), sizeof(_Tp)); +#pragma unroll + for (int i = 0; i < __ratio; ++i) + { + __array[i] = ::cuda::ptx::shfl_sync_idx(__array[i], __pred, __src_lane, __clamp_segmask, __lane_mask); + } + _Tp __result; + ::memcpy(static_cast(&__result), static_cast(__array), sizeof(_Tp)); + return WarpShuffleResult<_Tp>{__result, __pred}; + } +} + +template +_CCCL_NODISCARD _CCCL_HIDE_FROM_ABI _CCCL_DEVICE WarpShuffleResult<_Tp> warp_shuffle_up( + _Tp __data, int __delta, uint32_t __lane_mask = 0xFFFFFFFF, _CUDA_VSTD::integral_constant = {}) +{ + constexpr int __warp_size = 32; + static_assert(_CUDA_VSTD::has_single_bit(static_cast(_Width)) && _Width >= 1 && _Width <= __warp_size, + "_Width must be a power of 2 and less or equal to the warp size"); + _CCCL_ASSERT(__delta >= 1 && __delta < _Width, "delta must be in the range [1, _Width)"); + if constexpr (_Width == 1) + { + return WarpShuffleResult<_Tp>{__data, true}; + } + else + { + constexpr auto __ratio = ::cuda::ceil_div(sizeof(_Tp), sizeof(uint32_t)); + auto __clamp_segmask = (_Width - 1) | ((__warp_size - _Width) << 8); + bool __pred; + uint32_t __array[__ratio]; + ::memcpy(static_cast(__array), static_cast(&__data), sizeof(_Tp)); +#pragma unroll + for (int i = 0; i < __ratio; ++i) + { + __array[i] = ::cuda::ptx::shfl_sync_up(__array[i], __pred, __delta, __clamp_segmask, __lane_mask); + } + _Tp __result; + ::memcpy(static_cast(&__result), static_cast(__array), sizeof(_Tp)); + return WarpShuffleResult<_Tp>{__result, __pred}; + } +} + +template +_CCCL_NODISCARD _CCCL_HIDE_FROM_ABI _CCCL_DEVICE WarpShuffleResult<_Tp> warp_shuffle_down( + _Tp __data, int __delta, uint32_t __lane_mask = 0xFFFFFFFF, _CUDA_VSTD::integral_constant = {}) +{ + constexpr int __warp_size = 32; + static_assert(_CUDA_VSTD::has_single_bit(static_cast(_Width)) && _Width >= 1 && _Width <= __warp_size, + "_Width must be a power of 2 and less or equal to the warp size"); + _CCCL_ASSERT(__delta >= 1 && __delta < _Width, "delta must be in the range [1, _Width)"); + if constexpr (_Width == 1) + { + return WarpShuffleResult<_Tp>{__data, true}; + } + else + { + constexpr auto __ratio = ::cuda::ceil_div(sizeof(_Tp), sizeof(uint32_t)); + auto __clamp_segmask = (_Width - 1) | ((__warp_size - _Width) << 8); + bool __pred; + uint32_t __array[__ratio]; + ::memcpy(static_cast(__array), static_cast(&__data), sizeof(_Tp)); +#pragma unroll + for (int i = 0; i < __ratio; ++i) + { + __array[i] = ::cuda::ptx::shfl_sync_down(__array[i], __pred, __delta, __clamp_segmask, __lane_mask); + } + _Tp __result; + ::memcpy(static_cast(&__result), static_cast(__array), sizeof(_Tp)); + return WarpShuffleResult<_Tp>{__result, __pred}; + } +} + +template +_CCCL_NODISCARD _CCCL_HIDE_FROM_ABI _CCCL_DEVICE WarpShuffleResult<_Tp> warp_shuffle_xor( + _Tp __data, int __xor_mask, uint32_t __lane_mask = 0xFFFFFFFF, _CUDA_VSTD::integral_constant = {}) +{ + constexpr int __warp_size = 32; + static_assert(_CUDA_VSTD::has_single_bit(static_cast(_Width)) && _Width >= 1 && _Width <= __warp_size, + "_Width must be a power of 2 and less or equal to the warp size"); + _CCCL_ASSERT(__xor_mask >= 1 && __xor_mask < _Width, "delta must be in the range [1, _Width)"); + if constexpr (_Width == 1) + { + return WarpShuffleResult<_Tp>{__data, true}; + } + else + { + constexpr auto __ratio = ::cuda::ceil_div(sizeof(_Tp), sizeof(uint32_t)); + auto __clamp_segmask = (_Width - 1) | ((__warp_size - _Width) << 8); + bool __pred; + uint32_t __array[__ratio]; + ::memcpy(static_cast(__array), static_cast(&__data), sizeof(_Tp)); +#pragma unroll + for (int i = 0; i < __ratio; ++i) + { + __array[i] = ::cuda::ptx::shfl_sync_bfly(__array[i], __pred, __xor_mask, __clamp_segmask, __lane_mask); + } + _Tp __result; + ::memcpy(static_cast(&__result), static_cast(__array), sizeof(_Tp)); + return WarpShuffleResult<_Tp>{__result, __pred}; + } +} + +_LIBCUDACXX_END_NAMESPACE_CUDA + +#endif // _CUDA___WARP_COMMUNICATION_SHFL_H diff --git a/libcudacxx/include/cuda/warp_comm b/libcudacxx/include/cuda/warp_comm new file mode 100644 index 00000000000..e8067f49df0 --- /dev/null +++ b/libcudacxx/include/cuda/warp_comm @@ -0,0 +1,26 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA_WARP_COMM +#define _CUDA_WARP_COMM + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include + +#endif // _CUDA_WARP_COMM diff --git a/libcudacxx/test/libcudacxx/cuda/warp_comm/warp_comm.pass.cpp b/libcudacxx/test/libcudacxx/cuda/warp_comm/warp_comm.pass.cpp new file mode 100644 index 00000000000..0d5498823d6 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/warp_comm/warp_comm.pass.cpp @@ -0,0 +1,54 @@ +//===----------------------------------------------------------------------===// +// +// Part of the libcu++ Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include + +#include + +template +inline constexpr auto width_v = cuda::std::integral_constant{}; + +template +__device__ void warp_shuffle_semantic_test() +{ + uint32_t data = threadIdx.x; + for (int i = 0; i < 32; i++) + { + assert(cuda::warp_shuffle(data, i, 0xFFFFFFFF, width_v) == __shfl_sync(0xFFFFFFFF, data, i, Value)); + } +} + +template +__device__ void type_test(const T& data) +{} + +__device__ void type_test() +{ + type_test(cuda::std::array{1.0, 2.0, 3.0, 4.0}); +} + +__global__ void test_kernel() +{ + warp_shuffle_semantic_test<1>(); + warp_shuffle_semantic_test<2>(); + warp_shuffle_semantic_test<4>(); + warp_shuffle_semantic_test<8>(); + warp_shuffle_semantic_test<16>(); + warp_shuffle_semantic_test<32>(); + type_test(); +} + +int main(int, char**) +{ + NV_IF_TARGET(NV_IS_HOST, (test_kernel<<<1, 32>>>();)) + return 0; +}