Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Safe warp shuffle operations #3907

Draft
wants to merge 7 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
169 changes: 169 additions & 0 deletions libcudacxx/include/cuda/__warp_communication/warp_shuffle.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,169 @@
//===----------------------------------------------------------------------===//
//
// Part of libcu++, the C++ Standard Library for your entire system,
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPO__RATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//

#ifndef _CUDA___WARP_COMMUNICATION_SHFL_H
#define _CUDA___WARP_COMMUNICATION_SHFL_H

#include <cuda/std/detail/__config>

#include "cuda/__cccl_config"

#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
# pragma GCC system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
# pragma clang system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
# pragma system_header
#endif // no system header

#include <cuda/__cmath/ceil_div.h>
#include <cuda/__ptx/instructions/get_sreg.h>
#include <cuda/__ptx/instructions/shfl_sync.h>
#include <cuda/std/__bit/has_single_bit.h>
#include <cuda/std/__concepts/concept_macros.h>
#include <cuda/std/__type_traits/integral_constant.h>
#include <cuda/std/cstdint>

_LIBCUDACXX_BEGIN_NAMESPACE_CUDA

template <typename _Tp>
struct WarpShuffleResult
{
_Tp data;
bool pred;

_CCCL_NODISCARD _CCCL_HIDE_FROM_ABI _CCCL_DEVICE operator _Tp() const
{
return data;
}
};

template <int _Width = 32, typename _Tp>
_CCCL_NODISCARD _CCCL_HIDE_FROM_ABI _CCCL_DEVICE WarpShuffleResult<_Tp> warp_shuffle(
_Tp __data, int __src_lane, uint32_t __lane_mask = 0xFFFFFFFF, _CUDA_VSTD::integral_constant<int, _Width> = {})
{
constexpr int __warp_size = 32;
static_assert(_CUDA_VSTD::has_single_bit(static_cast<uint32_t>(_Width)) && _Width >= 1 && _Width <= __warp_size,
"_Width must be a power of 2 and less or equal to the warp size");
if constexpr (_Width == 1)
{
return WarpShuffleResult<_Tp>{__data, true};
}
else
{
constexpr auto __ratio = ::cuda::ceil_div(sizeof(_Tp), sizeof(uint32_t));
auto __clamp_segmask = (_Width - 1) | ((__warp_size - _Width) << 8);
bool __pred;
uint32_t __array[__ratio];
::memcpy(static_cast<void*>(__array), static_cast<void*>(&__data), sizeof(_Tp));
#pragma unroll
for (int i = 0; i < __ratio; ++i)
{
__array[i] = ::cuda::ptx::shfl_sync_idx(__array[i], __pred, __src_lane, __clamp_segmask, __lane_mask);
}
_Tp __result;
::memcpy(static_cast<void*>(&__result), static_cast<void*>(__array), sizeof(_Tp));
return WarpShuffleResult<_Tp>{__result, __pred};
}
}

template <int _Width = 32, typename _Tp>
_CCCL_NODISCARD _CCCL_HIDE_FROM_ABI _CCCL_DEVICE WarpShuffleResult<_Tp> warp_shuffle_up(
_Tp __data, int __delta, uint32_t __lane_mask = 0xFFFFFFFF, _CUDA_VSTD::integral_constant<int, _Width> = {})
{
constexpr int __warp_size = 32;
static_assert(_CUDA_VSTD::has_single_bit(static_cast<uint32_t>(_Width)) && _Width >= 1 && _Width <= __warp_size,
"_Width must be a power of 2 and less or equal to the warp size");
_CCCL_ASSERT(__delta >= 1 && __delta < _Width, "delta must be in the range [1, _Width)");
if constexpr (_Width == 1)
{
return WarpShuffleResult<_Tp>{__data, true};
}
else
{
constexpr auto __ratio = ::cuda::ceil_div(sizeof(_Tp), sizeof(uint32_t));
auto __clamp_segmask = (_Width - 1) | ((__warp_size - _Width) << 8);
bool __pred;
uint32_t __array[__ratio];
::memcpy(static_cast<void*>(__array), static_cast<void*>(&__data), sizeof(_Tp));
#pragma unroll
for (int i = 0; i < __ratio; ++i)
{
__array[i] = ::cuda::ptx::shfl_sync_up(__array[i], __pred, __delta, __clamp_segmask, __lane_mask);
}
_Tp __result;
::memcpy(static_cast<void*>(&__result), static_cast<void*>(__array), sizeof(_Tp));
return WarpShuffleResult<_Tp>{__result, __pred};
}
}

template <int _Width = 32, typename _Tp>
_CCCL_NODISCARD _CCCL_HIDE_FROM_ABI _CCCL_DEVICE WarpShuffleResult<_Tp> warp_shuffle_down(
_Tp __data, int __delta, uint32_t __lane_mask = 0xFFFFFFFF, _CUDA_VSTD::integral_constant<int, _Width> = {})
{
constexpr int __warp_size = 32;
static_assert(_CUDA_VSTD::has_single_bit(static_cast<uint32_t>(_Width)) && _Width >= 1 && _Width <= __warp_size,
"_Width must be a power of 2 and less or equal to the warp size");
_CCCL_ASSERT(__delta >= 1 && __delta < _Width, "delta must be in the range [1, _Width)");
if constexpr (_Width == 1)
{
return WarpShuffleResult<_Tp>{__data, true};
}
else
{
constexpr auto __ratio = ::cuda::ceil_div(sizeof(_Tp), sizeof(uint32_t));
auto __clamp_segmask = (_Width - 1) | ((__warp_size - _Width) << 8);
bool __pred;
uint32_t __array[__ratio];
::memcpy(static_cast<void*>(__array), static_cast<void*>(&__data), sizeof(_Tp));
#pragma unroll
for (int i = 0; i < __ratio; ++i)
{
__array[i] = ::cuda::ptx::shfl_sync_down(__array[i], __pred, __delta, __clamp_segmask, __lane_mask);
}
_Tp __result;
::memcpy(static_cast<void*>(&__result), static_cast<void*>(__array), sizeof(_Tp));
return WarpShuffleResult<_Tp>{__result, __pred};
}
}

template <int _Width = 32, typename _Tp>
_CCCL_NODISCARD _CCCL_HIDE_FROM_ABI _CCCL_DEVICE WarpShuffleResult<_Tp> warp_shuffle_xor(
_Tp __data, int __xor_mask, uint32_t __lane_mask = 0xFFFFFFFF, _CUDA_VSTD::integral_constant<int, _Width> = {})
{
constexpr int __warp_size = 32;
static_assert(_CUDA_VSTD::has_single_bit(static_cast<uint32_t>(_Width)) && _Width >= 1 && _Width <= __warp_size,
"_Width must be a power of 2 and less or equal to the warp size");
_CCCL_ASSERT(__xor_mask >= 1 && __xor_mask < _Width, "delta must be in the range [1, _Width)");
if constexpr (_Width == 1)
{
return WarpShuffleResult<_Tp>{__data, true};
}
else
{
constexpr auto __ratio = ::cuda::ceil_div(sizeof(_Tp), sizeof(uint32_t));
auto __clamp_segmask = (_Width - 1) | ((__warp_size - _Width) << 8);
bool __pred;
uint32_t __array[__ratio];
::memcpy(static_cast<void*>(__array), static_cast<void*>(&__data), sizeof(_Tp));
#pragma unroll
for (int i = 0; i < __ratio; ++i)
{
__array[i] = ::cuda::ptx::shfl_sync_bfly(__array[i], __pred, __xor_mask, __clamp_segmask, __lane_mask);
}
_Tp __result;
::memcpy(static_cast<void*>(&__result), static_cast<void*>(__array), sizeof(_Tp));
return WarpShuffleResult<_Tp>{__result, __pred};
}
}

_LIBCUDACXX_END_NAMESPACE_CUDA

#endif // _CUDA___WARP_COMMUNICATION_SHFL_H
26 changes: 26 additions & 0 deletions libcudacxx/include/cuda/warp_comm
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
//===----------------------------------------------------------------------===//
//
// Part of libcu++, the C++ Standard Library for your entire system,
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//

#ifndef _CUDA_WARP_COMM
#define _CUDA_WARP_COMM

#include <cuda/std/detail/__config>

#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
# pragma GCC system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
# pragma clang system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
# pragma system_header
#endif // no system header

#include <cuda/__warp_communication/warp_shuffle.h>

#endif // _CUDA_WARP_COMM
54 changes: 54 additions & 0 deletions libcudacxx/test/libcudacxx/cuda/warp_comm/warp_comm.pass.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
//===----------------------------------------------------------------------===//
//
// Part of the libcu++ Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//

#include <cuda/std/array>
#include <cuda/std/cassert>
#include <cuda/std/cstdint>
#include <cuda/warp_comm>

#include <type_traits>

template <int Value>
inline constexpr auto width_v = cuda::std::integral_constant<int, Value>{};

template <int Value>
__device__ void warp_shuffle_semantic_test()
{
uint32_t data = threadIdx.x;
for (int i = 0; i < 32; i++)
{
assert(cuda::warp_shuffle(data, i, 0xFFFFFFFF, width_v<Value>) == __shfl_sync(0xFFFFFFFF, data, i, Value));
}
}

template <class T>
__device__ void type_test(const T& data)
{}

__device__ void type_test()
{
type_test(cuda::std::array<double, 4>{1.0, 2.0, 3.0, 4.0});
}

__global__ void test_kernel()
{
warp_shuffle_semantic_test<1>();
warp_shuffle_semantic_test<2>();
warp_shuffle_semantic_test<4>();
warp_shuffle_semantic_test<8>();
warp_shuffle_semantic_test<16>();
warp_shuffle_semantic_test<32>();
type_test();
}

int main(int, char**)
{
NV_IF_TARGET(NV_IS_HOST, (test_kernel<<<1, 32>>>();))
return 0;
}