Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use intrinsic for genLog2() #69333

Merged
merged 6 commits into from
May 17, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
54 changes: 22 additions & 32 deletions src/coreclr/inc/bitposition.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,47 +12,37 @@
//
// Notes:
// 'value' must have exactly one bit set.
// The algorithm is as follows:
// - PRIME is a prime bigger than sizeof(unsigned int), which is not of the
// form 2^n-1.
// - Taking the modulo of 'value' with this will produce a unique hash for all
// powers of 2 (which is what "value" is).
// - Entries in hashTable[] which are -1 should never be used. There
// should be PRIME-8*sizeof(value) entries which are -1 .
// It performs the "TrailingZeroCount" operation using intrinsics.
//
inline
unsigned BitPosition(unsigned value)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This LGTM, but its worth calling out that this is "effectively" doing TrailingZeroCount, but without handling 0.

It might be worth centralizing these "primitive bit operation" methods somewhere and ensuring they are generally reusable.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Opened tracking issue #69401

{
_ASSERTE((value != 0) && ((value & (value-1)) == 0));
#if defined(HOST_ARM) && defined(__llvm__)
// use intrinsic functions for arm32
// this is applied for LLVM only but it may work for some compilers
DWORD index = __builtin_clz(__builtin_arm_rbit(value));
#elif !defined(HOST_AMD64)
const unsigned PRIME = 37;

static const char hashTable[PRIME] =
{
-1, 0, 1, 26, 2, 23, 27, -1, 3, 16,
24, 30, 28, 11, -1, 13, 4, 7, 17, -1,
25, 22, 31, 15, 29, 10, 12, 6, -1, 21,
14, 9, 5, 20, 8, 19, 18
};

_ASSERTE(PRIME >= 8*sizeof(value));
_ASSERTE(sizeof(hashTable) == PRIME);
DWORD index;
BitScanForward(&index, value);
return index;
}


unsigned hash = value % PRIME;
unsigned index = hashTable[hash];
_ASSERTE(index != (unsigned char)-1);
#else
// not enabled for x86 because BSF is extremely slow on Atom
// (15 clock cycles vs 1-3 on any other Intel CPU post-P4)
#ifdef HOST_64BIT
//------------------------------------------------------------------------
// BitPosition: Return the position of the single bit that is set in 'value'.
//
// Return Value:
// The position (0 is LSB) of bit that is set in 'value'
//
// Notes:
// 'value' must have exactly one bit set.
// It performs the "TrailingZeroCount" operation using intrinsics.
//
inline
unsigned BitPosition(unsigned __int64 value)
{
_ASSERTE((value != 0) && ((value & (value-1)) == 0));
DWORD index;
BitScanForward(&index, value);
#endif
BitScanForward64(&index, value);
return index;
}
#endif // HOST_64BIT

#endif
77 changes: 37 additions & 40 deletions src/coreclr/jit/compiler.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -97,56 +97,49 @@ inline T genFindLowestBit(T value)
return (value & (0 - value));
}

/*****************************************************************************/
/*****************************************************************************
*
* Return the highest bit that is set (that is, a mask that includes just the highest bit).
* TODO-ARM64-Throughput: we should convert these to use the _BitScanReverse() / _BitScanReverse64()
* compiler intrinsics, but our CRT header file intrin.h doesn't define these for ARM64 yet.
*/

//------------------------------------------------------------------------
// genFindHighestBit: Return the highest bit that is set (that is, a mask that includes just the
// highest bit).
//
// Return Value:
// The highest position (0 is LSB) of bit that is set in the 'value'.
//
// Note:
// It performs the "LeadingZeroCount " operation using intrinsics and then mask out everything
// but the highest bit.
inline unsigned int genFindHighestBit(unsigned int mask)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same here. LGTM but this is effectively using LeadingZeroCount (and then masking out everything but the highest bit) but without strictly handling some of the potential edge cases.

It might be worthwhile to centralize these somewhere (maybe as an "up for grabs" work item)

{
assert(mask != 0);
unsigned int bit = 1U << ((sizeof(unsigned int) * 8) - 1); // start looking at the top
while ((bit & mask) == 0)
{
bit >>= 1;
}
return bit;
}

inline unsigned __int64 genFindHighestBit(unsigned __int64 mask)
{
assert(mask != 0);
unsigned __int64 bit = 1ULL << ((sizeof(unsigned __int64) * 8) - 1); // start looking at the top
while ((bit & mask) == 0)
{
bit >>= 1;
}
return bit;
}

#if 0
// TODO-ARM64-Cleanup: These should probably be the implementation, when intrin.h is updated for ARM64
inline
unsigned int genFindHighestBit(unsigned int mask)
{
assert(mask != 0);
#if defined(_MSC_VER)
unsigned long index;
#else
unsigned int index;
_BitScanReverse(&index, mask);
#endif
BitScanReverse(&index, mask);
return 1L << index;
}

inline
unsigned __int64 genFindHighestBit(unsigned __int64 mask)
//------------------------------------------------------------------------
// genFindHighestBit: Return the highest bit that is set (that is, a mask that includes just the
// highest bit).
//
// Return Value:
// The highest position (0 is LSB) of bit that is set in the 'value'.
//
// Note:
// It performs the "LeadingZeroCount " operation using intrinsics and then mask out everything
// but the highest bit.
inline unsigned __int64 genFindHighestBit(unsigned __int64 mask)
{
assert(mask != 0);
#if defined(_MSC_VER)
unsigned long index;
#else
unsigned int index;
_BitScanReverse64(&index, mask);
#endif
BitScanReverse64(&index, mask);
return 1LL << index;
}
#endif // 0

/*****************************************************************************
*
Expand Down Expand Up @@ -222,8 +215,11 @@ inline unsigned uhi32(unsigned __int64 value)

inline unsigned genLog2(unsigned __int64 value)
{
unsigned lo32 = ulo32(value);
unsigned hi32 = uhi32(value);
#ifdef HOST_64BIT
return BitPosition(value);
#else // HOST_32BIT
unsigned lo32 = ulo32(value);
unsigned hi32 = uhi32(value);

if (lo32 != 0)
{
Expand All @@ -234,6 +230,7 @@ inline unsigned genLog2(unsigned __int64 value)
{
return genLog2(hi32) + 32;
}
#endif
}

/*****************************************************************************
Expand Down