Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Port yield normalization from CoreCLR to Native AOT #103675

Merged
merged 23 commits into from
Jul 17, 2024
Merged
36 changes: 36 additions & 0 deletions src/coreclr/inc/yieldprocessornormalized.h
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,25 @@

#pragma once

// TODO(review): confirm whether <limits.h> is still required by this header (e.g. for UINT_MAX).
#include <limits.h>

// Undefine YieldProcessor to encourage using the normalized versions below instead. System_YieldProcessor() can be used where
// the intention is to use the system-default implementation of YieldProcessor().
#define HAS_SYSTEM_YIELDPROCESSOR
// System_YieldProcessor() is defined here, before YieldProcessor and PalYieldProcessor are redefined below to the
// Dont_Use_* names. Native AOT builds forward to PalYieldProcessor(); all other builds forward to YieldProcessor().
#ifdef FEATURE_NATIVEAOT
FORCEINLINE void System_YieldProcessor() { PalYieldProcessor(); }
#else
FORCEINLINE void System_YieldProcessor() { YieldProcessor(); }
#endif
#ifdef YieldProcessor
#undef YieldProcessor
#endif
#define YieldProcessor Dont_Use_YieldProcessor
#ifdef PalYieldProcessor
#undef PalYieldProcessor
#endif
#define PalYieldProcessor Dont_Use_PalYieldProcessor

#define DISABLE_COPY(T) \
T(const T &) = delete; \
Expand All @@ -20,6 +31,31 @@ FORCEINLINE void System_YieldProcessor() { YieldProcessor(); }
T() = delete; \
DISABLE_COPY(T)

#ifdef FEATURE_NATIVEAOT
#define static_assert_no_msg( cond ) static_assert( cond, #cond )
#define SIZE_T uintptr_t
// TODO(review): verify that BYTE is 8 bits wide and ULONGLONG is an unsigned 64-bit type on all Native AOT targets.
typedef BYTE UINT8;
typedef ULONGLONG UINT64;

// Returns the smaller of the two arguments. When neither argument compares less than the other, v2 is returned.
template <typename T>
T Min(T v1, T v2)
{
    // STATIC_CONTRACT_LEAF;
    if (v1 < v2)
    {
        return v1;
    }
    return v2;
}

// Returns the larger of the two arguments. When neither argument compares greater than the other, v2 is returned.
template <typename T>
T Max(T v1, T v2)
{
    // STATIC_CONTRACT_LEAF;
    if (v1 > v2)
    {
        return v1;
    }
    return v2;
}

void InitializeYieldProcessorNormalizedCrst();
void EnsureYieldProcessorNormalizedInitialized();
#endif

class YieldProcessorNormalization
{
public:
Expand Down
103 changes: 2 additions & 101 deletions src/coreclr/nativeaot/Runtime/yieldprocessornormalized.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,105 +14,6 @@
#include "slist.h"
#include "volatile.h"
#include "yieldprocessornormalized.h"
#include "../../vm/synch.h"

#define ULONGLONG int64_t

static Volatile<bool> s_isYieldProcessorNormalizedInitialized = false;
static CrstStatic s_initializeYieldProcessorNormalizedCrst;

// Defaults are for when InitializeYieldProcessorNormalized has not yet been called or when no measurement is done, and are
// tuned for Skylake processors
unsigned int g_yieldsPerNormalizedYield = 1; // current value is for Skylake processors, this is expected to be ~8 for pre-Skylake
unsigned int g_optimalMaxNormalizedYieldsPerSpinIteration = 7;

// Initializes s_initializeYieldProcessorNormalizedCrst, the lock that InitializeYieldProcessorNormalized() acquires.
// NOTE(review): presumably this must run before the first EnsureYieldProcessorNormalizedInitialized() call - confirm
// against the runtime startup path.
void InitializeYieldProcessorNormalizedCrst()
{
WRAPPER_NO_CONTRACT;
s_initializeYieldProcessorNormalizedCrst.Init(CrstYieldProcessorNormalized);
}

// Measures how long System_YieldProcessor() takes on this machine and derives g_yieldsPerNormalizedYield and
// g_optimalMaxNormalizedYieldsPerSpinIteration from the result. The whole function runs while holding
// s_initializeYieldProcessorNormalizedCrst, and the initialized flag is re-checked after the lock is taken so the
// measurement is not repeated once the flag has been set. On success the yields-per-normalized-yield value is also handed
// to the GC through SetYieldProcessorScalingFactor(); on the low-resolution-clock path the defaults are kept and the GC is
// not notified.
static void InitializeYieldProcessorNormalized()
{
WRAPPER_NO_CONTRACT;

CrstHolder lock(&s_initializeYieldProcessorNormalizedCrst);

// Another caller may have finished the measurement before this one acquired the lock
if (s_isYieldProcessorNormalizedInitialized)
{
return;
}

// Intel pre-Skylake processor: measured typically 14-17 cycles per yield
// Intel post-Skylake processor: measured typically 125-150 cycles per yield
const int MeasureDurationMs = 10;
const int NsPerSecond = 1000 * 1000 * 1000;

ULONGLONG ticksPerSecond = PalQueryPerformanceFrequency();

// 1000 / MeasureDurationMs is the number of measurement windows per second (100); a clock with fewer ticks per second
// than that would make measureDurationTicks below evaluate to zero
if (ticksPerSecond < 1000 / MeasureDurationMs)
{
// High precision clock not available or clock resolution is too low, resort to defaults
s_isYieldProcessorNormalizedInitialized = true;
return;
}

// Measure the nanosecond delay per yield
ULONGLONG measureDurationTicks = ticksPerSecond / (1000 / MeasureDurationMs);
unsigned int yieldCount = 0;
ULONGLONG startTicks = PalQueryPerformanceCounter();
ULONGLONG elapsedTicks;
do
{
// On some systems, querying the high performance counter has relatively significant overhead. Do enough yields to mask
// the timing overhead. Assuming one yield has a delay of MinNsPerNormalizedYield, 1000 yields would have a delay in the
// low microsecond range.
for (int i = 0; i < 1000; ++i)
{
System_YieldProcessor();
}
yieldCount += 1000;

ULONGLONG nowTicks = PalQueryPerformanceCounter();
elapsedTicks = nowTicks - startTicks;
} while (elapsedTicks < measureDurationTicks);
// Convert elapsed ticks to nanoseconds and divide by the number of yields issued; done in double to avoid integer
// truncation. The result is clamped to a minimum of 1 ns.
double nsPerYield = (double)elapsedTicks * NsPerSecond / ((double)yieldCount * ticksPerSecond);
if (nsPerYield < 1)
{
nsPerYield = 1;
}

// Calculate the number of yields required to span the duration of a normalized yield. Since nsPerYield is at least 1, this
// value is naturally limited to MinNsPerNormalizedYield. The + 0.5 rounds to the nearest integer before truncation.
int yieldsPerNormalizedYield = (int)(MinNsPerNormalizedYield / nsPerYield + 0.5);
if (yieldsPerNormalizedYield < 1)
{
yieldsPerNormalizedYield = 1;
}
_ASSERTE(yieldsPerNormalizedYield <= (int)MinNsPerNormalizedYield);

// Calculate the maximum number of yields that would be optimal for a late spin iteration. Typically, we would not want to
// spend excessive amounts of time (thousands of cycles) doing only YieldProcessor, as SwitchToThread/Sleep would do a
// better job of allowing other work to run.
int optimalMaxNormalizedYieldsPerSpinIteration =
(int)(NsPerOptimalMaxSpinIterationDuration / (yieldsPerNormalizedYield * nsPerYield) + 0.5);
if (optimalMaxNormalizedYieldsPerSpinIteration < 1)
{
optimalMaxNormalizedYieldsPerSpinIteration = 1;
}

// Publish both results first, then set the flag that EnsureYieldProcessorNormalizedInitialized() tests
g_yieldsPerNormalizedYield = yieldsPerNormalizedYield;
g_optimalMaxNormalizedYieldsPerSpinIteration = optimalMaxNormalizedYieldsPerSpinIteration;
s_isYieldProcessorNormalizedInitialized = true;

// Pass the measured scaling factor on to the GC
GCHeapUtilities::GetGCHeap()->SetYieldProcessorScalingFactor((float)yieldsPerNormalizedYield);
}

// Triggers the one-time yield measurement unless the initialized flag is already set. The flag is read here without
// taking the lock; InitializeYieldProcessorNormalized() checks it again after acquiring its Crst.
void EnsureYieldProcessorNormalizedInitialized()
{
    WRAPPER_NO_CONTRACT;

    if (s_isYieldProcessorNormalizedInitialized)
    {
        // Measurement already done (or defaults were accepted); nothing more to do
        return;
    }

    InitializeYieldProcessorNormalized();
}
#include "../../vm/yieldprocessornormalizedshared.cpp"
228 changes: 2 additions & 226 deletions src/coreclr/nativeaot/Runtime/yieldprocessornormalized.h
Original file line number Diff line number Diff line change
@@ -1,229 +1,5 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.

#pragma once

#include <limits.h>

// Undefine YieldProcessor to encourage using the normalized versions below instead. System_YieldProcessor() can be used where
// the intention is to use the system-default implementation of YieldProcessor().
#define HAS_SYSTEM_YIELDPROCESSOR
FORCEINLINE void System_YieldProcessor() { PalYieldProcessor(); }
#ifdef YieldProcessor
#undef YieldProcessor
#endif
#define YieldProcessor Dont_Use_YieldProcessor
#ifdef PalYieldProcessor
#undef PalYieldProcessor
#endif
#define PalYieldProcessor Dont_Use_PalYieldProcessor

#define SIZE_T uintptr_t

const unsigned int MinNsPerNormalizedYield = 37; // measured typically 37-46 on post-Skylake
const unsigned int NsPerOptimalMaxSpinIterationDuration = 272; // approx. 900 cycles, measured 281 on pre-Skylake, 263 on post-Skylake

extern unsigned int g_yieldsPerNormalizedYield;
extern unsigned int g_optimalMaxNormalizedYieldsPerSpinIteration;

void InitializeYieldProcessorNormalizedCrst();
void EnsureYieldProcessorNormalizedInitialized();

// Holds copies of the yield normalization globals taken at construction time, plus their product. The fields are private;
// only the friend functions declared at the bottom of the class read them.
class YieldProcessorNormalizationInfo
{
private:
// Copy of g_yieldsPerNormalizedYield
unsigned int yieldsPerNormalizedYield;
// Copy of g_optimalMaxNormalizedYieldsPerSpinIteration
unsigned int optimalMaxNormalizedYieldsPerSpinIteration;
// yieldsPerNormalizedYield * optimalMaxNormalizedYieldsPerSpinIteration, precomputed in the constructor
unsigned int optimalMaxYieldsPerSpinIteration;

public:
// Members are initialized in declaration order, so the product in the last initializer reads the two member copies that
// were just taken from the globals rather than reading the globals a second time.
YieldProcessorNormalizationInfo()
: yieldsPerNormalizedYield(g_yieldsPerNormalizedYield),
optimalMaxNormalizedYieldsPerSpinIteration(g_optimalMaxNormalizedYieldsPerSpinIteration),
optimalMaxYieldsPerSpinIteration(yieldsPerNormalizedYield * optimalMaxNormalizedYieldsPerSpinIteration)
{
}

// The yield helpers that consume this snapshot
friend void YieldProcessorNormalized(const YieldProcessorNormalizationInfo &);
friend void YieldProcessorNormalized(const YieldProcessorNormalizationInfo &, unsigned int);
friend void YieldProcessorNormalizedForPreSkylakeCount(const YieldProcessorNormalizationInfo &, unsigned int);
friend void YieldProcessorWithBackOffNormalized(const YieldProcessorNormalizationInfo &, unsigned int);
};

// See YieldProcessorNormalized() for preliminary info. Typical usage:
// if (!condition)
// {
// YieldProcessorNormalizationInfo normalizationInfo;
// do
// {
// YieldProcessorNormalized(normalizationInfo);
// } while (!condition);
// }
FORCEINLINE void YieldProcessorNormalized(const YieldProcessorNormalizationInfo &normalizationInfo)
{
    unsigned int remainingYields = normalizationInfo.yieldsPerNormalizedYield;
    _ASSERTE(remainingYields != 0);

    // One yield is always issued before the counter is decremented and tested
    for (;;)
    {
        System_YieldProcessor();
        if (--remainingYields == 0)
        {
            break;
        }
    }
}

// Delays execution of the current thread for a short duration. Unlike YieldProcessor(), an effort is made to normalize the
// delay across processors. The actual delay may be meaningful in several ways, including but not limited to the following:
// - The delay should be long enough that a tiny spin-wait like the following has a decent likelihood of observing a new value
// for the condition (when changed by a different thread) on each iteration, otherwise it may unnecessarily increase CPU usage
// and decrease scalability of the operation.
// while(!condition)
// {
// YieldProcessorNormalized();
// }
// - The delay should be short enough that a tiny spin-wait like above would not miss multiple cross-thread changes to the
// condition, otherwise it may unnecessarily increase latency of the operation
// - In reasonably short spin-waits, the actual delay may not matter much. In unreasonably long spin-waits that progress in
// yield count per iteration for each failed check of the condition, the progression can significantly magnify the second
// issue above on later iterations.
// - This function and variants are intended to provide a decent balance between the above issues, as ideal solutions to each
// issue have trade-offs between them. If latency of the operation is far more important in the scenario, consider using
// System_YieldProcessor() instead, which would issue a delay that is typically <= the delay issued by this method.
FORCEINLINE void YieldProcessorNormalized()
{
    // Capture the current global normalization values once, then defer to the overload that takes them explicitly
    const YieldProcessorNormalizationInfo normalizationInfo;
    YieldProcessorNormalized(normalizationInfo);
}

// See YieldProcessorNormalized(count) for preliminary info. Typical usage:
// if (!moreExpensiveCondition)
// {
// YieldProcessorNormalizationInfo normalizationInfo;
// do
// {
// YieldProcessorNormalized(normalizationInfo, 2);
// } while (!moreExpensiveCondition);
// }
FORCEINLINE void YieldProcessorNormalized(const YieldProcessorNormalizationInfo &normalizationInfo, unsigned int count)
{
    _ASSERTE(count != 0);

    // When SIZE_T is no wider than unsigned int, the product computed below could wrap. Cap count so that multiplying by
    // yieldsPerNormalizedYield (which InitializeYieldProcessorNormalized() limits to MinNsPerNormalizedYield) stays in
    // range.
    if (sizeof(SIZE_T) <= sizeof(unsigned int))
    {
        const unsigned int CountLimit = UINT_MAX / MinNsPerNormalizedYield;
        count = (count > CountLimit) ? CountLimit : count;
    }

    SIZE_T remainingYields = (SIZE_T)count * normalizationInfo.yieldsPerNormalizedYield;
    _ASSERTE(remainingYields != 0);

    // One yield is always issued before the counter is decremented and tested
    for (;;)
    {
        System_YieldProcessor();
        if (--remainingYields == 0)
        {
            break;
        }
    }
}

// See YieldProcessorNormalized() for preliminary info. This function repeats the delay 'count' times. This overload is
// preferred over the single-count overload when multiple yields are desired per spin-wait iteration. Typical usage:
// while(!moreExpensiveCondition)
// {
// YieldProcessorNormalized(2);
// }
FORCEINLINE void YieldProcessorNormalized(unsigned int count)
{
    // Capture the current global normalization values once, then defer to the overload that takes them explicitly
    const YieldProcessorNormalizationInfo normalizationInfo;
    YieldProcessorNormalized(normalizationInfo, count);
}

// Please DO NOT use this function in new code! See YieldProcessorNormalizedForPreSkylakeCount(preSkylakeCount) for preliminary
// info. Typical usage:
// if (!condition)
// {
// YieldProcessorNormalizationInfo normalizationInfo;
// do
// {
// YieldProcessorNormalizedForPreSkylakeCount(normalizationInfo, 100);
// } while (!condition);
// }
FORCEINLINE void YieldProcessorNormalizedForPreSkylakeCount(
    const YieldProcessorNormalizationInfo &normalizationInfo,
    unsigned int preSkylakeCount)
{
    _ASSERTE(preSkylakeCount != 0);

    // When SIZE_T is no wider than unsigned int, the product computed below could wrap. Cap the count so that multiplying
    // by yieldsPerNormalizedYield (which InitializeYieldProcessorNormalized() limits to MinNsPerNormalizedYield) stays in
    // range.
    if (sizeof(SIZE_T) <= sizeof(unsigned int))
    {
        const unsigned int CountLimit = UINT_MAX / MinNsPerNormalizedYield;
        preSkylakeCount = (preSkylakeCount > CountLimit) ? CountLimit : preSkylakeCount;
    }

    // Scale the pre-Skylake count by the measured yields per normalized yield, then divide by the fixed divisor of 8
    const unsigned int PreSkylakeCountToSkylakeCountDivisor = 8;
    const SIZE_T scaledYields =
        (SIZE_T)preSkylakeCount * normalizationInfo.yieldsPerNormalizedYield / PreSkylakeCountToSkylakeCountDivisor;

    // The division may round down to zero; always issue at least one yield
    SIZE_T remainingYields = (scaledYields == 0) ? 1 : scaledYields;
    for (;;)
    {
        System_YieldProcessor();
        if (--remainingYields == 0)
        {
            break;
        }
    }
}

// Please DO NOT use this function in new code! This function is to be used for old spin-wait loops that have not been retuned
// for recent processors, and especially where the yield count may be unreasonably high. The function scales the yield count in
// an attempt to normalize the total delay across processors, to approximately the total delay that would be issued on a
// pre-Skylake processor. New code should be tuned with YieldProcessorNormalized() or variants instead. Typical usage:
// while(!condition)
// {
// YieldProcessorNormalizedForPreSkylakeCount(100);
// }
FORCEINLINE void YieldProcessorNormalizedForPreSkylakeCount(unsigned int preSkylakeCount)
{
    // Capture the current global normalization values once, then defer to the overload that takes them explicitly
    const YieldProcessorNormalizationInfo normalizationInfo;
    YieldProcessorNormalizedForPreSkylakeCount(normalizationInfo, preSkylakeCount);
}

// See YieldProcessorNormalized() for preliminary info. This function is to be used when there is a decent possibility that the
// condition would not be satisfied within a short duration. The current implementation increases the delay per spin-wait
// iteration exponentially up to a limit. Typical usage:
// if (!conditionThatMayNotBeSatisfiedSoon)
// {
// YieldProcessorNormalizationInfo normalizationInfo;
// unsigned int spinIteration = 0;
// do
// {
// YieldProcessorWithBackOffNormalized(normalizationInfo, spinIteration++); // maybe Sleep(0) occasionally
// } while (!conditionThatMayNotBeSatisfiedSoon);
// }
FORCEINLINE void YieldProcessorWithBackOffNormalized(
    const YieldProcessorNormalizationInfo &normalizationInfo,
    unsigned int spinIteration)
{
    // Upper bound on normalizationInfo.optimalMaxNormalizedYieldsPerSpinIteration, following from the calculations done
    // in InitializeYieldProcessorNormalized()
    const unsigned int MaxOptimalMaxNormalizedYieldsPerSpinIteration =
        NsPerOptimalMaxSpinIterationDuration * 3 / (MinNsPerNormalizedYield * 2) + 1;
    _ASSERTE(normalizationInfo.optimalMaxNormalizedYieldsPerSpinIteration <= MaxOptimalMaxNormalizedYieldsPerSpinIteration);

    // MaxShift has to be kept in sync with the static_assert that follows: 1 << (MaxShift + 1) must cover the bound above
    const uint8_t MaxShift = 3;
    static_assert(((unsigned int)1 << (MaxShift + 1)) >= MaxOptimalMaxNormalizedYieldsPerSpinIteration, "");

    // Start from the capped yield count and replace it with the exponential value (2^spinIteration normalized yields)
    // only while that value is still below the cap. spinIteration is range-checked before the shift is evaluated.
    unsigned int remainingYields = normalizationInfo.optimalMaxYieldsPerSpinIteration;
    if (spinIteration <= MaxShift)
    {
        const unsigned int normalizedYields = (unsigned int)1 << spinIteration;
        if (normalizedYields < normalizationInfo.optimalMaxNormalizedYieldsPerSpinIteration)
        {
            remainingYields = normalizedYields * normalizationInfo.yieldsPerNormalizedYield;
        }
    }
    _ASSERTE(remainingYields != 0);

    // One yield is always issued before the counter is decremented and tested
    for (;;)
    {
        System_YieldProcessor();
        if (--remainingYields == 0)
        {
            break;
        }
    }
}
#include "PalRedhawk.h"
#include "../../inc/yieldprocessornormalized.h"
Loading
Loading