dotnet · eduardo-vp · Jul 17, 2024 · Jun 18, 2024 · Jun 19, 2024 · Jun 19, 2024
diff --git a/src/coreclr/inc/yieldprocessornormalized.h b/src/coreclr/inc/yieldprocessornormalized.h
@@ -3,14 +3,25 @@
 
 #pragma once
 
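+// Provides UINT_MAX (the former NativeAOT copy of this header included <limits.h> for its overflow clamps; TODO confirm it is still needed here)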
+#include <limits.h>
+
 // Undefine YieldProcessor to encourage using the normalized versions below instead. System_YieldProcessor() can be used where
 // the intention is to use the system-default implementation of YieldProcessor().
 #define HAS_SYSTEM_YIELDPROCESSOR
+#ifdef FEATURE_NATIVEAOT
+FORCEINLINE void System_YieldProcessor() { PalYieldProcessor(); } // NativeAOT build: forwards to PalYieldProcessor()
+#else // !FEATURE_NATIVEAOT
 FORCEINLINE void System_YieldProcessor() { YieldProcessor(); }
+#endif // FEATURE_NATIVEAOT
 #ifdef YieldProcessor
 #undef YieldProcessor
 #endif
 #define YieldProcessor Dont_Use_YieldProcessor
+#ifdef PalYieldProcessor
+#undef PalYieldProcessor // drop any existing macro first so the definition below is not a redefinition
+#endif // PalYieldProcessor
+#define PalYieldProcessor Dont_Use_PalYieldProcessor
 
 #define DISABLE_COPY(T) \
     T(const T &) = delete; \
@@ -20,6 +31,31 @@ FORCEINLINE void System_YieldProcessor() { YieldProcessor(); }
     T() = delete; \
     DISABLE_COPY(T)
 
+#ifdef FEATURE_NATIVEAOT // NativeAOT-only shims and declarations, up to the matching #endif
+#define static_assert_no_msg( cond ) static_assert( cond, #cond )
+#define SIZE_T uintptr_t
+// TODO(review): confirm BYTE is an unsigned 8-bit and ULONGLONG an unsigned 64-bit type in NativeAOT (ULONGLONG was once #defined as int64_t there)
+typedef BYTE UINT8;
+typedef ULONGLONG UINT64;
+
+template <typename T>
+T Min(T v1, T v2)
+{
+    // Returns v1 only when v1 < v2; otherwise (including ties) returns v2. Arguments and result are passed by value.
+    return v1 < v2 ? v1 : v2;
+}
+
+template <typename T>
+T Max(T v1, T v2)
+{
+    // Returns v1 only when v1 > v2; otherwise (including ties) returns v2. Arguments and result are passed by value.
+    return v1 > v2 ? v1 : v2;
+}
+
+void InitializeYieldProcessorNormalizedCrst(); // NOTE(review): definition is not in this header; presumably supplied by the NativeAOT runtime sources - confirm
+void EnsureYieldProcessorNormalizedInitialized(); // NOTE(review): same as above - declared here for NativeAOT builds only
+#endif // FEATURE_NATIVEAOT
+
 class YieldProcessorNormalization
 {
 public:

@@ -14,105 +14,6 @@
 #include "slist.h"
 #include "volatile.h"
 #include "yieldprocessornormalized.h"
+#include "../../vm/synch.h"
 
-#define ULONGLONG int64_t
-
-static Volatile<bool> s_isYieldProcessorNormalizedInitialized = false;
-static CrstStatic s_initializeYieldProcessorNormalizedCrst;
-
-// Defaults are for when InitializeYieldProcessorNormalized has not yet been called or when no measurement is done, and are
-// tuned for Skylake processors
-unsigned int g_yieldsPerNormalizedYield = 1; // current value is for Skylake processors, this is expected to be ~8 for pre-Skylake
-unsigned int g_optimalMaxNormalizedYieldsPerSpinIteration = 7;
-
-void InitializeYieldProcessorNormalizedCrst()
-{
-    WRAPPER_NO_CONTRACT;
-    s_initializeYieldProcessorNormalizedCrst.Init(CrstYieldProcessorNormalized);
-}
-
-static void InitializeYieldProcessorNormalized()
-{
-    WRAPPER_NO_CONTRACT;
-
-    CrstHolder lock(&s_initializeYieldProcessorNormalizedCrst);
-
-    if (s_isYieldProcessorNormalizedInitialized)
-    {
-        return;
-    }
-
-    // Intel pre-Skylake processor: measured typically 14-17 cycles per yield
-    // Intel post-Skylake processor: measured typically 125-150 cycles per yield
-    const int MeasureDurationMs = 10;
-    const int NsPerSecond = 1000 * 1000 * 1000;
-
-    ULONGLONG ticksPerSecond = PalQueryPerformanceFrequency();
-
-    if (ticksPerSecond < 1000 / MeasureDurationMs)
-    {
-        // High precision clock not available or clock resolution is too low, resort to defaults
-        s_isYieldProcessorNormalizedInitialized = true;
-        return;
-    }
-
-    // Measure the nanosecond delay per yield
-    ULONGLONG measureDurationTicks = ticksPerSecond / (1000 / MeasureDurationMs);
-    unsigned int yieldCount = 0;
-      ULONGLONG startTicks = PalQueryPerformanceCounter();
-    ULONGLONG elapsedTicks;
-    do
-    {
-        // On some systems, querying the high performance counter has relatively significant overhead. Do enough yields to mask
-        // the timing overhead. Assuming one yield has a delay of MinNsPerNormalizedYield, 1000 yields would have a delay in the
-        // low microsecond range.
-        for (int i = 0; i < 1000; ++i)
-        {
-            System_YieldProcessor();
-        }
-        yieldCount += 1000;
-
-        ULONGLONG nowTicks = PalQueryPerformanceCounter();
-        elapsedTicks = nowTicks - startTicks;
-    } while (elapsedTicks < measureDurationTicks);
-    double nsPerYield = (double)elapsedTicks * NsPerSecond / ((double)yieldCount * ticksPerSecond);
-    if (nsPerYield < 1)
-    {
-        nsPerYield = 1;
-    }
-
-    // Calculate the number of yields required to span the duration of a normalized yield. Since nsPerYield is at least 1, this
-    // value is naturally limited to MinNsPerNormalizedYield.
-    int yieldsPerNormalizedYield = (int)(MinNsPerNormalizedYield / nsPerYield + 0.5);
-    if (yieldsPerNormalizedYield < 1)
-    {
-        yieldsPerNormalizedYield = 1;
-    }
-    _ASSERTE(yieldsPerNormalizedYield <= (int)MinNsPerNormalizedYield);
-
-    // Calculate the maximum number of yields that would be optimal for a late spin iteration. Typically, we would not want to
-    // spend excessive amounts of time (thousands of cycles) doing only YieldProcessor, as SwitchToThread/Sleep would do a
-    // better job of allowing other work to run.
-    int optimalMaxNormalizedYieldsPerSpinIteration =
-        (int)(NsPerOptimalMaxSpinIterationDuration / (yieldsPerNormalizedYield * nsPerYield) + 0.5);
-    if (optimalMaxNormalizedYieldsPerSpinIteration < 1)
-    {
-        optimalMaxNormalizedYieldsPerSpinIteration = 1;
-    }
-
-    g_yieldsPerNormalizedYield = yieldsPerNormalizedYield;
-    g_optimalMaxNormalizedYieldsPerSpinIteration = optimalMaxNormalizedYieldsPerSpinIteration;
-    s_isYieldProcessorNormalizedInitialized = true;
-
-    GCHeapUtilities::GetGCHeap()->SetYieldProcessorScalingFactor((float)yieldsPerNormalizedYield);
-}
-
-void EnsureYieldProcessorNormalizedInitialized()
-{
-    WRAPPER_NO_CONTRACT;
-
-    if (!s_isYieldProcessorNormalizedInitialized)
-    {
-        InitializeYieldProcessorNormalized();
-    }
-}
+#include "../../vm/yieldprocessornormalizedshared.cpp"
@@ -1,229 +1,5 @@
 // Licensed to the .NET Foundation under one or more agreements.
 // The .NET Foundation licenses this file to you under the MIT license.
 
-#pragma once
-
-#include <limits.h>
-
-// Undefine YieldProcessor to encourage using the normalized versions below instead. System_YieldProcessor() can be used where
-// the intention is to use the system-default implementation of YieldProcessor().
-#define HAS_SYSTEM_YIELDPROCESSOR
-FORCEINLINE void System_YieldProcessor() { PalYieldProcessor(); }
-#ifdef YieldProcessor
-#undef YieldProcessor
-#endif
-#define YieldProcessor Dont_Use_YieldProcessor
-#ifdef PalYieldProcessor
-#undef PalYieldProcessor
-#endif
-#define PalYieldProcessor Dont_Use_PalYieldProcessor
-
-#define SIZE_T uintptr_t
-
-const unsigned int MinNsPerNormalizedYield = 37; // measured typically 37-46 on post-Skylake
-const unsigned int NsPerOptimalMaxSpinIterationDuration = 272; // approx. 900 cycles, measured 281 on pre-Skylake, 263 on post-Skylake
-
-extern unsigned int g_yieldsPerNormalizedYield;
-extern unsigned int g_optimalMaxNormalizedYieldsPerSpinIteration;
-
-void InitializeYieldProcessorNormalizedCrst();
-void EnsureYieldProcessorNormalizedInitialized();
-
-class YieldProcessorNormalizationInfo
-{
-private:
-    unsigned int yieldsPerNormalizedYield;
-    unsigned int optimalMaxNormalizedYieldsPerSpinIteration;
-    unsigned int optimalMaxYieldsPerSpinIteration;
-
-public:
-    YieldProcessorNormalizationInfo()
-        : yieldsPerNormalizedYield(g_yieldsPerNormalizedYield),
-        optimalMaxNormalizedYieldsPerSpinIteration(g_optimalMaxNormalizedYieldsPerSpinIteration),
-        optimalMaxYieldsPerSpinIteration(yieldsPerNormalizedYield * optimalMaxNormalizedYieldsPerSpinIteration)
-    {
-    }
-
-    friend void YieldProcessorNormalized(const YieldProcessorNormalizationInfo &);
-    friend void YieldProcessorNormalized(const YieldProcessorNormalizationInfo &, unsigned int);
-    friend void YieldProcessorNormalizedForPreSkylakeCount(const YieldProcessorNormalizationInfo &, unsigned int);
-    friend void YieldProcessorWithBackOffNormalized(const YieldProcessorNormalizationInfo &, unsigned int);
-};
-
-// See YieldProcessorNormalized() for preliminary info. Typical usage:
-//     if (!condition)
-//     {
-//         YieldProcessorNormalizationInfo normalizationInfo;
-//         do
-//         {
-//             YieldProcessorNormalized(normalizationInfo);
-//         } while (!condition);
-//     }
-FORCEINLINE void YieldProcessorNormalized(const YieldProcessorNormalizationInfo &normalizationInfo)
-{
-    unsigned int n = normalizationInfo.yieldsPerNormalizedYield;
-    _ASSERTE(n != 0);
-    do
-    {
-        System_YieldProcessor();
-    } while (--n != 0);
-}
-
-// Delays execution of the current thread for a short duration. Unlike YieldProcessor(), an effort is made to normalize the
-// delay across processors. The actual delay may be meaningful in several ways, including but not limited to the following:
-//   - The delay should be long enough that a tiny spin-wait like the following has a decent likelihood of observing a new value
-//     for the condition (when changed by a different thread) on each iteration, otherwise it may unnecessary increase CPU usage
-//     and decrease scalability of the operation.
-//         while(!condition)
-//         {
-//             YieldProcessorNormalized();
-//         }
-//   - The delay should be short enough that a tiny spin-wait like above would not miss multiple cross-thread changes to the
-//     condition, otherwise it may unnecessarily increase latency of the operation
-//   - In reasonably short spin-waits, the actual delay may not matter much. In unreasonably long spin-waits that progress in
-//     yield count per iteration for each failed check of the condition, the progression can significantly magnify the second
-//     issue above on later iterations.
-//   - This function and variants are intended to provide a decent balance between the above issues, as ideal solutions to each
-//     issue have trade-offs between them. If latency of the operation is far more important in the scenario, consider using
-//     System_YieldProcessor() instead, which would issue a delay that is typically <= the delay issued by this method.
-FORCEINLINE void YieldProcessorNormalized()
-{
-    YieldProcessorNormalized(YieldProcessorNormalizationInfo());
-}
-
-// See YieldProcessorNormalized(count) for preliminary info. Typical usage:
-//     if (!moreExpensiveCondition)
-//     {
-//         YieldProcessorNormalizationInfo normalizationInfo;
-//         do
-//         {
-//             YieldProcessorNormalized(normalizationInfo, 2);
-//         } while (!moreExpensiveCondition);
-//     }
-FORCEINLINE void YieldProcessorNormalized(const YieldProcessorNormalizationInfo &normalizationInfo, unsigned int count)
-{
-    _ASSERTE(count != 0);
-
-    if (sizeof(SIZE_T) <= sizeof(unsigned int))
-    {
-        // On platforms with a small SIZE_T, prevent overflow on the multiply below. normalizationInfo.yieldsPerNormalizedYield
-        // is limited to MinNsPerNormalizedYield by InitializeYieldProcessorNormalized().
-        const unsigned int MaxCount = UINT_MAX / MinNsPerNormalizedYield;
-        if (count > MaxCount)
-        {
-            count = MaxCount;
-        }
-    }
-
-    SIZE_T n = (SIZE_T)count * normalizationInfo.yieldsPerNormalizedYield;
-    _ASSERTE(n != 0);
-    do
-    {
-        System_YieldProcessor();
-    } while (--n != 0);
-}
-
-// See YieldProcessorNormalized() for preliminary info. This function repeats the delay 'count' times. This overload is
-// preferred over the single-count overload when multiple yields are desired per spin-wait iteration. Typical usage:
-//     while(!moreExpensiveCondition)
-//     {
-//         YieldProcessorNormalized(2);
-//     }
-FORCEINLINE void YieldProcessorNormalized(unsigned int count)
-{
-    YieldProcessorNormalized(YieldProcessorNormalizationInfo(), count);
-}
-
-// Please DO NOT use this function in new code! See YieldProcessorNormalizedForPreSkylakeCount(preSkylakeCount) for preliminary
-// info. Typical usage:
-//     if (!condition)
-//     {
-//         YieldProcessorNormalizationInfo normalizationInfo;
-//         do
-//         {
-//             YieldProcessorNormalizedForPreSkylakeCount(normalizationInfo, 100);
-//         } while (!condition);
-//     }
-FORCEINLINE void YieldProcessorNormalizedForPreSkylakeCount(
-    const YieldProcessorNormalizationInfo &normalizationInfo,
-    unsigned int preSkylakeCount)
-{
-    _ASSERTE(preSkylakeCount != 0);
-
-    if (sizeof(SIZE_T) <= sizeof(unsigned int))
-    {
-        // On platforms with a small SIZE_T, prevent overflow on the multiply below. normalizationInfo.yieldsPerNormalizedYield
-        // is limited to MinNsPerNormalizedYield by InitializeYieldProcessorNormalized().
-        const unsigned int MaxCount = UINT_MAX / MinNsPerNormalizedYield;
-        if (preSkylakeCount > MaxCount)
-        {
-            preSkylakeCount = MaxCount;
-        }
-    }
-
-    const unsigned int PreSkylakeCountToSkylakeCountDivisor = 8;
-    SIZE_T n = (SIZE_T)preSkylakeCount * normalizationInfo.yieldsPerNormalizedYield / PreSkylakeCountToSkylakeCountDivisor;
-    if (n == 0)
-    {
-        n = 1;
-    }
-    do
-    {
-        System_YieldProcessor();
-    } while (--n != 0);
-}
-
-// Please DO NOT use this function in new code! This function is to be used for old spin-wait loops that have not been retuned
-// for recent processors, and especially where the yield count may be unreasonably high. The function scales the yield count in
-// an attempt to normalize the total delay across processors, to approximately the total delay that would be issued on a
-// pre-Skylake processor. New code should be tuned with YieldProcessorNormalized() or variants instead. Typical usage:
-//     while(!condition)
-//     {
-//         YieldProcessorNormalizedForPreSkylakeCount(100);
-//     }
-FORCEINLINE void YieldProcessorNormalizedForPreSkylakeCount(unsigned int preSkylakeCount)
-{
-    YieldProcessorNormalizedForPreSkylakeCount(YieldProcessorNormalizationInfo(), preSkylakeCount);
-}
-
-// See YieldProcessorNormalized() for preliminary info. This function is to be used when there is a decent possibility that the
-// condition would not be satisfied within a short duration. The current implementation increases the delay per spin-wait
-// iteration exponentially up to a limit. Typical usage:
-//     if (!conditionThatMayNotBeSatisfiedSoon)
-//     {
-//         YieldProcessorNormalizationInfo normalizationInfo;
-//         do
-//         {
-//             YieldProcessorWithBackOffNormalized(normalizationInfo); // maybe Sleep(0) occasionally
-//         } while (!conditionThatMayNotBeSatisfiedSoon);
-//     }
-FORCEINLINE void YieldProcessorWithBackOffNormalized(
-    const YieldProcessorNormalizationInfo &normalizationInfo,
-    unsigned int spinIteration)
-{
-    // normalizationInfo.optimalMaxNormalizedYieldsPerSpinIteration cannot exceed the value below based on calculations done in
-    // InitializeYieldProcessorNormalized()
-    const unsigned int MaxOptimalMaxNormalizedYieldsPerSpinIteration =
-        NsPerOptimalMaxSpinIterationDuration * 3 / (MinNsPerNormalizedYield * 2) + 1;
-    _ASSERTE(normalizationInfo.optimalMaxNormalizedYieldsPerSpinIteration <= MaxOptimalMaxNormalizedYieldsPerSpinIteration);
-
-    // This shift value should be adjusted based on the asserted condition below
-    const uint8_t MaxShift = 3;
-    static_assert(((unsigned int)1 << (MaxShift + 1)) >= MaxOptimalMaxNormalizedYieldsPerSpinIteration, "");
-
-    unsigned int n;
-    if (spinIteration <= MaxShift &&
-        ((unsigned int)1 << spinIteration) < normalizationInfo.optimalMaxNormalizedYieldsPerSpinIteration)
-    {
-        n = ((unsigned int)1 << spinIteration) * normalizationInfo.yieldsPerNormalizedYield;
-    }
-    else
-    {
-        n = normalizationInfo.optimalMaxYieldsPerSpinIteration;
-    }
-    _ASSERTE(n != 0);
-    do
-    {
-        System_YieldProcessor();
-    } while (--n != 0);
-}
+#include "PalRedhawk.h"
+#include "../../inc/yieldprocessornormalized.h"