From 4303ed614d0d758d9e9bcdef8afee3274769d2fb Mon Sep 17 00:00:00 2001 From: Morph <39850852+Morph1984@users.noreply.github.com> Date: Wed, 28 Jun 2023 01:07:10 -0400 Subject: [PATCH 1/4] x64: Add detection of monitorx instructions monitorx introduces 2 instructions: MONITORX and MWAITX. --- src/common/telemetry.cpp | 1 + src/common/x64/cpu_detect.cpp | 1 + src/common/x64/cpu_detect.h | 1 + 3 files changed, 3 insertions(+) diff --git a/src/common/telemetry.cpp b/src/common/telemetry.cpp index 91352912d..929ed67e4 100644 --- a/src/common/telemetry.cpp +++ b/src/common/telemetry.cpp @@ -93,6 +93,7 @@ void AppendCPUInfo(FieldCollection& fc) { add_field("CPU_Extension_x64_GFNI", caps.gfni); add_field("CPU_Extension_x64_INVARIANT_TSC", caps.invariant_tsc); add_field("CPU_Extension_x64_LZCNT", caps.lzcnt); + add_field("CPU_Extension_x64_MONITORX", caps.monitorx); add_field("CPU_Extension_x64_MOVBE", caps.movbe); add_field("CPU_Extension_x64_PCLMULQDQ", caps.pclmulqdq); add_field("CPU_Extension_x64_POPCNT", caps.popcnt); diff --git a/src/common/x64/cpu_detect.cpp b/src/common/x64/cpu_detect.cpp index c998b1197..780120a5b 100644 --- a/src/common/x64/cpu_detect.cpp +++ b/src/common/x64/cpu_detect.cpp @@ -168,6 +168,7 @@ static CPUCaps Detect() { __cpuid(cpu_id, 0x80000001); caps.lzcnt = Common::Bit<5>(cpu_id[2]); caps.fma4 = Common::Bit<16>(cpu_id[2]); + caps.monitorx = Common::Bit<29>(cpu_id[2]); } if (max_ex_fn >= 0x80000007) { diff --git a/src/common/x64/cpu_detect.h b/src/common/x64/cpu_detect.h index 8253944d6..756459417 100644 --- a/src/common/x64/cpu_detect.h +++ b/src/common/x64/cpu_detect.h @@ -63,6 +63,7 @@ struct CPUCaps { bool gfni : 1; bool invariant_tsc : 1; bool lzcnt : 1; + bool monitorx : 1; bool movbe : 1; bool pclmulqdq : 1; bool popcnt : 1; From 3d868baaa44152e7a4bd8c64905443fd9a08adce Mon Sep 17 00:00:00 2001 From: Morph <39850852+Morph1984@users.noreply.github.com> Date: Wed, 28 Jun 2023 01:07:59 -0400 Subject: [PATCH 2/4] x64: cpu_wait: Make use of 
MWAITX in MicroSleep MWAITX is equivalent to UMWAIT on Intel's Alder Lake CPUs. We can emulate TPAUSE by using MONITORX in conjunction with MWAITX to wait for 100K cycles. --- src/common/x64/cpu_wait.cpp | 33 +++++++++++++++++++++------------ 1 file changed, 21 insertions(+), 12 deletions(-) diff --git a/src/common/x64/cpu_wait.cpp b/src/common/x64/cpu_wait.cpp index c53dd4945..11b9c4d83 100644 --- a/src/common/x64/cpu_wait.cpp +++ b/src/common/x64/cpu_wait.cpp @@ -13,24 +13,30 @@ namespace Common::X64 { +namespace { + +// 100,000 cycles is a reasonable amount of time to wait to save on CPU resources. +// For reference: +// At 1 GHz, 100K cycles is 100us +// At 2 GHz, 100K cycles is 50us +// At 4 GHz, 100K cycles is 25us +constexpr auto PauseCycles = 100'000U; + +} // Anonymous namespace + #ifdef _MSC_VER __forceinline static void TPAUSE() { - // 100,000 cycles is a reasonable amount of time to wait to save on CPU resources. - // For reference: - // At 1 GHz, 100K cycles is 100us - // At 2 GHz, 100K cycles is 50us - // At 4 GHz, 100K cycles is 25us - static constexpr auto PauseCycles = 100'000; _tpause(0, FencedRDTSC() + PauseCycles); } + +__forceinline static void MWAITX() { + // monitor_var should be aligned to a cache line. + alignas(64) u64 monitor_var{}; + _mm_monitorx(&monitor_var, 0, 0); + _mm_mwaitx(/* extensions*/ 2, /* hints */ 0, /* cycles */ PauseCycles); +} #else static void TPAUSE() { - // 100,000 cycles is a reasonable amount of time to wait to save on CPU resources. 
- // For reference: - // At 1 GHz, 100K cycles is 100us - // At 2 GHz, 100K cycles is 50us - // At 4 GHz, 100K cycles is 25us - static constexpr auto PauseCycles = 100'000; const auto tsc = FencedRDTSC() + PauseCycles; const auto eax = static_cast<u32>(tsc & 0xFFFFFFFF); const auto edx = static_cast<u32>(tsc >> 32); @@ -40,9 +46,12 @@ static void TPAUSE() { void MicroSleep() { static const bool has_waitpkg = GetCPUCaps().waitpkg; + static const bool has_monitorx = GetCPUCaps().monitorx; if (has_waitpkg) { TPAUSE(); + } else if (has_monitorx) { + MWAITX(); } else { std::this_thread::yield(); } From 2b68a3cbbf144b97aa524eb1dd17aad34cdf1a67 Mon Sep 17 00:00:00 2001 From: Morph <39850852+Morph1984@users.noreply.github.com> Date: Wed, 28 Jun 2023 01:19:41 -0400 Subject: [PATCH 3/4] x64: cpu_wait: Remove magic values --- src/common/x64/cpu_wait.cpp | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/src/common/x64/cpu_wait.cpp b/src/common/x64/cpu_wait.cpp index 11b9c4d83..ea16c8490 100644 --- a/src/common/x64/cpu_wait.cpp +++ b/src/common/x64/cpu_wait.cpp @@ -26,21 +26,26 @@ constexpr auto PauseCycles = 100'000U; #ifdef _MSC_VER __forceinline static void TPAUSE() { - _tpause(0, FencedRDTSC() + PauseCycles); + static constexpr auto RequestC02State = 0U; + _tpause(RequestC02State, FencedRDTSC() + PauseCycles); } __forceinline static void MWAITX() { + static constexpr auto EnableWaitTimeFlag = 1U << 1; + static constexpr auto RequestC1State = 0U; + // monitor_var should be aligned to a cache line. 
alignas(64) u64 monitor_var{}; _mm_monitorx(&monitor_var, 0, 0); - _mm_mwaitx(/* extensions*/ 2, /* hints */ 0, /* cycles */ PauseCycles); + _mm_mwaitx(EnableWaitTimeFlag, RequestC1State, PauseCycles); } #else static void TPAUSE() { + static constexpr auto RequestC02State = 0U; const auto tsc = FencedRDTSC() + PauseCycles; const auto eax = static_cast<u32>(tsc & 0xFFFFFFFF); const auto edx = static_cast<u32>(tsc >> 32); - asm volatile("tpause %0" : : "r"(0), "d"(edx), "a"(eax)); + asm volatile("tpause %0" : : "r"(RequestC02State), "d"(edx), "a"(eax)); } #endif From 295fc7d0f8f0b6158307c5c9b11a60516f9eb221 Mon Sep 17 00:00:00 2001 From: Morph <39850852+Morph1984@users.noreply.github.com> Date: Wed, 28 Jun 2023 01:35:16 -0400 Subject: [PATCH 4/4] x64: cpu_wait: Implement MWAITX for non-MSVC compilers --- src/common/x64/cpu_wait.cpp | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/common/x64/cpu_wait.cpp b/src/common/x64/cpu_wait.cpp index ea16c8490..41d385f59 100644 --- a/src/common/x64/cpu_wait.cpp +++ b/src/common/x64/cpu_wait.cpp @@ -47,6 +47,16 @@ static void TPAUSE() { const auto edx = static_cast<u32>(tsc >> 32); asm volatile("tpause %0" : : "r"(RequestC02State), "d"(edx), "a"(eax)); } + +static void MWAITX() { + static constexpr auto EnableWaitTimeFlag = 1U << 1; + static constexpr auto RequestC1State = 0U; + + // monitor_var should be aligned to a cache line. + alignas(64) u64 monitor_var{}; + asm volatile("monitorx" : : "a"(&monitor_var), "c"(0), "d"(0)); + asm volatile("mwaitx" : : "a"(RequestC1State), "b"(PauseCycles), "c"(EnableWaitTimeFlag)); +} #endif void MicroSleep() {