Merge pull request #10935 from Morph1984/mwaitx
x64: Make use of monitorx instructions for power-efficient sleeps (AMD)
This commit is contained in:
commit
5e70db0d43
|
@ -93,6 +93,7 @@ void AppendCPUInfo(FieldCollection& fc) {
|
||||||
add_field("CPU_Extension_x64_GFNI", caps.gfni);
|
add_field("CPU_Extension_x64_GFNI", caps.gfni);
|
||||||
add_field("CPU_Extension_x64_INVARIANT_TSC", caps.invariant_tsc);
|
add_field("CPU_Extension_x64_INVARIANT_TSC", caps.invariant_tsc);
|
||||||
add_field("CPU_Extension_x64_LZCNT", caps.lzcnt);
|
add_field("CPU_Extension_x64_LZCNT", caps.lzcnt);
|
||||||
|
add_field("CPU_Extension_x64_MONITORX", caps.monitorx);
|
||||||
add_field("CPU_Extension_x64_MOVBE", caps.movbe);
|
add_field("CPU_Extension_x64_MOVBE", caps.movbe);
|
||||||
add_field("CPU_Extension_x64_PCLMULQDQ", caps.pclmulqdq);
|
add_field("CPU_Extension_x64_PCLMULQDQ", caps.pclmulqdq);
|
||||||
add_field("CPU_Extension_x64_POPCNT", caps.popcnt);
|
add_field("CPU_Extension_x64_POPCNT", caps.popcnt);
|
||||||
|
|
|
@ -168,6 +168,7 @@ static CPUCaps Detect() {
|
||||||
__cpuid(cpu_id, 0x80000001);
|
__cpuid(cpu_id, 0x80000001);
|
||||||
caps.lzcnt = Common::Bit<5>(cpu_id[2]);
|
caps.lzcnt = Common::Bit<5>(cpu_id[2]);
|
||||||
caps.fma4 = Common::Bit<16>(cpu_id[2]);
|
caps.fma4 = Common::Bit<16>(cpu_id[2]);
|
||||||
|
caps.monitorx = Common::Bit<29>(cpu_id[2]);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (max_ex_fn >= 0x80000007) {
|
if (max_ex_fn >= 0x80000007) {
|
||||||
|
|
|
@ -63,6 +63,7 @@ struct CPUCaps {
|
||||||
bool gfni : 1;
|
bool gfni : 1;
|
||||||
bool invariant_tsc : 1;
|
bool invariant_tsc : 1;
|
||||||
bool lzcnt : 1;
|
bool lzcnt : 1;
|
||||||
|
bool monitorx : 1;
|
||||||
bool movbe : 1;
|
bool movbe : 1;
|
||||||
bool pclmulqdq : 1;
|
bool pclmulqdq : 1;
|
||||||
bool popcnt : 1;
|
bool popcnt : 1;
|
||||||
|
|
|
@ -13,36 +13,60 @@
|
||||||
|
|
||||||
namespace Common::X64 {
|
namespace Common::X64 {
|
||||||
|
|
||||||
|
namespace {
|
||||||
|
|
||||||
|
// 100,000 cycles is a reasonable amount of time to wait to save on CPU resources.
|
||||||
|
// For reference:
|
||||||
|
// At 1 GHz, 100K cycles is 100us
|
||||||
|
// At 2 GHz, 100K cycles is 50us
|
||||||
|
// At 4 GHz, 100K cycles is 25us
|
||||||
|
constexpr auto PauseCycles = 100'000U;
|
||||||
|
|
||||||
|
} // Anonymous namespace
|
||||||
|
|
||||||
#ifdef _MSC_VER
|
#ifdef _MSC_VER
|
||||||
__forceinline static void TPAUSE() {
|
__forceinline static void TPAUSE() {
|
||||||
// 100,000 cycles is a reasonable amount of time to wait to save on CPU resources.
|
static constexpr auto RequestC02State = 0U;
|
||||||
// For reference:
|
_tpause(RequestC02State, FencedRDTSC() + PauseCycles);
|
||||||
// At 1 GHz, 100K cycles is 100us
|
}
|
||||||
// At 2 GHz, 100K cycles is 50us
|
|
||||||
// At 4 GHz, 100K cycles is 25us
|
__forceinline static void MWAITX() {
|
||||||
static constexpr auto PauseCycles = 100'000;
|
static constexpr auto EnableWaitTimeFlag = 1U << 1;
|
||||||
_tpause(0, FencedRDTSC() + PauseCycles);
|
static constexpr auto RequestC1State = 0U;
|
||||||
|
|
||||||
|
// monitor_var should be aligned to a cache line.
|
||||||
|
alignas(64) u64 monitor_var{};
|
||||||
|
_mm_monitorx(&monitor_var, 0, 0);
|
||||||
|
_mm_mwaitx(EnableWaitTimeFlag, RequestC1State, PauseCycles);
|
||||||
}
|
}
|
||||||
#else
|
#else
|
||||||
static void TPAUSE() {
|
static void TPAUSE() {
|
||||||
// 100,000 cycles is a reasonable amount of time to wait to save on CPU resources.
|
static constexpr auto RequestC02State = 0U;
|
||||||
// For reference:
|
|
||||||
// At 1 GHz, 100K cycles is 100us
|
|
||||||
// At 2 GHz, 100K cycles is 50us
|
|
||||||
// At 4 GHz, 100K cycles is 25us
|
|
||||||
static constexpr auto PauseCycles = 100'000;
|
|
||||||
const auto tsc = FencedRDTSC() + PauseCycles;
|
const auto tsc = FencedRDTSC() + PauseCycles;
|
||||||
const auto eax = static_cast<u32>(tsc & 0xFFFFFFFF);
|
const auto eax = static_cast<u32>(tsc & 0xFFFFFFFF);
|
||||||
const auto edx = static_cast<u32>(tsc >> 32);
|
const auto edx = static_cast<u32>(tsc >> 32);
|
||||||
asm volatile("tpause %0" : : "r"(0), "d"(edx), "a"(eax));
|
asm volatile("tpause %0" : : "r"(RequestC02State), "d"(edx), "a"(eax));
|
||||||
|
}
|
||||||
|
|
||||||
|
static void MWAITX() {
|
||||||
|
static constexpr auto EnableWaitTimeFlag = 1U << 1;
|
||||||
|
static constexpr auto RequestC1State = 0U;
|
||||||
|
|
||||||
|
// monitor_var should be aligned to a cache line.
|
||||||
|
alignas(64) u64 monitor_var{};
|
||||||
|
asm volatile("monitorx" : : "a"(&monitor_var), "c"(0), "d"(0));
|
||||||
|
asm volatile("mwaitx" : : "a"(RequestC1State), "b"(PauseCycles), "c"(EnableWaitTimeFlag));
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
void MicroSleep() {
|
void MicroSleep() {
|
||||||
static const bool has_waitpkg = GetCPUCaps().waitpkg;
|
static const bool has_waitpkg = GetCPUCaps().waitpkg;
|
||||||
|
static const bool has_monitorx = GetCPUCaps().monitorx;
|
||||||
|
|
||||||
if (has_waitpkg) {
|
if (has_waitpkg) {
|
||||||
TPAUSE();
|
TPAUSE();
|
||||||
|
} else if (has_monitorx) {
|
||||||
|
MWAITX();
|
||||||
} else {
|
} else {
|
||||||
std::this_thread::yield();
|
std::this_thread::yield();
|
||||||
}
|
}
|
||||||
|
|
Reference in New Issue