intel_idle: Disable IBRS during long idle
Author:     Peter Zijlstra <peterz@infradead.org>
AuthorDate: Tue, 14 Jun 2022 21:15:58 +0000 (23:15 +0200)
Committer:  Borislav Petkov <bp@suse.de>
CommitDate: Mon, 27 Jun 2022 08:33:59 +0000 (10:33 +0200)
Having IBRS enabled while the SMT sibling is idle unnecessarily slows
down the running sibling. OTOH, disabling IBRS around idle takes two
MSR writes, which will increase the idle latency.

Therefore, only disable IBRS around deeper idle states. Shallow idle
states are bounded by the tick in duration, since NOHZ is not allowed
for them by virtue of their short target residency.

Only do this for mwait-driven idle, since that keeps interrupts disabled
across idle, which makes disabling IBRS vs IRQ-entry a non-issue.

Note: the C6 cutoff is a somewhat arbitrary threshold; most importantly, C1
probably shouldn't disable IBRS. Benchmarking is needed.
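
For reference, the mechanism condenses to the following (a commented
restatement of the intel_idle_ibrs() callback added in the diff below; all
identifiers are taken from the patch itself):

    static __cpuidle int intel_idle_ibrs(struct cpuidle_device *dev,
                                         struct cpuidle_driver *drv, int index)
    {
            bool smt_active = sched_smt_active();
            u64 spec_ctrl = spec_ctrl_current();   /* per-CPU cached SPEC_CTRL value */
            int ret;

            /* Only bother when an SMT sibling can actually benefit. */
            if (smt_active)
                    wrmsrl(MSR_IA32_SPEC_CTRL, 0); /* 1st extra MSR write: IBRS off */

            ret = __intel_idle(dev, drv, index);   /* mwait; IRQs stay disabled */

            if (smt_active)
                    wrmsrl(MSR_IA32_SPEC_CTRL, spec_ctrl); /* 2nd write: restore IBRS */

            return ret;
    }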

Suggested-by: Tim Chen <tim.c.chen@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Borislav Petkov <bp@suse.de>
Reviewed-by: Josh Poimboeuf <jpoimboe@kernel.org>
Signed-off-by: Borislav Petkov <bp@suse.de>
arch/x86/include/asm/nospec-branch.h
arch/x86/kernel/cpu/bugs.c
drivers/idle/intel_idle.c
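
The wiring across the three files is equally small: bugs.c exports the
per-CPU SPEC_CTRL value via spec_ctrl_current() (declared in
nospec-branch.h), and intel_idle's init code installs the new callback for
any C-state tagged with the new flag. Condensed from the
intel_idle_init_cstates_icpu() hunk below:

    /* In intel_idle_init_cstates_icpu(), per C-state (condensed): */
    if (cpu_feature_enabled(X86_FEATURE_KERNEL_IBRS) &&
        cpuidle_state_table[cstate].flags & CPUIDLE_FLAG_IBRS) {
            /* IBRS toggling relies on IRQs staying disabled across idle. */
            WARN_ON_ONCE(cpuidle_state_table[cstate].flags & CPUIDLE_FLAG_IRQ_ENABLE);
            drv->states[drv->state_count].enter = intel_idle_ibrs;
    }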

diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h
index e14046daa7ba31bd3adb9c3f50c284ff110c29cd..ce1acb5571624c9959e98b75a0b52ff2f016e490 100644
--- a/arch/x86/include/asm/nospec-branch.h
+++ b/arch/x86/include/asm/nospec-branch.h
@@ -255,6 +255,7 @@ static inline void indirect_branch_prediction_barrier(void)
 /* The Intel SPEC CTRL MSR base value cache */
 extern u64 x86_spec_ctrl_base;
 extern void write_spec_ctrl_current(u64 val, bool force);
+extern u64 spec_ctrl_current(void);
 
 /*
  * With retpoline, we must use IBRS to restrict branch prediction
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index 05f29db9473b9cb0e0c88bbeb91fe738446eafc4..00e9c769ce0b7691b64c777f1e58b8826b99d62a 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -79,6 +79,12 @@ void write_spec_ctrl_current(u64 val, bool force)
                wrmsrl(MSR_IA32_SPEC_CTRL, val);
 }
 
+u64 spec_ctrl_current(void)
+{
+       return this_cpu_read(x86_spec_ctrl_current);
+}
+EXPORT_SYMBOL_GPL(spec_ctrl_current);
+
 /*
  * The vendor and possibly platform specific bits which can be modified in
  * x86_spec_ctrl_base.
diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c
index 424ef470223df4d687b2609c952147299de620ec..f5c6802aa6c3ea6f5c6aef233f61b53126ce9e90 100644
--- a/drivers/idle/intel_idle.c
+++ b/drivers/idle/intel_idle.c
 #include <linux/tick.h>
 #include <trace/events/power.h>
 #include <linux/sched.h>
+#include <linux/sched/smt.h>
 #include <linux/notifier.h>
 #include <linux/cpu.h>
 #include <linux/moduleparam.h>
 #include <asm/cpu_device_id.h>
 #include <asm/intel-family.h>
+#include <asm/nospec-branch.h>
 #include <asm/mwait.h>
 #include <asm/msr.h>
 
@@ -105,6 +107,12 @@ static unsigned int mwait_substates __initdata;
  */
 #define CPUIDLE_FLAG_ALWAYS_ENABLE     BIT(15)
 
+/*
+ * Disable IBRS across idle (when KERNEL_IBRS), is exclusive vs IRQ_ENABLE
+ * above.
+ */
+#define CPUIDLE_FLAG_IBRS              BIT(16)
+
 /*
  * MWAIT takes an 8-bit "hint" in EAX "suggesting"
  * the C-state (top nibble) and sub-state (bottom nibble)
@@ -159,6 +167,24 @@ static __cpuidle int intel_idle_irq(struct cpuidle_device *dev,
        return ret;
 }
 
+static __cpuidle int intel_idle_ibrs(struct cpuidle_device *dev,
+                                    struct cpuidle_driver *drv, int index)
+{
+       bool smt_active = sched_smt_active();
+       u64 spec_ctrl = spec_ctrl_current();
+       int ret;
+
+       if (smt_active)
+               wrmsrl(MSR_IA32_SPEC_CTRL, 0);
+
+       ret = __intel_idle(dev, drv, index);
+
+       if (smt_active)
+               wrmsrl(MSR_IA32_SPEC_CTRL, spec_ctrl);
+
+       return ret;
+}
+
 /**
  * intel_idle_s2idle - Ask the processor to enter the given idle state.
  * @dev: cpuidle device of the target CPU.
@@ -680,7 +706,7 @@ static struct cpuidle_state skl_cstates[] __initdata = {
        {
                .name = "C6",
                .desc = "MWAIT 0x20",
-               .flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED,
+               .flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED | CPUIDLE_FLAG_IBRS,
                .exit_latency = 85,
                .target_residency = 200,
                .enter = &intel_idle,
@@ -688,7 +714,7 @@ static struct cpuidle_state skl_cstates[] __initdata = {
        {
                .name = "C7s",
                .desc = "MWAIT 0x33",
-               .flags = MWAIT2flg(0x33) | CPUIDLE_FLAG_TLB_FLUSHED,
+               .flags = MWAIT2flg(0x33) | CPUIDLE_FLAG_TLB_FLUSHED | CPUIDLE_FLAG_IBRS,
                .exit_latency = 124,
                .target_residency = 800,
                .enter = &intel_idle,
@@ -696,7 +722,7 @@ static struct cpuidle_state skl_cstates[] __initdata = {
        {
                .name = "C8",
                .desc = "MWAIT 0x40",
-               .flags = MWAIT2flg(0x40) | CPUIDLE_FLAG_TLB_FLUSHED,
+               .flags = MWAIT2flg(0x40) | CPUIDLE_FLAG_TLB_FLUSHED | CPUIDLE_FLAG_IBRS,
                .exit_latency = 200,
                .target_residency = 800,
                .enter = &intel_idle,
@@ -704,7 +730,7 @@ static struct cpuidle_state skl_cstates[] __initdata = {
        {
                .name = "C9",
                .desc = "MWAIT 0x50",
-               .flags = MWAIT2flg(0x50) | CPUIDLE_FLAG_TLB_FLUSHED,
+               .flags = MWAIT2flg(0x50) | CPUIDLE_FLAG_TLB_FLUSHED | CPUIDLE_FLAG_IBRS,
                .exit_latency = 480,
                .target_residency = 5000,
                .enter = &intel_idle,
@@ -712,7 +738,7 @@ static struct cpuidle_state skl_cstates[] __initdata = {
        {
                .name = "C10",
                .desc = "MWAIT 0x60",
-               .flags = MWAIT2flg(0x60) | CPUIDLE_FLAG_TLB_FLUSHED,
+               .flags = MWAIT2flg(0x60) | CPUIDLE_FLAG_TLB_FLUSHED | CPUIDLE_FLAG_IBRS,
                .exit_latency = 890,
                .target_residency = 5000,
                .enter = &intel_idle,
@@ -741,7 +767,7 @@ static struct cpuidle_state skx_cstates[] __initdata = {
        {
                .name = "C6",
                .desc = "MWAIT 0x20",
-               .flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED,
+               .flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED | CPUIDLE_FLAG_IBRS,
                .exit_latency = 133,
                .target_residency = 600,
                .enter = &intel_idle,
@@ -1819,6 +1845,12 @@ static void __init intel_idle_init_cstates_icpu(struct cpuidle_driver *drv)
                if (cpuidle_state_table[cstate].flags & CPUIDLE_FLAG_IRQ_ENABLE)
                        drv->states[drv->state_count].enter = intel_idle_irq;
 
+               if (cpu_feature_enabled(X86_FEATURE_KERNEL_IBRS) &&
+                   cpuidle_state_table[cstate].flags & CPUIDLE_FLAG_IBRS) {
+                       WARN_ON_ONCE(cpuidle_state_table[cstate].flags & CPUIDLE_FLAG_IRQ_ENABLE);
+                       drv->states[drv->state_count].enter = intel_idle_ibrs;
+               }
+
                if ((disabled_states_mask & BIT(drv->state_count)) ||
                    ((icpu->use_acpi || force_use_acpi) &&
                     intel_idle_off_by_default(mwait_hint) &&