x86/retbleed: Add SKL return thunk
authorThomas Gleixner <tglx@linutronix.de>
Thu, 15 Sep 2022 11:11:27 +0000 (13:11 +0200)
committerPeter Zijlstra <peterz@infradead.org>
Mon, 17 Oct 2022 14:41:15 +0000 (16:41 +0200)
To address the Intel SKL RSB underflow issue in software it's required to
do call depth tracking.

Provide a return thunk for call depth tracking on Intel SKL CPUs.

The tracking does not use a counter. It uses arithmetic shift
right on call entry and logical shift left on return.

The depth tracking variable is initialized to 0x8000.... when the call
depth is zero. The arithmetic shift right sign extends the MSB and
saturates after the 12th call. The shift count is 5 so the tracking covers
12 nested calls. On return the variable is shifted left logically so it
becomes zero again.

 CALL       RET
 0: 0x8000000000000000 0x0000000000000000
 1: 0xfc00000000000000 0xf000000000000000
...
11: 0xfffffffffffffff8 0xfffffffffffffc00
12: 0xffffffffffffffff 0xffffffffffffffe0

After a return buffer fill the depth is credited 12 calls before the next
stuffing has to take place.

There is an inaccuracy for situations like this:

   10 calls
    5 returns
    3 calls
    4 returns
    3 calls
    ....

The shift count might cause this to be off by one in either direction, but
there is still a cushion vs. the RSB depth. The algorithm does not claim to
be perfect, but it should obfuscate the problem enough to make exploitation
extremely difficult.

The theory behind this is:

RSB is a stack with depth 16 which is filled on every call. On the return
path speculation "pops" entries to speculate down the call chain. Once the
speculative RSB is empty it switches to other predictors, e.g. the Branch
History Buffer, which can be mistrained by user space and misguide the
speculation path to a gadget.

Call depth tracking is designed to break this speculation path by stuffing
speculation trap calls into the RSB which are never getting a corresponding
return executed. This stalls the prediction path until it gets resteered.

The assumption is that stuffing at the 12th return is sufficient to break
the speculation before it hits the underflow and the fallback to the other
predictors. Testing confirms that it works. Johannes, one of the retbleed
researchers, tried to attack this approach but failed.

There is obviously no scientific proof that this will withstand future
research progress, but all we can do right now is to speculate about it.

The SAR/SHL usage was suggested by Andi Kleen.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/r/20220915111147.890071690@infradead.org
arch/x86/entry/entry_64.S
arch/x86/include/asm/current.h
arch/x86/include/asm/nospec-branch.h
arch/x86/kernel/asm-offsets.c
arch/x86/kvm/svm/vmenter.S
arch/x86/lib/retpoline.S

index 4cc0125fdfdc2400993680f42053d894417f4fa5..15739a2c09833ba30ac8b2eb6a9be9e93608f7ea 100644 (file)
@@ -288,6 +288,7 @@ SYM_FUNC_END(__switch_to_asm)
 SYM_CODE_START_NOALIGN(ret_from_fork)
        UNWIND_HINT_EMPTY
        ANNOTATE_NOENDBR // copy_thread
+       CALL_DEPTH_ACCOUNT
        movq    %rax, %rdi
        call    schedule_tail                   /* rdi: 'prev' task parameter */
 
@@ -332,7 +333,7 @@ SYM_CODE_START(xen_error_entry)
        UNWIND_HINT_FUNC
        PUSH_AND_CLEAR_REGS save_ret=1
        ENCODE_FRAME_POINTER 8
-       UNTRAIN_RET
+       UNTRAIN_RET_FROM_CALL
        RET
 SYM_CODE_END(xen_error_entry)
 
@@ -977,7 +978,7 @@ SYM_CODE_START(paranoid_entry)
         * CR3 above, keep the old value in a callee saved register.
         */
        IBRS_ENTER save_reg=%r15
-       UNTRAIN_RET
+       UNTRAIN_RET_FROM_CALL
 
        RET
 SYM_CODE_END(paranoid_entry)
@@ -1062,7 +1063,7 @@ SYM_CODE_START(error_entry)
        /* We have user CR3.  Change to kernel CR3. */
        SWITCH_TO_KERNEL_CR3 scratch_reg=%rax
        IBRS_ENTER
-       UNTRAIN_RET
+       UNTRAIN_RET_FROM_CALL
 
        leaq    8(%rsp), %rdi                   /* arg0 = pt_regs pointer */
        /* Put us onto the real thread stack. */
@@ -1097,6 +1098,7 @@ SYM_CODE_START(error_entry)
         */
 .Lerror_entry_done_lfence:
        FENCE_SWAPGS_KERNEL_ENTRY
+       CALL_DEPTH_ACCOUNT
        leaq    8(%rsp), %rax                   /* return pt_regs pointer */
        ANNOTATE_UNRET_END
        RET
@@ -1115,7 +1117,7 @@ SYM_CODE_START(error_entry)
        FENCE_SWAPGS_USER_ENTRY
        SWITCH_TO_KERNEL_CR3 scratch_reg=%rax
        IBRS_ENTER
-       UNTRAIN_RET
+       UNTRAIN_RET_FROM_CALL
 
        /*
         * Pretend that the exception came from user mode: set up pt_regs
index b89aba077b84b11238eb15a301ab6118a03e2645..a1168e7b69e5b747276b61ac62b755ad748e5534 100644 (file)
@@ -17,6 +17,9 @@ struct pcpu_hot {
                        struct task_struct      *current_task;
                        int                     preempt_count;
                        int                     cpu_number;
+#ifdef CONFIG_CALL_DEPTH_TRACKING
+                       u64                     call_depth;
+#endif
                        unsigned long           top_of_stack;
                        void                    *hardirq_stack_ptr;
                        u16                     softirq_pending;
index f10ca334dd752cd8d115c4e47aab4ae0b1e4d209..d4be826a22824ec083bd24ec679a6af8b41ec439 100644 (file)
 #include <asm/msr-index.h>
 #include <asm/unwind_hints.h>
 #include <asm/percpu.h>
+#include <asm/current.h>
 
-#define RETPOLINE_THUNK_SIZE   32
+/*
+ * Call depth tracking for Intel SKL CPUs to address the RSB underflow
+ * issue in software.
+ *
+ * The tracking does not use a counter. It uses arithmetic shift
+ * right on call entry and logical shift left on return.
+ *
+ * The depth tracking variable is initialized to 0x8000.... when the call
+ * depth is zero. The arithmetic shift right sign extends the MSB and
+ * saturates after the 12th call. The shift count is 5 for both directions
+ * so the tracking covers 12 nested calls.
+ *
+ *  Call
+ *  0: 0x8000000000000000      0x0000000000000000
+ *  1: 0xfc00000000000000      0xf000000000000000
+ * ...
+ * 11: 0xfffffffffffffff8      0xfffffffffffffc00
+ * 12: 0xffffffffffffffff      0xffffffffffffffe0
+ *
+ * After a return buffer fill the depth is credited 12 calls before the
+ * next stuffing has to take place.
+ *
+ * There is an inaccuracy for situations like this:
+ *
+ *  10 calls
+ *   5 returns
+ *   3 calls
+ *   4 returns
+ *   3 calls
+ *   ....
+ *
+ * The shift count might cause this to be off by one in either direction,
+ * but there is still a cushion vs. the RSB depth. The algorithm does not
+ * claim to be perfect and it can be speculated around by the CPU, but it
+ * is considered that it obfuscates the problem enough to make exploitation
+ * extremely difficult.
+ */
+/* Number of bits the tracking variable is shifted per call (SAR) and per return (SHL). */
+#define RET_DEPTH_SHIFT                        5
+/*
+ * NOTE(review): not referenced in the visible hunks; presumably the number
+ * of stuffing calls issued by the return thunk, which currently spells the
+ * same value as a literal 16 -- confirm.
+ */
+#define RSB_RET_STUFF_LOOPS            16
+/* Tracking value for call depth zero: only the MSB is set. */
+#define RET_DEPTH_INIT                 0x8000000000000000ULL
+/* RET_DEPTH_INIT after one arithmetic right shift by RET_DEPTH_SHIFT (one call accounted). */
+#define RET_DEPTH_INIT_FROM_CALL       0xfc00000000000000ULL
+/* Fully saturated tracking value (all bits set), i.e. the full call credit. */
+#define RET_DEPTH_CREDIT               0xffffffffffffffffULL
+
+#if defined(CONFIG_CALL_DEPTH_TRACKING) && !defined(COMPILE_OFFSETS)
+
+#include <asm/asm-offsets.h>
+
+/*
+ * Accessors for the per-CPU call depth tracking variable
+ * pcpu_hot.call_depth (located via the generated X86_call_depth offset).
+ *
+ * Every macro defined here MUST have an empty counterpart in the #else
+ * branch below, because users such as the return buffer fill sequence
+ * reference them without a CONFIG_CALL_DEPTH_TRACKING guard.
+ */
+
+/* Credit the full depth: store all ones (RET_DEPTH_CREDIT). */
+#define CREDIT_CALL_DEPTH                                      \
+       movq    $-1, PER_CPU_VAR(pcpu_hot + X86_call_depth);
+
+/* Same store as CREDIT_CALL_DEPTH, under the ASM_ prefixed name. */
+#define ASM_CREDIT_CALL_DEPTH                                  \
+       movq    $-1, PER_CPU_VAR(pcpu_hot + X86_call_depth);
+
+/*
+ * Reset to depth zero: 0x80 << 56 == 0x8000000000000000 (RET_DEPTH_INIT).
+ * Uses %rax as scratch register and modifies the flags.
+ */
+#define RESET_CALL_DEPTH                                       \
+       mov     $0x80, %rax;                                    \
+       shl     $56, %rax;                                      \
+       movq    %rax, PER_CPU_VAR(pcpu_hot + X86_call_depth);
+
+/*
+ * Reset to depth one: 0xfc << 56 == 0xfc00000000000000
+ * (RET_DEPTH_INIT_FROM_CALL), i.e. the reset value with one call already
+ * accounted. Uses %rax as scratch register and modifies the flags.
+ */
+#define RESET_CALL_DEPTH_FROM_CALL                             \
+       mov     $0xfc, %rax;                                    \
+       shl     $56, %rax;                                      \
+       movq    %rax, PER_CPU_VAR(pcpu_hot + X86_call_depth);
+
+/*
+ * Account one call: arithmetic shift right by 5 (RET_DEPTH_SHIFT), which
+ * sign extends the MSB. This variant spells out the %gs segment prefix
+ * instead of using PER_CPU_VAR().
+ */
+#define INCREMENT_CALL_DEPTH                                   \
+       sarq    $5, %gs:pcpu_hot + X86_call_depth;
+
+/* Account one call, addressing the variable through PER_CPU_VAR(). */
+#define ASM_INCREMENT_CALL_DEPTH                               \
+       sarq    $5, PER_CPU_VAR(pcpu_hot + X86_call_depth);
+
+#else
+/*
+ * Call depth tracking disabled (or building asm-offsets): all accessors
+ * expand to nothing. The ASM_ variants must be stubbed as well, otherwise
+ * the bare macro name is handed to the assembler wherever it is used
+ * unconditionally.
+ */
+#define CREDIT_CALL_DEPTH
+#define ASM_CREDIT_CALL_DEPTH
+#define RESET_CALL_DEPTH
+#define INCREMENT_CALL_DEPTH
+#define ASM_INCREMENT_CALL_DEPTH
+#define RESET_CALL_DEPTH_FROM_CALL
+#endif
 
 /*
  * Fill the CPU return stack buffer.
  * from C via asm(".include <asm/nospec-branch.h>") but let's not go there.
  */
 
+#define RETPOLINE_THUNK_SIZE   32
 #define RSB_CLEAR_LOOPS                32      /* To forcibly overwrite all entries */
 
 /*
        dec     reg;                                    \
        jnz     771b;                                   \
        /* barrier for jnz misprediction */             \
-       lfence;
+       lfence;                                         \
+       ASM_CREDIT_CALL_DEPTH
 #else
 /*
  * i386 doesn't unconditionally have LFENCE, as such it can't
  * where we have a stack but before any RET instruction.
  */
 .macro UNTRAIN_RET
-#if defined(CONFIG_CPU_UNRET_ENTRY) || defined(CONFIG_CPU_IBPB_ENTRY)
+#if defined(CONFIG_CPU_UNRET_ENTRY) || defined(CONFIG_CPU_IBPB_ENTRY) || \
+       defined(CONFIG_X86_FEATURE_CALL_DEPTH)
        ANNOTATE_UNRET_END
-       ALTERNATIVE_2 "",                                               \
-                     CALL_ZEN_UNTRAIN_RET, X86_FEATURE_UNRET,          \
-                     "call entry_ibpb", X86_FEATURE_ENTRY_IBPB
+       ALTERNATIVE_3 "",                                               \
+                     CALL_ZEN_UNTRAIN_RET, X86_FEATURE_UNRET,          \
+                     "call entry_ibpb", X86_FEATURE_ENTRY_IBPB,        \
+                     __stringify(RESET_CALL_DEPTH), X86_FEATURE_CALL_DEPTH
+#endif
+.endm
+
+.macro UNTRAIN_RET_FROM_CALL
+       /*
+        * Variant of UNTRAIN_RET for code that was reached via CALL and
+        * ends in RET: the call depth is reset with
+        * RESET_CALL_DEPTH_FROM_CALL, i.e. to the value which already has
+        * one call accounted. That macro uses %rax as scratch register
+        * when CONFIG_CALL_DEPTH_TRACKING is enabled. The guard has to
+        * test CONFIG_CALL_DEPTH_TRACKING, the symbol used by every other
+        * call depth tracking conditional.
+        */
+#if defined(CONFIG_CPU_UNRET_ENTRY) || defined(CONFIG_CPU_IBPB_ENTRY) || \
+       defined(CONFIG_CALL_DEPTH_TRACKING)
+       ANNOTATE_UNRET_END
+       ALTERNATIVE_3 "",                                               \
+                     CALL_ZEN_UNTRAIN_RET, X86_FEATURE_UNRET,          \
+                     "call entry_ibpb", X86_FEATURE_ENTRY_IBPB,        \
+                     __stringify(RESET_CALL_DEPTH_FROM_CALL), X86_FEATURE_CALL_DEPTH
+#endif
+.endm
+
+
+/*
+ * Account one call in the call depth tracking variable. Expands to
+ * nothing unless CONFIG_CALL_DEPTH_TRACKING is set; otherwise the
+ * ASM_INCREMENT_CALL_DEPTH sequence is patched in as an alternative
+ * keyed on X86_FEATURE_CALL_DEPTH.
+ */
+.macro CALL_DEPTH_ACCOUNT
+#ifdef CONFIG_CALL_DEPTH_TRACKING
+       ALTERNATIVE "",                                                 \
+                   __stringify(ASM_INCREMENT_CALL_DEPTH), X86_FEATURE_CALL_DEPTH
 #endif
 .endm
 
@@ -214,6 +312,17 @@ extern void (*x86_return_thunk)(void);
 #define x86_return_thunk       (&__x86_return_thunk)
 #endif
 
+#ifdef CONFIG_CALL_DEPTH_TRACKING
+/* Return thunk with call depth tracking; implemented in assembly. */
+extern void __x86_return_skl(void);
+
+/* Point x86_return_thunk at the call depth tracking return thunk. */
+static inline void x86_set_skl_return_thunk(void)
+{
+       x86_return_thunk = &__x86_return_skl;
+}
+#else
+/* Call depth tracking not built in: nothing to switch to. */
+static inline void x86_set_skl_return_thunk(void) {}
+#endif
+
 #ifdef CONFIG_RETPOLINE
 
 #define GEN(reg) \
index a9824318e1c556d226ba6070beb489b2d4c0fad2..13afdbbee349e9a77b9edca561e46797255501bc 100644 (file)
@@ -110,6 +110,9 @@ static void __used common(void)
        OFFSET(TSS_sp2, tss_struct, x86_tss.sp2);
 
        OFFSET(X86_top_of_stack, pcpu_hot, top_of_stack);
+#ifdef CONFIG_CALL_DEPTH_TRACKING
+       OFFSET(X86_call_depth, pcpu_hot, call_depth);
+#endif
 
        if (IS_ENABLED(CONFIG_KVM_INTEL)) {
                BLANK();
index 723f8534986c31b505a2e5d314c347f720fbecd9..09eacf19d718595391a1f1cda41145006d79b046 100644 (file)
@@ -1,6 +1,7 @@
 /* SPDX-License-Identifier: GPL-2.0 */
 #include <linux/linkage.h>
 #include <asm/asm.h>
+#include <asm/asm-offsets.h>
 #include <asm/bitsperlong.h>
 #include <asm/kvm_vcpu_regs.h>
 #include <asm/nospec-branch.h>
index 073289a55f8495cb3f175d97d847485b05eb7a8b..1e79eccc1d6986f4acdb5f86619d4d227cb9cc45 100644 (file)
@@ -5,9 +5,11 @@
 #include <asm/dwarf2.h>
 #include <asm/cpufeatures.h>
 #include <asm/alternative.h>
+#include <asm/asm-offsets.h>
 #include <asm/export.h>
 #include <asm/nospec-branch.h>
 #include <asm/unwind_hints.h>
+#include <asm/percpu.h>
 #include <asm/frame.h>
 
        .section .text.__x86.indirect_thunk
@@ -140,3 +142,32 @@ __EXPORT_THUNK(zen_untrain_ret)
 EXPORT_SYMBOL(__x86_return_thunk)
 
 #endif /* CONFIG_RETHUNK */
+
+#ifdef CONFIG_CALL_DEPTH_TRACKING
+
+/*
+ * __x86_return_skl - return thunk with call depth tracking
+ *
+ * Shifts the per-CPU call depth variable left by 5 on every return. As
+ * long as the result is non-zero a plain RET is executed. Once the
+ * variable becomes zero, 16 calls which never get a matching return are
+ * issued, their return addresses are dropped from the stack, the call
+ * depth is credited again and then the RET is executed.
+ *
+ * Modifies the flags; no general purpose register besides %rsp is
+ * written, and %rsp is restored before the final RET.
+ */
+       .align 64
+SYM_FUNC_START(__x86_return_skl)
+       ANNOTATE_NOENDBR
+       /* Keep the hotpath in a 16byte I-fetch */
+       shlq    $5, PER_CPU_VAR(pcpu_hot + X86_call_depth)
+       jz      1f              /* tracking variable ran empty: stuff first */
+       ANNOTATE_UNRET_SAFE
+       ret
+       int3
+1:
+       /*
+        * Each CALL targets the label right behind its own INT3, so it
+        * only pushes a return address; the INT3 is never reached by
+        * regular execution.
+        */
+       .rept   16
+       ANNOTATE_INTRA_FUNCTION_CALL
+       call    2f
+       int3
+2:
+       .endr
+       add     $(8*16), %rsp   /* drop the 16 pushed 8-byte return addresses */
+
+       CREDIT_CALL_DEPTH       /* set the tracking variable to all ones */
+
+       ANNOTATE_UNRET_SAFE
+       ret
+       int3
+SYM_FUNC_END(__x86_return_skl)
+
+#endif /* CONFIG_CALL_DEPTH_TRACKING */