Merge branch 'x86-pti-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git...
author     Linus Torvalds <torvalds@linux-foundation.org>
Tue, 23 Oct 2018 17:43:04 +0000 (18:43 +0100)
committer  Linus Torvalds <torvalds@linux-foundation.org>
Tue, 23 Oct 2018 17:43:04 +0000 (18:43 +0100)
Pull x86 pti updates from Ingo Molnar:
 "The main changes:

   - Make the IBPB barrier more strict and add STIBP support (Jiri
     Kosina)

   - Micro-optimize and clean up the entry code (Andy Lutomirski)

   - ... plus misc other fixes"

* 'x86-pti-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  x86/speculation: Propagate information about RSB filling mitigation to sysfs
  x86/speculation: Enable cross-hyperthread spectre v2 STIBP mitigation
  x86/speculation: Apply IBPB more strictly to avoid cross-process data leak
  x86/speculation: Add RETPOLINE_AMD support to the inline asm CALL_NOSPEC variant
  x86/CPU: Fix unused variable warning when !CONFIG_IA32_EMULATION
  x86/pti/64: Remove the SYSCALL64 entry trampoline
  x86/entry/64: Use the TSS sp2 slot for SYSCALL/SYSRET scratch space
  x86/entry/64: Document idtentry
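
The sysfs-reporting change listed above ("Propagate information about RSB filling mitigation to sysfs") extends the mitigation string exposed under /sys/devices/system/cpu/vulnerabilities/spectre_v2, which after this series can additionally report ", STIBP" and ", RSB filling".  A minimal userspace sketch for reading it (not part of the series; the path is the standard vulnerabilities interface):

    #include <stdio.h>

    int main(void)
    {
            char line[256];
            FILE *f = fopen("/sys/devices/system/cpu/vulnerabilities/spectre_v2", "r");

            if (!f) {
                    perror("spectre_v2");
                    return 1;
            }
            /* Typical output after this series, depending on CPU features:
             * "Mitigation: Full generic retpoline, IBPB, IBRS_FW, STIBP, RSB filling"
             */
            if (fgets(line, sizeof(line), f))
                    fputs(line, stdout);
            fclose(f);
            return 0;
    }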

arch/x86/entry/entry_64.S
arch/x86/include/asm/processor.h
arch/x86/kernel/asm-offsets.c
arch/x86/kernel/cpu/bugs.c
arch/x86/kernel/cpu/common.c
arch/x86/kernel/kprobes/core.c
arch/x86/kernel/process_64.c
arch/x86/kernel/traps.c
arch/x86/kernel/vmlinux.lds.S
arch/x86/mm/tlb.c
kernel/cpu.c

diff --combined arch/x86/entry/entry_64.S
index 7c5ce0a6c4d2623edccda5f3586a591da3abc12e,0d728142467fda9431f6c83afadc770d2dbed584..4d7a2d9d44cfec5928b902cef1bca9bca29093a6
@@@ -142,67 -142,6 +142,6 @@@ END(native_usergs_sysret64
   * with them due to bugs in both AMD and Intel CPUs.
   */
  
-       .pushsection .entry_trampoline, "ax"
- /*
-  * The code in here gets remapped into cpu_entry_area's trampoline.  This means
-  * that the assembler and linker have the wrong idea as to where this code
-  * lives (and, in fact, it's mapped more than once, so it's not even at a
-  * fixed address).  So we can't reference any symbols outside the entry
-  * trampoline and expect it to work.
-  *
-  * Instead, we carefully abuse %rip-relative addressing.
-  * _entry_trampoline(%rip) refers to the start of the remapped) entry
-  * trampoline.  We can thus find cpu_entry_area with this macro:
-  */
- #define CPU_ENTRY_AREA \
-       _entry_trampoline - CPU_ENTRY_AREA_entry_trampoline(%rip)
- /* The top word of the SYSENTER stack is hot and is usable as scratch space. */
- #define RSP_SCRATCH   CPU_ENTRY_AREA_entry_stack + \
-                       SIZEOF_entry_stack - 8 + CPU_ENTRY_AREA
- ENTRY(entry_SYSCALL_64_trampoline)
-       UNWIND_HINT_EMPTY
-       swapgs
-       /* Stash the user RSP. */
-       movq    %rsp, RSP_SCRATCH
-       /* Note: using %rsp as a scratch reg. */
-       SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp
-       /* Load the top of the task stack into RSP */
-       movq    CPU_ENTRY_AREA_tss + TSS_sp1 + CPU_ENTRY_AREA, %rsp
-       /* Start building the simulated IRET frame. */
-       pushq   $__USER_DS                      /* pt_regs->ss */
-       pushq   RSP_SCRATCH                     /* pt_regs->sp */
-       pushq   %r11                            /* pt_regs->flags */
-       pushq   $__USER_CS                      /* pt_regs->cs */
-       pushq   %rcx                            /* pt_regs->ip */
-       /*
-        * x86 lacks a near absolute jump, and we can't jump to the real
-        * entry text with a relative jump.  We could push the target
-        * address and then use retq, but this destroys the pipeline on
-        * many CPUs (wasting over 20 cycles on Sandy Bridge).  Instead,
-        * spill RDI and restore it in a second-stage trampoline.
-        */
-       pushq   %rdi
-       movq    $entry_SYSCALL_64_stage2, %rdi
-       JMP_NOSPEC %rdi
- END(entry_SYSCALL_64_trampoline)
-       .popsection
- ENTRY(entry_SYSCALL_64_stage2)
-       UNWIND_HINT_EMPTY
-       popq    %rdi
-       jmp     entry_SYSCALL_64_after_hwframe
- END(entry_SYSCALL_64_stage2)
  ENTRY(entry_SYSCALL_64)
        UNWIND_HINT_EMPTY
        /*
         */
  
        swapgs
-       /*
-        * This path is only taken when PAGE_TABLE_ISOLATION is disabled so it
-        * is not required to switch CR3.
-        */
-       movq    %rsp, PER_CPU_VAR(rsp_scratch)
+       /* tss.sp2 is scratch space. */
+       movq    %rsp, PER_CPU_VAR(cpu_tss_rw + TSS_sp2)
+       SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp
        movq    PER_CPU_VAR(cpu_current_top_of_stack), %rsp
  
        /* Construct struct pt_regs on stack */
-       pushq   $__USER_DS                      /* pt_regs->ss */
-       pushq   PER_CPU_VAR(rsp_scratch)        /* pt_regs->sp */
-       pushq   %r11                            /* pt_regs->flags */
-       pushq   $__USER_CS                      /* pt_regs->cs */
-       pushq   %rcx                            /* pt_regs->ip */
+       pushq   $__USER_DS                              /* pt_regs->ss */
+       pushq   PER_CPU_VAR(cpu_tss_rw + TSS_sp2)       /* pt_regs->sp */
+       pushq   %r11                                    /* pt_regs->flags */
+       pushq   $__USER_CS                              /* pt_regs->cs */
+       pushq   %rcx                                    /* pt_regs->ip */
  GLOBAL(entry_SYSCALL_64_after_hwframe)
-       pushq   %rax                            /* pt_regs->orig_ax */
+       pushq   %rax                                    /* pt_regs->orig_ax */
  
        PUSH_AND_CLEAR_REGS rax=$-ENOSYS
  
@@@ -900,6 -837,42 +837,42 @@@ apicinterrupt IRQ_WORK_VECTOR                    irq_wor
   */
  #define CPU_TSS_IST(x) PER_CPU_VAR(cpu_tss_rw) + (TSS_ist + ((x) - 1) * 8)
  
+ /**
+  * idtentry - Generate an IDT entry stub
+  * @sym:              Name of the generated entry point
+  * @do_sym:           C function to be called
+  * @has_error_code:   True if this IDT vector has an error code on the stack
+  * @paranoid:                 non-zero means that this vector may be invoked from
+  *                    kernel mode with user GSBASE and/or user CR3.
+  *                    2 is special -- see below.
+  * @shift_ist:                Set to an IST index if entries from kernel mode should
+  *                            decrement the IST stack so that nested entries get a
+  *                    fresh stack.  (This is for #DB, which has a nasty habit
+  *                            of recursing.)
+  *
+  * idtentry generates an IDT stub that sets up a usable kernel context,
+  * creates struct pt_regs, and calls @do_sym.  The stub has the following
+  * special behaviors:
+  *
+  * On an entry from user mode, the stub switches from the trampoline or
+  * IST stack to the normal thread stack.  On an exit to user mode, the
+  * normal exit-to-usermode path is invoked.
+  *
+  * On an exit to kernel mode, if @paranoid == 0, we check for preemption,
+  * whereas we omit the preemption check if @paranoid != 0.  This is purely
+  * because the implementation is simpler this way.  The kernel only needs
+  * to check for asynchronous kernel preemption when IRQ handlers return.
+  *
+  * If @paranoid == 0, then the stub will handle IRET faults by pretending
+  * that the fault came from user mode.  It will handle gs_change faults by
+  * pretending that the fault happened with kernel GSBASE.  Since this handling
+  * is omitted for @paranoid != 0, the #GP, #SS, and #NP stubs must have
+  * @paranoid == 0.  This special handling will do the wrong thing for
+  * espfix-induced #DF on IRET, so #DF must not use @paranoid == 0.
+  *
+  * @paranoid == 2 is special: the stub will never switch stacks.  This is for
+  * #DF: if the thread stack is somehow unusable, we'll still get a useful OOPS.
+  */
  .macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1
  ENTRY(\sym)
        UNWIND_HINT_IRET_REGS offset=\has_error_code*8
@@@ -1050,7 -1023,7 +1023,7 @@@ ENTRY(do_softirq_own_stack
        ret
  ENDPROC(do_softirq_own_stack)
  
 -#ifdef CONFIG_XEN
 +#ifdef CONFIG_XEN_PV
  idtentry hypervisor_callback xen_do_hypervisor_callback has_error_code=0
  
  /*
@@@ -1130,13 -1103,11 +1103,13 @@@ ENTRY(xen_failsafe_callback
        ENCODE_FRAME_POINTER
        jmp     error_exit
  END(xen_failsafe_callback)
 +#endif /* CONFIG_XEN_PV */
  
 +#ifdef CONFIG_XEN_PVHVM
  apicinterrupt3 HYPERVISOR_CALLBACK_VECTOR \
        xen_hvm_callback_vector xen_evtchn_do_upcall
 +#endif
  
 -#endif /* CONFIG_XEN */
  
  #if IS_ENABLED(CONFIG_HYPERV)
  apicinterrupt3 HYPERVISOR_CALLBACK_VECTOR \
@@@ -1153,7 -1124,7 +1126,7 @@@ idtentry debug                  do_debug                has_error_co
  idtentry int3                 do_int3                 has_error_code=0
  idtentry stack_segment                do_stack_segment        has_error_code=1
  
 -#ifdef CONFIG_XEN
 +#ifdef CONFIG_XEN_PV
  idtentry xennmi                       do_nmi                  has_error_code=0
  idtentry xendebug             do_debug                has_error_code=0
  idtentry xenint3              do_int3                 has_error_code=0
@@@ -1189,16 -1160,6 +1162,16 @@@ ENTRY(paranoid_entry
        xorl    %ebx, %ebx
  
  1:
 +      /*
 +       * Always stash CR3 in %r14.  This value will be restored,
 +       * verbatim, at exit.  Needed if paranoid_entry interrupted
 +       * another entry that already switched to the user CR3 value
 +       * but has not yet returned to userspace.
 +       *
 +       * This is also why CS (stashed in the "iret frame" by the
 +       * hardware at entry) can not be used: this may be a return
 +       * to kernel code, but with a user CR3 value.
 +       */
        SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg=%rax save_reg=%r14
  
        ret
@@@ -1223,13 -1184,11 +1196,13 @@@ ENTRY(paranoid_exit
        testl   %ebx, %ebx                      /* swapgs needed? */
        jnz     .Lparanoid_exit_no_swapgs
        TRACE_IRQS_IRETQ
 +      /* Always restore stashed CR3 value (see paranoid_entry) */
        RESTORE_CR3     scratch_reg=%rbx save_reg=%r14
        SWAPGS_UNSAFE_STACK
        jmp     .Lparanoid_exit_restore
  .Lparanoid_exit_no_swapgs:
        TRACE_IRQS_IRETQ_DEBUG
 +      /* Always restore stashed CR3 value (see paranoid_entry) */
        RESTORE_CR3     scratch_reg=%rbx save_reg=%r14
  .Lparanoid_exit_restore:
        jmp restore_regs_and_return_to_kernel
@@@ -1640,7 -1599,6 +1613,7 @@@ end_repeat_nmi
        movq    $-1, %rsi
        call    do_nmi
  
 +      /* Always restore stashed CR3 value (see paranoid_entry) */
        RESTORE_CR3 scratch_reg=%r15 save_reg=%r14
  
        testl   %ebx, %ebx                      /* swapgs needed? */
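
For orientation: with the trampoline gone, the SYSCALL path above stashes the user RSP in the unused tss.sp2 slot and then hand-builds the hardware-style IRET frame plus orig_ax at the top of struct pt_regs.  A simplified stand-in for that frame layout, for illustration only (the real definition is struct pt_regs in arch/x86/include/asm/ptrace.h):

    /* Memory order, lowest address first; the last pushq lands lowest. */
    struct syscall_hw_frame {             /* illustrative name, not kernel code */
            unsigned long orig_ax;        /* pushq %rax: syscall number            */
            unsigned long ip;             /* pushq %rcx: user RIP saved by SYSCALL */
            unsigned long cs;             /* pushq $__USER_CS                      */
            unsigned long flags;          /* pushq %r11: user RFLAGS               */
            unsigned long sp;             /* pushq cpu_tss_rw + TSS_sp2: user RSP  */
            unsigned long ss;             /* pushq $__USER_DS                      */
    };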
diff --combined arch/x86/include/asm/processor.h
index c7a4e2a174b9cd15422296bc72efcb93560555eb,b2bb1d691efc54139d9635b42a11df53de4b035d..617805981cce159b111ea0afb6b1222b3cf50f60
@@@ -155,8 -155,7 +155,8 @@@ enum cpuid_regs_idx 
  #define X86_VENDOR_CENTAUR    5
  #define X86_VENDOR_TRANSMETA  7
  #define X86_VENDOR_NSC                8
 -#define X86_VENDOR_NUM                9
 +#define X86_VENDOR_HYGON      9
 +#define X86_VENDOR_NUM                10
  
  #define X86_VENDOR_UNKNOWN    0xff
  
@@@ -316,7 -315,13 +316,13 @@@ struct x86_hw_tss 
         */
        u64                     sp1;
  
+       /*
+        * Since Linux does not use ring 2, the 'sp2' slot is unused by
+        * hardware.  entry_SYSCALL_64 uses it as scratch space to stash
+        * the user RSP value.
+        */
        u64                     sp2;
        u64                     reserved2;
        u64                     ist[7];
        u32                     reserved3;
@@@ -579,7 -584,7 +585,7 @@@ static inline bool on_thread_stack(void
                               current_stack_pointer) < THREAD_SIZE;
  }
  
 -#ifdef CONFIG_PARAVIRT
 +#ifdef CONFIG_PARAVIRT_XXL
  #include <asm/paravirt.h>
  #else
  #define __cpuid                       native_cpuid
@@@ -590,7 -595,7 +596,7 @@@ static inline void load_sp0(unsigned lo
  }
  
  #define set_iopl_mask native_set_iopl_mask
 -#endif /* CONFIG_PARAVIRT */
 +#endif /* CONFIG_PARAVIRT_XXL */
  
  /* Free all resources held by a thread. */
  extern void release_thread(struct task_struct *);
diff --combined arch/x86/kernel/asm-offsets.c
index fc02c3cf238f41891b0fd2d0d1aa8deaad016d62,083c01309027e37e061cbee024b030c4ca554874..72adf6c335dca2e2db24c2560919d4807692d936
@@@ -64,12 -64,15 +64,12 @@@ void common(void) 
        OFFSET(IA32_RT_SIGFRAME_sigcontext, rt_sigframe_ia32, uc.uc_mcontext);
  #endif
  
 -#ifdef CONFIG_PARAVIRT
 +#ifdef CONFIG_PARAVIRT_XXL
        BLANK();
 -      OFFSET(PARAVIRT_PATCH_pv_cpu_ops, paravirt_patch_template, pv_cpu_ops);
 -      OFFSET(PARAVIRT_PATCH_pv_irq_ops, paravirt_patch_template, pv_irq_ops);
 -      OFFSET(PV_IRQ_irq_disable, pv_irq_ops, irq_disable);
 -      OFFSET(PV_IRQ_irq_enable, pv_irq_ops, irq_enable);
 -      OFFSET(PV_CPU_iret, pv_cpu_ops, iret);
 -      OFFSET(PV_CPU_read_cr0, pv_cpu_ops, read_cr0);
 -      OFFSET(PV_MMU_read_cr2, pv_mmu_ops, read_cr2);
 +      OFFSET(PV_IRQ_irq_disable, paravirt_patch_template, irq.irq_disable);
 +      OFFSET(PV_IRQ_irq_enable, paravirt_patch_template, irq.irq_enable);
 +      OFFSET(PV_CPU_iret, paravirt_patch_template, cpu.iret);
 +      OFFSET(PV_MMU_read_cr2, paravirt_patch_template, mmu.read_cr2);
  #endif
  
  #ifdef CONFIG_XEN
        OFFSET(TLB_STATE_user_pcid_flush_mask, tlb_state, user_pcid_flush_mask);
  
        /* Layout info for cpu_entry_area */
-       OFFSET(CPU_ENTRY_AREA_tss, cpu_entry_area, tss);
-       OFFSET(CPU_ENTRY_AREA_entry_trampoline, cpu_entry_area, entry_trampoline);
        OFFSET(CPU_ENTRY_AREA_entry_stack, cpu_entry_area, entry_stack_page);
        DEFINE(SIZEOF_entry_stack, sizeof(struct entry_stack));
        DEFINE(MASK_entry_stack, (~(sizeof(struct entry_stack) - 1)));
  
-       /* Offset for sp0 and sp1 into the tss_struct */
+       /* Offset for fields in tss_struct */
        OFFSET(TSS_sp0, tss_struct, x86_tss.sp0);
        OFFSET(TSS_sp1, tss_struct, x86_tss.sp1);
+       OFFSET(TSS_sp2, tss_struct, x86_tss.sp2);
  }
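
The TSS_sp2 constant used by the entry code is produced by this asm-offsets machinery: OFFSET() ultimately emits an offsetof() value into asm-offsets.h so the assembler can address the field as cpu_tss_rw + TSS_sp2.  A minimal sketch of the idea, using stand-in structs rather than the real tss_struct (which needs kernel headers); names and layout here are illustrative only:

    #include <stdio.h>
    #include <stddef.h>

    /* Stand-in for struct x86_hw_tss / tss_struct, illustration only. */
    struct hw_tss {
            unsigned int  reserved1;
            unsigned long sp0, sp1, sp2;    /* sp2: SYSCALL scratch slot */
    };

    struct tss {
            struct hw_tss x86_tss;
            /* ... I/O bitmap etc. in the real struct ... */
    };

    int main(void)
    {
            /* Equivalent of OFFSET(TSS_sp2, tss_struct, x86_tss.sp2). */
            printf("#define TSS_sp2 %zu\n", offsetof(struct tss, x86_tss.sp2));
            return 0;
    }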
diff --combined arch/x86/kernel/cpu/bugs.c
index b810cc23937515430c8b24f51a4831051039e383,fe32103fcdc7ea4ca97847b37c84f1c67be7ec31..c37e66e493bff6775fae88d00c1f991d97f8f908
@@@ -35,12 -35,10 +35,10 @@@ static void __init spectre_v2_select_mi
  static void __init ssb_select_mitigation(void);
  static void __init l1tf_select_mitigation(void);
  
- /*
-  * Our boot-time value of the SPEC_CTRL MSR. We read it once so that any
-  * writes to SPEC_CTRL contain whatever reserved bits have been set.
-  */
- u64 __ro_after_init x86_spec_ctrl_base;
+ /* The base value of the SPEC_CTRL MSR that always has to be preserved. */
+ u64 x86_spec_ctrl_base;
  EXPORT_SYMBOL_GPL(x86_spec_ctrl_base);
+ static DEFINE_MUTEX(spec_ctrl_mutex);
  
  /*
   * The vendor and possibly platform specific bits which can be modified in
@@@ -312,7 -310,6 +310,7 @@@ static enum spectre_v2_mitigation_cmd _
        }
  
        if (cmd == SPECTRE_V2_CMD_RETPOLINE_AMD &&
 +          boot_cpu_data.x86_vendor != X86_VENDOR_HYGON &&
            boot_cpu_data.x86_vendor != X86_VENDOR_AMD) {
                pr_err("retpoline,amd selected but CPU is not AMD. Switching to AUTO select\n");
                return SPECTRE_V2_CMD_AUTO;
        return cmd;
  }
  
+ static bool stibp_needed(void)
+ {
+       if (spectre_v2_enabled == SPECTRE_V2_NONE)
+               return false;
+       if (!boot_cpu_has(X86_FEATURE_STIBP))
+               return false;
+       return true;
+ }
+ static void update_stibp_msr(void *info)
+ {
+       wrmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base);
+ }
+ void arch_smt_update(void)
+ {
+       u64 mask;
+       if (!stibp_needed())
+               return;
+       mutex_lock(&spec_ctrl_mutex);
+       mask = x86_spec_ctrl_base;
+       if (cpu_smt_control == CPU_SMT_ENABLED)
+               mask |= SPEC_CTRL_STIBP;
+       else
+               mask &= ~SPEC_CTRL_STIBP;
+       if (mask != x86_spec_ctrl_base) {
+               pr_info("Spectre v2 cross-process SMT mitigation: %s STIBP\n",
+                               cpu_smt_control == CPU_SMT_ENABLED ?
+                               "Enabling" : "Disabling");
+               x86_spec_ctrl_base = mask;
+               on_each_cpu(update_stibp_msr, NULL, 1);
+       }
+       mutex_unlock(&spec_ctrl_mutex);
+ }
  static void __init spectre_v2_select_mitigation(void)
  {
        enum spectre_v2_mitigation_cmd cmd = spectre_v2_parse_cmdline();
        return;
  
  retpoline_auto:
 -      if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) {
 +      if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD ||
 +          boot_cpu_data.x86_vendor == X86_VENDOR_HYGON) {
        retpoline_amd:
                if (!boot_cpu_has(X86_FEATURE_LFENCE_RDTSC)) {
                        pr_err("Spectre mitigation: LFENCE not serializing, switching to generic retpoline\n");
@@@ -426,6 -462,9 +464,9 @@@ specv2_set_mode
                setup_force_cpu_cap(X86_FEATURE_USE_IBRS_FW);
                pr_info("Enabling Restricted Speculation for firmware calls\n");
        }
+       /* Enable STIBP if appropriate */
+       arch_smt_update();
  }
  
  #undef pr_fmt
@@@ -816,6 -855,8 +857,8 @@@ static ssize_t l1tf_show_state(char *bu
  static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr,
                               char *buf, unsigned int bug)
  {
+       int ret;
        if (!boot_cpu_has_bug(bug))
                return sprintf(buf, "Not affected\n");
  
                return sprintf(buf, "Mitigation: __user pointer sanitization\n");
  
        case X86_BUG_SPECTRE_V2:
-               return sprintf(buf, "%s%s%s%s\n", spectre_v2_strings[spectre_v2_enabled],
+               ret = sprintf(buf, "%s%s%s%s%s%s\n", spectre_v2_strings[spectre_v2_enabled],
                               boot_cpu_has(X86_FEATURE_USE_IBPB) ? ", IBPB" : "",
                               boot_cpu_has(X86_FEATURE_USE_IBRS_FW) ? ", IBRS_FW" : "",
+                              (x86_spec_ctrl_base & SPEC_CTRL_STIBP) ? ", STIBP" : "",
+                              boot_cpu_has(X86_FEATURE_RSB_CTXSW) ? ", RSB filling" : "",
                               spectre_v2_module_string());
+               return ret;
  
        case X86_BUG_SPEC_STORE_BYPASS:
                return sprintf(buf, "%s\n", ssb_strings[ssb_mode]);
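
The STIBP toggling done by arch_smt_update() above ends up as bit 1 of the IA32_SPEC_CTRL MSR on every CPU.  A rough way to observe it from userspace via the msr driver (assumes MSR 0x48 with STIBP in bit 1, the msr module loaded, and root; not part of this series):

    #include <stdio.h>
    #include <stdint.h>
    #include <fcntl.h>
    #include <unistd.h>

    #define MSR_IA32_SPEC_CTRL      0x48
    #define SPEC_CTRL_STIBP         (1ULL << 1)

    int main(void)
    {
            uint64_t val;
            int fd = open("/dev/cpu/0/msr", O_RDONLY);

            if (fd < 0 || pread(fd, &val, sizeof(val), MSR_IA32_SPEC_CTRL) != sizeof(val)) {
                    perror("msr");      /* no msr module, no permission, or no SPEC_CTRL */
                    return 1;
            }
            printf("SPEC_CTRL=0x%llx STIBP %s\n", (unsigned long long)val,
                   (val & SPEC_CTRL_STIBP) ? "set" : "clear");
            close(fd);
            return 0;
    }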
diff --combined arch/x86/kernel/cpu/common.c
index 9315a16606682fa221c88ecb6d5753655dd63ad5,8bffeae9bac25ac520660eaea492196c34c1b29e..660d0b22e962e83b1026743d37f1194d95ed46e0
@@@ -949,11 -949,11 +949,11 @@@ static void identify_cpu_without_cpuid(
  }
  
  static const __initconst struct x86_cpu_id cpu_no_speculation[] = {
 -      { X86_VENDOR_INTEL,     6, INTEL_FAM6_ATOM_CEDARVIEW,   X86_FEATURE_ANY },
 -      { X86_VENDOR_INTEL,     6, INTEL_FAM6_ATOM_CLOVERVIEW,  X86_FEATURE_ANY },
 -      { X86_VENDOR_INTEL,     6, INTEL_FAM6_ATOM_LINCROFT,    X86_FEATURE_ANY },
 -      { X86_VENDOR_INTEL,     6, INTEL_FAM6_ATOM_PENWELL,     X86_FEATURE_ANY },
 -      { X86_VENDOR_INTEL,     6, INTEL_FAM6_ATOM_PINEVIEW,    X86_FEATURE_ANY },
 +      { X86_VENDOR_INTEL,     6, INTEL_FAM6_ATOM_SALTWELL,    X86_FEATURE_ANY },
 +      { X86_VENDOR_INTEL,     6, INTEL_FAM6_ATOM_SALTWELL_TABLET,     X86_FEATURE_ANY },
 +      { X86_VENDOR_INTEL,     6, INTEL_FAM6_ATOM_BONNELL_MID, X86_FEATURE_ANY },
 +      { X86_VENDOR_INTEL,     6, INTEL_FAM6_ATOM_SALTWELL_MID,        X86_FEATURE_ANY },
 +      { X86_VENDOR_INTEL,     6, INTEL_FAM6_ATOM_BONNELL,     X86_FEATURE_ANY },
        { X86_VENDOR_CENTAUR,   5 },
        { X86_VENDOR_INTEL,     5 },
        { X86_VENDOR_NSC,       5 },
  
  static const __initconst struct x86_cpu_id cpu_no_meltdown[] = {
        { X86_VENDOR_AMD },
 +      { X86_VENDOR_HYGON },
        {}
  };
  
  /* Only list CPUs which speculate but are non susceptible to SSB */
  static const __initconst struct x86_cpu_id cpu_no_spec_store_bypass[] = {
 -      { X86_VENDOR_INTEL,     6,      INTEL_FAM6_ATOM_SILVERMONT1     },
 +      { X86_VENDOR_INTEL,     6,      INTEL_FAM6_ATOM_SILVERMONT      },
        { X86_VENDOR_INTEL,     6,      INTEL_FAM6_ATOM_AIRMONT         },
 -      { X86_VENDOR_INTEL,     6,      INTEL_FAM6_ATOM_SILVERMONT    },
 -      { X86_VENDOR_INTEL,     6,      INTEL_FAM6_ATOM_MERRIFIELD      },
 +      { X86_VENDOR_INTEL,     6,      INTEL_FAM6_ATOM_SILVERMONT_X    },
 +      { X86_VENDOR_INTEL,     6,      INTEL_FAM6_ATOM_SILVERMONT_MID  },
        { X86_VENDOR_INTEL,     6,      INTEL_FAM6_CORE_YONAH           },
        { X86_VENDOR_INTEL,     6,      INTEL_FAM6_XEON_PHI_KNL         },
        { X86_VENDOR_INTEL,     6,      INTEL_FAM6_XEON_PHI_KNM         },
  
  static const __initconst struct x86_cpu_id cpu_no_l1tf[] = {
        /* in addition to cpu_no_speculation */
 -      { X86_VENDOR_INTEL,     6,      INTEL_FAM6_ATOM_SILVERMONT1     },
 -      { X86_VENDOR_INTEL,     6,      INTEL_FAM6_ATOM_SILVERMONT    },
 +      { X86_VENDOR_INTEL,     6,      INTEL_FAM6_ATOM_SILVERMONT      },
 +      { X86_VENDOR_INTEL,     6,      INTEL_FAM6_ATOM_SILVERMONT_X    },
        { X86_VENDOR_INTEL,     6,      INTEL_FAM6_ATOM_AIRMONT         },
 -      { X86_VENDOR_INTEL,     6,      INTEL_FAM6_ATOM_MERRIFIELD      },
 -      { X86_VENDOR_INTEL,     6,      INTEL_FAM6_ATOM_MOOREFIELD      },
 +      { X86_VENDOR_INTEL,     6,      INTEL_FAM6_ATOM_SILVERMONT_MID  },
 +      { X86_VENDOR_INTEL,     6,      INTEL_FAM6_ATOM_AIRMONT_MID     },
        { X86_VENDOR_INTEL,     6,      INTEL_FAM6_ATOM_GOLDMONT        },
 -      { X86_VENDOR_INTEL,     6,      INTEL_FAM6_ATOM_DENVERTON       },
 -      { X86_VENDOR_INTEL,     6,      INTEL_FAM6_ATOM_GEMINI_LAKE     },
 +      { X86_VENDOR_INTEL,     6,      INTEL_FAM6_ATOM_GOLDMONT_X      },
 +      { X86_VENDOR_INTEL,     6,      INTEL_FAM6_ATOM_GOLDMONT_PLUS   },
        { X86_VENDOR_INTEL,     6,      INTEL_FAM6_XEON_PHI_KNL         },
        { X86_VENDOR_INTEL,     6,      INTEL_FAM6_XEON_PHI_KNM         },
        {}
@@@ -1077,9 -1076,6 +1077,9 @@@ static void __init early_identify_cpu(s
        memset(&c->x86_capability, 0, sizeof c->x86_capability);
        c->extended_cpuid_level = 0;
  
 +      if (!have_cpuid_p())
 +              identify_cpu_without_cpuid(c);
 +
        /* cyrix could have cpuid enabled via c_identify()*/
        if (have_cpuid_p()) {
                cpu_detect(c);
                if (this_cpu->c_bsp_init)
                        this_cpu->c_bsp_init(c);
        } else {
 -              identify_cpu_without_cpuid(c);
                setup_clear_cpu_cap(X86_FEATURE_CPUID);
        }
  
@@@ -1243,10 -1240,10 +1243,10 @@@ static void generic_identify(struct cpu
         * ESPFIX issue, we can change this.
         */
  #ifdef CONFIG_X86_32
 -# ifdef CONFIG_PARAVIRT
 +# ifdef CONFIG_PARAVIRT_XXL
        do {
                extern void native_iret(void);
 -              if (pv_cpu_ops.iret == native_iret)
 +              if (pv_ops.cpu.iret == native_iret)
                        set_cpu_bug(c, X86_BUG_ESPFIX);
        } while (0);
  # else
@@@ -1534,19 -1531,8 +1534,8 @@@ EXPORT_PER_CPU_SYMBOL(__preempt_count)
  /* May not be marked __init: used by software suspend */
  void syscall_init(void)
  {
-       extern char _entry_trampoline[];
-       extern char entry_SYSCALL_64_trampoline[];
-       int cpu = smp_processor_id();
-       unsigned long SYSCALL64_entry_trampoline =
-               (unsigned long)get_cpu_entry_area(cpu)->entry_trampoline +
-               (entry_SYSCALL_64_trampoline - _entry_trampoline);
        wrmsr(MSR_STAR, 0, (__USER32_CS << 16) | __KERNEL_CS);
-       if (static_cpu_has(X86_FEATURE_PTI))
-               wrmsrl(MSR_LSTAR, SYSCALL64_entry_trampoline);
-       else
-               wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64);
+       wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64);
  
  #ifdef CONFIG_IA32_EMULATION
        wrmsrl(MSR_CSTAR, (unsigned long)entry_SYSCALL_compat);
         * AMD doesn't allow SYSENTER in long mode (either 32- or 64-bit).
         */
        wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS);
-       wrmsrl_safe(MSR_IA32_SYSENTER_ESP, (unsigned long)(cpu_entry_stack(cpu) + 1));
+       wrmsrl_safe(MSR_IA32_SYSENTER_ESP,
+                   (unsigned long)(cpu_entry_stack(smp_processor_id()) + 1));
        wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)entry_SYSENTER_compat);
  #else
        wrmsrl(MSR_CSTAR, (unsigned long)ignore_sysret);
@@@ -1672,29 -1659,6 +1662,29 @@@ static void wait_for_master_cpu(int cpu
  #endif
  }
  
 +#ifdef CONFIG_X86_64
 +static void setup_getcpu(int cpu)
 +{
 +      unsigned long cpudata = vdso_encode_cpunode(cpu, early_cpu_to_node(cpu));
 +      struct desc_struct d = { };
 +
 +      if (static_cpu_has(X86_FEATURE_RDTSCP))
 +              write_rdtscp_aux(cpudata);
 +
 +      /* Store CPU and node number in limit. */
 +      d.limit0 = cpudata;
 +      d.limit1 = cpudata >> 16;
 +
 +      d.type = 5;             /* RO data, expand down, accessed */
 +      d.dpl = 3;              /* Visible to user code */
 +      d.s = 1;                /* Not a system segment */
 +      d.p = 1;                /* Present */
 +      d.d = 1;                /* 32-bit */
 +
 +      write_gdt_entry(get_cpu_gdt_rw(cpu), GDT_ENTRY_CPUNODE, &d, DESCTYPE_S);
 +}
 +#endif
 +
  /*
   * cpu_init() initializes state that is per-CPU. Some data is already
   * initialized (naturally) in the bootstrap process, such as the GDT
@@@ -1732,7 -1696,6 +1722,7 @@@ void cpu_init(void
            early_cpu_to_node(cpu) != NUMA_NO_NODE)
                set_numa_node(early_cpu_to_node(cpu));
  #endif
 +      setup_getcpu(cpu);
  
        me = current;
  
diff --combined arch/x86/kernel/kprobes/core.c
index f72a47b602e208ce3c18c24f1f215e4e2a465674,f802cf5b447885ca4636727c40c2f78784c832b6..c33b06f5faa4079bb87392d42dd232ad6a2fe5a1
@@@ -1020,26 -1020,56 +1020,18 @@@ int kprobe_fault_handler(struct pt_reg
                 */
                if (cur->fault_handler && cur->fault_handler(cur, regs, trapnr))
                        return 1;
 -
 -              /*
 -               * In case the user-specified fault handler returned
 -               * zero, try to fix up.
 -               */
 -              if (fixup_exception(regs, trapnr))
 -                      return 1;
 -
 -              /*
 -               * fixup routine could not handle it,
 -               * Let do_page_fault() fix it.
 -               */
        }
  
        return 0;
  }
  NOKPROBE_SYMBOL(kprobe_fault_handler);
  
 -/*
 - * Wrapper routine for handling exceptions.
 - */
 -int kprobe_exceptions_notify(struct notifier_block *self, unsigned long val,
 -                           void *data)
 -{
 -      struct die_args *args = data;
 -      int ret = NOTIFY_DONE;
 -
 -      if (args->regs && user_mode(args->regs))
 -              return ret;
 -
 -      if (val == DIE_GPF) {
 -              /*
 -               * To be potentially processing a kprobe fault and to
 -               * trust the result from kprobe_running(), we have
 -               * be non-preemptible.
 -               */
 -              if (!preemptible() && kprobe_running() &&
 -                  kprobe_fault_handler(args->regs, args->trapnr))
 -                      ret = NOTIFY_STOP;
 -      }
 -      return ret;
 -}
 -NOKPROBE_SYMBOL(kprobe_exceptions_notify);
 -
  bool arch_within_kprobe_blacklist(unsigned long addr)
  {
-       bool is_in_entry_trampoline_section = false;
- #ifdef CONFIG_X86_64
-       is_in_entry_trampoline_section =
-               (addr >= (unsigned long)__entry_trampoline_start &&
-                addr < (unsigned long)__entry_trampoline_end);
- #endif
        return  (addr >= (unsigned long)__kprobes_text_start &&
                 addr < (unsigned long)__kprobes_text_end) ||
                (addr >= (unsigned long)__entry_text_start &&
-                addr < (unsigned long)__entry_text_end) ||
-               is_in_entry_trampoline_section;
+                addr < (unsigned long)__entry_text_end);
  }
  
  int __init arch_init_kprobes(void)
diff --combined arch/x86/kernel/process_64.c
index d6674a425714b653def5c7c10628af9e723a6745,0fa7aa19f09e00696f38d19e156107760c2bb2cb..31b4755369f084575f6b3a0ec30b340392106f70
  #include <asm/vdso.h>
  #include <asm/intel_rdt_sched.h>
  #include <asm/unistd.h>
 +#include <asm/fsgsbase.h>
  #ifdef CONFIG_IA32_EMULATION
  /* Not included via unistd.h */
  #include <asm/unistd_32_ia32.h>
  #endif
  
- __visible DEFINE_PER_CPU(unsigned long, rsp_scratch);
  /* Prints also some state that isn't saved in the pt_regs */
 -void __show_regs(struct pt_regs *regs, int all)
 +void __show_regs(struct pt_regs *regs, enum show_regs_mode mode)
  {
        unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
        unsigned long d0, d1, d2, d3, d6, d7;
        printk(KERN_DEFAULT "R13: %016lx R14: %016lx R15: %016lx\n",
               regs->r13, regs->r14, regs->r15);
  
 -      if (!all)
 +      if (mode == SHOW_REGS_SHORT)
                return;
  
 +      if (mode == SHOW_REGS_USER) {
 +              rdmsrl(MSR_FS_BASE, fs);
 +              rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);
 +              printk(KERN_DEFAULT "FS:  %016lx GS:  %016lx\n",
 +                     fs, shadowgs);
 +              return;
 +      }
 +
        asm("movl %%ds,%0" : "=r" (ds));
        asm("movl %%cs,%0" : "=r" (cs));
        asm("movl %%es,%0" : "=r" (es));
@@@ -287,138 -276,6 +285,138 @@@ static __always_inline void load_seg_le
        }
  }
  
 +static __always_inline void x86_fsgsbase_load(struct thread_struct *prev,
 +                                            struct thread_struct *next)
 +{
 +      load_seg_legacy(prev->fsindex, prev->fsbase,
 +                      next->fsindex, next->fsbase, FS);
 +      load_seg_legacy(prev->gsindex, prev->gsbase,
 +                      next->gsindex, next->gsbase, GS);
 +}
 +
 +static unsigned long x86_fsgsbase_read_task(struct task_struct *task,
 +                                          unsigned short selector)
 +{
 +      unsigned short idx = selector >> 3;
 +      unsigned long base;
 +
 +      if (likely((selector & SEGMENT_TI_MASK) == 0)) {
 +              if (unlikely(idx >= GDT_ENTRIES))
 +                      return 0;
 +
 +              /*
 +               * There are no user segments in the GDT with nonzero bases
 +               * other than the TLS segments.
 +               */
 +              if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
 +                      return 0;
 +
 +              idx -= GDT_ENTRY_TLS_MIN;
 +              base = get_desc_base(&task->thread.tls_array[idx]);
 +      } else {
 +#ifdef CONFIG_MODIFY_LDT_SYSCALL
 +              struct ldt_struct *ldt;
 +
 +              /*
 +               * If performance here mattered, we could protect the LDT
 +               * with RCU.  This is a slow path, though, so we can just
 +               * take the mutex.
 +               */
 +              mutex_lock(&task->mm->context.lock);
 +              ldt = task->mm->context.ldt;
 +              if (unlikely(idx >= ldt->nr_entries))
 +                      base = 0;
 +              else
 +                      base = get_desc_base(ldt->entries + idx);
 +              mutex_unlock(&task->mm->context.lock);
 +#else
 +              base = 0;
 +#endif
 +      }
 +
 +      return base;
 +}
 +
 +void x86_fsbase_write_cpu(unsigned long fsbase)
 +{
 +      /*
 +       * Set the selector to 0 as a notion, that the segment base is
 +       * overwritten, which will be checked for skipping the segment load
 +       * during context switch.
 +       */
 +      loadseg(FS, 0);
 +      wrmsrl(MSR_FS_BASE, fsbase);
 +}
 +
 +void x86_gsbase_write_cpu_inactive(unsigned long gsbase)
 +{
 +      /* Set the selector to 0 for the same reason as %fs above. */
 +      loadseg(GS, 0);
 +      wrmsrl(MSR_KERNEL_GS_BASE, gsbase);
 +}
 +
 +unsigned long x86_fsbase_read_task(struct task_struct *task)
 +{
 +      unsigned long fsbase;
 +
 +      if (task == current)
 +              fsbase = x86_fsbase_read_cpu();
 +      else if (task->thread.fsindex == 0)
 +              fsbase = task->thread.fsbase;
 +      else
 +              fsbase = x86_fsgsbase_read_task(task, task->thread.fsindex);
 +
 +      return fsbase;
 +}
 +
 +unsigned long x86_gsbase_read_task(struct task_struct *task)
 +{
 +      unsigned long gsbase;
 +
 +      if (task == current)
 +              gsbase = x86_gsbase_read_cpu_inactive();
 +      else if (task->thread.gsindex == 0)
 +              gsbase = task->thread.gsbase;
 +      else
 +              gsbase = x86_fsgsbase_read_task(task, task->thread.gsindex);
 +
 +      return gsbase;
 +}
 +
 +int x86_fsbase_write_task(struct task_struct *task, unsigned long fsbase)
 +{
 +      /*
 +       * Not strictly needed for %fs, but do it for symmetry
 +       * with %gs
 +       */
 +      if (unlikely(fsbase >= TASK_SIZE_MAX))
 +              return -EPERM;
 +
 +      preempt_disable();
 +      task->thread.fsbase = fsbase;
 +      if (task == current)
 +              x86_fsbase_write_cpu(fsbase);
 +      task->thread.fsindex = 0;
 +      preempt_enable();
 +
 +      return 0;
 +}
 +
 +int x86_gsbase_write_task(struct task_struct *task, unsigned long gsbase)
 +{
 +      if (unlikely(gsbase >= TASK_SIZE_MAX))
 +              return -EPERM;
 +
 +      preempt_disable();
 +      task->thread.gsbase = gsbase;
 +      if (task == current)
 +              x86_gsbase_write_cpu_inactive(gsbase);
 +      task->thread.gsindex = 0;
 +      preempt_enable();
 +
 +      return 0;
 +}
 +
  int copy_thread_tls(unsigned long clone_flags, unsigned long sp,
                unsigned long arg, struct task_struct *p, unsigned long tls)
  {
@@@ -606,7 -463,10 +604,7 @@@ __switch_to(struct task_struct *prev_p
        if (unlikely(next->ds | prev->ds))
                loadsegment(ds, next->ds);
  
 -      load_seg_legacy(prev->fsindex, prev->fsbase,
 -                      next->fsindex, next->fsbase, FS);
 -      load_seg_legacy(prev->gsindex, prev->gsbase,
 -                      next->gsindex, next->gsbase, GS);
 +      x86_fsgsbase_load(prev, next);
  
        switch_fpu_finish(next_fpu, cpu);
  
@@@ -757,25 -617,54 +755,25 @@@ static long prctl_map_vdso(const struc
  long do_arch_prctl_64(struct task_struct *task, int option, unsigned long arg2)
  {
        int ret = 0;
 -      int doit = task == current;
 -      int cpu;
  
        switch (option) {
 -      case ARCH_SET_GS:
 -              if (arg2 >= TASK_SIZE_MAX)
 -                      return -EPERM;
 -              cpu = get_cpu();
 -              task->thread.gsindex = 0;
 -              task->thread.gsbase = arg2;
 -              if (doit) {
 -                      load_gs_index(0);
 -                      ret = wrmsrl_safe(MSR_KERNEL_GS_BASE, arg2);
 -              }
 -              put_cpu();
 +      case ARCH_SET_GS: {
 +              ret = x86_gsbase_write_task(task, arg2);
                break;
 -      case ARCH_SET_FS:
 -              /* Not strictly needed for fs, but do it for symmetry
 -                 with gs */
 -              if (arg2 >= TASK_SIZE_MAX)
 -                      return -EPERM;
 -              cpu = get_cpu();
 -              task->thread.fsindex = 0;
 -              task->thread.fsbase = arg2;
 -              if (doit) {
 -                      /* set the selector to 0 to not confuse __switch_to */
 -                      loadsegment(fs, 0);
 -                      ret = wrmsrl_safe(MSR_FS_BASE, arg2);
 -              }
 -              put_cpu();
 +      }
 +      case ARCH_SET_FS: {
 +              ret = x86_fsbase_write_task(task, arg2);
                break;
 +      }
        case ARCH_GET_FS: {
 -              unsigned long base;
 +              unsigned long base = x86_fsbase_read_task(task);
  
 -              if (doit)
 -                      rdmsrl(MSR_FS_BASE, base);
 -              else
 -                      base = task->thread.fsbase;
                ret = put_user(base, (unsigned long __user *)arg2);
                break;
        }
        case ARCH_GET_GS: {
 -              unsigned long base;
 +              unsigned long base = x86_gsbase_read_task(task);
  
 -              if (doit)
 -                      rdmsrl(MSR_KERNEL_GS_BASE, base);
 -              else
 -                      base = task->thread.gsbase;
                ret = put_user(base, (unsigned long __user *)arg2);
                break;
        }
diff --combined arch/x86/kernel/traps.c
index 16c95cb904964ff45ba2bf077df266c893e3fa57,1a90821c0b7421e29864bd9088501d64fe165917..5bd0a997d81e28f1ab0ce859dfb2415128602909
@@@ -206,7 -206,7 +206,7 @@@ do_trap_no_signal(struct task_struct *t
        }
  
        if (!user_mode(regs)) {
 -              if (fixup_exception(regs, trapnr))
 +              if (fixup_exception(regs, trapnr, error_code, 0))
                        return 0;
  
                tsk->thread.error_code = error_code;
@@@ -383,6 -383,10 +383,10 @@@ dotraplinkage void do_double_fault(stru
                 * we won't enable interupts or schedule before we invoke
                 * general_protection, so nothing will clobber the stack
                 * frame we just set up.
+                *
+                * We will enter general_protection with kernel GSBASE,
+                * which is what the stub expects, given that the faulting
+                * RIP will be the IRET instruction.
                 */
                regs->ip = (unsigned long)general_protection;
                regs->sp = (unsigned long)&gpregs->orig_ax;
@@@ -551,21 -555,11 +555,21 @@@ do_general_protection(struct pt_regs *r
  
        tsk = current;
        if (!user_mode(regs)) {
 -              if (fixup_exception(regs, X86_TRAP_GP))
 +              if (fixup_exception(regs, X86_TRAP_GP, error_code, 0))
                        return;
  
                tsk->thread.error_code = error_code;
                tsk->thread.trap_nr = X86_TRAP_GP;
 +
 +              /*
 +               * To be potentially processing a kprobe fault and to
 +               * trust the result from kprobe_running(), we have to
 +               * be non-preemptible.
 +               */
 +              if (!preemptible() && kprobe_running() &&
 +                  kprobe_fault_handler(regs, X86_TRAP_GP))
 +                      return;
 +
                if (notify_die(DIE_GPF, "general protection fault", regs, error_code,
                               X86_TRAP_GP, SIGSEGV) != NOTIFY_STOP)
                        die("general protection fault", regs, error_code);
@@@ -848,7 -842,7 +852,7 @@@ static void math_error(struct pt_regs *
        cond_local_irq_enable(regs);
  
        if (!user_mode(regs)) {
 -              if (fixup_exception(regs, trapnr))
 +              if (fixup_exception(regs, trapnr, error_code, 0))
                        return;
  
                task->thread.error_code = error_code;
diff --combined arch/x86/kernel/vmlinux.lds.S
index 5dd3317d761f4065b0fc8f7cdcac4e08be600f0e,9c77d2df9c2725399ee65c5947535c72164a4349..0d618ee634ac40cbdd35957fcbe2f17f13446a6f
@@@ -65,23 -65,6 +65,23 @@@ jiffies_64 = jiffies
  #define ALIGN_ENTRY_TEXT_BEGIN        . = ALIGN(PMD_SIZE);
  #define ALIGN_ENTRY_TEXT_END  . = ALIGN(PMD_SIZE);
  
 +/*
 + * This section contains data which will be mapped as decrypted. Memory
 + * encryption operates on a page basis. Make this section PMD-aligned
 + * to avoid splitting the pages while mapping the section early.
 + *
 + * Note: We use a separate section so that only this section gets
 + * decrypted to avoid exposing more than we wish.
 + */
 +#define BSS_DECRYPTED                                         \
 +      . = ALIGN(PMD_SIZE);                                    \
 +      __start_bss_decrypted = .;                              \
 +      *(.bss..decrypted);                                     \
 +      . = ALIGN(PAGE_SIZE);                                   \
 +      __start_bss_decrypted_unused = .;                       \
 +      . = ALIGN(PMD_SIZE);                                    \
 +      __end_bss_decrypted = .;                                \
 +
  #else
  
  #define X86_ALIGN_RODATA_BEGIN
@@@ -91,7 -74,6 +91,7 @@@
  
  #define ALIGN_ENTRY_TEXT_BEGIN
  #define ALIGN_ENTRY_TEXT_END
 +#define BSS_DECRYPTED
  
  #endif
  
@@@ -136,16 -118,6 +136,6 @@@ SECTION
                *(.fixup)
                *(.gnu.warning)
  
- #ifdef CONFIG_X86_64
-               . = ALIGN(PAGE_SIZE);
-               __entry_trampoline_start = .;
-               _entry_trampoline = .;
-               *(.entry_trampoline)
-               . = ALIGN(PAGE_SIZE);
-               __entry_trampoline_end = .;
-               ASSERT(. - _entry_trampoline == PAGE_SIZE, "entry trampoline is too big");
- #endif
  #ifdef CONFIG_RETPOLINE
                __indirect_thunk_start = .;
                *(.text.__x86.indirect_thunk)
                __bss_start = .;
                *(.bss..page_aligned)
                *(.bss)
 +              BSS_DECRYPTED
                . = ALIGN(PAGE_SIZE);
                __bss_stop = .;
        }
diff --combined arch/x86/mm/tlb.c
index 7d68489cfdb15ff0838aba5beb6207c36b27cd77,073b8df349a0d6f81888bf87cf27bbf2b2efba7a..bddd6b3cee1de51ac8321974b827d208dbab1831
@@@ -7,6 -7,7 +7,7 @@@
  #include <linux/export.h>
  #include <linux/cpu.h>
  #include <linux/debugfs.h>
+ #include <linux/ptrace.h>
  
  #include <asm/tlbflush.h>
  #include <asm/mmu_context.h>
@@@ -180,16 -181,26 +181,29 @@@ static void sync_current_stack_to_mm(st
        }
  }
  
+ static bool ibpb_needed(struct task_struct *tsk, u64 last_ctx_id)
+ {
+       /*
+        * Check if the current (previous) task has access to the memory
+        * of the @tsk (next) task. If access is denied, make sure to
+        * issue an IBPB to stop user->user Spectre-v2 attacks.
+        *
+        * Note: __ptrace_may_access() returns 0 or -ERRNO.
+        */
+       return (tsk && tsk->mm && tsk->mm->context.ctx_id != last_ctx_id &&
+               ptrace_may_access_sched(tsk, PTRACE_MODE_SPEC_IBPB));
+ }
  void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
                        struct task_struct *tsk)
  {
        struct mm_struct *real_prev = this_cpu_read(cpu_tlbstate.loaded_mm);
        u16 prev_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
 +      bool was_lazy = this_cpu_read(cpu_tlbstate.is_lazy);
        unsigned cpu = smp_processor_id();
        u64 next_tlb_gen;
 +      bool need_flush;
 +      u16 new_asid;
  
        /*
         * NB: The scheduler will call us with prev == next when switching
                           next->context.ctx_id);
  
                /*
 -               * We don't currently support having a real mm loaded without
 -               * our cpu set in mm_cpumask().  We have all the bookkeeping
 -               * in place to figure out whether we would need to flush
 -               * if our cpu were cleared in mm_cpumask(), but we don't
 -               * currently use it.
 +               * Even in lazy TLB mode, the CPU should stay set in the
 +               * mm_cpumask. The TLB shootdown code can figure out
 +               * from cpu_tlbstate.is_lazy whether or not to send an IPI.
                 */
                if (WARN_ON_ONCE(real_prev != &init_mm &&
                                 !cpumask_test_cpu(cpu, mm_cpumask(next))))
                        cpumask_set_cpu(cpu, mm_cpumask(next));
  
 -              return;
 +              /*
 +               * If the CPU is not in lazy TLB mode, we are just switching
 +               * from one thread in a process to another thread in the same
 +               * process. No TLB flush required.
 +               */
 +              if (!was_lazy)
 +                      return;
 +
 +              /*
 +               * Read the tlb_gen to check whether a flush is needed.
 +               * If the TLB is up to date, just use it.
 +               * The barrier synchronizes with the tlb_gen increment in
 +               * the TLB shootdown code.
 +               */
 +              smp_mb();
 +              next_tlb_gen = atomic64_read(&next->context.tlb_gen);
 +              if (this_cpu_read(cpu_tlbstate.ctxs[prev_asid].tlb_gen) ==
 +                              next_tlb_gen)
 +                      return;
 +
 +              /*
 +               * TLB contents went out of date while we were in lazy
 +               * mode. Fall through to the TLB switching code below.
 +               */
 +              new_asid = prev_asid;
 +              need_flush = true;
        } else {
 -              u16 new_asid;
 -              bool need_flush;
                u64 last_ctx_id = this_cpu_read(cpu_tlbstate.last_ctx_id);
  
                /*
                 * one process from doing Spectre-v2 attacks on another.
                 *
                 * As an optimization, flush indirect branches only when
-                * switching into processes that disable dumping. This
-                * protects high value processes like gpg, without having
-                * too high performance overhead. IBPB is *expensive*!
-                *
-                * This will not flush branches when switching into kernel
-                * threads. It will also not flush if we switch to idle
-                * thread and back to the same process. It will flush if we
-                * switch to a different non-dumpable process.
+                * switching into a process that can't be ptraced by the
+                * current one (in such a case, the attacker has a much more
+                * convenient way to tamper with the next process than
+                * branch buffer poisoning).
                 */
-               if (tsk && tsk->mm &&
-                   tsk->mm->context.ctx_id != last_ctx_id &&
-                   get_dumpable(tsk->mm) != SUID_DUMP_USER)
+               if (static_cpu_has(X86_FEATURE_USE_IBPB) &&
+                               ibpb_needed(tsk, last_ctx_id))
                        indirect_branch_prediction_barrier();
  
                if (IS_ENABLED(CONFIG_VMAP_STACK)) {
                /* Let nmi_uaccess_okay() know that we're changing CR3. */
                this_cpu_write(cpu_tlbstate.loaded_mm, LOADED_MM_SWITCHING);
                barrier();
 +      }
  
 -              if (need_flush) {
 -                      this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id);
 -                      this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen);
 -                      load_new_mm_cr3(next->pgd, new_asid, true);
 -
 -                      /*
 -                       * NB: This gets called via leave_mm() in the idle path
 -                       * where RCU functions differently.  Tracing normally
 -                       * uses RCU, so we need to use the _rcuidle variant.
 -                       *
 -                       * (There is no good reason for this.  The idle code should
 -                       *  be rearranged to call this before rcu_idle_enter().)
 -                       */
 -                      trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
 -              } else {
 -                      /* The new ASID is already up to date. */
 -                      load_new_mm_cr3(next->pgd, new_asid, false);
 -
 -                      /* See above wrt _rcuidle. */
 -                      trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, 0);
 -              }
 +      if (need_flush) {
 +              this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id);
 +              this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen);
 +              load_new_mm_cr3(next->pgd, new_asid, true);
  
                /*
 -               * Record last user mm's context id, so we can avoid
 -               * flushing branch buffer with IBPB if we switch back
 -               * to the same user.
 +               * NB: This gets called via leave_mm() in the idle path
 +               * where RCU functions differently.  Tracing normally
 +               * uses RCU, so we need to use the _rcuidle variant.
 +               *
 +               * (There is no good reason for this.  The idle code should
 +               *  be rearranged to call this before rcu_idle_enter().)
                 */
 -              if (next != &init_mm)
 -                      this_cpu_write(cpu_tlbstate.last_ctx_id, next->context.ctx_id);
 -
 -              /* Make sure we write CR3 before loaded_mm. */
 -              barrier();
 +              trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
 +      } else {
 +              /* The new ASID is already up to date. */
 +              load_new_mm_cr3(next->pgd, new_asid, false);
  
 -              this_cpu_write(cpu_tlbstate.loaded_mm, next);
 -              this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid);
 +              /* See above wrt _rcuidle. */
 +              trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, 0);
        }
  
 -      load_mm_cr4(next);
 -      switch_ldt(real_prev, next);
 +      /*
 +       * Record last user mm's context id, so we can avoid
 +       * flushing branch buffer with IBPB if we switch back
 +       * to the same user.
 +       */
 +      if (next != &init_mm)
 +              this_cpu_write(cpu_tlbstate.last_ctx_id, next->context.ctx_id);
 +
 +      /* Make sure we write CR3 before loaded_mm. */
 +      barrier();
 +
 +      this_cpu_write(cpu_tlbstate.loaded_mm, next);
 +      this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid);
 +
 +      if (next != real_prev) {
 +              load_mm_cr4(next);
 +              switch_ldt(real_prev, next);
 +      }
  }
  
  /*
@@@ -394,7 -377,20 +403,7 @@@ void enter_lazy_tlb(struct mm_struct *m
        if (this_cpu_read(cpu_tlbstate.loaded_mm) == &init_mm)
                return;
  
 -      if (tlb_defer_switch_to_init_mm()) {
 -              /*
 -               * There's a significant optimization that may be possible
 -               * here.  We have accurate enough TLB flush tracking that we
 -               * don't need to maintain coherence of TLB per se when we're
 -               * lazy.  We do, however, need to maintain coherence of
 -               * paging-structure caches.  We could, in principle, leave our
 -               * old mm loaded and only switch to init_mm when
 -               * tlb_remove_page() happens.
 -               */
 -              this_cpu_write(cpu_tlbstate.is_lazy, true);
 -      } else {
 -              switch_mm(NULL, &init_mm, NULL);
 -      }
 +      this_cpu_write(cpu_tlbstate.is_lazy, true);
  }
  
  /*
@@@ -481,9 -477,6 +490,9 @@@ static void flush_tlb_func_common(cons
                 * paging-structure cache to avoid speculatively reading
                 * garbage into our TLB.  Since switching to init_mm is barely
                 * slower than a minimal flush, just switch to init_mm.
 +               *
 +               * This should be rare, with native_flush_tlb_others skipping
 +               * IPIs to lazy TLB mode CPUs.
                 */
                switch_mm_irqs_off(NULL, &init_mm, NULL);
                return;
            f->new_tlb_gen == local_tlb_gen + 1 &&
            f->new_tlb_gen == mm_tlb_gen) {
                /* Partial flush */
 -              unsigned long addr;
 -              unsigned long nr_pages = (f->end - f->start) >> PAGE_SHIFT;
 +              unsigned long nr_invalidate = (f->end - f->start) >> f->stride_shift;
 +              unsigned long addr = f->start;
  
 -              addr = f->start;
                while (addr < f->end) {
                        __flush_tlb_one_user(addr);
 -                      addr += PAGE_SIZE;
 +                      addr += 1UL << f->stride_shift;
                }
                if (local)
 -                      count_vm_tlb_events(NR_TLB_LOCAL_FLUSH_ONE, nr_pages);
 -              trace_tlb_flush(reason, nr_pages);
 +                      count_vm_tlb_events(NR_TLB_LOCAL_FLUSH_ONE, nr_invalidate);
 +              trace_tlb_flush(reason, nr_invalidate);
        } else {
                /* Full flush. */
                local_flush_tlb();
@@@ -586,11 -580,6 +595,11 @@@ static void flush_tlb_func_remote(void 
        flush_tlb_func_common(f, false, TLB_REMOTE_SHOOTDOWN);
  }
  
 +static bool tlb_is_not_lazy(int cpu, void *data)
 +{
 +      return !per_cpu(cpu_tlbstate.is_lazy, cpu);
 +}
 +
  void native_flush_tlb_others(const struct cpumask *cpumask,
                             const struct flush_tlb_info *info)
  {
                                               (void *)info, 1);
                return;
        }
 -      smp_call_function_many(cpumask, flush_tlb_func_remote,
 +
 +      /*
 +       * If no page tables were freed, we can skip sending IPIs to
 +       * CPUs in lazy TLB mode. They will flush the CPU themselves
 +       * at the next context switch.
 +       *
 +       * However, if page tables are getting freed, we need to send the
 +       * IPI everywhere, to prevent CPUs in lazy TLB mode from tripping
 +       * up on the new contents of what used to be page tables, while
 +       * doing a speculative memory access.
 +       */
 +      if (info->freed_tables)
 +              smp_call_function_many(cpumask, flush_tlb_func_remote,
                               (void *)info, 1);
 +      else
 +              on_each_cpu_cond_mask(tlb_is_not_lazy, flush_tlb_func_remote,
 +                              (void *)info, 1, GFP_ATOMIC, cpumask);
  }
  
  /*
  static unsigned long tlb_single_page_flush_ceiling __read_mostly = 33;
  
  void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
 -                              unsigned long end, unsigned long vmflag)
 +                              unsigned long end, unsigned int stride_shift,
 +                              bool freed_tables)
  {
        int cpu;
  
        struct flush_tlb_info info __aligned(SMP_CACHE_BYTES) = {
                .mm = mm,
 +              .stride_shift = stride_shift,
 +              .freed_tables = freed_tables,
        };
  
        cpu = get_cpu();
  
        /* Should we flush just the requested range? */
        if ((end != TLB_FLUSH_ALL) &&
 -          !(vmflag & VM_HUGETLB) &&
 -          ((end - start) >> PAGE_SHIFT) <= tlb_single_page_flush_ceiling) {
 +          ((end - start) >> stride_shift) <= tlb_single_page_flush_ceiling) {
                info.start = start;
                info.end = end;
        } else {
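
The new stride_shift/freed_tables arguments above replace the old vmflag/VM_HUGETLB test: callers now state the mapping granularity directly, and the partial-flush loop steps by 1UL << stride_shift instead of always PAGE_SIZE.  A small illustration of the arithmetic, with the usual x86-64 shift values (the helper name is made up for the example):

    #include <stdio.h>

    #define PAGE_SHIFT      12      /* 4 KiB base pages */
    #define PMD_SHIFT       21      /* 2 MiB huge pages */

    /* Number of invalidations the partial-flush loop would issue. */
    static unsigned long nr_invalidate(unsigned long start, unsigned long end,
                                       unsigned int stride_shift)
    {
            return (end - start) >> stride_shift;
    }

    int main(void)
    {
            /* Flushing a 4 MiB range: 1024 per-page invalidations vs. 2. */
            printf("4K stride: %lu\n", nr_invalidate(0, 4UL << 20, PAGE_SHIFT));
            printf("2M stride: %lu\n", nr_invalidate(0, 4UL << 20, PMD_SHIFT));
            return 0;
    }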
diff --combined kernel/cpu.c
index e82920b8bee14e62be2bb248d92413a85aff4aa9,2fb49916ea560735d7f51f8a7610892303ec243a..3c7f3b4c453cf57c8e37dd5fadc9f5941f074f0d
@@@ -315,16 -315,6 +315,16 @@@ void lockdep_assert_cpus_held(void
        percpu_rwsem_assert_held(&cpu_hotplug_lock);
  }
  
 +static void lockdep_acquire_cpus_lock(void)
 +{
 +      rwsem_acquire(&cpu_hotplug_lock.rw_sem.dep_map, 0, 0, _THIS_IP_);
 +}
 +
 +static void lockdep_release_cpus_lock(void)
 +{
 +      rwsem_release(&cpu_hotplug_lock.rw_sem.dep_map, 1, _THIS_IP_);
 +}
 +
  /*
   * Wait for currently running CPU hotplug operations to complete (if any) and
   * disable future CPU hotplug (from sysfs). The 'cpu_add_remove_lock' protects
@@@ -354,17 -344,6 +354,17 @@@ void cpu_hotplug_enable(void
        cpu_maps_update_done();
  }
  EXPORT_SYMBOL_GPL(cpu_hotplug_enable);
 +
 +#else
 +
 +static void lockdep_acquire_cpus_lock(void)
 +{
 +}
 +
 +static void lockdep_release_cpus_lock(void)
 +{
 +}
 +
  #endif        /* CONFIG_HOTPLUG_CPU */
  
  #ifdef CONFIG_HOTPLUG_SMT
@@@ -383,7 -362,6 +383,7 @@@ void __init cpu_smt_disable(bool force
                pr_info("SMT: Force disabled\n");
                cpu_smt_control = CPU_SMT_FORCE_DISABLED;
        } else {
 +              pr_info("SMT: disabled\n");
                cpu_smt_control = CPU_SMT_DISABLED;
        }
  }
@@@ -629,21 -607,15 +629,21 @@@ static void cpuhp_thread_fun(unsigned i
        bool bringup = st->bringup;
        enum cpuhp_state state;
  
 +      if (WARN_ON_ONCE(!st->should_run))
 +              return;
 +
        /*
         * ACQUIRE for the cpuhp_should_run() load of ->should_run. Ensures
         * that if we see ->should_run we also see the rest of the state.
         */
        smp_mb();
  
 -      if (WARN_ON_ONCE(!st->should_run))
 -              return;
 -
 +      /*
 +       * The BP holds the hotplug lock, but we're now running on the AP,
 +       * ensure that anybody asserting the lock is held, will actually find
 +       * it so.
 +       */
 +      lockdep_acquire_cpus_lock();
        cpuhp_lock_acquire(bringup);
  
        if (st->single) {
        }
  
        cpuhp_lock_release(bringup);
 +      lockdep_release_cpus_lock();
  
        if (!st->should_run)
                complete_ap_thread(st, bringup);
@@@ -945,8 -916,7 +945,8 @@@ static int cpuhp_down_callbacks(unsigne
                ret = cpuhp_invoke_callback(cpu, st->state, false, NULL, NULL);
                if (ret) {
                        st->target = prev_state;
 -                      undo_cpu_down(cpu, st);
 +                      if (st->state < prev_state)
 +                              undo_cpu_down(cpu, st);
                        break;
                }
        }
@@@ -999,7 -969,7 +999,7 @@@ static int __ref _cpu_down(unsigned in
         * to do the further cleanups.
         */
        ret = cpuhp_down_callbacks(cpu, st, target);
 -      if (ret && st->state > CPUHP_TEARDOWN_CPU && st->state < prev_state) {
 +      if (ret && st->state == CPUHP_TEARDOWN_CPU && st->state < prev_state) {
                cpuhp_reset_state(st, prev_state);
                __cpuhp_kick_ap(st);
        }
@@@ -2055,6 -2025,12 +2055,12 @@@ static void cpuhp_online_cpu_device(uns
        kobject_uevent(&dev->kobj, KOBJ_ONLINE);
  }
  
+ /*
+  * Architectures that need SMT-specific errata handling during SMT hotplug
+  * should override this.
+  */
+ void __weak arch_smt_update(void) { };
  static int cpuhp_smt_disable(enum cpuhp_smt_control ctrlval)
  {
        int cpu, ret = 0;
                 */
                cpuhp_offline_cpu_device(cpu);
        }
-       if (!ret)
+       if (!ret) {
                cpu_smt_control = ctrlval;
+               arch_smt_update();
+       }
        cpu_maps_update_done();
        return ret;
  }
@@@ -2093,6 -2071,7 +2101,7 @@@ static int cpuhp_smt_enable(void
  
        cpu_maps_update_begin();
        cpu_smt_control = CPU_SMT_ENABLED;
+       arch_smt_update();
        for_each_present_cpu(cpu) {
                /* Skip online CPUs and CPUs on offline nodes */
                if (cpu_online(cpu) || !node_online(cpu_to_node(cpu)))
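
The kernel/cpu.c side above uses the weak-symbol pattern: the generic code carries an empty "void __weak arch_smt_update(void)", and the x86 definition in cpu/bugs.c overrides it at link time.  A freestanding sketch of the same pattern (single file, so only the weak default is shown; any other object file providing a strong arch_smt_update() would win):

    #include <stdio.h>

    /* Generic fallback, as in kernel/cpu.c; __weak lets an arch override it. */
    void __attribute__((weak)) arch_smt_update(void)
    {
            puts("weak default: nothing to do");
    }

    int main(void)
    {
            /* Linked alone this prints the weak default; linking in a .o with
             * a non-weak arch_smt_update() would call that one instead. */
            arch_smt_update();
            return 0;
    }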