Merge branch 'x86-pti-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git...
author     Linus Torvalds <torvalds@linux-foundation.org>
Tue, 23 Oct 2018 17:43:04 +0000 (18:43 +0100)
committer  Linus Torvalds <torvalds@linux-foundation.org>
Tue, 23 Oct 2018 17:43:04 +0000 (18:43 +0100)
Pull x86 pti updates from Ingo Molnar:
 "The main changes:

   - Make the IBPB barrier more strict and add STIBP support (Jiri
     Kosina)

   - Micro-optimize and clean up the entry code (Andy Lutomirski)

   - ... plus misc other fixes"

* 'x86-pti-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  x86/speculation: Propagate information about RSB filling mitigation to sysfs
  x86/speculation: Enable cross-hyperthread spectre v2 STIBP mitigation
  x86/speculation: Apply IBPB more strictly to avoid cross-process data leak
  x86/speculation: Add RETPOLINE_AMD support to the inline asm CALL_NOSPEC variant
  x86/CPU: Fix unused variable warning when !CONFIG_IA32_EMULATION
  x86/pti/64: Remove the SYSCALL64 entry trampoline
  x86/entry/64: Use the TSS sp2 slot for SYSCALL/SYSRET scratch space
  x86/entry/64: Document idtentry
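
The sysfs-reporting change listed above ("Propagate information about RSB filling mitigation to sysfs") extends the mitigation string exposed under /sys/devices/system/cpu/vulnerabilities/spectre_v2, which after this series can additionally report ", STIBP" and ", RSB filling".  A minimal userspace sketch for reading it (not part of the series; the path is the standard vulnerabilities interface):

    #include <stdio.h>

    int main(void)
    {
            char line[256];
            FILE *f = fopen("/sys/devices/system/cpu/vulnerabilities/spectre_v2", "r");

            if (!f) {
                    perror("spectre_v2");
                    return 1;
            }
            /* Typical output after this series, depending on CPU features:
             * "Mitigation: Full generic retpoline, IBPB, IBRS_FW, STIBP, RSB filling"
             */
            if (fgets(line, sizeof(line), f))
                    fputs(line, stdout);
            fclose(f);
            return 0;
    }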

arch/x86/entry/entry_64.S
arch/x86/include/asm/processor.h
arch/x86/kernel/asm-offsets.c
arch/x86/kernel/cpu/bugs.c
arch/x86/kernel/cpu/common.c
arch/x86/kernel/kprobes/core.c
arch/x86/kernel/process_64.c
arch/x86/kernel/traps.c
arch/x86/kernel/vmlinux.lds.S
arch/x86/mm/tlb.c
kernel/cpu.c

diff --combined arch/x86/entry/entry_64.S
index 7c5ce0a6c4d2623edccda5f3586a591da3abc12e,0d728142467fda9431f6c83afadc770d2dbed584..4d7a2d9d44cfec5928b902cef1bca9bca29093a6
@@@ -142,67 -142,6 +142,6 @@@ END(native_usergs_sysret64
   * with them due to bugs in both AMD and Intel CPUs.
   */
  
-       .pushsection .entry_trampoline, "ax"
- /*
-  * The code in here gets remapped into cpu_entry_area's trampoline.  This means
-  * that the assembler and linker have the wrong idea as to where this code
-  * lives (and, in fact, it's mapped more than once, so it's not even at a
-  * fixed address).  So we can't reference any symbols outside the entry
-  * trampoline and expect it to work.
-  *
-  * Instead, we carefully abuse %rip-relative addressing.
-  * _entry_trampoline(%rip) refers to the start of the remapped) entry
-  * trampoline.  We can thus find cpu_entry_area with this macro:
-  */
- #define CPU_ENTRY_AREA \
-       _entry_trampoline - CPU_ENTRY_AREA_entry_trampoline(%rip)
- /* The top word of the SYSENTER stack is hot and is usable as scratch space. */
- #define RSP_SCRATCH   CPU_ENTRY_AREA_entry_stack + \
-                       SIZEOF_entry_stack - 8 + CPU_ENTRY_AREA
- ENTRY(entry_SYSCALL_64_trampoline)
-       UNWIND_HINT_EMPTY
-       swapgs
-       /* Stash the user RSP. */
-       movq    %rsp, RSP_SCRATCH
-       /* Note: using %rsp as a scratch reg. */
-       SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp
-       /* Load the top of the task stack into RSP */
-       movq    CPU_ENTRY_AREA_tss + TSS_sp1 + CPU_ENTRY_AREA, %rsp
-       /* Start building the simulated IRET frame. */
-       pushq   $__USER_DS                      /* pt_regs->ss */
-       pushq   RSP_SCRATCH                     /* pt_regs->sp */
-       pushq   %r11                            /* pt_regs->flags */
-       pushq   $__USER_CS                      /* pt_regs->cs */
-       pushq   %rcx                            /* pt_regs->ip */
-       /*
-        * x86 lacks a near absolute jump, and we can't jump to the real
-        * entry text with a relative jump.  We could push the target
-        * address and then use retq, but this destroys the pipeline on
-        * many CPUs (wasting over 20 cycles on Sandy Bridge).  Instead,
-        * spill RDI and restore it in a second-stage trampoline.
-        */
-       pushq   %rdi
-       movq    $entry_SYSCALL_64_stage2, %rdi
-       JMP_NOSPEC %rdi
- END(entry_SYSCALL_64_trampoline)
-       .popsection
- ENTRY(entry_SYSCALL_64_stage2)
-       UNWIND_HINT_EMPTY
-       popq    %rdi
-       jmp     entry_SYSCALL_64_after_hwframe
- END(entry_SYSCALL_64_stage2)
  ENTRY(entry_SYSCALL_64)
        UNWIND_HINT_EMPTY
        /*
         */
  
        swapgs
-       /*
-        * This path is only taken when PAGE_TABLE_ISOLATION is disabled so it
-        * is not required to switch CR3.
-        */
-       movq    %rsp, PER_CPU_VAR(rsp_scratch)
+       /* tss.sp2 is scratch space. */
+       movq    %rsp, PER_CPU_VAR(cpu_tss_rw + TSS_sp2)
+       SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp
        movq    PER_CPU_VAR(cpu_current_top_of_stack), %rsp
  
        /* Construct struct pt_regs on stack */
-       pushq   $__USER_DS                      /* pt_regs->ss */
-       pushq   PER_CPU_VAR(rsp_scratch)        /* pt_regs->sp */
-       pushq   %r11                            /* pt_regs->flags */
-       pushq   $__USER_CS                      /* pt_regs->cs */
-       pushq   %rcx                            /* pt_regs->ip */
+       pushq   $__USER_DS                              /* pt_regs->ss */
+       pushq   PER_CPU_VAR(cpu_tss_rw + TSS_sp2)       /* pt_regs->sp */
+       pushq   %r11                                    /* pt_regs->flags */
+       pushq   $__USER_CS                              /* pt_regs->cs */
+       pushq   %rcx                                    /* pt_regs->ip */
  GLOBAL(entry_SYSCALL_64_after_hwframe)
-       pushq   %rax                            /* pt_regs->orig_ax */
+       pushq   %rax                                    /* pt_regs->orig_ax */
  
        PUSH_AND_CLEAR_REGS rax=$-ENOSYS
  
@@@ -900,6 -837,42 +837,42 @@@ apicinterrupt IRQ_WORK_VECTOR                    irq_wor
   */
  #define CPU_TSS_IST(x) PER_CPU_VAR(cpu_tss_rw) + (TSS_ist + ((x) - 1) * 8)
  
+ /**
+  * idtentry - Generate an IDT entry stub
+  * @sym:              Name of the generated entry point
+  * @do_sym:           C function to be called
+  * @has_error_code:   True if this IDT vector has an error code on the stack
+  * @paranoid:                 non-zero means that this vector may be invoked from
+  *                    kernel mode with user GSBASE and/or user CR3.
+  *                    2 is special -- see below.
+  * @shift_ist:                Set to an IST index if entries from kernel mode should
+  *                            decrement the IST stack so that nested entries get a
+  *                    fresh stack.  (This is for #DB, which has a nasty habit
+  *                            of recursing.)
+  *
+  * idtentry generates an IDT stub that sets up a usable kernel context,
+  * creates struct pt_regs, and calls @do_sym.  The stub has the following
+  * special behaviors:
+  *
+  * On an entry from user mode, the stub switches from the trampoline or
+  * IST stack to the normal thread stack.  On an exit to user mode, the
+  * normal exit-to-usermode path is invoked.
+  *
+  * On an exit to kernel mode, if @paranoid == 0, we check for preemption,
+  * whereas we omit the preemption check if @paranoid != 0.  This is purely
+  * because the implementation is simpler this way.  The kernel only needs
+  * to check for asynchronous kernel preemption when IRQ handlers return.
+  *
+  * If @paranoid == 0, then the stub will handle IRET faults by pretending
+  * that the fault came from user mode.  It will handle gs_change faults by
+  * pretending that the fault happened with kernel GSBASE.  Since this handling
+  * is omitted for @paranoid != 0, the #GP, #SS, and #NP stubs must have
+  * @paranoid == 0.  This special handling will do the wrong thing for
+  * espfix-induced #DF on IRET, so #DF must not use @paranoid == 0.
+  *
+  * @paranoid == 2 is special: the stub will never switch stacks.  This is for
+  * #DF: if the thread stack is somehow unusable, we'll still get a useful OOPS.
+  */
  .macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1
  ENTRY(\sym)
        UNWIND_HINT_IRET_REGS offset=\has_error_code*8
@@@ -1050,7 -1023,7 +1023,7 @@@ ENTRY(do_softirq_own_stack
        ret
  ENDPROC(do_softirq_own_stack)
  
 -#ifdef CONFIG_XEN
 +#ifdef CONFIG_XEN_PV
  idtentry hypervisor_callback xen_do_hypervisor_callback has_error_code=0
  
  /*
@@@ -1130,13 -1103,11 +1103,13 @@@ ENTRY(xen_failsafe_callback
        ENCODE_FRAME_POINTER
        jmp     error_exit
  END(xen_failsafe_callback)
 +#endif /* CONFIG_XEN_PV */
  
 +#ifdef CONFIG_XEN_PVHVM
  apicinterrupt3 HYPERVISOR_CALLBACK_VECTOR \
        xen_hvm_callback_vector xen_evtchn_do_upcall
 +#endif
  
 -#endif /* CONFIG_XEN */
  
  #if IS_ENABLED(CONFIG_HYPERV)
  apicinterrupt3 HYPERVISOR_CALLBACK_VECTOR \
@@@ -1153,7 -1124,7 +1126,7 @@@ idtentry debug                  do_debug                has_error_co
  idtentry int3                 do_int3                 has_error_code=0
  idtentry stack_segment                do_stack_segment        has_error_code=1
  
 -#ifdef CONFIG_XEN
 +#ifdef CONFIG_XEN_PV
  idtentry xennmi                       do_nmi                  has_error_code=0
  idtentry xendebug             do_debug                has_error_code=0
  idtentry xenint3              do_int3                 has_error_code=0
@@@ -1189,16 -1160,6 +1162,16 @@@ ENTRY(paranoid_entry
        xorl    %ebx, %ebx
  
  1:
 +      /*
 +       * Always stash CR3 in %r14.  This value will be restored,
 +       * verbatim, at exit.  Needed if paranoid_entry interrupted
 +       * another entry that already switched to the user CR3 value
 +       * but has not yet returned to userspace.
 +       *
 +       * This is also why CS (stashed in the "iret frame" by the
 +       * hardware at entry) can not be used: this may be a return
 +       * to kernel code, but with a user CR3 value.
 +       */
        SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg=%rax save_reg=%r14
  
        ret
@@@ -1223,13 -1184,11 +1196,13 @@@ ENTRY(paranoid_exit
        testl   %ebx, %ebx                      /* swapgs needed? */
        jnz     .Lparanoid_exit_no_swapgs
        TRACE_IRQS_IRETQ
 +      /* Always restore stashed CR3 value (see paranoid_entry) */
        RESTORE_CR3     scratch_reg=%rbx save_reg=%r14
        SWAPGS_UNSAFE_STACK
        jmp     .Lparanoid_exit_restore
  .Lparanoid_exit_no_swapgs:
        TRACE_IRQS_IRETQ_DEBUG
 +      /* Always restore stashed CR3 value (see paranoid_entry) */
        RESTORE_CR3     scratch_reg=%rbx save_reg=%r14
  .Lparanoid_exit_restore:
        jmp restore_regs_and_return_to_kernel
@@@ -1640,7 -1599,6 +1613,7 @@@ end_repeat_nmi
        movq    $-1, %rsi
        call    do_nmi
  
 +      /* Always restore stashed CR3 value (see paranoid_entry) */
        RESTORE_CR3 scratch_reg=%r15 save_reg=%r14
  
        testl   %ebx, %ebx                      /* swapgs needed? */
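
For orientation: with the trampoline gone, the SYSCALL path above stashes the user RSP in the unused tss.sp2 slot and then hand-builds the hardware-style IRET frame plus orig_ax at the top of struct pt_regs.  A simplified stand-in for that frame layout, for illustration only (the real definition is struct pt_regs in arch/x86/include/asm/ptrace.h):

    /* Memory order, lowest address first; the last pushq lands lowest. */
    struct syscall_hw_frame {             /* illustrative name, not kernel code */
            unsigned long orig_ax;        /* pushq %rax: syscall number            */
            unsigned long ip;             /* pushq %rcx: user RIP saved by SYSCALL */
            unsigned long cs;             /* pushq $__USER_CS                      */
            unsigned long flags;          /* pushq %r11: user RFLAGS               */
            unsigned long sp;             /* pushq cpu_tss_rw + TSS_sp2: user RSP  */
            unsigned long ss;             /* pushq $__USER_DS                      */
    };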
diff --combined arch/x86/include/asm/processor.h
index c7a4e2a174b9cd15422296bc72efcb93560555eb,b2bb1d691efc54139d9635b42a11df53de4b035d..617805981cce159b111ea0afb6b1222b3cf50f60
@@@ -155,8 -155,7 +155,8 @@@ enum cpuid_regs_idx 
  #define X86_VENDOR_CENTAUR    5
  #define X86_VENDOR_TRANSMETA  7
  #define X86_VENDOR_NSC                8
 -#define X86_VENDOR_NUM                9
 +#define X86_VENDOR_HYGON      9
 +#define X86_VENDOR_NUM                10
  
  #define X86_VENDOR_UNKNOWN    0xff
  
@@@ -316,7 -315,13 +316,13 @@@ struct x86_hw_tss 
         */
        u64                     sp1;
  
+       /*
+        * Since Linux does not use ring 2, the 'sp2' slot is unused by
+        * hardware.  entry_SYSCALL_64 uses it as scratch space to stash
+        * the user RSP value.
+        */
        u64                     sp2;
        u64                     reserved2;
        u64                     ist[7];
        u32                     reserved3;
@@@ -579,7 -584,7 +585,7 @@@ static inline bool on_thread_stack(void
                               current_stack_pointer) < THREAD_SIZE;
  }
  
 -#ifdef CONFIG_PARAVIRT
 +#ifdef CONFIG_PARAVIRT_XXL
  #include <asm/paravirt.h>
  #else
  #define __cpuid                       native_cpuid
@@@ -590,7 -595,7 +596,7 @@@ static inline void load_sp0(unsigned lo
  }
  
  #define set_iopl_mask native_set_iopl_mask
 -#endif /* CONFIG_PARAVIRT */
 +#endif /* CONFIG_PARAVIRT_XXL */
  
  /* Free all resources held by a thread. */
  extern void release_thread(struct task_struct *);
diff --combined arch/x86/kernel/asm-offsets.c
index fc02c3cf238f41891b0fd2d0d1aa8deaad016d62,083c01309027e37e061cbee024b030c4ca554874..72adf6c335dca2e2db24c2560919d4807692d936
@@@ -64,12 -64,15 +64,12 @@@ void common(void) 
        OFFSET(IA32_RT_SIGFRAME_sigcontext, rt_sigframe_ia32, uc.uc_mcontext);
  #endif
  
 -#ifdef CONFIG_PARAVIRT
 +#ifdef CONFIG_PARAVIRT_XXL
        BLANK();
 -      OFFSET(PARAVIRT_PATCH_pv_cpu_ops, paravirt_patch_template, pv_cpu_ops);
 -      OFFSET(PARAVIRT_PATCH_pv_irq_ops, paravirt_patch_template, pv_irq_ops);
 -      OFFSET(PV_IRQ_irq_disable, pv_irq_ops, irq_disable);
 -      OFFSET(PV_IRQ_irq_enable, pv_irq_ops, irq_enable);
 -      OFFSET(PV_CPU_iret, pv_cpu_ops, iret);
 -      OFFSET(PV_CPU_read_cr0, pv_cpu_ops, read_cr0);
 -      OFFSET(PV_MMU_read_cr2, pv_mmu_ops, read_cr2);
 +      OFFSET(PV_IRQ_irq_disable, paravirt_patch_template, irq.irq_disable);
 +      OFFSET(PV_IRQ_irq_enable, paravirt_patch_template, irq.irq_enable);
 +      OFFSET(PV_CPU_iret, paravirt_patch_template, cpu.iret);
 +      OFFSET(PV_MMU_read_cr2, paravirt_patch_template, mmu.read_cr2);
  #endif
  
  #ifdef CONFIG_XEN
        OFFSET(TLB_STATE_user_pcid_flush_mask, tlb_state, user_pcid_flush_mask);
  
        /* Layout info for cpu_entry_area */
-       OFFSET(CPU_ENTRY_AREA_tss, cpu_entry_area, tss);
-       OFFSET(CPU_ENTRY_AREA_entry_trampoline, cpu_entry_area, entry_trampoline);
        OFFSET(CPU_ENTRY_AREA_entry_stack, cpu_entry_area, entry_stack_page);
        DEFINE(SIZEOF_entry_stack, sizeof(struct entry_stack));
        DEFINE(MASK_entry_stack, (~(sizeof(struct entry_stack) - 1)));
  
-       /* Offset for sp0 and sp1 into the tss_struct */
+       /* Offset for fields in tss_struct */
        OFFSET(TSS_sp0, tss_struct, x86_tss.sp0);
        OFFSET(TSS_sp1, tss_struct, x86_tss.sp1);
+       OFFSET(TSS_sp2, tss_struct, x86_tss.sp2);
  }
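
The TSS_sp2 constant used by the entry code is produced by this asm-offsets machinery: OFFSET() ultimately emits an offsetof() value into asm-offsets.h so the assembler can address the field as cpu_tss_rw + TSS_sp2.  A minimal sketch of the idea, using stand-in structs rather than the real tss_struct (which needs kernel headers); names and layout here are illustrative only:

    #include <stdio.h>
    #include <stddef.h>

    /* Stand-in for struct x86_hw_tss / tss_struct, illustration only. */
    struct hw_tss {
            unsigned int  reserved1;
            unsigned long sp0, sp1, sp2;    /* sp2: SYSCALL scratch slot */
    };

    struct tss {
            struct hw_tss x86_tss;
            /* ... I/O bitmap etc. in the real struct ... */
    };

    int main(void)
    {
            /* Equivalent of OFFSET(TSS_sp2, tss_struct, x86_tss.sp2). */
            printf("#define TSS_sp2 %zu\n", offsetof(struct tss, x86_tss.sp2));
            return 0;
    }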
diff --combined arch/x86/kernel/cpu/bugs.c
index b810cc23937515430c8b24f51a4831051039e383,fe32103fcdc7ea4ca97847b37c84f1c67be7ec31..c37e66e493bff6775fae88d00c1f991d97f8f908
@@@ -35,12 -35,10 +35,10 @@@ static void __init spectre_v2_select_mi
  static void __init ssb_select_mitigation(void);
  static void __init l1tf_select_mitigation(void);
  
- /*
-  * Our boot-time value of the SPEC_CTRL MSR. We read it once so that any
-  * writes to SPEC_CTRL contain whatever reserved bits have been set.
-  */
- u64 __ro_after_init x86_spec_ctrl_base;
+ /* The base value of the SPEC_CTRL MSR that always has to be preserved. */
+ u64 x86_spec_ctrl_base;
  EXPORT_SYMBOL_GPL(x86_spec_ctrl_base);
+ static DEFINE_MUTEX(spec_ctrl_mutex);
  
  /*
   * The vendor and possibly platform specific bits which can be modified in
@@@ -312,7 -310,6 +310,7 @@@ static enum spectre_v2_mitigation_cmd _
        }
  
        if (cmd == SPECTRE_V2_CMD_RETPOLINE_AMD &&
 +          boot_cpu_data.x86_vendor != X86_VENDOR_HYGON &&
            boot_cpu_data.x86_vendor != X86_VENDOR_AMD) {
                pr_err("retpoline,amd selected but CPU is not AMD. Switching to AUTO select\n");
                return SPECTRE_V2_CMD_AUTO;
        return cmd;
  }
  
+ static bool stibp_needed(void)
+ {
+       if (spectre_v2_enabled == SPECTRE_V2_NONE)
+               return false;
+       if (!boot_cpu_has(X86_FEATURE_STIBP))
+               return false;
+       return true;
+ }
+ static void update_stibp_msr(void *info)
+ {
+       wrmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base);
+ }
+ void arch_smt_update(void)
+ {
+       u64 mask;
+       if (!stibp_needed())
+               return;
+       mutex_lock(&spec_ctrl_mutex);
+       mask = x86_spec_ctrl_base;
+       if (cpu_smt_control == CPU_SMT_ENABLED)
+               mask |= SPEC_CTRL_STIBP;
+       else
+               mask &= ~SPEC_CTRL_STIBP;
+       if (mask != x86_spec_ctrl_base) {
+               pr_info("Spectre v2 cross-process SMT mitigation: %s STIBP\n",
+                               cpu_smt_control == CPU_SMT_ENABLED ?
+                               "Enabling" : "Disabling");
+               x86_spec_ctrl_base = mask;
+               on_each_cpu(update_stibp_msr, NULL, 1);
+       }
+       mutex_unlock(&spec_ctrl_mutex);
+ }
  static void __init spectre_v2_select_mitigation(void)
  {
        enum spectre_v2_mitigation_cmd cmd = spectre_v2_parse_cmdline();
        return;
  
  retpoline_auto:
 -      if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) {
 +      if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD ||
 +          boot_cpu_data.x86_vendor == X86_VENDOR_HYGON) {
        retpoline_amd:
                if (!boot_cpu_has(X86_FEATURE_LFENCE_RDTSC)) {
                        pr_err("Spectre mitigation: LFENCE not serializing, switching to generic retpoline\n");
@@@ -426,6 -462,9 +464,9 @@@ specv2_set_mode
                setup_force_cpu_cap(X86_FEATURE_USE_IBRS_FW);
                pr_info("Enabling Restricted Speculation for firmware calls\n");
        }
+       /* Enable STIBP if appropriate */
+       arch_smt_update();
  }
  
  #undef pr_fmt
@@@ -816,6 -855,8 +857,8 @@@ static ssize_t l1tf_show_state(char *bu
  static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr,
                               char *buf, unsigned int bug)
  {
+       int ret;
        if (!boot_cpu_has_bug(bug))
                return sprintf(buf, "Not affected\n");
  
                return sprintf(buf, "Mitigation: __user pointer sanitization\n");
  
        case X86_BUG_SPECTRE_V2:
-               return sprintf(buf, "%s%s%s%s\n", spectre_v2_strings[spectre_v2_enabled],
+               ret = sprintf(buf, "%s%s%s%s%s%s\n", spectre_v2_strings[spectre_v2_enabled],
                               boot_cpu_has(X86_FEATURE_USE_IBPB) ? ", IBPB" : "",
                               boot_cpu_has(X86_FEATURE_USE_IBRS_FW) ? ", IBRS_FW" : "",
+                              (x86_spec_ctrl_base & SPEC_CTRL_STIBP) ? ", STIBP" : "",
+                              boot_cpu_has(X86_FEATURE_RSB_CTXSW) ? ", RSB filling" : "",
                               spectre_v2_module_string());
+               return ret;
  
        case X86_BUG_SPEC_STORE_BYPASS:
                return sprintf(buf, "%s\n", ssb_strings[ssb_mode]);
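
The STIBP toggling done by arch_smt_update() above ends up as bit 1 of the IA32_SPEC_CTRL MSR on every CPU.  A rough way to observe it from userspace via the msr driver (assumes MSR 0x48 with STIBP in bit 1, the msr module loaded, and root; not part of this series):

    #include <stdio.h>
    #include <stdint.h>
    #include <fcntl.h>
    #include <unistd.h>

    #define MSR_IA32_SPEC_CTRL      0x48
    #define SPEC_CTRL_STIBP         (1ULL << 1)

    int main(void)
    {
            uint64_t val;
            int fd = open("/dev/cpu/0/msr", O_RDONLY);

            if (fd < 0 || pread(fd, &val, sizeof(val), MSR_IA32_SPEC_CTRL) != sizeof(val)) {
                    perror("msr");      /* no msr module, no permission, or no SPEC_CTRL */
                    return 1;
            }
            printf("SPEC_CTRL=0x%llx STIBP %s\n", (unsigned long long)val,
                   (val & SPEC_CTRL_STIBP) ? "set" : "clear");
            close(fd);
            return 0;
    }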
diff --combined arch/x86/kernel/cpu/common.c
index 9315a16606682fa221c88ecb6d5753655dd63ad5,8bffeae9bac25ac520660eaea492196c34c1b29e..660d0b22e962e83b1026743d37f1194d95ed46e0
@@@ -949,11 -949,11 +949,11 @@@ static void identify_cpu_without_cpuid(
  }
  
  static const __initconst struct x86_cpu_id cpu_no_speculation[] = {
 -      { X86_VENDOR_INTEL,     6, INTEL_FAM6_ATOM_CEDARVIEW,   X86_FEATURE_ANY },
 -      { X86_VENDOR_INTEL,     6, INTEL_FAM6_ATOM_CLOVERVIEW,  X86_FEATURE_ANY },
 -      { X86_VENDOR_INTEL,     6, INTEL_FAM6_ATOM_LINCROFT,    X86_FEATURE_ANY },
 -      { X86_VENDOR_INTEL,     6, INTEL_FAM6_ATOM_PENWELL,     X86_FEATURE_ANY },
 -      { X86_VENDOR_INTEL,     6, INTEL_FAM6_ATOM_PINEVIEW,    X86_FEATURE_ANY },
 +      { X86_VENDOR_INTEL,     6, INTEL_FAM6_ATOM_SALTWELL,    X86_FEATURE_ANY },
 +      { X86_VENDOR_INTEL,     6, INTEL_FAM6_ATOM_SALTWELL_TABLET,     X86_FEATURE_ANY },
 +      { X86_VENDOR_INTEL,     6, INTEL_FAM6_ATOM_BONNELL_MID, X86_FEATURE_ANY },
 +      { X86_VENDOR_INTEL,     6, INTEL_FAM6_ATOM_SALTWELL_MID,        X86_FEATURE_ANY },
 +      { X86_VENDOR_INTEL,     6, INTEL_FAM6_ATOM_BONNELL,     X86_FEATURE_ANY },
        { X86_VENDOR_CENTAUR,   5 },
        { X86_VENDOR_INTEL,     5 },
        { X86_VENDOR_NSC,       5 },
  
  static const __initconst struct x86_cpu_id cpu_no_meltdown[] = {
        { X86_VENDOR_AMD },
 +      { X86_VENDOR_HYGON },
        {}
  };
  
  /* Only list CPUs which speculate but are non susceptible to SSB */
  static const __initconst struct x86_cpu_id cpu_no_spec_store_bypass[] = {
 -      { X86_VENDOR_INTEL,     6,      INTEL_FAM6_ATOM_SILVERMONT1     },
 +      { X86_VENDOR_INTEL,     6,      INTEL_FAM6_ATOM_SILVERMONT      },
        { X86_VENDOR_INTEL,     6,      INTEL_FAM6_ATOM_AIRMONT         },
 -      { X86_VENDOR_INTEL,     6,      INTEL_FAM6_ATOM_SILVERMONT    },
 -      { X86_VENDOR_INTEL,     6,      INTEL_FAM6_ATOM_MERRIFIELD      },
 +      { X86_VENDOR_INTEL,     6,      INTEL_FAM6_ATOM_SILVERMONT_X    },
 +      { X86_VENDOR_INTEL,     6,      INTEL_FAM6_ATOM_SILVERMONT_MID  },
        { X86_VENDOR_INTEL,     6,      INTEL_FAM6_CORE_YONAH           },
        { X86_VENDOR_INTEL,     6,      INTEL_FAM6_XEON_PHI_KNL         },
        { X86_VENDOR_INTEL,     6,      INTEL_FAM6_XEON_PHI_KNM         },
  
  static const __initconst struct x86_cpu_id cpu_no_l1tf[] = {
        /* in addition to cpu_no_speculation */
 -      { X86_VENDOR_INTEL,     6,      INTEL_FAM6_ATOM_SILVERMONT1     },
 -      { X86_VENDOR_INTEL,     6,      INTEL_FAM6_ATOM_SILVERMONT    },
 +      { X86_VENDOR_INTEL,     6,      INTEL_FAM6_ATOM_SILVERMONT      },
 +      { X86_VENDOR_INTEL,     6,      INTEL_FAM6_ATOM_SILVERMONT_X    },
        { X86_VENDOR_INTEL,     6,      INTEL_FAM6_ATOM_AIRMONT         },
 -      { X86_VENDOR_INTEL,     6,      INTEL_FAM6_ATOM_MERRIFIELD      },
 -      { X86_VENDOR_INTEL,     6,      INTEL_FAM6_ATOM_MOOREFIELD      },
 +      { X86_VENDOR_INTEL,     6,      INTEL_FAM6_ATOM_SILVERMONT_MID  },
 +      { X86_VENDOR_INTEL,     6,      INTEL_FAM6_ATOM_AIRMONT_MID     },
        { X86_VENDOR_INTEL,     6,      INTEL_FAM6_ATOM_GOLDMONT        },
 -      { X86_VENDOR_INTEL,     6,      INTEL_FAM6_ATOM_DENVERTON       },
 -      { X86_VENDOR_INTEL,     6,      INTEL_FAM6_ATOM_GEMINI_LAKE     },
 +      { X86_VENDOR_INTEL,     6,      INTEL_FAM6_ATOM_GOLDMONT_X      },
 +      { X86_VENDOR_INTEL,     6,      INTEL_FAM6_ATOM_GOLDMONT_PLUS   },
        { X86_VENDOR_INTEL,     6,      INTEL_FAM6_XEON_PHI_KNL         },
        { X86_VENDOR_INTEL,     6,      INTEL_FAM6_XEON_PHI_KNM         },
        {}
@@@ -1077,9 -1076,6 +1077,9 @@@ static void __init early_identify_cpu(s
        memset(&c->x86_capability, 0, sizeof c->x86_capability);
        c->extended_cpuid_level = 0;
  
 +      if (!have_cpuid_p())
 +              identify_cpu_without_cpuid(c);
 +
        /* cyrix could have cpuid enabled via c_identify()*/
        if (have_cpuid_p()) {
                cpu_detect(c);
                if (this_cpu->c_bsp_init)
                        this_cpu->c_bsp_init(c);
        } else {
 -              identify_cpu_without_cpuid(c);
                setup_clear_cpu_cap(X86_FEATURE_CPUID);
        }
  
@@@ -1243,10 -1240,10 +1243,10 @@@ static void generic_identify(struct cpu
         * ESPFIX issue, we can change this.
         */
  #ifdef CONFIG_X86_32
 -# ifdef CONFIG_PARAVIRT
 +# ifdef CONFIG_PARAVIRT_XXL
        do {
                extern void native_iret(void);
 -              if (pv_cpu_ops.iret == native_iret)
 +              if (pv_ops.cpu.iret == native_iret)
                        set_cpu_bug(c, X86_BUG_ESPFIX);
        } while (0);
  # else
@@@ -1534,19 -1531,8 +1534,8 @@@ EXPORT_PER_CPU_SYMBOL(__preempt_count)
  /* May not be marked __init: used by software suspend */
  void syscall_init(void)
  {
-       extern char _entry_trampoline[];
-       extern char entry_SYSCALL_64_trampoline[];
-       int cpu = smp_processor_id();
-       unsigned long SYSCALL64_entry_trampoline =
-               (unsigned long)get_cpu_entry_area(cpu)->entry_trampoline +
-               (entry_SYSCALL_64_trampoline - _entry_trampoline);
        wrmsr(MSR_STAR, 0, (__USER32_CS << 16) | __KERNEL_CS);
-       if (static_cpu_has(X86_FEATURE_PTI))
-               wrmsrl(MSR_LSTAR, SYSCALL64_entry_trampoline);
-       else
-               wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64);
+       wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64);
  
  #ifdef CONFIG_IA32_EMULATION
        wrmsrl(MSR_CSTAR, (unsigned long)entry_SYSCALL_compat);
         * AMD doesn't allow SYSENTER in long mode (either 32- or 64-bit).
         */
        wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS);
-       wrmsrl_safe(MSR_IA32_SYSENTER_ESP, (unsigned long)(cpu_entry_stack(cpu) + 1));
+       wrmsrl_safe(MSR_IA32_SYSENTER_ESP,
+                   (unsigned long)(cpu_entry_stack(smp_processor_id()) + 1));
        wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)entry_SYSENTER_compat);
  #else
        wrmsrl(MSR_CSTAR, (unsigned long)ignore_sysret);
@@@ -1672,29 -1659,6 +1662,29 @@@ static void wait_for_master_cpu(int cpu
  #endif
  }
  
 +#ifdef CONFIG_X86_64
 +static void setup_getcpu(int cpu)
 +{
 +      unsigned long cpudata = vdso_encode_cpunode(cpu, early_cpu_to_node(cpu));
 +      struct desc_struct d = { };
 +
 +      if (static_cpu_has(X86_FEATURE_RDTSCP))
 +              write_rdtscp_aux(cpudata);
 +
 +      /* Store CPU and node number in limit. */
 +      d.limit0 = cpudata;
 +      d.limit1 = cpudata >> 16;
 +
 +      d.type = 5;             /* RO data, expand down, accessed */
 +      d.dpl = 3;              /* Visible to user code */
 +      d.s = 1;                /* Not a system segment */
 +      d.p = 1;                /* Present */
 +      d.d = 1;                /* 32-bit */
 +
 +      write_gdt_entry(get_cpu_gdt_rw(cpu), GDT_ENTRY_CPUNODE, &d, DESCTYPE_S);
 +}
 +#endif
 +
  /*
   * cpu_init() initializes state that is per-CPU. Some data is already
   * initialized (naturally) in the bootstrap process, such as the GDT
@@@ -1732,7 -1696,6 +1722,7 @@@ void cpu_init(void
            early_cpu_to_node(cpu) != NUMA_NO_NODE)
                set_numa_node(early_cpu_to_node(cpu));
  #endif
 +      setup_getcpu(cpu);
  
        me = current;
  
diff --combined arch/x86/kernel/kprobes/core.c
index f72a47b602e208ce3c18c24f1f215e4e2a465674,f802cf5b447885ca4636727c40c2f78784c832b6..c33b06f5faa4079bb87392d42dd232ad6a2fe5a1
@@@ -1020,26 -1020,56 +1020,18 @@@ int kprobe_fault_handler(struct pt_reg
                 */
                if (cur->fault_handler && cur->fault_handler(cur, regs, trapnr))
                        return 1;
 -
 -              /*
 -               * In case the user-specified fault handler returned
 -               * zero, try to fix up.
 -               */
 -              if (fixup_exception(regs, trapnr))
 -                      return 1;
 -
 -              /*
 -               * fixup routine could not handle it,
 -               * Let do_page_fault() fix it.
 -               */
        }
  
        return 0;
  }
  NOKPROBE_SYMBOL(kprobe_fault_handler);
  
 -/*
 - * Wrapper routine for handling exceptions.
 - */
 -int kprobe_exceptions_notify(struct notifier_block *self, unsigned long val,
 -                           void *data)
 -{
 -      struct die_args *args = data;
 -      int ret = NOTIFY_DONE;
 -
 -      if (args->regs && user_mode(args->regs))
 -              return ret;
 -
 -      if (val == DIE_GPF) {
 -              /*
 -               * To be potentially processing a kprobe fault and to
 -               * trust the result from kprobe_running(), we have
 -               * be non-preemptible.
 -               */
 -              if (!preemptible() && kprobe_running() &&
 -                  kprobe_fault_handler(args->regs, args->trapnr))
 -                      ret = NOTIFY_STOP;
 -      }
 -      return ret;
 -}
 -NOKPROBE_SYMBOL(kprobe_exceptions_notify);
 -
  bool arch_within_kprobe_blacklist(unsigned long addr)
  {
-       bool is_in_entry_trampoline_section = false;
- #ifdef CONFIG_X86_64
-       is_in_entry_trampoline_section =
-               (addr >= (unsigned long)__entry_trampoline_start &&
-                addr < (unsigned long)__entry_trampoline_end);
- #endif
        return  (addr >= (unsigned long)__kprobes_text_start &&
                 addr < (unsigned long)__kprobes_text_end) ||
                (addr >= (unsigned long)__entry_text_start &&
-                addr < (unsigned long)__entry_text_end) ||
-               is_in_entry_trampoline_section;
+                addr < (unsigned long)__entry_text_end);
  }
  
  int __init arch_init_kprobes(void)
diff --combined arch/x86/kernel/process_64.c
index d6674a425714b653def5c7c10628af9e723a6745,0fa7aa19f09e00696f38d19e156107760c2bb2cb..31b4755369f084575f6b3a0ec30b340392106f70
  #include <asm/vdso.h>
  #include <asm/intel_rdt_sched.h>
  #include <asm/unistd.h>
 +#include <asm/fsgsbase.h>
  #ifdef CONFIG_IA32_EMULATION
  /* Not included via unistd.h */
  #include <asm/unistd_32_ia32.h>
  #endif
  
- __visible DEFINE_PER_CPU(unsigned long, rsp_scratch);
  /* Prints also some state that isn't saved in the pt_regs */
 -void __show_regs(struct pt_regs *regs, int all)
 +void __show_regs(struct pt_regs *regs, enum show_regs_mode mode)
  {
        unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
        unsigned long d0, d1, d2, d3, d6, d7;
        printk(KERN_DEFAULT "R13: %016lx R14: %016lx R15: %016lx\n",
               regs->r13, regs->r14, regs->r15);
  
 -      if (!all)
 +      if (mode == SHOW_REGS_SHORT)
                return;
  
 +      if (mode == SHOW_REGS_USER) {
 +              rdmsrl(MSR_FS_BASE, fs);
 +              rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);
 +              printk(KERN_DEFAULT "FS:  %016lx GS:  %016lx\n",
 +                     fs, shadowgs);
 +              return;
 +      }
 +
        asm("movl %%ds,%0" : "=r" (ds));
        asm("movl %%cs,%0" : "=r" (cs));
        asm("movl %%es,%0" : "=r" (es));
@@@ -287,138 -276,6 +285,138 @@@ static __always_inline void load_seg_le
        }
  }
  
 +static __always_inline void x86_fsgsbase_load(struct thread_struct *prev,
 +                                            struct thread_struct *next)
 +{
 +      load_seg_legacy(prev->fsindex, prev->fsbase,
 +                      next->fsindex, next->fsbase, FS);
 +      load_seg_legacy(prev->gsindex, prev->gsbase,
 +                      next->gsindex, next->gsbase, GS);
 +}
 +
 +static unsigned long x86_fsgsbase_read_task(struct task_struct *task,
 +                                          unsigned short selector)
 +{
 +      unsigned short idx = selector >> 3;
 +      unsigned long base;
 +
 +      if (likely((selector & SEGMENT_TI_MASK) == 0)) {
 +              if (unlikely(idx >= GDT_ENTRIES))
 +                      return 0;
 +
 +              /*
 +               * There are no user segments in the GDT with nonzero bases
 +               * other than the TLS segments.
 +               */
 +              if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
 +                      return 0;
 +
 +              idx -= GDT_ENTRY_TLS_MIN;
 +              base = get_desc_base(&task->thread.tls_array[idx]);
 +      } else {
 +#ifdef CONFIG_MODIFY_LDT_SYSCALL
 +              struct ldt_struct *ldt;
 +
 +              /*
 +               * If performance here mattered, we could protect the LDT
 +               * with RCU.  This is a slow path, though, so we can just
 +               * take the mutex.
 +               */
 +              mutex_lock(&task->mm->context.lock);
 +              ldt = task->mm->context.ldt;
 +              if (unlikely(idx >= ldt->nr_entries))
 +                      base = 0;
 +              else
 +                      base = get_desc_base(ldt->entries + idx);
 +              mutex_unlock(&task->mm->context.lock);
 +#else
 +              base = 0;
 +#endif
 +      }
 +
 +      return base;
 +}
 +
 +void x86_fsbase_write_cpu(unsigned long fsbase)
 +{
 +      /*
 +       * Set the selector to 0 as a notion, that the segment base is
 +       * overwritten, which will be checked for skipping the segment load
 +       * during context switch.
 +       */
 +      loadseg(FS, 0);
 +      wrmsrl(MSR_FS_BASE, fsbase);
 +}
 +
 +void x86_gsbase_write_cpu_inactive(unsigned long gsbase)
 +{
 +      /* Set the selector to 0 for the same reason as %fs above. */
 +      loadseg(GS, 0);
 +      wrmsrl(MSR_KERNEL_GS_BASE, gsbase);
 +}
 +
 +unsigned long x86_fsbase_read_task(struct task_struct *task)
 +{
 +      unsigned long fsbase;
 +
 +      if (task == current)
 +              fsbase = x86_fsbase_read_cpu();
 +      else if (task->thread.fsindex == 0)
 +              fsbase = task->thread.fsbase;
 +      else
 +              fsbase = x86_fsgsbase_read_task(task, task->thread.fsindex);
 +
 +      return fsbase;
 +}
 +
 +unsigned long x86_gsbase_read_task(struct task_struct *task)
 +{
 +      unsigned long gsbase;
 +
 +      if (task == current)
 +              gsbase = x86_gsbase_read_cpu_inactive();
 +      else if (task->thread.gsindex == 0)
 +              gsbase = task->thread.gsbase;
 +      else
 +              gsbase = x86_fsgsbase_read_task(task, task->thread.gsindex);
 +
 +      return gsbase;
 +}
 +
 +int x86_fsbase_write_task(struct task_struct *task, unsigned long fsbase)
 +{
 +      /*
 +       * Not strictly needed for %fs, but do it for symmetry
 +       * with %gs
 +       */
 +      if (unlikely(fsbase >= TASK_SIZE_MAX))
 +              return -EPERM;
 +
 +      preempt_disable();
 +      task->thread.fsbase = fsbase;
 +      if (task == current)
 +              x86_fsbase_write_cpu(fsbase);
 +      task->thread.fsindex = 0;
 +      preempt_enable();
 +
 +      return 0;
 +}
 +
 +int x86_gsbase_write_task(struct task_struct *task, unsigned long gsbase)
 +{
 +      if (unlikely(gsbase >= TASK_SIZE_MAX))
 +              return -EPERM;
 +
 +      preempt_disable();
 +      task->thread.gsbase = gsbase;
 +      if (task == current)
 +              x86_gsbase_write_cpu_inactive(gsbase);
 +      task->thread.gsindex = 0;
 +      preempt_enable();
 +
 +      return 0;
 +}
 +
  int copy_thread_tls(unsigned long clone_flags, unsigned long sp,
                unsigned long arg, struct task_struct *p, unsigned long tls)
  {
@@@ -606,7 -463,10 +604,7 @@@ __switch_to(struct task_struct *prev_p
        if (unlikely(next->ds | prev->ds))
                loadsegment(ds, next->ds);
  
 -      load_seg_legacy(prev->fsindex, prev->fsbase,
 -                      next->fsindex, next->fsbase, FS);
 -      load_seg_legacy(prev->gsindex, prev->gsbase,
 -                      next->gsindex, next->gsbase, GS);
 +      x86_fsgsbase_load(prev, next);
  
        switch_fpu_finish(next_fpu, cpu);
  
@@@ -757,25 -617,54 +755,25 @@@ static long prctl_map_vdso(const struc
  long do_arch_prctl_64(struct task_struct *task, int option, unsigned long arg2)
  {
        int ret = 0;
 -      int doit = task == current;
 -      int cpu;
  
        switch (option) {
 -      case ARCH_SET_GS:
 -              if (arg2 >= TASK_SIZE_MAX)
 -                      return -EPERM;
 -              cpu = get_cpu();
 -              task->thread.gsindex = 0;
 -              task->thread.gsbase = arg2;
 -              if (doit) {
 -                      load_gs_index(0);
 -                      ret = wrmsrl_safe(MSR_KERNEL_GS_BASE, arg2);
 -              }
 -              put_cpu();
 +      case ARCH_SET_GS: {
 +              ret = x86_gsbase_write_task(task, arg2);
                break;
 -      case ARCH_SET_FS:
 -              /* Not strictly needed for fs, but do it for symmetry
 -                 with gs */
 -              if (arg2 >= TASK_SIZE_MAX)
 -                      return -EPERM;
 -              cpu = get_cpu();
 -              task->thread.fsindex = 0;
 -              task->thread.fsbase = arg2;
 -              if (doit) {
 -                      /* set the selector to 0 to not confuse __switch_to */
 -                      loadsegment(fs, 0);
 -                      ret = wrmsrl_safe(MSR_FS_BASE, arg2);
 -              }
 -              put_cpu();
 +      }
 +      case ARCH_SET_FS: {
 +              ret = x86_fsbase_write_task(task, arg2);
                break;
 +      }
        case ARCH_GET_FS: {
 -              unsigned long base;
 +              unsigned long base = x86_fsbase_read_task(task);
  
 -              if (doit)
 -                      rdmsrl(MSR_FS_BASE, base);
 -              else
 -                      base = task->thread.fsbase;
                ret = put_user(base, (unsigned long __user *)arg2);
                break;
        }
        case ARCH_GET_GS: {
 -              unsigned long base;
 +              unsigned long base = x86_gsbase_read_task(task);
  
 -              if (doit)
 -                      rdmsrl(MSR_KERNEL_GS_BASE, base);
 -              else
 -                      base = task->thread.gsbase;
                ret = put_user(base, (unsigned long __user *)arg2);
                break;
        }
diff --combined arch/x86/kernel/traps.c
index 16c95cb904964ff45ba2bf077df266c893e3fa57,1a90821c0b7421e29864bd9088501d64fe165917..5bd0a997d81e28f1ab0ce859dfb2415128602909
@@@ -206,7 -206,7 +206,7 @@@ do_trap_no_signal(struct task_struct *t
        }
  
        if (!user_mode(regs)) {
 -              if (fixup_exception(regs, trapnr))
 +              if (fixup_exception(regs, trapnr, error_code, 0))
                        return 0;
  
                tsk->thread.error_code = error_code;
@@@ -383,6 -383,10 +383,10 @@@ dotraplinkage void do_double_fault(stru
                 * we won't enable interupts or schedule before we invoke
                 * general_protection, so nothing will clobber the stack
                 * frame we just set up.
+                *
+                * We will enter general_protection with kernel GSBASE,
+                * which is what the stub expects, given that the faulting
+                * RIP will be the IRET instruction.
                 */
                regs->ip = (unsigned long)general_protection;
                regs->sp = (unsigned long)&gpregs->orig_ax;
@@@ -551,21 -555,11 +555,21 @@@ do_general_protection(struct pt_regs *r
  
        tsk = current;
        if (!user_mode(regs)) {
 -              if (fixup_exception(regs, X86_TRAP_GP))
 +              if (fixup_exception(regs, X86_TRAP_GP, error_code, 0))
                        return;
  
                tsk->thread.error_code = error_code;
                tsk->thread.trap_nr = X86_TRAP_GP;
 +
 +              /*
 +               * To be potentially processing a kprobe fault and to
 +               * trust the result from kprobe_running(), we have to
 +               * be non-preemptible.
 +               */
 +              if (!preemptible() && kprobe_running() &&
 +                  kprobe_fault_handler(regs, X86_TRAP_GP))
 +                      return;
 +
                if (notify_die(DIE_GPF, "general protection fault", regs, error_code,
                               X86_TRAP_GP, SIGSEGV) != NOTIFY_STOP)
                        die("general protection fault", regs, error_code);
@@@ -848,7 -842,7 +852,7 @@@ static void math_error(struct pt_regs *
        cond_local_irq_enable(regs);
  
        if (!user_mode(regs)) {
 -              if (fixup_exception(regs, trapnr))
 +              if (fixup_exception(regs, trapnr, error_code, 0))
                        return;
  
                task->thread.error_code = error_code;
diff --combined arch/x86/kernel/vmlinux.lds.S
index 5dd3317d761f4065b0fc8f7cdcac4e08be600f0e,9c77d2df9c2725399ee65c5947535c72164a4349..0d618ee634ac40cbdd35957fcbe2f17f13446a6f
@@@ -65,23 -65,6 +65,23 @@@ jiffies_64 = jiffies
  #define ALIGN_ENTRY_TEXT_BEGIN        . = ALIGN(PMD_SIZE);
  #define ALIGN_ENTRY_TEXT_END  . = ALIGN(PMD_SIZE);
  
 +/*
 + * This section contains data which will be mapped as decrypted. Memory
 + * encryption operates on a page basis. Make this section PMD-aligned
 + * to avoid splitting the pages while mapping the section early.
 + *
 + * Note: We use a separate section so that only this section gets
 + * decrypted to avoid exposing more than we wish.
 + */
 +#define BSS_DECRYPTED                                         \
 +      . = ALIGN(PMD_SIZE);                                    \
 +      __start_bss_decrypted = .;                              \
 +      *(.bss..decrypted);                                     \
 +      . = ALIGN(PAGE_SIZE);                                   \
 +      __start_bss_decrypted_unused = .;                       \
 +      . = ALIGN(PMD_SIZE);                                    \
 +      __end_bss_decrypted = .;                                \
 +
  #else
  
  #define X86_ALIGN_RODATA_BEGIN
@@@ -91,7 -74,6 +91,7 @@@
  
  #define ALIGN_ENTRY_TEXT_BEGIN
  #define ALIGN_ENTRY_TEXT_END
 +#define BSS_DECRYPTED
  
  #endif
  
@@@ -136,16 -118,6 +136,6 @@@ SECTION
                *(.fixup)
                *(.gnu.warning)
  
- #ifdef CONFIG_X86_64
-               . = ALIGN(PAGE_SIZE);
-               __entry_trampoline_start = .;
-               _entry_trampoline = .;
-               *(.entry_trampoline)
-               . = ALIGN(PAGE_SIZE);
-               __entry_trampoline_end = .;
-               ASSERT(. - _entry_trampoline == PAGE_SIZE, "entry trampoline is too big");
- #endif
  #ifdef CONFIG_RETPOLINE
                __indirect_thunk_start = .;
                *(.text.__x86.indirect_thunk)
                __bss_start = .;
                *(.bss..page_aligned)
                *(.bss)
 +              BSS_DECRYPTED
                . = ALIGN(PAGE_SIZE);
                __bss_stop = .;
        }
diff --combined arch/x86/mm/tlb.c
index 7d68489cfdb15ff0838aba5beb6207c36b27cd77,073b8df349a0d6f81888bf87cf27bbf2b2efba7a..bddd6b3cee1de51ac8321974b827d208dbab1831
@@@ -7,6 -7,7 +7,7 @@@
  #include <linux/export.h>
  #include <linux/cpu.h>
  #include <linux/debugfs.h>
+ #include <linux/ptrace.h>
  
  #include <asm/tlbflush.h>
  #include <asm/mmu_context.h>
@@@ -180,16 -181,26 +181,29 @@@ static void sync_current_stack_to_mm(st
        }
  }
  
+ static bool ibpb_needed(struct task_struct *tsk, u64 last_ctx_id)
+ {
+       /*
+        * Check if the current (previous) task has access to the memory
+        * of the @tsk (next) task. If access is denied, make sure to
+        * issue an IBPB to stop user->user Spectre-v2 attacks.
+        *
+        * Note: __ptrace_may_access() returns 0 or -ERRNO.
+        */
+       return (tsk && tsk->mm && tsk->mm->context.ctx_id != last_ctx_id &&
+               ptrace_may_access_sched(tsk, PTRACE_MODE_SPEC_IBPB));
+ }
  void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
                        struct task_struct *tsk)
  {
        struct mm_struct *real_prev = this_cpu_read(cpu_tlbstate.loaded_mm);
        u16 prev_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
 +      bool was_lazy = this_cpu_read(cpu_tlbstate.is_lazy);
        unsigned cpu = smp_processor_id();
        u64 next_tlb_gen;
 +      bool need_flush;
 +      u16 new_asid;
  
        /*
         * NB: The scheduler will call us with prev == next when switching
                           next->context.ctx_id);
  
                /*
 -               * We don't currently support having a real mm loaded without
 -               * our cpu set in mm_cpumask().  We have all the bookkeeping
 -               * in place to figure out whether we would need to flush
 -               * if our cpu were cleared in mm_cpumask(), but we don't
 -               * currently use it.
 +               * Even in lazy TLB mode, the CPU should stay set in the
 +               * mm_cpumask. The TLB shootdown code can figure out
 +               * from cpu_tlbstate.is_lazy whether or not to send an IPI.
                 */
                if (WARN_ON_ONCE(real_prev != &init_mm &&
                                 !cpumask_test_cpu(cpu, mm_cpumask(next))))
                        cpumask_set_cpu(cpu, mm_cpumask(next));
  
 -              return;
 +              /*
 +               * If the CPU is not in lazy TLB mode, we are just switching
 +               * from one thread in a process to another thread in the same
 +               * process. No TLB flush required.
 +               */
 +              if (!was_lazy)
 +                      return;
 +
 +              /*
 +               * Read the tlb_gen to check whether a flush is needed.
 +               * If the TLB is up to date, just use it.
 +               * The barrier synchronizes with the tlb_gen increment in
 +               * the TLB shootdown code.
 +               */
 +              smp_mb();
 +              next_tlb_gen = atomic64_read(&next->context.tlb_gen);
 +              if (this_cpu_read(cpu_tlbstate.ctxs[prev_asid].tlb_gen) ==
 +                              next_tlb_gen)
 +                      return;
 +
 +              /*
 +               * TLB contents went out of date while we were in lazy
 +               * mode. Fall through to the TLB switching code below.
 +               */
 +              new_asid = prev_asid;
 +              need_flush = true;
        } else {
 -              u16 new_asid;
 -              bool need_flush;
                u64 last_ctx_id = this_cpu_read(cpu_tlbstate.last_ctx_id);
  
                /*
                 * one process from doing Spectre-v2 attacks on another.
                 *
                 * As an optimization, flush indirect branches only when
-                * switching into processes that disable dumping. This
-                * protects high value processes like gpg, without having
-                * too high performance overhead. IBPB is *expensive*!
-                *
-                * This will not flush branches when switching into kernel
-                * threads. It will also not flush if we switch to idle
-                * thread and back to the same process. It will flush if we
-                * switch to a different non-dumpable process.
+                * switching into a process that can't be ptraced by the
+                * current one (in such a case, the attacker has a much more
+                * convenient way to tamper with the next process than
+                * branch buffer poisoning).
                 */
-               if (tsk && tsk->mm &&
-                   tsk->mm->context.ctx_id != last_ctx_id &&
-                   get_dumpable(tsk->mm) != SUID_DUMP_USER)
+               if (static_cpu_has(X86_FEATURE_USE_IBPB) &&
+                               ibpb_needed(tsk, last_ctx_id))
                        indirect_branch_prediction_barrier();
  
                if (IS_ENABLED(CONFIG_VMAP_STACK)) {
                /* Let nmi_uaccess_okay() know that we're changing CR3. */
                this_cpu_write(cpu_tlbstate.loaded_mm, LOADED_MM_SWITCHING);
                barrier();
 +      }
  
 -              if (need_flush) {
 -                      this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id);
 -                      this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen);
 -                      load_new_mm_cr3(next->pgd, new_asid, true);
 -
 -                      /*
 -                       * NB: This gets called via leave_mm() in the idle path
 -                       * where RCU functions differently.  Tracing normally
 -                       * uses RCU, so we need to use the _rcuidle variant.
 -                       *
 -                       * (There is no good reason for this.  The idle code should
 -                       *  be rearranged to call this before rcu_idle_enter().)
 -                       */
 -                      trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
 -              } else {
 -                      /* The new ASID is already up to date. */
 -                      load_new_mm_cr3(next->pgd, new_asid, false);
 -
 -                      /* See above wrt _rcuidle. */
 -                      trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, 0);
 -              }
 +      if (need_flush) {
 +              this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id);
 +              this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen);
 +              load_new_mm_cr3(next->pgd, new_asid, true);
  
                /*
 -               * Record last user mm's context id, so we can avoid
 -               * flushing branch buffer with IBPB if we switch back
 -               * to the same user.
 +               * NB: This gets called via leave_mm() in the idle path
 +               * where RCU functions differently.  Tracing normally
 +               * uses RCU, so we need to use the _rcuidle variant.
 +               *
 +               * (There is no good reason for this.  The idle code should
 +               *  be rearranged to call this before rcu_idle_enter().)
                 */
 -              if (next != &init_mm)
 -                      this_cpu_write(cpu_tlbstate.last_ctx_id, next->context.ctx_id);
 -
 -              /* Make sure we write CR3 before loaded_mm. */
 -              barrier();
 +              trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
 +      } else {
 +              /* The new ASID is already up to date. */
 +              load_new_mm_cr3(next->pgd, new_asid, false);
  
 -              this_cpu_write(cpu_tlbstate.loaded_mm, next);
 -              this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid);
 +              /* See above wrt _rcuidle. */
 +              trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, 0);
        }
  
 -      load_mm_cr4(next);
 -      switch_ldt(real_prev, next);
 +      /*
 +       * Record last user mm's context id, so we can avoid
 +       * flushing branch buffer with IBPB if we switch back
 +       * to the same user.
 +       */
 +      if (next != &init_mm)
 +              this_cpu_write(cpu_tlbstate.last_ctx_id, next->context.ctx_id);
 +
 +      /* Make sure we write CR3 before loaded_mm. */
 +      barrier();
 +
 +      this_cpu_write(cpu_tlbstate.loaded_mm, next);
 +      this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid);
 +
 +      if (next != real_prev) {
 +              load_mm_cr4(next);
 +              switch_ldt(real_prev, next);
 +      }
  }
  
  /*
@@@ -394,7 -377,20 +403,7 @@@ void enter_lazy_tlb(struct mm_struct *m
        if (this_cpu_read(cpu_tlbstate.loaded_mm) == &init_mm)
                return;
  
 -      if (tlb_defer_switch_to_init_mm()) {
 -              /*
 -               * There's a significant optimization that may be possible
 -               * here.  We have accurate enough TLB flush tracking that we
 -               * don't need to maintain coherence of TLB per se when we're
 -               * lazy.  We do, however, need to maintain coherence of
 -               * paging-structure caches.  We could, in principle, leave our
 -               * old mm loaded and only switch to init_mm when
 -               * tlb_remove_page() happens.
 -               */
 -              this_cpu_write(cpu_tlbstate.is_lazy, true);
 -      } else {
 -              switch_mm(NULL, &init_mm, NULL);
 -      }
 +      this_cpu_write(cpu_tlbstate.is_lazy, true);
  }
  
  /*
@@@ -481,9 -477,6 +490,9 @@@ static void flush_tlb_func_common(cons
                 * paging-structure cache to avoid speculatively reading
                 * garbage into our TLB.  Since switching to init_mm is barely
                 * slower than a minimal flush, just switch to init_mm.
 +               *
 +               * This should be rare, with native_flush_tlb_others skipping
 +               * IPIs to lazy TLB mode CPUs.
                 */
                switch_mm_irqs_off(NULL, &init_mm, NULL);
                return;
            f->new_tlb_gen == local_tlb_gen + 1 &&
            f->new_tlb_gen == mm_tlb_gen) {
                /* Partial flush */
 -              unsigned long addr;
 -              unsigned long nr_pages = (f->end - f->start) >> PAGE_SHIFT;
 +              unsigned long nr_invalidate = (f->end - f->start) >> f->stride_shift;
 +              unsigned long addr = f->start;
  
 -              addr = f->start;
                while (addr < f->end) {
                        __flush_tlb_one_user(addr);
 -                      addr += PAGE_SIZE;
 +                      addr += 1UL << f->stride_shift;
                }
                if (local)
 -                      count_vm_tlb_events(NR_TLB_LOCAL_FLUSH_ONE, nr_pages);
 -              trace_tlb_flush(reason, nr_pages);
 +                      count_vm_tlb_events(NR_TLB_LOCAL_FLUSH_ONE, nr_invalidate);
 +              trace_tlb_flush(reason, nr_invalidate);
        } else {
                /* Full flush. */
                local_flush_tlb();
@@@ -586,11 -580,6 +595,11 @@@ static void flush_tlb_func_remote(void 
        flush_tlb_func_common(f, false, TLB_REMOTE_SHOOTDOWN);
  }
  
 +static bool tlb_is_not_lazy(int cpu, void *data)
 +{
 +      return !per_cpu(cpu_tlbstate.is_lazy, cpu);
 +}
 +
  void native_flush_tlb_others(const struct cpumask *cpumask,
                             const struct flush_tlb_info *info)
  {
                                               (void *)info, 1);
                return;
        }
 -      smp_call_function_many(cpumask, flush_tlb_func_remote,
 +
 +      /*
 +       * If no page tables were freed, we can skip sending IPIs to
 +       * CPUs in lazy TLB mode. They will flush the CPU themselves
 +       * at the next context switch.
 +       *
 +       * However, if page tables are getting freed, we need to send the
 +       * IPI everywhere, to prevent CPUs in lazy TLB mode from tripping
 +       * up on the new contents of what used to be page tables, while
 +       * doing a speculative memory access.
 +       */
 +      if (info->freed_tables)
 +              smp_call_function_many(cpumask, flush_tlb_func_remote,
                               (void *)info, 1);
 +      else
 +              on_each_cpu_cond_mask(tlb_is_not_lazy, flush_tlb_func_remote,
 +                              (void *)info, 1, GFP_ATOMIC, cpumask);
  }
  
  /*
  static unsigned long tlb_single_page_flush_ceiling __read_mostly = 33;
  
  void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
 -                              unsigned long end, unsigned long vmflag)
 +                              unsigned long end, unsigned int stride_shift,
 +                              bool freed_tables)
  {
        int cpu;
  
        struct flush_tlb_info info __aligned(SMP_CACHE_BYTES) = {
                .mm = mm,
 +              .stride_shift = stride_shift,
 +              .freed_tables = freed_tables,
        };
  
        cpu = get_cpu();
  
        /* Should we flush just the requested range? */
        if ((end != TLB_FLUSH_ALL) &&
 -          !(vmflag & VM_HUGETLB) &&
 -          ((end - start) >> PAGE_SHIFT) <= tlb_single_page_flush_ceiling) {
 +          ((end - start) >> stride_shift) <= tlb_single_page_flush_ceiling) {
                info.start = start;
                info.end = end;
        } else {
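
The new stride_shift/freed_tables arguments above replace the old vmflag/VM_HUGETLB test: callers now state the mapping granularity directly, and the partial-flush loop steps by 1UL << stride_shift instead of always PAGE_SIZE.  A small illustration of the arithmetic, with the usual x86-64 shift values (the helper name is made up for the example):

    #include <stdio.h>

    #define PAGE_SHIFT      12      /* 4 KiB base pages */
    #define PMD_SHIFT       21      /* 2 MiB huge pages */

    /* Number of invalidations the partial-flush loop would issue. */
    static unsigned long nr_invalidate(unsigned long start, unsigned long end,
                                       unsigned int stride_shift)
    {
            return (end - start) >> stride_shift;
    }

    int main(void)
    {
            /* Flushing a 4 MiB range: 1024 per-page invalidations vs. 2. */
            printf("4K stride: %lu\n", nr_invalidate(0, 4UL << 20, PAGE_SHIFT));
            printf("2M stride: %lu\n", nr_invalidate(0, 4UL << 20, PMD_SHIFT));
            return 0;
    }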
diff --combined kernel/cpu.c
index e82920b8bee14e62be2bb248d92413a85aff4aa9,2fb49916ea560735d7f51f8a7610892303ec243a..3c7f3b4c453cf57c8e37dd5fadc9f5941f074f0d
@@@ -315,16 -315,6 +315,16 @@@ void lockdep_assert_cpus_held(void
        percpu_rwsem_assert_held(&cpu_hotplug_lock);
  }
  
 +static void lockdep_acquire_cpus_lock(void)
 +{
 +      rwsem_acquire(&cpu_hotplug_lock.rw_sem.dep_map, 0, 0, _THIS_IP_);
 +}
 +
 +static void lockdep_release_cpus_lock(void)
 +{
 +      rwsem_release(&cpu_hotplug_lock.rw_sem.dep_map, 1, _THIS_IP_);
 +}
 +
  /*
   * Wait for currently running CPU hotplug operations to complete (if any) and
   * disable future CPU hotplug (from sysfs). The 'cpu_add_remove_lock' protects
@@@ -354,17 -344,6 +354,17 @@@ void cpu_hotplug_enable(void
        cpu_maps_update_done();
  }
  EXPORT_SYMBOL_GPL(cpu_hotplug_enable);
 +
 +#else
 +
 +static void lockdep_acquire_cpus_lock(void)
 +{
 +}
 +
 +static void lockdep_release_cpus_lock(void)
 +{
 +}
 +
  #endif        /* CONFIG_HOTPLUG_CPU */
  
  #ifdef CONFIG_HOTPLUG_SMT
@@@ -383,7 -362,6 +383,7 @@@ void __init cpu_smt_disable(bool force
                pr_info("SMT: Force disabled\n");
                cpu_smt_control = CPU_SMT_FORCE_DISABLED;
        } else {
 +              pr_info("SMT: disabled\n");
                cpu_smt_control = CPU_SMT_DISABLED;
        }
  }
@@@ -629,21 -607,15 +629,21 @@@ static void cpuhp_thread_fun(unsigned i
        bool bringup = st->bringup;
        enum cpuhp_state state;
  
 +      if (WARN_ON_ONCE(!st->should_run))
 +              return;
 +
        /*
         * ACQUIRE for the cpuhp_should_run() load of ->should_run. Ensures
         * that if we see ->should_run we also see the rest of the state.
         */
        smp_mb();
  
 -      if (WARN_ON_ONCE(!st->should_run))
 -              return;
 -
 +      /*
 +       * The BP holds the hotplug lock, but we're now running on the AP,
 +       * ensure that anybody asserting the lock is held, will actually find
 +       * it so.
 +       */
 +      lockdep_acquire_cpus_lock();
        cpuhp_lock_acquire(bringup);
  
        if (st->single) {
        }
  
        cpuhp_lock_release(bringup);
 +      lockdep_release_cpus_lock();
  
        if (!st->should_run)
                complete_ap_thread(st, bringup);
@@@ -945,8 -916,7 +945,8 @@@ static int cpuhp_down_callbacks(unsigne
                ret = cpuhp_invoke_callback(cpu, st->state, false, NULL, NULL);
                if (ret) {
                        st->target = prev_state;
 -                      undo_cpu_down(cpu, st);
 +                      if (st->state < prev_state)
 +                              undo_cpu_down(cpu, st);
                        break;
                }
        }
@@@ -999,7 -969,7 +999,7 @@@ static int __ref _cpu_down(unsigned in
         * to do the further cleanups.
         */
        ret = cpuhp_down_callbacks(cpu, st, target);
 -      if (ret && st->state > CPUHP_TEARDOWN_CPU && st->state < prev_state) {
 +      if (ret && st->state == CPUHP_TEARDOWN_CPU && st->state < prev_state) {
                cpuhp_reset_state(st, prev_state);
                __cpuhp_kick_ap(st);
        }
@@@ -2055,6 -2025,12 +2055,12 @@@ static void cpuhp_online_cpu_device(uns
        kobject_uevent(&dev->kobj, KOBJ_ONLINE);
  }
  
+ /*
+  * Architectures that need SMT-specific errata handling during SMT hotplug
+  * should override this.
+  */
+ void __weak arch_smt_update(void) { };
  static int cpuhp_smt_disable(enum cpuhp_smt_control ctrlval)
  {
        int cpu, ret = 0;
                 */
                cpuhp_offline_cpu_device(cpu);
        }
-       if (!ret)
+       if (!ret) {
                cpu_smt_control = ctrlval;
+               arch_smt_update();
+       }
        cpu_maps_update_done();
        return ret;
  }
@@@ -2093,6 -2071,7 +2101,7 @@@ static int cpuhp_smt_enable(void
  
        cpu_maps_update_begin();
        cpu_smt_control = CPU_SMT_ENABLED;
+       arch_smt_update();
        for_each_present_cpu(cpu) {
                /* Skip online CPUs and CPUs on offline nodes */
                if (cpu_online(cpu) || !node_online(cpu_to_node(cpu)))
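
The kernel/cpu.c side above uses the weak-symbol pattern: the generic code carries an empty "void __weak arch_smt_update(void)", and the x86 definition in cpu/bugs.c overrides it at link time.  A freestanding sketch of the same pattern (single file, so only the weak default is shown; any other object file providing a strong arch_smt_update() would win):

    #include <stdio.h>

    /* Generic fallback, as in kernel/cpu.c; __weak lets an arch override it. */
    void __attribute__((weak)) arch_smt_update(void)
    {
            puts("weak default: nothing to do");
    }

    int main(void)
    {
            /* Linked alone this prints the weak default; linking in a .o with
             * a non-weak arch_smt_update() would call that one instead. */
            arch_smt_update();
            return 0;
    }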