Merge branch 'x86-apic-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git...
author Linus Torvalds <torvalds@linux-foundation.org>
Tue, 5 Sep 2017 00:43:56 +0000 (17:43 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Tue, 5 Sep 2017 00:43:56 +0000 (17:43 -0700)
Pull x86 apic updates from Thomas Gleixner:
 "This update provides:

   - Cleanup of the IDT management, including the removal of the extra
     tracing IDT. A first step towards cleaning up the vector management
     code (a sketch of the table-based approach follows below).

   - The removal of the paravirt op adjust_exception_frame. This is a
     Xen-specific issue, but it is merged through this branch to avoid
     nasty merge collisions.

   - Prevent dmesg spam about the TSC DEADLINE bug when the CPU has
     disabled the TSC DEADLINE timer in CPUID.

   - Adjust a debug message in the ioapic code to print out the
     information correctly"

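The IDT rework referenced above replaces scattered set_intr_gate() calls
with table-driven initialization. Below is a minimal sketch of that
pattern, assuming a simplified idt_data layout and descriptor helpers
(idt_init_desc(), write_idt_entry()) in the surrounding IDT code; the
field names are illustrative, not the authoritative in-tree definitions:

    /* One table entry per vector, walked once at init time. */
    struct idt_data {
            unsigned int    vector;     /* interrupt vector number */
            unsigned int    segment;    /* code segment selector */
            struct idt_bits bits;       /* type, dpl, ist, present */
            const void      *addr;      /* handler entry point */
    };

    static void
    idt_setup_from_table(gate_desc *idt, const struct idt_data *t, int size)
    {
            gate_desc desc;

            /* Build a gate descriptor from each table entry and
             * write it into the IDT. */
            for (; size > 0; t++, size--) {
                    idt_init_desc(&desc, t);
                    write_idt_entry(idt, t->vector, &desc);
            }
    }
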
* 'x86-apic-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (51 commits)
  x86/idt: Fix the X86_TRAP_BP gate
  x86/xen: Get rid of paravirt op adjust_exception_frame
  x86/eisa: Add missing include
  x86/idt: Remove superfluous ALIGNment
  x86/apic: Silence "FW_BUG TSC_DEADLINE disabled due to Errata" on CPUs without the feature
  x86/idt: Remove the tracing IDT leftovers
  x86/idt: Hide set_intr_gate()
  x86/idt: Simplify alloc_intr_gate()
  x86/idt: Deinline setup functions
  x86/idt: Remove unused functions/inlines
  x86/idt: Move interrupt gate initialization to IDT code
  x86/idt: Move APIC gate initialization to tables
  x86/idt: Move regular trap init to tables
  x86/idt: Move IST stack based traps to table init
  x86/idt: Move debug stack init to table based
  x86/idt: Switch early trap init to IDT tables
  x86/idt: Prepare for table based init
  x86/idt: Move early IDT setup out of 32-bit asm
  x86/idt: Move early IDT handler setup to IDT code
  x86/idt: Consolidate IDT invalidation
  ...

arch/x86/boot/compressed/eboot.c
arch/x86/entry/entry_64.S
arch/x86/entry/entry_64_compat.S
arch/x86/kernel/cpu/common.c
arch/x86/kernel/cpu/mcheck/mce_amd.c
arch/x86/kernel/head64.c
arch/x86/kernel/setup.c
arch/x86/kvm/vmx.c
arch/x86/mm/fault.c
arch/x86/xen/enlighten_pv.c

index e007887a33b0d2c8b62eef451c40a1b2f875c2cb,65f0b24f60db3d1dfc1699d91117fbdef76d4998..926c2cc4facc9fdec21a28c73c3297545ae6e3a0
@@@ -767,7 -767,7 +767,7 @@@ static efi_status_t setup_e820(struct b
                m |= (u64)efi->efi_memmap_hi << 32;
  #endif
  
 -              d = (efi_memory_desc_t *)(m + (i * efi->efi_memdesc_size));
 +              d = efi_early_memdesc_ptr(m, efi->efi_memdesc_size, i);
                switch (d->type) {
                case EFI_RESERVED_TYPE:
                case EFI_RUNTIME_SERVICES_CODE:
@@@ -1058,7 -1058,7 +1058,7 @@@ struct boot_params *efi_main(struct efi
                desc->s = DESC_TYPE_CODE_DATA;
                desc->dpl = 0;
                desc->p = 1;
-               desc->limit = 0xf;
+               desc->limit1 = 0xf;
                desc->avl = 0;
                desc->l = 0;
                desc->d = SEG_OP_SIZE_32BIT;
        desc->s = DESC_TYPE_CODE_DATA;
        desc->dpl = 0;
        desc->p = 1;
-       desc->limit = 0xf;
+       desc->limit1 = 0xf;
        desc->avl = 0;
        if (IS_ENABLED(CONFIG_X86_64)) {
                desc->l = 1;
        desc->s = DESC_TYPE_CODE_DATA;
        desc->dpl = 0;
        desc->p = 1;
-       desc->limit = 0xf;
+       desc->limit1 = 0xf;
        desc->avl = 0;
        desc->l = 0;
        desc->d = SEG_OP_SIZE_32BIT;
                desc->s = 0;
                desc->dpl = 0;
                desc->p = 1;
-               desc->limit = 0x0;
+               desc->limit1 = 0x0;
                desc->avl = 0;
                desc->l = 0;
                desc->d = 0;
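
The limit -> limit1 renames above track the rework of struct desc_struct
from opaque words into explicit bitfields. A compile-checkable sketch of
the layout assumed by these hunks (the struct name is ours; field order
follows the hardware segment descriptor format):

    #include <stdint.h>

    /* The old composite 'limit' is split: limit0 carries bits 0-15 of
     * the segment limit, limit1 carries bits 16-19. */
    struct desc_struct_sketch {
            uint16_t limit0;
            uint16_t base0;
            uint16_t base1 : 8, type : 4, s : 1, dpl : 2, p : 1;
            uint16_t limit1 : 4, avl : 1, l : 1, d : 1, g : 1, base2 : 8;
    } __attribute__((packed));

    /* A hardware segment descriptor is exactly 8 bytes. */
    _Static_assert(sizeof(struct desc_struct_sketch) == 8,
                   "descriptor must be 8 bytes");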
index ca0b250eefc4b480fc0494157696f68dbcf46110,bdd024a9afc992f81628591a6ead8b2b4a896545..49167258d587570673c5e515cb00b4d8f26263b7
@@@ -748,26 -748,18 +748,13 @@@ ENTRY(\sym
  END(\sym)
  .endm
  
- #ifdef CONFIG_TRACING
- #define trace(sym) trace_##sym
- #define smp_trace(sym) smp_trace_##sym
- .macro trace_apicinterrupt num sym
- apicinterrupt3 \num trace(\sym) smp_trace(\sym)
- .endm
- #else
- .macro trace_apicinterrupt num sym do_sym
- .endm
- #endif
  /* Make sure APIC interrupt handlers end up in the irqentry section: */
 -#if defined(CONFIG_FUNCTION_GRAPH_TRACER) || defined(CONFIG_KASAN)
 -# define PUSH_SECTION_IRQENTRY        .pushsection .irqentry.text, "ax"
 -# define POP_SECTION_IRQENTRY .popsection
 -#else
 -# define PUSH_SECTION_IRQENTRY
 -# define POP_SECTION_IRQENTRY
 -#endif
 +#define PUSH_SECTION_IRQENTRY .pushsection .irqentry.text, "ax"
 +#define POP_SECTION_IRQENTRY  .popsection
  
  .macro apicinterrupt num sym do_sym
  PUSH_SECTION_IRQENTRY
  apicinterrupt3 \num \sym \do_sym
- trace_apicinterrupt \num \sym
  POP_SECTION_IRQENTRY
  .endm
  
@@@ -829,7 -821,6 +816,6 @@@ ENTRY(\sym
        .endif
  
        ASM_CLAC
-       PARAVIRT_ADJUST_EXCEPTION_FRAME
  
        .ifeq \has_error_code
        pushq   $-1                             /* ORIG_RAX: no syscall to restart */
  END(\sym)
  .endm
  
- #ifdef CONFIG_TRACING
- .macro trace_idtentry sym do_sym has_error_code:req
- idtentry trace(\sym) trace(\do_sym) has_error_code=\has_error_code
- idtentry \sym \do_sym has_error_code=\has_error_code
- .endm
- #else
- .macro trace_idtentry sym do_sym has_error_code:req
- idtentry \sym \do_sym has_error_code=\has_error_code
- .endm
- #endif
  idtentry divide_error                 do_divide_error                 has_error_code=0
  idtentry overflow                     do_overflow                     has_error_code=0
  idtentry bounds                               do_bounds                       has_error_code=0
@@@ -986,7 -966,7 +961,7 @@@ ENTRY(do_softirq_own_stack
  ENDPROC(do_softirq_own_stack)
  
  #ifdef CONFIG_XEN
- idtentry xen_hypervisor_callback xen_do_hypervisor_callback has_error_code=0
+ idtentry hypervisor_callback xen_do_hypervisor_callback has_error_code=0
  
  /*
   * A note on the "critical region" in our callback handler.
@@@ -1053,8 -1033,6 +1028,6 @@@ ENTRY(xen_failsafe_callback
        movq    8(%rsp), %r11
        addq    $0x30, %rsp
        pushq   $0                              /* RIP */
-       pushq   %r11
-       pushq   %rcx
        UNWIND_HINT_IRET_REGS offset=8
        jmp     general_protection
  1:    /* Segment mismatch => Category 1 (Bad segment). Retry the IRET. */
@@@ -1085,13 -1063,12 +1058,12 @@@ idtentry int3                        do_int3                 has_error_cod
  idtentry stack_segment                do_stack_segment        has_error_code=1
  
  #ifdef CONFIG_XEN
- idtentry xen_debug            do_debug                has_error_code=0
- idtentry xen_int3             do_int3                 has_error_code=0
- idtentry xen_stack_segment    do_stack_segment        has_error_code=1
+ idtentry xendebug             do_debug                has_error_code=0
+ idtentry xenint3              do_int3                 has_error_code=0
  #endif
  
  idtentry general_protection   do_general_protection   has_error_code=1
- trace_idtentry page_fault   do_page_fault           has_error_code=1
+ idtentry page_fault          do_page_fault           has_error_code=1
  
  #ifdef CONFIG_KVM_GUEST
  idtentry async_page_fault     do_async_page_fault     has_error_code=1
@@@ -1251,20 -1228,9 +1223,9 @@@ ENTRY(error_exit
  END(error_exit)
  
  /* Runs on exception stack */
+ /* XXX: broken on Xen PV */
  ENTRY(nmi)
        UNWIND_HINT_IRET_REGS
-       /*
-        * Fix up the exception frame if we're on Xen.
-        * PARAVIRT_ADJUST_EXCEPTION_FRAME is guaranteed to push at most
-        * one value to the stack on native, so it may clobber the rdx
-        * scratch slot, but it won't clobber any of the important
-        * slots past it.
-        *
-        * Xen is a different story, because the Xen frame itself overlaps
-        * the "NMI executing" variable.
-        */
-       PARAVIRT_ADJUST_EXCEPTION_FRAME
        /*
         * We allow breakpoints in NMIs. If a breakpoint occurs, then
         * the iretq it performs will take us out of NMI context.
index 4b86d8da3ea37e280d94b5c3de2704e62a2836b3,d8468ba24be069609a46e6f96f06d2983e81d2a1..e26c25ca77565938fb9ca4d16b08c3bcf4f48264
@@@ -293,7 -293,6 +293,6 @@@ ENTRY(entry_INT80_compat
        /*
         * Interrupts are off on entry.
         */
-       PARAVIRT_ADJUST_EXCEPTION_FRAME
        ASM_CLAC                        /* Do this early to minimize exposure */
        SWAPGS
  
        jmp     restore_regs_and_iret
  END(entry_INT80_compat)
  
 -      ALIGN
 -GLOBAL(stub32_clone)
 +ENTRY(stub32_clone)
        /*
         * The 32-bit clone ABI is: clone(..., int tls_val, int *child_tidptr).
         * The 64-bit clone ABI is: clone(..., int *child_tidptr, int tls_val).
         */
        xchg    %r8, %rcx
        jmp     sys_clone
 +ENDPROC(stub32_clone)
index b95cd94ca97bc191121e87bd5c0471d0ad8de494,71ab8a45cd665d072af81c8211c92bc40aab258f..efba8e3da3e28a4fb65a0b12b812f857aee36dba
@@@ -168,24 -168,6 +168,24 @@@ static int __init x86_mpx_setup(char *s
  }
  __setup("nompx", x86_mpx_setup);
  
 +#ifdef CONFIG_X86_64
 +static int __init x86_pcid_setup(char *s)
 +{
 +      /* require an exact match without trailing characters */
 +      if (strlen(s))
 +              return 0;
 +
 +      /* do not emit a message if the feature is not present */
 +      if (!boot_cpu_has(X86_FEATURE_PCID))
 +              return 1;
 +
 +      setup_clear_cpu_cap(X86_FEATURE_PCID);
 +      pr_info("nopcid: PCID feature disabled\n");
 +      return 1;
 +}
 +__setup("nopcid", x86_pcid_setup);
 +#endif
 +
  static int __init x86_noinvpcid_setup(char *s)
  {
        /* noinvpcid doesn't accept parameters */
@@@ -329,25 -311,6 +329,25 @@@ static __always_inline void setup_smap(
        }
  }
  
 +static void setup_pcid(struct cpuinfo_x86 *c)
 +{
 +      if (cpu_has(c, X86_FEATURE_PCID)) {
 +              if (cpu_has(c, X86_FEATURE_PGE)) {
 +                      cr4_set_bits(X86_CR4_PCIDE);
 +              } else {
 +                      /*
 +                       * flush_tlb_all(), as currently implemented, won't
 +                       * work if PCID is on but PGE is not.  Since that
 +                       * combination doesn't exist on real hardware, there's
 +                       * no reason to try to fully support it, but it's
 +                       * polite to avoid corrupting data if we're on
 +                       * an improperly configured VM.
 +                       */
 +                      clear_cpu_cap(c, X86_FEATURE_PCID);
 +              }
 +      }
 +}
 +
  /*
   * Protection Keys are not available in 32-bit mode.
   */
@@@ -1162,9 -1125,6 +1162,9 @@@ static void identify_cpu(struct cpuinfo
        setup_smep(c);
        setup_smap(c);
  
 +      /* Set up PCID */
 +      setup_pcid(c);
 +
        /*
         * The vendor-specific functions might have changed features.
         * Now we do "generic changes."
@@@ -1329,15 -1289,6 +1329,6 @@@ static __init int setup_disablecpuid(ch
  __setup("clearcpuid=", setup_disablecpuid);
  
  #ifdef CONFIG_X86_64
- struct desc_ptr idt_descr __ro_after_init = {
-       .size = NR_VECTORS * 16 - 1,
-       .address = (unsigned long) idt_table,
- };
- const struct desc_ptr debug_idt_descr = {
-       .size = NR_VECTORS * 16 - 1,
-       .address = (unsigned long) debug_idt_table,
- };
  DEFINE_PER_CPU_FIRST(union irq_stack_union,
                     irq_stack_union) __aligned(PAGE_SIZE) __visible;
  
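
The setup_pcid() comment above hinges on a TLB detail: once CR4.PCIDE is
set, a plain CR3 write only flushes entries tagged with the current
PCID, so a full flush has to toggle CR4.PGE instead. A hedged sketch of
that global-flush path (the helper name is ours; the real logic lives in
the kernel's tlbflush helpers):

    static inline void flush_tlb_all_sketch(void)
    {
            unsigned long cr4 = __read_cr4();

            /* Clearing PGE flushes the entire TLB, including global
             * entries; restoring it leaves paging state unchanged.
             * This is why setup_pcid() refuses PCID without PGE. */
            __write_cr4(cr4 ^ X86_CR4_PGE);
            __write_cr4(cr4);
    }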
index 5ce1a5689162046d419fa9fda0d86c26eb7e4faa,172924d57d2449b202b7ba0b31c1d123964dc1a5..40e28ed77fbf017aee3ddb5c8dd128ff8ac2b826
@@@ -201,8 -201,8 +201,8 @@@ static void smca_configure(unsigned in
                wrmsr(smca_config, low, high);
        }
  
 -      /* Collect bank_info using CPU 0 for now. */
 -      if (cpu)
 +      /* Return early if this bank was already initialized. */
 +      if (smca_banks[bank].hwid)
                return;
  
        if (rdmsr_safe_on_cpu(cpu, MSR_AMD64_SMCA_MCx_IPID(bank), &low, &high)) {
        for (i = 0; i < ARRAY_SIZE(smca_hwid_mcatypes); i++) {
                s_hwid = &smca_hwid_mcatypes[i];
                if (hwid_mcatype == s_hwid->hwid_mcatype) {
 -
 -                      WARN(smca_banks[bank].hwid,
 -                           "Bank %s already initialized!\n",
 -                           smca_get_name(s_hwid->bank_type));
 -
                        smca_banks[bank].hwid = s_hwid;
                        smca_banks[bank].id = low;
                        smca_banks[bank].sysfs_id = s_hwid->count++;
@@@ -771,24 -776,12 +771,12 @@@ static void __log_error(unsigned int ba
        mce_log(&m);
  }
  
- static inline void __smp_deferred_error_interrupt(void)
- {
-       inc_irq_stat(irq_deferred_error_count);
-       deferred_error_int_vector();
- }
  asmlinkage __visible void __irq_entry smp_deferred_error_interrupt(void)
- {
-       entering_irq();
-       __smp_deferred_error_interrupt();
-       exiting_ack_irq();
- }
- asmlinkage __visible void __irq_entry smp_trace_deferred_error_interrupt(void)
  {
        entering_irq();
        trace_deferred_error_apic_entry(DEFERRED_ERROR_VECTOR);
-       __smp_deferred_error_interrupt();
+       inc_irq_stat(irq_deferred_error_count);
+       deferred_error_int_vector();
        trace_deferred_error_apic_exit(DEFERRED_ERROR_VECTOR);
        exiting_ack_irq();
  }
diff --combined arch/x86/kernel/head64.c
index 6a193b93fd952d59b4bca8a2071859edf8bcfcb6,d6ab034bd65f99e658c2d2ed11e5e3ed3cec131e..bab4fa579450cd5192a60366993a168493789fb4
@@@ -14,7 -14,6 +14,7 @@@
  #include <linux/start_kernel.h>
  #include <linux/io.h>
  #include <linux/memblock.h>
 +#include <linux/mem_encrypt.h>
  
  #include <asm/processor.h>
  #include <asm/proto.h>
@@@ -34,6 -33,7 +34,6 @@@
  /*
   * Manage page tables very early on.
   */
 -extern pgd_t early_top_pgt[PTRS_PER_PGD];
  extern pmd_t early_dynamic_pgts[EARLY_DYNAMIC_PAGE_TABLES][PTRS_PER_PMD];
  static unsigned int __initdata next_early_pgt;
  pmdval_t early_pmd_flags = __PAGE_KERNEL_LARGE & ~(_PAGE_GLOBAL | _PAGE_NX);
@@@ -45,11 -45,9 +45,11 @@@ static void __head *fixup_pointer(void 
        return ptr - (void *)_text + (void *)physaddr;
  }
  
 -void __head __startup_64(unsigned long physaddr)
 +unsigned long __head __startup_64(unsigned long physaddr,
 +                                struct boot_params *bp)
  {
        unsigned long load_delta, *p;
 +      unsigned long pgtable_flags;
        pgdval_t *pgd;
        p4dval_t *p4d;
        pudval_t *pud;
        if (load_delta & ~PMD_PAGE_MASK)
                for (;;);
  
 +      /* Activate Secure Memory Encryption (SME) if supported and enabled */
 +      sme_enable(bp);
 +
 +      /* Include the SME encryption mask in the fixup value */
 +      load_delta += sme_get_me_mask();
 +
        /* Fixup the physical addresses in the page table */
  
        pgd = fixup_pointer(&early_top_pgt, physaddr);
         * creates a bunch of nonsense entries but that is fine --
         * it avoids problems around wraparound.
         */
 +
        next_pgt_ptr = fixup_pointer(&next_early_pgt, physaddr);
        pud = fixup_pointer(early_dynamic_pgts[(*next_pgt_ptr)++], physaddr);
        pmd = fixup_pointer(early_dynamic_pgts[(*next_pgt_ptr)++], physaddr);
  
 +      pgtable_flags = _KERNPG_TABLE_NOENC + sme_get_me_mask();
 +
        if (IS_ENABLED(CONFIG_X86_5LEVEL)) {
                p4d = fixup_pointer(early_dynamic_pgts[next_early_pgt++], physaddr);
  
                i = (physaddr >> PGDIR_SHIFT) % PTRS_PER_PGD;
 -              pgd[i + 0] = (pgdval_t)p4d + _KERNPG_TABLE;
 -              pgd[i + 1] = (pgdval_t)p4d + _KERNPG_TABLE;
 +              pgd[i + 0] = (pgdval_t)p4d + pgtable_flags;
 +              pgd[i + 1] = (pgdval_t)p4d + pgtable_flags;
  
                i = (physaddr >> P4D_SHIFT) % PTRS_PER_P4D;
 -              p4d[i + 0] = (pgdval_t)pud + _KERNPG_TABLE;
 -              p4d[i + 1] = (pgdval_t)pud + _KERNPG_TABLE;
 +              p4d[i + 0] = (pgdval_t)pud + pgtable_flags;
 +              p4d[i + 1] = (pgdval_t)pud + pgtable_flags;
        } else {
                i = (physaddr >> PGDIR_SHIFT) % PTRS_PER_PGD;
 -              pgd[i + 0] = (pgdval_t)pud + _KERNPG_TABLE;
 -              pgd[i + 1] = (pgdval_t)pud + _KERNPG_TABLE;
 +              pgd[i + 0] = (pgdval_t)pud + pgtable_flags;
 +              pgd[i + 1] = (pgdval_t)pud + pgtable_flags;
        }
  
        i = (physaddr >> PUD_SHIFT) % PTRS_PER_PUD;
 -      pud[i + 0] = (pudval_t)pmd + _KERNPG_TABLE;
 -      pud[i + 1] = (pudval_t)pmd + _KERNPG_TABLE;
 +      pud[i + 0] = (pudval_t)pmd + pgtable_flags;
 +      pud[i + 1] = (pudval_t)pmd + pgtable_flags;
  
        pmd_entry = __PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL;
 +      pmd_entry += sme_get_me_mask();
        pmd_entry +=  physaddr;
  
        for (i = 0; i < DIV_ROUND_UP(_end - _text, PMD_SIZE); i++) {
                        pmd[i] += load_delta;
        }
  
 -      /* Fixup phys_base */
 +      /*
 +       * Fixup phys_base - remove the memory encryption mask to obtain
 +       * the true physical address.
 +       */
        p = fixup_pointer(&phys_base, physaddr);
 -      *p += load_delta;
 +      *p += load_delta - sme_get_me_mask();
 +
 +      /* Encrypt the kernel (if SME is active) */
 +      sme_encrypt_kernel();
 +
 +      /*
 +       * Return the SME encryption mask (if SME is active) to be used as a
 +       * modifier for the initial pgdir entry programmed into CR3.
 +       */
 +      return sme_get_me_mask();
 +}
 +
 +unsigned long __startup_secondary_64(void)
 +{
 +      /*
 +       * Return the SME encryption mask (if SME is active) to be used as a
 +       * modifier for the initial pgdir entry programmed into CR3.
 +       */
 +      return sme_get_me_mask();
  }
  
  /* Wipe all early page tables except for the kernel symbol map */
@@@ -180,17 -147,17 +180,17 @@@ static void __init reset_early_page_tab
  {
        memset(early_top_pgt, 0, sizeof(pgd_t)*(PTRS_PER_PGD-1));
        next_early_pgt = 0;
 -      write_cr3(__pa_nodebug(early_top_pgt));
 +      write_cr3(__sme_pa_nodebug(early_top_pgt));
  }
  
  /* Create a new PMD entry */
 -int __init early_make_pgtable(unsigned long address)
 +int __init __early_make_pgtable(unsigned long address, pmdval_t pmd)
  {
        unsigned long physaddr = address - __PAGE_OFFSET;
        pgdval_t pgd, *pgd_p;
        p4dval_t p4d, *p4d_p;
        pudval_t pud, *pud_p;
 -      pmdval_t pmd, *pmd_p;
 +      pmdval_t *pmd_p;
  
        /* Invalid address or early pgt is done ?  */
        if (physaddr >= MAXMEM || read_cr3_pa() != __pa_nodebug(early_top_pgt))
@@@ -249,21 -216,12 +249,21 @@@ again
                memset(pmd_p, 0, sizeof(*pmd_p) * PTRS_PER_PMD);
                *pud_p = (pudval_t)pmd_p - __START_KERNEL_map + phys_base + _KERNPG_TABLE;
        }
 -      pmd = (physaddr & PMD_MASK) + early_pmd_flags;
        pmd_p[pmd_index(address)] = pmd;
  
        return 0;
  }
  
 +int __init early_make_pgtable(unsigned long address)
 +{
 +      unsigned long physaddr = address - __PAGE_OFFSET;
 +      pmdval_t pmd;
 +
 +      pmd = (physaddr & PMD_MASK) + early_pmd_flags;
 +
 +      return __early_make_pgtable(address, pmd);
 +}
 +
  /* Don't add a printk in there. printk relies on the PDA which is not initialized 
     yet. */
  static void __init clear_bss(void)
@@@ -286,12 -244,6 +286,12 @@@ static void __init copy_bootdata(char *
        char * command_line;
        unsigned long cmd_line_ptr;
  
 +      /*
 +       * If SME is active, this will create decrypted mappings of the
 +       * boot data in advance of the copy operations.
 +       */
 +      sme_map_bootdata(real_mode_data);
 +
        memcpy(&boot_params, real_mode_data, sizeof boot_params);
        sanitize_boot_params(&boot_params);
        cmd_line_ptr = get_cmd_line_ptr();
                command_line = __va(cmd_line_ptr);
                memcpy(boot_command_line, command_line, COMMAND_LINE_SIZE);
        }
 +
 +      /*
 +       * The old boot data is no longer needed and won't be reserved,
 +       * freeing up that memory for use by the system. If SME is active,
 +       * we need to remove the mappings that were created so that the
 +       * memory doesn't remain mapped as decrypted.
 +       */
 +      sme_unmap_bootdata(real_mode_data);
  }
  
  asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data)
  {
-       int i;
        /*
         * Build-time sanity checks on the kernel image and module
         * area mappings. (these are purely build-time and produce no code)
  
        clear_page(init_top_pgt);
  
 +      /*
 +       * SME support may update early_pmd_flags to include the memory
 +       * encryption mask, so it needs to be called before anything
 +       * that may generate a page fault.
 +       */
 +      sme_early_init();
 +
        kasan_early_init();
  
-       for (i = 0; i < NUM_EXCEPTION_VECTORS; i++)
-               set_intr_gate(i, early_idt_handler_array[i]);
-       load_idt((const struct desc_ptr *)&idt_descr);
+       idt_setup_early_handler();
  
        copy_bootdata(__va(real_mode_data));
  
diff --combined arch/x86/kernel/setup.c
index 022ebddb3734e6cdf528f47393c62195b29a0834,30dc84ee35b270e1a8522693e74bb0d3a840f1bd..9cc16a8417459427f330101578e23f6234dabadd
@@@ -69,7 -69,6 +69,7 @@@
  #include <linux/crash_dump.h>
  #include <linux/tboot.h>
  #include <linux/jiffies.h>
 +#include <linux/mem_encrypt.h>
  
  #include <linux/usb/xhci-dbgp.h>
  #include <video/edid.h>
@@@ -376,14 -375,6 +376,14 @@@ static void __init reserve_initrd(void
            !ramdisk_image || !ramdisk_size)
                return;         /* No initrd provided by bootloader */
  
 +      /*
 +       * If SME is active, this memory will be marked encrypted by the
 +       * kernel when it is accessed (including relocation). However, the
 +       * ramdisk image was loaded decrypted by the bootloader, so make
 +       * sure that it is encrypted before accessing it.
 +       */
 +      sme_early_encrypt(ramdisk_image, ramdisk_end - ramdisk_image);
 +
        initrd_start = 0;
  
        mapped_size = memblock_mem_size(max_pfn_mapped);
@@@ -900,7 -891,7 +900,7 @@@ void __init setup_arch(char **cmdline_p
         */
        olpc_ofw_detect();
  
-       early_trap_init();
+       idt_setup_early_traps();
        early_cpu_init();
        early_ioremap_init();
  
  
        init_mem_mapping();
  
-       early_trap_pf_init();
+       idt_setup_early_pf();
  
        /*
         * Update mmu_cr4_features (and, indirectly, trampoline_cr4_features)
diff --combined arch/x86/kvm/vmx.c
index d40900914a72b04a329e64b829e2988e00d71eb3,08d00eefabeacbff275d3a78a1153546421d4f7e..70b90c0810d0efc8b6331dbbf7af52ddb07322c6
@@@ -6556,7 -6556,7 +6556,7 @@@ void vmx_enable_tdp(void
                enable_ept_ad_bits ? VMX_EPT_DIRTY_BIT : 0ull,
                0ull, VMX_EPT_EXECUTABLE_MASK,
                cpu_has_vmx_ept_execute_only() ? 0ull : VMX_EPT_READABLE_MASK,
 -              VMX_EPT_RWX_MASK);
 +              VMX_EPT_RWX_MASK, 0ull);
  
        ept_set_mmio_spte_mask();
        kvm_enable_tdp();
@@@ -8779,7 -8779,7 +8779,7 @@@ static void vmx_handle_external_intr(st
  
                vector =  exit_intr_info & INTR_INFO_VECTOR_MASK;
                desc = (gate_desc *)vmx->host_idt_base + vector;
-               entry = gate_offset(*desc);
+               entry = gate_offset(desc);
                asm volatile(
  #ifdef CONFIG_X86_64
                        "mov %%" _ASM_SP ", %[sp]\n\t"
diff --combined arch/x86/mm/fault.c
index 0cdf14cf3270c97010e0fd97b003f6a59e064abf,f9bb6608f6f141d79e8eb2b0491c33a793465494..b836a7274e123af88edd7b4f261da88609778be9
@@@ -396,18 -396,14 +396,18 @@@ static void dump_pagetable(unsigned lon
        pte_t *pte;
  
  #ifdef CONFIG_X86_PAE
 -      printk("*pdpt = %016Lx ", pgd_val(*pgd));
 +      pr_info("*pdpt = %016Lx ", pgd_val(*pgd));
        if (!low_pfn(pgd_val(*pgd) >> PAGE_SHIFT) || !pgd_present(*pgd))
                goto out;
 +#define pr_pde pr_cont
 +#else
 +#define pr_pde pr_info
  #endif
        p4d = p4d_offset(pgd, address);
        pud = pud_offset(p4d, address);
        pmd = pmd_offset(pud, address);
 -      printk(KERN_CONT "*pde = %0*Lx ", sizeof(*pmd) * 2, (u64)pmd_val(*pmd));
 +      pr_pde("*pde = %0*Lx ", sizeof(*pmd) * 2, (u64)pmd_val(*pmd));
 +#undef pr_pde
  
        /*
         * We must not directly access the pte in the highpte
                goto out;
  
        pte = pte_offset_kernel(pmd, address);
 -      printk("*pte = %0*Lx ", sizeof(*pte) * 2, (u64)pte_val(*pte));
 +      pr_cont("*pte = %0*Lx ", sizeof(*pte) * 2, (u64)pte_val(*pte));
  out:
 -      printk("\n");
 +      pr_cont("\n");
  }
  
  #else /* CONFIG_X86_64: */
@@@ -569,7 -565,7 +569,7 @@@ static void dump_pagetable(unsigned lon
        if (bad_address(pgd))
                goto bad;
  
 -      printk("PGD %lx ", pgd_val(*pgd));
 +      pr_info("PGD %lx ", pgd_val(*pgd));
  
        if (!pgd_present(*pgd))
                goto out;
        if (bad_address(p4d))
                goto bad;
  
 -      printk("P4D %lx ", p4d_val(*p4d));
 +      pr_cont("P4D %lx ", p4d_val(*p4d));
        if (!p4d_present(*p4d) || p4d_large(*p4d))
                goto out;
  
        if (bad_address(pud))
                goto bad;
  
 -      printk("PUD %lx ", pud_val(*pud));
 +      pr_cont("PUD %lx ", pud_val(*pud));
        if (!pud_present(*pud) || pud_large(*pud))
                goto out;
  
        if (bad_address(pmd))
                goto bad;
  
 -      printk("PMD %lx ", pmd_val(*pmd));
 +      pr_cont("PMD %lx ", pmd_val(*pmd));
        if (!pmd_present(*pmd) || pmd_large(*pmd))
                goto out;
  
        if (bad_address(pte))
                goto bad;
  
 -      printk("PTE %lx", pte_val(*pte));
 +      pr_cont("PTE %lx", pte_val(*pte));
  out:
 -      printk("\n");
 +      pr_cont("\n");
        return;
  bad:
 -      printk("BAD\n");
 +      pr_info("BAD\n");
  }
  
  #endif /* CONFIG_X86_64 */
@@@ -1258,10 -1254,6 +1258,6 @@@ static inline bool smap_violation(int e
   * This routine handles page faults.  It determines the address,
   * and the problem, and then passes it off to one of the appropriate
   * routines.
-  *
-  * This function must have noinline because both callers
-  * {,trace_}do_page_fault() have notrace on. Having this an actual function
-  * guarantees there's a function trace entry.
   */
  static noinline void
  __do_page_fault(struct pt_regs *regs, unsigned long error_code,
@@@ -1494,27 -1486,6 +1490,6 @@@ good_area
  }
  NOKPROBE_SYMBOL(__do_page_fault);
  
- dotraplinkage void notrace
- do_page_fault(struct pt_regs *regs, unsigned long error_code)
- {
-       unsigned long address = read_cr2(); /* Get the faulting address */
-       enum ctx_state prev_state;
-       /*
-        * We must have this function tagged with __kprobes, notrace and call
-        * read_cr2() before calling anything else. To avoid calling any kind
-        * of tracing machinery before we've observed the CR2 value.
-        *
-        * exception_{enter,exit}() contain all sorts of tracepoints.
-        */
-       prev_state = exception_enter();
-       __do_page_fault(regs, error_code, address);
-       exception_exit(prev_state);
- }
- NOKPROBE_SYMBOL(do_page_fault);
- #ifdef CONFIG_TRACING
  static nokprobe_inline void
  trace_page_fault_entries(unsigned long address, struct pt_regs *regs,
                         unsigned long error_code)
                trace_page_fault_kernel(address, regs, error_code);
  }
  
+ /*
+  * We must have this function blacklisted from kprobes, tagged with notrace
+  * and call read_cr2() before calling anything else. To avoid calling any
+  * kind of tracing machinery before we've observed the CR2 value.
+  *
+  * exception_{enter,exit}() contains all sorts of tracepoints.
+  */
  dotraplinkage void notrace
- trace_do_page_fault(struct pt_regs *regs, unsigned long error_code)
+ do_page_fault(struct pt_regs *regs, unsigned long error_code)
  {
-       /*
-        * The exception_enter and tracepoint processing could
-        * trigger another page faults (user space callchain
-        * reading) and destroy the original cr2 value, so read
-        * the faulting address now.
-        */
-       unsigned long address = read_cr2();
+       unsigned long address = read_cr2(); /* Get the faulting address */
        enum ctx_state prev_state;
  
        prev_state = exception_enter();
-       trace_page_fault_entries(address, regs, error_code);
+       if (trace_pagefault_enabled())
+               trace_page_fault_entries(address, regs, error_code);
        __do_page_fault(regs, error_code, address);
        exception_exit(prev_state);
  }
- NOKPROBE_SYMBOL(trace_do_page_fault);
- #endif /* CONFIG_TRACING */
+ NOKPROBE_SYMBOL(do_page_fault);
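
The trace_pagefault_enabled() test above is what lets the trace and
non-trace page fault entry points collapse into one: when the tracepoint
is off, the check costs a single patched-out branch. A hedged sketch of
that pattern using a static key (the key name here is ours):

    #include <linux/jump_label.h>

    DEFINE_STATIC_KEY_FALSE(trace_pagefault_key);

    static inline bool trace_pagefault_enabled_sketch(void)
    {
            /* Compiles to a no-op jump that is live-patched when the
             * tracepoint is enabled. */
            return static_branch_unlikely(&trace_pagefault_key);
    }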
index 6c279c8f0a0efd07c3954d83509788287e58021e,3859fc19164a16d5da8dcbba36fdb2cf6d22689b..ae2a2e2d636286f36e570dde5b4cb9d29dc55f5b
@@@ -263,13 -263,6 +263,13 @@@ static void __init xen_init_capabilitie
        setup_clear_cpu_cap(X86_FEATURE_MTRR);
        setup_clear_cpu_cap(X86_FEATURE_ACC);
        setup_clear_cpu_cap(X86_FEATURE_X2APIC);
 +      setup_clear_cpu_cap(X86_FEATURE_SME);
 +
 +      /*
 +       * Xen PV would need some work to support PCID: CR3 handling as well
 +       * as xen_flush_tlb_others() would need updating.
 +       */
 +      setup_clear_cpu_cap(X86_FEATURE_PCID);
  
        if (!xen_initial_domain())
                setup_clear_cpu_cap(X86_FEATURE_ACPI);
@@@ -501,7 -494,7 +501,7 @@@ static void __init xen_load_gdt_boot(co
  static inline bool desc_equal(const struct desc_struct *d1,
                              const struct desc_struct *d2)
  {
-       return d1->a == d2->a && d1->b == d2->b;
+       return !memcmp(d1, d2, sizeof(*d1));
  }
  
  static void load_TLS_descriptor(struct thread_struct *t,
@@@ -586,59 -579,91 +586,91 @@@ static void xen_write_ldt_entry(struct 
        preempt_enable();
  }
  
+ #ifdef CONFIG_X86_64
+ struct trap_array_entry {
+       void (*orig)(void);
+       void (*xen)(void);
+       bool ist_okay;
+ };
+ static struct trap_array_entry trap_array[] = {
+       { debug,                       xen_xendebug,                    true },
+       { int3,                        xen_xenint3,                     true },
+       { double_fault,                xen_double_fault,                true },
+ #ifdef CONFIG_X86_MCE
+       { machine_check,               xen_machine_check,               true },
+ #endif
+       { nmi,                         xen_nmi,                         true },
+       { overflow,                    xen_overflow,                    false },
+ #ifdef CONFIG_IA32_EMULATION
+       { entry_INT80_compat,          xen_entry_INT80_compat,          false },
+ #endif
+       { page_fault,                  xen_page_fault,                  false },
+       { divide_error,                xen_divide_error,                false },
+       { bounds,                      xen_bounds,                      false },
+       { invalid_op,                  xen_invalid_op,                  false },
+       { device_not_available,        xen_device_not_available,        false },
+       { coprocessor_segment_overrun, xen_coprocessor_segment_overrun, false },
+       { invalid_TSS,                 xen_invalid_TSS,                 false },
+       { segment_not_present,         xen_segment_not_present,         false },
+       { stack_segment,               xen_stack_segment,               false },
+       { general_protection,          xen_general_protection,          false },
+       { spurious_interrupt_bug,      xen_spurious_interrupt_bug,      false },
+       { coprocessor_error,           xen_coprocessor_error,           false },
+       { alignment_check,             xen_alignment_check,             false },
+       { simd_coprocessor_error,      xen_simd_coprocessor_error,      false },
+ };
+ static bool get_trap_addr(void **addr, unsigned int ist)
+ {
+       unsigned int nr;
+       bool ist_okay = false;
+       /*
+        * Replace trap handler addresses by Xen specific ones.
+        * Check for known traps using IST and whitelist them.
+        * The debugger ones are the only ones we care about.
+        * Xen will handle faults like double_fault, so we should never see
+        * them.  Warn if there's an unexpected IST-using fault handler.
+        */
+       for (nr = 0; nr < ARRAY_SIZE(trap_array); nr++) {
+               struct trap_array_entry *entry = trap_array + nr;
+               if (*addr == entry->orig) {
+                       *addr = entry->xen;
+                       ist_okay = entry->ist_okay;
+                       break;
+               }
+       }
+       if (WARN_ON(ist != 0 && !ist_okay))
+               return false;
+       return true;
+ }
+ #endif
  static int cvt_gate_to_trap(int vector, const gate_desc *val,
                            struct trap_info *info)
  {
        unsigned long addr;
  
-       if (val->type != GATE_TRAP && val->type != GATE_INTERRUPT)
+       if (val->bits.type != GATE_TRAP && val->bits.type != GATE_INTERRUPT)
                return 0;
  
        info->vector = vector;
  
-       addr = gate_offset(*val);
+       addr = gate_offset(val);
  #ifdef CONFIG_X86_64
-       /*
-        * Look for known traps using IST, and substitute them
-        * appropriately.  The debugger ones are the only ones we care
-        * about.  Xen will handle faults like double_fault,
-        * so we should never see them.  Warn if
-        * there's an unexpected IST-using fault handler.
-        */
-       if (addr == (unsigned long)debug)
-               addr = (unsigned long)xen_debug;
-       else if (addr == (unsigned long)int3)
-               addr = (unsigned long)xen_int3;
-       else if (addr == (unsigned long)stack_segment)
-               addr = (unsigned long)xen_stack_segment;
-       else if (addr == (unsigned long)double_fault) {
-               /* Don't need to handle these */
+       if (!get_trap_addr((void **)&addr, val->bits.ist))
                return 0;
- #ifdef CONFIG_X86_MCE
-       } else if (addr == (unsigned long)machine_check) {
-               /*
-                * when xen hypervisor inject vMCE to guest,
-                * use native mce handler to handle it
-                */
-               ;
- #endif
-       } else if (addr == (unsigned long)nmi)
-               /*
-                * Use the native version as well.
-                */
-               ;
-       else {
-               /* Some other trap using IST? */
-               if (WARN_ON(val->ist != 0))
-                       return 0;
-       }
  #endif        /* CONFIG_X86_64 */
        info->address = addr;
  
-       info->cs = gate_segment(*val);
-       info->flags = val->dpl;
+       info->cs = gate_segment(val);
+       info->flags = val->bits.dpl;
        /* interrupt gates clear IF */
-       if (val->type == GATE_INTERRUPT)
+       if (val->bits.type == GATE_INTERRUPT)
                info->flags |= 1 << 2;
  
        return 1;
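
The val->bits.* accesses and the pointer-taking gate_offset(val) above
follow the gate descriptor rework in this series. A sketch of the
assumed 64-bit layout and accessor (struct and function names are ours;
field widths follow the hardware IDT entry format):

    #include <stdint.h>

    struct idt_bits_sketch {
            uint16_t ist  : 3,      /* IST stack index, 0 = none */
                     zero : 5,
                     type : 5,      /* gate type: interrupt or trap */
                     dpl  : 2,      /* descriptor privilege level */
                     p    : 1;      /* present bit */
    } __attribute__((packed));

    struct gate_desc_sketch {
            uint16_t               offset_low;
            uint16_t               segment;
            struct idt_bits_sketch bits;
            uint16_t               offset_middle;
            uint32_t               offset_high;
            uint32_t               reserved;
    } __attribute__((packed));

    /* Accessors take a pointer now, hence gate_offset(desc) rather
     * than gate_offset(*desc) in the hunks above. */
    static inline uint64_t gate_offset_sketch(const struct gate_desc_sketch *g)
    {
            return (uint64_t)g->offset_low |
                   ((uint64_t)g->offset_middle << 16) |
                   ((uint64_t)g->offset_high << 32);
    }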