Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm

author Linus Torvalds <torvalds@linux-foundation.org>
Tue, 22 Mar 2016 23:28:22 +0000 (16:28 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Tue, 22 Mar 2016 23:28:22 +0000 (16:28 -0700)
diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c

index 3e0fb66d8e053b5bbd7317d8bbcedf69950833dc..6accd66d26f0edb01a35df2c0628b86266301b62 100644 (file)
--- a/arch/arm/kvm/arm.c
+++ b/arch/arm/kvm/arm.c
@@ -373,7 +373,9 @@ static void exit_vm_noop(void *info)
  
  void force_vm_exit(const cpumask_t *mask)
  {
+       preempt_disable();
         smp_call_function_many(mask, exit_vm_noop, NULL, true);
+       preempt_enable();
  }
  
  /**
diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h

index 226f49d69ea98ed0a5dac3eb9448be4f460eb548..eb7490d232a0f39328c42fd646de8b5548feb178 100644 (file)
--- a/arch/arm64/include/asm/kvm_asm.h
+++ b/arch/arm64/include/asm/kvm_asm.h
@@ -26,7 +26,13 @@
  #define KVM_ARM64_DEBUG_DIRTY_SHIFT    0
  #define KVM_ARM64_DEBUG_DIRTY          (1 << KVM_ARM64_DEBUG_DIRTY_SHIFT)
  
-#define kvm_ksym_ref(sym)              phys_to_virt((u64)&sym - kimage_voffset)
+#define kvm_ksym_ref(sym)                                              \
+       ({                                                              \
+               void *val = &sym;                                       \
+               if (!is_kernel_in_hyp_mode())                           \
+                       val = phys_to_virt((u64)&sym - kimage_voffset); \
+               val;                                                    \
+        })
  
  #ifndef __ASSEMBLY__
  struct kvm;
diff --git a/arch/arm64/kvm/hyp/Makefile b/arch/arm64/kvm/hyp/Makefile

index b6a8fc5ad1afaa4b498b2d7172a78f8fcf61b541..778d0effa2afd38bd94b34b0a7a88c02ff2a79bf 100644 (file)
--- a/arch/arm64/kvm/hyp/Makefile
+++ b/arch/arm64/kvm/hyp/Makefile
@@ -16,3 +16,7 @@ obj-$(CONFIG_KVM_ARM_HOST) += fpsimd.o
  obj-$(CONFIG_KVM_ARM_HOST) += tlb.o
  obj-$(CONFIG_KVM_ARM_HOST) += hyp-entry.o
  obj-$(CONFIG_KVM_ARM_HOST) += s2-setup.o
+
+GCOV_PROFILE   := n
+KASAN_SANITIZE := n
+UBSAN_SANITIZE := n
diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile

index 7f7b6d86ac731af7241e4ce343a4ed6a8e0542c8..eba0bea6e032b67968771341dcaa1c0eb9624863 100644 (file)
--- a/arch/powerpc/kvm/Makefile
+++ b/arch/powerpc/kvm/Makefile
@@ -8,7 +8,8 @@ ccflags-y := -Ivirt/kvm -Iarch/powerpc/kvm
  KVM := ../../../virt/kvm
  
  common-objs-y = $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o \
-               $(KVM)/eventfd.o $(KVM)/vfio.o
+               $(KVM)/eventfd.o
+common-objs-$(CONFIG_KVM_VFIO) += $(KVM)/vfio.o
  
  CFLAGS_e500_mmu.o := -I.
  CFLAGS_e500_mmu_host.o := -I.
diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c

index 82970042295eb6ca30b78c57f2b1ba650cebb6f1..18cf6d1f81748ef4d4c0f32f4f21f69736e8e651 100644 (file)
--- a/arch/powerpc/kvm/book3s_64_vio.c
+++ b/arch/powerpc/kvm/book3s_64_vio.c
@@ -209,6 +209,32 @@ fail:
         return ret;
  }
  
+long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
+                     unsigned long ioba, unsigned long tce)
+{
+       struct kvmppc_spapr_tce_table *stt = kvmppc_find_table(vcpu, liobn);
+       long ret;
+
+       /* H_TOO_HARD if no table is found for liobn.  A failing ioba or tce */
+       /* validation code is returned as is; else kvmppc_tce_put() runs. */
+
+       if (!stt)
+               return H_TOO_HARD;
+
+       ret = kvmppc_ioba_validate(stt, ioba, 1);
+       if (ret != H_SUCCESS)
+               return ret;
+
+       ret = kvmppc_tce_validate(stt, tce);
+       if (ret != H_SUCCESS)
+               return ret;
+
+       kvmppc_tce_put(stt, ioba >> stt->page_shift, tce);
+
+       return H_SUCCESS;
+}
+EXPORT_SYMBOL_GPL(kvmppc_h_put_tce);
+
  long kvmppc_h_put_tce_indirect(struct kvm_vcpu *vcpu,
                 unsigned long liobn, unsigned long ioba,
                 unsigned long tce_list, unsigned long npages)
@@ -264,3 +290,29 @@ unlock_exit:
         return ret;
  }
  EXPORT_SYMBOL_GPL(kvmppc_h_put_tce_indirect);
+
+long kvmppc_h_stuff_tce(struct kvm_vcpu *vcpu,
+               unsigned long liobn, unsigned long ioba,
+               unsigned long tce_value, unsigned long npages)
+{
+       struct kvmppc_spapr_tce_table *stt;
+       long i, ret;
+
+       stt = kvmppc_find_table(vcpu, liobn);
+       if (!stt)
+               return H_TOO_HARD;
+
+       ret = kvmppc_ioba_validate(stt, ioba, npages);
+       if (ret != H_SUCCESS)
+               return ret;
+
+       /* Refuse only permission bits so userspace can poison TCEs for debug */
+       if (tce_value & (TCE_PCI_WRITE | TCE_PCI_READ))
+               return H_PARAMETER;
+
+       for (i = 0; i < npages; ++i, ioba += (1ULL << stt->page_shift))
+               kvmppc_tce_put(stt, ioba >> stt->page_shift, tce_value);
+
+       return H_SUCCESS;
+}
+EXPORT_SYMBOL_GPL(kvmppc_h_stuff_tce);
diff --git a/arch/powerpc/kvm/book3s_64_vio_hv.c b/arch/powerpc/kvm/book3s_64_vio_hv.c

index f88b859af53b5c85d71a35e3d11b4dace211674e..d461c440889aa1882d9fd68f46d98d94e3afed16 100644 (file)
--- a/arch/powerpc/kvm/book3s_64_vio_hv.c
+++ b/arch/powerpc/kvm/book3s_64_vio_hv.c
@@ -180,8 +180,8 @@ long kvmppc_gpa_to_ua(struct kvm *kvm, unsigned long gpa,
  EXPORT_SYMBOL_GPL(kvmppc_gpa_to_ua);
  
  #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
-long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
-                     unsigned long ioba, unsigned long tce)
+long kvmppc_rm_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
+               unsigned long ioba, unsigned long tce)
  {
         struct kvmppc_spapr_tce_table *stt = kvmppc_find_table(vcpu, liobn);
         long ret;
@@ -204,7 +204,6 @@ long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
  
         return H_SUCCESS;
  }
-EXPORT_SYMBOL_GPL(kvmppc_h_put_tce);
  
  static long kvmppc_rm_ua_to_hpa(struct kvm_vcpu *vcpu,
                 unsigned long ua, unsigned long *phpa)
@@ -296,7 +295,7 @@ unlock_exit:
         return ret;
  }
  
-long kvmppc_h_stuff_tce(struct kvm_vcpu *vcpu,
+long kvmppc_rm_h_stuff_tce(struct kvm_vcpu *vcpu,
                 unsigned long liobn, unsigned long ioba,
                 unsigned long tce_value, unsigned long npages)
  {
@@ -320,7 +319,6 @@ long kvmppc_h_stuff_tce(struct kvm_vcpu *vcpu,
  
         return H_SUCCESS;
  }
-EXPORT_SYMBOL_GPL(kvmppc_h_stuff_tce);
  
  long kvmppc_h_get_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
                       unsigned long ioba)
diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S

index 62ea3c6acdee44cae85ecf0ebd8abc69b73d043f..e571ad277398fd6625499c05ae03cde47a46d20a 100644 (file)
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -1942,7 +1942,7 @@ hcall_real_table:
         .long   DOTSYM(kvmppc_h_clear_ref) - hcall_real_table
         .long   DOTSYM(kvmppc_h_protect) - hcall_real_table
         .long   DOTSYM(kvmppc_h_get_tce) - hcall_real_table
-       .long   DOTSYM(kvmppc_h_put_tce) - hcall_real_table
+       .long   DOTSYM(kvmppc_rm_h_put_tce) - hcall_real_table
         .long   0               /* 0x24 - H_SET_SPRG0 */
         .long   DOTSYM(kvmppc_h_set_dabr) - hcall_real_table
         .long   0               /* 0x2c */
@@ -2020,7 +2020,7 @@ hcall_real_table:
         .long   0               /* 0x12c */
         .long   0               /* 0x130 */
         .long   DOTSYM(kvmppc_h_set_xdabr) - hcall_real_table
-       .long   DOTSYM(kvmppc_h_stuff_tce) - hcall_real_table
+       .long   DOTSYM(kvmppc_rm_h_stuff_tce) - hcall_real_table
         .long   DOTSYM(kvmppc_rm_h_put_tce_indirect) - hcall_real_table
         .long   0               /* 0x140 */
         .long   0               /* 0x144 */
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c

index 19aa59b0850cf73489d5f61623794da607a6fc41..6a68730774ee7dd701358cbc6cfce36c37db4009 100644 (file)
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -96,6 +96,9 @@ int kvmppc_prepare_to_enter(struct kvm_vcpu *vcpu)
                  * so we don't miss a request because the requester sees
                  * OUTSIDE_GUEST_MODE and assumes we'll be checking requests
                  * before next entering the guest (and thus doesn't IPI).
+                * This also orders the write to mode with respect to any
+                * reads of the page tables done while the VCPU is running.
+                * Please see the comment in kvm_flush_remote_tlbs.
                  */
                 smp_mb();
  
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h

index 01c8b501cb6d5afbdd5eb1c5dfaba366ea158cf8..f62a9f37f79f6c15221aed106f34508d05940be8 100644 (file)
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -84,7 +84,8 @@
                           | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE     \
                           | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR | X86_CR4_PCIDE \
                           | X86_CR4_OSXSAVE | X86_CR4_SMEP | X86_CR4_FSGSBASE \
-                         | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE | X86_CR4_SMAP))
+                         | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE | X86_CR4_SMAP \
+                         | X86_CR4_PKE))
  
  #define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR)
  
@@ -187,12 +188,14 @@ enum {
  #define PFERR_USER_BIT 2
  #define PFERR_RSVD_BIT 3
  #define PFERR_FETCH_BIT 4
+#define PFERR_PK_BIT 5
  
  #define PFERR_PRESENT_MASK (1U << PFERR_PRESENT_BIT)
  #define PFERR_WRITE_MASK (1U << PFERR_WRITE_BIT)
  #define PFERR_USER_MASK (1U << PFERR_USER_BIT)
  #define PFERR_RSVD_MASK (1U << PFERR_RSVD_BIT)
  #define PFERR_FETCH_MASK (1U << PFERR_FETCH_BIT)
+#define PFERR_PK_MASK (1U << PFERR_PK_BIT)
  
  /* apic attention bits */
  #define KVM_APIC_CHECK_VAPIC   0
@@ -335,6 +338,14 @@ struct kvm_mmu {
          */
         u8 permissions[16];
  
+       /*
+        * The pkru_mask indicates if protection key checks are needed.  It
+        * consists of 16 domains indexed by page fault error code bits [4:1],
+        * with PFEC.RSVD replaced by ACC_USER_MASK from the page tables.
+        * Each domain has 2 bits which are ANDed with AD and WD from PKRU.
+        */
+       u32 pkru_mask;
+
         u64 *pae_root;
         u64 *lm_root;
  
@@ -874,6 +885,7 @@ struct kvm_x86_ops {
         void (*cache_reg)(struct kvm_vcpu *vcpu, enum kvm_reg reg);
         unsigned long (*get_rflags)(struct kvm_vcpu *vcpu);
         void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags);
+       u32 (*get_pkru)(struct kvm_vcpu *vcpu);
         void (*fpu_activate)(struct kvm_vcpu *vcpu);
         void (*fpu_deactivate)(struct kvm_vcpu *vcpu);
  
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h

index 1ff49ec29ecede42b78c49132e431ea4ed193026..97f3242e133ccc9c2866baade8ca4f82ade65f04 100644 (file)
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -107,6 +107,12 @@ static inline u32 read_pkru(void)
         return 0;
  }
  
+static inline void write_pkru(u32 pkru)
+{
+       if (boot_cpu_has(X86_FEATURE_OSPKE))
+               __write_pkru(pkru);
+}
+
  static inline int pte_young(pte_t pte)
  {
         return pte_flags(pte) & _PAGE_ACCESSED;
diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h

index aee6e76e561ec0cd24cfe1f92f4dc9cf6e24ece4..d96d0437776569f5c9c0e6f28d125dbc5671d037 100644 (file)
--- a/arch/x86/include/asm/special_insns.h
+++ b/arch/x86/include/asm/special_insns.h
@@ -113,11 +113,27 @@ static inline u32 __read_pkru(void)
                      : "c" (ecx));
         return pkru;
  }
+
+static inline void __write_pkru(u32 pkru)
+{
+       u32 ecx = 0, edx = 0;
+
+       /*
+        * Hand-encoded "wrpkru" instruction: loads the contents of EAX
+        * into PKRU, and requires that ecx = edx = 0.
+        */
+       asm volatile(".byte 0x0f,0x01,0xef\n\t"
+                    : : "a" (pkru), "c"(ecx), "d"(edx));
+}
  #else
  static inline u32 __read_pkru(void)
  {
         return 0;
  }
+
+static inline void __write_pkru(u32 pkru)
+{
+}
  #endif
  
  static inline void native_wbinvd(void)
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c

index 47190bd399e7ef5680172859fa78d9ae1ba0e595..807950860fb7028e28fe1e98d8a2cddeccfa8063 100644 (file)
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -36,6 +36,7 @@
  #include <linux/kprobes.h>
  #include <linux/debugfs.h>
  #include <linux/nmi.h>
+#include <linux/swait.h>
  #include <asm/timer.h>
  #include <asm/cpu.h>
  #include <asm/traps.h>
@@ -91,14 +92,14 @@ static void kvm_io_delay(void)
  
  struct kvm_task_sleep_node {
         struct hlist_node link;
-       wait_queue_head_t wq;
+       struct swait_queue_head wq;
         u32 token;
         int cpu;
         bool halted;
  };
  
  static struct kvm_task_sleep_head {
-       spinlock_t lock;
+       raw_spinlock_t lock;
         struct hlist_head list;
  } async_pf_sleepers[KVM_TASK_SLEEP_HASHSIZE];
  
@@ -122,17 +123,17 @@ void kvm_async_pf_task_wait(u32 token)
         u32 key = hash_32(token, KVM_TASK_SLEEP_HASHBITS);
         struct kvm_task_sleep_head *b = &async_pf_sleepers[key];
         struct kvm_task_sleep_node n, *e;
-       DEFINE_WAIT(wait);
+       DECLARE_SWAITQUEUE(wait);
  
         rcu_irq_enter();
  
-       spin_lock(&b->lock);
+       raw_spin_lock(&b->lock);
         e = _find_apf_task(b, token);
         if (e) {
                 /* dummy entry exist -> wake up was delivered ahead of PF */
                 hlist_del(&e->link);
                 kfree(e);
-               spin_unlock(&b->lock);
+               raw_spin_unlock(&b->lock);
  
                 rcu_irq_exit();
                 return;
@@ -141,13 +142,13 @@ void kvm_async_pf_task_wait(u32 token)
         n.token = token;
         n.cpu = smp_processor_id();
         n.halted = is_idle_task(current) || preempt_count() > 1;
-       init_waitqueue_head(&n.wq);
+       init_swait_queue_head(&n.wq);
         hlist_add_head(&n.link, &b->list);
-       spin_unlock(&b->lock);
+       raw_spin_unlock(&b->lock);
  
         for (;;) {
                 if (!n.halted)
-                       prepare_to_wait(&n.wq, &wait, TASK_UNINTERRUPTIBLE);
+                       prepare_to_swait(&n.wq, &wait, TASK_UNINTERRUPTIBLE);
                 if (hlist_unhashed(&n.link))
                         break;
  
@@ -166,7 +167,7 @@ void kvm_async_pf_task_wait(u32 token)
                 }
         }
         if (!n.halted)
-               finish_wait(&n.wq, &wait);
+               finish_swait(&n.wq, &wait);
  
         rcu_irq_exit();
         return;
@@ -178,8 +179,8 @@ static void apf_task_wake_one(struct kvm_task_sleep_node *n)
         hlist_del_init(&n->link);
         if (n->halted)
                 smp_send_reschedule(n->cpu);
-       else if (waitqueue_active(&n->wq))
-               wake_up(&n->wq);
+       else if (swait_active(&n->wq))
+               swake_up(&n->wq);
  }
  
  static void apf_task_wake_all(void)
@@ -189,14 +190,14 @@ static void apf_task_wake_all(void)
         for (i = 0; i < KVM_TASK_SLEEP_HASHSIZE; i++) {
                 struct hlist_node *p, *next;
                 struct kvm_task_sleep_head *b = &async_pf_sleepers[i];
-               spin_lock(&b->lock);
+               raw_spin_lock(&b->lock);
                 hlist_for_each_safe(p, next, &b->list) {
                         struct kvm_task_sleep_node *n =
                                 hlist_entry(p, typeof(*n), link);
                         if (n->cpu == smp_processor_id())
                                 apf_task_wake_one(n);
                 }
-               spin_unlock(&b->lock);
+               raw_spin_unlock(&b->lock);
         }
  }
  
@@ -212,7 +213,7 @@ void kvm_async_pf_task_wake(u32 token)
         }
  
  again:
-       spin_lock(&b->lock);
+       raw_spin_lock(&b->lock);
         n = _find_apf_task(b, token);
         if (!n) {
                 /*
@@ -225,17 +226,17 @@ again:
                          * Allocation failed! Busy wait while other cpu
                          * handles async PF.
                          */
-                       spin_unlock(&b->lock);
+                       raw_spin_unlock(&b->lock);
                         cpu_relax();
                         goto again;
                 }
                 n->token = token;
                 n->cpu = smp_processor_id();
-               init_waitqueue_head(&n->wq);
+               init_swait_queue_head(&n->wq);
                 hlist_add_head(&n->link, &b->list);
         } else
                 apf_task_wake_one(n);
-       spin_unlock(&b->lock);
+       raw_spin_unlock(&b->lock);
         return;
  }
  EXPORT_SYMBOL_GPL(kvm_async_pf_task_wake);
@@ -486,7 +487,7 @@ void __init kvm_guest_init(void)
         paravirt_ops_setup();
         register_reboot_notifier(&kvm_pv_reboot_nb);
         for (i = 0; i < KVM_TASK_SLEEP_HASHSIZE; i++)
-               spin_lock_init(&async_pf_sleepers[i].lock);
+               raw_spin_lock_init(&async_pf_sleepers[i].lock);
         if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF))
                 x86_init.irqs.trap_init = kvm_apf_trap_init;
  
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c

index 0029644bf09c39117ef7c1d2cca05561341a24e1..8efb839948e512e9aac6aaf544230195614297c1 100644 (file)
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -88,6 +88,16 @@ int kvm_update_cpuid(struct kvm_vcpu *vcpu)
                         apic->lapic_timer.timer_mode_mask = 1 << 17;
         }
  
+       best = kvm_find_cpuid_entry(vcpu, 7, 0);
+       if (best) {
+               /* Update OSPKE bit */
+               if (boot_cpu_has(X86_FEATURE_PKU) && best->function == 0x7) {
+                       best->ecx &= ~F(OSPKE);
+                       if (kvm_read_cr4_bits(vcpu, X86_CR4_PKE))
+                               best->ecx |= F(OSPKE);
+               }
+       }
+
         best = kvm_find_cpuid_entry(vcpu, 0xD, 0);
         if (!best) {
                 vcpu->arch.guest_supported_xcr0 = 0;
@@ -305,7 +315,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
         unsigned f_xsaves = kvm_x86_ops->xsaves_supported() ? F(XSAVES) : 0;
  
         /* cpuid 1.edx */
-       const u32 kvm_supported_word0_x86_features =
+       const u32 kvm_cpuid_1_edx_x86_features =
                 F(FPU) | F(VME) | F(DE) | F(PSE) |
                 F(TSC) | F(MSR) | F(PAE) | F(MCE) |
                 F(CX8) | F(APIC) | 0 /* Reserved */ | F(SEP) |
@@ -315,7 +325,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
                 F(FXSR) | F(XMM) | F(XMM2) | F(SELFSNOOP) |
                 0 /* HTT, TM, Reserved, PBE */;
         /* cpuid 0x80000001.edx */
-       const u32 kvm_supported_word1_x86_features =
+       const u32 kvm_cpuid_8000_0001_edx_x86_features =
                 F(FPU) | F(VME) | F(DE) | F(PSE) |
                 F(TSC) | F(MSR) | F(PAE) | F(MCE) |
                 F(CX8) | F(APIC) | 0 /* Reserved */ | F(SYSCALL) |
@@ -325,7 +335,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
                 F(FXSR) | F(FXSR_OPT) | f_gbpages | f_rdtscp |
                 0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW);
         /* cpuid 1.ecx */
-       const u32 kvm_supported_word4_x86_features =
+       const u32 kvm_cpuid_1_ecx_x86_features =
                 /* NOTE: MONITOR (and MWAIT) are emulated as NOP,
                  * but *not* advertised to guests via CPUID ! */
                 F(XMM3) | F(PCLMULQDQ) | 0 /* DTES64, MONITOR */ |
@@ -337,29 +347,32 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
                 0 /* Reserved*/ | F(AES) | F(XSAVE) | 0 /* OSXSAVE */ | F(AVX) |
                 F(F16C) | F(RDRAND);
         /* cpuid 0x80000001.ecx */
-       const u32 kvm_supported_word6_x86_features =
+       const u32 kvm_cpuid_8000_0001_ecx_x86_features =
                 F(LAHF_LM) | F(CMP_LEGACY) | 0 /*SVM*/ | 0 /* ExtApicSpace */ |
                 F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) |
                 F(3DNOWPREFETCH) | F(OSVW) | 0 /* IBS */ | F(XOP) |
                 0 /* SKINIT, WDT, LWP */ | F(FMA4) | F(TBM);
  
         /* cpuid 0xC0000001.edx */
-       const u32 kvm_supported_word5_x86_features =
+       const u32 kvm_cpuid_C000_0001_edx_x86_features =
                 F(XSTORE) | F(XSTORE_EN) | F(XCRYPT) | F(XCRYPT_EN) |
                 F(ACE2) | F(ACE2_EN) | F(PHE) | F(PHE_EN) |
                 F(PMM) | F(PMM_EN);
  
         /* cpuid 7.0.ebx */
-       const u32 kvm_supported_word9_x86_features =
+       const u32 kvm_cpuid_7_0_ebx_x86_features =
                 F(FSGSBASE) | F(BMI1) | F(HLE) | F(AVX2) | F(SMEP) |
                 F(BMI2) | F(ERMS) | f_invpcid | F(RTM) | f_mpx | F(RDSEED) |
                 F(ADX) | F(SMAP) | F(AVX512F) | F(AVX512PF) | F(AVX512ER) |
                 F(AVX512CD) | F(CLFLUSHOPT) | F(CLWB) | F(PCOMMIT);
  
         /* cpuid 0xD.1.eax */
-       const u32 kvm_supported_word10_x86_features =
+       const u32 kvm_cpuid_D_1_eax_x86_features =
                 F(XSAVEOPT) | F(XSAVEC) | F(XGETBV1) | f_xsaves;
  
+       /* cpuid 7.0.ecx */
+       const u32 kvm_cpuid_7_0_ecx_x86_features = F(PKU) | 0 /*OSPKE*/;
+
         /* all calls to cpuid_count() should be made on the same cpu */
         get_cpu();
  
@@ -376,10 +389,10 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
                 entry->eax = min(entry->eax, (u32)0xd);
                 break;
         case 1:
-               entry->edx &= kvm_supported_word0_x86_features;
-               cpuid_mask(&entry->edx, 0);
-               entry->ecx &= kvm_supported_word4_x86_features;
-               cpuid_mask(&entry->ecx, 4);
+               entry->edx &= kvm_cpuid_1_edx_x86_features;
+               cpuid_mask(&entry->edx, CPUID_1_EDX);
+               entry->ecx &= kvm_cpuid_1_ecx_x86_features;
+               cpuid_mask(&entry->ecx, CPUID_1_ECX);
                 /* we support x2apic emulation even if host does not support
                  * it since we emulate x2apic in software */
                 entry->ecx |= F(X2APIC);
@@ -433,14 +446,20 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
                 entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
                 /* Mask ebx against host capability word 9 */
                 if (index == 0) {
-                       entry->ebx &= kvm_supported_word9_x86_features;
-                       cpuid_mask(&entry->ebx, 9);
+                       entry->ebx &= kvm_cpuid_7_0_ebx_x86_features;
+                       cpuid_mask(&entry->ebx, CPUID_7_0_EBX);
                         // TSC_ADJUST is emulated
                         entry->ebx |= F(TSC_ADJUST);
-               } else
+                       entry->ecx &= kvm_cpuid_7_0_ecx_x86_features;
+                       cpuid_mask(&entry->ecx, CPUID_7_ECX);
+                       /* PKU is not yet implemented for shadow paging. */
+                       if (!tdp_enabled)
+                               entry->ecx &= ~F(PKU);
+               } else {
                         entry->ebx = 0;
+                       entry->ecx = 0;
+               }
                 entry->eax = 0;
-               entry->ecx = 0;
                 entry->edx = 0;
                 break;
         }
@@ -514,7 +533,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
  
                         do_cpuid_1_ent(&entry[i], function, idx);
                         if (idx == 1) {
-                               entry[i].eax &= kvm_supported_word10_x86_features;
+                               entry[i].eax &= kvm_cpuid_D_1_eax_x86_features;
                                 entry[i].ebx = 0;
                                 if (entry[i].eax & (F(XSAVES)|F(XSAVEC)))
                                         entry[i].ebx =
@@ -564,10 +583,10 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
                 entry->eax = min(entry->eax, 0x8000001a);
                 break;
         case 0x80000001:
-               entry->edx &= kvm_supported_word1_x86_features;
-               cpuid_mask(&entry->edx, 1);
-               entry->ecx &= kvm_supported_word6_x86_features;
-               cpuid_mask(&entry->ecx, 6);
+               entry->edx &= kvm_cpuid_8000_0001_edx_x86_features;
+               cpuid_mask(&entry->edx, CPUID_8000_0001_EDX);
+               entry->ecx &= kvm_cpuid_8000_0001_ecx_x86_features;
+               cpuid_mask(&entry->ecx, CPUID_8000_0001_ECX);
                 break;
         case 0x80000007: /* Advanced power management */
                 /* invariant TSC is CPUID.80000007H:EDX[8] */
@@ -600,8 +619,8 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
                 entry->eax = min(entry->eax, 0xC0000004);
                 break;
         case 0xC0000001:
-               entry->edx &= kvm_supported_word5_x86_features;
-               cpuid_mask(&entry->edx, 5);
+               entry->edx &= kvm_cpuid_C000_0001_edx_x86_features;
+               cpuid_mask(&entry->edx, CPUID_C000_0001_EDX);
                 break;
         case 3: /* Processor serial number */
         case 5: /* MONITOR/MWAIT */
diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h

index 66a6581724ad264aa2f7d9e2678efa243a471fbd..e17a74b1d8525708a051c18f93854af5667805be 100644 (file)
--- a/arch/x86/kvm/cpuid.h
+++ b/arch/x86/kvm/cpuid.h
@@ -80,6 +80,14 @@ static inline bool guest_cpuid_has_fsgsbase(struct kvm_vcpu *vcpu)
         return best && (best->ebx & bit(X86_FEATURE_FSGSBASE));
  }
  
+static inline bool guest_cpuid_has_pku(struct kvm_vcpu *vcpu)
+{
+       struct kvm_cpuid_entry2 *best;
+
+       best = kvm_find_cpuid_entry(vcpu, 7, 0);
+       return best && (best->ecx & bit(X86_FEATURE_PKU));
+}
+
  static inline bool guest_cpuid_has_longmode(struct kvm_vcpu *vcpu)
  {
         struct kvm_cpuid_entry2 *best;
diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h

index e1e89ee4af750dc51f78cfbf8aa71e22d77b4cd1..762cdf2595f992fd4ac8bb1e4c2c8914b344db04 100644 (file)
--- a/arch/x86/kvm/kvm_cache_regs.h
+++ b/arch/x86/kvm/kvm_cache_regs.h
@@ -84,6 +84,11 @@ static inline u64 kvm_read_edx_eax(struct kvm_vcpu *vcpu)
                 | ((u64)(kvm_register_read(vcpu, VCPU_REGS_RDX) & -1u) << 32);
  }
  
+static inline u32 kvm_read_pkru(struct kvm_vcpu *vcpu)
+{
+       return kvm_x86_ops->get_pkru(vcpu);
+}
+
  static inline void enter_guest_mode(struct kvm_vcpu *vcpu)
  {
         vcpu->arch.hflags |= HF_GUEST_MASK;
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c

index c512f095cdac82b9e2ba258ae052a9a4199dc13c..6bdfbc23ecaa8fc779085076bc08f340fc704513 100644 (file)
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -632,12 +632,12 @@ static void walk_shadow_page_lockless_begin(struct kvm_vcpu *vcpu)
          * kvm_flush_remote_tlbs() IPI to all active vcpus.
          */
         local_irq_disable();
-       vcpu->mode = READING_SHADOW_PAGE_TABLES;
+
         /*
          * Make sure a following spte read is not reordered ahead of the write
          * to vcpu->mode.
          */
-       smp_mb();
+       smp_store_mb(vcpu->mode, READING_SHADOW_PAGE_TABLES);
  }
  
  static void walk_shadow_page_lockless_end(struct kvm_vcpu *vcpu)
@@ -647,8 +647,7 @@ static void walk_shadow_page_lockless_end(struct kvm_vcpu *vcpu)
          * reads to sptes.  If it does, kvm_commit_zap_page() can see us
          * OUTSIDE_GUEST_MODE and proceed to free the shadow page table.
          */
-       smp_mb();
-       vcpu->mode = OUTSIDE_GUEST_MODE;
+       smp_store_release(&vcpu->mode, OUTSIDE_GUEST_MODE);
         local_irq_enable();
  }
  
@@ -2390,14 +2389,13 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm,
                 return;
  
         /*
-        * wmb: make sure everyone sees our modifications to the page tables
-        * rmb: make sure we see changes to vcpu->mode
-        */
-       smp_mb();
-
-       /*
-        * Wait for all vcpus to exit guest mode and/or lockless shadow
-        * page table walks.
+        * We need to make sure everyone sees our modifications to the
+        * page tables, and that we see changes to vcpu->mode here. The
+        * barrier in kvm_flush_remote_tlbs() achieves this. This pairs
+        * with vcpu_enter_guest and walk_shadow_page_lockless_begin/end.
+        *
+        * In addition, kvm_flush_remote_tlbs waits for all vcpus to exit
+        * guest mode and/or lockless shadow page table walks.
          */
         kvm_flush_remote_tlbs(kvm);
  
@@ -3923,6 +3921,81 @@ static void update_permission_bitmask(struct kvm_vcpu *vcpu,
         }
  }
  
+/*
+ * PKU is an additional mechanism by which the paging controls access to
+ * user-mode addresses based on the value in the PKRU register.  Protection
+ * key violations are reported through a bit in the page fault error code.
+ * Unlike other bits of the error code, the PK bit is not known at the
+ * call site of e.g. gva_to_gpa; it must be computed directly in
+ * permission_fault based on two bits of PKRU, on some machine state (CR4,
+ * CR0, EFER, CPL), and on other bits of the error code and the page tables.
+ *
+ * In particular the following conditions come from the error code, the
+ * page tables and the machine state:
+ * - PK is always zero unless CR4.PKE=1 and EFER.LMA=1
+ * - PK is always zero if RSVD=1 (reserved bit set) or F=1 (instruction fetch)
+ * - PK is always zero if U=0 in the page tables
+ * - PKRU.WD is ignored if CR0.WP=0 and the access is a supervisor access.
+ *
+ * The PKRU bitmask caches the result of these four conditions.  The error
+ * code (minus the P bit) and the page table's U bit form an index into the
+ * PKRU bitmask.  Two bits of the PKRU bitmask are then extracted and ANDed
+ * with the two bits of the PKRU register corresponding to the protection key.
+ * For the first three conditions above the bits will be 00, thus masking
+ * away both AD and WD.  For all reads or if the last condition holds, WD
+ * only will be masked away.  The mask is rebuilt from zero on each call.
+ */
+static void update_pkru_bitmask(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
+                               bool ept)
+{
+       unsigned bit;
+       bool wp;
+
+       /* The loop below only ORs bits in, so start from an empty mask. */
+       mmu->pkru_mask = 0;
+
+       /* An EPT context keeps the all-zero mask. */
+       if (ept)
+               return;
+
+       /* PKEY is enabled only if CR4.PKE and EFER.LMA are both set. */
+       if (!kvm_read_cr4_bits(vcpu, X86_CR4_PKE) || !is_long_mode(vcpu))
+               return;
+
+       wp = is_write_protection(vcpu);
+
+       for (bit = 0; bit < ARRAY_SIZE(mmu->permissions); ++bit) {
+               unsigned pfec, pkey_bits;
+               bool check_pkey, check_write, ff, uf, wf, pte_user;
+
+               pfec = bit << 1;
+               ff = pfec & PFERR_FETCH_MASK;
+               uf = pfec & PFERR_USER_MASK;
+               wf = pfec & PFERR_WRITE_MASK;
+
+               /* PFEC.RSVD is replaced by ACC_USER_MASK. */
+               pte_user = pfec & PFERR_RSVD_MASK;
+
+               /*
+                * Only need to check the access which is not an
+                * instruction fetch and is to a user page.
+                */
+               check_pkey = (!ff && pte_user);
+               /*
+                * write access is controlled by PKRU if it is a
+                * user access or CR0.WP = 1.
+                */
+               check_write = check_pkey && wf && (uf || wp);
+
+               /* PKRU.AD stops both read and write access. */
+               pkey_bits = !!check_pkey;
+               /* PKRU.WD stops write access. */
+               pkey_bits |= (!!check_write) << 1;
+
+               mmu->pkru_mask |= (pkey_bits & 3) << pfec;
+       }
+}
+
  static void update_last_nonleaf_level(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu)
  {
         unsigned root_level = mmu->root_level;
@@ -3941,6 +4014,7 @@ static void paging64_init_context_common(struct kvm_vcpu *vcpu,
  
         reset_rsvds_bits_mask(vcpu, context);
         update_permission_bitmask(vcpu, context, false);
+       update_pkru_bitmask(vcpu, context, false);
         update_last_nonleaf_level(vcpu, context);
  
         MMU_WARN_ON(!is_pae(vcpu));
@@ -3968,6 +4042,7 @@ static void paging32_init_context(struct kvm_vcpu *vcpu,
  
         reset_rsvds_bits_mask(vcpu, context);
         update_permission_bitmask(vcpu, context, false);
+       update_pkru_bitmask(vcpu, context, false);
         update_last_nonleaf_level(vcpu, context);
  
         context->page_fault = paging32_page_fault;
@@ -4026,6 +4101,7 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
         }
  
         update_permission_bitmask(vcpu, context, false);
+       update_pkru_bitmask(vcpu, context, false);
         update_last_nonleaf_level(vcpu, context);
         reset_tdp_shadow_zero_bits_mask(vcpu, context);
  }
@@ -4078,6 +4154,7 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly)
         context->direct_map = false;
  
         update_permission_bitmask(vcpu, context, true);
+       update_pkru_bitmask(vcpu, context, true);
         reset_rsvds_bits_mask_ept(vcpu, context, execonly);
         reset_ept_shadow_zero_bits_mask(vcpu, context, execonly);
  }
@@ -4132,6 +4209,7 @@ static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu)
         }
  
         update_permission_bitmask(vcpu, g_context, false);
+       update_pkru_bitmask(vcpu, g_context, false);
         update_last_nonleaf_level(vcpu, g_context);
  }
  
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h

index 58fe98a0a526c00f1126ee3af212428ff5123b06..b70df72e2b33d417307d01f57bca57388650d34f 100644 (file)
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -10,10 +10,11 @@
  #define PT32_ENT_PER_PAGE (1 << PT32_PT_BITS)
  
  #define PT_WRITABLE_SHIFT 1
+#define PT_USER_SHIFT 2
  
  #define PT_PRESENT_MASK (1ULL << 0)
  #define PT_WRITABLE_MASK (1ULL << PT_WRITABLE_SHIFT)
-#define PT_USER_MASK (1ULL << 2)
+#define PT_USER_MASK (1ULL << PT_USER_SHIFT)
  #define PT_PWT_MASK (1ULL << 3)
  #define PT_PCD_MASK (1ULL << 4)
  #define PT_ACCESSED_SHIFT 5
@@ -141,11 +142,16 @@ static inline bool is_write_protection(struct kvm_vcpu *vcpu)
  }
  
  /*
- * Will a fault with a given page-fault error code (pfec) cause a permission
- * fault with the given access (in ACC_* format)?
+ * Check if a given access (described through the I/D, W/R and U/S bits of a
+ * page fault error code pfec) causes a permission fault with the given PTE
+ * access rights (in ACC_* format).
+ *
+ * Return zero if the access does not fault; return the page fault error code
+ * if the access faults.
   */
-static inline bool permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
-                                   unsigned pte_access, unsigned pfec)
+static inline u8 permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
+                                 unsigned pte_access, unsigned pte_pkey,
+                                 unsigned pfec)
  {
         int cpl = kvm_x86_ops->get_cpl(vcpu);
         unsigned long rflags = kvm_x86_ops->get_rflags(vcpu);
@@ -166,10 +172,32 @@ static inline bool permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
         unsigned long smap = (cpl - 3) & (rflags & X86_EFLAGS_AC);
         int index = (pfec >> 1) +
                     (smap >> (X86_EFLAGS_AC_BIT - PFERR_RSVD_BIT + 1));
+       bool fault = (mmu->permissions[index] >> pte_access) & 1;
+
+       WARN_ON(pfec & (PFERR_PK_MASK | PFERR_RSVD_MASK));
+       pfec |= PFERR_PRESENT_MASK;
+
+       if (unlikely(mmu->pkru_mask)) {
+               u32 pkru_bits, offset;
+
+               /*
+               * PKRU defines 32 bits, there are 16 domains and 2
+               * attribute bits per domain in pkru.  pte_pkey is the
+               * index of the protection domain, so pte_pkey * 2 is
+               * the index of the first bit for the domain.
+               */
+               pkru_bits = (kvm_read_pkru(vcpu) >> (pte_pkey * 2)) & 3;
+
+               /* clear present bit, replace PFEC.RSVD with ACC_USER_MASK. */
+               offset = pfec - 1 +
+                       ((pte_access & PT_USER_MASK) << (PFERR_RSVD_BIT - PT_USER_SHIFT));
  
-       WARN_ON(pfec & PFERR_RSVD_MASK);
+               pkru_bits &= mmu->pkru_mask >> offset;
+               pfec |= -pkru_bits & PFERR_PK_MASK;
+               fault |= (pkru_bits != 0);
+       }
  
-       return (mmu->permissions[index] >> pte_access) & 1;
+       return -(uint32_t)fault & pfec;
  }
  
  void kvm_mmu_invalidate_zap_all_pages(struct kvm *kvm);
diff --git a/arch/x86/kvm/page_track.c b/arch/x86/kvm/page_track.c

index 11f76436f74f0d762123a4dc38dbcf31b0c47f53..b431539c3714b0e9b72420180aee914583699dbb 100644 (file)
--- a/arch/x86/kvm/page_track.c
+++ b/arch/x86/kvm/page_track.c
@@ -142,12 +142,17 @@ void kvm_slot_page_track_remove_page(struct kvm *kvm,
  bool kvm_page_track_is_active(struct kvm_vcpu *vcpu, gfn_t gfn,
                               enum kvm_page_track_mode mode)
  {
-       struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
-       int index = gfn_to_index(gfn, slot->base_gfn, PT_PAGE_TABLE_LEVEL);
+       struct kvm_memory_slot *slot;
+       int index;
  
         if (WARN_ON(!page_track_mode_is_valid(mode)))
                 return false;
  
+       slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
+       if (!slot)
+               return false;
+
+       index = gfn_to_index(gfn, slot->base_gfn, PT_PAGE_TABLE_LEVEL);
         return !!ACCESS_ONCE(slot->arch.gfn_track[mode][index]);
  }
  
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h

index e159a8185ad9af081de317ff6b686d1ee3eb1d4b..1d971c7553c3847f0d1335487ce551a19875b709 100644 (file)
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -257,6 +257,17 @@ static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu,
         return 0;
  }
  
+/*
+ * Return the protection key encoded in a guest pte.  Only the
+ * PTTYPE == 64 instantiation of this template decodes a key; every
+ * other instantiation reports key 0.
+ */
+static inline unsigned FNAME(gpte_pkeys)(struct kvm_vcpu *vcpu, u64 gpte)
+{
+#if PTTYPE == 64
+       pte_t guest_pte = {.pte = gpte};
+
+       return pte_flags_pkey(pte_flags(guest_pte));
+#else
+       return 0;
+#endif
+}
+
  /*
   * Fetch a guest pte for a guest virtual address
   */
@@ -268,7 +279,7 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker,
         pt_element_t pte;
         pt_element_t __user *uninitialized_var(ptep_user);
         gfn_t table_gfn;
-       unsigned index, pt_access, pte_access, accessed_dirty;
+       unsigned index, pt_access, pte_access, accessed_dirty, pte_pkey;
         gpa_t pte_gpa;
         int offset;
         const int write_fault = access & PFERR_WRITE_MASK;
@@ -359,10 +370,10 @@ retry_walk:
                 walker->ptes[walker->level - 1] = pte;
         } while (!is_last_gpte(mmu, walker->level, pte));
  
-       if (unlikely(permission_fault(vcpu, mmu, pte_access, access))) {
-               errcode |= PFERR_PRESENT_MASK;
+       pte_pkey = FNAME(gpte_pkeys)(vcpu, pte);
+       errcode = permission_fault(vcpu, mmu, pte_access, pte_pkey, access);
+       if (unlikely(errcode))
                 goto error;
-       }
  
         gfn = gpte_to_gfn_lvl(pte, walker->level);
         gfn += (addr & PT_LVL_OFFSET_MASK(walker->level)) >> PAGE_SHIFT;
@@ -949,6 +960,12 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
                         return 0;
  
                 if (FNAME(prefetch_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte)) {
+                       /*
+                        * Update spte before increasing tlbs_dirty to make
+                        * sure no tlb flush is lost after spte is zapped; see
+                        * the comments in kvm_flush_remote_tlbs().
+                        */
+                       smp_wmb();
                         vcpu->kvm->tlbs_dirty++;
                         continue;
                 }
@@ -964,6 +981,11 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
  
                 if (gfn != sp->gfns[i]) {
                         drop_spte(vcpu->kvm, &sp->spt[i]);
+                       /*
+                        * The same as above where we are doing
+                        * prefetch_invalid_gpte().
+                        */
+                       smp_wmb();
                         vcpu->kvm->tlbs_dirty++;
                         continue;
                 }
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c

index 95070386d5991baef11e2934349727af714cfce3..31346a3f20a5c8b5384e6fda2a81029059cdf621 100644 (file)
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1280,6 +1280,11 @@ static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
         to_svm(vcpu)->vmcb->save.rflags = rflags;
  }
  
+/* get_pkru hook for SVM: unconditionally reports a PKRU value of 0. */
+static u32 svm_get_pkru(struct kvm_vcpu *vcpu)
+{
+       return 0;
+}
+
  static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
  {
         switch (reg) {
@@ -4347,6 +4352,9 @@ static struct kvm_x86_ops svm_x86_ops = {
         .cache_reg = svm_cache_reg,
         .get_rflags = svm_get_rflags,
         .set_rflags = svm_set_rflags,
+
+       .get_pkru = svm_get_pkru,
+
         .fpu_activate = svm_fpu_activate,
         .fpu_deactivate = svm_fpu_deactivate,
  
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c

index 75173efccac552c48472f95fc2729d410628c139..efc243e4dabfc75d44240fab88052598c0d3da26 100644 (file)
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -598,6 +598,10 @@ struct vcpu_vmx {
         struct page *pml_pg;
  
         u64 current_tsc_ratio;
+
+       bool guest_pkru_valid;
+       u32 guest_pkru;
+       u32 host_pkru;
  };
  
  enum segment_cache_field {
@@ -2107,6 +2111,7 @@ static void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
         } while (cmpxchg(&pi_desc->control, old.control,
                         new.control) != old.control);
  }
+
  /*
   * Switches to specified vcpu, until a matching vcpu_put(), but assumes
   * vcpu mutex is already taken.
@@ -2167,6 +2172,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
         }
  
         vmx_vcpu_pi_load(vcpu, cpu);
+       vmx->host_pkru = read_pkru();
  }
  
  static void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu)
@@ -2286,6 +2292,11 @@ static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
         vmcs_writel(GUEST_RFLAGS, rflags);
  }
  
+/* get_pkru hook for VMX: report the guest_pkru value held in vcpu_vmx. */
+static u32 vmx_get_pkru(struct kvm_vcpu *vcpu)
+{
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+       return vmx->guest_pkru;
+}
+
  static u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu)
  {
         u32 interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
@@ -2712,8 +2723,15 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
         } else
                 vmx->nested.nested_vmx_ept_caps = 0;
  
+       /*
+        * Old versions of KVM use the single-context version without
+        * checking for support, so declare that it is supported even
+        * though it is treated as global context.  The alternative is
+        * not failing the single-context invvpid, and it is worse.
+        */
         if (enable_vpid)
                 vmx->nested.nested_vmx_vpid_caps = VMX_VPID_INVVPID_BIT |
+                               VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT |
                                 VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT;
         else
                 vmx->nested.nested_vmx_vpid_caps = 0;
@@ -3886,13 +3904,17 @@ static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
  
         if (!enable_unrestricted_guest && !is_paging(vcpu))
                 /*
-                * SMEP/SMAP is disabled if CPU is in non-paging mode in
-                * hardware.  However KVM always uses paging mode without
-                * unrestricted guest.
-                * To emulate this behavior, SMEP/SMAP needs to be manually
-                * disabled when guest switches to non-paging mode.
+                * SMEP/SMAP/PKU is disabled if CPU is in non-paging mode in
+                * hardware.  To emulate this behavior, SMEP/SMAP/PKU needs
+                * to be manually disabled when guest switches to non-paging
+                * mode.
+                *
+                * If !enable_unrestricted_guest, the CPU is always running
+                * with CR0.PG=1 and CR4 needs to be modified.
+                * If enable_unrestricted_guest, the CPU automatically
+                * disables SMEP/SMAP/PKU when the guest sets CR0.PG=0.
                  */
-               hw_cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP);
+               hw_cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE);
  
         vmcs_writel(CR4_READ_SHADOW, cr4);
         vmcs_writel(GUEST_CR4, hw_cr4);
@@ -7399,6 +7421,7 @@ static int handle_invept(struct kvm_vcpu *vcpu)
         if (!(types & (1UL << type))) {
                 nested_vmx_failValid(vcpu,
                                 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
+               skip_emulated_instruction(vcpu);
                 return 1;
         }
  
@@ -7457,6 +7480,7 @@ static int handle_invvpid(struct kvm_vcpu *vcpu)
         if (!(types & (1UL << type))) {
                 nested_vmx_failValid(vcpu,
                         VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
+               skip_emulated_instruction(vcpu);
                 return 1;
         }
  
@@ -7473,12 +7497,17 @@ static int handle_invvpid(struct kvm_vcpu *vcpu)
         }
  
         switch (type) {
+       case VMX_VPID_EXTENT_SINGLE_CONTEXT:
+               /*
+                * Old versions of KVM use the single-context version so we
+                * have to support it; just treat it the same as all-context.
+                */
         case VMX_VPID_EXTENT_ALL_CONTEXT:
                 __vmx_flush_tlb(vcpu, to_vmx(vcpu)->nested.vpid02);
                 nested_vmx_succeed(vcpu);
                 break;
         default:
-               /* Trap single context invalidation invvpid calls */
+               /* Trap individual address invalidation invvpid calls */
                 BUG_ON(1);
                 break;
         }
@@ -8621,6 +8650,9 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
         if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
                 vmx_set_interrupt_shadow(vcpu, 0);
  
+       if (vmx->guest_pkru_valid)
+               __write_pkru(vmx->guest_pkru);
+
         atomic_switch_perf_msrs(vmx);
         debugctlmsr = get_debugctlmsr();
  
@@ -8760,6 +8792,20 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
  
         vmx->exit_reason = vmcs_read32(VM_EXIT_REASON);
  
+       /*
+        * eager fpu is enabled if PKEY is supported and CR4 is switched
+        * back on host, so it is safe to read guest PKRU from current
+        * XSAVE.
+        */
+       if (boot_cpu_has(X86_FEATURE_OSPKE)) {
+               vmx->guest_pkru = __read_pkru();
+               if (vmx->guest_pkru != vmx->host_pkru) {
+                       vmx->guest_pkru_valid = true;
+                       __write_pkru(vmx->host_pkru);
+               } else
+                       vmx->guest_pkru_valid = false;
+       }
+
         /*
          * the KVM_REQ_EVENT optimization bit is only on for one entry, and if
          * we did not inject a still-pending event to L1 now because of
@@ -10884,6 +10930,9 @@ static struct kvm_x86_ops vmx_x86_ops = {
         .cache_reg = vmx_cache_reg,
         .get_rflags = vmx_get_rflags,
         .set_rflags = vmx_set_rflags,
+
+       .get_pkru = vmx_get_pkru,
+
         .fpu_activate = vmx_fpu_activate,
         .fpu_deactivate = vmx_fpu_deactivate,
  
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c

index 7236bd3a4c3d7a0c5a6148decc6fad276eb18bb7..e260ccbc8f5574d8e587ebd7704d051ee873147a 100644 (file)
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -723,7 +723,7 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
  {
         unsigned long old_cr4 = kvm_read_cr4(vcpu);
         unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE |
-                                  X86_CR4_SMEP | X86_CR4_SMAP;
+                                  X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE;
  
         if (cr4 & CR4_RESERVED_BITS)
                 return 1;
@@ -740,6 +740,9 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
         if (!guest_cpuid_has_fsgsbase(vcpu) && (cr4 & X86_CR4_FSGSBASE))
                 return 1;
  
+       if (!guest_cpuid_has_pku(vcpu) && (cr4 & X86_CR4_PKE))
+               return 1;
+
         if (is_long_mode(vcpu)) {
                 if (!(cr4 & X86_CR4_PAE))
                         return 1;
@@ -765,7 +768,7 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
             (!(cr4 & X86_CR4_PCIDE) && (old_cr4 & X86_CR4_PCIDE)))
                 kvm_mmu_reset_context(vcpu);
  
-       if ((cr4 ^ old_cr4) & X86_CR4_OSXSAVE)
+       if ((cr4 ^ old_cr4) & (X86_CR4_OSXSAVE | X86_CR4_PKE))
                 kvm_update_cpuid(vcpu);
  
         return 0;
@@ -4326,9 +4329,14 @@ static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva,
         u32 access = ((kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0)
                 | (write ? PFERR_WRITE_MASK : 0);
  
+       /*
+        * currently PKRU is only applied to ept enabled guest so
+        * there is no pkey in EPT page table for L1 guest or EPT
+        * shadow page table for L2 guest.
+        */
         if (vcpu_match_mmio_gva(vcpu, gva)
             && !permission_fault(vcpu, vcpu->arch.walk_mmu,
-                                vcpu->arch.access, access)) {
+                                vcpu->arch.access, 0, access)) {
                 *gpa = vcpu->arch.mmio_gfn << PAGE_SHIFT |
                                         (gva & (PAGE_SIZE - 1));
                 trace_vcpu_match_mmio(gva, *gpa, write, false);
@@ -6588,8 +6596,12 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
  
         srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
  
-       /* We should set ->mode before check ->requests,
-        * see the comment in make_all_cpus_request.
+       /*
+        * We should set ->mode before check ->requests,
+        * Please see the comment in kvm_make_all_cpus_request.
+        * This also orders the write to mode from any reads
+        * to the page tables done while the VCPU is running.
+        * Please see the comment in kvm_flush_remote_tlbs.
          */
         smp_mb__after_srcu_read_unlock();
  
@@ -7123,7 +7135,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
  
         mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4;
         kvm_x86_ops->set_cr4(vcpu, sregs->cr4);
-       if (sregs->cr4 & X86_CR4_OSXSAVE)
+       if (sregs->cr4 & (X86_CR4_OSXSAVE | X86_CR4_PKE))
                 kvm_update_cpuid(vcpu);
  
         idx = srcu_read_lock(&vcpu->kvm->srcu);
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h

index 007940faa5c6357d1c5c1c4bd200b3b8ae195cb0..7ce3634ab5fe6189a95163d8cd9781a923d21727 100644 (file)
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -183,7 +183,8 @@ bool kvm_vector_hashing_enabled(void);
  
  #define KVM_SUPPORTED_XCR0     (XFEATURE_MASK_FP | XFEATURE_MASK_SSE \
                                 | XFEATURE_MASK_YMM | XFEATURE_MASK_BNDREGS \
-                               | XFEATURE_MASK_BNDCSR | XFEATURE_MASK_AVX512)
+                               | XFEATURE_MASK_BNDCSR | XFEATURE_MASK_AVX512 \
+                               | XFEATURE_MASK_PKRU)
  extern u64 host_xcr0;
  
  extern u64 kvm_supported_xcr0(void);
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c

index 99ee4b1ce2dbd2b500c8b80f805dfe21c2eecf42..4fd482fb9260b89feda9736854861719b50c974f 100644 (file)
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -170,8 +170,8 @@ bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req)
                 kvm_make_request(req, vcpu);
                 cpu = vcpu->cpu;
  
-               /* Set ->requests bit before we read ->mode */
-               smp_mb();
+               /* Set ->requests bit before we read ->mode. */
+               smp_mb__after_atomic();
  
                 if (cpus != NULL && cpu != -1 && cpu != me &&
                       kvm_vcpu_exiting_guest_mode(vcpu) != OUTSIDE_GUEST_MODE)
@@ -191,9 +191,23 @@ bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req)
  #ifndef CONFIG_HAVE_KVM_ARCH_TLB_FLUSH_ALL
  void kvm_flush_remote_tlbs(struct kvm *kvm)
  {
-       long dirty_count = kvm->tlbs_dirty;
+       /*
+        * Read tlbs_dirty before setting KVM_REQ_TLB_FLUSH in
+        * kvm_make_all_cpus_request.
+        */
+       long dirty_count = smp_load_acquire(&kvm->tlbs_dirty);
  
-       smp_mb();
+       /*
+        * We want to publish modifications to the page tables before reading
+        * mode. Pairs with a memory barrier in arch-specific code.
+        * - x86: smp_mb__after_srcu_read_unlock in vcpu_enter_guest
+        * and smp_mb in walk_shadow_page_lockless_begin/end.
+        * - powerpc: smp_mb in kvmppc_prepare_to_enter.
+        *
+        * There is already an smp_mb__after_atomic() before
+        * kvm_make_all_cpus_request() reads vcpu->mode. We reuse that
+        * barrier here.
+        */
         if (kvm_make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH))
                 ++kvm->stat.remote_tlb_flush;
         cmpxchg(&kvm->tlbs_dirty, dirty_count, 0);
@@ -536,6 +550,16 @@ static struct kvm *kvm_create_vm(unsigned long type)
         if (!kvm)
                 return ERR_PTR(-ENOMEM);
  
+       spin_lock_init(&kvm->mmu_lock);
+       atomic_inc(&current->mm->mm_count);
+       kvm->mm = current->mm;
+       kvm_eventfd_init(kvm);
+       mutex_init(&kvm->lock);
+       mutex_init(&kvm->irq_lock);
+       mutex_init(&kvm->slots_lock);
+       atomic_set(&kvm->users_count, 1);
+       INIT_LIST_HEAD(&kvm->devices);
+
         r = kvm_arch_init_vm(kvm, type);
         if (r)
                 goto out_err_no_disable;
@@ -568,16 +592,6 @@ static struct kvm *kvm_create_vm(unsigned long type)
                         goto out_err;
         }
  
-       spin_lock_init(&kvm->mmu_lock);
-       kvm->mm = current->mm;
-       atomic_inc(&kvm->mm->mm_count);
-       kvm_eventfd_init(kvm);
-       mutex_init(&kvm->lock);
-       mutex_init(&kvm->irq_lock);
-       mutex_init(&kvm->slots_lock);
-       atomic_set(&kvm->users_count, 1);
-       INIT_LIST_HEAD(&kvm->devices);
-
         r = kvm_init_mmu_notifier(kvm);
         if (r)
                 goto out_err;
@@ -602,6 +616,7 @@ out_err_no_disable:
         for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++)
                 kvm_free_memslots(kvm, kvm->memslots[i]);
         kvm_arch_free_vm(kvm);
+       mmdrop(current->mm);
         return ERR_PTR(r);
  }
author	Linus Torvalds <torvalds@linux-foundation.org>
	Tue, 22 Mar 2016 23:28:22 +0000 (16:28 -0700)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Tue, 22 Mar 2016 23:28:22 +0000 (16:28 -0700)
arch/arm/kvm/arm.c		patch \| blob \| blame \| history
arch/arm64/include/asm/kvm_asm.h		patch \| blob \| blame \| history
arch/arm64/kvm/hyp/Makefile		patch \| blob \| blame \| history
arch/powerpc/kvm/Makefile		patch \| blob \| blame \| history
arch/powerpc/kvm/book3s_64_vio.c		patch \| blob \| blame \| history
arch/powerpc/kvm/book3s_64_vio_hv.c		patch \| blob \| blame \| history
arch/powerpc/kvm/book3s_hv_rmhandlers.S		patch \| blob \| blame \| history
arch/powerpc/kvm/powerpc.c		patch \| blob \| blame \| history
arch/x86/include/asm/kvm_host.h		patch \| blob \| blame \| history
arch/x86/include/asm/pgtable.h		patch \| blob \| blame \| history
arch/x86/include/asm/special_insns.h		patch \| blob \| blame \| history
arch/x86/kernel/kvm.c		patch \| blob \| blame \| history
arch/x86/kvm/cpuid.c		patch \| blob \| blame \| history
arch/x86/kvm/cpuid.h		patch \| blob \| blame \| history
arch/x86/kvm/kvm_cache_regs.h		patch \| blob \| blame \| history
arch/x86/kvm/mmu.c		patch \| blob \| blame \| history
arch/x86/kvm/mmu.h		patch \| blob \| blame \| history
arch/x86/kvm/page_track.c		patch \| blob \| blame \| history
arch/x86/kvm/paging_tmpl.h		patch \| blob \| blame \| history
arch/x86/kvm/svm.c		patch \| blob \| blame \| history
arch/x86/kvm/vmx.c		patch \| blob \| blame \| history
arch/x86/kvm/x86.c		patch \| blob \| blame \| history
arch/x86/kvm/x86.h		patch \| blob \| blame \| history
virt/kvm/kvm_main.c		patch \| blob \| blame \| history