PPC | KVM_REG_PPC_DBSR | 32
PPC | KVM_REG_PPC_TIDR | 64
PPC | KVM_REG_PPC_PSSCR | 64
+ PPC | KVM_REG_PPC_DEC_EXPIRY | 64
PPC | KVM_REG_PPC_TM_GPR0 | 64
...
PPC | KVM_REG_PPC_TM_GPR31 | 64
struct kvm_s390_irq_state {
__u64 buf;
- __u32 flags;
+ __u32 flags; /* will stay unused for compatibility reasons */
__u32 len;
- __u32 reserved[4];
+ __u32 reserved[4]; /* will stay unused for compatibility reasons */
};
Userspace passes in the above struct and for each pending interrupt a
struct kvm_s390_irq is copied to the provided buffer.
+The structure contains a flags and a reserved field for future extensions. As
+the kernel never checked for flags == 0 and QEMU never pre-zeroed flags and
+reserved, these fields can not be used in the future without breaking
+compatibility.
+
If -ENOBUFS is returned the buffer provided was too small and userspace
may retry with a bigger buffer.
struct kvm_s390_irq_state {
__u64 buf;
+ __u32 flags; /* will stay unused for compatibility reasons */
__u32 len;
- __u32 pad;
+ __u32 reserved[4]; /* will stay unused for compatibility reasons */
};
+The restrictions for flags and reserved apply as well.
+(see KVM_S390_GET_IRQ_STATE)
+
The userspace memory referenced by buf contains a struct kvm_s390_irq
for each interrupt to be injected into the guest.
If one of the interrupts could not be injected for some reason the
or if no page table is present for the addresses (e.g. when using
hugepages).
+4.109 KVM_PPC_GET_CPU_CHAR
+
+Capability: KVM_CAP_PPC_GET_CPU_CHAR
+Architectures: powerpc
+Type: vm ioctl
+Parameters: struct kvm_ppc_cpu_char (out)
+Returns: 0 on successful completion
+ -EFAULT if struct kvm_ppc_cpu_char cannot be written
+
+This ioctl gives userspace information about certain characteristics
+of the CPU relating to speculative execution of instructions and
+possible information leakage resulting from speculative execution (see
+CVE-2017-5715, CVE-2017-5753 and CVE-2017-5754). The information is
+returned in struct kvm_ppc_cpu_char, which looks like this:
+
+struct kvm_ppc_cpu_char {
+ __u64 character; /* characteristics of the CPU */
+ __u64 behaviour; /* recommended software behaviour */
+ __u64 character_mask; /* valid bits in character */
+ __u64 behaviour_mask; /* valid bits in behaviour */
+};
+
+For extensibility, the character_mask and behaviour_mask fields
+indicate which bits of character and behaviour have been filled in by
+the kernel. If the set of defined bits is extended in future then
+userspace will be able to tell whether it is running on a kernel that
+knows about the new bits.
+
+The character field describes attributes of the CPU which can help
+with preventing inadvertent information disclosure - specifically,
+whether there is an instruction to flash-invalidate the L1 data cache
+(ori 30,30,0 or mtspr SPRN_TRIG2,rN), whether the L1 data cache is set
+to a mode where entries can only be used by the thread that created
+them, whether the bcctr[l] instruction prevents speculation, and
+whether a speculation barrier instruction (ori 31,31,0) is provided.
+
+The behaviour field describes actions that software should take to
+prevent inadvertent information disclosure, and thus describes which
+vulnerabilities the hardware is subject to; specifically whether the
+L1 data cache should be flushed when returning to user mode from the
+kernel, and whether a speculation barrier should be placed between an
+array bounds check and the array access.
+
+These fields use the same bit definitions as the new
+H_GET_CPU_CHARACTERISTICS hypercall.
+
+4.110 KVM_MEMORY_ENCRYPT_OP
+
+Capability: basic
+Architectures: x86
+Type: system
+Parameters: an opaque platform specific structure (in/out)
+Returns: 0 on success; -1 on error
+
+If the platform supports creating encrypted VMs then this ioctl can be used
+for issuing platform-specific memory encryption commands to manage those
+encrypted VMs.
+
+Currently, this ioctl is used for issuing Secure Encrypted Virtualization
+(SEV) commands on AMD Processors. The SEV commands are defined in
+Documentation/virtual/kvm/amd-memory-encryption.txt.
+
+4.111 KVM_MEMORY_ENCRYPT_REG_REGION
+
+Capability: basic
+Architectures: x86
+Type: system
+Parameters: struct kvm_enc_region (in)
+Returns: 0 on success; -1 on error
+
+This ioctl can be used to register a guest memory region which may
+contain encrypted data (e.g. guest RAM, SMRAM etc).
+
+It is used in the SEV-enabled guest. When encryption is enabled, a guest
+memory region may contain encrypted data. The SEV memory encryption
+engine uses a tweak such that two identical plaintext pages, each at
+different locations will have differing ciphertexts. So swapping or
+moving ciphertext of those pages will not result in plaintext being
+swapped. So relocating (or migrating) physical backing pages for the SEV
+guest will require some additional steps.
+
+Note: The current SEV key management spec does not provide commands to
+swap or migrate (move) ciphertext pages. Hence, for now we pin the guest
+memory region registered with the ioctl.
+
+4.112 KVM_MEMORY_ENCRYPT_UNREG_REGION
+
+Capability: basic
+Architectures: x86
+Type: system
+Parameters: struct kvm_enc_region (in)
+Returns: 0 on success; -1 on error
+
+This ioctl can be used to unregister the guest memory region registered
+with KVM_MEMORY_ENCRYPT_REG_REGION ioctl above.
+
+
5. The kvm_run structure
------------------------
__u32 ap_encodings[8];
};
+/* For KVM_PPC_GET_CPU_CHAR */
+struct kvm_ppc_cpu_char {
+ __u64 character; /* characteristics of the CPU */
+ __u64 behaviour; /* recommended software behaviour */
+ __u64 character_mask; /* valid bits in character */
+ __u64 behaviour_mask; /* valid bits in behaviour */
+};
+
+/*
+ * Values for character and character_mask.
+ * These are identical to the values used by H_GET_CPU_CHARACTERISTICS.
+ */
+#define KVM_PPC_CPU_CHAR_SPEC_BAR_ORI31 (1ULL << 63)
+#define KVM_PPC_CPU_CHAR_BCCTRL_SERIALISED (1ULL << 62)
+#define KVM_PPC_CPU_CHAR_L1D_FLUSH_ORI30 (1ULL << 61)
+#define KVM_PPC_CPU_CHAR_L1D_FLUSH_TRIG2 (1ULL << 60)
+#define KVM_PPC_CPU_CHAR_L1D_THREAD_PRIV (1ULL << 59)
+#define KVM_PPC_CPU_CHAR_BR_HINT_HONOURED (1ULL << 58)
+#define KVM_PPC_CPU_CHAR_MTTRIG_THR_RECONF (1ULL << 57)
+#define KVM_PPC_CPU_CHAR_COUNT_CACHE_DIS (1ULL << 56)
+
+#define KVM_PPC_CPU_BEHAV_FAVOUR_SECURITY (1ULL << 63)
+#define KVM_PPC_CPU_BEHAV_L1D_FLUSH_PR (1ULL << 62)
+#define KVM_PPC_CPU_BEHAV_BNDS_CHK_SPEC_BAR (1ULL << 61)
+
/* Per-vcpu XICS interrupt controller state */
#define KVM_REG_PPC_ICP_STATE (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x8c)
#define KVM_REG_PPC_TIDR (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xbc)
#define KVM_REG_PPC_PSSCR (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xbd)
+ #define KVM_REG_PPC_DEC_EXPIRY (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xbe)
+
/* Transactional Memory checkpointed state:
* This is all GPRs, all VSX regs and a subset of SPRs
*/
OFFSET(PACA_NMI_EMERG_SP, paca_struct, nmi_emergency_sp);
OFFSET(PACA_IN_MCE, paca_struct, in_mce);
OFFSET(PACA_IN_NMI, paca_struct, in_nmi);
+ OFFSET(PACA_RFI_FLUSH_FALLBACK_AREA, paca_struct, rfi_flush_fallback_area);
+ OFFSET(PACA_EXRFI, paca_struct, exrfi);
+ OFFSET(PACA_L1D_FLUSH_CONGRUENCE, paca_struct, l1d_flush_congruence);
+ OFFSET(PACA_L1D_FLUSH_SETS, paca_struct, l1d_flush_sets);
+
#endif
OFFSET(PACAHWCPUID, paca_struct, hw_cpu_id);
OFFSET(PACAKEXECSTATE, paca_struct, kexec_state);
OFFSET(VCPU_PENDING_EXC, kvm_vcpu, arch.pending_exceptions);
OFFSET(VCPU_CEDED, kvm_vcpu, arch.ceded);
OFFSET(VCPU_PRODDED, kvm_vcpu, arch.prodded);
+ OFFSET(VCPU_IRQ_PENDING, kvm_vcpu, arch.irq_pending);
OFFSET(VCPU_DBELL_REQ, kvm_vcpu, arch.doorbell_request);
OFFSET(VCPU_MMCR, kvm_vcpu, arch.mmcr);
OFFSET(VCPU_PMC, kvm_vcpu, arch.pmc);
DEFINE(VCPU_XIVE_CAM_WORD, offsetof(struct kvm_vcpu,
arch.xive_cam_word));
DEFINE(VCPU_XIVE_PUSHED, offsetof(struct kvm_vcpu, arch.xive_pushed));
+ DEFINE(VCPU_XIVE_ESC_ON, offsetof(struct kvm_vcpu, arch.xive_esc_on));
+ DEFINE(VCPU_XIVE_ESC_RADDR, offsetof(struct kvm_vcpu, arch.xive_esc_raddr));
+ DEFINE(VCPU_XIVE_ESC_VADDR, offsetof(struct kvm_vcpu, arch.xive_esc_vaddr));
#endif
#ifdef CONFIG_KVM_EXIT_TIMING
mtmsrd r0,1 /* clear RI in MSR */
mtsrr0 r5
mtsrr1 r6
- RFI
+ RFI_TO_KERNEL
kvmppc_call_hv_entry:
BEGIN_FTR_SECTION
mtmsrd r6, 1 /* Clear RI in MSR */
mtsrr0 r8
mtsrr1 r7
- RFI
+ RFI_TO_KERNEL
/* Virtual-mode return */
.Lvirt_return:
lbz r0, KVM_RADIX(r9)
cmpwi cr7, r0, 0
- /* Clear out SLB if hash */
- bne cr7, 2f
- li r6,0
- slbmte r6,r6
- slbia
- ptesync
- 2:
/*
* POWER7/POWER8 host -> guest partition switch code.
* We don't have to lock against concurrent tlbies,
10: cmpdi r4, 0
beq kvmppc_primary_no_guest
kvmppc_got_guest:
-
- /* Load up guest SLB entries (N.B. slb_max will be 0 for radix) */
- lwz r5,VCPU_SLB_MAX(r4)
- cmpwi r5,0
- beq 9f
- mtctr r5
- addi r6,r4,VCPU_SLB
- 1: ld r8,VCPU_SLB_E(r6)
- ld r9,VCPU_SLB_V(r6)
- slbmte r9,r8
- addi r6,r6,VCPU_SLB_SIZE
- bdnz 1b
- 9:
/* Increment yield count if they have a VPA */
ld r3, VCPU_VPA(r4)
cmpdi r3, 0
mftb r7
subf r3,r7,r8
mtspr SPRN_DEC,r3
- std r3,VCPU_DEC(r4)
ld r5, VCPU_SPRG0(r4)
ld r6, VCPU_SPRG1(r4)
cmpdi r3, 512 /* 1 microsecond */
blt hdec_soon
+ /* For hash guest, clear out and reload the SLB */
+ ld r6, VCPU_KVM(r4)
+ lbz r0, KVM_RADIX(r6)
+ cmpwi r0, 0
+ bne 9f
+ li r6, 0
+ slbmte r6, r6
+ slbia
+ ptesync
+
+ /* Load up guest SLB entries (N.B. slb_max will be 0 for radix) */
+ lwz r5,VCPU_SLB_MAX(r4)
+ cmpwi r5,0
+ beq 9f
+ mtctr r5
+ addi r6,r4,VCPU_SLB
+ 1: ld r8,VCPU_SLB_E(r6)
+ ld r9,VCPU_SLB_V(r6)
+ slbmte r9,r8
+ addi r6,r6,VCPU_SLB_SIZE
+ bdnz 1b
+ 9:
+
#ifdef CONFIG_KVM_XICS
/* We are entering the guest on that thread, push VCPU to XIVE */
ld r10, HSTATE_XIVE_TIMA_PHYS(r13)
li r9, TM_QW1_OS + TM_WORD2
stwcix r11,r9,r10
li r9, 1
- stw r9, VCPU_XIVE_PUSHED(r4)
+ stb r9, VCPU_XIVE_PUSHED(r4)
eieio
+
+ /*
+ * We clear the irq_pending flag. There is a small chance of a
+ * race vs. the escalation interrupt happening on another
+ * processor setting it again, but the only consequence is to
+ * cause a spurrious wakeup on the next H_CEDE which is not an
+ * issue.
+ */
+ li r0,0
+ stb r0, VCPU_IRQ_PENDING(r4)
+
+ /*
+ * In single escalation mode, if the escalation interrupt is
+ * on, we mask it.
+ */
+ lbz r0, VCPU_XIVE_ESC_ON(r4)
+ cmpwi r0,0
+ beq 1f
+ ld r10, VCPU_XIVE_ESC_RADDR(r4)
+ li r9, XIVE_ESB_SET_PQ_01
+ ldcix r0, r10, r9
+ sync
+
+ /* We have a possible subtle race here: The escalation interrupt might
+ * have fired and be on its way to the host queue while we mask it,
+ * and if we unmask it early enough (re-cede right away), there is
+ * a theorical possibility that it fires again, thus landing in the
+ * target queue more than once which is a big no-no.
+ *
+ * Fortunately, solving this is rather easy. If the above load setting
+ * PQ to 01 returns a previous value where P is set, then we know the
+ * escalation interrupt is somewhere on its way to the host. In that
+ * case we simply don't clear the xive_esc_on flag below. It will be
+ * eventually cleared by the handler for the escalation interrupt.
+ *
+ * Then, when doing a cede, we check that flag again before re-enabling
+ * the escalation interrupt, and if set, we abort the cede.
+ */
+ andi. r0, r0, XIVE_ESB_VAL_P
+ bne- 1f
+
+ /* Now P is 0, we can clear the flag */
+ li r0, 0
+ stb r0, VCPU_XIVE_ESC_ON(r4)
+ 1:
no_xive:
#endif /* CONFIG_KVM_XICS */
ld r0, VCPU_GPR(R0)(r4)
ld r4, VCPU_GPR(R4)(r4)
-
- hrfid
+ HRFI_TO_GUEST
b .
secondary_too_late:
addi r3, r4, VCPU_TB_RMEXIT
bl kvmhv_accumulate_time
#endif
- b guest_exit_cont
+ b guest_bypass
/******************************************************************************
* *
blt deliver_guest_interrupt
guest_exit_cont: /* r9 = vcpu, r12 = trap, r13 = paca */
+ /* Save more register state */
+ mfdar r6
+ mfdsisr r7
+ std r6, VCPU_DAR(r9)
+ stw r7, VCPU_DSISR(r9)
+ /* don't overwrite fault_dar/fault_dsisr if HDSI */
+ cmpwi r12,BOOK3S_INTERRUPT_H_DATA_STORAGE
+ beq mc_cont
+ std r6, VCPU_FAULT_DAR(r9)
+ stw r7, VCPU_FAULT_DSISR(r9)
+
+ /* See if it is a machine check */
+ cmpwi r12, BOOK3S_INTERRUPT_MACHINE_CHECK
+ beq machine_check_realmode
+ mc_cont:
+ #ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING
+ addi r3, r9, VCPU_TB_RMEXIT
+ mr r4, r9
+ bl kvmhv_accumulate_time
+ #endif
#ifdef CONFIG_KVM_XICS
/* We are exiting, pull the VP from the XIVE */
- lwz r0, VCPU_XIVE_PUSHED(r9)
+ lbz r0, VCPU_XIVE_PUSHED(r9)
cmpwi cr0, r0, 0
beq 1f
li r7, TM_SPC_PULL_OS_CTX
li r6, TM_QW1_OS
mfmsr r0
- andi. r0, r0, MSR_IR /* in real mode? */
+ andi. r0, r0, MSR_DR /* in real mode? */
beq 2f
ld r10, HSTATE_XIVE_TIMA_VIRT(r13)
cmpldi cr0, r10, 0
/* Fixup some of the state for the next load */
li r10, 0
li r0, 0xff
- stw r10, VCPU_XIVE_PUSHED(r9)
+ stb r10, VCPU_XIVE_PUSHED(r9)
stb r10, (VCPU_XIVE_SAVED_STATE+3)(r9)
stb r0, (VCPU_XIVE_SAVED_STATE+4)(r9)
eieio
1:
#endif /* CONFIG_KVM_XICS */
- /* Save more register state */
- mfdar r6
- mfdsisr r7
- std r6, VCPU_DAR(r9)
- stw r7, VCPU_DSISR(r9)
- /* don't overwrite fault_dar/fault_dsisr if HDSI */
- cmpwi r12,BOOK3S_INTERRUPT_H_DATA_STORAGE
- beq mc_cont
- std r6, VCPU_FAULT_DAR(r9)
- stw r7, VCPU_FAULT_DSISR(r9)
- /* See if it is a machine check */
- cmpwi r12, BOOK3S_INTERRUPT_MACHINE_CHECK
- beq machine_check_realmode
- mc_cont:
- #ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING
- addi r3, r9, VCPU_TB_RMEXIT
- mr r4, r9
- bl kvmhv_accumulate_time
- #endif
+ /* For hash guest, read the guest SLB and save it away */
+ ld r5, VCPU_KVM(r9)
+ lbz r0, KVM_RADIX(r5)
+ li r5, 0
+ cmpwi r0, 0
+ bne 3f /* for radix, save 0 entries */
+ lwz r0,VCPU_SLB_NR(r9) /* number of entries in SLB */
+ mtctr r0
+ li r6,0
+ addi r7,r9,VCPU_SLB
+ 1: slbmfee r8,r6
+ andis. r0,r8,SLB_ESID_V@h
+ beq 2f
+ add r8,r8,r6 /* put index in */
+ slbmfev r3,r6
+ std r8,VCPU_SLB_E(r7)
+ std r3,VCPU_SLB_V(r7)
+ addi r7,r7,VCPU_SLB_SIZE
+ addi r5,r5,1
+ 2: addi r6,r6,1
+ bdnz 1b
+ /* Finally clear out the SLB */
+ li r0,0
+ slbmte r0,r0
+ slbia
+ ptesync
+ 3: stw r5,VCPU_SLB_MAX(r9)
+ guest_bypass:
mr r3, r12
/* Increment exit count, poke other threads to exit */
bl kvmhv_commence_exit
ori r6,r6,1
mtspr SPRN_CTRLT,r6
4:
- /* Check if we are running hash or radix and store it in cr2 */
- ld r5, VCPU_KVM(r9)
- lbz r0, KVM_RADIX(r5)
- cmpwi cr2,r0,0
-
- /* Read the guest SLB and save it away */
- li r5, 0
- bne cr2, 3f /* for radix, save 0 entries */
- lwz r0,VCPU_SLB_NR(r9) /* number of entries in SLB */
- mtctr r0
- li r6,0
- addi r7,r9,VCPU_SLB
- 1: slbmfee r8,r6
- andis. r0,r8,SLB_ESID_V@h
- beq 2f
- add r8,r8,r6 /* put index in */
- slbmfev r3,r6
- std r8,VCPU_SLB_E(r7)
- std r3,VCPU_SLB_V(r7)
- addi r7,r7,VCPU_SLB_SIZE
- addi r5,r5,1
- 2: addi r6,r6,1
- bdnz 1b
- 3: stw r5,VCPU_SLB_MAX(r9)
-
/*
* Save the guest PURR/SPURR
*/
ld r5, VCPU_KVM(r9)
lbz r0, KVM_RADIX(r5)
cmpwi cr2, r0, 0
- beq cr2, 3f
+ beq cr2, 4f
/* Radix: Handle the case where the guest used an illegal PID */
LOAD_REG_ADDR(r4, mmu_base_pid)
BEGIN_FTR_SECTION
PPC_INVALIDATE_ERAT
END_FTR_SECTION_IFSET(CPU_FTR_POWER9_DD1)
- b 4f
+ 4:
#endif /* CONFIG_PPC_RADIX_MMU */
- /* Hash: clear out SLB */
- 3: li r5,0
- slbmte r5,r5
- slbia
- ptesync
- 4:
/*
* POWER7/POWER8 guest -> host partition switch code.
* We don't have to lock against tlbies but we do
bne 27f
bl kvmppc_realmode_hmi_handler
nop
+ cmpdi r3, 0
li r12, BOOK3S_INTERRUPT_HMI
/*
- * At this point kvmppc_realmode_hmi_handler would have resync-ed
- * the TB. Hence it is not required to subtract guest timebase
- * offset from timebase. So, skip it.
+ * At this point kvmppc_realmode_hmi_handler may have resync-ed
+ * the TB, and if it has, we must not subtract the guest timebase
+ * offset from the timebase. So, skip it.
*
* Also, do not call kvmppc_subcore_exit_guest() because it has
* been invoked as part of kvmppc_realmode_hmi_handler().
*/
- b 30f
+ beq 30f
27:
/* Subtract timebase offset from timebase */
/* we've ceded but we want to give control to the host */
kvm_cede_exit:
ld r9, HSTATE_KVM_VCPU(r13)
- b guest_exit_cont
+ #ifdef CONFIG_KVM_XICS
+ /* Abort if we still have a pending escalation */
+ lbz r5, VCPU_XIVE_ESC_ON(r9)
+ cmpwi r5, 0
+ beq 1f
+ li r0, 0
+ stb r0, VCPU_CEDED(r9)
+ 1: /* Enable XIVE escalation */
+ li r5, XIVE_ESB_SET_PQ_00
+ mfmsr r0
+ andi. r0, r0, MSR_DR /* in real mode? */
+ beq 1f
+ ld r10, VCPU_XIVE_ESC_VADDR(r9)
+ cmpdi r10, 0
+ beq 3f
+ ldx r0, r10, r5
+ b 2f
+ 1: ld r10, VCPU_XIVE_ESC_RADDR(r9)
+ cmpdi r10, 0
+ beq 3f
+ ldcix r0, r10, r5
+ 2: sync
+ li r0, 1
+ stb r0, VCPU_XIVE_ESC_ON(r9)
+ #endif /* CONFIG_KVM_XICS */
+ 3: b guest_exit_cont
/* Try to handle a machine check in real mode */
machine_check_realmode:
ld r4, PACAKMSR(r13)
mtspr SPRN_SRR0, r3
mtspr SPRN_SRR1, r4
- rfid
+ RFI_TO_KERNEL
9: addi r3, r1, STACK_FRAME_OVERHEAD
bl kvmppc_bad_interrupt
b 9b
{
struct kvm_vcpu *vcpu = data;
- /* We use the existing H_PROD mechanism to wake up the target */
- vcpu->arch.prodded = 1;
+ vcpu->arch.irq_pending = 1;
smp_mb();
if (vcpu->arch.ceded)
kvmppc_fast_vcpu_kick(vcpu);
+ /* Since we have the no-EOI flag, the interrupt is effectively
+ * disabled now. Clearing xive_esc_on means we won't bother
+ * doing so on the next entry.
+ *
+ * This also allows the entry code to know that if a PQ combination
+ * of 10 is observed while xive_esc_on is true, it means the queue
+ * contains an unprocessed escalation interrupt. We don't make use of
+ * that knowledge today but might (see comment in book3s_hv_rmhandler.S)
+ */
+ vcpu->arch.xive_esc_on = false;
+
return IRQ_HANDLED;
}
return -EIO;
}
- /*
- * Future improvement: start with them disabled
- * and handle DD2 and later scheme of merged escalation
- * interrupts
- */
- name = kasprintf(GFP_KERNEL, "kvm-%d-%d-%d",
- vcpu->kvm->arch.lpid, xc->server_num, prio);
+ if (xc->xive->single_escalation)
+ name = kasprintf(GFP_KERNEL, "kvm-%d-%d",
+ vcpu->kvm->arch.lpid, xc->server_num);
+ else
+ name = kasprintf(GFP_KERNEL, "kvm-%d-%d-%d",
+ vcpu->kvm->arch.lpid, xc->server_num, prio);
if (!name) {
pr_err("Failed to allocate escalation irq name for queue %d of VCPU %d\n",
prio, xc->server_num);
rc = -ENOMEM;
goto error;
}
+
+ pr_devel("Escalation %s irq %d (prio %d)\n", name, xc->esc_virq[prio], prio);
+
rc = request_irq(xc->esc_virq[prio], xive_esc_irq,
IRQF_NO_THREAD, name, vcpu);
if (rc) {
goto error;
}
xc->esc_virq_names[prio] = name;
+
+ /* In single escalation mode, we grab the ESB MMIO of the
+ * interrupt and mask it. Also populate the VCPU v/raddr
+ * of the ESB page for use by asm entry/exit code. Finally
+ * set the XIVE_IRQ_NO_EOI flag which will prevent the
+ * core code from performing an EOI on the escalation
+ * interrupt, thus leaving it effectively masked after
+ * it fires once.
+ */
+ if (xc->xive->single_escalation) {
+ struct irq_data *d = irq_get_irq_data(xc->esc_virq[prio]);
+ struct xive_irq_data *xd = irq_data_get_irq_handler_data(d);
+
+ xive_vm_esb_load(xd, XIVE_ESB_SET_PQ_01);
+ vcpu->arch.xive_esc_raddr = xd->eoi_page;
+ vcpu->arch.xive_esc_vaddr = (__force u64)xd->eoi_mmio;
+ xd->flags |= XIVE_IRQ_NO_EOI;
+ }
+
return 0;
error:
irq_dispose_mapping(xc->esc_virq[prio]);
pr_devel("Provisioning prio... %d\n", prio);
- /* Provision each VCPU and enable escalations */
+ /* Provision each VCPU and enable escalations if needed */
kvm_for_each_vcpu(i, vcpu, kvm) {
if (!vcpu->arch.xive_vcpu)
continue;
rc = xive_provision_queue(vcpu, prio);
- if (rc == 0)
+ if (rc == 0 && !xive->single_escalation)
xive_attach_escalation(vcpu, prio);
if (rc)
return rc;
/* Return the per-cpu state for state saving/migration */
return (u64)xc->cppr << KVM_REG_PPC_ICP_CPPR_SHIFT |
- (u64)xc->mfrr << KVM_REG_PPC_ICP_MFRR_SHIFT;
+ (u64)xc->mfrr << KVM_REG_PPC_ICP_MFRR_SHIFT |
+ (u64)0xff << KVM_REG_PPC_ICP_PPRI_SHIFT;
}
int kvmppc_xive_set_icp(struct kvm_vcpu *vcpu, u64 icpval)
/* Allocate IPI */
xc->vp_ipi = xive_native_alloc_irq();
if (!xc->vp_ipi) {
+ pr_err("Failed to allocate xive irq for VCPU IPI\n");
r = -EIO;
goto bail;
}
if (r)
goto bail;
+ /*
+ * Enable the VP first as the single escalation mode will
+ * affect escalation interrupts numbering
+ */
+ r = xive_native_enable_vp(xc->vp_id, xive->single_escalation);
+ if (r) {
+ pr_err("Failed to enable VP in OPAL, err %d\n", r);
+ goto bail;
+ }
+
/*
* Initialize queues. Initially we set them all for no queueing
* and we enable escalation for queue 0 only which we'll use for
* our mfrr change notifications. If the VCPU is hot-plugged, we
- * do handle provisioning however.
+ * do handle provisioning however based on the existing "map"
+ * of enabled queues.
*/
for (i = 0; i < KVMPPC_XIVE_Q_COUNT; i++) {
struct xive_q *q = &xc->queues[i];
+ /* Single escalation, no queue 7 */
+ if (i == 7 && xive->single_escalation)
+ break;
+
/* Is queue already enabled ? Provision it */
if (xive->qmap & (1 << i)) {
r = xive_provision_queue(vcpu, i);
- if (r == 0)
+ if (r == 0 && !xive->single_escalation)
xive_attach_escalation(vcpu, i);
if (r)
goto bail;
if (r)
goto bail;
- /* Enable the VP */
- r = xive_native_enable_vp(xc->vp_id);
- if (r)
- goto bail;
-
/* Route the IPI */
r = xive_native_configure_irq(xc->vp_ipi, xc->vp_id, 0, XICS_IPI);
if (!r)
pr_devel(" val=0x016%llx (server=0x%x, guest_prio=%d)\n",
val, server, guest_prio);
+
/*
* If the source doesn't already have an IPI, allocate
* one and get the corresponding data
/*
* Restore P and Q. If the interrupt was pending, we
- * force both P and Q, which will trigger a resend.
+ * force Q and !P, which will trigger a resend.
*
* That means that a guest that had both an interrupt
* pending (queued) and Q set will restore with only
* is perfectly fine as coalescing interrupts that haven't
* been presented yet is always allowed.
*/
- if (val & KVM_XICS_PRESENTED || val & KVM_XICS_PENDING)
+ if (val & KVM_XICS_PRESENTED && !(val & KVM_XICS_PENDING))
state->old_p = true;
if (val & KVM_XICS_QUEUED || val & KVM_XICS_PENDING)
state->old_q = true;
if (xive->vp_base == XIVE_INVALID_VP)
ret = -ENOMEM;
+ xive->single_escalation = xive_native_has_single_escalation();
+
if (ret) {
kfree(xive);
return ret;
kvm_for_each_vcpu(i, vcpu, kvm) {
struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
+ unsigned int i;
if (!xc)
continue;
xc->server_num, xc->cppr, xc->hw_cppr,
xc->mfrr, xc->pending,
xc->stat_rm_h_xirr, xc->stat_vm_h_xirr);
+ for (i = 0; i < KVMPPC_XIVE_Q_COUNT; i++) {
+ struct xive_q *q = &xc->queues[i];
+ u32 i0, i1, idx;
+
+ if (!q->qpage && !xc->esc_virq[i])
+ continue;
+
+ seq_printf(m, " [q%d]: ", i);
+
+ if (q->qpage) {
+ idx = q->idx;
+ i0 = be32_to_cpup(q->qpage + idx);
+ idx = (idx + 1) & q->msk;
+ i1 = be32_to_cpup(q->qpage + idx);
+ seq_printf(m, "T=%d %08x %08x... \n", q->toggle, i0, i1);
+ }
+ if (xc->esc_virq[i]) {
+ struct irq_data *d = irq_get_irq_data(xc->esc_virq[i]);
+ struct xive_irq_data *xd = irq_data_get_irq_handler_data(d);
+ u64 pq = xive_vm_esb_load(xd, XIVE_ESB_GET);
+ seq_printf(m, "E:%c%c I(%d:%llx:%llx)",
+ (pq & XIVE_ESB_VAL_P) ? 'P' : 'p',
+ (pq & XIVE_ESB_VAL_Q) ? 'Q' : 'q',
+ xc->esc_virq[i], pq, xd->eoi_page);
+ seq_printf(m, "\n");
+ }
+ }
t_rm_h_xirr += xc->stat_rm_h_xirr;
t_rm_h_ipoll += xc->stat_rm_h_ipoll;
#include <asm/iommu.h>
#include <asm/switch_to.h>
#include <asm/xive.h>
+#ifdef CONFIG_PPC_PSERIES
+#include <asm/hvcall.h>
+#include <asm/plpar_wrappers.h>
+#endif
#include "timing.h"
#include "irq.h"
#ifdef CONFIG_KVM_XICS
case KVM_CAP_IRQ_XICS:
#endif
+ case KVM_CAP_PPC_GET_CPU_CHAR:
r = 1;
break;
hrtimer_init(&vcpu->arch.dec_timer, CLOCK_REALTIME, HRTIMER_MODE_ABS);
vcpu->arch.dec_timer.function = kvmppc_decrementer_wakeup;
- vcpu->arch.dec_expires = ~(u64)0;
+ vcpu->arch.dec_expires = get_tb();
#ifdef CONFIG_KVM_EXIT_TIMING
mutex_init(&vcpu->arch.exit_timing_lock);
{
enum emulation_result emulated = EMULATE_DONE;
- /* Currently, mmio_vsx_copy_nums only allowed to be less than 4 */
- if ( (vcpu->arch.mmio_vsx_copy_nums > 4) ||
- (vcpu->arch.mmio_vsx_copy_nums < 0) ) {
+ /* Currently, mmio_vsx_copy_nums only allowed to be 4 or less */
+ if (vcpu->arch.mmio_vsx_copy_nums > 4)
return EMULATE_FAIL;
- }
while (vcpu->arch.mmio_vsx_copy_nums) {
emulated = __kvmppc_handle_load(run, vcpu, rt, bytes,
vcpu->arch.io_gpr = rs;
- /* Currently, mmio_vsx_copy_nums only allowed to be less than 4 */
- if ( (vcpu->arch.mmio_vsx_copy_nums > 4) ||
- (vcpu->arch.mmio_vsx_copy_nums < 0) ) {
+ /* Currently, mmio_vsx_copy_nums only allowed to be 4 or less */
+ if (vcpu->arch.mmio_vsx_copy_nums > 4)
return EMULATE_FAIL;
- }
while (vcpu->arch.mmio_vsx_copy_nums) {
if (kvmppc_get_vsr_data(vcpu, rs, &val) == -1)
{
int r;
+ vcpu_load(vcpu);
+
if (vcpu->mmio_needed) {
vcpu->mmio_needed = 0;
if (!vcpu->mmio_is_write)
r = kvmppc_emulate_mmio_vsx_loadstore(vcpu, run);
if (r == RESUME_HOST) {
vcpu->mmio_needed = 1;
- return r;
+ goto out;
}
}
#endif
kvm_sigset_deactivate(vcpu);
+out:
+ vcpu_put(vcpu);
return r;
}
return -EINVAL;
}
-long kvm_arch_vcpu_ioctl(struct file *filp,
- unsigned int ioctl, unsigned long arg)
+long kvm_arch_vcpu_async_ioctl(struct file *filp,
+ unsigned int ioctl, unsigned long arg)
{
struct kvm_vcpu *vcpu = filp->private_data;
void __user *argp = (void __user *)arg;
- long r;
- switch (ioctl) {
- case KVM_INTERRUPT: {
+ if (ioctl == KVM_INTERRUPT) {
struct kvm_interrupt irq;
- r = -EFAULT;
if (copy_from_user(&irq, argp, sizeof(irq)))
- goto out;
- r = kvm_vcpu_ioctl_interrupt(vcpu, &irq);
- goto out;
+ return -EFAULT;
+ return kvm_vcpu_ioctl_interrupt(vcpu, &irq);
}
+ return -ENOIOCTLCMD;
+}
+
+long kvm_arch_vcpu_ioctl(struct file *filp,
+ unsigned int ioctl, unsigned long arg)
+{
+ struct kvm_vcpu *vcpu = filp->private_data;
+ void __user *argp = (void __user *)arg;
+ long r;
+
+ vcpu_load(vcpu);
+ switch (ioctl) {
case KVM_ENABLE_CAP:
{
struct kvm_enable_cap cap;
}
out:
+ vcpu_put(vcpu);
return r;
}
return r;
}
+#ifdef CONFIG_PPC_BOOK3S_64
+/*
+ * These functions check whether the underlying hardware is safe
+ * against attacks based on observing the effects of speculatively
+ * executed instructions, and whether it supplies instructions for
+ * use in workarounds. The information comes from firmware, either
+ * via the device tree on powernv platforms or from an hcall on
+ * pseries platforms.
+ */
+#ifdef CONFIG_PPC_PSERIES
+static int pseries_get_cpu_char(struct kvm_ppc_cpu_char *cp)
+{
+ struct h_cpu_char_result c;
+ unsigned long rc;
+
+ if (!machine_is(pseries))
+ return -ENOTTY;
+
+ rc = plpar_get_cpu_characteristics(&c);
+ if (rc == H_SUCCESS) {
+ cp->character = c.character;
+ cp->behaviour = c.behaviour;
+ cp->character_mask = KVM_PPC_CPU_CHAR_SPEC_BAR_ORI31 |
+ KVM_PPC_CPU_CHAR_BCCTRL_SERIALISED |
+ KVM_PPC_CPU_CHAR_L1D_FLUSH_ORI30 |
+ KVM_PPC_CPU_CHAR_L1D_FLUSH_TRIG2 |
+ KVM_PPC_CPU_CHAR_L1D_THREAD_PRIV |
+ KVM_PPC_CPU_CHAR_BR_HINT_HONOURED |
+ KVM_PPC_CPU_CHAR_MTTRIG_THR_RECONF |
+ KVM_PPC_CPU_CHAR_COUNT_CACHE_DIS;
+ cp->behaviour_mask = KVM_PPC_CPU_BEHAV_FAVOUR_SECURITY |
+ KVM_PPC_CPU_BEHAV_L1D_FLUSH_PR |
+ KVM_PPC_CPU_BEHAV_BNDS_CHK_SPEC_BAR;
+ }
+ return 0;
+}
+#else
+static int pseries_get_cpu_char(struct kvm_ppc_cpu_char *cp)
+{
+ return -ENOTTY;
+}
+#endif
+
+static inline bool have_fw_feat(struct device_node *fw_features,
+ const char *state, const char *name)
+{
+ struct device_node *np;
+ bool r = false;
+
+ np = of_get_child_by_name(fw_features, name);
+ if (np) {
+ r = of_property_read_bool(np, state);
+ of_node_put(np);
+ }
+ return r;
+}
+
+static int kvmppc_get_cpu_char(struct kvm_ppc_cpu_char *cp)
+{
+ struct device_node *np, *fw_features;
+ int r;
+
+ memset(cp, 0, sizeof(*cp));
+ r = pseries_get_cpu_char(cp);
+ if (r != -ENOTTY)
+ return r;
+
+ np = of_find_node_by_name(NULL, "ibm,opal");
+ if (np) {
+ fw_features = of_get_child_by_name(np, "fw-features");
+ of_node_put(np);
+ if (!fw_features)
+ return 0;
+ if (have_fw_feat(fw_features, "enabled",
+ "inst-spec-barrier-ori31,31,0"))
+ cp->character |= KVM_PPC_CPU_CHAR_SPEC_BAR_ORI31;
+ if (have_fw_feat(fw_features, "enabled",
+ "fw-bcctrl-serialized"))
+ cp->character |= KVM_PPC_CPU_CHAR_BCCTRL_SERIALISED;
+ if (have_fw_feat(fw_features, "enabled",
+ "inst-l1d-flush-ori30,30,0"))
+ cp->character |= KVM_PPC_CPU_CHAR_L1D_FLUSH_ORI30;
+ if (have_fw_feat(fw_features, "enabled",
+ "inst-l1d-flush-trig2"))
+ cp->character |= KVM_PPC_CPU_CHAR_L1D_FLUSH_TRIG2;
+ if (have_fw_feat(fw_features, "enabled",
+ "fw-l1d-thread-split"))
+ cp->character |= KVM_PPC_CPU_CHAR_L1D_THREAD_PRIV;
+ if (have_fw_feat(fw_features, "enabled",
+ "fw-count-cache-disabled"))
+ cp->character |= KVM_PPC_CPU_CHAR_COUNT_CACHE_DIS;
+ cp->character_mask = KVM_PPC_CPU_CHAR_SPEC_BAR_ORI31 |
+ KVM_PPC_CPU_CHAR_BCCTRL_SERIALISED |
+ KVM_PPC_CPU_CHAR_L1D_FLUSH_ORI30 |
+ KVM_PPC_CPU_CHAR_L1D_FLUSH_TRIG2 |
+ KVM_PPC_CPU_CHAR_L1D_THREAD_PRIV |
+ KVM_PPC_CPU_CHAR_COUNT_CACHE_DIS;
+
+ if (have_fw_feat(fw_features, "enabled",
+ "speculation-policy-favor-security"))
+ cp->behaviour |= KVM_PPC_CPU_BEHAV_FAVOUR_SECURITY;
+ if (!have_fw_feat(fw_features, "disabled",
+ "needs-l1d-flush-msr-pr-0-to-1"))
+ cp->behaviour |= KVM_PPC_CPU_BEHAV_L1D_FLUSH_PR;
+ if (!have_fw_feat(fw_features, "disabled",
+ "needs-spec-barrier-for-bound-checks"))
+ cp->behaviour |= KVM_PPC_CPU_BEHAV_BNDS_CHK_SPEC_BAR;
+ cp->behaviour_mask = KVM_PPC_CPU_BEHAV_FAVOUR_SECURITY |
+ KVM_PPC_CPU_BEHAV_L1D_FLUSH_PR |
+ KVM_PPC_CPU_BEHAV_BNDS_CHK_SPEC_BAR;
+
+ of_node_put(fw_features);
+ }
+
+ return 0;
+}
+#endif
+
long kvm_arch_vm_ioctl(struct file *filp,
unsigned int ioctl, unsigned long arg)
{
r = -EFAULT;
break;
}
+ case KVM_PPC_GET_CPU_CHAR: {
+ struct kvm_ppc_cpu_char cpuchar;
+
+ r = kvmppc_get_cpu_char(&cpuchar);
+ if (r >= 0 && copy_to_user(argp, &cpuchar, sizeof(cpuchar)))
+ r = -EFAULT;
+ break;
+ }
default: {
struct kvm *kvm = filp->private_data;
r = kvm->arch.kvm_ops->arch_vm_ioctl(filp, ioctl, arg);