KVM: x86: Virtualize FLUSH_L1D and passthrough MSR_IA32_FLUSH_CMD
arch/x86/kvm/x86.c
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Kernel-based Virtual Machine driver for Linux
 *
 * derived from drivers/kvm/kvm_main.c
 *
 * Copyright (C) 2006 Qumranet, Inc.
 * Copyright (C) 2008 Qumranet, Inc.
 * Copyright IBM Corporation, 2008
 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
 *
 * Authors:
 *   Avi Kivity   <avi@qumranet.com>
 *   Yaniv Kamay  <yaniv@qumranet.com>
 *   Amit Shah    <amit.shah@qumranet.com>
 *   Ben-Ami Yassour <benami@il.ibm.com>
 */
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/kvm_host.h>
#include "irq.h"
#include "ioapic.h"
#include "mmu.h"
#include "i8254.h"
#include "tss.h"
#include "kvm_cache_regs.h"
#include "kvm_emulate.h"
#include "x86.h"
#include "cpuid.h"
#include "pmu.h"
#include "hyperv.h"
#include "lapic.h"
#include "xen.h"
#include "smm.h"

#include <linux/clocksource.h>
#include <linux/interrupt.h>
#include <linux/kvm.h>
#include <linux/fs.h>
#include <linux/vmalloc.h>
#include <linux/export.h>
#include <linux/moduleparam.h>
#include <linux/mman.h>
#include <linux/highmem.h>
#include <linux/iommu.h>
#include <linux/cpufreq.h>
#include <linux/user-return-notifier.h>
#include <linux/srcu.h>
#include <linux/slab.h>
#include <linux/perf_event.h>
#include <linux/uaccess.h>
#include <linux/hash.h>
#include <linux/pci.h>
#include <linux/timekeeper_internal.h>
#include <linux/pvclock_gtod.h>
#include <linux/kvm_irqfd.h>
#include <linux/irqbypass.h>
#include <linux/sched/stat.h>
#include <linux/sched/isolation.h>
#include <linux/mem_encrypt.h>
#include <linux/entry-kvm.h>
#include <linux/suspend.h>

#include <trace/events/kvm.h>

#include <asm/debugreg.h>
#include <asm/msr.h>
#include <asm/desc.h>
#include <asm/mce.h>
#include <asm/pkru.h>
#include <linux/kernel_stat.h>
#include <asm/fpu/api.h>
#include <asm/fpu/xcr.h>
#include <asm/fpu/xstate.h>
#include <asm/pvclock.h>
#include <asm/div64.h>
#include <asm/irq_remapping.h>
#include <asm/mshyperv.h>
#include <asm/hypervisor.h>
#include <asm/tlbflush.h>
#include <asm/intel_pt.h>
#include <asm/emulate_prefix.h>
#include <asm/sgx.h>
#include <clocksource/hyperv_timer.h>

#define CREATE_TRACE_POINTS
#include "trace.h"

#define MAX_IO_MSRS 256
#define KVM_MAX_MCE_BANKS 32

struct kvm_caps kvm_caps __read_mostly = {
	.supported_mce_cap = MCG_CTL_P | MCG_SER_P,
};
EXPORT_SYMBOL_GPL(kvm_caps);

#define ERR_PTR_USR(e) ((void __user *)ERR_PTR(e))

#define emul_to_vcpu(ctxt) \
	((struct kvm_vcpu *)(ctxt)->vcpu)

/* EFER defaults:
 * - enable syscall per default because it's emulated by KVM
 * - enable LME and LMA per default on 64 bit KVM
 */
#ifdef CONFIG_X86_64
static
u64 __read_mostly efer_reserved_bits = ~((u64)(EFER_SCE | EFER_LME | EFER_LMA));
#else
static u64 __read_mostly efer_reserved_bits = ~((u64)EFER_SCE);
#endif

static u64 __read_mostly cr4_reserved_bits = CR4_RESERVED_BITS;

#define KVM_EXIT_HYPERCALL_VALID_MASK (1 << KVM_HC_MAP_GPA_RANGE)

#define KVM_CAP_PMU_VALID_MASK KVM_PMU_CAP_DISABLE

#define KVM_X2APIC_API_VALID_FLAGS (KVM_X2APIC_API_USE_32BIT_IDS | \
				    KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK)

static void update_cr8_intercept(struct kvm_vcpu *vcpu);
static void process_nmi(struct kvm_vcpu *vcpu);
static void __kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags);
static void store_regs(struct kvm_vcpu *vcpu);
static int sync_regs(struct kvm_vcpu *vcpu);
static int kvm_vcpu_do_singlestep(struct kvm_vcpu *vcpu);

static int __set_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2);
static void __get_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2);

static DEFINE_MUTEX(vendor_module_lock);
struct kvm_x86_ops kvm_x86_ops __read_mostly;

#define KVM_X86_OP(func)					     \
	DEFINE_STATIC_CALL_NULL(kvm_x86_##func,			     \
				*(((struct kvm_x86_ops *)0)->func));
#define KVM_X86_OP_OPTIONAL KVM_X86_OP
#define KVM_X86_OP_OPTIONAL_RET0 KVM_X86_OP
#include <asm/kvm-x86-ops.h>
EXPORT_STATIC_CALL_GPL(kvm_x86_get_cs_db_l_bits);
EXPORT_STATIC_CALL_GPL(kvm_x86_cache_reg);

static bool __read_mostly ignore_msrs = 0;
module_param(ignore_msrs, bool, S_IRUGO | S_IWUSR);

bool __read_mostly report_ignored_msrs = true;
module_param(report_ignored_msrs, bool, S_IRUGO | S_IWUSR);
EXPORT_SYMBOL_GPL(report_ignored_msrs);

unsigned int min_timer_period_us = 200;
module_param(min_timer_period_us, uint, S_IRUGO | S_IWUSR);

static bool __read_mostly kvmclock_periodic_sync = true;
module_param(kvmclock_periodic_sync, bool, S_IRUGO);

/* tsc tolerance in parts per million - default to 1/2 of the NTP threshold */
static u32 __read_mostly tsc_tolerance_ppm = 250;
module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR);

/*
 * lapic timer advance (tscdeadline mode only) in nanoseconds.  '-1' enables
 * adaptive tuning starting from default advancement of 1000ns.  '0' disables
 * advancement entirely.  Any other value is used as-is and disables adaptive
 * tuning, i.e. allows privileged userspace to set an exact advancement time.
 */
static int __read_mostly lapic_timer_advance_ns = -1;
module_param(lapic_timer_advance_ns, int, S_IRUGO | S_IWUSR);

static bool __read_mostly vector_hashing = true;
module_param(vector_hashing, bool, S_IRUGO);

bool __read_mostly enable_vmware_backdoor = false;
module_param(enable_vmware_backdoor, bool, S_IRUGO);
EXPORT_SYMBOL_GPL(enable_vmware_backdoor);

/*
 * Flags to manipulate forced emulation behavior (any non-zero value will
 * enable forced emulation).
 */
#define KVM_FEP_CLEAR_RFLAGS_RF	BIT(1)
static int __read_mostly force_emulation_prefix;
module_param(force_emulation_prefix, int, 0644);

int __read_mostly pi_inject_timer = -1;
module_param(pi_inject_timer, bint, S_IRUGO | S_IWUSR);

/* Enable/disable PMU virtualization */
bool __read_mostly enable_pmu = true;
EXPORT_SYMBOL_GPL(enable_pmu);
module_param(enable_pmu, bool, 0444);

bool __read_mostly eager_page_split = true;
module_param(eager_page_split, bool, 0644);

/* Enable/disable SMT_RSB bug mitigation */
bool __read_mostly mitigate_smt_rsb;
module_param(mitigate_smt_rsb, bool, 0444);

/*
 * Restoring the host value for MSRs that are only consumed when running in
 * usermode, e.g. SYSCALL MSRs and TSC_AUX, can be deferred until the CPU
 * returns to userspace, i.e. the kernel can run with the guest's value.
 */
#define KVM_MAX_NR_USER_RETURN_MSRS 16

struct kvm_user_return_msrs {
	struct user_return_notifier urn;
	bool registered;
	struct kvm_user_return_msr_values {
		u64 host;
		u64 curr;
	} values[KVM_MAX_NR_USER_RETURN_MSRS];
};

u32 __read_mostly kvm_nr_uret_msrs;
EXPORT_SYMBOL_GPL(kvm_nr_uret_msrs);
static u32 __read_mostly kvm_uret_msrs_list[KVM_MAX_NR_USER_RETURN_MSRS];
static struct kvm_user_return_msrs __percpu *user_return_msrs;

#define KVM_SUPPORTED_XCR0	(XFEATURE_MASK_FP | XFEATURE_MASK_SSE \
				| XFEATURE_MASK_YMM | XFEATURE_MASK_BNDREGS \
				| XFEATURE_MASK_BNDCSR | XFEATURE_MASK_AVX512 \
				| XFEATURE_MASK_PKRU | XFEATURE_MASK_XTILE)

u64 __read_mostly host_efer;
EXPORT_SYMBOL_GPL(host_efer);

bool __read_mostly allow_smaller_maxphyaddr = 0;
EXPORT_SYMBOL_GPL(allow_smaller_maxphyaddr);

bool __read_mostly enable_apicv = true;
EXPORT_SYMBOL_GPL(enable_apicv);

u64 __read_mostly host_xss;
EXPORT_SYMBOL_GPL(host_xss);

const struct _kvm_stats_desc kvm_vm_stats_desc[] = {
	KVM_GENERIC_VM_STATS(),
	STATS_DESC_COUNTER(VM, mmu_shadow_zapped),
	STATS_DESC_COUNTER(VM, mmu_pte_write),
	STATS_DESC_COUNTER(VM, mmu_pde_zapped),
	STATS_DESC_COUNTER(VM, mmu_flooded),
	STATS_DESC_COUNTER(VM, mmu_recycled),
	STATS_DESC_COUNTER(VM, mmu_cache_miss),
	STATS_DESC_ICOUNTER(VM, mmu_unsync),
	STATS_DESC_ICOUNTER(VM, pages_4k),
	STATS_DESC_ICOUNTER(VM, pages_2m),
	STATS_DESC_ICOUNTER(VM, pages_1g),
	STATS_DESC_ICOUNTER(VM, nx_lpage_splits),
	STATS_DESC_PCOUNTER(VM, max_mmu_rmap_size),
	STATS_DESC_PCOUNTER(VM, max_mmu_page_hash_collisions)
};

const struct kvm_stats_header kvm_vm_stats_header = {
	.name_size = KVM_STATS_NAME_SIZE,
	.num_desc = ARRAY_SIZE(kvm_vm_stats_desc),
	.id_offset = sizeof(struct kvm_stats_header),
	.desc_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE,
	.data_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE +
		       sizeof(kvm_vm_stats_desc),
};

const struct _kvm_stats_desc kvm_vcpu_stats_desc[] = {
	KVM_GENERIC_VCPU_STATS(),
	STATS_DESC_COUNTER(VCPU, pf_taken),
	STATS_DESC_COUNTER(VCPU, pf_fixed),
	STATS_DESC_COUNTER(VCPU, pf_emulate),
	STATS_DESC_COUNTER(VCPU, pf_spurious),
	STATS_DESC_COUNTER(VCPU, pf_fast),
	STATS_DESC_COUNTER(VCPU, pf_mmio_spte_created),
	STATS_DESC_COUNTER(VCPU, pf_guest),
	STATS_DESC_COUNTER(VCPU, tlb_flush),
	STATS_DESC_COUNTER(VCPU, invlpg),
	STATS_DESC_COUNTER(VCPU, exits),
	STATS_DESC_COUNTER(VCPU, io_exits),
	STATS_DESC_COUNTER(VCPU, mmio_exits),
	STATS_DESC_COUNTER(VCPU, signal_exits),
	STATS_DESC_COUNTER(VCPU, irq_window_exits),
	STATS_DESC_COUNTER(VCPU, nmi_window_exits),
	STATS_DESC_COUNTER(VCPU, l1d_flush),
	STATS_DESC_COUNTER(VCPU, halt_exits),
	STATS_DESC_COUNTER(VCPU, request_irq_exits),
	STATS_DESC_COUNTER(VCPU, irq_exits),
	STATS_DESC_COUNTER(VCPU, host_state_reload),
	STATS_DESC_COUNTER(VCPU, fpu_reload),
	STATS_DESC_COUNTER(VCPU, insn_emulation),
	STATS_DESC_COUNTER(VCPU, insn_emulation_fail),
	STATS_DESC_COUNTER(VCPU, hypercalls),
	STATS_DESC_COUNTER(VCPU, irq_injections),
	STATS_DESC_COUNTER(VCPU, nmi_injections),
	STATS_DESC_COUNTER(VCPU, req_event),
	STATS_DESC_COUNTER(VCPU, nested_run),
	STATS_DESC_COUNTER(VCPU, directed_yield_attempted),
	STATS_DESC_COUNTER(VCPU, directed_yield_successful),
	STATS_DESC_COUNTER(VCPU, preemption_reported),
	STATS_DESC_COUNTER(VCPU, preemption_other),
	STATS_DESC_IBOOLEAN(VCPU, guest_mode),
	STATS_DESC_COUNTER(VCPU, notify_window_exits),
};

const struct kvm_stats_header kvm_vcpu_stats_header = {
	.name_size = KVM_STATS_NAME_SIZE,
	.num_desc = ARRAY_SIZE(kvm_vcpu_stats_desc),
	.id_offset = sizeof(struct kvm_stats_header),
	.desc_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE,
	.data_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE +
		       sizeof(kvm_vcpu_stats_desc),
};

u64 __read_mostly host_xcr0;

static struct kmem_cache *x86_emulator_cache;

/*
 * When called, it means the previous get/set msr reached an invalid msr.
 * Return true if we want to ignore/silence this failed msr access.
 */
static bool kvm_msr_ignored_check(u32 msr, u64 data, bool write)
{
	const char *op = write ? "wrmsr" : "rdmsr";

	if (ignore_msrs) {
		if (report_ignored_msrs)
			kvm_pr_unimpl("ignored %s: 0x%x data 0x%llx\n",
				      op, msr, data);
		/* Mask the error */
		return true;
	} else {
		kvm_debug_ratelimited("unhandled %s: 0x%x data 0x%llx\n",
				      op, msr, data);
		return false;
	}
}

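/*
 * Illustrative note (a sketch, not taken from this file): ignore_msrs and
 * report_ignored_msrs are ordinary module parameters, so on a typical host
 * they can be toggled at runtime without rebuilding KVM, e.g.:
 *
 *	echo 1 > /sys/module/kvm/parameters/ignore_msrs
 *	echo N > /sys/module/kvm/parameters/report_ignored_msrs
 *
 * makes unhandled guest MSR accesses read as 0 / discard writes, optionally
 * without logging.  With the default ignore_msrs=0, kvm_msr_ignored_check()
 * returns false and the unhandled access is reported to the guest as a #GP.
 */
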
static struct kmem_cache *kvm_alloc_emulator_cache(void)
{
	unsigned int useroffset = offsetof(struct x86_emulate_ctxt, src);
	unsigned int size = sizeof(struct x86_emulate_ctxt);

	return kmem_cache_create_usercopy("x86_emulator", size,
					  __alignof__(struct x86_emulate_ctxt),
					  SLAB_ACCOUNT, useroffset,
					  size - useroffset, NULL);
}

static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt);

static inline void kvm_async_pf_hash_reset(struct kvm_vcpu *vcpu)
{
	int i;
	for (i = 0; i < ASYNC_PF_PER_VCPU; i++)
		vcpu->arch.apf.gfns[i] = ~0;
}

static void kvm_on_user_return(struct user_return_notifier *urn)
{
	unsigned slot;
	struct kvm_user_return_msrs *msrs
		= container_of(urn, struct kvm_user_return_msrs, urn);
	struct kvm_user_return_msr_values *values;
	unsigned long flags;

	/*
	 * Disabling irqs at this point since the following code could be
	 * interrupted and executed through kvm_arch_hardware_disable()
	 */
	local_irq_save(flags);
	if (msrs->registered) {
		msrs->registered = false;
		user_return_notifier_unregister(urn);
	}
	local_irq_restore(flags);
	for (slot = 0; slot < kvm_nr_uret_msrs; ++slot) {
		values = &msrs->values[slot];
		if (values->host != values->curr) {
			wrmsrl(kvm_uret_msrs_list[slot], values->host);
			values->curr = values->host;
		}
	}
}

static int kvm_probe_user_return_msr(u32 msr)
{
	u64 val;
	int ret;

	preempt_disable();
	ret = rdmsrl_safe(msr, &val);
	if (ret)
		goto out;
	ret = wrmsrl_safe(msr, val);
out:
	preempt_enable();
	return ret;
}

int kvm_add_user_return_msr(u32 msr)
{
	BUG_ON(kvm_nr_uret_msrs >= KVM_MAX_NR_USER_RETURN_MSRS);

	if (kvm_probe_user_return_msr(msr))
		return -1;

	kvm_uret_msrs_list[kvm_nr_uret_msrs] = msr;
	return kvm_nr_uret_msrs++;
}
EXPORT_SYMBOL_GPL(kvm_add_user_return_msr);

int kvm_find_user_return_msr(u32 msr)
{
	int i;

	for (i = 0; i < kvm_nr_uret_msrs; ++i) {
		if (kvm_uret_msrs_list[i] == msr)
			return i;
	}
	return -1;
}
EXPORT_SYMBOL_GPL(kvm_find_user_return_msr);

static void kvm_user_return_msr_cpu_online(void)
{
	unsigned int cpu = smp_processor_id();
	struct kvm_user_return_msrs *msrs = per_cpu_ptr(user_return_msrs, cpu);
	u64 value;
	int i;

	for (i = 0; i < kvm_nr_uret_msrs; ++i) {
		rdmsrl_safe(kvm_uret_msrs_list[i], &value);
		msrs->values[i].host = value;
		msrs->values[i].curr = value;
	}
}

int kvm_set_user_return_msr(unsigned slot, u64 value, u64 mask)
{
	unsigned int cpu = smp_processor_id();
	struct kvm_user_return_msrs *msrs = per_cpu_ptr(user_return_msrs, cpu);
	int err;

	value = (value & mask) | (msrs->values[slot].host & ~mask);
	if (value == msrs->values[slot].curr)
		return 0;
	err = wrmsrl_safe(kvm_uret_msrs_list[slot], value);
	if (err)
		return 1;

	msrs->values[slot].curr = value;
	if (!msrs->registered) {
		msrs->urn.on_user_return = kvm_on_user_return;
		user_return_notifier_register(&msrs->urn);
		msrs->registered = true;
	}
	return 0;
}
EXPORT_SYMBOL_GPL(kvm_set_user_return_msr);

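/*
 * Illustrative usage sketch (hypothetical variable names, not code from this
 * file): a vendor module registers the MSRs it wants lazily restored once at
 * hardware setup time, then switches to the guest value while a vCPU runs:
 *
 *	// setup: returns the slot index, or -1 if the MSR can't be accessed
 *	int slot = kvm_add_user_return_msr(MSR_TSC_AUX);
 *
 *	// vCPU run path: only the masked bits take the guest value
 *	kvm_set_user_return_msr(slot, guest_val, -1ull);
 *
 * The host value is written back by kvm_on_user_return() the next time the
 * CPU returns to userspace (or by drop_user_return_notifiers() below on
 * hardware disable), so the WRMSR is not repeated on every VM-exit.
 */
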
static void drop_user_return_notifiers(void)
{
	unsigned int cpu = smp_processor_id();
	struct kvm_user_return_msrs *msrs = per_cpu_ptr(user_return_msrs, cpu);

	if (msrs->registered)
		kvm_on_user_return(&msrs->urn);
}

u64 kvm_get_apic_base(struct kvm_vcpu *vcpu)
{
	return vcpu->arch.apic_base;
}

enum lapic_mode kvm_get_apic_mode(struct kvm_vcpu *vcpu)
{
	return kvm_apic_mode(kvm_get_apic_base(vcpu));
}
EXPORT_SYMBOL_GPL(kvm_get_apic_mode);

int kvm_set_apic_base(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
{
	enum lapic_mode old_mode = kvm_get_apic_mode(vcpu);
	enum lapic_mode new_mode = kvm_apic_mode(msr_info->data);
	u64 reserved_bits = kvm_vcpu_reserved_gpa_bits_raw(vcpu) | 0x2ff |
		(guest_cpuid_has(vcpu, X86_FEATURE_X2APIC) ? 0 : X2APIC_ENABLE);

	if ((msr_info->data & reserved_bits) != 0 || new_mode == LAPIC_MODE_INVALID)
		return 1;
	if (!msr_info->host_initiated) {
		if (old_mode == LAPIC_MODE_X2APIC && new_mode == LAPIC_MODE_XAPIC)
			return 1;
		if (old_mode == LAPIC_MODE_DISABLED && new_mode == LAPIC_MODE_X2APIC)
			return 1;
	}

	kvm_lapic_set_base(vcpu, msr_info->data);
	kvm_recalculate_apic_map(vcpu->kvm);
	return 0;
}

/*
 * Handle a fault on a hardware virtualization (VMX or SVM) instruction.
 *
 * Hardware virtualization extension instructions may fault if a reboot turns
 * off virtualization while processes are running.  Usually after catching the
 * fault we just panic; during reboot instead the instruction is ignored.
 */
noinstr void kvm_spurious_fault(void)
{
	/* Fault while not rebooting.  We want the trace. */
	BUG_ON(!kvm_rebooting);
}
EXPORT_SYMBOL_GPL(kvm_spurious_fault);

#define EXCPT_BENIGN		0
#define EXCPT_CONTRIBUTORY	1
#define EXCPT_PF		2

static int exception_class(int vector)
{
	switch (vector) {
	case PF_VECTOR:
		return EXCPT_PF;
	case DE_VECTOR:
	case TS_VECTOR:
	case NP_VECTOR:
	case SS_VECTOR:
	case GP_VECTOR:
		return EXCPT_CONTRIBUTORY;
	default:
		break;
	}
	return EXCPT_BENIGN;
}

#define EXCPT_FAULT		0
#define EXCPT_TRAP		1
#define EXCPT_ABORT		2
#define EXCPT_INTERRUPT		3
#define EXCPT_DB		4

static int exception_type(int vector)
{
	unsigned int mask;

	if (WARN_ON(vector > 31 || vector == NMI_VECTOR))
		return EXCPT_INTERRUPT;

	mask = 1 << vector;

	/*
	 * #DBs can be trap-like or fault-like, the caller must check other CPU
	 * state, e.g. DR6, to determine whether a #DB is a trap or fault.
	 */
	if (mask & (1 << DB_VECTOR))
		return EXCPT_DB;

	if (mask & ((1 << BP_VECTOR) | (1 << OF_VECTOR)))
		return EXCPT_TRAP;

	if (mask & ((1 << DF_VECTOR) | (1 << MC_VECTOR)))
		return EXCPT_ABORT;

	/* Reserved exceptions will result in fault */
	return EXCPT_FAULT;
}

void kvm_deliver_exception_payload(struct kvm_vcpu *vcpu,
				   struct kvm_queued_exception *ex)
{
	if (!ex->has_payload)
		return;

	switch (ex->vector) {
	case DB_VECTOR:
		/*
		 * "Certain debug exceptions may clear bit 0-3.  The
		 * remaining contents of the DR6 register are never
		 * cleared by the processor".
		 */
		vcpu->arch.dr6 &= ~DR_TRAP_BITS;
		/*
		 * In order to reflect the #DB exception payload in guest
		 * dr6, three components need to be considered: active low
		 * bit, FIXED_1 bits and active high bits (e.g. DR6_BD,
		 * DR6_BS and DR6_BT)
		 * DR6_ACTIVE_LOW contains the FIXED_1 and active low bits.
		 * In the target guest dr6:
		 * FIXED_1 bits should always be set.
		 * Active low bits should be cleared if 1-setting in payload.
		 * Active high bits should be set if 1-setting in payload.
		 *
		 * Note, the payload is compatible with the pending debug
		 * exceptions/exit qualification under VMX, that active_low bits
		 * are active high in payload.
		 * So they need to be flipped for DR6.
		 */
		vcpu->arch.dr6 |= DR6_ACTIVE_LOW;
		vcpu->arch.dr6 |= ex->payload;
		vcpu->arch.dr6 ^= ex->payload & DR6_ACTIVE_LOW;

		/*
		 * The #DB payload is defined as compatible with the 'pending
		 * debug exceptions' field under VMX, not DR6.  While bit 12 is
		 * defined in the 'pending debug exceptions' field (enabled
		 * breakpoint), it is reserved and must be zero in DR6.
		 */
		vcpu->arch.dr6 &= ~BIT(12);
		break;
	case PF_VECTOR:
		vcpu->arch.cr2 = ex->payload;
		break;
	}

	ex->has_payload = false;
	ex->payload = 0;
}
EXPORT_SYMBOL_GPL(kvm_deliver_exception_payload);

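/*
 * Worked example (illustrative only): for a single-step trap the payload is
 * DR6_BS, expressed in the VMX pending-debug-exceptions format where the
 * active-low bits are written as "1 = asserted".  The sequence above then is:
 *
 *	dr6 &= ~DR_TRAP_BITS;			// drop stale B0-B3
 *	dr6 |= DR6_ACTIVE_LOW;			// FIXED_1 + active-low bits set
 *	dr6 |= payload;				// sets DR6_BS
 *	dr6 ^= payload & DR6_ACTIVE_LOW;	// flips any active-low payload bits
 *
 * so DR6_BS stays set, while a payload bit in an active-low position (e.g.
 * DR6_RTM) ends up cleared in the architectural DR6 seen by the guest.
 */
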
static void kvm_queue_exception_vmexit(struct kvm_vcpu *vcpu, unsigned int vector,
				       bool has_error_code, u32 error_code,
				       bool has_payload, unsigned long payload)
{
	struct kvm_queued_exception *ex = &vcpu->arch.exception_vmexit;

	ex->vector = vector;
	ex->injected = false;
	ex->pending = true;
	ex->has_error_code = has_error_code;
	ex->error_code = error_code;
	ex->has_payload = has_payload;
	ex->payload = payload;
}

/* Forcibly leave the nested mode in cases like a vCPU reset */
static void kvm_leave_nested(struct kvm_vcpu *vcpu)
{
	kvm_x86_ops.nested_ops->leave_nested(vcpu);
}

static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
		unsigned nr, bool has_error, u32 error_code,
		bool has_payload, unsigned long payload, bool reinject)
{
	u32 prev_nr;
	int class1, class2;

	kvm_make_request(KVM_REQ_EVENT, vcpu);

	/*
	 * If the exception is destined for L2 and isn't being reinjected,
	 * morph it to a VM-Exit if L1 wants to intercept the exception.  A
	 * previously injected exception is not checked because it was checked
	 * when it was original queued, and re-checking is incorrect if _L1_
	 * injected the exception, in which case it's exempt from interception.
	 */
	if (!reinject && is_guest_mode(vcpu) &&
	    kvm_x86_ops.nested_ops->is_exception_vmexit(vcpu, nr, error_code)) {
		kvm_queue_exception_vmexit(vcpu, nr, has_error, error_code,
					   has_payload, payload);
		return;
	}

	if (!vcpu->arch.exception.pending && !vcpu->arch.exception.injected) {
	queue:
		if (reinject) {
			/*
			 * On VM-Entry, an exception can be pending if and only
			 * if event injection was blocked by nested_run_pending.
			 * In that case, however, vcpu_enter_guest() requests an
			 * immediate exit, and the guest shouldn't proceed far
			 * enough to need reinjection.
			 */
			WARN_ON_ONCE(kvm_is_exception_pending(vcpu));
			vcpu->arch.exception.injected = true;
			if (WARN_ON_ONCE(has_payload)) {
				/*
				 * A reinjected event has already
				 * delivered its payload.
				 */
				has_payload = false;
				payload = 0;
			}
		} else {
			vcpu->arch.exception.pending = true;
			vcpu->arch.exception.injected = false;
		}
		vcpu->arch.exception.has_error_code = has_error;
		vcpu->arch.exception.vector = nr;
		vcpu->arch.exception.error_code = error_code;
		vcpu->arch.exception.has_payload = has_payload;
		vcpu->arch.exception.payload = payload;
		if (!is_guest_mode(vcpu))
			kvm_deliver_exception_payload(vcpu,
						      &vcpu->arch.exception);
		return;
	}

	/* to check exception */
	prev_nr = vcpu->arch.exception.vector;
	if (prev_nr == DF_VECTOR) {
		/* triple fault -> shutdown */
		kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
		return;
	}
	class1 = exception_class(prev_nr);
	class2 = exception_class(nr);
	if ((class1 == EXCPT_CONTRIBUTORY && class2 == EXCPT_CONTRIBUTORY) ||
	    (class1 == EXCPT_PF && class2 != EXCPT_BENIGN)) {
		/*
		 * Synthesize #DF.  Clear the previously injected or pending
		 * exception so as not to incorrectly trigger shutdown.
		 */
		vcpu->arch.exception.injected = false;
		vcpu->arch.exception.pending = false;

		kvm_queue_exception_e(vcpu, DF_VECTOR, 0);
	} else {
		/* replace previous exception with a new one in a hope
		   that instruction re-execution will regenerate lost
		   exception */
		goto queue;
	}
}

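/*
 * Illustrative combinations of the class checks above (they mirror the SDM's
 * double-fault table):
 *
 *	#GP while delivering #PF -> PF + contributory   -> synthesized #DF
 *	#PF while delivering #PF -> PF + non-benign     -> synthesized #DF
 *	#DB while delivering #GP -> benign second event -> new exception is
 *						            queued in place of the old one
 *	any exception while delivering #DF              -> KVM_REQ_TRIPLE_FAULT
 */
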
void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr)
{
	kvm_multiple_exception(vcpu, nr, false, 0, false, 0, false);
}
EXPORT_SYMBOL_GPL(kvm_queue_exception);

void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr)
{
	kvm_multiple_exception(vcpu, nr, false, 0, false, 0, true);
}
EXPORT_SYMBOL_GPL(kvm_requeue_exception);

void kvm_queue_exception_p(struct kvm_vcpu *vcpu, unsigned nr,
			   unsigned long payload)
{
	kvm_multiple_exception(vcpu, nr, false, 0, true, payload, false);
}
EXPORT_SYMBOL_GPL(kvm_queue_exception_p);

static void kvm_queue_exception_e_p(struct kvm_vcpu *vcpu, unsigned nr,
				    u32 error_code, unsigned long payload)
{
	kvm_multiple_exception(vcpu, nr, true, error_code,
			       true, payload, false);
}

int kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err)
{
	if (err)
		kvm_inject_gp(vcpu, 0);
	else
		return kvm_skip_emulated_instruction(vcpu);

	return 1;
}
EXPORT_SYMBOL_GPL(kvm_complete_insn_gp);

static int complete_emulated_insn_gp(struct kvm_vcpu *vcpu, int err)
{
	if (err) {
		kvm_inject_gp(vcpu, 0);
		return 1;
	}

	return kvm_emulate_instruction(vcpu, EMULTYPE_NO_DECODE | EMULTYPE_SKIP |
				       EMULTYPE_COMPLETE_USER_EXIT);
}

void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
{
	++vcpu->stat.pf_guest;

	/*
	 * Async #PF in L2 is always forwarded to L1 as a VM-Exit regardless of
	 * whether or not L1 wants to intercept "regular" #PF.
	 */
	if (is_guest_mode(vcpu) && fault->async_page_fault)
		kvm_queue_exception_vmexit(vcpu, PF_VECTOR,
					   true, fault->error_code,
					   true, fault->address);
	else
		kvm_queue_exception_e_p(vcpu, PF_VECTOR, fault->error_code,
					fault->address);
}

void kvm_inject_emulated_page_fault(struct kvm_vcpu *vcpu,
				    struct x86_exception *fault)
{
	struct kvm_mmu *fault_mmu;
	WARN_ON_ONCE(fault->vector != PF_VECTOR);

	fault_mmu = fault->nested_page_fault ? vcpu->arch.mmu :
					       vcpu->arch.walk_mmu;

	/*
	 * Invalidate the TLB entry for the faulting address, if it exists,
	 * else the access will fault indefinitely (and to emulate hardware).
	 */
	if ((fault->error_code & PFERR_PRESENT_MASK) &&
	    !(fault->error_code & PFERR_RSVD_MASK))
		kvm_mmu_invalidate_gva(vcpu, fault_mmu, fault->address,
				       fault_mmu->root.hpa);

	fault_mmu->inject_page_fault(vcpu, fault);
}
EXPORT_SYMBOL_GPL(kvm_inject_emulated_page_fault);

void kvm_inject_nmi(struct kvm_vcpu *vcpu)
{
	atomic_inc(&vcpu->arch.nmi_queued);
	kvm_make_request(KVM_REQ_NMI, vcpu);
}

void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
{
	kvm_multiple_exception(vcpu, nr, true, error_code, false, 0, false);
}
EXPORT_SYMBOL_GPL(kvm_queue_exception_e);

void kvm_requeue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
{
	kvm_multiple_exception(vcpu, nr, true, error_code, false, 0, true);
}
EXPORT_SYMBOL_GPL(kvm_requeue_exception_e);

/*
 * Checks if cpl <= required_cpl; if true, return true.  Otherwise queue
 * a #GP and return false.
 */
bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl)
{
	if (static_call(kvm_x86_get_cpl)(vcpu) <= required_cpl)
		return true;
	kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
	return false;
}

bool kvm_require_dr(struct kvm_vcpu *vcpu, int dr)
{
	if ((dr != 4 && dr != 5) || !kvm_read_cr4_bits(vcpu, X86_CR4_DE))
		return true;

	kvm_queue_exception(vcpu, UD_VECTOR);
	return false;
}
EXPORT_SYMBOL_GPL(kvm_require_dr);

static inline u64 pdptr_rsvd_bits(struct kvm_vcpu *vcpu)
{
	return vcpu->arch.reserved_gpa_bits | rsvd_bits(5, 8) | rsvd_bits(1, 2);
}

/*
 * Load the pae pdptrs.  Return 1 if they are all valid, 0 otherwise.
 */
int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
{
	struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
	gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT;
	gpa_t real_gpa;
	int i;
	int ret;
	u64 pdpte[ARRAY_SIZE(mmu->pdptrs)];

	/*
	 * If the MMU is nested, CR3 holds an L2 GPA and needs to be translated
	 * to an L1 GPA.
	 */
	real_gpa = kvm_translate_gpa(vcpu, mmu, gfn_to_gpa(pdpt_gfn),
				     PFERR_USER_MASK | PFERR_WRITE_MASK, NULL);
	if (real_gpa == INVALID_GPA)
		return 0;

	/* Note the offset, PDPTRs are 32 byte aligned when using PAE paging. */
	ret = kvm_vcpu_read_guest_page(vcpu, gpa_to_gfn(real_gpa), pdpte,
				       cr3 & GENMASK(11, 5), sizeof(pdpte));
	if (ret < 0)
		return 0;

	for (i = 0; i < ARRAY_SIZE(pdpte); ++i) {
		if ((pdpte[i] & PT_PRESENT_MASK) &&
		    (pdpte[i] & pdptr_rsvd_bits(vcpu))) {
			return 0;
		}
	}

	/*
	 * Marking VCPU_EXREG_PDPTR dirty doesn't work for !tdp_enabled.
	 * Shadow page roots need to be reconstructed instead.
	 */
	if (!tdp_enabled && memcmp(mmu->pdptrs, pdpte, sizeof(mmu->pdptrs)))
		kvm_mmu_free_roots(vcpu->kvm, mmu, KVM_MMU_ROOT_CURRENT);

	memcpy(mmu->pdptrs, pdpte, sizeof(mmu->pdptrs));
	kvm_register_mark_dirty(vcpu, VCPU_EXREG_PDPTR);
	kvm_make_request(KVM_REQ_LOAD_MMU_PGD, vcpu);
	vcpu->arch.pdptrs_from_userspace = false;

	return 1;
}
EXPORT_SYMBOL_GPL(load_pdptrs);

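/*
 * Illustrative layout note (values are hypothetical): with PAE paging, CR3
 * points at a 32-byte-aligned table of four 64-bit PDPTEs, so the read above
 * fetches 4 * 8 = 32 bytes at offset (cr3 & GENMASK(11, 5)) within the page.
 * E.g. cr3 = 0x12345e0 selects the PDPTE block at offset 0x5e0 of GFN 0x1234;
 * each present entry is then checked against pdptr_rsvd_bits() before being
 * cached in mmu->pdptrs[].
 */
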
void kvm_post_set_cr0(struct kvm_vcpu *vcpu, unsigned long old_cr0, unsigned long cr0)
{
	if ((cr0 ^ old_cr0) & X86_CR0_PG) {
		kvm_clear_async_pf_completion_queue(vcpu);
		kvm_async_pf_hash_reset(vcpu);

		/*
		 * Clearing CR0.PG is defined to flush the TLB from the guest's
		 * perspective.
		 */
		if (!(cr0 & X86_CR0_PG))
			kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
	}

	if ((cr0 ^ old_cr0) & KVM_MMU_CR0_ROLE_BITS)
		kvm_mmu_reset_context(vcpu);

	if (((cr0 ^ old_cr0) & X86_CR0_CD) &&
	    kvm_arch_has_noncoherent_dma(vcpu->kvm) &&
	    !kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED))
		kvm_zap_gfn_range(vcpu->kvm, 0, ~0ULL);
}
EXPORT_SYMBOL_GPL(kvm_post_set_cr0);

int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
{
	unsigned long old_cr0 = kvm_read_cr0(vcpu);

	cr0 |= X86_CR0_ET;

#ifdef CONFIG_X86_64
	if (cr0 & 0xffffffff00000000UL)
		return 1;
#endif

	cr0 &= ~CR0_RESERVED_BITS;

	if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD))
		return 1;

	if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE))
		return 1;

#ifdef CONFIG_X86_64
	if ((vcpu->arch.efer & EFER_LME) && !is_paging(vcpu) &&
	    (cr0 & X86_CR0_PG)) {
		int cs_db, cs_l;

		if (!is_pae(vcpu))
			return 1;
		static_call(kvm_x86_get_cs_db_l_bits)(vcpu, &cs_db, &cs_l);
		if (cs_l)
			return 1;
	}
#endif
	if (!(vcpu->arch.efer & EFER_LME) && (cr0 & X86_CR0_PG) &&
	    is_pae(vcpu) && ((cr0 ^ old_cr0) & X86_CR0_PDPTR_BITS) &&
	    !load_pdptrs(vcpu, kvm_read_cr3(vcpu)))
		return 1;

	if (!(cr0 & X86_CR0_PG) &&
	    (is_64_bit_mode(vcpu) || kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE)))
		return 1;

	static_call(kvm_x86_set_cr0)(vcpu, cr0);

	kvm_post_set_cr0(vcpu, old_cr0, cr0);

	return 0;
}
EXPORT_SYMBOL_GPL(kvm_set_cr0);

void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
{
	(void)kvm_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~0x0eul) | (msw & 0x0f));
}
EXPORT_SYMBOL_GPL(kvm_lmsw);

void kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu)
{
	if (vcpu->arch.guest_state_protected)
		return;

	if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE)) {

		if (vcpu->arch.xcr0 != host_xcr0)
			xsetbv(XCR_XFEATURE_ENABLED_MASK, vcpu->arch.xcr0);

		if (vcpu->arch.xsaves_enabled &&
		    vcpu->arch.ia32_xss != host_xss)
			wrmsrl(MSR_IA32_XSS, vcpu->arch.ia32_xss);
	}

#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
	if (static_cpu_has(X86_FEATURE_PKU) &&
	    vcpu->arch.pkru != vcpu->arch.host_pkru &&
	    ((vcpu->arch.xcr0 & XFEATURE_MASK_PKRU) ||
	     kvm_read_cr4_bits(vcpu, X86_CR4_PKE)))
		write_pkru(vcpu->arch.pkru);
#endif /* CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS */
}
EXPORT_SYMBOL_GPL(kvm_load_guest_xsave_state);

void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu)
{
	if (vcpu->arch.guest_state_protected)
		return;

#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
	if (static_cpu_has(X86_FEATURE_PKU) &&
	    ((vcpu->arch.xcr0 & XFEATURE_MASK_PKRU) ||
	     kvm_read_cr4_bits(vcpu, X86_CR4_PKE))) {
		vcpu->arch.pkru = rdpkru();
		if (vcpu->arch.pkru != vcpu->arch.host_pkru)
			write_pkru(vcpu->arch.host_pkru);
	}
#endif /* CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS */

	if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE)) {

		if (vcpu->arch.xcr0 != host_xcr0)
			xsetbv(XCR_XFEATURE_ENABLED_MASK, host_xcr0);

		if (vcpu->arch.xsaves_enabled &&
		    vcpu->arch.ia32_xss != host_xss)
			wrmsrl(MSR_IA32_XSS, host_xss);
	}

}
EXPORT_SYMBOL_GPL(kvm_load_host_xsave_state);

#ifdef CONFIG_X86_64
static inline u64 kvm_guest_supported_xfd(struct kvm_vcpu *vcpu)
{
	return vcpu->arch.guest_supported_xcr0 & XFEATURE_MASK_USER_DYNAMIC;
}
#endif

static int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
{
	u64 xcr0 = xcr;
	u64 old_xcr0 = vcpu->arch.xcr0;
	u64 valid_bits;

	/* Only support XCR_XFEATURE_ENABLED_MASK(xcr0) now  */
	if (index != XCR_XFEATURE_ENABLED_MASK)
		return 1;
	if (!(xcr0 & XFEATURE_MASK_FP))
		return 1;
	if ((xcr0 & XFEATURE_MASK_YMM) && !(xcr0 & XFEATURE_MASK_SSE))
		return 1;

	/*
	 * Do not allow the guest to set bits that we do not support
	 * saving.  However, xcr0 bit 0 is always set, even if the
	 * emulated CPU does not support XSAVE (see kvm_vcpu_reset()).
	 */
	valid_bits = vcpu->arch.guest_supported_xcr0 | XFEATURE_MASK_FP;
	if (xcr0 & ~valid_bits)
		return 1;

	if ((!(xcr0 & XFEATURE_MASK_BNDREGS)) !=
	    (!(xcr0 & XFEATURE_MASK_BNDCSR)))
		return 1;

	if (xcr0 & XFEATURE_MASK_AVX512) {
		if (!(xcr0 & XFEATURE_MASK_YMM))
			return 1;
		if ((xcr0 & XFEATURE_MASK_AVX512) != XFEATURE_MASK_AVX512)
			return 1;
	}

	if ((xcr0 & XFEATURE_MASK_XTILE) &&
	    ((xcr0 & XFEATURE_MASK_XTILE) != XFEATURE_MASK_XTILE))
		return 1;

	vcpu->arch.xcr0 = xcr0;

	if ((xcr0 ^ old_xcr0) & XFEATURE_MASK_EXTEND)
		kvm_update_cpuid_runtime(vcpu);
	return 0;
}

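/*
 * Illustrative examples of the XCR0 checks above (values are XFEATURE masks):
 *
 *	FP | SSE | YMM				-> accepted (AVX state)
 *	FP | YMM (no SSE)			-> rejected, YMM requires SSE
 *	FP | SSE | BNDREGS (no BNDCSR)		-> rejected, MPX states go together
 *	FP | SSE | YMM | only part of AVX512	-> rejected, all-or-nothing
 *	anything with bit 0 (FP) clear		-> rejected unconditionally
 *
 * and any bit outside guest_supported_xcr0 | FP is rejected as unsupported.
 */
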
int kvm_emulate_xsetbv(struct kvm_vcpu *vcpu)
{
	/* Note, #UD due to CR4.OSXSAVE=0 has priority over the intercept. */
	if (static_call(kvm_x86_get_cpl)(vcpu) != 0 ||
	    __kvm_set_xcr(vcpu, kvm_rcx_read(vcpu), kvm_read_edx_eax(vcpu))) {
		kvm_inject_gp(vcpu, 0);
		return 1;
	}

	return kvm_skip_emulated_instruction(vcpu);
}
EXPORT_SYMBOL_GPL(kvm_emulate_xsetbv);

bool __kvm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
{
	if (cr4 & cr4_reserved_bits)
		return false;

	if (cr4 & vcpu->arch.cr4_guest_rsvd_bits)
		return false;

	return true;
}
EXPORT_SYMBOL_GPL(__kvm_is_valid_cr4);

static bool kvm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
{
	return __kvm_is_valid_cr4(vcpu, cr4) &&
	       static_call(kvm_x86_is_valid_cr4)(vcpu, cr4);
}

void kvm_post_set_cr4(struct kvm_vcpu *vcpu, unsigned long old_cr4, unsigned long cr4)
{
	if ((cr4 ^ old_cr4) & KVM_MMU_CR4_ROLE_BITS)
		kvm_mmu_reset_context(vcpu);

	/*
	 * If CR4.PCIDE is changed 0 -> 1, there is no need to flush the TLB
	 * according to the SDM; however, stale prev_roots could be reused
	 * incorrectly in the future after a MOV to CR3 with NOFLUSH=1, so we
	 * free them all.  This is *not* a superset of KVM_REQ_TLB_FLUSH_GUEST
	 * or KVM_REQ_TLB_FLUSH_CURRENT, because the hardware TLB is not flushed,
	 * so fall through.
	 */
	if (!tdp_enabled &&
	    (cr4 & X86_CR4_PCIDE) && !(old_cr4 & X86_CR4_PCIDE))
		kvm_mmu_unload(vcpu);

	/*
	 * The TLB has to be flushed for all PCIDs if any of the following
	 * (architecturally required) changes happen:
	 * - CR4.PCIDE is changed from 1 to 0
	 * - CR4.PGE is toggled
	 *
	 * This is a superset of KVM_REQ_TLB_FLUSH_CURRENT.
	 */
	if (((cr4 ^ old_cr4) & X86_CR4_PGE) ||
	    (!(cr4 & X86_CR4_PCIDE) && (old_cr4 & X86_CR4_PCIDE)))
		kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);

	/*
	 * The TLB has to be flushed for the current PCID if any of the
	 * following (architecturally required) changes happen:
	 * - CR4.SMEP is changed from 0 to 1
	 * - CR4.PAE is toggled
	 */
	else if (((cr4 ^ old_cr4) & X86_CR4_PAE) ||
		 ((cr4 & X86_CR4_SMEP) && !(old_cr4 & X86_CR4_SMEP)))
		kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);

}
EXPORT_SYMBOL_GPL(kvm_post_set_cr4);

int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
{
	unsigned long old_cr4 = kvm_read_cr4(vcpu);

	if (!kvm_is_valid_cr4(vcpu, cr4))
		return 1;

	if (is_long_mode(vcpu)) {
		if (!(cr4 & X86_CR4_PAE))
			return 1;
		if ((cr4 ^ old_cr4) & X86_CR4_LA57)
			return 1;
	} else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE)
		   && ((cr4 ^ old_cr4) & X86_CR4_PDPTR_BITS)
		   && !load_pdptrs(vcpu, kvm_read_cr3(vcpu)))
		return 1;

	if ((cr4 & X86_CR4_PCIDE) && !(old_cr4 & X86_CR4_PCIDE)) {
		if (!guest_cpuid_has(vcpu, X86_FEATURE_PCID))
			return 1;

		/* PCID can not be enabled when cr3[11:0]!=000H or EFER.LMA=0 */
		if ((kvm_read_cr3(vcpu) & X86_CR3_PCID_MASK) || !is_long_mode(vcpu))
			return 1;
	}

	static_call(kvm_x86_set_cr4)(vcpu, cr4);

	kvm_post_set_cr4(vcpu, old_cr4, cr4);

	return 0;
}
EXPORT_SYMBOL_GPL(kvm_set_cr4);

static void kvm_invalidate_pcid(struct kvm_vcpu *vcpu, unsigned long pcid)
{
	struct kvm_mmu *mmu = vcpu->arch.mmu;
	unsigned long roots_to_free = 0;
	int i;

	/*
	 * MOV CR3 and INVPCID are usually not intercepted when using TDP, but
	 * this is reachable when running EPT=1 and unrestricted_guest=0, and
	 * also via the emulator.  KVM's TDP page tables are not in the scope of
	 * the invalidation, but the guest's TLB entries need to be flushed as
	 * the CPU may have cached entries in its TLB for the target PCID.
	 */
	if (unlikely(tdp_enabled)) {
		kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
		return;
	}

	/*
	 * If neither the current CR3 nor any of the prev_roots use the given
	 * PCID, then nothing needs to be done here because a resync will
	 * happen anyway before switching to any other CR3.
	 */
	if (kvm_get_active_pcid(vcpu) == pcid) {
		kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
		kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
	}

	/*
	 * If PCID is disabled, there is no need to free prev_roots even if the
	 * PCIDs for them are also 0, because MOV to CR3 always flushes the TLB
	 * with PCIDE=0.
	 */
	if (!kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE))
		return;

	for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
		if (kvm_get_pcid(vcpu, mmu->prev_roots[i].pgd) == pcid)
			roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i);

	kvm_mmu_free_roots(vcpu->kvm, mmu, roots_to_free);
}

int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
{
	bool skip_tlb_flush = false;
	unsigned long pcid = 0;
#ifdef CONFIG_X86_64
	bool pcid_enabled = kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE);

	if (pcid_enabled) {
		skip_tlb_flush = cr3 & X86_CR3_PCID_NOFLUSH;
		cr3 &= ~X86_CR3_PCID_NOFLUSH;
		pcid = cr3 & X86_CR3_PCID_MASK;
	}
#endif

	/* PDPTRs are always reloaded for PAE paging. */
	if (cr3 == kvm_read_cr3(vcpu) && !is_pae_paging(vcpu))
		goto handle_tlb_flush;

	/*
	 * Do not condition the GPA check on long mode, this helper is used to
	 * stuff CR3, e.g. for RSM emulation, and there is no guarantee that
	 * the current vCPU mode is accurate.
	 */
	if (kvm_vcpu_is_illegal_gpa(vcpu, cr3))
		return 1;

	if (is_pae_paging(vcpu) && !load_pdptrs(vcpu, cr3))
		return 1;

	if (cr3 != kvm_read_cr3(vcpu))
		kvm_mmu_new_pgd(vcpu, cr3);

	vcpu->arch.cr3 = cr3;
	kvm_register_mark_dirty(vcpu, VCPU_EXREG_CR3);
	/* Do not call post_set_cr3, we do not get here for confidential guests.  */

handle_tlb_flush:
	/*
	 * A load of CR3 that flushes the TLB flushes only the current PCID,
	 * even if PCID is disabled, in which case PCID=0 is flushed.  It's a
	 * moot point in the end because _disabling_ PCID will flush all PCIDs,
	 * and it's impossible to use a non-zero PCID when PCID is disabled,
	 * i.e. only PCID=0 can be relevant.
	 */
	if (!skip_tlb_flush)
		kvm_invalidate_pcid(vcpu, pcid);

	return 0;
}
EXPORT_SYMBOL_GPL(kvm_set_cr3);

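/*
 * Illustrative example of the CR3 decoding above (hypothetical value), for a
 * 64-bit guest with CR4.PCIDE=1: a MOV to CR3 of 0x8000000012345001 has bit 63
 * (X86_CR3_PCID_NOFLUSH) set, so skip_tlb_flush is true, pcid = 0x001, and
 * vcpu->arch.cr3 is stored as 0x12345001 with the NOFLUSH bit stripped.  With
 * CR4.PCIDE=0, pcid stays 0 and skip_tlb_flush stays false, so the
 * handle_tlb_flush path always ends up invalidating PCID 0.
 */
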
int kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
{
	if (cr8 & CR8_RESERVED_BITS)
		return 1;
	if (lapic_in_kernel(vcpu))
		kvm_lapic_set_tpr(vcpu, cr8);
	else
		vcpu->arch.cr8 = cr8;
	return 0;
}
EXPORT_SYMBOL_GPL(kvm_set_cr8);

unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu)
{
	if (lapic_in_kernel(vcpu))
		return kvm_lapic_get_cr8(vcpu);
	else
		return vcpu->arch.cr8;
}
EXPORT_SYMBOL_GPL(kvm_get_cr8);

static void kvm_update_dr0123(struct kvm_vcpu *vcpu)
{
	int i;

	if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) {
		for (i = 0; i < KVM_NR_DB_REGS; i++)
			vcpu->arch.eff_db[i] = vcpu->arch.db[i];
	}
}

void kvm_update_dr7(struct kvm_vcpu *vcpu)
{
	unsigned long dr7;

	if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
		dr7 = vcpu->arch.guest_debug_dr7;
	else
		dr7 = vcpu->arch.dr7;
	static_call(kvm_x86_set_dr7)(vcpu, dr7);
	vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_BP_ENABLED;
	if (dr7 & DR7_BP_EN_MASK)
		vcpu->arch.switch_db_regs |= KVM_DEBUGREG_BP_ENABLED;
}
EXPORT_SYMBOL_GPL(kvm_update_dr7);

static u64 kvm_dr6_fixed(struct kvm_vcpu *vcpu)
{
	u64 fixed = DR6_FIXED_1;

	if (!guest_cpuid_has(vcpu, X86_FEATURE_RTM))
		fixed |= DR6_RTM;

	if (!guest_cpuid_has(vcpu, X86_FEATURE_BUS_LOCK_DETECT))
		fixed |= DR6_BUS_LOCK;
	return fixed;
}

int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
{
	size_t size = ARRAY_SIZE(vcpu->arch.db);

	switch (dr) {
	case 0 ... 3:
		vcpu->arch.db[array_index_nospec(dr, size)] = val;
		if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP))
			vcpu->arch.eff_db[dr] = val;
		break;
	case 4:
	case 6:
		if (!kvm_dr6_valid(val))
			return 1; /* #GP */
		vcpu->arch.dr6 = (val & DR6_VOLATILE) | kvm_dr6_fixed(vcpu);
		break;
	case 5:
	default: /* 7 */
		if (!kvm_dr7_valid(val))
			return 1; /* #GP */
		vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1;
		kvm_update_dr7(vcpu);
		break;
	}

	return 0;
}
EXPORT_SYMBOL_GPL(kvm_set_dr);

void kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val)
{
	size_t size = ARRAY_SIZE(vcpu->arch.db);

	switch (dr) {
	case 0 ... 3:
		*val = vcpu->arch.db[array_index_nospec(dr, size)];
		break;
	case 4:
	case 6:
		*val = vcpu->arch.dr6;
		break;
	case 5:
	default: /* 7 */
		*val = vcpu->arch.dr7;
		break;
	}
}
EXPORT_SYMBOL_GPL(kvm_get_dr);

int kvm_emulate_rdpmc(struct kvm_vcpu *vcpu)
{
	u32 ecx = kvm_rcx_read(vcpu);
	u64 data;

	if (kvm_pmu_rdpmc(vcpu, ecx, &data)) {
		kvm_inject_gp(vcpu, 0);
		return 1;
	}

	kvm_rax_write(vcpu, (u32)data);
	kvm_rdx_write(vcpu, data >> 32);
	return kvm_skip_emulated_instruction(vcpu);
}
EXPORT_SYMBOL_GPL(kvm_emulate_rdpmc);

/*
 * List of MSR numbers which we expose to userspace through KVM_GET_MSRS
 * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST.
 *
 * The three MSR lists (msrs_to_save, emulated_msrs, msr_based_features)
 * extract the supported MSRs from the related const lists.
 * msrs_to_save is selected from msrs_to_save_base and msrs_to_save_pmu to
 * reflect the capabilities of the host cpu.  This capabilities test skips
 * MSRs that are kvm-specific.  Those are put in emulated_msrs_all; filtering
 * of emulated_msrs may depend on host virtualization features rather than
 * host cpu features.
 */

static const u32 msrs_to_save_base[] = {
	MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
	MSR_STAR,
#ifdef CONFIG_X86_64
	MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
#endif
	MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA,
	MSR_IA32_FEAT_CTL, MSR_IA32_BNDCFGS, MSR_TSC_AUX,
	MSR_IA32_SPEC_CTRL,
	MSR_IA32_RTIT_CTL, MSR_IA32_RTIT_STATUS, MSR_IA32_RTIT_CR3_MATCH,
	MSR_IA32_RTIT_OUTPUT_BASE, MSR_IA32_RTIT_OUTPUT_MASK,
	MSR_IA32_RTIT_ADDR0_A, MSR_IA32_RTIT_ADDR0_B,
	MSR_IA32_RTIT_ADDR1_A, MSR_IA32_RTIT_ADDR1_B,
	MSR_IA32_RTIT_ADDR2_A, MSR_IA32_RTIT_ADDR2_B,
	MSR_IA32_RTIT_ADDR3_A, MSR_IA32_RTIT_ADDR3_B,
	MSR_IA32_UMWAIT_CONTROL,

	MSR_IA32_XFD, MSR_IA32_XFD_ERR,
};

static const u32 msrs_to_save_pmu[] = {
	MSR_ARCH_PERFMON_FIXED_CTR0, MSR_ARCH_PERFMON_FIXED_CTR1,
	MSR_ARCH_PERFMON_FIXED_CTR0 + 2,
	MSR_CORE_PERF_FIXED_CTR_CTRL, MSR_CORE_PERF_GLOBAL_STATUS,
	MSR_CORE_PERF_GLOBAL_CTRL, MSR_CORE_PERF_GLOBAL_OVF_CTRL,
	MSR_IA32_PEBS_ENABLE, MSR_IA32_DS_AREA, MSR_PEBS_DATA_CFG,

	/* This part of MSRs should match KVM_INTEL_PMC_MAX_GENERIC. */
	MSR_ARCH_PERFMON_PERFCTR0, MSR_ARCH_PERFMON_PERFCTR1,
	MSR_ARCH_PERFMON_PERFCTR0 + 2, MSR_ARCH_PERFMON_PERFCTR0 + 3,
	MSR_ARCH_PERFMON_PERFCTR0 + 4, MSR_ARCH_PERFMON_PERFCTR0 + 5,
	MSR_ARCH_PERFMON_PERFCTR0 + 6, MSR_ARCH_PERFMON_PERFCTR0 + 7,
	MSR_ARCH_PERFMON_EVENTSEL0, MSR_ARCH_PERFMON_EVENTSEL1,
	MSR_ARCH_PERFMON_EVENTSEL0 + 2, MSR_ARCH_PERFMON_EVENTSEL0 + 3,
	MSR_ARCH_PERFMON_EVENTSEL0 + 4, MSR_ARCH_PERFMON_EVENTSEL0 + 5,
	MSR_ARCH_PERFMON_EVENTSEL0 + 6, MSR_ARCH_PERFMON_EVENTSEL0 + 7,

	MSR_K7_EVNTSEL0, MSR_K7_EVNTSEL1, MSR_K7_EVNTSEL2, MSR_K7_EVNTSEL3,
	MSR_K7_PERFCTR0, MSR_K7_PERFCTR1, MSR_K7_PERFCTR2, MSR_K7_PERFCTR3,

	/* This part of MSRs should match KVM_AMD_PMC_MAX_GENERIC. */
	MSR_F15H_PERF_CTL0, MSR_F15H_PERF_CTL1, MSR_F15H_PERF_CTL2,
	MSR_F15H_PERF_CTL3, MSR_F15H_PERF_CTL4, MSR_F15H_PERF_CTL5,
	MSR_F15H_PERF_CTR0, MSR_F15H_PERF_CTR1, MSR_F15H_PERF_CTR2,
	MSR_F15H_PERF_CTR3, MSR_F15H_PERF_CTR4, MSR_F15H_PERF_CTR5,
};

static u32 msrs_to_save[ARRAY_SIZE(msrs_to_save_base) +
			ARRAY_SIZE(msrs_to_save_pmu)];
static unsigned num_msrs_to_save;

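/*
 * Minimal sketch (not the verbatim implementation, which lives later in this
 * file) of how the dynamic list is populated from the two const arrays above:
 * each candidate index is probed against host/KVM support and only then
 * copied, so KVM_GET_MSR_INDEX_LIST never reports an MSR that KVM_GET_MSRS
 * cannot actually service:
 *
 *	for each msr in msrs_to_save_base, msrs_to_save_pmu:
 *		if (msr is supported on this host)
 *			msrs_to_save[num_msrs_to_save++] = msr;
 */
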
static const u32 emulated_msrs_all[] = {
	MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
	MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW,
	HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL,
	HV_X64_MSR_TIME_REF_COUNT, HV_X64_MSR_REFERENCE_TSC,
	HV_X64_MSR_TSC_FREQUENCY, HV_X64_MSR_APIC_FREQUENCY,
	HV_X64_MSR_CRASH_P0, HV_X64_MSR_CRASH_P1, HV_X64_MSR_CRASH_P2,
	HV_X64_MSR_CRASH_P3, HV_X64_MSR_CRASH_P4, HV_X64_MSR_CRASH_CTL,
	HV_X64_MSR_RESET,
	HV_X64_MSR_VP_INDEX,
	HV_X64_MSR_VP_RUNTIME,
	HV_X64_MSR_SCONTROL,
	HV_X64_MSR_STIMER0_CONFIG,
	HV_X64_MSR_VP_ASSIST_PAGE,
	HV_X64_MSR_REENLIGHTENMENT_CONTROL, HV_X64_MSR_TSC_EMULATION_CONTROL,
	HV_X64_MSR_TSC_EMULATION_STATUS, HV_X64_MSR_TSC_INVARIANT_CONTROL,
	HV_X64_MSR_SYNDBG_OPTIONS,
	HV_X64_MSR_SYNDBG_CONTROL, HV_X64_MSR_SYNDBG_STATUS,
	HV_X64_MSR_SYNDBG_SEND_BUFFER, HV_X64_MSR_SYNDBG_RECV_BUFFER,
	HV_X64_MSR_SYNDBG_PENDING_BUFFER,

	MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME,
	MSR_KVM_PV_EOI_EN, MSR_KVM_ASYNC_PF_INT, MSR_KVM_ASYNC_PF_ACK,

	MSR_IA32_TSC_ADJUST,
	MSR_IA32_TSC_DEADLINE,
	MSR_IA32_ARCH_CAPABILITIES,
	MSR_IA32_PERF_CAPABILITIES,
	MSR_IA32_MISC_ENABLE,
	MSR_IA32_MCG_STATUS,
	MSR_IA32_MCG_CTL,
	MSR_IA32_MCG_EXT_CTL,
	MSR_IA32_SMBASE,
	MSR_SMI_COUNT,
	MSR_PLATFORM_INFO,
	MSR_MISC_FEATURES_ENABLES,
	MSR_AMD64_VIRT_SPEC_CTRL,
	MSR_AMD64_TSC_RATIO,
	MSR_IA32_POWER_CTL,
	MSR_IA32_UCODE_REV,

	/*
	 * The following list leaves out MSRs whose values are determined
	 * by arch/x86/kvm/vmx/nested.c based on CPUID or other MSRs.
	 * We always support the "true" VMX control MSRs, even if the host
	 * processor does not, so I am putting these registers here rather
	 * than in msrs_to_save_all.
	 */
	MSR_IA32_VMX_BASIC,
	MSR_IA32_VMX_TRUE_PINBASED_CTLS,
	MSR_IA32_VMX_TRUE_PROCBASED_CTLS,
	MSR_IA32_VMX_TRUE_EXIT_CTLS,
	MSR_IA32_VMX_TRUE_ENTRY_CTLS,
	MSR_IA32_VMX_MISC,
	MSR_IA32_VMX_CR0_FIXED0,
	MSR_IA32_VMX_CR4_FIXED0,
	MSR_IA32_VMX_VMCS_ENUM,
	MSR_IA32_VMX_PROCBASED_CTLS2,
	MSR_IA32_VMX_EPT_VPID_CAP,
	MSR_IA32_VMX_VMFUNC,

	MSR_K7_HWCR,
	MSR_KVM_POLL_CONTROL,
};

static u32 emulated_msrs[ARRAY_SIZE(emulated_msrs_all)];
static unsigned num_emulated_msrs;

/*
 * List of msr numbers which are used to expose MSR-based features that
 * can be used by a hypervisor to validate requested CPU features.
 */
static const u32 msr_based_features_all[] = {
	MSR_IA32_VMX_BASIC,
	MSR_IA32_VMX_TRUE_PINBASED_CTLS,
	MSR_IA32_VMX_PINBASED_CTLS,
	MSR_IA32_VMX_TRUE_PROCBASED_CTLS,
	MSR_IA32_VMX_PROCBASED_CTLS,
	MSR_IA32_VMX_TRUE_EXIT_CTLS,
	MSR_IA32_VMX_EXIT_CTLS,
	MSR_IA32_VMX_TRUE_ENTRY_CTLS,
	MSR_IA32_VMX_ENTRY_CTLS,
	MSR_IA32_VMX_MISC,
	MSR_IA32_VMX_CR0_FIXED0,
	MSR_IA32_VMX_CR0_FIXED1,
	MSR_IA32_VMX_CR4_FIXED0,
	MSR_IA32_VMX_CR4_FIXED1,
	MSR_IA32_VMX_VMCS_ENUM,
	MSR_IA32_VMX_PROCBASED_CTLS2,
	MSR_IA32_VMX_EPT_VPID_CAP,
	MSR_IA32_VMX_VMFUNC,

	MSR_AMD64_DE_CFG,
	MSR_IA32_UCODE_REV,
	MSR_IA32_ARCH_CAPABILITIES,
	MSR_IA32_PERF_CAPABILITIES,
};

static u32 msr_based_features[ARRAY_SIZE(msr_based_features_all)];
static unsigned int num_msr_based_features;

/*
 * Some IA32_ARCH_CAPABILITIES bits have dependencies on MSRs that KVM
 * does not yet virtualize. These include:
 *   10 - MISC_PACKAGE_CTRLS
 *   11 - ENERGY_FILTERING_CTL
 *   12 - DOITM
 *   18 - FB_CLEAR_CTRL
 *   21 - XAPIC_DISABLE_STATUS
 *   23 - OVERCLOCKING_STATUS
 */

#define KVM_SUPPORTED_ARCH_CAP \
	(ARCH_CAP_RDCL_NO | ARCH_CAP_IBRS_ALL | ARCH_CAP_RSBA | \
	 ARCH_CAP_SKIP_VMENTRY_L1DFLUSH | ARCH_CAP_SSB_NO | ARCH_CAP_MDS_NO | \
	 ARCH_CAP_PSCHANGE_MC_NO | ARCH_CAP_TSX_CTRL_MSR | ARCH_CAP_TAA_NO | \
	 ARCH_CAP_SBDR_SSDP_NO | ARCH_CAP_FBSDP_NO | ARCH_CAP_PSDP_NO | \
	 ARCH_CAP_FB_CLEAR | ARCH_CAP_RRSBA | ARCH_CAP_PBRSB_NO)

static u64 kvm_get_arch_capabilities(void)
{
	u64 data = 0;

	if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES)) {
		rdmsrl(MSR_IA32_ARCH_CAPABILITIES, data);
		data &= KVM_SUPPORTED_ARCH_CAP;
	}

	/*
	 * If nx_huge_pages is enabled, KVM's shadow paging will ensure that
	 * the nested hypervisor runs with NX huge pages.  If it is not,
	 * L1 is anyway vulnerable to ITLB_MULTIHIT exploits from other
	 * L1 guests, so it need not worry about its own (L2) guests.
	 */
	data |= ARCH_CAP_PSCHANGE_MC_NO;

	/*
	 * If we're doing cache flushes (either "always" or "cond")
	 * we will do one whenever the guest does a vmlaunch/vmresume.
	 * If an outer hypervisor is doing the cache flush for us
	 * (VMENTER_L1D_FLUSH_NESTED_VM), we can safely pass that
	 * capability to the guest too, and if EPT is disabled we're not
	 * vulnerable.  Overall, only VMENTER_L1D_FLUSH_NEVER will
	 * require a nested hypervisor to do a flush of its own.
	 */
	if (l1tf_vmx_mitigation != VMENTER_L1D_FLUSH_NEVER)
		data |= ARCH_CAP_SKIP_VMENTRY_L1DFLUSH;

	if (!boot_cpu_has_bug(X86_BUG_CPU_MELTDOWN))
		data |= ARCH_CAP_RDCL_NO;
	if (!boot_cpu_has_bug(X86_BUG_SPEC_STORE_BYPASS))
		data |= ARCH_CAP_SSB_NO;
	if (!boot_cpu_has_bug(X86_BUG_MDS))
		data |= ARCH_CAP_MDS_NO;

	if (!boot_cpu_has(X86_FEATURE_RTM)) {
		/*
		 * If RTM=0 because the kernel has disabled TSX, the host might
		 * have TAA_NO or TSX_CTRL.  Clear TAA_NO (the guest sees RTM=0
		 * and therefore knows that there cannot be TAA) but keep
		 * TSX_CTRL: some buggy userspaces leave it set on tsx=on hosts,
		 * and we want to allow migrating those guests to tsx=off hosts.
		 */
		data &= ~ARCH_CAP_TAA_NO;
	} else if (!boot_cpu_has_bug(X86_BUG_TAA)) {
		data |= ARCH_CAP_TAA_NO;
	} else {
		/*
		 * Nothing to do here; we emulate TSX_CTRL if present on the
		 * host so the guest can choose between disabling TSX or
		 * using VERW to clear CPU buffers.
		 */
	}

5b76a3cf
PB
1651 return data;
1652}
5b76a3cf 1653
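The pattern above (mask the host-reported value down to the bits KVM knows how to virtualize, then OR in bits KVM can guarantee on its own) can be exercised outside the kernel. A minimal user-space sketch follows; the bit positions and helper name are illustrative stand-ins, not the architectural encodings from asm/msr-index.h.

#include <stdint.h>
#include <stdio.h>

/* Illustrative stand-ins; the real bit positions live in asm/msr-index.h. */
#define CAP_RDCL_NO			(1ULL << 0)
#define CAP_SKIP_VMENTRY_L1DFLUSH	(1ULL << 3)
#define CAP_PSCHANGE_MC_NO		(1ULL << 6)
#define SUPPORTED_CAPS \
	(CAP_RDCL_NO | CAP_SKIP_VMENTRY_L1DFLUSH | CAP_PSCHANGE_MC_NO)

/* Keep only supported host bits, then add bits the hypervisor can
 * guarantee on its own regardless of what the host reports. */
static uint64_t advertise_arch_caps(uint64_t host_caps, int host_does_l1d_flush)
{
	uint64_t data = host_caps & SUPPORTED_CAPS;

	data |= CAP_PSCHANGE_MC_NO;		/* e.g. shadow paging makes this safe */
	if (host_does_l1d_flush)
		data |= CAP_SKIP_VMENTRY_L1DFLUSH;
	return data;
}

int main(void)
{
	/* The unknown bit 62 is dropped; the synthesized bits are added. */
	printf("0x%llx\n", (unsigned long long)
	       advertise_arch_caps(CAP_RDCL_NO | (1ULL << 62), 1));
	return 0;
}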
66421c1e
WL
1654static int kvm_get_msr_feature(struct kvm_msr_entry *msr)
1655{
1656 switch (msr->index) {
cd283252 1657 case MSR_IA32_ARCH_CAPABILITIES:
5b76a3cf
PB
1658 msr->data = kvm_get_arch_capabilities();
1659 break;
5fe9805d
SC
1660 case MSR_IA32_PERF_CAPABILITIES:
1661 msr->data = kvm_caps.supported_perf_cap;
1662 break;
5b76a3cf 1663 case MSR_IA32_UCODE_REV:
cd283252 1664 rdmsrl_safe(msr->index, &msr->data);
518e7b94 1665 break;
66421c1e 1666 default:
b3646477 1667 return static_call(kvm_x86_get_msr_feature)(msr);
66421c1e
WL
1668 }
1669 return 0;
1670}
1671
801e459a
TL
1672static int do_get_msr_feature(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
1673{
1674 struct kvm_msr_entry msr;
66421c1e 1675 int r;
801e459a
TL
1676
1677 msr.index = index;
66421c1e 1678 r = kvm_get_msr_feature(&msr);
12bc2132
PX
1679
1680 if (r == KVM_MSR_RET_INVALID) {
1681 /* Unconditionally clear the output for simplicity */
1682 *data = 0;
d632826f 1683 if (kvm_msr_ignored_check(index, 0, false))
cc4cb017 1684 r = 0;
12bc2132
PX
1685 }
1686
66421c1e
WL
1687 if (r)
1688 return r;
801e459a
TL
1689
1690 *data = msr.data;
1691
1692 return 0;
1693}
1694
11988499 1695static bool __kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer)
15c4a640 1696{
8c19b6f2
KP
1697 if (efer & EFER_AUTOIBRS && !guest_cpuid_has(vcpu, X86_FEATURE_AUTOIBRS))
1698 return false;
1699
1b4d56b8 1700 if (efer & EFER_FFXSR && !guest_cpuid_has(vcpu, X86_FEATURE_FXSR_OPT))
11988499 1701 return false;
1b2fd70c 1702
1b4d56b8 1703 if (efer & EFER_SVME && !guest_cpuid_has(vcpu, X86_FEATURE_SVM))
11988499 1704 return false;
d8017474 1705
0a629563
SC
1706 if (efer & (EFER_LME | EFER_LMA) &&
1707 !guest_cpuid_has(vcpu, X86_FEATURE_LM))
1708 return false;
1709
1710 if (efer & EFER_NX && !guest_cpuid_has(vcpu, X86_FEATURE_NX))
1711 return false;
d8017474 1712
384bb783 1713 return true;
11988499
SC
1714
1715}
1716bool kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer)
1717{
1718 if (efer & efer_reserved_bits)
1719 return false;
1720
1721 return __kvm_valid_efer(vcpu, efer);
384bb783
JK
1722}
1723EXPORT_SYMBOL_GPL(kvm_valid_efer);
1724
11988499 1725static int set_efer(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
384bb783
JK
1726{
1727 u64 old_efer = vcpu->arch.efer;
11988499 1728 u64 efer = msr_info->data;
72f211ec 1729 int r;
384bb783 1730
11988499 1731 if (efer & efer_reserved_bits)
66f61c92 1732 return 1;
384bb783 1733
11988499
SC
1734 if (!msr_info->host_initiated) {
1735 if (!__kvm_valid_efer(vcpu, efer))
1736 return 1;
1737
1738 if (is_paging(vcpu) &&
1739 (vcpu->arch.efer & EFER_LME) != (efer & EFER_LME))
1740 return 1;
1741 }
384bb783 1742
15c4a640 1743 efer &= ~EFER_LMA;
f6801dff 1744 efer |= vcpu->arch.efer & EFER_LMA;
15c4a640 1745
b3646477 1746 r = static_call(kvm_x86_set_efer)(vcpu, efer);
72f211ec
ML
1747 if (r) {
1748 WARN_ON(r > 0);
1749 return r;
1750 }
a3d204e2 1751
d6174299 1752 if ((efer ^ old_efer) & KVM_MMU_EFER_ROLE_BITS)
aad82703
SY
1753 kvm_mmu_reset_context(vcpu);
1754
b69e8cae 1755 return 0;
15c4a640
CO
1756}
1757
f2b4b7dd
JR
1758void kvm_enable_efer_bits(u64 mask)
1759{
1760 efer_reserved_bits &= ~mask;
1761}
1762EXPORT_SYMBOL_GPL(kvm_enable_efer_bits);
1763
51de8151
AG
1764bool kvm_msr_allowed(struct kvm_vcpu *vcpu, u32 index, u32 type)
1765{
b318e8de
SC
1766 struct kvm_x86_msr_filter *msr_filter;
1767 struct msr_bitmap_range *ranges;
1a155254 1768 struct kvm *kvm = vcpu->kvm;
b318e8de 1769 bool allowed;
1a155254 1770 int idx;
b318e8de 1771 u32 i;
1a155254 1772
b318e8de
SC
1773 /* x2APIC MSRs do not support filtering. */
1774 if (index >= 0x800 && index <= 0x8ff)
1a155254
AG
1775 return true;
1776
1a155254
AG
1777 idx = srcu_read_lock(&kvm->srcu);
1778
b318e8de
SC
1779 msr_filter = srcu_dereference(kvm->arch.msr_filter, &kvm->srcu);
1780 if (!msr_filter) {
1781 allowed = true;
1782 goto out;
1783 }
1784
1785 allowed = msr_filter->default_allow;
1786 ranges = msr_filter->ranges;
1787
1788 for (i = 0; i < msr_filter->count; i++) {
1a155254
AG
1789 u32 start = ranges[i].base;
1790 u32 end = start + ranges[i].nmsrs;
1791 u32 flags = ranges[i].flags;
1792 unsigned long *bitmap = ranges[i].bitmap;
1793
1794 if ((index >= start) && (index < end) && (flags & type)) {
b318e8de 1795 allowed = !!test_bit(index - start, bitmap);
1a155254
AG
1796 break;
1797 }
1798 }
1799
b318e8de 1800out:
1a155254
AG
1801 srcu_read_unlock(&kvm->srcu, idx);
1802
b318e8de 1803 return allowed;
51de8151
AG
1804}
1805EXPORT_SYMBOL_GPL(kvm_msr_allowed);
1806
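The allow/deny decision above is a linear scan over filter ranges, each carrying a per-MSR bitmap, falling back to the filter's default verdict when nothing matches. A stand-alone sketch of the same lookup, using simplified, hypothetical types rather than the KVM structures:

#include <stdbool.h>
#include <stdint.h>

struct msr_range {
	uint32_t base;			/* first MSR index covered */
	uint32_t nmsrs;			/* number of MSRs in the range */
	uint32_t flags;			/* READ and/or WRITE permission bits */
	const unsigned long *bitmap;	/* one bit per MSR: 1 = allowed */
};

static bool test_bit_simple(unsigned int nr, const unsigned long *addr)
{
	return (addr[nr / (8 * sizeof(long))] >> (nr % (8 * sizeof(long)))) & 1;
}

/* Return the default verdict unless a matching range overrides it. */
bool msr_allowed(uint32_t index, uint32_t type, bool default_allow,
		 const struct msr_range *ranges, unsigned int count)
{
	for (unsigned int i = 0; i < count; i++) {
		uint32_t start = ranges[i].base;
		uint32_t end = start + ranges[i].nmsrs;

		if (index >= start && index < end && (ranges[i].flags & type))
			return test_bit_simple(index - start, ranges[i].bitmap);
	}
	return default_allow;
}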
15c4a640 1807/*
f20935d8
SC
1808 * Write @data into the MSR specified by @index. Selected MSR-specific fault
1809 * checks are bypassed if @host_initiated is %true.
15c4a640
CO
1810 * Returns 0 on success, non-0 otherwise.
1811 * Assumes vcpu_load() was already called.
1812 */
f20935d8
SC
1813static int __kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data,
1814 bool host_initiated)
15c4a640 1815{
f20935d8
SC
1816 struct msr_data msr;
1817
1818 switch (index) {
854e8bb1
NA
1819 case MSR_FS_BASE:
1820 case MSR_GS_BASE:
1821 case MSR_KERNEL_GS_BASE:
1822 case MSR_CSTAR:
1823 case MSR_LSTAR:
f20935d8 1824 if (is_noncanonical_address(data, vcpu))
854e8bb1
NA
1825 return 1;
1826 break;
1827 case MSR_IA32_SYSENTER_EIP:
1828 case MSR_IA32_SYSENTER_ESP:
1829 /*
1830 * IA32_SYSENTER_ESP and IA32_SYSENTER_EIP cause #GP if
1831 * non-canonical address is written on Intel but not on
1832 * AMD (which ignores the top 32-bits, because it does
1833 * not implement 64-bit SYSENTER).
1834 *
1835 * 64-bit code should hence be able to write a non-canonical
1836 * value on AMD. Making the address canonical ensures that
1837 * vmentry does not fail on Intel after writing a non-canonical
1838 * value, and that something deterministic happens if the guest
1839 * invokes 64-bit SYSENTER.
1840 */
1fb85d06 1841 data = __canonical_address(data, vcpu_virt_addr_bits(vcpu));
61a05d44
SC
1842 break;
1843 case MSR_TSC_AUX:
1844 if (!kvm_is_supported_user_return_msr(MSR_TSC_AUX))
1845 return 1;
1846
1847 if (!host_initiated &&
1848 !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP) &&
1849 !guest_cpuid_has(vcpu, X86_FEATURE_RDPID))
1850 return 1;
1851
1852 /*
1853 * Per Intel's SDM, bits 63:32 are reserved, but AMD's APM has
1854 * incomplete and conflicting architectural behavior. Current
1855 * AMD CPUs completely ignore bits 63:32, i.e. they aren't
1856 * reserved and always read as zeros. Enforce Intel's reserved
1857 * bits check if and only if the guest CPU is Intel, and clear
1858 * the bits in all other cases. This ensures cross-vendor
1859 * migration will provide consistent behavior for the guest.
1860 */
1861 if (guest_cpuid_is_intel(vcpu) && (data >> 32) != 0)
1862 return 1;
1863
1864 data = (u32)data;
1865 break;
854e8bb1 1866 }
f20935d8
SC
1867
1868 msr.data = data;
1869 msr.index = index;
1870 msr.host_initiated = host_initiated;
1871
b3646477 1872 return static_call(kvm_x86_set_msr)(vcpu, &msr);
15c4a640
CO
1873}
1874
6abe9c13
PX
1875static int kvm_set_msr_ignored_check(struct kvm_vcpu *vcpu,
1876 u32 index, u64 data, bool host_initiated)
1877{
1878 int ret = __kvm_set_msr(vcpu, index, data, host_initiated);
1879
1880 if (ret == KVM_MSR_RET_INVALID)
d632826f 1881 if (kvm_msr_ignored_check(index, data, true))
cc4cb017 1882 ret = 0;
6abe9c13
PX
1883
1884 return ret;
1885}
1886
313a3dc7 1887/*
f20935d8
SC
1888 * Read the MSR specified by @index into @data. Selected MSR-specific fault
1889 * checks are bypassed if @host_initiated is %true.
1890 * Returns 0 on success, non-0 otherwise.
1891 * Assumes vcpu_load() was already called.
313a3dc7 1892 */
edef5c36
PB
1893int __kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data,
1894 bool host_initiated)
609e36d3
PB
1895{
1896 struct msr_data msr;
f20935d8 1897 int ret;
609e36d3 1898
61a05d44
SC
1899 switch (index) {
1900 case MSR_TSC_AUX:
1901 if (!kvm_is_supported_user_return_msr(MSR_TSC_AUX))
1902 return 1;
1903
1904 if (!host_initiated &&
1905 !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP) &&
1906 !guest_cpuid_has(vcpu, X86_FEATURE_RDPID))
1907 return 1;
1908 break;
1909 }
1910
609e36d3 1911 msr.index = index;
f20935d8 1912 msr.host_initiated = host_initiated;
609e36d3 1913
b3646477 1914 ret = static_call(kvm_x86_get_msr)(vcpu, &msr);
f20935d8
SC
1915 if (!ret)
1916 *data = msr.data;
1917 return ret;
609e36d3
PB
1918}
1919
6abe9c13
PX
1920static int kvm_get_msr_ignored_check(struct kvm_vcpu *vcpu,
1921 u32 index, u64 *data, bool host_initiated)
1922{
1923 int ret = __kvm_get_msr(vcpu, index, data, host_initiated);
1924
1925 if (ret == KVM_MSR_RET_INVALID) {
1926 /* Unconditionally clear *data for simplicity */
1927 *data = 0;
d632826f 1928 if (kvm_msr_ignored_check(index, 0, false))
cc4cb017 1929 ret = 0;
6abe9c13
PX
1930 }
1931
1932 return ret;
1933}
1934
ac8d6cad
HW
1935static int kvm_get_msr_with_filter(struct kvm_vcpu *vcpu, u32 index, u64 *data)
1936{
1937 if (!kvm_msr_allowed(vcpu, index, KVM_MSR_FILTER_READ))
1938 return KVM_MSR_RET_FILTERED;
1939 return kvm_get_msr_ignored_check(vcpu, index, data, false);
1940}
1941
1942static int kvm_set_msr_with_filter(struct kvm_vcpu *vcpu, u32 index, u64 data)
1943{
1944 if (!kvm_msr_allowed(vcpu, index, KVM_MSR_FILTER_WRITE))
1945 return KVM_MSR_RET_FILTERED;
1946 return kvm_set_msr_ignored_check(vcpu, index, data, false);
1947}
1948
f20935d8 1949int kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data)
313a3dc7 1950{
6abe9c13 1951 return kvm_get_msr_ignored_check(vcpu, index, data, false);
f20935d8
SC
1952}
1953EXPORT_SYMBOL_GPL(kvm_get_msr);
8fe8ab46 1954
f20935d8
SC
1955int kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data)
1956{
6abe9c13 1957 return kvm_set_msr_ignored_check(vcpu, index, data, false);
f20935d8
SC
1958}
1959EXPORT_SYMBOL_GPL(kvm_set_msr);
1960
d2f7d498 1961static void complete_userspace_rdmsr(struct kvm_vcpu *vcpu)
1ae09954 1962{
d2f7d498 1963 if (!vcpu->run->msr.error) {
1ae09954
AG
1964 kvm_rax_write(vcpu, (u32)vcpu->run->msr.data);
1965 kvm_rdx_write(vcpu, vcpu->run->msr.data >> 32);
1966 }
d2f7d498 1967}
1ae09954 1968
d2f7d498
HW
1969static int complete_emulated_msr_access(struct kvm_vcpu *vcpu)
1970{
1971 return complete_emulated_insn_gp(vcpu, vcpu->run->msr.error);
1ae09954
AG
1972}
1973
d2f7d498
HW
1974static int complete_emulated_rdmsr(struct kvm_vcpu *vcpu)
1975{
1976 complete_userspace_rdmsr(vcpu);
1977 return complete_emulated_msr_access(vcpu);
1978}
1979
1980static int complete_fast_msr_access(struct kvm_vcpu *vcpu)
1ae09954 1981{
b3646477 1982 return static_call(kvm_x86_complete_emulated_msr)(vcpu, vcpu->run->msr.error);
1ae09954
AG
1983}
1984
d2f7d498
HW
1985static int complete_fast_rdmsr(struct kvm_vcpu *vcpu)
1986{
1987 complete_userspace_rdmsr(vcpu);
1988 return complete_fast_msr_access(vcpu);
1989}
1990
1ae09954
AG
1991static u64 kvm_msr_reason(int r)
1992{
1993 switch (r) {
cc4cb017 1994 case KVM_MSR_RET_INVALID:
1ae09954 1995 return KVM_MSR_EXIT_REASON_UNKNOWN;
cc4cb017 1996 case KVM_MSR_RET_FILTERED:
1a155254 1997 return KVM_MSR_EXIT_REASON_FILTER;
1ae09954
AG
1998 default:
1999 return KVM_MSR_EXIT_REASON_INVAL;
2000 }
2001}
2002
2003static int kvm_msr_user_space(struct kvm_vcpu *vcpu, u32 index,
2004 u32 exit_reason, u64 data,
2005 int (*completion)(struct kvm_vcpu *vcpu),
2006 int r)
2007{
2008 u64 msr_reason = kvm_msr_reason(r);
2009
2010 /* Check if the user wanted to know about this MSR fault */
2011 if (!(vcpu->kvm->arch.user_space_msr_mask & msr_reason))
2012 return 0;
2013
2014 vcpu->run->exit_reason = exit_reason;
2015 vcpu->run->msr.error = 0;
2016 memset(vcpu->run->msr.pad, 0, sizeof(vcpu->run->msr.pad));
2017 vcpu->run->msr.reason = msr_reason;
2018 vcpu->run->msr.index = index;
2019 vcpu->run->msr.data = data;
2020 vcpu->arch.complete_userspace_io = completion;
2021
2022 return 1;
2023}
2024
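From the VMM's point of view, a deflected access surfaces as a KVM_EXIT_X86_RDMSR or KVM_EXIT_X86_WRMSR exit whose kvm_run fields mirror what kvm_msr_user_space() filled in; leaving msr.error non-zero makes the completion path inject #GP on the next KVM_RUN. A rough sketch of the handling, where vmm_rdmsr()/vmm_wrmsr() are hypothetical VMM helpers:

#include <linux/kvm.h>
#include <stdint.h>

/* Hypothetical helpers: emulate the MSR in the VMM, return 0 on success. */
extern int vmm_rdmsr(uint32_t index, uint64_t *val);
extern int vmm_wrmsr(uint32_t index, uint64_t val);

static void handle_msr_exit(struct kvm_run *run)
{
	switch (run->exit_reason) {
	case KVM_EXIT_X86_RDMSR: {
		uint64_t val = 0;

		/* run->msr.reason says why KVM punted (e.g. filtered). */
		run->msr.error = vmm_rdmsr(run->msr.index, &val) ? 1 : 0;
		run->msr.data = val;
		break;
	}
	case KVM_EXIT_X86_WRMSR:
		run->msr.error = vmm_wrmsr(run->msr.index, run->msr.data) ? 1 : 0;
		break;
	}
	/* The next KVM_RUN completes the guest instruction, injecting #GP
	 * if run->msr.error is non-zero. */
}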
1edce0a9
SC
2025int kvm_emulate_rdmsr(struct kvm_vcpu *vcpu)
2026{
2027 u32 ecx = kvm_rcx_read(vcpu);
2028 u64 data;
1ae09954
AG
2029 int r;
2030
ac8d6cad 2031 r = kvm_get_msr_with_filter(vcpu, ecx, &data);
1edce0a9 2032
8b474427
PB
2033 if (!r) {
2034 trace_kvm_msr_read(ecx, data);
2035
2036 kvm_rax_write(vcpu, data & -1u);
2037 kvm_rdx_write(vcpu, (data >> 32) & -1u);
2038 } else {
d2f7d498
HW
2039 /* MSR read failed? See if we should ask user space */
2040 if (kvm_msr_user_space(vcpu, ecx, KVM_EXIT_X86_RDMSR, 0,
2041 complete_fast_rdmsr, r))
2042 return 0;
1edce0a9 2043 trace_kvm_msr_read_ex(ecx);
1edce0a9
SC
2044 }
2045
b3646477 2046 return static_call(kvm_x86_complete_emulated_msr)(vcpu, r);
1edce0a9
SC
2047}
2048EXPORT_SYMBOL_GPL(kvm_emulate_rdmsr);
2049
2050int kvm_emulate_wrmsr(struct kvm_vcpu *vcpu)
2051{
2052 u32 ecx = kvm_rcx_read(vcpu);
2053 u64 data = kvm_read_edx_eax(vcpu);
1ae09954 2054 int r;
1edce0a9 2055
ac8d6cad 2056 r = kvm_set_msr_with_filter(vcpu, ecx, data);
1ae09954 2057
d2f7d498 2058 if (!r) {
8b474427 2059 trace_kvm_msr_write(ecx, data);
d2f7d498
HW
2060 } else {
2061 /* MSR write failed? See if we should ask user space */
2062 if (kvm_msr_user_space(vcpu, ecx, KVM_EXIT_X86_WRMSR, data,
2063 complete_fast_msr_access, r))
2064 return 0;
2065 /* Signal all other negative errors to userspace */
2066 if (r < 0)
2067 return r;
1edce0a9 2068 trace_kvm_msr_write_ex(ecx, data);
d2f7d498 2069 }
1edce0a9 2070
b3646477 2071 return static_call(kvm_x86_complete_emulated_msr)(vcpu, r);
1edce0a9
SC
2072}
2073EXPORT_SYMBOL_GPL(kvm_emulate_wrmsr);
2074
5ff3a351
SC
2075int kvm_emulate_as_nop(struct kvm_vcpu *vcpu)
2076{
2077 return kvm_skip_emulated_instruction(vcpu);
2078}
5ff3a351
SC
2079
2080int kvm_emulate_invd(struct kvm_vcpu *vcpu)
2081{
2082 /* Treat an INVD instruction as a NOP and just skip it. */
2083 return kvm_emulate_as_nop(vcpu);
2084}
2085EXPORT_SYMBOL_GPL(kvm_emulate_invd);
2086
5ff3a351
SC
2087int kvm_handle_invalid_op(struct kvm_vcpu *vcpu)
2088{
2089 kvm_queue_exception(vcpu, UD_VECTOR);
2090 return 1;
2091}
2092EXPORT_SYMBOL_GPL(kvm_handle_invalid_op);
2093
bfbcc81b
SC
2094
2095static int kvm_emulate_monitor_mwait(struct kvm_vcpu *vcpu, const char *insn)
5ff3a351 2096{
43bb9e00 2097 if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS) &&
bfbcc81b
SC
2098 !guest_cpuid_has(vcpu, X86_FEATURE_MWAIT))
2099 return kvm_handle_invalid_op(vcpu);
2100
8d20bd63 2101 pr_warn_once("%s instruction emulated as NOP!\n", insn);
5ff3a351
SC
2102 return kvm_emulate_as_nop(vcpu);
2103}
bfbcc81b
SC
2104int kvm_emulate_mwait(struct kvm_vcpu *vcpu)
2105{
2106 return kvm_emulate_monitor_mwait(vcpu, "MWAIT");
2107}
2108EXPORT_SYMBOL_GPL(kvm_emulate_mwait);
2109
2110int kvm_emulate_monitor(struct kvm_vcpu *vcpu)
2111{
2112 return kvm_emulate_monitor_mwait(vcpu, "MONITOR");
2113}
5ff3a351
SC
2114EXPORT_SYMBOL_GPL(kvm_emulate_monitor);
2115
d89d04ab 2116static inline bool kvm_vcpu_exit_request(struct kvm_vcpu *vcpu)
5a9f5443 2117{
4ae7dc97 2118 xfer_to_guest_mode_prepare();
5a9f5443 2119 return vcpu->mode == EXITING_GUEST_MODE || kvm_request_pending(vcpu) ||
72c3c0fe 2120 xfer_to_guest_mode_work_pending();
5a9f5443 2121}
5a9f5443 2122
1e9e2622
WL
2123/*
2124 * The fast path for frequent and performance-sensitive WRMSR emulation,
2125 * i.e. sending an IPI. Handling the IPI early in the VM-Exit flow reduces
2126 * the latency of virtual IPIs by avoiding the expensive bits of transitioning
2127 * from guest to host, e.g. reacquiring KVM's SRCU lock, in contrast to the
2128 * other cases, which must be handled after interrupts are enabled on the host.
2129 */
2130static int handle_fastpath_set_x2apic_icr_irqoff(struct kvm_vcpu *vcpu, u64 data)
2131{
e1be9ac8
WL
2132 if (!lapic_in_kernel(vcpu) || !apic_x2apic_mode(vcpu->arch.apic))
2133 return 1;
2134
2135 if (((data & APIC_SHORT_MASK) == APIC_DEST_NOSHORT) &&
bd17f417
SC
2136 ((data & APIC_DEST_MASK) == APIC_DEST_PHYSICAL) &&
2137 ((data & APIC_MODE_MASK) == APIC_DM_FIXED) &&
b9964ee3
SC
2138 ((u32)(data >> 32) != X2APIC_BROADCAST))
2139 return kvm_x2apic_icr_write(vcpu->arch.apic, data);
1e9e2622
WL
2140
2141 return 1;
2142}
2143
ae95f566
WL
2144static int handle_fastpath_set_tscdeadline(struct kvm_vcpu *vcpu, u64 data)
2145{
2146 if (!kvm_can_use_hv_timer(vcpu))
2147 return 1;
2148
2149 kvm_set_lapic_tscdeadline_msr(vcpu, data);
2150 return 0;
2151}
2152
404d5d7b 2153fastpath_t handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu)
1e9e2622
WL
2154{
2155 u32 msr = kvm_rcx_read(vcpu);
8a1038de 2156 u64 data;
404d5d7b 2157 fastpath_t ret = EXIT_FASTPATH_NONE;
1e9e2622
WL
2158
2159 switch (msr) {
2160 case APIC_BASE_MSR + (APIC_ICR >> 4):
8a1038de 2161 data = kvm_read_edx_eax(vcpu);
404d5d7b
WL
2162 if (!handle_fastpath_set_x2apic_icr_irqoff(vcpu, data)) {
2163 kvm_skip_emulated_instruction(vcpu);
2164 ret = EXIT_FASTPATH_EXIT_HANDLED;
80bc97f2 2165 }
1e9e2622 2166 break;
09141ec0 2167 case MSR_IA32_TSC_DEADLINE:
ae95f566
WL
2168 data = kvm_read_edx_eax(vcpu);
2169 if (!handle_fastpath_set_tscdeadline(vcpu, data)) {
2170 kvm_skip_emulated_instruction(vcpu);
2171 ret = EXIT_FASTPATH_REENTER_GUEST;
2172 }
2173 break;
1e9e2622 2174 default:
404d5d7b 2175 break;
1e9e2622
WL
2176 }
2177
404d5d7b 2178 if (ret != EXIT_FASTPATH_NONE)
1e9e2622 2179 trace_kvm_msr_write(msr, data);
1e9e2622 2180
404d5d7b 2181 return ret;
1e9e2622
WL
2182}
2183EXPORT_SYMBOL_GPL(handle_fastpath_set_msr_irqoff);
2184
f20935d8
SC
2185/*
2186 * Adapt set_msr() to msr_io()'s calling convention
2187 */
2188static int do_get_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
2189{
6abe9c13 2190 return kvm_get_msr_ignored_check(vcpu, index, data, true);
f20935d8
SC
2191}
2192
2193static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
2194{
6abe9c13 2195 return kvm_set_msr_ignored_check(vcpu, index, *data, true);
313a3dc7
CO
2196}
2197
16e8d74d 2198#ifdef CONFIG_X86_64
53fafdbb
MT
2199struct pvclock_clock {
2200 int vclock_mode;
2201 u64 cycle_last;
2202 u64 mask;
2203 u32 mult;
2204 u32 shift;
917f9475
PB
2205 u64 base_cycles;
2206 u64 offset;
53fafdbb
MT
2207};
2208
16e8d74d
MT
2209struct pvclock_gtod_data {
2210 seqcount_t seq;
2211
53fafdbb
MT
2212 struct pvclock_clock clock; /* extract of a clocksource struct */
2213 struct pvclock_clock raw_clock; /* extract of a clocksource struct */
16e8d74d 2214
917f9475 2215 ktime_t offs_boot;
55dd00a7 2216 u64 wall_time_sec;
16e8d74d
MT
2217};
2218
2219static struct pvclock_gtod_data pvclock_gtod_data;
2220
2221static void update_pvclock_gtod(struct timekeeper *tk)
2222{
2223 struct pvclock_gtod_data *vdata = &pvclock_gtod_data;
2224
2225 write_seqcount_begin(&vdata->seq);
2226
2227 /* copy pvclock gtod data */
b95a8a27 2228 vdata->clock.vclock_mode = tk->tkr_mono.clock->vdso_clock_mode;
876e7881
PZ
2229 vdata->clock.cycle_last = tk->tkr_mono.cycle_last;
2230 vdata->clock.mask = tk->tkr_mono.mask;
2231 vdata->clock.mult = tk->tkr_mono.mult;
2232 vdata->clock.shift = tk->tkr_mono.shift;
917f9475
PB
2233 vdata->clock.base_cycles = tk->tkr_mono.xtime_nsec;
2234 vdata->clock.offset = tk->tkr_mono.base;
16e8d74d 2235
b95a8a27 2236 vdata->raw_clock.vclock_mode = tk->tkr_raw.clock->vdso_clock_mode;
53fafdbb
MT
2237 vdata->raw_clock.cycle_last = tk->tkr_raw.cycle_last;
2238 vdata->raw_clock.mask = tk->tkr_raw.mask;
2239 vdata->raw_clock.mult = tk->tkr_raw.mult;
2240 vdata->raw_clock.shift = tk->tkr_raw.shift;
917f9475
PB
2241 vdata->raw_clock.base_cycles = tk->tkr_raw.xtime_nsec;
2242 vdata->raw_clock.offset = tk->tkr_raw.base;
16e8d74d 2243
55dd00a7
MT
2244 vdata->wall_time_sec = tk->xtime_sec;
2245
917f9475 2246 vdata->offs_boot = tk->offs_boot;
53fafdbb 2247
16e8d74d
MT
2248 write_seqcount_end(&vdata->seq);
2249}
8171cd68
PB
2250
2251static s64 get_kvmclock_base_ns(void)
2252{
2253 /* Count up from boot time, but with the frequency of the raw clock. */
2254 return ktime_to_ns(ktime_add(ktime_get_raw(), pvclock_gtod_data.offs_boot));
2255}
2256#else
2257static s64 get_kvmclock_base_ns(void)
2258{
2259 /* Master clock not used, so we can just use CLOCK_BOOTTIME. */
2260 return ktime_get_boottime_ns();
2261}
16e8d74d
MT
2262#endif
2263
55749769 2264static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock, int sec_hi_ofs)
18068523 2265{
9ed3c444
AK
2266 int version;
2267 int r;
50d0a0f9 2268 struct pvclock_wall_clock wc;
629b5348 2269 u32 wc_sec_hi;
8171cd68 2270 u64 wall_nsec;
18068523
GOC
2271
2272 if (!wall_clock)
2273 return;
2274
9ed3c444
AK
2275 r = kvm_read_guest(kvm, wall_clock, &version, sizeof(version));
2276 if (r)
2277 return;
2278
2279 if (version & 1)
2280 ++version; /* first time write, random junk */
2281
2282 ++version;
18068523 2283
1dab1345
NK
2284 if (kvm_write_guest(kvm, wall_clock, &version, sizeof(version)))
2285 return;
18068523 2286
50d0a0f9
GH
2287 /*
2288 * The guest calculates current wall clock time by adding
34c238a1 2289 * system time (updated by kvm_guest_time_update below) to the
8171cd68 2290 * wall clock specified here. We do the reverse here.
50d0a0f9 2291 */
8171cd68 2292 wall_nsec = ktime_get_real_ns() - get_kvmclock_ns(kvm);
50d0a0f9 2293
8171cd68
PB
2294 wc.nsec = do_div(wall_nsec, 1000000000);
2295 wc.sec = (u32)wall_nsec; /* overflow in 2106 guest time */
50d0a0f9 2296 wc.version = version;
18068523
GOC
2297
2298 kvm_write_guest(kvm, wall_clock, &wc, sizeof(wc));
2299
629b5348
JM
2300 if (sec_hi_ofs) {
2301 wc_sec_hi = wall_nsec >> 32;
2302 kvm_write_guest(kvm, wall_clock + sec_hi_ofs,
2303 &wc_sec_hi, sizeof(wc_sec_hi));
2304 }
2305
18068523
GOC
2306 version++;
2307 kvm_write_guest(kvm, wall_clock, &version, sizeof(version));
18068523
GOC
2308}
2309
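The reverse computation described in the comment boils down to subtracting the current kvmclock reading from host real time and then splitting the result into seconds and nanoseconds, which is what the do_div() call does. A plain arithmetic sketch with made-up input values:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* Illustrative inputs, both in nanoseconds. */
	uint64_t host_realtime_ns = 1700000000123456789ULL;
	uint64_t kvmclock_ns      = 42000000000ULL;	/* guest has run ~42s */

	uint64_t wall_nsec = host_realtime_ns - kvmclock_ns;
	uint32_t nsec = (uint32_t)(wall_nsec % 1000000000ULL);	/* wc.nsec */
	uint32_t sec  = (uint32_t)(wall_nsec / 1000000000ULL);	/* wc.sec, wraps in 2106 */

	printf("sec=%u nsec=%u\n", (unsigned)sec, (unsigned)nsec);
	return 0;
}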
5b9bb0eb
OU
2310static void kvm_write_system_time(struct kvm_vcpu *vcpu, gpa_t system_time,
2311 bool old_msr, bool host_initiated)
2312{
2313 struct kvm_arch *ka = &vcpu->kvm->arch;
2314
2315 if (vcpu->vcpu_id == 0 && !host_initiated) {
1e293d1a 2316 if (ka->boot_vcpu_runs_old_kvmclock != old_msr)
5b9bb0eb
OU
2317 kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
2318
2319 ka->boot_vcpu_runs_old_kvmclock = old_msr;
2320 }
2321
2322 vcpu->arch.time = system_time;
2323 kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu);
2324
2325	/* Check whether the enable bit is set... */
8c82a0b3
ML
2326 if (system_time & 1)
2327 kvm_gpc_activate(&vcpu->arch.pv_time, system_time & ~1ULL,
52491a38 2328 sizeof(struct pvclock_vcpu_time_info));
8c82a0b3
ML
2329 else
2330 kvm_gpc_deactivate(&vcpu->arch.pv_time);
5b9bb0eb
OU
2331
2332 return;
2333}
2334
50d0a0f9
GH
2335static uint32_t div_frac(uint32_t dividend, uint32_t divisor)
2336{
b51012de
PB
2337 do_shl32_div32(dividend, divisor);
2338 return dividend;
50d0a0f9
GH
2339}
2340
3ae13faa 2341static void kvm_get_time_scale(uint64_t scaled_hz, uint64_t base_hz,
5f4e3f88 2342 s8 *pshift, u32 *pmultiplier)
50d0a0f9 2343{
5f4e3f88 2344 uint64_t scaled64;
50d0a0f9
GH
2345 int32_t shift = 0;
2346 uint64_t tps64;
2347 uint32_t tps32;
2348
3ae13faa
PB
2349 tps64 = base_hz;
2350 scaled64 = scaled_hz;
50933623 2351 while (tps64 > scaled64*2 || tps64 & 0xffffffff00000000ULL) {
50d0a0f9
GH
2352 tps64 >>= 1;
2353 shift--;
2354 }
2355
2356 tps32 = (uint32_t)tps64;
50933623
JK
2357 while (tps32 <= scaled64 || scaled64 & 0xffffffff00000000ULL) {
2358 if (scaled64 & 0xffffffff00000000ULL || tps32 & 0x80000000)
5f4e3f88
ZA
2359 scaled64 >>= 1;
2360 else
2361 tps32 <<= 1;
50d0a0f9
GH
2362 shift++;
2363 }
2364
5f4e3f88
ZA
2365 *pshift = shift;
2366 *pmultiplier = div_frac(scaled64, tps32);
50d0a0f9
GH
2367}
2368
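For reference, the search above can be reproduced in user space: it picks a (shift, multiplier) pair such that a tick count at base_hz, shifted by `shift` (a negative value meaning a right shift) and multiplied by `mult` with the product taken >> 32, lands in scaled_hz units, which is how pvclock consumes the pair. A sketch under those assumptions (uses the unsigned __int128 GCC/Clang extension for the wide multiply):

#include <stdint.h>
#include <stdio.h>

static void get_time_scale(uint64_t scaled_hz, uint64_t base_hz,
			   int8_t *pshift, uint32_t *pmult)
{
	uint64_t tps64 = base_hz, scaled64 = scaled_hz;
	int32_t shift = 0;
	uint32_t tps32;

	while (tps64 > scaled64 * 2 || tps64 & 0xffffffff00000000ULL) {
		tps64 >>= 1;
		shift--;
	}
	tps32 = (uint32_t)tps64;
	while (tps32 <= scaled64 || scaled64 & 0xffffffff00000000ULL) {
		if (scaled64 & 0xffffffff00000000ULL || tps32 & 0x80000000)
			scaled64 >>= 1;
		else
			tps32 <<= 1;
		shift++;
	}
	*pshift = (int8_t)shift;
	*pmult = (uint32_t)(((uint64_t)scaled64 << 32) / tps32);	/* div_frac() */
}

/* Mirror of how pvclock applies the pair to a tick delta. */
static uint64_t scale_delta(uint64_t delta, int8_t shift, uint32_t mult)
{
	if (shift < 0)
		delta >>= -shift;
	else
		delta <<= shift;
	return (uint64_t)(((unsigned __int128)delta * mult) >> 32);
}

int main(void)
{
	int8_t shift;
	uint32_t mult;

	/* 2.5 GHz TSC -> nanoseconds: 2.5e9 cycles should come out as ~1e9 ns. */
	get_time_scale(1000000000ULL, 2500000000ULL, &shift, &mult);
	printf("shift=%d mult=%u ns=%llu\n", shift, mult,
	       (unsigned long long)scale_delta(2500000000ULL, shift, mult));
	return 0;
}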
d828199e 2369#ifdef CONFIG_X86_64
16e8d74d 2370static atomic_t kvm_guest_has_master_clock = ATOMIC_INIT(0);
d828199e 2371#endif
16e8d74d 2372
c8076604 2373static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz);
69b0049a 2374static unsigned long max_tsc_khz;
c8076604 2375
cc578287 2376static u32 adjust_tsc_khz(u32 khz, s32 ppm)
1e993611 2377{
cc578287
ZA
2378 u64 v = (u64)khz * (1000000 + ppm);
2379 do_div(v, 1000000);
2380 return v;
1e993611
JR
2381}
2382
1ab9287a
IS
2383static void kvm_vcpu_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 l1_multiplier);
2384
381d585c
HZ
2385static int set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale)
2386{
2387 u64 ratio;
2388
2389 /* Guest TSC same frequency as host TSC? */
2390 if (!scale) {
938c8745 2391 kvm_vcpu_write_tsc_multiplier(vcpu, kvm_caps.default_tsc_scaling_ratio);
381d585c
HZ
2392 return 0;
2393 }
2394
2395 /* TSC scaling supported? */
938c8745 2396 if (!kvm_caps.has_tsc_control) {
381d585c
HZ
2397 if (user_tsc_khz > tsc_khz) {
2398 vcpu->arch.tsc_catchup = 1;
2399 vcpu->arch.tsc_always_catchup = 1;
2400 return 0;
2401 } else {
3f16a5c3 2402 pr_warn_ratelimited("user requested TSC rate below hardware speed\n");
381d585c
HZ
2403 return -1;
2404 }
2405 }
2406
2407 /* TSC scaling required - calculate ratio */
938c8745 2408 ratio = mul_u64_u32_div(1ULL << kvm_caps.tsc_scaling_ratio_frac_bits,
381d585c
HZ
2409 user_tsc_khz, tsc_khz);
2410
938c8745 2411 if (ratio == 0 || ratio >= kvm_caps.max_tsc_scaling_ratio) {
3f16a5c3
PB
2412 pr_warn_ratelimited("Invalid TSC scaling ratio - virtual-tsc-khz=%u\n",
2413 user_tsc_khz);
381d585c
HZ
2414 return -1;
2415 }
2416
1ab9287a 2417 kvm_vcpu_write_tsc_multiplier(vcpu, ratio);
381d585c
HZ
2418 return 0;
2419}
2420
4941b8cb 2421static int kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz)
759379dd 2422{
cc578287
ZA
2423 u32 thresh_lo, thresh_hi;
2424 int use_scaling = 0;
217fc9cf 2425
03ba32ca 2426 /* tsc_khz can be zero if TSC calibration fails */
4941b8cb 2427 if (user_tsc_khz == 0) {
ad721883 2428 /* set tsc_scaling_ratio to a safe value */
938c8745 2429 kvm_vcpu_write_tsc_multiplier(vcpu, kvm_caps.default_tsc_scaling_ratio);
381d585c 2430 return -1;
ad721883 2431 }
03ba32ca 2432
c285545f 2433	/* Compute a scale to convert nanoseconds to TSC cycles */
3ae13faa 2434 kvm_get_time_scale(user_tsc_khz * 1000LL, NSEC_PER_SEC,
cc578287
ZA
2435 &vcpu->arch.virtual_tsc_shift,
2436 &vcpu->arch.virtual_tsc_mult);
4941b8cb 2437 vcpu->arch.virtual_tsc_khz = user_tsc_khz;
cc578287
ZA
2438
2439 /*
2440 * Compute the variation in TSC rate which is acceptable
2441 * within the range of tolerance and decide if the
2442	 * rate being applied is within those bounds of the hardware
2443 * rate. If so, no scaling or compensation need be done.
2444 */
2445 thresh_lo = adjust_tsc_khz(tsc_khz, -tsc_tolerance_ppm);
2446 thresh_hi = adjust_tsc_khz(tsc_khz, tsc_tolerance_ppm);
4941b8cb 2447 if (user_tsc_khz < thresh_lo || user_tsc_khz > thresh_hi) {
8d20bd63
SC
2448 pr_debug("requested TSC rate %u falls outside tolerance [%u,%u]\n",
2449 user_tsc_khz, thresh_lo, thresh_hi);
cc578287
ZA
2450 use_scaling = 1;
2451 }
4941b8cb 2452 return set_tsc_khz(vcpu, user_tsc_khz, use_scaling);
c285545f
ZA
2453}
2454
2455static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns)
2456{
e26101b1 2457 u64 tsc = pvclock_scale_delta(kernel_ns-vcpu->arch.this_tsc_nsec,
cc578287
ZA
2458 vcpu->arch.virtual_tsc_mult,
2459 vcpu->arch.virtual_tsc_shift);
e26101b1 2460 tsc += vcpu->arch.this_tsc_write;
c285545f
ZA
2461 return tsc;
2462}
2463
ba1f77c5 2464#ifdef CONFIG_X86_64
b0c39dc6
VK
2465static inline int gtod_is_based_on_tsc(int mode)
2466{
b95a8a27 2467 return mode == VDSO_CLOCKMODE_TSC || mode == VDSO_CLOCKMODE_HVCLOCK;
b0c39dc6 2468}
ba1f77c5 2469#endif
b0c39dc6 2470
69b0049a 2471static void kvm_track_tsc_matching(struct kvm_vcpu *vcpu)
b48aa97e
MT
2472{
2473#ifdef CONFIG_X86_64
2474 bool vcpus_matched;
b48aa97e
MT
2475 struct kvm_arch *ka = &vcpu->kvm->arch;
2476 struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
2477
2478 vcpus_matched = (ka->nr_vcpus_matched_tsc + 1 ==
2479 atomic_read(&vcpu->kvm->online_vcpus));
2480
7f187922
MT
2481 /*
2482	 * Once the masterclock is enabled, always perform the request in
2483	 * order to update it.
2484	 *
2485	 * In order to enable the masterclock, the host clocksource must be TSC
2486	 * and the vcpus need to have matched TSCs. When that happens,
2487	 * perform the request to enable the masterclock.
2488 */
2489 if (ka->use_master_clock ||
b0c39dc6 2490 (gtod_is_based_on_tsc(gtod->clock.vclock_mode) && vcpus_matched))
b48aa97e
MT
2491 kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
2492
2493 trace_kvm_track_tsc(vcpu->vcpu_id, ka->nr_vcpus_matched_tsc,
2494 atomic_read(&vcpu->kvm->online_vcpus),
2495 ka->use_master_clock, gtod->clock.vclock_mode);
2496#endif
2497}
2498
35181e86
HZ
2499/*
2500 * Multiply tsc by a fixed point number represented by ratio.
2501 *
2502 * The most significant 64-N bits (mult) of ratio represent the
2503 * integral part of the fixed point number; the remaining N bits
2504 * (frac) represent the fractional part, i.e. ratio represents a fixed
2505 * point number (mult + frac * 2^(-N)).
2506 *
938c8745 2507 * N equals to kvm_caps.tsc_scaling_ratio_frac_bits.
35181e86
HZ
2508 */
2509static inline u64 __scale_tsc(u64 ratio, u64 tsc)
2510{
938c8745 2511 return mul_u64_u64_shr(tsc, ratio, kvm_caps.tsc_scaling_ratio_frac_bits);
35181e86
HZ
2512}
2513
62711e5a 2514u64 kvm_scale_tsc(u64 tsc, u64 ratio)
35181e86
HZ
2515{
2516 u64 _tsc = tsc;
35181e86 2517
938c8745 2518 if (ratio != kvm_caps.default_tsc_scaling_ratio)
35181e86
HZ
2519 _tsc = __scale_tsc(ratio, tsc);
2520
2521 return _tsc;
2522}
35181e86 2523
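Concretely, scaling by the fixed-point ratio is a 128-bit multiply followed by a right shift of N bits, the same shape as mul_u64_u64_shr(). A user-space sketch with an illustrative N of 48; the real width comes from kvm_caps.tsc_scaling_ratio_frac_bits and is vendor specific:

#include <stdint.h>
#include <stdio.h>

/* Illustrative fractional width, not the value KVM actually uses. */
#define FRAC_BITS 48

static uint64_t scale_tsc(uint64_t tsc, uint64_t ratio)
{
	/* (tsc * ratio) >> FRAC_BITS, kept exact with a 128-bit product. */
	return (uint64_t)(((unsigned __int128)tsc * ratio) >> FRAC_BITS);
}

int main(void)
{
	/* A guest running at 3/4 of the host TSC frequency: ratio = 0.75. */
	uint64_t ratio = (3ULL << FRAC_BITS) / 4;

	printf("%llu\n", (unsigned long long)scale_tsc(1000000ULL, ratio));
	/* prints 750000 */
	return 0;
}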
9b399dfd 2524static u64 kvm_compute_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc)
07c1419a
HZ
2525{
2526 u64 tsc;
2527
62711e5a 2528 tsc = kvm_scale_tsc(rdtsc(), vcpu->arch.l1_tsc_scaling_ratio);
07c1419a
HZ
2529
2530 return target_tsc - tsc;
2531}
2532
4ba76538
HZ
2533u64 kvm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc)
2534{
fe3eb504 2535 return vcpu->arch.l1_tsc_offset +
62711e5a 2536 kvm_scale_tsc(host_tsc, vcpu->arch.l1_tsc_scaling_ratio);
4ba76538
HZ
2537}
2538EXPORT_SYMBOL_GPL(kvm_read_l1_tsc);
2539
83150f29
IS
2540u64 kvm_calc_nested_tsc_offset(u64 l1_offset, u64 l2_offset, u64 l2_multiplier)
2541{
2542 u64 nested_offset;
2543
938c8745 2544 if (l2_multiplier == kvm_caps.default_tsc_scaling_ratio)
83150f29
IS
2545 nested_offset = l1_offset;
2546 else
2547 nested_offset = mul_s64_u64_shr((s64) l1_offset, l2_multiplier,
938c8745 2548 kvm_caps.tsc_scaling_ratio_frac_bits);
83150f29
IS
2549
2550 nested_offset += l2_offset;
2551 return nested_offset;
2552}
2553EXPORT_SYMBOL_GPL(kvm_calc_nested_tsc_offset);
2554
2555u64 kvm_calc_nested_tsc_multiplier(u64 l1_multiplier, u64 l2_multiplier)
2556{
938c8745 2557 if (l2_multiplier != kvm_caps.default_tsc_scaling_ratio)
83150f29 2558 return mul_u64_u64_shr(l1_multiplier, l2_multiplier,
938c8745 2559 kvm_caps.tsc_scaling_ratio_frac_bits);
83150f29
IS
2560
2561 return l1_multiplier;
2562}
2563EXPORT_SYMBOL_GPL(kvm_calc_nested_tsc_multiplier);
2564
edcfe540 2565static void kvm_vcpu_write_tsc_offset(struct kvm_vcpu *vcpu, u64 l1_offset)
a545ab6a 2566{
edcfe540
IS
2567 trace_kvm_write_tsc_offset(vcpu->vcpu_id,
2568 vcpu->arch.l1_tsc_offset,
2569 l1_offset);
2570
2571 vcpu->arch.l1_tsc_offset = l1_offset;
2572
2573 /*
2574 * If we are here because L1 chose not to trap WRMSR to TSC then
2575 * according to the spec this should set L1's TSC (as opposed to
2576 * setting L1's offset for L2).
2577 */
2578 if (is_guest_mode(vcpu))
2579 vcpu->arch.tsc_offset = kvm_calc_nested_tsc_offset(
2580 l1_offset,
2581 static_call(kvm_x86_get_l2_tsc_offset)(vcpu),
2582 static_call(kvm_x86_get_l2_tsc_multiplier)(vcpu));
2583 else
2584 vcpu->arch.tsc_offset = l1_offset;
2585
2586 static_call(kvm_x86_write_tsc_offset)(vcpu, vcpu->arch.tsc_offset);
a545ab6a
LC
2587}
2588
1ab9287a
IS
2589static void kvm_vcpu_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 l1_multiplier)
2590{
2591 vcpu->arch.l1_tsc_scaling_ratio = l1_multiplier;
2592
2593 /* Userspace is changing the multiplier while L2 is active */
2594 if (is_guest_mode(vcpu))
2595 vcpu->arch.tsc_scaling_ratio = kvm_calc_nested_tsc_multiplier(
2596 l1_multiplier,
2597 static_call(kvm_x86_get_l2_tsc_multiplier)(vcpu));
2598 else
2599 vcpu->arch.tsc_scaling_ratio = l1_multiplier;
2600
938c8745 2601 if (kvm_caps.has_tsc_control)
1ab9287a
IS
2602 static_call(kvm_x86_write_tsc_multiplier)(
2603 vcpu, vcpu->arch.tsc_scaling_ratio);
2604}
2605
b0c39dc6
VK
2606static inline bool kvm_check_tsc_unstable(void)
2607{
2608#ifdef CONFIG_X86_64
2609 /*
2610	 * TSC is marked unstable when we're running on Hyper-V, but the
2611	 * 'TSC page' clocksource is still good in that case.
2612 */
b95a8a27 2613 if (pvclock_gtod_data.clock.vclock_mode == VDSO_CLOCKMODE_HVCLOCK)
b0c39dc6
VK
2614 return false;
2615#endif
2616 return check_tsc_unstable();
2617}
2618
58d4277b
OU
2619/*
2620 * Infers attempts to synchronize the guest's tsc from host writes. Sets the
2621 * offset for the vcpu and tracks the TSC matching generation that the vcpu
2622 * participates in.
2623 */
2624static void __kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 offset, u64 tsc,
2625 u64 ns, bool matched)
2626{
2627 struct kvm *kvm = vcpu->kvm;
2628
2629 lockdep_assert_held(&kvm->arch.tsc_write_lock);
2630
2631 /*
2632	 * We also track the most recent recorded KHZ, write and time to
2633 * allow the matching interval to be extended at each write.
2634 */
2635 kvm->arch.last_tsc_nsec = ns;
2636 kvm->arch.last_tsc_write = tsc;
2637 kvm->arch.last_tsc_khz = vcpu->arch.virtual_tsc_khz;
828ca896 2638 kvm->arch.last_tsc_offset = offset;
58d4277b
OU
2639
2640 vcpu->arch.last_guest_tsc = tsc;
2641
2642 kvm_vcpu_write_tsc_offset(vcpu, offset);
2643
2644 if (!matched) {
2645 /*
2646 * We split periods of matched TSC writes into generations.
2647 * For each generation, we track the original measured
2648 * nanosecond time, offset, and write, so if TSCs are in
2649 * sync, we can match exact offset, and if not, we can match
2650 * exact software computation in compute_guest_tsc()
2651 *
2652 * These values are tracked in kvm->arch.cur_xxx variables.
2653 */
2654 kvm->arch.cur_tsc_generation++;
2655 kvm->arch.cur_tsc_nsec = ns;
2656 kvm->arch.cur_tsc_write = tsc;
2657 kvm->arch.cur_tsc_offset = offset;
2658 kvm->arch.nr_vcpus_matched_tsc = 0;
2659 } else if (vcpu->arch.this_tsc_generation != kvm->arch.cur_tsc_generation) {
2660 kvm->arch.nr_vcpus_matched_tsc++;
2661 }
2662
2663 /* Keep track of which generation this VCPU has synchronized to */
2664 vcpu->arch.this_tsc_generation = kvm->arch.cur_tsc_generation;
2665 vcpu->arch.this_tsc_nsec = kvm->arch.cur_tsc_nsec;
2666 vcpu->arch.this_tsc_write = kvm->arch.cur_tsc_write;
2667
2668 kvm_track_tsc_matching(vcpu);
2669}
2670
0c899c25 2671static void kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 data)
99e3e30a
ZA
2672{
2673 struct kvm *kvm = vcpu->kvm;
f38e098f 2674 u64 offset, ns, elapsed;
99e3e30a 2675 unsigned long flags;
58d4277b 2676 bool matched = false;
c5e8ec8e 2677 bool synchronizing = false;
99e3e30a 2678
038f8c11 2679 raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
9b399dfd 2680 offset = kvm_compute_l1_tsc_offset(vcpu, data);
8171cd68 2681 ns = get_kvmclock_base_ns();
f38e098f 2682 elapsed = ns - kvm->arch.last_tsc_nsec;
5d3cb0f6 2683
03ba32ca 2684 if (vcpu->arch.virtual_tsc_khz) {
0c899c25 2685 if (data == 0) {
bd8fab39
DP
2686 /*
2687 * detection of vcpu initialization -- need to sync
2688 * with other vCPUs. This particularly helps to keep
2689 * kvm_clock stable after CPU hotplug
2690 */
2691 synchronizing = true;
2692 } else {
2693 u64 tsc_exp = kvm->arch.last_tsc_write +
2694 nsec_to_cycles(vcpu, elapsed);
2695 u64 tsc_hz = vcpu->arch.virtual_tsc_khz * 1000LL;
2696 /*
2697 * Special case: TSC write with a small delta (1 second)
2698 * of virtual cycle time against real time is
2699 * interpreted as an attempt to synchronize the CPU.
2700 */
2701 synchronizing = data < tsc_exp + tsc_hz &&
2702 data + tsc_hz > tsc_exp;
2703 }
c5e8ec8e 2704 }
f38e098f
ZA
2705
2706 /*
5d3cb0f6
ZA
2707 * For a reliable TSC, we can match TSC offsets, and for an unstable
2708 * TSC, we add elapsed time in this computation. We could let the
2709 * compensation code attempt to catch up if we fall behind, but
2710 * it's better to try to match offsets from the beginning.
2711 */
c5e8ec8e 2712 if (synchronizing &&
5d3cb0f6 2713 vcpu->arch.virtual_tsc_khz == kvm->arch.last_tsc_khz) {
b0c39dc6 2714 if (!kvm_check_tsc_unstable()) {
e26101b1 2715 offset = kvm->arch.cur_tsc_offset;
f38e098f 2716 } else {
857e4099 2717 u64 delta = nsec_to_cycles(vcpu, elapsed);
5d3cb0f6 2718 data += delta;
9b399dfd 2719 offset = kvm_compute_l1_tsc_offset(vcpu, data);
f38e098f 2720 }
b48aa97e 2721 matched = true;
f38e098f 2722 }
e26101b1 2723
58d4277b 2724 __kvm_synchronize_tsc(vcpu, offset, data, ns, matched);
e26101b1 2725 raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
99e3e30a 2726}
e26101b1 2727
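The "small delta" special case reduces to asking whether the written value lies within one second's worth of guest cycles of where the previous write's TSC would be now. A sketch of just that predicate; nsec_to_cycles() below is a simplified stand-in for the kernel helper of the same name:

#include <stdbool.h>
#include <stdint.h>

/* Cycles the guest TSC advances over elapsed_ns at virtual_tsc_khz. */
static uint64_t nsec_to_cycles(uint64_t virtual_tsc_khz, uint64_t elapsed_ns)
{
	return (uint64_t)((unsigned __int128)elapsed_ns * virtual_tsc_khz / 1000000ULL);
}

static bool tsc_write_is_synchronizing(uint64_t data, uint64_t last_tsc_write,
					uint64_t virtual_tsc_khz, uint64_t elapsed_ns)
{
	uint64_t tsc_exp = last_tsc_write + nsec_to_cycles(virtual_tsc_khz, elapsed_ns);
	uint64_t tsc_hz = virtual_tsc_khz * 1000ULL;	/* one second of cycles */

	/* Equivalent to |data - tsc_exp| < tsc_hz, written without wrap issues. */
	return data < tsc_exp + tsc_hz && data + tsc_hz > tsc_exp;
}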
58ea6767
HZ
2728static inline void adjust_tsc_offset_guest(struct kvm_vcpu *vcpu,
2729 s64 adjustment)
2730{
56ba77a4 2731 u64 tsc_offset = vcpu->arch.l1_tsc_offset;
326e7425 2732 kvm_vcpu_write_tsc_offset(vcpu, tsc_offset + adjustment);
58ea6767
HZ
2733}
2734
2735static inline void adjust_tsc_offset_host(struct kvm_vcpu *vcpu, s64 adjustment)
2736{
938c8745 2737 if (vcpu->arch.l1_tsc_scaling_ratio != kvm_caps.default_tsc_scaling_ratio)
58ea6767 2738 WARN_ON(adjustment < 0);
62711e5a 2739 adjustment = kvm_scale_tsc((u64) adjustment,
fe3eb504 2740 vcpu->arch.l1_tsc_scaling_ratio);
ea26e4ec 2741 adjust_tsc_offset_guest(vcpu, adjustment);
58ea6767
HZ
2742}
2743
d828199e
MT
2744#ifdef CONFIG_X86_64
2745
a5a1d1c2 2746static u64 read_tsc(void)
d828199e 2747{
a5a1d1c2 2748 u64 ret = (u64)rdtsc_ordered();
03b9730b 2749 u64 last = pvclock_gtod_data.clock.cycle_last;
d828199e
MT
2750
2751 if (likely(ret >= last))
2752 return ret;
2753
2754 /*
2755 * GCC likes to generate cmov here, but this branch is extremely
6a6256f9 2756 * predictable (it's just a function of time and the likely is
d828199e
MT
2757 * very likely) and there's a data dependence, so force GCC
2758 * to generate a branch instead. I don't barrier() because
2759 * we don't actually need a barrier, and if this function
2760 * ever gets inlined it will generate worse code.
2761 */
2762 asm volatile ("");
2763 return last;
2764}
2765
53fafdbb
MT
2766static inline u64 vgettsc(struct pvclock_clock *clock, u64 *tsc_timestamp,
2767 int *mode)
d828199e
MT
2768{
2769 long v;
b0c39dc6
VK
2770 u64 tsc_pg_val;
2771
53fafdbb 2772 switch (clock->vclock_mode) {
b95a8a27 2773 case VDSO_CLOCKMODE_HVCLOCK:
b0c39dc6
VK
2774 tsc_pg_val = hv_read_tsc_page_tsc(hv_get_tsc_page(),
2775 tsc_timestamp);
2776 if (tsc_pg_val != U64_MAX) {
2777 /* TSC page valid */
b95a8a27 2778 *mode = VDSO_CLOCKMODE_HVCLOCK;
53fafdbb
MT
2779 v = (tsc_pg_val - clock->cycle_last) &
2780 clock->mask;
b0c39dc6
VK
2781 } else {
2782 /* TSC page invalid */
b95a8a27 2783 *mode = VDSO_CLOCKMODE_NONE;
b0c39dc6
VK
2784 }
2785 break;
b95a8a27
TG
2786 case VDSO_CLOCKMODE_TSC:
2787 *mode = VDSO_CLOCKMODE_TSC;
b0c39dc6 2788 *tsc_timestamp = read_tsc();
53fafdbb
MT
2789 v = (*tsc_timestamp - clock->cycle_last) &
2790 clock->mask;
b0c39dc6
VK
2791 break;
2792 default:
b95a8a27 2793 *mode = VDSO_CLOCKMODE_NONE;
b0c39dc6 2794 }
d828199e 2795
b95a8a27 2796 if (*mode == VDSO_CLOCKMODE_NONE)
b0c39dc6 2797 *tsc_timestamp = v = 0;
d828199e 2798
53fafdbb 2799 return v * clock->mult;
d828199e
MT
2800}
2801
53fafdbb 2802static int do_monotonic_raw(s64 *t, u64 *tsc_timestamp)
d828199e 2803{
cbcf2dd3 2804 struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
d828199e 2805 unsigned long seq;
d828199e 2806 int mode;
cbcf2dd3 2807 u64 ns;
d828199e 2808
d828199e
MT
2809 do {
2810 seq = read_seqcount_begin(&gtod->seq);
917f9475 2811 ns = gtod->raw_clock.base_cycles;
53fafdbb 2812 ns += vgettsc(&gtod->raw_clock, tsc_timestamp, &mode);
917f9475
PB
2813 ns >>= gtod->raw_clock.shift;
2814 ns += ktime_to_ns(ktime_add(gtod->raw_clock.offset, gtod->offs_boot));
d828199e 2815 } while (unlikely(read_seqcount_retry(&gtod->seq, seq)));
cbcf2dd3 2816 *t = ns;
d828199e
MT
2817
2818 return mode;
2819}
2820
899a31f5 2821static int do_realtime(struct timespec64 *ts, u64 *tsc_timestamp)
55dd00a7
MT
2822{
2823 struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
2824 unsigned long seq;
2825 int mode;
2826 u64 ns;
2827
2828 do {
2829 seq = read_seqcount_begin(&gtod->seq);
55dd00a7 2830 ts->tv_sec = gtod->wall_time_sec;
917f9475 2831 ns = gtod->clock.base_cycles;
53fafdbb 2832 ns += vgettsc(&gtod->clock, tsc_timestamp, &mode);
55dd00a7
MT
2833 ns >>= gtod->clock.shift;
2834 } while (unlikely(read_seqcount_retry(&gtod->seq, seq)));
2835
2836 ts->tv_sec += __iter_div_u64_rem(ns, NSEC_PER_SEC, &ns);
2837 ts->tv_nsec = ns;
2838
2839 return mode;
2840}
2841
b0c39dc6
VK
2842/* returns true if host is using TSC based clocksource */
2843static bool kvm_get_time_and_clockread(s64 *kernel_ns, u64 *tsc_timestamp)
d828199e 2844{
d828199e 2845 /* checked again under seqlock below */
b0c39dc6 2846 if (!gtod_is_based_on_tsc(pvclock_gtod_data.clock.vclock_mode))
d828199e
MT
2847 return false;
2848
53fafdbb 2849 return gtod_is_based_on_tsc(do_monotonic_raw(kernel_ns,
b0c39dc6 2850 tsc_timestamp));
d828199e 2851}
55dd00a7 2852
b0c39dc6 2853/* returns true if host is using TSC based clocksource */
899a31f5 2854static bool kvm_get_walltime_and_clockread(struct timespec64 *ts,
b0c39dc6 2855 u64 *tsc_timestamp)
55dd00a7
MT
2856{
2857 /* checked again under seqlock below */
b0c39dc6 2858 if (!gtod_is_based_on_tsc(pvclock_gtod_data.clock.vclock_mode))
55dd00a7
MT
2859 return false;
2860
b0c39dc6 2861 return gtod_is_based_on_tsc(do_realtime(ts, tsc_timestamp));
55dd00a7 2862}
d828199e
MT
2863#endif
2864
2865/*
2866 *
b48aa97e
MT
2867 * Assuming a stable TSC across physical CPUS, and a stable TSC
2868 * across virtual CPUs, the following condition is possible.
2869 * Each numbered line represents an event visible to both
d828199e
MT
2870 * CPUs at the next numbered event.
2871 *
2872 * "timespecX" represents host monotonic time. "tscX" represents
2873 * RDTSC value.
2874 *
2875 * VCPU0 on CPU0 | VCPU1 on CPU1
2876 *
2877 * 1. read timespec0,tsc0
2878 * 2. | timespec1 = timespec0 + N
2879 * | tsc1 = tsc0 + M
2880 * 3. transition to guest | transition to guest
2881 * 4. ret0 = timespec0 + (rdtsc - tsc0) |
2882 * 5. | ret1 = timespec1 + (rdtsc - tsc1)
2883 * | ret1 = timespec0 + N + (rdtsc - (tsc0 + M))
2884 *
2885 * Since ret0 update is visible to VCPU1 at time 5, to obey monotonicity:
2886 *
2887 * - ret0 < ret1
2888 * - timespec0 + (rdtsc - tsc0) < timespec0 + N + (rdtsc - (tsc0 + M))
2889 * ...
2890 * - 0 < N - M => M < N
2891 *
2892 * That is, when timespec0 != timespec1, M < N. Unfortunately that is not
2893 * always the case (the difference between two distinct xtime instances
2894 * might be smaller than the difference between corresponding TSC reads,
2895 * when updating guest vcpus pvclock areas).
2896 *
2897 * To avoid that problem, do not allow visibility of distinct
2898 * system_timestamp/tsc_timestamp values simultaneously: use a master
2899 * copy of host monotonic time values. Update that master copy
2900 * in lockstep.
2901 *
b48aa97e 2902 * Rely on synchronization of host TSCs and guest TSCs for monotonicity.
d828199e
MT
2903 *
2904 */
2905
2906static void pvclock_update_vm_gtod_copy(struct kvm *kvm)
2907{
2908#ifdef CONFIG_X86_64
2909 struct kvm_arch *ka = &kvm->arch;
2910 int vclock_mode;
b48aa97e
MT
2911 bool host_tsc_clocksource, vcpus_matched;
2912
869b4421 2913 lockdep_assert_held(&kvm->arch.tsc_write_lock);
b48aa97e
MT
2914 vcpus_matched = (ka->nr_vcpus_matched_tsc + 1 ==
2915 atomic_read(&kvm->online_vcpus));
d828199e
MT
2916
2917 /*
2918 * If the host uses TSC clock, then passthrough TSC as stable
2919 * to the guest.
2920 */
b48aa97e 2921 host_tsc_clocksource = kvm_get_time_and_clockread(
d828199e
MT
2922 &ka->master_kernel_ns,
2923 &ka->master_cycle_now);
2924
16a96021 2925 ka->use_master_clock = host_tsc_clocksource && vcpus_matched
a826faf1 2926 && !ka->backwards_tsc_observed
54750f2c 2927 && !ka->boot_vcpu_runs_old_kvmclock;
b48aa97e 2928
d828199e
MT
2929 if (ka->use_master_clock)
2930 atomic_set(&kvm_guest_has_master_clock, 1);
2931
2932 vclock_mode = pvclock_gtod_data.clock.vclock_mode;
b48aa97e
MT
2933 trace_kvm_update_master_clock(ka->use_master_clock, vclock_mode,
2934 vcpus_matched);
d828199e
MT
2935#endif
2936}
2937
6b6fcd28 2938static void kvm_make_mclock_inprogress_request(struct kvm *kvm)
2860c4b1
PB
2939{
2940 kvm_make_all_cpus_request(kvm, KVM_REQ_MCLOCK_INPROGRESS);
2941}
2942
869b4421 2943static void __kvm_start_pvclock_update(struct kvm *kvm)
2e762ff7 2944{
869b4421
PB
2945 raw_spin_lock_irq(&kvm->arch.tsc_write_lock);
2946 write_seqcount_begin(&kvm->arch.pvclock_sc);
2947}
e880c6ea 2948
869b4421
PB
2949static void kvm_start_pvclock_update(struct kvm *kvm)
2950{
2e762ff7 2951 kvm_make_mclock_inprogress_request(kvm);
c2c647f9 2952
2e762ff7 2953 /* no guest entries from this point */
869b4421 2954 __kvm_start_pvclock_update(kvm);
6b6fcd28 2955}
2e762ff7 2956
6b6fcd28
PB
2957static void kvm_end_pvclock_update(struct kvm *kvm)
2958{
2959 struct kvm_arch *ka = &kvm->arch;
2960 struct kvm_vcpu *vcpu;
46808a4c 2961 unsigned long i;
2e762ff7 2962
869b4421
PB
2963 write_seqcount_end(&ka->pvclock_sc);
2964 raw_spin_unlock_irq(&ka->tsc_write_lock);
2e762ff7 2965 kvm_for_each_vcpu(i, vcpu, kvm)
105b21bb 2966 kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
2e762ff7
MT
2967
2968 /* guest entries allowed */
2969 kvm_for_each_vcpu(i, vcpu, kvm)
72875d8a 2970 kvm_clear_request(KVM_REQ_MCLOCK_INPROGRESS, vcpu);
2e762ff7
MT
2971}
2972
6b6fcd28
PB
2973static void kvm_update_masterclock(struct kvm *kvm)
2974{
42dcbe7d 2975 kvm_hv_request_tsc_page_update(kvm);
6b6fcd28
PB
2976 kvm_start_pvclock_update(kvm);
2977 pvclock_update_vm_gtod_copy(kvm);
2978 kvm_end_pvclock_update(kvm);
2e762ff7
MT
2979}
2980
3ebcbd22
AR
2981/*
2982 * Use the kernel's tsc_khz directly if the TSC is constant, otherwise use KVM's
2983 * per-CPU value (which may be zero if a CPU is going offline). Note, tsc_khz
2984 * can change during boot even if the TSC is constant, as it's possible for KVM
2985 * to be loaded before TSC calibration completes. Ideally, KVM would get a
2986 * notification when calibration completes, but practically speaking calibration
2987 * will complete before userspace is alive enough to create VMs.
2988 */
2989static unsigned long get_cpu_tsc_khz(void)
2990{
2991 if (static_cpu_has(X86_FEATURE_CONSTANT_TSC))
2992 return tsc_khz;
2993 else
2994 return __this_cpu_read(cpu_tsc_khz);
2995}
2996
869b4421
PB
2997/* Called within read_seqcount_begin/retry for kvm->pvclock_sc. */
2998static void __get_kvmclock(struct kvm *kvm, struct kvm_clock_data *data)
108b249c 2999{
108b249c 3000 struct kvm_arch *ka = &kvm->arch;
8b953440 3001 struct pvclock_vcpu_time_info hv_clock;
8b953440 3002
e2c2206a
WL
3003 /* both __this_cpu_read() and rdtsc() should be on the same cpu */
3004 get_cpu();
3005
869b4421 3006 data->flags = 0;
3ebcbd22
AR
3007 if (ka->use_master_clock &&
3008 (static_cpu_has(X86_FEATURE_CONSTANT_TSC) || __this_cpu_read(cpu_tsc_khz))) {
c68dc1b5
OU
3009#ifdef CONFIG_X86_64
3010 struct timespec64 ts;
3011
3012 if (kvm_get_walltime_and_clockread(&ts, &data->host_tsc)) {
3013 data->realtime = ts.tv_nsec + NSEC_PER_SEC * ts.tv_sec;
3014 data->flags |= KVM_CLOCK_REALTIME | KVM_CLOCK_HOST_TSC;
3015 } else
3016#endif
3017 data->host_tsc = rdtsc();
3018
869b4421
PB
3019 data->flags |= KVM_CLOCK_TSC_STABLE;
3020 hv_clock.tsc_timestamp = ka->master_cycle_now;
3021 hv_clock.system_time = ka->master_kernel_ns + ka->kvmclock_offset;
3ebcbd22 3022 kvm_get_time_scale(NSEC_PER_SEC, get_cpu_tsc_khz() * 1000LL,
e70b57a6
WL
3023 &hv_clock.tsc_shift,
3024 &hv_clock.tsc_to_system_mul);
c68dc1b5 3025 data->clock = __pvclock_read_cycles(&hv_clock, data->host_tsc);
55c0cefb
OU
3026 } else {
3027 data->clock = get_kvmclock_base_ns() + ka->kvmclock_offset;
3028 }
e2c2206a
WL
3029
3030 put_cpu();
55c0cefb 3031}
e2c2206a 3032
869b4421
PB
3033static void get_kvmclock(struct kvm *kvm, struct kvm_clock_data *data)
3034{
3035 struct kvm_arch *ka = &kvm->arch;
3036 unsigned seq;
3037
3038 do {
3039 seq = read_seqcount_begin(&ka->pvclock_sc);
3040 __get_kvmclock(kvm, data);
3041 } while (read_seqcount_retry(&ka->pvclock_sc, seq));
3042}
3043
55c0cefb
OU
3044u64 get_kvmclock_ns(struct kvm *kvm)
3045{
3046 struct kvm_clock_data data;
3047
55c0cefb
OU
3048 get_kvmclock(kvm, &data);
3049 return data.clock;
108b249c
PB
3050}
3051
916d3608
DW
3052static void kvm_setup_guest_pvclock(struct kvm_vcpu *v,
3053 struct gfn_to_pfn_cache *gpc,
3054 unsigned int offset)
0d6dd2ff
PB
3055{
3056 struct kvm_vcpu_arch *vcpu = &v->arch;
916d3608
DW
3057 struct pvclock_vcpu_time_info *guest_hv_clock;
3058 unsigned long flags;
0d6dd2ff 3059
916d3608 3060 read_lock_irqsave(&gpc->lock, flags);
58f5ee5f 3061 while (!kvm_gpc_check(gpc, offset + sizeof(*guest_hv_clock))) {
916d3608
DW
3062 read_unlock_irqrestore(&gpc->lock, flags);
3063
58f5ee5f 3064 if (kvm_gpc_refresh(gpc, offset + sizeof(*guest_hv_clock)))
916d3608 3065 return;
0d6dd2ff 3066
916d3608
DW
3067 read_lock_irqsave(&gpc->lock, flags);
3068 }
3069
3070 guest_hv_clock = (void *)(gpc->khva + offset);
3071
3072 /*
3073 * This VCPU is paused, but it's legal for a guest to read another
0d6dd2ff
PB
3074 * VCPU's kvmclock, so we really have to follow the specification where
3075 * it says that version is odd if data is being modified, and even after
3076 * it is consistent.
0d6dd2ff 3077 */
0d6dd2ff 3078
916d3608 3079 guest_hv_clock->version = vcpu->hv_clock.version = (guest_hv_clock->version + 1) | 1;
0d6dd2ff
PB
3080 smp_wmb();
3081
3082 /* retain PVCLOCK_GUEST_STOPPED if set in guest copy */
916d3608 3083 vcpu->hv_clock.flags |= (guest_hv_clock->flags & PVCLOCK_GUEST_STOPPED);
0d6dd2ff
PB
3084
3085 if (vcpu->pvclock_set_guest_stopped_request) {
3086 vcpu->hv_clock.flags |= PVCLOCK_GUEST_STOPPED;
3087 vcpu->pvclock_set_guest_stopped_request = false;
3088 }
3089
916d3608
DW
3090 memcpy(guest_hv_clock, &vcpu->hv_clock, sizeof(*guest_hv_clock));
3091 smp_wmb();
0d6dd2ff 3092
916d3608 3093 guest_hv_clock->version = ++vcpu->hv_clock.version;
0d6dd2ff 3094
916d3608
DW
3095 mark_page_dirty_in_slot(v->kvm, gpc->memslot, gpc->gpa >> PAGE_SHIFT);
3096 read_unlock_irqrestore(&gpc->lock, flags);
0d6dd2ff 3097
916d3608 3098 trace_kvm_pvclock_update(v->vcpu_id, &vcpu->hv_clock);
0d6dd2ff
PB
3099}
3100
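The odd/even version dance above is the writer half of a seqcount-style protocol: a guest-side reader retries until it observes the same even version before and after reading the payload. A minimal reader sketch over a trimmed-down, hypothetical time-info struct:

#include <stdint.h>

struct pv_time_info {
	uint32_t version;
	uint32_t pad;
	uint64_t tsc_timestamp;
	uint64_t system_time;
	/* tsc_to_system_mul, tsc_shift, flags omitted for brevity */
};

static uint64_t read_system_time(const volatile struct pv_time_info *ti)
{
	uint32_t version;
	uint64_t system_time;

	do {
		version = ti->version;
		__atomic_thread_fence(__ATOMIC_ACQUIRE);	/* pairs with writer's smp_wmb() */
		system_time = ti->system_time;
		__atomic_thread_fence(__ATOMIC_ACQUIRE);
		/* Retry while a write is in flight (odd) or raced with one. */
	} while ((version & 1) || version != ti->version);

	return system_time;
}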
34c238a1 3101static int kvm_guest_time_update(struct kvm_vcpu *v)
18068523 3102{
78db6a50 3103 unsigned long flags, tgt_tsc_khz;
869b4421 3104 unsigned seq;
18068523 3105 struct kvm_vcpu_arch *vcpu = &v->arch;
d828199e 3106 struct kvm_arch *ka = &v->kvm->arch;
f25e656d 3107 s64 kernel_ns;
d828199e 3108 u64 tsc_timestamp, host_tsc;
51d59c6b 3109 u8 pvclock_flags;
d828199e
MT
3110 bool use_master_clock;
3111
3112 kernel_ns = 0;
3113 host_tsc = 0;
18068523 3114
d828199e
MT
3115 /*
3116 * If the host uses TSC clock, then passthrough TSC as stable
3117 * to the guest.
3118 */
869b4421
PB
3119 do {
3120 seq = read_seqcount_begin(&ka->pvclock_sc);
3121 use_master_clock = ka->use_master_clock;
3122 if (use_master_clock) {
3123 host_tsc = ka->master_cycle_now;
3124 kernel_ns = ka->master_kernel_ns;
3125 }
3126 } while (read_seqcount_retry(&ka->pvclock_sc, seq));
c09664bb
MT
3127
3128 /* Keep irq disabled to prevent changes to the clock */
3129 local_irq_save(flags);
3ebcbd22 3130 tgt_tsc_khz = get_cpu_tsc_khz();
78db6a50 3131 if (unlikely(tgt_tsc_khz == 0)) {
c09664bb
MT
3132 local_irq_restore(flags);
3133 kvm_make_request(KVM_REQ_CLOCK_UPDATE, v);
3134 return 1;
3135 }
d828199e 3136 if (!use_master_clock) {
4ea1636b 3137 host_tsc = rdtsc();
8171cd68 3138 kernel_ns = get_kvmclock_base_ns();
d828199e
MT
3139 }
3140
4ba76538 3141 tsc_timestamp = kvm_read_l1_tsc(v, host_tsc);
d828199e 3142
c285545f
ZA
3143 /*
3144 * We may have to catch up the TSC to match elapsed wall clock
3145 * time for two reasons, even if kvmclock is used.
3146 * 1) CPU could have been running below the maximum TSC rate
3147 * 2) Broken TSC compensation resets the base at each VCPU
3148 * entry to avoid unknown leaps of TSC even when running
3149 * again on the same CPU. This may cause apparent elapsed
3150 * time to disappear, and the guest to stand still or run
3151 * very slowly.
3152 */
3153 if (vcpu->tsc_catchup) {
3154 u64 tsc = compute_guest_tsc(v, kernel_ns);
3155 if (tsc > tsc_timestamp) {
f1e2b260 3156 adjust_tsc_offset_guest(v, tsc - tsc_timestamp);
c285545f
ZA
3157 tsc_timestamp = tsc;
3158 }
50d0a0f9
GH
3159 }
3160
18068523
GOC
3161 local_irq_restore(flags);
3162
0d6dd2ff 3163 /* With all the info we got, fill in the values */
18068523 3164
938c8745 3165 if (kvm_caps.has_tsc_control)
62711e5a 3166 tgt_tsc_khz = kvm_scale_tsc(tgt_tsc_khz,
fe3eb504 3167 v->arch.l1_tsc_scaling_ratio);
78db6a50
PB
3168
3169 if (unlikely(vcpu->hw_tsc_khz != tgt_tsc_khz)) {
3ae13faa 3170 kvm_get_time_scale(NSEC_PER_SEC, tgt_tsc_khz * 1000LL,
5f4e3f88
ZA
3171 &vcpu->hv_clock.tsc_shift,
3172 &vcpu->hv_clock.tsc_to_system_mul);
78db6a50 3173 vcpu->hw_tsc_khz = tgt_tsc_khz;
f422f853 3174 kvm_xen_update_tsc_info(v);
8cfdc000
ZA
3175 }
3176
1d5f066e 3177 vcpu->hv_clock.tsc_timestamp = tsc_timestamp;
759379dd 3178 vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset;
28e4639a 3179 vcpu->last_guest_tsc = tsc_timestamp;
51d59c6b 3180
d828199e 3181 /* If the host uses TSC clocksource, then it is stable */
0d6dd2ff 3182 pvclock_flags = 0;
d828199e
MT
3183 if (use_master_clock)
3184 pvclock_flags |= PVCLOCK_TSC_STABLE_BIT;
3185
78c0337a
MT
3186 vcpu->hv_clock.flags = pvclock_flags;
3187
916d3608
DW
3188 if (vcpu->pv_time.active)
3189 kvm_setup_guest_pvclock(v, &vcpu->pv_time, 0);
7caf9571
DW
3190 if (vcpu->xen.vcpu_info_cache.active)
3191 kvm_setup_guest_pvclock(v, &vcpu->xen.vcpu_info_cache,
3192 offsetof(struct compat_vcpu_info, time));
69d413cf
DW
3193 if (vcpu->xen.vcpu_time_info_cache.active)
3194 kvm_setup_guest_pvclock(v, &vcpu->xen.vcpu_time_info_cache, 0);
42dcbe7d 3195 kvm_hv_setup_tsc_page(v->kvm, &vcpu->hv_clock);
8cfdc000 3196 return 0;
c8076604
GH
3197}
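/*
 * Illustrative sketch (not part of x86.c): how a guest consumes the
 * tsc_timestamp/system_time/tsc_to_system_mul/tsc_shift fields filled in by
 * kvm_guest_time_update() above, following the pvclock convention of a shift
 * followed by a 32.32 fixed-point multiply. Function and parameter names are
 * local to this sketch; a real guest uses the pvclock ABI structures.
 */
#include <stdint.h>

static uint64_t scale_tsc_delta(uint64_t delta, uint32_t mul_frac, int8_t shift)
{
	if (shift >= 0)
		delta <<= shift;
	else
		delta >>= -shift;

	/* (delta * mul_frac) >> 32, done in 128 bits to avoid overflow. */
	return (uint64_t)(((unsigned __int128)delta * mul_frac) >> 32);
}

static uint64_t guest_clock_ns(uint64_t tsc, uint64_t tsc_timestamp,
			       uint64_t system_time, uint32_t mul_frac,
			       int8_t shift)
{
	return system_time + scale_tsc_delta(tsc - tsc_timestamp, mul_frac, shift);
}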
3198
0061d53d
MT
3199/*
3200 * kvmclock updates which are isolated to a given vcpu, such as
3201 * vcpu->cpu migration, should not allow system_timestamp from
3202 * the rest of the vcpus to remain static. Otherwise ntp frequency
3203 * correction applies to one vcpu's system_timestamp but not
3204 * the others.
3205 *
3206 * So in those cases, request a kvmclock update for all vcpus.
7e44e449
AJ
3207 * We need to rate-limit these requests though, as they can
3208 * considerably slow guests that have a large number of vcpus.
3209 * The time for a remote vcpu to update its kvmclock is bound
3210 * by the delay we use to rate-limit the updates.
0061d53d
MT
3211 */
3212
7e44e449
AJ
3213#define KVMCLOCK_UPDATE_DELAY msecs_to_jiffies(100)
3214
3215static void kvmclock_update_fn(struct work_struct *work)
0061d53d 3216{
46808a4c 3217 unsigned long i;
7e44e449
AJ
3218 struct delayed_work *dwork = to_delayed_work(work);
3219 struct kvm_arch *ka = container_of(dwork, struct kvm_arch,
3220 kvmclock_update_work);
3221 struct kvm *kvm = container_of(ka, struct kvm, arch);
0061d53d
MT
3222 struct kvm_vcpu *vcpu;
3223
3224 kvm_for_each_vcpu(i, vcpu, kvm) {
105b21bb 3225 kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
0061d53d
MT
3226 kvm_vcpu_kick(vcpu);
3227 }
3228}
3229
7e44e449
AJ
3230static void kvm_gen_kvmclock_update(struct kvm_vcpu *v)
3231{
3232 struct kvm *kvm = v->kvm;
3233
105b21bb 3234 kvm_make_request(KVM_REQ_CLOCK_UPDATE, v);
7e44e449
AJ
3235 schedule_delayed_work(&kvm->arch.kvmclock_update_work,
3236 KVMCLOCK_UPDATE_DELAY);
3237}
3238
332967a3
AJ
3239#define KVMCLOCK_SYNC_PERIOD (300 * HZ)
3240
3241static void kvmclock_sync_fn(struct work_struct *work)
3242{
3243 struct delayed_work *dwork = to_delayed_work(work);
3244 struct kvm_arch *ka = container_of(dwork, struct kvm_arch,
3245 kvmclock_sync_work);
3246 struct kvm *kvm = container_of(ka, struct kvm, arch);
3247
630994b3
MT
3248 if (!kvmclock_periodic_sync)
3249 return;
3250
332967a3
AJ
3251 schedule_delayed_work(&kvm->arch.kvmclock_update_work, 0);
3252 schedule_delayed_work(&kvm->arch.kvmclock_sync_work,
3253 KVMCLOCK_SYNC_PERIOD);
3254}
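/*
 * Illustrative sketch (not part of x86.c): the self-rearming delayed-work
 * pattern used by kvmclock_sync_fn() above, reduced to a standalone module.
 * All names here (periodic_work, PERIOD, periodic_fn) are invented for the
 * sketch.
 */
#include <linux/module.h>
#include <linux/workqueue.h>
#include <linux/jiffies.h>

#define PERIOD (300 * HZ)

static struct delayed_work periodic_work;

static void periodic_fn(struct work_struct *work)
{
	pr_info("periodic tick\n");
	/* Re-queue ourselves, just as kvmclock_sync_fn() does. */
	schedule_delayed_work(&periodic_work, PERIOD);
}

static int __init periodic_init(void)
{
	INIT_DELAYED_WORK(&periodic_work, periodic_fn);
	schedule_delayed_work(&periodic_work, PERIOD);
	return 0;
}

static void __exit periodic_exit(void)
{
	cancel_delayed_work_sync(&periodic_work);
}

module_init(periodic_init);
module_exit(periodic_exit);
MODULE_LICENSE("GPL");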
3255
281b5278
JW
3256/* These helpers are safe iff @msr is known to be an MCx bank MSR. */
3257static bool is_mci_control_msr(u32 msr)
3258{
3259 return (msr & 3) == 0;
3260}
3261static bool is_mci_status_msr(u32 msr)
3262{
3263 return (msr & 3) == 1;
3264}
3265
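/*
 * Illustrative sketch (not part of x86.c): why (msr & 3) identifies the
 * register type in the helpers above. The architectural MC banks expose four
 * MSRs per bank starting at MSR_IA32_MC0_CTL (0x400): CTL, STATUS, ADDR,
 * MISC. The decoder below is a standalone demo of that layout.
 */
#include <stdio.h>

#define MC0_CTL 0x400u

static void decode_mci_msr(unsigned int msr)
{
	static const char * const reg[] = { "CTL", "STATUS", "ADDR", "MISC" };
	unsigned int bank = (msr - MC0_CTL) / 4;

	printf("MSR 0x%x -> MC%u_%s\n", msr, bank, reg[msr & 3]);
}

int main(void)
{
	decode_mci_msr(0x401);	/* MC0_STATUS */
	decode_mci_msr(0x410);	/* MC4_CTL */
	return 0;
}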
191c8137
BP
3266/*
3267 * On AMD, HWCR[McStatusWrEn] controls whether setting MCi_STATUS results in #GP.
3268 */
3269static bool can_set_mci_status(struct kvm_vcpu *vcpu)
3270{
3271 /* McStatusWrEn enabled? */
23493d0a 3272 if (guest_cpuid_is_amd_or_hygon(vcpu))
191c8137
BP
3273 return !!(vcpu->arch.msr_hwcr & BIT_ULL(18));
3274
3275 return false;
3276}
3277
9ffd986c 3278static int set_msr_mce(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
15c4a640 3279{
890ca9ae
HY
3280 u64 mcg_cap = vcpu->arch.mcg_cap;
3281 unsigned bank_num = mcg_cap & 0xff;
9ffd986c
WL
3282 u32 msr = msr_info->index;
3283 u64 data = msr_info->data;
281b5278 3284 u32 offset, last_msr;
890ca9ae 3285
15c4a640 3286 switch (msr) {
15c4a640 3287 case MSR_IA32_MCG_STATUS:
890ca9ae 3288 vcpu->arch.mcg_status = data;
15c4a640 3289 break;
c7ac679c 3290 case MSR_IA32_MCG_CTL:
44883f01
PB
3291 if (!(mcg_cap & MCG_CTL_P) &&
3292 (data || !msr_info->host_initiated))
890ca9ae
HY
3293 return 1;
3294 if (data != 0 && data != ~(u64)0)
44883f01 3295 return 1;
890ca9ae
HY
3296 vcpu->arch.mcg_ctl = data;
3297 break;
281b5278
JW
3298 case MSR_IA32_MC0_CTL2 ... MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) - 1:
3299 last_msr = MSR_IA32_MCx_CTL2(bank_num) - 1;
3300 if (msr > last_msr)
3301 return 1;
191c8137 3302
281b5278
JW
3303 if (!(mcg_cap & MCG_CMCI_P) && (data || !msr_info->host_initiated))
3304 return 1;
3305 /* An attempt to write a 1 to a reserved bit raises #GP */
3306 if (data & ~(MCI_CTL2_CMCI_EN | MCI_CTL2_CMCI_THRESHOLD_MASK))
3307 return 1;
3308 offset = array_index_nospec(msr - MSR_IA32_MC0_CTL2,
3309 last_msr + 1 - MSR_IA32_MC0_CTL2);
3310 vcpu->arch.mci_ctl2_banks[offset] = data;
3311 break;
3312 case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1:
3313 last_msr = MSR_IA32_MCx_CTL(bank_num) - 1;
3314 if (msr > last_msr)
3315 return 1;
3316
3317 /*
3318 * Only 0 or all 1s can be written to IA32_MCi_CTL, all other
3319 * values are architecturally undefined. But, some Linux
 3320 * kernels clear bit 10 in bank 4 to work around a BIOS/GART TLB
 3321 * issue on AMD K8s, so allow bit 10 to be clear when setting all
3322 * other bits in order to avoid an uncaught #GP in the guest.
f5223a33
SC
3323 *
3324 * UNIXWARE clears bit 0 of MC1_CTL to ignore correctable,
3325 * single-bit ECC data errors.
281b5278
JW
3326 */
3327 if (is_mci_control_msr(msr) &&
3328 data != 0 && (data | (1 << 10) | 1) != ~(u64)0)
3329 return 1;
191c8137 3330
281b5278
JW
3331 /*
3332 * All CPUs allow writing 0 to MCi_STATUS MSRs to clear the MSR.
3333 * AMD-based CPUs allow non-zero values, but if and only if
3334 * HWCR[McStatusWrEn] is set.
3335 */
3336 if (!msr_info->host_initiated && is_mci_status_msr(msr) &&
3337 data != 0 && !can_set_mci_status(vcpu))
3338 return 1;
3339
3340 offset = array_index_nospec(msr - MSR_IA32_MC0_CTL,
3341 last_msr + 1 - MSR_IA32_MC0_CTL);
3342 vcpu->arch.mce_banks[offset] = data;
3343 break;
3344 default:
890ca9ae
HY
3345 return 1;
3346 }
3347 return 0;
3348}
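/*
 * Illustrative sketch (not part of x86.c): the MCi_CTL value check applied
 * above to control-bank MSRs, as a standalone predicate. It accepts 0,
 * all-ones, and all-ones with bit 10 and/or bit 0 clear (the K8 GART and
 * UnixWare cases described in the comment).
 */
#include <stdbool.h>
#include <stdint.h>

static bool mci_ctl_value_ok(uint64_t data)
{
	return data == 0 || (data | (1ull << 10) | 1ull) == ~0ull;
}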
3349
2635b5c4
VK
3350static inline bool kvm_pv_async_pf_enabled(struct kvm_vcpu *vcpu)
3351{
3352 u64 mask = KVM_ASYNC_PF_ENABLED | KVM_ASYNC_PF_DELIVERY_AS_INT;
3353
3354 return (vcpu->arch.apf.msr_en_val & mask) == mask;
3355}
3356
344d9588
GN
3357static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data)
3358{
3359 gpa_t gpa = data & ~0x3f;
3360
2635b5c4
VK
 3361 /* Bits 4:5 are reserved; should be zero */
3362 if (data & 0x30)
344d9588
GN
3363 return 1;
3364
66570e96
OU
3365 if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_VMEXIT) &&
3366 (data & KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT))
3367 return 1;
3368
3369 if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT) &&
3370 (data & KVM_ASYNC_PF_DELIVERY_AS_INT))
3371 return 1;
3372
9d3c447c 3373 if (!lapic_in_kernel(vcpu))
d831de17 3374 return data ? 1 : 0;
9d3c447c 3375
2635b5c4 3376 vcpu->arch.apf.msr_en_val = data;
344d9588 3377
2635b5c4 3378 if (!kvm_pv_async_pf_enabled(vcpu)) {
344d9588
GN
3379 kvm_clear_async_pf_completion_queue(vcpu);
3380 kvm_async_pf_hash_reset(vcpu);
3381 return 0;
3382 }
3383
4e335d9e 3384 if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.apf.data, gpa,
68fd66f1 3385 sizeof(u64)))
344d9588
GN
3386 return 1;
3387
6adba527 3388 vcpu->arch.apf.send_user_only = !(data & KVM_ASYNC_PF_SEND_ALWAYS);
52a5c155 3389 vcpu->arch.apf.delivery_as_pf_vmexit = data & KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT;
2635b5c4 3390
344d9588 3391 kvm_async_pf_wakeup_all(vcpu);
2635b5c4
VK
3392
3393 return 0;
3394}
3395
3396static int kvm_pv_enable_async_pf_int(struct kvm_vcpu *vcpu, u64 data)
3397{
3398 /* Bits 8-63 are reserved */
3399 if (data >> 8)
3400 return 1;
3401
3402 if (!lapic_in_kernel(vcpu))
3403 return 1;
3404
3405 vcpu->arch.apf.msr_int_val = data;
3406
3407 vcpu->arch.apf.vec = data & KVM_ASYNC_PF_VEC_MASK;
3408
344d9588
GN
3409 return 0;
3410}
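/*
 * Illustrative sketch (not part of x86.c): the shape of the
 * MSR_KVM_ASYNC_PF_EN value as validated above -- enable/delivery flags in
 * the low bits, bits 4:5 reserved, and the 64-byte-aligned GPA of the async
 * PF data area in the remaining bits. The flag constants below mirror the
 * uapi KVM_ASYNC_PF_* definitions and are repeated here only for the sketch.
 */
#include <stdbool.h>
#include <stdint.h>

#define APF_ENABLED		(1ull << 0)
#define APF_SEND_ALWAYS		(1ull << 1)
#define APF_DELIVERY_AS_VMEXIT	(1ull << 2)
#define APF_DELIVERY_AS_INT	(1ull << 3)
#define APF_RESERVED		(3ull << 4)

static bool async_pf_en_valid(uint64_t data)
{
	return !(data & APF_RESERVED);
}

static uint64_t async_pf_area_gpa(uint64_t data)
{
	return data & ~0x3full;
}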
3411
12f9a48f
GC
3412static void kvmclock_reset(struct kvm_vcpu *vcpu)
3413{
8c82a0b3 3414 kvm_gpc_deactivate(&vcpu->arch.pv_time);
49dedf0d 3415 vcpu->arch.time = 0;
12f9a48f
GC
3416}
3417
7780938c 3418static void kvm_vcpu_flush_tlb_all(struct kvm_vcpu *vcpu)
f38a7b75
WL
3419{
3420 ++vcpu->stat.tlb_flush;
e27bc044 3421 static_call(kvm_x86_flush_tlb_all)(vcpu);
e94cea09
SC
3422
3423 /* Flushing all ASIDs flushes the current ASID... */
3424 kvm_clear_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
f38a7b75
WL
3425}
3426
0baedd79
VK
3427static void kvm_vcpu_flush_tlb_guest(struct kvm_vcpu *vcpu)
3428{
3429 ++vcpu->stat.tlb_flush;
b53e84ee
LJ
3430
3431 if (!tdp_enabled) {
61b05a9f 3432 /*
b53e84ee
LJ
3433 * A TLB flush on behalf of the guest is equivalent to
3434 * INVPCID(all), toggling CR4.PGE, etc., which requires
61b05a9f
LJ
3435 * a forced sync of the shadow page tables. Ensure all the
3436 * roots are synced and the guest TLB in hardware is clean.
b53e84ee 3437 */
61b05a9f
LJ
3438 kvm_mmu_sync_roots(vcpu);
3439 kvm_mmu_sync_prev_roots(vcpu);
b53e84ee
LJ
3440 }
3441
e27bc044 3442 static_call(kvm_x86_flush_tlb_guest)(vcpu);
adc43caa
VK
3443
3444 /*
3445 * Flushing all "guest" TLB is always a superset of Hyper-V's fine
3446 * grained flushing.
3447 */
0823570f 3448 kvm_hv_vcpu_purge_flush_tlb(vcpu);
0baedd79
VK
3449}
3450
40e5f908
SC
3451
3452static inline void kvm_vcpu_flush_tlb_current(struct kvm_vcpu *vcpu)
3453{
3454 ++vcpu->stat.tlb_flush;
e27bc044 3455 static_call(kvm_x86_flush_tlb_current)(vcpu);
40e5f908
SC
3456}
3457
3458/*
3459 * Service "local" TLB flush requests, which are specific to the current MMU
3460 * context. In addition to the generic event handling in vcpu_enter_guest(),
3461 * TLB flushes that are targeted at an MMU context also need to be serviced
 3462 * prior to nested VM-Enter/VM-Exit.
3463 */
3464void kvm_service_local_tlb_flush_requests(struct kvm_vcpu *vcpu)
3465{
3466 if (kvm_check_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu))
3467 kvm_vcpu_flush_tlb_current(vcpu);
3468
3469 if (kvm_check_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu))
3470 kvm_vcpu_flush_tlb_guest(vcpu);
3471}
3472EXPORT_SYMBOL_GPL(kvm_service_local_tlb_flush_requests);
3473
c9aaa895
GC
3474static void record_steal_time(struct kvm_vcpu *vcpu)
3475{
7e2175eb
DW
3476 struct gfn_to_hva_cache *ghc = &vcpu->arch.st.cache;
3477 struct kvm_steal_time __user *st;
3478 struct kvm_memslots *slots;
901d3765 3479 gpa_t gpa = vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS;
7e2175eb
DW
3480 u64 steal;
3481 u32 version;
b0431382 3482
30b5c851
DW
3483 if (kvm_xen_msr_enabled(vcpu->kvm)) {
3484 kvm_xen_runstate_set_running(vcpu);
3485 return;
3486 }
3487
c9aaa895
GC
3488 if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
3489 return;
3490
7e2175eb 3491 if (WARN_ON_ONCE(current->mm != vcpu->kvm->mm))
c9aaa895
GC
3492 return;
3493
7e2175eb
DW
3494 slots = kvm_memslots(vcpu->kvm);
3495
3496 if (unlikely(slots->generation != ghc->generation ||
901d3765 3497 gpa != ghc->gpa ||
7e2175eb 3498 kvm_is_error_hva(ghc->hva) || !ghc->memslot)) {
7e2175eb
DW
3499 /* We rely on the fact that it fits in a single page. */
3500 BUILD_BUG_ON((sizeof(*st) - 1) & KVM_STEAL_VALID_BITS);
3501
901d3765 3502 if (kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, gpa, sizeof(*st)) ||
7e2175eb
DW
3503 kvm_is_error_hva(ghc->hva) || !ghc->memslot)
3504 return;
3505 }
3506
3507 st = (struct kvm_steal_time __user *)ghc->hva;
f38a7b75
WL
3508 /*
3509 * Doing a TLB flush here, on the guest's behalf, can avoid
3510 * expensive IPIs.
3511 */
66570e96 3512 if (guest_pv_has(vcpu, KVM_FEATURE_PV_TLB_FLUSH)) {
7e2175eb
DW
3513 u8 st_preempted = 0;
3514 int err = -EFAULT;
3515
3e067fd8
PB
3516 if (!user_access_begin(st, sizeof(*st)))
3517 return;
3518
7e2175eb
DW
3519 asm volatile("1: xchgb %0, %2\n"
3520 "xor %1, %1\n"
3521 "2:\n"
3522 _ASM_EXTABLE_UA(1b, 2b)
964b7aa0
DW
3523 : "+q" (st_preempted),
3524 "+&r" (err),
3525 "+m" (st->preempted));
7e2175eb
DW
3526 if (err)
3527 goto out;
3528
3529 user_access_end();
3530
3531 vcpu->arch.st.preempted = 0;
af3511ff 3532
66570e96 3533 trace_kvm_pv_tlb_flush(vcpu->vcpu_id,
af3511ff
LJ
3534 st_preempted & KVM_VCPU_FLUSH_TLB);
3535 if (st_preempted & KVM_VCPU_FLUSH_TLB)
66570e96 3536 kvm_vcpu_flush_tlb_guest(vcpu);
7e2175eb
DW
3537
3538 if (!user_access_begin(st, sizeof(*st)))
3539 goto dirty;
1eff0ada 3540 } else {
3e067fd8
PB
3541 if (!user_access_begin(st, sizeof(*st)))
3542 return;
3543
7e2175eb
DW
3544 unsafe_put_user(0, &st->preempted, out);
3545 vcpu->arch.st.preempted = 0;
66570e96 3546 }
0b9f6c46 3547
7e2175eb
DW
3548 unsafe_get_user(version, &st->version, out);
3549 if (version & 1)
3550 version += 1; /* first time write, random junk */
35f3fae1 3551
7e2175eb
DW
3552 version += 1;
3553 unsafe_put_user(version, &st->version, out);
35f3fae1
WL
3554
3555 smp_wmb();
3556
7e2175eb
DW
3557 unsafe_get_user(steal, &st->steal, out);
3558 steal += current->sched_info.run_delay -
c54cdf14
LC
3559 vcpu->arch.st.last_steal;
3560 vcpu->arch.st.last_steal = current->sched_info.run_delay;
7e2175eb 3561 unsafe_put_user(steal, &st->steal, out);
35f3fae1 3562
7e2175eb
DW
3563 version += 1;
3564 unsafe_put_user(version, &st->version, out);
35f3fae1 3565
7e2175eb
DW
3566 out:
3567 user_access_end();
3568 dirty:
3569 mark_page_dirty_in_slot(vcpu->kvm, ghc->memslot, gpa_to_gfn(ghc->gpa));
c9aaa895
GC
3570}
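/*
 * Illustrative sketch (not part of x86.c): the guest-side counterpart of the
 * version dance in record_steal_time() above. The host bumps ->version to an
 * odd value before updating the record and to an even value afterwards, so a
 * guest can read a consistent snapshot as below. The structure is reduced to
 * the two fields used here and does not match the real kvm_steal_time layout.
 */
#include <stdint.h>

struct steal_time_view {
	volatile uint32_t version;
	volatile uint64_t steal;
};

static uint64_t read_steal_ns(const struct steal_time_view *st)
{
	uint32_t v;
	uint64_t steal;

	do {
		v = st->version;
		__sync_synchronize();		/* pair with the host's smp_wmb() */
		steal = st->steal;
		__sync_synchronize();
	} while ((v & 1) || v != st->version);

	return steal;
}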
3571
2de154f5
SC
3572static bool kvm_is_msr_to_save(u32 msr_index)
3573{
3574 unsigned int i;
3575
3576 for (i = 0; i < num_msrs_to_save; i++) {
3577 if (msrs_to_save[i] == msr_index)
3578 return true;
3579 }
3580
3581 return false;
3582}
3583
8fe8ab46 3584int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
15c4a640 3585{
8fe8ab46
WA
3586 u32 msr = msr_info->index;
3587 u64 data = msr_info->data;
5753785f 3588
1232f8e6 3589 if (msr && msr == vcpu->kvm->arch.xen_hvm_config.msr)
23200b7a 3590 return kvm_xen_write_hypercall_page(vcpu, data);
1232f8e6 3591
15c4a640 3592 switch (msr) {
2e32b719 3593 case MSR_AMD64_NB_CFG:
2e32b719
BP
3594 case MSR_IA32_UCODE_WRITE:
3595 case MSR_VM_HSAVE_PA:
3596 case MSR_AMD64_PATCH_LOADER:
3597 case MSR_AMD64_BU_CFG2:
405a353a 3598 case MSR_AMD64_DC_CFG:
0e1b869f 3599 case MSR_F15H_EX_CFG:
2e32b719
BP
3600 break;
3601
518e7b94
WL
3602 case MSR_IA32_UCODE_REV:
3603 if (msr_info->host_initiated)
3604 vcpu->arch.microcode_version = data;
3605 break;
0cf9135b
SC
3606 case MSR_IA32_ARCH_CAPABILITIES:
3607 if (!msr_info->host_initiated)
3608 return 1;
3609 vcpu->arch.arch_capabilities = data;
3610 break;
686e0f03 3611 case MSR_IA32_PERF_CAPABILITIES:
d574c539
VK
3612 if (!msr_info->host_initiated)
3613 return 1;
686e0f03 3614 if (data & ~kvm_caps.supported_perf_cap)
d574c539
VK
3615 return 1;
3616
3617 vcpu->arch.perf_capabilities = data;
17a024a8 3618 kvm_pmu_refresh(vcpu);
d574c539 3619 return 0;
903358c7
SC
3620 case MSR_IA32_PRED_CMD:
3621 if (!msr_info->host_initiated && !guest_has_pred_cmd_msr(vcpu))
3622 return 1;
3623
3624 if (!boot_cpu_has(X86_FEATURE_IBPB) || (data & ~PRED_CMD_IBPB))
3625 return 1;
3626 if (!data)
3627 break;
3628
3629 wrmsrl(MSR_IA32_PRED_CMD, PRED_CMD_IBPB);
3630 break;
da3db168
SC
3631 case MSR_IA32_FLUSH_CMD:
3632 if (!msr_info->host_initiated &&
3633 !guest_cpuid_has(vcpu, X86_FEATURE_FLUSH_L1D))
3634 return 1;
3635
3636 if (!boot_cpu_has(X86_FEATURE_FLUSH_L1D) || (data & ~L1D_FLUSH))
3637 return 1;
3638 if (!data)
3639 break;
3640
3641 wrmsrl(MSR_IA32_FLUSH_CMD, L1D_FLUSH);
3642 break;
15c4a640 3643 case MSR_EFER:
11988499 3644 return set_efer(vcpu, msr_info);
8f1589d9
AP
3645 case MSR_K7_HWCR:
3646 data &= ~(u64)0x40; /* ignore flush filter disable */
82494028 3647 data &= ~(u64)0x100; /* ignore ignne emulation enable */
a223c313 3648 data &= ~(u64)0x8; /* ignore TLB cache disable */
191c8137
BP
3649
3650 /* Handle McStatusWrEn */
3651 if (data == BIT_ULL(18)) {
3652 vcpu->arch.msr_hwcr = data;
3653 } else if (data != 0) {
e76ae527 3654 kvm_pr_unimpl_wrmsr(vcpu, msr, data);
8f1589d9
AP
3655 return 1;
3656 }
15c4a640 3657 break;
f7c6d140
AP
3658 case MSR_FAM10H_MMIO_CONF_BASE:
3659 if (data != 0) {
e76ae527 3660 kvm_pr_unimpl_wrmsr(vcpu, msr, data);
f7c6d140
AP
3661 return 1;
3662 }
15c4a640 3663 break;
281b5278
JW
3664 case 0x200 ... MSR_IA32_MC0_CTL2 - 1:
3665 case MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) ... 0x2ff:
ff53604b 3666 return kvm_mtrr_set_msr(vcpu, msr, data);
15c4a640 3667 case MSR_IA32_APICBASE:
58cb628d 3668 return kvm_set_apic_base(vcpu, msr_info);
bf10bd0b 3669 case APIC_BASE_MSR ... APIC_BASE_MSR + 0xff:
0105d1a5 3670 return kvm_x2apic_msr_write(vcpu, msr, data);
09141ec0 3671 case MSR_IA32_TSC_DEADLINE:
a3e06bbe
LJ
3672 kvm_set_lapic_tscdeadline_msr(vcpu, data);
3673 break;
ba904635 3674 case MSR_IA32_TSC_ADJUST:
d6321d49 3675 if (guest_cpuid_has(vcpu, X86_FEATURE_TSC_ADJUST)) {
ba904635 3676 if (!msr_info->host_initiated) {
d913b904 3677 s64 adj = data - vcpu->arch.ia32_tsc_adjust_msr;
d7add054 3678 adjust_tsc_offset_guest(vcpu, adj);
d9130a2d
ZD
 3679 /* Before returning to the guest, tsc_timestamp must be adjusted
 3680 * as well, otherwise the guest's percpu pvclock time could jump.
3681 */
3682 kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
ba904635
WA
3683 }
3684 vcpu->arch.ia32_tsc_adjust_msr = data;
3685 }
3686 break;
bef6ecca
LX
3687 case MSR_IA32_MISC_ENABLE: {
3688 u64 old_val = vcpu->arch.ia32_misc_enable_msr;
d1055173 3689
9fc22296
SC
3690 if (!msr_info->host_initiated) {
3691 /* RO bits */
3692 if ((old_val ^ data) & MSR_IA32_MISC_ENABLE_PMU_RO_MASK)
3693 return 1;
3694
3695 /* R bits, i.e. writes are ignored, but don't fault. */
3696 data = data & ~MSR_IA32_MISC_ENABLE_EMON;
3697 data |= old_val & MSR_IA32_MISC_ENABLE_EMON;
3698 }
bef6ecca 3699
511a8556 3700 if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT) &&
bef6ecca 3701 ((old_val ^ data) & MSR_IA32_MISC_ENABLE_MWAIT)) {
511a8556
WL
3702 if (!guest_cpuid_has(vcpu, X86_FEATURE_XMM3))
3703 return 1;
3704 vcpu->arch.ia32_misc_enable_msr = data;
aedbaf4f 3705 kvm_update_cpuid_runtime(vcpu);
511a8556
WL
3706 } else {
3707 vcpu->arch.ia32_misc_enable_msr = data;
3708 }
15c4a640 3709 break;
bef6ecca 3710 }
64d60670 3711 case MSR_IA32_SMBASE:
4b8e1b32 3712 if (!IS_ENABLED(CONFIG_KVM_SMM) || !msr_info->host_initiated)
64d60670
PB
3713 return 1;
3714 vcpu->arch.smbase = data;
3715 break;
73f624f4
PB
3716 case MSR_IA32_POWER_CTL:
3717 vcpu->arch.msr_ia32_power_ctl = data;
3718 break;
dd259935 3719 case MSR_IA32_TSC:
0c899c25
PB
3720 if (msr_info->host_initiated) {
3721 kvm_synchronize_tsc(vcpu, data);
3722 } else {
9b399dfd 3723 u64 adj = kvm_compute_l1_tsc_offset(vcpu, data) - vcpu->arch.l1_tsc_offset;
0c899c25
PB
3724 adjust_tsc_offset_guest(vcpu, adj);
3725 vcpu->arch.ia32_tsc_adjust_msr += adj;
3726 }
dd259935 3727 break;
864e2ab2
AL
3728 case MSR_IA32_XSS:
3729 if (!msr_info->host_initiated &&
3730 !guest_cpuid_has(vcpu, X86_FEATURE_XSAVES))
3731 return 1;
3732 /*
a1bead2a
SC
3733 * KVM supports exposing PT to the guest, but does not support
3734 * IA32_XSS[bit 8]. Guests have to use RDMSR/WRMSR rather than
3735 * XSAVES/XRSTORS to save/restore PT MSRs.
864e2ab2 3736 */
938c8745 3737 if (data & ~kvm_caps.supported_xss)
864e2ab2
AL
3738 return 1;
3739 vcpu->arch.ia32_xss = data;
4c282e51 3740 kvm_update_cpuid_runtime(vcpu);
864e2ab2 3741 break;
52797bf9
LA
3742 case MSR_SMI_COUNT:
3743 if (!msr_info->host_initiated)
3744 return 1;
3745 vcpu->arch.smi_count = data;
3746 break;
11c6bffa 3747 case MSR_KVM_WALL_CLOCK_NEW:
66570e96
OU
3748 if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE2))
3749 return 1;
3750
629b5348
JM
3751 vcpu->kvm->arch.wall_clock = data;
3752 kvm_write_wall_clock(vcpu->kvm, data, 0);
66570e96 3753 break;
18068523 3754 case MSR_KVM_WALL_CLOCK:
66570e96
OU
3755 if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE))
3756 return 1;
3757
629b5348
JM
3758 vcpu->kvm->arch.wall_clock = data;
3759 kvm_write_wall_clock(vcpu->kvm, data, 0);
18068523 3760 break;
11c6bffa 3761 case MSR_KVM_SYSTEM_TIME_NEW:
66570e96
OU
3762 if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE2))
3763 return 1;
3764
5b9bb0eb
OU
3765 kvm_write_system_time(vcpu, data, false, msr_info->host_initiated);
3766 break;
3767 case MSR_KVM_SYSTEM_TIME:
66570e96
OU
3768 if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE))
3769 return 1;
3770
3771 kvm_write_system_time(vcpu, data, true, msr_info->host_initiated);
18068523 3772 break;
344d9588 3773 case MSR_KVM_ASYNC_PF_EN:
66570e96
OU
3774 if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF))
3775 return 1;
3776
344d9588
GN
3777 if (kvm_pv_enable_async_pf(vcpu, data))
3778 return 1;
3779 break;
2635b5c4 3780 case MSR_KVM_ASYNC_PF_INT:
66570e96
OU
3781 if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT))
3782 return 1;
3783
2635b5c4
VK
3784 if (kvm_pv_enable_async_pf_int(vcpu, data))
3785 return 1;
3786 break;
557a961a 3787 case MSR_KVM_ASYNC_PF_ACK:
0a31df68 3788 if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT))
66570e96 3789 return 1;
557a961a
VK
3790 if (data & 0x1) {
3791 vcpu->arch.apf.pageready_pending = false;
3792 kvm_check_async_pf_completion(vcpu);
3793 }
3794 break;
c9aaa895 3795 case MSR_KVM_STEAL_TIME:
66570e96
OU
3796 if (!guest_pv_has(vcpu, KVM_FEATURE_STEAL_TIME))
3797 return 1;
c9aaa895
GC
3798
3799 if (unlikely(!sched_info_on()))
3800 return 1;
3801
3802 if (data & KVM_STEAL_RESERVED_MASK)
3803 return 1;
3804
c9aaa895
GC
3805 vcpu->arch.st.msr_val = data;
3806
3807 if (!(data & KVM_MSR_ENABLED))
3808 break;
3809
c9aaa895
GC
3810 kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu);
3811
3812 break;
ae7a2a3f 3813 case MSR_KVM_PV_EOI_EN:
66570e96
OU
3814 if (!guest_pv_has(vcpu, KVM_FEATURE_PV_EOI))
3815 return 1;
3816
77c3323f 3817 if (kvm_lapic_set_pv_eoi(vcpu, data, sizeof(u8)))
ae7a2a3f
MT
3818 return 1;
3819 break;
c9aaa895 3820
2d5ba19b 3821 case MSR_KVM_POLL_CONTROL:
66570e96
OU
3822 if (!guest_pv_has(vcpu, KVM_FEATURE_POLL_CONTROL))
3823 return 1;
3824
2d5ba19b
MT
3825 /* only enable bit supported */
3826 if (data & (-1ULL << 1))
3827 return 1;
3828
3829 vcpu->arch.msr_kvm_poll_control = data;
3830 break;
3831
890ca9ae
HY
3832 case MSR_IA32_MCG_CTL:
3833 case MSR_IA32_MCG_STATUS:
81760dcc 3834 case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1:
281b5278 3835 case MSR_IA32_MC0_CTL2 ... MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) - 1:
9ffd986c 3836 return set_msr_mce(vcpu, msr_info);
71db6023 3837
6912ac32
WH
3838 case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3:
3839 case MSR_P6_PERFCTR0 ... MSR_P6_PERFCTR1:
6912ac32
WH
3840 case MSR_K7_EVNTSEL0 ... MSR_K7_EVNTSEL3:
3841 case MSR_P6_EVNTSEL0 ... MSR_P6_EVNTSEL1:
c6702c9d 3842 if (kvm_pmu_is_valid_msr(vcpu, msr))
afd80d85 3843 return kvm_pmu_set_msr(vcpu, msr_info);
5753785f 3844
e76ae527
SC
3845 if (data)
3846 kvm_pr_unimpl_wrmsr(vcpu, msr, data);
5753785f 3847 break;
84e0cefa
JS
3848 case MSR_K7_CLK_CTL:
3849 /*
3850 * Ignore all writes to this no longer documented MSR.
3851 * Writes are only relevant for old K7 processors,
3852 * all pre-dating SVM, but a recommended workaround from
4a969980 3853 * AMD for these chips. It is possible to specify the
84e0cefa
JS
3854 * affected processor models on the command line, hence
3855 * the need to ignore the workaround.
3856 */
3857 break;
55cd8e5a 3858 case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15:
f97f5a56
JD
3859 case HV_X64_MSR_SYNDBG_CONTROL ... HV_X64_MSR_SYNDBG_PENDING_BUFFER:
3860 case HV_X64_MSR_SYNDBG_OPTIONS:
e7d9513b
AS
3861 case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4:
3862 case HV_X64_MSR_CRASH_CTL:
1f4b34f8 3863 case HV_X64_MSR_STIMER0_CONFIG ... HV_X64_MSR_STIMER3_COUNT:
a2e164e7
VK
3864 case HV_X64_MSR_REENLIGHTENMENT_CONTROL:
3865 case HV_X64_MSR_TSC_EMULATION_CONTROL:
3866 case HV_X64_MSR_TSC_EMULATION_STATUS:
2be1bd3a 3867 case HV_X64_MSR_TSC_INVARIANT_CONTROL:
e7d9513b
AS
3868 return kvm_hv_set_msr_common(vcpu, msr, data,
3869 msr_info->host_initiated);
91c9c3ed 3870 case MSR_IA32_BBL_CR_CTL3:
3871 /* Drop writes to this legacy MSR -- see rdmsr
3872 * counterpart for further detail.
3873 */
e76ae527 3874 kvm_pr_unimpl_wrmsr(vcpu, msr, data);
91c9c3ed 3875 break;
2b036c6b 3876 case MSR_AMD64_OSVW_ID_LENGTH:
d6321d49 3877 if (!guest_cpuid_has(vcpu, X86_FEATURE_OSVW))
2b036c6b
BO
3878 return 1;
3879 vcpu->arch.osvw.length = data;
3880 break;
3881 case MSR_AMD64_OSVW_STATUS:
d6321d49 3882 if (!guest_cpuid_has(vcpu, X86_FEATURE_OSVW))
2b036c6b
BO
3883 return 1;
3884 vcpu->arch.osvw.status = data;
3885 break;
db2336a8
KH
3886 case MSR_PLATFORM_INFO:
3887 if (!msr_info->host_initiated ||
db2336a8
KH
3888 (!(data & MSR_PLATFORM_INFO_CPUID_FAULT) &&
3889 cpuid_fault_enabled(vcpu)))
3890 return 1;
3891 vcpu->arch.msr_platform_info = data;
3892 break;
3893 case MSR_MISC_FEATURES_ENABLES:
3894 if (data & ~MSR_MISC_FEATURES_ENABLES_CPUID_FAULT ||
3895 (data & MSR_MISC_FEATURES_ENABLES_CPUID_FAULT &&
3896 !supports_cpuid_fault(vcpu)))
3897 return 1;
3898 vcpu->arch.msr_misc_features_enables = data;
3899 break;
820a6ee9
JL
3900#ifdef CONFIG_X86_64
3901 case MSR_IA32_XFD:
3902 if (!msr_info->host_initiated &&
3903 !guest_cpuid_has(vcpu, X86_FEATURE_XFD))
3904 return 1;
3905
988896bb 3906 if (data & ~kvm_guest_supported_xfd(vcpu))
820a6ee9
JL
3907 return 1;
3908
3909 fpu_update_guest_xfd(&vcpu->arch.guest_fpu, data);
3910 break;
548e8365
JL
3911 case MSR_IA32_XFD_ERR:
3912 if (!msr_info->host_initiated &&
3913 !guest_cpuid_has(vcpu, X86_FEATURE_XFD))
3914 return 1;
3915
988896bb 3916 if (data & ~kvm_guest_supported_xfd(vcpu))
548e8365
JL
3917 return 1;
3918
3919 vcpu->arch.guest_fpu.xfd_err = data;
3920 break;
820a6ee9 3921#endif
2de154f5 3922 default:
157fc497
SC
3923 if (kvm_pmu_is_valid_msr(vcpu, msr))
3924 return kvm_pmu_set_msr(vcpu, msr_info);
2de154f5 3925
157fc497
SC
3926 /*
3927 * Userspace is allowed to write '0' to MSRs that KVM reports
 3928 * as to-be-saved, even if an MSR isn't fully supported.
3929 */
2de154f5
SC
3930 if (msr_info->host_initiated && !data &&
3931 kvm_is_msr_to_save(msr))
3932 break;
3933
6abe9c13 3934 return KVM_MSR_RET_INVALID;
15c4a640
CO
3935 }
3936 return 0;
3937}
3938EXPORT_SYMBOL_GPL(kvm_set_msr_common);
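/*
 * Illustrative sketch (not part of x86.c): the validation pattern used above
 * for write-only "command" MSRs such as MSR_IA32_PRED_CMD and
 * MSR_IA32_FLUSH_CMD -- reject any bit outside the single supported command,
 * treat a zero write as a no-op, and only then issue the command on the host.
 */
#include <stdbool.h>
#include <stdint.h>

static bool handle_command_msr(uint64_t data, uint64_t supported_bits,
			       void (*issue)(uint64_t))
{
	if (data & ~supported_bits)
		return false;		/* reserved bit set: caller injects #GP */
	if (data)
		issue(data);		/* e.g. a wrmsrl() of the command bit */
	return true;
}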
3939
44883f01 3940static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host)
15c4a640
CO
3941{
3942 u64 data;
890ca9ae
HY
3943 u64 mcg_cap = vcpu->arch.mcg_cap;
3944 unsigned bank_num = mcg_cap & 0xff;
281b5278 3945 u32 offset, last_msr;
15c4a640
CO
3946
3947 switch (msr) {
15c4a640
CO
3948 case MSR_IA32_P5_MC_ADDR:
3949 case MSR_IA32_P5_MC_TYPE:
890ca9ae
HY
3950 data = 0;
3951 break;
15c4a640 3952 case MSR_IA32_MCG_CAP:
890ca9ae
HY
3953 data = vcpu->arch.mcg_cap;
3954 break;
c7ac679c 3955 case MSR_IA32_MCG_CTL:
44883f01 3956 if (!(mcg_cap & MCG_CTL_P) && !host)
890ca9ae
HY
3957 return 1;
3958 data = vcpu->arch.mcg_ctl;
3959 break;
3960 case MSR_IA32_MCG_STATUS:
3961 data = vcpu->arch.mcg_status;
3962 break;
281b5278
JW
3963 case MSR_IA32_MC0_CTL2 ... MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) - 1:
3964 last_msr = MSR_IA32_MCx_CTL2(bank_num) - 1;
3965 if (msr > last_msr)
3966 return 1;
6ec4c5ee 3967
281b5278
JW
3968 if (!(mcg_cap & MCG_CMCI_P) && !host)
3969 return 1;
3970 offset = array_index_nospec(msr - MSR_IA32_MC0_CTL2,
3971 last_msr + 1 - MSR_IA32_MC0_CTL2);
3972 data = vcpu->arch.mci_ctl2_banks[offset];
3973 break;
3974 case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1:
3975 last_msr = MSR_IA32_MCx_CTL(bank_num) - 1;
3976 if (msr > last_msr)
3977 return 1;
3978
3979 offset = array_index_nospec(msr - MSR_IA32_MC0_CTL,
3980 last_msr + 1 - MSR_IA32_MC0_CTL);
3981 data = vcpu->arch.mce_banks[offset];
3982 break;
3983 default:
890ca9ae
HY
3984 return 1;
3985 }
3986 *pdata = data;
3987 return 0;
3988}
3989
609e36d3 3990int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
890ca9ae 3991{
609e36d3 3992 switch (msr_info->index) {
890ca9ae 3993 case MSR_IA32_PLATFORM_ID:
15c4a640 3994 case MSR_IA32_EBL_CR_POWERON:
b5e2fec0
AG
3995 case MSR_IA32_LASTBRANCHFROMIP:
3996 case MSR_IA32_LASTBRANCHTOIP:
3997 case MSR_IA32_LASTINTFROMIP:
3998 case MSR_IA32_LASTINTTOIP:
059e5c32 3999 case MSR_AMD64_SYSCFG:
3afb1121
PB
4000 case MSR_K8_TSEG_ADDR:
4001 case MSR_K8_TSEG_MASK:
61a6bd67 4002 case MSR_VM_HSAVE_PA:
1fdbd48c 4003 case MSR_K8_INT_PENDING_MSG:
c323c0e5 4004 case MSR_AMD64_NB_CFG:
f7c6d140 4005 case MSR_FAM10H_MMIO_CONF_BASE:
2e32b719 4006 case MSR_AMD64_BU_CFG2:
0c2df2a1 4007 case MSR_IA32_PERF_CTL:
405a353a 4008 case MSR_AMD64_DC_CFG:
0e1b869f 4009 case MSR_F15H_EX_CFG:
2ca1a06a
VS
4010 /*
4011 * Intel Sandy Bridge CPUs must support the RAPL (running average power
4012 * limit) MSRs. Just return 0, as we do not want to expose the host
4013 * data here. Do not conditionalize this on CPUID, as KVM does not do
4014 * so for existing CPU-specific MSRs.
4015 */
4016 case MSR_RAPL_POWER_UNIT:
4017 case MSR_PP0_ENERGY_STATUS: /* Power plane 0 (core) */
4018 case MSR_PP1_ENERGY_STATUS: /* Power plane 1 (graphics uncore) */
4019 case MSR_PKG_ENERGY_STATUS: /* Total package */
4020 case MSR_DRAM_ENERGY_STATUS: /* DRAM controller */
609e36d3 4021 msr_info->data = 0;
15c4a640 4022 break;
6912ac32
WH
4023 case MSR_K7_EVNTSEL0 ... MSR_K7_EVNTSEL3:
4024 case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3:
4025 case MSR_P6_PERFCTR0 ... MSR_P6_PERFCTR1:
4026 case MSR_P6_EVNTSEL0 ... MSR_P6_EVNTSEL1:
c6702c9d 4027 if (kvm_pmu_is_valid_msr(vcpu, msr_info->index))
cbd71758 4028 return kvm_pmu_get_msr(vcpu, msr_info);
609e36d3 4029 msr_info->data = 0;
5753785f 4030 break;
742bc670 4031 case MSR_IA32_UCODE_REV:
518e7b94 4032 msr_info->data = vcpu->arch.microcode_version;
742bc670 4033 break;
0cf9135b
SC
4034 case MSR_IA32_ARCH_CAPABILITIES:
4035 if (!msr_info->host_initiated &&
4036 !guest_cpuid_has(vcpu, X86_FEATURE_ARCH_CAPABILITIES))
4037 return 1;
4038 msr_info->data = vcpu->arch.arch_capabilities;
4039 break;
d574c539
VK
4040 case MSR_IA32_PERF_CAPABILITIES:
4041 if (!msr_info->host_initiated &&
4042 !guest_cpuid_has(vcpu, X86_FEATURE_PDCM))
4043 return 1;
4044 msr_info->data = vcpu->arch.perf_capabilities;
4045 break;
73f624f4
PB
4046 case MSR_IA32_POWER_CTL:
4047 msr_info->data = vcpu->arch.msr_ia32_power_ctl;
4048 break;
cc5b54dd
ML
4049 case MSR_IA32_TSC: {
4050 /*
 4051 * The Intel SDM states that reading MSR_IA32_TSC adds the TSC offset
 4052 * even when not intercepted. The AMD manual doesn't explicitly
 4053 * state this, but the CPU appears to behave the same way.
4054 *
ee6fa053 4055 * On userspace reads and writes, however, we unconditionally
c0623f5e 4056 * return L1's TSC value to ensure backwards-compatible
ee6fa053 4057 * behavior for migration.
cc5b54dd 4058 */
fe3eb504 4059 u64 offset, ratio;
cc5b54dd 4060
fe3eb504
IS
4061 if (msr_info->host_initiated) {
4062 offset = vcpu->arch.l1_tsc_offset;
4063 ratio = vcpu->arch.l1_tsc_scaling_ratio;
4064 } else {
4065 offset = vcpu->arch.tsc_offset;
4066 ratio = vcpu->arch.tsc_scaling_ratio;
4067 }
4068
62711e5a 4069 msr_info->data = kvm_scale_tsc(rdtsc(), ratio) + offset;
dd259935 4070 break;
cc5b54dd 4071 }
9ba075a6 4072 case MSR_MTRRcap:
281b5278
JW
4073 case 0x200 ... MSR_IA32_MC0_CTL2 - 1:
4074 case MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) ... 0x2ff:
ff53604b 4075 return kvm_mtrr_get_msr(vcpu, msr_info->index, &msr_info->data);
15c4a640 4076 case 0xcd: /* fsb frequency */
609e36d3 4077 msr_info->data = 3;
15c4a640 4078 break;
7b914098
JS
4079 /*
4080 * MSR_EBC_FREQUENCY_ID
4081 * Conservative value valid for even the basic CPU models.
4082 * Models 0,1: 000 in bits 23:21 indicating a bus speed of
4083 * 100MHz, model 2 000 in bits 18:16 indicating 100MHz,
4084 * and 266MHz for model 3, or 4. Set Core Clock
4085 * Frequency to System Bus Frequency Ratio to 1 (bits
4086 * 31:24) even though these are only valid for CPU
4087 * models > 2, however guests may end up dividing or
4088 * multiplying by zero otherwise.
4089 */
4090 case MSR_EBC_FREQUENCY_ID:
609e36d3 4091 msr_info->data = 1 << 24;
7b914098 4092 break;
15c4a640 4093 case MSR_IA32_APICBASE:
609e36d3 4094 msr_info->data = kvm_get_apic_base(vcpu);
15c4a640 4095 break;
bf10bd0b 4096 case APIC_BASE_MSR ... APIC_BASE_MSR + 0xff:
609e36d3 4097 return kvm_x2apic_msr_read(vcpu, msr_info->index, &msr_info->data);
09141ec0 4098 case MSR_IA32_TSC_DEADLINE:
609e36d3 4099 msr_info->data = kvm_get_lapic_tscdeadline_msr(vcpu);
a3e06bbe 4100 break;
ba904635 4101 case MSR_IA32_TSC_ADJUST:
609e36d3 4102 msr_info->data = (u64)vcpu->arch.ia32_tsc_adjust_msr;
ba904635 4103 break;
15c4a640 4104 case MSR_IA32_MISC_ENABLE:
609e36d3 4105 msr_info->data = vcpu->arch.ia32_misc_enable_msr;
15c4a640 4106 break;
64d60670 4107 case MSR_IA32_SMBASE:
4b8e1b32 4108 if (!IS_ENABLED(CONFIG_KVM_SMM) || !msr_info->host_initiated)
64d60670
PB
4109 return 1;
4110 msr_info->data = vcpu->arch.smbase;
15c4a640 4111 break;
52797bf9
LA
4112 case MSR_SMI_COUNT:
4113 msr_info->data = vcpu->arch.smi_count;
4114 break;
847f0ad8
AG
4115 case MSR_IA32_PERF_STATUS:
4116 /* TSC increment by tick */
609e36d3 4117 msr_info->data = 1000ULL;
847f0ad8 4118 /* CPU multiplier */
b0996ae4 4119 msr_info->data |= (((uint64_t)4ULL) << 40);
847f0ad8 4120 break;
15c4a640 4121 case MSR_EFER:
609e36d3 4122 msr_info->data = vcpu->arch.efer;
15c4a640 4123 break;
18068523 4124 case MSR_KVM_WALL_CLOCK:
1930e5dd
OU
4125 if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE))
4126 return 1;
4127
4128 msr_info->data = vcpu->kvm->arch.wall_clock;
4129 break;
11c6bffa 4130 case MSR_KVM_WALL_CLOCK_NEW:
1930e5dd
OU
4131 if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE2))
4132 return 1;
4133
609e36d3 4134 msr_info->data = vcpu->kvm->arch.wall_clock;
18068523
GOC
4135 break;
4136 case MSR_KVM_SYSTEM_TIME:
1930e5dd
OU
4137 if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE))
4138 return 1;
4139
4140 msr_info->data = vcpu->arch.time;
4141 break;
11c6bffa 4142 case MSR_KVM_SYSTEM_TIME_NEW:
1930e5dd
OU
4143 if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE2))
4144 return 1;
4145
609e36d3 4146 msr_info->data = vcpu->arch.time;
18068523 4147 break;
344d9588 4148 case MSR_KVM_ASYNC_PF_EN:
1930e5dd
OU
4149 if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF))
4150 return 1;
4151
2635b5c4
VK
4152 msr_info->data = vcpu->arch.apf.msr_en_val;
4153 break;
4154 case MSR_KVM_ASYNC_PF_INT:
1930e5dd
OU
4155 if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT))
4156 return 1;
4157
2635b5c4 4158 msr_info->data = vcpu->arch.apf.msr_int_val;
344d9588 4159 break;
557a961a 4160 case MSR_KVM_ASYNC_PF_ACK:
0a31df68 4161 if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT))
1930e5dd
OU
4162 return 1;
4163
557a961a
VK
4164 msr_info->data = 0;
4165 break;
c9aaa895 4166 case MSR_KVM_STEAL_TIME:
1930e5dd
OU
4167 if (!guest_pv_has(vcpu, KVM_FEATURE_STEAL_TIME))
4168 return 1;
4169
609e36d3 4170 msr_info->data = vcpu->arch.st.msr_val;
c9aaa895 4171 break;
1d92128f 4172 case MSR_KVM_PV_EOI_EN:
1930e5dd
OU
4173 if (!guest_pv_has(vcpu, KVM_FEATURE_PV_EOI))
4174 return 1;
4175
609e36d3 4176 msr_info->data = vcpu->arch.pv_eoi.msr_val;
1d92128f 4177 break;
2d5ba19b 4178 case MSR_KVM_POLL_CONTROL:
1930e5dd
OU
4179 if (!guest_pv_has(vcpu, KVM_FEATURE_POLL_CONTROL))
4180 return 1;
4181
2d5ba19b
MT
4182 msr_info->data = vcpu->arch.msr_kvm_poll_control;
4183 break;
890ca9ae
HY
4184 case MSR_IA32_P5_MC_ADDR:
4185 case MSR_IA32_P5_MC_TYPE:
4186 case MSR_IA32_MCG_CAP:
4187 case MSR_IA32_MCG_CTL:
4188 case MSR_IA32_MCG_STATUS:
81760dcc 4189 case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1:
281b5278 4190 case MSR_IA32_MC0_CTL2 ... MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) - 1:
44883f01
PB
4191 return get_msr_mce(vcpu, msr_info->index, &msr_info->data,
4192 msr_info->host_initiated);
864e2ab2
AL
4193 case MSR_IA32_XSS:
4194 if (!msr_info->host_initiated &&
4195 !guest_cpuid_has(vcpu, X86_FEATURE_XSAVES))
4196 return 1;
4197 msr_info->data = vcpu->arch.ia32_xss;
4198 break;
84e0cefa
JS
4199 case MSR_K7_CLK_CTL:
4200 /*
 4201 * Provide the expected ramp-up count for K7. All others
4202 * are set to zero, indicating minimum divisors for
4203 * every field.
4204 *
 4205 * This prevents guest kernels on an AMD host with CPU
4206 * type 6, model 8 and higher from exploding due to
4207 * the rdmsr failing.
4208 */
609e36d3 4209 msr_info->data = 0x20000000;
84e0cefa 4210 break;
55cd8e5a 4211 case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15:
f97f5a56
JD
4212 case HV_X64_MSR_SYNDBG_CONTROL ... HV_X64_MSR_SYNDBG_PENDING_BUFFER:
4213 case HV_X64_MSR_SYNDBG_OPTIONS:
e7d9513b
AS
4214 case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4:
4215 case HV_X64_MSR_CRASH_CTL:
1f4b34f8 4216 case HV_X64_MSR_STIMER0_CONFIG ... HV_X64_MSR_STIMER3_COUNT:
a2e164e7
VK
4217 case HV_X64_MSR_REENLIGHTENMENT_CONTROL:
4218 case HV_X64_MSR_TSC_EMULATION_CONTROL:
4219 case HV_X64_MSR_TSC_EMULATION_STATUS:
2be1bd3a 4220 case HV_X64_MSR_TSC_INVARIANT_CONTROL:
e83d5887 4221 return kvm_hv_get_msr_common(vcpu,
44883f01
PB
4222 msr_info->index, &msr_info->data,
4223 msr_info->host_initiated);
91c9c3ed 4224 case MSR_IA32_BBL_CR_CTL3:
4225 /* This legacy MSR exists but isn't fully documented in current
4226 * silicon. It is however accessed by winxp in very narrow
4227 * scenarios where it sets bit #19, itself documented as
4228 * a "reserved" bit. Best effort attempt to source coherent
4229 * read data here should the balance of the register be
4230 * interpreted by the guest:
4231 *
4232 * L2 cache control register 3: 64GB range, 256KB size,
4233 * enabled, latency 0x1, configured
4234 */
609e36d3 4235 msr_info->data = 0xbe702111;
91c9c3ed 4236 break;
2b036c6b 4237 case MSR_AMD64_OSVW_ID_LENGTH:
d6321d49 4238 if (!guest_cpuid_has(vcpu, X86_FEATURE_OSVW))
2b036c6b 4239 return 1;
609e36d3 4240 msr_info->data = vcpu->arch.osvw.length;
2b036c6b
BO
4241 break;
4242 case MSR_AMD64_OSVW_STATUS:
d6321d49 4243 if (!guest_cpuid_has(vcpu, X86_FEATURE_OSVW))
2b036c6b 4244 return 1;
609e36d3 4245 msr_info->data = vcpu->arch.osvw.status;
2b036c6b 4246 break;
db2336a8 4247 case MSR_PLATFORM_INFO:
6fbbde9a
DS
4248 if (!msr_info->host_initiated &&
4249 !vcpu->kvm->arch.guest_can_read_msr_platform_info)
4250 return 1;
db2336a8
KH
4251 msr_info->data = vcpu->arch.msr_platform_info;
4252 break;
4253 case MSR_MISC_FEATURES_ENABLES:
4254 msr_info->data = vcpu->arch.msr_misc_features_enables;
4255 break;
191c8137
BP
4256 case MSR_K7_HWCR:
4257 msr_info->data = vcpu->arch.msr_hwcr;
4258 break;
820a6ee9
JL
4259#ifdef CONFIG_X86_64
4260 case MSR_IA32_XFD:
4261 if (!msr_info->host_initiated &&
4262 !guest_cpuid_has(vcpu, X86_FEATURE_XFD))
4263 return 1;
4264
4265 msr_info->data = vcpu->arch.guest_fpu.fpstate->xfd;
4266 break;
548e8365
JL
4267 case MSR_IA32_XFD_ERR:
4268 if (!msr_info->host_initiated &&
4269 !guest_cpuid_has(vcpu, X86_FEATURE_XFD))
4270 return 1;
4271
4272 msr_info->data = vcpu->arch.guest_fpu.xfd_err;
4273 break;
820a6ee9 4274#endif
15c4a640 4275 default:
c6702c9d 4276 if (kvm_pmu_is_valid_msr(vcpu, msr_info->index))
cbd71758 4277 return kvm_pmu_get_msr(vcpu, msr_info);
2de154f5
SC
4278
4279 /*
4280 * Userspace is allowed to read MSRs that KVM reports as
4281 * to-be-saved, even if an MSR isn't fully supported.
4282 */
4283 if (msr_info->host_initiated &&
4284 kvm_is_msr_to_save(msr_info->index)) {
4285 msr_info->data = 0;
4286 break;
4287 }
4288
6abe9c13 4289 return KVM_MSR_RET_INVALID;
15c4a640 4290 }
15c4a640
CO
4291 return 0;
4292}
4293EXPORT_SYMBOL_GPL(kvm_get_msr_common);
4294
313a3dc7
CO
4295/*
4296 * Read or write a bunch of msrs. All parameters are kernel addresses.
4297 *
4298 * @return number of msrs set successfully.
4299 */
4300static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs,
4301 struct kvm_msr_entry *entries,
4302 int (*do_msr)(struct kvm_vcpu *vcpu,
4303 unsigned index, u64 *data))
4304{
801e459a 4305 int i;
313a3dc7 4306
313a3dc7
CO
4307 for (i = 0; i < msrs->nmsrs; ++i)
4308 if (do_msr(vcpu, entries[i].index, &entries[i].data))
4309 break;
4310
313a3dc7
CO
4311 return i;
4312}
4313
4314/*
4315 * Read or write a bunch of msrs. Parameters are user addresses.
4316 *
4317 * @return number of msrs set successfully.
4318 */
4319static int msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs __user *user_msrs,
4320 int (*do_msr)(struct kvm_vcpu *vcpu,
4321 unsigned index, u64 *data),
4322 int writeback)
4323{
4324 struct kvm_msrs msrs;
4325 struct kvm_msr_entry *entries;
313a3dc7 4326 unsigned size;
e73ba25f 4327 int r;
313a3dc7
CO
4328
4329 r = -EFAULT;
0e96f31e 4330 if (copy_from_user(&msrs, user_msrs, sizeof(msrs)))
313a3dc7
CO
4331 goto out;
4332
4333 r = -E2BIG;
4334 if (msrs.nmsrs >= MAX_IO_MSRS)
4335 goto out;
4336
313a3dc7 4337 size = sizeof(struct kvm_msr_entry) * msrs.nmsrs;
ff5c2c03
SL
4338 entries = memdup_user(user_msrs->entries, size);
4339 if (IS_ERR(entries)) {
4340 r = PTR_ERR(entries);
313a3dc7 4341 goto out;
ff5c2c03 4342 }
313a3dc7 4343
e73ba25f 4344 r = __msr_io(vcpu, &msrs, entries, do_msr);
313a3dc7 4345
313a3dc7 4346 if (writeback && copy_to_user(user_msrs->entries, entries, size))
e73ba25f 4347 r = -EFAULT;
313a3dc7 4348
7a73c028 4349 kfree(entries);
313a3dc7
CO
4350out:
4351 return r;
4352}
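/*
 * Illustrative sketch (not part of x86.c): the userspace side of msr_io()
 * above, reading a single MSR from a vCPU with KVM_GET_MSRS. Assumes vcpu_fd
 * was obtained via the usual KVM_CREATE_VM/KVM_CREATE_VCPU sequence (not
 * shown); the ioctl returns the number of MSRs actually processed.
 */
#include <linux/kvm.h>
#include <stdint.h>
#include <stdlib.h>
#include <sys/ioctl.h>

static int read_one_msr(int vcpu_fd, uint32_t index, uint64_t *value)
{
	struct kvm_msrs *msrs;
	int ret = -1;

	msrs = calloc(1, sizeof(*msrs) + sizeof(struct kvm_msr_entry));
	if (!msrs)
		return -1;

	msrs->nmsrs = 1;
	msrs->entries[0].index = index;

	if (ioctl(vcpu_fd, KVM_GET_MSRS, msrs) == 1) {
		*value = msrs->entries[0].data;
		ret = 0;
	}

	free(msrs);
	return ret;
}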
4353
4d5422ce
WL
4354static inline bool kvm_can_mwait_in_guest(void)
4355{
4356 return boot_cpu_has(X86_FEATURE_MWAIT) &&
8e9b29b6
KA
4357 !boot_cpu_has_bug(X86_BUG_MONITOR) &&
4358 boot_cpu_has(X86_FEATURE_ARAT);
4d5422ce
WL
4359}
4360
c21d54f0
VK
4361static int kvm_ioctl_get_supported_hv_cpuid(struct kvm_vcpu *vcpu,
4362 struct kvm_cpuid2 __user *cpuid_arg)
4363{
4364 struct kvm_cpuid2 cpuid;
4365 int r;
4366
4367 r = -EFAULT;
4368 if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid)))
4369 return r;
4370
4371 r = kvm_get_hv_cpuid(vcpu, &cpuid, cpuid_arg->entries);
4372 if (r)
4373 return r;
4374
4375 r = -EFAULT;
4376 if (copy_to_user(cpuid_arg, &cpuid, sizeof(cpuid)))
4377 return r;
4378
4379 return 0;
4380}
4381
784aa3d7 4382int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
018d00d2 4383{
4d5422ce 4384 int r = 0;
018d00d2
ZX
4385
4386 switch (ext) {
4387 case KVM_CAP_IRQCHIP:
4388 case KVM_CAP_HLT:
4389 case KVM_CAP_MMU_SHADOW_CACHE_CONTROL:
018d00d2 4390 case KVM_CAP_SET_TSS_ADDR:
07716717 4391 case KVM_CAP_EXT_CPUID:
9c15bb1d 4392 case KVM_CAP_EXT_EMUL_CPUID:
c8076604 4393 case KVM_CAP_CLOCKSOURCE:
7837699f 4394 case KVM_CAP_PIT:
a28e4f5a 4395 case KVM_CAP_NOP_IO_DELAY:
62d9f0db 4396 case KVM_CAP_MP_STATE:
ed848624 4397 case KVM_CAP_SYNC_MMU:
a355c85c 4398 case KVM_CAP_USER_NMI:
52d939a0 4399 case KVM_CAP_REINJECT_CONTROL:
4925663a 4400 case KVM_CAP_IRQ_INJECT_STATUS:
d34e6b17 4401 case KVM_CAP_IOEVENTFD:
f848a5a8 4402 case KVM_CAP_IOEVENTFD_NO_LENGTH:
c5ff41ce 4403 case KVM_CAP_PIT2:
e9f42757 4404 case KVM_CAP_PIT_STATE2:
b927a3ce 4405 case KVM_CAP_SET_IDENTITY_MAP_ADDR:
3cfc3092 4406 case KVM_CAP_VCPU_EVENTS:
55cd8e5a 4407 case KVM_CAP_HYPERV:
10388a07 4408 case KVM_CAP_HYPERV_VAPIC:
c25bc163 4409 case KVM_CAP_HYPERV_SPIN:
5c919412 4410 case KVM_CAP_HYPERV_SYNIC:
efc479e6 4411 case KVM_CAP_HYPERV_SYNIC2:
d3457c87 4412 case KVM_CAP_HYPERV_VP_INDEX:
faeb7833 4413 case KVM_CAP_HYPERV_EVENTFD:
c1aea919 4414 case KVM_CAP_HYPERV_TLBFLUSH:
214ff83d 4415 case KVM_CAP_HYPERV_SEND_IPI:
2bc39970 4416 case KVM_CAP_HYPERV_CPUID:
644f7067 4417 case KVM_CAP_HYPERV_ENFORCE_CPUID:
c21d54f0 4418 case KVM_CAP_SYS_HYPERV_CPUID:
ab9f4ecb 4419 case KVM_CAP_PCI_SEGMENT:
a1efbe77 4420 case KVM_CAP_DEBUGREGS:
d2be1651 4421 case KVM_CAP_X86_ROBUST_SINGLESTEP:
2d5b5a66 4422 case KVM_CAP_XSAVE:
344d9588 4423 case KVM_CAP_ASYNC_PF:
72de5fa4 4424 case KVM_CAP_ASYNC_PF_INT:
92a1f12d 4425 case KVM_CAP_GET_TSC_KHZ:
1c0b28c2 4426 case KVM_CAP_KVMCLOCK_CTRL:
4d8b81ab 4427 case KVM_CAP_READONLY_MEM:
5f66b620 4428 case KVM_CAP_HYPERV_TIME:
100943c5 4429 case KVM_CAP_IOAPIC_POLARITY_IGNORED:
defcf51f 4430 case KVM_CAP_TSC_DEADLINE_TIMER:
90de4a18 4431 case KVM_CAP_DISABLE_QUIRKS:
d71ba788 4432 case KVM_CAP_SET_BOOT_CPU_ID:
49df6397 4433 case KVM_CAP_SPLIT_IRQCHIP:
460df4c1 4434 case KVM_CAP_IMMEDIATE_EXIT:
66bb8a06 4435 case KVM_CAP_PMU_EVENT_FILTER:
14329b82 4436 case KVM_CAP_PMU_EVENT_MASKED_EVENTS:
801e459a 4437 case KVM_CAP_GET_MSR_FEATURES:
6fbbde9a 4438 case KVM_CAP_MSR_PLATFORM_INFO:
c4f55198 4439 case KVM_CAP_EXCEPTION_PAYLOAD:
ed235117 4440 case KVM_CAP_X86_TRIPLE_FAULT_EVENT:
b9b2782c 4441 case KVM_CAP_SET_GUEST_DEBUG:
1aa561b1 4442 case KVM_CAP_LAST_CPU:
1ae09954 4443 case KVM_CAP_X86_USER_SPACE_MSR:
1a155254 4444 case KVM_CAP_X86_MSR_FILTER:
66570e96 4445 case KVM_CAP_ENFORCE_PV_FEATURE_CPUID:
fe7e9488
SC
4446#ifdef CONFIG_X86_SGX_KVM
4447 case KVM_CAP_SGX_ATTRIBUTE:
4448#endif
54526d1f 4449 case KVM_CAP_VM_COPY_ENC_CONTEXT_FROM:
30d7c5d6 4450 case KVM_CAP_VM_MOVE_ENC_CONTEXT_FROM:
6dba9403 4451 case KVM_CAP_SREGS2:
19238e75 4452 case KVM_CAP_EXIT_ON_EMULATION_FAILURE:
828ca896 4453 case KVM_CAP_VCPU_ATTRIBUTES:
dd6e6312 4454 case KVM_CAP_SYS_ATTRIBUTES:
8a289785 4455 case KVM_CAP_VAPIC:
127770ac 4456 case KVM_CAP_ENABLE_CAP:
084cc29f 4457 case KVM_CAP_VM_DISABLE_NX_HUGE_PAGES:
018d00d2
ZX
4458 r = 1;
4459 break;
0dbb1123
AK
4460 case KVM_CAP_EXIT_HYPERCALL:
4461 r = KVM_EXIT_HYPERCALL_VALID_MASK;
4462 break;
7e582ccb
ML
4463 case KVM_CAP_SET_GUEST_DEBUG2:
4464 return KVM_GUESTDBG_VALID_MASK;
b59b153d 4465#ifdef CONFIG_KVM_XEN
23200b7a
JM
4466 case KVM_CAP_XEN_HVM:
4467 r = KVM_XEN_HVM_CONFIG_HYPERCALL_MSR |
8d4e7e80 4468 KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL |
14243b38 4469 KVM_XEN_HVM_CONFIG_SHARED_INFO |
661a20fa
DW
4470 KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL |
4471 KVM_XEN_HVM_CONFIG_EVTCHN_SEND;
30b5c851 4472 if (sched_info_on())
d8ba8ba4
DW
4473 r |= KVM_XEN_HVM_CONFIG_RUNSTATE |
4474 KVM_XEN_HVM_CONFIG_RUNSTATE_UPDATE_FLAG;
23200b7a 4475 break;
b59b153d 4476#endif
01643c51
KH
4477 case KVM_CAP_SYNC_REGS:
4478 r = KVM_SYNC_X86_VALID_FIELDS;
4479 break;
e3fd9a93 4480 case KVM_CAP_ADJUST_CLOCK:
c68dc1b5 4481 r = KVM_CLOCK_VALID_FLAGS;
e3fd9a93 4482 break;
4d5422ce 4483 case KVM_CAP_X86_DISABLE_EXITS:
6f0f2d5e
TL
4484 r = KVM_X86_DISABLE_EXITS_PAUSE;
4485
4486 if (!mitigate_smt_rsb) {
4487 r |= KVM_X86_DISABLE_EXITS_HLT |
4488 KVM_X86_DISABLE_EXITS_CSTATE;
4489
4490 if (kvm_can_mwait_in_guest())
4491 r |= KVM_X86_DISABLE_EXITS_MWAIT;
4492 }
668fffa3 4493 break;
6d396b55 4494 case KVM_CAP_X86_SMM:
4b8e1b32
PB
4495 if (!IS_ENABLED(CONFIG_KVM_SMM))
4496 break;
4497
6d396b55
PB
4498 /* SMBASE is usually relocated above 1M on modern chipsets,
4499 * and SMM handlers might indeed rely on 4G segment limits,
4500 * so do not report SMM to be available if real mode is
4501 * emulated via vm86 mode. Still, do not go to great lengths
4502 * to avoid userspace's usage of the feature, because it is a
4503 * fringe case that is not enabled except via specific settings
4504 * of the module parameters.
4505 */
b3646477 4506 r = static_call(kvm_x86_has_emulated_msr)(kvm, MSR_IA32_SMBASE);
6d396b55 4507 break;
f725230a 4508 case KVM_CAP_NR_VCPUS:
2845e735 4509 r = min_t(unsigned int, num_online_cpus(), KVM_MAX_VCPUS);
8c3ba334
SL
4510 break;
4511 case KVM_CAP_MAX_VCPUS:
f725230a
AK
4512 r = KVM_MAX_VCPUS;
4513 break;
a86cb413 4514 case KVM_CAP_MAX_VCPU_ID:
a1c42dde 4515 r = KVM_MAX_VCPU_IDS;
a86cb413 4516 break;
a68a6a72
MT
4517 case KVM_CAP_PV_MMU: /* obsolete */
4518 r = 0;
2f333bcb 4519 break;
890ca9ae
HY
4520 case KVM_CAP_MCE:
4521 r = KVM_MAX_MCE_BANKS;
4522 break;
2d5b5a66 4523 case KVM_CAP_XCRS:
d366bf7e 4524 r = boot_cpu_has(X86_FEATURE_XSAVE);
2d5b5a66 4525 break;
92a1f12d 4526 case KVM_CAP_TSC_CONTROL:
ffbb61d0 4527 case KVM_CAP_VM_TSC_CONTROL:
938c8745 4528 r = kvm_caps.has_tsc_control;
92a1f12d 4529 break;
37131313
RK
4530 case KVM_CAP_X2APIC_API:
4531 r = KVM_X2APIC_API_VALID_FLAGS;
4532 break;
8fcc4b59 4533 case KVM_CAP_NESTED_STATE:
33b22172
PB
4534 r = kvm_x86_ops.nested_ops->get_state ?
4535 kvm_x86_ops.nested_ops->get_state(NULL, NULL, 0) : 0;
8fcc4b59 4536 break;
344c6c80 4537 case KVM_CAP_HYPERV_DIRECT_TLBFLUSH:
b83237ad 4538 r = kvm_x86_ops.enable_l2_tlb_flush != NULL;
5a0165f6
VK
4539 break;
4540 case KVM_CAP_HYPERV_ENLIGHTENED_VMCS:
33b22172 4541 r = kvm_x86_ops.nested_ops->enable_evmcs != NULL;
344c6c80 4542 break;
3edd6839
MG
4543 case KVM_CAP_SMALLER_MAXPHYADDR:
4544 r = (int) allow_smaller_maxphyaddr;
4545 break;
004a0124
AJ
4546 case KVM_CAP_STEAL_TIME:
4547 r = sched_info_on();
4548 break;
fe6b6bc8 4549 case KVM_CAP_X86_BUS_LOCK_EXIT:
938c8745 4550 if (kvm_caps.has_bus_lock_exit)
fe6b6bc8
CQ
4551 r = KVM_BUS_LOCK_DETECTION_OFF |
4552 KVM_BUS_LOCK_DETECTION_EXIT;
4553 else
4554 r = 0;
4555 break;
be50b206
GZ
4556 case KVM_CAP_XSAVE2: {
4557 u64 guest_perm = xstate_get_guest_group_perm();
4558
938c8745 4559 r = xstate_required_size(kvm_caps.supported_xcr0 & guest_perm, false);
be50b206
GZ
4560 if (r < sizeof(struct kvm_xsave))
4561 r = sizeof(struct kvm_xsave);
4562 break;
1c4dc573 4563 }
ba7bb663
DD
4564 case KVM_CAP_PMU_CAPABILITY:
4565 r = enable_pmu ? KVM_CAP_PMU_VALID_MASK : 0;
4566 break;
6d849191
OU
4567 case KVM_CAP_DISABLE_QUIRKS2:
4568 r = KVM_X86_VALID_QUIRKS;
4569 break;
2f4073e0
TX
4570 case KVM_CAP_X86_NOTIFY_VMEXIT:
4571 r = kvm_caps.has_notify_vmexit;
4572 break;
018d00d2 4573 default:
018d00d2
ZX
4574 break;
4575 }
4576 return r;
56f289a8
SC
4577}
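/*
 * Illustrative sketch (not part of x86.c): querying one of the capabilities
 * reported by kvm_vm_ioctl_check_extension() above from userspace.
 * KVM_CHECK_EXTENSION returns 0 for unsupported capabilities and a
 * capability-specific value otherwise; it can also be issued on a VM fd.
 */
#include <fcntl.h>
#include <linux/kvm.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>

int main(void)
{
	int kvm_fd = open("/dev/kvm", O_RDWR | O_CLOEXEC);
	int r;

	if (kvm_fd < 0)
		return 1;

	r = ioctl(kvm_fd, KVM_CHECK_EXTENSION, KVM_CAP_MAX_VCPUS);
	printf("KVM_CAP_MAX_VCPUS -> %d\n", r);

	close(kvm_fd);
	return 0;
}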
4578
4579static inline void __user *kvm_get_attr_addr(struct kvm_device_attr *attr)
4580{
4581 void __user *uaddr = (void __user*)(unsigned long)attr->addr;
018d00d2 4582
56f289a8 4583 if ((u64)(unsigned long)uaddr != attr->addr)
6e37ec88 4584 return ERR_PTR_USR(-EFAULT);
56f289a8 4585 return uaddr;
018d00d2
ZX
4586}
4587
dd6e6312
PB
4588static int kvm_x86_dev_get_attr(struct kvm_device_attr *attr)
4589{
4590 u64 __user *uaddr = kvm_get_attr_addr(attr);
4591
4592 if (attr->group)
4593 return -ENXIO;
4594
4595 if (IS_ERR(uaddr))
4596 return PTR_ERR(uaddr);
4597
4598 switch (attr->attr) {
4599 case KVM_X86_XCOMP_GUEST_SUPP:
938c8745 4600 if (put_user(kvm_caps.supported_xcr0, uaddr))
dd6e6312
PB
4601 return -EFAULT;
4602 return 0;
4603 default:
4604 return -ENXIO;
4605 break;
4606 }
4607}
4608
4609static int kvm_x86_dev_has_attr(struct kvm_device_attr *attr)
4610{
4611 if (attr->group)
4612 return -ENXIO;
4613
4614 switch (attr->attr) {
4615 case KVM_X86_XCOMP_GUEST_SUPP:
4616 return 0;
4617 default:
4618 return -ENXIO;
4619 }
4620}
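/*
 * Illustrative sketch (not part of x86.c): the userspace counterpart of
 * kvm_x86_dev_get_attr() above, fetching KVM_X86_XCOMP_GUEST_SUPP through
 * KVM_GET_DEVICE_ATTR on an open /dev/kvm file descriptor. Assumes the uapi
 * headers in use define KVM_X86_XCOMP_GUEST_SUPP.
 */
#include <linux/kvm.h>
#include <stdint.h>
#include <sys/ioctl.h>

static int get_guest_xcomp_supp(int kvm_fd, uint64_t *mask)
{
	struct kvm_device_attr attr = {
		.group = 0,
		.attr = KVM_X86_XCOMP_GUEST_SUPP,
		.addr = (uint64_t)(unsigned long)mask,
	};

	return ioctl(kvm_fd, KVM_GET_DEVICE_ATTR, &attr);
}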
4621
043405e1
CO
4622long kvm_arch_dev_ioctl(struct file *filp,
4623 unsigned int ioctl, unsigned long arg)
4624{
4625 void __user *argp = (void __user *)arg;
4626 long r;
4627
4628 switch (ioctl) {
4629 case KVM_GET_MSR_INDEX_LIST: {
4630 struct kvm_msr_list __user *user_msr_list = argp;
4631 struct kvm_msr_list msr_list;
4632 unsigned n;
4633
4634 r = -EFAULT;
0e96f31e 4635 if (copy_from_user(&msr_list, user_msr_list, sizeof(msr_list)))
043405e1
CO
4636 goto out;
4637 n = msr_list.nmsrs;
62ef68bb 4638 msr_list.nmsrs = num_msrs_to_save + num_emulated_msrs;
0e96f31e 4639 if (copy_to_user(user_msr_list, &msr_list, sizeof(msr_list)))
043405e1
CO
4640 goto out;
4641 r = -E2BIG;
e125e7b6 4642 if (n < msr_list.nmsrs)
043405e1
CO
4643 goto out;
4644 r = -EFAULT;
4645 if (copy_to_user(user_msr_list->indices, &msrs_to_save,
4646 num_msrs_to_save * sizeof(u32)))
4647 goto out;
e125e7b6 4648 if (copy_to_user(user_msr_list->indices + num_msrs_to_save,
043405e1 4649 &emulated_msrs,
62ef68bb 4650 num_emulated_msrs * sizeof(u32)))
043405e1
CO
4651 goto out;
4652 r = 0;
4653 break;
4654 }
9c15bb1d
BP
4655 case KVM_GET_SUPPORTED_CPUID:
4656 case KVM_GET_EMULATED_CPUID: {
674eea0f
AK
4657 struct kvm_cpuid2 __user *cpuid_arg = argp;
4658 struct kvm_cpuid2 cpuid;
4659
4660 r = -EFAULT;
0e96f31e 4661 if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid)))
674eea0f 4662 goto out;
9c15bb1d
BP
4663
4664 r = kvm_dev_ioctl_get_cpuid(&cpuid, cpuid_arg->entries,
4665 ioctl);
674eea0f
AK
4666 if (r)
4667 goto out;
4668
4669 r = -EFAULT;
0e96f31e 4670 if (copy_to_user(cpuid_arg, &cpuid, sizeof(cpuid)))
674eea0f
AK
4671 goto out;
4672 r = 0;
4673 break;
4674 }
cf6c26ec 4675 case KVM_X86_GET_MCE_CAP_SUPPORTED:
890ca9ae 4676 r = -EFAULT;
938c8745
SC
4677 if (copy_to_user(argp, &kvm_caps.supported_mce_cap,
4678 sizeof(kvm_caps.supported_mce_cap)))
890ca9ae
HY
4679 goto out;
4680 r = 0;
4681 break;
801e459a
TL
4682 case KVM_GET_MSR_FEATURE_INDEX_LIST: {
4683 struct kvm_msr_list __user *user_msr_list = argp;
4684 struct kvm_msr_list msr_list;
4685 unsigned int n;
4686
4687 r = -EFAULT;
4688 if (copy_from_user(&msr_list, user_msr_list, sizeof(msr_list)))
4689 goto out;
4690 n = msr_list.nmsrs;
4691 msr_list.nmsrs = num_msr_based_features;
4692 if (copy_to_user(user_msr_list, &msr_list, sizeof(msr_list)))
4693 goto out;
4694 r = -E2BIG;
4695 if (n < msr_list.nmsrs)
4696 goto out;
4697 r = -EFAULT;
4698 if (copy_to_user(user_msr_list->indices, &msr_based_features,
4699 num_msr_based_features * sizeof(u32)))
4700 goto out;
4701 r = 0;
4702 break;
4703 }
4704 case KVM_GET_MSRS:
4705 r = msr_io(NULL, argp, do_get_msr_feature, 1);
4706 break;
c21d54f0
VK
4707 case KVM_GET_SUPPORTED_HV_CPUID:
4708 r = kvm_ioctl_get_supported_hv_cpuid(NULL, argp);
4709 break;
dd6e6312
PB
4710 case KVM_GET_DEVICE_ATTR: {
4711 struct kvm_device_attr attr;
4712 r = -EFAULT;
4713 if (copy_from_user(&attr, (void __user *)arg, sizeof(attr)))
4714 break;
4715 r = kvm_x86_dev_get_attr(&attr);
4716 break;
4717 }
4718 case KVM_HAS_DEVICE_ATTR: {
4719 struct kvm_device_attr attr;
4720 r = -EFAULT;
4721 if (copy_from_user(&attr, (void __user *)arg, sizeof(attr)))
4722 break;
4723 r = kvm_x86_dev_has_attr(&attr);
4724 break;
4725 }
043405e1
CO
4726 default:
4727 r = -EINVAL;
cf6c26ec 4728 break;
043405e1
CO
4729 }
4730out:
4731 return r;
4732}
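/*
 * Illustrative sketch (not part of x86.c): the userspace calling convention
 * for KVM_GET_MSR_INDEX_LIST as handled above -- probe with nmsrs = 0 (the
 * kernel writes back the required count and fails with E2BIG), then resize
 * and retry. Assumes kvm_fd is an open /dev/kvm file descriptor.
 */
#include <linux/kvm.h>
#include <stdlib.h>
#include <sys/ioctl.h>

static struct kvm_msr_list *get_msr_index_list(int kvm_fd)
{
	struct kvm_msr_list probe = { .nmsrs = 0 };
	struct kvm_msr_list *list;

	/* Expected to fail with E2BIG; probe.nmsrs now holds the count. */
	ioctl(kvm_fd, KVM_GET_MSR_INDEX_LIST, &probe);

	list = calloc(1, sizeof(*list) + probe.nmsrs * sizeof(list->indices[0]));
	if (!list)
		return NULL;
	list->nmsrs = probe.nmsrs;

	if (ioctl(kvm_fd, KVM_GET_MSR_INDEX_LIST, list) < 0) {
		free(list);
		return NULL;
	}
	return list;
}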
4733
f5f48ee1
SY
4734static void wbinvd_ipi(void *garbage)
4735{
4736 wbinvd();
4737}
4738
4739static bool need_emulate_wbinvd(struct kvm_vcpu *vcpu)
4740{
e0f0bbc5 4741 return kvm_arch_has_noncoherent_dma(vcpu->kvm);
f5f48ee1
SY
4742}
4743
313a3dc7
CO
4744void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
4745{
f5f48ee1
SY
4746 /* Address WBINVD may be executed by guest */
4747 if (need_emulate_wbinvd(vcpu)) {
b3646477 4748 if (static_call(kvm_x86_has_wbinvd_exit)())
f5f48ee1
SY
4749 cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask);
4750 else if (vcpu->cpu != -1 && vcpu->cpu != cpu)
4751 smp_call_function_single(vcpu->cpu,
4752 wbinvd_ipi, NULL, 1);
4753 }
4754
b3646477 4755 static_call(kvm_x86_vcpu_load)(vcpu, cpu);
8f6055cb 4756
37486135
BM
4757 /* Save host pkru register if supported */
4758 vcpu->arch.host_pkru = read_pkru();
4759
0dd6a6ed
ZA
4760 /* Apply any externally detected TSC adjustments (due to suspend) */
4761 if (unlikely(vcpu->arch.tsc_offset_adjustment)) {
4762 adjust_tsc_offset_host(vcpu, vcpu->arch.tsc_offset_adjustment);
4763 vcpu->arch.tsc_offset_adjustment = 0;
105b21bb 4764 kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
0dd6a6ed 4765 }
8f6055cb 4766
b0c39dc6 4767 if (unlikely(vcpu->cpu != cpu) || kvm_check_tsc_unstable()) {
6f526ec5 4768 s64 tsc_delta = !vcpu->arch.last_host_tsc ? 0 :
4ea1636b 4769 rdtsc() - vcpu->arch.last_host_tsc;
e48672fa
ZA
4770 if (tsc_delta < 0)
4771 mark_tsc_unstable("KVM discovered backwards TSC");
ce7a058a 4772
b0c39dc6 4773 if (kvm_check_tsc_unstable()) {
9b399dfd 4774 u64 offset = kvm_compute_l1_tsc_offset(vcpu,
b183aa58 4775 vcpu->arch.last_guest_tsc);
a545ab6a 4776 kvm_vcpu_write_tsc_offset(vcpu, offset);
c285545f 4777 vcpu->arch.tsc_catchup = 1;
c285545f 4778 }
a749e247
PB
4779
4780 if (kvm_lapic_hv_timer_in_use(vcpu))
4781 kvm_lapic_restart_hv_timer(vcpu);
4782
d98d07ca
MT
4783 /*
4784 * On a host with synchronized TSC, there is no need to update
4785 * kvmclock on vcpu->cpu migration
4786 */
4787 if (!vcpu->kvm->arch.use_master_clock || vcpu->cpu == -1)
0061d53d 4788 kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu);
c285545f 4789 if (vcpu->cpu != cpu)
1bd2009e 4790 kvm_make_request(KVM_REQ_MIGRATE_TIMER, vcpu);
e48672fa 4791 vcpu->cpu = cpu;
6b7d7e76 4792 }
c9aaa895 4793
c9aaa895 4794 kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu);
313a3dc7
CO
4795}
4796
0b9f6c46
PX
4797static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu)
4798{
7e2175eb
DW
4799 struct gfn_to_hva_cache *ghc = &vcpu->arch.st.cache;
4800 struct kvm_steal_time __user *st;
4801 struct kvm_memslots *slots;
4802 static const u8 preempted = KVM_VCPU_PREEMPTED;
c3c28d24 4803 gpa_t gpa = vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS;
b0431382 4804
6cd88243
PB
4805 /*
4806 * The vCPU can be marked preempted if and only if the VM-Exit was on
4807 * an instruction boundary and will not trigger guest emulation of any
4808 * kind (see vcpu_run). Vendor specific code controls (conservatively)
4809 * when this is true, for example allowing the vCPU to be marked
4810 * preempted if and only if the VM-Exit was due to a host interrupt.
4811 */
4812 if (!vcpu->arch.at_instruction_boundary) {
4813 vcpu->stat.preemption_other++;
4814 return;
4815 }
4816
4817 vcpu->stat.preemption_reported++;
0b9f6c46
PX
4818 if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
4819 return;
4820
a6bd811f 4821 if (vcpu->arch.st.preempted)
8c6de56a
BO
4822 return;
4823
7e2175eb
DW
4824 /* This happens on process exit */
4825 if (unlikely(current->mm != vcpu->kvm->mm))
9c1a0744 4826 return;
b0431382 4827
7e2175eb
DW
4828 slots = kvm_memslots(vcpu->kvm);
4829
4830 if (unlikely(slots->generation != ghc->generation ||
c3c28d24 4831 gpa != ghc->gpa ||
7e2175eb 4832 kvm_is_error_hva(ghc->hva) || !ghc->memslot))
9c1a0744 4833 return;
b0431382 4834
7e2175eb
DW
4835 st = (struct kvm_steal_time __user *)ghc->hva;
4836 BUILD_BUG_ON(sizeof(st->preempted) != sizeof(preempted));
0b9f6c46 4837
7e2175eb
DW
4838 if (!copy_to_user_nofault(&st->preempted, &preempted, sizeof(preempted)))
4839 vcpu->arch.st.preempted = KVM_VCPU_PREEMPTED;
0b9f6c46 4840
7e2175eb 4841 mark_page_dirty_in_slot(vcpu->kvm, ghc->memslot, gpa_to_gfn(ghc->gpa));
0b9f6c46
PX
4842}
4843
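/*
 * Called via vcpu_put() when the vCPU is scheduled out or a vCPU ioctl
 * completes: record whether the guest was in kernel mode when preempted,
 * publish the preempted flag (Xen runstate or steal time), let vendor code
 * save its state, and stash the host TSC for the drift check on the next
 * kvm_arch_vcpu_load().
 */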
313a3dc7
CO
4844void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
4845{
9c1a0744
WL
4846 int idx;
4847
54aa83c9
PB
4848 if (vcpu->preempted) {
4849 if (!vcpu->arch.guest_state_protected)
4850 vcpu->arch.preempted_in_kernel = !static_call(kvm_x86_get_cpl)(vcpu);
de63ad4c 4851
54aa83c9
PB
4852 /*
4853 * Take the srcu lock as memslots will be accessed to check the gfn
4854 * cache generation against the memslots generation.
4855 */
4856 idx = srcu_read_lock(&vcpu->kvm->srcu);
4857 if (kvm_xen_msr_enabled(vcpu->kvm))
4858 kvm_xen_runstate_set_preempted(vcpu);
4859 else
4860 kvm_steal_time_set_preempted(vcpu);
4861 srcu_read_unlock(&vcpu->kvm->srcu, idx);
4862 }
30b5c851 4863
b3646477 4864 static_call(kvm_x86_vcpu_put)(vcpu);
4ea1636b 4865 vcpu->arch.last_host_tsc = rdtsc();
313a3dc7
CO
4866}
4867
313a3dc7
CO
4868static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
4869 struct kvm_lapic_state *s)
4870{
37c4dbf3 4871 static_call_cond(kvm_x86_sync_pir_to_irr)(vcpu);
d62caabb 4872
a92e2543 4873 return kvm_apic_get_state(vcpu, s);
313a3dc7
CO
4874}
4875
4876static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu,
4877 struct kvm_lapic_state *s)
4878{
a92e2543
RK
4879 int r;
4880
4881 r = kvm_apic_set_state(vcpu, s);
4882 if (r)
4883 return r;
cb142eb7 4884 update_cr8_intercept(vcpu);
313a3dc7
CO
4885
4886 return 0;
4887}
4888
127a457a
MG
4889static int kvm_cpu_accept_dm_intr(struct kvm_vcpu *vcpu)
4890{
71cc849b
PB
4891 /*
4892 * We can accept userspace's request for interrupt injection
4893 * as long as we have a place to store the interrupt number.
4894 * The actual injection will happen when the CPU is able to
4895 * deliver the interrupt.
4896 */
4897 if (kvm_cpu_has_extint(vcpu))
4898 return false;
4899
4900 /* Acknowledging ExtINT does not happen if LINT0 is masked. */
127a457a
MG
4901 return (!lapic_in_kernel(vcpu) ||
4902 kvm_apic_accept_pic_intr(vcpu));
4903}
4904
782d422b
MG
4905static int kvm_vcpu_ready_for_interrupt_injection(struct kvm_vcpu *vcpu)
4906{
fa7a549d
PB
4907 /*
4908 * Do not cause an interrupt window exit if an exception
4909 * is pending or an event needs reinjection; userspace
4910 * might want to inject the interrupt manually using KVM_SET_REGS
4911 * or KVM_SET_SREGS. For that to work, we must be at an
4912 * instruction boundary and with no events half-injected.
4913 */
4914 return (kvm_arch_interrupt_allowed(vcpu) &&
4915 kvm_cpu_accept_dm_intr(vcpu) &&
4916 !kvm_event_needs_reinjection(vcpu) &&
7709aba8 4917 !kvm_is_exception_pending(vcpu));
782d422b
MG
4918}
4919
f77bc6a4
ZX
4920static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
4921 struct kvm_interrupt *irq)
4922{
02cdb50f 4923 if (irq->irq >= KVM_NR_INTERRUPTS)
f77bc6a4 4924 return -EINVAL;
1c1a9ce9
SR
4925
4926 if (!irqchip_in_kernel(vcpu->kvm)) {
4927 kvm_queue_interrupt(vcpu, irq->irq, false);
4928 kvm_make_request(KVM_REQ_EVENT, vcpu);
4929 return 0;
4930 }
4931
4932 /*
4933 * With in-kernel LAPIC, we only use this to inject EXTINT, so
4934 * fail for in-kernel 8259.
4935 */
4936 if (pic_in_kernel(vcpu->kvm))
f77bc6a4 4937 return -ENXIO;
f77bc6a4 4938
1c1a9ce9
SR
4939 if (vcpu->arch.pending_external_vector != -1)
4940 return -EEXIST;
f77bc6a4 4941
1c1a9ce9 4942 vcpu->arch.pending_external_vector = irq->irq;
934bf653 4943 kvm_make_request(KVM_REQ_EVENT, vcpu);
f77bc6a4
ZX
4944 return 0;
4945}
4946
c4abb7c9
JK
4947static int kvm_vcpu_ioctl_nmi(struct kvm_vcpu *vcpu)
4948{
c4abb7c9 4949 kvm_inject_nmi(vcpu);
c4abb7c9
JK
4950
4951 return 0;
4952}
4953
b209749f
AK
4954static int vcpu_ioctl_tpr_access_reporting(struct kvm_vcpu *vcpu,
4955 struct kvm_tpr_access_ctl *tac)
4956{
4957 if (tac->flags)
4958 return -EINVAL;
4959 vcpu->arch.tpr_access_reporting = !!tac->enabled;
4960 return 0;
4961}
4962
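/*
 * KVM_X86_SETUP_MCE: sanity check the requested MCG_CAP (bank count and
 * capability bits) against what KVM supports, then initialize the emulated
 * banks: IA32_MCG_CTL and IA32_MCi_CTL to all 1s, IA32_MCi_CTL2 to 0.
 */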
890ca9ae
HY
4963static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu,
4964 u64 mcg_cap)
4965{
4966 int r;
4967 unsigned bank_num = mcg_cap & 0xff, bank;
4968
4969 r = -EINVAL;
c4e0e4ab 4970 if (!bank_num || bank_num > KVM_MAX_MCE_BANKS)
890ca9ae 4971 goto out;
938c8745 4972 if (mcg_cap & ~(kvm_caps.supported_mce_cap | 0xff | 0xff0000))
890ca9ae
HY
4973 goto out;
4974 r = 0;
4975 vcpu->arch.mcg_cap = mcg_cap;
4976 /* Init IA32_MCG_CTL to all 1s */
4977 if (mcg_cap & MCG_CTL_P)
4978 vcpu->arch.mcg_ctl = ~(u64)0;
281b5278
JW
4979 /* Init IA32_MCi_CTL to all 1s, IA32_MCi_CTL2 to all 0s */
4980 for (bank = 0; bank < bank_num; bank++) {
890ca9ae 4981 vcpu->arch.mce_banks[bank*4] = ~(u64)0;
281b5278
JW
4982 if (mcg_cap & MCG_CMCI_P)
4983 vcpu->arch.mci_ctl2_banks[bank] = 0;
4984 }
f83894b2
SC
4985
4986 kvm_apic_after_set_mcg_cap(vcpu);
c45dcc71 4987
b3646477 4988 static_call(kvm_x86_setup_mce)(vcpu);
890ca9ae
HY
4989out:
4990 return r;
4991}
4992
aebc3ca1
JW
4993/*
 4994 * Validate this is a UCNA (uncorrectable no action) error by checking the
4995 * MCG_STATUS and MCi_STATUS registers:
4996 * - none of the bits for Machine Check Exceptions are set
4997 * - both the VAL (valid) and UC (uncorrectable) bits are set
4998 * MCI_STATUS_PCC - Processor Context Corrupted
4999 * MCI_STATUS_S - Signaled as a Machine Check Exception
5000 * MCI_STATUS_AR - Software recoverable Action Required
5001 */
5002static bool is_ucna(struct kvm_x86_mce *mce)
5003{
5004 return !mce->mcg_status &&
5005 !(mce->status & (MCI_STATUS_PCC | MCI_STATUS_S | MCI_STATUS_AR)) &&
5006 (mce->status & MCI_STATUS_VAL) &&
5007 (mce->status & MCI_STATUS_UC);
5008}
5009
 5010static int kvm_vcpu_x86_set_ucna(struct kvm_vcpu *vcpu, struct kvm_x86_mce *mce, u64 *banks)
5011{
5012 u64 mcg_cap = vcpu->arch.mcg_cap;
5013
5014 banks[1] = mce->status;
5015 banks[2] = mce->addr;
5016 banks[3] = mce->misc;
5017 vcpu->arch.mcg_status = mce->mcg_status;
5018
5019 if (!(mcg_cap & MCG_CMCI_P) ||
5020 !(vcpu->arch.mci_ctl2_banks[mce->bank] & MCI_CTL2_CMCI_EN))
5021 return 0;
5022
5023 if (lapic_in_kernel(vcpu))
5024 kvm_apic_local_deliver(vcpu->arch.apic, APIC_LVTCMCI);
5025
5026 return 0;
5027}
5028
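/*
 * KVM_X86_SET_MCE: inject a machine-check event supplied by userspace.
 * UCNA errors are logged and, if enabled, signaled via CMCI; uncorrected
 * errors raise #MC, or request a triple fault when the guest cannot take a
 * machine check (MCIP already set or CR4.MCE clear); corrected errors are
 * only recorded in the bank registers.
 */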
890ca9ae
HY
5029static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu,
5030 struct kvm_x86_mce *mce)
5031{
5032 u64 mcg_cap = vcpu->arch.mcg_cap;
5033 unsigned bank_num = mcg_cap & 0xff;
5034 u64 *banks = vcpu->arch.mce_banks;
5035
5036 if (mce->bank >= bank_num || !(mce->status & MCI_STATUS_VAL))
5037 return -EINVAL;
aebc3ca1
JW
5038
5039 banks += array_index_nospec(4 * mce->bank, 4 * bank_num);
5040
5041 if (is_ucna(mce))
5042 return kvm_vcpu_x86_set_ucna(vcpu, mce, banks);
5043
890ca9ae
HY
5044 /*
5045 * if IA32_MCG_CTL is not all 1s, the uncorrected error
5046 * reporting is disabled
5047 */
5048 if ((mce->status & MCI_STATUS_UC) && (mcg_cap & MCG_CTL_P) &&
5049 vcpu->arch.mcg_ctl != ~(u64)0)
5050 return 0;
890ca9ae
HY
5051 /*
5052 * if IA32_MCi_CTL is not all 1s, the uncorrected error
5053 * reporting is disabled for the bank
5054 */
5055 if ((mce->status & MCI_STATUS_UC) && banks[0] != ~(u64)0)
5056 return 0;
5057 if (mce->status & MCI_STATUS_UC) {
5058 if ((vcpu->arch.mcg_status & MCG_STATUS_MCIP) ||
fc78f519 5059 !kvm_read_cr4_bits(vcpu, X86_CR4_MCE)) {
a8eeb04a 5060 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
890ca9ae
HY
5061 return 0;
5062 }
5063 if (banks[1] & MCI_STATUS_VAL)
5064 mce->status |= MCI_STATUS_OVER;
5065 banks[2] = mce->addr;
5066 banks[3] = mce->misc;
5067 vcpu->arch.mcg_status = mce->mcg_status;
5068 banks[1] = mce->status;
5069 kvm_queue_exception(vcpu, MC_VECTOR);
5070 } else if (!(banks[1] & MCI_STATUS_VAL)
5071 || !(banks[1] & MCI_STATUS_UC)) {
5072 if (banks[1] & MCI_STATUS_VAL)
5073 mce->status |= MCI_STATUS_OVER;
5074 banks[2] = mce->addr;
5075 banks[3] = mce->misc;
5076 banks[1] = mce->status;
5077 } else
5078 banks[1] |= MCI_STATUS_OVER;
5079 return 0;
5080}
5081
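/*
 * KVM_GET_VCPU_EVENTS: snapshot the vCPU's pending/injected exception,
 * interrupt, NMI, SMM and triple-fault state into the userspace ABI layout,
 * honoring KVM_CAP_EXCEPTION_PAYLOAD when reporting exceptions.
 */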
3cfc3092
JK
5082static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
5083 struct kvm_vcpu_events *events)
5084{
7709aba8 5085 struct kvm_queued_exception *ex;
d4963e31 5086
7460fb4a 5087 process_nmi(vcpu);
59073aaf 5088
cf7316d0 5089#ifdef CONFIG_KVM_SMM
1f7becf1
JZ
5090 if (kvm_check_request(KVM_REQ_SMI, vcpu))
5091 process_smi(vcpu);
cf7316d0 5092#endif
1f7becf1 5093
a06230b6 5094 /*
7709aba8
SC
5095 * KVM's ABI only allows for one exception to be migrated. Luckily,
5096 * the only time there can be two queued exceptions is if there's a
5097 * non-exiting _injected_ exception, and a pending exiting exception.
5098 * In that case, ignore the VM-Exiting exception as it's an extension
5099 * of the injected exception.
5100 */
5101 if (vcpu->arch.exception_vmexit.pending &&
5102 !vcpu->arch.exception.pending &&
5103 !vcpu->arch.exception.injected)
5104 ex = &vcpu->arch.exception_vmexit;
5105 else
5106 ex = &vcpu->arch.exception;
5107
a06230b6 5108 /*
d4963e31
SC
5109 * In guest mode, payload delivery should be deferred if the exception
 5110 * will be intercepted by L1, e.g. KVM should not modify CR2 if L1
5111 * intercepts #PF, ditto for DR6 and #DBs. If the per-VM capability,
5112 * KVM_CAP_EXCEPTION_PAYLOAD, is not set, userspace may or may not
5113 * propagate the payload and so it cannot be safely deferred. Deliver
5114 * the payload if the capability hasn't been requested.
a06230b6
OU
5115 */
5116 if (!vcpu->kvm->arch.exception_payload_enabled &&
d4963e31
SC
5117 ex->pending && ex->has_payload)
5118 kvm_deliver_exception_payload(vcpu, ex);
a06230b6 5119
85672346
PB
5120 memset(events, 0, sizeof(*events));
5121
664f8e26 5122 /*
59073aaf
JM
5123 * The API doesn't provide the instruction length for software
5124 * exceptions, so don't report them. As long as the guest RIP
5125 * isn't advanced, we should expect to encounter the exception
5126 * again.
664f8e26 5127 */
85672346 5128 if (!kvm_exception_is_soft(ex->vector)) {
d4963e31
SC
5129 events->exception.injected = ex->injected;
5130 events->exception.pending = ex->pending;
59073aaf
JM
5131 /*
5132 * For ABI compatibility, deliberately conflate
5133 * pending and injected exceptions when
5134 * KVM_CAP_EXCEPTION_PAYLOAD isn't enabled.
5135 */
5136 if (!vcpu->kvm->arch.exception_payload_enabled)
d4963e31 5137 events->exception.injected |= ex->pending;
59073aaf 5138 }
d4963e31
SC
5139 events->exception.nr = ex->vector;
5140 events->exception.has_error_code = ex->has_error_code;
5141 events->exception.error_code = ex->error_code;
5142 events->exception_has_payload = ex->has_payload;
5143 events->exception_payload = ex->payload;
3cfc3092 5144
03b82a30 5145 events->interrupt.injected =
04140b41 5146 vcpu->arch.interrupt.injected && !vcpu->arch.interrupt.soft;
3cfc3092 5147 events->interrupt.nr = vcpu->arch.interrupt.nr;
b3646477 5148 events->interrupt.shadow = static_call(kvm_x86_get_interrupt_shadow)(vcpu);
3cfc3092
JK
5149
5150 events->nmi.injected = vcpu->arch.nmi_injected;
7460fb4a 5151 events->nmi.pending = vcpu->arch.nmi_pending != 0;
b3646477 5152 events->nmi.masked = static_call(kvm_x86_get_nmi_mask)(vcpu);
3cfc3092 5153
85672346 5154 /* events->sipi_vector is never valid when reporting to user space */
3cfc3092 5155
a7662aa5 5156#ifdef CONFIG_KVM_SMM
f077825a
PB
5157 events->smi.smm = is_smm(vcpu);
5158 events->smi.pending = vcpu->arch.smi_pending;
5159 events->smi.smm_inside_nmi =
5160 !!(vcpu->arch.hflags & HF_SMM_INSIDE_NMI_MASK);
a7662aa5 5161#endif
f077825a
PB
5162 events->smi.latched_init = kvm_lapic_latched_init(vcpu);
5163
dab4b911 5164 events->flags = (KVM_VCPUEVENT_VALID_NMI_PENDING
f077825a
PB
5165 | KVM_VCPUEVENT_VALID_SHADOW
5166 | KVM_VCPUEVENT_VALID_SMM);
59073aaf
JM
5167 if (vcpu->kvm->arch.exception_payload_enabled)
5168 events->flags |= KVM_VCPUEVENT_VALID_PAYLOAD;
ed235117
CQ
5169 if (vcpu->kvm->arch.triple_fault_event) {
5170 events->triple_fault.pending = kvm_test_request(KVM_REQ_TRIPLE_FAULT, vcpu);
5171 events->flags |= KVM_VCPUEVENT_VALID_TRIPLE_FAULT;
5172 }
3cfc3092
JK
5173}
5174
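/*
 * KVM_SET_VCPU_EVENTS: the inverse of the getter above. Validates the flags
 * and event combinations before restoring the exception, interrupt, NMI,
 * SMM and triple-fault state supplied by userspace.
 */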
5175static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
5176 struct kvm_vcpu_events *events)
5177{
dab4b911 5178 if (events->flags & ~(KVM_VCPUEVENT_VALID_NMI_PENDING
48005f64 5179 | KVM_VCPUEVENT_VALID_SIPI_VECTOR
f077825a 5180 | KVM_VCPUEVENT_VALID_SHADOW
59073aaf 5181 | KVM_VCPUEVENT_VALID_SMM
ed235117
CQ
5182 | KVM_VCPUEVENT_VALID_PAYLOAD
5183 | KVM_VCPUEVENT_VALID_TRIPLE_FAULT))
3cfc3092
JK
5184 return -EINVAL;
5185
59073aaf
JM
5186 if (events->flags & KVM_VCPUEVENT_VALID_PAYLOAD) {
5187 if (!vcpu->kvm->arch.exception_payload_enabled)
5188 return -EINVAL;
5189 if (events->exception.pending)
5190 events->exception.injected = 0;
5191 else
5192 events->exception_has_payload = 0;
5193 } else {
5194 events->exception.pending = 0;
5195 events->exception_has_payload = 0;
5196 }
5197
5198 if ((events->exception.injected || events->exception.pending) &&
5199 (events->exception.nr > 31 || events->exception.nr == NMI_VECTOR))
78e546c8
PB
5200 return -EINVAL;
5201
28bf2888
DH
5202 /* INITs are latched while in SMM */
5203 if (events->flags & KVM_VCPUEVENT_VALID_SMM &&
5204 (events->smi.smm || events->smi.pending) &&
5205 vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED)
5206 return -EINVAL;
5207
7460fb4a 5208 process_nmi(vcpu);
7709aba8
SC
5209
5210 /*
 5211 * Flag that userspace is stuffing an exception; the next KVM_RUN will
 5212 * morph the exception to a VM-Exit if appropriate. Do this only for
 5213 * pending exceptions; already-injected exceptions are not subject to
 5214 * interception. Note, userspace that conflates pending and injected
5215 * is hosed, and will incorrectly convert an injected exception into a
5216 * pending exception, which in turn may cause a spurious VM-Exit.
5217 */
5218 vcpu->arch.exception_from_userspace = events->exception.pending;
5219
5220 vcpu->arch.exception_vmexit.pending = false;
5221
59073aaf
JM
5222 vcpu->arch.exception.injected = events->exception.injected;
5223 vcpu->arch.exception.pending = events->exception.pending;
d4963e31 5224 vcpu->arch.exception.vector = events->exception.nr;
3cfc3092
JK
5225 vcpu->arch.exception.has_error_code = events->exception.has_error_code;
5226 vcpu->arch.exception.error_code = events->exception.error_code;
59073aaf
JM
5227 vcpu->arch.exception.has_payload = events->exception_has_payload;
5228 vcpu->arch.exception.payload = events->exception_payload;
3cfc3092 5229
04140b41 5230 vcpu->arch.interrupt.injected = events->interrupt.injected;
3cfc3092
JK
5231 vcpu->arch.interrupt.nr = events->interrupt.nr;
5232 vcpu->arch.interrupt.soft = events->interrupt.soft;
48005f64 5233 if (events->flags & KVM_VCPUEVENT_VALID_SHADOW)
b3646477
JB
5234 static_call(kvm_x86_set_interrupt_shadow)(vcpu,
5235 events->interrupt.shadow);
3cfc3092
JK
5236
5237 vcpu->arch.nmi_injected = events->nmi.injected;
dab4b911
JK
5238 if (events->flags & KVM_VCPUEVENT_VALID_NMI_PENDING)
5239 vcpu->arch.nmi_pending = events->nmi.pending;
b3646477 5240 static_call(kvm_x86_set_nmi_mask)(vcpu, events->nmi.masked);
3cfc3092 5241
66450a21 5242 if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR &&
bce87cce 5243 lapic_in_kernel(vcpu))
66450a21 5244 vcpu->arch.apic->sipi_vector = events->sipi_vector;
3cfc3092 5245
f077825a 5246 if (events->flags & KVM_VCPUEVENT_VALID_SMM) {
4b8e1b32 5247#ifdef CONFIG_KVM_SMM
f7e57078 5248 if (!!(vcpu->arch.hflags & HF_SMM_MASK) != events->smi.smm) {
f9697df2 5249 kvm_leave_nested(vcpu);
dc87275f 5250 kvm_smm_changed(vcpu, events->smi.smm);
f7e57078 5251 }
6ef4e07e 5252
f077825a 5253 vcpu->arch.smi_pending = events->smi.pending;
f4ef1910
WL
5254
5255 if (events->smi.smm) {
5256 if (events->smi.smm_inside_nmi)
5257 vcpu->arch.hflags |= HF_SMM_INSIDE_NMI_MASK;
f077825a 5258 else
f4ef1910 5259 vcpu->arch.hflags &= ~HF_SMM_INSIDE_NMI_MASK;
ff90afa7
LA
5260 }
5261
4b8e1b32
PB
5262#else
5263 if (events->smi.smm || events->smi.pending ||
5264 events->smi.smm_inside_nmi)
5265 return -EINVAL;
5266#endif
5267
ff90afa7
LA
5268 if (lapic_in_kernel(vcpu)) {
5269 if (events->smi.latched_init)
5270 set_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events);
5271 else
5272 clear_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events);
f077825a
PB
5273 }
5274 }
5275
ed235117
CQ
5276 if (events->flags & KVM_VCPUEVENT_VALID_TRIPLE_FAULT) {
5277 if (!vcpu->kvm->arch.triple_fault_event)
5278 return -EINVAL;
5279 if (events->triple_fault.pending)
5280 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
5281 else
5282 kvm_clear_request(KVM_REQ_TRIPLE_FAULT, vcpu);
5283 }
5284
3842d135
AK
5285 kvm_make_request(KVM_REQ_EVENT, vcpu);
5286
3cfc3092
JK
5287 return 0;
5288}
5289
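/*
 * KVM_GET_DEBUGREGS / KVM_SET_DEBUGREGS: expose the guest debug registers
 * (DR0-DR3, DR6, DR7) to userspace; the set path validates DR6/DR7 before
 * updating the vCPU.
 */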
a1efbe77
JK
5290static void kvm_vcpu_ioctl_x86_get_debugregs(struct kvm_vcpu *vcpu,
5291 struct kvm_debugregs *dbgregs)
5292{
73aaf249
JK
5293 unsigned long val;
5294
2c10b614 5295 memset(dbgregs, 0, sizeof(*dbgregs));
a1efbe77 5296 memcpy(dbgregs->db, vcpu->arch.db, sizeof(vcpu->arch.db));
16f8a6f9 5297 kvm_get_dr(vcpu, 6, &val);
73aaf249 5298 dbgregs->dr6 = val;
a1efbe77 5299 dbgregs->dr7 = vcpu->arch.dr7;
a1efbe77
JK
5300}
5301
5302static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu,
5303 struct kvm_debugregs *dbgregs)
5304{
5305 if (dbgregs->flags)
5306 return -EINVAL;
5307
fd238002 5308 if (!kvm_dr6_valid(dbgregs->dr6))
d14bdb55 5309 return -EINVAL;
fd238002 5310 if (!kvm_dr7_valid(dbgregs->dr7))
d14bdb55
PB
5311 return -EINVAL;
5312
a1efbe77 5313 memcpy(vcpu->arch.db, dbgregs->db, sizeof(vcpu->arch.db));
ae561ede 5314 kvm_update_dr0123(vcpu);
a1efbe77
JK
5315 vcpu->arch.dr6 = dbgregs->dr6;
5316 vcpu->arch.dr7 = dbgregs->dr7;
9926c9fd 5317 kvm_update_dr7(vcpu);
a1efbe77 5318
a1efbe77
JK
5319 return 0;
5320}
5321
2d5b5a66
SY
5322static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu,
5323 struct kvm_xsave *guest_xsave)
5324{
d69c1382 5325 if (fpstate_is_confidential(&vcpu->arch.guest_fpu))
ed02b213
TL
5326 return;
5327
d69c1382
TG
5328 fpu_copy_guest_fpstate_to_uabi(&vcpu->arch.guest_fpu,
5329 guest_xsave->region,
5330 sizeof(guest_xsave->region),
5331 vcpu->arch.pkru);
2d5b5a66
SY
5332}
5333
be50b206
GZ
5334static void kvm_vcpu_ioctl_x86_get_xsave2(struct kvm_vcpu *vcpu,
5335 u8 *state, unsigned int size)
5336{
5337 if (fpstate_is_confidential(&vcpu->arch.guest_fpu))
5338 return;
5339
5340 fpu_copy_guest_fpstate_to_uabi(&vcpu->arch.guest_fpu,
5341 state, size, vcpu->arch.pkru);
5342}
5343
2d5b5a66
SY
5344static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu,
5345 struct kvm_xsave *guest_xsave)
5346{
d69c1382 5347 if (fpstate_is_confidential(&vcpu->arch.guest_fpu))
ed02b213
TL
5348 return 0;
5349
d69c1382
TG
5350 return fpu_copy_uabi_to_guest_fpstate(&vcpu->arch.guest_fpu,
5351 guest_xsave->region,
938c8745
SC
5352 kvm_caps.supported_xcr0,
5353 &vcpu->arch.pkru);
2d5b5a66
SY
5354}
5355
5356static void kvm_vcpu_ioctl_x86_get_xcrs(struct kvm_vcpu *vcpu,
5357 struct kvm_xcrs *guest_xcrs)
5358{
d366bf7e 5359 if (!boot_cpu_has(X86_FEATURE_XSAVE)) {
2d5b5a66
SY
5360 guest_xcrs->nr_xcrs = 0;
5361 return;
5362 }
5363
5364 guest_xcrs->nr_xcrs = 1;
5365 guest_xcrs->flags = 0;
5366 guest_xcrs->xcrs[0].xcr = XCR_XFEATURE_ENABLED_MASK;
5367 guest_xcrs->xcrs[0].value = vcpu->arch.xcr0;
5368}
5369
5370static int kvm_vcpu_ioctl_x86_set_xcrs(struct kvm_vcpu *vcpu,
5371 struct kvm_xcrs *guest_xcrs)
5372{
5373 int i, r = 0;
5374
d366bf7e 5375 if (!boot_cpu_has(X86_FEATURE_XSAVE))
2d5b5a66
SY
5376 return -EINVAL;
5377
5378 if (guest_xcrs->nr_xcrs > KVM_MAX_XCRS || guest_xcrs->flags)
5379 return -EINVAL;
5380
5381 for (i = 0; i < guest_xcrs->nr_xcrs; i++)
5382 /* Only support XCR0 currently */
c67a04cb 5383 if (guest_xcrs->xcrs[i].xcr == XCR_XFEATURE_ENABLED_MASK) {
2d5b5a66 5384 r = __kvm_set_xcr(vcpu, XCR_XFEATURE_ENABLED_MASK,
c67a04cb 5385 guest_xcrs->xcrs[i].value);
2d5b5a66
SY
5386 break;
5387 }
5388 if (r)
5389 r = -EINVAL;
5390 return r;
5391}
5392
1c0b28c2
EM
5393/*
5394 * kvm_set_guest_paused() indicates to the guest kernel that it has been
5395 * stopped by the hypervisor. This function will be called from the host only.
5396 * EINVAL is returned when the host attempts to set the flag for a guest that
5397 * does not support pv clocks.
5398 */
5399static int kvm_set_guest_paused(struct kvm_vcpu *vcpu)
5400{
916d3608 5401 if (!vcpu->arch.pv_time.active)
1c0b28c2 5402 return -EINVAL;
51d59c6b 5403 vcpu->arch.pvclock_set_guest_stopped_request = true;
1c0b28c2
EM
5404 kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
5405 return 0;
5406}
5407
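/*
 * Per-vCPU device attributes in the KVM_VCPU_TSC_CTRL group. Only
 * KVM_VCPU_TSC_OFFSET is implemented: the getter reports the current L1 TSC
 * offset, the setter feeds a new offset through __kvm_synchronize_tsc().
 */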
828ca896
OU
5408static int kvm_arch_tsc_has_attr(struct kvm_vcpu *vcpu,
5409 struct kvm_device_attr *attr)
5410{
5411 int r;
5412
5413 switch (attr->attr) {
5414 case KVM_VCPU_TSC_OFFSET:
5415 r = 0;
5416 break;
5417 default:
5418 r = -ENXIO;
5419 }
5420
5421 return r;
5422}
5423
5424static int kvm_arch_tsc_get_attr(struct kvm_vcpu *vcpu,
5425 struct kvm_device_attr *attr)
5426{
56f289a8 5427 u64 __user *uaddr = kvm_get_attr_addr(attr);
828ca896
OU
5428 int r;
5429
56f289a8
SC
5430 if (IS_ERR(uaddr))
5431 return PTR_ERR(uaddr);
828ca896
OU
5432
5433 switch (attr->attr) {
5434 case KVM_VCPU_TSC_OFFSET:
5435 r = -EFAULT;
5436 if (put_user(vcpu->arch.l1_tsc_offset, uaddr))
5437 break;
5438 r = 0;
5439 break;
5440 default:
5441 r = -ENXIO;
5442 }
5443
5444 return r;
5445}
5446
5447static int kvm_arch_tsc_set_attr(struct kvm_vcpu *vcpu,
5448 struct kvm_device_attr *attr)
5449{
56f289a8 5450 u64 __user *uaddr = kvm_get_attr_addr(attr);
828ca896
OU
5451 struct kvm *kvm = vcpu->kvm;
5452 int r;
5453
56f289a8
SC
5454 if (IS_ERR(uaddr))
5455 return PTR_ERR(uaddr);
828ca896
OU
5456
5457 switch (attr->attr) {
5458 case KVM_VCPU_TSC_OFFSET: {
5459 u64 offset, tsc, ns;
5460 unsigned long flags;
5461 bool matched;
5462
5463 r = -EFAULT;
5464 if (get_user(offset, uaddr))
5465 break;
5466
5467 raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
5468
5469 matched = (vcpu->arch.virtual_tsc_khz &&
5470 kvm->arch.last_tsc_khz == vcpu->arch.virtual_tsc_khz &&
5471 kvm->arch.last_tsc_offset == offset);
5472
62711e5a 5473 tsc = kvm_scale_tsc(rdtsc(), vcpu->arch.l1_tsc_scaling_ratio) + offset;
828ca896
OU
5474 ns = get_kvmclock_base_ns();
5475
5476 __kvm_synchronize_tsc(vcpu, offset, tsc, ns, matched);
5477 raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
5478
5479 r = 0;
5480 break;
5481 }
5482 default:
5483 r = -ENXIO;
5484 }
5485
5486 return r;
5487}
5488
5489static int kvm_vcpu_ioctl_device_attr(struct kvm_vcpu *vcpu,
5490 unsigned int ioctl,
5491 void __user *argp)
5492{
5493 struct kvm_device_attr attr;
5494 int r;
5495
5496 if (copy_from_user(&attr, argp, sizeof(attr)))
5497 return -EFAULT;
5498
5499 if (attr.group != KVM_VCPU_TSC_CTRL)
5500 return -ENXIO;
5501
5502 switch (ioctl) {
5503 case KVM_HAS_DEVICE_ATTR:
5504 r = kvm_arch_tsc_has_attr(vcpu, &attr);
5505 break;
5506 case KVM_GET_DEVICE_ATTR:
5507 r = kvm_arch_tsc_get_attr(vcpu, &attr);
5508 break;
5509 case KVM_SET_DEVICE_ATTR:
5510 r = kvm_arch_tsc_set_attr(vcpu, &attr);
5511 break;
5512 }
5513
5514 return r;
5515}
5516
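/*
 * KVM_ENABLE_CAP on a vCPU fd: per-vCPU opt-ins such as the Hyper-V
 * SynIC/enlightened VMCS/TLB-flush features and PV CPUID enforcement.
 */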
5c919412
AS
5517static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
5518 struct kvm_enable_cap *cap)
5519{
57b119da
VK
5520 int r;
5521 uint16_t vmcs_version;
5522 void __user *user_ptr;
5523
5c919412
AS
5524 if (cap->flags)
5525 return -EINVAL;
5526
5527 switch (cap->cap) {
efc479e6
RK
5528 case KVM_CAP_HYPERV_SYNIC2:
5529 if (cap->args[0])
5530 return -EINVAL;
df561f66 5531 fallthrough;
b2869f28 5532
5c919412 5533 case KVM_CAP_HYPERV_SYNIC:
546d87e5
WL
5534 if (!irqchip_in_kernel(vcpu->kvm))
5535 return -EINVAL;
efc479e6
RK
5536 return kvm_hv_activate_synic(vcpu, cap->cap ==
5537 KVM_CAP_HYPERV_SYNIC2);
57b119da 5538 case KVM_CAP_HYPERV_ENLIGHTENED_VMCS:
33b22172 5539 if (!kvm_x86_ops.nested_ops->enable_evmcs)
5158917c 5540 return -ENOTTY;
33b22172 5541 r = kvm_x86_ops.nested_ops->enable_evmcs(vcpu, &vmcs_version);
57b119da
VK
5542 if (!r) {
5543 user_ptr = (void __user *)(uintptr_t)cap->args[0];
5544 if (copy_to_user(user_ptr, &vmcs_version,
5545 sizeof(vmcs_version)))
5546 r = -EFAULT;
5547 }
5548 return r;
344c6c80 5549 case KVM_CAP_HYPERV_DIRECT_TLBFLUSH:
b83237ad 5550 if (!kvm_x86_ops.enable_l2_tlb_flush)
344c6c80
TL
5551 return -ENOTTY;
5552
b83237ad 5553 return static_call(kvm_x86_enable_l2_tlb_flush)(vcpu);
57b119da 5554
644f7067
VK
5555 case KVM_CAP_HYPERV_ENFORCE_CPUID:
5556 return kvm_hv_set_enforce_cpuid(vcpu, cap->args[0]);
5557
66570e96
OU
5558 case KVM_CAP_ENFORCE_PV_FEATURE_CPUID:
5559 vcpu->arch.pv_cpuid.enforce = cap->args[0];
01b4f510
OU
5560 if (vcpu->arch.pv_cpuid.enforce)
5561 kvm_update_pv_runtime(vcpu);
66570e96
OU
5562
5563 return 0;
5c919412
AS
5564 default:
5565 return -EINVAL;
5566 }
5567}
5568
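/*
 * Dispatcher for the x86-specific vCPU ioctls. The vCPU is loaded for the
 * duration of the call; temporary buffers live in the union below and are
 * freed on the common exit path.
 */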
313a3dc7
CO
5569long kvm_arch_vcpu_ioctl(struct file *filp,
5570 unsigned int ioctl, unsigned long arg)
5571{
5572 struct kvm_vcpu *vcpu = filp->private_data;
5573 void __user *argp = (void __user *)arg;
5574 int r;
d1ac91d8 5575 union {
6dba9403 5576 struct kvm_sregs2 *sregs2;
d1ac91d8
AK
5577 struct kvm_lapic_state *lapic;
5578 struct kvm_xsave *xsave;
5579 struct kvm_xcrs *xcrs;
5580 void *buffer;
5581 } u;
5582
9b062471
CD
5583 vcpu_load(vcpu);
5584
d1ac91d8 5585 u.buffer = NULL;
313a3dc7
CO
5586 switch (ioctl) {
5587 case KVM_GET_LAPIC: {
2204ae3c 5588 r = -EINVAL;
bce87cce 5589 if (!lapic_in_kernel(vcpu))
2204ae3c 5590 goto out;
254272ce
BG
5591 u.lapic = kzalloc(sizeof(struct kvm_lapic_state),
5592 GFP_KERNEL_ACCOUNT);
313a3dc7 5593
b772ff36 5594 r = -ENOMEM;
d1ac91d8 5595 if (!u.lapic)
b772ff36 5596 goto out;
d1ac91d8 5597 r = kvm_vcpu_ioctl_get_lapic(vcpu, u.lapic);
313a3dc7
CO
5598 if (r)
5599 goto out;
5600 r = -EFAULT;
d1ac91d8 5601 if (copy_to_user(argp, u.lapic, sizeof(struct kvm_lapic_state)))
313a3dc7
CO
5602 goto out;
5603 r = 0;
5604 break;
5605 }
5606 case KVM_SET_LAPIC: {
2204ae3c 5607 r = -EINVAL;
bce87cce 5608 if (!lapic_in_kernel(vcpu))
2204ae3c 5609 goto out;
ff5c2c03 5610 u.lapic = memdup_user(argp, sizeof(*u.lapic));
9b062471
CD
5611 if (IS_ERR(u.lapic)) {
5612 r = PTR_ERR(u.lapic);
5613 goto out_nofree;
5614 }
ff5c2c03 5615
d1ac91d8 5616 r = kvm_vcpu_ioctl_set_lapic(vcpu, u.lapic);
313a3dc7
CO
5617 break;
5618 }
f77bc6a4
ZX
5619 case KVM_INTERRUPT: {
5620 struct kvm_interrupt irq;
5621
5622 r = -EFAULT;
0e96f31e 5623 if (copy_from_user(&irq, argp, sizeof(irq)))
f77bc6a4
ZX
5624 goto out;
5625 r = kvm_vcpu_ioctl_interrupt(vcpu, &irq);
f77bc6a4
ZX
5626 break;
5627 }
c4abb7c9
JK
5628 case KVM_NMI: {
5629 r = kvm_vcpu_ioctl_nmi(vcpu);
c4abb7c9
JK
5630 break;
5631 }
f077825a 5632 case KVM_SMI: {
b0b42197 5633 r = kvm_inject_smi(vcpu);
f077825a
PB
5634 break;
5635 }
313a3dc7
CO
5636 case KVM_SET_CPUID: {
5637 struct kvm_cpuid __user *cpuid_arg = argp;
5638 struct kvm_cpuid cpuid;
5639
5640 r = -EFAULT;
0e96f31e 5641 if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid)))
313a3dc7
CO
5642 goto out;
5643 r = kvm_vcpu_ioctl_set_cpuid(vcpu, &cpuid, cpuid_arg->entries);
313a3dc7
CO
5644 break;
5645 }
07716717
DK
5646 case KVM_SET_CPUID2: {
5647 struct kvm_cpuid2 __user *cpuid_arg = argp;
5648 struct kvm_cpuid2 cpuid;
5649
5650 r = -EFAULT;
0e96f31e 5651 if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid)))
07716717
DK
5652 goto out;
5653 r = kvm_vcpu_ioctl_set_cpuid2(vcpu, &cpuid,
19355475 5654 cpuid_arg->entries);
07716717
DK
5655 break;
5656 }
5657 case KVM_GET_CPUID2: {
5658 struct kvm_cpuid2 __user *cpuid_arg = argp;
5659 struct kvm_cpuid2 cpuid;
5660
5661 r = -EFAULT;
0e96f31e 5662 if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid)))
07716717
DK
5663 goto out;
5664 r = kvm_vcpu_ioctl_get_cpuid2(vcpu, &cpuid,
19355475 5665 cpuid_arg->entries);
07716717
DK
5666 if (r)
5667 goto out;
5668 r = -EFAULT;
0e96f31e 5669 if (copy_to_user(cpuid_arg, &cpuid, sizeof(cpuid)))
07716717
DK
5670 goto out;
5671 r = 0;
5672 break;
5673 }
801e459a
TL
5674 case KVM_GET_MSRS: {
5675 int idx = srcu_read_lock(&vcpu->kvm->srcu);
609e36d3 5676 r = msr_io(vcpu, argp, do_get_msr, 1);
801e459a 5677 srcu_read_unlock(&vcpu->kvm->srcu, idx);
313a3dc7 5678 break;
801e459a
TL
5679 }
5680 case KVM_SET_MSRS: {
5681 int idx = srcu_read_lock(&vcpu->kvm->srcu);
313a3dc7 5682 r = msr_io(vcpu, argp, do_set_msr, 0);
801e459a 5683 srcu_read_unlock(&vcpu->kvm->srcu, idx);
313a3dc7 5684 break;
801e459a 5685 }
b209749f
AK
5686 case KVM_TPR_ACCESS_REPORTING: {
5687 struct kvm_tpr_access_ctl tac;
5688
5689 r = -EFAULT;
0e96f31e 5690 if (copy_from_user(&tac, argp, sizeof(tac)))
b209749f
AK
5691 goto out;
5692 r = vcpu_ioctl_tpr_access_reporting(vcpu, &tac);
5693 if (r)
5694 goto out;
5695 r = -EFAULT;
0e96f31e 5696 if (copy_to_user(argp, &tac, sizeof(tac)))
b209749f
AK
5697 goto out;
5698 r = 0;
5699 break;
5700 };
b93463aa
AK
5701 case KVM_SET_VAPIC_ADDR: {
5702 struct kvm_vapic_addr va;
7301d6ab 5703 int idx;
b93463aa
AK
5704
5705 r = -EINVAL;
35754c98 5706 if (!lapic_in_kernel(vcpu))
b93463aa
AK
5707 goto out;
5708 r = -EFAULT;
0e96f31e 5709 if (copy_from_user(&va, argp, sizeof(va)))
b93463aa 5710 goto out;
7301d6ab 5711 idx = srcu_read_lock(&vcpu->kvm->srcu);
fda4e2e8 5712 r = kvm_lapic_set_vapic_addr(vcpu, va.vapic_addr);
7301d6ab 5713 srcu_read_unlock(&vcpu->kvm->srcu, idx);
b93463aa
AK
5714 break;
5715 }
890ca9ae
HY
5716 case KVM_X86_SETUP_MCE: {
5717 u64 mcg_cap;
5718
5719 r = -EFAULT;
0e96f31e 5720 if (copy_from_user(&mcg_cap, argp, sizeof(mcg_cap)))
890ca9ae
HY
5721 goto out;
5722 r = kvm_vcpu_ioctl_x86_setup_mce(vcpu, mcg_cap);
5723 break;
5724 }
5725 case KVM_X86_SET_MCE: {
5726 struct kvm_x86_mce mce;
5727
5728 r = -EFAULT;
0e96f31e 5729 if (copy_from_user(&mce, argp, sizeof(mce)))
890ca9ae
HY
5730 goto out;
5731 r = kvm_vcpu_ioctl_x86_set_mce(vcpu, &mce);
5732 break;
5733 }
3cfc3092
JK
5734 case KVM_GET_VCPU_EVENTS: {
5735 struct kvm_vcpu_events events;
5736
5737 kvm_vcpu_ioctl_x86_get_vcpu_events(vcpu, &events);
5738
5739 r = -EFAULT;
5740 if (copy_to_user(argp, &events, sizeof(struct kvm_vcpu_events)))
5741 break;
5742 r = 0;
5743 break;
5744 }
5745 case KVM_SET_VCPU_EVENTS: {
5746 struct kvm_vcpu_events events;
5747
5748 r = -EFAULT;
5749 if (copy_from_user(&events, argp, sizeof(struct kvm_vcpu_events)))
5750 break;
5751
5752 r = kvm_vcpu_ioctl_x86_set_vcpu_events(vcpu, &events);
5753 break;
5754 }
a1efbe77
JK
5755 case KVM_GET_DEBUGREGS: {
5756 struct kvm_debugregs dbgregs;
5757
5758 kvm_vcpu_ioctl_x86_get_debugregs(vcpu, &dbgregs);
5759
5760 r = -EFAULT;
5761 if (copy_to_user(argp, &dbgregs,
5762 sizeof(struct kvm_debugregs)))
5763 break;
5764 r = 0;
5765 break;
5766 }
5767 case KVM_SET_DEBUGREGS: {
5768 struct kvm_debugregs dbgregs;
5769
5770 r = -EFAULT;
5771 if (copy_from_user(&dbgregs, argp,
5772 sizeof(struct kvm_debugregs)))
5773 break;
5774
5775 r = kvm_vcpu_ioctl_x86_set_debugregs(vcpu, &dbgregs);
5776 break;
5777 }
2d5b5a66 5778 case KVM_GET_XSAVE: {
be50b206
GZ
5779 r = -EINVAL;
5780 if (vcpu->arch.guest_fpu.uabi_size > sizeof(struct kvm_xsave))
5781 break;
5782
254272ce 5783 u.xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL_ACCOUNT);
2d5b5a66 5784 r = -ENOMEM;
d1ac91d8 5785 if (!u.xsave)
2d5b5a66
SY
5786 break;
5787
d1ac91d8 5788 kvm_vcpu_ioctl_x86_get_xsave(vcpu, u.xsave);
2d5b5a66
SY
5789
5790 r = -EFAULT;
d1ac91d8 5791 if (copy_to_user(argp, u.xsave, sizeof(struct kvm_xsave)))
2d5b5a66
SY
5792 break;
5793 r = 0;
5794 break;
5795 }
5796 case KVM_SET_XSAVE: {
be50b206
GZ
5797 int size = vcpu->arch.guest_fpu.uabi_size;
5798
5799 u.xsave = memdup_user(argp, size);
9b062471
CD
5800 if (IS_ERR(u.xsave)) {
5801 r = PTR_ERR(u.xsave);
5802 goto out_nofree;
5803 }
2d5b5a66 5804
d1ac91d8 5805 r = kvm_vcpu_ioctl_x86_set_xsave(vcpu, u.xsave);
2d5b5a66
SY
5806 break;
5807 }
be50b206
GZ
5808
5809 case KVM_GET_XSAVE2: {
5810 int size = vcpu->arch.guest_fpu.uabi_size;
5811
5812 u.xsave = kzalloc(size, GFP_KERNEL_ACCOUNT);
5813 r = -ENOMEM;
5814 if (!u.xsave)
5815 break;
5816
5817 kvm_vcpu_ioctl_x86_get_xsave2(vcpu, u.buffer, size);
5818
5819 r = -EFAULT;
5820 if (copy_to_user(argp, u.xsave, size))
5821 break;
5822
5823 r = 0;
5824 break;
5825 }
5826
2d5b5a66 5827 case KVM_GET_XCRS: {
254272ce 5828 u.xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL_ACCOUNT);
2d5b5a66 5829 r = -ENOMEM;
d1ac91d8 5830 if (!u.xcrs)
2d5b5a66
SY
5831 break;
5832
d1ac91d8 5833 kvm_vcpu_ioctl_x86_get_xcrs(vcpu, u.xcrs);
2d5b5a66
SY
5834
5835 r = -EFAULT;
d1ac91d8 5836 if (copy_to_user(argp, u.xcrs,
2d5b5a66
SY
5837 sizeof(struct kvm_xcrs)))
5838 break;
5839 r = 0;
5840 break;
5841 }
5842 case KVM_SET_XCRS: {
ff5c2c03 5843 u.xcrs = memdup_user(argp, sizeof(*u.xcrs));
9b062471
CD
5844 if (IS_ERR(u.xcrs)) {
5845 r = PTR_ERR(u.xcrs);
5846 goto out_nofree;
5847 }
2d5b5a66 5848
d1ac91d8 5849 r = kvm_vcpu_ioctl_x86_set_xcrs(vcpu, u.xcrs);
2d5b5a66
SY
5850 break;
5851 }
92a1f12d
JR
5852 case KVM_SET_TSC_KHZ: {
5853 u32 user_tsc_khz;
5854
5855 r = -EINVAL;
92a1f12d
JR
5856 user_tsc_khz = (u32)arg;
5857
938c8745
SC
5858 if (kvm_caps.has_tsc_control &&
5859 user_tsc_khz >= kvm_caps.max_guest_tsc_khz)
92a1f12d
JR
5860 goto out;
5861
cc578287
ZA
5862 if (user_tsc_khz == 0)
5863 user_tsc_khz = tsc_khz;
5864
381d585c
HZ
5865 if (!kvm_set_tsc_khz(vcpu, user_tsc_khz))
5866 r = 0;
92a1f12d 5867
92a1f12d
JR
5868 goto out;
5869 }
5870 case KVM_GET_TSC_KHZ: {
cc578287 5871 r = vcpu->arch.virtual_tsc_khz;
92a1f12d
JR
5872 goto out;
5873 }
1c0b28c2
EM
5874 case KVM_KVMCLOCK_CTRL: {
5875 r = kvm_set_guest_paused(vcpu);
5876 goto out;
5877 }
5c919412
AS
5878 case KVM_ENABLE_CAP: {
5879 struct kvm_enable_cap cap;
5880
5881 r = -EFAULT;
5882 if (copy_from_user(&cap, argp, sizeof(cap)))
5883 goto out;
5884 r = kvm_vcpu_ioctl_enable_cap(vcpu, &cap);
5885 break;
5886 }
8fcc4b59
JM
5887 case KVM_GET_NESTED_STATE: {
5888 struct kvm_nested_state __user *user_kvm_nested_state = argp;
5889 u32 user_data_size;
5890
5891 r = -EINVAL;
33b22172 5892 if (!kvm_x86_ops.nested_ops->get_state)
8fcc4b59
JM
5893 break;
5894
5895 BUILD_BUG_ON(sizeof(user_data_size) != sizeof(user_kvm_nested_state->size));
26b471c7 5896 r = -EFAULT;
8fcc4b59 5897 if (get_user(user_data_size, &user_kvm_nested_state->size))
26b471c7 5898 break;
8fcc4b59 5899
33b22172
PB
5900 r = kvm_x86_ops.nested_ops->get_state(vcpu, user_kvm_nested_state,
5901 user_data_size);
8fcc4b59 5902 if (r < 0)
26b471c7 5903 break;
8fcc4b59
JM
5904
5905 if (r > user_data_size) {
5906 if (put_user(r, &user_kvm_nested_state->size))
26b471c7
LA
5907 r = -EFAULT;
5908 else
5909 r = -E2BIG;
5910 break;
8fcc4b59 5911 }
26b471c7 5912
8fcc4b59
JM
5913 r = 0;
5914 break;
5915 }
5916 case KVM_SET_NESTED_STATE: {
5917 struct kvm_nested_state __user *user_kvm_nested_state = argp;
5918 struct kvm_nested_state kvm_state;
ad5996d9 5919 int idx;
8fcc4b59
JM
5920
5921 r = -EINVAL;
33b22172 5922 if (!kvm_x86_ops.nested_ops->set_state)
8fcc4b59
JM
5923 break;
5924
26b471c7 5925 r = -EFAULT;
8fcc4b59 5926 if (copy_from_user(&kvm_state, user_kvm_nested_state, sizeof(kvm_state)))
26b471c7 5927 break;
8fcc4b59 5928
26b471c7 5929 r = -EINVAL;
8fcc4b59 5930 if (kvm_state.size < sizeof(kvm_state))
26b471c7 5931 break;
8fcc4b59
JM
5932
5933 if (kvm_state.flags &
8cab6507 5934 ~(KVM_STATE_NESTED_RUN_PENDING | KVM_STATE_NESTED_GUEST_MODE
cc440cda
PB
5935 | KVM_STATE_NESTED_EVMCS | KVM_STATE_NESTED_MTF_PENDING
5936 | KVM_STATE_NESTED_GIF_SET))
26b471c7 5937 break;
8fcc4b59
JM
5938
5939 /* nested_run_pending implies guest_mode. */
8cab6507
VK
5940 if ((kvm_state.flags & KVM_STATE_NESTED_RUN_PENDING)
5941 && !(kvm_state.flags & KVM_STATE_NESTED_GUEST_MODE))
26b471c7 5942 break;
8fcc4b59 5943
ad5996d9 5944 idx = srcu_read_lock(&vcpu->kvm->srcu);
33b22172 5945 r = kvm_x86_ops.nested_ops->set_state(vcpu, user_kvm_nested_state, &kvm_state);
ad5996d9 5946 srcu_read_unlock(&vcpu->kvm->srcu, idx);
8fcc4b59
JM
5947 break;
5948 }
c21d54f0
VK
5949 case KVM_GET_SUPPORTED_HV_CPUID:
5950 r = kvm_ioctl_get_supported_hv_cpuid(vcpu, argp);
2bc39970 5951 break;
b59b153d 5952#ifdef CONFIG_KVM_XEN
3e324615
DW
5953 case KVM_XEN_VCPU_GET_ATTR: {
5954 struct kvm_xen_vcpu_attr xva;
5955
5956 r = -EFAULT;
5957 if (copy_from_user(&xva, argp, sizeof(xva)))
5958 goto out;
5959 r = kvm_xen_vcpu_get_attr(vcpu, &xva);
5960 if (!r && copy_to_user(argp, &xva, sizeof(xva)))
5961 r = -EFAULT;
5962 break;
5963 }
5964 case KVM_XEN_VCPU_SET_ATTR: {
5965 struct kvm_xen_vcpu_attr xva;
5966
5967 r = -EFAULT;
5968 if (copy_from_user(&xva, argp, sizeof(xva)))
5969 goto out;
5970 r = kvm_xen_vcpu_set_attr(vcpu, &xva);
5971 break;
5972 }
b59b153d 5973#endif
6dba9403
ML
5974 case KVM_GET_SREGS2: {
5975 u.sregs2 = kzalloc(sizeof(struct kvm_sregs2), GFP_KERNEL);
5976 r = -ENOMEM;
5977 if (!u.sregs2)
5978 goto out;
5979 __get_sregs2(vcpu, u.sregs2);
5980 r = -EFAULT;
5981 if (copy_to_user(argp, u.sregs2, sizeof(struct kvm_sregs2)))
5982 goto out;
5983 r = 0;
5984 break;
5985 }
5986 case KVM_SET_SREGS2: {
5987 u.sregs2 = memdup_user(argp, sizeof(struct kvm_sregs2));
5988 if (IS_ERR(u.sregs2)) {
5989 r = PTR_ERR(u.sregs2);
5990 u.sregs2 = NULL;
5991 goto out;
5992 }
5993 r = __set_sregs2(vcpu, u.sregs2);
5994 break;
5995 }
828ca896
OU
5996 case KVM_HAS_DEVICE_ATTR:
5997 case KVM_GET_DEVICE_ATTR:
5998 case KVM_SET_DEVICE_ATTR:
5999 r = kvm_vcpu_ioctl_device_attr(vcpu, ioctl, argp);
6000 break;
313a3dc7
CO
6001 default:
6002 r = -EINVAL;
6003 }
6004out:
d1ac91d8 6005 kfree(u.buffer);
9b062471
CD
6006out_nofree:
6007 vcpu_put(vcpu);
313a3dc7
CO
6008 return r;
6009}
6010
1499fa80 6011vm_fault_t kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf)
5b1c1493
CO
6012{
6013 return VM_FAULT_SIGBUS;
6014}
6015
1fe779f8
CO
6016static int kvm_vm_ioctl_set_tss_addr(struct kvm *kvm, unsigned long addr)
6017{
6018 int ret;
6019
6020 if (addr > (unsigned int)(-3 * PAGE_SIZE))
951179ce 6021 return -EINVAL;
b3646477 6022 ret = static_call(kvm_x86_set_tss_addr)(kvm, addr);
1fe779f8
CO
6023 return ret;
6024}
6025
b927a3ce
SY
6026static int kvm_vm_ioctl_set_identity_map_addr(struct kvm *kvm,
6027 u64 ident_addr)
6028{
b3646477 6029 return static_call(kvm_x86_set_identity_map_addr)(kvm, ident_addr);
b927a3ce
SY
6030}
6031
1fe779f8 6032static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm,
bc8a3d89 6033 unsigned long kvm_nr_mmu_pages)
1fe779f8
CO
6034{
6035 if (kvm_nr_mmu_pages < KVM_MIN_ALLOC_MMU_PAGES)
6036 return -EINVAL;
6037
79fac95e 6038 mutex_lock(&kvm->slots_lock);
1fe779f8
CO
6039
6040 kvm_mmu_change_mmu_pages(kvm, kvm_nr_mmu_pages);
f05e70ac 6041 kvm->arch.n_requested_mmu_pages = kvm_nr_mmu_pages;
1fe779f8 6042
79fac95e 6043 mutex_unlock(&kvm->slots_lock);
1fe779f8
CO
6044 return 0;
6045}
6046
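/*
 * KVM_GET_IRQCHIP / KVM_SET_IRQCHIP: copy the in-kernel PIC or IOAPIC state
 * to or from userspace, taking the PIC lock around updates.
 */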
1fe779f8
CO
6047static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
6048{
90bca052 6049 struct kvm_pic *pic = kvm->arch.vpic;
1fe779f8
CO
6050 int r;
6051
6052 r = 0;
6053 switch (chip->chip_id) {
6054 case KVM_IRQCHIP_PIC_MASTER:
90bca052 6055 memcpy(&chip->chip.pic, &pic->pics[0],
1fe779f8
CO
6056 sizeof(struct kvm_pic_state));
6057 break;
6058 case KVM_IRQCHIP_PIC_SLAVE:
90bca052 6059 memcpy(&chip->chip.pic, &pic->pics[1],
1fe779f8
CO
6060 sizeof(struct kvm_pic_state));
6061 break;
6062 case KVM_IRQCHIP_IOAPIC:
33392b49 6063 kvm_get_ioapic(kvm, &chip->chip.ioapic);
1fe779f8
CO
6064 break;
6065 default:
6066 r = -EINVAL;
6067 break;
6068 }
6069 return r;
6070}
6071
6072static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
6073{
90bca052 6074 struct kvm_pic *pic = kvm->arch.vpic;
1fe779f8
CO
6075 int r;
6076
6077 r = 0;
6078 switch (chip->chip_id) {
6079 case KVM_IRQCHIP_PIC_MASTER:
90bca052
DH
6080 spin_lock(&pic->lock);
6081 memcpy(&pic->pics[0], &chip->chip.pic,
1fe779f8 6082 sizeof(struct kvm_pic_state));
90bca052 6083 spin_unlock(&pic->lock);
1fe779f8
CO
6084 break;
6085 case KVM_IRQCHIP_PIC_SLAVE:
90bca052
DH
6086 spin_lock(&pic->lock);
6087 memcpy(&pic->pics[1], &chip->chip.pic,
1fe779f8 6088 sizeof(struct kvm_pic_state));
90bca052 6089 spin_unlock(&pic->lock);
1fe779f8
CO
6090 break;
6091 case KVM_IRQCHIP_IOAPIC:
33392b49 6092 kvm_set_ioapic(kvm, &chip->chip.ioapic);
1fe779f8
CO
6093 break;
6094 default:
6095 r = -EINVAL;
6096 break;
6097 }
90bca052 6098 kvm_pic_update_irq(pic);
1fe779f8
CO
6099 return r;
6100}
6101
e0f63cb9
SY
6102static int kvm_vm_ioctl_get_pit(struct kvm *kvm, struct kvm_pit_state *ps)
6103{
34f3941c
RK
6104 struct kvm_kpit_state *kps = &kvm->arch.vpit->pit_state;
6105
6106 BUILD_BUG_ON(sizeof(*ps) != sizeof(kps->channels));
6107
6108 mutex_lock(&kps->lock);
6109 memcpy(ps, &kps->channels, sizeof(*ps));
6110 mutex_unlock(&kps->lock);
2da29bcc 6111 return 0;
e0f63cb9
SY
6112}
6113
6114static int kvm_vm_ioctl_set_pit(struct kvm *kvm, struct kvm_pit_state *ps)
6115{
0185604c 6116 int i;
09edea72
RK
6117 struct kvm_pit *pit = kvm->arch.vpit;
6118
6119 mutex_lock(&pit->pit_state.lock);
34f3941c 6120 memcpy(&pit->pit_state.channels, ps, sizeof(*ps));
0185604c 6121 for (i = 0; i < 3; i++)
09edea72
RK
6122 kvm_pit_load_count(pit, i, ps->channels[i].count, 0);
6123 mutex_unlock(&pit->pit_state.lock);
2da29bcc 6124 return 0;
e9f42757
BK
6125}
6126
6127static int kvm_vm_ioctl_get_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps)
6128{
e9f42757
BK
6129 mutex_lock(&kvm->arch.vpit->pit_state.lock);
6130 memcpy(ps->channels, &kvm->arch.vpit->pit_state.channels,
6131 sizeof(ps->channels));
6132 ps->flags = kvm->arch.vpit->pit_state.flags;
6133 mutex_unlock(&kvm->arch.vpit->pit_state.lock);
97e69aa6 6134 memset(&ps->reserved, 0, sizeof(ps->reserved));
2da29bcc 6135 return 0;
e9f42757
BK
6136}
6137
6138static int kvm_vm_ioctl_set_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps)
6139{
2da29bcc 6140 int start = 0;
0185604c 6141 int i;
e9f42757 6142 u32 prev_legacy, cur_legacy;
09edea72
RK
6143 struct kvm_pit *pit = kvm->arch.vpit;
6144
6145 mutex_lock(&pit->pit_state.lock);
6146 prev_legacy = pit->pit_state.flags & KVM_PIT_FLAGS_HPET_LEGACY;
e9f42757
BK
6147 cur_legacy = ps->flags & KVM_PIT_FLAGS_HPET_LEGACY;
6148 if (!prev_legacy && cur_legacy)
6149 start = 1;
09edea72
RK
6150 memcpy(&pit->pit_state.channels, &ps->channels,
6151 sizeof(pit->pit_state.channels));
6152 pit->pit_state.flags = ps->flags;
0185604c 6153 for (i = 0; i < 3; i++)
09edea72 6154 kvm_pit_load_count(pit, i, pit->pit_state.channels[i].count,
e5e57e7a 6155 start && i == 0);
09edea72 6156 mutex_unlock(&pit->pit_state.lock);
2da29bcc 6157 return 0;
e0f63cb9
SY
6158}
6159
52d939a0
MT
6160static int kvm_vm_ioctl_reinject(struct kvm *kvm,
6161 struct kvm_reinject_control *control)
6162{
71474e2f
RK
6163 struct kvm_pit *pit = kvm->arch.vpit;
6164
71474e2f
RK
6165 /* pit->pit_state.lock was overloaded to prevent userspace from getting
6166 * an inconsistent state after running multiple KVM_REINJECT_CONTROL
6167 * ioctls in parallel. Use a separate lock if that ioctl isn't rare.
6168 */
6169 mutex_lock(&pit->pit_state.lock);
6170 kvm_pit_set_reinject(pit, control->pit_reinject);
6171 mutex_unlock(&pit->pit_state.lock);
b39c90b6 6172
52d939a0
MT
6173 return 0;
6174}
6175
0dff0846 6176void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot)
5bb064dc 6177{
a018eba5 6178
88178fd4 6179 /*
a018eba5
SC
6180 * Flush all CPUs' dirty log buffers to the dirty_bitmap. Called
6181 * before reporting dirty_bitmap to userspace. KVM flushes the buffers
6182 * on all VM-Exits, thus we only need to kick running vCPUs to force a
6183 * VM-Exit.
88178fd4 6184 */
a018eba5 6185 struct kvm_vcpu *vcpu;
46808a4c 6186 unsigned long i;
a018eba5
SC
6187
6188 kvm_for_each_vcpu(i, vcpu, kvm)
6189 kvm_vcpu_kick(vcpu);
5bb064dc
ZX
6190}
6191
aa2fbe6d
YZ
6192int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event,
6193 bool line_status)
23d43cf9
CD
6194{
6195 if (!irqchip_in_kernel(kvm))
6196 return -ENXIO;
6197
6198 irq_event->status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID,
aa2fbe6d
YZ
6199 irq_event->irq, irq_event->level,
6200 line_status);
23d43cf9
CD
6201 return 0;
6202}
6203
e5d83c74
PB
6204int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
6205 struct kvm_enable_cap *cap)
90de4a18
NA
6206{
6207 int r;
6208
6209 if (cap->flags)
6210 return -EINVAL;
6211
6212 switch (cap->cap) {
6d849191
OU
6213 case KVM_CAP_DISABLE_QUIRKS2:
6214 r = -EINVAL;
6215 if (cap->args[0] & ~KVM_X86_VALID_QUIRKS)
6216 break;
6217 fallthrough;
90de4a18
NA
6218 case KVM_CAP_DISABLE_QUIRKS:
6219 kvm->arch.disabled_quirks = cap->args[0];
6220 r = 0;
6221 break;
49df6397
SR
6222 case KVM_CAP_SPLIT_IRQCHIP: {
6223 mutex_lock(&kvm->lock);
b053b2ae
SR
6224 r = -EINVAL;
6225 if (cap->args[0] > MAX_NR_RESERVED_IOAPIC_PINS)
6226 goto split_irqchip_unlock;
49df6397
SR
6227 r = -EEXIST;
6228 if (irqchip_in_kernel(kvm))
6229 goto split_irqchip_unlock;
557abc40 6230 if (kvm->created_vcpus)
49df6397
SR
6231 goto split_irqchip_unlock;
6232 r = kvm_setup_empty_irq_routing(kvm);
5c0aea0e 6233 if (r)
49df6397
SR
6234 goto split_irqchip_unlock;
6235 /* Pairs with irqchip_in_kernel. */
6236 smp_wmb();
49776faf 6237 kvm->arch.irqchip_mode = KVM_IRQCHIP_SPLIT;
b053b2ae 6238 kvm->arch.nr_reserved_ioapic_pins = cap->args[0];
320af55a 6239 kvm_clear_apicv_inhibit(kvm, APICV_INHIBIT_REASON_ABSENT);
49df6397
SR
6240 r = 0;
6241split_irqchip_unlock:
6242 mutex_unlock(&kvm->lock);
6243 break;
6244 }
37131313
RK
6245 case KVM_CAP_X2APIC_API:
6246 r = -EINVAL;
6247 if (cap->args[0] & ~KVM_X2APIC_API_VALID_FLAGS)
6248 break;
6249
6250 if (cap->args[0] & KVM_X2APIC_API_USE_32BIT_IDS)
6251 kvm->arch.x2apic_format = true;
c519265f
RK
6252 if (cap->args[0] & KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK)
6253 kvm->arch.x2apic_broadcast_quirk_disabled = true;
37131313
RK
6254
6255 r = 0;
6256 break;
4d5422ce
WL
6257 case KVM_CAP_X86_DISABLE_EXITS:
6258 r = -EINVAL;
6259 if (cap->args[0] & ~KVM_X86_DISABLE_VALID_EXITS)
6260 break;
6261
b31c114b
WL
6262 if (cap->args[0] & KVM_X86_DISABLE_EXITS_PAUSE)
6263 kvm->arch.pause_in_guest = true;
6f0f2d5e
TL
6264
6265#define SMT_RSB_MSG "This processor is affected by the Cross-Thread Return Predictions vulnerability. " \
6266 "KVM_CAP_X86_DISABLE_EXITS should only be used with SMT disabled or trusted guests."
6267
6268 if (!mitigate_smt_rsb) {
6269 if (boot_cpu_has_bug(X86_BUG_SMT_RSB) && cpu_smt_possible() &&
6270 (cap->args[0] & ~KVM_X86_DISABLE_EXITS_PAUSE))
6271 pr_warn_once(SMT_RSB_MSG);
6272
6273 if ((cap->args[0] & KVM_X86_DISABLE_EXITS_MWAIT) &&
6274 kvm_can_mwait_in_guest())
6275 kvm->arch.mwait_in_guest = true;
6276 if (cap->args[0] & KVM_X86_DISABLE_EXITS_HLT)
6277 kvm->arch.hlt_in_guest = true;
6278 if (cap->args[0] & KVM_X86_DISABLE_EXITS_CSTATE)
6279 kvm->arch.cstate_in_guest = true;
6280 }
6281
4d5422ce
WL
6282 r = 0;
6283 break;
6fbbde9a
DS
6284 case KVM_CAP_MSR_PLATFORM_INFO:
6285 kvm->arch.guest_can_read_msr_platform_info = cap->args[0];
6286 r = 0;
c4f55198
JM
6287 break;
6288 case KVM_CAP_EXCEPTION_PAYLOAD:
6289 kvm->arch.exception_payload_enabled = cap->args[0];
6290 r = 0;
6fbbde9a 6291 break;
ed235117
CQ
6292 case KVM_CAP_X86_TRIPLE_FAULT_EVENT:
6293 kvm->arch.triple_fault_event = cap->args[0];
6294 r = 0;
6295 break;
1ae09954 6296 case KVM_CAP_X86_USER_SPACE_MSR:
cf5029d5 6297 r = -EINVAL;
db205f7e 6298 if (cap->args[0] & ~KVM_MSR_EXIT_REASON_VALID_MASK)
cf5029d5 6299 break;
1ae09954
AG
6300 kvm->arch.user_space_msr_mask = cap->args[0];
6301 r = 0;
6302 break;
fe6b6bc8
CQ
6303 case KVM_CAP_X86_BUS_LOCK_EXIT:
6304 r = -EINVAL;
6305 if (cap->args[0] & ~KVM_BUS_LOCK_DETECTION_VALID_MODE)
6306 break;
6307
6308 if ((cap->args[0] & KVM_BUS_LOCK_DETECTION_OFF) &&
6309 (cap->args[0] & KVM_BUS_LOCK_DETECTION_EXIT))
6310 break;
6311
938c8745 6312 if (kvm_caps.has_bus_lock_exit &&
fe6b6bc8
CQ
6313 cap->args[0] & KVM_BUS_LOCK_DETECTION_EXIT)
6314 kvm->arch.bus_lock_detection_enabled = true;
6315 r = 0;
6316 break;
fe7e9488
SC
6317#ifdef CONFIG_X86_SGX_KVM
6318 case KVM_CAP_SGX_ATTRIBUTE: {
6319 unsigned long allowed_attributes = 0;
6320
6321 r = sgx_set_attribute(&allowed_attributes, cap->args[0]);
6322 if (r)
6323 break;
6324
6325 /* KVM only supports the PROVISIONKEY privileged attribute. */
6326 if ((allowed_attributes & SGX_ATTR_PROVISIONKEY) &&
6327 !(allowed_attributes & ~SGX_ATTR_PROVISIONKEY))
6328 kvm->arch.sgx_provisioning_allowed = true;
6329 else
6330 r = -EINVAL;
6331 break;
6332 }
6333#endif
54526d1f
NT
6334 case KVM_CAP_VM_COPY_ENC_CONTEXT_FROM:
6335 r = -EINVAL;
7ad02ef0
SC
6336 if (!kvm_x86_ops.vm_copy_enc_context_from)
6337 break;
6338
6339 r = static_call(kvm_x86_vm_copy_enc_context_from)(kvm, cap->args[0]);
6340 break;
b5663931
PG
6341 case KVM_CAP_VM_MOVE_ENC_CONTEXT_FROM:
6342 r = -EINVAL;
7ad02ef0
SC
6343 if (!kvm_x86_ops.vm_move_enc_context_from)
6344 break;
6345
6346 r = static_call(kvm_x86_vm_move_enc_context_from)(kvm, cap->args[0]);
6347 break;
0dbb1123
AK
6348 case KVM_CAP_EXIT_HYPERCALL:
6349 if (cap->args[0] & ~KVM_EXIT_HYPERCALL_VALID_MASK) {
6350 r = -EINVAL;
6351 break;
6352 }
6353 kvm->arch.hypercall_exit_enabled = cap->args[0];
6354 r = 0;
6355 break;
19238e75
AL
6356 case KVM_CAP_EXIT_ON_EMULATION_FAILURE:
6357 r = -EINVAL;
6358 if (cap->args[0] & ~1)
6359 break;
6360 kvm->arch.exit_on_emulation_error = cap->args[0];
6361 r = 0;
6362 break;
ba7bb663
DD
6363 case KVM_CAP_PMU_CAPABILITY:
6364 r = -EINVAL;
6365 if (!enable_pmu || (cap->args[0] & ~KVM_CAP_PMU_VALID_MASK))
6366 break;
6367
6368 mutex_lock(&kvm->lock);
6369 if (!kvm->created_vcpus) {
6370 kvm->arch.enable_pmu = !(cap->args[0] & KVM_PMU_CAP_DISABLE);
6371 r = 0;
6372 }
6373 mutex_unlock(&kvm->lock);
6374 break;
35875316
ZG
6375 case KVM_CAP_MAX_VCPU_ID:
6376 r = -EINVAL;
6377 if (cap->args[0] > KVM_MAX_VCPU_IDS)
6378 break;
6379
6380 mutex_lock(&kvm->lock);
6381 if (kvm->arch.max_vcpu_ids == cap->args[0]) {
6382 r = 0;
6383 } else if (!kvm->arch.max_vcpu_ids) {
6384 kvm->arch.max_vcpu_ids = cap->args[0];
6385 r = 0;
6386 }
6387 mutex_unlock(&kvm->lock);
6388 break;
2f4073e0
TX
6389 case KVM_CAP_X86_NOTIFY_VMEXIT:
6390 r = -EINVAL;
6391 if ((u32)cap->args[0] & ~KVM_X86_NOTIFY_VMEXIT_VALID_BITS)
6392 break;
6393 if (!kvm_caps.has_notify_vmexit)
6394 break;
6395 if (!((u32)cap->args[0] & KVM_X86_NOTIFY_VMEXIT_ENABLED))
6396 break;
6397 mutex_lock(&kvm->lock);
6398 if (!kvm->created_vcpus) {
6399 kvm->arch.notify_window = cap->args[0] >> 32;
6400 kvm->arch.notify_vmexit_flags = (u32)cap->args[0];
6401 r = 0;
6402 }
6403 mutex_unlock(&kvm->lock);
6404 break;
084cc29f
BG
6405 case KVM_CAP_VM_DISABLE_NX_HUGE_PAGES:
6406 r = -EINVAL;
6407
6408 /*
6409 * Since the risk of disabling NX hugepages is a guest crashing
6410 * the system, ensure the userspace process has permission to
6411 * reboot the system.
6412 *
6413 * Note that unlike the reboot() syscall, the process must have
6414 * this capability in the root namespace because exposing
6415 * /dev/kvm into a container does not limit the scope of the
6416 * iTLB multihit bug to that container. In other words,
6417 * this must use capable(), not ns_capable().
6418 */
6419 if (!capable(CAP_SYS_BOOT)) {
6420 r = -EPERM;
6421 break;
6422 }
6423
6424 if (cap->args[0])
6425 break;
6426
6427 mutex_lock(&kvm->lock);
6428 if (!kvm->created_vcpus) {
6429 kvm->arch.disable_nx_huge_pages = true;
6430 r = 0;
6431 }
6432 mutex_unlock(&kvm->lock);
6433 break;
90de4a18
NA
6434 default:
6435 r = -EINVAL;
6436 break;
6437 }
6438 return r;
6439}
6440
b318e8de
SC
6441static struct kvm_x86_msr_filter *kvm_alloc_msr_filter(bool default_allow)
6442{
6443 struct kvm_x86_msr_filter *msr_filter;
6444
6445 msr_filter = kzalloc(sizeof(*msr_filter), GFP_KERNEL_ACCOUNT);
6446 if (!msr_filter)
6447 return NULL;
6448
6449 msr_filter->default_allow = default_allow;
6450 return msr_filter;
6451}
6452
6453static void kvm_free_msr_filter(struct kvm_x86_msr_filter *msr_filter)
1a155254
AG
6454{
6455 u32 i;
1a155254 6456
b318e8de
SC
6457 if (!msr_filter)
6458 return;
6459
6460 for (i = 0; i < msr_filter->count; i++)
6461 kfree(msr_filter->ranges[i].bitmap);
1a155254 6462
b318e8de 6463 kfree(msr_filter);
1a155254
AG
6464}
6465
b318e8de
SC
6466static int kvm_add_msr_filter(struct kvm_x86_msr_filter *msr_filter,
6467 struct kvm_msr_filter_range *user_range)
1a155254 6468{
1a155254
AG
6469 unsigned long *bitmap = NULL;
6470 size_t bitmap_size;
1a155254
AG
6471
6472 if (!user_range->nmsrs)
6473 return 0;
6474
8aff460f 6475 if (user_range->flags & ~KVM_MSR_FILTER_RANGE_VALID_MASK)
aca35288
SC
6476 return -EINVAL;
6477
6478 if (!user_range->flags)
6479 return -EINVAL;
6480
1a155254
AG
6481 bitmap_size = BITS_TO_LONGS(user_range->nmsrs) * sizeof(long);
6482 if (!bitmap_size || bitmap_size > KVM_MSR_FILTER_MAX_BITMAP_SIZE)
6483 return -EINVAL;
6484
6485 bitmap = memdup_user((__user u8*)user_range->bitmap, bitmap_size);
6486 if (IS_ERR(bitmap))
6487 return PTR_ERR(bitmap);
6488
aca35288 6489 msr_filter->ranges[msr_filter->count] = (struct msr_bitmap_range) {
1a155254
AG
6490 .flags = user_range->flags,
6491 .base = user_range->base,
6492 .nmsrs = user_range->nmsrs,
6493 .bitmap = bitmap,
6494 };
6495
b318e8de 6496 msr_filter->count++;
1a155254 6497 return 0;
1a155254
AG
6498}
6499
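/*
 * KVM_X86_SET_MSR_FILTER: build a new filter from the userspace ranges,
 * publish it with rcu_replace_pointer() + synchronize_srcu(), free the old
 * filter, and request KVM_REQ_MSR_FILTER_CHANGED on all vCPUs so vendor
 * code recalculates its MSR intercepts.
 */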
2e3272bc
AG
6500static int kvm_vm_ioctl_set_msr_filter(struct kvm *kvm,
6501 struct kvm_msr_filter *filter)
1a155254 6502{
b318e8de 6503 struct kvm_x86_msr_filter *new_filter, *old_filter;
1a155254 6504 bool default_allow;
043248b3 6505 bool empty = true;
4559e6cf 6506 int r;
1a155254
AG
6507 u32 i;
6508
c1340fe3 6509 if (filter->flags & ~KVM_MSR_FILTER_VALID_MASK)
cf5029d5
AL
6510 return -EINVAL;
6511
2e3272bc
AG
6512 for (i = 0; i < ARRAY_SIZE(filter->ranges); i++)
6513 empty &= !filter->ranges[i].nmsrs;
1a155254 6514
2e3272bc 6515 default_allow = !(filter->flags & KVM_MSR_FILTER_DEFAULT_DENY);
043248b3
PB
6516 if (empty && !default_allow)
6517 return -EINVAL;
6518
b318e8de
SC
6519 new_filter = kvm_alloc_msr_filter(default_allow);
6520 if (!new_filter)
6521 return -ENOMEM;
1a155254 6522
2e3272bc
AG
6523 for (i = 0; i < ARRAY_SIZE(filter->ranges); i++) {
6524 r = kvm_add_msr_filter(new_filter, &filter->ranges[i]);
b318e8de
SC
6525 if (r) {
6526 kvm_free_msr_filter(new_filter);
6527 return r;
6528 }
1a155254
AG
6529 }
6530
b318e8de 6531 mutex_lock(&kvm->lock);
1fdefb8b
ML
6532 old_filter = rcu_replace_pointer(kvm->arch.msr_filter, new_filter,
6533 mutex_is_locked(&kvm->lock));
708f799d 6534 mutex_unlock(&kvm->lock);
b318e8de
SC
6535 synchronize_srcu(&kvm->srcu);
6536
6537 kvm_free_msr_filter(old_filter);
6538
1a155254 6539 kvm_make_all_cpus_request(kvm, KVM_REQ_MSR_FILTER_CHANGED);
1a155254 6540
b318e8de 6541 return 0;
1a155254
AG
6542}
6543
1739c701
AG
6544#ifdef CONFIG_KVM_COMPAT
6545/* for KVM_X86_SET_MSR_FILTER */
6546struct kvm_msr_filter_range_compat {
6547 __u32 flags;
6548 __u32 nmsrs;
6549 __u32 base;
6550 __u32 bitmap;
6551};
6552
6553struct kvm_msr_filter_compat {
6554 __u32 flags;
6555 struct kvm_msr_filter_range_compat ranges[KVM_MSR_FILTER_MAX_RANGES];
6556};
6557
6558#define KVM_X86_SET_MSR_FILTER_COMPAT _IOW(KVMIO, 0xc6, struct kvm_msr_filter_compat)
6559
6560long kvm_arch_vm_compat_ioctl(struct file *filp, unsigned int ioctl,
6561 unsigned long arg)
6562{
6563 void __user *argp = (void __user *)arg;
6564 struct kvm *kvm = filp->private_data;
6565 long r = -ENOTTY;
6566
6567 switch (ioctl) {
6568 case KVM_X86_SET_MSR_FILTER_COMPAT: {
6569 struct kvm_msr_filter __user *user_msr_filter = argp;
6570 struct kvm_msr_filter_compat filter_compat;
6571 struct kvm_msr_filter filter;
6572 int i;
6573
6574 if (copy_from_user(&filter_compat, user_msr_filter,
6575 sizeof(filter_compat)))
6576 return -EFAULT;
6577
6578 filter.flags = filter_compat.flags;
6579 for (i = 0; i < ARRAY_SIZE(filter.ranges); i++) {
6580 struct kvm_msr_filter_range_compat *cr;
6581
6582 cr = &filter_compat.ranges[i];
6583 filter.ranges[i] = (struct kvm_msr_filter_range) {
6584 .flags = cr->flags,
6585 .nmsrs = cr->nmsrs,
6586 .base = cr->base,
6587 .bitmap = (__u8 *)(ulong)cr->bitmap,
6588 };
6589 }
6590
6591 r = kvm_vm_ioctl_set_msr_filter(kvm, &filter);
6592 break;
6593 }
6594 }
6595
6596 return r;
6597}
6598#endif
6599
7d62874f
SS
6600#ifdef CONFIG_HAVE_KVM_PM_NOTIFIER
6601static int kvm_arch_suspend_notifier(struct kvm *kvm)
6602{
6603 struct kvm_vcpu *vcpu;
46808a4c
MZ
6604 unsigned long i;
6605 int ret = 0;
7d62874f
SS
6606
6607 mutex_lock(&kvm->lock);
6608 kvm_for_each_vcpu(i, vcpu, kvm) {
916d3608 6609 if (!vcpu->arch.pv_time.active)
7d62874f
SS
6610 continue;
6611
6612 ret = kvm_set_guest_paused(vcpu);
6613 if (ret) {
6614 kvm_err("Failed to pause guest VCPU%d: %d\n",
6615 vcpu->vcpu_id, ret);
6616 break;
6617 }
6618 }
6619 mutex_unlock(&kvm->lock);
6620
6621 return ret ? NOTIFY_BAD : NOTIFY_DONE;
6622}
6623
6624int kvm_arch_pm_notifier(struct kvm *kvm, unsigned long state)
6625{
6626 switch (state) {
6627 case PM_HIBERNATION_PREPARE:
6628 case PM_SUSPEND_PREPARE:
6629 return kvm_arch_suspend_notifier(kvm);
6630 }
6631
6632 return NOTIFY_DONE;
6633}
6634#endif /* CONFIG_HAVE_KVM_PM_NOTIFIER */
6635
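/*
 * KVM_GET_CLOCK / KVM_SET_CLOCK: read or adjust the VM-wide kvmclock. On
 * the set side, a KVM_CLOCK_REALTIME timestamp from userspace is used to
 * advance the clock by the elapsed host realtime so kvmclock never steps
 * backwards across save/restore.
 */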
45e6c2fa
PB
6636static int kvm_vm_ioctl_get_clock(struct kvm *kvm, void __user *argp)
6637{
869b4421 6638 struct kvm_clock_data data = { 0 };
45e6c2fa 6639
55c0cefb 6640 get_kvmclock(kvm, &data);
45e6c2fa
PB
6641 if (copy_to_user(argp, &data, sizeof(data)))
6642 return -EFAULT;
6643
6644 return 0;
6645}
6646
6647static int kvm_vm_ioctl_set_clock(struct kvm *kvm, void __user *argp)
6648{
6649 struct kvm_arch *ka = &kvm->arch;
6650 struct kvm_clock_data data;
c68dc1b5 6651 u64 now_raw_ns;
45e6c2fa
PB
6652
6653 if (copy_from_user(&data, argp, sizeof(data)))
6654 return -EFAULT;
6655
c68dc1b5
OU
6656 /*
6657 * Only KVM_CLOCK_REALTIME is used, but allow passing the
6658 * result of KVM_GET_CLOCK back to KVM_SET_CLOCK.
6659 */
6660 if (data.flags & ~KVM_CLOCK_VALID_FLAGS)
45e6c2fa
PB
6661 return -EINVAL;
6662
42dcbe7d 6663 kvm_hv_request_tsc_page_update(kvm);
45e6c2fa
PB
6664 kvm_start_pvclock_update(kvm);
6665 pvclock_update_vm_gtod_copy(kvm);
6666
6667 /*
6668 * This pairs with kvm_guest_time_update(): when masterclock is
6669 * in use, we use master_kernel_ns + kvmclock_offset to set
 6670	 * unsigned 'system_time', so if we used get_kvmclock_ns() (which
 6671	 * is slightly ahead) here we would risk going negative on unsigned
6672 * 'system_time' when 'data.clock' is very small.
6673 */
c68dc1b5
OU
6674 if (data.flags & KVM_CLOCK_REALTIME) {
6675 u64 now_real_ns = ktime_get_real_ns();
6676
6677 /*
6678 * Avoid stepping the kvmclock backwards.
6679 */
6680 if (now_real_ns > data.realtime)
6681 data.clock += now_real_ns - data.realtime;
6682 }
6683
6684 if (ka->use_master_clock)
6685 now_raw_ns = ka->master_kernel_ns;
45e6c2fa 6686 else
c68dc1b5
OU
6687 now_raw_ns = get_kvmclock_base_ns();
6688 ka->kvmclock_offset = data.clock - now_raw_ns;
45e6c2fa
PB
6689 kvm_end_pvclock_update(kvm);
6690 return 0;
6691}
6692
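/*
 * Illustrative userspace-side sketch (not part of this file): save the
 * kvmclock on a source VM and restore it on a destination VM.  Passing
 * the KVM_GET_CLOCK result straight back, including any
 * KVM_CLOCK_REALTIME flag, lets kvm_vm_ioctl_set_clock() above advance
 * 'clock' by the wall-clock time that elapsed in between instead of
 * stepping the guest clock backwards.  Error handling is elided.
 */
#include <linux/kvm.h>
#include <sys/ioctl.h>

static int migrate_kvmclock(int src_vm_fd, int dst_vm_fd)
{
        struct kvm_clock_data data;

        if (ioctl(src_vm_fd, KVM_GET_CLOCK, &data))
                return -1;

        /* data.flags is passed through as-is, as the comment above allows. */
        return ioctl(dst_vm_fd, KVM_SET_CLOCK, &data);
}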
d8708b80 6693int kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg)
1fe779f8
CO
6694{
6695 struct kvm *kvm = filp->private_data;
6696 void __user *argp = (void __user *)arg;
367e1319 6697 int r = -ENOTTY;
f0d66275
DH
6698 /*
6699 * This union makes it completely explicit to gcc-3.x
6700 * that these two variables' stack usage should be
6701 * combined, not added together.
6702 */
6703 union {
6704 struct kvm_pit_state ps;
e9f42757 6705 struct kvm_pit_state2 ps2;
c5ff41ce 6706 struct kvm_pit_config pit_config;
f0d66275 6707 } u;
1fe779f8
CO
6708
6709 switch (ioctl) {
6710 case KVM_SET_TSS_ADDR:
6711 r = kvm_vm_ioctl_set_tss_addr(kvm, arg);
1fe779f8 6712 break;
b927a3ce
SY
6713 case KVM_SET_IDENTITY_MAP_ADDR: {
6714 u64 ident_addr;
6715
1af1ac91
DH
6716 mutex_lock(&kvm->lock);
6717 r = -EINVAL;
6718 if (kvm->created_vcpus)
6719 goto set_identity_unlock;
b927a3ce 6720 r = -EFAULT;
0e96f31e 6721 if (copy_from_user(&ident_addr, argp, sizeof(ident_addr)))
1af1ac91 6722 goto set_identity_unlock;
b927a3ce 6723 r = kvm_vm_ioctl_set_identity_map_addr(kvm, ident_addr);
1af1ac91
DH
6724set_identity_unlock:
6725 mutex_unlock(&kvm->lock);
b927a3ce
SY
6726 break;
6727 }
1fe779f8
CO
6728 case KVM_SET_NR_MMU_PAGES:
6729 r = kvm_vm_ioctl_set_nr_mmu_pages(kvm, arg);
1fe779f8 6730 break;
3ddea128 6731 case KVM_CREATE_IRQCHIP: {
3ddea128 6732 mutex_lock(&kvm->lock);
09941366 6733
3ddea128 6734 r = -EEXIST;
35e6eaa3 6735 if (irqchip_in_kernel(kvm))
3ddea128 6736 goto create_irqchip_unlock;
09941366 6737
3e515705 6738 r = -EINVAL;
557abc40 6739 if (kvm->created_vcpus)
3e515705 6740 goto create_irqchip_unlock;
09941366
RK
6741
6742 r = kvm_pic_init(kvm);
6743 if (r)
3ddea128 6744 goto create_irqchip_unlock;
09941366
RK
6745
6746 r = kvm_ioapic_init(kvm);
6747 if (r) {
09941366 6748 kvm_pic_destroy(kvm);
3ddea128 6749 goto create_irqchip_unlock;
09941366
RK
6750 }
6751
399ec807
AK
6752 r = kvm_setup_default_irq_routing(kvm);
6753 if (r) {
72bb2fcd 6754 kvm_ioapic_destroy(kvm);
09941366 6755 kvm_pic_destroy(kvm);
71ba994c 6756 goto create_irqchip_unlock;
399ec807 6757 }
49776faf 6758 /* Write kvm->irq_routing before enabling irqchip_in_kernel. */
71ba994c 6759 smp_wmb();
49776faf 6760 kvm->arch.irqchip_mode = KVM_IRQCHIP_KERNEL;
320af55a 6761 kvm_clear_apicv_inhibit(kvm, APICV_INHIBIT_REASON_ABSENT);
3ddea128
MT
6762 create_irqchip_unlock:
6763 mutex_unlock(&kvm->lock);
1fe779f8 6764 break;
3ddea128 6765 }
7837699f 6766 case KVM_CREATE_PIT:
c5ff41ce
JK
6767 u.pit_config.flags = KVM_PIT_SPEAKER_DUMMY;
6768 goto create_pit;
6769 case KVM_CREATE_PIT2:
6770 r = -EFAULT;
6771 if (copy_from_user(&u.pit_config, argp,
6772 sizeof(struct kvm_pit_config)))
6773 goto out;
6774 create_pit:
250715a6 6775 mutex_lock(&kvm->lock);
269e05e4
AK
6776 r = -EEXIST;
6777 if (kvm->arch.vpit)
6778 goto create_pit_unlock;
7837699f 6779 r = -ENOMEM;
c5ff41ce 6780 kvm->arch.vpit = kvm_create_pit(kvm, u.pit_config.flags);
7837699f
SY
6781 if (kvm->arch.vpit)
6782 r = 0;
269e05e4 6783 create_pit_unlock:
250715a6 6784 mutex_unlock(&kvm->lock);
7837699f 6785 break;
1fe779f8
CO
6786 case KVM_GET_IRQCHIP: {
6787 /* 0: PIC master, 1: PIC slave, 2: IOAPIC */
ff5c2c03 6788 struct kvm_irqchip *chip;
1fe779f8 6789
ff5c2c03
SL
6790 chip = memdup_user(argp, sizeof(*chip));
6791 if (IS_ERR(chip)) {
6792 r = PTR_ERR(chip);
1fe779f8 6793 goto out;
ff5c2c03
SL
6794 }
6795
1fe779f8 6796 r = -ENXIO;
826da321 6797 if (!irqchip_kernel(kvm))
f0d66275
DH
6798 goto get_irqchip_out;
6799 r = kvm_vm_ioctl_get_irqchip(kvm, chip);
1fe779f8 6800 if (r)
f0d66275 6801 goto get_irqchip_out;
1fe779f8 6802 r = -EFAULT;
0e96f31e 6803 if (copy_to_user(argp, chip, sizeof(*chip)))
f0d66275 6804 goto get_irqchip_out;
1fe779f8 6805 r = 0;
f0d66275
DH
6806 get_irqchip_out:
6807 kfree(chip);
1fe779f8
CO
6808 break;
6809 }
6810 case KVM_SET_IRQCHIP: {
6811 /* 0: PIC master, 1: PIC slave, 2: IOAPIC */
ff5c2c03 6812 struct kvm_irqchip *chip;
1fe779f8 6813
ff5c2c03
SL
6814 chip = memdup_user(argp, sizeof(*chip));
6815 if (IS_ERR(chip)) {
6816 r = PTR_ERR(chip);
1fe779f8 6817 goto out;
ff5c2c03
SL
6818 }
6819
1fe779f8 6820 r = -ENXIO;
826da321 6821 if (!irqchip_kernel(kvm))
f0d66275
DH
6822 goto set_irqchip_out;
6823 r = kvm_vm_ioctl_set_irqchip(kvm, chip);
f0d66275
DH
6824 set_irqchip_out:
6825 kfree(chip);
1fe779f8
CO
6826 break;
6827 }
e0f63cb9 6828 case KVM_GET_PIT: {
e0f63cb9 6829 r = -EFAULT;
f0d66275 6830 if (copy_from_user(&u.ps, argp, sizeof(struct kvm_pit_state)))
e0f63cb9
SY
6831 goto out;
6832 r = -ENXIO;
6833 if (!kvm->arch.vpit)
6834 goto out;
f0d66275 6835 r = kvm_vm_ioctl_get_pit(kvm, &u.ps);
e0f63cb9
SY
6836 if (r)
6837 goto out;
6838 r = -EFAULT;
f0d66275 6839 if (copy_to_user(argp, &u.ps, sizeof(struct kvm_pit_state)))
e0f63cb9
SY
6840 goto out;
6841 r = 0;
6842 break;
6843 }
6844 case KVM_SET_PIT: {
e0f63cb9 6845 r = -EFAULT;
0e96f31e 6846 if (copy_from_user(&u.ps, argp, sizeof(u.ps)))
e0f63cb9 6847 goto out;
7289fdb5 6848 mutex_lock(&kvm->lock);
e0f63cb9
SY
6849 r = -ENXIO;
6850 if (!kvm->arch.vpit)
7289fdb5 6851 goto set_pit_out;
f0d66275 6852 r = kvm_vm_ioctl_set_pit(kvm, &u.ps);
7289fdb5
SR
6853set_pit_out:
6854 mutex_unlock(&kvm->lock);
e0f63cb9
SY
6855 break;
6856 }
e9f42757
BK
6857 case KVM_GET_PIT2: {
6858 r = -ENXIO;
6859 if (!kvm->arch.vpit)
6860 goto out;
6861 r = kvm_vm_ioctl_get_pit2(kvm, &u.ps2);
6862 if (r)
6863 goto out;
6864 r = -EFAULT;
6865 if (copy_to_user(argp, &u.ps2, sizeof(u.ps2)))
6866 goto out;
6867 r = 0;
6868 break;
6869 }
6870 case KVM_SET_PIT2: {
6871 r = -EFAULT;
6872 if (copy_from_user(&u.ps2, argp, sizeof(u.ps2)))
6873 goto out;
7289fdb5 6874 mutex_lock(&kvm->lock);
e9f42757
BK
6875 r = -ENXIO;
6876 if (!kvm->arch.vpit)
7289fdb5 6877 goto set_pit2_out;
e9f42757 6878 r = kvm_vm_ioctl_set_pit2(kvm, &u.ps2);
7289fdb5
SR
6879set_pit2_out:
6880 mutex_unlock(&kvm->lock);
e9f42757
BK
6881 break;
6882 }
52d939a0
MT
6883 case KVM_REINJECT_CONTROL: {
6884 struct kvm_reinject_control control;
6885 r = -EFAULT;
6886 if (copy_from_user(&control, argp, sizeof(control)))
6887 goto out;
cad23e72
ML
6888 r = -ENXIO;
6889 if (!kvm->arch.vpit)
6890 goto out;
52d939a0 6891 r = kvm_vm_ioctl_reinject(kvm, &control);
52d939a0
MT
6892 break;
6893 }
d71ba788
PB
6894 case KVM_SET_BOOT_CPU_ID:
6895 r = 0;
6896 mutex_lock(&kvm->lock);
557abc40 6897 if (kvm->created_vcpus)
d71ba788
PB
6898 r = -EBUSY;
6899 else
6900 kvm->arch.bsp_vcpu_id = arg;
6901 mutex_unlock(&kvm->lock);
6902 break;
b59b153d 6903#ifdef CONFIG_KVM_XEN
ffde22ac 6904 case KVM_XEN_HVM_CONFIG: {
51776043 6905 struct kvm_xen_hvm_config xhc;
ffde22ac 6906 r = -EFAULT;
51776043 6907 if (copy_from_user(&xhc, argp, sizeof(xhc)))
ffde22ac 6908 goto out;
78e9878c 6909 r = kvm_xen_hvm_config(kvm, &xhc);
ffde22ac
ES
6910 break;
6911 }
a76b9641
JM
6912 case KVM_XEN_HVM_GET_ATTR: {
6913 struct kvm_xen_hvm_attr xha;
6914
6915 r = -EFAULT;
6916 if (copy_from_user(&xha, argp, sizeof(xha)))
ffde22ac 6917 goto out;
a76b9641
JM
6918 r = kvm_xen_hvm_get_attr(kvm, &xha);
6919 if (!r && copy_to_user(argp, &xha, sizeof(xha)))
6920 r = -EFAULT;
6921 break;
6922 }
6923 case KVM_XEN_HVM_SET_ATTR: {
6924 struct kvm_xen_hvm_attr xha;
6925
6926 r = -EFAULT;
6927 if (copy_from_user(&xha, argp, sizeof(xha)))
6928 goto out;
6929 r = kvm_xen_hvm_set_attr(kvm, &xha);
ffde22ac
ES
6930 break;
6931 }
35025735
DW
6932 case KVM_XEN_HVM_EVTCHN_SEND: {
6933 struct kvm_irq_routing_xen_evtchn uxe;
6934
6935 r = -EFAULT;
6936 if (copy_from_user(&uxe, argp, sizeof(uxe)))
6937 goto out;
6938 r = kvm_xen_hvm_evtchn_send(kvm, &uxe);
6939 break;
6940 }
b59b153d 6941#endif
45e6c2fa
PB
6942 case KVM_SET_CLOCK:
6943 r = kvm_vm_ioctl_set_clock(kvm, argp);
afbcf7ab 6944 break;
45e6c2fa
PB
6945 case KVM_GET_CLOCK:
6946 r = kvm_vm_ioctl_get_clock(kvm, argp);
afbcf7ab 6947 break;
ffbb61d0
DW
6948 case KVM_SET_TSC_KHZ: {
6949 u32 user_tsc_khz;
6950
6951 r = -EINVAL;
6952 user_tsc_khz = (u32)arg;
6953
938c8745
SC
6954 if (kvm_caps.has_tsc_control &&
6955 user_tsc_khz >= kvm_caps.max_guest_tsc_khz)
ffbb61d0
DW
6956 goto out;
6957
6958 if (user_tsc_khz == 0)
6959 user_tsc_khz = tsc_khz;
6960
6961 WRITE_ONCE(kvm->arch.default_tsc_khz, user_tsc_khz);
6962 r = 0;
6963
6964 goto out;
6965 }
6966 case KVM_GET_TSC_KHZ: {
6967 r = READ_ONCE(kvm->arch.default_tsc_khz);
6968 goto out;
6969 }
5acc5c06
BS
6970 case KVM_MEMORY_ENCRYPT_OP: {
6971 r = -ENOTTY;
03d004cd
SC
6972 if (!kvm_x86_ops.mem_enc_ioctl)
6973 goto out;
6974
6975 r = static_call(kvm_x86_mem_enc_ioctl)(kvm, argp);
5acc5c06
BS
6976 break;
6977 }
69eaedee
BS
6978 case KVM_MEMORY_ENCRYPT_REG_REGION: {
6979 struct kvm_enc_region region;
6980
6981 r = -EFAULT;
6982 if (copy_from_user(&region, argp, sizeof(region)))
6983 goto out;
6984
6985 r = -ENOTTY;
03d004cd
SC
6986 if (!kvm_x86_ops.mem_enc_register_region)
6987 goto out;
6988
6989 r = static_call(kvm_x86_mem_enc_register_region)(kvm, &region);
69eaedee
BS
6990 break;
6991 }
6992 case KVM_MEMORY_ENCRYPT_UNREG_REGION: {
6993 struct kvm_enc_region region;
6994
6995 r = -EFAULT;
6996 if (copy_from_user(&region, argp, sizeof(region)))
6997 goto out;
6998
6999 r = -ENOTTY;
03d004cd
SC
7000 if (!kvm_x86_ops.mem_enc_unregister_region)
7001 goto out;
7002
7003 r = static_call(kvm_x86_mem_enc_unregister_region)(kvm, &region);
69eaedee
BS
7004 break;
7005 }
faeb7833
RK
7006 case KVM_HYPERV_EVENTFD: {
7007 struct kvm_hyperv_eventfd hvevfd;
7008
7009 r = -EFAULT;
7010 if (copy_from_user(&hvevfd, argp, sizeof(hvevfd)))
7011 goto out;
7012 r = kvm_vm_ioctl_hv_eventfd(kvm, &hvevfd);
7013 break;
7014 }
66bb8a06
EH
7015 case KVM_SET_PMU_EVENT_FILTER:
7016 r = kvm_vm_ioctl_set_pmu_event_filter(kvm, argp);
7017 break;
2e3272bc
AG
7018 case KVM_X86_SET_MSR_FILTER: {
7019 struct kvm_msr_filter __user *user_msr_filter = argp;
7020 struct kvm_msr_filter filter;
7021
7022 if (copy_from_user(&filter, user_msr_filter, sizeof(filter)))
7023 return -EFAULT;
7024
7025 r = kvm_vm_ioctl_set_msr_filter(kvm, &filter);
1a155254 7026 break;
2e3272bc 7027 }
1fe779f8 7028 default:
ad6260da 7029 r = -ENOTTY;
1fe779f8
CO
7030 }
7031out:
7032 return r;
7033}
7034
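/*
 * Illustrative userspace-side sketch (not part of this file): the
 * VM-scoped KVM_SET_TSC_KHZ case handled above stores a default guest
 * TSC frequency in kvm->arch.default_tsc_khz, which newly created vCPUs
 * are expected to pick up, so it is normally issued before any
 * KVM_CREATE_VCPU.  Availability of the VM-scoped variant is assumed to
 * be advertised via KVM_CAP_VM_TSC_CONTROL.
 */
#include <linux/kvm.h>
#include <sys/ioctl.h>

static int set_default_guest_tsc_khz(int vm_fd, unsigned long khz)
{
        /* khz == 0 falls back to the host tsc_khz, as handled above. */
        return ioctl(vm_fd, KVM_SET_TSC_KHZ, khz);
}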
2374b731 7035static void kvm_probe_msr_to_save(u32 msr_index)
043405e1
CO
7036{
7037 u32 dummy[2];
2374b731
SC
7038
7039 if (rdmsr_safe(msr_index, &dummy[0], &dummy[1]))
7040 return;
7041
7042 /*
7043 * Even MSRs that are valid in the host may not be exposed to guests in
7044 * some cases.
7045 */
7046 switch (msr_index) {
7047 case MSR_IA32_BNDCFGS:
7048 if (!kvm_mpx_supported())
7049 return;
7050 break;
7051 case MSR_TSC_AUX:
7052 if (!kvm_cpu_cap_has(X86_FEATURE_RDTSCP) &&
7053 !kvm_cpu_cap_has(X86_FEATURE_RDPID))
7054 return;
7055 break;
7056 case MSR_IA32_UMWAIT_CONTROL:
7057 if (!kvm_cpu_cap_has(X86_FEATURE_WAITPKG))
7058 return;
7059 break;
7060 case MSR_IA32_RTIT_CTL:
7061 case MSR_IA32_RTIT_STATUS:
7062 if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT))
7063 return;
7064 break;
7065 case MSR_IA32_RTIT_CR3_MATCH:
7066 if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT) ||
7067 !intel_pt_validate_hw_cap(PT_CAP_cr3_filtering))
7068 return;
7069 break;
7070 case MSR_IA32_RTIT_OUTPUT_BASE:
7071 case MSR_IA32_RTIT_OUTPUT_MASK:
7072 if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT) ||
7073 (!intel_pt_validate_hw_cap(PT_CAP_topa_output) &&
7074 !intel_pt_validate_hw_cap(PT_CAP_single_range_output)))
7075 return;
7076 break;
7077 case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B:
7078 if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT) ||
7079 (msr_index - MSR_IA32_RTIT_ADDR0_A >=
7080 intel_pt_validate_hw_cap(PT_CAP_num_address_ranges) * 2))
7081 return;
7082 break;
7083 case MSR_ARCH_PERFMON_PERFCTR0 ... MSR_ARCH_PERFMON_PERFCTR_MAX:
7084 if (msr_index - MSR_ARCH_PERFMON_PERFCTR0 >=
7085 kvm_pmu_cap.num_counters_gp)
7086 return;
7087 break;
7088 case MSR_ARCH_PERFMON_EVENTSEL0 ... MSR_ARCH_PERFMON_EVENTSEL_MAX:
7089 if (msr_index - MSR_ARCH_PERFMON_EVENTSEL0 >=
7090 kvm_pmu_cap.num_counters_gp)
7091 return;
7092 break;
e33b6d79
LX
7093 case MSR_ARCH_PERFMON_FIXED_CTR0 ... MSR_ARCH_PERFMON_FIXED_CTR_MAX:
7094 if (msr_index - MSR_ARCH_PERFMON_FIXED_CTR0 >=
7095 kvm_pmu_cap.num_counters_fixed)
7096 return;
7097 break;
2374b731
SC
7098 case MSR_IA32_XFD:
7099 case MSR_IA32_XFD_ERR:
7100 if (!kvm_cpu_cap_has(X86_FEATURE_XFD))
7101 return;
7102 break;
7103 default:
7104 break;
7105 }
7106
7107 msrs_to_save[num_msrs_to_save++] = msr_index;
7108}
7109
7110static void kvm_init_msr_list(void)
7111{
7a5ee6ed 7112 unsigned i;
043405e1 7113
0144ba0c 7114 BUILD_BUG_ON_MSG(KVM_PMC_MAX_FIXED != 3,
2374b731 7115 "Please update the fixed PMCs in msrs_to_save_pmu[]");
24c29b7a 7116
6cbee2b9
XL
7117 num_msrs_to_save = 0;
7118 num_emulated_msrs = 0;
7119 num_msr_based_features = 0;
7120
2374b731
SC
7121 for (i = 0; i < ARRAY_SIZE(msrs_to_save_base); i++)
7122 kvm_probe_msr_to_save(msrs_to_save_base[i]);
93c4adc7 7123
c3531edc
SC
7124 if (enable_pmu) {
7125 for (i = 0; i < ARRAY_SIZE(msrs_to_save_pmu); i++)
7126 kvm_probe_msr_to_save(msrs_to_save_pmu[i]);
043405e1 7127 }
62ef68bb 7128
7a5ee6ed 7129 for (i = 0; i < ARRAY_SIZE(emulated_msrs_all); i++) {
b3646477 7130 if (!static_call(kvm_x86_has_emulated_msr)(NULL, emulated_msrs_all[i]))
bc226f07 7131 continue;
62ef68bb 7132
7a5ee6ed 7133 emulated_msrs[num_emulated_msrs++] = emulated_msrs_all[i];
62ef68bb 7134 }
801e459a 7135
7a5ee6ed 7136 for (i = 0; i < ARRAY_SIZE(msr_based_features_all); i++) {
801e459a
TL
7137 struct kvm_msr_entry msr;
7138
7a5ee6ed 7139 msr.index = msr_based_features_all[i];
66421c1e 7140 if (kvm_get_msr_feature(&msr))
801e459a
TL
7141 continue;
7142
7a5ee6ed 7143 msr_based_features[num_msr_based_features++] = msr_based_features_all[i];
801e459a 7144 }
043405e1
CO
7145}
7146
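/*
 * Illustrative userspace-side sketch (not part of this file): the
 * msrs_to_save and emulated_msrs lists assembled above are what the
 * KVM_GET_MSR_INDEX_LIST system ioctl reports, i.e. the MSRs userspace
 * is expected to save/restore across migration.  The usual two-call
 * pattern lets the first call fail with E2BIG to learn the count, then
 * fetches the indices.
 */
#include <linux/kvm.h>
#include <stdlib.h>
#include <sys/ioctl.h>

static struct kvm_msr_list *fetch_msr_index_list(int kvm_fd)
{
        struct kvm_msr_list probe = { .nmsrs = 0 };
        struct kvm_msr_list *list;

        /* First call is expected to fail but fill in the required count. */
        ioctl(kvm_fd, KVM_GET_MSR_INDEX_LIST, &probe);

        list = malloc(sizeof(*list) + probe.nmsrs * sizeof(__u32));
        if (!list)
                return NULL;

        list->nmsrs = probe.nmsrs;
        if (ioctl(kvm_fd, KVM_GET_MSR_INDEX_LIST, list)) {
                free(list);
                return NULL;
        }
        return list;
}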
bda9020e
MT
7147static int vcpu_mmio_write(struct kvm_vcpu *vcpu, gpa_t addr, int len,
7148 const void *v)
bbd9b64e 7149{
70252a10
AK
7150 int handled = 0;
7151 int n;
7152
7153 do {
7154 n = min(len, 8);
bce87cce 7155 if (!(lapic_in_kernel(vcpu) &&
e32edf4f
NN
7156 !kvm_iodevice_write(vcpu, &vcpu->arch.apic->dev, addr, n, v))
7157 && kvm_io_bus_write(vcpu, KVM_MMIO_BUS, addr, n, v))
70252a10
AK
7158 break;
7159 handled += n;
7160 addr += n;
7161 len -= n;
7162 v += n;
7163 } while (len);
bbd9b64e 7164
70252a10 7165 return handled;
bbd9b64e
CO
7166}
7167
bda9020e 7168static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v)
bbd9b64e 7169{
70252a10
AK
7170 int handled = 0;
7171 int n;
7172
7173 do {
7174 n = min(len, 8);
bce87cce 7175 if (!(lapic_in_kernel(vcpu) &&
e32edf4f
NN
7176 !kvm_iodevice_read(vcpu, &vcpu->arch.apic->dev,
7177 addr, n, v))
7178 && kvm_io_bus_read(vcpu, KVM_MMIO_BUS, addr, n, v))
70252a10 7179 break;
e39d200f 7180 trace_kvm_mmio(KVM_TRACE_MMIO_READ, n, addr, v);
70252a10
AK
7181 handled += n;
7182 addr += n;
7183 len -= n;
7184 v += n;
7185 } while (len);
bbd9b64e 7186
70252a10 7187 return handled;
bbd9b64e
CO
7188}
7189
c53da4f3
PB
7190void kvm_set_segment(struct kvm_vcpu *vcpu,
7191 struct kvm_segment *var, int seg)
2dafc6c2 7192{
b3646477 7193 static_call(kvm_x86_set_segment)(vcpu, var, seg);
2dafc6c2
GN
7194}
7195
7196void kvm_get_segment(struct kvm_vcpu *vcpu,
7197 struct kvm_segment *var, int seg)
7198{
b3646477 7199 static_call(kvm_x86_get_segment)(vcpu, var, seg);
2dafc6c2
GN
7200}
7201
5b22bbe7 7202gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u64 access,
54987b7a 7203 struct x86_exception *exception)
02f59dc9 7204{
1f5a21ee 7205 struct kvm_mmu *mmu = vcpu->arch.mmu;
02f59dc9 7206 gpa_t t_gpa;
02f59dc9
JR
7207
7208 BUG_ON(!mmu_is_nested(vcpu));
7209
7210 /* NPT walks are always user-walks */
7211 access |= PFERR_USER_MASK;
1f5a21ee 7212 t_gpa = mmu->gva_to_gpa(vcpu, mmu, gpa, access, exception);
02f59dc9
JR
7213
7214 return t_gpa;
7215}
7216
ab9ae313
AK
7217gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva,
7218 struct x86_exception *exception)
1871c602 7219{
1f5a21ee
LJ
7220 struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
7221
5b22bbe7 7222 u64 access = (static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0;
1f5a21ee 7223 return mmu->gva_to_gpa(vcpu, mmu, gva, access, exception);
1871c602 7224}
54f958cd 7225EXPORT_SYMBOL_GPL(kvm_mmu_gva_to_gpa_read);
1871c602 7226
ab9ae313
AK
7227gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva,
7228 struct x86_exception *exception)
1871c602 7229{
1f5a21ee
LJ
7230 struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
7231
5b22bbe7 7232 u64 access = (static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0;
1871c602 7233 access |= PFERR_WRITE_MASK;
1f5a21ee 7234 return mmu->gva_to_gpa(vcpu, mmu, gva, access, exception);
1871c602 7235}
54f958cd 7236EXPORT_SYMBOL_GPL(kvm_mmu_gva_to_gpa_write);
1871c602
GN
7237
 7238/* used to access any guest's mapped memory without checking CPL */
ab9ae313
AK
7239gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva,
7240 struct x86_exception *exception)
1871c602 7241{
1f5a21ee
LJ
7242 struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
7243
7244 return mmu->gva_to_gpa(vcpu, mmu, gva, 0, exception);
1871c602
GN
7245}
7246
7247static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes,
5b22bbe7 7248 struct kvm_vcpu *vcpu, u64 access,
bcc55cba 7249 struct x86_exception *exception)
bbd9b64e 7250{
1f5a21ee 7251 struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
bbd9b64e 7252 void *data = val;
10589a46 7253 int r = X86EMUL_CONTINUE;
bbd9b64e
CO
7254
7255 while (bytes) {
1f5a21ee 7256 gpa_t gpa = mmu->gva_to_gpa(vcpu, mmu, addr, access, exception);
bbd9b64e 7257 unsigned offset = addr & (PAGE_SIZE-1);
77c2002e 7258 unsigned toread = min(bytes, (unsigned)PAGE_SIZE - offset);
bbd9b64e
CO
7259 int ret;
7260
6e1d2a3f 7261 if (gpa == INVALID_GPA)
ab9ae313 7262 return X86EMUL_PROPAGATE_FAULT;
54bf36aa
PB
7263 ret = kvm_vcpu_read_guest_page(vcpu, gpa >> PAGE_SHIFT, data,
7264 offset, toread);
10589a46 7265 if (ret < 0) {
c3cd7ffa 7266 r = X86EMUL_IO_NEEDED;
10589a46
MT
7267 goto out;
7268 }
bbd9b64e 7269
77c2002e
IE
7270 bytes -= toread;
7271 data += toread;
7272 addr += toread;
bbd9b64e 7273 }
10589a46 7274out:
10589a46 7275 return r;
bbd9b64e 7276}
77c2002e 7277
1871c602 7278/* used for instruction fetching */
0f65dd70
AK
7279static int kvm_fetch_guest_virt(struct x86_emulate_ctxt *ctxt,
7280 gva_t addr, void *val, unsigned int bytes,
bcc55cba 7281 struct x86_exception *exception)
1871c602 7282{
0f65dd70 7283 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
1f5a21ee 7284 struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
5b22bbe7 7285 u64 access = (static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0;
44583cba
PB
7286 unsigned offset;
7287 int ret;
0f65dd70 7288
44583cba 7289 /* Inline kvm_read_guest_virt_helper for speed. */
1f5a21ee
LJ
7290 gpa_t gpa = mmu->gva_to_gpa(vcpu, mmu, addr, access|PFERR_FETCH_MASK,
7291 exception);
6e1d2a3f 7292 if (unlikely(gpa == INVALID_GPA))
44583cba
PB
7293 return X86EMUL_PROPAGATE_FAULT;
7294
7295 offset = addr & (PAGE_SIZE-1);
7296 if (WARN_ON(offset + bytes > PAGE_SIZE))
7297 bytes = (unsigned)PAGE_SIZE - offset;
54bf36aa
PB
7298 ret = kvm_vcpu_read_guest_page(vcpu, gpa >> PAGE_SHIFT, val,
7299 offset, bytes);
44583cba
PB
7300 if (unlikely(ret < 0))
7301 return X86EMUL_IO_NEEDED;
7302
7303 return X86EMUL_CONTINUE;
1871c602
GN
7304}
7305
ce14e868 7306int kvm_read_guest_virt(struct kvm_vcpu *vcpu,
0f65dd70 7307 gva_t addr, void *val, unsigned int bytes,
bcc55cba 7308 struct x86_exception *exception)
1871c602 7309{
5b22bbe7 7310 u64 access = (static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0;
0f65dd70 7311
353c0956
PB
7312 /*
7313 * FIXME: this should call handle_emulation_failure if X86EMUL_IO_NEEDED
7314 * is returned, but our callers are not ready for that and they blindly
7315 * call kvm_inject_page_fault. Ensure that they at least do not leak
7316 * uninitialized kernel stack memory into cr2 and error code.
7317 */
7318 memset(exception, 0, sizeof(*exception));
1871c602 7319 return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access,
bcc55cba 7320 exception);
1871c602 7321}
064aea77 7322EXPORT_SYMBOL_GPL(kvm_read_guest_virt);
1871c602 7323
ce14e868
PB
7324static int emulator_read_std(struct x86_emulate_ctxt *ctxt,
7325 gva_t addr, void *val, unsigned int bytes,
3c9fa24c 7326 struct x86_exception *exception, bool system)
1871c602 7327{
0f65dd70 7328 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
5b22bbe7 7329 u64 access = 0;
3c9fa24c 7330
4f4aa80e
LJ
7331 if (system)
7332 access |= PFERR_IMPLICIT_ACCESS;
7333 else if (static_call(kvm_x86_get_cpl)(vcpu) == 3)
3c9fa24c
PB
7334 access |= PFERR_USER_MASK;
7335
7336 return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access, exception);
1871c602
GN
7337}
7338
ce14e868 7339static int kvm_write_guest_virt_helper(gva_t addr, void *val, unsigned int bytes,
5b22bbe7 7340 struct kvm_vcpu *vcpu, u64 access,
ce14e868 7341 struct x86_exception *exception)
77c2002e 7342{
1f5a21ee 7343 struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
77c2002e
IE
7344 void *data = val;
7345 int r = X86EMUL_CONTINUE;
7346
7347 while (bytes) {
1f5a21ee 7348 gpa_t gpa = mmu->gva_to_gpa(vcpu, mmu, addr, access, exception);
77c2002e
IE
7349 unsigned offset = addr & (PAGE_SIZE-1);
7350 unsigned towrite = min(bytes, (unsigned)PAGE_SIZE - offset);
7351 int ret;
7352
6e1d2a3f 7353 if (gpa == INVALID_GPA)
ab9ae313 7354 return X86EMUL_PROPAGATE_FAULT;
54bf36aa 7355 ret = kvm_vcpu_write_guest(vcpu, gpa, data, towrite);
77c2002e 7356 if (ret < 0) {
c3cd7ffa 7357 r = X86EMUL_IO_NEEDED;
77c2002e
IE
7358 goto out;
7359 }
7360
7361 bytes -= towrite;
7362 data += towrite;
7363 addr += towrite;
7364 }
7365out:
7366 return r;
7367}
ce14e868
PB
7368
7369static int emulator_write_std(struct x86_emulate_ctxt *ctxt, gva_t addr, void *val,
3c9fa24c
PB
7370 unsigned int bytes, struct x86_exception *exception,
7371 bool system)
ce14e868
PB
7372{
7373 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
5b22bbe7 7374 u64 access = PFERR_WRITE_MASK;
3c9fa24c 7375
4f4aa80e
LJ
7376 if (system)
7377 access |= PFERR_IMPLICIT_ACCESS;
7378 else if (static_call(kvm_x86_get_cpl)(vcpu) == 3)
3c9fa24c 7379 access |= PFERR_USER_MASK;
ce14e868
PB
7380
7381 return kvm_write_guest_virt_helper(addr, val, bytes, vcpu,
3c9fa24c 7382 access, exception);
ce14e868
PB
7383}
7384
7385int kvm_write_guest_virt_system(struct kvm_vcpu *vcpu, gva_t addr, void *val,
7386 unsigned int bytes, struct x86_exception *exception)
7387{
c595ceee
PB
7388 /* kvm_write_guest_virt_system can pull in tons of pages. */
7389 vcpu->arch.l1tf_flush_l1d = true;
7390
ce14e868
PB
7391 return kvm_write_guest_virt_helper(addr, val, bytes, vcpu,
7392 PFERR_WRITE_MASK, exception);
7393}
6a4d7550 7394EXPORT_SYMBOL_GPL(kvm_write_guest_virt_system);
77c2002e 7395
4d31d9ef
SC
7396static int kvm_can_emulate_insn(struct kvm_vcpu *vcpu, int emul_type,
7397 void *insn, int insn_len)
7398{
7399 return static_call(kvm_x86_can_emulate_instruction)(vcpu, emul_type,
7400 insn, insn_len);
7401}
7402
082d06ed
WL
7403int handle_ud(struct kvm_vcpu *vcpu)
7404{
b3dc0695 7405 static const char kvm_emulate_prefix[] = { __KVM_EMULATE_PREFIX };
40aaa5b6 7406 int fep_flags = READ_ONCE(force_emulation_prefix);
6c86eedc 7407 int emul_type = EMULTYPE_TRAP_UD;
6c86eedc
WL
7408 char sig[5]; /* ud2; .ascii "kvm" */
7409 struct x86_exception e;
7410
4d31d9ef 7411 if (unlikely(!kvm_can_emulate_insn(vcpu, emul_type, NULL, 0)))
09e3e2a1
SC
7412 return 1;
7413
40aaa5b6 7414 if (fep_flags &&
3c9fa24c
PB
7415 kvm_read_guest_virt(vcpu, kvm_get_linear_rip(vcpu),
7416 sig, sizeof(sig), &e) == 0 &&
b3dc0695 7417 memcmp(sig, kvm_emulate_prefix, sizeof(sig)) == 0) {
40aaa5b6 7418 if (fep_flags & KVM_FEP_CLEAR_RFLAGS_RF)
d500e1ed 7419 kvm_set_rflags(vcpu, kvm_get_rflags(vcpu) & ~X86_EFLAGS_RF);
6c86eedc 7420 kvm_rip_write(vcpu, kvm_rip_read(vcpu) + sizeof(sig));
b4000606 7421 emul_type = EMULTYPE_TRAP_UD_FORCED;
6c86eedc 7422 }
082d06ed 7423
60fc3d02 7424 return kvm_emulate_instruction(vcpu, emul_type);
082d06ed
WL
7425}
7426EXPORT_SYMBOL_GPL(handle_ud);
7427
0f89b207
TL
7428static int vcpu_is_mmio_gpa(struct kvm_vcpu *vcpu, unsigned long gva,
7429 gpa_t gpa, bool write)
7430{
7431 /* For APIC access vmexit */
7432 if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
7433 return 1;
7434
7435 if (vcpu_match_mmio_gpa(vcpu, gpa)) {
7436 trace_vcpu_match_mmio(gva, gpa, write, true);
7437 return 1;
7438 }
7439
7440 return 0;
7441}
7442
af7cc7d1
XG
7443static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva,
7444 gpa_t *gpa, struct x86_exception *exception,
7445 bool write)
7446{
1f5a21ee 7447 struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
5b22bbe7 7448 u64 access = ((static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0)
97d64b78 7449 | (write ? PFERR_WRITE_MASK : 0);
af7cc7d1 7450
be94f6b7
HH
7451 /*
 7452	 * Currently PKRU is only applied to EPT-enabled guests, so
 7453	 * there is no pkey in the EPT page table for an L1 guest or in the
 7454	 * EPT shadow page table for an L2 guest.
7455 */
908b7d43
SC
7456 if (vcpu_match_mmio_gva(vcpu, gva) && (!is_paging(vcpu) ||
7457 !permission_fault(vcpu, vcpu->arch.walk_mmu,
7458 vcpu->arch.mmio_access, 0, access))) {
bebb106a
XG
7459 *gpa = vcpu->arch.mmio_gfn << PAGE_SHIFT |
7460 (gva & (PAGE_SIZE - 1));
4f022648 7461 trace_vcpu_match_mmio(gva, *gpa, write, false);
bebb106a
XG
7462 return 1;
7463 }
7464
1f5a21ee 7465 *gpa = mmu->gva_to_gpa(vcpu, mmu, gva, access, exception);
af7cc7d1 7466
6e1d2a3f 7467 if (*gpa == INVALID_GPA)
af7cc7d1
XG
7468 return -1;
7469
0f89b207 7470 return vcpu_is_mmio_gpa(vcpu, gva, *gpa, write);
af7cc7d1
XG
7471}
7472
3200f405 7473int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
bcc55cba 7474 const void *val, int bytes)
bbd9b64e
CO
7475{
7476 int ret;
7477
54bf36aa 7478 ret = kvm_vcpu_write_guest(vcpu, gpa, val, bytes);
9f811285 7479 if (ret < 0)
bbd9b64e 7480 return 0;
0eb05bf2 7481 kvm_page_track_write(vcpu, gpa, val, bytes);
bbd9b64e
CO
7482 return 1;
7483}
7484
77d197b2
XG
7485struct read_write_emulator_ops {
7486 int (*read_write_prepare)(struct kvm_vcpu *vcpu, void *val,
7487 int bytes);
7488 int (*read_write_emulate)(struct kvm_vcpu *vcpu, gpa_t gpa,
7489 void *val, int bytes);
7490 int (*read_write_mmio)(struct kvm_vcpu *vcpu, gpa_t gpa,
7491 int bytes, void *val);
7492 int (*read_write_exit_mmio)(struct kvm_vcpu *vcpu, gpa_t gpa,
7493 void *val, int bytes);
7494 bool write;
7495};
7496
7497static int read_prepare(struct kvm_vcpu *vcpu, void *val, int bytes)
7498{
7499 if (vcpu->mmio_read_completed) {
77d197b2 7500 trace_kvm_mmio(KVM_TRACE_MMIO_READ, bytes,
e39d200f 7501 vcpu->mmio_fragments[0].gpa, val);
77d197b2
XG
7502 vcpu->mmio_read_completed = 0;
7503 return 1;
7504 }
7505
7506 return 0;
7507}
7508
7509static int read_emulate(struct kvm_vcpu *vcpu, gpa_t gpa,
7510 void *val, int bytes)
7511{
54bf36aa 7512 return !kvm_vcpu_read_guest(vcpu, gpa, val, bytes);
77d197b2
XG
7513}
7514
7515static int write_emulate(struct kvm_vcpu *vcpu, gpa_t gpa,
7516 void *val, int bytes)
7517{
7518 return emulator_write_phys(vcpu, gpa, val, bytes);
7519}
7520
7521static int write_mmio(struct kvm_vcpu *vcpu, gpa_t gpa, int bytes, void *val)
7522{
e39d200f 7523 trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, val);
77d197b2
XG
7524 return vcpu_mmio_write(vcpu, gpa, bytes, val);
7525}
7526
7527static int read_exit_mmio(struct kvm_vcpu *vcpu, gpa_t gpa,
7528 void *val, int bytes)
7529{
e39d200f 7530 trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, bytes, gpa, NULL);
77d197b2
XG
7531 return X86EMUL_IO_NEEDED;
7532}
7533
7534static int write_exit_mmio(struct kvm_vcpu *vcpu, gpa_t gpa,
7535 void *val, int bytes)
7536{
f78146b0
AK
7537 struct kvm_mmio_fragment *frag = &vcpu->mmio_fragments[0];
7538
87da7e66 7539 memcpy(vcpu->run->mmio.data, frag->data, min(8u, frag->len));
77d197b2
XG
7540 return X86EMUL_CONTINUE;
7541}
7542
0fbe9b0b 7543static const struct read_write_emulator_ops read_emultor = {
77d197b2
XG
7544 .read_write_prepare = read_prepare,
7545 .read_write_emulate = read_emulate,
7546 .read_write_mmio = vcpu_mmio_read,
7547 .read_write_exit_mmio = read_exit_mmio,
7548};
7549
0fbe9b0b 7550static const struct read_write_emulator_ops write_emultor = {
77d197b2
XG
7551 .read_write_emulate = write_emulate,
7552 .read_write_mmio = write_mmio,
7553 .read_write_exit_mmio = write_exit_mmio,
7554 .write = true,
7555};
7556
22388a3c
XG
7557static int emulator_read_write_onepage(unsigned long addr, void *val,
7558 unsigned int bytes,
7559 struct x86_exception *exception,
7560 struct kvm_vcpu *vcpu,
0fbe9b0b 7561 const struct read_write_emulator_ops *ops)
bbd9b64e 7562{
af7cc7d1
XG
7563 gpa_t gpa;
7564 int handled, ret;
22388a3c 7565 bool write = ops->write;
f78146b0 7566 struct kvm_mmio_fragment *frag;
c9b8b07c 7567 struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
0f89b207
TL
7568
7569 /*
 7570	 * If the exit was due to an NPF we may already have a GPA.
 7571	 * If the GPA is present, use it to avoid the GVA-to-GPA table walk.
 7572	 * Note, this cannot be used on string operations since a string
 7573	 * operation using rep will only have the initial GPA from the NPF
 7574	 * that occurred.
7575 */
744e699c
SC
7576 if (ctxt->gpa_available && emulator_can_use_gpa(ctxt) &&
7577 (addr & ~PAGE_MASK) == (ctxt->gpa_val & ~PAGE_MASK)) {
7578 gpa = ctxt->gpa_val;
618232e2
BS
7579 ret = vcpu_is_mmio_gpa(vcpu, addr, gpa, write);
7580 } else {
7581 ret = vcpu_mmio_gva_to_gpa(vcpu, addr, &gpa, exception, write);
7582 if (ret < 0)
7583 return X86EMUL_PROPAGATE_FAULT;
0f89b207 7584 }
10589a46 7585
618232e2 7586 if (!ret && ops->read_write_emulate(vcpu, gpa, val, bytes))
bbd9b64e
CO
7587 return X86EMUL_CONTINUE;
7588
bbd9b64e
CO
7589 /*
7590 * Is this MMIO handled locally?
7591 */
22388a3c 7592 handled = ops->read_write_mmio(vcpu, gpa, bytes, val);
70252a10 7593 if (handled == bytes)
bbd9b64e 7594 return X86EMUL_CONTINUE;
bbd9b64e 7595
70252a10
AK
7596 gpa += handled;
7597 bytes -= handled;
7598 val += handled;
7599
87da7e66
XG
7600 WARN_ON(vcpu->mmio_nr_fragments >= KVM_MAX_MMIO_FRAGMENTS);
7601 frag = &vcpu->mmio_fragments[vcpu->mmio_nr_fragments++];
7602 frag->gpa = gpa;
7603 frag->data = val;
7604 frag->len = bytes;
f78146b0 7605 return X86EMUL_CONTINUE;
bbd9b64e
CO
7606}
7607
52eb5a6d
XL
7608static int emulator_read_write(struct x86_emulate_ctxt *ctxt,
7609 unsigned long addr,
22388a3c
XG
7610 void *val, unsigned int bytes,
7611 struct x86_exception *exception,
0fbe9b0b 7612 const struct read_write_emulator_ops *ops)
bbd9b64e 7613{
0f65dd70 7614 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
f78146b0
AK
7615 gpa_t gpa;
7616 int rc;
7617
7618 if (ops->read_write_prepare &&
7619 ops->read_write_prepare(vcpu, val, bytes))
7620 return X86EMUL_CONTINUE;
7621
7622 vcpu->mmio_nr_fragments = 0;
0f65dd70 7623
bbd9b64e
CO
7624 /* Crossing a page boundary? */
7625 if (((addr + bytes - 1) ^ addr) & PAGE_MASK) {
f78146b0 7626 int now;
bbd9b64e
CO
7627
7628 now = -addr & ~PAGE_MASK;
22388a3c
XG
7629 rc = emulator_read_write_onepage(addr, val, now, exception,
7630 vcpu, ops);
7631
bbd9b64e
CO
7632 if (rc != X86EMUL_CONTINUE)
7633 return rc;
7634 addr += now;
bac15531
NA
7635 if (ctxt->mode != X86EMUL_MODE_PROT64)
7636 addr = (u32)addr;
bbd9b64e
CO
7637 val += now;
7638 bytes -= now;
7639 }
22388a3c 7640
f78146b0
AK
7641 rc = emulator_read_write_onepage(addr, val, bytes, exception,
7642 vcpu, ops);
7643 if (rc != X86EMUL_CONTINUE)
7644 return rc;
7645
7646 if (!vcpu->mmio_nr_fragments)
7647 return rc;
7648
7649 gpa = vcpu->mmio_fragments[0].gpa;
7650
7651 vcpu->mmio_needed = 1;
7652 vcpu->mmio_cur_fragment = 0;
7653
87da7e66 7654 vcpu->run->mmio.len = min(8u, vcpu->mmio_fragments[0].len);
f78146b0
AK
7655 vcpu->run->mmio.is_write = vcpu->mmio_is_write = ops->write;
7656 vcpu->run->exit_reason = KVM_EXIT_MMIO;
7657 vcpu->run->mmio.phys_addr = gpa;
7658
7659 return ops->read_write_exit_mmio(vcpu, gpa, val, bytes);
22388a3c
XG
7660}
7661
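/*
 * Illustrative userspace-side sketch (not part of this file): each MMIO
 * fragment queued above eventually reaches userspace as KVM_EXIT_MMIO
 * with at most 8 bytes in run->mmio.  For a read, userspace must fill
 * run->mmio.data before the next KVM_RUN so the result can be fed back
 * to the emulator.  device_read()/device_write() are hypothetical
 * stand-ins for a real device-model dispatch.
 */
#include <linux/kvm.h>
#include <string.h>

/* Hypothetical device-model hooks; a real VMM would dispatch by address. */
static void device_write(__u64 gpa, const void *data, __u32 len) { }
static void device_read(__u64 gpa, void *data, __u32 len) { memset(data, 0xff, len); }

static void handle_mmio_exit(struct kvm_run *run)
{
        if (run->mmio.is_write)
                device_write(run->mmio.phys_addr, run->mmio.data, run->mmio.len);
        else
                device_read(run->mmio.phys_addr, run->mmio.data, run->mmio.len);
}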
7662static int emulator_read_emulated(struct x86_emulate_ctxt *ctxt,
7663 unsigned long addr,
7664 void *val,
7665 unsigned int bytes,
7666 struct x86_exception *exception)
7667{
7668 return emulator_read_write(ctxt, addr, val, bytes,
7669 exception, &read_emultor);
7670}
7671
52eb5a6d 7672static int emulator_write_emulated(struct x86_emulate_ctxt *ctxt,
22388a3c
XG
7673 unsigned long addr,
7674 const void *val,
7675 unsigned int bytes,
7676 struct x86_exception *exception)
7677{
7678 return emulator_read_write(ctxt, addr, (void *)val, bytes,
7679 exception, &write_emultor);
bbd9b64e 7680}
bbd9b64e 7681
1c2361f6
SC
7682#define emulator_try_cmpxchg_user(t, ptr, old, new) \
7683 (__try_cmpxchg_user((t __user *)(ptr), (t *)(old), *(t *)(new), efault ## t))
daea3e73 7684
0f65dd70
AK
7685static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt,
7686 unsigned long addr,
bbd9b64e
CO
7687 const void *old,
7688 const void *new,
7689 unsigned int bytes,
0f65dd70 7690 struct x86_exception *exception)
bbd9b64e 7691{
0f65dd70 7692 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
9de6fe3c 7693 u64 page_line_mask;
1c2361f6 7694 unsigned long hva;
daea3e73 7695 gpa_t gpa;
1c2361f6 7696 int r;
2bacc55c 7697
daea3e73
AK
7698 /* guests cmpxchg8b have to be emulated atomically */
7699 if (bytes > 8 || (bytes & (bytes - 1)))
7700 goto emul_write;
10589a46 7701
daea3e73 7702 gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, NULL);
2bacc55c 7703
6e1d2a3f 7704 if (gpa == INVALID_GPA ||
daea3e73
AK
7705 (gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
7706 goto emul_write;
2bacc55c 7707
9de6fe3c
XL
7708 /*
7709 * Emulate the atomic as a straight write to avoid #AC if SLD is
7710 * enabled in the host and the access splits a cache line.
7711 */
7712 if (boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT))
7713 page_line_mask = ~(cache_line_size() - 1);
7714 else
7715 page_line_mask = PAGE_MASK;
7716
7717 if (((gpa + bytes - 1) & page_line_mask) != (gpa & page_line_mask))
daea3e73 7718 goto emul_write;
72dc67a6 7719
1c2361f6 7720 hva = kvm_vcpu_gfn_to_hva(vcpu, gpa_to_gfn(gpa));
33fbe6be 7721 if (kvm_is_error_hva(hva))
c19b8bd6 7722 goto emul_write;
72dc67a6 7723
1c2361f6 7724 hva += offset_in_page(gpa);
42e35f80 7725
daea3e73
AK
7726 switch (bytes) {
7727 case 1:
1c2361f6 7728 r = emulator_try_cmpxchg_user(u8, hva, old, new);
daea3e73
AK
7729 break;
7730 case 2:
1c2361f6 7731 r = emulator_try_cmpxchg_user(u16, hva, old, new);
daea3e73
AK
7732 break;
7733 case 4:
1c2361f6 7734 r = emulator_try_cmpxchg_user(u32, hva, old, new);
daea3e73
AK
7735 break;
7736 case 8:
1c2361f6 7737 r = emulator_try_cmpxchg_user(u64, hva, old, new);
daea3e73
AK
7738 break;
7739 default:
7740 BUG();
2bacc55c 7741 }
42e35f80 7742
1c2361f6 7743 if (r < 0)
5d6c7de6 7744 return X86EMUL_UNHANDLEABLE;
1c2361f6 7745 if (r)
daea3e73
AK
7746 return X86EMUL_CMPXCHG_FAILED;
7747
0eb05bf2 7748 kvm_page_track_write(vcpu, gpa, new, bytes);
8f6abd06
GN
7749
7750 return X86EMUL_CONTINUE;
4a5f48f6 7751
3200f405 7752emul_write:
8d20bd63 7753 pr_warn_once("emulating exchange as write\n");
2bacc55c 7754
0f65dd70 7755 return emulator_write_emulated(ctxt, addr, new, bytes, exception);
bbd9b64e
CO
7756}
7757
6f6fbe98 7758static int emulator_pio_in_out(struct kvm_vcpu *vcpu, int size,
30d583fd 7759 unsigned short port, void *data,
6f6fbe98 7760 unsigned int count, bool in)
cf8f70bf 7761{
0f87ac23
PB
7762 unsigned i;
7763 int r;
cf8f70bf 7764
30d583fd 7765 WARN_ON_ONCE(vcpu->arch.pio.count);
0f87ac23
PB
7766 for (i = 0; i < count; i++) {
7767 if (in)
7768 r = kvm_io_bus_read(vcpu, KVM_PIO_BUS, port, size, data);
cbfc6c91 7769 else
0f87ac23 7770 r = kvm_io_bus_write(vcpu, KVM_PIO_BUS, port, size, data);
35ab3b77
PB
7771
7772 if (r) {
7773 if (i == 0)
7774 goto userspace_io;
7775
7776 /*
7777 * Userspace must have unregistered the device while PIO
0c05e10b 7778			 * was running.  Drop writes; reads return zeroes.
35ab3b77 7779 */
0c05e10b
PB
7780 if (in)
7781 memset(data, 0, size * (count - i));
cbfc6c91 7782 break;
35ab3b77
PB
7783 }
7784
0f87ac23 7785 data += size;
cbfc6c91 7786 }
0f87ac23 7787 return 1;
cf8f70bf 7788
0f87ac23 7789userspace_io:
cf8f70bf 7790 vcpu->arch.pio.port = port;
6f6fbe98 7791 vcpu->arch.pio.in = in;
0c05e10b 7792 vcpu->arch.pio.count = count;
cf8f70bf
GN
7793 vcpu->arch.pio.size = size;
7794
0c05e10b
PB
7795 if (in)
7796 memset(vcpu->arch.pio_data, 0, size * count);
7797 else
7798 memcpy(vcpu->arch.pio_data, data, size * count);
cf8f70bf
GN
7799
7800 vcpu->run->exit_reason = KVM_EXIT_IO;
6f6fbe98 7801 vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
cf8f70bf
GN
7802 vcpu->run->io.size = size;
7803 vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
7804 vcpu->run->io.count = count;
7805 vcpu->run->io.port = port;
cf8f70bf
GN
7806 return 0;
7807}
7808
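/*
 * Illustrative userspace-side sketch (not part of this file): when the
 * function above bails out to userspace it has filled kvm_run with
 * KVM_EXIT_IO, and the bytes to move live inside the shared kvm_run
 * mapping at io.data_offset.  The 0x3f8 "serial" port below is a
 * hypothetical example; error handling is elided.
 */
#include <linux/kvm.h>
#include <stdio.h>

static void handle_pio_exit(struct kvm_run *run)
{
        unsigned char *data = (unsigned char *)run + run->io.data_offset;
        unsigned int i;

        if (run->io.direction == KVM_EXIT_IO_OUT && run->io.port == 0x3f8) {
                for (i = 0; i < run->io.count; i++)
                        fwrite(data + i * run->io.size, run->io.size, 1, stdout);
        }
        /* For KVM_EXIT_IO_IN, userspace would write the data back instead. */
}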
f35cee4a
PB
7809static int emulator_pio_in(struct kvm_vcpu *vcpu, int size,
7810 unsigned short port, void *val, unsigned int count)
cf8f70bf 7811{
0c05e10b
PB
7812 int r = emulator_pio_in_out(vcpu, size, port, val, count, true);
7813 if (r)
7814 trace_kvm_pio(KVM_PIO_IN, port, size, count, val);
7815
7816 return r;
3b27de27 7817}
ca1d4a9e 7818
6b5efc93 7819static void complete_emulator_pio_in(struct kvm_vcpu *vcpu, void *val)
3b27de27 7820{
6b5efc93 7821 int size = vcpu->arch.pio.size;
0c05e10b 7822 unsigned int count = vcpu->arch.pio.count;
6b5efc93
PB
7823 memcpy(val, vcpu->arch.pio_data, size * count);
7824 trace_kvm_pio(KVM_PIO_IN, vcpu->arch.pio.port, size, count, vcpu->arch.pio_data);
3b27de27
PB
7825 vcpu->arch.pio.count = 0;
7826}
cf8f70bf 7827
f35cee4a
PB
7828static int emulator_pio_in_emulated(struct x86_emulate_ctxt *ctxt,
7829 int size, unsigned short port, void *val,
7830 unsigned int count)
3b27de27 7831{
f35cee4a 7832 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
3b27de27 7833 if (vcpu->arch.pio.count) {
d07898ea
SC
7834 /*
7835 * Complete a previous iteration that required userspace I/O.
7836 * Note, @count isn't guaranteed to match pio.count as userspace
7837 * can modify ECX before rerunning the vCPU. Ignore any such
7838 * shenanigans as KVM doesn't support modifying the rep count,
7839 * and the emulator ensures @count doesn't overflow the buffer.
7840 */
0c05e10b
PB
7841 complete_emulator_pio_in(vcpu, val);
7842 return 1;
cf8f70bf
GN
7843 }
7844
f35cee4a 7845 return emulator_pio_in(vcpu, size, port, val, count);
2e3bb4d8 7846}
6f6fbe98 7847
2e3bb4d8
SC
7848static int emulator_pio_out(struct kvm_vcpu *vcpu, int size,
7849 unsigned short port, const void *val,
7850 unsigned int count)
7851{
30d583fd 7852 trace_kvm_pio(KVM_PIO_OUT, port, size, count, val);
0c05e10b 7853 return emulator_pio_in_out(vcpu, size, port, (void *)val, count, false);
6f6fbe98
XG
7854}
7855
2e3bb4d8
SC
7856static int emulator_pio_out_emulated(struct x86_emulate_ctxt *ctxt,
7857 int size, unsigned short port,
7858 const void *val, unsigned int count)
7859{
7860 return emulator_pio_out(emul_to_vcpu(ctxt), size, port, val, count);
7861}
7862
bbd9b64e
CO
7863static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg)
7864{
b3646477 7865 return static_call(kvm_x86_get_segment_base)(vcpu, seg);
bbd9b64e
CO
7866}
7867
3cb16fe7 7868static void emulator_invlpg(struct x86_emulate_ctxt *ctxt, ulong address)
bbd9b64e 7869{
3cb16fe7 7870 kvm_mmu_invlpg(emul_to_vcpu(ctxt), address);
bbd9b64e
CO
7871}
7872
ae6a2375 7873static int kvm_emulate_wbinvd_noskip(struct kvm_vcpu *vcpu)
f5f48ee1
SY
7874{
7875 if (!need_emulate_wbinvd(vcpu))
7876 return X86EMUL_CONTINUE;
7877
b3646477 7878 if (static_call(kvm_x86_has_wbinvd_exit)()) {
2eec7343
JK
7879 int cpu = get_cpu();
7880
7881 cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask);
c2162e13 7882 on_each_cpu_mask(vcpu->arch.wbinvd_dirty_mask,
f5f48ee1 7883 wbinvd_ipi, NULL, 1);
2eec7343 7884 put_cpu();
f5f48ee1 7885 cpumask_clear(vcpu->arch.wbinvd_dirty_mask);
2eec7343
JK
7886 } else
7887 wbinvd();
f5f48ee1
SY
7888 return X86EMUL_CONTINUE;
7889}
5cb56059
JS
7890
7891int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu)
7892{
6affcbed
KH
7893 kvm_emulate_wbinvd_noskip(vcpu);
7894 return kvm_skip_emulated_instruction(vcpu);
5cb56059 7895}
f5f48ee1
SY
7896EXPORT_SYMBOL_GPL(kvm_emulate_wbinvd);
7897
5cb56059
JS
7898
7899
bcaf5cc5
AK
7900static void emulator_wbinvd(struct x86_emulate_ctxt *ctxt)
7901{
5cb56059 7902 kvm_emulate_wbinvd_noskip(emul_to_vcpu(ctxt));
bcaf5cc5
AK
7903}
7904
29d6ca41
PB
7905static void emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr,
7906 unsigned long *dest)
bbd9b64e 7907{
29d6ca41 7908 kvm_get_dr(emul_to_vcpu(ctxt), dr, dest);
bbd9b64e
CO
7909}
7910
52eb5a6d
XL
7911static int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr,
7912 unsigned long value)
bbd9b64e 7913{
996ff542 7915 return kvm_set_dr(emul_to_vcpu(ctxt), dr, value);
bbd9b64e
CO
7916}
7917
52a46617 7918static u64 mk_cr_64(u64 curr_cr, u32 new_val)
5fdbf976 7919{
52a46617 7920 return (curr_cr & ~((1ULL << 32) - 1)) | new_val;
5fdbf976
MT
7921}
7922
717746e3 7923static unsigned long emulator_get_cr(struct x86_emulate_ctxt *ctxt, int cr)
bbd9b64e 7924{
717746e3 7925 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
52a46617
GN
7926 unsigned long value;
7927
7928 switch (cr) {
7929 case 0:
7930 value = kvm_read_cr0(vcpu);
7931 break;
7932 case 2:
7933 value = vcpu->arch.cr2;
7934 break;
7935 case 3:
9f8fe504 7936 value = kvm_read_cr3(vcpu);
52a46617
GN
7937 break;
7938 case 4:
7939 value = kvm_read_cr4(vcpu);
7940 break;
7941 case 8:
7942 value = kvm_get_cr8(vcpu);
7943 break;
7944 default:
a737f256 7945 kvm_err("%s: unexpected cr %u\n", __func__, cr);
52a46617
GN
7946 return 0;
7947 }
7948
7949 return value;
7950}
7951
717746e3 7952static int emulator_set_cr(struct x86_emulate_ctxt *ctxt, int cr, ulong val)
52a46617 7953{
717746e3 7954 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
0f12244f
GN
7955 int res = 0;
7956
52a46617
GN
7957 switch (cr) {
7958 case 0:
49a9b07e 7959 res = kvm_set_cr0(vcpu, mk_cr_64(kvm_read_cr0(vcpu), val));
52a46617
GN
7960 break;
7961 case 2:
7962 vcpu->arch.cr2 = val;
7963 break;
7964 case 3:
2390218b 7965 res = kvm_set_cr3(vcpu, val);
52a46617
GN
7966 break;
7967 case 4:
a83b29c6 7968 res = kvm_set_cr4(vcpu, mk_cr_64(kvm_read_cr4(vcpu), val));
52a46617
GN
7969 break;
7970 case 8:
eea1cff9 7971 res = kvm_set_cr8(vcpu, val);
52a46617
GN
7972 break;
7973 default:
a737f256 7974 kvm_err("%s: unexpected cr %u\n", __func__, cr);
0f12244f 7975 res = -1;
52a46617 7976 }
0f12244f
GN
7977
7978 return res;
52a46617
GN
7979}
7980
717746e3 7981static int emulator_get_cpl(struct x86_emulate_ctxt *ctxt)
9c537244 7982{
b3646477 7983 return static_call(kvm_x86_get_cpl)(emul_to_vcpu(ctxt));
9c537244
GN
7984}
7985
4bff1e86 7986static void emulator_get_gdt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt)
2dafc6c2 7987{
b3646477 7988 static_call(kvm_x86_get_gdt)(emul_to_vcpu(ctxt), dt);
2dafc6c2
GN
7989}
7990
4bff1e86 7991static void emulator_get_idt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt)
160ce1f1 7992{
b3646477 7993 static_call(kvm_x86_get_idt)(emul_to_vcpu(ctxt), dt);
160ce1f1
MG
7994}
7995
1ac9d0cf
AK
7996static void emulator_set_gdt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt)
7997{
b3646477 7998 static_call(kvm_x86_set_gdt)(emul_to_vcpu(ctxt), dt);
1ac9d0cf
AK
7999}
8000
8001static void emulator_set_idt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt)
8002{
b3646477 8003 static_call(kvm_x86_set_idt)(emul_to_vcpu(ctxt), dt);
1ac9d0cf
AK
8004}
8005
4bff1e86
AK
8006static unsigned long emulator_get_cached_segment_base(
8007 struct x86_emulate_ctxt *ctxt, int seg)
5951c442 8008{
4bff1e86 8009 return get_segment_base(emul_to_vcpu(ctxt), seg);
5951c442
GN
8010}
8011
1aa36616
AK
8012static bool emulator_get_segment(struct x86_emulate_ctxt *ctxt, u16 *selector,
8013 struct desc_struct *desc, u32 *base3,
8014 int seg)
2dafc6c2
GN
8015{
8016 struct kvm_segment var;
8017
4bff1e86 8018 kvm_get_segment(emul_to_vcpu(ctxt), &var, seg);
1aa36616 8019 *selector = var.selector;
2dafc6c2 8020
378a8b09
GN
8021 if (var.unusable) {
8022 memset(desc, 0, sizeof(*desc));
f0367ee1
RK
8023 if (base3)
8024 *base3 = 0;
2dafc6c2 8025 return false;
378a8b09 8026 }
2dafc6c2
GN
8027
8028 if (var.g)
8029 var.limit >>= 12;
8030 set_desc_limit(desc, var.limit);
8031 set_desc_base(desc, (unsigned long)var.base);
5601d05b
GN
8032#ifdef CONFIG_X86_64
8033 if (base3)
8034 *base3 = var.base >> 32;
8035#endif
2dafc6c2
GN
8036 desc->type = var.type;
8037 desc->s = var.s;
8038 desc->dpl = var.dpl;
8039 desc->p = var.present;
8040 desc->avl = var.avl;
8041 desc->l = var.l;
8042 desc->d = var.db;
8043 desc->g = var.g;
8044
8045 return true;
8046}
8047
1aa36616
AK
8048static void emulator_set_segment(struct x86_emulate_ctxt *ctxt, u16 selector,
8049 struct desc_struct *desc, u32 base3,
8050 int seg)
2dafc6c2 8051{
4bff1e86 8052 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
2dafc6c2
GN
8053 struct kvm_segment var;
8054
1aa36616 8055 var.selector = selector;
2dafc6c2 8056 var.base = get_desc_base(desc);
5601d05b
GN
8057#ifdef CONFIG_X86_64
8058 var.base |= ((u64)base3) << 32;
8059#endif
2dafc6c2
GN
8060 var.limit = get_desc_limit(desc);
8061 if (desc->g)
8062 var.limit = (var.limit << 12) | 0xfff;
8063 var.type = desc->type;
2dafc6c2
GN
8064 var.dpl = desc->dpl;
8065 var.db = desc->d;
8066 var.s = desc->s;
8067 var.l = desc->l;
8068 var.g = desc->g;
8069 var.avl = desc->avl;
8070 var.present = desc->p;
8071 var.unusable = !var.present;
8072 var.padding = 0;
8073
8074 kvm_set_segment(vcpu, &var, seg);
8075 return;
8076}
8077
ac8d6cad
HW
8078static int emulator_get_msr_with_filter(struct x86_emulate_ctxt *ctxt,
8079 u32 msr_index, u64 *pdata)
717746e3 8080{
1ae09954
AG
8081 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
8082 int r;
8083
ac8d6cad 8084 r = kvm_get_msr_with_filter(vcpu, msr_index, pdata);
36d546d5
HW
8085 if (r < 0)
8086 return X86EMUL_UNHANDLEABLE;
1ae09954 8087
36d546d5
HW
8088 if (r) {
8089 if (kvm_msr_user_space(vcpu, msr_index, KVM_EXIT_X86_RDMSR, 0,
8090 complete_emulated_rdmsr, r))
8091 return X86EMUL_IO_NEEDED;
794663e1
HW
8092
8093 trace_kvm_msr_read_ex(msr_index);
36d546d5 8094 return X86EMUL_PROPAGATE_FAULT;
1ae09954
AG
8095 }
8096
794663e1 8097 trace_kvm_msr_read(msr_index, *pdata);
36d546d5 8098 return X86EMUL_CONTINUE;
717746e3
AK
8099}
8100
ac8d6cad
HW
8101static int emulator_set_msr_with_filter(struct x86_emulate_ctxt *ctxt,
8102 u32 msr_index, u64 data)
717746e3 8103{
1ae09954
AG
8104 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
8105 int r;
8106
ac8d6cad 8107 r = kvm_set_msr_with_filter(vcpu, msr_index, data);
36d546d5
HW
8108 if (r < 0)
8109 return X86EMUL_UNHANDLEABLE;
1ae09954 8110
36d546d5
HW
8111 if (r) {
8112 if (kvm_msr_user_space(vcpu, msr_index, KVM_EXIT_X86_WRMSR, data,
8113 complete_emulated_msr_access, r))
8114 return X86EMUL_IO_NEEDED;
794663e1
HW
8115
8116 trace_kvm_msr_write_ex(msr_index, data);
36d546d5 8117 return X86EMUL_PROPAGATE_FAULT;
1ae09954
AG
8118 }
8119
794663e1 8120 trace_kvm_msr_write(msr_index, data);
36d546d5 8121 return X86EMUL_CONTINUE;
717746e3
AK
8122}
8123
ac8d6cad
HW
8124static int emulator_get_msr(struct x86_emulate_ctxt *ctxt,
8125 u32 msr_index, u64 *pdata)
8126{
8127 return kvm_get_msr(emul_to_vcpu(ctxt), msr_index, pdata);
8128}
8129
67f4d428
NA
8130static int emulator_check_pmc(struct x86_emulate_ctxt *ctxt,
8131 u32 pmc)
8132{
e6cd31f1
JM
8133 if (kvm_pmu_is_valid_rdpmc_ecx(emul_to_vcpu(ctxt), pmc))
8134 return 0;
8135 return -EINVAL;
67f4d428
NA
8136}
8137
222d21aa
AK
8138static int emulator_read_pmc(struct x86_emulate_ctxt *ctxt,
8139 u32 pmc, u64 *pdata)
8140{
c6702c9d 8141 return kvm_pmu_rdpmc(emul_to_vcpu(ctxt), pmc, pdata);
222d21aa
AK
8142}
8143
6c3287f7
AK
8144static void emulator_halt(struct x86_emulate_ctxt *ctxt)
8145{
8146 emul_to_vcpu(ctxt)->arch.halt_request = 1;
8147}
8148
2953538e 8149static int emulator_intercept(struct x86_emulate_ctxt *ctxt,
8a76d7f2 8150 struct x86_instruction_info *info,
c4f035c6
AK
8151 enum x86_intercept_stage stage)
8152{
b3646477 8153 return static_call(kvm_x86_check_intercept)(emul_to_vcpu(ctxt), info, stage,
21f1b8f2 8154 &ctxt->exception);
c4f035c6
AK
8155}
8156
e911eb3b 8157static bool emulator_get_cpuid(struct x86_emulate_ctxt *ctxt,
f91af517
SC
8158 u32 *eax, u32 *ebx, u32 *ecx, u32 *edx,
8159 bool exact_only)
bdb42f5a 8160{
f91af517 8161 return kvm_cpuid(emul_to_vcpu(ctxt), eax, ebx, ecx, edx, exact_only);
bdb42f5a
SB
8162}
8163
5ae78e95
SC
8164static bool emulator_guest_has_long_mode(struct x86_emulate_ctxt *ctxt)
8165{
8166 return guest_cpuid_has(emul_to_vcpu(ctxt), X86_FEATURE_LM);
8167}
8168
8169static bool emulator_guest_has_movbe(struct x86_emulate_ctxt *ctxt)
8170{
8171 return guest_cpuid_has(emul_to_vcpu(ctxt), X86_FEATURE_MOVBE);
8172}
8173
8174static bool emulator_guest_has_fxsr(struct x86_emulate_ctxt *ctxt)
8175{
8176 return guest_cpuid_has(emul_to_vcpu(ctxt), X86_FEATURE_FXSR);
8177}
8178
a836839c
HW
8179static bool emulator_guest_has_rdpid(struct x86_emulate_ctxt *ctxt)
8180{
8181 return guest_cpuid_has(emul_to_vcpu(ctxt), X86_FEATURE_RDPID);
8182}
8183
dd856efa
AK
8184static ulong emulator_read_gpr(struct x86_emulate_ctxt *ctxt, unsigned reg)
8185{
27b4a9c4 8186 return kvm_register_read_raw(emul_to_vcpu(ctxt), reg);
dd856efa
AK
8187}
8188
8189static void emulator_write_gpr(struct x86_emulate_ctxt *ctxt, unsigned reg, ulong val)
8190{
27b4a9c4 8191 kvm_register_write_raw(emul_to_vcpu(ctxt), reg, val);
dd856efa
AK
8192}
8193
801806d9
NA
8194static void emulator_set_nmi_mask(struct x86_emulate_ctxt *ctxt, bool masked)
8195{
b3646477 8196 static_call(kvm_x86_set_nmi_mask)(emul_to_vcpu(ctxt), masked);
801806d9
NA
8197}
8198
32e69f23 8199static bool emulator_is_smm(struct x86_emulate_ctxt *ctxt)
6ed071f0 8200{
32e69f23
ML
8201 return is_smm(emul_to_vcpu(ctxt));
8202}
8203
8204static bool emulator_is_guest_mode(struct x86_emulate_ctxt *ctxt)
6ed071f0 8205{
32e69f23 8206 return is_guest_mode(emul_to_vcpu(ctxt));
6ed071f0
LP
8207}
8208
4b8e1b32
PB
8209#ifndef CONFIG_KVM_SMM
8210static int emulator_leave_smm(struct x86_emulate_ctxt *ctxt)
6ed071f0 8211{
4b8e1b32
PB
8212 WARN_ON_ONCE(1);
8213 return X86EMUL_UNHANDLEABLE;
0234bf88 8214}
4b8e1b32 8215#endif
0234bf88 8216
25b17226
SC
8217static void emulator_triple_fault(struct x86_emulate_ctxt *ctxt)
8218{
8219 kvm_make_request(KVM_REQ_TRIPLE_FAULT, emul_to_vcpu(ctxt));
8220}
8221
02d4160f
VK
8222static int emulator_set_xcr(struct x86_emulate_ctxt *ctxt, u32 index, u64 xcr)
8223{
8224 return __kvm_set_xcr(emul_to_vcpu(ctxt), index, xcr);
8225}
8226
1cca2f8c
SC
8227static void emulator_vm_bugged(struct x86_emulate_ctxt *ctxt)
8228{
8229 struct kvm *kvm = emul_to_vcpu(ctxt)->kvm;
8230
8231 if (!kvm->vm_bugged)
8232 kvm_vm_bugged(kvm);
8233}
8234
0225fb50 8235static const struct x86_emulate_ops emulate_ops = {
1cca2f8c 8236 .vm_bugged = emulator_vm_bugged,
dd856efa
AK
8237 .read_gpr = emulator_read_gpr,
8238 .write_gpr = emulator_write_gpr,
ce14e868
PB
8239 .read_std = emulator_read_std,
8240 .write_std = emulator_write_std,
1871c602 8241 .fetch = kvm_fetch_guest_virt,
bbd9b64e
CO
8242 .read_emulated = emulator_read_emulated,
8243 .write_emulated = emulator_write_emulated,
8244 .cmpxchg_emulated = emulator_cmpxchg_emulated,
3cb16fe7 8245 .invlpg = emulator_invlpg,
cf8f70bf
GN
8246 .pio_in_emulated = emulator_pio_in_emulated,
8247 .pio_out_emulated = emulator_pio_out_emulated,
1aa36616
AK
8248 .get_segment = emulator_get_segment,
8249 .set_segment = emulator_set_segment,
5951c442 8250 .get_cached_segment_base = emulator_get_cached_segment_base,
2dafc6c2 8251 .get_gdt = emulator_get_gdt,
160ce1f1 8252 .get_idt = emulator_get_idt,
1ac9d0cf
AK
8253 .set_gdt = emulator_set_gdt,
8254 .set_idt = emulator_set_idt,
52a46617
GN
8255 .get_cr = emulator_get_cr,
8256 .set_cr = emulator_set_cr,
9c537244 8257 .cpl = emulator_get_cpl,
35aa5375
GN
8258 .get_dr = emulator_get_dr,
8259 .set_dr = emulator_set_dr,
ac8d6cad
HW
8260 .set_msr_with_filter = emulator_set_msr_with_filter,
8261 .get_msr_with_filter = emulator_get_msr_with_filter,
717746e3 8262 .get_msr = emulator_get_msr,
67f4d428 8263 .check_pmc = emulator_check_pmc,
222d21aa 8264 .read_pmc = emulator_read_pmc,
6c3287f7 8265 .halt = emulator_halt,
bcaf5cc5 8266 .wbinvd = emulator_wbinvd,
d6aa1000 8267 .fix_hypercall = emulator_fix_hypercall,
c4f035c6 8268 .intercept = emulator_intercept,
bdb42f5a 8269 .get_cpuid = emulator_get_cpuid,
5ae78e95
SC
8270 .guest_has_long_mode = emulator_guest_has_long_mode,
8271 .guest_has_movbe = emulator_guest_has_movbe,
8272 .guest_has_fxsr = emulator_guest_has_fxsr,
a836839c 8273 .guest_has_rdpid = emulator_guest_has_rdpid,
801806d9 8274 .set_nmi_mask = emulator_set_nmi_mask,
32e69f23
ML
8275 .is_smm = emulator_is_smm,
8276 .is_guest_mode = emulator_is_guest_mode,
ecc513e5 8277 .leave_smm = emulator_leave_smm,
25b17226 8278 .triple_fault = emulator_triple_fault,
02d4160f 8279 .set_xcr = emulator_set_xcr,
bbd9b64e
CO
8280};
8281
95cb2295
GN
8282static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask)
8283{
b3646477 8284 u32 int_shadow = static_call(kvm_x86_get_interrupt_shadow)(vcpu);
95cb2295
GN
8285 /*
 8286	 * An sti; sti; sequence only disables interrupts for the first
 8287	 * instruction. So, if the last instruction, be it emulated or
 8288	 * not, left the system with the INT_STI flag enabled, it
 8289	 * means that the last instruction was an sti. We should not
 8290	 * leave the flag on in this case. The same goes for mov ss.
8291 */
37ccdcbe
PB
8292 if (int_shadow & mask)
8293 mask = 0;
6addfc42 8294 if (unlikely(int_shadow || mask)) {
b3646477 8295 static_call(kvm_x86_set_interrupt_shadow)(vcpu, mask);
6addfc42
PB
8296 if (!mask)
8297 kvm_make_request(KVM_REQ_EVENT, vcpu);
8298 }
95cb2295
GN
8299}
8300
7709aba8 8301static void inject_emulated_exception(struct kvm_vcpu *vcpu)
54b8486f 8302{
c9b8b07c 8303 struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
ef54bcfe 8304
7709aba8
SC
8305 if (ctxt->exception.vector == PF_VECTOR)
8306 kvm_inject_emulated_page_fault(vcpu, &ctxt->exception);
8307 else if (ctxt->exception.error_code_valid)
da9cb575
AK
8308 kvm_queue_exception_e(vcpu, ctxt->exception.vector,
8309 ctxt->exception.error_code);
54b8486f 8310 else
da9cb575 8311 kvm_queue_exception(vcpu, ctxt->exception.vector);
54b8486f
GN
8312}
8313
c9b8b07c
SC
8314static struct x86_emulate_ctxt *alloc_emulate_ctxt(struct kvm_vcpu *vcpu)
8315{
8316 struct x86_emulate_ctxt *ctxt;
8317
8318 ctxt = kmem_cache_zalloc(x86_emulator_cache, GFP_KERNEL_ACCOUNT);
8319 if (!ctxt) {
8d20bd63 8320 pr_err("failed to allocate vcpu's emulator\n");
c9b8b07c
SC
8321 return NULL;
8322 }
8323
8324 ctxt->vcpu = vcpu;
8325 ctxt->ops = &emulate_ops;
8326 vcpu->arch.emulate_ctxt = ctxt;
8327
8328 return ctxt;
8329}
8330
8ec4722d
MG
8331static void init_emulate_ctxt(struct kvm_vcpu *vcpu)
8332{
c9b8b07c 8333 struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
8ec4722d
MG
8334 int cs_db, cs_l;
8335
b3646477 8336 static_call(kvm_x86_get_cs_db_l_bits)(vcpu, &cs_db, &cs_l);
8ec4722d 8337
744e699c 8338 ctxt->gpa_available = false;
adf52235 8339 ctxt->eflags = kvm_get_rflags(vcpu);
c8401dda
PB
8340 ctxt->tf = (ctxt->eflags & X86_EFLAGS_TF) != 0;
8341
adf52235
TY
8342 ctxt->eip = kvm_rip_read(vcpu);
8343 ctxt->mode = (!is_protmode(vcpu)) ? X86EMUL_MODE_REAL :
8344 (ctxt->eflags & X86_EFLAGS_VM) ? X86EMUL_MODE_VM86 :
42bf549f 8345 (cs_l && is_long_mode(vcpu)) ? X86EMUL_MODE_PROT64 :
adf52235
TY
8346 cs_db ? X86EMUL_MODE_PROT32 :
8347 X86EMUL_MODE_PROT16;
da6393cd
WL
8348 ctxt->interruptibility = 0;
8349 ctxt->have_exception = false;
8350 ctxt->exception.vector = -1;
8351 ctxt->perm_ok = false;
8352
dd856efa 8353 init_decode_cache(ctxt);
7ae441ea 8354 vcpu->arch.emulate_regs_need_sync_from_vcpu = false;
8ec4722d
MG
8355}
8356
9497e1f2 8357void kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip)
63995653 8358{
c9b8b07c 8359 struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
63995653
MG
8360 int ret;
8361
8362 init_emulate_ctxt(vcpu);
8363
9dac77fa
AK
8364 ctxt->op_bytes = 2;
8365 ctxt->ad_bytes = 2;
8366 ctxt->_eip = ctxt->eip + inc_eip;
9d74191a 8367 ret = emulate_int_real(ctxt, irq);
63995653 8368
9497e1f2
SC
8369 if (ret != X86EMUL_CONTINUE) {
8370 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
8371 } else {
8372 ctxt->eip = ctxt->_eip;
8373 kvm_rip_write(vcpu, ctxt->eip);
8374 kvm_set_rflags(vcpu, ctxt->eflags);
8375 }
63995653
MG
8376}
8377EXPORT_SYMBOL_GPL(kvm_inject_realmode_interrupt);
8378
e615e355
DE
8379static void prepare_emulation_failure_exit(struct kvm_vcpu *vcpu, u64 *data,
8380 u8 ndata, u8 *insn_bytes, u8 insn_size)
19238e75 8381{
19238e75 8382 struct kvm_run *run = vcpu->run;
e615e355
DE
8383 u64 info[5];
8384 u8 info_start;
8385
8386 /*
8387 * Zero the whole array used to retrieve the exit info, as casting to
8388 * u32 for select entries will leave some chunks uninitialized.
8389 */
8390 memset(&info, 0, sizeof(info));
8391
8392 static_call(kvm_x86_get_exit_info)(vcpu, (u32 *)&info[0], &info[1],
8393 &info[2], (u32 *)&info[3],
8394 (u32 *)&info[4]);
19238e75
AL
8395
8396 run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
8397 run->emulation_failure.suberror = KVM_INTERNAL_ERROR_EMULATION;
e615e355
DE
8398
8399 /*
8400 * There's currently space for 13 entries, but 5 are used for the exit
8401 * reason and info. Restrict to 4 to reduce the maintenance burden
8402 * when expanding kvm_run.emulation_failure in the future.
8403 */
8404 if (WARN_ON_ONCE(ndata > 4))
8405 ndata = 4;
8406
8407 /* Always include the flags as a 'data' entry. */
8408 info_start = 1;
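/*
 * Layout note (derived from the code below): the emulation_failure fields
 * overlay internal.data[], so data[0] holds the flags word, data[1..2]
 * optionally hold insn_size plus insn_bytes, the five exit-info words
 * follow, and the caller-provided entries come last; ndata counts every
 * u64 slot that is populated.
 */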
19238e75
AL
8409 run->emulation_failure.flags = 0;
8410
8411 if (insn_size) {
e615e355
DE
8412 BUILD_BUG_ON((sizeof(run->emulation_failure.insn_size) +
8413 sizeof(run->emulation_failure.insn_bytes) != 16));
8414 info_start += 2;
19238e75
AL
8415 run->emulation_failure.flags |=
8416 KVM_INTERNAL_ERROR_EMULATION_FLAG_INSTRUCTION_BYTES;
8417 run->emulation_failure.insn_size = insn_size;
8418 memset(run->emulation_failure.insn_bytes, 0x90,
8419 sizeof(run->emulation_failure.insn_bytes));
e615e355 8420 memcpy(run->emulation_failure.insn_bytes, insn_bytes, insn_size);
19238e75 8421 }
e615e355
DE
8422
8423 memcpy(&run->internal.data[info_start], info, sizeof(info));
8424 memcpy(&run->internal.data[info_start + ARRAY_SIZE(info)], data,
8425 ndata * sizeof(data[0]));
8426
8427 run->emulation_failure.ndata = info_start + ARRAY_SIZE(info) + ndata;
19238e75
AL
8428}
8429
e615e355
DE
8430static void prepare_emulation_ctxt_failure_exit(struct kvm_vcpu *vcpu)
8431{
8432 struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
8433
8434 prepare_emulation_failure_exit(vcpu, NULL, 0, ctxt->fetch.data,
8435 ctxt->fetch.end - ctxt->fetch.data);
8436}
8437
8438void __kvm_prepare_emulation_failure_exit(struct kvm_vcpu *vcpu, u64 *data,
8439 u8 ndata)
8440{
8441 prepare_emulation_failure_exit(vcpu, data, ndata, NULL, 0);
19238e75 8442}
e615e355
DE
8443EXPORT_SYMBOL_GPL(__kvm_prepare_emulation_failure_exit);
8444
8445void kvm_prepare_emulation_failure_exit(struct kvm_vcpu *vcpu)
8446{
8447 __kvm_prepare_emulation_failure_exit(vcpu, NULL, 0);
8448}
8449EXPORT_SYMBOL_GPL(kvm_prepare_emulation_failure_exit);
19238e75 8450
e2366171 8451static int handle_emulation_failure(struct kvm_vcpu *vcpu, int emulation_type)
6d77dbfc 8452{
19238e75
AL
8453 struct kvm *kvm = vcpu->kvm;
8454
6d77dbfc
GN
8455 ++vcpu->stat.insn_emulation_fail;
8456 trace_kvm_emulate_insn_failed(vcpu);
e2366171 8457
42cbf068
SC
8458 if (emulation_type & EMULTYPE_VMWARE_GP) {
8459 kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
60fc3d02 8460 return 1;
42cbf068 8461 }
e2366171 8462
19238e75
AL
8463 if (kvm->arch.exit_on_emulation_error ||
8464 (emulation_type & EMULTYPE_SKIP)) {
e615e355 8465 prepare_emulation_ctxt_failure_exit(vcpu);
60fc3d02 8466 return 0;
738fece4
SC
8467 }
8468
22da61c9
SC
8469 kvm_queue_exception(vcpu, UD_VECTOR);
8470
b3646477 8471 if (!is_guest_mode(vcpu) && static_call(kvm_x86_get_cpl)(vcpu) == 0) {
e615e355 8472 prepare_emulation_ctxt_failure_exit(vcpu);
60fc3d02 8473 return 0;
fc3a9157 8474 }
e2366171 8475
60fc3d02 8476 return 1;
6d77dbfc
GN
8477}
8478
736c291c 8479static bool reexecute_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
991eebf9 8480 int emulation_type)
a6f177ef 8481{
736c291c 8482 gpa_t gpa = cr2_or_gpa;
ba049e93 8483 kvm_pfn_t pfn;
a6f177ef 8484
92daa48b 8485 if (!(emulation_type & EMULTYPE_ALLOW_RETRY_PF))
991eebf9
GN
8486 return false;
8487
92daa48b
SC
8488 if (WARN_ON_ONCE(is_guest_mode(vcpu)) ||
8489 WARN_ON_ONCE(!(emulation_type & EMULTYPE_PF)))
6c3dfeb6
SC
8490 return false;
8491
347a0d0d 8492 if (!vcpu->arch.mmu->root_role.direct) {
95b3cf69
XG
8493 /*
8494 * Write permission should be allowed since only
 8495 * write access needs to be emulated.
8496 */
736c291c 8497 gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2_or_gpa, NULL);
a6f177ef 8498
95b3cf69
XG
8499 /*
 8500 * If the mapping is invalid in the guest, let the CPU retry
 8501 * it to generate a fault.
8502 */
6e1d2a3f 8503 if (gpa == INVALID_GPA)
95b3cf69
XG
8504 return true;
8505 }
a6f177ef 8506
8e3d9d06
XG
8507 /*
8508 * Do not retry the unhandleable instruction if it faults on the
8509 * readonly host memory, otherwise it will goto a infinite loop:
8510 * retry instruction -> write #PF -> emulation fail -> retry
8511 * instruction -> ...
8512 */
8513 pfn = gfn_to_pfn(vcpu->kvm, gpa_to_gfn(gpa));
95b3cf69
XG
8514
8515 /*
 8516 * If the instruction failed on the error pfn, it cannot be fixed;
8517 * report the error to userspace.
8518 */
8519 if (is_error_noslot_pfn(pfn))
8520 return false;
8521
8522 kvm_release_pfn_clean(pfn);
8523
8524 /* The instructions are well-emulated on direct mmu. */
347a0d0d 8525 if (vcpu->arch.mmu->root_role.direct) {
95b3cf69
XG
8526 unsigned int indirect_shadow_pages;
8527
531810ca 8528 write_lock(&vcpu->kvm->mmu_lock);
95b3cf69 8529 indirect_shadow_pages = vcpu->kvm->arch.indirect_shadow_pages;
531810ca 8530 write_unlock(&vcpu->kvm->mmu_lock);
95b3cf69
XG
8531
8532 if (indirect_shadow_pages)
8533 kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa));
8534
a6f177ef 8535 return true;
8e3d9d06 8536 }
a6f177ef 8537
95b3cf69
XG
8538 /*
8539 * if emulation was due to access to shadowed page table
8540 * and it failed try to unshadow page and re-enter the
8541 * guest to let CPU execute the instruction.
8542 */
8543 kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa));
93c05d3e
XG
8544
8545 /*
 8546 * If the access faults on its page table, it cannot
 8547 * be fixed by unprotecting the shadow page, and it should
8548 * be reported to userspace.
8549 */
258d985f 8550 return !(emulation_type & EMULTYPE_WRITE_PF_TO_SP);
a6f177ef
GN
8551}
8552
1cb3f3ae 8553static bool retry_instruction(struct x86_emulate_ctxt *ctxt,
736c291c 8554 gpa_t cr2_or_gpa, int emulation_type)
1cb3f3ae
XG
8555{
8556 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
736c291c 8557 unsigned long last_retry_eip, last_retry_addr, gpa = cr2_or_gpa;
1cb3f3ae
XG
8558
8559 last_retry_eip = vcpu->arch.last_retry_eip;
8560 last_retry_addr = vcpu->arch.last_retry_addr;
8561
8562 /*
 8563 * If the emulation was caused by a #PF and the faulting instruction is
 8564 * not a page-table-writing instruction, the VM-exit was caused by
 8565 * shadow page protection; we can zap the shadow page and retry the
 8566 * instruction directly.
 8567 *
 8568 * Note: if the guest uses a non-page-table modifying instruction
 8569 * on the PDE that points to the instruction, then we will unmap
 8570 * the instruction and go into an infinite loop. So, we cache the
 8571 * last retried eip and the last fault address; if we see the same eip
 8572 * and address again, we can break out of the potential infinite
8573 * loop.
8574 */
8575 vcpu->arch.last_retry_eip = vcpu->arch.last_retry_addr = 0;
8576
92daa48b 8577 if (!(emulation_type & EMULTYPE_ALLOW_RETRY_PF))
1cb3f3ae
XG
8578 return false;
8579
92daa48b
SC
8580 if (WARN_ON_ONCE(is_guest_mode(vcpu)) ||
8581 WARN_ON_ONCE(!(emulation_type & EMULTYPE_PF)))
6c3dfeb6
SC
8582 return false;
8583
1cb3f3ae
XG
8584 if (x86_page_table_writing_insn(ctxt))
8585 return false;
8586
736c291c 8587 if (ctxt->eip == last_retry_eip && last_retry_addr == cr2_or_gpa)
1cb3f3ae
XG
8588 return false;
8589
8590 vcpu->arch.last_retry_eip = ctxt->eip;
736c291c 8591 vcpu->arch.last_retry_addr = cr2_or_gpa;
1cb3f3ae 8592
347a0d0d 8593 if (!vcpu->arch.mmu->root_role.direct)
736c291c 8594 gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2_or_gpa, NULL);
1cb3f3ae 8595
22368028 8596 kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa));
1cb3f3ae
XG
8597
8598 return true;
8599}
8600
716d51ab
GN
8601static int complete_emulated_mmio(struct kvm_vcpu *vcpu);
8602static int complete_emulated_pio(struct kvm_vcpu *vcpu);
8603
4a1e10d5
PB
8604static int kvm_vcpu_check_hw_bp(unsigned long addr, u32 type, u32 dr7,
8605 unsigned long *db)
8606{
8607 u32 dr6 = 0;
8608 int i;
8609 u32 enable, rwlen;
8610
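/*
 * DR7 layout: bits 0-7 hold the local/global enable pair for each of the
 * four breakpoints (two bits per breakpoint), and bits 16-31 hold the R/W
 * and LEN fields (four bits per breakpoint). A breakpoint matches if it
 * is enabled, its R/W+LEN nibble equals @type, and its debug address
 * register equals @addr.
 */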
8611 enable = dr7;
8612 rwlen = dr7 >> 16;
8613 for (i = 0; i < 4; i++, enable >>= 2, rwlen >>= 4)
8614 if ((enable & 3) && (rwlen & 15) == type && db[i] == addr)
8615 dr6 |= (1 << i);
8616 return dr6;
8617}
8618
120c2c4f 8619static int kvm_vcpu_do_singlestep(struct kvm_vcpu *vcpu)
663f4c61
PB
8620{
8621 struct kvm_run *kvm_run = vcpu->run;
8622
c8401dda 8623 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) {
9a3ecd5e 8624 kvm_run->debug.arch.dr6 = DR6_BS | DR6_ACTIVE_LOW;
d5d260c5 8625 kvm_run->debug.arch.pc = kvm_get_linear_rip(vcpu);
c8401dda
PB
8626 kvm_run->debug.arch.exception = DB_VECTOR;
8627 kvm_run->exit_reason = KVM_EXIT_DEBUG;
60fc3d02 8628 return 0;
663f4c61 8629 }
120c2c4f 8630 kvm_queue_exception_p(vcpu, DB_VECTOR, DR6_BS);
60fc3d02 8631 return 1;
663f4c61
PB
8632}
8633
6affcbed
KH
8634int kvm_skip_emulated_instruction(struct kvm_vcpu *vcpu)
8635{
b3646477 8636 unsigned long rflags = static_call(kvm_x86_get_rflags)(vcpu);
f8ea7c60 8637 int r;
6affcbed 8638
b3646477 8639 r = static_call(kvm_x86_skip_emulated_instruction)(vcpu);
60fc3d02 8640 if (unlikely(!r))
f8ea7c60 8641 return 0;
c8401dda 8642
9cd803d4
EH
8643 kvm_pmu_trigger_event(vcpu, PERF_COUNT_HW_INSTRUCTIONS);
8644
c8401dda
PB
8645 /*
8646 * rflags is the old, "raw" value of the flags. The new value has
8647 * not been saved yet.
8648 *
8649 * This is correct even for TF set by the guest, because "the
8650 * processor will not generate this exception after the instruction
8651 * that sets the TF flag".
8652 */
8653 if (unlikely(rflags & X86_EFLAGS_TF))
120c2c4f 8654 r = kvm_vcpu_do_singlestep(vcpu);
60fc3d02 8655 return r;
6affcbed
KH
8656}
8657EXPORT_SYMBOL_GPL(kvm_skip_emulated_instruction);
8658
baf67ca8 8659static bool kvm_is_code_breakpoint_inhibited(struct kvm_vcpu *vcpu)
4a1e10d5 8660{
baf67ca8
SC
8661 u32 shadow;
8662
8663 if (kvm_get_rflags(vcpu) & X86_EFLAGS_RF)
8664 return true;
8665
8666 /*
8667 * Intel CPUs inhibit code #DBs when MOV/POP SS blocking is active,
8668 * but AMD CPUs do not. MOV/POP SS blocking is rare, check that first
8669 * to avoid the relatively expensive CPUID lookup.
8670 */
8671 shadow = static_call(kvm_x86_get_interrupt_shadow)(vcpu);
8672 return (shadow & KVM_X86_SHADOW_INT_MOV_SS) &&
8673 guest_cpuid_is_intel(vcpu);
8674}
8675
750f8fcb
SC
8676static bool kvm_vcpu_check_code_breakpoint(struct kvm_vcpu *vcpu,
8677 int emulation_type, int *r)
4a1e10d5 8678{
750f8fcb
SC
8679 WARN_ON_ONCE(emulation_type & EMULTYPE_NO_DECODE);
8680
8681 /*
8682 * Do not check for code breakpoints if hardware has already done the
8683 * checks, as inferred from the emulation type. On NO_DECODE and SKIP,
8684 * the instruction has passed all exception checks, and all intercepted
8685 * exceptions that trigger emulation have lower priority than code
8686 * breakpoints, i.e. the fact that the intercepted exception occurred
8687 * means any code breakpoints have already been serviced.
8688 *
8689 * Note, KVM needs to check for code #DBs on EMULTYPE_TRAP_UD_FORCED as
8690 * hardware has checked the RIP of the magic prefix, but not the RIP of
8691 * the instruction being emulated. The intent of forced emulation is
8692 * to behave as if KVM intercepted the instruction without an exception
8693 * and without a prefix.
8694 */
8695 if (emulation_type & (EMULTYPE_NO_DECODE | EMULTYPE_SKIP |
8696 EMULTYPE_TRAP_UD | EMULTYPE_VMWARE_GP | EMULTYPE_PF))
8697 return false;
8698
4a1e10d5
PB
8699 if (unlikely(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) &&
8700 (vcpu->arch.guest_debug_dr7 & DR7_BP_EN_MASK)) {
82b32774
NA
8701 struct kvm_run *kvm_run = vcpu->run;
8702 unsigned long eip = kvm_get_linear_rip(vcpu);
8703 u32 dr6 = kvm_vcpu_check_hw_bp(eip, 0,
4a1e10d5
PB
8704 vcpu->arch.guest_debug_dr7,
8705 vcpu->arch.eff_db);
8706
8707 if (dr6 != 0) {
9a3ecd5e 8708 kvm_run->debug.arch.dr6 = dr6 | DR6_ACTIVE_LOW;
82b32774 8709 kvm_run->debug.arch.pc = eip;
4a1e10d5
PB
8710 kvm_run->debug.arch.exception = DB_VECTOR;
8711 kvm_run->exit_reason = KVM_EXIT_DEBUG;
60fc3d02 8712 *r = 0;
4a1e10d5
PB
8713 return true;
8714 }
8715 }
8716
4161a569 8717 if (unlikely(vcpu->arch.dr7 & DR7_BP_EN_MASK) &&
baf67ca8 8718 !kvm_is_code_breakpoint_inhibited(vcpu)) {
82b32774
NA
8719 unsigned long eip = kvm_get_linear_rip(vcpu);
8720 u32 dr6 = kvm_vcpu_check_hw_bp(eip, 0,
4a1e10d5
PB
8721 vcpu->arch.dr7,
8722 vcpu->arch.db);
8723
8724 if (dr6 != 0) {
4d5523cf 8725 kvm_queue_exception_p(vcpu, DB_VECTOR, dr6);
60fc3d02 8726 *r = 1;
4a1e10d5
PB
8727 return true;
8728 }
8729 }
8730
8731 return false;
8732}
8733
04789b66
LA
8734static bool is_vmware_backdoor_opcode(struct x86_emulate_ctxt *ctxt)
8735{
2d7921c4
AM
8736 switch (ctxt->opcode_len) {
8737 case 1:
8738 switch (ctxt->b) {
8739 case 0xe4: /* IN */
8740 case 0xe5:
8741 case 0xec:
8742 case 0xed:
8743 case 0xe6: /* OUT */
8744 case 0xe7:
8745 case 0xee:
8746 case 0xef:
8747 case 0x6c: /* INS */
8748 case 0x6d:
8749 case 0x6e: /* OUTS */
8750 case 0x6f:
8751 return true;
8752 }
8753 break;
8754 case 2:
8755 switch (ctxt->b) {
8756 case 0x33: /* RDPMC */
8757 return true;
8758 }
8759 break;
04789b66
LA
8760 }
8761
8762 return false;
8763}
8764
4aa2691d 8765/*
fee060cd
SC
8766 * Decode an instruction for emulation. The caller is responsible for handling
8767 * code breakpoints. Note, manually detecting code breakpoints is unnecessary
8768 * (and wrong) when emulating on an intercepted fault-like exception[*], as
 8769 * code breakpoints have higher priority and thus have already been handled by
8770 * hardware.
8771 *
8772 * [*] Except #MC, which is higher priority, but KVM should never emulate in
8773 * response to a machine check.
4aa2691d
WH
8774 */
8775int x86_decode_emulated_instruction(struct kvm_vcpu *vcpu, int emulation_type,
8776 void *insn, int insn_len)
8777{
4aa2691d 8778 struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
fee060cd 8779 int r;
4aa2691d
WH
8780
8781 init_emulate_ctxt(vcpu);
8782
b35491e6 8783 r = x86_decode_insn(ctxt, insn, insn_len, emulation_type);
4aa2691d
WH
8784
8785 trace_kvm_emulate_insn_start(vcpu);
8786 ++vcpu->stat.insn_emulation;
8787
8788 return r;
8789}
8790EXPORT_SYMBOL_GPL(x86_decode_emulated_instruction);
8791
736c291c
SC
8792int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
8793 int emulation_type, void *insn, int insn_len)
bbd9b64e 8794{
95cb2295 8795 int r;
c9b8b07c 8796 struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
7ae441ea 8797 bool writeback = true;
09e3e2a1 8798
4d31d9ef 8799 if (unlikely(!kvm_can_emulate_insn(vcpu, emulation_type, insn, insn_len)))
09e3e2a1 8800 return 1;
bbd9b64e 8801
c595ceee
PB
8802 vcpu->arch.l1tf_flush_l1d = true;
8803
571008da 8804 if (!(emulation_type & EMULTYPE_NO_DECODE)) {
4aa2691d 8805 kvm_clear_exception_queue(vcpu);
4a1e10d5 8806
fee060cd
SC
8807 /*
8808 * Return immediately if RIP hits a code breakpoint, such #DBs
8809 * are fault-like and are higher priority than any faults on
8810 * the code fetch itself.
8811 */
750f8fcb 8812 if (kvm_vcpu_check_code_breakpoint(vcpu, emulation_type, &r))
fee060cd
SC
8813 return r;
8814
4aa2691d
WH
8815 r = x86_decode_emulated_instruction(vcpu, emulation_type,
8816 insn, insn_len);
1d2887e2 8817 if (r != EMULATION_OK) {
b4000606 8818 if ((emulation_type & EMULTYPE_TRAP_UD) ||
c83fad65
SC
8819 (emulation_type & EMULTYPE_TRAP_UD_FORCED)) {
8820 kvm_queue_exception(vcpu, UD_VECTOR);
60fc3d02 8821 return 1;
c83fad65 8822 }
736c291c 8823 if (reexecute_instruction(vcpu, cr2_or_gpa,
736c291c 8824 emulation_type))
60fc3d02 8825 return 1;
17122c06
SC
8826
8827 if (ctxt->have_exception &&
8828 !(emulation_type & EMULTYPE_SKIP)) {
c8848cee
JD
8829 /*
8830 * #UD should result in just EMULATION_FAILED, and trap-like
 8831 * exceptions should not be encountered during decode.
8832 */
8833 WARN_ON_ONCE(ctxt->exception.vector == UD_VECTOR ||
8834 exception_type(ctxt->exception.vector) == EXCPT_TRAP);
8530a79c 8835 inject_emulated_exception(vcpu);
60fc3d02 8836 return 1;
8530a79c 8837 }
e2366171 8838 return handle_emulation_failure(vcpu, emulation_type);
bbd9b64e
CO
8839 }
8840 }
8841
42cbf068
SC
8842 if ((emulation_type & EMULTYPE_VMWARE_GP) &&
8843 !is_vmware_backdoor_opcode(ctxt)) {
8844 kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
60fc3d02 8845 return 1;
42cbf068 8846 }
04789b66 8847
1957aa63 8848 /*
906fa904
HW
8849 * EMULTYPE_SKIP without EMULTYPE_COMPLETE_USER_EXIT is intended for
8850 * use *only* by vendor callbacks for kvm_skip_emulated_instruction().
8851 * The caller is responsible for updating interruptibility state and
8852 * injecting single-step #DBs.
1957aa63 8853 */
ba8afb6b 8854 if (emulation_type & EMULTYPE_SKIP) {
5e854864
SC
8855 if (ctxt->mode != X86EMUL_MODE_PROT64)
8856 ctxt->eip = (u32)ctxt->_eip;
8857 else
8858 ctxt->eip = ctxt->_eip;
8859
906fa904
HW
8860 if (emulation_type & EMULTYPE_COMPLETE_USER_EXIT) {
8861 r = 1;
8862 goto writeback;
8863 }
8864
5e854864 8865 kvm_rip_write(vcpu, ctxt->eip);
bb663c7a
NA
8866 if (ctxt->eflags & X86_EFLAGS_RF)
8867 kvm_set_rflags(vcpu, ctxt->eflags & ~X86_EFLAGS_RF);
60fc3d02 8868 return 1;
ba8afb6b
GN
8869 }
8870
736c291c 8871 if (retry_instruction(ctxt, cr2_or_gpa, emulation_type))
60fc3d02 8872 return 1;
1cb3f3ae 8873
 7ae441ea 8874 /* This is needed for the VMware backdoor interface to work since it
 4d2179e1 8875 changes register values during the IO operation. */
7ae441ea
GN
8876 if (vcpu->arch.emulate_regs_need_sync_from_vcpu) {
8877 vcpu->arch.emulate_regs_need_sync_from_vcpu = false;
dd856efa 8878 emulator_invalidate_register_cache(ctxt);
7ae441ea 8879 }
4d2179e1 8880
5cd21917 8881restart:
92daa48b
SC
8882 if (emulation_type & EMULTYPE_PF) {
8883 /* Save the faulting GPA (cr2) in the address field */
8884 ctxt->exception.address = cr2_or_gpa;
8885
8886 /* With shadow page tables, cr2 contains a GVA or nGPA. */
347a0d0d 8887 if (vcpu->arch.mmu->root_role.direct) {
744e699c
SC
8888 ctxt->gpa_available = true;
8889 ctxt->gpa_val = cr2_or_gpa;
92daa48b
SC
8890 }
8891 } else {
8892 /* Sanitize the address out of an abundance of paranoia. */
8893 ctxt->exception.address = 0;
8894 }
0f89b207 8895
9d74191a 8896 r = x86_emulate_insn(ctxt);
bbd9b64e 8897
775fde86 8898 if (r == EMULATION_INTERCEPTED)
60fc3d02 8899 return 1;
775fde86 8900
d2ddd1c4 8901 if (r == EMULATION_FAILED) {
258d985f 8902 if (reexecute_instruction(vcpu, cr2_or_gpa, emulation_type))
60fc3d02 8903 return 1;
c3cd7ffa 8904
e2366171 8905 return handle_emulation_failure(vcpu, emulation_type);
bbd9b64e
CO
8906 }
8907
9d74191a 8908 if (ctxt->have_exception) {
60fc3d02 8909 r = 1;
7709aba8 8910 inject_emulated_exception(vcpu);
d2ddd1c4 8911 } else if (vcpu->arch.pio.count) {
0912c977
PB
8912 if (!vcpu->arch.pio.in) {
8913 /* FIXME: return into emulator if single-stepping. */
3457e419 8914 vcpu->arch.pio.count = 0;
0912c977 8915 } else {
7ae441ea 8916 writeback = false;
716d51ab
GN
8917 vcpu->arch.complete_userspace_io = complete_emulated_pio;
8918 }
60fc3d02 8919 r = 0;
7ae441ea 8920 } else if (vcpu->mmio_needed) {
bc8a0aaf
SC
8921 ++vcpu->stat.mmio_exits;
8922
7ae441ea
GN
8923 if (!vcpu->mmio_is_write)
8924 writeback = false;
60fc3d02 8925 r = 0;
716d51ab 8926 vcpu->arch.complete_userspace_io = complete_emulated_mmio;
adbfb12d
HW
8927 } else if (vcpu->arch.complete_userspace_io) {
8928 writeback = false;
8929 r = 0;
7ae441ea 8930 } else if (r == EMULATION_RESTART)
5cd21917 8931 goto restart;
d2ddd1c4 8932 else
60fc3d02 8933 r = 1;
f850e2e6 8934
906fa904 8935writeback:
7ae441ea 8936 if (writeback) {
b3646477 8937 unsigned long rflags = static_call(kvm_x86_get_rflags)(vcpu);
9d74191a 8938 toggle_interruptibility(vcpu, ctxt->interruptibility);
7ae441ea 8939 vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
5623f751
SC
8940
8941 /*
8942 * Note, EXCPT_DB is assumed to be fault-like as the emulator
8943 * only supports code breakpoints and general detect #DB, both
8944 * of which are fault-like.
8945 */
38827dbd 8946 if (!ctxt->have_exception ||
75ee23b3 8947 exception_type(ctxt->exception.vector) == EXCPT_TRAP) {
9cd803d4 8948 kvm_pmu_trigger_event(vcpu, PERF_COUNT_HW_INSTRUCTIONS);
018d70ff
EH
8949 if (ctxt->is_branch)
8950 kvm_pmu_trigger_event(vcpu, PERF_COUNT_HW_BRANCH_INSTRUCTIONS);
75ee23b3 8951 kvm_rip_write(vcpu, ctxt->eip);
384dea1c 8952 if (r && (ctxt->tf || (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)))
120c2c4f 8953 r = kvm_vcpu_do_singlestep(vcpu);
2a890614 8954 static_call_cond(kvm_x86_update_emulated_instruction)(vcpu);
38827dbd 8955 __kvm_set_rflags(vcpu, ctxt->eflags);
75ee23b3 8956 }
6addfc42
PB
8957
8958 /*
8959 * For STI, interrupts are shadowed; so KVM_REQ_EVENT will
8960 * do nothing, and it will be requested again as soon as
8961 * the shadow expires. But we still need to check here,
8962 * because POPF has no interrupt shadow.
8963 */
8964 if (unlikely((ctxt->eflags & ~rflags) & X86_EFLAGS_IF))
8965 kvm_make_request(KVM_REQ_EVENT, vcpu);
7ae441ea
GN
8966 } else
8967 vcpu->arch.emulate_regs_need_sync_to_vcpu = true;
e85d28f8
GN
8968
8969 return r;
de7d789a 8970}
c60658d1
SC
8971
8972int kvm_emulate_instruction(struct kvm_vcpu *vcpu, int emulation_type)
8973{
8974 return x86_emulate_instruction(vcpu, 0, emulation_type, NULL, 0);
8975}
8976EXPORT_SYMBOL_GPL(kvm_emulate_instruction);
8977
8978int kvm_emulate_instruction_from_buffer(struct kvm_vcpu *vcpu,
8979 void *insn, int insn_len)
8980{
8981 return x86_emulate_instruction(vcpu, 0, 0, insn, insn_len);
8982}
8983EXPORT_SYMBOL_GPL(kvm_emulate_instruction_from_buffer);
de7d789a 8984
8764ed55
SC
8985static int complete_fast_pio_out_port_0x7e(struct kvm_vcpu *vcpu)
8986{
8987 vcpu->arch.pio.count = 0;
8988 return 1;
8989}
8990
45def77e
SC
8991static int complete_fast_pio_out(struct kvm_vcpu *vcpu)
8992{
8993 vcpu->arch.pio.count = 0;
8994
8995 if (unlikely(!kvm_is_linear_rip(vcpu, vcpu->arch.pio.linear_rip)))
8996 return 1;
8997
8998 return kvm_skip_emulated_instruction(vcpu);
8999}
9000
dca7f128
SC
9001static int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size,
9002 unsigned short port)
de7d789a 9003{
de3cd117 9004 unsigned long val = kvm_rax_read(vcpu);
2e3bb4d8
SC
9005 int ret = emulator_pio_out(vcpu, size, port, &val, 1);
9006
8764ed55
SC
9007 if (ret)
9008 return ret;
45def77e 9009
8764ed55
SC
9010 /*
 9011 * Work around userspace that relies on the old KVM behavior of %rip being
9012 * incremented prior to exiting to userspace to handle "OUT 0x7e".
9013 */
9014 if (port == 0x7e &&
9015 kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_OUT_7E_INC_RIP)) {
9016 vcpu->arch.complete_userspace_io =
9017 complete_fast_pio_out_port_0x7e;
9018 kvm_skip_emulated_instruction(vcpu);
9019 } else {
45def77e
SC
9020 vcpu->arch.pio.linear_rip = kvm_get_linear_rip(vcpu);
9021 vcpu->arch.complete_userspace_io = complete_fast_pio_out;
9022 }
8764ed55 9023 return 0;
de7d789a 9024}
de7d789a 9025
8370c3d0
TL
9026static int complete_fast_pio_in(struct kvm_vcpu *vcpu)
9027{
9028 unsigned long val;
9029
9030 /* We should only ever be called with arch.pio.count equal to 1 */
9031 BUG_ON(vcpu->arch.pio.count != 1);
9032
45def77e
SC
9033 if (unlikely(!kvm_is_linear_rip(vcpu, vcpu->arch.pio.linear_rip))) {
9034 vcpu->arch.pio.count = 0;
9035 return 1;
9036 }
9037
8370c3d0 9038 /* For size less than 4 we merge, else we zero extend */
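/*
 * i.e. an 8/16-bit IN preserves the upper bytes of RAX, while a 32-bit IN
 * zero-extends into the full register, as on hardware.
 */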
de3cd117 9039 val = (vcpu->arch.pio.size < 4) ? kvm_rax_read(vcpu) : 0;
8370c3d0 9040
7a6177d6 9041 complete_emulator_pio_in(vcpu, &val);
de3cd117 9042 kvm_rax_write(vcpu, val);
8370c3d0 9043
45def77e 9044 return kvm_skip_emulated_instruction(vcpu);
8370c3d0
TL
9045}
9046
dca7f128
SC
9047static int kvm_fast_pio_in(struct kvm_vcpu *vcpu, int size,
9048 unsigned short port)
8370c3d0
TL
9049{
9050 unsigned long val;
9051 int ret;
9052
9053 /* For size less than 4 we merge, else we zero extend */
de3cd117 9054 val = (size < 4) ? kvm_rax_read(vcpu) : 0;
8370c3d0 9055
2e3bb4d8 9056 ret = emulator_pio_in(vcpu, size, port, &val, 1);
8370c3d0 9057 if (ret) {
de3cd117 9058 kvm_rax_write(vcpu, val);
8370c3d0
TL
9059 return ret;
9060 }
9061
45def77e 9062 vcpu->arch.pio.linear_rip = kvm_get_linear_rip(vcpu);
8370c3d0
TL
9063 vcpu->arch.complete_userspace_io = complete_fast_pio_in;
9064
9065 return 0;
9066}
dca7f128
SC
9067
9068int kvm_fast_pio(struct kvm_vcpu *vcpu, int size, unsigned short port, int in)
9069{
45def77e 9070 int ret;
dca7f128 9071
dca7f128 9072 if (in)
45def77e 9073 ret = kvm_fast_pio_in(vcpu, size, port);
dca7f128 9074 else
45def77e
SC
9075 ret = kvm_fast_pio_out(vcpu, size, port);
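/*
 * A non-zero ret means the I/O completed in-kernel, so skip the
 * instruction now; on a userspace exit (ret == 0) the completion
 * callback takes care of skipping it instead.
 */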
9076 return ret && kvm_skip_emulated_instruction(vcpu);
dca7f128
SC
9077}
9078EXPORT_SYMBOL_GPL(kvm_fast_pio);
8370c3d0 9079
251a5fd6 9080static int kvmclock_cpu_down_prep(unsigned int cpu)
8cfdc000 9081{
0a3aee0d 9082 __this_cpu_write(cpu_tsc_khz, 0);
251a5fd6 9083 return 0;
8cfdc000
ZA
9084}
9085
9086static void tsc_khz_changed(void *data)
c8076604 9087{
8cfdc000
ZA
9088 struct cpufreq_freqs *freq = data;
9089 unsigned long khz = 0;
9090
3ebcbd22
AR
9091 WARN_ON_ONCE(boot_cpu_has(X86_FEATURE_CONSTANT_TSC));
9092
8cfdc000
ZA
9093 if (data)
9094 khz = freq->new;
3ebcbd22 9095 else
8cfdc000
ZA
9096 khz = cpufreq_quick_get(raw_smp_processor_id());
9097 if (!khz)
9098 khz = tsc_khz;
0a3aee0d 9099 __this_cpu_write(cpu_tsc_khz, khz);
c8076604
GH
9100}
9101
5fa4ec9c 9102#ifdef CONFIG_X86_64
0092e434
VK
9103static void kvm_hyperv_tsc_notifier(void)
9104{
0092e434 9105 struct kvm *kvm;
0092e434
VK
9106 int cpu;
9107
0d9ce162 9108 mutex_lock(&kvm_lock);
0092e434
VK
9109 list_for_each_entry(kvm, &vm_list, vm_list)
9110 kvm_make_mclock_inprogress_request(kvm);
9111
6b6fcd28 9112 /* no guest entries from this point */
0092e434
VK
9113 hyperv_stop_tsc_emulation();
9114
9115 /* TSC frequency always matches when on Hyper-V */
3ebcbd22
AR
9116 if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
9117 for_each_present_cpu(cpu)
9118 per_cpu(cpu_tsc_khz, cpu) = tsc_khz;
9119 }
938c8745 9120 kvm_caps.max_guest_tsc_khz = tsc_khz;
0092e434
VK
9121
9122 list_for_each_entry(kvm, &vm_list, vm_list) {
869b4421 9123 __kvm_start_pvclock_update(kvm);
0092e434 9124 pvclock_update_vm_gtod_copy(kvm);
6b6fcd28 9125 kvm_end_pvclock_update(kvm);
0092e434 9126 }
6b6fcd28 9127
0d9ce162 9128 mutex_unlock(&kvm_lock);
0092e434 9129}
5fa4ec9c 9130#endif
0092e434 9131
df24014a 9132static void __kvmclock_cpufreq_notifier(struct cpufreq_freqs *freq, int cpu)
c8076604 9133{
c8076604
GH
9134 struct kvm *kvm;
9135 struct kvm_vcpu *vcpu;
46808a4c
MZ
9136 int send_ipi = 0;
9137 unsigned long i;
c8076604 9138
8cfdc000
ZA
9139 /*
9140 * We allow guests to temporarily run on slowing clocks,
9141 * provided we notify them after, or to run on accelerating
9142 * clocks, provided we notify them before. Thus time never
9143 * goes backwards.
9144 *
9145 * However, we have a problem. We can't atomically update
9146 * the frequency of a given CPU from this function; it is
9147 * merely a notifier, which can be called from any CPU.
9148 * Changing the TSC frequency at arbitrary points in time
9149 * requires a recomputation of local variables related to
9150 * the TSC for each VCPU. We must flag these local variables
9151 * to be updated and be sure the update takes place with the
9152 * new frequency before any guests proceed.
9153 *
9154 * Unfortunately, the combination of hotplug CPU and frequency
9155 * change creates an intractable locking scenario; the order
9156 * of when these callouts happen is undefined with respect to
9157 * CPU hotplug, and they can race with each other. As such,
9158 * merely setting per_cpu(cpu_tsc_khz) = X during a hotadd is
9159 * undefined; you can actually have a CPU frequency change take
9160 * place in between the computation of X and the setting of the
9161 * variable. To protect against this problem, all updates of
9162 * the per_cpu tsc_khz variable are done in an interrupt
9163 * protected IPI, and all callers wishing to update the value
9164 * must wait for a synchronous IPI to complete (which is trivial
9165 * if the caller is on the CPU already). This establishes the
9166 * necessary total order on variable updates.
9167 *
9168 * Note that because a guest time update may take place
9169 * anytime after the setting of the VCPU's request bit, the
9170 * correct TSC value must be set before the request. However,
9171 * to ensure the update actually makes it to any guest which
9172 * starts running in hardware virtualization between the set
9173 * and the acquisition of the spinlock, we must also ping the
9174 * CPU after setting the request bit.
9175 *
9176 */
9177
df24014a 9178 smp_call_function_single(cpu, tsc_khz_changed, freq, 1);
c8076604 9179
0d9ce162 9180 mutex_lock(&kvm_lock);
c8076604 9181 list_for_each_entry(kvm, &vm_list, vm_list) {
988a2cae 9182 kvm_for_each_vcpu(i, vcpu, kvm) {
df24014a 9183 if (vcpu->cpu != cpu)
c8076604 9184 continue;
c285545f 9185 kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
0d9ce162 9186 if (vcpu->cpu != raw_smp_processor_id())
8cfdc000 9187 send_ipi = 1;
c8076604
GH
9188 }
9189 }
0d9ce162 9190 mutex_unlock(&kvm_lock);
c8076604
GH
9191
9192 if (freq->old < freq->new && send_ipi) {
9193 /*
 9194 * We upscale the frequency. We must make sure the guest
 9195 * doesn't see old kvmclock values while running with
 9196 * the new frequency, otherwise we risk the guest seeing
9197 * time go backwards.
9198 *
9199 * In case we update the frequency for another cpu
9200 * (which might be in guest context) send an interrupt
9201 * to kick the cpu out of guest context. Next time
9202 * guest context is entered kvmclock will be updated,
9203 * so the guest will not see stale values.
9204 */
df24014a 9205 smp_call_function_single(cpu, tsc_khz_changed, freq, 1);
c8076604 9206 }
df24014a
VK
9207}
9208
9209static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
9210 void *data)
9211{
9212 struct cpufreq_freqs *freq = data;
9213 int cpu;
9214
9215 if (val == CPUFREQ_PRECHANGE && freq->old > freq->new)
9216 return 0;
9217 if (val == CPUFREQ_POSTCHANGE && freq->old < freq->new)
9218 return 0;
9219
9220 for_each_cpu(cpu, freq->policy->cpus)
9221 __kvmclock_cpufreq_notifier(freq, cpu);
9222
c8076604
GH
9223 return 0;
9224}
9225
9226static struct notifier_block kvmclock_cpufreq_notifier_block = {
8cfdc000
ZA
9227 .notifier_call = kvmclock_cpufreq_notifier
9228};
9229
251a5fd6 9230static int kvmclock_cpu_online(unsigned int cpu)
8cfdc000 9231{
251a5fd6
SAS
9232 tsc_khz_changed(NULL);
9233 return 0;
8cfdc000
ZA
9234}
9235
b820cc0c
ZA
9236static void kvm_timer_init(void)
9237{
b820cc0c 9238 if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
741e511b
SC
9239 max_tsc_khz = tsc_khz;
9240
9241 if (IS_ENABLED(CONFIG_CPU_FREQ)) {
9242 struct cpufreq_policy *policy;
9243 int cpu;
9244
9245 cpu = get_cpu();
9246 policy = cpufreq_cpu_get(cpu);
9247 if (policy) {
9248 if (policy->cpuinfo.max_freq)
9249 max_tsc_khz = policy->cpuinfo.max_freq;
9250 cpufreq_cpu_put(policy);
9251 }
9252 put_cpu();
9a11997e 9253 }
b820cc0c
ZA
9254 cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block,
9255 CPUFREQ_TRANSITION_NOTIFIER);
460dd42e 9256
3ebcbd22
AR
9257 cpuhp_setup_state(CPUHP_AP_X86_KVM_CLK_ONLINE, "x86/kvm/clk:online",
9258 kvmclock_cpu_online, kvmclock_cpu_down_prep);
9259 }
b820cc0c
ZA
9260}
9261
16e8d74d
MT
9262#ifdef CONFIG_X86_64
9263static void pvclock_gtod_update_fn(struct work_struct *work)
9264{
d828199e 9265 struct kvm *kvm;
d828199e 9266 struct kvm_vcpu *vcpu;
46808a4c 9267 unsigned long i;
d828199e 9268
0d9ce162 9269 mutex_lock(&kvm_lock);
d828199e
MT
9270 list_for_each_entry(kvm, &vm_list, vm_list)
9271 kvm_for_each_vcpu(i, vcpu, kvm)
105b21bb 9272 kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
d828199e 9273 atomic_set(&kvm_guest_has_master_clock, 0);
0d9ce162 9274 mutex_unlock(&kvm_lock);
16e8d74d
MT
9275}
9276
9277static DECLARE_WORK(pvclock_gtod_work, pvclock_gtod_update_fn);
9278
3f804f6d
TG
9279/*
9280 * Indirection to move queue_work() out of the tk_core.seq write held
9281 * region to prevent possible deadlocks against time accessors which
9282 * are invoked with work related locks held.
9283 */
9284static void pvclock_irq_work_fn(struct irq_work *w)
9285{
9286 queue_work(system_long_wq, &pvclock_gtod_work);
9287}
9288
9289static DEFINE_IRQ_WORK(pvclock_irq_work, pvclock_irq_work_fn);
9290
16e8d74d
MT
9291/*
9292 * Notification about pvclock gtod data update.
9293 */
9294static int pvclock_gtod_notify(struct notifier_block *nb, unsigned long unused,
9295 void *priv)
9296{
9297 struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
9298 struct timekeeper *tk = priv;
9299
9300 update_pvclock_gtod(tk);
9301
3f804f6d
TG
9302 /*
9303 * Disable master clock if host does not trust, or does not use,
9304 * TSC based clocksource. Delegate queue_work() to irq_work as
9305 * this is invoked with tk_core.seq write held.
16e8d74d 9306 */
b0c39dc6 9307 if (!gtod_is_based_on_tsc(gtod->clock.vclock_mode) &&
16e8d74d 9308 atomic_read(&kvm_guest_has_master_clock) != 0)
3f804f6d 9309 irq_work_queue(&pvclock_irq_work);
16e8d74d
MT
9310 return 0;
9311}
9312
9313static struct notifier_block pvclock_gtod_notifier = {
9314 .notifier_call = pvclock_gtod_notify,
9315};
9316#endif
9317
b7483387
SC
9318static inline void kvm_ops_update(struct kvm_x86_init_ops *ops)
9319{
9320 memcpy(&kvm_x86_ops, ops->runtime_ops, sizeof(kvm_x86_ops));
9321
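/*
 * Wire up a static_call for every kvm_x86_ops hook: mandatory hooks WARN
 * if the vendor module left them NULL, optional hooks may be NULL, and
 * OPTIONAL_RET0 hooks fall back to a stub that returns 0.
 */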
9322#define __KVM_X86_OP(func) \
9323 static_call_update(kvm_x86_##func, kvm_x86_ops.func);
9324#define KVM_X86_OP(func) \
9325 WARN_ON(!kvm_x86_ops.func); __KVM_X86_OP(func)
9326#define KVM_X86_OP_OPTIONAL __KVM_X86_OP
9327#define KVM_X86_OP_OPTIONAL_RET0(func) \
9328 static_call_update(kvm_x86_##func, (void *)kvm_x86_ops.func ? : \
9329 (void *)__static_call_return0);
9330#include <asm/kvm-x86-ops.h>
9331#undef __KVM_X86_OP
9332
9333 kvm_pmu_ops_update(ops->pmu_ops);
9334}
9335
d83420c2 9336static int kvm_x86_check_processor_compatibility(void)
3045c483 9337{
e4aa7f88
CG
9338 int cpu = smp_processor_id();
9339 struct cpuinfo_x86 *c = &cpu_data(cpu);
9340
9341 /*
9342 * Compatibility checks are done when loading KVM and when enabling
9343 * hardware, e.g. during CPU hotplug, to ensure all online CPUs are
9344 * compatible, i.e. KVM should never perform a compatibility check on
9345 * an offline CPU.
9346 */
9347 WARN_ON(!cpu_online(cpu));
3045c483 9348
3045c483
SC
9349 if (__cr4_reserved_bits(cpu_has, c) !=
9350 __cr4_reserved_bits(cpu_has, &boot_cpu_data))
9351 return -EIO;
9352
d83420c2 9353 return static_call(kvm_x86_check_processor_compatibility)();
3045c483
SC
9354}
9355
d83420c2 9356static void kvm_x86_check_cpu_compat(void *ret)
3045c483 9357{
d83420c2 9358 *(int *)ret = kvm_x86_check_processor_compatibility();
3045c483
SC
9359}
9360
3af4a9e6 9361static int __kvm_x86_vendor_init(struct kvm_x86_init_ops *ops)
043405e1 9362{
94bda2f4 9363 u64 host_pat;
3045c483 9364 int r, cpu;
f8c16bba 9365
afaf0b2f 9366 if (kvm_x86_ops.hardware_enable) {
8d20bd63 9367 pr_err("already loaded vendor module '%s'\n", kvm_x86_ops.name);
82ffad2d 9368 return -EEXIST;
f8c16bba
ZX
9369 }
9370
b666a4b6
MO
9371 /*
9372 * KVM explicitly assumes that the guest has an FPU and
9373 * FXSAVE/FXRSTOR. For example, the KVM_GET_FPU explicitly casts the
9374 * vCPU's FPU state as a fxregs_state struct.
9375 */
9376 if (!boot_cpu_has(X86_FEATURE_FPU) || !boot_cpu_has(X86_FEATURE_FXSR)) {
8d20bd63 9377 pr_err("inadequate fpu\n");
82ffad2d 9378 return -EOPNOTSUPP;
b666a4b6
MO
9379 }
9380
5e17b2ee
TG
9381 if (IS_ENABLED(CONFIG_PREEMPT_RT) && !boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
9382 pr_err("RT requires X86_FEATURE_CONSTANT_TSC\n");
82ffad2d 9383 return -EOPNOTSUPP;
b666a4b6
MO
9384 }
9385
94bda2f4
SC
9386 /*
9387 * KVM assumes that PAT entry '0' encodes WB memtype and simply zeroes
9388 * the PAT bits in SPTEs. Bail if PAT[0] is programmed to something
9389 * other than WB. Note, EPT doesn't utilize the PAT, but don't bother
9390 * with an exception. PAT[0] is set to WB on RESET and also by the
9391 * kernel, i.e. failure indicates a kernel bug or broken firmware.
9392 */
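/* PAT entry 0 occupies bits 2:0 of MSR_IA32_CR_PAT; encoding 6 is WB. */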
9393 if (rdmsrl_safe(MSR_IA32_CR_PAT, &host_pat) ||
9394 (host_pat & GENMASK(2, 0)) != 6) {
8d20bd63 9395 pr_err("host PAT[0] is not WB\n");
82ffad2d 9396 return -EIO;
94bda2f4 9397 }
b666a4b6 9398
c9b8b07c
SC
9399 x86_emulator_cache = kvm_alloc_emulator_cache();
9400 if (!x86_emulator_cache) {
8d20bd63 9401 pr_err("failed to allocate cache for x86 emulator\n");
82ffad2d 9402 return -ENOMEM;
c9b8b07c
SC
9403 }
9404
7e34fbd0
SC
9405 user_return_msrs = alloc_percpu(struct kvm_user_return_msrs);
9406 if (!user_return_msrs) {
8d20bd63 9407 pr_err("failed to allocate percpu kvm_user_return_msrs\n");
82ffad2d 9408 r = -ENOMEM;
c9b8b07c 9409 goto out_free_x86_emulator_cache;
013f6a5d 9410 }
e5fda4bb 9411 kvm_nr_uret_msrs = 0;
013f6a5d 9412
1d0e8480 9413 r = kvm_mmu_vendor_module_init();
97db56ce 9414 if (r)
013f6a5d 9415 goto out_free_percpu;
97db56ce 9416
cfc48181 9417 if (boot_cpu_has(X86_FEATURE_XSAVE)) {
2acf923e 9418 host_xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
938c8745 9419 kvm_caps.supported_xcr0 = host_xcr0 & KVM_SUPPORTED_XCR0;
cfc48181 9420 }
2acf923e 9421
b7483387
SC
9422 rdmsrl_safe(MSR_EFER, &host_efer);
9423
9424 if (boot_cpu_has(X86_FEATURE_XSAVES))
9425 rdmsrl(MSR_IA32_XSS, host_xss);
9426
8911ce66 9427 kvm_init_pmu_capability(ops->pmu_ops);
b7483387
SC
9428
9429 r = ops->hardware_setup();
9430 if (r != 0)
9431 goto out_mmu_exit;
9432
d83420c2
SC
9433 kvm_ops_update(ops);
9434
3045c483 9435 for_each_online_cpu(cpu) {
d83420c2 9436 smp_call_function_single(cpu, kvm_x86_check_cpu_compat, &r, 1);
3045c483 9437 if (r < 0)
d83420c2 9438 goto out_unwind_ops;
3045c483
SC
9439 }
9440
b7483387
SC
9441 /*
9442 * Point of no return! DO NOT add error paths below this point unless
9443 * absolutely necessary, as most operations from this point forward
9444 * require unwinding.
9445 */
1935542a
SC
9446 kvm_timer_init();
9447
0c5f81da 9448 if (pi_inject_timer == -1)
04d4e665 9449 pi_inject_timer = housekeeping_enabled(HK_TYPE_TIMER);
16e8d74d
MT
9450#ifdef CONFIG_X86_64
9451 pvclock_gtod_register_notifier(&pvclock_gtod_notifier);
0092e434 9452
5fa4ec9c 9453 if (hypervisor_is_type(X86_HYPER_MS_HYPERV))
0092e434 9454 set_hv_tscchange_cb(kvm_hyperv_tsc_notifier);
16e8d74d
MT
9455#endif
9456
b7483387
SC
9457 kvm_register_perf_callbacks(ops->handle_intel_pt_intr);
9458
9459 if (!kvm_cpu_cap_has(X86_FEATURE_XSAVES))
9460 kvm_caps.supported_xss = 0;
9461
9462#define __kvm_cpu_cap_has(UNUSED_, f) kvm_cpu_cap_has(f)
9463 cr4_reserved_bits = __cr4_reserved_bits(__kvm_cpu_cap_has, UNUSED_);
9464#undef __kvm_cpu_cap_has
9465
9466 if (kvm_caps.has_tsc_control) {
9467 /*
9468 * Make sure the user can only configure tsc_khz values that
9469 * fit into a signed integer.
9470 * A min value is not calculated because it will always
9471 * be 1 on all machines.
9472 */
9473 u64 max = min(0x7fffffffULL,
9474 __scale_tsc(kvm_caps.max_tsc_scaling_ratio, tsc_khz));
9475 kvm_caps.max_guest_tsc_khz = max;
9476 }
9477 kvm_caps.default_tsc_scaling_ratio = 1ULL << kvm_caps.tsc_scaling_ratio_frac_bits;
9478 kvm_init_msr_list();
f8c16bba 9479 return 0;
56c6d28a 9480
d83420c2
SC
9481out_unwind_ops:
9482 kvm_x86_ops.hardware_enable = NULL;
9483 static_call(kvm_x86_hardware_unsetup)();
b7483387
SC
9484out_mmu_exit:
9485 kvm_mmu_vendor_module_exit();
013f6a5d 9486out_free_percpu:
7e34fbd0 9487 free_percpu(user_return_msrs);
c9b8b07c
SC
9488out_free_x86_emulator_cache:
9489 kmem_cache_destroy(x86_emulator_cache);
56c6d28a 9490 return r;
043405e1 9491}
8776e519 9492
3af4a9e6
SC
9493int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops)
9494{
9495 int r;
9496
9497 mutex_lock(&vendor_module_lock);
9498 r = __kvm_x86_vendor_init(ops);
9499 mutex_unlock(&vendor_module_lock);
9500
9501 return r;
9502}
4f8396b9 9503EXPORT_SYMBOL_GPL(kvm_x86_vendor_init);
8776e519 9504
4f8396b9 9505void kvm_x86_vendor_exit(void)
f8c16bba 9506{
b7483387
SC
9507 kvm_unregister_perf_callbacks();
9508
0092e434 9509#ifdef CONFIG_X86_64
5fa4ec9c 9510 if (hypervisor_is_type(X86_HYPER_MS_HYPERV))
0092e434
VK
9511 clear_hv_tscchange_cb();
9512#endif
cef84c30 9513 kvm_lapic_exit();
ff9d07a0 9514
3ebcbd22 9515 if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
888d256e
JK
9516 cpufreq_unregister_notifier(&kvmclock_cpufreq_notifier_block,
9517 CPUFREQ_TRANSITION_NOTIFIER);
3ebcbd22
AR
9518 cpuhp_remove_state_nocalls(CPUHP_AP_X86_KVM_CLK_ONLINE);
9519 }
16e8d74d
MT
9520#ifdef CONFIG_X86_64
9521 pvclock_gtod_unregister_notifier(&pvclock_gtod_notifier);
3f804f6d 9522 irq_work_sync(&pvclock_irq_work);
594b27e6 9523 cancel_work_sync(&pvclock_gtod_work);
16e8d74d 9524#endif
b7483387 9525 static_call(kvm_x86_hardware_unsetup)();
1d0e8480 9526 kvm_mmu_vendor_module_exit();
7e34fbd0 9527 free_percpu(user_return_msrs);
dfdc0a71 9528 kmem_cache_destroy(x86_emulator_cache);
b59b153d 9529#ifdef CONFIG_KVM_XEN
c462f859 9530 static_key_deferred_flush(&kvm_xen_enabled);
7d6bbebb 9531 WARN_ON(static_branch_unlikely(&kvm_xen_enabled.key));
b59b153d 9532#endif
3af4a9e6
SC
9533 mutex_lock(&vendor_module_lock);
9534 kvm_x86_ops.hardware_enable = NULL;
9535 mutex_unlock(&vendor_module_lock);
56c6d28a 9536}
4f8396b9 9537EXPORT_SYMBOL_GPL(kvm_x86_vendor_exit);
f8c16bba 9538
1460179d 9539static int __kvm_emulate_halt(struct kvm_vcpu *vcpu, int state, int reason)
8776e519 9540{
91b99ea7
SC
9541 /*
9542 * The vCPU has halted, e.g. executed HLT. Update the run state if the
9543 * local APIC is in-kernel, the run loop will detect the non-runnable
9544 * state and halt the vCPU. Exit to userspace if the local APIC is
9545 * managed by userspace, in which case userspace is responsible for
9546 * handling wake events.
9547 */
8776e519 9548 ++vcpu->stat.halt_exits;
35754c98 9549 if (lapic_in_kernel(vcpu)) {
647daca2 9550 vcpu->arch.mp_state = state;
8776e519
HB
9551 return 1;
9552 } else {
647daca2 9553 vcpu->run->exit_reason = reason;
8776e519
HB
9554 return 0;
9555 }
9556}
647daca2 9557
1460179d 9558int kvm_emulate_halt_noskip(struct kvm_vcpu *vcpu)
647daca2 9559{
1460179d 9560 return __kvm_emulate_halt(vcpu, KVM_MP_STATE_HALTED, KVM_EXIT_HLT);
647daca2 9561}
1460179d 9562EXPORT_SYMBOL_GPL(kvm_emulate_halt_noskip);
5cb56059
JS
9563
9564int kvm_emulate_halt(struct kvm_vcpu *vcpu)
9565{
6affcbed
KH
9566 int ret = kvm_skip_emulated_instruction(vcpu);
9567 /*
9568 * TODO: we might be squashing a GUESTDBG_SINGLESTEP-triggered
9569 * KVM_EXIT_DEBUG here.
9570 */
1460179d 9571 return kvm_emulate_halt_noskip(vcpu) && ret;
5cb56059 9572}
8776e519
HB
9573EXPORT_SYMBOL_GPL(kvm_emulate_halt);
9574
647daca2
TL
9575int kvm_emulate_ap_reset_hold(struct kvm_vcpu *vcpu)
9576{
9577 int ret = kvm_skip_emulated_instruction(vcpu);
9578
1460179d
SC
9579 return __kvm_emulate_halt(vcpu, KVM_MP_STATE_AP_RESET_HOLD,
9580 KVM_EXIT_AP_RESET_HOLD) && ret;
647daca2
TL
9581}
9582EXPORT_SYMBOL_GPL(kvm_emulate_ap_reset_hold);
9583
8ef81a9a 9584#ifdef CONFIG_X86_64
55dd00a7
MT
9585static int kvm_pv_clock_pairing(struct kvm_vcpu *vcpu, gpa_t paddr,
9586 unsigned long clock_type)
9587{
9588 struct kvm_clock_pairing clock_pairing;
899a31f5 9589 struct timespec64 ts;
80fbd89c 9590 u64 cycle;
55dd00a7
MT
9591 int ret;
9592
9593 if (clock_type != KVM_CLOCK_PAIRING_WALLCLOCK)
9594 return -KVM_EOPNOTSUPP;
9595
3a55f729
AR
9596 /*
 9597 * When the TSC is in permanent catchup mode, guests won't be able to use
 9598 * the pvclock_read_retry loop to get a consistent view of pvclock.
9599 */
9600 if (vcpu->arch.tsc_always_catchup)
9601 return -KVM_EOPNOTSUPP;
9602
7ca7f3b9 9603 if (!kvm_get_walltime_and_clockread(&ts, &cycle))
55dd00a7
MT
9604 return -KVM_EOPNOTSUPP;
9605
9606 clock_pairing.sec = ts.tv_sec;
9607 clock_pairing.nsec = ts.tv_nsec;
9608 clock_pairing.tsc = kvm_read_l1_tsc(vcpu, cycle);
9609 clock_pairing.flags = 0;
bcbfbd8e 9610 memset(&clock_pairing.pad, 0, sizeof(clock_pairing.pad));
55dd00a7
MT
9611
9612 ret = 0;
9613 if (kvm_write_guest(vcpu->kvm, paddr, &clock_pairing,
9614 sizeof(struct kvm_clock_pairing)))
9615 ret = -KVM_EFAULT;
9616
9617 return ret;
9618}
8ef81a9a 9619#endif
55dd00a7 9620
6aef266c
SV
9621/*
9622 * kvm_pv_kick_cpu_op: Kick a vcpu.
9623 *
9624 * @apicid - apicid of vcpu to be kicked.
9625 */
9d68c6f6 9626static void kvm_pv_kick_cpu_op(struct kvm *kvm, int apicid)
6aef266c 9627{
8a414f94
VK
9628 /*
9629 * All other fields are unused for APIC_DM_REMRD, but may be consumed by
9630 * common code, e.g. for tracing. Defer initialization to the compiler.
9631 */
9632 struct kvm_lapic_irq lapic_irq = {
9633 .delivery_mode = APIC_DM_REMRD,
9634 .dest_mode = APIC_DEST_PHYSICAL,
9635 .shorthand = APIC_DEST_NOSHORT,
9636 .dest_id = apicid,
9637 };
6aef266c 9638
795a149e 9639 kvm_irq_delivery_to_apic(kvm, NULL, &lapic_irq, NULL);
6aef266c
SV
9640}
9641
4e19c36f
SS
9642bool kvm_apicv_activated(struct kvm *kvm)
9643{
9644 return (READ_ONCE(kvm->arch.apicv_inhibit_reasons) == 0);
9645}
9646EXPORT_SYMBOL_GPL(kvm_apicv_activated);
9647
d5fa597e
ML
9648bool kvm_vcpu_apicv_activated(struct kvm_vcpu *vcpu)
9649{
9650 ulong vm_reasons = READ_ONCE(vcpu->kvm->arch.apicv_inhibit_reasons);
9651 ulong vcpu_reasons = static_call(kvm_x86_vcpu_get_apicv_inhibit_reasons)(vcpu);
9652
9653 return (vm_reasons | vcpu_reasons) == 0;
9654}
9655EXPORT_SYMBOL_GPL(kvm_vcpu_apicv_activated);
4f4c4a3e
SC
9656
9657static void set_or_clear_apicv_inhibit(unsigned long *inhibits,
9658 enum kvm_apicv_inhibit reason, bool set)
9659{
9660 if (set)
9661 __set_bit(reason, inhibits);
9662 else
9663 __clear_bit(reason, inhibits);
9664
9665 trace_kvm_apicv_inhibit_changed(reason, set, *inhibits);
9666}
9667
4651fc56 9668static void kvm_apicv_init(struct kvm *kvm)
4e19c36f 9669{
4f4c4a3e
SC
9670 unsigned long *inhibits = &kvm->arch.apicv_inhibit_reasons;
9671
187c8833 9672 init_rwsem(&kvm->arch.apicv_update_lock);
b0a1637f 9673
4f4c4a3e
SC
9674 set_or_clear_apicv_inhibit(inhibits, APICV_INHIBIT_REASON_ABSENT, true);
9675
ef8b4b72 9676 if (!enable_apicv)
4f4c4a3e 9677 set_or_clear_apicv_inhibit(inhibits,
80f0497c 9678 APICV_INHIBIT_REASON_DISABLE, true);
4e19c36f 9679}
4e19c36f 9680
4a7132ef 9681static void kvm_sched_yield(struct kvm_vcpu *vcpu, unsigned long dest_id)
71506297
WL
9682{
9683 struct kvm_vcpu *target = NULL;
9684 struct kvm_apic_map *map;
9685
4a7132ef
WL
9686 vcpu->stat.directed_yield_attempted++;
9687
72b268a8
WL
9688 if (single_task_running())
9689 goto no_yield;
9690
71506297 9691 rcu_read_lock();
4a7132ef 9692 map = rcu_dereference(vcpu->kvm->arch.apic_map);
71506297
WL
9693
9694 if (likely(map) && dest_id <= map->max_apic_id && map->phys_map[dest_id])
9695 target = map->phys_map[dest_id]->vcpu;
9696
9697 rcu_read_unlock();
9698
4a7132ef
WL
9699 if (!target || !READ_ONCE(target->ready))
9700 goto no_yield;
9701
a1fa4cbd
WL
9702 /* Ignore requests to yield to self */
9703 if (vcpu == target)
9704 goto no_yield;
9705
4a7132ef
WL
9706 if (kvm_vcpu_yield_to(target) <= 0)
9707 goto no_yield;
9708
9709 vcpu->stat.directed_yield_successful++;
9710
9711no_yield:
9712 return;
71506297
WL
9713}
9714
0dbb1123
AK
9715static int complete_hypercall_exit(struct kvm_vcpu *vcpu)
9716{
9717 u64 ret = vcpu->run->hypercall.ret;
9718
9719 if (!is_64_bit_mode(vcpu))
9720 ret = (u32)ret;
9721 kvm_rax_write(vcpu, ret);
9722 ++vcpu->stat.hypercalls;
9723 return kvm_skip_emulated_instruction(vcpu);
9724}
9725
8776e519
HB
9726int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
9727{
9728 unsigned long nr, a0, a1, a2, a3, ret;
6356ee0c 9729 int op_64_bit;
8776e519 9730
23200b7a
JM
9731 if (kvm_xen_hypercall_enabled(vcpu->kvm))
9732 return kvm_xen_hypercall(vcpu);
9733
8f014550 9734 if (kvm_hv_hypercall_enabled(vcpu))
696ca779 9735 return kvm_hv_hypercall(vcpu);
55cd8e5a 9736
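/*
 * KVM hypercall ABI: the hypercall number is passed in RAX and up to four
 * arguments in RBX, RCX, RDX and RSI; the return value is written back to
 * RAX below. For non-64-bit callers both the inputs and the result are
 * truncated to 32 bits.
 */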
de3cd117
SC
9737 nr = kvm_rax_read(vcpu);
9738 a0 = kvm_rbx_read(vcpu);
9739 a1 = kvm_rcx_read(vcpu);
9740 a2 = kvm_rdx_read(vcpu);
9741 a3 = kvm_rsi_read(vcpu);
8776e519 9742
229456fc 9743 trace_kvm_hypercall(nr, a0, a1, a2, a3);
2714d1d3 9744
b5aead00 9745 op_64_bit = is_64_bit_hypercall(vcpu);
a449c7aa 9746 if (!op_64_bit) {
8776e519
HB
9747 nr &= 0xFFFFFFFF;
9748 a0 &= 0xFFFFFFFF;
9749 a1 &= 0xFFFFFFFF;
9750 a2 &= 0xFFFFFFFF;
9751 a3 &= 0xFFFFFFFF;
9752 }
9753
b3646477 9754 if (static_call(kvm_x86_get_cpl)(vcpu) != 0) {
07708c4a 9755 ret = -KVM_EPERM;
696ca779 9756 goto out;
07708c4a
JK
9757 }
9758
66570e96
OU
9759 ret = -KVM_ENOSYS;
9760
8776e519 9761 switch (nr) {
b93463aa
AK
9762 case KVM_HC_VAPIC_POLL_IRQ:
9763 ret = 0;
9764 break;
6aef266c 9765 case KVM_HC_KICK_CPU:
66570e96
OU
9766 if (!guest_pv_has(vcpu, KVM_FEATURE_PV_UNHALT))
9767 break;
9768
9d68c6f6 9769 kvm_pv_kick_cpu_op(vcpu->kvm, a1);
4a7132ef 9770 kvm_sched_yield(vcpu, a1);
6aef266c
SV
9771 ret = 0;
9772 break;
8ef81a9a 9773#ifdef CONFIG_X86_64
55dd00a7
MT
9774 case KVM_HC_CLOCK_PAIRING:
9775 ret = kvm_pv_clock_pairing(vcpu, a0, a1);
9776 break;
1ed199a4 9777#endif
4180bf1b 9778 case KVM_HC_SEND_IPI:
66570e96
OU
9779 if (!guest_pv_has(vcpu, KVM_FEATURE_PV_SEND_IPI))
9780 break;
9781
4180bf1b
WL
9782 ret = kvm_pv_send_ipi(vcpu->kvm, a0, a1, a2, a3, op_64_bit);
9783 break;
71506297 9784 case KVM_HC_SCHED_YIELD:
66570e96
OU
9785 if (!guest_pv_has(vcpu, KVM_FEATURE_PV_SCHED_YIELD))
9786 break;
9787
4a7132ef 9788 kvm_sched_yield(vcpu, a0);
71506297
WL
9789 ret = 0;
9790 break;
0dbb1123
AK
9791 case KVM_HC_MAP_GPA_RANGE: {
9792 u64 gpa = a0, npages = a1, attrs = a2;
9793
9794 ret = -KVM_ENOSYS;
9795 if (!(vcpu->kvm->arch.hypercall_exit_enabled & (1 << KVM_HC_MAP_GPA_RANGE)))
9796 break;
9797
9798 if (!PAGE_ALIGNED(gpa) || !npages ||
9799 gpa_to_gfn(gpa) + npages <= gpa_to_gfn(gpa)) {
9800 ret = -KVM_EINVAL;
9801 break;
9802 }
9803
9804 vcpu->run->exit_reason = KVM_EXIT_HYPERCALL;
9805 vcpu->run->hypercall.nr = KVM_HC_MAP_GPA_RANGE;
9806 vcpu->run->hypercall.args[0] = gpa;
9807 vcpu->run->hypercall.args[1] = npages;
9808 vcpu->run->hypercall.args[2] = attrs;
9809 vcpu->run->hypercall.longmode = op_64_bit;
9810 vcpu->arch.complete_userspace_io = complete_hypercall_exit;
9811 return 0;
9812 }
8776e519
HB
9813 default:
9814 ret = -KVM_ENOSYS;
9815 break;
9816 }
696ca779 9817out:
a449c7aa
NA
9818 if (!op_64_bit)
9819 ret = (u32)ret;
de3cd117 9820 kvm_rax_write(vcpu, ret);
6356ee0c 9821
f11c3a8d 9822 ++vcpu->stat.hypercalls;
6356ee0c 9823 return kvm_skip_emulated_instruction(vcpu);
8776e519
HB
9824}
9825EXPORT_SYMBOL_GPL(kvm_emulate_hypercall);
9826
b6785def 9827static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt)
8776e519 9828{
d6aa1000 9829 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
8776e519 9830 char instruction[3];
5fdbf976 9831 unsigned long rip = kvm_rip_read(vcpu);
8776e519 9832
f1a9761f
OU
9833 /*
9834 * If the quirk is disabled, synthesize a #UD and let the guest pick up
9835 * the pieces.
9836 */
9837 if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_FIX_HYPERCALL_INSN)) {
9838 ctxt->exception.error_code_valid = false;
9839 ctxt->exception.vector = UD_VECTOR;
9840 ctxt->have_exception = true;
9841 return X86EMUL_PROPAGATE_FAULT;
9842 }
9843
b3646477 9844 static_call(kvm_x86_patch_hypercall)(vcpu, instruction);
8776e519 9845
ce2e852e
DV
9846 return emulator_write_emulated(ctxt, rip, instruction, 3,
9847 &ctxt->exception);
8776e519
HB
9848}
9849
851ba692 9850static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu)
b6c7a5dc 9851{
782d422b
MG
9852 return vcpu->run->request_interrupt_window &&
9853 likely(!pic_in_kernel(vcpu->kvm));
b6c7a5dc
HB
9854}
9855
8d25b7be 9856/* Called within kvm->srcu read side. */
851ba692 9857static void post_kvm_run_save(struct kvm_vcpu *vcpu)
b6c7a5dc 9858{
851ba692
AK
9859 struct kvm_run *kvm_run = vcpu->run;
9860
c5063551 9861 kvm_run->if_flag = static_call(kvm_x86_get_if_flag)(vcpu);
2d3ad1f4 9862 kvm_run->cr8 = kvm_get_cr8(vcpu);
b6c7a5dc 9863 kvm_run->apic_base = kvm_get_apic_base(vcpu);
f3d1436d 9864
127a457a
MG
9865 kvm_run->ready_for_interrupt_injection =
9866 pic_in_kernel(vcpu->kvm) ||
782d422b 9867 kvm_vcpu_ready_for_interrupt_injection(vcpu);
15aad3be
CQ
9868
9869 if (is_smm(vcpu))
9870 kvm_run->flags |= KVM_RUN_X86_SMM;
b6c7a5dc
HB
9871}
9872
95ba8273
GN
9873static void update_cr8_intercept(struct kvm_vcpu *vcpu)
9874{
9875 int max_irr, tpr;
9876
afaf0b2f 9877 if (!kvm_x86_ops.update_cr8_intercept)
95ba8273
GN
9878 return;
9879
bce87cce 9880 if (!lapic_in_kernel(vcpu))
88c808fd
AK
9881 return;
9882
ce0a58f4 9883 if (vcpu->arch.apic->apicv_active)
d62caabb
AS
9884 return;
9885
8db3baa2
GN
9886 if (!vcpu->arch.apic->vapic_addr)
9887 max_irr = kvm_lapic_find_highest_irr(vcpu);
9888 else
9889 max_irr = -1;
95ba8273
GN
9890
9891 if (max_irr != -1)
9892 max_irr >>= 4;
9893
9894 tpr = kvm_lapic_get_cr8(vcpu);
9895
b3646477 9896 static_call(kvm_x86_update_cr8_intercept)(vcpu, tpr, max_irr);
95ba8273
GN
9897}
9898
b97f0745 9899
cb6a32c2
SC
9900int kvm_check_nested_events(struct kvm_vcpu *vcpu)
9901{
92e7d5c8 9902 if (kvm_test_request(KVM_REQ_TRIPLE_FAULT, vcpu)) {
cb6a32c2
SC
9903 kvm_x86_ops.nested_ops->triple_fault(vcpu);
9904 return 1;
9905 }
9906
9907 return kvm_x86_ops.nested_ops->check_events(vcpu);
9908}
9909
b97f0745
ML
9910static void kvm_inject_exception(struct kvm_vcpu *vcpu)
9911{
d4963e31 9912 trace_kvm_inj_exception(vcpu->arch.exception.vector,
a61d7c54
SC
9913 vcpu->arch.exception.has_error_code,
9914 vcpu->arch.exception.error_code,
9915 vcpu->arch.exception.injected);
9916
b97f0745
ML
9917 if (vcpu->arch.exception.error_code && !is_protmode(vcpu))
9918 vcpu->arch.exception.error_code = false;
6ad75c5c 9919 static_call(kvm_x86_inject_exception)(vcpu);
b97f0745
ML
9920}
9921
e746c1f1
SC
9922/*
9923 * Check for any event (interrupt or exception) that is ready to be injected,
9924 * and if there is at least one event, inject the event with the highest
9925 * priority. This handles both "pending" events, i.e. events that have never
9926 * been injected into the guest, and "injected" events, i.e. events that were
9927 * injected as part of a previous VM-Enter, but weren't successfully delivered
9928 * and need to be re-injected.
9929 *
9930 * Note, this is not guaranteed to be invoked on a guest instruction boundary,
9931 * i.e. doesn't guarantee that there's an event window in the guest. KVM must
9932 * be able to inject exceptions in the "middle" of an instruction, and so must
9933 * also be able to re-inject NMIs and IRQs in the middle of an instruction.
9934 * I.e. for exceptions and re-injected events, NOT invoking this on instruction
9935 * boundaries is necessary and correct.
9936 *
9937 * For simplicity, KVM uses a single path to inject all events (except events
9938 * that are injected directly from L1 to L2) and doesn't explicitly track
9939 * instruction boundaries for asynchronous events. However, because VM-Exits
9940 * that can occur during instruction execution typically result in KVM skipping
9941 * the instruction or injecting an exception, e.g. instruction and exception
9942 * intercepts, and because pending exceptions have higher priority than pending
9943 * interrupts, KVM still honors instruction boundaries in most scenarios.
9944 *
9945 * But, if a VM-Exit occurs during instruction execution, and KVM does NOT skip
 9946 * the instruction or inject an exception, then KVM can incorrectly inject a new
 9947 * asynchronous event if the event became pending after the CPU fetched the
9948 * instruction (in the guest). E.g. if a page fault (#PF, #NPF, EPT violation)
9949 * occurs and is resolved by KVM, a coincident NMI, SMI, IRQ, etc... can be
9950 * injected on the restarted instruction instead of being deferred until the
9951 * instruction completes.
9952 *
9953 * In practice, this virtualization hole is unlikely to be observed by the
9954 * guest, and even less likely to cause functional problems. To detect the
9955 * hole, the guest would have to trigger an event on a side effect of an early
9956 * phase of instruction execution, e.g. on the instruction fetch from memory.
9957 * And for it to be a functional problem, the guest would need to depend on the
9958 * ordering between that side effect, the instruction completing, _and_ the
9959 * delivery of the asynchronous event.
9960 */
9961static int kvm_check_and_inject_events(struct kvm_vcpu *vcpu,
9962 bool *req_immediate_exit)
95ba8273 9963{
28360f88 9964 bool can_inject;
b6b8a145
JK
9965 int r;
9966
6c593b52
SC
9967 /*
9968 * Process nested events first, as nested VM-Exit supersedes event
9969 * re-injection. If there's an event queued for re-injection, it will
9970 * be saved into the appropriate vmc{b,s}12 fields on nested VM-Exit.
9971 */
9972 if (is_guest_mode(vcpu))
9973 r = kvm_check_nested_events(vcpu);
9974 else
9975 r = 0;
664f8e26
WL
9976
9977 /*
6c593b52
SC
9978 * Re-inject exceptions and events *especially* if immediate entry+exit
9979 * to/from L2 is needed, as any event that has already been injected
9980 * into L2 needs to complete its lifecycle before injecting a new event.
9981 *
9982 * Don't re-inject an NMI or interrupt if there is a pending exception.
9983 * This collision arises if an exception occurred while vectoring the
9984 * injected event, KVM intercepted said exception, and KVM ultimately
9985 * determined the fault belongs to the guest and queues the exception
9986 * for injection back into the guest.
9987 *
9988 * "Injected" interrupts can also collide with pending exceptions if
9989 * userspace ignores the "ready for injection" flag and blindly queues
9990 * an interrupt. In that case, prioritizing the exception is correct,
9991 * as the exception "occurred" before the exit to userspace. Trap-like
9992 * exceptions, e.g. most #DBs, have higher priority than interrupts.
9993 * And while fault-like exceptions, e.g. #GP and #PF, are the lowest
9994 * priority, they're only generated (pended) during instruction
9995 * execution, and interrupts are recognized at instruction boundaries.
9996 * Thus a pending fault-like exception means the fault occurred on the
9997 * *previous* instruction and must be serviced prior to recognizing any
9998 * new events in order to fully complete the previous instruction.
664f8e26 9999 */
6c593b52
SC
10000 if (vcpu->arch.exception.injected)
10001 kvm_inject_exception(vcpu);
7709aba8 10002 else if (kvm_is_exception_pending(vcpu))
6c593b52
SC
10003 ; /* see above */
10004 else if (vcpu->arch.nmi_injected)
10005 static_call(kvm_x86_inject_nmi)(vcpu);
10006 else if (vcpu->arch.interrupt.injected)
10007 static_call(kvm_x86_inject_irq)(vcpu, true);
664f8e26 10008
6c593b52
SC
10009 /*
10010 * Exceptions that morph to VM-Exits are handled above, and pending
10011 * exceptions on top of injected exceptions that do not VM-Exit should
10012 * either morph to #DF or, sadly, override the injected exception.
10013 */
3b82b8d7
SC
10014 WARN_ON_ONCE(vcpu->arch.exception.injected &&
10015 vcpu->arch.exception.pending);
10016
1a680e35 10017 /*
6c593b52
SC
10018 * Bail if immediate entry+exit to/from the guest is needed to complete
10019 * nested VM-Enter or event re-injection so that a different pending
10020 * event can be serviced (or if KVM needs to exit to userspace).
10021 *
10022 * Otherwise, continue processing events even if VM-Exit occurred. The
10023 * VM-Exit will have cleared exceptions that were meant for L2, but
10024 * there may now be events that can be injected into L1.
1a680e35 10025 */
6c593b52
SC
10026 if (r < 0)
10027 goto out;
664f8e26 10028
7709aba8
SC
10029 /*
10030 * A pending exception VM-Exit should either result in nested VM-Exit
10031 * or force an immediate re-entry and exit to/from L2, and exception
10032 * VM-Exits cannot be injected (flag should _never_ be set).
10033 */
10034 WARN_ON_ONCE(vcpu->arch.exception_vmexit.injected ||
10035 vcpu->arch.exception_vmexit.pending);
10036
28360f88
SC
10037 /*
10038 * New events, other than exceptions, cannot be injected if KVM needs
10039 * to re-inject a previous event. See above comments on re-injecting
10040 * for why pending exceptions get priority.
10041 */
10042 can_inject = !kvm_event_needs_reinjection(vcpu);
664f8e26 10043
b59bb7bd 10044 if (vcpu->arch.exception.pending) {
5623f751
SC
10045 /*
10046 * Fault-class exceptions, except #DBs, set RF=1 in the RFLAGS
10047 * value pushed on the stack. Trap-like exceptions and all #DBs
10048 * leave RF as-is (KVM follows Intel's behavior in this regard;
10049 * AMD states that code breakpoint #DBs explicitly clear RF=0).
10050 *
10051 * Note, most versions of Intel's SDM and AMD's APM incorrectly
10052 * describe the behavior of General Detect #DBs, which are
10053 * fault-like. They do _not_ set RF, a la code breakpoints.
10054 */
d4963e31 10055 if (exception_type(vcpu->arch.exception.vector) == EXCPT_FAULT)
d6e8c854
NA
10056 __kvm_set_rflags(vcpu, kvm_get_rflags(vcpu) |
10057 X86_EFLAGS_RF);
10058
d4963e31
SC
10059 if (vcpu->arch.exception.vector == DB_VECTOR) {
10060 kvm_deliver_exception_payload(vcpu, &vcpu->arch.exception);
f10c729f
JM
10061 if (vcpu->arch.dr7 & DR7_GD) {
10062 vcpu->arch.dr7 &= ~DR7_GD;
10063 kvm_update_dr7(vcpu);
10064 }
6bdf0662
NA
10065 }
10066
b97f0745 10067 kvm_inject_exception(vcpu);
a61d7c54
SC
10068
10069 vcpu->arch.exception.pending = false;
10070 vcpu->arch.exception.injected = true;
10071
c6b22f59 10072 can_inject = false;
1a680e35
LA
10073 }
10074
61e5f69e
ML
10075 /* Don't inject interrupts if the user asked to avoid doing so */
10076 if (vcpu->guest_debug & KVM_GUESTDBG_BLOCKIRQ)
10077 return 0;
10078
c9d40913
PB
10079 /*
10080 * Finally, inject interrupt events. If an event cannot be injected
10081 * due to architectural conditions (e.g. IF=0) a window-open exit
10082 * will re-request KVM_REQ_EVENT. Sometimes, however, an event is pending
10083 * and can architecturally be injected, but we cannot do it right now:
10084 * an interrupt could have arrived just now and we have to inject it
10085 * as a vmexit, or there could already be an event in the queue, which is
10086 * indicated by can_inject. In that case we request an immediate exit
10087 * in order to make progress and get back here for another iteration.
10088 * The kvm_x86_ops hooks communicate this by returning -EBUSY.
10089 */
31e83e21 10090#ifdef CONFIG_KVM_SMM
c9d40913 10091 if (vcpu->arch.smi_pending) {
b3646477 10092 r = can_inject ? static_call(kvm_x86_smi_allowed)(vcpu, true) : -EBUSY;
c9d40913 10093 if (r < 0)
a5f6909a 10094 goto out;
c9d40913
PB
10095 if (r) {
10096 vcpu->arch.smi_pending = false;
10097 ++vcpu->arch.smi_count;
10098 enter_smm(vcpu);
10099 can_inject = false;
10100 } else
b3646477 10101 static_call(kvm_x86_enable_smi_window)(vcpu);
c9d40913 10102 }
31e83e21 10103#endif
c9d40913
PB
10104
10105 if (vcpu->arch.nmi_pending) {
b3646477 10106 r = can_inject ? static_call(kvm_x86_nmi_allowed)(vcpu, true) : -EBUSY;
c9d40913 10107 if (r < 0)
a5f6909a 10108 goto out;
c9d40913
PB
10109 if (r) {
10110 --vcpu->arch.nmi_pending;
10111 vcpu->arch.nmi_injected = true;
e27bc044 10112 static_call(kvm_x86_inject_nmi)(vcpu);
c9d40913 10113 can_inject = false;
b3646477 10114 WARN_ON(static_call(kvm_x86_nmi_allowed)(vcpu, true) < 0);
c9d40913
PB
10115 }
10116 if (vcpu->arch.nmi_pending)
b3646477 10117 static_call(kvm_x86_enable_nmi_window)(vcpu);
c9d40913 10118 }
1a680e35 10119
c9d40913 10120 if (kvm_cpu_has_injectable_intr(vcpu)) {
b3646477 10121 r = can_inject ? static_call(kvm_x86_interrupt_allowed)(vcpu, true) : -EBUSY;
c9d40913 10122 if (r < 0)
a5f6909a 10123 goto out;
c9d40913
PB
10124 if (r) {
10125 kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu), false);
2d613912 10126 static_call(kvm_x86_inject_irq)(vcpu, false);
b3646477 10127 WARN_ON(static_call(kvm_x86_interrupt_allowed)(vcpu, true) < 0);
c9d40913
PB
10128 }
10129 if (kvm_cpu_has_injectable_intr(vcpu))
b3646477 10130 static_call(kvm_x86_enable_irq_window)(vcpu);
95ba8273 10131 }
ee2cd4b7 10132
c9d40913 10133 if (is_guest_mode(vcpu) &&
5b4ac1a1
PB
10134 kvm_x86_ops.nested_ops->has_events &&
10135 kvm_x86_ops.nested_ops->has_events(vcpu))
c9d40913
PB
10136 *req_immediate_exit = true;
10137
dea0d5a2
SC
10138 /*
10139 * KVM must never queue a new exception while injecting an event; KVM
10140 * is done emulating and should only propagate the to-be-injected event
10141 * to the VMCS/VMCB. Queueing a new exception can put the vCPU into an
10142 * infinite loop as KVM will bail from VM-Enter to inject the pending
10143 * exception and start the cycle all over.
10144 *
10145 * Exempt triple faults as they have special handling and won't put the
10146 * vCPU into an infinite loop. Triple fault can be queued when running
10147 * VMX without unrestricted guest, as that requires KVM to emulate Real
10148 * Mode events (see kvm_inject_realmode_interrupt()).
10149 */
10150 WARN_ON_ONCE(vcpu->arch.exception.pending ||
10151 vcpu->arch.exception_vmexit.pending);
a5f6909a 10152 return 0;
c9d40913 10153
a5f6909a
JM
10154out:
10155 if (r == -EBUSY) {
10156 *req_immediate_exit = true;
10157 r = 0;
10158 }
10159 return r;
95ba8273
GN
10160}
10161
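The pending vs. injected split managed by kvm_check_and_inject_events() is visible to userspace through the KVM_GET_VCPU_EVENTS ioctl. A minimal userspace-side sketch (not part of x86.c; vcpu_fd is an assumed, already-created vCPU file descriptor):

#include <linux/kvm.h>
#include <sys/ioctl.h>
#include <stdio.h>

/* Dump the exception/NMI/IRQ re-injection state of a vCPU. */
static void dump_event_state(int vcpu_fd)
{
	struct kvm_vcpu_events ev;

	if (ioctl(vcpu_fd, KVM_GET_VCPU_EVENTS, &ev) < 0) {
		perror("KVM_GET_VCPU_EVENTS");
		return;
	}
	printf("exception: pending=%u injected=%u vector=%u\n",
	       ev.exception.pending, ev.exception.injected, ev.exception.nr);
	printf("nmi:       pending=%u injected=%u\n",
	       ev.nmi.pending, ev.nmi.injected);
	printf("interrupt: injected=%u nr=%u soft=%u\n",
	       ev.interrupt.injected, ev.interrupt.nr, ev.interrupt.soft);
}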
7460fb4a
AK
10162static void process_nmi(struct kvm_vcpu *vcpu)
10163{
10164 unsigned limit = 2;
10165
10166 /*
10167 * x86 is limited to one NMI running, and one NMI pending after it.
10168 * If an NMI is already in progress, limit further NMIs to just one.
10169 * Otherwise, allow two (and we'll inject the first one immediately).
10170 */
b3646477 10171 if (static_call(kvm_x86_get_nmi_mask)(vcpu) || vcpu->arch.nmi_injected)
7460fb4a
AK
10172 limit = 1;
10173
10174 vcpu->arch.nmi_pending += atomic_xchg(&vcpu->arch.nmi_queued, 0);
10175 vcpu->arch.nmi_pending = min(vcpu->arch.nmi_pending, limit);
10176 kvm_make_request(KVM_REQ_EVENT, vcpu);
10177}
10178
7ee30bc1
NNL
10179void kvm_make_scan_ioapic_request_mask(struct kvm *kvm,
10180 unsigned long *vcpu_bitmap)
10181{
620b2438 10182 kvm_make_vcpus_request_mask(kvm, KVM_REQ_SCAN_IOAPIC, vcpu_bitmap);
7ee30bc1
NNL
10183}
10184
2860c4b1
PB
10185void kvm_make_scan_ioapic_request(struct kvm *kvm)
10186{
10187 kvm_make_all_cpus_request(kvm, KVM_REQ_SCAN_IOAPIC);
10188}
10189
2008fab3 10190void __kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu)
8df14af4 10191{
ce0a58f4 10192 struct kvm_lapic *apic = vcpu->arch.apic;
06ef8134
ML
10193 bool activate;
10194
8df14af4
SS
10195 if (!lapic_in_kernel(vcpu))
10196 return;
10197
187c8833 10198 down_read(&vcpu->kvm->arch.apicv_update_lock);
66c768d3 10199 preempt_disable();
b0a1637f 10200
8fc9c7a3
SS
10201 /* Do not activate APICV when APIC is disabled */
10202 activate = kvm_vcpu_apicv_activated(vcpu) &&
10203 (kvm_get_apic_mode(vcpu) != LAPIC_MODE_DISABLED);
d5fa597e 10204
ce0a58f4 10205 if (apic->apicv_active == activate)
06ef8134
ML
10206 goto out;
10207
ce0a58f4 10208 apic->apicv_active = activate;
8df14af4 10209 kvm_apic_update_apicv(vcpu);
b3646477 10210 static_call(kvm_x86_refresh_apicv_exec_ctrl)(vcpu);
bca66dbc
VK
10211
10212 /*
10213 * When APICv gets disabled, we may still have injected interrupts
10214 * pending. At the same time, KVM_REQ_EVENT may not be set as APICv was
10215 * still active when the interrupt got accepted. Make sure
e746c1f1 10216 * kvm_check_and_inject_events() is called to check for that.
bca66dbc 10217 */
ce0a58f4 10218 if (!apic->apicv_active)
bca66dbc 10219 kvm_make_request(KVM_REQ_EVENT, vcpu);
b0a1637f 10220
06ef8134 10221out:
66c768d3 10222 preempt_enable();
187c8833 10223 up_read(&vcpu->kvm->arch.apicv_update_lock);
8df14af4 10224}
2008fab3
SC
10225EXPORT_SYMBOL_GPL(__kvm_vcpu_update_apicv);
10226
10227static void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu)
10228{
10229 if (!lapic_in_kernel(vcpu))
10230 return;
10231
10232 /*
10233 * Due to sharing page tables across vCPUs, the xAPIC memslot must be
10234 * deleted if any vCPU has xAPIC virtualization and x2APIC enabled, but
10235 * hardware doesn't support x2APIC virtualization. E.g. some AMD
10236 * CPUs support AVIC but not x2APIC. KVM still allows enabling AVIC in
10237 * this case so that KVM can use the AVIC doorbell to inject interrupts to
10238 * running vCPUs, but KVM must not create SPTEs for the APIC base as
10239 * the vCPU would incorrectly be able to access the vAPIC page via MMIO
10240 * despite being in x2APIC mode. For simplicity, inhibiting the APIC
10241 * access page is sticky.
10242 */
10243 if (apic_x2apic_mode(vcpu->arch.apic) &&
10244 kvm_x86_ops.allow_apicv_in_x2apic_without_x2apic_virtualization)
10245 kvm_inhibit_apic_access_page(vcpu);
10246
10247 __kvm_vcpu_update_apicv(vcpu);
10248}
8df14af4 10249
320af55a
SC
10250void __kvm_set_or_clear_apicv_inhibit(struct kvm *kvm,
10251 enum kvm_apicv_inhibit reason, bool set)
8df14af4 10252{
b0a1637f 10253 unsigned long old, new;
8e205a6b 10254
187c8833
SC
10255 lockdep_assert_held_write(&kvm->arch.apicv_update_lock);
10256
b3f257a8 10257 if (!(kvm_x86_ops.required_apicv_inhibits & BIT(reason)))
ef8efd7a
SS
10258 return;
10259
b0a1637f
ML
10260 old = new = kvm->arch.apicv_inhibit_reasons;
10261
4f4c4a3e 10262 set_or_clear_apicv_inhibit(&new, reason, set);
8e205a6b 10263
36222b11 10264 if (!!old != !!new) {
ee49a893
SC
10265 /*
10266 * Kick all vCPUs before setting apicv_inhibit_reasons to avoid
10267 * false positives in the sanity check WARN in svm_vcpu_run().
10268 * This task will wait for all vCPUs to ack the kick IRQ before
10269 * updating apicv_inhibit_reasons, and all other vCPUs will
10270 * block on acquiring apicv_update_lock so that vCPUs can't
10271 * redo svm_vcpu_run() without seeing the new inhibit state.
10272 *
10273 * Note, holding apicv_update_lock and taking it in the read
10274 * side (handling the request) also prevents other vCPUs from
10275 * servicing the request with a stale apicv_inhibit_reasons.
10276 */
36222b11 10277 kvm_make_all_cpus_request(kvm, KVM_REQ_APICV_UPDATE);
b0a1637f 10278 kvm->arch.apicv_inhibit_reasons = new;
36222b11
ML
10279 if (new) {
10280 unsigned long gfn = gpa_to_gfn(APIC_DEFAULT_PHYS_BASE);
074c0080
BG
10281 int idx = srcu_read_lock(&kvm->srcu);
10282
36222b11 10283 kvm_zap_gfn_range(kvm, gfn, gfn+1);
074c0080 10284 srcu_read_unlock(&kvm->srcu, idx);
36222b11 10285 }
7491b7b2 10286 } else {
b0a1637f 10287 kvm->arch.apicv_inhibit_reasons = new;
7491b7b2 10288 }
b0a1637f 10289}
7d611233 10290
320af55a
SC
10291void kvm_set_or_clear_apicv_inhibit(struct kvm *kvm,
10292 enum kvm_apicv_inhibit reason, bool set)
b0a1637f 10293{
f1575642
SC
10294 if (!enable_apicv)
10295 return;
10296
187c8833 10297 down_write(&kvm->arch.apicv_update_lock);
320af55a 10298 __kvm_set_or_clear_apicv_inhibit(kvm, reason, set);
187c8833 10299 up_write(&kvm->arch.apicv_update_lock);
8df14af4 10300}
320af55a 10301EXPORT_SYMBOL_GPL(kvm_set_or_clear_apicv_inhibit);
8df14af4 10302
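For reference, an illustrative caller pattern for the inhibit API above (a sketch, not code from x86.c); APICV_INHIBIT_REASON_BLOCKIRQ is the reason used later in this file for KVM_GUESTDBG_BLOCKIRQ and is only an example here:

/*
 * Illustrative fragment: toggle a single inhibit reason.  The helper
 * takes the write side of apicv_update_lock and kicks all vCPUs only
 * when the zero/non-zero state of the inhibit mask actually changes.
 */
static void example_toggle_blockirq_inhibit(struct kvm *kvm, bool block)
{
	kvm_set_or_clear_apicv_inhibit(kvm, APICV_INHIBIT_REASON_BLOCKIRQ,
				       block);
}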
3d81bc7e 10303static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu)
c7c9c56c 10304{
dcbd3e49 10305 if (!kvm_apic_present(vcpu))
3d81bc7e 10306 return;
c7c9c56c 10307
6308630b 10308 bitmap_zero(vcpu->arch.ioapic_handled_vectors, 256);
c7c9c56c 10309
b053b2ae 10310 if (irqchip_split(vcpu->kvm))
6308630b 10311 kvm_scan_ioapic_routes(vcpu, vcpu->arch.ioapic_handled_vectors);
db2bdcbb 10312 else {
37c4dbf3 10313 static_call_cond(kvm_x86_sync_pir_to_irr)(vcpu);
e97f852f
WL
10314 if (ioapic_in_kernel(vcpu->kvm))
10315 kvm_ioapic_scan_entry(vcpu, vcpu->arch.ioapic_handled_vectors);
db2bdcbb 10316 }
e40ff1d6
LA
10317
10318 if (is_guest_mode(vcpu))
10319 vcpu->arch.load_eoi_exitmap_pending = true;
10320 else
10321 kvm_make_request(KVM_REQ_LOAD_EOI_EXITMAP, vcpu);
10322}
10323
10324static void vcpu_load_eoi_exitmap(struct kvm_vcpu *vcpu)
10325{
10326 u64 eoi_exit_bitmap[4];
10327
10328 if (!kvm_apic_hw_enabled(vcpu->arch.apic))
10329 return;
10330
c5adbb3a 10331 if (to_hv_vcpu(vcpu)) {
f2bc14b6
VK
10332 bitmap_or((ulong *)eoi_exit_bitmap,
10333 vcpu->arch.ioapic_handled_vectors,
10334 to_hv_synic(vcpu)->vec_bitmap, 256);
abb6d479 10335 static_call_cond(kvm_x86_load_eoi_exitmap)(vcpu, eoi_exit_bitmap);
c5adbb3a 10336 return;
10337 }
f2bc14b6 10338
abb6d479 10339 static_call_cond(kvm_x86_load_eoi_exitmap)(
c5adbb3a 10340 vcpu, (u64 *)vcpu->arch.ioapic_handled_vectors);
c7c9c56c
YZ
10341}
10342
e649b3f0
ET
10343void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
10344 unsigned long start, unsigned long end)
b1394e74
RK
10345{
10346 unsigned long apic_address;
10347
10348 /*
10349 * The physical address of the APIC access page is stored in the VMCS.
10350 * Update it when it becomes invalid.
10351 */
10352 apic_address = gfn_to_hva(kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT);
10353 if (start <= apic_address && apic_address < end)
10354 kvm_make_all_cpus_request(kvm, KVM_REQ_APIC_PAGE_RELOAD);
10355}
10356
683412cc
MZ
10357void kvm_arch_guest_memory_reclaimed(struct kvm *kvm)
10358{
10359 static_call_cond(kvm_x86_guest_memory_reclaimed)(kvm);
10360}
10361
d081a343 10362static void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu)
4256f43f 10363{
35754c98 10364 if (!lapic_in_kernel(vcpu))
f439ed27
PB
10365 return;
10366
2a890614 10367 static_call_cond(kvm_x86_set_apic_access_page_addr)(vcpu);
4256f43f 10368}
4256f43f 10369
d264ee0c
SC
10370void __kvm_request_immediate_exit(struct kvm_vcpu *vcpu)
10371{
10372 smp_send_reschedule(vcpu->cpu);
10373}
10374EXPORT_SYMBOL_GPL(__kvm_request_immediate_exit);
10375
9357d939 10376/*
8d25b7be 10377 * Called within kvm->srcu read side.
362c698f 10378 * Returns 1 to let vcpu_run() continue the guest execution loop without
9357d939
TY
10379 * exiting to the userspace. Otherwise, the value will be returned to the
10380 * userspace.
10381 */
851ba692 10382static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
b6c7a5dc
HB
10383{
10384 int r;
62a193ed
MG
10385 bool req_int_win =
10386 dm_request_for_irq_injection(vcpu) &&
10387 kvm_cpu_accept_dm_intr(vcpu);
404d5d7b 10388 fastpath_t exit_fastpath;
62a193ed 10389
730dca42 10390 bool req_immediate_exit = false;
b6c7a5dc 10391
2fa6e1e1 10392 if (kvm_request_pending(vcpu)) {
f4d31653 10393 if (kvm_check_request(KVM_REQ_VM_DEAD, vcpu)) {
67369273
SC
10394 r = -EIO;
10395 goto out;
10396 }
cf87ac73
GS
10397
10398 if (kvm_dirty_ring_check_request(vcpu)) {
10399 r = 0;
10400 goto out;
10401 }
10402
729c15c2 10403 if (kvm_check_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu)) {
9a78e158 10404 if (unlikely(!kvm_x86_ops.nested_ops->get_nested_state_pages(vcpu))) {
671ddc70
JM
10405 r = 0;
10406 goto out;
10407 }
10408 }
527d5cd7
SC
10409 if (kvm_check_request(KVM_REQ_MMU_FREE_OBSOLETE_ROOTS, vcpu))
10410 kvm_mmu_free_obsolete_roots(vcpu);
a8eeb04a 10411 if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu))
2f599714 10412 __kvm_migrate_timers(vcpu);
d828199e 10413 if (kvm_check_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu))
6b6fcd28 10414 kvm_update_masterclock(vcpu->kvm);
0061d53d
MT
10415 if (kvm_check_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu))
10416 kvm_gen_kvmclock_update(vcpu);
34c238a1
ZA
10417 if (kvm_check_request(KVM_REQ_CLOCK_UPDATE, vcpu)) {
10418 r = kvm_guest_time_update(vcpu);
8cfdc000
ZA
10419 if (unlikely(r))
10420 goto out;
10421 }
a8eeb04a 10422 if (kvm_check_request(KVM_REQ_MMU_SYNC, vcpu))
4731d4c7 10423 kvm_mmu_sync_roots(vcpu);
727a7e27
PB
10424 if (kvm_check_request(KVM_REQ_LOAD_MMU_PGD, vcpu))
10425 kvm_mmu_load_pgd(vcpu);
e94cea09
SC
10426
10427 /*
10428 * Note, the order matters here, as flushing "all" TLB entries
10429 * also flushes the "current" TLB entries, i.e. servicing the
10430 * flush "all" will clear any request to flush "current".
10431 */
10432 if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu))
7780938c 10433 kvm_vcpu_flush_tlb_all(vcpu);
eeeb4f67 10434
40e5f908 10435 kvm_service_local_tlb_flush_requests(vcpu);
eeeb4f67 10436
0823570f
VK
10437 /*
10438 * Fall back to a "full" guest flush if Hyper-V's precise
10439 * flushing fails. Note, Hyper-V's flushing is per-vCPU, but
10440 * the flushes are considered "remote" and not "local" because
10441 * the requests can be initiated from other vCPUs.
10442 */
10443 if (kvm_check_request(KVM_REQ_HV_TLB_FLUSH, vcpu) &&
10444 kvm_hv_vcpu_flush_tlb(vcpu))
adc43caa
VK
10445 kvm_vcpu_flush_tlb_guest(vcpu);
10446
a8eeb04a 10447 if (kvm_check_request(KVM_REQ_REPORT_TPR_ACCESS, vcpu)) {
851ba692 10448 vcpu->run->exit_reason = KVM_EXIT_TPR_ACCESS;
b93463aa
AK
10449 r = 0;
10450 goto out;
10451 }
92e7d5c8
ML
10452 if (kvm_test_request(KVM_REQ_TRIPLE_FAULT, vcpu)) {
10453 if (is_guest_mode(vcpu))
cb6a32c2 10454 kvm_x86_ops.nested_ops->triple_fault(vcpu);
92e7d5c8
ML
10455
10456 if (kvm_check_request(KVM_REQ_TRIPLE_FAULT, vcpu)) {
cb6a32c2
SC
10457 vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;
10458 vcpu->mmio_needed = 0;
10459 r = 0;
e542baf3 10460 goto out;
cb6a32c2 10461 }
71c4dfaf 10462 }
af585b92
GN
10463 if (kvm_check_request(KVM_REQ_APF_HALT, vcpu)) {
10464 /* Page is swapped out. Do synthetic halt */
10465 vcpu->arch.apf.halted = true;
10466 r = 1;
10467 goto out;
10468 }
c9aaa895
GC
10469 if (kvm_check_request(KVM_REQ_STEAL_UPDATE, vcpu))
10470 record_steal_time(vcpu);
cf7316d0 10471#ifdef CONFIG_KVM_SMM
64d60670
PB
10472 if (kvm_check_request(KVM_REQ_SMI, vcpu))
10473 process_smi(vcpu);
cf7316d0 10474#endif
7460fb4a
AK
10475 if (kvm_check_request(KVM_REQ_NMI, vcpu))
10476 process_nmi(vcpu);
f5132b01 10477 if (kvm_check_request(KVM_REQ_PMU, vcpu))
c6702c9d 10478 kvm_pmu_handle_event(vcpu);
f5132b01 10479 if (kvm_check_request(KVM_REQ_PMI, vcpu))
c6702c9d 10480 kvm_pmu_deliver_pmi(vcpu);
7543a635
SR
10481 if (kvm_check_request(KVM_REQ_IOAPIC_EOI_EXIT, vcpu)) {
10482 BUG_ON(vcpu->arch.pending_ioapic_eoi > 255);
10483 if (test_bit(vcpu->arch.pending_ioapic_eoi,
6308630b 10484 vcpu->arch.ioapic_handled_vectors)) {
7543a635
SR
10485 vcpu->run->exit_reason = KVM_EXIT_IOAPIC_EOI;
10486 vcpu->run->eoi.vector =
10487 vcpu->arch.pending_ioapic_eoi;
10488 r = 0;
10489 goto out;
10490 }
10491 }
3d81bc7e
YZ
10492 if (kvm_check_request(KVM_REQ_SCAN_IOAPIC, vcpu))
10493 vcpu_scan_ioapic(vcpu);
e40ff1d6
LA
10494 if (kvm_check_request(KVM_REQ_LOAD_EOI_EXITMAP, vcpu))
10495 vcpu_load_eoi_exitmap(vcpu);
4256f43f
TC
10496 if (kvm_check_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu))
10497 kvm_vcpu_reload_apic_access_page(vcpu);
2ce79189
AS
10498 if (kvm_check_request(KVM_REQ_HV_CRASH, vcpu)) {
10499 vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT;
10500 vcpu->run->system_event.type = KVM_SYSTEM_EVENT_CRASH;
d495f942 10501 vcpu->run->system_event.ndata = 0;
2ce79189
AS
10502 r = 0;
10503 goto out;
10504 }
e516cebb
AS
10505 if (kvm_check_request(KVM_REQ_HV_RESET, vcpu)) {
10506 vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT;
10507 vcpu->run->system_event.type = KVM_SYSTEM_EVENT_RESET;
d495f942 10508 vcpu->run->system_event.ndata = 0;
e516cebb
AS
10509 r = 0;
10510 goto out;
10511 }
db397571 10512 if (kvm_check_request(KVM_REQ_HV_EXIT, vcpu)) {
9ff5e030
VK
10513 struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
10514
db397571 10515 vcpu->run->exit_reason = KVM_EXIT_HYPERV;
9ff5e030 10516 vcpu->run->hyperv = hv_vcpu->exit;
db397571
AS
10517 r = 0;
10518 goto out;
10519 }
f3b138c5
AS
10520
10521 /*
10522 * KVM_REQ_HV_STIMER has to be processed after
10523 * KVM_REQ_CLOCK_UPDATE, because Hyper-V SynIC timers
10524 * depend on the guest clock being up-to-date
10525 */
1f4b34f8
AS
10526 if (kvm_check_request(KVM_REQ_HV_STIMER, vcpu))
10527 kvm_hv_process_stimers(vcpu);
8df14af4
SS
10528 if (kvm_check_request(KVM_REQ_APICV_UPDATE, vcpu))
10529 kvm_vcpu_update_apicv(vcpu);
557a961a
VK
10530 if (kvm_check_request(KVM_REQ_APF_READY, vcpu))
10531 kvm_check_async_pf_completion(vcpu);
1a155254 10532 if (kvm_check_request(KVM_REQ_MSR_FILTER_CHANGED, vcpu))
b3646477 10533 static_call(kvm_x86_msr_filter_changed)(vcpu);
a85863c2
MS
10534
10535 if (kvm_check_request(KVM_REQ_UPDATE_CPU_DIRTY_LOGGING, vcpu))
10536 static_call(kvm_x86_update_cpu_dirty_logging)(vcpu);
2f52d58c 10537 }
b93463aa 10538
40da8ccd
DW
10539 if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win ||
10540 kvm_xen_has_interrupt(vcpu)) {
0f1e261e 10541 ++vcpu->stat.req_event;
4fe09bcf
JM
10542 r = kvm_apic_accept_events(vcpu);
10543 if (r < 0) {
10544 r = 0;
10545 goto out;
10546 }
66450a21
JK
10547 if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) {
10548 r = 1;
10549 goto out;
10550 }
10551
e746c1f1 10552 r = kvm_check_and_inject_events(vcpu, &req_immediate_exit);
a5f6909a
JM
10553 if (r < 0) {
10554 r = 0;
10555 goto out;
10556 }
c9d40913 10557 if (req_int_win)
b3646477 10558 static_call(kvm_x86_enable_irq_window)(vcpu);
b463a6f7
AK
10559
10560 if (kvm_lapic_enabled(vcpu)) {
10561 update_cr8_intercept(vcpu);
10562 kvm_lapic_sync_to_vapic(vcpu);
10563 }
10564 }
10565
d8368af8
AK
10566 r = kvm_mmu_reload(vcpu);
10567 if (unlikely(r)) {
d905c069 10568 goto cancel_injection;
d8368af8
AK
10569 }
10570
b6c7a5dc
HB
10571 preempt_disable();
10572
e27bc044 10573 static_call(kvm_x86_prepare_switch_to_guest)(vcpu);
b95234c8
PB
10574
10575 /*
10576 * Disable IRQs before setting IN_GUEST_MODE. Posted interrupt
10577 * IPIs are then delayed until after guest entry, which ensures that they
10578 * result in virtual interrupt delivery.
10579 */
10580 local_irq_disable();
66fa226c
ML
10581
10582 /* Store vcpu->apicv_active before vcpu->mode. */
10583 smp_store_release(&vcpu->mode, IN_GUEST_MODE);
6b7e2d09 10584
2031f287 10585 kvm_vcpu_srcu_read_unlock(vcpu);
01b71917 10586
0f127d12 10587 /*
b95234c8 10588 * 1) We should set ->mode before checking ->requests. Please see
cde9af6e 10589 * the comment in kvm_vcpu_exiting_guest_mode().
b95234c8 10590 *
81b01667 10591 * 2) For APICv, we should set ->mode before checking PID.ON. This
b95234c8
PB
10592 * pairs with the memory barrier implicit in pi_test_and_set_on
10593 * (see vmx_deliver_posted_interrupt).
10594 *
10595 * 3) This also orders the write to mode from any reads to the page
10596 * tables done while the VCPU is running. Please see the comment
10597 * in kvm_flush_remote_tlbs.
6b7e2d09 10598 */
01b71917 10599 smp_mb__after_srcu_read_unlock();
b6c7a5dc 10600
b95234c8 10601 /*
0f65a9d3
SC
10602 * Process pending posted interrupts to handle the case where the
10603 * notification IRQ arrived in the host, or was never sent (because the
10604 * target vCPU wasn't running). Do this regardless of the vCPU's APICv
10605 * status, as KVM doesn't update assigned devices when APICv is inhibited,
10606 * i.e. they can post interrupts even if APICv is temporarily disabled.
b95234c8 10607 */
37c4dbf3
PB
10608 if (kvm_lapic_enabled(vcpu))
10609 static_call_cond(kvm_x86_sync_pir_to_irr)(vcpu);
32f88400 10610
5a9f5443 10611 if (kvm_vcpu_exit_request(vcpu)) {
6b7e2d09 10612 vcpu->mode = OUTSIDE_GUEST_MODE;
d94e1dc9 10613 smp_wmb();
6c142801
AK
10614 local_irq_enable();
10615 preempt_enable();
2031f287 10616 kvm_vcpu_srcu_read_lock(vcpu);
6c142801 10617 r = 1;
d905c069 10618 goto cancel_injection;
6c142801
AK
10619 }
10620
c43203ca
PB
10621 if (req_immediate_exit) {
10622 kvm_make_request(KVM_REQ_EVENT, vcpu);
b3646477 10623 static_call(kvm_x86_request_immediate_exit)(vcpu);
c43203ca 10624 }
d6185f20 10625
2620fe26
SC
10626 fpregs_assert_state_consistent();
10627 if (test_thread_flag(TIF_NEED_FPU_LOAD))
10628 switch_fpu_return();
5f409e20 10629
ec5be88a
JL
10630 if (vcpu->arch.guest_fpu.xfd_err)
10631 wrmsrl(MSR_IA32_XFD_ERR, vcpu->arch.guest_fpu.xfd_err);
10632
42dbaa5a 10633 if (unlikely(vcpu->arch.switch_db_regs)) {
42dbaa5a
JK
10634 set_debugreg(0, 7);
10635 set_debugreg(vcpu->arch.eff_db[0], 0);
10636 set_debugreg(vcpu->arch.eff_db[1], 1);
10637 set_debugreg(vcpu->arch.eff_db[2], 2);
10638 set_debugreg(vcpu->arch.eff_db[3], 3);
f85d4016
LJ
10639 } else if (unlikely(hw_breakpoint_active())) {
10640 set_debugreg(0, 7);
42dbaa5a 10641 }
b6c7a5dc 10642
b2d2af7e
MR
10643 guest_timing_enter_irqoff();
10644
d89d04ab 10645 for (;;) {
ee49a893
SC
10646 /*
10647 * Assert that vCPU vs. VM APICv state is consistent. An APICv
10648 * update must kick and wait for all vCPUs before toggling the
10649 * per-VM state, and responding vCPUs must wait for the update
10650 * to complete before servicing KVM_REQ_APICV_UPDATE.
10651 */
f8d8ac21
SS
10652 WARN_ON_ONCE((kvm_vcpu_apicv_activated(vcpu) != kvm_vcpu_apicv_active(vcpu)) &&
10653 (kvm_get_apic_mode(vcpu) != LAPIC_MODE_DISABLED));
ee49a893 10654
e27bc044 10655 exit_fastpath = static_call(kvm_x86_vcpu_run)(vcpu);
d89d04ab
PB
10656 if (likely(exit_fastpath != EXIT_FASTPATH_REENTER_GUEST))
10657 break;
10658
37c4dbf3
PB
10659 if (kvm_lapic_enabled(vcpu))
10660 static_call_cond(kvm_x86_sync_pir_to_irr)(vcpu);
de7cd3f6
PB
10661
10662 if (unlikely(kvm_vcpu_exit_request(vcpu))) {
d89d04ab
PB
10663 exit_fastpath = EXIT_FASTPATH_EXIT_HANDLED;
10664 break;
10665 }
de7cd3f6 10666 }
b6c7a5dc 10667
c77fb5fe
PB
10668 /*
10669 * Do this here before restoring debug registers on the host. And
10670 * since we do this before handling the vmexit, a DR access vmexit
10671 * can (a) read the correct value of the debug registers, (b) set
10672 * KVM_DEBUGREG_WONT_EXIT again.
10673 */
10674 if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)) {
c77fb5fe 10675 WARN_ON(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP);
b3646477 10676 static_call(kvm_x86_sync_dirty_debug_regs)(vcpu);
70e4da7a 10677 kvm_update_dr0123(vcpu);
70e4da7a 10678 kvm_update_dr7(vcpu);
c77fb5fe
PB
10679 }
10680
24f1e32c
FW
10681 /*
10682 * If the guest has used debug registers, at least dr7
10683 * will be disabled while returning to the host.
10684 * If we don't have active breakpoints in the host, we don't
10685 * care about the messed up debug address registers. But if
10686 * we have some of them active, restore the old state.
10687 */
59d8eb53 10688 if (hw_breakpoint_active())
24f1e32c 10689 hw_breakpoint_restore();
42dbaa5a 10690
c967118d 10691 vcpu->arch.last_vmentry_cpu = vcpu->cpu;
4ba76538 10692 vcpu->arch.last_guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc());
1d5f066e 10693
6b7e2d09 10694 vcpu->mode = OUTSIDE_GUEST_MODE;
d94e1dc9 10695 smp_wmb();
a547c6db 10696
b5274b1b
KT
10697 /*
10698 * Sync xfd before calling handle_exit_irqoff() which may
10699 * rely on the fact that guest_fpu::xfd is up-to-date (e.g.
10700 * in #NM irqoff handler).
10701 */
10702 if (vcpu->arch.xfd_no_write_intercept)
10703 fpu_sync_guest_vmexit_xfd_state();
10704
b3646477 10705 static_call(kvm_x86_handle_exit_irqoff)(vcpu);
b6c7a5dc 10706
ec5be88a
JL
10707 if (vcpu->arch.guest_fpu.xfd_err)
10708 wrmsrl(MSR_IA32_XFD_ERR, 0);
10709
d7a08882
SC
10710 /*
10711 * Consume any pending interrupts, including the possible source of
10712 * VM-Exit on SVM and any ticks that occur between VM-Exit and now.
10713 * An instruction is required after local_irq_enable() to fully unblock
10715 * interrupts on processors that implement an interrupt shadow; the
10715 * stat.exits increment will do nicely.
10716 */
db215756 10717 kvm_before_interrupt(vcpu, KVM_HANDLING_IRQ);
d7a08882 10718 local_irq_enable();
b6c7a5dc 10719 ++vcpu->stat.exits;
d7a08882
SC
10720 local_irq_disable();
10721 kvm_after_interrupt(vcpu);
b6c7a5dc 10722
16045714
WL
10723 /*
10724 * Wait until after servicing IRQs to account guest time so that any
10725 * ticks that occurred while running the guest are properly accounted
10726 * to the guest. Waiting until IRQs are enabled degrades the accuracy
10727 * of accounting via context tracking, but the loss of accuracy is
10728 * acceptable for all known use cases.
10729 */
b2d2af7e 10730 guest_timing_exit_irqoff();
16045714 10731
f2485b3e 10732 local_irq_enable();
b6c7a5dc
HB
10733 preempt_enable();
10734
2031f287 10735 kvm_vcpu_srcu_read_lock(vcpu);
3200f405 10736
b6c7a5dc
HB
10737 /*
10738 * Profile KVM exit RIPs:
10739 */
10740 if (unlikely(prof_on == KVM_PROFILING)) {
5fdbf976
MT
10741 unsigned long rip = kvm_rip_read(vcpu);
10742 profile_hit(KVM_PROFILING, (void *)rip);
b6c7a5dc
HB
10743 }
10744
cc578287
ZA
10745 if (unlikely(vcpu->arch.tsc_always_catchup))
10746 kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
298101da 10747
5cfb1d5a
MT
10748 if (vcpu->arch.apic_attention)
10749 kvm_lapic_sync_from_vapic(vcpu);
b93463aa 10750
b3646477 10751 r = static_call(kvm_x86_handle_exit)(vcpu, exit_fastpath);
d905c069
MT
10752 return r;
10753
10754cancel_injection:
8081ad06
SC
10755 if (req_immediate_exit)
10756 kvm_make_request(KVM_REQ_EVENT, vcpu);
b3646477 10757 static_call(kvm_x86_cancel_injection)(vcpu);
ae7a2a3f
MT
10758 if (unlikely(vcpu->arch.apic_attention))
10759 kvm_lapic_sync_from_vapic(vcpu);
d7690175
MT
10760out:
10761 return r;
10762}
b6c7a5dc 10763
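Much of vcpu_enter_guest() above is the consumer half of the vCPU request mechanism. A condensed sketch of the producer/consumer pattern (illustrative only, not code from x86.c; all helpers named here exist in KVM):

/*
 * Producer side, runnable from any context: raise a request and kick
 * the vCPU so it drops out of guest mode and services it.
 */
static void example_request_clock_update(struct kvm_vcpu *vcpu)
{
	kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
	kvm_vcpu_kick(vcpu);
}

/*
 * Consumer side, as seen in vcpu_enter_guest() above:
 *
 *	if (kvm_check_request(KVM_REQ_CLOCK_UPDATE, vcpu))
 *		kvm_guest_time_update(vcpu);
 */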
8d25b7be 10764/* Called within kvm->srcu read side. */
2031f287 10765static inline int vcpu_block(struct kvm_vcpu *vcpu)
362c698f 10766{
98c25ead
SC
10767 bool hv_timer;
10768
c3e8abf0 10769 if (!kvm_arch_vcpu_runnable(vcpu)) {
98c25ead
SC
10770 /*
10771 * Switch to the software timer before halt-polling/blocking as
10772 * the guest's timer may be a break event for the vCPU, and the
10773 * hypervisor timer runs only when the CPU is in guest mode.
10774 * Switch before halt-polling so that KVM recognizes an expired
10775 * timer before blocking.
10776 */
10777 hv_timer = kvm_lapic_hv_timer_in_use(vcpu);
10778 if (hv_timer)
10779 kvm_lapic_switch_to_sw_timer(vcpu);
10780
2031f287 10781 kvm_vcpu_srcu_read_unlock(vcpu);
cdafece4
SC
10782 if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED)
10783 kvm_vcpu_halt(vcpu);
10784 else
10785 kvm_vcpu_block(vcpu);
2031f287 10786 kvm_vcpu_srcu_read_lock(vcpu);
bf9f6ac8 10787
98c25ead
SC
10788 if (hv_timer)
10789 kvm_lapic_switch_to_hv_timer(vcpu);
10790
599275c0
PB
10791 /*
10792 * If the vCPU is not runnable, a signal or another host event
10793 * of some kind is pending; service it without changing the
10794 * vCPU's activity state.
10795 */
10796 if (!kvm_arch_vcpu_runnable(vcpu))
9c8fd1ba
PB
10797 return 1;
10798 }
362c698f 10799
26844fee
PB
10800 /*
10801 * Evaluate nested events before exiting the halted state. This allows
10802 * the halt state to be recorded properly in the VMCS12's activity
10803 * state field (AMD does not have a similar field and a VM-Exit always
10804 * causes a spurious wakeup from HLT).
10805 */
10806 if (is_guest_mode(vcpu)) {
10807 if (kvm_check_nested_events(vcpu) < 0)
10808 return 0;
10809 }
10810
4fe09bcf
JM
10811 if (kvm_apic_accept_events(vcpu) < 0)
10812 return 0;
362c698f
PB
10813 switch(vcpu->arch.mp_state) {
10814 case KVM_MP_STATE_HALTED:
647daca2 10815 case KVM_MP_STATE_AP_RESET_HOLD:
362c698f
PB
10816 vcpu->arch.pv.pv_unhalted = false;
10817 vcpu->arch.mp_state =
10818 KVM_MP_STATE_RUNNABLE;
df561f66 10819 fallthrough;
362c698f
PB
10820 case KVM_MP_STATE_RUNNABLE:
10821 vcpu->arch.apf.halted = false;
10822 break;
10823 case KVM_MP_STATE_INIT_RECEIVED:
10824 break;
10825 default:
22c6a0ef
PB
10826 WARN_ON_ONCE(1);
10827 break;
362c698f
PB
10828 }
10829 return 1;
10830}
09cec754 10831
5d9bc648
PB
10832static inline bool kvm_vcpu_running(struct kvm_vcpu *vcpu)
10833{
10834 return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE &&
10835 !vcpu->arch.apf.halted);
10836}
10837
8d25b7be 10838/* Called within kvm->srcu read side. */
362c698f 10839static int vcpu_run(struct kvm_vcpu *vcpu)
d7690175
MT
10840{
10841 int r;
10842
c595ceee 10843 vcpu->arch.l1tf_flush_l1d = true;
d7690175 10844
362c698f 10845 for (;;) {
6cd88243
PB
10846 /*
10847 * If another guest vCPU requests a PV TLB flush in the middle
10848 * of instruction emulation, the rest of the emulation could
10849 * use a stale page translation. Assume that any code after
10850 * this point can start executing an instruction.
10851 */
10852 vcpu->arch.at_instruction_boundary = false;
58f800d5 10853 if (kvm_vcpu_running(vcpu)) {
851ba692 10854 r = vcpu_enter_guest(vcpu);
bf9f6ac8 10855 } else {
2031f287 10856 r = vcpu_block(vcpu);
bf9f6ac8
FW
10857 }
10858
09cec754
GN
10859 if (r <= 0)
10860 break;
10861
084071d5 10862 kvm_clear_request(KVM_REQ_UNBLOCK, vcpu);
7caf9571
DW
10863 if (kvm_xen_has_pending_events(vcpu))
10864 kvm_xen_inject_pending_events(vcpu);
10865
09cec754
GN
10866 if (kvm_cpu_has_pending_timer(vcpu))
10867 kvm_inject_pending_timer_irqs(vcpu);
10868
782d422b
MG
10869 if (dm_request_for_irq_injection(vcpu) &&
10870 kvm_vcpu_ready_for_interrupt_injection(vcpu)) {
4ca7dd8c
PB
10871 r = 0;
10872 vcpu->run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
09cec754 10873 ++vcpu->stat.request_irq_exits;
362c698f 10874 break;
09cec754 10875 }
af585b92 10876
f3020b88 10877 if (__xfer_to_guest_mode_work_pending()) {
2031f287 10878 kvm_vcpu_srcu_read_unlock(vcpu);
72c3c0fe 10879 r = xfer_to_guest_mode_handle_work(vcpu);
2031f287 10880 kvm_vcpu_srcu_read_lock(vcpu);
72c3c0fe
TG
10881 if (r)
10882 return r;
d7690175 10883 }
b6c7a5dc
HB
10884 }
10885
b6c7a5dc
HB
10886 return r;
10887}
10888
716d51ab
GN
10889static inline int complete_emulated_io(struct kvm_vcpu *vcpu)
10890{
2d089356 10891 return kvm_emulate_instruction(vcpu, EMULTYPE_NO_DECODE);
716d51ab
GN
10892}
10893
10894static int complete_emulated_pio(struct kvm_vcpu *vcpu)
10895{
10896 BUG_ON(!vcpu->arch.pio.count);
10897
10898 return complete_emulated_io(vcpu);
10899}
10900
f78146b0
AK
10901/*
10902 * Implements the following, as a state machine:
10903 *
10904 * read:
10905 * for each fragment
87da7e66
XG
10906 * for each mmio piece in the fragment
10907 * write gpa, len
10908 * exit
10909 * copy data
f78146b0
AK
10910 * execute insn
10911 *
10912 * write:
10913 * for each fragment
87da7e66
XG
10914 * for each mmio piece in the fragment
10915 * write gpa, len
10916 * copy data
10917 * exit
f78146b0 10918 */
716d51ab 10919static int complete_emulated_mmio(struct kvm_vcpu *vcpu)
5287f194
AK
10920{
10921 struct kvm_run *run = vcpu->run;
f78146b0 10922 struct kvm_mmio_fragment *frag;
87da7e66 10923 unsigned len;
5287f194 10924
716d51ab 10925 BUG_ON(!vcpu->mmio_needed);
5287f194 10926
716d51ab 10927 /* Complete previous fragment */
87da7e66
XG
10928 frag = &vcpu->mmio_fragments[vcpu->mmio_cur_fragment];
10929 len = min(8u, frag->len);
716d51ab 10930 if (!vcpu->mmio_is_write)
87da7e66
XG
10931 memcpy(frag->data, run->mmio.data, len);
10932
10933 if (frag->len <= 8) {
10934 /* Switch to the next fragment. */
10935 frag++;
10936 vcpu->mmio_cur_fragment++;
10937 } else {
10938 /* Go forward to the next mmio piece. */
10939 frag->data += len;
10940 frag->gpa += len;
10941 frag->len -= len;
10942 }
10943
a08d3b3b 10944 if (vcpu->mmio_cur_fragment >= vcpu->mmio_nr_fragments) {
716d51ab 10945 vcpu->mmio_needed = 0;
0912c977
PB
10946
10947 /* FIXME: return into emulator if single-stepping. */
cef4dea0 10948 if (vcpu->mmio_is_write)
716d51ab
GN
10949 return 1;
10950 vcpu->mmio_read_completed = 1;
10951 return complete_emulated_io(vcpu);
10952 }
87da7e66 10953
716d51ab
GN
10954 run->exit_reason = KVM_EXIT_MMIO;
10955 run->mmio.phys_addr = frag->gpa;
10956 if (vcpu->mmio_is_write)
87da7e66
XG
10957 memcpy(run->mmio.data, frag->data, min(8u, frag->len));
10958 run->mmio.len = min(8u, frag->len);
716d51ab
GN
10959 run->mmio.is_write = vcpu->mmio_is_write;
10960 vcpu->arch.complete_userspace_io = complete_emulated_mmio;
10961 return 0;
5287f194
AK
10962}
10963
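On the userspace side, each KVM_EXIT_MMIO produced by the fragment state machine above is serviced by consuming or filling run->mmio before the next KVM_RUN. A minimal sketch, assuming run is the mmap()ed kvm_run structure of the vCPU and eliding the actual device model:

#include <linux/kvm.h>
#include <string.h>

/* Service one KVM_EXIT_MMIO; the next KVM_RUN completes the fragment. */
static void handle_mmio_exit(struct kvm_run *run)
{
	if (run->exit_reason != KVM_EXIT_MMIO)
		return;

	if (run->mmio.is_write) {
		/*
		 * Consume run->mmio.len bytes from run->mmio.data at
		 * run->mmio.phys_addr (device emulation elided).
		 */
	} else {
		/* Provide the value the guest will observe for the read. */
		memset(run->mmio.data, 0, run->mmio.len);
	}
}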
822f312d
SAS
10964/* Swap (qemu) user FPU context for the guest FPU context. */
10965static void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
10966{
e27bc044 10967 /* Exclude PKRU, it's restored separately immediately after VM-Exit. */
d69c1382 10968 fpu_swap_kvm_fpstate(&vcpu->arch.guest_fpu, true);
822f312d
SAS
10969 trace_kvm_fpu(1);
10970}
10971
10972/* When vcpu_run ends, restore user space FPU context. */
10973static void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
10974{
d69c1382 10975 fpu_swap_kvm_fpstate(&vcpu->arch.guest_fpu, false);
822f312d
SAS
10976 ++vcpu->stat.fpu_reload;
10977 trace_kvm_fpu(0);
10978}
10979
1b94f6f8 10980int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
b6c7a5dc 10981{
7709aba8 10982 struct kvm_queued_exception *ex = &vcpu->arch.exception;
1b94f6f8 10983 struct kvm_run *kvm_run = vcpu->run;
b6c7a5dc 10984 int r;
b6c7a5dc 10985
accb757d 10986 vcpu_load(vcpu);
20b7035c 10987 kvm_sigset_activate(vcpu);
15aad3be 10988 kvm_run->flags = 0;
5663d8f9
PX
10989 kvm_load_guest_fpu(vcpu);
10990
2031f287 10991 kvm_vcpu_srcu_read_lock(vcpu);
a4535290 10992 if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) {
2f173d26
JS
10993 if (kvm_run->immediate_exit) {
10994 r = -EINTR;
10995 goto out;
10996 }
98c25ead
SC
10997 /*
10998 * It should be impossible for the hypervisor timer to be in
10999 * use before KVM has ever run the vCPU.
11000 */
11001 WARN_ON_ONCE(kvm_lapic_hv_timer_in_use(vcpu));
8d25b7be 11002
2031f287 11003 kvm_vcpu_srcu_read_unlock(vcpu);
c91d4497 11004 kvm_vcpu_block(vcpu);
2031f287 11005 kvm_vcpu_srcu_read_lock(vcpu);
8d25b7be 11006
4fe09bcf
JM
11007 if (kvm_apic_accept_events(vcpu) < 0) {
11008 r = 0;
11009 goto out;
11010 }
ac9f6dc0 11011 r = -EAGAIN;
a0595000
JS
11012 if (signal_pending(current)) {
11013 r = -EINTR;
1b94f6f8 11014 kvm_run->exit_reason = KVM_EXIT_INTR;
a0595000
JS
11015 ++vcpu->stat.signal_exits;
11016 }
ac9f6dc0 11017 goto out;
b6c7a5dc
HB
11018 }
11019
e489a4a6
SC
11020 if ((kvm_run->kvm_valid_regs & ~KVM_SYNC_X86_VALID_FIELDS) ||
11021 (kvm_run->kvm_dirty_regs & ~KVM_SYNC_X86_VALID_FIELDS)) {
01643c51
KH
11022 r = -EINVAL;
11023 goto out;
11024 }
11025
1b94f6f8 11026 if (kvm_run->kvm_dirty_regs) {
01643c51
KH
11027 r = sync_regs(vcpu);
11028 if (r != 0)
11029 goto out;
11030 }
11031
b6c7a5dc 11032 /* re-sync apic's tpr */
35754c98 11033 if (!lapic_in_kernel(vcpu)) {
eea1cff9
AP
11034 if (kvm_set_cr8(vcpu, kvm_run->cr8) != 0) {
11035 r = -EINVAL;
11036 goto out;
11037 }
11038 }
b6c7a5dc 11039
7709aba8
SC
11040 /*
11041 * If userspace set a pending exception and L2 is active, convert it to
11042 * a pending VM-Exit if L1 wants to intercept the exception.
11043 */
11044 if (vcpu->arch.exception_from_userspace && is_guest_mode(vcpu) &&
11045 kvm_x86_ops.nested_ops->is_exception_vmexit(vcpu, ex->vector,
11046 ex->error_code)) {
11047 kvm_queue_exception_vmexit(vcpu, ex->vector,
11048 ex->has_error_code, ex->error_code,
11049 ex->has_payload, ex->payload);
11050 ex->injected = false;
11051 ex->pending = false;
11052 }
11053 vcpu->arch.exception_from_userspace = false;
11054
716d51ab
GN
11055 if (unlikely(vcpu->arch.complete_userspace_io)) {
11056 int (*cui)(struct kvm_vcpu *) = vcpu->arch.complete_userspace_io;
11057 vcpu->arch.complete_userspace_io = NULL;
11058 r = cui(vcpu);
11059 if (r <= 0)
5663d8f9 11060 goto out;
0bc27326
SC
11061 } else {
11062 WARN_ON_ONCE(vcpu->arch.pio.count);
11063 WARN_ON_ONCE(vcpu->mmio_needed);
11064 }
5287f194 11065
fc4fad79 11066 if (kvm_run->immediate_exit) {
460df4c1 11067 r = -EINTR;
fc4fad79
SC
11068 goto out;
11069 }
11070
11071 r = static_call(kvm_x86_vcpu_pre_run)(vcpu);
11072 if (r <= 0)
11073 goto out;
11074
11075 r = vcpu_run(vcpu);
b6c7a5dc
HB
11076
11077out:
5663d8f9 11078 kvm_put_guest_fpu(vcpu);
1b94f6f8 11079 if (kvm_run->kvm_valid_regs)
01643c51 11080 store_regs(vcpu);
f1d86e46 11081 post_kvm_run_save(vcpu);
2031f287 11082 kvm_vcpu_srcu_read_unlock(vcpu);
b6c7a5dc 11083
8d25b7be 11084 kvm_sigset_deactivate(vcpu);
accb757d 11085 vcpu_put(vcpu);
b6c7a5dc
HB
11086 return r;
11087}
11088
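Seen from userspace, kvm_arch_vcpu_ioctl_run() is driven by a plain KVM_RUN loop. A minimal sketch, assuming vcpu_fd and a mmap()ed run structure, with exit handling reduced to stubs:

#include <linux/kvm.h>
#include <sys/ioctl.h>
#include <errno.h>

static int run_loop(int vcpu_fd, struct kvm_run *run)
{
	for (;;) {
		if (ioctl(vcpu_fd, KVM_RUN, 0) < 0) {
			if (errno == EINTR)
				continue;	/* signal; just retry */
			return -errno;
		}

		switch (run->exit_reason) {
		case KVM_EXIT_IO:	/* emulate port I/O */	break;
		case KVM_EXIT_MMIO:	/* emulate MMIO */	break;
		case KVM_EXIT_HLT:	/* guest halted */	break;
		case KVM_EXIT_SHUTDOWN:
			return 0;
		default:
			break;
		}
	}
}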
01643c51 11089static void __get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
b6c7a5dc 11090{
7ae441ea
GN
11091 if (vcpu->arch.emulate_regs_need_sync_to_vcpu) {
11092 /*
11093 * We are here if userspace calls get_regs() in the middle of
11094 * instruction emulation. Register state needs to be copied
4a969980 11095 * back from emulation context to vcpu. Userspace shouldn't do
7ae441ea
GN
11096 * that usually, but some badly designed PV devices (vmware
11097 * backdoor interface) need this to work
11098 */
c9b8b07c 11099 emulator_writeback_register_cache(vcpu->arch.emulate_ctxt);
7ae441ea
GN
11100 vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
11101 }
de3cd117
SC
11102 regs->rax = kvm_rax_read(vcpu);
11103 regs->rbx = kvm_rbx_read(vcpu);
11104 regs->rcx = kvm_rcx_read(vcpu);
11105 regs->rdx = kvm_rdx_read(vcpu);
11106 regs->rsi = kvm_rsi_read(vcpu);
11107 regs->rdi = kvm_rdi_read(vcpu);
e9c16c78 11108 regs->rsp = kvm_rsp_read(vcpu);
de3cd117 11109 regs->rbp = kvm_rbp_read(vcpu);
b6c7a5dc 11110#ifdef CONFIG_X86_64
de3cd117
SC
11111 regs->r8 = kvm_r8_read(vcpu);
11112 regs->r9 = kvm_r9_read(vcpu);
11113 regs->r10 = kvm_r10_read(vcpu);
11114 regs->r11 = kvm_r11_read(vcpu);
11115 regs->r12 = kvm_r12_read(vcpu);
11116 regs->r13 = kvm_r13_read(vcpu);
11117 regs->r14 = kvm_r14_read(vcpu);
11118 regs->r15 = kvm_r15_read(vcpu);
b6c7a5dc
HB
11119#endif
11120
5fdbf976 11121 regs->rip = kvm_rip_read(vcpu);
91586a3b 11122 regs->rflags = kvm_get_rflags(vcpu);
01643c51 11123}
b6c7a5dc 11124
01643c51
KH
11125int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
11126{
11127 vcpu_load(vcpu);
11128 __get_regs(vcpu, regs);
1fc9b76b 11129 vcpu_put(vcpu);
b6c7a5dc
HB
11130 return 0;
11131}
11132
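The accessors above back the KVM_GET_REGS/KVM_SET_REGS ioctls. A minimal userspace sketch (vcpu_fd assumed) that reads and prints the guest RIP:

#include <linux/kvm.h>
#include <sys/ioctl.h>
#include <stdio.h>

static void print_guest_rip(int vcpu_fd)
{
	struct kvm_regs regs;

	if (ioctl(vcpu_fd, KVM_GET_REGS, &regs) == 0)
		printf("guest rip = 0x%llx\n",
		       (unsigned long long)regs.rip);
}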
01643c51 11133static void __set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
b6c7a5dc 11134{
7ae441ea
GN
11135 vcpu->arch.emulate_regs_need_sync_from_vcpu = true;
11136 vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
11137
de3cd117
SC
11138 kvm_rax_write(vcpu, regs->rax);
11139 kvm_rbx_write(vcpu, regs->rbx);
11140 kvm_rcx_write(vcpu, regs->rcx);
11141 kvm_rdx_write(vcpu, regs->rdx);
11142 kvm_rsi_write(vcpu, regs->rsi);
11143 kvm_rdi_write(vcpu, regs->rdi);
e9c16c78 11144 kvm_rsp_write(vcpu, regs->rsp);
de3cd117 11145 kvm_rbp_write(vcpu, regs->rbp);
b6c7a5dc 11146#ifdef CONFIG_X86_64
de3cd117
SC
11147 kvm_r8_write(vcpu, regs->r8);
11148 kvm_r9_write(vcpu, regs->r9);
11149 kvm_r10_write(vcpu, regs->r10);
11150 kvm_r11_write(vcpu, regs->r11);
11151 kvm_r12_write(vcpu, regs->r12);
11152 kvm_r13_write(vcpu, regs->r13);
11153 kvm_r14_write(vcpu, regs->r14);
11154 kvm_r15_write(vcpu, regs->r15);
b6c7a5dc
HB
11155#endif
11156
5fdbf976 11157 kvm_rip_write(vcpu, regs->rip);
d73235d1 11158 kvm_set_rflags(vcpu, regs->rflags | X86_EFLAGS_FIXED);
b6c7a5dc 11159
b4f14abd 11160 vcpu->arch.exception.pending = false;
7709aba8 11161 vcpu->arch.exception_vmexit.pending = false;
b4f14abd 11162
3842d135 11163 kvm_make_request(KVM_REQ_EVENT, vcpu);
01643c51 11164}
3842d135 11165
01643c51
KH
11166int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
11167{
11168 vcpu_load(vcpu);
11169 __set_regs(vcpu, regs);
875656fe 11170 vcpu_put(vcpu);
b6c7a5dc
HB
11171 return 0;
11172}
11173
6dba9403 11174static void __get_sregs_common(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
b6c7a5dc 11175{
89a27f4d 11176 struct desc_ptr dt;
b6c7a5dc 11177
5265713a
TL
11178 if (vcpu->arch.guest_state_protected)
11179 goto skip_protected_regs;
11180
3e6e0aab
GT
11181 kvm_get_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
11182 kvm_get_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
11183 kvm_get_segment(vcpu, &sregs->es, VCPU_SREG_ES);
11184 kvm_get_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
11185 kvm_get_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
11186 kvm_get_segment(vcpu, &sregs->ss, VCPU_SREG_SS);
b6c7a5dc 11187
3e6e0aab
GT
11188 kvm_get_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
11189 kvm_get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
b6c7a5dc 11190
b3646477 11191 static_call(kvm_x86_get_idt)(vcpu, &dt);
89a27f4d
GN
11192 sregs->idt.limit = dt.size;
11193 sregs->idt.base = dt.address;
b3646477 11194 static_call(kvm_x86_get_gdt)(vcpu, &dt);
89a27f4d
GN
11195 sregs->gdt.limit = dt.size;
11196 sregs->gdt.base = dt.address;
b6c7a5dc 11197
ad312c7c 11198 sregs->cr2 = vcpu->arch.cr2;
9f8fe504 11199 sregs->cr3 = kvm_read_cr3(vcpu);
5265713a
TL
11200
11201skip_protected_regs:
11202 sregs->cr0 = kvm_read_cr0(vcpu);
fc78f519 11203 sregs->cr4 = kvm_read_cr4(vcpu);
2d3ad1f4 11204 sregs->cr8 = kvm_get_cr8(vcpu);
f6801dff 11205 sregs->efer = vcpu->arch.efer;
b6c7a5dc 11206 sregs->apic_base = kvm_get_apic_base(vcpu);
6dba9403 11207}
b6c7a5dc 11208
6dba9403
ML
11209static void __get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
11210{
11211 __get_sregs_common(vcpu, sregs);
11212
11213 if (vcpu->arch.guest_state_protected)
11214 return;
b6c7a5dc 11215
04140b41 11216 if (vcpu->arch.interrupt.injected && !vcpu->arch.interrupt.soft)
14d0bc1f
GN
11217 set_bit(vcpu->arch.interrupt.nr,
11218 (unsigned long *)sregs->interrupt_bitmap);
01643c51 11219}
16d7a191 11220
6dba9403
ML
11221static void __get_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2)
11222{
11223 int i;
11224
11225 __get_sregs_common(vcpu, (struct kvm_sregs *)sregs2);
11226
11227 if (vcpu->arch.guest_state_protected)
11228 return;
11229
11230 if (is_pae_paging(vcpu)) {
11231 for (i = 0 ; i < 4 ; i++)
11232 sregs2->pdptrs[i] = kvm_pdptr_read(vcpu, i);
11233 sregs2->flags |= KVM_SREGS2_FLAGS_PDPTRS_VALID;
11234 }
11235}
11236
01643c51
KH
11237int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
11238 struct kvm_sregs *sregs)
11239{
11240 vcpu_load(vcpu);
11241 __get_sregs(vcpu, sregs);
bcdec41c 11242 vcpu_put(vcpu);
b6c7a5dc
HB
11243 return 0;
11244}
11245
62d9f0db
MT
11246int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
11247 struct kvm_mp_state *mp_state)
11248{
4fe09bcf
JM
11249 int r;
11250
fd232561 11251 vcpu_load(vcpu);
f958bd23
SC
11252 if (kvm_mpx_supported())
11253 kvm_load_guest_fpu(vcpu);
fd232561 11254
4fe09bcf
JM
11255 r = kvm_apic_accept_events(vcpu);
11256 if (r < 0)
11257 goto out;
11258 r = 0;
11259
647daca2
TL
11260 if ((vcpu->arch.mp_state == KVM_MP_STATE_HALTED ||
11261 vcpu->arch.mp_state == KVM_MP_STATE_AP_RESET_HOLD) &&
11262 vcpu->arch.pv.pv_unhalted)
6aef266c
SV
11263 mp_state->mp_state = KVM_MP_STATE_RUNNABLE;
11264 else
11265 mp_state->mp_state = vcpu->arch.mp_state;
11266
4fe09bcf 11267out:
f958bd23
SC
11268 if (kvm_mpx_supported())
11269 kvm_put_guest_fpu(vcpu);
fd232561 11270 vcpu_put(vcpu);
4fe09bcf 11271 return r;
62d9f0db
MT
11272}
11273
11274int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
11275 struct kvm_mp_state *mp_state)
11276{
e83dff5e
CD
11277 int ret = -EINVAL;
11278
11279 vcpu_load(vcpu);
11280
22c6a0ef
PB
11281 switch (mp_state->mp_state) {
11282 case KVM_MP_STATE_UNINITIALIZED:
11283 case KVM_MP_STATE_HALTED:
11284 case KVM_MP_STATE_AP_RESET_HOLD:
11285 case KVM_MP_STATE_INIT_RECEIVED:
11286 case KVM_MP_STATE_SIPI_RECEIVED:
11287 if (!lapic_in_kernel(vcpu))
11288 goto out;
11289 break;
11290
11291 case KVM_MP_STATE_RUNNABLE:
11292 break;
11293
11294 default:
e83dff5e 11295 goto out;
22c6a0ef 11296 }
66450a21 11297
27cbe7d6 11298 /*
1b7a1b78
SC
11299 * Pending INITs are reported using KVM_SET_VCPU_EVENTS, disallow
11300 * forcing the guest into INIT/SIPI if those events are supposed to be
11301 * blocked. KVM prioritizes SMI over INIT, so reject INIT/SIPI state
11302 * if an SMI is pending as well.
27cbe7d6 11303 */
1b7a1b78 11304 if ((!kvm_apic_init_sipi_allowed(vcpu) || vcpu->arch.smi_pending) &&
28bf2888
DH
11305 (mp_state->mp_state == KVM_MP_STATE_SIPI_RECEIVED ||
11306 mp_state->mp_state == KVM_MP_STATE_INIT_RECEIVED))
e83dff5e 11307 goto out;
28bf2888 11308
66450a21
JK
11309 if (mp_state->mp_state == KVM_MP_STATE_SIPI_RECEIVED) {
11310 vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED;
11311 set_bit(KVM_APIC_SIPI, &vcpu->arch.apic->pending_events);
11312 } else
11313 vcpu->arch.mp_state = mp_state->mp_state;
3842d135 11314 kvm_make_request(KVM_REQ_EVENT, vcpu);
e83dff5e
CD
11315
11316 ret = 0;
11317out:
11318 vcpu_put(vcpu);
11319 return ret;
62d9f0db
MT
11320}
11321
7f3d35fd
KW
11322int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int idt_index,
11323 int reason, bool has_error_code, u32 error_code)
b6c7a5dc 11324{
c9b8b07c 11325 struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
8ec4722d 11326 int ret;
e01c2426 11327
8ec4722d 11328 init_emulate_ctxt(vcpu);
c697518a 11329
7f3d35fd 11330 ret = emulator_task_switch(ctxt, tss_selector, idt_index, reason,
9d74191a 11331 has_error_code, error_code);
1051778f
SC
11332 if (ret) {
11333 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
11334 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
11335 vcpu->run->internal.ndata = 0;
60fc3d02 11336 return 0;
1051778f 11337 }
37817f29 11338
9d74191a
TY
11339 kvm_rip_write(vcpu, ctxt->eip);
11340 kvm_set_rflags(vcpu, ctxt->eflags);
60fc3d02 11341 return 1;
37817f29
IE
11342}
11343EXPORT_SYMBOL_GPL(kvm_task_switch);
11344
ee69c92b 11345static bool kvm_is_valid_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
f2981033 11346{
37b95951 11347 if ((sregs->efer & EFER_LME) && (sregs->cr0 & X86_CR0_PG)) {
f2981033
LT
11348 /*
11349 * When EFER.LME and CR0.PG are set, the processor is in
11350 * 64-bit mode (though maybe in a 32-bit code segment).
11351 * CR4.PAE and EFER.LMA must be set.
11352 */
ee69c92b
SC
11353 if (!(sregs->cr4 & X86_CR4_PAE) || !(sregs->efer & EFER_LMA))
11354 return false;
ca29e145 11355 if (kvm_vcpu_is_illegal_gpa(vcpu, sregs->cr3))
c1c35cf7 11356 return false;
f2981033
LT
11357 } else {
11358 /*
11359 * Not in 64-bit mode: EFER.LMA is clear and the code
11360 * segment cannot be 64-bit.
11361 */
11362 if (sregs->efer & EFER_LMA || sregs->cs.l)
ee69c92b 11363 return false;
f2981033
LT
11364 }
11365
ee69c92b 11366 return kvm_is_valid_cr4(vcpu, sregs->cr4);
f2981033
LT
11367}
11368
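The long-mode consistency rule enforced above can be restated as a single predicate; a sketch of the equivalent check (illustrative only, mirroring kvm_is_valid_sregs() minus the CR3 and CR4 validation):

/* Illustrative restatement of the EFER/CR0/CR4/CS consistency rule. */
static inline bool sregs_long_mode_consistent(const struct kvm_sregs *s)
{
	if ((s->efer & EFER_LME) && (s->cr0 & X86_CR0_PG))
		/* 64-bit mode: PAE and LMA must both be set. */
		return (s->cr4 & X86_CR4_PAE) && (s->efer & EFER_LMA);

	/* Not 64-bit mode: LMA must be clear and CS cannot be 64-bit. */
	return !(s->efer & EFER_LMA) && !s->cs.l;
}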
6dba9403
ML
11369static int __set_sregs_common(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs,
11370 int *mmu_reset_needed, bool update_pdptrs)
b6c7a5dc 11371{
58cb628d 11372 struct msr_data apic_base_msr;
6dba9403 11373 int idx;
89a27f4d 11374 struct desc_ptr dt;
b4ef9d4e 11375
ee69c92b 11376 if (!kvm_is_valid_sregs(vcpu, sregs))
6dba9403 11377 return -EINVAL;
f2981033 11378
d3802286
JM
11379 apic_base_msr.data = sregs->apic_base;
11380 apic_base_msr.host_initiated = true;
11381 if (kvm_set_apic_base(vcpu, &apic_base_msr))
6dba9403 11382 return -EINVAL;
6d1068b3 11383
5265713a 11384 if (vcpu->arch.guest_state_protected)
6dba9403 11385 return 0;
5265713a 11386
89a27f4d
GN
11387 dt.size = sregs->idt.limit;
11388 dt.address = sregs->idt.base;
b3646477 11389 static_call(kvm_x86_set_idt)(vcpu, &dt);
89a27f4d
GN
11390 dt.size = sregs->gdt.limit;
11391 dt.address = sregs->gdt.base;
b3646477 11392 static_call(kvm_x86_set_gdt)(vcpu, &dt);
b6c7a5dc 11393
ad312c7c 11394 vcpu->arch.cr2 = sregs->cr2;
6dba9403 11395 *mmu_reset_needed |= kvm_read_cr3(vcpu) != sregs->cr3;
dc7e795e 11396 vcpu->arch.cr3 = sregs->cr3;
3883bc9d 11397 kvm_register_mark_dirty(vcpu, VCPU_EXREG_CR3);
405329fc 11398 static_call_cond(kvm_x86_post_set_cr3)(vcpu, sregs->cr3);
b6c7a5dc 11399
2d3ad1f4 11400 kvm_set_cr8(vcpu, sregs->cr8);
b6c7a5dc 11401
6dba9403 11402 *mmu_reset_needed |= vcpu->arch.efer != sregs->efer;
b3646477 11403 static_call(kvm_x86_set_efer)(vcpu, sregs->efer);
b6c7a5dc 11404
6dba9403 11405 *mmu_reset_needed |= kvm_read_cr0(vcpu) != sregs->cr0;
b3646477 11406 static_call(kvm_x86_set_cr0)(vcpu, sregs->cr0);
d7306163 11407 vcpu->arch.cr0 = sregs->cr0;
b6c7a5dc 11408
6dba9403 11409 *mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4;
b3646477 11410 static_call(kvm_x86_set_cr4)(vcpu, sregs->cr4);
63f42e02 11411
6dba9403
ML
11412 if (update_pdptrs) {
11413 idx = srcu_read_lock(&vcpu->kvm->srcu);
11414 if (is_pae_paging(vcpu)) {
2df4a5eb 11415 load_pdptrs(vcpu, kvm_read_cr3(vcpu));
6dba9403
ML
11416 *mmu_reset_needed = 1;
11417 }
11418 srcu_read_unlock(&vcpu->kvm->srcu, idx);
7c93be44 11419 }
b6c7a5dc 11420
3e6e0aab
GT
11421 kvm_set_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
11422 kvm_set_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
11423 kvm_set_segment(vcpu, &sregs->es, VCPU_SREG_ES);
11424 kvm_set_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
11425 kvm_set_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
11426 kvm_set_segment(vcpu, &sregs->ss, VCPU_SREG_SS);
b6c7a5dc 11427
3e6e0aab
GT
11428 kvm_set_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
11429 kvm_set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
b6c7a5dc 11430
5f0269f5
ME
11431 update_cr8_intercept(vcpu);
11432
9c3e4aab 11433 /* Older userspace won't unhalt the vcpu on reset. */
c5af89b6 11434 if (kvm_vcpu_is_bsp(vcpu) && kvm_rip_read(vcpu) == 0xfff0 &&
9c3e4aab 11435 sregs->cs.selector == 0xf000 && sregs->cs.base == 0xffff0000 &&
3eeb3288 11436 !is_protmode(vcpu))
9c3e4aab
MT
11437 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
11438
6dba9403
ML
11439 return 0;
11440}
11441
11442static int __set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
11443{
11444 int pending_vec, max_bits;
11445 int mmu_reset_needed = 0;
11446 int ret = __set_sregs_common(vcpu, sregs, &mmu_reset_needed, true);
11447
11448 if (ret)
11449 return ret;
11450
11451 if (mmu_reset_needed)
11452 kvm_mmu_reset_context(vcpu);
11453
5265713a
TL
11454 max_bits = KVM_NR_INTERRUPTS;
11455 pending_vec = find_first_bit(
11456 (const unsigned long *)sregs->interrupt_bitmap, max_bits);
6dba9403 11457
5265713a
TL
11458 if (pending_vec < max_bits) {
11459 kvm_queue_interrupt(vcpu, pending_vec, false);
11460 pr_debug("Set back pending irq %d\n", pending_vec);
6dba9403 11461 kvm_make_request(KVM_REQ_EVENT, vcpu);
5265713a 11462 }
6dba9403
ML
11463 return 0;
11464}
5265713a 11465
6dba9403
ML
11466static int __set_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2)
11467{
11468 int mmu_reset_needed = 0;
11469 bool valid_pdptrs = sregs2->flags & KVM_SREGS2_FLAGS_PDPTRS_VALID;
11470 bool pae = (sregs2->cr0 & X86_CR0_PG) && (sregs2->cr4 & X86_CR4_PAE) &&
11471 !(sregs2->efer & EFER_LMA);
11472 int i, ret;
3842d135 11473
6dba9403
ML
11474 if (sregs2->flags & ~KVM_SREGS2_FLAGS_PDPTRS_VALID)
11475 return -EINVAL;
11476
11477 if (valid_pdptrs && (!pae || vcpu->arch.guest_state_protected))
11478 return -EINVAL;
11479
11480 ret = __set_sregs_common(vcpu, (struct kvm_sregs *)sregs2,
11481 &mmu_reset_needed, !valid_pdptrs);
11482 if (ret)
11483 return ret;
11484
11485 if (valid_pdptrs) {
11486 for (i = 0; i < 4 ; i++)
11487 kvm_pdptr_write(vcpu, i, sregs2->pdptrs[i]);
11488
11489 kvm_register_mark_dirty(vcpu, VCPU_EXREG_PDPTR);
11490 mmu_reset_needed = 1;
158a48ec 11491 vcpu->arch.pdptrs_from_userspace = true;
6dba9403
ML
11492 }
11493 if (mmu_reset_needed)
11494 kvm_mmu_reset_context(vcpu);
11495 return 0;
01643c51
KH
11496}
11497
11498int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
11499 struct kvm_sregs *sregs)
11500{
11501 int ret;
11502
11503 vcpu_load(vcpu);
11504 ret = __set_sregs(vcpu, sregs);
b4ef9d4e
CD
11505 vcpu_put(vcpu);
11506 return ret;
b6c7a5dc
HB
11507}
11508
cae72dcc
ML
11509static void kvm_arch_vcpu_guestdbg_update_apicv_inhibit(struct kvm *kvm)
11510{
320af55a 11511 bool set = false;
cae72dcc 11512 struct kvm_vcpu *vcpu;
46808a4c 11513 unsigned long i;
cae72dcc 11514
0047fb33
SC
11515 if (!enable_apicv)
11516 return;
11517
cae72dcc
ML
11518 down_write(&kvm->arch.apicv_update_lock);
11519
11520 kvm_for_each_vcpu(i, vcpu, kvm) {
11521 if (vcpu->guest_debug & KVM_GUESTDBG_BLOCKIRQ) {
320af55a 11522 set = true;
cae72dcc
ML
11523 break;
11524 }
11525 }
320af55a 11526 __kvm_set_or_clear_apicv_inhibit(kvm, APICV_INHIBIT_REASON_BLOCKIRQ, set);
cae72dcc
ML
11527 up_write(&kvm->arch.apicv_update_lock);
11528}
11529
d0bfb940
JK
11530int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
11531 struct kvm_guest_debug *dbg)
b6c7a5dc 11532{
355be0b9 11533 unsigned long rflags;
ae675ef0 11534 int i, r;
b6c7a5dc 11535
8d4846b9
TL
11536 if (vcpu->arch.guest_state_protected)
11537 return -EINVAL;
11538
66b56562
CD
11539 vcpu_load(vcpu);
11540
4f926bf2
JK
11541 if (dbg->control & (KVM_GUESTDBG_INJECT_DB | KVM_GUESTDBG_INJECT_BP)) {
11542 r = -EBUSY;
7709aba8 11543 if (kvm_is_exception_pending(vcpu))
2122ff5e 11544 goto out;
4f926bf2
JK
11545 if (dbg->control & KVM_GUESTDBG_INJECT_DB)
11546 kvm_queue_exception(vcpu, DB_VECTOR);
11547 else
11548 kvm_queue_exception(vcpu, BP_VECTOR);
11549 }
11550
91586a3b
JK
11551 /*
11552 * Read rflags as long as potentially injected trace flags are still
11553 * filtered out.
11554 */
11555 rflags = kvm_get_rflags(vcpu);
355be0b9
JK
11556
11557 vcpu->guest_debug = dbg->control;
11558 if (!(vcpu->guest_debug & KVM_GUESTDBG_ENABLE))
11559 vcpu->guest_debug = 0;
11560
11561 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) {
ae675ef0
JK
11562 for (i = 0; i < KVM_NR_DB_REGS; ++i)
11563 vcpu->arch.eff_db[i] = dbg->arch.debugreg[i];
c8639010 11564 vcpu->arch.guest_debug_dr7 = dbg->arch.debugreg[7];
ae675ef0
JK
11565 } else {
11566 for (i = 0; i < KVM_NR_DB_REGS; i++)
11567 vcpu->arch.eff_db[i] = vcpu->arch.db[i];
ae675ef0 11568 }
c8639010 11569 kvm_update_dr7(vcpu);
ae675ef0 11570
f92653ee 11571 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
e87e46d5 11572 vcpu->arch.singlestep_rip = kvm_get_linear_rip(vcpu);
94fe45da 11573
91586a3b
JK
11574 /*
11575 * Trigger an rflags update that will inject or remove the trace
11576 * flags.
11577 */
11578 kvm_set_rflags(vcpu, rflags);
b6c7a5dc 11579
b3646477 11580 static_call(kvm_x86_update_exception_bitmap)(vcpu);
b6c7a5dc 11581
cae72dcc
ML
11582 kvm_arch_vcpu_guestdbg_update_apicv_inhibit(vcpu->kvm);
11583
4f926bf2 11584 r = 0;
d0bfb940 11585
2122ff5e 11586out:
66b56562 11587 vcpu_put(vcpu);
b6c7a5dc
HB
11588 return r;
11589}
11590
8b006791
ZX
11591/*
11592 * Translate a guest virtual address to a guest physical address.
11593 */
11594int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
11595 struct kvm_translation *tr)
11596{
11597 unsigned long vaddr = tr->linear_address;
11598 gpa_t gpa;
f656ce01 11599 int idx;
8b006791 11600
1da5b61d
CD
11601 vcpu_load(vcpu);
11602
f656ce01 11603 idx = srcu_read_lock(&vcpu->kvm->srcu);
1871c602 11604 gpa = kvm_mmu_gva_to_gpa_system(vcpu, vaddr, NULL);
f656ce01 11605 srcu_read_unlock(&vcpu->kvm->srcu, idx);
8b006791 11606 tr->physical_address = gpa;
6e1d2a3f 11607 tr->valid = gpa != INVALID_GPA;
8b006791
ZX
11608 tr->writeable = 1;
11609 tr->usermode = 0;
8b006791 11610
1da5b61d 11611 vcpu_put(vcpu);
8b006791
ZX
11612 return 0;
11613}
11614
d0752060
HB
11615int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
11616{
1393123e 11617 struct fxregs_state *fxsave;
d0752060 11618
d69c1382 11619 if (fpstate_is_confidential(&vcpu->arch.guest_fpu))
ed02b213
TL
11620 return 0;
11621
1393123e 11622 vcpu_load(vcpu);
d0752060 11623
d69c1382 11624 fxsave = &vcpu->arch.guest_fpu.fpstate->regs.fxsave;
d0752060
HB
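 /*
 * st_space holds the eight x87/MMX registers padded to 16 bytes each,
 * hence the fixed 128-byte copy below.
 */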
11625 memcpy(fpu->fpr, fxsave->st_space, 128);
11626 fpu->fcw = fxsave->cwd;
11627 fpu->fsw = fxsave->swd;
11628 fpu->ftwx = fxsave->twd;
11629 fpu->last_opcode = fxsave->fop;
11630 fpu->last_ip = fxsave->rip;
11631 fpu->last_dp = fxsave->rdp;
0e96f31e 11632 memcpy(fpu->xmm, fxsave->xmm_space, sizeof(fxsave->xmm_space));
d0752060 11633
1393123e 11634 vcpu_put(vcpu);
d0752060
HB
11635 return 0;
11636}
11637
11638int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
11639{
6a96bc7f
CD
11640 struct fxregs_state *fxsave;
11641
d69c1382 11642 if (fpstate_is_confidential(&vcpu->arch.guest_fpu))
ed02b213
TL
11643 return 0;
11644
6a96bc7f
CD
11645 vcpu_load(vcpu);
11646
d69c1382 11647 fxsave = &vcpu->arch.guest_fpu.fpstate->regs.fxsave;
d0752060 11648
d0752060
HB
11649 memcpy(fxsave->st_space, fpu->fpr, 128);
11650 fxsave->cwd = fpu->fcw;
11651 fxsave->swd = fpu->fsw;
11652 fxsave->twd = fpu->ftwx;
11653 fxsave->fop = fpu->last_opcode;
11654 fxsave->rip = fpu->last_ip;
11655 fxsave->rdp = fpu->last_dp;
0e96f31e 11656 memcpy(fxsave->xmm_space, fpu->xmm, sizeof(fxsave->xmm_space));
d0752060 11657
6a96bc7f 11658 vcpu_put(vcpu);
d0752060
HB
11659 return 0;
11660}
11661
01643c51
KH
11662static void store_regs(struct kvm_vcpu *vcpu)
11663{
11664 BUILD_BUG_ON(sizeof(struct kvm_sync_regs) > SYNC_REGS_SIZE_BYTES);
11665
11666 if (vcpu->run->kvm_valid_regs & KVM_SYNC_X86_REGS)
11667 __get_regs(vcpu, &vcpu->run->s.regs.regs);
11668
11669 if (vcpu->run->kvm_valid_regs & KVM_SYNC_X86_SREGS)
11670 __get_sregs(vcpu, &vcpu->run->s.regs.sregs);
11671
11672 if (vcpu->run->kvm_valid_regs & KVM_SYNC_X86_EVENTS)
11673 kvm_vcpu_ioctl_x86_get_vcpu_events(
11674 vcpu, &vcpu->run->s.regs.events);
11675}
11676
11677static int sync_regs(struct kvm_vcpu *vcpu)
11678{
01643c51
KH
11679 if (vcpu->run->kvm_dirty_regs & KVM_SYNC_X86_REGS) {
11680 __set_regs(vcpu, &vcpu->run->s.regs.regs);
11681 vcpu->run->kvm_dirty_regs &= ~KVM_SYNC_X86_REGS;
11682 }
11683 if (vcpu->run->kvm_dirty_regs & KVM_SYNC_X86_SREGS) {
11684 if (__set_sregs(vcpu, &vcpu->run->s.regs.sregs))
11685 return -EINVAL;
11686 vcpu->run->kvm_dirty_regs &= ~KVM_SYNC_X86_SREGS;
11687 }
11688 if (vcpu->run->kvm_dirty_regs & KVM_SYNC_X86_EVENTS) {
11689 if (kvm_vcpu_ioctl_x86_set_vcpu_events(
11690 vcpu, &vcpu->run->s.regs.events))
11691 return -EINVAL;
11692 vcpu->run->kvm_dirty_regs &= ~KVM_SYNC_X86_EVENTS;
11693 }
11694
11695 return 0;
11696}
11697
897cc38e 11698int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id)
e9b11c17 11699{
1d5e740d 11700 if (kvm_check_tsc_unstable() && kvm->created_vcpus)
8d20bd63 11701 pr_warn_once("SMP vm created on host with unstable TSC; "
897cc38e 11702 "guest TSC will not be reliable\n");
7f1ea208 11703
35875316
ZG
11704 if (!kvm->arch.max_vcpu_ids)
11705 kvm->arch.max_vcpu_ids = KVM_MAX_VCPU_IDS;
11706
11707 if (id >= kvm->arch.max_vcpu_ids)
11708 return -EINVAL;
11709
d588bb9b 11710 return static_call(kvm_x86_vcpu_precreate)(kvm);
e9b11c17
ZX
11711}
11712
e529ef66 11713int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
e9b11c17 11714{
95a0d01e
SC
11715 struct page *page;
11716 int r;
c447e76b 11717
63f5a190 11718 vcpu->arch.last_vmentry_cpu = -1;
7117003f
SC
11719 vcpu->arch.regs_avail = ~0;
11720 vcpu->arch.regs_dirty = ~0;
63f5a190 11721
8c82a0b3 11722 kvm_gpc_init(&vcpu->arch.pv_time, vcpu->kvm, vcpu, KVM_HOST_USES_PFN);
52491a38 11723
95a0d01e
SC
11724 if (!irqchip_in_kernel(vcpu->kvm) || kvm_vcpu_is_reset_bsp(vcpu))
11725 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
11726 else
11727 vcpu->arch.mp_state = KVM_MP_STATE_UNINITIALIZED;
c447e76b 11728
95a0d01e
SC
11729 r = kvm_mmu_create(vcpu);
11730 if (r < 0)
11731 return r;
11732
11733 if (irqchip_in_kernel(vcpu->kvm)) {
95a0d01e
SC
11734 r = kvm_create_lapic(vcpu, lapic_timer_advance_ns);
11735 if (r < 0)
11736 goto fail_mmu_destroy;
423ecfea
SC
11737
11738 /*
11739 * Defer evaluating inhibits until the vCPU is first run, as
11740 * this vCPU will not get notified of any changes until this
11741 * vCPU is visible to other vCPUs (marked online and added to
11742 * the set of vCPUs). Opportunistically mark APICv active as
11743 * VMX in particular is highly unlikely to have inhibits.
11744 * Ignore the current per-VM APICv state so that vCPU creation
11745 * is guaranteed to run with a deterministic value, the request
11746 * will ensure the vCPU gets the correct state before VM-Entry.
11747 */
11748 if (enable_apicv) {
ce0a58f4 11749 vcpu->arch.apic->apicv_active = true;
423ecfea
SC
11750 kvm_make_request(KVM_REQ_APICV_UPDATE, vcpu);
11751 }
95a0d01e 11752 } else
6e4e3b4d 11753 static_branch_inc(&kvm_has_noapic_vcpu);
95a0d01e
SC
11754
11755 r = -ENOMEM;
11756
93bb59ca 11757 page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
95a0d01e
SC
11758 if (!page)
11759 goto fail_free_lapic;
11760 vcpu->arch.pio_data = page_address(page);
11761
087acc4e 11762 vcpu->arch.mce_banks = kcalloc(KVM_MAX_MCE_BANKS * 4, sizeof(u64),
95a0d01e 11763 GFP_KERNEL_ACCOUNT);
281b5278
JW
11764 vcpu->arch.mci_ctl2_banks = kcalloc(KVM_MAX_MCE_BANKS, sizeof(u64),
11765 GFP_KERNEL_ACCOUNT);
11766 if (!vcpu->arch.mce_banks || !vcpu->arch.mci_ctl2_banks)
3c0ba05c 11767 goto fail_free_mce_banks;
95a0d01e
SC
11768 vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS;
11769
11770 if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask,
11771 GFP_KERNEL_ACCOUNT))
11772 goto fail_free_mce_banks;
11773
c9b8b07c
SC
11774 if (!alloc_emulate_ctxt(vcpu))
11775 goto free_wbinvd_dirty_mask;
11776
d69c1382 11777 if (!fpu_alloc_guest_fpstate(&vcpu->arch.guest_fpu)) {
8d20bd63 11778 pr_err("failed to allocate vcpu's fpu\n");
c9b8b07c 11779 goto free_emulate_ctxt;
95a0d01e
SC
11780 }
11781
95a0d01e 11782 vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu);
a8ac864a 11783 vcpu->arch.reserved_gpa_bits = kvm_vcpu_reserved_gpa_bits_raw(vcpu);
95a0d01e
SC
11784
11785 vcpu->arch.pat = MSR_IA32_CR_PAT_DEFAULT;
11786
11787 kvm_async_pf_hash_reset(vcpu);
6c6f82be
SC
11788
11789 vcpu->arch.perf_capabilities = kvm_caps.supported_perf_cap;
95a0d01e
SC
11790 kvm_pmu_init(vcpu);
11791
11792 vcpu->arch.pending_external_vector = -1;
11793 vcpu->arch.preempted_in_kernel = false;
11794
3c86c0d3
VP
11795#if IS_ENABLED(CONFIG_HYPERV)
11796 vcpu->arch.hv_root_tdp = INVALID_PAGE;
11797#endif
11798
b3646477 11799 r = static_call(kvm_x86_vcpu_create)(vcpu);
95a0d01e
SC
11800 if (r)
11801 goto free_guest_fpu;
e9b11c17 11802
0cf9135b 11803 vcpu->arch.arch_capabilities = kvm_get_arch_capabilities();
e53d88af 11804 vcpu->arch.msr_platform_info = MSR_PLATFORM_INFO_CPUID_FAULT;
942c2490 11805 kvm_xen_init_vcpu(vcpu);
19efffa2 11806 kvm_vcpu_mtrr_init(vcpu);
ec7660cc 11807 vcpu_load(vcpu);
ffbb61d0 11808 kvm_set_tsc_khz(vcpu, vcpu->kvm->arch.default_tsc_khz);
d28bc9dd 11809 kvm_vcpu_reset(vcpu, false);
c9060662 11810 kvm_init_mmu(vcpu);
e9b11c17 11811 vcpu_put(vcpu);
ec7660cc 11812 return 0;
95a0d01e
SC
11813
11814free_guest_fpu:
d69c1382 11815 fpu_free_guest_fpstate(&vcpu->arch.guest_fpu);
c9b8b07c
SC
11816free_emulate_ctxt:
11817 kmem_cache_free(x86_emulator_cache, vcpu->arch.emulate_ctxt);
95a0d01e
SC
11818free_wbinvd_dirty_mask:
11819 free_cpumask_var(vcpu->arch.wbinvd_dirty_mask);
11820fail_free_mce_banks:
11821 kfree(vcpu->arch.mce_banks);
281b5278 11822 kfree(vcpu->arch.mci_ctl2_banks);
95a0d01e
SC
11823 free_page((unsigned long)vcpu->arch.pio_data);
11824fail_free_lapic:
11825 kvm_free_lapic(vcpu);
11826fail_mmu_destroy:
11827 kvm_mmu_destroy(vcpu);
11828 return r;
e9b11c17
ZX
11829}
11830
31928aa5 11831void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
42897d86 11832{
332967a3 11833 struct kvm *kvm = vcpu->kvm;
42897d86 11834
ec7660cc 11835 if (mutex_lock_killable(&vcpu->mutex))
31928aa5 11836 return;
ec7660cc 11837 vcpu_load(vcpu);
0c899c25 11838 kvm_synchronize_tsc(vcpu, 0);
42897d86 11839 vcpu_put(vcpu);
2d5ba19b
MT
11840
11841 /* poll control enabled by default */
11842 vcpu->arch.msr_kvm_poll_control = 1;
11843
ec7660cc 11844 mutex_unlock(&vcpu->mutex);
42897d86 11845
b34de572
WL
11846 if (kvmclock_periodic_sync && vcpu->vcpu_idx == 0)
11847 schedule_delayed_work(&kvm->arch.kvmclock_sync_work,
11848 KVMCLOCK_SYNC_PERIOD);
42897d86
MT
11849}
11850
d40ccc62 11851void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
e9b11c17 11852{
95a0d01e 11853 int idx;
344d9588 11854
50b143e1 11855 kvmclock_reset(vcpu);
e9b11c17 11856
b3646477 11857 static_call(kvm_x86_vcpu_free)(vcpu);
50b143e1 11858
c9b8b07c 11859 kmem_cache_free(x86_emulator_cache, vcpu->arch.emulate_ctxt);
50b143e1 11860 free_cpumask_var(vcpu->arch.wbinvd_dirty_mask);
d69c1382 11861 fpu_free_guest_fpstate(&vcpu->arch.guest_fpu);
95a0d01e 11862
a795cd43 11863 kvm_xen_destroy_vcpu(vcpu);
95a0d01e
SC
11864 kvm_hv_vcpu_uninit(vcpu);
11865 kvm_pmu_destroy(vcpu);
11866 kfree(vcpu->arch.mce_banks);
281b5278 11867 kfree(vcpu->arch.mci_ctl2_banks);
95a0d01e
SC
11868 kvm_free_lapic(vcpu);
11869 idx = srcu_read_lock(&vcpu->kvm->srcu);
11870 kvm_mmu_destroy(vcpu);
11871 srcu_read_unlock(&vcpu->kvm->srcu, idx);
11872 free_page((unsigned long)vcpu->arch.pio_data);
255cbecf 11873 kvfree(vcpu->arch.cpuid_entries);
95a0d01e 11874 if (!lapic_in_kernel(vcpu))
6e4e3b4d 11875 static_branch_dec(&kvm_has_noapic_vcpu);
e9b11c17
ZX
11876}
11877
d28bc9dd 11878void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
e9b11c17 11879{
25b97845 11880 struct kvm_cpuid_entry2 *cpuid_0x1;
0aa18375 11881 unsigned long old_cr0 = kvm_read_cr0(vcpu);
4c72ab5a 11882 unsigned long new_cr0;
0aa18375 11883
62dd57dd
SC
11884 /*
11885 * Several of the "set" flows, e.g. ->set_cr0(), read other registers
11886 * to handle side effects. RESET emulation hits those flows and relies
11887 * on emulated/virtualized registers, including those that are loaded
11888 * into hardware, to be zeroed at vCPU creation. Use CRs as a sentinel
11889 * to detect improper or missing initialization.
11890 */
11891 WARN_ON_ONCE(!init_event &&
11892 (old_cr0 || kvm_read_cr3(vcpu) || kvm_read_cr4(vcpu)));
0aa18375 11893
ed129ec9
ML
11894 /*
11895 * SVM doesn't unconditionally VM-Exit on INIT and SHUTDOWN, thus it's
11896 * possible to INIT the vCPU while L2 is active. Force the vCPU back
11897 * into L1 as EFER.SVME is cleared on INIT (along with all other EFER
11898 * bits), i.e. virtualization is disabled.
11899 */
11900 if (is_guest_mode(vcpu))
11901 kvm_leave_nested(vcpu);
11902
b7e31be3
RK
11903 kvm_lapic_reset(vcpu, init_event);
11904
ed129ec9 11905 WARN_ON_ONCE(is_guest_mode(vcpu) || is_smm(vcpu));
e69fab5d
PB
11906 vcpu->arch.hflags = 0;
11907
c43203ca 11908 vcpu->arch.smi_pending = 0;
52797bf9 11909 vcpu->arch.smi_count = 0;
7460fb4a
AK
11910 atomic_set(&vcpu->arch.nmi_queued, 0);
11911 vcpu->arch.nmi_pending = 0;
448fa4a9 11912 vcpu->arch.nmi_injected = false;
5f7552d4
NA
11913 kvm_clear_interrupt_queue(vcpu);
11914 kvm_clear_exception_queue(vcpu);
448fa4a9 11915
42dbaa5a 11916 memset(vcpu->arch.db, 0, sizeof(vcpu->arch.db));
ae561ede 11917 kvm_update_dr0123(vcpu);
9a3ecd5e 11918 vcpu->arch.dr6 = DR6_ACTIVE_LOW;
42dbaa5a 11919 vcpu->arch.dr7 = DR7_FIXED_1;
c8639010 11920 kvm_update_dr7(vcpu);
42dbaa5a 11921
1119022c
NA
11922 vcpu->arch.cr2 = 0;
11923
3842d135 11924 kvm_make_request(KVM_REQ_EVENT, vcpu);
2635b5c4
VK
11925 vcpu->arch.apf.msr_en_val = 0;
11926 vcpu->arch.apf.msr_int_val = 0;
c9aaa895 11927 vcpu->arch.st.msr_val = 0;
3842d135 11928
12f9a48f
GC
11929 kvmclock_reset(vcpu);
11930
af585b92
GN
11931 kvm_clear_async_pf_completion_queue(vcpu);
11932 kvm_async_pf_hash_reset(vcpu);
11933 vcpu->arch.apf.halted = false;
3842d135 11934
d69c1382
TG
11935 if (vcpu->arch.guest_fpu.fpstate && kvm_mpx_supported()) {
11936 struct fpstate *fpstate = vcpu->arch.guest_fpu.fpstate;
a554d207
WL
11937
11938 /*
a61353ac
SC
11939 * All paths that lead to INIT are required to load the guest's
11940 * FPU state (because most paths are buried in KVM_RUN).
a554d207 11941 */
f775b13e
RR
11942 if (init_event)
11943 kvm_put_guest_fpu(vcpu);
087df48c
TG
11944
11945 fpstate_clear_xstate_component(fpstate, XFEATURE_BNDREGS);
11946 fpstate_clear_xstate_component(fpstate, XFEATURE_BNDCSR);
11947
f775b13e
RR
11948 if (init_event)
11949 kvm_load_guest_fpu(vcpu);
a554d207
WL
11950 }
11951
64d60670 11952 if (!init_event) {
d28bc9dd 11953 kvm_pmu_reset(vcpu);
64d60670 11954 vcpu->arch.smbase = 0x30000;
db2336a8 11955
db2336a8 11956 vcpu->arch.msr_misc_features_enables = 0;
9fc22296
SC
11957 vcpu->arch.ia32_misc_enable_msr = MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL |
11958 MSR_IA32_MISC_ENABLE_BTS_UNAVAIL;
a554d207 11959
05a9e065
LX
11960 __kvm_set_xcr(vcpu, 0, XFEATURE_MASK_FP);
11961 __kvm_set_msr(vcpu, MSR_IA32_XSS, 0, true);
64d60670 11962 }
f5132b01 11963
ff8828c8 11964 /* All GPRs except RDX (handled below) are zeroed on RESET/INIT. */
66f7b72e 11965 memset(vcpu->arch.regs, 0, sizeof(vcpu->arch.regs));
ff8828c8 11966 kvm_register_mark_dirty(vcpu, VCPU_REGS_RSP);
66f7b72e 11967
49d8665c
SC
11968 /*
11969 * Fall back to KVM's default Family/Model/Stepping of 0x600 (P6/Athlon)
11970 * if no CPUID match is found. Note, it's impossible to get a match at
11971 * RESET since KVM emulates RESET before exposing the vCPU to userspace,
25b97845
SC
11972 * i.e. it's impossible for kvm_find_cpuid_entry() to find a valid entry
11973 * on RESET. But, go through the motions in case that's ever remedied.
49d8665c 11974 */
277ad7d5 11975 cpuid_0x1 = kvm_find_cpuid_entry(vcpu, 1);
25b97845 11976 kvm_rdx_write(vcpu, cpuid_0x1 ? cpuid_0x1->eax : 0x600);
49d8665c 11977
b3646477 11978 static_call(kvm_x86_vcpu_reset)(vcpu, init_event);
0aa18375 11979
f39e805e
SC
11980 kvm_set_rflags(vcpu, X86_EFLAGS_FIXED);
11981 kvm_rip_write(vcpu, 0xfff0);
11982
03a6e840
SC
11983 vcpu->arch.cr3 = 0;
11984 kvm_register_mark_dirty(vcpu, VCPU_EXREG_CR3);
11985
4c72ab5a
SC
11986 /*
11987 * CR0.CD/NW are set on RESET, preserved on INIT. Note, some versions
11988 * of Intel's SDM list CD/NW as being set on INIT, but they contradict
11989 * (or qualify) that with a footnote stating that CD/NW are preserved.
11990 */
11991 new_cr0 = X86_CR0_ET;
11992 if (init_event)
11993 new_cr0 |= (old_cr0 & (X86_CR0_NW | X86_CR0_CD));
11994 else
11995 new_cr0 |= X86_CR0_NW | X86_CR0_CD;
11996
11997 static_call(kvm_x86_set_cr0)(vcpu, new_cr0);
f39e805e
SC
11998 static_call(kvm_x86_set_cr4)(vcpu, 0);
11999 static_call(kvm_x86_set_efer)(vcpu, 0);
12000 static_call(kvm_x86_update_exception_bitmap)(vcpu);
12001
0aa18375 12002 /*
b5f61c03
PB
12003 * On the standard CR0/CR4/EFER modification paths, there are several
12004 * complex conditions determining whether the MMU has to be reset and/or
12005 * which PCIDs have to be flushed. However, CR0.WP and the paging-related
12006 * bits in CR4 and EFER are irrelevant if CR0.PG was '0'; and a reset+flush
12007 * is needed anyway if CR0.PG was '1' (which can only happen for INIT, as
12008 * CR0 will be '0' prior to RESET). So we only need to check CR0.PG here.
0aa18375 12009 */
b5f61c03
PB
12010 if (old_cr0 & X86_CR0_PG) {
12011 kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
0aa18375 12012 kvm_mmu_reset_context(vcpu);
b5f61c03 12013 }
df37ed38
SC
12014
12015 /*
12016 * Intel's SDM states that all TLB entries are flushed on INIT. AMD's
12017 * APM states the TLBs are untouched by INIT, but it also states that
12018 * the TLBs are flushed on "External initialization of the processor."
12019 * Flush the guest TLB regardless of vendor, there is no meaningful
12020 * benefit in relying on the guest to flush the TLB immediately after
12021 * INIT. A spurious TLB flush is benign and likely negligible from a
12022 * performance perspective.
12023 */
12024 if (init_event)
12025 kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
e9b11c17 12026}
265e4353 12027EXPORT_SYMBOL_GPL(kvm_vcpu_reset);
e9b11c17 12028
2b4a273b 12029void kvm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector)
66450a21
JK
12030{
12031 struct kvm_segment cs;
12032
12033 kvm_get_segment(vcpu, &cs, VCPU_SREG_CS);
12034 cs.selector = vector << 8;
12035 cs.base = vector << 12;
12036 kvm_set_segment(vcpu, &cs, VCPU_SREG_CS);
12037 kvm_rip_write(vcpu, 0);
e9b11c17 12038}
647daca2 12039EXPORT_SYMBOL_GPL(kvm_vcpu_deliver_sipi_vector);
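/*
 * Worked example: a SIPI with vector 0x9A yields CS.selector = 0x9a00,
 * CS.base = 0x9a000 and RIP = 0, so the AP starts executing in real mode
 * at physical address 0x9a000 (CS.base + RIP).
 */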
e9b11c17 12040
13a34e06 12041int kvm_arch_hardware_enable(void)
e9b11c17 12042{
ca84d1a2
ZA
12043 struct kvm *kvm;
12044 struct kvm_vcpu *vcpu;
46808a4c 12045 unsigned long i;
0dd6a6ed
ZA
12046 int ret;
12047 u64 local_tsc;
12048 u64 max_tsc = 0;
12049 bool stable, backwards_tsc = false;
18863bdd 12050
7e34fbd0 12051 kvm_user_return_msr_cpu_online();
c82a5c5c
CG
12052
12053 ret = kvm_x86_check_processor_compatibility();
12054 if (ret)
12055 return ret;
12056
b3646477 12057 ret = static_call(kvm_x86_hardware_enable)();
0dd6a6ed
ZA
12058 if (ret != 0)
12059 return ret;
12060
4ea1636b 12061 local_tsc = rdtsc();
b0c39dc6 12062 stable = !kvm_check_tsc_unstable();
0dd6a6ed
ZA
12063 list_for_each_entry(kvm, &vm_list, vm_list) {
12064 kvm_for_each_vcpu(i, vcpu, kvm) {
12065 if (!stable && vcpu->cpu == smp_processor_id())
105b21bb 12066 kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
0dd6a6ed
ZA
12067 if (stable && vcpu->arch.last_host_tsc > local_tsc) {
12068 backwards_tsc = true;
12069 if (vcpu->arch.last_host_tsc > max_tsc)
12070 max_tsc = vcpu->arch.last_host_tsc;
12071 }
12072 }
12073 }
12074
12075 /*
12076 * Sometimes, even reliable TSCs go backwards. This happens on
12077 * platforms that reset TSC during suspend or hibernate actions, but
12078 * maintain synchronization. We must compensate. Fortunately, we can
12079 * detect that condition here, which happens early in CPU bringup,
12080 * before any KVM threads can be running. Unfortunately, we can't
12081 * bring the TSCs fully up to date with real time, as we aren't yet far
12082 * enough into CPU bringup that we know how much real time has actually
9285ec4c 12083 * elapsed; our helper function, ktime_get_boottime_ns() will be using boot
0dd6a6ed
ZA
12084 * variables that haven't been updated yet.
12085 *
12086 * So we simply find the maximum observed TSC above, then record the
12087 * adjustment to TSC in each VCPU. When the VCPU later gets loaded,
12088 * the adjustment will be applied. Note that we accumulate
12089 * adjustments, in case multiple suspend cycles happen before some VCPU
12090 * gets a chance to run again. In the event that no KVM threads get a
12091 * chance to run, we will miss the entire elapsed period, as we'll have
12092 * reset last_host_tsc, so VCPUs will not have the TSC adjusted and may
12093 * lose cycle time. This isn't too big a deal, since the loss will be
12094 * uniform across all VCPUs (not to mention the scenario is extremely
12095 * unlikely). It is possible that a second hibernate recovery happens
12096 * much faster than a first, causing the observed TSC here to be
12097 * smaller; this would require additional padding adjustment, which is
12098 * why we set last_host_tsc to the local tsc observed here.
12099 *
12100 * N.B. - this code below runs only on platforms with reliable TSC,
12101 * as that is the only way backwards_tsc is set above. Also note
12102 * that this runs for ALL vcpus, which is not a bug; all VCPUs should
12103 * have the same delta_cyc adjustment applied if backwards_tsc
12104 * is detected. Note further, this adjustment is only done once,
12105 * as we reset last_host_tsc on all VCPUs to stop this from being
12106 * called multiple times (one for each physical CPU bringup).
12107 *
4a969980 12108 * Platforms with unreliable TSCs don't have to deal with this, they
0dd6a6ed
ZA
12109 * will be compensated by the logic in vcpu_load, which sets the TSC to
12110 * catchup mode. This will catch up all VCPUs to real time, but cannot
12111 * guarantee that they stay in perfect synchronization.
12112 */
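 /*
 * Numeric example: if the largest last_host_tsc recorded across all
 * vCPUs is 1000 cycles ahead of the TSC just read on this CPU, then
 * delta_cyc below is 1000 and every vCPU's tsc_offset_adjustment grows
 * by 1000, so no guest ever observes its TSC going backwards.
 */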
12113 if (backwards_tsc) {
12114 u64 delta_cyc = max_tsc - local_tsc;
12115 list_for_each_entry(kvm, &vm_list, vm_list) {
a826faf1 12116 kvm->arch.backwards_tsc_observed = true;
0dd6a6ed
ZA
12117 kvm_for_each_vcpu(i, vcpu, kvm) {
12118 vcpu->arch.tsc_offset_adjustment += delta_cyc;
12119 vcpu->arch.last_host_tsc = local_tsc;
105b21bb 12120 kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
0dd6a6ed
ZA
12121 }
12122
12123 /*
12124 * We have to disable TSC offset matching: booting a VM while
12125 * the host is going through an S4 suspend can otherwise run
12126 * into problems. Solving this properly is left as an exercise
12127 * to the reader.
12128 */
12129 kvm->arch.last_tsc_nsec = 0;
12130 kvm->arch.last_tsc_write = 0;
12131 }
12132
12133 }
12134 return 0;
e9b11c17
ZX
12135}
12136
13a34e06 12137void kvm_arch_hardware_disable(void)
e9b11c17 12138{
b3646477 12139 static_call(kvm_x86_hardware_disable)();
13a34e06 12140 drop_user_return_notifiers();
e9b11c17
ZX
12141}
12142
d71ba788
PB
12143bool kvm_vcpu_is_reset_bsp(struct kvm_vcpu *vcpu)
12144{
12145 return vcpu->kvm->arch.bsp_vcpu_id == vcpu->vcpu_id;
12146}
d71ba788
PB
12147
12148bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu)
12149{
12150 return (vcpu->arch.apic_base & MSR_IA32_APICBASE_BSP) != 0;
e9b11c17
ZX
12151}
12152
6e4e3b4d
CL
12153__read_mostly DEFINE_STATIC_KEY_FALSE(kvm_has_noapic_vcpu);
12154EXPORT_SYMBOL_GPL(kvm_has_noapic_vcpu);
54e9818f 12155
e790d9ef
RK
12156void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu)
12157{
b35e5548
LX
12158 struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
12159
c595ceee 12160 vcpu->arch.l1tf_flush_l1d = true;
b35e5548
LX
12161 if (pmu->version && unlikely(pmu->event_count)) {
12162 pmu->need_cleanup = true;
12163 kvm_make_request(KVM_REQ_PMU, vcpu);
12164 }
b3646477 12165 static_call(kvm_x86_sched_in)(vcpu, cpu);
e790d9ef
RK
12166}
12167
562b6b08
SC
12168void kvm_arch_free_vm(struct kvm *kvm)
12169{
05f04ae4 12170 kfree(to_kvm_hv(kvm)->hv_pa_pg);
78b497f2 12171 __kvm_arch_free_vm(kvm);
e790d9ef
RK
12172}
12173
562b6b08 12174
e08b9637 12175int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
d19a9cd2 12176{
eb7511bf 12177 int ret;
869b4421 12178 unsigned long flags;
eb7511bf 12179
e08b9637
CO
12180 if (type)
12181 return -EINVAL;
12182
eb7511bf
HZ
12183 ret = kvm_page_track_init(kvm);
12184 if (ret)
a1a39128
PB
12185 goto out;
12186
12187 ret = kvm_mmu_init_vm(kvm);
12188 if (ret)
12189 goto out_page_track;
eb7511bf 12190
b24ede22
JS
12191 ret = static_call(kvm_x86_vm_init)(kvm);
12192 if (ret)
12193 goto out_uninit_mmu;
12194
6ef768fa 12195 INIT_HLIST_HEAD(&kvm->arch.mask_notifier_list);
4d5c5d0f 12196 INIT_LIST_HEAD(&kvm->arch.assigned_dev_head);
e0f0bbc5 12197 atomic_set(&kvm->arch.noncoherent_dma_count, 0);
d19a9cd2 12198
5550af4d
SY
12199 /* Reserve bit 0 of irq_sources_bitmap for userspace irq source */
12200 set_bit(KVM_USERSPACE_IRQ_SOURCE_ID, &kvm->arch.irq_sources_bitmap);
7a84428a
AW
12201 /* Reserve bit 1 of irq_sources_bitmap for irqfd-resampler */
12202 set_bit(KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID,
12203 &kvm->arch.irq_sources_bitmap);
5550af4d 12204
038f8c11 12205 raw_spin_lock_init(&kvm->arch.tsc_write_lock);
1e08ec4a 12206 mutex_init(&kvm->arch.apic_map_lock);
869b4421 12207 seqcount_raw_spinlock_init(&kvm->arch.pvclock_sc, &kvm->arch.tsc_write_lock);
8171cd68 12208 kvm->arch.kvmclock_offset = -get_kvmclock_base_ns();
869b4421
PB
12209
12210 raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
d828199e 12211 pvclock_update_vm_gtod_copy(kvm);
869b4421 12212 raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
53f658b3 12213
741e511b 12214 kvm->arch.default_tsc_khz = max_tsc_khz ? : tsc_khz;
6fbbde9a 12215 kvm->arch.guest_can_read_msr_platform_info = true;
ba7bb663 12216 kvm->arch.enable_pmu = enable_pmu;
6fbbde9a 12217
3c86c0d3
VP
12218#if IS_ENABLED(CONFIG_HYPERV)
12219 spin_lock_init(&kvm->arch.hv_root_tdp_lock);
12220 kvm->arch.hv_root_tdp = INVALID_PAGE;
12221#endif
12222
7e44e449 12223 INIT_DELAYED_WORK(&kvm->arch.kvmclock_update_work, kvmclock_update_fn);
332967a3 12224 INIT_DELAYED_WORK(&kvm->arch.kvmclock_sync_work, kvmclock_sync_fn);
7e44e449 12225
4651fc56 12226 kvm_apicv_init(kvm);
cbc0236a 12227 kvm_hv_init_vm(kvm);
319afe68 12228 kvm_xen_init_vm(kvm);
0eb05bf2 12229
b24ede22 12230 return 0;
a1a39128 12231
b24ede22
JS
12232out_uninit_mmu:
12233 kvm_mmu_uninit_vm(kvm);
a1a39128
PB
12234out_page_track:
12235 kvm_page_track_cleanup(kvm);
12236out:
12237 return ret;
d19a9cd2
ZX
12238}
12239
1aa9b957
JS
12240int kvm_arch_post_init_vm(struct kvm *kvm)
12241{
12242 return kvm_mmu_post_init_vm(kvm);
12243}
12244
d19a9cd2
ZX
12245static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu)
12246{
ec7660cc 12247 vcpu_load(vcpu);
d19a9cd2
ZX
12248 kvm_mmu_unload(vcpu);
12249 vcpu_put(vcpu);
12250}
12251
6fcee03d 12252static void kvm_unload_vcpu_mmus(struct kvm *kvm)
d19a9cd2 12253{
46808a4c 12254 unsigned long i;
988a2cae 12255 struct kvm_vcpu *vcpu;
d19a9cd2 12256
af585b92
GN
12257 kvm_for_each_vcpu(i, vcpu, kvm) {
12258 kvm_clear_async_pf_completion_queue(vcpu);
988a2cae 12259 kvm_unload_vcpu_mmu(vcpu);
af585b92 12260 }
d19a9cd2
ZX
12261}
12262
ad8ba2cd
SY
12263void kvm_arch_sync_events(struct kvm *kvm)
12264{
332967a3 12265 cancel_delayed_work_sync(&kvm->arch.kvmclock_sync_work);
7e44e449 12266 cancel_delayed_work_sync(&kvm->arch.kvmclock_update_work);
aea924f6 12267 kvm_free_pit(kvm);
ad8ba2cd
SY
12268}
12269
ff5a983c
PX
12270/**
12271 * __x86_set_memory_region: Setup KVM internal memory slot
12272 *
12273 * @kvm: the kvm pointer to the VM.
12274 * @id: the slot ID to setup.
12275 * @gpa: the GPA to install the slot (unused when @size == 0).
12276 * @size: the size of the slot. Set to zero to uninstall a slot.
12277 *
12278 * This function helps to setup a KVM internal memory slot. Specify
12279 * @size > 0 to install a new slot, while @size == 0 to uninstall a
12280 * slot. The return code can be one of the following:
12281 *
12282 * HVA: on success (uninstall will return a bogus HVA)
12283 * -errno: on error
12284 *
12285 * The caller should always use IS_ERR() to check the return value
12286 * before use. Note, the KVM internal memory slots are guaranteed to
12287 * remain valid and unchanged until the VM is destroyed, i.e., the
12288 * GPA->HVA translation will not change. However, the HVA is a user
12289 * address, i.e. its accessibility is not guaranteed, and must be
12290 * accessed via __copy_{to,from}_user().
12291 */
12292void __user * __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa,
12293 u32 size)
9da0e4d5
PB
12294{
12295 int i, r;
3f649ab7 12296 unsigned long hva, old_npages;
f0d648bd 12297 struct kvm_memslots *slots = kvm_memslots(kvm);
0577d1ab 12298 struct kvm_memory_slot *slot;
9da0e4d5
PB
12299
12300 /* Called with kvm->slots_lock held. */
1d8007bd 12301 if (WARN_ON(id >= KVM_MEM_SLOTS_NUM))
ff5a983c 12302 return ERR_PTR_USR(-EINVAL);
9da0e4d5 12303
f0d648bd
PB
12304 slot = id_to_memslot(slots, id);
12305 if (size) {
0577d1ab 12306 if (slot && slot->npages)
ff5a983c 12307 return ERR_PTR_USR(-EEXIST);
f0d648bd
PB
12308
12309 /*
12310 * MAP_SHARED to prevent internal slot pages from being moved
12311 * by fork()/COW.
12312 */
12313 hva = vm_mmap(NULL, 0, size, PROT_READ | PROT_WRITE,
12314 MAP_SHARED | MAP_ANONYMOUS, 0);
2eb398df 12315 if (IS_ERR_VALUE(hva))
ff5a983c 12316 return (void __user *)hva;
f0d648bd 12317 } else {
0577d1ab 12318 if (!slot || !slot->npages)
46914534 12319 return NULL;
f0d648bd 12320
0577d1ab 12321 old_npages = slot->npages;
b66f9bab 12322 hva = slot->userspace_addr;
f0d648bd
PB
12323 }
12324
9da0e4d5 12325 for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
1d8007bd 12326 struct kvm_userspace_memory_region m;
9da0e4d5 12327
1d8007bd
PB
12328 m.slot = id | (i << 16);
12329 m.flags = 0;
12330 m.guest_phys_addr = gpa;
f0d648bd 12331 m.userspace_addr = hva;
1d8007bd 12332 m.memory_size = size;
9da0e4d5
PB
12333 r = __kvm_set_memory_region(kvm, &m);
12334 if (r < 0)
ff5a983c 12335 return ERR_PTR_USR(r);
9da0e4d5
PB
12336 }
12337
103c763c 12338 if (!size)
0577d1ab 12339 vm_munmap(hva, old_npages * PAGE_SIZE);
f0d648bd 12340
ff5a983c 12341 return (void __user *)hva;
9da0e4d5
PB
12342}
12343EXPORT_SYMBOL_GPL(__x86_set_memory_region);
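/*
 * Illustrative sketch only (not part of this file): a typical caller
 * installs an internal slot under kvm->slots_lock and checks the returned
 * user address with IS_ERR(), as the kernel-doc above requires. The
 * function name, slot id and GPA here are hypothetical placeholders.
 */
static int __maybe_unused example_install_internal_slot(struct kvm *kvm,
							 int id, gpa_t gpa)
{
	void __user *hva;

	mutex_lock(&kvm->slots_lock);
	hva = __x86_set_memory_region(kvm, id, gpa, PAGE_SIZE);
	mutex_unlock(&kvm->slots_lock);

	return IS_ERR(hva) ? PTR_ERR(hva) : 0;
}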
12344
1aa9b957
JS
12345void kvm_arch_pre_destroy_vm(struct kvm *kvm)
12346{
12347 kvm_mmu_pre_destroy_vm(kvm);
12348}
12349
d19a9cd2
ZX
12350void kvm_arch_destroy_vm(struct kvm *kvm)
12351{
27469d29
AH
12352 if (current->mm == kvm->mm) {
12353 /*
12354 * Free memory regions allocated on behalf of userspace,
f7081834 12355 * unless the memory map has changed due to process exit
27469d29
AH
12356 * or fd copying.
12357 */
6a3c623b
PX
12358 mutex_lock(&kvm->slots_lock);
12359 __x86_set_memory_region(kvm, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT,
12360 0, 0);
12361 __x86_set_memory_region(kvm, IDENTITY_PAGETABLE_PRIVATE_MEMSLOT,
12362 0, 0);
12363 __x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT, 0, 0);
12364 mutex_unlock(&kvm->slots_lock);
27469d29 12365 }
6fcee03d 12366 kvm_unload_vcpu_mmus(kvm);
b3646477 12367 static_call_cond(kvm_x86_vm_destroy)(kvm);
b318e8de 12368 kvm_free_msr_filter(srcu_dereference_check(kvm->arch.msr_filter, &kvm->srcu, 1));
c761159c
PX
12369 kvm_pic_destroy(kvm);
12370 kvm_ioapic_destroy(kvm);
6fcee03d 12371 kvm_destroy_vcpus(kvm);
af1bae54 12372 kvfree(rcu_dereference_check(kvm->arch.apic_map, 1));
66bb8a06 12373 kfree(srcu_dereference_check(kvm->arch.pmu_event_filter, &kvm->srcu, 1));
13d268ca 12374 kvm_mmu_uninit_vm(kvm);
2beb6dad 12375 kvm_page_track_cleanup(kvm);
7d6bbebb 12376 kvm_xen_destroy_vm(kvm);
cbc0236a 12377 kvm_hv_destroy_vm(kvm);
d19a9cd2 12378}
0de10343 12379
c9b929b3 12380static void memslot_rmap_free(struct kvm_memory_slot *slot)
db3fe4eb
TY
12381{
12382 int i;
12383
d89cc617 12384 for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) {
e96c81ee
SC
12385 kvfree(slot->arch.rmap[i]);
12386 slot->arch.rmap[i] = NULL;
c9b929b3
BG
12387 }
12388}
e96c81ee 12389
c9b929b3
BG
12390void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot)
12391{
12392 int i;
12393
12394 memslot_rmap_free(slot);
d89cc617 12395
c9b929b3 12396 for (i = 1; i < KVM_NR_PAGE_SIZES; ++i) {
e96c81ee
SC
12397 kvfree(slot->arch.lpage_info[i - 1]);
12398 slot->arch.lpage_info[i - 1] = NULL;
db3fe4eb 12399 }
21ebbeda 12400
e96c81ee 12401 kvm_page_track_free_memslot(slot);
db3fe4eb
TY
12402}
12403
1e76a3ce 12404int memslot_rmap_alloc(struct kvm_memory_slot *slot, unsigned long npages)
56dd1019
BG
12405{
12406 const int sz = sizeof(*slot->arch.rmap[0]);
12407 int i;
12408
12409 for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) {
12410 int level = i + 1;
4139b197 12411 int lpages = __kvm_mmu_slot_lpages(slot, npages, level);
56dd1019 12412
fa13843d
PB
12413 if (slot->arch.rmap[i])
12414 continue;
d501f747 12415
37b2a651 12416 slot->arch.rmap[i] = __vcalloc(lpages, sz, GFP_KERNEL_ACCOUNT);
56dd1019
BG
12417 if (!slot->arch.rmap[i]) {
12418 memslot_rmap_free(slot);
12419 return -ENOMEM;
12420 }
12421 }
12422
12423 return 0;
12424}
12425
a2557408 12426static int kvm_alloc_memslot_metadata(struct kvm *kvm,
9d7d18ee 12427 struct kvm_memory_slot *slot)
db3fe4eb 12428{
9d7d18ee 12429 unsigned long npages = slot->npages;
56dd1019 12430 int i, r;
db3fe4eb 12431
edd4fa37
SC
12432 /*
12433 * Clear out the previous array pointers for the KVM_MR_MOVE case. The
12434 * old arrays will be freed by __kvm_set_memory_region() if installing
12435 * the new memslot is successful.
12436 */
12437 memset(&slot->arch, 0, sizeof(slot->arch));
12438
e2209710 12439 if (kvm_memslots_have_rmaps(kvm)) {
a2557408
BG
12440 r = memslot_rmap_alloc(slot, npages);
12441 if (r)
12442 return r;
12443 }
56dd1019
BG
12444
12445 for (i = 1; i < KVM_NR_PAGE_SIZES; ++i) {
92f94f1e 12446 struct kvm_lpage_info *linfo;
db3fe4eb
TY
12447 unsigned long ugfn;
12448 int lpages;
d89cc617 12449 int level = i + 1;
db3fe4eb 12450
4139b197 12451 lpages = __kvm_mmu_slot_lpages(slot, npages, level);
db3fe4eb 12452
37b2a651 12453 linfo = __vcalloc(lpages, sizeof(*linfo), GFP_KERNEL_ACCOUNT);
92f94f1e 12454 if (!linfo)
db3fe4eb
TY
12455 goto out_free;
12456
92f94f1e
XG
12457 slot->arch.lpage_info[i - 1] = linfo;
12458
db3fe4eb 12459 if (slot->base_gfn & (KVM_PAGES_PER_HPAGE(level) - 1))
92f94f1e 12460 linfo[0].disallow_lpage = 1;
db3fe4eb 12461 if ((slot->base_gfn + npages) & (KVM_PAGES_PER_HPAGE(level) - 1))
92f94f1e 12462 linfo[lpages - 1].disallow_lpage = 1;
db3fe4eb
TY
12463 ugfn = slot->userspace_addr >> PAGE_SHIFT;
12464 /*
12465 * If the gfn and userspace address are not aligned wrt each
600087b6 12466 * other, disable large page support for this slot.
db3fe4eb 12467 */
600087b6 12468 if ((slot->base_gfn ^ ugfn) & (KVM_PAGES_PER_HPAGE(level) - 1)) {
db3fe4eb
TY
12469 unsigned long j;
12470
12471 for (j = 0; j < lpages; ++j)
92f94f1e 12472 linfo[j].disallow_lpage = 1;
db3fe4eb
TY
12473 }
12474 }
12475
deae4a10 12476 if (kvm_page_track_create_memslot(kvm, slot, npages))
21ebbeda
XG
12477 goto out_free;
12478
db3fe4eb
TY
12479 return 0;
12480
12481out_free:
c9b929b3 12482 memslot_rmap_free(slot);
d89cc617 12483
c9b929b3 12484 for (i = 1; i < KVM_NR_PAGE_SIZES; ++i) {
548ef284 12485 kvfree(slot->arch.lpage_info[i - 1]);
d89cc617 12486 slot->arch.lpage_info[i - 1] = NULL;
db3fe4eb
TY
12487 }
12488 return -ENOMEM;
12489}
12490
15248258 12491void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen)
e59dbe09 12492{
91724814 12493 struct kvm_vcpu *vcpu;
46808a4c 12494 unsigned long i;
91724814 12495
e6dff7d1
TY
12496 /*
12497 * memslots->generation has been incremented.
12498 * mmio generation may have reached its maximum value.
12499 */
15248258 12500 kvm_mmu_invalidate_mmio_sptes(kvm, gen);
91724814
BO
12501
12502 /* Force re-initialization of steal_time cache */
12503 kvm_for_each_vcpu(i, vcpu, kvm)
12504 kvm_vcpu_kick(vcpu);
e59dbe09
TY
12505}
12506
f7784b8e 12507int kvm_arch_prepare_memory_region(struct kvm *kvm,
537a17b3
SC
12508 const struct kvm_memory_slot *old,
12509 struct kvm_memory_slot *new,
12510 enum kvm_mr_change change)
0de10343 12511{
86931ff7
SC
12512 if (change == KVM_MR_CREATE || change == KVM_MR_MOVE) {
12513 if ((new->base_gfn + new->npages - 1) > kvm_mmu_max_gfn())
12514 return -EINVAL;
12515
9d7d18ee 12516 return kvm_alloc_memslot_metadata(kvm, new);
86931ff7 12517 }
537a17b3
SC
12518
12519 if (change == KVM_MR_FLAGS_ONLY)
12520 memcpy(&new->arch, &old->arch, sizeof(old->arch));
12521 else if (WARN_ON_ONCE(change != KVM_MR_DELETE))
12522 return -EIO;
12523
f7784b8e
MT
12524 return 0;
12525}
12526
a85863c2
MS
12527
12528static void kvm_mmu_update_cpu_dirty_logging(struct kvm *kvm, bool enable)
12529{
ee661d8e 12530 int nr_slots;
a85863c2
MS
12531
12532 if (!kvm_x86_ops.cpu_dirty_log_size)
12533 return;
12534
ee661d8e
DM
12535 nr_slots = atomic_read(&kvm->nr_memslots_dirty_logging);
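 /*
 * Only broadcast the request on a transition: when the first
 * dirty-logging slot is enabled (enable && nr_slots == 1) or when the
 * last one goes away (nr_slots == 0).
 */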
12536 if ((enable && nr_slots == 1) || !nr_slots)
a85863c2 12537 kvm_make_all_cpus_request(kvm, KVM_REQ_UPDATE_CPU_DIRTY_LOGGING);
a85863c2
MS
12538}
12539
88178fd4 12540static void kvm_mmu_slot_apply_flags(struct kvm *kvm,
3741679b 12541 struct kvm_memory_slot *old,
269e9552 12542 const struct kvm_memory_slot *new,
3741679b 12543 enum kvm_mr_change change)
88178fd4 12544{
77aedf26
SC
12545 u32 old_flags = old ? old->flags : 0;
12546 u32 new_flags = new ? new->flags : 0;
12547 bool log_dirty_pages = new_flags & KVM_MEM_LOG_DIRTY_PAGES;
a85863c2 12548
3741679b 12549 /*
a85863c2
MS
12550 * Update CPU dirty logging if dirty logging is being toggled. This
12551 * applies to all operations.
3741679b 12552 */
77aedf26 12553 if ((old_flags ^ new_flags) & KVM_MEM_LOG_DIRTY_PAGES)
a85863c2 12554 kvm_mmu_update_cpu_dirty_logging(kvm, log_dirty_pages);
88178fd4
KH
12555
12556 /*
a85863c2 12557 * Nothing more to do for RO slots (which can't be dirtied and can't be
b6e16ae5 12558 * made writable) or CREATE/MOVE/DELETE of a slot.
88178fd4 12559 *
b6e16ae5 12560 * For a memslot with dirty logging disabled:
3741679b
AY
12561 * CREATE: No dirty mappings will already exist.
12562 * MOVE/DELETE: The old mappings will already have been cleaned up by
12563 * kvm_arch_flush_shadow_memslot()
b6e16ae5
SC
12564 *
12565 * For a memslot with dirty logging enabled:
12566 * CREATE: No shadow pages exist, thus nothing to write-protect
12567 * and no dirty bits to clear.
12568 * MOVE/DELETE: The old mappings will already have been cleaned up by
12569 * kvm_arch_flush_shadow_memslot().
3741679b 12570 */
77aedf26 12571 if ((change != KVM_MR_FLAGS_ONLY) || (new_flags & KVM_MEM_READONLY))
88178fd4 12572 return;
3741679b
AY
12573
12574 /*
52f46079
SC
12575 * READONLY and non-flags changes were filtered out above, and the only
12576 * other flag is LOG_DIRTY_PAGES, i.e. something is wrong if dirty
12577 * logging isn't being toggled on or off.
88178fd4 12578 */
77aedf26 12579 if (WARN_ON_ONCE(!((old_flags ^ new_flags) & KVM_MEM_LOG_DIRTY_PAGES)))
52f46079
SC
12580 return;
12581
b6e16ae5
SC
12582 if (!log_dirty_pages) {
12583 /*
12584 * Dirty logging tracks sptes in 4k granularity, meaning that
12585 * large sptes have to be split. If live migration succeeds,
12586 * the guest in the source machine will be destroyed and large
12587 * sptes will be created in the destination. However, if the
12588 * guest continues to run in the source machine (for example if
12589 * live migration fails), small sptes will remain around and
12590 * cause bad performance.
12591 *
12592 * Scan sptes if dirty logging has been stopped, dropping those
12593 * which can be collapsed into a single large-page spte. Later
12594 * page faults will create the large-page sptes.
12595 */
3741679b 12596 kvm_mmu_zap_collapsible_sptes(kvm, new);
b6e16ae5 12597 } else {
89212919
KZ
12598 /*
12599 * Initially-all-set does not require write protecting any page,
12600 * because they're all assumed to be dirty.
12601 */
12602 if (kvm_dirty_log_manual_protect_and_init_set(kvm))
12603 return;
a1419f8b 12604
a3fe5dbd
DM
12605 if (READ_ONCE(eager_page_split))
12606 kvm_mmu_slot_try_split_huge_pages(kvm, new, PG_LEVEL_4K);
12607
a018eba5 12608 if (kvm_x86_ops.cpu_dirty_log_size) {
89212919
KZ
12609 kvm_mmu_slot_leaf_clear_dirty(kvm, new);
12610 kvm_mmu_slot_remove_write_access(kvm, new, PG_LEVEL_2M);
12611 } else {
12612 kvm_mmu_slot_remove_write_access(kvm, new, PG_LEVEL_4K);
3c9bd400 12613 }
b64d740e
JS
12614
12615 /*
12616 * Unconditionally flush the TLBs after enabling dirty logging.
12617 * A flush is almost always going to be necessary (see below),
12618 * and unconditionally flushing allows the helpers to omit
12619 * the subtly complex checks when removing write access.
12620 *
12621 * Do the flush outside of mmu_lock to reduce the amount of
12622 * time mmu_lock is held. Flushing after dropping mmu_lock is
12623 * safe as KVM only needs to guarantee the slot is fully
12624 * write-protected before returning to userspace, i.e. before
12625 * userspace can consume the dirty status.
12626 *
12627 * Flushing outside of mmu_lock requires KVM to be careful when
12628 * making decisions based on writable status of an SPTE, e.g. a
12629 * !writable SPTE doesn't guarantee a CPU can't perform writes.
12630 *
12631 * Specifically, KVM also write-protects guest page tables to
12632 * monitor changes when using shadow paging, and must guarantee
12633 * no CPUs can write to those page before mmu_lock is dropped.
12634 * Because CPUs may have stale TLB entries at this point, a
12635 * !writable SPTE doesn't guarantee CPUs can't perform writes.
12636 *
12637 * KVM also allows making SPTES writable outside of mmu_lock,
12638 * e.g. to allow dirty logging without taking mmu_lock.
12639 *
12640 * To handle these scenarios, KVM uses a separate software-only
12641 * bit (MMU-writable) to track if a SPTE is !writable due to
12642 * a guest page table being write-protected (KVM clears the
12643 * MMU-writable flag when write-protecting for shadow paging).
12644 *
12645 * The use of MMU-writable is also the primary motivation for
12646 * the unconditional flush. Because KVM must guarantee that a
12647 * CPU doesn't contain stale, writable TLB entries for a
12648 * !MMU-writable SPTE, KVM must flush if it encounters any
12649 * MMU-writable SPTE regardless of whether the actual hardware
12650 * writable bit was set. I.e. KVM is almost guaranteed to need
12651 * to flush, while unconditionally flushing allows the "remove
12652 * write access" helpers to ignore MMU-writable entirely.
12653 *
12654 * See is_writable_pte() for more details (the case involving
12655 * access-tracked SPTEs is particularly relevant).
12656 */
12657 kvm_arch_flush_remote_tlbs_memslot(kvm, new);
88178fd4
KH
12658 }
12659}
12660
f7784b8e 12661void kvm_arch_commit_memory_region(struct kvm *kvm,
9d4c197c 12662 struct kvm_memory_slot *old,
f36f3f28 12663 const struct kvm_memory_slot *new,
8482644a 12664 enum kvm_mr_change change)
f7784b8e 12665{
e0c2b633 12666 if (!kvm->arch.n_requested_mmu_pages &&
f5756029
MS
12667 (change == KVM_MR_CREATE || change == KVM_MR_DELETE)) {
12668 unsigned long nr_mmu_pages;
12669
12670 nr_mmu_pages = kvm->nr_memslot_pages / KVM_MEMSLOT_PAGES_TO_MMU_PAGES_RATIO;
12671 nr_mmu_pages = max(nr_mmu_pages, KVM_MIN_ALLOC_MMU_PAGES);
12672 kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages);
12673 }
1c91cad4 12674
269e9552 12675 kvm_mmu_slot_apply_flags(kvm, old, new, change);
21198846
SC
12676
12677 /* Free the arrays associated with the old memslot. */
12678 if (change == KVM_MR_MOVE)
e96c81ee 12679 kvm_arch_free_memslot(kvm, old);
0de10343 12680}
1d737c8a 12681
2df72e9b 12682void kvm_arch_flush_shadow_all(struct kvm *kvm)
34d4cb8f 12683{
7390de1e 12684 kvm_mmu_zap_all(kvm);
34d4cb8f
MT
12685}
12686
2df72e9b
MT
12687void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
12688 struct kvm_memory_slot *slot)
12689{
ae7cd873 12690 kvm_page_track_flush_slot(kvm, slot);
2df72e9b
MT
12691}
12692
e6c67d8c
LA
12693static inline bool kvm_guest_apic_has_interrupt(struct kvm_vcpu *vcpu)
12694{
12695 return (is_guest_mode(vcpu) &&
5be2226f 12696 static_call(kvm_x86_guest_apic_has_interrupt)(vcpu));
e6c67d8c
LA
12697}
12698
5d9bc648
PB
12699static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu)
12700{
12701 if (!list_empty_careful(&vcpu->async_pf.done))
12702 return true;
12703
bf7f9352
PB
12704 if (kvm_apic_has_pending_init_or_sipi(vcpu) &&
12705 kvm_apic_init_sipi_allowed(vcpu))
5d9bc648
PB
12706 return true;
12707
12708 if (vcpu->arch.pv.pv_unhalted)
12709 return true;
12710
7709aba8 12711 if (kvm_is_exception_pending(vcpu))
a5f01f8e
WL
12712 return true;
12713
47a66eed
Z
12714 if (kvm_test_request(KVM_REQ_NMI, vcpu) ||
12715 (vcpu->arch.nmi_pending &&
b3646477 12716 static_call(kvm_x86_nmi_allowed)(vcpu, false)))
5d9bc648
PB
12717 return true;
12718
31e83e21 12719#ifdef CONFIG_KVM_SMM
47a66eed 12720 if (kvm_test_request(KVM_REQ_SMI, vcpu) ||
a9fa7cb6 12721 (vcpu->arch.smi_pending &&
b3646477 12722 static_call(kvm_x86_smi_allowed)(vcpu, false)))
73917739 12723 return true;
31e83e21 12724#endif
73917739 12725
5d9bc648 12726 if (kvm_arch_interrupt_allowed(vcpu) &&
e6c67d8c
LA
12727 (kvm_cpu_has_interrupt(vcpu) ||
12728 kvm_guest_apic_has_interrupt(vcpu)))
5d9bc648
PB
12729 return true;
12730
1f4b34f8
AS
12731 if (kvm_hv_has_stimer_pending(vcpu))
12732 return true;
12733
d2060bd4 12734 if (is_guest_mode(vcpu) &&
5b4ac1a1
PB
12735 kvm_x86_ops.nested_ops->has_events &&
12736 kvm_x86_ops.nested_ops->has_events(vcpu))
d2060bd4
SC
12737 return true;
12738
7caf9571
DW
12739 if (kvm_xen_has_pending_events(vcpu))
12740 return true;
12741
5d9bc648
PB
12742 return false;
12743}
12744
1d737c8a
ZX
12745int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
12746{
5d9bc648 12747 return kvm_vcpu_running(vcpu) || kvm_vcpu_has_events(vcpu);
1d737c8a 12748}
5736199a 12749
10dbdf98 12750bool kvm_arch_dy_has_pending_interrupt(struct kvm_vcpu *vcpu)
17e433b5 12751{
ae801e13
SC
12752 if (kvm_vcpu_apicv_active(vcpu) &&
12753 static_call(kvm_x86_dy_apicv_has_pending_interrupt)(vcpu))
52acd22f
WL
12754 return true;
12755
12756 return false;
12757}
12758
17e433b5
WL
12759bool kvm_arch_dy_runnable(struct kvm_vcpu *vcpu)
12760{
12761 if (READ_ONCE(vcpu->arch.pv.pv_unhalted))
12762 return true;
12763
12764 if (kvm_test_request(KVM_REQ_NMI, vcpu) ||
cf7316d0 12765#ifdef CONFIG_KVM_SMM
17e433b5 12766 kvm_test_request(KVM_REQ_SMI, vcpu) ||
cf7316d0 12767#endif
17e433b5
WL
12768 kvm_test_request(KVM_REQ_EVENT, vcpu))
12769 return true;
12770
10dbdf98 12771 return kvm_arch_dy_has_pending_interrupt(vcpu);
17e433b5
WL
12772}
12773
199b5763
LM
12774bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu)
12775{
b86bb11e
WL
12776 if (vcpu->arch.guest_state_protected)
12777 return true;
12778
de63ad4c 12779 return vcpu->arch.preempted_in_kernel;
199b5763
LM
12780}
12781
e1bfc245
SC
12782unsigned long kvm_arch_vcpu_get_ip(struct kvm_vcpu *vcpu)
12783{
12784 return kvm_rip_read(vcpu);
12785}
12786
b6d33834 12787int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu)
5736199a 12788{
b6d33834 12789 return kvm_vcpu_exiting_guest_mode(vcpu) == IN_GUEST_MODE;
5736199a 12790}
78646121
GN
12791
12792int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu)
12793{
b3646477 12794 return static_call(kvm_x86_interrupt_allowed)(vcpu, false);
78646121 12795}
229456fc 12796
82b32774 12797unsigned long kvm_get_linear_rip(struct kvm_vcpu *vcpu)
f92653ee 12798{
7ed9abfe
TL
12799 /* Can't read the RIP when guest state is protected, just return 0 */
12800 if (vcpu->arch.guest_state_protected)
12801 return 0;
12802
82b32774
NA
12803 if (is_64_bit_mode(vcpu))
12804 return kvm_rip_read(vcpu);
12805 return (u32)(get_segment_base(vcpu, VCPU_SREG_CS) +
12806 kvm_rip_read(vcpu));
12807}
12808EXPORT_SYMBOL_GPL(kvm_get_linear_rip);
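/*
 * Worked example: outside 64-bit mode, CS.base = 0xffff0000 and
 * RIP = 0xfff0 yield 0xfffffff0 (the reset vector); in 64-bit mode the
 * CS base is ignored and the raw RIP is returned.
 */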
f92653ee 12809
82b32774
NA
12810bool kvm_is_linear_rip(struct kvm_vcpu *vcpu, unsigned long linear_rip)
12811{
12812 return kvm_get_linear_rip(vcpu) == linear_rip;
f92653ee
JK
12813}
12814EXPORT_SYMBOL_GPL(kvm_is_linear_rip);
12815
94fe45da
JK
12816unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu)
12817{
12818 unsigned long rflags;
12819
b3646477 12820 rflags = static_call(kvm_x86_get_rflags)(vcpu);
94fe45da 12821 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
c310bac5 12822 rflags &= ~X86_EFLAGS_TF;
94fe45da
JK
12823 return rflags;
12824}
12825EXPORT_SYMBOL_GPL(kvm_get_rflags);
12826
6addfc42 12827static void __kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
94fe45da
JK
12828{
12829 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP &&
f92653ee 12830 kvm_is_linear_rip(vcpu, vcpu->arch.singlestep_rip))
c310bac5 12831 rflags |= X86_EFLAGS_TF;
b3646477 12832 static_call(kvm_x86_set_rflags)(vcpu, rflags);
6addfc42
PB
12833}
12834
12835void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
12836{
12837 __kvm_set_rflags(vcpu, rflags);
3842d135 12838 kvm_make_request(KVM_REQ_EVENT, vcpu);
94fe45da
JK
12839}
12840EXPORT_SYMBOL_GPL(kvm_set_rflags);
12841
af585b92
GN
12842static inline u32 kvm_async_pf_hash_fn(gfn_t gfn)
12843{
dd03bcaa
PX
12844 BUILD_BUG_ON(!is_power_of_2(ASYNC_PF_PER_VCPU));
12845
af585b92
GN
12846 return hash_32(gfn & 0xffffffff, order_base_2(ASYNC_PF_PER_VCPU));
12847}
12848
12849static inline u32 kvm_async_pf_next_probe(u32 key)
12850{
dd03bcaa 12851 return (key + 1) & (ASYNC_PF_PER_VCPU - 1);
af585b92
GN
12852}
12853
12854static void kvm_add_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
12855{
12856 u32 key = kvm_async_pf_hash_fn(gfn);
12857
12858 while (vcpu->arch.apf.gfns[key] != ~0)
12859 key = kvm_async_pf_next_probe(key);
12860
12861 vcpu->arch.apf.gfns[key] = gfn;
12862}
12863
12864static u32 kvm_async_pf_gfn_slot(struct kvm_vcpu *vcpu, gfn_t gfn)
12865{
12866 int i;
12867 u32 key = kvm_async_pf_hash_fn(gfn);
12868
dd03bcaa 12869 for (i = 0; i < ASYNC_PF_PER_VCPU &&
c7d28c24
XG
12870 (vcpu->arch.apf.gfns[key] != gfn &&
12871 vcpu->arch.apf.gfns[key] != ~0); i++)
af585b92
GN
12872 key = kvm_async_pf_next_probe(key);
12873
12874 return key;
12875}
12876
12877bool kvm_find_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
12878{
12879 return vcpu->arch.apf.gfns[kvm_async_pf_gfn_slot(vcpu, gfn)] == gfn;
12880}
12881
12882static void kvm_del_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
12883{
12884 u32 i, j, k;
12885
12886 i = j = kvm_async_pf_gfn_slot(vcpu, gfn);
0fd46044
PX
12887
12888 if (WARN_ON_ONCE(vcpu->arch.apf.gfns[i] != gfn))
12889 return;
12890
af585b92
GN
12891 while (true) {
12892 vcpu->arch.apf.gfns[i] = ~0;
12893 do {
12894 j = kvm_async_pf_next_probe(j);
12895 if (vcpu->arch.apf.gfns[j] == ~0)
12896 return;
12897 k = kvm_async_pf_hash_fn(vcpu->arch.apf.gfns[j]);
12898 /*
12899 * k lies cyclically in ]i,j]
12900 * | i.k.j |
12901 * |....j i.k.| or |.k..j i...|
12902 */
12903 } while ((i <= j) ? (i < k && k <= j) : (i < k || k <= j));
12904 vcpu->arch.apf.gfns[i] = vcpu->arch.apf.gfns[j];
12905 i = j;
12906 }
12907}
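/*
 * Illustrative sketch only (not part of KVM): the cyclic containment test
 * used by the deletion loop above, pulled out into a standalone helper.
 * 'i' is the hole being back-filled, 'j' is the candidate entry and 'k'
 * is that entry's home bucket. The scan keeps walking while 'k' lies
 * cyclically in ]i, j] (the probe path from k to j does not cross the
 * hole at i, so the entry may stay put); once it does not, the entry at
 * 'j' is moved back into the hole at 'i'.
 */
static inline bool apf_key_in_cyclic_range(u32 i, u32 k, u32 j)
{
	return (i <= j) ? (i < k && k <= j) : (i < k || k <= j);
}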
12908
68fd66f1 12909static inline int apf_put_user_notpresent(struct kvm_vcpu *vcpu)
7c90705b 12910{
68fd66f1
VK
12911 u32 reason = KVM_PV_REASON_PAGE_NOT_PRESENT;
12912
12913 return kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.apf.data, &reason,
12914 sizeof(reason));
12915}
12916
12917static inline int apf_put_user_ready(struct kvm_vcpu *vcpu, u32 token)
12918{
2635b5c4 12919 unsigned int offset = offsetof(struct kvm_vcpu_pv_apf_data, token);
4e335d9e 12920
2635b5c4
VK
12921 return kvm_write_guest_offset_cached(vcpu->kvm, &vcpu->arch.apf.data,
12922 &token, offset, sizeof(token));
12923}
12924
12925static inline bool apf_pageready_slot_free(struct kvm_vcpu *vcpu)
12926{
12927 unsigned int offset = offsetof(struct kvm_vcpu_pv_apf_data, token);
12928 u32 val;
12929
12930 if (kvm_read_guest_offset_cached(vcpu->kvm, &vcpu->arch.apf.data,
12931 &val, offset, sizeof(val)))
12932 return false;
12933
12934 return !val;
7c90705b
GN
12935}
12936
1dfdb45e
PB
12937static bool kvm_can_deliver_async_pf(struct kvm_vcpu *vcpu)
12938{
57cb3bb0
PB
12939
12940 if (!kvm_pv_async_pf_enabled(vcpu))
1dfdb45e
PB
12941 return false;
12942
57cb3bb0
PB
12943 if (vcpu->arch.apf.send_user_only &&
12944 static_call(kvm_x86_get_cpl)(vcpu) == 0)
1dfdb45e
PB
12945 return false;
12946
57cb3bb0
PB
12947 if (is_guest_mode(vcpu)) {
12948 /*
12949 * L1 needs to opt into the special #PF vmexits that are
12950 * used to deliver async page faults.
12951 */
12952 return vcpu->arch.apf.delivery_as_pf_vmexit;
12953 } else {
12954 /*
12955 * Play it safe in case the guest temporarily disables paging.
12956 * The real mode IDT in particular is unlikely to have a #PF
12957 * exception setup.
12958 */
12959 return is_paging(vcpu);
12960 }
1dfdb45e
PB
12961}
12962
12963bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu)
12964{
12965 if (unlikely(!lapic_in_kernel(vcpu) ||
12966 kvm_event_needs_reinjection(vcpu) ||
7709aba8 12967 kvm_is_exception_pending(vcpu)))
1dfdb45e
PB
12968 return false;
12969
12970 if (kvm_hlt_in_guest(vcpu->kvm) && !kvm_can_deliver_async_pf(vcpu))
12971 return false;
12972
12973 /*
12974 * If interrupts are off we cannot even use an artificial
12975 * halt state.
12976 */
c300ab9f 12977 return kvm_arch_interrupt_allowed(vcpu);
1dfdb45e
PB
12978}
12979
2a18b7e7 12980bool kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu,
af585b92
GN
12981 struct kvm_async_pf *work)
12982{
6389ee94
AK
12983 struct x86_exception fault;
12984
736c291c 12985 trace_kvm_async_pf_not_present(work->arch.token, work->cr2_or_gpa);
af585b92 12986 kvm_add_async_pf_gfn(vcpu, work->arch.gfn);
7c90705b 12987
1dfdb45e 12988 if (kvm_can_deliver_async_pf(vcpu) &&
68fd66f1 12989 !apf_put_user_notpresent(vcpu)) {
6389ee94
AK
12990 fault.vector = PF_VECTOR;
12991 fault.error_code_valid = true;
12992 fault.error_code = 0;
12993 fault.nested_page_fault = false;
12994 fault.address = work->arch.token;
adfe20fb 12995 fault.async_page_fault = true;
6389ee94 12996 kvm_inject_page_fault(vcpu, &fault);
2a18b7e7 12997 return true;
1dfdb45e
PB
12998 } else {
12999 /*
13000 * It is not possible to deliver a paravirtualized asynchronous
13001 * page fault, but putting the guest in an artificial halt state
13002 * can be beneficial nevertheless: if an interrupt arrives, we
13003 * can deliver it timely and perhaps the guest will schedule
13004 * another process. When the instruction that triggered a page
13005 * fault is retried, hopefully the page will be ready in the host.
13006 */
13007 kvm_make_request(KVM_REQ_APF_HALT, vcpu);
2a18b7e7 13008 return false;
7c90705b 13009 }
af585b92
GN
13010}
13011
13012void kvm_arch_async_page_present(struct kvm_vcpu *vcpu,
13013 struct kvm_async_pf *work)
13014{
2635b5c4
VK
13015 struct kvm_lapic_irq irq = {
13016 .delivery_mode = APIC_DM_FIXED,
13017 .vector = vcpu->arch.apf.vec
13018 };
6389ee94 13019
f2e10669 13020 if (work->wakeup_all)
7c90705b
GN
13021 work->arch.token = ~0; /* broadcast wakeup */
13022 else
13023 kvm_del_async_pf_gfn(vcpu, work->arch.gfn);
736c291c 13024 trace_kvm_async_pf_ready(work->arch.token, work->cr2_or_gpa);
7c90705b 13025
2a18b7e7
VK
13026 if ((work->wakeup_all || work->notpresent_injected) &&
13027 kvm_pv_async_pf_enabled(vcpu) &&
557a961a
VK
13028 !apf_put_user_ready(vcpu, work->arch.token)) {
13029 vcpu->arch.apf.pageready_pending = true;
2635b5c4 13030 kvm_apic_set_irq(vcpu, &irq, NULL);
557a961a 13031 }
2635b5c4 13032
e6d53e3b 13033 vcpu->arch.apf.halted = false;
a4fa1635 13034 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
7c90705b
GN
13035}
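For 'page ready' delivery the token is either the per-wait token or ~0 for a broadcast wakeup (wakeup_all), and notification happens via a fixed-mode LAPIC interrupt on the guest-programmed vector vcpu->arch.apf.vec. A small user-space sketch of the token encoding; the macro and helper names below are illustrative, not part of the ABI:

/* sketch: 'page ready' token values used above (~0 means "wake everyone") */
#include <stdint.h>
#include <stdio.h>

#define APF_BROADCAST_TOKEN	(~0u)	/* illustrative name for work->arch.token = ~0 */

static const char *describe_token(uint32_t token)
{
	if (token == APF_BROADCAST_TOKEN)
		return "broadcast wakeup (wakeup_all)";
	return token ? "single waiter token" : "slot free / acknowledged";
}

int main(void)
{
	uint32_t samples[] = { 0, 0x42, APF_BROADCAST_TOKEN };

	for (unsigned int i = 0; i < sizeof(samples) / sizeof(samples[0]); i++)
		printf("0x%08x -> %s\n", samples[i], describe_token(samples[i]));
	return 0;
}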
13036
557a961a
VK
13037void kvm_arch_async_page_present_queued(struct kvm_vcpu *vcpu)
13038{
13039 kvm_make_request(KVM_REQ_APF_READY, vcpu);
13040 if (!vcpu->arch.apf.pageready_pending)
13041 kvm_vcpu_kick(vcpu);
13042}
13043
7c0ade6c 13044bool kvm_arch_can_dequeue_async_page_present(struct kvm_vcpu *vcpu)
7c90705b 13045{
2635b5c4 13046 if (!kvm_pv_async_pf_enabled(vcpu))
7c90705b
GN
13047 return true;
13048 else
2f15d027 13049 return kvm_lapic_enabled(vcpu) && apf_pageready_slot_free(vcpu);
af585b92
GN
13050}
13051
5544eb9b
PB
13052void kvm_arch_start_assignment(struct kvm *kvm)
13053{
57ab8794 13054 if (atomic_inc_return(&kvm->arch.assigned_device_count) == 1)
e27bc044 13055 static_call_cond(kvm_x86_pi_start_assignment)(kvm);
5544eb9b
PB
13056}
13057EXPORT_SYMBOL_GPL(kvm_arch_start_assignment);
13058
13059void kvm_arch_end_assignment(struct kvm *kvm)
13060{
13061 atomic_dec(&kvm->arch.assigned_device_count);
13062}
13063EXPORT_SYMBOL_GPL(kvm_arch_end_assignment);
13064
742ab6df 13065bool noinstr kvm_arch_has_assigned_device(struct kvm *kvm)
5544eb9b 13066{
742ab6df 13067 return arch_atomic_read(&kvm->arch.assigned_device_count);
5544eb9b
PB
13068}
13069EXPORT_SYMBOL_GPL(kvm_arch_has_assigned_device);
13070
e0f0bbc5
AW
13071void kvm_arch_register_noncoherent_dma(struct kvm *kvm)
13072{
13073 atomic_inc(&kvm->arch.noncoherent_dma_count);
13074}
13075EXPORT_SYMBOL_GPL(kvm_arch_register_noncoherent_dma);
13076
13077void kvm_arch_unregister_noncoherent_dma(struct kvm *kvm)
13078{
13079 atomic_dec(&kvm->arch.noncoherent_dma_count);
13080}
13081EXPORT_SYMBOL_GPL(kvm_arch_unregister_noncoherent_dma);
13082
13083bool kvm_arch_has_noncoherent_dma(struct kvm *kvm)
13084{
13085 return atomic_read(&kvm->arch.noncoherent_dma_count);
13086}
13087EXPORT_SYMBOL_GPL(kvm_arch_has_noncoherent_dma);
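The assigned-device and noncoherent-DMA bookkeeping above are plain atomic counters whose only readers ask "is it non-zero?", plus a one-shot hook (kvm_x86_pi_start_assignment) fired on the 0->1 transition. A user-space sketch of that pattern with C11 atomics; all names below are stand-ins:

/* sketch: refcount-with-0->1-hook pattern, mirroring kvm_arch_start_assignment() */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_int assigned_count;

static void pi_start_assignment_hook(void)	/* stand-in for the 0->1 callback */
{
	puts("first device assigned: enable posted-interrupt wakeup handling");
}

static void start_assignment(void)
{
	if (atomic_fetch_add(&assigned_count, 1) + 1 == 1)
		pi_start_assignment_hook();
}

static void end_assignment(void)
{
	atomic_fetch_sub(&assigned_count, 1);
}

static bool has_assigned_device(void)
{
	return atomic_load(&assigned_count) != 0;
}

int main(void)
{
	start_assignment();
	start_assignment();
	end_assignment();
	printf("has assigned device: %d\n", has_assigned_device());
	return 0;
}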
13088
14717e20
AW
13089bool kvm_arch_has_irq_bypass(void)
13090{
92735b1b 13091 return true;
14717e20
AW
13092}
13093
87276880
FW
13094int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons,
13095 struct irq_bypass_producer *prod)
13096{
13097 struct kvm_kernel_irqfd *irqfd =
13098 container_of(cons, struct kvm_kernel_irqfd, consumer);
2edd9cb7 13099 int ret;
87276880 13100
14717e20 13101 irqfd->producer = prod;
2edd9cb7 13102 kvm_arch_start_assignment(irqfd->kvm);
e27bc044 13103 ret = static_call(kvm_x86_pi_update_irte)(irqfd->kvm,
2edd9cb7
ZL
13104 prod->irq, irqfd->gsi, 1);
13105
13106 if (ret)
13107 kvm_arch_end_assignment(irqfd->kvm);
87276880 13108
2edd9cb7 13109 return ret;
87276880
FW
13110}
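kvm_arch_irq_bypass_add_producer() follows a "commit, then roll back on failure" shape: the assignment count is raised before the IRTE is switched to posted mode and dropped again if the update fails, so the count never stays elevated for a producer that was never wired up. A condensed user-space model of that shape; update_irte_stub() is a stand-in for the kvm_x86_pi_update_irte hook:

/* sketch: raise the count first, roll it back if the posted-interrupt update fails */
#include <stdatomic.h>
#include <stdio.h>

static atomic_int assigned;

static int update_irte_stub(int host_irq, int gsi, int set)	/* stand-in hook */
{
	(void)host_irq; (void)gsi;
	return set ? -1 : 0;	/* pretend switching to posted mode fails */
}

static int add_producer_sketch(int host_irq, int gsi)
{
	int ret;

	atomic_fetch_add(&assigned, 1);			/* kvm_arch_start_assignment() */
	ret = update_irte_stub(host_irq, gsi, 1);	/* kvm_x86_pi_update_irte hook */
	if (ret)
		atomic_fetch_sub(&assigned, 1);		/* kvm_arch_end_assignment() */
	return ret;
}

int main(void)
{
	printf("add_producer -> %d, assigned=%d\n",
	       add_producer_sketch(10, 24), atomic_load(&assigned));
	return 0;
}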
13111
13112void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons,
13113 struct irq_bypass_producer *prod)
13114{
13115 int ret;
13116 struct kvm_kernel_irqfd *irqfd =
13117 container_of(cons, struct kvm_kernel_irqfd, consumer);
13118
87276880
FW
13119 WARN_ON(irqfd->producer != prod);
13120 irqfd->producer = NULL;
13121
13122 /*
13123 * When the producer of a consumer is unregistered, we change back to
13124 * remapped mode, so we can re-use the current implementation
bb3541f1 13125 * when the irq is masked/disabled or the consumer side (KVM
87276880
FW
13126 * in this case) doesn't want to receive the interrupts.
13127 */
e27bc044 13128 ret = static_call(kvm_x86_pi_update_irte)(irqfd->kvm, prod->irq, irqfd->gsi, 0);
87276880
FW
13129 if (ret)
13130 printk(KERN_INFO "irq bypass consumer (token %p) unregistration"
13131 " fails: %d\n", irqfd->consumer.token, ret);
2edd9cb7
ZL
13132
13133 kvm_arch_end_assignment(irqfd->kvm);
87276880
FW
13134}
13135
13136int kvm_arch_update_irqfd_routing(struct kvm *kvm, unsigned int host_irq,
13137 uint32_t guest_irq, bool set)
13138{
e27bc044 13139 return static_call(kvm_x86_pi_update_irte)(kvm, host_irq, guest_irq, set);
87276880
FW
13140}
13141
515a0c79
LM
13142bool kvm_arch_irqfd_route_changed(struct kvm_kernel_irq_routing_entry *old,
13143 struct kvm_kernel_irq_routing_entry *new)
13144{
13145 if (new->type != KVM_IRQ_ROUTING_MSI)
13146 return true;
13147
13148 return !!memcmp(&old->msi, &new->msi, sizeof(new->msi));
13149}
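kvm_arch_irqfd_route_changed() treats anything that is not an MSI entry as changed, and compares MSI entries byte-for-byte so the IRTE is only reprogrammed when the MSI address or data actually differ. A user-space sketch of that comparison over a hypothetical MSI layout (struct msi_sketch is illustrative, not the kernel structure):

/* sketch: MSI routes compare equal only when address/data match exactly */
#include <stdbool.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>

struct msi_sketch {			/* hypothetical mirror of the MSI fields */
	uint32_t address_lo, address_hi, data;
};

enum route_type { ROUTE_MSI, ROUTE_OTHER };

struct route_sketch {
	enum route_type type;
	struct msi_sketch msi;
};

static bool route_changed(const struct route_sketch *old,
			  const struct route_sketch *new)
{
	if (new->type != ROUTE_MSI)
		return true;			/* non-MSI: always treat as changed */
	return memcmp(&old->msi, &new->msi, sizeof(new->msi)) != 0;
}

int main(void)
{
	struct route_sketch a = { ROUTE_MSI, { 0xfee00000, 0, 0x4041 } };
	struct route_sketch b = a;

	printf("identical: changed=%d\n", route_changed(&a, &b));
	b.msi.data = 0x4042;
	printf("new vector: changed=%d\n", route_changed(&a, &b));
	return 0;
}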
13150
52004014
FW
13151bool kvm_vector_hashing_enabled(void)
13152{
13153 return vector_hashing;
13154}
52004014 13155
2d5ba19b
MT
13156bool kvm_arch_no_poll(struct kvm_vcpu *vcpu)
13157{
13158 return (vcpu->arch.msr_kvm_poll_control & 1) == 0;
13159}
13160EXPORT_SYMBOL_GPL(kvm_arch_no_poll);
13161
841c2be0
ML
13162
13163int kvm_spec_ctrl_test_value(u64 value)
6441fa61 13164{
841c2be0
ML
13165 /*
13166 * Test that setting IA32_SPEC_CTRL to the given value
13167 * is allowed by the host processor.
13168 */
6441fa61 13169
841c2be0
ML
13170 u64 saved_value;
13171 unsigned long flags;
13172 int ret = 0;
6441fa61 13173
841c2be0 13174 local_irq_save(flags);
6441fa61 13175
841c2be0
ML
13176 if (rdmsrl_safe(MSR_IA32_SPEC_CTRL, &saved_value))
13177 ret = 1;
13178 else if (wrmsrl_safe(MSR_IA32_SPEC_CTRL, value))
13179 ret = 1;
13180 else
13181 wrmsrl(MSR_IA32_SPEC_CTRL, saved_value);
6441fa61 13182
841c2be0 13183 local_irq_restore(flags);
6441fa61 13184
841c2be0 13185 return ret;
6441fa61 13186}
841c2be0 13187EXPORT_SYMBOL_GPL(kvm_spec_ctrl_test_value);
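kvm_spec_ctrl_test_value() uses a save / trial-write / restore idiom with the _safe MSR accessors so that a #GP from an unsupported bit is swallowed rather than crashing the host, and it runs with interrupts off so nothing can touch the MSR in between. The same idiom modelled in user space against a fake register; wr_safe() and FAKE_SUPPORTED_BITS are stand-ins for wrmsrl_safe() and the CPU's real SPEC_CTRL capabilities:

/* sketch: save / trial-write / restore probe, with a fake MSR standing in */
#include <stdint.h>
#include <stdio.h>

#define FAKE_SUPPORTED_BITS	0x5ull	/* pretend only bits 0 and 2 exist */

static uint64_t fake_msr;

static int wr_safe(uint64_t val)	/* returns non-zero on a simulated #GP */
{
	if (val & ~FAKE_SUPPORTED_BITS)
		return 1;
	fake_msr = val;
	return 0;
}

static int test_value(uint64_t value)
{
	uint64_t saved = fake_msr;	/* rdmsrl_safe() counterpart */
	int ret = 0;

	if (wr_safe(value))		/* trial write of the candidate value */
		ret = 1;
	else
		wr_safe(saved);		/* restore the original value */
	return ret;
}

int main(void)
{
	printf("value 0x1 -> %d, value 0x2 -> %d\n",
	       test_value(0x1), test_value(0x2));
	return 0;
}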
2d5ba19b 13188
89786147
MG
13189void kvm_fixup_and_inject_pf_error(struct kvm_vcpu *vcpu, gva_t gva, u16 error_code)
13190{
1f5a21ee 13191 struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
89786147 13192 struct x86_exception fault;
5b22bbe7 13193 u64 access = error_code &
19cf4b7e 13194 (PFERR_WRITE_MASK | PFERR_FETCH_MASK | PFERR_USER_MASK);
89786147
MG
13195
13196 if (!(error_code & PFERR_PRESENT_MASK) ||
6e1d2a3f 13197 mmu->gva_to_gpa(vcpu, mmu, gva, access, &fault) != INVALID_GPA) {
89786147
MG
13198 /*
13199 * If vcpu->arch.walk_mmu->gva_to_gpa succeeded, the page
13200 * tables probably do not match the TLB. Just proceed
13201 * with the error code that the processor gave.
13202 */
13203 fault.vector = PF_VECTOR;
13204 fault.error_code_valid = true;
13205 fault.error_code = error_code;
13206 fault.nested_page_fault = false;
13207 fault.address = gva;
2bc685e6 13208 fault.async_page_fault = false;
89786147
MG
13209 }
13210 vcpu->arch.walk_mmu->inject_page_fault(vcpu, &fault);
6441fa61 13211}
89786147 13212EXPORT_SYMBOL_GPL(kvm_fixup_and_inject_pf_error);
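Only the write, fetch and user bits of the reported error code become the access mask for the software page walk; if the fault was not-present, or the walk still resolves the GVA, the hardware error code is forwarded to the guest unchanged. A small sketch of that mask extraction using the architectural #PF error-code bit positions (bit 0 P, bit 1 W, bit 2 U, bit 4 I); the printed values are illustrative:

/* sketch: deriving the page-walk access mask from a #PF error code */
#include <stdint.h>
#include <stdio.h>

#define PFERR_PRESENT_MASK	(1u << 0)
#define PFERR_WRITE_MASK	(1u << 1)
#define PFERR_USER_MASK		(1u << 2)
#define PFERR_FETCH_MASK	(1u << 4)

static uint64_t walk_access(uint16_t error_code)
{
	return error_code &
	       (PFERR_WRITE_MASK | PFERR_FETCH_MASK | PFERR_USER_MASK);
}

int main(void)
{
	uint16_t ec = PFERR_PRESENT_MASK | PFERR_WRITE_MASK | PFERR_USER_MASK;

	printf("error_code=%#x -> access mask %#llx\n",
	       (unsigned int)ec, (unsigned long long)walk_access(ec));
	return 0;
}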
2d5ba19b 13213
3f3393b3
BM
13214/*
13215 * Handles kvm_read/write_guest_virt*() result and either injects #PF or returns
13216 * KVM_EXIT_INTERNAL_ERROR for cases not currently handled by KVM. Return value
13217 * indicates whether exit to userspace is needed.
13218 */
13219int kvm_handle_memory_failure(struct kvm_vcpu *vcpu, int r,
13220 struct x86_exception *e)
13221{
13222 if (r == X86EMUL_PROPAGATE_FAULT) {
77b1908e
SC
13223 if (KVM_BUG_ON(!e, vcpu->kvm))
13224 return -EIO;
13225
3f3393b3
BM
13226 kvm_inject_emulated_page_fault(vcpu, e);
13227 return 1;
13228 }
13229
13230 /*
13231 * In case kvm_read/write_guest_virt*() failed with X86EMUL_IO_NEEDED
13232 * while handling a VMX instruction, KVM could have handled the request
13233 * correctly by exiting to userspace and performing I/O, but there
13234 * doesn't seem to be a real use case behind such requests, so just
13235 * return KVM_EXIT_INTERNAL_ERROR for now.
13236 */
e615e355 13237 kvm_prepare_emulation_failure_exit(vcpu);
3f3393b3
BM
13238
13239 return 0;
13240}
13241EXPORT_SYMBOL_GPL(kvm_handle_memory_failure);
13242
9715092f
BM
13243int kvm_handle_invpcid(struct kvm_vcpu *vcpu, unsigned long type, gva_t gva)
13244{
13245 bool pcid_enabled;
13246 struct x86_exception e;
9715092f
BM
13247 struct {
13248 u64 pcid;
13249 u64 gla;
13250 } operand;
13251 int r;
13252
13253 r = kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e);
13254 if (r != X86EMUL_CONTINUE)
13255 return kvm_handle_memory_failure(vcpu, r, &e);
13256
13257 if (operand.pcid >> 12 != 0) {
13258 kvm_inject_gp(vcpu, 0);
13259 return 1;
13260 }
13261
13262 pcid_enabled = kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE);
13263
13264 switch (type) {
13265 case INVPCID_TYPE_INDIV_ADDR:
13266 if ((!pcid_enabled && (operand.pcid != 0)) ||
13267 is_noncanonical_address(operand.gla, vcpu)) {
13268 kvm_inject_gp(vcpu, 0);
13269 return 1;
13270 }
13271 kvm_mmu_invpcid_gva(vcpu, operand.gla, operand.pcid);
13272 return kvm_skip_emulated_instruction(vcpu);
13273
13274 case INVPCID_TYPE_SINGLE_CTXT:
13275 if (!pcid_enabled && (operand.pcid != 0)) {
13276 kvm_inject_gp(vcpu, 0);
13277 return 1;
13278 }
13279
21823fbd 13280 kvm_invalidate_pcid(vcpu, operand.pcid);
9715092f
BM
13281 return kvm_skip_emulated_instruction(vcpu);
13282
13283 case INVPCID_TYPE_ALL_NON_GLOBAL:
13284 /*
13285 * Currently, KVM doesn't mark global entries in the shadow
13286 * page tables, so a non-global flush just degenerates to a
13287 * global flush. If needed, we could optimize this later by
13288 * keeping track of global entries in shadow page tables.
13289 */
13290
13291 fallthrough;
13292 case INVPCID_TYPE_ALL_INCL_GLOBAL:
28f28d45 13293 kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
9715092f
BM
13294 return kvm_skip_emulated_instruction(vcpu);
13295
13296 default:
796c83c5
VS
13297 kvm_inject_gp(vcpu, 0);
13298 return 1;
9715092f
BM
13299 }
13300}
13301EXPORT_SYMBOL_GPL(kvm_handle_invpcid);
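The INVPCID operand read above is a 16-byte in-memory descriptor: a 64-bit PCID of which only bits 0-11 may be set (hence the operand.pcid >> 12 check), followed by a 64-bit linear address that must be canonical for the individual-address type. A user-space sketch of that layout and of the two validity checks; the 48-bit canonical test is an assumption about the address width:

/* sketch: INVPCID descriptor layout and the pcid/gla validity checks above */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct invpcid_desc {
	uint64_t pcid;	/* only bits 0-11 are architecturally valid */
	uint64_t gla;	/* linear address, used by the per-address type */
};

static bool pcid_valid(uint64_t pcid)
{
	return (pcid >> 12) == 0;	/* mirrors the operand.pcid >> 12 check */
}

static bool canonical48(uint64_t gla)	/* assumes 48-bit virtual addresses */
{
	return (int64_t)(gla << 16) >> 16 == (int64_t)gla;
}

int main(void)
{
	struct invpcid_desc d = { .pcid = 0x001, .gla = 0x00007f0000001000ull };

	printf("pcid ok=%d, gla canonical=%d\n",
	       pcid_valid(d.pcid), canonical48(d.gla));
	d.pcid = 0x1000;		/* reserved bits set -> #GP in the handler */
	printf("pcid ok=%d\n", pcid_valid(d.pcid));
	return 0;
}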
13302
8f423a80
TL
13303static int complete_sev_es_emulated_mmio(struct kvm_vcpu *vcpu)
13304{
13305 struct kvm_run *run = vcpu->run;
13306 struct kvm_mmio_fragment *frag;
13307 unsigned int len;
13308
13309 BUG_ON(!vcpu->mmio_needed);
13310
13311 /* Complete previous fragment */
13312 frag = &vcpu->mmio_fragments[vcpu->mmio_cur_fragment];
13313 len = min(8u, frag->len);
13314 if (!vcpu->mmio_is_write)
13315 memcpy(frag->data, run->mmio.data, len);
13316
13317 if (frag->len <= 8) {
13318 /* Switch to the next fragment. */
13319 frag++;
13320 vcpu->mmio_cur_fragment++;
13321 } else {
13322 /* Go forward to the next mmio piece. */
13323 frag->data += len;
13324 frag->gpa += len;
13325 frag->len -= len;
13326 }
13327
13328 if (vcpu->mmio_cur_fragment >= vcpu->mmio_nr_fragments) {
13329 vcpu->mmio_needed = 0;
13330
13331 /* VMG change: at this point we're always done;
13332 * RIP has already been advanced. */
13333 return 1;
13334 }
13335
13336 /* More MMIO is needed */
13337 run->mmio.phys_addr = frag->gpa;
13338 run->mmio.len = min(8u, frag->len);
13339 run->mmio.is_write = vcpu->mmio_is_write;
13340 if (run->mmio.is_write)
13341 memcpy(run->mmio.data, frag->data, min(8u, frag->len));
13342 run->exit_reason = KVM_EXIT_MMIO;
13343
13344 vcpu->arch.complete_userspace_io = complete_sev_es_emulated_mmio;
13345
13346 return 0;
13347}
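MMIO for SEV-ES guests is bounced to userspace in pieces of at most eight bytes: each completion copies min(8, frag->len) bytes and then either advances within the current fragment or moves to the next one. A user-space sketch of that chunking arithmetic; the fragment length and GPA below are made up:

/* sketch: walking one MMIO fragment in <= 8-byte pieces, as the completion does */
#include <stdio.h>

int main(void)
{
	unsigned int len = 20;		/* made-up fragment length */
	unsigned long long gpa = 0xfed00000ull;
	unsigned int pieces = 0;

	while (len) {
		unsigned int chunk = len < 8 ? len : 8;	/* min(8u, frag->len) */

		printf("piece %u: gpa=%#llx len=%u\n", ++pieces, gpa, chunk);
		gpa += chunk;
		len -= chunk;
	}
	printf("total KVM_EXIT_MMIO round trips: %u\n", pieces);
	return 0;
}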
13348
13349int kvm_sev_es_mmio_write(struct kvm_vcpu *vcpu, gpa_t gpa, unsigned int bytes,
13350 void *data)
13351{
13352 int handled;
13353 struct kvm_mmio_fragment *frag;
13354
13355 if (!data)
13356 return -EINVAL;
13357
13358 handled = write_emultor.read_write_mmio(vcpu, gpa, bytes, data);
13359 if (handled == bytes)
13360 return 1;
13361
13362 bytes -= handled;
13363 gpa += handled;
13364 data += handled;
13365
13366 /* TODO: Check whether the number of fragments needs to be incremented. */
13367 frag = vcpu->mmio_fragments;
13368 vcpu->mmio_nr_fragments = 1;
13369 frag->len = bytes;
13370 frag->gpa = gpa;
13371 frag->data = data;
13372
13373 vcpu->mmio_needed = 1;
13374 vcpu->mmio_cur_fragment = 0;
13375
13376 vcpu->run->mmio.phys_addr = gpa;
13377 vcpu->run->mmio.len = min(8u, frag->len);
13378 vcpu->run->mmio.is_write = 1;
13379 memcpy(vcpu->run->mmio.data, frag->data, min(8u, frag->len));
13380 vcpu->run->exit_reason = KVM_EXIT_MMIO;
13381
13382 vcpu->arch.complete_userspace_io = complete_sev_es_emulated_mmio;
13383
13384 return 0;
13385}
13386EXPORT_SYMBOL_GPL(kvm_sev_es_mmio_write);
13387
13388int kvm_sev_es_mmio_read(struct kvm_vcpu *vcpu, gpa_t gpa, unsigned int bytes,
13389 void *data)
13390{
13391 int handled;
13392 struct kvm_mmio_fragment *frag;
13393
13394 if (!data)
13395 return -EINVAL;
13396
13397 handled = read_emultor.read_write_mmio(vcpu, gpa, bytes, data);
13398 if (handled == bytes)
13399 return 1;
13400
13401 bytes -= handled;
13402 gpa += handled;
13403 data += handled;
13404
13405 /* TODO: Check whether the number of fragments needs to be incremented. */
13406 frag = vcpu->mmio_fragments;
13407 vcpu->mmio_nr_fragments = 1;
13408 frag->len = bytes;
13409 frag->gpa = gpa;
13410 frag->data = data;
13411
13412 vcpu->mmio_needed = 1;
13413 vcpu->mmio_cur_fragment = 0;
13414
13415 vcpu->run->mmio.phys_addr = gpa;
13416 vcpu->run->mmio.len = min(8u, frag->len);
13417 vcpu->run->mmio.is_write = 0;
13418 vcpu->run->exit_reason = KVM_EXIT_MMIO;
13419
13420 vcpu->arch.complete_userspace_io = complete_sev_es_emulated_mmio;
13421
13422 return 0;
13423}
13424EXPORT_SYMBOL_GPL(kvm_sev_es_mmio_read);
13425
db209369
PB
13426static void advance_sev_es_emulated_pio(struct kvm_vcpu *vcpu, unsigned count, int size)
13427{
13428 vcpu->arch.sev_pio_count -= count;
13429 vcpu->arch.sev_pio_data += count * size;
13430}
13431
7ed9abfe 13432static int kvm_sev_es_outs(struct kvm_vcpu *vcpu, unsigned int size,
95e16b47
PB
13433 unsigned int port);
13434
13435static int complete_sev_es_emulated_outs(struct kvm_vcpu *vcpu)
7ed9abfe 13436{
95e16b47
PB
13437 int size = vcpu->arch.pio.size;
13438 int port = vcpu->arch.pio.port;
13439
13440 vcpu->arch.pio.count = 0;
13441 if (vcpu->arch.sev_pio_count)
13442 return kvm_sev_es_outs(vcpu, size, port);
13443 return 1;
13444}
13445
13446static int kvm_sev_es_outs(struct kvm_vcpu *vcpu, unsigned int size,
13447 unsigned int port)
13448{
13449 for (;;) {
13450 unsigned int count =
13451 min_t(unsigned int, PAGE_SIZE / size, vcpu->arch.sev_pio_count);
13452 int ret = emulator_pio_out(vcpu, size, port, vcpu->arch.sev_pio_data, count);
13453
13454 /* memcpy done already by emulator_pio_out. */
db209369 13455 advance_sev_es_emulated_pio(vcpu, count, size);
95e16b47
PB
13456 if (!ret)
13457 break;
7ed9abfe 13458
ea724ea4 13459 /* Emulation done by the kernel. */
95e16b47
PB
13460 if (!vcpu->arch.sev_pio_count)
13461 return 1;
ea724ea4 13462 }
7ed9abfe 13463
95e16b47 13464 vcpu->arch.complete_userspace_io = complete_sev_es_emulated_outs;
7ed9abfe
TL
13465 return 0;
13466}
13467
95e16b47
PB
13468static int kvm_sev_es_ins(struct kvm_vcpu *vcpu, unsigned int size,
13469 unsigned int port);
13470
4fa4b38d
PB
13471static int complete_sev_es_emulated_ins(struct kvm_vcpu *vcpu)
13472{
0c05e10b 13473 unsigned count = vcpu->arch.pio.count;
95e16b47
PB
13474 int size = vcpu->arch.pio.size;
13475 int port = vcpu->arch.pio.port;
4fa4b38d 13476
0c05e10b 13477 complete_emulator_pio_in(vcpu, vcpu->arch.sev_pio_data);
db209369 13478 advance_sev_es_emulated_pio(vcpu, count, size);
95e16b47
PB
13479 if (vcpu->arch.sev_pio_count)
13480 return kvm_sev_es_ins(vcpu, size, port);
4fa4b38d
PB
13481 return 1;
13482}
13483
7ed9abfe 13484static int kvm_sev_es_ins(struct kvm_vcpu *vcpu, unsigned int size,
95e16b47 13485 unsigned int port)
7ed9abfe 13486{
95e16b47
PB
13487 for (;;) {
13488 unsigned int count =
13489 min_t(unsigned int, PAGE_SIZE / size, vcpu->arch.sev_pio_count);
f35cee4a 13490 if (!emulator_pio_in(vcpu, size, port, vcpu->arch.sev_pio_data, count))
95e16b47 13491 break;
7ed9abfe 13492
ea724ea4 13493 /* Emulation done by the kernel. */
db209369 13494 advance_sev_es_emulated_pio(vcpu, count, size);
95e16b47
PB
13495 if (!vcpu->arch.sev_pio_count)
13496 return 1;
7ed9abfe
TL
13497 }
13498
ea724ea4 13499 vcpu->arch.complete_userspace_io = complete_sev_es_emulated_ins;
7ed9abfe
TL
13500 return 0;
13501}
13502
13503int kvm_sev_es_string_io(struct kvm_vcpu *vcpu, unsigned int size,
13504 unsigned int port, void *data, unsigned int count,
13505 int in)
13506{
ea724ea4 13507 vcpu->arch.sev_pio_data = data;
95e16b47
PB
13508 vcpu->arch.sev_pio_count = count;
13509 return in ? kvm_sev_es_ins(vcpu, size, port)
13510 : kvm_sev_es_outs(vcpu, size, port);
7ed9abfe
TL
13511}
13512EXPORT_SYMBOL_GPL(kvm_sev_es_string_io);
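SEV-ES string I/O is likewise processed in bursts of at most PAGE_SIZE / size elements per iteration, because the internal PIO scratch buffer is a single page. A small sketch of that per-iteration chunk computation; PAGE_SIZE_SKETCH and the OUTS parameters below are assumptions for illustration:

/* sketch: per-iteration element count used by kvm_sev_es_outs()/kvm_sev_es_ins() */
#include <stdio.h>

#define PAGE_SIZE_SKETCH 4096u	/* assumption: 4 KiB pages */

static unsigned int pio_chunk(unsigned int size, unsigned int remaining)
{
	unsigned int per_page = PAGE_SIZE_SKETCH / size;

	return remaining < per_page ? remaining : per_page;	/* min_t() */
}

int main(void)
{
	unsigned int size = 2, count = 5000;	/* made-up 16-bit OUTS of 5000 elements */

	while (count) {
		unsigned int chunk = pio_chunk(size, count);

		printf("emulate %u elements, %u left\n", chunk, count - chunk);
		count -= chunk;
	}
	return 0;
}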
13513
d95df951 13514EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_entry);
229456fc 13515EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit);
931c33b1 13516EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_fast_mmio);
229456fc
MT
13517EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq);
13518EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_page_fault);
13519EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_msr);
13520EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_cr);
89e54ec5 13521EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmenter);
d8cabddf 13522EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmexit);
17897f36 13523EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmexit_inject);
236649de 13524EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intr_vmexit);
5497b955 13525EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmenter_failed);
ec1ff790 13526EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_invlpga);
532a46b9 13527EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_skinit);
2e554e8d 13528EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intercepts);
489223ed 13529EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_write_tsc_offset);
4f75bcc3 13530EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_ple_window_update);
843e4330 13531EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pml_full);
efc64404 13532EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pi_irte_update);
18f40c53
SS
13533EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_unaccelerated_access);
13534EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_incomplete_ipi);
ab56f8e6 13535EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_ga_log);
9f084f7c 13536EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_kick_vcpu_slowpath);
39b6b8c3 13537EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_doorbell);
8e819d75 13538EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_apicv_accept_irq);
d523ab6b
TL
13539EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_enter);
13540EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_exit);
59e38b58
TL
13541EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_msr_protocol_enter);
13542EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_msr_protocol_exit);
1d0e8480
SC
13543
13544static int __init kvm_x86_init(void)
13545{
13546 kvm_mmu_x86_module_init();
6f0f2d5e 13547 mitigate_smt_rsb &= boot_cpu_has_bug(X86_BUG_SMT_RSB) && cpu_smt_possible();
1d0e8480
SC
13548 return 0;
13549}
13550module_init(kvm_x86_init);
13551
13552static void __exit kvm_x86_exit(void)
13553{
13554 /*
13555 * If module_init() is implemented, module_exit() must also be
13556 * implemented to allow module unload.
13557 */
13558}
13559module_exit(kvm_x86_exit);