KVM: VMX: Use vmx_get_rflags() to query RFLAGS in vmx_interrupt_blocked()
[linux-block.git] arch/x86/kvm/x86.c
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Kernel-based Virtual Machine driver for Linux
 *
 * derived from drivers/kvm/kvm_main.c
 *
 * Copyright (C) 2006 Qumranet, Inc.
 * Copyright (C) 2008 Qumranet, Inc.
 * Copyright IBM Corporation, 2008
 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
 *
 * Authors:
 *   Avi Kivity   <avi@qumranet.com>
 *   Yaniv Kamay  <yaniv@qumranet.com>
 *   Amit Shah    <amit.shah@qumranet.com>
 *   Ben-Ami Yassour <benami@il.ibm.com>
 */

#include <linux/kvm_host.h>
#include "irq.h"
#include "mmu.h"
#include "i8254.h"
#include "tss.h"
#include "kvm_cache_regs.h"
#include "kvm_emulate.h"
#include "x86.h"
#include "cpuid.h"
#include "pmu.h"
#include "hyperv.h"
#include "lapic.h"

#include <linux/clocksource.h>
#include <linux/interrupt.h>
#include <linux/kvm.h>
#include <linux/fs.h>
#include <linux/vmalloc.h>
#include <linux/export.h>
#include <linux/moduleparam.h>
#include <linux/mman.h>
#include <linux/highmem.h>
#include <linux/iommu.h>
#include <linux/intel-iommu.h>
#include <linux/cpufreq.h>
#include <linux/user-return-notifier.h>
#include <linux/srcu.h>
#include <linux/slab.h>
#include <linux/perf_event.h>
#include <linux/uaccess.h>
#include <linux/hash.h>
#include <linux/pci.h>
#include <linux/timekeeper_internal.h>
#include <linux/pvclock_gtod.h>
#include <linux/kvm_irqfd.h>
#include <linux/irqbypass.h>
#include <linux/sched/stat.h>
#include <linux/sched/isolation.h>
#include <linux/mem_encrypt.h>

#include <trace/events/kvm.h>

#include <asm/debugreg.h>
#include <asm/msr.h>
#include <asm/desc.h>
#include <asm/mce.h>
#include <linux/kernel_stat.h>
#include <asm/fpu/internal.h> /* Ugh! */
#include <asm/pvclock.h>
#include <asm/div64.h>
#include <asm/irq_remapping.h>
#include <asm/mshyperv.h>
#include <asm/hypervisor.h>
#include <asm/intel_pt.h>
#include <asm/emulate_prefix.h>
#include <clocksource/hyperv_timer.h>

#define CREATE_TRACE_POINTS
#include "trace.h"

#define MAX_IO_MSRS 256
#define KVM_MAX_MCE_BANKS 32
u64 __read_mostly kvm_mce_cap_supported = MCG_CTL_P | MCG_SER_P;
EXPORT_SYMBOL_GPL(kvm_mce_cap_supported);

#define emul_to_vcpu(ctxt) \
	((struct kvm_vcpu *)(ctxt)->vcpu)

/* EFER defaults:
 * - enable syscall per default because it's emulated by KVM
 * - enable LME and LMA per default on 64 bit KVM
 */
#ifdef CONFIG_X86_64
static
u64 __read_mostly efer_reserved_bits = ~((u64)(EFER_SCE | EFER_LME | EFER_LMA));
#else
static u64 __read_mostly efer_reserved_bits = ~((u64)EFER_SCE);
#endif

static u64 __read_mostly cr4_reserved_bits = CR4_RESERVED_BITS;

#define KVM_X2APIC_API_VALID_FLAGS (KVM_X2APIC_API_USE_32BIT_IDS | \
				    KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK)

static void update_cr8_intercept(struct kvm_vcpu *vcpu);
static void process_nmi(struct kvm_vcpu *vcpu);
static void enter_smm(struct kvm_vcpu *vcpu);
static void __kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags);
static void store_regs(struct kvm_vcpu *vcpu);
static int sync_regs(struct kvm_vcpu *vcpu);

struct kvm_x86_ops kvm_x86_ops __read_mostly;
EXPORT_SYMBOL_GPL(kvm_x86_ops);

static bool __read_mostly ignore_msrs = 0;
module_param(ignore_msrs, bool, S_IRUGO | S_IWUSR);

static bool __read_mostly report_ignored_msrs = true;
module_param(report_ignored_msrs, bool, S_IRUGO | S_IWUSR);

unsigned int min_timer_period_us = 200;
module_param(min_timer_period_us, uint, S_IRUGO | S_IWUSR);

static bool __read_mostly kvmclock_periodic_sync = true;
module_param(kvmclock_periodic_sync, bool, S_IRUGO);

bool __read_mostly kvm_has_tsc_control;
EXPORT_SYMBOL_GPL(kvm_has_tsc_control);
u32 __read_mostly kvm_max_guest_tsc_khz;
EXPORT_SYMBOL_GPL(kvm_max_guest_tsc_khz);
u8 __read_mostly kvm_tsc_scaling_ratio_frac_bits;
EXPORT_SYMBOL_GPL(kvm_tsc_scaling_ratio_frac_bits);
u64 __read_mostly kvm_max_tsc_scaling_ratio;
EXPORT_SYMBOL_GPL(kvm_max_tsc_scaling_ratio);
u64 __read_mostly kvm_default_tsc_scaling_ratio;
EXPORT_SYMBOL_GPL(kvm_default_tsc_scaling_ratio);

/* tsc tolerance in parts per million - default to 1/2 of the NTP threshold */
static u32 __read_mostly tsc_tolerance_ppm = 250;
module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR);

/*
 * lapic timer advance (tscdeadline mode only) in nanoseconds. '-1' enables
 * adaptive tuning starting from default advancement of 1000ns. '0' disables
 * advancement entirely. Any other value is used as-is and disables adaptive
 * tuning, i.e. allows privileged userspace to set an exact advancement time.
 */
static int __read_mostly lapic_timer_advance_ns = -1;
module_param(lapic_timer_advance_ns, int, S_IRUGO | S_IWUSR);

static bool __read_mostly vector_hashing = true;
module_param(vector_hashing, bool, S_IRUGO);

bool __read_mostly enable_vmware_backdoor = false;
module_param(enable_vmware_backdoor, bool, S_IRUGO);
EXPORT_SYMBOL_GPL(enable_vmware_backdoor);

static bool __read_mostly force_emulation_prefix = false;
module_param(force_emulation_prefix, bool, S_IRUGO);

int __read_mostly pi_inject_timer = -1;
module_param(pi_inject_timer, bint, S_IRUGO | S_IWUSR);

#define KVM_NR_SHARED_MSRS 16

struct kvm_shared_msrs_global {
	int nr;
	u32 msrs[KVM_NR_SHARED_MSRS];
};

struct kvm_shared_msrs {
	struct user_return_notifier urn;
	bool registered;
	struct kvm_shared_msr_values {
		u64 host;
		u64 curr;
	} values[KVM_NR_SHARED_MSRS];
};

static struct kvm_shared_msrs_global __read_mostly shared_msrs_global;
static struct kvm_shared_msrs __percpu *shared_msrs;

#define KVM_SUPPORTED_XCR0	(XFEATURE_MASK_FP | XFEATURE_MASK_SSE \
				| XFEATURE_MASK_YMM | XFEATURE_MASK_BNDREGS \
				| XFEATURE_MASK_BNDCSR | XFEATURE_MASK_AVX512 \
				| XFEATURE_MASK_PKRU)

u64 __read_mostly host_efer;
EXPORT_SYMBOL_GPL(host_efer);

static u64 __read_mostly host_xss;
u64 __read_mostly supported_xss;
EXPORT_SYMBOL_GPL(supported_xss);

struct kvm_stats_debugfs_item debugfs_entries[] = {
	VCPU_STAT("pf_fixed", pf_fixed),
	VCPU_STAT("pf_guest", pf_guest),
	VCPU_STAT("tlb_flush", tlb_flush),
	VCPU_STAT("invlpg", invlpg),
	VCPU_STAT("exits", exits),
	VCPU_STAT("io_exits", io_exits),
	VCPU_STAT("mmio_exits", mmio_exits),
	VCPU_STAT("signal_exits", signal_exits),
	VCPU_STAT("irq_window", irq_window_exits),
	VCPU_STAT("nmi_window", nmi_window_exits),
	VCPU_STAT("halt_exits", halt_exits),
	VCPU_STAT("halt_successful_poll", halt_successful_poll),
	VCPU_STAT("halt_attempted_poll", halt_attempted_poll),
	VCPU_STAT("halt_poll_invalid", halt_poll_invalid),
	VCPU_STAT("halt_wakeup", halt_wakeup),
	VCPU_STAT("hypercalls", hypercalls),
	VCPU_STAT("request_irq", request_irq_exits),
	VCPU_STAT("irq_exits", irq_exits),
	VCPU_STAT("host_state_reload", host_state_reload),
	VCPU_STAT("fpu_reload", fpu_reload),
	VCPU_STAT("insn_emulation", insn_emulation),
	VCPU_STAT("insn_emulation_fail", insn_emulation_fail),
	VCPU_STAT("irq_injections", irq_injections),
	VCPU_STAT("nmi_injections", nmi_injections),
	VCPU_STAT("req_event", req_event),
	VCPU_STAT("l1d_flush", l1d_flush),
	VM_STAT("mmu_shadow_zapped", mmu_shadow_zapped),
	VM_STAT("mmu_pte_write", mmu_pte_write),
	VM_STAT("mmu_pte_updated", mmu_pte_updated),
	VM_STAT("mmu_pde_zapped", mmu_pde_zapped),
	VM_STAT("mmu_flooded", mmu_flooded),
	VM_STAT("mmu_recycled", mmu_recycled),
	VM_STAT("mmu_cache_miss", mmu_cache_miss),
	VM_STAT("mmu_unsync", mmu_unsync),
	VM_STAT("remote_tlb_flush", remote_tlb_flush),
	VM_STAT("largepages", lpages, .mode = 0444),
	VM_STAT("nx_largepages_splitted", nx_lpage_splits, .mode = 0444),
	VM_STAT("max_mmu_page_hash_collisions", max_mmu_page_hash_collisions),
	{ NULL }
};

u64 __read_mostly host_xcr0;
u64 __read_mostly supported_xcr0;
EXPORT_SYMBOL_GPL(supported_xcr0);

struct kmem_cache *x86_fpu_cache;
EXPORT_SYMBOL_GPL(x86_fpu_cache);

static struct kmem_cache *x86_emulator_cache;

static struct kmem_cache *kvm_alloc_emulator_cache(void)
{
	unsigned int useroffset = offsetof(struct x86_emulate_ctxt, src);
	unsigned int size = sizeof(struct x86_emulate_ctxt);

	return kmem_cache_create_usercopy("x86_emulator", size,
					  __alignof__(struct x86_emulate_ctxt),
					  SLAB_ACCOUNT, useroffset,
					  size - useroffset, NULL);
}

static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt);

static inline void kvm_async_pf_hash_reset(struct kvm_vcpu *vcpu)
{
	int i;
	for (i = 0; i < roundup_pow_of_two(ASYNC_PF_PER_VCPU); i++)
		vcpu->arch.apf.gfns[i] = ~0;
}

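/*
 * Shared ("user return") MSRs are restored to their host values via a
 * user_return_notifier the first time the CPU returns to userspace after
 * a vCPU has clobbered them, rather than on every vmexit.
 */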
static void kvm_on_user_return(struct user_return_notifier *urn)
{
	unsigned slot;
	struct kvm_shared_msrs *locals
		= container_of(urn, struct kvm_shared_msrs, urn);
	struct kvm_shared_msr_values *values;
	unsigned long flags;

	/*
	 * Disabling irqs at this point since the following code could be
	 * interrupted and executed through kvm_arch_hardware_disable()
	 */
	local_irq_save(flags);
	if (locals->registered) {
		locals->registered = false;
		user_return_notifier_unregister(urn);
	}
	local_irq_restore(flags);
	for (slot = 0; slot < shared_msrs_global.nr; ++slot) {
		values = &locals->values[slot];
		if (values->host != values->curr) {
			wrmsrl(shared_msrs_global.msrs[slot], values->host);
			values->curr = values->host;
		}
	}
}

void kvm_define_shared_msr(unsigned slot, u32 msr)
{
	BUG_ON(slot >= KVM_NR_SHARED_MSRS);
	shared_msrs_global.msrs[slot] = msr;
	if (slot >= shared_msrs_global.nr)
		shared_msrs_global.nr = slot + 1;
}
EXPORT_SYMBOL_GPL(kvm_define_shared_msr);

static void kvm_shared_msr_cpu_online(void)
{
	unsigned int cpu = smp_processor_id();
	struct kvm_shared_msrs *smsr = per_cpu_ptr(shared_msrs, cpu);
	u64 value;
	int i;

	for (i = 0; i < shared_msrs_global.nr; ++i) {
		rdmsrl_safe(shared_msrs_global.msrs[i], &value);
		smsr->values[i].host = value;
		smsr->values[i].curr = value;
	}
}

int kvm_set_shared_msr(unsigned slot, u64 value, u64 mask)
{
	unsigned int cpu = smp_processor_id();
	struct kvm_shared_msrs *smsr = per_cpu_ptr(shared_msrs, cpu);
	int err;

	value = (value & mask) | (smsr->values[slot].host & ~mask);
	if (value == smsr->values[slot].curr)
		return 0;
	err = wrmsrl_safe(shared_msrs_global.msrs[slot], value);
	if (err)
		return 1;

	smsr->values[slot].curr = value;
	if (!smsr->registered) {
		smsr->urn.on_user_return = kvm_on_user_return;
		user_return_notifier_register(&smsr->urn);
		smsr->registered = true;
	}
	return 0;
}
EXPORT_SYMBOL_GPL(kvm_set_shared_msr);

static void drop_user_return_notifiers(void)
{
	unsigned int cpu = smp_processor_id();
	struct kvm_shared_msrs *smsr = per_cpu_ptr(shared_msrs, cpu);

	if (smsr->registered)
		kvm_on_user_return(&smsr->urn);
}

u64 kvm_get_apic_base(struct kvm_vcpu *vcpu)
{
	return vcpu->arch.apic_base;
}
EXPORT_SYMBOL_GPL(kvm_get_apic_base);

enum lapic_mode kvm_get_apic_mode(struct kvm_vcpu *vcpu)
{
	return kvm_apic_mode(kvm_get_apic_base(vcpu));
}
EXPORT_SYMBOL_GPL(kvm_get_apic_mode);

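/*
 * Guest writes to the APIC base MSR may not skip mode transitions, e.g.
 * x2APIC must be disabled (back to xAPIC) before the APIC can be disabled
 * and cannot be entered directly from the disabled state; host-initiated
 * writes (e.g. state restore) are exempt from these checks.
 */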
int kvm_set_apic_base(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
{
	enum lapic_mode old_mode = kvm_get_apic_mode(vcpu);
	enum lapic_mode new_mode = kvm_apic_mode(msr_info->data);
	u64 reserved_bits = ((~0ULL) << cpuid_maxphyaddr(vcpu)) | 0x2ff |
		(guest_cpuid_has(vcpu, X86_FEATURE_X2APIC) ? 0 : X2APIC_ENABLE);

	if ((msr_info->data & reserved_bits) != 0 || new_mode == LAPIC_MODE_INVALID)
		return 1;
	if (!msr_info->host_initiated) {
		if (old_mode == LAPIC_MODE_X2APIC && new_mode == LAPIC_MODE_XAPIC)
			return 1;
		if (old_mode == LAPIC_MODE_DISABLED && new_mode == LAPIC_MODE_X2APIC)
			return 1;
	}

	kvm_lapic_set_base(vcpu, msr_info->data);
	kvm_recalculate_apic_map(vcpu->kvm);
	return 0;
}
EXPORT_SYMBOL_GPL(kvm_set_apic_base);

asmlinkage __visible void kvm_spurious_fault(void)
{
	/* Fault while not rebooting.  We want the trace. */
	BUG_ON(!kvm_rebooting);
}
EXPORT_SYMBOL_GPL(kvm_spurious_fault);

#define EXCPT_BENIGN		0
#define EXCPT_CONTRIBUTORY	1
#define EXCPT_PF		2

static int exception_class(int vector)
{
	switch (vector) {
	case PF_VECTOR:
		return EXCPT_PF;
	case DE_VECTOR:
	case TS_VECTOR:
	case NP_VECTOR:
	case SS_VECTOR:
	case GP_VECTOR:
		return EXCPT_CONTRIBUTORY;
	default:
		break;
	}
	return EXCPT_BENIGN;
}

#define EXCPT_FAULT		0
#define EXCPT_TRAP		1
#define EXCPT_ABORT		2
#define EXCPT_INTERRUPT		3

static int exception_type(int vector)
{
	unsigned int mask;

	if (WARN_ON(vector > 31 || vector == NMI_VECTOR))
		return EXCPT_INTERRUPT;

	mask = 1 << vector;

	/* #DB is trap, as instruction watchpoints are handled elsewhere */
	if (mask & ((1 << DB_VECTOR) | (1 << BP_VECTOR) | (1 << OF_VECTOR)))
		return EXCPT_TRAP;

	if (mask & ((1 << DF_VECTOR) | (1 << MC_VECTOR)))
		return EXCPT_ABORT;

	/* Reserved exceptions will result in fault */
	return EXCPT_FAULT;
}

void kvm_deliver_exception_payload(struct kvm_vcpu *vcpu)
{
	unsigned nr = vcpu->arch.exception.nr;
	bool has_payload = vcpu->arch.exception.has_payload;
	unsigned long payload = vcpu->arch.exception.payload;

	if (!has_payload)
		return;

	switch (nr) {
	case DB_VECTOR:
		/*
		 * "Certain debug exceptions may clear bit 0-3.  The
		 * remaining contents of the DR6 register are never
		 * cleared by the processor".
		 */
		vcpu->arch.dr6 &= ~DR_TRAP_BITS;
		/*
		 * DR6.RTM is set by all #DB exceptions that don't clear it.
		 */
		vcpu->arch.dr6 |= DR6_RTM;
		vcpu->arch.dr6 |= payload;
		/*
		 * Bit 16 should be set in the payload whenever the #DB
		 * exception should clear DR6.RTM. This makes the payload
		 * compatible with the pending debug exceptions under VMX.
		 * Though not currently documented in the SDM, this also
		 * makes the payload compatible with the exit qualification
		 * for #DB exceptions under VMX.
		 */
		vcpu->arch.dr6 ^= payload & DR6_RTM;

		/*
		 * The #DB payload is defined as compatible with the 'pending
		 * debug exceptions' field under VMX, not DR6. While bit 12 is
		 * defined in the 'pending debug exceptions' field (enabled
		 * breakpoint), it is reserved and must be zero in DR6.
		 */
		vcpu->arch.dr6 &= ~BIT(12);
		break;
	case PF_VECTOR:
		vcpu->arch.cr2 = payload;
		break;
	}

	vcpu->arch.exception.has_payload = false;
	vcpu->arch.exception.payload = 0;
}
EXPORT_SYMBOL_GPL(kvm_deliver_exception_payload);

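/*
 * Queue exception @nr, merging it with any exception that is already
 * pending: benign exceptions simply replace the previous one, two
 * contributory exceptions (or a #PF followed by a non-benign exception)
 * escalate to #DF, and a pending #DF escalates to a triple-fault
 * shutdown, following the SDM's exception classification tables.
 */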
static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
		unsigned nr, bool has_error, u32 error_code,
		bool has_payload, unsigned long payload, bool reinject)
{
	u32 prev_nr;
	int class1, class2;

	kvm_make_request(KVM_REQ_EVENT, vcpu);

	if (!vcpu->arch.exception.pending && !vcpu->arch.exception.injected) {
	queue:
		if (has_error && !is_protmode(vcpu))
			has_error = false;
		if (reinject) {
			/*
			 * On vmentry, vcpu->arch.exception.pending is only
			 * true if an event injection was blocked by
			 * nested_run_pending.  In that case, however,
			 * vcpu_enter_guest requests an immediate exit,
			 * and the guest shouldn't proceed far enough to
			 * need reinjection.
			 */
			WARN_ON_ONCE(vcpu->arch.exception.pending);
			vcpu->arch.exception.injected = true;
			if (WARN_ON_ONCE(has_payload)) {
				/*
				 * A reinjected event has already
				 * delivered its payload.
				 */
				has_payload = false;
				payload = 0;
			}
		} else {
			vcpu->arch.exception.pending = true;
			vcpu->arch.exception.injected = false;
		}
		vcpu->arch.exception.has_error_code = has_error;
		vcpu->arch.exception.nr = nr;
		vcpu->arch.exception.error_code = error_code;
		vcpu->arch.exception.has_payload = has_payload;
		vcpu->arch.exception.payload = payload;
		if (!is_guest_mode(vcpu))
			kvm_deliver_exception_payload(vcpu);
		return;
	}

	/* to check exception */
	prev_nr = vcpu->arch.exception.nr;
	if (prev_nr == DF_VECTOR) {
		/* triple fault -> shutdown */
		kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
		return;
	}
	class1 = exception_class(prev_nr);
	class2 = exception_class(nr);
	if ((class1 == EXCPT_CONTRIBUTORY && class2 == EXCPT_CONTRIBUTORY)
		|| (class1 == EXCPT_PF && class2 != EXCPT_BENIGN)) {
		/*
		 * Generate double fault per SDM Table 5-5.  Set
		 * exception.pending = true so that the double fault
		 * can trigger a nested vmexit.
		 */
		vcpu->arch.exception.pending = true;
		vcpu->arch.exception.injected = false;
		vcpu->arch.exception.has_error_code = true;
		vcpu->arch.exception.nr = DF_VECTOR;
		vcpu->arch.exception.error_code = 0;
		vcpu->arch.exception.has_payload = false;
		vcpu->arch.exception.payload = 0;
	} else
		/* replace previous exception with a new one in a hope
		   that instruction re-execution will regenerate lost
		   exception */
		goto queue;
}

void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr)
{
	kvm_multiple_exception(vcpu, nr, false, 0, false, 0, false);
}
EXPORT_SYMBOL_GPL(kvm_queue_exception);

void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr)
{
	kvm_multiple_exception(vcpu, nr, false, 0, false, 0, true);
}
EXPORT_SYMBOL_GPL(kvm_requeue_exception);

void kvm_queue_exception_p(struct kvm_vcpu *vcpu, unsigned nr,
			   unsigned long payload)
{
	kvm_multiple_exception(vcpu, nr, false, 0, true, payload, false);
}
EXPORT_SYMBOL_GPL(kvm_queue_exception_p);

static void kvm_queue_exception_e_p(struct kvm_vcpu *vcpu, unsigned nr,
				    u32 error_code, unsigned long payload)
{
	kvm_multiple_exception(vcpu, nr, true, error_code,
			       true, payload, false);
}

int kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err)
{
	if (err)
		kvm_inject_gp(vcpu, 0);
	else
		return kvm_skip_emulated_instruction(vcpu);

	return 1;
}
EXPORT_SYMBOL_GPL(kvm_complete_insn_gp);

void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
{
	++vcpu->stat.pf_guest;
	vcpu->arch.exception.nested_apf =
		is_guest_mode(vcpu) && fault->async_page_fault;
	if (vcpu->arch.exception.nested_apf) {
		vcpu->arch.apf.nested_apf_token = fault->address;
		kvm_queue_exception_e(vcpu, PF_VECTOR, fault->error_code);
	} else {
		kvm_queue_exception_e_p(vcpu, PF_VECTOR, fault->error_code,
					fault->address);
	}
}
EXPORT_SYMBOL_GPL(kvm_inject_page_fault);

bool kvm_inject_emulated_page_fault(struct kvm_vcpu *vcpu,
				    struct x86_exception *fault)
{
	struct kvm_mmu *fault_mmu;
	WARN_ON_ONCE(fault->vector != PF_VECTOR);

	fault_mmu = fault->nested_page_fault ? vcpu->arch.mmu :
					       vcpu->arch.walk_mmu;

	/*
	 * Invalidate the TLB entry for the faulting address, if it exists,
	 * else the access will fault indefinitely (and to emulate hardware).
	 */
	if ((fault->error_code & PFERR_PRESENT_MASK) &&
	    !(fault->error_code & PFERR_RSVD_MASK))
		kvm_mmu_invalidate_gva(vcpu, fault_mmu, fault->address,
				       fault_mmu->root_hpa);

	fault_mmu->inject_page_fault(vcpu, fault);
	return fault->nested_page_fault;
}
EXPORT_SYMBOL_GPL(kvm_inject_emulated_page_fault);

void kvm_inject_nmi(struct kvm_vcpu *vcpu)
{
	atomic_inc(&vcpu->arch.nmi_queued);
	kvm_make_request(KVM_REQ_NMI, vcpu);
}
EXPORT_SYMBOL_GPL(kvm_inject_nmi);

void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
{
	kvm_multiple_exception(vcpu, nr, true, error_code, false, 0, false);
}
EXPORT_SYMBOL_GPL(kvm_queue_exception_e);

void kvm_requeue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
{
	kvm_multiple_exception(vcpu, nr, true, error_code, false, 0, true);
}
EXPORT_SYMBOL_GPL(kvm_requeue_exception_e);

/*
 * Checks if cpl <= required_cpl; if true, return true.  Otherwise queue
 * a #GP and return false.
 */
bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl)
{
	if (kvm_x86_ops.get_cpl(vcpu) <= required_cpl)
		return true;
	kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
	return false;
}
EXPORT_SYMBOL_GPL(kvm_require_cpl);

bool kvm_require_dr(struct kvm_vcpu *vcpu, int dr)
{
	if ((dr != 4 && dr != 5) || !kvm_read_cr4_bits(vcpu, X86_CR4_DE))
		return true;

	kvm_queue_exception(vcpu, UD_VECTOR);
	return false;
}
EXPORT_SYMBOL_GPL(kvm_require_dr);

/*
 * This function will be used to read from the physical memory of the currently
 * running guest. The difference to kvm_vcpu_read_guest_page is that this function
 * can read from guest physical or from the guest's guest physical memory.
 */
int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
			    gfn_t ngfn, void *data, int offset, int len,
			    u32 access)
{
	struct x86_exception exception;
	gfn_t real_gfn;
	gpa_t ngpa;

	ngpa     = gfn_to_gpa(ngfn);
	real_gfn = mmu->translate_gpa(vcpu, ngpa, access, &exception);
	if (real_gfn == UNMAPPED_GVA)
		return -EFAULT;

	real_gfn = gpa_to_gfn(real_gfn);

	return kvm_vcpu_read_guest_page(vcpu, real_gfn, data, offset, len);
}
EXPORT_SYMBOL_GPL(kvm_read_guest_page_mmu);

static int kvm_read_nested_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn,
				      void *data, int offset, int len, u32 access)
{
	return kvm_read_guest_page_mmu(vcpu, vcpu->arch.walk_mmu, gfn,
				       data, offset, len, access);
}

static inline u64 pdptr_rsvd_bits(struct kvm_vcpu *vcpu)
{
	return rsvd_bits(cpuid_maxphyaddr(vcpu), 63) | rsvd_bits(5, 8) |
	       rsvd_bits(1, 2);
}

/*
 * Load the pae pdptrs.  Return 1 if they are all valid, 0 otherwise.
 */
int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3)
{
	gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT;
	unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2;
	int i;
	int ret;
	u64 pdpte[ARRAY_SIZE(mmu->pdptrs)];

	ret = kvm_read_guest_page_mmu(vcpu, mmu, pdpt_gfn, pdpte,
				      offset * sizeof(u64), sizeof(pdpte),
				      PFERR_USER_MASK|PFERR_WRITE_MASK);
	if (ret < 0) {
		ret = 0;
		goto out;
	}
	for (i = 0; i < ARRAY_SIZE(pdpte); ++i) {
		if ((pdpte[i] & PT_PRESENT_MASK) &&
		    (pdpte[i] & pdptr_rsvd_bits(vcpu))) {
			ret = 0;
			goto out;
		}
	}
	ret = 1;

	memcpy(mmu->pdptrs, pdpte, sizeof(mmu->pdptrs));
	kvm_register_mark_dirty(vcpu, VCPU_EXREG_PDPTR);

out:

	return ret;
}
EXPORT_SYMBOL_GPL(load_pdptrs);

bool pdptrs_changed(struct kvm_vcpu *vcpu)
{
	u64 pdpte[ARRAY_SIZE(vcpu->arch.walk_mmu->pdptrs)];
	int offset;
	gfn_t gfn;
	int r;

	if (!is_pae_paging(vcpu))
		return false;

	if (!kvm_register_is_available(vcpu, VCPU_EXREG_PDPTR))
		return true;

	gfn = (kvm_read_cr3(vcpu) & 0xffffffe0ul) >> PAGE_SHIFT;
	offset = (kvm_read_cr3(vcpu) & 0xffffffe0ul) & (PAGE_SIZE - 1);
	r = kvm_read_nested_guest_page(vcpu, gfn, pdpte, offset, sizeof(pdpte),
				       PFERR_USER_MASK | PFERR_WRITE_MASK);
	if (r < 0)
		return true;

	return memcmp(pdpte, vcpu->arch.walk_mmu->pdptrs, sizeof(pdpte)) != 0;
}
EXPORT_SYMBOL_GPL(pdptrs_changed);

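/*
 * Emulate a CR0 write: reject reserved or inconsistent bit combinations,
 * validate the resulting paging configuration (PAE pdptrs, long-mode
 * CS.L check), then hand the new value to the vendor module and resync
 * MMU and async-#PF state as needed.
 */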
int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
{
	unsigned long old_cr0 = kvm_read_cr0(vcpu);
	unsigned long update_bits = X86_CR0_PG | X86_CR0_WP;

	cr0 |= X86_CR0_ET;

#ifdef CONFIG_X86_64
	if (cr0 & 0xffffffff00000000UL)
		return 1;
#endif

	cr0 &= ~CR0_RESERVED_BITS;

	if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD))
		return 1;

	if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE))
		return 1;

	if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
#ifdef CONFIG_X86_64
		if ((vcpu->arch.efer & EFER_LME)) {
			int cs_db, cs_l;

			if (!is_pae(vcpu))
				return 1;
			kvm_x86_ops.get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
			if (cs_l)
				return 1;
		} else
#endif
		if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.walk_mmu,
						 kvm_read_cr3(vcpu)))
			return 1;
	}

	if (!(cr0 & X86_CR0_PG) && kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE))
		return 1;

	kvm_x86_ops.set_cr0(vcpu, cr0);

	if ((cr0 ^ old_cr0) & X86_CR0_PG) {
		kvm_clear_async_pf_completion_queue(vcpu);
		kvm_async_pf_hash_reset(vcpu);
	}

	if ((cr0 ^ old_cr0) & update_bits)
		kvm_mmu_reset_context(vcpu);

	if (((cr0 ^ old_cr0) & X86_CR0_CD) &&
	    kvm_arch_has_noncoherent_dma(vcpu->kvm) &&
	    !kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED))
		kvm_zap_gfn_range(vcpu->kvm, 0, ~0ULL);

	return 0;
}
EXPORT_SYMBOL_GPL(kvm_set_cr0);

void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
{
	(void)kvm_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~0x0eul) | (msw & 0x0f));
}
EXPORT_SYMBOL_GPL(kvm_lmsw);

void kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu)
{
	if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE)) {

		if (vcpu->arch.xcr0 != host_xcr0)
			xsetbv(XCR_XFEATURE_ENABLED_MASK, vcpu->arch.xcr0);

		if (vcpu->arch.xsaves_enabled &&
		    vcpu->arch.ia32_xss != host_xss)
			wrmsrl(MSR_IA32_XSS, vcpu->arch.ia32_xss);
	}

	if (static_cpu_has(X86_FEATURE_PKU) &&
	    (kvm_read_cr4_bits(vcpu, X86_CR4_PKE) ||
	     (vcpu->arch.xcr0 & XFEATURE_MASK_PKRU)) &&
	    vcpu->arch.pkru != vcpu->arch.host_pkru)
		__write_pkru(vcpu->arch.pkru);
}
EXPORT_SYMBOL_GPL(kvm_load_guest_xsave_state);

void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu)
{
	if (static_cpu_has(X86_FEATURE_PKU) &&
	    (kvm_read_cr4_bits(vcpu, X86_CR4_PKE) ||
	     (vcpu->arch.xcr0 & XFEATURE_MASK_PKRU))) {
		vcpu->arch.pkru = rdpkru();
		if (vcpu->arch.pkru != vcpu->arch.host_pkru)
			__write_pkru(vcpu->arch.host_pkru);
	}

	if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE)) {

		if (vcpu->arch.xcr0 != host_xcr0)
			xsetbv(XCR_XFEATURE_ENABLED_MASK, host_xcr0);

		if (vcpu->arch.xsaves_enabled &&
		    vcpu->arch.ia32_xss != host_xss)
			wrmsrl(MSR_IA32_XSS, host_xss);
	}

}
EXPORT_SYMBOL_GPL(kvm_load_host_xsave_state);

static int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
{
	u64 xcr0 = xcr;
	u64 old_xcr0 = vcpu->arch.xcr0;
	u64 valid_bits;

	/* Only support XCR_XFEATURE_ENABLED_MASK(xcr0) now  */
	if (index != XCR_XFEATURE_ENABLED_MASK)
		return 1;
	if (!(xcr0 & XFEATURE_MASK_FP))
		return 1;
	if ((xcr0 & XFEATURE_MASK_YMM) && !(xcr0 & XFEATURE_MASK_SSE))
		return 1;

	/*
	 * Do not allow the guest to set bits that we do not support
	 * saving.  However, xcr0 bit 0 is always set, even if the
	 * emulated CPU does not support XSAVE (see fx_init).
	 */
	valid_bits = vcpu->arch.guest_supported_xcr0 | XFEATURE_MASK_FP;
	if (xcr0 & ~valid_bits)
		return 1;

	if ((!(xcr0 & XFEATURE_MASK_BNDREGS)) !=
	    (!(xcr0 & XFEATURE_MASK_BNDCSR)))
		return 1;

	if (xcr0 & XFEATURE_MASK_AVX512) {
		if (!(xcr0 & XFEATURE_MASK_YMM))
			return 1;
		if ((xcr0 & XFEATURE_MASK_AVX512) != XFEATURE_MASK_AVX512)
			return 1;
	}
	vcpu->arch.xcr0 = xcr0;

	if ((xcr0 ^ old_xcr0) & XFEATURE_MASK_EXTEND)
		kvm_update_cpuid(vcpu);
	return 0;
}

int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
{
	if (kvm_x86_ops.get_cpl(vcpu) != 0 ||
	    __kvm_set_xcr(vcpu, index, xcr)) {
		kvm_inject_gp(vcpu, 0);
		return 1;
	}
	return 0;
}
EXPORT_SYMBOL_GPL(kvm_set_xcr);

#define __cr4_reserved_bits(__cpu_has, __c)		\
({							\
	u64 __reserved_bits = CR4_RESERVED_BITS;	\
							\
	if (!__cpu_has(__c, X86_FEATURE_XSAVE))		\
		__reserved_bits |= X86_CR4_OSXSAVE;	\
	if (!__cpu_has(__c, X86_FEATURE_SMEP))		\
		__reserved_bits |= X86_CR4_SMEP;	\
	if (!__cpu_has(__c, X86_FEATURE_SMAP))		\
		__reserved_bits |= X86_CR4_SMAP;	\
	if (!__cpu_has(__c, X86_FEATURE_FSGSBASE))	\
		__reserved_bits |= X86_CR4_FSGSBASE;	\
	if (!__cpu_has(__c, X86_FEATURE_PKU))		\
		__reserved_bits |= X86_CR4_PKE;		\
	if (!__cpu_has(__c, X86_FEATURE_LA57))		\
		__reserved_bits |= X86_CR4_LA57;	\
	if (!__cpu_has(__c, X86_FEATURE_UMIP))		\
		__reserved_bits |= X86_CR4_UMIP;	\
	__reserved_bits;				\
})

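/*
 * CR4 writes are checked both against bits the host can never support
 * (cr4_reserved_bits) and against bits the guest's CPUID does not
 * advertise, via __cr4_reserved_bits(guest_cpuid_has, vcpu).
 */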
static int kvm_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
{
	if (cr4 & cr4_reserved_bits)
		return -EINVAL;

	if (cr4 & __cr4_reserved_bits(guest_cpuid_has, vcpu))
		return -EINVAL;

	return 0;
}

int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
{
	unsigned long old_cr4 = kvm_read_cr4(vcpu);
	unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE |
				   X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE;

	if (kvm_valid_cr4(vcpu, cr4))
		return 1;

	if (is_long_mode(vcpu)) {
		if (!(cr4 & X86_CR4_PAE))
			return 1;
	} else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE)
		   && ((cr4 ^ old_cr4) & pdptr_bits)
		   && !load_pdptrs(vcpu, vcpu->arch.walk_mmu,
				   kvm_read_cr3(vcpu)))
		return 1;

	if ((cr4 & X86_CR4_PCIDE) && !(old_cr4 & X86_CR4_PCIDE)) {
		if (!guest_cpuid_has(vcpu, X86_FEATURE_PCID))
			return 1;

		/* PCID can not be enabled when cr3[11:0]!=000H or EFER.LMA=0 */
		if ((kvm_read_cr3(vcpu) & X86_CR3_PCID_MASK) || !is_long_mode(vcpu))
			return 1;
	}

	if (kvm_x86_ops.set_cr4(vcpu, cr4))
		return 1;

	if (((cr4 ^ old_cr4) & pdptr_bits) ||
	    (!(cr4 & X86_CR4_PCIDE) && (old_cr4 & X86_CR4_PCIDE)))
		kvm_mmu_reset_context(vcpu);

	if ((cr4 ^ old_cr4) & (X86_CR4_OSXSAVE | X86_CR4_PKE))
		kvm_update_cpuid(vcpu);

	return 0;
}
EXPORT_SYMBOL_GPL(kvm_set_cr4);

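/*
 * Emulate a CR3 write.  When PCID is enabled, bit 63 (X86_CR3_PCID_NOFLUSH)
 * requests that cached translations for the new PCID be preserved, so the
 * TLB flush is skipped when switching to the new root.
 */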
int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
{
	bool skip_tlb_flush = false;
#ifdef CONFIG_X86_64
	bool pcid_enabled = kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE);

	if (pcid_enabled) {
		skip_tlb_flush = cr3 & X86_CR3_PCID_NOFLUSH;
		cr3 &= ~X86_CR3_PCID_NOFLUSH;
	}
#endif

	if (cr3 == kvm_read_cr3(vcpu) && !pdptrs_changed(vcpu)) {
		if (!skip_tlb_flush) {
			kvm_mmu_sync_roots(vcpu);
			kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
		}
		return 0;
	}

	if (is_long_mode(vcpu) &&
	    (cr3 & rsvd_bits(cpuid_maxphyaddr(vcpu), 63)))
		return 1;
	else if (is_pae_paging(vcpu) &&
		 !load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))
		return 1;

	kvm_mmu_new_pgd(vcpu, cr3, skip_tlb_flush, skip_tlb_flush);
	vcpu->arch.cr3 = cr3;
	kvm_register_mark_available(vcpu, VCPU_EXREG_CR3);

	return 0;
}
EXPORT_SYMBOL_GPL(kvm_set_cr3);

int kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
{
	if (cr8 & CR8_RESERVED_BITS)
		return 1;
	if (lapic_in_kernel(vcpu))
		kvm_lapic_set_tpr(vcpu, cr8);
	else
		vcpu->arch.cr8 = cr8;
	return 0;
}
EXPORT_SYMBOL_GPL(kvm_set_cr8);

unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu)
{
	if (lapic_in_kernel(vcpu))
		return kvm_lapic_get_cr8(vcpu);
	else
		return vcpu->arch.cr8;
}
EXPORT_SYMBOL_GPL(kvm_get_cr8);

static void kvm_update_dr0123(struct kvm_vcpu *vcpu)
{
	int i;

	if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) {
		for (i = 0; i < KVM_NR_DB_REGS; i++)
			vcpu->arch.eff_db[i] = vcpu->arch.db[i];
		vcpu->arch.switch_db_regs |= KVM_DEBUGREG_RELOAD;
	}
}

static void kvm_update_dr7(struct kvm_vcpu *vcpu)
{
	unsigned long dr7;

	if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
		dr7 = vcpu->arch.guest_debug_dr7;
	else
		dr7 = vcpu->arch.dr7;
	kvm_x86_ops.set_dr7(vcpu, dr7);
	vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_BP_ENABLED;
	if (dr7 & DR7_BP_EN_MASK)
		vcpu->arch.switch_db_regs |= KVM_DEBUGREG_BP_ENABLED;
}

static u64 kvm_dr6_fixed(struct kvm_vcpu *vcpu)
{
	u64 fixed = DR6_FIXED_1;

	if (!guest_cpuid_has(vcpu, X86_FEATURE_RTM))
		fixed |= DR6_RTM;
	return fixed;
}

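/*
 * DR4/DR5 alias DR6/DR7 when CR4.DE is clear; kvm_require_dr() injects
 * #UD for them otherwise, so the accessors below simply fall through to
 * the DR6/DR7 cases.
 */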
static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
{
	size_t size = ARRAY_SIZE(vcpu->arch.db);

	switch (dr) {
	case 0 ... 3:
		vcpu->arch.db[array_index_nospec(dr, size)] = val;
		if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP))
			vcpu->arch.eff_db[dr] = val;
		break;
	case 4:
		/* fall through */
	case 6:
		if (val & 0xffffffff00000000ULL)
			return -1; /* #GP */
		vcpu->arch.dr6 = (val & DR6_VOLATILE) | kvm_dr6_fixed(vcpu);
		break;
	case 5:
		/* fall through */
	default: /* 7 */
		if (!kvm_dr7_valid(val))
			return -1; /* #GP */
		vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1;
		kvm_update_dr7(vcpu);
		break;
	}

	return 0;
}

int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
{
	if (__kvm_set_dr(vcpu, dr, val)) {
		kvm_inject_gp(vcpu, 0);
		return 1;
	}
	return 0;
}
EXPORT_SYMBOL_GPL(kvm_set_dr);

int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val)
{
	size_t size = ARRAY_SIZE(vcpu->arch.db);

	switch (dr) {
	case 0 ... 3:
		*val = vcpu->arch.db[array_index_nospec(dr, size)];
		break;
	case 4:
		/* fall through */
	case 6:
		*val = vcpu->arch.dr6;
		break;
	case 5:
		/* fall through */
	default: /* 7 */
		*val = vcpu->arch.dr7;
		break;
	}
	return 0;
}
EXPORT_SYMBOL_GPL(kvm_get_dr);

bool kvm_rdpmc(struct kvm_vcpu *vcpu)
{
	u32 ecx = kvm_rcx_read(vcpu);
	u64 data;
	int err;

	err = kvm_pmu_rdpmc(vcpu, ecx, &data);
	if (err)
		return err;
	kvm_rax_write(vcpu, (u32)data);
	kvm_rdx_write(vcpu, data >> 32);
	return err;
}
EXPORT_SYMBOL_GPL(kvm_rdpmc);

/*
 * List of msr numbers which we expose to userspace through KVM_GET_MSRS
 * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST.
 *
 * The three MSR lists (msrs_to_save, emulated_msrs, msr_based_features)
 * extract the supported MSRs from the related const lists.
 * msrs_to_save is selected from the msrs_to_save_all to reflect the
 * capabilities of the host cpu. This capabilities test skips MSRs that are
 * kvm-specific. Those are put in emulated_msrs_all; filtering of emulated_msrs
 * may depend on host virtualization features rather than host cpu features.
 */

static const u32 msrs_to_save_all[] = {
	MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
	MSR_STAR,
#ifdef CONFIG_X86_64
	MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
#endif
	MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA,
	MSR_IA32_FEAT_CTL, MSR_IA32_BNDCFGS, MSR_TSC_AUX,
	MSR_IA32_SPEC_CTRL,
	MSR_IA32_RTIT_CTL, MSR_IA32_RTIT_STATUS, MSR_IA32_RTIT_CR3_MATCH,
	MSR_IA32_RTIT_OUTPUT_BASE, MSR_IA32_RTIT_OUTPUT_MASK,
	MSR_IA32_RTIT_ADDR0_A, MSR_IA32_RTIT_ADDR0_B,
	MSR_IA32_RTIT_ADDR1_A, MSR_IA32_RTIT_ADDR1_B,
	MSR_IA32_RTIT_ADDR2_A, MSR_IA32_RTIT_ADDR2_B,
	MSR_IA32_RTIT_ADDR3_A, MSR_IA32_RTIT_ADDR3_B,
	MSR_IA32_UMWAIT_CONTROL,

	MSR_ARCH_PERFMON_FIXED_CTR0, MSR_ARCH_PERFMON_FIXED_CTR1,
	MSR_ARCH_PERFMON_FIXED_CTR0 + 2, MSR_ARCH_PERFMON_FIXED_CTR0 + 3,
	MSR_CORE_PERF_FIXED_CTR_CTRL, MSR_CORE_PERF_GLOBAL_STATUS,
	MSR_CORE_PERF_GLOBAL_CTRL, MSR_CORE_PERF_GLOBAL_OVF_CTRL,
	MSR_ARCH_PERFMON_PERFCTR0, MSR_ARCH_PERFMON_PERFCTR1,
	MSR_ARCH_PERFMON_PERFCTR0 + 2, MSR_ARCH_PERFMON_PERFCTR0 + 3,
	MSR_ARCH_PERFMON_PERFCTR0 + 4, MSR_ARCH_PERFMON_PERFCTR0 + 5,
	MSR_ARCH_PERFMON_PERFCTR0 + 6, MSR_ARCH_PERFMON_PERFCTR0 + 7,
	MSR_ARCH_PERFMON_PERFCTR0 + 8, MSR_ARCH_PERFMON_PERFCTR0 + 9,
	MSR_ARCH_PERFMON_PERFCTR0 + 10, MSR_ARCH_PERFMON_PERFCTR0 + 11,
	MSR_ARCH_PERFMON_PERFCTR0 + 12, MSR_ARCH_PERFMON_PERFCTR0 + 13,
	MSR_ARCH_PERFMON_PERFCTR0 + 14, MSR_ARCH_PERFMON_PERFCTR0 + 15,
	MSR_ARCH_PERFMON_PERFCTR0 + 16, MSR_ARCH_PERFMON_PERFCTR0 + 17,
	MSR_ARCH_PERFMON_EVENTSEL0, MSR_ARCH_PERFMON_EVENTSEL1,
	MSR_ARCH_PERFMON_EVENTSEL0 + 2, MSR_ARCH_PERFMON_EVENTSEL0 + 3,
	MSR_ARCH_PERFMON_EVENTSEL0 + 4, MSR_ARCH_PERFMON_EVENTSEL0 + 5,
	MSR_ARCH_PERFMON_EVENTSEL0 + 6, MSR_ARCH_PERFMON_EVENTSEL0 + 7,
	MSR_ARCH_PERFMON_EVENTSEL0 + 8, MSR_ARCH_PERFMON_EVENTSEL0 + 9,
	MSR_ARCH_PERFMON_EVENTSEL0 + 10, MSR_ARCH_PERFMON_EVENTSEL0 + 11,
	MSR_ARCH_PERFMON_EVENTSEL0 + 12, MSR_ARCH_PERFMON_EVENTSEL0 + 13,
	MSR_ARCH_PERFMON_EVENTSEL0 + 14, MSR_ARCH_PERFMON_EVENTSEL0 + 15,
	MSR_ARCH_PERFMON_EVENTSEL0 + 16, MSR_ARCH_PERFMON_EVENTSEL0 + 17,
};

static u32 msrs_to_save[ARRAY_SIZE(msrs_to_save_all)];
static unsigned num_msrs_to_save;

static const u32 emulated_msrs_all[] = {
	MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
	MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW,
	HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL,
	HV_X64_MSR_TIME_REF_COUNT, HV_X64_MSR_REFERENCE_TSC,
	HV_X64_MSR_TSC_FREQUENCY, HV_X64_MSR_APIC_FREQUENCY,
	HV_X64_MSR_CRASH_P0, HV_X64_MSR_CRASH_P1, HV_X64_MSR_CRASH_P2,
	HV_X64_MSR_CRASH_P3, HV_X64_MSR_CRASH_P4, HV_X64_MSR_CRASH_CTL,
	HV_X64_MSR_RESET,
	HV_X64_MSR_VP_INDEX,
	HV_X64_MSR_VP_RUNTIME,
	HV_X64_MSR_SCONTROL,
	HV_X64_MSR_STIMER0_CONFIG,
	HV_X64_MSR_VP_ASSIST_PAGE,
	HV_X64_MSR_REENLIGHTENMENT_CONTROL, HV_X64_MSR_TSC_EMULATION_CONTROL,
	HV_X64_MSR_TSC_EMULATION_STATUS,

	MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME,
	MSR_KVM_PV_EOI_EN,

	MSR_IA32_TSC_ADJUST,
	MSR_IA32_TSCDEADLINE,
	MSR_IA32_ARCH_CAPABILITIES,
	MSR_IA32_MISC_ENABLE,
	MSR_IA32_MCG_STATUS,
	MSR_IA32_MCG_CTL,
	MSR_IA32_MCG_EXT_CTL,
	MSR_IA32_SMBASE,
	MSR_SMI_COUNT,
	MSR_PLATFORM_INFO,
	MSR_MISC_FEATURES_ENABLES,
	MSR_AMD64_VIRT_SPEC_CTRL,
	MSR_IA32_POWER_CTL,
	MSR_IA32_UCODE_REV,

	/*
	 * The following list leaves out MSRs whose values are determined
	 * by arch/x86/kvm/vmx/nested.c based on CPUID or other MSRs.
	 * We always support the "true" VMX control MSRs, even if the host
	 * processor does not, so I am putting these registers here rather
	 * than in msrs_to_save_all.
	 */
	MSR_IA32_VMX_BASIC,
	MSR_IA32_VMX_TRUE_PINBASED_CTLS,
	MSR_IA32_VMX_TRUE_PROCBASED_CTLS,
	MSR_IA32_VMX_TRUE_EXIT_CTLS,
	MSR_IA32_VMX_TRUE_ENTRY_CTLS,
	MSR_IA32_VMX_MISC,
	MSR_IA32_VMX_CR0_FIXED0,
	MSR_IA32_VMX_CR4_FIXED0,
	MSR_IA32_VMX_VMCS_ENUM,
	MSR_IA32_VMX_PROCBASED_CTLS2,
	MSR_IA32_VMX_EPT_VPID_CAP,
	MSR_IA32_VMX_VMFUNC,

	MSR_K7_HWCR,
	MSR_KVM_POLL_CONTROL,
};

static u32 emulated_msrs[ARRAY_SIZE(emulated_msrs_all)];
static unsigned num_emulated_msrs;

/*
 * List of msr numbers which are used to expose MSR-based features that
 * can be used by a hypervisor to validate requested CPU features.
 */
static const u32 msr_based_features_all[] = {
	MSR_IA32_VMX_BASIC,
	MSR_IA32_VMX_TRUE_PINBASED_CTLS,
	MSR_IA32_VMX_PINBASED_CTLS,
	MSR_IA32_VMX_TRUE_PROCBASED_CTLS,
	MSR_IA32_VMX_PROCBASED_CTLS,
	MSR_IA32_VMX_TRUE_EXIT_CTLS,
	MSR_IA32_VMX_EXIT_CTLS,
	MSR_IA32_VMX_TRUE_ENTRY_CTLS,
	MSR_IA32_VMX_ENTRY_CTLS,
	MSR_IA32_VMX_MISC,
	MSR_IA32_VMX_CR0_FIXED0,
	MSR_IA32_VMX_CR0_FIXED1,
	MSR_IA32_VMX_CR4_FIXED0,
	MSR_IA32_VMX_CR4_FIXED1,
	MSR_IA32_VMX_VMCS_ENUM,
	MSR_IA32_VMX_PROCBASED_CTLS2,
	MSR_IA32_VMX_EPT_VPID_CAP,
	MSR_IA32_VMX_VMFUNC,

	MSR_F10H_DECFG,
	MSR_IA32_UCODE_REV,
	MSR_IA32_ARCH_CAPABILITIES,
};

static u32 msr_based_features[ARRAY_SIZE(msr_based_features_all)];
static unsigned int num_msr_based_features;

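/*
 * Build the ARCH_CAPABILITIES value advertised to the guest: start from
 * the host MSR (if present) and set the *_NO bits for vulnerabilities the
 * host is not affected by, since an unaffected host cannot expose the
 * guest to them either.
 */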
static u64 kvm_get_arch_capabilities(void)
{
	u64 data = 0;

	if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES))
		rdmsrl(MSR_IA32_ARCH_CAPABILITIES, data);

	/*
	 * If nx_huge_pages is enabled, KVM's shadow paging will ensure that
	 * the nested hypervisor runs with NX huge pages.  If it is not,
	 * L1 is anyway vulnerable to ITLB_MULTIHIT exploits from other
	 * L1 guests, so it need not worry about its own (L2) guests.
	 */
	data |= ARCH_CAP_PSCHANGE_MC_NO;

	/*
	 * If we're doing cache flushes (either "always" or "cond")
	 * we will do one whenever the guest does a vmlaunch/vmresume.
	 * If an outer hypervisor is doing the cache flush for us
	 * (VMENTER_L1D_FLUSH_NESTED_VM), we can safely pass that
	 * capability to the guest too, and if EPT is disabled we're not
	 * vulnerable.  Overall, only VMENTER_L1D_FLUSH_NEVER will
	 * require a nested hypervisor to do a flush of its own.
	 */
	if (l1tf_vmx_mitigation != VMENTER_L1D_FLUSH_NEVER)
		data |= ARCH_CAP_SKIP_VMENTRY_L1DFLUSH;

	if (!boot_cpu_has_bug(X86_BUG_CPU_MELTDOWN))
		data |= ARCH_CAP_RDCL_NO;
	if (!boot_cpu_has_bug(X86_BUG_SPEC_STORE_BYPASS))
		data |= ARCH_CAP_SSB_NO;
	if (!boot_cpu_has_bug(X86_BUG_MDS))
		data |= ARCH_CAP_MDS_NO;

	/*
	 * On TAA affected systems:
	 *      - nothing to do if TSX is disabled on the host.
	 *      - we emulate TSX_CTRL if present on the host.
	 *	  This lets the guest use VERW to clear CPU buffers.
	 */
	if (!boot_cpu_has(X86_FEATURE_RTM))
		data &= ~(ARCH_CAP_TAA_NO | ARCH_CAP_TSX_CTRL_MSR);
	else if (!boot_cpu_has_bug(X86_BUG_TAA))
		data |= ARCH_CAP_TAA_NO;

	return data;
}

static int kvm_get_msr_feature(struct kvm_msr_entry *msr)
{
	switch (msr->index) {
	case MSR_IA32_ARCH_CAPABILITIES:
		msr->data = kvm_get_arch_capabilities();
		break;
	case MSR_IA32_UCODE_REV:
		rdmsrl_safe(msr->index, &msr->data);
		break;
	default:
		if (kvm_x86_ops.get_msr_feature(msr))
			return 1;
	}
	return 0;
}

static int do_get_msr_feature(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
{
	struct kvm_msr_entry msr;
	int r;

	msr.index = index;
	r = kvm_get_msr_feature(&msr);
	if (r)
		return r;

	*data = msr.data;

	return 0;
}

static bool __kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer)
{
	if (efer & EFER_FFXSR && !guest_cpuid_has(vcpu, X86_FEATURE_FXSR_OPT))
		return false;

	if (efer & EFER_SVME && !guest_cpuid_has(vcpu, X86_FEATURE_SVM))
		return false;

	if (efer & (EFER_LME | EFER_LMA) &&
	    !guest_cpuid_has(vcpu, X86_FEATURE_LM))
		return false;

	if (efer & EFER_NX && !guest_cpuid_has(vcpu, X86_FEATURE_NX))
		return false;

	return true;

}
bool kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer)
{
	if (efer & efer_reserved_bits)
		return false;

	return __kvm_valid_efer(vcpu, efer);
}
EXPORT_SYMBOL_GPL(kvm_valid_efer);

static int set_efer(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
{
	u64 old_efer = vcpu->arch.efer;
	u64 efer = msr_info->data;

	if (efer & efer_reserved_bits)
		return 1;

	if (!msr_info->host_initiated) {
		if (!__kvm_valid_efer(vcpu, efer))
			return 1;

		if (is_paging(vcpu) &&
		    (vcpu->arch.efer & EFER_LME) != (efer & EFER_LME))
			return 1;
	}

	efer &= ~EFER_LMA;
	efer |= vcpu->arch.efer & EFER_LMA;

	kvm_x86_ops.set_efer(vcpu, efer);

	/* Update reserved bits */
	if ((efer ^ old_efer) & EFER_NX)
		kvm_mmu_reset_context(vcpu);

	return 0;
}

void kvm_enable_efer_bits(u64 mask)
{
	efer_reserved_bits &= ~mask;
}
EXPORT_SYMBOL_GPL(kvm_enable_efer_bits);

/*
 * Write @data into the MSR specified by @index.  Select MSR specific fault
 * checks are bypassed if @host_initiated is %true.
 * Returns 0 on success, non-0 otherwise.
 * Assumes vcpu_load() was already called.
 */
static int __kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data,
			 bool host_initiated)
{
	struct msr_data msr;

	switch (index) {
	case MSR_FS_BASE:
	case MSR_GS_BASE:
	case MSR_KERNEL_GS_BASE:
	case MSR_CSTAR:
	case MSR_LSTAR:
		if (is_noncanonical_address(data, vcpu))
			return 1;
		break;
	case MSR_IA32_SYSENTER_EIP:
	case MSR_IA32_SYSENTER_ESP:
		/*
		 * IA32_SYSENTER_ESP and IA32_SYSENTER_EIP cause #GP if
		 * non-canonical address is written on Intel but not on
		 * AMD (which ignores the top 32-bits, because it does
		 * not implement 64-bit SYSENTER).
		 *
		 * 64-bit code should hence be able to write a non-canonical
		 * value on AMD.  Making the address canonical ensures that
		 * vmentry does not fail on Intel after writing a non-canonical
		 * value, and that something deterministic happens if the guest
		 * invokes 64-bit SYSENTER.
		 */
		data = get_canonical(data, vcpu_virt_addr_bits(vcpu));
	}

	msr.data = data;
	msr.index = index;
	msr.host_initiated = host_initiated;

	return kvm_x86_ops.set_msr(vcpu, &msr);
}

/*
 * Read the MSR specified by @index into @data.  Select MSR specific fault
 * checks are bypassed if @host_initiated is %true.
 * Returns 0 on success, non-0 otherwise.
 * Assumes vcpu_load() was already called.
 */
int __kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data,
		  bool host_initiated)
{
	struct msr_data msr;
	int ret;

	msr.index = index;
	msr.host_initiated = host_initiated;

	ret = kvm_x86_ops.get_msr(vcpu, &msr);
	if (!ret)
		*data = msr.data;
	return ret;
}

int kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data)
{
	return __kvm_get_msr(vcpu, index, data, false);
}
EXPORT_SYMBOL_GPL(kvm_get_msr);

int kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data)
{
	return __kvm_set_msr(vcpu, index, data, false);
}
EXPORT_SYMBOL_GPL(kvm_set_msr);

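/*
 * RDMSR/WRMSR exit handlers: on failure a #GP is injected instead of
 * completing the instruction, matching architectural behaviour.
 */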
int kvm_emulate_rdmsr(struct kvm_vcpu *vcpu)
{
	u32 ecx = kvm_rcx_read(vcpu);
	u64 data;

	if (kvm_get_msr(vcpu, ecx, &data)) {
		trace_kvm_msr_read_ex(ecx);
		kvm_inject_gp(vcpu, 0);
		return 1;
	}

	trace_kvm_msr_read(ecx, data);

	kvm_rax_write(vcpu, data & -1u);
	kvm_rdx_write(vcpu, (data >> 32) & -1u);
	return kvm_skip_emulated_instruction(vcpu);
}
EXPORT_SYMBOL_GPL(kvm_emulate_rdmsr);

int kvm_emulate_wrmsr(struct kvm_vcpu *vcpu)
{
	u32 ecx = kvm_rcx_read(vcpu);
	u64 data = kvm_read_edx_eax(vcpu);

	if (kvm_set_msr(vcpu, ecx, data)) {
		trace_kvm_msr_write_ex(ecx, data);
		kvm_inject_gp(vcpu, 0);
		return 1;
	}

	trace_kvm_msr_write(ecx, data);
	return kvm_skip_emulated_instruction(vcpu);
}
EXPORT_SYMBOL_GPL(kvm_emulate_wrmsr);

/*
 * The fast path for frequent and performance sensitive wrmsr emulation,
 * i.e. the sending of IPI.  Sending IPI early in the VM-Exit flow reduces
 * the latency of virtual IPI by avoiding the expensive bits of transitioning
 * from guest to host, e.g. reacquiring KVM's SRCU lock.  In contrast to the
 * other cases, this runs before interrupts are enabled on the host.
 */
static int handle_fastpath_set_x2apic_icr_irqoff(struct kvm_vcpu *vcpu, u64 data)
{
	if (!lapic_in_kernel(vcpu) || !apic_x2apic_mode(vcpu->arch.apic))
		return 1;

	if (((data & APIC_SHORT_MASK) == APIC_DEST_NOSHORT) &&
		((data & APIC_DEST_MASK) == APIC_DEST_PHYSICAL) &&
		((data & APIC_MODE_MASK) == APIC_DM_FIXED) &&
		((u32)(data >> 32) != X2APIC_BROADCAST)) {

		data &= ~(1 << 12);
		kvm_apic_send_ipi(vcpu->arch.apic, (u32)data, (u32)(data >> 32));
		kvm_lapic_set_reg(vcpu->arch.apic, APIC_ICR2, (u32)(data >> 32));
		kvm_lapic_set_reg(vcpu->arch.apic, APIC_ICR, (u32)data);
		trace_kvm_apic_write(APIC_ICR, (u32)data);
		return 0;
	}

	return 1;
}

enum exit_fastpath_completion handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu)
{
	u32 msr = kvm_rcx_read(vcpu);
	u64 data;
	int ret = 0;

	switch (msr) {
	case APIC_BASE_MSR + (APIC_ICR >> 4):
		data = kvm_read_edx_eax(vcpu);
		ret = handle_fastpath_set_x2apic_icr_irqoff(vcpu, data);
		break;
	default:
		return EXIT_FASTPATH_NONE;
	}

	if (!ret) {
		trace_kvm_msr_write(msr, data);
		return EXIT_FASTPATH_SKIP_EMUL_INS;
	}

	return EXIT_FASTPATH_NONE;
}
EXPORT_SYMBOL_GPL(handle_fastpath_set_msr_irqoff);

/*
 * Adapt set_msr() to msr_io()'s calling convention
 */
static int do_get_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
{
	return __kvm_get_msr(vcpu, index, data, true);
}

static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
{
	return __kvm_set_msr(vcpu, index, *data, true);
}

16e8d74d 1641#ifdef CONFIG_X86_64
53fafdbb
MT
1642struct pvclock_clock {
1643 int vclock_mode;
1644 u64 cycle_last;
1645 u64 mask;
1646 u32 mult;
1647 u32 shift;
917f9475
PB
1648 u64 base_cycles;
1649 u64 offset;
53fafdbb
MT
1650};
1651
16e8d74d
MT
1652struct pvclock_gtod_data {
1653 seqcount_t seq;
1654
53fafdbb
MT
1655 struct pvclock_clock clock; /* extract of a clocksource struct */
1656 struct pvclock_clock raw_clock; /* extract of a clocksource struct */
16e8d74d 1657
917f9475 1658 ktime_t offs_boot;
55dd00a7 1659 u64 wall_time_sec;
16e8d74d
MT
1660};
1661
1662static struct pvclock_gtod_data pvclock_gtod_data;
1663
1664static void update_pvclock_gtod(struct timekeeper *tk)
1665{
1666 struct pvclock_gtod_data *vdata = &pvclock_gtod_data;
1667
1668 write_seqcount_begin(&vdata->seq);
1669
1670 /* copy pvclock gtod data */
b95a8a27 1671 vdata->clock.vclock_mode = tk->tkr_mono.clock->vdso_clock_mode;
876e7881
PZ
1672 vdata->clock.cycle_last = tk->tkr_mono.cycle_last;
1673 vdata->clock.mask = tk->tkr_mono.mask;
1674 vdata->clock.mult = tk->tkr_mono.mult;
1675 vdata->clock.shift = tk->tkr_mono.shift;
917f9475
PB
1676 vdata->clock.base_cycles = tk->tkr_mono.xtime_nsec;
1677 vdata->clock.offset = tk->tkr_mono.base;
16e8d74d 1678
b95a8a27 1679 vdata->raw_clock.vclock_mode = tk->tkr_raw.clock->vdso_clock_mode;
53fafdbb
MT
1680 vdata->raw_clock.cycle_last = tk->tkr_raw.cycle_last;
1681 vdata->raw_clock.mask = tk->tkr_raw.mask;
1682 vdata->raw_clock.mult = tk->tkr_raw.mult;
1683 vdata->raw_clock.shift = tk->tkr_raw.shift;
917f9475
PB
1684 vdata->raw_clock.base_cycles = tk->tkr_raw.xtime_nsec;
1685 vdata->raw_clock.offset = tk->tkr_raw.base;
16e8d74d 1686
55dd00a7
MT
1687 vdata->wall_time_sec = tk->xtime_sec;
1688
917f9475 1689 vdata->offs_boot = tk->offs_boot;
53fafdbb 1690
16e8d74d
MT
1691 write_seqcount_end(&vdata->seq);
1692}
8171cd68
PB
1693
1694static s64 get_kvmclock_base_ns(void)
1695{
1696 /* Count up from boot time, but with the frequency of the raw clock. */
1697 return ktime_to_ns(ktime_add(ktime_get_raw(), pvclock_gtod_data.offs_boot));
1698}
1699#else
1700static s64 get_kvmclock_base_ns(void)
1701{
1702 /* Master clock not used, so we can just use CLOCK_BOOTTIME. */
1703 return ktime_get_boottime_ns();
1704}
16e8d74d
MT
1705#endif
1706
bab5bb39
NK
1707void kvm_set_pending_timer(struct kvm_vcpu *vcpu)
1708{
bab5bb39 1709 kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu);
4d151bf3 1710 kvm_vcpu_kick(vcpu);
bab5bb39 1711}
16e8d74d 1712
18068523
GOC
1713static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
1714{
9ed3c444
AK
1715 int version;
1716 int r;
50d0a0f9 1717 struct pvclock_wall_clock wc;
8171cd68 1718 u64 wall_nsec;
18068523
GOC
1719
1720 if (!wall_clock)
1721 return;
1722
9ed3c444
AK
1723 r = kvm_read_guest(kvm, wall_clock, &version, sizeof(version));
1724 if (r)
1725 return;
1726
1727 if (version & 1)
1728 ++version; /* first time write, random junk */
1729
1730 ++version;
18068523 1731
1dab1345
NK
1732 if (kvm_write_guest(kvm, wall_clock, &version, sizeof(version)))
1733 return;
18068523 1734
50d0a0f9
GH
1735 /*
1736 * The guest calculates current wall clock time by adding
34c238a1 1737 * system time (updated by kvm_guest_time_update below) to the
8171cd68 1738 * wall clock specified here. We do the reverse here.
50d0a0f9 1739 */
8171cd68 1740 wall_nsec = ktime_get_real_ns() - get_kvmclock_ns(kvm);
50d0a0f9 1741
8171cd68
PB
1742 wc.nsec = do_div(wall_nsec, 1000000000);
1743 wc.sec = (u32)wall_nsec; /* overflow in 2106 guest time */
50d0a0f9 1744 wc.version = version;
18068523
GOC
1745
1746 kvm_write_guest(kvm, wall_clock, &wc, sizeof(wc));
1747
1748 version++;
1749 kvm_write_guest(kvm, wall_clock, &version, sizeof(version));
18068523
GOC
1750}
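/*
 * In other words (a restatement of the comment above, not new behaviour): the
 * guest later computes
 *
 *	wall clock now  =  wc.sec/wc.nsec  +  kvmclock system time
 *
 * so the value written here is the host's CLOCK_REALTIME at the instant the
 * kvmclock reads zero, i.e. ktime_get_real_ns() - get_kvmclock_ns(kvm).
 */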
1751
50d0a0f9
GH
1752static uint32_t div_frac(uint32_t dividend, uint32_t divisor)
1753{
b51012de
PB
1754 do_shl32_div32(dividend, divisor);
1755 return dividend;
50d0a0f9
GH
1756}
1757
3ae13faa 1758static void kvm_get_time_scale(uint64_t scaled_hz, uint64_t base_hz,
5f4e3f88 1759 s8 *pshift, u32 *pmultiplier)
50d0a0f9 1760{
5f4e3f88 1761 uint64_t scaled64;
50d0a0f9
GH
1762 int32_t shift = 0;
1763 uint64_t tps64;
1764 uint32_t tps32;
1765
3ae13faa
PB
1766 tps64 = base_hz;
1767 scaled64 = scaled_hz;
50933623 1768 while (tps64 > scaled64*2 || tps64 & 0xffffffff00000000ULL) {
50d0a0f9
GH
1769 tps64 >>= 1;
1770 shift--;
1771 }
1772
1773 tps32 = (uint32_t)tps64;
50933623
JK
1774 while (tps32 <= scaled64 || scaled64 & 0xffffffff00000000ULL) {
1775 if (scaled64 & 0xffffffff00000000ULL || tps32 & 0x80000000)
5f4e3f88
ZA
1776 scaled64 >>= 1;
1777 else
1778 tps32 <<= 1;
50d0a0f9
GH
1779 shift++;
1780 }
1781
5f4e3f88
ZA
1782 *pshift = shift;
1783 *pmultiplier = div_frac(scaled64, tps32);
50d0a0f9
GH
1784}
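/*
 * Worked example (an illustration of the helper above, not extra code): for a
 * 3 GHz guest TSC,
 *
 *	kvm_get_time_scale(3000000000ULL, NSEC_PER_SEC, &shift, &mult);
 *
 * yields shift = 2 and mult = 0xC0000000, so pvclock_scale_delta(ns, mult, shift),
 * as used by compute_guest_tsc() below, computes ((ns << 2) * 0xC0000000) >> 32
 * = 3 * ns, i.e. nanoseconds are converted to cycles of a 3 GHz clock.
 */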
1785
d828199e 1786#ifdef CONFIG_X86_64
16e8d74d 1787static atomic_t kvm_guest_has_master_clock = ATOMIC_INIT(0);
d828199e 1788#endif
16e8d74d 1789
c8076604 1790static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz);
69b0049a 1791static unsigned long max_tsc_khz;
c8076604 1792
cc578287 1793static u32 adjust_tsc_khz(u32 khz, s32 ppm)
1e993611 1794{
cc578287
ZA
1795 u64 v = (u64)khz * (1000000 + ppm);
1796 do_div(v, 1000000);
1797 return v;
1e993611
JR
1798}
1799
381d585c
HZ
1800static int set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale)
1801{
1802 u64 ratio;
1803
1804 /* Guest TSC same frequency as host TSC? */
1805 if (!scale) {
1806 vcpu->arch.tsc_scaling_ratio = kvm_default_tsc_scaling_ratio;
1807 return 0;
1808 }
1809
1810 /* TSC scaling supported? */
1811 if (!kvm_has_tsc_control) {
1812 if (user_tsc_khz > tsc_khz) {
1813 vcpu->arch.tsc_catchup = 1;
1814 vcpu->arch.tsc_always_catchup = 1;
1815 return 0;
1816 } else {
3f16a5c3 1817 pr_warn_ratelimited("user requested TSC rate below hardware speed\n");
381d585c
HZ
1818 return -1;
1819 }
1820 }
1821
1822 /* TSC scaling required - calculate ratio */
1823 ratio = mul_u64_u32_div(1ULL << kvm_tsc_scaling_ratio_frac_bits,
1824 user_tsc_khz, tsc_khz);
1825
1826 if (ratio == 0 || ratio >= kvm_max_tsc_scaling_ratio) {
3f16a5c3
PB
1827 pr_warn_ratelimited("Invalid TSC scaling ratio - virtual-tsc-khz=%u\n",
1828 user_tsc_khz);
381d585c
HZ
1829 return -1;
1830 }
1831
1832 vcpu->arch.tsc_scaling_ratio = ratio;
1833 return 0;
1834}
1835
4941b8cb 1836static int kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz)
759379dd 1837{
cc578287
ZA
1838 u32 thresh_lo, thresh_hi;
1839 int use_scaling = 0;
217fc9cf 1840
03ba32ca 1841 /* tsc_khz can be zero if TSC calibration fails */
4941b8cb 1842 if (user_tsc_khz == 0) {
ad721883
HZ
1843 /* set tsc_scaling_ratio to a safe value */
1844 vcpu->arch.tsc_scaling_ratio = kvm_default_tsc_scaling_ratio;
381d585c 1845 return -1;
ad721883 1846 }
03ba32ca 1847
c285545f 1848 /* Compute a scale to convert nanoseconds in TSC cycles */
3ae13faa 1849 kvm_get_time_scale(user_tsc_khz * 1000LL, NSEC_PER_SEC,
cc578287
ZA
1850 &vcpu->arch.virtual_tsc_shift,
1851 &vcpu->arch.virtual_tsc_mult);
4941b8cb 1852 vcpu->arch.virtual_tsc_khz = user_tsc_khz;
cc578287
ZA
1853
1854 /*
1855 * Compute the variation in TSC rate which is acceptable
1856 * within the range of tolerance and decide if the
1857 * rate being applied is within those bounds of the hardware
1858 * rate. If so, no scaling or compensation need be done.
1859 */
1860 thresh_lo = adjust_tsc_khz(tsc_khz, -tsc_tolerance_ppm);
1861 thresh_hi = adjust_tsc_khz(tsc_khz, tsc_tolerance_ppm);
4941b8cb
PB
1862 if (user_tsc_khz < thresh_lo || user_tsc_khz > thresh_hi) {
1863 pr_debug("kvm: requested TSC rate %u falls outside tolerance [%u,%u]\n", user_tsc_khz, thresh_lo, thresh_hi);
cc578287
ZA
1864 use_scaling = 1;
1865 }
4941b8cb 1866 return set_tsc_khz(vcpu, user_tsc_khz, use_scaling);
c285545f
ZA
1867}
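/*
 * Concretely (an assumed example using the default tsc_tolerance_ppm of 250):
 * on a 3,000,000 kHz host, thresh_lo and thresh_hi come out to 2,999,250 kHz
 * and 3,000,750 kHz, so only a requested rate outside that +/-0.025% window
 * forces the scaling/catchup path.
 */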
1868
1869static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns)
1870{
e26101b1 1871 u64 tsc = pvclock_scale_delta(kernel_ns-vcpu->arch.this_tsc_nsec,
cc578287
ZA
1872 vcpu->arch.virtual_tsc_mult,
1873 vcpu->arch.virtual_tsc_shift);
e26101b1 1874 tsc += vcpu->arch.this_tsc_write;
c285545f
ZA
1875 return tsc;
1876}
1877
b0c39dc6
VK
1878static inline int gtod_is_based_on_tsc(int mode)
1879{
b95a8a27 1880 return mode == VDSO_CLOCKMODE_TSC || mode == VDSO_CLOCKMODE_HVCLOCK;
b0c39dc6
VK
1881}
1882
69b0049a 1883static void kvm_track_tsc_matching(struct kvm_vcpu *vcpu)
b48aa97e
MT
1884{
1885#ifdef CONFIG_X86_64
1886 bool vcpus_matched;
b48aa97e
MT
1887 struct kvm_arch *ka = &vcpu->kvm->arch;
1888 struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
1889
1890 vcpus_matched = (ka->nr_vcpus_matched_tsc + 1 ==
1891 atomic_read(&vcpu->kvm->online_vcpus));
1892
7f187922
MT
1893 /*
1894 * Once the masterclock is enabled, always perform a request in
1895 * order to update it.
1896 *
1897 * In order to enable the masterclock, the host clocksource must be TSC
1898 * and the vcpus need to have matched TSCs. When that happens,
1899 * perform a request to enable the masterclock.
1900 */
1901 if (ka->use_master_clock ||
b0c39dc6 1902 (gtod_is_based_on_tsc(gtod->clock.vclock_mode) && vcpus_matched))
b48aa97e
MT
1903 kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
1904
1905 trace_kvm_track_tsc(vcpu->vcpu_id, ka->nr_vcpus_matched_tsc,
1906 atomic_read(&vcpu->kvm->online_vcpus),
1907 ka->use_master_clock, gtod->clock.vclock_mode);
1908#endif
1909}
1910
ba904635
WA
1911static void update_ia32_tsc_adjust_msr(struct kvm_vcpu *vcpu, s64 offset)
1912{
afaf0b2f 1913 u64 curr_offset = kvm_x86_ops.read_l1_tsc_offset(vcpu);
ba904635
WA
1914 vcpu->arch.ia32_tsc_adjust_msr += offset - curr_offset;
1915}
1916
35181e86
HZ
1917/*
1918 * Multiply tsc by a fixed point number represented by ratio.
1919 *
1920 * The most significant 64-N bits (mult) of ratio represent the
1921 * integral part of the fixed point number; the remaining N bits
1922 * (frac) represent the fractional part, ie. ratio represents a fixed
1923 * point number (mult + frac * 2^(-N)).
1924 *
1925 * N equals to kvm_tsc_scaling_ratio_frac_bits.
1926 */
1927static inline u64 __scale_tsc(u64 ratio, u64 tsc)
1928{
1929 return mul_u64_u64_shr(tsc, ratio, kvm_tsc_scaling_ratio_frac_bits);
1930}
1931
1932u64 kvm_scale_tsc(struct kvm_vcpu *vcpu, u64 tsc)
1933{
1934 u64 _tsc = tsc;
1935 u64 ratio = vcpu->arch.tsc_scaling_ratio;
1936
1937 if (ratio != kvm_default_tsc_scaling_ratio)
1938 _tsc = __scale_tsc(ratio, tsc);
1939
1940 return _tsc;
1941}
1942EXPORT_SYMBOL_GPL(kvm_scale_tsc);
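/*
 * Worked example (illustrative; assumes kvm_tsc_scaling_ratio_frac_bits == 48,
 * as on VMX hardware with TSC scaling): a 1.5 GHz guest on a 3 GHz host uses
 * ratio = (1ULL << 48) / 2, and kvm_scale_tsc() then returns
 * mul_u64_u64_shr(tsc, ratio, 48) == tsc / 2.
 */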
1943
07c1419a
HZ
1944static u64 kvm_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc)
1945{
1946 u64 tsc;
1947
1948 tsc = kvm_scale_tsc(vcpu, rdtsc());
1949
1950 return target_tsc - tsc;
1951}
1952
4ba76538
HZ
1953u64 kvm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc)
1954{
afaf0b2f 1955 u64 tsc_offset = kvm_x86_ops.read_l1_tsc_offset(vcpu);
e79f245d
KA
1956
1957 return tsc_offset + kvm_scale_tsc(vcpu, host_tsc);
4ba76538
HZ
1958}
1959EXPORT_SYMBOL_GPL(kvm_read_l1_tsc);
1960
a545ab6a
LC
1961static void kvm_vcpu_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
1962{
afaf0b2f 1963 vcpu->arch.tsc_offset = kvm_x86_ops.write_l1_tsc_offset(vcpu, offset);
a545ab6a
LC
1964}
1965
b0c39dc6
VK
1966static inline bool kvm_check_tsc_unstable(void)
1967{
1968#ifdef CONFIG_X86_64
1969 /*
1970 * TSC is marked unstable when we're running on Hyper-V,
1971 * 'TSC page' clocksource is good.
1972 */
b95a8a27 1973 if (pvclock_gtod_data.clock.vclock_mode == VDSO_CLOCKMODE_HVCLOCK)
b0c39dc6
VK
1974 return false;
1975#endif
1976 return check_tsc_unstable();
1977}
1978
8fe8ab46 1979void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr)
99e3e30a
ZA
1980{
1981 struct kvm *kvm = vcpu->kvm;
f38e098f 1982 u64 offset, ns, elapsed;
99e3e30a 1983 unsigned long flags;
b48aa97e 1984 bool matched;
0d3da0d2 1985 bool already_matched;
8fe8ab46 1986 u64 data = msr->data;
c5e8ec8e 1987 bool synchronizing = false;
99e3e30a 1988
038f8c11 1989 raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
07c1419a 1990 offset = kvm_compute_tsc_offset(vcpu, data);
8171cd68 1991 ns = get_kvmclock_base_ns();
f38e098f 1992 elapsed = ns - kvm->arch.last_tsc_nsec;
5d3cb0f6 1993
03ba32ca 1994 if (vcpu->arch.virtual_tsc_khz) {
bd8fab39
DP
1995 if (data == 0 && msr->host_initiated) {
1996 /*
1997 * detection of vcpu initialization -- need to sync
1998 * with other vCPUs. This particularly helps to keep
1999 * kvm_clock stable after CPU hotplug
2000 */
2001 synchronizing = true;
2002 } else {
2003 u64 tsc_exp = kvm->arch.last_tsc_write +
2004 nsec_to_cycles(vcpu, elapsed);
2005 u64 tsc_hz = vcpu->arch.virtual_tsc_khz * 1000LL;
2006 /*
2007 * Special case: TSC write with a small delta (1 second)
2008 * of virtual cycle time against real time is
2009 * interpreted as an attempt to synchronize the CPU.
2010 */
2011 synchronizing = data < tsc_exp + tsc_hz &&
2012 data + tsc_hz > tsc_exp;
2013 }
c5e8ec8e 2014 }
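	/*
	 * Equivalently (just restating the check above): the write is treated
	 * as a synchronization attempt when |data - tsc_exp| is less than one
	 * second's worth of guest TSC cycles (tsc_hz).
	 */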
f38e098f
ZA
2015
2016 /*
5d3cb0f6
ZA
2017 * For a reliable TSC, we can match TSC offsets, and for an unstable
2018 * TSC, we add elapsed time in this computation. We could let the
2019 * compensation code attempt to catch up if we fall behind, but
2020 * it's better to try to match offsets from the beginning.
2021 */
c5e8ec8e 2022 if (synchronizing &&
5d3cb0f6 2023 vcpu->arch.virtual_tsc_khz == kvm->arch.last_tsc_khz) {
b0c39dc6 2024 if (!kvm_check_tsc_unstable()) {
e26101b1 2025 offset = kvm->arch.cur_tsc_offset;
f38e098f 2026 } else {
857e4099 2027 u64 delta = nsec_to_cycles(vcpu, elapsed);
5d3cb0f6 2028 data += delta;
07c1419a 2029 offset = kvm_compute_tsc_offset(vcpu, data);
f38e098f 2030 }
b48aa97e 2031 matched = true;
0d3da0d2 2032 already_matched = (vcpu->arch.this_tsc_generation == kvm->arch.cur_tsc_generation);
e26101b1
ZA
2033 } else {
2034 /*
2035 * We split periods of matched TSC writes into generations.
2036 * For each generation, we track the original measured
2037 * nanosecond time, offset, and write, so if TSCs are in
2038 * sync, we can match exact offset, and if not, we can match
4a969980 2039 * exact software computation in compute_guest_tsc()
e26101b1
ZA
2040 *
2041 * These values are tracked in kvm->arch.cur_xxx variables.
2042 */
2043 kvm->arch.cur_tsc_generation++;
2044 kvm->arch.cur_tsc_nsec = ns;
2045 kvm->arch.cur_tsc_write = data;
2046 kvm->arch.cur_tsc_offset = offset;
b48aa97e 2047 matched = false;
f38e098f 2048 }
e26101b1
ZA
2049
2050 /*
2051 * We also track the most recent recorded KHZ, write and time to
2052 * allow the matching interval to be extended at each write.
2053 */
f38e098f
ZA
2054 kvm->arch.last_tsc_nsec = ns;
2055 kvm->arch.last_tsc_write = data;
5d3cb0f6 2056 kvm->arch.last_tsc_khz = vcpu->arch.virtual_tsc_khz;
99e3e30a 2057
b183aa58 2058 vcpu->arch.last_guest_tsc = data;
e26101b1
ZA
2059
2060 /* Keep track of which generation this VCPU has synchronized to */
2061 vcpu->arch.this_tsc_generation = kvm->arch.cur_tsc_generation;
2062 vcpu->arch.this_tsc_nsec = kvm->arch.cur_tsc_nsec;
2063 vcpu->arch.this_tsc_write = kvm->arch.cur_tsc_write;
2064
d6321d49 2065 if (!msr->host_initiated && guest_cpuid_has(vcpu, X86_FEATURE_TSC_ADJUST))
ba904635 2066 update_ia32_tsc_adjust_msr(vcpu, offset);
d6321d49 2067
a545ab6a 2068 kvm_vcpu_write_tsc_offset(vcpu, offset);
e26101b1 2069 raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
b48aa97e
MT
2070
2071 spin_lock(&kvm->arch.pvclock_gtod_sync_lock);
0d3da0d2 2072 if (!matched) {
b48aa97e 2073 kvm->arch.nr_vcpus_matched_tsc = 0;
0d3da0d2
TG
2074 } else if (!already_matched) {
2075 kvm->arch.nr_vcpus_matched_tsc++;
2076 }
b48aa97e
MT
2077
2078 kvm_track_tsc_matching(vcpu);
2079 spin_unlock(&kvm->arch.pvclock_gtod_sync_lock);
99e3e30a 2080}
e26101b1 2081
99e3e30a
ZA
2082EXPORT_SYMBOL_GPL(kvm_write_tsc);
2083
58ea6767
HZ
2084static inline void adjust_tsc_offset_guest(struct kvm_vcpu *vcpu,
2085 s64 adjustment)
2086{
afaf0b2f 2087 u64 tsc_offset = kvm_x86_ops.read_l1_tsc_offset(vcpu);
326e7425 2088 kvm_vcpu_write_tsc_offset(vcpu, tsc_offset + adjustment);
58ea6767
HZ
2089}
2090
2091static inline void adjust_tsc_offset_host(struct kvm_vcpu *vcpu, s64 adjustment)
2092{
2093 if (vcpu->arch.tsc_scaling_ratio != kvm_default_tsc_scaling_ratio)
2094 WARN_ON(adjustment < 0);
2095 adjustment = kvm_scale_tsc(vcpu, (u64) adjustment);
ea26e4ec 2096 adjust_tsc_offset_guest(vcpu, adjustment);
58ea6767
HZ
2097}
2098
d828199e
MT
2099#ifdef CONFIG_X86_64
2100
a5a1d1c2 2101static u64 read_tsc(void)
d828199e 2102{
a5a1d1c2 2103 u64 ret = (u64)rdtsc_ordered();
03b9730b 2104 u64 last = pvclock_gtod_data.clock.cycle_last;
d828199e
MT
2105
2106 if (likely(ret >= last))
2107 return ret;
2108
2109 /*
2110 * GCC likes to generate cmov here, but this branch is extremely
6a6256f9 2111 * predictable (it's just a function of time and the likely is
d828199e
MT
2112 * very likely) and there's a data dependence, so force GCC
2113 * to generate a branch instead. I don't barrier() because
2114 * we don't actually need a barrier, and if this function
2115 * ever gets inlined it will generate worse code.
2116 */
2117 asm volatile ("");
2118 return last;
2119}
2120
53fafdbb
MT
2121static inline u64 vgettsc(struct pvclock_clock *clock, u64 *tsc_timestamp,
2122 int *mode)
d828199e
MT
2123{
2124 long v;
b0c39dc6
VK
2125 u64 tsc_pg_val;
2126
53fafdbb 2127 switch (clock->vclock_mode) {
b95a8a27 2128 case VDSO_CLOCKMODE_HVCLOCK:
b0c39dc6
VK
2129 tsc_pg_val = hv_read_tsc_page_tsc(hv_get_tsc_page(),
2130 tsc_timestamp);
2131 if (tsc_pg_val != U64_MAX) {
2132 /* TSC page valid */
b95a8a27 2133 *mode = VDSO_CLOCKMODE_HVCLOCK;
53fafdbb
MT
2134 v = (tsc_pg_val - clock->cycle_last) &
2135 clock->mask;
b0c39dc6
VK
2136 } else {
2137 /* TSC page invalid */
b95a8a27 2138 *mode = VDSO_CLOCKMODE_NONE;
b0c39dc6
VK
2139 }
2140 break;
b95a8a27
TG
2141 case VDSO_CLOCKMODE_TSC:
2142 *mode = VDSO_CLOCKMODE_TSC;
b0c39dc6 2143 *tsc_timestamp = read_tsc();
53fafdbb
MT
2144 v = (*tsc_timestamp - clock->cycle_last) &
2145 clock->mask;
b0c39dc6
VK
2146 break;
2147 default:
b95a8a27 2148 *mode = VDSO_CLOCKMODE_NONE;
b0c39dc6 2149 }
d828199e 2150
b95a8a27 2151 if (*mode == VDSO_CLOCKMODE_NONE)
b0c39dc6 2152 *tsc_timestamp = v = 0;
d828199e 2153
53fafdbb 2154 return v * clock->mult;
d828199e
MT
2155}
2156
53fafdbb 2157static int do_monotonic_raw(s64 *t, u64 *tsc_timestamp)
d828199e 2158{
cbcf2dd3 2159 struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
d828199e 2160 unsigned long seq;
d828199e 2161 int mode;
cbcf2dd3 2162 u64 ns;
d828199e 2163
d828199e
MT
2164 do {
2165 seq = read_seqcount_begin(&gtod->seq);
917f9475 2166 ns = gtod->raw_clock.base_cycles;
53fafdbb 2167 ns += vgettsc(&gtod->raw_clock, tsc_timestamp, &mode);
917f9475
PB
2168 ns >>= gtod->raw_clock.shift;
2169 ns += ktime_to_ns(ktime_add(gtod->raw_clock.offset, gtod->offs_boot));
d828199e 2170 } while (unlikely(read_seqcount_retry(&gtod->seq, seq)));
cbcf2dd3 2171 *t = ns;
d828199e
MT
2172
2173 return mode;
2174}
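/*
 * I.e. (a restatement of the loop above, not additional logic):
 *
 *	kernel_ns = ((base_cycles + ((tsc - cycle_last) & mask) * mult) >> shift)
 *		    + raw_clock.offset + offs_boot
 *
 * using the tkr_raw snapshot captured in update_pvclock_gtod().
 */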
2175
899a31f5 2176static int do_realtime(struct timespec64 *ts, u64 *tsc_timestamp)
55dd00a7
MT
2177{
2178 struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
2179 unsigned long seq;
2180 int mode;
2181 u64 ns;
2182
2183 do {
2184 seq = read_seqcount_begin(&gtod->seq);
55dd00a7 2185 ts->tv_sec = gtod->wall_time_sec;
917f9475 2186 ns = gtod->clock.base_cycles;
53fafdbb 2187 ns += vgettsc(&gtod->clock, tsc_timestamp, &mode);
55dd00a7
MT
2188 ns >>= gtod->clock.shift;
2189 } while (unlikely(read_seqcount_retry(&gtod->seq, seq)));
2190
2191 ts->tv_sec += __iter_div_u64_rem(ns, NSEC_PER_SEC, &ns);
2192 ts->tv_nsec = ns;
2193
2194 return mode;
2195}
2196
b0c39dc6
VK
2197/* returns true if host is using TSC based clocksource */
2198static bool kvm_get_time_and_clockread(s64 *kernel_ns, u64 *tsc_timestamp)
d828199e 2199{
d828199e 2200 /* checked again under seqlock below */
b0c39dc6 2201 if (!gtod_is_based_on_tsc(pvclock_gtod_data.clock.vclock_mode))
d828199e
MT
2202 return false;
2203
53fafdbb 2204 return gtod_is_based_on_tsc(do_monotonic_raw(kernel_ns,
b0c39dc6 2205 tsc_timestamp));
d828199e 2206}
55dd00a7 2207
b0c39dc6 2208/* returns true if host is using TSC based clocksource */
899a31f5 2209static bool kvm_get_walltime_and_clockread(struct timespec64 *ts,
b0c39dc6 2210 u64 *tsc_timestamp)
55dd00a7
MT
2211{
2212 /* checked again under seqlock below */
b0c39dc6 2213 if (!gtod_is_based_on_tsc(pvclock_gtod_data.clock.vclock_mode))
55dd00a7
MT
2214 return false;
2215
b0c39dc6 2216 return gtod_is_based_on_tsc(do_realtime(ts, tsc_timestamp));
55dd00a7 2217}
d828199e
MT
2218#endif
2219
2220/*
2221 *
b48aa97e
MT
2222 * Assuming a stable TSC across physical CPUs, and a stable TSC
2223 * across virtual CPUs, the following condition is possible.
2224 * Each numbered line represents an event visible to both
d828199e
MT
2225 * CPUs at the next numbered event.
2226 *
2227 * "timespecX" represents host monotonic time. "tscX" represents
2228 * RDTSC value.
2229 *
2230 * VCPU0 on CPU0 | VCPU1 on CPU1
2231 *
2232 * 1. read timespec0,tsc0
2233 * 2. | timespec1 = timespec0 + N
2234 * | tsc1 = tsc0 + M
2235 * 3. transition to guest | transition to guest
2236 * 4. ret0 = timespec0 + (rdtsc - tsc0) |
2237 * 5. | ret1 = timespec1 + (rdtsc - tsc1)
2238 * | ret1 = timespec0 + N + (rdtsc - (tsc0 + M))
2239 *
2240 * Since ret0 update is visible to VCPU1 at time 5, to obey monotonicity:
2241 *
2242 * - ret0 < ret1
2243 * - timespec0 + (rdtsc - tsc0) < timespec0 + N + (rdtsc - (tsc0 + M))
2244 * ...
2245 * - 0 < N - M => M < N
2246 *
2247 * That is, when timespec0 != timespec1, M < N. Unfortunately that is not
2248 * always the case (the difference between two distinct xtime instances
2249 * might be smaller than the difference between corresponding TSC reads,
2250 * when updating guest vcpus pvclock areas).
2251 *
2252 * To avoid that problem, do not allow visibility of distinct
2253 * system_timestamp/tsc_timestamp values simultaneously: use a master
2254 * copy of host monotonic time values. Update that master copy
2255 * in lockstep.
2256 *
b48aa97e 2257 * Rely on synchronization of host TSCs and guest TSCs for monotonicity.
d828199e
MT
2258 *
2259 */
2260
2261static void pvclock_update_vm_gtod_copy(struct kvm *kvm)
2262{
2263#ifdef CONFIG_X86_64
2264 struct kvm_arch *ka = &kvm->arch;
2265 int vclock_mode;
b48aa97e
MT
2266 bool host_tsc_clocksource, vcpus_matched;
2267
2268 vcpus_matched = (ka->nr_vcpus_matched_tsc + 1 ==
2269 atomic_read(&kvm->online_vcpus));
d828199e
MT
2270
2271 /*
2272 * If the host uses TSC clock, then passthrough TSC as stable
2273 * to the guest.
2274 */
b48aa97e 2275 host_tsc_clocksource = kvm_get_time_and_clockread(
d828199e
MT
2276 &ka->master_kernel_ns,
2277 &ka->master_cycle_now);
2278
16a96021 2279 ka->use_master_clock = host_tsc_clocksource && vcpus_matched
a826faf1 2280 && !ka->backwards_tsc_observed
54750f2c 2281 && !ka->boot_vcpu_runs_old_kvmclock;
b48aa97e 2282
d828199e
MT
2283 if (ka->use_master_clock)
2284 atomic_set(&kvm_guest_has_master_clock, 1);
2285
2286 vclock_mode = pvclock_gtod_data.clock.vclock_mode;
b48aa97e
MT
2287 trace_kvm_update_master_clock(ka->use_master_clock, vclock_mode,
2288 vcpus_matched);
d828199e
MT
2289#endif
2290}
2291
2860c4b1
PB
2292void kvm_make_mclock_inprogress_request(struct kvm *kvm)
2293{
2294 kvm_make_all_cpus_request(kvm, KVM_REQ_MCLOCK_INPROGRESS);
2295}
2296
2e762ff7
MT
2297static void kvm_gen_update_masterclock(struct kvm *kvm)
2298{
2299#ifdef CONFIG_X86_64
2300 int i;
2301 struct kvm_vcpu *vcpu;
2302 struct kvm_arch *ka = &kvm->arch;
2303
2304 spin_lock(&ka->pvclock_gtod_sync_lock);
2305 kvm_make_mclock_inprogress_request(kvm);
2306 /* no guest entries from this point */
2307 pvclock_update_vm_gtod_copy(kvm);
2308
2309 kvm_for_each_vcpu(i, vcpu, kvm)
105b21bb 2310 kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
2e762ff7
MT
2311
2312 /* guest entries allowed */
2313 kvm_for_each_vcpu(i, vcpu, kvm)
72875d8a 2314 kvm_clear_request(KVM_REQ_MCLOCK_INPROGRESS, vcpu);
2e762ff7
MT
2315
2316 spin_unlock(&ka->pvclock_gtod_sync_lock);
2317#endif
2318}
2319
e891a32e 2320u64 get_kvmclock_ns(struct kvm *kvm)
108b249c 2321{
108b249c 2322 struct kvm_arch *ka = &kvm->arch;
8b953440 2323 struct pvclock_vcpu_time_info hv_clock;
e2c2206a 2324 u64 ret;
108b249c 2325
8b953440
PB
2326 spin_lock(&ka->pvclock_gtod_sync_lock);
2327 if (!ka->use_master_clock) {
2328 spin_unlock(&ka->pvclock_gtod_sync_lock);
8171cd68 2329 return get_kvmclock_base_ns() + ka->kvmclock_offset;
108b249c
PB
2330 }
2331
8b953440
PB
2332 hv_clock.tsc_timestamp = ka->master_cycle_now;
2333 hv_clock.system_time = ka->master_kernel_ns + ka->kvmclock_offset;
2334 spin_unlock(&ka->pvclock_gtod_sync_lock);
2335
e2c2206a
WL
2336 /* both __this_cpu_read() and rdtsc() should be on the same cpu */
2337 get_cpu();
2338
e70b57a6
WL
2339 if (__this_cpu_read(cpu_tsc_khz)) {
2340 kvm_get_time_scale(NSEC_PER_SEC, __this_cpu_read(cpu_tsc_khz) * 1000LL,
2341 &hv_clock.tsc_shift,
2342 &hv_clock.tsc_to_system_mul);
2343 ret = __pvclock_read_cycles(&hv_clock, rdtsc());
2344 } else
8171cd68 2345 ret = get_kvmclock_base_ns() + ka->kvmclock_offset;
e2c2206a
WL
2346
2347 put_cpu();
2348
2349 return ret;
108b249c
PB
2350}
2351
0d6dd2ff
PB
2352static void kvm_setup_pvclock_page(struct kvm_vcpu *v)
2353{
2354 struct kvm_vcpu_arch *vcpu = &v->arch;
2355 struct pvclock_vcpu_time_info guest_hv_clock;
2356
4e335d9e 2357 if (unlikely(kvm_read_guest_cached(v->kvm, &vcpu->pv_time,
0d6dd2ff
PB
2358 &guest_hv_clock, sizeof(guest_hv_clock))))
2359 return;
2360
2361 /* This VCPU is paused, but it's legal for a guest to read another
2362 * VCPU's kvmclock, so we really have to follow the specification where
2363 * it says that version is odd if data is being modified, and even after
2364 * it is consistent.
2365 *
2366 * Version field updates must be kept separate. This is because
2367 * kvm_write_guest_cached might use a "rep movs" instruction, and
2368 * writes within a string instruction are weakly ordered. So there
2369 * are three writes overall.
2370 *
2371 * As a small optimization, only write the version field in the first
2372 * and third write. The vcpu->pv_time cache is still valid, because the
2373 * version field is the first in the struct.
2374 */
2375 BUILD_BUG_ON(offsetof(struct pvclock_vcpu_time_info, version) != 0);
2376
51c4b8bb
LA
2377 if (guest_hv_clock.version & 1)
2378 ++guest_hv_clock.version; /* first time write, random junk */
2379
0d6dd2ff 2380 vcpu->hv_clock.version = guest_hv_clock.version + 1;
4e335d9e
PB
2381 kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
2382 &vcpu->hv_clock,
2383 sizeof(vcpu->hv_clock.version));
0d6dd2ff
PB
2384
2385 smp_wmb();
2386
2387 /* retain PVCLOCK_GUEST_STOPPED if set in guest copy */
2388 vcpu->hv_clock.flags |= (guest_hv_clock.flags & PVCLOCK_GUEST_STOPPED);
2389
2390 if (vcpu->pvclock_set_guest_stopped_request) {
2391 vcpu->hv_clock.flags |= PVCLOCK_GUEST_STOPPED;
2392 vcpu->pvclock_set_guest_stopped_request = false;
2393 }
2394
2395 trace_kvm_pvclock_update(v->vcpu_id, &vcpu->hv_clock);
2396
4e335d9e
PB
2397 kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
2398 &vcpu->hv_clock,
2399 sizeof(vcpu->hv_clock));
0d6dd2ff
PB
2400
2401 smp_wmb();
2402
2403 vcpu->hv_clock.version++;
4e335d9e
PB
2404 kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
2405 &vcpu->hv_clock,
2406 sizeof(vcpu->hv_clock.version));
0d6dd2ff
PB
2407}
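/*
 * For reference, a minimal sketch of the matching guest-side reader (purely
 * illustrative -- the guest's actual pvclock code lives elsewhere; "hv" here
 * stands for a guest mapping of the pvclock_vcpu_time_info page):
 *
 *	do {
 *		version = READ_ONCE(hv->version);
 *		smp_rmb();
 *		... copy tsc_timestamp/system_time/flags ...
 *		smp_rmb();
 *	} while ((version & 1) || version != READ_ONCE(hv->version));
 *
 * which is why the three ordered writes above bracket the payload with an odd
 * version value.
 */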
2408
34c238a1 2409static int kvm_guest_time_update(struct kvm_vcpu *v)
18068523 2410{
78db6a50 2411 unsigned long flags, tgt_tsc_khz;
18068523 2412 struct kvm_vcpu_arch *vcpu = &v->arch;
d828199e 2413 struct kvm_arch *ka = &v->kvm->arch;
f25e656d 2414 s64 kernel_ns;
d828199e 2415 u64 tsc_timestamp, host_tsc;
51d59c6b 2416 u8 pvclock_flags;
d828199e
MT
2417 bool use_master_clock;
2418
2419 kernel_ns = 0;
2420 host_tsc = 0;
18068523 2421
d828199e
MT
2422 /*
2423 * If the host uses TSC clock, then passthrough TSC as stable
2424 * to the guest.
2425 */
2426 spin_lock(&ka->pvclock_gtod_sync_lock);
2427 use_master_clock = ka->use_master_clock;
2428 if (use_master_clock) {
2429 host_tsc = ka->master_cycle_now;
2430 kernel_ns = ka->master_kernel_ns;
2431 }
2432 spin_unlock(&ka->pvclock_gtod_sync_lock);
c09664bb
MT
2433
2434 /* Keep irq disabled to prevent changes to the clock */
2435 local_irq_save(flags);
78db6a50
PB
2436 tgt_tsc_khz = __this_cpu_read(cpu_tsc_khz);
2437 if (unlikely(tgt_tsc_khz == 0)) {
c09664bb
MT
2438 local_irq_restore(flags);
2439 kvm_make_request(KVM_REQ_CLOCK_UPDATE, v);
2440 return 1;
2441 }
d828199e 2442 if (!use_master_clock) {
4ea1636b 2443 host_tsc = rdtsc();
8171cd68 2444 kernel_ns = get_kvmclock_base_ns();
d828199e
MT
2445 }
2446
4ba76538 2447 tsc_timestamp = kvm_read_l1_tsc(v, host_tsc);
d828199e 2448
c285545f
ZA
2449 /*
2450 * We may have to catch up the TSC to match elapsed wall clock
2451 * time for two reasons, even if kvmclock is used.
2452 * 1) CPU could have been running below the maximum TSC rate
2453 * 2) Broken TSC compensation resets the base at each VCPU
2454 * entry to avoid unknown leaps of TSC even when running
2455 * again on the same CPU. This may cause apparent elapsed
2456 * time to disappear, and the guest to stand still or run
2457 * very slowly.
2458 */
2459 if (vcpu->tsc_catchup) {
2460 u64 tsc = compute_guest_tsc(v, kernel_ns);
2461 if (tsc > tsc_timestamp) {
f1e2b260 2462 adjust_tsc_offset_guest(v, tsc - tsc_timestamp);
c285545f
ZA
2463 tsc_timestamp = tsc;
2464 }
50d0a0f9
GH
2465 }
2466
18068523
GOC
2467 local_irq_restore(flags);
2468
0d6dd2ff 2469 /* With all the info we got, fill in the values */
18068523 2470
78db6a50
PB
2471 if (kvm_has_tsc_control)
2472 tgt_tsc_khz = kvm_scale_tsc(v, tgt_tsc_khz);
2473
2474 if (unlikely(vcpu->hw_tsc_khz != tgt_tsc_khz)) {
3ae13faa 2475 kvm_get_time_scale(NSEC_PER_SEC, tgt_tsc_khz * 1000LL,
5f4e3f88
ZA
2476 &vcpu->hv_clock.tsc_shift,
2477 &vcpu->hv_clock.tsc_to_system_mul);
78db6a50 2478 vcpu->hw_tsc_khz = tgt_tsc_khz;
8cfdc000
ZA
2479 }
2480
1d5f066e 2481 vcpu->hv_clock.tsc_timestamp = tsc_timestamp;
759379dd 2482 vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset;
28e4639a 2483 vcpu->last_guest_tsc = tsc_timestamp;
51d59c6b 2484
d828199e 2485 /* If the host uses TSC clocksource, then it is stable */
0d6dd2ff 2486 pvclock_flags = 0;
d828199e
MT
2487 if (use_master_clock)
2488 pvclock_flags |= PVCLOCK_TSC_STABLE_BIT;
2489
78c0337a
MT
2490 vcpu->hv_clock.flags = pvclock_flags;
2491
095cf55d
PB
2492 if (vcpu->pv_time_enabled)
2493 kvm_setup_pvclock_page(v);
2494 if (v == kvm_get_vcpu(v->kvm, 0))
2495 kvm_hv_setup_tsc_page(v->kvm, &vcpu->hv_clock);
8cfdc000 2496 return 0;
c8076604
GH
2497}
2498
0061d53d
MT
2499/*
2500 * kvmclock updates which are isolated to a given vcpu, such as
2501 * vcpu->cpu migration, should not allow system_timestamp from
2502 * the rest of the vcpus to remain static. Otherwise ntp frequency
2503 * correction applies to one vcpu's system_timestamp but not
2504 * the others.
2505 *
2506 * So in those cases, request a kvmclock update for all vcpus.
7e44e449
AJ
2507 * We need to rate-limit these requests though, as they can
2508 * considerably slow guests that have a large number of vcpus.
2509 * The time for a remote vcpu to update its kvmclock is bound
2510 * by the delay we use to rate-limit the updates.
0061d53d
MT
2511 */
2512
7e44e449
AJ
2513#define KVMCLOCK_UPDATE_DELAY msecs_to_jiffies(100)
2514
2515static void kvmclock_update_fn(struct work_struct *work)
0061d53d
MT
2516{
2517 int i;
7e44e449
AJ
2518 struct delayed_work *dwork = to_delayed_work(work);
2519 struct kvm_arch *ka = container_of(dwork, struct kvm_arch,
2520 kvmclock_update_work);
2521 struct kvm *kvm = container_of(ka, struct kvm, arch);
0061d53d
MT
2522 struct kvm_vcpu *vcpu;
2523
2524 kvm_for_each_vcpu(i, vcpu, kvm) {
105b21bb 2525 kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
0061d53d
MT
2526 kvm_vcpu_kick(vcpu);
2527 }
2528}
2529
7e44e449
AJ
2530static void kvm_gen_kvmclock_update(struct kvm_vcpu *v)
2531{
2532 struct kvm *kvm = v->kvm;
2533
105b21bb 2534 kvm_make_request(KVM_REQ_CLOCK_UPDATE, v);
7e44e449
AJ
2535 schedule_delayed_work(&kvm->arch.kvmclock_update_work,
2536 KVMCLOCK_UPDATE_DELAY);
2537}
2538
332967a3
AJ
2539#define KVMCLOCK_SYNC_PERIOD (300 * HZ)
2540
2541static void kvmclock_sync_fn(struct work_struct *work)
2542{
2543 struct delayed_work *dwork = to_delayed_work(work);
2544 struct kvm_arch *ka = container_of(dwork, struct kvm_arch,
2545 kvmclock_sync_work);
2546 struct kvm *kvm = container_of(ka, struct kvm, arch);
2547
630994b3
MT
2548 if (!kvmclock_periodic_sync)
2549 return;
2550
332967a3
AJ
2551 schedule_delayed_work(&kvm->arch.kvmclock_update_work, 0);
2552 schedule_delayed_work(&kvm->arch.kvmclock_sync_work,
2553 KVMCLOCK_SYNC_PERIOD);
2554}
2555
191c8137
BP
2556/*
2557 * On AMD, HWCR[McStatusWrEn] controls whether setting MCi_STATUS results in #GP.
2558 */
2559static bool can_set_mci_status(struct kvm_vcpu *vcpu)
2560{
2561 /* McStatusWrEn enabled? */
23493d0a 2562 if (guest_cpuid_is_amd_or_hygon(vcpu))
191c8137
BP
2563 return !!(vcpu->arch.msr_hwcr & BIT_ULL(18));
2564
2565 return false;
2566}
2567
9ffd986c 2568static int set_msr_mce(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
15c4a640 2569{
890ca9ae
HY
2570 u64 mcg_cap = vcpu->arch.mcg_cap;
2571 unsigned bank_num = mcg_cap & 0xff;
9ffd986c
WL
2572 u32 msr = msr_info->index;
2573 u64 data = msr_info->data;
890ca9ae 2574
15c4a640 2575 switch (msr) {
15c4a640 2576 case MSR_IA32_MCG_STATUS:
890ca9ae 2577 vcpu->arch.mcg_status = data;
15c4a640 2578 break;
c7ac679c 2579 case MSR_IA32_MCG_CTL:
44883f01
PB
2580 if (!(mcg_cap & MCG_CTL_P) &&
2581 (data || !msr_info->host_initiated))
890ca9ae
HY
2582 return 1;
2583 if (data != 0 && data != ~(u64)0)
44883f01 2584 return 1;
890ca9ae
HY
2585 vcpu->arch.mcg_ctl = data;
2586 break;
2587 default:
2588 if (msr >= MSR_IA32_MC0_CTL &&
81760dcc 2589 msr < MSR_IA32_MCx_CTL(bank_num)) {
6ec4c5ee
MP
2590 u32 offset = array_index_nospec(
2591 msr - MSR_IA32_MC0_CTL,
2592 MSR_IA32_MCx_CTL(bank_num) - MSR_IA32_MC0_CTL);
2593
114be429
AP
2594 /* Only 0 or all 1s can be written to IA32_MCi_CTL.
2595 * Some Linux kernels, though, clear bit 10 in bank 4 to
2596 * work around a BIOS/GART TLB issue on AMD K8s; ignore
2597 * this to avoid an uncaught #GP in the guest.
2598 */
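			/* In practice that means the check below accepts
			 * data == 0, data == ~0ULL, or ~0ULL with only
			 * bit 10 clear.
			 */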
890ca9ae 2599 if ((offset & 0x3) == 0 &&
114be429 2600 data != 0 && (data | (1 << 10)) != ~(u64)0)
890ca9ae 2601 return -1;
191c8137
BP
2602
2603 /* MCi_STATUS */
9ffd986c 2604 if (!msr_info->host_initiated &&
191c8137
BP
2605 (offset & 0x3) == 1 && data != 0) {
2606 if (!can_set_mci_status(vcpu))
2607 return -1;
2608 }
2609
890ca9ae
HY
2610 vcpu->arch.mce_banks[offset] = data;
2611 break;
2612 }
2613 return 1;
2614 }
2615 return 0;
2616}
2617
ffde22ac
ES
2618static int xen_hvm_config(struct kvm_vcpu *vcpu, u64 data)
2619{
2620 struct kvm *kvm = vcpu->kvm;
2621 int lm = is_long_mode(vcpu);
2622 u8 *blob_addr = lm ? (u8 *)(long)kvm->arch.xen_hvm_config.blob_addr_64
2623 : (u8 *)(long)kvm->arch.xen_hvm_config.blob_addr_32;
2624 u8 blob_size = lm ? kvm->arch.xen_hvm_config.blob_size_64
2625 : kvm->arch.xen_hvm_config.blob_size_32;
2626 u32 page_num = data & ~PAGE_MASK;
2627 u64 page_addr = data & PAGE_MASK;
2628 u8 *page;
2629 int r;
2630
2631 r = -E2BIG;
2632 if (page_num >= blob_size)
2633 goto out;
2634 r = -ENOMEM;
ff5c2c03
SL
2635 page = memdup_user(blob_addr + (page_num * PAGE_SIZE), PAGE_SIZE);
2636 if (IS_ERR(page)) {
2637 r = PTR_ERR(page);
ffde22ac 2638 goto out;
ff5c2c03 2639 }
54bf36aa 2640 if (kvm_vcpu_write_guest(vcpu, page_addr, page, PAGE_SIZE))
ffde22ac
ES
2641 goto out_free;
2642 r = 0;
2643out_free:
2644 kfree(page);
2645out:
2646 return r;
2647}
2648
344d9588
GN
2649static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data)
2650{
2651 gpa_t gpa = data & ~0x3f;
2652
52a5c155
WL
2653 /* Bits 3:5 are reserved, Should be zero */
2654 if (data & 0x38)
344d9588
GN
2655 return 1;
2656
2657 vcpu->arch.apf.msr_val = data;
2658
2659 if (!(data & KVM_ASYNC_PF_ENABLED)) {
2660 kvm_clear_async_pf_completion_queue(vcpu);
2661 kvm_async_pf_hash_reset(vcpu);
2662 return 0;
2663 }
2664
4e335d9e 2665 if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.apf.data, gpa,
8f964525 2666 sizeof(u32)))
344d9588
GN
2667 return 1;
2668
6adba527 2669 vcpu->arch.apf.send_user_only = !(data & KVM_ASYNC_PF_SEND_ALWAYS);
52a5c155 2670 vcpu->arch.apf.delivery_as_pf_vmexit = data & KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT;
344d9588
GN
2671 kvm_async_pf_wakeup_all(vcpu);
2672 return 0;
2673}
2674
12f9a48f
GC
2675static void kvmclock_reset(struct kvm_vcpu *vcpu)
2676{
0b79459b 2677 vcpu->arch.pv_time_enabled = false;
49dedf0d 2678 vcpu->arch.time = 0;
12f9a48f
GC
2679}
2680
7780938c 2681static void kvm_vcpu_flush_tlb_all(struct kvm_vcpu *vcpu)
f38a7b75
WL
2682{
2683 ++vcpu->stat.tlb_flush;
7780938c 2684 kvm_x86_ops.tlb_flush_all(vcpu);
f38a7b75
WL
2685}
2686
0baedd79
VK
2687static void kvm_vcpu_flush_tlb_guest(struct kvm_vcpu *vcpu)
2688{
2689 ++vcpu->stat.tlb_flush;
2690 kvm_x86_ops.tlb_flush_guest(vcpu);
2691}
2692
c9aaa895
GC
2693static void record_steal_time(struct kvm_vcpu *vcpu)
2694{
b0431382
BO
2695 struct kvm_host_map map;
2696 struct kvm_steal_time *st;
2697
c9aaa895
GC
2698 if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
2699 return;
2700
b0431382
BO
2701 /* -EAGAIN is returned in atomic context so we can just return. */
2702 if (kvm_map_gfn(vcpu, vcpu->arch.st.msr_val >> PAGE_SHIFT,
2703 &map, &vcpu->arch.st.cache, false))
c9aaa895
GC
2704 return;
2705
b0431382
BO
2706 st = map.hva +
2707 offset_in_page(vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS);
2708
f38a7b75
WL
2709 /*
2710 * Doing a TLB flush here, on the guest's behalf, can avoid
2711 * expensive IPIs.
2712 */
b382f44e 2713 trace_kvm_pv_tlb_flush(vcpu->vcpu_id,
b0431382
BO
2714 st->preempted & KVM_VCPU_FLUSH_TLB);
2715 if (xchg(&st->preempted, 0) & KVM_VCPU_FLUSH_TLB)
0baedd79 2716 kvm_vcpu_flush_tlb_guest(vcpu);
0b9f6c46 2717
a6bd811f 2718 vcpu->arch.st.preempted = 0;
35f3fae1 2719
b0431382
BO
2720 if (st->version & 1)
2721 st->version += 1; /* first time write, random junk */
35f3fae1 2722
b0431382 2723 st->version += 1;
35f3fae1
WL
2724
2725 smp_wmb();
2726
b0431382 2727 st->steal += current->sched_info.run_delay -
c54cdf14
LC
2728 vcpu->arch.st.last_steal;
2729 vcpu->arch.st.last_steal = current->sched_info.run_delay;
35f3fae1 2730
35f3fae1
WL
2731 smp_wmb();
2732
b0431382 2733 st->version += 1;
c9aaa895 2734
b0431382 2735 kvm_unmap_gfn(vcpu, &map, &vcpu->arch.st.cache, true, false);
c9aaa895
GC
2736}
2737
8fe8ab46 2738int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
15c4a640 2739{
5753785f 2740 bool pr = false;
8fe8ab46
WA
2741 u32 msr = msr_info->index;
2742 u64 data = msr_info->data;
5753785f 2743
15c4a640 2744 switch (msr) {
2e32b719 2745 case MSR_AMD64_NB_CFG:
2e32b719
BP
2746 case MSR_IA32_UCODE_WRITE:
2747 case MSR_VM_HSAVE_PA:
2748 case MSR_AMD64_PATCH_LOADER:
2749 case MSR_AMD64_BU_CFG2:
405a353a 2750 case MSR_AMD64_DC_CFG:
0e1b869f 2751 case MSR_F15H_EX_CFG:
2e32b719
BP
2752 break;
2753
518e7b94
WL
2754 case MSR_IA32_UCODE_REV:
2755 if (msr_info->host_initiated)
2756 vcpu->arch.microcode_version = data;
2757 break;
0cf9135b
SC
2758 case MSR_IA32_ARCH_CAPABILITIES:
2759 if (!msr_info->host_initiated)
2760 return 1;
2761 vcpu->arch.arch_capabilities = data;
2762 break;
15c4a640 2763 case MSR_EFER:
11988499 2764 return set_efer(vcpu, msr_info);
8f1589d9
AP
2765 case MSR_K7_HWCR:
2766 data &= ~(u64)0x40; /* ignore flush filter disable */
82494028 2767 data &= ~(u64)0x100; /* ignore ignne emulation enable */
a223c313 2768 data &= ~(u64)0x8; /* ignore TLB cache disable */
191c8137
BP
2769
2770 /* Handle McStatusWrEn */
2771 if (data == BIT_ULL(18)) {
2772 vcpu->arch.msr_hwcr = data;
2773 } else if (data != 0) {
a737f256
CD
2774 vcpu_unimpl(vcpu, "unimplemented HWCR wrmsr: 0x%llx\n",
2775 data);
8f1589d9
AP
2776 return 1;
2777 }
15c4a640 2778 break;
f7c6d140
AP
2779 case MSR_FAM10H_MMIO_CONF_BASE:
2780 if (data != 0) {
a737f256
CD
2781 vcpu_unimpl(vcpu, "unimplemented MMIO_CONF_BASE wrmsr: "
2782 "0x%llx\n", data);
f7c6d140
AP
2783 return 1;
2784 }
15c4a640 2785 break;
b5e2fec0
AG
2786 case MSR_IA32_DEBUGCTLMSR:
2787 if (!data) {
2788 /* We support the non-activated case already */
2789 break;
2790 } else if (data & ~(DEBUGCTLMSR_LBR | DEBUGCTLMSR_BTF)) {
2791 /* Values other than LBR and BTF are vendor-specific,
2792 * thus reserved and should throw a #GP */
2793 return 1;
2794 }
a737f256
CD
2795 vcpu_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTLMSR 0x%llx, nop\n",
2796 __func__, data);
b5e2fec0 2797 break;
9ba075a6 2798 case 0x200 ... 0x2ff:
ff53604b 2799 return kvm_mtrr_set_msr(vcpu, msr, data);
15c4a640 2800 case MSR_IA32_APICBASE:
58cb628d 2801 return kvm_set_apic_base(vcpu, msr_info);
0105d1a5
GN
2802 case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff:
2803 return kvm_x2apic_msr_write(vcpu, msr, data);
a3e06bbe
LJ
2804 case MSR_IA32_TSCDEADLINE:
2805 kvm_set_lapic_tscdeadline_msr(vcpu, data);
2806 break;
ba904635 2807 case MSR_IA32_TSC_ADJUST:
d6321d49 2808 if (guest_cpuid_has(vcpu, X86_FEATURE_TSC_ADJUST)) {
ba904635 2809 if (!msr_info->host_initiated) {
d913b904 2810 s64 adj = data - vcpu->arch.ia32_tsc_adjust_msr;
d7add054 2811 adjust_tsc_offset_guest(vcpu, adj);
ba904635
WA
2812 }
2813 vcpu->arch.ia32_tsc_adjust_msr = data;
2814 }
2815 break;
15c4a640 2816 case MSR_IA32_MISC_ENABLE:
511a8556
WL
2817 if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT) &&
2818 ((vcpu->arch.ia32_misc_enable_msr ^ data) & MSR_IA32_MISC_ENABLE_MWAIT)) {
2819 if (!guest_cpuid_has(vcpu, X86_FEATURE_XMM3))
2820 return 1;
2821 vcpu->arch.ia32_misc_enable_msr = data;
2822 kvm_update_cpuid(vcpu);
2823 } else {
2824 vcpu->arch.ia32_misc_enable_msr = data;
2825 }
15c4a640 2826 break;
64d60670
PB
2827 case MSR_IA32_SMBASE:
2828 if (!msr_info->host_initiated)
2829 return 1;
2830 vcpu->arch.smbase = data;
2831 break;
73f624f4
PB
2832 case MSR_IA32_POWER_CTL:
2833 vcpu->arch.msr_ia32_power_ctl = data;
2834 break;
dd259935
PB
2835 case MSR_IA32_TSC:
2836 kvm_write_tsc(vcpu, msr_info);
2837 break;
864e2ab2
AL
2838 case MSR_IA32_XSS:
2839 if (!msr_info->host_initiated &&
2840 !guest_cpuid_has(vcpu, X86_FEATURE_XSAVES))
2841 return 1;
2842 /*
a1bead2a
SC
2843 * KVM supports exposing PT to the guest, but does not support
2844 * IA32_XSS[bit 8]. Guests have to use RDMSR/WRMSR rather than
2845 * XSAVES/XRSTORS to save/restore PT MSRs.
864e2ab2 2846 */
408e9a31 2847 if (data & ~supported_xss)
864e2ab2
AL
2848 return 1;
2849 vcpu->arch.ia32_xss = data;
2850 break;
52797bf9
LA
2851 case MSR_SMI_COUNT:
2852 if (!msr_info->host_initiated)
2853 return 1;
2854 vcpu->arch.smi_count = data;
2855 break;
11c6bffa 2856 case MSR_KVM_WALL_CLOCK_NEW:
18068523
GOC
2857 case MSR_KVM_WALL_CLOCK:
2858 vcpu->kvm->arch.wall_clock = data;
2859 kvm_write_wall_clock(vcpu->kvm, data);
2860 break;
11c6bffa 2861 case MSR_KVM_SYSTEM_TIME_NEW:
18068523 2862 case MSR_KVM_SYSTEM_TIME: {
54750f2c
MT
2863 struct kvm_arch *ka = &vcpu->kvm->arch;
2864
54750f2c
MT
2865 if (vcpu->vcpu_id == 0 && !msr_info->host_initiated) {
2866 bool tmp = (msr == MSR_KVM_SYSTEM_TIME);
2867
2868 if (ka->boot_vcpu_runs_old_kvmclock != tmp)
1bd2009e 2869 kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
54750f2c
MT
2870
2871 ka->boot_vcpu_runs_old_kvmclock = tmp;
2872 }
2873
18068523 2874 vcpu->arch.time = data;
0061d53d 2875 kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu);
18068523
GOC
2876
2877 /* we verify if the enable bit is set... */
49dedf0d 2878 vcpu->arch.pv_time_enabled = false;
18068523
GOC
2879 if (!(data & 1))
2880 break;
2881
49dedf0d 2882 if (!kvm_gfn_to_hva_cache_init(vcpu->kvm,
8f964525
AH
2883 &vcpu->arch.pv_time, data & ~1ULL,
2884 sizeof(struct pvclock_vcpu_time_info)))
0b79459b 2885 vcpu->arch.pv_time_enabled = true;
32cad84f 2886
18068523
GOC
2887 break;
2888 }
344d9588
GN
2889 case MSR_KVM_ASYNC_PF_EN:
2890 if (kvm_pv_enable_async_pf(vcpu, data))
2891 return 1;
2892 break;
c9aaa895
GC
2893 case MSR_KVM_STEAL_TIME:
2894
2895 if (unlikely(!sched_info_on()))
2896 return 1;
2897
2898 if (data & KVM_STEAL_RESERVED_MASK)
2899 return 1;
2900
c9aaa895
GC
2901 vcpu->arch.st.msr_val = data;
2902
2903 if (!(data & KVM_MSR_ENABLED))
2904 break;
2905
c9aaa895
GC
2906 kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu);
2907
2908 break;
ae7a2a3f 2909 case MSR_KVM_PV_EOI_EN:
72bbf935 2910 if (kvm_lapic_enable_pv_eoi(vcpu, data, sizeof(u8)))
ae7a2a3f
MT
2911 return 1;
2912 break;
c9aaa895 2913
2d5ba19b
MT
2914 case MSR_KVM_POLL_CONTROL:
2915 /* only enable bit supported */
2916 if (data & (-1ULL << 1))
2917 return 1;
2918
2919 vcpu->arch.msr_kvm_poll_control = data;
2920 break;
2921
890ca9ae
HY
2922 case MSR_IA32_MCG_CTL:
2923 case MSR_IA32_MCG_STATUS:
81760dcc 2924 case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1:
9ffd986c 2925 return set_msr_mce(vcpu, msr_info);
71db6023 2926
6912ac32
WH
2927 case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3:
2928 case MSR_P6_PERFCTR0 ... MSR_P6_PERFCTR1:
2929 pr = true; /* fall through */
2930 case MSR_K7_EVNTSEL0 ... MSR_K7_EVNTSEL3:
2931 case MSR_P6_EVNTSEL0 ... MSR_P6_EVNTSEL1:
c6702c9d 2932 if (kvm_pmu_is_valid_msr(vcpu, msr))
afd80d85 2933 return kvm_pmu_set_msr(vcpu, msr_info);
5753785f
GN
2934
2935 if (pr || data != 0)
a737f256
CD
2936 vcpu_unimpl(vcpu, "disabled perfctr wrmsr: "
2937 "0x%x data 0x%llx\n", msr, data);
5753785f 2938 break;
84e0cefa
JS
2939 case MSR_K7_CLK_CTL:
2940 /*
2941 * Ignore all writes to this no longer documented MSR.
2942 * Writes are only relevant for old K7 processors,
2943 * all pre-dating SVM, but are a recommended workaround from
4a969980 2944 * AMD for these chips. It is possible to specify the
84e0cefa
JS
2945 * affected processor models on the command line, hence
2946 * the need to ignore the workaround.
2947 */
2948 break;
55cd8e5a 2949 case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15:
e7d9513b
AS
2950 case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4:
2951 case HV_X64_MSR_CRASH_CTL:
1f4b34f8 2952 case HV_X64_MSR_STIMER0_CONFIG ... HV_X64_MSR_STIMER3_COUNT:
a2e164e7
VK
2953 case HV_X64_MSR_REENLIGHTENMENT_CONTROL:
2954 case HV_X64_MSR_TSC_EMULATION_CONTROL:
2955 case HV_X64_MSR_TSC_EMULATION_STATUS:
e7d9513b
AS
2956 return kvm_hv_set_msr_common(vcpu, msr, data,
2957 msr_info->host_initiated);
91c9c3ed 2958 case MSR_IA32_BBL_CR_CTL3:
2959 /* Drop writes to this legacy MSR -- see rdmsr
2960 * counterpart for further detail.
2961 */
fab0aa3b
EM
2962 if (report_ignored_msrs)
2963 vcpu_unimpl(vcpu, "ignored wrmsr: 0x%x data 0x%llx\n",
2964 msr, data);
91c9c3ed 2965 break;
2b036c6b 2966 case MSR_AMD64_OSVW_ID_LENGTH:
d6321d49 2967 if (!guest_cpuid_has(vcpu, X86_FEATURE_OSVW))
2b036c6b
BO
2968 return 1;
2969 vcpu->arch.osvw.length = data;
2970 break;
2971 case MSR_AMD64_OSVW_STATUS:
d6321d49 2972 if (!guest_cpuid_has(vcpu, X86_FEATURE_OSVW))
2b036c6b
BO
2973 return 1;
2974 vcpu->arch.osvw.status = data;
2975 break;
db2336a8
KH
2976 case MSR_PLATFORM_INFO:
2977 if (!msr_info->host_initiated ||
db2336a8
KH
2978 (!(data & MSR_PLATFORM_INFO_CPUID_FAULT) &&
2979 cpuid_fault_enabled(vcpu)))
2980 return 1;
2981 vcpu->arch.msr_platform_info = data;
2982 break;
2983 case MSR_MISC_FEATURES_ENABLES:
2984 if (data & ~MSR_MISC_FEATURES_ENABLES_CPUID_FAULT ||
2985 (data & MSR_MISC_FEATURES_ENABLES_CPUID_FAULT &&
2986 !supports_cpuid_fault(vcpu)))
2987 return 1;
2988 vcpu->arch.msr_misc_features_enables = data;
2989 break;
15c4a640 2990 default:
ffde22ac
ES
2991 if (msr && (msr == vcpu->kvm->arch.xen_hvm_config.msr))
2992 return xen_hvm_config(vcpu, data);
c6702c9d 2993 if (kvm_pmu_is_valid_msr(vcpu, msr))
afd80d85 2994 return kvm_pmu_set_msr(vcpu, msr_info);
ed85c068 2995 if (!ignore_msrs) {
ae0f5499 2996 vcpu_debug_ratelimited(vcpu, "unhandled wrmsr: 0x%x data 0x%llx\n",
a737f256 2997 msr, data);
ed85c068
AP
2998 return 1;
2999 } else {
fab0aa3b
EM
3000 if (report_ignored_msrs)
3001 vcpu_unimpl(vcpu,
3002 "ignored wrmsr: 0x%x data 0x%llx\n",
3003 msr, data);
ed85c068
AP
3004 break;
3005 }
15c4a640
CO
3006 }
3007 return 0;
3008}
3009EXPORT_SYMBOL_GPL(kvm_set_msr_common);
3010
44883f01 3011static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host)
15c4a640
CO
3012{
3013 u64 data;
890ca9ae
HY
3014 u64 mcg_cap = vcpu->arch.mcg_cap;
3015 unsigned bank_num = mcg_cap & 0xff;
15c4a640
CO
3016
3017 switch (msr) {
15c4a640
CO
3018 case MSR_IA32_P5_MC_ADDR:
3019 case MSR_IA32_P5_MC_TYPE:
890ca9ae
HY
3020 data = 0;
3021 break;
15c4a640 3022 case MSR_IA32_MCG_CAP:
890ca9ae
HY
3023 data = vcpu->arch.mcg_cap;
3024 break;
c7ac679c 3025 case MSR_IA32_MCG_CTL:
44883f01 3026 if (!(mcg_cap & MCG_CTL_P) && !host)
890ca9ae
HY
3027 return 1;
3028 data = vcpu->arch.mcg_ctl;
3029 break;
3030 case MSR_IA32_MCG_STATUS:
3031 data = vcpu->arch.mcg_status;
3032 break;
3033 default:
3034 if (msr >= MSR_IA32_MC0_CTL &&
81760dcc 3035 msr < MSR_IA32_MCx_CTL(bank_num)) {
6ec4c5ee
MP
3036 u32 offset = array_index_nospec(
3037 msr - MSR_IA32_MC0_CTL,
3038 MSR_IA32_MCx_CTL(bank_num) - MSR_IA32_MC0_CTL);
3039
890ca9ae
HY
3040 data = vcpu->arch.mce_banks[offset];
3041 break;
3042 }
3043 return 1;
3044 }
3045 *pdata = data;
3046 return 0;
3047}
3048
609e36d3 3049int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
890ca9ae 3050{
609e36d3 3051 switch (msr_info->index) {
890ca9ae 3052 case MSR_IA32_PLATFORM_ID:
15c4a640 3053 case MSR_IA32_EBL_CR_POWERON:
b5e2fec0
AG
3054 case MSR_IA32_DEBUGCTLMSR:
3055 case MSR_IA32_LASTBRANCHFROMIP:
3056 case MSR_IA32_LASTBRANCHTOIP:
3057 case MSR_IA32_LASTINTFROMIP:
3058 case MSR_IA32_LASTINTTOIP:
60af2ecd 3059 case MSR_K8_SYSCFG:
3afb1121
PB
3060 case MSR_K8_TSEG_ADDR:
3061 case MSR_K8_TSEG_MASK:
61a6bd67 3062 case MSR_VM_HSAVE_PA:
1fdbd48c 3063 case MSR_K8_INT_PENDING_MSG:
c323c0e5 3064 case MSR_AMD64_NB_CFG:
f7c6d140 3065 case MSR_FAM10H_MMIO_CONF_BASE:
2e32b719 3066 case MSR_AMD64_BU_CFG2:
0c2df2a1 3067 case MSR_IA32_PERF_CTL:
405a353a 3068 case MSR_AMD64_DC_CFG:
0e1b869f 3069 case MSR_F15H_EX_CFG:
2ca1a06a
VS
3070 /*
3071 * Intel Sandy Bridge CPUs must support the RAPL (running average power
3072 * limit) MSRs. Just return 0, as we do not want to expose the host
3073 * data here. Do not conditionalize this on CPUID, as KVM does not do
3074 * so for existing CPU-specific MSRs.
3075 */
3076 case MSR_RAPL_POWER_UNIT:
3077 case MSR_PP0_ENERGY_STATUS: /* Power plane 0 (core) */
3078 case MSR_PP1_ENERGY_STATUS: /* Power plane 1 (graphics uncore) */
3079 case MSR_PKG_ENERGY_STATUS: /* Total package */
3080 case MSR_DRAM_ENERGY_STATUS: /* DRAM controller */
609e36d3 3081 msr_info->data = 0;
15c4a640 3082 break;
c51eb52b 3083 case MSR_F15H_PERF_CTL0 ... MSR_F15H_PERF_CTR5:
6912ac32
WH
3084 case MSR_K7_EVNTSEL0 ... MSR_K7_EVNTSEL3:
3085 case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3:
3086 case MSR_P6_PERFCTR0 ... MSR_P6_PERFCTR1:
3087 case MSR_P6_EVNTSEL0 ... MSR_P6_EVNTSEL1:
c6702c9d 3088 if (kvm_pmu_is_valid_msr(vcpu, msr_info->index))
609e36d3
PB
3089 return kvm_pmu_get_msr(vcpu, msr_info->index, &msr_info->data);
3090 msr_info->data = 0;
5753785f 3091 break;
742bc670 3092 case MSR_IA32_UCODE_REV:
518e7b94 3093 msr_info->data = vcpu->arch.microcode_version;
742bc670 3094 break;
0cf9135b
SC
3095 case MSR_IA32_ARCH_CAPABILITIES:
3096 if (!msr_info->host_initiated &&
3097 !guest_cpuid_has(vcpu, X86_FEATURE_ARCH_CAPABILITIES))
3098 return 1;
3099 msr_info->data = vcpu->arch.arch_capabilities;
3100 break;
73f624f4
PB
3101 case MSR_IA32_POWER_CTL:
3102 msr_info->data = vcpu->arch.msr_ia32_power_ctl;
3103 break;
dd259935
PB
3104 case MSR_IA32_TSC:
3105 msr_info->data = kvm_scale_tsc(vcpu, rdtsc()) + vcpu->arch.tsc_offset;
3106 break;
9ba075a6 3107 case MSR_MTRRcap:
9ba075a6 3108 case 0x200 ... 0x2ff:
ff53604b 3109 return kvm_mtrr_get_msr(vcpu, msr_info->index, &msr_info->data);
15c4a640 3110 case 0xcd: /* fsb frequency */
609e36d3 3111 msr_info->data = 3;
15c4a640 3112 break;
7b914098
JS
3113 /*
3114 * MSR_EBC_FREQUENCY_ID
3115 * Conservative value valid for even the basic CPU models.
3116 * Models 0,1: 000 in bits 23:21 indicating a bus speed of
3117 * 100MHz, model 2 000 in bits 18:16 indicating 100MHz,
3118 * and 266MHz for model 3, or 4. Set Core Clock
3119 * Frequency to System Bus Frequency Ratio to 1 (bits
3120 * 31:24) even though these are only valid for CPU
3121 * models > 2; otherwise guests may end up dividing or
3122 * multiplying by zero.
3123 */
3124 case MSR_EBC_FREQUENCY_ID:
609e36d3 3125 msr_info->data = 1 << 24;
7b914098 3126 break;
15c4a640 3127 case MSR_IA32_APICBASE:
609e36d3 3128 msr_info->data = kvm_get_apic_base(vcpu);
15c4a640 3129 break;
0105d1a5 3130 case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff:
609e36d3 3131 return kvm_x2apic_msr_read(vcpu, msr_info->index, &msr_info->data);
a3e06bbe 3132 case MSR_IA32_TSCDEADLINE:
609e36d3 3133 msr_info->data = kvm_get_lapic_tscdeadline_msr(vcpu);
a3e06bbe 3134 break;
ba904635 3135 case MSR_IA32_TSC_ADJUST:
609e36d3 3136 msr_info->data = (u64)vcpu->arch.ia32_tsc_adjust_msr;
ba904635 3137 break;
15c4a640 3138 case MSR_IA32_MISC_ENABLE:
609e36d3 3139 msr_info->data = vcpu->arch.ia32_misc_enable_msr;
15c4a640 3140 break;
64d60670
PB
3141 case MSR_IA32_SMBASE:
3142 if (!msr_info->host_initiated)
3143 return 1;
3144 msr_info->data = vcpu->arch.smbase;
15c4a640 3145 break;
52797bf9
LA
3146 case MSR_SMI_COUNT:
3147 msr_info->data = vcpu->arch.smi_count;
3148 break;
847f0ad8
AG
3149 case MSR_IA32_PERF_STATUS:
3150 /* TSC increment by tick */
609e36d3 3151 msr_info->data = 1000ULL;
847f0ad8 3152 /* CPU multiplier */
b0996ae4 3153 msr_info->data |= (((uint64_t)4ULL) << 40);
847f0ad8 3154 break;
15c4a640 3155 case MSR_EFER:
609e36d3 3156 msr_info->data = vcpu->arch.efer;
15c4a640 3157 break;
18068523 3158 case MSR_KVM_WALL_CLOCK:
11c6bffa 3159 case MSR_KVM_WALL_CLOCK_NEW:
609e36d3 3160 msr_info->data = vcpu->kvm->arch.wall_clock;
18068523
GOC
3161 break;
3162 case MSR_KVM_SYSTEM_TIME:
11c6bffa 3163 case MSR_KVM_SYSTEM_TIME_NEW:
609e36d3 3164 msr_info->data = vcpu->arch.time;
18068523 3165 break;
344d9588 3166 case MSR_KVM_ASYNC_PF_EN:
609e36d3 3167 msr_info->data = vcpu->arch.apf.msr_val;
344d9588 3168 break;
c9aaa895 3169 case MSR_KVM_STEAL_TIME:
609e36d3 3170 msr_info->data = vcpu->arch.st.msr_val;
c9aaa895 3171 break;
1d92128f 3172 case MSR_KVM_PV_EOI_EN:
609e36d3 3173 msr_info->data = vcpu->arch.pv_eoi.msr_val;
1d92128f 3174 break;
2d5ba19b
MT
3175 case MSR_KVM_POLL_CONTROL:
3176 msr_info->data = vcpu->arch.msr_kvm_poll_control;
3177 break;
890ca9ae
HY
3178 case MSR_IA32_P5_MC_ADDR:
3179 case MSR_IA32_P5_MC_TYPE:
3180 case MSR_IA32_MCG_CAP:
3181 case MSR_IA32_MCG_CTL:
3182 case MSR_IA32_MCG_STATUS:
81760dcc 3183 case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1:
44883f01
PB
3184 return get_msr_mce(vcpu, msr_info->index, &msr_info->data,
3185 msr_info->host_initiated);
864e2ab2
AL
3186 case MSR_IA32_XSS:
3187 if (!msr_info->host_initiated &&
3188 !guest_cpuid_has(vcpu, X86_FEATURE_XSAVES))
3189 return 1;
3190 msr_info->data = vcpu->arch.ia32_xss;
3191 break;
84e0cefa
JS
3192 case MSR_K7_CLK_CTL:
3193 /*
3194 * Provide the expected ramp-up count for K7. All others
3195 * are set to zero, indicating minimum divisors for
3196 * every field.
3197 *
3198 * This prevents guest kernels on AMD host with CPU
3199 * type 6, model 8 and higher from exploding due to
3200 * the rdmsr failing.
3201 */
609e36d3 3202 msr_info->data = 0x20000000;
84e0cefa 3203 break;
55cd8e5a 3204 case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15:
e7d9513b
AS
3205 case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4:
3206 case HV_X64_MSR_CRASH_CTL:
1f4b34f8 3207 case HV_X64_MSR_STIMER0_CONFIG ... HV_X64_MSR_STIMER3_COUNT:
a2e164e7
VK
3208 case HV_X64_MSR_REENLIGHTENMENT_CONTROL:
3209 case HV_X64_MSR_TSC_EMULATION_CONTROL:
3210 case HV_X64_MSR_TSC_EMULATION_STATUS:
e83d5887 3211 return kvm_hv_get_msr_common(vcpu,
44883f01
PB
3212 msr_info->index, &msr_info->data,
3213 msr_info->host_initiated);
91c9c3ed 3214 case MSR_IA32_BBL_CR_CTL3:
3215 /* This legacy MSR exists but isn't fully documented in current
3216 * silicon. It is however accessed by winxp in very narrow
3217 * scenarios where it sets bit #19, itself documented as
3218 * a "reserved" bit. Best effort attempt to source coherent
3219 * read data here should the balance of the register be
3220 * interpreted by the guest:
3221 *
3222 * L2 cache control register 3: 64GB range, 256KB size,
3223 * enabled, latency 0x1, configured
3224 */
609e36d3 3225 msr_info->data = 0xbe702111;
91c9c3ed 3226 break;
2b036c6b 3227 case MSR_AMD64_OSVW_ID_LENGTH:
d6321d49 3228 if (!guest_cpuid_has(vcpu, X86_FEATURE_OSVW))
2b036c6b 3229 return 1;
609e36d3 3230 msr_info->data = vcpu->arch.osvw.length;
2b036c6b
BO
3231 break;
3232 case MSR_AMD64_OSVW_STATUS:
d6321d49 3233 if (!guest_cpuid_has(vcpu, X86_FEATURE_OSVW))
2b036c6b 3234 return 1;
609e36d3 3235 msr_info->data = vcpu->arch.osvw.status;
2b036c6b 3236 break;
db2336a8 3237 case MSR_PLATFORM_INFO:
6fbbde9a
DS
3238 if (!msr_info->host_initiated &&
3239 !vcpu->kvm->arch.guest_can_read_msr_platform_info)
3240 return 1;
db2336a8
KH
3241 msr_info->data = vcpu->arch.msr_platform_info;
3242 break;
3243 case MSR_MISC_FEATURES_ENABLES:
3244 msr_info->data = vcpu->arch.msr_misc_features_enables;
3245 break;
191c8137
BP
3246 case MSR_K7_HWCR:
3247 msr_info->data = vcpu->arch.msr_hwcr;
3248 break;
15c4a640 3249 default:
c6702c9d 3250 if (kvm_pmu_is_valid_msr(vcpu, msr_info->index))
609e36d3 3251 return kvm_pmu_get_msr(vcpu, msr_info->index, &msr_info->data);
ed85c068 3252 if (!ignore_msrs) {
ae0f5499
BD
3253 vcpu_debug_ratelimited(vcpu, "unhandled rdmsr: 0x%x\n",
3254 msr_info->index);
ed85c068
AP
3255 return 1;
3256 } else {
fab0aa3b
EM
3257 if (report_ignored_msrs)
3258 vcpu_unimpl(vcpu, "ignored rdmsr: 0x%x\n",
3259 msr_info->index);
609e36d3 3260 msr_info->data = 0;
ed85c068
AP
3261 }
3262 break;
15c4a640 3263 }
15c4a640
CO
3264 return 0;
3265}
3266EXPORT_SYMBOL_GPL(kvm_get_msr_common);
3267
313a3dc7
CO
3268/*
3269 * Read or write a bunch of msrs. All parameters are kernel addresses.
3270 *
3271 * @return number of msrs processed successfully.
3272 */
3273static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs,
3274 struct kvm_msr_entry *entries,
3275 int (*do_msr)(struct kvm_vcpu *vcpu,
3276 unsigned index, u64 *data))
3277{
801e459a 3278 int i;
313a3dc7 3279
313a3dc7
CO
3280 for (i = 0; i < msrs->nmsrs; ++i)
3281 if (do_msr(vcpu, entries[i].index, &entries[i].data))
3282 break;
3283
313a3dc7
CO
3284 return i;
3285}
3286
3287/*
3288 * Read or write a bunch of msrs. Parameters are user addresses.
3289 *
3290 * @return number of msrs processed successfully.
3291 */
3292static int msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs __user *user_msrs,
3293 int (*do_msr)(struct kvm_vcpu *vcpu,
3294 unsigned index, u64 *data),
3295 int writeback)
3296{
3297 struct kvm_msrs msrs;
3298 struct kvm_msr_entry *entries;
3299 int r, n;
3300 unsigned size;
3301
3302 r = -EFAULT;
0e96f31e 3303 if (copy_from_user(&msrs, user_msrs, sizeof(msrs)))
313a3dc7
CO
3304 goto out;
3305
3306 r = -E2BIG;
3307 if (msrs.nmsrs >= MAX_IO_MSRS)
3308 goto out;
3309
313a3dc7 3310 size = sizeof(struct kvm_msr_entry) * msrs.nmsrs;
ff5c2c03
SL
3311 entries = memdup_user(user_msrs->entries, size);
3312 if (IS_ERR(entries)) {
3313 r = PTR_ERR(entries);
313a3dc7 3314 goto out;
ff5c2c03 3315 }
313a3dc7
CO
3316
3317 r = n = __msr_io(vcpu, &msrs, entries, do_msr);
3318 if (r < 0)
3319 goto out_free;
3320
3321 r = -EFAULT;
3322 if (writeback && copy_to_user(user_msrs->entries, entries, size))
3323 goto out_free;
3324
3325 r = n;
3326
3327out_free:
7a73c028 3328 kfree(entries);
313a3dc7
CO
3329out:
3330 return r;
3331}
3332
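/*
 * Illustrative sketch (editor's addition, not part of the kernel source):
 * one plausible way userspace could drive the batch path above via the
 * KVM_GET_MSRS vcpu ioctl.  vcpu_fd is assumed to be an already open vCPU
 * file descriptor, with <linux/kvm.h> and <sys/ioctl.h> included.  The
 * ioctl returns the number of entries processed, which is how __msr_io()
 * reports a partial failure.
 *
 *	struct {
 *		struct kvm_msrs hdr;
 *		struct kvm_msr_entry entries[1];
 *	} req = {
 *		.hdr.nmsrs = 1,
 *		.entries[0].index = 0xc0000080,		// MSR_EFER
 *	};
 *	int n = ioctl(vcpu_fd, KVM_GET_MSRS, &req);
 *	if (n == 1)
 *		printf("EFER = %#llx\n",
 *		       (unsigned long long)req.entries[0].data);
 */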
4d5422ce
WL
3333static inline bool kvm_can_mwait_in_guest(void)
3334{
3335 return boot_cpu_has(X86_FEATURE_MWAIT) &&
8e9b29b6
KA
3336 !boot_cpu_has_bug(X86_BUG_MONITOR) &&
3337 boot_cpu_has(X86_FEATURE_ARAT);
4d5422ce
WL
3338}
3339
784aa3d7 3340int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
018d00d2 3341{
4d5422ce 3342 int r = 0;
018d00d2
ZX
3343
3344 switch (ext) {
3345 case KVM_CAP_IRQCHIP:
3346 case KVM_CAP_HLT:
3347 case KVM_CAP_MMU_SHADOW_CACHE_CONTROL:
018d00d2 3348 case KVM_CAP_SET_TSS_ADDR:
07716717 3349 case KVM_CAP_EXT_CPUID:
9c15bb1d 3350 case KVM_CAP_EXT_EMUL_CPUID:
c8076604 3351 case KVM_CAP_CLOCKSOURCE:
7837699f 3352 case KVM_CAP_PIT:
a28e4f5a 3353 case KVM_CAP_NOP_IO_DELAY:
62d9f0db 3354 case KVM_CAP_MP_STATE:
ed848624 3355 case KVM_CAP_SYNC_MMU:
a355c85c 3356 case KVM_CAP_USER_NMI:
52d939a0 3357 case KVM_CAP_REINJECT_CONTROL:
4925663a 3358 case KVM_CAP_IRQ_INJECT_STATUS:
d34e6b17 3359 case KVM_CAP_IOEVENTFD:
f848a5a8 3360 case KVM_CAP_IOEVENTFD_NO_LENGTH:
c5ff41ce 3361 case KVM_CAP_PIT2:
e9f42757 3362 case KVM_CAP_PIT_STATE2:
b927a3ce 3363 case KVM_CAP_SET_IDENTITY_MAP_ADDR:
ffde22ac 3364 case KVM_CAP_XEN_HVM:
3cfc3092 3365 case KVM_CAP_VCPU_EVENTS:
55cd8e5a 3366 case KVM_CAP_HYPERV:
10388a07 3367 case KVM_CAP_HYPERV_VAPIC:
c25bc163 3368 case KVM_CAP_HYPERV_SPIN:
5c919412 3369 case KVM_CAP_HYPERV_SYNIC:
efc479e6 3370 case KVM_CAP_HYPERV_SYNIC2:
d3457c87 3371 case KVM_CAP_HYPERV_VP_INDEX:
faeb7833 3372 case KVM_CAP_HYPERV_EVENTFD:
c1aea919 3373 case KVM_CAP_HYPERV_TLBFLUSH:
214ff83d 3374 case KVM_CAP_HYPERV_SEND_IPI:
2bc39970 3375 case KVM_CAP_HYPERV_CPUID:
ab9f4ecb 3376 case KVM_CAP_PCI_SEGMENT:
a1efbe77 3377 case KVM_CAP_DEBUGREGS:
d2be1651 3378 case KVM_CAP_X86_ROBUST_SINGLESTEP:
2d5b5a66 3379 case KVM_CAP_XSAVE:
344d9588 3380 case KVM_CAP_ASYNC_PF:
92a1f12d 3381 case KVM_CAP_GET_TSC_KHZ:
1c0b28c2 3382 case KVM_CAP_KVMCLOCK_CTRL:
4d8b81ab 3383 case KVM_CAP_READONLY_MEM:
5f66b620 3384 case KVM_CAP_HYPERV_TIME:
100943c5 3385 case KVM_CAP_IOAPIC_POLARITY_IGNORED:
defcf51f 3386 case KVM_CAP_TSC_DEADLINE_TIMER:
90de4a18 3387 case KVM_CAP_DISABLE_QUIRKS:
d71ba788 3388 case KVM_CAP_SET_BOOT_CPU_ID:
49df6397 3389 case KVM_CAP_SPLIT_IRQCHIP:
460df4c1 3390 case KVM_CAP_IMMEDIATE_EXIT:
66bb8a06 3391 case KVM_CAP_PMU_EVENT_FILTER:
801e459a 3392 case KVM_CAP_GET_MSR_FEATURES:
6fbbde9a 3393 case KVM_CAP_MSR_PLATFORM_INFO:
c4f55198 3394 case KVM_CAP_EXCEPTION_PAYLOAD:
b9b2782c 3395 case KVM_CAP_SET_GUEST_DEBUG:
018d00d2
ZX
3396 r = 1;
3397 break;
01643c51
KH
3398 case KVM_CAP_SYNC_REGS:
3399 r = KVM_SYNC_X86_VALID_FIELDS;
3400 break;
e3fd9a93
PB
3401 case KVM_CAP_ADJUST_CLOCK:
3402 r = KVM_CLOCK_TSC_STABLE;
3403 break;
4d5422ce 3404 case KVM_CAP_X86_DISABLE_EXITS:
b5170063
WL
3405 r |= KVM_X86_DISABLE_EXITS_HLT | KVM_X86_DISABLE_EXITS_PAUSE |
3406 KVM_X86_DISABLE_EXITS_CSTATE;
4d5422ce
WL
3407 if (kvm_can_mwait_in_guest())
3408 r |= KVM_X86_DISABLE_EXITS_MWAIT;
668fffa3 3409 break;
6d396b55
PB
3410 case KVM_CAP_X86_SMM:
3411 /* SMBASE is usually relocated above 1M on modern chipsets,
3412 * and SMM handlers might indeed rely on 4G segment limits,
3413 * so do not report SMM to be available if real mode is
3414 * emulated via vm86 mode. Still, do not go to great lengths
3415 * to avoid userspace's usage of the feature, because it is a
3416 * fringe case that is not enabled except via specific settings
3417 * of the module parameters.
3418 */
afaf0b2f 3419 r = kvm_x86_ops.has_emulated_msr(MSR_IA32_SMBASE);
6d396b55 3420 break;
774ead3a 3421 case KVM_CAP_VAPIC:
afaf0b2f 3422 r = !kvm_x86_ops.cpu_has_accelerated_tpr();
774ead3a 3423 break;
f725230a 3424 case KVM_CAP_NR_VCPUS:
8c3ba334
SL
3425 r = KVM_SOFT_MAX_VCPUS;
3426 break;
3427 case KVM_CAP_MAX_VCPUS:
f725230a
AK
3428 r = KVM_MAX_VCPUS;
3429 break;
a86cb413
TH
3430 case KVM_CAP_MAX_VCPU_ID:
3431 r = KVM_MAX_VCPU_ID;
3432 break;
a68a6a72
MT
3433 case KVM_CAP_PV_MMU: /* obsolete */
3434 r = 0;
2f333bcb 3435 break;
890ca9ae
HY
3436 case KVM_CAP_MCE:
3437 r = KVM_MAX_MCE_BANKS;
3438 break;
2d5b5a66 3439 case KVM_CAP_XCRS:
d366bf7e 3440 r = boot_cpu_has(X86_FEATURE_XSAVE);
2d5b5a66 3441 break;
92a1f12d
JR
3442 case KVM_CAP_TSC_CONTROL:
3443 r = kvm_has_tsc_control;
3444 break;
37131313
RK
3445 case KVM_CAP_X2APIC_API:
3446 r = KVM_X2APIC_API_VALID_FLAGS;
3447 break;
8fcc4b59 3448 case KVM_CAP_NESTED_STATE:
33b22172
PB
3449 r = kvm_x86_ops.nested_ops->get_state ?
3450 kvm_x86_ops.nested_ops->get_state(NULL, NULL, 0) : 0;
8fcc4b59 3451 break;
344c6c80 3452 case KVM_CAP_HYPERV_DIRECT_TLBFLUSH:
afaf0b2f 3453 r = kvm_x86_ops.enable_direct_tlbflush != NULL;
5a0165f6
VK
3454 break;
3455 case KVM_CAP_HYPERV_ENLIGHTENED_VMCS:
33b22172 3456 r = kvm_x86_ops.nested_ops->enable_evmcs != NULL;
344c6c80 3457 break;
018d00d2 3458 default:
018d00d2
ZX
3459 break;
3460 }
3461 return r;
3462
3463}
3464
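/*
 * Illustrative sketch (editor's addition): kvm_vm_ioctl_check_extension()
 * is what ultimately answers a userspace KVM_CHECK_EXTENSION query, which
 * can be issued on either the /dev/kvm fd or a VM fd.  A minimal probe,
 * assuming kvm_fd is an open /dev/kvm descriptor:
 *
 *	int r = ioctl(kvm_fd, KVM_CHECK_EXTENSION, KVM_CAP_X86_DISABLE_EXITS);
 *	if (r < 0)
 *		perror("KVM_CHECK_EXTENSION");
 *	else if (r & KVM_X86_DISABLE_EXITS_MWAIT)
 *		;	// MWAIT exits can be disabled on this host
 */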
043405e1
CO
3465long kvm_arch_dev_ioctl(struct file *filp,
3466 unsigned int ioctl, unsigned long arg)
3467{
3468 void __user *argp = (void __user *)arg;
3469 long r;
3470
3471 switch (ioctl) {
3472 case KVM_GET_MSR_INDEX_LIST: {
3473 struct kvm_msr_list __user *user_msr_list = argp;
3474 struct kvm_msr_list msr_list;
3475 unsigned n;
3476
3477 r = -EFAULT;
0e96f31e 3478 if (copy_from_user(&msr_list, user_msr_list, sizeof(msr_list)))
043405e1
CO
3479 goto out;
3480 n = msr_list.nmsrs;
62ef68bb 3481 msr_list.nmsrs = num_msrs_to_save + num_emulated_msrs;
0e96f31e 3482 if (copy_to_user(user_msr_list, &msr_list, sizeof(msr_list)))
043405e1
CO
3483 goto out;
3484 r = -E2BIG;
e125e7b6 3485 if (n < msr_list.nmsrs)
043405e1
CO
3486 goto out;
3487 r = -EFAULT;
3488 if (copy_to_user(user_msr_list->indices, &msrs_to_save,
3489 num_msrs_to_save * sizeof(u32)))
3490 goto out;
e125e7b6 3491 if (copy_to_user(user_msr_list->indices + num_msrs_to_save,
043405e1 3492 &emulated_msrs,
62ef68bb 3493 num_emulated_msrs * sizeof(u32)))
043405e1
CO
3494 goto out;
3495 r = 0;
3496 break;
3497 }
9c15bb1d
BP
3498 case KVM_GET_SUPPORTED_CPUID:
3499 case KVM_GET_EMULATED_CPUID: {
674eea0f
AK
3500 struct kvm_cpuid2 __user *cpuid_arg = argp;
3501 struct kvm_cpuid2 cpuid;
3502
3503 r = -EFAULT;
0e96f31e 3504 if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid)))
674eea0f 3505 goto out;
9c15bb1d
BP
3506
3507 r = kvm_dev_ioctl_get_cpuid(&cpuid, cpuid_arg->entries,
3508 ioctl);
674eea0f
AK
3509 if (r)
3510 goto out;
3511
3512 r = -EFAULT;
0e96f31e 3513 if (copy_to_user(cpuid_arg, &cpuid, sizeof(cpuid)))
674eea0f
AK
3514 goto out;
3515 r = 0;
3516 break;
3517 }
cf6c26ec 3518 case KVM_X86_GET_MCE_CAP_SUPPORTED:
890ca9ae 3519 r = -EFAULT;
c45dcc71
AR
3520 if (copy_to_user(argp, &kvm_mce_cap_supported,
3521 sizeof(kvm_mce_cap_supported)))
890ca9ae
HY
3522 goto out;
3523 r = 0;
3524 break;
801e459a
TL
3525 case KVM_GET_MSR_FEATURE_INDEX_LIST: {
3526 struct kvm_msr_list __user *user_msr_list = argp;
3527 struct kvm_msr_list msr_list;
3528 unsigned int n;
3529
3530 r = -EFAULT;
3531 if (copy_from_user(&msr_list, user_msr_list, sizeof(msr_list)))
3532 goto out;
3533 n = msr_list.nmsrs;
3534 msr_list.nmsrs = num_msr_based_features;
3535 if (copy_to_user(user_msr_list, &msr_list, sizeof(msr_list)))
3536 goto out;
3537 r = -E2BIG;
3538 if (n < msr_list.nmsrs)
3539 goto out;
3540 r = -EFAULT;
3541 if (copy_to_user(user_msr_list->indices, &msr_based_features,
3542 num_msr_based_features * sizeof(u32)))
3543 goto out;
3544 r = 0;
3545 break;
3546 }
3547 case KVM_GET_MSRS:
3548 r = msr_io(NULL, argp, do_get_msr_feature, 1);
3549 break;
043405e1
CO
3550 default:
3551 r = -EINVAL;
cf6c26ec 3552 break;
043405e1
CO
3553 }
3554out:
3555 return r;
3556}
3557
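/*
 * Illustrative sketch (editor's addition): the -E2BIG handling above gives
 * KVM_GET_MSR_INDEX_LIST its usual two-call pattern.  Userspace first asks
 * with nmsrs = 0, learns the real count from the updated header, then
 * retries with a large enough buffer.  kvm_fd is assumed to be /dev/kvm.
 *
 *	struct kvm_msr_list probe = { .nmsrs = 0 };
 *	ioctl(kvm_fd, KVM_GET_MSR_INDEX_LIST, &probe);	// fails with E2BIG
 *
 *	struct kvm_msr_list *list =
 *		malloc(sizeof(*list) + probe.nmsrs * sizeof(__u32));
 *	list->nmsrs = probe.nmsrs;
 *	if (!ioctl(kvm_fd, KVM_GET_MSR_INDEX_LIST, list))
 *		;	// list->indices[] now holds the saved + emulated MSRs
 */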
f5f48ee1
SY
3558static void wbinvd_ipi(void *garbage)
3559{
3560 wbinvd();
3561}
3562
3563static bool need_emulate_wbinvd(struct kvm_vcpu *vcpu)
3564{
e0f0bbc5 3565 return kvm_arch_has_noncoherent_dma(vcpu->kvm);
f5f48ee1
SY
3566}
3567
313a3dc7
CO
3568void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
3569{
f5f48ee1
SY
3570 /* Address WBINVD may be executed by guest */
3571 if (need_emulate_wbinvd(vcpu)) {
afaf0b2f 3572 if (kvm_x86_ops.has_wbinvd_exit())
f5f48ee1
SY
3573 cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask);
3574 else if (vcpu->cpu != -1 && vcpu->cpu != cpu)
3575 smp_call_function_single(vcpu->cpu,
3576 wbinvd_ipi, NULL, 1);
3577 }
3578
afaf0b2f 3579 kvm_x86_ops.vcpu_load(vcpu, cpu);
8f6055cb 3580
37486135
BM
3581 /* Save host pkru register if supported */
3582 vcpu->arch.host_pkru = read_pkru();
3583
0dd6a6ed
ZA
3584 /* Apply any externally detected TSC adjustments (due to suspend) */
3585 if (unlikely(vcpu->arch.tsc_offset_adjustment)) {
3586 adjust_tsc_offset_host(vcpu, vcpu->arch.tsc_offset_adjustment);
3587 vcpu->arch.tsc_offset_adjustment = 0;
105b21bb 3588 kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
0dd6a6ed 3589 }
8f6055cb 3590
b0c39dc6 3591 if (unlikely(vcpu->cpu != cpu) || kvm_check_tsc_unstable()) {
6f526ec5 3592 s64 tsc_delta = !vcpu->arch.last_host_tsc ? 0 :
4ea1636b 3593 rdtsc() - vcpu->arch.last_host_tsc;
e48672fa
ZA
3594 if (tsc_delta < 0)
3595 mark_tsc_unstable("KVM discovered backwards TSC");
ce7a058a 3596
b0c39dc6 3597 if (kvm_check_tsc_unstable()) {
07c1419a 3598 u64 offset = kvm_compute_tsc_offset(vcpu,
b183aa58 3599 vcpu->arch.last_guest_tsc);
a545ab6a 3600 kvm_vcpu_write_tsc_offset(vcpu, offset);
c285545f 3601 vcpu->arch.tsc_catchup = 1;
c285545f 3602 }
a749e247
PB
3603
3604 if (kvm_lapic_hv_timer_in_use(vcpu))
3605 kvm_lapic_restart_hv_timer(vcpu);
3606
d98d07ca
MT
3607 /*
3608 * On a host with synchronized TSC, there is no need to update
3609 * kvmclock on vcpu->cpu migration
3610 */
3611 if (!vcpu->kvm->arch.use_master_clock || vcpu->cpu == -1)
0061d53d 3612 kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu);
c285545f 3613 if (vcpu->cpu != cpu)
1bd2009e 3614 kvm_make_request(KVM_REQ_MIGRATE_TIMER, vcpu);
e48672fa 3615 vcpu->cpu = cpu;
6b7d7e76 3616 }
c9aaa895 3617
c9aaa895 3618 kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu);
313a3dc7
CO
3619}
3620
0b9f6c46
PX
3621static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu)
3622{
b0431382
BO
3623 struct kvm_host_map map;
3624 struct kvm_steal_time *st;
3625
0b9f6c46
PX
3626 if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
3627 return;
3628
a6bd811f 3629 if (vcpu->arch.st.preempted)
8c6de56a
BO
3630 return;
3631
b0431382
BO
3632 if (kvm_map_gfn(vcpu, vcpu->arch.st.msr_val >> PAGE_SHIFT, &map,
3633 &vcpu->arch.st.cache, true))
3634 return;
3635
3636 st = map.hva +
3637 offset_in_page(vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS);
0b9f6c46 3638
a6bd811f 3639 st->preempted = vcpu->arch.st.preempted = KVM_VCPU_PREEMPTED;
0b9f6c46 3640
b0431382 3641 kvm_unmap_gfn(vcpu, &map, &vcpu->arch.st.cache, true, true);
0b9f6c46
PX
3642}
3643
313a3dc7
CO
3644void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
3645{
cc0d907c 3646 int idx;
de63ad4c
LM
3647
3648 if (vcpu->preempted)
afaf0b2f 3649 vcpu->arch.preempted_in_kernel = !kvm_x86_ops.get_cpl(vcpu);
de63ad4c 3650
931f261b
AA
3651 /*
3652 * Disable page faults because we're in atomic context here.
3653 * kvm_write_guest_offset_cached() would call might_fault()
3654 * that relies on pagefault_disable() to tell if there's a
3655 * bug. NOTE: the write to guest memory may not go through if
3656 * during postcopy live migration or if there's heavy guest
3657 * paging.
3658 */
3659 pagefault_disable();
cc0d907c
AA
3660 /*
3661 * kvm_memslots() will be called by
3662 * kvm_write_guest_offset_cached() so take the srcu lock.
3663 */
3664 idx = srcu_read_lock(&vcpu->kvm->srcu);
0b9f6c46 3665 kvm_steal_time_set_preempted(vcpu);
cc0d907c 3666 srcu_read_unlock(&vcpu->kvm->srcu, idx);
931f261b 3667 pagefault_enable();
afaf0b2f 3668 kvm_x86_ops.vcpu_put(vcpu);
4ea1636b 3669 vcpu->arch.last_host_tsc = rdtsc();
efdab992 3670 /*
f9dcf08e
RK
3671 * If userspace has set any breakpoints or watchpoints, dr6 is restored
3672 * on every vmexit, but if not, we might have a stale dr6 from the
3673 * guest. do_debug expects dr6 to be cleared after it runs, do the same.
efdab992 3674 */
f9dcf08e 3675 set_debugreg(0, 6);
313a3dc7
CO
3676}
3677
313a3dc7
CO
3678static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
3679 struct kvm_lapic_state *s)
3680{
fa59cc00 3681 if (vcpu->arch.apicv_active)
afaf0b2f 3682 kvm_x86_ops.sync_pir_to_irr(vcpu);
d62caabb 3683
a92e2543 3684 return kvm_apic_get_state(vcpu, s);
313a3dc7
CO
3685}
3686
3687static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu,
3688 struct kvm_lapic_state *s)
3689{
a92e2543
RK
3690 int r;
3691
3692 r = kvm_apic_set_state(vcpu, s);
3693 if (r)
3694 return r;
cb142eb7 3695 update_cr8_intercept(vcpu);
313a3dc7
CO
3696
3697 return 0;
3698}
3699
127a457a
MG
3700static int kvm_cpu_accept_dm_intr(struct kvm_vcpu *vcpu)
3701{
3702 return (!lapic_in_kernel(vcpu) ||
3703 kvm_apic_accept_pic_intr(vcpu));
3704}
3705
782d422b
MG
3706/*
3707 * if userspace requested an interrupt window, check that the
3708 * interrupt window is open.
3709 *
3710 * No need to exit to userspace if we already have an interrupt queued.
3711 */
3712static int kvm_vcpu_ready_for_interrupt_injection(struct kvm_vcpu *vcpu)
3713{
3714 return kvm_arch_interrupt_allowed(vcpu) &&
3715 !kvm_cpu_has_interrupt(vcpu) &&
3716 !kvm_event_needs_reinjection(vcpu) &&
3717 kvm_cpu_accept_dm_intr(vcpu);
3718}
3719
f77bc6a4
ZX
3720static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
3721 struct kvm_interrupt *irq)
3722{
02cdb50f 3723 if (irq->irq >= KVM_NR_INTERRUPTS)
f77bc6a4 3724 return -EINVAL;
1c1a9ce9
SR
3725
3726 if (!irqchip_in_kernel(vcpu->kvm)) {
3727 kvm_queue_interrupt(vcpu, irq->irq, false);
3728 kvm_make_request(KVM_REQ_EVENT, vcpu);
3729 return 0;
3730 }
3731
3732 /*
3733 * With in-kernel LAPIC, we only use this to inject EXTINT, so
3734 * fail for in-kernel 8259.
3735 */
3736 if (pic_in_kernel(vcpu->kvm))
f77bc6a4 3737 return -ENXIO;
f77bc6a4 3738
1c1a9ce9
SR
3739 if (vcpu->arch.pending_external_vector != -1)
3740 return -EEXIST;
f77bc6a4 3741
1c1a9ce9 3742 vcpu->arch.pending_external_vector = irq->irq;
934bf653 3743 kvm_make_request(KVM_REQ_EVENT, vcpu);
f77bc6a4
ZX
3744 return 0;
3745}
3746
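/*
 * Illustrative sketch (editor's addition): KVM_INTERRUPT is the userspace
 * side of kvm_vcpu_ioctl_interrupt().  It is only usable when the
 * interrupt controller is emulated in userspace (or, with a split
 * irqchip, for the single pending EXTINT vector); with a full in-kernel
 * PIC it fails with -ENXIO as above.  vcpu_fd is assumed to be open.
 *
 *	struct kvm_interrupt irq = { .irq = 32 };	// vector to inject
 *	if (ioctl(vcpu_fd, KVM_INTERRUPT, &irq))
 *		perror("KVM_INTERRUPT");
 */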
c4abb7c9
JK
3747static int kvm_vcpu_ioctl_nmi(struct kvm_vcpu *vcpu)
3748{
c4abb7c9 3749 kvm_inject_nmi(vcpu);
c4abb7c9
JK
3750
3751 return 0;
3752}
3753
f077825a
PB
3754static int kvm_vcpu_ioctl_smi(struct kvm_vcpu *vcpu)
3755{
64d60670
PB
3756 kvm_make_request(KVM_REQ_SMI, vcpu);
3757
f077825a
PB
3758 return 0;
3759}
3760
b209749f
AK
3761static int vcpu_ioctl_tpr_access_reporting(struct kvm_vcpu *vcpu,
3762 struct kvm_tpr_access_ctl *tac)
3763{
3764 if (tac->flags)
3765 return -EINVAL;
3766 vcpu->arch.tpr_access_reporting = !!tac->enabled;
3767 return 0;
3768}
3769
890ca9ae
HY
3770static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu,
3771 u64 mcg_cap)
3772{
3773 int r;
3774 unsigned bank_num = mcg_cap & 0xff, bank;
3775
3776 r = -EINVAL;
a9e38c3e 3777 if (!bank_num || bank_num >= KVM_MAX_MCE_BANKS)
890ca9ae 3778 goto out;
c45dcc71 3779 if (mcg_cap & ~(kvm_mce_cap_supported | 0xff | 0xff0000))
890ca9ae
HY
3780 goto out;
3781 r = 0;
3782 vcpu->arch.mcg_cap = mcg_cap;
3783 /* Init IA32_MCG_CTL to all 1s */
3784 if (mcg_cap & MCG_CTL_P)
3785 vcpu->arch.mcg_ctl = ~(u64)0;
3786 /* Init IA32_MCi_CTL to all 1s */
3787 for (bank = 0; bank < bank_num; bank++)
3788 vcpu->arch.mce_banks[bank*4] = ~(u64)0;
c45dcc71 3789
afaf0b2f 3790 kvm_x86_ops.setup_mce(vcpu);
890ca9ae
HY
3791out:
3792 return r;
3793}
3794
3795static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu,
3796 struct kvm_x86_mce *mce)
3797{
3798 u64 mcg_cap = vcpu->arch.mcg_cap;
3799 unsigned bank_num = mcg_cap & 0xff;
3800 u64 *banks = vcpu->arch.mce_banks;
3801
3802 if (mce->bank >= bank_num || !(mce->status & MCI_STATUS_VAL))
3803 return -EINVAL;
3804 /*
3805 * if IA32_MCG_CTL is not all 1s, the uncorrected error
3806 * reporting is disabled
3807 */
3808 if ((mce->status & MCI_STATUS_UC) && (mcg_cap & MCG_CTL_P) &&
3809 vcpu->arch.mcg_ctl != ~(u64)0)
3810 return 0;
3811 banks += 4 * mce->bank;
3812 /*
3813 * if IA32_MCi_CTL is not all 1s, the uncorrected error
3814 * reporting is disabled for the bank
3815 */
3816 if ((mce->status & MCI_STATUS_UC) && banks[0] != ~(u64)0)
3817 return 0;
3818 if (mce->status & MCI_STATUS_UC) {
3819 if ((vcpu->arch.mcg_status & MCG_STATUS_MCIP) ||
fc78f519 3820 !kvm_read_cr4_bits(vcpu, X86_CR4_MCE)) {
a8eeb04a 3821 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
890ca9ae
HY
3822 return 0;
3823 }
3824 if (banks[1] & MCI_STATUS_VAL)
3825 mce->status |= MCI_STATUS_OVER;
3826 banks[2] = mce->addr;
3827 banks[3] = mce->misc;
3828 vcpu->arch.mcg_status = mce->mcg_status;
3829 banks[1] = mce->status;
3830 kvm_queue_exception(vcpu, MC_VECTOR);
3831 } else if (!(banks[1] & MCI_STATUS_VAL)
3832 || !(banks[1] & MCI_STATUS_UC)) {
3833 if (banks[1] & MCI_STATUS_VAL)
3834 mce->status |= MCI_STATUS_OVER;
3835 banks[2] = mce->addr;
3836 banks[3] = mce->misc;
3837 banks[1] = mce->status;
3838 } else
3839 banks[1] |= MCI_STATUS_OVER;
3840 return 0;
3841}
3842
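/*
 * Editor's note on the layout used by kvm_vcpu_ioctl_x86_set_mce() above:
 * vcpu->arch.mce_banks is a flat array of four u64s per bank, indexed as
 *
 *	banks[4 * bank + 0]	MCi_CTL    (initialized to all 1s in setup_mce)
 *	banks[4 * bank + 1]	MCi_STATUS
 *	banks[4 * bank + 2]	MCi_ADDR
 *	banks[4 * bank + 3]	MCi_MISC
 *
 * which is why the function does "banks += 4 * mce->bank" and then fills
 * banks[1..3] from the kvm_x86_mce payload.
 */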
3cfc3092
JK
3843static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
3844 struct kvm_vcpu_events *events)
3845{
7460fb4a 3846 process_nmi(vcpu);
59073aaf 3847
a06230b6
OU
3848 /*
3849 * In guest mode, payload delivery should be deferred,
3850 * so that the L1 hypervisor can intercept #PF before
3851 * CR2 is modified (or intercept #DB before DR6 is
3852 * modified under nVMX). Unless the per-VM capability,
3853 * KVM_CAP_EXCEPTION_PAYLOAD, is set, we may not defer the delivery of
3854 * an exception payload and handle after a KVM_GET_VCPU_EVENTS. Since we
3855 * opportunistically defer the exception payload, deliver it if the
3856 * capability hasn't been requested before processing a
3857 * KVM_GET_VCPU_EVENTS.
3858 */
3859 if (!vcpu->kvm->arch.exception_payload_enabled &&
3860 vcpu->arch.exception.pending && vcpu->arch.exception.has_payload)
3861 kvm_deliver_exception_payload(vcpu);
3862
664f8e26 3863 /*
59073aaf
JM
3864 * The API doesn't provide the instruction length for software
3865 * exceptions, so don't report them. As long as the guest RIP
3866 * isn't advanced, we should expect to encounter the exception
3867 * again.
664f8e26 3868 */
59073aaf
JM
3869 if (kvm_exception_is_soft(vcpu->arch.exception.nr)) {
3870 events->exception.injected = 0;
3871 events->exception.pending = 0;
3872 } else {
3873 events->exception.injected = vcpu->arch.exception.injected;
3874 events->exception.pending = vcpu->arch.exception.pending;
3875 /*
3876 * For ABI compatibility, deliberately conflate
3877 * pending and injected exceptions when
3878 * KVM_CAP_EXCEPTION_PAYLOAD isn't enabled.
3879 */
3880 if (!vcpu->kvm->arch.exception_payload_enabled)
3881 events->exception.injected |=
3882 vcpu->arch.exception.pending;
3883 }
3cfc3092
JK
3884 events->exception.nr = vcpu->arch.exception.nr;
3885 events->exception.has_error_code = vcpu->arch.exception.has_error_code;
3886 events->exception.error_code = vcpu->arch.exception.error_code;
59073aaf
JM
3887 events->exception_has_payload = vcpu->arch.exception.has_payload;
3888 events->exception_payload = vcpu->arch.exception.payload;
3cfc3092 3889
03b82a30 3890 events->interrupt.injected =
04140b41 3891 vcpu->arch.interrupt.injected && !vcpu->arch.interrupt.soft;
3cfc3092 3892 events->interrupt.nr = vcpu->arch.interrupt.nr;
03b82a30 3893 events->interrupt.soft = 0;
afaf0b2f 3894 events->interrupt.shadow = kvm_x86_ops.get_interrupt_shadow(vcpu);
3cfc3092
JK
3895
3896 events->nmi.injected = vcpu->arch.nmi_injected;
7460fb4a 3897 events->nmi.pending = vcpu->arch.nmi_pending != 0;
afaf0b2f 3898 events->nmi.masked = kvm_x86_ops.get_nmi_mask(vcpu);
97e69aa6 3899 events->nmi.pad = 0;
3cfc3092 3900
66450a21 3901 events->sipi_vector = 0; /* never valid when reporting to user space */
3cfc3092 3902
f077825a
PB
3903 events->smi.smm = is_smm(vcpu);
3904 events->smi.pending = vcpu->arch.smi_pending;
3905 events->smi.smm_inside_nmi =
3906 !!(vcpu->arch.hflags & HF_SMM_INSIDE_NMI_MASK);
3907 events->smi.latched_init = kvm_lapic_latched_init(vcpu);
3908
dab4b911 3909 events->flags = (KVM_VCPUEVENT_VALID_NMI_PENDING
f077825a
PB
3910 | KVM_VCPUEVENT_VALID_SHADOW
3911 | KVM_VCPUEVENT_VALID_SMM);
59073aaf
JM
3912 if (vcpu->kvm->arch.exception_payload_enabled)
3913 events->flags |= KVM_VCPUEVENT_VALID_PAYLOAD;
3914
97e69aa6 3915 memset(&events->reserved, 0, sizeof(events->reserved));
3cfc3092
JK
3916}
3917
c5833c7a 3918static void kvm_smm_changed(struct kvm_vcpu *vcpu);
6ef4e07e 3919
3cfc3092
JK
3920static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
3921 struct kvm_vcpu_events *events)
3922{
dab4b911 3923 if (events->flags & ~(KVM_VCPUEVENT_VALID_NMI_PENDING
48005f64 3924 | KVM_VCPUEVENT_VALID_SIPI_VECTOR
f077825a 3925 | KVM_VCPUEVENT_VALID_SHADOW
59073aaf
JM
3926 | KVM_VCPUEVENT_VALID_SMM
3927 | KVM_VCPUEVENT_VALID_PAYLOAD))
3cfc3092
JK
3928 return -EINVAL;
3929
59073aaf
JM
3930 if (events->flags & KVM_VCPUEVENT_VALID_PAYLOAD) {
3931 if (!vcpu->kvm->arch.exception_payload_enabled)
3932 return -EINVAL;
3933 if (events->exception.pending)
3934 events->exception.injected = 0;
3935 else
3936 events->exception_has_payload = 0;
3937 } else {
3938 events->exception.pending = 0;
3939 events->exception_has_payload = 0;
3940 }
3941
3942 if ((events->exception.injected || events->exception.pending) &&
3943 (events->exception.nr > 31 || events->exception.nr == NMI_VECTOR))
78e546c8
PB
3944 return -EINVAL;
3945
28bf2888
DH
3946 /* INITs are latched while in SMM */
3947 if (events->flags & KVM_VCPUEVENT_VALID_SMM &&
3948 (events->smi.smm || events->smi.pending) &&
3949 vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED)
3950 return -EINVAL;
3951
7460fb4a 3952 process_nmi(vcpu);
59073aaf
JM
3953 vcpu->arch.exception.injected = events->exception.injected;
3954 vcpu->arch.exception.pending = events->exception.pending;
3cfc3092
JK
3955 vcpu->arch.exception.nr = events->exception.nr;
3956 vcpu->arch.exception.has_error_code = events->exception.has_error_code;
3957 vcpu->arch.exception.error_code = events->exception.error_code;
59073aaf
JM
3958 vcpu->arch.exception.has_payload = events->exception_has_payload;
3959 vcpu->arch.exception.payload = events->exception_payload;
3cfc3092 3960
04140b41 3961 vcpu->arch.interrupt.injected = events->interrupt.injected;
3cfc3092
JK
3962 vcpu->arch.interrupt.nr = events->interrupt.nr;
3963 vcpu->arch.interrupt.soft = events->interrupt.soft;
48005f64 3964 if (events->flags & KVM_VCPUEVENT_VALID_SHADOW)
afaf0b2f 3965 kvm_x86_ops.set_interrupt_shadow(vcpu,
48005f64 3966 events->interrupt.shadow);
3cfc3092
JK
3967
3968 vcpu->arch.nmi_injected = events->nmi.injected;
dab4b911
JK
3969 if (events->flags & KVM_VCPUEVENT_VALID_NMI_PENDING)
3970 vcpu->arch.nmi_pending = events->nmi.pending;
afaf0b2f 3971 kvm_x86_ops.set_nmi_mask(vcpu, events->nmi.masked);
3cfc3092 3972
66450a21 3973 if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR &&
bce87cce 3974 lapic_in_kernel(vcpu))
66450a21 3975 vcpu->arch.apic->sipi_vector = events->sipi_vector;
3cfc3092 3976
f077825a 3977 if (events->flags & KVM_VCPUEVENT_VALID_SMM) {
c5833c7a
SC
3978 if (!!(vcpu->arch.hflags & HF_SMM_MASK) != events->smi.smm) {
3979 if (events->smi.smm)
3980 vcpu->arch.hflags |= HF_SMM_MASK;
3981 else
3982 vcpu->arch.hflags &= ~HF_SMM_MASK;
3983 kvm_smm_changed(vcpu);
3984 }
6ef4e07e 3985
f077825a 3986 vcpu->arch.smi_pending = events->smi.pending;
f4ef1910
WL
3987
3988 if (events->smi.smm) {
3989 if (events->smi.smm_inside_nmi)
3990 vcpu->arch.hflags |= HF_SMM_INSIDE_NMI_MASK;
f077825a 3991 else
f4ef1910 3992 vcpu->arch.hflags &= ~HF_SMM_INSIDE_NMI_MASK;
ff90afa7
LA
3993 }
3994
3995 if (lapic_in_kernel(vcpu)) {
3996 if (events->smi.latched_init)
3997 set_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events);
3998 else
3999 clear_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events);
f077825a
PB
4000 }
4001 }
4002
3842d135
AK
4003 kvm_make_request(KVM_REQ_EVENT, vcpu);
4004
3cfc3092
JK
4005 return 0;
4006}
4007
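/*
 * Illustrative sketch (editor's addition): because only the fields covered
 * by a KVM_VCPUEVENT_VALID_* flag are honoured above, userspace normally
 * round-trips the structure: read the current state, modify what it needs,
 * and write it back with the matching valid flag set, e.g. to assert a
 * pending NMI (vcpu_fd assumed open):
 *
 *	struct kvm_vcpu_events ev;
 *	ioctl(vcpu_fd, KVM_GET_VCPU_EVENTS, &ev);
 *	ev.nmi.pending = 1;
 *	ev.flags = KVM_VCPUEVENT_VALID_NMI_PENDING;
 *	ioctl(vcpu_fd, KVM_SET_VCPU_EVENTS, &ev);
 */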
a1efbe77
JK
4008static void kvm_vcpu_ioctl_x86_get_debugregs(struct kvm_vcpu *vcpu,
4009 struct kvm_debugregs *dbgregs)
4010{
73aaf249
JK
4011 unsigned long val;
4012
a1efbe77 4013 memcpy(dbgregs->db, vcpu->arch.db, sizeof(vcpu->arch.db));
16f8a6f9 4014 kvm_get_dr(vcpu, 6, &val);
73aaf249 4015 dbgregs->dr6 = val;
a1efbe77
JK
4016 dbgregs->dr7 = vcpu->arch.dr7;
4017 dbgregs->flags = 0;
97e69aa6 4018 memset(&dbgregs->reserved, 0, sizeof(dbgregs->reserved));
a1efbe77
JK
4019}
4020
4021static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu,
4022 struct kvm_debugregs *dbgregs)
4023{
4024 if (dbgregs->flags)
4025 return -EINVAL;
4026
d14bdb55
PB
4027 if (dbgregs->dr6 & ~0xffffffffull)
4028 return -EINVAL;
4029 if (dbgregs->dr7 & ~0xffffffffull)
4030 return -EINVAL;
4031
a1efbe77 4032 memcpy(vcpu->arch.db, dbgregs->db, sizeof(vcpu->arch.db));
ae561ede 4033 kvm_update_dr0123(vcpu);
a1efbe77
JK
4034 vcpu->arch.dr6 = dbgregs->dr6;
4035 vcpu->arch.dr7 = dbgregs->dr7;
9926c9fd 4036 kvm_update_dr7(vcpu);
a1efbe77 4037
a1efbe77
JK
4038 return 0;
4039}
4040
df1daba7
PB
4041#define XSTATE_COMPACTION_ENABLED (1ULL << 63)
4042
4043static void fill_xsave(u8 *dest, struct kvm_vcpu *vcpu)
4044{
b666a4b6 4045 struct xregs_state *xsave = &vcpu->arch.guest_fpu->state.xsave;
400e4b20 4046 u64 xstate_bv = xsave->header.xfeatures;
df1daba7
PB
4047 u64 valid;
4048
4049 /*
4050 * Copy legacy XSAVE area, to avoid complications with CPUID
4051 * leaves 0 and 1 in the loop below.
4052 */
4053 memcpy(dest, xsave, XSAVE_HDR_OFFSET);
4054
4055 /* Set XSTATE_BV */
00c87e9a 4056 xstate_bv &= vcpu->arch.guest_supported_xcr0 | XFEATURE_MASK_FPSSE;
df1daba7
PB
4057 *(u64 *)(dest + XSAVE_HDR_OFFSET) = xstate_bv;
4058
4059 /*
4060 * Copy each region from the possibly compacted offset to the
4061 * non-compacted offset.
4062 */
d91cab78 4063 valid = xstate_bv & ~XFEATURE_MASK_FPSSE;
df1daba7 4064 while (valid) {
abd16d68
SAS
4065 u64 xfeature_mask = valid & -valid;
4066 int xfeature_nr = fls64(xfeature_mask) - 1;
4067 void *src = get_xsave_addr(xsave, xfeature_nr);
df1daba7
PB
4068
4069 if (src) {
4070 u32 size, offset, ecx, edx;
abd16d68 4071 cpuid_count(XSTATE_CPUID, xfeature_nr,
df1daba7 4072 &size, &offset, &ecx, &edx);
abd16d68 4073 if (xfeature_nr == XFEATURE_PKRU)
38cfd5e3
PB
4074 memcpy(dest + offset, &vcpu->arch.pkru,
4075 sizeof(vcpu->arch.pkru));
4076 else
4077 memcpy(dest + offset, src, size);
4078
df1daba7
PB
4079 }
4080
abd16d68 4081 valid -= xfeature_mask;
df1daba7
PB
4082 }
4083}
4084
4085static void load_xsave(struct kvm_vcpu *vcpu, u8 *src)
4086{
b666a4b6 4087 struct xregs_state *xsave = &vcpu->arch.guest_fpu->state.xsave;
df1daba7
PB
4088 u64 xstate_bv = *(u64 *)(src + XSAVE_HDR_OFFSET);
4089 u64 valid;
4090
4091 /*
4092 * Copy legacy XSAVE area, to avoid complications with CPUID
4093 * leaves 0 and 1 in the loop below.
4094 */
4095 memcpy(xsave, src, XSAVE_HDR_OFFSET);
4096
4097 /* Set XSTATE_BV and possibly XCOMP_BV. */
400e4b20 4098 xsave->header.xfeatures = xstate_bv;
782511b0 4099 if (boot_cpu_has(X86_FEATURE_XSAVES))
3a54450b 4100 xsave->header.xcomp_bv = host_xcr0 | XSTATE_COMPACTION_ENABLED;
df1daba7
PB
4101
4102 /*
4103 * Copy each region from the non-compacted offset to the
4104 * possibly compacted offset.
4105 */
d91cab78 4106 valid = xstate_bv & ~XFEATURE_MASK_FPSSE;
df1daba7 4107 while (valid) {
abd16d68
SAS
4108 u64 xfeature_mask = valid & -valid;
4109 int xfeature_nr = fls64(xfeature_mask) - 1;
4110 void *dest = get_xsave_addr(xsave, xfeature_nr);
df1daba7
PB
4111
4112 if (dest) {
4113 u32 size, offset, ecx, edx;
abd16d68 4114 cpuid_count(XSTATE_CPUID, xfeature_nr,
df1daba7 4115 &size, &offset, &ecx, &edx);
abd16d68 4116 if (xfeature_nr == XFEATURE_PKRU)
38cfd5e3
PB
4117 memcpy(&vcpu->arch.pkru, src + offset,
4118 sizeof(vcpu->arch.pkru));
4119 else
4120 memcpy(dest, src + offset, size);
ee4100da 4121 }
df1daba7 4122
abd16d68 4123 valid -= xfeature_mask;
df1daba7
PB
4124 }
4125}
4126
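/*
 * Editor's note on fill_xsave()/load_xsave() above: the "valid" loop uses
 * the isolate-lowest-bit idiom to walk the extended features, and CPUID
 * leaf 0xD gives each feature's size and offset in the standard
 * (non-compacted) layout that the KVM_GET/SET_XSAVE ABI exposes:
 *
 *	u64 mask = valid & -valid;	// e.g. valid = 0x204 -> mask = 0x004
 *	int nr   = fls64(mask) - 1;	//                      nr   = 2 (YMM)
 *	cpuid_count(0xD, nr, &size, &offset, &ecx, &edx);
 *					// EAX = size, EBX = offset in the
 *					// non-compacted XSAVE image
 *
 * get_xsave_addr() then resolves the same feature inside the kernel's
 * possibly compacted buffer, and memcpy() bridges the two layouts.
 */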
2d5b5a66
SY
4127static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu,
4128 struct kvm_xsave *guest_xsave)
4129{
d366bf7e 4130 if (boot_cpu_has(X86_FEATURE_XSAVE)) {
df1daba7
PB
4131 memset(guest_xsave, 0, sizeof(struct kvm_xsave));
4132 fill_xsave((u8 *) guest_xsave->region, vcpu);
4344ee98 4133 } else {
2d5b5a66 4134 memcpy(guest_xsave->region,
b666a4b6 4135 &vcpu->arch.guest_fpu->state.fxsave,
c47ada30 4136 sizeof(struct fxregs_state));
2d5b5a66 4137 *(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)] =
d91cab78 4138 XFEATURE_MASK_FPSSE;
2d5b5a66
SY
4139 }
4140}
4141
a575813b
WL
4142#define XSAVE_MXCSR_OFFSET 24
4143
2d5b5a66
SY
4144static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu,
4145 struct kvm_xsave *guest_xsave)
4146{
4147 u64 xstate_bv =
4148 *(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)];
a575813b 4149 u32 mxcsr = *(u32 *)&guest_xsave->region[XSAVE_MXCSR_OFFSET / sizeof(u32)];
2d5b5a66 4150
d366bf7e 4151 if (boot_cpu_has(X86_FEATURE_XSAVE)) {
d7876f1b
PB
4152 /*
4153 * Here we allow setting states that are not present in
4154 * CPUID leaf 0xD, index 0, EDX:EAX. This is for compatibility
4155 * with old userspace.
4156 */
cfc48181 4157 if (xstate_bv & ~supported_xcr0 || mxcsr & ~mxcsr_feature_mask)
d7876f1b 4158 return -EINVAL;
df1daba7 4159 load_xsave(vcpu, (u8 *)guest_xsave->region);
d7876f1b 4160 } else {
a575813b
WL
4161 if (xstate_bv & ~XFEATURE_MASK_FPSSE ||
4162 mxcsr & ~mxcsr_feature_mask)
2d5b5a66 4163 return -EINVAL;
b666a4b6 4164 memcpy(&vcpu->arch.guest_fpu->state.fxsave,
c47ada30 4165 guest_xsave->region, sizeof(struct fxregs_state));
2d5b5a66
SY
4166 }
4167 return 0;
4168}
4169
4170static void kvm_vcpu_ioctl_x86_get_xcrs(struct kvm_vcpu *vcpu,
4171 struct kvm_xcrs *guest_xcrs)
4172{
d366bf7e 4173 if (!boot_cpu_has(X86_FEATURE_XSAVE)) {
2d5b5a66
SY
4174 guest_xcrs->nr_xcrs = 0;
4175 return;
4176 }
4177
4178 guest_xcrs->nr_xcrs = 1;
4179 guest_xcrs->flags = 0;
4180 guest_xcrs->xcrs[0].xcr = XCR_XFEATURE_ENABLED_MASK;
4181 guest_xcrs->xcrs[0].value = vcpu->arch.xcr0;
4182}
4183
4184static int kvm_vcpu_ioctl_x86_set_xcrs(struct kvm_vcpu *vcpu,
4185 struct kvm_xcrs *guest_xcrs)
4186{
4187 int i, r = 0;
4188
d366bf7e 4189 if (!boot_cpu_has(X86_FEATURE_XSAVE))
2d5b5a66
SY
4190 return -EINVAL;
4191
4192 if (guest_xcrs->nr_xcrs > KVM_MAX_XCRS || guest_xcrs->flags)
4193 return -EINVAL;
4194
4195 for (i = 0; i < guest_xcrs->nr_xcrs; i++)
4196 /* Only support XCR0 currently */
c67a04cb 4197 if (guest_xcrs->xcrs[i].xcr == XCR_XFEATURE_ENABLED_MASK) {
2d5b5a66 4198 r = __kvm_set_xcr(vcpu, XCR_XFEATURE_ENABLED_MASK,
c67a04cb 4199 guest_xcrs->xcrs[i].value);
2d5b5a66
SY
4200 break;
4201 }
4202 if (r)
4203 r = -EINVAL;
4204 return r;
4205}
4206
1c0b28c2
EM
4207/*
4208 * kvm_set_guest_paused() indicates to the guest kernel that it has been
4209 * stopped by the hypervisor. This function will be called from the host only.
4210 * EINVAL is returned when the host attempts to set the flag for a guest that
4211 * does not support pv clocks.
4212 */
4213static int kvm_set_guest_paused(struct kvm_vcpu *vcpu)
4214{
0b79459b 4215 if (!vcpu->arch.pv_time_enabled)
1c0b28c2 4216 return -EINVAL;
51d59c6b 4217 vcpu->arch.pvclock_set_guest_stopped_request = true;
1c0b28c2
EM
4218 kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
4219 return 0;
4220}
4221
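/*
 * Illustrative sketch (editor's addition): KVM_KVMCLOCK_CTRL is the ioctl
 * that reaches kvm_set_guest_paused().  A VMM that has kept a vCPU stopped
 * (e.g. across a debugger pause) can issue it per vCPU before resuming, so
 * the guest's watchdog does not misread the lost time as a soft lockup:
 *
 *	if (ioctl(vcpu_fd, KVM_KVMCLOCK_CTRL, 0))
 *		perror("KVM_KVMCLOCK_CTRL");	// -EINVAL if no pvclock
 */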
5c919412
AS
4222static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
4223 struct kvm_enable_cap *cap)
4224{
57b119da
VK
4225 int r;
4226 uint16_t vmcs_version;
4227 void __user *user_ptr;
4228
5c919412
AS
4229 if (cap->flags)
4230 return -EINVAL;
4231
4232 switch (cap->cap) {
efc479e6
RK
4233 case KVM_CAP_HYPERV_SYNIC2:
4234 if (cap->args[0])
4235 return -EINVAL;
b2869f28
GS
4236 /* fall through */
4237
5c919412 4238 case KVM_CAP_HYPERV_SYNIC:
546d87e5
WL
4239 if (!irqchip_in_kernel(vcpu->kvm))
4240 return -EINVAL;
efc479e6
RK
4241 return kvm_hv_activate_synic(vcpu, cap->cap ==
4242 KVM_CAP_HYPERV_SYNIC2);
57b119da 4243 case KVM_CAP_HYPERV_ENLIGHTENED_VMCS:
33b22172 4244 if (!kvm_x86_ops.nested_ops->enable_evmcs)
5158917c 4245 return -ENOTTY;
33b22172 4246 r = kvm_x86_ops.nested_ops->enable_evmcs(vcpu, &vmcs_version);
57b119da
VK
4247 if (!r) {
4248 user_ptr = (void __user *)(uintptr_t)cap->args[0];
4249 if (copy_to_user(user_ptr, &vmcs_version,
4250 sizeof(vmcs_version)))
4251 r = -EFAULT;
4252 }
4253 return r;
344c6c80 4254 case KVM_CAP_HYPERV_DIRECT_TLBFLUSH:
afaf0b2f 4255 if (!kvm_x86_ops.enable_direct_tlbflush)
344c6c80
TL
4256 return -ENOTTY;
4257
afaf0b2f 4258 return kvm_x86_ops.enable_direct_tlbflush(vcpu);
57b119da 4259
5c919412
AS
4260 default:
4261 return -EINVAL;
4262 }
4263}
4264
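/*
 * Illustrative sketch (editor's addition): the per-vCPU KVM_ENABLE_CAP
 * path above is how userspace turns on features like the Hyper-V
 * synthetic interrupt controller.  flags must be zero, and for
 * KVM_CAP_HYPERV_SYNIC2 args[0] must be zero as well (vcpu_fd assumed
 * open, in-kernel irqchip already created):
 *
 *	struct kvm_enable_cap cap = {
 *		.cap = KVM_CAP_HYPERV_SYNIC2,
 *	};
 *	if (ioctl(vcpu_fd, KVM_ENABLE_CAP, &cap))
 *		perror("KVM_ENABLE_CAP");
 */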
313a3dc7
CO
4265long kvm_arch_vcpu_ioctl(struct file *filp,
4266 unsigned int ioctl, unsigned long arg)
4267{
4268 struct kvm_vcpu *vcpu = filp->private_data;
4269 void __user *argp = (void __user *)arg;
4270 int r;
d1ac91d8
AK
4271 union {
4272 struct kvm_lapic_state *lapic;
4273 struct kvm_xsave *xsave;
4274 struct kvm_xcrs *xcrs;
4275 void *buffer;
4276 } u;
4277
9b062471
CD
4278 vcpu_load(vcpu);
4279
d1ac91d8 4280 u.buffer = NULL;
313a3dc7
CO
4281 switch (ioctl) {
4282 case KVM_GET_LAPIC: {
2204ae3c 4283 r = -EINVAL;
bce87cce 4284 if (!lapic_in_kernel(vcpu))
2204ae3c 4285 goto out;
254272ce
BG
4286 u.lapic = kzalloc(sizeof(struct kvm_lapic_state),
4287 GFP_KERNEL_ACCOUNT);
313a3dc7 4288
b772ff36 4289 r = -ENOMEM;
d1ac91d8 4290 if (!u.lapic)
b772ff36 4291 goto out;
d1ac91d8 4292 r = kvm_vcpu_ioctl_get_lapic(vcpu, u.lapic);
313a3dc7
CO
4293 if (r)
4294 goto out;
4295 r = -EFAULT;
d1ac91d8 4296 if (copy_to_user(argp, u.lapic, sizeof(struct kvm_lapic_state)))
313a3dc7
CO
4297 goto out;
4298 r = 0;
4299 break;
4300 }
4301 case KVM_SET_LAPIC: {
2204ae3c 4302 r = -EINVAL;
bce87cce 4303 if (!lapic_in_kernel(vcpu))
2204ae3c 4304 goto out;
ff5c2c03 4305 u.lapic = memdup_user(argp, sizeof(*u.lapic));
9b062471
CD
4306 if (IS_ERR(u.lapic)) {
4307 r = PTR_ERR(u.lapic);
4308 goto out_nofree;
4309 }
ff5c2c03 4310
d1ac91d8 4311 r = kvm_vcpu_ioctl_set_lapic(vcpu, u.lapic);
313a3dc7
CO
4312 break;
4313 }
f77bc6a4
ZX
4314 case KVM_INTERRUPT: {
4315 struct kvm_interrupt irq;
4316
4317 r = -EFAULT;
0e96f31e 4318 if (copy_from_user(&irq, argp, sizeof(irq)))
f77bc6a4
ZX
4319 goto out;
4320 r = kvm_vcpu_ioctl_interrupt(vcpu, &irq);
f77bc6a4
ZX
4321 break;
4322 }
c4abb7c9
JK
4323 case KVM_NMI: {
4324 r = kvm_vcpu_ioctl_nmi(vcpu);
c4abb7c9
JK
4325 break;
4326 }
f077825a
PB
4327 case KVM_SMI: {
4328 r = kvm_vcpu_ioctl_smi(vcpu);
4329 break;
4330 }
313a3dc7
CO
4331 case KVM_SET_CPUID: {
4332 struct kvm_cpuid __user *cpuid_arg = argp;
4333 struct kvm_cpuid cpuid;
4334
4335 r = -EFAULT;
0e96f31e 4336 if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid)))
313a3dc7
CO
4337 goto out;
4338 r = kvm_vcpu_ioctl_set_cpuid(vcpu, &cpuid, cpuid_arg->entries);
313a3dc7
CO
4339 break;
4340 }
07716717
DK
4341 case KVM_SET_CPUID2: {
4342 struct kvm_cpuid2 __user *cpuid_arg = argp;
4343 struct kvm_cpuid2 cpuid;
4344
4345 r = -EFAULT;
0e96f31e 4346 if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid)))
07716717
DK
4347 goto out;
4348 r = kvm_vcpu_ioctl_set_cpuid2(vcpu, &cpuid,
19355475 4349 cpuid_arg->entries);
07716717
DK
4350 break;
4351 }
4352 case KVM_GET_CPUID2: {
4353 struct kvm_cpuid2 __user *cpuid_arg = argp;
4354 struct kvm_cpuid2 cpuid;
4355
4356 r = -EFAULT;
0e96f31e 4357 if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid)))
07716717
DK
4358 goto out;
4359 r = kvm_vcpu_ioctl_get_cpuid2(vcpu, &cpuid,
19355475 4360 cpuid_arg->entries);
07716717
DK
4361 if (r)
4362 goto out;
4363 r = -EFAULT;
0e96f31e 4364 if (copy_to_user(cpuid_arg, &cpuid, sizeof(cpuid)))
07716717
DK
4365 goto out;
4366 r = 0;
4367 break;
4368 }
801e459a
TL
4369 case KVM_GET_MSRS: {
4370 int idx = srcu_read_lock(&vcpu->kvm->srcu);
609e36d3 4371 r = msr_io(vcpu, argp, do_get_msr, 1);
801e459a 4372 srcu_read_unlock(&vcpu->kvm->srcu, idx);
313a3dc7 4373 break;
801e459a
TL
4374 }
4375 case KVM_SET_MSRS: {
4376 int idx = srcu_read_lock(&vcpu->kvm->srcu);
313a3dc7 4377 r = msr_io(vcpu, argp, do_set_msr, 0);
801e459a 4378 srcu_read_unlock(&vcpu->kvm->srcu, idx);
313a3dc7 4379 break;
801e459a 4380 }
b209749f
AK
4381 case KVM_TPR_ACCESS_REPORTING: {
4382 struct kvm_tpr_access_ctl tac;
4383
4384 r = -EFAULT;
0e96f31e 4385 if (copy_from_user(&tac, argp, sizeof(tac)))
b209749f
AK
4386 goto out;
4387 r = vcpu_ioctl_tpr_access_reporting(vcpu, &tac);
4388 if (r)
4389 goto out;
4390 r = -EFAULT;
0e96f31e 4391 if (copy_to_user(argp, &tac, sizeof(tac)))
b209749f
AK
4392 goto out;
4393 r = 0;
4394 break;
4395 };
b93463aa
AK
4396 case KVM_SET_VAPIC_ADDR: {
4397 struct kvm_vapic_addr va;
7301d6ab 4398 int idx;
b93463aa
AK
4399
4400 r = -EINVAL;
35754c98 4401 if (!lapic_in_kernel(vcpu))
b93463aa
AK
4402 goto out;
4403 r = -EFAULT;
0e96f31e 4404 if (copy_from_user(&va, argp, sizeof(va)))
b93463aa 4405 goto out;
7301d6ab 4406 idx = srcu_read_lock(&vcpu->kvm->srcu);
fda4e2e8 4407 r = kvm_lapic_set_vapic_addr(vcpu, va.vapic_addr);
7301d6ab 4408 srcu_read_unlock(&vcpu->kvm->srcu, idx);
b93463aa
AK
4409 break;
4410 }
890ca9ae
HY
4411 case KVM_X86_SETUP_MCE: {
4412 u64 mcg_cap;
4413
4414 r = -EFAULT;
0e96f31e 4415 if (copy_from_user(&mcg_cap, argp, sizeof(mcg_cap)))
890ca9ae
HY
4416 goto out;
4417 r = kvm_vcpu_ioctl_x86_setup_mce(vcpu, mcg_cap);
4418 break;
4419 }
4420 case KVM_X86_SET_MCE: {
4421 struct kvm_x86_mce mce;
4422
4423 r = -EFAULT;
0e96f31e 4424 if (copy_from_user(&mce, argp, sizeof(mce)))
890ca9ae
HY
4425 goto out;
4426 r = kvm_vcpu_ioctl_x86_set_mce(vcpu, &mce);
4427 break;
4428 }
3cfc3092
JK
4429 case KVM_GET_VCPU_EVENTS: {
4430 struct kvm_vcpu_events events;
4431
4432 kvm_vcpu_ioctl_x86_get_vcpu_events(vcpu, &events);
4433
4434 r = -EFAULT;
4435 if (copy_to_user(argp, &events, sizeof(struct kvm_vcpu_events)))
4436 break;
4437 r = 0;
4438 break;
4439 }
4440 case KVM_SET_VCPU_EVENTS: {
4441 struct kvm_vcpu_events events;
4442
4443 r = -EFAULT;
4444 if (copy_from_user(&events, argp, sizeof(struct kvm_vcpu_events)))
4445 break;
4446
4447 r = kvm_vcpu_ioctl_x86_set_vcpu_events(vcpu, &events);
4448 break;
4449 }
a1efbe77
JK
4450 case KVM_GET_DEBUGREGS: {
4451 struct kvm_debugregs dbgregs;
4452
4453 kvm_vcpu_ioctl_x86_get_debugregs(vcpu, &dbgregs);
4454
4455 r = -EFAULT;
4456 if (copy_to_user(argp, &dbgregs,
4457 sizeof(struct kvm_debugregs)))
4458 break;
4459 r = 0;
4460 break;
4461 }
4462 case KVM_SET_DEBUGREGS: {
4463 struct kvm_debugregs dbgregs;
4464
4465 r = -EFAULT;
4466 if (copy_from_user(&dbgregs, argp,
4467 sizeof(struct kvm_debugregs)))
4468 break;
4469
4470 r = kvm_vcpu_ioctl_x86_set_debugregs(vcpu, &dbgregs);
4471 break;
4472 }
2d5b5a66 4473 case KVM_GET_XSAVE: {
254272ce 4474 u.xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL_ACCOUNT);
2d5b5a66 4475 r = -ENOMEM;
d1ac91d8 4476 if (!u.xsave)
2d5b5a66
SY
4477 break;
4478
d1ac91d8 4479 kvm_vcpu_ioctl_x86_get_xsave(vcpu, u.xsave);
2d5b5a66
SY
4480
4481 r = -EFAULT;
d1ac91d8 4482 if (copy_to_user(argp, u.xsave, sizeof(struct kvm_xsave)))
2d5b5a66
SY
4483 break;
4484 r = 0;
4485 break;
4486 }
4487 case KVM_SET_XSAVE: {
ff5c2c03 4488 u.xsave = memdup_user(argp, sizeof(*u.xsave));
9b062471
CD
4489 if (IS_ERR(u.xsave)) {
4490 r = PTR_ERR(u.xsave);
4491 goto out_nofree;
4492 }
2d5b5a66 4493
d1ac91d8 4494 r = kvm_vcpu_ioctl_x86_set_xsave(vcpu, u.xsave);
2d5b5a66
SY
4495 break;
4496 }
4497 case KVM_GET_XCRS: {
254272ce 4498 u.xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL_ACCOUNT);
2d5b5a66 4499 r = -ENOMEM;
d1ac91d8 4500 if (!u.xcrs)
2d5b5a66
SY
4501 break;
4502
d1ac91d8 4503 kvm_vcpu_ioctl_x86_get_xcrs(vcpu, u.xcrs);
2d5b5a66
SY
4504
4505 r = -EFAULT;
d1ac91d8 4506 if (copy_to_user(argp, u.xcrs,
2d5b5a66
SY
4507 sizeof(struct kvm_xcrs)))
4508 break;
4509 r = 0;
4510 break;
4511 }
4512 case KVM_SET_XCRS: {
ff5c2c03 4513 u.xcrs = memdup_user(argp, sizeof(*u.xcrs));
9b062471
CD
4514 if (IS_ERR(u.xcrs)) {
4515 r = PTR_ERR(u.xcrs);
4516 goto out_nofree;
4517 }
2d5b5a66 4518
d1ac91d8 4519 r = kvm_vcpu_ioctl_x86_set_xcrs(vcpu, u.xcrs);
2d5b5a66
SY
4520 break;
4521 }
92a1f12d
JR
4522 case KVM_SET_TSC_KHZ: {
4523 u32 user_tsc_khz;
4524
4525 r = -EINVAL;
92a1f12d
JR
4526 user_tsc_khz = (u32)arg;
4527
4528 if (user_tsc_khz >= kvm_max_guest_tsc_khz)
4529 goto out;
4530
cc578287
ZA
4531 if (user_tsc_khz == 0)
4532 user_tsc_khz = tsc_khz;
4533
381d585c
HZ
4534 if (!kvm_set_tsc_khz(vcpu, user_tsc_khz))
4535 r = 0;
92a1f12d 4536
92a1f12d
JR
4537 goto out;
4538 }
4539 case KVM_GET_TSC_KHZ: {
cc578287 4540 r = vcpu->arch.virtual_tsc_khz;
92a1f12d
JR
4541 goto out;
4542 }
1c0b28c2
EM
4543 case KVM_KVMCLOCK_CTRL: {
4544 r = kvm_set_guest_paused(vcpu);
4545 goto out;
4546 }
5c919412
AS
4547 case KVM_ENABLE_CAP: {
4548 struct kvm_enable_cap cap;
4549
4550 r = -EFAULT;
4551 if (copy_from_user(&cap, argp, sizeof(cap)))
4552 goto out;
4553 r = kvm_vcpu_ioctl_enable_cap(vcpu, &cap);
4554 break;
4555 }
8fcc4b59
JM
4556 case KVM_GET_NESTED_STATE: {
4557 struct kvm_nested_state __user *user_kvm_nested_state = argp;
4558 u32 user_data_size;
4559
4560 r = -EINVAL;
33b22172 4561 if (!kvm_x86_ops.nested_ops->get_state)
8fcc4b59
JM
4562 break;
4563
4564 BUILD_BUG_ON(sizeof(user_data_size) != sizeof(user_kvm_nested_state->size));
26b471c7 4565 r = -EFAULT;
8fcc4b59 4566 if (get_user(user_data_size, &user_kvm_nested_state->size))
26b471c7 4567 break;
8fcc4b59 4568
33b22172
PB
4569 r = kvm_x86_ops.nested_ops->get_state(vcpu, user_kvm_nested_state,
4570 user_data_size);
8fcc4b59 4571 if (r < 0)
26b471c7 4572 break;
8fcc4b59
JM
4573
4574 if (r > user_data_size) {
4575 if (put_user(r, &user_kvm_nested_state->size))
26b471c7
LA
4576 r = -EFAULT;
4577 else
4578 r = -E2BIG;
4579 break;
8fcc4b59 4580 }
26b471c7 4581
8fcc4b59
JM
4582 r = 0;
4583 break;
4584 }
4585 case KVM_SET_NESTED_STATE: {
4586 struct kvm_nested_state __user *user_kvm_nested_state = argp;
4587 struct kvm_nested_state kvm_state;
ad5996d9 4588 int idx;
8fcc4b59
JM
4589
4590 r = -EINVAL;
33b22172 4591 if (!kvm_x86_ops.nested_ops->set_state)
8fcc4b59
JM
4592 break;
4593
26b471c7 4594 r = -EFAULT;
8fcc4b59 4595 if (copy_from_user(&kvm_state, user_kvm_nested_state, sizeof(kvm_state)))
26b471c7 4596 break;
8fcc4b59 4597
26b471c7 4598 r = -EINVAL;
8fcc4b59 4599 if (kvm_state.size < sizeof(kvm_state))
26b471c7 4600 break;
8fcc4b59
JM
4601
4602 if (kvm_state.flags &
8cab6507
VK
4603 ~(KVM_STATE_NESTED_RUN_PENDING | KVM_STATE_NESTED_GUEST_MODE
4604 | KVM_STATE_NESTED_EVMCS))
26b471c7 4605 break;
8fcc4b59
JM
4606
4607 /* nested_run_pending implies guest_mode. */
8cab6507
VK
4608 if ((kvm_state.flags & KVM_STATE_NESTED_RUN_PENDING)
4609 && !(kvm_state.flags & KVM_STATE_NESTED_GUEST_MODE))
26b471c7 4610 break;
8fcc4b59 4611
ad5996d9 4612 idx = srcu_read_lock(&vcpu->kvm->srcu);
33b22172 4613 r = kvm_x86_ops.nested_ops->set_state(vcpu, user_kvm_nested_state, &kvm_state);
ad5996d9 4614 srcu_read_unlock(&vcpu->kvm->srcu, idx);
8fcc4b59
JM
4615 break;
4616 }
2bc39970
VK
4617 case KVM_GET_SUPPORTED_HV_CPUID: {
4618 struct kvm_cpuid2 __user *cpuid_arg = argp;
4619 struct kvm_cpuid2 cpuid;
4620
4621 r = -EFAULT;
4622 if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid)))
4623 goto out;
4624
4625 r = kvm_vcpu_ioctl_get_hv_cpuid(vcpu, &cpuid,
4626 cpuid_arg->entries);
4627 if (r)
4628 goto out;
4629
4630 r = -EFAULT;
4631 if (copy_to_user(cpuid_arg, &cpuid, sizeof(cpuid)))
4632 goto out;
4633 r = 0;
4634 break;
4635 }
313a3dc7
CO
4636 default:
4637 r = -EINVAL;
4638 }
4639out:
d1ac91d8 4640 kfree(u.buffer);
9b062471
CD
4641out_nofree:
4642 vcpu_put(vcpu);
313a3dc7
CO
4643 return r;
4644}
4645
1499fa80 4646vm_fault_t kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf)
5b1c1493
CO
4647{
4648 return VM_FAULT_SIGBUS;
4649}
4650
1fe779f8
CO
4651static int kvm_vm_ioctl_set_tss_addr(struct kvm *kvm, unsigned long addr)
4652{
4653 int ret;
4654
4655 if (addr > (unsigned int)(-3 * PAGE_SIZE))
951179ce 4656 return -EINVAL;
afaf0b2f 4657 ret = kvm_x86_ops.set_tss_addr(kvm, addr);
1fe779f8
CO
4658 return ret;
4659}
4660
b927a3ce
SY
4661static int kvm_vm_ioctl_set_identity_map_addr(struct kvm *kvm,
4662 u64 ident_addr)
4663{
afaf0b2f 4664 return kvm_x86_ops.set_identity_map_addr(kvm, ident_addr);
b927a3ce
SY
4665}
4666
1fe779f8 4667static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm,
bc8a3d89 4668 unsigned long kvm_nr_mmu_pages)
1fe779f8
CO
4669{
4670 if (kvm_nr_mmu_pages < KVM_MIN_ALLOC_MMU_PAGES)
4671 return -EINVAL;
4672
79fac95e 4673 mutex_lock(&kvm->slots_lock);
1fe779f8
CO
4674
4675 kvm_mmu_change_mmu_pages(kvm, kvm_nr_mmu_pages);
f05e70ac 4676 kvm->arch.n_requested_mmu_pages = kvm_nr_mmu_pages;
1fe779f8 4677
79fac95e 4678 mutex_unlock(&kvm->slots_lock);
1fe779f8
CO
4679 return 0;
4680}
4681
bc8a3d89 4682static unsigned long kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm)
1fe779f8 4683{
39de71ec 4684 return kvm->arch.n_max_mmu_pages;
1fe779f8
CO
4685}
4686
1fe779f8
CO
4687static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
4688{
90bca052 4689 struct kvm_pic *pic = kvm->arch.vpic;
1fe779f8
CO
4690 int r;
4691
4692 r = 0;
4693 switch (chip->chip_id) {
4694 case KVM_IRQCHIP_PIC_MASTER:
90bca052 4695 memcpy(&chip->chip.pic, &pic->pics[0],
1fe779f8
CO
4696 sizeof(struct kvm_pic_state));
4697 break;
4698 case KVM_IRQCHIP_PIC_SLAVE:
90bca052 4699 memcpy(&chip->chip.pic, &pic->pics[1],
1fe779f8
CO
4700 sizeof(struct kvm_pic_state));
4701 break;
4702 case KVM_IRQCHIP_IOAPIC:
33392b49 4703 kvm_get_ioapic(kvm, &chip->chip.ioapic);
1fe779f8
CO
4704 break;
4705 default:
4706 r = -EINVAL;
4707 break;
4708 }
4709 return r;
4710}
4711
4712static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
4713{
90bca052 4714 struct kvm_pic *pic = kvm->arch.vpic;
1fe779f8
CO
4715 int r;
4716
4717 r = 0;
4718 switch (chip->chip_id) {
4719 case KVM_IRQCHIP_PIC_MASTER:
90bca052
DH
4720 spin_lock(&pic->lock);
4721 memcpy(&pic->pics[0], &chip->chip.pic,
1fe779f8 4722 sizeof(struct kvm_pic_state));
90bca052 4723 spin_unlock(&pic->lock);
1fe779f8
CO
4724 break;
4725 case KVM_IRQCHIP_PIC_SLAVE:
90bca052
DH
4726 spin_lock(&pic->lock);
4727 memcpy(&pic->pics[1], &chip->chip.pic,
1fe779f8 4728 sizeof(struct kvm_pic_state));
90bca052 4729 spin_unlock(&pic->lock);
1fe779f8
CO
4730 break;
4731 case KVM_IRQCHIP_IOAPIC:
33392b49 4732 kvm_set_ioapic(kvm, &chip->chip.ioapic);
1fe779f8
CO
4733 break;
4734 default:
4735 r = -EINVAL;
4736 break;
4737 }
90bca052 4738 kvm_pic_update_irq(pic);
1fe779f8
CO
4739 return r;
4740}
4741
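/*
 * Illustrative sketch (editor's addition): KVM_GET_IRQCHIP and
 * KVM_SET_IRQCHIP feed the two helpers above.  chip_id selects which
 * piece of the in-kernel irqchip is transferred (0: PIC master, 1: PIC
 * slave, 2: IOAPIC), and the matching union member carries the state
 * (vm_fd assumed to be a VM fd with an in-kernel irqchip):
 *
 *	struct kvm_irqchip chip = { .chip_id = KVM_IRQCHIP_PIC_MASTER };
 *	if (!ioctl(vm_fd, KVM_GET_IRQCHIP, &chip))
 *		;	// chip.chip.pic now mirrors kvm->arch.vpic->pics[0]
 */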
e0f63cb9
SY
4742static int kvm_vm_ioctl_get_pit(struct kvm *kvm, struct kvm_pit_state *ps)
4743{
34f3941c
RK
4744 struct kvm_kpit_state *kps = &kvm->arch.vpit->pit_state;
4745
4746 BUILD_BUG_ON(sizeof(*ps) != sizeof(kps->channels));
4747
4748 mutex_lock(&kps->lock);
4749 memcpy(ps, &kps->channels, sizeof(*ps));
4750 mutex_unlock(&kps->lock);
2da29bcc 4751 return 0;
e0f63cb9
SY
4752}
4753
4754static int kvm_vm_ioctl_set_pit(struct kvm *kvm, struct kvm_pit_state *ps)
4755{
0185604c 4756 int i;
09edea72
RK
4757 struct kvm_pit *pit = kvm->arch.vpit;
4758
4759 mutex_lock(&pit->pit_state.lock);
34f3941c 4760 memcpy(&pit->pit_state.channels, ps, sizeof(*ps));
0185604c 4761 for (i = 0; i < 3; i++)
09edea72
RK
4762 kvm_pit_load_count(pit, i, ps->channels[i].count, 0);
4763 mutex_unlock(&pit->pit_state.lock);
2da29bcc 4764 return 0;
e9f42757
BK
4765}
4766
4767static int kvm_vm_ioctl_get_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps)
4768{
e9f42757
BK
4769 mutex_lock(&kvm->arch.vpit->pit_state.lock);
4770 memcpy(ps->channels, &kvm->arch.vpit->pit_state.channels,
4771 sizeof(ps->channels));
4772 ps->flags = kvm->arch.vpit->pit_state.flags;
4773 mutex_unlock(&kvm->arch.vpit->pit_state.lock);
97e69aa6 4774 memset(&ps->reserved, 0, sizeof(ps->reserved));
2da29bcc 4775 return 0;
e9f42757
BK
4776}
4777
4778static int kvm_vm_ioctl_set_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps)
4779{
2da29bcc 4780 int start = 0;
0185604c 4781 int i;
e9f42757 4782 u32 prev_legacy, cur_legacy;
09edea72
RK
4783 struct kvm_pit *pit = kvm->arch.vpit;
4784
4785 mutex_lock(&pit->pit_state.lock);
4786 prev_legacy = pit->pit_state.flags & KVM_PIT_FLAGS_HPET_LEGACY;
e9f42757
BK
4787 cur_legacy = ps->flags & KVM_PIT_FLAGS_HPET_LEGACY;
4788 if (!prev_legacy && cur_legacy)
4789 start = 1;
09edea72
RK
4790 memcpy(&pit->pit_state.channels, &ps->channels,
4791 sizeof(pit->pit_state.channels));
4792 pit->pit_state.flags = ps->flags;
0185604c 4793 for (i = 0; i < 3; i++)
09edea72 4794 kvm_pit_load_count(pit, i, pit->pit_state.channels[i].count,
e5e57e7a 4795 start && i == 0);
09edea72 4796 mutex_unlock(&pit->pit_state.lock);
2da29bcc 4797 return 0;
e0f63cb9
SY
4798}
4799
52d939a0
MT
4800static int kvm_vm_ioctl_reinject(struct kvm *kvm,
4801 struct kvm_reinject_control *control)
4802{
71474e2f
RK
4803 struct kvm_pit *pit = kvm->arch.vpit;
4804
71474e2f
RK
4805 /* pit->pit_state.lock was overloaded to prevent userspace from getting
4806 * an inconsistent state after running multiple KVM_REINJECT_CONTROL
4807 * ioctls in parallel. Use a separate lock if that ioctl isn't rare.
4808 */
4809 mutex_lock(&pit->pit_state.lock);
4810 kvm_pit_set_reinject(pit, control->pit_reinject);
4811 mutex_unlock(&pit->pit_state.lock);
b39c90b6 4812
52d939a0
MT
4813 return 0;
4814}
4815
0dff0846 4816void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot)
5bb064dc 4817{
88178fd4
KH
4818 /*
4819 * Flush potentially hardware-cached dirty pages to dirty_bitmap.
4820 */
afaf0b2f
SC
4821 if (kvm_x86_ops.flush_log_dirty)
4822 kvm_x86_ops.flush_log_dirty(kvm);
5bb064dc
ZX
4823}
4824
aa2fbe6d
YZ
4825int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event,
4826 bool line_status)
23d43cf9
CD
4827{
4828 if (!irqchip_in_kernel(kvm))
4829 return -ENXIO;
4830
4831 irq_event->status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID,
aa2fbe6d
YZ
4832 irq_event->irq, irq_event->level,
4833 line_status);
23d43cf9
CD
4834 return 0;
4835}
4836
e5d83c74
PB
4837int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
4838 struct kvm_enable_cap *cap)
90de4a18
NA
4839{
4840 int r;
4841
4842 if (cap->flags)
4843 return -EINVAL;
4844
4845 switch (cap->cap) {
4846 case KVM_CAP_DISABLE_QUIRKS:
4847 kvm->arch.disabled_quirks = cap->args[0];
4848 r = 0;
4849 break;
49df6397
SR
4850 case KVM_CAP_SPLIT_IRQCHIP: {
4851 mutex_lock(&kvm->lock);
b053b2ae
SR
4852 r = -EINVAL;
4853 if (cap->args[0] > MAX_NR_RESERVED_IOAPIC_PINS)
4854 goto split_irqchip_unlock;
49df6397
SR
4855 r = -EEXIST;
4856 if (irqchip_in_kernel(kvm))
4857 goto split_irqchip_unlock;
557abc40 4858 if (kvm->created_vcpus)
49df6397
SR
4859 goto split_irqchip_unlock;
4860 r = kvm_setup_empty_irq_routing(kvm);
5c0aea0e 4861 if (r)
49df6397
SR
4862 goto split_irqchip_unlock;
4863 /* Pairs with irqchip_in_kernel. */
4864 smp_wmb();
49776faf 4865 kvm->arch.irqchip_mode = KVM_IRQCHIP_SPLIT;
b053b2ae 4866 kvm->arch.nr_reserved_ioapic_pins = cap->args[0];
49df6397
SR
4867 r = 0;
4868split_irqchip_unlock:
4869 mutex_unlock(&kvm->lock);
4870 break;
4871 }
37131313
RK
4872 case KVM_CAP_X2APIC_API:
4873 r = -EINVAL;
4874 if (cap->args[0] & ~KVM_X2APIC_API_VALID_FLAGS)
4875 break;
4876
4877 if (cap->args[0] & KVM_X2APIC_API_USE_32BIT_IDS)
4878 kvm->arch.x2apic_format = true;
c519265f
RK
4879 if (cap->args[0] & KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK)
4880 kvm->arch.x2apic_broadcast_quirk_disabled = true;
37131313
RK
4881
4882 r = 0;
4883 break;
4d5422ce
WL
4884 case KVM_CAP_X86_DISABLE_EXITS:
4885 r = -EINVAL;
4886 if (cap->args[0] & ~KVM_X86_DISABLE_VALID_EXITS)
4887 break;
4888
4889 if ((cap->args[0] & KVM_X86_DISABLE_EXITS_MWAIT) &&
4890 kvm_can_mwait_in_guest())
4891 kvm->arch.mwait_in_guest = true;
766d3571 4892 if (cap->args[0] & KVM_X86_DISABLE_EXITS_HLT)
caa057a2 4893 kvm->arch.hlt_in_guest = true;
b31c114b
WL
4894 if (cap->args[0] & KVM_X86_DISABLE_EXITS_PAUSE)
4895 kvm->arch.pause_in_guest = true;
b5170063
WL
4896 if (cap->args[0] & KVM_X86_DISABLE_EXITS_CSTATE)
4897 kvm->arch.cstate_in_guest = true;
4d5422ce
WL
4898 r = 0;
4899 break;
6fbbde9a
DS
4900 case KVM_CAP_MSR_PLATFORM_INFO:
4901 kvm->arch.guest_can_read_msr_platform_info = cap->args[0];
4902 r = 0;
c4f55198
JM
4903 break;
4904 case KVM_CAP_EXCEPTION_PAYLOAD:
4905 kvm->arch.exception_payload_enabled = cap->args[0];
4906 r = 0;
6fbbde9a 4907 break;
90de4a18
NA
4908 default:
4909 r = -EINVAL;
4910 break;
4911 }
4912 return r;
4913}
4914
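/*
 * Illustrative sketch (editor's addition): the VM-level KVM_ENABLE_CAP
 * handler above is driven the same way as the per-vCPU one, just on the
 * VM fd.  For example, after probing KVM_CAP_X86_DISABLE_EXITS with
 * KVM_CHECK_EXTENSION, a VMM might disable HLT and MWAIT exits
 * (typically before any vCPUs are created; vm_fd assumed open):
 *
 *	struct kvm_enable_cap cap = {
 *		.cap = KVM_CAP_X86_DISABLE_EXITS,
 *		.args[0] = KVM_X86_DISABLE_EXITS_HLT |
 *			   KVM_X86_DISABLE_EXITS_MWAIT,
 *	};
 *	if (ioctl(vm_fd, KVM_ENABLE_CAP, &cap))
 *		perror("KVM_ENABLE_CAP");
 */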
1fe779f8
CO
4915long kvm_arch_vm_ioctl(struct file *filp,
4916 unsigned int ioctl, unsigned long arg)
4917{
4918 struct kvm *kvm = filp->private_data;
4919 void __user *argp = (void __user *)arg;
367e1319 4920 int r = -ENOTTY;
f0d66275
DH
4921 /*
4922 * This union makes it completely explicit to gcc-3.x
4923 * that these two variables' stack usage should be
4924 * combined, not added together.
4925 */
4926 union {
4927 struct kvm_pit_state ps;
e9f42757 4928 struct kvm_pit_state2 ps2;
c5ff41ce 4929 struct kvm_pit_config pit_config;
f0d66275 4930 } u;
1fe779f8
CO
4931
4932 switch (ioctl) {
4933 case KVM_SET_TSS_ADDR:
4934 r = kvm_vm_ioctl_set_tss_addr(kvm, arg);
1fe779f8 4935 break;
b927a3ce
SY
4936 case KVM_SET_IDENTITY_MAP_ADDR: {
4937 u64 ident_addr;
4938
1af1ac91
DH
4939 mutex_lock(&kvm->lock);
4940 r = -EINVAL;
4941 if (kvm->created_vcpus)
4942 goto set_identity_unlock;
b927a3ce 4943 r = -EFAULT;
0e96f31e 4944 if (copy_from_user(&ident_addr, argp, sizeof(ident_addr)))
1af1ac91 4945 goto set_identity_unlock;
b927a3ce 4946 r = kvm_vm_ioctl_set_identity_map_addr(kvm, ident_addr);
1af1ac91
DH
4947set_identity_unlock:
4948 mutex_unlock(&kvm->lock);
b927a3ce
SY
4949 break;
4950 }
1fe779f8
CO
4951 case KVM_SET_NR_MMU_PAGES:
4952 r = kvm_vm_ioctl_set_nr_mmu_pages(kvm, arg);
1fe779f8
CO
4953 break;
4954 case KVM_GET_NR_MMU_PAGES:
4955 r = kvm_vm_ioctl_get_nr_mmu_pages(kvm);
4956 break;
3ddea128 4957 case KVM_CREATE_IRQCHIP: {
3ddea128 4958 mutex_lock(&kvm->lock);
09941366 4959
3ddea128 4960 r = -EEXIST;
35e6eaa3 4961 if (irqchip_in_kernel(kvm))
3ddea128 4962 goto create_irqchip_unlock;
09941366 4963
3e515705 4964 r = -EINVAL;
557abc40 4965 if (kvm->created_vcpus)
3e515705 4966 goto create_irqchip_unlock;
09941366
RK
4967
4968 r = kvm_pic_init(kvm);
4969 if (r)
3ddea128 4970 goto create_irqchip_unlock;
09941366
RK
4971
4972 r = kvm_ioapic_init(kvm);
4973 if (r) {
09941366 4974 kvm_pic_destroy(kvm);
3ddea128 4975 goto create_irqchip_unlock;
09941366
RK
4976 }
4977
399ec807
AK
4978 r = kvm_setup_default_irq_routing(kvm);
4979 if (r) {
72bb2fcd 4980 kvm_ioapic_destroy(kvm);
09941366 4981 kvm_pic_destroy(kvm);
71ba994c 4982 goto create_irqchip_unlock;
399ec807 4983 }
49776faf 4984 /* Write kvm->irq_routing before enabling irqchip_in_kernel. */
71ba994c 4985 smp_wmb();
49776faf 4986 kvm->arch.irqchip_mode = KVM_IRQCHIP_KERNEL;
3ddea128
MT
4987 create_irqchip_unlock:
4988 mutex_unlock(&kvm->lock);
1fe779f8 4989 break;
3ddea128 4990 }
7837699f 4991 case KVM_CREATE_PIT:
c5ff41ce
JK
4992 u.pit_config.flags = KVM_PIT_SPEAKER_DUMMY;
4993 goto create_pit;
4994 case KVM_CREATE_PIT2:
4995 r = -EFAULT;
4996 if (copy_from_user(&u.pit_config, argp,
4997 sizeof(struct kvm_pit_config)))
4998 goto out;
4999 create_pit:
250715a6 5000 mutex_lock(&kvm->lock);
269e05e4
AK
5001 r = -EEXIST;
5002 if (kvm->arch.vpit)
5003 goto create_pit_unlock;
7837699f 5004 r = -ENOMEM;
c5ff41ce 5005 kvm->arch.vpit = kvm_create_pit(kvm, u.pit_config.flags);
7837699f
SY
5006 if (kvm->arch.vpit)
5007 r = 0;
269e05e4 5008 create_pit_unlock:
250715a6 5009 mutex_unlock(&kvm->lock);
7837699f 5010 break;
1fe779f8
CO
5011 case KVM_GET_IRQCHIP: {
5012 /* 0: PIC master, 1: PIC slave, 2: IOAPIC */
ff5c2c03 5013 struct kvm_irqchip *chip;
1fe779f8 5014
ff5c2c03
SL
5015 chip = memdup_user(argp, sizeof(*chip));
5016 if (IS_ERR(chip)) {
5017 r = PTR_ERR(chip);
1fe779f8 5018 goto out;
ff5c2c03
SL
5019 }
5020
1fe779f8 5021 r = -ENXIO;
826da321 5022 if (!irqchip_kernel(kvm))
f0d66275
DH
5023 goto get_irqchip_out;
5024 r = kvm_vm_ioctl_get_irqchip(kvm, chip);
1fe779f8 5025 if (r)
f0d66275 5026 goto get_irqchip_out;
1fe779f8 5027 r = -EFAULT;
0e96f31e 5028 if (copy_to_user(argp, chip, sizeof(*chip)))
f0d66275 5029 goto get_irqchip_out;
1fe779f8 5030 r = 0;
f0d66275
DH
5031 get_irqchip_out:
5032 kfree(chip);
1fe779f8
CO
5033 break;
5034 }
5035 case KVM_SET_IRQCHIP: {
5036 /* 0: PIC master, 1: PIC slave, 2: IOAPIC */
ff5c2c03 5037 struct kvm_irqchip *chip;
1fe779f8 5038
ff5c2c03
SL
5039 chip = memdup_user(argp, sizeof(*chip));
5040 if (IS_ERR(chip)) {
5041 r = PTR_ERR(chip);
1fe779f8 5042 goto out;
ff5c2c03
SL
5043 }
5044
1fe779f8 5045 r = -ENXIO;
826da321 5046 if (!irqchip_kernel(kvm))
f0d66275
DH
5047 goto set_irqchip_out;
5048 r = kvm_vm_ioctl_set_irqchip(kvm, chip);
f0d66275
DH
5049 set_irqchip_out:
5050 kfree(chip);
1fe779f8
CO
5051 break;
5052 }
e0f63cb9 5053 case KVM_GET_PIT: {
e0f63cb9 5054 r = -EFAULT;
f0d66275 5055 if (copy_from_user(&u.ps, argp, sizeof(struct kvm_pit_state)))
e0f63cb9
SY
5056 goto out;
5057 r = -ENXIO;
5058 if (!kvm->arch.vpit)
5059 goto out;
f0d66275 5060 r = kvm_vm_ioctl_get_pit(kvm, &u.ps);
e0f63cb9
SY
5061 if (r)
5062 goto out;
5063 r = -EFAULT;
f0d66275 5064 if (copy_to_user(argp, &u.ps, sizeof(struct kvm_pit_state)))
e0f63cb9
SY
5065 goto out;
5066 r = 0;
5067 break;
5068 }
5069 case KVM_SET_PIT: {
e0f63cb9 5070 r = -EFAULT;
0e96f31e 5071 if (copy_from_user(&u.ps, argp, sizeof(u.ps)))
e0f63cb9 5072 goto out;
7289fdb5 5073 mutex_lock(&kvm->lock);
e0f63cb9
SY
5074 r = -ENXIO;
5075 if (!kvm->arch.vpit)
7289fdb5 5076 goto set_pit_out;
f0d66275 5077 r = kvm_vm_ioctl_set_pit(kvm, &u.ps);
7289fdb5
SR
5078set_pit_out:
5079 mutex_unlock(&kvm->lock);
e0f63cb9
SY
5080 break;
5081 }
e9f42757
BK
5082 case KVM_GET_PIT2: {
5083 r = -ENXIO;
5084 if (!kvm->arch.vpit)
5085 goto out;
5086 r = kvm_vm_ioctl_get_pit2(kvm, &u.ps2);
5087 if (r)
5088 goto out;
5089 r = -EFAULT;
5090 if (copy_to_user(argp, &u.ps2, sizeof(u.ps2)))
5091 goto out;
5092 r = 0;
5093 break;
5094 }
5095 case KVM_SET_PIT2: {
5096 r = -EFAULT;
5097 if (copy_from_user(&u.ps2, argp, sizeof(u.ps2)))
5098 goto out;
7289fdb5 5099 mutex_lock(&kvm->lock);
e9f42757
BK
5100 r = -ENXIO;
5101 if (!kvm->arch.vpit)
7289fdb5 5102 goto set_pit2_out;
e9f42757 5103 r = kvm_vm_ioctl_set_pit2(kvm, &u.ps2);
7289fdb5
SR
5104set_pit2_out:
5105 mutex_unlock(&kvm->lock);
e9f42757
BK
5106 break;
5107 }
52d939a0
MT
5108 case KVM_REINJECT_CONTROL: {
5109 struct kvm_reinject_control control;
5110 r = -EFAULT;
5111 if (copy_from_user(&control, argp, sizeof(control)))
5112 goto out;
cad23e72
ML
5113 r = -ENXIO;
5114 if (!kvm->arch.vpit)
5115 goto out;
52d939a0 5116 r = kvm_vm_ioctl_reinject(kvm, &control);
52d939a0
MT
5117 break;
5118 }
d71ba788
PB
5119 case KVM_SET_BOOT_CPU_ID:
5120 r = 0;
5121 mutex_lock(&kvm->lock);
557abc40 5122 if (kvm->created_vcpus)
d71ba788
PB
5123 r = -EBUSY;
5124 else
5125 kvm->arch.bsp_vcpu_id = arg;
5126 mutex_unlock(&kvm->lock);
5127 break;
ffde22ac 5128 case KVM_XEN_HVM_CONFIG: {
51776043 5129 struct kvm_xen_hvm_config xhc;
ffde22ac 5130 r = -EFAULT;
51776043 5131 if (copy_from_user(&xhc, argp, sizeof(xhc)))
ffde22ac
ES
5132 goto out;
5133 r = -EINVAL;
51776043 5134 if (xhc.flags)
ffde22ac 5135 goto out;
51776043 5136 memcpy(&kvm->arch.xen_hvm_config, &xhc, sizeof(xhc));
ffde22ac
ES
5137 r = 0;
5138 break;
5139 }
afbcf7ab 5140 case KVM_SET_CLOCK: {
afbcf7ab
GC
5141 struct kvm_clock_data user_ns;
5142 u64 now_ns;
afbcf7ab
GC
5143
5144 r = -EFAULT;
5145 if (copy_from_user(&user_ns, argp, sizeof(user_ns)))
5146 goto out;
5147
5148 r = -EINVAL;
5149 if (user_ns.flags)
5150 goto out;
5151
5152 r = 0;
0bc48bea
RK
5153 /*
5154 * TODO: userspace has to take care of races with VCPU_RUN, so
5155 * kvm_gen_update_masterclock() can be cut down to locked
5156 * pvclock_update_vm_gtod_copy().
5157 */
5158 kvm_gen_update_masterclock(kvm);
e891a32e 5159 now_ns = get_kvmclock_ns(kvm);
108b249c 5160 kvm->arch.kvmclock_offset += user_ns.clock - now_ns;
0bc48bea 5161 kvm_make_all_cpus_request(kvm, KVM_REQ_CLOCK_UPDATE);
afbcf7ab
GC
5162 break;
5163 }
5164 case KVM_GET_CLOCK: {
afbcf7ab
GC
5165 struct kvm_clock_data user_ns;
5166 u64 now_ns;
5167
e891a32e 5168 now_ns = get_kvmclock_ns(kvm);
108b249c 5169 user_ns.clock = now_ns;
e3fd9a93 5170 user_ns.flags = kvm->arch.use_master_clock ? KVM_CLOCK_TSC_STABLE : 0;
97e69aa6 5171 memset(&user_ns.pad, 0, sizeof(user_ns.pad));
afbcf7ab
GC
5172
5173 r = -EFAULT;
5174 if (copy_to_user(argp, &user_ns, sizeof(user_ns)))
5175 goto out;
5176 r = 0;
5177 break;
5178 }
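A hedged userspace sketch of how the two clock ioctls above are typically paired around save/restore or migration; vm_fd and the save/restore naming are assumptions, the ioctl numbers and struct kvm_clock_data come from <linux/kvm.h>.

#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Capture the current kvmclock value, e.g. when pausing a VM for migration. */
static int save_kvmclock(int vm_fd, struct kvm_clock_data *clock)
{
	return ioctl(vm_fd, KVM_GET_CLOCK, clock);
}

/* Restore it on resume so the guest clock does not appear to jump. */
static int restore_kvmclock(int vm_fd, const struct kvm_clock_data *saved)
{
	struct kvm_clock_data data = *saved;

	data.flags = 0;	/* KVM_SET_CLOCK rejects any flags, as seen above. */
	return ioctl(vm_fd, KVM_SET_CLOCK, &data);
}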
5acc5c06
BS
5179 case KVM_MEMORY_ENCRYPT_OP: {
5180 r = -ENOTTY;
afaf0b2f
SC
5181 if (kvm_x86_ops.mem_enc_op)
5182 r = kvm_x86_ops.mem_enc_op(kvm, argp);
5acc5c06
BS
5183 break;
5184 }
69eaedee
BS
5185 case KVM_MEMORY_ENCRYPT_REG_REGION: {
5186 struct kvm_enc_region region;
5187
5188 r = -EFAULT;
5189 if (copy_from_user(&region, argp, sizeof(region)))
5190 goto out;
5191
5192 r = -ENOTTY;
afaf0b2f
SC
5193 if (kvm_x86_ops.mem_enc_reg_region)
5194 r = kvm_x86_ops.mem_enc_reg_region(kvm, &region);
69eaedee
BS
5195 break;
5196 }
5197 case KVM_MEMORY_ENCRYPT_UNREG_REGION: {
5198 struct kvm_enc_region region;
5199
5200 r = -EFAULT;
5201 if (copy_from_user(&region, argp, sizeof(region)))
5202 goto out;
5203
5204 r = -ENOTTY;
afaf0b2f
SC
5205 if (kvm_x86_ops.mem_enc_unreg_region)
5206 r = kvm_x86_ops.mem_enc_unreg_region(kvm, &region);
69eaedee
BS
5207 break;
5208 }
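For completeness, a sketch of the userspace side of KVM_MEMORY_ENCRYPT_REG_REGION; vm_fd, addr and len are assumptions, and the ioctl only succeeds on VMs whose vendor module provides mem_enc_reg_region (e.g. SEV), otherwise it fails with -ENOTTY as in the handler above.

#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Sketch: register (and thereby pin) a userspace range backing encrypted guest memory. */
static int register_encrypted_region(int vm_fd, void *addr, unsigned long len)
{
	struct kvm_enc_region region = {
		.addr = (unsigned long)addr,	/* userspace virtual address */
		.size = len,
	};

	return ioctl(vm_fd, KVM_MEMORY_ENCRYPT_REG_REGION, &region);
}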
faeb7833
RK
5209 case KVM_HYPERV_EVENTFD: {
5210 struct kvm_hyperv_eventfd hvevfd;
5211
5212 r = -EFAULT;
5213 if (copy_from_user(&hvevfd, argp, sizeof(hvevfd)))
5214 goto out;
5215 r = kvm_vm_ioctl_hv_eventfd(kvm, &hvevfd);
5216 break;
5217 }
66bb8a06
EH
5218 case KVM_SET_PMU_EVENT_FILTER:
5219 r = kvm_vm_ioctl_set_pmu_event_filter(kvm, argp);
5220 break;
1fe779f8 5221 default:
ad6260da 5222 r = -ENOTTY;
1fe779f8
CO
5223 }
5224out:
5225 return r;
5226}
5227
a16b043c 5228static void kvm_init_msr_list(void)
043405e1 5229{
24c29b7a 5230 struct x86_pmu_capability x86_pmu;
043405e1 5231 u32 dummy[2];
7a5ee6ed 5232 unsigned i;
043405e1 5233
e2ada66e 5234 BUILD_BUG_ON_MSG(INTEL_PMC_MAX_FIXED != 4,
7a5ee6ed 5235 "Please update the fixed PMCs in msrs_to_save_all[]");
24c29b7a
PB
5236
5237 perf_get_x86_pmu_capability(&x86_pmu);
e2ada66e 5238
6cbee2b9
XL
5239 num_msrs_to_save = 0;
5240 num_emulated_msrs = 0;
5241 num_msr_based_features = 0;
5242
7a5ee6ed
CQ
5243 for (i = 0; i < ARRAY_SIZE(msrs_to_save_all); i++) {
5244 if (rdmsr_safe(msrs_to_save_all[i], &dummy[0], &dummy[1]) < 0)
043405e1 5245 continue;
93c4adc7
PB
5246
5247 /*
5248 * Even MSRs that are valid in the host may not be exposed
9dbe6cf9 5249 * to the guests in some cases.
93c4adc7 5250 */
7a5ee6ed 5251 switch (msrs_to_save_all[i]) {
93c4adc7 5252 case MSR_IA32_BNDCFGS:
503234b3 5253 if (!kvm_mpx_supported())
93c4adc7
PB
5254 continue;
5255 break;
9dbe6cf9 5256 case MSR_TSC_AUX:
13908510 5257 if (!kvm_cpu_cap_has(X86_FEATURE_RDTSCP))
9dbe6cf9
PB
5258 continue;
5259 break;
bf8c55d8
CP
5260 case MSR_IA32_RTIT_CTL:
5261 case MSR_IA32_RTIT_STATUS:
7b874c26 5262 if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT))
bf8c55d8
CP
5263 continue;
5264 break;
5265 case MSR_IA32_RTIT_CR3_MATCH:
7b874c26 5266 if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT) ||
bf8c55d8
CP
5267 !intel_pt_validate_hw_cap(PT_CAP_cr3_filtering))
5268 continue;
5269 break;
5270 case MSR_IA32_RTIT_OUTPUT_BASE:
5271 case MSR_IA32_RTIT_OUTPUT_MASK:
7b874c26 5272 if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT) ||
bf8c55d8
CP
5273 (!intel_pt_validate_hw_cap(PT_CAP_topa_output) &&
5274 !intel_pt_validate_hw_cap(PT_CAP_single_range_output)))
5275 continue;
5276 break;
5277 case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B: {
7b874c26 5278 if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT) ||
7a5ee6ed 5279 msrs_to_save_all[i] - MSR_IA32_RTIT_ADDR0_A >=
bf8c55d8
CP
5280 intel_pt_validate_hw_cap(PT_CAP_num_address_ranges) * 2)
5281 continue;
5282 break;
cf05a67b 5283 case MSR_ARCH_PERFMON_PERFCTR0 ... MSR_ARCH_PERFMON_PERFCTR0 + 17:
7a5ee6ed 5284 if (msrs_to_save_all[i] - MSR_ARCH_PERFMON_PERFCTR0 >=
24c29b7a
PB
5285 min(INTEL_PMC_MAX_GENERIC, x86_pmu.num_counters_gp))
5286 continue;
5287 break;
cf05a67b 5288 case MSR_ARCH_PERFMON_EVENTSEL0 ... MSR_ARCH_PERFMON_EVENTSEL0 + 17:
7a5ee6ed 5289 if (msrs_to_save_all[i] - MSR_ARCH_PERFMON_EVENTSEL0 >=
24c29b7a
PB
5290 min(INTEL_PMC_MAX_GENERIC, x86_pmu.num_counters_gp))
5291 continue;
bf8c55d8 5292 }
93c4adc7
PB
5293 default:
5294 break;
5295 }
5296
7a5ee6ed 5297 msrs_to_save[num_msrs_to_save++] = msrs_to_save_all[i];
043405e1 5298 }
62ef68bb 5299
7a5ee6ed 5300 for (i = 0; i < ARRAY_SIZE(emulated_msrs_all); i++) {
afaf0b2f 5301 if (!kvm_x86_ops.has_emulated_msr(emulated_msrs_all[i]))
bc226f07 5302 continue;
62ef68bb 5303
7a5ee6ed 5304 emulated_msrs[num_emulated_msrs++] = emulated_msrs_all[i];
62ef68bb 5305 }
801e459a 5306
7a5ee6ed 5307 for (i = 0; i < ARRAY_SIZE(msr_based_features_all); i++) {
801e459a
TL
5308 struct kvm_msr_entry msr;
5309
7a5ee6ed 5310 msr.index = msr_based_features_all[i];
66421c1e 5311 if (kvm_get_msr_feature(&msr))
801e459a
TL
5312 continue;
5313
7a5ee6ed 5314 msr_based_features[num_msr_based_features++] = msr_based_features_all[i];
801e459a 5315 }
043405e1
CO
5316}
5317
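The lists built by kvm_init_msr_list() are what userspace sees through the KVM_GET_MSR_INDEX_LIST and KVM_GET_MSR_FEATURE_INDEX_LIST system ioctls. Below is a minimal sketch of the usual two-call pattern; kvm_fd (an open /dev/kvm) and the malloc-based sizing are assumptions.

#include <stdlib.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Sketch: fetch the MSR indices KVM can save/restore (msrs_to_save + emulated MSRs). */
static struct kvm_msr_list *get_msr_index_list(int kvm_fd)
{
	struct kvm_msr_list probe = { .nmsrs = 0 };
	struct kvm_msr_list *list;

	/* First call fails with E2BIG but fills in the required count. */
	ioctl(kvm_fd, KVM_GET_MSR_INDEX_LIST, &probe);

	list = malloc(sizeof(*list) + probe.nmsrs * sizeof(list->indices[0]));
	if (!list)
		return NULL;

	list->nmsrs = probe.nmsrs;
	if (ioctl(kvm_fd, KVM_GET_MSR_INDEX_LIST, list) < 0) {
		free(list);
		return NULL;
	}
	return list;
}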
bda9020e
MT
5318static int vcpu_mmio_write(struct kvm_vcpu *vcpu, gpa_t addr, int len,
5319 const void *v)
bbd9b64e 5320{
70252a10
AK
5321 int handled = 0;
5322 int n;
5323
5324 do {
5325 n = min(len, 8);
bce87cce 5326 if (!(lapic_in_kernel(vcpu) &&
e32edf4f
NN
5327 !kvm_iodevice_write(vcpu, &vcpu->arch.apic->dev, addr, n, v))
5328 && kvm_io_bus_write(vcpu, KVM_MMIO_BUS, addr, n, v))
70252a10
AK
5329 break;
5330 handled += n;
5331 addr += n;
5332 len -= n;
5333 v += n;
5334 } while (len);
bbd9b64e 5335
70252a10 5336 return handled;
bbd9b64e
CO
5337}
5338
bda9020e 5339static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v)
bbd9b64e 5340{
70252a10
AK
5341 int handled = 0;
5342 int n;
5343
5344 do {
5345 n = min(len, 8);
bce87cce 5346 if (!(lapic_in_kernel(vcpu) &&
e32edf4f
NN
5347 !kvm_iodevice_read(vcpu, &vcpu->arch.apic->dev,
5348 addr, n, v))
5349 && kvm_io_bus_read(vcpu, KVM_MMIO_BUS, addr, n, v))
70252a10 5350 break;
e39d200f 5351 trace_kvm_mmio(KVM_TRACE_MMIO_READ, n, addr, v);
70252a10
AK
5352 handled += n;
5353 addr += n;
5354 len -= n;
5355 v += n;
5356 } while (len);
bbd9b64e 5357
70252a10 5358 return handled;
bbd9b64e
CO
5359}
5360
2dafc6c2
GN
5361static void kvm_set_segment(struct kvm_vcpu *vcpu,
5362 struct kvm_segment *var, int seg)
5363{
afaf0b2f 5364 kvm_x86_ops.set_segment(vcpu, var, seg);
2dafc6c2
GN
5365}
5366
5367void kvm_get_segment(struct kvm_vcpu *vcpu,
5368 struct kvm_segment *var, int seg)
5369{
afaf0b2f 5370 kvm_x86_ops.get_segment(vcpu, var, seg);
2dafc6c2
GN
5371}
5372
54987b7a
PB
5373gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access,
5374 struct x86_exception *exception)
02f59dc9
JR
5375{
5376 gpa_t t_gpa;
02f59dc9
JR
5377
5378 BUG_ON(!mmu_is_nested(vcpu));
5379
5380 /* NPT walks are always user-walks */
5381 access |= PFERR_USER_MASK;
44dd3ffa 5382 t_gpa = vcpu->arch.mmu->gva_to_gpa(vcpu, gpa, access, exception);
02f59dc9
JR
5383
5384 return t_gpa;
5385}
5386
ab9ae313
AK
5387gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva,
5388 struct x86_exception *exception)
1871c602 5389{
afaf0b2f 5390 u32 access = (kvm_x86_ops.get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
ab9ae313 5391 return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
1871c602
GN
5392}
5393
ab9ae313
AK
5394 gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva,
5395 struct x86_exception *exception)
1871c602 5396{
afaf0b2f 5397 u32 access = (kvm_x86_ops.get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
1871c602 5398 access |= PFERR_FETCH_MASK;
ab9ae313 5399 return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
1871c602
GN
5400}
5401
ab9ae313
AK
5402gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva,
5403 struct x86_exception *exception)
1871c602 5404{
afaf0b2f 5405 u32 access = (kvm_x86_ops.get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
1871c602 5406 access |= PFERR_WRITE_MASK;
ab9ae313 5407 return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
1871c602
GN
5408}
5409
5410/* used to access any guest's mapped memory without checking CPL */
ab9ae313
AK
5411gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva,
5412 struct x86_exception *exception)
1871c602 5413{
ab9ae313 5414 return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, 0, exception);
1871c602
GN
5415}
5416
5417static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes,
5418 struct kvm_vcpu *vcpu, u32 access,
bcc55cba 5419 struct x86_exception *exception)
bbd9b64e
CO
5420{
5421 void *data = val;
10589a46 5422 int r = X86EMUL_CONTINUE;
bbd9b64e
CO
5423
5424 while (bytes) {
14dfe855 5425 gpa_t gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr, access,
ab9ae313 5426 exception);
bbd9b64e 5427 unsigned offset = addr & (PAGE_SIZE-1);
77c2002e 5428 unsigned toread = min(bytes, (unsigned)PAGE_SIZE - offset);
bbd9b64e
CO
5429 int ret;
5430
bcc55cba 5431 if (gpa == UNMAPPED_GVA)
ab9ae313 5432 return X86EMUL_PROPAGATE_FAULT;
54bf36aa
PB
5433 ret = kvm_vcpu_read_guest_page(vcpu, gpa >> PAGE_SHIFT, data,
5434 offset, toread);
10589a46 5435 if (ret < 0) {
c3cd7ffa 5436 r = X86EMUL_IO_NEEDED;
10589a46
MT
5437 goto out;
5438 }
bbd9b64e 5439
77c2002e
IE
5440 bytes -= toread;
5441 data += toread;
5442 addr += toread;
bbd9b64e 5443 }
10589a46 5444out:
10589a46 5445 return r;
bbd9b64e 5446}
77c2002e 5447
1871c602 5448/* used for instruction fetching */
0f65dd70
AK
5449static int kvm_fetch_guest_virt(struct x86_emulate_ctxt *ctxt,
5450 gva_t addr, void *val, unsigned int bytes,
bcc55cba 5451 struct x86_exception *exception)
1871c602 5452{
0f65dd70 5453 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
afaf0b2f 5454 u32 access = (kvm_x86_ops.get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
44583cba
PB
5455 unsigned offset;
5456 int ret;
0f65dd70 5457
44583cba
PB
5458 /* Inline kvm_read_guest_virt_helper for speed. */
5459 gpa_t gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr, access|PFERR_FETCH_MASK,
5460 exception);
5461 if (unlikely(gpa == UNMAPPED_GVA))
5462 return X86EMUL_PROPAGATE_FAULT;
5463
5464 offset = addr & (PAGE_SIZE-1);
5465 if (WARN_ON(offset + bytes > PAGE_SIZE))
5466 bytes = (unsigned)PAGE_SIZE - offset;
54bf36aa
PB
5467 ret = kvm_vcpu_read_guest_page(vcpu, gpa >> PAGE_SHIFT, val,
5468 offset, bytes);
44583cba
PB
5469 if (unlikely(ret < 0))
5470 return X86EMUL_IO_NEEDED;
5471
5472 return X86EMUL_CONTINUE;
1871c602
GN
5473}
5474
ce14e868 5475int kvm_read_guest_virt(struct kvm_vcpu *vcpu,
0f65dd70 5476 gva_t addr, void *val, unsigned int bytes,
bcc55cba 5477 struct x86_exception *exception)
1871c602 5478{
afaf0b2f 5479 u32 access = (kvm_x86_ops.get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
0f65dd70 5480
353c0956
PB
5481 /*
5482 * FIXME: this should call handle_emulation_failure if X86EMUL_IO_NEEDED
5483 * is returned, but our callers are not ready for that and they blindly
5484 * call kvm_inject_page_fault. Ensure that they at least do not leak
5485 * uninitialized kernel stack memory into cr2 and error code.
5486 */
5487 memset(exception, 0, sizeof(*exception));
1871c602 5488 return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access,
bcc55cba 5489 exception);
1871c602 5490}
064aea77 5491EXPORT_SYMBOL_GPL(kvm_read_guest_virt);
1871c602 5492
ce14e868
PB
5493static int emulator_read_std(struct x86_emulate_ctxt *ctxt,
5494 gva_t addr, void *val, unsigned int bytes,
3c9fa24c 5495 struct x86_exception *exception, bool system)
1871c602 5496{
0f65dd70 5497 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
3c9fa24c
PB
5498 u32 access = 0;
5499
afaf0b2f 5500 if (!system && kvm_x86_ops.get_cpl(vcpu) == 3)
3c9fa24c
PB
5501 access |= PFERR_USER_MASK;
5502
5503 return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access, exception);
1871c602
GN
5504}
5505
7a036a6f
RK
5506static int kvm_read_guest_phys_system(struct x86_emulate_ctxt *ctxt,
5507 unsigned long addr, void *val, unsigned int bytes)
5508{
5509 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
5510 int r = kvm_vcpu_read_guest(vcpu, addr, val, bytes);
5511
5512 return r < 0 ? X86EMUL_IO_NEEDED : X86EMUL_CONTINUE;
5513}
5514
ce14e868
PB
5515static int kvm_write_guest_virt_helper(gva_t addr, void *val, unsigned int bytes,
5516 struct kvm_vcpu *vcpu, u32 access,
5517 struct x86_exception *exception)
77c2002e
IE
5518{
5519 void *data = val;
5520 int r = X86EMUL_CONTINUE;
5521
5522 while (bytes) {
14dfe855 5523 gpa_t gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr,
ce14e868 5524 access,
ab9ae313 5525 exception);
77c2002e
IE
5526 unsigned offset = addr & (PAGE_SIZE-1);
5527 unsigned towrite = min(bytes, (unsigned)PAGE_SIZE - offset);
5528 int ret;
5529
bcc55cba 5530 if (gpa == UNMAPPED_GVA)
ab9ae313 5531 return X86EMUL_PROPAGATE_FAULT;
54bf36aa 5532 ret = kvm_vcpu_write_guest(vcpu, gpa, data, towrite);
77c2002e 5533 if (ret < 0) {
c3cd7ffa 5534 r = X86EMUL_IO_NEEDED;
77c2002e
IE
5535 goto out;
5536 }
5537
5538 bytes -= towrite;
5539 data += towrite;
5540 addr += towrite;
5541 }
5542out:
5543 return r;
5544}
ce14e868
PB
5545
5546static int emulator_write_std(struct x86_emulate_ctxt *ctxt, gva_t addr, void *val,
3c9fa24c
PB
5547 unsigned int bytes, struct x86_exception *exception,
5548 bool system)
ce14e868
PB
5549{
5550 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
3c9fa24c
PB
5551 u32 access = PFERR_WRITE_MASK;
5552
afaf0b2f 5553 if (!system && kvm_x86_ops.get_cpl(vcpu) == 3)
3c9fa24c 5554 access |= PFERR_USER_MASK;
ce14e868
PB
5555
5556 return kvm_write_guest_virt_helper(addr, val, bytes, vcpu,
3c9fa24c 5557 access, exception);
ce14e868
PB
5558}
5559
5560int kvm_write_guest_virt_system(struct kvm_vcpu *vcpu, gva_t addr, void *val,
5561 unsigned int bytes, struct x86_exception *exception)
5562{
c595ceee
PB
5563 /* kvm_write_guest_virt_system can pull in tons of pages. */
5564 vcpu->arch.l1tf_flush_l1d = true;
5565
541ab2ae
FH
5566 /*
5567 * FIXME: this should call handle_emulation_failure if X86EMUL_IO_NEEDED
5568 * is returned, but our callers are not ready for that and they blindly
5569 * call kvm_inject_page_fault. Ensure that they at least do not leak
5570 * uninitialized kernel stack memory into cr2 and error code.
5571 */
5572 memset(exception, 0, sizeof(*exception));
ce14e868
PB
5573 return kvm_write_guest_virt_helper(addr, val, bytes, vcpu,
5574 PFERR_WRITE_MASK, exception);
5575}
6a4d7550 5576EXPORT_SYMBOL_GPL(kvm_write_guest_virt_system);
77c2002e 5577
082d06ed
WL
5578int handle_ud(struct kvm_vcpu *vcpu)
5579{
b3dc0695 5580 static const char kvm_emulate_prefix[] = { __KVM_EMULATE_PREFIX };
6c86eedc 5581 int emul_type = EMULTYPE_TRAP_UD;
6c86eedc
WL
5582 char sig[5]; /* ud2; .ascii "kvm" */
5583 struct x86_exception e;
5584
5585 if (force_emulation_prefix &&
3c9fa24c
PB
5586 kvm_read_guest_virt(vcpu, kvm_get_linear_rip(vcpu),
5587 sig, sizeof(sig), &e) == 0 &&
b3dc0695 5588 memcmp(sig, kvm_emulate_prefix, sizeof(sig)) == 0) {
6c86eedc 5589 kvm_rip_write(vcpu, kvm_rip_read(vcpu) + sizeof(sig));
b4000606 5590 emul_type = EMULTYPE_TRAP_UD_FORCED;
6c86eedc 5591 }
082d06ed 5592
60fc3d02 5593 return kvm_emulate_instruction(vcpu, emul_type);
082d06ed
WL
5594}
5595EXPORT_SYMBOL_GPL(handle_ud);
5596
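To illustrate the signature checked by handle_ud(): when the kvm.force_emulation_prefix module parameter is set, guest code can prepend a 5-byte "ud2" plus 'k','v','m' sequence to force the following instruction through the emulator. The macro and wrapper below are an assumed guest-side sketch (similar in spirit to what KVM selftests do), not part of this file.

/* Assumed guest-side helper; requires kvm.force_emulation_prefix=1 on the host. */
#define FORCED_EMULATION_PREFIX	"ud2; .byte 'k', 'v', 'm'; "

static inline unsigned int forced_emulated_cpuid_eax(unsigned int leaf)
{
	unsigned int eax = leaf, ebx, ecx = 0, edx;

	/* The ud2+"kvm" signature makes handle_ud() emulate the CPUID below. */
	asm volatile(FORCED_EMULATION_PREFIX "cpuid"
		     : "+a" (eax), "=b" (ebx), "+c" (ecx), "=d" (edx));
	return eax;
}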
0f89b207
TL
5597static int vcpu_is_mmio_gpa(struct kvm_vcpu *vcpu, unsigned long gva,
5598 gpa_t gpa, bool write)
5599{
5600 /* For APIC access vmexit */
5601 if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
5602 return 1;
5603
5604 if (vcpu_match_mmio_gpa(vcpu, gpa)) {
5605 trace_vcpu_match_mmio(gva, gpa, write, true);
5606 return 1;
5607 }
5608
5609 return 0;
5610}
5611
af7cc7d1
XG
5612static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva,
5613 gpa_t *gpa, struct x86_exception *exception,
5614 bool write)
5615{
afaf0b2f 5616 u32 access = ((kvm_x86_ops.get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0)
97d64b78 5617 | (write ? PFERR_WRITE_MASK : 0);
af7cc7d1 5618
be94f6b7
HH
5619 /*
5620 * Currently PKRU is only applied to EPT-enabled guests, so there
5621 * is no pkey in the EPT page table for an L1 guest or in the EPT
5622 * shadow page table for an L2 guest.
5623 */
97d64b78 5624 if (vcpu_match_mmio_gva(vcpu, gva)
97ec8c06 5625 && !permission_fault(vcpu, vcpu->arch.walk_mmu,
871bd034 5626 vcpu->arch.mmio_access, 0, access)) {
bebb106a
XG
5627 *gpa = vcpu->arch.mmio_gfn << PAGE_SHIFT |
5628 (gva & (PAGE_SIZE - 1));
4f022648 5629 trace_vcpu_match_mmio(gva, *gpa, write, false);
bebb106a
XG
5630 return 1;
5631 }
5632
af7cc7d1
XG
5633 *gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
5634
5635 if (*gpa == UNMAPPED_GVA)
5636 return -1;
5637
0f89b207 5638 return vcpu_is_mmio_gpa(vcpu, gva, *gpa, write);
af7cc7d1
XG
5639}
5640
3200f405 5641int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
bcc55cba 5642 const void *val, int bytes)
bbd9b64e
CO
5643{
5644 int ret;
5645
54bf36aa 5646 ret = kvm_vcpu_write_guest(vcpu, gpa, val, bytes);
9f811285 5647 if (ret < 0)
bbd9b64e 5648 return 0;
0eb05bf2 5649 kvm_page_track_write(vcpu, gpa, val, bytes);
bbd9b64e
CO
5650 return 1;
5651}
5652
77d197b2
XG
5653struct read_write_emulator_ops {
5654 int (*read_write_prepare)(struct kvm_vcpu *vcpu, void *val,
5655 int bytes);
5656 int (*read_write_emulate)(struct kvm_vcpu *vcpu, gpa_t gpa,
5657 void *val, int bytes);
5658 int (*read_write_mmio)(struct kvm_vcpu *vcpu, gpa_t gpa,
5659 int bytes, void *val);
5660 int (*read_write_exit_mmio)(struct kvm_vcpu *vcpu, gpa_t gpa,
5661 void *val, int bytes);
5662 bool write;
5663};
5664
5665static int read_prepare(struct kvm_vcpu *vcpu, void *val, int bytes)
5666{
5667 if (vcpu->mmio_read_completed) {
77d197b2 5668 trace_kvm_mmio(KVM_TRACE_MMIO_READ, bytes,
e39d200f 5669 vcpu->mmio_fragments[0].gpa, val);
77d197b2
XG
5670 vcpu->mmio_read_completed = 0;
5671 return 1;
5672 }
5673
5674 return 0;
5675}
5676
5677static int read_emulate(struct kvm_vcpu *vcpu, gpa_t gpa,
5678 void *val, int bytes)
5679{
54bf36aa 5680 return !kvm_vcpu_read_guest(vcpu, gpa, val, bytes);
77d197b2
XG
5681}
5682
5683static int write_emulate(struct kvm_vcpu *vcpu, gpa_t gpa,
5684 void *val, int bytes)
5685{
5686 return emulator_write_phys(vcpu, gpa, val, bytes);
5687}
5688
5689static int write_mmio(struct kvm_vcpu *vcpu, gpa_t gpa, int bytes, void *val)
5690{
e39d200f 5691 trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, val);
77d197b2
XG
5692 return vcpu_mmio_write(vcpu, gpa, bytes, val);
5693}
5694
5695static int read_exit_mmio(struct kvm_vcpu *vcpu, gpa_t gpa,
5696 void *val, int bytes)
5697{
e39d200f 5698 trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, bytes, gpa, NULL);
77d197b2
XG
5699 return X86EMUL_IO_NEEDED;
5700}
5701
5702static int write_exit_mmio(struct kvm_vcpu *vcpu, gpa_t gpa,
5703 void *val, int bytes)
5704{
f78146b0
AK
5705 struct kvm_mmio_fragment *frag = &vcpu->mmio_fragments[0];
5706
87da7e66 5707 memcpy(vcpu->run->mmio.data, frag->data, min(8u, frag->len));
77d197b2
XG
5708 return X86EMUL_CONTINUE;
5709}
5710
0fbe9b0b 5711static const struct read_write_emulator_ops read_emultor = {
77d197b2
XG
5712 .read_write_prepare = read_prepare,
5713 .read_write_emulate = read_emulate,
5714 .read_write_mmio = vcpu_mmio_read,
5715 .read_write_exit_mmio = read_exit_mmio,
5716};
5717
0fbe9b0b 5718static const struct read_write_emulator_ops write_emultor = {
77d197b2
XG
5719 .read_write_emulate = write_emulate,
5720 .read_write_mmio = write_mmio,
5721 .read_write_exit_mmio = write_exit_mmio,
5722 .write = true,
5723};
5724
22388a3c
XG
5725static int emulator_read_write_onepage(unsigned long addr, void *val,
5726 unsigned int bytes,
5727 struct x86_exception *exception,
5728 struct kvm_vcpu *vcpu,
0fbe9b0b 5729 const struct read_write_emulator_ops *ops)
bbd9b64e 5730{
af7cc7d1
XG
5731 gpa_t gpa;
5732 int handled, ret;
22388a3c 5733 bool write = ops->write;
f78146b0 5734 struct kvm_mmio_fragment *frag;
c9b8b07c 5735 struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
0f89b207
TL
5736
5737 /*
5738 * If the exit was due to an NPF we may already have a GPA.
5739 * If the GPA is present, use it to avoid the GVA-to-GPA table walk.
5740 * Note, this cannot be used on string operations, since a string
5741 * operation using REP will only have the initial GPA from the NPF
5742 * that occurred.
5743 */
744e699c
SC
5744 if (ctxt->gpa_available && emulator_can_use_gpa(ctxt) &&
5745 (addr & ~PAGE_MASK) == (ctxt->gpa_val & ~PAGE_MASK)) {
5746 gpa = ctxt->gpa_val;
618232e2
BS
5747 ret = vcpu_is_mmio_gpa(vcpu, addr, gpa, write);
5748 } else {
5749 ret = vcpu_mmio_gva_to_gpa(vcpu, addr, &gpa, exception, write);
5750 if (ret < 0)
5751 return X86EMUL_PROPAGATE_FAULT;
0f89b207 5752 }
10589a46 5753
618232e2 5754 if (!ret && ops->read_write_emulate(vcpu, gpa, val, bytes))
bbd9b64e
CO
5755 return X86EMUL_CONTINUE;
5756
bbd9b64e
CO
5757 /*
5758 * Is this MMIO handled locally?
5759 */
22388a3c 5760 handled = ops->read_write_mmio(vcpu, gpa, bytes, val);
70252a10 5761 if (handled == bytes)
bbd9b64e 5762 return X86EMUL_CONTINUE;
bbd9b64e 5763
70252a10
AK
5764 gpa += handled;
5765 bytes -= handled;
5766 val += handled;
5767
87da7e66
XG
5768 WARN_ON(vcpu->mmio_nr_fragments >= KVM_MAX_MMIO_FRAGMENTS);
5769 frag = &vcpu->mmio_fragments[vcpu->mmio_nr_fragments++];
5770 frag->gpa = gpa;
5771 frag->data = val;
5772 frag->len = bytes;
f78146b0 5773 return X86EMUL_CONTINUE;
bbd9b64e
CO
5774}
5775
52eb5a6d
XL
5776static int emulator_read_write(struct x86_emulate_ctxt *ctxt,
5777 unsigned long addr,
22388a3c
XG
5778 void *val, unsigned int bytes,
5779 struct x86_exception *exception,
0fbe9b0b 5780 const struct read_write_emulator_ops *ops)
bbd9b64e 5781{
0f65dd70 5782 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
f78146b0
AK
5783 gpa_t gpa;
5784 int rc;
5785
5786 if (ops->read_write_prepare &&
5787 ops->read_write_prepare(vcpu, val, bytes))
5788 return X86EMUL_CONTINUE;
5789
5790 vcpu->mmio_nr_fragments = 0;
0f65dd70 5791
bbd9b64e
CO
5792 /* Crossing a page boundary? */
5793 if (((addr + bytes - 1) ^ addr) & PAGE_MASK) {
f78146b0 5794 int now;
bbd9b64e
CO
5795
5796 now = -addr & ~PAGE_MASK;
22388a3c
XG
5797 rc = emulator_read_write_onepage(addr, val, now, exception,
5798 vcpu, ops);
5799
bbd9b64e
CO
5800 if (rc != X86EMUL_CONTINUE)
5801 return rc;
5802 addr += now;
bac15531
NA
5803 if (ctxt->mode != X86EMUL_MODE_PROT64)
5804 addr = (u32)addr;
bbd9b64e
CO
5805 val += now;
5806 bytes -= now;
5807 }
22388a3c 5808
f78146b0
AK
5809 rc = emulator_read_write_onepage(addr, val, bytes, exception,
5810 vcpu, ops);
5811 if (rc != X86EMUL_CONTINUE)
5812 return rc;
5813
5814 if (!vcpu->mmio_nr_fragments)
5815 return rc;
5816
5817 gpa = vcpu->mmio_fragments[0].gpa;
5818
5819 vcpu->mmio_needed = 1;
5820 vcpu->mmio_cur_fragment = 0;
5821
87da7e66 5822 vcpu->run->mmio.len = min(8u, vcpu->mmio_fragments[0].len);
f78146b0
AK
5823 vcpu->run->mmio.is_write = vcpu->mmio_is_write = ops->write;
5824 vcpu->run->exit_reason = KVM_EXIT_MMIO;
5825 vcpu->run->mmio.phys_addr = gpa;
5826
5827 return ops->read_write_exit_mmio(vcpu, gpa, val, bytes);
22388a3c
XG
5828}
5829
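The exit state filled in above (exit_reason, mmio.phys_addr, mmio.data, mmio.len, mmio.is_write) is consumed by userspace roughly as sketched below; "run" is the vCPU's mmap'ed kvm_run page, and the two device-model callbacks are hypothetical names, not KVM APIs. KVM delivers one fragment per exit, and userspace simply re-enters KVM_RUN until all fragments are complete.

#include <linux/kvm.h>

/* Hypothetical device-model callbacks, not part of KVM. */
void device_mmio_write(__u64 gpa, const void *data, __u32 len);
void device_mmio_read(__u64 gpa, void *data, __u32 len);

static void handle_mmio_exit(struct kvm_run *run)
{
	if (run->exit_reason != KVM_EXIT_MMIO)
		return;

	if (run->mmio.is_write)
		device_mmio_write(run->mmio.phys_addr, run->mmio.data,
				  run->mmio.len);
	else
		/* Data placed here is copied into the guest on the next KVM_RUN. */
		device_mmio_read(run->mmio.phys_addr, run->mmio.data,
				 run->mmio.len);
}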
5830static int emulator_read_emulated(struct x86_emulate_ctxt *ctxt,
5831 unsigned long addr,
5832 void *val,
5833 unsigned int bytes,
5834 struct x86_exception *exception)
5835{
5836 return emulator_read_write(ctxt, addr, val, bytes,
5837 exception, &read_emultor);
5838}
5839
52eb5a6d 5840static int emulator_write_emulated(struct x86_emulate_ctxt *ctxt,
22388a3c
XG
5841 unsigned long addr,
5842 const void *val,
5843 unsigned int bytes,
5844 struct x86_exception *exception)
5845{
5846 return emulator_read_write(ctxt, addr, (void *)val, bytes,
5847 exception, &write_emultor);
bbd9b64e 5848}
bbd9b64e 5849
daea3e73
AK
5850#define CMPXCHG_TYPE(t, ptr, old, new) \
5851 (cmpxchg((t *)(ptr), *(t *)(old), *(t *)(new)) == *(t *)(old))
5852
5853#ifdef CONFIG_X86_64
5854# define CMPXCHG64(ptr, old, new) CMPXCHG_TYPE(u64, ptr, old, new)
5855#else
5856# define CMPXCHG64(ptr, old, new) \
9749a6c0 5857 (cmpxchg64((u64 *)(ptr), *(u64 *)(old), *(u64 *)(new)) == *(u64 *)(old))
daea3e73
AK
5858#endif
5859
0f65dd70
AK
5860static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt,
5861 unsigned long addr,
bbd9b64e
CO
5862 const void *old,
5863 const void *new,
5864 unsigned int bytes,
0f65dd70 5865 struct x86_exception *exception)
bbd9b64e 5866{
42e35f80 5867 struct kvm_host_map map;
0f65dd70 5868 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
9de6fe3c 5869 u64 page_line_mask;
daea3e73 5870 gpa_t gpa;
daea3e73
AK
5871 char *kaddr;
5872 bool exchanged;
2bacc55c 5873
daea3e73
AK
5874 /* a guest's cmpxchg8b has to be emulated atomically */
5875 if (bytes > 8 || (bytes & (bytes - 1)))
5876 goto emul_write;
10589a46 5877
daea3e73 5878 gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, NULL);
2bacc55c 5879
daea3e73
AK
5880 if (gpa == UNMAPPED_GVA ||
5881 (gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
5882 goto emul_write;
2bacc55c 5883
9de6fe3c
XL
5884 /*
5885 * Emulate the atomic as a straight write to avoid #AC if SLD is
5886 * enabled in the host and the access splits a cache line.
5887 */
5888 if (boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT))
5889 page_line_mask = ~(cache_line_size() - 1);
5890 else
5891 page_line_mask = PAGE_MASK;
5892
5893 if (((gpa + bytes - 1) & page_line_mask) != (gpa & page_line_mask))
daea3e73 5894 goto emul_write;
72dc67a6 5895
42e35f80 5896 if (kvm_vcpu_map(vcpu, gpa_to_gfn(gpa), &map))
c19b8bd6 5897 goto emul_write;
72dc67a6 5898
42e35f80
KA
5899 kaddr = map.hva + offset_in_page(gpa);
5900
daea3e73
AK
5901 switch (bytes) {
5902 case 1:
5903 exchanged = CMPXCHG_TYPE(u8, kaddr, old, new);
5904 break;
5905 case 2:
5906 exchanged = CMPXCHG_TYPE(u16, kaddr, old, new);
5907 break;
5908 case 4:
5909 exchanged = CMPXCHG_TYPE(u32, kaddr, old, new);
5910 break;
5911 case 8:
5912 exchanged = CMPXCHG64(kaddr, old, new);
5913 break;
5914 default:
5915 BUG();
2bacc55c 5916 }
42e35f80
KA
5917
5918 kvm_vcpu_unmap(vcpu, &map, true);
daea3e73
AK
5919
5920 if (!exchanged)
5921 return X86EMUL_CMPXCHG_FAILED;
5922
0eb05bf2 5923 kvm_page_track_write(vcpu, gpa, new, bytes);
8f6abd06
GN
5924
5925 return X86EMUL_CONTINUE;
4a5f48f6 5926
3200f405 5927emul_write:
daea3e73 5928 printk_once(KERN_WARNING "kvm: emulating exchange as write\n");
2bacc55c 5929
0f65dd70 5930 return emulator_write_emulated(ctxt, addr, new, bytes, exception);
bbd9b64e
CO
5931}
5932
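A stand-alone restatement of the split check used in emulator_cmpxchg_emulated(), with a 64-byte cache line assumed for the worked example: an 8-byte cmpxchg at gpa 0x103c touches 0x103c-0x1043 and therefore straddles the 0x1000/0x1040 line boundary, so the emulator falls back to the plain-write path; the same access at 0x1040 stays on one line.

#include <stdbool.h>
#include <stdint.h>

/* Sketch of the page_line_mask comparison above; line_size is assumed. */
static bool crosses_line(uint64_t gpa, unsigned int bytes, unsigned int line_size)
{
	uint64_t mask = ~((uint64_t)line_size - 1);

	return ((gpa + bytes - 1) & mask) != (gpa & mask);
}

/* crosses_line(0x103c, 8, 64) == true, crosses_line(0x1040, 8, 64) == false */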
cf8f70bf
GN
5933static int kernel_pio(struct kvm_vcpu *vcpu, void *pd)
5934{
cbfc6c91 5935 int r = 0, i;
cf8f70bf 5936
cbfc6c91
WL
5937 for (i = 0; i < vcpu->arch.pio.count; i++) {
5938 if (vcpu->arch.pio.in)
5939 r = kvm_io_bus_read(vcpu, KVM_PIO_BUS, vcpu->arch.pio.port,
5940 vcpu->arch.pio.size, pd);
5941 else
5942 r = kvm_io_bus_write(vcpu, KVM_PIO_BUS,
5943 vcpu->arch.pio.port, vcpu->arch.pio.size,
5944 pd);
5945 if (r)
5946 break;
5947 pd += vcpu->arch.pio.size;
5948 }
cf8f70bf
GN
5949 return r;
5950}
5951
6f6fbe98
XG
5952static int emulator_pio_in_out(struct kvm_vcpu *vcpu, int size,
5953 unsigned short port, void *val,
5954 unsigned int count, bool in)
cf8f70bf 5955{
cf8f70bf 5956 vcpu->arch.pio.port = port;
6f6fbe98 5957 vcpu->arch.pio.in = in;
7972995b 5958 vcpu->arch.pio.count = count;
cf8f70bf
GN
5959 vcpu->arch.pio.size = size;
5960
5961 if (!kernel_pio(vcpu, vcpu->arch.pio_data)) {
7972995b 5962 vcpu->arch.pio.count = 0;
cf8f70bf
GN
5963 return 1;
5964 }
5965
5966 vcpu->run->exit_reason = KVM_EXIT_IO;
6f6fbe98 5967 vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
cf8f70bf
GN
5968 vcpu->run->io.size = size;
5969 vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
5970 vcpu->run->io.count = count;
5971 vcpu->run->io.port = port;
5972
5973 return 0;
5974}
5975
2e3bb4d8
SC
5976static int emulator_pio_in(struct kvm_vcpu *vcpu, int size,
5977 unsigned short port, void *val, unsigned int count)
cf8f70bf 5978{
6f6fbe98 5979 int ret;
ca1d4a9e 5980
6f6fbe98
XG
5981 if (vcpu->arch.pio.count)
5982 goto data_avail;
cf8f70bf 5983
cbfc6c91
WL
5984 memset(vcpu->arch.pio_data, 0, size * count);
5985
6f6fbe98
XG
5986 ret = emulator_pio_in_out(vcpu, size, port, val, count, true);
5987 if (ret) {
5988data_avail:
5989 memcpy(val, vcpu->arch.pio_data, size * count);
1171903d 5990 trace_kvm_pio(KVM_PIO_IN, port, size, count, vcpu->arch.pio_data);
7972995b 5991 vcpu->arch.pio.count = 0;
cf8f70bf
GN
5992 return 1;
5993 }
5994
cf8f70bf
GN
5995 return 0;
5996}
5997
2e3bb4d8
SC
5998static int emulator_pio_in_emulated(struct x86_emulate_ctxt *ctxt,
5999 int size, unsigned short port, void *val,
6000 unsigned int count)
6f6fbe98 6001{
2e3bb4d8 6002 return emulator_pio_in(emul_to_vcpu(ctxt), size, port, val, count);
6f6fbe98 6003
2e3bb4d8 6004}
6f6fbe98 6005
2e3bb4d8
SC
6006static int emulator_pio_out(struct kvm_vcpu *vcpu, int size,
6007 unsigned short port, const void *val,
6008 unsigned int count)
6009{
6f6fbe98 6010 memcpy(vcpu->arch.pio_data, val, size * count);
1171903d 6011 trace_kvm_pio(KVM_PIO_OUT, port, size, count, vcpu->arch.pio_data);
6f6fbe98
XG
6012 return emulator_pio_in_out(vcpu, size, port, (void *)val, count, false);
6013}
6014
2e3bb4d8
SC
6015static int emulator_pio_out_emulated(struct x86_emulate_ctxt *ctxt,
6016 int size, unsigned short port,
6017 const void *val, unsigned int count)
6018{
6019 return emulator_pio_out(emul_to_vcpu(ctxt), size, port, val, count);
6020}
6021
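A hedged sketch of the userspace side of the KVM_EXIT_IO exit that emulator_pio_in_out() prepares; "run" is the mmap'ed kvm_run, the IN data lives at run + run->io.data_offset with one element per repetition, and device_inb() is a hypothetical device-model callback.

#include <stdint.h>
#include <linux/kvm.h>

/* Hypothetical port-read callback provided by the device model. */
uint8_t device_inb(uint16_t port);

static void handle_pio_in_byte(struct kvm_run *run)
{
	uint8_t *data = (uint8_t *)run + run->io.data_offset;
	uint32_t i;

	if (run->exit_reason != KVM_EXIT_IO ||
	    run->io.direction != KVM_EXIT_IO_IN || run->io.size != 1)
		return;

	/* One element per repetition for string (REP INS) accesses. */
	for (i = 0; i < run->io.count; i++)
		data[i] = device_inb(run->io.port);
	/* The next KVM_RUN completes the instruction with this data. */
}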
bbd9b64e
CO
6022static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg)
6023{
afaf0b2f 6024 return kvm_x86_ops.get_segment_base(vcpu, seg);
bbd9b64e
CO
6025}
6026
3cb16fe7 6027static void emulator_invlpg(struct x86_emulate_ctxt *ctxt, ulong address)
bbd9b64e 6028{
3cb16fe7 6029 kvm_mmu_invlpg(emul_to_vcpu(ctxt), address);
bbd9b64e
CO
6030}
6031
ae6a2375 6032static int kvm_emulate_wbinvd_noskip(struct kvm_vcpu *vcpu)
f5f48ee1
SY
6033{
6034 if (!need_emulate_wbinvd(vcpu))
6035 return X86EMUL_CONTINUE;
6036
afaf0b2f 6037 if (kvm_x86_ops.has_wbinvd_exit()) {
2eec7343
JK
6038 int cpu = get_cpu();
6039
6040 cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask);
f5f48ee1
SY
6041 smp_call_function_many(vcpu->arch.wbinvd_dirty_mask,
6042 wbinvd_ipi, NULL, 1);
2eec7343 6043 put_cpu();
f5f48ee1 6044 cpumask_clear(vcpu->arch.wbinvd_dirty_mask);
2eec7343
JK
6045 } else
6046 wbinvd();
f5f48ee1
SY
6047 return X86EMUL_CONTINUE;
6048}
5cb56059
JS
6049
6050int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu)
6051{
6affcbed
KH
6052 kvm_emulate_wbinvd_noskip(vcpu);
6053 return kvm_skip_emulated_instruction(vcpu);
5cb56059 6054}
f5f48ee1
SY
6055EXPORT_SYMBOL_GPL(kvm_emulate_wbinvd);
6056
5cb56059
JS
6057
6058
bcaf5cc5
AK
6059static void emulator_wbinvd(struct x86_emulate_ctxt *ctxt)
6060{
5cb56059 6061 kvm_emulate_wbinvd_noskip(emul_to_vcpu(ctxt));
bcaf5cc5
AK
6062}
6063
52eb5a6d
XL
6064static int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr,
6065 unsigned long *dest)
bbd9b64e 6066{
16f8a6f9 6067 return kvm_get_dr(emul_to_vcpu(ctxt), dr, dest);
bbd9b64e
CO
6068}
6069
52eb5a6d
XL
6070static int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr,
6071 unsigned long value)
bbd9b64e 6072{
338dbc97 6073
717746e3 6074 return __kvm_set_dr(emul_to_vcpu(ctxt), dr, value);
bbd9b64e
CO
6075}
6076
52a46617 6077static u64 mk_cr_64(u64 curr_cr, u32 new_val)
5fdbf976 6078{
52a46617 6079 return (curr_cr & ~((1ULL << 32) - 1)) | new_val;
5fdbf976
MT
6080}
6081
717746e3 6082static unsigned long emulator_get_cr(struct x86_emulate_ctxt *ctxt, int cr)
bbd9b64e 6083{
717746e3 6084 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
52a46617
GN
6085 unsigned long value;
6086
6087 switch (cr) {
6088 case 0:
6089 value = kvm_read_cr0(vcpu);
6090 break;
6091 case 2:
6092 value = vcpu->arch.cr2;
6093 break;
6094 case 3:
9f8fe504 6095 value = kvm_read_cr3(vcpu);
52a46617
GN
6096 break;
6097 case 4:
6098 value = kvm_read_cr4(vcpu);
6099 break;
6100 case 8:
6101 value = kvm_get_cr8(vcpu);
6102 break;
6103 default:
a737f256 6104 kvm_err("%s: unexpected cr %u\n", __func__, cr);
52a46617
GN
6105 return 0;
6106 }
6107
6108 return value;
6109}
6110
717746e3 6111static int emulator_set_cr(struct x86_emulate_ctxt *ctxt, int cr, ulong val)
52a46617 6112{
717746e3 6113 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
0f12244f
GN
6114 int res = 0;
6115
52a46617
GN
6116 switch (cr) {
6117 case 0:
49a9b07e 6118 res = kvm_set_cr0(vcpu, mk_cr_64(kvm_read_cr0(vcpu), val));
52a46617
GN
6119 break;
6120 case 2:
6121 vcpu->arch.cr2 = val;
6122 break;
6123 case 3:
2390218b 6124 res = kvm_set_cr3(vcpu, val);
52a46617
GN
6125 break;
6126 case 4:
a83b29c6 6127 res = kvm_set_cr4(vcpu, mk_cr_64(kvm_read_cr4(vcpu), val));
52a46617
GN
6128 break;
6129 case 8:
eea1cff9 6130 res = kvm_set_cr8(vcpu, val);
52a46617
GN
6131 break;
6132 default:
a737f256 6133 kvm_err("%s: unexpected cr %u\n", __func__, cr);
0f12244f 6134 res = -1;
52a46617 6135 }
0f12244f
GN
6136
6137 return res;
52a46617
GN
6138}
6139
717746e3 6140static int emulator_get_cpl(struct x86_emulate_ctxt *ctxt)
9c537244 6141{
afaf0b2f 6142 return kvm_x86_ops.get_cpl(emul_to_vcpu(ctxt));
9c537244
GN
6143}
6144
4bff1e86 6145static void emulator_get_gdt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt)
2dafc6c2 6146{
afaf0b2f 6147 kvm_x86_ops.get_gdt(emul_to_vcpu(ctxt), dt);
2dafc6c2
GN
6148}
6149
4bff1e86 6150static void emulator_get_idt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt)
160ce1f1 6151{
afaf0b2f 6152 kvm_x86_ops.get_idt(emul_to_vcpu(ctxt), dt);
160ce1f1
MG
6153}
6154
1ac9d0cf
AK
6155static void emulator_set_gdt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt)
6156{
afaf0b2f 6157 kvm_x86_ops.set_gdt(emul_to_vcpu(ctxt), dt);
1ac9d0cf
AK
6158}
6159
6160static void emulator_set_idt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt)
6161{
afaf0b2f 6162 kvm_x86_ops.set_idt(emul_to_vcpu(ctxt), dt);
1ac9d0cf
AK
6163}
6164
4bff1e86
AK
6165static unsigned long emulator_get_cached_segment_base(
6166 struct x86_emulate_ctxt *ctxt, int seg)
5951c442 6167{
4bff1e86 6168 return get_segment_base(emul_to_vcpu(ctxt), seg);
5951c442
GN
6169}
6170
1aa36616
AK
6171static bool emulator_get_segment(struct x86_emulate_ctxt *ctxt, u16 *selector,
6172 struct desc_struct *desc, u32 *base3,
6173 int seg)
2dafc6c2
GN
6174{
6175 struct kvm_segment var;
6176
4bff1e86 6177 kvm_get_segment(emul_to_vcpu(ctxt), &var, seg);
1aa36616 6178 *selector = var.selector;
2dafc6c2 6179
378a8b09
GN
6180 if (var.unusable) {
6181 memset(desc, 0, sizeof(*desc));
f0367ee1
RK
6182 if (base3)
6183 *base3 = 0;
2dafc6c2 6184 return false;
378a8b09 6185 }
2dafc6c2
GN
6186
6187 if (var.g)
6188 var.limit >>= 12;
6189 set_desc_limit(desc, var.limit);
6190 set_desc_base(desc, (unsigned long)var.base);
5601d05b
GN
6191#ifdef CONFIG_X86_64
6192 if (base3)
6193 *base3 = var.base >> 32;
6194#endif
2dafc6c2
GN
6195 desc->type = var.type;
6196 desc->s = var.s;
6197 desc->dpl = var.dpl;
6198 desc->p = var.present;
6199 desc->avl = var.avl;
6200 desc->l = var.l;
6201 desc->d = var.db;
6202 desc->g = var.g;
6203
6204 return true;
6205}
6206
1aa36616
AK
6207static void emulator_set_segment(struct x86_emulate_ctxt *ctxt, u16 selector,
6208 struct desc_struct *desc, u32 base3,
6209 int seg)
2dafc6c2 6210{
4bff1e86 6211 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
2dafc6c2
GN
6212 struct kvm_segment var;
6213
1aa36616 6214 var.selector = selector;
2dafc6c2 6215 var.base = get_desc_base(desc);
5601d05b
GN
6216#ifdef CONFIG_X86_64
6217 var.base |= ((u64)base3) << 32;
6218#endif
2dafc6c2
GN
6219 var.limit = get_desc_limit(desc);
6220 if (desc->g)
6221 var.limit = (var.limit << 12) | 0xfff;
6222 var.type = desc->type;
2dafc6c2
GN
6223 var.dpl = desc->dpl;
6224 var.db = desc->d;
6225 var.s = desc->s;
6226 var.l = desc->l;
6227 var.g = desc->g;
6228 var.avl = desc->avl;
6229 var.present = desc->p;
6230 var.unusable = !var.present;
6231 var.padding = 0;
6232
6233 kvm_set_segment(vcpu, &var, seg);
6234 return;
6235}
6236
717746e3
AK
6237static int emulator_get_msr(struct x86_emulate_ctxt *ctxt,
6238 u32 msr_index, u64 *pdata)
6239{
f20935d8 6240 return kvm_get_msr(emul_to_vcpu(ctxt), msr_index, pdata);
717746e3
AK
6241}
6242
6243static int emulator_set_msr(struct x86_emulate_ctxt *ctxt,
6244 u32 msr_index, u64 data)
6245{
f20935d8 6246 return kvm_set_msr(emul_to_vcpu(ctxt), msr_index, data);
717746e3
AK
6247}
6248
64d60670
PB
6249static u64 emulator_get_smbase(struct x86_emulate_ctxt *ctxt)
6250{
6251 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
6252
6253 return vcpu->arch.smbase;
6254}
6255
6256static void emulator_set_smbase(struct x86_emulate_ctxt *ctxt, u64 smbase)
6257{
6258 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
6259
6260 vcpu->arch.smbase = smbase;
6261}
6262
67f4d428
NA
6263static int emulator_check_pmc(struct x86_emulate_ctxt *ctxt,
6264 u32 pmc)
6265{
98ff80f5 6266 return kvm_pmu_is_valid_rdpmc_ecx(emul_to_vcpu(ctxt), pmc);
67f4d428
NA
6267}
6268
222d21aa
AK
6269static int emulator_read_pmc(struct x86_emulate_ctxt *ctxt,
6270 u32 pmc, u64 *pdata)
6271{
c6702c9d 6272 return kvm_pmu_rdpmc(emul_to_vcpu(ctxt), pmc, pdata);
222d21aa
AK
6273}
6274
6c3287f7
AK
6275static void emulator_halt(struct x86_emulate_ctxt *ctxt)
6276{
6277 emul_to_vcpu(ctxt)->arch.halt_request = 1;
6278}
6279
2953538e 6280static int emulator_intercept(struct x86_emulate_ctxt *ctxt,
8a76d7f2 6281 struct x86_instruction_info *info,
c4f035c6
AK
6282 enum x86_intercept_stage stage)
6283{
afaf0b2f 6284 return kvm_x86_ops.check_intercept(emul_to_vcpu(ctxt), info, stage,
21f1b8f2 6285 &ctxt->exception);
c4f035c6
AK
6286}
6287
e911eb3b 6288static bool emulator_get_cpuid(struct x86_emulate_ctxt *ctxt,
f91af517
SC
6289 u32 *eax, u32 *ebx, u32 *ecx, u32 *edx,
6290 bool exact_only)
bdb42f5a 6291{
f91af517 6292 return kvm_cpuid(emul_to_vcpu(ctxt), eax, ebx, ecx, edx, exact_only);
bdb42f5a
SB
6293}
6294
5ae78e95
SC
6295static bool emulator_guest_has_long_mode(struct x86_emulate_ctxt *ctxt)
6296{
6297 return guest_cpuid_has(emul_to_vcpu(ctxt), X86_FEATURE_LM);
6298}
6299
6300static bool emulator_guest_has_movbe(struct x86_emulate_ctxt *ctxt)
6301{
6302 return guest_cpuid_has(emul_to_vcpu(ctxt), X86_FEATURE_MOVBE);
6303}
6304
6305static bool emulator_guest_has_fxsr(struct x86_emulate_ctxt *ctxt)
6306{
6307 return guest_cpuid_has(emul_to_vcpu(ctxt), X86_FEATURE_FXSR);
6308}
6309
dd856efa
AK
6310static ulong emulator_read_gpr(struct x86_emulate_ctxt *ctxt, unsigned reg)
6311{
6312 return kvm_register_read(emul_to_vcpu(ctxt), reg);
6313}
6314
6315static void emulator_write_gpr(struct x86_emulate_ctxt *ctxt, unsigned reg, ulong val)
6316{
6317 kvm_register_write(emul_to_vcpu(ctxt), reg, val);
6318}
6319
801806d9
NA
6320static void emulator_set_nmi_mask(struct x86_emulate_ctxt *ctxt, bool masked)
6321{
afaf0b2f 6322 kvm_x86_ops.set_nmi_mask(emul_to_vcpu(ctxt), masked);
801806d9
NA
6323}
6324
6ed071f0
LP
6325static unsigned emulator_get_hflags(struct x86_emulate_ctxt *ctxt)
6326{
6327 return emul_to_vcpu(ctxt)->arch.hflags;
6328}
6329
6330static void emulator_set_hflags(struct x86_emulate_ctxt *ctxt, unsigned emul_flags)
6331{
c5833c7a 6332 emul_to_vcpu(ctxt)->arch.hflags = emul_flags;
6ed071f0
LP
6333}
6334
ed19321f
SC
6335static int emulator_pre_leave_smm(struct x86_emulate_ctxt *ctxt,
6336 const char *smstate)
0234bf88 6337{
afaf0b2f 6338 return kvm_x86_ops.pre_leave_smm(emul_to_vcpu(ctxt), smstate);
0234bf88
LP
6339}
6340
c5833c7a
SC
6341static void emulator_post_leave_smm(struct x86_emulate_ctxt *ctxt)
6342{
6343 kvm_smm_changed(emul_to_vcpu(ctxt));
6344}
6345
02d4160f
VK
6346static int emulator_set_xcr(struct x86_emulate_ctxt *ctxt, u32 index, u64 xcr)
6347{
6348 return __kvm_set_xcr(emul_to_vcpu(ctxt), index, xcr);
6349}
6350
0225fb50 6351static const struct x86_emulate_ops emulate_ops = {
dd856efa
AK
6352 .read_gpr = emulator_read_gpr,
6353 .write_gpr = emulator_write_gpr,
ce14e868
PB
6354 .read_std = emulator_read_std,
6355 .write_std = emulator_write_std,
7a036a6f 6356 .read_phys = kvm_read_guest_phys_system,
1871c602 6357 .fetch = kvm_fetch_guest_virt,
bbd9b64e
CO
6358 .read_emulated = emulator_read_emulated,
6359 .write_emulated = emulator_write_emulated,
6360 .cmpxchg_emulated = emulator_cmpxchg_emulated,
3cb16fe7 6361 .invlpg = emulator_invlpg,
cf8f70bf
GN
6362 .pio_in_emulated = emulator_pio_in_emulated,
6363 .pio_out_emulated = emulator_pio_out_emulated,
1aa36616
AK
6364 .get_segment = emulator_get_segment,
6365 .set_segment = emulator_set_segment,
5951c442 6366 .get_cached_segment_base = emulator_get_cached_segment_base,
2dafc6c2 6367 .get_gdt = emulator_get_gdt,
160ce1f1 6368 .get_idt = emulator_get_idt,
1ac9d0cf
AK
6369 .set_gdt = emulator_set_gdt,
6370 .set_idt = emulator_set_idt,
52a46617
GN
6371 .get_cr = emulator_get_cr,
6372 .set_cr = emulator_set_cr,
9c537244 6373 .cpl = emulator_get_cpl,
35aa5375
GN
6374 .get_dr = emulator_get_dr,
6375 .set_dr = emulator_set_dr,
64d60670
PB
6376 .get_smbase = emulator_get_smbase,
6377 .set_smbase = emulator_set_smbase,
717746e3
AK
6378 .set_msr = emulator_set_msr,
6379 .get_msr = emulator_get_msr,
67f4d428 6380 .check_pmc = emulator_check_pmc,
222d21aa 6381 .read_pmc = emulator_read_pmc,
6c3287f7 6382 .halt = emulator_halt,
bcaf5cc5 6383 .wbinvd = emulator_wbinvd,
d6aa1000 6384 .fix_hypercall = emulator_fix_hypercall,
c4f035c6 6385 .intercept = emulator_intercept,
bdb42f5a 6386 .get_cpuid = emulator_get_cpuid,
5ae78e95
SC
6387 .guest_has_long_mode = emulator_guest_has_long_mode,
6388 .guest_has_movbe = emulator_guest_has_movbe,
6389 .guest_has_fxsr = emulator_guest_has_fxsr,
801806d9 6390 .set_nmi_mask = emulator_set_nmi_mask,
6ed071f0
LP
6391 .get_hflags = emulator_get_hflags,
6392 .set_hflags = emulator_set_hflags,
0234bf88 6393 .pre_leave_smm = emulator_pre_leave_smm,
c5833c7a 6394 .post_leave_smm = emulator_post_leave_smm,
02d4160f 6395 .set_xcr = emulator_set_xcr,
bbd9b64e
CO
6396};
6397
95cb2295
GN
6398static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask)
6399{
afaf0b2f 6400 u32 int_shadow = kvm_x86_ops.get_interrupt_shadow(vcpu);
95cb2295
GN
6401 /*
6402 * An "sti; sti" sequence only disables interrupts for the first
6403 * instruction. So, if the last instruction, be it emulated or
6404 * not, left the system with the INT_STI flag enabled, it
6405 * means that the last instruction was an sti. We should not
6406 * leave the flag on in this case. The same goes for mov ss.
6407 */
37ccdcbe
PB
6408 if (int_shadow & mask)
6409 mask = 0;
6addfc42 6410 if (unlikely(int_shadow || mask)) {
afaf0b2f 6411 kvm_x86_ops.set_interrupt_shadow(vcpu, mask);
6addfc42
PB
6412 if (!mask)
6413 kvm_make_request(KVM_REQ_EVENT, vcpu);
6414 }
95cb2295
GN
6415}
6416
ef54bcfe 6417static bool inject_emulated_exception(struct kvm_vcpu *vcpu)
54b8486f 6418{
c9b8b07c 6419 struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
da9cb575 6420 if (ctxt->exception.vector == PF_VECTOR)
53b3d8e9 6421 return kvm_inject_emulated_page_fault(vcpu, &ctxt->exception);
ef54bcfe
PB
6422
6423 if (ctxt->exception.error_code_valid)
da9cb575
AK
6424 kvm_queue_exception_e(vcpu, ctxt->exception.vector,
6425 ctxt->exception.error_code);
54b8486f 6426 else
da9cb575 6427 kvm_queue_exception(vcpu, ctxt->exception.vector);
ef54bcfe 6428 return false;
54b8486f
GN
6429}
6430
c9b8b07c
SC
6431static struct x86_emulate_ctxt *alloc_emulate_ctxt(struct kvm_vcpu *vcpu)
6432{
6433 struct x86_emulate_ctxt *ctxt;
6434
6435 ctxt = kmem_cache_zalloc(x86_emulator_cache, GFP_KERNEL_ACCOUNT);
6436 if (!ctxt) {
6437 pr_err("kvm: failed to allocate vcpu's emulator\n");
6438 return NULL;
6439 }
6440
6441 ctxt->vcpu = vcpu;
6442 ctxt->ops = &emulate_ops;
6443 vcpu->arch.emulate_ctxt = ctxt;
6444
6445 return ctxt;
6446}
6447
8ec4722d
MG
6448static void init_emulate_ctxt(struct kvm_vcpu *vcpu)
6449{
c9b8b07c 6450 struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
8ec4722d
MG
6451 int cs_db, cs_l;
6452
afaf0b2f 6453 kvm_x86_ops.get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
8ec4722d 6454
744e699c 6455 ctxt->gpa_available = false;
adf52235 6456 ctxt->eflags = kvm_get_rflags(vcpu);
c8401dda
PB
6457 ctxt->tf = (ctxt->eflags & X86_EFLAGS_TF) != 0;
6458
adf52235
TY
6459 ctxt->eip = kvm_rip_read(vcpu);
6460 ctxt->mode = (!is_protmode(vcpu)) ? X86EMUL_MODE_REAL :
6461 (ctxt->eflags & X86_EFLAGS_VM) ? X86EMUL_MODE_VM86 :
42bf549f 6462 (cs_l && is_long_mode(vcpu)) ? X86EMUL_MODE_PROT64 :
adf52235
TY
6463 cs_db ? X86EMUL_MODE_PROT32 :
6464 X86EMUL_MODE_PROT16;
a584539b 6465 BUILD_BUG_ON(HF_GUEST_MASK != X86EMUL_GUEST_MASK);
64d60670
PB
6466 BUILD_BUG_ON(HF_SMM_MASK != X86EMUL_SMM_MASK);
6467 BUILD_BUG_ON(HF_SMM_INSIDE_NMI_MASK != X86EMUL_SMM_INSIDE_NMI_MASK);
adf52235 6468
dd856efa 6469 init_decode_cache(ctxt);
7ae441ea 6470 vcpu->arch.emulate_regs_need_sync_from_vcpu = false;
8ec4722d
MG
6471}
6472
9497e1f2 6473void kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip)
63995653 6474{
c9b8b07c 6475 struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
63995653
MG
6476 int ret;
6477
6478 init_emulate_ctxt(vcpu);
6479
9dac77fa
AK
6480 ctxt->op_bytes = 2;
6481 ctxt->ad_bytes = 2;
6482 ctxt->_eip = ctxt->eip + inc_eip;
9d74191a 6483 ret = emulate_int_real(ctxt, irq);
63995653 6484
9497e1f2
SC
6485 if (ret != X86EMUL_CONTINUE) {
6486 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
6487 } else {
6488 ctxt->eip = ctxt->_eip;
6489 kvm_rip_write(vcpu, ctxt->eip);
6490 kvm_set_rflags(vcpu, ctxt->eflags);
6491 }
63995653
MG
6492}
6493EXPORT_SYMBOL_GPL(kvm_inject_realmode_interrupt);
6494
e2366171 6495static int handle_emulation_failure(struct kvm_vcpu *vcpu, int emulation_type)
6d77dbfc 6496{
6d77dbfc
GN
6497 ++vcpu->stat.insn_emulation_fail;
6498 trace_kvm_emulate_insn_failed(vcpu);
e2366171 6499
42cbf068
SC
6500 if (emulation_type & EMULTYPE_VMWARE_GP) {
6501 kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
60fc3d02 6502 return 1;
42cbf068 6503 }
e2366171 6504
738fece4
SC
6505 if (emulation_type & EMULTYPE_SKIP) {
6506 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
6507 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
6508 vcpu->run->internal.ndata = 0;
60fc3d02 6509 return 0;
738fece4
SC
6510 }
6511
22da61c9
SC
6512 kvm_queue_exception(vcpu, UD_VECTOR);
6513
afaf0b2f 6514 if (!is_guest_mode(vcpu) && kvm_x86_ops.get_cpl(vcpu) == 0) {
fc3a9157
JR
6515 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
6516 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
6517 vcpu->run->internal.ndata = 0;
60fc3d02 6518 return 0;
fc3a9157 6519 }
e2366171 6520
60fc3d02 6521 return 1;
6d77dbfc
GN
6522}
6523
736c291c 6524static bool reexecute_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
991eebf9
GN
6525 bool write_fault_to_shadow_pgtable,
6526 int emulation_type)
a6f177ef 6527{
736c291c 6528 gpa_t gpa = cr2_or_gpa;
ba049e93 6529 kvm_pfn_t pfn;
a6f177ef 6530
92daa48b 6531 if (!(emulation_type & EMULTYPE_ALLOW_RETRY_PF))
991eebf9
GN
6532 return false;
6533
92daa48b
SC
6534 if (WARN_ON_ONCE(is_guest_mode(vcpu)) ||
6535 WARN_ON_ONCE(!(emulation_type & EMULTYPE_PF)))
6c3dfeb6
SC
6536 return false;
6537
44dd3ffa 6538 if (!vcpu->arch.mmu->direct_map) {
95b3cf69
XG
6539 /*
6540 * Write permission should be allowed since only
6541 * write accesses need to be emulated.
6542 */
736c291c 6543 gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2_or_gpa, NULL);
a6f177ef 6544
95b3cf69
XG
6545 /*
6546 * If the mapping is invalid in the guest, let the CPU retry
6547 * it to generate the fault.
6548 */
6549 if (gpa == UNMAPPED_GVA)
6550 return true;
6551 }
a6f177ef 6552
8e3d9d06
XG
6553 /*
6554 * Do not retry the unhandleable instruction if it faults on the
6555 * read-only host memory; otherwise it will get into an infinite loop:
6556 * retry instruction -> write #PF -> emulation fail -> retry
6557 * instruction -> ...
6558 */
6559 pfn = gfn_to_pfn(vcpu->kvm, gpa_to_gfn(gpa));
95b3cf69
XG
6560
6561 /*
6562 * If the instruction failed on the error pfn, it cannot be fixed;
6563 * report the error to userspace.
6564 */
6565 if (is_error_noslot_pfn(pfn))
6566 return false;
6567
6568 kvm_release_pfn_clean(pfn);
6569
6570 /* The instructions are well-emulated on direct mmu. */
44dd3ffa 6571 if (vcpu->arch.mmu->direct_map) {
95b3cf69
XG
6572 unsigned int indirect_shadow_pages;
6573
6574 spin_lock(&vcpu->kvm->mmu_lock);
6575 indirect_shadow_pages = vcpu->kvm->arch.indirect_shadow_pages;
6576 spin_unlock(&vcpu->kvm->mmu_lock);
6577
6578 if (indirect_shadow_pages)
6579 kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa));
6580
a6f177ef 6581 return true;
8e3d9d06 6582 }
a6f177ef 6583
95b3cf69
XG
6584 /*
6585 * If emulation was due to access to a shadowed page table
6586 * and it failed, try to unshadow the page and re-enter the
6587 * guest to let the CPU execute the instruction.
6588 */
6589 kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa));
93c05d3e
XG
6590
6591 /*
6592 * If the access faults on its page table, it cannot
6593 * be fixed by unprotecting the shadow page, and it should
6594 * be reported to userspace.
6595 */
6596 return !write_fault_to_shadow_pgtable;
a6f177ef
GN
6597}
6598
1cb3f3ae 6599static bool retry_instruction(struct x86_emulate_ctxt *ctxt,
736c291c 6600 gpa_t cr2_or_gpa, int emulation_type)
1cb3f3ae
XG
6601{
6602 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
736c291c 6603 unsigned long last_retry_eip, last_retry_addr, gpa = cr2_or_gpa;
1cb3f3ae
XG
6604
6605 last_retry_eip = vcpu->arch.last_retry_eip;
6606 last_retry_addr = vcpu->arch.last_retry_addr;
6607
6608 /*
6609 * If the emulation is caused by #PF and it is a non-page-table-
6610 * writing instruction, it means the VM-EXIT is caused by the shadow
6611 * page being write-protected; we can zap the shadow page and retry
6612 * this instruction directly.
6613 *
6614 * Note: if the guest uses a non-page-table-modifying instruction
6615 * on the PDE that points to the instruction, then we will unmap
6616 * the instruction and get into an infinite loop. So, we cache the
6617 * last retried eip and the last fault address; if we meet the eip
6618 * and the address again, we can break out of the potential infinite
6619 * loop.
6620 */
6621 vcpu->arch.last_retry_eip = vcpu->arch.last_retry_addr = 0;
6622
92daa48b 6623 if (!(emulation_type & EMULTYPE_ALLOW_RETRY_PF))
1cb3f3ae
XG
6624 return false;
6625
92daa48b
SC
6626 if (WARN_ON_ONCE(is_guest_mode(vcpu)) ||
6627 WARN_ON_ONCE(!(emulation_type & EMULTYPE_PF)))
6c3dfeb6
SC
6628 return false;
6629
1cb3f3ae
XG
6630 if (x86_page_table_writing_insn(ctxt))
6631 return false;
6632
736c291c 6633 if (ctxt->eip == last_retry_eip && last_retry_addr == cr2_or_gpa)
1cb3f3ae
XG
6634 return false;
6635
6636 vcpu->arch.last_retry_eip = ctxt->eip;
736c291c 6637 vcpu->arch.last_retry_addr = cr2_or_gpa;
1cb3f3ae 6638
44dd3ffa 6639 if (!vcpu->arch.mmu->direct_map)
736c291c 6640 gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2_or_gpa, NULL);
1cb3f3ae 6641
22368028 6642 kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa));
1cb3f3ae
XG
6643
6644 return true;
6645}
6646
716d51ab
GN
6647static int complete_emulated_mmio(struct kvm_vcpu *vcpu);
6648static int complete_emulated_pio(struct kvm_vcpu *vcpu);
6649
64d60670 6650static void kvm_smm_changed(struct kvm_vcpu *vcpu)
a584539b 6651{
64d60670 6652 if (!(vcpu->arch.hflags & HF_SMM_MASK)) {
660a5d51
PB
6653 /* This is a good place to trace that we are exiting SMM. */
6654 trace_kvm_enter_smm(vcpu->vcpu_id, vcpu->arch.smbase, false);
6655
c43203ca
PB
6656 /* Process a latched INIT or SMI, if any. */
6657 kvm_make_request(KVM_REQ_EVENT, vcpu);
64d60670 6658 }
699023e2
PB
6659
6660 kvm_mmu_reset_context(vcpu);
64d60670
PB
6661}
6662
4a1e10d5
PB
6663static int kvm_vcpu_check_hw_bp(unsigned long addr, u32 type, u32 dr7,
6664 unsigned long *db)
6665{
6666 u32 dr6 = 0;
6667 int i;
6668 u32 enable, rwlen;
6669
6670 enable = dr7;
6671 rwlen = dr7 >> 16;
6672 for (i = 0; i < 4; i++, enable >>= 2, rwlen >>= 4)
6673 if ((enable & 3) && (rwlen & 15) == type && db[i] == addr)
6674 dr6 |= (1 << i);
6675 return dr6;
6676}
6677
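/*
 * Descriptive comment added for clarity: deliver a single-step #DB.  If
 * userspace is single-stepping the guest, exit with KVM_EXIT_DEBUG and
 * return 0; otherwise queue a #DB with DR6.BS for the guest and return 1,
 * matching the emulation return convention.
 */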
120c2c4f 6678static int kvm_vcpu_do_singlestep(struct kvm_vcpu *vcpu)
663f4c61
PB
6679{
6680 struct kvm_run *kvm_run = vcpu->run;
6681
c8401dda
PB
6682 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) {
6683 kvm_run->debug.arch.dr6 = DR6_BS | DR6_FIXED_1 | DR6_RTM;
d5d260c5 6684 kvm_run->debug.arch.pc = kvm_get_linear_rip(vcpu);
c8401dda
PB
6685 kvm_run->debug.arch.exception = DB_VECTOR;
6686 kvm_run->exit_reason = KVM_EXIT_DEBUG;
60fc3d02 6687 return 0;
663f4c61 6688 }
120c2c4f 6689 kvm_queue_exception_p(vcpu, DB_VECTOR, DR6_BS);
60fc3d02 6690 return 1;
663f4c61
PB
6691}
6692
6affcbed
KH
6693int kvm_skip_emulated_instruction(struct kvm_vcpu *vcpu)
6694{
afaf0b2f 6695 unsigned long rflags = kvm_x86_ops.get_rflags(vcpu);
f8ea7c60 6696 int r;
6affcbed 6697
afaf0b2f 6698 r = kvm_x86_ops.skip_emulated_instruction(vcpu);
60fc3d02 6699 if (unlikely(!r))
f8ea7c60 6700 return 0;
c8401dda
PB
6701
6702 /*
6703 * rflags is the old, "raw" value of the flags. The new value has
6704 * not been saved yet.
6705 *
6706 * This is correct even for TF set by the guest, because "the
6707 * processor will not generate this exception after the instruction
6708 * that sets the TF flag".
6709 */
6710 if (unlikely(rflags & X86_EFLAGS_TF))
120c2c4f 6711 r = kvm_vcpu_do_singlestep(vcpu);
60fc3d02 6712 return r;
6affcbed
KH
6713}
6714EXPORT_SYMBOL_GPL(kvm_skip_emulated_instruction);
6715
4a1e10d5
PB
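/*
 * Descriptive comment added for clarity: check for host (guest_debug) and
 * guest code breakpoints at the current RIP before emulation starts.  If
 * one fires, *r is set to the value the emulation path should return and
 * true is returned so the caller can bail out without emulating.
 */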
6716static bool kvm_vcpu_check_breakpoint(struct kvm_vcpu *vcpu, int *r)
6717{
4a1e10d5
PB
6718 if (unlikely(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) &&
6719 (vcpu->arch.guest_debug_dr7 & DR7_BP_EN_MASK)) {
82b32774
NA
6720 struct kvm_run *kvm_run = vcpu->run;
6721 unsigned long eip = kvm_get_linear_rip(vcpu);
6722 u32 dr6 = kvm_vcpu_check_hw_bp(eip, 0,
4a1e10d5
PB
6723 vcpu->arch.guest_debug_dr7,
6724 vcpu->arch.eff_db);
6725
6726 if (dr6 != 0) {
6f43ed01 6727 kvm_run->debug.arch.dr6 = dr6 | DR6_FIXED_1 | DR6_RTM;
82b32774 6728 kvm_run->debug.arch.pc = eip;
4a1e10d5
PB
6729 kvm_run->debug.arch.exception = DB_VECTOR;
6730 kvm_run->exit_reason = KVM_EXIT_DEBUG;
60fc3d02 6731 *r = 0;
4a1e10d5
PB
6732 return true;
6733 }
6734 }
6735
4161a569
NA
6736 if (unlikely(vcpu->arch.dr7 & DR7_BP_EN_MASK) &&
6737 !(kvm_get_rflags(vcpu) & X86_EFLAGS_RF)) {
82b32774
NA
6738 unsigned long eip = kvm_get_linear_rip(vcpu);
6739 u32 dr6 = kvm_vcpu_check_hw_bp(eip, 0,
4a1e10d5
PB
6740 vcpu->arch.dr7,
6741 vcpu->arch.db);
6742
6743 if (dr6 != 0) {
4d5523cf 6744 kvm_queue_exception_p(vcpu, DB_VECTOR, dr6);
60fc3d02 6745 *r = 1;
4a1e10d5
PB
6746 return true;
6747 }
6748 }
6749
6750 return false;
6751}
6752
04789b66
LA
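/*
 * Descriptive comment added for clarity: only IN/OUT/INS/OUTS and RDPMC
 * are recognized as VMware backdoor opcodes; anything else emulated with
 * EMULTYPE_VMWARE_GP is rejected by the caller with a #GP.
 */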
6753static bool is_vmware_backdoor_opcode(struct x86_emulate_ctxt *ctxt)
6754{
2d7921c4
AM
6755 switch (ctxt->opcode_len) {
6756 case 1:
6757 switch (ctxt->b) {
6758 case 0xe4: /* IN */
6759 case 0xe5:
6760 case 0xec:
6761 case 0xed:
6762 case 0xe6: /* OUT */
6763 case 0xe7:
6764 case 0xee:
6765 case 0xef:
6766 case 0x6c: /* INS */
6767 case 0x6d:
6768 case 0x6e: /* OUTS */
6769 case 0x6f:
6770 return true;
6771 }
6772 break;
6773 case 2:
6774 switch (ctxt->b) {
6775 case 0x33: /* RDPMC */
6776 return true;
6777 }
6778 break;
04789b66
LA
6779 }
6780
6781 return false;
6782}
6783
736c291c
SC
6784int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
6785 int emulation_type, void *insn, int insn_len)
bbd9b64e 6786{
95cb2295 6787 int r;
c9b8b07c 6788 struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
7ae441ea 6789 bool writeback = true;
93c05d3e 6790 bool write_fault_to_spt = vcpu->arch.write_fault_to_shadow_pgtable;
bbd9b64e 6791
c595ceee
PB
6792 vcpu->arch.l1tf_flush_l1d = true;
6793
93c05d3e
XG
6794 /*
6795 * Clear write_fault_to_shadow_pgtable here to ensure it is
6796 * never reused.
6797 */
6798 vcpu->arch.write_fault_to_shadow_pgtable = false;
26eef70c 6799 kvm_clear_exception_queue(vcpu);
8d7d8102 6800
571008da 6801 if (!(emulation_type & EMULTYPE_NO_DECODE)) {
8ec4722d 6802 init_emulate_ctxt(vcpu);
4a1e10d5
PB
6803
6804 /*
6805 * We will reenter on the same instruction since
6806 * we do not set complete_userspace_io. This does not
 6807 * handle watchpoints yet; those would be handled in
6808 * the emulate_ops.
6809 */
d391f120
VK
6810 if (!(emulation_type & EMULTYPE_SKIP) &&
6811 kvm_vcpu_check_breakpoint(vcpu, &r))
4a1e10d5
PB
6812 return r;
6813
9d74191a
TY
6814 ctxt->interruptibility = 0;
6815 ctxt->have_exception = false;
e0ad0b47 6816 ctxt->exception.vector = -1;
9d74191a 6817 ctxt->perm_ok = false;
bbd9b64e 6818
b51e974f 6819 ctxt->ud = emulation_type & EMULTYPE_TRAP_UD;
4005996e 6820
9d74191a 6821 r = x86_decode_insn(ctxt, insn, insn_len);
bbd9b64e 6822
e46479f8 6823 trace_kvm_emulate_insn_start(vcpu);
f2b5756b 6824 ++vcpu->stat.insn_emulation;
1d2887e2 6825 if (r != EMULATION_OK) {
b4000606 6826 if ((emulation_type & EMULTYPE_TRAP_UD) ||
c83fad65
SC
6827 (emulation_type & EMULTYPE_TRAP_UD_FORCED)) {
6828 kvm_queue_exception(vcpu, UD_VECTOR);
60fc3d02 6829 return 1;
c83fad65 6830 }
736c291c
SC
6831 if (reexecute_instruction(vcpu, cr2_or_gpa,
6832 write_fault_to_spt,
6833 emulation_type))
60fc3d02 6834 return 1;
8530a79c 6835 if (ctxt->have_exception) {
c8848cee
JD
6836 /*
6837 * #UD should result in just EMULATION_FAILED, and trap-like
6838 * exception should not be encountered during decode.
6839 */
6840 WARN_ON_ONCE(ctxt->exception.vector == UD_VECTOR ||
6841 exception_type(ctxt->exception.vector) == EXCPT_TRAP);
8530a79c 6842 inject_emulated_exception(vcpu);
60fc3d02 6843 return 1;
8530a79c 6844 }
e2366171 6845 return handle_emulation_failure(vcpu, emulation_type);
bbd9b64e
CO
6846 }
6847 }
6848
42cbf068
SC
6849 if ((emulation_type & EMULTYPE_VMWARE_GP) &&
6850 !is_vmware_backdoor_opcode(ctxt)) {
6851 kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
60fc3d02 6852 return 1;
42cbf068 6853 }
04789b66 6854
1957aa63
SC
6855 /*
6856 * Note, EMULTYPE_SKIP is intended for use *only* by vendor callbacks
6857 * for kvm_skip_emulated_instruction(). The caller is responsible for
6858 * updating interruptibility state and injecting single-step #DBs.
6859 */
ba8afb6b 6860 if (emulation_type & EMULTYPE_SKIP) {
9dac77fa 6861 kvm_rip_write(vcpu, ctxt->_eip);
bb663c7a
NA
6862 if (ctxt->eflags & X86_EFLAGS_RF)
6863 kvm_set_rflags(vcpu, ctxt->eflags & ~X86_EFLAGS_RF);
60fc3d02 6864 return 1;
ba8afb6b
GN
6865 }
6866
736c291c 6867 if (retry_instruction(ctxt, cr2_or_gpa, emulation_type))
60fc3d02 6868 return 1;
1cb3f3ae 6869
7ae441ea 6870 /* This is needed for the VMware backdoor interface to work, since it
4d2179e1 6871 changes register values during the I/O operation. */
7ae441ea
GN
6872 if (vcpu->arch.emulate_regs_need_sync_from_vcpu) {
6873 vcpu->arch.emulate_regs_need_sync_from_vcpu = false;
dd856efa 6874 emulator_invalidate_register_cache(ctxt);
7ae441ea 6875 }
4d2179e1 6876
5cd21917 6877restart:
92daa48b
SC
6878 if (emulation_type & EMULTYPE_PF) {
6879 /* Save the faulting GPA (cr2) in the address field */
6880 ctxt->exception.address = cr2_or_gpa;
6881
6882 /* With shadow page tables, cr2 contains a GVA or nGPA. */
6883 if (vcpu->arch.mmu->direct_map) {
744e699c
SC
6884 ctxt->gpa_available = true;
6885 ctxt->gpa_val = cr2_or_gpa;
92daa48b
SC
6886 }
6887 } else {
6888 /* Sanitize the address out of an abundance of paranoia. */
6889 ctxt->exception.address = 0;
6890 }
0f89b207 6891
9d74191a 6892 r = x86_emulate_insn(ctxt);
bbd9b64e 6893
775fde86 6894 if (r == EMULATION_INTERCEPTED)
60fc3d02 6895 return 1;
775fde86 6896
d2ddd1c4 6897 if (r == EMULATION_FAILED) {
736c291c 6898 if (reexecute_instruction(vcpu, cr2_or_gpa, write_fault_to_spt,
991eebf9 6899 emulation_type))
60fc3d02 6900 return 1;
c3cd7ffa 6901
e2366171 6902 return handle_emulation_failure(vcpu, emulation_type);
bbd9b64e
CO
6903 }
6904
9d74191a 6905 if (ctxt->have_exception) {
60fc3d02 6906 r = 1;
ef54bcfe
PB
6907 if (inject_emulated_exception(vcpu))
6908 return r;
d2ddd1c4 6909 } else if (vcpu->arch.pio.count) {
0912c977
PB
6910 if (!vcpu->arch.pio.in) {
6911 /* FIXME: return into emulator if single-stepping. */
3457e419 6912 vcpu->arch.pio.count = 0;
0912c977 6913 } else {
7ae441ea 6914 writeback = false;
716d51ab
GN
6915 vcpu->arch.complete_userspace_io = complete_emulated_pio;
6916 }
60fc3d02 6917 r = 0;
7ae441ea 6918 } else if (vcpu->mmio_needed) {
bc8a0aaf
SC
6919 ++vcpu->stat.mmio_exits;
6920
7ae441ea
GN
6921 if (!vcpu->mmio_is_write)
6922 writeback = false;
60fc3d02 6923 r = 0;
716d51ab 6924 vcpu->arch.complete_userspace_io = complete_emulated_mmio;
7ae441ea 6925 } else if (r == EMULATION_RESTART)
5cd21917 6926 goto restart;
d2ddd1c4 6927 else
60fc3d02 6928 r = 1;
f850e2e6 6929
7ae441ea 6930 if (writeback) {
afaf0b2f 6931 unsigned long rflags = kvm_x86_ops.get_rflags(vcpu);
9d74191a 6932 toggle_interruptibility(vcpu, ctxt->interruptibility);
7ae441ea 6933 vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
38827dbd 6934 if (!ctxt->have_exception ||
75ee23b3
SC
6935 exception_type(ctxt->exception.vector) == EXCPT_TRAP) {
6936 kvm_rip_write(vcpu, ctxt->eip);
60fc3d02 6937 if (r && ctxt->tf)
120c2c4f 6938 r = kvm_vcpu_do_singlestep(vcpu);
afaf0b2f
SC
6939 if (kvm_x86_ops.update_emulated_instruction)
6940 kvm_x86_ops.update_emulated_instruction(vcpu);
38827dbd 6941 __kvm_set_rflags(vcpu, ctxt->eflags);
75ee23b3 6942 }
6addfc42
PB
6943
6944 /*
6945 * For STI, interrupts are shadowed; so KVM_REQ_EVENT will
6946 * do nothing, and it will be requested again as soon as
6947 * the shadow expires. But we still need to check here,
6948 * because POPF has no interrupt shadow.
6949 */
6950 if (unlikely((ctxt->eflags & ~rflags) & X86_EFLAGS_IF))
6951 kvm_make_request(KVM_REQ_EVENT, vcpu);
7ae441ea
GN
6952 } else
6953 vcpu->arch.emulate_regs_need_sync_to_vcpu = true;
e85d28f8
GN
6954
6955 return r;
de7d789a 6956}
c60658d1
SC
6957
6958int kvm_emulate_instruction(struct kvm_vcpu *vcpu, int emulation_type)
6959{
6960 return x86_emulate_instruction(vcpu, 0, emulation_type, NULL, 0);
6961}
6962EXPORT_SYMBOL_GPL(kvm_emulate_instruction);
6963
6964int kvm_emulate_instruction_from_buffer(struct kvm_vcpu *vcpu,
6965 void *insn, int insn_len)
6966{
6967 return x86_emulate_instruction(vcpu, 0, 0, insn, insn_len);
6968}
6969EXPORT_SYMBOL_GPL(kvm_emulate_instruction_from_buffer);
de7d789a 6970
8764ed55
SC
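/*
 * Descriptive comment added for clarity: completion callback for the
 * "OUT 0x7e" quirk.  The instruction was already skipped before exiting
 * to userspace, so only the in-flight PIO state needs to be cleared.
 */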
6971static int complete_fast_pio_out_port_0x7e(struct kvm_vcpu *vcpu)
6972{
6973 vcpu->arch.pio.count = 0;
6974 return 1;
6975}
6976
45def77e
SC
6977static int complete_fast_pio_out(struct kvm_vcpu *vcpu)
6978{
6979 vcpu->arch.pio.count = 0;
6980
6981 if (unlikely(!kvm_is_linear_rip(vcpu, vcpu->arch.pio.linear_rip)))
6982 return 1;
6983
6984 return kvm_skip_emulated_instruction(vcpu);
6985}
6986
dca7f128
SC
6987static int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size,
6988 unsigned short port)
de7d789a 6989{
de3cd117 6990 unsigned long val = kvm_rax_read(vcpu);
2e3bb4d8
SC
6991 int ret = emulator_pio_out(vcpu, size, port, &val, 1);
6992
8764ed55
SC
6993 if (ret)
6994 return ret;
45def77e 6995
8764ed55
SC
6996 /*
6997 * Workaround userspace that relies on old KVM behavior of %rip being
6998 * incremented prior to exiting to userspace to handle "OUT 0x7e".
6999 */
7000 if (port == 0x7e &&
7001 kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_OUT_7E_INC_RIP)) {
7002 vcpu->arch.complete_userspace_io =
7003 complete_fast_pio_out_port_0x7e;
7004 kvm_skip_emulated_instruction(vcpu);
7005 } else {
45def77e
SC
7006 vcpu->arch.pio.linear_rip = kvm_get_linear_rip(vcpu);
7007 vcpu->arch.complete_userspace_io = complete_fast_pio_out;
7008 }
8764ed55 7009 return 0;
de7d789a 7010}
de7d789a 7011
8370c3d0
TL
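/*
 * Descriptive comment added for clarity: completion callback for a fast
 * "IN".  Once userspace has provided the data, re-read it through the
 * emulator, store it in RAX and skip the instruction, unless the guest
 * RIP changed underneath us, in which case the result is discarded.
 */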
7012static int complete_fast_pio_in(struct kvm_vcpu *vcpu)
7013{
7014 unsigned long val;
7015
7016 /* We should only ever be called with arch.pio.count equal to 1 */
7017 BUG_ON(vcpu->arch.pio.count != 1);
7018
45def77e
SC
7019 if (unlikely(!kvm_is_linear_rip(vcpu, vcpu->arch.pio.linear_rip))) {
7020 vcpu->arch.pio.count = 0;
7021 return 1;
7022 }
7023
8370c3d0 7024 /* For size less than 4 we merge, else we zero extend */
de3cd117 7025 val = (vcpu->arch.pio.size < 4) ? kvm_rax_read(vcpu) : 0;
8370c3d0
TL
7026
7027 /*
2e3bb4d8 7028 * Since vcpu->arch.pio.count == 1, let emulator_pio_in() perform
8370c3d0
TL
 7029 * the copy and tracing.
7030 */
2e3bb4d8 7031 emulator_pio_in(vcpu, vcpu->arch.pio.size, vcpu->arch.pio.port, &val, 1);
de3cd117 7032 kvm_rax_write(vcpu, val);
8370c3d0 7033
45def77e 7034 return kvm_skip_emulated_instruction(vcpu);
8370c3d0
TL
7035}
7036
dca7f128
SC
7037static int kvm_fast_pio_in(struct kvm_vcpu *vcpu, int size,
7038 unsigned short port)
8370c3d0
TL
7039{
7040 unsigned long val;
7041 int ret;
7042
7043 /* For size less than 4 we merge, else we zero extend */
de3cd117 7044 val = (size < 4) ? kvm_rax_read(vcpu) : 0;
8370c3d0 7045
2e3bb4d8 7046 ret = emulator_pio_in(vcpu, size, port, &val, 1);
8370c3d0 7047 if (ret) {
de3cd117 7048 kvm_rax_write(vcpu, val);
8370c3d0
TL
7049 return ret;
7050 }
7051
45def77e 7052 vcpu->arch.pio.linear_rip = kvm_get_linear_rip(vcpu);
8370c3d0
TL
7053 vcpu->arch.complete_userspace_io = complete_fast_pio_in;
7054
7055 return 0;
7056}
dca7f128
SC
7057
7058int kvm_fast_pio(struct kvm_vcpu *vcpu, int size, unsigned short port, int in)
7059{
45def77e 7060 int ret;
dca7f128 7061
dca7f128 7062 if (in)
45def77e 7063 ret = kvm_fast_pio_in(vcpu, size, port);
dca7f128 7064 else
45def77e
SC
7065 ret = kvm_fast_pio_out(vcpu, size, port);
7066 return ret && kvm_skip_emulated_instruction(vcpu);
dca7f128
SC
7067}
7068EXPORT_SYMBOL_GPL(kvm_fast_pio);
8370c3d0 7069
251a5fd6 7070static int kvmclock_cpu_down_prep(unsigned int cpu)
8cfdc000 7071{
0a3aee0d 7072 __this_cpu_write(cpu_tsc_khz, 0);
251a5fd6 7073 return 0;
8cfdc000
ZA
7074}
7075
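/*
 * Descriptive comment added for clarity: refresh this CPU's cached TSC
 * frequency.  @data is a cpufreq_freqs pointer when called from the
 * cpufreq notifier, or NULL when the frequency should be re-read from
 * cpufreq (or fall back to tsc_khz) directly.
 */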
7076static void tsc_khz_changed(void *data)
c8076604 7077{
8cfdc000
ZA
7078 struct cpufreq_freqs *freq = data;
7079 unsigned long khz = 0;
7080
7081 if (data)
7082 khz = freq->new;
7083 else if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
7084 khz = cpufreq_quick_get(raw_smp_processor_id());
7085 if (!khz)
7086 khz = tsc_khz;
0a3aee0d 7087 __this_cpu_write(cpu_tsc_khz, khz);
c8076604
GH
7088}
7089
5fa4ec9c 7090#ifdef CONFIG_X86_64
0092e434
VK
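/*
 * Descriptive comment added for clarity: Hyper-V reenlightenment
 * callback, invoked when the host's TSC parameters change (typically
 * after migration).  Block kvmclock updates, stop Hyper-V's TSC
 * emulation, refresh the cached per-CPU TSC frequency and make every VM
 * recompute its masterclock before the vCPUs resume.
 */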
7091static void kvm_hyperv_tsc_notifier(void)
7092{
0092e434
VK
7093 struct kvm *kvm;
7094 struct kvm_vcpu *vcpu;
7095 int cpu;
7096
0d9ce162 7097 mutex_lock(&kvm_lock);
0092e434
VK
7098 list_for_each_entry(kvm, &vm_list, vm_list)
7099 kvm_make_mclock_inprogress_request(kvm);
7100
7101 hyperv_stop_tsc_emulation();
7102
7103 /* TSC frequency always matches when on Hyper-V */
7104 for_each_present_cpu(cpu)
7105 per_cpu(cpu_tsc_khz, cpu) = tsc_khz;
7106 kvm_max_guest_tsc_khz = tsc_khz;
7107
7108 list_for_each_entry(kvm, &vm_list, vm_list) {
7109 struct kvm_arch *ka = &kvm->arch;
7110
7111 spin_lock(&ka->pvclock_gtod_sync_lock);
7112
7113 pvclock_update_vm_gtod_copy(kvm);
7114
7115 kvm_for_each_vcpu(cpu, vcpu, kvm)
7116 kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
7117
7118 kvm_for_each_vcpu(cpu, vcpu, kvm)
7119 kvm_clear_request(KVM_REQ_MCLOCK_INPROGRESS, vcpu);
7120
7121 spin_unlock(&ka->pvclock_gtod_sync_lock);
7122 }
0d9ce162 7123 mutex_unlock(&kvm_lock);
0092e434 7124}
5fa4ec9c 7125#endif
0092e434 7126
df24014a 7127static void __kvmclock_cpufreq_notifier(struct cpufreq_freqs *freq, int cpu)
c8076604 7128{
c8076604
GH
7129 struct kvm *kvm;
7130 struct kvm_vcpu *vcpu;
7131 int i, send_ipi = 0;
7132
8cfdc000
ZA
7133 /*
7134 * We allow guests to temporarily run on slowing clocks,
7135 * provided we notify them after, or to run on accelerating
7136 * clocks, provided we notify them before. Thus time never
7137 * goes backwards.
7138 *
7139 * However, we have a problem. We can't atomically update
7140 * the frequency of a given CPU from this function; it is
7141 * merely a notifier, which can be called from any CPU.
7142 * Changing the TSC frequency at arbitrary points in time
7143 * requires a recomputation of local variables related to
7144 * the TSC for each VCPU. We must flag these local variables
7145 * to be updated and be sure the update takes place with the
7146 * new frequency before any guests proceed.
7147 *
7148 * Unfortunately, the combination of hotplug CPU and frequency
7149 * change creates an intractable locking scenario; the order
7150 * of when these callouts happen is undefined with respect to
7151 * CPU hotplug, and they can race with each other. As such,
7152 * merely setting per_cpu(cpu_tsc_khz) = X during a hotadd is
7153 * undefined; you can actually have a CPU frequency change take
7154 * place in between the computation of X and the setting of the
7155 * variable. To protect against this problem, all updates of
7156 * the per_cpu tsc_khz variable are done in an interrupt
7157 * protected IPI, and all callers wishing to update the value
7158 * must wait for a synchronous IPI to complete (which is trivial
7159 * if the caller is on the CPU already). This establishes the
7160 * necessary total order on variable updates.
7161 *
7162 * Note that because a guest time update may take place
7163 * anytime after the setting of the VCPU's request bit, the
7164 * correct TSC value must be set before the request. However,
7165 * to ensure the update actually makes it to any guest which
7166 * starts running in hardware virtualization between the set
7167 * and the acquisition of the spinlock, we must also ping the
7168 * CPU after setting the request bit.
7169 *
7170 */
7171
df24014a 7172 smp_call_function_single(cpu, tsc_khz_changed, freq, 1);
c8076604 7173
0d9ce162 7174 mutex_lock(&kvm_lock);
c8076604 7175 list_for_each_entry(kvm, &vm_list, vm_list) {
988a2cae 7176 kvm_for_each_vcpu(i, vcpu, kvm) {
df24014a 7177 if (vcpu->cpu != cpu)
c8076604 7178 continue;
c285545f 7179 kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
0d9ce162 7180 if (vcpu->cpu != raw_smp_processor_id())
8cfdc000 7181 send_ipi = 1;
c8076604
GH
7182 }
7183 }
0d9ce162 7184 mutex_unlock(&kvm_lock);
c8076604
GH
7185
7186 if (freq->old < freq->new && send_ipi) {
7187 /*
 7188 * We upscale the frequency. We must make sure the guest
 7189 * doesn't see old kvmclock values while running with
 7190 * the new frequency, otherwise we risk the guest seeing
 7191 * time go backwards.
 7192 *
 7193 * In case we update the frequency for another cpu
 7194 * (which might be in guest context), send an interrupt
 7195 * to kick the cpu out of guest context. The next time
 7196 * guest context is entered, kvmclock will be updated,
 7197 * so the guest will not see stale values.
7198 */
df24014a 7199 smp_call_function_single(cpu, tsc_khz_changed, freq, 1);
c8076604 7200 }
df24014a
VK
7201}
7202
7203static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
7204 void *data)
7205{
7206 struct cpufreq_freqs *freq = data;
7207 int cpu;
7208
7209 if (val == CPUFREQ_PRECHANGE && freq->old > freq->new)
7210 return 0;
7211 if (val == CPUFREQ_POSTCHANGE && freq->old < freq->new)
7212 return 0;
7213
7214 for_each_cpu(cpu, freq->policy->cpus)
7215 __kvmclock_cpufreq_notifier(freq, cpu);
7216
c8076604
GH
7217 return 0;
7218}
7219
7220static struct notifier_block kvmclock_cpufreq_notifier_block = {
8cfdc000
ZA
7221 .notifier_call = kvmclock_cpufreq_notifier
7222};
7223
251a5fd6 7224static int kvmclock_cpu_online(unsigned int cpu)
8cfdc000 7225{
251a5fd6
SAS
7226 tsc_khz_changed(NULL);
7227 return 0;
8cfdc000
ZA
7228}
7229
b820cc0c
ZA
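/*
 * Descriptive comment added for clarity: determine the maximum guest TSC
 * frequency and register the cpufreq and CPU hotplug notifiers that keep
 * the per-CPU cpu_tsc_khz values up to date.
 */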
7230static void kvm_timer_init(void)
7231{
c285545f 7232 max_tsc_khz = tsc_khz;
460dd42e 7233
b820cc0c 7234 if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
c285545f 7235#ifdef CONFIG_CPU_FREQ
aaec7c03 7236 struct cpufreq_policy *policy;
758f588d
BP
7237 int cpu;
7238
3e26f230 7239 cpu = get_cpu();
aaec7c03 7240 policy = cpufreq_cpu_get(cpu);
9a11997e
WL
7241 if (policy) {
7242 if (policy->cpuinfo.max_freq)
7243 max_tsc_khz = policy->cpuinfo.max_freq;
7244 cpufreq_cpu_put(policy);
7245 }
3e26f230 7246 put_cpu();
c285545f 7247#endif
b820cc0c
ZA
7248 cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block,
7249 CPUFREQ_TRANSITION_NOTIFIER);
7250 }
460dd42e 7251
73c1b41e 7252 cpuhp_setup_state(CPUHP_AP_X86_KVM_CLK_ONLINE, "x86/kvm/clk:online",
251a5fd6 7253 kvmclock_cpu_online, kvmclock_cpu_down_prep);
b820cc0c
ZA
7254}
7255
dd60d217
AK
7256DEFINE_PER_CPU(struct kvm_vcpu *, current_vcpu);
7257EXPORT_PER_CPU_SYMBOL_GPL(current_vcpu);
ff9d07a0 7258
f5132b01 7259int kvm_is_in_guest(void)
ff9d07a0 7260{
086c9855 7261 return __this_cpu_read(current_vcpu) != NULL;
ff9d07a0
ZY
7262}
7263
7264static int kvm_is_user_mode(void)
7265{
7266 int user_mode = 3;
dcf46b94 7267
086c9855 7268 if (__this_cpu_read(current_vcpu))
afaf0b2f 7269 user_mode = kvm_x86_ops.get_cpl(__this_cpu_read(current_vcpu));
dcf46b94 7270
ff9d07a0
ZY
7271 return user_mode != 0;
7272}
7273
7274static unsigned long kvm_get_guest_ip(void)
7275{
7276 unsigned long ip = 0;
dcf46b94 7277
086c9855
AS
7278 if (__this_cpu_read(current_vcpu))
7279 ip = kvm_rip_read(__this_cpu_read(current_vcpu));
dcf46b94 7280
ff9d07a0
ZY
7281 return ip;
7282}
7283
8479e04e
LK
7284static void kvm_handle_intel_pt_intr(void)
7285{
7286 struct kvm_vcpu *vcpu = __this_cpu_read(current_vcpu);
7287
7288 kvm_make_request(KVM_REQ_PMI, vcpu);
7289 __set_bit(MSR_CORE_PERF_GLOBAL_OVF_CTRL_TRACE_TOPA_PMI_BIT,
7290 (unsigned long *)&vcpu->arch.pmu.global_status);
7291}
7292
ff9d07a0
ZY
7293static struct perf_guest_info_callbacks kvm_guest_cbs = {
7294 .is_in_guest = kvm_is_in_guest,
7295 .is_user_mode = kvm_is_user_mode,
7296 .get_guest_ip = kvm_get_guest_ip,
8479e04e 7297 .handle_intel_pt_intr = kvm_handle_intel_pt_intr,
ff9d07a0
ZY
7298};
7299
16e8d74d
MT
7300#ifdef CONFIG_X86_64
7301static void pvclock_gtod_update_fn(struct work_struct *work)
7302{
d828199e
MT
7303 struct kvm *kvm;
7304
7305 struct kvm_vcpu *vcpu;
7306 int i;
7307
0d9ce162 7308 mutex_lock(&kvm_lock);
d828199e
MT
7309 list_for_each_entry(kvm, &vm_list, vm_list)
7310 kvm_for_each_vcpu(i, vcpu, kvm)
105b21bb 7311 kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
d828199e 7312 atomic_set(&kvm_guest_has_master_clock, 0);
0d9ce162 7313 mutex_unlock(&kvm_lock);
16e8d74d
MT
7314}
7315
7316static DECLARE_WORK(pvclock_gtod_work, pvclock_gtod_update_fn);
7317
7318/*
7319 * Notification about pvclock gtod data update.
7320 */
7321static int pvclock_gtod_notify(struct notifier_block *nb, unsigned long unused,
7322 void *priv)
7323{
7324 struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
7325 struct timekeeper *tk = priv;
7326
7327 update_pvclock_gtod(tk);
7328
7329 /* disable master clock if host does not trust, or does not
b0c39dc6 7330 * use, TSC based clocksource.
16e8d74d 7331 */
b0c39dc6 7332 if (!gtod_is_based_on_tsc(gtod->clock.vclock_mode) &&
16e8d74d
MT
7333 atomic_read(&kvm_guest_has_master_clock) != 0)
7334 queue_work(system_long_wq, &pvclock_gtod_work);
7335
7336 return 0;
7337}
7338
7339static struct notifier_block pvclock_gtod_notifier = {
7340 .notifier_call = pvclock_gtod_notify,
7341};
7342#endif
7343
f8c16bba 7344int kvm_arch_init(void *opaque)
043405e1 7345{
d008dfdb 7346 struct kvm_x86_init_ops *ops = opaque;
b820cc0c 7347 int r;
f8c16bba 7348
afaf0b2f 7349 if (kvm_x86_ops.hardware_enable) {
f8c16bba 7350 printk(KERN_ERR "kvm: already loaded the other module\n");
56c6d28a
ZX
7351 r = -EEXIST;
7352 goto out;
f8c16bba
ZX
7353 }
7354
7355 if (!ops->cpu_has_kvm_support()) {
ef935c25 7356 pr_err_ratelimited("kvm: no hardware support\n");
56c6d28a
ZX
7357 r = -EOPNOTSUPP;
7358 goto out;
f8c16bba
ZX
7359 }
7360 if (ops->disabled_by_bios()) {
ef935c25 7361 pr_err_ratelimited("kvm: disabled by bios\n");
56c6d28a
ZX
7362 r = -EOPNOTSUPP;
7363 goto out;
f8c16bba
ZX
7364 }
7365
b666a4b6
MO
7366 /*
7367 * KVM explicitly assumes that the guest has an FPU and
7368 * FXSAVE/FXRSTOR. For example, the KVM_GET_FPU explicitly casts the
7369 * vCPU's FPU state as a fxregs_state struct.
7370 */
7371 if (!boot_cpu_has(X86_FEATURE_FPU) || !boot_cpu_has(X86_FEATURE_FXSR)) {
7372 printk(KERN_ERR "kvm: inadequate fpu\n");
7373 r = -EOPNOTSUPP;
7374 goto out;
7375 }
7376
013f6a5d 7377 r = -ENOMEM;
ed8e4812 7378 x86_fpu_cache = kmem_cache_create("x86_fpu", sizeof(struct fpu),
b666a4b6
MO
7379 __alignof__(struct fpu), SLAB_ACCOUNT,
7380 NULL);
7381 if (!x86_fpu_cache) {
7382 printk(KERN_ERR "kvm: failed to allocate cache for x86 fpu\n");
7383 goto out;
7384 }
7385
c9b8b07c
SC
7386 x86_emulator_cache = kvm_alloc_emulator_cache();
7387 if (!x86_emulator_cache) {
7388 pr_err("kvm: failed to allocate cache for x86 emulator\n");
7389 goto out_free_x86_fpu_cache;
7390 }
7391
013f6a5d
MT
7392 shared_msrs = alloc_percpu(struct kvm_shared_msrs);
7393 if (!shared_msrs) {
7394 printk(KERN_ERR "kvm: failed to allocate percpu kvm_shared_msrs\n");
c9b8b07c 7395 goto out_free_x86_emulator_cache;
013f6a5d
MT
7396 }
7397
97db56ce
AK
7398 r = kvm_mmu_module_init();
7399 if (r)
013f6a5d 7400 goto out_free_percpu;
97db56ce 7401
7b52345e 7402 kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK,
ffb128c8 7403 PT_DIRTY_MASK, PT64_NX_MASK, 0,
d0ec49d4 7404 PT_PRESENT_MASK, 0, sme_me_mask);
b820cc0c 7405 kvm_timer_init();
c8076604 7406
ff9d07a0
ZY
7407 perf_register_guest_info_callbacks(&kvm_guest_cbs);
7408
cfc48181 7409 if (boot_cpu_has(X86_FEATURE_XSAVE)) {
2acf923e 7410 host_xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
cfc48181
SC
7411 supported_xcr0 = host_xcr0 & KVM_SUPPORTED_XCR0;
7412 }
2acf923e 7413
c5cc421b 7414 kvm_lapic_init();
0c5f81da
WL
7415 if (pi_inject_timer == -1)
7416 pi_inject_timer = housekeeping_enabled(HK_FLAG_TIMER);
16e8d74d
MT
7417#ifdef CONFIG_X86_64
7418 pvclock_gtod_register_notifier(&pvclock_gtod_notifier);
0092e434 7419
5fa4ec9c 7420 if (hypervisor_is_type(X86_HYPER_MS_HYPERV))
0092e434 7421 set_hv_tscchange_cb(kvm_hyperv_tsc_notifier);
16e8d74d
MT
7422#endif
7423
f8c16bba 7424 return 0;
56c6d28a 7425
013f6a5d
MT
7426out_free_percpu:
7427 free_percpu(shared_msrs);
c9b8b07c
SC
7428out_free_x86_emulator_cache:
7429 kmem_cache_destroy(x86_emulator_cache);
b666a4b6
MO
7430out_free_x86_fpu_cache:
7431 kmem_cache_destroy(x86_fpu_cache);
56c6d28a 7432out:
56c6d28a 7433 return r;
043405e1 7434}
8776e519 7435
f8c16bba
ZX
7436void kvm_arch_exit(void)
7437{
0092e434 7438#ifdef CONFIG_X86_64
5fa4ec9c 7439 if (hypervisor_is_type(X86_HYPER_MS_HYPERV))
0092e434
VK
7440 clear_hv_tscchange_cb();
7441#endif
cef84c30 7442 kvm_lapic_exit();
ff9d07a0
ZY
7443 perf_unregister_guest_info_callbacks(&kvm_guest_cbs);
7444
888d256e
JK
7445 if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
7446 cpufreq_unregister_notifier(&kvmclock_cpufreq_notifier_block,
7447 CPUFREQ_TRANSITION_NOTIFIER);
251a5fd6 7448 cpuhp_remove_state_nocalls(CPUHP_AP_X86_KVM_CLK_ONLINE);
16e8d74d
MT
7449#ifdef CONFIG_X86_64
7450 pvclock_gtod_unregister_notifier(&pvclock_gtod_notifier);
7451#endif
afaf0b2f 7452 kvm_x86_ops.hardware_enable = NULL;
56c6d28a 7453 kvm_mmu_module_exit();
013f6a5d 7454 free_percpu(shared_msrs);
b666a4b6 7455 kmem_cache_destroy(x86_fpu_cache);
56c6d28a 7456}
f8c16bba 7457
5cb56059 7458int kvm_vcpu_halt(struct kvm_vcpu *vcpu)
8776e519
HB
7459{
7460 ++vcpu->stat.halt_exits;
35754c98 7461 if (lapic_in_kernel(vcpu)) {
a4535290 7462 vcpu->arch.mp_state = KVM_MP_STATE_HALTED;
8776e519
HB
7463 return 1;
7464 } else {
7465 vcpu->run->exit_reason = KVM_EXIT_HLT;
7466 return 0;
7467 }
7468}
5cb56059
JS
7469EXPORT_SYMBOL_GPL(kvm_vcpu_halt);
7470
7471int kvm_emulate_halt(struct kvm_vcpu *vcpu)
7472{
6affcbed
KH
7473 int ret = kvm_skip_emulated_instruction(vcpu);
7474 /*
7475 * TODO: we might be squashing a GUESTDBG_SINGLESTEP-triggered
7476 * KVM_EXIT_DEBUG here.
7477 */
7478 return kvm_vcpu_halt(vcpu) && ret;
5cb56059 7479}
8776e519
HB
7480EXPORT_SYMBOL_GPL(kvm_emulate_halt);
7481
8ef81a9a 7482#ifdef CONFIG_X86_64
55dd00a7
MT
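/*
 * Descriptive comment added for clarity: KVM_HC_CLOCK_PAIRING hypercall.
 * Write a (host wall clock, guest TSC) pairing for the current moment to
 * guest memory at @paddr so the guest can correlate its TSC with host
 * time; only KVM_CLOCK_PAIRING_WALLCLOCK is supported.
 */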
7483static int kvm_pv_clock_pairing(struct kvm_vcpu *vcpu, gpa_t paddr,
7484 unsigned long clock_type)
7485{
7486 struct kvm_clock_pairing clock_pairing;
899a31f5 7487 struct timespec64 ts;
80fbd89c 7488 u64 cycle;
55dd00a7
MT
7489 int ret;
7490
7491 if (clock_type != KVM_CLOCK_PAIRING_WALLCLOCK)
7492 return -KVM_EOPNOTSUPP;
7493
7494 if (kvm_get_walltime_and_clockread(&ts, &cycle) == false)
7495 return -KVM_EOPNOTSUPP;
7496
7497 clock_pairing.sec = ts.tv_sec;
7498 clock_pairing.nsec = ts.tv_nsec;
7499 clock_pairing.tsc = kvm_read_l1_tsc(vcpu, cycle);
7500 clock_pairing.flags = 0;
bcbfbd8e 7501 memset(&clock_pairing.pad, 0, sizeof(clock_pairing.pad));
55dd00a7
MT
7502
7503 ret = 0;
7504 if (kvm_write_guest(vcpu->kvm, paddr, &clock_pairing,
7505 sizeof(struct kvm_clock_pairing)))
7506 ret = -KVM_EFAULT;
7507
7508 return ret;
7509}
8ef81a9a 7510#endif
55dd00a7 7511
6aef266c
SV
7512/*
7513 * kvm_pv_kick_cpu_op: Kick a vcpu.
7514 *
7515 * @apicid - apicid of vcpu to be kicked.
7516 */
7517static void kvm_pv_kick_cpu_op(struct kvm *kvm, unsigned long flags, int apicid)
7518{
24d2166b 7519 struct kvm_lapic_irq lapic_irq;
6aef266c 7520
150a84fe 7521 lapic_irq.shorthand = APIC_DEST_NOSHORT;
c96001c5 7522 lapic_irq.dest_mode = APIC_DEST_PHYSICAL;
ebd28fcb 7523 lapic_irq.level = 0;
24d2166b 7524 lapic_irq.dest_id = apicid;
93bbf0b8 7525 lapic_irq.msi_redir_hint = false;
6aef266c 7526
24d2166b 7527 lapic_irq.delivery_mode = APIC_DM_REMRD;
795a149e 7528 kvm_irq_delivery_to_apic(kvm, NULL, &lapic_irq, NULL);
6aef266c
SV
7529}
7530
4e19c36f
SS
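/* Comment added for clarity: APICv is active only while no inhibit reasons are set for the VM. */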
7531bool kvm_apicv_activated(struct kvm *kvm)
7532{
7533 return (READ_ONCE(kvm->arch.apicv_inhibit_reasons) == 0);
7534}
7535EXPORT_SYMBOL_GPL(kvm_apicv_activated);
7536
7537void kvm_apicv_init(struct kvm *kvm, bool enable)
7538{
7539 if (enable)
7540 clear_bit(APICV_INHIBIT_REASON_DISABLE,
7541 &kvm->arch.apicv_inhibit_reasons);
7542 else
7543 set_bit(APICV_INHIBIT_REASON_DISABLE,
7544 &kvm->arch.apicv_inhibit_reasons);
7545}
7546EXPORT_SYMBOL_GPL(kvm_apicv_init);
7547
71506297
WL
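/*
 * Descriptive comment added for clarity: yield the current vCPU's
 * timeslice to the vCPU with APIC id @dest_id, if that vCPU exists in the
 * APIC map and is ready to run.  Used by the paravirtual kick and
 * sched-yield hypercalls.
 */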
7548static void kvm_sched_yield(struct kvm *kvm, unsigned long dest_id)
7549{
7550 struct kvm_vcpu *target = NULL;
7551 struct kvm_apic_map *map;
7552
7553 rcu_read_lock();
7554 map = rcu_dereference(kvm->arch.apic_map);
7555
7556 if (likely(map) && dest_id <= map->max_apic_id && map->phys_map[dest_id])
7557 target = map->phys_map[dest_id]->vcpu;
7558
7559 rcu_read_unlock();
7560
266e85a5 7561 if (target && READ_ONCE(target->ready))
71506297
WL
7562 kvm_vcpu_yield_to(target);
7563}
7564
8776e519
HB
7565int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
7566{
7567 unsigned long nr, a0, a1, a2, a3, ret;
6356ee0c 7568 int op_64_bit;
8776e519 7569
696ca779
RK
7570 if (kvm_hv_hypercall_enabled(vcpu->kvm))
7571 return kvm_hv_hypercall(vcpu);
55cd8e5a 7572
de3cd117
SC
7573 nr = kvm_rax_read(vcpu);
7574 a0 = kvm_rbx_read(vcpu);
7575 a1 = kvm_rcx_read(vcpu);
7576 a2 = kvm_rdx_read(vcpu);
7577 a3 = kvm_rsi_read(vcpu);
8776e519 7578
229456fc 7579 trace_kvm_hypercall(nr, a0, a1, a2, a3);
2714d1d3 7580
a449c7aa
NA
7581 op_64_bit = is_64_bit_mode(vcpu);
7582 if (!op_64_bit) {
8776e519
HB
7583 nr &= 0xFFFFFFFF;
7584 a0 &= 0xFFFFFFFF;
7585 a1 &= 0xFFFFFFFF;
7586 a2 &= 0xFFFFFFFF;
7587 a3 &= 0xFFFFFFFF;
7588 }
7589
afaf0b2f 7590 if (kvm_x86_ops.get_cpl(vcpu) != 0) {
07708c4a 7591 ret = -KVM_EPERM;
696ca779 7592 goto out;
07708c4a
JK
7593 }
7594
8776e519 7595 switch (nr) {
b93463aa
AK
7596 case KVM_HC_VAPIC_POLL_IRQ:
7597 ret = 0;
7598 break;
6aef266c
SV
7599 case KVM_HC_KICK_CPU:
7600 kvm_pv_kick_cpu_op(vcpu->kvm, a0, a1);
266e85a5 7601 kvm_sched_yield(vcpu->kvm, a1);
6aef266c
SV
7602 ret = 0;
7603 break;
8ef81a9a 7604#ifdef CONFIG_X86_64
55dd00a7
MT
7605 case KVM_HC_CLOCK_PAIRING:
7606 ret = kvm_pv_clock_pairing(vcpu, a0, a1);
7607 break;
1ed199a4 7608#endif
4180bf1b
WL
7609 case KVM_HC_SEND_IPI:
7610 ret = kvm_pv_send_ipi(vcpu->kvm, a0, a1, a2, a3, op_64_bit);
7611 break;
71506297
WL
7612 case KVM_HC_SCHED_YIELD:
7613 kvm_sched_yield(vcpu->kvm, a0);
7614 ret = 0;
7615 break;
8776e519
HB
7616 default:
7617 ret = -KVM_ENOSYS;
7618 break;
7619 }
696ca779 7620out:
a449c7aa
NA
7621 if (!op_64_bit)
7622 ret = (u32)ret;
de3cd117 7623 kvm_rax_write(vcpu, ret);
6356ee0c 7624
f11c3a8d 7625 ++vcpu->stat.hypercalls;
6356ee0c 7626 return kvm_skip_emulated_instruction(vcpu);
8776e519
HB
7627}
7628EXPORT_SYMBOL_GPL(kvm_emulate_hypercall);
7629
b6785def 7630static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt)
8776e519 7631{
d6aa1000 7632 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
8776e519 7633 char instruction[3];
5fdbf976 7634 unsigned long rip = kvm_rip_read(vcpu);
8776e519 7635
afaf0b2f 7636 kvm_x86_ops.patch_hypercall(vcpu, instruction);
8776e519 7637
ce2e852e
DV
7638 return emulator_write_emulated(ctxt, rip, instruction, 3,
7639 &ctxt->exception);
8776e519
HB
7640}
7641
851ba692 7642static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu)
b6c7a5dc 7643{
782d422b
MG
7644 return vcpu->run->request_interrupt_window &&
7645 likely(!pic_in_kernel(vcpu->kvm));
b6c7a5dc
HB
7646}
7647
851ba692 7648static void post_kvm_run_save(struct kvm_vcpu *vcpu)
b6c7a5dc 7649{
851ba692
AK
7650 struct kvm_run *kvm_run = vcpu->run;
7651
91586a3b 7652 kvm_run->if_flag = (kvm_get_rflags(vcpu) & X86_EFLAGS_IF) != 0;
f077825a 7653 kvm_run->flags = is_smm(vcpu) ? KVM_RUN_X86_SMM : 0;
2d3ad1f4 7654 kvm_run->cr8 = kvm_get_cr8(vcpu);
b6c7a5dc 7655 kvm_run->apic_base = kvm_get_apic_base(vcpu);
127a457a
MG
7656 kvm_run->ready_for_interrupt_injection =
7657 pic_in_kernel(vcpu->kvm) ||
782d422b 7658 kvm_vcpu_ready_for_interrupt_injection(vcpu);
b6c7a5dc
HB
7659}
7660
95ba8273
GN
7661static void update_cr8_intercept(struct kvm_vcpu *vcpu)
7662{
7663 int max_irr, tpr;
7664
afaf0b2f 7665 if (!kvm_x86_ops.update_cr8_intercept)
95ba8273
GN
7666 return;
7667
bce87cce 7668 if (!lapic_in_kernel(vcpu))
88c808fd
AK
7669 return;
7670
d62caabb
AS
7671 if (vcpu->arch.apicv_active)
7672 return;
7673
8db3baa2
GN
7674 if (!vcpu->arch.apic->vapic_addr)
7675 max_irr = kvm_lapic_find_highest_irr(vcpu);
7676 else
7677 max_irr = -1;
95ba8273
GN
7678
7679 if (max_irr != -1)
7680 max_irr >>= 4;
7681
7682 tpr = kvm_lapic_get_cr8(vcpu);
7683
afaf0b2f 7684 kvm_x86_ops.update_cr8_intercept(vcpu, tpr, max_irr);
95ba8273
GN
7685}
7686
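/*
 * Descriptive comment added for clarity: inject or re-inject pending
 * exceptions, NMIs, SMIs and interrupts into the guest, respecting their
 * architectural priorities, and give nested (L1) events a chance to force
 * a VM-exit from L2 first.  A non-zero return from the nested event check
 * is propagated to the caller.
 */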
a1c77abb 7687static int inject_pending_event(struct kvm_vcpu *vcpu)
95ba8273 7688{
b6b8a145
JK
7689 int r;
7690
95ba8273 7691 /* try to reinject previous events if any */
664f8e26 7692
1a680e35 7693 if (vcpu->arch.exception.injected)
afaf0b2f 7694 kvm_x86_ops.queue_exception(vcpu);
664f8e26 7695 /*
a042c26f
LA
7696 * Do not inject an NMI or interrupt if there is a pending
7697 * exception. Exceptions and interrupts are recognized at
7698 * instruction boundaries, i.e. the start of an instruction.
7699 * Trap-like exceptions, e.g. #DB, have higher priority than
7700 * NMIs and interrupts, i.e. traps are recognized before an
7701 * NMI/interrupt that's pending on the same instruction.
7702 * Fault-like exceptions, e.g. #GP and #PF, are the lowest
7703 * priority, but are only generated (pended) during instruction
7704 * execution, i.e. a pending fault-like exception means the
7705 * fault occurred on the *previous* instruction and must be
7706 * serviced prior to recognizing any new events in order to
7707 * fully complete the previous instruction.
664f8e26 7708 */
1a680e35
LA
7709 else if (!vcpu->arch.exception.pending) {
7710 if (vcpu->arch.nmi_injected)
afaf0b2f 7711 kvm_x86_ops.set_nmi(vcpu);
1a680e35 7712 else if (vcpu->arch.interrupt.injected)
afaf0b2f 7713 kvm_x86_ops.set_irq(vcpu);
664f8e26
WL
7714 }
7715
3b82b8d7
SC
7716 WARN_ON_ONCE(vcpu->arch.exception.injected &&
7717 vcpu->arch.exception.pending);
7718
1a680e35
LA
7719 /*
7720 * Call check_nested_events() even if we reinjected a previous event
7721 * in order for caller to determine if it should require immediate-exit
7722 * from L2 to L1 due to pending L1 events which require exit
7723 * from L2 to L1.
7724 */
56083bdf 7725 if (is_guest_mode(vcpu)) {
33b22172 7726 r = kvm_x86_ops.nested_ops->check_events(vcpu);
664f8e26
WL
7727 if (r != 0)
7728 return r;
7729 }
7730
7731 /* try to inject new event if pending */
b59bb7bd 7732 if (vcpu->arch.exception.pending) {
5c1c85d0
AK
7733 trace_kvm_inj_exception(vcpu->arch.exception.nr,
7734 vcpu->arch.exception.has_error_code,
7735 vcpu->arch.exception.error_code);
d6e8c854 7736
664f8e26
WL
7737 vcpu->arch.exception.pending = false;
7738 vcpu->arch.exception.injected = true;
7739
d6e8c854
NA
7740 if (exception_type(vcpu->arch.exception.nr) == EXCPT_FAULT)
7741 __kvm_set_rflags(vcpu, kvm_get_rflags(vcpu) |
7742 X86_EFLAGS_RF);
7743
f10c729f
JM
7744 if (vcpu->arch.exception.nr == DB_VECTOR) {
7745 /*
7746 * This code assumes that nSVM doesn't use
7747 * check_nested_events(). If it does, the
7748 * DR6/DR7 changes should happen before L1
7749 * gets a #VMEXIT for an intercepted #DB in
7750 * L2. (Under VMX, on the other hand, the
7751 * DR6/DR7 changes should not happen in the
7752 * event of a VM-exit to L1 for an intercepted
7753 * #DB in L2.)
7754 */
7755 kvm_deliver_exception_payload(vcpu);
7756 if (vcpu->arch.dr7 & DR7_GD) {
7757 vcpu->arch.dr7 &= ~DR7_GD;
7758 kvm_update_dr7(vcpu);
7759 }
6bdf0662
NA
7760 }
7761
afaf0b2f 7762 kvm_x86_ops.queue_exception(vcpu);
1a680e35
LA
7763 }
7764
7765 /* Don't consider new event if we re-injected an event */
7766 if (kvm_event_needs_reinjection(vcpu))
7767 return 0;
7768
a9fa7cb6 7769 if (vcpu->arch.smi_pending && kvm_x86_ops.smi_allowed(vcpu)) {
c43203ca 7770 vcpu->arch.smi_pending = false;
52797bf9 7771 ++vcpu->arch.smi_count;
ee2cd4b7 7772 enter_smm(vcpu);
afaf0b2f 7773 } else if (vcpu->arch.nmi_pending && kvm_x86_ops.nmi_allowed(vcpu)) {
321c5658
YS
7774 --vcpu->arch.nmi_pending;
7775 vcpu->arch.nmi_injected = true;
afaf0b2f 7776 kvm_x86_ops.set_nmi(vcpu);
c7c9c56c 7777 } else if (kvm_cpu_has_injectable_intr(vcpu)) {
9242b5b6
BD
7778 /*
7779 * Because interrupts can be injected asynchronously, we are
7780 * calling check_nested_events again here to avoid a race condition.
7781 * See https://lkml.org/lkml/2014/7/2/60 for discussion about this
7782 * proposal and current concerns. Perhaps we should be setting
7783 * KVM_REQ_EVENT only on certain events and not unconditionally?
7784 */
56083bdf 7785 if (is_guest_mode(vcpu)) {
33b22172 7786 r = kvm_x86_ops.nested_ops->check_events(vcpu);
9242b5b6
BD
7787 if (r != 0)
7788 return r;
7789 }
afaf0b2f 7790 if (kvm_x86_ops.interrupt_allowed(vcpu)) {
66fd3f7f
GN
7791 kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu),
7792 false);
afaf0b2f 7793 kvm_x86_ops.set_irq(vcpu);
95ba8273
GN
7794 }
7795 }
ee2cd4b7 7796
b6b8a145 7797 return 0;
95ba8273
GN
7798}
7799
7460fb4a
AK
7800static void process_nmi(struct kvm_vcpu *vcpu)
7801{
7802 unsigned limit = 2;
7803
7804 /*
7805 * x86 is limited to one NMI running, and one NMI pending after it.
7806 * If an NMI is already in progress, limit further NMIs to just one.
7807 * Otherwise, allow two (and we'll inject the first one immediately).
7808 */
afaf0b2f 7809 if (kvm_x86_ops.get_nmi_mask(vcpu) || vcpu->arch.nmi_injected)
7460fb4a
AK
7810 limit = 1;
7811
7812 vcpu->arch.nmi_pending += atomic_xchg(&vcpu->arch.nmi_queued, 0);
7813 vcpu->arch.nmi_pending = min(vcpu->arch.nmi_pending, limit);
7814 kvm_make_request(KVM_REQ_EVENT, vcpu);
7815}
7816
ee2cd4b7 7817static u32 enter_smm_get_segment_flags(struct kvm_segment *seg)
660a5d51
PB
7818{
7819 u32 flags = 0;
7820 flags |= seg->g << 23;
7821 flags |= seg->db << 22;
7822 flags |= seg->l << 21;
7823 flags |= seg->avl << 20;
7824 flags |= seg->present << 15;
7825 flags |= seg->dpl << 13;
7826 flags |= seg->s << 12;
7827 flags |= seg->type << 8;
7828 return flags;
7829}
7830
ee2cd4b7 7831static void enter_smm_save_seg_32(struct kvm_vcpu *vcpu, char *buf, int n)
660a5d51
PB
7832{
7833 struct kvm_segment seg;
7834 int offset;
7835
7836 kvm_get_segment(vcpu, &seg, n);
7837 put_smstate(u32, buf, 0x7fa8 + n * 4, seg.selector);
7838
7839 if (n < 3)
7840 offset = 0x7f84 + n * 12;
7841 else
7842 offset = 0x7f2c + (n - 3) * 12;
7843
7844 put_smstate(u32, buf, offset + 8, seg.base);
7845 put_smstate(u32, buf, offset + 4, seg.limit);
ee2cd4b7 7846 put_smstate(u32, buf, offset, enter_smm_get_segment_flags(&seg));
660a5d51
PB
7847}
7848
efbb288a 7849#ifdef CONFIG_X86_64
ee2cd4b7 7850static void enter_smm_save_seg_64(struct kvm_vcpu *vcpu, char *buf, int n)
660a5d51
PB
7851{
7852 struct kvm_segment seg;
7853 int offset;
7854 u16 flags;
7855
7856 kvm_get_segment(vcpu, &seg, n);
7857 offset = 0x7e00 + n * 16;
7858
ee2cd4b7 7859 flags = enter_smm_get_segment_flags(&seg) >> 8;
660a5d51
PB
7860 put_smstate(u16, buf, offset, seg.selector);
7861 put_smstate(u16, buf, offset + 2, flags);
7862 put_smstate(u32, buf, offset + 4, seg.limit);
7863 put_smstate(u64, buf, offset + 8, seg.base);
7864}
efbb288a 7865#endif
660a5d51 7866
ee2cd4b7 7867static void enter_smm_save_state_32(struct kvm_vcpu *vcpu, char *buf)
660a5d51
PB
7868{
7869 struct desc_ptr dt;
7870 struct kvm_segment seg;
7871 unsigned long val;
7872 int i;
7873
7874 put_smstate(u32, buf, 0x7ffc, kvm_read_cr0(vcpu));
7875 put_smstate(u32, buf, 0x7ff8, kvm_read_cr3(vcpu));
7876 put_smstate(u32, buf, 0x7ff4, kvm_get_rflags(vcpu));
7877 put_smstate(u32, buf, 0x7ff0, kvm_rip_read(vcpu));
7878
7879 for (i = 0; i < 8; i++)
7880 put_smstate(u32, buf, 0x7fd0 + i * 4, kvm_register_read(vcpu, i));
7881
7882 kvm_get_dr(vcpu, 6, &val);
7883 put_smstate(u32, buf, 0x7fcc, (u32)val);
7884 kvm_get_dr(vcpu, 7, &val);
7885 put_smstate(u32, buf, 0x7fc8, (u32)val);
7886
7887 kvm_get_segment(vcpu, &seg, VCPU_SREG_TR);
7888 put_smstate(u32, buf, 0x7fc4, seg.selector);
7889 put_smstate(u32, buf, 0x7f64, seg.base);
7890 put_smstate(u32, buf, 0x7f60, seg.limit);
ee2cd4b7 7891 put_smstate(u32, buf, 0x7f5c, enter_smm_get_segment_flags(&seg));
660a5d51
PB
7892
7893 kvm_get_segment(vcpu, &seg, VCPU_SREG_LDTR);
7894 put_smstate(u32, buf, 0x7fc0, seg.selector);
7895 put_smstate(u32, buf, 0x7f80, seg.base);
7896 put_smstate(u32, buf, 0x7f7c, seg.limit);
ee2cd4b7 7897 put_smstate(u32, buf, 0x7f78, enter_smm_get_segment_flags(&seg));
660a5d51 7898
afaf0b2f 7899 kvm_x86_ops.get_gdt(vcpu, &dt);
660a5d51
PB
7900 put_smstate(u32, buf, 0x7f74, dt.address);
7901 put_smstate(u32, buf, 0x7f70, dt.size);
7902
afaf0b2f 7903 kvm_x86_ops.get_idt(vcpu, &dt);
660a5d51
PB
7904 put_smstate(u32, buf, 0x7f58, dt.address);
7905 put_smstate(u32, buf, 0x7f54, dt.size);
7906
7907 for (i = 0; i < 6; i++)
ee2cd4b7 7908 enter_smm_save_seg_32(vcpu, buf, i);
660a5d51
PB
7909
7910 put_smstate(u32, buf, 0x7f14, kvm_read_cr4(vcpu));
7911
7912 /* revision id */
7913 put_smstate(u32, buf, 0x7efc, 0x00020000);
7914 put_smstate(u32, buf, 0x7ef8, vcpu->arch.smbase);
7915}
7916
b68f3cc7 7917#ifdef CONFIG_X86_64
ee2cd4b7 7918static void enter_smm_save_state_64(struct kvm_vcpu *vcpu, char *buf)
660a5d51 7919{
660a5d51
PB
7920 struct desc_ptr dt;
7921 struct kvm_segment seg;
7922 unsigned long val;
7923 int i;
7924
7925 for (i = 0; i < 16; i++)
7926 put_smstate(u64, buf, 0x7ff8 - i * 8, kvm_register_read(vcpu, i));
7927
7928 put_smstate(u64, buf, 0x7f78, kvm_rip_read(vcpu));
7929 put_smstate(u32, buf, 0x7f70, kvm_get_rflags(vcpu));
7930
7931 kvm_get_dr(vcpu, 6, &val);
7932 put_smstate(u64, buf, 0x7f68, val);
7933 kvm_get_dr(vcpu, 7, &val);
7934 put_smstate(u64, buf, 0x7f60, val);
7935
7936 put_smstate(u64, buf, 0x7f58, kvm_read_cr0(vcpu));
7937 put_smstate(u64, buf, 0x7f50, kvm_read_cr3(vcpu));
7938 put_smstate(u64, buf, 0x7f48, kvm_read_cr4(vcpu));
7939
7940 put_smstate(u32, buf, 0x7f00, vcpu->arch.smbase);
7941
7942 /* revision id */
7943 put_smstate(u32, buf, 0x7efc, 0x00020064);
7944
7945 put_smstate(u64, buf, 0x7ed0, vcpu->arch.efer);
7946
7947 kvm_get_segment(vcpu, &seg, VCPU_SREG_TR);
7948 put_smstate(u16, buf, 0x7e90, seg.selector);
ee2cd4b7 7949 put_smstate(u16, buf, 0x7e92, enter_smm_get_segment_flags(&seg) >> 8);
660a5d51
PB
7950 put_smstate(u32, buf, 0x7e94, seg.limit);
7951 put_smstate(u64, buf, 0x7e98, seg.base);
7952
afaf0b2f 7953 kvm_x86_ops.get_idt(vcpu, &dt);
660a5d51
PB
7954 put_smstate(u32, buf, 0x7e84, dt.size);
7955 put_smstate(u64, buf, 0x7e88, dt.address);
7956
7957 kvm_get_segment(vcpu, &seg, VCPU_SREG_LDTR);
7958 put_smstate(u16, buf, 0x7e70, seg.selector);
ee2cd4b7 7959 put_smstate(u16, buf, 0x7e72, enter_smm_get_segment_flags(&seg) >> 8);
660a5d51
PB
7960 put_smstate(u32, buf, 0x7e74, seg.limit);
7961 put_smstate(u64, buf, 0x7e78, seg.base);
7962
afaf0b2f 7963 kvm_x86_ops.get_gdt(vcpu, &dt);
660a5d51
PB
7964 put_smstate(u32, buf, 0x7e64, dt.size);
7965 put_smstate(u64, buf, 0x7e68, dt.address);
7966
7967 for (i = 0; i < 6; i++)
ee2cd4b7 7968 enter_smm_save_seg_64(vcpu, buf, i);
660a5d51 7969}
b68f3cc7 7970#endif
660a5d51 7971
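/*
 * Descriptive comment added for clarity: emulate SMM entry.  Save the
 * current register state into the SMM state-save area at SMBASE, then
 * switch the vCPU to the SMM execution environment: real-mode-like
 * segments, RIP = 0x8000, and paging/protection disabled.
 */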
ee2cd4b7 7972static void enter_smm(struct kvm_vcpu *vcpu)
64d60670 7973{
660a5d51 7974 struct kvm_segment cs, ds;
18c3626e 7975 struct desc_ptr dt;
660a5d51
PB
7976 char buf[512];
7977 u32 cr0;
7978
660a5d51 7979 trace_kvm_enter_smm(vcpu->vcpu_id, vcpu->arch.smbase, true);
660a5d51 7980 memset(buf, 0, 512);
b68f3cc7 7981#ifdef CONFIG_X86_64
d6321d49 7982 if (guest_cpuid_has(vcpu, X86_FEATURE_LM))
ee2cd4b7 7983 enter_smm_save_state_64(vcpu, buf);
660a5d51 7984 else
b68f3cc7 7985#endif
ee2cd4b7 7986 enter_smm_save_state_32(vcpu, buf);
660a5d51 7987
0234bf88
LP
7988 /*
7989 * Give pre_enter_smm() a chance to make ISA-specific changes to the
7990 * vCPU state (e.g. leave guest mode) after we've saved the state into
7991 * the SMM state-save area.
7992 */
afaf0b2f 7993 kvm_x86_ops.pre_enter_smm(vcpu, buf);
0234bf88
LP
7994
7995 vcpu->arch.hflags |= HF_SMM_MASK;
54bf36aa 7996 kvm_vcpu_write_guest(vcpu, vcpu->arch.smbase + 0xfe00, buf, sizeof(buf));
660a5d51 7997
afaf0b2f 7998 if (kvm_x86_ops.get_nmi_mask(vcpu))
660a5d51
PB
7999 vcpu->arch.hflags |= HF_SMM_INSIDE_NMI_MASK;
8000 else
afaf0b2f 8001 kvm_x86_ops.set_nmi_mask(vcpu, true);
660a5d51
PB
8002
8003 kvm_set_rflags(vcpu, X86_EFLAGS_FIXED);
8004 kvm_rip_write(vcpu, 0x8000);
8005
8006 cr0 = vcpu->arch.cr0 & ~(X86_CR0_PE | X86_CR0_EM | X86_CR0_TS | X86_CR0_PG);
afaf0b2f 8007 kvm_x86_ops.set_cr0(vcpu, cr0);
660a5d51
PB
8008 vcpu->arch.cr0 = cr0;
8009
afaf0b2f 8010 kvm_x86_ops.set_cr4(vcpu, 0);
660a5d51 8011
18c3626e
PB
8012 /* Undocumented: IDT limit is set to zero on entry to SMM. */
8013 dt.address = dt.size = 0;
afaf0b2f 8014 kvm_x86_ops.set_idt(vcpu, &dt);
18c3626e 8015
660a5d51
PB
8016 __kvm_set_dr(vcpu, 7, DR7_FIXED_1);
8017
8018 cs.selector = (vcpu->arch.smbase >> 4) & 0xffff;
8019 cs.base = vcpu->arch.smbase;
8020
8021 ds.selector = 0;
8022 ds.base = 0;
8023
8024 cs.limit = ds.limit = 0xffffffff;
8025 cs.type = ds.type = 0x3;
8026 cs.dpl = ds.dpl = 0;
8027 cs.db = ds.db = 0;
8028 cs.s = ds.s = 1;
8029 cs.l = ds.l = 0;
8030 cs.g = ds.g = 1;
8031 cs.avl = ds.avl = 0;
8032 cs.present = ds.present = 1;
8033 cs.unusable = ds.unusable = 0;
8034 cs.padding = ds.padding = 0;
8035
8036 kvm_set_segment(vcpu, &cs, VCPU_SREG_CS);
8037 kvm_set_segment(vcpu, &ds, VCPU_SREG_DS);
8038 kvm_set_segment(vcpu, &ds, VCPU_SREG_ES);
8039 kvm_set_segment(vcpu, &ds, VCPU_SREG_FS);
8040 kvm_set_segment(vcpu, &ds, VCPU_SREG_GS);
8041 kvm_set_segment(vcpu, &ds, VCPU_SREG_SS);
8042
b68f3cc7 8043#ifdef CONFIG_X86_64
d6321d49 8044 if (guest_cpuid_has(vcpu, X86_FEATURE_LM))
afaf0b2f 8045 kvm_x86_ops.set_efer(vcpu, 0);
b68f3cc7 8046#endif
660a5d51
PB
8047
8048 kvm_update_cpuid(vcpu);
8049 kvm_mmu_reset_context(vcpu);
64d60670
PB
8050}
8051
ee2cd4b7 8052static void process_smi(struct kvm_vcpu *vcpu)
c43203ca
PB
8053{
8054 vcpu->arch.smi_pending = true;
8055 kvm_make_request(KVM_REQ_EVENT, vcpu);
8056}
8057
7ee30bc1
NNL
8058void kvm_make_scan_ioapic_request_mask(struct kvm *kvm,
8059 unsigned long *vcpu_bitmap)
8060{
8061 cpumask_var_t cpus;
7ee30bc1
NNL
8062
8063 zalloc_cpumask_var(&cpus, GFP_ATOMIC);
8064
db5a95ec 8065 kvm_make_vcpus_request_mask(kvm, KVM_REQ_SCAN_IOAPIC,
54163a34 8066 NULL, vcpu_bitmap, cpus);
7ee30bc1
NNL
8067
8068 free_cpumask_var(cpus);
8069}
8070
2860c4b1
PB
8071void kvm_make_scan_ioapic_request(struct kvm *kvm)
8072{
8073 kvm_make_all_cpus_request(kvm, KVM_REQ_SCAN_IOAPIC);
8074}
8075
8df14af4
SS
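/*
 * Descriptive comment added for clarity: recompute whether APIC
 * virtualization is active for this vCPU from the VM-wide inhibit reasons
 * and propagate the result to the local APIC and the vendor code.
 */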
8076void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu)
8077{
8078 if (!lapic_in_kernel(vcpu))
8079 return;
8080
8081 vcpu->arch.apicv_active = kvm_apicv_activated(vcpu->kvm);
8082 kvm_apic_update_apicv(vcpu);
afaf0b2f 8083 kvm_x86_ops.refresh_apicv_exec_ctrl(vcpu);
8df14af4
SS
8084}
8085EXPORT_SYMBOL_GPL(kvm_vcpu_update_apicv);
8086
8087/*
8088 * NOTE: Do not hold any lock prior to calling this.
8089 *
8090 * In particular, kvm_request_apicv_update() expects kvm->srcu not to be
8091 * locked, because it calls __x86_set_memory_region() which does
8092 * synchronize_srcu(&kvm->srcu).
8093 */
8094void kvm_request_apicv_update(struct kvm *kvm, bool activate, ulong bit)
8095{
7d611233 8096 struct kvm_vcpu *except;
8e205a6b
PB
8097 unsigned long old, new, expected;
8098
afaf0b2f
SC
8099 if (!kvm_x86_ops.check_apicv_inhibit_reasons ||
8100 !kvm_x86_ops.check_apicv_inhibit_reasons(bit))
ef8efd7a
SS
8101 return;
8102
8e205a6b
PB
8103 old = READ_ONCE(kvm->arch.apicv_inhibit_reasons);
8104 do {
8105 expected = new = old;
8106 if (activate)
8107 __clear_bit(bit, &new);
8108 else
8109 __set_bit(bit, &new);
8110 if (new == old)
8111 break;
8112 old = cmpxchg(&kvm->arch.apicv_inhibit_reasons, expected, new);
8113 } while (old != expected);
8114
8115 if (!!old == !!new)
8116 return;
8df14af4 8117
24bbf74c 8118 trace_kvm_apicv_update_request(activate, bit);
afaf0b2f
SC
8119 if (kvm_x86_ops.pre_update_apicv_exec_ctrl)
8120 kvm_x86_ops.pre_update_apicv_exec_ctrl(kvm, activate);
7d611233
SS
8121
8122 /*
8123 * Sending request to update APICV for all other vcpus,
8124 * while update the calling vcpu immediately instead of
8125 * waiting for another #VMEXIT to handle the request.
8126 */
8127 except = kvm_get_running_vcpu();
8128 kvm_make_all_cpus_request_except(kvm, KVM_REQ_APICV_UPDATE,
8129 except);
8130 if (except)
8131 kvm_vcpu_update_apicv(except);
8df14af4
SS
8132}
8133EXPORT_SYMBOL_GPL(kvm_request_apicv_update);
8134
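/*
 * Descriptive comment added for clarity: rebuild the set of vectors for
 * which EOI exits are required (IOAPIC-routed interrupts, or userspace
 * routes with a split irqchip) and arrange for the resulting EOI-exit
 * bitmap to be loaded, either now or once the vCPU leaves guest mode.
 */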
3d81bc7e 8135static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu)
c7c9c56c 8136{
dcbd3e49 8137 if (!kvm_apic_present(vcpu))
3d81bc7e 8138 return;
c7c9c56c 8139
6308630b 8140 bitmap_zero(vcpu->arch.ioapic_handled_vectors, 256);
c7c9c56c 8141
b053b2ae 8142 if (irqchip_split(vcpu->kvm))
6308630b 8143 kvm_scan_ioapic_routes(vcpu, vcpu->arch.ioapic_handled_vectors);
db2bdcbb 8144 else {
fa59cc00 8145 if (vcpu->arch.apicv_active)
afaf0b2f 8146 kvm_x86_ops.sync_pir_to_irr(vcpu);
e97f852f
WL
8147 if (ioapic_in_kernel(vcpu->kvm))
8148 kvm_ioapic_scan_entry(vcpu, vcpu->arch.ioapic_handled_vectors);
db2bdcbb 8149 }
e40ff1d6
LA
8150
8151 if (is_guest_mode(vcpu))
8152 vcpu->arch.load_eoi_exitmap_pending = true;
8153 else
8154 kvm_make_request(KVM_REQ_LOAD_EOI_EXITMAP, vcpu);
8155}
8156
8157static void vcpu_load_eoi_exitmap(struct kvm_vcpu *vcpu)
8158{
8159 u64 eoi_exit_bitmap[4];
8160
8161 if (!kvm_apic_hw_enabled(vcpu->arch.apic))
8162 return;
8163
5c919412
AS
8164 bitmap_or((ulong *)eoi_exit_bitmap, vcpu->arch.ioapic_handled_vectors,
8165 vcpu_to_synic(vcpu)->vec_bitmap, 256);
afaf0b2f 8166 kvm_x86_ops.load_eoi_exitmap(vcpu, eoi_exit_bitmap);
c7c9c56c
YZ
8167}
8168
93065ac7
MH
8169int kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
8170 unsigned long start, unsigned long end,
8171 bool blockable)
b1394e74
RK
8172{
8173 unsigned long apic_address;
8174
8175 /*
8176 * The physical address of apic access page is stored in the VMCS.
8177 * Update it when it becomes invalid.
8178 */
8179 apic_address = gfn_to_hva(kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT);
8180 if (start <= apic_address && apic_address < end)
8181 kvm_make_all_cpus_request(kvm, KVM_REQ_APIC_PAGE_RELOAD);
93065ac7
MH
8182
8183 return 0;
b1394e74
RK
8184}
8185
4256f43f
TC
8186void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu)
8187{
35754c98 8188 if (!lapic_in_kernel(vcpu))
f439ed27
PB
8189 return;
8190
afaf0b2f 8191 if (!kvm_x86_ops.set_apic_access_page_addr)
4256f43f
TC
8192 return;
8193
a4148b7c 8194 kvm_x86_ops.set_apic_access_page_addr(vcpu);
4256f43f 8195}
4256f43f 8196
d264ee0c
SC
8197void __kvm_request_immediate_exit(struct kvm_vcpu *vcpu)
8198{
8199 smp_send_reschedule(vcpu->cpu);
8200}
8201EXPORT_SYMBOL_GPL(__kvm_request_immediate_exit);
8202
9357d939 8203/*
362c698f 8204 * Returns 1 to let vcpu_run() continue the guest execution loop without
9357d939
TY
 8205 * exiting to userspace. Otherwise, the value will be returned to
 8206 * userspace.
8207 */
851ba692 8208static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
b6c7a5dc
HB
8209{
8210 int r;
62a193ed
MG
8211 bool req_int_win =
8212 dm_request_for_irq_injection(vcpu) &&
8213 kvm_cpu_accept_dm_intr(vcpu);
a9ab13ff 8214 enum exit_fastpath_completion exit_fastpath;
62a193ed 8215
730dca42 8216 bool req_immediate_exit = false;
b6c7a5dc 8217
2fa6e1e1 8218 if (kvm_request_pending(vcpu)) {
671ddc70 8219 if (kvm_check_request(KVM_REQ_GET_VMCS12_PAGES, vcpu)) {
33b22172 8220 if (unlikely(!kvm_x86_ops.nested_ops->get_vmcs12_pages(vcpu))) {
671ddc70
JM
8221 r = 0;
8222 goto out;
8223 }
8224 }
a8eeb04a 8225 if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu))
2e53d63a 8226 kvm_mmu_unload(vcpu);
a8eeb04a 8227 if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu))
2f599714 8228 __kvm_migrate_timers(vcpu);
d828199e
MT
8229 if (kvm_check_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu))
8230 kvm_gen_update_masterclock(vcpu->kvm);
0061d53d
MT
8231 if (kvm_check_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu))
8232 kvm_gen_kvmclock_update(vcpu);
34c238a1
ZA
8233 if (kvm_check_request(KVM_REQ_CLOCK_UPDATE, vcpu)) {
8234 r = kvm_guest_time_update(vcpu);
8cfdc000
ZA
8235 if (unlikely(r))
8236 goto out;
8237 }
a8eeb04a 8238 if (kvm_check_request(KVM_REQ_MMU_SYNC, vcpu))
4731d4c7 8239 kvm_mmu_sync_roots(vcpu);
727a7e27
PB
8240 if (kvm_check_request(KVM_REQ_LOAD_MMU_PGD, vcpu))
8241 kvm_mmu_load_pgd(vcpu);
eeeb4f67 8242 if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu)) {
7780938c 8243 kvm_vcpu_flush_tlb_all(vcpu);
eeeb4f67
SC
8244
8245 /* Flushing all ASIDs flushes the current ASID... */
8246 kvm_clear_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
8247 }
8248 if (kvm_check_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu))
8249 kvm_vcpu_flush_tlb_current(vcpu);
0baedd79
VK
8250 if (kvm_check_request(KVM_REQ_HV_TLB_FLUSH, vcpu))
8251 kvm_vcpu_flush_tlb_guest(vcpu);
eeeb4f67 8252
a8eeb04a 8253 if (kvm_check_request(KVM_REQ_REPORT_TPR_ACCESS, vcpu)) {
851ba692 8254 vcpu->run->exit_reason = KVM_EXIT_TPR_ACCESS;
b93463aa
AK
8255 r = 0;
8256 goto out;
8257 }
a8eeb04a 8258 if (kvm_check_request(KVM_REQ_TRIPLE_FAULT, vcpu)) {
851ba692 8259 vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;
bbeac283 8260 vcpu->mmio_needed = 0;
71c4dfaf
JR
8261 r = 0;
8262 goto out;
8263 }
af585b92
GN
8264 if (kvm_check_request(KVM_REQ_APF_HALT, vcpu)) {
8265 /* Page is swapped out. Do synthetic halt */
8266 vcpu->arch.apf.halted = true;
8267 r = 1;
8268 goto out;
8269 }
c9aaa895
GC
8270 if (kvm_check_request(KVM_REQ_STEAL_UPDATE, vcpu))
8271 record_steal_time(vcpu);
64d60670
PB
8272 if (kvm_check_request(KVM_REQ_SMI, vcpu))
8273 process_smi(vcpu);
7460fb4a
AK
8274 if (kvm_check_request(KVM_REQ_NMI, vcpu))
8275 process_nmi(vcpu);
f5132b01 8276 if (kvm_check_request(KVM_REQ_PMU, vcpu))
c6702c9d 8277 kvm_pmu_handle_event(vcpu);
f5132b01 8278 if (kvm_check_request(KVM_REQ_PMI, vcpu))
c6702c9d 8279 kvm_pmu_deliver_pmi(vcpu);
7543a635
SR
8280 if (kvm_check_request(KVM_REQ_IOAPIC_EOI_EXIT, vcpu)) {
8281 BUG_ON(vcpu->arch.pending_ioapic_eoi > 255);
8282 if (test_bit(vcpu->arch.pending_ioapic_eoi,
6308630b 8283 vcpu->arch.ioapic_handled_vectors)) {
7543a635
SR
8284 vcpu->run->exit_reason = KVM_EXIT_IOAPIC_EOI;
8285 vcpu->run->eoi.vector =
8286 vcpu->arch.pending_ioapic_eoi;
8287 r = 0;
8288 goto out;
8289 }
8290 }
3d81bc7e
YZ
8291 if (kvm_check_request(KVM_REQ_SCAN_IOAPIC, vcpu))
8292 vcpu_scan_ioapic(vcpu);
e40ff1d6
LA
8293 if (kvm_check_request(KVM_REQ_LOAD_EOI_EXITMAP, vcpu))
8294 vcpu_load_eoi_exitmap(vcpu);
4256f43f
TC
8295 if (kvm_check_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu))
8296 kvm_vcpu_reload_apic_access_page(vcpu);
2ce79189
AS
8297 if (kvm_check_request(KVM_REQ_HV_CRASH, vcpu)) {
8298 vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT;
8299 vcpu->run->system_event.type = KVM_SYSTEM_EVENT_CRASH;
8300 r = 0;
8301 goto out;
8302 }
e516cebb
AS
8303 if (kvm_check_request(KVM_REQ_HV_RESET, vcpu)) {
8304 vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT;
8305 vcpu->run->system_event.type = KVM_SYSTEM_EVENT_RESET;
8306 r = 0;
8307 goto out;
8308 }
db397571
AS
8309 if (kvm_check_request(KVM_REQ_HV_EXIT, vcpu)) {
8310 vcpu->run->exit_reason = KVM_EXIT_HYPERV;
8311 vcpu->run->hyperv = vcpu->arch.hyperv.exit;
8312 r = 0;
8313 goto out;
8314 }
f3b138c5
AS
8315
8316 /*
8317 * KVM_REQ_HV_STIMER has to be processed after
8318 * KVM_REQ_CLOCK_UPDATE, because Hyper-V SynIC timers
8319 * depend on the guest clock being up-to-date
8320 */
1f4b34f8
AS
8321 if (kvm_check_request(KVM_REQ_HV_STIMER, vcpu))
8322 kvm_hv_process_stimers(vcpu);
8df14af4
SS
8323 if (kvm_check_request(KVM_REQ_APICV_UPDATE, vcpu))
8324 kvm_vcpu_update_apicv(vcpu);
2f52d58c 8325 }
b93463aa 8326
b463a6f7 8327 if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) {
0f1e261e 8328 ++vcpu->stat.req_event;
66450a21
JK
8329 kvm_apic_accept_events(vcpu);
8330 if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) {
8331 r = 1;
8332 goto out;
8333 }
8334
a1c77abb 8335 if (inject_pending_event(vcpu) != 0)
b6b8a145 8336 req_immediate_exit = true;
321c5658 8337 else {
cc3d967f 8338 /* Enable SMI/NMI/IRQ window open exits if needed.
c43203ca 8339 *
cc3d967f
LP
8340 * SMIs have three cases:
8341 * 1) They can be nested, and then there is nothing to
8342 * do here because RSM will cause a vmexit anyway.
8343 * 2) There is an ISA-specific reason why SMI cannot be
8344 * injected, and the moment when this changes can be
8345 * intercepted.
8346 * 3) Or the SMI can be pending because
8347 * inject_pending_event has completed the injection
8348 * of an IRQ or NMI from the previous vmexit, and
8349 * then we request an immediate exit to inject the
8350 * SMI.
c43203ca
PB
8351 */
8352 if (vcpu->arch.smi_pending && !is_smm(vcpu))
afaf0b2f 8353 if (!kvm_x86_ops.enable_smi_window(vcpu))
cc3d967f 8354 req_immediate_exit = true;
321c5658 8355 if (vcpu->arch.nmi_pending)
afaf0b2f 8356 kvm_x86_ops.enable_nmi_window(vcpu);
321c5658 8357 if (kvm_cpu_has_injectable_intr(vcpu) || req_int_win)
afaf0b2f 8358 kvm_x86_ops.enable_irq_window(vcpu);
d2060bd4
SC
8359 if (is_guest_mode(vcpu) &&
8360 kvm_x86_ops.nested_ops->hv_timer_pending &&
8361 kvm_x86_ops.nested_ops->hv_timer_pending(vcpu))
8362 req_immediate_exit = true;
664f8e26 8363 WARN_ON(vcpu->arch.exception.pending);
321c5658 8364 }
b463a6f7
AK
8365
8366 if (kvm_lapic_enabled(vcpu)) {
8367 update_cr8_intercept(vcpu);
8368 kvm_lapic_sync_to_vapic(vcpu);
8369 }
8370 }
8371
d8368af8
AK
8372 r = kvm_mmu_reload(vcpu);
8373 if (unlikely(r)) {
d905c069 8374 goto cancel_injection;
d8368af8
AK
8375 }
8376
b6c7a5dc
HB
8377 preempt_disable();
8378
afaf0b2f 8379 kvm_x86_ops.prepare_guest_switch(vcpu);
b95234c8
PB
8380
8381 /*
 8382	 * Disable IRQs before setting IN_GUEST_MODE.  Posted interrupt
 8383	 * IPIs are then delayed until after guest entry, which ensures that
 8384	 * they result in virtual interrupt delivery.
8385 */
8386 local_irq_disable();
6b7e2d09
XG
8387 vcpu->mode = IN_GUEST_MODE;
8388
01b71917
MT
8389 srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
8390
0f127d12 8391 /*
b95234c8 8392 * 1) We should set ->mode before checking ->requests. Please see
cde9af6e 8393 * the comment in kvm_vcpu_exiting_guest_mode().
b95234c8 8394 *
81b01667 8395 * 2) For APICv, we should set ->mode before checking PID.ON. This
b95234c8
PB
8396 * pairs with the memory barrier implicit in pi_test_and_set_on
8397 * (see vmx_deliver_posted_interrupt).
8398 *
8399 * 3) This also orders the write to mode from any reads to the page
8400 * tables done while the VCPU is running. Please see the comment
8401 * in kvm_flush_remote_tlbs.
6b7e2d09 8402 */
01b71917 8403 smp_mb__after_srcu_read_unlock();
b6c7a5dc 8404
b95234c8
PB
8405 /*
8406 * This handles the case where a posted interrupt was
8407 * notified with kvm_vcpu_kick.
8408 */
fa59cc00 8409 if (kvm_lapic_enabled(vcpu) && vcpu->arch.apicv_active)
afaf0b2f 8410 kvm_x86_ops.sync_pir_to_irr(vcpu);
32f88400 8411
2fa6e1e1 8412 if (vcpu->mode == EXITING_GUEST_MODE || kvm_request_pending(vcpu)
d94e1dc9 8413 || need_resched() || signal_pending(current)) {
6b7e2d09 8414 vcpu->mode = OUTSIDE_GUEST_MODE;
d94e1dc9 8415 smp_wmb();
6c142801
AK
8416 local_irq_enable();
8417 preempt_enable();
01b71917 8418 vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
6c142801 8419 r = 1;
d905c069 8420 goto cancel_injection;
6c142801
AK
8421 }
8422
c43203ca
PB
8423 if (req_immediate_exit) {
8424 kvm_make_request(KVM_REQ_EVENT, vcpu);
afaf0b2f 8425 kvm_x86_ops.request_immediate_exit(vcpu);
c43203ca 8426 }
d6185f20 8427
8b89fe1f 8428 trace_kvm_entry(vcpu->vcpu_id);
6edaa530 8429 guest_enter_irqoff();
b6c7a5dc 8430
2620fe26
SC
8431 fpregs_assert_state_consistent();
8432 if (test_thread_flag(TIF_NEED_FPU_LOAD))
8433 switch_fpu_return();
5f409e20 8434
42dbaa5a 8435 if (unlikely(vcpu->arch.switch_db_regs)) {
42dbaa5a
JK
8436 set_debugreg(0, 7);
8437 set_debugreg(vcpu->arch.eff_db[0], 0);
8438 set_debugreg(vcpu->arch.eff_db[1], 1);
8439 set_debugreg(vcpu->arch.eff_db[2], 2);
8440 set_debugreg(vcpu->arch.eff_db[3], 3);
c77fb5fe 8441 set_debugreg(vcpu->arch.dr6, 6);
ae561ede 8442 vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_RELOAD;
42dbaa5a 8443 }
b6c7a5dc 8444
a9ab13ff 8445 exit_fastpath = kvm_x86_ops.run(vcpu);
b6c7a5dc 8446
c77fb5fe
PB
8447 /*
8448 * Do this here before restoring debug registers on the host. And
8449 * since we do this before handling the vmexit, a DR access vmexit
8450 * can (a) read the correct value of the debug registers, (b) set
8451 * KVM_DEBUGREG_WONT_EXIT again.
8452 */
8453 if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)) {
c77fb5fe 8454 WARN_ON(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP);
afaf0b2f 8455 kvm_x86_ops.sync_dirty_debug_regs(vcpu);
70e4da7a 8456 kvm_update_dr0123(vcpu);
70e4da7a
PB
8457 kvm_update_dr7(vcpu);
8458 vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_RELOAD;
c77fb5fe
PB
8459 }
8460
24f1e32c
FW
8461 /*
8462 * If the guest has used debug registers, at least dr7
8463 * will be disabled while returning to the host.
8464 * If we don't have active breakpoints in the host, we don't
8465 * care about the messed up debug address registers. But if
8466 * we have some of them active, restore the old state.
8467 */
59d8eb53 8468 if (hw_breakpoint_active())
24f1e32c 8469 hw_breakpoint_restore();
42dbaa5a 8470
4ba76538 8471 vcpu->arch.last_guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc());
1d5f066e 8472
6b7e2d09 8473 vcpu->mode = OUTSIDE_GUEST_MODE;
d94e1dc9 8474 smp_wmb();
a547c6db 8475
a9ab13ff 8476 kvm_x86_ops.handle_exit_irqoff(vcpu);
b6c7a5dc 8477
d7a08882
SC
8478 /*
8479 * Consume any pending interrupts, including the possible source of
8480 * VM-Exit on SVM and any ticks that occur between VM-Exit and now.
8481 * An instruction is required after local_irq_enable() to fully unblock
 8482	 * interrupts on processors that implement an interrupt shadow; the
 8483	 * stat.exits increment will do nicely.
8484 */
8485 kvm_before_interrupt(vcpu);
8486 local_irq_enable();
b6c7a5dc 8487 ++vcpu->stat.exits;
d7a08882
SC
8488 local_irq_disable();
8489 kvm_after_interrupt(vcpu);
b6c7a5dc 8490
f2485b3e 8491 guest_exit_irqoff();
ec0671d5
WL
8492 if (lapic_in_kernel(vcpu)) {
8493 s64 delta = vcpu->arch.apic->lapic_timer.advance_expire_delta;
8494 if (delta != S64_MIN) {
8495 trace_kvm_wait_lapic_expire(vcpu->vcpu_id, delta);
8496 vcpu->arch.apic->lapic_timer.advance_expire_delta = S64_MIN;
8497 }
8498 }
b6c7a5dc 8499
f2485b3e 8500 local_irq_enable();
b6c7a5dc
HB
8501 preempt_enable();
8502
f656ce01 8503 vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
3200f405 8504
b6c7a5dc
HB
8505 /*
8506 * Profile KVM exit RIPs:
8507 */
8508 if (unlikely(prof_on == KVM_PROFILING)) {
5fdbf976
MT
8509 unsigned long rip = kvm_rip_read(vcpu);
8510 profile_hit(KVM_PROFILING, (void *)rip);
b6c7a5dc
HB
8511 }
8512
cc578287
ZA
8513 if (unlikely(vcpu->arch.tsc_always_catchup))
8514 kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
298101da 8515
5cfb1d5a
MT
8516 if (vcpu->arch.apic_attention)
8517 kvm_lapic_sync_from_vapic(vcpu);
b93463aa 8518
afaf0b2f 8519 r = kvm_x86_ops.handle_exit(vcpu, exit_fastpath);
d905c069
MT
8520 return r;
8521
8522cancel_injection:
8081ad06
SC
8523 if (req_immediate_exit)
8524 kvm_make_request(KVM_REQ_EVENT, vcpu);
afaf0b2f 8525 kvm_x86_ops.cancel_injection(vcpu);
ae7a2a3f
MT
8526 if (unlikely(vcpu->arch.apic_attention))
8527 kvm_lapic_sync_from_vapic(vcpu);
d7690175
MT
8528out:
8529 return r;
8530}
b6c7a5dc 8531
362c698f
PB
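/*
 * Block the vCPU until it is runnable again.  The vendor pre_block()/
 * post_block() hooks, when present, bracket kvm_vcpu_block() so that
 * e.g. posted-interrupt handling can be adjusted around the halt.  On
 * unhalt, pending APIC events are accepted and a HALTED vCPU is moved
 * back to RUNNABLE; returns 1 to keep the outer run loop going, or
 * -EINTR for an unexpected mp_state.
 */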
8532static inline int vcpu_block(struct kvm *kvm, struct kvm_vcpu *vcpu)
8533{
bf9f6ac8 8534 if (!kvm_arch_vcpu_runnable(vcpu) &&
afaf0b2f 8535 (!kvm_x86_ops.pre_block || kvm_x86_ops.pre_block(vcpu) == 0)) {
9c8fd1ba
PB
8536 srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
8537 kvm_vcpu_block(vcpu);
8538 vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
bf9f6ac8 8539
afaf0b2f
SC
8540 if (kvm_x86_ops.post_block)
8541 kvm_x86_ops.post_block(vcpu);
bf9f6ac8 8542
9c8fd1ba
PB
8543 if (!kvm_check_request(KVM_REQ_UNHALT, vcpu))
8544 return 1;
8545 }
362c698f
PB
8546
8547 kvm_apic_accept_events(vcpu);
8548 switch(vcpu->arch.mp_state) {
8549 case KVM_MP_STATE_HALTED:
8550 vcpu->arch.pv.pv_unhalted = false;
8551 vcpu->arch.mp_state =
8552 KVM_MP_STATE_RUNNABLE;
b2869f28 8553 /* fall through */
362c698f
PB
8554 case KVM_MP_STATE_RUNNABLE:
8555 vcpu->arch.apf.halted = false;
8556 break;
8557 case KVM_MP_STATE_INIT_RECEIVED:
8558 break;
8559 default:
8560 return -EINTR;
362c698f
PB
8561 }
8562 return 1;
8563}
09cec754 8564
5d9bc648
PB
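/*
 * A vCPU is considered runnable when it is in KVM_MP_STATE_RUNNABLE and
 * is not halted waiting on async page faults.  Nested events are
 * (re)evaluated first so that a pending nested transition is not missed
 * while deciding whether to enter the guest or block.
 */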
8565static inline bool kvm_vcpu_running(struct kvm_vcpu *vcpu)
8566{
56083bdf 8567 if (is_guest_mode(vcpu))
33b22172 8568 kvm_x86_ops.nested_ops->check_events(vcpu);
0ad3bed6 8569
5d9bc648
PB
8570 return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE &&
8571 !vcpu->arch.apf.halted);
8572}
8573
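/*
 * Outer run loop: alternate between entering the guest and blocking,
 * inject pending timer interrupts, bounce back to userspace for
 * IRQ-window requests or signals, and yield the CPU when a reschedule
 * is needed.  A return value <= 0 propagates an exit to userspace.
 */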
362c698f 8574static int vcpu_run(struct kvm_vcpu *vcpu)
d7690175
MT
8575{
8576 int r;
f656ce01 8577 struct kvm *kvm = vcpu->kvm;
d7690175 8578
f656ce01 8579 vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
c595ceee 8580 vcpu->arch.l1tf_flush_l1d = true;
d7690175 8581
362c698f 8582 for (;;) {
58f800d5 8583 if (kvm_vcpu_running(vcpu)) {
851ba692 8584 r = vcpu_enter_guest(vcpu);
bf9f6ac8 8585 } else {
362c698f 8586 r = vcpu_block(kvm, vcpu);
bf9f6ac8
FW
8587 }
8588
09cec754
GN
8589 if (r <= 0)
8590 break;
8591
72875d8a 8592 kvm_clear_request(KVM_REQ_PENDING_TIMER, vcpu);
09cec754
GN
8593 if (kvm_cpu_has_pending_timer(vcpu))
8594 kvm_inject_pending_timer_irqs(vcpu);
8595
782d422b
MG
8596 if (dm_request_for_irq_injection(vcpu) &&
8597 kvm_vcpu_ready_for_interrupt_injection(vcpu)) {
4ca7dd8c
PB
8598 r = 0;
8599 vcpu->run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
09cec754 8600 ++vcpu->stat.request_irq_exits;
362c698f 8601 break;
09cec754 8602 }
af585b92
GN
8603
8604 kvm_check_async_pf_completion(vcpu);
8605
09cec754
GN
8606 if (signal_pending(current)) {
8607 r = -EINTR;
851ba692 8608 vcpu->run->exit_reason = KVM_EXIT_INTR;
09cec754 8609 ++vcpu->stat.signal_exits;
362c698f 8610 break;
09cec754
GN
8611 }
8612 if (need_resched()) {
f656ce01 8613 srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
c08ac06a 8614 cond_resched();
f656ce01 8615 vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
d7690175 8616 }
b6c7a5dc
HB
8617 }
8618
f656ce01 8619 srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
b6c7a5dc
HB
8620
8621 return r;
8622}
8623
716d51ab
GN
8624static inline int complete_emulated_io(struct kvm_vcpu *vcpu)
8625{
8626 int r;
60fc3d02 8627
716d51ab 8628 vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
0ce97a2b 8629 r = kvm_emulate_instruction(vcpu, EMULTYPE_NO_DECODE);
716d51ab 8630 srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
60fc3d02 8631 return r;
716d51ab
GN
8632}
8633
8634static int complete_emulated_pio(struct kvm_vcpu *vcpu)
8635{
8636 BUG_ON(!vcpu->arch.pio.count);
8637
8638 return complete_emulated_io(vcpu);
8639}
8640
f78146b0
AK
8641/*
8642 * Implements the following, as a state machine:
8643 *
8644 * read:
8645 * for each fragment
87da7e66
XG
8646 * for each mmio piece in the fragment
8647 * write gpa, len
8648 * exit
8649 * copy data
f78146b0
AK
8650 * execute insn
8651 *
8652 * write:
8653 * for each fragment
87da7e66
XG
8654 * for each mmio piece in the fragment
8655 * write gpa, len
8656 * copy data
8657 * exit
f78146b0 8658 */
716d51ab 8659static int complete_emulated_mmio(struct kvm_vcpu *vcpu)
5287f194
AK
8660{
8661 struct kvm_run *run = vcpu->run;
f78146b0 8662 struct kvm_mmio_fragment *frag;
87da7e66 8663 unsigned len;
5287f194 8664
716d51ab 8665 BUG_ON(!vcpu->mmio_needed);
5287f194 8666
716d51ab 8667 /* Complete previous fragment */
87da7e66
XG
8668 frag = &vcpu->mmio_fragments[vcpu->mmio_cur_fragment];
8669 len = min(8u, frag->len);
716d51ab 8670 if (!vcpu->mmio_is_write)
87da7e66
XG
8671 memcpy(frag->data, run->mmio.data, len);
8672
8673 if (frag->len <= 8) {
8674 /* Switch to the next fragment. */
8675 frag++;
8676 vcpu->mmio_cur_fragment++;
8677 } else {
8678 /* Go forward to the next mmio piece. */
8679 frag->data += len;
8680 frag->gpa += len;
8681 frag->len -= len;
8682 }
8683
a08d3b3b 8684 if (vcpu->mmio_cur_fragment >= vcpu->mmio_nr_fragments) {
716d51ab 8685 vcpu->mmio_needed = 0;
0912c977
PB
8686
8687 /* FIXME: return into emulator if single-stepping. */
cef4dea0 8688 if (vcpu->mmio_is_write)
716d51ab
GN
8689 return 1;
8690 vcpu->mmio_read_completed = 1;
8691 return complete_emulated_io(vcpu);
8692 }
87da7e66 8693
716d51ab
GN
8694 run->exit_reason = KVM_EXIT_MMIO;
8695 run->mmio.phys_addr = frag->gpa;
8696 if (vcpu->mmio_is_write)
87da7e66
XG
8697 memcpy(run->mmio.data, frag->data, min(8u, frag->len));
8698 run->mmio.len = min(8u, frag->len);
716d51ab
GN
8699 run->mmio.is_write = vcpu->mmio_is_write;
8700 vcpu->arch.complete_userspace_io = complete_emulated_mmio;
8701 return 0;
5287f194
AK
8702}
8703
c9aef3b8
SC
8704static void kvm_save_current_fpu(struct fpu *fpu)
8705{
8706 /*
8707 * If the target FPU state is not resident in the CPU registers, just
8708 * memcpy() from current, else save CPU state directly to the target.
8709 */
8710 if (test_thread_flag(TIF_NEED_FPU_LOAD))
8711 memcpy(&fpu->state, &current->thread.fpu.state,
8712 fpu_kernel_xstate_size);
8713 else
8714 copy_fpregs_to_fpstate(fpu);
8715}
8716
822f312d
SAS
8717/* Swap (qemu) user FPU context for the guest FPU context. */
8718static void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
8719{
5f409e20
RR
8720 fpregs_lock();
8721
c9aef3b8
SC
8722 kvm_save_current_fpu(vcpu->arch.user_fpu);
8723
afaf0b2f 8724 /* PKRU is separately restored in kvm_x86_ops.run. */
b666a4b6 8725 __copy_kernel_to_fpregs(&vcpu->arch.guest_fpu->state,
822f312d 8726 ~XFEATURE_MASK_PKRU);
5f409e20
RR
8727
8728 fpregs_mark_activate();
8729 fpregs_unlock();
8730
822f312d
SAS
8731 trace_kvm_fpu(1);
8732}
8733
8734/* When vcpu_run ends, restore user space FPU context. */
8735static void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
8736{
5f409e20
RR
8737 fpregs_lock();
8738
c9aef3b8
SC
8739 kvm_save_current_fpu(vcpu->arch.guest_fpu);
8740
d9a710e5 8741 copy_kernel_to_fpregs(&vcpu->arch.user_fpu->state);
5f409e20
RR
8742
8743 fpregs_mark_activate();
8744 fpregs_unlock();
8745
822f312d
SAS
8746 ++vcpu->stat.fpu_reload;
8747 trace_kvm_fpu(0);
8748}
8749
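/*
 * Entry point for the KVM_RUN ioctl.  From userspace, the typical
 * driving loop, roughly sketched here (how the vcpu fd and the
 * mmap()ed struct kvm_run were obtained is assumed and VMM-specific),
 * looks something like:
 *
 *	for (;;) {
 *		ioctl(vcpu_fd, KVM_RUN, 0);
 *		switch (run->exit_reason) {
 *		case KVM_EXIT_IO:
 *		case KVM_EXIT_MMIO:
 *			// emulate the access, then re-enter KVM_RUN
 *			break;
 *		case KVM_EXIT_INTR:
 *		case KVM_EXIT_SHUTDOWN:
 *			// handle the signal or stop the vCPU
 *			break;
 *		}
 *	}
 */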
1b94f6f8 8750int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
b6c7a5dc 8751{
1b94f6f8 8752 struct kvm_run *kvm_run = vcpu->run;
b6c7a5dc 8753 int r;
b6c7a5dc 8754
accb757d 8755 vcpu_load(vcpu);
20b7035c 8756 kvm_sigset_activate(vcpu);
5663d8f9
PX
8757 kvm_load_guest_fpu(vcpu);
8758
a4535290 8759 if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) {
2f173d26
JS
8760 if (kvm_run->immediate_exit) {
8761 r = -EINTR;
8762 goto out;
8763 }
b6c7a5dc 8764 kvm_vcpu_block(vcpu);
66450a21 8765 kvm_apic_accept_events(vcpu);
72875d8a 8766 kvm_clear_request(KVM_REQ_UNHALT, vcpu);
ac9f6dc0 8767 r = -EAGAIN;
a0595000
JS
8768 if (signal_pending(current)) {
8769 r = -EINTR;
1b94f6f8 8770 kvm_run->exit_reason = KVM_EXIT_INTR;
a0595000
JS
8771 ++vcpu->stat.signal_exits;
8772 }
ac9f6dc0 8773 goto out;
b6c7a5dc
HB
8774 }
8775
1b94f6f8 8776 if (kvm_run->kvm_valid_regs & ~KVM_SYNC_X86_VALID_FIELDS) {
01643c51
KH
8777 r = -EINVAL;
8778 goto out;
8779 }
8780
1b94f6f8 8781 if (kvm_run->kvm_dirty_regs) {
01643c51
KH
8782 r = sync_regs(vcpu);
8783 if (r != 0)
8784 goto out;
8785 }
8786
b6c7a5dc 8787 /* re-sync apic's tpr */
35754c98 8788 if (!lapic_in_kernel(vcpu)) {
eea1cff9
AP
8789 if (kvm_set_cr8(vcpu, kvm_run->cr8) != 0) {
8790 r = -EINVAL;
8791 goto out;
8792 }
8793 }
b6c7a5dc 8794
716d51ab
GN
8795 if (unlikely(vcpu->arch.complete_userspace_io)) {
8796 int (*cui)(struct kvm_vcpu *) = vcpu->arch.complete_userspace_io;
8797 vcpu->arch.complete_userspace_io = NULL;
8798 r = cui(vcpu);
8799 if (r <= 0)
5663d8f9 8800 goto out;
716d51ab
GN
8801 } else
8802 WARN_ON(vcpu->arch.pio.count || vcpu->mmio_needed);
5287f194 8803
460df4c1
PB
8804 if (kvm_run->immediate_exit)
8805 r = -EINTR;
8806 else
8807 r = vcpu_run(vcpu);
b6c7a5dc
HB
8808
8809out:
5663d8f9 8810 kvm_put_guest_fpu(vcpu);
1b94f6f8 8811 if (kvm_run->kvm_valid_regs)
01643c51 8812 store_regs(vcpu);
f1d86e46 8813 post_kvm_run_save(vcpu);
20b7035c 8814 kvm_sigset_deactivate(vcpu);
b6c7a5dc 8815
accb757d 8816 vcpu_put(vcpu);
b6c7a5dc
HB
8817 return r;
8818}
8819
01643c51 8820static void __get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
b6c7a5dc 8821{
7ae441ea
GN
8822 if (vcpu->arch.emulate_regs_need_sync_to_vcpu) {
8823 /*
 8824	 * We are here if userspace calls get_regs() in the middle of
 8825	 * instruction emulation. Register state needs to be copied back
 8826	 * from the emulation context to the vcpu. Userspace shouldn't
 8827	 * usually do that, but some badly designed PV devices (the vmware
 8828	 * backdoor interface) need this to work.
8829 */
c9b8b07c 8830 emulator_writeback_register_cache(vcpu->arch.emulate_ctxt);
7ae441ea
GN
8831 vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
8832 }
de3cd117
SC
8833 regs->rax = kvm_rax_read(vcpu);
8834 regs->rbx = kvm_rbx_read(vcpu);
8835 regs->rcx = kvm_rcx_read(vcpu);
8836 regs->rdx = kvm_rdx_read(vcpu);
8837 regs->rsi = kvm_rsi_read(vcpu);
8838 regs->rdi = kvm_rdi_read(vcpu);
e9c16c78 8839 regs->rsp = kvm_rsp_read(vcpu);
de3cd117 8840 regs->rbp = kvm_rbp_read(vcpu);
b6c7a5dc 8841#ifdef CONFIG_X86_64
de3cd117
SC
8842 regs->r8 = kvm_r8_read(vcpu);
8843 regs->r9 = kvm_r9_read(vcpu);
8844 regs->r10 = kvm_r10_read(vcpu);
8845 regs->r11 = kvm_r11_read(vcpu);
8846 regs->r12 = kvm_r12_read(vcpu);
8847 regs->r13 = kvm_r13_read(vcpu);
8848 regs->r14 = kvm_r14_read(vcpu);
8849 regs->r15 = kvm_r15_read(vcpu);
b6c7a5dc
HB
8850#endif
8851
5fdbf976 8852 regs->rip = kvm_rip_read(vcpu);
91586a3b 8853 regs->rflags = kvm_get_rflags(vcpu);
01643c51 8854}
b6c7a5dc 8855
01643c51
KH
8856int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
8857{
8858 vcpu_load(vcpu);
8859 __get_regs(vcpu, regs);
1fc9b76b 8860 vcpu_put(vcpu);
b6c7a5dc
HB
8861 return 0;
8862}
8863
01643c51 8864static void __set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
b6c7a5dc 8865{
7ae441ea
GN
8866 vcpu->arch.emulate_regs_need_sync_from_vcpu = true;
8867 vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
8868
de3cd117
SC
8869 kvm_rax_write(vcpu, regs->rax);
8870 kvm_rbx_write(vcpu, regs->rbx);
8871 kvm_rcx_write(vcpu, regs->rcx);
8872 kvm_rdx_write(vcpu, regs->rdx);
8873 kvm_rsi_write(vcpu, regs->rsi);
8874 kvm_rdi_write(vcpu, regs->rdi);
e9c16c78 8875 kvm_rsp_write(vcpu, regs->rsp);
de3cd117 8876 kvm_rbp_write(vcpu, regs->rbp);
b6c7a5dc 8877#ifdef CONFIG_X86_64
de3cd117
SC
8878 kvm_r8_write(vcpu, regs->r8);
8879 kvm_r9_write(vcpu, regs->r9);
8880 kvm_r10_write(vcpu, regs->r10);
8881 kvm_r11_write(vcpu, regs->r11);
8882 kvm_r12_write(vcpu, regs->r12);
8883 kvm_r13_write(vcpu, regs->r13);
8884 kvm_r14_write(vcpu, regs->r14);
8885 kvm_r15_write(vcpu, regs->r15);
b6c7a5dc
HB
8886#endif
8887
5fdbf976 8888 kvm_rip_write(vcpu, regs->rip);
d73235d1 8889 kvm_set_rflags(vcpu, regs->rflags | X86_EFLAGS_FIXED);
b6c7a5dc 8890
b4f14abd
JK
8891 vcpu->arch.exception.pending = false;
8892
3842d135 8893 kvm_make_request(KVM_REQ_EVENT, vcpu);
01643c51 8894}
3842d135 8895
01643c51
KH
8896int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
8897{
8898 vcpu_load(vcpu);
8899 __set_regs(vcpu, regs);
875656fe 8900 vcpu_put(vcpu);
b6c7a5dc
HB
8901 return 0;
8902}
8903
b6c7a5dc
HB
8904void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
8905{
8906 struct kvm_segment cs;
8907
3e6e0aab 8908 kvm_get_segment(vcpu, &cs, VCPU_SREG_CS);
b6c7a5dc
HB
8909 *db = cs.db;
8910 *l = cs.l;
8911}
8912EXPORT_SYMBOL_GPL(kvm_get_cs_db_l_bits);
8913
01643c51 8914static void __get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
b6c7a5dc 8915{
89a27f4d 8916 struct desc_ptr dt;
b6c7a5dc 8917
3e6e0aab
GT
8918 kvm_get_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
8919 kvm_get_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
8920 kvm_get_segment(vcpu, &sregs->es, VCPU_SREG_ES);
8921 kvm_get_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
8922 kvm_get_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
8923 kvm_get_segment(vcpu, &sregs->ss, VCPU_SREG_SS);
b6c7a5dc 8924
3e6e0aab
GT
8925 kvm_get_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
8926 kvm_get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
b6c7a5dc 8927
afaf0b2f 8928 kvm_x86_ops.get_idt(vcpu, &dt);
89a27f4d
GN
8929 sregs->idt.limit = dt.size;
8930 sregs->idt.base = dt.address;
afaf0b2f 8931 kvm_x86_ops.get_gdt(vcpu, &dt);
89a27f4d
GN
8932 sregs->gdt.limit = dt.size;
8933 sregs->gdt.base = dt.address;
b6c7a5dc 8934
4d4ec087 8935 sregs->cr0 = kvm_read_cr0(vcpu);
ad312c7c 8936 sregs->cr2 = vcpu->arch.cr2;
9f8fe504 8937 sregs->cr3 = kvm_read_cr3(vcpu);
fc78f519 8938 sregs->cr4 = kvm_read_cr4(vcpu);
2d3ad1f4 8939 sregs->cr8 = kvm_get_cr8(vcpu);
f6801dff 8940 sregs->efer = vcpu->arch.efer;
b6c7a5dc
HB
8941 sregs->apic_base = kvm_get_apic_base(vcpu);
8942
0e96f31e 8943 memset(sregs->interrupt_bitmap, 0, sizeof(sregs->interrupt_bitmap));
b6c7a5dc 8944
04140b41 8945 if (vcpu->arch.interrupt.injected && !vcpu->arch.interrupt.soft)
14d0bc1f
GN
8946 set_bit(vcpu->arch.interrupt.nr,
8947 (unsigned long *)sregs->interrupt_bitmap);
01643c51 8948}
16d7a191 8949
01643c51
KH
8950int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
8951 struct kvm_sregs *sregs)
8952{
8953 vcpu_load(vcpu);
8954 __get_sregs(vcpu, sregs);
bcdec41c 8955 vcpu_put(vcpu);
b6c7a5dc
HB
8956 return 0;
8957}
8958
62d9f0db
MT
8959int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
8960 struct kvm_mp_state *mp_state)
8961{
fd232561 8962 vcpu_load(vcpu);
f958bd23
SC
8963 if (kvm_mpx_supported())
8964 kvm_load_guest_fpu(vcpu);
fd232561 8965
66450a21 8966 kvm_apic_accept_events(vcpu);
6aef266c
SV
8967 if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED &&
8968 vcpu->arch.pv.pv_unhalted)
8969 mp_state->mp_state = KVM_MP_STATE_RUNNABLE;
8970 else
8971 mp_state->mp_state = vcpu->arch.mp_state;
8972
f958bd23
SC
8973 if (kvm_mpx_supported())
8974 kvm_put_guest_fpu(vcpu);
fd232561 8975 vcpu_put(vcpu);
62d9f0db
MT
8976 return 0;
8977}
8978
8979int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
8980 struct kvm_mp_state *mp_state)
8981{
e83dff5e
CD
8982 int ret = -EINVAL;
8983
8984 vcpu_load(vcpu);
8985
bce87cce 8986 if (!lapic_in_kernel(vcpu) &&
66450a21 8987 mp_state->mp_state != KVM_MP_STATE_RUNNABLE)
e83dff5e 8988 goto out;
66450a21 8989
27cbe7d6
LA
8990 /*
8991 * KVM_MP_STATE_INIT_RECEIVED means the processor is in
8992 * INIT state; latched init should be reported using
8993 * KVM_SET_VCPU_EVENTS, so reject it here.
8994 */
8995 if ((kvm_vcpu_latch_init(vcpu) || vcpu->arch.smi_pending) &&
28bf2888
DH
8996 (mp_state->mp_state == KVM_MP_STATE_SIPI_RECEIVED ||
8997 mp_state->mp_state == KVM_MP_STATE_INIT_RECEIVED))
e83dff5e 8998 goto out;
28bf2888 8999
66450a21
JK
9000 if (mp_state->mp_state == KVM_MP_STATE_SIPI_RECEIVED) {
9001 vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED;
9002 set_bit(KVM_APIC_SIPI, &vcpu->arch.apic->pending_events);
9003 } else
9004 vcpu->arch.mp_state = mp_state->mp_state;
3842d135 9005 kvm_make_request(KVM_REQ_EVENT, vcpu);
e83dff5e
CD
9006
9007 ret = 0;
9008out:
9009 vcpu_put(vcpu);
9010 return ret;
62d9f0db
MT
9011}
9012
7f3d35fd
KW
9013int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int idt_index,
9014 int reason, bool has_error_code, u32 error_code)
b6c7a5dc 9015{
c9b8b07c 9016 struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
8ec4722d 9017 int ret;
e01c2426 9018
8ec4722d 9019 init_emulate_ctxt(vcpu);
c697518a 9020
7f3d35fd 9021 ret = emulator_task_switch(ctxt, tss_selector, idt_index, reason,
9d74191a 9022 has_error_code, error_code);
1051778f
SC
9023 if (ret) {
9024 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
9025 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
9026 vcpu->run->internal.ndata = 0;
60fc3d02 9027 return 0;
1051778f 9028 }
37817f29 9029
9d74191a
TY
9030 kvm_rip_write(vcpu, ctxt->eip);
9031 kvm_set_rflags(vcpu, ctxt->eflags);
60fc3d02 9032 return 1;
37817f29
IE
9033}
9034EXPORT_SYMBOL_GPL(kvm_task_switch);
9035
3140c156 9036static int kvm_valid_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
f2981033 9037{
37b95951 9038 if ((sregs->efer & EFER_LME) && (sregs->cr0 & X86_CR0_PG)) {
f2981033
LT
9039 /*
9040 * When EFER.LME and CR0.PG are set, the processor is in
9041 * 64-bit mode (though maybe in a 32-bit code segment).
9042 * CR4.PAE and EFER.LMA must be set.
9043 */
37b95951 9044 if (!(sregs->cr4 & X86_CR4_PAE)
f2981033
LT
9045 || !(sregs->efer & EFER_LMA))
9046 return -EINVAL;
9047 } else {
9048 /*
9049 * Not in 64-bit mode: EFER.LMA is clear and the code
9050 * segment cannot be 64-bit.
9051 */
9052 if (sregs->efer & EFER_LMA || sregs->cs.l)
9053 return -EINVAL;
9054 }
9055
3ca94192 9056 return kvm_valid_cr4(vcpu, sregs->cr4);
f2981033
LT
9057}
9058
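/*
 * Apply a userspace-provided special-register state: validate the
 * combination first, then load the descriptor tables, control registers
 * and EFER, reload the PDPTRs when PAE paging is active, reset the MMU
 * context if paging-related state changed, and re-queue any interrupt
 * that was pending in the bitmap.
 */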
01643c51 9059static int __set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
b6c7a5dc 9060{
58cb628d 9061 struct msr_data apic_base_msr;
b6c7a5dc 9062 int mmu_reset_needed = 0;
c4d21882 9063 int cpuid_update_needed = 0;
63f42e02 9064 int pending_vec, max_bits, idx;
89a27f4d 9065 struct desc_ptr dt;
b4ef9d4e
CD
9066 int ret = -EINVAL;
9067
f2981033 9068 if (kvm_valid_sregs(vcpu, sregs))
8dbfb2bf 9069 goto out;
f2981033 9070
d3802286
JM
9071 apic_base_msr.data = sregs->apic_base;
9072 apic_base_msr.host_initiated = true;
9073 if (kvm_set_apic_base(vcpu, &apic_base_msr))
b4ef9d4e 9074 goto out;
6d1068b3 9075
89a27f4d
GN
9076 dt.size = sregs->idt.limit;
9077 dt.address = sregs->idt.base;
afaf0b2f 9078 kvm_x86_ops.set_idt(vcpu, &dt);
89a27f4d
GN
9079 dt.size = sregs->gdt.limit;
9080 dt.address = sregs->gdt.base;
afaf0b2f 9081 kvm_x86_ops.set_gdt(vcpu, &dt);
b6c7a5dc 9082
ad312c7c 9083 vcpu->arch.cr2 = sregs->cr2;
9f8fe504 9084 mmu_reset_needed |= kvm_read_cr3(vcpu) != sregs->cr3;
dc7e795e 9085 vcpu->arch.cr3 = sregs->cr3;
cb3c1e2f 9086 kvm_register_mark_available(vcpu, VCPU_EXREG_CR3);
b6c7a5dc 9087
2d3ad1f4 9088 kvm_set_cr8(vcpu, sregs->cr8);
b6c7a5dc 9089
f6801dff 9090 mmu_reset_needed |= vcpu->arch.efer != sregs->efer;
afaf0b2f 9091 kvm_x86_ops.set_efer(vcpu, sregs->efer);
b6c7a5dc 9092
4d4ec087 9093 mmu_reset_needed |= kvm_read_cr0(vcpu) != sregs->cr0;
afaf0b2f 9094 kvm_x86_ops.set_cr0(vcpu, sregs->cr0);
d7306163 9095 vcpu->arch.cr0 = sregs->cr0;
b6c7a5dc 9096
fc78f519 9097 mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4;
c4d21882
WH
9098 cpuid_update_needed |= ((kvm_read_cr4(vcpu) ^ sregs->cr4) &
9099 (X86_CR4_OSXSAVE | X86_CR4_PKE));
afaf0b2f 9100 kvm_x86_ops.set_cr4(vcpu, sregs->cr4);
c4d21882 9101 if (cpuid_update_needed)
00b27a3e 9102 kvm_update_cpuid(vcpu);
63f42e02
XG
9103
9104 idx = srcu_read_lock(&vcpu->kvm->srcu);
bf03d4f9 9105 if (is_pae_paging(vcpu)) {
9f8fe504 9106 load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu));
7c93be44
MT
9107 mmu_reset_needed = 1;
9108 }
63f42e02 9109 srcu_read_unlock(&vcpu->kvm->srcu, idx);
b6c7a5dc
HB
9110
9111 if (mmu_reset_needed)
9112 kvm_mmu_reset_context(vcpu);
9113
a50abc3b 9114 max_bits = KVM_NR_INTERRUPTS;
923c61bb
GN
9115 pending_vec = find_first_bit(
9116 (const unsigned long *)sregs->interrupt_bitmap, max_bits);
9117 if (pending_vec < max_bits) {
66fd3f7f 9118 kvm_queue_interrupt(vcpu, pending_vec, false);
923c61bb 9119 pr_debug("Set back pending irq %d\n", pending_vec);
b6c7a5dc
HB
9120 }
9121
3e6e0aab
GT
9122 kvm_set_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
9123 kvm_set_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
9124 kvm_set_segment(vcpu, &sregs->es, VCPU_SREG_ES);
9125 kvm_set_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
9126 kvm_set_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
9127 kvm_set_segment(vcpu, &sregs->ss, VCPU_SREG_SS);
b6c7a5dc 9128
3e6e0aab
GT
9129 kvm_set_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
9130 kvm_set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
b6c7a5dc 9131
5f0269f5
ME
9132 update_cr8_intercept(vcpu);
9133
9c3e4aab 9134 /* Older userspace won't unhalt the vcpu on reset. */
c5af89b6 9135 if (kvm_vcpu_is_bsp(vcpu) && kvm_rip_read(vcpu) == 0xfff0 &&
9c3e4aab 9136 sregs->cs.selector == 0xf000 && sregs->cs.base == 0xffff0000 &&
3eeb3288 9137 !is_protmode(vcpu))
9c3e4aab
MT
9138 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
9139
3842d135
AK
9140 kvm_make_request(KVM_REQ_EVENT, vcpu);
9141
b4ef9d4e
CD
9142 ret = 0;
9143out:
01643c51
KH
9144 return ret;
9145}
9146
9147int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
9148 struct kvm_sregs *sregs)
9149{
9150 int ret;
9151
9152 vcpu_load(vcpu);
9153 ret = __set_sregs(vcpu, sregs);
b4ef9d4e
CD
9154 vcpu_put(vcpu);
9155 return ret;
b6c7a5dc
HB
9156}
9157
d0bfb940
JK
9158int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
9159 struct kvm_guest_debug *dbg)
b6c7a5dc 9160{
355be0b9 9161 unsigned long rflags;
ae675ef0 9162 int i, r;
b6c7a5dc 9163
66b56562
CD
9164 vcpu_load(vcpu);
9165
4f926bf2
JK
9166 if (dbg->control & (KVM_GUESTDBG_INJECT_DB | KVM_GUESTDBG_INJECT_BP)) {
9167 r = -EBUSY;
9168 if (vcpu->arch.exception.pending)
2122ff5e 9169 goto out;
4f926bf2
JK
9170 if (dbg->control & KVM_GUESTDBG_INJECT_DB)
9171 kvm_queue_exception(vcpu, DB_VECTOR);
9172 else
9173 kvm_queue_exception(vcpu, BP_VECTOR);
9174 }
9175
91586a3b
JK
9176 /*
9177 * Read rflags as long as potentially injected trace flags are still
9178 * filtered out.
9179 */
9180 rflags = kvm_get_rflags(vcpu);
355be0b9
JK
9181
9182 vcpu->guest_debug = dbg->control;
9183 if (!(vcpu->guest_debug & KVM_GUESTDBG_ENABLE))
9184 vcpu->guest_debug = 0;
9185
9186 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) {
ae675ef0
JK
9187 for (i = 0; i < KVM_NR_DB_REGS; ++i)
9188 vcpu->arch.eff_db[i] = dbg->arch.debugreg[i];
c8639010 9189 vcpu->arch.guest_debug_dr7 = dbg->arch.debugreg[7];
ae675ef0
JK
9190 } else {
9191 for (i = 0; i < KVM_NR_DB_REGS; i++)
9192 vcpu->arch.eff_db[i] = vcpu->arch.db[i];
ae675ef0 9193 }
c8639010 9194 kvm_update_dr7(vcpu);
ae675ef0 9195
f92653ee
JK
9196 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
9197 vcpu->arch.singlestep_rip = kvm_rip_read(vcpu) +
9198 get_segment_base(vcpu, VCPU_SREG_CS);
94fe45da 9199
91586a3b
JK
9200 /*
9201 * Trigger an rflags update that will inject or remove the trace
9202 * flags.
9203 */
9204 kvm_set_rflags(vcpu, rflags);
b6c7a5dc 9205
afaf0b2f 9206 kvm_x86_ops.update_bp_intercept(vcpu);
b6c7a5dc 9207
4f926bf2 9208 r = 0;
d0bfb940 9209
2122ff5e 9210out:
66b56562 9211 vcpu_put(vcpu);
b6c7a5dc
HB
9212 return r;
9213}
9214
8b006791
ZX
9215/*
9216 * Translate a guest virtual address to a guest physical address.
9217 */
9218int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
9219 struct kvm_translation *tr)
9220{
9221 unsigned long vaddr = tr->linear_address;
9222 gpa_t gpa;
f656ce01 9223 int idx;
8b006791 9224
1da5b61d
CD
9225 vcpu_load(vcpu);
9226
f656ce01 9227 idx = srcu_read_lock(&vcpu->kvm->srcu);
1871c602 9228 gpa = kvm_mmu_gva_to_gpa_system(vcpu, vaddr, NULL);
f656ce01 9229 srcu_read_unlock(&vcpu->kvm->srcu, idx);
8b006791
ZX
9230 tr->physical_address = gpa;
9231 tr->valid = gpa != UNMAPPED_GVA;
9232 tr->writeable = 1;
9233 tr->usermode = 0;
8b006791 9234
1da5b61d 9235 vcpu_put(vcpu);
8b006791
ZX
9236 return 0;
9237}
9238
d0752060
HB
9239int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
9240{
1393123e 9241 struct fxregs_state *fxsave;
d0752060 9242
1393123e 9243 vcpu_load(vcpu);
d0752060 9244
b666a4b6 9245 fxsave = &vcpu->arch.guest_fpu->state.fxsave;
d0752060
HB
9246 memcpy(fpu->fpr, fxsave->st_space, 128);
9247 fpu->fcw = fxsave->cwd;
9248 fpu->fsw = fxsave->swd;
9249 fpu->ftwx = fxsave->twd;
9250 fpu->last_opcode = fxsave->fop;
9251 fpu->last_ip = fxsave->rip;
9252 fpu->last_dp = fxsave->rdp;
0e96f31e 9253 memcpy(fpu->xmm, fxsave->xmm_space, sizeof(fxsave->xmm_space));
d0752060 9254
1393123e 9255 vcpu_put(vcpu);
d0752060
HB
9256 return 0;
9257}
9258
9259int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
9260{
6a96bc7f
CD
9261 struct fxregs_state *fxsave;
9262
9263 vcpu_load(vcpu);
9264
b666a4b6 9265 fxsave = &vcpu->arch.guest_fpu->state.fxsave;
d0752060 9266
d0752060
HB
9267 memcpy(fxsave->st_space, fpu->fpr, 128);
9268 fxsave->cwd = fpu->fcw;
9269 fxsave->swd = fpu->fsw;
9270 fxsave->twd = fpu->ftwx;
9271 fxsave->fop = fpu->last_opcode;
9272 fxsave->rip = fpu->last_ip;
9273 fxsave->rdp = fpu->last_dp;
0e96f31e 9274 memcpy(fxsave->xmm_space, fpu->xmm, sizeof(fxsave->xmm_space));
d0752060 9275
6a96bc7f 9276 vcpu_put(vcpu);
d0752060
HB
9277 return 0;
9278}
9279
01643c51
KH
9280static void store_regs(struct kvm_vcpu *vcpu)
9281{
9282 BUILD_BUG_ON(sizeof(struct kvm_sync_regs) > SYNC_REGS_SIZE_BYTES);
9283
9284 if (vcpu->run->kvm_valid_regs & KVM_SYNC_X86_REGS)
9285 __get_regs(vcpu, &vcpu->run->s.regs.regs);
9286
9287 if (vcpu->run->kvm_valid_regs & KVM_SYNC_X86_SREGS)
9288 __get_sregs(vcpu, &vcpu->run->s.regs.sregs);
9289
9290 if (vcpu->run->kvm_valid_regs & KVM_SYNC_X86_EVENTS)
9291 kvm_vcpu_ioctl_x86_get_vcpu_events(
9292 vcpu, &vcpu->run->s.regs.events);
9293}
9294
9295static int sync_regs(struct kvm_vcpu *vcpu)
9296{
9297 if (vcpu->run->kvm_dirty_regs & ~KVM_SYNC_X86_VALID_FIELDS)
9298 return -EINVAL;
9299
9300 if (vcpu->run->kvm_dirty_regs & KVM_SYNC_X86_REGS) {
9301 __set_regs(vcpu, &vcpu->run->s.regs.regs);
9302 vcpu->run->kvm_dirty_regs &= ~KVM_SYNC_X86_REGS;
9303 }
9304 if (vcpu->run->kvm_dirty_regs & KVM_SYNC_X86_SREGS) {
9305 if (__set_sregs(vcpu, &vcpu->run->s.regs.sregs))
9306 return -EINVAL;
9307 vcpu->run->kvm_dirty_regs &= ~KVM_SYNC_X86_SREGS;
9308 }
9309 if (vcpu->run->kvm_dirty_regs & KVM_SYNC_X86_EVENTS) {
9310 if (kvm_vcpu_ioctl_x86_set_vcpu_events(
9311 vcpu, &vcpu->run->s.regs.events))
9312 return -EINVAL;
9313 vcpu->run->kvm_dirty_regs &= ~KVM_SYNC_X86_EVENTS;
9314 }
9315
9316 return 0;
9317}
9318
0ee6a517 9319static void fx_init(struct kvm_vcpu *vcpu)
d0752060 9320{
b666a4b6 9321 fpstate_init(&vcpu->arch.guest_fpu->state);
782511b0 9322 if (boot_cpu_has(X86_FEATURE_XSAVES))
b666a4b6 9323 vcpu->arch.guest_fpu->state.xsave.header.xcomp_bv =
df1daba7 9324 host_xcr0 | XSTATE_COMPACTION_ENABLED;
d0752060 9325
2acf923e
DC
9326 /*
9327 * Ensure guest xcr0 is valid for loading
9328 */
d91cab78 9329 vcpu->arch.xcr0 = XFEATURE_MASK_FP;
2acf923e 9330
ad312c7c 9331 vcpu->arch.cr0 |= X86_CR0_ET;
d0752060 9332}
d0752060 9333
897cc38e 9334int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id)
e9b11c17 9335{
897cc38e
SC
9336 if (kvm_check_tsc_unstable() && atomic_read(&kvm->online_vcpus) != 0)
9337 pr_warn_once("kvm: SMP vm created on host with unstable TSC; "
9338 "guest TSC will not be reliable\n");
7f1ea208 9339
897cc38e 9340 return 0;
e9b11c17
ZX
9341}
9342
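/*
 * Architecture-specific vCPU construction: MMU and (optionally)
 * in-kernel LAPIC first, then the pio buffer page, MCE banks, WBINVD
 * dirty mask, emulator context and the user/guest FPU areas, and
 * finally the vendor vcpu_create() hook.  Failures unwind in reverse
 * order via the labels below.
 */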
e529ef66 9343int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
e9b11c17 9344{
95a0d01e
SC
9345 struct page *page;
9346 int r;
c447e76b 9347
95a0d01e
SC
9348 if (!irqchip_in_kernel(vcpu->kvm) || kvm_vcpu_is_reset_bsp(vcpu))
9349 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
9350 else
9351 vcpu->arch.mp_state = KVM_MP_STATE_UNINITIALIZED;
c447e76b 9352
95a0d01e 9353 kvm_set_tsc_khz(vcpu, max_tsc_khz);
c447e76b 9354
95a0d01e
SC
9355 r = kvm_mmu_create(vcpu);
9356 if (r < 0)
9357 return r;
9358
9359 if (irqchip_in_kernel(vcpu->kvm)) {
95a0d01e
SC
9360 r = kvm_create_lapic(vcpu, lapic_timer_advance_ns);
9361 if (r < 0)
9362 goto fail_mmu_destroy;
4e19c36f
SS
9363 if (kvm_apicv_activated(vcpu->kvm))
9364 vcpu->arch.apicv_active = true;
95a0d01e
SC
9365 } else
9366 static_key_slow_inc(&kvm_no_apic_vcpu);
9367
9368 r = -ENOMEM;
9369
9370 page = alloc_page(GFP_KERNEL | __GFP_ZERO);
9371 if (!page)
9372 goto fail_free_lapic;
9373 vcpu->arch.pio_data = page_address(page);
9374
9375 vcpu->arch.mce_banks = kzalloc(KVM_MAX_MCE_BANKS * sizeof(u64) * 4,
9376 GFP_KERNEL_ACCOUNT);
9377 if (!vcpu->arch.mce_banks)
9378 goto fail_free_pio_data;
9379 vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS;
9380
9381 if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask,
9382 GFP_KERNEL_ACCOUNT))
9383 goto fail_free_mce_banks;
9384
c9b8b07c
SC
9385 if (!alloc_emulate_ctxt(vcpu))
9386 goto free_wbinvd_dirty_mask;
9387
95a0d01e
SC
9388 vcpu->arch.user_fpu = kmem_cache_zalloc(x86_fpu_cache,
9389 GFP_KERNEL_ACCOUNT);
9390 if (!vcpu->arch.user_fpu) {
9391 pr_err("kvm: failed to allocate userspace's fpu\n");
c9b8b07c 9392 goto free_emulate_ctxt;
95a0d01e
SC
9393 }
9394
9395 vcpu->arch.guest_fpu = kmem_cache_zalloc(x86_fpu_cache,
9396 GFP_KERNEL_ACCOUNT);
9397 if (!vcpu->arch.guest_fpu) {
9398 pr_err("kvm: failed to allocate vcpu's fpu\n");
9399 goto free_user_fpu;
9400 }
9401 fx_init(vcpu);
9402
9403 vcpu->arch.guest_xstate_size = XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET;
9404
9405 vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu);
9406
9407 vcpu->arch.pat = MSR_IA32_CR_PAT_DEFAULT;
9408
9409 kvm_async_pf_hash_reset(vcpu);
9410 kvm_pmu_init(vcpu);
9411
9412 vcpu->arch.pending_external_vector = -1;
9413 vcpu->arch.preempted_in_kernel = false;
9414
9415 kvm_hv_vcpu_init(vcpu);
9416
afaf0b2f 9417 r = kvm_x86_ops.vcpu_create(vcpu);
95a0d01e
SC
9418 if (r)
9419 goto free_guest_fpu;
e9b11c17 9420
0cf9135b 9421 vcpu->arch.arch_capabilities = kvm_get_arch_capabilities();
e53d88af 9422 vcpu->arch.msr_platform_info = MSR_PLATFORM_INFO_CPUID_FAULT;
19efffa2 9423 kvm_vcpu_mtrr_init(vcpu);
ec7660cc 9424 vcpu_load(vcpu);
d28bc9dd 9425 kvm_vcpu_reset(vcpu, false);
e1732991 9426 kvm_init_mmu(vcpu, false);
e9b11c17 9427 vcpu_put(vcpu);
ec7660cc 9428 return 0;
95a0d01e
SC
9429
9430free_guest_fpu:
9431 kmem_cache_free(x86_fpu_cache, vcpu->arch.guest_fpu);
9432free_user_fpu:
9433 kmem_cache_free(x86_fpu_cache, vcpu->arch.user_fpu);
c9b8b07c
SC
9434free_emulate_ctxt:
9435 kmem_cache_free(x86_emulator_cache, vcpu->arch.emulate_ctxt);
95a0d01e
SC
9436free_wbinvd_dirty_mask:
9437 free_cpumask_var(vcpu->arch.wbinvd_dirty_mask);
9438fail_free_mce_banks:
9439 kfree(vcpu->arch.mce_banks);
9440fail_free_pio_data:
9441 free_page((unsigned long)vcpu->arch.pio_data);
9442fail_free_lapic:
9443 kvm_free_lapic(vcpu);
9444fail_mmu_destroy:
9445 kvm_mmu_destroy(vcpu);
9446 return r;
e9b11c17
ZX
9447}
9448
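/*
 * Runs once the vCPU is visible to userspace: write an initial,
 * host-initiated TSC value of zero, enable poll control by default,
 * and kick off periodic kvmclock synchronization from the first vCPU.
 */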
31928aa5 9449void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
42897d86 9450{
8fe8ab46 9451 struct msr_data msr;
332967a3 9452 struct kvm *kvm = vcpu->kvm;
42897d86 9453
d3457c87
RK
9454 kvm_hv_vcpu_postcreate(vcpu);
9455
ec7660cc 9456 if (mutex_lock_killable(&vcpu->mutex))
31928aa5 9457 return;
ec7660cc 9458 vcpu_load(vcpu);
8fe8ab46
WA
9459 msr.data = 0x0;
9460 msr.index = MSR_IA32_TSC;
9461 msr.host_initiated = true;
9462 kvm_write_tsc(vcpu, &msr);
42897d86 9463 vcpu_put(vcpu);
2d5ba19b
MT
9464
9465 /* poll control enabled by default */
9466 vcpu->arch.msr_kvm_poll_control = 1;
9467
ec7660cc 9468 mutex_unlock(&vcpu->mutex);
42897d86 9469
b34de572
WL
9470 if (kvmclock_periodic_sync && vcpu->vcpu_idx == 0)
9471 schedule_delayed_work(&kvm->arch.kvmclock_sync_work,
9472 KVMCLOCK_SYNC_PERIOD);
42897d86
MT
9473}
9474
d40ccc62 9475void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
e9b11c17 9476{
4cbc418a 9477 struct gfn_to_pfn_cache *cache = &vcpu->arch.st.cache;
95a0d01e 9478 int idx;
344d9588 9479
4cbc418a
PB
9480 kvm_release_pfn(cache->pfn, cache->dirty, cache);
9481
50b143e1 9482 kvmclock_reset(vcpu);
e9b11c17 9483
afaf0b2f 9484 kvm_x86_ops.vcpu_free(vcpu);
50b143e1 9485
c9b8b07c 9486 kmem_cache_free(x86_emulator_cache, vcpu->arch.emulate_ctxt);
50b143e1
SC
9487 free_cpumask_var(vcpu->arch.wbinvd_dirty_mask);
9488 kmem_cache_free(x86_fpu_cache, vcpu->arch.user_fpu);
9489 kmem_cache_free(x86_fpu_cache, vcpu->arch.guest_fpu);
95a0d01e
SC
9490
9491 kvm_hv_vcpu_uninit(vcpu);
9492 kvm_pmu_destroy(vcpu);
9493 kfree(vcpu->arch.mce_banks);
9494 kvm_free_lapic(vcpu);
9495 idx = srcu_read_lock(&vcpu->kvm->srcu);
9496 kvm_mmu_destroy(vcpu);
9497 srcu_read_unlock(&vcpu->kvm->srcu, idx);
9498 free_page((unsigned long)vcpu->arch.pio_data);
9499 if (!lapic_in_kernel(vcpu))
9500 static_key_slow_dec(&kvm_no_apic_vcpu);
e9b11c17
ZX
9501}
9502
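/*
 * Reset vCPU state for either a cold RESET or an INIT (init_event).
 * Pending NMIs/SMIs/exceptions and the debug registers are cleared in
 * both cases; PMU state, SMBASE, xcr0 and the misc-features MSR are
 * only reinitialized on a full reset.  MPX BNDREGS/BNDCSR state in the
 * guest xsave area is zeroed when MPX is supported.
 */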
d28bc9dd 9503void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
e9b11c17 9504{
b7e31be3
RK
9505 kvm_lapic_reset(vcpu, init_event);
9506
e69fab5d
PB
9507 vcpu->arch.hflags = 0;
9508
c43203ca 9509 vcpu->arch.smi_pending = 0;
52797bf9 9510 vcpu->arch.smi_count = 0;
7460fb4a
AK
9511 atomic_set(&vcpu->arch.nmi_queued, 0);
9512 vcpu->arch.nmi_pending = 0;
448fa4a9 9513 vcpu->arch.nmi_injected = false;
5f7552d4
NA
9514 kvm_clear_interrupt_queue(vcpu);
9515 kvm_clear_exception_queue(vcpu);
448fa4a9 9516
42dbaa5a 9517 memset(vcpu->arch.db, 0, sizeof(vcpu->arch.db));
ae561ede 9518 kvm_update_dr0123(vcpu);
6f43ed01 9519 vcpu->arch.dr6 = DR6_INIT;
42dbaa5a 9520 vcpu->arch.dr7 = DR7_FIXED_1;
c8639010 9521 kvm_update_dr7(vcpu);
42dbaa5a 9522
1119022c
NA
9523 vcpu->arch.cr2 = 0;
9524
3842d135 9525 kvm_make_request(KVM_REQ_EVENT, vcpu);
344d9588 9526 vcpu->arch.apf.msr_val = 0;
c9aaa895 9527 vcpu->arch.st.msr_val = 0;
3842d135 9528
12f9a48f
GC
9529 kvmclock_reset(vcpu);
9530
af585b92
GN
9531 kvm_clear_async_pf_completion_queue(vcpu);
9532 kvm_async_pf_hash_reset(vcpu);
9533 vcpu->arch.apf.halted = false;
3842d135 9534
a554d207
WL
9535 if (kvm_mpx_supported()) {
9536 void *mpx_state_buffer;
9537
9538 /*
 9539		 * Avoid having the INIT path from kvm_apic_has_events() run
 9540		 * with the FPU loaded, which would not let userspace fix the state.
9541 */
f775b13e
RR
9542 if (init_event)
9543 kvm_put_guest_fpu(vcpu);
b666a4b6 9544 mpx_state_buffer = get_xsave_addr(&vcpu->arch.guest_fpu->state.xsave,
abd16d68 9545 XFEATURE_BNDREGS);
a554d207
WL
9546 if (mpx_state_buffer)
9547 memset(mpx_state_buffer, 0, sizeof(struct mpx_bndreg_state));
b666a4b6 9548 mpx_state_buffer = get_xsave_addr(&vcpu->arch.guest_fpu->state.xsave,
abd16d68 9549 XFEATURE_BNDCSR);
a554d207
WL
9550 if (mpx_state_buffer)
9551 memset(mpx_state_buffer, 0, sizeof(struct mpx_bndcsr));
f775b13e
RR
9552 if (init_event)
9553 kvm_load_guest_fpu(vcpu);
a554d207
WL
9554 }
9555
64d60670 9556 if (!init_event) {
d28bc9dd 9557 kvm_pmu_reset(vcpu);
64d60670 9558 vcpu->arch.smbase = 0x30000;
db2336a8 9559
db2336a8 9560 vcpu->arch.msr_misc_features_enables = 0;
a554d207
WL
9561
9562 vcpu->arch.xcr0 = XFEATURE_MASK_FP;
64d60670 9563 }
f5132b01 9564
66f7b72e
JS
9565 memset(vcpu->arch.regs, 0, sizeof(vcpu->arch.regs));
9566 vcpu->arch.regs_avail = ~0;
9567 vcpu->arch.regs_dirty = ~0;
9568
a554d207
WL
9569 vcpu->arch.ia32_xss = 0;
9570
afaf0b2f 9571 kvm_x86_ops.vcpu_reset(vcpu, init_event);
e9b11c17
ZX
9572}
9573
2b4a273b 9574void kvm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector)
66450a21
JK
9575{
9576 struct kvm_segment cs;
9577
9578 kvm_get_segment(vcpu, &cs, VCPU_SREG_CS);
9579 cs.selector = vector << 8;
9580 cs.base = vector << 12;
9581 kvm_set_segment(vcpu, &cs, VCPU_SREG_CS);
9582 kvm_rip_write(vcpu, 0);
e9b11c17
ZX
9583}
9584
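/*
 * Called when KVM enables virtualization on a CPU: enable the vendor
 * extensions, then detect and compensate for TSCs that went backwards
 * across a suspend/resume cycle (see the long comment below).
 */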
13a34e06 9585int kvm_arch_hardware_enable(void)
e9b11c17 9586{
ca84d1a2
ZA
9587 struct kvm *kvm;
9588 struct kvm_vcpu *vcpu;
9589 int i;
0dd6a6ed
ZA
9590 int ret;
9591 u64 local_tsc;
9592 u64 max_tsc = 0;
9593 bool stable, backwards_tsc = false;
18863bdd
AK
9594
9595 kvm_shared_msr_cpu_online();
afaf0b2f 9596 ret = kvm_x86_ops.hardware_enable();
0dd6a6ed
ZA
9597 if (ret != 0)
9598 return ret;
9599
4ea1636b 9600 local_tsc = rdtsc();
b0c39dc6 9601 stable = !kvm_check_tsc_unstable();
0dd6a6ed
ZA
9602 list_for_each_entry(kvm, &vm_list, vm_list) {
9603 kvm_for_each_vcpu(i, vcpu, kvm) {
9604 if (!stable && vcpu->cpu == smp_processor_id())
105b21bb 9605 kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
0dd6a6ed
ZA
9606 if (stable && vcpu->arch.last_host_tsc > local_tsc) {
9607 backwards_tsc = true;
9608 if (vcpu->arch.last_host_tsc > max_tsc)
9609 max_tsc = vcpu->arch.last_host_tsc;
9610 }
9611 }
9612 }
9613
9614 /*
9615 * Sometimes, even reliable TSCs go backwards. This happens on
9616 * platforms that reset TSC during suspend or hibernate actions, but
9617 * maintain synchronization. We must compensate. Fortunately, we can
9618 * detect that condition here, which happens early in CPU bringup,
9619 * before any KVM threads can be running. Unfortunately, we can't
9620 * bring the TSCs fully up to date with real time, as we aren't yet far
9621 * enough into CPU bringup that we know how much real time has actually
9285ec4c 9622 * elapsed; our helper function, ktime_get_boottime_ns() will be using boot
0dd6a6ed
ZA
9623 * variables that haven't been updated yet.
9624 *
9625 * So we simply find the maximum observed TSC above, then record the
9626 * adjustment to TSC in each VCPU. When the VCPU later gets loaded,
9627 * the adjustment will be applied. Note that we accumulate
9628 * adjustments, in case multiple suspend cycles happen before some VCPU
9629 * gets a chance to run again. In the event that no KVM threads get a
9630 * chance to run, we will miss the entire elapsed period, as we'll have
9631 * reset last_host_tsc, so VCPUs will not have the TSC adjusted and may
 9632	 * lose cycle time. This isn't too big a deal, since the loss will be
9633 * uniform across all VCPUs (not to mention the scenario is extremely
9634 * unlikely). It is possible that a second hibernate recovery happens
9635 * much faster than a first, causing the observed TSC here to be
9636 * smaller; this would require additional padding adjustment, which is
9637 * why we set last_host_tsc to the local tsc observed here.
9638 *
9639 * N.B. - this code below runs only on platforms with reliable TSC,
9640 * as that is the only way backwards_tsc is set above. Also note
9641 * that this runs for ALL vcpus, which is not a bug; all VCPUs should
9642 * have the same delta_cyc adjustment applied if backwards_tsc
9643 * is detected. Note further, this adjustment is only done once,
9644 * as we reset last_host_tsc on all VCPUs to stop this from being
9645 * called multiple times (one for each physical CPU bringup).
9646 *
4a969980 9647 * Platforms with unreliable TSCs don't have to deal with this, they
0dd6a6ed
ZA
9648 * will be compensated by the logic in vcpu_load, which sets the TSC to
9649 * catchup mode. This will catchup all VCPUs to real time, but cannot
9650 * guarantee that they stay in perfect synchronization.
9651 */
9652 if (backwards_tsc) {
9653 u64 delta_cyc = max_tsc - local_tsc;
9654 list_for_each_entry(kvm, &vm_list, vm_list) {
a826faf1 9655 kvm->arch.backwards_tsc_observed = true;
0dd6a6ed
ZA
9656 kvm_for_each_vcpu(i, vcpu, kvm) {
9657 vcpu->arch.tsc_offset_adjustment += delta_cyc;
9658 vcpu->arch.last_host_tsc = local_tsc;
105b21bb 9659 kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
0dd6a6ed
ZA
9660 }
9661
9662 /*
 9663			 * We have to disable TSC offset matching: if you were
 9664			 * booting a VM while issuing an S4 host suspend,
 9665			 * you may have a problem. Solving this issue is
9666 * left as an exercise to the reader.
9667 */
9668 kvm->arch.last_tsc_nsec = 0;
9669 kvm->arch.last_tsc_write = 0;
9670 }
9671
9672 }
9673 return 0;
e9b11c17
ZX
9674}
9675
13a34e06 9676void kvm_arch_hardware_disable(void)
e9b11c17 9677{
afaf0b2f 9678 kvm_x86_ops.hardware_disable();
13a34e06 9679 drop_user_return_notifiers();
e9b11c17
ZX
9680}
9681
b9904085 9682int kvm_arch_hardware_setup(void *opaque)
e9b11c17 9683{
d008dfdb 9684 struct kvm_x86_init_ops *ops = opaque;
9e9c3fe4
NA
9685 int r;
9686
91661989
SC
9687 rdmsrl_safe(MSR_EFER, &host_efer);
9688
408e9a31
PB
9689 if (boot_cpu_has(X86_FEATURE_XSAVES))
9690 rdmsrl(MSR_IA32_XSS, host_xss);
9691
d008dfdb 9692 r = ops->hardware_setup();
9e9c3fe4
NA
9693 if (r != 0)
9694 return r;
9695
afaf0b2f 9696 memcpy(&kvm_x86_ops, ops->runtime_ops, sizeof(kvm_x86_ops));
69c6f69a 9697
408e9a31
PB
9698 if (!kvm_cpu_cap_has(X86_FEATURE_XSAVES))
9699 supported_xss = 0;
9700
139f7425
PB
9701#define __kvm_cpu_cap_has(UNUSED_, f) kvm_cpu_cap_has(f)
9702 cr4_reserved_bits = __cr4_reserved_bits(__kvm_cpu_cap_has, UNUSED_);
9703#undef __kvm_cpu_cap_has
b11306b5 9704
35181e86
HZ
9705 if (kvm_has_tsc_control) {
9706 /*
9707 * Make sure the user can only configure tsc_khz values that
9708 * fit into a signed integer.
273ba457 9709 * A min value is not calculated because it will always
35181e86
HZ
9710 * be 1 on all machines.
9711 */
9712 u64 max = min(0x7fffffffULL,
9713 __scale_tsc(kvm_max_tsc_scaling_ratio, tsc_khz));
9714 kvm_max_guest_tsc_khz = max;
9715
ad721883 9716 kvm_default_tsc_scaling_ratio = 1ULL << kvm_tsc_scaling_ratio_frac_bits;
35181e86 9717 }
ad721883 9718
9e9c3fe4
NA
9719 kvm_init_msr_list();
9720 return 0;
e9b11c17
ZX
9721}
9722
9723void kvm_arch_hardware_unsetup(void)
9724{
afaf0b2f 9725 kvm_x86_ops.hardware_unsetup();
e9b11c17
ZX
9726}
9727
b9904085 9728int kvm_arch_check_processor_compat(void *opaque)
e9b11c17 9729{
f1cdecf5 9730 struct cpuinfo_x86 *c = &cpu_data(smp_processor_id());
d008dfdb 9731 struct kvm_x86_init_ops *ops = opaque;
f1cdecf5
SC
9732
9733 WARN_ON(!irqs_disabled());
9734
139f7425
PB
9735 if (__cr4_reserved_bits(cpu_has, c) !=
9736 __cr4_reserved_bits(cpu_has, &boot_cpu_data))
f1cdecf5
SC
9737 return -EIO;
9738
d008dfdb 9739 return ops->check_processor_compatibility();
d71ba788
PB
9740}
9741
9742bool kvm_vcpu_is_reset_bsp(struct kvm_vcpu *vcpu)
9743{
9744 return vcpu->kvm->arch.bsp_vcpu_id == vcpu->vcpu_id;
9745}
9746EXPORT_SYMBOL_GPL(kvm_vcpu_is_reset_bsp);
9747
9748bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu)
9749{
9750 return (vcpu->arch.apic_base & MSR_IA32_APICBASE_BSP) != 0;
e9b11c17
ZX
9751}
9752
54e9818f 9753struct static_key kvm_no_apic_vcpu __read_mostly;
bce87cce 9754EXPORT_SYMBOL_GPL(kvm_no_apic_vcpu);
54e9818f 9755
e790d9ef
RK
9756void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu)
9757{
b35e5548
LX
9758 struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
9759
c595ceee 9760 vcpu->arch.l1tf_flush_l1d = true;
b35e5548
LX
9761 if (pmu->version && unlikely(pmu->event_count)) {
9762 pmu->need_cleanup = true;
9763 kvm_make_request(KVM_REQ_PMU, vcpu);
9764 }
afaf0b2f 9765 kvm_x86_ops.sched_in(vcpu, cpu);
e790d9ef
RK
9766}
9767
562b6b08
SC
9768void kvm_arch_free_vm(struct kvm *kvm)
9769{
9770 kfree(kvm->arch.hyperv.hv_pa_pg);
9771 vfree(kvm);
e790d9ef
RK
9772}
9773
562b6b08 9774
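/*
 * Per-VM initialization: list heads and locks for the MMU and assigned
 * devices, reserved IRQ source IDs, kvmclock/gtod bookkeeping, the
 * delayed kvmclock update/sync work, Hyper-V and page-track state, and
 * finally the vendor vm_init() hook.  Only type 0 VMs are supported.
 */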
e08b9637 9775int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
d19a9cd2 9776{
e08b9637
CO
9777 if (type)
9778 return -EINVAL;
9779
6ef768fa 9780 INIT_HLIST_HEAD(&kvm->arch.mask_notifier_list);
f05e70ac 9781 INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
10605204 9782 INIT_LIST_HEAD(&kvm->arch.zapped_obsolete_pages);
1aa9b957 9783 INIT_LIST_HEAD(&kvm->arch.lpage_disallowed_mmu_pages);
4d5c5d0f 9784 INIT_LIST_HEAD(&kvm->arch.assigned_dev_head);
e0f0bbc5 9785 atomic_set(&kvm->arch.noncoherent_dma_count, 0);
d19a9cd2 9786
5550af4d
SY
9787 /* Reserve bit 0 of irq_sources_bitmap for userspace irq source */
9788 set_bit(KVM_USERSPACE_IRQ_SOURCE_ID, &kvm->arch.irq_sources_bitmap);
7a84428a
AW
9789 /* Reserve bit 1 of irq_sources_bitmap for irqfd-resampler */
9790 set_bit(KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID,
9791 &kvm->arch.irq_sources_bitmap);
5550af4d 9792
038f8c11 9793 raw_spin_lock_init(&kvm->arch.tsc_write_lock);
1e08ec4a 9794 mutex_init(&kvm->arch.apic_map_lock);
d828199e
MT
9795 spin_lock_init(&kvm->arch.pvclock_gtod_sync_lock);
9796
8171cd68 9797 kvm->arch.kvmclock_offset = -get_kvmclock_base_ns();
d828199e 9798 pvclock_update_vm_gtod_copy(kvm);
53f658b3 9799
6fbbde9a
DS
9800 kvm->arch.guest_can_read_msr_platform_info = true;
9801
7e44e449 9802 INIT_DELAYED_WORK(&kvm->arch.kvmclock_update_work, kvmclock_update_fn);
332967a3 9803 INIT_DELAYED_WORK(&kvm->arch.kvmclock_sync_work, kvmclock_sync_fn);
7e44e449 9804
cbc0236a 9805 kvm_hv_init_vm(kvm);
0eb05bf2 9806 kvm_page_track_init(kvm);
13d268ca 9807 kvm_mmu_init_vm(kvm);
0eb05bf2 9808
afaf0b2f 9809 return kvm_x86_ops.vm_init(kvm);
d19a9cd2
ZX
9810}
9811
1aa9b957
JS
9812int kvm_arch_post_init_vm(struct kvm *kvm)
9813{
9814 return kvm_mmu_post_init_vm(kvm);
9815}
9816
d19a9cd2
ZX
9817static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu)
9818{
ec7660cc 9819 vcpu_load(vcpu);
d19a9cd2
ZX
9820 kvm_mmu_unload(vcpu);
9821 vcpu_put(vcpu);
9822}
9823
9824static void kvm_free_vcpus(struct kvm *kvm)
9825{
9826 unsigned int i;
988a2cae 9827 struct kvm_vcpu *vcpu;
d19a9cd2
ZX
9828
9829 /*
9830 * Unpin any mmu pages first.
9831 */
af585b92
GN
9832 kvm_for_each_vcpu(i, vcpu, kvm) {
9833 kvm_clear_async_pf_completion_queue(vcpu);
988a2cae 9834 kvm_unload_vcpu_mmu(vcpu);
af585b92 9835 }
988a2cae 9836 kvm_for_each_vcpu(i, vcpu, kvm)
4543bdc0 9837 kvm_vcpu_destroy(vcpu);
988a2cae
GN
9838
9839 mutex_lock(&kvm->lock);
9840 for (i = 0; i < atomic_read(&kvm->online_vcpus); i++)
9841 kvm->vcpus[i] = NULL;
d19a9cd2 9842
988a2cae
GN
9843 atomic_set(&kvm->online_vcpus, 0);
9844 mutex_unlock(&kvm->lock);
d19a9cd2
ZX
9845}
9846
ad8ba2cd
SY
9847void kvm_arch_sync_events(struct kvm *kvm)
9848{
332967a3 9849 cancel_delayed_work_sync(&kvm->arch.kvmclock_sync_work);
7e44e449 9850 cancel_delayed_work_sync(&kvm->arch.kvmclock_update_work);
aea924f6 9851 kvm_free_pit(kvm);
ad8ba2cd
SY
9852}
9853
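/*
 * Create or delete one of KVM's internal memslots (APIC access page,
 * identity page table, TSS).  A non-zero size mmap()s an anonymous
 * shared region and installs it in every address space; size == 0
 * deletes the slot and munmap()s the old backing, stuffing a
 * non-canonical hva to catch use-after-delete.
 */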
1d8007bd 9854int __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size)
9da0e4d5
PB
9855{
9856 int i, r;
0577d1ab 9857 unsigned long hva, uninitialized_var(old_npages);
f0d648bd 9858 struct kvm_memslots *slots = kvm_memslots(kvm);
0577d1ab 9859 struct kvm_memory_slot *slot;
9da0e4d5
PB
9860
9861 /* Called with kvm->slots_lock held. */
1d8007bd
PB
9862 if (WARN_ON(id >= KVM_MEM_SLOTS_NUM))
9863 return -EINVAL;
9da0e4d5 9864
f0d648bd
PB
9865 slot = id_to_memslot(slots, id);
9866 if (size) {
0577d1ab 9867 if (slot && slot->npages)
f0d648bd
PB
9868 return -EEXIST;
9869
9870 /*
9871 * MAP_SHARED to prevent internal slot pages from being moved
9872 * by fork()/COW.
9873 */
9874 hva = vm_mmap(NULL, 0, size, PROT_READ | PROT_WRITE,
9875 MAP_SHARED | MAP_ANONYMOUS, 0);
9876 if (IS_ERR((void *)hva))
9877 return PTR_ERR((void *)hva);
9878 } else {
0577d1ab 9879 if (!slot || !slot->npages)
f0d648bd
PB
9880 return 0;
9881
abbed4fa
SC
9882 /*
9883 * Stuff a non-canonical value to catch use-after-delete. This
9884 * ends up being 0 on 32-bit KVM, but there's no better
9885 * alternative.
9886 */
9887 hva = (unsigned long)(0xdeadull << 48);
0577d1ab 9888 old_npages = slot->npages;
f0d648bd
PB
9889 }
9890
9da0e4d5 9891 for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
1d8007bd 9892 struct kvm_userspace_memory_region m;
9da0e4d5 9893
1d8007bd
PB
9894 m.slot = id | (i << 16);
9895 m.flags = 0;
9896 m.guest_phys_addr = gpa;
f0d648bd 9897 m.userspace_addr = hva;
1d8007bd 9898 m.memory_size = size;
9da0e4d5
PB
9899 r = __kvm_set_memory_region(kvm, &m);
9900 if (r < 0)
9901 return r;
9902 }
9903
103c763c 9904 if (!size)
0577d1ab 9905 vm_munmap(hva, old_npages * PAGE_SIZE);
f0d648bd 9906
9da0e4d5
PB
9907 return 0;
9908}
9909EXPORT_SYMBOL_GPL(__x86_set_memory_region);
9910
1aa9b957
JS
9911void kvm_arch_pre_destroy_vm(struct kvm *kvm)
9912{
9913 kvm_mmu_pre_destroy_vm(kvm);
9914}
9915
d19a9cd2
ZX
9916void kvm_arch_destroy_vm(struct kvm *kvm)
9917{
27469d29
AH
9918 if (current->mm == kvm->mm) {
9919 /*
9920 * Free memory regions allocated on behalf of userspace,
9921 * unless the the memory map has changed due to process exit
9922 * or fd copying.
9923 */
6a3c623b
PX
9924 mutex_lock(&kvm->slots_lock);
9925 __x86_set_memory_region(kvm, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT,
9926 0, 0);
9927 __x86_set_memory_region(kvm, IDENTITY_PAGETABLE_PRIVATE_MEMSLOT,
9928 0, 0);
9929 __x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT, 0, 0);
9930 mutex_unlock(&kvm->slots_lock);
27469d29 9931 }
afaf0b2f
SC
9932 if (kvm_x86_ops.vm_destroy)
9933 kvm_x86_ops.vm_destroy(kvm);
c761159c
PX
9934 kvm_pic_destroy(kvm);
9935 kvm_ioapic_destroy(kvm);
d19a9cd2 9936 kvm_free_vcpus(kvm);
af1bae54 9937 kvfree(rcu_dereference_check(kvm->arch.apic_map, 1));
66bb8a06 9938 kfree(srcu_dereference_check(kvm->arch.pmu_event_filter, &kvm->srcu, 1));
13d268ca 9939 kvm_mmu_uninit_vm(kvm);
2beb6dad 9940 kvm_page_track_cleanup(kvm);
cbc0236a 9941 kvm_hv_destroy_vm(kvm);
d19a9cd2 9942}
0de10343 9943
e96c81ee 9944void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot)
db3fe4eb
TY
9945{
9946 int i;
9947
d89cc617 9948 for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) {
e96c81ee
SC
9949 kvfree(slot->arch.rmap[i]);
9950 slot->arch.rmap[i] = NULL;
9951
d89cc617
TY
9952 if (i == 0)
9953 continue;
9954
e96c81ee
SC
9955 kvfree(slot->arch.lpage_info[i - 1]);
9956 slot->arch.lpage_info[i - 1] = NULL;
db3fe4eb 9957 }
21ebbeda 9958
e96c81ee 9959 kvm_page_track_free_memslot(slot);
db3fe4eb
TY
9960}
9961
0dab98b7
SC
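/*
 * Allocate per-memslot metadata: an rmap array for every page size and
 * lpage_info for the large-page levels.  Head and tail pages that are
 * not large-page aligned, and slots whose gfn and userspace address are
 * misaligned with respect to each other, get large pages disallowed up
 * front.
 */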
9962static int kvm_alloc_memslot_metadata(struct kvm_memory_slot *slot,
9963 unsigned long npages)
db3fe4eb
TY
9964{
9965 int i;
9966
edd4fa37
SC
9967 /*
9968 * Clear out the previous array pointers for the KVM_MR_MOVE case. The
9969 * old arrays will be freed by __kvm_set_memory_region() if installing
9970 * the new memslot is successful.
9971 */
9972 memset(&slot->arch, 0, sizeof(slot->arch));
9973
d89cc617 9974 for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) {
92f94f1e 9975 struct kvm_lpage_info *linfo;
db3fe4eb
TY
9976 unsigned long ugfn;
9977 int lpages;
d89cc617 9978 int level = i + 1;
db3fe4eb
TY
9979
9980 lpages = gfn_to_index(slot->base_gfn + npages - 1,
9981 slot->base_gfn, level) + 1;
9982
d89cc617 9983 slot->arch.rmap[i] =
778e1cdd 9984 kvcalloc(lpages, sizeof(*slot->arch.rmap[i]),
254272ce 9985 GFP_KERNEL_ACCOUNT);
d89cc617 9986 if (!slot->arch.rmap[i])
77d11309 9987 goto out_free;
d89cc617
TY
9988 if (i == 0)
9989 continue;
77d11309 9990
254272ce 9991 linfo = kvcalloc(lpages, sizeof(*linfo), GFP_KERNEL_ACCOUNT);
92f94f1e 9992 if (!linfo)
db3fe4eb
TY
9993 goto out_free;
9994
92f94f1e
XG
9995 slot->arch.lpage_info[i - 1] = linfo;
9996
db3fe4eb 9997 if (slot->base_gfn & (KVM_PAGES_PER_HPAGE(level) - 1))
92f94f1e 9998 linfo[0].disallow_lpage = 1;
db3fe4eb 9999 if ((slot->base_gfn + npages) & (KVM_PAGES_PER_HPAGE(level) - 1))
92f94f1e 10000 linfo[lpages - 1].disallow_lpage = 1;
db3fe4eb
TY
10001 ugfn = slot->userspace_addr >> PAGE_SHIFT;
10002 /*
10003 * If the gfn and userspace address are not aligned wrt each
600087b6 10004 * other, disable large page support for this slot.
db3fe4eb 10005 */
600087b6 10006 if ((slot->base_gfn ^ ugfn) & (KVM_PAGES_PER_HPAGE(level) - 1)) {
db3fe4eb
TY
10007 unsigned long j;
10008
10009 for (j = 0; j < lpages; ++j)
92f94f1e 10010 linfo[j].disallow_lpage = 1;
db3fe4eb
TY
10011 }
10012 }
10013
21ebbeda
XG
10014 if (kvm_page_track_create_memslot(slot, npages))
10015 goto out_free;
10016
db3fe4eb
TY
10017 return 0;
10018
10019out_free:
d89cc617 10020 for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) {
548ef284 10021 kvfree(slot->arch.rmap[i]);
d89cc617
TY
10022 slot->arch.rmap[i] = NULL;
10023 if (i == 0)
10024 continue;
10025
548ef284 10026 kvfree(slot->arch.lpage_info[i - 1]);
d89cc617 10027 slot->arch.lpage_info[i - 1] = NULL;
db3fe4eb
TY
10028 }
10029 return -ENOMEM;
10030}
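As a rough illustration of the sizing done in kvm_alloc_memslot_metadata(), the standalone sketch below recomputes, for a deliberately misaligned slot, how many lpage_info entries each level needs and which boundary entries would get disallow_lpage set. It assumes the usual x86 layout of 9 gfn bits per paging level and uses made-up helper names; it is not kernel code.

#include <stdint.h>
#include <stdio.h>

typedef uint64_t gfn_t;

/* Illustrative re-derivation of the sizing logic above, assuming 9 gfn bits
 * per paging level (512 4K pages per 2M page, 512 2M pages per 1G page). */
static unsigned int hpage_gfn_shift(int level) { return (level - 1) * 9; }

static unsigned long lpage_count(gfn_t base_gfn, unsigned long npages, int level)
{
	unsigned int s = hpage_gfn_shift(level);

	/* Same arithmetic as gfn_to_index(base + npages - 1, base, level) + 1. */
	return ((base_gfn + npages - 1) >> s) - (base_gfn >> s) + 1;
}

int main(void)
{
	gfn_t base_gfn = 0x1003;      /* deliberately misaligned slot */
	unsigned long npages = 0x800; /* 8M worth of 4K pages */

	for (int level = 2; level <= 3; level++) {
		unsigned long pages_per_hpage = 1UL << hpage_gfn_shift(level);
		unsigned long n = lpage_count(base_gfn, npages, level);

		/* Head/tail entries that straddle the slot boundary get
		 * disallow_lpage = 1, like linfo[0]/linfo[lpages - 1] above. */
		int head_disallowed = (base_gfn & (pages_per_hpage - 1)) != 0;
		int tail_disallowed = ((base_gfn + npages) & (pages_per_hpage - 1)) != 0;

		printf("level %d: %lu entries, head disallowed=%d, tail disallowed=%d\n",
		       level, n, head_disallowed, tail_disallowed);
	}
	return 0;
}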
10031
15248258 10032void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen)
e59dbe09 10033{
91724814
BO
10034 struct kvm_vcpu *vcpu;
10035 int i;
10036
e6dff7d1
TY
10037 /*
10038 * memslots->generation has been incremented.
10039 * mmio generation may have reached its maximum value.
10040 */
15248258 10041 kvm_mmu_invalidate_mmio_sptes(kvm, gen);
91724814
BO
10042
10043 /* Force re-initialization of steal_time cache */
10044 kvm_for_each_vcpu(i, vcpu, kvm)
10045 kvm_vcpu_kick(vcpu);
e59dbe09
TY
10046}
10047
f7784b8e
MT
10048int kvm_arch_prepare_memory_region(struct kvm *kvm,
10049 struct kvm_memory_slot *memslot,
09170a49 10050 const struct kvm_userspace_memory_region *mem,
7b6195a9 10051 enum kvm_mr_change change)
0de10343 10052{
0dab98b7
SC
10053 if (change == KVM_MR_CREATE || change == KVM_MR_MOVE)
10054 return kvm_alloc_memslot_metadata(memslot,
10055 mem->memory_size >> PAGE_SHIFT);
f7784b8e
MT
10056 return 0;
10057}
10058
88178fd4
KH
10059static void kvm_mmu_slot_apply_flags(struct kvm *kvm,
10060 struct kvm_memory_slot *new)
10061{
10062 /* Still write protect RO slot */
10063 if (new->flags & KVM_MEM_READONLY) {
3c9bd400 10064 kvm_mmu_slot_remove_write_access(kvm, new, PT_PAGE_TABLE_LEVEL);
88178fd4
KH
10065 return;
10066 }
10067
10068 /*
10069 * Call kvm_x86_ops dirty logging hooks when they are valid.
10070 *
afaf0b2f 10071 * kvm_x86_ops.slot_disable_log_dirty is called when:
88178fd4
KH
10072 *
10073 * - KVM_MR_CREATE with dirty logging is disabled
10074 * - KVM_MR_FLAGS_ONLY with dirty logging is disabled in new flag
10075 *
10076 * The reason is, in case of PML, we need to set D-bit for any slots
10077 * with dirty logging disabled in order to eliminate unnecessary GPA
0a03cbda 10078 * logging in PML buffer (and potential PML buffer full VMEXIT). This
88178fd4 10079 * guarantees leaving PML enabled during guest's lifetime won't have
bdd303cb 10080 * any additional overhead from PML when guest is running with dirty
88178fd4
KH
10081 * logging disabled for memory slots.
10082 *
afaf0b2f 10083 * kvm_x86_ops.slot_enable_log_dirty is called when switching new slot
88178fd4
KH
10084 * to dirty logging mode.
10085 *
10086 * If kvm_x86_ops dirty logging hooks are invalid, use write protect.
10087 *
10088 * In case of write protect:
10089 *
10090 * Write protect all pages for dirty logging.
10091 *
10092 * All the sptes including the large sptes which point to this
10093 * slot are set to readonly. We can not create any new large
10094 * spte on this slot until the end of the logging.
10095 *
10096 * See the comments in fast_page_fault().
10097 */
10098 if (new->flags & KVM_MEM_LOG_DIRTY_PAGES) {
afaf0b2f
SC
10099 if (kvm_x86_ops.slot_enable_log_dirty) {
10100 kvm_x86_ops.slot_enable_log_dirty(kvm, new);
3c9bd400
JZ
10101 } else {
10102 int level =
10103 kvm_dirty_log_manual_protect_and_init_set(kvm) ?
10104 PT_DIRECTORY_LEVEL : PT_PAGE_TABLE_LEVEL;
10105
10106 /*
10107 * If we're using initial-all-set, we don't need
10108 * to write protect any small page because
10109 * they're reported as dirty already. However
10110 * we still need to write-protect huge pages
10111 * so that the page split can happen lazily on
10112 * the first write to the huge page.
10113 */
10114 kvm_mmu_slot_remove_write_access(kvm, new, level);
10115 }
88178fd4 10116 } else {
afaf0b2f
SC
10117 if (kvm_x86_ops.slot_disable_log_dirty)
10118 kvm_x86_ops.slot_disable_log_dirty(kvm, new);
88178fd4
KH
10119 }
10120}
10121
f7784b8e 10122void kvm_arch_commit_memory_region(struct kvm *kvm,
09170a49 10123 const struct kvm_userspace_memory_region *mem,
9d4c197c 10124 struct kvm_memory_slot *old,
f36f3f28 10125 const struct kvm_memory_slot *new,
8482644a 10126 enum kvm_mr_change change)
f7784b8e 10127{
48c0e4e9 10128 if (!kvm->arch.n_requested_mmu_pages)
4d66623c
WY
10129 kvm_mmu_change_mmu_pages(kvm,
10130 kvm_mmu_calculate_default_mmu_pages(kvm));
1c91cad4 10131
3ea3b7fa
WL
10132 /*
10133 * Dirty logging tracks sptes in 4k granularity, meaning that large
10134 * sptes have to be split. If live migration is successful, the guest
10135 * in the source machine will be destroyed and large sptes will be
10136 * created in the destination. However, if the guest continues to run
10137 * in the source machine (for example if live migration fails), small
10138 * sptes will remain around and cause bad performance.
10139 *
10140 * Scan sptes if dirty logging has been stopped, dropping those
10141 * which can be collapsed into a single large-page spte. Later
10142 * page faults will create the large-page sptes.
319109a2
SC
10143 *
10144 * There is no need to do this in any of the following cases:
10145 * CREATE: No dirty mappings will already exist.
10146 * MOVE/DELETE: The old mappings will already have been cleaned up by
10147 * kvm_arch_flush_shadow_memslot()
3ea3b7fa 10148 */
319109a2 10149 if (change == KVM_MR_FLAGS_ONLY &&
3ea3b7fa
WL
10150 (old->flags & KVM_MEM_LOG_DIRTY_PAGES) &&
10151 !(new->flags & KVM_MEM_LOG_DIRTY_PAGES))
10152 kvm_mmu_zap_collapsible_sptes(kvm, new);
10153
c972f3b1 10154 /*
88178fd4 10155 * Set up write protection and/or dirty logging for the new slot.
c126d94f 10156 *
88178fd4
KH
10157 * For KVM_MR_DELETE and KVM_MR_MOVE, the shadow pages of the old slot have
10158 * been zapped, so no dirty logging work is needed for the old slot. For
10159 * KVM_MR_FLAGS_ONLY, the old slot is essentially the same one as the
10160 * new and it's also covered when dealing with the new slot.
f36f3f28
PB
10161 *
10162 * FIXME: const-ify all uses of struct kvm_memory_slot.
c972f3b1 10163 */
88178fd4 10164 if (change != KVM_MR_DELETE)
f36f3f28 10165 kvm_mmu_slot_apply_flags(kvm, (struct kvm_memory_slot *) new);
21198846
SC
10166
10167 /* Free the arrays associated with the old memslot. */
10168 if (change == KVM_MR_MOVE)
e96c81ee 10169 kvm_arch_free_memslot(kvm, old);
0de10343 10170}
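The zap of collapsible sptes above is gated on a flags-only update that turns dirty logging off. The tiny standalone sketch below just restates that predicate with a few sample inputs; the constants mirror the KVM UAPI values rather than being included from kernel headers, and the helper name is invented for the example.

#include <stdio.h>

/* Illustrative, userspace-only mirror of the check above. */
#define LOG_DIRTY_PAGES (1u << 0)	/* mirrors KVM_MEM_LOG_DIRTY_PAGES */

enum mr_change { MR_CREATE, MR_DELETE, MR_MOVE, MR_FLAGS_ONLY };

static int should_zap_collapsible(enum mr_change change,
				  unsigned int old_flags, unsigned int new_flags)
{
	return change == MR_FLAGS_ONLY &&
	       (old_flags & LOG_DIRTY_PAGES) &&
	       !(new_flags & LOG_DIRTY_PAGES);
}

int main(void)
{
	/* Dirty logging switched off in place: collapsible sptes get zapped. */
	printf("%d\n", should_zap_collapsible(MR_FLAGS_ONLY, LOG_DIRTY_PAGES, 0)); /* 1 */
	/* Freshly created slot: nothing to collapse yet. */
	printf("%d\n", should_zap_collapsible(MR_CREATE, 0, 0));                    /* 0 */
	/* Logging switched on: large pages are split lazily instead. */
	printf("%d\n", should_zap_collapsible(MR_FLAGS_ONLY, 0, LOG_DIRTY_PAGES));  /* 0 */
	return 0;
}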
1d737c8a 10171
2df72e9b 10172void kvm_arch_flush_shadow_all(struct kvm *kvm)
34d4cb8f 10173{
7390de1e 10174 kvm_mmu_zap_all(kvm);
34d4cb8f
MT
10175}
10176
2df72e9b
MT
10177void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
10178 struct kvm_memory_slot *slot)
10179{
ae7cd873 10180 kvm_page_track_flush_slot(kvm, slot);
2df72e9b
MT
10181}
10182
e6c67d8c
LA
10183static inline bool kvm_guest_apic_has_interrupt(struct kvm_vcpu *vcpu)
10184{
10185 return (is_guest_mode(vcpu) &&
afaf0b2f
SC
10186 kvm_x86_ops.guest_apic_has_interrupt &&
10187 kvm_x86_ops.guest_apic_has_interrupt(vcpu));
e6c67d8c
LA
10188}
10189
5d9bc648
PB
10190static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu)
10191{
10192 if (!list_empty_careful(&vcpu->async_pf.done))
10193 return true;
10194
10195 if (kvm_apic_has_events(vcpu))
10196 return true;
10197
10198 if (vcpu->arch.pv.pv_unhalted)
10199 return true;
10200
a5f01f8e
WL
10201 if (vcpu->arch.exception.pending)
10202 return true;
10203
47a66eed
Z
10204 if (kvm_test_request(KVM_REQ_NMI, vcpu) ||
10205 (vcpu->arch.nmi_pending &&
afaf0b2f 10206 kvm_x86_ops.nmi_allowed(vcpu)))
5d9bc648
PB
10207 return true;
10208
47a66eed 10209 if (kvm_test_request(KVM_REQ_SMI, vcpu) ||
a9fa7cb6
PB
10210 (vcpu->arch.smi_pending &&
10211 kvm_x86_ops.smi_allowed(vcpu)))
73917739
PB
10212 return true;
10213
5d9bc648 10214 if (kvm_arch_interrupt_allowed(vcpu) &&
e6c67d8c
LA
10215 (kvm_cpu_has_interrupt(vcpu) ||
10216 kvm_guest_apic_has_interrupt(vcpu)))
5d9bc648
PB
10217 return true;
10218
1f4b34f8
AS
10219 if (kvm_hv_has_stimer_pending(vcpu))
10220 return true;
10221
d2060bd4
SC
10222 if (is_guest_mode(vcpu) &&
10223 kvm_x86_ops.nested_ops->hv_timer_pending &&
10224 kvm_x86_ops.nested_ops->hv_timer_pending(vcpu))
10225 return true;
10226
5d9bc648
PB
10227 return false;
10228}
10229
1d737c8a
ZX
10230int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
10231{
5d9bc648 10232 return kvm_vcpu_running(vcpu) || kvm_vcpu_has_events(vcpu);
1d737c8a 10233}
5736199a 10234
17e433b5
WL
10235bool kvm_arch_dy_runnable(struct kvm_vcpu *vcpu)
10236{
10237 if (READ_ONCE(vcpu->arch.pv.pv_unhalted))
10238 return true;
10239
10240 if (kvm_test_request(KVM_REQ_NMI, vcpu) ||
10241 kvm_test_request(KVM_REQ_SMI, vcpu) ||
10242 kvm_test_request(KVM_REQ_EVENT, vcpu))
10243 return true;
10244
afaf0b2f 10245 if (vcpu->arch.apicv_active && kvm_x86_ops.dy_apicv_has_pending_interrupt(vcpu))
17e433b5
WL
10246 return true;
10247
10248 return false;
10249}
10250
199b5763
LM
10251bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu)
10252{
de63ad4c 10253 return vcpu->arch.preempted_in_kernel;
199b5763
LM
10254}
10255
b6d33834 10256int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu)
5736199a 10257{
b6d33834 10258 return kvm_vcpu_exiting_guest_mode(vcpu) == IN_GUEST_MODE;
5736199a 10259}
78646121
GN
10260
10261int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu)
10262{
afaf0b2f 10263 return kvm_x86_ops.interrupt_allowed(vcpu);
78646121 10264}
229456fc 10265
82b32774 10266unsigned long kvm_get_linear_rip(struct kvm_vcpu *vcpu)
f92653ee 10267{
82b32774
NA
10268 if (is_64_bit_mode(vcpu))
10269 return kvm_rip_read(vcpu);
10270 return (u32)(get_segment_base(vcpu, VCPU_SREG_CS) +
10271 kvm_rip_read(vcpu));
10272}
10273EXPORT_SYMBOL_GPL(kvm_get_linear_rip);
f92653ee 10274
82b32774
NA
10275bool kvm_is_linear_rip(struct kvm_vcpu *vcpu, unsigned long linear_rip)
10276{
10277 return kvm_get_linear_rip(vcpu) == linear_rip;
f92653ee
JK
10278}
10279EXPORT_SYMBOL_GPL(kvm_is_linear_rip);
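For reference, the same linear-RIP arithmetic can be sketched in a few lines of standalone C: outside 64-bit mode the CS segment base is added and the result is truncated to 32 bits, while in 64-bit mode the raw RIP is already linear. The helper below is illustrative only, not kernel code.

#include <stdint.h>
#include <stdio.h>

/* Illustrative sketch of the linear-RIP calculation above. */
static uint64_t linear_rip(int is_64_bit_mode, uint64_t cs_base, uint64_t rip)
{
	if (is_64_bit_mode)
		return rip;
	return (uint32_t)(cs_base + rip);
}

int main(void)
{
	/* 16/32-bit guest: CS base 0xf0000, IP 0xfff0 -> linear 0xffff0. */
	printf("0x%llx\n", (unsigned long long)linear_rip(0, 0xf0000, 0xfff0));
	/* 64-bit guest: CS base is ignored. */
	printf("0x%llx\n", (unsigned long long)linear_rip(1, 0xf0000, 0xffffffff81000000ull));
	return 0;
}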
10280
94fe45da
JK
10281unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu)
10282{
10283 unsigned long rflags;
10284
afaf0b2f 10285 rflags = kvm_x86_ops.get_rflags(vcpu);
94fe45da 10286 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
c310bac5 10287 rflags &= ~X86_EFLAGS_TF;
94fe45da
JK
10288 return rflags;
10289}
10290EXPORT_SYMBOL_GPL(kvm_get_rflags);
10291
6addfc42 10292static void __kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
94fe45da
JK
10293{
10294 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP &&
f92653ee 10295 kvm_is_linear_rip(vcpu, vcpu->arch.singlestep_rip))
c310bac5 10296 rflags |= X86_EFLAGS_TF;
afaf0b2f 10297 kvm_x86_ops.set_rflags(vcpu, rflags);
6addfc42
PB
10298}
10299
10300void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
10301{
10302 __kvm_set_rflags(vcpu, rflags);
3842d135 10303 kvm_make_request(KVM_REQ_EVENT, vcpu);
94fe45da
JK
10304}
10305EXPORT_SYMBOL_GPL(kvm_set_rflags);
10306
56028d08
GN
10307void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work)
10308{
10309 int r;
10310
44dd3ffa 10311 if ((vcpu->arch.mmu->direct_map != work->arch.direct_map) ||
f2e10669 10312 work->wakeup_all)
56028d08
GN
10313 return;
10314
10315 r = kvm_mmu_reload(vcpu);
10316 if (unlikely(r))
10317 return;
10318
44dd3ffa 10319 if (!vcpu->arch.mmu->direct_map &&
d8dd54e0 10320 work->arch.cr3 != vcpu->arch.mmu->get_guest_pgd(vcpu))
fb67e14f
XG
10321 return;
10322
7a02674d 10323 kvm_mmu_do_page_fault(vcpu, work->cr2_or_gpa, 0, true);
56028d08
GN
10324}
10325
af585b92
GN
10326static inline u32 kvm_async_pf_hash_fn(gfn_t gfn)
10327{
10328 return hash_32(gfn & 0xffffffff, order_base_2(ASYNC_PF_PER_VCPU));
10329}
10330
10331static inline u32 kvm_async_pf_next_probe(u32 key)
10332{
10333 return (key + 1) & (roundup_pow_of_two(ASYNC_PF_PER_VCPU) - 1);
10334}
10335
10336static void kvm_add_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
10337{
10338 u32 key = kvm_async_pf_hash_fn(gfn);
10339
10340 while (vcpu->arch.apf.gfns[key] != ~0)
10341 key = kvm_async_pf_next_probe(key);
10342
10343 vcpu->arch.apf.gfns[key] = gfn;
10344}
10345
10346static u32 kvm_async_pf_gfn_slot(struct kvm_vcpu *vcpu, gfn_t gfn)
10347{
10348 int i;
10349 u32 key = kvm_async_pf_hash_fn(gfn);
10350
10351 for (i = 0; i < roundup_pow_of_two(ASYNC_PF_PER_VCPU) &&
c7d28c24
XG
10352 (vcpu->arch.apf.gfns[key] != gfn &&
10353 vcpu->arch.apf.gfns[key] != ~0); i++)
af585b92
GN
10354 key = kvm_async_pf_next_probe(key);
10355
10356 return key;
10357}
10358
10359bool kvm_find_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
10360{
10361 return vcpu->arch.apf.gfns[kvm_async_pf_gfn_slot(vcpu, gfn)] == gfn;
10362}
10363
10364static void kvm_del_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
10365{
10366 u32 i, j, k;
10367
10368 i = j = kvm_async_pf_gfn_slot(vcpu, gfn);
10369 while (true) {
10370 vcpu->arch.apf.gfns[i] = ~0;
10371 do {
10372 j = kvm_async_pf_next_probe(j);
10373 if (vcpu->arch.apf.gfns[j] == ~0)
10374 return;
10375 k = kvm_async_pf_hash_fn(vcpu->arch.apf.gfns[j]);
10376 /*
10377 * k lies cyclically in ]i,j]
10378 * | i.k.j |
10379 * |....j i.k.| or |.k..j i...|
10380 */
10381 } while ((i <= j) ? (i < k && k <= j) : (i < k || k <= j));
10382 vcpu->arch.apf.gfns[i] = vcpu->arch.apf.gfns[j];
10383 i = j;
10384 }
10385}
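The three apf.gfns helpers above implement a small open-addressed hash table: linear probing on insert/lookup, ~0 as the empty marker, and the classic backfill on delete so that no probe chain is ever broken by a hole. The standalone sketch below mirrors that logic on a tiny 8-entry table with a different (made-up) hash function, purely to make the backfill condition easier to play with; it is not the kernel implementation.

#include <stdint.h>
#include <stdio.h>

/* Standalone sketch of the gfn hash helpers above. Table size is a small
 * power of two here purely for illustration (the kernel uses
 * ASYNC_PF_PER_VCPU slots and hash_32()). */
#define TABLE_SIZE 8u		/* must be a power of two */
#define EMPTY (~0ull)

static uint64_t table[TABLE_SIZE];

static uint32_t hash_fn(uint64_t gfn)    { return (uint32_t)(gfn * 2654435761u) & (TABLE_SIZE - 1); }
static uint32_t next_probe(uint32_t key) { return (key + 1) & (TABLE_SIZE - 1); }

static void add_gfn(uint64_t gfn)
{
	uint32_t key = hash_fn(gfn);

	while (table[key] != EMPTY)
		key = next_probe(key);
	table[key] = gfn;
}

static uint32_t gfn_slot(uint64_t gfn)
{
	uint32_t key = hash_fn(gfn);
	uint32_t i;

	for (i = 0; i < TABLE_SIZE && table[key] != gfn && table[key] != EMPTY; i++)
		key = next_probe(key);
	return key;
}

static void del_gfn(uint64_t gfn)
{
	uint32_t i, j, k;

	i = j = gfn_slot(gfn);
	while (1) {
		table[i] = EMPTY;
		do {
			j = next_probe(j);
			if (table[j] == EMPTY)
				return;
			k = hash_fn(table[j]);
			/* keep scanning while k lies cyclically in (i, j] */
		} while ((i <= j) ? (i < k && k <= j) : (i < k || k <= j));
		/* table[j]'s probe chain would cross the hole at i: move it up */
		table[i] = table[j];
		i = j;
	}
}

int main(void)
{
	for (uint32_t i = 0; i < TABLE_SIZE; i++)
		table[i] = EMPTY;

	add_gfn(0x100); add_gfn(0x200); add_gfn(0x300);
	del_gfn(0x200);
	printf("0x100 present: %d\n", table[gfn_slot(0x100)] == 0x100);
	printf("0x300 present: %d\n", table[gfn_slot(0x300)] == 0x300);
	printf("0x200 present: %d\n", table[gfn_slot(0x200)] == 0x200);
	return 0;
}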
10386
7c90705b
GN
10387static int apf_put_user(struct kvm_vcpu *vcpu, u32 val)
10388{
4e335d9e
PB
10389
10390 return kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.apf.data, &val,
10391 sizeof(val));
7c90705b
GN
10392}
10393
9a6e7c39
WL
10394static int apf_get_user(struct kvm_vcpu *vcpu, u32 *val)
10395{
10396
10397 return kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.apf.data, val,
10398 sizeof(u32));
10399}
10400
1dfdb45e
PB
10401static bool kvm_can_deliver_async_pf(struct kvm_vcpu *vcpu)
10402{
10403 if (!vcpu->arch.apf.delivery_as_pf_vmexit && is_guest_mode(vcpu))
10404 return false;
10405
10406 if (!(vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED) ||
10407 (vcpu->arch.apf.send_user_only &&
afaf0b2f 10408 kvm_x86_ops.get_cpl(vcpu) == 0))
1dfdb45e
PB
10409 return false;
10410
10411 return true;
10412}
10413
10414bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu)
10415{
10416 if (unlikely(!lapic_in_kernel(vcpu) ||
10417 kvm_event_needs_reinjection(vcpu) ||
10418 vcpu->arch.exception.pending))
10419 return false;
10420
10421 if (kvm_hlt_in_guest(vcpu->kvm) && !kvm_can_deliver_async_pf(vcpu))
10422 return false;
10423
10424 /*
10425 * If interrupts are off we cannot even use an artificial
10426 * halt state.
10427 */
afaf0b2f 10428 return kvm_x86_ops.interrupt_allowed(vcpu);
1dfdb45e
PB
10429}
10430
af585b92
GN
10431void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu,
10432 struct kvm_async_pf *work)
10433{
6389ee94
AK
10434 struct x86_exception fault;
10435
736c291c 10436 trace_kvm_async_pf_not_present(work->arch.token, work->cr2_or_gpa);
af585b92 10437 kvm_add_async_pf_gfn(vcpu, work->arch.gfn);
7c90705b 10438
1dfdb45e
PB
10439 if (kvm_can_deliver_async_pf(vcpu) &&
10440 !apf_put_user(vcpu, KVM_PV_REASON_PAGE_NOT_PRESENT)) {
6389ee94
AK
10441 fault.vector = PF_VECTOR;
10442 fault.error_code_valid = true;
10443 fault.error_code = 0;
10444 fault.nested_page_fault = false;
10445 fault.address = work->arch.token;
adfe20fb 10446 fault.async_page_fault = true;
6389ee94 10447 kvm_inject_page_fault(vcpu, &fault);
1dfdb45e
PB
10448 } else {
10449 /*
10450 * It is not possible to deliver a paravirtualized asynchronous
10451 * page fault, but putting the guest in an artificial halt state
10452 * can be beneficial nevertheless: if an interrupt arrives, we
10453 * can deliver it timely and perhaps the guest will schedule
10454 * another process. When the instruction that triggered a page
10455 * fault is retried, hopefully the page will be ready in the host.
10456 */
10457 kvm_make_request(KVM_REQ_APF_HALT, vcpu);
7c90705b 10458 }
af585b92
GN
10459}
10460
10461void kvm_arch_async_page_present(struct kvm_vcpu *vcpu,
10462 struct kvm_async_pf *work)
10463{
6389ee94 10464 struct x86_exception fault;
9a6e7c39 10465 u32 val;
6389ee94 10466
f2e10669 10467 if (work->wakeup_all)
7c90705b
GN
10468 work->arch.token = ~0; /* broadcast wakeup */
10469 else
10470 kvm_del_async_pf_gfn(vcpu, work->arch.gfn);
736c291c 10471 trace_kvm_async_pf_ready(work->arch.token, work->cr2_or_gpa);
7c90705b 10472
9a6e7c39
WL
10473 if (vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED &&
10474 !apf_get_user(vcpu, &val)) {
10475 if (val == KVM_PV_REASON_PAGE_NOT_PRESENT &&
10476 vcpu->arch.exception.pending &&
10477 vcpu->arch.exception.nr == PF_VECTOR &&
10478 !apf_put_user(vcpu, 0)) {
10479 vcpu->arch.exception.injected = false;
10480 vcpu->arch.exception.pending = false;
10481 vcpu->arch.exception.nr = 0;
10482 vcpu->arch.exception.has_error_code = false;
10483 vcpu->arch.exception.error_code = 0;
c851436a
JM
10484 vcpu->arch.exception.has_payload = false;
10485 vcpu->arch.exception.payload = 0;
9a6e7c39
WL
10486 } else if (!apf_put_user(vcpu, KVM_PV_REASON_PAGE_READY)) {
10487 fault.vector = PF_VECTOR;
10488 fault.error_code_valid = true;
10489 fault.error_code = 0;
10490 fault.nested_page_fault = false;
10491 fault.address = work->arch.token;
10492 fault.async_page_fault = true;
10493 kvm_inject_page_fault(vcpu, &fault);
10494 }
7c90705b 10495 }
e6d53e3b 10496 vcpu->arch.apf.halted = false;
a4fa1635 10497 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
7c90705b
GN
10498}
10499
10500bool kvm_arch_can_inject_async_page_present(struct kvm_vcpu *vcpu)
10501{
10502 if (!(vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED))
10503 return true;
10504 else
9bc1f09f 10505 return kvm_can_do_async_pf(vcpu);
af585b92
GN
10506}
10507
5544eb9b
PB
10508void kvm_arch_start_assignment(struct kvm *kvm)
10509{
10510 atomic_inc(&kvm->arch.assigned_device_count);
10511}
10512EXPORT_SYMBOL_GPL(kvm_arch_start_assignment);
10513
10514void kvm_arch_end_assignment(struct kvm *kvm)
10515{
10516 atomic_dec(&kvm->arch.assigned_device_count);
10517}
10518EXPORT_SYMBOL_GPL(kvm_arch_end_assignment);
10519
10520bool kvm_arch_has_assigned_device(struct kvm *kvm)
10521{
10522 return atomic_read(&kvm->arch.assigned_device_count);
10523}
10524EXPORT_SYMBOL_GPL(kvm_arch_has_assigned_device);
10525
e0f0bbc5
AW
10526void kvm_arch_register_noncoherent_dma(struct kvm *kvm)
10527{
10528 atomic_inc(&kvm->arch.noncoherent_dma_count);
10529}
10530EXPORT_SYMBOL_GPL(kvm_arch_register_noncoherent_dma);
10531
10532void kvm_arch_unregister_noncoherent_dma(struct kvm *kvm)
10533{
10534 atomic_dec(&kvm->arch.noncoherent_dma_count);
10535}
10536EXPORT_SYMBOL_GPL(kvm_arch_unregister_noncoherent_dma);
10537
10538bool kvm_arch_has_noncoherent_dma(struct kvm *kvm)
10539{
10540 return atomic_read(&kvm->arch.noncoherent_dma_count);
10541}
10542EXPORT_SYMBOL_GPL(kvm_arch_has_noncoherent_dma);
10543
14717e20
AW
10544bool kvm_arch_has_irq_bypass(void)
10545{
92735b1b 10546 return true;
14717e20
AW
10547}
10548
87276880
FW
10549int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons,
10550 struct irq_bypass_producer *prod)
10551{
10552 struct kvm_kernel_irqfd *irqfd =
10553 container_of(cons, struct kvm_kernel_irqfd, consumer);
10554
14717e20 10555 irqfd->producer = prod;
87276880 10556
afaf0b2f 10557 return kvm_x86_ops.update_pi_irte(irqfd->kvm,
14717e20 10558 prod->irq, irqfd->gsi, 1);
87276880
FW
10559}
10560
10561void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons,
10562 struct irq_bypass_producer *prod)
10563{
10564 int ret;
10565 struct kvm_kernel_irqfd *irqfd =
10566 container_of(cons, struct kvm_kernel_irqfd, consumer);
10567
87276880
FW
10568 WARN_ON(irqfd->producer != prod);
10569 irqfd->producer = NULL;
10570
10571 /*
10572 * When the producer of a consumer is unregistered, we change back to
10573 * remapped mode, so we can re-use the current implementation
bb3541f1 10574 * when the irq is masked/disabled or the consumer side (KVM
87276880
FW
10575 * in this case) doesn't want to receive the interrupts.
10576 */
afaf0b2f 10577 ret = kvm_x86_ops.update_pi_irte(irqfd->kvm, prod->irq, irqfd->gsi, 0);
87276880
FW
10578 if (ret)
10579 printk(KERN_INFO "irq bypass consumer (token %p) unregistration"
10580 " failed: %d\n", irqfd->consumer.token, ret);
10581}
10582
10583int kvm_arch_update_irqfd_routing(struct kvm *kvm, unsigned int host_irq,
10584 uint32_t guest_irq, bool set)
10585{
afaf0b2f 10586 return kvm_x86_ops.update_pi_irte(kvm, host_irq, guest_irq, set);
87276880
FW
10587}
10588
52004014
FW
10589bool kvm_vector_hashing_enabled(void)
10590{
10591 return vector_hashing;
10592}
52004014 10593
2d5ba19b
MT
10594bool kvm_arch_no_poll(struct kvm_vcpu *vcpu)
10595{
10596 return (vcpu->arch.msr_kvm_poll_control & 1) == 0;
10597}
10598EXPORT_SYMBOL_GPL(kvm_arch_no_poll);
10599
6441fa61
PB
10600u64 kvm_spec_ctrl_valid_bits(struct kvm_vcpu *vcpu)
10601{
10602 uint64_t bits = SPEC_CTRL_IBRS | SPEC_CTRL_STIBP | SPEC_CTRL_SSBD;
10603
10604 /* The STIBP bit doesn't fault even if it's not advertised */
10605 if (!guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL) &&
10606 !guest_cpuid_has(vcpu, X86_FEATURE_AMD_IBRS))
10607 bits &= ~(SPEC_CTRL_IBRS | SPEC_CTRL_STIBP);
10608 if (!boot_cpu_has(X86_FEATURE_SPEC_CTRL) &&
10609 !boot_cpu_has(X86_FEATURE_AMD_IBRS))
10610 bits &= ~(SPEC_CTRL_IBRS | SPEC_CTRL_STIBP);
10611
10612 if (!guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL_SSBD) &&
10613 !guest_cpuid_has(vcpu, X86_FEATURE_AMD_SSBD))
10614 bits &= ~SPEC_CTRL_SSBD;
10615 if (!boot_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD) &&
10616 !boot_cpu_has(X86_FEATURE_AMD_SSBD))
10617 bits &= ~SPEC_CTRL_SSBD;
10618
10619 return bits;
10620}
10621EXPORT_SYMBOL_GPL(kvm_spec_ctrl_valid_bits);
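To see how the mask computed above is typically consumed, the standalone sketch below checks a guest WRMSR(IA32_SPEC_CTRL) value against it: any bit outside the valid mask would make the write fail. The bit positions mirror the architectural IA32_SPEC_CTRL layout (IBRS bit 0, STIBP bit 1, SSBD bit 2); the helper name is invented for the example and this is not kernel code.

#include <stdint.h>
#include <stdio.h>

/* Illustrative check of a guest SPEC_CTRL write against the valid-bits mask. */
#define SPEC_CTRL_IBRS  (1ull << 0)
#define SPEC_CTRL_STIBP (1ull << 1)
#define SPEC_CTRL_SSBD  (1ull << 2)

static int spec_ctrl_write_is_valid(uint64_t data, uint64_t valid_bits)
{
	/* Any bit outside the valid mask means the WRMSR would be rejected. */
	return (data & ~valid_bits) == 0;
}

int main(void)
{
	/* e.g. a guest without (AMD_)SSBD in CPUID: SSBD is stripped from the mask. */
	uint64_t valid = SPEC_CTRL_IBRS | SPEC_CTRL_STIBP;

	printf("%d\n", spec_ctrl_write_is_valid(SPEC_CTRL_IBRS, valid));                  /* 1 */
	printf("%d\n", spec_ctrl_write_is_valid(SPEC_CTRL_IBRS | SPEC_CTRL_SSBD, valid)); /* 0 */
	return 0;
}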
2d5ba19b 10622
229456fc 10623EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit);
931c33b1 10624EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_fast_mmio);
229456fc
MT
10625EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq);
10626EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_page_fault);
10627EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_msr);
10628EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_cr);
0ac406de 10629EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmrun);
d8cabddf 10630EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmexit);
17897f36 10631EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmexit_inject);
236649de 10632EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intr_vmexit);
5497b955 10633EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmenter_failed);
ec1ff790 10634EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_invlpga);
532a46b9 10635EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_skinit);
2e554e8d 10636EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intercepts);
489223ed 10637EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_write_tsc_offset);
4f75bcc3 10638EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_ple_window_update);
843e4330 10639EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pml_full);
efc64404 10640EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pi_irte_update);
18f40c53
SS
10641EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_unaccelerated_access);
10642EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_incomplete_ipi);
ab56f8e6 10643EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_ga_log);
24bbf74c 10644EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_apicv_update_request);