KVM: x86: Restrict get_mt_mask() to a u8, use KVM_X86_OP_OPTIONAL_RET0
[linux-2.6-block.git] arch/x86/kvm/svm/svm.c
44a95dae
SS
1#define pr_fmt(fmt) "SVM: " fmt
2
edf88417
AK
3#include <linux/kvm_host.h>
4
85f455f7 5#include "irq.h"
1d737c8a 6#include "mmu.h"
5fdbf976 7#include "kvm_cache_regs.h"
fe4c7b19 8#include "x86.h"
66f7b72e 9#include "cpuid.h"
25462f7f 10#include "pmu.h"
e495606d 11
6aa8b732 12#include <linux/module.h>
ae759544 13#include <linux/mod_devicetable.h>
9d8f549d 14#include <linux/kernel.h>
6aa8b732
AK
15#include <linux/vmalloc.h>
16#include <linux/highmem.h>
ef0f6496 17#include <linux/amd-iommu.h>
e8edc6e0 18#include <linux/sched.h>
af658dca 19#include <linux/trace_events.h>
5a0e3ad6 20#include <linux/slab.h>
5881f737 21#include <linux/hashtable.h>
00089c04 22#include <linux/objtool.h>
e9df0942 23#include <linux/psp-sev.h>
1654efcb 24#include <linux/file.h>
89c50580
BS
25#include <linux/pagemap.h>
26#include <linux/swap.h>
33af3a7e 27#include <linux/rwsem.h>
4d96f910 28#include <linux/cc_platform.h>
6aa8b732 29
8221c137 30#include <asm/apic.h>
1018faa6 31#include <asm/perf_event.h>
67ec6607 32#include <asm/tlbflush.h>
e495606d 33#include <asm/desc.h>
facb0139 34#include <asm/debugreg.h>
631bc487 35#include <asm/kvm_para.h>
411b44ba 36#include <asm/irq_remapping.h>
28a27752 37#include <asm/spec-ctrl.h>
ba5bade4 38#include <asm/cpu_device_id.h>
f1c6366e 39#include <asm/traps.h>
d69c1382 40#include <asm/fpu/api.h>
6aa8b732 41
63d1142f 42#include <asm/virtext.h>
229456fc 43#include "trace.h"
63d1142f 44
883b0a91 45#include "svm.h"
35a78319 46#include "svm_ops.h"
883b0a91 47
1e0c7d40
VP
48#include "kvm_onhyperv.h"
49#include "svm_onhyperv.h"
50
6aa8b732
AK
51MODULE_AUTHOR("Qumranet");
52MODULE_LICENSE("GPL");
53
575b255c 54#ifdef MODULE
ae759544 55static const struct x86_cpu_id svm_cpu_id[] = {
320debe5 56 X86_MATCH_FEATURE(X86_FEATURE_SVM, NULL),
ae759544
JT
57 {}
58};
59MODULE_DEVICE_TABLE(x86cpu, svm_cpu_id);
575b255c 60#endif
ae759544 61
6aa8b732
AK
62#define SEG_TYPE_LDT 2
63#define SEG_TYPE_BUSY_TSS16 3
64
67ec6607
JR
65static bool erratum_383_found __read_mostly;
66
883b0a91 67u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly;
323c3d80 68
2b036c6b
BO
69/*
70 * Set osvw_len to a higher value when updated Revision Guides
71 * are published and we know what the new status bits are.
72 */
73static uint64_t osvw_len = 4, osvw_status;
74
fbc0db76 75static DEFINE_PER_CPU(u64, current_tsc_ratio);
fbc0db76 76
09941fbb 77static const struct svm_direct_access_msrs {
ac72a9b7 78 u32 index; /* Index of the MSR */
376c6d28 79 bool always; /* True if intercept is initially cleared */
fd6fa73d 80} direct_access_msrs[MAX_DIRECT_ACCESS_MSRS] = {
8c06585d 81 { .index = MSR_STAR, .always = true },
ac72a9b7 82 { .index = MSR_IA32_SYSENTER_CS, .always = true },
adc2a237
ML
83 { .index = MSR_IA32_SYSENTER_EIP, .always = false },
84 { .index = MSR_IA32_SYSENTER_ESP, .always = false },
ac72a9b7
JR
85#ifdef CONFIG_X86_64
86 { .index = MSR_GS_BASE, .always = true },
87 { .index = MSR_FS_BASE, .always = true },
88 { .index = MSR_KERNEL_GS_BASE, .always = true },
89 { .index = MSR_LSTAR, .always = true },
90 { .index = MSR_CSTAR, .always = true },
91 { .index = MSR_SYSCALL_MASK, .always = true },
92#endif
b2ac58f9 93 { .index = MSR_IA32_SPEC_CTRL, .always = false },
15d45071 94 { .index = MSR_IA32_PRED_CMD, .always = false },
ac72a9b7
JR
95 { .index = MSR_IA32_LASTBRANCHFROMIP, .always = false },
96 { .index = MSR_IA32_LASTBRANCHTOIP, .always = false },
97 { .index = MSR_IA32_LASTINTFROMIP, .always = false },
98 { .index = MSR_IA32_LASTINTTOIP, .always = false },
376c6d28
TL
99 { .index = MSR_EFER, .always = false },
100 { .index = MSR_IA32_CR_PAT, .always = false },
101 { .index = MSR_AMD64_SEV_ES_GHCB, .always = true },
296d5a17 102 { .index = MSR_TSC_AUX, .always = false },
5c127c85
SS
103 { .index = (APIC_BASE_MSR + APIC_ID), .always = false },
104 { .index = (APIC_BASE_MSR + APIC_LVR), .always = false },
105 { .index = (APIC_BASE_MSR + APIC_TASKPRI), .always = false },
106 { .index = (APIC_BASE_MSR + APIC_ARBPRI), .always = false },
107 { .index = (APIC_BASE_MSR + APIC_PROCPRI), .always = false },
108 { .index = (APIC_BASE_MSR + APIC_EOI), .always = false },
109 { .index = (APIC_BASE_MSR + APIC_RRR), .always = false },
110 { .index = (APIC_BASE_MSR + APIC_LDR), .always = false },
111 { .index = (APIC_BASE_MSR + APIC_DFR), .always = false },
112 { .index = (APIC_BASE_MSR + APIC_SPIV), .always = false },
113 { .index = (APIC_BASE_MSR + APIC_ISR), .always = false },
114 { .index = (APIC_BASE_MSR + APIC_TMR), .always = false },
115 { .index = (APIC_BASE_MSR + APIC_IRR), .always = false },
116 { .index = (APIC_BASE_MSR + APIC_ESR), .always = false },
117 { .index = (APIC_BASE_MSR + APIC_ICR), .always = false },
118 { .index = (APIC_BASE_MSR + APIC_ICR2), .always = false },
119 { .index = (APIC_BASE_MSR + APIC_LVTT), .always = false },
120 { .index = (APIC_BASE_MSR + APIC_LVTTHMR), .always = false },
121 { .index = (APIC_BASE_MSR + APIC_LVTPC), .always = false },
122 { .index = (APIC_BASE_MSR + APIC_LVT0), .always = false },
123 { .index = (APIC_BASE_MSR + APIC_LVT1), .always = false },
124 { .index = (APIC_BASE_MSR + APIC_LVTERR), .always = false },
125 { .index = (APIC_BASE_MSR + APIC_TMICT), .always = false },
126 { .index = (APIC_BASE_MSR + APIC_TMCCT), .always = false },
127 { .index = (APIC_BASE_MSR + APIC_TDCR), .always = false },
ac72a9b7 128 { .index = MSR_INVALID, .always = false },
6c8166a7
AK
129};
130
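/*
 * How the table above is consumed (see the helpers further down in this
 * file): entries with .always = true have their read/write intercepts
 * cleared for every vCPU in svm_vcpu_init_msrpm(), while the remaining
 * entries stay intercepted until opened up on demand, e.g. the LBR MSRs
 * in svm_enable_lbrv() or the x2APIC range in
 * svm_set_x2apic_msr_interception().
 */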
8566ac8b
BM
131/*
132 * These 2 parameters are used to configure the controls for Pause-Loop Exiting:
133 * pause_filter_count: On processors that support Pause filtering (indicated
134 * by CPUID Fn8000_000A_EDX), the VMCB provides a 16-bit pause filter
135 * count value. On VMRUN this value is loaded into an internal counter.
136 * Each time a pause instruction is executed, this counter is decremented
137 * until it reaches zero at which time a #VMEXIT is generated if pause
138 * intercept is enabled. Refer to AMD APM Vol 2 Section 15.14.4 Pause
139 * Intercept Filtering for more details.
140 * This also indicates whether the PLE logic is enabled.
141 *
142 * pause_filter_thresh: In addition, some processor families support advanced
143 * pause filtering (indicated by CPUID Fn8000_000A_EDX), which puts an upper
144 * bound on the amount of time a guest is allowed to execute in a pause loop.
145 * In this mode, a 16-bit pause filter threshold field is added in the
146 * VMCB. The threshold value is a cycle count that is used to reset the
147 * pause counter. As with simple pause filtering, VMRUN loads the pause
148 * count value from VMCB into an internal counter. Then, on each pause
149 * instruction the hardware checks the elapsed number of cycles since
150 * the most recent pause instruction against the pause filter threshold.
151 * If the elapsed cycle count is greater than the pause filter threshold,
152 * then the internal pause count is reloaded from the VMCB and execution
153 * continues. If the elapsed cycle count is less than the pause filter
154 * threshold, then the internal pause count is decremented. If the count
155 * value is less than zero and PAUSE intercept is enabled, a #VMEXIT is
156 * triggered. If advanced pause filtering is supported and pause filter
157 * threshold field is set to zero, the filter will operate in the simpler,
158 * count-only mode.
159 */
160
161static unsigned short pause_filter_thresh = KVM_DEFAULT_PLE_GAP;
162module_param(pause_filter_thresh, ushort, 0444);
163
164static unsigned short pause_filter_count = KVM_SVM_DEFAULT_PLE_WINDOW;
165module_param(pause_filter_count, ushort, 0444);
166
167/* Default doubles per-vcpu window every exit. */
168static unsigned short pause_filter_count_grow = KVM_DEFAULT_PLE_WINDOW_GROW;
169module_param(pause_filter_count_grow, ushort, 0444);
170
171/* Default resets per-vcpu window every exit to pause_filter_count. */
172static unsigned short pause_filter_count_shrink = KVM_DEFAULT_PLE_WINDOW_SHRINK;
173module_param(pause_filter_count_shrink, ushort, 0444);
174
175/* Default is to compute the maximum so we can never overflow. */
176static unsigned short pause_filter_count_max = KVM_SVM_DEFAULT_PLE_WINDOW_MAX;
177module_param(pause_filter_count_max, ushort, 0444);
178
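/*
 * A rough summary of how these parameters interact (see grow_ple_window()
 * and shrink_ple_window() below for the exact arithmetic): pause_filter_count
 * is the initial per-vCPU window, pause_filter_count_grow/_shrink adjust it
 * on PAUSE-intercept exits, and pause_filter_count_max bounds the growth.
 */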
99840a75
SC
179/*
180 * Use nested page tables by default. Note, NPT may get forced off by
181 * svm_hardware_setup() if it's unsupported by hardware or the host kernel.
182 */
183bool npt_enabled = true;
184module_param_named(npt, npt_enabled, bool, 0444);
e3da3acd 185
e2358851
DB
186/* allow nested virtualization in KVM/SVM */
187static int nested = true;
236de055
AG
188module_param(nested, int, S_IRUGO);
189
d647eb63
PB
190/* enable/disable Next RIP Save */
191static int nrips = true;
192module_param(nrips, int, 0444);
193
89c8a498
JN
194/* enable/disable Virtual VMLOAD VMSAVE */
195static int vls = true;
196module_param(vls, int, 0444);
197
640bd6e5 198/* enable/disable Virtual GIF */
ea91559b 199int vgif = true;
640bd6e5 200module_param(vgif, int, 0444);
5ea11f2b 201
4c84926e
ML
202/* enable/disable LBR virtualization */
203static int lbrv = true;
204module_param(lbrv, int, 0444);
205
f800650a
ML
206static int tsc_scaling = true;
207module_param(tsc_scaling, int, 0444);
208
fdf513e3
VK
209/*
210 * enable / disable AVIC. Because the defaults differ for APICv
211 * support between VMX and SVM we cannot use module_param_named.
212 */
213static bool avic;
214module_param(avic, bool, 0444);
215
291bd20d 216bool __read_mostly dump_invalid_vmcb;
6f2f8453
PB
217module_param(dump_invalid_vmcb, bool, 0644);
218
4b639a9f
ML
219
220bool intercept_smi = true;
221module_param(intercept_smi, bool, 0444);
222
223
2e215216 224static bool svm_gp_erratum_intercept = true;
82a11e9c 225
7607b717
BS
226static u8 rsm_ins_bytes[] = "\x0f\xaa";
227
4866d5e3 228static unsigned long iopm_base;
6aa8b732
AK
229
230struct kvm_ldttss_desc {
231 u16 limit0;
232 u16 base0;
e0231715
JR
233 unsigned base1:8, type:5, dpl:2, p:1;
234 unsigned limit1:4, zero0:3, g:1, base2:8;
6aa8b732
AK
235 u32 base3;
236 u32 zero1;
237} __attribute__((packed));
238
eaf78265 239DEFINE_PER_CPU(struct svm_cpu_data *, svm_data);
6aa8b732 240
844d69c2
SC
241/*
242 * Only MSR_TSC_AUX is switched via the user return hook. EFER is switched via
243 * the VMCB, and the SYSCALL/SYSENTER MSRs are handled by VMLOAD/VMSAVE.
244 *
245 * RDTSCP and RDPID are not used in the kernel, specifically to allow KVM to
246 * defer the restoration of TSC_AUX until the CPU returns to userspace.
247 */
0caa0a77 248static int tsc_aux_uret_slot __read_mostly = -1;
844d69c2 249
09941fbb 250static const u32 msrpm_ranges[] = {0, 0xc0000000, 0xc0010000};
6aa8b732 251
9d8f549d 252#define NUM_MSR_MAPS ARRAY_SIZE(msrpm_ranges)
6aa8b732
AK
253#define MSRS_RANGE_SIZE 2048
254#define MSRS_IN_RANGE (MSRS_RANGE_SIZE * 8 / 2)
255
883b0a91 256u32 svm_msrpm_offset(u32 msr)
455716fa
JR
257{
258 u32 offset;
259 int i;
260
261 for (i = 0; i < NUM_MSR_MAPS; i++) {
262 if (msr < msrpm_ranges[i] ||
263 msr >= msrpm_ranges[i] + MSRS_IN_RANGE)
264 continue;
265
266 offset = (msr - msrpm_ranges[i]) / 4; /* 4 msrs per u8 */
267 offset += (i * MSRS_RANGE_SIZE); /* add range offset */
268
269 /* Now we have the u8 offset - but need the u32 offset */
270 return offset / 4;
271 }
272
273 /* MSR not in any range */
274 return MSR_INVALID;
275}
276
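/*
 * Worked example (illustrative, derived from the constants above): for
 * MSR_STAR (0xc0000081) the loop matches msrpm_ranges[1] = 0xc0000000, so
 *
 *   offset  = (0xc0000081 - 0xc0000000) / 4 = 0x020   (4 MSRs per byte)
 *   offset += 1 * MSRS_RANGE_SIZE           = 0x820   (byte offset)
 *   return    0x820 / 4                     = 0x208   (u32 offset)
 */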
4d9c83f5
SC
277static void svm_flush_tlb_current(struct kvm_vcpu *vcpu);
278
1af4a119 279static int get_npt_level(void)
4b16184c
JR
280{
281#ifdef CONFIG_X86_64
43e540cc 282 return pgtable_l5_enabled() ? PT64_ROOT_5LEVEL : PT64_ROOT_4LEVEL;
4b16184c
JR
283#else
284 return PT32E_ROOT_LEVEL;
285#endif
286}
287
72f211ec 288int svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
6aa8b732 289{
c513f484 290 struct vcpu_svm *svm = to_svm(vcpu);
2fcf4876 291 u64 old_efer = vcpu->arch.efer;
6dc696d4 292 vcpu->arch.efer = efer;
9167ab79
PB
293
294 if (!npt_enabled) {
295 /* Shadow paging assumes NX to be available. */
296 efer |= EFER_NX;
297
298 if (!(efer & EFER_LMA))
299 efer &= ~EFER_LME;
300 }
6aa8b732 301
2fcf4876
ML
302 if ((old_efer & EFER_SVME) != (efer & EFER_SVME)) {
303 if (!(efer & EFER_SVME)) {
f7e57078 304 svm_leave_nested(vcpu);
2fcf4876 305 svm_set_gif(svm, true);
82a11e9c
BD
306 /* #GP intercept is still needed for vmware backdoor */
307 if (!enable_vmware_backdoor)
308 clr_exception_intercept(svm, GP_VECTOR);
2fcf4876
ML
309
310 /*
311 * Free the nested guest state, unless we are in SMM.
312 * In this case we will return to the nested guest
313 * as soon as we leave SMM.
314 */
63129754 315 if (!is_smm(vcpu))
2fcf4876
ML
316 svm_free_nested(svm);
317
318 } else {
319 int ret = svm_allocate_nested(svm);
320
321 if (ret) {
322 vcpu->arch.efer = old_efer;
323 return ret;
324 }
82a11e9c 325
0b0be065
SC
326 /*
327 * Never intercept #GP for SEV guests, KVM can't
327 * decrypt guest memory to work around the erratum.
329 */
330 if (svm_gp_erratum_intercept && !sev_guest(vcpu->kvm))
82a11e9c 331 set_exception_intercept(svm, GP_VECTOR);
2fcf4876 332 }
c513f484
PB
333 }
334
335 svm->vmcb->save.efer = efer | EFER_SVME;
06e7852c 336 vmcb_mark_dirty(svm->vmcb, VMCB_CR);
72f211ec 337 return 0;
6aa8b732
AK
338}
339
6aa8b732
AK
340static int is_external_interrupt(u32 info)
341{
342 info &= SVM_EVTINJ_TYPE_MASK | SVM_EVTINJ_VALID;
343 return info == (SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR);
344}
345
37ccdcbe 346static u32 svm_get_interrupt_shadow(struct kvm_vcpu *vcpu)
2809f5d2
GC
347{
348 struct vcpu_svm *svm = to_svm(vcpu);
349 u32 ret = 0;
350
351 if (svm->vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK)
37ccdcbe
PB
352 ret = KVM_X86_SHADOW_INT_STI | KVM_X86_SHADOW_INT_MOV_SS;
353 return ret;
2809f5d2
GC
354}
355
356static void svm_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
357{
358 struct vcpu_svm *svm = to_svm(vcpu);
359
360 if (mask == 0)
361 svm->vmcb->control.int_state &= ~SVM_INTERRUPT_SHADOW_MASK;
362 else
363 svm->vmcb->control.int_state |= SVM_INTERRUPT_SHADOW_MASK;
364
365}
366
6ef88d6e
SC
367static int __svm_skip_emulated_instruction(struct kvm_vcpu *vcpu,
368 bool commit_side_effects)
6aa8b732 369{
a2fa3e9f 370 struct vcpu_svm *svm = to_svm(vcpu);
6ef88d6e 371 unsigned long old_rflags;
a2fa3e9f 372
f1c6366e
TL
373 /*
374 * SEV-ES does not expose the next RIP. The RIP update is controlled by
375 * the type of exit and the #VC handler in the guest.
376 */
377 if (sev_es_guest(vcpu->kvm))
378 goto done;
379
d647eb63 380 if (nrips && svm->vmcb->control.next_rip != 0) {
d2922422 381 WARN_ON_ONCE(!static_cpu_has(X86_FEATURE_NRIPS));
6bc31bdc 382 svm->next_rip = svm->vmcb->control.next_rip;
f104765b 383 }
6bc31bdc 384
1957aa63 385 if (!svm->next_rip) {
6ef88d6e
SC
386 if (unlikely(!commit_side_effects))
387 old_rflags = svm->vmcb->save.rflags;
388
1957aa63
SC
389 if (!kvm_emulate_instruction(vcpu, EMULTYPE_SKIP))
390 return 0;
6ef88d6e
SC
391
392 if (unlikely(!commit_side_effects))
393 svm->vmcb->save.rflags = old_rflags;
1957aa63 394 } else {
1957aa63
SC
395 kvm_rip_write(vcpu, svm->next_rip);
396 }
f1c6366e
TL
397
398done:
6ef88d6e
SC
399 if (likely(commit_side_effects))
400 svm_set_interrupt_shadow(vcpu, 0);
f8ea7c60 401
60fc3d02 402 return 1;
6aa8b732
AK
403}
404
6ef88d6e
SC
405static int svm_skip_emulated_instruction(struct kvm_vcpu *vcpu)
406{
407 return __svm_skip_emulated_instruction(vcpu, true);
408}
409
410static int svm_update_soft_interrupt_rip(struct kvm_vcpu *vcpu)
411{
412 unsigned long rip, old_rip = kvm_rip_read(vcpu);
413 struct vcpu_svm *svm = to_svm(vcpu);
414
415 /*
416 * Due to architectural shortcomings, the CPU doesn't always provide
417 * NextRIP, e.g. if KVM intercepted an exception that occurred while
418 * the CPU was vectoring an INTO/INT3 in the guest. Temporarily skip
419 * the instruction even if NextRIP is supported to acquire the next
420 * RIP so that it can be shoved into the NextRIP field, otherwise
421 * hardware will fail to advance guest RIP during event injection.
422 * Drop the exception/interrupt if emulation fails and effectively
423 * retry the instruction, it's the least awful option. If NRIPS is
424 * in use, the skip must not commit any side effects such as clearing
425 * the interrupt shadow or RFLAGS.RF.
426 */
427 if (!__svm_skip_emulated_instruction(vcpu, !nrips))
428 return -EIO;
429
430 rip = kvm_rip_read(vcpu);
431
432 /*
433 * Save the injection information, even when using next_rip, as the
434 * VMCB's next_rip will be lost (cleared on VM-Exit) if the injection
435 * doesn't complete due to a VM-Exit occurring while the CPU is
436 * vectoring the event. Decoding the instruction isn't guaranteed to
437 * work as there may be no backing instruction, e.g. if the event is
438 * being injected by L1 for L2, or if the guest is patching INT3 into
439 * a different instruction.
440 */
441 svm->soft_int_injected = true;
442 svm->soft_int_csbase = svm->vmcb->save.cs.base;
443 svm->soft_int_old_rip = old_rip;
444 svm->soft_int_next_rip = rip;
445
446 if (nrips)
447 kvm_rip_write(vcpu, old_rip);
448
449 if (static_cpu_has(X86_FEATURE_NRIPS))
450 svm->vmcb->control.next_rip = rip;
451
452 return 0;
453}
454
cfcd20e5 455static void svm_queue_exception(struct kvm_vcpu *vcpu)
116a4752
JK
456{
457 struct vcpu_svm *svm = to_svm(vcpu);
cfcd20e5
WL
458 unsigned nr = vcpu->arch.exception.nr;
459 bool has_error_code = vcpu->arch.exception.has_error_code;
cfcd20e5 460 u32 error_code = vcpu->arch.exception.error_code;
116a4752 461
63129754 462 kvm_deliver_exception_payload(vcpu);
da998b46 463
6ef88d6e
SC
464 if (kvm_exception_is_soft(nr) &&
465 svm_update_soft_interrupt_rip(vcpu))
466 return;
66b7138f 467
116a4752
JK
468 svm->vmcb->control.event_inj = nr
469 | SVM_EVTINJ_VALID
470 | (has_error_code ? SVM_EVTINJ_VALID_ERR : 0)
471 | SVM_EVTINJ_TYPE_EXEPT;
472 svm->vmcb->control.event_inj_err = error_code;
473}
474
67ec6607
JR
475static void svm_init_erratum_383(void)
476{
477 u32 low, high;
478 int err;
479 u64 val;
480
e6ee94d5 481 if (!static_cpu_has_bug(X86_BUG_AMD_TLB_MMATCH))
67ec6607
JR
482 return;
483
484 /* Use _safe variants to not break nested virtualization */
485 val = native_read_msr_safe(MSR_AMD64_DC_CFG, &err);
486 if (err)
487 return;
488
489 val |= (1ULL << 47);
490
491 low = lower_32_bits(val);
492 high = upper_32_bits(val);
493
494 native_write_msr_safe(MSR_AMD64_DC_CFG, low, high);
495
496 erratum_383_found = true;
497}
498
2b036c6b
BO
499static void svm_init_osvw(struct kvm_vcpu *vcpu)
500{
501 /*
502 * Guests should see errata 400 and 415 as fixed (assuming that
503 * HLT and IO instructions are intercepted).
504 */
505 vcpu->arch.osvw.length = (osvw_len >= 3) ? (osvw_len) : 3;
506 vcpu->arch.osvw.status = osvw_status & ~(6ULL);
507
508 /*
509 * By increasing VCPU's osvw.length to 3 we are telling the guest that
510 * all osvw.status bits inside that length, including bit 0 (which is
511 * reserved for erratum 298), are valid. However, if host processor's
512 * osvw_len is 0 then osvw_status[0] carries no information. We need to
513 * be conservative here and therefore we tell the guest that erratum 298
514 * is present (because we really don't know).
515 */
516 if (osvw_len == 0 && boot_cpu_data.x86 == 0x10)
517 vcpu->arch.osvw.status |= 1;
518}
519
6aa8b732
AK
520static int has_svm(void)
521{
63d1142f 522 const char *msg;
6aa8b732 523
63d1142f 524 if (!cpu_has_svm(&msg)) {
ff81ff10 525 printk(KERN_INFO "has_svm: %s\n", msg);
6aa8b732
AK
526 return 0;
527 }
528
4d96f910 529 if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) {
ccd85d90
SC
530 pr_info("KVM is unsupported when running as an SEV guest\n");
531 return 0;
532 }
533
6aa8b732
AK
534 return 1;
535}
536
11d39e8c
ML
537void __svm_write_tsc_multiplier(u64 multiplier)
538{
539 preempt_disable();
540
541 if (multiplier == __this_cpu_read(current_tsc_ratio))
542 goto out;
543
544 wrmsrl(MSR_AMD64_TSC_RATIO, multiplier);
545 __this_cpu_write(current_tsc_ratio, multiplier);
546out:
547 preempt_enable();
548}
549
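/*
 * The per-CPU current_tsc_ratio declared above caches the last value written
 * to MSR_AMD64_TSC_RATIO on this CPU, so re-loading a vCPU whose multiplier
 * is already programmed skips the WRMSR entirely.
 */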
13a34e06 550static void svm_hardware_disable(void)
6aa8b732 551{
fbc0db76 552 /* Make sure we clean up behind us */
f800650a 553 if (tsc_scaling)
11d39e8c 554 __svm_write_tsc_multiplier(SVM_TSC_RATIO_DEFAULT);
fbc0db76 555
2c8dceeb 556 cpu_svm_disable();
1018faa6
JR
557
558 amd_pmu_disable_virt();
6aa8b732
AK
559}
560
13a34e06 561static int svm_hardware_enable(void)
6aa8b732
AK
562{
563
0fe1e009 564 struct svm_cpu_data *sd;
6aa8b732 565 uint64_t efer;
6aa8b732
AK
566 struct desc_struct *gdt;
567 int me = raw_smp_processor_id();
568
10474ae8
AG
569 rdmsrl(MSR_EFER, efer);
570 if (efer & EFER_SVME)
571 return -EBUSY;
572
6aa8b732 573 if (!has_svm()) {
1f5b77f5 574 pr_err("%s: err EOPNOTSUPP on %d\n", __func__, me);
10474ae8 575 return -EINVAL;
6aa8b732 576 }
0fe1e009 577 sd = per_cpu(svm_data, me);
0fe1e009 578 if (!sd) {
1f5b77f5 579 pr_err("%s: svm_data is NULL on %d\n", __func__, me);
10474ae8 580 return -EINVAL;
6aa8b732
AK
581 }
582
0fe1e009
TH
583 sd->asid_generation = 1;
584 sd->max_asid = cpuid_ebx(SVM_CPUID_FUNC) - 1;
585 sd->next_asid = sd->max_asid + 1;
ed3cd233 586 sd->min_asid = max_sev_asid + 1;
6aa8b732 587
45fc8757 588 gdt = get_current_gdt_rw();
0fe1e009 589 sd->tss_desc = (struct kvm_ldttss_desc *)(gdt + GDT_ENTRY_TSS);
6aa8b732 590
9962d032 591 wrmsrl(MSR_EFER, efer | EFER_SVME);
6aa8b732 592
85ca8be9 593 wrmsrl(MSR_VM_HSAVE_PA, __sme_page_pa(sd->save_area));
10474ae8 594
fbc0db76 595 if (static_cpu_has(X86_FEATURE_TSCRATEMSR)) {
f800650a
ML
596 /*
597 * Set the default value, even if we don't use TSC scaling,
598 * to avoid leaving a stale value in the MSR.
599 */
11d39e8c 600 __svm_write_tsc_multiplier(SVM_TSC_RATIO_DEFAULT);
fbc0db76
JR
601 }
602
2b036c6b
BO
603
604 /*
605 * Get OSVW bits.
606 *
607 * Note that it is possible to have a system with mixed processor
608 * revisions and therefore different OSVW bits. If bits are not the same
609 * on different processors then choose the worst case (i.e. if erratum
610 * is present on one processor and not on another then assume that the
611 * erratum is present everywhere).
612 */
613 if (cpu_has(&boot_cpu_data, X86_FEATURE_OSVW)) {
614 uint64_t len, status = 0;
615 int err;
616
617 len = native_read_msr_safe(MSR_AMD64_OSVW_ID_LENGTH, &err);
618 if (!err)
619 status = native_read_msr_safe(MSR_AMD64_OSVW_STATUS,
620 &err);
621
622 if (err)
623 osvw_status = osvw_len = 0;
624 else {
625 if (len < osvw_len)
626 osvw_len = len;
627 osvw_status |= status;
628 osvw_status &= (1ULL << osvw_len) - 1;
629 }
630 } else
631 osvw_status = osvw_len = 0;
632
67ec6607
JR
633 svm_init_erratum_383();
634
1018faa6
JR
635 amd_pmu_enable_virt();
636
10474ae8 637 return 0;
6aa8b732
AK
638}
639
0da1db75
JR
640static void svm_cpu_uninit(int cpu)
641{
a2b2d4bf 642 struct svm_cpu_data *sd = per_cpu(svm_data, cpu);
0da1db75 643
0fe1e009 644 if (!sd)
0da1db75
JR
645 return;
646
a2b2d4bf 647 per_cpu(svm_data, cpu) = NULL;
70cd94e6 648 kfree(sd->sev_vmcbs);
0fe1e009
TH
649 __free_page(sd->save_area);
650 kfree(sd);
0da1db75
JR
651}
652
6aa8b732
AK
653static int svm_cpu_init(int cpu)
654{
0fe1e009 655 struct svm_cpu_data *sd;
b95c221c 656 int ret = -ENOMEM;
6aa8b732 657
0fe1e009
TH
658 sd = kzalloc(sizeof(struct svm_cpu_data), GFP_KERNEL);
659 if (!sd)
b95c221c 660 return ret;
0fe1e009 661 sd->cpu = cpu;
58356767 662 sd->save_area = alloc_page(GFP_KERNEL | __GFP_ZERO);
0fe1e009 663 if (!sd->save_area)
d80b64ff 664 goto free_cpu_data;
b95c221c 665
b95c221c
SC
666 ret = sev_cpu_init(sd);
667 if (ret)
668 goto free_save_area;
70cd94e6 669
0fe1e009 670 per_cpu(svm_data, cpu) = sd;
6aa8b732
AK
671
672 return 0;
673
d80b64ff
ML
674free_save_area:
675 __free_page(sd->save_area);
676free_cpu_data:
0fe1e009 677 kfree(sd);
b95c221c 678 return ret;
6aa8b732
AK
679
680}
681
fd6fa73d 682static int direct_access_msr_slot(u32 msr)
ac72a9b7 683{
fd6fa73d 684 u32 i;
ac72a9b7
JR
685
686 for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++)
fd6fa73d
AG
687 if (direct_access_msrs[i].index == msr)
688 return i;
ac72a9b7 689
fd6fa73d
AG
690 return -ENOENT;
691}
692
693static void set_shadow_msr_intercept(struct kvm_vcpu *vcpu, u32 msr, int read,
694 int write)
695{
696 struct vcpu_svm *svm = to_svm(vcpu);
697 int slot = direct_access_msr_slot(msr);
698
699 if (slot == -ENOENT)
700 return;
701
702 /* Set the shadow bitmaps to the desired intercept states */
703 if (read)
704 set_bit(slot, svm->shadow_msr_intercept.read);
705 else
706 clear_bit(slot, svm->shadow_msr_intercept.read);
707
708 if (write)
709 set_bit(slot, svm->shadow_msr_intercept.write);
710 else
711 clear_bit(slot, svm->shadow_msr_intercept.write);
ac72a9b7
JR
712}
713
fd6fa73d
AG
714static bool valid_msr_intercept(u32 index)
715{
716 return direct_access_msr_slot(index) != -ENOENT;
ac72a9b7
JR
717}
718
476c9bd8 719static bool msr_write_intercepted(struct kvm_vcpu *vcpu, u32 msr)
b2ac58f9
KA
720{
721 u8 bit_write;
722 unsigned long tmp;
723 u32 offset;
724 u32 *msrpm;
725
726 msrpm = is_guest_mode(vcpu) ? to_svm(vcpu)->nested.msrpm:
727 to_svm(vcpu)->msrpm;
728
729 offset = svm_msrpm_offset(msr);
730 bit_write = 2 * (msr & 0x0f) + 1;
731 tmp = msrpm[offset];
732
733 BUG_ON(offset == MSR_INVALID);
734
735 return !!test_bit(bit_write, &tmp);
736}
737
fd6fa73d
AG
738static void set_msr_interception_bitmap(struct kvm_vcpu *vcpu, u32 *msrpm,
739 u32 msr, int read, int write)
6aa8b732 740{
73c25546 741 struct vcpu_svm *svm = to_svm(vcpu);
455716fa
JR
742 u8 bit_read, bit_write;
743 unsigned long tmp;
744 u32 offset;
6aa8b732 745
ac72a9b7
JR
746 /*
747 * If this warning triggers, extend the direct_access_msrs list at the
748 * beginning of the file.
749 */
750 WARN_ON(!valid_msr_intercept(msr));
751
fd6fa73d
AG
752 /* Force MSRs that are not allowed by the MSR filter to trap */
753 if (read && !kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_READ))
754 read = 0;
755
756 if (write && !kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_WRITE))
757 write = 0;
758
455716fa
JR
759 offset = svm_msrpm_offset(msr);
760 bit_read = 2 * (msr & 0x0f);
761 bit_write = 2 * (msr & 0x0f) + 1;
762 tmp = msrpm[offset];
763
764 BUG_ON(offset == MSR_INVALID);
765
766 read ? clear_bit(bit_read, &tmp) : set_bit(bit_read, &tmp);
767 write ? clear_bit(bit_write, &tmp) : set_bit(bit_write, &tmp);
768
769 msrpm[offset] = tmp;
c4327f15
VP
770
771 svm_hv_vmcb_dirty_nested_enlightenments(vcpu);
73c25546 772 svm->nested.force_msr_bitmap_recalc = true;
6aa8b732
AK
773}
774
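/*
 * MSRPM layout used above (illustrative): each MSR takes two consecutive
 * bits, read then write, so a byte covers 4 MSRs and a u32 covers 16.
 * Continuing the MSR_STAR example, svm_msrpm_offset(0xc0000081) = 0x208 and
 * (msr & 0x0f) == 1, so its read/write intercept bits are bits 2 and 3 of
 * msrpm[0x208]; a set bit means "intercept", a clear bit means direct access.
 */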
376c6d28
TL
775void set_msr_interception(struct kvm_vcpu *vcpu, u32 *msrpm, u32 msr,
776 int read, int write)
6aa8b732 777{
fd6fa73d
AG
778 set_shadow_msr_intercept(vcpu, msr, read, write);
779 set_msr_interception_bitmap(vcpu, msrpm, msr, read, write);
780}
781
2fcf4876 782u32 *svm_vcpu_alloc_msrpm(void)
6aa8b732 783{
47903dc1
KS
784 unsigned int order = get_order(MSRPM_SIZE);
785 struct page *pages = alloc_pages(GFP_KERNEL_ACCOUNT, order);
476c9bd8 786 u32 *msrpm;
f4c847a9
ML
787
788 if (!pages)
789 return NULL;
6aa8b732 790
f4c847a9 791 msrpm = page_address(pages);
47903dc1 792 memset(msrpm, 0xff, PAGE_SIZE * (1 << order));
f65c229c 793
476c9bd8
AL
794 return msrpm;
795}
796
2fcf4876 797void svm_vcpu_init_msrpm(struct kvm_vcpu *vcpu, u32 *msrpm)
476c9bd8
AL
798{
799 int i;
800
ac72a9b7
JR
801 for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) {
802 if (!direct_access_msrs[i].always)
803 continue;
476c9bd8 804 set_msr_interception(vcpu, msrpm, direct_access_msrs[i].index, 1, 1);
ac72a9b7 805 }
f4c847a9 806}
ac72a9b7 807
4d1d7942
SS
808void svm_set_x2apic_msr_interception(struct vcpu_svm *svm, bool intercept)
809{
810 int i;
811
091abbf5
ML
812 if (intercept == svm->x2avic_msrs_intercepted)
813 return;
814
4d1d7942
SS
815 if (avic_mode != AVIC_MODE_X2 ||
816 !apic_x2apic_mode(svm->vcpu.arch.apic))
817 return;
818
819 for (i = 0; i < MAX_DIRECT_ACCESS_MSRS; i++) {
820 int index = direct_access_msrs[i].index;
821
822 if ((index < APIC_BASE_MSR) ||
823 (index > APIC_BASE_MSR + 0xff))
824 continue;
825 set_msr_interception(&svm->vcpu, svm->msrpm, index,
826 !intercept, !intercept);
827 }
091abbf5
ML
828
829 svm->x2avic_msrs_intercepted = intercept;
4d1d7942 830}
2fcf4876
ML
831
832void svm_vcpu_free_msrpm(u32 *msrpm)
f4c847a9 833{
47903dc1 834 __free_pages(virt_to_page(msrpm), get_order(MSRPM_SIZE));
f65c229c
JR
835}
836
fd6fa73d
AG
837static void svm_msr_filter_changed(struct kvm_vcpu *vcpu)
838{
839 struct vcpu_svm *svm = to_svm(vcpu);
840 u32 i;
841
842 /*
843 * Set intercept permissions for all direct access MSRs again. They
844 * will automatically get filtered through the MSR filter, so we are
845 * back in sync after this.
846 */
847 for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) {
848 u32 msr = direct_access_msrs[i].index;
849 u32 read = test_bit(i, svm->shadow_msr_intercept.read);
850 u32 write = test_bit(i, svm->shadow_msr_intercept.write);
851
852 set_msr_interception_bitmap(vcpu, svm->msrpm, msr, read, write);
ac72a9b7 853 }
f65c229c
JR
854}
855
323c3d80
JR
856static void add_msr_offset(u32 offset)
857{
858 int i;
859
860 for (i = 0; i < MSRPM_OFFSETS; ++i) {
861
862 /* Offset already in list? */
863 if (msrpm_offsets[i] == offset)
bfc733a7 864 return;
323c3d80
JR
865
866 /* Slot used by another offset? */
867 if (msrpm_offsets[i] != MSR_INVALID)
868 continue;
869
870 /* Add offset to list */
871 msrpm_offsets[i] = offset;
872
873 return;
6aa8b732 874 }
323c3d80
JR
875
876 /*
877 * If this BUG triggers, the msrpm_offsets table has overflowed. Just
878 * increase MSRPM_OFFSETS in this case.
879 */
bfc733a7 880 BUG();
6aa8b732
AK
881}
882
323c3d80 883static void init_msrpm_offsets(void)
f65c229c 884{
323c3d80 885 int i;
f65c229c 886
323c3d80
JR
887 memset(msrpm_offsets, 0xff, sizeof(msrpm_offsets));
888
889 for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) {
890 u32 offset;
891
892 offset = svm_msrpm_offset(direct_access_msrs[i].index);
893 BUG_ON(offset == MSR_INVALID);
894
895 add_msr_offset(offset);
896 }
f65c229c
JR
897}
898
1d5a1b58
ML
899void svm_copy_lbrs(struct vmcb *to_vmcb, struct vmcb *from_vmcb)
900{
901 to_vmcb->save.dbgctl = from_vmcb->save.dbgctl;
902 to_vmcb->save.br_from = from_vmcb->save.br_from;
903 to_vmcb->save.br_to = from_vmcb->save.br_to;
904 to_vmcb->save.last_excp_from = from_vmcb->save.last_excp_from;
905 to_vmcb->save.last_excp_to = from_vmcb->save.last_excp_to;
906
907 vmcb_mark_dirty(to_vmcb, VMCB_LBR);
908}
909
476c9bd8 910static void svm_enable_lbrv(struct kvm_vcpu *vcpu)
24e09cbf 911{
476c9bd8 912 struct vcpu_svm *svm = to_svm(vcpu);
24e09cbf 913
0dc92119 914 svm->vmcb->control.virt_ext |= LBR_CTL_ENABLE_MASK;
476c9bd8
AL
915 set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHFROMIP, 1, 1);
916 set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHTOIP, 1, 1);
917 set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTFROMIP, 1, 1);
918 set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTTOIP, 1, 1);
1d5a1b58
ML
919
920 /* Move the LBR msrs to the vmcb02 so that the guest can see them. */
921 if (is_guest_mode(vcpu))
922 svm_copy_lbrs(svm->vmcb, svm->vmcb01.ptr);
24e09cbf
JR
923}
924
476c9bd8 925static void svm_disable_lbrv(struct kvm_vcpu *vcpu)
24e09cbf 926{
476c9bd8 927 struct vcpu_svm *svm = to_svm(vcpu);
24e09cbf 928
0dc92119 929 svm->vmcb->control.virt_ext &= ~LBR_CTL_ENABLE_MASK;
476c9bd8
AL
930 set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHFROMIP, 0, 0);
931 set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHTOIP, 0, 0);
932 set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTFROMIP, 0, 0);
933 set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTTOIP, 0, 0);
1d5a1b58
ML
934
935 /*
936 * Move the LBR msrs back to the vmcb01 to avoid copying them
937 * on nested guest entries.
938 */
939 if (is_guest_mode(vcpu))
940 svm_copy_lbrs(svm->vmcb01.ptr, svm->vmcb);
941}
942
943static int svm_get_lbr_msr(struct vcpu_svm *svm, u32 index)
944{
945 /*
946 * If the LBR virtualization is disabled, the LBR msrs are always
947 * kept in the vmcb01 to avoid copying them on nested guest entries.
948 *
949 * If nested, and the LBR virtualization is enabled/disabled, the msrs
950 * are moved between the vmcb01 and vmcb02 as needed.
951 */
952 struct vmcb *vmcb =
953 (svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK) ?
954 svm->vmcb : svm->vmcb01.ptr;
955
956 switch (index) {
957 case MSR_IA32_DEBUGCTLMSR:
958 return vmcb->save.dbgctl;
959 case MSR_IA32_LASTBRANCHFROMIP:
960 return vmcb->save.br_from;
961 case MSR_IA32_LASTBRANCHTOIP:
962 return vmcb->save.br_to;
963 case MSR_IA32_LASTINTFROMIP:
964 return vmcb->save.last_excp_from;
965 case MSR_IA32_LASTINTTOIP:
966 return vmcb->save.last_excp_to;
967 default:
968 KVM_BUG(false, svm->vcpu.kvm,
969 "%s: Unknown MSR 0x%x", __func__, index);
970 return 0;
971 }
972}
973
974void svm_update_lbrv(struct kvm_vcpu *vcpu)
975{
976 struct vcpu_svm *svm = to_svm(vcpu);
977
978 bool enable_lbrv = svm_get_lbr_msr(svm, MSR_IA32_DEBUGCTLMSR) &
979 DEBUGCTLMSR_LBR;
980
981 bool current_enable_lbrv = !!(svm->vmcb->control.virt_ext &
982 LBR_CTL_ENABLE_MASK);
983
d20c796c
ML
984 if (unlikely(is_guest_mode(vcpu) && svm->lbrv_enabled))
985 if (unlikely(svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK))
986 enable_lbrv = true;
987
1d5a1b58
ML
988 if (enable_lbrv == current_enable_lbrv)
989 return;
990
991 if (enable_lbrv)
992 svm_enable_lbrv(vcpu);
993 else
994 svm_disable_lbrv(vcpu);
24e09cbf
JR
995}
996
883b0a91 997void disable_nmi_singlestep(struct vcpu_svm *svm)
4aebd0e9
LP
998{
999 svm->nmi_singlestep = false;
640bd6e5 1000
ab2f4d73
LP
1001 if (!(svm->vcpu.guest_debug & KVM_GUESTDBG_SINGLESTEP)) {
1002 /* Clear our flags if they were not set by the guest */
1003 if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_TF))
1004 svm->vmcb->save.rflags &= ~X86_EFLAGS_TF;
1005 if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_RF))
1006 svm->vmcb->save.rflags &= ~X86_EFLAGS_RF;
1007 }
4aebd0e9
LP
1008}
1009
8566ac8b
BM
1010static void grow_ple_window(struct kvm_vcpu *vcpu)
1011{
1012 struct vcpu_svm *svm = to_svm(vcpu);
1013 struct vmcb_control_area *control = &svm->vmcb->control;
1014 int old = control->pause_filter_count;
1015
e3cdaab5 1016 if (kvm_pause_in_guest(vcpu->kvm))
74fd41ed
ML
1017 return;
1018
8566ac8b
BM
1019 control->pause_filter_count = __grow_ple_window(old,
1020 pause_filter_count,
1021 pause_filter_count_grow,
1022 pause_filter_count_max);
1023
4f75bcc3 1024 if (control->pause_filter_count != old) {
06e7852c 1025 vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
4f75bcc3
PX
1026 trace_kvm_ple_window_update(vcpu->vcpu_id,
1027 control->pause_filter_count, old);
1028 }
8566ac8b
BM
1029}
1030
1031static void shrink_ple_window(struct kvm_vcpu *vcpu)
1032{
1033 struct vcpu_svm *svm = to_svm(vcpu);
1034 struct vmcb_control_area *control = &svm->vmcb->control;
1035 int old = control->pause_filter_count;
1036
e3cdaab5 1037 if (kvm_pause_in_guest(vcpu->kvm))
74fd41ed
ML
1038 return;
1039
8566ac8b
BM
1040 control->pause_filter_count =
1041 __shrink_ple_window(old,
1042 pause_filter_count,
1043 pause_filter_count_shrink,
1044 pause_filter_count);
4f75bcc3 1045 if (control->pause_filter_count != old) {
06e7852c 1046 vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
4f75bcc3
PX
1047 trace_kvm_ple_window_update(vcpu->vcpu_id,
1048 control->pause_filter_count, old);
1049 }
8566ac8b
BM
1050}
1051
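/*
 * Example with the module defaults documented above: a vCPU that keeps
 * hitting the PAUSE intercept has its window grown
 * pause_filter_count -> 2x -> 4x -> ... up to pause_filter_count_max,
 * while the default shrink behaviour simply resets the window back to
 * pause_filter_count.
 */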
23e5092b 1052static void svm_hardware_unsetup(void)
dd58f3c9
LR
1053{
1054 int cpu;
1055
23e5092b 1056 sev_hardware_unsetup();
dd58f3c9
LR
1057
1058 for_each_possible_cpu(cpu)
1059 svm_cpu_uninit(cpu);
1060
47903dc1
KS
1061 __free_pages(pfn_to_page(iopm_base >> PAGE_SHIFT),
1062 get_order(IOPM_SIZE));
dd58f3c9
LR
1063 iopm_base = 0;
1064}
1065
6aa8b732
AK
1066static void init_seg(struct vmcb_seg *seg)
1067{
1068 seg->selector = 0;
1069 seg->attrib = SVM_SELECTOR_P_MASK | SVM_SELECTOR_S_MASK |
e0231715 1070 SVM_SELECTOR_WRITE_MASK; /* Read/Write Data Segment */
6aa8b732
AK
1071 seg->limit = 0xffff;
1072 seg->base = 0;
1073}
1074
1075static void init_sys_seg(struct vmcb_seg *seg, uint32_t type)
1076{
1077 seg->selector = 0;
1078 seg->attrib = SVM_SELECTOR_P_MASK | type;
1079 seg->limit = 0xffff;
1080 seg->base = 0;
1081}
1082
307a94c7
IS
1083static u64 svm_get_l2_tsc_offset(struct kvm_vcpu *vcpu)
1084{
1085 struct vcpu_svm *svm = to_svm(vcpu);
1086
1087 return svm->nested.ctl.tsc_offset;
1088}
1089
1090static u64 svm_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu)
1091{
5228eb96
ML
1092 struct vcpu_svm *svm = to_svm(vcpu);
1093
1094 return svm->tsc_ratio_msr;
307a94c7
IS
1095}
1096
edcfe540 1097static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
f4e1b3c8
ZA
1098{
1099 struct vcpu_svm *svm = to_svm(vcpu);
116a0a23 1100
edcfe540
IS
1101 svm->vmcb01.ptr->control.tsc_offset = vcpu->arch.l1_tsc_offset;
1102 svm->vmcb->control.tsc_offset = offset;
06e7852c 1103 vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
f4e1b3c8
ZA
1104}
1105
11d39e8c 1106static void svm_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 multiplier)
1ab9287a 1107{
11d39e8c 1108 __svm_write_tsc_multiplier(multiplier);
1ab9287a
IS
1109}
1110
11d39e8c 1111
3b195ac9
SC
1112/* Evaluate instruction intercepts that depend on guest CPUID features. */
1113static void svm_recalc_instruction_intercepts(struct kvm_vcpu *vcpu,
1114 struct vcpu_svm *svm)
4407a797
BM
1115{
1116 /*
0a8ed2ea
SC
1117 * Intercept INVPCID if shadow paging is enabled to sync/free shadow
1118 * roots, or if INVPCID is disabled in the guest to inject #UD.
4407a797
BM
1119 */
1120 if (kvm_cpu_cap_has(X86_FEATURE_INVPCID)) {
0a8ed2ea
SC
1121 if (!npt_enabled ||
1122 !guest_cpuid_has(&svm->vcpu, X86_FEATURE_INVPCID))
4407a797
BM
1123 svm_set_intercept(svm, INTERCEPT_INVPCID);
1124 else
1125 svm_clr_intercept(svm, INTERCEPT_INVPCID);
1126 }
3b195ac9
SC
1127
1128 if (kvm_cpu_cap_has(X86_FEATURE_RDTSCP)) {
1129 if (guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP))
1130 svm_clr_intercept(svm, INTERCEPT_RDTSCP);
1131 else
1132 svm_set_intercept(svm, INTERCEPT_RDTSCP);
1133 }
4407a797
BM
1134}
1135
36e8194d
PB
1136static inline void init_vmcb_after_set_cpuid(struct kvm_vcpu *vcpu)
1137{
1138 struct vcpu_svm *svm = to_svm(vcpu);
1139
1140 if (guest_cpuid_is_intel(vcpu)) {
1141 /*
1142 * We must intercept SYSENTER_EIP and SYSENTER_ESP
1143 * accesses because the processor only stores 32 bits.
1144 * For the same reason we cannot use virtual VMLOAD/VMSAVE.
1145 */
1146 svm_set_intercept(svm, INTERCEPT_VMLOAD);
1147 svm_set_intercept(svm, INTERCEPT_VMSAVE);
1148 svm->vmcb->control.virt_ext &= ~VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK;
1149
1150 set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_EIP, 0, 0);
1151 set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_ESP, 0, 0);
b9f3973a
ML
1152
1153 svm->v_vmload_vmsave_enabled = false;
36e8194d
PB
1154 } else {
1155 /*
1156 * If hardware supports Virtual VMLOAD VMSAVE then enable it
1157 * in VMCB and clear intercepts to avoid #VMEXIT.
1158 */
1159 if (vls) {
1160 svm_clr_intercept(svm, INTERCEPT_VMLOAD);
1161 svm_clr_intercept(svm, INTERCEPT_VMSAVE);
1162 svm->vmcb->control.virt_ext |= VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK;
1163 }
1164 /* No need to intercept these MSRs */
1165 set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_EIP, 1, 1);
1166 set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_ESP, 1, 1);
1167 }
1168}
1169
63129754 1170static void init_vmcb(struct kvm_vcpu *vcpu)
6aa8b732 1171{
63129754 1172 struct vcpu_svm *svm = to_svm(vcpu);
1ee73a33
ML
1173 struct vmcb *vmcb = svm->vmcb01.ptr;
1174 struct vmcb_control_area *control = &vmcb->control;
1175 struct vmcb_save_area *save = &vmcb->save;
6aa8b732 1176
830bd71f
BM
1177 svm_set_intercept(svm, INTERCEPT_CR0_READ);
1178 svm_set_intercept(svm, INTERCEPT_CR3_READ);
1179 svm_set_intercept(svm, INTERCEPT_CR4_READ);
1180 svm_set_intercept(svm, INTERCEPT_CR0_WRITE);
1181 svm_set_intercept(svm, INTERCEPT_CR3_WRITE);
1182 svm_set_intercept(svm, INTERCEPT_CR4_WRITE);
63129754 1183 if (!kvm_vcpu_apicv_active(vcpu))
830bd71f 1184 svm_set_intercept(svm, INTERCEPT_CR8_WRITE);
6aa8b732 1185
5315c716 1186 set_dr_intercepts(svm);
6aa8b732 1187
18c918c5
JR
1188 set_exception_intercept(svm, PF_VECTOR);
1189 set_exception_intercept(svm, UD_VECTOR);
1190 set_exception_intercept(svm, MC_VECTOR);
54a20552 1191 set_exception_intercept(svm, AC_VECTOR);
cbdb967a 1192 set_exception_intercept(svm, DB_VECTOR);
9718420e
LA
1193 /*
1194 * Guest access to VMware backdoor ports could legitimately
1195 * trigger #GP because of the TSS I/O permission bitmap.
1196 * We intercept those #GP and allow access to them anyway
0b0be065
SC
1197 * as VMware does. Don't intercept #GP for SEV guests as KVM can't
1198 * decrypt guest memory to decode the faulting instruction.
9718420e 1199 */
0b0be065 1200 if (enable_vmware_backdoor && !sev_guest(vcpu->kvm))
9718420e 1201 set_exception_intercept(svm, GP_VECTOR);
6aa8b732 1202
a284ba56
JR
1203 svm_set_intercept(svm, INTERCEPT_INTR);
1204 svm_set_intercept(svm, INTERCEPT_NMI);
4b639a9f
ML
1205
1206 if (intercept_smi)
1207 svm_set_intercept(svm, INTERCEPT_SMI);
1208
a284ba56
JR
1209 svm_set_intercept(svm, INTERCEPT_SELECTIVE_CR0);
1210 svm_set_intercept(svm, INTERCEPT_RDPMC);
1211 svm_set_intercept(svm, INTERCEPT_CPUID);
1212 svm_set_intercept(svm, INTERCEPT_INVD);
1213 svm_set_intercept(svm, INTERCEPT_INVLPG);
1214 svm_set_intercept(svm, INTERCEPT_INVLPGA);
1215 svm_set_intercept(svm, INTERCEPT_IOIO_PROT);
1216 svm_set_intercept(svm, INTERCEPT_MSR_PROT);
1217 svm_set_intercept(svm, INTERCEPT_TASK_SWITCH);
1218 svm_set_intercept(svm, INTERCEPT_SHUTDOWN);
1219 svm_set_intercept(svm, INTERCEPT_VMRUN);
1220 svm_set_intercept(svm, INTERCEPT_VMMCALL);
1221 svm_set_intercept(svm, INTERCEPT_VMLOAD);
1222 svm_set_intercept(svm, INTERCEPT_VMSAVE);
1223 svm_set_intercept(svm, INTERCEPT_STGI);
1224 svm_set_intercept(svm, INTERCEPT_CLGI);
1225 svm_set_intercept(svm, INTERCEPT_SKINIT);
1226 svm_set_intercept(svm, INTERCEPT_WBINVD);
1227 svm_set_intercept(svm, INTERCEPT_XSETBV);
1228 svm_set_intercept(svm, INTERCEPT_RDPRU);
1229 svm_set_intercept(svm, INTERCEPT_RSM);
6aa8b732 1230
63129754 1231 if (!kvm_mwait_in_guest(vcpu->kvm)) {
a284ba56
JR
1232 svm_set_intercept(svm, INTERCEPT_MONITOR);
1233 svm_set_intercept(svm, INTERCEPT_MWAIT);
668fffa3
MT
1234 }
1235
63129754 1236 if (!kvm_hlt_in_guest(vcpu->kvm))
a284ba56 1237 svm_set_intercept(svm, INTERCEPT_HLT);
caa057a2 1238
d0ec49d4
TL
1239 control->iopm_base_pa = __sme_set(iopm_base);
1240 control->msrpm_base_pa = __sme_set(__pa(svm->msrpm));
6aa8b732
AK
1241 control->int_ctl = V_INTR_MASKING_MASK;
1242
1243 init_seg(&save->es);
1244 init_seg(&save->ss);
1245 init_seg(&save->ds);
1246 init_seg(&save->fs);
1247 init_seg(&save->gs);
1248
1249 save->cs.selector = 0xf000;
04b66839 1250 save->cs.base = 0xffff0000;
6aa8b732
AK
1251 /* Executable/Readable Code Segment */
1252 save->cs.attrib = SVM_SELECTOR_READ_MASK | SVM_SELECTOR_P_MASK |
1253 SVM_SELECTOR_S_MASK | SVM_SELECTOR_CODE_MASK;
1254 save->cs.limit = 0xffff;
6aa8b732 1255
4f117ce4 1256 save->gdtr.base = 0;
6aa8b732 1257 save->gdtr.limit = 0xffff;
4f117ce4 1258 save->idtr.base = 0;
6aa8b732
AK
1259 save->idtr.limit = 0xffff;
1260
1261 init_sys_seg(&save->ldtr, SEG_TYPE_LDT);
1262 init_sys_seg(&save->tr, SEG_TYPE_BUSY_TSS16);
1263
709ddebf
JR
1264 if (npt_enabled) {
1265 /* Setup VMCB for Nested Paging */
cea3a19b 1266 control->nested_ctl |= SVM_NESTED_CTL_NP_ENABLE;
a284ba56 1267 svm_clr_intercept(svm, INTERCEPT_INVLPG);
18c918c5 1268 clr_exception_intercept(svm, PF_VECTOR);
830bd71f
BM
1269 svm_clr_intercept(svm, INTERCEPT_CR3_READ);
1270 svm_clr_intercept(svm, INTERCEPT_CR3_WRITE);
63129754 1271 save->g_pat = vcpu->arch.pat;
709ddebf 1272 save->cr3 = 0;
709ddebf 1273 }
193015ad 1274 svm->current_vmcb->asid_generation = 0;
7e8e6eed 1275 svm->asid = 0;
1371d904 1276
c74ad08f
ML
1277 svm->nested.vmcb12_gpa = INVALID_GPA;
1278 svm->nested.last_vmcb12_gpa = INVALID_GPA;
2af9194d 1279
63129754 1280 if (!kvm_pause_in_guest(vcpu->kvm)) {
8566ac8b
BM
1281 control->pause_filter_count = pause_filter_count;
1282 if (pause_filter_thresh)
1283 control->pause_filter_thresh = pause_filter_thresh;
a284ba56 1284 svm_set_intercept(svm, INTERCEPT_PAUSE);
8566ac8b 1285 } else {
a284ba56 1286 svm_clr_intercept(svm, INTERCEPT_PAUSE);
565d0998
ML
1287 }
1288
3b195ac9 1289 svm_recalc_instruction_intercepts(vcpu, svm);
4407a797 1290
89c8a498 1291 /*
d00b99c5
BM
1292 * If the host supports V_SPEC_CTRL then disable the interception
1293 * of MSR_IA32_SPEC_CTRL.
89c8a498 1294 */
d00b99c5
BM
1295 if (boot_cpu_has(X86_FEATURE_V_SPEC_CTRL))
1296 set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SPEC_CTRL, 1, 1);
1297
63129754 1298 if (kvm_vcpu_apicv_active(vcpu))
1ee73a33 1299 avic_init_vmcb(svm, vmcb);
89c8a498 1300
640bd6e5 1301 if (vgif) {
a284ba56
JR
1302 svm_clr_intercept(svm, INTERCEPT_STGI);
1303 svm_clr_intercept(svm, INTERCEPT_CLGI);
640bd6e5
JN
1304 svm->vmcb->control.int_ctl |= V_GIF_ENABLE_MASK;
1305 }
1306
63129754 1307 if (sev_guest(vcpu->kvm)) {
1654efcb 1308 svm->vmcb->control.nested_ctl |= SVM_NESTED_CTL_SEV_ENABLE;
35c6f649 1309 clr_exception_intercept(svm, UD_VECTOR);
376c6d28 1310
63129754 1311 if (sev_es_guest(vcpu->kvm)) {
376c6d28
TL
1312 /* Perform SEV-ES specific VMCB updates */
1313 sev_es_init_vmcb(svm);
1314 }
35c6f649 1315 }
1654efcb 1316
1ee73a33 1317 svm_hv_init_vmcb(vmcb);
36e8194d 1318 init_vmcb_after_set_cpuid(vcpu);
1e0c7d40 1319
1ee73a33 1320 vmcb_mark_all_dirty(vmcb);
8d28fec4 1321
2af9194d 1322 enable_gif(svm);
44a95dae
SS
1323}
1324
9ebe530b
SC
1325static void __svm_vcpu_reset(struct kvm_vcpu *vcpu)
1326{
1327 struct vcpu_svm *svm = to_svm(vcpu);
44a95dae 1328
9ebe530b
SC
1329 svm_vcpu_init_msrpm(vcpu, svm->msrpm);
1330
1331 svm_init_osvw(vcpu);
1332 vcpu->arch.microcode_version = 0x01000065;
938c8745 1333 svm->tsc_ratio_msr = kvm_caps.default_tsc_scaling_ratio;
9ebe530b
SC
1334
1335 if (sev_es_guest(vcpu->kvm))
1336 sev_es_vcpu_reset(svm);
44a95dae
SS
1337}
1338
d28bc9dd 1339static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
04d2cc77
AK
1340{
1341 struct vcpu_svm *svm = to_svm(vcpu);
1342
b2ac58f9 1343 svm->spec_ctrl = 0;
ccbcd267 1344 svm->virt_spec_ctrl = 0;
b2ac58f9 1345
63129754 1346 init_vmcb(vcpu);
9ebe530b
SC
1347
1348 if (!init_event)
1349 __svm_vcpu_reset(vcpu);
04d2cc77
AK
1350}
1351
4995a368
CA
1352void svm_switch_vmcb(struct vcpu_svm *svm, struct kvm_vmcb_info *target_vmcb)
1353{
1354 svm->current_vmcb = target_vmcb;
1355 svm->vmcb = target_vmcb->ptr;
4995a368
CA
1356}
1357
23e5092b 1358static int svm_vcpu_create(struct kvm_vcpu *vcpu)
6aa8b732 1359{
a2fa3e9f 1360 struct vcpu_svm *svm;
4995a368 1361 struct page *vmcb01_page;
add5e2f0 1362 struct page *vmsa_page = NULL;
fb3f0f51 1363 int err;
6aa8b732 1364
a9dd6f09
SC
1365 BUILD_BUG_ON(offsetof(struct vcpu_svm, vcpu) != 0);
1366 svm = to_svm(vcpu);
fb3f0f51 1367
b7af4043 1368 err = -ENOMEM;
4995a368
CA
1369 vmcb01_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
1370 if (!vmcb01_page)
987b2594 1371 goto out;
6aa8b732 1372
63129754 1373 if (sev_es_guest(vcpu->kvm)) {
add5e2f0
TL
1374 /*
1375 * SEV-ES guests require a separate VMSA page used to contain
1376 * the encrypted register state of the guest.
1377 */
1378 vmsa_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
1379 if (!vmsa_page)
1380 goto error_free_vmcb_page;
ed02b213
TL
1381
1382 /*
1383 * SEV-ES guests maintain an encrypted version of their FPU
1384 * state which is restored and saved on VMRUN and VMEXIT.
d69c1382
TG
1385 * Mark vcpu->arch.guest_fpu->fpstate as scratch so it won't
1386 * do xsave/xrstor on it.
ed02b213 1387 */
d69c1382 1388 fpstate_set_confidential(&vcpu->arch.guest_fpu);
add5e2f0
TL
1389 }
1390
dfa20099
SS
1391 err = avic_init_vcpu(svm);
1392 if (err)
add5e2f0 1393 goto error_free_vmsa_page;
44a95dae 1394
476c9bd8 1395 svm->msrpm = svm_vcpu_alloc_msrpm();
054409ab
CZ
1396 if (!svm->msrpm) {
1397 err = -ENOMEM;
add5e2f0 1398 goto error_free_vmsa_page;
054409ab 1399 }
b7af4043 1400
091abbf5
ML
1401 svm->x2avic_msrs_intercepted = true;
1402
4995a368
CA
1403 svm->vmcb01.ptr = page_address(vmcb01_page);
1404 svm->vmcb01.pa = __sme_set(page_to_pfn(vmcb01_page) << PAGE_SHIFT);
9ebe530b 1405 svm_switch_vmcb(svm, &svm->vmcb01);
add5e2f0
TL
1406
1407 if (vmsa_page)
b67a4cc3 1408 svm->sev_es.vmsa = page_address(vmsa_page);
add5e2f0 1409
a7fc06dd 1410 svm->guest_state_loaded = false;
4995a368 1411
a9dd6f09 1412 return 0;
36241b8c 1413
add5e2f0
TL
1414error_free_vmsa_page:
1415 if (vmsa_page)
1416 __free_page(vmsa_page);
8d22b90e 1417error_free_vmcb_page:
4995a368 1418 __free_page(vmcb01_page);
987b2594 1419out:
a9dd6f09 1420 return err;
6aa8b732
AK
1421}
1422
fd65d314
JM
1423static void svm_clear_current_vmcb(struct vmcb *vmcb)
1424{
1425 int i;
1426
1427 for_each_online_cpu(i)
1428 cmpxchg(&per_cpu(svm_data, i)->current_vmcb, vmcb, NULL);
1429}
1430
23e5092b 1431static void svm_vcpu_free(struct kvm_vcpu *vcpu)
6aa8b732 1432{
a2fa3e9f
GH
1433 struct vcpu_svm *svm = to_svm(vcpu);
1434
fd65d314
JM
1435 /*
1436 * The vmcb page can be recycled, causing a false negative in
1437 * svm_vcpu_load(). So, ensure that no logical CPU has this
1438 * vmcb page recorded as its current vmcb.
1439 */
1440 svm_clear_current_vmcb(svm->vmcb);
1441
2fcf4876
ML
1442 svm_free_nested(svm);
1443
add5e2f0
TL
1444 sev_free_vcpu(vcpu);
1445
4995a368 1446 __free_page(pfn_to_page(__sme_clr(svm->vmcb01.pa) >> PAGE_SHIFT));
47903dc1 1447 __free_pages(virt_to_page(svm->msrpm), get_order(MSRPM_SIZE));
6aa8b732
AK
1448}
1449
23e5092b 1450static void svm_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
6aa8b732 1451{
a2fa3e9f 1452 struct vcpu_svm *svm = to_svm(vcpu);
a7fc06dd 1453 struct svm_cpu_data *sd = per_cpu(svm_data, vcpu->cpu);
0cc5064d 1454
ce7ea0cf
TL
1455 if (sev_es_guest(vcpu->kvm))
1456 sev_es_unmap_ghcb(svm);
1457
a7fc06dd
MR
1458 if (svm->guest_state_loaded)
1459 return;
1460
a7fc06dd
MR
1461 /*
1462 * Save additional host state that will be restored on VMEXIT (sev-es)
1463 * or subsequent vmload of host save area.
1464 */
068f7ea6 1465 vmsave(__sme_page_pa(sd->save_area));
63129754 1466 if (sev_es_guest(vcpu->kvm)) {
3dd2775b
TL
1467 struct sev_es_save_area *hostsa;
1468 hostsa = (struct sev_es_save_area *)(page_address(sd->save_area) + 0x400);
068f7ea6 1469
23e5092b 1470 sev_es_prepare_switch_to_guest(hostsa);
86137773 1471 }
fbc0db76 1472
11d39e8c
ML
1473 if (tsc_scaling)
1474 __svm_write_tsc_multiplier(vcpu->arch.tsc_scaling_ratio);
a7fc06dd 1475
0caa0a77
SC
1476 if (likely(tsc_aux_uret_slot >= 0))
1477 kvm_set_user_return_msr(tsc_aux_uret_slot, svm->tsc_aux, -1ull);
8221c137 1478
a7fc06dd
MR
1479 svm->guest_state_loaded = true;
1480}
1481
1482static void svm_prepare_host_switch(struct kvm_vcpu *vcpu)
1483{
844d69c2 1484 to_svm(vcpu)->guest_state_loaded = false;
a7fc06dd
MR
1485}
1486
1487static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
1488{
1489 struct vcpu_svm *svm = to_svm(vcpu);
1490 struct svm_cpu_data *sd = per_cpu(svm_data, cpu);
1491
15d45071
AR
1492 if (sd->current_vmcb != svm->vmcb) {
1493 sd->current_vmcb = svm->vmcb;
1494 indirect_branch_prediction_barrier();
1495 }
bf5f6b9d 1496 if (kvm_vcpu_apicv_active(vcpu))
ba8ec273 1497 avic_vcpu_load(vcpu, cpu);
6aa8b732
AK
1498}
1499
1500static void svm_vcpu_put(struct kvm_vcpu *vcpu)
1501{
bf5f6b9d 1502 if (kvm_vcpu_apicv_active(vcpu))
ba8ec273 1503 avic_vcpu_put(vcpu);
bf5f6b9d 1504
a7fc06dd 1505 svm_prepare_host_switch(vcpu);
8221c137 1506
e1beb1d3 1507 ++vcpu->stat.host_state_reload;
6aa8b732
AK
1508}
1509
6aa8b732
AK
1510static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu)
1511{
9b611747
LP
1512 struct vcpu_svm *svm = to_svm(vcpu);
1513 unsigned long rflags = svm->vmcb->save.rflags;
1514
1515 if (svm->nmi_singlestep) {
1516 /* Hide our flags if they were not set by the guest */
1517 if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_TF))
1518 rflags &= ~X86_EFLAGS_TF;
1519 if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_RF))
1520 rflags &= ~X86_EFLAGS_RF;
1521 }
1522 return rflags;
6aa8b732
AK
1523}
1524
1525static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
1526{
9b611747
LP
1527 if (to_svm(vcpu)->nmi_singlestep)
1528 rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF);
1529
ae9fedc7 1530 /*
bb3541f1 1531 * Any change of EFLAGS.VM is accompanied by a reload of SS
ae9fedc7
PB
1532 * (caused by either a task switch or an inter-privilege IRET),
1533 * so we do not need to update the CPL here.
1534 */
a2fa3e9f 1535 to_svm(vcpu)->vmcb->save.rflags = rflags;
6aa8b732
AK
1536}
1537
c5063551
MO
1538static bool svm_get_if_flag(struct kvm_vcpu *vcpu)
1539{
1540 struct vmcb *vmcb = to_svm(vcpu)->vmcb;
1541
1542 return sev_es_guest(vcpu->kvm)
1543 ? vmcb->control.int_state & SVM_GUEST_INTERRUPT_MASK
1544 : kvm_get_rflags(vcpu) & X86_EFLAGS_IF;
1545}
1546
6de4f3ad
AK
1547static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
1548{
40e49c4f
LJ
1549 kvm_register_mark_available(vcpu, reg);
1550
6de4f3ad
AK
1551 switch (reg) {
1552 case VCPU_EXREG_PDPTR:
40e49c4f
LJ
1553 /*
1554 * When !npt_enabled, mmu->pdptrs[] is already available since
1555 * it is always updated per SDM when moving to CRs.
1556 */
1557 if (npt_enabled)
2df4a5eb 1558 load_pdptrs(vcpu, kvm_read_cr3(vcpu));
6de4f3ad
AK
1559 break;
1560 default:
67369273 1561 KVM_BUG_ON(1, vcpu->kvm);
6de4f3ad
AK
1562 }
1563}
1564
e14b7786 1565static void svm_set_vintr(struct vcpu_svm *svm)
64b5bd27
PB
1566{
1567 struct vmcb_control_area *control;
1568
f1577ab2
ML
1569 /*
1570 * The following fields are ignored when AVIC is enabled
1571 */
f44509f8 1572 WARN_ON(kvm_vcpu_apicv_activated(&svm->vcpu));
f1577ab2 1573
a284ba56 1574 svm_set_intercept(svm, INTERCEPT_VINTR);
64b5bd27
PB
1575
1576 /*
1577 * This is just a dummy VINTR to actually cause a vmexit to happen.
1578 * Actual injection of virtual interrupts happens through EVENTINJ.
1579 */
1580 control = &svm->vmcb->control;
1581 control->int_vector = 0x0;
1582 control->int_ctl &= ~V_INTR_PRIO_MASK;
1583 control->int_ctl |= V_IRQ_MASK |
1584 ((/*control->int_vector >> 4*/ 0xf) << V_INTR_PRIO_SHIFT);
06e7852c 1585 vmcb_mark_dirty(svm->vmcb, VMCB_INTR);
64b5bd27
PB
1586}
1587
f0b85051
AG
1588static void svm_clear_vintr(struct vcpu_svm *svm)
1589{
a284ba56 1590 svm_clr_intercept(svm, INTERCEPT_VINTR);
64b5bd27 1591
d8e4e58f 1592 /* Drop int_ctl fields related to VINTR injection. */
0f923e07 1593 svm->vmcb->control.int_ctl &= ~V_IRQ_INJECTION_BITS_MASK;
d8e4e58f 1594 if (is_guest_mode(&svm->vcpu)) {
0f923e07 1595 svm->vmcb01.ptr->control.int_ctl &= ~V_IRQ_INJECTION_BITS_MASK;
fb7333df 1596
d8e4e58f
PB
1597 WARN_ON((svm->vmcb->control.int_ctl & V_TPR_MASK) !=
1598 (svm->nested.ctl.int_ctl & V_TPR_MASK));
0f923e07
ML
1599
1600 svm->vmcb->control.int_ctl |= svm->nested.ctl.int_ctl &
1601 V_IRQ_INJECTION_BITS_MASK;
aee77e11
ML
1602
1603 svm->vmcb->control.int_vector = svm->nested.ctl.int_vector;
d8e4e58f
PB
1604 }
1605
06e7852c 1606 vmcb_mark_dirty(svm->vmcb, VMCB_INTR);
f0b85051
AG
1607}
1608
6aa8b732
AK
1609static struct vmcb_seg *svm_seg(struct kvm_vcpu *vcpu, int seg)
1610{
a2fa3e9f 1611 struct vmcb_save_area *save = &to_svm(vcpu)->vmcb->save;
cc3ed80a 1612 struct vmcb_save_area *save01 = &to_svm(vcpu)->vmcb01.ptr->save;
6aa8b732
AK
1613
1614 switch (seg) {
1615 case VCPU_SREG_CS: return &save->cs;
1616 case VCPU_SREG_DS: return &save->ds;
1617 case VCPU_SREG_ES: return &save->es;
cc3ed80a
ML
1618 case VCPU_SREG_FS: return &save01->fs;
1619 case VCPU_SREG_GS: return &save01->gs;
6aa8b732 1620 case VCPU_SREG_SS: return &save->ss;
cc3ed80a
ML
1621 case VCPU_SREG_TR: return &save01->tr;
1622 case VCPU_SREG_LDTR: return &save01->ldtr;
6aa8b732
AK
1623 }
1624 BUG();
8b6d44c7 1625 return NULL;
6aa8b732
AK
1626}
1627
1628static u64 svm_get_segment_base(struct kvm_vcpu *vcpu, int seg)
1629{
1630 struct vmcb_seg *s = svm_seg(vcpu, seg);
1631
1632 return s->base;
1633}
1634
1635static void svm_get_segment(struct kvm_vcpu *vcpu,
1636 struct kvm_segment *var, int seg)
1637{
1638 struct vmcb_seg *s = svm_seg(vcpu, seg);
1639
1640 var->base = s->base;
1641 var->limit = s->limit;
1642 var->selector = s->selector;
1643 var->type = s->attrib & SVM_SELECTOR_TYPE_MASK;
1644 var->s = (s->attrib >> SVM_SELECTOR_S_SHIFT) & 1;
1645 var->dpl = (s->attrib >> SVM_SELECTOR_DPL_SHIFT) & 3;
1646 var->present = (s->attrib >> SVM_SELECTOR_P_SHIFT) & 1;
1647 var->avl = (s->attrib >> SVM_SELECTOR_AVL_SHIFT) & 1;
1648 var->l = (s->attrib >> SVM_SELECTOR_L_SHIFT) & 1;
1649 var->db = (s->attrib >> SVM_SELECTOR_DB_SHIFT) & 1;
80112c89
JM
1650
1651 /*
1652 * AMD CPUs circa 2014 track the G bit for all segments except CS.
1653 * However, the SVM spec states that the G bit is not observed by the
1654 * CPU, and some VMware virtual CPUs drop the G bit for all segments.
1655 * So let's synthesize a legal G bit for all segments; this helps
1656 * running KVM nested. It also helps cross-vendor migration, because
1657 * Intel's vmentry has a check on the 'G' bit.
1658 */
1659 var->g = s->limit > 0xfffff;
25022acc 1660
e0231715
JR
1661 /*
1662 * AMD's VMCB does not have an explicit unusable field, so emulate it
19bca6ab
AP
1663	 * for cross-vendor migration purposes as the inverse of "present".
1664 */
8eae9570 1665 var->unusable = !var->present;
19bca6ab 1666
1fbdc7a5 1667 switch (seg) {
1fbdc7a5
AP
1668 case VCPU_SREG_TR:
1669 /*
1670 * Work around a bug where the busy flag in the tr selector
1671 * isn't exposed
1672 */
c0d09828 1673 var->type |= 0x2;
1fbdc7a5
AP
1674 break;
1675 case VCPU_SREG_DS:
1676 case VCPU_SREG_ES:
1677 case VCPU_SREG_FS:
1678 case VCPU_SREG_GS:
1679 /*
1680 * The accessed bit must always be set in the segment
1681 * descriptor cache, although it can be cleared in the
1682 * descriptor, the cached bit always remains at 1. Since
1683 * Intel has a check on this, set it here to support
1684 * cross-vendor migration.
1685 */
1686 if (!var->unusable)
1687 var->type |= 0x1;
1688 break;
b586eb02 1689 case VCPU_SREG_SS:
e0231715
JR
1690 /*
1691 * On AMD CPUs sometimes the DB bit in the segment
b586eb02
AP
1692 * descriptor is left as 1, although the whole segment has
1693 * been made unusable. Clear it here to pass an Intel VMX
1694 * entry check when cross vendor migrating.
1695 */
1696 if (var->unusable)
1697 var->db = 0;
d9c1b543 1698 /* This is symmetric with svm_set_segment() */
33b458d2 1699 var->dpl = to_svm(vcpu)->vmcb->save.cpl;
b586eb02 1700 break;
1fbdc7a5 1701 }
6aa8b732
AK
1702}
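
/*
 * Standalone sketch (not kernel code) of the packed VMCB 'attrib' layout that
 * svm_get_segment() above unpacks and svm_set_segment() later repacks.  The
 * shift values are assumptions mirroring SVM_SELECTOR_*_SHIFT in asm/svm.h
 * (type[3:0], S=4, DPL=5..6, P=7, AVL=8, L=9, DB=10, G=11).
 */
#include <stdint.h>

struct sketch_seg {
	uint8_t type, s, dpl, present, avl, l, db, g;
};

uint16_t sketch_pack_attrib(const struct sketch_seg *v)
{
	return (v->type & 0xf) |
	       (uint16_t)(v->s & 1) << 4 |
	       (uint16_t)(v->dpl & 3) << 5 |
	       (uint16_t)(v->present & 1) << 7 |
	       (uint16_t)(v->avl & 1) << 8 |
	       (uint16_t)(v->l & 1) << 9 |
	       (uint16_t)(v->db & 1) << 10 |
	       (uint16_t)(v->g & 1) << 11;
}

void sketch_unpack_attrib(uint16_t a, struct sketch_seg *v)
{
	v->type    = a & 0xf;
	v->s       = (a >> 4) & 1;
	v->dpl     = (a >> 5) & 3;
	v->present = (a >> 7) & 1;
	v->avl     = (a >> 8) & 1;
	v->l       = (a >> 9) & 1;
	v->db      = (a >> 10) & 1;
	v->g       = (a >> 11) & 1;
}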
1703
2e4d2653
IE
1704static int svm_get_cpl(struct kvm_vcpu *vcpu)
1705{
1706 struct vmcb_save_area *save = &to_svm(vcpu)->vmcb->save;
1707
1708 return save->cpl;
1709}
1710
872e0c53
SC
1711static void svm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
1712{
1713 struct kvm_segment cs;
1714
1715 svm_get_segment(vcpu, &cs, VCPU_SREG_CS);
1716 *db = cs.db;
1717 *l = cs.l;
1718}
1719
89a27f4d 1720static void svm_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
6aa8b732 1721{
a2fa3e9f
GH
1722 struct vcpu_svm *svm = to_svm(vcpu);
1723
89a27f4d
GN
1724 dt->size = svm->vmcb->save.idtr.limit;
1725 dt->address = svm->vmcb->save.idtr.base;
6aa8b732
AK
1726}
1727
89a27f4d 1728static void svm_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
6aa8b732 1729{
a2fa3e9f
GH
1730 struct vcpu_svm *svm = to_svm(vcpu);
1731
89a27f4d
GN
1732 svm->vmcb->save.idtr.limit = dt->size;
1733	svm->vmcb->save.idtr.base = dt->address;
06e7852c 1734 vmcb_mark_dirty(svm->vmcb, VMCB_DT);
6aa8b732
AK
1735}
1736
89a27f4d 1737static void svm_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
6aa8b732 1738{
a2fa3e9f
GH
1739 struct vcpu_svm *svm = to_svm(vcpu);
1740
89a27f4d
GN
1741 dt->size = svm->vmcb->save.gdtr.limit;
1742 dt->address = svm->vmcb->save.gdtr.base;
6aa8b732
AK
1743}
1744
89a27f4d 1745static void svm_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
6aa8b732 1746{
a2fa3e9f
GH
1747 struct vcpu_svm *svm = to_svm(vcpu);
1748
89a27f4d
GN
1749 svm->vmcb->save.gdtr.limit = dt->size;
1750	svm->vmcb->save.gdtr.base = dt->address;
06e7852c 1751 vmcb_mark_dirty(svm->vmcb, VMCB_DT);
6aa8b732
AK
1752}
1753
559c7c75 1754static void sev_post_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
405329fc
MR
1755{
1756 struct vcpu_svm *svm = to_svm(vcpu);
1757
1758 /*
1759 * For guests that don't set guest_state_protected, the cr3 update is
1760 * handled via kvm_mmu_load() while entering the guest. For guests
1761 * that do (SEV-ES/SEV-SNP), the cr3 update needs to be written to
1762 * VMCB save area now, since the save area will become the initial
1763 * contents of the VMSA, and future VMCB save area updates won't be
1764 * seen.
1765 */
1766 if (sev_es_guest(vcpu->kvm)) {
1767 svm->vmcb->save.cr3 = cr3;
1768 vmcb_mark_dirty(svm->vmcb, VMCB_CR);
1769 }
1770}
1771
883b0a91 1772void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
6aa8b732 1773{
a2fa3e9f 1774 struct vcpu_svm *svm = to_svm(vcpu);
2a32a77c 1775 u64 hcr0 = cr0;
c53bbe21 1776 bool old_paging = is_paging(vcpu);
a2fa3e9f 1777
05b3e0c2 1778#ifdef CONFIG_X86_64
f1c6366e 1779 if (vcpu->arch.efer & EFER_LME && !vcpu->arch.guest_state_protected) {
707d92fa 1780 if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
f6801dff 1781 vcpu->arch.efer |= EFER_LMA;
2b5203ee 1782 svm->vmcb->save.efer |= EFER_LMA | EFER_LME;
6aa8b732
AK
1783 }
1784
d77c26fc 1785 if (is_paging(vcpu) && !(cr0 & X86_CR0_PG)) {
f6801dff 1786 vcpu->arch.efer &= ~EFER_LMA;
2b5203ee 1787 svm->vmcb->save.efer &= ~(EFER_LMA | EFER_LME);
6aa8b732
AK
1788 }
1789 }
1790#endif
ad312c7c 1791 vcpu->arch.cr0 = cr0;
888f9f3e 1792
c53bbe21 1793 if (!npt_enabled) {
2a32a77c 1794 hcr0 |= X86_CR0_PG | X86_CR0_WP;
c53bbe21
ML
1795 if (old_paging != is_paging(vcpu))
1796 svm_set_cr4(vcpu, kvm_read_cr4(vcpu));
1797 }
02daab21 1798
bcf166a9
PB
1799 /*
1800	 * Re-enable caching here because the QEMU BIOS
1801	 * does not do it; otherwise there is a noticeable
1802	 * delay at reboot.
1803 */
1804 if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED))
2a32a77c
PB
1805 hcr0 &= ~(X86_CR0_CD | X86_CR0_NW);
1806
1807 svm->vmcb->save.cr0 = hcr0;
06e7852c 1808 vmcb_mark_dirty(svm->vmcb, VMCB_CR);
2a32a77c
PB
1809
1810 /*
1811 * SEV-ES guests must always keep the CR intercepts cleared. CR
1812 * tracking is done using the CR write traps.
1813 */
63129754 1814 if (sev_es_guest(vcpu->kvm))
2a32a77c
PB
1815 return;
1816
1817 if (hcr0 == cr0) {
1818 /* Selective CR0 write remains on. */
1819 svm_clr_intercept(svm, INTERCEPT_CR0_READ);
1820 svm_clr_intercept(svm, INTERCEPT_CR0_WRITE);
1821 } else {
1822 svm_set_intercept(svm, INTERCEPT_CR0_READ);
1823 svm_set_intercept(svm, INTERCEPT_CR0_WRITE);
1824 }
6aa8b732
AK
1825}
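
/*
 * Standalone sketch (not kernel code) of the host CR0 computed above: with
 * shadow paging (!npt_enabled) the CPU must really page, so PG and WP are
 * forced on, and the CD/NW-clearing quirk keeps guest cache-disable bits from
 * reaching hardware.  The bit positions are the architectural CR0 bits.
 */
#include <stdbool.h>
#include <stdint.h>

#define SK_CR0_WP (1ull << 16)
#define SK_CR0_NW (1ull << 29)
#define SK_CR0_CD (1ull << 30)
#define SK_CR0_PG (1ull << 31)

uint64_t sketch_hcr0(uint64_t guest_cr0, bool npt_enabled, bool cd_nw_quirk)
{
	uint64_t hcr0 = guest_cr0;

	if (!npt_enabled)
		hcr0 |= SK_CR0_PG | SK_CR0_WP;	  /* shadow paging always pages */
	if (cd_nw_quirk)
		hcr0 &= ~(SK_CR0_CD | SK_CR0_NW); /* keep caches enabled */
	return hcr0;
}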
1826
c2fe3cd4
SC
1827static bool svm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
1828{
1829 return true;
1830}
1831
1832void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
6aa8b732 1833{
1e02ce4c 1834 unsigned long host_cr4_mce = cr4_read_shadow() & X86_CR4_MCE;
dc924b06 1835 unsigned long old_cr4 = vcpu->arch.cr4;
e5eab0ce
JR
1836
1837 if (npt_enabled && ((old_cr4 ^ cr4) & X86_CR4_PGE))
4d9c83f5 1838 svm_flush_tlb_current(vcpu);
6394b649 1839
ec077263 1840 vcpu->arch.cr4 = cr4;
c53bbe21 1841 if (!npt_enabled) {
ec077263 1842 cr4 |= X86_CR4_PAE;
c53bbe21
ML
1843
1844 if (!is_paging(vcpu))
1845 cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE);
1846 }
6394b649 1847 cr4 |= host_cr4_mce;
ec077263 1848 to_svm(vcpu)->vmcb->save.cr4 = cr4;
06e7852c 1849 vmcb_mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR);
2259c17f
JM
1850
1851 if ((cr4 ^ old_cr4) & (X86_CR4_OSXSAVE | X86_CR4_PKE))
1852 kvm_update_cpuid_runtime(vcpu);
6aa8b732
AK
1853}
1854
1855static void svm_set_segment(struct kvm_vcpu *vcpu,
1856 struct kvm_segment *var, int seg)
1857{
a2fa3e9f 1858 struct vcpu_svm *svm = to_svm(vcpu);
6aa8b732
AK
1859 struct vmcb_seg *s = svm_seg(vcpu, seg);
1860
1861 s->base = var->base;
1862 s->limit = var->limit;
1863 s->selector = var->selector;
d9c1b543
RP
1864 s->attrib = (var->type & SVM_SELECTOR_TYPE_MASK);
1865 s->attrib |= (var->s & 1) << SVM_SELECTOR_S_SHIFT;
1866 s->attrib |= (var->dpl & 3) << SVM_SELECTOR_DPL_SHIFT;
1867 s->attrib |= ((var->present & 1) && !var->unusable) << SVM_SELECTOR_P_SHIFT;
1868 s->attrib |= (var->avl & 1) << SVM_SELECTOR_AVL_SHIFT;
1869 s->attrib |= (var->l & 1) << SVM_SELECTOR_L_SHIFT;
1870 s->attrib |= (var->db & 1) << SVM_SELECTOR_DB_SHIFT;
1871 s->attrib |= (var->g & 1) << SVM_SELECTOR_G_SHIFT;
ae9fedc7
PB
1872
1873 /*
1874 * This is always accurate, except if SYSRET returned to a segment
1875 * with SS.DPL != 3. Intel does not have this quirk, and always
1876 * forces SS.DPL to 3 on sysret, so we ignore that case; fixing it
1877 * would entail passing the CPL to userspace and back.
1878 */
1879 if (seg == VCPU_SREG_SS)
d9c1b543
RP
1880 /* This is symmetric with svm_get_segment() */
1881 svm->vmcb->save.cpl = (var->dpl & 3);
6aa8b732 1882
06e7852c 1883 vmcb_mark_dirty(svm->vmcb, VMCB_SEG);
6aa8b732
AK
1884}
1885
b6a7cc35 1886static void svm_update_exception_bitmap(struct kvm_vcpu *vcpu)
6aa8b732 1887{
d0bfb940
JK
1888 struct vcpu_svm *svm = to_svm(vcpu);
1889
18c918c5 1890 clr_exception_intercept(svm, BP_VECTOR);
44c11430 1891
d0bfb940 1892 if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) {
d0bfb940 1893 if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
18c918c5 1894 set_exception_intercept(svm, BP_VECTOR);
6986982f 1895 }
44c11430
GN
1896}
1897
0fe1e009 1898static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd)
6aa8b732 1899{
0fe1e009
TH
1900 if (sd->next_asid > sd->max_asid) {
1901 ++sd->asid_generation;
4faefff3 1902 sd->next_asid = sd->min_asid;
a2fa3e9f 1903 svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ALL_ASID;
7e8e6eed 1904 vmcb_mark_dirty(svm->vmcb, VMCB_ASID);
6aa8b732
AK
1905 }
1906
193015ad 1907 svm->current_vmcb->asid_generation = sd->asid_generation;
7e8e6eed 1908 svm->asid = sd->next_asid++;
6aa8b732
AK
1909}
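
/*
 * Standalone model (not kernel code) of the per-CPU ASID scheme used by
 * new_asid() above: ASIDs are handed out from min_asid..max_asid; when the
 * pool is exhausted the generation is bumped (together with a full ASID
 * flush) and allocation restarts.  A vCPU's ASID stays valid only while its
 * recorded generation matches the CPU's.
 */
#include <stdbool.h>
#include <stdint.h>

struct sketch_cpu_asid {
	uint64_t generation;
	uint32_t min_asid, max_asid, next_asid;
	bool flush_all;
};

struct sketch_vcpu_asid {
	uint64_t generation;
	uint32_t asid;
};

void sketch_new_asid(struct sketch_vcpu_asid *v, struct sketch_cpu_asid *sd)
{
	if (sd->next_asid > sd->max_asid) {
		++sd->generation;		/* invalidates every old ASID */
		sd->next_asid = sd->min_asid;
		sd->flush_all = true;		/* stands in for TLB_CONTROL_FLUSH_ALL_ASID */
	}
	v->generation = sd->generation;
	v->asid = sd->next_asid++;
}

/* A vCPU needs a fresh ASID whenever its generation is stale. */
bool sketch_asid_is_stale(const struct sketch_vcpu_asid *v,
			  const struct sketch_cpu_asid *sd)
{
	return v->generation != sd->generation;
}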
1910
d67668e9 1911static void svm_set_dr6(struct vcpu_svm *svm, unsigned long value)
73aaf249 1912{
d67668e9 1913 struct vmcb *vmcb = svm->vmcb;
73aaf249 1914
8d4846b9
TL
1915 if (svm->vcpu.arch.guest_state_protected)
1916 return;
1917
d67668e9
PB
1918 if (unlikely(value != vmcb->save.dr6)) {
1919 vmcb->save.dr6 = value;
06e7852c 1920 vmcb_mark_dirty(vmcb, VMCB_DR);
d67668e9 1921 }
73aaf249
JK
1922}
1923
facb0139
PB
1924static void svm_sync_dirty_debug_regs(struct kvm_vcpu *vcpu)
1925{
1926 struct vcpu_svm *svm = to_svm(vcpu);
1927
8d4846b9
TL
1928 if (vcpu->arch.guest_state_protected)
1929 return;
1930
facb0139
PB
1931 get_debugreg(vcpu->arch.db[0], 0);
1932 get_debugreg(vcpu->arch.db[1], 1);
1933 get_debugreg(vcpu->arch.db[2], 2);
1934 get_debugreg(vcpu->arch.db[3], 3);
d67668e9 1935 /*
9a3ecd5e 1936 * We cannot reset svm->vmcb->save.dr6 to DR6_ACTIVE_LOW here,
d67668e9
PB
1937 * because db_interception might need it. We can do it before vmentry.
1938 */
5679b803 1939 vcpu->arch.dr6 = svm->vmcb->save.dr6;
facb0139 1940 vcpu->arch.dr7 = svm->vmcb->save.dr7;
facb0139
PB
1941 vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT;
1942 set_dr_intercepts(svm);
1943}
1944
020df079 1945static void svm_set_dr7(struct kvm_vcpu *vcpu, unsigned long value)
6aa8b732 1946{
42dbaa5a 1947 struct vcpu_svm *svm = to_svm(vcpu);
42dbaa5a 1948
8d4846b9
TL
1949 if (vcpu->arch.guest_state_protected)
1950 return;
1951
020df079 1952 svm->vmcb->save.dr7 = value;
06e7852c 1953 vmcb_mark_dirty(svm->vmcb, VMCB_DR);
6aa8b732
AK
1954}
1955
63129754 1956static int pf_interception(struct kvm_vcpu *vcpu)
6aa8b732 1957{
63129754
PB
1958 struct vcpu_svm *svm = to_svm(vcpu);
1959
6d1b867d 1960 u64 fault_address = svm->vmcb->control.exit_info_2;
1261bfa3 1961 u64 error_code = svm->vmcb->control.exit_info_1;
6aa8b732 1962
63129754 1963 return kvm_handle_page_fault(vcpu, error_code, fault_address,
00b10fe1
BS
1964 static_cpu_has(X86_FEATURE_DECODEASSISTS) ?
1965 svm->vmcb->control.insn_bytes : NULL,
d0006530
PB
1966 svm->vmcb->control.insn_len);
1967}
1968
63129754 1969static int npf_interception(struct kvm_vcpu *vcpu)
d0006530 1970{
63129754
PB
1971 struct vcpu_svm *svm = to_svm(vcpu);
1972
76ff371b 1973 u64 fault_address = svm->vmcb->control.exit_info_2;
d0006530
PB
1974 u64 error_code = svm->vmcb->control.exit_info_1;
1975
1976 trace_kvm_page_fault(fault_address, error_code);
63129754 1977 return kvm_mmu_page_fault(vcpu, fault_address, error_code,
00b10fe1
BS
1978 static_cpu_has(X86_FEATURE_DECODEASSISTS) ?
1979 svm->vmcb->control.insn_bytes : NULL,
d0006530 1980 svm->vmcb->control.insn_len);
6aa8b732
AK
1981}
1982
63129754 1983static int db_interception(struct kvm_vcpu *vcpu)
d0bfb940 1984{
63129754
PB
1985 struct kvm_run *kvm_run = vcpu->run;
1986 struct vcpu_svm *svm = to_svm(vcpu);
851ba692 1987
63129754 1988 if (!(vcpu->guest_debug &
44c11430 1989 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) &&
6be7d306 1990 !svm->nmi_singlestep) {
9a3ecd5e 1991 u32 payload = svm->vmcb->save.dr6 ^ DR6_ACTIVE_LOW;
63129754 1992 kvm_queue_exception_p(vcpu, DB_VECTOR, payload);
d0bfb940
JK
1993 return 1;
1994 }
44c11430 1995
6be7d306 1996 if (svm->nmi_singlestep) {
4aebd0e9 1997 disable_nmi_singlestep(svm);
99c22179
VK
1998 /* Make sure we check for pending NMIs upon entry */
1999 kvm_make_request(KVM_REQ_EVENT, vcpu);
44c11430
GN
2000 }
2001
63129754 2002 if (vcpu->guest_debug &
e0231715 2003 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) {
44c11430 2004 kvm_run->exit_reason = KVM_EXIT_DEBUG;
dee919d1
PB
2005 kvm_run->debug.arch.dr6 = svm->vmcb->save.dr6;
2006 kvm_run->debug.arch.dr7 = svm->vmcb->save.dr7;
44c11430
GN
2007 kvm_run->debug.arch.pc =
2008 svm->vmcb->save.cs.base + svm->vmcb->save.rip;
2009 kvm_run->debug.arch.exception = DB_VECTOR;
2010 return 0;
2011 }
2012
2013 return 1;
d0bfb940
JK
2014}
2015
63129754 2016static int bp_interception(struct kvm_vcpu *vcpu)
d0bfb940 2017{
63129754
PB
2018 struct vcpu_svm *svm = to_svm(vcpu);
2019 struct kvm_run *kvm_run = vcpu->run;
851ba692 2020
d0bfb940
JK
2021 kvm_run->exit_reason = KVM_EXIT_DEBUG;
2022 kvm_run->debug.arch.pc = svm->vmcb->save.cs.base + svm->vmcb->save.rip;
2023 kvm_run->debug.arch.exception = BP_VECTOR;
2024 return 0;
2025}
2026
63129754 2027static int ud_interception(struct kvm_vcpu *vcpu)
7aa81cc0 2028{
63129754 2029 return handle_ud(vcpu);
7aa81cc0
AL
2030}
2031
63129754 2032static int ac_interception(struct kvm_vcpu *vcpu)
54a20552 2033{
63129754 2034 kvm_queue_exception_e(vcpu, AC_VECTOR, 0);
54a20552
EN
2035 return 1;
2036}
2037
67ec6607
JR
2038static bool is_erratum_383(void)
2039{
2040 int err, i;
2041 u64 value;
2042
2043 if (!erratum_383_found)
2044 return false;
2045
2046 value = native_read_msr_safe(MSR_IA32_MC0_STATUS, &err);
2047 if (err)
2048 return false;
2049
2050 /* Bit 62 may or may not be set for this mce */
2051 value &= ~(1ULL << 62);
2052
2053 if (value != 0xb600000000010015ULL)
2054 return false;
2055
2056 /* Clear MCi_STATUS registers */
2057 for (i = 0; i < 6; ++i)
2058 native_write_msr_safe(MSR_IA32_MCx_STATUS(i), 0, 0);
2059
2060 value = native_read_msr_safe(MSR_IA32_MCG_STATUS, &err);
2061 if (!err) {
2062 u32 low, high;
2063
2064 value &= ~(1ULL << 2);
2065 low = lower_32_bits(value);
2066 high = upper_32_bits(value);
2067
2068 native_write_msr_safe(MSR_IA32_MCG_STATUS, low, high);
2069 }
2070
2071 /* Flush tlb to evict multi-match entries */
2072 __flush_tlb_all();
2073
2074 return true;
2075}
2076
63129754 2077static void svm_handle_mce(struct kvm_vcpu *vcpu)
53371b50 2078{
67ec6607
JR
2079 if (is_erratum_383()) {
2080 /*
2081 * Erratum 383 triggered. Guest state is corrupt so kill the
2082 * guest.
2083 */
2084 pr_err("KVM: Guest triggered AMD Erratum 383\n");
2085
63129754 2086 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
67ec6607
JR
2087
2088 return;
2089 }
2090
53371b50
JR
2091 /*
2092 * On an #MC intercept the MCE handler is not called automatically in
2093 * the host. So do it by hand here.
2094 */
1c164cb3 2095 kvm_machine_check();
fe5913e4
JR
2096}
2097
63129754 2098static int mc_interception(struct kvm_vcpu *vcpu)
fe5913e4 2099{
53371b50
JR
2100 return 1;
2101}
2102
63129754 2103static int shutdown_interception(struct kvm_vcpu *vcpu)
46fe4ddd 2104{
63129754
PB
2105 struct kvm_run *kvm_run = vcpu->run;
2106 struct vcpu_svm *svm = to_svm(vcpu);
851ba692 2107
8164a5ff
TL
2108 /*
2109 * The VM save area has already been encrypted so it
2110 * cannot be reinitialized - just terminate.
2111 */
63129754 2112 if (sev_es_guest(vcpu->kvm))
8164a5ff
TL
2113 return -EINVAL;
2114
46fe4ddd 2115 /*
265e4353
SC
2116 * VMCB is undefined after a SHUTDOWN intercept. INIT the vCPU to put
2117	 * the VMCB in a known good state.  Unfortunately, KVM doesn't have
2118 * KVM_MP_STATE_SHUTDOWN and can't add it without potentially breaking
2119	 * userspace.  From a platform's point of view, INIT is acceptable behavior as
2120 * there exist bare metal platforms that automatically INIT the CPU
2121 * in response to shutdown.
46fe4ddd 2122 */
a2fa3e9f 2123 clear_page(svm->vmcb);
265e4353 2124 kvm_vcpu_reset(vcpu, true);
46fe4ddd
JR
2125
2126 kvm_run->exit_reason = KVM_EXIT_SHUTDOWN;
2127 return 0;
2128}
2129
63129754 2130static int io_interception(struct kvm_vcpu *vcpu)
6aa8b732 2131{
63129754 2132 struct vcpu_svm *svm = to_svm(vcpu);
d77c26fc 2133 u32 io_info = svm->vmcb->control.exit_info_1; /* address size bug? */
dca7f128 2134 int size, in, string;
039576c0 2135 unsigned port;
6aa8b732 2136
63129754 2137 ++vcpu->stat.io_exits;
e70669ab 2138 string = (io_info & SVM_IOIO_STR_MASK) != 0;
039576c0
AK
2139 in = (io_info & SVM_IOIO_TYPE_MASK) != 0;
2140 port = io_info >> 16;
2141 size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT;
7ed9abfe
TL
2142
2143 if (string) {
2144 if (sev_es_guest(vcpu->kvm))
2145 return sev_es_string_io(svm, size, port, in);
2146 else
2147 return kvm_emulate_instruction(vcpu, 0);
2148 }
2149
cf8f70bf 2150 svm->next_rip = svm->vmcb->control.exit_info_2;
cf8f70bf 2151
63129754 2152 return kvm_fast_pio(vcpu, size, port, in);
c47f098d
JR
2153}
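
/*
 * Standalone sketch (not kernel code) of decoding the IOIO exit information
 * word used by io_interception() above.  The bit layout is an assumption
 * mirroring the SVM_IOIO_* definitions in asm/svm.h: bit 0 = IN (vs. OUT),
 * bit 2 = string, bit 3 = REP, bits 4-6 = access size in bytes,
 * bits 16-31 = port number.
 */
#include <stdint.h>

struct sketch_ioio {
	uint16_t port;
	uint8_t size;		/* 1, 2 or 4 bytes */
	uint8_t in, string, rep;
};

struct sketch_ioio sketch_decode_ioio(uint32_t io_info)
{
	struct sketch_ioio d = {
		.in     = io_info & 1,
		.string = (io_info >> 2) & 1,
		.rep    = (io_info >> 3) & 1,
		.size   = (io_info >> 4) & 7,
		.port   = (uint16_t)(io_info >> 16),
	};
	return d;
}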
2154
63129754 2155static int nmi_interception(struct kvm_vcpu *vcpu)
a0698055 2156{
a0698055
JR
2157 return 1;
2158}
2159
991afbbe
ML
2160static int smi_interception(struct kvm_vcpu *vcpu)
2161{
2162 return 1;
2163}
2164
63129754 2165static int intr_interception(struct kvm_vcpu *vcpu)
6aa8b732 2166{
63129754 2167 ++vcpu->stat.irq_exits;
6aa8b732
AK
2168 return 1;
2169}
2170
2ac636a6 2171static int vmload_vmsave_interception(struct kvm_vcpu *vcpu, bool vmload)
6aa8b732 2172{
63129754 2173 struct vcpu_svm *svm = to_svm(vcpu);
9e8f0fbf 2174 struct vmcb *vmcb12;
8c5fbf1a 2175 struct kvm_host_map map;
b742c1e6 2176 int ret;
9966bf68 2177
63129754 2178 if (nested_svm_check_permissions(vcpu))
5542675b
AG
2179 return 1;
2180
63129754 2181 ret = kvm_vcpu_map(vcpu, gpa_to_gfn(svm->vmcb->save.rax), &map);
8c5fbf1a
KA
2182 if (ret) {
2183 if (ret == -EINVAL)
63129754 2184 kvm_inject_gp(vcpu, 0);
9966bf68 2185 return 1;
8c5fbf1a
KA
2186 }
2187
9e8f0fbf 2188 vmcb12 = map.hva;
9966bf68 2189
63129754 2190 ret = kvm_skip_emulated_instruction(vcpu);
9966bf68 2191
adc2a237 2192 if (vmload) {
2bb16bea 2193 svm_copy_vmloadsave_state(svm->vmcb, vmcb12);
adc2a237
ML
2194 svm->sysenter_eip_hi = 0;
2195 svm->sysenter_esp_hi = 0;
9a9e7481 2196 } else {
2bb16bea 2197 svm_copy_vmloadsave_state(vmcb12, svm->vmcb);
9a9e7481 2198 }
e3e9ed3d 2199
63129754 2200 kvm_vcpu_unmap(vcpu, &map, true);
5542675b 2201
b742c1e6 2202 return ret;
5542675b
AG
2203}
2204
2ac636a6 2205static int vmload_interception(struct kvm_vcpu *vcpu)
5542675b 2206{
2ac636a6
SC
2207 return vmload_vmsave_interception(vcpu, true);
2208}
5542675b 2209
2ac636a6
SC
2210static int vmsave_interception(struct kvm_vcpu *vcpu)
2211{
2212 return vmload_vmsave_interception(vcpu, false);
5542675b
AG
2213}
2214
63129754 2215static int vmrun_interception(struct kvm_vcpu *vcpu)
3d6368ef 2216{
63129754 2217 if (nested_svm_check_permissions(vcpu))
3d6368ef
AG
2218 return 1;
2219
63129754 2220 return nested_svm_vmrun(vcpu);
3d6368ef
AG
2221}
2222
82a11e9c
BD
2223enum {
2224 NONE_SVM_INSTR,
2225 SVM_INSTR_VMRUN,
2226 SVM_INSTR_VMLOAD,
2227 SVM_INSTR_VMSAVE,
2228};
2229
2230/* Return NONE_SVM_INSTR if this is not an SVM instruction, otherwise return the decode result */
2231static int svm_instr_opcode(struct kvm_vcpu *vcpu)
2232{
2233 struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
2234
2235 if (ctxt->b != 0x1 || ctxt->opcode_len != 2)
2236 return NONE_SVM_INSTR;
2237
2238 switch (ctxt->modrm) {
2239 case 0xd8: /* VMRUN */
2240 return SVM_INSTR_VMRUN;
2241 case 0xda: /* VMLOAD */
2242 return SVM_INSTR_VMLOAD;
2243 case 0xdb: /* VMSAVE */
2244 return SVM_INSTR_VMSAVE;
2245 default:
2246 break;
2247 }
2248
2249 return NONE_SVM_INSTR;
2250}
2251
2252static int emulate_svm_instr(struct kvm_vcpu *vcpu, int opcode)
2253{
14c2bf81
WH
2254 const int guest_mode_exit_codes[] = {
2255 [SVM_INSTR_VMRUN] = SVM_EXIT_VMRUN,
2256 [SVM_INSTR_VMLOAD] = SVM_EXIT_VMLOAD,
2257 [SVM_INSTR_VMSAVE] = SVM_EXIT_VMSAVE,
2258 };
63129754 2259 int (*const svm_instr_handlers[])(struct kvm_vcpu *vcpu) = {
82a11e9c
BD
2260 [SVM_INSTR_VMRUN] = vmrun_interception,
2261 [SVM_INSTR_VMLOAD] = vmload_interception,
2262 [SVM_INSTR_VMSAVE] = vmsave_interception,
2263 };
2264 struct vcpu_svm *svm = to_svm(vcpu);
2df8d380 2265 int ret;
82a11e9c 2266
14c2bf81 2267 if (is_guest_mode(vcpu)) {
2df8d380 2268 /* Returns '1' or -errno on failure, '0' on success. */
3a87c7e0 2269 ret = nested_svm_simple_vmexit(svm, guest_mode_exit_codes[opcode]);
2df8d380
SC
2270 if (ret)
2271 return ret;
2272 return 1;
2273 }
63129754 2274 return svm_instr_handlers[opcode](vcpu);
82a11e9c
BD
2275}
2276
2277/*
2278 * #GP handling code. Note that #GP can be triggered under the following two
2279 * cases:
2280 * 1) SVM VM-related instructions (VMRUN/VMSAVE/VMLOAD) that trigger #GP on
2282 * some AMD CPUs when the EAX operand of these instructions points into
2283 * reserved memory regions (e.g. SMM memory on the host).
2283 * 2) VMware backdoor
2284 */
63129754 2285static int gp_interception(struct kvm_vcpu *vcpu)
82a11e9c 2286{
63129754 2287 struct vcpu_svm *svm = to_svm(vcpu);
82a11e9c
BD
2288 u32 error_code = svm->vmcb->control.exit_info_1;
2289 int opcode;
2290
2291 /* Both #GP cases have zero error_code */
2292 if (error_code)
2293 goto reinject;
2294
2295 /* Decode the instruction for usage later */
2296 if (x86_decode_emulated_instruction(vcpu, 0, NULL, 0) != EMULATION_OK)
2297 goto reinject;
2298
2299 opcode = svm_instr_opcode(vcpu);
2300
2301 if (opcode == NONE_SVM_INSTR) {
2302 if (!enable_vmware_backdoor)
2303 goto reinject;
2304
2305 /*
2306 * VMware backdoor emulation on #GP interception only handles
2307 * IN{S}, OUT{S}, and RDPMC.
2308 */
14c2bf81
WH
2309 if (!is_guest_mode(vcpu))
2310 return kvm_emulate_instruction(vcpu,
82a11e9c 2311 EMULTYPE_VMWARE_GP | EMULTYPE_NO_DECODE);
47c28d43
DV
2312 } else {
2313 /* All SVM instructions expect page aligned RAX */
2314 if (svm->vmcb->save.rax & ~PAGE_MASK)
2315 goto reinject;
2316
82a11e9c 2317 return emulate_svm_instr(vcpu, opcode);
47c28d43 2318 }
82a11e9c
BD
2319
2320reinject:
2321 kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
2322 return 1;
2323}
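
/*
 * Standalone sketch (not kernel code): classifying raw instruction bytes the
 * same way svm_instr_opcode() classifies the emulator's decode.  VMRUN,
 * VMLOAD and VMSAVE all share the two-byte 0F 01 opcode and differ only in
 * the ModRM byte (0xd8, 0xda and 0xdb respectively); instruction prefixes
 * are ignored here for brevity.
 */
#include <stddef.h>
#include <stdint.h>

enum sketch_svm_instr { SK_NONE, SK_VMRUN, SK_VMLOAD, SK_VMSAVE };

enum sketch_svm_instr sketch_classify_svm_instr(const uint8_t *insn, size_t len)
{
	if (len < 3 || insn[0] != 0x0f || insn[1] != 0x01)
		return SK_NONE;

	switch (insn[2]) {
	case 0xd8: return SK_VMRUN;
	case 0xda: return SK_VMLOAD;
	case 0xdb: return SK_VMSAVE;
	default:   return SK_NONE;
	}
}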
2324
ffdf7f9e
PB
2325void svm_set_gif(struct vcpu_svm *svm, bool value)
2326{
2327 if (value) {
2328 /*
2329 * If VGIF is enabled, the STGI intercept is only added to
2330 * detect the opening of the SMI/NMI window; remove it now.
2331 * Likewise, clear the VINTR intercept, we will set it
2332 * again while processing KVM_REQ_EVENT if needed.
2333 */
ea91559b 2334 if (vgif)
a284ba56
JR
2335 svm_clr_intercept(svm, INTERCEPT_STGI);
2336 if (svm_is_intercept(svm, INTERCEPT_VINTR))
ffdf7f9e
PB
2337 svm_clear_vintr(svm);
2338
2339 enable_gif(svm);
2340 if (svm->vcpu.arch.smi_pending ||
2341 svm->vcpu.arch.nmi_pending ||
2342 kvm_cpu_has_injectable_intr(&svm->vcpu))
2343 kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
2344 } else {
2345 disable_gif(svm);
2346
2347 /*
2348 * After a CLGI no interrupts should come. But if vGIF is
2349 * in use, we still rely on the VINTR intercept (rather than
2350 * STGI) to detect an open interrupt window.
2351 */
ea91559b 2352 if (!vgif)
ffdf7f9e
PB
2353 svm_clear_vintr(svm);
2354 }
2355}
2356
63129754 2357static int stgi_interception(struct kvm_vcpu *vcpu)
1371d904 2358{
b742c1e6
LP
2359 int ret;
2360
63129754 2361 if (nested_svm_check_permissions(vcpu))
1371d904
AG
2362 return 1;
2363
63129754
PB
2364 ret = kvm_skip_emulated_instruction(vcpu);
2365 svm_set_gif(to_svm(vcpu), true);
b742c1e6 2366 return ret;
1371d904
AG
2367}
2368
63129754 2369static int clgi_interception(struct kvm_vcpu *vcpu)
1371d904 2370{
b742c1e6
LP
2371 int ret;
2372
63129754 2373 if (nested_svm_check_permissions(vcpu))
1371d904
AG
2374 return 1;
2375
63129754
PB
2376 ret = kvm_skip_emulated_instruction(vcpu);
2377 svm_set_gif(to_svm(vcpu), false);
b742c1e6 2378 return ret;
1371d904
AG
2379}
2380
63129754 2381static int invlpga_interception(struct kvm_vcpu *vcpu)
ff092385 2382{
bc9eff67
SC
2383 gva_t gva = kvm_rax_read(vcpu);
2384 u32 asid = kvm_rcx_read(vcpu);
ff092385 2385
bc9eff67
SC
2386 /* FIXME: Handle an address size prefix. */
2387 if (!is_long_mode(vcpu))
2388 gva = (u32)gva;
ff092385 2389
bc9eff67 2390 trace_kvm_invlpga(to_svm(vcpu)->vmcb->save.rip, asid, gva);
532a46b9 2391
ff092385 2392 /* Let's treat INVLPGA the same as INVLPG (can be optimized!) */
bc9eff67 2393 kvm_mmu_invlpg(vcpu, gva);
532a46b9 2394
63129754 2395 return kvm_skip_emulated_instruction(vcpu);
dab429a7
DK
2396}
2397
63129754 2398static int skinit_interception(struct kvm_vcpu *vcpu)
81dd35d4 2399{
63129754 2400 trace_kvm_skinit(to_svm(vcpu)->vmcb->save.rip, kvm_rax_read(vcpu));
81dd35d4 2401
63129754 2402 kvm_queue_exception(vcpu, UD_VECTOR);
0cb8410b
JM
2403 return 1;
2404}
2405
63129754 2406static int task_switch_interception(struct kvm_vcpu *vcpu)
6aa8b732 2407{
63129754 2408 struct vcpu_svm *svm = to_svm(vcpu);
37817f29 2409 u16 tss_selector;
64a7ec06
GN
2410 int reason;
2411 int int_type = svm->vmcb->control.exit_int_info &
2412 SVM_EXITINTINFO_TYPE_MASK;
8317c298 2413 int int_vec = svm->vmcb->control.exit_int_info & SVM_EVTINJ_VEC_MASK;
fe8e7f83
GN
2414 uint32_t type =
2415 svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_TYPE_MASK;
2416 uint32_t idt_v =
2417 svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_VALID;
e269fb21
JK
2418 bool has_error_code = false;
2419 u32 error_code = 0;
37817f29
IE
2420
2421 tss_selector = (u16)svm->vmcb->control.exit_info_1;
64a7ec06 2422
37817f29
IE
2423 if (svm->vmcb->control.exit_info_2 &
2424 (1ULL << SVM_EXITINFOSHIFT_TS_REASON_IRET))
64a7ec06
GN
2425 reason = TASK_SWITCH_IRET;
2426 else if (svm->vmcb->control.exit_info_2 &
2427 (1ULL << SVM_EXITINFOSHIFT_TS_REASON_JMP))
2428 reason = TASK_SWITCH_JMP;
fe8e7f83 2429 else if (idt_v)
64a7ec06
GN
2430 reason = TASK_SWITCH_GATE;
2431 else
2432 reason = TASK_SWITCH_CALL;
2433
fe8e7f83
GN
2434 if (reason == TASK_SWITCH_GATE) {
2435 switch (type) {
2436 case SVM_EXITINTINFO_TYPE_NMI:
63129754 2437 vcpu->arch.nmi_injected = false;
fe8e7f83
GN
2438 break;
2439 case SVM_EXITINTINFO_TYPE_EXEPT:
e269fb21
JK
2440 if (svm->vmcb->control.exit_info_2 &
2441 (1ULL << SVM_EXITINFOSHIFT_TS_HAS_ERROR_CODE)) {
2442 has_error_code = true;
2443 error_code =
2444 (u32)svm->vmcb->control.exit_info_2;
2445 }
63129754 2446 kvm_clear_exception_queue(vcpu);
fe8e7f83
GN
2447 break;
2448 case SVM_EXITINTINFO_TYPE_INTR:
bdc2d7ad 2449 case SVM_EXITINTINFO_TYPE_SOFT:
63129754 2450 kvm_clear_interrupt_queue(vcpu);
fe8e7f83
GN
2451 break;
2452 default:
2453 break;
2454 }
2455 }
64a7ec06 2456
8317c298
GN
2457 if (reason != TASK_SWITCH_GATE ||
2458 int_type == SVM_EXITINTINFO_TYPE_SOFT ||
2459 (int_type == SVM_EXITINTINFO_TYPE_EXEPT &&
f8ea7c60 2460 (int_vec == OF_VECTOR || int_vec == BP_VECTOR))) {
23e5092b 2461 if (!svm_skip_emulated_instruction(vcpu))
738fece4 2462 return 0;
f8ea7c60 2463 }
64a7ec06 2464
7f3d35fd
KW
2465 if (int_type != SVM_EXITINTINFO_TYPE_SOFT)
2466 int_vec = -1;
2467
63129754 2468 return kvm_task_switch(vcpu, tss_selector, int_vec, reason,
60fc3d02 2469 has_error_code, error_code);
6aa8b732
AK
2470}
2471
63129754 2472static int iret_interception(struct kvm_vcpu *vcpu)
6aa8b732 2473{
63129754 2474 struct vcpu_svm *svm = to_svm(vcpu);
6aa8b732 2475
63129754
PB
2476 ++vcpu->stat.nmi_window_exits;
2477 vcpu->arch.hflags |= HF_IRET_MASK;
2478 if (!sev_es_guest(vcpu->kvm)) {
4444dfe4 2479 svm_clr_intercept(svm, INTERCEPT_IRET);
63129754 2480 svm->nmi_iret_rip = kvm_rip_read(vcpu);
4444dfe4 2481 }
63129754 2482 kvm_make_request(KVM_REQ_EVENT, vcpu);
95ba8273
GN
2483 return 1;
2484}
2485
63129754 2486static int invlpg_interception(struct kvm_vcpu *vcpu)
a7052897 2487{
df4f3108 2488 if (!static_cpu_has(X86_FEATURE_DECODEASSISTS))
63129754 2489 return kvm_emulate_instruction(vcpu, 0);
df4f3108 2490
63129754
PB
2491 kvm_mmu_invlpg(vcpu, to_svm(vcpu)->vmcb->control.exit_info_1);
2492 return kvm_skip_emulated_instruction(vcpu);
a7052897
MT
2493}
2494
63129754 2495static int emulate_on_interception(struct kvm_vcpu *vcpu)
6aa8b732 2496{
63129754 2497 return kvm_emulate_instruction(vcpu, 0);
6aa8b732
AK
2498}
2499
63129754 2500static int rsm_interception(struct kvm_vcpu *vcpu)
7607b717 2501{
63129754 2502 return kvm_emulate_instruction_from_buffer(vcpu, rsm_ins_bytes, 2);
7607b717
BS
2503}
2504
63129754 2505static bool check_selective_cr0_intercepted(struct kvm_vcpu *vcpu,
52eb5a6d 2506 unsigned long val)
628afd2a 2507{
63129754
PB
2508 struct vcpu_svm *svm = to_svm(vcpu);
2509 unsigned long cr0 = vcpu->arch.cr0;
628afd2a 2510 bool ret = false;
628afd2a 2511
63129754 2512 if (!is_guest_mode(vcpu) ||
8fc78909 2513 (!(vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_SELECTIVE_CR0))))
628afd2a
JR
2514 return false;
2515
2516 cr0 &= ~SVM_CR0_SELECTIVE_MASK;
2517 val &= ~SVM_CR0_SELECTIVE_MASK;
2518
2519 if (cr0 ^ val) {
2520 svm->vmcb->control.exit_code = SVM_EXIT_CR0_SEL_WRITE;
2521 ret = (nested_svm_exit_handled(svm) == NESTED_EXIT_DONE);
2522 }
2523
2524 return ret;
2525}
2526
7ff76d58
AP
2527#define CR_VALID (1ULL << 63)
2528
63129754 2529static int cr_interception(struct kvm_vcpu *vcpu)
7ff76d58 2530{
63129754 2531 struct vcpu_svm *svm = to_svm(vcpu);
7ff76d58
AP
2532 int reg, cr;
2533 unsigned long val;
2534 int err;
2535
2536 if (!static_cpu_has(X86_FEATURE_DECODEASSISTS))
63129754 2537 return emulate_on_interception(vcpu);
7ff76d58
AP
2538
2539 if (unlikely((svm->vmcb->control.exit_info_1 & CR_VALID) == 0))
63129754 2540 return emulate_on_interception(vcpu);
7ff76d58
AP
2541
2542 reg = svm->vmcb->control.exit_info_1 & SVM_EXITINFO_REG_MASK;
5e57518d
DK
2543 if (svm->vmcb->control.exit_code == SVM_EXIT_CR0_SEL_WRITE)
2544 cr = SVM_EXIT_WRITE_CR0 - SVM_EXIT_READ_CR0;
2545 else
2546 cr = svm->vmcb->control.exit_code - SVM_EXIT_READ_CR0;
7ff76d58
AP
2547
2548 err = 0;
2549 if (cr >= 16) { /* mov to cr */
2550 cr -= 16;
27b4a9c4 2551 val = kvm_register_read(vcpu, reg);
95b28ac9 2552 trace_kvm_cr_write(cr, val);
7ff76d58
AP
2553 switch (cr) {
2554 case 0:
63129754
PB
2555 if (!check_selective_cr0_intercepted(vcpu, val))
2556 err = kvm_set_cr0(vcpu, val);
977b2d03
JR
2557 else
2558 return 1;
2559
7ff76d58
AP
2560 break;
2561 case 3:
63129754 2562 err = kvm_set_cr3(vcpu, val);
7ff76d58
AP
2563 break;
2564 case 4:
63129754 2565 err = kvm_set_cr4(vcpu, val);
7ff76d58
AP
2566 break;
2567 case 8:
63129754 2568 err = kvm_set_cr8(vcpu, val);
7ff76d58
AP
2569 break;
2570 default:
2571 WARN(1, "unhandled write to CR%d", cr);
63129754 2572 kvm_queue_exception(vcpu, UD_VECTOR);
7ff76d58
AP
2573 return 1;
2574 }
2575 } else { /* mov from cr */
2576 switch (cr) {
2577 case 0:
63129754 2578 val = kvm_read_cr0(vcpu);
7ff76d58
AP
2579 break;
2580 case 2:
63129754 2581 val = vcpu->arch.cr2;
7ff76d58
AP
2582 break;
2583 case 3:
63129754 2584 val = kvm_read_cr3(vcpu);
7ff76d58
AP
2585 break;
2586 case 4:
63129754 2587 val = kvm_read_cr4(vcpu);
7ff76d58
AP
2588 break;
2589 case 8:
63129754 2590 val = kvm_get_cr8(vcpu);
7ff76d58
AP
2591 break;
2592 default:
2593 WARN(1, "unhandled read from CR%d", cr);
63129754 2594 kvm_queue_exception(vcpu, UD_VECTOR);
7ff76d58
AP
2595 return 1;
2596 }
27b4a9c4 2597 kvm_register_write(vcpu, reg, val);
95b28ac9 2598 trace_kvm_cr_read(cr, val);
7ff76d58 2599 }
63129754 2600 return kvm_complete_insn_gp(vcpu, err);
7ff76d58
AP
2601}
2602
63129754 2603static int cr_trap(struct kvm_vcpu *vcpu)
f27ad38a 2604{
63129754 2605 struct vcpu_svm *svm = to_svm(vcpu);
f27ad38a
TL
2606 unsigned long old_value, new_value;
2607 unsigned int cr;
d1949b93 2608 int ret = 0;
f27ad38a
TL
2609
2610 new_value = (unsigned long)svm->vmcb->control.exit_info_1;
2611
2612 cr = svm->vmcb->control.exit_code - SVM_EXIT_CR0_WRITE_TRAP;
2613 switch (cr) {
2614 case 0:
2615 old_value = kvm_read_cr0(vcpu);
2616 svm_set_cr0(vcpu, new_value);
2617
2618 kvm_post_set_cr0(vcpu, old_value, new_value);
2619 break;
5b51cb13
TL
2620 case 4:
2621 old_value = kvm_read_cr4(vcpu);
2622 svm_set_cr4(vcpu, new_value);
2623
2624 kvm_post_set_cr4(vcpu, old_value, new_value);
2625 break;
d1949b93 2626 case 8:
63129754 2627 ret = kvm_set_cr8(vcpu, new_value);
d1949b93 2628 break;
f27ad38a
TL
2629 default:
2630 WARN(1, "unhandled CR%d write trap", cr);
2631 kvm_queue_exception(vcpu, UD_VECTOR);
2632 return 1;
2633 }
2634
d1949b93 2635 return kvm_complete_insn_gp(vcpu, ret);
f27ad38a
TL
2636}
2637
63129754 2638static int dr_interception(struct kvm_vcpu *vcpu)
cae3797a 2639{
63129754 2640 struct vcpu_svm *svm = to_svm(vcpu);
cae3797a
AP
2641 int reg, dr;
2642 unsigned long val;
996ff542 2643 int err = 0;
cae3797a 2644
63129754 2645 if (vcpu->guest_debug == 0) {
facb0139
PB
2646 /*
2647 * No more DR vmexits; force a reload of the debug registers
2648 * and reenter on this instruction. The next vmexit will
2649 * retrieve the full state of the debug registers.
2650 */
2651 clr_dr_intercepts(svm);
63129754 2652 vcpu->arch.switch_db_regs |= KVM_DEBUGREG_WONT_EXIT;
facb0139
PB
2653 return 1;
2654 }
2655
cae3797a 2656 if (!boot_cpu_has(X86_FEATURE_DECODEASSISTS))
63129754 2657 return emulate_on_interception(vcpu);
cae3797a
AP
2658
2659 reg = svm->vmcb->control.exit_info_1 & SVM_EXITINFO_REG_MASK;
2660 dr = svm->vmcb->control.exit_code - SVM_EXIT_READ_DR0;
996ff542
PB
2661 if (dr >= 16) { /* mov to DRn */
2662 dr -= 16;
27b4a9c4 2663 val = kvm_register_read(vcpu, reg);
63129754 2664 err = kvm_set_dr(vcpu, dr, val);
cae3797a 2665 } else {
63129754 2666 kvm_get_dr(vcpu, dr, &val);
27b4a9c4 2667 kvm_register_write(vcpu, reg, val);
cae3797a
AP
2668 }
2669
63129754 2670 return kvm_complete_insn_gp(vcpu, err);
cae3797a
AP
2671}
2672
63129754 2673static int cr8_write_interception(struct kvm_vcpu *vcpu)
1d075434 2674{
eea1cff9 2675 int r;
851ba692 2676
63129754 2677 u8 cr8_prev = kvm_get_cr8(vcpu);
0a5fff19 2678 /* instruction emulation calls kvm_set_cr8() */
63129754
PB
2679 r = cr_interception(vcpu);
2680 if (lapic_in_kernel(vcpu))
7ff76d58 2681 return r;
63129754 2682 if (cr8_prev <= kvm_get_cr8(vcpu))
7ff76d58 2683 return r;
63129754 2684 vcpu->run->exit_reason = KVM_EXIT_SET_TPR;
1d075434
JR
2685 return 0;
2686}
2687
63129754 2688static int efer_trap(struct kvm_vcpu *vcpu)
2985afbc
TL
2689{
2690 struct msr_data msr_info;
2691 int ret;
2692
2693 /*
2694 * Clear the EFER_SVME bit from EFER. The SVM code always sets this
2695 * bit in svm_set_efer(), but __kvm_valid_efer() checks it against
2696 * whether the guest has X86_FEATURE_SVM - this avoids a failure if
2697 * the guest doesn't have X86_FEATURE_SVM.
2698 */
2699 msr_info.host_initiated = false;
2700 msr_info.index = MSR_EFER;
63129754
PB
2701 msr_info.data = to_svm(vcpu)->vmcb->control.exit_info_1 & ~EFER_SVME;
2702 ret = kvm_set_msr_common(vcpu, &msr_info);
2985afbc 2703
63129754 2704 return kvm_complete_insn_gp(vcpu, ret);
2985afbc
TL
2705}
2706
801e459a
TL
2707static int svm_get_msr_feature(struct kvm_msr_entry *msr)
2708{
d1d93fa9
TL
2709 msr->data = 0;
2710
2711 switch (msr->index) {
2712 case MSR_F10H_DECFG:
2713 if (boot_cpu_has(X86_FEATURE_LFENCE_RDTSC))
2714 msr->data |= MSR_F10H_DECFG_LFENCE_SERIALIZE;
2715 break;
d574c539
VK
2716 case MSR_IA32_PERF_CAPABILITIES:
2717 return 0;
d1d93fa9 2718 default:
12bc2132 2719 return KVM_MSR_RET_INVALID;
d1d93fa9
TL
2720 }
2721
2722 return 0;
801e459a
TL
2723}
2724
609e36d3 2725static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
6aa8b732 2726{
a2fa3e9f
GH
2727 struct vcpu_svm *svm = to_svm(vcpu);
2728
609e36d3 2729 switch (msr_info->index) {
5228eb96
ML
2730 case MSR_AMD64_TSC_RATIO:
2731 if (!msr_info->host_initiated && !svm->tsc_scaling_enabled)
2732 return 1;
2733 msr_info->data = svm->tsc_ratio_msr;
2734 break;
8c06585d 2735 case MSR_STAR:
cc3ed80a 2736 msr_info->data = svm->vmcb01.ptr->save.star;
6aa8b732 2737 break;
0e859cac 2738#ifdef CONFIG_X86_64
6aa8b732 2739 case MSR_LSTAR:
cc3ed80a 2740 msr_info->data = svm->vmcb01.ptr->save.lstar;
6aa8b732
AK
2741 break;
2742 case MSR_CSTAR:
cc3ed80a 2743 msr_info->data = svm->vmcb01.ptr->save.cstar;
6aa8b732
AK
2744 break;
2745 case MSR_KERNEL_GS_BASE:
cc3ed80a 2746 msr_info->data = svm->vmcb01.ptr->save.kernel_gs_base;
6aa8b732
AK
2747 break;
2748 case MSR_SYSCALL_MASK:
cc3ed80a 2749 msr_info->data = svm->vmcb01.ptr->save.sfmask;
6aa8b732
AK
2750 break;
2751#endif
2752 case MSR_IA32_SYSENTER_CS:
cc3ed80a 2753 msr_info->data = svm->vmcb01.ptr->save.sysenter_cs;
6aa8b732
AK
2754 break;
2755 case MSR_IA32_SYSENTER_EIP:
adc2a237
ML
2756 msr_info->data = (u32)svm->vmcb01.ptr->save.sysenter_eip;
2757 if (guest_cpuid_is_intel(vcpu))
2758 msr_info->data |= (u64)svm->sysenter_eip_hi << 32;
6aa8b732
AK
2759 break;
2760 case MSR_IA32_SYSENTER_ESP:
adc2a237
ML
2761 msr_info->data = svm->vmcb01.ptr->save.sysenter_esp;
2762 if (guest_cpuid_is_intel(vcpu))
2763 msr_info->data |= (u64)svm->sysenter_esp_hi << 32;
6aa8b732 2764 break;
46896c73 2765 case MSR_TSC_AUX:
46896c73
PB
2766 msr_info->data = svm->tsc_aux;
2767 break;
a2938c80 2768 case MSR_IA32_DEBUGCTLMSR:
a2938c80 2769 case MSR_IA32_LASTBRANCHFROMIP:
a2938c80 2770 case MSR_IA32_LASTBRANCHTOIP:
a2938c80 2771 case MSR_IA32_LASTINTFROMIP:
a2938c80 2772 case MSR_IA32_LASTINTTOIP:
1d5a1b58 2773 msr_info->data = svm_get_lbr_msr(svm, msr_info->index);
a2938c80 2774 break;
b286d5d8 2775 case MSR_VM_HSAVE_PA:
609e36d3 2776 msr_info->data = svm->nested.hsave_msr;
b286d5d8 2777 break;
eb6f302e 2778 case MSR_VM_CR:
609e36d3 2779 msr_info->data = svm->nested.vm_cr_msr;
eb6f302e 2780 break;
b2ac58f9
KA
2781 case MSR_IA32_SPEC_CTRL:
2782 if (!msr_info->host_initiated &&
39485ed9 2783 !guest_has_spec_ctrl_msr(vcpu))
b2ac58f9
KA
2784 return 1;
2785
d00b99c5
BM
2786 if (boot_cpu_has(X86_FEATURE_V_SPEC_CTRL))
2787 msr_info->data = svm->vmcb->save.spec_ctrl;
2788 else
2789 msr_info->data = svm->spec_ctrl;
b2ac58f9 2790 break;
bc226f07
TL
2791 case MSR_AMD64_VIRT_SPEC_CTRL:
2792 if (!msr_info->host_initiated &&
2793 !guest_cpuid_has(vcpu, X86_FEATURE_VIRT_SSBD))
2794 return 1;
2795
2796 msr_info->data = svm->virt_spec_ctrl;
2797 break;
ae8b7875
BP
2798 case MSR_F15H_IC_CFG: {
2799
2800 int family, model;
2801
2802 family = guest_cpuid_family(vcpu);
2803 model = guest_cpuid_model(vcpu);
2804
2805 if (family < 0 || model < 0)
2806 return kvm_get_msr_common(vcpu, msr_info);
2807
2808 msr_info->data = 0;
2809
2810 if (family == 0x15 &&
2811 (model >= 0x2 && model < 0x20))
2812 msr_info->data = 0x1E;
2813 }
2814 break;
d1d93fa9
TL
2815 case MSR_F10H_DECFG:
2816 msr_info->data = svm->msr_decfg;
2817 break;
6aa8b732 2818 default:
609e36d3 2819 return kvm_get_msr_common(vcpu, msr_info);
6aa8b732
AK
2820 }
2821 return 0;
2822}
2823
f1c6366e
TL
2824static int svm_complete_emulated_msr(struct kvm_vcpu *vcpu, int err)
2825{
2826 struct vcpu_svm *svm = to_svm(vcpu);
b67a4cc3 2827 if (!err || !sev_es_guest(vcpu->kvm) || WARN_ON_ONCE(!svm->sev_es.ghcb))
63129754 2828 return kvm_complete_insn_gp(vcpu, err);
f1c6366e 2829
b67a4cc3
PG
2830 ghcb_set_sw_exit_info_1(svm->sev_es.ghcb, 1);
2831 ghcb_set_sw_exit_info_2(svm->sev_es.ghcb,
f1c6366e
TL
2832 X86_TRAP_GP |
2833 SVM_EVTINJ_TYPE_EXEPT |
2834 SVM_EVTINJ_VALID);
2835 return 1;
2836}
2837
4a810181
JR
2838static int svm_set_vm_cr(struct kvm_vcpu *vcpu, u64 data)
2839{
2840 struct vcpu_svm *svm = to_svm(vcpu);
2841 int svm_dis, chg_mask;
2842
2843 if (data & ~SVM_VM_CR_VALID_MASK)
2844 return 1;
2845
2846 chg_mask = SVM_VM_CR_VALID_MASK;
2847
2848 if (svm->nested.vm_cr_msr & SVM_VM_CR_SVM_DIS_MASK)
2849 chg_mask &= ~(SVM_VM_CR_SVM_LOCK_MASK | SVM_VM_CR_SVM_DIS_MASK);
2850
2851 svm->nested.vm_cr_msr &= ~chg_mask;
2852 svm->nested.vm_cr_msr |= (data & chg_mask);
2853
2854 svm_dis = svm->nested.vm_cr_msr & SVM_VM_CR_SVM_DIS_MASK;
2855
2856 /* check for svm_disable while efer.svme is set */
2857 if (svm_dis && (vcpu->arch.efer & EFER_SVME))
2858 return 1;
2859
2860 return 0;
2861}
2862
8fe8ab46 2863static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
6aa8b732 2864{
a2fa3e9f 2865 struct vcpu_svm *svm = to_svm(vcpu);
844d69c2 2866 int r;
a2fa3e9f 2867
8fe8ab46
WA
2868 u32 ecx = msr->index;
2869 u64 data = msr->data;
6aa8b732 2870 switch (ecx) {
5228eb96 2871 case MSR_AMD64_TSC_RATIO:
e910a53f
ML
2872
2873 if (!svm->tsc_scaling_enabled) {
2874
2875 if (!msr->host_initiated)
2876 return 1;
2877 /*
2878 * In case TSC scaling is not enabled, always
2879 * leave this MSR at the default value.
2880 *
2881			 * Due to a bug, qemu 6.2.0 tries to set this
2882			 * msr to 0 when tsc scaling is not enabled.
2883			 * Ignore that write as well.
2884 */
2885 if (data != 0 && data != svm->tsc_ratio_msr)
2886 return 1;
2887 break;
2888 }
5228eb96 2889
bb2aa78e 2890 if (data & SVM_TSC_RATIO_RSVD)
5228eb96
ML
2891 return 1;
2892
2893 svm->tsc_ratio_msr = data;
2894
2895 if (svm->tsc_scaling_enabled && is_guest_mode(vcpu))
2896 nested_svm_update_tsc_ratio_msr(vcpu);
2897
2898 break;
15038e14
PB
2899 case MSR_IA32_CR_PAT:
2900 if (!kvm_mtrr_valid(vcpu, MSR_IA32_CR_PAT, data))
2901 return 1;
2902 vcpu->arch.pat = data;
4995a368
CA
2903 svm->vmcb01.ptr->save.g_pat = data;
2904 if (is_guest_mode(vcpu))
2905 nested_vmcb02_compute_g_pat(svm);
06e7852c 2906 vmcb_mark_dirty(svm->vmcb, VMCB_NPT);
15038e14 2907 break;
b2ac58f9
KA
2908 case MSR_IA32_SPEC_CTRL:
2909 if (!msr->host_initiated &&
39485ed9 2910 !guest_has_spec_ctrl_msr(vcpu))
b2ac58f9
KA
2911 return 1;
2912
841c2be0 2913 if (kvm_spec_ctrl_test_value(data))
b2ac58f9
KA
2914 return 1;
2915
d00b99c5
BM
2916 if (boot_cpu_has(X86_FEATURE_V_SPEC_CTRL))
2917 svm->vmcb->save.spec_ctrl = data;
2918 else
2919 svm->spec_ctrl = data;
b2ac58f9
KA
2920 if (!data)
2921 break;
2922
2923 /*
2924 * For non-nested:
2925 * When it's written (to non-zero) for the first time, pass
2926 * it through.
2927 *
2928 * For nested:
2929 * The handling of the MSR bitmap for L2 guests is done in
2930 * nested_svm_vmrun_msrpm.
2931 * We update the L1 MSR bit as well since it will end up
2932 * touching the MSR anyway now.
2933 */
476c9bd8 2934 set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SPEC_CTRL, 1, 1);
b2ac58f9 2935 break;
15d45071
AR
2936 case MSR_IA32_PRED_CMD:
2937 if (!msr->host_initiated &&
39485ed9 2938 !guest_has_pred_cmd_msr(vcpu))
15d45071
AR
2939 return 1;
2940
2941 if (data & ~PRED_CMD_IBPB)
2942 return 1;
39485ed9 2943 if (!boot_cpu_has(X86_FEATURE_IBPB))
6441fa61 2944 return 1;
15d45071
AR
2945 if (!data)
2946 break;
2947
2948 wrmsrl(MSR_IA32_PRED_CMD, PRED_CMD_IBPB);
476c9bd8 2949 set_msr_interception(vcpu, svm->msrpm, MSR_IA32_PRED_CMD, 0, 1);
15d45071 2950 break;
bc226f07
TL
2951 case MSR_AMD64_VIRT_SPEC_CTRL:
2952 if (!msr->host_initiated &&
2953 !guest_cpuid_has(vcpu, X86_FEATURE_VIRT_SSBD))
2954 return 1;
2955
2956 if (data & ~SPEC_CTRL_SSBD)
2957 return 1;
2958
2959 svm->virt_spec_ctrl = data;
2960 break;
8c06585d 2961 case MSR_STAR:
cc3ed80a 2962 svm->vmcb01.ptr->save.star = data;
6aa8b732 2963 break;
49b14f24 2964#ifdef CONFIG_X86_64
6aa8b732 2965 case MSR_LSTAR:
cc3ed80a 2966 svm->vmcb01.ptr->save.lstar = data;
6aa8b732
AK
2967 break;
2968 case MSR_CSTAR:
cc3ed80a 2969 svm->vmcb01.ptr->save.cstar = data;
6aa8b732
AK
2970 break;
2971 case MSR_KERNEL_GS_BASE:
cc3ed80a 2972 svm->vmcb01.ptr->save.kernel_gs_base = data;
6aa8b732
AK
2973 break;
2974 case MSR_SYSCALL_MASK:
cc3ed80a 2975 svm->vmcb01.ptr->save.sfmask = data;
6aa8b732
AK
2976 break;
2977#endif
2978 case MSR_IA32_SYSENTER_CS:
cc3ed80a 2979 svm->vmcb01.ptr->save.sysenter_cs = data;
6aa8b732
AK
2980 break;
2981 case MSR_IA32_SYSENTER_EIP:
adc2a237
ML
2982 svm->vmcb01.ptr->save.sysenter_eip = (u32)data;
2983 /*
2984 * We only intercept the MSR_IA32_SYSENTER_{EIP|ESP} msrs
2985 * when we spoof an Intel vendor ID (for cross vendor migration).
2986 * In this case we use this intercept to track the high
2987 * 32 bit part of these msrs to support Intel's
2988 * implementation of SYSENTER/SYSEXIT.
2989 */
2990 svm->sysenter_eip_hi = guest_cpuid_is_intel(vcpu) ? (data >> 32) : 0;
6aa8b732
AK
2991 break;
2992 case MSR_IA32_SYSENTER_ESP:
adc2a237
ML
2993 svm->vmcb01.ptr->save.sysenter_esp = (u32)data;
2994 svm->sysenter_esp_hi = guest_cpuid_is_intel(vcpu) ? (data >> 32) : 0;
6aa8b732 2995 break;
46896c73 2996 case MSR_TSC_AUX:
46896c73 2997 /*
844d69c2
SC
2998 * TSC_AUX is usually changed only during boot and never read
2999 * directly. Intercept TSC_AUX instead of exposing it to the
3000 * guest via direct_access_msrs, and switch it via user return.
46896c73 3001 */
844d69c2 3002 preempt_disable();
0caa0a77 3003 r = kvm_set_user_return_msr(tsc_aux_uret_slot, data, -1ull);
844d69c2
SC
3004 preempt_enable();
3005 if (r)
3006 return 1;
3007
46896c73 3008 svm->tsc_aux = data;
46896c73 3009 break;
a2938c80 3010 case MSR_IA32_DEBUGCTLMSR:
4c84926e 3011 if (!lbrv) {
a737f256
CD
3012 vcpu_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTL 0x%llx, nop\n",
3013 __func__, data);
24e09cbf
JR
3014 break;
3015 }
3016 if (data & DEBUGCTL_RESERVED_BITS)
3017 return 1;
3018
1d5a1b58
ML
3019 if (svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK)
3020 svm->vmcb->save.dbgctl = data;
24e09cbf 3021 else
1d5a1b58
ML
3022 svm->vmcb01.ptr->save.dbgctl = data;
3023
3024 svm_update_lbrv(vcpu);
3025
a2938c80 3026 break;
b286d5d8 3027 case MSR_VM_HSAVE_PA:
fce7e152
VK
3028 /*
3029 * Old kernels did not validate the value written to
3030 * MSR_VM_HSAVE_PA. Allow KVM_SET_MSR to set an invalid
3031 * value to allow live migrating buggy or malicious guests
3032 * originating from those kernels.
3033 */
3034 if (!msr->host_initiated && !page_address_valid(vcpu, data))
3035 return 1;
3036
3037 svm->nested.hsave_msr = data & PAGE_MASK;
62b9abaa 3038 break;
3c5d0a44 3039 case MSR_VM_CR:
4a810181 3040 return svm_set_vm_cr(vcpu, data);
3c5d0a44 3041 case MSR_VM_IGNNE:
a737f256 3042 vcpu_unimpl(vcpu, "unimplemented wrmsr: 0x%x data 0x%llx\n", ecx, data);
3c5d0a44 3043 break;
d1d93fa9
TL
3044 case MSR_F10H_DECFG: {
3045 struct kvm_msr_entry msr_entry;
3046
3047 msr_entry.index = msr->index;
3048 if (svm_get_msr_feature(&msr_entry))
3049 return 1;
3050
3051 /* Check the supported bits */
3052 if (data & ~msr_entry.data)
3053 return 1;
3054
3055 /* Don't allow the guest to change a bit, #GP */
3056 if (!msr->host_initiated && (data ^ msr_entry.data))
3057 return 1;
3058
3059 svm->msr_decfg = data;
3060 break;
3061 }
6aa8b732 3062 default:
8fe8ab46 3063 return kvm_set_msr_common(vcpu, msr);
6aa8b732
AK
3064 }
3065 return 0;
3066}
3067
63129754 3068static int msr_interception(struct kvm_vcpu *vcpu)
6aa8b732 3069{
63129754 3070 if (to_svm(vcpu)->vmcb->control.exit_info_1)
5ff3a351 3071 return kvm_emulate_wrmsr(vcpu);
6aa8b732 3072 else
5ff3a351 3073 return kvm_emulate_rdmsr(vcpu);
6aa8b732
AK
3074}
3075
63129754 3076static int interrupt_window_interception(struct kvm_vcpu *vcpu)
c1150d8c 3077{
63129754
PB
3078 kvm_make_request(KVM_REQ_EVENT, vcpu);
3079 svm_clear_vintr(to_svm(vcpu));
f3515dc3
SS
3080
3081 /*
f44509f8 3082 * If not running nested, for AVIC, the only reason to end up here is ExtINTs.
f3515dc3
SS
3083 * In this case AVIC was temporarily disabled for
3084 * requesting the IRQ window and we have to re-enable it.
f44509f8
ML
3085 *
3086 * If running nested, still remove the VM wide AVIC inhibit to
3087	 * support the case in which the interrupt window was requested when the
3088 * vCPU was not running nested.
3089	 *
3090	 * All vCPUs that still run nested will keep their AVIC
3091	 * inhibited due to the per-vCPU AVIC inhibition.
f3515dc3 3092 */
320af55a 3093 kvm_clear_apicv_inhibit(vcpu->kvm, APICV_INHIBIT_REASON_IRQWIN);
f3515dc3 3094
63129754 3095 ++vcpu->stat.irq_window_exits;
c1150d8c
DL
3096 return 1;
3097}
3098
63129754 3099static int pause_interception(struct kvm_vcpu *vcpu)
565d0998 3100{
f1c6366e 3101 bool in_kernel;
f1c6366e
TL
3102 /*
3103 * CPL is not made available for an SEV-ES guest, therefore
3104 * vcpu->arch.preempted_in_kernel can never be true. Just
3105 * set in_kernel to false as well.
3106 */
63129754 3107 in_kernel = !sev_es_guest(vcpu->kvm) && svm_get_cpl(vcpu) == 0;
de63ad4c 3108
74fd41ed 3109 grow_ple_window(vcpu);
8566ac8b 3110
de63ad4c 3111 kvm_vcpu_on_spin(vcpu, in_kernel);
c8781fea 3112 return kvm_skip_emulated_instruction(vcpu);
87c00572
GS
3113}
3114
63129754 3115static int invpcid_interception(struct kvm_vcpu *vcpu)
87c00572 3116{
63129754 3117 struct vcpu_svm *svm = to_svm(vcpu);
4407a797
BM
3118 unsigned long type;
3119 gva_t gva;
3120
3121 if (!guest_cpuid_has(vcpu, X86_FEATURE_INVPCID)) {
3122 kvm_queue_exception(vcpu, UD_VECTOR);
3123 return 1;
3124 }
3125
3126 /*
3127 * For an INVPCID intercept:
3128 * EXITINFO1 provides the linear address of the memory operand.
3129 * EXITINFO2 provides the contents of the register operand.
3130 */
3131 type = svm->vmcb->control.exit_info_2;
3132 gva = svm->vmcb->control.exit_info_1;
3133
4407a797
BM
3134 return kvm_handle_invpcid(vcpu, type, gva);
3135}
3136
63129754 3137static int (*const svm_exit_handlers[])(struct kvm_vcpu *vcpu) = {
7ff76d58
AP
3138 [SVM_EXIT_READ_CR0] = cr_interception,
3139 [SVM_EXIT_READ_CR3] = cr_interception,
3140 [SVM_EXIT_READ_CR4] = cr_interception,
3141 [SVM_EXIT_READ_CR8] = cr_interception,
5e57518d 3142 [SVM_EXIT_CR0_SEL_WRITE] = cr_interception,
628afd2a 3143 [SVM_EXIT_WRITE_CR0] = cr_interception,
7ff76d58
AP
3144 [SVM_EXIT_WRITE_CR3] = cr_interception,
3145 [SVM_EXIT_WRITE_CR4] = cr_interception,
e0231715 3146 [SVM_EXIT_WRITE_CR8] = cr8_write_interception,
cae3797a
AP
3147 [SVM_EXIT_READ_DR0] = dr_interception,
3148 [SVM_EXIT_READ_DR1] = dr_interception,
3149 [SVM_EXIT_READ_DR2] = dr_interception,
3150 [SVM_EXIT_READ_DR3] = dr_interception,
3151 [SVM_EXIT_READ_DR4] = dr_interception,
3152 [SVM_EXIT_READ_DR5] = dr_interception,
3153 [SVM_EXIT_READ_DR6] = dr_interception,
3154 [SVM_EXIT_READ_DR7] = dr_interception,
3155 [SVM_EXIT_WRITE_DR0] = dr_interception,
3156 [SVM_EXIT_WRITE_DR1] = dr_interception,
3157 [SVM_EXIT_WRITE_DR2] = dr_interception,
3158 [SVM_EXIT_WRITE_DR3] = dr_interception,
3159 [SVM_EXIT_WRITE_DR4] = dr_interception,
3160 [SVM_EXIT_WRITE_DR5] = dr_interception,
3161 [SVM_EXIT_WRITE_DR6] = dr_interception,
3162 [SVM_EXIT_WRITE_DR7] = dr_interception,
d0bfb940
JK
3163 [SVM_EXIT_EXCP_BASE + DB_VECTOR] = db_interception,
3164 [SVM_EXIT_EXCP_BASE + BP_VECTOR] = bp_interception,
7aa81cc0 3165 [SVM_EXIT_EXCP_BASE + UD_VECTOR] = ud_interception,
e0231715 3166 [SVM_EXIT_EXCP_BASE + PF_VECTOR] = pf_interception,
e0231715 3167 [SVM_EXIT_EXCP_BASE + MC_VECTOR] = mc_interception,
54a20552 3168 [SVM_EXIT_EXCP_BASE + AC_VECTOR] = ac_interception,
9718420e 3169 [SVM_EXIT_EXCP_BASE + GP_VECTOR] = gp_interception,
e0231715 3170 [SVM_EXIT_INTR] = intr_interception,
c47f098d 3171 [SVM_EXIT_NMI] = nmi_interception,
991afbbe 3172 [SVM_EXIT_SMI] = smi_interception,
c1150d8c 3173 [SVM_EXIT_VINTR] = interrupt_window_interception,
32c23c7d 3174 [SVM_EXIT_RDPMC] = kvm_emulate_rdpmc,
5ff3a351 3175 [SVM_EXIT_CPUID] = kvm_emulate_cpuid,
95ba8273 3176 [SVM_EXIT_IRET] = iret_interception,
5ff3a351 3177 [SVM_EXIT_INVD] = kvm_emulate_invd,
565d0998 3178 [SVM_EXIT_PAUSE] = pause_interception,
5ff3a351 3179 [SVM_EXIT_HLT] = kvm_emulate_halt,
a7052897 3180 [SVM_EXIT_INVLPG] = invlpg_interception,
ff092385 3181 [SVM_EXIT_INVLPGA] = invlpga_interception,
e0231715 3182 [SVM_EXIT_IOIO] = io_interception,
6aa8b732
AK
3183 [SVM_EXIT_MSR] = msr_interception,
3184 [SVM_EXIT_TASK_SWITCH] = task_switch_interception,
46fe4ddd 3185 [SVM_EXIT_SHUTDOWN] = shutdown_interception,
3d6368ef 3186 [SVM_EXIT_VMRUN] = vmrun_interception,
5ff3a351 3187 [SVM_EXIT_VMMCALL] = kvm_emulate_hypercall,
5542675b
AG
3188 [SVM_EXIT_VMLOAD] = vmload_interception,
3189 [SVM_EXIT_VMSAVE] = vmsave_interception,
1371d904
AG
3190 [SVM_EXIT_STGI] = stgi_interception,
3191 [SVM_EXIT_CLGI] = clgi_interception,
532a46b9 3192 [SVM_EXIT_SKINIT] = skinit_interception,
3b195ac9 3193 [SVM_EXIT_RDTSCP] = kvm_handle_invalid_op,
5ff3a351
SC
3194 [SVM_EXIT_WBINVD] = kvm_emulate_wbinvd,
3195 [SVM_EXIT_MONITOR] = kvm_emulate_monitor,
3196 [SVM_EXIT_MWAIT] = kvm_emulate_mwait,
92f9895c 3197 [SVM_EXIT_XSETBV] = kvm_emulate_xsetbv,
5ff3a351 3198 [SVM_EXIT_RDPRU] = kvm_handle_invalid_op,
2985afbc 3199 [SVM_EXIT_EFER_WRITE_TRAP] = efer_trap,
f27ad38a 3200 [SVM_EXIT_CR0_WRITE_TRAP] = cr_trap,
5b51cb13 3201 [SVM_EXIT_CR4_WRITE_TRAP] = cr_trap,
d1949b93 3202 [SVM_EXIT_CR8_WRITE_TRAP] = cr_trap,
4407a797 3203 [SVM_EXIT_INVPCID] = invpcid_interception,
d0006530 3204 [SVM_EXIT_NPF] = npf_interception,
7607b717 3205 [SVM_EXIT_RSM] = rsm_interception,
18f40c53
SS
3206 [SVM_EXIT_AVIC_INCOMPLETE_IPI] = avic_incomplete_ipi_interception,
3207 [SVM_EXIT_AVIC_UNACCELERATED_ACCESS] = avic_unaccelerated_access_interception,
291bd20d 3208 [SVM_EXIT_VMGEXIT] = sev_handle_vmgexit,
6aa8b732
AK
3209};
3210
ae8cc059 3211static void dump_vmcb(struct kvm_vcpu *vcpu)
3f10c846
JR
3212{
3213 struct vcpu_svm *svm = to_svm(vcpu);
3214 struct vmcb_control_area *control = &svm->vmcb->control;
3215 struct vmcb_save_area *save = &svm->vmcb->save;
cc3ed80a 3216 struct vmcb_save_area *save01 = &svm->vmcb01.ptr->save;
3f10c846 3217
6f2f8453
PB
3218 if (!dump_invalid_vmcb) {
3219 pr_warn_ratelimited("set kvm_amd.dump_invalid_vmcb=1 to dump internal KVM state.\n");
3220 return;
3221 }
3222
18f63b15
JM
3223 pr_err("VMCB %p, last attempted VMRUN on CPU %d\n",
3224 svm->current_vmcb->ptr, vcpu->arch.last_vmentry_cpu);
3f10c846 3225 pr_err("VMCB Control Area:\n");
03bfeeb9
BM
3226 pr_err("%-20s%04x\n", "cr_read:", control->intercepts[INTERCEPT_CR] & 0xffff);
3227 pr_err("%-20s%04x\n", "cr_write:", control->intercepts[INTERCEPT_CR] >> 16);
30abaa88
BM
3228 pr_err("%-20s%04x\n", "dr_read:", control->intercepts[INTERCEPT_DR] & 0xffff);
3229 pr_err("%-20s%04x\n", "dr_write:", control->intercepts[INTERCEPT_DR] >> 16);
9780d51d 3230 pr_err("%-20s%08x\n", "exceptions:", control->intercepts[INTERCEPT_EXCEPTION]);
c62e2e94
BM
3231 pr_err("%-20s%08x %08x\n", "intercepts:",
3232 control->intercepts[INTERCEPT_WORD3],
3233 control->intercepts[INTERCEPT_WORD4]);
ae8cc059 3234 pr_err("%-20s%d\n", "pause filter count:", control->pause_filter_count);
1d8fb44a
BM
3235 pr_err("%-20s%d\n", "pause filter threshold:",
3236 control->pause_filter_thresh);
ae8cc059
JP
3237 pr_err("%-20s%016llx\n", "iopm_base_pa:", control->iopm_base_pa);
3238 pr_err("%-20s%016llx\n", "msrpm_base_pa:", control->msrpm_base_pa);
3239 pr_err("%-20s%016llx\n", "tsc_offset:", control->tsc_offset);
3240 pr_err("%-20s%d\n", "asid:", control->asid);
3241 pr_err("%-20s%d\n", "tlb_ctl:", control->tlb_ctl);
3242 pr_err("%-20s%08x\n", "int_ctl:", control->int_ctl);
3243 pr_err("%-20s%08x\n", "int_vector:", control->int_vector);
3244 pr_err("%-20s%08x\n", "int_state:", control->int_state);
3245 pr_err("%-20s%08x\n", "exit_code:", control->exit_code);
3246 pr_err("%-20s%016llx\n", "exit_info1:", control->exit_info_1);
3247 pr_err("%-20s%016llx\n", "exit_info2:", control->exit_info_2);
3248 pr_err("%-20s%08x\n", "exit_int_info:", control->exit_int_info);
3249 pr_err("%-20s%08x\n", "exit_int_info_err:", control->exit_int_info_err);
3250 pr_err("%-20s%lld\n", "nested_ctl:", control->nested_ctl);
3251 pr_err("%-20s%016llx\n", "nested_cr3:", control->nested_cr3);
44a95dae 3252 pr_err("%-20s%016llx\n", "avic_vapic_bar:", control->avic_vapic_bar);
291bd20d 3253 pr_err("%-20s%016llx\n", "ghcb:", control->ghcb_gpa);
ae8cc059
JP
3254 pr_err("%-20s%08x\n", "event_inj:", control->event_inj);
3255 pr_err("%-20s%08x\n", "event_inj_err:", control->event_inj_err);
0dc92119 3256 pr_err("%-20s%lld\n", "virt_ext:", control->virt_ext);
ae8cc059 3257 pr_err("%-20s%016llx\n", "next_rip:", control->next_rip);
44a95dae
SS
3258 pr_err("%-20s%016llx\n", "avic_backing_page:", control->avic_backing_page);
3259 pr_err("%-20s%016llx\n", "avic_logical_id:", control->avic_logical_id);
3260 pr_err("%-20s%016llx\n", "avic_physical_id:", control->avic_physical_id);
376c6d28 3261 pr_err("%-20s%016llx\n", "vmsa_pa:", control->vmsa_pa);
3f10c846 3262 pr_err("VMCB State Save Area:\n");
ae8cc059
JP
3263 pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3264 "es:",
3265 save->es.selector, save->es.attrib,
3266 save->es.limit, save->es.base);
3267 pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3268 "cs:",
3269 save->cs.selector, save->cs.attrib,
3270 save->cs.limit, save->cs.base);
3271 pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3272 "ss:",
3273 save->ss.selector, save->ss.attrib,
3274 save->ss.limit, save->ss.base);
3275 pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3276 "ds:",
3277 save->ds.selector, save->ds.attrib,
3278 save->ds.limit, save->ds.base);
3279 pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3280 "fs:",
cc3ed80a
ML
3281 save01->fs.selector, save01->fs.attrib,
3282 save01->fs.limit, save01->fs.base);
ae8cc059
JP
3283 pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3284 "gs:",
cc3ed80a
ML
3285 save01->gs.selector, save01->gs.attrib,
3286 save01->gs.limit, save01->gs.base);
ae8cc059
JP
3287 pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3288 "gdtr:",
3289 save->gdtr.selector, save->gdtr.attrib,
3290 save->gdtr.limit, save->gdtr.base);
3291 pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3292 "ldtr:",
cc3ed80a
ML
3293 save01->ldtr.selector, save01->ldtr.attrib,
3294 save01->ldtr.limit, save01->ldtr.base);
ae8cc059
JP
3295 pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3296 "idtr:",
3297 save->idtr.selector, save->idtr.attrib,
3298 save->idtr.limit, save->idtr.base);
3299 pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3300 "tr:",
cc3ed80a
ML
3301 save01->tr.selector, save01->tr.attrib,
3302 save01->tr.limit, save01->tr.base);
046f773b
BS
3303 pr_err("vmpl: %d cpl: %d efer: %016llx\n",
3304 save->vmpl, save->cpl, save->efer);
ae8cc059
JP
3305 pr_err("%-15s %016llx %-13s %016llx\n",
3306 "cr0:", save->cr0, "cr2:", save->cr2);
3307 pr_err("%-15s %016llx %-13s %016llx\n",
3308 "cr3:", save->cr3, "cr4:", save->cr4);
3309 pr_err("%-15s %016llx %-13s %016llx\n",
3310 "dr6:", save->dr6, "dr7:", save->dr7);
3311 pr_err("%-15s %016llx %-13s %016llx\n",
3312 "rip:", save->rip, "rflags:", save->rflags);
3313 pr_err("%-15s %016llx %-13s %016llx\n",
3314 "rsp:", save->rsp, "rax:", save->rax);
3315 pr_err("%-15s %016llx %-13s %016llx\n",
cc3ed80a 3316 "star:", save01->star, "lstar:", save01->lstar);
ae8cc059 3317 pr_err("%-15s %016llx %-13s %016llx\n",
cc3ed80a 3318 "cstar:", save01->cstar, "sfmask:", save01->sfmask);
ae8cc059 3319 pr_err("%-15s %016llx %-13s %016llx\n",
cc3ed80a
ML
3320 "kernel_gs_base:", save01->kernel_gs_base,
3321 "sysenter_cs:", save01->sysenter_cs);
ae8cc059 3322 pr_err("%-15s %016llx %-13s %016llx\n",
cc3ed80a
ML
3323 "sysenter_esp:", save01->sysenter_esp,
3324 "sysenter_eip:", save01->sysenter_eip);
ae8cc059
JP
3325 pr_err("%-15s %016llx %-13s %016llx\n",
3326 "gpat:", save->g_pat, "dbgctl:", save->dbgctl);
3327 pr_err("%-15s %016llx %-13s %016llx\n",
3328 "br_from:", save->br_from, "br_to:", save->br_to);
3329 pr_err("%-15s %016llx %-13s %016llx\n",
3330 "excp_from:", save->last_excp_from,
3331 "excp_to:", save->last_excp_to);
3f10c846
JR
3332}
3333
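/*
 * Exit codes index svm_exit_handlers[] directly, so an exit is only
 * considered valid if the code is in range and the slot is non-NULL.
 */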
98242dca 3334static bool svm_check_exit_valid(u64 exit_code)
e9093fd4 3335{
7a4bca85
ML
3336 return (exit_code < ARRAY_SIZE(svm_exit_handlers) &&
3337 svm_exit_handlers[exit_code]);
3338}
e9093fd4 3339
7a4bca85
ML
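/*
 * Report an exit code with no handler to userspace as an internal error,
 * passing the raw exit code and the pCPU of the last VM-Entry as debug data.
 */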
3340static int svm_handle_invalid_exit(struct kvm_vcpu *vcpu, u64 exit_code)
3341{
e9093fd4
TL
3342 vcpu_unimpl(vcpu, "svm: unexpected exit reason 0x%llx\n", exit_code);
3343 dump_vmcb(vcpu);
3344 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
3345 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON;
3346 vcpu->run->internal.ndata = 2;
3347 vcpu->run->internal.data[0] = exit_code;
3348 vcpu->run->internal.data[1] = vcpu->arch.last_vmentry_cpu;
7a4bca85 3349 return 0;
e9093fd4
TL
3350}
3351
63129754 3352int svm_invoke_exit_handler(struct kvm_vcpu *vcpu, u64 exit_code)
e9093fd4 3353{
98242dca 3354 if (!svm_check_exit_valid(exit_code))
7a4bca85 3355 return svm_handle_invalid_exit(vcpu, exit_code);
e9093fd4
TL
3356
3357#ifdef CONFIG_RETPOLINE
3358 if (exit_code == SVM_EXIT_MSR)
63129754 3359 return msr_interception(vcpu);
e9093fd4 3360 else if (exit_code == SVM_EXIT_VINTR)
63129754 3361 return interrupt_window_interception(vcpu);
e9093fd4 3362 else if (exit_code == SVM_EXIT_INTR)
63129754 3363 return intr_interception(vcpu);
e9093fd4 3364 else if (exit_code == SVM_EXIT_HLT)
5ff3a351 3365 return kvm_emulate_halt(vcpu);
e9093fd4 3366 else if (exit_code == SVM_EXIT_NPF)
63129754 3367 return npf_interception(vcpu);
e9093fd4 3368#endif
63129754 3369 return svm_exit_handlers[exit_code](vcpu);
e9093fd4
TL
3370}
3371
0a62a031
DE
3372static void svm_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason,
3373 u64 *info1, u64 *info2,
235ba74f 3374 u32 *intr_info, u32 *error_code)
586f9607
AK
3375{
3376 struct vmcb_control_area *control = &to_svm(vcpu)->vmcb->control;
3377
0a62a031 3378 *reason = control->exit_code;
586f9607
AK
3379 *info1 = control->exit_info_1;
3380 *info2 = control->exit_info_2;
235ba74f
SC
3381 *intr_info = control->exit_int_info;
3382 if ((*intr_info & SVM_EXITINTINFO_VALID) &&
3383 (*intr_info & SVM_EXITINTINFO_VALID_ERR))
3384 *error_code = control->exit_int_info_err;
3385 else
3386 *error_code = 0;
586f9607
AK
3387}
3388
23e5092b 3389static int svm_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
6aa8b732 3390{
04d2cc77 3391 struct vcpu_svm *svm = to_svm(vcpu);
851ba692 3392 struct kvm_run *kvm_run = vcpu->run;
a2fa3e9f 3393 u32 exit_code = svm->vmcb->control.exit_code;
6aa8b732 3394
0a62a031 3395 trace_kvm_exit(vcpu, KVM_ISA_SVM);
8b89fe1f 3396
f1c6366e
TL
3397 /* SEV-ES guests must use the CR write traps to track CR registers. */
3398 if (!sev_es_guest(vcpu->kvm)) {
3399 if (!svm_is_intercept(svm, INTERCEPT_CR0_WRITE))
3400 vcpu->arch.cr0 = svm->vmcb->save.cr0;
3401 if (npt_enabled)
3402 vcpu->arch.cr3 = svm->vmcb->save.cr3;
3403 }
af9ca2d7 3404
2030753d 3405 if (is_guest_mode(vcpu)) {
410e4d57
JR
3406 int vmexit;
3407
0a62a031 3408 trace_kvm_nested_vmexit(vcpu, KVM_ISA_SVM);
d8cabddf 3409
410e4d57
JR
3410 vmexit = nested_svm_exit_special(svm);
3411
3412 if (vmexit == NESTED_EXIT_CONTINUE)
3413 vmexit = nested_svm_exit_handled(svm);
3414
3415 if (vmexit == NESTED_EXIT_DONE)
cf74a78b 3416 return 1;
cf74a78b
AG
3417 }
3418
04d2cc77
AK
3419 if (svm->vmcb->control.exit_code == SVM_EXIT_ERR) {
3420 kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY;
3421 kvm_run->fail_entry.hardware_entry_failure_reason
3422 = svm->vmcb->control.exit_code;
8a14fe4f 3423 kvm_run->fail_entry.cpu = vcpu->arch.last_vmentry_cpu;
3f10c846 3424 dump_vmcb(vcpu);
04d2cc77
AK
3425 return 0;
3426 }
3427
a2fa3e9f 3428 if (is_external_interrupt(svm->vmcb->control.exit_int_info) &&
709ddebf 3429 exit_code != SVM_EXIT_EXCP_BASE + PF_VECTOR &&
55c5e464
JR
3430 exit_code != SVM_EXIT_NPF && exit_code != SVM_EXIT_TASK_SWITCH &&
3431 exit_code != SVM_EXIT_INTR && exit_code != SVM_EXIT_NMI)
6614c7d0 3432 printk(KERN_ERR "%s: unexpected exit_int_info 0x%x "
6aa8b732 3433 "exit_code 0x%x\n",
b8688d51 3434 __func__, svm->vmcb->control.exit_int_info,
6aa8b732
AK
3435 exit_code);
3436
404d5d7b 3437 if (exit_fastpath != EXIT_FASTPATH_NONE)
1e9e2622 3438 return 1;
404d5d7b 3439
63129754 3440 return svm_invoke_exit_handler(vcpu, exit_code);
6aa8b732
AK
3441}
3442
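/*
 * LTR requires an "available" TSS descriptor and faults if the descriptor
 * is already marked busy, so restore the type before reloading TR after
 * a #VMEXIT.
 */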
3443static void reload_tss(struct kvm_vcpu *vcpu)
3444{
73cd6e5f 3445 struct svm_cpu_data *sd = per_cpu(svm_data, vcpu->cpu);
6aa8b732 3446
0fe1e009 3447 sd->tss_desc->type = 9; /* available 32/64-bit TSS */
6aa8b732
AK
3448 load_TR_desc();
3449}
3450
63129754 3451static void pre_svm_run(struct kvm_vcpu *vcpu)
6aa8b732 3452{
63129754
PB
3453 struct svm_cpu_data *sd = per_cpu(svm_data, vcpu->cpu);
3454 struct vcpu_svm *svm = to_svm(vcpu);
6aa8b732 3455
af18fa77 3456 /*
44f1b558
SC
3457 * If the previous vmrun of the vmcb occurred on a different physical
3458 * cpu, then mark the vmcb dirty and assign a new asid. Hardware's
3459 * vmcb clean bits are per logical CPU, as are KVM's asid assignments.
3460 */
63129754 3461 if (unlikely(svm->current_vmcb->cpu != vcpu->cpu)) {
193015ad 3462 svm->current_vmcb->asid_generation = 0;
af18fa77 3463 vmcb_mark_all_dirty(svm->vmcb);
63129754 3464 svm->current_vmcb->cpu = vcpu->cpu;
af18fa77
CA
3465 }
3466
63129754
PB
3467 if (sev_guest(vcpu->kvm))
3468 return pre_sev_run(svm, vcpu->cpu);
70cd94e6 3469
4b656b12 3470 /* FIXME: handle wraparound of asid_generation */
193015ad 3471 if (svm->current_vmcb->asid_generation != sd->asid_generation)
0fe1e009 3472 new_asid(svm, sd);
6aa8b732
AK
3473}
3474
95ba8273
GN
3475static void svm_inject_nmi(struct kvm_vcpu *vcpu)
3476{
3477 struct vcpu_svm *svm = to_svm(vcpu);
3478
3479 svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_NMI;
159fc6fa
MS
3480
3481 if (svm->nmi_l1_to_l2)
3482 return;
3483
95ba8273 3484 vcpu->arch.hflags |= HF_NMI_MASK;
63129754 3485 if (!sev_es_guest(vcpu->kvm))
4444dfe4 3486 svm_set_intercept(svm, INTERCEPT_IRET);
95ba8273
GN
3487 ++vcpu->stat.nmi_injections;
3488}
6aa8b732 3489
2d613912 3490static void svm_inject_irq(struct kvm_vcpu *vcpu, bool reinjected)
2a8067f1
ED
3491{
3492 struct vcpu_svm *svm = to_svm(vcpu);
7e5b5ef8 3493 u32 type;
2a8067f1 3494
7e5b5ef8
SC
3495 if (vcpu->arch.interrupt.soft) {
3496 if (svm_update_soft_interrupt_rip(vcpu))
3497 return;
cf74a78b 3498
7e5b5ef8
SC
3499 type = SVM_EVTINJ_TYPE_SOFT;
3500 } else {
3501 type = SVM_EVTINJ_TYPE_INTR;
3502 }
2a8067f1 3503
2d613912
SC
3504 trace_kvm_inj_virq(vcpu->arch.interrupt.nr,
3505 vcpu->arch.interrupt.soft, reinjected);
9fb2d2b4
GN
3506 ++vcpu->stat.irq_injections;
3507
219b65dc 3508 svm->vmcb->control.event_inj = vcpu->arch.interrupt.nr |
7e5b5ef8 3509 SVM_EVTINJ_VALID | type;
2a8067f1
ED
3510}
3511
66fa226c
ML
3512void svm_complete_interrupt_delivery(struct kvm_vcpu *vcpu, int delivery_mode,
3513 int trig_mode, int vector)
57dfd7b5 3514{
66fa226c 3515 /*
ce0a58f4 3516 * apic->apicv_active must be read after vcpu->mode.
66fa226c
ML
3517 * Pairs with smp_store_release in vcpu_enter_guest.
3518 */
3519 bool in_guest_mode = (smp_load_acquire(&vcpu->mode) == IN_GUEST_MODE);
57dfd7b5 3520
ce0a58f4
SC
3521 /* Note, this is called iff the local APIC is in-kernel. */
3522 if (!READ_ONCE(vcpu->arch.apic->apicv_active)) {
66fa226c 3523 /* Process the interrupt via inject_pending_event */
57dfd7b5
SC
3524 kvm_make_request(KVM_REQ_EVENT, vcpu);
3525 kvm_vcpu_kick(vcpu);
66fa226c
ML
3526 return;
3527 }
3528
3529 trace_kvm_apicv_accept_irq(vcpu->vcpu_id, delivery_mode, trig_mode, vector);
3530 if (in_guest_mode) {
3531 /*
3532 * Signal the doorbell to tell hardware to inject the IRQ. If
3533 * the vCPU exits the guest before the doorbell chimes, hardware
3534 * will automatically process AVIC interrupts at the next VMRUN.
3535 */
3536 avic_ring_doorbell(vcpu);
57dfd7b5 3537 } else {
66fa226c
ML
3538 /*
3539 * Wake the vCPU if it was blocking. KVM will then detect the
3540 * pending IRQ when checking if the vCPU has a wake event.
3541 */
3542 kvm_vcpu_wake_up(vcpu);
57dfd7b5
SC
3543 }
3544}
3545
66fa226c
ML
3546static void svm_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode,
3547 int trig_mode, int vector)
3548{
3549 kvm_lapic_set_irr(vector, apic);
3550
3551 /*
3552 * Pairs with the smp_mb_*() after setting vcpu->guest_mode in
3553 * vcpu_enter_guest() to ensure the write to the vIRR is ordered before
3554 * the read of guest_mode. This guarantees that either VMRUN will see
3555 * and process the new vIRR entry, or that svm_complete_interrupt_delivery
3556 * will signal the doorbell if the CPU has already entered the guest.
3557 */
3558 smp_mb__after_atomic();
3559 svm_complete_interrupt_delivery(apic->vcpu, delivery_mode, trig_mode, vector);
3560}
3561
b6a7cc35 3562static void svm_update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
aaacfc9a
JR
3563{
3564 struct vcpu_svm *svm = to_svm(vcpu);
aaacfc9a 3565
f1c6366e
TL
3566 /*
3567 * SEV-ES guests must always keep the CR intercepts cleared. CR
3568 * tracking is done using the CR write traps.
3569 */
3570 if (sev_es_guest(vcpu->kvm))
3571 return;
3572
01c3b2b5 3573 if (nested_svm_virtualize_tpr(vcpu))
88ab24ad
JR
3574 return;
3575
830bd71f 3576 svm_clr_intercept(svm, INTERCEPT_CR8_WRITE);
596f3142 3577
95ba8273 3578 if (irr == -1)
aaacfc9a
JR
3579 return;
3580
95ba8273 3581 if (tpr >= irr)
830bd71f 3582 svm_set_intercept(svm, INTERCEPT_CR8_WRITE);
95ba8273 3583}
aaacfc9a 3584
cae96af1 3585bool svm_nmi_blocked(struct kvm_vcpu *vcpu)
95ba8273
GN
3586{
3587 struct vcpu_svm *svm = to_svm(vcpu);
3588 struct vmcb *vmcb = svm->vmcb;
88c604b6 3589 bool ret;
9c3d370a 3590
cae96af1 3591 if (!gif_set(svm))
bbdad0b5
PB
3592 return true;
3593
cae96af1
PB
3594 if (is_guest_mode(vcpu) && nested_exit_on_nmi(svm))
3595 return false;
3596
3597 ret = (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) ||
63129754 3598 (vcpu->arch.hflags & HF_NMI_MASK);
924584cc
JR
3599
3600 return ret;
aaacfc9a
JR
3601}
3602
c9d40913 3603static int svm_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
cae96af1
PB
3604{
3605 struct vcpu_svm *svm = to_svm(vcpu);
3606 if (svm->nested.nested_run_pending)
c9d40913 3607 return -EBUSY;
cae96af1 3608
2b0ecccb
ML
3609 if (svm_nmi_blocked(vcpu))
3610 return 0;
3611
c300ab9f
PB
3612 /* An NMI must not be injected into L2 if it's supposed to VM-Exit. */
3613 if (for_injection && is_guest_mode(vcpu) && nested_exit_on_nmi(svm))
c9d40913 3614 return -EBUSY;
2b0ecccb 3615 return 1;
cae96af1
PB
3616}
3617
3cfc3092
JK
3618static bool svm_get_nmi_mask(struct kvm_vcpu *vcpu)
3619{
63129754 3620 return !!(vcpu->arch.hflags & HF_NMI_MASK);
3cfc3092
JK
3621}
3622
3623static void svm_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
3624{
3625 struct vcpu_svm *svm = to_svm(vcpu);
3626
3627 if (masked) {
63129754
PB
3628 vcpu->arch.hflags |= HF_NMI_MASK;
3629 if (!sev_es_guest(vcpu->kvm))
4444dfe4 3630 svm_set_intercept(svm, INTERCEPT_IRET);
3cfc3092 3631 } else {
63129754
PB
3632 vcpu->arch.hflags &= ~HF_NMI_MASK;
3633 if (!sev_es_guest(vcpu->kvm))
4444dfe4 3634 svm_clr_intercept(svm, INTERCEPT_IRET);
3cfc3092
JK
3635 }
3636}
3637
cae96af1 3638bool svm_interrupt_blocked(struct kvm_vcpu *vcpu)
78646121
GN
3639{
3640 struct vcpu_svm *svm = to_svm(vcpu);
3641 struct vmcb *vmcb = svm->vmcb;
7fcdb510 3642
fc6f7c03 3643 if (!gif_set(svm))
cae96af1 3644 return true;
7fcdb510 3645
c5063551 3646 if (is_guest_mode(vcpu)) {
fc6f7c03 3647 /* As long as interrupts are being delivered... */
e9fd761a 3648 if ((svm->nested.ctl.int_ctl & V_INTR_MASKING_MASK)
4995a368 3649 ? !(svm->vmcb01.ptr->save.rflags & X86_EFLAGS_IF)
fc6f7c03
PB
3650 : !(kvm_get_rflags(vcpu) & X86_EFLAGS_IF))
3651 return true;
3652
3653 /* ... vmexits aren't blocked by the interrupt shadow */
3654 if (nested_exit_on_intr(svm))
3655 return false;
3656 } else {
c5063551 3657 if (!svm_get_if_flag(vcpu))
fc6f7c03
PB
3658 return true;
3659 }
3660
3661 return (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK);
cae96af1
PB
3662}
3663
c9d40913 3664static int svm_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection)
cae96af1
PB
3665{
3666 struct vcpu_svm *svm = to_svm(vcpu);
2b0ecccb 3667
cae96af1 3668 if (svm->nested.nested_run_pending)
c9d40913 3669 return -EBUSY;
cae96af1 3670
2b0ecccb
ML
3671 if (svm_interrupt_blocked(vcpu))
3672 return 0;
3673
c300ab9f
PB
3674 /*
3675 * An IRQ must not be injected into L2 if it's supposed to VM-Exit,
3676 * e.g. if the IRQ arrived asynchronously after checking nested events.
3677 */
3678 if (for_injection && is_guest_mode(vcpu) && nested_exit_on_intr(svm))
c9d40913 3679 return -EBUSY;
c300ab9f 3680
2b0ecccb 3681 return 1;
78646121
GN
3682}
3683
b6a7cc35 3684static void svm_enable_irq_window(struct kvm_vcpu *vcpu)
6aa8b732 3685{
219b65dc 3686 struct vcpu_svm *svm = to_svm(vcpu);
219b65dc 3687
e0231715
JR
3688 /*
3689 * In case GIF=0 we can't rely on the CPU to tell us when GIF becomes
3690 * 1, because that's a separate STGI/VMRUN intercept. The next time we
3691 * get that intercept, this function will be called again though and
640bd6e5
JN
3692 * we'll get the vintr intercept. However, if the vGIF feature is
3693 * enabled, the STGI interception will not occur. Enable the irq
3694 * window under the assumption that the hardware will set the GIF.
e0231715 3695 */
ea91559b 3696 if (vgif || gif_set(svm)) {
f3515dc3
SS
3697 /*
3698 * IRQ window is not needed when AVIC is enabled,
3699 * unless we have pending ExtINT since it cannot be injected
f44509f8 3700 * via AVIC. In such a case, KVM needs to temporarily disable AVIC,
f3515dc3 3701 * and fall back to injecting the IRQ via V_IRQ.
f44509f8
ML
3702 *
3703 * If running nested, AVIC is already locally inhibited
3704 * on this vCPU, therefore there is no need to request
3705 * the VM wide AVIC inhibition.
f3515dc3 3706 */
f44509f8
ML
3707 if (!is_guest_mode(vcpu))
3708 kvm_set_apicv_inhibit(vcpu->kvm, APICV_INHIBIT_REASON_IRQWIN);
3709
219b65dc 3710 svm_set_vintr(svm);
219b65dc 3711 }
85f455f7
ED
3712}
3713
b6a7cc35 3714static void svm_enable_nmi_window(struct kvm_vcpu *vcpu)
c1150d8c 3715{
04d2cc77 3716 struct vcpu_svm *svm = to_svm(vcpu);
c1150d8c 3717
63129754 3718 if ((vcpu->arch.hflags & (HF_NMI_MASK | HF_IRET_MASK)) == HF_NMI_MASK)
c9a7953f 3719 return; /* IRET will cause a vm exit */
44c11430 3720
640bd6e5 3721 if (!gif_set(svm)) {
ea91559b 3722 if (vgif)
a284ba56 3723 svm_set_intercept(svm, INTERCEPT_STGI);
1a5e1852 3724 return; /* STGI will cause a vm exit */
640bd6e5 3725 }
1a5e1852 3726
e0231715
JR
3727 /*
 3728 * Something prevents NMI from being injected. Single-step over the
 3729 * possible problem (IRET, exception injection, or interrupt shadow).
3730 */
ab2f4d73 3731 svm->nmi_singlestep_guest_rflags = svm_get_rflags(vcpu);
6be7d306 3732 svm->nmi_singlestep = true;
44c11430 3733 svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF);
c1150d8c
DL
3734}
3735
4d9c83f5 3736static void svm_flush_tlb_current(struct kvm_vcpu *vcpu)
d9e368d6 3737{
38e5e92f
JR
3738 struct vcpu_svm *svm = to_svm(vcpu);
3739
4a41e43c
SC
3740 /*
3741 * Flush only the current ASID even if the TLB flush was invoked via
3742 * kvm_flush_remote_tlbs(). Although flushing remote TLBs requires all
3743 * ASIDs to be flushed, KVM uses a single ASID for L1 and L2, and
3744 * unconditionally does a TLB flush on both nested VM-Enter and nested
3745 * VM-Exit (via kvm_mmu_reset_context()).
3746 */
38e5e92f
JR
3747 if (static_cpu_has(X86_FEATURE_FLUSHBYASID))
3748 svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ASID;
3749 else
193015ad 3750 svm->current_vmcb->asid_generation--;
d9e368d6
AK
3751}
3752
faff8758
JS
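/*
 * INVLPGA invalidates the TLB entry for a single guest virtual address in
 * the given ASID, avoiding a full flush of the guest's TLB entries.
 */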
3753static void svm_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t gva)
3754{
3755 struct vcpu_svm *svm = to_svm(vcpu);
3756
3757 invlpga(gva, svm->vmcb->control.asid);
3758}
3759
d7bf8221
JR
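/*
 * V_TPR in int_ctl mirrors CR8; when CR8 writes aren't intercepted,
 * propagate the value the guest left there back into the in-kernel APIC.
 */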
3760static inline void sync_cr8_to_lapic(struct kvm_vcpu *vcpu)
3761{
3762 struct vcpu_svm *svm = to_svm(vcpu);
3763
01c3b2b5 3764 if (nested_svm_virtualize_tpr(vcpu))
88ab24ad
JR
3765 return;
3766
830bd71f 3767 if (!svm_is_intercept(svm, INTERCEPT_CR8_WRITE)) {
d7bf8221 3768 int cr8 = svm->vmcb->control.int_ctl & V_TPR_MASK;
615d5193 3769 kvm_set_cr8(vcpu, cr8);
d7bf8221
JR
3770 }
3771}
3772
649d6864
JR
3773static inline void sync_lapic_to_cr8(struct kvm_vcpu *vcpu)
3774{
3775 struct vcpu_svm *svm = to_svm(vcpu);
3776 u64 cr8;
3777
01c3b2b5 3778 if (nested_svm_virtualize_tpr(vcpu) ||
3bbf3565 3779 kvm_vcpu_apicv_active(vcpu))
88ab24ad
JR
3780 return;
3781
649d6864
JR
3782 cr8 = kvm_get_cr8(vcpu);
3783 svm->vmcb->control.int_ctl &= ~V_TPR_MASK;
3784 svm->vmcb->control.int_ctl |= cr8 & V_TPR_MASK;
3785}
3786
6ef88d6e
SC
3787static void svm_complete_soft_interrupt(struct kvm_vcpu *vcpu, u8 vector,
3788 int type)
3789{
7e5b5ef8
SC
3790 bool is_exception = (type == SVM_EXITINTINFO_TYPE_EXEPT);
3791 bool is_soft = (type == SVM_EXITINTINFO_TYPE_SOFT);
6ef88d6e
SC
3792 struct vcpu_svm *svm = to_svm(vcpu);
3793
3794 /*
3795 * If NRIPS is enabled, KVM must snapshot the pre-VMRUN next_rip that's
3796 * associated with the original soft exception/interrupt. next_rip is
3797 * cleared on all exits that can occur while vectoring an event, so KVM
3798 * needs to manually set next_rip for re-injection. Unlike the !nrips
3799 * case below, this needs to be done if and only if KVM is re-injecting
3800 * the same event, i.e. if the event is a soft exception/interrupt,
3801 * otherwise next_rip is unused on VMRUN.
3802 */
7e5b5ef8 3803 if (nrips && (is_soft || (is_exception && kvm_exception_is_soft(vector))) &&
6ef88d6e
SC
3804 kvm_is_linear_rip(vcpu, svm->soft_int_old_rip + svm->soft_int_csbase))
3805 svm->vmcb->control.next_rip = svm->soft_int_next_rip;
3806 /*
3807 * If NRIPS isn't enabled, KVM must manually advance RIP prior to
3808 * injecting the soft exception/interrupt. That advancement needs to
3809 * be unwound if vectoring didn't complete. Note, the new event may
3810 * not be the injected event, e.g. if KVM injected an INTn, the INTn
3811 * hit a #NP in the guest, and the #NP encountered a #PF, the #NP will
3812 * be the reported vectored event, but RIP still needs to be unwound.
3813 */
7e5b5ef8 3814 else if (!nrips && (is_soft || is_exception) &&
6ef88d6e
SC
3815 kvm_is_linear_rip(vcpu, svm->soft_int_next_rip + svm->soft_int_csbase))
3816 kvm_rip_write(vcpu, svm->soft_int_old_rip);
3817}
3818
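/*
 * Re-queue whatever event was being delivered when the #VMEXIT occurred
 * (recorded in exit_int_info) so that it is re-injected on the next VMRUN.
 */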
63129754 3819static void svm_complete_interrupts(struct kvm_vcpu *vcpu)
9222be18 3820{
63129754 3821 struct vcpu_svm *svm = to_svm(vcpu);
9222be18
GN
3822 u8 vector;
3823 int type;
3824 u32 exitintinfo = svm->vmcb->control.exit_int_info;
159fc6fa 3825 bool nmi_l1_to_l2 = svm->nmi_l1_to_l2;
6ef88d6e 3826 bool soft_int_injected = svm->soft_int_injected;
66b7138f 3827
159fc6fa 3828 svm->nmi_l1_to_l2 = false;
6ef88d6e 3829 svm->soft_int_injected = false;
9222be18 3830
bd3d1ec3
AK
3831 /*
3832 * If we've made progress since setting HF_IRET_MASK, we've
3833 * executed an IRET and can allow NMI injection.
3834 */
63129754
PB
3835 if ((vcpu->arch.hflags & HF_IRET_MASK) &&
3836 (sev_es_guest(vcpu->kvm) ||
3837 kvm_rip_read(vcpu) != svm->nmi_iret_rip)) {
3838 vcpu->arch.hflags &= ~(HF_NMI_MASK | HF_IRET_MASK);
3839 kvm_make_request(KVM_REQ_EVENT, vcpu);
3842d135 3840 }
44c11430 3841
63129754
PB
3842 vcpu->arch.nmi_injected = false;
3843 kvm_clear_exception_queue(vcpu);
3844 kvm_clear_interrupt_queue(vcpu);
9222be18
GN
3845
3846 if (!(exitintinfo & SVM_EXITINTINFO_VALID))
3847 return;
3848
63129754 3849 kvm_make_request(KVM_REQ_EVENT, vcpu);
3842d135 3850
9222be18
GN
3851 vector = exitintinfo & SVM_EXITINTINFO_VEC_MASK;
3852 type = exitintinfo & SVM_EXITINTINFO_TYPE_MASK;
3853
6ef88d6e
SC
3854 if (soft_int_injected)
3855 svm_complete_soft_interrupt(vcpu, vector, type);
cd9e6da8 3856
9222be18
GN
3857 switch (type) {
3858 case SVM_EXITINTINFO_TYPE_NMI:
63129754 3859 vcpu->arch.nmi_injected = true;
159fc6fa 3860 svm->nmi_l1_to_l2 = nmi_l1_to_l2;
9222be18
GN
3861 break;
3862 case SVM_EXITINTINFO_TYPE_EXEPT:
f1c6366e
TL
3863 /*
3864 * Never re-inject a #VC exception.
3865 */
3866 if (vector == X86_TRAP_VC)
3867 break;
3868
9222be18
GN
3869 if (exitintinfo & SVM_EXITINTINFO_VALID_ERR) {
3870 u32 err = svm->vmcb->control.exit_int_info_err;
63129754 3871 kvm_requeue_exception_e(vcpu, vector, err);
9222be18
GN
3872
3873 } else
63129754 3874 kvm_requeue_exception(vcpu, vector);
9222be18
GN
3875 break;
3876 case SVM_EXITINTINFO_TYPE_INTR:
63129754 3877 kvm_queue_interrupt(vcpu, vector, false);
9222be18 3878 break;
7e5b5ef8
SC
3879 case SVM_EXITINTINFO_TYPE_SOFT:
3880 kvm_queue_interrupt(vcpu, vector, true);
3881 break;
9222be18
GN
3882 default:
3883 break;
3884 }
7e5b5ef8 3885
9222be18
GN
3886}
3887
b463a6f7
AK
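/*
 * Abort a pending injection: stuff event_inj into exit_int_info so that
 * svm_complete_interrupts() re-queues the event for a later VMRUN.
 */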
3888static void svm_cancel_injection(struct kvm_vcpu *vcpu)
3889{
3890 struct vcpu_svm *svm = to_svm(vcpu);
3891 struct vmcb_control_area *control = &svm->vmcb->control;
3892
3893 control->exit_int_info = control->event_inj;
3894 control->exit_int_info_err = control->event_inj_err;
3895 control->event_inj = 0;
63129754 3896 svm_complete_interrupts(vcpu);
b463a6f7
AK
3897}
3898
fc4fad79
SC
3899static int svm_vcpu_pre_run(struct kvm_vcpu *vcpu)
3900{
3901 return 1;
3902}
3903
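/*
 * exit_info_1 is non-zero only for WRMSR, so only MSR writes (e.g. x2APIC
 * ICR and TSC-deadline updates) are eligible for fastpath handling.
 */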
404d5d7b 3904static fastpath_t svm_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
a9ab13ff 3905{
4e810adb 3906 if (to_svm(vcpu)->vmcb->control.exit_code == SVM_EXIT_MSR &&
a9ab13ff
WL
3907 to_svm(vcpu)->vmcb->control.exit_info_1)
3908 return handle_fastpath_set_msr_irqoff(vcpu);
3909
3910 return EXIT_FASTPATH_NONE;
3911}
3912
63129754 3913static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu)
135961e0 3914{
63129754 3915 struct vcpu_svm *svm = to_svm(vcpu);
d1788191 3916 unsigned long vmcb_pa = svm->current_vmcb->pa;
63129754 3917
b2d2af7e 3918 guest_state_enter_irqoff();
135961e0 3919
63129754 3920 if (sev_es_guest(vcpu->kvm)) {
d1788191 3921 __svm_sev_es_vcpu_run(vmcb_pa);
16809ecd 3922 } else {
e79b91bb
MR
3923 struct svm_cpu_data *sd = per_cpu(svm_data, vcpu->cpu);
3924
d1788191
SC
3925 /*
3926 * Use a single vmcb (vmcb01 because it's always valid) for
3927 * context switching guest state via VMLOAD/VMSAVE, that way
3928 * the state doesn't need to be copied between vmcb01 and
3929 * vmcb02 when switching vmcbs for nested virtualization.
3930 */
cc3ed80a 3931 vmload(svm->vmcb01.pa);
d1788191 3932 __svm_vcpu_run(vmcb_pa, (unsigned long *)&vcpu->arch.regs);
cc3ed80a 3933 vmsave(svm->vmcb01.pa);
135961e0 3934
e79b91bb 3935 vmload(__sme_page_pa(sd->save_area));
16809ecd 3936 }
135961e0 3937
b2d2af7e 3938 guest_state_exit_irqoff();
135961e0
TG
3939}
3940
b95273f1 3941static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
6aa8b732 3942{
a2fa3e9f 3943 struct vcpu_svm *svm = to_svm(vcpu);
d9e368d6 3944
d95df951
LB
3945 trace_kvm_entry(vcpu);
3946
2041a06a
JR
3947 svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
3948 svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
3949 svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP];
3950
a12713c2
LP
3951 /*
3952 * Disable singlestep if we're injecting an interrupt/exception.
3953 * We don't want our modified rflags to be pushed on the stack where
3954 * we might not be able to easily reset them if we disabled NMI
3955 * singlestep later.
3956 */
3957 if (svm->nmi_singlestep && svm->vmcb->control.event_inj) {
3958 /*
3959 * Event injection happens before external interrupts cause a
3960 * vmexit and interrupts are disabled here, so smp_send_reschedule
3961 * is enough to force an immediate vmexit.
3962 */
3963 disable_nmi_singlestep(svm);
3964 smp_send_reschedule(vcpu->cpu);
3965 }
3966
63129754 3967 pre_svm_run(vcpu);
6aa8b732 3968
649d6864
JR
3969 sync_lapic_to_cr8(vcpu);
3970
7e8e6eed
CA
3971 if (unlikely(svm->asid != svm->vmcb->control.asid)) {
3972 svm->vmcb->control.asid = svm->asid;
3973 vmcb_mark_dirty(svm->vmcb, VMCB_ASID);
3974 }
cda0ffdd 3975 svm->vmcb->save.cr2 = vcpu->arch.cr2;
6aa8b732 3976
1183646a
VP
3977 svm_hv_update_vp_id(svm->vmcb, vcpu);
3978
d67668e9
PB
3979 /*
3980 * Run with all-zero DR6 unless needed, so that we can get the exact cause
3981 * of a #DB.
3982 */
63129754 3983 if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT))
d67668e9
PB
3984 svm_set_dr6(svm, vcpu->arch.dr6);
3985 else
9a3ecd5e 3986 svm_set_dr6(svm, DR6_ACTIVE_LOW);
d67668e9 3987
04d2cc77 3988 clgi();
139a12cf 3989 kvm_load_guest_xsave_state(vcpu);
04d2cc77 3990
010fd37f 3991 kvm_wait_lapic_expire(vcpu);
b6c4bc65 3992
b2ac58f9
KA
3993 /*
3994 * If this vCPU has touched SPEC_CTRL, restore the guest's value if
3995 * it's non-zero. Since vmentry is serialising on affected CPUs, there
3996 * is no need to worry about the conditional branch over the wrmsr
3997 * being speculatively taken.
3998 */
d00b99c5
BM
3999 if (!static_cpu_has(X86_FEATURE_V_SPEC_CTRL))
4000 x86_spec_ctrl_set_guest(svm->spec_ctrl, svm->virt_spec_ctrl);
b2ac58f9 4001
63129754 4002 svm_vcpu_enter_exit(vcpu);
15e6c22f 4003
b2ac58f9
KA
4004 /*
4005 * We do not use IBRS in the kernel. If this vCPU has used the
4006 * SPEC_CTRL MSR it may have left it on; save the value and
4007 * turn it off. This is much more efficient than blindly adding
4008 * it to the atomic save/restore list. Especially as the former
4009 * (Saving guest MSRs on vmexit) doesn't even exist in KVM.
4010 *
4011 * For non-nested case:
4012 * If the L01 MSR bitmap does not intercept the MSR, then we need to
4013 * save it.
4014 *
4015 * For nested case:
4016 * If the L02 MSR bitmap does not intercept the MSR, then we need to
4017 * save it.
4018 */
d00b99c5
BM
4019 if (!static_cpu_has(X86_FEATURE_V_SPEC_CTRL) &&
4020 unlikely(!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL)))
ecb586bd 4021 svm->spec_ctrl = native_read_msr(MSR_IA32_SPEC_CTRL);
b2ac58f9 4022
63129754 4023 if (!sev_es_guest(vcpu->kvm))
16809ecd 4024 reload_tss(vcpu);
6aa8b732 4025
d00b99c5
BM
4026 if (!static_cpu_has(X86_FEATURE_V_SPEC_CTRL))
4027 x86_spec_ctrl_restore_host(svm->spec_ctrl, svm->virt_spec_ctrl);
024d83ca 4028
63129754 4029 if (!sev_es_guest(vcpu->kvm)) {
16809ecd
TL
4030 vcpu->arch.cr2 = svm->vmcb->save.cr2;
4031 vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax;
4032 vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp;
4033 vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip;
4034 }
41e68b69 4035 vcpu->arch.regs_dirty = 0;
13c34e07 4036
3781c01c 4037 if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI))
db215756 4038 kvm_before_interrupt(vcpu, KVM_HANDLING_NMI);
3781c01c 4039
139a12cf 4040 kvm_load_host_xsave_state(vcpu);
3781c01c
JR
4041 stgi();
4042
4043 /* Any pending NMI will happen here */
4044
4045 if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI))
63129754 4046 kvm_after_interrupt(vcpu);
3781c01c 4047
d7bf8221
JR
4048 sync_cr8_to_lapic(vcpu);
4049
a2fa3e9f 4050 svm->next_rip = 0;
63129754 4051 if (is_guest_mode(vcpu)) {
9e8f0fbf 4052 nested_sync_control_from_vmcb02(svm);
b93af02c
KS
4053
4054 /* Track VMRUNs that have made past consistency checking */
4055 if (svm->nested.nested_run_pending &&
4056 svm->vmcb->control.exit_code != SVM_EXIT_ERR)
4057 ++vcpu->stat.nested_run;
4058
2d8a42be
PB
4059 svm->nested.nested_run_pending = 0;
4060 }
9222be18 4061
38e5e92f 4062 svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING;
e42c6828 4063 vmcb_mark_all_clean(svm->vmcb);
38e5e92f 4064
631bc487
GN
4065 /* if exit due to PF check for async PF */
4066 if (svm->vmcb->control.exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR)
63129754 4067 vcpu->arch.apf.host_apf_flags =
68fd66f1 4068 kvm_read_and_reset_apf_flags();
631bc487 4069
41e68b69 4070 vcpu->arch.regs_avail &= ~SVM_REGS_LAZY_LOAD_SET;
fe5913e4
JR
4071
4072 /*
4073 * We need to handle MC intercepts here before the vcpu has a chance to
4074 * change the physical cpu
4075 */
4076 if (unlikely(svm->vmcb->control.exit_code ==
4077 SVM_EXIT_EXCP_BASE + MC_VECTOR))
63129754 4078 svm_handle_mce(vcpu);
8d28fec4 4079
63129754 4080 svm_complete_interrupts(vcpu);
4e810adb
WL
4081
4082 if (is_guest_mode(vcpu))
4083 return EXIT_FASTPATH_NONE;
4084
4085 return svm_exit_handlers_fastpath(vcpu);
6aa8b732
AK
4086}
4087
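/*
 * With NPT the new root goes into nCR3 and the guest's own CR3 is kept;
 * with shadow paging the root is the shadow table and is written to
 * save.cr3 directly (plus the active PCID for 64-bit guests).
 */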
e83bc09c 4088static void svm_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa,
2a40b900 4089 int root_level)
6aa8b732 4090{
a2fa3e9f 4091 struct vcpu_svm *svm = to_svm(vcpu);
689f3bf2 4092 unsigned long cr3;
a2fa3e9f 4093
689f3bf2 4094 if (npt_enabled) {
4a98623d 4095 svm->vmcb->control.nested_cr3 = __sme_set(root_hpa);
06e7852c 4096 vmcb_mark_dirty(svm->vmcb, VMCB_NPT);
1c97f0a0 4097
1e0c7d40
VP
4098 hv_track_root_tdp(vcpu, root_hpa);
4099
978ce583 4100 cr3 = vcpu->arch.cr3;
78c7d900 4101 } else if (root_level >= PT64_ROOT_4LEVEL) {
4a98623d 4102 cr3 = __sme_set(root_hpa) | kvm_get_active_pcid(vcpu);
e83bc09c
SC
4103 } else {
4104 /* PCID in the guest should be impossible with a 32-bit MMU. */
4105 WARN_ON_ONCE(kvm_get_active_pcid(vcpu));
4106 cr3 = root_hpa;
689f3bf2 4107 }
1c97f0a0 4108
978ce583 4109 svm->vmcb->save.cr3 = cr3;
06e7852c 4110 vmcb_mark_dirty(svm->vmcb, VMCB_CR);
1c97f0a0
JR
4111}
4112
6aa8b732
AK
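/* Firmware can disable SVM via the SVMDIS bit in the VM_CR MSR. */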
4113static int is_disabled(void)
4114{
6031a61c
JR
4115 u64 vm_cr;
4116
4117 rdmsrl(MSR_VM_CR, vm_cr);
4118 if (vm_cr & (1 << SVM_VM_CR_SVM_DISABLE))
4119 return 1;
4120
6aa8b732
AK
4121 return 0;
4122}
4123
102d8325
IM
4124static void
4125svm_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
4126{
4127 /*
4128 * Patch in the VMMCALL instruction:
4129 */
4130 hypercall[0] = 0x0f;
4131 hypercall[1] = 0x01;
4132 hypercall[2] = 0xd9;
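 /* 0F 01 D9 is the opcode for VMMCALL. */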
102d8325
IM
4133}
4134
f257d6dc 4135static int __init svm_check_processor_compat(void)
002c7f7c 4136{
f257d6dc 4137 return 0;
002c7f7c
YS
4138}
4139
5719455f
TL
4140/*
4141 * The kvm parameter can be NULL (module initialization, or invocation before
4142 * VM creation). Be sure to check the kvm parameter before using it.
4143 */
4144static bool svm_has_emulated_msr(struct kvm *kvm, u32 index)
6d396b55 4145{
e87555e5
VK
4146 switch (index) {
4147 case MSR_IA32_MCG_EXT_CTL:
95c5c7c7 4148 case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
e87555e5 4149 return false;
5719455f
TL
4150 case MSR_IA32_SMBASE:
4151 /* SEV-ES guests do not support SMM, so report false */
4152 if (kvm && sev_es_guest(kvm))
4153 return false;
4154 break;
e87555e5
VK
4155 default:
4156 break;
4157 }
4158
6d396b55
PB
4159 return true;
4160}
4161
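/*
 * Recompute the per-vCPU feature caches (NRIPS, TSC scaling, LBR
 * virtualization, VMLOAD/VMSAVE virtualization, etc.) whenever userspace
 * updates the guest CPUID.
 */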
7c1b761b 4162static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
0e851880 4163{
6092d3d3 4164 struct vcpu_svm *svm = to_svm(vcpu);
96308b06 4165 struct kvm_cpuid_entry2 *best;
6092d3d3 4166
7204160e 4167 vcpu->arch.xsaves_enabled = guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) &&
96be4e06 4168 boot_cpu_has(X86_FEATURE_XSAVE) &&
7204160e
AL
4169 boot_cpu_has(X86_FEATURE_XSAVES);
4170
6092d3d3 4171 /* Update nrips enabled cache */
4eb87460 4172 svm->nrips_enabled = kvm_cpu_cap_has(X86_FEATURE_NRIPS) &&
63129754 4173 guest_cpuid_has(vcpu, X86_FEATURE_NRIPS);
46781eae 4174
5228eb96 4175 svm->tsc_scaling_enabled = tsc_scaling && guest_cpuid_has(vcpu, X86_FEATURE_TSCRATEMSR);
d20c796c 4176 svm->lbrv_enabled = lbrv && guest_cpuid_has(vcpu, X86_FEATURE_LBRV);
5228eb96 4177
b9f3973a
ML
4178 svm->v_vmload_vmsave_enabled = vls && guest_cpuid_has(vcpu, X86_FEATURE_V_VMSAVE_VMLOAD);
4179
74fd41ed
ML
4180 svm->pause_filter_enabled = kvm_cpu_cap_has(X86_FEATURE_PAUSEFILTER) &&
4181 guest_cpuid_has(vcpu, X86_FEATURE_PAUSEFILTER);
4182
4183 svm->pause_threshold_enabled = kvm_cpu_cap_has(X86_FEATURE_PFTHRESHOLD) &&
4184 guest_cpuid_has(vcpu, X86_FEATURE_PFTHRESHOLD);
4185
0b349662
ML
4186 svm->vgif_enabled = vgif && guest_cpuid_has(vcpu, X86_FEATURE_VGIF);
4187
3b195ac9 4188 svm_recalc_instruction_intercepts(vcpu, svm);
4407a797 4189
96308b06
BM
4190 /* For sev guests, the memory encryption bit is not reserved in CR3. */
4191 if (sev_guest(vcpu->kvm)) {
277ad7d5 4192 best = kvm_find_cpuid_entry(vcpu, 0x8000001F);
96308b06 4193 if (best)
ca29e145 4194 vcpu->arch.reserved_gpa_bits &= ~(1UL << (best->ebx & 0x3f));
96308b06
BM
4195 }
4196
36e8194d 4197 init_vmcb_after_set_cpuid(vcpu);
0e851880
SY
4198}
4199
f5f48ee1
SY
4200static bool svm_has_wbinvd_exit(void)
4201{
4202 return true;
4203}
4204
8061252e 4205#define PRE_EX(exit) { .exit_code = (exit), \
40e19b51 4206 .stage = X86_ICPT_PRE_EXCEPT, }
cfec82cb 4207#define POST_EX(exit) { .exit_code = (exit), \
40e19b51 4208 .stage = X86_ICPT_POST_EXCEPT, }
d7eb8203 4209#define POST_MEM(exit) { .exit_code = (exit), \
40e19b51 4210 .stage = X86_ICPT_POST_MEMACCESS, }
cfec82cb 4211
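/*
 * Map emulator intercept checks to SVM exit codes, along with the stage
 * (pre/post decode, post memory access) at which the nested intercept
 * must be evaluated.
 */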
09941fbb 4212static const struct __x86_intercept {
cfec82cb
JR
4213 u32 exit_code;
4214 enum x86_intercept_stage stage;
cfec82cb
JR
4215} x86_intercept_map[] = {
4216 [x86_intercept_cr_read] = POST_EX(SVM_EXIT_READ_CR0),
4217 [x86_intercept_cr_write] = POST_EX(SVM_EXIT_WRITE_CR0),
4218 [x86_intercept_clts] = POST_EX(SVM_EXIT_WRITE_CR0),
4219 [x86_intercept_lmsw] = POST_EX(SVM_EXIT_WRITE_CR0),
4220 [x86_intercept_smsw] = POST_EX(SVM_EXIT_READ_CR0),
3b88e41a
JR
4221 [x86_intercept_dr_read] = POST_EX(SVM_EXIT_READ_DR0),
4222 [x86_intercept_dr_write] = POST_EX(SVM_EXIT_WRITE_DR0),
dee6bb70
JR
4223 [x86_intercept_sldt] = POST_EX(SVM_EXIT_LDTR_READ),
4224 [x86_intercept_str] = POST_EX(SVM_EXIT_TR_READ),
4225 [x86_intercept_lldt] = POST_EX(SVM_EXIT_LDTR_WRITE),
4226 [x86_intercept_ltr] = POST_EX(SVM_EXIT_TR_WRITE),
4227 [x86_intercept_sgdt] = POST_EX(SVM_EXIT_GDTR_READ),
4228 [x86_intercept_sidt] = POST_EX(SVM_EXIT_IDTR_READ),
4229 [x86_intercept_lgdt] = POST_EX(SVM_EXIT_GDTR_WRITE),
4230 [x86_intercept_lidt] = POST_EX(SVM_EXIT_IDTR_WRITE),
01de8b09
JR
4231 [x86_intercept_vmrun] = POST_EX(SVM_EXIT_VMRUN),
4232 [x86_intercept_vmmcall] = POST_EX(SVM_EXIT_VMMCALL),
4233 [x86_intercept_vmload] = POST_EX(SVM_EXIT_VMLOAD),
4234 [x86_intercept_vmsave] = POST_EX(SVM_EXIT_VMSAVE),
4235 [x86_intercept_stgi] = POST_EX(SVM_EXIT_STGI),
4236 [x86_intercept_clgi] = POST_EX(SVM_EXIT_CLGI),
4237 [x86_intercept_skinit] = POST_EX(SVM_EXIT_SKINIT),
4238 [x86_intercept_invlpga] = POST_EX(SVM_EXIT_INVLPGA),
d7eb8203
JR
4239 [x86_intercept_rdtscp] = POST_EX(SVM_EXIT_RDTSCP),
4240 [x86_intercept_monitor] = POST_MEM(SVM_EXIT_MONITOR),
4241 [x86_intercept_mwait] = POST_EX(SVM_EXIT_MWAIT),
8061252e
JR
4242 [x86_intercept_invlpg] = POST_EX(SVM_EXIT_INVLPG),
4243 [x86_intercept_invd] = POST_EX(SVM_EXIT_INVD),
4244 [x86_intercept_wbinvd] = POST_EX(SVM_EXIT_WBINVD),
4245 [x86_intercept_wrmsr] = POST_EX(SVM_EXIT_MSR),
4246 [x86_intercept_rdtsc] = POST_EX(SVM_EXIT_RDTSC),
4247 [x86_intercept_rdmsr] = POST_EX(SVM_EXIT_MSR),
4248 [x86_intercept_rdpmc] = POST_EX(SVM_EXIT_RDPMC),
4249 [x86_intercept_cpuid] = PRE_EX(SVM_EXIT_CPUID),
4250 [x86_intercept_rsm] = PRE_EX(SVM_EXIT_RSM),
bf608f88
JR
4251 [x86_intercept_pause] = PRE_EX(SVM_EXIT_PAUSE),
4252 [x86_intercept_pushf] = PRE_EX(SVM_EXIT_PUSHF),
4253 [x86_intercept_popf] = PRE_EX(SVM_EXIT_POPF),
4254 [x86_intercept_intn] = PRE_EX(SVM_EXIT_SWINT),
4255 [x86_intercept_iret] = PRE_EX(SVM_EXIT_IRET),
4256 [x86_intercept_icebp] = PRE_EX(SVM_EXIT_ICEBP),
4257 [x86_intercept_hlt] = POST_EX(SVM_EXIT_HLT),
f6511935
JR
4258 [x86_intercept_in] = POST_EX(SVM_EXIT_IOIO),
4259 [x86_intercept_ins] = POST_EX(SVM_EXIT_IOIO),
4260 [x86_intercept_out] = POST_EX(SVM_EXIT_IOIO),
4261 [x86_intercept_outs] = POST_EX(SVM_EXIT_IOIO),
02d4160f 4262 [x86_intercept_xsetbv] = PRE_EX(SVM_EXIT_XSETBV),
cfec82cb
JR
4263};
4264
8061252e 4265#undef PRE_EX
cfec82cb 4266#undef POST_EX
d7eb8203 4267#undef POST_MEM
cfec82cb 4268
8a76d7f2
JR
4269static int svm_check_intercept(struct kvm_vcpu *vcpu,
4270 struct x86_instruction_info *info,
21f1b8f2
SC
4271 enum x86_intercept_stage stage,
4272 struct x86_exception *exception)
8a76d7f2 4273{
cfec82cb
JR
4274 struct vcpu_svm *svm = to_svm(vcpu);
4275 int vmexit, ret = X86EMUL_CONTINUE;
4276 struct __x86_intercept icpt_info;
4277 struct vmcb *vmcb = svm->vmcb;
4278
4279 if (info->intercept >= ARRAY_SIZE(x86_intercept_map))
4280 goto out;
4281
4282 icpt_info = x86_intercept_map[info->intercept];
4283
40e19b51 4284 if (stage != icpt_info.stage)
cfec82cb
JR
4285 goto out;
4286
4287 switch (icpt_info.exit_code) {
4288 case SVM_EXIT_READ_CR0:
4289 if (info->intercept == x86_intercept_cr_read)
4290 icpt_info.exit_code += info->modrm_reg;
4291 break;
4292 case SVM_EXIT_WRITE_CR0: {
4293 unsigned long cr0, val;
cfec82cb
JR
4294
4295 if (info->intercept == x86_intercept_cr_write)
4296 icpt_info.exit_code += info->modrm_reg;
4297
62baf44c
JK
4298 if (icpt_info.exit_code != SVM_EXIT_WRITE_CR0 ||
4299 info->intercept == x86_intercept_clts)
cfec82cb
JR
4300 break;
4301
8fc78909 4302 if (!(vmcb12_is_intercept(&svm->nested.ctl,
c62e2e94 4303 INTERCEPT_SELECTIVE_CR0)))
cfec82cb
JR
4304 break;
4305
4306 cr0 = vcpu->arch.cr0 & ~SVM_CR0_SELECTIVE_MASK;
4307 val = info->src_val & ~SVM_CR0_SELECTIVE_MASK;
4308
4309 if (info->intercept == x86_intercept_lmsw) {
4310 cr0 &= 0xfUL;
4311 val &= 0xfUL;
4312 /* lmsw can't clear PE - catch this here */
4313 if (cr0 & X86_CR0_PE)
4314 val |= X86_CR0_PE;
4315 }
4316
4317 if (cr0 ^ val)
4318 icpt_info.exit_code = SVM_EXIT_CR0_SEL_WRITE;
4319
4320 break;
4321 }
3b88e41a
JR
4322 case SVM_EXIT_READ_DR0:
4323 case SVM_EXIT_WRITE_DR0:
4324 icpt_info.exit_code += info->modrm_reg;
4325 break;
8061252e
JR
4326 case SVM_EXIT_MSR:
4327 if (info->intercept == x86_intercept_wrmsr)
4328 vmcb->control.exit_info_1 = 1;
4329 else
4330 vmcb->control.exit_info_1 = 0;
4331 break;
bf608f88
JR
4332 case SVM_EXIT_PAUSE:
4333 /*
 4334 * We get this intercept for NOP only, but PAUSE is encoded as
 4335 * REP NOP, so check for the REP prefix here.
4336 */
4337 if (info->rep_prefix != REPE_PREFIX)
4338 goto out;
49a8afca 4339 break;
f6511935
JR
4340 case SVM_EXIT_IOIO: {
4341 u64 exit_info;
4342 u32 bytes;
4343
f6511935
JR
4344 if (info->intercept == x86_intercept_in ||
4345 info->intercept == x86_intercept_ins) {
6cbc5f5a
JK
4346 exit_info = ((info->src_val & 0xffff) << 16) |
4347 SVM_IOIO_TYPE_MASK;
f6511935 4348 bytes = info->dst_bytes;
6493f157 4349 } else {
6cbc5f5a 4350 exit_info = (info->dst_val & 0xffff) << 16;
6493f157 4351 bytes = info->src_bytes;
f6511935
JR
4352 }
4353
4354 if (info->intercept == x86_intercept_outs ||
4355 info->intercept == x86_intercept_ins)
4356 exit_info |= SVM_IOIO_STR_MASK;
4357
4358 if (info->rep_prefix)
4359 exit_info |= SVM_IOIO_REP_MASK;
4360
4361 bytes = min(bytes, 4u);
4362
4363 exit_info |= bytes << SVM_IOIO_SIZE_SHIFT;
4364
4365 exit_info |= (u32)info->ad_bytes << (SVM_IOIO_ASIZE_SHIFT - 1);
4366
4367 vmcb->control.exit_info_1 = exit_info;
4368 vmcb->control.exit_info_2 = info->next_rip;
4369
4370 break;
4371 }
cfec82cb
JR
4372 default:
4373 break;
4374 }
4375
f104765b
BD
4376 /* TODO: Advertise NRIPS to guest hypervisor unconditionally */
4377 if (static_cpu_has(X86_FEATURE_NRIPS))
4378 vmcb->control.next_rip = info->next_rip;
cfec82cb
JR
4379 vmcb->control.exit_code = icpt_info.exit_code;
4380 vmexit = nested_svm_exit_handled(svm);
4381
4382 ret = (vmexit == NESTED_EXIT_DONE) ? X86EMUL_INTERCEPTED
4383 : X86EMUL_CONTINUE;
4384
4385out:
4386 return ret;
8a76d7f2
JR
4387}
4388
a9ab13ff 4389static void svm_handle_exit_irqoff(struct kvm_vcpu *vcpu)
a547c6db 4390{
6cd88243
PB
4391 if (to_svm(vcpu)->vmcb->control.exit_code == SVM_EXIT_INTR)
4392 vcpu->arch.at_instruction_boundary = true;
a547c6db
YZ
4393}
4394
ae97a3b8
RK
4395static void svm_sched_in(struct kvm_vcpu *vcpu, int cpu)
4396{
830f01b0 4397 if (!kvm_pause_in_guest(vcpu->kvm))
8566ac8b 4398 shrink_ple_window(vcpu);
ae97a3b8
RK
4399}
4400
74f16909
BP
4401static void svm_setup_mce(struct kvm_vcpu *vcpu)
4402{
4403 /* [63:9] are reserved. */
4404 vcpu->arch.mcg_cap &= 0x1ff;
4405}
4406
cae96af1 4407bool svm_smi_blocked(struct kvm_vcpu *vcpu)
72d7b374 4408{
05cade71
LP
4409 struct vcpu_svm *svm = to_svm(vcpu);
4410
4411 /* Per APM Vol.2 15.22.2 "Response to SMI" */
4412 if (!gif_set(svm))
cae96af1
PB
4413 return true;
4414
4415 return is_smm(vcpu);
4416}
4417
c9d40913 4418static int svm_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
cae96af1
PB
4419{
4420 struct vcpu_svm *svm = to_svm(vcpu);
4421 if (svm->nested.nested_run_pending)
c9d40913 4422 return -EBUSY;
05cade71 4423
2b0ecccb
ML
4424 if (svm_smi_blocked(vcpu))
4425 return 0;
4426
c300ab9f
PB
4427 /* An SMI must not be injected into L2 if it's supposed to VM-Exit. */
4428 if (for_injection && is_guest_mode(vcpu) && nested_exit_on_smi(svm))
c9d40913 4429 return -EBUSY;
c300ab9f 4430
2b0ecccb 4431 return 1;
72d7b374
LP
4432}
4433
ecc513e5 4434static int svm_enter_smm(struct kvm_vcpu *vcpu, char *smstate)
0234bf88 4435{
05cade71 4436 struct vcpu_svm *svm = to_svm(vcpu);
37be407b 4437 struct kvm_host_map map_save;
05cade71
LP
4438 int ret;
4439
136a55c0
ML
4440 if (!is_guest_mode(vcpu))
4441 return 0;
05cade71 4442
136a55c0
ML
4443 /* FED8h - SVM Guest */
4444 put_smstate(u64, smstate, 0x7ed8, 1);
4445 /* FEE0h - SVM Guest VMCB Physical Address */
4446 put_smstate(u64, smstate, 0x7ee0, svm->nested.vmcb12_gpa);
05cade71 4447
136a55c0
ML
4448 svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
4449 svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
4450 svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP];
37be407b 4451
249f3249 4452 ret = nested_svm_simple_vmexit(svm, SVM_EXIT_SW);
136a55c0
ML
4453 if (ret)
4454 return ret;
4455
4456 /*
4457 * KVM uses VMCB01 to store L1 host state while L2 runs but
4458 * VMCB01 is going to be used during SMM and thus the state will
 4459 * be lost. Temporarily save the non-VMLOAD/VMSAVE state to the host save
 4460 * area pointed to by MSR_VM_HSAVE_PA. The APM guarantees that the
 4461 * format of the area is identical to the guest save area offset
4462 * by 0x400 (matches the offset of 'struct vmcb_save_area'
4463 * within 'struct vmcb'). Note: HSAVE area may also be used by
4464 * L1 hypervisor to save additional host context (e.g. KVM does
23e5092b 4465 * that, see svm_prepare_switch_to_guest()) which must be
136a55c0
ML
4466 * preserved.
4467 */
4468 if (kvm_vcpu_map(vcpu, gpa_to_gfn(svm->nested.hsave_msr),
4469 &map_save) == -EINVAL)
4470 return 1;
37be407b 4471
136a55c0 4472 BUILD_BUG_ON(offsetof(struct vmcb, save) != 0x400);
37be407b 4473
136a55c0
ML
4474 svm_copy_vmrun_state(map_save.hva + 0x400,
4475 &svm->vmcb01.ptr->save);
37be407b 4476
136a55c0 4477 kvm_vcpu_unmap(vcpu, &map_save, true);
0234bf88
LP
4478 return 0;
4479}
4480
ecc513e5 4481static int svm_leave_smm(struct kvm_vcpu *vcpu, const char *smstate)
0234bf88 4482{
05cade71 4483 struct vcpu_svm *svm = to_svm(vcpu);
37be407b 4484 struct kvm_host_map map, map_save;
136a55c0
ML
4485 u64 saved_efer, vmcb12_gpa;
4486 struct vmcb *vmcb12;
4487 int ret;
05cade71 4488
136a55c0
ML
4489 if (!guest_cpuid_has(vcpu, X86_FEATURE_LM))
4490 return 0;
05cade71 4491
136a55c0
ML
4492 /* Non-zero if SMI arrived while vCPU was in guest mode. */
4493 if (!GET_SMSTATE(u64, smstate, 0x7ed8))
4494 return 0;
3ebb5d26 4495
136a55c0
ML
4496 if (!guest_cpuid_has(vcpu, X86_FEATURE_SVM))
4497 return 1;
3ebb5d26 4498
136a55c0
ML
4499 saved_efer = GET_SMSTATE(u64, smstate, 0x7ed0);
4500 if (!(saved_efer & EFER_SVME))
4501 return 1;
3ebb5d26 4502
136a55c0
ML
4503 vmcb12_gpa = GET_SMSTATE(u64, smstate, 0x7ee0);
4504 if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmcb12_gpa), &map) == -EINVAL)
4505 return 1;
3ebb5d26 4506
136a55c0
ML
4507 ret = 1;
4508 if (kvm_vcpu_map(vcpu, gpa_to_gfn(svm->nested.hsave_msr), &map_save) == -EINVAL)
4509 goto unmap_map;
37be407b 4510
136a55c0
ML
4511 if (svm_allocate_nested(svm))
4512 goto unmap_save;
37be407b 4513
136a55c0
ML
4514 /*
4515 * Restore L1 host state from L1 HSAVE area as VMCB01 was
4516 * used during SMM (see svm_enter_smm())
4517 */
37be407b 4518
136a55c0 4519 svm_copy_vmrun_state(&svm->vmcb01.ptr->save, map_save.hva + 0x400);
e2e6e449 4520
136a55c0
ML
4521 /*
4522 * Enter the nested guest now
4523 */
59cd9bc5 4524
e8efa4ff
ML
4525 vmcb_mark_all_dirty(svm->vmcb01.ptr);
4526
136a55c0 4527 vmcb12 = map.hva;
7907160d 4528 nested_copy_vmcb_control_to_cache(svm, &vmcb12->control);
f2740a8d 4529 nested_copy_vmcb_save_to_cache(svm, &vmcb12->save);
136a55c0
ML
4530 ret = enter_svm_guest_mode(vcpu, vmcb12_gpa, vmcb12, false);
4531
759cbd59
ML
4532 if (ret)
4533 goto unmap_save;
4534
4535 svm->nested.nested_run_pending = 1;
4536
136a55c0
ML
4537unmap_save:
4538 kvm_vcpu_unmap(vcpu, &map_save, true);
4539unmap_map:
4540 kvm_vcpu_unmap(vcpu, &map, true);
59cd9bc5 4541 return ret;
0234bf88
LP
4542}
4543
b6a7cc35 4544static void svm_enable_smi_window(struct kvm_vcpu *vcpu)
cc3d967f
LP
4545{
4546 struct vcpu_svm *svm = to_svm(vcpu);
4547
4548 if (!gif_set(svm)) {
ea91559b 4549 if (vgif)
a284ba56 4550 svm_set_intercept(svm, INTERCEPT_STGI);
cc3d967f 4551 /* STGI will cause a vm exit */
c9d40913
PB
4552 } else {
4553 /* We must be in SMM; RSM will cause a vmexit anyway. */
cc3d967f 4554 }
cc3d967f
LP
4555}
4556
4d31d9ef
SC
4557static bool svm_can_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type,
4558 void *insn, int insn_len)
05d5a486 4559{
09e3e2a1
SC
4560 bool smep, smap, is_user;
4561 unsigned long cr4;
3280cc22 4562 u64 error_code;
e72436bc 4563
55467fcd
SC
4564 /* Emulation is always possible when KVM has access to all guest state. */
4565 if (!sev_guest(vcpu->kvm))
4566 return true;
4567
132627c6
SC
4568 /* #UD and #GP should never be intercepted for SEV guests. */
4569 WARN_ON_ONCE(emul_type & (EMULTYPE_TRAP_UD |
4570 EMULTYPE_TRAP_UD_FORCED |
4571 EMULTYPE_VMWARE_GP));
4572
bc624d9f 4573 /*
55467fcd
SC
4574 * Emulation is impossible for SEV-ES guests as KVM doesn't have access
4575 * to guest register state.
bc624d9f
TL
4576 */
4577 if (sev_es_guest(vcpu->kvm))
4578 return false;
4579
04c40f34
SC
4580 /*
4581 * Emulation is possible if the instruction is already decoded, e.g.
4582 * when completing I/O after returning from userspace.
4583 */
4584 if (emul_type & EMULTYPE_NO_DECODE)
4585 return true;
4586
4587 /*
4588 * Emulation is possible for SEV guests if and only if a prefilled
4589 * buffer containing the bytes of the intercepted instruction is
 4590 * available. SEV guest memory is encrypted with a guest-specific key
 4591 * and cannot be decrypted by KVM, i.e. KVM would read ciphertext and
4592 * decode garbage.
4593 *
4594 * Inject #UD if KVM reached this point without an instruction buffer.
4595 * In practice, this path should never be hit by a well-behaved guest,
4596 * e.g. KVM doesn't intercept #UD or #GP for SEV guests, but this path
4597 * is still theoretically reachable, e.g. via unaccelerated fault-like
4598 * AVIC access, and needs to be handled by KVM to avoid putting the
4599 * guest into an infinite loop. Injecting #UD is somewhat arbitrary,
 4600 * but it's the least awful option given the lack of insight into the guest.
4601 */
4602 if (unlikely(!insn)) {
4603 kvm_queue_exception(vcpu, UD_VECTOR);
4604 return false;
4605 }
4606
4607 /*
4608 * Emulate for SEV guests if the insn buffer is not empty. The buffer
4609 * will be empty if the DecodeAssist microcode cannot fetch bytes for
4610 * the faulting instruction because the code fetch itself faulted, e.g.
4611 * the guest attempted to fetch from emulated MMIO or a guest page
4612 * table used to translate CS:RIP resides in emulated MMIO.
4613 */
4614 if (likely(insn_len))
4615 return true;
4616
05d5a486 4617 /*
118154bd
LA
 4618 * Detect and work around Erratum 1096 Fam_17h_00_0Fh.
4619 *
4620 * Errata:
04c40f34
SC
4621 * When CPU raises #NPF on guest data access and vCPU CR4.SMAP=1, it is
4622 * possible that CPU microcode implementing DecodeAssist will fail to
4623 * read guest memory at CS:RIP and vmcb.GuestIntrBytes will incorrectly
4624 * be '0'. This happens because microcode reads CS:RIP using a _data_
 4625 * load uop with CPL=0 privileges. If the load hits a SMAP #PF, ucode
4626 * gives up and does not fill the instruction bytes buffer.
118154bd 4627 *
3280cc22
SC
4628 * As above, KVM reaches this point iff the VM is an SEV guest, the CPU
4629 * supports DecodeAssist, a #NPF was raised, KVM's page fault handler
4630 * triggered emulation (e.g. for MMIO), and the CPU returned 0 in the
4631 * GuestIntrBytes field of the VMCB.
05d5a486 4632 *
04c40f34
SC
4633 * This does _not_ mean that the erratum has been encountered, as the
4634 * DecodeAssist will also fail if the load for CS:RIP hits a legitimate
 4635 * #PF, e.g. if the guest attempted to execute from emulated MMIO and
4636 * encountered a reserved/not-present #PF.
05d5a486 4637 *
3280cc22
SC
4638 * To hit the erratum, the following conditions must be true:
4639 * 1. CR4.SMAP=1 (obviously).
4640 * 2. CR4.SMEP=0 || CPL=3. If SMEP=1 and CPL<3, the erratum cannot
4641 * have been hit as the guest would have encountered a SMEP
4642 * violation #PF, not a #NPF.
4643 * 3. The #NPF is not due to a code fetch, in which case failure to
 4644 * retrieve the instruction bytes is legitimate (see above).
4645 *
4646 * In addition, don't apply the erratum workaround if the #NPF occurred
4647 * while translating guest page tables (see below).
05d5a486 4648 */
3280cc22
SC
4649 error_code = to_svm(vcpu)->vmcb->control.exit_info_1;
4650 if (error_code & (PFERR_GUEST_PAGE_MASK | PFERR_FETCH_MASK))
4651 goto resume_guest;
4652
09e3e2a1
SC
4653 cr4 = kvm_read_cr4(vcpu);
4654 smep = cr4 & X86_CR4_SMEP;
4655 smap = cr4 & X86_CR4_SMAP;
4656 is_user = svm_get_cpl(vcpu) == 3;
118154bd 4657 if (smap && (!smep || is_user)) {
118154bd 4658 pr_err_ratelimited("KVM: SEV Guest triggered AMD Erratum 1096\n");
cdf85e0c
SC
4659
4660 /*
4661 * If the fault occurred in userspace, arbitrarily inject #GP
4662 * to avoid killing the guest and to hopefully avoid confusing
4663 * the guest kernel too much, e.g. injecting #PF would not be
4664 * coherent with respect to the guest's page tables. Request
4665 * triple fault if the fault occurred in the kernel as there's
4666 * no fault that KVM can inject without confusing the guest.
4667 * In practice, the triple fault is moot as no sane SEV kernel
4668 * will execute from user memory while also running with SMAP=1.
4669 */
4670 if (is_user)
4671 kvm_inject_gp(vcpu, 0);
4672 else
4673 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
05d5a486
SB
4674 }
4675
3280cc22
SC
4676resume_guest:
4677 /*
4678 * If the erratum was not hit, simply resume the guest and let it fault
4679 * again. While awful, e.g. the vCPU may get stuck in an infinite loop
4680 * if the fault is at CPL=0, it's the lesser of all evils. Exiting to
4681 * userspace will kill the guest, and letting the emulator read garbage
4682 * will yield random behavior and potentially corrupt the guest.
4683 *
4684 * Simply resuming the guest is technically not a violation of the SEV
4685 * architecture. AMD's APM states that all code fetches and page table
4686 * accesses for SEV guest are encrypted, regardless of the C-Bit. The
4687 * APM also states that encrypted accesses to MMIO are "ignored", but
4688 * doesn't explicitly define "ignored", i.e. doing nothing and letting
4689 * the guest spin is technically "ignoring" the access.
4690 */
05d5a486
SB
4691 return false;
4692}
4693
4b9852f4
LA
4694static bool svm_apic_init_signal_blocked(struct kvm_vcpu *vcpu)
4695{
4696 struct vcpu_svm *svm = to_svm(vcpu);
4697
4698 /*
4699 * TODO: Last condition latch INIT signals on vCPU when
4700 * vCPU is in guest-mode and vmcb12 defines intercept on INIT.
33b22172
PB
4701 * To properly emulate the INIT intercept,
4702 * svm_check_nested_events() should call nested_svm_vmexit()
4703 * if an INIT signal is pending.
4b9852f4
LA
4704 */
4705 return !gif_set(svm) ||
c62e2e94 4706 (vmcb_is_intercept(&svm->vmcb->control, INTERCEPT_INIT));
4b9852f4
LA
4707}
4708
647daca2
TL
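/*
 * For SEV-ES guests KVM cannot edit the encrypted VMSA, so SIPI delivery
 * is assumed to go through the SEV-ES AP reset-hold flow instead of the
 * common CS:IP update.
 */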
4709static void svm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector)
4710{
4711 if (!sev_es_guest(vcpu->kvm))
4712 return kvm_vcpu_deliver_sipi_vector(vcpu, vector);
4713
4714 sev_vcpu_deliver_sipi_vector(vcpu, vector);
4715}
4716
eaf78265
JR
4717static void svm_vm_destroy(struct kvm *kvm)
4718{
4719 avic_vm_destroy(kvm);
4720 sev_vm_destroy(kvm);
4721}
4722
4723static int svm_vm_init(struct kvm *kvm)
4724{
830f01b0
WL
4725 if (!pause_filter_count || !pause_filter_thresh)
4726 kvm->arch.pause_in_guest = true;
4727
fdf513e3 4728 if (enable_apicv) {
eaf78265
JR
4729 int ret = avic_vm_init(kvm);
4730 if (ret)
4731 return ret;
4732 }
4733
eaf78265
JR
4734 return 0;
4735}
4736
9c14ee21 4737static struct kvm_x86_ops svm_x86_ops __initdata = {
9dadfc4a
SC
4738 .name = "kvm_amd",
4739
23e5092b 4740 .hardware_unsetup = svm_hardware_unsetup,
6aa8b732
AK
4741 .hardware_enable = svm_hardware_enable,
4742 .hardware_disable = svm_hardware_disable,
bc226f07 4743 .has_emulated_msr = svm_has_emulated_msr,
6aa8b732 4744
23e5092b
SC
4745 .vcpu_create = svm_vcpu_create,
4746 .vcpu_free = svm_vcpu_free,
04d2cc77 4747 .vcpu_reset = svm_vcpu_reset,
6aa8b732 4748
562b6b08 4749 .vm_size = sizeof(struct kvm_svm),
4e19c36f 4750 .vm_init = svm_vm_init,
1654efcb 4751 .vm_destroy = svm_vm_destroy,
44a95dae 4752
23e5092b 4753 .prepare_switch_to_guest = svm_prepare_switch_to_guest,
6aa8b732
AK
4754 .vcpu_load = svm_vcpu_load,
4755 .vcpu_put = svm_vcpu_put,
a3c19d5b
SC
4756 .vcpu_blocking = avic_vcpu_blocking,
4757 .vcpu_unblocking = avic_vcpu_unblocking,
6aa8b732 4758
b6a7cc35 4759 .update_exception_bitmap = svm_update_exception_bitmap,
801e459a 4760 .get_msr_feature = svm_get_msr_feature,
6aa8b732
AK
4761 .get_msr = svm_get_msr,
4762 .set_msr = svm_set_msr,
4763 .get_segment_base = svm_get_segment_base,
4764 .get_segment = svm_get_segment,
4765 .set_segment = svm_set_segment,
2e4d2653 4766 .get_cpl = svm_get_cpl,
872e0c53 4767 .get_cs_db_l_bits = svm_get_cs_db_l_bits,
6aa8b732 4768 .set_cr0 = svm_set_cr0,
559c7c75 4769 .post_set_cr3 = sev_post_set_cr3,
c2fe3cd4 4770 .is_valid_cr4 = svm_is_valid_cr4,
6aa8b732
AK
4771 .set_cr4 = svm_set_cr4,
4772 .set_efer = svm_set_efer,
4773 .get_idt = svm_get_idt,
4774 .set_idt = svm_set_idt,
4775 .get_gdt = svm_get_gdt,
4776 .set_gdt = svm_set_gdt,
020df079 4777 .set_dr7 = svm_set_dr7,
facb0139 4778 .sync_dirty_debug_regs = svm_sync_dirty_debug_regs,
6de4f3ad 4779 .cache_reg = svm_cache_reg,
6aa8b732
AK
4780 .get_rflags = svm_get_rflags,
4781 .set_rflags = svm_set_rflags,
c5063551 4782 .get_if_flag = svm_get_if_flag,
be94f6b7 4783
4d9c83f5
SC
4784 .flush_tlb_all = svm_flush_tlb_current,
4785 .flush_tlb_current = svm_flush_tlb_current,
e27bc044 4786 .flush_tlb_gva = svm_flush_tlb_gva,
4d9c83f5 4787 .flush_tlb_guest = svm_flush_tlb_current,
6aa8b732 4788
fc4fad79 4789 .vcpu_pre_run = svm_vcpu_pre_run,
e27bc044 4790 .vcpu_run = svm_vcpu_run,
23e5092b
SC
4791 .handle_exit = svm_handle_exit,
4792 .skip_emulated_instruction = svm_skip_emulated_instruction,
5ef8acbd 4793 .update_emulated_instruction = NULL,
2809f5d2
GC
4794 .set_interrupt_shadow = svm_set_interrupt_shadow,
4795 .get_interrupt_shadow = svm_get_interrupt_shadow,
102d8325 4796 .patch_hypercall = svm_patch_hypercall,
23e5092b 4797 .inject_irq = svm_inject_irq,
e27bc044 4798 .inject_nmi = svm_inject_nmi,
298101da 4799 .queue_exception = svm_queue_exception,
b463a6f7 4800 .cancel_injection = svm_cancel_injection,
78646121 4801 .interrupt_allowed = svm_interrupt_allowed,
95ba8273 4802 .nmi_allowed = svm_nmi_allowed,
3cfc3092
JK
4803 .get_nmi_mask = svm_get_nmi_mask,
4804 .set_nmi_mask = svm_set_nmi_mask,
b6a7cc35
JB
4805 .enable_nmi_window = svm_enable_nmi_window,
4806 .enable_irq_window = svm_enable_irq_window,
4807 .update_cr8_intercept = svm_update_cr8_intercept,
05c4fe8c 4808 .set_virtual_apic_mode = avic_set_virtual_apic_mode,
db6e7adf
SC
4809 .refresh_apicv_exec_ctrl = avic_refresh_apicv_exec_ctrl,
4810 .check_apicv_inhibit_reasons = avic_check_apicv_inhibit_reasons,
db6e7adf 4811 .apicv_post_state_restore = avic_apicv_post_state_restore,
cbc94022 4812
586f9607 4813 .get_exit_info = svm_get_exit_info,
586f9607 4814
7c1b761b 4815 .vcpu_after_set_cpuid = svm_vcpu_after_set_cpuid,
4e47c7a6 4816
f5f48ee1 4817 .has_wbinvd_exit = svm_has_wbinvd_exit,
99e3e30a 4818
307a94c7
IS
4819 .get_l2_tsc_offset = svm_get_l2_tsc_offset,
4820 .get_l2_tsc_multiplier = svm_get_l2_tsc_multiplier,
edcfe540 4821 .write_tsc_offset = svm_write_tsc_offset,
1ab9287a 4822 .write_tsc_multiplier = svm_write_tsc_multiplier,
1c97f0a0 4823
727a7e27 4824 .load_mmu_pgd = svm_load_mmu_pgd,
8a76d7f2
JR
4825
4826 .check_intercept = svm_check_intercept,
95b5a48c 4827 .handle_exit_irqoff = svm_handle_exit_irqoff,
ae97a3b8 4828
d264ee0c
SC
4829 .request_immediate_exit = __kvm_request_immediate_exit,
4830
ae97a3b8 4831 .sched_in = svm_sched_in,
25462f7f 4832
33b22172
PB
4833 .nested_ops = &svm_nested_ops,
4834
57dfd7b5 4835 .deliver_interrupt = svm_deliver_interrupt,
db6e7adf 4836 .pi_update_irte = avic_pi_update_irte,
74f16909 4837 .setup_mce = svm_setup_mce,
0234bf88 4838
72d7b374 4839 .smi_allowed = svm_smi_allowed,
ecc513e5
SC
4840 .enter_smm = svm_enter_smm,
4841 .leave_smm = svm_leave_smm,
b6a7cc35 4842 .enable_smi_window = svm_enable_smi_window,
1654efcb 4843
559c7c75
SC
4844 .mem_enc_ioctl = sev_mem_enc_ioctl,
4845 .mem_enc_register_region = sev_mem_enc_register_region,
4846 .mem_enc_unregister_region = sev_mem_enc_unregister_region,
683412cc 4847 .guest_memory_reclaimed = sev_guest_memory_reclaimed,
57b119da 4848
559c7c75
SC
4849 .vm_copy_enc_context_from = sev_vm_copy_enc_context_from,
4850 .vm_move_enc_context_from = sev_vm_move_enc_context_from,
54526d1f 4851
09e3e2a1 4852 .can_emulate_instruction = svm_can_emulate_instruction,
4b9852f4
LA
4853
4854 .apic_init_signal_blocked = svm_apic_init_signal_blocked,
fd6fa73d
AG
4855
4856 .msr_filter_changed = svm_msr_filter_changed,
f1c6366e 4857 .complete_emulated_msr = svm_complete_emulated_msr,
647daca2
TL
4858
4859 .vcpu_deliver_sipi_vector = svm_vcpu_deliver_sipi_vector,
f44509f8 4860 .vcpu_get_apicv_inhibit_reasons = avic_vcpu_get_apicv_inhibit_reasons,
6aa8b732
AK
4861};
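/*
 * Minimal, standalone sketch of the ops-table pattern that svm_x86_ops
 * above follows: vendor code fills in a table of callbacks and common
 * code dispatches through it.  Every name below is a hypothetical
 * illustration, not a KVM API.
 */
#include <stdio.h>

struct demo_ops {
	const char *name;
	int (*vm_init)(void);
	void (*vm_destroy)(void);
};

static int demo_svm_vm_init(void)     { puts("svm-like vm_init"); return 0; }
static void demo_svm_vm_destroy(void) { puts("svm-like vm_destroy"); }

static const struct demo_ops demo_svm_ops = {
	.name       = "demo_amd",
	.vm_init    = demo_svm_vm_init,
	.vm_destroy = demo_svm_vm_destroy,
};

int main(void)
{
	const struct demo_ops *ops = &demo_svm_ops;	/* the "runtime ops" */

	if (ops->vm_init())
		return 1;
	ops->vm_destroy();
	return 0;
}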
4862
54744e17
SC
4863/*
4864 * The default MMIO mask is a single bit (excluding the present bit),
4865 * which could conflict with the memory encryption bit. Check for
4866 * memory encryption support and override the default MMIO mask if
4867 * memory encryption is enabled.
4868 */
4869static __init void svm_adjust_mmio_mask(void)
4870{
4871 unsigned int enc_bit, mask_bit;
4872 u64 msr, mask;
4873
4874 /* If there is no memory encryption support, use existing mask */
4875 if (cpuid_eax(0x80000000) < 0x8000001f)
4876 return;
4877
4878 /* If memory encryption is not enabled, use existing mask */
4879 rdmsrl(MSR_AMD64_SYSCFG, msr);
4880 if (!(msr & MSR_AMD64_SYSCFG_MEM_ENCRYPT))
4881 return;
4882
4883 enc_bit = cpuid_ebx(0x8000001f) & 0x3f;
4884 mask_bit = boot_cpu_data.x86_phys_bits;
4885
4886 /* Increment the mask bit if it is the same as the encryption bit */
4887 if (enc_bit == mask_bit)
4888 mask_bit++;
4889
4890 /*
4891 * If the mask bit location is below 52, then some bits above the
4892 * physical addressing limit will always be reserved, so use the
4893 * rsvd_bits() function to generate the mask. This mask, along with
4894 * the present bit, will be used to generate a page fault with
4895 * PFER.RSV = 1.
4896 *
4897 * If the mask bit location is 52 (or above), then clear the mask.
4898 */
4899 mask = (mask_bit < 52) ? rsvd_bits(mask_bit, 51) | PT_PRESENT_MASK : 0;
4900
4901 kvm_mmu_set_mmio_spte_mask(mask, mask, PT_WRITABLE_MASK | PT_USER_MASK);
4902}
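/*
 * Standalone sketch of the mask computation above, using example
 * values (encryption bit 47 and 48 physical address bits, as on many
 * SME/SEV-capable parts).  rsvd_bits_demo() mirrors what KVM's
 * rsvd_bits() produces for this case; illustration only, not kernel
 * code.
 */
#include <stdio.h>
#include <stdint.h>

#define DEMO_PT_PRESENT_MASK	(1ULL << 0)

/* Build a mask with bits s..e (inclusive) set. */
static uint64_t rsvd_bits_demo(unsigned int s, unsigned int e)
{
	return ((~0ULL) >> (63 - e)) & ~((1ULL << s) - 1);
}

static uint64_t demo_mmio_mask(unsigned int enc_bit, unsigned int phys_bits)
{
	unsigned int mask_bit = phys_bits;

	/* Don't let the MMIO bit collide with the encryption bit. */
	if (enc_bit == mask_bit)
		mask_bit++;

	return mask_bit < 52 ?
		rsvd_bits_demo(mask_bit, 51) | DEMO_PT_PRESENT_MASK : 0;
}

int main(void)
{
	/* Prints 0x000f000000000001: bits 48..51 plus the present bit. */
	printf("mask = 0x%016llx\n",
	       (unsigned long long)demo_mmio_mask(47, 48));
	return 0;
}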
4903
4904static __init void svm_set_cpu_caps(void)
4905{
4906 kvm_set_cpu_caps();
4907
938c8745 4908 kvm_caps.supported_xss = 0;
54744e17
SC
4909
4910 /* CPUID 0x80000001 and 0x8000000A (SVM features) */
4911 if (nested) {
4912 kvm_cpu_cap_set(X86_FEATURE_SVM);
91f673b3 4913 kvm_cpu_cap_set(X86_FEATURE_VMCBCLEAN);
54744e17
SC
4914
4915 if (nrips)
4916 kvm_cpu_cap_set(X86_FEATURE_NRIPS);
4917
4918 if (npt_enabled)
4919 kvm_cpu_cap_set(X86_FEATURE_NPT);
4920
4921 if (tsc_scaling)
4922 kvm_cpu_cap_set(X86_FEATURE_TSCRATEMSR);
4923
b9f3973a
ML
4924 if (vls)
4925 kvm_cpu_cap_set(X86_FEATURE_V_VMSAVE_VMLOAD);
d20c796c
ML
4926 if (lbrv)
4927 kvm_cpu_cap_set(X86_FEATURE_LBRV);
b9f3973a 4928
74fd41ed
ML
4929 if (boot_cpu_has(X86_FEATURE_PAUSEFILTER))
4930 kvm_cpu_cap_set(X86_FEATURE_PAUSEFILTER);
4931
4932 if (boot_cpu_has(X86_FEATURE_PFTHRESHOLD))
4933 kvm_cpu_cap_set(X86_FEATURE_PFTHRESHOLD);
4934
0b349662
ML
4935 if (vgif)
4936 kvm_cpu_cap_set(X86_FEATURE_VGIF);
4937
54744e17
SC
4938 /* Nested VM can receive #VMEXIT instead of triggering #GP */
4939 kvm_cpu_cap_set(X86_FEATURE_SVME_ADDR_CHK);
4940 }
4941
4942 /* CPUID 0x80000008 */
4943 if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD) ||
4944 boot_cpu_has(X86_FEATURE_AMD_SSBD))
4945 kvm_cpu_cap_set(X86_FEATURE_VIRT_SSBD);
4946
4947 /* AMD PMU PERFCTR_CORE CPUID */
4948 if (enable_pmu && boot_cpu_has(X86_FEATURE_PERFCTR_CORE))
4949 kvm_cpu_cap_set(X86_FEATURE_PERFCTR_CORE);
4950
4951 /* CPUID 0x8000001F (SME/SEV features) */
4952 sev_set_cpu_caps();
4953}
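/*
 * Standalone user-space sketch of the host-side probing that
 * svm_set_cpu_caps() and svm_hardware_setup() do via boot_cpu_has():
 * CPUID 0x80000001 ECX[2] advertises SVM itself, and CPUID 0x8000000A
 * EDX holds the SVM feature bits.  The bit positions used below are
 * those of the SVM feature identification leaf and are shown for
 * illustration only.
 */
#include <stdio.h>
#include <cpuid.h>

int main(void)
{
	unsigned int eax, ebx, ecx, edx;

	if (!__get_cpuid(0x80000001, &eax, &ebx, &ecx, &edx))
		return 1;
	printf("SVM:          %s\n", (ecx & (1u << 2))  ? "yes" : "no");

	if (!__get_cpuid(0x8000000A, &eax, &ebx, &ecx, &edx))
		return 1;
	printf("NPT:          %s\n", (edx & (1u << 0))  ? "yes" : "no");
	printf("NRIPS:        %s\n", (edx & (1u << 3))  ? "yes" : "no");
	printf("PauseFilter:  %s\n", (edx & (1u << 10)) ? "yes" : "no");
	printf("VGIF:         %s\n", (edx & (1u << 16)) ? "yes" : "no");
	return 0;
}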
4954
4955static __init int svm_hardware_setup(void)
4956{
4957 int cpu;
4958 struct page *iopm_pages;
4959 void *iopm_va;
4960 int r;
4961 unsigned int order = get_order(IOPM_SIZE);
4962
4963 /*
4964 * NX is required for shadow paging and for NPT if the NX huge pages
4965 * mitigation is enabled.
4966 */
4967 if (!boot_cpu_has(X86_FEATURE_NX)) {
4968 pr_err_ratelimited("NX (Execute Disable) not supported\n");
4969 return -EOPNOTSUPP;
4970 }
4971 kvm_enable_efer_bits(EFER_NX);
4972
4973 iopm_pages = alloc_pages(GFP_KERNEL, order);
4974
4975 if (!iopm_pages)
4976 return -ENOMEM;
4977
4978 iopm_va = page_address(iopm_pages);
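	/*
	 * Set every bit in the I/O permission map: a set bit causes
	 * accesses to the corresponding I/O port to be intercepted.
	 */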
4979 memset(iopm_va, 0xff, PAGE_SIZE * (1 << order));
4980 iopm_base = page_to_pfn(iopm_pages) << PAGE_SHIFT;
4981
4982 init_msrpm_offsets();
4983
938c8745
SC
4984 kvm_caps.supported_xcr0 &= ~(XFEATURE_MASK_BNDREGS |
4985 XFEATURE_MASK_BNDCSR);
54744e17
SC
4986
4987 if (boot_cpu_has(X86_FEATURE_FXSR_OPT))
4988 kvm_enable_efer_bits(EFER_FFXSR);
4989
4990 if (tsc_scaling) {
4991 if (!boot_cpu_has(X86_FEATURE_TSCRATEMSR)) {
4992 tsc_scaling = false;
4993 } else {
4994 pr_info("TSC scaling supported\n");
938c8745 4995 kvm_caps.has_tsc_control = true;
54744e17
SC
4996 }
4997 }
938c8745
SC
4998 kvm_caps.max_tsc_scaling_ratio = SVM_TSC_RATIO_MAX;
4999 kvm_caps.tsc_scaling_ratio_frac_bits = 32;
54744e17
SC
5000
5001 tsc_aux_uret_slot = kvm_add_user_return_msr(MSR_TSC_AUX);
5002
5003 /* Check for pause filtering support */
5004 if (!boot_cpu_has(X86_FEATURE_PAUSEFILTER)) {
5005 pause_filter_count = 0;
5006 pause_filter_thresh = 0;
5007 } else if (!boot_cpu_has(X86_FEATURE_PFTHRESHOLD)) {
5008 pause_filter_thresh = 0;
5009 }
5010
5011 if (nested) {
5012 printk(KERN_INFO "kvm: Nested Virtualization enabled\n");
5013 kvm_enable_efer_bits(EFER_SVME | EFER_LMSLE);
5014 }
5015
5016 /*
5017 * KVM's MMU doesn't support using 2-level paging for itself, and thus
5018 * NPT isn't supported if the host is using 2-level paging since host
5019 * CR4 is unchanged on VMRUN.
5020 */
5021 if (!IS_ENABLED(CONFIG_X86_64) && !IS_ENABLED(CONFIG_X86_PAE))
5022 npt_enabled = false;
5023
5024 if (!boot_cpu_has(X86_FEATURE_NPT))
5025 npt_enabled = false;
5026
5027 /* Force VM NPT level equal to the host's paging level */
5028 kvm_configure_mmu(npt_enabled, get_npt_level(),
5029 get_npt_level(), PG_LEVEL_1G);
5030 pr_info("kvm: Nested Paging %sabled\n", npt_enabled ? "en" : "dis");
5031
e54f1ff2
KH
5032 /* Setup shadow_me_value and shadow_me_mask */
5033 kvm_mmu_set_me_spte_mask(sme_me_mask, sme_me_mask);
5034
54744e17
SC
5035 /* Note, SEV setup consumes npt_enabled. */
5036 sev_hardware_setup();
5037
5038 svm_hv_hardware_setup();
5039
5040 svm_adjust_mmio_mask();
5041
5042 for_each_possible_cpu(cpu) {
5043 r = svm_cpu_init(cpu);
5044 if (r)
5045 goto err;
5046 }
5047
5048 if (nrips) {
5049 if (!boot_cpu_has(X86_FEATURE_NRIPS))
5050 nrips = false;
5051 }
5052
4bdec12a 5053 enable_apicv = avic = avic && avic_hardware_setup(&svm_x86_ops);
54744e17 5054
4bdec12a 5055 if (!enable_apicv) {
a3c19d5b
SC
5056 svm_x86_ops.vcpu_blocking = NULL;
5057 svm_x86_ops.vcpu_unblocking = NULL;
f44509f8 5058 svm_x86_ops.vcpu_get_apicv_inhibit_reasons = NULL;
54744e17
SC
5059 }
5060
5061 if (vls) {
5062 if (!npt_enabled ||
5063 !boot_cpu_has(X86_FEATURE_V_VMSAVE_VMLOAD) ||
5064 !IS_ENABLED(CONFIG_X86_64)) {
5065 vls = false;
5066 } else {
5067 pr_info("Virtual VMLOAD VMSAVE supported\n");
5068 }
5069 }
5070
5071 if (boot_cpu_has(X86_FEATURE_SVME_ADDR_CHK))
5072 svm_gp_erratum_intercept = false;
5073
5074 if (vgif) {
5075 if (!boot_cpu_has(X86_FEATURE_VGIF))
5076 vgif = false;
5077 else
5078 pr_info("Virtual GIF supported\n");
5079 }
5080
5081 if (lbrv) {
5082 if (!boot_cpu_has(X86_FEATURE_LBRV))
5083 lbrv = false;
5084 else
5085 pr_info("LBR virtualization supported\n");
5086 }
5087
5088 if (!enable_pmu)
5089 pr_info("PMU virtualization is disabled\n");
5090
5091 svm_set_cpu_caps();
5092
 5093	/*
 5094	 * On AMD processors, a PTE's accessed bit is set by the CPU
 5095	 * hardware before the NPF vmexit is taken.  That is not the
 5096	 * behaviour the GUEST_MAXPHYADDR < HOST_MAXPHYADDR emulation
 5097	 * expects, and KVM's tests fail because of it, so disable that
 5098	 * support when NPT is enabled.
 5099	 * Userspace can query the KVM_CAP_SMALLER_MAXPHYADDR extension
 5100	 * to learn whether the support is present and decide how to
 5101	 * handle its absence.
 5102	 *
 5103	 * If future AMD CPU models change the behaviour described above,
 5104	 * this variable can be changed accordingly.
 5105	 */
5106 allow_smaller_maxphyaddr = !npt_enabled;
5107
5108 return 0;
5109
5110err:
23e5092b 5111 svm_hardware_unsetup();
54744e17
SC
5112 return r;
5113}
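/*
 * Standalone sketch: once kvm_amd is loaded, the knobs negotiated in
 * svm_hardware_setup() are visible under
 * /sys/module/kvm_amd/parameters/.  The parameter names used below
 * (npt, nested, avic, nrips) are assumed from the module_param()
 * declarations earlier in this file; illustration only.
 */
#include <stdio.h>

static void demo_show_param(const char *name)
{
	char path[128], val[32];
	FILE *f;

	snprintf(path, sizeof(path),
		 "/sys/module/kvm_amd/parameters/%s", name);
	f = fopen(path, "r");
	if (!f || !fgets(val, sizeof(val), f))
		printf("%-8s <unavailable>\n", name);
	else
		printf("%-8s %s", name, val);
	if (f)
		fclose(f);
}

int main(void)
{
	demo_show_param("npt");
	demo_show_param("nested");
	demo_show_param("avic");
	demo_show_param("nrips");
	return 0;
}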
5114
5115
d008dfdb
SC
5116static struct kvm_x86_init_ops svm_init_ops __initdata = {
5117 .cpu_has_kvm_support = has_svm,
5118 .disabled_by_bios = is_disabled,
5119 .hardware_setup = svm_hardware_setup,
5120 .check_processor_compatibility = svm_check_processor_compat,
5121
5122 .runtime_ops = &svm_x86_ops,
34886e79 5123 .pmu_ops = &amd_pmu_ops,
6aa8b732
AK
5124};
5125
5126static int __init svm_init(void)
5127{
d07f46f9
TL
5128 __unused_size_checks();
5129
d008dfdb 5130 return kvm_init(&svm_init_ops, sizeof(struct vcpu_svm),
0ee75bea 5131 __alignof__(struct vcpu_svm), THIS_MODULE);
6aa8b732
AK
5132}
5133
5134static void __exit svm_exit(void)
5135{
cb498ea2 5136 kvm_exit();
6aa8b732
AK
5137}
5138
5139module_init(svm_init)
5140module_exit(svm_exit)